diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24346 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9998704047691045, + "eval_steps": 500, + "global_step": 34722, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008639682059700203, + "grad_norm": 11.771962803910544, + "learning_rate": 9.596928982725528e-08, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0017279364119400407, + "grad_norm": 9.020646240501524, + "learning_rate": 1.9193857965451055e-07, + "loss": 0.5688, + "step": 20 + }, + { + "epoch": 0.0025919046179100607, + "grad_norm": 6.34090392679821, + "learning_rate": 2.8790786948176586e-07, + "loss": 0.4585, + "step": 30 + }, + { + "epoch": 0.0034558728238800814, + "grad_norm": 2.8058492274624522, + "learning_rate": 3.838771593090211e-07, + "loss": 0.3803, + "step": 40 + }, + { + "epoch": 0.004319841029850101, + "grad_norm": 2.6778181919123094, + "learning_rate": 4.798464491362765e-07, + "loss": 0.3372, + "step": 50 + }, + { + "epoch": 0.005183809235820121, + "grad_norm": 2.66961075763771, + "learning_rate": 5.758157389635317e-07, + "loss": 0.316, + "step": 60 + }, + { + "epoch": 0.0060477774417901425, + "grad_norm": 2.52693112946638, + "learning_rate": 6.717850287907871e-07, + "loss": 0.2944, + "step": 70 + }, + { + "epoch": 0.006911745647760163, + "grad_norm": 2.687818570630057, + "learning_rate": 7.677543186180422e-07, + "loss": 0.2864, + "step": 80 + }, + { + "epoch": 0.007775713853730183, + "grad_norm": 2.5730686332715154, + "learning_rate": 8.637236084452976e-07, + "loss": 0.2732, + "step": 90 + }, + { + "epoch": 0.008639682059700202, + "grad_norm": 2.6580593069171905, + "learning_rate": 9.59692898272553e-07, + "loss": 0.2706, + "step": 100 + }, + { + "epoch": 0.009503650265670223, + "grad_norm": 2.6483380675263417, + "learning_rate": 1.0556621880998082e-06, + "loss": 0.2658, + "step": 110 + }, + { + "epoch": 0.010367618471640243, + "grad_norm": 3.2034183330693926, + "learning_rate": 1.1516314779270634e-06, + "loss": 0.2574, + "step": 120 + }, + { + "epoch": 0.011231586677610265, + "grad_norm": 3.5452977484012838, + "learning_rate": 1.2476007677543187e-06, + "loss": 0.2564, + "step": 130 + }, + { + "epoch": 0.012095554883580285, + "grad_norm": 2.8513462750096714, + "learning_rate": 1.3435700575815741e-06, + "loss": 0.2488, + "step": 140 + }, + { + "epoch": 0.012959523089550305, + "grad_norm": 2.6333666283166988, + "learning_rate": 1.4395393474088292e-06, + "loss": 0.2462, + "step": 150 + }, + { + "epoch": 0.013823491295520325, + "grad_norm": 2.6748238297309967, + "learning_rate": 1.5355086372360844e-06, + "loss": 0.2462, + "step": 160 + }, + { + "epoch": 0.014687459501490346, + "grad_norm": 2.5303667333233997, + "learning_rate": 1.63147792706334e-06, + "loss": 0.2441, + "step": 170 + }, + { + "epoch": 0.015551427707460366, + "grad_norm": 2.9751307770273425, + "learning_rate": 1.7274472168905951e-06, + "loss": 0.2391, + "step": 180 + }, + { + "epoch": 0.016415395913430386, + "grad_norm": 2.758192405739691, + "learning_rate": 1.8234165067178506e-06, + "loss": 0.2352, + "step": 190 + }, + { + "epoch": 0.017279364119400405, + "grad_norm": 2.5969506757432677, + "learning_rate": 1.919385796545106e-06, + "loss": 0.234, + "step": 200 + }, + { + "epoch": 0.018143332325370427, + "grad_norm": 2.946805205411811, + "learning_rate": 2.015355086372361e-06, + "loss": 0.2368, + "step": 210 + }, + { + "epoch": 0.019007300531340445, + "grad_norm": 2.375602226438953, + "learning_rate": 2.1113243761996164e-06, + "loss": 0.2292, + "step": 220 + }, + { + "epoch": 0.019871268737310467, + "grad_norm": 2.479386341346247, + "learning_rate": 2.2072936660268714e-06, + "loss": 0.2282, + "step": 230 + }, + { + "epoch": 0.020735236943280486, + "grad_norm": 2.530227027626686, + "learning_rate": 2.303262955854127e-06, + "loss": 0.2283, + "step": 240 + }, + { + "epoch": 0.021599205149250508, + "grad_norm": 2.663697247032743, + "learning_rate": 2.3992322456813823e-06, + "loss": 0.2287, + "step": 250 + }, + { + "epoch": 0.02246317335522053, + "grad_norm": 2.4409362189300823, + "learning_rate": 2.4952015355086374e-06, + "loss": 0.2283, + "step": 260 + }, + { + "epoch": 0.023327141561190548, + "grad_norm": 2.4887477176122963, + "learning_rate": 2.5911708253358924e-06, + "loss": 0.2274, + "step": 270 + }, + { + "epoch": 0.02419110976716057, + "grad_norm": 2.169049552229543, + "learning_rate": 2.6871401151631483e-06, + "loss": 0.2259, + "step": 280 + }, + { + "epoch": 0.02505507797313059, + "grad_norm": 2.2321349960796644, + "learning_rate": 2.7831094049904033e-06, + "loss": 0.224, + "step": 290 + }, + { + "epoch": 0.02591904617910061, + "grad_norm": 2.123716642203702, + "learning_rate": 2.8790786948176584e-06, + "loss": 0.2261, + "step": 300 + }, + { + "epoch": 0.02678301438507063, + "grad_norm": 2.2304887712432158, + "learning_rate": 2.975047984644914e-06, + "loss": 0.2223, + "step": 310 + }, + { + "epoch": 0.02764698259104065, + "grad_norm": 2.3168128684841958, + "learning_rate": 3.071017274472169e-06, + "loss": 0.2258, + "step": 320 + }, + { + "epoch": 0.02851095079701067, + "grad_norm": 2.126683137374433, + "learning_rate": 3.1669865642994248e-06, + "loss": 0.2218, + "step": 330 + }, + { + "epoch": 0.02937491900298069, + "grad_norm": 2.167002578947182, + "learning_rate": 3.26295585412668e-06, + "loss": 0.2267, + "step": 340 + }, + { + "epoch": 0.03023888720895071, + "grad_norm": 2.3404259059137194, + "learning_rate": 3.358925143953935e-06, + "loss": 0.2154, + "step": 350 + }, + { + "epoch": 0.031102855414920732, + "grad_norm": 2.1511856683733623, + "learning_rate": 3.4548944337811903e-06, + "loss": 0.221, + "step": 360 + }, + { + "epoch": 0.031966823620890754, + "grad_norm": 2.158113277467384, + "learning_rate": 3.5508637236084453e-06, + "loss": 0.2235, + "step": 370 + }, + { + "epoch": 0.03283079182686077, + "grad_norm": 2.110044048252369, + "learning_rate": 3.6468330134357012e-06, + "loss": 0.2204, + "step": 380 + }, + { + "epoch": 0.03369476003283079, + "grad_norm": 2.0795923977620725, + "learning_rate": 3.7428023032629563e-06, + "loss": 0.2209, + "step": 390 + }, + { + "epoch": 0.03455872823880081, + "grad_norm": 1.9794767623756446, + "learning_rate": 3.838771593090212e-06, + "loss": 0.2186, + "step": 400 + }, + { + "epoch": 0.035422696444770835, + "grad_norm": 1.9015278291873687, + "learning_rate": 3.934740882917467e-06, + "loss": 0.217, + "step": 410 + }, + { + "epoch": 0.03628666465074085, + "grad_norm": 2.123413533371326, + "learning_rate": 4.030710172744722e-06, + "loss": 0.2176, + "step": 420 + }, + { + "epoch": 0.03715063285671087, + "grad_norm": 1.982609733635354, + "learning_rate": 4.126679462571978e-06, + "loss": 0.2161, + "step": 430 + }, + { + "epoch": 0.03801460106268089, + "grad_norm": 1.873506879383276, + "learning_rate": 4.222648752399233e-06, + "loss": 0.218, + "step": 440 + }, + { + "epoch": 0.038878569268650916, + "grad_norm": 1.9435107928689135, + "learning_rate": 4.318618042226488e-06, + "loss": 0.2177, + "step": 450 + }, + { + "epoch": 0.039742537474620934, + "grad_norm": 2.0252961794361086, + "learning_rate": 4.414587332053743e-06, + "loss": 0.2154, + "step": 460 + }, + { + "epoch": 0.04060650568059095, + "grad_norm": 1.700171956783739, + "learning_rate": 4.510556621880998e-06, + "loss": 0.2109, + "step": 470 + }, + { + "epoch": 0.04147047388656097, + "grad_norm": 2.053940448238235, + "learning_rate": 4.606525911708254e-06, + "loss": 0.2131, + "step": 480 + }, + { + "epoch": 0.042334442092531, + "grad_norm": 1.822531476696438, + "learning_rate": 4.702495201535509e-06, + "loss": 0.2124, + "step": 490 + }, + { + "epoch": 0.043198410298501015, + "grad_norm": 1.9175265658812415, + "learning_rate": 4.798464491362765e-06, + "loss": 0.2162, + "step": 500 + }, + { + "epoch": 0.044062378504471034, + "grad_norm": 1.766701105342602, + "learning_rate": 4.89443378119002e-06, + "loss": 0.2205, + "step": 510 + }, + { + "epoch": 0.04492634671044106, + "grad_norm": 1.835679557041508, + "learning_rate": 4.990403071017275e-06, + "loss": 0.2153, + "step": 520 + }, + { + "epoch": 0.04579031491641108, + "grad_norm": 1.7702803984984092, + "learning_rate": 5.086372360844531e-06, + "loss": 0.2133, + "step": 530 + }, + { + "epoch": 0.046654283122381096, + "grad_norm": 1.7541287271210348, + "learning_rate": 5.182341650671785e-06, + "loss": 0.2111, + "step": 540 + }, + { + "epoch": 0.047518251328351115, + "grad_norm": 1.679583479494552, + "learning_rate": 5.278310940499041e-06, + "loss": 0.218, + "step": 550 + }, + { + "epoch": 0.04838221953432114, + "grad_norm": 1.7372110816101225, + "learning_rate": 5.374280230326297e-06, + "loss": 0.2111, + "step": 560 + }, + { + "epoch": 0.04924618774029116, + "grad_norm": 1.718777295330965, + "learning_rate": 5.470249520153551e-06, + "loss": 0.2164, + "step": 570 + }, + { + "epoch": 0.05011015594626118, + "grad_norm": 1.6716688118747165, + "learning_rate": 5.566218809980807e-06, + "loss": 0.2186, + "step": 580 + }, + { + "epoch": 0.050974124152231196, + "grad_norm": 1.7059282422333841, + "learning_rate": 5.662188099808062e-06, + "loss": 0.2113, + "step": 590 + }, + { + "epoch": 0.05183809235820122, + "grad_norm": 1.7518290879187177, + "learning_rate": 5.758157389635317e-06, + "loss": 0.2148, + "step": 600 + }, + { + "epoch": 0.05270206056417124, + "grad_norm": 1.7399694944414077, + "learning_rate": 5.854126679462573e-06, + "loss": 0.2154, + "step": 610 + }, + { + "epoch": 0.05356602877014126, + "grad_norm": 1.6592091944436103, + "learning_rate": 5.950095969289828e-06, + "loss": 0.2154, + "step": 620 + }, + { + "epoch": 0.054429996976111276, + "grad_norm": 1.6428653305834138, + "learning_rate": 6.0460652591170836e-06, + "loss": 0.2122, + "step": 630 + }, + { + "epoch": 0.0552939651820813, + "grad_norm": 1.5199474469183176, + "learning_rate": 6.142034548944338e-06, + "loss": 0.2137, + "step": 640 + }, + { + "epoch": 0.05615793338805132, + "grad_norm": 1.495986683540699, + "learning_rate": 6.238003838771594e-06, + "loss": 0.207, + "step": 650 + }, + { + "epoch": 0.05702190159402134, + "grad_norm": 1.6130097470231044, + "learning_rate": 6.3339731285988495e-06, + "loss": 0.2094, + "step": 660 + }, + { + "epoch": 0.05788586979999136, + "grad_norm": 1.429539862570976, + "learning_rate": 6.429942418426104e-06, + "loss": 0.2144, + "step": 670 + }, + { + "epoch": 0.05874983800596138, + "grad_norm": 1.4829396307386704, + "learning_rate": 6.52591170825336e-06, + "loss": 0.2112, + "step": 680 + }, + { + "epoch": 0.0596138062119314, + "grad_norm": 1.5601103696909149, + "learning_rate": 6.621880998080615e-06, + "loss": 0.2146, + "step": 690 + }, + { + "epoch": 0.06047777441790142, + "grad_norm": 1.5121395416193373, + "learning_rate": 6.71785028790787e-06, + "loss": 0.2085, + "step": 700 + }, + { + "epoch": 0.06134174262387144, + "grad_norm": 1.5247176525676305, + "learning_rate": 6.8138195777351256e-06, + "loss": 0.2133, + "step": 710 + }, + { + "epoch": 0.062205710829841464, + "grad_norm": 1.5231955702558029, + "learning_rate": 6.909788867562381e-06, + "loss": 0.2105, + "step": 720 + }, + { + "epoch": 0.06306967903581148, + "grad_norm": 1.575431244960636, + "learning_rate": 7.005758157389636e-06, + "loss": 0.2112, + "step": 730 + }, + { + "epoch": 0.06393364724178151, + "grad_norm": 1.4052670709962776, + "learning_rate": 7.101727447216891e-06, + "loss": 0.214, + "step": 740 + }, + { + "epoch": 0.06479761544775152, + "grad_norm": 1.5981212939035538, + "learning_rate": 7.1976967370441466e-06, + "loss": 0.212, + "step": 750 + }, + { + "epoch": 0.06566158365372154, + "grad_norm": 1.3490598266529032, + "learning_rate": 7.2936660268714024e-06, + "loss": 0.2156, + "step": 760 + }, + { + "epoch": 0.06652555185969157, + "grad_norm": 1.469722333119062, + "learning_rate": 7.389635316698657e-06, + "loss": 0.2125, + "step": 770 + }, + { + "epoch": 0.06738952006566158, + "grad_norm": 1.4540654642423125, + "learning_rate": 7.4856046065259125e-06, + "loss": 0.2103, + "step": 780 + }, + { + "epoch": 0.06825348827163161, + "grad_norm": 1.5938247069436307, + "learning_rate": 7.581573896353167e-06, + "loss": 0.2074, + "step": 790 + }, + { + "epoch": 0.06911745647760162, + "grad_norm": 1.419450236768579, + "learning_rate": 7.677543186180423e-06, + "loss": 0.2156, + "step": 800 + }, + { + "epoch": 0.06998142468357164, + "grad_norm": 1.4817149124966364, + "learning_rate": 7.773512476007678e-06, + "loss": 0.2062, + "step": 810 + }, + { + "epoch": 0.07084539288954167, + "grad_norm": 1.3562338115519248, + "learning_rate": 7.869481765834934e-06, + "loss": 0.2145, + "step": 820 + }, + { + "epoch": 0.07170936109551168, + "grad_norm": 1.441182293209827, + "learning_rate": 7.965451055662189e-06, + "loss": 0.2096, + "step": 830 + }, + { + "epoch": 0.0725733293014817, + "grad_norm": 1.4090583980671276, + "learning_rate": 8.061420345489444e-06, + "loss": 0.2088, + "step": 840 + }, + { + "epoch": 0.07343729750745173, + "grad_norm": 1.3527688354129277, + "learning_rate": 8.157389635316699e-06, + "loss": 0.207, + "step": 850 + }, + { + "epoch": 0.07430126571342174, + "grad_norm": 1.3845049446769717, + "learning_rate": 8.253358925143955e-06, + "loss": 0.2087, + "step": 860 + }, + { + "epoch": 0.07516523391939177, + "grad_norm": 1.3620809094506507, + "learning_rate": 8.34932821497121e-06, + "loss": 0.2062, + "step": 870 + }, + { + "epoch": 0.07602920212536178, + "grad_norm": 1.2449980327729386, + "learning_rate": 8.445297504798465e-06, + "loss": 0.2286, + "step": 880 + }, + { + "epoch": 0.0768931703313318, + "grad_norm": 1.2459711984543538, + "learning_rate": 8.54126679462572e-06, + "loss": 0.2131, + "step": 890 + }, + { + "epoch": 0.07775713853730183, + "grad_norm": 1.435496434836962, + "learning_rate": 8.637236084452976e-06, + "loss": 0.2132, + "step": 900 + }, + { + "epoch": 0.07862110674327184, + "grad_norm": 1.2279448447576062, + "learning_rate": 8.73320537428023e-06, + "loss": 0.2058, + "step": 910 + }, + { + "epoch": 0.07948507494924187, + "grad_norm": 1.3411791343537367, + "learning_rate": 8.829174664107486e-06, + "loss": 0.2104, + "step": 920 + }, + { + "epoch": 0.0803490431552119, + "grad_norm": 1.3588577237929125, + "learning_rate": 8.925143953934742e-06, + "loss": 0.2086, + "step": 930 + }, + { + "epoch": 0.0812130113611819, + "grad_norm": 1.215089486089123, + "learning_rate": 9.021113243761996e-06, + "loss": 0.2116, + "step": 940 + }, + { + "epoch": 0.08207697956715193, + "grad_norm": 1.2690165783221579, + "learning_rate": 9.117082533589252e-06, + "loss": 0.2107, + "step": 950 + }, + { + "epoch": 0.08294094777312194, + "grad_norm": 1.1716070949493718, + "learning_rate": 9.213051823416507e-06, + "loss": 0.2123, + "step": 960 + }, + { + "epoch": 0.08380491597909197, + "grad_norm": 1.2018320494999504, + "learning_rate": 9.309021113243763e-06, + "loss": 0.2076, + "step": 970 + }, + { + "epoch": 0.084668884185062, + "grad_norm": 1.253590789081327, + "learning_rate": 9.404990403071018e-06, + "loss": 0.2053, + "step": 980 + }, + { + "epoch": 0.085532852391032, + "grad_norm": 1.3322390529767874, + "learning_rate": 9.500959692898273e-06, + "loss": 0.2109, + "step": 990 + }, + { + "epoch": 0.08639682059700203, + "grad_norm": 1.1375523323508605, + "learning_rate": 9.59692898272553e-06, + "loss": 0.2102, + "step": 1000 + }, + { + "epoch": 0.08726078880297206, + "grad_norm": 1.426192453982837, + "learning_rate": 9.692898272552784e-06, + "loss": 0.2174, + "step": 1010 + }, + { + "epoch": 0.08812475700894207, + "grad_norm": 1.2098268943786776, + "learning_rate": 9.78886756238004e-06, + "loss": 0.2101, + "step": 1020 + }, + { + "epoch": 0.08898872521491209, + "grad_norm": 1.2055452181294146, + "learning_rate": 9.884836852207294e-06, + "loss": 0.2049, + "step": 1030 + }, + { + "epoch": 0.08985269342088212, + "grad_norm": 1.1566160374036807, + "learning_rate": 9.98080614203455e-06, + "loss": 0.2098, + "step": 1040 + }, + { + "epoch": 0.09071666162685213, + "grad_norm": 1.2194302882557917, + "learning_rate": 9.999998607883625e-06, + "loss": 0.2079, + "step": 1050 + }, + { + "epoch": 0.09158062983282216, + "grad_norm": 1.0589518495138135, + "learning_rate": 9.999992952412176e-06, + "loss": 0.2098, + "step": 1060 + }, + { + "epoch": 0.09244459803879217, + "grad_norm": 1.1650993014300575, + "learning_rate": 9.999982946583298e-06, + "loss": 0.209, + "step": 1070 + }, + { + "epoch": 0.09330856624476219, + "grad_norm": 1.1322274380889616, + "learning_rate": 9.999968590405698e-06, + "loss": 0.2067, + "step": 1080 + }, + { + "epoch": 0.09417253445073222, + "grad_norm": 1.199347187742619, + "learning_rate": 9.999949883891863e-06, + "loss": 0.2043, + "step": 1090 + }, + { + "epoch": 0.09503650265670223, + "grad_norm": 1.2200569547783202, + "learning_rate": 9.999926827058076e-06, + "loss": 0.2069, + "step": 1100 + }, + { + "epoch": 0.09590047086267225, + "grad_norm": 1.157827031372808, + "learning_rate": 9.999899419924391e-06, + "loss": 0.2167, + "step": 1110 + }, + { + "epoch": 0.09676443906864228, + "grad_norm": 1.0754344065487103, + "learning_rate": 9.999867662514655e-06, + "loss": 0.2039, + "step": 1120 + }, + { + "epoch": 0.09762840727461229, + "grad_norm": 1.1937972053687231, + "learning_rate": 9.999831554856503e-06, + "loss": 0.2103, + "step": 1130 + }, + { + "epoch": 0.09849237548058232, + "grad_norm": 1.1174871907395723, + "learning_rate": 9.99979109698135e-06, + "loss": 0.203, + "step": 1140 + }, + { + "epoch": 0.09935634368655233, + "grad_norm": 1.080160161013022, + "learning_rate": 9.999746288924396e-06, + "loss": 0.2023, + "step": 1150 + }, + { + "epoch": 0.10022031189252235, + "grad_norm": 1.0920396314412057, + "learning_rate": 9.999697130724628e-06, + "loss": 0.205, + "step": 1160 + }, + { + "epoch": 0.10108428009849238, + "grad_norm": 1.173817518913107, + "learning_rate": 9.999643622424817e-06, + "loss": 0.2083, + "step": 1170 + }, + { + "epoch": 0.10194824830446239, + "grad_norm": 1.0403379627522062, + "learning_rate": 9.99958576407152e-06, + "loss": 0.2004, + "step": 1180 + }, + { + "epoch": 0.10281221651043242, + "grad_norm": 1.146331529719693, + "learning_rate": 9.999523555715077e-06, + "loss": 0.2089, + "step": 1190 + }, + { + "epoch": 0.10367618471640244, + "grad_norm": 1.639211053109325, + "learning_rate": 9.999456997409614e-06, + "loss": 0.2056, + "step": 1200 + }, + { + "epoch": 0.10454015292237245, + "grad_norm": 1.0282120502515748, + "learning_rate": 9.999386089213042e-06, + "loss": 0.2004, + "step": 1210 + }, + { + "epoch": 0.10540412112834248, + "grad_norm": 1.1924096638792132, + "learning_rate": 9.999310831187056e-06, + "loss": 0.2038, + "step": 1220 + }, + { + "epoch": 0.10626808933431249, + "grad_norm": 1.3235594482554829, + "learning_rate": 9.999231223397136e-06, + "loss": 0.2098, + "step": 1230 + }, + { + "epoch": 0.10713205754028252, + "grad_norm": 1.0759458188187956, + "learning_rate": 9.999147265912545e-06, + "loss": 0.2045, + "step": 1240 + }, + { + "epoch": 0.10799602574625254, + "grad_norm": 0.9954941962888572, + "learning_rate": 9.999058958806337e-06, + "loss": 0.2005, + "step": 1250 + }, + { + "epoch": 0.10885999395222255, + "grad_norm": 1.1146268522277554, + "learning_rate": 9.998966302155337e-06, + "loss": 0.2018, + "step": 1260 + }, + { + "epoch": 0.10972396215819258, + "grad_norm": 1.0822500532270867, + "learning_rate": 9.998869296040172e-06, + "loss": 0.2042, + "step": 1270 + }, + { + "epoch": 0.1105879303641626, + "grad_norm": 1.0663634430114168, + "learning_rate": 9.99876794054524e-06, + "loss": 0.1999, + "step": 1280 + }, + { + "epoch": 0.11145189857013262, + "grad_norm": 1.0487941810337988, + "learning_rate": 9.998662235758726e-06, + "loss": 0.2035, + "step": 1290 + }, + { + "epoch": 0.11231586677610264, + "grad_norm": 1.0397680680828816, + "learning_rate": 9.998552181772608e-06, + "loss": 0.2008, + "step": 1300 + }, + { + "epoch": 0.11317983498207267, + "grad_norm": 1.007838733865999, + "learning_rate": 9.998437778682632e-06, + "loss": 0.201, + "step": 1310 + }, + { + "epoch": 0.11404380318804268, + "grad_norm": 0.9490225603657636, + "learning_rate": 9.998319026588341e-06, + "loss": 0.2013, + "step": 1320 + }, + { + "epoch": 0.1149077713940127, + "grad_norm": 1.0866065144355808, + "learning_rate": 9.99819592559306e-06, + "loss": 0.2033, + "step": 1330 + }, + { + "epoch": 0.11577173959998271, + "grad_norm": 1.0692981821008887, + "learning_rate": 9.998068475803893e-06, + "loss": 0.1964, + "step": 1340 + }, + { + "epoch": 0.11663570780595274, + "grad_norm": 1.009125477008282, + "learning_rate": 9.99793667733173e-06, + "loss": 0.2033, + "step": 1350 + }, + { + "epoch": 0.11749967601192277, + "grad_norm": 0.995413726812648, + "learning_rate": 9.997800530291249e-06, + "loss": 0.2022, + "step": 1360 + }, + { + "epoch": 0.11836364421789278, + "grad_norm": 1.0153384644661287, + "learning_rate": 9.997660034800904e-06, + "loss": 0.2034, + "step": 1370 + }, + { + "epoch": 0.1192276124238628, + "grad_norm": 1.0060721836065372, + "learning_rate": 9.99751519098294e-06, + "loss": 0.1981, + "step": 1380 + }, + { + "epoch": 0.12009158062983283, + "grad_norm": 0.9395888677265961, + "learning_rate": 9.997365998963378e-06, + "loss": 0.2001, + "step": 1390 + }, + { + "epoch": 0.12095554883580284, + "grad_norm": 0.9207516546879072, + "learning_rate": 9.997212458872026e-06, + "loss": 0.1958, + "step": 1400 + }, + { + "epoch": 0.12181951704177287, + "grad_norm": 0.9562432904941996, + "learning_rate": 9.997054570842476e-06, + "loss": 0.1923, + "step": 1410 + }, + { + "epoch": 0.12268348524774288, + "grad_norm": 0.9733048311881327, + "learning_rate": 9.996892335012106e-06, + "loss": 0.2021, + "step": 1420 + }, + { + "epoch": 0.1235474534537129, + "grad_norm": 0.928328731096014, + "learning_rate": 9.996725751522066e-06, + "loss": 0.1962, + "step": 1430 + }, + { + "epoch": 0.12441142165968293, + "grad_norm": 0.9312940268247356, + "learning_rate": 9.996554820517302e-06, + "loss": 0.1925, + "step": 1440 + }, + { + "epoch": 0.12527538986565295, + "grad_norm": 0.9649248803993781, + "learning_rate": 9.996379542146532e-06, + "loss": 0.1953, + "step": 1450 + }, + { + "epoch": 0.12613935807162296, + "grad_norm": 0.9689645983824585, + "learning_rate": 9.996199916562263e-06, + "loss": 0.1969, + "step": 1460 + }, + { + "epoch": 0.12700332627759298, + "grad_norm": 1.0121475618102251, + "learning_rate": 9.99601594392078e-06, + "loss": 0.1944, + "step": 1470 + }, + { + "epoch": 0.12786729448356302, + "grad_norm": 0.9444136452659068, + "learning_rate": 9.995827624382157e-06, + "loss": 0.1959, + "step": 1480 + }, + { + "epoch": 0.12873126268953303, + "grad_norm": 1.0984999625010117, + "learning_rate": 9.995634958110243e-06, + "loss": 0.1979, + "step": 1490 + }, + { + "epoch": 0.12959523089550304, + "grad_norm": 0.969715543160458, + "learning_rate": 9.995437945272671e-06, + "loss": 0.1948, + "step": 1500 + }, + { + "epoch": 0.13045919910147308, + "grad_norm": 1.0241223672746604, + "learning_rate": 9.995236586040857e-06, + "loss": 0.1945, + "step": 1510 + }, + { + "epoch": 0.1313231673074431, + "grad_norm": 0.964222667785021, + "learning_rate": 9.995030880589998e-06, + "loss": 0.1941, + "step": 1520 + }, + { + "epoch": 0.1321871355134131, + "grad_norm": 0.9432330938575335, + "learning_rate": 9.994820829099074e-06, + "loss": 0.195, + "step": 1530 + }, + { + "epoch": 0.13305110371938314, + "grad_norm": 0.9307652199560369, + "learning_rate": 9.994606431750842e-06, + "loss": 0.1854, + "step": 1540 + }, + { + "epoch": 0.13391507192535315, + "grad_norm": 0.9310493339961098, + "learning_rate": 9.994387688731847e-06, + "loss": 0.1928, + "step": 1550 + }, + { + "epoch": 0.13477904013132316, + "grad_norm": 0.9439479070573529, + "learning_rate": 9.994164600232412e-06, + "loss": 0.195, + "step": 1560 + }, + { + "epoch": 0.13564300833729317, + "grad_norm": 0.9603028405137487, + "learning_rate": 9.993937166446635e-06, + "loss": 0.1898, + "step": 1570 + }, + { + "epoch": 0.13650697654326321, + "grad_norm": 0.8970445894391301, + "learning_rate": 9.993705387572404e-06, + "loss": 0.193, + "step": 1580 + }, + { + "epoch": 0.13737094474923323, + "grad_norm": 0.9137010906999963, + "learning_rate": 9.993469263811383e-06, + "loss": 0.1864, + "step": 1590 + }, + { + "epoch": 0.13823491295520324, + "grad_norm": 0.8831586632426319, + "learning_rate": 9.993228795369017e-06, + "loss": 0.1872, + "step": 1600 + }, + { + "epoch": 0.13909888116117328, + "grad_norm": 0.9134761502587991, + "learning_rate": 9.992983982454528e-06, + "loss": 0.1884, + "step": 1610 + }, + { + "epoch": 0.1399628493671433, + "grad_norm": 0.9339429524570831, + "learning_rate": 9.992734825280926e-06, + "loss": 0.1917, + "step": 1620 + }, + { + "epoch": 0.1408268175731133, + "grad_norm": 0.9113630951414461, + "learning_rate": 9.992481324064991e-06, + "loss": 0.1903, + "step": 1630 + }, + { + "epoch": 0.14169078577908334, + "grad_norm": 0.9585246215583206, + "learning_rate": 9.99222347902729e-06, + "loss": 0.1902, + "step": 1640 + }, + { + "epoch": 0.14255475398505335, + "grad_norm": 0.953890677401058, + "learning_rate": 9.991961290392166e-06, + "loss": 0.1875, + "step": 1650 + }, + { + "epoch": 0.14341872219102336, + "grad_norm": 0.8503479328191382, + "learning_rate": 9.991694758387744e-06, + "loss": 0.1887, + "step": 1660 + }, + { + "epoch": 0.1442826903969934, + "grad_norm": 0.9020194108982177, + "learning_rate": 9.991423883245925e-06, + "loss": 0.194, + "step": 1670 + }, + { + "epoch": 0.1451466586029634, + "grad_norm": 0.9083223160164683, + "learning_rate": 9.99114866520239e-06, + "loss": 0.1938, + "step": 1680 + }, + { + "epoch": 0.14601062680893342, + "grad_norm": 0.9090284777953003, + "learning_rate": 9.9908691044966e-06, + "loss": 0.189, + "step": 1690 + }, + { + "epoch": 0.14687459501490346, + "grad_norm": 0.9353875805981315, + "learning_rate": 9.99058520137179e-06, + "loss": 0.1876, + "step": 1700 + }, + { + "epoch": 0.14773856322087348, + "grad_norm": 0.9804296916341775, + "learning_rate": 9.990296956074979e-06, + "loss": 0.1882, + "step": 1710 + }, + { + "epoch": 0.1486025314268435, + "grad_norm": 0.8978507801311811, + "learning_rate": 9.99000436885696e-06, + "loss": 0.1863, + "step": 1720 + }, + { + "epoch": 0.1494664996328135, + "grad_norm": 0.9050475519755361, + "learning_rate": 9.989707439972306e-06, + "loss": 0.1833, + "step": 1730 + }, + { + "epoch": 0.15033046783878354, + "grad_norm": 0.9315891329324208, + "learning_rate": 9.989406169679367e-06, + "loss": 0.1913, + "step": 1740 + }, + { + "epoch": 0.15119443604475355, + "grad_norm": 0.922244731553486, + "learning_rate": 9.98910055824027e-06, + "loss": 0.1883, + "step": 1750 + }, + { + "epoch": 0.15205840425072356, + "grad_norm": 0.8667210354836978, + "learning_rate": 9.988790605920917e-06, + "loss": 0.1852, + "step": 1760 + }, + { + "epoch": 0.1529223724566936, + "grad_norm": 0.8814053572722416, + "learning_rate": 9.988476312990994e-06, + "loss": 0.1837, + "step": 1770 + }, + { + "epoch": 0.1537863406626636, + "grad_norm": 0.8845096592429691, + "learning_rate": 9.988157679723953e-06, + "loss": 0.1827, + "step": 1780 + }, + { + "epoch": 0.15465030886863362, + "grad_norm": 0.896642858191613, + "learning_rate": 9.98783470639703e-06, + "loss": 0.1853, + "step": 1790 + }, + { + "epoch": 0.15551427707460366, + "grad_norm": 0.9515141760201947, + "learning_rate": 9.987507393291238e-06, + "loss": 0.186, + "step": 1800 + }, + { + "epoch": 0.15637824528057367, + "grad_norm": 0.8838310226390965, + "learning_rate": 9.98717574069136e-06, + "loss": 0.1858, + "step": 1810 + }, + { + "epoch": 0.15724221348654369, + "grad_norm": 1.0419827856105834, + "learning_rate": 9.98683974888596e-06, + "loss": 0.1857, + "step": 1820 + }, + { + "epoch": 0.15810618169251373, + "grad_norm": 0.8554442292653766, + "learning_rate": 9.986499418167373e-06, + "loss": 0.1878, + "step": 1830 + }, + { + "epoch": 0.15897014989848374, + "grad_norm": 0.9172557697246417, + "learning_rate": 9.986154748831715e-06, + "loss": 0.1892, + "step": 1840 + }, + { + "epoch": 0.15983411810445375, + "grad_norm": 0.906975620762014, + "learning_rate": 9.98580574117887e-06, + "loss": 0.184, + "step": 1850 + }, + { + "epoch": 0.1606980863104238, + "grad_norm": 0.8860481268543452, + "learning_rate": 9.9854523955125e-06, + "loss": 0.1891, + "step": 1860 + }, + { + "epoch": 0.1615620545163938, + "grad_norm": 0.9766505643536513, + "learning_rate": 9.985094712140044e-06, + "loss": 0.1882, + "step": 1870 + }, + { + "epoch": 0.1624260227223638, + "grad_norm": 0.8698571209777873, + "learning_rate": 9.98473269137271e-06, + "loss": 0.185, + "step": 1880 + }, + { + "epoch": 0.16328999092833385, + "grad_norm": 0.8758942022132932, + "learning_rate": 9.984366333525483e-06, + "loss": 0.1875, + "step": 1890 + }, + { + "epoch": 0.16415395913430386, + "grad_norm": 0.8078901518813743, + "learning_rate": 9.983995638917122e-06, + "loss": 0.186, + "step": 1900 + }, + { + "epoch": 0.16501792734027387, + "grad_norm": 0.844074206643064, + "learning_rate": 9.98362060787016e-06, + "loss": 0.1829, + "step": 1910 + }, + { + "epoch": 0.16588189554624388, + "grad_norm": 0.9462307510554506, + "learning_rate": 9.983241240710897e-06, + "loss": 0.1849, + "step": 1920 + }, + { + "epoch": 0.16674586375221392, + "grad_norm": 0.8894512873356976, + "learning_rate": 9.982857537769412e-06, + "loss": 0.1804, + "step": 1930 + }, + { + "epoch": 0.16760983195818394, + "grad_norm": 0.8862445385159019, + "learning_rate": 9.982469499379556e-06, + "loss": 0.1855, + "step": 1940 + }, + { + "epoch": 0.16847380016415395, + "grad_norm": 0.8711563518645582, + "learning_rate": 9.982077125878948e-06, + "loss": 0.1814, + "step": 1950 + }, + { + "epoch": 0.169337768370124, + "grad_norm": 0.9050034309741795, + "learning_rate": 9.981680417608983e-06, + "loss": 0.1803, + "step": 1960 + }, + { + "epoch": 0.170201736576094, + "grad_norm": 0.9013350985231641, + "learning_rate": 9.981279374914826e-06, + "loss": 0.185, + "step": 1970 + }, + { + "epoch": 0.171065704782064, + "grad_norm": 0.8654792583323618, + "learning_rate": 9.980873998145413e-06, + "loss": 0.1825, + "step": 1980 + }, + { + "epoch": 0.17192967298803405, + "grad_norm": 0.8348173364643074, + "learning_rate": 9.980464287653451e-06, + "loss": 0.1798, + "step": 1990 + }, + { + "epoch": 0.17279364119400406, + "grad_norm": 0.8524350735085091, + "learning_rate": 9.980050243795418e-06, + "loss": 0.182, + "step": 2000 + }, + { + "epoch": 0.17365760939997407, + "grad_norm": 0.8823380446120698, + "learning_rate": 9.979631866931562e-06, + "loss": 0.1838, + "step": 2010 + }, + { + "epoch": 0.1745215776059441, + "grad_norm": 0.8990809252489962, + "learning_rate": 9.979209157425902e-06, + "loss": 0.1832, + "step": 2020 + }, + { + "epoch": 0.17538554581191412, + "grad_norm": 0.8737258448461562, + "learning_rate": 9.978782115646226e-06, + "loss": 0.1833, + "step": 2030 + }, + { + "epoch": 0.17624951401788413, + "grad_norm": 0.8539185597232698, + "learning_rate": 9.978350741964091e-06, + "loss": 0.1821, + "step": 2040 + }, + { + "epoch": 0.17711348222385417, + "grad_norm": 0.8718315063650803, + "learning_rate": 9.977915036754822e-06, + "loss": 0.1838, + "step": 2050 + }, + { + "epoch": 0.17797745042982419, + "grad_norm": 0.9045138495180984, + "learning_rate": 9.977475000397518e-06, + "loss": 0.1828, + "step": 2060 + }, + { + "epoch": 0.1788414186357942, + "grad_norm": 0.8655192511724848, + "learning_rate": 9.97703063327504e-06, + "loss": 0.1842, + "step": 2070 + }, + { + "epoch": 0.17970538684176424, + "grad_norm": 0.9058187600514871, + "learning_rate": 9.97658193577402e-06, + "loss": 0.1813, + "step": 2080 + }, + { + "epoch": 0.18056935504773425, + "grad_norm": 0.8534047113131205, + "learning_rate": 9.976128908284857e-06, + "loss": 0.1823, + "step": 2090 + }, + { + "epoch": 0.18143332325370426, + "grad_norm": 0.7845439254440908, + "learning_rate": 9.975671551201719e-06, + "loss": 0.1796, + "step": 2100 + }, + { + "epoch": 0.18229729145967427, + "grad_norm": 0.8348503967840605, + "learning_rate": 9.97520986492254e-06, + "loss": 0.1803, + "step": 2110 + }, + { + "epoch": 0.1831612596656443, + "grad_norm": 0.8434028762996415, + "learning_rate": 9.974743849849017e-06, + "loss": 0.1793, + "step": 2120 + }, + { + "epoch": 0.18402522787161432, + "grad_norm": 0.8447485487459954, + "learning_rate": 9.974273506386623e-06, + "loss": 0.1735, + "step": 2130 + }, + { + "epoch": 0.18488919607758433, + "grad_norm": 0.8030215035923475, + "learning_rate": 9.973798834944588e-06, + "loss": 0.1826, + "step": 2140 + }, + { + "epoch": 0.18575316428355437, + "grad_norm": 0.8518647928321847, + "learning_rate": 9.97331983593591e-06, + "loss": 0.1812, + "step": 2150 + }, + { + "epoch": 0.18661713248952438, + "grad_norm": 0.8494439395574299, + "learning_rate": 9.972836509777352e-06, + "loss": 0.1796, + "step": 2160 + }, + { + "epoch": 0.1874811006954944, + "grad_norm": 0.806243507038616, + "learning_rate": 9.972348856889447e-06, + "loss": 0.1803, + "step": 2170 + }, + { + "epoch": 0.18834506890146444, + "grad_norm": 0.7838675544587994, + "learning_rate": 9.971856877696483e-06, + "loss": 0.1808, + "step": 2180 + }, + { + "epoch": 0.18920903710743445, + "grad_norm": 0.8679707131997498, + "learning_rate": 9.971360572626525e-06, + "loss": 0.1795, + "step": 2190 + }, + { + "epoch": 0.19007300531340446, + "grad_norm": 0.8359795113698925, + "learning_rate": 9.970859942111387e-06, + "loss": 0.1829, + "step": 2200 + }, + { + "epoch": 0.1909369735193745, + "grad_norm": 0.8141606983614745, + "learning_rate": 9.97035498658666e-06, + "loss": 0.1775, + "step": 2210 + }, + { + "epoch": 0.1918009417253445, + "grad_norm": 0.8511617271228508, + "learning_rate": 9.969845706491686e-06, + "loss": 0.1819, + "step": 2220 + }, + { + "epoch": 0.19266490993131452, + "grad_norm": 0.8181597729741305, + "learning_rate": 9.96933210226958e-06, + "loss": 0.1816, + "step": 2230 + }, + { + "epoch": 0.19352887813728456, + "grad_norm": 0.8536969679710033, + "learning_rate": 9.968814174367214e-06, + "loss": 0.1795, + "step": 2240 + }, + { + "epoch": 0.19439284634325457, + "grad_norm": 0.8697086511401918, + "learning_rate": 9.968291923235222e-06, + "loss": 0.1799, + "step": 2250 + }, + { + "epoch": 0.19525681454922458, + "grad_norm": 0.8874621853306252, + "learning_rate": 9.967765349328003e-06, + "loss": 0.1748, + "step": 2260 + }, + { + "epoch": 0.19612078275519462, + "grad_norm": 0.828862563408282, + "learning_rate": 9.967234453103712e-06, + "loss": 0.1771, + "step": 2270 + }, + { + "epoch": 0.19698475096116463, + "grad_norm": 0.7766510398391139, + "learning_rate": 9.966699235024266e-06, + "loss": 0.178, + "step": 2280 + }, + { + "epoch": 0.19784871916713465, + "grad_norm": 0.854829690265069, + "learning_rate": 9.966159695555349e-06, + "loss": 0.177, + "step": 2290 + }, + { + "epoch": 0.19871268737310466, + "grad_norm": 0.8222531787450125, + "learning_rate": 9.965615835166396e-06, + "loss": 0.1774, + "step": 2300 + }, + { + "epoch": 0.1995766555790747, + "grad_norm": 0.8173464466969167, + "learning_rate": 9.965067654330604e-06, + "loss": 0.1749, + "step": 2310 + }, + { + "epoch": 0.2004406237850447, + "grad_norm": 0.7951744192540607, + "learning_rate": 9.964515153524932e-06, + "loss": 0.1771, + "step": 2320 + }, + { + "epoch": 0.20130459199101472, + "grad_norm": 0.8413750210733838, + "learning_rate": 9.963958333230097e-06, + "loss": 0.1792, + "step": 2330 + }, + { + "epoch": 0.20216856019698476, + "grad_norm": 0.8451327183723681, + "learning_rate": 9.96339719393057e-06, + "loss": 0.1812, + "step": 2340 + }, + { + "epoch": 0.20303252840295477, + "grad_norm": 0.8346875351231713, + "learning_rate": 9.962831736114585e-06, + "loss": 0.1732, + "step": 2350 + }, + { + "epoch": 0.20389649660892478, + "grad_norm": 0.7717467067418516, + "learning_rate": 9.962261960274132e-06, + "loss": 0.1753, + "step": 2360 + }, + { + "epoch": 0.20476046481489482, + "grad_norm": 0.8407722193490899, + "learning_rate": 9.961687866904954e-06, + "loss": 0.1747, + "step": 2370 + }, + { + "epoch": 0.20562443302086483, + "grad_norm": 0.8444934823842906, + "learning_rate": 9.961109456506559e-06, + "loss": 0.1766, + "step": 2380 + }, + { + "epoch": 0.20648840122683484, + "grad_norm": 0.8247517545324368, + "learning_rate": 9.960526729582203e-06, + "loss": 0.1708, + "step": 2390 + }, + { + "epoch": 0.20735236943280488, + "grad_norm": 0.8276154600773534, + "learning_rate": 9.959939686638901e-06, + "loss": 0.1754, + "step": 2400 + }, + { + "epoch": 0.2082163376387749, + "grad_norm": 0.7878727685300535, + "learning_rate": 9.959348328187424e-06, + "loss": 0.1769, + "step": 2410 + }, + { + "epoch": 0.2090803058447449, + "grad_norm": 0.8280838186074931, + "learning_rate": 9.958752654742296e-06, + "loss": 0.1761, + "step": 2420 + }, + { + "epoch": 0.20994427405071495, + "grad_norm": 0.8279865260833438, + "learning_rate": 9.958152666821796e-06, + "loss": 0.1788, + "step": 2430 + }, + { + "epoch": 0.21080824225668496, + "grad_norm": 0.8203639460882185, + "learning_rate": 9.957548364947959e-06, + "loss": 0.1784, + "step": 2440 + }, + { + "epoch": 0.21167221046265497, + "grad_norm": 0.7849791611006334, + "learning_rate": 9.95693974964657e-06, + "loss": 0.1761, + "step": 2450 + }, + { + "epoch": 0.21253617866862498, + "grad_norm": 0.7544264491850907, + "learning_rate": 9.956326821447168e-06, + "loss": 0.1735, + "step": 2460 + }, + { + "epoch": 0.21340014687459502, + "grad_norm": 0.7935822340896542, + "learning_rate": 9.955709580883047e-06, + "loss": 0.175, + "step": 2470 + }, + { + "epoch": 0.21426411508056503, + "grad_norm": 0.7652213678508909, + "learning_rate": 9.955088028491247e-06, + "loss": 0.1738, + "step": 2480 + }, + { + "epoch": 0.21512808328653504, + "grad_norm": 0.8065981747060027, + "learning_rate": 9.954462164812568e-06, + "loss": 0.1753, + "step": 2490 + }, + { + "epoch": 0.21599205149250508, + "grad_norm": 0.7773019508262726, + "learning_rate": 9.953831990391557e-06, + "loss": 0.1773, + "step": 2500 + }, + { + "epoch": 0.2168560196984751, + "grad_norm": 0.8167560180005594, + "learning_rate": 9.95319750577651e-06, + "loss": 0.1766, + "step": 2510 + }, + { + "epoch": 0.2177199879044451, + "grad_norm": 0.7745012569374494, + "learning_rate": 9.952558711519475e-06, + "loss": 0.1733, + "step": 2520 + }, + { + "epoch": 0.21858395611041515, + "grad_norm": 0.7882505435416458, + "learning_rate": 9.951915608176247e-06, + "loss": 0.173, + "step": 2530 + }, + { + "epoch": 0.21944792431638516, + "grad_norm": 0.7535315864659419, + "learning_rate": 9.951268196306379e-06, + "loss": 0.1724, + "step": 2540 + }, + { + "epoch": 0.22031189252235517, + "grad_norm": 0.7903619770040948, + "learning_rate": 9.950616476473161e-06, + "loss": 0.1776, + "step": 2550 + }, + { + "epoch": 0.2211758607283252, + "grad_norm": 0.827402074944646, + "learning_rate": 9.949960449243638e-06, + "loss": 0.1786, + "step": 2560 + }, + { + "epoch": 0.22203982893429522, + "grad_norm": 0.7786893208413252, + "learning_rate": 9.9493001151886e-06, + "loss": 0.177, + "step": 2570 + }, + { + "epoch": 0.22290379714026523, + "grad_norm": 0.7907751212309051, + "learning_rate": 9.94863547488259e-06, + "loss": 0.1761, + "step": 2580 + }, + { + "epoch": 0.22376776534623527, + "grad_norm": 0.8621582903262195, + "learning_rate": 9.94796652890389e-06, + "loss": 0.1759, + "step": 2590 + }, + { + "epoch": 0.22463173355220528, + "grad_norm": 0.806757543369109, + "learning_rate": 9.947293277834531e-06, + "loss": 0.1754, + "step": 2600 + }, + { + "epoch": 0.2254957017581753, + "grad_norm": 0.7527211655050535, + "learning_rate": 9.946615722260291e-06, + "loss": 0.1719, + "step": 2610 + }, + { + "epoch": 0.22635966996414533, + "grad_norm": 0.8447146520227704, + "learning_rate": 9.945933862770695e-06, + "loss": 0.1747, + "step": 2620 + }, + { + "epoch": 0.22722363817011534, + "grad_norm": 0.7430272978878533, + "learning_rate": 9.94524769995901e-06, + "loss": 0.1723, + "step": 2630 + }, + { + "epoch": 0.22808760637608536, + "grad_norm": 0.7758344166204706, + "learning_rate": 9.944557234422244e-06, + "loss": 0.173, + "step": 2640 + }, + { + "epoch": 0.22895157458205537, + "grad_norm": 0.7902705002047675, + "learning_rate": 9.943862466761154e-06, + "loss": 0.1776, + "step": 2650 + }, + { + "epoch": 0.2298155427880254, + "grad_norm": 0.8419673410782607, + "learning_rate": 9.943163397580237e-06, + "loss": 0.1702, + "step": 2660 + }, + { + "epoch": 0.23067951099399542, + "grad_norm": 0.8381561999082194, + "learning_rate": 9.94246002748774e-06, + "loss": 0.179, + "step": 2670 + }, + { + "epoch": 0.23154347919996543, + "grad_norm": 0.7574228443477528, + "learning_rate": 9.94175235709564e-06, + "loss": 0.1731, + "step": 2680 + }, + { + "epoch": 0.23240744740593547, + "grad_norm": 0.7911399295773064, + "learning_rate": 9.941040387019663e-06, + "loss": 0.1716, + "step": 2690 + }, + { + "epoch": 0.23327141561190548, + "grad_norm": 0.7898506911596503, + "learning_rate": 9.940324117879276e-06, + "loss": 0.1731, + "step": 2700 + }, + { + "epoch": 0.2341353838178755, + "grad_norm": 0.7981346247488054, + "learning_rate": 9.939603550297684e-06, + "loss": 0.1731, + "step": 2710 + }, + { + "epoch": 0.23499935202384553, + "grad_norm": 0.7584900113832275, + "learning_rate": 9.938878684901834e-06, + "loss": 0.17, + "step": 2720 + }, + { + "epoch": 0.23586332022981554, + "grad_norm": 0.803733635587881, + "learning_rate": 9.938149522322411e-06, + "loss": 0.1798, + "step": 2730 + }, + { + "epoch": 0.23672728843578555, + "grad_norm": 0.8047802032077452, + "learning_rate": 9.937416063193841e-06, + "loss": 0.1754, + "step": 2740 + }, + { + "epoch": 0.2375912566417556, + "grad_norm": 0.748847952157505, + "learning_rate": 9.936678308154283e-06, + "loss": 0.1721, + "step": 2750 + }, + { + "epoch": 0.2384552248477256, + "grad_norm": 0.7198022855499399, + "learning_rate": 9.935936257845643e-06, + "loss": 0.1704, + "step": 2760 + }, + { + "epoch": 0.23931919305369562, + "grad_norm": 1.0481929988558047, + "learning_rate": 9.935189912913555e-06, + "loss": 0.1702, + "step": 2770 + }, + { + "epoch": 0.24018316125966566, + "grad_norm": 0.781807883224081, + "learning_rate": 9.934439274007392e-06, + "loss": 0.1752, + "step": 2780 + }, + { + "epoch": 0.24104712946563567, + "grad_norm": 0.7830306240360945, + "learning_rate": 9.93368434178027e-06, + "loss": 0.1771, + "step": 2790 + }, + { + "epoch": 0.24191109767160568, + "grad_norm": 0.7546245521114135, + "learning_rate": 9.93292511688903e-06, + "loss": 0.1726, + "step": 2800 + }, + { + "epoch": 0.24277506587757572, + "grad_norm": 0.769112224734122, + "learning_rate": 9.932161599994253e-06, + "loss": 0.1732, + "step": 2810 + }, + { + "epoch": 0.24363903408354573, + "grad_norm": 0.8248961320812401, + "learning_rate": 9.931393791760258e-06, + "loss": 0.1763, + "step": 2820 + }, + { + "epoch": 0.24450300228951574, + "grad_norm": 0.7577418125024905, + "learning_rate": 9.930621692855089e-06, + "loss": 0.1715, + "step": 2830 + }, + { + "epoch": 0.24536697049548575, + "grad_norm": 0.7300877318840987, + "learning_rate": 9.929845303950533e-06, + "loss": 0.1699, + "step": 2840 + }, + { + "epoch": 0.2462309387014558, + "grad_norm": 0.7810888803907006, + "learning_rate": 9.929064625722103e-06, + "loss": 0.1727, + "step": 2850 + }, + { + "epoch": 0.2470949069074258, + "grad_norm": 0.7601317930300447, + "learning_rate": 9.928279658849044e-06, + "loss": 0.1696, + "step": 2860 + }, + { + "epoch": 0.24795887511339582, + "grad_norm": 0.7865052537791539, + "learning_rate": 9.927490404014335e-06, + "loss": 0.1749, + "step": 2870 + }, + { + "epoch": 0.24882284331936586, + "grad_norm": 0.7668012354719749, + "learning_rate": 9.926696861904688e-06, + "loss": 0.1727, + "step": 2880 + }, + { + "epoch": 0.24968681152533587, + "grad_norm": 0.7585293542729381, + "learning_rate": 9.925899033210537e-06, + "loss": 0.1687, + "step": 2890 + }, + { + "epoch": 0.2505507797313059, + "grad_norm": 0.7357294583323162, + "learning_rate": 9.925096918626057e-06, + "loss": 0.1732, + "step": 2900 + }, + { + "epoch": 0.2514147479372759, + "grad_norm": 0.7723496222824552, + "learning_rate": 9.924290518849143e-06, + "loss": 0.1667, + "step": 2910 + }, + { + "epoch": 0.25227871614324593, + "grad_norm": 0.7719742079374611, + "learning_rate": 9.92347983458142e-06, + "loss": 0.1672, + "step": 2920 + }, + { + "epoch": 0.25314268434921594, + "grad_norm": 0.7707056771464336, + "learning_rate": 9.922664866528245e-06, + "loss": 0.1746, + "step": 2930 + }, + { + "epoch": 0.25400665255518595, + "grad_norm": 0.7292053975039586, + "learning_rate": 9.921845615398696e-06, + "loss": 0.1669, + "step": 2940 + }, + { + "epoch": 0.25487062076115596, + "grad_norm": 0.7241856067663586, + "learning_rate": 9.921022081905584e-06, + "loss": 0.1663, + "step": 2950 + }, + { + "epoch": 0.25573458896712603, + "grad_norm": 0.7466682681906929, + "learning_rate": 9.920194266765443e-06, + "loss": 0.172, + "step": 2960 + }, + { + "epoch": 0.25659855717309604, + "grad_norm": 0.7453627408950962, + "learning_rate": 9.919362170698535e-06, + "loss": 0.1688, + "step": 2970 + }, + { + "epoch": 0.25746252537906605, + "grad_norm": 0.7355063590474924, + "learning_rate": 9.918525794428835e-06, + "loss": 0.1724, + "step": 2980 + }, + { + "epoch": 0.25832649358503607, + "grad_norm": 0.7790954819757584, + "learning_rate": 9.917685138684061e-06, + "loss": 0.1691, + "step": 2990 + }, + { + "epoch": 0.2591904617910061, + "grad_norm": 0.765744357026, + "learning_rate": 9.91684020419564e-06, + "loss": 0.1706, + "step": 3000 + }, + { + "epoch": 0.2600544299969761, + "grad_norm": 0.7430626764761369, + "learning_rate": 9.915990991698725e-06, + "loss": 0.1748, + "step": 3010 + }, + { + "epoch": 0.26091839820294616, + "grad_norm": 0.7305081505894514, + "learning_rate": 9.915137501932196e-06, + "loss": 0.1694, + "step": 3020 + }, + { + "epoch": 0.26178236640891617, + "grad_norm": 0.7738500765507736, + "learning_rate": 9.91427973563865e-06, + "loss": 0.1667, + "step": 3030 + }, + { + "epoch": 0.2626463346148862, + "grad_norm": 0.7902256019188909, + "learning_rate": 9.913417693564406e-06, + "loss": 0.1696, + "step": 3040 + }, + { + "epoch": 0.2635103028208562, + "grad_norm": 0.7646679714717656, + "learning_rate": 9.912551376459502e-06, + "loss": 0.1657, + "step": 3050 + }, + { + "epoch": 0.2643742710268262, + "grad_norm": 0.7451146026894049, + "learning_rate": 9.911680785077699e-06, + "loss": 0.1703, + "step": 3060 + }, + { + "epoch": 0.2652382392327962, + "grad_norm": 0.7807148744435659, + "learning_rate": 9.910805920176472e-06, + "loss": 0.1706, + "step": 3070 + }, + { + "epoch": 0.2661022074387663, + "grad_norm": 0.7263055380070994, + "learning_rate": 9.90992678251702e-06, + "loss": 0.1681, + "step": 3080 + }, + { + "epoch": 0.2669661756447363, + "grad_norm": 0.7663865032837028, + "learning_rate": 9.909043372864256e-06, + "loss": 0.1667, + "step": 3090 + }, + { + "epoch": 0.2678301438507063, + "grad_norm": 0.7186567166331693, + "learning_rate": 9.90815569198681e-06, + "loss": 0.1643, + "step": 3100 + }, + { + "epoch": 0.2686941120566763, + "grad_norm": 0.7614040177140725, + "learning_rate": 9.90726374065703e-06, + "loss": 0.1712, + "step": 3110 + }, + { + "epoch": 0.2695580802626463, + "grad_norm": 0.7593484797688069, + "learning_rate": 9.906367519650976e-06, + "loss": 0.1727, + "step": 3120 + }, + { + "epoch": 0.27042204846861634, + "grad_norm": 0.7784846405411727, + "learning_rate": 9.905467029748427e-06, + "loss": 0.1731, + "step": 3130 + }, + { + "epoch": 0.27128601667458635, + "grad_norm": 0.6936575897699844, + "learning_rate": 9.904562271732877e-06, + "loss": 0.1709, + "step": 3140 + }, + { + "epoch": 0.2721499848805564, + "grad_norm": 0.7455742600336882, + "learning_rate": 9.903653246391526e-06, + "loss": 0.1712, + "step": 3150 + }, + { + "epoch": 0.27301395308652643, + "grad_norm": 0.7639542012395595, + "learning_rate": 9.902739954515298e-06, + "loss": 0.1692, + "step": 3160 + }, + { + "epoch": 0.27387792129249644, + "grad_norm": 0.7654658583654708, + "learning_rate": 9.90182239689882e-06, + "loss": 0.1718, + "step": 3170 + }, + { + "epoch": 0.27474188949846645, + "grad_norm": 0.7254925144092764, + "learning_rate": 9.900900574340433e-06, + "loss": 0.1711, + "step": 3180 + }, + { + "epoch": 0.27560585770443646, + "grad_norm": 0.7784945029552791, + "learning_rate": 9.899974487642191e-06, + "loss": 0.1696, + "step": 3190 + }, + { + "epoch": 0.2764698259104065, + "grad_norm": 0.7292362264828972, + "learning_rate": 9.899044137609857e-06, + "loss": 0.1682, + "step": 3200 + }, + { + "epoch": 0.27733379411637654, + "grad_norm": 0.7488659895432636, + "learning_rate": 9.898109525052904e-06, + "loss": 0.1669, + "step": 3210 + }, + { + "epoch": 0.27819776232234655, + "grad_norm": 0.7356583051338673, + "learning_rate": 9.89717065078451e-06, + "loss": 0.1619, + "step": 3220 + }, + { + "epoch": 0.27906173052831657, + "grad_norm": 0.72285525580165, + "learning_rate": 9.896227515621567e-06, + "loss": 0.1661, + "step": 3230 + }, + { + "epoch": 0.2799256987342866, + "grad_norm": 0.7339085499346532, + "learning_rate": 9.89528012038467e-06, + "loss": 0.1668, + "step": 3240 + }, + { + "epoch": 0.2807896669402566, + "grad_norm": 0.7303332944076532, + "learning_rate": 9.89432846589812e-06, + "loss": 0.1686, + "step": 3250 + }, + { + "epoch": 0.2816536351462266, + "grad_norm": 0.7801708995133046, + "learning_rate": 9.893372552989928e-06, + "loss": 0.167, + "step": 3260 + }, + { + "epoch": 0.28251760335219667, + "grad_norm": 0.7841519896635663, + "learning_rate": 9.892412382491808e-06, + "loss": 0.1687, + "step": 3270 + }, + { + "epoch": 0.2833815715581667, + "grad_norm": 0.7379472562953742, + "learning_rate": 9.891447955239177e-06, + "loss": 0.1678, + "step": 3280 + }, + { + "epoch": 0.2842455397641367, + "grad_norm": 0.7477316830102703, + "learning_rate": 9.890479272071156e-06, + "loss": 0.1682, + "step": 3290 + }, + { + "epoch": 0.2851095079701067, + "grad_norm": 0.7499765853953004, + "learning_rate": 9.88950633383057e-06, + "loss": 0.173, + "step": 3300 + }, + { + "epoch": 0.2859734761760767, + "grad_norm": 0.7617572357950421, + "learning_rate": 9.888529141363949e-06, + "loss": 0.1692, + "step": 3310 + }, + { + "epoch": 0.2868374443820467, + "grad_norm": 0.7928336156705718, + "learning_rate": 9.887547695521518e-06, + "loss": 0.1655, + "step": 3320 + }, + { + "epoch": 0.28770141258801674, + "grad_norm": 0.7303417343810876, + "learning_rate": 9.886561997157207e-06, + "loss": 0.163, + "step": 3330 + }, + { + "epoch": 0.2885653807939868, + "grad_norm": 0.7364254164460936, + "learning_rate": 9.885572047128646e-06, + "loss": 0.1717, + "step": 3340 + }, + { + "epoch": 0.2894293489999568, + "grad_norm": 0.7948365440268766, + "learning_rate": 9.884577846297163e-06, + "loss": 0.1718, + "step": 3350 + }, + { + "epoch": 0.2902933172059268, + "grad_norm": 0.7305274500423871, + "learning_rate": 9.883579395527787e-06, + "loss": 0.1641, + "step": 3360 + }, + { + "epoch": 0.29115728541189684, + "grad_norm": 0.7134297636334582, + "learning_rate": 9.882576695689239e-06, + "loss": 0.1691, + "step": 3370 + }, + { + "epoch": 0.29202125361786685, + "grad_norm": 0.6925354450672917, + "learning_rate": 9.881569747653943e-06, + "loss": 0.1683, + "step": 3380 + }, + { + "epoch": 0.29288522182383686, + "grad_norm": 0.7386046590981896, + "learning_rate": 9.880558552298018e-06, + "loss": 0.1713, + "step": 3390 + }, + { + "epoch": 0.29374919002980693, + "grad_norm": 0.721795755790784, + "learning_rate": 9.879543110501276e-06, + "loss": 0.1667, + "step": 3400 + }, + { + "epoch": 0.29461315823577694, + "grad_norm": 0.7281554532294857, + "learning_rate": 9.878523423147223e-06, + "loss": 0.1667, + "step": 3410 + }, + { + "epoch": 0.29547712644174695, + "grad_norm": 0.7232451772483279, + "learning_rate": 9.877499491123066e-06, + "loss": 0.1682, + "step": 3420 + }, + { + "epoch": 0.29634109464771696, + "grad_norm": 0.712697831818986, + "learning_rate": 9.876471315319699e-06, + "loss": 0.1712, + "step": 3430 + }, + { + "epoch": 0.297205062853687, + "grad_norm": 0.7314848374059167, + "learning_rate": 9.875438896631706e-06, + "loss": 0.1669, + "step": 3440 + }, + { + "epoch": 0.298069031059657, + "grad_norm": 0.7223661801150361, + "learning_rate": 9.87440223595737e-06, + "loss": 0.1694, + "step": 3450 + }, + { + "epoch": 0.298932999265627, + "grad_norm": 0.7297043299090169, + "learning_rate": 9.873361334198661e-06, + "loss": 0.1603, + "step": 3460 + }, + { + "epoch": 0.29979696747159706, + "grad_norm": 0.7633676240410845, + "learning_rate": 9.872316192261238e-06, + "loss": 0.1657, + "step": 3470 + }, + { + "epoch": 0.3006609356775671, + "grad_norm": 0.6987340528429398, + "learning_rate": 9.871266811054449e-06, + "loss": 0.1705, + "step": 3480 + }, + { + "epoch": 0.3015249038835371, + "grad_norm": 0.7405716794127658, + "learning_rate": 9.870213191491335e-06, + "loss": 0.1653, + "step": 3490 + }, + { + "epoch": 0.3023888720895071, + "grad_norm": 0.7479032250490073, + "learning_rate": 9.869155334488622e-06, + "loss": 0.1618, + "step": 3500 + }, + { + "epoch": 0.3032528402954771, + "grad_norm": 0.7501069449128815, + "learning_rate": 9.86809324096672e-06, + "loss": 0.1619, + "step": 3510 + }, + { + "epoch": 0.3041168085014471, + "grad_norm": 0.7702944945336244, + "learning_rate": 9.867026911849728e-06, + "loss": 0.1709, + "step": 3520 + }, + { + "epoch": 0.3049807767074172, + "grad_norm": 0.7111948089944916, + "learning_rate": 9.865956348065431e-06, + "loss": 0.1695, + "step": 3530 + }, + { + "epoch": 0.3058447449133872, + "grad_norm": 0.7294603405967993, + "learning_rate": 9.864881550545296e-06, + "loss": 0.1655, + "step": 3540 + }, + { + "epoch": 0.3067087131193572, + "grad_norm": 0.7354788765353012, + "learning_rate": 9.863802520224474e-06, + "loss": 0.1651, + "step": 3550 + }, + { + "epoch": 0.3075726813253272, + "grad_norm": 0.717913826033429, + "learning_rate": 9.862719258041804e-06, + "loss": 0.1623, + "step": 3560 + }, + { + "epoch": 0.30843664953129724, + "grad_norm": 0.7612142871404554, + "learning_rate": 9.8616317649398e-06, + "loss": 0.1717, + "step": 3570 + }, + { + "epoch": 0.30930061773726725, + "grad_norm": 0.7464556010648316, + "learning_rate": 9.860540041864662e-06, + "loss": 0.1642, + "step": 3580 + }, + { + "epoch": 0.3101645859432373, + "grad_norm": 0.7331397054467885, + "learning_rate": 9.859444089766264e-06, + "loss": 0.1673, + "step": 3590 + }, + { + "epoch": 0.3110285541492073, + "grad_norm": 0.7874566449044291, + "learning_rate": 9.85834390959817e-06, + "loss": 0.1658, + "step": 3600 + }, + { + "epoch": 0.31189252235517734, + "grad_norm": 0.7535513082033617, + "learning_rate": 9.85723950231761e-06, + "loss": 0.1675, + "step": 3610 + }, + { + "epoch": 0.31275649056114735, + "grad_norm": 0.7159751469299143, + "learning_rate": 9.856130868885505e-06, + "loss": 0.1679, + "step": 3620 + }, + { + "epoch": 0.31362045876711736, + "grad_norm": 0.6813273679617754, + "learning_rate": 9.855018010266443e-06, + "loss": 0.1664, + "step": 3630 + }, + { + "epoch": 0.31448442697308737, + "grad_norm": 0.7173235203424494, + "learning_rate": 9.853900927428694e-06, + "loss": 0.163, + "step": 3640 + }, + { + "epoch": 0.3153483951790574, + "grad_norm": 0.7041005574822984, + "learning_rate": 9.852779621344199e-06, + "loss": 0.1648, + "step": 3650 + }, + { + "epoch": 0.31621236338502745, + "grad_norm": 0.7814286351256942, + "learning_rate": 9.851654092988578e-06, + "loss": 0.1649, + "step": 3660 + }, + { + "epoch": 0.31707633159099746, + "grad_norm": 0.726258361940594, + "learning_rate": 9.850524343341121e-06, + "loss": 0.1662, + "step": 3670 + }, + { + "epoch": 0.3179402997969675, + "grad_norm": 0.7291712487787225, + "learning_rate": 9.849390373384793e-06, + "loss": 0.1616, + "step": 3680 + }, + { + "epoch": 0.3188042680029375, + "grad_norm": 0.7562894585114789, + "learning_rate": 9.84825218410623e-06, + "loss": 0.1646, + "step": 3690 + }, + { + "epoch": 0.3196682362089075, + "grad_norm": 0.7025509334909738, + "learning_rate": 9.84710977649574e-06, + "loss": 0.1649, + "step": 3700 + }, + { + "epoch": 0.3205322044148775, + "grad_norm": 0.7193689188065979, + "learning_rate": 9.845963151547302e-06, + "loss": 0.167, + "step": 3710 + }, + { + "epoch": 0.3213961726208476, + "grad_norm": 0.7177232302086264, + "learning_rate": 9.84481231025856e-06, + "loss": 0.1648, + "step": 3720 + }, + { + "epoch": 0.3222601408268176, + "grad_norm": 0.7181865122617623, + "learning_rate": 9.84365725363083e-06, + "loss": 0.166, + "step": 3730 + }, + { + "epoch": 0.3231241090327876, + "grad_norm": 0.7522519602597432, + "learning_rate": 9.842497982669097e-06, + "loss": 0.1665, + "step": 3740 + }, + { + "epoch": 0.3239880772387576, + "grad_norm": 0.7487144507984922, + "learning_rate": 9.84133449838201e-06, + "loss": 0.1601, + "step": 3750 + }, + { + "epoch": 0.3248520454447276, + "grad_norm": 0.7042965860824565, + "learning_rate": 9.840166801781887e-06, + "loss": 0.1638, + "step": 3760 + }, + { + "epoch": 0.32571601365069763, + "grad_norm": 0.7187756482975587, + "learning_rate": 9.838994893884705e-06, + "loss": 0.1695, + "step": 3770 + }, + { + "epoch": 0.3265799818566677, + "grad_norm": 0.6995611012661251, + "learning_rate": 9.837818775710114e-06, + "loss": 0.1648, + "step": 3780 + }, + { + "epoch": 0.3274439500626377, + "grad_norm": 0.7608691305524844, + "learning_rate": 9.836638448281417e-06, + "loss": 0.1654, + "step": 3790 + }, + { + "epoch": 0.3283079182686077, + "grad_norm": 0.7289093633796809, + "learning_rate": 9.835453912625587e-06, + "loss": 0.1667, + "step": 3800 + }, + { + "epoch": 0.32917188647457774, + "grad_norm": 0.7629065996737572, + "learning_rate": 9.834265169773259e-06, + "loss": 0.1713, + "step": 3810 + }, + { + "epoch": 0.33003585468054775, + "grad_norm": 0.6727524986501837, + "learning_rate": 9.833072220758722e-06, + "loss": 0.1646, + "step": 3820 + }, + { + "epoch": 0.33089982288651776, + "grad_norm": 0.7491865346889776, + "learning_rate": 9.831875066619929e-06, + "loss": 0.1645, + "step": 3830 + }, + { + "epoch": 0.33176379109248777, + "grad_norm": 0.7645752796321156, + "learning_rate": 9.830673708398492e-06, + "loss": 0.1633, + "step": 3840 + }, + { + "epoch": 0.33262775929845784, + "grad_norm": 0.7036207961815555, + "learning_rate": 9.829468147139681e-06, + "loss": 0.1653, + "step": 3850 + }, + { + "epoch": 0.33349172750442785, + "grad_norm": 0.6558514059493622, + "learning_rate": 9.828258383892419e-06, + "loss": 0.1593, + "step": 3860 + }, + { + "epoch": 0.33435569571039786, + "grad_norm": 0.6523169696559058, + "learning_rate": 9.827044419709289e-06, + "loss": 0.1665, + "step": 3870 + }, + { + "epoch": 0.33521966391636787, + "grad_norm": 0.719938636914537, + "learning_rate": 9.825826255646532e-06, + "loss": 0.1654, + "step": 3880 + }, + { + "epoch": 0.3360836321223379, + "grad_norm": 0.9142436257642391, + "learning_rate": 9.824603892764033e-06, + "loss": 0.1655, + "step": 3890 + }, + { + "epoch": 0.3369476003283079, + "grad_norm": 0.7308837755716424, + "learning_rate": 9.82337733212534e-06, + "loss": 0.1647, + "step": 3900 + }, + { + "epoch": 0.33781156853427796, + "grad_norm": 0.7170945250305703, + "learning_rate": 9.82214657479765e-06, + "loss": 0.165, + "step": 3910 + }, + { + "epoch": 0.338675536740248, + "grad_norm": 0.7101772423726191, + "learning_rate": 9.820911621851813e-06, + "loss": 0.1674, + "step": 3920 + }, + { + "epoch": 0.339539504946218, + "grad_norm": 0.7105871752653151, + "learning_rate": 9.819672474362324e-06, + "loss": 0.1661, + "step": 3930 + }, + { + "epoch": 0.340403473152188, + "grad_norm": 0.6928866893620583, + "learning_rate": 9.818429133407332e-06, + "loss": 0.1639, + "step": 3940 + }, + { + "epoch": 0.341267441358158, + "grad_norm": 0.7070762196788623, + "learning_rate": 9.817181600068636e-06, + "loss": 0.1598, + "step": 3950 + }, + { + "epoch": 0.342131409564128, + "grad_norm": 0.7131526299925895, + "learning_rate": 9.81592987543168e-06, + "loss": 0.1603, + "step": 3960 + }, + { + "epoch": 0.3429953777700981, + "grad_norm": 0.6857882956396996, + "learning_rate": 9.814673960585556e-06, + "loss": 0.166, + "step": 3970 + }, + { + "epoch": 0.3438593459760681, + "grad_norm": 0.7335700923086347, + "learning_rate": 9.813413856623002e-06, + "loss": 0.167, + "step": 3980 + }, + { + "epoch": 0.3447233141820381, + "grad_norm": 0.6928897108991285, + "learning_rate": 9.812149564640397e-06, + "loss": 0.1612, + "step": 3990 + }, + { + "epoch": 0.3455872823880081, + "grad_norm": 0.7266942679789092, + "learning_rate": 9.810881085737769e-06, + "loss": 0.1617, + "step": 4000 + }, + { + "epoch": 0.34645125059397813, + "grad_norm": 0.7271458299087374, + "learning_rate": 9.809608421018786e-06, + "loss": 0.1606, + "step": 4010 + }, + { + "epoch": 0.34731521879994814, + "grad_norm": 0.7087330966470788, + "learning_rate": 9.80833157159076e-06, + "loss": 0.1635, + "step": 4020 + }, + { + "epoch": 0.34817918700591816, + "grad_norm": 0.6481443225031476, + "learning_rate": 9.807050538564644e-06, + "loss": 0.1577, + "step": 4030 + }, + { + "epoch": 0.3490431552118882, + "grad_norm": 0.701382417957392, + "learning_rate": 9.805765323055025e-06, + "loss": 0.1603, + "step": 4040 + }, + { + "epoch": 0.34990712341785823, + "grad_norm": 0.6975531043581854, + "learning_rate": 9.804475926180139e-06, + "loss": 0.1658, + "step": 4050 + }, + { + "epoch": 0.35077109162382825, + "grad_norm": 0.738431014281204, + "learning_rate": 9.803182349061853e-06, + "loss": 0.1655, + "step": 4060 + }, + { + "epoch": 0.35163505982979826, + "grad_norm": 0.7081474595206806, + "learning_rate": 9.801884592825673e-06, + "loss": 0.1632, + "step": 4070 + }, + { + "epoch": 0.35249902803576827, + "grad_norm": 0.6865576639672578, + "learning_rate": 9.80058265860074e-06, + "loss": 0.1683, + "step": 4080 + }, + { + "epoch": 0.3533629962417383, + "grad_norm": 0.7273806197401467, + "learning_rate": 9.799276547519836e-06, + "loss": 0.1663, + "step": 4090 + }, + { + "epoch": 0.35422696444770835, + "grad_norm": 0.7164869319313873, + "learning_rate": 9.797966260719369e-06, + "loss": 0.1632, + "step": 4100 + }, + { + "epoch": 0.35509093265367836, + "grad_norm": 0.6888892295698765, + "learning_rate": 9.796651799339383e-06, + "loss": 0.1642, + "step": 4110 + }, + { + "epoch": 0.35595490085964837, + "grad_norm": 0.6597106187788412, + "learning_rate": 9.795333164523557e-06, + "loss": 0.1662, + "step": 4120 + }, + { + "epoch": 0.3568188690656184, + "grad_norm": 0.7121146624840744, + "learning_rate": 9.7940103574192e-06, + "loss": 0.1629, + "step": 4130 + }, + { + "epoch": 0.3576828372715884, + "grad_norm": 0.7300742422291335, + "learning_rate": 9.792683379177249e-06, + "loss": 0.1631, + "step": 4140 + }, + { + "epoch": 0.3585468054775584, + "grad_norm": 0.7310423779575642, + "learning_rate": 9.791352230952269e-06, + "loss": 0.1669, + "step": 4150 + }, + { + "epoch": 0.3594107736835285, + "grad_norm": 0.7040579907657136, + "learning_rate": 9.790016913902458e-06, + "loss": 0.1657, + "step": 4160 + }, + { + "epoch": 0.3602747418894985, + "grad_norm": 0.6613582124056474, + "learning_rate": 9.788677429189642e-06, + "loss": 0.165, + "step": 4170 + }, + { + "epoch": 0.3611387100954685, + "grad_norm": 0.7137351700890041, + "learning_rate": 9.787333777979266e-06, + "loss": 0.1628, + "step": 4180 + }, + { + "epoch": 0.3620026783014385, + "grad_norm": 0.6950454669747205, + "learning_rate": 9.785985961440405e-06, + "loss": 0.166, + "step": 4190 + }, + { + "epoch": 0.3628666465074085, + "grad_norm": 0.690613286107089, + "learning_rate": 9.784633980745756e-06, + "loss": 0.1644, + "step": 4200 + }, + { + "epoch": 0.36373061471337853, + "grad_norm": 0.6770452196316469, + "learning_rate": 9.783277837071647e-06, + "loss": 0.1589, + "step": 4210 + }, + { + "epoch": 0.36459458291934854, + "grad_norm": 0.7351025780843091, + "learning_rate": 9.781917531598013e-06, + "loss": 0.1605, + "step": 4220 + }, + { + "epoch": 0.3654585511253186, + "grad_norm": 0.7029824747231594, + "learning_rate": 9.780553065508424e-06, + "loss": 0.1589, + "step": 4230 + }, + { + "epoch": 0.3663225193312886, + "grad_norm": 0.7065773579594148, + "learning_rate": 9.779184439990064e-06, + "loss": 0.1671, + "step": 4240 + }, + { + "epoch": 0.36718648753725863, + "grad_norm": 0.6659108075139156, + "learning_rate": 9.777811656233738e-06, + "loss": 0.1677, + "step": 4250 + }, + { + "epoch": 0.36805045574322864, + "grad_norm": 0.699215116558182, + "learning_rate": 9.776434715433863e-06, + "loss": 0.163, + "step": 4260 + }, + { + "epoch": 0.36891442394919866, + "grad_norm": 0.706583481787705, + "learning_rate": 9.775053618788482e-06, + "loss": 0.1647, + "step": 4270 + }, + { + "epoch": 0.36977839215516867, + "grad_norm": 0.6874162890009886, + "learning_rate": 9.773668367499246e-06, + "loss": 0.1673, + "step": 4280 + }, + { + "epoch": 0.37064236036113873, + "grad_norm": 0.7546642378109868, + "learning_rate": 9.772278962771427e-06, + "loss": 0.164, + "step": 4290 + }, + { + "epoch": 0.37150632856710875, + "grad_norm": 0.6958279018093277, + "learning_rate": 9.770885405813907e-06, + "loss": 0.1639, + "step": 4300 + }, + { + "epoch": 0.37237029677307876, + "grad_norm": 0.6863783205097246, + "learning_rate": 9.769487697839184e-06, + "loss": 0.1606, + "step": 4310 + }, + { + "epoch": 0.37323426497904877, + "grad_norm": 0.7660038203574383, + "learning_rate": 9.768085840063363e-06, + "loss": 0.1658, + "step": 4320 + }, + { + "epoch": 0.3740982331850188, + "grad_norm": 0.7310683895324374, + "learning_rate": 9.766679833706163e-06, + "loss": 0.165, + "step": 4330 + }, + { + "epoch": 0.3749622013909888, + "grad_norm": 0.6682511713805321, + "learning_rate": 9.765269679990913e-06, + "loss": 0.1613, + "step": 4340 + }, + { + "epoch": 0.37582616959695886, + "grad_norm": 0.7295984036992738, + "learning_rate": 9.763855380144546e-06, + "loss": 0.1627, + "step": 4350 + }, + { + "epoch": 0.37669013780292887, + "grad_norm": 0.6661232017039217, + "learning_rate": 9.762436935397608e-06, + "loss": 0.1583, + "step": 4360 + }, + { + "epoch": 0.3775541060088989, + "grad_norm": 0.6649991756043743, + "learning_rate": 9.76101434698425e-06, + "loss": 0.1618, + "step": 4370 + }, + { + "epoch": 0.3784180742148689, + "grad_norm": 0.6923759634173425, + "learning_rate": 9.759587616142225e-06, + "loss": 0.163, + "step": 4380 + }, + { + "epoch": 0.3792820424208389, + "grad_norm": 0.6871423621900502, + "learning_rate": 9.758156744112895e-06, + "loss": 0.1597, + "step": 4390 + }, + { + "epoch": 0.3801460106268089, + "grad_norm": 0.7069305432340762, + "learning_rate": 9.75672173214122e-06, + "loss": 0.1674, + "step": 4400 + }, + { + "epoch": 0.38100997883277893, + "grad_norm": 0.7103059022932815, + "learning_rate": 9.755282581475769e-06, + "loss": 0.1606, + "step": 4410 + }, + { + "epoch": 0.381873947038749, + "grad_norm": 0.6899949710005489, + "learning_rate": 9.753839293368704e-06, + "loss": 0.1619, + "step": 4420 + }, + { + "epoch": 0.382737915244719, + "grad_norm": 0.6866404341566841, + "learning_rate": 9.752391869075791e-06, + "loss": 0.1621, + "step": 4430 + }, + { + "epoch": 0.383601883450689, + "grad_norm": 0.6747754230556255, + "learning_rate": 9.750940309856393e-06, + "loss": 0.1613, + "step": 4440 + }, + { + "epoch": 0.38446585165665903, + "grad_norm": 0.7249657876338548, + "learning_rate": 9.749484616973478e-06, + "loss": 0.1613, + "step": 4450 + }, + { + "epoch": 0.38532981986262904, + "grad_norm": 0.7105209288147724, + "learning_rate": 9.748024791693598e-06, + "loss": 0.1569, + "step": 4460 + }, + { + "epoch": 0.38619378806859905, + "grad_norm": 0.6574038542965889, + "learning_rate": 9.74656083528691e-06, + "loss": 0.1597, + "step": 4470 + }, + { + "epoch": 0.3870577562745691, + "grad_norm": 0.6480703817246564, + "learning_rate": 9.745092749027163e-06, + "loss": 0.1603, + "step": 4480 + }, + { + "epoch": 0.38792172448053913, + "grad_norm": 0.6415644598250044, + "learning_rate": 9.743620534191698e-06, + "loss": 0.1588, + "step": 4490 + }, + { + "epoch": 0.38878569268650914, + "grad_norm": 0.6805308455140375, + "learning_rate": 9.74214419206145e-06, + "loss": 0.1629, + "step": 4500 + }, + { + "epoch": 0.38964966089247915, + "grad_norm": 0.7330716655075384, + "learning_rate": 9.74066372392094e-06, + "loss": 0.161, + "step": 4510 + }, + { + "epoch": 0.39051362909844917, + "grad_norm": 0.6818706767824103, + "learning_rate": 9.73917913105829e-06, + "loss": 0.1591, + "step": 4520 + }, + { + "epoch": 0.3913775973044192, + "grad_norm": 0.7047990410076855, + "learning_rate": 9.737690414765198e-06, + "loss": 0.1641, + "step": 4530 + }, + { + "epoch": 0.39224156551038925, + "grad_norm": 0.6958885127192019, + "learning_rate": 9.736197576336957e-06, + "loss": 0.1598, + "step": 4540 + }, + { + "epoch": 0.39310553371635926, + "grad_norm": 0.7029461177460052, + "learning_rate": 9.734700617072444e-06, + "loss": 0.1561, + "step": 4550 + }, + { + "epoch": 0.39396950192232927, + "grad_norm": 0.6822164097545496, + "learning_rate": 9.733199538274124e-06, + "loss": 0.1576, + "step": 4560 + }, + { + "epoch": 0.3948334701282993, + "grad_norm": 0.6805525304356631, + "learning_rate": 9.731694341248045e-06, + "loss": 0.1582, + "step": 4570 + }, + { + "epoch": 0.3956974383342693, + "grad_norm": 0.7090666255186993, + "learning_rate": 9.730185027303837e-06, + "loss": 0.1625, + "step": 4580 + }, + { + "epoch": 0.3965614065402393, + "grad_norm": 0.6775427538075873, + "learning_rate": 9.728671597754715e-06, + "loss": 0.1599, + "step": 4590 + }, + { + "epoch": 0.3974253747462093, + "grad_norm": 0.7056790719470328, + "learning_rate": 9.727154053917469e-06, + "loss": 0.1628, + "step": 4600 + }, + { + "epoch": 0.3982893429521794, + "grad_norm": 0.638674649902704, + "learning_rate": 9.725632397112474e-06, + "loss": 0.1629, + "step": 4610 + }, + { + "epoch": 0.3991533111581494, + "grad_norm": 0.6550807852274794, + "learning_rate": 9.724106628663683e-06, + "loss": 0.16, + "step": 4620 + }, + { + "epoch": 0.4000172793641194, + "grad_norm": 0.6460332536413842, + "learning_rate": 9.722576749898624e-06, + "loss": 0.1579, + "step": 4630 + }, + { + "epoch": 0.4008812475700894, + "grad_norm": 0.6551551230297281, + "learning_rate": 9.721042762148405e-06, + "loss": 0.1622, + "step": 4640 + }, + { + "epoch": 0.40174521577605943, + "grad_norm": 0.6626205641580281, + "learning_rate": 9.719504666747704e-06, + "loss": 0.1559, + "step": 4650 + }, + { + "epoch": 0.40260918398202944, + "grad_norm": 0.7096427344280777, + "learning_rate": 9.717962465034778e-06, + "loss": 0.1658, + "step": 4660 + }, + { + "epoch": 0.4034731521879995, + "grad_norm": 0.7129316785627647, + "learning_rate": 9.716416158351454e-06, + "loss": 0.1624, + "step": 4670 + }, + { + "epoch": 0.4043371203939695, + "grad_norm": 0.6976638834788887, + "learning_rate": 9.714865748043129e-06, + "loss": 0.164, + "step": 4680 + }, + { + "epoch": 0.40520108859993953, + "grad_norm": 0.6658249350744622, + "learning_rate": 9.713311235458778e-06, + "loss": 0.1631, + "step": 4690 + }, + { + "epoch": 0.40606505680590954, + "grad_norm": 0.6840733659601906, + "learning_rate": 9.711752621950936e-06, + "loss": 0.1615, + "step": 4700 + }, + { + "epoch": 0.40692902501187955, + "grad_norm": 0.6840753154130336, + "learning_rate": 9.71018990887571e-06, + "loss": 0.1617, + "step": 4710 + }, + { + "epoch": 0.40779299321784956, + "grad_norm": 0.6630826297979107, + "learning_rate": 9.708623097592775e-06, + "loss": 0.1621, + "step": 4720 + }, + { + "epoch": 0.4086569614238196, + "grad_norm": 0.7140002244423074, + "learning_rate": 9.70705218946537e-06, + "loss": 0.1622, + "step": 4730 + }, + { + "epoch": 0.40952092962978964, + "grad_norm": 0.6758179638610594, + "learning_rate": 9.705477185860302e-06, + "loss": 0.1594, + "step": 4740 + }, + { + "epoch": 0.41038489783575965, + "grad_norm": 0.6900535022098612, + "learning_rate": 9.703898088147935e-06, + "loss": 0.1632, + "step": 4750 + }, + { + "epoch": 0.41124886604172967, + "grad_norm": 0.6761698450465825, + "learning_rate": 9.702314897702203e-06, + "loss": 0.1614, + "step": 4760 + }, + { + "epoch": 0.4121128342476997, + "grad_norm": 0.6485182080321424, + "learning_rate": 9.700727615900591e-06, + "loss": 0.1604, + "step": 4770 + }, + { + "epoch": 0.4129768024536697, + "grad_norm": 0.689619498277749, + "learning_rate": 9.699136244124155e-06, + "loss": 0.1613, + "step": 4780 + }, + { + "epoch": 0.4138407706596397, + "grad_norm": 0.6848912848714906, + "learning_rate": 9.697540783757502e-06, + "loss": 0.1641, + "step": 4790 + }, + { + "epoch": 0.41470473886560977, + "grad_norm": 0.6931500377479651, + "learning_rate": 9.695941236188797e-06, + "loss": 0.1619, + "step": 4800 + }, + { + "epoch": 0.4155687070715798, + "grad_norm": 0.6851757299026764, + "learning_rate": 9.694337602809765e-06, + "loss": 0.1566, + "step": 4810 + }, + { + "epoch": 0.4164326752775498, + "grad_norm": 0.6624506770445044, + "learning_rate": 9.692729885015684e-06, + "loss": 0.1599, + "step": 4820 + }, + { + "epoch": 0.4172966434835198, + "grad_norm": 0.6387799436823667, + "learning_rate": 9.691118084205382e-06, + "loss": 0.1637, + "step": 4830 + }, + { + "epoch": 0.4181606116894898, + "grad_norm": 0.6918645541241718, + "learning_rate": 9.689502201781247e-06, + "loss": 0.1596, + "step": 4840 + }, + { + "epoch": 0.4190245798954598, + "grad_norm": 0.6718823552411884, + "learning_rate": 9.68788223914921e-06, + "loss": 0.1599, + "step": 4850 + }, + { + "epoch": 0.4198885481014299, + "grad_norm": 0.6412918843776836, + "learning_rate": 9.686258197718761e-06, + "loss": 0.1577, + "step": 4860 + }, + { + "epoch": 0.4207525163073999, + "grad_norm": 0.674420383357471, + "learning_rate": 9.684630078902933e-06, + "loss": 0.1607, + "step": 4870 + }, + { + "epoch": 0.4216164845133699, + "grad_norm": 0.6704220676917114, + "learning_rate": 9.682997884118303e-06, + "loss": 0.1656, + "step": 4880 + }, + { + "epoch": 0.4224804527193399, + "grad_norm": 0.6878812416032285, + "learning_rate": 9.681361614785006e-06, + "loss": 0.1594, + "step": 4890 + }, + { + "epoch": 0.42334442092530994, + "grad_norm": 0.6553717926047634, + "learning_rate": 9.679721272326709e-06, + "loss": 0.1584, + "step": 4900 + }, + { + "epoch": 0.42420838913127995, + "grad_norm": 0.6774054340189088, + "learning_rate": 9.678076858170633e-06, + "loss": 0.1576, + "step": 4910 + }, + { + "epoch": 0.42507235733724996, + "grad_norm": 0.6738216331424101, + "learning_rate": 9.676428373747538e-06, + "loss": 0.1588, + "step": 4920 + }, + { + "epoch": 0.42593632554322003, + "grad_norm": 0.6741765178104295, + "learning_rate": 9.674775820491725e-06, + "loss": 0.1573, + "step": 4930 + }, + { + "epoch": 0.42680029374919004, + "grad_norm": 0.6760301728835072, + "learning_rate": 9.673119199841033e-06, + "loss": 0.1587, + "step": 4940 + }, + { + "epoch": 0.42766426195516005, + "grad_norm": 0.683102732085449, + "learning_rate": 9.671458513236845e-06, + "loss": 0.1599, + "step": 4950 + }, + { + "epoch": 0.42852823016113006, + "grad_norm": 0.6379446609802096, + "learning_rate": 9.669793762124079e-06, + "loss": 0.1565, + "step": 4960 + }, + { + "epoch": 0.4293921983671001, + "grad_norm": 0.6798505709963193, + "learning_rate": 9.668124947951187e-06, + "loss": 0.1621, + "step": 4970 + }, + { + "epoch": 0.4302561665730701, + "grad_norm": 0.6549161090260386, + "learning_rate": 9.666452072170163e-06, + "loss": 0.1582, + "step": 4980 + }, + { + "epoch": 0.43112013477904015, + "grad_norm": 0.6852988744956113, + "learning_rate": 9.664775136236528e-06, + "loss": 0.1566, + "step": 4990 + }, + { + "epoch": 0.43198410298501017, + "grad_norm": 0.6526290012979358, + "learning_rate": 9.663094141609337e-06, + "loss": 0.1587, + "step": 5000 + }, + { + "epoch": 0.4328480711909802, + "grad_norm": 0.6668884578151805, + "learning_rate": 9.661409089751179e-06, + "loss": 0.1575, + "step": 5010 + }, + { + "epoch": 0.4337120393969502, + "grad_norm": 0.6976587290426327, + "learning_rate": 9.659719982128172e-06, + "loss": 0.16, + "step": 5020 + }, + { + "epoch": 0.4345760076029202, + "grad_norm": 0.7079200322711302, + "learning_rate": 9.65802682020996e-06, + "loss": 0.159, + "step": 5030 + }, + { + "epoch": 0.4354399758088902, + "grad_norm": 0.6917877632076461, + "learning_rate": 9.656329605469724e-06, + "loss": 0.1547, + "step": 5040 + }, + { + "epoch": 0.4363039440148603, + "grad_norm": 0.6797501891633531, + "learning_rate": 9.654628339384154e-06, + "loss": 0.161, + "step": 5050 + }, + { + "epoch": 0.4371679122208303, + "grad_norm": 0.6635520032766078, + "learning_rate": 9.652923023433483e-06, + "loss": 0.1551, + "step": 5060 + }, + { + "epoch": 0.4380318804268003, + "grad_norm": 0.6803906100574411, + "learning_rate": 9.651213659101456e-06, + "loss": 0.1606, + "step": 5070 + }, + { + "epoch": 0.4388958486327703, + "grad_norm": 0.6700575525523713, + "learning_rate": 9.649500247875347e-06, + "loss": 0.1604, + "step": 5080 + }, + { + "epoch": 0.4397598168387403, + "grad_norm": 0.6662538170197336, + "learning_rate": 9.647782791245945e-06, + "loss": 0.1568, + "step": 5090 + }, + { + "epoch": 0.44062378504471034, + "grad_norm": 0.6624543848438776, + "learning_rate": 9.646061290707566e-06, + "loss": 0.1574, + "step": 5100 + }, + { + "epoch": 0.44148775325068035, + "grad_norm": 0.7109126656654107, + "learning_rate": 9.644335747758037e-06, + "loss": 0.1591, + "step": 5110 + }, + { + "epoch": 0.4423517214566504, + "grad_norm": 0.6473806141940632, + "learning_rate": 9.642606163898708e-06, + "loss": 0.1595, + "step": 5120 + }, + { + "epoch": 0.4432156896626204, + "grad_norm": 0.7006474036535846, + "learning_rate": 9.640872540634443e-06, + "loss": 0.1557, + "step": 5130 + }, + { + "epoch": 0.44407965786859044, + "grad_norm": 0.641365283402573, + "learning_rate": 9.63913487947362e-06, + "loss": 0.1624, + "step": 5140 + }, + { + "epoch": 0.44494362607456045, + "grad_norm": 0.6959501153561491, + "learning_rate": 9.63739318192813e-06, + "loss": 0.1612, + "step": 5150 + }, + { + "epoch": 0.44580759428053046, + "grad_norm": 0.6750876307270886, + "learning_rate": 9.635647449513375e-06, + "loss": 0.1594, + "step": 5160 + }, + { + "epoch": 0.4466715624865005, + "grad_norm": 0.6462815274785941, + "learning_rate": 9.633897683748271e-06, + "loss": 0.1563, + "step": 5170 + }, + { + "epoch": 0.44753553069247054, + "grad_norm": 0.6795189194974991, + "learning_rate": 9.632143886155242e-06, + "loss": 0.1569, + "step": 5180 + }, + { + "epoch": 0.44839949889844055, + "grad_norm": 0.6386005245305387, + "learning_rate": 9.630386058260219e-06, + "loss": 0.1595, + "step": 5190 + }, + { + "epoch": 0.44926346710441056, + "grad_norm": 0.6696051720370734, + "learning_rate": 9.628624201592637e-06, + "loss": 0.1559, + "step": 5200 + }, + { + "epoch": 0.4501274353103806, + "grad_norm": 0.6525705870370956, + "learning_rate": 9.626858317685446e-06, + "loss": 0.1552, + "step": 5210 + }, + { + "epoch": 0.4509914035163506, + "grad_norm": 0.6601726705245239, + "learning_rate": 9.625088408075088e-06, + "loss": 0.1547, + "step": 5220 + }, + { + "epoch": 0.4518553717223206, + "grad_norm": 0.6949657010269852, + "learning_rate": 9.623314474301513e-06, + "loss": 0.1628, + "step": 5230 + }, + { + "epoch": 0.45271933992829066, + "grad_norm": 0.6272986645491211, + "learning_rate": 9.621536517908175e-06, + "loss": 0.1584, + "step": 5240 + }, + { + "epoch": 0.4535833081342607, + "grad_norm": 0.6564336061317336, + "learning_rate": 9.619754540442023e-06, + "loss": 0.1561, + "step": 5250 + }, + { + "epoch": 0.4544472763402307, + "grad_norm": 0.6339087400986222, + "learning_rate": 9.61796854345351e-06, + "loss": 0.1584, + "step": 5260 + }, + { + "epoch": 0.4553112445462007, + "grad_norm": 0.655855968908132, + "learning_rate": 9.616178528496583e-06, + "loss": 0.1553, + "step": 5270 + }, + { + "epoch": 0.4561752127521707, + "grad_norm": 0.6860757996153211, + "learning_rate": 9.61438449712868e-06, + "loss": 0.16, + "step": 5280 + }, + { + "epoch": 0.4570391809581407, + "grad_norm": 0.6406762286870573, + "learning_rate": 9.612586450910744e-06, + "loss": 0.1536, + "step": 5290 + }, + { + "epoch": 0.45790314916411073, + "grad_norm": 0.6507725769700754, + "learning_rate": 9.610784391407204e-06, + "loss": 0.1602, + "step": 5300 + }, + { + "epoch": 0.4587671173700808, + "grad_norm": 0.6513419928024363, + "learning_rate": 9.608978320185985e-06, + "loss": 0.154, + "step": 5310 + }, + { + "epoch": 0.4596310855760508, + "grad_norm": 0.6557237617400382, + "learning_rate": 9.607168238818496e-06, + "loss": 0.1607, + "step": 5320 + }, + { + "epoch": 0.4604950537820208, + "grad_norm": 0.6489510877793241, + "learning_rate": 9.605354148879643e-06, + "loss": 0.1543, + "step": 5330 + }, + { + "epoch": 0.46135902198799084, + "grad_norm": 0.6518534162138759, + "learning_rate": 9.603536051947815e-06, + "loss": 0.1537, + "step": 5340 + }, + { + "epoch": 0.46222299019396085, + "grad_norm": 0.6435402536124452, + "learning_rate": 9.601713949604887e-06, + "loss": 0.1602, + "step": 5350 + }, + { + "epoch": 0.46308695839993086, + "grad_norm": 0.656285821302999, + "learning_rate": 9.599887843436224e-06, + "loss": 0.1575, + "step": 5360 + }, + { + "epoch": 0.4639509266059009, + "grad_norm": 0.6734174544832082, + "learning_rate": 9.598057735030668e-06, + "loss": 0.1615, + "step": 5370 + }, + { + "epoch": 0.46481489481187094, + "grad_norm": 0.6525119845384318, + "learning_rate": 9.59622362598055e-06, + "loss": 0.1555, + "step": 5380 + }, + { + "epoch": 0.46567886301784095, + "grad_norm": 0.6424183453875718, + "learning_rate": 9.594385517881673e-06, + "loss": 0.1584, + "step": 5390 + }, + { + "epoch": 0.46654283122381096, + "grad_norm": 0.6777539885626862, + "learning_rate": 9.592543412333329e-06, + "loss": 0.1623, + "step": 5400 + }, + { + "epoch": 0.467406799429781, + "grad_norm": 0.6259425254530941, + "learning_rate": 9.59069731093828e-06, + "loss": 0.1599, + "step": 5410 + }, + { + "epoch": 0.468270767635751, + "grad_norm": 0.6932018168621533, + "learning_rate": 9.588847215302772e-06, + "loss": 0.1592, + "step": 5420 + }, + { + "epoch": 0.46913473584172105, + "grad_norm": 0.6477717328424021, + "learning_rate": 9.586993127036522e-06, + "loss": 0.1588, + "step": 5430 + }, + { + "epoch": 0.46999870404769106, + "grad_norm": 0.6592836020610073, + "learning_rate": 9.58513504775272e-06, + "loss": 0.1635, + "step": 5440 + }, + { + "epoch": 0.4708626722536611, + "grad_norm": 0.6457972290946014, + "learning_rate": 9.583272979068032e-06, + "loss": 0.1573, + "step": 5450 + }, + { + "epoch": 0.4717266404596311, + "grad_norm": 0.6707595353255882, + "learning_rate": 9.581406922602593e-06, + "loss": 0.1528, + "step": 5460 + }, + { + "epoch": 0.4725906086656011, + "grad_norm": 0.654924509203685, + "learning_rate": 9.579536879980005e-06, + "loss": 0.154, + "step": 5470 + }, + { + "epoch": 0.4734545768715711, + "grad_norm": 0.6498507426859098, + "learning_rate": 9.577662852827345e-06, + "loss": 0.1554, + "step": 5480 + }, + { + "epoch": 0.4743185450775411, + "grad_norm": 0.6503674843306472, + "learning_rate": 9.575784842775152e-06, + "loss": 0.1578, + "step": 5490 + }, + { + "epoch": 0.4751825132835112, + "grad_norm": 0.6685006182692143, + "learning_rate": 9.573902851457428e-06, + "loss": 0.1579, + "step": 5500 + }, + { + "epoch": 0.4760464814894812, + "grad_norm": 0.647438727181266, + "learning_rate": 9.572016880511645e-06, + "loss": 0.1581, + "step": 5510 + }, + { + "epoch": 0.4769104496954512, + "grad_norm": 0.6524775973331061, + "learning_rate": 9.570126931578734e-06, + "loss": 0.1558, + "step": 5520 + }, + { + "epoch": 0.4777744179014212, + "grad_norm": 0.6328300944860435, + "learning_rate": 9.56823300630309e-06, + "loss": 0.1514, + "step": 5530 + }, + { + "epoch": 0.47863838610739123, + "grad_norm": 0.633087466179911, + "learning_rate": 9.566335106332563e-06, + "loss": 0.1595, + "step": 5540 + }, + { + "epoch": 0.47950235431336125, + "grad_norm": 0.6531923998474526, + "learning_rate": 9.564433233318466e-06, + "loss": 0.1535, + "step": 5550 + }, + { + "epoch": 0.4803663225193313, + "grad_norm": 0.635543450438123, + "learning_rate": 9.562527388915565e-06, + "loss": 0.1557, + "step": 5560 + }, + { + "epoch": 0.4812302907253013, + "grad_norm": 0.628235427511297, + "learning_rate": 9.560617574782085e-06, + "loss": 0.1601, + "step": 5570 + }, + { + "epoch": 0.48209425893127134, + "grad_norm": 0.6322211295045682, + "learning_rate": 9.558703792579702e-06, + "loss": 0.1598, + "step": 5580 + }, + { + "epoch": 0.48295822713724135, + "grad_norm": 0.6423726739039197, + "learning_rate": 9.556786043973547e-06, + "loss": 0.1557, + "step": 5590 + }, + { + "epoch": 0.48382219534321136, + "grad_norm": 0.6773207454316515, + "learning_rate": 9.554864330632198e-06, + "loss": 0.1585, + "step": 5600 + }, + { + "epoch": 0.48468616354918137, + "grad_norm": 0.6499962659091628, + "learning_rate": 9.55293865422769e-06, + "loss": 0.1583, + "step": 5610 + }, + { + "epoch": 0.48555013175515144, + "grad_norm": 0.6653000931931303, + "learning_rate": 9.551009016435495e-06, + "loss": 0.1604, + "step": 5620 + }, + { + "epoch": 0.48641409996112145, + "grad_norm": 0.6360345932914004, + "learning_rate": 9.549075418934543e-06, + "loss": 0.1521, + "step": 5630 + }, + { + "epoch": 0.48727806816709146, + "grad_norm": 0.5933880679418415, + "learning_rate": 9.547137863407204e-06, + "loss": 0.1577, + "step": 5640 + }, + { + "epoch": 0.48814203637306147, + "grad_norm": 0.629981336471149, + "learning_rate": 9.545196351539292e-06, + "loss": 0.1532, + "step": 5650 + }, + { + "epoch": 0.4890060045790315, + "grad_norm": 0.6555386097808079, + "learning_rate": 9.543250885020061e-06, + "loss": 0.156, + "step": 5660 + }, + { + "epoch": 0.4898699727850015, + "grad_norm": 0.6842451255068056, + "learning_rate": 9.54130146554221e-06, + "loss": 0.1587, + "step": 5670 + }, + { + "epoch": 0.4907339409909715, + "grad_norm": 0.6713064235267874, + "learning_rate": 9.539348094801877e-06, + "loss": 0.1552, + "step": 5680 + }, + { + "epoch": 0.4915979091969416, + "grad_norm": 0.6564079624166437, + "learning_rate": 9.537390774498637e-06, + "loss": 0.1537, + "step": 5690 + }, + { + "epoch": 0.4924618774029116, + "grad_norm": 0.6286290451414437, + "learning_rate": 9.535429506335496e-06, + "loss": 0.1566, + "step": 5700 + }, + { + "epoch": 0.4933258456088816, + "grad_norm": 0.6385916147206167, + "learning_rate": 9.533464292018906e-06, + "loss": 0.1553, + "step": 5710 + }, + { + "epoch": 0.4941898138148516, + "grad_norm": 0.6518930348205345, + "learning_rate": 9.531495133258742e-06, + "loss": 0.1544, + "step": 5720 + }, + { + "epoch": 0.4950537820208216, + "grad_norm": 0.629072344541979, + "learning_rate": 9.529522031768317e-06, + "loss": 0.1613, + "step": 5730 + }, + { + "epoch": 0.49591775022679163, + "grad_norm": 0.6572574558805094, + "learning_rate": 9.527544989264375e-06, + "loss": 0.1553, + "step": 5740 + }, + { + "epoch": 0.4967817184327617, + "grad_norm": 0.631342912981415, + "learning_rate": 9.525564007467082e-06, + "loss": 0.1555, + "step": 5750 + }, + { + "epoch": 0.4976456866387317, + "grad_norm": 0.610023752445402, + "learning_rate": 9.523579088100041e-06, + "loss": 0.1532, + "step": 5760 + }, + { + "epoch": 0.4985096548447017, + "grad_norm": 0.6376457831648065, + "learning_rate": 9.521590232890272e-06, + "loss": 0.1577, + "step": 5770 + }, + { + "epoch": 0.49937362305067173, + "grad_norm": 0.6406158974539737, + "learning_rate": 9.519597443568227e-06, + "loss": 0.1552, + "step": 5780 + }, + { + "epoch": 0.5002375912566418, + "grad_norm": 0.6608118104247712, + "learning_rate": 9.517600721867775e-06, + "loss": 0.1617, + "step": 5790 + }, + { + "epoch": 0.5011015594626118, + "grad_norm": 0.6153155011611277, + "learning_rate": 9.51560006952621e-06, + "loss": 0.1566, + "step": 5800 + }, + { + "epoch": 0.5019655276685818, + "grad_norm": 0.6539057272210915, + "learning_rate": 9.513595488284246e-06, + "loss": 0.1549, + "step": 5810 + }, + { + "epoch": 0.5028294958745518, + "grad_norm": 0.6531761984313647, + "learning_rate": 9.511586979886013e-06, + "loss": 0.1552, + "step": 5820 + }, + { + "epoch": 0.5036934640805218, + "grad_norm": 0.638972805350294, + "learning_rate": 9.509574546079061e-06, + "loss": 0.1536, + "step": 5830 + }, + { + "epoch": 0.5045574322864919, + "grad_norm": 0.6454598366800842, + "learning_rate": 9.507558188614353e-06, + "loss": 0.1564, + "step": 5840 + }, + { + "epoch": 0.5054214004924619, + "grad_norm": 0.6179767848023813, + "learning_rate": 9.505537909246266e-06, + "loss": 0.1615, + "step": 5850 + }, + { + "epoch": 0.5062853686984319, + "grad_norm": 0.6574241098778968, + "learning_rate": 9.50351370973259e-06, + "loss": 0.1506, + "step": 5860 + }, + { + "epoch": 0.5071493369044019, + "grad_norm": 0.6180722558466789, + "learning_rate": 9.501485591834525e-06, + "loss": 0.1522, + "step": 5870 + }, + { + "epoch": 0.5080133051103719, + "grad_norm": 0.6538193809641687, + "learning_rate": 9.499453557316684e-06, + "loss": 0.155, + "step": 5880 + }, + { + "epoch": 0.5088772733163419, + "grad_norm": 0.6838416890378471, + "learning_rate": 9.497417607947081e-06, + "loss": 0.1547, + "step": 5890 + }, + { + "epoch": 0.5097412415223119, + "grad_norm": 0.634166321930832, + "learning_rate": 9.495377745497144e-06, + "loss": 0.1538, + "step": 5900 + }, + { + "epoch": 0.510605209728282, + "grad_norm": 0.6333864736473793, + "learning_rate": 9.493333971741698e-06, + "loss": 0.161, + "step": 5910 + }, + { + "epoch": 0.5114691779342521, + "grad_norm": 0.6519770013554314, + "learning_rate": 9.491286288458978e-06, + "loss": 0.1507, + "step": 5920 + }, + { + "epoch": 0.5123331461402221, + "grad_norm": 0.6532553915478285, + "learning_rate": 9.489234697430613e-06, + "loss": 0.1548, + "step": 5930 + }, + { + "epoch": 0.5131971143461921, + "grad_norm": 0.6360296430234144, + "learning_rate": 9.48717920044164e-06, + "loss": 0.1523, + "step": 5940 + }, + { + "epoch": 0.5140610825521621, + "grad_norm": 0.6562037627803998, + "learning_rate": 9.485119799280491e-06, + "loss": 0.1528, + "step": 5950 + }, + { + "epoch": 0.5149250507581321, + "grad_norm": 0.6710431539588906, + "learning_rate": 9.483056495738994e-06, + "loss": 0.1525, + "step": 5960 + }, + { + "epoch": 0.5157890189641021, + "grad_norm": 0.6421136526052125, + "learning_rate": 9.480989291612372e-06, + "loss": 0.1546, + "step": 5970 + }, + { + "epoch": 0.5166529871700721, + "grad_norm": 0.6391902054075337, + "learning_rate": 9.478918188699243e-06, + "loss": 0.1542, + "step": 5980 + }, + { + "epoch": 0.5175169553760421, + "grad_norm": 0.6556659472283152, + "learning_rate": 9.47684318880162e-06, + "loss": 0.158, + "step": 5990 + }, + { + "epoch": 0.5183809235820122, + "grad_norm": 0.6546018179543279, + "learning_rate": 9.474764293724898e-06, + "loss": 0.1572, + "step": 6000 + }, + { + "epoch": 0.5192448917879822, + "grad_norm": 0.6154480749406304, + "learning_rate": 9.472681505277872e-06, + "loss": 0.1543, + "step": 6010 + }, + { + "epoch": 0.5201088599939522, + "grad_norm": 0.6403268547561236, + "learning_rate": 9.470594825272719e-06, + "loss": 0.151, + "step": 6020 + }, + { + "epoch": 0.5209728281999222, + "grad_norm": 0.6236563458442675, + "learning_rate": 9.468504255525e-06, + "loss": 0.1542, + "step": 6030 + }, + { + "epoch": 0.5218367964058923, + "grad_norm": 0.6877503233654196, + "learning_rate": 9.466409797853665e-06, + "loss": 0.1554, + "step": 6040 + }, + { + "epoch": 0.5227007646118623, + "grad_norm": 0.643531814258169, + "learning_rate": 9.464311454081041e-06, + "loss": 0.1532, + "step": 6050 + }, + { + "epoch": 0.5235647328178323, + "grad_norm": 0.6805286157046766, + "learning_rate": 9.462209226032843e-06, + "loss": 0.154, + "step": 6060 + }, + { + "epoch": 0.5244287010238023, + "grad_norm": 0.61245472642869, + "learning_rate": 9.460103115538161e-06, + "loss": 0.1558, + "step": 6070 + }, + { + "epoch": 0.5252926692297724, + "grad_norm": 0.6232098480791012, + "learning_rate": 9.457993124429466e-06, + "loss": 0.1546, + "step": 6080 + }, + { + "epoch": 0.5261566374357424, + "grad_norm": 0.618449033777273, + "learning_rate": 9.4558792545426e-06, + "loss": 0.1557, + "step": 6090 + }, + { + "epoch": 0.5270206056417124, + "grad_norm": 0.6397349484531721, + "learning_rate": 9.453761507716787e-06, + "loss": 0.1542, + "step": 6100 + }, + { + "epoch": 0.5278845738476824, + "grad_norm": 0.6278611478363766, + "learning_rate": 9.451639885794621e-06, + "loss": 0.1574, + "step": 6110 + }, + { + "epoch": 0.5287485420536524, + "grad_norm": 0.6235855930527535, + "learning_rate": 9.449514390622062e-06, + "loss": 0.1544, + "step": 6120 + }, + { + "epoch": 0.5296125102596224, + "grad_norm": 0.6700763747048177, + "learning_rate": 9.447385024048454e-06, + "loss": 0.1521, + "step": 6130 + }, + { + "epoch": 0.5304764784655924, + "grad_norm": 0.6090307268762309, + "learning_rate": 9.445251787926492e-06, + "loss": 0.1574, + "step": 6140 + }, + { + "epoch": 0.5313404466715624, + "grad_norm": 0.6427231492601245, + "learning_rate": 9.443114684112251e-06, + "loss": 0.1492, + "step": 6150 + }, + { + "epoch": 0.5322044148775326, + "grad_norm": 0.6555482109889144, + "learning_rate": 9.440973714465167e-06, + "loss": 0.1551, + "step": 6160 + }, + { + "epoch": 0.5330683830835026, + "grad_norm": 0.6250258619357056, + "learning_rate": 9.438828880848039e-06, + "loss": 0.1524, + "step": 6170 + }, + { + "epoch": 0.5339323512894726, + "grad_norm": 0.6288378897514569, + "learning_rate": 9.436680185127026e-06, + "loss": 0.1536, + "step": 6180 + }, + { + "epoch": 0.5347963194954426, + "grad_norm": 0.6291255544505886, + "learning_rate": 9.434527629171653e-06, + "loss": 0.1584, + "step": 6190 + }, + { + "epoch": 0.5356602877014126, + "grad_norm": 0.651577330866289, + "learning_rate": 9.432371214854797e-06, + "loss": 0.1522, + "step": 6200 + }, + { + "epoch": 0.5365242559073826, + "grad_norm": 0.6177750730266219, + "learning_rate": 9.430210944052696e-06, + "loss": 0.1559, + "step": 6210 + }, + { + "epoch": 0.5373882241133526, + "grad_norm": 0.6229158880730679, + "learning_rate": 9.428046818644942e-06, + "loss": 0.1574, + "step": 6220 + }, + { + "epoch": 0.5382521923193226, + "grad_norm": 0.6628200588962239, + "learning_rate": 9.425878840514487e-06, + "loss": 0.1501, + "step": 6230 + }, + { + "epoch": 0.5391161605252927, + "grad_norm": 0.6597060635008648, + "learning_rate": 9.423707011547622e-06, + "loss": 0.1474, + "step": 6240 + }, + { + "epoch": 0.5399801287312627, + "grad_norm": 0.6543877462107659, + "learning_rate": 9.421531333634e-06, + "loss": 0.1562, + "step": 6250 + }, + { + "epoch": 0.5408440969372327, + "grad_norm": 0.631809391193605, + "learning_rate": 9.419351808666618e-06, + "loss": 0.1532, + "step": 6260 + }, + { + "epoch": 0.5417080651432027, + "grad_norm": 0.597981419686076, + "learning_rate": 9.417168438541821e-06, + "loss": 0.1501, + "step": 6270 + }, + { + "epoch": 0.5425720333491727, + "grad_norm": 0.6143884751433082, + "learning_rate": 9.414981225159303e-06, + "loss": 0.1585, + "step": 6280 + }, + { + "epoch": 0.5434360015551428, + "grad_norm": 0.6797038536136399, + "learning_rate": 9.412790170422094e-06, + "loss": 0.1538, + "step": 6290 + }, + { + "epoch": 0.5442999697611128, + "grad_norm": 0.613381369055774, + "learning_rate": 9.410595276236574e-06, + "loss": 0.1505, + "step": 6300 + }, + { + "epoch": 0.5451639379670828, + "grad_norm": 0.6353511536715715, + "learning_rate": 9.408396544512459e-06, + "loss": 0.1558, + "step": 6310 + }, + { + "epoch": 0.5460279061730529, + "grad_norm": 0.6444380767578983, + "learning_rate": 9.406193977162806e-06, + "loss": 0.1492, + "step": 6320 + }, + { + "epoch": 0.5468918743790229, + "grad_norm": 0.6205572841742492, + "learning_rate": 9.403987576104009e-06, + "loss": 0.1563, + "step": 6330 + }, + { + "epoch": 0.5477558425849929, + "grad_norm": 0.5971746646391922, + "learning_rate": 9.401777343255796e-06, + "loss": 0.1528, + "step": 6340 + }, + { + "epoch": 0.5486198107909629, + "grad_norm": 0.6233544276574221, + "learning_rate": 9.399563280541233e-06, + "loss": 0.1553, + "step": 6350 + }, + { + "epoch": 0.5494837789969329, + "grad_norm": 0.6487060550718584, + "learning_rate": 9.397345389886715e-06, + "loss": 0.1523, + "step": 6360 + }, + { + "epoch": 0.5503477472029029, + "grad_norm": 0.6296095891164247, + "learning_rate": 9.395123673221965e-06, + "loss": 0.1553, + "step": 6370 + }, + { + "epoch": 0.5512117154088729, + "grad_norm": 0.7315998401798692, + "learning_rate": 9.39289813248004e-06, + "loss": 0.1567, + "step": 6380 + }, + { + "epoch": 0.5520756836148429, + "grad_norm": 0.6450303515918221, + "learning_rate": 9.390668769597323e-06, + "loss": 0.1456, + "step": 6390 + }, + { + "epoch": 0.552939651820813, + "grad_norm": 0.6353752953847911, + "learning_rate": 9.38843558651352e-06, + "loss": 0.1591, + "step": 6400 + }, + { + "epoch": 0.553803620026783, + "grad_norm": 0.639900060325134, + "learning_rate": 9.386198585171666e-06, + "loss": 0.1582, + "step": 6410 + }, + { + "epoch": 0.5546675882327531, + "grad_norm": 0.631243331570897, + "learning_rate": 9.38395776751811e-06, + "loss": 0.1535, + "step": 6420 + }, + { + "epoch": 0.5555315564387231, + "grad_norm": 0.6120386993003224, + "learning_rate": 9.381713135502531e-06, + "loss": 0.1504, + "step": 6430 + }, + { + "epoch": 0.5563955246446931, + "grad_norm": 0.6218134724182738, + "learning_rate": 9.379464691077919e-06, + "loss": 0.1523, + "step": 6440 + }, + { + "epoch": 0.5572594928506631, + "grad_norm": 0.7523102118837981, + "learning_rate": 9.377212436200587e-06, + "loss": 0.1521, + "step": 6450 + }, + { + "epoch": 0.5581234610566331, + "grad_norm": 0.6557050210004696, + "learning_rate": 9.374956372830158e-06, + "loss": 0.1561, + "step": 6460 + }, + { + "epoch": 0.5589874292626031, + "grad_norm": 0.6201510619086964, + "learning_rate": 9.37269650292957e-06, + "loss": 0.155, + "step": 6470 + }, + { + "epoch": 0.5598513974685732, + "grad_norm": 0.5811173881087803, + "learning_rate": 9.370432828465079e-06, + "loss": 0.1551, + "step": 6480 + }, + { + "epoch": 0.5607153656745432, + "grad_norm": 0.6012143606073025, + "learning_rate": 9.36816535140624e-06, + "loss": 0.1543, + "step": 6490 + }, + { + "epoch": 0.5615793338805132, + "grad_norm": 0.5825061635910226, + "learning_rate": 9.365894073725929e-06, + "loss": 0.1548, + "step": 6500 + }, + { + "epoch": 0.5624433020864832, + "grad_norm": 0.6375174928498161, + "learning_rate": 9.363618997400319e-06, + "loss": 0.1542, + "step": 6510 + }, + { + "epoch": 0.5633072702924532, + "grad_norm": 0.6218539072355832, + "learning_rate": 9.361340124408893e-06, + "loss": 0.1504, + "step": 6520 + }, + { + "epoch": 0.5641712384984232, + "grad_norm": 0.6377893763114578, + "learning_rate": 9.359057456734437e-06, + "loss": 0.1528, + "step": 6530 + }, + { + "epoch": 0.5650352067043933, + "grad_norm": 0.6059514862386471, + "learning_rate": 9.356770996363034e-06, + "loss": 0.1536, + "step": 6540 + }, + { + "epoch": 0.5658991749103633, + "grad_norm": 0.6782562253690789, + "learning_rate": 9.354480745284075e-06, + "loss": 0.1522, + "step": 6550 + }, + { + "epoch": 0.5667631431163334, + "grad_norm": 0.6530575495241211, + "learning_rate": 9.352186705490245e-06, + "loss": 0.1564, + "step": 6560 + }, + { + "epoch": 0.5676271113223034, + "grad_norm": 0.6271306204208658, + "learning_rate": 9.349888878977525e-06, + "loss": 0.1579, + "step": 6570 + }, + { + "epoch": 0.5684910795282734, + "grad_norm": 0.655379907159268, + "learning_rate": 9.347587267745188e-06, + "loss": 0.1587, + "step": 6580 + }, + { + "epoch": 0.5693550477342434, + "grad_norm": 0.6207039064949125, + "learning_rate": 9.345281873795807e-06, + "loss": 0.1502, + "step": 6590 + }, + { + "epoch": 0.5702190159402134, + "grad_norm": 0.6202852350840276, + "learning_rate": 9.342972699135238e-06, + "loss": 0.1482, + "step": 6600 + }, + { + "epoch": 0.5710829841461834, + "grad_norm": 0.6170685026124321, + "learning_rate": 9.340659745772635e-06, + "loss": 0.1528, + "step": 6610 + }, + { + "epoch": 0.5719469523521534, + "grad_norm": 0.6108278186924734, + "learning_rate": 9.338343015720434e-06, + "loss": 0.153, + "step": 6620 + }, + { + "epoch": 0.5728109205581234, + "grad_norm": 0.6451428011838746, + "learning_rate": 9.33602251099436e-06, + "loss": 0.1483, + "step": 6630 + }, + { + "epoch": 0.5736748887640934, + "grad_norm": 0.6672803417700074, + "learning_rate": 9.33369823361342e-06, + "loss": 0.1525, + "step": 6640 + }, + { + "epoch": 0.5745388569700635, + "grad_norm": 0.638138381181463, + "learning_rate": 9.331370185599902e-06, + "loss": 0.1526, + "step": 6650 + }, + { + "epoch": 0.5754028251760335, + "grad_norm": 0.6285291919757152, + "learning_rate": 9.32903836897938e-06, + "loss": 0.1579, + "step": 6660 + }, + { + "epoch": 0.5762667933820036, + "grad_norm": 0.5894093953466101, + "learning_rate": 9.326702785780704e-06, + "loss": 0.1527, + "step": 6670 + }, + { + "epoch": 0.5771307615879736, + "grad_norm": 0.6363863884972929, + "learning_rate": 9.324363438035998e-06, + "loss": 0.1522, + "step": 6680 + }, + { + "epoch": 0.5779947297939436, + "grad_norm": 0.5954359232987835, + "learning_rate": 9.322020327780667e-06, + "loss": 0.147, + "step": 6690 + }, + { + "epoch": 0.5788586979999136, + "grad_norm": 0.5924570670614575, + "learning_rate": 9.319673457053389e-06, + "loss": 0.1554, + "step": 6700 + }, + { + "epoch": 0.5797226662058836, + "grad_norm": 0.6557729717170221, + "learning_rate": 9.31732282789611e-06, + "loss": 0.1548, + "step": 6710 + }, + { + "epoch": 0.5805866344118537, + "grad_norm": 0.6252295859666467, + "learning_rate": 9.314968442354048e-06, + "loss": 0.1519, + "step": 6720 + }, + { + "epoch": 0.5814506026178237, + "grad_norm": 0.6104984510258042, + "learning_rate": 9.312610302475691e-06, + "loss": 0.1545, + "step": 6730 + }, + { + "epoch": 0.5823145708237937, + "grad_norm": 0.6084878036937621, + "learning_rate": 9.31024841031279e-06, + "loss": 0.1509, + "step": 6740 + }, + { + "epoch": 0.5831785390297637, + "grad_norm": 0.615179111407142, + "learning_rate": 9.307882767920365e-06, + "loss": 0.1496, + "step": 6750 + }, + { + "epoch": 0.5840425072357337, + "grad_norm": 0.6335012641621653, + "learning_rate": 9.305513377356696e-06, + "loss": 0.159, + "step": 6760 + }, + { + "epoch": 0.5849064754417037, + "grad_norm": 0.6432893824981711, + "learning_rate": 9.303140240683326e-06, + "loss": 0.1558, + "step": 6770 + }, + { + "epoch": 0.5857704436476737, + "grad_norm": 0.6288129691404086, + "learning_rate": 9.300763359965057e-06, + "loss": 0.1529, + "step": 6780 + }, + { + "epoch": 0.5866344118536437, + "grad_norm": 0.5996719948442747, + "learning_rate": 9.298382737269944e-06, + "loss": 0.1561, + "step": 6790 + }, + { + "epoch": 0.5874983800596139, + "grad_norm": 0.6318777413266485, + "learning_rate": 9.295998374669307e-06, + "loss": 0.1532, + "step": 6800 + }, + { + "epoch": 0.5883623482655839, + "grad_norm": 0.6393146031927769, + "learning_rate": 9.29361027423771e-06, + "loss": 0.154, + "step": 6810 + }, + { + "epoch": 0.5892263164715539, + "grad_norm": 0.6087589722272293, + "learning_rate": 9.291218438052978e-06, + "loss": 0.1449, + "step": 6820 + }, + { + "epoch": 0.5900902846775239, + "grad_norm": 0.6053164128500995, + "learning_rate": 9.28882286819618e-06, + "loss": 0.1543, + "step": 6830 + }, + { + "epoch": 0.5909542528834939, + "grad_norm": 0.6168383132645895, + "learning_rate": 9.286423566751638e-06, + "loss": 0.153, + "step": 6840 + }, + { + "epoch": 0.5918182210894639, + "grad_norm": 0.6357370911729846, + "learning_rate": 9.284020535806917e-06, + "loss": 0.1493, + "step": 6850 + }, + { + "epoch": 0.5926821892954339, + "grad_norm": 0.5856044394038176, + "learning_rate": 9.28161377745283e-06, + "loss": 0.1526, + "step": 6860 + }, + { + "epoch": 0.5935461575014039, + "grad_norm": 0.595353891775203, + "learning_rate": 9.27920329378343e-06, + "loss": 0.1521, + "step": 6870 + }, + { + "epoch": 0.594410125707374, + "grad_norm": 0.5879333218876476, + "learning_rate": 9.276789086896015e-06, + "loss": 0.1517, + "step": 6880 + }, + { + "epoch": 0.595274093913344, + "grad_norm": 0.6437671621448833, + "learning_rate": 9.274371158891117e-06, + "loss": 0.1528, + "step": 6890 + }, + { + "epoch": 0.596138062119314, + "grad_norm": 0.6362829862629101, + "learning_rate": 9.271949511872514e-06, + "loss": 0.1574, + "step": 6900 + }, + { + "epoch": 0.597002030325284, + "grad_norm": 0.6324049229309815, + "learning_rate": 9.269524147947214e-06, + "loss": 0.1537, + "step": 6910 + }, + { + "epoch": 0.597865998531254, + "grad_norm": 0.6329882824337885, + "learning_rate": 9.267095069225456e-06, + "loss": 0.1519, + "step": 6920 + }, + { + "epoch": 0.5987299667372241, + "grad_norm": 0.6005996025691578, + "learning_rate": 9.264662277820719e-06, + "loss": 0.1512, + "step": 6930 + }, + { + "epoch": 0.5995939349431941, + "grad_norm": 0.6040443399864707, + "learning_rate": 9.262225775849707e-06, + "loss": 0.1525, + "step": 6940 + }, + { + "epoch": 0.6004579031491641, + "grad_norm": 0.6159608848677605, + "learning_rate": 9.259785565432356e-06, + "loss": 0.1533, + "step": 6950 + }, + { + "epoch": 0.6013218713551342, + "grad_norm": 0.6149382032923227, + "learning_rate": 9.257341648691822e-06, + "loss": 0.1509, + "step": 6960 + }, + { + "epoch": 0.6021858395611042, + "grad_norm": 0.6290543016885738, + "learning_rate": 9.254894027754493e-06, + "loss": 0.1539, + "step": 6970 + }, + { + "epoch": 0.6030498077670742, + "grad_norm": 0.5930637622387478, + "learning_rate": 9.25244270474998e-06, + "loss": 0.1543, + "step": 6980 + }, + { + "epoch": 0.6039137759730442, + "grad_norm": 0.5657882487334296, + "learning_rate": 9.249987681811106e-06, + "loss": 0.1492, + "step": 6990 + }, + { + "epoch": 0.6047777441790142, + "grad_norm": 0.6095682916220933, + "learning_rate": 9.247528961073925e-06, + "loss": 0.1516, + "step": 7000 + }, + { + "epoch": 0.6056417123849842, + "grad_norm": 0.5992451055616469, + "learning_rate": 9.2450665446777e-06, + "loss": 0.1534, + "step": 7010 + }, + { + "epoch": 0.6065056805909542, + "grad_norm": 0.5997401846439975, + "learning_rate": 9.242600434764912e-06, + "loss": 0.1521, + "step": 7020 + }, + { + "epoch": 0.6073696487969242, + "grad_norm": 0.5933488089940263, + "learning_rate": 9.240130633481259e-06, + "loss": 0.1518, + "step": 7030 + }, + { + "epoch": 0.6082336170028942, + "grad_norm": 0.6313843488784524, + "learning_rate": 9.237657142975643e-06, + "loss": 0.1538, + "step": 7040 + }, + { + "epoch": 0.6090975852088644, + "grad_norm": 0.6413521714283614, + "learning_rate": 9.235179965400184e-06, + "loss": 0.152, + "step": 7050 + }, + { + "epoch": 0.6099615534148344, + "grad_norm": 0.6184470237558511, + "learning_rate": 9.232699102910208e-06, + "loss": 0.1504, + "step": 7060 + }, + { + "epoch": 0.6108255216208044, + "grad_norm": 0.5707911485975334, + "learning_rate": 9.230214557664241e-06, + "loss": 0.1504, + "step": 7070 + }, + { + "epoch": 0.6116894898267744, + "grad_norm": 0.5777559907197692, + "learning_rate": 9.227726331824021e-06, + "loss": 0.1496, + "step": 7080 + }, + { + "epoch": 0.6125534580327444, + "grad_norm": 0.5954819404017806, + "learning_rate": 9.225234427554485e-06, + "loss": 0.1509, + "step": 7090 + }, + { + "epoch": 0.6134174262387144, + "grad_norm": 0.6192745295620272, + "learning_rate": 9.222738847023772e-06, + "loss": 0.151, + "step": 7100 + }, + { + "epoch": 0.6142813944446844, + "grad_norm": 0.6038949399030311, + "learning_rate": 9.220239592403216e-06, + "loss": 0.1491, + "step": 7110 + }, + { + "epoch": 0.6151453626506544, + "grad_norm": 0.5824949970844027, + "learning_rate": 9.217736665867352e-06, + "loss": 0.1473, + "step": 7120 + }, + { + "epoch": 0.6160093308566245, + "grad_norm": 0.5954051916023153, + "learning_rate": 9.215230069593907e-06, + "loss": 0.1525, + "step": 7130 + }, + { + "epoch": 0.6168732990625945, + "grad_norm": 0.6320195114436121, + "learning_rate": 9.212719805763806e-06, + "loss": 0.1541, + "step": 7140 + }, + { + "epoch": 0.6177372672685645, + "grad_norm": 0.6929376486569512, + "learning_rate": 9.210205876561153e-06, + "loss": 0.1493, + "step": 7150 + }, + { + "epoch": 0.6186012354745345, + "grad_norm": 0.6159597724690932, + "learning_rate": 9.207688284173257e-06, + "loss": 0.1489, + "step": 7160 + }, + { + "epoch": 0.6194652036805045, + "grad_norm": 0.5943323896784803, + "learning_rate": 9.205167030790604e-06, + "loss": 0.1486, + "step": 7170 + }, + { + "epoch": 0.6203291718864746, + "grad_norm": 0.6204073730643097, + "learning_rate": 9.202642118606866e-06, + "loss": 0.1503, + "step": 7180 + }, + { + "epoch": 0.6211931400924446, + "grad_norm": 0.6241579805429692, + "learning_rate": 9.2001135498189e-06, + "loss": 0.1532, + "step": 7190 + }, + { + "epoch": 0.6220571082984147, + "grad_norm": 0.6030366378179512, + "learning_rate": 9.19758132662675e-06, + "loss": 0.149, + "step": 7200 + }, + { + "epoch": 0.6229210765043847, + "grad_norm": 0.6071124192692541, + "learning_rate": 9.195045451233627e-06, + "loss": 0.1541, + "step": 7210 + }, + { + "epoch": 0.6237850447103547, + "grad_norm": 0.6158999535197959, + "learning_rate": 9.192505925845932e-06, + "loss": 0.1527, + "step": 7220 + }, + { + "epoch": 0.6246490129163247, + "grad_norm": 0.5640793686230424, + "learning_rate": 9.189962752673234e-06, + "loss": 0.1465, + "step": 7230 + }, + { + "epoch": 0.6255129811222947, + "grad_norm": 0.5975619759720371, + "learning_rate": 9.187415933928279e-06, + "loss": 0.1525, + "step": 7240 + }, + { + "epoch": 0.6263769493282647, + "grad_norm": 0.5968403447579432, + "learning_rate": 9.184865471826988e-06, + "loss": 0.1506, + "step": 7250 + }, + { + "epoch": 0.6272409175342347, + "grad_norm": 0.6042528506000148, + "learning_rate": 9.182311368588444e-06, + "loss": 0.1486, + "step": 7260 + }, + { + "epoch": 0.6281048857402047, + "grad_norm": 0.606460473515699, + "learning_rate": 9.179753626434905e-06, + "loss": 0.1505, + "step": 7270 + }, + { + "epoch": 0.6289688539461747, + "grad_norm": 0.5903817090048041, + "learning_rate": 9.17719224759179e-06, + "loss": 0.1518, + "step": 7280 + }, + { + "epoch": 0.6298328221521448, + "grad_norm": 0.5952765730532862, + "learning_rate": 9.174627234287688e-06, + "loss": 0.1492, + "step": 7290 + }, + { + "epoch": 0.6306967903581148, + "grad_norm": 0.622409000843169, + "learning_rate": 9.172058588754345e-06, + "loss": 0.1529, + "step": 7300 + }, + { + "epoch": 0.6315607585640849, + "grad_norm": 0.57758523085189, + "learning_rate": 9.169486313226671e-06, + "loss": 0.1506, + "step": 7310 + }, + { + "epoch": 0.6324247267700549, + "grad_norm": 0.6142820795880717, + "learning_rate": 9.166910409942731e-06, + "loss": 0.1486, + "step": 7320 + }, + { + "epoch": 0.6332886949760249, + "grad_norm": 0.5969544846944455, + "learning_rate": 9.16433088114375e-06, + "loss": 0.152, + "step": 7330 + }, + { + "epoch": 0.6341526631819949, + "grad_norm": 0.6077472401893858, + "learning_rate": 9.161747729074105e-06, + "loss": 0.1546, + "step": 7340 + }, + { + "epoch": 0.6350166313879649, + "grad_norm": 0.6153979176395339, + "learning_rate": 9.159160955981326e-06, + "loss": 0.1519, + "step": 7350 + }, + { + "epoch": 0.635880599593935, + "grad_norm": 0.5856990458728484, + "learning_rate": 9.156570564116092e-06, + "loss": 0.1509, + "step": 7360 + }, + { + "epoch": 0.636744567799905, + "grad_norm": 0.5805924301676306, + "learning_rate": 9.153976555732233e-06, + "loss": 0.1441, + "step": 7370 + }, + { + "epoch": 0.637608536005875, + "grad_norm": 0.6405542270797587, + "learning_rate": 9.151378933086728e-06, + "loss": 0.1536, + "step": 7380 + }, + { + "epoch": 0.638472504211845, + "grad_norm": 0.6087345540466436, + "learning_rate": 9.148777698439695e-06, + "loss": 0.1532, + "step": 7390 + }, + { + "epoch": 0.639336472417815, + "grad_norm": 0.5881093647581587, + "learning_rate": 9.146172854054395e-06, + "loss": 0.1497, + "step": 7400 + }, + { + "epoch": 0.640200440623785, + "grad_norm": 0.653373547737377, + "learning_rate": 9.143564402197239e-06, + "loss": 0.1577, + "step": 7410 + }, + { + "epoch": 0.641064408829755, + "grad_norm": 0.5725873284917496, + "learning_rate": 9.140952345137762e-06, + "loss": 0.1482, + "step": 7420 + }, + { + "epoch": 0.6419283770357251, + "grad_norm": 0.6062841375115021, + "learning_rate": 9.138336685148648e-06, + "loss": 0.1519, + "step": 7430 + }, + { + "epoch": 0.6427923452416952, + "grad_norm": 0.6390131647406511, + "learning_rate": 9.13571742450571e-06, + "loss": 0.1472, + "step": 7440 + }, + { + "epoch": 0.6436563134476652, + "grad_norm": 0.6325898539379184, + "learning_rate": 9.133094565487894e-06, + "loss": 0.153, + "step": 7450 + }, + { + "epoch": 0.6445202816536352, + "grad_norm": 0.6244419302476996, + "learning_rate": 9.130468110377283e-06, + "loss": 0.1481, + "step": 7460 + }, + { + "epoch": 0.6453842498596052, + "grad_norm": 0.5917216909133591, + "learning_rate": 9.127838061459077e-06, + "loss": 0.1481, + "step": 7470 + }, + { + "epoch": 0.6462482180655752, + "grad_norm": 0.6155031693074737, + "learning_rate": 9.125204421021616e-06, + "loss": 0.1499, + "step": 7480 + }, + { + "epoch": 0.6471121862715452, + "grad_norm": 0.602270438150178, + "learning_rate": 9.122567191356355e-06, + "loss": 0.1495, + "step": 7490 + }, + { + "epoch": 0.6479761544775152, + "grad_norm": 0.6426993093044683, + "learning_rate": 9.119926374757876e-06, + "loss": 0.1527, + "step": 7500 + }, + { + "epoch": 0.6488401226834852, + "grad_norm": 0.5912880315934217, + "learning_rate": 9.117281973523882e-06, + "loss": 0.1501, + "step": 7510 + }, + { + "epoch": 0.6497040908894552, + "grad_norm": 0.6244924057208089, + "learning_rate": 9.114633989955194e-06, + "loss": 0.1519, + "step": 7520 + }, + { + "epoch": 0.6505680590954253, + "grad_norm": 0.611934191875604, + "learning_rate": 9.111982426355753e-06, + "loss": 0.1469, + "step": 7530 + }, + { + "epoch": 0.6514320273013953, + "grad_norm": 0.5864263801510056, + "learning_rate": 9.109327285032607e-06, + "loss": 0.149, + "step": 7540 + }, + { + "epoch": 0.6522959955073653, + "grad_norm": 0.6010224495299872, + "learning_rate": 9.106668568295927e-06, + "loss": 0.1459, + "step": 7550 + }, + { + "epoch": 0.6531599637133354, + "grad_norm": 0.6050030904357648, + "learning_rate": 9.104006278458986e-06, + "loss": 0.1458, + "step": 7560 + }, + { + "epoch": 0.6540239319193054, + "grad_norm": 0.5676610160897111, + "learning_rate": 9.101340417838171e-06, + "loss": 0.1487, + "step": 7570 + }, + { + "epoch": 0.6548879001252754, + "grad_norm": 0.6592513736662908, + "learning_rate": 9.098670988752975e-06, + "loss": 0.1479, + "step": 7580 + }, + { + "epoch": 0.6557518683312454, + "grad_norm": 0.5927040039853829, + "learning_rate": 9.095997993525999e-06, + "loss": 0.1557, + "step": 7590 + }, + { + "epoch": 0.6566158365372154, + "grad_norm": 0.6020603854415782, + "learning_rate": 9.093321434482935e-06, + "loss": 0.1571, + "step": 7600 + }, + { + "epoch": 0.6574798047431855, + "grad_norm": 0.5872803841047225, + "learning_rate": 9.09064131395259e-06, + "loss": 0.1528, + "step": 7610 + }, + { + "epoch": 0.6583437729491555, + "grad_norm": 0.5901679825406331, + "learning_rate": 9.087957634266862e-06, + "loss": 0.1502, + "step": 7620 + }, + { + "epoch": 0.6592077411551255, + "grad_norm": 0.6311244536626748, + "learning_rate": 9.085270397760748e-06, + "loss": 0.1518, + "step": 7630 + }, + { + "epoch": 0.6600717093610955, + "grad_norm": 0.5872882598881509, + "learning_rate": 9.082579606772339e-06, + "loss": 0.1528, + "step": 7640 + }, + { + "epoch": 0.6609356775670655, + "grad_norm": 0.5882786648598662, + "learning_rate": 9.079885263642818e-06, + "loss": 0.1524, + "step": 7650 + }, + { + "epoch": 0.6617996457730355, + "grad_norm": 0.5823254581363112, + "learning_rate": 9.077187370716461e-06, + "loss": 0.1505, + "step": 7660 + }, + { + "epoch": 0.6626636139790055, + "grad_norm": 0.6065493971147171, + "learning_rate": 9.074485930340631e-06, + "loss": 0.1466, + "step": 7670 + }, + { + "epoch": 0.6635275821849755, + "grad_norm": 0.6491173159325675, + "learning_rate": 9.071780944865775e-06, + "loss": 0.1472, + "step": 7680 + }, + { + "epoch": 0.6643915503909457, + "grad_norm": 0.6409731127559477, + "learning_rate": 9.06907241664543e-06, + "loss": 0.1511, + "step": 7690 + }, + { + "epoch": 0.6652555185969157, + "grad_norm": 0.581674663711164, + "learning_rate": 9.066360348036211e-06, + "loss": 0.1459, + "step": 7700 + }, + { + "epoch": 0.6661194868028857, + "grad_norm": 0.6084235174422291, + "learning_rate": 9.063644741397814e-06, + "loss": 0.1534, + "step": 7710 + }, + { + "epoch": 0.6669834550088557, + "grad_norm": 0.6142204723314244, + "learning_rate": 9.060925599093015e-06, + "loss": 0.1503, + "step": 7720 + }, + { + "epoch": 0.6678474232148257, + "grad_norm": 0.6210363271358172, + "learning_rate": 9.058202923487669e-06, + "loss": 0.1501, + "step": 7730 + }, + { + "epoch": 0.6687113914207957, + "grad_norm": 0.6328872498299684, + "learning_rate": 9.055476716950697e-06, + "loss": 0.15, + "step": 7740 + }, + { + "epoch": 0.6695753596267657, + "grad_norm": 0.6076328150996245, + "learning_rate": 9.052746981854097e-06, + "loss": 0.1475, + "step": 7750 + }, + { + "epoch": 0.6704393278327357, + "grad_norm": 0.5985368263170588, + "learning_rate": 9.050013720572941e-06, + "loss": 0.1479, + "step": 7760 + }, + { + "epoch": 0.6713032960387058, + "grad_norm": 0.6388889436940819, + "learning_rate": 9.04727693548536e-06, + "loss": 0.1526, + "step": 7770 + }, + { + "epoch": 0.6721672642446758, + "grad_norm": 0.603154034262408, + "learning_rate": 9.04453662897256e-06, + "loss": 0.1464, + "step": 7780 + }, + { + "epoch": 0.6730312324506458, + "grad_norm": 0.5762078420887212, + "learning_rate": 9.041792803418808e-06, + "loss": 0.148, + "step": 7790 + }, + { + "epoch": 0.6738952006566158, + "grad_norm": 0.6146161683014285, + "learning_rate": 9.039045461211426e-06, + "loss": 0.1521, + "step": 7800 + }, + { + "epoch": 0.6747591688625859, + "grad_norm": 0.5739686729569367, + "learning_rate": 9.036294604740805e-06, + "loss": 0.1519, + "step": 7810 + }, + { + "epoch": 0.6756231370685559, + "grad_norm": 0.5868335044090508, + "learning_rate": 9.03354023640039e-06, + "loss": 0.1497, + "step": 7820 + }, + { + "epoch": 0.6764871052745259, + "grad_norm": 0.6111017627175641, + "learning_rate": 9.030782358586684e-06, + "loss": 0.1528, + "step": 7830 + }, + { + "epoch": 0.677351073480496, + "grad_norm": 0.6252200831513823, + "learning_rate": 9.028020973699237e-06, + "loss": 0.1524, + "step": 7840 + }, + { + "epoch": 0.678215041686466, + "grad_norm": 0.6345447586951666, + "learning_rate": 9.025256084140656e-06, + "loss": 0.1502, + "step": 7850 + }, + { + "epoch": 0.679079009892436, + "grad_norm": 0.6300359296600807, + "learning_rate": 9.022487692316599e-06, + "loss": 0.1495, + "step": 7860 + }, + { + "epoch": 0.679942978098406, + "grad_norm": 0.5918604092279216, + "learning_rate": 9.019715800635764e-06, + "loss": 0.1482, + "step": 7870 + }, + { + "epoch": 0.680806946304376, + "grad_norm": 0.552573528938169, + "learning_rate": 9.0169404115099e-06, + "loss": 0.1503, + "step": 7880 + }, + { + "epoch": 0.681670914510346, + "grad_norm": 0.6159373617370572, + "learning_rate": 9.014161527353798e-06, + "loss": 0.1482, + "step": 7890 + }, + { + "epoch": 0.682534882716316, + "grad_norm": 0.5782099461611947, + "learning_rate": 9.01137915058529e-06, + "loss": 0.155, + "step": 7900 + }, + { + "epoch": 0.683398850922286, + "grad_norm": 0.5935877354981174, + "learning_rate": 9.008593283625242e-06, + "loss": 0.149, + "step": 7910 + }, + { + "epoch": 0.684262819128256, + "grad_norm": 0.5559859018366894, + "learning_rate": 9.005803928897563e-06, + "loss": 0.1466, + "step": 7920 + }, + { + "epoch": 0.685126787334226, + "grad_norm": 0.6204828417543897, + "learning_rate": 9.003011088829197e-06, + "loss": 0.1509, + "step": 7930 + }, + { + "epoch": 0.6859907555401962, + "grad_norm": 0.5691521373817178, + "learning_rate": 9.000214765850115e-06, + "loss": 0.1468, + "step": 7940 + }, + { + "epoch": 0.6868547237461662, + "grad_norm": 0.5852804581194763, + "learning_rate": 8.997414962393323e-06, + "loss": 0.1463, + "step": 7950 + }, + { + "epoch": 0.6877186919521362, + "grad_norm": 0.6049762063656603, + "learning_rate": 8.994611680894853e-06, + "loss": 0.1471, + "step": 7960 + }, + { + "epoch": 0.6885826601581062, + "grad_norm": 0.5777602090413144, + "learning_rate": 8.991804923793762e-06, + "loss": 0.1494, + "step": 7970 + }, + { + "epoch": 0.6894466283640762, + "grad_norm": 0.6044551263155569, + "learning_rate": 8.988994693532136e-06, + "loss": 0.1469, + "step": 7980 + }, + { + "epoch": 0.6903105965700462, + "grad_norm": 0.5783047865363863, + "learning_rate": 8.986180992555077e-06, + "loss": 0.148, + "step": 7990 + }, + { + "epoch": 0.6911745647760162, + "grad_norm": 0.6233075599008563, + "learning_rate": 8.983363823310712e-06, + "loss": 0.1525, + "step": 8000 + }, + { + "epoch": 0.6920385329819863, + "grad_norm": 0.6006254139480321, + "learning_rate": 8.98054318825018e-06, + "loss": 0.1509, + "step": 8010 + }, + { + "epoch": 0.6929025011879563, + "grad_norm": 0.5769597479586498, + "learning_rate": 8.977719089827638e-06, + "loss": 0.1428, + "step": 8020 + }, + { + "epoch": 0.6937664693939263, + "grad_norm": 0.5885334854420028, + "learning_rate": 8.974891530500263e-06, + "loss": 0.1461, + "step": 8030 + }, + { + "epoch": 0.6946304375998963, + "grad_norm": 0.58922621960749, + "learning_rate": 8.97206051272823e-06, + "loss": 0.1468, + "step": 8040 + }, + { + "epoch": 0.6954944058058663, + "grad_norm": 0.5900619322694628, + "learning_rate": 8.969226038974737e-06, + "loss": 0.1489, + "step": 8050 + }, + { + "epoch": 0.6963583740118363, + "grad_norm": 0.5721375599740359, + "learning_rate": 8.966388111705977e-06, + "loss": 0.1445, + "step": 8060 + }, + { + "epoch": 0.6972223422178064, + "grad_norm": 0.6060535974988339, + "learning_rate": 8.963546733391155e-06, + "loss": 0.1513, + "step": 8070 + }, + { + "epoch": 0.6980863104237764, + "grad_norm": 0.6121984155980555, + "learning_rate": 8.960701906502476e-06, + "loss": 0.1502, + "step": 8080 + }, + { + "epoch": 0.6989502786297465, + "grad_norm": 0.5883564250085476, + "learning_rate": 8.957853633515148e-06, + "loss": 0.1481, + "step": 8090 + }, + { + "epoch": 0.6998142468357165, + "grad_norm": 0.5995778157693623, + "learning_rate": 8.955001916907376e-06, + "loss": 0.1516, + "step": 8100 + }, + { + "epoch": 0.7006782150416865, + "grad_norm": 0.6060847874402603, + "learning_rate": 8.952146759160356e-06, + "loss": 0.1488, + "step": 8110 + }, + { + "epoch": 0.7015421832476565, + "grad_norm": 0.5781934980960773, + "learning_rate": 8.949288162758287e-06, + "loss": 0.1492, + "step": 8120 + }, + { + "epoch": 0.7024061514536265, + "grad_norm": 0.6051536158517672, + "learning_rate": 8.946426130188357e-06, + "loss": 0.1503, + "step": 8130 + }, + { + "epoch": 0.7032701196595965, + "grad_norm": 0.6139354449588857, + "learning_rate": 8.943560663940739e-06, + "loss": 0.1495, + "step": 8140 + }, + { + "epoch": 0.7041340878655665, + "grad_norm": 0.6126263867946146, + "learning_rate": 8.940691766508597e-06, + "loss": 0.1476, + "step": 8150 + }, + { + "epoch": 0.7049980560715365, + "grad_norm": 0.6302258584565575, + "learning_rate": 8.937819440388086e-06, + "loss": 0.1473, + "step": 8160 + }, + { + "epoch": 0.7058620242775066, + "grad_norm": 0.5923426197318572, + "learning_rate": 8.93494368807833e-06, + "loss": 0.1448, + "step": 8170 + }, + { + "epoch": 0.7067259924834766, + "grad_norm": 0.5819670560556821, + "learning_rate": 8.93206451208145e-06, + "loss": 0.1471, + "step": 8180 + }, + { + "epoch": 0.7075899606894466, + "grad_norm": 0.5704325384697982, + "learning_rate": 8.929181914902532e-06, + "loss": 0.1484, + "step": 8190 + }, + { + "epoch": 0.7084539288954167, + "grad_norm": 0.5854395681335803, + "learning_rate": 8.926295899049651e-06, + "loss": 0.1467, + "step": 8200 + }, + { + "epoch": 0.7093178971013867, + "grad_norm": 0.5956571082587886, + "learning_rate": 8.923406467033846e-06, + "loss": 0.1502, + "step": 8210 + }, + { + "epoch": 0.7101818653073567, + "grad_norm": 0.6013675361000554, + "learning_rate": 8.920513621369138e-06, + "loss": 0.1421, + "step": 8220 + }, + { + "epoch": 0.7110458335133267, + "grad_norm": 0.6169527609070847, + "learning_rate": 8.917617364572509e-06, + "loss": 0.1457, + "step": 8230 + }, + { + "epoch": 0.7119098017192967, + "grad_norm": 0.615664017102032, + "learning_rate": 8.914717699163913e-06, + "loss": 0.1528, + "step": 8240 + }, + { + "epoch": 0.7127737699252668, + "grad_norm": 0.5929835165146116, + "learning_rate": 8.91181462766627e-06, + "loss": 0.1528, + "step": 8250 + }, + { + "epoch": 0.7136377381312368, + "grad_norm": 0.5769838984731995, + "learning_rate": 8.908908152605468e-06, + "loss": 0.1467, + "step": 8260 + }, + { + "epoch": 0.7145017063372068, + "grad_norm": 0.5979484694169591, + "learning_rate": 8.905998276510347e-06, + "loss": 0.1495, + "step": 8270 + }, + { + "epoch": 0.7153656745431768, + "grad_norm": 0.6250913684486913, + "learning_rate": 8.903085001912707e-06, + "loss": 0.1503, + "step": 8280 + }, + { + "epoch": 0.7162296427491468, + "grad_norm": 0.5743702279111842, + "learning_rate": 8.90016833134732e-06, + "loss": 0.1521, + "step": 8290 + }, + { + "epoch": 0.7170936109551168, + "grad_norm": 0.5800789077997158, + "learning_rate": 8.89724826735189e-06, + "loss": 0.1488, + "step": 8300 + }, + { + "epoch": 0.7179575791610868, + "grad_norm": 0.6123653987611494, + "learning_rate": 8.894324812467092e-06, + "loss": 0.1411, + "step": 8310 + }, + { + "epoch": 0.718821547367057, + "grad_norm": 0.6413369452138997, + "learning_rate": 8.891397969236541e-06, + "loss": 0.1465, + "step": 8320 + }, + { + "epoch": 0.719685515573027, + "grad_norm": 0.6314994629599093, + "learning_rate": 8.888467740206805e-06, + "loss": 0.1482, + "step": 8330 + }, + { + "epoch": 0.720549483778997, + "grad_norm": 0.5891434300309477, + "learning_rate": 8.885534127927397e-06, + "loss": 0.1478, + "step": 8340 + }, + { + "epoch": 0.721413451984967, + "grad_norm": 0.5858090399630496, + "learning_rate": 8.882597134950772e-06, + "loss": 0.1472, + "step": 8350 + }, + { + "epoch": 0.722277420190937, + "grad_norm": 0.5638968847011544, + "learning_rate": 8.879656763832327e-06, + "loss": 0.1425, + "step": 8360 + }, + { + "epoch": 0.723141388396907, + "grad_norm": 0.6067904865153183, + "learning_rate": 8.876713017130398e-06, + "loss": 0.1459, + "step": 8370 + }, + { + "epoch": 0.724005356602877, + "grad_norm": 0.5851959853995922, + "learning_rate": 8.87376589740626e-06, + "loss": 0.1423, + "step": 8380 + }, + { + "epoch": 0.724869324808847, + "grad_norm": 0.5670882649673757, + "learning_rate": 8.870815407224121e-06, + "loss": 0.1422, + "step": 8390 + }, + { + "epoch": 0.725733293014817, + "grad_norm": 0.5792761245461205, + "learning_rate": 8.867861549151123e-06, + "loss": 0.1511, + "step": 8400 + }, + { + "epoch": 0.726597261220787, + "grad_norm": 0.6384499720411182, + "learning_rate": 8.864904325757336e-06, + "loss": 0.1458, + "step": 8410 + }, + { + "epoch": 0.7274612294267571, + "grad_norm": 0.6239567584341648, + "learning_rate": 8.861943739615761e-06, + "loss": 0.1481, + "step": 8420 + }, + { + "epoch": 0.7283251976327271, + "grad_norm": 0.6526576277949593, + "learning_rate": 8.85897979330232e-06, + "loss": 0.1457, + "step": 8430 + }, + { + "epoch": 0.7291891658386971, + "grad_norm": 0.6192375294210014, + "learning_rate": 8.856012489395865e-06, + "loss": 0.1515, + "step": 8440 + }, + { + "epoch": 0.7300531340446672, + "grad_norm": 0.5829504676468671, + "learning_rate": 8.853041830478165e-06, + "loss": 0.1443, + "step": 8450 + }, + { + "epoch": 0.7309171022506372, + "grad_norm": 0.6240893602044929, + "learning_rate": 8.85006781913391e-06, + "loss": 0.1468, + "step": 8460 + }, + { + "epoch": 0.7317810704566072, + "grad_norm": 0.6402177545734322, + "learning_rate": 8.847090457950704e-06, + "loss": 0.1504, + "step": 8470 + }, + { + "epoch": 0.7326450386625772, + "grad_norm": 0.6354668520522381, + "learning_rate": 8.84410974951907e-06, + "loss": 0.1463, + "step": 8480 + }, + { + "epoch": 0.7335090068685473, + "grad_norm": 0.5727464227472098, + "learning_rate": 8.841125696432438e-06, + "loss": 0.1498, + "step": 8490 + }, + { + "epoch": 0.7343729750745173, + "grad_norm": 0.6018868846257793, + "learning_rate": 8.838138301287156e-06, + "loss": 0.1467, + "step": 8500 + }, + { + "epoch": 0.7352369432804873, + "grad_norm": 0.5644741322670542, + "learning_rate": 8.835147566682472e-06, + "loss": 0.1463, + "step": 8510 + }, + { + "epoch": 0.7361009114864573, + "grad_norm": 0.5673974911271266, + "learning_rate": 8.832153495220543e-06, + "loss": 0.1475, + "step": 8520 + }, + { + "epoch": 0.7369648796924273, + "grad_norm": 0.5964122079459192, + "learning_rate": 8.829156089506427e-06, + "loss": 0.1503, + "step": 8530 + }, + { + "epoch": 0.7378288478983973, + "grad_norm": 0.5715091500499475, + "learning_rate": 8.826155352148089e-06, + "loss": 0.143, + "step": 8540 + }, + { + "epoch": 0.7386928161043673, + "grad_norm": 0.6035211545064121, + "learning_rate": 8.823151285756383e-06, + "loss": 0.1494, + "step": 8550 + }, + { + "epoch": 0.7395567843103373, + "grad_norm": 0.5289010319710973, + "learning_rate": 8.820143892945072e-06, + "loss": 0.1465, + "step": 8560 + }, + { + "epoch": 0.7404207525163073, + "grad_norm": 0.6314524880705613, + "learning_rate": 8.817133176330802e-06, + "loss": 0.1486, + "step": 8570 + }, + { + "epoch": 0.7412847207222775, + "grad_norm": 0.6019438096859134, + "learning_rate": 8.814119138533117e-06, + "loss": 0.149, + "step": 8580 + }, + { + "epoch": 0.7421486889282475, + "grad_norm": 0.5685350683665455, + "learning_rate": 8.811101782174447e-06, + "loss": 0.1482, + "step": 8590 + }, + { + "epoch": 0.7430126571342175, + "grad_norm": 0.5672312522177472, + "learning_rate": 8.808081109880113e-06, + "loss": 0.149, + "step": 8600 + }, + { + "epoch": 0.7438766253401875, + "grad_norm": 0.6033392735761053, + "learning_rate": 8.80505712427832e-06, + "loss": 0.1478, + "step": 8610 + }, + { + "epoch": 0.7447405935461575, + "grad_norm": 0.5852922151350154, + "learning_rate": 8.802029828000157e-06, + "loss": 0.1461, + "step": 8620 + }, + { + "epoch": 0.7456045617521275, + "grad_norm": 0.5895046658401459, + "learning_rate": 8.798999223679586e-06, + "loss": 0.1462, + "step": 8630 + }, + { + "epoch": 0.7464685299580975, + "grad_norm": 0.5961991481966116, + "learning_rate": 8.79596531395346e-06, + "loss": 0.1462, + "step": 8640 + }, + { + "epoch": 0.7473324981640675, + "grad_norm": 0.5457241905915233, + "learning_rate": 8.792928101461493e-06, + "loss": 0.1482, + "step": 8650 + }, + { + "epoch": 0.7481964663700376, + "grad_norm": 0.5709887805971291, + "learning_rate": 8.789887588846288e-06, + "loss": 0.1464, + "step": 8660 + }, + { + "epoch": 0.7490604345760076, + "grad_norm": 0.6179251161053763, + "learning_rate": 8.786843778753311e-06, + "loss": 0.1512, + "step": 8670 + }, + { + "epoch": 0.7499244027819776, + "grad_norm": 0.5417188053967987, + "learning_rate": 8.783796673830896e-06, + "loss": 0.1473, + "step": 8680 + }, + { + "epoch": 0.7507883709879476, + "grad_norm": 0.565076720183294, + "learning_rate": 8.780746276730246e-06, + "loss": 0.145, + "step": 8690 + }, + { + "epoch": 0.7516523391939177, + "grad_norm": 0.5855244277950163, + "learning_rate": 8.777692590105429e-06, + "loss": 0.1462, + "step": 8700 + }, + { + "epoch": 0.7525163073998877, + "grad_norm": 0.5497821035086375, + "learning_rate": 8.774635616613373e-06, + "loss": 0.1442, + "step": 8710 + }, + { + "epoch": 0.7533802756058577, + "grad_norm": 0.6039032637599129, + "learning_rate": 8.771575358913871e-06, + "loss": 0.1462, + "step": 8720 + }, + { + "epoch": 0.7542442438118278, + "grad_norm": 0.5750476580302071, + "learning_rate": 8.768511819669566e-06, + "loss": 0.1453, + "step": 8730 + }, + { + "epoch": 0.7551082120177978, + "grad_norm": 0.5568443722713766, + "learning_rate": 8.765445001545961e-06, + "loss": 0.1503, + "step": 8740 + }, + { + "epoch": 0.7559721802237678, + "grad_norm": 0.6349949989616961, + "learning_rate": 8.76237490721141e-06, + "loss": 0.1458, + "step": 8750 + }, + { + "epoch": 0.7568361484297378, + "grad_norm": 0.5900255153823822, + "learning_rate": 8.75930153933712e-06, + "loss": 0.1488, + "step": 8760 + }, + { + "epoch": 0.7577001166357078, + "grad_norm": 0.5726499989030004, + "learning_rate": 8.756224900597144e-06, + "loss": 0.1463, + "step": 8770 + }, + { + "epoch": 0.7585640848416778, + "grad_norm": 0.5794097752200494, + "learning_rate": 8.75314499366838e-06, + "loss": 0.1476, + "step": 8780 + }, + { + "epoch": 0.7594280530476478, + "grad_norm": 0.5592726753611104, + "learning_rate": 8.750061821230573e-06, + "loss": 0.1456, + "step": 8790 + }, + { + "epoch": 0.7602920212536178, + "grad_norm": 0.5642859448954136, + "learning_rate": 8.746975385966305e-06, + "loss": 0.1444, + "step": 8800 + }, + { + "epoch": 0.7611559894595878, + "grad_norm": 0.602949892871976, + "learning_rate": 8.743885690561002e-06, + "loss": 0.1476, + "step": 8810 + }, + { + "epoch": 0.7620199576655579, + "grad_norm": 0.578749309560873, + "learning_rate": 8.740792737702921e-06, + "loss": 0.1464, + "step": 8820 + }, + { + "epoch": 0.762883925871528, + "grad_norm": 0.5827033788877268, + "learning_rate": 8.737696530083158e-06, + "loss": 0.1444, + "step": 8830 + }, + { + "epoch": 0.763747894077498, + "grad_norm": 0.5737302603011082, + "learning_rate": 8.734597070395635e-06, + "loss": 0.1476, + "step": 8840 + }, + { + "epoch": 0.764611862283468, + "grad_norm": 0.567276002499956, + "learning_rate": 8.731494361337111e-06, + "loss": 0.147, + "step": 8850 + }, + { + "epoch": 0.765475830489438, + "grad_norm": 0.5785412025187946, + "learning_rate": 8.72838840560717e-06, + "loss": 0.1466, + "step": 8860 + }, + { + "epoch": 0.766339798695408, + "grad_norm": 0.6021940515191849, + "learning_rate": 8.725279205908214e-06, + "loss": 0.1424, + "step": 8870 + }, + { + "epoch": 0.767203766901378, + "grad_norm": 0.5678686047684466, + "learning_rate": 8.722166764945476e-06, + "loss": 0.1416, + "step": 8880 + }, + { + "epoch": 0.768067735107348, + "grad_norm": 0.5983675801989564, + "learning_rate": 8.719051085427007e-06, + "loss": 0.1445, + "step": 8890 + }, + { + "epoch": 0.7689317033133181, + "grad_norm": 0.5508988572074687, + "learning_rate": 8.71593217006367e-06, + "loss": 0.1466, + "step": 8900 + }, + { + "epoch": 0.7697956715192881, + "grad_norm": 0.5884231126943428, + "learning_rate": 8.712810021569153e-06, + "loss": 0.1509, + "step": 8910 + }, + { + "epoch": 0.7706596397252581, + "grad_norm": 0.5702397352103349, + "learning_rate": 8.709684642659952e-06, + "loss": 0.15, + "step": 8920 + }, + { + "epoch": 0.7715236079312281, + "grad_norm": 0.5646379676204086, + "learning_rate": 8.706556036055372e-06, + "loss": 0.1448, + "step": 8930 + }, + { + "epoch": 0.7723875761371981, + "grad_norm": 0.5908474945066845, + "learning_rate": 8.703424204477527e-06, + "loss": 0.1484, + "step": 8940 + }, + { + "epoch": 0.7732515443431681, + "grad_norm": 0.5582469316100711, + "learning_rate": 8.700289150651342e-06, + "loss": 0.1452, + "step": 8950 + }, + { + "epoch": 0.7741155125491382, + "grad_norm": 0.5758318223072699, + "learning_rate": 8.69715087730454e-06, + "loss": 0.1439, + "step": 8960 + }, + { + "epoch": 0.7749794807551083, + "grad_norm": 0.5851791361562063, + "learning_rate": 8.694009387167643e-06, + "loss": 0.1461, + "step": 8970 + }, + { + "epoch": 0.7758434489610783, + "grad_norm": 0.55539409751672, + "learning_rate": 8.690864682973983e-06, + "loss": 0.1429, + "step": 8980 + }, + { + "epoch": 0.7767074171670483, + "grad_norm": 0.5972513387417647, + "learning_rate": 8.687716767459677e-06, + "loss": 0.1438, + "step": 8990 + }, + { + "epoch": 0.7775713853730183, + "grad_norm": 0.5449750895753459, + "learning_rate": 8.68456564336364e-06, + "loss": 0.1423, + "step": 9000 + }, + { + "epoch": 0.7784353535789883, + "grad_norm": 0.5629247273553248, + "learning_rate": 8.681411313427584e-06, + "loss": 0.1406, + "step": 9010 + }, + { + "epoch": 0.7792993217849583, + "grad_norm": 0.5815650093028251, + "learning_rate": 8.678253780395997e-06, + "loss": 0.1498, + "step": 9020 + }, + { + "epoch": 0.7801632899909283, + "grad_norm": 0.6061694848721607, + "learning_rate": 8.67509304701617e-06, + "loss": 0.147, + "step": 9030 + }, + { + "epoch": 0.7810272581968983, + "grad_norm": 0.6055037881598447, + "learning_rate": 8.671929116038167e-06, + "loss": 0.1468, + "step": 9040 + }, + { + "epoch": 0.7818912264028683, + "grad_norm": 0.5850345947610444, + "learning_rate": 8.66876199021484e-06, + "loss": 0.1484, + "step": 9050 + }, + { + "epoch": 0.7827551946088384, + "grad_norm": 0.5542978640162831, + "learning_rate": 8.665591672301816e-06, + "loss": 0.1426, + "step": 9060 + }, + { + "epoch": 0.7836191628148084, + "grad_norm": 0.5928333489110242, + "learning_rate": 8.662418165057507e-06, + "loss": 0.1429, + "step": 9070 + }, + { + "epoch": 0.7844831310207785, + "grad_norm": 0.5648483381701337, + "learning_rate": 8.659241471243088e-06, + "loss": 0.1445, + "step": 9080 + }, + { + "epoch": 0.7853470992267485, + "grad_norm": 0.6297067607299209, + "learning_rate": 8.656061593622521e-06, + "loss": 0.1486, + "step": 9090 + }, + { + "epoch": 0.7862110674327185, + "grad_norm": 0.5783598666839724, + "learning_rate": 8.652878534962523e-06, + "loss": 0.1449, + "step": 9100 + }, + { + "epoch": 0.7870750356386885, + "grad_norm": 0.5808145834168469, + "learning_rate": 8.649692298032594e-06, + "loss": 0.1452, + "step": 9110 + }, + { + "epoch": 0.7879390038446585, + "grad_norm": 0.6257064540825436, + "learning_rate": 8.646502885604988e-06, + "loss": 0.1439, + "step": 9120 + }, + { + "epoch": 0.7888029720506285, + "grad_norm": 0.5958553918867221, + "learning_rate": 8.643310300454726e-06, + "loss": 0.1498, + "step": 9130 + }, + { + "epoch": 0.7896669402565986, + "grad_norm": 0.5718667220315514, + "learning_rate": 8.640114545359589e-06, + "loss": 0.1423, + "step": 9140 + }, + { + "epoch": 0.7905309084625686, + "grad_norm": 0.5828008232292384, + "learning_rate": 8.636915623100116e-06, + "loss": 0.1427, + "step": 9150 + }, + { + "epoch": 0.7913948766685386, + "grad_norm": 0.6078309638455678, + "learning_rate": 8.633713536459603e-06, + "loss": 0.1493, + "step": 9160 + }, + { + "epoch": 0.7922588448745086, + "grad_norm": 0.5805499311435128, + "learning_rate": 8.6305082882241e-06, + "loss": 0.1452, + "step": 9170 + }, + { + "epoch": 0.7931228130804786, + "grad_norm": 0.5523180415389615, + "learning_rate": 8.627299881182402e-06, + "loss": 0.1484, + "step": 9180 + }, + { + "epoch": 0.7939867812864486, + "grad_norm": 0.5731511610127191, + "learning_rate": 8.624088318126057e-06, + "loss": 0.1472, + "step": 9190 + }, + { + "epoch": 0.7948507494924186, + "grad_norm": 0.5711686241765782, + "learning_rate": 8.620873601849362e-06, + "loss": 0.1441, + "step": 9200 + }, + { + "epoch": 0.7957147176983888, + "grad_norm": 0.5559112922512367, + "learning_rate": 8.617655735149354e-06, + "loss": 0.15, + "step": 9210 + }, + { + "epoch": 0.7965786859043588, + "grad_norm": 0.5942008849380511, + "learning_rate": 8.614434720825805e-06, + "loss": 0.1453, + "step": 9220 + }, + { + "epoch": 0.7974426541103288, + "grad_norm": 0.5826039170890981, + "learning_rate": 8.611210561681236e-06, + "loss": 0.147, + "step": 9230 + }, + { + "epoch": 0.7983066223162988, + "grad_norm": 0.5809325892616363, + "learning_rate": 8.6079832605209e-06, + "loss": 0.1439, + "step": 9240 + }, + { + "epoch": 0.7991705905222688, + "grad_norm": 0.551222305517412, + "learning_rate": 8.604752820152782e-06, + "loss": 0.1451, + "step": 9250 + }, + { + "epoch": 0.8000345587282388, + "grad_norm": 0.569466216351549, + "learning_rate": 8.601519243387602e-06, + "loss": 0.1408, + "step": 9260 + }, + { + "epoch": 0.8008985269342088, + "grad_norm": 0.5632939536324367, + "learning_rate": 8.598282533038804e-06, + "loss": 0.1414, + "step": 9270 + }, + { + "epoch": 0.8017624951401788, + "grad_norm": 0.5945112711943039, + "learning_rate": 8.595042691922564e-06, + "loss": 0.149, + "step": 9280 + }, + { + "epoch": 0.8026264633461488, + "grad_norm": 0.6257070347054505, + "learning_rate": 8.591799722857779e-06, + "loss": 0.1468, + "step": 9290 + }, + { + "epoch": 0.8034904315521189, + "grad_norm": 0.5502878627500125, + "learning_rate": 8.588553628666067e-06, + "loss": 0.1459, + "step": 9300 + }, + { + "epoch": 0.8043543997580889, + "grad_norm": 0.5542111041310896, + "learning_rate": 8.585304412171767e-06, + "loss": 0.143, + "step": 9310 + }, + { + "epoch": 0.8052183679640589, + "grad_norm": 0.5326126029738462, + "learning_rate": 8.582052076201932e-06, + "loss": 0.1423, + "step": 9320 + }, + { + "epoch": 0.8060823361700289, + "grad_norm": 0.5885736349948085, + "learning_rate": 8.578796623586332e-06, + "loss": 0.1447, + "step": 9330 + }, + { + "epoch": 0.806946304375999, + "grad_norm": 0.584316686720796, + "learning_rate": 8.575538057157448e-06, + "loss": 0.1448, + "step": 9340 + }, + { + "epoch": 0.807810272581969, + "grad_norm": 0.5664241111477385, + "learning_rate": 8.572276379750468e-06, + "loss": 0.145, + "step": 9350 + }, + { + "epoch": 0.808674240787939, + "grad_norm": 0.5993574255434805, + "learning_rate": 8.569011594203291e-06, + "loss": 0.1458, + "step": 9360 + }, + { + "epoch": 0.809538208993909, + "grad_norm": 0.5888740832156935, + "learning_rate": 8.565743703356514e-06, + "loss": 0.1422, + "step": 9370 + }, + { + "epoch": 0.8104021771998791, + "grad_norm": 0.5360589804242816, + "learning_rate": 8.562472710053444e-06, + "loss": 0.1405, + "step": 9380 + }, + { + "epoch": 0.8112661454058491, + "grad_norm": 0.576297123132208, + "learning_rate": 8.559198617140081e-06, + "loss": 0.1421, + "step": 9390 + }, + { + "epoch": 0.8121301136118191, + "grad_norm": 0.589692619006291, + "learning_rate": 8.555921427465124e-06, + "loss": 0.1462, + "step": 9400 + }, + { + "epoch": 0.8129940818177891, + "grad_norm": 0.586639880883335, + "learning_rate": 8.552641143879965e-06, + "loss": 0.1418, + "step": 9410 + }, + { + "epoch": 0.8138580500237591, + "grad_norm": 0.5728715638136743, + "learning_rate": 8.549357769238689e-06, + "loss": 0.1482, + "step": 9420 + }, + { + "epoch": 0.8147220182297291, + "grad_norm": 0.5577477890301851, + "learning_rate": 8.546071306398075e-06, + "loss": 0.142, + "step": 9430 + }, + { + "epoch": 0.8155859864356991, + "grad_norm": 0.5773828818872149, + "learning_rate": 8.54278175821758e-06, + "loss": 0.1462, + "step": 9440 + }, + { + "epoch": 0.8164499546416691, + "grad_norm": 0.6148625238176575, + "learning_rate": 8.539489127559352e-06, + "loss": 0.1439, + "step": 9450 + }, + { + "epoch": 0.8173139228476392, + "grad_norm": 0.5809787205443636, + "learning_rate": 8.53619341728822e-06, + "loss": 0.1446, + "step": 9460 + }, + { + "epoch": 0.8181778910536093, + "grad_norm": 0.5673805345122123, + "learning_rate": 8.532894630271691e-06, + "loss": 0.1464, + "step": 9470 + }, + { + "epoch": 0.8190418592595793, + "grad_norm": 0.5714831001212896, + "learning_rate": 8.529592769379947e-06, + "loss": 0.1467, + "step": 9480 + }, + { + "epoch": 0.8199058274655493, + "grad_norm": 0.8142466454645951, + "learning_rate": 8.52628783748585e-06, + "loss": 0.146, + "step": 9490 + }, + { + "epoch": 0.8207697956715193, + "grad_norm": 0.5736789389590432, + "learning_rate": 8.52297983746493e-06, + "loss": 0.1467, + "step": 9500 + }, + { + "epoch": 0.8216337638774893, + "grad_norm": 0.5758982922754664, + "learning_rate": 8.519668772195389e-06, + "loss": 0.1437, + "step": 9510 + }, + { + "epoch": 0.8224977320834593, + "grad_norm": 0.5451983109559408, + "learning_rate": 8.516354644558091e-06, + "loss": 0.1428, + "step": 9520 + }, + { + "epoch": 0.8233617002894293, + "grad_norm": 0.5781843496988237, + "learning_rate": 8.513037457436571e-06, + "loss": 0.1421, + "step": 9530 + }, + { + "epoch": 0.8242256684953994, + "grad_norm": 0.5660860358702046, + "learning_rate": 8.509717213717023e-06, + "loss": 0.1466, + "step": 9540 + }, + { + "epoch": 0.8250896367013694, + "grad_norm": 0.5875923545017205, + "learning_rate": 8.5063939162883e-06, + "loss": 0.1471, + "step": 9550 + }, + { + "epoch": 0.8259536049073394, + "grad_norm": 0.5793638698834187, + "learning_rate": 8.503067568041912e-06, + "loss": 0.1435, + "step": 9560 + }, + { + "epoch": 0.8268175731133094, + "grad_norm": 0.6082894408541321, + "learning_rate": 8.499738171872024e-06, + "loss": 0.1476, + "step": 9570 + }, + { + "epoch": 0.8276815413192794, + "grad_norm": 0.5410052725196663, + "learning_rate": 8.496405730675453e-06, + "loss": 0.1413, + "step": 9580 + }, + { + "epoch": 0.8285455095252495, + "grad_norm": 0.5668481687999716, + "learning_rate": 8.493070247351665e-06, + "loss": 0.1428, + "step": 9590 + }, + { + "epoch": 0.8294094777312195, + "grad_norm": 0.5396188203926512, + "learning_rate": 8.48973172480277e-06, + "loss": 0.1452, + "step": 9600 + }, + { + "epoch": 0.8302734459371895, + "grad_norm": 0.5944775835927977, + "learning_rate": 8.48639016593353e-06, + "loss": 0.1416, + "step": 9610 + }, + { + "epoch": 0.8311374141431596, + "grad_norm": 0.5707292481605472, + "learning_rate": 8.483045573651345e-06, + "loss": 0.1443, + "step": 9620 + }, + { + "epoch": 0.8320013823491296, + "grad_norm": 0.5596948995294987, + "learning_rate": 8.479697950866249e-06, + "loss": 0.1414, + "step": 9630 + }, + { + "epoch": 0.8328653505550996, + "grad_norm": 0.5789700110781804, + "learning_rate": 8.476347300490919e-06, + "loss": 0.1428, + "step": 9640 + }, + { + "epoch": 0.8337293187610696, + "grad_norm": 0.5816059812515278, + "learning_rate": 8.472993625440666e-06, + "loss": 0.1503, + "step": 9650 + }, + { + "epoch": 0.8345932869670396, + "grad_norm": 0.5583523771021892, + "learning_rate": 8.469636928633426e-06, + "loss": 0.1464, + "step": 9660 + }, + { + "epoch": 0.8354572551730096, + "grad_norm": 0.5783974918171014, + "learning_rate": 8.466277212989778e-06, + "loss": 0.1421, + "step": 9670 + }, + { + "epoch": 0.8363212233789796, + "grad_norm": 0.5637539984292849, + "learning_rate": 8.462914481432912e-06, + "loss": 0.1407, + "step": 9680 + }, + { + "epoch": 0.8371851915849496, + "grad_norm": 0.5469313058157097, + "learning_rate": 8.459548736888651e-06, + "loss": 0.1441, + "step": 9690 + }, + { + "epoch": 0.8380491597909197, + "grad_norm": 0.5781885951173833, + "learning_rate": 8.456179982285437e-06, + "loss": 0.1423, + "step": 9700 + }, + { + "epoch": 0.8389131279968897, + "grad_norm": 0.5929979133896947, + "learning_rate": 8.452808220554332e-06, + "loss": 0.1419, + "step": 9710 + }, + { + "epoch": 0.8397770962028598, + "grad_norm": 0.5688459632744697, + "learning_rate": 8.449433454629015e-06, + "loss": 0.1462, + "step": 9720 + }, + { + "epoch": 0.8406410644088298, + "grad_norm": 0.5603455231856802, + "learning_rate": 8.446055687445774e-06, + "loss": 0.1448, + "step": 9730 + }, + { + "epoch": 0.8415050326147998, + "grad_norm": 0.5859270782944502, + "learning_rate": 8.442674921943516e-06, + "loss": 0.142, + "step": 9740 + }, + { + "epoch": 0.8423690008207698, + "grad_norm": 0.5798335827513441, + "learning_rate": 8.439291161063751e-06, + "loss": 0.141, + "step": 9750 + }, + { + "epoch": 0.8432329690267398, + "grad_norm": 0.5565217504939, + "learning_rate": 8.4359044077506e-06, + "loss": 0.1428, + "step": 9760 + }, + { + "epoch": 0.8440969372327098, + "grad_norm": 0.6040073775552014, + "learning_rate": 8.43251466495078e-06, + "loss": 0.1396, + "step": 9770 + }, + { + "epoch": 0.8449609054386799, + "grad_norm": 0.5687283030783332, + "learning_rate": 8.429121935613614e-06, + "loss": 0.1396, + "step": 9780 + }, + { + "epoch": 0.8458248736446499, + "grad_norm": 0.5425553417122125, + "learning_rate": 8.425726222691027e-06, + "loss": 0.1434, + "step": 9790 + }, + { + "epoch": 0.8466888418506199, + "grad_norm": 0.5518060633249242, + "learning_rate": 8.422327529137534e-06, + "loss": 0.1459, + "step": 9800 + }, + { + "epoch": 0.8475528100565899, + "grad_norm": 0.5433804487253319, + "learning_rate": 8.418925857910245e-06, + "loss": 0.1408, + "step": 9810 + }, + { + "epoch": 0.8484167782625599, + "grad_norm": 0.5442315827085944, + "learning_rate": 8.415521211968862e-06, + "loss": 0.1477, + "step": 9820 + }, + { + "epoch": 0.8492807464685299, + "grad_norm": 0.5521307095439536, + "learning_rate": 8.412113594275676e-06, + "loss": 0.1432, + "step": 9830 + }, + { + "epoch": 0.8501447146744999, + "grad_norm": 0.5785067887210904, + "learning_rate": 8.408703007795559e-06, + "loss": 0.1442, + "step": 9840 + }, + { + "epoch": 0.85100868288047, + "grad_norm": 0.5888646780531525, + "learning_rate": 8.405289455495971e-06, + "loss": 0.1432, + "step": 9850 + }, + { + "epoch": 0.8518726510864401, + "grad_norm": 0.5869356385591026, + "learning_rate": 8.401872940346952e-06, + "loss": 0.1435, + "step": 9860 + }, + { + "epoch": 0.8527366192924101, + "grad_norm": 0.5647084533037493, + "learning_rate": 8.398453465321115e-06, + "loss": 0.1413, + "step": 9870 + }, + { + "epoch": 0.8536005874983801, + "grad_norm": 0.5805929245861594, + "learning_rate": 8.395031033393655e-06, + "loss": 0.1471, + "step": 9880 + }, + { + "epoch": 0.8544645557043501, + "grad_norm": 0.5818692090965515, + "learning_rate": 8.391605647542336e-06, + "loss": 0.1415, + "step": 9890 + }, + { + "epoch": 0.8553285239103201, + "grad_norm": 0.5623579419758806, + "learning_rate": 8.388177310747494e-06, + "loss": 0.1445, + "step": 9900 + }, + { + "epoch": 0.8561924921162901, + "grad_norm": 0.5842301010806707, + "learning_rate": 8.384746025992026e-06, + "loss": 0.1388, + "step": 9910 + }, + { + "epoch": 0.8570564603222601, + "grad_norm": 0.552275823798508, + "learning_rate": 8.381311796261407e-06, + "loss": 0.1454, + "step": 9920 + }, + { + "epoch": 0.8579204285282301, + "grad_norm": 0.5814597997265838, + "learning_rate": 8.37787462454366e-06, + "loss": 0.1426, + "step": 9930 + }, + { + "epoch": 0.8587843967342002, + "grad_norm": 0.5391815636893464, + "learning_rate": 8.374434513829377e-06, + "loss": 0.1425, + "step": 9940 + }, + { + "epoch": 0.8596483649401702, + "grad_norm": 0.5909735682466266, + "learning_rate": 8.370991467111705e-06, + "loss": 0.1457, + "step": 9950 + }, + { + "epoch": 0.8605123331461402, + "grad_norm": 0.5908399739792674, + "learning_rate": 8.367545487386345e-06, + "loss": 0.1423, + "step": 9960 + }, + { + "epoch": 0.8613763013521103, + "grad_norm": 0.5702025368686541, + "learning_rate": 8.36409657765155e-06, + "loss": 0.1407, + "step": 9970 + }, + { + "epoch": 0.8622402695580803, + "grad_norm": 0.616393750494224, + "learning_rate": 8.36064474090812e-06, + "loss": 0.1454, + "step": 9980 + }, + { + "epoch": 0.8631042377640503, + "grad_norm": 0.5793933000509822, + "learning_rate": 8.357189980159405e-06, + "loss": 0.142, + "step": 9990 + }, + { + "epoch": 0.8639682059700203, + "grad_norm": 0.6120735127654011, + "learning_rate": 8.353732298411298e-06, + "loss": 0.143, + "step": 10000 + }, + { + "epoch": 0.8648321741759903, + "grad_norm": 0.5672142424827209, + "learning_rate": 8.350271698672236e-06, + "loss": 0.1465, + "step": 10010 + }, + { + "epoch": 0.8656961423819604, + "grad_norm": 0.5902729900228314, + "learning_rate": 8.34680818395319e-06, + "loss": 0.1435, + "step": 10020 + }, + { + "epoch": 0.8665601105879304, + "grad_norm": 0.5691844420715534, + "learning_rate": 8.343341757267671e-06, + "loss": 0.1439, + "step": 10030 + }, + { + "epoch": 0.8674240787939004, + "grad_norm": 0.5525245356489779, + "learning_rate": 8.339872421631718e-06, + "loss": 0.1421, + "step": 10040 + }, + { + "epoch": 0.8682880469998704, + "grad_norm": 0.5633902551254112, + "learning_rate": 8.336400180063913e-06, + "loss": 0.1405, + "step": 10050 + }, + { + "epoch": 0.8691520152058404, + "grad_norm": 0.5522452533333609, + "learning_rate": 8.332925035585351e-06, + "loss": 0.1406, + "step": 10060 + }, + { + "epoch": 0.8700159834118104, + "grad_norm": 0.5683843284307902, + "learning_rate": 8.329446991219665e-06, + "loss": 0.1414, + "step": 10070 + }, + { + "epoch": 0.8708799516177804, + "grad_norm": 0.8877940351300201, + "learning_rate": 8.325966049993004e-06, + "loss": 0.1443, + "step": 10080 + }, + { + "epoch": 0.8717439198237504, + "grad_norm": 0.6050744894035649, + "learning_rate": 8.322482214934044e-06, + "loss": 0.1441, + "step": 10090 + }, + { + "epoch": 0.8726078880297206, + "grad_norm": 0.5650088110025322, + "learning_rate": 8.318995489073968e-06, + "loss": 0.1416, + "step": 10100 + }, + { + "epoch": 0.8734718562356906, + "grad_norm": 0.5571218501034207, + "learning_rate": 8.315505875446489e-06, + "loss": 0.1452, + "step": 10110 + }, + { + "epoch": 0.8743358244416606, + "grad_norm": 0.531945157846496, + "learning_rate": 8.31201337708782e-06, + "loss": 0.1413, + "step": 10120 + }, + { + "epoch": 0.8751997926476306, + "grad_norm": 0.5793899013617272, + "learning_rate": 8.308517997036687e-06, + "loss": 0.1419, + "step": 10130 + }, + { + "epoch": 0.8760637608536006, + "grad_norm": 0.5522987674972991, + "learning_rate": 8.305019738334328e-06, + "loss": 0.1429, + "step": 10140 + }, + { + "epoch": 0.8769277290595706, + "grad_norm": 0.599533080514142, + "learning_rate": 8.301518604024481e-06, + "loss": 0.1483, + "step": 10150 + }, + { + "epoch": 0.8777916972655406, + "grad_norm": 0.5820611165673861, + "learning_rate": 8.298014597153387e-06, + "loss": 0.1449, + "step": 10160 + }, + { + "epoch": 0.8786556654715106, + "grad_norm": 0.552105008952654, + "learning_rate": 8.294507720769789e-06, + "loss": 0.1422, + "step": 10170 + }, + { + "epoch": 0.8795196336774807, + "grad_norm": 0.589431986283032, + "learning_rate": 8.290997977924922e-06, + "loss": 0.142, + "step": 10180 + }, + { + "epoch": 0.8803836018834507, + "grad_norm": 0.5912403064747072, + "learning_rate": 8.287485371672518e-06, + "loss": 0.1455, + "step": 10190 + }, + { + "epoch": 0.8812475700894207, + "grad_norm": 0.5835849936868334, + "learning_rate": 8.283969905068803e-06, + "loss": 0.141, + "step": 10200 + }, + { + "epoch": 0.8821115382953907, + "grad_norm": 0.5560030512889691, + "learning_rate": 8.280451581172484e-06, + "loss": 0.1424, + "step": 10210 + }, + { + "epoch": 0.8829755065013607, + "grad_norm": 0.5646674155359306, + "learning_rate": 8.276930403044759e-06, + "loss": 0.1457, + "step": 10220 + }, + { + "epoch": 0.8838394747073308, + "grad_norm": 0.5737466726861257, + "learning_rate": 8.273406373749313e-06, + "loss": 0.1441, + "step": 10230 + }, + { + "epoch": 0.8847034429133008, + "grad_norm": 0.5523276034253872, + "learning_rate": 8.269879496352304e-06, + "loss": 0.1405, + "step": 10240 + }, + { + "epoch": 0.8855674111192708, + "grad_norm": 0.5534688383692349, + "learning_rate": 8.266349773922372e-06, + "loss": 0.137, + "step": 10250 + }, + { + "epoch": 0.8864313793252409, + "grad_norm": 0.601059059729315, + "learning_rate": 8.262817209530636e-06, + "loss": 0.1398, + "step": 10260 + }, + { + "epoch": 0.8872953475312109, + "grad_norm": 0.548502422051078, + "learning_rate": 8.259281806250678e-06, + "loss": 0.1411, + "step": 10270 + }, + { + "epoch": 0.8881593157371809, + "grad_norm": 0.5333366385953059, + "learning_rate": 8.255743567158561e-06, + "loss": 0.142, + "step": 10280 + }, + { + "epoch": 0.8890232839431509, + "grad_norm": 0.5154244022617628, + "learning_rate": 8.252202495332808e-06, + "loss": 0.1385, + "step": 10290 + }, + { + "epoch": 0.8898872521491209, + "grad_norm": 0.5613558968123625, + "learning_rate": 8.248658593854408e-06, + "loss": 0.138, + "step": 10300 + }, + { + "epoch": 0.8907512203550909, + "grad_norm": 0.5449671665400901, + "learning_rate": 8.245111865806816e-06, + "loss": 0.1427, + "step": 10310 + }, + { + "epoch": 0.8916151885610609, + "grad_norm": 0.6077993286632034, + "learning_rate": 8.24156231427594e-06, + "loss": 0.1397, + "step": 10320 + }, + { + "epoch": 0.8924791567670309, + "grad_norm": 0.5669974577554227, + "learning_rate": 8.23800994235015e-06, + "loss": 0.1409, + "step": 10330 + }, + { + "epoch": 0.893343124973001, + "grad_norm": 0.553947870504879, + "learning_rate": 8.234454753120268e-06, + "loss": 0.1427, + "step": 10340 + }, + { + "epoch": 0.8942070931789711, + "grad_norm": 0.5480355609367276, + "learning_rate": 8.230896749679566e-06, + "loss": 0.1445, + "step": 10350 + }, + { + "epoch": 0.8950710613849411, + "grad_norm": 0.567683345822826, + "learning_rate": 8.227335935123766e-06, + "loss": 0.1417, + "step": 10360 + }, + { + "epoch": 0.8959350295909111, + "grad_norm": 0.5923492791517332, + "learning_rate": 8.223772312551035e-06, + "loss": 0.1452, + "step": 10370 + }, + { + "epoch": 0.8967989977968811, + "grad_norm": 0.559847173910797, + "learning_rate": 8.220205885061986e-06, + "loss": 0.142, + "step": 10380 + }, + { + "epoch": 0.8976629660028511, + "grad_norm": 0.52678466638974, + "learning_rate": 8.216636655759666e-06, + "loss": 0.1388, + "step": 10390 + }, + { + "epoch": 0.8985269342088211, + "grad_norm": 0.5679754532984375, + "learning_rate": 8.213064627749567e-06, + "loss": 0.1431, + "step": 10400 + }, + { + "epoch": 0.8993909024147911, + "grad_norm": 0.535769991144621, + "learning_rate": 8.209489804139614e-06, + "loss": 0.142, + "step": 10410 + }, + { + "epoch": 0.9002548706207611, + "grad_norm": 0.5849730580217068, + "learning_rate": 8.205912188040164e-06, + "loss": 0.1458, + "step": 10420 + }, + { + "epoch": 0.9011188388267312, + "grad_norm": 0.5895526919570224, + "learning_rate": 8.202331782564e-06, + "loss": 0.1412, + "step": 10430 + }, + { + "epoch": 0.9019828070327012, + "grad_norm": 0.5626343062982044, + "learning_rate": 8.198748590826336e-06, + "loss": 0.1417, + "step": 10440 + }, + { + "epoch": 0.9028467752386712, + "grad_norm": 0.5514554215350627, + "learning_rate": 8.195162615944809e-06, + "loss": 0.1388, + "step": 10450 + }, + { + "epoch": 0.9037107434446412, + "grad_norm": 0.596576561211133, + "learning_rate": 8.191573861039481e-06, + "loss": 0.138, + "step": 10460 + }, + { + "epoch": 0.9045747116506112, + "grad_norm": 0.5608018302461737, + "learning_rate": 8.187982329232826e-06, + "loss": 0.1406, + "step": 10470 + }, + { + "epoch": 0.9054386798565813, + "grad_norm": 0.5922211818250301, + "learning_rate": 8.18438802364974e-06, + "loss": 0.1407, + "step": 10480 + }, + { + "epoch": 0.9063026480625513, + "grad_norm": 0.5626611609158868, + "learning_rate": 8.18079094741753e-06, + "loss": 0.1403, + "step": 10490 + }, + { + "epoch": 0.9071666162685214, + "grad_norm": 0.55739369913719, + "learning_rate": 8.177191103665912e-06, + "loss": 0.1428, + "step": 10500 + }, + { + "epoch": 0.9080305844744914, + "grad_norm": 0.5589210208749367, + "learning_rate": 8.173588495527013e-06, + "loss": 0.1444, + "step": 10510 + }, + { + "epoch": 0.9088945526804614, + "grad_norm": 0.5948997818661523, + "learning_rate": 8.169983126135366e-06, + "loss": 0.1406, + "step": 10520 + }, + { + "epoch": 0.9097585208864314, + "grad_norm": 0.5637575295315885, + "learning_rate": 8.166374998627903e-06, + "loss": 0.1419, + "step": 10530 + }, + { + "epoch": 0.9106224890924014, + "grad_norm": 0.5520322935532269, + "learning_rate": 8.162764116143956e-06, + "loss": 0.1428, + "step": 10540 + }, + { + "epoch": 0.9114864572983714, + "grad_norm": 0.5476065819654563, + "learning_rate": 8.159150481825256e-06, + "loss": 0.1445, + "step": 10550 + }, + { + "epoch": 0.9123504255043414, + "grad_norm": 0.5607845386057297, + "learning_rate": 8.155534098815929e-06, + "loss": 0.1409, + "step": 10560 + }, + { + "epoch": 0.9132143937103114, + "grad_norm": 0.5895145175268459, + "learning_rate": 8.15191497026249e-06, + "loss": 0.1392, + "step": 10570 + }, + { + "epoch": 0.9140783619162814, + "grad_norm": 0.5894062695205331, + "learning_rate": 8.148293099313843e-06, + "loss": 0.1353, + "step": 10580 + }, + { + "epoch": 0.9149423301222515, + "grad_norm": 0.5743939777288788, + "learning_rate": 8.14466848912128e-06, + "loss": 0.1382, + "step": 10590 + }, + { + "epoch": 0.9158062983282215, + "grad_norm": 0.5738967722486941, + "learning_rate": 8.141041142838475e-06, + "loss": 0.1437, + "step": 10600 + }, + { + "epoch": 0.9166702665341916, + "grad_norm": 0.5508385159709146, + "learning_rate": 8.137411063621488e-06, + "loss": 0.1395, + "step": 10610 + }, + { + "epoch": 0.9175342347401616, + "grad_norm": 0.5549344070840297, + "learning_rate": 8.133778254628744e-06, + "loss": 0.1391, + "step": 10620 + }, + { + "epoch": 0.9183982029461316, + "grad_norm": 0.5557337487994121, + "learning_rate": 8.130142719021055e-06, + "loss": 0.1428, + "step": 10630 + }, + { + "epoch": 0.9192621711521016, + "grad_norm": 0.5335590128180864, + "learning_rate": 8.126504459961601e-06, + "loss": 0.1435, + "step": 10640 + }, + { + "epoch": 0.9201261393580716, + "grad_norm": 0.5618082432643633, + "learning_rate": 8.122863480615932e-06, + "loss": 0.1434, + "step": 10650 + }, + { + "epoch": 0.9209901075640416, + "grad_norm": 0.5483727536324223, + "learning_rate": 8.119219784151964e-06, + "loss": 0.1407, + "step": 10660 + }, + { + "epoch": 0.9218540757700117, + "grad_norm": 0.5225716204079307, + "learning_rate": 8.11557337373998e-06, + "loss": 0.1426, + "step": 10670 + }, + { + "epoch": 0.9227180439759817, + "grad_norm": 0.5802251936874245, + "learning_rate": 8.11192425255262e-06, + "loss": 0.1414, + "step": 10680 + }, + { + "epoch": 0.9235820121819517, + "grad_norm": 0.549636716436263, + "learning_rate": 8.108272423764883e-06, + "loss": 0.1439, + "step": 10690 + }, + { + "epoch": 0.9244459803879217, + "grad_norm": 0.5654550374218119, + "learning_rate": 8.104617890554129e-06, + "loss": 0.1427, + "step": 10700 + }, + { + "epoch": 0.9253099485938917, + "grad_norm": 0.5673693824798013, + "learning_rate": 8.100960656100069e-06, + "loss": 0.1429, + "step": 10710 + }, + { + "epoch": 0.9261739167998617, + "grad_norm": 0.5132529576114598, + "learning_rate": 8.097300723584757e-06, + "loss": 0.1377, + "step": 10720 + }, + { + "epoch": 0.9270378850058317, + "grad_norm": 0.5334736844206937, + "learning_rate": 8.093638096192606e-06, + "loss": 0.1405, + "step": 10730 + }, + { + "epoch": 0.9279018532118019, + "grad_norm": 0.5510539851521975, + "learning_rate": 8.089972777110366e-06, + "loss": 0.1383, + "step": 10740 + }, + { + "epoch": 0.9287658214177719, + "grad_norm": 0.543077012923459, + "learning_rate": 8.08630476952713e-06, + "loss": 0.1408, + "step": 10750 + }, + { + "epoch": 0.9296297896237419, + "grad_norm": 0.5386576487008168, + "learning_rate": 8.082634076634334e-06, + "loss": 0.142, + "step": 10760 + }, + { + "epoch": 0.9304937578297119, + "grad_norm": 0.5565406495408177, + "learning_rate": 8.078960701625746e-06, + "loss": 0.1373, + "step": 10770 + }, + { + "epoch": 0.9313577260356819, + "grad_norm": 0.5948123321042771, + "learning_rate": 8.07528464769747e-06, + "loss": 0.1382, + "step": 10780 + }, + { + "epoch": 0.9322216942416519, + "grad_norm": 0.5446315002732925, + "learning_rate": 8.071605918047938e-06, + "loss": 0.1379, + "step": 10790 + }, + { + "epoch": 0.9330856624476219, + "grad_norm": 0.5539597975139731, + "learning_rate": 8.067924515877914e-06, + "loss": 0.138, + "step": 10800 + }, + { + "epoch": 0.9339496306535919, + "grad_norm": 0.5552871380668192, + "learning_rate": 8.064240444390487e-06, + "loss": 0.1402, + "step": 10810 + }, + { + "epoch": 0.934813598859562, + "grad_norm": 0.5673123265243116, + "learning_rate": 8.060553706791066e-06, + "loss": 0.1431, + "step": 10820 + }, + { + "epoch": 0.935677567065532, + "grad_norm": 0.5909182056894116, + "learning_rate": 8.05686430628738e-06, + "loss": 0.1429, + "step": 10830 + }, + { + "epoch": 0.936541535271502, + "grad_norm": 0.5504559786897197, + "learning_rate": 8.053172246089476e-06, + "loss": 0.1454, + "step": 10840 + }, + { + "epoch": 0.937405503477472, + "grad_norm": 0.5806631353235165, + "learning_rate": 8.049477529409712e-06, + "loss": 0.1472, + "step": 10850 + }, + { + "epoch": 0.9382694716834421, + "grad_norm": 0.5709906117492967, + "learning_rate": 8.045780159462769e-06, + "loss": 0.1392, + "step": 10860 + }, + { + "epoch": 0.9391334398894121, + "grad_norm": 0.5940409325929809, + "learning_rate": 8.042080139465617e-06, + "loss": 0.1371, + "step": 10870 + }, + { + "epoch": 0.9399974080953821, + "grad_norm": 0.5644194425180563, + "learning_rate": 8.03837747263755e-06, + "loss": 0.1401, + "step": 10880 + }, + { + "epoch": 0.9408613763013521, + "grad_norm": 0.5513413048942039, + "learning_rate": 8.034672162200153e-06, + "loss": 0.1433, + "step": 10890 + }, + { + "epoch": 0.9417253445073221, + "grad_norm": 0.5451574269752696, + "learning_rate": 8.030964211377317e-06, + "loss": 0.1409, + "step": 10900 + }, + { + "epoch": 0.9425893127132922, + "grad_norm": 0.5651146180037745, + "learning_rate": 8.027253623395231e-06, + "loss": 0.1382, + "step": 10910 + }, + { + "epoch": 0.9434532809192622, + "grad_norm": 0.5398822960393462, + "learning_rate": 8.023540401482373e-06, + "loss": 0.1431, + "step": 10920 + }, + { + "epoch": 0.9443172491252322, + "grad_norm": 0.5437530423433509, + "learning_rate": 8.019824548869518e-06, + "loss": 0.1425, + "step": 10930 + }, + { + "epoch": 0.9451812173312022, + "grad_norm": 0.5546789213874745, + "learning_rate": 8.016106068789727e-06, + "loss": 0.1373, + "step": 10940 + }, + { + "epoch": 0.9460451855371722, + "grad_norm": 0.5471020932209029, + "learning_rate": 8.012384964478346e-06, + "loss": 0.138, + "step": 10950 + }, + { + "epoch": 0.9469091537431422, + "grad_norm": 0.5697798890296741, + "learning_rate": 8.008661239173012e-06, + "loss": 0.1407, + "step": 10960 + }, + { + "epoch": 0.9477731219491122, + "grad_norm": 0.5575286625262722, + "learning_rate": 8.004934896113633e-06, + "loss": 0.1397, + "step": 10970 + }, + { + "epoch": 0.9486370901550822, + "grad_norm": 0.5265828755856463, + "learning_rate": 8.001205938542398e-06, + "loss": 0.1424, + "step": 10980 + }, + { + "epoch": 0.9495010583610524, + "grad_norm": 0.551199244101437, + "learning_rate": 7.997474369703772e-06, + "loss": 0.1419, + "step": 10990 + }, + { + "epoch": 0.9503650265670224, + "grad_norm": 0.5545419227985952, + "learning_rate": 7.993740192844493e-06, + "loss": 0.1408, + "step": 11000 + }, + { + "epoch": 0.9512289947729924, + "grad_norm": 0.5783342185948568, + "learning_rate": 7.990003411213562e-06, + "loss": 0.1387, + "step": 11010 + }, + { + "epoch": 0.9520929629789624, + "grad_norm": 0.5520321411170842, + "learning_rate": 7.986264028062256e-06, + "loss": 0.1414, + "step": 11020 + }, + { + "epoch": 0.9529569311849324, + "grad_norm": 0.5376112537460771, + "learning_rate": 7.982522046644106e-06, + "loss": 0.1407, + "step": 11030 + }, + { + "epoch": 0.9538208993909024, + "grad_norm": 0.5973209607733642, + "learning_rate": 7.97877747021491e-06, + "loss": 0.1381, + "step": 11040 + }, + { + "epoch": 0.9546848675968724, + "grad_norm": 0.5722729711850041, + "learning_rate": 7.975030302032722e-06, + "loss": 0.143, + "step": 11050 + }, + { + "epoch": 0.9555488358028424, + "grad_norm": 0.5505304492880758, + "learning_rate": 7.971280545357851e-06, + "loss": 0.1429, + "step": 11060 + }, + { + "epoch": 0.9564128040088125, + "grad_norm": 0.5430702520493753, + "learning_rate": 7.967528203452856e-06, + "loss": 0.1377, + "step": 11070 + }, + { + "epoch": 0.9572767722147825, + "grad_norm": 0.5379484526659242, + "learning_rate": 7.963773279582548e-06, + "loss": 0.1425, + "step": 11080 + }, + { + "epoch": 0.9581407404207525, + "grad_norm": 0.59660987229937, + "learning_rate": 7.960015777013984e-06, + "loss": 0.1404, + "step": 11090 + }, + { + "epoch": 0.9590047086267225, + "grad_norm": 0.5421623266199536, + "learning_rate": 7.956255699016466e-06, + "loss": 0.1424, + "step": 11100 + }, + { + "epoch": 0.9598686768326925, + "grad_norm": 0.5322756578697195, + "learning_rate": 7.952493048861534e-06, + "loss": 0.1396, + "step": 11110 + }, + { + "epoch": 0.9607326450386626, + "grad_norm": 0.6187852901428197, + "learning_rate": 7.948727829822967e-06, + "loss": 0.141, + "step": 11120 + }, + { + "epoch": 0.9615966132446326, + "grad_norm": 0.6065087922615199, + "learning_rate": 7.94496004517678e-06, + "loss": 0.1411, + "step": 11130 + }, + { + "epoch": 0.9624605814506026, + "grad_norm": 0.5659172316900728, + "learning_rate": 7.941189698201218e-06, + "loss": 0.1396, + "step": 11140 + }, + { + "epoch": 0.9633245496565727, + "grad_norm": 0.5317942345273444, + "learning_rate": 7.937416792176758e-06, + "loss": 0.141, + "step": 11150 + }, + { + "epoch": 0.9641885178625427, + "grad_norm": 0.554066543556405, + "learning_rate": 7.933641330386104e-06, + "loss": 0.1397, + "step": 11160 + }, + { + "epoch": 0.9650524860685127, + "grad_norm": 0.5254678736887252, + "learning_rate": 7.929863316114179e-06, + "loss": 0.1388, + "step": 11170 + }, + { + "epoch": 0.9659164542744827, + "grad_norm": 0.5302754296193825, + "learning_rate": 7.926082752648135e-06, + "loss": 0.1444, + "step": 11180 + }, + { + "epoch": 0.9667804224804527, + "grad_norm": 0.5698851699270783, + "learning_rate": 7.922299643277331e-06, + "loss": 0.1409, + "step": 11190 + }, + { + "epoch": 0.9676443906864227, + "grad_norm": 0.5464315412902958, + "learning_rate": 7.918513991293352e-06, + "loss": 0.1414, + "step": 11200 + }, + { + "epoch": 0.9685083588923927, + "grad_norm": 0.5599688882235971, + "learning_rate": 7.91472579998999e-06, + "loss": 0.1402, + "step": 11210 + }, + { + "epoch": 0.9693723270983627, + "grad_norm": 0.5753649910885886, + "learning_rate": 7.910935072663245e-06, + "loss": 0.1409, + "step": 11220 + }, + { + "epoch": 0.9702362953043328, + "grad_norm": 0.5646681396820311, + "learning_rate": 7.907141812611325e-06, + "loss": 0.141, + "step": 11230 + }, + { + "epoch": 0.9711002635103029, + "grad_norm": 0.5455395876393425, + "learning_rate": 7.903346023134645e-06, + "loss": 0.14, + "step": 11240 + }, + { + "epoch": 0.9719642317162729, + "grad_norm": 0.5643721404005844, + "learning_rate": 7.899547707535816e-06, + "loss": 0.1405, + "step": 11250 + }, + { + "epoch": 0.9728281999222429, + "grad_norm": 0.5734263716016804, + "learning_rate": 7.895746869119647e-06, + "loss": 0.1386, + "step": 11260 + }, + { + "epoch": 0.9736921681282129, + "grad_norm": 0.5754019383795476, + "learning_rate": 7.891943511193148e-06, + "loss": 0.1412, + "step": 11270 + }, + { + "epoch": 0.9745561363341829, + "grad_norm": 0.5357745666067263, + "learning_rate": 7.888137637065514e-06, + "loss": 0.1407, + "step": 11280 + }, + { + "epoch": 0.9754201045401529, + "grad_norm": 0.5463072339025498, + "learning_rate": 7.884329250048134e-06, + "loss": 0.1369, + "step": 11290 + }, + { + "epoch": 0.9762840727461229, + "grad_norm": 0.563329757491908, + "learning_rate": 7.880518353454576e-06, + "loss": 0.1366, + "step": 11300 + }, + { + "epoch": 0.977148040952093, + "grad_norm": 0.5750546590943822, + "learning_rate": 7.876704950600607e-06, + "loss": 0.1379, + "step": 11310 + }, + { + "epoch": 0.978012009158063, + "grad_norm": 0.5285988056427497, + "learning_rate": 7.872889044804155e-06, + "loss": 0.1399, + "step": 11320 + }, + { + "epoch": 0.978875977364033, + "grad_norm": 0.532439311490372, + "learning_rate": 7.869070639385343e-06, + "loss": 0.1422, + "step": 11330 + }, + { + "epoch": 0.979739945570003, + "grad_norm": 0.5398829784687519, + "learning_rate": 7.865249737666458e-06, + "loss": 0.1375, + "step": 11340 + }, + { + "epoch": 0.980603913775973, + "grad_norm": 0.5315198727011683, + "learning_rate": 7.861426342971962e-06, + "loss": 0.1333, + "step": 11350 + }, + { + "epoch": 0.981467881981943, + "grad_norm": 0.5484806587096321, + "learning_rate": 7.857600458628485e-06, + "loss": 0.1383, + "step": 11360 + }, + { + "epoch": 0.9823318501879131, + "grad_norm": 0.5632626864322672, + "learning_rate": 7.85377208796483e-06, + "loss": 0.1408, + "step": 11370 + }, + { + "epoch": 0.9831958183938831, + "grad_norm": 0.5756731418315327, + "learning_rate": 7.849941234311952e-06, + "loss": 0.1423, + "step": 11380 + }, + { + "epoch": 0.9840597865998532, + "grad_norm": 0.569984058319945, + "learning_rate": 7.846107901002976e-06, + "loss": 0.1418, + "step": 11390 + }, + { + "epoch": 0.9849237548058232, + "grad_norm": 0.5394728565154906, + "learning_rate": 7.84227209137318e-06, + "loss": 0.1397, + "step": 11400 + }, + { + "epoch": 0.9857877230117932, + "grad_norm": 0.5294475608511082, + "learning_rate": 7.838433808759994e-06, + "loss": 0.1417, + "step": 11410 + }, + { + "epoch": 0.9866516912177632, + "grad_norm": 0.5788220534347303, + "learning_rate": 7.834593056503009e-06, + "loss": 0.1403, + "step": 11420 + }, + { + "epoch": 0.9875156594237332, + "grad_norm": 0.5495378815031298, + "learning_rate": 7.830749837943952e-06, + "loss": 0.1438, + "step": 11430 + }, + { + "epoch": 0.9883796276297032, + "grad_norm": 0.5602649448652921, + "learning_rate": 7.826904156426706e-06, + "loss": 0.1349, + "step": 11440 + }, + { + "epoch": 0.9892435958356732, + "grad_norm": 0.510550749078522, + "learning_rate": 7.823056015297294e-06, + "loss": 0.1389, + "step": 11450 + }, + { + "epoch": 0.9901075640416432, + "grad_norm": 0.5618787773731678, + "learning_rate": 7.819205417903879e-06, + "loss": 0.1391, + "step": 11460 + }, + { + "epoch": 0.9909715322476133, + "grad_norm": 0.5415142910705364, + "learning_rate": 7.81535236759676e-06, + "loss": 0.1383, + "step": 11470 + }, + { + "epoch": 0.9918355004535833, + "grad_norm": 0.5378807964604567, + "learning_rate": 7.81149686772837e-06, + "loss": 0.1381, + "step": 11480 + }, + { + "epoch": 0.9926994686595533, + "grad_norm": 0.5086407896763732, + "learning_rate": 7.807638921653275e-06, + "loss": 0.1396, + "step": 11490 + }, + { + "epoch": 0.9935634368655234, + "grad_norm": 0.5763401451203161, + "learning_rate": 7.803778532728168e-06, + "loss": 0.1385, + "step": 11500 + }, + { + "epoch": 0.9944274050714934, + "grad_norm": 0.5283421801533311, + "learning_rate": 7.799915704311869e-06, + "loss": 0.1392, + "step": 11510 + }, + { + "epoch": 0.9952913732774634, + "grad_norm": 0.5407041178210695, + "learning_rate": 7.79605043976532e-06, + "loss": 0.1378, + "step": 11520 + }, + { + "epoch": 0.9961553414834334, + "grad_norm": 0.5444185941688278, + "learning_rate": 7.792182742451579e-06, + "loss": 0.1389, + "step": 11530 + }, + { + "epoch": 0.9970193096894034, + "grad_norm": 0.5155116418138258, + "learning_rate": 7.78831261573583e-06, + "loss": 0.1362, + "step": 11540 + }, + { + "epoch": 0.9978832778953735, + "grad_norm": 0.5558368170116909, + "learning_rate": 7.784440062985357e-06, + "loss": 0.1402, + "step": 11550 + }, + { + "epoch": 0.9987472461013435, + "grad_norm": 0.5370499131190705, + "learning_rate": 7.78056508756957e-06, + "loss": 0.1378, + "step": 11560 + }, + { + "epoch": 0.9996112143073135, + "grad_norm": 0.5314120316897529, + "learning_rate": 7.776687692859972e-06, + "loss": 0.1361, + "step": 11570 + }, + { + "epoch": 1.0004751825132836, + "grad_norm": 0.6484295050544892, + "learning_rate": 7.772807882230184e-06, + "loss": 0.1253, + "step": 11580 + }, + { + "epoch": 1.0013391507192535, + "grad_norm": 0.5288373554942792, + "learning_rate": 7.768925659055922e-06, + "loss": 0.1179, + "step": 11590 + }, + { + "epoch": 1.0022031189252236, + "grad_norm": 0.5226558697799325, + "learning_rate": 7.765041026715e-06, + "loss": 0.1162, + "step": 11600 + }, + { + "epoch": 1.0030670871311935, + "grad_norm": 0.5559986175018743, + "learning_rate": 7.761153988587336e-06, + "loss": 0.1136, + "step": 11610 + }, + { + "epoch": 1.0039310553371636, + "grad_norm": 0.5738333391751079, + "learning_rate": 7.757264548054931e-06, + "loss": 0.1129, + "step": 11620 + }, + { + "epoch": 1.0047950235431335, + "grad_norm": 0.5874896405994214, + "learning_rate": 7.75337270850188e-06, + "loss": 0.1155, + "step": 11630 + }, + { + "epoch": 1.0056589917491037, + "grad_norm": 0.57700834782927, + "learning_rate": 7.749478473314371e-06, + "loss": 0.114, + "step": 11640 + }, + { + "epoch": 1.0065229599550736, + "grad_norm": 0.5253879750024364, + "learning_rate": 7.745581845880668e-06, + "loss": 0.1147, + "step": 11650 + }, + { + "epoch": 1.0073869281610437, + "grad_norm": 0.578799776263045, + "learning_rate": 7.741682829591124e-06, + "loss": 0.1162, + "step": 11660 + }, + { + "epoch": 1.0082508963670136, + "grad_norm": 0.5520505063573035, + "learning_rate": 7.737781427838163e-06, + "loss": 0.1111, + "step": 11670 + }, + { + "epoch": 1.0091148645729837, + "grad_norm": 0.5445981382109637, + "learning_rate": 7.733877644016288e-06, + "loss": 0.1146, + "step": 11680 + }, + { + "epoch": 1.0099788327789538, + "grad_norm": 0.51984767612585, + "learning_rate": 7.729971481522079e-06, + "loss": 0.115, + "step": 11690 + }, + { + "epoch": 1.0108428009849237, + "grad_norm": 0.579207833886054, + "learning_rate": 7.726062943754177e-06, + "loss": 0.1146, + "step": 11700 + }, + { + "epoch": 1.0117067691908939, + "grad_norm": 0.6003541965292548, + "learning_rate": 7.722152034113299e-06, + "loss": 0.114, + "step": 11710 + }, + { + "epoch": 1.0125707373968638, + "grad_norm": 0.5344757173700486, + "learning_rate": 7.718238756002214e-06, + "loss": 0.114, + "step": 11720 + }, + { + "epoch": 1.0134347056028339, + "grad_norm": 0.5717675830192779, + "learning_rate": 7.714323112825764e-06, + "loss": 0.1142, + "step": 11730 + }, + { + "epoch": 1.0142986738088038, + "grad_norm": 0.5589395318387775, + "learning_rate": 7.710405107990841e-06, + "loss": 0.1124, + "step": 11740 + }, + { + "epoch": 1.015162642014774, + "grad_norm": 0.5191208223948365, + "learning_rate": 7.706484744906394e-06, + "loss": 0.1132, + "step": 11750 + }, + { + "epoch": 1.0160266102207438, + "grad_norm": 0.5739974899486876, + "learning_rate": 7.702562026983425e-06, + "loss": 0.113, + "step": 11760 + }, + { + "epoch": 1.016890578426714, + "grad_norm": 0.5538985992217317, + "learning_rate": 7.698636957634984e-06, + "loss": 0.1125, + "step": 11770 + }, + { + "epoch": 1.0177545466326838, + "grad_norm": 0.5556788352650281, + "learning_rate": 7.694709540276165e-06, + "loss": 0.1086, + "step": 11780 + }, + { + "epoch": 1.018618514838654, + "grad_norm": 0.557737388023279, + "learning_rate": 7.690779778324106e-06, + "loss": 0.1139, + "step": 11790 + }, + { + "epoch": 1.0194824830446239, + "grad_norm": 0.5533298134289029, + "learning_rate": 7.68684767519799e-06, + "loss": 0.1166, + "step": 11800 + }, + { + "epoch": 1.020346451250594, + "grad_norm": 0.5575744213533839, + "learning_rate": 7.68291323431903e-06, + "loss": 0.1123, + "step": 11810 + }, + { + "epoch": 1.021210419456564, + "grad_norm": 0.5576624574436335, + "learning_rate": 7.678976459110475e-06, + "loss": 0.1147, + "step": 11820 + }, + { + "epoch": 1.022074387662534, + "grad_norm": 0.5582794921471507, + "learning_rate": 7.675037352997606e-06, + "loss": 0.1128, + "step": 11830 + }, + { + "epoch": 1.0229383558685041, + "grad_norm": 0.5765353203515869, + "learning_rate": 7.67109591940773e-06, + "loss": 0.1135, + "step": 11840 + }, + { + "epoch": 1.023802324074474, + "grad_norm": 0.5594470994345416, + "learning_rate": 7.667152161770185e-06, + "loss": 0.1132, + "step": 11850 + }, + { + "epoch": 1.0246662922804441, + "grad_norm": 0.5843630847176365, + "learning_rate": 7.663206083516323e-06, + "loss": 0.1148, + "step": 11860 + }, + { + "epoch": 1.025530260486414, + "grad_norm": 0.5413541740109529, + "learning_rate": 7.659257688079524e-06, + "loss": 0.1113, + "step": 11870 + }, + { + "epoch": 1.0263942286923842, + "grad_norm": 0.5822217976514313, + "learning_rate": 7.655306978895173e-06, + "loss": 0.116, + "step": 11880 + }, + { + "epoch": 1.027258196898354, + "grad_norm": 0.5979006652450949, + "learning_rate": 7.651353959400678e-06, + "loss": 0.1135, + "step": 11890 + }, + { + "epoch": 1.0281221651043242, + "grad_norm": 0.5665894234503989, + "learning_rate": 7.647398633035452e-06, + "loss": 0.1144, + "step": 11900 + }, + { + "epoch": 1.028986133310294, + "grad_norm": 0.5375518216139221, + "learning_rate": 7.643441003240918e-06, + "loss": 0.1177, + "step": 11910 + }, + { + "epoch": 1.0298501015162642, + "grad_norm": 0.5546406845601831, + "learning_rate": 7.639481073460501e-06, + "loss": 0.1154, + "step": 11920 + }, + { + "epoch": 1.0307140697222341, + "grad_norm": 0.5670480535796274, + "learning_rate": 7.635518847139627e-06, + "loss": 0.112, + "step": 11930 + }, + { + "epoch": 1.0315780379282042, + "grad_norm": 0.5887507475464324, + "learning_rate": 7.631554327725722e-06, + "loss": 0.1095, + "step": 11940 + }, + { + "epoch": 1.0324420061341744, + "grad_norm": 0.5896886428008532, + "learning_rate": 7.627587518668205e-06, + "loss": 0.1166, + "step": 11950 + }, + { + "epoch": 1.0333059743401443, + "grad_norm": 0.5690679992282057, + "learning_rate": 7.623618423418488e-06, + "loss": 0.1135, + "step": 11960 + }, + { + "epoch": 1.0341699425461144, + "grad_norm": 0.5763884873413326, + "learning_rate": 7.619647045429975e-06, + "loss": 0.1145, + "step": 11970 + }, + { + "epoch": 1.0350339107520843, + "grad_norm": 0.5547331293683502, + "learning_rate": 7.615673388158052e-06, + "loss": 0.1125, + "step": 11980 + }, + { + "epoch": 1.0358978789580544, + "grad_norm": 0.5809540964348433, + "learning_rate": 7.6116974550600895e-06, + "loss": 0.1147, + "step": 11990 + }, + { + "epoch": 1.0367618471640243, + "grad_norm": 0.5617878261436202, + "learning_rate": 7.60771924959544e-06, + "loss": 0.1152, + "step": 12000 + }, + { + "epoch": 1.0376258153699944, + "grad_norm": 0.5691052166008522, + "learning_rate": 7.603738775225429e-06, + "loss": 0.1118, + "step": 12010 + }, + { + "epoch": 1.0384897835759643, + "grad_norm": 0.5927470323848189, + "learning_rate": 7.599756035413359e-06, + "loss": 0.1142, + "step": 12020 + }, + { + "epoch": 1.0393537517819345, + "grad_norm": 0.5348052360596652, + "learning_rate": 7.595771033624507e-06, + "loss": 0.1138, + "step": 12030 + }, + { + "epoch": 1.0402177199879044, + "grad_norm": 0.5942881290559787, + "learning_rate": 7.5917837733261104e-06, + "loss": 0.1136, + "step": 12040 + }, + { + "epoch": 1.0410816881938745, + "grad_norm": 0.5870254311070779, + "learning_rate": 7.5877942579873755e-06, + "loss": 0.1111, + "step": 12050 + }, + { + "epoch": 1.0419456563998444, + "grad_norm": 0.5560655753288578, + "learning_rate": 7.583802491079473e-06, + "loss": 0.1166, + "step": 12060 + }, + { + "epoch": 1.0428096246058145, + "grad_norm": 0.5812311683484401, + "learning_rate": 7.579808476075529e-06, + "loss": 0.1134, + "step": 12070 + }, + { + "epoch": 1.0436735928117846, + "grad_norm": 0.587965901597796, + "learning_rate": 7.575812216450626e-06, + "loss": 0.1131, + "step": 12080 + }, + { + "epoch": 1.0445375610177545, + "grad_norm": 0.5807654543387688, + "learning_rate": 7.571813715681804e-06, + "loss": 0.1107, + "step": 12090 + }, + { + "epoch": 1.0454015292237246, + "grad_norm": 0.5819264052078913, + "learning_rate": 7.567812977248046e-06, + "loss": 0.1123, + "step": 12100 + }, + { + "epoch": 1.0462654974296945, + "grad_norm": 0.5915930314280453, + "learning_rate": 7.5638100046302855e-06, + "loss": 0.1131, + "step": 12110 + }, + { + "epoch": 1.0471294656356647, + "grad_norm": 0.5258533900711518, + "learning_rate": 7.5598048013114015e-06, + "loss": 0.1136, + "step": 12120 + }, + { + "epoch": 1.0479934338416346, + "grad_norm": 0.5836672177555081, + "learning_rate": 7.555797370776212e-06, + "loss": 0.1181, + "step": 12130 + }, + { + "epoch": 1.0488574020476047, + "grad_norm": 0.563741150677117, + "learning_rate": 7.551787716511472e-06, + "loss": 0.1143, + "step": 12140 + }, + { + "epoch": 1.0497213702535746, + "grad_norm": 0.5745656695349525, + "learning_rate": 7.547775842005871e-06, + "loss": 0.114, + "step": 12150 + }, + { + "epoch": 1.0505853384595447, + "grad_norm": 0.5502725240340415, + "learning_rate": 7.543761750750034e-06, + "loss": 0.114, + "step": 12160 + }, + { + "epoch": 1.0514493066655146, + "grad_norm": 0.5573865072308392, + "learning_rate": 7.53974544623651e-06, + "loss": 0.1123, + "step": 12170 + }, + { + "epoch": 1.0523132748714847, + "grad_norm": 0.5424078163988334, + "learning_rate": 7.535726931959781e-06, + "loss": 0.1125, + "step": 12180 + }, + { + "epoch": 1.0531772430774549, + "grad_norm": 0.6021571647004282, + "learning_rate": 7.531706211416239e-06, + "loss": 0.1138, + "step": 12190 + }, + { + "epoch": 1.0540412112834248, + "grad_norm": 0.5664122542057618, + "learning_rate": 7.527683288104208e-06, + "loss": 0.1115, + "step": 12200 + }, + { + "epoch": 1.0549051794893949, + "grad_norm": 0.5716818115749401, + "learning_rate": 7.523658165523924e-06, + "loss": 0.1114, + "step": 12210 + }, + { + "epoch": 1.0557691476953648, + "grad_norm": 0.6063170462255332, + "learning_rate": 7.5196308471775345e-06, + "loss": 0.1112, + "step": 12220 + }, + { + "epoch": 1.056633115901335, + "grad_norm": 1.2632340648066684, + "learning_rate": 7.5156013365691005e-06, + "loss": 0.1161, + "step": 12230 + }, + { + "epoch": 1.0574970841073048, + "grad_norm": 0.5509074426756472, + "learning_rate": 7.51156963720459e-06, + "loss": 0.1151, + "step": 12240 + }, + { + "epoch": 1.058361052313275, + "grad_norm": 0.5466414245187583, + "learning_rate": 7.5075357525918725e-06, + "loss": 0.1128, + "step": 12250 + }, + { + "epoch": 1.0592250205192448, + "grad_norm": 0.596160426575199, + "learning_rate": 7.503499686240719e-06, + "loss": 0.1114, + "step": 12260 + }, + { + "epoch": 1.060088988725215, + "grad_norm": 0.5642196283298498, + "learning_rate": 7.499461441662807e-06, + "loss": 0.1136, + "step": 12270 + }, + { + "epoch": 1.0609529569311849, + "grad_norm": 0.5849920421619905, + "learning_rate": 7.495421022371697e-06, + "loss": 0.1134, + "step": 12280 + }, + { + "epoch": 1.061816925137155, + "grad_norm": 0.5787275769912021, + "learning_rate": 7.491378431882851e-06, + "loss": 0.1149, + "step": 12290 + }, + { + "epoch": 1.0626808933431249, + "grad_norm": 0.5483503453537147, + "learning_rate": 7.487333673713615e-06, + "loss": 0.1167, + "step": 12300 + }, + { + "epoch": 1.063544861549095, + "grad_norm": 0.5359573243330554, + "learning_rate": 7.483286751383224e-06, + "loss": 0.1146, + "step": 12310 + }, + { + "epoch": 1.064408829755065, + "grad_norm": 0.5875705499528636, + "learning_rate": 7.479237668412793e-06, + "loss": 0.112, + "step": 12320 + }, + { + "epoch": 1.065272797961035, + "grad_norm": 0.5561394006831806, + "learning_rate": 7.47518642832532e-06, + "loss": 0.1139, + "step": 12330 + }, + { + "epoch": 1.0661367661670051, + "grad_norm": 0.5633666250557524, + "learning_rate": 7.47113303464568e-06, + "loss": 0.1125, + "step": 12340 + }, + { + "epoch": 1.067000734372975, + "grad_norm": 0.5623851970677978, + "learning_rate": 7.4670774909006174e-06, + "loss": 0.1181, + "step": 12350 + }, + { + "epoch": 1.0678647025789452, + "grad_norm": 0.5587675735584775, + "learning_rate": 7.463019800618749e-06, + "loss": 0.1133, + "step": 12360 + }, + { + "epoch": 1.068728670784915, + "grad_norm": 0.584981094727008, + "learning_rate": 7.458959967330565e-06, + "loss": 0.1135, + "step": 12370 + }, + { + "epoch": 1.0695926389908852, + "grad_norm": 0.6284088915860303, + "learning_rate": 7.4548979945684105e-06, + "loss": 0.1184, + "step": 12380 + }, + { + "epoch": 1.070456607196855, + "grad_norm": 0.5931271494335132, + "learning_rate": 7.450833885866502e-06, + "loss": 0.1125, + "step": 12390 + }, + { + "epoch": 1.0713205754028252, + "grad_norm": 0.5550532429183336, + "learning_rate": 7.446767644760906e-06, + "loss": 0.1127, + "step": 12400 + }, + { + "epoch": 1.0721845436087951, + "grad_norm": 0.5711627384957843, + "learning_rate": 7.4426992747895495e-06, + "loss": 0.1144, + "step": 12410 + }, + { + "epoch": 1.0730485118147652, + "grad_norm": 0.5497441584250377, + "learning_rate": 7.43862877949221e-06, + "loss": 0.1152, + "step": 12420 + }, + { + "epoch": 1.0739124800207351, + "grad_norm": 0.5849304669143405, + "learning_rate": 7.434556162410514e-06, + "loss": 0.1133, + "step": 12430 + }, + { + "epoch": 1.0747764482267053, + "grad_norm": 0.5584597859335196, + "learning_rate": 7.430481427087935e-06, + "loss": 0.114, + "step": 12440 + }, + { + "epoch": 1.0756404164326754, + "grad_norm": 0.5509002638468832, + "learning_rate": 7.426404577069789e-06, + "loss": 0.1137, + "step": 12450 + }, + { + "epoch": 1.0765043846386453, + "grad_norm": 0.5694890975843275, + "learning_rate": 7.42232561590323e-06, + "loss": 0.1143, + "step": 12460 + }, + { + "epoch": 1.0773683528446154, + "grad_norm": 0.6020473634715721, + "learning_rate": 7.418244547137254e-06, + "loss": 0.1139, + "step": 12470 + }, + { + "epoch": 1.0782323210505853, + "grad_norm": 0.5867109716162718, + "learning_rate": 7.414161374322686e-06, + "loss": 0.1159, + "step": 12480 + }, + { + "epoch": 1.0790962892565554, + "grad_norm": 0.5644144690792395, + "learning_rate": 7.410076101012184e-06, + "loss": 0.1146, + "step": 12490 + }, + { + "epoch": 1.0799602574625253, + "grad_norm": 0.6177197577941457, + "learning_rate": 7.405988730760231e-06, + "loss": 0.116, + "step": 12500 + }, + { + "epoch": 1.0808242256684955, + "grad_norm": 0.5853046102952433, + "learning_rate": 7.401899267123137e-06, + "loss": 0.1145, + "step": 12510 + }, + { + "epoch": 1.0816881938744654, + "grad_norm": 0.5697893771470309, + "learning_rate": 7.397807713659034e-06, + "loss": 0.1162, + "step": 12520 + }, + { + "epoch": 1.0825521620804355, + "grad_norm": 0.573783818853617, + "learning_rate": 7.39371407392787e-06, + "loss": 0.1159, + "step": 12530 + }, + { + "epoch": 1.0834161302864054, + "grad_norm": 0.5563259533540806, + "learning_rate": 7.38961835149141e-06, + "loss": 0.1156, + "step": 12540 + }, + { + "epoch": 1.0842800984923755, + "grad_norm": 0.5776456652808264, + "learning_rate": 7.38552054991323e-06, + "loss": 0.1168, + "step": 12550 + }, + { + "epoch": 1.0851440666983454, + "grad_norm": 0.5605064163789284, + "learning_rate": 7.381420672758714e-06, + "loss": 0.1129, + "step": 12560 + }, + { + "epoch": 1.0860080349043155, + "grad_norm": 0.5724005520433846, + "learning_rate": 7.377318723595055e-06, + "loss": 0.1141, + "step": 12570 + }, + { + "epoch": 1.0868720031102856, + "grad_norm": 0.5763764037044753, + "learning_rate": 7.373214705991245e-06, + "loss": 0.1143, + "step": 12580 + }, + { + "epoch": 1.0877359713162555, + "grad_norm": 0.5517897969244039, + "learning_rate": 7.36910862351808e-06, + "loss": 0.1128, + "step": 12590 + }, + { + "epoch": 1.0885999395222257, + "grad_norm": 0.5459275054821126, + "learning_rate": 7.36500047974815e-06, + "loss": 0.1148, + "step": 12600 + }, + { + "epoch": 1.0894639077281956, + "grad_norm": 0.5503423920751671, + "learning_rate": 7.360890278255838e-06, + "loss": 0.1112, + "step": 12610 + }, + { + "epoch": 1.0903278759341657, + "grad_norm": 0.5640084967541655, + "learning_rate": 7.356778022617318e-06, + "loss": 0.1112, + "step": 12620 + }, + { + "epoch": 1.0911918441401356, + "grad_norm": 0.5875745843307169, + "learning_rate": 7.352663716410553e-06, + "loss": 0.118, + "step": 12630 + }, + { + "epoch": 1.0920558123461057, + "grad_norm": 0.5323955570867941, + "learning_rate": 7.3485473632152844e-06, + "loss": 0.1147, + "step": 12640 + }, + { + "epoch": 1.0929197805520756, + "grad_norm": 0.5483966268609873, + "learning_rate": 7.344428966613044e-06, + "loss": 0.114, + "step": 12650 + }, + { + "epoch": 1.0937837487580457, + "grad_norm": 0.549406276310017, + "learning_rate": 7.340308530187134e-06, + "loss": 0.1108, + "step": 12660 + }, + { + "epoch": 1.0946477169640156, + "grad_norm": 0.5433161641383483, + "learning_rate": 7.336186057522633e-06, + "loss": 0.1159, + "step": 12670 + }, + { + "epoch": 1.0955116851699858, + "grad_norm": 0.5842351306204783, + "learning_rate": 7.332061552206393e-06, + "loss": 0.114, + "step": 12680 + }, + { + "epoch": 1.0963756533759557, + "grad_norm": 0.5616172693897599, + "learning_rate": 7.327935017827034e-06, + "loss": 0.1114, + "step": 12690 + }, + { + "epoch": 1.0972396215819258, + "grad_norm": 0.6017902269821186, + "learning_rate": 7.323806457974939e-06, + "loss": 0.115, + "step": 12700 + }, + { + "epoch": 1.098103589787896, + "grad_norm": 0.5724606965291857, + "learning_rate": 7.319675876242256e-06, + "loss": 0.1141, + "step": 12710 + }, + { + "epoch": 1.0989675579938658, + "grad_norm": 0.5895805939601668, + "learning_rate": 7.315543276222894e-06, + "loss": 0.1133, + "step": 12720 + }, + { + "epoch": 1.099831526199836, + "grad_norm": 0.5827595105149493, + "learning_rate": 7.3114086615125125e-06, + "loss": 0.115, + "step": 12730 + }, + { + "epoch": 1.1006954944058058, + "grad_norm": 0.5300031140016738, + "learning_rate": 7.3072720357085284e-06, + "loss": 0.1141, + "step": 12740 + }, + { + "epoch": 1.101559462611776, + "grad_norm": 0.5659944671796406, + "learning_rate": 7.303133402410104e-06, + "loss": 0.1103, + "step": 12750 + }, + { + "epoch": 1.1024234308177459, + "grad_norm": 0.5575335822538651, + "learning_rate": 7.298992765218156e-06, + "loss": 0.1189, + "step": 12760 + }, + { + "epoch": 1.103287399023716, + "grad_norm": 0.5816088261759306, + "learning_rate": 7.294850127735336e-06, + "loss": 0.1125, + "step": 12770 + }, + { + "epoch": 1.1041513672296859, + "grad_norm": 0.5377912164914728, + "learning_rate": 7.2907054935660415e-06, + "loss": 0.1142, + "step": 12780 + }, + { + "epoch": 1.105015335435656, + "grad_norm": 0.5171845913840188, + "learning_rate": 7.286558866316405e-06, + "loss": 0.1144, + "step": 12790 + }, + { + "epoch": 1.105879303641626, + "grad_norm": 0.5625513547764542, + "learning_rate": 7.282410249594294e-06, + "loss": 0.1139, + "step": 12800 + }, + { + "epoch": 1.106743271847596, + "grad_norm": 0.548817914014772, + "learning_rate": 7.278259647009308e-06, + "loss": 0.1125, + "step": 12810 + }, + { + "epoch": 1.1076072400535661, + "grad_norm": 0.5619622892577025, + "learning_rate": 7.27410706217277e-06, + "loss": 0.1123, + "step": 12820 + }, + { + "epoch": 1.108471208259536, + "grad_norm": 0.5958312479893616, + "learning_rate": 7.269952498697734e-06, + "loss": 0.1151, + "step": 12830 + }, + { + "epoch": 1.1093351764655062, + "grad_norm": 0.6065189288813805, + "learning_rate": 7.265795960198971e-06, + "loss": 0.1145, + "step": 12840 + }, + { + "epoch": 1.110199144671476, + "grad_norm": 0.574744744019306, + "learning_rate": 7.261637450292972e-06, + "loss": 0.1123, + "step": 12850 + }, + { + "epoch": 1.1110631128774462, + "grad_norm": 0.5523033030399436, + "learning_rate": 7.257476972597941e-06, + "loss": 0.1169, + "step": 12860 + }, + { + "epoch": 1.111927081083416, + "grad_norm": 0.5615343679114696, + "learning_rate": 7.2533145307337995e-06, + "loss": 0.1116, + "step": 12870 + }, + { + "epoch": 1.1127910492893862, + "grad_norm": 0.5892480180243681, + "learning_rate": 7.249150128322171e-06, + "loss": 0.1167, + "step": 12880 + }, + { + "epoch": 1.1136550174953561, + "grad_norm": 0.5835398687472092, + "learning_rate": 7.244983768986391e-06, + "loss": 0.1143, + "step": 12890 + }, + { + "epoch": 1.1145189857013262, + "grad_norm": 0.5420813706717189, + "learning_rate": 7.240815456351493e-06, + "loss": 0.1142, + "step": 12900 + }, + { + "epoch": 1.1153829539072961, + "grad_norm": 0.5560693442080733, + "learning_rate": 7.236645194044215e-06, + "loss": 0.1124, + "step": 12910 + }, + { + "epoch": 1.1162469221132663, + "grad_norm": 0.582775487561968, + "learning_rate": 7.232472985692985e-06, + "loss": 0.1158, + "step": 12920 + }, + { + "epoch": 1.1171108903192362, + "grad_norm": 0.5813772164210899, + "learning_rate": 7.228298834927932e-06, + "loss": 0.1151, + "step": 12930 + }, + { + "epoch": 1.1179748585252063, + "grad_norm": 0.551749722566045, + "learning_rate": 7.224122745380866e-06, + "loss": 0.1144, + "step": 12940 + }, + { + "epoch": 1.1188388267311762, + "grad_norm": 0.5987437110852261, + "learning_rate": 7.2199447206852926e-06, + "loss": 0.1131, + "step": 12950 + }, + { + "epoch": 1.1197027949371463, + "grad_norm": 0.5823318038186402, + "learning_rate": 7.215764764476392e-06, + "loss": 0.1144, + "step": 12960 + }, + { + "epoch": 1.1205667631431164, + "grad_norm": 0.562823286330617, + "learning_rate": 7.211582880391036e-06, + "loss": 0.1138, + "step": 12970 + }, + { + "epoch": 1.1214307313490863, + "grad_norm": 0.5689016713168833, + "learning_rate": 7.2073990720677625e-06, + "loss": 0.1082, + "step": 12980 + }, + { + "epoch": 1.1222946995550565, + "grad_norm": 0.5669520425360047, + "learning_rate": 7.203213343146793e-06, + "loss": 0.1123, + "step": 12990 + }, + { + "epoch": 1.1231586677610264, + "grad_norm": 0.5909865997664461, + "learning_rate": 7.199025697270014e-06, + "loss": 0.1168, + "step": 13000 + }, + { + "epoch": 1.1240226359669965, + "grad_norm": 0.6073490868130594, + "learning_rate": 7.194836138080983e-06, + "loss": 0.1164, + "step": 13010 + }, + { + "epoch": 1.1248866041729664, + "grad_norm": 0.5738517459776369, + "learning_rate": 7.19064466922492e-06, + "loss": 0.1115, + "step": 13020 + }, + { + "epoch": 1.1257505723789365, + "grad_norm": 0.5341322056648838, + "learning_rate": 7.186451294348708e-06, + "loss": 0.1127, + "step": 13030 + }, + { + "epoch": 1.1266145405849064, + "grad_norm": 0.5659073456165932, + "learning_rate": 7.182256017100888e-06, + "loss": 0.1132, + "step": 13040 + }, + { + "epoch": 1.1274785087908765, + "grad_norm": 0.5733078496207322, + "learning_rate": 7.178058841131658e-06, + "loss": 0.1152, + "step": 13050 + }, + { + "epoch": 1.1283424769968464, + "grad_norm": 0.5917250921625605, + "learning_rate": 7.173859770092863e-06, + "loss": 0.1137, + "step": 13060 + }, + { + "epoch": 1.1292064452028165, + "grad_norm": 0.5724064155200794, + "learning_rate": 7.1696588076380025e-06, + "loss": 0.1124, + "step": 13070 + }, + { + "epoch": 1.1300704134087867, + "grad_norm": 0.5839270882164534, + "learning_rate": 7.165455957422219e-06, + "loss": 0.1173, + "step": 13080 + }, + { + "epoch": 1.1309343816147566, + "grad_norm": 0.5818668132052538, + "learning_rate": 7.161251223102297e-06, + "loss": 0.1116, + "step": 13090 + }, + { + "epoch": 1.1317983498207267, + "grad_norm": 0.562653394134604, + "learning_rate": 7.15704460833666e-06, + "loss": 0.1169, + "step": 13100 + }, + { + "epoch": 1.1326623180266966, + "grad_norm": 0.5369258651613542, + "learning_rate": 7.152836116785372e-06, + "loss": 0.1144, + "step": 13110 + }, + { + "epoch": 1.1335262862326667, + "grad_norm": 0.5609598754088422, + "learning_rate": 7.148625752110125e-06, + "loss": 0.1128, + "step": 13120 + }, + { + "epoch": 1.1343902544386366, + "grad_norm": 0.5307583270088041, + "learning_rate": 7.1444135179742424e-06, + "loss": 0.1129, + "step": 13130 + }, + { + "epoch": 1.1352542226446067, + "grad_norm": 0.5794753705639407, + "learning_rate": 7.140199418042674e-06, + "loss": 0.116, + "step": 13140 + }, + { + "epoch": 1.1361181908505766, + "grad_norm": 0.5765150526409482, + "learning_rate": 7.135983455981993e-06, + "loss": 0.1161, + "step": 13150 + }, + { + "epoch": 1.1369821590565468, + "grad_norm": 0.588585379901811, + "learning_rate": 7.131765635460394e-06, + "loss": 0.1156, + "step": 13160 + }, + { + "epoch": 1.1378461272625167, + "grad_norm": 0.5268321880508435, + "learning_rate": 7.127545960147685e-06, + "loss": 0.1105, + "step": 13170 + }, + { + "epoch": 1.1387100954684868, + "grad_norm": 0.5409759474589731, + "learning_rate": 7.123324433715293e-06, + "loss": 0.1139, + "step": 13180 + }, + { + "epoch": 1.1395740636744567, + "grad_norm": 0.548753740483243, + "learning_rate": 7.119101059836252e-06, + "loss": 0.1115, + "step": 13190 + }, + { + "epoch": 1.1404380318804268, + "grad_norm": 0.5473787441617439, + "learning_rate": 7.114875842185205e-06, + "loss": 0.1132, + "step": 13200 + }, + { + "epoch": 1.1413020000863967, + "grad_norm": 0.5521716011944642, + "learning_rate": 7.110648784438396e-06, + "loss": 0.1132, + "step": 13210 + }, + { + "epoch": 1.1421659682923668, + "grad_norm": 0.5452012339396002, + "learning_rate": 7.1064198902736766e-06, + "loss": 0.1136, + "step": 13220 + }, + { + "epoch": 1.143029936498337, + "grad_norm": 0.573456706312834, + "learning_rate": 7.10218916337049e-06, + "loss": 0.1117, + "step": 13230 + }, + { + "epoch": 1.1438939047043069, + "grad_norm": 0.5486166950146599, + "learning_rate": 7.097956607409876e-06, + "loss": 0.1138, + "step": 13240 + }, + { + "epoch": 1.144757872910277, + "grad_norm": 0.5526828504425823, + "learning_rate": 7.093722226074467e-06, + "loss": 0.1099, + "step": 13250 + }, + { + "epoch": 1.1456218411162469, + "grad_norm": 0.5772698801339367, + "learning_rate": 7.089486023048482e-06, + "loss": 0.1102, + "step": 13260 + }, + { + "epoch": 1.146485809322217, + "grad_norm": 0.6049052486348215, + "learning_rate": 7.0852480020177265e-06, + "loss": 0.1124, + "step": 13270 + }, + { + "epoch": 1.147349777528187, + "grad_norm": 0.5755929622844455, + "learning_rate": 7.081008166669585e-06, + "loss": 0.1113, + "step": 13280 + }, + { + "epoch": 1.148213745734157, + "grad_norm": 0.5789717682270602, + "learning_rate": 7.076766520693024e-06, + "loss": 0.113, + "step": 13290 + }, + { + "epoch": 1.149077713940127, + "grad_norm": 0.5590626048251914, + "learning_rate": 7.072523067778583e-06, + "loss": 0.1154, + "step": 13300 + }, + { + "epoch": 1.149941682146097, + "grad_norm": 0.5562146192899278, + "learning_rate": 7.068277811618376e-06, + "loss": 0.1147, + "step": 13310 + }, + { + "epoch": 1.150805650352067, + "grad_norm": 0.5391519600284189, + "learning_rate": 7.064030755906084e-06, + "loss": 0.1132, + "step": 13320 + }, + { + "epoch": 1.151669618558037, + "grad_norm": 0.5689682085308547, + "learning_rate": 7.059781904336953e-06, + "loss": 0.1128, + "step": 13330 + }, + { + "epoch": 1.1525335867640072, + "grad_norm": 0.5685437828904983, + "learning_rate": 7.055531260607795e-06, + "loss": 0.1151, + "step": 13340 + }, + { + "epoch": 1.153397554969977, + "grad_norm": 0.5947489843605617, + "learning_rate": 7.051278828416979e-06, + "loss": 0.112, + "step": 13350 + }, + { + "epoch": 1.1542615231759472, + "grad_norm": 0.5463410502004773, + "learning_rate": 7.047024611464428e-06, + "loss": 0.1138, + "step": 13360 + }, + { + "epoch": 1.1551254913819171, + "grad_norm": 0.5374793646965029, + "learning_rate": 7.042768613451623e-06, + "loss": 0.1132, + "step": 13370 + }, + { + "epoch": 1.1559894595878872, + "grad_norm": 0.5966329054253161, + "learning_rate": 7.038510838081588e-06, + "loss": 0.1125, + "step": 13380 + }, + { + "epoch": 1.1568534277938571, + "grad_norm": 0.5531816864338223, + "learning_rate": 7.0342512890589e-06, + "loss": 0.1124, + "step": 13390 + }, + { + "epoch": 1.1577173959998273, + "grad_norm": 0.6014056601060235, + "learning_rate": 7.029989970089675e-06, + "loss": 0.1121, + "step": 13400 + }, + { + "epoch": 1.1585813642057972, + "grad_norm": 0.5984871684047993, + "learning_rate": 7.025726884881572e-06, + "loss": 0.1135, + "step": 13410 + }, + { + "epoch": 1.1594453324117673, + "grad_norm": 0.5583075318226408, + "learning_rate": 7.021462037143783e-06, + "loss": 0.1119, + "step": 13420 + }, + { + "epoch": 1.1603093006177372, + "grad_norm": 0.5502086604536702, + "learning_rate": 7.017195430587037e-06, + "loss": 0.1161, + "step": 13430 + }, + { + "epoch": 1.1611732688237073, + "grad_norm": 0.5918176410941008, + "learning_rate": 7.012927068923592e-06, + "loss": 0.1145, + "step": 13440 + }, + { + "epoch": 1.1620372370296774, + "grad_norm": 0.5545319587622841, + "learning_rate": 7.008656955867232e-06, + "loss": 0.1123, + "step": 13450 + }, + { + "epoch": 1.1629012052356473, + "grad_norm": 0.5673106420372879, + "learning_rate": 7.004385095133268e-06, + "loss": 0.1152, + "step": 13460 + }, + { + "epoch": 1.1637651734416172, + "grad_norm": 0.5840240557788225, + "learning_rate": 7.000111490438527e-06, + "loss": 0.1107, + "step": 13470 + }, + { + "epoch": 1.1646291416475874, + "grad_norm": 0.543595956222494, + "learning_rate": 6.9958361455013556e-06, + "loss": 0.1145, + "step": 13480 + }, + { + "epoch": 1.1654931098535575, + "grad_norm": 0.5490360713143707, + "learning_rate": 6.991559064041618e-06, + "loss": 0.1151, + "step": 13490 + }, + { + "epoch": 1.1663570780595274, + "grad_norm": 0.5905952151929723, + "learning_rate": 6.987280249780682e-06, + "loss": 0.1123, + "step": 13500 + }, + { + "epoch": 1.1672210462654975, + "grad_norm": 0.5708684897331763, + "learning_rate": 6.9829997064414315e-06, + "loss": 0.1117, + "step": 13510 + }, + { + "epoch": 1.1680850144714674, + "grad_norm": 0.5707805140043376, + "learning_rate": 6.9787174377482454e-06, + "loss": 0.1129, + "step": 13520 + }, + { + "epoch": 1.1689489826774375, + "grad_norm": 0.5734976835058482, + "learning_rate": 6.9744334474270134e-06, + "loss": 0.1127, + "step": 13530 + }, + { + "epoch": 1.1698129508834074, + "grad_norm": 0.5754813896572345, + "learning_rate": 6.970147739205115e-06, + "loss": 0.1111, + "step": 13540 + }, + { + "epoch": 1.1706769190893775, + "grad_norm": 0.5805142687488006, + "learning_rate": 6.965860316811432e-06, + "loss": 0.1096, + "step": 13550 + }, + { + "epoch": 1.1715408872953474, + "grad_norm": 0.5677452272337598, + "learning_rate": 6.96157118397633e-06, + "loss": 0.1114, + "step": 13560 + }, + { + "epoch": 1.1724048555013176, + "grad_norm": 0.5478940239346025, + "learning_rate": 6.957280344431669e-06, + "loss": 0.1139, + "step": 13570 + }, + { + "epoch": 1.1732688237072875, + "grad_norm": 0.5705600986096185, + "learning_rate": 6.9529878019107886e-06, + "loss": 0.1141, + "step": 13580 + }, + { + "epoch": 1.1741327919132576, + "grad_norm": 0.5671172827221018, + "learning_rate": 6.948693560148515e-06, + "loss": 0.1153, + "step": 13590 + }, + { + "epoch": 1.1749967601192277, + "grad_norm": 0.5887655397357853, + "learning_rate": 6.944397622881151e-06, + "loss": 0.1166, + "step": 13600 + }, + { + "epoch": 1.1758607283251976, + "grad_norm": 0.5712255557841519, + "learning_rate": 6.940099993846472e-06, + "loss": 0.1116, + "step": 13610 + }, + { + "epoch": 1.1767246965311677, + "grad_norm": 0.5388407173904846, + "learning_rate": 6.935800676783731e-06, + "loss": 0.113, + "step": 13620 + }, + { + "epoch": 1.1775886647371376, + "grad_norm": 0.5524086717947396, + "learning_rate": 6.931499675433644e-06, + "loss": 0.1106, + "step": 13630 + }, + { + "epoch": 1.1784526329431078, + "grad_norm": 0.5900683311612895, + "learning_rate": 6.927196993538396e-06, + "loss": 0.1143, + "step": 13640 + }, + { + "epoch": 1.1793166011490777, + "grad_norm": 0.5571666223561705, + "learning_rate": 6.922892634841632e-06, + "loss": 0.1105, + "step": 13650 + }, + { + "epoch": 1.1801805693550478, + "grad_norm": 0.5449665981514987, + "learning_rate": 6.91858660308846e-06, + "loss": 0.1141, + "step": 13660 + }, + { + "epoch": 1.1810445375610177, + "grad_norm": 0.5464355283512453, + "learning_rate": 6.914278902025436e-06, + "loss": 0.1123, + "step": 13670 + }, + { + "epoch": 1.1819085057669878, + "grad_norm": 0.5519986042239243, + "learning_rate": 6.909969535400578e-06, + "loss": 0.1125, + "step": 13680 + }, + { + "epoch": 1.1827724739729577, + "grad_norm": 0.5614519421544537, + "learning_rate": 6.905658506963344e-06, + "loss": 0.1141, + "step": 13690 + }, + { + "epoch": 1.1836364421789278, + "grad_norm": 0.5704123213334359, + "learning_rate": 6.901345820464647e-06, + "loss": 0.1149, + "step": 13700 + }, + { + "epoch": 1.184500410384898, + "grad_norm": 0.5776361497185494, + "learning_rate": 6.897031479656833e-06, + "loss": 0.1123, + "step": 13710 + }, + { + "epoch": 1.1853643785908679, + "grad_norm": 0.5459654368909052, + "learning_rate": 6.8927154882936955e-06, + "loss": 0.1129, + "step": 13720 + }, + { + "epoch": 1.1862283467968378, + "grad_norm": 0.5748613451436307, + "learning_rate": 6.888397850130458e-06, + "loss": 0.115, + "step": 13730 + }, + { + "epoch": 1.1870923150028079, + "grad_norm": 0.5431693832897374, + "learning_rate": 6.884078568923782e-06, + "loss": 0.1123, + "step": 13740 + }, + { + "epoch": 1.187956283208778, + "grad_norm": 0.5623779523522215, + "learning_rate": 6.879757648431755e-06, + "loss": 0.1127, + "step": 13750 + }, + { + "epoch": 1.188820251414748, + "grad_norm": 0.592553622911199, + "learning_rate": 6.875435092413894e-06, + "loss": 0.1153, + "step": 13760 + }, + { + "epoch": 1.189684219620718, + "grad_norm": 0.5595958783240502, + "learning_rate": 6.871110904631136e-06, + "loss": 0.1138, + "step": 13770 + }, + { + "epoch": 1.190548187826688, + "grad_norm": 0.5633116470772369, + "learning_rate": 6.866785088845838e-06, + "loss": 0.1143, + "step": 13780 + }, + { + "epoch": 1.191412156032658, + "grad_norm": 0.5909659506315208, + "learning_rate": 6.862457648821773e-06, + "loss": 0.1134, + "step": 13790 + }, + { + "epoch": 1.192276124238628, + "grad_norm": 0.5739285382431173, + "learning_rate": 6.858128588324131e-06, + "loss": 0.1133, + "step": 13800 + }, + { + "epoch": 1.193140092444598, + "grad_norm": 0.5721923307570116, + "learning_rate": 6.853797911119509e-06, + "loss": 0.1137, + "step": 13810 + }, + { + "epoch": 1.194004060650568, + "grad_norm": 0.5613644285392295, + "learning_rate": 6.84946562097591e-06, + "loss": 0.1122, + "step": 13820 + }, + { + "epoch": 1.194868028856538, + "grad_norm": 0.5542391369025897, + "learning_rate": 6.845131721662741e-06, + "loss": 0.114, + "step": 13830 + }, + { + "epoch": 1.195731997062508, + "grad_norm": 0.5437931235251273, + "learning_rate": 6.840796216950813e-06, + "loss": 0.1104, + "step": 13840 + }, + { + "epoch": 1.1965959652684781, + "grad_norm": 0.5727878132218367, + "learning_rate": 6.836459110612326e-06, + "loss": 0.1161, + "step": 13850 + }, + { + "epoch": 1.1974599334744482, + "grad_norm": 0.5552817579336806, + "learning_rate": 6.83212040642088e-06, + "loss": 0.1136, + "step": 13860 + }, + { + "epoch": 1.1983239016804181, + "grad_norm": 0.559250789005052, + "learning_rate": 6.827780108151464e-06, + "loss": 0.1125, + "step": 13870 + }, + { + "epoch": 1.1991878698863883, + "grad_norm": 0.5937634902485177, + "learning_rate": 6.823438219580452e-06, + "loss": 0.1083, + "step": 13880 + }, + { + "epoch": 1.2000518380923582, + "grad_norm": 0.5552214918377005, + "learning_rate": 6.819094744485601e-06, + "loss": 0.1129, + "step": 13890 + }, + { + "epoch": 1.2009158062983283, + "grad_norm": 0.5655419429967947, + "learning_rate": 6.8147496866460525e-06, + "loss": 0.1101, + "step": 13900 + }, + { + "epoch": 1.2017797745042982, + "grad_norm": 0.5839619215629936, + "learning_rate": 6.810403049842324e-06, + "loss": 0.1085, + "step": 13910 + }, + { + "epoch": 1.2026437427102683, + "grad_norm": 0.5409388759672646, + "learning_rate": 6.806054837856301e-06, + "loss": 0.1138, + "step": 13920 + }, + { + "epoch": 1.2035077109162382, + "grad_norm": 0.57144561714602, + "learning_rate": 6.801705054471248e-06, + "loss": 0.1145, + "step": 13930 + }, + { + "epoch": 1.2043716791222083, + "grad_norm": 0.5424545215147586, + "learning_rate": 6.79735370347179e-06, + "loss": 0.1139, + "step": 13940 + }, + { + "epoch": 1.2052356473281782, + "grad_norm": 0.5618561017996767, + "learning_rate": 6.793000788643923e-06, + "loss": 0.1116, + "step": 13950 + }, + { + "epoch": 1.2060996155341484, + "grad_norm": 0.5676546554249075, + "learning_rate": 6.788646313774996e-06, + "loss": 0.1133, + "step": 13960 + }, + { + "epoch": 1.2069635837401185, + "grad_norm": 0.6010536091884849, + "learning_rate": 6.784290282653719e-06, + "loss": 0.1125, + "step": 13970 + }, + { + "epoch": 1.2078275519460884, + "grad_norm": 0.6034476266033024, + "learning_rate": 6.779932699070157e-06, + "loss": 0.109, + "step": 13980 + }, + { + "epoch": 1.2086915201520585, + "grad_norm": 0.5223870934855904, + "learning_rate": 6.775573566815725e-06, + "loss": 0.1104, + "step": 13990 + }, + { + "epoch": 1.2095554883580284, + "grad_norm": 0.5894416024611014, + "learning_rate": 6.771212889683182e-06, + "loss": 0.1118, + "step": 14000 + }, + { + "epoch": 1.2104194565639985, + "grad_norm": 0.5336729589337562, + "learning_rate": 6.766850671466637e-06, + "loss": 0.1133, + "step": 14010 + }, + { + "epoch": 1.2112834247699684, + "grad_norm": 0.6024966427336127, + "learning_rate": 6.762486915961536e-06, + "loss": 0.1095, + "step": 14020 + }, + { + "epoch": 1.2121473929759385, + "grad_norm": 0.5413840123335933, + "learning_rate": 6.758121626964665e-06, + "loss": 0.1132, + "step": 14030 + }, + { + "epoch": 1.2130113611819084, + "grad_norm": 0.582683552400572, + "learning_rate": 6.753754808274139e-06, + "loss": 0.1142, + "step": 14040 + }, + { + "epoch": 1.2138753293878786, + "grad_norm": 0.5593554231560006, + "learning_rate": 6.749386463689413e-06, + "loss": 0.111, + "step": 14050 + }, + { + "epoch": 1.2147392975938485, + "grad_norm": 0.5534453036111682, + "learning_rate": 6.74501659701126e-06, + "loss": 0.112, + "step": 14060 + }, + { + "epoch": 1.2156032657998186, + "grad_norm": 0.5373879399272095, + "learning_rate": 6.7406452120417846e-06, + "loss": 0.1104, + "step": 14070 + }, + { + "epoch": 1.2164672340057887, + "grad_norm": 0.5605673893423937, + "learning_rate": 6.736272312584408e-06, + "loss": 0.1146, + "step": 14080 + }, + { + "epoch": 1.2173312022117586, + "grad_norm": 0.5282153572951253, + "learning_rate": 6.7318979024438725e-06, + "loss": 0.1108, + "step": 14090 + }, + { + "epoch": 1.2181951704177285, + "grad_norm": 0.5738009147157338, + "learning_rate": 6.7275219854262295e-06, + "loss": 0.1101, + "step": 14100 + }, + { + "epoch": 1.2190591386236986, + "grad_norm": 0.5537976410855348, + "learning_rate": 6.7231445653388494e-06, + "loss": 0.1117, + "step": 14110 + }, + { + "epoch": 1.2199231068296688, + "grad_norm": 0.536439976576386, + "learning_rate": 6.718765645990402e-06, + "loss": 0.1126, + "step": 14120 + }, + { + "epoch": 1.2207870750356387, + "grad_norm": 0.5614207764845589, + "learning_rate": 6.7143852311908695e-06, + "loss": 0.11, + "step": 14130 + }, + { + "epoch": 1.2216510432416088, + "grad_norm": 0.5799281943109638, + "learning_rate": 6.7100033247515265e-06, + "loss": 0.1118, + "step": 14140 + }, + { + "epoch": 1.2225150114475787, + "grad_norm": 0.5334465896125076, + "learning_rate": 6.705619930484954e-06, + "loss": 0.1137, + "step": 14150 + }, + { + "epoch": 1.2233789796535488, + "grad_norm": 0.5418264694625892, + "learning_rate": 6.701235052205023e-06, + "loss": 0.1118, + "step": 14160 + }, + { + "epoch": 1.2242429478595187, + "grad_norm": 0.5716467405913869, + "learning_rate": 6.696848693726896e-06, + "loss": 0.1137, + "step": 14170 + }, + { + "epoch": 1.2251069160654888, + "grad_norm": 0.5600841783319294, + "learning_rate": 6.692460858867025e-06, + "loss": 0.1117, + "step": 14180 + }, + { + "epoch": 1.2259708842714587, + "grad_norm": 0.5430963957009048, + "learning_rate": 6.6880715514431424e-06, + "loss": 0.11, + "step": 14190 + }, + { + "epoch": 1.2268348524774288, + "grad_norm": 0.610965208613663, + "learning_rate": 6.683680775274267e-06, + "loss": 0.1161, + "step": 14200 + }, + { + "epoch": 1.2276988206833988, + "grad_norm": 0.5428600711535735, + "learning_rate": 6.679288534180692e-06, + "loss": 0.114, + "step": 14210 + }, + { + "epoch": 1.2285627888893689, + "grad_norm": 0.5257402970258083, + "learning_rate": 6.67489483198399e-06, + "loss": 0.1127, + "step": 14220 + }, + { + "epoch": 1.229426757095339, + "grad_norm": 0.5695987153021643, + "learning_rate": 6.670499672506996e-06, + "loss": 0.109, + "step": 14230 + }, + { + "epoch": 1.230290725301309, + "grad_norm": 0.5421509104065498, + "learning_rate": 6.666103059573823e-06, + "loss": 0.1144, + "step": 14240 + }, + { + "epoch": 1.231154693507279, + "grad_norm": 0.5700089455833822, + "learning_rate": 6.661704997009841e-06, + "loss": 0.1096, + "step": 14250 + }, + { + "epoch": 1.232018661713249, + "grad_norm": 0.5770791849329917, + "learning_rate": 6.657305488641687e-06, + "loss": 0.1124, + "step": 14260 + }, + { + "epoch": 1.232882629919219, + "grad_norm": 0.5585275622314702, + "learning_rate": 6.65290453829725e-06, + "loss": 0.1153, + "step": 14270 + }, + { + "epoch": 1.233746598125189, + "grad_norm": 0.5494171663494277, + "learning_rate": 6.648502149805679e-06, + "loss": 0.1166, + "step": 14280 + }, + { + "epoch": 1.234610566331159, + "grad_norm": 0.5765287928053029, + "learning_rate": 6.6440983269973725e-06, + "loss": 0.1122, + "step": 14290 + }, + { + "epoch": 1.235474534537129, + "grad_norm": 0.5763938405118092, + "learning_rate": 6.639693073703974e-06, + "loss": 0.1076, + "step": 14300 + }, + { + "epoch": 1.236338502743099, + "grad_norm": 0.5494980151341401, + "learning_rate": 6.635286393758376e-06, + "loss": 0.1143, + "step": 14310 + }, + { + "epoch": 1.237202470949069, + "grad_norm": 0.5680501144954855, + "learning_rate": 6.63087829099471e-06, + "loss": 0.1137, + "step": 14320 + }, + { + "epoch": 1.2380664391550391, + "grad_norm": 0.5452528726216044, + "learning_rate": 6.6264687692483455e-06, + "loss": 0.111, + "step": 14330 + }, + { + "epoch": 1.2389304073610092, + "grad_norm": 0.575161065532565, + "learning_rate": 6.6220578323558885e-06, + "loss": 0.1155, + "step": 14340 + }, + { + "epoch": 1.2397943755669791, + "grad_norm": 0.5825666496800802, + "learning_rate": 6.617645484155173e-06, + "loss": 0.1138, + "step": 14350 + }, + { + "epoch": 1.240658343772949, + "grad_norm": 0.5573343725422848, + "learning_rate": 6.6132317284852656e-06, + "loss": 0.1115, + "step": 14360 + }, + { + "epoch": 1.2415223119789192, + "grad_norm": 0.5976405807104025, + "learning_rate": 6.60881656918645e-06, + "loss": 0.1119, + "step": 14370 + }, + { + "epoch": 1.2423862801848893, + "grad_norm": 0.5773790829133977, + "learning_rate": 6.60440001010024e-06, + "loss": 0.1109, + "step": 14380 + }, + { + "epoch": 1.2432502483908592, + "grad_norm": 0.5596129375427722, + "learning_rate": 6.599982055069363e-06, + "loss": 0.1133, + "step": 14390 + }, + { + "epoch": 1.2441142165968293, + "grad_norm": 0.5986154505530558, + "learning_rate": 6.595562707937759e-06, + "loss": 0.1112, + "step": 14400 + }, + { + "epoch": 1.2449781848027992, + "grad_norm": 0.5850475888093254, + "learning_rate": 6.591141972550581e-06, + "loss": 0.1122, + "step": 14410 + }, + { + "epoch": 1.2458421530087693, + "grad_norm": 0.5438809542880516, + "learning_rate": 6.5867198527541935e-06, + "loss": 0.1124, + "step": 14420 + }, + { + "epoch": 1.2467061212147392, + "grad_norm": 0.5854222234007489, + "learning_rate": 6.582296352396158e-06, + "loss": 0.1145, + "step": 14430 + }, + { + "epoch": 1.2475700894207093, + "grad_norm": 0.5647614923048787, + "learning_rate": 6.577871475325245e-06, + "loss": 0.1131, + "step": 14440 + }, + { + "epoch": 1.2484340576266792, + "grad_norm": 0.5446916876914634, + "learning_rate": 6.573445225391417e-06, + "loss": 0.1128, + "step": 14450 + }, + { + "epoch": 1.2492980258326494, + "grad_norm": 0.556275506719302, + "learning_rate": 6.569017606445836e-06, + "loss": 0.1104, + "step": 14460 + }, + { + "epoch": 1.2501619940386193, + "grad_norm": 0.5439613445971405, + "learning_rate": 6.564588622340848e-06, + "loss": 0.1085, + "step": 14470 + }, + { + "epoch": 1.2510259622445894, + "grad_norm": 0.5628308747399765, + "learning_rate": 6.560158276929996e-06, + "loss": 0.119, + "step": 14480 + }, + { + "epoch": 1.2518899304505595, + "grad_norm": 0.561991898239134, + "learning_rate": 6.555726574068e-06, + "loss": 0.1146, + "step": 14490 + }, + { + "epoch": 1.2527538986565294, + "grad_norm": 0.5751250413019706, + "learning_rate": 6.551293517610764e-06, + "loss": 0.1085, + "step": 14500 + }, + { + "epoch": 1.2536178668624995, + "grad_norm": 0.5640021015599359, + "learning_rate": 6.546859111415371e-06, + "loss": 0.1102, + "step": 14510 + }, + { + "epoch": 1.2544818350684694, + "grad_norm": 0.5635763265614476, + "learning_rate": 6.5424233593400735e-06, + "loss": 0.1145, + "step": 14520 + }, + { + "epoch": 1.2553458032744396, + "grad_norm": 0.591286004727335, + "learning_rate": 6.537986265244302e-06, + "loss": 0.111, + "step": 14530 + }, + { + "epoch": 1.2562097714804095, + "grad_norm": 0.5522099040559655, + "learning_rate": 6.533547832988647e-06, + "loss": 0.1111, + "step": 14540 + }, + { + "epoch": 1.2570737396863796, + "grad_norm": 0.5470523497296542, + "learning_rate": 6.529108066434872e-06, + "loss": 0.1114, + "step": 14550 + }, + { + "epoch": 1.2579377078923495, + "grad_norm": 0.568333276620229, + "learning_rate": 6.524666969445892e-06, + "loss": 0.1119, + "step": 14560 + }, + { + "epoch": 1.2588016760983196, + "grad_norm": 0.5850616288956318, + "learning_rate": 6.520224545885789e-06, + "loss": 0.1096, + "step": 14570 + }, + { + "epoch": 1.2596656443042895, + "grad_norm": 0.5773097418715036, + "learning_rate": 6.515780799619791e-06, + "loss": 0.1135, + "step": 14580 + }, + { + "epoch": 1.2605296125102596, + "grad_norm": 0.5388132754358779, + "learning_rate": 6.511335734514283e-06, + "loss": 0.1119, + "step": 14590 + }, + { + "epoch": 1.2613935807162298, + "grad_norm": 0.5577823715228171, + "learning_rate": 6.506889354436792e-06, + "loss": 0.1176, + "step": 14600 + }, + { + "epoch": 1.2622575489221997, + "grad_norm": 0.5339935975649758, + "learning_rate": 6.5024416632559935e-06, + "loss": 0.107, + "step": 14610 + }, + { + "epoch": 1.2631215171281696, + "grad_norm": 0.5732228640887302, + "learning_rate": 6.4979926648417025e-06, + "loss": 0.1124, + "step": 14620 + }, + { + "epoch": 1.2639854853341397, + "grad_norm": 0.5903353302881912, + "learning_rate": 6.493542363064871e-06, + "loss": 0.1145, + "step": 14630 + }, + { + "epoch": 1.2648494535401098, + "grad_norm": 0.5470991075480361, + "learning_rate": 6.489090761797583e-06, + "loss": 0.108, + "step": 14640 + }, + { + "epoch": 1.2657134217460797, + "grad_norm": 0.5625592769987781, + "learning_rate": 6.484637864913059e-06, + "loss": 0.1114, + "step": 14650 + }, + { + "epoch": 1.2665773899520498, + "grad_norm": 0.5689791373198402, + "learning_rate": 6.4801836762856406e-06, + "loss": 0.1133, + "step": 14660 + }, + { + "epoch": 1.2674413581580197, + "grad_norm": 0.5444276374757011, + "learning_rate": 6.475728199790796e-06, + "loss": 0.1094, + "step": 14670 + }, + { + "epoch": 1.2683053263639898, + "grad_norm": 0.5667653808009618, + "learning_rate": 6.471271439305115e-06, + "loss": 0.112, + "step": 14680 + }, + { + "epoch": 1.2691692945699597, + "grad_norm": 0.5344168071211037, + "learning_rate": 6.466813398706302e-06, + "loss": 0.1128, + "step": 14690 + }, + { + "epoch": 1.2700332627759299, + "grad_norm": 0.5510709964262109, + "learning_rate": 6.462354081873177e-06, + "loss": 0.1101, + "step": 14700 + }, + { + "epoch": 1.2708972309819, + "grad_norm": 0.5653351061563336, + "learning_rate": 6.457893492685671e-06, + "loss": 0.1126, + "step": 14710 + }, + { + "epoch": 1.27176119918787, + "grad_norm": 0.5565782210839264, + "learning_rate": 6.453431635024817e-06, + "loss": 0.111, + "step": 14720 + }, + { + "epoch": 1.2726251673938398, + "grad_norm": 0.5547676333122443, + "learning_rate": 6.4489685127727606e-06, + "loss": 0.1086, + "step": 14730 + }, + { + "epoch": 1.27348913559981, + "grad_norm": 0.544001901072926, + "learning_rate": 6.44450412981274e-06, + "loss": 0.1111, + "step": 14740 + }, + { + "epoch": 1.27435310380578, + "grad_norm": 0.5267941742084936, + "learning_rate": 6.440038490029092e-06, + "loss": 0.112, + "step": 14750 + }, + { + "epoch": 1.27521707201175, + "grad_norm": 0.5453459720243373, + "learning_rate": 6.435571597307251e-06, + "loss": 0.1119, + "step": 14760 + }, + { + "epoch": 1.27608104021772, + "grad_norm": 0.5647611308613276, + "learning_rate": 6.431103455533735e-06, + "loss": 0.1155, + "step": 14770 + }, + { + "epoch": 1.27694500842369, + "grad_norm": 0.5702592798618671, + "learning_rate": 6.426634068596154e-06, + "loss": 0.1077, + "step": 14780 + }, + { + "epoch": 1.27780897662966, + "grad_norm": 0.6036801878221308, + "learning_rate": 6.422163440383202e-06, + "loss": 0.1168, + "step": 14790 + }, + { + "epoch": 1.27867294483563, + "grad_norm": 0.5962193986965203, + "learning_rate": 6.417691574784647e-06, + "loss": 0.1155, + "step": 14800 + }, + { + "epoch": 1.2795369130416, + "grad_norm": 0.593769084774705, + "learning_rate": 6.413218475691337e-06, + "loss": 0.1136, + "step": 14810 + }, + { + "epoch": 1.28040088124757, + "grad_norm": 0.5378674069612178, + "learning_rate": 6.408744146995197e-06, + "loss": 0.1127, + "step": 14820 + }, + { + "epoch": 1.2812648494535401, + "grad_norm": 0.552753211413115, + "learning_rate": 6.404268592589214e-06, + "loss": 0.1095, + "step": 14830 + }, + { + "epoch": 1.28212881765951, + "grad_norm": 0.5624083853062832, + "learning_rate": 6.39979181636745e-06, + "loss": 0.1084, + "step": 14840 + }, + { + "epoch": 1.2829927858654802, + "grad_norm": 0.5601341611146992, + "learning_rate": 6.395313822225022e-06, + "loss": 0.1125, + "step": 14850 + }, + { + "epoch": 1.2838567540714503, + "grad_norm": 0.5172364101610524, + "learning_rate": 6.390834614058114e-06, + "loss": 0.1114, + "step": 14860 + }, + { + "epoch": 1.2847207222774202, + "grad_norm": 0.5229962139350934, + "learning_rate": 6.38635419576396e-06, + "loss": 0.1132, + "step": 14870 + }, + { + "epoch": 1.28558469048339, + "grad_norm": 0.5756669235276612, + "learning_rate": 6.381872571240852e-06, + "loss": 0.1147, + "step": 14880 + }, + { + "epoch": 1.2864486586893602, + "grad_norm": 0.5563681551307071, + "learning_rate": 6.377389744388127e-06, + "loss": 0.1095, + "step": 14890 + }, + { + "epoch": 1.2873126268953303, + "grad_norm": 0.5854403852531498, + "learning_rate": 6.372905719106172e-06, + "loss": 0.1128, + "step": 14900 + }, + { + "epoch": 1.2881765951013002, + "grad_norm": 0.6036186605098479, + "learning_rate": 6.368420499296413e-06, + "loss": 0.1126, + "step": 14910 + }, + { + "epoch": 1.2890405633072703, + "grad_norm": 0.593748655103996, + "learning_rate": 6.36393408886132e-06, + "loss": 0.1148, + "step": 14920 + }, + { + "epoch": 1.2899045315132402, + "grad_norm": 0.5552533246969583, + "learning_rate": 6.359446491704394e-06, + "loss": 0.11, + "step": 14930 + }, + { + "epoch": 1.2907684997192104, + "grad_norm": 0.5879665603388455, + "learning_rate": 6.3549577117301735e-06, + "loss": 0.1121, + "step": 14940 + }, + { + "epoch": 1.2916324679251803, + "grad_norm": 0.553609513620935, + "learning_rate": 6.35046775284422e-06, + "loss": 0.1108, + "step": 14950 + }, + { + "epoch": 1.2924964361311504, + "grad_norm": 0.5804539813996858, + "learning_rate": 6.345976618953127e-06, + "loss": 0.1125, + "step": 14960 + }, + { + "epoch": 1.2933604043371205, + "grad_norm": 0.5404050350479398, + "learning_rate": 6.341484313964506e-06, + "loss": 0.108, + "step": 14970 + }, + { + "epoch": 1.2942243725430904, + "grad_norm": 0.5717453327587577, + "learning_rate": 6.33699084178699e-06, + "loss": 0.1132, + "step": 14980 + }, + { + "epoch": 1.2950883407490603, + "grad_norm": 0.5571547661988168, + "learning_rate": 6.332496206330227e-06, + "loss": 0.1118, + "step": 14990 + }, + { + "epoch": 1.2959523089550304, + "grad_norm": 0.5894118107294782, + "learning_rate": 6.328000411504876e-06, + "loss": 0.1119, + "step": 15000 + }, + { + "epoch": 1.2968162771610006, + "grad_norm": 0.6271319736229325, + "learning_rate": 6.323503461222602e-06, + "loss": 0.1093, + "step": 15010 + }, + { + "epoch": 1.2976802453669705, + "grad_norm": 0.5714163744986458, + "learning_rate": 6.319005359396084e-06, + "loss": 0.1163, + "step": 15020 + }, + { + "epoch": 1.2985442135729406, + "grad_norm": 0.570485713384322, + "learning_rate": 6.314506109938996e-06, + "loss": 0.1133, + "step": 15030 + }, + { + "epoch": 1.2994081817789105, + "grad_norm": 0.6031381548468219, + "learning_rate": 6.31000571676601e-06, + "loss": 0.1148, + "step": 15040 + }, + { + "epoch": 1.3002721499848806, + "grad_norm": 0.5782702057393322, + "learning_rate": 6.305504183792799e-06, + "loss": 0.1112, + "step": 15050 + }, + { + "epoch": 1.3011361181908505, + "grad_norm": 0.5618969513353681, + "learning_rate": 6.30100151493602e-06, + "loss": 0.1132, + "step": 15060 + }, + { + "epoch": 1.3020000863968206, + "grad_norm": 0.5627948160701411, + "learning_rate": 6.296497714113325e-06, + "loss": 0.1113, + "step": 15070 + }, + { + "epoch": 1.3028640546027908, + "grad_norm": 0.5534126614895483, + "learning_rate": 6.291992785243348e-06, + "loss": 0.112, + "step": 15080 + }, + { + "epoch": 1.3037280228087607, + "grad_norm": 0.5912501317649836, + "learning_rate": 6.287486732245705e-06, + "loss": 0.112, + "step": 15090 + }, + { + "epoch": 1.3045919910147306, + "grad_norm": 0.5600010807396356, + "learning_rate": 6.282979559040988e-06, + "loss": 0.1141, + "step": 15100 + }, + { + "epoch": 1.3054559592207007, + "grad_norm": 0.5935733410748419, + "learning_rate": 6.278471269550766e-06, + "loss": 0.1139, + "step": 15110 + }, + { + "epoch": 1.3063199274266708, + "grad_norm": 0.5600831456724733, + "learning_rate": 6.2739618676975785e-06, + "loss": 0.1116, + "step": 15120 + }, + { + "epoch": 1.3071838956326407, + "grad_norm": 0.5599466998893358, + "learning_rate": 6.2694513574049354e-06, + "loss": 0.1102, + "step": 15130 + }, + { + "epoch": 1.3080478638386106, + "grad_norm": 0.5559964600507826, + "learning_rate": 6.264939742597304e-06, + "loss": 0.1102, + "step": 15140 + }, + { + "epoch": 1.3089118320445807, + "grad_norm": 0.5786344058859312, + "learning_rate": 6.2604270272001235e-06, + "loss": 0.1131, + "step": 15150 + }, + { + "epoch": 1.3097758002505508, + "grad_norm": 0.5657515488388618, + "learning_rate": 6.255913215139778e-06, + "loss": 0.113, + "step": 15160 + }, + { + "epoch": 1.3106397684565207, + "grad_norm": 0.5863407258722373, + "learning_rate": 6.251398310343617e-06, + "loss": 0.1118, + "step": 15170 + }, + { + "epoch": 1.3115037366624909, + "grad_norm": 0.5442709891236638, + "learning_rate": 6.246882316739932e-06, + "loss": 0.1129, + "step": 15180 + }, + { + "epoch": 1.3123677048684608, + "grad_norm": 0.6002637189048005, + "learning_rate": 6.242365238257969e-06, + "loss": 0.1137, + "step": 15190 + }, + { + "epoch": 1.313231673074431, + "grad_norm": 0.5842893880761071, + "learning_rate": 6.237847078827914e-06, + "loss": 0.1126, + "step": 15200 + }, + { + "epoch": 1.3140956412804008, + "grad_norm": 0.5830066962552539, + "learning_rate": 6.233327842380894e-06, + "loss": 0.1161, + "step": 15210 + }, + { + "epoch": 1.314959609486371, + "grad_norm": 0.5373321121823259, + "learning_rate": 6.228807532848973e-06, + "loss": 0.111, + "step": 15220 + }, + { + "epoch": 1.315823577692341, + "grad_norm": 0.6108127472256695, + "learning_rate": 6.224286154165148e-06, + "loss": 0.1101, + "step": 15230 + }, + { + "epoch": 1.316687545898311, + "grad_norm": 0.5696760440842511, + "learning_rate": 6.219763710263349e-06, + "loss": 0.1081, + "step": 15240 + }, + { + "epoch": 1.3175515141042808, + "grad_norm": 0.5489297133762001, + "learning_rate": 6.21524020507843e-06, + "loss": 0.1118, + "step": 15250 + }, + { + "epoch": 1.318415482310251, + "grad_norm": 0.5613132623439822, + "learning_rate": 6.21071564254617e-06, + "loss": 0.1143, + "step": 15260 + }, + { + "epoch": 1.319279450516221, + "grad_norm": 0.5526160450209798, + "learning_rate": 6.206190026603267e-06, + "loss": 0.1126, + "step": 15270 + }, + { + "epoch": 1.320143418722191, + "grad_norm": 0.5769836694482383, + "learning_rate": 6.201663361187336e-06, + "loss": 0.1087, + "step": 15280 + }, + { + "epoch": 1.321007386928161, + "grad_norm": 0.55598238886065, + "learning_rate": 6.1971356502369065e-06, + "loss": 0.1131, + "step": 15290 + }, + { + "epoch": 1.321871355134131, + "grad_norm": 0.5735365686216224, + "learning_rate": 6.192606897691414e-06, + "loss": 0.1126, + "step": 15300 + }, + { + "epoch": 1.3227353233401011, + "grad_norm": 0.5783112362966369, + "learning_rate": 6.188077107491204e-06, + "loss": 0.1112, + "step": 15310 + }, + { + "epoch": 1.323599291546071, + "grad_norm": 0.5683067930605199, + "learning_rate": 6.183546283577523e-06, + "loss": 0.1121, + "step": 15320 + }, + { + "epoch": 1.3244632597520412, + "grad_norm": 0.5464109155323802, + "learning_rate": 6.1790144298925146e-06, + "loss": 0.1137, + "step": 15330 + }, + { + "epoch": 1.3253272279580113, + "grad_norm": 0.5585746473787109, + "learning_rate": 6.174481550379225e-06, + "loss": 0.1134, + "step": 15340 + }, + { + "epoch": 1.3261911961639812, + "grad_norm": 0.5545762428074108, + "learning_rate": 6.1699476489815876e-06, + "loss": 0.1149, + "step": 15350 + }, + { + "epoch": 1.327055164369951, + "grad_norm": 0.5748347832493429, + "learning_rate": 6.165412729644426e-06, + "loss": 0.109, + "step": 15360 + }, + { + "epoch": 1.3279191325759212, + "grad_norm": 0.5842776295530736, + "learning_rate": 6.1608767963134495e-06, + "loss": 0.1072, + "step": 15370 + }, + { + "epoch": 1.3287831007818913, + "grad_norm": 0.5575191047466767, + "learning_rate": 6.156339852935251e-06, + "loss": 0.1117, + "step": 15380 + }, + { + "epoch": 1.3296470689878612, + "grad_norm": 0.5450731649894345, + "learning_rate": 6.1518019034572995e-06, + "loss": 0.1144, + "step": 15390 + }, + { + "epoch": 1.3305110371938313, + "grad_norm": 0.5624842087427264, + "learning_rate": 6.147262951827943e-06, + "loss": 0.1107, + "step": 15400 + }, + { + "epoch": 1.3313750053998012, + "grad_norm": 0.566230353504596, + "learning_rate": 6.142723001996398e-06, + "loss": 0.1078, + "step": 15410 + }, + { + "epoch": 1.3322389736057714, + "grad_norm": 0.5596105650561354, + "learning_rate": 6.138182057912751e-06, + "loss": 0.1114, + "step": 15420 + }, + { + "epoch": 1.3331029418117413, + "grad_norm": 0.5631168310453766, + "learning_rate": 6.133640123527953e-06, + "loss": 0.1094, + "step": 15430 + }, + { + "epoch": 1.3339669100177114, + "grad_norm": 0.5757668723185331, + "learning_rate": 6.12909720279382e-06, + "loss": 0.1133, + "step": 15440 + }, + { + "epoch": 1.3348308782236813, + "grad_norm": 0.5656227206741703, + "learning_rate": 6.12455329966302e-06, + "loss": 0.1102, + "step": 15450 + }, + { + "epoch": 1.3356948464296514, + "grad_norm": 0.5675203492355643, + "learning_rate": 6.120008418089082e-06, + "loss": 0.1108, + "step": 15460 + }, + { + "epoch": 1.3365588146356213, + "grad_norm": 0.5609540526824188, + "learning_rate": 6.11546256202638e-06, + "loss": 0.1083, + "step": 15470 + }, + { + "epoch": 1.3374227828415914, + "grad_norm": 0.5453368748675613, + "learning_rate": 6.110915735430142e-06, + "loss": 0.1068, + "step": 15480 + }, + { + "epoch": 1.3382867510475616, + "grad_norm": 0.5783208678504869, + "learning_rate": 6.106367942256437e-06, + "loss": 0.114, + "step": 15490 + }, + { + "epoch": 1.3391507192535315, + "grad_norm": 0.5772617722487127, + "learning_rate": 6.1018191864621764e-06, + "loss": 0.1103, + "step": 15500 + }, + { + "epoch": 1.3400146874595014, + "grad_norm": 0.5726131201792536, + "learning_rate": 6.097269472005107e-06, + "loss": 0.1108, + "step": 15510 + }, + { + "epoch": 1.3408786556654715, + "grad_norm": 0.5397368294651602, + "learning_rate": 6.092718802843814e-06, + "loss": 0.1106, + "step": 15520 + }, + { + "epoch": 1.3417426238714416, + "grad_norm": 0.5591733492489677, + "learning_rate": 6.088167182937706e-06, + "loss": 0.1129, + "step": 15530 + }, + { + "epoch": 1.3426065920774115, + "grad_norm": 0.5561490349875766, + "learning_rate": 6.083614616247028e-06, + "loss": 0.112, + "step": 15540 + }, + { + "epoch": 1.3434705602833816, + "grad_norm": 0.5819373721015771, + "learning_rate": 6.0790611067328395e-06, + "loss": 0.1072, + "step": 15550 + }, + { + "epoch": 1.3443345284893515, + "grad_norm": 0.5359474980622055, + "learning_rate": 6.074506658357029e-06, + "loss": 0.1109, + "step": 15560 + }, + { + "epoch": 1.3451984966953217, + "grad_norm": 0.5720230417354536, + "learning_rate": 6.069951275082295e-06, + "loss": 0.1078, + "step": 15570 + }, + { + "epoch": 1.3460624649012916, + "grad_norm": 0.5432165606654724, + "learning_rate": 6.065394960872154e-06, + "loss": 0.1078, + "step": 15580 + }, + { + "epoch": 1.3469264331072617, + "grad_norm": 0.5956848004967984, + "learning_rate": 6.06083771969093e-06, + "loss": 0.1087, + "step": 15590 + }, + { + "epoch": 1.3477904013132318, + "grad_norm": 0.5831628365743788, + "learning_rate": 6.056279555503752e-06, + "loss": 0.1075, + "step": 15600 + }, + { + "epoch": 1.3486543695192017, + "grad_norm": 0.5360439964712416, + "learning_rate": 6.0517204722765585e-06, + "loss": 0.1083, + "step": 15610 + }, + { + "epoch": 1.3495183377251716, + "grad_norm": 0.5608239905295521, + "learning_rate": 6.047160473976081e-06, + "loss": 0.1145, + "step": 15620 + }, + { + "epoch": 1.3503823059311417, + "grad_norm": 0.5681136455973719, + "learning_rate": 6.04259956456985e-06, + "loss": 0.1091, + "step": 15630 + }, + { + "epoch": 1.3512462741371118, + "grad_norm": 0.5358581606501858, + "learning_rate": 6.038037748026187e-06, + "loss": 0.1095, + "step": 15640 + }, + { + "epoch": 1.3521102423430817, + "grad_norm": 0.5761867356055901, + "learning_rate": 6.0334750283142056e-06, + "loss": 0.1113, + "step": 15650 + }, + { + "epoch": 1.3529742105490519, + "grad_norm": 0.5717376371598626, + "learning_rate": 6.028911409403801e-06, + "loss": 0.1122, + "step": 15660 + }, + { + "epoch": 1.3538381787550218, + "grad_norm": 0.5571214583689651, + "learning_rate": 6.024346895265659e-06, + "loss": 0.1127, + "step": 15670 + }, + { + "epoch": 1.354702146960992, + "grad_norm": 0.5875745499147877, + "learning_rate": 6.019781489871235e-06, + "loss": 0.1083, + "step": 15680 + }, + { + "epoch": 1.3555661151669618, + "grad_norm": 0.5546175269594011, + "learning_rate": 6.015215197192763e-06, + "loss": 0.1079, + "step": 15690 + }, + { + "epoch": 1.356430083372932, + "grad_norm": 0.5450808281935141, + "learning_rate": 6.010648021203252e-06, + "loss": 0.1112, + "step": 15700 + }, + { + "epoch": 1.3572940515789018, + "grad_norm": 0.549235755180578, + "learning_rate": 6.006079965876476e-06, + "loss": 0.1085, + "step": 15710 + }, + { + "epoch": 1.358158019784872, + "grad_norm": 0.5623635978497271, + "learning_rate": 6.001511035186975e-06, + "loss": 0.1078, + "step": 15720 + }, + { + "epoch": 1.3590219879908418, + "grad_norm": 0.5484310920570102, + "learning_rate": 5.996941233110052e-06, + "loss": 0.1092, + "step": 15730 + }, + { + "epoch": 1.359885956196812, + "grad_norm": 0.5154476200776394, + "learning_rate": 5.992370563621766e-06, + "loss": 0.1085, + "step": 15740 + }, + { + "epoch": 1.360749924402782, + "grad_norm": 0.5553387563055833, + "learning_rate": 5.987799030698932e-06, + "loss": 0.112, + "step": 15750 + }, + { + "epoch": 1.361613892608752, + "grad_norm": 0.5580242137209885, + "learning_rate": 5.9832266383191154e-06, + "loss": 0.1118, + "step": 15760 + }, + { + "epoch": 1.3624778608147219, + "grad_norm": 0.5283878918575788, + "learning_rate": 5.978653390460632e-06, + "loss": 0.1087, + "step": 15770 + }, + { + "epoch": 1.363341829020692, + "grad_norm": 0.5785508559189512, + "learning_rate": 5.974079291102538e-06, + "loss": 0.1108, + "step": 15780 + }, + { + "epoch": 1.3642057972266621, + "grad_norm": 0.55241795550289, + "learning_rate": 5.969504344224635e-06, + "loss": 0.1118, + "step": 15790 + }, + { + "epoch": 1.365069765432632, + "grad_norm": 0.5351889601182599, + "learning_rate": 5.964928553807455e-06, + "loss": 0.1095, + "step": 15800 + }, + { + "epoch": 1.3659337336386022, + "grad_norm": 0.5964305804829771, + "learning_rate": 5.960351923832274e-06, + "loss": 0.1085, + "step": 15810 + }, + { + "epoch": 1.366797701844572, + "grad_norm": 0.5659343518769593, + "learning_rate": 5.955774458281088e-06, + "loss": 0.1119, + "step": 15820 + }, + { + "epoch": 1.3676616700505422, + "grad_norm": 0.5387151801430248, + "learning_rate": 5.951196161136629e-06, + "loss": 0.1104, + "step": 15830 + }, + { + "epoch": 1.368525638256512, + "grad_norm": 1.5548648788419452, + "learning_rate": 5.946617036382344e-06, + "loss": 0.1102, + "step": 15840 + }, + { + "epoch": 1.3693896064624822, + "grad_norm": 0.5558801260199372, + "learning_rate": 5.942037088002408e-06, + "loss": 0.111, + "step": 15850 + }, + { + "epoch": 1.3702535746684523, + "grad_norm": 0.575072919526527, + "learning_rate": 5.937456319981706e-06, + "loss": 0.1137, + "step": 15860 + }, + { + "epoch": 1.3711175428744222, + "grad_norm": 0.5530395900840259, + "learning_rate": 5.932874736305843e-06, + "loss": 0.1101, + "step": 15870 + }, + { + "epoch": 1.3719815110803921, + "grad_norm": 0.5983896943740621, + "learning_rate": 5.9282923409611285e-06, + "loss": 0.1075, + "step": 15880 + }, + { + "epoch": 1.3728454792863622, + "grad_norm": 0.5768004665520459, + "learning_rate": 5.9237091379345765e-06, + "loss": 0.1107, + "step": 15890 + }, + { + "epoch": 1.3737094474923324, + "grad_norm": 0.569097515223848, + "learning_rate": 5.919125131213912e-06, + "loss": 0.1089, + "step": 15900 + }, + { + "epoch": 1.3745734156983023, + "grad_norm": 0.5312624276461287, + "learning_rate": 5.91454032478755e-06, + "loss": 0.11, + "step": 15910 + }, + { + "epoch": 1.3754373839042724, + "grad_norm": 0.5328951454517763, + "learning_rate": 5.909954722644608e-06, + "loss": 0.1088, + "step": 15920 + }, + { + "epoch": 1.3763013521102423, + "grad_norm": 0.5722786730159535, + "learning_rate": 5.905368328774893e-06, + "loss": 0.1071, + "step": 15930 + }, + { + "epoch": 1.3771653203162124, + "grad_norm": 0.582982357808919, + "learning_rate": 5.9007811471689e-06, + "loss": 0.1107, + "step": 15940 + }, + { + "epoch": 1.3780292885221823, + "grad_norm": 0.5409670522081398, + "learning_rate": 5.896193181817811e-06, + "loss": 0.1123, + "step": 15950 + }, + { + "epoch": 1.3788932567281524, + "grad_norm": 0.5525488935001114, + "learning_rate": 5.891604436713491e-06, + "loss": 0.1101, + "step": 15960 + }, + { + "epoch": 1.3797572249341226, + "grad_norm": 0.5840786377822249, + "learning_rate": 5.887014915848478e-06, + "loss": 0.1097, + "step": 15970 + }, + { + "epoch": 1.3806211931400925, + "grad_norm": 0.5384412553004606, + "learning_rate": 5.882424623215993e-06, + "loss": 0.1088, + "step": 15980 + }, + { + "epoch": 1.3814851613460624, + "grad_norm": 0.5833292422162293, + "learning_rate": 5.877833562809922e-06, + "loss": 0.1123, + "step": 15990 + }, + { + "epoch": 1.3823491295520325, + "grad_norm": 0.5517932567301533, + "learning_rate": 5.873241738624824e-06, + "loss": 0.1057, + "step": 16000 + }, + { + "epoch": 1.3832130977580026, + "grad_norm": 0.5756476697861506, + "learning_rate": 5.8686491546559154e-06, + "loss": 0.1086, + "step": 16010 + }, + { + "epoch": 1.3840770659639725, + "grad_norm": 0.5785581813009524, + "learning_rate": 5.8640558148990835e-06, + "loss": 0.1115, + "step": 16020 + }, + { + "epoch": 1.3849410341699424, + "grad_norm": 0.5943858429441137, + "learning_rate": 5.8594617233508645e-06, + "loss": 0.1092, + "step": 16030 + }, + { + "epoch": 1.3858050023759125, + "grad_norm": 0.5745575971627046, + "learning_rate": 5.854866884008452e-06, + "loss": 0.1102, + "step": 16040 + }, + { + "epoch": 1.3866689705818827, + "grad_norm": 0.5348702159362239, + "learning_rate": 5.850271300869691e-06, + "loss": 0.1096, + "step": 16050 + }, + { + "epoch": 1.3875329387878526, + "grad_norm": 0.5694557334714356, + "learning_rate": 5.845674977933074e-06, + "loss": 0.1085, + "step": 16060 + }, + { + "epoch": 1.3883969069938227, + "grad_norm": 0.5693489028372023, + "learning_rate": 5.841077919197734e-06, + "loss": 0.1086, + "step": 16070 + }, + { + "epoch": 1.3892608751997926, + "grad_norm": 0.5593707766156559, + "learning_rate": 5.8364801286634486e-06, + "loss": 0.1095, + "step": 16080 + }, + { + "epoch": 1.3901248434057627, + "grad_norm": 0.6124636333798967, + "learning_rate": 5.831881610330628e-06, + "loss": 0.1166, + "step": 16090 + }, + { + "epoch": 1.3909888116117326, + "grad_norm": 0.5846656666878277, + "learning_rate": 5.8272823682003196e-06, + "loss": 0.1111, + "step": 16100 + }, + { + "epoch": 1.3918527798177027, + "grad_norm": 0.5720609742683443, + "learning_rate": 5.822682406274197e-06, + "loss": 0.1097, + "step": 16110 + }, + { + "epoch": 1.3927167480236728, + "grad_norm": 0.5513132732611473, + "learning_rate": 5.818081728554562e-06, + "loss": 0.1106, + "step": 16120 + }, + { + "epoch": 1.3935807162296427, + "grad_norm": 0.5149649775459123, + "learning_rate": 5.81348033904434e-06, + "loss": 0.1114, + "step": 16130 + }, + { + "epoch": 1.3944446844356126, + "grad_norm": 0.5869210395463293, + "learning_rate": 5.808878241747071e-06, + "loss": 0.1049, + "step": 16140 + }, + { + "epoch": 1.3953086526415828, + "grad_norm": 0.5625464154152859, + "learning_rate": 5.804275440666918e-06, + "loss": 0.1097, + "step": 16150 + }, + { + "epoch": 1.396172620847553, + "grad_norm": 0.5813811467929083, + "learning_rate": 5.7996719398086485e-06, + "loss": 0.1119, + "step": 16160 + }, + { + "epoch": 1.3970365890535228, + "grad_norm": 0.5725158526835552, + "learning_rate": 5.795067743177648e-06, + "loss": 0.1099, + "step": 16170 + }, + { + "epoch": 1.397900557259493, + "grad_norm": 0.5720353741750168, + "learning_rate": 5.790462854779898e-06, + "loss": 0.1083, + "step": 16180 + }, + { + "epoch": 1.3987645254654628, + "grad_norm": 0.5757257456663112, + "learning_rate": 5.785857278621989e-06, + "loss": 0.1074, + "step": 16190 + }, + { + "epoch": 1.399628493671433, + "grad_norm": 0.5751688492722273, + "learning_rate": 5.7812510187111045e-06, + "loss": 0.111, + "step": 16200 + }, + { + "epoch": 1.4004924618774028, + "grad_norm": 0.5308149515056841, + "learning_rate": 5.776644079055029e-06, + "loss": 0.1086, + "step": 16210 + }, + { + "epoch": 1.401356430083373, + "grad_norm": 0.5457555860479729, + "learning_rate": 5.772036463662133e-06, + "loss": 0.1095, + "step": 16220 + }, + { + "epoch": 1.402220398289343, + "grad_norm": 0.5525027323997239, + "learning_rate": 5.7674281765413775e-06, + "loss": 0.1098, + "step": 16230 + }, + { + "epoch": 1.403084366495313, + "grad_norm": 0.5552174838299228, + "learning_rate": 5.7628192217023075e-06, + "loss": 0.109, + "step": 16240 + }, + { + "epoch": 1.4039483347012829, + "grad_norm": 0.6052659498732853, + "learning_rate": 5.758209603155047e-06, + "loss": 0.1072, + "step": 16250 + }, + { + "epoch": 1.404812302907253, + "grad_norm": 0.5614718168621989, + "learning_rate": 5.7535993249103e-06, + "loss": 0.1106, + "step": 16260 + }, + { + "epoch": 1.4056762711132231, + "grad_norm": 0.5536105288949624, + "learning_rate": 5.748988390979346e-06, + "loss": 0.11, + "step": 16270 + }, + { + "epoch": 1.406540239319193, + "grad_norm": 0.5611334021841203, + "learning_rate": 5.74437680537403e-06, + "loss": 0.1101, + "step": 16280 + }, + { + "epoch": 1.4074042075251632, + "grad_norm": 0.5573869486007782, + "learning_rate": 5.73976457210677e-06, + "loss": 0.1105, + "step": 16290 + }, + { + "epoch": 1.408268175731133, + "grad_norm": 0.5516200215230308, + "learning_rate": 5.73515169519054e-06, + "loss": 0.1082, + "step": 16300 + }, + { + "epoch": 1.4091321439371032, + "grad_norm": 0.5526404729155173, + "learning_rate": 5.730538178638881e-06, + "loss": 0.1101, + "step": 16310 + }, + { + "epoch": 1.409996112143073, + "grad_norm": 0.5719197743944662, + "learning_rate": 5.725924026465888e-06, + "loss": 0.1112, + "step": 16320 + }, + { + "epoch": 1.4108600803490432, + "grad_norm": 0.5694236895947357, + "learning_rate": 5.721309242686209e-06, + "loss": 0.1093, + "step": 16330 + }, + { + "epoch": 1.411724048555013, + "grad_norm": 0.6141799055072968, + "learning_rate": 5.716693831315041e-06, + "loss": 0.1126, + "step": 16340 + }, + { + "epoch": 1.4125880167609832, + "grad_norm": 0.5781600972939118, + "learning_rate": 5.7120777963681276e-06, + "loss": 0.1133, + "step": 16350 + }, + { + "epoch": 1.4134519849669531, + "grad_norm": 0.5489573576838745, + "learning_rate": 5.707461141861753e-06, + "loss": 0.1085, + "step": 16360 + }, + { + "epoch": 1.4143159531729232, + "grad_norm": 0.5511980848938117, + "learning_rate": 5.702843871812745e-06, + "loss": 0.1093, + "step": 16370 + }, + { + "epoch": 1.4151799213788934, + "grad_norm": 0.5694496019360336, + "learning_rate": 5.698225990238463e-06, + "loss": 0.1069, + "step": 16380 + }, + { + "epoch": 1.4160438895848633, + "grad_norm": 0.587017896333731, + "learning_rate": 5.6936075011568e-06, + "loss": 0.1109, + "step": 16390 + }, + { + "epoch": 1.4169078577908332, + "grad_norm": 0.568063496878268, + "learning_rate": 5.688988408586177e-06, + "loss": 0.1093, + "step": 16400 + }, + { + "epoch": 1.4177718259968033, + "grad_norm": 0.5679995822925583, + "learning_rate": 5.684368716545541e-06, + "loss": 0.106, + "step": 16410 + }, + { + "epoch": 1.4186357942027734, + "grad_norm": 0.5734866434303718, + "learning_rate": 5.6797484290543594e-06, + "loss": 0.1119, + "step": 16420 + }, + { + "epoch": 1.4194997624087433, + "grad_norm": 0.5431629848080799, + "learning_rate": 5.6751275501326185e-06, + "loss": 0.11, + "step": 16430 + }, + { + "epoch": 1.4203637306147134, + "grad_norm": 0.5720416624760498, + "learning_rate": 5.67050608380082e-06, + "loss": 0.1087, + "step": 16440 + }, + { + "epoch": 1.4212276988206833, + "grad_norm": 0.580302420152164, + "learning_rate": 5.665884034079974e-06, + "loss": 0.1087, + "step": 16450 + }, + { + "epoch": 1.4220916670266535, + "grad_norm": 0.5840176383870129, + "learning_rate": 5.6612614049916e-06, + "loss": 0.1068, + "step": 16460 + }, + { + "epoch": 1.4229556352326234, + "grad_norm": 0.5619479511353946, + "learning_rate": 5.656638200557723e-06, + "loss": 0.1064, + "step": 16470 + }, + { + "epoch": 1.4238196034385935, + "grad_norm": 0.563149143445443, + "learning_rate": 5.652014424800865e-06, + "loss": 0.112, + "step": 16480 + }, + { + "epoch": 1.4246835716445636, + "grad_norm": 0.554179718385933, + "learning_rate": 5.647390081744047e-06, + "loss": 0.1118, + "step": 16490 + }, + { + "epoch": 1.4255475398505335, + "grad_norm": 0.5790448425729009, + "learning_rate": 5.6427651754107856e-06, + "loss": 0.113, + "step": 16500 + }, + { + "epoch": 1.4264115080565034, + "grad_norm": 0.5413167122533629, + "learning_rate": 5.638139709825085e-06, + "loss": 0.1105, + "step": 16510 + }, + { + "epoch": 1.4272754762624735, + "grad_norm": 0.5403591817971234, + "learning_rate": 5.633513689011436e-06, + "loss": 0.1094, + "step": 16520 + }, + { + "epoch": 1.4281394444684437, + "grad_norm": 0.5719167784121851, + "learning_rate": 5.628887116994812e-06, + "loss": 0.1069, + "step": 16530 + }, + { + "epoch": 1.4290034126744136, + "grad_norm": 0.5571643381657813, + "learning_rate": 5.624259997800671e-06, + "loss": 0.1082, + "step": 16540 + }, + { + "epoch": 1.4298673808803837, + "grad_norm": 0.5539391800128624, + "learning_rate": 5.61963233545494e-06, + "loss": 0.1076, + "step": 16550 + }, + { + "epoch": 1.4307313490863536, + "grad_norm": 0.5737886682031937, + "learning_rate": 5.615004133984022e-06, + "loss": 0.1087, + "step": 16560 + }, + { + "epoch": 1.4315953172923237, + "grad_norm": 0.5466801931023841, + "learning_rate": 5.610375397414788e-06, + "loss": 0.1101, + "step": 16570 + }, + { + "epoch": 1.4324592854982936, + "grad_norm": 0.5797886336713219, + "learning_rate": 5.605746129774577e-06, + "loss": 0.1114, + "step": 16580 + }, + { + "epoch": 1.4333232537042637, + "grad_norm": 0.5529740818600277, + "learning_rate": 5.601116335091189e-06, + "loss": 0.1074, + "step": 16590 + }, + { + "epoch": 1.4341872219102336, + "grad_norm": 0.5715134987232192, + "learning_rate": 5.59648601739288e-06, + "loss": 0.1084, + "step": 16600 + }, + { + "epoch": 1.4350511901162037, + "grad_norm": 0.5565474264744146, + "learning_rate": 5.591855180708365e-06, + "loss": 0.1082, + "step": 16610 + }, + { + "epoch": 1.4359151583221736, + "grad_norm": 0.5771393921573549, + "learning_rate": 5.587223829066807e-06, + "loss": 0.1132, + "step": 16620 + }, + { + "epoch": 1.4367791265281438, + "grad_norm": 0.5354337704232355, + "learning_rate": 5.582591966497818e-06, + "loss": 0.1077, + "step": 16630 + }, + { + "epoch": 1.437643094734114, + "grad_norm": 0.5704092054091621, + "learning_rate": 5.577959597031459e-06, + "loss": 0.1084, + "step": 16640 + }, + { + "epoch": 1.4385070629400838, + "grad_norm": 0.547835523974159, + "learning_rate": 5.573326724698225e-06, + "loss": 0.1069, + "step": 16650 + }, + { + "epoch": 1.4393710311460537, + "grad_norm": 0.5550047278667364, + "learning_rate": 5.568693353529053e-06, + "loss": 0.1092, + "step": 16660 + }, + { + "epoch": 1.4402349993520238, + "grad_norm": 0.5598159535674883, + "learning_rate": 5.564059487555311e-06, + "loss": 0.11, + "step": 16670 + }, + { + "epoch": 1.441098967557994, + "grad_norm": 0.5500359408388827, + "learning_rate": 5.559425130808802e-06, + "loss": 0.1093, + "step": 16680 + }, + { + "epoch": 1.4419629357639638, + "grad_norm": 0.5748371580710966, + "learning_rate": 5.554790287321752e-06, + "loss": 0.1095, + "step": 16690 + }, + { + "epoch": 1.442826903969934, + "grad_norm": 0.5589845183126114, + "learning_rate": 5.550154961126812e-06, + "loss": 0.11, + "step": 16700 + }, + { + "epoch": 1.4436908721759039, + "grad_norm": 0.5596557149578248, + "learning_rate": 5.5455191562570535e-06, + "loss": 0.1084, + "step": 16710 + }, + { + "epoch": 1.444554840381874, + "grad_norm": 0.555957437652107, + "learning_rate": 5.540882876745962e-06, + "loss": 0.1099, + "step": 16720 + }, + { + "epoch": 1.4454188085878439, + "grad_norm": 0.5697811632279, + "learning_rate": 5.53624612662744e-06, + "loss": 0.11, + "step": 16730 + }, + { + "epoch": 1.446282776793814, + "grad_norm": 0.540863041698352, + "learning_rate": 5.531608909935795e-06, + "loss": 0.1078, + "step": 16740 + }, + { + "epoch": 1.4471467449997841, + "grad_norm": 0.5439712322944413, + "learning_rate": 5.526971230705744e-06, + "loss": 0.1092, + "step": 16750 + }, + { + "epoch": 1.448010713205754, + "grad_norm": 0.5539653073070816, + "learning_rate": 5.522333092972406e-06, + "loss": 0.1067, + "step": 16760 + }, + { + "epoch": 1.448874681411724, + "grad_norm": 0.5757732852795685, + "learning_rate": 5.517694500771298e-06, + "loss": 0.1077, + "step": 16770 + }, + { + "epoch": 1.449738649617694, + "grad_norm": 0.5523898348981371, + "learning_rate": 5.513055458138329e-06, + "loss": 0.1098, + "step": 16780 + }, + { + "epoch": 1.4506026178236642, + "grad_norm": 0.53039159499405, + "learning_rate": 5.508415969109808e-06, + "loss": 0.1091, + "step": 16790 + }, + { + "epoch": 1.451466586029634, + "grad_norm": 0.575717777450302, + "learning_rate": 5.503776037722425e-06, + "loss": 0.1115, + "step": 16800 + }, + { + "epoch": 1.4523305542356042, + "grad_norm": 0.5295410373537287, + "learning_rate": 5.4991356680132576e-06, + "loss": 0.1106, + "step": 16810 + }, + { + "epoch": 1.453194522441574, + "grad_norm": 0.5637895554867309, + "learning_rate": 5.494494864019767e-06, + "loss": 0.1106, + "step": 16820 + }, + { + "epoch": 1.4540584906475442, + "grad_norm": 0.5647698345641413, + "learning_rate": 5.489853629779789e-06, + "loss": 0.1113, + "step": 16830 + }, + { + "epoch": 1.4549224588535141, + "grad_norm": 0.5480780997759342, + "learning_rate": 5.485211969331536e-06, + "loss": 0.1104, + "step": 16840 + }, + { + "epoch": 1.4557864270594842, + "grad_norm": 0.5962694168716901, + "learning_rate": 5.480569886713586e-06, + "loss": 0.1083, + "step": 16850 + }, + { + "epoch": 1.4566503952654544, + "grad_norm": 0.5468143792266299, + "learning_rate": 5.475927385964892e-06, + "loss": 0.105, + "step": 16860 + }, + { + "epoch": 1.4575143634714243, + "grad_norm": 0.5627954667617832, + "learning_rate": 5.4712844711247685e-06, + "loss": 0.1097, + "step": 16870 + }, + { + "epoch": 1.4583783316773942, + "grad_norm": 0.5498247339749494, + "learning_rate": 5.4666411462328835e-06, + "loss": 0.1073, + "step": 16880 + }, + { + "epoch": 1.4592422998833643, + "grad_norm": 0.5593645586338002, + "learning_rate": 5.461997415329274e-06, + "loss": 0.1099, + "step": 16890 + }, + { + "epoch": 1.4601062680893344, + "grad_norm": 0.5707187551887513, + "learning_rate": 5.457353282454318e-06, + "loss": 0.1121, + "step": 16900 + }, + { + "epoch": 1.4609702362953043, + "grad_norm": 0.553904006667131, + "learning_rate": 5.452708751648753e-06, + "loss": 0.1072, + "step": 16910 + }, + { + "epoch": 1.4618342045012742, + "grad_norm": 0.529572172260642, + "learning_rate": 5.448063826953654e-06, + "loss": 0.1091, + "step": 16920 + }, + { + "epoch": 1.4626981727072443, + "grad_norm": 0.5472381570541359, + "learning_rate": 5.443418512410449e-06, + "loss": 0.1072, + "step": 16930 + }, + { + "epoch": 1.4635621409132145, + "grad_norm": 0.5704255859503703, + "learning_rate": 5.438772812060895e-06, + "loss": 0.1099, + "step": 16940 + }, + { + "epoch": 1.4644261091191844, + "grad_norm": 0.5661130823236894, + "learning_rate": 5.434126729947091e-06, + "loss": 0.1067, + "step": 16950 + }, + { + "epoch": 1.4652900773251545, + "grad_norm": 0.5824585865876896, + "learning_rate": 5.4294802701114645e-06, + "loss": 0.1075, + "step": 16960 + }, + { + "epoch": 1.4661540455311244, + "grad_norm": 0.5712779312011059, + "learning_rate": 5.424833436596774e-06, + "loss": 0.1106, + "step": 16970 + }, + { + "epoch": 1.4670180137370945, + "grad_norm": 0.556641188854497, + "learning_rate": 5.420186233446104e-06, + "loss": 0.1087, + "step": 16980 + }, + { + "epoch": 1.4678819819430644, + "grad_norm": 0.5747662022205271, + "learning_rate": 5.415538664702858e-06, + "loss": 0.1073, + "step": 16990 + }, + { + "epoch": 1.4687459501490345, + "grad_norm": 0.5379635353303245, + "learning_rate": 5.410890734410761e-06, + "loss": 0.1053, + "step": 17000 + }, + { + "epoch": 1.4696099183550047, + "grad_norm": 0.6016166805731681, + "learning_rate": 5.406242446613845e-06, + "loss": 0.1103, + "step": 17010 + }, + { + "epoch": 1.4704738865609746, + "grad_norm": 0.5651647377002924, + "learning_rate": 5.401593805356464e-06, + "loss": 0.1066, + "step": 17020 + }, + { + "epoch": 1.4713378547669445, + "grad_norm": 0.53397079179655, + "learning_rate": 5.39694481468327e-06, + "loss": 0.1108, + "step": 17030 + }, + { + "epoch": 1.4722018229729146, + "grad_norm": 0.6010434080095977, + "learning_rate": 5.392295478639226e-06, + "loss": 0.1098, + "step": 17040 + }, + { + "epoch": 1.4730657911788847, + "grad_norm": 0.5546090791053712, + "learning_rate": 5.38764580126959e-06, + "loss": 0.1114, + "step": 17050 + }, + { + "epoch": 1.4739297593848546, + "grad_norm": 0.5273120526879177, + "learning_rate": 5.382995786619921e-06, + "loss": 0.1088, + "step": 17060 + }, + { + "epoch": 1.4747937275908247, + "grad_norm": 0.5567574055473559, + "learning_rate": 5.378345438736068e-06, + "loss": 0.1085, + "step": 17070 + }, + { + "epoch": 1.4756576957967946, + "grad_norm": 0.5714232269488697, + "learning_rate": 5.373694761664174e-06, + "loss": 0.1074, + "step": 17080 + }, + { + "epoch": 1.4765216640027647, + "grad_norm": 0.5534188993303146, + "learning_rate": 5.369043759450664e-06, + "loss": 0.1112, + "step": 17090 + }, + { + "epoch": 1.4773856322087346, + "grad_norm": 0.5533405667659507, + "learning_rate": 5.364392436142251e-06, + "loss": 0.1084, + "step": 17100 + }, + { + "epoch": 1.4782496004147048, + "grad_norm": 0.5715199286886745, + "learning_rate": 5.359740795785921e-06, + "loss": 0.1088, + "step": 17110 + }, + { + "epoch": 1.479113568620675, + "grad_norm": 0.57254667157946, + "learning_rate": 5.3550888424289415e-06, + "loss": 0.1056, + "step": 17120 + }, + { + "epoch": 1.4799775368266448, + "grad_norm": 0.5199024237592625, + "learning_rate": 5.3504365801188495e-06, + "loss": 0.1052, + "step": 17130 + }, + { + "epoch": 1.4808415050326147, + "grad_norm": 0.5813582218704558, + "learning_rate": 5.3457840129034535e-06, + "loss": 0.1047, + "step": 17140 + }, + { + "epoch": 1.4817054732385848, + "grad_norm": 0.5390351359241751, + "learning_rate": 5.341131144830822e-06, + "loss": 0.1068, + "step": 17150 + }, + { + "epoch": 1.482569441444555, + "grad_norm": 0.5829892827363654, + "learning_rate": 5.336477979949291e-06, + "loss": 0.1063, + "step": 17160 + }, + { + "epoch": 1.4834334096505248, + "grad_norm": 0.5613668918351362, + "learning_rate": 5.33182452230745e-06, + "loss": 0.1093, + "step": 17170 + }, + { + "epoch": 1.484297377856495, + "grad_norm": 0.5668262800654991, + "learning_rate": 5.327170775954149e-06, + "loss": 0.1067, + "step": 17180 + }, + { + "epoch": 1.4851613460624649, + "grad_norm": 0.563123194809761, + "learning_rate": 5.322516744938482e-06, + "loss": 0.1068, + "step": 17190 + }, + { + "epoch": 1.486025314268435, + "grad_norm": 0.5702164031662597, + "learning_rate": 5.317862433309797e-06, + "loss": 0.1123, + "step": 17200 + }, + { + "epoch": 1.4868892824744049, + "grad_norm": 0.5573428134086994, + "learning_rate": 5.3132078451176815e-06, + "loss": 0.1092, + "step": 17210 + }, + { + "epoch": 1.487753250680375, + "grad_norm": 0.5423066018915117, + "learning_rate": 5.308552984411968e-06, + "loss": 0.1065, + "step": 17220 + }, + { + "epoch": 1.488617218886345, + "grad_norm": 0.5895533833702804, + "learning_rate": 5.30389785524272e-06, + "loss": 0.1121, + "step": 17230 + }, + { + "epoch": 1.489481187092315, + "grad_norm": 0.5609587275234417, + "learning_rate": 5.299242461660243e-06, + "loss": 0.1065, + "step": 17240 + }, + { + "epoch": 1.490345155298285, + "grad_norm": 0.575671022001389, + "learning_rate": 5.294586807715066e-06, + "loss": 0.1087, + "step": 17250 + }, + { + "epoch": 1.491209123504255, + "grad_norm": 0.5345702719825773, + "learning_rate": 5.289930897457946e-06, + "loss": 0.1077, + "step": 17260 + }, + { + "epoch": 1.4920730917102252, + "grad_norm": 0.5371496978514604, + "learning_rate": 5.285274734939864e-06, + "loss": 0.1103, + "step": 17270 + }, + { + "epoch": 1.492937059916195, + "grad_norm": 0.5593334573221135, + "learning_rate": 5.280618324212018e-06, + "loss": 0.1074, + "step": 17280 + }, + { + "epoch": 1.493801028122165, + "grad_norm": 0.5295295244997914, + "learning_rate": 5.275961669325828e-06, + "loss": 0.1063, + "step": 17290 + }, + { + "epoch": 1.494664996328135, + "grad_norm": 0.5383691997834555, + "learning_rate": 5.271304774332917e-06, + "loss": 0.1089, + "step": 17300 + }, + { + "epoch": 1.4955289645341052, + "grad_norm": 0.5378075832729008, + "learning_rate": 5.266647643285126e-06, + "loss": 0.1071, + "step": 17310 + }, + { + "epoch": 1.4963929327400751, + "grad_norm": 0.5864711882360478, + "learning_rate": 5.261990280234498e-06, + "loss": 0.1082, + "step": 17320 + }, + { + "epoch": 1.4972569009460452, + "grad_norm": 0.5833725727517236, + "learning_rate": 5.257332689233275e-06, + "loss": 0.109, + "step": 17330 + }, + { + "epoch": 1.4981208691520151, + "grad_norm": 0.617406542429736, + "learning_rate": 5.252674874333902e-06, + "loss": 0.1103, + "step": 17340 + }, + { + "epoch": 1.4989848373579853, + "grad_norm": 0.5393750705119263, + "learning_rate": 5.248016839589015e-06, + "loss": 0.1065, + "step": 17350 + }, + { + "epoch": 1.4998488055639552, + "grad_norm": 0.5525154054728131, + "learning_rate": 5.243358589051445e-06, + "loss": 0.1086, + "step": 17360 + }, + { + "epoch": 1.5007127737699253, + "grad_norm": 0.5663281890236571, + "learning_rate": 5.238700126774208e-06, + "loss": 0.1106, + "step": 17370 + }, + { + "epoch": 1.5015767419758954, + "grad_norm": 0.5639045716521068, + "learning_rate": 5.234041456810501e-06, + "loss": 0.1099, + "step": 17380 + }, + { + "epoch": 1.5024407101818653, + "grad_norm": 0.5501571786531163, + "learning_rate": 5.229382583213713e-06, + "loss": 0.1084, + "step": 17390 + }, + { + "epoch": 1.5033046783878352, + "grad_norm": 0.5555533506524184, + "learning_rate": 5.224723510037395e-06, + "loss": 0.1096, + "step": 17400 + }, + { + "epoch": 1.5041686465938053, + "grad_norm": 0.5565015152184756, + "learning_rate": 5.220064241335286e-06, + "loss": 0.1115, + "step": 17410 + }, + { + "epoch": 1.5050326147997755, + "grad_norm": 0.5594405883971818, + "learning_rate": 5.215404781161283e-06, + "loss": 0.1086, + "step": 17420 + }, + { + "epoch": 1.5058965830057454, + "grad_norm": 0.560011821779313, + "learning_rate": 5.2107451335694595e-06, + "loss": 0.1075, + "step": 17430 + }, + { + "epoch": 1.5067605512117153, + "grad_norm": 0.543119102461241, + "learning_rate": 5.206085302614045e-06, + "loss": 0.1071, + "step": 17440 + }, + { + "epoch": 1.5076245194176854, + "grad_norm": 0.5357688437385529, + "learning_rate": 5.201425292349434e-06, + "loss": 0.1114, + "step": 17450 + }, + { + "epoch": 1.5084884876236555, + "grad_norm": 0.5504332290569045, + "learning_rate": 5.196765106830171e-06, + "loss": 0.1096, + "step": 17460 + }, + { + "epoch": 1.5093524558296254, + "grad_norm": 0.567852538652432, + "learning_rate": 5.192104750110959e-06, + "loss": 0.1074, + "step": 17470 + }, + { + "epoch": 1.5102164240355955, + "grad_norm": 0.5322729979849891, + "learning_rate": 5.187444226246645e-06, + "loss": 0.1099, + "step": 17480 + }, + { + "epoch": 1.5110803922415657, + "grad_norm": 0.5756617595017037, + "learning_rate": 5.182783539292226e-06, + "loss": 0.1086, + "step": 17490 + }, + { + "epoch": 1.5119443604475356, + "grad_norm": 0.5678975606126713, + "learning_rate": 5.178122693302835e-06, + "loss": 0.1057, + "step": 17500 + }, + { + "epoch": 1.5128083286535055, + "grad_norm": 0.5461058418335905, + "learning_rate": 5.173461692333751e-06, + "loss": 0.1068, + "step": 17510 + }, + { + "epoch": 1.5136722968594756, + "grad_norm": 0.571376612212014, + "learning_rate": 5.168800540440381e-06, + "loss": 0.1075, + "step": 17520 + }, + { + "epoch": 1.5145362650654457, + "grad_norm": 0.5497908782485466, + "learning_rate": 5.164139241678267e-06, + "loss": 0.1072, + "step": 17530 + }, + { + "epoch": 1.5154002332714156, + "grad_norm": 0.5914438759722356, + "learning_rate": 5.159477800103079e-06, + "loss": 0.1085, + "step": 17540 + }, + { + "epoch": 1.5162642014773855, + "grad_norm": 0.5316510326275006, + "learning_rate": 5.1548162197706075e-06, + "loss": 0.1075, + "step": 17550 + }, + { + "epoch": 1.5171281696833556, + "grad_norm": 0.5621679449680816, + "learning_rate": 5.150154504736768e-06, + "loss": 0.1071, + "step": 17560 + }, + { + "epoch": 1.5179921378893257, + "grad_norm": 0.5372659772513624, + "learning_rate": 5.145492659057593e-06, + "loss": 0.1061, + "step": 17570 + }, + { + "epoch": 1.5188561060952956, + "grad_norm": 0.5710203008791778, + "learning_rate": 5.140830686789224e-06, + "loss": 0.1069, + "step": 17580 + }, + { + "epoch": 1.5197200743012658, + "grad_norm": 0.5481229869659677, + "learning_rate": 5.1361685919879175e-06, + "loss": 0.1087, + "step": 17590 + }, + { + "epoch": 1.520584042507236, + "grad_norm": 0.5589983135098491, + "learning_rate": 5.131506378710035e-06, + "loss": 0.1109, + "step": 17600 + }, + { + "epoch": 1.5214480107132058, + "grad_norm": 0.5635199037596682, + "learning_rate": 5.126844051012041e-06, + "loss": 0.1103, + "step": 17610 + }, + { + "epoch": 1.5223119789191757, + "grad_norm": 0.5500151319574998, + "learning_rate": 5.122181612950499e-06, + "loss": 0.108, + "step": 17620 + }, + { + "epoch": 1.5231759471251458, + "grad_norm": 0.5371844113385824, + "learning_rate": 5.11751906858207e-06, + "loss": 0.1072, + "step": 17630 + }, + { + "epoch": 1.524039915331116, + "grad_norm": 0.5485761580061022, + "learning_rate": 5.112856421963507e-06, + "loss": 0.1072, + "step": 17640 + }, + { + "epoch": 1.5249038835370858, + "grad_norm": 0.5700920471032753, + "learning_rate": 5.108193677151648e-06, + "loss": 0.1085, + "step": 17650 + }, + { + "epoch": 1.5257678517430557, + "grad_norm": 0.5514566079923485, + "learning_rate": 5.103530838203427e-06, + "loss": 0.1071, + "step": 17660 + }, + { + "epoch": 1.5266318199490259, + "grad_norm": 0.5712587946627912, + "learning_rate": 5.0988679091758465e-06, + "loss": 0.1057, + "step": 17670 + }, + { + "epoch": 1.527495788154996, + "grad_norm": 0.523278063242322, + "learning_rate": 5.094204894125997e-06, + "loss": 0.1055, + "step": 17680 + }, + { + "epoch": 1.5283597563609659, + "grad_norm": 0.547975825208195, + "learning_rate": 5.0895417971110384e-06, + "loss": 0.1069, + "step": 17690 + }, + { + "epoch": 1.5292237245669358, + "grad_norm": 0.5649900655239077, + "learning_rate": 5.0848786221882065e-06, + "loss": 0.1071, + "step": 17700 + }, + { + "epoch": 1.530087692772906, + "grad_norm": 0.5235846263354882, + "learning_rate": 5.0802153734148e-06, + "loss": 0.1086, + "step": 17710 + }, + { + "epoch": 1.530951660978876, + "grad_norm": 0.558486058207212, + "learning_rate": 5.075552054848188e-06, + "loss": 0.108, + "step": 17720 + }, + { + "epoch": 1.531815629184846, + "grad_norm": 0.5591344019040175, + "learning_rate": 5.070888670545794e-06, + "loss": 0.1078, + "step": 17730 + }, + { + "epoch": 1.532679597390816, + "grad_norm": 0.5860722333653754, + "learning_rate": 5.066225224565102e-06, + "loss": 0.1085, + "step": 17740 + }, + { + "epoch": 1.5335435655967862, + "grad_norm": 0.5963012918305679, + "learning_rate": 5.061561720963649e-06, + "loss": 0.1098, + "step": 17750 + }, + { + "epoch": 1.534407533802756, + "grad_norm": 0.5604735839188759, + "learning_rate": 5.056898163799023e-06, + "loss": 0.1081, + "step": 17760 + }, + { + "epoch": 1.535271502008726, + "grad_norm": 0.5521582524965631, + "learning_rate": 5.052234557128859e-06, + "loss": 0.1056, + "step": 17770 + }, + { + "epoch": 1.536135470214696, + "grad_norm": 0.5863548246000648, + "learning_rate": 5.04757090501083e-06, + "loss": 0.1073, + "step": 17780 + }, + { + "epoch": 1.5369994384206662, + "grad_norm": 0.5505531729226412, + "learning_rate": 5.042907211502654e-06, + "loss": 0.1059, + "step": 17790 + }, + { + "epoch": 1.5378634066266361, + "grad_norm": 0.5527561782797406, + "learning_rate": 5.038243480662086e-06, + "loss": 0.107, + "step": 17800 + }, + { + "epoch": 1.538727374832606, + "grad_norm": 0.5403035943659682, + "learning_rate": 5.033579716546908e-06, + "loss": 0.1066, + "step": 17810 + }, + { + "epoch": 1.5395913430385761, + "grad_norm": 0.6183154805975839, + "learning_rate": 5.028915923214935e-06, + "loss": 0.1063, + "step": 17820 + }, + { + "epoch": 1.5404553112445463, + "grad_norm": 0.5501718905367079, + "learning_rate": 5.0242521047240076e-06, + "loss": 0.1036, + "step": 17830 + }, + { + "epoch": 1.5413192794505162, + "grad_norm": 0.5332692223400205, + "learning_rate": 5.019588265131984e-06, + "loss": 0.1061, + "step": 17840 + }, + { + "epoch": 1.5421832476564863, + "grad_norm": 0.5613621146761566, + "learning_rate": 5.014924408496746e-06, + "loss": 0.1068, + "step": 17850 + }, + { + "epoch": 1.5430472158624564, + "grad_norm": 0.5581157336544168, + "learning_rate": 5.010260538876187e-06, + "loss": 0.1075, + "step": 17860 + }, + { + "epoch": 1.5439111840684263, + "grad_norm": 0.559150234766518, + "learning_rate": 5.0055966603282136e-06, + "loss": 0.1046, + "step": 17870 + }, + { + "epoch": 1.5447751522743962, + "grad_norm": 0.5523464636226372, + "learning_rate": 5.000932776910739e-06, + "loss": 0.1081, + "step": 17880 + }, + { + "epoch": 1.5456391204803663, + "grad_norm": 0.5874851979850498, + "learning_rate": 4.996268892681681e-06, + "loss": 0.1059, + "step": 17890 + }, + { + "epoch": 1.5465030886863365, + "grad_norm": 0.5599016847752923, + "learning_rate": 4.991605011698957e-06, + "loss": 0.105, + "step": 17900 + }, + { + "epoch": 1.5473670568923064, + "grad_norm": 0.5536885812459234, + "learning_rate": 4.9869411380204825e-06, + "loss": 0.1114, + "step": 17910 + }, + { + "epoch": 1.5482310250982763, + "grad_norm": 0.5850370040739463, + "learning_rate": 4.982277275704169e-06, + "loss": 0.108, + "step": 17920 + }, + { + "epoch": 1.5490949933042464, + "grad_norm": 0.5476002826516605, + "learning_rate": 4.977613428807913e-06, + "loss": 0.1114, + "step": 17930 + }, + { + "epoch": 1.5499589615102165, + "grad_norm": 0.5458827789684254, + "learning_rate": 4.972949601389601e-06, + "loss": 0.1055, + "step": 17940 + }, + { + "epoch": 1.5508229297161864, + "grad_norm": 0.5413640872702045, + "learning_rate": 4.968285797507104e-06, + "loss": 0.1113, + "step": 17950 + }, + { + "epoch": 1.5516868979221563, + "grad_norm": 0.5752827421319834, + "learning_rate": 4.963622021218271e-06, + "loss": 0.1087, + "step": 17960 + }, + { + "epoch": 1.5525508661281266, + "grad_norm": 0.5860674587550517, + "learning_rate": 4.958958276580922e-06, + "loss": 0.1097, + "step": 17970 + }, + { + "epoch": 1.5534148343340966, + "grad_norm": 0.5444099654254021, + "learning_rate": 4.954294567652858e-06, + "loss": 0.1082, + "step": 17980 + }, + { + "epoch": 1.5542788025400665, + "grad_norm": 0.5578751187043065, + "learning_rate": 4.949630898491842e-06, + "loss": 0.1085, + "step": 17990 + }, + { + "epoch": 1.5551427707460366, + "grad_norm": 0.5607469002061576, + "learning_rate": 4.9449672731556095e-06, + "loss": 0.1103, + "step": 18000 + }, + { + "epoch": 1.5560067389520067, + "grad_norm": 0.597520663614332, + "learning_rate": 4.940303695701848e-06, + "loss": 0.1085, + "step": 18010 + }, + { + "epoch": 1.5568707071579766, + "grad_norm": 0.5635599902531142, + "learning_rate": 4.935640170188212e-06, + "loss": 0.1044, + "step": 18020 + }, + { + "epoch": 1.5577346753639465, + "grad_norm": 0.5477678866631618, + "learning_rate": 4.930976700672308e-06, + "loss": 0.1067, + "step": 18030 + }, + { + "epoch": 1.5585986435699166, + "grad_norm": 0.5633674903097964, + "learning_rate": 4.926313291211695e-06, + "loss": 0.1074, + "step": 18040 + }, + { + "epoch": 1.5594626117758867, + "grad_norm": 0.5457039922820861, + "learning_rate": 4.921649945863875e-06, + "loss": 0.1072, + "step": 18050 + }, + { + "epoch": 1.5603265799818566, + "grad_norm": 0.5529102573041352, + "learning_rate": 4.916986668686299e-06, + "loss": 0.1028, + "step": 18060 + }, + { + "epoch": 1.5611905481878265, + "grad_norm": 0.5526616247690039, + "learning_rate": 4.912323463736358e-06, + "loss": 0.1067, + "step": 18070 + }, + { + "epoch": 1.5620545163937967, + "grad_norm": 0.5906101164405569, + "learning_rate": 4.907660335071379e-06, + "loss": 0.107, + "step": 18080 + }, + { + "epoch": 1.5629184845997668, + "grad_norm": 0.5802445703142322, + "learning_rate": 4.902997286748623e-06, + "loss": 0.1073, + "step": 18090 + }, + { + "epoch": 1.5637824528057367, + "grad_norm": 0.5497703406075652, + "learning_rate": 4.898334322825279e-06, + "loss": 0.1045, + "step": 18100 + }, + { + "epoch": 1.5646464210117068, + "grad_norm": 0.5638475765108427, + "learning_rate": 4.893671447358469e-06, + "loss": 0.1069, + "step": 18110 + }, + { + "epoch": 1.565510389217677, + "grad_norm": 0.5474191633068053, + "learning_rate": 4.889008664405229e-06, + "loss": 0.1067, + "step": 18120 + }, + { + "epoch": 1.5663743574236468, + "grad_norm": 0.5726552269200086, + "learning_rate": 4.8843459780225214e-06, + "loss": 0.105, + "step": 18130 + }, + { + "epoch": 1.5672383256296167, + "grad_norm": 0.57494985885623, + "learning_rate": 4.879683392267223e-06, + "loss": 0.108, + "step": 18140 + }, + { + "epoch": 1.5681022938355869, + "grad_norm": 0.5758913701642095, + "learning_rate": 4.875020911196123e-06, + "loss": 0.1071, + "step": 18150 + }, + { + "epoch": 1.568966262041557, + "grad_norm": 0.5750957605006866, + "learning_rate": 4.870358538865916e-06, + "loss": 0.1085, + "step": 18160 + }, + { + "epoch": 1.5698302302475269, + "grad_norm": 0.5600079931570073, + "learning_rate": 4.865696279333207e-06, + "loss": 0.107, + "step": 18170 + }, + { + "epoch": 1.5706941984534968, + "grad_norm": 0.5617150407874216, + "learning_rate": 4.861034136654501e-06, + "loss": 0.1086, + "step": 18180 + }, + { + "epoch": 1.571558166659467, + "grad_norm": 0.5759016246754355, + "learning_rate": 4.8563721148862015e-06, + "loss": 0.1034, + "step": 18190 + }, + { + "epoch": 1.572422134865437, + "grad_norm": 0.5381126057860631, + "learning_rate": 4.851710218084605e-06, + "loss": 0.1057, + "step": 18200 + }, + { + "epoch": 1.573286103071407, + "grad_norm": 0.577124912521626, + "learning_rate": 4.8470484503059e-06, + "loss": 0.1053, + "step": 18210 + }, + { + "epoch": 1.5741500712773768, + "grad_norm": 0.5509976895411456, + "learning_rate": 4.8423868156061644e-06, + "loss": 0.1045, + "step": 18220 + }, + { + "epoch": 1.5750140394833472, + "grad_norm": 0.5820369404762893, + "learning_rate": 4.837725318041362e-06, + "loss": 0.1089, + "step": 18230 + }, + { + "epoch": 1.575878007689317, + "grad_norm": 0.55949728961922, + "learning_rate": 4.833063961667331e-06, + "loss": 0.1073, + "step": 18240 + }, + { + "epoch": 1.576741975895287, + "grad_norm": 0.5710029043616441, + "learning_rate": 4.828402750539791e-06, + "loss": 0.1062, + "step": 18250 + }, + { + "epoch": 1.577605944101257, + "grad_norm": 0.5382770988006425, + "learning_rate": 4.823741688714335e-06, + "loss": 0.1075, + "step": 18260 + }, + { + "epoch": 1.5784699123072272, + "grad_norm": 0.5658950513457499, + "learning_rate": 4.8190807802464285e-06, + "loss": 0.1083, + "step": 18270 + }, + { + "epoch": 1.5793338805131971, + "grad_norm": 0.5364382299197196, + "learning_rate": 4.814420029191395e-06, + "loss": 0.106, + "step": 18280 + }, + { + "epoch": 1.580197848719167, + "grad_norm": 0.5524618471037588, + "learning_rate": 4.8097594396044326e-06, + "loss": 0.106, + "step": 18290 + }, + { + "epoch": 1.5810618169251371, + "grad_norm": 0.5628924598931844, + "learning_rate": 4.8050990155405894e-06, + "loss": 0.1069, + "step": 18300 + }, + { + "epoch": 1.5819257851311073, + "grad_norm": 0.5420207388695794, + "learning_rate": 4.800438761054775e-06, + "loss": 0.1069, + "step": 18310 + }, + { + "epoch": 1.5827897533370772, + "grad_norm": 0.5910637255202297, + "learning_rate": 4.7957786802017485e-06, + "loss": 0.105, + "step": 18320 + }, + { + "epoch": 1.583653721543047, + "grad_norm": 0.5691128173632788, + "learning_rate": 4.791118777036119e-06, + "loss": 0.1067, + "step": 18330 + }, + { + "epoch": 1.5845176897490172, + "grad_norm": 0.5519021523237372, + "learning_rate": 4.786459055612341e-06, + "loss": 0.1077, + "step": 18340 + }, + { + "epoch": 1.5853816579549873, + "grad_norm": 0.5528173834315456, + "learning_rate": 4.781799519984715e-06, + "loss": 0.1051, + "step": 18350 + }, + { + "epoch": 1.5862456261609572, + "grad_norm": 0.5221856325986058, + "learning_rate": 4.7771401742073696e-06, + "loss": 0.1053, + "step": 18360 + }, + { + "epoch": 1.5871095943669273, + "grad_norm": 0.5619510960078813, + "learning_rate": 4.772481022334278e-06, + "loss": 0.1062, + "step": 18370 + }, + { + "epoch": 1.5879735625728975, + "grad_norm": 0.5186024822105305, + "learning_rate": 4.76782206841924e-06, + "loss": 0.1062, + "step": 18380 + }, + { + "epoch": 1.5888375307788674, + "grad_norm": 0.5525545768358348, + "learning_rate": 4.763163316515886e-06, + "loss": 0.1069, + "step": 18390 + }, + { + "epoch": 1.5897014989848373, + "grad_norm": 0.5551666634710379, + "learning_rate": 4.7585047706776674e-06, + "loss": 0.1064, + "step": 18400 + }, + { + "epoch": 1.5905654671908074, + "grad_norm": 0.5738819938747015, + "learning_rate": 4.753846434957856e-06, + "loss": 0.1084, + "step": 18410 + }, + { + "epoch": 1.5914294353967775, + "grad_norm": 0.5471225790659157, + "learning_rate": 4.749188313409548e-06, + "loss": 0.104, + "step": 18420 + }, + { + "epoch": 1.5922934036027474, + "grad_norm": 0.5619829238273804, + "learning_rate": 4.744530410085641e-06, + "loss": 0.1055, + "step": 18430 + }, + { + "epoch": 1.5931573718087173, + "grad_norm": 0.5637950090825805, + "learning_rate": 4.739872729038853e-06, + "loss": 0.1086, + "step": 18440 + }, + { + "epoch": 1.5940213400146874, + "grad_norm": 0.5903713369226199, + "learning_rate": 4.735215274321706e-06, + "loss": 0.1063, + "step": 18450 + }, + { + "epoch": 1.5948853082206575, + "grad_norm": 0.5685279528763301, + "learning_rate": 4.7305580499865256e-06, + "loss": 0.1077, + "step": 18460 + }, + { + "epoch": 1.5957492764266274, + "grad_norm": 0.5384106682889425, + "learning_rate": 4.725901060085431e-06, + "loss": 0.1061, + "step": 18470 + }, + { + "epoch": 1.5966132446325976, + "grad_norm": 0.5777818816201741, + "learning_rate": 4.721244308670344e-06, + "loss": 0.1084, + "step": 18480 + }, + { + "epoch": 1.5974772128385677, + "grad_norm": 0.5523159068046266, + "learning_rate": 4.71658779979298e-06, + "loss": 0.1057, + "step": 18490 + }, + { + "epoch": 1.5983411810445376, + "grad_norm": 0.5948813436945792, + "learning_rate": 4.711931537504836e-06, + "loss": 0.1085, + "step": 18500 + }, + { + "epoch": 1.5992051492505075, + "grad_norm": 0.5407671177901976, + "learning_rate": 4.7072755258572014e-06, + "loss": 0.1044, + "step": 18510 + }, + { + "epoch": 1.6000691174564776, + "grad_norm": 0.5731745458031607, + "learning_rate": 4.7026197689011425e-06, + "loss": 0.1058, + "step": 18520 + }, + { + "epoch": 1.6009330856624477, + "grad_norm": 0.5522669562395803, + "learning_rate": 4.697964270687507e-06, + "loss": 0.1051, + "step": 18530 + }, + { + "epoch": 1.6017970538684176, + "grad_norm": 0.5441669651971076, + "learning_rate": 4.693309035266921e-06, + "loss": 0.1058, + "step": 18540 + }, + { + "epoch": 1.6026610220743875, + "grad_norm": 0.5659568723921515, + "learning_rate": 4.6886540666897725e-06, + "loss": 0.1097, + "step": 18550 + }, + { + "epoch": 1.6035249902803577, + "grad_norm": 0.568976782651086, + "learning_rate": 4.683999369006223e-06, + "loss": 0.1057, + "step": 18560 + }, + { + "epoch": 1.6043889584863278, + "grad_norm": 0.5368332076086545, + "learning_rate": 4.679344946266202e-06, + "loss": 0.1071, + "step": 18570 + }, + { + "epoch": 1.6052529266922977, + "grad_norm": 0.5472796671016149, + "learning_rate": 4.674690802519394e-06, + "loss": 0.1059, + "step": 18580 + }, + { + "epoch": 1.6061168948982676, + "grad_norm": 0.5871536707812725, + "learning_rate": 4.67003694181524e-06, + "loss": 0.1065, + "step": 18590 + }, + { + "epoch": 1.6069808631042377, + "grad_norm": 0.5637887574564212, + "learning_rate": 4.665383368202939e-06, + "loss": 0.1044, + "step": 18600 + }, + { + "epoch": 1.6078448313102078, + "grad_norm": 0.5616396771556457, + "learning_rate": 4.660730085731438e-06, + "loss": 0.1054, + "step": 18610 + }, + { + "epoch": 1.6087087995161777, + "grad_norm": 0.5556472856134798, + "learning_rate": 4.656077098449434e-06, + "loss": 0.1032, + "step": 18620 + }, + { + "epoch": 1.6095727677221479, + "grad_norm": 0.5683986382033807, + "learning_rate": 4.65142441040536e-06, + "loss": 0.1072, + "step": 18630 + }, + { + "epoch": 1.610436735928118, + "grad_norm": 0.583395037447458, + "learning_rate": 4.646772025647394e-06, + "loss": 0.1051, + "step": 18640 + }, + { + "epoch": 1.6113007041340879, + "grad_norm": 0.5816202934421579, + "learning_rate": 4.642119948223449e-06, + "loss": 0.1072, + "step": 18650 + }, + { + "epoch": 1.6121646723400578, + "grad_norm": 0.5657989657195068, + "learning_rate": 4.6374681821811745e-06, + "loss": 0.1073, + "step": 18660 + }, + { + "epoch": 1.613028640546028, + "grad_norm": 0.5791187780948374, + "learning_rate": 4.6328167315679396e-06, + "loss": 0.1034, + "step": 18670 + }, + { + "epoch": 1.613892608751998, + "grad_norm": 0.5715678977702422, + "learning_rate": 4.6281656004308464e-06, + "loss": 0.1017, + "step": 18680 + }, + { + "epoch": 1.614756576957968, + "grad_norm": 0.5734621932193357, + "learning_rate": 4.62351479281672e-06, + "loss": 0.1064, + "step": 18690 + }, + { + "epoch": 1.6156205451639378, + "grad_norm": 0.5530026596429509, + "learning_rate": 4.618864312772098e-06, + "loss": 0.1079, + "step": 18700 + }, + { + "epoch": 1.616484513369908, + "grad_norm": 0.5601065046640386, + "learning_rate": 4.614214164343237e-06, + "loss": 0.1058, + "step": 18710 + }, + { + "epoch": 1.617348481575878, + "grad_norm": 0.5760129273781665, + "learning_rate": 4.609564351576103e-06, + "loss": 0.1059, + "step": 18720 + }, + { + "epoch": 1.618212449781848, + "grad_norm": 0.5558664942203434, + "learning_rate": 4.604914878516376e-06, + "loss": 0.1055, + "step": 18730 + }, + { + "epoch": 1.619076417987818, + "grad_norm": 0.5550058318089843, + "learning_rate": 4.60026574920943e-06, + "loss": 0.107, + "step": 18740 + }, + { + "epoch": 1.6199403861937882, + "grad_norm": 0.5546104276887956, + "learning_rate": 4.595616967700346e-06, + "loss": 0.1058, + "step": 18750 + }, + { + "epoch": 1.6208043543997581, + "grad_norm": 0.563362489721722, + "learning_rate": 4.590968538033906e-06, + "loss": 0.1028, + "step": 18760 + }, + { + "epoch": 1.621668322605728, + "grad_norm": 0.5664210910466141, + "learning_rate": 4.586320464254579e-06, + "loss": 0.1062, + "step": 18770 + }, + { + "epoch": 1.6225322908116981, + "grad_norm": 0.5604114011064872, + "learning_rate": 4.581672750406527e-06, + "loss": 0.1047, + "step": 18780 + }, + { + "epoch": 1.6233962590176683, + "grad_norm": 0.5739443373151654, + "learning_rate": 4.5770254005336005e-06, + "loss": 0.1076, + "step": 18790 + }, + { + "epoch": 1.6242602272236382, + "grad_norm": 0.5438322646921915, + "learning_rate": 4.5723784186793305e-06, + "loss": 0.1015, + "step": 18800 + }, + { + "epoch": 1.625124195429608, + "grad_norm": 0.5713437054250342, + "learning_rate": 4.56773180888693e-06, + "loss": 0.1049, + "step": 18810 + }, + { + "epoch": 1.6259881636355782, + "grad_norm": 0.5203624733377038, + "learning_rate": 4.563085575199288e-06, + "loss": 0.1033, + "step": 18820 + }, + { + "epoch": 1.6268521318415483, + "grad_norm": 0.5345371509745694, + "learning_rate": 4.558439721658962e-06, + "loss": 0.1029, + "step": 18830 + }, + { + "epoch": 1.6277161000475182, + "grad_norm": 0.5512000186661318, + "learning_rate": 4.5537942523081856e-06, + "loss": 0.1069, + "step": 18840 + }, + { + "epoch": 1.6285800682534881, + "grad_norm": 0.5732026077183064, + "learning_rate": 4.549149171188856e-06, + "loss": 0.1055, + "step": 18850 + }, + { + "epoch": 1.6294440364594585, + "grad_norm": 0.53897272246241, + "learning_rate": 4.5445044823425285e-06, + "loss": 0.1032, + "step": 18860 + }, + { + "epoch": 1.6303080046654284, + "grad_norm": 0.5385728364611817, + "learning_rate": 4.5398601898104215e-06, + "loss": 0.1045, + "step": 18870 + }, + { + "epoch": 1.6311719728713983, + "grad_norm": 0.5739375455349807, + "learning_rate": 4.535216297633407e-06, + "loss": 0.1042, + "step": 18880 + }, + { + "epoch": 1.6320359410773684, + "grad_norm": 0.5851177785810917, + "learning_rate": 4.53057280985201e-06, + "loss": 0.1107, + "step": 18890 + }, + { + "epoch": 1.6328999092833385, + "grad_norm": 0.5626896150925973, + "learning_rate": 4.5259297305064006e-06, + "loss": 0.1067, + "step": 18900 + }, + { + "epoch": 1.6337638774893084, + "grad_norm": 0.5579093379958637, + "learning_rate": 4.521287063636397e-06, + "loss": 0.1035, + "step": 18910 + }, + { + "epoch": 1.6346278456952783, + "grad_norm": 0.5463165649754007, + "learning_rate": 4.516644813281455e-06, + "loss": 0.1072, + "step": 18920 + }, + { + "epoch": 1.6354918139012484, + "grad_norm": 0.5299271950173727, + "learning_rate": 4.512002983480674e-06, + "loss": 0.1045, + "step": 18930 + }, + { + "epoch": 1.6363557821072185, + "grad_norm": 0.5295341692293466, + "learning_rate": 4.507361578272779e-06, + "loss": 0.1058, + "step": 18940 + }, + { + "epoch": 1.6372197503131884, + "grad_norm": 0.5659514898847705, + "learning_rate": 4.50272060169613e-06, + "loss": 0.1056, + "step": 18950 + }, + { + "epoch": 1.6380837185191583, + "grad_norm": 0.5355035407457457, + "learning_rate": 4.49808005778872e-06, + "loss": 0.1058, + "step": 18960 + }, + { + "epoch": 1.6389476867251285, + "grad_norm": 0.5813365434148485, + "learning_rate": 4.493439950588152e-06, + "loss": 0.1048, + "step": 18970 + }, + { + "epoch": 1.6398116549310986, + "grad_norm": 0.5465368834634429, + "learning_rate": 4.48880028413166e-06, + "loss": 0.1094, + "step": 18980 + }, + { + "epoch": 1.6406756231370685, + "grad_norm": 0.5594639004899338, + "learning_rate": 4.484161062456093e-06, + "loss": 0.1077, + "step": 18990 + }, + { + "epoch": 1.6415395913430386, + "grad_norm": 0.5619680445715887, + "learning_rate": 4.479522289597909e-06, + "loss": 0.1036, + "step": 19000 + }, + { + "epoch": 1.6424035595490087, + "grad_norm": 0.5458294496106755, + "learning_rate": 4.474883969593179e-06, + "loss": 0.1042, + "step": 19010 + }, + { + "epoch": 1.6432675277549786, + "grad_norm": 0.5718266897624316, + "learning_rate": 4.470246106477575e-06, + "loss": 0.1033, + "step": 19020 + }, + { + "epoch": 1.6441314959609485, + "grad_norm": 0.5726300062386809, + "learning_rate": 4.46560870428638e-06, + "loss": 0.108, + "step": 19030 + }, + { + "epoch": 1.6449954641669187, + "grad_norm": 0.5593823250439539, + "learning_rate": 4.460971767054469e-06, + "loss": 0.1054, + "step": 19040 + }, + { + "epoch": 1.6458594323728888, + "grad_norm": 0.5771387893259979, + "learning_rate": 4.456335298816314e-06, + "loss": 0.1038, + "step": 19050 + }, + { + "epoch": 1.6467234005788587, + "grad_norm": 0.5206178813990037, + "learning_rate": 4.45169930360598e-06, + "loss": 0.1047, + "step": 19060 + }, + { + "epoch": 1.6475873687848286, + "grad_norm": 0.5546056481802352, + "learning_rate": 4.4470637854571195e-06, + "loss": 0.1053, + "step": 19070 + }, + { + "epoch": 1.6484513369907987, + "grad_norm": 0.563121998454811, + "learning_rate": 4.442428748402974e-06, + "loss": 0.1025, + "step": 19080 + }, + { + "epoch": 1.6493153051967688, + "grad_norm": 0.5740974281010763, + "learning_rate": 4.437794196476357e-06, + "loss": 0.1077, + "step": 19090 + }, + { + "epoch": 1.6501792734027387, + "grad_norm": 0.5798995188610356, + "learning_rate": 4.433160133709668e-06, + "loss": 0.1077, + "step": 19100 + }, + { + "epoch": 1.6510432416087086, + "grad_norm": 0.5399967033915721, + "learning_rate": 4.428526564134879e-06, + "loss": 0.1051, + "step": 19110 + }, + { + "epoch": 1.651907209814679, + "grad_norm": 0.6047608211253362, + "learning_rate": 4.423893491783535e-06, + "loss": 0.1051, + "step": 19120 + }, + { + "epoch": 1.6527711780206489, + "grad_norm": 0.5567507402078828, + "learning_rate": 4.4192609206867395e-06, + "loss": 0.1044, + "step": 19130 + }, + { + "epoch": 1.6536351462266188, + "grad_norm": 0.5509176536860414, + "learning_rate": 4.414628854875171e-06, + "loss": 0.1082, + "step": 19140 + }, + { + "epoch": 1.654499114432589, + "grad_norm": 0.5807724228673988, + "learning_rate": 4.409997298379062e-06, + "loss": 0.1021, + "step": 19150 + }, + { + "epoch": 1.655363082638559, + "grad_norm": 0.5588511431747389, + "learning_rate": 4.405366255228206e-06, + "loss": 0.1057, + "step": 19160 + }, + { + "epoch": 1.656227050844529, + "grad_norm": 0.5760437138864196, + "learning_rate": 4.400735729451943e-06, + "loss": 0.0994, + "step": 19170 + }, + { + "epoch": 1.6570910190504988, + "grad_norm": 0.5862621716618907, + "learning_rate": 4.396105725079169e-06, + "loss": 0.1041, + "step": 19180 + }, + { + "epoch": 1.657954987256469, + "grad_norm": 0.5555879959070747, + "learning_rate": 4.391476246138326e-06, + "loss": 0.1052, + "step": 19190 + }, + { + "epoch": 1.658818955462439, + "grad_norm": 0.5952847226961522, + "learning_rate": 4.386847296657396e-06, + "loss": 0.1081, + "step": 19200 + }, + { + "epoch": 1.659682923668409, + "grad_norm": 0.5348380660765963, + "learning_rate": 4.382218880663902e-06, + "loss": 0.107, + "step": 19210 + }, + { + "epoch": 1.6605468918743789, + "grad_norm": 0.5151912291244257, + "learning_rate": 4.3775910021849e-06, + "loss": 0.1031, + "step": 19220 + }, + { + "epoch": 1.661410860080349, + "grad_norm": 0.6003358328007403, + "learning_rate": 4.372963665246986e-06, + "loss": 0.1064, + "step": 19230 + }, + { + "epoch": 1.6622748282863191, + "grad_norm": 0.5544041061975947, + "learning_rate": 4.368336873876273e-06, + "loss": 0.1014, + "step": 19240 + }, + { + "epoch": 1.663138796492289, + "grad_norm": 0.54913783519371, + "learning_rate": 4.36371063209841e-06, + "loss": 0.1036, + "step": 19250 + }, + { + "epoch": 1.6640027646982591, + "grad_norm": 0.5583738275839267, + "learning_rate": 4.359084943938564e-06, + "loss": 0.1064, + "step": 19260 + }, + { + "epoch": 1.6648667329042293, + "grad_norm": 0.5738850263871381, + "learning_rate": 4.35445981342142e-06, + "loss": 0.1049, + "step": 19270 + }, + { + "epoch": 1.6657307011101992, + "grad_norm": 0.5524237103340026, + "learning_rate": 4.349835244571175e-06, + "loss": 0.1042, + "step": 19280 + }, + { + "epoch": 1.666594669316169, + "grad_norm": 0.5706676298342727, + "learning_rate": 4.345211241411543e-06, + "loss": 0.1093, + "step": 19290 + }, + { + "epoch": 1.6674586375221392, + "grad_norm": 0.5848544135945213, + "learning_rate": 4.340587807965743e-06, + "loss": 0.1072, + "step": 19300 + }, + { + "epoch": 1.6683226057281093, + "grad_norm": 0.6287334419221372, + "learning_rate": 4.335964948256497e-06, + "loss": 0.1074, + "step": 19310 + }, + { + "epoch": 1.6691865739340792, + "grad_norm": 0.5777601253584108, + "learning_rate": 4.331342666306029e-06, + "loss": 0.1049, + "step": 19320 + }, + { + "epoch": 1.670050542140049, + "grad_norm": 0.5518672383982035, + "learning_rate": 4.32672096613606e-06, + "loss": 0.1076, + "step": 19330 + }, + { + "epoch": 1.6709145103460192, + "grad_norm": 0.5457043407209156, + "learning_rate": 4.322099851767807e-06, + "loss": 0.1063, + "step": 19340 + }, + { + "epoch": 1.6717784785519894, + "grad_norm": 0.5424461831045879, + "learning_rate": 4.317479327221976e-06, + "loss": 0.1063, + "step": 19350 + }, + { + "epoch": 1.6726424467579593, + "grad_norm": 0.5805324592882917, + "learning_rate": 4.3128593965187555e-06, + "loss": 0.1064, + "step": 19360 + }, + { + "epoch": 1.6735064149639294, + "grad_norm": 0.6130304203689181, + "learning_rate": 4.3082400636778236e-06, + "loss": 0.1042, + "step": 19370 + }, + { + "epoch": 1.6743703831698995, + "grad_norm": 0.5810864835602517, + "learning_rate": 4.303621332718336e-06, + "loss": 0.1037, + "step": 19380 + }, + { + "epoch": 1.6752343513758694, + "grad_norm": 0.5318618871960004, + "learning_rate": 4.299003207658926e-06, + "loss": 0.1095, + "step": 19390 + }, + { + "epoch": 1.6760983195818393, + "grad_norm": 0.5575880416342754, + "learning_rate": 4.294385692517696e-06, + "loss": 0.1069, + "step": 19400 + }, + { + "epoch": 1.6769622877878094, + "grad_norm": 0.5760120387957547, + "learning_rate": 4.289768791312219e-06, + "loss": 0.1031, + "step": 19410 + }, + { + "epoch": 1.6778262559937795, + "grad_norm": 0.6238448718107859, + "learning_rate": 4.2851525080595356e-06, + "loss": 0.1034, + "step": 19420 + }, + { + "epoch": 1.6786902241997494, + "grad_norm": 0.5617405307505385, + "learning_rate": 4.280536846776151e-06, + "loss": 0.1054, + "step": 19430 + }, + { + "epoch": 1.6795541924057193, + "grad_norm": 0.5540885463967568, + "learning_rate": 4.275921811478021e-06, + "loss": 0.1081, + "step": 19440 + }, + { + "epoch": 1.6804181606116895, + "grad_norm": 0.5559250887793661, + "learning_rate": 4.271307406180565e-06, + "loss": 0.1091, + "step": 19450 + }, + { + "epoch": 1.6812821288176596, + "grad_norm": 0.5462162389581664, + "learning_rate": 4.266693634898648e-06, + "loss": 0.1045, + "step": 19460 + }, + { + "epoch": 1.6821460970236295, + "grad_norm": 0.5768486265339877, + "learning_rate": 4.262080501646594e-06, + "loss": 0.1079, + "step": 19470 + }, + { + "epoch": 1.6830100652295994, + "grad_norm": 0.5679698629432213, + "learning_rate": 4.257468010438156e-06, + "loss": 0.1049, + "step": 19480 + }, + { + "epoch": 1.6838740334355695, + "grad_norm": 0.560233750108615, + "learning_rate": 4.25285616528654e-06, + "loss": 0.1053, + "step": 19490 + }, + { + "epoch": 1.6847380016415396, + "grad_norm": 0.5195629545137693, + "learning_rate": 4.248244970204388e-06, + "loss": 0.1025, + "step": 19500 + }, + { + "epoch": 1.6856019698475095, + "grad_norm": 0.5613215865113323, + "learning_rate": 4.243634429203774e-06, + "loss": 0.1029, + "step": 19510 + }, + { + "epoch": 1.6864659380534797, + "grad_norm": 0.5652931780024508, + "learning_rate": 4.2390245462962035e-06, + "loss": 0.1048, + "step": 19520 + }, + { + "epoch": 1.6873299062594498, + "grad_norm": 0.5816856644432576, + "learning_rate": 4.234415325492608e-06, + "loss": 0.1057, + "step": 19530 + }, + { + "epoch": 1.6881938744654197, + "grad_norm": 0.6059617188417905, + "learning_rate": 4.229806770803349e-06, + "loss": 0.1027, + "step": 19540 + }, + { + "epoch": 1.6890578426713896, + "grad_norm": 0.5445694254477236, + "learning_rate": 4.225198886238201e-06, + "loss": 0.1041, + "step": 19550 + }, + { + "epoch": 1.6899218108773597, + "grad_norm": 0.5581999508448194, + "learning_rate": 4.220591675806359e-06, + "loss": 0.1068, + "step": 19560 + }, + { + "epoch": 1.6907857790833298, + "grad_norm": 0.5546346046115186, + "learning_rate": 4.215985143516431e-06, + "loss": 0.1044, + "step": 19570 + }, + { + "epoch": 1.6916497472892997, + "grad_norm": 0.5439755044085209, + "learning_rate": 4.211379293376438e-06, + "loss": 0.1033, + "step": 19580 + }, + { + "epoch": 1.6925137154952696, + "grad_norm": 0.5656359038198634, + "learning_rate": 4.2067741293938e-06, + "loss": 0.1048, + "step": 19590 + }, + { + "epoch": 1.6933776837012398, + "grad_norm": 0.5590831186640056, + "learning_rate": 4.202169655575347e-06, + "loss": 0.1011, + "step": 19600 + }, + { + "epoch": 1.6942416519072099, + "grad_norm": 0.5394716638115005, + "learning_rate": 4.197565875927306e-06, + "loss": 0.1027, + "step": 19610 + }, + { + "epoch": 1.6951056201131798, + "grad_norm": 0.5634450989871348, + "learning_rate": 4.1929627944552995e-06, + "loss": 0.1058, + "step": 19620 + }, + { + "epoch": 1.69596958831915, + "grad_norm": 0.5515417589498646, + "learning_rate": 4.188360415164344e-06, + "loss": 0.1039, + "step": 19630 + }, + { + "epoch": 1.69683355652512, + "grad_norm": 0.5421289498514134, + "learning_rate": 4.183758742058842e-06, + "loss": 0.1052, + "step": 19640 + }, + { + "epoch": 1.69769752473109, + "grad_norm": 0.5844255281676082, + "learning_rate": 4.179157779142585e-06, + "loss": 0.1006, + "step": 19650 + }, + { + "epoch": 1.6985614929370598, + "grad_norm": 0.5736360543034765, + "learning_rate": 4.174557530418748e-06, + "loss": 0.1057, + "step": 19660 + }, + { + "epoch": 1.69942546114303, + "grad_norm": 0.5552882278271722, + "learning_rate": 4.169957999889877e-06, + "loss": 0.1028, + "step": 19670 + }, + { + "epoch": 1.700289429349, + "grad_norm": 0.544532249195041, + "learning_rate": 4.165359191557901e-06, + "loss": 0.103, + "step": 19680 + }, + { + "epoch": 1.70115339755497, + "grad_norm": 0.5246566731943877, + "learning_rate": 4.160761109424115e-06, + "loss": 0.1018, + "step": 19690 + }, + { + "epoch": 1.7020173657609399, + "grad_norm": 0.5417849366104356, + "learning_rate": 4.15616375748919e-06, + "loss": 0.1039, + "step": 19700 + }, + { + "epoch": 1.70288133396691, + "grad_norm": 0.5741977557459841, + "learning_rate": 4.151567139753152e-06, + "loss": 0.1062, + "step": 19710 + }, + { + "epoch": 1.7037453021728801, + "grad_norm": 0.5637437127823944, + "learning_rate": 4.1469712602153935e-06, + "loss": 0.1064, + "step": 19720 + }, + { + "epoch": 1.70460927037885, + "grad_norm": 0.5947331247456926, + "learning_rate": 4.142376122874664e-06, + "loss": 0.1059, + "step": 19730 + }, + { + "epoch": 1.70547323858482, + "grad_norm": 0.5566283619723043, + "learning_rate": 4.137781731729069e-06, + "loss": 0.1015, + "step": 19740 + }, + { + "epoch": 1.7063372067907903, + "grad_norm": 0.5582794032645616, + "learning_rate": 4.1331880907760595e-06, + "loss": 0.1089, + "step": 19750 + }, + { + "epoch": 1.7072011749967602, + "grad_norm": 0.5836144846557693, + "learning_rate": 4.1285952040124375e-06, + "loss": 0.1022, + "step": 19760 + }, + { + "epoch": 1.70806514320273, + "grad_norm": 0.5676899148191863, + "learning_rate": 4.124003075434351e-06, + "loss": 0.1094, + "step": 19770 + }, + { + "epoch": 1.7089291114087002, + "grad_norm": 0.5207672775373055, + "learning_rate": 4.119411709037286e-06, + "loss": 0.1022, + "step": 19780 + }, + { + "epoch": 1.7097930796146703, + "grad_norm": 0.5410470442889012, + "learning_rate": 4.114821108816063e-06, + "loss": 0.1012, + "step": 19790 + }, + { + "epoch": 1.7106570478206402, + "grad_norm": 0.5620634468728611, + "learning_rate": 4.110231278764837e-06, + "loss": 0.1015, + "step": 19800 + }, + { + "epoch": 1.71152101602661, + "grad_norm": 0.5625517889943247, + "learning_rate": 4.1056422228770986e-06, + "loss": 0.1042, + "step": 19810 + }, + { + "epoch": 1.7123849842325802, + "grad_norm": 0.5551478643367547, + "learning_rate": 4.101053945145655e-06, + "loss": 0.1054, + "step": 19820 + }, + { + "epoch": 1.7132489524385504, + "grad_norm": 0.5961782039221952, + "learning_rate": 4.0964664495626436e-06, + "loss": 0.1068, + "step": 19830 + }, + { + "epoch": 1.7141129206445203, + "grad_norm": 0.577866925877064, + "learning_rate": 4.091879740119518e-06, + "loss": 0.1051, + "step": 19840 + }, + { + "epoch": 1.7149768888504902, + "grad_norm": 0.5938704854801597, + "learning_rate": 4.087293820807054e-06, + "loss": 0.1056, + "step": 19850 + }, + { + "epoch": 1.7158408570564603, + "grad_norm": 0.5403360078049187, + "learning_rate": 4.082708695615326e-06, + "loss": 0.103, + "step": 19860 + }, + { + "epoch": 1.7167048252624304, + "grad_norm": 0.5435756955040987, + "learning_rate": 4.078124368533733e-06, + "loss": 0.104, + "step": 19870 + }, + { + "epoch": 1.7175687934684003, + "grad_norm": 0.5541908421361005, + "learning_rate": 4.07354084355097e-06, + "loss": 0.1004, + "step": 19880 + }, + { + "epoch": 1.7184327616743704, + "grad_norm": 0.5886329233256653, + "learning_rate": 4.06895812465504e-06, + "loss": 0.1068, + "step": 19890 + }, + { + "epoch": 1.7192967298803405, + "grad_norm": 0.5707722202962463, + "learning_rate": 4.064376215833238e-06, + "loss": 0.105, + "step": 19900 + }, + { + "epoch": 1.7201606980863104, + "grad_norm": 0.5674191409460011, + "learning_rate": 4.05979512107216e-06, + "loss": 0.103, + "step": 19910 + }, + { + "epoch": 1.7210246662922803, + "grad_norm": 0.5519385356483035, + "learning_rate": 4.055214844357692e-06, + "loss": 0.1028, + "step": 19920 + }, + { + "epoch": 1.7218886344982505, + "grad_norm": 0.5548644777530287, + "learning_rate": 4.050635389675006e-06, + "loss": 0.1019, + "step": 19930 + }, + { + "epoch": 1.7227526027042206, + "grad_norm": 0.5631549051283312, + "learning_rate": 4.046056761008561e-06, + "loss": 0.1054, + "step": 19940 + }, + { + "epoch": 1.7236165709101905, + "grad_norm": 0.5606486954370578, + "learning_rate": 4.041478962342098e-06, + "loss": 0.1024, + "step": 19950 + }, + { + "epoch": 1.7244805391161604, + "grad_norm": 0.5667956495178411, + "learning_rate": 4.036901997658632e-06, + "loss": 0.1031, + "step": 19960 + }, + { + "epoch": 1.7253445073221305, + "grad_norm": 0.5455738969443668, + "learning_rate": 4.03232587094046e-06, + "loss": 0.1016, + "step": 19970 + }, + { + "epoch": 1.7262084755281006, + "grad_norm": 0.5615537609092984, + "learning_rate": 4.0277505861691405e-06, + "loss": 0.1049, + "step": 19980 + }, + { + "epoch": 1.7270724437340705, + "grad_norm": 0.6077668734633745, + "learning_rate": 4.023176147325505e-06, + "loss": 0.1054, + "step": 19990 + }, + { + "epoch": 1.7279364119400407, + "grad_norm": 0.5353436552303115, + "learning_rate": 4.018602558389648e-06, + "loss": 0.1048, + "step": 20000 + }, + { + "epoch": 1.7288003801460108, + "grad_norm": 0.547027306615421, + "learning_rate": 4.014029823340928e-06, + "loss": 0.1022, + "step": 20010 + }, + { + "epoch": 1.7296643483519807, + "grad_norm": 0.5953265053533482, + "learning_rate": 4.0094579461579505e-06, + "loss": 0.1055, + "step": 20020 + }, + { + "epoch": 1.7305283165579506, + "grad_norm": 0.5527840216631419, + "learning_rate": 4.0048869308185856e-06, + "loss": 0.104, + "step": 20030 + }, + { + "epoch": 1.7313922847639207, + "grad_norm": 0.5631809649129795, + "learning_rate": 4.0003167812999465e-06, + "loss": 0.1031, + "step": 20040 + }, + { + "epoch": 1.7322562529698908, + "grad_norm": 0.5420978278693646, + "learning_rate": 3.9957475015784e-06, + "loss": 0.1024, + "step": 20050 + }, + { + "epoch": 1.7331202211758607, + "grad_norm": 0.5495208887748646, + "learning_rate": 3.9911790956295455e-06, + "loss": 0.099, + "step": 20060 + }, + { + "epoch": 1.7339841893818306, + "grad_norm": 0.6077615563313703, + "learning_rate": 3.986611567428231e-06, + "loss": 0.1039, + "step": 20070 + }, + { + "epoch": 1.7348481575878008, + "grad_norm": 0.5575610185779836, + "learning_rate": 3.982044920948542e-06, + "loss": 0.1041, + "step": 20080 + }, + { + "epoch": 1.7357121257937709, + "grad_norm": 0.5712717988692463, + "learning_rate": 3.977479160163786e-06, + "loss": 0.1036, + "step": 20090 + }, + { + "epoch": 1.7365760939997408, + "grad_norm": 0.5868440064927699, + "learning_rate": 3.972914289046512e-06, + "loss": 0.1051, + "step": 20100 + }, + { + "epoch": 1.7374400622057107, + "grad_norm": 0.5571265568035261, + "learning_rate": 3.968350311568487e-06, + "loss": 0.1059, + "step": 20110 + }, + { + "epoch": 1.7383040304116808, + "grad_norm": 0.5571341134249902, + "learning_rate": 3.963787231700707e-06, + "loss": 0.1009, + "step": 20120 + }, + { + "epoch": 1.739167998617651, + "grad_norm": 0.5442273293247737, + "learning_rate": 3.959225053413379e-06, + "loss": 0.1056, + "step": 20130 + }, + { + "epoch": 1.7400319668236208, + "grad_norm": 0.5392709739565602, + "learning_rate": 3.954663780675932e-06, + "loss": 0.1044, + "step": 20140 + }, + { + "epoch": 1.740895935029591, + "grad_norm": 0.5714865715662484, + "learning_rate": 3.950103417457004e-06, + "loss": 0.1003, + "step": 20150 + }, + { + "epoch": 1.741759903235561, + "grad_norm": 0.5611686800757524, + "learning_rate": 3.945543967724444e-06, + "loss": 0.1062, + "step": 20160 + }, + { + "epoch": 1.742623871441531, + "grad_norm": 0.562876377608983, + "learning_rate": 3.940985435445303e-06, + "loss": 0.1041, + "step": 20170 + }, + { + "epoch": 1.7434878396475009, + "grad_norm": 0.5365688293514245, + "learning_rate": 3.936427824585836e-06, + "loss": 0.1044, + "step": 20180 + }, + { + "epoch": 1.744351807853471, + "grad_norm": 0.5784734187598308, + "learning_rate": 3.931871139111497e-06, + "loss": 0.1012, + "step": 20190 + }, + { + "epoch": 1.7452157760594411, + "grad_norm": 0.56924590426923, + "learning_rate": 3.927315382986935e-06, + "loss": 0.1048, + "step": 20200 + }, + { + "epoch": 1.746079744265411, + "grad_norm": 0.5868943193747296, + "learning_rate": 3.922760560175984e-06, + "loss": 0.103, + "step": 20210 + }, + { + "epoch": 1.746943712471381, + "grad_norm": 0.5684990052889753, + "learning_rate": 3.918206674641674e-06, + "loss": 0.1054, + "step": 20220 + }, + { + "epoch": 1.747807680677351, + "grad_norm": 0.5984985926149132, + "learning_rate": 3.913653730346219e-06, + "loss": 0.1005, + "step": 20230 + }, + { + "epoch": 1.7486716488833212, + "grad_norm": 0.5719741375137701, + "learning_rate": 3.909101731251008e-06, + "loss": 0.1034, + "step": 20240 + }, + { + "epoch": 1.749535617089291, + "grad_norm": 0.5445905911014346, + "learning_rate": 3.904550681316613e-06, + "loss": 0.1004, + "step": 20250 + }, + { + "epoch": 1.7503995852952612, + "grad_norm": 0.5535256139758667, + "learning_rate": 3.900000584502777e-06, + "loss": 0.0999, + "step": 20260 + }, + { + "epoch": 1.7512635535012313, + "grad_norm": 0.5764065311727894, + "learning_rate": 3.8954514447684154e-06, + "loss": 0.1007, + "step": 20270 + }, + { + "epoch": 1.7521275217072012, + "grad_norm": 0.5458153301564745, + "learning_rate": 3.890903266071614e-06, + "loss": 0.1033, + "step": 20280 + }, + { + "epoch": 1.752991489913171, + "grad_norm": 0.562777505638518, + "learning_rate": 3.886356052369613e-06, + "loss": 0.1022, + "step": 20290 + }, + { + "epoch": 1.7538554581191412, + "grad_norm": 0.5552263607405525, + "learning_rate": 3.881809807618822e-06, + "loss": 0.1014, + "step": 20300 + }, + { + "epoch": 1.7547194263251114, + "grad_norm": 0.5809887010445461, + "learning_rate": 3.8772645357748055e-06, + "loss": 0.1025, + "step": 20310 + }, + { + "epoch": 1.7555833945310813, + "grad_norm": 0.5851596272145743, + "learning_rate": 3.872720240792281e-06, + "loss": 0.1038, + "step": 20320 + }, + { + "epoch": 1.7564473627370512, + "grad_norm": 0.5919873050029621, + "learning_rate": 3.868176926625111e-06, + "loss": 0.1023, + "step": 20330 + }, + { + "epoch": 1.7573113309430213, + "grad_norm": 0.5763379694868309, + "learning_rate": 3.863634597226314e-06, + "loss": 0.1012, + "step": 20340 + }, + { + "epoch": 1.7581752991489914, + "grad_norm": 0.5913953702067776, + "learning_rate": 3.859093256548044e-06, + "loss": 0.1058, + "step": 20350 + }, + { + "epoch": 1.7590392673549613, + "grad_norm": 0.5320930353822502, + "learning_rate": 3.854552908541601e-06, + "loss": 0.101, + "step": 20360 + }, + { + "epoch": 1.7599032355609312, + "grad_norm": 0.5513593841607388, + "learning_rate": 3.850013557157413e-06, + "loss": 0.1017, + "step": 20370 + }, + { + "epoch": 1.7607672037669013, + "grad_norm": 0.5272766571509331, + "learning_rate": 3.845475206345048e-06, + "loss": 0.105, + "step": 20380 + }, + { + "epoch": 1.7616311719728714, + "grad_norm": 0.561909459049623, + "learning_rate": 3.840937860053204e-06, + "loss": 0.1056, + "step": 20390 + }, + { + "epoch": 1.7624951401788413, + "grad_norm": 0.5623218739821189, + "learning_rate": 3.836401522229698e-06, + "loss": 0.1018, + "step": 20400 + }, + { + "epoch": 1.7633591083848115, + "grad_norm": 0.5713647997114392, + "learning_rate": 3.831866196821476e-06, + "loss": 0.1009, + "step": 20410 + }, + { + "epoch": 1.7642230765907816, + "grad_norm": 0.5537859362481026, + "learning_rate": 3.827331887774599e-06, + "loss": 0.1021, + "step": 20420 + }, + { + "epoch": 1.7650870447967515, + "grad_norm": 0.5606600806201089, + "learning_rate": 3.8227985990342495e-06, + "loss": 0.0982, + "step": 20430 + }, + { + "epoch": 1.7659510130027214, + "grad_norm": 0.5701330136609685, + "learning_rate": 3.818266334544714e-06, + "loss": 0.1042, + "step": 20440 + }, + { + "epoch": 1.7668149812086915, + "grad_norm": 0.5731712287364021, + "learning_rate": 3.813735098249395e-06, + "loss": 0.1039, + "step": 20450 + }, + { + "epoch": 1.7676789494146616, + "grad_norm": 0.5673002836546218, + "learning_rate": 3.8092048940907944e-06, + "loss": 0.1043, + "step": 20460 + }, + { + "epoch": 1.7685429176206315, + "grad_norm": 0.5670893355009318, + "learning_rate": 3.8046757260105244e-06, + "loss": 0.1024, + "step": 20470 + }, + { + "epoch": 1.7694068858266014, + "grad_norm": 0.5944192806216618, + "learning_rate": 3.800147597949285e-06, + "loss": 0.1017, + "step": 20480 + }, + { + "epoch": 1.7702708540325716, + "grad_norm": 0.5670087860330885, + "learning_rate": 3.7956205138468795e-06, + "loss": 0.1015, + "step": 20490 + }, + { + "epoch": 1.7711348222385417, + "grad_norm": 0.5817270080000759, + "learning_rate": 3.7910944776422e-06, + "loss": 0.1039, + "step": 20500 + }, + { + "epoch": 1.7719987904445116, + "grad_norm": 0.5724007713380568, + "learning_rate": 3.7865694932732296e-06, + "loss": 0.1051, + "step": 20510 + }, + { + "epoch": 1.7728627586504817, + "grad_norm": 0.5473492286689364, + "learning_rate": 3.7820455646770284e-06, + "loss": 0.1024, + "step": 20520 + }, + { + "epoch": 1.7737267268564518, + "grad_norm": 0.5485247247016158, + "learning_rate": 3.7775226957897465e-06, + "loss": 0.1019, + "step": 20530 + }, + { + "epoch": 1.7745906950624217, + "grad_norm": 0.5643486978024851, + "learning_rate": 3.773000890546609e-06, + "loss": 0.104, + "step": 20540 + }, + { + "epoch": 1.7754546632683916, + "grad_norm": 0.5796798917661853, + "learning_rate": 3.7684801528819155e-06, + "loss": 0.1047, + "step": 20550 + }, + { + "epoch": 1.7763186314743618, + "grad_norm": 0.545404357770531, + "learning_rate": 3.763960486729035e-06, + "loss": 0.1056, + "step": 20560 + }, + { + "epoch": 1.7771825996803319, + "grad_norm": 0.5357278759624914, + "learning_rate": 3.759441896020405e-06, + "loss": 0.1036, + "step": 20570 + }, + { + "epoch": 1.7780465678863018, + "grad_norm": 0.5918980827105917, + "learning_rate": 3.7549243846875288e-06, + "loss": 0.1016, + "step": 20580 + }, + { + "epoch": 1.7789105360922717, + "grad_norm": 0.557293641693346, + "learning_rate": 3.750407956660973e-06, + "loss": 0.1025, + "step": 20590 + }, + { + "epoch": 1.7797745042982418, + "grad_norm": 0.5442810225575131, + "learning_rate": 3.745892615870353e-06, + "loss": 0.0984, + "step": 20600 + }, + { + "epoch": 1.780638472504212, + "grad_norm": 0.5844983519187161, + "learning_rate": 3.741378366244346e-06, + "loss": 0.1044, + "step": 20610 + }, + { + "epoch": 1.7815024407101818, + "grad_norm": 0.6092327070617354, + "learning_rate": 3.736865211710678e-06, + "loss": 0.1032, + "step": 20620 + }, + { + "epoch": 1.7823664089161517, + "grad_norm": 0.5845750451198817, + "learning_rate": 3.732353156196123e-06, + "loss": 0.106, + "step": 20630 + }, + { + "epoch": 1.783230377122122, + "grad_norm": 0.5142430347585016, + "learning_rate": 3.727842203626494e-06, + "loss": 0.0974, + "step": 20640 + }, + { + "epoch": 1.784094345328092, + "grad_norm": 0.5477628186400496, + "learning_rate": 3.723332357926649e-06, + "loss": 0.1003, + "step": 20650 + }, + { + "epoch": 1.7849583135340619, + "grad_norm": 0.5303665814483901, + "learning_rate": 3.7188236230204826e-06, + "loss": 0.1077, + "step": 20660 + }, + { + "epoch": 1.785822281740032, + "grad_norm": 0.5762679439825226, + "learning_rate": 3.7143160028309215e-06, + "loss": 0.1047, + "step": 20670 + }, + { + "epoch": 1.7866862499460021, + "grad_norm": 0.5530133329338524, + "learning_rate": 3.7098095012799216e-06, + "loss": 0.0995, + "step": 20680 + }, + { + "epoch": 1.787550218151972, + "grad_norm": 0.5482584661223466, + "learning_rate": 3.7053041222884688e-06, + "loss": 0.1026, + "step": 20690 + }, + { + "epoch": 1.788414186357942, + "grad_norm": 0.580595778741035, + "learning_rate": 3.7007998697765713e-06, + "loss": 0.106, + "step": 20700 + }, + { + "epoch": 1.789278154563912, + "grad_norm": 0.576869662947042, + "learning_rate": 3.696296747663253e-06, + "loss": 0.102, + "step": 20710 + }, + { + "epoch": 1.7901421227698822, + "grad_norm": 0.5597875975808141, + "learning_rate": 3.6917947598665593e-06, + "loss": 0.1014, + "step": 20720 + }, + { + "epoch": 1.791006090975852, + "grad_norm": 0.558351611350602, + "learning_rate": 3.6872939103035465e-06, + "loss": 0.1006, + "step": 20730 + }, + { + "epoch": 1.791870059181822, + "grad_norm": 0.5606337066934608, + "learning_rate": 3.682794202890284e-06, + "loss": 0.105, + "step": 20740 + }, + { + "epoch": 1.792734027387792, + "grad_norm": 0.5690262394802477, + "learning_rate": 3.6782956415418404e-06, + "loss": 0.1015, + "step": 20750 + }, + { + "epoch": 1.7935979955937622, + "grad_norm": 0.5554666935458995, + "learning_rate": 3.673798230172293e-06, + "loss": 0.1012, + "step": 20760 + }, + { + "epoch": 1.794461963799732, + "grad_norm": 0.5826019998816877, + "learning_rate": 3.6693019726947154e-06, + "loss": 0.1007, + "step": 20770 + }, + { + "epoch": 1.7953259320057022, + "grad_norm": 0.552502735816188, + "learning_rate": 3.6648068730211816e-06, + "loss": 0.1005, + "step": 20780 + }, + { + "epoch": 1.7961899002116724, + "grad_norm": 0.5604779925921183, + "learning_rate": 3.660312935062752e-06, + "loss": 0.1015, + "step": 20790 + }, + { + "epoch": 1.7970538684176423, + "grad_norm": 0.553123810366442, + "learning_rate": 3.65582016272948e-06, + "loss": 0.1009, + "step": 20800 + }, + { + "epoch": 1.7979178366236122, + "grad_norm": 0.574040004269223, + "learning_rate": 3.651328559930404e-06, + "loss": 0.1024, + "step": 20810 + }, + { + "epoch": 1.7987818048295823, + "grad_norm": 0.5789047190106877, + "learning_rate": 3.6468381305735485e-06, + "loss": 0.1001, + "step": 20820 + }, + { + "epoch": 1.7996457730355524, + "grad_norm": 0.5743348463122552, + "learning_rate": 3.6423488785659085e-06, + "loss": 0.1063, + "step": 20830 + }, + { + "epoch": 1.8005097412415223, + "grad_norm": 0.568363700602923, + "learning_rate": 3.637860807813462e-06, + "loss": 0.1022, + "step": 20840 + }, + { + "epoch": 1.8013737094474922, + "grad_norm": 0.5632244458817115, + "learning_rate": 3.6333739222211576e-06, + "loss": 0.1005, + "step": 20850 + }, + { + "epoch": 1.8022376776534623, + "grad_norm": 0.5535634272232598, + "learning_rate": 3.628888225692912e-06, + "loss": 0.1038, + "step": 20860 + }, + { + "epoch": 1.8031016458594324, + "grad_norm": 0.5617936512836789, + "learning_rate": 3.6244037221316066e-06, + "loss": 0.1015, + "step": 20870 + }, + { + "epoch": 1.8039656140654023, + "grad_norm": 0.5676594590712748, + "learning_rate": 3.619920415439084e-06, + "loss": 0.0987, + "step": 20880 + }, + { + "epoch": 1.8048295822713725, + "grad_norm": 0.5548626950647346, + "learning_rate": 3.615438309516148e-06, + "loss": 0.1008, + "step": 20890 + }, + { + "epoch": 1.8056935504773426, + "grad_norm": 0.5472900095239701, + "learning_rate": 3.610957408262561e-06, + "loss": 0.103, + "step": 20900 + }, + { + "epoch": 1.8065575186833125, + "grad_norm": 0.556208859315823, + "learning_rate": 3.606477715577026e-06, + "loss": 0.1029, + "step": 20910 + }, + { + "epoch": 1.8074214868892824, + "grad_norm": 0.5843164924577277, + "learning_rate": 3.6019992353572047e-06, + "loss": 0.1014, + "step": 20920 + }, + { + "epoch": 1.8082854550952525, + "grad_norm": 0.5261056588336221, + "learning_rate": 3.5975219714997025e-06, + "loss": 0.1008, + "step": 20930 + }, + { + "epoch": 1.8091494233012226, + "grad_norm": 0.5291736647089412, + "learning_rate": 3.5930459279000606e-06, + "loss": 0.1035, + "step": 20940 + }, + { + "epoch": 1.8100133915071925, + "grad_norm": 0.5657988364870209, + "learning_rate": 3.588571108452764e-06, + "loss": 0.1034, + "step": 20950 + }, + { + "epoch": 1.8108773597131624, + "grad_norm": 0.5825009374567696, + "learning_rate": 3.5840975170512314e-06, + "loss": 0.1002, + "step": 20960 + }, + { + "epoch": 1.8117413279191326, + "grad_norm": 0.5580209417771652, + "learning_rate": 3.579625157587814e-06, + "loss": 0.101, + "step": 20970 + }, + { + "epoch": 1.8126052961251027, + "grad_norm": 0.523370115805435, + "learning_rate": 3.575154033953787e-06, + "loss": 0.1032, + "step": 20980 + }, + { + "epoch": 1.8134692643310726, + "grad_norm": 0.5803826900106622, + "learning_rate": 3.570684150039353e-06, + "loss": 0.1014, + "step": 20990 + }, + { + "epoch": 1.8143332325370425, + "grad_norm": 0.53633215338986, + "learning_rate": 3.5662155097336378e-06, + "loss": 0.1021, + "step": 21000 + }, + { + "epoch": 1.8151972007430126, + "grad_norm": 0.5457685347468603, + "learning_rate": 3.5617481169246845e-06, + "loss": 0.1062, + "step": 21010 + }, + { + "epoch": 1.8160611689489827, + "grad_norm": 0.5436165338718727, + "learning_rate": 3.557281975499446e-06, + "loss": 0.1, + "step": 21020 + }, + { + "epoch": 1.8169251371549526, + "grad_norm": 0.5219347875259863, + "learning_rate": 3.5528170893437918e-06, + "loss": 0.1022, + "step": 21030 + }, + { + "epoch": 1.8177891053609228, + "grad_norm": 0.5459117725002699, + "learning_rate": 3.5483534623424988e-06, + "loss": 0.1026, + "step": 21040 + }, + { + "epoch": 1.8186530735668929, + "grad_norm": 0.5344167085133646, + "learning_rate": 3.5438910983792465e-06, + "loss": 0.0988, + "step": 21050 + }, + { + "epoch": 1.8195170417728628, + "grad_norm": 0.533995913655794, + "learning_rate": 3.539430001336614e-06, + "loss": 0.1022, + "step": 21060 + }, + { + "epoch": 1.8203810099788327, + "grad_norm": 0.5395042423170748, + "learning_rate": 3.534970175096083e-06, + "loss": 0.1031, + "step": 21070 + }, + { + "epoch": 1.8212449781848028, + "grad_norm": 0.5508954228284384, + "learning_rate": 3.5305116235380233e-06, + "loss": 0.1036, + "step": 21080 + }, + { + "epoch": 1.822108946390773, + "grad_norm": 0.5459122344449423, + "learning_rate": 3.526054350541704e-06, + "loss": 0.1028, + "step": 21090 + }, + { + "epoch": 1.8229729145967428, + "grad_norm": 0.573507303035393, + "learning_rate": 3.521598359985271e-06, + "loss": 0.1016, + "step": 21100 + }, + { + "epoch": 1.8238368828027127, + "grad_norm": 0.5437055323857766, + "learning_rate": 3.5171436557457628e-06, + "loss": 0.0984, + "step": 21110 + }, + { + "epoch": 1.8247008510086828, + "grad_norm": 0.5670371375647348, + "learning_rate": 3.512690241699096e-06, + "loss": 0.1008, + "step": 21120 + }, + { + "epoch": 1.825564819214653, + "grad_norm": 0.5628275899816054, + "learning_rate": 3.5082381217200668e-06, + "loss": 0.0983, + "step": 21130 + }, + { + "epoch": 1.8264287874206229, + "grad_norm": 0.57987227106917, + "learning_rate": 3.5037872996823384e-06, + "loss": 0.1009, + "step": 21140 + }, + { + "epoch": 1.827292755626593, + "grad_norm": 0.5597332618992171, + "learning_rate": 3.4993377794584532e-06, + "loss": 0.1042, + "step": 21150 + }, + { + "epoch": 1.8281567238325631, + "grad_norm": 0.5527884910029922, + "learning_rate": 3.4948895649198156e-06, + "loss": 0.1002, + "step": 21160 + }, + { + "epoch": 1.829020692038533, + "grad_norm": 0.5399126337821516, + "learning_rate": 3.4904426599366985e-06, + "loss": 0.0998, + "step": 21170 + }, + { + "epoch": 1.829884660244503, + "grad_norm": 0.5533812170631007, + "learning_rate": 3.4859970683782283e-06, + "loss": 0.1029, + "step": 21180 + }, + { + "epoch": 1.830748628450473, + "grad_norm": 0.570090142425241, + "learning_rate": 3.481552794112395e-06, + "loss": 0.1002, + "step": 21190 + }, + { + "epoch": 1.8316125966564432, + "grad_norm": 0.576237314844463, + "learning_rate": 3.4771098410060433e-06, + "loss": 0.101, + "step": 21200 + }, + { + "epoch": 1.832476564862413, + "grad_norm": 0.5785261238185759, + "learning_rate": 3.47266821292486e-06, + "loss": 0.1026, + "step": 21210 + }, + { + "epoch": 1.833340533068383, + "grad_norm": 0.5391411473596741, + "learning_rate": 3.4682279137333874e-06, + "loss": 0.1009, + "step": 21220 + }, + { + "epoch": 1.834204501274353, + "grad_norm": 0.5717021882218313, + "learning_rate": 3.463788947295008e-06, + "loss": 0.1017, + "step": 21230 + }, + { + "epoch": 1.8350684694803232, + "grad_norm": 0.5641109625964653, + "learning_rate": 3.459351317471948e-06, + "loss": 0.1015, + "step": 21240 + }, + { + "epoch": 1.835932437686293, + "grad_norm": 0.5841912468112753, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.1009, + "step": 21250 + }, + { + "epoch": 1.836796405892263, + "grad_norm": 0.5775667698271872, + "learning_rate": 3.4504800831148523e-06, + "loss": 0.1015, + "step": 21260 + }, + { + "epoch": 1.8376603740982334, + "grad_norm": 0.5511909835861091, + "learning_rate": 3.446046486299437e-06, + "loss": 0.1001, + "step": 21270 + }, + { + "epoch": 1.8385243423042033, + "grad_norm": 0.5344861997274931, + "learning_rate": 3.4416142415365718e-06, + "loss": 0.1046, + "step": 21280 + }, + { + "epoch": 1.8393883105101732, + "grad_norm": 0.5756081848954616, + "learning_rate": 3.437183352682628e-06, + "loss": 0.1033, + "step": 21290 + }, + { + "epoch": 1.8402522787161433, + "grad_norm": 0.5671991668813597, + "learning_rate": 3.432753823592804e-06, + "loss": 0.1046, + "step": 21300 + }, + { + "epoch": 1.8411162469221134, + "grad_norm": 0.5782180776536016, + "learning_rate": 3.42832565812111e-06, + "loss": 0.0986, + "step": 21310 + }, + { + "epoch": 1.8419802151280833, + "grad_norm": 0.5327885416091026, + "learning_rate": 3.4238988601203766e-06, + "loss": 0.103, + "step": 21320 + }, + { + "epoch": 1.8428441833340532, + "grad_norm": 0.5301622627094087, + "learning_rate": 3.4194734334422343e-06, + "loss": 0.1022, + "step": 21330 + }, + { + "epoch": 1.8437081515400233, + "grad_norm": 0.542883154178525, + "learning_rate": 3.4150493819371282e-06, + "loss": 0.1029, + "step": 21340 + }, + { + "epoch": 1.8445721197459934, + "grad_norm": 0.5485117027957886, + "learning_rate": 3.4106267094543068e-06, + "loss": 0.1033, + "step": 21350 + }, + { + "epoch": 1.8454360879519633, + "grad_norm": 0.547398116269584, + "learning_rate": 3.4062054198418143e-06, + "loss": 0.1027, + "step": 21360 + }, + { + "epoch": 1.8463000561579332, + "grad_norm": 0.5467991450188698, + "learning_rate": 3.401785516946495e-06, + "loss": 0.1043, + "step": 21370 + }, + { + "epoch": 1.8471640243639034, + "grad_norm": 0.5856305997104947, + "learning_rate": 3.397367004613985e-06, + "loss": 0.1013, + "step": 21380 + }, + { + "epoch": 1.8480279925698735, + "grad_norm": 0.5382540415719735, + "learning_rate": 3.3929498866887124e-06, + "loss": 0.1011, + "step": 21390 + }, + { + "epoch": 1.8488919607758434, + "grad_norm": 0.5519626832639604, + "learning_rate": 3.3885341670138915e-06, + "loss": 0.098, + "step": 21400 + }, + { + "epoch": 1.8497559289818135, + "grad_norm": 0.5320256820466911, + "learning_rate": 3.384119849431517e-06, + "loss": 0.105, + "step": 21410 + }, + { + "epoch": 1.8506198971877836, + "grad_norm": 0.568130486279502, + "learning_rate": 3.3797069377823676e-06, + "loss": 0.1002, + "step": 21420 + }, + { + "epoch": 1.8514838653937535, + "grad_norm": 0.5478982214510582, + "learning_rate": 3.3752954359059976e-06, + "loss": 0.1039, + "step": 21430 + }, + { + "epoch": 1.8523478335997234, + "grad_norm": 0.544232492931912, + "learning_rate": 3.3708853476407365e-06, + "loss": 0.1004, + "step": 21440 + }, + { + "epoch": 1.8532118018056936, + "grad_norm": 0.5585103563773959, + "learning_rate": 3.366476676823677e-06, + "loss": 0.1019, + "step": 21450 + }, + { + "epoch": 1.8540757700116637, + "grad_norm": 0.5455231576521419, + "learning_rate": 3.3620694272906874e-06, + "loss": 0.0977, + "step": 21460 + }, + { + "epoch": 1.8549397382176336, + "grad_norm": 0.5379648194855917, + "learning_rate": 3.357663602876392e-06, + "loss": 0.0979, + "step": 21470 + }, + { + "epoch": 1.8558037064236035, + "grad_norm": 0.5726162354617428, + "learning_rate": 3.3532592074141823e-06, + "loss": 0.0994, + "step": 21480 + }, + { + "epoch": 1.8566676746295736, + "grad_norm": 0.5827018315751672, + "learning_rate": 3.3488562447361978e-06, + "loss": 0.1, + "step": 21490 + }, + { + "epoch": 1.8575316428355437, + "grad_norm": 0.5635050703224438, + "learning_rate": 3.344454718673339e-06, + "loss": 0.1029, + "step": 21500 + }, + { + "epoch": 1.8583956110415136, + "grad_norm": 0.5565811080368595, + "learning_rate": 3.3400546330552554e-06, + "loss": 0.1026, + "step": 21510 + }, + { + "epoch": 1.8592595792474835, + "grad_norm": 0.564486228128983, + "learning_rate": 3.3356559917103377e-06, + "loss": 0.1019, + "step": 21520 + }, + { + "epoch": 1.8601235474534539, + "grad_norm": 0.5605995453561945, + "learning_rate": 3.3312587984657246e-06, + "loss": 0.1006, + "step": 21530 + }, + { + "epoch": 1.8609875156594238, + "grad_norm": 0.6100355923575964, + "learning_rate": 3.326863057147295e-06, + "loss": 0.1039, + "step": 21540 + }, + { + "epoch": 1.8618514838653937, + "grad_norm": 0.5762711693846725, + "learning_rate": 3.3224687715796656e-06, + "loss": 0.1014, + "step": 21550 + }, + { + "epoch": 1.8627154520713638, + "grad_norm": 0.5366489167927376, + "learning_rate": 3.3180759455861798e-06, + "loss": 0.1003, + "step": 21560 + }, + { + "epoch": 1.863579420277334, + "grad_norm": 0.5662365857970131, + "learning_rate": 3.3136845829889185e-06, + "loss": 0.1027, + "step": 21570 + }, + { + "epoch": 1.8644433884833038, + "grad_norm": 0.5459962285652393, + "learning_rate": 3.309294687608685e-06, + "loss": 0.1005, + "step": 21580 + }, + { + "epoch": 1.8653073566892737, + "grad_norm": 0.573806154613363, + "learning_rate": 3.3049062632650098e-06, + "loss": 0.1011, + "step": 21590 + }, + { + "epoch": 1.8661713248952438, + "grad_norm": 0.5266529570129646, + "learning_rate": 3.3005193137761376e-06, + "loss": 0.1023, + "step": 21600 + }, + { + "epoch": 1.867035293101214, + "grad_norm": 0.5777135015426488, + "learning_rate": 3.2961338429590364e-06, + "loss": 0.1022, + "step": 21610 + }, + { + "epoch": 1.8678992613071839, + "grad_norm": 0.5710616793282451, + "learning_rate": 3.2917498546293823e-06, + "loss": 0.1024, + "step": 21620 + }, + { + "epoch": 1.8687632295131538, + "grad_norm": 0.5774894136897458, + "learning_rate": 3.287367352601568e-06, + "loss": 0.1003, + "step": 21630 + }, + { + "epoch": 1.869627197719124, + "grad_norm": 0.5741261431440448, + "learning_rate": 3.282986340688684e-06, + "loss": 0.1028, + "step": 21640 + }, + { + "epoch": 1.870491165925094, + "grad_norm": 0.5883458021179613, + "learning_rate": 3.278606822702532e-06, + "loss": 0.1008, + "step": 21650 + }, + { + "epoch": 1.871355134131064, + "grad_norm": 0.574927450088088, + "learning_rate": 3.274228802453611e-06, + "loss": 0.1003, + "step": 21660 + }, + { + "epoch": 1.872219102337034, + "grad_norm": 0.5477269695224006, + "learning_rate": 3.269852283751117e-06, + "loss": 0.1023, + "step": 21670 + }, + { + "epoch": 1.8730830705430042, + "grad_norm": 0.5438420121822475, + "learning_rate": 3.2654772704029385e-06, + "loss": 0.1005, + "step": 21680 + }, + { + "epoch": 1.873947038748974, + "grad_norm": 0.5773522247749471, + "learning_rate": 3.2611037662156553e-06, + "loss": 0.101, + "step": 21690 + }, + { + "epoch": 1.874811006954944, + "grad_norm": 0.5270650909460688, + "learning_rate": 3.256731774994534e-06, + "loss": 0.1009, + "step": 21700 + }, + { + "epoch": 1.875674975160914, + "grad_norm": 0.5697439029107149, + "learning_rate": 3.2523613005435273e-06, + "loss": 0.1018, + "step": 21710 + }, + { + "epoch": 1.8765389433668842, + "grad_norm": 0.5889980436086851, + "learning_rate": 3.247992346665262e-06, + "loss": 0.103, + "step": 21720 + }, + { + "epoch": 1.877402911572854, + "grad_norm": 0.5855966040283817, + "learning_rate": 3.243624917161048e-06, + "loss": 0.1004, + "step": 21730 + }, + { + "epoch": 1.878266879778824, + "grad_norm": 0.5645627450678173, + "learning_rate": 3.239259015830865e-06, + "loss": 0.1003, + "step": 21740 + }, + { + "epoch": 1.8791308479847941, + "grad_norm": 0.5607021768032766, + "learning_rate": 3.234894646473368e-06, + "loss": 0.1045, + "step": 21750 + }, + { + "epoch": 1.8799948161907643, + "grad_norm": 0.6018622097238112, + "learning_rate": 3.230531812885872e-06, + "loss": 0.1023, + "step": 21760 + }, + { + "epoch": 1.8808587843967342, + "grad_norm": 0.5716356881800182, + "learning_rate": 3.2261705188643612e-06, + "loss": 0.1022, + "step": 21770 + }, + { + "epoch": 1.8817227526027043, + "grad_norm": 0.5520205261963792, + "learning_rate": 3.2218107682034775e-06, + "loss": 0.1013, + "step": 21780 + }, + { + "epoch": 1.8825867208086744, + "grad_norm": 0.5495910812136405, + "learning_rate": 3.217452564696522e-06, + "loss": 0.1014, + "step": 21790 + }, + { + "epoch": 1.8834506890146443, + "grad_norm": 0.5624942994289897, + "learning_rate": 3.2130959121354475e-06, + "loss": 0.1001, + "step": 21800 + }, + { + "epoch": 1.8843146572206142, + "grad_norm": 0.6136256619750384, + "learning_rate": 3.208740814310859e-06, + "loss": 0.1002, + "step": 21810 + }, + { + "epoch": 1.8851786254265843, + "grad_norm": 0.5520828631133291, + "learning_rate": 3.20438727501201e-06, + "loss": 0.1051, + "step": 21820 + }, + { + "epoch": 1.8860425936325544, + "grad_norm": 0.6030386575456828, + "learning_rate": 3.2000352980267936e-06, + "loss": 0.1001, + "step": 21830 + }, + { + "epoch": 1.8869065618385243, + "grad_norm": 0.5712910795331343, + "learning_rate": 3.195684887141747e-06, + "loss": 0.0997, + "step": 21840 + }, + { + "epoch": 1.8877705300444942, + "grad_norm": 0.5486849114411699, + "learning_rate": 3.191336046142044e-06, + "loss": 0.101, + "step": 21850 + }, + { + "epoch": 1.8886344982504644, + "grad_norm": 0.6141548602098744, + "learning_rate": 3.1869887788114943e-06, + "loss": 0.1029, + "step": 21860 + }, + { + "epoch": 1.8894984664564345, + "grad_norm": 0.5775840450930885, + "learning_rate": 3.1826430889325344e-06, + "loss": 0.1029, + "step": 21870 + }, + { + "epoch": 1.8903624346624044, + "grad_norm": 0.5540741540255073, + "learning_rate": 3.1782989802862313e-06, + "loss": 0.0997, + "step": 21880 + }, + { + "epoch": 1.8912264028683743, + "grad_norm": 0.5645731722914674, + "learning_rate": 3.173956456652274e-06, + "loss": 0.1025, + "step": 21890 + }, + { + "epoch": 1.8920903710743444, + "grad_norm": 0.5934655084949406, + "learning_rate": 3.169615521808978e-06, + "loss": 0.0969, + "step": 21900 + }, + { + "epoch": 1.8929543392803145, + "grad_norm": 0.5541470186648709, + "learning_rate": 3.165276179533267e-06, + "loss": 0.1033, + "step": 21910 + }, + { + "epoch": 1.8938183074862844, + "grad_norm": 0.5738429556296889, + "learning_rate": 3.1609384336006866e-06, + "loss": 0.1016, + "step": 21920 + }, + { + "epoch": 1.8946822756922546, + "grad_norm": 0.546682642116901, + "learning_rate": 3.156602287785391e-06, + "loss": 0.1013, + "step": 21930 + }, + { + "epoch": 1.8955462438982247, + "grad_norm": 0.5727258889036533, + "learning_rate": 3.1522677458601447e-06, + "loss": 0.1018, + "step": 21940 + }, + { + "epoch": 1.8964102121041946, + "grad_norm": 0.5750188686261928, + "learning_rate": 3.1479348115963105e-06, + "loss": 0.0991, + "step": 21950 + }, + { + "epoch": 1.8972741803101645, + "grad_norm": 0.5675659013009478, + "learning_rate": 3.143603488763858e-06, + "loss": 0.1002, + "step": 21960 + }, + { + "epoch": 1.8981381485161346, + "grad_norm": 0.5432009469281879, + "learning_rate": 3.1392737811313535e-06, + "loss": 0.1008, + "step": 21970 + }, + { + "epoch": 1.8990021167221047, + "grad_norm": 0.5323856224850143, + "learning_rate": 3.134945692465957e-06, + "loss": 0.1009, + "step": 21980 + }, + { + "epoch": 1.8998660849280746, + "grad_norm": 0.5519361955646229, + "learning_rate": 3.13061922653342e-06, + "loss": 0.1026, + "step": 21990 + }, + { + "epoch": 1.9007300531340445, + "grad_norm": 0.5620259952273624, + "learning_rate": 3.1262943870980823e-06, + "loss": 0.0992, + "step": 22000 + }, + { + "epoch": 1.9015940213400147, + "grad_norm": 0.5686870753402948, + "learning_rate": 3.121971177922869e-06, + "loss": 0.1001, + "step": 22010 + }, + { + "epoch": 1.9024579895459848, + "grad_norm": 0.6088252851839825, + "learning_rate": 3.1176496027692886e-06, + "loss": 0.1027, + "step": 22020 + }, + { + "epoch": 1.9033219577519547, + "grad_norm": 0.5844603975272236, + "learning_rate": 3.113329665397422e-06, + "loss": 0.1015, + "step": 22030 + }, + { + "epoch": 1.9041859259579248, + "grad_norm": 0.5606232894350783, + "learning_rate": 3.1090113695659297e-06, + "loss": 0.1022, + "step": 22040 + }, + { + "epoch": 1.905049894163895, + "grad_norm": 0.5659034726528784, + "learning_rate": 3.1046947190320463e-06, + "loss": 0.1013, + "step": 22050 + }, + { + "epoch": 1.9059138623698648, + "grad_norm": 0.5340403917026766, + "learning_rate": 3.100379717551567e-06, + "loss": 0.0991, + "step": 22060 + }, + { + "epoch": 1.9067778305758347, + "grad_norm": 0.5618283207865042, + "learning_rate": 3.0960663688788596e-06, + "loss": 0.0996, + "step": 22070 + }, + { + "epoch": 1.9076417987818048, + "grad_norm": 0.5531777403622569, + "learning_rate": 3.0917546767668504e-06, + "loss": 0.103, + "step": 22080 + }, + { + "epoch": 1.908505766987775, + "grad_norm": 0.5497027735815795, + "learning_rate": 3.0874446449670258e-06, + "loss": 0.0972, + "step": 22090 + }, + { + "epoch": 1.9093697351937449, + "grad_norm": 0.5697514613470495, + "learning_rate": 3.083136277229426e-06, + "loss": 0.1011, + "step": 22100 + }, + { + "epoch": 1.9102337033997148, + "grad_norm": 0.5476742899641973, + "learning_rate": 3.0788295773026437e-06, + "loss": 0.1014, + "step": 22110 + }, + { + "epoch": 1.9110976716056849, + "grad_norm": 0.5677127957502706, + "learning_rate": 3.0745245489338217e-06, + "loss": 0.1023, + "step": 22120 + }, + { + "epoch": 1.911961639811655, + "grad_norm": 0.5983383473334517, + "learning_rate": 3.0702211958686503e-06, + "loss": 0.1004, + "step": 22130 + }, + { + "epoch": 1.912825608017625, + "grad_norm": 0.5395467401934758, + "learning_rate": 3.065919521851354e-06, + "loss": 0.1011, + "step": 22140 + }, + { + "epoch": 1.9136895762235948, + "grad_norm": 0.5372402138956013, + "learning_rate": 3.0616195306247054e-06, + "loss": 0.0996, + "step": 22150 + }, + { + "epoch": 1.9145535444295652, + "grad_norm": 0.5403566249572875, + "learning_rate": 3.057321225930009e-06, + "loss": 0.0999, + "step": 22160 + }, + { + "epoch": 1.915417512635535, + "grad_norm": 0.5544226990298059, + "learning_rate": 3.0530246115071036e-06, + "loss": 0.1, + "step": 22170 + }, + { + "epoch": 1.916281480841505, + "grad_norm": 0.5826205818540283, + "learning_rate": 3.0487296910943543e-06, + "loss": 0.0996, + "step": 22180 + }, + { + "epoch": 1.917145449047475, + "grad_norm": 0.5743500225747563, + "learning_rate": 3.044436468428655e-06, + "loss": 0.103, + "step": 22190 + }, + { + "epoch": 1.9180094172534452, + "grad_norm": 0.5633148475444488, + "learning_rate": 3.040144947245422e-06, + "loss": 0.1002, + "step": 22200 + }, + { + "epoch": 1.918873385459415, + "grad_norm": 0.5352323628595675, + "learning_rate": 3.0358551312785912e-06, + "loss": 0.0993, + "step": 22210 + }, + { + "epoch": 1.919737353665385, + "grad_norm": 0.5599302140701811, + "learning_rate": 3.0315670242606123e-06, + "loss": 0.1013, + "step": 22220 + }, + { + "epoch": 1.9206013218713551, + "grad_norm": 0.5660720954378141, + "learning_rate": 3.02728062992245e-06, + "loss": 0.1014, + "step": 22230 + }, + { + "epoch": 1.9214652900773252, + "grad_norm": 0.5380748855737479, + "learning_rate": 3.0229959519935814e-06, + "loss": 0.101, + "step": 22240 + }, + { + "epoch": 1.9223292582832952, + "grad_norm": 0.5884805058572077, + "learning_rate": 3.018712994201989e-06, + "loss": 0.102, + "step": 22250 + }, + { + "epoch": 1.923193226489265, + "grad_norm": 0.5874042489492537, + "learning_rate": 3.0144317602741535e-06, + "loss": 0.1006, + "step": 22260 + }, + { + "epoch": 1.9240571946952352, + "grad_norm": 0.5327072747468091, + "learning_rate": 3.010152253935061e-06, + "loss": 0.0965, + "step": 22270 + }, + { + "epoch": 1.9249211629012053, + "grad_norm": 0.5672748667596161, + "learning_rate": 3.005874478908195e-06, + "loss": 0.0992, + "step": 22280 + }, + { + "epoch": 1.9257851311071752, + "grad_norm": 0.5779026537091281, + "learning_rate": 3.00159843891553e-06, + "loss": 0.1008, + "step": 22290 + }, + { + "epoch": 1.9266490993131453, + "grad_norm": 0.5775941103917845, + "learning_rate": 2.9973241376775314e-06, + "loss": 0.102, + "step": 22300 + }, + { + "epoch": 1.9275130675191154, + "grad_norm": 0.5614889405016127, + "learning_rate": 2.9930515789131507e-06, + "loss": 0.1024, + "step": 22310 + }, + { + "epoch": 1.9283770357250853, + "grad_norm": 0.5207831073526223, + "learning_rate": 2.9887807663398283e-06, + "loss": 0.0995, + "step": 22320 + }, + { + "epoch": 1.9292410039310552, + "grad_norm": 0.5518036833957318, + "learning_rate": 2.9845117036734773e-06, + "loss": 0.0981, + "step": 22330 + }, + { + "epoch": 1.9301049721370254, + "grad_norm": 0.5848049624329538, + "learning_rate": 2.9802443946284944e-06, + "loss": 0.1027, + "step": 22340 + }, + { + "epoch": 1.9309689403429955, + "grad_norm": 0.5595521149024182, + "learning_rate": 2.9759788429177493e-06, + "loss": 0.0969, + "step": 22350 + }, + { + "epoch": 1.9318329085489654, + "grad_norm": 0.5774209309801077, + "learning_rate": 2.9717150522525838e-06, + "loss": 0.098, + "step": 22360 + }, + { + "epoch": 1.9326968767549353, + "grad_norm": 0.582103892789509, + "learning_rate": 2.967453026342802e-06, + "loss": 0.1037, + "step": 22370 + }, + { + "epoch": 1.9335608449609054, + "grad_norm": 0.596134016903453, + "learning_rate": 2.9631927688966783e-06, + "loss": 0.1022, + "step": 22380 + }, + { + "epoch": 1.9344248131668755, + "grad_norm": 0.5787978494309993, + "learning_rate": 2.9589342836209473e-06, + "loss": 0.1014, + "step": 22390 + }, + { + "epoch": 1.9352887813728454, + "grad_norm": 0.5598636799101697, + "learning_rate": 2.9546775742208e-06, + "loss": 0.0982, + "step": 22400 + }, + { + "epoch": 1.9361527495788153, + "grad_norm": 0.5457627399040831, + "learning_rate": 2.9504226443998827e-06, + "loss": 0.1003, + "step": 22410 + }, + { + "epoch": 1.9370167177847857, + "grad_norm": 0.5273927080875033, + "learning_rate": 2.946169497860294e-06, + "loss": 0.0977, + "step": 22420 + }, + { + "epoch": 1.9378806859907556, + "grad_norm": 0.5515535907865661, + "learning_rate": 2.94191813830258e-06, + "loss": 0.1016, + "step": 22430 + }, + { + "epoch": 1.9387446541967255, + "grad_norm": 0.5858401856027835, + "learning_rate": 2.9376685694257358e-06, + "loss": 0.098, + "step": 22440 + }, + { + "epoch": 1.9396086224026956, + "grad_norm": 0.5326821139702289, + "learning_rate": 2.933420794927191e-06, + "loss": 0.0999, + "step": 22450 + }, + { + "epoch": 1.9404725906086657, + "grad_norm": 0.6165635449377432, + "learning_rate": 2.9291748185028197e-06, + "loss": 0.099, + "step": 22460 + }, + { + "epoch": 1.9413365588146356, + "grad_norm": 0.5420456009991401, + "learning_rate": 2.9249306438469305e-06, + "loss": 0.0992, + "step": 22470 + }, + { + "epoch": 1.9422005270206055, + "grad_norm": 0.57820086837121, + "learning_rate": 2.9206882746522665e-06, + "loss": 0.1002, + "step": 22480 + }, + { + "epoch": 1.9430644952265756, + "grad_norm": 0.5818903495695285, + "learning_rate": 2.9164477146099923e-06, + "loss": 0.0996, + "step": 22490 + }, + { + "epoch": 1.9439284634325458, + "grad_norm": 0.5443267800752883, + "learning_rate": 2.9122089674097076e-06, + "loss": 0.1014, + "step": 22500 + }, + { + "epoch": 1.9447924316385157, + "grad_norm": 0.5500539383572447, + "learning_rate": 2.907972036739427e-06, + "loss": 0.1022, + "step": 22510 + }, + { + "epoch": 1.9456563998444856, + "grad_norm": 0.6347343648821124, + "learning_rate": 2.9037369262855947e-06, + "loss": 0.1, + "step": 22520 + }, + { + "epoch": 1.9465203680504557, + "grad_norm": 0.5514477780439476, + "learning_rate": 2.8995036397330563e-06, + "loss": 0.0992, + "step": 22530 + }, + { + "epoch": 1.9473843362564258, + "grad_norm": 0.586202809665758, + "learning_rate": 2.8952721807650854e-06, + "loss": 0.105, + "step": 22540 + }, + { + "epoch": 1.9482483044623957, + "grad_norm": 0.5545892197299369, + "learning_rate": 2.8910425530633556e-06, + "loss": 0.0984, + "step": 22550 + }, + { + "epoch": 1.9491122726683658, + "grad_norm": 0.5730214745435419, + "learning_rate": 2.886814760307952e-06, + "loss": 0.0993, + "step": 22560 + }, + { + "epoch": 1.949976240874336, + "grad_norm": 0.5458153748028479, + "learning_rate": 2.8825888061773623e-06, + "loss": 0.0988, + "step": 22570 + }, + { + "epoch": 1.9508402090803059, + "grad_norm": 0.5921035939497012, + "learning_rate": 2.87836469434847e-06, + "loss": 0.0946, + "step": 22580 + }, + { + "epoch": 1.9517041772862758, + "grad_norm": 0.5560783693469633, + "learning_rate": 2.8741424284965645e-06, + "loss": 0.1013, + "step": 22590 + }, + { + "epoch": 1.9525681454922459, + "grad_norm": 0.549086003919069, + "learning_rate": 2.869922012295322e-06, + "loss": 0.1005, + "step": 22600 + }, + { + "epoch": 1.953432113698216, + "grad_norm": 0.5793251391076313, + "learning_rate": 2.865703449416812e-06, + "loss": 0.1035, + "step": 22610 + }, + { + "epoch": 1.954296081904186, + "grad_norm": 0.5570124709906242, + "learning_rate": 2.8614867435314877e-06, + "loss": 0.0954, + "step": 22620 + }, + { + "epoch": 1.9551600501101558, + "grad_norm": 0.5329637013463601, + "learning_rate": 2.8572718983081972e-06, + "loss": 0.0993, + "step": 22630 + }, + { + "epoch": 1.956024018316126, + "grad_norm": 0.5332785700544586, + "learning_rate": 2.853058917414153e-06, + "loss": 0.0956, + "step": 22640 + }, + { + "epoch": 1.956887986522096, + "grad_norm": 0.570819195946033, + "learning_rate": 2.848847804514962e-06, + "loss": 0.1007, + "step": 22650 + }, + { + "epoch": 1.957751954728066, + "grad_norm": 0.5495513030265846, + "learning_rate": 2.844638563274596e-06, + "loss": 0.1002, + "step": 22660 + }, + { + "epoch": 1.958615922934036, + "grad_norm": 0.5975360173842512, + "learning_rate": 2.840431197355401e-06, + "loss": 0.1002, + "step": 22670 + }, + { + "epoch": 1.9594798911400062, + "grad_norm": 0.560563297872084, + "learning_rate": 2.836225710418091e-06, + "loss": 0.0988, + "step": 22680 + }, + { + "epoch": 1.960343859345976, + "grad_norm": 0.5717780583220105, + "learning_rate": 2.8320221061217446e-06, + "loss": 0.1013, + "step": 22690 + }, + { + "epoch": 1.961207827551946, + "grad_norm": 0.5525346469001077, + "learning_rate": 2.827820388123806e-06, + "loss": 0.1004, + "step": 22700 + }, + { + "epoch": 1.9620717957579161, + "grad_norm": 0.5827285043087085, + "learning_rate": 2.823620560080074e-06, + "loss": 0.1011, + "step": 22710 + }, + { + "epoch": 1.9629357639638862, + "grad_norm": 0.5752788801952603, + "learning_rate": 2.8194226256447033e-06, + "loss": 0.099, + "step": 22720 + }, + { + "epoch": 1.9637997321698561, + "grad_norm": 0.557491580653571, + "learning_rate": 2.815226588470201e-06, + "loss": 0.1018, + "step": 22730 + }, + { + "epoch": 1.964663700375826, + "grad_norm": 0.5916884024884534, + "learning_rate": 2.811032452207429e-06, + "loss": 0.0974, + "step": 22740 + }, + { + "epoch": 1.9655276685817962, + "grad_norm": 0.5386096553878962, + "learning_rate": 2.8068402205055878e-06, + "loss": 0.0989, + "step": 22750 + }, + { + "epoch": 1.9663916367877663, + "grad_norm": 0.5710691676756725, + "learning_rate": 2.8026498970122245e-06, + "loss": 0.0955, + "step": 22760 + }, + { + "epoch": 1.9672556049937362, + "grad_norm": 0.5556909406918205, + "learning_rate": 2.798461485373225e-06, + "loss": 0.1005, + "step": 22770 + }, + { + "epoch": 1.968119573199706, + "grad_norm": 0.557444256716542, + "learning_rate": 2.79427498923281e-06, + "loss": 0.0969, + "step": 22780 + }, + { + "epoch": 1.9689835414056762, + "grad_norm": 0.5905330627179569, + "learning_rate": 2.7900904122335415e-06, + "loss": 0.0987, + "step": 22790 + }, + { + "epoch": 1.9698475096116463, + "grad_norm": 0.5826168610406894, + "learning_rate": 2.7859077580162976e-06, + "loss": 0.0997, + "step": 22800 + }, + { + "epoch": 1.9707114778176162, + "grad_norm": 0.5600933255055477, + "learning_rate": 2.781727030220298e-06, + "loss": 0.0986, + "step": 22810 + }, + { + "epoch": 1.9715754460235864, + "grad_norm": 0.54806352388108, + "learning_rate": 2.777548232483075e-06, + "loss": 0.0976, + "step": 22820 + }, + { + "epoch": 1.9724394142295565, + "grad_norm": 0.5551221906142857, + "learning_rate": 2.773371368440494e-06, + "loss": 0.0983, + "step": 22830 + }, + { + "epoch": 1.9733033824355264, + "grad_norm": 0.5723114218083482, + "learning_rate": 2.7691964417267214e-06, + "loss": 0.0996, + "step": 22840 + }, + { + "epoch": 1.9741673506414963, + "grad_norm": 0.6044292557590281, + "learning_rate": 2.765023455974253e-06, + "loss": 0.0982, + "step": 22850 + }, + { + "epoch": 1.9750313188474664, + "grad_norm": 0.5746034008271442, + "learning_rate": 2.7608524148138878e-06, + "loss": 0.0988, + "step": 22860 + }, + { + "epoch": 1.9758952870534365, + "grad_norm": 0.6032867504998831, + "learning_rate": 2.7566833218747353e-06, + "loss": 0.0999, + "step": 22870 + }, + { + "epoch": 1.9767592552594064, + "grad_norm": 0.5802365470670267, + "learning_rate": 2.7525161807842095e-06, + "loss": 0.1009, + "step": 22880 + }, + { + "epoch": 1.9776232234653763, + "grad_norm": 0.5781767957801193, + "learning_rate": 2.748350995168023e-06, + "loss": 0.0985, + "step": 22890 + }, + { + "epoch": 1.9784871916713465, + "grad_norm": 0.5519359594377506, + "learning_rate": 2.744187768650198e-06, + "loss": 0.0989, + "step": 22900 + }, + { + "epoch": 1.9793511598773166, + "grad_norm": 0.573944538698511, + "learning_rate": 2.7400265048530355e-06, + "loss": 0.0983, + "step": 22910 + }, + { + "epoch": 1.9802151280832865, + "grad_norm": 0.5723397613793625, + "learning_rate": 2.7358672073971433e-06, + "loss": 0.098, + "step": 22920 + }, + { + "epoch": 1.9810790962892566, + "grad_norm": 0.5510971107060265, + "learning_rate": 2.7317098799014086e-06, + "loss": 0.1021, + "step": 22930 + }, + { + "epoch": 1.9819430644952267, + "grad_norm": 0.5351223179335077, + "learning_rate": 2.727554525983015e-06, + "loss": 0.0958, + "step": 22940 + }, + { + "epoch": 1.9828070327011966, + "grad_norm": 0.5911713108321606, + "learning_rate": 2.723401149257414e-06, + "loss": 0.0987, + "step": 22950 + }, + { + "epoch": 1.9836710009071665, + "grad_norm": 0.555620541087196, + "learning_rate": 2.7192497533383517e-06, + "loss": 0.0974, + "step": 22960 + }, + { + "epoch": 1.9845349691131366, + "grad_norm": 0.531489300609613, + "learning_rate": 2.7151003418378426e-06, + "loss": 0.0998, + "step": 22970 + }, + { + "epoch": 1.9853989373191068, + "grad_norm": 0.56067493893783, + "learning_rate": 2.710952918366176e-06, + "loss": 0.0986, + "step": 22980 + }, + { + "epoch": 1.9862629055250767, + "grad_norm": 0.5115950990313571, + "learning_rate": 2.706807486531912e-06, + "loss": 0.0993, + "step": 22990 + }, + { + "epoch": 1.9871268737310466, + "grad_norm": 0.56036178938333, + "learning_rate": 2.7026640499418764e-06, + "loss": 0.0959, + "step": 23000 + }, + { + "epoch": 1.9879908419370167, + "grad_norm": 0.5820731208668521, + "learning_rate": 2.698522612201163e-06, + "loss": 0.1015, + "step": 23010 + }, + { + "epoch": 1.9888548101429868, + "grad_norm": 0.5561062988471792, + "learning_rate": 2.6943831769131235e-06, + "loss": 0.0998, + "step": 23020 + }, + { + "epoch": 1.9897187783489567, + "grad_norm": 0.550576715080664, + "learning_rate": 2.690245747679366e-06, + "loss": 0.0995, + "step": 23030 + }, + { + "epoch": 1.9905827465549266, + "grad_norm": 0.5607772814673408, + "learning_rate": 2.686110328099753e-06, + "loss": 0.0981, + "step": 23040 + }, + { + "epoch": 1.991446714760897, + "grad_norm": 0.5649321440620626, + "learning_rate": 2.681976921772405e-06, + "loss": 0.0948, + "step": 23050 + }, + { + "epoch": 1.9923106829668669, + "grad_norm": 0.5547901042849294, + "learning_rate": 2.6778455322936837e-06, + "loss": 0.1017, + "step": 23060 + }, + { + "epoch": 1.9931746511728368, + "grad_norm": 0.5433895881280426, + "learning_rate": 2.6737161632581983e-06, + "loss": 0.0972, + "step": 23070 + }, + { + "epoch": 1.9940386193788069, + "grad_norm": 0.5879816117729628, + "learning_rate": 2.6695888182588005e-06, + "loss": 0.0983, + "step": 23080 + }, + { + "epoch": 1.994902587584777, + "grad_norm": 0.5639706915617735, + "learning_rate": 2.665463500886577e-06, + "loss": 0.0976, + "step": 23090 + }, + { + "epoch": 1.995766555790747, + "grad_norm": 0.5597079533515157, + "learning_rate": 2.661340214730862e-06, + "loss": 0.0992, + "step": 23100 + }, + { + "epoch": 1.9966305239967168, + "grad_norm": 0.5975735164456436, + "learning_rate": 2.6572189633792052e-06, + "loss": 0.0966, + "step": 23110 + }, + { + "epoch": 1.997494492202687, + "grad_norm": 0.5431942288707596, + "learning_rate": 2.6530997504174007e-06, + "loss": 0.099, + "step": 23120 + }, + { + "epoch": 1.998358460408657, + "grad_norm": 0.5610255705894158, + "learning_rate": 2.64898257942946e-06, + "loss": 0.1023, + "step": 23130 + }, + { + "epoch": 1.999222428614627, + "grad_norm": 0.5565024056381163, + "learning_rate": 2.6448674539976264e-06, + "loss": 0.0952, + "step": 23140 + }, + { + "epoch": 2.000086396820597, + "grad_norm": 0.5762272252445255, + "learning_rate": 2.6407543777023514e-06, + "loss": 0.096, + "step": 23150 + }, + { + "epoch": 2.000950365026567, + "grad_norm": 0.5941202253662053, + "learning_rate": 2.6366433541223156e-06, + "loss": 0.0673, + "step": 23160 + }, + { + "epoch": 2.001814333232537, + "grad_norm": 0.6016764334805269, + "learning_rate": 2.6325343868344064e-06, + "loss": 0.0668, + "step": 23170 + }, + { + "epoch": 2.002678301438507, + "grad_norm": 0.5768547489229648, + "learning_rate": 2.6284274794137243e-06, + "loss": 0.0668, + "step": 23180 + }, + { + "epoch": 2.003542269644477, + "grad_norm": 0.5653152032508831, + "learning_rate": 2.624322635433577e-06, + "loss": 0.0636, + "step": 23190 + }, + { + "epoch": 2.0044062378504472, + "grad_norm": 0.6356562558227943, + "learning_rate": 2.620219858465477e-06, + "loss": 0.0653, + "step": 23200 + }, + { + "epoch": 2.005270206056417, + "grad_norm": 0.6010359673980143, + "learning_rate": 2.616119152079142e-06, + "loss": 0.0668, + "step": 23210 + }, + { + "epoch": 2.006134174262387, + "grad_norm": 0.5909469980255928, + "learning_rate": 2.6120205198424786e-06, + "loss": 0.0637, + "step": 23220 + }, + { + "epoch": 2.006998142468357, + "grad_norm": 0.5797202598631477, + "learning_rate": 2.6079239653215994e-06, + "loss": 0.0665, + "step": 23230 + }, + { + "epoch": 2.0078621106743273, + "grad_norm": 0.5595299434577446, + "learning_rate": 2.603829492080802e-06, + "loss": 0.0648, + "step": 23240 + }, + { + "epoch": 2.008726078880297, + "grad_norm": 0.5868298303035886, + "learning_rate": 2.5997371036825814e-06, + "loss": 0.0649, + "step": 23250 + }, + { + "epoch": 2.009590047086267, + "grad_norm": 0.6136273885472115, + "learning_rate": 2.5956468036876046e-06, + "loss": 0.0658, + "step": 23260 + }, + { + "epoch": 2.0104540152922374, + "grad_norm": 0.6199336639686098, + "learning_rate": 2.591558595654737e-06, + "loss": 0.0661, + "step": 23270 + }, + { + "epoch": 2.0113179834982073, + "grad_norm": 0.5663941452077751, + "learning_rate": 2.587472483141015e-06, + "loss": 0.0635, + "step": 23280 + }, + { + "epoch": 2.0121819517041772, + "grad_norm": 0.5937353796830521, + "learning_rate": 2.5833884697016513e-06, + "loss": 0.0652, + "step": 23290 + }, + { + "epoch": 2.013045919910147, + "grad_norm": 0.6077597290157493, + "learning_rate": 2.5793065588900367e-06, + "loss": 0.0635, + "step": 23300 + }, + { + "epoch": 2.0139098881161175, + "grad_norm": 0.608614512624362, + "learning_rate": 2.575226754257728e-06, + "loss": 0.0648, + "step": 23310 + }, + { + "epoch": 2.0147738563220874, + "grad_norm": 0.6411985030217807, + "learning_rate": 2.571149059354456e-06, + "loss": 0.065, + "step": 23320 + }, + { + "epoch": 2.0156378245280573, + "grad_norm": 0.5980106388366775, + "learning_rate": 2.5670734777281093e-06, + "loss": 0.0645, + "step": 23330 + }, + { + "epoch": 2.016501792734027, + "grad_norm": 0.6363825389790632, + "learning_rate": 2.56300001292474e-06, + "loss": 0.0625, + "step": 23340 + }, + { + "epoch": 2.0173657609399975, + "grad_norm": 0.6223195747565223, + "learning_rate": 2.5589286684885584e-06, + "loss": 0.0647, + "step": 23350 + }, + { + "epoch": 2.0182297291459674, + "grad_norm": 0.6163821451502053, + "learning_rate": 2.5548594479619315e-06, + "loss": 0.0653, + "step": 23360 + }, + { + "epoch": 2.0190936973519373, + "grad_norm": 0.5917851425986639, + "learning_rate": 2.5507923548853775e-06, + "loss": 0.0661, + "step": 23370 + }, + { + "epoch": 2.0199576655579077, + "grad_norm": 0.6484282253432566, + "learning_rate": 2.546727392797561e-06, + "loss": 0.0613, + "step": 23380 + }, + { + "epoch": 2.0208216337638776, + "grad_norm": 0.6039945969146345, + "learning_rate": 2.5426645652352968e-06, + "loss": 0.0639, + "step": 23390 + }, + { + "epoch": 2.0216856019698475, + "grad_norm": 0.6437986864420733, + "learning_rate": 2.5386038757335364e-06, + "loss": 0.0646, + "step": 23400 + }, + { + "epoch": 2.0225495701758174, + "grad_norm": 0.6132933888487462, + "learning_rate": 2.5345453278253818e-06, + "loss": 0.0617, + "step": 23410 + }, + { + "epoch": 2.0234135383817877, + "grad_norm": 0.6140098088819063, + "learning_rate": 2.530488925042056e-06, + "loss": 0.0637, + "step": 23420 + }, + { + "epoch": 2.0242775065877576, + "grad_norm": 0.6116175505543225, + "learning_rate": 2.5264346709129315e-06, + "loss": 0.0656, + "step": 23430 + }, + { + "epoch": 2.0251414747937275, + "grad_norm": 0.6161269270983905, + "learning_rate": 2.5223825689655024e-06, + "loss": 0.0633, + "step": 23440 + }, + { + "epoch": 2.0260054429996974, + "grad_norm": 0.6097595430473298, + "learning_rate": 2.5183326227253915e-06, + "loss": 0.064, + "step": 23450 + }, + { + "epoch": 2.0268694112056678, + "grad_norm": 0.6039768047331386, + "learning_rate": 2.514284835716343e-06, + "loss": 0.0651, + "step": 23460 + }, + { + "epoch": 2.0277333794116377, + "grad_norm": 0.5964483205003952, + "learning_rate": 2.5102392114602326e-06, + "loss": 0.0626, + "step": 23470 + }, + { + "epoch": 2.0285973476176076, + "grad_norm": 0.6335610652801382, + "learning_rate": 2.506195753477043e-06, + "loss": 0.0648, + "step": 23480 + }, + { + "epoch": 2.0294613158235775, + "grad_norm": 0.6032288219480831, + "learning_rate": 2.502154465284879e-06, + "loss": 0.0647, + "step": 23490 + }, + { + "epoch": 2.030325284029548, + "grad_norm": 0.6081416747954673, + "learning_rate": 2.4981153503999536e-06, + "loss": 0.0636, + "step": 23500 + }, + { + "epoch": 2.0311892522355177, + "grad_norm": 0.6190433175441461, + "learning_rate": 2.494078412336589e-06, + "loss": 0.0626, + "step": 23510 + }, + { + "epoch": 2.0320532204414876, + "grad_norm": 0.6352529516782883, + "learning_rate": 2.4900436546072206e-06, + "loss": 0.0657, + "step": 23520 + }, + { + "epoch": 2.032917188647458, + "grad_norm": 0.60396342566828, + "learning_rate": 2.486011080722374e-06, + "loss": 0.0638, + "step": 23530 + }, + { + "epoch": 2.033781156853428, + "grad_norm": 0.6321000602851499, + "learning_rate": 2.4819806941906855e-06, + "loss": 0.0632, + "step": 23540 + }, + { + "epoch": 2.0346451250593978, + "grad_norm": 0.6408999563026007, + "learning_rate": 2.477952498518883e-06, + "loss": 0.0672, + "step": 23550 + }, + { + "epoch": 2.0355090932653677, + "grad_norm": 0.6099633041788447, + "learning_rate": 2.473926497211794e-06, + "loss": 0.063, + "step": 23560 + }, + { + "epoch": 2.036373061471338, + "grad_norm": 0.6080948793986606, + "learning_rate": 2.469902693772325e-06, + "loss": 0.0611, + "step": 23570 + }, + { + "epoch": 2.037237029677308, + "grad_norm": 0.6341190621707892, + "learning_rate": 2.4658810917014823e-06, + "loss": 0.0637, + "step": 23580 + }, + { + "epoch": 2.038100997883278, + "grad_norm": 0.6540975857228662, + "learning_rate": 2.4618616944983525e-06, + "loss": 0.0644, + "step": 23590 + }, + { + "epoch": 2.0389649660892477, + "grad_norm": 0.6366781118625203, + "learning_rate": 2.457844505660102e-06, + "loss": 0.0648, + "step": 23600 + }, + { + "epoch": 2.039828934295218, + "grad_norm": 0.6317871739227648, + "learning_rate": 2.453829528681977e-06, + "loss": 0.0639, + "step": 23610 + }, + { + "epoch": 2.040692902501188, + "grad_norm": 0.625094462950415, + "learning_rate": 2.4498167670572977e-06, + "loss": 0.0661, + "step": 23620 + }, + { + "epoch": 2.041556870707158, + "grad_norm": 0.6075843535052505, + "learning_rate": 2.4458062242774627e-06, + "loss": 0.0669, + "step": 23630 + }, + { + "epoch": 2.042420838913128, + "grad_norm": 0.6231492470009503, + "learning_rate": 2.441797903831934e-06, + "loss": 0.0645, + "step": 23640 + }, + { + "epoch": 2.043284807119098, + "grad_norm": 0.6013322781301382, + "learning_rate": 2.437791809208241e-06, + "loss": 0.0653, + "step": 23650 + }, + { + "epoch": 2.044148775325068, + "grad_norm": 0.6332588139476704, + "learning_rate": 2.433787943891976e-06, + "loss": 0.0663, + "step": 23660 + }, + { + "epoch": 2.045012743531038, + "grad_norm": 0.5952863252399502, + "learning_rate": 2.429786311366796e-06, + "loss": 0.065, + "step": 23670 + }, + { + "epoch": 2.0458767117370082, + "grad_norm": 0.6412555707079038, + "learning_rate": 2.4257869151144128e-06, + "loss": 0.064, + "step": 23680 + }, + { + "epoch": 2.046740679942978, + "grad_norm": 0.6266314277669499, + "learning_rate": 2.421789758614589e-06, + "loss": 0.0616, + "step": 23690 + }, + { + "epoch": 2.047604648148948, + "grad_norm": 0.6024010395703102, + "learning_rate": 2.417794845345142e-06, + "loss": 0.0627, + "step": 23700 + }, + { + "epoch": 2.048468616354918, + "grad_norm": 0.6333664605446039, + "learning_rate": 2.4138021787819365e-06, + "loss": 0.063, + "step": 23710 + }, + { + "epoch": 2.0493325845608883, + "grad_norm": 0.640445584802518, + "learning_rate": 2.409811762398887e-06, + "loss": 0.0625, + "step": 23720 + }, + { + "epoch": 2.050196552766858, + "grad_norm": 0.6389702484627462, + "learning_rate": 2.405823599667939e-06, + "loss": 0.0629, + "step": 23730 + }, + { + "epoch": 2.051060520972828, + "grad_norm": 0.6205228444403665, + "learning_rate": 2.40183769405909e-06, + "loss": 0.0629, + "step": 23740 + }, + { + "epoch": 2.0519244891787984, + "grad_norm": 0.6340486050763664, + "learning_rate": 2.3978540490403645e-06, + "loss": 0.062, + "step": 23750 + }, + { + "epoch": 2.0527884573847683, + "grad_norm": 0.6367859468897559, + "learning_rate": 2.393872668077824e-06, + "loss": 0.0622, + "step": 23760 + }, + { + "epoch": 2.0536524255907382, + "grad_norm": 0.5904483099597296, + "learning_rate": 2.3898935546355584e-06, + "loss": 0.0643, + "step": 23770 + }, + { + "epoch": 2.054516393796708, + "grad_norm": 0.6290343818423563, + "learning_rate": 2.385916712175688e-06, + "loss": 0.0635, + "step": 23780 + }, + { + "epoch": 2.0553803620026785, + "grad_norm": 0.6312722588263152, + "learning_rate": 2.3819421441583545e-06, + "loss": 0.0609, + "step": 23790 + }, + { + "epoch": 2.0562443302086484, + "grad_norm": 0.634502209125537, + "learning_rate": 2.3779698540417202e-06, + "loss": 0.0624, + "step": 23800 + }, + { + "epoch": 2.0571082984146183, + "grad_norm": 0.6736741707092652, + "learning_rate": 2.373999845281966e-06, + "loss": 0.0628, + "step": 23810 + }, + { + "epoch": 2.057972266620588, + "grad_norm": 0.6568078983053083, + "learning_rate": 2.3700321213332873e-06, + "loss": 0.0664, + "step": 23820 + }, + { + "epoch": 2.0588362348265585, + "grad_norm": 0.6025816101246598, + "learning_rate": 2.3660666856478975e-06, + "loss": 0.064, + "step": 23830 + }, + { + "epoch": 2.0597002030325284, + "grad_norm": 0.6673143783562789, + "learning_rate": 2.3621035416760062e-06, + "loss": 0.0639, + "step": 23840 + }, + { + "epoch": 2.0605641712384983, + "grad_norm": 0.5777579786946239, + "learning_rate": 2.3581426928658436e-06, + "loss": 0.0637, + "step": 23850 + }, + { + "epoch": 2.0614281394444682, + "grad_norm": 0.5945588080678396, + "learning_rate": 2.354184142663631e-06, + "loss": 0.0631, + "step": 23860 + }, + { + "epoch": 2.0622921076504386, + "grad_norm": 0.6272905472891154, + "learning_rate": 2.3502278945136007e-06, + "loss": 0.0663, + "step": 23870 + }, + { + "epoch": 2.0631560758564085, + "grad_norm": 0.5814381114745243, + "learning_rate": 2.34627395185797e-06, + "loss": 0.0657, + "step": 23880 + }, + { + "epoch": 2.0640200440623784, + "grad_norm": 0.6194731552781829, + "learning_rate": 2.3423223181369613e-06, + "loss": 0.0658, + "step": 23890 + }, + { + "epoch": 2.0648840122683487, + "grad_norm": 0.6423636477498306, + "learning_rate": 2.3383729967887814e-06, + "loss": 0.0651, + "step": 23900 + }, + { + "epoch": 2.0657479804743186, + "grad_norm": 0.6494081770950135, + "learning_rate": 2.334425991249627e-06, + "loss": 0.0648, + "step": 23910 + }, + { + "epoch": 2.0666119486802885, + "grad_norm": 0.6163722203310215, + "learning_rate": 2.3304813049536802e-06, + "loss": 0.0624, + "step": 23920 + }, + { + "epoch": 2.0674759168862584, + "grad_norm": 0.6247311437412909, + "learning_rate": 2.3265389413331023e-06, + "loss": 0.0632, + "step": 23930 + }, + { + "epoch": 2.0683398850922288, + "grad_norm": 0.6531618747835833, + "learning_rate": 2.3225989038180415e-06, + "loss": 0.0634, + "step": 23940 + }, + { + "epoch": 2.0692038532981987, + "grad_norm": 0.6648124763866536, + "learning_rate": 2.3186611958366135e-06, + "loss": 0.0625, + "step": 23950 + }, + { + "epoch": 2.0700678215041686, + "grad_norm": 0.615739462809398, + "learning_rate": 2.314725820814911e-06, + "loss": 0.0641, + "step": 23960 + }, + { + "epoch": 2.0709317897101385, + "grad_norm": 0.6131030682970773, + "learning_rate": 2.3107927821769954e-06, + "loss": 0.0638, + "step": 23970 + }, + { + "epoch": 2.071795757916109, + "grad_norm": 0.6084449651691136, + "learning_rate": 2.306862083344899e-06, + "loss": 0.0641, + "step": 23980 + }, + { + "epoch": 2.0726597261220787, + "grad_norm": 0.6477878147729919, + "learning_rate": 2.3029337277386136e-06, + "loss": 0.0635, + "step": 23990 + }, + { + "epoch": 2.0735236943280486, + "grad_norm": 0.6254935837523002, + "learning_rate": 2.299007718776096e-06, + "loss": 0.0634, + "step": 24000 + }, + { + "epoch": 2.074387662534019, + "grad_norm": 0.634446437880187, + "learning_rate": 2.2950840598732572e-06, + "loss": 0.0618, + "step": 24010 + }, + { + "epoch": 2.075251630739989, + "grad_norm": 0.627483584074575, + "learning_rate": 2.2911627544439668e-06, + "loss": 0.0631, + "step": 24020 + }, + { + "epoch": 2.0761155989459588, + "grad_norm": 0.6316220677935596, + "learning_rate": 2.287243805900046e-06, + "loss": 0.0646, + "step": 24030 + }, + { + "epoch": 2.0769795671519287, + "grad_norm": 0.638069305465654, + "learning_rate": 2.2833272176512626e-06, + "loss": 0.0627, + "step": 24040 + }, + { + "epoch": 2.077843535357899, + "grad_norm": 0.6679409341830631, + "learning_rate": 2.2794129931053368e-06, + "loss": 0.0614, + "step": 24050 + }, + { + "epoch": 2.078707503563869, + "grad_norm": 0.6324265674564412, + "learning_rate": 2.275501135667927e-06, + "loss": 0.0622, + "step": 24060 + }, + { + "epoch": 2.079571471769839, + "grad_norm": 0.6643281873207468, + "learning_rate": 2.2715916487426338e-06, + "loss": 0.0615, + "step": 24070 + }, + { + "epoch": 2.0804354399758087, + "grad_norm": 0.647967935021621, + "learning_rate": 2.2676845357309922e-06, + "loss": 0.0631, + "step": 24080 + }, + { + "epoch": 2.081299408181779, + "grad_norm": 0.6292387792234486, + "learning_rate": 2.26377980003248e-06, + "loss": 0.0619, + "step": 24090 + }, + { + "epoch": 2.082163376387749, + "grad_norm": 0.5903159353895882, + "learning_rate": 2.2598774450444976e-06, + "loss": 0.0641, + "step": 24100 + }, + { + "epoch": 2.083027344593719, + "grad_norm": 0.664983677188501, + "learning_rate": 2.255977474162379e-06, + "loss": 0.0641, + "step": 24110 + }, + { + "epoch": 2.0838913127996888, + "grad_norm": 0.593897600532623, + "learning_rate": 2.252079890779382e-06, + "loss": 0.0635, + "step": 24120 + }, + { + "epoch": 2.084755281005659, + "grad_norm": 0.6117589894042372, + "learning_rate": 2.2481846982866843e-06, + "loss": 0.061, + "step": 24130 + }, + { + "epoch": 2.085619249211629, + "grad_norm": 0.6177291800970711, + "learning_rate": 2.2442919000733947e-06, + "loss": 0.0626, + "step": 24140 + }, + { + "epoch": 2.086483217417599, + "grad_norm": 0.6117096845706067, + "learning_rate": 2.240401499526522e-06, + "loss": 0.0635, + "step": 24150 + }, + { + "epoch": 2.0873471856235692, + "grad_norm": 0.6329557650601069, + "learning_rate": 2.2365135000310018e-06, + "loss": 0.0605, + "step": 24160 + }, + { + "epoch": 2.088211153829539, + "grad_norm": 0.5882562424900543, + "learning_rate": 2.232627904969677e-06, + "loss": 0.0622, + "step": 24170 + }, + { + "epoch": 2.089075122035509, + "grad_norm": 0.6579717668444609, + "learning_rate": 2.2287447177232957e-06, + "loss": 0.0622, + "step": 24180 + }, + { + "epoch": 2.089939090241479, + "grad_norm": 0.6186116937347731, + "learning_rate": 2.224863941670513e-06, + "loss": 0.0629, + "step": 24190 + }, + { + "epoch": 2.0908030584474493, + "grad_norm": 0.6228935783095523, + "learning_rate": 2.2209855801878886e-06, + "loss": 0.0613, + "step": 24200 + }, + { + "epoch": 2.091667026653419, + "grad_norm": 0.6404729334159126, + "learning_rate": 2.2171096366498785e-06, + "loss": 0.0624, + "step": 24210 + }, + { + "epoch": 2.092530994859389, + "grad_norm": 0.6022716371431075, + "learning_rate": 2.213236114428834e-06, + "loss": 0.0625, + "step": 24220 + }, + { + "epoch": 2.093394963065359, + "grad_norm": 0.6422295848132793, + "learning_rate": 2.2093650168950022e-06, + "loss": 0.0611, + "step": 24230 + }, + { + "epoch": 2.0942589312713293, + "grad_norm": 0.6411305363281238, + "learning_rate": 2.2054963474165165e-06, + "loss": 0.0631, + "step": 24240 + }, + { + "epoch": 2.0951228994772992, + "grad_norm": 0.6223287089938836, + "learning_rate": 2.2016301093594055e-06, + "loss": 0.0631, + "step": 24250 + }, + { + "epoch": 2.095986867683269, + "grad_norm": 0.6423015109263637, + "learning_rate": 2.1977663060875746e-06, + "loss": 0.0633, + "step": 24260 + }, + { + "epoch": 2.0968508358892395, + "grad_norm": 0.623734559704998, + "learning_rate": 2.1939049409628143e-06, + "loss": 0.0635, + "step": 24270 + }, + { + "epoch": 2.0977148040952094, + "grad_norm": 0.655630654823313, + "learning_rate": 2.1900460173447925e-06, + "loss": 0.062, + "step": 24280 + }, + { + "epoch": 2.0985787723011793, + "grad_norm": 0.684247340874467, + "learning_rate": 2.1861895385910535e-06, + "loss": 0.0638, + "step": 24290 + }, + { + "epoch": 2.099442740507149, + "grad_norm": 0.6820724809192963, + "learning_rate": 2.1823355080570145e-06, + "loss": 0.0621, + "step": 24300 + }, + { + "epoch": 2.1003067087131195, + "grad_norm": 0.6534096381485632, + "learning_rate": 2.178483929095961e-06, + "loss": 0.0636, + "step": 24310 + }, + { + "epoch": 2.1011706769190894, + "grad_norm": 0.6510147646848921, + "learning_rate": 2.1746348050590486e-06, + "loss": 0.0614, + "step": 24320 + }, + { + "epoch": 2.1020346451250593, + "grad_norm": 0.6041154097070152, + "learning_rate": 2.170788139295295e-06, + "loss": 0.0634, + "step": 24330 + }, + { + "epoch": 2.1028986133310292, + "grad_norm": 0.6120032247248519, + "learning_rate": 2.166943935151578e-06, + "loss": 0.0624, + "step": 24340 + }, + { + "epoch": 2.1037625815369996, + "grad_norm": 0.6462962519025485, + "learning_rate": 2.163102195972634e-06, + "loss": 0.0668, + "step": 24350 + }, + { + "epoch": 2.1046265497429695, + "grad_norm": 0.7056923665766052, + "learning_rate": 2.159262925101058e-06, + "loss": 0.0636, + "step": 24360 + }, + { + "epoch": 2.1054905179489394, + "grad_norm": 0.643509016704401, + "learning_rate": 2.1554261258772936e-06, + "loss": 0.0661, + "step": 24370 + }, + { + "epoch": 2.1063544861549097, + "grad_norm": 0.6549146285728764, + "learning_rate": 2.151591801639635e-06, + "loss": 0.0619, + "step": 24380 + }, + { + "epoch": 2.1072184543608796, + "grad_norm": 0.6581508720395497, + "learning_rate": 2.147759955724223e-06, + "loss": 0.0618, + "step": 24390 + }, + { + "epoch": 2.1080824225668495, + "grad_norm": 0.6517376056398865, + "learning_rate": 2.1439305914650398e-06, + "loss": 0.0612, + "step": 24400 + }, + { + "epoch": 2.1089463907728194, + "grad_norm": 0.6663625422110055, + "learning_rate": 2.1401037121939176e-06, + "loss": 0.0629, + "step": 24410 + }, + { + "epoch": 2.1098103589787898, + "grad_norm": 0.6199637112899523, + "learning_rate": 2.1362793212405112e-06, + "loss": 0.0606, + "step": 24420 + }, + { + "epoch": 2.1106743271847597, + "grad_norm": 0.6068817093511195, + "learning_rate": 2.1324574219323244e-06, + "loss": 0.0633, + "step": 24430 + }, + { + "epoch": 2.1115382953907296, + "grad_norm": 0.6391262425524984, + "learning_rate": 2.128638017594685e-06, + "loss": 0.0634, + "step": 24440 + }, + { + "epoch": 2.1124022635966995, + "grad_norm": 0.6545074639548908, + "learning_rate": 2.124821111550756e-06, + "loss": 0.0635, + "step": 24450 + }, + { + "epoch": 2.11326623180267, + "grad_norm": 0.6283092609786927, + "learning_rate": 2.1210067071215174e-06, + "loss": 0.0607, + "step": 24460 + }, + { + "epoch": 2.1141302000086397, + "grad_norm": 0.6265855386605454, + "learning_rate": 2.117194807625784e-06, + "loss": 0.0606, + "step": 24470 + }, + { + "epoch": 2.1149941682146096, + "grad_norm": 0.668275663546584, + "learning_rate": 2.113385416380182e-06, + "loss": 0.0641, + "step": 24480 + }, + { + "epoch": 2.1158581364205795, + "grad_norm": 0.6468286051725382, + "learning_rate": 2.1095785366991602e-06, + "loss": 0.0612, + "step": 24490 + }, + { + "epoch": 2.11672210462655, + "grad_norm": 0.6338607054600975, + "learning_rate": 2.1057741718949803e-06, + "loss": 0.0625, + "step": 24500 + }, + { + "epoch": 2.1175860728325198, + "grad_norm": 0.6284795065484166, + "learning_rate": 2.1019723252777152e-06, + "loss": 0.0604, + "step": 24510 + }, + { + "epoch": 2.1184500410384897, + "grad_norm": 0.6653757272633931, + "learning_rate": 2.09817300015525e-06, + "loss": 0.0639, + "step": 24520 + }, + { + "epoch": 2.11931400924446, + "grad_norm": 0.6132171778029453, + "learning_rate": 2.094376199833274e-06, + "loss": 0.0595, + "step": 24530 + }, + { + "epoch": 2.12017797745043, + "grad_norm": 0.6337994827967848, + "learning_rate": 2.0905819276152777e-06, + "loss": 0.0624, + "step": 24540 + }, + { + "epoch": 2.1210419456564, + "grad_norm": 0.6200981039293022, + "learning_rate": 2.086790186802554e-06, + "loss": 0.0627, + "step": 24550 + }, + { + "epoch": 2.1219059138623697, + "grad_norm": 0.607923399933756, + "learning_rate": 2.0830009806941943e-06, + "loss": 0.0633, + "step": 24560 + }, + { + "epoch": 2.12276988206834, + "grad_norm": 0.6327477772847702, + "learning_rate": 2.079214312587083e-06, + "loss": 0.0628, + "step": 24570 + }, + { + "epoch": 2.12363385027431, + "grad_norm": 0.6290055999012083, + "learning_rate": 2.0754301857758968e-06, + "loss": 0.0612, + "step": 24580 + }, + { + "epoch": 2.12449781848028, + "grad_norm": 0.6093290964229778, + "learning_rate": 2.0716486035531007e-06, + "loss": 0.0608, + "step": 24590 + }, + { + "epoch": 2.1253617866862498, + "grad_norm": 0.6269656092540906, + "learning_rate": 2.0678695692089467e-06, + "loss": 0.0602, + "step": 24600 + }, + { + "epoch": 2.12622575489222, + "grad_norm": 0.6486656468551327, + "learning_rate": 2.064093086031469e-06, + "loss": 0.0598, + "step": 24610 + }, + { + "epoch": 2.12708972309819, + "grad_norm": 0.631027633175365, + "learning_rate": 2.0603191573064814e-06, + "loss": 0.0631, + "step": 24620 + }, + { + "epoch": 2.12795369130416, + "grad_norm": 0.6583211890310452, + "learning_rate": 2.0565477863175785e-06, + "loss": 0.063, + "step": 24630 + }, + { + "epoch": 2.12881765951013, + "grad_norm": 0.6045310854163645, + "learning_rate": 2.052778976346127e-06, + "loss": 0.0634, + "step": 24640 + }, + { + "epoch": 2.1296816277161, + "grad_norm": 0.6640903285647783, + "learning_rate": 2.0490127306712656e-06, + "loss": 0.064, + "step": 24650 + }, + { + "epoch": 2.13054559592207, + "grad_norm": 0.6438233531716371, + "learning_rate": 2.0452490525698986e-06, + "loss": 0.0646, + "step": 24660 + }, + { + "epoch": 2.13140956412804, + "grad_norm": 0.617131494530124, + "learning_rate": 2.041487945316705e-06, + "loss": 0.0629, + "step": 24670 + }, + { + "epoch": 2.1322735323340103, + "grad_norm": 0.6065166880037826, + "learning_rate": 2.037729412184118e-06, + "loss": 0.0639, + "step": 24680 + }, + { + "epoch": 2.13313750053998, + "grad_norm": 0.6478988234190833, + "learning_rate": 2.0339734564423365e-06, + "loss": 0.0632, + "step": 24690 + }, + { + "epoch": 2.13400146874595, + "grad_norm": 0.6298992819336114, + "learning_rate": 2.030220081359314e-06, + "loss": 0.061, + "step": 24700 + }, + { + "epoch": 2.13486543695192, + "grad_norm": 0.650993251243365, + "learning_rate": 2.0264692902007576e-06, + "loss": 0.06, + "step": 24710 + }, + { + "epoch": 2.1357294051578903, + "grad_norm": 0.6637271378066566, + "learning_rate": 2.022721086230135e-06, + "loss": 0.0618, + "step": 24720 + }, + { + "epoch": 2.1365933733638602, + "grad_norm": 0.6671955218278872, + "learning_rate": 2.018975472708648e-06, + "loss": 0.0624, + "step": 24730 + }, + { + "epoch": 2.13745734156983, + "grad_norm": 0.6433645148089315, + "learning_rate": 2.015232452895258e-06, + "loss": 0.0612, + "step": 24740 + }, + { + "epoch": 2.1383213097758, + "grad_norm": 0.6355581302816811, + "learning_rate": 2.0114920300466624e-06, + "loss": 0.0628, + "step": 24750 + }, + { + "epoch": 2.1391852779817704, + "grad_norm": 0.6174139399477326, + "learning_rate": 2.0077542074173044e-06, + "loss": 0.0632, + "step": 24760 + }, + { + "epoch": 2.1400492461877403, + "grad_norm": 0.667036387076024, + "learning_rate": 2.0040189882593556e-06, + "loss": 0.0626, + "step": 24770 + }, + { + "epoch": 2.14091321439371, + "grad_norm": 0.6256015668288021, + "learning_rate": 2.0002863758227332e-06, + "loss": 0.0638, + "step": 24780 + }, + { + "epoch": 2.1417771825996805, + "grad_norm": 0.6350675918812858, + "learning_rate": 1.9965563733550797e-06, + "loss": 0.0615, + "step": 24790 + }, + { + "epoch": 2.1426411508056504, + "grad_norm": 0.6560277084131226, + "learning_rate": 1.992828984101768e-06, + "loss": 0.0607, + "step": 24800 + }, + { + "epoch": 2.1435051190116203, + "grad_norm": 0.622302271239541, + "learning_rate": 1.989104211305898e-06, + "loss": 0.0607, + "step": 24810 + }, + { + "epoch": 2.1443690872175902, + "grad_norm": 0.6220514071659232, + "learning_rate": 1.985382058208292e-06, + "loss": 0.0643, + "step": 24820 + }, + { + "epoch": 2.1452330554235606, + "grad_norm": 0.6285288781254312, + "learning_rate": 1.9816625280474966e-06, + "loss": 0.0595, + "step": 24830 + }, + { + "epoch": 2.1460970236295305, + "grad_norm": 0.5995374132946757, + "learning_rate": 1.9779456240597718e-06, + "loss": 0.0628, + "step": 24840 + }, + { + "epoch": 2.1469609918355004, + "grad_norm": 0.6358530584409989, + "learning_rate": 1.9742313494790943e-06, + "loss": 0.0637, + "step": 24850 + }, + { + "epoch": 2.1478249600414703, + "grad_norm": 0.6407208020987373, + "learning_rate": 1.970519707537151e-06, + "loss": 0.0616, + "step": 24860 + }, + { + "epoch": 2.1486889282474406, + "grad_norm": 0.6356132639497752, + "learning_rate": 1.9668107014633458e-06, + "loss": 0.0626, + "step": 24870 + }, + { + "epoch": 2.1495528964534105, + "grad_norm": 0.654976860526239, + "learning_rate": 1.963104334484777e-06, + "loss": 0.0613, + "step": 24880 + }, + { + "epoch": 2.1504168646593804, + "grad_norm": 0.6227712775313601, + "learning_rate": 1.9594006098262584e-06, + "loss": 0.0625, + "step": 24890 + }, + { + "epoch": 2.1512808328653508, + "grad_norm": 0.661425967242727, + "learning_rate": 1.955699530710298e-06, + "loss": 0.066, + "step": 24900 + }, + { + "epoch": 2.1521448010713207, + "grad_norm": 0.6501765614101469, + "learning_rate": 1.952001100357104e-06, + "loss": 0.0617, + "step": 24910 + }, + { + "epoch": 2.1530087692772906, + "grad_norm": 0.6565869828321924, + "learning_rate": 1.9483053219845786e-06, + "loss": 0.0609, + "step": 24920 + }, + { + "epoch": 2.1538727374832605, + "grad_norm": 0.6170406429516456, + "learning_rate": 1.9446121988083176e-06, + "loss": 0.0618, + "step": 24930 + }, + { + "epoch": 2.154736705689231, + "grad_norm": 0.6332346071442021, + "learning_rate": 1.9409217340416094e-06, + "loss": 0.0659, + "step": 24940 + }, + { + "epoch": 2.1556006738952007, + "grad_norm": 0.6310607589991424, + "learning_rate": 1.9372339308954243e-06, + "loss": 0.0617, + "step": 24950 + }, + { + "epoch": 2.1564646421011706, + "grad_norm": 0.631341824342489, + "learning_rate": 1.9335487925784203e-06, + "loss": 0.0619, + "step": 24960 + }, + { + "epoch": 2.1573286103071405, + "grad_norm": 0.6055231058000567, + "learning_rate": 1.9298663222969333e-06, + "loss": 0.0611, + "step": 24970 + }, + { + "epoch": 2.158192578513111, + "grad_norm": 0.6519031072151518, + "learning_rate": 1.926186523254984e-06, + "loss": 0.0623, + "step": 24980 + }, + { + "epoch": 2.1590565467190808, + "grad_norm": 0.6532029733553798, + "learning_rate": 1.9225093986542633e-06, + "loss": 0.0628, + "step": 24990 + }, + { + "epoch": 2.1599205149250507, + "grad_norm": 0.6371257726134606, + "learning_rate": 1.9188349516941363e-06, + "loss": 0.0617, + "step": 25000 + }, + { + "epoch": 2.160784483131021, + "grad_norm": 0.6517130228940046, + "learning_rate": 1.91516318557164e-06, + "loss": 0.062, + "step": 25010 + }, + { + "epoch": 2.161648451336991, + "grad_norm": 0.6314055320125311, + "learning_rate": 1.911494103481476e-06, + "loss": 0.0624, + "step": 25020 + }, + { + "epoch": 2.162512419542961, + "grad_norm": 0.644728408364859, + "learning_rate": 1.907827708616018e-06, + "loss": 0.0611, + "step": 25030 + }, + { + "epoch": 2.1633763877489307, + "grad_norm": 0.6783305329471674, + "learning_rate": 1.904164004165288e-06, + "loss": 0.063, + "step": 25040 + }, + { + "epoch": 2.164240355954901, + "grad_norm": 0.6226296409212154, + "learning_rate": 1.9005029933169815e-06, + "loss": 0.0613, + "step": 25050 + }, + { + "epoch": 2.165104324160871, + "grad_norm": 0.6627104553870933, + "learning_rate": 1.8968446792564405e-06, + "loss": 0.0611, + "step": 25060 + }, + { + "epoch": 2.165968292366841, + "grad_norm": 0.6672474621528279, + "learning_rate": 1.893189065166669e-06, + "loss": 0.0611, + "step": 25070 + }, + { + "epoch": 2.1668322605728108, + "grad_norm": 0.6591423831765176, + "learning_rate": 1.8895361542283103e-06, + "loss": 0.0618, + "step": 25080 + }, + { + "epoch": 2.167696228778781, + "grad_norm": 0.6253784506850278, + "learning_rate": 1.8858859496196685e-06, + "loss": 0.0596, + "step": 25090 + }, + { + "epoch": 2.168560196984751, + "grad_norm": 0.6112277593734251, + "learning_rate": 1.8822384545166845e-06, + "loss": 0.0614, + "step": 25100 + }, + { + "epoch": 2.169424165190721, + "grad_norm": 0.6230633387448746, + "learning_rate": 1.8785936720929437e-06, + "loss": 0.0621, + "step": 25110 + }, + { + "epoch": 2.170288133396691, + "grad_norm": 0.6411573873586922, + "learning_rate": 1.874951605519673e-06, + "loss": 0.0615, + "step": 25120 + }, + { + "epoch": 2.171152101602661, + "grad_norm": 0.6496078251100311, + "learning_rate": 1.8713122579657333e-06, + "loss": 0.0624, + "step": 25130 + }, + { + "epoch": 2.172016069808631, + "grad_norm": 0.6297655237072545, + "learning_rate": 1.8676756325976265e-06, + "loss": 0.0625, + "step": 25140 + }, + { + "epoch": 2.172880038014601, + "grad_norm": 0.6170767552073277, + "learning_rate": 1.8640417325794735e-06, + "loss": 0.0604, + "step": 25150 + }, + { + "epoch": 2.1737440062205713, + "grad_norm": 0.6272151430305527, + "learning_rate": 1.8604105610730378e-06, + "loss": 0.0613, + "step": 25160 + }, + { + "epoch": 2.174607974426541, + "grad_norm": 0.6675092753122248, + "learning_rate": 1.8567821212376986e-06, + "loss": 0.0639, + "step": 25170 + }, + { + "epoch": 2.175471942632511, + "grad_norm": 0.6224564596460632, + "learning_rate": 1.8531564162304677e-06, + "loss": 0.0627, + "step": 25180 + }, + { + "epoch": 2.176335910838481, + "grad_norm": 0.6068390661175612, + "learning_rate": 1.8495334492059653e-06, + "loss": 0.0621, + "step": 25190 + }, + { + "epoch": 2.1771998790444513, + "grad_norm": 0.6523381112434008, + "learning_rate": 1.8459132233164407e-06, + "loss": 0.0622, + "step": 25200 + }, + { + "epoch": 2.1780638472504212, + "grad_norm": 0.6416349928882249, + "learning_rate": 1.8422957417117531e-06, + "loss": 0.0631, + "step": 25210 + }, + { + "epoch": 2.178927815456391, + "grad_norm": 0.6530937293777992, + "learning_rate": 1.8386810075393735e-06, + "loss": 0.061, + "step": 25220 + }, + { + "epoch": 2.179791783662361, + "grad_norm": 0.6660230625168588, + "learning_rate": 1.8350690239443841e-06, + "loss": 0.0602, + "step": 25230 + }, + { + "epoch": 2.1806557518683314, + "grad_norm": 0.6125379807881822, + "learning_rate": 1.8314597940694711e-06, + "loss": 0.0613, + "step": 25240 + }, + { + "epoch": 2.1815197200743013, + "grad_norm": 0.6556372780382884, + "learning_rate": 1.8278533210549304e-06, + "loss": 0.0648, + "step": 25250 + }, + { + "epoch": 2.182383688280271, + "grad_norm": 0.6875735101398713, + "learning_rate": 1.8242496080386546e-06, + "loss": 0.0624, + "step": 25260 + }, + { + "epoch": 2.183247656486241, + "grad_norm": 0.6386277390230637, + "learning_rate": 1.8206486581561356e-06, + "loss": 0.0639, + "step": 25270 + }, + { + "epoch": 2.1841116246922114, + "grad_norm": 0.6383759095811737, + "learning_rate": 1.8170504745404598e-06, + "loss": 0.0635, + "step": 25280 + }, + { + "epoch": 2.1849755928981813, + "grad_norm": 0.6291029502412889, + "learning_rate": 1.8134550603223123e-06, + "loss": 0.0615, + "step": 25290 + }, + { + "epoch": 2.1858395611041512, + "grad_norm": 0.6597481436350051, + "learning_rate": 1.8098624186299628e-06, + "loss": 0.0637, + "step": 25300 + }, + { + "epoch": 2.1867035293101216, + "grad_norm": 0.6060866620443429, + "learning_rate": 1.8062725525892716e-06, + "loss": 0.0593, + "step": 25310 + }, + { + "epoch": 2.1875674975160915, + "grad_norm": 0.6594341783898489, + "learning_rate": 1.802685465323682e-06, + "loss": 0.062, + "step": 25320 + }, + { + "epoch": 2.1884314657220614, + "grad_norm": 0.6967919531607281, + "learning_rate": 1.7991011599542202e-06, + "loss": 0.062, + "step": 25330 + }, + { + "epoch": 2.1892954339280313, + "grad_norm": 0.6355968136614126, + "learning_rate": 1.7955196395994967e-06, + "loss": 0.0619, + "step": 25340 + }, + { + "epoch": 2.1901594021340016, + "grad_norm": 0.6493195599762654, + "learning_rate": 1.7919409073756883e-06, + "loss": 0.0602, + "step": 25350 + }, + { + "epoch": 2.1910233703399715, + "grad_norm": 0.6687212211442863, + "learning_rate": 1.7883649663965574e-06, + "loss": 0.0619, + "step": 25360 + }, + { + "epoch": 2.1918873385459414, + "grad_norm": 0.6783993190646149, + "learning_rate": 1.7847918197734293e-06, + "loss": 0.0627, + "step": 25370 + }, + { + "epoch": 2.1927513067519113, + "grad_norm": 0.6823882867138366, + "learning_rate": 1.781221470615206e-06, + "loss": 0.0609, + "step": 25380 + }, + { + "epoch": 2.1936152749578817, + "grad_norm": 0.6046134509758881, + "learning_rate": 1.7776539220283446e-06, + "loss": 0.0629, + "step": 25390 + }, + { + "epoch": 2.1944792431638516, + "grad_norm": 0.6632724484704178, + "learning_rate": 1.774089177116876e-06, + "loss": 0.0628, + "step": 25400 + }, + { + "epoch": 2.1953432113698215, + "grad_norm": 0.6093237196847797, + "learning_rate": 1.7705272389823869e-06, + "loss": 0.0622, + "step": 25410 + }, + { + "epoch": 2.196207179575792, + "grad_norm": 0.6369052508210911, + "learning_rate": 1.7669681107240223e-06, + "loss": 0.0607, + "step": 25420 + }, + { + "epoch": 2.1970711477817617, + "grad_norm": 0.6407569635936984, + "learning_rate": 1.763411795438482e-06, + "loss": 0.0611, + "step": 25430 + }, + { + "epoch": 2.1979351159877316, + "grad_norm": 0.6563720363166178, + "learning_rate": 1.7598582962200172e-06, + "loss": 0.0599, + "step": 25440 + }, + { + "epoch": 2.1987990841937015, + "grad_norm": 0.673275092712552, + "learning_rate": 1.7563076161604364e-06, + "loss": 0.0598, + "step": 25450 + }, + { + "epoch": 2.199663052399672, + "grad_norm": 0.6380876243254755, + "learning_rate": 1.7527597583490825e-06, + "loss": 0.0606, + "step": 25460 + }, + { + "epoch": 2.2005270206056418, + "grad_norm": 0.6450355340439092, + "learning_rate": 1.7492147258728538e-06, + "loss": 0.0621, + "step": 25470 + }, + { + "epoch": 2.2013909888116117, + "grad_norm": 0.6226050474103794, + "learning_rate": 1.745672521816184e-06, + "loss": 0.061, + "step": 25480 + }, + { + "epoch": 2.2022549570175816, + "grad_norm": 0.6409167634399674, + "learning_rate": 1.7421331492610533e-06, + "loss": 0.0614, + "step": 25490 + }, + { + "epoch": 2.203118925223552, + "grad_norm": 0.6730822678941681, + "learning_rate": 1.7385966112869657e-06, + "loss": 0.0591, + "step": 25500 + }, + { + "epoch": 2.203982893429522, + "grad_norm": 0.6970778786367295, + "learning_rate": 1.7350629109709715e-06, + "loss": 0.0607, + "step": 25510 + }, + { + "epoch": 2.2048468616354917, + "grad_norm": 0.6125821916693323, + "learning_rate": 1.731532051387646e-06, + "loss": 0.0596, + "step": 25520 + }, + { + "epoch": 2.205710829841462, + "grad_norm": 0.6656767212696019, + "learning_rate": 1.7280040356090933e-06, + "loss": 0.0607, + "step": 25530 + }, + { + "epoch": 2.206574798047432, + "grad_norm": 0.6411004681356632, + "learning_rate": 1.724478866704944e-06, + "loss": 0.062, + "step": 25540 + }, + { + "epoch": 2.207438766253402, + "grad_norm": 0.6764489258462001, + "learning_rate": 1.720956547742349e-06, + "loss": 0.0612, + "step": 25550 + }, + { + "epoch": 2.2083027344593718, + "grad_norm": 0.6128942114008559, + "learning_rate": 1.7174370817859854e-06, + "loss": 0.0608, + "step": 25560 + }, + { + "epoch": 2.209166702665342, + "grad_norm": 0.6339329827203459, + "learning_rate": 1.7139204718980434e-06, + "loss": 0.0625, + "step": 25570 + }, + { + "epoch": 2.210030670871312, + "grad_norm": 0.6452354260110682, + "learning_rate": 1.710406721138229e-06, + "loss": 0.0614, + "step": 25580 + }, + { + "epoch": 2.210894639077282, + "grad_norm": 0.6166881292995755, + "learning_rate": 1.7068958325637591e-06, + "loss": 0.0616, + "step": 25590 + }, + { + "epoch": 2.211758607283252, + "grad_norm": 0.6476715295266554, + "learning_rate": 1.7033878092293655e-06, + "loss": 0.061, + "step": 25600 + }, + { + "epoch": 2.212622575489222, + "grad_norm": 0.6517855464246195, + "learning_rate": 1.699882654187282e-06, + "loss": 0.0627, + "step": 25610 + }, + { + "epoch": 2.213486543695192, + "grad_norm": 0.6340260543769496, + "learning_rate": 1.6963803704872478e-06, + "loss": 0.0611, + "step": 25620 + }, + { + "epoch": 2.214350511901162, + "grad_norm": 0.6298347310498605, + "learning_rate": 1.6928809611765051e-06, + "loss": 0.0616, + "step": 25630 + }, + { + "epoch": 2.2152144801071323, + "grad_norm": 0.6243243135913467, + "learning_rate": 1.6893844292997923e-06, + "loss": 0.0615, + "step": 25640 + }, + { + "epoch": 2.216078448313102, + "grad_norm": 0.6340777870224308, + "learning_rate": 1.6858907778993516e-06, + "loss": 0.0609, + "step": 25650 + }, + { + "epoch": 2.216942416519072, + "grad_norm": 0.6083314908430885, + "learning_rate": 1.6824000100149067e-06, + "loss": 0.0596, + "step": 25660 + }, + { + "epoch": 2.217806384725042, + "grad_norm": 0.6726539489645692, + "learning_rate": 1.678912128683685e-06, + "loss": 0.0621, + "step": 25670 + }, + { + "epoch": 2.2186703529310123, + "grad_norm": 0.6194556913540952, + "learning_rate": 1.6754271369403934e-06, + "loss": 0.0613, + "step": 25680 + }, + { + "epoch": 2.2195343211369822, + "grad_norm": 0.6207525296451107, + "learning_rate": 1.671945037817233e-06, + "loss": 0.0616, + "step": 25690 + }, + { + "epoch": 2.220398289342952, + "grad_norm": 0.681932107424091, + "learning_rate": 1.6684658343438769e-06, + "loss": 0.0621, + "step": 25700 + }, + { + "epoch": 2.221262257548922, + "grad_norm": 0.6855599096619783, + "learning_rate": 1.6649895295474895e-06, + "loss": 0.0619, + "step": 25710 + }, + { + "epoch": 2.2221262257548924, + "grad_norm": 0.6129639845472115, + "learning_rate": 1.6615161264527075e-06, + "loss": 0.0607, + "step": 25720 + }, + { + "epoch": 2.2229901939608623, + "grad_norm": 0.6626689168785096, + "learning_rate": 1.6580456280816442e-06, + "loss": 0.0599, + "step": 25730 + }, + { + "epoch": 2.223854162166832, + "grad_norm": 0.6226827855598428, + "learning_rate": 1.654578037453885e-06, + "loss": 0.0602, + "step": 25740 + }, + { + "epoch": 2.224718130372802, + "grad_norm": 0.6147897546663305, + "learning_rate": 1.6511133575864846e-06, + "loss": 0.0595, + "step": 25750 + }, + { + "epoch": 2.2255820985787724, + "grad_norm": 0.6263674040275825, + "learning_rate": 1.6476515914939718e-06, + "loss": 0.0642, + "step": 25760 + }, + { + "epoch": 2.2264460667847423, + "grad_norm": 0.6413262252107033, + "learning_rate": 1.6441927421883274e-06, + "loss": 0.0617, + "step": 25770 + }, + { + "epoch": 2.2273100349907122, + "grad_norm": 0.6571817695841334, + "learning_rate": 1.6407368126790074e-06, + "loss": 0.0614, + "step": 25780 + }, + { + "epoch": 2.228174003196682, + "grad_norm": 0.639650797211082, + "learning_rate": 1.637283805972918e-06, + "loss": 0.0625, + "step": 25790 + }, + { + "epoch": 2.2290379714026525, + "grad_norm": 0.6241557566612796, + "learning_rate": 1.6338337250744313e-06, + "loss": 0.0594, + "step": 25800 + }, + { + "epoch": 2.2299019396086224, + "grad_norm": 0.6436803684620909, + "learning_rate": 1.6303865729853618e-06, + "loss": 0.0591, + "step": 25810 + }, + { + "epoch": 2.2307659078145923, + "grad_norm": 0.6593942452366395, + "learning_rate": 1.6269423527049876e-06, + "loss": 0.0605, + "step": 25820 + }, + { + "epoch": 2.2316298760205626, + "grad_norm": 0.7094732125345027, + "learning_rate": 1.623501067230029e-06, + "loss": 0.0596, + "step": 25830 + }, + { + "epoch": 2.2324938442265325, + "grad_norm": 0.639451168612842, + "learning_rate": 1.6200627195546547e-06, + "loss": 0.0587, + "step": 25840 + }, + { + "epoch": 2.2333578124325024, + "grad_norm": 0.666192956241704, + "learning_rate": 1.6166273126704773e-06, + "loss": 0.0592, + "step": 25850 + }, + { + "epoch": 2.2342217806384723, + "grad_norm": 0.651541166383784, + "learning_rate": 1.6131948495665483e-06, + "loss": 0.0627, + "step": 25860 + }, + { + "epoch": 2.2350857488444427, + "grad_norm": 0.6671884346428812, + "learning_rate": 1.609765333229364e-06, + "loss": 0.0592, + "step": 25870 + }, + { + "epoch": 2.2359497170504126, + "grad_norm": 0.6312027856638661, + "learning_rate": 1.6063387666428514e-06, + "loss": 0.0616, + "step": 25880 + }, + { + "epoch": 2.2368136852563825, + "grad_norm": 0.6351034279727996, + "learning_rate": 1.6029151527883724e-06, + "loss": 0.0619, + "step": 25890 + }, + { + "epoch": 2.2376776534623524, + "grad_norm": 0.650998398374031, + "learning_rate": 1.5994944946447187e-06, + "loss": 0.0596, + "step": 25900 + }, + { + "epoch": 2.2385416216683227, + "grad_norm": 0.64185933391637, + "learning_rate": 1.5960767951881146e-06, + "loss": 0.0613, + "step": 25910 + }, + { + "epoch": 2.2394055898742926, + "grad_norm": 0.6821387202412239, + "learning_rate": 1.5926620573922048e-06, + "loss": 0.0605, + "step": 25920 + }, + { + "epoch": 2.2402695580802625, + "grad_norm": 0.6584422635160079, + "learning_rate": 1.5892502842280605e-06, + "loss": 0.0631, + "step": 25930 + }, + { + "epoch": 2.241133526286233, + "grad_norm": 0.6719097501541714, + "learning_rate": 1.5858414786641723e-06, + "loss": 0.0598, + "step": 25940 + }, + { + "epoch": 2.2419974944922028, + "grad_norm": 0.6524586802098732, + "learning_rate": 1.5824356436664467e-06, + "loss": 0.0604, + "step": 25950 + }, + { + "epoch": 2.2428614626981727, + "grad_norm": 0.6516842713970796, + "learning_rate": 1.5790327821982137e-06, + "loss": 0.0625, + "step": 25960 + }, + { + "epoch": 2.2437254309041426, + "grad_norm": 0.6683175188643405, + "learning_rate": 1.5756328972202023e-06, + "loss": 0.0591, + "step": 25970 + }, + { + "epoch": 2.244589399110113, + "grad_norm": 0.625411302041235, + "learning_rate": 1.5722359916905656e-06, + "loss": 0.0593, + "step": 25980 + }, + { + "epoch": 2.245453367316083, + "grad_norm": 0.6293011621106597, + "learning_rate": 1.5688420685648565e-06, + "loss": 0.0622, + "step": 25990 + }, + { + "epoch": 2.2463173355220527, + "grad_norm": 0.6852578163384612, + "learning_rate": 1.5654511307960346e-06, + "loss": 0.0616, + "step": 26000 + }, + { + "epoch": 2.2471813037280226, + "grad_norm": 0.6510391632446941, + "learning_rate": 1.5620631813344611e-06, + "loss": 0.0643, + "step": 26010 + }, + { + "epoch": 2.248045271933993, + "grad_norm": 0.6375311713032594, + "learning_rate": 1.5586782231279012e-06, + "loss": 0.0628, + "step": 26020 + }, + { + "epoch": 2.248909240139963, + "grad_norm": 0.6571909140263892, + "learning_rate": 1.5552962591215137e-06, + "loss": 0.0603, + "step": 26030 + }, + { + "epoch": 2.2497732083459328, + "grad_norm": 0.6435249357904795, + "learning_rate": 1.5519172922578529e-06, + "loss": 0.0629, + "step": 26040 + }, + { + "epoch": 2.250637176551903, + "grad_norm": 0.6516646258238982, + "learning_rate": 1.5485413254768655e-06, + "loss": 0.0583, + "step": 26050 + }, + { + "epoch": 2.251501144757873, + "grad_norm": 0.6565838894870303, + "learning_rate": 1.5451683617158864e-06, + "loss": 0.0611, + "step": 26060 + }, + { + "epoch": 2.252365112963843, + "grad_norm": 0.6896405697177518, + "learning_rate": 1.541798403909644e-06, + "loss": 0.0601, + "step": 26070 + }, + { + "epoch": 2.253229081169813, + "grad_norm": 0.6312058690706274, + "learning_rate": 1.5384314549902407e-06, + "loss": 0.0598, + "step": 26080 + }, + { + "epoch": 2.254093049375783, + "grad_norm": 0.689835159961576, + "learning_rate": 1.5350675178871717e-06, + "loss": 0.0618, + "step": 26090 + }, + { + "epoch": 2.254957017581753, + "grad_norm": 0.6158946907902951, + "learning_rate": 1.531706595527303e-06, + "loss": 0.0604, + "step": 26100 + }, + { + "epoch": 2.255820985787723, + "grad_norm": 0.6099539265032519, + "learning_rate": 1.5283486908348872e-06, + "loss": 0.0589, + "step": 26110 + }, + { + "epoch": 2.256684953993693, + "grad_norm": 0.6450195523068343, + "learning_rate": 1.5249938067315379e-06, + "loss": 0.0613, + "step": 26120 + }, + { + "epoch": 2.257548922199663, + "grad_norm": 0.6487167626847626, + "learning_rate": 1.5216419461362542e-06, + "loss": 0.0591, + "step": 26130 + }, + { + "epoch": 2.258412890405633, + "grad_norm": 0.6753222278690582, + "learning_rate": 1.5182931119653965e-06, + "loss": 0.0595, + "step": 26140 + }, + { + "epoch": 2.259276858611603, + "grad_norm": 0.7071305222388669, + "learning_rate": 1.5149473071326941e-06, + "loss": 0.0609, + "step": 26150 + }, + { + "epoch": 2.2601408268175733, + "grad_norm": 0.664863282231861, + "learning_rate": 1.5116045345492403e-06, + "loss": 0.06, + "step": 26160 + }, + { + "epoch": 2.2610047950235432, + "grad_norm": 0.6528376283630767, + "learning_rate": 1.5082647971234886e-06, + "loss": 0.0635, + "step": 26170 + }, + { + "epoch": 2.261868763229513, + "grad_norm": 0.6492573255834764, + "learning_rate": 1.5049280977612575e-06, + "loss": 0.0627, + "step": 26180 + }, + { + "epoch": 2.262732731435483, + "grad_norm": 0.6627021537499391, + "learning_rate": 1.501594439365715e-06, + "loss": 0.0598, + "step": 26190 + }, + { + "epoch": 2.2635966996414534, + "grad_norm": 0.6806417650959726, + "learning_rate": 1.4982638248373871e-06, + "loss": 0.0637, + "step": 26200 + }, + { + "epoch": 2.2644606678474233, + "grad_norm": 0.62424115966496, + "learning_rate": 1.4949362570741493e-06, + "loss": 0.0592, + "step": 26210 + }, + { + "epoch": 2.265324636053393, + "grad_norm": 0.6688986783593902, + "learning_rate": 1.4916117389712303e-06, + "loss": 0.0603, + "step": 26220 + }, + { + "epoch": 2.266188604259363, + "grad_norm": 0.7018681679297502, + "learning_rate": 1.4882902734212013e-06, + "loss": 0.0637, + "step": 26230 + }, + { + "epoch": 2.2670525724653334, + "grad_norm": 0.5925240661361009, + "learning_rate": 1.484971863313978e-06, + "loss": 0.061, + "step": 26240 + }, + { + "epoch": 2.2679165406713033, + "grad_norm": 0.6414504235333198, + "learning_rate": 1.4816565115368199e-06, + "loss": 0.064, + "step": 26250 + }, + { + "epoch": 2.2687805088772732, + "grad_norm": 0.656382879329087, + "learning_rate": 1.4783442209743225e-06, + "loss": 0.0597, + "step": 26260 + }, + { + "epoch": 2.2696444770832436, + "grad_norm": 0.6691090047775298, + "learning_rate": 1.4750349945084209e-06, + "loss": 0.0605, + "step": 26270 + }, + { + "epoch": 2.2705084452892135, + "grad_norm": 0.657652046855535, + "learning_rate": 1.4717288350183805e-06, + "loss": 0.0599, + "step": 26280 + }, + { + "epoch": 2.2713724134951834, + "grad_norm": 0.6423009118728417, + "learning_rate": 1.4684257453808032e-06, + "loss": 0.0593, + "step": 26290 + }, + { + "epoch": 2.2722363817011533, + "grad_norm": 0.6485166849916811, + "learning_rate": 1.4651257284696164e-06, + "loss": 0.0595, + "step": 26300 + }, + { + "epoch": 2.273100349907123, + "grad_norm": 0.6207446565216592, + "learning_rate": 1.4618287871560737e-06, + "loss": 0.0601, + "step": 26310 + }, + { + "epoch": 2.2739643181130935, + "grad_norm": 0.6633360470389493, + "learning_rate": 1.4585349243087538e-06, + "loss": 0.061, + "step": 26320 + }, + { + "epoch": 2.2748282863190634, + "grad_norm": 0.6355126266677902, + "learning_rate": 1.4552441427935582e-06, + "loss": 0.0585, + "step": 26330 + }, + { + "epoch": 2.2756922545250333, + "grad_norm": 0.6310809829654427, + "learning_rate": 1.4519564454737063e-06, + "loss": 0.06, + "step": 26340 + }, + { + "epoch": 2.2765562227310037, + "grad_norm": 0.67846250533861, + "learning_rate": 1.4486718352097323e-06, + "loss": 0.0604, + "step": 26350 + }, + { + "epoch": 2.2774201909369736, + "grad_norm": 0.664061169733949, + "learning_rate": 1.4453903148594866e-06, + "loss": 0.0632, + "step": 26360 + }, + { + "epoch": 2.2782841591429435, + "grad_norm": 0.6697299183841422, + "learning_rate": 1.4421118872781291e-06, + "loss": 0.0605, + "step": 26370 + }, + { + "epoch": 2.2791481273489134, + "grad_norm": 0.6498208864924574, + "learning_rate": 1.438836555318135e-06, + "loss": 0.0639, + "step": 26380 + }, + { + "epoch": 2.2800120955548837, + "grad_norm": 0.64671706463022, + "learning_rate": 1.4355643218292742e-06, + "loss": 0.0615, + "step": 26390 + }, + { + "epoch": 2.2808760637608536, + "grad_norm": 0.7394098399800825, + "learning_rate": 1.4322951896586334e-06, + "loss": 0.0606, + "step": 26400 + }, + { + "epoch": 2.2817400319668235, + "grad_norm": 0.642376609395183, + "learning_rate": 1.4290291616505918e-06, + "loss": 0.0605, + "step": 26410 + }, + { + "epoch": 2.2826040001727934, + "grad_norm": 0.6566571466559815, + "learning_rate": 1.4257662406468353e-06, + "loss": 0.0573, + "step": 26420 + }, + { + "epoch": 2.2834679683787638, + "grad_norm": 0.6520952949887607, + "learning_rate": 1.4225064294863372e-06, + "loss": 0.0606, + "step": 26430 + }, + { + "epoch": 2.2843319365847337, + "grad_norm": 0.6838279782209414, + "learning_rate": 1.4192497310053748e-06, + "loss": 0.0623, + "step": 26440 + }, + { + "epoch": 2.2851959047907036, + "grad_norm": 0.6446957305304464, + "learning_rate": 1.41599614803751e-06, + "loss": 0.0599, + "step": 26450 + }, + { + "epoch": 2.286059872996674, + "grad_norm": 0.6410457377021014, + "learning_rate": 1.4127456834135978e-06, + "loss": 0.0625, + "step": 26460 + }, + { + "epoch": 2.286923841202644, + "grad_norm": 0.6186720566777877, + "learning_rate": 1.4094983399617784e-06, + "loss": 0.0614, + "step": 26470 + }, + { + "epoch": 2.2877878094086137, + "grad_norm": 0.6939594322670045, + "learning_rate": 1.4062541205074742e-06, + "loss": 0.0606, + "step": 26480 + }, + { + "epoch": 2.2886517776145836, + "grad_norm": 0.6742929364984191, + "learning_rate": 1.4030130278733967e-06, + "loss": 0.058, + "step": 26490 + }, + { + "epoch": 2.289515745820554, + "grad_norm": 0.6389179635631196, + "learning_rate": 1.3997750648795295e-06, + "loss": 0.0609, + "step": 26500 + }, + { + "epoch": 2.290379714026524, + "grad_norm": 0.6664027390477426, + "learning_rate": 1.3965402343431362e-06, + "loss": 0.0593, + "step": 26510 + }, + { + "epoch": 2.2912436822324938, + "grad_norm": 0.6522631321278886, + "learning_rate": 1.3933085390787531e-06, + "loss": 0.0611, + "step": 26520 + }, + { + "epoch": 2.2921076504384637, + "grad_norm": 0.6604097749419889, + "learning_rate": 1.3900799818981947e-06, + "loss": 0.06, + "step": 26530 + }, + { + "epoch": 2.292971618644434, + "grad_norm": 0.6406399475479331, + "learning_rate": 1.3868545656105342e-06, + "loss": 0.0586, + "step": 26540 + }, + { + "epoch": 2.293835586850404, + "grad_norm": 0.59954172486234, + "learning_rate": 1.3836322930221225e-06, + "loss": 0.061, + "step": 26550 + }, + { + "epoch": 2.294699555056374, + "grad_norm": 0.6664300784047551, + "learning_rate": 1.3804131669365705e-06, + "loss": 0.0595, + "step": 26560 + }, + { + "epoch": 2.295563523262344, + "grad_norm": 0.6188454289593853, + "learning_rate": 1.3771971901547515e-06, + "loss": 0.06, + "step": 26570 + }, + { + "epoch": 2.296427491468314, + "grad_norm": 0.6424177988440397, + "learning_rate": 1.3739843654747986e-06, + "loss": 0.0606, + "step": 26580 + }, + { + "epoch": 2.297291459674284, + "grad_norm": 0.6688160131458641, + "learning_rate": 1.370774695692102e-06, + "loss": 0.0606, + "step": 26590 + }, + { + "epoch": 2.298155427880254, + "grad_norm": 0.6683429083926847, + "learning_rate": 1.3675681835993109e-06, + "loss": 0.0613, + "step": 26600 + }, + { + "epoch": 2.299019396086224, + "grad_norm": 0.7006858852613675, + "learning_rate": 1.3643648319863222e-06, + "loss": 0.0588, + "step": 26610 + }, + { + "epoch": 2.299883364292194, + "grad_norm": 0.6778287862856338, + "learning_rate": 1.3611646436402849e-06, + "loss": 0.0626, + "step": 26620 + }, + { + "epoch": 2.300747332498164, + "grad_norm": 0.6624181219982882, + "learning_rate": 1.3579676213455934e-06, + "loss": 0.0618, + "step": 26630 + }, + { + "epoch": 2.301611300704134, + "grad_norm": 0.7008422542870012, + "learning_rate": 1.3547737678838934e-06, + "loss": 0.0607, + "step": 26640 + }, + { + "epoch": 2.3024752689101042, + "grad_norm": 0.6624919987207697, + "learning_rate": 1.3515830860340678e-06, + "loss": 0.0606, + "step": 26650 + }, + { + "epoch": 2.303339237116074, + "grad_norm": 0.6147145335368871, + "learning_rate": 1.3483955785722418e-06, + "loss": 0.0622, + "step": 26660 + }, + { + "epoch": 2.304203205322044, + "grad_norm": 0.6281804015316972, + "learning_rate": 1.3452112482717788e-06, + "loss": 0.0572, + "step": 26670 + }, + { + "epoch": 2.3050671735280144, + "grad_norm": 0.6451228796733127, + "learning_rate": 1.3420300979032758e-06, + "loss": 0.0606, + "step": 26680 + }, + { + "epoch": 2.3059311417339843, + "grad_norm": 0.6169719002668468, + "learning_rate": 1.3388521302345703e-06, + "loss": 0.0607, + "step": 26690 + }, + { + "epoch": 2.306795109939954, + "grad_norm": 0.685603442557472, + "learning_rate": 1.3356773480307178e-06, + "loss": 0.0618, + "step": 26700 + }, + { + "epoch": 2.307659078145924, + "grad_norm": 0.6443542909589884, + "learning_rate": 1.332505754054016e-06, + "loss": 0.0596, + "step": 26710 + }, + { + "epoch": 2.3085230463518944, + "grad_norm": 0.6542177335627326, + "learning_rate": 1.3293373510639772e-06, + "loss": 0.0599, + "step": 26720 + }, + { + "epoch": 2.3093870145578643, + "grad_norm": 0.618466824135546, + "learning_rate": 1.326172141817349e-06, + "loss": 0.058, + "step": 26730 + }, + { + "epoch": 2.3102509827638342, + "grad_norm": 0.6754024486903456, + "learning_rate": 1.3230101290680859e-06, + "loss": 0.0608, + "step": 26740 + }, + { + "epoch": 2.311114950969804, + "grad_norm": 0.6259336996138705, + "learning_rate": 1.3198513155673742e-06, + "loss": 0.0602, + "step": 26750 + }, + { + "epoch": 2.3119789191757745, + "grad_norm": 0.6952082803278199, + "learning_rate": 1.3166957040636102e-06, + "loss": 0.0611, + "step": 26760 + }, + { + "epoch": 2.3128428873817444, + "grad_norm": 0.679152130889011, + "learning_rate": 1.3135432973024044e-06, + "loss": 0.0572, + "step": 26770 + }, + { + "epoch": 2.3137068555877143, + "grad_norm": 0.6706211214233345, + "learning_rate": 1.3103940980265805e-06, + "loss": 0.0604, + "step": 26780 + }, + { + "epoch": 2.3145708237936846, + "grad_norm": 0.617309011922249, + "learning_rate": 1.3072481089761697e-06, + "loss": 0.0575, + "step": 26790 + }, + { + "epoch": 2.3154347919996545, + "grad_norm": 0.6615416949022743, + "learning_rate": 1.3041053328884128e-06, + "loss": 0.0616, + "step": 26800 + }, + { + "epoch": 2.3162987602056244, + "grad_norm": 0.6747162247811561, + "learning_rate": 1.3009657724977537e-06, + "loss": 0.0619, + "step": 26810 + }, + { + "epoch": 2.3171627284115943, + "grad_norm": 0.6664271188966158, + "learning_rate": 1.2978294305358374e-06, + "loss": 0.0615, + "step": 26820 + }, + { + "epoch": 2.3180266966175647, + "grad_norm": 0.65748922479625, + "learning_rate": 1.294696309731508e-06, + "loss": 0.0623, + "step": 26830 + }, + { + "epoch": 2.3188906648235346, + "grad_norm": 0.6685930838215557, + "learning_rate": 1.2915664128108123e-06, + "loss": 0.0604, + "step": 26840 + }, + { + "epoch": 2.3197546330295045, + "grad_norm": 0.6127967842512623, + "learning_rate": 1.288439742496984e-06, + "loss": 0.0599, + "step": 26850 + }, + { + "epoch": 2.3206186012354744, + "grad_norm": 0.6643011680928712, + "learning_rate": 1.2853163015104563e-06, + "loss": 0.063, + "step": 26860 + }, + { + "epoch": 2.3214825694414447, + "grad_norm": 0.6399180186475122, + "learning_rate": 1.2821960925688493e-06, + "loss": 0.0601, + "step": 26870 + }, + { + "epoch": 2.3223465376474146, + "grad_norm": 0.6309313709379819, + "learning_rate": 1.2790791183869717e-06, + "loss": 0.0597, + "step": 26880 + }, + { + "epoch": 2.3232105058533845, + "grad_norm": 0.6292393124967992, + "learning_rate": 1.2759653816768175e-06, + "loss": 0.0613, + "step": 26890 + }, + { + "epoch": 2.324074474059355, + "grad_norm": 0.6272135357664029, + "learning_rate": 1.2728548851475625e-06, + "loss": 0.0608, + "step": 26900 + }, + { + "epoch": 2.3249384422653248, + "grad_norm": 0.6534930026134436, + "learning_rate": 1.2697476315055673e-06, + "loss": 0.0603, + "step": 26910 + }, + { + "epoch": 2.3258024104712947, + "grad_norm": 0.6627039375093233, + "learning_rate": 1.2666436234543684e-06, + "loss": 0.0602, + "step": 26920 + }, + { + "epoch": 2.3266663786772646, + "grad_norm": 0.6998958974545966, + "learning_rate": 1.2635428636946773e-06, + "loss": 0.0587, + "step": 26930 + }, + { + "epoch": 2.3275303468832345, + "grad_norm": 0.664922356390914, + "learning_rate": 1.2604453549243796e-06, + "loss": 0.0607, + "step": 26940 + }, + { + "epoch": 2.328394315089205, + "grad_norm": 0.6545529848823667, + "learning_rate": 1.2573510998385357e-06, + "loss": 0.0606, + "step": 26950 + }, + { + "epoch": 2.3292582832951747, + "grad_norm": 0.6572234115129042, + "learning_rate": 1.2542601011293714e-06, + "loss": 0.0614, + "step": 26960 + }, + { + "epoch": 2.3301222515011446, + "grad_norm": 0.623619780479768, + "learning_rate": 1.2511723614862803e-06, + "loss": 0.0586, + "step": 26970 + }, + { + "epoch": 2.330986219707115, + "grad_norm": 0.6760745961536608, + "learning_rate": 1.2480878835958199e-06, + "loss": 0.0603, + "step": 26980 + }, + { + "epoch": 2.331850187913085, + "grad_norm": 0.6446461921885446, + "learning_rate": 1.245006670141709e-06, + "loss": 0.0606, + "step": 26990 + }, + { + "epoch": 2.3327141561190547, + "grad_norm": 0.6720018979198659, + "learning_rate": 1.2419287238048327e-06, + "loss": 0.0615, + "step": 27000 + }, + { + "epoch": 2.3335781243250246, + "grad_norm": 0.6209878169214561, + "learning_rate": 1.238854047263221e-06, + "loss": 0.0624, + "step": 27010 + }, + { + "epoch": 2.334442092530995, + "grad_norm": 0.6223483037469352, + "learning_rate": 1.2357826431920706e-06, + "loss": 0.059, + "step": 27020 + }, + { + "epoch": 2.335306060736965, + "grad_norm": 0.6514212276691695, + "learning_rate": 1.2327145142637237e-06, + "loss": 0.0605, + "step": 27030 + }, + { + "epoch": 2.336170028942935, + "grad_norm": 0.6010763844851291, + "learning_rate": 1.2296496631476807e-06, + "loss": 0.0606, + "step": 27040 + }, + { + "epoch": 2.3370339971489047, + "grad_norm": 0.6531152999704667, + "learning_rate": 1.2265880925105777e-06, + "loss": 0.0591, + "step": 27050 + }, + { + "epoch": 2.337897965354875, + "grad_norm": 0.6706702248713575, + "learning_rate": 1.2235298050162093e-06, + "loss": 0.0592, + "step": 27060 + }, + { + "epoch": 2.338761933560845, + "grad_norm": 0.6501204240351136, + "learning_rate": 1.2204748033255054e-06, + "loss": 0.0601, + "step": 27070 + }, + { + "epoch": 2.339625901766815, + "grad_norm": 0.6360109585574683, + "learning_rate": 1.2174230900965407e-06, + "loss": 0.0622, + "step": 27080 + }, + { + "epoch": 2.340489869972785, + "grad_norm": 0.7292331220131177, + "learning_rate": 1.214374667984527e-06, + "loss": 0.0602, + "step": 27090 + }, + { + "epoch": 2.341353838178755, + "grad_norm": 0.64431736347878, + "learning_rate": 1.2113295396418124e-06, + "loss": 0.06, + "step": 27100 + }, + { + "epoch": 2.342217806384725, + "grad_norm": 0.6693597385480407, + "learning_rate": 1.2082877077178834e-06, + "loss": 0.0604, + "step": 27110 + }, + { + "epoch": 2.343081774590695, + "grad_norm": 0.70203505820828, + "learning_rate": 1.2052491748593492e-06, + "loss": 0.0584, + "step": 27120 + }, + { + "epoch": 2.3439457427966652, + "grad_norm": 0.6529968481749552, + "learning_rate": 1.2022139437099601e-06, + "loss": 0.0582, + "step": 27130 + }, + { + "epoch": 2.344809711002635, + "grad_norm": 0.6292470117300617, + "learning_rate": 1.1991820169105827e-06, + "loss": 0.0591, + "step": 27140 + }, + { + "epoch": 2.345673679208605, + "grad_norm": 0.6937281317516001, + "learning_rate": 1.19615339709922e-06, + "loss": 0.0611, + "step": 27150 + }, + { + "epoch": 2.346537647414575, + "grad_norm": 0.667882030426861, + "learning_rate": 1.193128086910985e-06, + "loss": 0.0572, + "step": 27160 + }, + { + "epoch": 2.3474016156205453, + "grad_norm": 0.6415713468157893, + "learning_rate": 1.1901060889781224e-06, + "loss": 0.0596, + "step": 27170 + }, + { + "epoch": 2.348265583826515, + "grad_norm": 0.6686307087579261, + "learning_rate": 1.1870874059299875e-06, + "loss": 0.0594, + "step": 27180 + }, + { + "epoch": 2.349129552032485, + "grad_norm": 0.6509117568868412, + "learning_rate": 1.1840720403930555e-06, + "loss": 0.0594, + "step": 27190 + }, + { + "epoch": 2.3499935202384554, + "grad_norm": 0.6287633557030508, + "learning_rate": 1.1810599949909124e-06, + "loss": 0.0589, + "step": 27200 + }, + { + "epoch": 2.3508574884444253, + "grad_norm": 0.6378929565344178, + "learning_rate": 1.1780512723442556e-06, + "loss": 0.0607, + "step": 27210 + }, + { + "epoch": 2.3517214566503952, + "grad_norm": 0.6682225214396356, + "learning_rate": 1.1750458750708959e-06, + "loss": 0.0609, + "step": 27220 + }, + { + "epoch": 2.352585424856365, + "grad_norm": 0.664213964785015, + "learning_rate": 1.1720438057857458e-06, + "loss": 0.0589, + "step": 27230 + }, + { + "epoch": 2.3534493930623355, + "grad_norm": 0.7004254892208797, + "learning_rate": 1.169045067100824e-06, + "loss": 0.0593, + "step": 27240 + }, + { + "epoch": 2.3543133612683054, + "grad_norm": 0.6778013453063583, + "learning_rate": 1.1660496616252498e-06, + "loss": 0.0593, + "step": 27250 + }, + { + "epoch": 2.3551773294742753, + "grad_norm": 0.6625578892854007, + "learning_rate": 1.1630575919652459e-06, + "loss": 0.0592, + "step": 27260 + }, + { + "epoch": 2.356041297680245, + "grad_norm": 0.6889032338164695, + "learning_rate": 1.16006886072413e-06, + "loss": 0.0598, + "step": 27270 + }, + { + "epoch": 2.3569052658862155, + "grad_norm": 0.656085217318829, + "learning_rate": 1.1570834705023148e-06, + "loss": 0.058, + "step": 27280 + }, + { + "epoch": 2.3577692340921854, + "grad_norm": 0.6322543933666873, + "learning_rate": 1.1541014238973076e-06, + "loss": 0.0594, + "step": 27290 + }, + { + "epoch": 2.3586332022981553, + "grad_norm": 0.6621617238381241, + "learning_rate": 1.1511227235037036e-06, + "loss": 0.0603, + "step": 27300 + }, + { + "epoch": 2.3594971705041257, + "grad_norm": 0.6675369069452474, + "learning_rate": 1.1481473719131935e-06, + "loss": 0.0591, + "step": 27310 + }, + { + "epoch": 2.3603611387100956, + "grad_norm": 0.6403904619439585, + "learning_rate": 1.1451753717145436e-06, + "loss": 0.057, + "step": 27320 + }, + { + "epoch": 2.3612251069160655, + "grad_norm": 0.6974195948916855, + "learning_rate": 1.1422067254936148e-06, + "loss": 0.0564, + "step": 27330 + }, + { + "epoch": 2.3620890751220354, + "grad_norm": 0.6844415275124478, + "learning_rate": 1.139241435833342e-06, + "loss": 0.0605, + "step": 27340 + }, + { + "epoch": 2.3629530433280057, + "grad_norm": 0.6308979148922937, + "learning_rate": 1.1362795053137477e-06, + "loss": 0.0591, + "step": 27350 + }, + { + "epoch": 2.3638170115339756, + "grad_norm": 0.6868448909356765, + "learning_rate": 1.1333209365119198e-06, + "loss": 0.0606, + "step": 27360 + }, + { + "epoch": 2.3646809797399455, + "grad_norm": 0.6814825334027916, + "learning_rate": 1.130365732002034e-06, + "loss": 0.0597, + "step": 27370 + }, + { + "epoch": 2.3655449479459154, + "grad_norm": 0.6505325443513633, + "learning_rate": 1.1274138943553303e-06, + "loss": 0.0601, + "step": 27380 + }, + { + "epoch": 2.3664089161518858, + "grad_norm": 0.6516230369514702, + "learning_rate": 1.1244654261401233e-06, + "loss": 0.0573, + "step": 27390 + }, + { + "epoch": 2.3672728843578557, + "grad_norm": 0.6351111112315094, + "learning_rate": 1.1215203299217941e-06, + "loss": 0.0591, + "step": 27400 + }, + { + "epoch": 2.3681368525638256, + "grad_norm": 0.6323350777816571, + "learning_rate": 1.1185786082627897e-06, + "loss": 0.0582, + "step": 27410 + }, + { + "epoch": 2.369000820769796, + "grad_norm": 0.669953614513429, + "learning_rate": 1.1156402637226254e-06, + "loss": 0.0601, + "step": 27420 + }, + { + "epoch": 2.369864788975766, + "grad_norm": 0.6477445841820543, + "learning_rate": 1.112705298857869e-06, + "loss": 0.0608, + "step": 27430 + }, + { + "epoch": 2.3707287571817357, + "grad_norm": 0.6971705503297287, + "learning_rate": 1.1097737162221584e-06, + "loss": 0.0585, + "step": 27440 + }, + { + "epoch": 2.3715927253877056, + "grad_norm": 0.6874010419235855, + "learning_rate": 1.1068455183661796e-06, + "loss": 0.0611, + "step": 27450 + }, + { + "epoch": 2.3724566935936755, + "grad_norm": 0.6634814836649913, + "learning_rate": 1.103920707837683e-06, + "loss": 0.0573, + "step": 27460 + }, + { + "epoch": 2.373320661799646, + "grad_norm": 0.6668055692462801, + "learning_rate": 1.10099928718146e-06, + "loss": 0.0606, + "step": 27470 + }, + { + "epoch": 2.3741846300056157, + "grad_norm": 0.659507511140097, + "learning_rate": 1.0980812589393637e-06, + "loss": 0.0584, + "step": 27480 + }, + { + "epoch": 2.3750485982115856, + "grad_norm": 0.7058506378776868, + "learning_rate": 1.095166625650289e-06, + "loss": 0.0605, + "step": 27490 + }, + { + "epoch": 2.375912566417556, + "grad_norm": 0.6616478710874778, + "learning_rate": 1.0922553898501799e-06, + "loss": 0.0575, + "step": 27500 + }, + { + "epoch": 2.376776534623526, + "grad_norm": 0.6416220397649288, + "learning_rate": 1.0893475540720215e-06, + "loss": 0.0574, + "step": 27510 + }, + { + "epoch": 2.377640502829496, + "grad_norm": 0.6851881227890764, + "learning_rate": 1.086443120845842e-06, + "loss": 0.0603, + "step": 27520 + }, + { + "epoch": 2.378504471035466, + "grad_norm": 0.643914935400431, + "learning_rate": 1.0835420926987123e-06, + "loss": 0.0594, + "step": 27530 + }, + { + "epoch": 2.379368439241436, + "grad_norm": 0.7065001618716296, + "learning_rate": 1.0806444721547367e-06, + "loss": 0.0598, + "step": 27540 + }, + { + "epoch": 2.380232407447406, + "grad_norm": 0.6531887811707487, + "learning_rate": 1.077750261735055e-06, + "loss": 0.0632, + "step": 27550 + }, + { + "epoch": 2.381096375653376, + "grad_norm": 0.6782902569583262, + "learning_rate": 1.0748594639578391e-06, + "loss": 0.0597, + "step": 27560 + }, + { + "epoch": 2.3819603438593457, + "grad_norm": 0.6590868743045272, + "learning_rate": 1.0719720813382972e-06, + "loss": 0.0596, + "step": 27570 + }, + { + "epoch": 2.382824312065316, + "grad_norm": 0.6852303874619127, + "learning_rate": 1.0690881163886602e-06, + "loss": 0.0586, + "step": 27580 + }, + { + "epoch": 2.383688280271286, + "grad_norm": 0.6691472842481307, + "learning_rate": 1.066207571618187e-06, + "loss": 0.0586, + "step": 27590 + }, + { + "epoch": 2.384552248477256, + "grad_norm": 0.6758493976482691, + "learning_rate": 1.0633304495331614e-06, + "loss": 0.0587, + "step": 27600 + }, + { + "epoch": 2.3854162166832262, + "grad_norm": 0.6360569860336877, + "learning_rate": 1.0604567526368875e-06, + "loss": 0.0598, + "step": 27610 + }, + { + "epoch": 2.386280184889196, + "grad_norm": 0.6688496501564594, + "learning_rate": 1.057586483429694e-06, + "loss": 0.0602, + "step": 27620 + }, + { + "epoch": 2.387144153095166, + "grad_norm": 0.6318508440444366, + "learning_rate": 1.054719644408919e-06, + "loss": 0.0599, + "step": 27630 + }, + { + "epoch": 2.388008121301136, + "grad_norm": 0.6679409504648013, + "learning_rate": 1.051856238068925e-06, + "loss": 0.0589, + "step": 27640 + }, + { + "epoch": 2.3888720895071063, + "grad_norm": 0.6845032317565701, + "learning_rate": 1.0489962669010817e-06, + "loss": 0.0584, + "step": 27650 + }, + { + "epoch": 2.389736057713076, + "grad_norm": 0.6218207202922958, + "learning_rate": 1.0461397333937728e-06, + "loss": 0.0586, + "step": 27660 + }, + { + "epoch": 2.390600025919046, + "grad_norm": 0.6504975420124806, + "learning_rate": 1.0432866400323883e-06, + "loss": 0.0593, + "step": 27670 + }, + { + "epoch": 2.391463994125016, + "grad_norm": 0.6627505995283275, + "learning_rate": 1.0404369892993299e-06, + "loss": 0.0601, + "step": 27680 + }, + { + "epoch": 2.3923279623309863, + "grad_norm": 0.6866333978458881, + "learning_rate": 1.037590783673999e-06, + "loss": 0.0603, + "step": 27690 + }, + { + "epoch": 2.3931919305369562, + "grad_norm": 0.6274084673491703, + "learning_rate": 1.0347480256328025e-06, + "loss": 0.0597, + "step": 27700 + }, + { + "epoch": 2.394055898742926, + "grad_norm": 0.6381370089501482, + "learning_rate": 1.0319087176491455e-06, + "loss": 0.0577, + "step": 27710 + }, + { + "epoch": 2.3949198669488965, + "grad_norm": 0.6704993469854148, + "learning_rate": 1.0290728621934315e-06, + "loss": 0.0599, + "step": 27720 + }, + { + "epoch": 2.3957838351548664, + "grad_norm": 0.6716724508184789, + "learning_rate": 1.0262404617330652e-06, + "loss": 0.0592, + "step": 27730 + }, + { + "epoch": 2.3966478033608363, + "grad_norm": 0.6583842494843922, + "learning_rate": 1.023411518732435e-06, + "loss": 0.058, + "step": 27740 + }, + { + "epoch": 2.397511771566806, + "grad_norm": 0.655274656221077, + "learning_rate": 1.0205860356529318e-06, + "loss": 0.059, + "step": 27750 + }, + { + "epoch": 2.3983757397727765, + "grad_norm": 0.6339739798600978, + "learning_rate": 1.0177640149529277e-06, + "loss": 0.0609, + "step": 27760 + }, + { + "epoch": 2.3992397079787464, + "grad_norm": 0.6884819006309391, + "learning_rate": 1.014945459087791e-06, + "loss": 0.0601, + "step": 27770 + }, + { + "epoch": 2.4001036761847163, + "grad_norm": 0.6504434154351232, + "learning_rate": 1.0121303705098646e-06, + "loss": 0.0583, + "step": 27780 + }, + { + "epoch": 2.400967644390686, + "grad_norm": 0.6478456711603433, + "learning_rate": 1.0093187516684832e-06, + "loss": 0.0624, + "step": 27790 + }, + { + "epoch": 2.4018316125966566, + "grad_norm": 0.6253213623206553, + "learning_rate": 1.0065106050099599e-06, + "loss": 0.0587, + "step": 27800 + }, + { + "epoch": 2.4026955808026265, + "grad_norm": 0.6790278170027676, + "learning_rate": 1.003705932977585e-06, + "loss": 0.0596, + "step": 27810 + }, + { + "epoch": 2.4035595490085964, + "grad_norm": 0.6798115066515837, + "learning_rate": 1.0009047380116283e-06, + "loss": 0.0607, + "step": 27820 + }, + { + "epoch": 2.4044235172145667, + "grad_norm": 0.6604675110327878, + "learning_rate": 9.98107022549331e-07, + "loss": 0.0604, + "step": 27830 + }, + { + "epoch": 2.4052874854205366, + "grad_norm": 0.6790462682499643, + "learning_rate": 9.95312789024912e-07, + "loss": 0.0568, + "step": 27840 + }, + { + "epoch": 2.4061514536265065, + "grad_norm": 0.6128078717477331, + "learning_rate": 9.925220398695562e-07, + "loss": 0.0596, + "step": 27850 + }, + { + "epoch": 2.4070154218324764, + "grad_norm": 0.6753776573614281, + "learning_rate": 9.897347775114185e-07, + "loss": 0.0608, + "step": 27860 + }, + { + "epoch": 2.4078793900384468, + "grad_norm": 0.645529040989447, + "learning_rate": 9.86951004375618e-07, + "loss": 0.0586, + "step": 27870 + }, + { + "epoch": 2.4087433582444167, + "grad_norm": 0.6658258287934132, + "learning_rate": 9.841707228842428e-07, + "loss": 0.0596, + "step": 27880 + }, + { + "epoch": 2.4096073264503866, + "grad_norm": 0.6567521212851223, + "learning_rate": 9.81393935456339e-07, + "loss": 0.0576, + "step": 27890 + }, + { + "epoch": 2.4104712946563565, + "grad_norm": 0.660656906400927, + "learning_rate": 9.786206445079127e-07, + "loss": 0.058, + "step": 27900 + }, + { + "epoch": 2.411335262862327, + "grad_norm": 0.6380899683509319, + "learning_rate": 9.75850852451929e-07, + "loss": 0.0589, + "step": 27910 + }, + { + "epoch": 2.4121992310682967, + "grad_norm": 0.6517954005215502, + "learning_rate": 9.73084561698307e-07, + "loss": 0.0599, + "step": 27920 + }, + { + "epoch": 2.4130631992742666, + "grad_norm": 0.6807352997523919, + "learning_rate": 9.703217746539256e-07, + "loss": 0.0604, + "step": 27930 + }, + { + "epoch": 2.413927167480237, + "grad_norm": 0.633359951134548, + "learning_rate": 9.675624937226037e-07, + "loss": 0.0582, + "step": 27940 + }, + { + "epoch": 2.414791135686207, + "grad_norm": 0.661588990732132, + "learning_rate": 9.648067213051216e-07, + "loss": 0.062, + "step": 27950 + }, + { + "epoch": 2.4156551038921767, + "grad_norm": 0.6643855642016916, + "learning_rate": 9.620544597991992e-07, + "loss": 0.059, + "step": 27960 + }, + { + "epoch": 2.4165190720981466, + "grad_norm": 0.6772126749791726, + "learning_rate": 9.593057115995053e-07, + "loss": 0.06, + "step": 27970 + }, + { + "epoch": 2.417383040304117, + "grad_norm": 0.6194181138318597, + "learning_rate": 9.565604790976485e-07, + "loss": 0.0578, + "step": 27980 + }, + { + "epoch": 2.418247008510087, + "grad_norm": 0.6764386780365282, + "learning_rate": 9.53818764682184e-07, + "loss": 0.058, + "step": 27990 + }, + { + "epoch": 2.419110976716057, + "grad_norm": 0.6927664968419396, + "learning_rate": 9.510805707386006e-07, + "loss": 0.0599, + "step": 28000 + }, + { + "epoch": 2.4199749449220267, + "grad_norm": 0.674259380191846, + "learning_rate": 9.483458996493267e-07, + "loss": 0.0599, + "step": 28010 + }, + { + "epoch": 2.420838913127997, + "grad_norm": 0.6678938458437413, + "learning_rate": 9.456147537937249e-07, + "loss": 0.0607, + "step": 28020 + }, + { + "epoch": 2.421702881333967, + "grad_norm": 0.6678011366556437, + "learning_rate": 9.42887135548089e-07, + "loss": 0.0579, + "step": 28030 + }, + { + "epoch": 2.422566849539937, + "grad_norm": 0.6879787431287953, + "learning_rate": 9.401630472856499e-07, + "loss": 0.0601, + "step": 28040 + }, + { + "epoch": 2.423430817745907, + "grad_norm": 0.6705502097546254, + "learning_rate": 9.374424913765567e-07, + "loss": 0.0579, + "step": 28050 + }, + { + "epoch": 2.424294785951877, + "grad_norm": 0.6599368510358516, + "learning_rate": 9.347254701878943e-07, + "loss": 0.06, + "step": 28060 + }, + { + "epoch": 2.425158754157847, + "grad_norm": 0.6826991683827245, + "learning_rate": 9.320119860836674e-07, + "loss": 0.0614, + "step": 28070 + }, + { + "epoch": 2.426022722363817, + "grad_norm": 0.6762089597321239, + "learning_rate": 9.293020414248072e-07, + "loss": 0.0594, + "step": 28080 + }, + { + "epoch": 2.426886690569787, + "grad_norm": 0.6460291522786121, + "learning_rate": 9.265956385691583e-07, + "loss": 0.0579, + "step": 28090 + }, + { + "epoch": 2.427750658775757, + "grad_norm": 0.6506399156335623, + "learning_rate": 9.238927798714908e-07, + "loss": 0.0581, + "step": 28100 + }, + { + "epoch": 2.428614626981727, + "grad_norm": 0.6684113157689684, + "learning_rate": 9.211934676834882e-07, + "loss": 0.0594, + "step": 28110 + }, + { + "epoch": 2.429478595187697, + "grad_norm": 0.6356040848491912, + "learning_rate": 9.184977043537474e-07, + "loss": 0.0573, + "step": 28120 + }, + { + "epoch": 2.4303425633936673, + "grad_norm": 0.6847493066510186, + "learning_rate": 9.158054922277787e-07, + "loss": 0.0605, + "step": 28130 + }, + { + "epoch": 2.431206531599637, + "grad_norm": 0.7050255385387857, + "learning_rate": 9.131168336480018e-07, + "loss": 0.059, + "step": 28140 + }, + { + "epoch": 2.432070499805607, + "grad_norm": 0.6719753806637919, + "learning_rate": 9.104317309537469e-07, + "loss": 0.0585, + "step": 28150 + }, + { + "epoch": 2.4329344680115774, + "grad_norm": 0.6482007708428015, + "learning_rate": 9.077501864812476e-07, + "loss": 0.0606, + "step": 28160 + }, + { + "epoch": 2.4337984362175473, + "grad_norm": 0.6634719349909597, + "learning_rate": 9.050722025636427e-07, + "loss": 0.0557, + "step": 28170 + }, + { + "epoch": 2.4346624044235172, + "grad_norm": 0.6551058466757594, + "learning_rate": 9.023977815309714e-07, + "loss": 0.0594, + "step": 28180 + }, + { + "epoch": 2.435526372629487, + "grad_norm": 0.6744881656835849, + "learning_rate": 8.997269257101776e-07, + "loss": 0.0607, + "step": 28190 + }, + { + "epoch": 2.436390340835457, + "grad_norm": 0.6918028486996344, + "learning_rate": 8.970596374250984e-07, + "loss": 0.0606, + "step": 28200 + }, + { + "epoch": 2.4372543090414274, + "grad_norm": 0.6315554234710237, + "learning_rate": 8.94395918996468e-07, + "loss": 0.0573, + "step": 28210 + }, + { + "epoch": 2.4381182772473973, + "grad_norm": 0.6532510959054035, + "learning_rate": 8.917357727419157e-07, + "loss": 0.0581, + "step": 28220 + }, + { + "epoch": 2.438982245453367, + "grad_norm": 0.6657451241588958, + "learning_rate": 8.890792009759624e-07, + "loss": 0.0607, + "step": 28230 + }, + { + "epoch": 2.4398462136593375, + "grad_norm": 0.6798649150950511, + "learning_rate": 8.864262060100182e-07, + "loss": 0.0582, + "step": 28240 + }, + { + "epoch": 2.4407101818653074, + "grad_norm": 0.6867048576047796, + "learning_rate": 8.837767901523808e-07, + "loss": 0.0588, + "step": 28250 + }, + { + "epoch": 2.4415741500712773, + "grad_norm": 0.6368106790875463, + "learning_rate": 8.81130955708237e-07, + "loss": 0.0578, + "step": 28260 + }, + { + "epoch": 2.442438118277247, + "grad_norm": 0.6514293184898988, + "learning_rate": 8.784887049796537e-07, + "loss": 0.0553, + "step": 28270 + }, + { + "epoch": 2.4433020864832176, + "grad_norm": 0.6861528684747734, + "learning_rate": 8.758500402655811e-07, + "loss": 0.0583, + "step": 28280 + }, + { + "epoch": 2.4441660546891875, + "grad_norm": 0.6408952542356624, + "learning_rate": 8.732149638618481e-07, + "loss": 0.0587, + "step": 28290 + }, + { + "epoch": 2.4450300228951574, + "grad_norm": 0.6583448327327999, + "learning_rate": 8.70583478061166e-07, + "loss": 0.0599, + "step": 28300 + }, + { + "epoch": 2.4458939911011273, + "grad_norm": 0.6841081059724552, + "learning_rate": 8.679555851531168e-07, + "loss": 0.058, + "step": 28310 + }, + { + "epoch": 2.4467579593070976, + "grad_norm": 0.6287400345247842, + "learning_rate": 8.653312874241587e-07, + "loss": 0.0594, + "step": 28320 + }, + { + "epoch": 2.4476219275130675, + "grad_norm": 0.6443569285000627, + "learning_rate": 8.627105871576214e-07, + "loss": 0.0575, + "step": 28330 + }, + { + "epoch": 2.4484858957190374, + "grad_norm": 0.6537062566918245, + "learning_rate": 8.600934866337035e-07, + "loss": 0.0595, + "step": 28340 + }, + { + "epoch": 2.4493498639250078, + "grad_norm": 0.6728442936950617, + "learning_rate": 8.57479988129476e-07, + "loss": 0.0608, + "step": 28350 + }, + { + "epoch": 2.4502138321309777, + "grad_norm": 0.6392937359718057, + "learning_rate": 8.548700939188686e-07, + "loss": 0.0572, + "step": 28360 + }, + { + "epoch": 2.4510778003369476, + "grad_norm": 0.6616002855883878, + "learning_rate": 8.522638062726823e-07, + "loss": 0.0582, + "step": 28370 + }, + { + "epoch": 2.4519417685429175, + "grad_norm": 0.6671488566913149, + "learning_rate": 8.496611274585759e-07, + "loss": 0.0582, + "step": 28380 + }, + { + "epoch": 2.452805736748888, + "grad_norm": 0.6564631235016541, + "learning_rate": 8.470620597410689e-07, + "loss": 0.061, + "step": 28390 + }, + { + "epoch": 2.4536697049548577, + "grad_norm": 0.6637031889827743, + "learning_rate": 8.444666053815375e-07, + "loss": 0.0622, + "step": 28400 + }, + { + "epoch": 2.4545336731608276, + "grad_norm": 0.7176647827953652, + "learning_rate": 8.418747666382188e-07, + "loss": 0.059, + "step": 28410 + }, + { + "epoch": 2.4553976413667975, + "grad_norm": 0.6896783584801283, + "learning_rate": 8.392865457662002e-07, + "loss": 0.0604, + "step": 28420 + }, + { + "epoch": 2.456261609572768, + "grad_norm": 0.6828733344280327, + "learning_rate": 8.367019450174208e-07, + "loss": 0.0603, + "step": 28430 + }, + { + "epoch": 2.4571255777787377, + "grad_norm": 0.6659408708812785, + "learning_rate": 8.341209666406724e-07, + "loss": 0.0593, + "step": 28440 + }, + { + "epoch": 2.4579895459847076, + "grad_norm": 0.6766561239252696, + "learning_rate": 8.315436128815918e-07, + "loss": 0.059, + "step": 28450 + }, + { + "epoch": 2.458853514190678, + "grad_norm": 0.6530791811491967, + "learning_rate": 8.289698859826667e-07, + "loss": 0.0576, + "step": 28460 + }, + { + "epoch": 2.459717482396648, + "grad_norm": 0.6605880314782795, + "learning_rate": 8.263997881832258e-07, + "loss": 0.0575, + "step": 28470 + }, + { + "epoch": 2.460581450602618, + "grad_norm": 0.6717909641695353, + "learning_rate": 8.238333217194411e-07, + "loss": 0.0578, + "step": 28480 + }, + { + "epoch": 2.4614454188085877, + "grad_norm": 0.6542064505610335, + "learning_rate": 8.212704888243245e-07, + "loss": 0.0576, + "step": 28490 + }, + { + "epoch": 2.462309387014558, + "grad_norm": 0.7133803141730897, + "learning_rate": 8.187112917277279e-07, + "loss": 0.0575, + "step": 28500 + }, + { + "epoch": 2.463173355220528, + "grad_norm": 0.6987238128077268, + "learning_rate": 8.161557326563374e-07, + "loss": 0.0601, + "step": 28510 + }, + { + "epoch": 2.464037323426498, + "grad_norm": 0.658559293565418, + "learning_rate": 8.136038138336754e-07, + "loss": 0.0573, + "step": 28520 + }, + { + "epoch": 2.4649012916324677, + "grad_norm": 0.6867165061226779, + "learning_rate": 8.110555374800988e-07, + "loss": 0.0612, + "step": 28530 + }, + { + "epoch": 2.465765259838438, + "grad_norm": 0.6616775930013254, + "learning_rate": 8.085109058127916e-07, + "loss": 0.0589, + "step": 28540 + }, + { + "epoch": 2.466629228044408, + "grad_norm": 0.6748442619397084, + "learning_rate": 8.059699210457695e-07, + "loss": 0.0602, + "step": 28550 + }, + { + "epoch": 2.467493196250378, + "grad_norm": 0.6784319465044278, + "learning_rate": 8.034325853898716e-07, + "loss": 0.0601, + "step": 28560 + }, + { + "epoch": 2.4683571644563482, + "grad_norm": 0.6696709196103713, + "learning_rate": 8.008989010527674e-07, + "loss": 0.0583, + "step": 28570 + }, + { + "epoch": 2.469221132662318, + "grad_norm": 0.6752973689391882, + "learning_rate": 7.983688702389447e-07, + "loss": 0.0593, + "step": 28580 + }, + { + "epoch": 2.470085100868288, + "grad_norm": 0.6121634874033778, + "learning_rate": 7.958424951497157e-07, + "loss": 0.0586, + "step": 28590 + }, + { + "epoch": 2.470949069074258, + "grad_norm": 0.6580181912566206, + "learning_rate": 7.933197779832091e-07, + "loss": 0.0568, + "step": 28600 + }, + { + "epoch": 2.4718130372802283, + "grad_norm": 0.6552538023898506, + "learning_rate": 7.908007209343716e-07, + "loss": 0.0578, + "step": 28610 + }, + { + "epoch": 2.472677005486198, + "grad_norm": 0.6669667599185005, + "learning_rate": 7.882853261949692e-07, + "loss": 0.0593, + "step": 28620 + }, + { + "epoch": 2.473540973692168, + "grad_norm": 0.6340279306362967, + "learning_rate": 7.857735959535739e-07, + "loss": 0.055, + "step": 28630 + }, + { + "epoch": 2.474404941898138, + "grad_norm": 0.6530282367866861, + "learning_rate": 7.832655323955773e-07, + "loss": 0.0596, + "step": 28640 + }, + { + "epoch": 2.4752689101041083, + "grad_norm": 0.66428227074611, + "learning_rate": 7.807611377031738e-07, + "loss": 0.0576, + "step": 28650 + }, + { + "epoch": 2.4761328783100782, + "grad_norm": 0.5941794758457453, + "learning_rate": 7.782604140553734e-07, + "loss": 0.0583, + "step": 28660 + }, + { + "epoch": 2.476996846516048, + "grad_norm": 0.6814535079374869, + "learning_rate": 7.757633636279826e-07, + "loss": 0.0605, + "step": 28670 + }, + { + "epoch": 2.4778608147220185, + "grad_norm": 0.6802629053649004, + "learning_rate": 7.732699885936201e-07, + "loss": 0.0576, + "step": 28680 + }, + { + "epoch": 2.4787247829279884, + "grad_norm": 0.6726871693359314, + "learning_rate": 7.707802911217027e-07, + "loss": 0.0591, + "step": 28690 + }, + { + "epoch": 2.4795887511339583, + "grad_norm": 0.6870407955530046, + "learning_rate": 7.682942733784476e-07, + "loss": 0.0587, + "step": 28700 + }, + { + "epoch": 2.480452719339928, + "grad_norm": 0.6289813857003265, + "learning_rate": 7.658119375268714e-07, + "loss": 0.0576, + "step": 28710 + }, + { + "epoch": 2.481316687545898, + "grad_norm": 0.6398280282296972, + "learning_rate": 7.633332857267856e-07, + "loss": 0.0582, + "step": 28720 + }, + { + "epoch": 2.4821806557518684, + "grad_norm": 0.643099284809275, + "learning_rate": 7.608583201348002e-07, + "loss": 0.0594, + "step": 28730 + }, + { + "epoch": 2.4830446239578383, + "grad_norm": 0.6469507902323824, + "learning_rate": 7.583870429043134e-07, + "loss": 0.0557, + "step": 28740 + }, + { + "epoch": 2.483908592163808, + "grad_norm": 0.664417794823054, + "learning_rate": 7.55919456185516e-07, + "loss": 0.0563, + "step": 28750 + }, + { + "epoch": 2.4847725603697786, + "grad_norm": 0.654464666999886, + "learning_rate": 7.534555621253875e-07, + "loss": 0.0568, + "step": 28760 + }, + { + "epoch": 2.4856365285757485, + "grad_norm": 0.6736856787614522, + "learning_rate": 7.509953628676963e-07, + "loss": 0.0591, + "step": 28770 + }, + { + "epoch": 2.4865004967817184, + "grad_norm": 0.6689201937782084, + "learning_rate": 7.485388605529942e-07, + "loss": 0.0605, + "step": 28780 + }, + { + "epoch": 2.4873644649876883, + "grad_norm": 0.671068501395764, + "learning_rate": 7.460860573186168e-07, + "loss": 0.0591, + "step": 28790 + }, + { + "epoch": 2.4882284331936586, + "grad_norm": 0.6849956789413019, + "learning_rate": 7.43636955298681e-07, + "loss": 0.0605, + "step": 28800 + }, + { + "epoch": 2.4890924013996285, + "grad_norm": 0.6174444958621187, + "learning_rate": 7.411915566240835e-07, + "loss": 0.0572, + "step": 28810 + }, + { + "epoch": 2.4899563696055984, + "grad_norm": 0.6613532292211507, + "learning_rate": 7.387498634224988e-07, + "loss": 0.0612, + "step": 28820 + }, + { + "epoch": 2.4908203378115683, + "grad_norm": 0.6281340387625589, + "learning_rate": 7.36311877818377e-07, + "loss": 0.057, + "step": 28830 + }, + { + "epoch": 2.4916843060175387, + "grad_norm": 0.6472955772427801, + "learning_rate": 7.338776019329452e-07, + "loss": 0.0595, + "step": 28840 + }, + { + "epoch": 2.4925482742235086, + "grad_norm": 0.6465898107352807, + "learning_rate": 7.314470378841987e-07, + "loss": 0.0579, + "step": 28850 + }, + { + "epoch": 2.4934122424294785, + "grad_norm": 0.6872080656870608, + "learning_rate": 7.290201877869052e-07, + "loss": 0.0577, + "step": 28860 + }, + { + "epoch": 2.494276210635449, + "grad_norm": 0.6546298045170468, + "learning_rate": 7.26597053752599e-07, + "loss": 0.0593, + "step": 28870 + }, + { + "epoch": 2.4951401788414187, + "grad_norm": 0.6501124516401879, + "learning_rate": 7.241776378895865e-07, + "loss": 0.0567, + "step": 28880 + }, + { + "epoch": 2.4960041470473886, + "grad_norm": 0.6489435699910041, + "learning_rate": 7.217619423029332e-07, + "loss": 0.0588, + "step": 28890 + }, + { + "epoch": 2.4968681152533585, + "grad_norm": 0.6535221870925897, + "learning_rate": 7.193499690944706e-07, + "loss": 0.0592, + "step": 28900 + }, + { + "epoch": 2.497732083459329, + "grad_norm": 0.6771730869265828, + "learning_rate": 7.169417203627898e-07, + "loss": 0.0579, + "step": 28910 + }, + { + "epoch": 2.4985960516652987, + "grad_norm": 0.6359104906466674, + "learning_rate": 7.145371982032423e-07, + "loss": 0.0575, + "step": 28920 + }, + { + "epoch": 2.4994600198712686, + "grad_norm": 0.6722845799249196, + "learning_rate": 7.121364047079405e-07, + "loss": 0.059, + "step": 28930 + }, + { + "epoch": 2.5003239880772385, + "grad_norm": 0.6749302305814222, + "learning_rate": 7.097393419657439e-07, + "loss": 0.0584, + "step": 28940 + }, + { + "epoch": 2.501187956283209, + "grad_norm": 0.6397819866220034, + "learning_rate": 7.07346012062275e-07, + "loss": 0.0596, + "step": 28950 + }, + { + "epoch": 2.502051924489179, + "grad_norm": 0.649234976789809, + "learning_rate": 7.049564170799034e-07, + "loss": 0.0588, + "step": 28960 + }, + { + "epoch": 2.5029158926951487, + "grad_norm": 0.6194457878662328, + "learning_rate": 7.025705590977528e-07, + "loss": 0.0586, + "step": 28970 + }, + { + "epoch": 2.503779860901119, + "grad_norm": 0.6254725233188696, + "learning_rate": 7.001884401916898e-07, + "loss": 0.0584, + "step": 28980 + }, + { + "epoch": 2.504643829107089, + "grad_norm": 0.6533137706948257, + "learning_rate": 6.978100624343332e-07, + "loss": 0.0568, + "step": 28990 + }, + { + "epoch": 2.505507797313059, + "grad_norm": 0.6726580638707064, + "learning_rate": 6.954354278950443e-07, + "loss": 0.0582, + "step": 29000 + }, + { + "epoch": 2.5063717655190287, + "grad_norm": 0.7037083198887285, + "learning_rate": 6.930645386399277e-07, + "loss": 0.0614, + "step": 29010 + }, + { + "epoch": 2.507235733724999, + "grad_norm": 0.7033536212322138, + "learning_rate": 6.906973967318287e-07, + "loss": 0.0608, + "step": 29020 + }, + { + "epoch": 2.508099701930969, + "grad_norm": 0.6409072157382765, + "learning_rate": 6.883340042303333e-07, + "loss": 0.0574, + "step": 29030 + }, + { + "epoch": 2.508963670136939, + "grad_norm": 0.6445925601552681, + "learning_rate": 6.859743631917653e-07, + "loss": 0.0583, + "step": 29040 + }, + { + "epoch": 2.509827638342909, + "grad_norm": 0.6992038269097037, + "learning_rate": 6.836184756691838e-07, + "loss": 0.0597, + "step": 29050 + }, + { + "epoch": 2.510691606548879, + "grad_norm": 0.6687242847138208, + "learning_rate": 6.812663437123823e-07, + "loss": 0.058, + "step": 29060 + }, + { + "epoch": 2.511555574754849, + "grad_norm": 0.6608479078655375, + "learning_rate": 6.789179693678855e-07, + "loss": 0.059, + "step": 29070 + }, + { + "epoch": 2.512419542960819, + "grad_norm": 0.6961967420294427, + "learning_rate": 6.765733546789527e-07, + "loss": 0.0596, + "step": 29080 + }, + { + "epoch": 2.5132835111667893, + "grad_norm": 0.6380443357234777, + "learning_rate": 6.742325016855655e-07, + "loss": 0.0579, + "step": 29090 + }, + { + "epoch": 2.514147479372759, + "grad_norm": 0.6437761706113116, + "learning_rate": 6.718954124244386e-07, + "loss": 0.0576, + "step": 29100 + }, + { + "epoch": 2.515011447578729, + "grad_norm": 0.7021053853712458, + "learning_rate": 6.695620889290095e-07, + "loss": 0.058, + "step": 29110 + }, + { + "epoch": 2.515875415784699, + "grad_norm": 0.6825429370809767, + "learning_rate": 6.672325332294383e-07, + "loss": 0.0571, + "step": 29120 + }, + { + "epoch": 2.516739383990669, + "grad_norm": 0.679356433703693, + "learning_rate": 6.649067473526083e-07, + "loss": 0.0574, + "step": 29130 + }, + { + "epoch": 2.517603352196639, + "grad_norm": 0.658900017381107, + "learning_rate": 6.625847333221213e-07, + "loss": 0.0604, + "step": 29140 + }, + { + "epoch": 2.518467320402609, + "grad_norm": 0.6650829530549738, + "learning_rate": 6.602664931583008e-07, + "loss": 0.0577, + "step": 29150 + }, + { + "epoch": 2.519331288608579, + "grad_norm": 0.7048816158879708, + "learning_rate": 6.579520288781826e-07, + "loss": 0.059, + "step": 29160 + }, + { + "epoch": 2.5201952568145494, + "grad_norm": 0.6645875550709922, + "learning_rate": 6.556413424955188e-07, + "loss": 0.0594, + "step": 29170 + }, + { + "epoch": 2.5210592250205193, + "grad_norm": 0.6709488583655743, + "learning_rate": 6.533344360207744e-07, + "loss": 0.0603, + "step": 29180 + }, + { + "epoch": 2.521923193226489, + "grad_norm": 0.6671497082111582, + "learning_rate": 6.510313114611272e-07, + "loss": 0.0593, + "step": 29190 + }, + { + "epoch": 2.5227871614324595, + "grad_norm": 0.6281321494275784, + "learning_rate": 6.487319708204625e-07, + "loss": 0.0571, + "step": 29200 + }, + { + "epoch": 2.5236511296384294, + "grad_norm": 0.6805803298969804, + "learning_rate": 6.464364160993736e-07, + "loss": 0.0593, + "step": 29210 + }, + { + "epoch": 2.5245150978443993, + "grad_norm": 0.6850029853982884, + "learning_rate": 6.441446492951597e-07, + "loss": 0.06, + "step": 29220 + }, + { + "epoch": 2.525379066050369, + "grad_norm": 0.6418791460040815, + "learning_rate": 6.418566724018232e-07, + "loss": 0.0552, + "step": 29230 + }, + { + "epoch": 2.526243034256339, + "grad_norm": 0.6482480515608605, + "learning_rate": 6.39572487410075e-07, + "loss": 0.058, + "step": 29240 + }, + { + "epoch": 2.5271070024623095, + "grad_norm": 0.6396174623886076, + "learning_rate": 6.372920963073165e-07, + "loss": 0.0599, + "step": 29250 + }, + { + "epoch": 2.5279709706682794, + "grad_norm": 0.6809497391776808, + "learning_rate": 6.350155010776576e-07, + "loss": 0.0608, + "step": 29260 + }, + { + "epoch": 2.5288349388742493, + "grad_norm": 0.6779821207469485, + "learning_rate": 6.32742703701899e-07, + "loss": 0.0572, + "step": 29270 + }, + { + "epoch": 2.5296989070802196, + "grad_norm": 0.6653538492392551, + "learning_rate": 6.304737061575438e-07, + "loss": 0.0583, + "step": 29280 + }, + { + "epoch": 2.5305628752861895, + "grad_norm": 0.6416808234550777, + "learning_rate": 6.282085104187796e-07, + "loss": 0.0614, + "step": 29290 + }, + { + "epoch": 2.5314268434921594, + "grad_norm": 0.6550871279640734, + "learning_rate": 6.259471184564952e-07, + "loss": 0.0574, + "step": 29300 + }, + { + "epoch": 2.5322908116981298, + "grad_norm": 0.6707751927794339, + "learning_rate": 6.236895322382653e-07, + "loss": 0.0583, + "step": 29310 + }, + { + "epoch": 2.5331547799040997, + "grad_norm": 0.6483868684196846, + "learning_rate": 6.214357537283527e-07, + "loss": 0.0568, + "step": 29320 + }, + { + "epoch": 2.5340187481100696, + "grad_norm": 0.6548519923942192, + "learning_rate": 6.191857848877097e-07, + "loss": 0.0569, + "step": 29330 + }, + { + "epoch": 2.5348827163160395, + "grad_norm": 0.684998353287605, + "learning_rate": 6.16939627673972e-07, + "loss": 0.0592, + "step": 29340 + }, + { + "epoch": 2.5357466845220094, + "grad_norm": 0.6359323968029438, + "learning_rate": 6.146972840414623e-07, + "loss": 0.0598, + "step": 29350 + }, + { + "epoch": 2.5366106527279797, + "grad_norm": 0.6175361169902326, + "learning_rate": 6.124587559411782e-07, + "loss": 0.0557, + "step": 29360 + }, + { + "epoch": 2.5374746209339496, + "grad_norm": 0.6772844589228746, + "learning_rate": 6.102240453208052e-07, + "loss": 0.0605, + "step": 29370 + }, + { + "epoch": 2.5383385891399195, + "grad_norm": 0.6512332430117873, + "learning_rate": 6.07993154124702e-07, + "loss": 0.0605, + "step": 29380 + }, + { + "epoch": 2.53920255734589, + "grad_norm": 0.6989309110098257, + "learning_rate": 6.057660842939095e-07, + "loss": 0.0592, + "step": 29390 + }, + { + "epoch": 2.5400665255518597, + "grad_norm": 0.6773524899173674, + "learning_rate": 6.035428377661362e-07, + "loss": 0.0589, + "step": 29400 + }, + { + "epoch": 2.5409304937578296, + "grad_norm": 0.668998479726973, + "learning_rate": 6.013234164757709e-07, + "loss": 0.0591, + "step": 29410 + }, + { + "epoch": 2.5417944619638, + "grad_norm": 0.6703732268508873, + "learning_rate": 5.99107822353871e-07, + "loss": 0.057, + "step": 29420 + }, + { + "epoch": 2.54265843016977, + "grad_norm": 0.6635245821998534, + "learning_rate": 5.968960573281645e-07, + "loss": 0.0595, + "step": 29430 + }, + { + "epoch": 2.54352239837574, + "grad_norm": 0.6525266456918762, + "learning_rate": 5.946881233230473e-07, + "loss": 0.0564, + "step": 29440 + }, + { + "epoch": 2.5443863665817097, + "grad_norm": 0.687436696980093, + "learning_rate": 5.924840222595818e-07, + "loss": 0.0583, + "step": 29450 + }, + { + "epoch": 2.5452503347876796, + "grad_norm": 0.6639549603653401, + "learning_rate": 5.902837560554981e-07, + "loss": 0.062, + "step": 29460 + }, + { + "epoch": 2.54611430299365, + "grad_norm": 0.664926820131592, + "learning_rate": 5.880873266251869e-07, + "loss": 0.0595, + "step": 29470 + }, + { + "epoch": 2.54697827119962, + "grad_norm": 0.6871249087091793, + "learning_rate": 5.858947358797018e-07, + "loss": 0.0596, + "step": 29480 + }, + { + "epoch": 2.5478422394055897, + "grad_norm": 0.667792891284496, + "learning_rate": 5.837059857267546e-07, + "loss": 0.058, + "step": 29490 + }, + { + "epoch": 2.54870620761156, + "grad_norm": 0.6350186657516332, + "learning_rate": 5.815210780707192e-07, + "loss": 0.0599, + "step": 29500 + }, + { + "epoch": 2.54957017581753, + "grad_norm": 0.6961370444377433, + "learning_rate": 5.793400148126233e-07, + "loss": 0.0583, + "step": 29510 + }, + { + "epoch": 2.5504341440235, + "grad_norm": 0.6484114304314101, + "learning_rate": 5.77162797850151e-07, + "loss": 0.0579, + "step": 29520 + }, + { + "epoch": 2.5512981122294702, + "grad_norm": 0.6489574041467437, + "learning_rate": 5.749894290776381e-07, + "loss": 0.0581, + "step": 29530 + }, + { + "epoch": 2.55216208043544, + "grad_norm": 0.6704512119761233, + "learning_rate": 5.728199103860738e-07, + "loss": 0.0582, + "step": 29540 + }, + { + "epoch": 2.55302604864141, + "grad_norm": 0.6880307479956702, + "learning_rate": 5.70654243663099e-07, + "loss": 0.0568, + "step": 29550 + }, + { + "epoch": 2.55389001684738, + "grad_norm": 0.6705087029929916, + "learning_rate": 5.684924307929984e-07, + "loss": 0.0571, + "step": 29560 + }, + { + "epoch": 2.55475398505335, + "grad_norm": 0.6728745082079638, + "learning_rate": 5.663344736567083e-07, + "loss": 0.0558, + "step": 29570 + }, + { + "epoch": 2.55561795325932, + "grad_norm": 0.6830199354155625, + "learning_rate": 5.641803741318069e-07, + "loss": 0.0598, + "step": 29580 + }, + { + "epoch": 2.55648192146529, + "grad_norm": 0.6522985920938468, + "learning_rate": 5.620301340925199e-07, + "loss": 0.0551, + "step": 29590 + }, + { + "epoch": 2.55734588967126, + "grad_norm": 0.6888286792099857, + "learning_rate": 5.598837554097092e-07, + "loss": 0.0569, + "step": 29600 + }, + { + "epoch": 2.5582098578772303, + "grad_norm": 0.6438224709357185, + "learning_rate": 5.577412399508831e-07, + "loss": 0.0581, + "step": 29610 + }, + { + "epoch": 2.5590738260832, + "grad_norm": 0.7114816339042405, + "learning_rate": 5.556025895801847e-07, + "loss": 0.0572, + "step": 29620 + }, + { + "epoch": 2.55993779428917, + "grad_norm": 0.6307847616008638, + "learning_rate": 5.534678061583953e-07, + "loss": 0.0565, + "step": 29630 + }, + { + "epoch": 2.56080176249514, + "grad_norm": 0.665655599351334, + "learning_rate": 5.513368915429318e-07, + "loss": 0.0574, + "step": 29640 + }, + { + "epoch": 2.56166573070111, + "grad_norm": 0.6160826997111912, + "learning_rate": 5.492098475878432e-07, + "loss": 0.0559, + "step": 29650 + }, + { + "epoch": 2.5625296989070803, + "grad_norm": 0.6430126390669085, + "learning_rate": 5.470866761438165e-07, + "loss": 0.0571, + "step": 29660 + }, + { + "epoch": 2.56339366711305, + "grad_norm": 0.6612250849594098, + "learning_rate": 5.449673790581611e-07, + "loss": 0.0578, + "step": 29670 + }, + { + "epoch": 2.56425763531902, + "grad_norm": 0.6560418863856894, + "learning_rate": 5.428519581748215e-07, + "loss": 0.0563, + "step": 29680 + }, + { + "epoch": 2.5651216035249904, + "grad_norm": 0.6643504836285753, + "learning_rate": 5.40740415334367e-07, + "loss": 0.0564, + "step": 29690 + }, + { + "epoch": 2.5659855717309603, + "grad_norm": 0.6771315503647855, + "learning_rate": 5.386327523739954e-07, + "loss": 0.0576, + "step": 29700 + }, + { + "epoch": 2.56684953993693, + "grad_norm": 0.6363414935150137, + "learning_rate": 5.365289711275235e-07, + "loss": 0.0576, + "step": 29710 + }, + { + "epoch": 2.5677135081429006, + "grad_norm": 0.6531883331299517, + "learning_rate": 5.34429073425396e-07, + "loss": 0.0591, + "step": 29720 + }, + { + "epoch": 2.5685774763488705, + "grad_norm": 0.720546804487095, + "learning_rate": 5.323330610946769e-07, + "loss": 0.0568, + "step": 29730 + }, + { + "epoch": 2.5694414445548404, + "grad_norm": 0.6329942697006272, + "learning_rate": 5.302409359590483e-07, + "loss": 0.0587, + "step": 29740 + }, + { + "epoch": 2.5703054127608103, + "grad_norm": 0.6687880309557287, + "learning_rate": 5.281526998388115e-07, + "loss": 0.0602, + "step": 29750 + }, + { + "epoch": 2.57116938096678, + "grad_norm": 0.6776722689320893, + "learning_rate": 5.260683545508827e-07, + "loss": 0.0619, + "step": 29760 + }, + { + "epoch": 2.5720333491727505, + "grad_norm": 0.6712745107615856, + "learning_rate": 5.239879019087957e-07, + "loss": 0.0601, + "step": 29770 + }, + { + "epoch": 2.5728973173787204, + "grad_norm": 0.6886401833521799, + "learning_rate": 5.219113437226946e-07, + "loss": 0.0591, + "step": 29780 + }, + { + "epoch": 2.5737612855846903, + "grad_norm": 0.6579663523334925, + "learning_rate": 5.198386817993367e-07, + "loss": 0.0585, + "step": 29790 + }, + { + "epoch": 2.5746252537906607, + "grad_norm": 0.6409753322445424, + "learning_rate": 5.17769917942087e-07, + "loss": 0.0573, + "step": 29800 + }, + { + "epoch": 2.5754892219966306, + "grad_norm": 0.6611922702209809, + "learning_rate": 5.157050539509228e-07, + "loss": 0.059, + "step": 29810 + }, + { + "epoch": 2.5763531902026005, + "grad_norm": 0.6499977380134889, + "learning_rate": 5.136440916224245e-07, + "loss": 0.0572, + "step": 29820 + }, + { + "epoch": 2.577217158408571, + "grad_norm": 0.6872934986169129, + "learning_rate": 5.11587032749779e-07, + "loss": 0.056, + "step": 29830 + }, + { + "epoch": 2.5780811266145407, + "grad_norm": 0.6732910099436433, + "learning_rate": 5.095338791227783e-07, + "loss": 0.0585, + "step": 29840 + }, + { + "epoch": 2.5789450948205106, + "grad_norm": 0.6708757861303539, + "learning_rate": 5.074846325278127e-07, + "loss": 0.0587, + "step": 29850 + }, + { + "epoch": 2.5798090630264805, + "grad_norm": 0.6783891979386204, + "learning_rate": 5.054392947478798e-07, + "loss": 0.056, + "step": 29860 + }, + { + "epoch": 2.5806730312324504, + "grad_norm": 0.6931804578103389, + "learning_rate": 5.033978675625679e-07, + "loss": 0.0572, + "step": 29870 + }, + { + "epoch": 2.5815369994384207, + "grad_norm": 0.6481825220806289, + "learning_rate": 5.013603527480704e-07, + "loss": 0.0581, + "step": 29880 + }, + { + "epoch": 2.5824009676443906, + "grad_norm": 0.6817496187695342, + "learning_rate": 4.993267520771705e-07, + "loss": 0.0566, + "step": 29890 + }, + { + "epoch": 2.5832649358503605, + "grad_norm": 0.6841775081681513, + "learning_rate": 4.972970673192529e-07, + "loss": 0.0588, + "step": 29900 + }, + { + "epoch": 2.584128904056331, + "grad_norm": 0.6475734993713166, + "learning_rate": 4.952713002402859e-07, + "loss": 0.0588, + "step": 29910 + }, + { + "epoch": 2.584992872262301, + "grad_norm": 0.6662802149829226, + "learning_rate": 4.93249452602837e-07, + "loss": 0.0579, + "step": 29920 + }, + { + "epoch": 2.5858568404682707, + "grad_norm": 0.6365663998049235, + "learning_rate": 4.912315261660611e-07, + "loss": 0.058, + "step": 29930 + }, + { + "epoch": 2.586720808674241, + "grad_norm": 0.6486599808954889, + "learning_rate": 4.892175226856994e-07, + "loss": 0.0584, + "step": 29940 + }, + { + "epoch": 2.587584776880211, + "grad_norm": 0.6658531724786169, + "learning_rate": 4.872074439140817e-07, + "loss": 0.0576, + "step": 29950 + }, + { + "epoch": 2.588448745086181, + "grad_norm": 0.7116085843786459, + "learning_rate": 4.852012916001225e-07, + "loss": 0.0594, + "step": 29960 + }, + { + "epoch": 2.5893127132921507, + "grad_norm": 0.6671752461840659, + "learning_rate": 4.831990674893222e-07, + "loss": 0.0573, + "step": 29970 + }, + { + "epoch": 2.5901766814981206, + "grad_norm": 0.6356609240356549, + "learning_rate": 4.812007733237583e-07, + "loss": 0.0582, + "step": 29980 + }, + { + "epoch": 2.591040649704091, + "grad_norm": 0.6244169993393772, + "learning_rate": 4.792064108420941e-07, + "loss": 0.0557, + "step": 29990 + }, + { + "epoch": 2.591904617910061, + "grad_norm": 0.6540666502258284, + "learning_rate": 4.772159817795685e-07, + "loss": 0.0582, + "step": 30000 + }, + { + "epoch": 2.592768586116031, + "grad_norm": 0.6358928816445137, + "learning_rate": 4.752294878680025e-07, + "loss": 0.0576, + "step": 30010 + }, + { + "epoch": 2.593632554322001, + "grad_norm": 0.6572956423353731, + "learning_rate": 4.7324693083578563e-07, + "loss": 0.0584, + "step": 30020 + }, + { + "epoch": 2.594496522527971, + "grad_norm": 0.6604128875290302, + "learning_rate": 4.7126831240789097e-07, + "loss": 0.059, + "step": 30030 + }, + { + "epoch": 2.595360490733941, + "grad_norm": 0.656347890741661, + "learning_rate": 4.692936343058579e-07, + "loss": 0.0584, + "step": 30040 + }, + { + "epoch": 2.5962244589399113, + "grad_norm": 0.6991884383243728, + "learning_rate": 4.673228982478012e-07, + "loss": 0.0579, + "step": 30050 + }, + { + "epoch": 2.597088427145881, + "grad_norm": 0.6979127438966883, + "learning_rate": 4.653561059484035e-07, + "loss": 0.0582, + "step": 30060 + }, + { + "epoch": 2.597952395351851, + "grad_norm": 0.674144107741082, + "learning_rate": 4.633932591189172e-07, + "loss": 0.0581, + "step": 30070 + }, + { + "epoch": 2.598816363557821, + "grad_norm": 0.6871709263612947, + "learning_rate": 4.6143435946716276e-07, + "loss": 0.0614, + "step": 30080 + }, + { + "epoch": 2.599680331763791, + "grad_norm": 0.6587668438432055, + "learning_rate": 4.594794086975252e-07, + "loss": 0.0581, + "step": 30090 + }, + { + "epoch": 2.600544299969761, + "grad_norm": 0.6601269223981766, + "learning_rate": 4.575284085109527e-07, + "loss": 0.0566, + "step": 30100 + }, + { + "epoch": 2.601408268175731, + "grad_norm": 0.6767525995745622, + "learning_rate": 4.555813606049575e-07, + "loss": 0.0554, + "step": 30110 + }, + { + "epoch": 2.602272236381701, + "grad_norm": 0.6629565179971297, + "learning_rate": 4.5363826667361443e-07, + "loss": 0.0564, + "step": 30120 + }, + { + "epoch": 2.6031362045876714, + "grad_norm": 0.7046146642092987, + "learning_rate": 4.5169912840755505e-07, + "loss": 0.0588, + "step": 30130 + }, + { + "epoch": 2.6040001727936413, + "grad_norm": 0.6772719079025742, + "learning_rate": 4.4976394749397076e-07, + "loss": 0.0565, + "step": 30140 + }, + { + "epoch": 2.604864140999611, + "grad_norm": 0.6742492402219806, + "learning_rate": 4.478327256166104e-07, + "loss": 0.0551, + "step": 30150 + }, + { + "epoch": 2.6057281092055815, + "grad_norm": 0.670700818891851, + "learning_rate": 4.459054644557759e-07, + "loss": 0.0572, + "step": 30160 + }, + { + "epoch": 2.6065920774115514, + "grad_norm": 0.6811115544062041, + "learning_rate": 4.43982165688327e-07, + "loss": 0.0583, + "step": 30170 + }, + { + "epoch": 2.6074560456175213, + "grad_norm": 0.6771983529090304, + "learning_rate": 4.4206283098767067e-07, + "loss": 0.0569, + "step": 30180 + }, + { + "epoch": 2.608320013823491, + "grad_norm": 0.7017740502312722, + "learning_rate": 4.4014746202377e-07, + "loss": 0.0585, + "step": 30190 + }, + { + "epoch": 2.609183982029461, + "grad_norm": 0.6755203458518678, + "learning_rate": 4.3823606046313415e-07, + "loss": 0.0567, + "step": 30200 + }, + { + "epoch": 2.6100479502354315, + "grad_norm": 0.6772379176657675, + "learning_rate": 4.363286279688217e-07, + "loss": 0.0582, + "step": 30210 + }, + { + "epoch": 2.6109119184414014, + "grad_norm": 0.6695431410155791, + "learning_rate": 4.3442516620043674e-07, + "loss": 0.0597, + "step": 30220 + }, + { + "epoch": 2.6117758866473713, + "grad_norm": 0.6567571285238, + "learning_rate": 4.325256768141312e-07, + "loss": 0.0574, + "step": 30230 + }, + { + "epoch": 2.6126398548533416, + "grad_norm": 0.6599745824932721, + "learning_rate": 4.306301614625979e-07, + "loss": 0.057, + "step": 30240 + }, + { + "epoch": 2.6135038230593115, + "grad_norm": 0.6378082167289888, + "learning_rate": 4.287386217950734e-07, + "loss": 0.0569, + "step": 30250 + }, + { + "epoch": 2.6143677912652814, + "grad_norm": 0.6809435093701421, + "learning_rate": 4.268510594573344e-07, + "loss": 0.0572, + "step": 30260 + }, + { + "epoch": 2.6152317594712513, + "grad_norm": 0.6741354796855408, + "learning_rate": 4.249674760916961e-07, + "loss": 0.057, + "step": 30270 + }, + { + "epoch": 2.616095727677221, + "grad_norm": 0.6663299971055163, + "learning_rate": 4.2308787333701697e-07, + "loss": 0.0573, + "step": 30280 + }, + { + "epoch": 2.6169596958831916, + "grad_norm": 0.6923628741436045, + "learning_rate": 4.2121225282868273e-07, + "loss": 0.0575, + "step": 30290 + }, + { + "epoch": 2.6178236640891615, + "grad_norm": 0.6422042356059123, + "learning_rate": 4.193406161986241e-07, + "loss": 0.0567, + "step": 30300 + }, + { + "epoch": 2.6186876322951314, + "grad_norm": 0.6971541286553281, + "learning_rate": 4.174729650752979e-07, + "loss": 0.059, + "step": 30310 + }, + { + "epoch": 2.6195516005011017, + "grad_norm": 0.668983792614538, + "learning_rate": 4.1560930108369925e-07, + "loss": 0.0575, + "step": 30320 + }, + { + "epoch": 2.6204155687070716, + "grad_norm": 0.6783496859385131, + "learning_rate": 4.1374962584534886e-07, + "loss": 0.0582, + "step": 30330 + }, + { + "epoch": 2.6212795369130415, + "grad_norm": 0.6566315043249467, + "learning_rate": 4.1189394097830073e-07, + "loss": 0.0587, + "step": 30340 + }, + { + "epoch": 2.622143505119012, + "grad_norm": 0.6665224926516164, + "learning_rate": 4.1004224809713497e-07, + "loss": 0.0569, + "step": 30350 + }, + { + "epoch": 2.6230074733249817, + "grad_norm": 0.6610369842339688, + "learning_rate": 4.081945488129602e-07, + "loss": 0.0566, + "step": 30360 + }, + { + "epoch": 2.6238714415309516, + "grad_norm": 0.6659586041314874, + "learning_rate": 4.06350844733408e-07, + "loss": 0.0584, + "step": 30370 + }, + { + "epoch": 2.6247354097369215, + "grad_norm": 0.6998362957468676, + "learning_rate": 4.0451113746263426e-07, + "loss": 0.0572, + "step": 30380 + }, + { + "epoch": 2.6255993779428914, + "grad_norm": 0.6629821943664789, + "learning_rate": 4.0267542860132017e-07, + "loss": 0.0604, + "step": 30390 + }, + { + "epoch": 2.626463346148862, + "grad_norm": 0.6851188724926308, + "learning_rate": 4.008437197466647e-07, + "loss": 0.0576, + "step": 30400 + }, + { + "epoch": 2.6273273143548317, + "grad_norm": 0.6358552887813863, + "learning_rate": 3.990160124923875e-07, + "loss": 0.0591, + "step": 30410 + }, + { + "epoch": 2.6281912825608016, + "grad_norm": 0.6822769123702047, + "learning_rate": 3.9719230842872714e-07, + "loss": 0.058, + "step": 30420 + }, + { + "epoch": 2.629055250766772, + "grad_norm": 0.6760968470919131, + "learning_rate": 3.9537260914243924e-07, + "loss": 0.058, + "step": 30430 + }, + { + "epoch": 2.629919218972742, + "grad_norm": 0.679594017303257, + "learning_rate": 3.9355691621679403e-07, + "loss": 0.059, + "step": 30440 + }, + { + "epoch": 2.6307831871787117, + "grad_norm": 0.6778234438718367, + "learning_rate": 3.9174523123157617e-07, + "loss": 0.0579, + "step": 30450 + }, + { + "epoch": 2.631647155384682, + "grad_norm": 0.7006233738319552, + "learning_rate": 3.8993755576308413e-07, + "loss": 0.0608, + "step": 30460 + }, + { + "epoch": 2.632511123590652, + "grad_norm": 0.6485879407092817, + "learning_rate": 3.8813389138412595e-07, + "loss": 0.0551, + "step": 30470 + }, + { + "epoch": 2.633375091796622, + "grad_norm": 0.688314346867616, + "learning_rate": 3.863342396640213e-07, + "loss": 0.0578, + "step": 30480 + }, + { + "epoch": 2.634239060002592, + "grad_norm": 0.6914084623518867, + "learning_rate": 3.845386021685971e-07, + "loss": 0.0587, + "step": 30490 + }, + { + "epoch": 2.6351030282085617, + "grad_norm": 0.686323558943809, + "learning_rate": 3.827469804601908e-07, + "loss": 0.0569, + "step": 30500 + }, + { + "epoch": 2.635966996414532, + "grad_norm": 0.6703731117375387, + "learning_rate": 3.8095937609764157e-07, + "loss": 0.0581, + "step": 30510 + }, + { + "epoch": 2.636830964620502, + "grad_norm": 0.6690328063750695, + "learning_rate": 3.791757906362958e-07, + "loss": 0.0572, + "step": 30520 + }, + { + "epoch": 2.637694932826472, + "grad_norm": 0.6455426316238811, + "learning_rate": 3.7739622562800224e-07, + "loss": 0.0565, + "step": 30530 + }, + { + "epoch": 2.638558901032442, + "grad_norm": 0.6888587274112533, + "learning_rate": 3.7562068262111286e-07, + "loss": 0.0581, + "step": 30540 + }, + { + "epoch": 2.639422869238412, + "grad_norm": 0.6825385826605149, + "learning_rate": 3.738491631604779e-07, + "loss": 0.0598, + "step": 30550 + }, + { + "epoch": 2.640286837444382, + "grad_norm": 0.690570583937555, + "learning_rate": 3.72081668787449e-07, + "loss": 0.0562, + "step": 30560 + }, + { + "epoch": 2.6411508056503523, + "grad_norm": 0.6766329158244576, + "learning_rate": 3.703182010398748e-07, + "loss": 0.058, + "step": 30570 + }, + { + "epoch": 2.642014773856322, + "grad_norm": 0.67715272342871, + "learning_rate": 3.6855876145209914e-07, + "loss": 0.0575, + "step": 30580 + }, + { + "epoch": 2.642878742062292, + "grad_norm": 0.6789455137223994, + "learning_rate": 3.668033515549646e-07, + "loss": 0.0574, + "step": 30590 + }, + { + "epoch": 2.643742710268262, + "grad_norm": 0.6383316891492063, + "learning_rate": 3.6505197287580285e-07, + "loss": 0.0582, + "step": 30600 + }, + { + "epoch": 2.644606678474232, + "grad_norm": 0.6629664084374244, + "learning_rate": 3.633046269384427e-07, + "loss": 0.0585, + "step": 30610 + }, + { + "epoch": 2.6454706466802023, + "grad_norm": 0.6913305154208267, + "learning_rate": 3.615613152632008e-07, + "loss": 0.0585, + "step": 30620 + }, + { + "epoch": 2.646334614886172, + "grad_norm": 0.6718079255061178, + "learning_rate": 3.5982203936688776e-07, + "loss": 0.0572, + "step": 30630 + }, + { + "epoch": 2.647198583092142, + "grad_norm": 0.6686806554991838, + "learning_rate": 3.5808680076279577e-07, + "loss": 0.0553, + "step": 30640 + }, + { + "epoch": 2.6480625512981124, + "grad_norm": 0.6736141930150831, + "learning_rate": 3.563556009607122e-07, + "loss": 0.0593, + "step": 30650 + }, + { + "epoch": 2.6489265195040823, + "grad_norm": 0.6240399939775275, + "learning_rate": 3.546284414669055e-07, + "loss": 0.0564, + "step": 30660 + }, + { + "epoch": 2.649790487710052, + "grad_norm": 0.6560981438220517, + "learning_rate": 3.5290532378412956e-07, + "loss": 0.0565, + "step": 30670 + }, + { + "epoch": 2.6506544559160226, + "grad_norm": 0.6680071817104607, + "learning_rate": 3.5118624941162303e-07, + "loss": 0.0566, + "step": 30680 + }, + { + "epoch": 2.6515184241219925, + "grad_norm": 0.6625073324317056, + "learning_rate": 3.4947121984510393e-07, + "loss": 0.0581, + "step": 30690 + }, + { + "epoch": 2.6523823923279624, + "grad_norm": 0.6807614072790078, + "learning_rate": 3.477602365767746e-07, + "loss": 0.0559, + "step": 30700 + }, + { + "epoch": 2.6532463605339323, + "grad_norm": 0.6908495010112696, + "learning_rate": 3.460533010953138e-07, + "loss": 0.0585, + "step": 30710 + }, + { + "epoch": 2.654110328739902, + "grad_norm": 0.6892337877997854, + "learning_rate": 3.4435041488588063e-07, + "loss": 0.0581, + "step": 30720 + }, + { + "epoch": 2.6549742969458725, + "grad_norm": 0.6433255896995709, + "learning_rate": 3.4265157943010774e-07, + "loss": 0.0557, + "step": 30730 + }, + { + "epoch": 2.6558382651518424, + "grad_norm": 0.6486141164989712, + "learning_rate": 3.4095679620610834e-07, + "loss": 0.0585, + "step": 30740 + }, + { + "epoch": 2.6567022333578123, + "grad_norm": 0.6826727156921892, + "learning_rate": 3.3926606668846395e-07, + "loss": 0.058, + "step": 30750 + }, + { + "epoch": 2.6575662015637826, + "grad_norm": 0.658775821207378, + "learning_rate": 3.375793923482351e-07, + "loss": 0.0569, + "step": 30760 + }, + { + "epoch": 2.6584301697697525, + "grad_norm": 0.7123926912059468, + "learning_rate": 3.3589677465294957e-07, + "loss": 0.0544, + "step": 30770 + }, + { + "epoch": 2.6592941379757224, + "grad_norm": 0.693975415540836, + "learning_rate": 3.3421821506660736e-07, + "loss": 0.0586, + "step": 30780 + }, + { + "epoch": 2.6601581061816924, + "grad_norm": 0.66409621627837, + "learning_rate": 3.3254371504967744e-07, + "loss": 0.057, + "step": 30790 + }, + { + "epoch": 2.6610220743876627, + "grad_norm": 0.6408974196362415, + "learning_rate": 3.3087327605909603e-07, + "loss": 0.058, + "step": 30800 + }, + { + "epoch": 2.6618860425936326, + "grad_norm": 0.6548935799283724, + "learning_rate": 3.292068995482672e-07, + "loss": 0.0583, + "step": 30810 + }, + { + "epoch": 2.6627500107996025, + "grad_norm": 0.6114378558059209, + "learning_rate": 3.2754458696705957e-07, + "loss": 0.0583, + "step": 30820 + }, + { + "epoch": 2.6636139790055724, + "grad_norm": 0.6532677500781641, + "learning_rate": 3.2588633976180616e-07, + "loss": 0.0565, + "step": 30830 + }, + { + "epoch": 2.6644779472115427, + "grad_norm": 0.6621994992284654, + "learning_rate": 3.242321593753017e-07, + "loss": 0.0578, + "step": 30840 + }, + { + "epoch": 2.6653419154175126, + "grad_norm": 0.6659908441376852, + "learning_rate": 3.225820472468044e-07, + "loss": 0.0548, + "step": 30850 + }, + { + "epoch": 2.6662058836234825, + "grad_norm": 0.6884289390924854, + "learning_rate": 3.2093600481203135e-07, + "loss": 0.0566, + "step": 30860 + }, + { + "epoch": 2.667069851829453, + "grad_norm": 0.662687602293797, + "learning_rate": 3.1929403350315913e-07, + "loss": 0.0578, + "step": 30870 + }, + { + "epoch": 2.667933820035423, + "grad_norm": 0.6549345644376964, + "learning_rate": 3.176561347488227e-07, + "loss": 0.0571, + "step": 30880 + }, + { + "epoch": 2.6687977882413927, + "grad_norm": 0.6848731424249099, + "learning_rate": 3.1602230997411166e-07, + "loss": 0.0591, + "step": 30890 + }, + { + "epoch": 2.6696617564473626, + "grad_norm": 0.6238420529746252, + "learning_rate": 3.1439256060057486e-07, + "loss": 0.0594, + "step": 30900 + }, + { + "epoch": 2.6705257246533325, + "grad_norm": 0.6593033839201444, + "learning_rate": 3.1276688804620983e-07, + "loss": 0.058, + "step": 30910 + }, + { + "epoch": 2.671389692859303, + "grad_norm": 0.6324222514200223, + "learning_rate": 3.111452937254722e-07, + "loss": 0.0555, + "step": 30920 + }, + { + "epoch": 2.6722536610652727, + "grad_norm": 0.6712429491334161, + "learning_rate": 3.095277790492646e-07, + "loss": 0.059, + "step": 30930 + }, + { + "epoch": 2.6731176292712426, + "grad_norm": 0.6485249395528295, + "learning_rate": 3.0791434542494615e-07, + "loss": 0.057, + "step": 30940 + }, + { + "epoch": 2.673981597477213, + "grad_norm": 0.6944594330088194, + "learning_rate": 3.063049942563173e-07, + "loss": 0.0562, + "step": 30950 + }, + { + "epoch": 2.674845565683183, + "grad_norm": 0.6280925503430318, + "learning_rate": 3.04699726943633e-07, + "loss": 0.056, + "step": 30960 + }, + { + "epoch": 2.675709533889153, + "grad_norm": 0.7066752206732005, + "learning_rate": 3.0309854488359213e-07, + "loss": 0.0578, + "step": 30970 + }, + { + "epoch": 2.676573502095123, + "grad_norm": 0.6903537649768017, + "learning_rate": 3.015014494693391e-07, + "loss": 0.0554, + "step": 30980 + }, + { + "epoch": 2.677437470301093, + "grad_norm": 0.6860432662236696, + "learning_rate": 2.9990844209046355e-07, + "loss": 0.057, + "step": 30990 + }, + { + "epoch": 2.678301438507063, + "grad_norm": 0.6777716875811373, + "learning_rate": 2.983195241329967e-07, + "loss": 0.0569, + "step": 31000 + }, + { + "epoch": 2.679165406713033, + "grad_norm": 0.6624799407879325, + "learning_rate": 2.967346969794138e-07, + "loss": 0.0558, + "step": 31010 + }, + { + "epoch": 2.6800293749190027, + "grad_norm": 0.6752161383911821, + "learning_rate": 2.9515396200862964e-07, + "loss": 0.0567, + "step": 31020 + }, + { + "epoch": 2.680893343124973, + "grad_norm": 0.6767628777851237, + "learning_rate": 2.935773205959985e-07, + "loss": 0.0583, + "step": 31030 + }, + { + "epoch": 2.681757311330943, + "grad_norm": 0.6467792303059987, + "learning_rate": 2.9200477411331197e-07, + "loss": 0.0576, + "step": 31040 + }, + { + "epoch": 2.682621279536913, + "grad_norm": 0.6338846148774004, + "learning_rate": 2.904363239288022e-07, + "loss": 0.0557, + "step": 31050 + }, + { + "epoch": 2.683485247742883, + "grad_norm": 0.6403686317193569, + "learning_rate": 2.8887197140713144e-07, + "loss": 0.0581, + "step": 31060 + }, + { + "epoch": 2.684349215948853, + "grad_norm": 0.6463691594607338, + "learning_rate": 2.873117179094037e-07, + "loss": 0.0608, + "step": 31070 + }, + { + "epoch": 2.685213184154823, + "grad_norm": 0.6908061227463191, + "learning_rate": 2.857555647931509e-07, + "loss": 0.0581, + "step": 31080 + }, + { + "epoch": 2.6860771523607934, + "grad_norm": 0.6817739973491456, + "learning_rate": 2.8420351341234e-07, + "loss": 0.0579, + "step": 31090 + }, + { + "epoch": 2.6869411205667633, + "grad_norm": 0.7034592398074475, + "learning_rate": 2.8265556511736846e-07, + "loss": 0.0578, + "step": 31100 + }, + { + "epoch": 2.687805088772733, + "grad_norm": 0.7360862346410634, + "learning_rate": 2.811117212550629e-07, + "loss": 0.0572, + "step": 31110 + }, + { + "epoch": 2.688669056978703, + "grad_norm": 0.6841861146827901, + "learning_rate": 2.7957198316868164e-07, + "loss": 0.0592, + "step": 31120 + }, + { + "epoch": 2.689533025184673, + "grad_norm": 0.6698510034953965, + "learning_rate": 2.7803635219790736e-07, + "loss": 0.0585, + "step": 31130 + }, + { + "epoch": 2.6903969933906433, + "grad_norm": 0.6243962845040457, + "learning_rate": 2.765048296788514e-07, + "loss": 0.0563, + "step": 31140 + }, + { + "epoch": 2.691260961596613, + "grad_norm": 0.6725898416366625, + "learning_rate": 2.749774169440489e-07, + "loss": 0.0578, + "step": 31150 + }, + { + "epoch": 2.692124929802583, + "grad_norm": 0.6513848193495176, + "learning_rate": 2.7345411532246126e-07, + "loss": 0.0585, + "step": 31160 + }, + { + "epoch": 2.6929888980085535, + "grad_norm": 0.6539524109515839, + "learning_rate": 2.719349261394705e-07, + "loss": 0.0596, + "step": 31170 + }, + { + "epoch": 2.6938528662145234, + "grad_norm": 0.6501080679911405, + "learning_rate": 2.7041985071688257e-07, + "loss": 0.0562, + "step": 31180 + }, + { + "epoch": 2.6947168344204933, + "grad_norm": 0.680056582828761, + "learning_rate": 2.6890889037292255e-07, + "loss": 0.0589, + "step": 31190 + }, + { + "epoch": 2.6955808026264636, + "grad_norm": 0.6577180730454457, + "learning_rate": 2.674020464222349e-07, + "loss": 0.0573, + "step": 31200 + }, + { + "epoch": 2.6964447708324335, + "grad_norm": 0.6778652955187711, + "learning_rate": 2.658993201758864e-07, + "loss": 0.0557, + "step": 31210 + }, + { + "epoch": 2.6973087390384034, + "grad_norm": 0.6401257423410647, + "learning_rate": 2.6440071294135504e-07, + "loss": 0.0572, + "step": 31220 + }, + { + "epoch": 2.6981727072443733, + "grad_norm": 0.6664094217724789, + "learning_rate": 2.629062260225396e-07, + "loss": 0.0564, + "step": 31230 + }, + { + "epoch": 2.699036675450343, + "grad_norm": 0.7071794832969107, + "learning_rate": 2.6141586071975146e-07, + "loss": 0.0577, + "step": 31240 + }, + { + "epoch": 2.6999006436563135, + "grad_norm": 0.6731307049372357, + "learning_rate": 2.5992961832971897e-07, + "loss": 0.0587, + "step": 31250 + }, + { + "epoch": 2.7007646118622834, + "grad_norm": 0.7126246677195216, + "learning_rate": 2.584475001455783e-07, + "loss": 0.0567, + "step": 31260 + }, + { + "epoch": 2.7016285800682533, + "grad_norm": 0.6443499323094868, + "learning_rate": 2.5696950745688175e-07, + "loss": 0.0607, + "step": 31270 + }, + { + "epoch": 2.7024925482742237, + "grad_norm": 0.6613790530887171, + "learning_rate": 2.554956415495902e-07, + "loss": 0.056, + "step": 31280 + }, + { + "epoch": 2.7033565164801936, + "grad_norm": 0.6526390569684382, + "learning_rate": 2.540259037060738e-07, + "loss": 0.0558, + "step": 31290 + }, + { + "epoch": 2.7042204846861635, + "grad_norm": 0.6491466055505781, + "learning_rate": 2.525602952051115e-07, + "loss": 0.0565, + "step": 31300 + }, + { + "epoch": 2.705084452892134, + "grad_norm": 0.6477798234747194, + "learning_rate": 2.510988173218881e-07, + "loss": 0.0566, + "step": 31310 + }, + { + "epoch": 2.7059484210981037, + "grad_norm": 0.678937957222222, + "learning_rate": 2.496414713279988e-07, + "loss": 0.0594, + "step": 31320 + }, + { + "epoch": 2.7068123893040736, + "grad_norm": 0.6481312566936305, + "learning_rate": 2.481882584914369e-07, + "loss": 0.0559, + "step": 31330 + }, + { + "epoch": 2.7076763575100435, + "grad_norm": 0.6400669772772863, + "learning_rate": 2.467391800766056e-07, + "loss": 0.055, + "step": 31340 + }, + { + "epoch": 2.7085403257160134, + "grad_norm": 0.6899428068381959, + "learning_rate": 2.452942373443068e-07, + "loss": 0.0598, + "step": 31350 + }, + { + "epoch": 2.709404293921984, + "grad_norm": 0.7131334900456067, + "learning_rate": 2.438534315517482e-07, + "loss": 0.0579, + "step": 31360 + }, + { + "epoch": 2.7102682621279537, + "grad_norm": 0.6774543991643118, + "learning_rate": 2.424167639525327e-07, + "loss": 0.0574, + "step": 31370 + }, + { + "epoch": 2.7111322303339236, + "grad_norm": 0.6601832620044917, + "learning_rate": 2.40984235796668e-07, + "loss": 0.0566, + "step": 31380 + }, + { + "epoch": 2.711996198539894, + "grad_norm": 0.6510071598504059, + "learning_rate": 2.3955584833055655e-07, + "loss": 0.0571, + "step": 31390 + }, + { + "epoch": 2.712860166745864, + "grad_norm": 0.686821661165841, + "learning_rate": 2.381316027969993e-07, + "loss": 0.0583, + "step": 31400 + }, + { + "epoch": 2.7137241349518337, + "grad_norm": 0.6850970538518514, + "learning_rate": 2.3671150043519388e-07, + "loss": 0.0573, + "step": 31410 + }, + { + "epoch": 2.7145881031578036, + "grad_norm": 0.6976938553504183, + "learning_rate": 2.3529554248073127e-07, + "loss": 0.0571, + "step": 31420 + }, + { + "epoch": 2.715452071363774, + "grad_norm": 0.6707899675092523, + "learning_rate": 2.3388373016559995e-07, + "loss": 0.054, + "step": 31430 + }, + { + "epoch": 2.716316039569744, + "grad_norm": 0.6525033797859318, + "learning_rate": 2.324760647181784e-07, + "loss": 0.0563, + "step": 31440 + }, + { + "epoch": 2.717180007775714, + "grad_norm": 0.6712410318989885, + "learning_rate": 2.3107254736323747e-07, + "loss": 0.0584, + "step": 31450 + }, + { + "epoch": 2.7180439759816837, + "grad_norm": 0.6489822837447691, + "learning_rate": 2.2967317932193868e-07, + "loss": 0.0568, + "step": 31460 + }, + { + "epoch": 2.718907944187654, + "grad_norm": 0.6943226651728736, + "learning_rate": 2.282779618118358e-07, + "loss": 0.0566, + "step": 31470 + }, + { + "epoch": 2.719771912393624, + "grad_norm": 0.7089036129137106, + "learning_rate": 2.2688689604686899e-07, + "loss": 0.0576, + "step": 31480 + }, + { + "epoch": 2.720635880599594, + "grad_norm": 0.6762859711461554, + "learning_rate": 2.254999832373661e-07, + "loss": 0.0571, + "step": 31490 + }, + { + "epoch": 2.721499848805564, + "grad_norm": 0.6544346615382407, + "learning_rate": 2.2411722459004192e-07, + "loss": 0.0566, + "step": 31500 + }, + { + "epoch": 2.722363817011534, + "grad_norm": 0.668302454393643, + "learning_rate": 2.2273862130799684e-07, + "loss": 0.0569, + "step": 31510 + }, + { + "epoch": 2.723227785217504, + "grad_norm": 0.6942525287151675, + "learning_rate": 2.2136417459071802e-07, + "loss": 0.0566, + "step": 31520 + }, + { + "epoch": 2.724091753423474, + "grad_norm": 0.6509997629051298, + "learning_rate": 2.199938856340711e-07, + "loss": 0.0574, + "step": 31530 + }, + { + "epoch": 2.7249557216294438, + "grad_norm": 0.6723635409838065, + "learning_rate": 2.1862775563030903e-07, + "loss": 0.0596, + "step": 31540 + }, + { + "epoch": 2.725819689835414, + "grad_norm": 0.670764933007711, + "learning_rate": 2.1726578576806324e-07, + "loss": 0.0551, + "step": 31550 + }, + { + "epoch": 2.726683658041384, + "grad_norm": 0.6629743678908288, + "learning_rate": 2.1590797723234802e-07, + "loss": 0.0553, + "step": 31560 + }, + { + "epoch": 2.727547626247354, + "grad_norm": 0.6327215531773591, + "learning_rate": 2.145543312045534e-07, + "loss": 0.0566, + "step": 31570 + }, + { + "epoch": 2.7284115944533243, + "grad_norm": 0.6781526282634827, + "learning_rate": 2.1320484886245163e-07, + "loss": 0.0585, + "step": 31580 + }, + { + "epoch": 2.729275562659294, + "grad_norm": 0.6622262488922581, + "learning_rate": 2.1185953138019022e-07, + "loss": 0.0541, + "step": 31590 + }, + { + "epoch": 2.730139530865264, + "grad_norm": 0.667475826799447, + "learning_rate": 2.1051837992829226e-07, + "loss": 0.0548, + "step": 31600 + }, + { + "epoch": 2.7310034990712344, + "grad_norm": 0.657088121258104, + "learning_rate": 2.091813956736577e-07, + "loss": 0.0568, + "step": 31610 + }, + { + "epoch": 2.7318674672772043, + "grad_norm": 0.6498889038053293, + "learning_rate": 2.0784857977955931e-07, + "loss": 0.057, + "step": 31620 + }, + { + "epoch": 2.732731435483174, + "grad_norm": 0.6733409297948825, + "learning_rate": 2.0651993340564614e-07, + "loss": 0.0584, + "step": 31630 + }, + { + "epoch": 2.733595403689144, + "grad_norm": 0.6852886398578109, + "learning_rate": 2.0519545770793348e-07, + "loss": 0.0574, + "step": 31640 + }, + { + "epoch": 2.734459371895114, + "grad_norm": 0.6792628939751382, + "learning_rate": 2.0387515383881452e-07, + "loss": 0.0572, + "step": 31650 + }, + { + "epoch": 2.7353233401010844, + "grad_norm": 0.6599160690463196, + "learning_rate": 2.025590229470481e-07, + "loss": 0.0566, + "step": 31660 + }, + { + "epoch": 2.7361873083070543, + "grad_norm": 0.629698984875205, + "learning_rate": 2.0124706617776546e-07, + "loss": 0.0568, + "step": 31670 + }, + { + "epoch": 2.737051276513024, + "grad_norm": 0.6599918753484036, + "learning_rate": 1.999392846724618e-07, + "loss": 0.0568, + "step": 31680 + }, + { + "epoch": 2.7379152447189945, + "grad_norm": 0.6543758042897919, + "learning_rate": 1.986356795690042e-07, + "loss": 0.0556, + "step": 31690 + }, + { + "epoch": 2.7387792129249644, + "grad_norm": 0.6784143602920379, + "learning_rate": 1.973362520016231e-07, + "loss": 0.0582, + "step": 31700 + }, + { + "epoch": 2.7396431811309343, + "grad_norm": 0.6903678628469099, + "learning_rate": 1.9604100310091523e-07, + "loss": 0.0581, + "step": 31710 + }, + { + "epoch": 2.7405071493369046, + "grad_norm": 0.6463342523156203, + "learning_rate": 1.947499339938408e-07, + "loss": 0.0567, + "step": 31720 + }, + { + "epoch": 2.7413711175428745, + "grad_norm": 0.7025200315096658, + "learning_rate": 1.93463045803724e-07, + "loss": 0.0577, + "step": 31730 + }, + { + "epoch": 2.7422350857488444, + "grad_norm": 0.6884195548334803, + "learning_rate": 1.9218033965025195e-07, + "loss": 0.0578, + "step": 31740 + }, + { + "epoch": 2.7430990539548143, + "grad_norm": 0.6444704425358405, + "learning_rate": 1.9090181664947137e-07, + "loss": 0.0575, + "step": 31750 + }, + { + "epoch": 2.7439630221607842, + "grad_norm": 0.6725158461214576, + "learning_rate": 1.8962747791379023e-07, + "loss": 0.0572, + "step": 31760 + }, + { + "epoch": 2.7448269903667546, + "grad_norm": 0.6760315296837612, + "learning_rate": 1.8835732455197596e-07, + "loss": 0.0571, + "step": 31770 + }, + { + "epoch": 2.7456909585727245, + "grad_norm": 0.6515448653414146, + "learning_rate": 1.870913576691552e-07, + "loss": 0.0574, + "step": 31780 + }, + { + "epoch": 2.7465549267786944, + "grad_norm": 0.702263272691988, + "learning_rate": 1.8582957836681015e-07, + "loss": 0.0576, + "step": 31790 + }, + { + "epoch": 2.7474188949846647, + "grad_norm": 0.674403943690754, + "learning_rate": 1.845719877427815e-07, + "loss": 0.0567, + "step": 31800 + }, + { + "epoch": 2.7482828631906346, + "grad_norm": 0.6919511526668807, + "learning_rate": 1.8331858689126348e-07, + "loss": 0.0577, + "step": 31810 + }, + { + "epoch": 2.7491468313966045, + "grad_norm": 0.669158032499984, + "learning_rate": 1.820693769028059e-07, + "loss": 0.057, + "step": 31820 + }, + { + "epoch": 2.750010799602575, + "grad_norm": 0.6565133834585644, + "learning_rate": 1.8082435886431383e-07, + "loss": 0.0549, + "step": 31830 + }, + { + "epoch": 2.750874767808545, + "grad_norm": 0.6700558974120085, + "learning_rate": 1.7958353385904126e-07, + "loss": 0.0581, + "step": 31840 + }, + { + "epoch": 2.7517387360145147, + "grad_norm": 0.6799608539206533, + "learning_rate": 1.7834690296659852e-07, + "loss": 0.0588, + "step": 31850 + }, + { + "epoch": 2.7526027042204846, + "grad_norm": 0.6342124927694704, + "learning_rate": 1.7711446726294267e-07, + "loss": 0.0586, + "step": 31860 + }, + { + "epoch": 2.7534666724264545, + "grad_norm": 0.6311819082310265, + "learning_rate": 1.7588622782038378e-07, + "loss": 0.0562, + "step": 31870 + }, + { + "epoch": 2.754330640632425, + "grad_norm": 0.674920565799908, + "learning_rate": 1.7466218570757754e-07, + "loss": 0.0595, + "step": 31880 + }, + { + "epoch": 2.7551946088383947, + "grad_norm": 0.6635343214254527, + "learning_rate": 1.7344234198953146e-07, + "loss": 0.0574, + "step": 31890 + }, + { + "epoch": 2.7560585770443646, + "grad_norm": 0.6525627692019093, + "learning_rate": 1.7222669772759716e-07, + "loss": 0.0547, + "step": 31900 + }, + { + "epoch": 2.756922545250335, + "grad_norm": 0.6896387409372627, + "learning_rate": 1.7101525397947406e-07, + "loss": 0.0579, + "step": 31910 + }, + { + "epoch": 2.757786513456305, + "grad_norm": 0.6720002525440155, + "learning_rate": 1.6980801179920681e-07, + "loss": 0.0573, + "step": 31920 + }, + { + "epoch": 2.758650481662275, + "grad_norm": 0.6655959204666657, + "learning_rate": 1.6860497223718188e-07, + "loss": 0.056, + "step": 31930 + }, + { + "epoch": 2.759514449868245, + "grad_norm": 0.6723784296940033, + "learning_rate": 1.6740613634013413e-07, + "loss": 0.0564, + "step": 31940 + }, + { + "epoch": 2.760378418074215, + "grad_norm": 0.6816159314364688, + "learning_rate": 1.6621150515113538e-07, + "loss": 0.0548, + "step": 31950 + }, + { + "epoch": 2.761242386280185, + "grad_norm": 0.6597189697816395, + "learning_rate": 1.6502107970960246e-07, + "loss": 0.0581, + "step": 31960 + }, + { + "epoch": 2.762106354486155, + "grad_norm": 0.7127449095465125, + "learning_rate": 1.6383486105129243e-07, + "loss": 0.0564, + "step": 31970 + }, + { + "epoch": 2.7629703226921247, + "grad_norm": 0.6623458323343064, + "learning_rate": 1.6265285020830245e-07, + "loss": 0.0588, + "step": 31980 + }, + { + "epoch": 2.763834290898095, + "grad_norm": 0.6524127223074473, + "learning_rate": 1.6147504820906546e-07, + "loss": 0.0568, + "step": 31990 + }, + { + "epoch": 2.764698259104065, + "grad_norm": 0.6452682493379834, + "learning_rate": 1.603014560783578e-07, + "loss": 0.0574, + "step": 32000 + }, + { + "epoch": 2.765562227310035, + "grad_norm": 0.6838304105116794, + "learning_rate": 1.5913207483728765e-07, + "loss": 0.0565, + "step": 32010 + }, + { + "epoch": 2.766426195516005, + "grad_norm": 0.7006017396514543, + "learning_rate": 1.579669055033034e-07, + "loss": 0.0563, + "step": 32020 + }, + { + "epoch": 2.767290163721975, + "grad_norm": 0.63390922464745, + "learning_rate": 1.5680594909018575e-07, + "loss": 0.0574, + "step": 32030 + }, + { + "epoch": 2.768154131927945, + "grad_norm": 0.630174268226932, + "learning_rate": 1.5564920660805115e-07, + "loss": 0.0563, + "step": 32040 + }, + { + "epoch": 2.769018100133915, + "grad_norm": 0.6950850152884743, + "learning_rate": 1.544966790633512e-07, + "loss": 0.055, + "step": 32050 + }, + { + "epoch": 2.769882068339885, + "grad_norm": 0.686069735603795, + "learning_rate": 1.5334836745886762e-07, + "loss": 0.0555, + "step": 32060 + }, + { + "epoch": 2.770746036545855, + "grad_norm": 0.6924302699941641, + "learning_rate": 1.5220427279371507e-07, + "loss": 0.0579, + "step": 32070 + }, + { + "epoch": 2.771610004751825, + "grad_norm": 0.6763504845141571, + "learning_rate": 1.51064396063339e-07, + "loss": 0.059, + "step": 32080 + }, + { + "epoch": 2.772473972957795, + "grad_norm": 0.6855868771736366, + "learning_rate": 1.4992873825951548e-07, + "loss": 0.0581, + "step": 32090 + }, + { + "epoch": 2.7733379411637653, + "grad_norm": 0.6702095553630669, + "learning_rate": 1.4879730037034857e-07, + "loss": 0.058, + "step": 32100 + }, + { + "epoch": 2.774201909369735, + "grad_norm": 0.6798131090156784, + "learning_rate": 1.47670083380273e-07, + "loss": 0.0562, + "step": 32110 + }, + { + "epoch": 2.775065877575705, + "grad_norm": 0.6878985668508981, + "learning_rate": 1.465470882700476e-07, + "loss": 0.0587, + "step": 32120 + }, + { + "epoch": 2.7759298457816755, + "grad_norm": 0.6956681523253542, + "learning_rate": 1.4542831601676078e-07, + "loss": 0.0569, + "step": 32130 + }, + { + "epoch": 2.7767938139876454, + "grad_norm": 0.6749804800774928, + "learning_rate": 1.4431376759382664e-07, + "loss": 0.0552, + "step": 32140 + }, + { + "epoch": 2.7776577821936153, + "grad_norm": 0.6820298790434, + "learning_rate": 1.432034439709812e-07, + "loss": 0.0601, + "step": 32150 + }, + { + "epoch": 2.778521750399585, + "grad_norm": 0.7023866826285801, + "learning_rate": 1.4209734611428882e-07, + "loss": 0.0588, + "step": 32160 + }, + { + "epoch": 2.779385718605555, + "grad_norm": 0.7251283545475257, + "learning_rate": 1.4099547498613365e-07, + "loss": 0.0611, + "step": 32170 + }, + { + "epoch": 2.7802496868115254, + "grad_norm": 0.6794399022502436, + "learning_rate": 1.398978315452243e-07, + "loss": 0.0586, + "step": 32180 + }, + { + "epoch": 2.7811136550174953, + "grad_norm": 0.6829730156934579, + "learning_rate": 1.3880441674659017e-07, + "loss": 0.0578, + "step": 32190 + }, + { + "epoch": 2.781977623223465, + "grad_norm": 0.6780607571604497, + "learning_rate": 1.3771523154158194e-07, + "loss": 0.0582, + "step": 32200 + }, + { + "epoch": 2.7828415914294355, + "grad_norm": 0.6944067360719522, + "learning_rate": 1.366302768778699e-07, + "loss": 0.0563, + "step": 32210 + }, + { + "epoch": 2.7837055596354054, + "grad_norm": 0.6926254817665085, + "learning_rate": 1.3554955369944334e-07, + "loss": 0.0575, + "step": 32220 + }, + { + "epoch": 2.7845695278413753, + "grad_norm": 0.6412247121885293, + "learning_rate": 1.3447306294661012e-07, + "loss": 0.0589, + "step": 32230 + }, + { + "epoch": 2.7854334960473457, + "grad_norm": 0.6625032725187675, + "learning_rate": 1.334008055559949e-07, + "loss": 0.057, + "step": 32240 + }, + { + "epoch": 2.7862974642533156, + "grad_norm": 0.6890514579031258, + "learning_rate": 1.3233278246054081e-07, + "loss": 0.0575, + "step": 32250 + }, + { + "epoch": 2.7871614324592855, + "grad_norm": 0.6749984423954664, + "learning_rate": 1.3126899458950403e-07, + "loss": 0.0567, + "step": 32260 + }, + { + "epoch": 2.7880254006652554, + "grad_norm": 0.6058312889602515, + "learning_rate": 1.302094428684586e-07, + "loss": 0.0576, + "step": 32270 + }, + { + "epoch": 2.7888893688712253, + "grad_norm": 0.6640679135768991, + "learning_rate": 1.2915412821929096e-07, + "loss": 0.0549, + "step": 32280 + }, + { + "epoch": 2.7897533370771956, + "grad_norm": 0.6667923213308632, + "learning_rate": 1.281030515602022e-07, + "loss": 0.0542, + "step": 32290 + }, + { + "epoch": 2.7906173052831655, + "grad_norm": 0.6763644991293586, + "learning_rate": 1.2705621380570465e-07, + "loss": 0.0564, + "step": 32300 + }, + { + "epoch": 2.7914812734891354, + "grad_norm": 0.671158560568952, + "learning_rate": 1.2601361586662364e-07, + "loss": 0.0585, + "step": 32310 + }, + { + "epoch": 2.792345241695106, + "grad_norm": 0.6522339042934434, + "learning_rate": 1.249752586500952e-07, + "loss": 0.0581, + "step": 32320 + }, + { + "epoch": 2.7932092099010757, + "grad_norm": 0.6979271307967363, + "learning_rate": 1.239411430595655e-07, + "loss": 0.0586, + "step": 32330 + }, + { + "epoch": 2.7940731781070456, + "grad_norm": 0.6799894206330298, + "learning_rate": 1.229112699947904e-07, + "loss": 0.0563, + "step": 32340 + }, + { + "epoch": 2.794937146313016, + "grad_norm": 0.677619561225624, + "learning_rate": 1.2188564035183425e-07, + "loss": 0.0556, + "step": 32350 + }, + { + "epoch": 2.795801114518986, + "grad_norm": 0.6297337136303034, + "learning_rate": 1.2086425502306986e-07, + "loss": 0.0553, + "step": 32360 + }, + { + "epoch": 2.7966650827249557, + "grad_norm": 0.6450798875927606, + "learning_rate": 1.1984711489717583e-07, + "loss": 0.057, + "step": 32370 + }, + { + "epoch": 2.7975290509309256, + "grad_norm": 0.6710396032708422, + "learning_rate": 1.188342208591392e-07, + "loss": 0.0572, + "step": 32380 + }, + { + "epoch": 2.7983930191368955, + "grad_norm": 0.6870288667869697, + "learning_rate": 1.1782557379025005e-07, + "loss": 0.0566, + "step": 32390 + }, + { + "epoch": 2.799256987342866, + "grad_norm": 0.6549415421772309, + "learning_rate": 1.1682117456810583e-07, + "loss": 0.0591, + "step": 32400 + }, + { + "epoch": 2.8001209555488358, + "grad_norm": 0.6412098336825817, + "learning_rate": 1.1582102406660634e-07, + "loss": 0.0572, + "step": 32410 + }, + { + "epoch": 2.8009849237548057, + "grad_norm": 0.6752618412945657, + "learning_rate": 1.1482512315595607e-07, + "loss": 0.058, + "step": 32420 + }, + { + "epoch": 2.801848891960776, + "grad_norm": 0.6359954770406422, + "learning_rate": 1.1383347270265965e-07, + "loss": 0.0553, + "step": 32430 + }, + { + "epoch": 2.802712860166746, + "grad_norm": 0.6876557978296493, + "learning_rate": 1.1284607356952638e-07, + "loss": 0.0575, + "step": 32440 + }, + { + "epoch": 2.803576828372716, + "grad_norm": 0.7021487811787359, + "learning_rate": 1.1186292661566511e-07, + "loss": 0.0566, + "step": 32450 + }, + { + "epoch": 2.804440796578686, + "grad_norm": 0.6750019608519389, + "learning_rate": 1.108840326964844e-07, + "loss": 0.0576, + "step": 32460 + }, + { + "epoch": 2.805304764784656, + "grad_norm": 0.6277430752290388, + "learning_rate": 1.0990939266369404e-07, + "loss": 0.0587, + "step": 32470 + }, + { + "epoch": 2.806168732990626, + "grad_norm": 0.6530520547027956, + "learning_rate": 1.0893900736530127e-07, + "loss": 0.0573, + "step": 32480 + }, + { + "epoch": 2.807032701196596, + "grad_norm": 0.6696408985828085, + "learning_rate": 1.0797287764561238e-07, + "loss": 0.0566, + "step": 32490 + }, + { + "epoch": 2.8078966694025658, + "grad_norm": 0.6628118939310904, + "learning_rate": 1.0701100434522882e-07, + "loss": 0.056, + "step": 32500 + }, + { + "epoch": 2.808760637608536, + "grad_norm": 0.6358518359835514, + "learning_rate": 1.0605338830105283e-07, + "loss": 0.0567, + "step": 32510 + }, + { + "epoch": 2.809624605814506, + "grad_norm": 0.6552015344909092, + "learning_rate": 1.0510003034627848e-07, + "loss": 0.0558, + "step": 32520 + }, + { + "epoch": 2.810488574020476, + "grad_norm": 0.6786865741009189, + "learning_rate": 1.0415093131039666e-07, + "loss": 0.0542, + "step": 32530 + }, + { + "epoch": 2.8113525422264463, + "grad_norm": 0.6708262449590945, + "learning_rate": 1.0320609201919241e-07, + "loss": 0.0546, + "step": 32540 + }, + { + "epoch": 2.812216510432416, + "grad_norm": 0.6871889215796761, + "learning_rate": 1.0226551329474477e-07, + "loss": 0.0591, + "step": 32550 + }, + { + "epoch": 2.813080478638386, + "grad_norm": 0.6796148770244584, + "learning_rate": 1.0132919595542634e-07, + "loss": 0.0572, + "step": 32560 + }, + { + "epoch": 2.813944446844356, + "grad_norm": 0.6448854547531903, + "learning_rate": 1.0039714081589991e-07, + "loss": 0.057, + "step": 32570 + }, + { + "epoch": 2.8148084150503263, + "grad_norm": 0.7214571409269863, + "learning_rate": 9.946934868712176e-08, + "loss": 0.0566, + "step": 32580 + }, + { + "epoch": 2.815672383256296, + "grad_norm": 0.6864676012136871, + "learning_rate": 9.854582037633953e-08, + "loss": 0.0559, + "step": 32590 + }, + { + "epoch": 2.816536351462266, + "grad_norm": 0.6875452157455118, + "learning_rate": 9.762655668708876e-08, + "loss": 0.0574, + "step": 32600 + }, + { + "epoch": 2.817400319668236, + "grad_norm": 0.6471250629240326, + "learning_rate": 9.671155841919577e-08, + "loss": 0.0574, + "step": 32610 + }, + { + "epoch": 2.8182642878742064, + "grad_norm": 0.6830004863111572, + "learning_rate": 9.580082636877652e-08, + "loss": 0.0569, + "step": 32620 + }, + { + "epoch": 2.8191282560801763, + "grad_norm": 0.6419472467128323, + "learning_rate": 9.489436132823326e-08, + "loss": 0.057, + "step": 32630 + }, + { + "epoch": 2.819992224286146, + "grad_norm": 0.657306933977155, + "learning_rate": 9.399216408625678e-08, + "loss": 0.0585, + "step": 32640 + }, + { + "epoch": 2.8208561924921165, + "grad_norm": 0.6432005216885947, + "learning_rate": 9.309423542782414e-08, + "loss": 0.0573, + "step": 32650 + }, + { + "epoch": 2.8217201606980864, + "grad_norm": 0.7002865300654664, + "learning_rate": 9.22005761341982e-08, + "loss": 0.0588, + "step": 32660 + }, + { + "epoch": 2.8225841289040563, + "grad_norm": 0.6612868667391867, + "learning_rate": 9.131118698292863e-08, + "loss": 0.0566, + "step": 32670 + }, + { + "epoch": 2.823448097110026, + "grad_norm": 0.6761202883755489, + "learning_rate": 9.042606874784809e-08, + "loss": 0.059, + "step": 32680 + }, + { + "epoch": 2.824312065315996, + "grad_norm": 0.6862611791080319, + "learning_rate": 8.954522219907392e-08, + "loss": 0.0547, + "step": 32690 + }, + { + "epoch": 2.8251760335219664, + "grad_norm": 0.6254683022957856, + "learning_rate": 8.866864810300579e-08, + "loss": 0.0564, + "step": 32700 + }, + { + "epoch": 2.8260400017279363, + "grad_norm": 0.6541988728358744, + "learning_rate": 8.77963472223281e-08, + "loss": 0.0576, + "step": 32710 + }, + { + "epoch": 2.8269039699339062, + "grad_norm": 0.7137230691053326, + "learning_rate": 8.692832031600539e-08, + "loss": 0.0568, + "step": 32720 + }, + { + "epoch": 2.8277679381398766, + "grad_norm": 0.6744846313172326, + "learning_rate": 8.60645681392841e-08, + "loss": 0.0563, + "step": 32730 + }, + { + "epoch": 2.8286319063458465, + "grad_norm": 0.7290262081800398, + "learning_rate": 8.520509144369194e-08, + "loss": 0.0582, + "step": 32740 + }, + { + "epoch": 2.8294958745518164, + "grad_norm": 0.6928234087420588, + "learning_rate": 8.434989097703572e-08, + "loss": 0.0565, + "step": 32750 + }, + { + "epoch": 2.8303598427577867, + "grad_norm": 0.6537822863085431, + "learning_rate": 8.349896748340191e-08, + "loss": 0.0576, + "step": 32760 + }, + { + "epoch": 2.8312238109637566, + "grad_norm": 0.6664152071722941, + "learning_rate": 8.265232170315607e-08, + "loss": 0.0584, + "step": 32770 + }, + { + "epoch": 2.8320877791697265, + "grad_norm": 0.6543148772010279, + "learning_rate": 8.180995437294226e-08, + "loss": 0.0564, + "step": 32780 + }, + { + "epoch": 2.8329517473756964, + "grad_norm": 0.6469114237292819, + "learning_rate": 8.097186622568032e-08, + "loss": 0.0566, + "step": 32790 + }, + { + "epoch": 2.8338157155816663, + "grad_norm": 0.6486680998913869, + "learning_rate": 8.013805799056806e-08, + "loss": 0.0559, + "step": 32800 + }, + { + "epoch": 2.8346796837876367, + "grad_norm": 0.6528582438625301, + "learning_rate": 7.930853039307962e-08, + "loss": 0.0569, + "step": 32810 + }, + { + "epoch": 2.8355436519936066, + "grad_norm": 0.6647853143387031, + "learning_rate": 7.848328415496375e-08, + "loss": 0.0584, + "step": 32820 + }, + { + "epoch": 2.8364076201995765, + "grad_norm": 0.6857635854830215, + "learning_rate": 7.766231999424556e-08, + "loss": 0.0569, + "step": 32830 + }, + { + "epoch": 2.837271588405547, + "grad_norm": 0.6888594940329839, + "learning_rate": 7.684563862522254e-08, + "loss": 0.0581, + "step": 32840 + }, + { + "epoch": 2.8381355566115167, + "grad_norm": 0.6483498063174805, + "learning_rate": 7.603324075846741e-08, + "loss": 0.0548, + "step": 32850 + }, + { + "epoch": 2.8389995248174866, + "grad_norm": 0.6961287471485743, + "learning_rate": 7.522512710082531e-08, + "loss": 0.0572, + "step": 32860 + }, + { + "epoch": 2.839863493023457, + "grad_norm": 0.6713877134589054, + "learning_rate": 7.442129835541433e-08, + "loss": 0.0558, + "step": 32870 + }, + { + "epoch": 2.840727461229427, + "grad_norm": 0.6447047687390621, + "learning_rate": 7.362175522162284e-08, + "loss": 0.0558, + "step": 32880 + }, + { + "epoch": 2.8415914294353968, + "grad_norm": 0.6950714605507896, + "learning_rate": 7.282649839511269e-08, + "loss": 0.0562, + "step": 32890 + }, + { + "epoch": 2.8424553976413667, + "grad_norm": 0.662844122297223, + "learning_rate": 7.203552856781482e-08, + "loss": 0.057, + "step": 32900 + }, + { + "epoch": 2.8433193658473366, + "grad_norm": 0.6764003819400013, + "learning_rate": 7.124884642793039e-08, + "loss": 0.0576, + "step": 32910 + }, + { + "epoch": 2.844183334053307, + "grad_norm": 0.6665223091511387, + "learning_rate": 7.046645265992968e-08, + "loss": 0.0546, + "step": 32920 + }, + { + "epoch": 2.845047302259277, + "grad_norm": 0.7105587069690044, + "learning_rate": 6.968834794455203e-08, + "loss": 0.0583, + "step": 32930 + }, + { + "epoch": 2.8459112704652467, + "grad_norm": 0.6579588776400764, + "learning_rate": 6.891453295880591e-08, + "loss": 0.0572, + "step": 32940 + }, + { + "epoch": 2.846775238671217, + "grad_norm": 0.6500028386919731, + "learning_rate": 6.81450083759666e-08, + "loss": 0.0572, + "step": 32950 + }, + { + "epoch": 2.847639206877187, + "grad_norm": 0.7149086521713711, + "learning_rate": 6.737977486557578e-08, + "loss": 0.0543, + "step": 32960 + }, + { + "epoch": 2.848503175083157, + "grad_norm": 0.6856207174938993, + "learning_rate": 6.661883309344253e-08, + "loss": 0.0568, + "step": 32970 + }, + { + "epoch": 2.849367143289127, + "grad_norm": 0.6891817002441347, + "learning_rate": 6.586218372164166e-08, + "loss": 0.0571, + "step": 32980 + }, + { + "epoch": 2.850231111495097, + "grad_norm": 0.6777405340120837, + "learning_rate": 6.510982740851269e-08, + "loss": 0.0564, + "step": 32990 + }, + { + "epoch": 2.851095079701067, + "grad_norm": 0.6673915333144583, + "learning_rate": 6.436176480866141e-08, + "loss": 0.0569, + "step": 33000 + }, + { + "epoch": 2.851959047907037, + "grad_norm": 0.6734418146878922, + "learning_rate": 6.361799657295552e-08, + "loss": 0.0586, + "step": 33010 + }, + { + "epoch": 2.852823016113007, + "grad_norm": 0.6904229506397264, + "learning_rate": 6.287852334852795e-08, + "loss": 0.06, + "step": 33020 + }, + { + "epoch": 2.853686984318977, + "grad_norm": 0.6830159237337381, + "learning_rate": 6.214334577877401e-08, + "loss": 0.0554, + "step": 33030 + }, + { + "epoch": 2.854550952524947, + "grad_norm": 0.7220761887506169, + "learning_rate": 6.141246450335148e-08, + "loss": 0.0586, + "step": 33040 + }, + { + "epoch": 2.855414920730917, + "grad_norm": 0.6791045442446241, + "learning_rate": 6.068588015818055e-08, + "loss": 0.0579, + "step": 33050 + }, + { + "epoch": 2.8562788889368873, + "grad_norm": 0.6860422169888754, + "learning_rate": 5.996359337544277e-08, + "loss": 0.0554, + "step": 33060 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.6926889419675112, + "learning_rate": 5.924560478357988e-08, + "loss": 0.0572, + "step": 33070 + }, + { + "epoch": 2.858006825348827, + "grad_norm": 0.6572432697985694, + "learning_rate": 5.853191500729327e-08, + "loss": 0.0589, + "step": 33080 + }, + { + "epoch": 2.8588707935547975, + "grad_norm": 0.6959692528825726, + "learning_rate": 5.782252466754623e-08, + "loss": 0.0577, + "step": 33090 + }, + { + "epoch": 2.8597347617607674, + "grad_norm": 0.6563165677197141, + "learning_rate": 5.711743438155892e-08, + "loss": 0.0583, + "step": 33100 + }, + { + "epoch": 2.8605987299667373, + "grad_norm": 0.6539622559768795, + "learning_rate": 5.6416644762812277e-08, + "loss": 0.0578, + "step": 33110 + }, + { + "epoch": 2.861462698172707, + "grad_norm": 0.6369086566201649, + "learning_rate": 5.5720156421043556e-08, + "loss": 0.0564, + "step": 33120 + }, + { + "epoch": 2.862326666378677, + "grad_norm": 0.6420979588342911, + "learning_rate": 5.502796996224746e-08, + "loss": 0.0563, + "step": 33130 + }, + { + "epoch": 2.8631906345846474, + "grad_norm": 0.6695727199391603, + "learning_rate": 5.434008598867835e-08, + "loss": 0.0554, + "step": 33140 + }, + { + "epoch": 2.8640546027906173, + "grad_norm": 0.6600310210773601, + "learning_rate": 5.365650509884357e-08, + "loss": 0.0586, + "step": 33150 + }, + { + "epoch": 2.864918570996587, + "grad_norm": 0.6484929852915966, + "learning_rate": 5.297722788750958e-08, + "loss": 0.0556, + "step": 33160 + }, + { + "epoch": 2.8657825392025575, + "grad_norm": 0.7134033307243018, + "learning_rate": 5.230225494569585e-08, + "loss": 0.0543, + "step": 33170 + }, + { + "epoch": 2.8666465074085274, + "grad_norm": 0.6946163701427388, + "learning_rate": 5.1631586860678703e-08, + "loss": 0.0557, + "step": 33180 + }, + { + "epoch": 2.8675104756144973, + "grad_norm": 0.6743236939216565, + "learning_rate": 5.0965224215987484e-08, + "loss": 0.0557, + "step": 33190 + }, + { + "epoch": 2.8683744438204672, + "grad_norm": 0.6385782451925563, + "learning_rate": 5.030316759140674e-08, + "loss": 0.0543, + "step": 33200 + }, + { + "epoch": 2.8692384120264376, + "grad_norm": 0.6894700813928674, + "learning_rate": 4.9645417562973476e-08, + "loss": 0.056, + "step": 33210 + }, + { + "epoch": 2.8701023802324075, + "grad_norm": 0.6675629300734072, + "learning_rate": 4.899197470297823e-08, + "loss": 0.056, + "step": 33220 + }, + { + "epoch": 2.8709663484383774, + "grad_norm": 0.668713281409164, + "learning_rate": 4.834283957996344e-08, + "loss": 0.0547, + "step": 33230 + }, + { + "epoch": 2.8718303166443473, + "grad_norm": 0.6665885336321217, + "learning_rate": 4.7698012758724547e-08, + "loss": 0.0559, + "step": 33240 + }, + { + "epoch": 2.8726942848503176, + "grad_norm": 0.6821105530534564, + "learning_rate": 4.705749480030719e-08, + "loss": 0.0572, + "step": 33250 + }, + { + "epoch": 2.8735582530562875, + "grad_norm": 0.6708658577242343, + "learning_rate": 4.6421286262008924e-08, + "loss": 0.0572, + "step": 33260 + }, + { + "epoch": 2.8744222212622574, + "grad_norm": 0.647767125351178, + "learning_rate": 4.578938769737751e-08, + "loss": 0.0584, + "step": 33270 + }, + { + "epoch": 2.875286189468228, + "grad_norm": 0.6780529409748565, + "learning_rate": 4.5161799656209814e-08, + "loss": 0.0559, + "step": 33280 + }, + { + "epoch": 2.8761501576741977, + "grad_norm": 0.649348742633913, + "learning_rate": 4.453852268455461e-08, + "loss": 0.0593, + "step": 33290 + }, + { + "epoch": 2.8770141258801676, + "grad_norm": 0.6676144330672698, + "learning_rate": 4.391955732470643e-08, + "loss": 0.0566, + "step": 33300 + }, + { + "epoch": 2.8778780940861375, + "grad_norm": 0.686248518816948, + "learning_rate": 4.330490411521116e-08, + "loss": 0.0577, + "step": 33310 + }, + { + "epoch": 2.8787420622921074, + "grad_norm": 0.6842785616361624, + "learning_rate": 4.2694563590861546e-08, + "loss": 0.0562, + "step": 33320 + }, + { + "epoch": 2.8796060304980777, + "grad_norm": 0.6437126649798809, + "learning_rate": 4.2088536282698376e-08, + "loss": 0.057, + "step": 33330 + }, + { + "epoch": 2.8804699987040476, + "grad_norm": 0.6942606890668203, + "learning_rate": 4.14868227180093e-08, + "loss": 0.0588, + "step": 33340 + }, + { + "epoch": 2.8813339669100175, + "grad_norm": 0.6577488295499587, + "learning_rate": 4.0889423420328866e-08, + "loss": 0.0559, + "step": 33350 + }, + { + "epoch": 2.882197935115988, + "grad_norm": 0.6658072223920517, + "learning_rate": 4.029633890943796e-08, + "loss": 0.0557, + "step": 33360 + }, + { + "epoch": 2.8830619033219578, + "grad_norm": 0.6897837859797347, + "learning_rate": 3.97075697013638e-08, + "loss": 0.0577, + "step": 33370 + }, + { + "epoch": 2.8839258715279277, + "grad_norm": 0.6755459921707765, + "learning_rate": 3.912311630837717e-08, + "loss": 0.0582, + "step": 33380 + }, + { + "epoch": 2.884789839733898, + "grad_norm": 0.688782499527184, + "learning_rate": 3.8542979238995746e-08, + "loss": 0.0589, + "step": 33390 + }, + { + "epoch": 2.885653807939868, + "grad_norm": 0.6655676381591664, + "learning_rate": 3.7967158997981315e-08, + "loss": 0.0576, + "step": 33400 + }, + { + "epoch": 2.886517776145838, + "grad_norm": 0.6986677921599695, + "learning_rate": 3.739565608633866e-08, + "loss": 0.0558, + "step": 33410 + }, + { + "epoch": 2.8873817443518077, + "grad_norm": 0.663380414330348, + "learning_rate": 3.6828471001317254e-08, + "loss": 0.0577, + "step": 33420 + }, + { + "epoch": 2.8882457125577776, + "grad_norm": 0.6936533010310243, + "learning_rate": 3.626560423640957e-08, + "loss": 0.0589, + "step": 33430 + }, + { + "epoch": 2.889109680763748, + "grad_norm": 0.6477438704961663, + "learning_rate": 3.570705628134996e-08, + "loss": 0.0577, + "step": 33440 + }, + { + "epoch": 2.889973648969718, + "grad_norm": 0.6871551024773646, + "learning_rate": 3.5152827622116356e-08, + "loss": 0.0571, + "step": 33450 + }, + { + "epoch": 2.8908376171756878, + "grad_norm": 0.6764162456797573, + "learning_rate": 3.460291874092747e-08, + "loss": 0.0584, + "step": 33460 + }, + { + "epoch": 2.891701585381658, + "grad_norm": 0.6676655057360613, + "learning_rate": 3.405733011624446e-08, + "loss": 0.0578, + "step": 33470 + }, + { + "epoch": 2.892565553587628, + "grad_norm": 0.662029257328337, + "learning_rate": 3.3516062222768706e-08, + "loss": 0.0547, + "step": 33480 + }, + { + "epoch": 2.893429521793598, + "grad_norm": 0.7215604812631449, + "learning_rate": 3.297911553144295e-08, + "loss": 0.0572, + "step": 33490 + }, + { + "epoch": 2.8942934899995683, + "grad_norm": 0.6623796053756021, + "learning_rate": 3.2446490509450145e-08, + "loss": 0.0557, + "step": 33500 + }, + { + "epoch": 2.895157458205538, + "grad_norm": 0.6771863576032046, + "learning_rate": 3.1918187620211816e-08, + "loss": 0.0571, + "step": 33510 + }, + { + "epoch": 2.896021426411508, + "grad_norm": 0.6760263824674185, + "learning_rate": 3.1394207323390826e-08, + "loss": 0.0572, + "step": 33520 + }, + { + "epoch": 2.896885394617478, + "grad_norm": 0.664126918625583, + "learning_rate": 3.087455007488804e-08, + "loss": 0.0575, + "step": 33530 + }, + { + "epoch": 2.897749362823448, + "grad_norm": 0.695653549487125, + "learning_rate": 3.035921632684236e-08, + "loss": 0.0578, + "step": 33540 + }, + { + "epoch": 2.898613331029418, + "grad_norm": 0.6849873662405368, + "learning_rate": 2.984820652763232e-08, + "loss": 0.0565, + "step": 33550 + }, + { + "epoch": 2.899477299235388, + "grad_norm": 0.6401444792075645, + "learning_rate": 2.9341521121873385e-08, + "loss": 0.0587, + "step": 33560 + }, + { + "epoch": 2.900341267441358, + "grad_norm": 0.6562321741503842, + "learning_rate": 2.8839160550418466e-08, + "loss": 0.0578, + "step": 33570 + }, + { + "epoch": 2.9012052356473284, + "grad_norm": 0.7165707023965725, + "learning_rate": 2.8341125250357928e-08, + "loss": 0.056, + "step": 33580 + }, + { + "epoch": 2.9020692038532983, + "grad_norm": 0.6535414446662702, + "learning_rate": 2.7847415655019026e-08, + "loss": 0.0562, + "step": 33590 + }, + { + "epoch": 2.902933172059268, + "grad_norm": 0.6590199643709393, + "learning_rate": 2.7358032193964824e-08, + "loss": 0.0568, + "step": 33600 + }, + { + "epoch": 2.9037971402652385, + "grad_norm": 0.7038395574717661, + "learning_rate": 2.6872975292994153e-08, + "loss": 0.0574, + "step": 33610 + }, + { + "epoch": 2.9046611084712084, + "grad_norm": 0.6603831559878955, + "learning_rate": 2.63922453741422e-08, + "loss": 0.0537, + "step": 33620 + }, + { + "epoch": 2.9055250766771783, + "grad_norm": 0.6809259145907269, + "learning_rate": 2.5915842855678276e-08, + "loss": 0.0561, + "step": 33630 + }, + { + "epoch": 2.906389044883148, + "grad_norm": 0.6675281253432127, + "learning_rate": 2.5443768152108583e-08, + "loss": 0.0545, + "step": 33640 + }, + { + "epoch": 2.907253013089118, + "grad_norm": 0.6681893173881671, + "learning_rate": 2.4976021674171236e-08, + "loss": 0.0547, + "step": 33650 + }, + { + "epoch": 2.9081169812950884, + "grad_norm": 0.6592809611380828, + "learning_rate": 2.4512603828839575e-08, + "loss": 0.0561, + "step": 33660 + }, + { + "epoch": 2.9089809495010583, + "grad_norm": 0.6490028684571275, + "learning_rate": 2.4053515019322184e-08, + "loss": 0.0565, + "step": 33670 + }, + { + "epoch": 2.9098449177070282, + "grad_norm": 0.656601858138169, + "learning_rate": 2.3598755645057868e-08, + "loss": 0.0571, + "step": 33680 + }, + { + "epoch": 2.9107088859129986, + "grad_norm": 0.6941640271804507, + "learning_rate": 2.3148326101722352e-08, + "loss": 0.0574, + "step": 33690 + }, + { + "epoch": 2.9115728541189685, + "grad_norm": 0.6603126085653375, + "learning_rate": 2.2702226781219916e-08, + "loss": 0.0581, + "step": 33700 + }, + { + "epoch": 2.9124368223249384, + "grad_norm": 0.668590890656347, + "learning_rate": 2.2260458071691194e-08, + "loss": 0.0568, + "step": 33710 + }, + { + "epoch": 2.9133007905309087, + "grad_norm": 0.6493466506588934, + "learning_rate": 2.1823020357506497e-08, + "loss": 0.0569, + "step": 33720 + }, + { + "epoch": 2.9141647587368786, + "grad_norm": 0.6539909184550178, + "learning_rate": 2.1389914019268045e-08, + "loss": 0.0574, + "step": 33730 + }, + { + "epoch": 2.9150287269428485, + "grad_norm": 0.6527098662158097, + "learning_rate": 2.0961139433810508e-08, + "loss": 0.0556, + "step": 33740 + }, + { + "epoch": 2.9158926951488184, + "grad_norm": 0.7010404064747029, + "learning_rate": 2.0536696974198245e-08, + "loss": 0.0556, + "step": 33750 + }, + { + "epoch": 2.9167566633547883, + "grad_norm": 0.6286359295433286, + "learning_rate": 2.0116587009727516e-08, + "loss": 0.0565, + "step": 33760 + }, + { + "epoch": 2.9176206315607587, + "grad_norm": 0.6462394665804463, + "learning_rate": 1.9700809905924267e-08, + "loss": 0.0551, + "step": 33770 + }, + { + "epoch": 2.9184845997667286, + "grad_norm": 0.6467936095668956, + "learning_rate": 1.928936602454523e-08, + "loss": 0.0566, + "step": 33780 + }, + { + "epoch": 2.9193485679726985, + "grad_norm": 0.6431999964981738, + "learning_rate": 1.888225572357627e-08, + "loss": 0.0527, + "step": 33790 + }, + { + "epoch": 2.920212536178669, + "grad_norm": 0.677044077943376, + "learning_rate": 1.8479479357232933e-08, + "loss": 0.0584, + "step": 33800 + }, + { + "epoch": 2.9210765043846387, + "grad_norm": 0.6490379738984648, + "learning_rate": 1.808103727595989e-08, + "loss": 0.0556, + "step": 33810 + }, + { + "epoch": 2.9219404725906086, + "grad_norm": 0.6365858333329392, + "learning_rate": 1.7686929826430944e-08, + "loss": 0.0579, + "step": 33820 + }, + { + "epoch": 2.9228044407965785, + "grad_norm": 0.6736372234260222, + "learning_rate": 1.7297157351547912e-08, + "loss": 0.055, + "step": 33830 + }, + { + "epoch": 2.9236684090025484, + "grad_norm": 0.6796407556520421, + "learning_rate": 1.6911720190441183e-08, + "loss": 0.0543, + "step": 33840 + }, + { + "epoch": 2.9245323772085188, + "grad_norm": 0.6906670089068052, + "learning_rate": 1.6530618678469723e-08, + "loss": 0.0585, + "step": 33850 + }, + { + "epoch": 2.9253963454144887, + "grad_norm": 0.6885910897171846, + "learning_rate": 1.6153853147218846e-08, + "loss": 0.058, + "step": 33860 + }, + { + "epoch": 2.9262603136204586, + "grad_norm": 0.6780701321572589, + "learning_rate": 1.5781423924502438e-08, + "loss": 0.0557, + "step": 33870 + }, + { + "epoch": 2.927124281826429, + "grad_norm": 0.6588324510313825, + "learning_rate": 1.541333133436018e-08, + "loss": 0.058, + "step": 33880 + }, + { + "epoch": 2.927988250032399, + "grad_norm": 0.6491208370689091, + "learning_rate": 1.5049575697060337e-08, + "loss": 0.0569, + "step": 33890 + }, + { + "epoch": 2.9288522182383687, + "grad_norm": 0.6517897930275939, + "learning_rate": 1.4690157329096399e-08, + "loss": 0.0555, + "step": 33900 + }, + { + "epoch": 2.929716186444339, + "grad_norm": 0.7042225264467061, + "learning_rate": 1.4335076543188776e-08, + "loss": 0.0555, + "step": 33910 + }, + { + "epoch": 2.930580154650309, + "grad_norm": 0.6580352339265412, + "learning_rate": 1.398433364828311e-08, + "loss": 0.0579, + "step": 33920 + }, + { + "epoch": 2.931444122856279, + "grad_norm": 0.6628295724247529, + "learning_rate": 1.3637928949551404e-08, + "loss": 0.0563, + "step": 33930 + }, + { + "epoch": 2.9323080910622488, + "grad_norm": 0.6664636967953322, + "learning_rate": 1.3295862748390898e-08, + "loss": 0.0564, + "step": 33940 + }, + { + "epoch": 2.9331720592682187, + "grad_norm": 0.684321235610932, + "learning_rate": 1.2958135342424071e-08, + "loss": 0.0549, + "step": 33950 + }, + { + "epoch": 2.934036027474189, + "grad_norm": 0.678915128766411, + "learning_rate": 1.262474702549865e-08, + "loss": 0.0569, + "step": 33960 + }, + { + "epoch": 2.934899995680159, + "grad_norm": 0.6801977159105094, + "learning_rate": 1.2295698087685937e-08, + "loss": 0.0582, + "step": 33970 + }, + { + "epoch": 2.935763963886129, + "grad_norm": 0.630776425498901, + "learning_rate": 1.1970988815283025e-08, + "loss": 0.0563, + "step": 33980 + }, + { + "epoch": 2.936627932092099, + "grad_norm": 0.6857107028827767, + "learning_rate": 1.1650619490810588e-08, + "loss": 0.0551, + "step": 33990 + }, + { + "epoch": 2.937491900298069, + "grad_norm": 0.6480468923534133, + "learning_rate": 1.1334590393013434e-08, + "loss": 0.0559, + "step": 34000 + }, + { + "epoch": 2.938355868504039, + "grad_norm": 0.6734319616204011, + "learning_rate": 1.1022901796858832e-08, + "loss": 0.0559, + "step": 34010 + }, + { + "epoch": 2.9392198367100093, + "grad_norm": 0.6603543423140258, + "learning_rate": 1.0715553973539849e-08, + "loss": 0.0588, + "step": 34020 + }, + { + "epoch": 2.940083804915979, + "grad_norm": 0.7978284748396589, + "learning_rate": 1.0412547190470912e-08, + "loss": 0.0564, + "step": 34030 + }, + { + "epoch": 2.940947773121949, + "grad_norm": 0.681328579342505, + "learning_rate": 1.0113881711289464e-08, + "loss": 0.0581, + "step": 34040 + }, + { + "epoch": 2.941811741327919, + "grad_norm": 0.6893855973861117, + "learning_rate": 9.819557795857082e-09, + "loss": 0.058, + "step": 34050 + }, + { + "epoch": 2.942675709533889, + "grad_norm": 0.6615292039101736, + "learning_rate": 9.52957570025559e-09, + "loss": 0.0558, + "step": 34060 + }, + { + "epoch": 2.9435396777398593, + "grad_norm": 0.6897433502814333, + "learning_rate": 9.243935676792603e-09, + "loss": 0.0573, + "step": 34070 + }, + { + "epoch": 2.944403645945829, + "grad_norm": 0.6570832913653472, + "learning_rate": 8.96263797399377e-09, + "loss": 0.0556, + "step": 34080 + }, + { + "epoch": 2.945267614151799, + "grad_norm": 0.6792304176789006, + "learning_rate": 8.685682836609421e-09, + "loss": 0.0543, + "step": 34090 + }, + { + "epoch": 2.9461315823577694, + "grad_norm": 0.6579739271767348, + "learning_rate": 8.413070505610132e-09, + "loss": 0.058, + "step": 34100 + }, + { + "epoch": 2.9469955505637393, + "grad_norm": 0.669809562415206, + "learning_rate": 8.144801218189502e-09, + "loss": 0.0578, + "step": 34110 + }, + { + "epoch": 2.947859518769709, + "grad_norm": 0.6799320416148782, + "learning_rate": 7.880875207760264e-09, + "loss": 0.0567, + "step": 34120 + }, + { + "epoch": 2.9487234869756795, + "grad_norm": 0.6629897134562075, + "learning_rate": 7.621292703957617e-09, + "loss": 0.0578, + "step": 34130 + }, + { + "epoch": 2.9495874551816494, + "grad_norm": 0.6552021162116146, + "learning_rate": 7.366053932637563e-09, + "loss": 0.0538, + "step": 34140 + }, + { + "epoch": 2.9504514233876193, + "grad_norm": 0.683065500854779, + "learning_rate": 7.115159115875792e-09, + "loss": 0.0553, + "step": 34150 + }, + { + "epoch": 2.9513153915935892, + "grad_norm": 0.6874739280286147, + "learning_rate": 6.8686084719693515e-09, + "loss": 0.0597, + "step": 34160 + }, + { + "epoch": 2.952179359799559, + "grad_norm": 0.6422584639658296, + "learning_rate": 6.626402215434979e-09, + "loss": 0.0569, + "step": 34170 + }, + { + "epoch": 2.9530433280055295, + "grad_norm": 0.7158995024434536, + "learning_rate": 6.388540557010214e-09, + "loss": 0.0574, + "step": 34180 + }, + { + "epoch": 2.9539072962114994, + "grad_norm": 0.6680913688006366, + "learning_rate": 6.15502370365173e-09, + "loss": 0.0564, + "step": 34190 + }, + { + "epoch": 2.9547712644174693, + "grad_norm": 0.7093620408628543, + "learning_rate": 5.925851858535891e-09, + "loss": 0.0567, + "step": 34200 + }, + { + "epoch": 2.9556352326234396, + "grad_norm": 0.6286177131295109, + "learning_rate": 5.7010252210593085e-09, + "loss": 0.0561, + "step": 34210 + }, + { + "epoch": 2.9564992008294095, + "grad_norm": 0.6348257609589285, + "learning_rate": 5.480543986837172e-09, + "loss": 0.0525, + "step": 34220 + }, + { + "epoch": 2.9573631690353794, + "grad_norm": 0.7138653645631411, + "learning_rate": 5.264408347704364e-09, + "loss": 0.0563, + "step": 34230 + }, + { + "epoch": 2.95822713724135, + "grad_norm": 0.6958979162027398, + "learning_rate": 5.0526184917149e-09, + "loss": 0.0545, + "step": 34240 + }, + { + "epoch": 2.9590911054473197, + "grad_norm": 0.6673351506192208, + "learning_rate": 4.845174603140823e-09, + "loss": 0.0573, + "step": 34250 + }, + { + "epoch": 2.9599550736532896, + "grad_norm": 0.6597776756569177, + "learning_rate": 4.6420768624738654e-09, + "loss": 0.0543, + "step": 34260 + }, + { + "epoch": 2.9608190418592595, + "grad_norm": 0.6907853197848071, + "learning_rate": 4.44332544642323e-09, + "loss": 0.0561, + "step": 34270 + }, + { + "epoch": 2.9616830100652294, + "grad_norm": 0.6367120740998353, + "learning_rate": 4.2489205279172555e-09, + "loss": 0.0556, + "step": 34280 + }, + { + "epoch": 2.9625469782711997, + "grad_norm": 0.6835548939710773, + "learning_rate": 4.05886227610286e-09, + "loss": 0.0563, + "step": 34290 + }, + { + "epoch": 2.9634109464771696, + "grad_norm": 0.6656890164686422, + "learning_rate": 3.873150856344432e-09, + "loss": 0.0568, + "step": 34300 + }, + { + "epoch": 2.9642749146831395, + "grad_norm": 0.6556734265025637, + "learning_rate": 3.6917864302238315e-09, + "loss": 0.0564, + "step": 34310 + }, + { + "epoch": 2.96513888288911, + "grad_norm": 0.7248595471350836, + "learning_rate": 3.5147691555420526e-09, + "loss": 0.057, + "step": 34320 + }, + { + "epoch": 2.9660028510950798, + "grad_norm": 0.662123530396553, + "learning_rate": 3.34209918631645e-09, + "loss": 0.0583, + "step": 34330 + }, + { + "epoch": 2.9668668193010497, + "grad_norm": 0.6509452814103437, + "learning_rate": 3.1737766727824028e-09, + "loss": 0.0582, + "step": 34340 + }, + { + "epoch": 2.96773078750702, + "grad_norm": 0.7050813393203283, + "learning_rate": 3.0098017613933163e-09, + "loss": 0.0564, + "step": 34350 + }, + { + "epoch": 2.96859475571299, + "grad_norm": 0.6654292686780983, + "learning_rate": 2.8501745948184e-09, + "loss": 0.0559, + "step": 34360 + }, + { + "epoch": 2.96945872391896, + "grad_norm": 0.6821160988216692, + "learning_rate": 2.6948953119459997e-09, + "loss": 0.0571, + "step": 34370 + }, + { + "epoch": 2.9703226921249297, + "grad_norm": 0.684418049892257, + "learning_rate": 2.54396404787971e-09, + "loss": 0.0555, + "step": 34380 + }, + { + "epoch": 2.9711866603308996, + "grad_norm": 0.6814190481168771, + "learning_rate": 2.3973809339405963e-09, + "loss": 0.0571, + "step": 34390 + }, + { + "epoch": 2.97205062853687, + "grad_norm": 0.6795433479173877, + "learning_rate": 2.2551460976666385e-09, + "loss": 0.0565, + "step": 34400 + }, + { + "epoch": 2.97291459674284, + "grad_norm": 0.6785623293407695, + "learning_rate": 2.1172596628127316e-09, + "loss": 0.0567, + "step": 34410 + }, + { + "epoch": 2.9737785649488098, + "grad_norm": 0.6809051694327195, + "learning_rate": 1.9837217493501313e-09, + "loss": 0.0555, + "step": 34420 + }, + { + "epoch": 2.97464253315478, + "grad_norm": 0.6369055902211925, + "learning_rate": 1.8545324734664526e-09, + "loss": 0.0569, + "step": 34430 + }, + { + "epoch": 2.97550650136075, + "grad_norm": 0.6910310396003512, + "learning_rate": 1.7296919475662254e-09, + "loss": 0.0562, + "step": 34440 + }, + { + "epoch": 2.97637046956672, + "grad_norm": 0.7202616011304915, + "learning_rate": 1.6092002802686747e-09, + "loss": 0.0578, + "step": 34450 + }, + { + "epoch": 2.97723443777269, + "grad_norm": 0.6564232420344026, + "learning_rate": 1.49305757641105e-09, + "loss": 0.0561, + "step": 34460 + }, + { + "epoch": 2.9780984059786597, + "grad_norm": 0.6755704437504937, + "learning_rate": 1.3812639370458514e-09, + "loss": 0.0579, + "step": 34470 + }, + { + "epoch": 2.97896237418463, + "grad_norm": 0.6502855806426288, + "learning_rate": 1.2738194594419384e-09, + "loss": 0.0557, + "step": 34480 + }, + { + "epoch": 2.9798263423906, + "grad_norm": 0.6439414003128155, + "learning_rate": 1.1707242370834204e-09, + "loss": 0.0575, + "step": 34490 + }, + { + "epoch": 2.98069031059657, + "grad_norm": 0.6340317895115861, + "learning_rate": 1.0719783596707666e-09, + "loss": 0.0564, + "step": 34500 + }, + { + "epoch": 2.98155427880254, + "grad_norm": 0.6753564648972211, + "learning_rate": 9.775819131202513e-10, + "loss": 0.0556, + "step": 34510 + }, + { + "epoch": 2.98241824700851, + "grad_norm": 0.6455724063217956, + "learning_rate": 8.87534979562843e-10, + "loss": 0.0559, + "step": 34520 + }, + { + "epoch": 2.98328221521448, + "grad_norm": 0.6909467453570447, + "learning_rate": 8.018376373469805e-10, + "loss": 0.0557, + "step": 34530 + }, + { + "epoch": 2.9841461834204503, + "grad_norm": 0.6744291133600068, + "learning_rate": 7.204899610352423e-10, + "loss": 0.0538, + "step": 34540 + }, + { + "epoch": 2.9850101516264202, + "grad_norm": 0.6400069292789241, + "learning_rate": 6.434920214060114e-10, + "loss": 0.0565, + "step": 34550 + }, + { + "epoch": 2.98587411983239, + "grad_norm": 0.6660665422309995, + "learning_rate": 5.708438854523656e-10, + "loss": 0.0567, + "step": 34560 + }, + { + "epoch": 2.98673808803836, + "grad_norm": 0.6877542654558588, + "learning_rate": 5.02545616384853e-10, + "loss": 0.0589, + "step": 34570 + }, + { + "epoch": 2.98760205624433, + "grad_norm": 0.6685372840389431, + "learning_rate": 4.385972736264954e-10, + "loss": 0.0567, + "step": 34580 + }, + { + "epoch": 2.9884660244503003, + "grad_norm": 0.6579606458219801, + "learning_rate": 3.7899891281834025e-10, + "loss": 0.0568, + "step": 34590 + }, + { + "epoch": 2.98932999265627, + "grad_norm": 0.6810902715442537, + "learning_rate": 3.2375058581446407e-10, + "loss": 0.0585, + "step": 34600 + }, + { + "epoch": 2.99019396086224, + "grad_norm": 0.6969850812240518, + "learning_rate": 2.728523406847483e-10, + "loss": 0.0569, + "step": 34610 + }, + { + "epoch": 2.9910579290682104, + "grad_norm": 0.6943564708722185, + "learning_rate": 2.263042217148792e-10, + "loss": 0.0569, + "step": 34620 + }, + { + "epoch": 2.9919218972741803, + "grad_norm": 0.7012718186319877, + "learning_rate": 1.8410626940523757e-10, + "loss": 0.0564, + "step": 34630 + }, + { + "epoch": 2.9927858654801502, + "grad_norm": 0.6768713971350516, + "learning_rate": 1.4625852047034373e-10, + "loss": 0.0591, + "step": 34640 + }, + { + "epoch": 2.9936498336861206, + "grad_norm": 0.6859084073565257, + "learning_rate": 1.1276100784163302e-10, + "loss": 0.0576, + "step": 34650 + }, + { + "epoch": 2.9945138018920905, + "grad_norm": 0.6851362984991908, + "learning_rate": 8.361376066357007e-11, + "loss": 0.0553, + "step": 34660 + }, + { + "epoch": 2.9953777700980604, + "grad_norm": 0.6529437264727286, + "learning_rate": 5.881680429586922e-11, + "loss": 0.0546, + "step": 34670 + }, + { + "epoch": 2.9962417383040303, + "grad_norm": 0.6852860443468494, + "learning_rate": 3.8370160315159834e-11, + "loss": 0.0563, + "step": 34680 + }, + { + "epoch": 2.99710570651, + "grad_norm": 0.6575050306681262, + "learning_rate": 2.2273846509990314e-11, + "loss": 0.0579, + "step": 34690 + }, + { + "epoch": 2.9979696747159705, + "grad_norm": 0.6579715634072224, + "learning_rate": 1.052787688637924e-11, + "loss": 0.0549, + "step": 34700 + }, + { + "epoch": 2.9988336429219404, + "grad_norm": 0.7080004725681195, + "learning_rate": 3.132261663929548e-12, + "loss": 0.06, + "step": 34710 + }, + { + "epoch": 2.9996976111279103, + "grad_norm": 0.6662699930555676, + "learning_rate": 8.700727749388904e-14, + "loss": 0.058, + "step": 34720 + }, + { + "epoch": 2.9998704047691045, + "step": 34722, + "total_flos": 7939707523891200.0, + "train_loss": 0.11055064558756254, + "train_runtime": 74830.8535, + "train_samples_per_second": 59.395, + "train_steps_per_second": 0.464 + } + ], + "logging_steps": 10, + "max_steps": 34722, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 50000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7939707523891200.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}