{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.612238650626014, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000306119325313007, "grad_norm": 13.722074508666992, "learning_rate": 2.8571428571428575e-07, "loss": 1.4533, "step": 10 }, { "epoch": 0.000612238650626014, "grad_norm": 27.087980270385742, "learning_rate": 5.714285714285715e-07, "loss": 1.3747, "step": 20 }, { "epoch": 0.0009183579759390211, "grad_norm": 6.083175182342529, "learning_rate": 8.571428571428572e-07, "loss": 1.067, "step": 30 }, { "epoch": 0.001224477301252028, "grad_norm": 13.301521301269531, "learning_rate": 1.142857142857143e-06, "loss": 0.8574, "step": 40 }, { "epoch": 0.001530596626565035, "grad_norm": 30.148033142089844, "learning_rate": 1.4285714285714286e-06, "loss": 1.0568, "step": 50 }, { "epoch": 0.0018367159518780421, "grad_norm": 12.708663940429688, "learning_rate": 1.7142857142857145e-06, "loss": 1.1623, "step": 60 }, { "epoch": 0.0021428352771910492, "grad_norm": 19.8212947845459, "learning_rate": 2.0000000000000003e-06, "loss": 1.0511, "step": 70 }, { "epoch": 0.002448954602504056, "grad_norm": 20.871768951416016, "learning_rate": 2.285714285714286e-06, "loss": 1.0355, "step": 80 }, { "epoch": 0.002755073927817063, "grad_norm": 13.169136047363281, "learning_rate": 2.5714285714285716e-06, "loss": 0.8989, "step": 90 }, { "epoch": 0.00306119325313007, "grad_norm": 11.613677024841309, "learning_rate": 2.8571428571428573e-06, "loss": 1.0522, "step": 100 }, { "epoch": 0.003367312578443077, "grad_norm": 17.065814971923828, "learning_rate": 3.1428571428571433e-06, "loss": 0.8608, "step": 110 }, { "epoch": 0.0036734319037560843, "grad_norm": 10.867985725402832, "learning_rate": 3.428571428571429e-06, "loss": 0.7922, "step": 120 }, { "epoch": 0.003979551229069091, "grad_norm": 8.49424934387207, "learning_rate": 3.7142857142857146e-06, "loss": 0.65, "step": 130 }, { "epoch": 0.0042856705543820985, "grad_norm": 5.935633659362793, "learning_rate": 4.000000000000001e-06, "loss": 0.4856, "step": 140 }, { "epoch": 0.004591789879695105, "grad_norm": 23.074012756347656, "learning_rate": 4.285714285714286e-06, "loss": 0.5999, "step": 150 }, { "epoch": 0.004897909205008112, "grad_norm": 10.538525581359863, "learning_rate": 4.571428571428572e-06, "loss": 0.6087, "step": 160 }, { "epoch": 0.005204028530321119, "grad_norm": 5.804176330566406, "learning_rate": 4.857142857142858e-06, "loss": 0.3727, "step": 170 }, { "epoch": 0.005510147855634126, "grad_norm": 14.265785217285156, "learning_rate": 5.142857142857143e-06, "loss": 0.2627, "step": 180 }, { "epoch": 0.005816267180947133, "grad_norm": 9.601630210876465, "learning_rate": 5.428571428571429e-06, "loss": 0.59, "step": 190 }, { "epoch": 0.00612238650626014, "grad_norm": 5.146839618682861, "learning_rate": 5.7142857142857145e-06, "loss": 0.456, "step": 200 }, { "epoch": 0.006428505831573147, "grad_norm": 17.136159896850586, "learning_rate": 6e-06, "loss": 0.3948, "step": 210 }, { "epoch": 0.006734625156886154, "grad_norm": 9.188369750976562, "learning_rate": 6.285714285714287e-06, "loss": 0.2692, "step": 220 }, { "epoch": 0.0070407444821991615, "grad_norm": 7.316371440887451, "learning_rate": 6.5714285714285714e-06, "loss": 0.1887, "step": 230 }, { "epoch": 0.007346863807512169, "grad_norm": 10.565173149108887, "learning_rate": 6.857142857142858e-06, "loss": 0.3618, "step": 240 }, { "epoch": 0.007652983132825175, "grad_norm": 15.50622272491455, "learning_rate": 7.142857142857143e-06, "loss": 0.3439, "step": 250 }, { "epoch": 0.007959102458138183, "grad_norm": 3.9203195571899414, "learning_rate": 7.428571428571429e-06, "loss": 0.2, "step": 260 }, { "epoch": 0.008265221783451189, "grad_norm": 4.713544845581055, "learning_rate": 7.714285714285714e-06, "loss": 0.2648, "step": 270 }, { "epoch": 0.008571341108764197, "grad_norm": 5.9793806076049805, "learning_rate": 8.000000000000001e-06, "loss": 0.2204, "step": 280 }, { "epoch": 0.008877460434077203, "grad_norm": 3.0459988117218018, "learning_rate": 8.285714285714285e-06, "loss": 0.2312, "step": 290 }, { "epoch": 0.00918357975939021, "grad_norm": 4.787301540374756, "learning_rate": 8.571428571428573e-06, "loss": 0.227, "step": 300 }, { "epoch": 0.009489699084703217, "grad_norm": 4.622166156768799, "learning_rate": 8.857142857142857e-06, "loss": 0.2628, "step": 310 }, { "epoch": 0.009795818410016224, "grad_norm": 3.9778454303741455, "learning_rate": 9.142857142857144e-06, "loss": 0.241, "step": 320 }, { "epoch": 0.010101937735329232, "grad_norm": 2.5914523601531982, "learning_rate": 9.42857142857143e-06, "loss": 0.2132, "step": 330 }, { "epoch": 0.010408057060642238, "grad_norm": 4.6032395362854, "learning_rate": 9.714285714285715e-06, "loss": 0.1658, "step": 340 }, { "epoch": 0.010714176385955246, "grad_norm": 7.707336902618408, "learning_rate": 1e-05, "loss": 0.1782, "step": 350 }, { "epoch": 0.011020295711268252, "grad_norm": 8.548439979553223, "learning_rate": 1.0285714285714286e-05, "loss": 0.1972, "step": 360 }, { "epoch": 0.01132641503658126, "grad_norm": 4.211006164550781, "learning_rate": 1.0571428571428572e-05, "loss": 0.2028, "step": 370 }, { "epoch": 0.011632534361894266, "grad_norm": 3.3881568908691406, "learning_rate": 1.0857142857142858e-05, "loss": 0.1917, "step": 380 }, { "epoch": 0.011938653687207274, "grad_norm": 10.200140953063965, "learning_rate": 1.1142857142857143e-05, "loss": 0.198, "step": 390 }, { "epoch": 0.01224477301252028, "grad_norm": 9.116769790649414, "learning_rate": 1.1428571428571429e-05, "loss": 0.1893, "step": 400 }, { "epoch": 0.012550892337833287, "grad_norm": 11.12707233428955, "learning_rate": 1.1714285714285715e-05, "loss": 0.1366, "step": 410 }, { "epoch": 0.012857011663146295, "grad_norm": 7.8160552978515625, "learning_rate": 1.2e-05, "loss": 0.1946, "step": 420 }, { "epoch": 0.0131631309884593, "grad_norm": 3.9637463092803955, "learning_rate": 1.2285714285714286e-05, "loss": 0.2068, "step": 430 }, { "epoch": 0.013469250313772309, "grad_norm": 2.8966548442840576, "learning_rate": 1.2571428571428573e-05, "loss": 0.1722, "step": 440 }, { "epoch": 0.013775369639085315, "grad_norm": 3.249385118484497, "learning_rate": 1.2857142857142857e-05, "loss": 0.2103, "step": 450 }, { "epoch": 0.014081488964398323, "grad_norm": 7.362815856933594, "learning_rate": 1.3142857142857143e-05, "loss": 0.2264, "step": 460 }, { "epoch": 0.01438760828971133, "grad_norm": 1.4874478578567505, "learning_rate": 1.3428571428571429e-05, "loss": 0.1375, "step": 470 }, { "epoch": 0.014693727615024337, "grad_norm": 6.0792155265808105, "learning_rate": 1.3714285714285716e-05, "loss": 0.2835, "step": 480 }, { "epoch": 0.014999846940337343, "grad_norm": 10.418745994567871, "learning_rate": 1.4000000000000001e-05, "loss": 0.1312, "step": 490 }, { "epoch": 0.01530596626565035, "grad_norm": 1.7920645475387573, "learning_rate": 1.4285714285714285e-05, "loss": 0.1201, "step": 500 }, { "epoch": 0.015612085590963358, "grad_norm": 5.709236145019531, "learning_rate": 1.4571428571428573e-05, "loss": 0.2056, "step": 510 }, { "epoch": 0.015918204916276366, "grad_norm": 2.8269858360290527, "learning_rate": 1.4857142857142858e-05, "loss": 0.2028, "step": 520 }, { "epoch": 0.016224324241589372, "grad_norm": 3.15082049369812, "learning_rate": 1.5142857142857144e-05, "loss": 0.1399, "step": 530 }, { "epoch": 0.016530443566902378, "grad_norm": 3.621330738067627, "learning_rate": 1.5428571428571428e-05, "loss": 0.1816, "step": 540 }, { "epoch": 0.016836562892215384, "grad_norm": 4.417708873748779, "learning_rate": 1.5714285714285715e-05, "loss": 0.1976, "step": 550 }, { "epoch": 0.017142682217528394, "grad_norm": 4.791176795959473, "learning_rate": 1.6000000000000003e-05, "loss": 0.1844, "step": 560 }, { "epoch": 0.0174488015428414, "grad_norm": 3.744509220123291, "learning_rate": 1.6285714285714287e-05, "loss": 0.1263, "step": 570 }, { "epoch": 0.017754920868154406, "grad_norm": 2.317128896713257, "learning_rate": 1.657142857142857e-05, "loss": 0.2054, "step": 580 }, { "epoch": 0.018061040193467413, "grad_norm": 3.4121298789978027, "learning_rate": 1.6857142857142858e-05, "loss": 0.1825, "step": 590 }, { "epoch": 0.01836715951878042, "grad_norm": 1.3255969285964966, "learning_rate": 1.7142857142857145e-05, "loss": 0.1482, "step": 600 }, { "epoch": 0.01867327884409343, "grad_norm": 3.581892490386963, "learning_rate": 1.742857142857143e-05, "loss": 0.1219, "step": 610 }, { "epoch": 0.018979398169406435, "grad_norm": 4.739587783813477, "learning_rate": 1.7714285714285713e-05, "loss": 0.2194, "step": 620 }, { "epoch": 0.01928551749471944, "grad_norm": 5.951377868652344, "learning_rate": 1.8e-05, "loss": 0.1599, "step": 630 }, { "epoch": 0.019591636820032447, "grad_norm": 1.153567910194397, "learning_rate": 1.8285714285714288e-05, "loss": 0.118, "step": 640 }, { "epoch": 0.019897756145345457, "grad_norm": 2.534512758255005, "learning_rate": 1.8571428571428572e-05, "loss": 0.1901, "step": 650 }, { "epoch": 0.020203875470658463, "grad_norm": 3.742410659790039, "learning_rate": 1.885714285714286e-05, "loss": 0.2614, "step": 660 }, { "epoch": 0.02050999479597147, "grad_norm": 7.217038154602051, "learning_rate": 1.9142857142857143e-05, "loss": 0.1366, "step": 670 }, { "epoch": 0.020816114121284476, "grad_norm": 5.368847846984863, "learning_rate": 1.942857142857143e-05, "loss": 0.1112, "step": 680 }, { "epoch": 0.021122233446597485, "grad_norm": 2.0281848907470703, "learning_rate": 1.9714285714285714e-05, "loss": 0.0877, "step": 690 }, { "epoch": 0.02142835277191049, "grad_norm": 4.826754570007324, "learning_rate": 2e-05, "loss": 0.1343, "step": 700 }, { "epoch": 0.021734472097223498, "grad_norm": 5.433274745941162, "learning_rate": 2.0285714285714286e-05, "loss": 0.1739, "step": 710 }, { "epoch": 0.022040591422536504, "grad_norm": 8.426406860351562, "learning_rate": 2.0571428571428573e-05, "loss": 0.1888, "step": 720 }, { "epoch": 0.02234671074784951, "grad_norm": 1.7104073762893677, "learning_rate": 2.0857142857142857e-05, "loss": 0.1434, "step": 730 }, { "epoch": 0.02265283007316252, "grad_norm": 7.001101970672607, "learning_rate": 2.1142857142857144e-05, "loss": 0.2156, "step": 740 }, { "epoch": 0.022958949398475526, "grad_norm": 5.867307662963867, "learning_rate": 2.1428571428571428e-05, "loss": 0.1839, "step": 750 }, { "epoch": 0.023265068723788532, "grad_norm": 2.6224617958068848, "learning_rate": 2.1714285714285715e-05, "loss": 0.1531, "step": 760 }, { "epoch": 0.02357118804910154, "grad_norm": 3.6148481369018555, "learning_rate": 2.2000000000000003e-05, "loss": 0.1572, "step": 770 }, { "epoch": 0.02387730737441455, "grad_norm": 2.508448362350464, "learning_rate": 2.2285714285714287e-05, "loss": 0.1942, "step": 780 }, { "epoch": 0.024183426699727555, "grad_norm": 1.432924747467041, "learning_rate": 2.257142857142857e-05, "loss": 0.1534, "step": 790 }, { "epoch": 0.02448954602504056, "grad_norm": 3.843273401260376, "learning_rate": 2.2857142857142858e-05, "loss": 0.1226, "step": 800 }, { "epoch": 0.024795665350353567, "grad_norm": 1.1433719396591187, "learning_rate": 2.3142857142857145e-05, "loss": 0.2329, "step": 810 }, { "epoch": 0.025101784675666573, "grad_norm": 6.199098587036133, "learning_rate": 2.342857142857143e-05, "loss": 0.1219, "step": 820 }, { "epoch": 0.025407904000979583, "grad_norm": 1.8114993572235107, "learning_rate": 2.3714285714285717e-05, "loss": 0.0981, "step": 830 }, { "epoch": 0.02571402332629259, "grad_norm": 1.9623548984527588, "learning_rate": 2.4e-05, "loss": 0.1194, "step": 840 }, { "epoch": 0.026020142651605595, "grad_norm": 4.74913215637207, "learning_rate": 2.4285714285714288e-05, "loss": 0.2156, "step": 850 }, { "epoch": 0.0263262619769186, "grad_norm": 5.955575942993164, "learning_rate": 2.4571428571428572e-05, "loss": 0.1523, "step": 860 }, { "epoch": 0.02663238130223161, "grad_norm": 2.2301557064056396, "learning_rate": 2.485714285714286e-05, "loss": 0.1536, "step": 870 }, { "epoch": 0.026938500627544618, "grad_norm": 2.7021446228027344, "learning_rate": 2.5142857142857147e-05, "loss": 0.0872, "step": 880 }, { "epoch": 0.027244619952857624, "grad_norm": 3.3416972160339355, "learning_rate": 2.542857142857143e-05, "loss": 0.1221, "step": 890 }, { "epoch": 0.02755073927817063, "grad_norm": 3.8736658096313477, "learning_rate": 2.5714285714285714e-05, "loss": 0.124, "step": 900 }, { "epoch": 0.027856858603483636, "grad_norm": 2.715658187866211, "learning_rate": 2.6000000000000002e-05, "loss": 0.123, "step": 910 }, { "epoch": 0.028162977928796646, "grad_norm": 3.0920114517211914, "learning_rate": 2.6285714285714286e-05, "loss": 0.1317, "step": 920 }, { "epoch": 0.028469097254109652, "grad_norm": 4.345089912414551, "learning_rate": 2.6571428571428576e-05, "loss": 0.2368, "step": 930 }, { "epoch": 0.02877521657942266, "grad_norm": 2.1359341144561768, "learning_rate": 2.6857142857142857e-05, "loss": 0.1524, "step": 940 }, { "epoch": 0.029081335904735665, "grad_norm": 3.495286703109741, "learning_rate": 2.714285714285714e-05, "loss": 0.1655, "step": 950 }, { "epoch": 0.029387455230048674, "grad_norm": 1.3512831926345825, "learning_rate": 2.742857142857143e-05, "loss": 0.1076, "step": 960 }, { "epoch": 0.02969357455536168, "grad_norm": 4.360333442687988, "learning_rate": 2.7714285714285716e-05, "loss": 0.1655, "step": 970 }, { "epoch": 0.029999693880674687, "grad_norm": 7.860919952392578, "learning_rate": 2.8000000000000003e-05, "loss": 0.1312, "step": 980 }, { "epoch": 0.030305813205987693, "grad_norm": 2.334157943725586, "learning_rate": 2.8285714285714287e-05, "loss": 0.1337, "step": 990 }, { "epoch": 0.0306119325313007, "grad_norm": 2.4221463203430176, "learning_rate": 2.857142857142857e-05, "loss": 0.1276, "step": 1000 }, { "epoch": 0.03091805185661371, "grad_norm": 4.381668567657471, "learning_rate": 2.885714285714286e-05, "loss": 0.1889, "step": 1010 }, { "epoch": 0.031224171181926715, "grad_norm": 1.5481494665145874, "learning_rate": 2.9142857142857146e-05, "loss": 0.1205, "step": 1020 }, { "epoch": 0.031530290507239725, "grad_norm": 4.992359638214111, "learning_rate": 2.9428571428571426e-05, "loss": 0.1224, "step": 1030 }, { "epoch": 0.03183640983255273, "grad_norm": 1.5445855855941772, "learning_rate": 2.9714285714285717e-05, "loss": 0.1468, "step": 1040 }, { "epoch": 0.03214252915786574, "grad_norm": 4.030512809753418, "learning_rate": 3e-05, "loss": 0.0962, "step": 1050 }, { "epoch": 0.032448648483178744, "grad_norm": 1.1227295398712158, "learning_rate": 3.0285714285714288e-05, "loss": 0.1486, "step": 1060 }, { "epoch": 0.03275476780849175, "grad_norm": 3.488286256790161, "learning_rate": 3.057142857142857e-05, "loss": 0.1924, "step": 1070 }, { "epoch": 0.033060887133804756, "grad_norm": 4.336898326873779, "learning_rate": 3.0857142857142856e-05, "loss": 0.098, "step": 1080 }, { "epoch": 0.03336700645911776, "grad_norm": 4.855691432952881, "learning_rate": 3.114285714285715e-05, "loss": 0.1708, "step": 1090 }, { "epoch": 0.03367312578443077, "grad_norm": 2.7496328353881836, "learning_rate": 3.142857142857143e-05, "loss": 0.0827, "step": 1100 }, { "epoch": 0.033979245109743775, "grad_norm": 2.909482479095459, "learning_rate": 3.1714285714285715e-05, "loss": 0.103, "step": 1110 }, { "epoch": 0.03428536443505679, "grad_norm": 3.327176094055176, "learning_rate": 3.2000000000000005e-05, "loss": 0.1228, "step": 1120 }, { "epoch": 0.034591483760369794, "grad_norm": 2.143681526184082, "learning_rate": 3.228571428571428e-05, "loss": 0.1796, "step": 1130 }, { "epoch": 0.0348976030856828, "grad_norm": 1.7042616605758667, "learning_rate": 3.257142857142857e-05, "loss": 0.104, "step": 1140 }, { "epoch": 0.03520372241099581, "grad_norm": 1.3084579706192017, "learning_rate": 3.285714285714286e-05, "loss": 0.1063, "step": 1150 }, { "epoch": 0.03550984173630881, "grad_norm": 5.282706260681152, "learning_rate": 3.314285714285714e-05, "loss": 0.177, "step": 1160 }, { "epoch": 0.03581596106162182, "grad_norm": 4.358266353607178, "learning_rate": 3.342857142857143e-05, "loss": 0.1023, "step": 1170 }, { "epoch": 0.036122080386934825, "grad_norm": 6.677738189697266, "learning_rate": 3.3714285714285716e-05, "loss": 0.2188, "step": 1180 }, { "epoch": 0.03642819971224783, "grad_norm": 2.549927234649658, "learning_rate": 3.4000000000000007e-05, "loss": 0.1935, "step": 1190 }, { "epoch": 0.03673431903756084, "grad_norm": 5.6659040451049805, "learning_rate": 3.428571428571429e-05, "loss": 0.154, "step": 1200 }, { "epoch": 0.03704043836287385, "grad_norm": 6.016873359680176, "learning_rate": 3.4571428571428574e-05, "loss": 0.1292, "step": 1210 }, { "epoch": 0.03734655768818686, "grad_norm": 3.8458967208862305, "learning_rate": 3.485714285714286e-05, "loss": 0.109, "step": 1220 }, { "epoch": 0.03765267701349986, "grad_norm": 1.7299611568450928, "learning_rate": 3.514285714285714e-05, "loss": 0.2003, "step": 1230 }, { "epoch": 0.03795879633881287, "grad_norm": 1.9548261165618896, "learning_rate": 3.5428571428571426e-05, "loss": 0.095, "step": 1240 }, { "epoch": 0.038264915664125876, "grad_norm": 2.279438018798828, "learning_rate": 3.571428571428572e-05, "loss": 0.115, "step": 1250 }, { "epoch": 0.03857103498943888, "grad_norm": 1.9958924055099487, "learning_rate": 3.6e-05, "loss": 0.1282, "step": 1260 }, { "epoch": 0.03887715431475189, "grad_norm": 5.627369403839111, "learning_rate": 3.628571428571429e-05, "loss": 0.1469, "step": 1270 }, { "epoch": 0.039183273640064895, "grad_norm": 4.270880699157715, "learning_rate": 3.6571428571428576e-05, "loss": 0.1013, "step": 1280 }, { "epoch": 0.03948939296537791, "grad_norm": 2.604970693588257, "learning_rate": 3.685714285714286e-05, "loss": 0.1187, "step": 1290 }, { "epoch": 0.039795512290690914, "grad_norm": 5.164241313934326, "learning_rate": 3.7142857142857143e-05, "loss": 0.1487, "step": 1300 }, { "epoch": 0.04010163161600392, "grad_norm": 2.184382677078247, "learning_rate": 3.742857142857143e-05, "loss": 0.1258, "step": 1310 }, { "epoch": 0.040407750941316926, "grad_norm": 1.9819657802581787, "learning_rate": 3.771428571428572e-05, "loss": 0.12, "step": 1320 }, { "epoch": 0.04071387026662993, "grad_norm": 1.5794682502746582, "learning_rate": 3.8e-05, "loss": 0.0808, "step": 1330 }, { "epoch": 0.04101998959194294, "grad_norm": 1.9338228702545166, "learning_rate": 3.8285714285714286e-05, "loss": 0.0493, "step": 1340 }, { "epoch": 0.041326108917255945, "grad_norm": 1.931367039680481, "learning_rate": 3.857142857142858e-05, "loss": 0.0993, "step": 1350 }, { "epoch": 0.04163222824256895, "grad_norm": 1.4986271858215332, "learning_rate": 3.885714285714286e-05, "loss": 0.0678, "step": 1360 }, { "epoch": 0.04193834756788196, "grad_norm": 2.5004217624664307, "learning_rate": 3.9142857142857145e-05, "loss": 0.1655, "step": 1370 }, { "epoch": 0.04224446689319497, "grad_norm": 2.6286938190460205, "learning_rate": 3.942857142857143e-05, "loss": 0.0781, "step": 1380 }, { "epoch": 0.04255058621850798, "grad_norm": 0.8432460427284241, "learning_rate": 3.971428571428571e-05, "loss": 0.1173, "step": 1390 }, { "epoch": 0.04285670554382098, "grad_norm": 3.727076768875122, "learning_rate": 4e-05, "loss": 0.1653, "step": 1400 }, { "epoch": 0.04316282486913399, "grad_norm": 3.1214818954467773, "learning_rate": 4.028571428571429e-05, "loss": 0.0992, "step": 1410 }, { "epoch": 0.043468944194446996, "grad_norm": 2.916867256164551, "learning_rate": 4.057142857142857e-05, "loss": 0.1218, "step": 1420 }, { "epoch": 0.04377506351976, "grad_norm": 1.5214743614196777, "learning_rate": 4.085714285714286e-05, "loss": 0.0418, "step": 1430 }, { "epoch": 0.04408118284507301, "grad_norm": 1.2602075338363647, "learning_rate": 4.1142857142857146e-05, "loss": 0.1018, "step": 1440 }, { "epoch": 0.044387302170386014, "grad_norm": 3.2583959102630615, "learning_rate": 4.1428571428571437e-05, "loss": 0.0729, "step": 1450 }, { "epoch": 0.04469342149569902, "grad_norm": 5.558638095855713, "learning_rate": 4.1714285714285714e-05, "loss": 0.0991, "step": 1460 }, { "epoch": 0.044999540821012034, "grad_norm": 1.959681510925293, "learning_rate": 4.2e-05, "loss": 0.0816, "step": 1470 }, { "epoch": 0.04530566014632504, "grad_norm": 3.9041380882263184, "learning_rate": 4.228571428571429e-05, "loss": 0.2325, "step": 1480 }, { "epoch": 0.045611779471638046, "grad_norm": 4.564482688903809, "learning_rate": 4.257142857142857e-05, "loss": 0.1348, "step": 1490 }, { "epoch": 0.04591789879695105, "grad_norm": 1.7417715787887573, "learning_rate": 4.2857142857142856e-05, "loss": 0.085, "step": 1500 }, { "epoch": 0.04622401812226406, "grad_norm": 1.2307840585708618, "learning_rate": 4.314285714285715e-05, "loss": 0.0686, "step": 1510 }, { "epoch": 0.046530137447577065, "grad_norm": 3.7375292778015137, "learning_rate": 4.342857142857143e-05, "loss": 0.1953, "step": 1520 }, { "epoch": 0.04683625677289007, "grad_norm": 1.9863802194595337, "learning_rate": 4.371428571428572e-05, "loss": 0.1121, "step": 1530 }, { "epoch": 0.04714237609820308, "grad_norm": 3.2265498638153076, "learning_rate": 4.4000000000000006e-05, "loss": 0.0871, "step": 1540 }, { "epoch": 0.047448495423516084, "grad_norm": 2.602877616882324, "learning_rate": 4.428571428571428e-05, "loss": 0.1327, "step": 1550 }, { "epoch": 0.0477546147488291, "grad_norm": 2.243049383163452, "learning_rate": 4.4571428571428574e-05, "loss": 0.0778, "step": 1560 }, { "epoch": 0.0480607340741421, "grad_norm": 1.786422848701477, "learning_rate": 4.485714285714286e-05, "loss": 0.0801, "step": 1570 }, { "epoch": 0.04836685339945511, "grad_norm": 3.4366822242736816, "learning_rate": 4.514285714285714e-05, "loss": 0.1024, "step": 1580 }, { "epoch": 0.048672972724768115, "grad_norm": 2.4498836994171143, "learning_rate": 4.542857142857143e-05, "loss": 0.1145, "step": 1590 }, { "epoch": 0.04897909205008112, "grad_norm": 1.7132132053375244, "learning_rate": 4.5714285714285716e-05, "loss": 0.1792, "step": 1600 }, { "epoch": 0.04928521137539413, "grad_norm": 2.7946715354919434, "learning_rate": 4.600000000000001e-05, "loss": 0.1816, "step": 1610 }, { "epoch": 0.049591330700707134, "grad_norm": 2.2414166927337646, "learning_rate": 4.628571428571429e-05, "loss": 0.0848, "step": 1620 }, { "epoch": 0.04989745002602014, "grad_norm": 0.8873618841171265, "learning_rate": 4.6571428571428575e-05, "loss": 0.0721, "step": 1630 }, { "epoch": 0.05020356935133315, "grad_norm": 3.0563650131225586, "learning_rate": 4.685714285714286e-05, "loss": 0.0928, "step": 1640 }, { "epoch": 0.05050968867664616, "grad_norm": 2.1367316246032715, "learning_rate": 4.714285714285714e-05, "loss": 0.0992, "step": 1650 }, { "epoch": 0.050815808001959166, "grad_norm": 2.652095079421997, "learning_rate": 4.742857142857143e-05, "loss": 0.1067, "step": 1660 }, { "epoch": 0.05112192732727217, "grad_norm": 3.5881683826446533, "learning_rate": 4.771428571428572e-05, "loss": 0.071, "step": 1670 }, { "epoch": 0.05142804665258518, "grad_norm": 2.4605679512023926, "learning_rate": 4.8e-05, "loss": 0.1109, "step": 1680 }, { "epoch": 0.051734165977898185, "grad_norm": 2.4569780826568604, "learning_rate": 4.828571428571429e-05, "loss": 0.0643, "step": 1690 }, { "epoch": 0.05204028530321119, "grad_norm": 2.005300521850586, "learning_rate": 4.8571428571428576e-05, "loss": 0.1383, "step": 1700 }, { "epoch": 0.0523464046285242, "grad_norm": 5.492910385131836, "learning_rate": 4.885714285714286e-05, "loss": 0.1533, "step": 1710 }, { "epoch": 0.0526525239538372, "grad_norm": 3.194068193435669, "learning_rate": 4.9142857142857144e-05, "loss": 0.1215, "step": 1720 }, { "epoch": 0.05295864327915021, "grad_norm": 4.103147506713867, "learning_rate": 4.942857142857143e-05, "loss": 0.0901, "step": 1730 }, { "epoch": 0.05326476260446322, "grad_norm": 2.9048914909362793, "learning_rate": 4.971428571428572e-05, "loss": 0.0869, "step": 1740 }, { "epoch": 0.05357088192977623, "grad_norm": 2.6572113037109375, "learning_rate": 5e-05, "loss": 0.1845, "step": 1750 }, { "epoch": 0.053877001255089235, "grad_norm": 1.5618382692337036, "learning_rate": 5.028571428571429e-05, "loss": 0.0976, "step": 1760 }, { "epoch": 0.05418312058040224, "grad_norm": 2.2336771488189697, "learning_rate": 5.057142857142857e-05, "loss": 0.1073, "step": 1770 }, { "epoch": 0.05448923990571525, "grad_norm": 3.6902849674224854, "learning_rate": 5.085714285714286e-05, "loss": 0.0822, "step": 1780 }, { "epoch": 0.054795359231028254, "grad_norm": 2.184623956680298, "learning_rate": 5.1142857142857145e-05, "loss": 0.1614, "step": 1790 }, { "epoch": 0.05510147855634126, "grad_norm": 5.510834217071533, "learning_rate": 5.142857142857143e-05, "loss": 0.1259, "step": 1800 }, { "epoch": 0.055407597881654266, "grad_norm": 1.6870708465576172, "learning_rate": 5.171428571428571e-05, "loss": 0.1179, "step": 1810 }, { "epoch": 0.05571371720696727, "grad_norm": 1.5169191360473633, "learning_rate": 5.2000000000000004e-05, "loss": 0.1534, "step": 1820 }, { "epoch": 0.056019836532280286, "grad_norm": 1.8488301038742065, "learning_rate": 5.2285714285714294e-05, "loss": 0.0848, "step": 1830 }, { "epoch": 0.05632595585759329, "grad_norm": 2.0648655891418457, "learning_rate": 5.257142857142857e-05, "loss": 0.0996, "step": 1840 }, { "epoch": 0.0566320751829063, "grad_norm": 2.3106281757354736, "learning_rate": 5.285714285714286e-05, "loss": 0.0742, "step": 1850 }, { "epoch": 0.056938194508219304, "grad_norm": 3.0345892906188965, "learning_rate": 5.314285714285715e-05, "loss": 0.105, "step": 1860 }, { "epoch": 0.05724431383353231, "grad_norm": 1.8260643482208252, "learning_rate": 5.342857142857143e-05, "loss": 0.0598, "step": 1870 }, { "epoch": 0.05755043315884532, "grad_norm": 1.47543466091156, "learning_rate": 5.3714285714285714e-05, "loss": 0.1049, "step": 1880 }, { "epoch": 0.05785655248415832, "grad_norm": 2.3532352447509766, "learning_rate": 5.4000000000000005e-05, "loss": 0.1433, "step": 1890 }, { "epoch": 0.05816267180947133, "grad_norm": 1.8412340879440308, "learning_rate": 5.428571428571428e-05, "loss": 0.0643, "step": 1900 }, { "epoch": 0.058468791134784336, "grad_norm": 1.363340973854065, "learning_rate": 5.457142857142857e-05, "loss": 0.1552, "step": 1910 }, { "epoch": 0.05877491046009735, "grad_norm": 3.8581950664520264, "learning_rate": 5.485714285714286e-05, "loss": 0.1457, "step": 1920 }, { "epoch": 0.059081029785410355, "grad_norm": 2.40771222114563, "learning_rate": 5.514285714285714e-05, "loss": 0.0891, "step": 1930 }, { "epoch": 0.05938714911072336, "grad_norm": 0.9919692277908325, "learning_rate": 5.542857142857143e-05, "loss": 0.0518, "step": 1940 }, { "epoch": 0.05969326843603637, "grad_norm": 2.473825454711914, "learning_rate": 5.571428571428572e-05, "loss": 0.0687, "step": 1950 }, { "epoch": 0.059999387761349374, "grad_norm": 6.332611560821533, "learning_rate": 5.6000000000000006e-05, "loss": 0.0783, "step": 1960 }, { "epoch": 0.06030550708666238, "grad_norm": 1.9878852367401123, "learning_rate": 5.628571428571428e-05, "loss": 0.0454, "step": 1970 }, { "epoch": 0.060611626411975386, "grad_norm": 0.9259503483772278, "learning_rate": 5.6571428571428574e-05, "loss": 0.0792, "step": 1980 }, { "epoch": 0.06091774573728839, "grad_norm": 3.198192834854126, "learning_rate": 5.6857142857142865e-05, "loss": 0.1025, "step": 1990 }, { "epoch": 0.0612238650626014, "grad_norm": 1.172147512435913, "learning_rate": 5.714285714285714e-05, "loss": 0.098, "step": 2000 }, { "epoch": 0.06152998438791441, "grad_norm": 0.8683651685714722, "learning_rate": 5.742857142857143e-05, "loss": 0.0904, "step": 2010 }, { "epoch": 0.06183610371322742, "grad_norm": 3.7542357444763184, "learning_rate": 5.771428571428572e-05, "loss": 0.1322, "step": 2020 }, { "epoch": 0.062142223038540424, "grad_norm": 1.425028681755066, "learning_rate": 5.8e-05, "loss": 0.0581, "step": 2030 }, { "epoch": 0.06244834236385343, "grad_norm": 1.5621997117996216, "learning_rate": 5.828571428571429e-05, "loss": 0.0422, "step": 2040 }, { "epoch": 0.06275446168916644, "grad_norm": 1.933261752128601, "learning_rate": 5.8571428571428575e-05, "loss": 0.0595, "step": 2050 }, { "epoch": 0.06306058101447945, "grad_norm": 4.743907928466797, "learning_rate": 5.885714285714285e-05, "loss": 0.098, "step": 2060 }, { "epoch": 0.06336670033979246, "grad_norm": 2.0587880611419678, "learning_rate": 5.914285714285714e-05, "loss": 0.0681, "step": 2070 }, { "epoch": 0.06367281966510546, "grad_norm": 5.733152389526367, "learning_rate": 5.9428571428571434e-05, "loss": 0.0782, "step": 2080 }, { "epoch": 0.06397893899041847, "grad_norm": 3.3169870376586914, "learning_rate": 5.9714285714285724e-05, "loss": 0.1059, "step": 2090 }, { "epoch": 0.06428505831573147, "grad_norm": 1.4932074546813965, "learning_rate": 6e-05, "loss": 0.0366, "step": 2100 }, { "epoch": 0.06459117764104448, "grad_norm": 6.483890056610107, "learning_rate": 6.028571428571429e-05, "loss": 0.1238, "step": 2110 }, { "epoch": 0.06489729696635749, "grad_norm": 0.9433429837226868, "learning_rate": 6.0571428571428576e-05, "loss": 0.0362, "step": 2120 }, { "epoch": 0.0652034162916705, "grad_norm": 2.0938684940338135, "learning_rate": 6.085714285714286e-05, "loss": 0.161, "step": 2130 }, { "epoch": 0.0655095356169835, "grad_norm": 2.502352476119995, "learning_rate": 6.114285714285714e-05, "loss": 0.0841, "step": 2140 }, { "epoch": 0.0658156549422965, "grad_norm": 2.384891986846924, "learning_rate": 6.142857142857143e-05, "loss": 0.0772, "step": 2150 }, { "epoch": 0.06612177426760951, "grad_norm": 4.594301223754883, "learning_rate": 6.171428571428571e-05, "loss": 0.1006, "step": 2160 }, { "epoch": 0.06642789359292252, "grad_norm": 1.1509578227996826, "learning_rate": 6.2e-05, "loss": 0.0408, "step": 2170 }, { "epoch": 0.06673401291823552, "grad_norm": 1.8698395490646362, "learning_rate": 6.22857142857143e-05, "loss": 0.1595, "step": 2180 }, { "epoch": 0.06704013224354853, "grad_norm": 2.4274160861968994, "learning_rate": 6.257142857142857e-05, "loss": 0.0739, "step": 2190 }, { "epoch": 0.06734625156886154, "grad_norm": 1.3545074462890625, "learning_rate": 6.285714285714286e-05, "loss": 0.0665, "step": 2200 }, { "epoch": 0.06765237089417454, "grad_norm": 1.7314223051071167, "learning_rate": 6.314285714285715e-05, "loss": 0.0643, "step": 2210 }, { "epoch": 0.06795849021948755, "grad_norm": 1.0276238918304443, "learning_rate": 6.342857142857143e-05, "loss": 0.0827, "step": 2220 }, { "epoch": 0.06826460954480057, "grad_norm": 2.0185863971710205, "learning_rate": 6.371428571428572e-05, "loss": 0.0945, "step": 2230 }, { "epoch": 0.06857072887011358, "grad_norm": 2.2189199924468994, "learning_rate": 6.400000000000001e-05, "loss": 0.1174, "step": 2240 }, { "epoch": 0.06887684819542658, "grad_norm": 1.166427493095398, "learning_rate": 6.428571428571429e-05, "loss": 0.0713, "step": 2250 }, { "epoch": 0.06918296752073959, "grad_norm": 2.502034902572632, "learning_rate": 6.457142857142856e-05, "loss": 0.0769, "step": 2260 }, { "epoch": 0.0694890868460526, "grad_norm": 0.7158637642860413, "learning_rate": 6.485714285714286e-05, "loss": 0.0722, "step": 2270 }, { "epoch": 0.0697952061713656, "grad_norm": 5.041638374328613, "learning_rate": 6.514285714285715e-05, "loss": 0.137, "step": 2280 }, { "epoch": 0.07010132549667861, "grad_norm": 1.965301752090454, "learning_rate": 6.542857142857142e-05, "loss": 0.0571, "step": 2290 }, { "epoch": 0.07040744482199161, "grad_norm": 9.768935203552246, "learning_rate": 6.571428571428571e-05, "loss": 0.1883, "step": 2300 }, { "epoch": 0.07071356414730462, "grad_norm": 3.2323668003082275, "learning_rate": 6.6e-05, "loss": 0.0782, "step": 2310 }, { "epoch": 0.07101968347261763, "grad_norm": 2.167698383331299, "learning_rate": 6.628571428571428e-05, "loss": 0.1177, "step": 2320 }, { "epoch": 0.07132580279793063, "grad_norm": 1.028816819190979, "learning_rate": 6.657142857142857e-05, "loss": 0.1432, "step": 2330 }, { "epoch": 0.07163192212324364, "grad_norm": 0.6730745434761047, "learning_rate": 6.685714285714286e-05, "loss": 0.1095, "step": 2340 }, { "epoch": 0.07193804144855664, "grad_norm": 4.013037204742432, "learning_rate": 6.714285714285714e-05, "loss": 0.1386, "step": 2350 }, { "epoch": 0.07224416077386965, "grad_norm": 1.8665307760238647, "learning_rate": 6.742857142857143e-05, "loss": 0.086, "step": 2360 }, { "epoch": 0.07255028009918266, "grad_norm": 3.4453351497650146, "learning_rate": 6.771428571428572e-05, "loss": 0.0688, "step": 2370 }, { "epoch": 0.07285639942449566, "grad_norm": 1.9128085374832153, "learning_rate": 6.800000000000001e-05, "loss": 0.0947, "step": 2380 }, { "epoch": 0.07316251874980867, "grad_norm": 1.3770204782485962, "learning_rate": 6.828571428571429e-05, "loss": 0.1228, "step": 2390 }, { "epoch": 0.07346863807512168, "grad_norm": 1.4273111820220947, "learning_rate": 6.857142857142858e-05, "loss": 0.0917, "step": 2400 }, { "epoch": 0.0737747574004347, "grad_norm": 4.442019462585449, "learning_rate": 6.885714285714286e-05, "loss": 0.1422, "step": 2410 }, { "epoch": 0.0740808767257477, "grad_norm": 0.6953336000442505, "learning_rate": 6.914285714285715e-05, "loss": 0.0969, "step": 2420 }, { "epoch": 0.07438699605106071, "grad_norm": 1.8054078817367554, "learning_rate": 6.942857142857143e-05, "loss": 0.1521, "step": 2430 }, { "epoch": 0.07469311537637371, "grad_norm": 3.340853214263916, "learning_rate": 6.971428571428572e-05, "loss": 0.0785, "step": 2440 }, { "epoch": 0.07499923470168672, "grad_norm": 1.2612420320510864, "learning_rate": 7e-05, "loss": 0.0575, "step": 2450 }, { "epoch": 0.07530535402699973, "grad_norm": 2.610926628112793, "learning_rate": 7.028571428571428e-05, "loss": 0.1114, "step": 2460 }, { "epoch": 0.07561147335231273, "grad_norm": 3.5240325927734375, "learning_rate": 7.057142857142858e-05, "loss": 0.1768, "step": 2470 }, { "epoch": 0.07591759267762574, "grad_norm": 2.0550403594970703, "learning_rate": 7.085714285714285e-05, "loss": 0.1088, "step": 2480 }, { "epoch": 0.07622371200293875, "grad_norm": 2.286262273788452, "learning_rate": 7.114285714285714e-05, "loss": 0.082, "step": 2490 }, { "epoch": 0.07652983132825175, "grad_norm": 2.6834468841552734, "learning_rate": 7.142857142857143e-05, "loss": 0.1284, "step": 2500 }, { "epoch": 0.07683595065356476, "grad_norm": 2.3614041805267334, "learning_rate": 7.171428571428572e-05, "loss": 0.0562, "step": 2510 }, { "epoch": 0.07714206997887776, "grad_norm": 2.7669177055358887, "learning_rate": 7.2e-05, "loss": 0.0908, "step": 2520 }, { "epoch": 0.07744818930419077, "grad_norm": 1.7900639772415161, "learning_rate": 7.228571428571429e-05, "loss": 0.0997, "step": 2530 }, { "epoch": 0.07775430862950378, "grad_norm": 1.9878504276275635, "learning_rate": 7.257142857142858e-05, "loss": 0.0619, "step": 2540 }, { "epoch": 0.07806042795481678, "grad_norm": 1.2821983098983765, "learning_rate": 7.285714285714286e-05, "loss": 0.0583, "step": 2550 }, { "epoch": 0.07836654728012979, "grad_norm": 1.9171333312988281, "learning_rate": 7.314285714285715e-05, "loss": 0.0505, "step": 2560 }, { "epoch": 0.0786726666054428, "grad_norm": 1.4551851749420166, "learning_rate": 7.342857142857144e-05, "loss": 0.0967, "step": 2570 }, { "epoch": 0.07897878593075582, "grad_norm": 0.7417320609092712, "learning_rate": 7.371428571428572e-05, "loss": 0.1311, "step": 2580 }, { "epoch": 0.07928490525606882, "grad_norm": 2.4030654430389404, "learning_rate": 7.4e-05, "loss": 0.078, "step": 2590 }, { "epoch": 0.07959102458138183, "grad_norm": 2.030769109725952, "learning_rate": 7.428571428571429e-05, "loss": 0.0611, "step": 2600 }, { "epoch": 0.07989714390669483, "grad_norm": 0.6729026436805725, "learning_rate": 7.457142857142856e-05, "loss": 0.0913, "step": 2610 }, { "epoch": 0.08020326323200784, "grad_norm": 2.4817936420440674, "learning_rate": 7.485714285714285e-05, "loss": 0.062, "step": 2620 }, { "epoch": 0.08050938255732085, "grad_norm": 2.252736806869507, "learning_rate": 7.514285714285715e-05, "loss": 0.1323, "step": 2630 }, { "epoch": 0.08081550188263385, "grad_norm": 5.476583957672119, "learning_rate": 7.542857142857144e-05, "loss": 0.1221, "step": 2640 }, { "epoch": 0.08112162120794686, "grad_norm": 0.8957196474075317, "learning_rate": 7.571428571428571e-05, "loss": 0.0904, "step": 2650 }, { "epoch": 0.08142774053325987, "grad_norm": 2.2705984115600586, "learning_rate": 7.6e-05, "loss": 0.0897, "step": 2660 }, { "epoch": 0.08173385985857287, "grad_norm": 2.504786729812622, "learning_rate": 7.62857142857143e-05, "loss": 0.0669, "step": 2670 }, { "epoch": 0.08203997918388588, "grad_norm": 2.177070379257202, "learning_rate": 7.657142857142857e-05, "loss": 0.1016, "step": 2680 }, { "epoch": 0.08234609850919888, "grad_norm": 1.5839756727218628, "learning_rate": 7.685714285714286e-05, "loss": 0.0642, "step": 2690 }, { "epoch": 0.08265221783451189, "grad_norm": 4.324403762817383, "learning_rate": 7.714285714285715e-05, "loss": 0.1206, "step": 2700 }, { "epoch": 0.0829583371598249, "grad_norm": 2.2228243350982666, "learning_rate": 7.742857142857143e-05, "loss": 0.0978, "step": 2710 }, { "epoch": 0.0832644564851379, "grad_norm": 4.029900550842285, "learning_rate": 7.771428571428572e-05, "loss": 0.1097, "step": 2720 }, { "epoch": 0.08357057581045091, "grad_norm": 1.3069000244140625, "learning_rate": 7.800000000000001e-05, "loss": 0.0792, "step": 2730 }, { "epoch": 0.08387669513576392, "grad_norm": 1.2967958450317383, "learning_rate": 7.828571428571429e-05, "loss": 0.0891, "step": 2740 }, { "epoch": 0.08418281446107692, "grad_norm": 2.178595542907715, "learning_rate": 7.857142857142858e-05, "loss": 0.0815, "step": 2750 }, { "epoch": 0.08448893378638994, "grad_norm": 4.718527793884277, "learning_rate": 7.885714285714286e-05, "loss": 0.1223, "step": 2760 }, { "epoch": 0.08479505311170295, "grad_norm": 1.634864330291748, "learning_rate": 7.914285714285715e-05, "loss": 0.1232, "step": 2770 }, { "epoch": 0.08510117243701595, "grad_norm": 1.850643277168274, "learning_rate": 7.942857142857143e-05, "loss": 0.0793, "step": 2780 }, { "epoch": 0.08540729176232896, "grad_norm": 2.0098989009857178, "learning_rate": 7.971428571428572e-05, "loss": 0.105, "step": 2790 }, { "epoch": 0.08571341108764197, "grad_norm": 1.6357985734939575, "learning_rate": 8e-05, "loss": 0.0773, "step": 2800 }, { "epoch": 0.08601953041295497, "grad_norm": 0.9889956116676331, "learning_rate": 8.028571428571428e-05, "loss": 0.0872, "step": 2810 }, { "epoch": 0.08632564973826798, "grad_norm": 2.061403274536133, "learning_rate": 8.057142857142857e-05, "loss": 0.0635, "step": 2820 }, { "epoch": 0.08663176906358099, "grad_norm": 1.1295543909072876, "learning_rate": 8.085714285714287e-05, "loss": 0.1002, "step": 2830 }, { "epoch": 0.08693788838889399, "grad_norm": 3.0512170791625977, "learning_rate": 8.114285714285714e-05, "loss": 0.1002, "step": 2840 }, { "epoch": 0.087244007714207, "grad_norm": 0.8353195786476135, "learning_rate": 8.142857142857143e-05, "loss": 0.0582, "step": 2850 }, { "epoch": 0.08755012703952, "grad_norm": 1.792365550994873, "learning_rate": 8.171428571428572e-05, "loss": 0.064, "step": 2860 }, { "epoch": 0.08785624636483301, "grad_norm": 1.4019246101379395, "learning_rate": 8.2e-05, "loss": 0.1134, "step": 2870 }, { "epoch": 0.08816236569014602, "grad_norm": 1.5562981367111206, "learning_rate": 8.228571428571429e-05, "loss": 0.0831, "step": 2880 }, { "epoch": 0.08846848501545902, "grad_norm": 2.722562551498413, "learning_rate": 8.257142857142858e-05, "loss": 0.0723, "step": 2890 }, { "epoch": 0.08877460434077203, "grad_norm": 1.9486205577850342, "learning_rate": 8.285714285714287e-05, "loss": 0.0862, "step": 2900 }, { "epoch": 0.08908072366608503, "grad_norm": 1.3745486736297607, "learning_rate": 8.314285714285715e-05, "loss": 0.084, "step": 2910 }, { "epoch": 0.08938684299139804, "grad_norm": 0.511366605758667, "learning_rate": 8.342857142857143e-05, "loss": 0.105, "step": 2920 }, { "epoch": 0.08969296231671105, "grad_norm": 1.723392128944397, "learning_rate": 8.371428571428572e-05, "loss": 0.0596, "step": 2930 }, { "epoch": 0.08999908164202407, "grad_norm": 2.9779887199401855, "learning_rate": 8.4e-05, "loss": 0.075, "step": 2940 }, { "epoch": 0.09030520096733707, "grad_norm": 1.3583229780197144, "learning_rate": 8.428571428571429e-05, "loss": 0.0594, "step": 2950 }, { "epoch": 0.09061132029265008, "grad_norm": 1.8656994104385376, "learning_rate": 8.457142857142858e-05, "loss": 0.1009, "step": 2960 }, { "epoch": 0.09091743961796309, "grad_norm": 2.7392356395721436, "learning_rate": 8.485714285714285e-05, "loss": 0.0844, "step": 2970 }, { "epoch": 0.09122355894327609, "grad_norm": 2.3893442153930664, "learning_rate": 8.514285714285714e-05, "loss": 0.0779, "step": 2980 }, { "epoch": 0.0915296782685891, "grad_norm": 1.7861168384552002, "learning_rate": 8.542857142857144e-05, "loss": 0.1044, "step": 2990 }, { "epoch": 0.0918357975939021, "grad_norm": 1.8546770811080933, "learning_rate": 8.571428571428571e-05, "loss": 0.0731, "step": 3000 }, { "epoch": 0.09214191691921511, "grad_norm": 2.293027639389038, "learning_rate": 8.6e-05, "loss": 0.0897, "step": 3010 }, { "epoch": 0.09244803624452812, "grad_norm": 2.0151960849761963, "learning_rate": 8.62857142857143e-05, "loss": 0.0507, "step": 3020 }, { "epoch": 0.09275415556984112, "grad_norm": 1.8383091688156128, "learning_rate": 8.657142857142858e-05, "loss": 0.0861, "step": 3030 }, { "epoch": 0.09306027489515413, "grad_norm": 2.255585193634033, "learning_rate": 8.685714285714286e-05, "loss": 0.0706, "step": 3040 }, { "epoch": 0.09336639422046714, "grad_norm": 1.6368615627288818, "learning_rate": 8.714285714285715e-05, "loss": 0.0717, "step": 3050 }, { "epoch": 0.09367251354578014, "grad_norm": 1.908597707748413, "learning_rate": 8.742857142857144e-05, "loss": 0.121, "step": 3060 }, { "epoch": 0.09397863287109315, "grad_norm": 2.81709623336792, "learning_rate": 8.771428571428572e-05, "loss": 0.0979, "step": 3070 }, { "epoch": 0.09428475219640615, "grad_norm": 0.822256326675415, "learning_rate": 8.800000000000001e-05, "loss": 0.1203, "step": 3080 }, { "epoch": 0.09459087152171916, "grad_norm": 3.4918341636657715, "learning_rate": 8.828571428571429e-05, "loss": 0.2041, "step": 3090 }, { "epoch": 0.09489699084703217, "grad_norm": 0.7154914736747742, "learning_rate": 8.857142857142857e-05, "loss": 0.0933, "step": 3100 }, { "epoch": 0.09520311017234517, "grad_norm": 3.8584847450256348, "learning_rate": 8.885714285714286e-05, "loss": 0.0748, "step": 3110 }, { "epoch": 0.0955092294976582, "grad_norm": 2.494386911392212, "learning_rate": 8.914285714285715e-05, "loss": 0.1423, "step": 3120 }, { "epoch": 0.0958153488229712, "grad_norm": 1.453735113143921, "learning_rate": 8.942857142857142e-05, "loss": 0.0475, "step": 3130 }, { "epoch": 0.0961214681482842, "grad_norm": 2.467174530029297, "learning_rate": 8.971428571428571e-05, "loss": 0.1228, "step": 3140 }, { "epoch": 0.09642758747359721, "grad_norm": 4.565613269805908, "learning_rate": 9e-05, "loss": 0.1344, "step": 3150 }, { "epoch": 0.09673370679891022, "grad_norm": 3.7958948612213135, "learning_rate": 9.028571428571428e-05, "loss": 0.1109, "step": 3160 }, { "epoch": 0.09703982612422322, "grad_norm": 2.2657694816589355, "learning_rate": 9.057142857142857e-05, "loss": 0.0908, "step": 3170 }, { "epoch": 0.09734594544953623, "grad_norm": 2.160186529159546, "learning_rate": 9.085714285714286e-05, "loss": 0.1174, "step": 3180 }, { "epoch": 0.09765206477484924, "grad_norm": 1.7853460311889648, "learning_rate": 9.114285714285716e-05, "loss": 0.1367, "step": 3190 }, { "epoch": 0.09795818410016224, "grad_norm": 2.8969027996063232, "learning_rate": 9.142857142857143e-05, "loss": 0.1182, "step": 3200 }, { "epoch": 0.09826430342547525, "grad_norm": 1.6273174285888672, "learning_rate": 9.171428571428572e-05, "loss": 0.0549, "step": 3210 }, { "epoch": 0.09857042275078826, "grad_norm": 0.6795701384544373, "learning_rate": 9.200000000000001e-05, "loss": 0.0938, "step": 3220 }, { "epoch": 0.09887654207610126, "grad_norm": 2.4527533054351807, "learning_rate": 9.228571428571429e-05, "loss": 0.1816, "step": 3230 }, { "epoch": 0.09918266140141427, "grad_norm": 1.9990100860595703, "learning_rate": 9.257142857142858e-05, "loss": 0.1205, "step": 3240 }, { "epoch": 0.09948878072672727, "grad_norm": 3.8492624759674072, "learning_rate": 9.285714285714286e-05, "loss": 0.1458, "step": 3250 }, { "epoch": 0.09979490005204028, "grad_norm": 2.5385615825653076, "learning_rate": 9.314285714285715e-05, "loss": 0.0923, "step": 3260 }, { "epoch": 0.10010101937735329, "grad_norm": 1.4854074716567993, "learning_rate": 9.342857142857143e-05, "loss": 0.0726, "step": 3270 }, { "epoch": 0.1004071387026663, "grad_norm": 3.838244915008545, "learning_rate": 9.371428571428572e-05, "loss": 0.1083, "step": 3280 }, { "epoch": 0.10071325802797931, "grad_norm": 1.3289073705673218, "learning_rate": 9.4e-05, "loss": 0.0734, "step": 3290 }, { "epoch": 0.10101937735329232, "grad_norm": 2.2300755977630615, "learning_rate": 9.428571428571429e-05, "loss": 0.0654, "step": 3300 }, { "epoch": 0.10132549667860533, "grad_norm": 2.1355929374694824, "learning_rate": 9.457142857142858e-05, "loss": 0.0766, "step": 3310 }, { "epoch": 0.10163161600391833, "grad_norm": 2.2083301544189453, "learning_rate": 9.485714285714287e-05, "loss": 0.0983, "step": 3320 }, { "epoch": 0.10193773532923134, "grad_norm": 2.210082769393921, "learning_rate": 9.514285714285714e-05, "loss": 0.0832, "step": 3330 }, { "epoch": 0.10224385465454434, "grad_norm": 3.294492244720459, "learning_rate": 9.542857142857143e-05, "loss": 0.1251, "step": 3340 }, { "epoch": 0.10254997397985735, "grad_norm": 0.8155706524848938, "learning_rate": 9.571428571428573e-05, "loss": 0.125, "step": 3350 }, { "epoch": 0.10285609330517036, "grad_norm": 3.151130437850952, "learning_rate": 9.6e-05, "loss": 0.0927, "step": 3360 }, { "epoch": 0.10316221263048336, "grad_norm": 1.5133957862854004, "learning_rate": 9.628571428571429e-05, "loss": 0.0997, "step": 3370 }, { "epoch": 0.10346833195579637, "grad_norm": 3.0737478733062744, "learning_rate": 9.657142857142858e-05, "loss": 0.0988, "step": 3380 }, { "epoch": 0.10377445128110938, "grad_norm": 1.2834123373031616, "learning_rate": 9.685714285714286e-05, "loss": 0.0644, "step": 3390 }, { "epoch": 0.10408057060642238, "grad_norm": 2.1862120628356934, "learning_rate": 9.714285714285715e-05, "loss": 0.1256, "step": 3400 }, { "epoch": 0.10438668993173539, "grad_norm": 1.5963772535324097, "learning_rate": 9.742857142857143e-05, "loss": 0.071, "step": 3410 }, { "epoch": 0.1046928092570484, "grad_norm": 0.7589594721794128, "learning_rate": 9.771428571428572e-05, "loss": 0.0886, "step": 3420 }, { "epoch": 0.1049989285823614, "grad_norm": 1.8637193441390991, "learning_rate": 9.8e-05, "loss": 0.0967, "step": 3430 }, { "epoch": 0.1053050479076744, "grad_norm": 0.9573327898979187, "learning_rate": 9.828571428571429e-05, "loss": 0.0721, "step": 3440 }, { "epoch": 0.10561116723298741, "grad_norm": 4.512915134429932, "learning_rate": 9.857142857142858e-05, "loss": 0.0767, "step": 3450 }, { "epoch": 0.10591728655830042, "grad_norm": 2.248718500137329, "learning_rate": 9.885714285714286e-05, "loss": 0.0977, "step": 3460 }, { "epoch": 0.10622340588361344, "grad_norm": 1.3292741775512695, "learning_rate": 9.914285714285715e-05, "loss": 0.1213, "step": 3470 }, { "epoch": 0.10652952520892645, "grad_norm": 1.491721272468567, "learning_rate": 9.942857142857144e-05, "loss": 0.0921, "step": 3480 }, { "epoch": 0.10683564453423945, "grad_norm": 1.4828684329986572, "learning_rate": 9.971428571428571e-05, "loss": 0.1288, "step": 3490 }, { "epoch": 0.10714176385955246, "grad_norm": 1.1308566331863403, "learning_rate": 0.0001, "loss": 0.0695, "step": 3500 }, { "epoch": 0.10744788318486546, "grad_norm": 1.456294298171997, "learning_rate": 9.999999442048494e-05, "loss": 0.1134, "step": 3510 }, { "epoch": 0.10775400251017847, "grad_norm": 1.2640182971954346, "learning_rate": 9.999997768194094e-05, "loss": 0.0783, "step": 3520 }, { "epoch": 0.10806012183549148, "grad_norm": 1.105157494544983, "learning_rate": 9.99999497843718e-05, "loss": 0.0572, "step": 3530 }, { "epoch": 0.10836624116080448, "grad_norm": 3.2592527866363525, "learning_rate": 9.999991072778369e-05, "loss": 0.1001, "step": 3540 }, { "epoch": 0.10867236048611749, "grad_norm": 1.4612759351730347, "learning_rate": 9.999986051218537e-05, "loss": 0.0528, "step": 3550 }, { "epoch": 0.1089784798114305, "grad_norm": 1.078796625137329, "learning_rate": 9.999979913758803e-05, "loss": 0.0828, "step": 3560 }, { "epoch": 0.1092845991367435, "grad_norm": 1.684867024421692, "learning_rate": 9.999972660400536e-05, "loss": 0.1318, "step": 3570 }, { "epoch": 0.10959071846205651, "grad_norm": 2.03752064704895, "learning_rate": 9.999964291145356e-05, "loss": 0.0582, "step": 3580 }, { "epoch": 0.10989683778736951, "grad_norm": 2.3573412895202637, "learning_rate": 9.999954805995129e-05, "loss": 0.0835, "step": 3590 }, { "epoch": 0.11020295711268252, "grad_norm": 1.6870630979537964, "learning_rate": 9.999944204951974e-05, "loss": 0.0613, "step": 3600 }, { "epoch": 0.11050907643799553, "grad_norm": 1.7388732433319092, "learning_rate": 9.999932488018257e-05, "loss": 0.1021, "step": 3610 }, { "epoch": 0.11081519576330853, "grad_norm": 1.9437974691390991, "learning_rate": 9.999919655196592e-05, "loss": 0.0951, "step": 3620 }, { "epoch": 0.11112131508862154, "grad_norm": 0.9496951699256897, "learning_rate": 9.999905706489842e-05, "loss": 0.088, "step": 3630 }, { "epoch": 0.11142743441393455, "grad_norm": 0.6182589530944824, "learning_rate": 9.999890641901125e-05, "loss": 0.0829, "step": 3640 }, { "epoch": 0.11173355373924757, "grad_norm": 2.1432127952575684, "learning_rate": 9.999874461433796e-05, "loss": 0.1151, "step": 3650 }, { "epoch": 0.11203967306456057, "grad_norm": 2.5572404861450195, "learning_rate": 9.99985716509147e-05, "loss": 0.0828, "step": 3660 }, { "epoch": 0.11234579238987358, "grad_norm": 1.224345088005066, "learning_rate": 9.999838752878006e-05, "loss": 0.0568, "step": 3670 }, { "epoch": 0.11265191171518658, "grad_norm": 1.149556279182434, "learning_rate": 9.999819224797516e-05, "loss": 0.057, "step": 3680 }, { "epoch": 0.11295803104049959, "grad_norm": 1.7400219440460205, "learning_rate": 9.999798580854356e-05, "loss": 0.125, "step": 3690 }, { "epoch": 0.1132641503658126, "grad_norm": 2.8761465549468994, "learning_rate": 9.999776821053133e-05, "loss": 0.0663, "step": 3700 }, { "epoch": 0.1135702696911256, "grad_norm": 1.910689115524292, "learning_rate": 9.999753945398704e-05, "loss": 0.062, "step": 3710 }, { "epoch": 0.11387638901643861, "grad_norm": 1.258952260017395, "learning_rate": 9.999729953896175e-05, "loss": 0.1186, "step": 3720 }, { "epoch": 0.11418250834175162, "grad_norm": 2.0124053955078125, "learning_rate": 9.9997048465509e-05, "loss": 0.0921, "step": 3730 }, { "epoch": 0.11448862766706462, "grad_norm": 4.076353073120117, "learning_rate": 9.999678623368483e-05, "loss": 0.0929, "step": 3740 }, { "epoch": 0.11479474699237763, "grad_norm": 2.3106298446655273, "learning_rate": 9.999651284354774e-05, "loss": 0.0701, "step": 3750 }, { "epoch": 0.11510086631769063, "grad_norm": 1.2859801054000854, "learning_rate": 9.999622829515877e-05, "loss": 0.0769, "step": 3760 }, { "epoch": 0.11540698564300364, "grad_norm": 3.7073140144348145, "learning_rate": 9.999593258858144e-05, "loss": 0.1189, "step": 3770 }, { "epoch": 0.11571310496831665, "grad_norm": 1.6582651138305664, "learning_rate": 9.99956257238817e-05, "loss": 0.099, "step": 3780 }, { "epoch": 0.11601922429362965, "grad_norm": 2.3286426067352295, "learning_rate": 9.999530770112808e-05, "loss": 0.0734, "step": 3790 }, { "epoch": 0.11632534361894266, "grad_norm": 4.097447872161865, "learning_rate": 9.999497852039152e-05, "loss": 0.1122, "step": 3800 }, { "epoch": 0.11663146294425566, "grad_norm": 4.493646144866943, "learning_rate": 9.999463818174553e-05, "loss": 0.1013, "step": 3810 }, { "epoch": 0.11693758226956867, "grad_norm": 1.9190568923950195, "learning_rate": 9.999428668526603e-05, "loss": 0.1041, "step": 3820 }, { "epoch": 0.11724370159488169, "grad_norm": 1.0486172437667847, "learning_rate": 9.999392403103148e-05, "loss": 0.0924, "step": 3830 }, { "epoch": 0.1175498209201947, "grad_norm": 2.1235287189483643, "learning_rate": 9.99935502191228e-05, "loss": 0.0873, "step": 3840 }, { "epoch": 0.1178559402455077, "grad_norm": 1.8535585403442383, "learning_rate": 9.999316524962345e-05, "loss": 0.1188, "step": 3850 }, { "epoch": 0.11816205957082071, "grad_norm": 1.717311143875122, "learning_rate": 9.999276912261934e-05, "loss": 0.1218, "step": 3860 }, { "epoch": 0.11846817889613372, "grad_norm": 0.8064334988594055, "learning_rate": 9.999236183819885e-05, "loss": 0.1401, "step": 3870 }, { "epoch": 0.11877429822144672, "grad_norm": 0.8224121332168579, "learning_rate": 9.999194339645292e-05, "loss": 0.0385, "step": 3880 }, { "epoch": 0.11908041754675973, "grad_norm": 2.0302300453186035, "learning_rate": 9.99915137974749e-05, "loss": 0.0387, "step": 3890 }, { "epoch": 0.11938653687207273, "grad_norm": 1.423257827758789, "learning_rate": 9.999107304136068e-05, "loss": 0.0699, "step": 3900 }, { "epoch": 0.11969265619738574, "grad_norm": 0.8926491141319275, "learning_rate": 9.999062112820865e-05, "loss": 0.1623, "step": 3910 }, { "epoch": 0.11999877552269875, "grad_norm": 1.6212105751037598, "learning_rate": 9.999015805811965e-05, "loss": 0.0465, "step": 3920 }, { "epoch": 0.12030489484801175, "grad_norm": 2.387333631515503, "learning_rate": 9.998968383119703e-05, "loss": 0.0732, "step": 3930 }, { "epoch": 0.12061101417332476, "grad_norm": 3.781348466873169, "learning_rate": 9.998919844754661e-05, "loss": 0.1344, "step": 3940 }, { "epoch": 0.12091713349863777, "grad_norm": 1.392364263534546, "learning_rate": 9.998870190727673e-05, "loss": 0.0644, "step": 3950 }, { "epoch": 0.12122325282395077, "grad_norm": 1.536574363708496, "learning_rate": 9.998819421049823e-05, "loss": 0.184, "step": 3960 }, { "epoch": 0.12152937214926378, "grad_norm": 1.4803749322891235, "learning_rate": 9.99876753573244e-05, "loss": 0.0854, "step": 3970 }, { "epoch": 0.12183549147457678, "grad_norm": 1.6100883483886719, "learning_rate": 9.998714534787104e-05, "loss": 0.0729, "step": 3980 }, { "epoch": 0.12214161079988979, "grad_norm": 0.8463494181632996, "learning_rate": 9.998660418225645e-05, "loss": 0.0618, "step": 3990 }, { "epoch": 0.1224477301252028, "grad_norm": 1.9311325550079346, "learning_rate": 9.998605186060137e-05, "loss": 0.101, "step": 4000 }, { "epoch": 0.12275384945051582, "grad_norm": 1.9936010837554932, "learning_rate": 9.99854883830291e-05, "loss": 0.0744, "step": 4010 }, { "epoch": 0.12305996877582882, "grad_norm": 2.735964775085449, "learning_rate": 9.99849137496654e-05, "loss": 0.0731, "step": 4020 }, { "epoch": 0.12336608810114183, "grad_norm": 1.5466099977493286, "learning_rate": 9.99843279606385e-05, "loss": 0.0975, "step": 4030 }, { "epoch": 0.12367220742645484, "grad_norm": 1.1803419589996338, "learning_rate": 9.998373101607915e-05, "loss": 0.1466, "step": 4040 }, { "epoch": 0.12397832675176784, "grad_norm": 1.9338688850402832, "learning_rate": 9.998312291612057e-05, "loss": 0.0729, "step": 4050 }, { "epoch": 0.12428444607708085, "grad_norm": 2.0798146724700928, "learning_rate": 9.998250366089848e-05, "loss": 0.0983, "step": 4060 }, { "epoch": 0.12459056540239385, "grad_norm": 1.9241279363632202, "learning_rate": 9.998187325055106e-05, "loss": 0.1199, "step": 4070 }, { "epoch": 0.12489668472770686, "grad_norm": 1.2331739664077759, "learning_rate": 9.998123168521904e-05, "loss": 0.0632, "step": 4080 }, { "epoch": 0.12520280405301987, "grad_norm": 1.9821258783340454, "learning_rate": 9.99805789650456e-05, "loss": 0.0969, "step": 4090 }, { "epoch": 0.1255089233783329, "grad_norm": 2.7054407596588135, "learning_rate": 9.997991509017639e-05, "loss": 0.1139, "step": 4100 }, { "epoch": 0.12581504270364588, "grad_norm": 0.5568429827690125, "learning_rate": 9.99792400607596e-05, "loss": 0.1023, "step": 4110 }, { "epoch": 0.1261211620289589, "grad_norm": 3.2683019638061523, "learning_rate": 9.997855387694588e-05, "loss": 0.1179, "step": 4120 }, { "epoch": 0.1264272813542719, "grad_norm": 1.4403700828552246, "learning_rate": 9.997785653888835e-05, "loss": 0.0577, "step": 4130 }, { "epoch": 0.1267334006795849, "grad_norm": 2.1380984783172607, "learning_rate": 9.997714804674268e-05, "loss": 0.0778, "step": 4140 }, { "epoch": 0.1270395200048979, "grad_norm": 1.5225584506988525, "learning_rate": 9.997642840066697e-05, "loss": 0.0974, "step": 4150 }, { "epoch": 0.12734563933021092, "grad_norm": 2.0604209899902344, "learning_rate": 9.997569760082181e-05, "loss": 0.1374, "step": 4160 }, { "epoch": 0.12765175865552392, "grad_norm": 2.543459177017212, "learning_rate": 9.997495564737034e-05, "loss": 0.0829, "step": 4170 }, { "epoch": 0.12795787798083694, "grad_norm": 3.431210994720459, "learning_rate": 9.997420254047814e-05, "loss": 0.1126, "step": 4180 }, { "epoch": 0.12826399730614993, "grad_norm": 1.7762937545776367, "learning_rate": 9.997343828031327e-05, "loss": 0.0943, "step": 4190 }, { "epoch": 0.12857011663146295, "grad_norm": 1.3389352560043335, "learning_rate": 9.997266286704631e-05, "loss": 0.1465, "step": 4200 }, { "epoch": 0.12887623595677594, "grad_norm": 4.508752346038818, "learning_rate": 9.99718763008503e-05, "loss": 0.0856, "step": 4210 }, { "epoch": 0.12918235528208896, "grad_norm": 1.4455230236053467, "learning_rate": 9.997107858190083e-05, "loss": 0.0973, "step": 4220 }, { "epoch": 0.12948847460740195, "grad_norm": 3.017655611038208, "learning_rate": 9.997026971037589e-05, "loss": 0.1205, "step": 4230 }, { "epoch": 0.12979459393271497, "grad_norm": 3.7009048461914062, "learning_rate": 9.996944968645603e-05, "loss": 0.0672, "step": 4240 }, { "epoch": 0.13010071325802797, "grad_norm": 0.6723425984382629, "learning_rate": 9.996861851032426e-05, "loss": 0.044, "step": 4250 }, { "epoch": 0.130406832583341, "grad_norm": 1.6063344478607178, "learning_rate": 9.996777618216607e-05, "loss": 0.1189, "step": 4260 }, { "epoch": 0.130712951908654, "grad_norm": 1.7867684364318848, "learning_rate": 9.996692270216947e-05, "loss": 0.0511, "step": 4270 }, { "epoch": 0.131019071233967, "grad_norm": 2.2494988441467285, "learning_rate": 9.996605807052491e-05, "loss": 0.0958, "step": 4280 }, { "epoch": 0.13132519055928002, "grad_norm": 1.1099655628204346, "learning_rate": 9.99651822874254e-05, "loss": 0.0598, "step": 4290 }, { "epoch": 0.131631309884593, "grad_norm": 1.321224570274353, "learning_rate": 9.996429535306638e-05, "loss": 0.0491, "step": 4300 }, { "epoch": 0.13193742920990603, "grad_norm": 1.4780076742172241, "learning_rate": 9.996339726764576e-05, "loss": 0.0474, "step": 4310 }, { "epoch": 0.13224354853521902, "grad_norm": 3.4245247840881348, "learning_rate": 9.996248803136404e-05, "loss": 0.0728, "step": 4320 }, { "epoch": 0.13254966786053204, "grad_norm": 4.086811065673828, "learning_rate": 9.99615676444241e-05, "loss": 0.1028, "step": 4330 }, { "epoch": 0.13285578718584504, "grad_norm": 1.2661036252975464, "learning_rate": 9.996063610703137e-05, "loss": 0.0994, "step": 4340 }, { "epoch": 0.13316190651115806, "grad_norm": 1.2999640703201294, "learning_rate": 9.995969341939373e-05, "loss": 0.0612, "step": 4350 }, { "epoch": 0.13346802583647105, "grad_norm": 1.6099945306777954, "learning_rate": 9.99587395817216e-05, "loss": 0.1962, "step": 4360 }, { "epoch": 0.13377414516178407, "grad_norm": 1.4053101539611816, "learning_rate": 9.995777459422784e-05, "loss": 0.1123, "step": 4370 }, { "epoch": 0.13408026448709706, "grad_norm": 1.8251488208770752, "learning_rate": 9.995679845712782e-05, "loss": 0.068, "step": 4380 }, { "epoch": 0.13438638381241008, "grad_norm": 1.4258286952972412, "learning_rate": 9.99558111706394e-05, "loss": 0.0882, "step": 4390 }, { "epoch": 0.13469250313772307, "grad_norm": 2.805715560913086, "learning_rate": 9.995481273498291e-05, "loss": 0.0831, "step": 4400 }, { "epoch": 0.1349986224630361, "grad_norm": 1.010980248451233, "learning_rate": 9.995380315038119e-05, "loss": 0.0808, "step": 4410 }, { "epoch": 0.1353047417883491, "grad_norm": 2.5442137718200684, "learning_rate": 9.995278241705956e-05, "loss": 0.0633, "step": 4420 }, { "epoch": 0.1356108611136621, "grad_norm": 2.390239715576172, "learning_rate": 9.995175053524584e-05, "loss": 0.0817, "step": 4430 }, { "epoch": 0.1359169804389751, "grad_norm": 3.037696123123169, "learning_rate": 9.99507075051703e-05, "loss": 0.0969, "step": 4440 }, { "epoch": 0.13622309976428812, "grad_norm": 1.0744383335113525, "learning_rate": 9.994965332706573e-05, "loss": 0.1039, "step": 4450 }, { "epoch": 0.13652921908960114, "grad_norm": 2.2979037761688232, "learning_rate": 9.994858800116742e-05, "loss": 0.0813, "step": 4460 }, { "epoch": 0.13683533841491413, "grad_norm": 1.5292587280273438, "learning_rate": 9.994751152771313e-05, "loss": 0.0831, "step": 4470 }, { "epoch": 0.13714145774022715, "grad_norm": 1.507913589477539, "learning_rate": 9.994642390694308e-05, "loss": 0.1097, "step": 4480 }, { "epoch": 0.13744757706554014, "grad_norm": 3.819197416305542, "learning_rate": 9.994532513910002e-05, "loss": 0.0673, "step": 4490 }, { "epoch": 0.13775369639085316, "grad_norm": 2.0835556983947754, "learning_rate": 9.99442152244292e-05, "loss": 0.0692, "step": 4500 }, { "epoch": 0.13805981571616616, "grad_norm": 3.4188637733459473, "learning_rate": 9.994309416317829e-05, "loss": 0.1514, "step": 4510 }, { "epoch": 0.13836593504147918, "grad_norm": 1.3600996732711792, "learning_rate": 9.99419619555975e-05, "loss": 0.1024, "step": 4520 }, { "epoch": 0.13867205436679217, "grad_norm": 1.399637222290039, "learning_rate": 9.994081860193954e-05, "loss": 0.0576, "step": 4530 }, { "epoch": 0.1389781736921052, "grad_norm": 1.0921401977539062, "learning_rate": 9.993966410245957e-05, "loss": 0.0567, "step": 4540 }, { "epoch": 0.13928429301741818, "grad_norm": 1.8785698413848877, "learning_rate": 9.993849845741524e-05, "loss": 0.0825, "step": 4550 }, { "epoch": 0.1395904123427312, "grad_norm": 0.8130149841308594, "learning_rate": 9.993732166706672e-05, "loss": 0.0733, "step": 4560 }, { "epoch": 0.1398965316680442, "grad_norm": 2.8835952281951904, "learning_rate": 9.993613373167663e-05, "loss": 0.1182, "step": 4570 }, { "epoch": 0.14020265099335721, "grad_norm": 1.8626586198806763, "learning_rate": 9.99349346515101e-05, "loss": 0.0855, "step": 4580 }, { "epoch": 0.1405087703186702, "grad_norm": 1.3189162015914917, "learning_rate": 9.993372442683476e-05, "loss": 0.057, "step": 4590 }, { "epoch": 0.14081488964398323, "grad_norm": 1.1450284719467163, "learning_rate": 9.993250305792066e-05, "loss": 0.0523, "step": 4600 }, { "epoch": 0.14112100896929622, "grad_norm": 1.6839004755020142, "learning_rate": 9.993127054504043e-05, "loss": 0.1214, "step": 4610 }, { "epoch": 0.14142712829460924, "grad_norm": 5.4162397384643555, "learning_rate": 9.993002688846913e-05, "loss": 0.1155, "step": 4620 }, { "epoch": 0.14173324761992226, "grad_norm": 1.0322086811065674, "learning_rate": 9.992877208848433e-05, "loss": 0.0675, "step": 4630 }, { "epoch": 0.14203936694523525, "grad_norm": 0.991107165813446, "learning_rate": 9.992750614536605e-05, "loss": 0.0785, "step": 4640 }, { "epoch": 0.14234548627054827, "grad_norm": 1.8293750286102295, "learning_rate": 9.992622905939685e-05, "loss": 0.1129, "step": 4650 }, { "epoch": 0.14265160559586126, "grad_norm": 1.217090129852295, "learning_rate": 9.992494083086175e-05, "loss": 0.0864, "step": 4660 }, { "epoch": 0.14295772492117428, "grad_norm": 1.1972768306732178, "learning_rate": 9.992364146004822e-05, "loss": 0.0866, "step": 4670 }, { "epoch": 0.14326384424648728, "grad_norm": 0.8755980134010315, "learning_rate": 9.992233094724632e-05, "loss": 0.1354, "step": 4680 }, { "epoch": 0.1435699635718003, "grad_norm": 0.7395352125167847, "learning_rate": 9.992100929274846e-05, "loss": 0.0771, "step": 4690 }, { "epoch": 0.1438760828971133, "grad_norm": 2.291034460067749, "learning_rate": 9.991967649684967e-05, "loss": 0.1077, "step": 4700 }, { "epoch": 0.1441822022224263, "grad_norm": 1.414397120475769, "learning_rate": 9.991833255984736e-05, "loss": 0.0432, "step": 4710 }, { "epoch": 0.1444883215477393, "grad_norm": 1.2590246200561523, "learning_rate": 9.99169774820415e-05, "loss": 0.0875, "step": 4720 }, { "epoch": 0.14479444087305232, "grad_norm": 1.1719069480895996, "learning_rate": 9.991561126373449e-05, "loss": 0.0913, "step": 4730 }, { "epoch": 0.1451005601983653, "grad_norm": 2.276614189147949, "learning_rate": 9.991423390523126e-05, "loss": 0.1663, "step": 4740 }, { "epoch": 0.14540667952367833, "grad_norm": 2.3395400047302246, "learning_rate": 9.991284540683922e-05, "loss": 0.1333, "step": 4750 }, { "epoch": 0.14571279884899133, "grad_norm": 2.6789424419403076, "learning_rate": 9.991144576886823e-05, "loss": 0.1199, "step": 4760 }, { "epoch": 0.14601891817430435, "grad_norm": 2.561779499053955, "learning_rate": 9.991003499163067e-05, "loss": 0.1678, "step": 4770 }, { "epoch": 0.14632503749961734, "grad_norm": 1.7337546348571777, "learning_rate": 9.990861307544141e-05, "loss": 0.2513, "step": 4780 }, { "epoch": 0.14663115682493036, "grad_norm": 2.3852901458740234, "learning_rate": 9.99071800206178e-05, "loss": 0.1982, "step": 4790 }, { "epoch": 0.14693727615024335, "grad_norm": 3.4577414989471436, "learning_rate": 9.990573582747964e-05, "loss": 0.1234, "step": 4800 }, { "epoch": 0.14724339547555637, "grad_norm": 3.1135094165802, "learning_rate": 9.990428049634928e-05, "loss": 0.1392, "step": 4810 }, { "epoch": 0.1475495148008694, "grad_norm": 2.559826135635376, "learning_rate": 9.99028140275515e-05, "loss": 0.1201, "step": 4820 }, { "epoch": 0.14785563412618238, "grad_norm": 4.827753067016602, "learning_rate": 9.990133642141359e-05, "loss": 0.0792, "step": 4830 }, { "epoch": 0.1481617534514954, "grad_norm": 3.8579649925231934, "learning_rate": 9.989984767826532e-05, "loss": 0.1746, "step": 4840 }, { "epoch": 0.1484678727768084, "grad_norm": 2.514698028564453, "learning_rate": 9.989834779843895e-05, "loss": 0.1864, "step": 4850 }, { "epoch": 0.14877399210212142, "grad_norm": 2.481642961502075, "learning_rate": 9.989683678226924e-05, "loss": 0.2089, "step": 4860 }, { "epoch": 0.1490801114274344, "grad_norm": 2.8623929023742676, "learning_rate": 9.98953146300934e-05, "loss": 0.0839, "step": 4870 }, { "epoch": 0.14938623075274743, "grad_norm": 2.310584306716919, "learning_rate": 9.989378134225115e-05, "loss": 0.0804, "step": 4880 }, { "epoch": 0.14969235007806042, "grad_norm": 2.841603994369507, "learning_rate": 9.989223691908472e-05, "loss": 0.1862, "step": 4890 }, { "epoch": 0.14999846940337344, "grad_norm": 3.3011128902435303, "learning_rate": 9.989068136093873e-05, "loss": 0.1437, "step": 4900 }, { "epoch": 0.15030458872868643, "grad_norm": 1.4468625783920288, "learning_rate": 9.98891146681604e-05, "loss": 0.0677, "step": 4910 }, { "epoch": 0.15061070805399945, "grad_norm": 1.7518444061279297, "learning_rate": 9.988753684109938e-05, "loss": 0.0905, "step": 4920 }, { "epoch": 0.15091682737931245, "grad_norm": 2.638500928878784, "learning_rate": 9.988594788010782e-05, "loss": 0.1377, "step": 4930 }, { "epoch": 0.15122294670462547, "grad_norm": 1.8880057334899902, "learning_rate": 9.98843477855403e-05, "loss": 0.1362, "step": 4940 }, { "epoch": 0.15152906602993846, "grad_norm": 1.1879018545150757, "learning_rate": 9.988273655775397e-05, "loss": 0.1045, "step": 4950 }, { "epoch": 0.15183518535525148, "grad_norm": 1.6070337295532227, "learning_rate": 9.988111419710842e-05, "loss": 0.1728, "step": 4960 }, { "epoch": 0.15214130468056447, "grad_norm": 2.703371286392212, "learning_rate": 9.987948070396571e-05, "loss": 0.1278, "step": 4970 }, { "epoch": 0.1524474240058775, "grad_norm": 2.476966381072998, "learning_rate": 9.987783607869043e-05, "loss": 0.1286, "step": 4980 }, { "epoch": 0.1527535433311905, "grad_norm": 2.3841357231140137, "learning_rate": 9.987618032164961e-05, "loss": 0.1143, "step": 4990 }, { "epoch": 0.1530596626565035, "grad_norm": 2.6504666805267334, "learning_rate": 9.987451343321279e-05, "loss": 0.2366, "step": 5000 }, { "epoch": 0.15336578198181652, "grad_norm": 1.7628509998321533, "learning_rate": 9.987283541375199e-05, "loss": 0.1364, "step": 5010 }, { "epoch": 0.15367190130712952, "grad_norm": 1.989778757095337, "learning_rate": 9.987114626364171e-05, "loss": 0.1668, "step": 5020 }, { "epoch": 0.15397802063244254, "grad_norm": 5.443140983581543, "learning_rate": 9.986944598325892e-05, "loss": 0.1668, "step": 5030 }, { "epoch": 0.15428413995775553, "grad_norm": 2.808344602584839, "learning_rate": 9.986773457298311e-05, "loss": 0.1497, "step": 5040 }, { "epoch": 0.15459025928306855, "grad_norm": 2.774857521057129, "learning_rate": 9.986601203319623e-05, "loss": 0.1352, "step": 5050 }, { "epoch": 0.15489637860838154, "grad_norm": 2.176693916320801, "learning_rate": 9.986427836428272e-05, "loss": 0.1117, "step": 5060 }, { "epoch": 0.15520249793369456, "grad_norm": 5.614523887634277, "learning_rate": 9.986253356662948e-05, "loss": 0.1877, "step": 5070 }, { "epoch": 0.15550861725900755, "grad_norm": 2.9412965774536133, "learning_rate": 9.986077764062593e-05, "loss": 0.143, "step": 5080 }, { "epoch": 0.15581473658432057, "grad_norm": 2.15993332862854, "learning_rate": 9.985901058666397e-05, "loss": 0.1636, "step": 5090 }, { "epoch": 0.15612085590963357, "grad_norm": 2.78950572013855, "learning_rate": 9.985723240513796e-05, "loss": 0.1063, "step": 5100 }, { "epoch": 0.15642697523494659, "grad_norm": 1.615659475326538, "learning_rate": 9.985544309644475e-05, "loss": 0.1702, "step": 5110 }, { "epoch": 0.15673309456025958, "grad_norm": 1.369512915611267, "learning_rate": 9.985364266098368e-05, "loss": 0.1378, "step": 5120 }, { "epoch": 0.1570392138855726, "grad_norm": 3.3198158740997314, "learning_rate": 9.98518310991566e-05, "loss": 0.1261, "step": 5130 }, { "epoch": 0.1573453332108856, "grad_norm": 1.1946184635162354, "learning_rate": 9.985000841136775e-05, "loss": 0.1107, "step": 5140 }, { "epoch": 0.1576514525361986, "grad_norm": 1.5015949010849, "learning_rate": 9.9848174598024e-05, "loss": 0.1508, "step": 5150 }, { "epoch": 0.15795757186151163, "grad_norm": 2.693188190460205, "learning_rate": 9.984632965953458e-05, "loss": 0.0906, "step": 5160 }, { "epoch": 0.15826369118682462, "grad_norm": 2.9150328636169434, "learning_rate": 9.984447359631124e-05, "loss": 0.1195, "step": 5170 }, { "epoch": 0.15856981051213764, "grad_norm": 3.3456673622131348, "learning_rate": 9.984260640876821e-05, "loss": 0.1527, "step": 5180 }, { "epoch": 0.15887592983745064, "grad_norm": 1.1003334522247314, "learning_rate": 9.984072809732225e-05, "loss": 0.1012, "step": 5190 }, { "epoch": 0.15918204916276366, "grad_norm": 2.394806146621704, "learning_rate": 9.983883866239252e-05, "loss": 0.1108, "step": 5200 }, { "epoch": 0.15948816848807665, "grad_norm": 1.4121208190917969, "learning_rate": 9.983693810440073e-05, "loss": 0.1608, "step": 5210 }, { "epoch": 0.15979428781338967, "grad_norm": 1.6510982513427734, "learning_rate": 9.983502642377104e-05, "loss": 0.156, "step": 5220 }, { "epoch": 0.16010040713870266, "grad_norm": 2.9082157611846924, "learning_rate": 9.98331036209301e-05, "loss": 0.1148, "step": 5230 }, { "epoch": 0.16040652646401568, "grad_norm": 2.0062601566314697, "learning_rate": 9.983116969630706e-05, "loss": 0.129, "step": 5240 }, { "epoch": 0.16071264578932867, "grad_norm": 2.113518476486206, "learning_rate": 9.98292246503335e-05, "loss": 0.2074, "step": 5250 }, { "epoch": 0.1610187651146417, "grad_norm": 3.6255979537963867, "learning_rate": 9.982726848344353e-05, "loss": 0.1314, "step": 5260 }, { "epoch": 0.16132488443995469, "grad_norm": 0.9369949698448181, "learning_rate": 9.982530119607374e-05, "loss": 0.0429, "step": 5270 }, { "epoch": 0.1616310037652677, "grad_norm": 3.0486414432525635, "learning_rate": 9.982332278866319e-05, "loss": 0.1042, "step": 5280 }, { "epoch": 0.1619371230905807, "grad_norm": 0.5504125356674194, "learning_rate": 9.98213332616534e-05, "loss": 0.0783, "step": 5290 }, { "epoch": 0.16224324241589372, "grad_norm": 2.544940233230591, "learning_rate": 9.981933261548842e-05, "loss": 0.1581, "step": 5300 }, { "epoch": 0.1625493617412067, "grad_norm": 1.9359240531921387, "learning_rate": 9.981732085061474e-05, "loss": 0.1211, "step": 5310 }, { "epoch": 0.16285548106651973, "grad_norm": 2.4455580711364746, "learning_rate": 9.981529796748134e-05, "loss": 0.1704, "step": 5320 }, { "epoch": 0.16316160039183272, "grad_norm": 2.218660354614258, "learning_rate": 9.981326396653972e-05, "loss": 0.1303, "step": 5330 }, { "epoch": 0.16346771971714574, "grad_norm": 2.7238287925720215, "learning_rate": 9.98112188482438e-05, "loss": 0.0874, "step": 5340 }, { "epoch": 0.16377383904245876, "grad_norm": 2.9269092082977295, "learning_rate": 9.980916261305003e-05, "loss": 0.1232, "step": 5350 }, { "epoch": 0.16407995836777176, "grad_norm": 1.9170345067977905, "learning_rate": 9.98070952614173e-05, "loss": 0.1026, "step": 5360 }, { "epoch": 0.16438607769308478, "grad_norm": 2.5054993629455566, "learning_rate": 9.980501679380701e-05, "loss": 0.1134, "step": 5370 }, { "epoch": 0.16469219701839777, "grad_norm": 4.046473503112793, "learning_rate": 9.980292721068303e-05, "loss": 0.1231, "step": 5380 }, { "epoch": 0.1649983163437108, "grad_norm": 0.9197081923484802, "learning_rate": 9.980082651251175e-05, "loss": 0.1827, "step": 5390 }, { "epoch": 0.16530443566902378, "grad_norm": 0.838589608669281, "learning_rate": 9.979871469976196e-05, "loss": 0.0955, "step": 5400 }, { "epoch": 0.1656105549943368, "grad_norm": 1.5975550413131714, "learning_rate": 9.979659177290499e-05, "loss": 0.1601, "step": 5410 }, { "epoch": 0.1659166743196498, "grad_norm": 2.0406949520111084, "learning_rate": 9.979445773241465e-05, "loss": 0.1796, "step": 5420 }, { "epoch": 0.1662227936449628, "grad_norm": 2.3654985427856445, "learning_rate": 9.97923125787672e-05, "loss": 0.2115, "step": 5430 }, { "epoch": 0.1665289129702758, "grad_norm": 2.293264865875244, "learning_rate": 9.97901563124414e-05, "loss": 0.1424, "step": 5440 }, { "epoch": 0.16683503229558883, "grad_norm": 3.6130166053771973, "learning_rate": 9.978798893391849e-05, "loss": 0.208, "step": 5450 }, { "epoch": 0.16714115162090182, "grad_norm": 1.808098554611206, "learning_rate": 9.97858104436822e-05, "loss": 0.0645, "step": 5460 }, { "epoch": 0.16744727094621484, "grad_norm": 2.7572109699249268, "learning_rate": 9.978362084221869e-05, "loss": 0.1628, "step": 5470 }, { "epoch": 0.16775339027152783, "grad_norm": 1.9827396869659424, "learning_rate": 9.978142013001667e-05, "loss": 0.0754, "step": 5480 }, { "epoch": 0.16805950959684085, "grad_norm": 2.912688732147217, "learning_rate": 9.977920830756729e-05, "loss": 0.1474, "step": 5490 }, { "epoch": 0.16836562892215384, "grad_norm": 2.9352900981903076, "learning_rate": 9.977698537536419e-05, "loss": 0.148, "step": 5500 }, { "epoch": 0.16867174824746686, "grad_norm": 2.6216795444488525, "learning_rate": 9.977475133390346e-05, "loss": 0.1227, "step": 5510 }, { "epoch": 0.16897786757277988, "grad_norm": 1.6180533170700073, "learning_rate": 9.97725061836837e-05, "loss": 0.1072, "step": 5520 }, { "epoch": 0.16928398689809288, "grad_norm": 2.1538686752319336, "learning_rate": 9.977024992520602e-05, "loss": 0.1199, "step": 5530 }, { "epoch": 0.1695901062234059, "grad_norm": 2.0452089309692383, "learning_rate": 9.976798255897394e-05, "loss": 0.097, "step": 5540 }, { "epoch": 0.1698962255487189, "grad_norm": 3.1350793838500977, "learning_rate": 9.97657040854935e-05, "loss": 0.1431, "step": 5550 }, { "epoch": 0.1702023448740319, "grad_norm": 1.36935293674469, "learning_rate": 9.976341450527322e-05, "loss": 0.0993, "step": 5560 }, { "epoch": 0.1705084641993449, "grad_norm": 2.016183853149414, "learning_rate": 9.976111381882409e-05, "loss": 0.1339, "step": 5570 }, { "epoch": 0.17081458352465792, "grad_norm": 2.8056368827819824, "learning_rate": 9.975880202665955e-05, "loss": 0.1665, "step": 5580 }, { "epoch": 0.1711207028499709, "grad_norm": 2.858612298965454, "learning_rate": 9.975647912929556e-05, "loss": 0.1127, "step": 5590 }, { "epoch": 0.17142682217528393, "grad_norm": 6.2110371589660645, "learning_rate": 9.975414512725057e-05, "loss": 0.1466, "step": 5600 }, { "epoch": 0.17173294150059693, "grad_norm": 0.7827681303024292, "learning_rate": 9.975180002104548e-05, "loss": 0.1206, "step": 5610 }, { "epoch": 0.17203906082590995, "grad_norm": 3.3894236087799072, "learning_rate": 9.974944381120363e-05, "loss": 0.0763, "step": 5620 }, { "epoch": 0.17234518015122294, "grad_norm": 2.8674023151397705, "learning_rate": 9.974707649825092e-05, "loss": 0.0894, "step": 5630 }, { "epoch": 0.17265129947653596, "grad_norm": 1.906205415725708, "learning_rate": 9.97446980827157e-05, "loss": 0.0961, "step": 5640 }, { "epoch": 0.17295741880184895, "grad_norm": 1.695494294166565, "learning_rate": 9.974230856512874e-05, "loss": 0.0673, "step": 5650 }, { "epoch": 0.17326353812716197, "grad_norm": 2.105290412902832, "learning_rate": 9.973990794602335e-05, "loss": 0.1659, "step": 5660 }, { "epoch": 0.17356965745247496, "grad_norm": 4.587545871734619, "learning_rate": 9.973749622593534e-05, "loss": 0.1673, "step": 5670 }, { "epoch": 0.17387577677778798, "grad_norm": 2.676969289779663, "learning_rate": 9.973507340540291e-05, "loss": 0.1971, "step": 5680 }, { "epoch": 0.17418189610310097, "grad_norm": 2.196540594100952, "learning_rate": 9.973263948496681e-05, "loss": 0.1204, "step": 5690 }, { "epoch": 0.174488015428414, "grad_norm": 1.822795033454895, "learning_rate": 9.973019446517023e-05, "loss": 0.0956, "step": 5700 }, { "epoch": 0.17479413475372702, "grad_norm": 1.591055154800415, "learning_rate": 9.972773834655886e-05, "loss": 0.0835, "step": 5710 }, { "epoch": 0.17510025407904, "grad_norm": 1.9508153200149536, "learning_rate": 9.972527112968088e-05, "loss": 0.1321, "step": 5720 }, { "epoch": 0.17540637340435303, "grad_norm": 1.8790154457092285, "learning_rate": 9.972279281508689e-05, "loss": 0.0842, "step": 5730 }, { "epoch": 0.17571249272966602, "grad_norm": 3.8723273277282715, "learning_rate": 9.972030340333001e-05, "loss": 0.1639, "step": 5740 }, { "epoch": 0.17601861205497904, "grad_norm": 2.7278218269348145, "learning_rate": 9.971780289496584e-05, "loss": 0.0973, "step": 5750 }, { "epoch": 0.17632473138029203, "grad_norm": 1.4774023294448853, "learning_rate": 9.971529129055243e-05, "loss": 0.0919, "step": 5760 }, { "epoch": 0.17663085070560505, "grad_norm": 1.6328507661819458, "learning_rate": 9.971276859065035e-05, "loss": 0.09, "step": 5770 }, { "epoch": 0.17693697003091804, "grad_norm": 0.8575165271759033, "learning_rate": 9.971023479582257e-05, "loss": 0.0766, "step": 5780 }, { "epoch": 0.17724308935623106, "grad_norm": 1.724282145500183, "learning_rate": 9.970768990663464e-05, "loss": 0.1239, "step": 5790 }, { "epoch": 0.17754920868154406, "grad_norm": 1.5203289985656738, "learning_rate": 9.970513392365448e-05, "loss": 0.131, "step": 5800 }, { "epoch": 0.17785532800685708, "grad_norm": 3.1770870685577393, "learning_rate": 9.970256684745258e-05, "loss": 0.2396, "step": 5810 }, { "epoch": 0.17816144733217007, "grad_norm": 2.101348876953125, "learning_rate": 9.969998867860182e-05, "loss": 0.0855, "step": 5820 }, { "epoch": 0.1784675666574831, "grad_norm": 2.3483195304870605, "learning_rate": 9.969739941767762e-05, "loss": 0.109, "step": 5830 }, { "epoch": 0.17877368598279608, "grad_norm": 2.06937837600708, "learning_rate": 9.969479906525785e-05, "loss": 0.0667, "step": 5840 }, { "epoch": 0.1790798053081091, "grad_norm": 1.8407328128814697, "learning_rate": 9.969218762192285e-05, "loss": 0.0904, "step": 5850 }, { "epoch": 0.1793859246334221, "grad_norm": 1.8351871967315674, "learning_rate": 9.968956508825546e-05, "loss": 0.1286, "step": 5860 }, { "epoch": 0.17969204395873511, "grad_norm": 2.223830461502075, "learning_rate": 9.968693146484096e-05, "loss": 0.1195, "step": 5870 }, { "epoch": 0.17999816328404813, "grad_norm": 2.420783281326294, "learning_rate": 9.968428675226714e-05, "loss": 0.102, "step": 5880 }, { "epoch": 0.18030428260936113, "grad_norm": 0.846277117729187, "learning_rate": 9.968163095112424e-05, "loss": 0.0945, "step": 5890 }, { "epoch": 0.18061040193467415, "grad_norm": 1.5545074939727783, "learning_rate": 9.967896406200498e-05, "loss": 0.1277, "step": 5900 }, { "epoch": 0.18091652125998714, "grad_norm": 1.3022046089172363, "learning_rate": 9.967628608550457e-05, "loss": 0.1199, "step": 5910 }, { "epoch": 0.18122264058530016, "grad_norm": 1.0894781351089478, "learning_rate": 9.967359702222068e-05, "loss": 0.0759, "step": 5920 }, { "epoch": 0.18152875991061315, "grad_norm": 1.4262080192565918, "learning_rate": 9.967089687275345e-05, "loss": 0.1564, "step": 5930 }, { "epoch": 0.18183487923592617, "grad_norm": 2.098729372024536, "learning_rate": 9.966818563770548e-05, "loss": 0.1751, "step": 5940 }, { "epoch": 0.18214099856123916, "grad_norm": 1.0680773258209229, "learning_rate": 9.966546331768191e-05, "loss": 0.1036, "step": 5950 }, { "epoch": 0.18244711788655218, "grad_norm": 4.035389423370361, "learning_rate": 9.966272991329029e-05, "loss": 0.1186, "step": 5960 }, { "epoch": 0.18275323721186518, "grad_norm": 6.690606117248535, "learning_rate": 9.965998542514066e-05, "loss": 0.1499, "step": 5970 }, { "epoch": 0.1830593565371782, "grad_norm": 1.0502493381500244, "learning_rate": 9.965722985384551e-05, "loss": 0.0593, "step": 5980 }, { "epoch": 0.1833654758624912, "grad_norm": 1.2743202447891235, "learning_rate": 9.965446320001988e-05, "loss": 0.108, "step": 5990 }, { "epoch": 0.1836715951878042, "grad_norm": 2.7157936096191406, "learning_rate": 9.965168546428121e-05, "loss": 0.1008, "step": 6000 }, { "epoch": 0.1839777145131172, "grad_norm": 2.874995470046997, "learning_rate": 9.964889664724943e-05, "loss": 0.0855, "step": 6010 }, { "epoch": 0.18428383383843022, "grad_norm": 2.922236680984497, "learning_rate": 9.964609674954696e-05, "loss": 0.1289, "step": 6020 }, { "epoch": 0.18458995316374321, "grad_norm": 1.5835667848587036, "learning_rate": 9.964328577179866e-05, "loss": 0.0795, "step": 6030 }, { "epoch": 0.18489607248905623, "grad_norm": 1.9422718286514282, "learning_rate": 9.964046371463193e-05, "loss": 0.161, "step": 6040 }, { "epoch": 0.18520219181436925, "grad_norm": 2.9904704093933105, "learning_rate": 9.963763057867656e-05, "loss": 0.1207, "step": 6050 }, { "epoch": 0.18550831113968225, "grad_norm": 2.7562153339385986, "learning_rate": 9.963478636456488e-05, "loss": 0.1396, "step": 6060 }, { "epoch": 0.18581443046499527, "grad_norm": 1.092964768409729, "learning_rate": 9.963193107293164e-05, "loss": 0.0912, "step": 6070 }, { "epoch": 0.18612054979030826, "grad_norm": 1.339728593826294, "learning_rate": 9.962906470441409e-05, "loss": 0.1112, "step": 6080 }, { "epoch": 0.18642666911562128, "grad_norm": 4.224067687988281, "learning_rate": 9.962618725965196e-05, "loss": 0.0973, "step": 6090 }, { "epoch": 0.18673278844093427, "grad_norm": 1.8960227966308594, "learning_rate": 9.962329873928742e-05, "loss": 0.0909, "step": 6100 }, { "epoch": 0.1870389077662473, "grad_norm": 1.582404375076294, "learning_rate": 9.962039914396515e-05, "loss": 0.091, "step": 6110 }, { "epoch": 0.18734502709156028, "grad_norm": 4.402654647827148, "learning_rate": 9.961748847433226e-05, "loss": 0.1103, "step": 6120 }, { "epoch": 0.1876511464168733, "grad_norm": 2.5607335567474365, "learning_rate": 9.96145667310384e-05, "loss": 0.0864, "step": 6130 }, { "epoch": 0.1879572657421863, "grad_norm": 0.9640365242958069, "learning_rate": 9.96116339147356e-05, "loss": 0.1087, "step": 6140 }, { "epoch": 0.18826338506749932, "grad_norm": 1.610667109489441, "learning_rate": 9.960869002607842e-05, "loss": 0.1317, "step": 6150 }, { "epoch": 0.1885695043928123, "grad_norm": 1.214432716369629, "learning_rate": 9.96057350657239e-05, "loss": 0.1319, "step": 6160 }, { "epoch": 0.18887562371812533, "grad_norm": 2.456973075866699, "learning_rate": 9.96027690343315e-05, "loss": 0.1557, "step": 6170 }, { "epoch": 0.18918174304343832, "grad_norm": 2.1590704917907715, "learning_rate": 9.959979193256321e-05, "loss": 0.1323, "step": 6180 }, { "epoch": 0.18948786236875134, "grad_norm": 3.321239948272705, "learning_rate": 9.959680376108343e-05, "loss": 0.2651, "step": 6190 }, { "epoch": 0.18979398169406433, "grad_norm": 2.116734027862549, "learning_rate": 9.959380452055909e-05, "loss": 0.0692, "step": 6200 }, { "epoch": 0.19010010101937735, "grad_norm": 4.172573566436768, "learning_rate": 9.959079421165956e-05, "loss": 0.1182, "step": 6210 }, { "epoch": 0.19040622034469035, "grad_norm": 1.449631929397583, "learning_rate": 9.958777283505665e-05, "loss": 0.1028, "step": 6220 }, { "epoch": 0.19071233967000337, "grad_norm": 0.9902461767196655, "learning_rate": 9.95847403914247e-05, "loss": 0.2129, "step": 6230 }, { "epoch": 0.1910184589953164, "grad_norm": 1.3662947416305542, "learning_rate": 9.95816968814405e-05, "loss": 0.0793, "step": 6240 }, { "epoch": 0.19132457832062938, "grad_norm": 2.1448543071746826, "learning_rate": 9.957864230578328e-05, "loss": 0.0721, "step": 6250 }, { "epoch": 0.1916306976459424, "grad_norm": 3.143394947052002, "learning_rate": 9.957557666513477e-05, "loss": 0.0773, "step": 6260 }, { "epoch": 0.1919368169712554, "grad_norm": 1.9689536094665527, "learning_rate": 9.957249996017919e-05, "loss": 0.0995, "step": 6270 }, { "epoch": 0.1922429362965684, "grad_norm": 1.7056690454483032, "learning_rate": 9.956941219160314e-05, "loss": 0.1333, "step": 6280 }, { "epoch": 0.1925490556218814, "grad_norm": 1.5207563638687134, "learning_rate": 9.956631336009581e-05, "loss": 0.0612, "step": 6290 }, { "epoch": 0.19285517494719442, "grad_norm": 1.5859475135803223, "learning_rate": 9.956320346634876e-05, "loss": 0.1266, "step": 6300 }, { "epoch": 0.19316129427250742, "grad_norm": 1.6839325428009033, "learning_rate": 9.956008251105608e-05, "loss": 0.1176, "step": 6310 }, { "epoch": 0.19346741359782044, "grad_norm": 1.5198501348495483, "learning_rate": 9.955695049491431e-05, "loss": 0.0898, "step": 6320 }, { "epoch": 0.19377353292313343, "grad_norm": 2.9169883728027344, "learning_rate": 9.955380741862243e-05, "loss": 0.1063, "step": 6330 }, { "epoch": 0.19407965224844645, "grad_norm": 0.7854726910591125, "learning_rate": 9.955065328288193e-05, "loss": 0.1425, "step": 6340 }, { "epoch": 0.19438577157375944, "grad_norm": 1.5364956855773926, "learning_rate": 9.954748808839674e-05, "loss": 0.111, "step": 6350 }, { "epoch": 0.19469189089907246, "grad_norm": 3.438509702682495, "learning_rate": 9.95443118358733e-05, "loss": 0.1422, "step": 6360 }, { "epoch": 0.19499801022438545, "grad_norm": 1.280172348022461, "learning_rate": 9.954112452602045e-05, "loss": 0.0849, "step": 6370 }, { "epoch": 0.19530412954969847, "grad_norm": 1.842815637588501, "learning_rate": 9.953792615954956e-05, "loss": 0.0783, "step": 6380 }, { "epoch": 0.19561024887501147, "grad_norm": 4.767699718475342, "learning_rate": 9.953471673717444e-05, "loss": 0.1104, "step": 6390 }, { "epoch": 0.1959163682003245, "grad_norm": 1.6208871603012085, "learning_rate": 9.953149625961136e-05, "loss": 0.1068, "step": 6400 }, { "epoch": 0.1962224875256375, "grad_norm": 1.5891724824905396, "learning_rate": 9.952826472757908e-05, "loss": 0.113, "step": 6410 }, { "epoch": 0.1965286068509505, "grad_norm": 3.767228126525879, "learning_rate": 9.952502214179882e-05, "loss": 0.1097, "step": 6420 }, { "epoch": 0.19683472617626352, "grad_norm": 1.2226611375808716, "learning_rate": 9.952176850299424e-05, "loss": 0.0628, "step": 6430 }, { "epoch": 0.1971408455015765, "grad_norm": 1.1491564512252808, "learning_rate": 9.95185038118915e-05, "loss": 0.0825, "step": 6440 }, { "epoch": 0.19744696482688953, "grad_norm": 1.4907784461975098, "learning_rate": 9.951522806921923e-05, "loss": 0.0824, "step": 6450 }, { "epoch": 0.19775308415220252, "grad_norm": 2.410827875137329, "learning_rate": 9.951194127570849e-05, "loss": 0.0532, "step": 6460 }, { "epoch": 0.19805920347751554, "grad_norm": 1.6344434022903442, "learning_rate": 9.950864343209285e-05, "loss": 0.112, "step": 6470 }, { "epoch": 0.19836532280282854, "grad_norm": 1.8631178140640259, "learning_rate": 9.95053345391083e-05, "loss": 0.156, "step": 6480 }, { "epoch": 0.19867144212814156, "grad_norm": 2.5490877628326416, "learning_rate": 9.950201459749335e-05, "loss": 0.0597, "step": 6490 }, { "epoch": 0.19897756145345455, "grad_norm": 2.416191339492798, "learning_rate": 9.949868360798893e-05, "loss": 0.087, "step": 6500 }, { "epoch": 0.19928368077876757, "grad_norm": 3.1812102794647217, "learning_rate": 9.949534157133844e-05, "loss": 0.0845, "step": 6510 }, { "epoch": 0.19958980010408056, "grad_norm": 1.7730202674865723, "learning_rate": 9.949198848828778e-05, "loss": 0.126, "step": 6520 }, { "epoch": 0.19989591942939358, "grad_norm": 2.2922608852386475, "learning_rate": 9.948862435958528e-05, "loss": 0.1221, "step": 6530 }, { "epoch": 0.20020203875470657, "grad_norm": 1.963633418083191, "learning_rate": 9.948524918598175e-05, "loss": 0.0983, "step": 6540 }, { "epoch": 0.2005081580800196, "grad_norm": 1.288026213645935, "learning_rate": 9.948186296823048e-05, "loss": 0.0489, "step": 6550 }, { "epoch": 0.2008142774053326, "grad_norm": 2.9604876041412354, "learning_rate": 9.947846570708716e-05, "loss": 0.1791, "step": 6560 }, { "epoch": 0.2011203967306456, "grad_norm": 1.9501591920852661, "learning_rate": 9.947505740331007e-05, "loss": 0.0975, "step": 6570 }, { "epoch": 0.20142651605595863, "grad_norm": 1.17605459690094, "learning_rate": 9.94716380576598e-05, "loss": 0.0965, "step": 6580 }, { "epoch": 0.20173263538127162, "grad_norm": 1.3227699995040894, "learning_rate": 9.946820767089952e-05, "loss": 0.2616, "step": 6590 }, { "epoch": 0.20203875470658464, "grad_norm": 2.9874160289764404, "learning_rate": 9.946476624379484e-05, "loss": 0.0964, "step": 6600 }, { "epoch": 0.20234487403189763, "grad_norm": 1.499634027481079, "learning_rate": 9.946131377711379e-05, "loss": 0.1678, "step": 6610 }, { "epoch": 0.20265099335721065, "grad_norm": 2.5647153854370117, "learning_rate": 9.94578502716269e-05, "loss": 0.1545, "step": 6620 }, { "epoch": 0.20295711268252364, "grad_norm": 2.337531089782715, "learning_rate": 9.945437572810717e-05, "loss": 0.1107, "step": 6630 }, { "epoch": 0.20326323200783666, "grad_norm": 4.580775260925293, "learning_rate": 9.945089014733005e-05, "loss": 0.1575, "step": 6640 }, { "epoch": 0.20356935133314966, "grad_norm": 1.5316671133041382, "learning_rate": 9.944739353007344e-05, "loss": 0.1119, "step": 6650 }, { "epoch": 0.20387547065846268, "grad_norm": 1.6846762895584106, "learning_rate": 9.94438858771177e-05, "loss": 0.1104, "step": 6660 }, { "epoch": 0.20418158998377567, "grad_norm": 1.6555179357528687, "learning_rate": 9.944036718924572e-05, "loss": 0.0604, "step": 6670 }, { "epoch": 0.2044877093090887, "grad_norm": 1.2076518535614014, "learning_rate": 9.943683746724277e-05, "loss": 0.0687, "step": 6680 }, { "epoch": 0.20479382863440168, "grad_norm": 2.0800018310546875, "learning_rate": 9.943329671189662e-05, "loss": 0.1456, "step": 6690 }, { "epoch": 0.2050999479597147, "grad_norm": 1.7368093729019165, "learning_rate": 9.94297449239975e-05, "loss": 0.0831, "step": 6700 }, { "epoch": 0.2054060672850277, "grad_norm": 2.225728988647461, "learning_rate": 9.942618210433811e-05, "loss": 0.1595, "step": 6710 }, { "epoch": 0.2057121866103407, "grad_norm": 3.5186569690704346, "learning_rate": 9.942260825371358e-05, "loss": 0.0672, "step": 6720 }, { "epoch": 0.2060183059356537, "grad_norm": 6.002516746520996, "learning_rate": 9.941902337292155e-05, "loss": 0.1114, "step": 6730 }, { "epoch": 0.20632442526096673, "grad_norm": 4.546398162841797, "learning_rate": 9.941542746276207e-05, "loss": 0.2046, "step": 6740 }, { "epoch": 0.20663054458627972, "grad_norm": 1.18800687789917, "learning_rate": 9.941182052403769e-05, "loss": 0.0906, "step": 6750 }, { "epoch": 0.20693666391159274, "grad_norm": 3.2397372722625732, "learning_rate": 9.940820255755341e-05, "loss": 0.1173, "step": 6760 }, { "epoch": 0.20724278323690576, "grad_norm": 1.7102130651474, "learning_rate": 9.940457356411669e-05, "loss": 0.0623, "step": 6770 }, { "epoch": 0.20754890256221875, "grad_norm": 4.980820178985596, "learning_rate": 9.940093354453745e-05, "loss": 0.0929, "step": 6780 }, { "epoch": 0.20785502188753177, "grad_norm": 1.7486668825149536, "learning_rate": 9.939728249962807e-05, "loss": 0.1313, "step": 6790 }, { "epoch": 0.20816114121284476, "grad_norm": 3.5465691089630127, "learning_rate": 9.939362043020339e-05, "loss": 0.1592, "step": 6800 }, { "epoch": 0.20846726053815778, "grad_norm": 1.2126641273498535, "learning_rate": 9.938994733708074e-05, "loss": 0.1093, "step": 6810 }, { "epoch": 0.20877337986347078, "grad_norm": 1.4718961715698242, "learning_rate": 9.938626322107982e-05, "loss": 0.1298, "step": 6820 }, { "epoch": 0.2090794991887838, "grad_norm": 2.3068504333496094, "learning_rate": 9.938256808302292e-05, "loss": 0.0613, "step": 6830 }, { "epoch": 0.2093856185140968, "grad_norm": 0.8868584632873535, "learning_rate": 9.937886192373469e-05, "loss": 0.0961, "step": 6840 }, { "epoch": 0.2096917378394098, "grad_norm": 1.1997690200805664, "learning_rate": 9.937514474404228e-05, "loss": 0.1775, "step": 6850 }, { "epoch": 0.2099978571647228, "grad_norm": 1.6180576086044312, "learning_rate": 9.937141654477528e-05, "loss": 0.1777, "step": 6860 }, { "epoch": 0.21030397649003582, "grad_norm": 2.973041296005249, "learning_rate": 9.936767732676579e-05, "loss": 0.1183, "step": 6870 }, { "epoch": 0.2106100958153488, "grad_norm": 3.423482656478882, "learning_rate": 9.936392709084827e-05, "loss": 0.0899, "step": 6880 }, { "epoch": 0.21091621514066183, "grad_norm": 1.4296752214431763, "learning_rate": 9.936016583785976e-05, "loss": 0.1115, "step": 6890 }, { "epoch": 0.21122233446597483, "grad_norm": 3.6079916954040527, "learning_rate": 9.935639356863966e-05, "loss": 0.1561, "step": 6900 }, { "epoch": 0.21152845379128785, "grad_norm": 1.7855584621429443, "learning_rate": 9.935261028402989e-05, "loss": 0.0696, "step": 6910 }, { "epoch": 0.21183457311660084, "grad_norm": 1.8227019309997559, "learning_rate": 9.934881598487479e-05, "loss": 0.1207, "step": 6920 }, { "epoch": 0.21214069244191386, "grad_norm": 1.736131191253662, "learning_rate": 9.934501067202117e-05, "loss": 0.1516, "step": 6930 }, { "epoch": 0.21244681176722688, "grad_norm": 2.993750810623169, "learning_rate": 9.934119434631832e-05, "loss": 0.0903, "step": 6940 }, { "epoch": 0.21275293109253987, "grad_norm": 2.2736387252807617, "learning_rate": 9.933736700861798e-05, "loss": 0.0929, "step": 6950 }, { "epoch": 0.2130590504178529, "grad_norm": 2.1808156967163086, "learning_rate": 9.933352865977429e-05, "loss": 0.1118, "step": 6960 }, { "epoch": 0.21336516974316588, "grad_norm": 2.102316379547119, "learning_rate": 9.932967930064394e-05, "loss": 0.1323, "step": 6970 }, { "epoch": 0.2136712890684789, "grad_norm": 1.8099220991134644, "learning_rate": 9.932581893208602e-05, "loss": 0.0775, "step": 6980 }, { "epoch": 0.2139774083937919, "grad_norm": 4.140148639678955, "learning_rate": 9.932194755496207e-05, "loss": 0.1146, "step": 6990 }, { "epoch": 0.21428352771910492, "grad_norm": 2.1364634037017822, "learning_rate": 9.931806517013612e-05, "loss": 0.1044, "step": 7000 }, { "epoch": 0.2145896470444179, "grad_norm": 3.2122089862823486, "learning_rate": 9.931417177847467e-05, "loss": 0.1319, "step": 7010 }, { "epoch": 0.21489576636973093, "grad_norm": 0.8600584864616394, "learning_rate": 9.931026738084659e-05, "loss": 0.084, "step": 7020 }, { "epoch": 0.21520188569504392, "grad_norm": 1.3818918466567993, "learning_rate": 9.930635197812332e-05, "loss": 0.1628, "step": 7030 }, { "epoch": 0.21550800502035694, "grad_norm": 0.8488407135009766, "learning_rate": 9.930242557117869e-05, "loss": 0.0969, "step": 7040 }, { "epoch": 0.21581412434566993, "grad_norm": 1.7704401016235352, "learning_rate": 9.929848816088897e-05, "loss": 0.1297, "step": 7050 }, { "epoch": 0.21612024367098295, "grad_norm": 3.551626443862915, "learning_rate": 9.929453974813295e-05, "loss": 0.1363, "step": 7060 }, { "epoch": 0.21642636299629595, "grad_norm": 1.8400551080703735, "learning_rate": 9.929058033379181e-05, "loss": 0.0992, "step": 7070 }, { "epoch": 0.21673248232160897, "grad_norm": 1.663716435432434, "learning_rate": 9.928660991874925e-05, "loss": 0.1061, "step": 7080 }, { "epoch": 0.21703860164692196, "grad_norm": 2.146801471710205, "learning_rate": 9.928262850389135e-05, "loss": 0.08, "step": 7090 }, { "epoch": 0.21734472097223498, "grad_norm": 1.6850488185882568, "learning_rate": 9.92786360901067e-05, "loss": 0.1086, "step": 7100 }, { "epoch": 0.21765084029754797, "grad_norm": 2.756748676300049, "learning_rate": 9.927463267828633e-05, "loss": 0.1084, "step": 7110 }, { "epoch": 0.217956959622861, "grad_norm": 3.610898733139038, "learning_rate": 9.927061826932373e-05, "loss": 0.1326, "step": 7120 }, { "epoch": 0.218263078948174, "grad_norm": 3.659505844116211, "learning_rate": 9.926659286411484e-05, "loss": 0.0615, "step": 7130 }, { "epoch": 0.218569198273487, "grad_norm": 1.0496928691864014, "learning_rate": 9.926255646355804e-05, "loss": 0.1498, "step": 7140 }, { "epoch": 0.21887531759880002, "grad_norm": 1.8516044616699219, "learning_rate": 9.925850906855419e-05, "loss": 0.0945, "step": 7150 }, { "epoch": 0.21918143692411302, "grad_norm": 2.04975962638855, "learning_rate": 9.925445068000656e-05, "loss": 0.0922, "step": 7160 }, { "epoch": 0.21948755624942604, "grad_norm": 1.790427803993225, "learning_rate": 9.925038129882094e-05, "loss": 0.1058, "step": 7170 }, { "epoch": 0.21979367557473903, "grad_norm": 2.6955127716064453, "learning_rate": 9.924630092590552e-05, "loss": 0.0907, "step": 7180 }, { "epoch": 0.22009979490005205, "grad_norm": 2.5086872577667236, "learning_rate": 9.924220956217096e-05, "loss": 0.14, "step": 7190 }, { "epoch": 0.22040591422536504, "grad_norm": 4.453693866729736, "learning_rate": 9.923810720853037e-05, "loss": 0.1194, "step": 7200 }, { "epoch": 0.22071203355067806, "grad_norm": 0.6834319233894348, "learning_rate": 9.923399386589933e-05, "loss": 0.0899, "step": 7210 }, { "epoch": 0.22101815287599105, "grad_norm": 3.088203191757202, "learning_rate": 9.922986953519584e-05, "loss": 0.0366, "step": 7220 }, { "epoch": 0.22132427220130407, "grad_norm": 1.2310091257095337, "learning_rate": 9.92257342173404e-05, "loss": 0.1155, "step": 7230 }, { "epoch": 0.22163039152661707, "grad_norm": 1.7548909187316895, "learning_rate": 9.922158791325588e-05, "loss": 0.185, "step": 7240 }, { "epoch": 0.22193651085193009, "grad_norm": 3.1979434490203857, "learning_rate": 9.921743062386772e-05, "loss": 0.1161, "step": 7250 }, { "epoch": 0.22224263017724308, "grad_norm": 3.623196601867676, "learning_rate": 9.92132623501037e-05, "loss": 0.1071, "step": 7260 }, { "epoch": 0.2225487495025561, "grad_norm": 2.392791509628296, "learning_rate": 9.920908309289412e-05, "loss": 0.0694, "step": 7270 }, { "epoch": 0.2228548688278691, "grad_norm": 3.5523500442504883, "learning_rate": 9.92048928531717e-05, "loss": 0.083, "step": 7280 }, { "epoch": 0.2231609881531821, "grad_norm": 2.13510799407959, "learning_rate": 9.92006916318716e-05, "loss": 0.0808, "step": 7290 }, { "epoch": 0.22346710747849513, "grad_norm": 1.2910268306732178, "learning_rate": 9.919647942993148e-05, "loss": 0.0893, "step": 7300 }, { "epoch": 0.22377322680380812, "grad_norm": 1.2554218769073486, "learning_rate": 9.919225624829142e-05, "loss": 0.1065, "step": 7310 }, { "epoch": 0.22407934612912114, "grad_norm": 1.4418514966964722, "learning_rate": 9.918802208789395e-05, "loss": 0.0978, "step": 7320 }, { "epoch": 0.22438546545443414, "grad_norm": 2.0142083168029785, "learning_rate": 9.918377694968405e-05, "loss": 0.1177, "step": 7330 }, { "epoch": 0.22469158477974716, "grad_norm": 1.7757893800735474, "learning_rate": 9.917952083460916e-05, "loss": 0.1021, "step": 7340 }, { "epoch": 0.22499770410506015, "grad_norm": 1.2742432355880737, "learning_rate": 9.917525374361912e-05, "loss": 0.1077, "step": 7350 }, { "epoch": 0.22530382343037317, "grad_norm": 2.1916143894195557, "learning_rate": 9.91709756776663e-05, "loss": 0.0859, "step": 7360 }, { "epoch": 0.22560994275568616, "grad_norm": 3.293947458267212, "learning_rate": 9.916668663770551e-05, "loss": 0.1319, "step": 7370 }, { "epoch": 0.22591606208099918, "grad_norm": 1.1119506359100342, "learning_rate": 9.916238662469393e-05, "loss": 0.0922, "step": 7380 }, { "epoch": 0.22622218140631217, "grad_norm": 2.1006476879119873, "learning_rate": 9.915807563959126e-05, "loss": 0.0849, "step": 7390 }, { "epoch": 0.2265283007316252, "grad_norm": 2.0882675647735596, "learning_rate": 9.915375368335962e-05, "loss": 0.1131, "step": 7400 }, { "epoch": 0.22683442005693819, "grad_norm": 1.9564855098724365, "learning_rate": 9.914942075696358e-05, "loss": 0.084, "step": 7410 }, { "epoch": 0.2271405393822512, "grad_norm": 2.2471420764923096, "learning_rate": 9.914507686137019e-05, "loss": 0.0823, "step": 7420 }, { "epoch": 0.2274466587075642, "grad_norm": 1.3060472011566162, "learning_rate": 9.914072199754891e-05, "loss": 0.1532, "step": 7430 }, { "epoch": 0.22775277803287722, "grad_norm": 2.5147480964660645, "learning_rate": 9.913635616647166e-05, "loss": 0.1153, "step": 7440 }, { "epoch": 0.2280588973581902, "grad_norm": 1.1348732709884644, "learning_rate": 9.913197936911281e-05, "loss": 0.0491, "step": 7450 }, { "epoch": 0.22836501668350323, "grad_norm": 1.3829264640808105, "learning_rate": 9.912759160644917e-05, "loss": 0.0711, "step": 7460 }, { "epoch": 0.22867113600881625, "grad_norm": 1.4809577465057373, "learning_rate": 9.912319287946001e-05, "loss": 0.1735, "step": 7470 }, { "epoch": 0.22897725533412924, "grad_norm": 1.7469831705093384, "learning_rate": 9.911878318912706e-05, "loss": 0.1232, "step": 7480 }, { "epoch": 0.22928337465944226, "grad_norm": 1.122323989868164, "learning_rate": 9.911436253643445e-05, "loss": 0.0492, "step": 7490 }, { "epoch": 0.22958949398475526, "grad_norm": 3.000200033187866, "learning_rate": 9.910993092236878e-05, "loss": 0.0509, "step": 7500 }, { "epoch": 0.22989561331006828, "grad_norm": 0.675028920173645, "learning_rate": 9.91054883479191e-05, "loss": 0.1119, "step": 7510 }, { "epoch": 0.23020173263538127, "grad_norm": 3.86321759223938, "learning_rate": 9.910103481407695e-05, "loss": 0.1253, "step": 7520 }, { "epoch": 0.2305078519606943, "grad_norm": 2.1871607303619385, "learning_rate": 9.909657032183623e-05, "loss": 0.1187, "step": 7530 }, { "epoch": 0.23081397128600728, "grad_norm": 4.367818355560303, "learning_rate": 9.909209487219333e-05, "loss": 0.0915, "step": 7540 }, { "epoch": 0.2311200906113203, "grad_norm": 1.2064836025238037, "learning_rate": 9.908760846614709e-05, "loss": 0.1042, "step": 7550 }, { "epoch": 0.2314262099366333, "grad_norm": 1.9264320135116577, "learning_rate": 9.90831111046988e-05, "loss": 0.0814, "step": 7560 }, { "epoch": 0.2317323292619463, "grad_norm": 1.4003725051879883, "learning_rate": 9.907860278885219e-05, "loss": 0.0982, "step": 7570 }, { "epoch": 0.2320384485872593, "grad_norm": 3.164294481277466, "learning_rate": 9.907408351961338e-05, "loss": 0.1111, "step": 7580 }, { "epoch": 0.23234456791257233, "grad_norm": 2.220418930053711, "learning_rate": 9.906955329799104e-05, "loss": 0.1016, "step": 7590 }, { "epoch": 0.23265068723788532, "grad_norm": 0.7259985208511353, "learning_rate": 9.90650121249962e-05, "loss": 0.0433, "step": 7600 }, { "epoch": 0.23295680656319834, "grad_norm": 1.1489619016647339, "learning_rate": 9.906046000164236e-05, "loss": 0.131, "step": 7610 }, { "epoch": 0.23326292588851133, "grad_norm": 3.669386625289917, "learning_rate": 9.905589692894548e-05, "loss": 0.1865, "step": 7620 }, { "epoch": 0.23356904521382435, "grad_norm": 1.8767329454421997, "learning_rate": 9.905132290792394e-05, "loss": 0.0769, "step": 7630 }, { "epoch": 0.23387516453913734, "grad_norm": 0.9765557646751404, "learning_rate": 9.904673793959857e-05, "loss": 0.1079, "step": 7640 }, { "epoch": 0.23418128386445036, "grad_norm": 3.440610408782959, "learning_rate": 9.904214202499266e-05, "loss": 0.1153, "step": 7650 }, { "epoch": 0.23448740318976338, "grad_norm": 2.0724072456359863, "learning_rate": 9.90375351651319e-05, "loss": 0.0784, "step": 7660 }, { "epoch": 0.23479352251507637, "grad_norm": 1.2174110412597656, "learning_rate": 9.903291736104449e-05, "loss": 0.12, "step": 7670 }, { "epoch": 0.2350996418403894, "grad_norm": 2.819883346557617, "learning_rate": 9.902828861376101e-05, "loss": 0.1214, "step": 7680 }, { "epoch": 0.2354057611657024, "grad_norm": 1.9710304737091064, "learning_rate": 9.90236489243145e-05, "loss": 0.0896, "step": 7690 }, { "epoch": 0.2357118804910154, "grad_norm": 2.099344253540039, "learning_rate": 9.901899829374047e-05, "loss": 0.169, "step": 7700 }, { "epoch": 0.2360179998163284, "grad_norm": 3.5108983516693115, "learning_rate": 9.901433672307686e-05, "loss": 0.1367, "step": 7710 }, { "epoch": 0.23632411914164142, "grad_norm": 1.6642096042633057, "learning_rate": 9.900966421336398e-05, "loss": 0.1465, "step": 7720 }, { "epoch": 0.2366302384669544, "grad_norm": 2.9787724018096924, "learning_rate": 9.900498076564471e-05, "loss": 0.1257, "step": 7730 }, { "epoch": 0.23693635779226743, "grad_norm": 2.5805110931396484, "learning_rate": 9.900028638096428e-05, "loss": 0.0969, "step": 7740 }, { "epoch": 0.23724247711758042, "grad_norm": 0.9085254073143005, "learning_rate": 9.899558106037039e-05, "loss": 0.0856, "step": 7750 }, { "epoch": 0.23754859644289344, "grad_norm": 1.731595516204834, "learning_rate": 9.899086480491317e-05, "loss": 0.2087, "step": 7760 }, { "epoch": 0.23785471576820644, "grad_norm": 1.4720028638839722, "learning_rate": 9.89861376156452e-05, "loss": 0.0962, "step": 7770 }, { "epoch": 0.23816083509351946, "grad_norm": 2.671130418777466, "learning_rate": 9.89813994936215e-05, "loss": 0.0828, "step": 7780 }, { "epoch": 0.23846695441883245, "grad_norm": 1.8833438158035278, "learning_rate": 9.897665043989951e-05, "loss": 0.113, "step": 7790 }, { "epoch": 0.23877307374414547, "grad_norm": 1.709023118019104, "learning_rate": 9.897189045553916e-05, "loss": 0.0903, "step": 7800 }, { "epoch": 0.23907919306945846, "grad_norm": 1.6491084098815918, "learning_rate": 9.896711954160276e-05, "loss": 0.0905, "step": 7810 }, { "epoch": 0.23938531239477148, "grad_norm": 1.3380266427993774, "learning_rate": 9.896233769915509e-05, "loss": 0.1079, "step": 7820 }, { "epoch": 0.2396914317200845, "grad_norm": 1.949334979057312, "learning_rate": 9.895754492926337e-05, "loss": 0.1901, "step": 7830 }, { "epoch": 0.2399975510453975, "grad_norm": 1.3812203407287598, "learning_rate": 9.895274123299723e-05, "loss": 0.0631, "step": 7840 }, { "epoch": 0.24030367037071051, "grad_norm": 1.6412403583526611, "learning_rate": 9.89479266114288e-05, "loss": 0.1053, "step": 7850 }, { "epoch": 0.2406097896960235, "grad_norm": 1.7817423343658447, "learning_rate": 9.894310106563259e-05, "loss": 0.1714, "step": 7860 }, { "epoch": 0.24091590902133653, "grad_norm": 2.114746332168579, "learning_rate": 9.893826459668558e-05, "loss": 0.1167, "step": 7870 }, { "epoch": 0.24122202834664952, "grad_norm": 1.209485411643982, "learning_rate": 9.893341720566716e-05, "loss": 0.0814, "step": 7880 }, { "epoch": 0.24152814767196254, "grad_norm": 1.308172583580017, "learning_rate": 9.892855889365916e-05, "loss": 0.0448, "step": 7890 }, { "epoch": 0.24183426699727553, "grad_norm": 2.401151418685913, "learning_rate": 9.89236896617459e-05, "loss": 0.1193, "step": 7900 }, { "epoch": 0.24214038632258855, "grad_norm": 0.5725356936454773, "learning_rate": 9.891880951101407e-05, "loss": 0.1325, "step": 7910 }, { "epoch": 0.24244650564790154, "grad_norm": 1.180150032043457, "learning_rate": 9.891391844255284e-05, "loss": 0.1019, "step": 7920 }, { "epoch": 0.24275262497321456, "grad_norm": 2.825957775115967, "learning_rate": 9.890901645745379e-05, "loss": 0.1036, "step": 7930 }, { "epoch": 0.24305874429852756, "grad_norm": 3.0667991638183594, "learning_rate": 9.890410355681097e-05, "loss": 0.0793, "step": 7940 }, { "epoch": 0.24336486362384058, "grad_norm": 4.105109214782715, "learning_rate": 9.88991797417208e-05, "loss": 0.1527, "step": 7950 }, { "epoch": 0.24367098294915357, "grad_norm": 1.1580673456192017, "learning_rate": 9.889424501328222e-05, "loss": 0.1175, "step": 7960 }, { "epoch": 0.2439771022744666, "grad_norm": 0.5224062204360962, "learning_rate": 9.888929937259657e-05, "loss": 0.0574, "step": 7970 }, { "epoch": 0.24428322159977958, "grad_norm": 1.9450995922088623, "learning_rate": 9.888434282076758e-05, "loss": 0.0857, "step": 7980 }, { "epoch": 0.2445893409250926, "grad_norm": 1.8330403566360474, "learning_rate": 9.88793753589015e-05, "loss": 0.1462, "step": 7990 }, { "epoch": 0.2448954602504056, "grad_norm": 2.454977512359619, "learning_rate": 9.887439698810693e-05, "loss": 0.0855, "step": 8000 }, { "epoch": 0.24520157957571861, "grad_norm": 0.7472816705703735, "learning_rate": 9.886940770949498e-05, "loss": 0.0896, "step": 8010 }, { "epoch": 0.24550769890103163, "grad_norm": 1.482491374015808, "learning_rate": 9.886440752417915e-05, "loss": 0.0747, "step": 8020 }, { "epoch": 0.24581381822634463, "grad_norm": 1.7246959209442139, "learning_rate": 9.885939643327538e-05, "loss": 0.127, "step": 8030 }, { "epoch": 0.24611993755165765, "grad_norm": 1.9117499589920044, "learning_rate": 9.885437443790204e-05, "loss": 0.0506, "step": 8040 }, { "epoch": 0.24642605687697064, "grad_norm": 2.630049467086792, "learning_rate": 9.884934153917997e-05, "loss": 0.1735, "step": 8050 }, { "epoch": 0.24673217620228366, "grad_norm": 2.3429548740386963, "learning_rate": 9.884429773823239e-05, "loss": 0.0746, "step": 8060 }, { "epoch": 0.24703829552759665, "grad_norm": 0.99043208360672, "learning_rate": 9.883924303618497e-05, "loss": 0.1084, "step": 8070 }, { "epoch": 0.24734441485290967, "grad_norm": 1.7860842943191528, "learning_rate": 9.883417743416587e-05, "loss": 0.071, "step": 8080 }, { "epoch": 0.24765053417822266, "grad_norm": 2.8133249282836914, "learning_rate": 9.882910093330558e-05, "loss": 0.0668, "step": 8090 }, { "epoch": 0.24795665350353568, "grad_norm": 2.958132028579712, "learning_rate": 9.882401353473711e-05, "loss": 0.0833, "step": 8100 }, { "epoch": 0.24826277282884868, "grad_norm": 2.1089651584625244, "learning_rate": 9.881891523959585e-05, "loss": 0.1156, "step": 8110 }, { "epoch": 0.2485688921541617, "grad_norm": 1.9703330993652344, "learning_rate": 9.881380604901964e-05, "loss": 0.171, "step": 8120 }, { "epoch": 0.2488750114794747, "grad_norm": 2.2278270721435547, "learning_rate": 9.880868596414876e-05, "loss": 0.0631, "step": 8130 }, { "epoch": 0.2491811308047877, "grad_norm": 3.452984571456909, "learning_rate": 9.880355498612593e-05, "loss": 0.1321, "step": 8140 }, { "epoch": 0.2494872501301007, "grad_norm": 1.7025448083877563, "learning_rate": 9.879841311609626e-05, "loss": 0.1012, "step": 8150 }, { "epoch": 0.24979336945541372, "grad_norm": 3.340855360031128, "learning_rate": 9.879326035520732e-05, "loss": 0.1396, "step": 8160 }, { "epoch": 0.25009948878072674, "grad_norm": 1.847730278968811, "learning_rate": 9.878809670460911e-05, "loss": 0.1293, "step": 8170 }, { "epoch": 0.25040560810603973, "grad_norm": 1.4400056600570679, "learning_rate": 9.878292216545406e-05, "loss": 0.094, "step": 8180 }, { "epoch": 0.2507117274313527, "grad_norm": 2.4105467796325684, "learning_rate": 9.877773673889701e-05, "loss": 0.0716, "step": 8190 }, { "epoch": 0.2510178467566658, "grad_norm": 2.1414449214935303, "learning_rate": 9.877254042609528e-05, "loss": 0.0753, "step": 8200 }, { "epoch": 0.25132396608197877, "grad_norm": 4.589915752410889, "learning_rate": 9.876733322820856e-05, "loss": 0.1196, "step": 8210 }, { "epoch": 0.25163008540729176, "grad_norm": 2.260150909423828, "learning_rate": 9.876211514639901e-05, "loss": 0.1168, "step": 8220 }, { "epoch": 0.25193620473260475, "grad_norm": 1.8613935708999634, "learning_rate": 9.875688618183118e-05, "loss": 0.0948, "step": 8230 }, { "epoch": 0.2522423240579178, "grad_norm": 1.2993934154510498, "learning_rate": 9.87516463356721e-05, "loss": 0.0711, "step": 8240 }, { "epoch": 0.2525484433832308, "grad_norm": 3.483132839202881, "learning_rate": 9.874639560909117e-05, "loss": 0.1099, "step": 8250 }, { "epoch": 0.2528545627085438, "grad_norm": 2.079169750213623, "learning_rate": 9.87411340032603e-05, "loss": 0.1076, "step": 8260 }, { "epoch": 0.2531606820338568, "grad_norm": 6.849393367767334, "learning_rate": 9.873586151935375e-05, "loss": 0.1098, "step": 8270 }, { "epoch": 0.2534668013591698, "grad_norm": 2.8657889366149902, "learning_rate": 9.873057815854823e-05, "loss": 0.1129, "step": 8280 }, { "epoch": 0.2537729206844828, "grad_norm": 2.3948473930358887, "learning_rate": 9.87252839220229e-05, "loss": 0.0937, "step": 8290 }, { "epoch": 0.2540790400097958, "grad_norm": 1.5646295547485352, "learning_rate": 9.87199788109593e-05, "loss": 0.0951, "step": 8300 }, { "epoch": 0.2543851593351088, "grad_norm": 2.5670716762542725, "learning_rate": 9.871466282654146e-05, "loss": 0.1045, "step": 8310 }, { "epoch": 0.25469127866042185, "grad_norm": 2.523190498352051, "learning_rate": 9.87093359699558e-05, "loss": 0.1057, "step": 8320 }, { "epoch": 0.25499739798573484, "grad_norm": 2.407392978668213, "learning_rate": 9.870399824239117e-05, "loss": 0.1302, "step": 8330 }, { "epoch": 0.25530351731104783, "grad_norm": 0.9431551694869995, "learning_rate": 9.869864964503881e-05, "loss": 0.0703, "step": 8340 }, { "epoch": 0.2556096366363608, "grad_norm": 1.0593295097351074, "learning_rate": 9.869329017909248e-05, "loss": 0.1785, "step": 8350 }, { "epoch": 0.2559157559616739, "grad_norm": 1.7925031185150146, "learning_rate": 9.868791984574829e-05, "loss": 0.0757, "step": 8360 }, { "epoch": 0.25622187528698687, "grad_norm": 2.467088222503662, "learning_rate": 9.868253864620478e-05, "loss": 0.0856, "step": 8370 }, { "epoch": 0.25652799461229986, "grad_norm": 1.6726059913635254, "learning_rate": 9.867714658166294e-05, "loss": 0.0751, "step": 8380 }, { "epoch": 0.2568341139376129, "grad_norm": 1.5929301977157593, "learning_rate": 9.867174365332617e-05, "loss": 0.119, "step": 8390 }, { "epoch": 0.2571402332629259, "grad_norm": 2.771980047225952, "learning_rate": 9.86663298624003e-05, "loss": 0.0988, "step": 8400 }, { "epoch": 0.2574463525882389, "grad_norm": 2.352374315261841, "learning_rate": 9.866090521009358e-05, "loss": 0.0937, "step": 8410 }, { "epoch": 0.2577524719135519, "grad_norm": 1.5301297903060913, "learning_rate": 9.86554696976167e-05, "loss": 0.1088, "step": 8420 }, { "epoch": 0.25805859123886493, "grad_norm": 2.0454115867614746, "learning_rate": 9.865002332618274e-05, "loss": 0.144, "step": 8430 }, { "epoch": 0.2583647105641779, "grad_norm": 1.2540911436080933, "learning_rate": 9.864456609700726e-05, "loss": 0.1123, "step": 8440 }, { "epoch": 0.2586708298894909, "grad_norm": 1.6065363883972168, "learning_rate": 9.863909801130816e-05, "loss": 0.1267, "step": 8450 }, { "epoch": 0.2589769492148039, "grad_norm": 1.1264289617538452, "learning_rate": 9.863361907030584e-05, "loss": 0.0911, "step": 8460 }, { "epoch": 0.25928306854011696, "grad_norm": 1.2569090127944946, "learning_rate": 9.862812927522309e-05, "loss": 0.1043, "step": 8470 }, { "epoch": 0.25958918786542995, "grad_norm": 1.9751571416854858, "learning_rate": 9.862262862728513e-05, "loss": 0.0885, "step": 8480 }, { "epoch": 0.25989530719074294, "grad_norm": 2.481016159057617, "learning_rate": 9.86171171277196e-05, "loss": 0.0966, "step": 8490 }, { "epoch": 0.26020142651605593, "grad_norm": 1.7500247955322266, "learning_rate": 9.861159477775653e-05, "loss": 0.0939, "step": 8500 }, { "epoch": 0.260507545841369, "grad_norm": 1.7027628421783447, "learning_rate": 9.860606157862844e-05, "loss": 0.1159, "step": 8510 }, { "epoch": 0.260813665166682, "grad_norm": 3.1246449947357178, "learning_rate": 9.860051753157021e-05, "loss": 0.12, "step": 8520 }, { "epoch": 0.26111978449199497, "grad_norm": 2.449132204055786, "learning_rate": 9.859496263781916e-05, "loss": 0.0989, "step": 8530 }, { "epoch": 0.261425903817308, "grad_norm": 1.8863625526428223, "learning_rate": 9.858939689861506e-05, "loss": 0.0733, "step": 8540 }, { "epoch": 0.261732023142621, "grad_norm": 2.6189699172973633, "learning_rate": 9.858382031520005e-05, "loss": 0.0873, "step": 8550 }, { "epoch": 0.262038142467934, "grad_norm": 1.4323750734329224, "learning_rate": 9.857823288881873e-05, "loss": 0.1319, "step": 8560 }, { "epoch": 0.262344261793247, "grad_norm": 2.6214327812194824, "learning_rate": 9.85726346207181e-05, "loss": 0.1697, "step": 8570 }, { "epoch": 0.26265038111856004, "grad_norm": 1.2144218683242798, "learning_rate": 9.856702551214758e-05, "loss": 0.0546, "step": 8580 }, { "epoch": 0.26295650044387303, "grad_norm": 6.053402900695801, "learning_rate": 9.856140556435903e-05, "loss": 0.1135, "step": 8590 }, { "epoch": 0.263262619769186, "grad_norm": 1.8159583806991577, "learning_rate": 9.855577477860668e-05, "loss": 0.0974, "step": 8600 }, { "epoch": 0.263568739094499, "grad_norm": 3.339336395263672, "learning_rate": 9.855013315614725e-05, "loss": 0.0961, "step": 8610 }, { "epoch": 0.26387485841981206, "grad_norm": 0.8444164395332336, "learning_rate": 9.854448069823981e-05, "loss": 0.1355, "step": 8620 }, { "epoch": 0.26418097774512506, "grad_norm": 1.7030513286590576, "learning_rate": 9.853881740614591e-05, "loss": 0.0969, "step": 8630 }, { "epoch": 0.26448709707043805, "grad_norm": 1.8839223384857178, "learning_rate": 9.853314328112947e-05, "loss": 0.1669, "step": 8640 }, { "epoch": 0.26479321639575104, "grad_norm": 1.9995276927947998, "learning_rate": 9.852745832445685e-05, "loss": 0.1251, "step": 8650 }, { "epoch": 0.2650993357210641, "grad_norm": 0.8406396508216858, "learning_rate": 9.85217625373968e-05, "loss": 0.0971, "step": 8660 }, { "epoch": 0.2654054550463771, "grad_norm": 4.399531841278076, "learning_rate": 9.851605592122054e-05, "loss": 0.0775, "step": 8670 }, { "epoch": 0.2657115743716901, "grad_norm": 4.4648332595825195, "learning_rate": 9.851033847720166e-05, "loss": 0.1234, "step": 8680 }, { "epoch": 0.26601769369700307, "grad_norm": 3.450784921646118, "learning_rate": 9.850461020661618e-05, "loss": 0.1146, "step": 8690 }, { "epoch": 0.2663238130223161, "grad_norm": 5.410933971405029, "learning_rate": 9.849887111074256e-05, "loss": 0.1276, "step": 8700 }, { "epoch": 0.2666299323476291, "grad_norm": 1.2008389234542847, "learning_rate": 9.849312119086162e-05, "loss": 0.1825, "step": 8710 }, { "epoch": 0.2669360516729421, "grad_norm": 1.7930357456207275, "learning_rate": 9.848736044825666e-05, "loss": 0.0697, "step": 8720 }, { "epoch": 0.26724217099825515, "grad_norm": 6.452085018157959, "learning_rate": 9.848158888421335e-05, "loss": 0.0749, "step": 8730 }, { "epoch": 0.26754829032356814, "grad_norm": 3.722665309906006, "learning_rate": 9.84758065000198e-05, "loss": 0.1125, "step": 8740 }, { "epoch": 0.26785440964888113, "grad_norm": 1.6297873258590698, "learning_rate": 9.847001329696653e-05, "loss": 0.1072, "step": 8750 }, { "epoch": 0.2681605289741941, "grad_norm": 2.0728933811187744, "learning_rate": 9.846420927634645e-05, "loss": 0.0782, "step": 8760 }, { "epoch": 0.26846664829950717, "grad_norm": 5.263658046722412, "learning_rate": 9.845839443945493e-05, "loss": 0.1504, "step": 8770 }, { "epoch": 0.26877276762482016, "grad_norm": 3.1635963916778564, "learning_rate": 9.84525687875897e-05, "loss": 0.0641, "step": 8780 }, { "epoch": 0.26907888695013316, "grad_norm": 2.194575548171997, "learning_rate": 9.844673232205097e-05, "loss": 0.1028, "step": 8790 }, { "epoch": 0.26938500627544615, "grad_norm": 3.1097524166107178, "learning_rate": 9.844088504414131e-05, "loss": 0.0951, "step": 8800 }, { "epoch": 0.2696911256007592, "grad_norm": 2.3990437984466553, "learning_rate": 9.843502695516568e-05, "loss": 0.0748, "step": 8810 }, { "epoch": 0.2699972449260722, "grad_norm": 2.7200043201446533, "learning_rate": 9.842915805643155e-05, "loss": 0.0863, "step": 8820 }, { "epoch": 0.2703033642513852, "grad_norm": 2.7684333324432373, "learning_rate": 9.842327834924874e-05, "loss": 0.095, "step": 8830 }, { "epoch": 0.2706094835766982, "grad_norm": 1.049220085144043, "learning_rate": 9.841738783492944e-05, "loss": 0.1059, "step": 8840 }, { "epoch": 0.2709156029020112, "grad_norm": 2.7631466388702393, "learning_rate": 9.841148651478833e-05, "loss": 0.1541, "step": 8850 }, { "epoch": 0.2712217222273242, "grad_norm": 1.768842101097107, "learning_rate": 9.840557439014247e-05, "loss": 0.0856, "step": 8860 }, { "epoch": 0.2715278415526372, "grad_norm": 2.123788356781006, "learning_rate": 9.839965146231132e-05, "loss": 0.0937, "step": 8870 }, { "epoch": 0.2718339608779502, "grad_norm": 3.184283494949341, "learning_rate": 9.839371773261678e-05, "loss": 0.0799, "step": 8880 }, { "epoch": 0.27214008020326325, "grad_norm": 1.9813085794448853, "learning_rate": 9.838777320238312e-05, "loss": 0.0948, "step": 8890 }, { "epoch": 0.27244619952857624, "grad_norm": 2.194902181625366, "learning_rate": 9.838181787293708e-05, "loss": 0.0577, "step": 8900 }, { "epoch": 0.27275231885388923, "grad_norm": 4.697174549102783, "learning_rate": 9.837585174560773e-05, "loss": 0.1353, "step": 8910 }, { "epoch": 0.2730584381792023, "grad_norm": 1.5165328979492188, "learning_rate": 9.836987482172662e-05, "loss": 0.0581, "step": 8920 }, { "epoch": 0.27336455750451527, "grad_norm": 1.3728970289230347, "learning_rate": 9.836388710262769e-05, "loss": 0.0877, "step": 8930 }, { "epoch": 0.27367067682982826, "grad_norm": 8.531307220458984, "learning_rate": 9.835788858964726e-05, "loss": 0.081, "step": 8940 }, { "epoch": 0.27397679615514126, "grad_norm": 4.332725524902344, "learning_rate": 9.83518792841241e-05, "loss": 0.0845, "step": 8950 }, { "epoch": 0.2742829154804543, "grad_norm": 1.717137336730957, "learning_rate": 9.834585918739936e-05, "loss": 0.0664, "step": 8960 }, { "epoch": 0.2745890348057673, "grad_norm": 1.151362657546997, "learning_rate": 9.83398283008166e-05, "loss": 0.0789, "step": 8970 }, { "epoch": 0.2748951541310803, "grad_norm": 1.2243688106536865, "learning_rate": 9.833378662572183e-05, "loss": 0.1202, "step": 8980 }, { "epoch": 0.2752012734563933, "grad_norm": 1.1989778280258179, "learning_rate": 9.832773416346342e-05, "loss": 0.0746, "step": 8990 }, { "epoch": 0.27550739278170633, "grad_norm": 1.3899109363555908, "learning_rate": 9.832167091539214e-05, "loss": 0.0871, "step": 9000 }, { "epoch": 0.2758135121070193, "grad_norm": 2.3529648780822754, "learning_rate": 9.831559688286121e-05, "loss": 0.1065, "step": 9010 }, { "epoch": 0.2761196314323323, "grad_norm": 1.2400860786437988, "learning_rate": 9.830951206722622e-05, "loss": 0.0764, "step": 9020 }, { "epoch": 0.2764257507576453, "grad_norm": 2.364504098892212, "learning_rate": 9.830341646984521e-05, "loss": 0.1143, "step": 9030 }, { "epoch": 0.27673187008295835, "grad_norm": 1.1331194639205933, "learning_rate": 9.829731009207859e-05, "loss": 0.0768, "step": 9040 }, { "epoch": 0.27703798940827135, "grad_norm": 1.5548025369644165, "learning_rate": 9.829119293528916e-05, "loss": 0.1196, "step": 9050 }, { "epoch": 0.27734410873358434, "grad_norm": 1.2946772575378418, "learning_rate": 9.828506500084218e-05, "loss": 0.1143, "step": 9060 }, { "epoch": 0.2776502280588974, "grad_norm": 1.464438796043396, "learning_rate": 9.827892629010527e-05, "loss": 0.1726, "step": 9070 }, { "epoch": 0.2779563473842104, "grad_norm": 0.7962360382080078, "learning_rate": 9.827277680444848e-05, "loss": 0.1307, "step": 9080 }, { "epoch": 0.27826246670952337, "grad_norm": 2.106457471847534, "learning_rate": 9.826661654524425e-05, "loss": 0.0437, "step": 9090 }, { "epoch": 0.27856858603483636, "grad_norm": 1.6375941038131714, "learning_rate": 9.826044551386744e-05, "loss": 0.1039, "step": 9100 }, { "epoch": 0.2788747053601494, "grad_norm": 3.8801169395446777, "learning_rate": 9.825426371169529e-05, "loss": 0.211, "step": 9110 }, { "epoch": 0.2791808246854624, "grad_norm": 3.657499074935913, "learning_rate": 9.824807114010748e-05, "loss": 0.1753, "step": 9120 }, { "epoch": 0.2794869440107754, "grad_norm": 0.7428058981895447, "learning_rate": 9.824186780048605e-05, "loss": 0.0606, "step": 9130 }, { "epoch": 0.2797930633360884, "grad_norm": 1.749219536781311, "learning_rate": 9.823565369421545e-05, "loss": 0.1116, "step": 9140 }, { "epoch": 0.28009918266140144, "grad_norm": 2.245060443878174, "learning_rate": 9.82294288226826e-05, "loss": 0.1097, "step": 9150 }, { "epoch": 0.28040530198671443, "grad_norm": 1.1237412691116333, "learning_rate": 9.822319318727673e-05, "loss": 0.0868, "step": 9160 }, { "epoch": 0.2807114213120274, "grad_norm": 3.640514612197876, "learning_rate": 9.821694678938953e-05, "loss": 0.099, "step": 9170 }, { "epoch": 0.2810175406373404, "grad_norm": 2.1076064109802246, "learning_rate": 9.821068963041507e-05, "loss": 0.197, "step": 9180 }, { "epoch": 0.28132365996265346, "grad_norm": 2.4886364936828613, "learning_rate": 9.820442171174981e-05, "loss": 0.1251, "step": 9190 }, { "epoch": 0.28162977928796645, "grad_norm": 4.1933369636535645, "learning_rate": 9.819814303479267e-05, "loss": 0.1354, "step": 9200 }, { "epoch": 0.28193589861327945, "grad_norm": 2.029078722000122, "learning_rate": 9.819185360094489e-05, "loss": 0.157, "step": 9210 }, { "epoch": 0.28224201793859244, "grad_norm": 2.4823174476623535, "learning_rate": 9.818555341161015e-05, "loss": 0.0818, "step": 9220 }, { "epoch": 0.2825481372639055, "grad_norm": 0.7557628154754639, "learning_rate": 9.817924246819457e-05, "loss": 0.0392, "step": 9230 }, { "epoch": 0.2828542565892185, "grad_norm": 3.7983741760253906, "learning_rate": 9.817292077210659e-05, "loss": 0.1158, "step": 9240 }, { "epoch": 0.28316037591453147, "grad_norm": 2.7923429012298584, "learning_rate": 9.81665883247571e-05, "loss": 0.0864, "step": 9250 }, { "epoch": 0.2834664952398445, "grad_norm": 3.4032058715820312, "learning_rate": 9.816024512755937e-05, "loss": 0.0922, "step": 9260 }, { "epoch": 0.2837726145651575, "grad_norm": 1.079509973526001, "learning_rate": 9.815389118192911e-05, "loss": 0.115, "step": 9270 }, { "epoch": 0.2840787338904705, "grad_norm": 1.0681260824203491, "learning_rate": 9.814752648928436e-05, "loss": 0.0475, "step": 9280 }, { "epoch": 0.2843848532157835, "grad_norm": 1.4272516965866089, "learning_rate": 9.814115105104562e-05, "loss": 0.1723, "step": 9290 }, { "epoch": 0.28469097254109654, "grad_norm": 4.17203426361084, "learning_rate": 9.813476486863576e-05, "loss": 0.0856, "step": 9300 }, { "epoch": 0.28499709186640954, "grad_norm": 0.742900013923645, "learning_rate": 9.812836794348004e-05, "loss": 0.094, "step": 9310 }, { "epoch": 0.2853032111917225, "grad_norm": 2.543469190597534, "learning_rate": 9.812196027700615e-05, "loss": 0.0918, "step": 9320 }, { "epoch": 0.2856093305170355, "grad_norm": 1.6741896867752075, "learning_rate": 9.811554187064414e-05, "loss": 0.1062, "step": 9330 }, { "epoch": 0.28591544984234857, "grad_norm": 0.6105459928512573, "learning_rate": 9.81091127258265e-05, "loss": 0.0653, "step": 9340 }, { "epoch": 0.28622156916766156, "grad_norm": 1.700282096862793, "learning_rate": 9.810267284398804e-05, "loss": 0.1276, "step": 9350 }, { "epoch": 0.28652768849297455, "grad_norm": 2.070998430252075, "learning_rate": 9.809622222656606e-05, "loss": 0.0854, "step": 9360 }, { "epoch": 0.28683380781828755, "grad_norm": 2.600511312484741, "learning_rate": 9.808976087500021e-05, "loss": 0.1186, "step": 9370 }, { "epoch": 0.2871399271436006, "grad_norm": 1.4905434846878052, "learning_rate": 9.808328879073251e-05, "loss": 0.0793, "step": 9380 }, { "epoch": 0.2874460464689136, "grad_norm": 0.9937986731529236, "learning_rate": 9.807680597520746e-05, "loss": 0.0971, "step": 9390 }, { "epoch": 0.2877521657942266, "grad_norm": 1.568678379058838, "learning_rate": 9.807031242987181e-05, "loss": 0.092, "step": 9400 }, { "epoch": 0.28805828511953957, "grad_norm": 1.4511319398880005, "learning_rate": 9.806380815617489e-05, "loss": 0.0432, "step": 9410 }, { "epoch": 0.2883644044448526, "grad_norm": 0.9382933378219604, "learning_rate": 9.805729315556825e-05, "loss": 0.1322, "step": 9420 }, { "epoch": 0.2886705237701656, "grad_norm": 3.4938080310821533, "learning_rate": 9.805076742950595e-05, "loss": 0.1246, "step": 9430 }, { "epoch": 0.2889766430954786, "grad_norm": 1.5257649421691895, "learning_rate": 9.804423097944439e-05, "loss": 0.1034, "step": 9440 }, { "epoch": 0.28928276242079165, "grad_norm": 4.91271448135376, "learning_rate": 9.803768380684242e-05, "loss": 0.1331, "step": 9450 }, { "epoch": 0.28958888174610464, "grad_norm": 2.367832660675049, "learning_rate": 9.803112591316119e-05, "loss": 0.12, "step": 9460 }, { "epoch": 0.28989500107141764, "grad_norm": 1.4538675546646118, "learning_rate": 9.80245572998643e-05, "loss": 0.0705, "step": 9470 }, { "epoch": 0.2902011203967306, "grad_norm": 1.7277566194534302, "learning_rate": 9.801797796841778e-05, "loss": 0.1002, "step": 9480 }, { "epoch": 0.2905072397220437, "grad_norm": 1.2240797281265259, "learning_rate": 9.801138792028996e-05, "loss": 0.1108, "step": 9490 }, { "epoch": 0.29081335904735667, "grad_norm": 3.3451905250549316, "learning_rate": 9.800478715695163e-05, "loss": 0.0878, "step": 9500 }, { "epoch": 0.29111947837266966, "grad_norm": 1.874456524848938, "learning_rate": 9.799817567987597e-05, "loss": 0.1228, "step": 9510 }, { "epoch": 0.29142559769798265, "grad_norm": 1.0535386800765991, "learning_rate": 9.799155349053851e-05, "loss": 0.0547, "step": 9520 }, { "epoch": 0.2917317170232957, "grad_norm": 1.8777551651000977, "learning_rate": 9.79849205904172e-05, "loss": 0.1188, "step": 9530 }, { "epoch": 0.2920378363486087, "grad_norm": 1.2258338928222656, "learning_rate": 9.797827698099238e-05, "loss": 0.0738, "step": 9540 }, { "epoch": 0.2923439556739217, "grad_norm": 1.2403132915496826, "learning_rate": 9.797162266374676e-05, "loss": 0.0903, "step": 9550 }, { "epoch": 0.2926500749992347, "grad_norm": 1.9955670833587646, "learning_rate": 9.796495764016548e-05, "loss": 0.0487, "step": 9560 }, { "epoch": 0.2929561943245477, "grad_norm": 1.4181959629058838, "learning_rate": 9.795828191173602e-05, "loss": 0.0928, "step": 9570 }, { "epoch": 0.2932623136498607, "grad_norm": 1.9099206924438477, "learning_rate": 9.79515954799483e-05, "loss": 0.127, "step": 9580 }, { "epoch": 0.2935684329751737, "grad_norm": 1.9413686990737915, "learning_rate": 9.794489834629455e-05, "loss": 0.0708, "step": 9590 }, { "epoch": 0.2938745523004867, "grad_norm": 3.8492274284362793, "learning_rate": 9.79381905122695e-05, "loss": 0.1599, "step": 9600 }, { "epoch": 0.29418067162579975, "grad_norm": 1.3420981168746948, "learning_rate": 9.793147197937019e-05, "loss": 0.1192, "step": 9610 }, { "epoch": 0.29448679095111274, "grad_norm": 1.1397960186004639, "learning_rate": 9.792474274909604e-05, "loss": 0.1491, "step": 9620 }, { "epoch": 0.29479291027642573, "grad_norm": 1.5817359685897827, "learning_rate": 9.791800282294891e-05, "loss": 0.1171, "step": 9630 }, { "epoch": 0.2950990296017388, "grad_norm": 2.090073347091675, "learning_rate": 9.791125220243303e-05, "loss": 0.1565, "step": 9640 }, { "epoch": 0.2954051489270518, "grad_norm": 2.1590442657470703, "learning_rate": 9.790449088905495e-05, "loss": 0.0902, "step": 9650 }, { "epoch": 0.29571126825236477, "grad_norm": 1.2035187482833862, "learning_rate": 9.789771888432375e-05, "loss": 0.0625, "step": 9660 }, { "epoch": 0.29601738757767776, "grad_norm": 2.4882919788360596, "learning_rate": 9.789093618975074e-05, "loss": 0.142, "step": 9670 }, { "epoch": 0.2963235069029908, "grad_norm": 3.165327548980713, "learning_rate": 9.788414280684972e-05, "loss": 0.1493, "step": 9680 }, { "epoch": 0.2966296262283038, "grad_norm": 1.4025465250015259, "learning_rate": 9.787733873713682e-05, "loss": 0.0486, "step": 9690 }, { "epoch": 0.2969357455536168, "grad_norm": 2.2610771656036377, "learning_rate": 9.787052398213062e-05, "loss": 0.1162, "step": 9700 }, { "epoch": 0.2972418648789298, "grad_norm": 1.0529038906097412, "learning_rate": 9.786369854335198e-05, "loss": 0.0957, "step": 9710 }, { "epoch": 0.29754798420424283, "grad_norm": 3.4704833030700684, "learning_rate": 9.785686242232426e-05, "loss": 0.1176, "step": 9720 }, { "epoch": 0.2978541035295558, "grad_norm": 1.5511928796768188, "learning_rate": 9.785001562057309e-05, "loss": 0.0967, "step": 9730 }, { "epoch": 0.2981602228548688, "grad_norm": 3.028200149536133, "learning_rate": 9.784315813962662e-05, "loss": 0.0874, "step": 9740 }, { "epoch": 0.2984663421801818, "grad_norm": 1.1339192390441895, "learning_rate": 9.783628998101524e-05, "loss": 0.0623, "step": 9750 }, { "epoch": 0.29877246150549486, "grad_norm": 2.720989227294922, "learning_rate": 9.782941114627181e-05, "loss": 0.1473, "step": 9760 }, { "epoch": 0.29907858083080785, "grad_norm": 1.89190673828125, "learning_rate": 9.782252163693158e-05, "loss": 0.2201, "step": 9770 }, { "epoch": 0.29938470015612084, "grad_norm": 2.197136402130127, "learning_rate": 9.781562145453212e-05, "loss": 0.0893, "step": 9780 }, { "epoch": 0.2996908194814339, "grad_norm": 2.711585521697998, "learning_rate": 9.780871060061344e-05, "loss": 0.0945, "step": 9790 }, { "epoch": 0.2999969388067469, "grad_norm": 2.5299928188323975, "learning_rate": 9.780178907671789e-05, "loss": 0.1311, "step": 9800 }, { "epoch": 0.3003030581320599, "grad_norm": 1.6482499837875366, "learning_rate": 9.779485688439023e-05, "loss": 0.0468, "step": 9810 }, { "epoch": 0.30060917745737287, "grad_norm": 1.597687840461731, "learning_rate": 9.778791402517757e-05, "loss": 0.0684, "step": 9820 }, { "epoch": 0.3009152967826859, "grad_norm": 2.5657145977020264, "learning_rate": 9.778096050062947e-05, "loss": 0.1035, "step": 9830 }, { "epoch": 0.3012214161079989, "grad_norm": 0.9659326672554016, "learning_rate": 9.777399631229777e-05, "loss": 0.1073, "step": 9840 }, { "epoch": 0.3015275354333119, "grad_norm": 3.83663272857666, "learning_rate": 9.776702146173678e-05, "loss": 0.1356, "step": 9850 }, { "epoch": 0.3018336547586249, "grad_norm": 1.2196873426437378, "learning_rate": 9.776003595050313e-05, "loss": 0.094, "step": 9860 }, { "epoch": 0.30213977408393794, "grad_norm": 2.199186086654663, "learning_rate": 9.775303978015585e-05, "loss": 0.1044, "step": 9870 }, { "epoch": 0.30244589340925093, "grad_norm": 1.3421063423156738, "learning_rate": 9.774603295225636e-05, "loss": 0.1124, "step": 9880 }, { "epoch": 0.3027520127345639, "grad_norm": 2.089191436767578, "learning_rate": 9.773901546836844e-05, "loss": 0.0742, "step": 9890 }, { "epoch": 0.3030581320598769, "grad_norm": 2.0137929916381836, "learning_rate": 9.773198733005826e-05, "loss": 0.0841, "step": 9900 }, { "epoch": 0.30336425138518996, "grad_norm": 1.6326284408569336, "learning_rate": 9.772494853889438e-05, "loss": 0.0934, "step": 9910 }, { "epoch": 0.30367037071050296, "grad_norm": 0.8576561212539673, "learning_rate": 9.771789909644768e-05, "loss": 0.0708, "step": 9920 }, { "epoch": 0.30397649003581595, "grad_norm": 2.853802442550659, "learning_rate": 9.77108390042915e-05, "loss": 0.0857, "step": 9930 }, { "epoch": 0.30428260936112894, "grad_norm": 2.122769594192505, "learning_rate": 9.77037682640015e-05, "loss": 0.1005, "step": 9940 }, { "epoch": 0.304588728686442, "grad_norm": 1.3511682748794556, "learning_rate": 9.769668687715572e-05, "loss": 0.08, "step": 9950 }, { "epoch": 0.304894848011755, "grad_norm": 0.8437986969947815, "learning_rate": 9.76895948453346e-05, "loss": 0.151, "step": 9960 }, { "epoch": 0.305200967337068, "grad_norm": 1.5106908082962036, "learning_rate": 9.768249217012096e-05, "loss": 0.1077, "step": 9970 }, { "epoch": 0.305507086662381, "grad_norm": 3.811744451522827, "learning_rate": 9.767537885309996e-05, "loss": 0.0868, "step": 9980 }, { "epoch": 0.305813205987694, "grad_norm": 1.123874306678772, "learning_rate": 9.766825489585915e-05, "loss": 0.0544, "step": 9990 }, { "epoch": 0.306119325313007, "grad_norm": 1.7879579067230225, "learning_rate": 9.766112029998846e-05, "loss": 0.1455, "step": 10000 }, { "epoch": 0.30642544463832, "grad_norm": 1.0249699354171753, "learning_rate": 9.765397506708023e-05, "loss": 0.1071, "step": 10010 }, { "epoch": 0.30673156396363305, "grad_norm": 3.908276319503784, "learning_rate": 9.764681919872908e-05, "loss": 0.1558, "step": 10020 }, { "epoch": 0.30703768328894604, "grad_norm": 2.9735913276672363, "learning_rate": 9.763965269653211e-05, "loss": 0.0581, "step": 10030 }, { "epoch": 0.30734380261425903, "grad_norm": 1.7602760791778564, "learning_rate": 9.76324755620887e-05, "loss": 0.0756, "step": 10040 }, { "epoch": 0.307649921939572, "grad_norm": 2.445547342300415, "learning_rate": 9.762528779700067e-05, "loss": 0.111, "step": 10050 }, { "epoch": 0.30795604126488507, "grad_norm": 1.423012137413025, "learning_rate": 9.76180894028722e-05, "loss": 0.11, "step": 10060 }, { "epoch": 0.30826216059019806, "grad_norm": 1.5245476961135864, "learning_rate": 9.761088038130983e-05, "loss": 0.0809, "step": 10070 }, { "epoch": 0.30856827991551106, "grad_norm": 1.722937822341919, "learning_rate": 9.760366073392246e-05, "loss": 0.0925, "step": 10080 }, { "epoch": 0.30887439924082405, "grad_norm": 0.7855029702186584, "learning_rate": 9.759643046232138e-05, "loss": 0.074, "step": 10090 }, { "epoch": 0.3091805185661371, "grad_norm": 0.7298972010612488, "learning_rate": 9.758918956812024e-05, "loss": 0.109, "step": 10100 }, { "epoch": 0.3094866378914501, "grad_norm": 1.380223274230957, "learning_rate": 9.758193805293508e-05, "loss": 0.0642, "step": 10110 }, { "epoch": 0.3097927572167631, "grad_norm": 1.607657790184021, "learning_rate": 9.757467591838429e-05, "loss": 0.0722, "step": 10120 }, { "epoch": 0.3100988765420761, "grad_norm": 2.252207040786743, "learning_rate": 9.756740316608864e-05, "loss": 0.0746, "step": 10130 }, { "epoch": 0.3104049958673891, "grad_norm": 0.31838300824165344, "learning_rate": 9.756011979767128e-05, "loss": 0.0586, "step": 10140 }, { "epoch": 0.3107111151927021, "grad_norm": 2.7551803588867188, "learning_rate": 9.755282581475769e-05, "loss": 0.0799, "step": 10150 }, { "epoch": 0.3110172345180151, "grad_norm": 1.5491517782211304, "learning_rate": 9.754552121897575e-05, "loss": 0.0806, "step": 10160 }, { "epoch": 0.31132335384332815, "grad_norm": 1.5549142360687256, "learning_rate": 9.753820601195573e-05, "loss": 0.0842, "step": 10170 }, { "epoch": 0.31162947316864115, "grad_norm": 1.635538935661316, "learning_rate": 9.75308801953302e-05, "loss": 0.1147, "step": 10180 }, { "epoch": 0.31193559249395414, "grad_norm": 2.09708833694458, "learning_rate": 9.75235437707342e-05, "loss": 0.1265, "step": 10190 }, { "epoch": 0.31224171181926713, "grad_norm": 0.4747856557369232, "learning_rate": 9.751619673980504e-05, "loss": 0.1035, "step": 10200 }, { "epoch": 0.3125478311445802, "grad_norm": 1.8101648092269897, "learning_rate": 9.750883910418242e-05, "loss": 0.0686, "step": 10210 }, { "epoch": 0.31285395046989317, "grad_norm": 4.77341365814209, "learning_rate": 9.750147086550844e-05, "loss": 0.1122, "step": 10220 }, { "epoch": 0.31316006979520616, "grad_norm": 0.48354363441467285, "learning_rate": 9.749409202542755e-05, "loss": 0.0838, "step": 10230 }, { "epoch": 0.31346618912051916, "grad_norm": 1.6966552734375, "learning_rate": 9.748670258558656e-05, "loss": 0.1173, "step": 10240 }, { "epoch": 0.3137723084458322, "grad_norm": 2.4418389797210693, "learning_rate": 9.747930254763467e-05, "loss": 0.1169, "step": 10250 }, { "epoch": 0.3140784277711452, "grad_norm": 1.2398344278335571, "learning_rate": 9.747189191322337e-05, "loss": 0.1345, "step": 10260 }, { "epoch": 0.3143845470964582, "grad_norm": 1.7130587100982666, "learning_rate": 9.746447068400664e-05, "loss": 0.0662, "step": 10270 }, { "epoch": 0.3146906664217712, "grad_norm": 3.8370800018310547, "learning_rate": 9.74570388616407e-05, "loss": 0.1133, "step": 10280 }, { "epoch": 0.31499678574708423, "grad_norm": 2.1765575408935547, "learning_rate": 9.744959644778422e-05, "loss": 0.0897, "step": 10290 }, { "epoch": 0.3153029050723972, "grad_norm": 2.135916233062744, "learning_rate": 9.744214344409819e-05, "loss": 0.1033, "step": 10300 }, { "epoch": 0.3156090243977102, "grad_norm": 3.163672924041748, "learning_rate": 9.743467985224597e-05, "loss": 0.0758, "step": 10310 }, { "epoch": 0.31591514372302326, "grad_norm": 0.6859220862388611, "learning_rate": 9.74272056738933e-05, "loss": 0.1046, "step": 10320 }, { "epoch": 0.31622126304833625, "grad_norm": 0.9044406414031982, "learning_rate": 9.741972091070826e-05, "loss": 0.0918, "step": 10330 }, { "epoch": 0.31652738237364925, "grad_norm": 1.6162575483322144, "learning_rate": 9.741222556436132e-05, "loss": 0.0698, "step": 10340 }, { "epoch": 0.31683350169896224, "grad_norm": 2.7123749256134033, "learning_rate": 9.740471963652529e-05, "loss": 0.1463, "step": 10350 }, { "epoch": 0.3171396210242753, "grad_norm": 1.6510311365127563, "learning_rate": 9.739720312887535e-05, "loss": 0.1491, "step": 10360 }, { "epoch": 0.3174457403495883, "grad_norm": 2.3170084953308105, "learning_rate": 9.738967604308902e-05, "loss": 0.1041, "step": 10370 }, { "epoch": 0.31775185967490127, "grad_norm": 2.252295970916748, "learning_rate": 9.738213838084621e-05, "loss": 0.1162, "step": 10380 }, { "epoch": 0.31805797900021426, "grad_norm": 1.4030131101608276, "learning_rate": 9.73745901438292e-05, "loss": 0.0881, "step": 10390 }, { "epoch": 0.3183640983255273, "grad_norm": 1.3684661388397217, "learning_rate": 9.736703133372258e-05, "loss": 0.0959, "step": 10400 }, { "epoch": 0.3186702176508403, "grad_norm": 1.1458405256271362, "learning_rate": 9.735946195221335e-05, "loss": 0.0657, "step": 10410 }, { "epoch": 0.3189763369761533, "grad_norm": 2.623811960220337, "learning_rate": 9.735188200099083e-05, "loss": 0.0805, "step": 10420 }, { "epoch": 0.3192824563014663, "grad_norm": 2.338008403778076, "learning_rate": 9.734429148174675e-05, "loss": 0.1537, "step": 10430 }, { "epoch": 0.31958857562677934, "grad_norm": 1.5616565942764282, "learning_rate": 9.733669039617513e-05, "loss": 0.0784, "step": 10440 }, { "epoch": 0.31989469495209233, "grad_norm": 1.5770268440246582, "learning_rate": 9.732907874597242e-05, "loss": 0.1253, "step": 10450 }, { "epoch": 0.3202008142774053, "grad_norm": 1.9312416315078735, "learning_rate": 9.732145653283736e-05, "loss": 0.0892, "step": 10460 }, { "epoch": 0.3205069336027183, "grad_norm": 1.5965973138809204, "learning_rate": 9.73138237584711e-05, "loss": 0.0853, "step": 10470 }, { "epoch": 0.32081305292803136, "grad_norm": 0.9397444128990173, "learning_rate": 9.730618042457711e-05, "loss": 0.0927, "step": 10480 }, { "epoch": 0.32111917225334435, "grad_norm": 1.4492167234420776, "learning_rate": 9.729852653286127e-05, "loss": 0.1079, "step": 10490 }, { "epoch": 0.32142529157865735, "grad_norm": 1.2235288619995117, "learning_rate": 9.729086208503174e-05, "loss": 0.1348, "step": 10500 }, { "epoch": 0.3217314109039704, "grad_norm": 1.0259045362472534, "learning_rate": 9.728318708279909e-05, "loss": 0.079, "step": 10510 }, { "epoch": 0.3220375302292834, "grad_norm": 6.507325172424316, "learning_rate": 9.727550152787623e-05, "loss": 0.0636, "step": 10520 }, { "epoch": 0.3223436495545964, "grad_norm": 19.467769622802734, "learning_rate": 9.726780542197844e-05, "loss": 0.1571, "step": 10530 }, { "epoch": 0.32264976887990937, "grad_norm": 3.00909686088562, "learning_rate": 9.726009876682333e-05, "loss": 0.0637, "step": 10540 }, { "epoch": 0.3229558882052224, "grad_norm": 2.6689260005950928, "learning_rate": 9.725238156413089e-05, "loss": 0.1078, "step": 10550 }, { "epoch": 0.3232620075305354, "grad_norm": 1.4631553888320923, "learning_rate": 9.724465381562342e-05, "loss": 0.1176, "step": 10560 }, { "epoch": 0.3235681268558484, "grad_norm": 2.795293092727661, "learning_rate": 9.723691552302562e-05, "loss": 0.1195, "step": 10570 }, { "epoch": 0.3238742461811614, "grad_norm": 0.8562132716178894, "learning_rate": 9.722916668806454e-05, "loss": 0.0926, "step": 10580 }, { "epoch": 0.32418036550647444, "grad_norm": 1.5246907472610474, "learning_rate": 9.722140731246956e-05, "loss": 0.0609, "step": 10590 }, { "epoch": 0.32448648483178744, "grad_norm": 0.7407625913619995, "learning_rate": 9.721363739797242e-05, "loss": 0.047, "step": 10600 }, { "epoch": 0.32479260415710043, "grad_norm": 2.616672992706299, "learning_rate": 9.720585694630721e-05, "loss": 0.0691, "step": 10610 }, { "epoch": 0.3250987234824134, "grad_norm": 0.9457241892814636, "learning_rate": 9.719806595921037e-05, "loss": 0.0852, "step": 10620 }, { "epoch": 0.32540484280772647, "grad_norm": 2.929642677307129, "learning_rate": 9.719026443842071e-05, "loss": 0.1213, "step": 10630 }, { "epoch": 0.32571096213303946, "grad_norm": 6.163235187530518, "learning_rate": 9.718245238567939e-05, "loss": 0.1256, "step": 10640 }, { "epoch": 0.32601708145835245, "grad_norm": 1.8274846076965332, "learning_rate": 9.71746298027299e-05, "loss": 0.0507, "step": 10650 }, { "epoch": 0.32632320078366545, "grad_norm": 1.4754546880722046, "learning_rate": 9.716679669131806e-05, "loss": 0.0938, "step": 10660 }, { "epoch": 0.3266293201089785, "grad_norm": 2.937162399291992, "learning_rate": 9.715895305319209e-05, "loss": 0.0616, "step": 10670 }, { "epoch": 0.3269354394342915, "grad_norm": 1.5422263145446777, "learning_rate": 9.715109889010256e-05, "loss": 0.0749, "step": 10680 }, { "epoch": 0.3272415587596045, "grad_norm": 1.5061594247817993, "learning_rate": 9.714323420380234e-05, "loss": 0.0753, "step": 10690 }, { "epoch": 0.3275476780849175, "grad_norm": 2.9967257976531982, "learning_rate": 9.713535899604666e-05, "loss": 0.118, "step": 10700 }, { "epoch": 0.3278537974102305, "grad_norm": 3.9753053188323975, "learning_rate": 9.712747326859315e-05, "loss": 0.1105, "step": 10710 }, { "epoch": 0.3281599167355435, "grad_norm": 1.6708104610443115, "learning_rate": 9.711957702320175e-05, "loss": 0.0596, "step": 10720 }, { "epoch": 0.3284660360608565, "grad_norm": 0.8681694269180298, "learning_rate": 9.711167026163473e-05, "loss": 0.1112, "step": 10730 }, { "epoch": 0.32877215538616955, "grad_norm": 3.1901907920837402, "learning_rate": 9.710375298565672e-05, "loss": 0.1153, "step": 10740 }, { "epoch": 0.32907827471148254, "grad_norm": 5.432855129241943, "learning_rate": 9.70958251970347e-05, "loss": 0.122, "step": 10750 }, { "epoch": 0.32938439403679554, "grad_norm": 1.9388885498046875, "learning_rate": 9.708788689753805e-05, "loss": 0.0592, "step": 10760 }, { "epoch": 0.32969051336210853, "grad_norm": 1.076128363609314, "learning_rate": 9.707993808893838e-05, "loss": 0.1568, "step": 10770 }, { "epoch": 0.3299966326874216, "grad_norm": 1.2440675497055054, "learning_rate": 9.707197877300974e-05, "loss": 0.0598, "step": 10780 }, { "epoch": 0.33030275201273457, "grad_norm": 1.3350582122802734, "learning_rate": 9.70640089515285e-05, "loss": 0.1544, "step": 10790 }, { "epoch": 0.33060887133804756, "grad_norm": 0.9731342196464539, "learning_rate": 9.705602862627335e-05, "loss": 0.0698, "step": 10800 }, { "epoch": 0.33091499066336055, "grad_norm": 1.2542915344238281, "learning_rate": 9.704803779902537e-05, "loss": 0.0528, "step": 10810 }, { "epoch": 0.3312211099886736, "grad_norm": 0.9512651562690735, "learning_rate": 9.704003647156795e-05, "loss": 0.1312, "step": 10820 }, { "epoch": 0.3315272293139866, "grad_norm": 2.5870168209075928, "learning_rate": 9.703202464568679e-05, "loss": 0.0877, "step": 10830 }, { "epoch": 0.3318333486392996, "grad_norm": 2.379193067550659, "learning_rate": 9.702400232317003e-05, "loss": 0.1495, "step": 10840 }, { "epoch": 0.33213946796461263, "grad_norm": 1.5660991668701172, "learning_rate": 9.701596950580806e-05, "loss": 0.0772, "step": 10850 }, { "epoch": 0.3324455872899256, "grad_norm": 2.031839370727539, "learning_rate": 9.700792619539368e-05, "loss": 0.0757, "step": 10860 }, { "epoch": 0.3327517066152386, "grad_norm": 1.06702721118927, "learning_rate": 9.699987239372198e-05, "loss": 0.1192, "step": 10870 }, { "epoch": 0.3330578259405516, "grad_norm": 1.9646339416503906, "learning_rate": 9.699180810259043e-05, "loss": 0.1087, "step": 10880 }, { "epoch": 0.33336394526586466, "grad_norm": 2.224531412124634, "learning_rate": 9.698373332379877e-05, "loss": 0.1188, "step": 10890 }, { "epoch": 0.33367006459117765, "grad_norm": 1.329424262046814, "learning_rate": 9.697564805914922e-05, "loss": 0.0898, "step": 10900 }, { "epoch": 0.33397618391649064, "grad_norm": 3.7547450065612793, "learning_rate": 9.696755231044618e-05, "loss": 0.1272, "step": 10910 }, { "epoch": 0.33428230324180364, "grad_norm": 2.0063366889953613, "learning_rate": 9.695944607949649e-05, "loss": 0.0757, "step": 10920 }, { "epoch": 0.3345884225671167, "grad_norm": 1.1477597951889038, "learning_rate": 9.695132936810934e-05, "loss": 0.0646, "step": 10930 }, { "epoch": 0.3348945418924297, "grad_norm": 2.1506316661834717, "learning_rate": 9.694320217809616e-05, "loss": 0.105, "step": 10940 }, { "epoch": 0.33520066121774267, "grad_norm": 2.3385813236236572, "learning_rate": 9.693506451127082e-05, "loss": 0.1572, "step": 10950 }, { "epoch": 0.33550678054305566, "grad_norm": 1.536656379699707, "learning_rate": 9.692691636944947e-05, "loss": 0.0839, "step": 10960 }, { "epoch": 0.3358128998683687, "grad_norm": 1.6184978485107422, "learning_rate": 9.691875775445063e-05, "loss": 0.0928, "step": 10970 }, { "epoch": 0.3361190191936817, "grad_norm": 2.9701082706451416, "learning_rate": 9.691058866809514e-05, "loss": 0.1368, "step": 10980 }, { "epoch": 0.3364251385189947, "grad_norm": 1.5958423614501953, "learning_rate": 9.690240911220618e-05, "loss": 0.085, "step": 10990 }, { "epoch": 0.3367312578443077, "grad_norm": 2.0901613235473633, "learning_rate": 9.689421908860927e-05, "loss": 0.1398, "step": 11000 }, { "epoch": 0.33703737716962073, "grad_norm": 0.779874324798584, "learning_rate": 9.688601859913226e-05, "loss": 0.0688, "step": 11010 }, { "epoch": 0.3373434964949337, "grad_norm": 1.8597877025604248, "learning_rate": 9.687780764560537e-05, "loss": 0.0879, "step": 11020 }, { "epoch": 0.3376496158202467, "grad_norm": 1.5335710048675537, "learning_rate": 9.686958622986109e-05, "loss": 0.1239, "step": 11030 }, { "epoch": 0.33795573514555977, "grad_norm": 2.9628546237945557, "learning_rate": 9.686135435373428e-05, "loss": 0.0763, "step": 11040 }, { "epoch": 0.33826185447087276, "grad_norm": 1.4601588249206543, "learning_rate": 9.685311201906215e-05, "loss": 0.0481, "step": 11050 }, { "epoch": 0.33856797379618575, "grad_norm": 3.5252232551574707, "learning_rate": 9.684485922768422e-05, "loss": 0.1128, "step": 11060 }, { "epoch": 0.33887409312149874, "grad_norm": 1.3233957290649414, "learning_rate": 9.683659598144239e-05, "loss": 0.0807, "step": 11070 }, { "epoch": 0.3391802124468118, "grad_norm": 1.2178308963775635, "learning_rate": 9.682832228218079e-05, "loss": 0.1831, "step": 11080 }, { "epoch": 0.3394863317721248, "grad_norm": 1.7241348028182983, "learning_rate": 9.682003813174601e-05, "loss": 0.0991, "step": 11090 }, { "epoch": 0.3397924510974378, "grad_norm": 0.6112222075462341, "learning_rate": 9.681174353198687e-05, "loss": 0.1105, "step": 11100 }, { "epoch": 0.34009857042275077, "grad_norm": 1.4654452800750732, "learning_rate": 9.680343848475457e-05, "loss": 0.0983, "step": 11110 }, { "epoch": 0.3404046897480638, "grad_norm": 1.9700126647949219, "learning_rate": 9.679512299190267e-05, "loss": 0.1108, "step": 11120 }, { "epoch": 0.3407108090733768, "grad_norm": 1.4940730333328247, "learning_rate": 9.6786797055287e-05, "loss": 0.1102, "step": 11130 }, { "epoch": 0.3410169283986898, "grad_norm": 2.004081964492798, "learning_rate": 9.677846067676572e-05, "loss": 0.0869, "step": 11140 }, { "epoch": 0.3413230477240028, "grad_norm": 1.9196646213531494, "learning_rate": 9.677011385819941e-05, "loss": 0.089, "step": 11150 }, { "epoch": 0.34162916704931584, "grad_norm": 3.4433953762054443, "learning_rate": 9.676175660145086e-05, "loss": 0.1284, "step": 11160 }, { "epoch": 0.34193528637462883, "grad_norm": 1.1688241958618164, "learning_rate": 9.675338890838527e-05, "loss": 0.0709, "step": 11170 }, { "epoch": 0.3422414056999418, "grad_norm": 5.422648906707764, "learning_rate": 9.674501078087018e-05, "loss": 0.0504, "step": 11180 }, { "epoch": 0.3425475250252548, "grad_norm": 1.7840934991836548, "learning_rate": 9.673662222077537e-05, "loss": 0.1255, "step": 11190 }, { "epoch": 0.34285364435056787, "grad_norm": 1.8883945941925049, "learning_rate": 9.672822322997305e-05, "loss": 0.1063, "step": 11200 }, { "epoch": 0.34315976367588086, "grad_norm": 1.1659624576568604, "learning_rate": 9.671981381033767e-05, "loss": 0.0557, "step": 11210 }, { "epoch": 0.34346588300119385, "grad_norm": 2.7860188484191895, "learning_rate": 9.671139396374608e-05, "loss": 0.0994, "step": 11220 }, { "epoch": 0.3437720023265069, "grad_norm": 2.68568754196167, "learning_rate": 9.670296369207743e-05, "loss": 0.1491, "step": 11230 }, { "epoch": 0.3440781216518199, "grad_norm": 1.3166067600250244, "learning_rate": 9.669452299721316e-05, "loss": 0.1114, "step": 11240 }, { "epoch": 0.3443842409771329, "grad_norm": 2.9512133598327637, "learning_rate": 9.668607188103708e-05, "loss": 0.1097, "step": 11250 }, { "epoch": 0.3446903603024459, "grad_norm": 3.348801374435425, "learning_rate": 9.667761034543535e-05, "loss": 0.1546, "step": 11260 }, { "epoch": 0.3449964796277589, "grad_norm": 1.5960410833358765, "learning_rate": 9.66691383922964e-05, "loss": 0.0619, "step": 11270 }, { "epoch": 0.3453025989530719, "grad_norm": 1.0051345825195312, "learning_rate": 9.666065602351097e-05, "loss": 0.0889, "step": 11280 }, { "epoch": 0.3456087182783849, "grad_norm": 2.7992115020751953, "learning_rate": 9.665216324097222e-05, "loss": 0.0753, "step": 11290 }, { "epoch": 0.3459148376036979, "grad_norm": 2.619053840637207, "learning_rate": 9.664366004657552e-05, "loss": 0.0848, "step": 11300 }, { "epoch": 0.34622095692901095, "grad_norm": 3.15210223197937, "learning_rate": 9.663514644221866e-05, "loss": 0.0835, "step": 11310 }, { "epoch": 0.34652707625432394, "grad_norm": 0.8709501624107361, "learning_rate": 9.662662242980169e-05, "loss": 0.0866, "step": 11320 }, { "epoch": 0.34683319557963693, "grad_norm": 1.026875615119934, "learning_rate": 9.6618088011227e-05, "loss": 0.2016, "step": 11330 }, { "epoch": 0.3471393149049499, "grad_norm": 2.247967004776001, "learning_rate": 9.660954318839933e-05, "loss": 0.0876, "step": 11340 }, { "epoch": 0.347445434230263, "grad_norm": 2.163813829421997, "learning_rate": 9.660098796322569e-05, "loss": 0.0956, "step": 11350 }, { "epoch": 0.34775155355557597, "grad_norm": 2.2411060333251953, "learning_rate": 9.659242233761547e-05, "loss": 0.1093, "step": 11360 }, { "epoch": 0.34805767288088896, "grad_norm": 3.0251238346099854, "learning_rate": 9.658384631348034e-05, "loss": 0.0785, "step": 11370 }, { "epoch": 0.34836379220620195, "grad_norm": 3.978628158569336, "learning_rate": 9.657525989273428e-05, "loss": 0.0827, "step": 11380 }, { "epoch": 0.348669911531515, "grad_norm": 1.1360535621643066, "learning_rate": 9.656666307729363e-05, "loss": 0.0547, "step": 11390 }, { "epoch": 0.348976030856828, "grad_norm": 2.898461103439331, "learning_rate": 9.655805586907704e-05, "loss": 0.1405, "step": 11400 }, { "epoch": 0.349282150182141, "grad_norm": 1.872139573097229, "learning_rate": 9.654943827000548e-05, "loss": 0.1234, "step": 11410 }, { "epoch": 0.34958826950745403, "grad_norm": 1.2998340129852295, "learning_rate": 9.654081028200219e-05, "loss": 0.0801, "step": 11420 }, { "epoch": 0.349894388832767, "grad_norm": 1.256026029586792, "learning_rate": 9.653217190699282e-05, "loss": 0.1128, "step": 11430 }, { "epoch": 0.35020050815808, "grad_norm": 2.2667794227600098, "learning_rate": 9.652352314690524e-05, "loss": 0.0925, "step": 11440 }, { "epoch": 0.350506627483393, "grad_norm": 4.2149739265441895, "learning_rate": 9.651486400366972e-05, "loss": 0.0766, "step": 11450 }, { "epoch": 0.35081274680870606, "grad_norm": 3.931305408477783, "learning_rate": 9.650619447921881e-05, "loss": 0.1101, "step": 11460 }, { "epoch": 0.35111886613401905, "grad_norm": 1.8098517656326294, "learning_rate": 9.649751457548737e-05, "loss": 0.0979, "step": 11470 }, { "epoch": 0.35142498545933204, "grad_norm": 2.404709815979004, "learning_rate": 9.648882429441257e-05, "loss": 0.1134, "step": 11480 }, { "epoch": 0.35173110478464503, "grad_norm": 5.482476711273193, "learning_rate": 9.648012363793395e-05, "loss": 0.1552, "step": 11490 }, { "epoch": 0.3520372241099581, "grad_norm": 7.628167629241943, "learning_rate": 9.64714126079933e-05, "loss": 0.0811, "step": 11500 }, { "epoch": 0.3523433434352711, "grad_norm": 0.9070479273796082, "learning_rate": 9.646269120653474e-05, "loss": 0.0634, "step": 11510 }, { "epoch": 0.35264946276058406, "grad_norm": 1.4987833499908447, "learning_rate": 9.645395943550476e-05, "loss": 0.0791, "step": 11520 }, { "epoch": 0.35295558208589706, "grad_norm": 1.999185562133789, "learning_rate": 9.64452172968521e-05, "loss": 0.0804, "step": 11530 }, { "epoch": 0.3532617014112101, "grad_norm": 9.1586275100708, "learning_rate": 9.643646479252784e-05, "loss": 0.1104, "step": 11540 }, { "epoch": 0.3535678207365231, "grad_norm": 1.3768631219863892, "learning_rate": 9.642770192448536e-05, "loss": 0.1242, "step": 11550 }, { "epoch": 0.3538739400618361, "grad_norm": 0.6404541730880737, "learning_rate": 9.641892869468037e-05, "loss": 0.0739, "step": 11560 }, { "epoch": 0.35418005938714914, "grad_norm": 2.3043599128723145, "learning_rate": 9.641014510507087e-05, "loss": 0.1262, "step": 11570 }, { "epoch": 0.35448617871246213, "grad_norm": 0.6082026362419128, "learning_rate": 9.640135115761721e-05, "loss": 0.0854, "step": 11580 }, { "epoch": 0.3547922980377751, "grad_norm": 0.8055647015571594, "learning_rate": 9.639254685428201e-05, "loss": 0.0647, "step": 11590 }, { "epoch": 0.3550984173630881, "grad_norm": 2.716341733932495, "learning_rate": 9.638373219703024e-05, "loss": 0.1168, "step": 11600 }, { "epoch": 0.35540453668840116, "grad_norm": 0.91561359167099, "learning_rate": 9.637490718782914e-05, "loss": 0.1171, "step": 11610 }, { "epoch": 0.35571065601371415, "grad_norm": 2.7038228511810303, "learning_rate": 9.636607182864827e-05, "loss": 0.1204, "step": 11620 }, { "epoch": 0.35601677533902715, "grad_norm": 2.394864082336426, "learning_rate": 9.635722612145954e-05, "loss": 0.1263, "step": 11630 }, { "epoch": 0.35632289466434014, "grad_norm": 1.666427493095398, "learning_rate": 9.634837006823714e-05, "loss": 0.1085, "step": 11640 }, { "epoch": 0.3566290139896532, "grad_norm": 1.7806757688522339, "learning_rate": 9.633950367095756e-05, "loss": 0.0678, "step": 11650 }, { "epoch": 0.3569351333149662, "grad_norm": 1.0015203952789307, "learning_rate": 9.63306269315996e-05, "loss": 0.0663, "step": 11660 }, { "epoch": 0.35724125264027917, "grad_norm": 2.0356311798095703, "learning_rate": 9.632173985214438e-05, "loss": 0.0677, "step": 11670 }, { "epoch": 0.35754737196559216, "grad_norm": 0.7418850064277649, "learning_rate": 9.631284243457533e-05, "loss": 0.0833, "step": 11680 }, { "epoch": 0.3578534912909052, "grad_norm": 2.657531499862671, "learning_rate": 9.630393468087818e-05, "loss": 0.1415, "step": 11690 }, { "epoch": 0.3581596106162182, "grad_norm": 2.0118746757507324, "learning_rate": 9.629501659304096e-05, "loss": 0.0885, "step": 11700 }, { "epoch": 0.3584657299415312, "grad_norm": 2.9051902294158936, "learning_rate": 9.628608817305403e-05, "loss": 0.1258, "step": 11710 }, { "epoch": 0.3587718492668442, "grad_norm": 0.7109959721565247, "learning_rate": 9.627714942291001e-05, "loss": 0.0741, "step": 11720 }, { "epoch": 0.35907796859215724, "grad_norm": 1.5541315078735352, "learning_rate": 9.626820034460388e-05, "loss": 0.085, "step": 11730 }, { "epoch": 0.35938408791747023, "grad_norm": 3.670504093170166, "learning_rate": 9.62592409401329e-05, "loss": 0.0934, "step": 11740 }, { "epoch": 0.3596902072427832, "grad_norm": 10.897790908813477, "learning_rate": 9.625027121149663e-05, "loss": 0.11, "step": 11750 }, { "epoch": 0.35999632656809627, "grad_norm": 2.647651195526123, "learning_rate": 9.624129116069694e-05, "loss": 0.0928, "step": 11760 }, { "epoch": 0.36030244589340926, "grad_norm": 1.3748407363891602, "learning_rate": 9.623230078973802e-05, "loss": 0.122, "step": 11770 }, { "epoch": 0.36060856521872225, "grad_norm": 1.4044488668441772, "learning_rate": 9.622330010062632e-05, "loss": 0.0742, "step": 11780 }, { "epoch": 0.36091468454403525, "grad_norm": 2.137383460998535, "learning_rate": 9.621428909537062e-05, "loss": 0.0716, "step": 11790 }, { "epoch": 0.3612208038693483, "grad_norm": 5.487400531768799, "learning_rate": 9.620526777598202e-05, "loss": 0.127, "step": 11800 }, { "epoch": 0.3615269231946613, "grad_norm": 1.5318713188171387, "learning_rate": 9.619623614447389e-05, "loss": 0.0566, "step": 11810 }, { "epoch": 0.3618330425199743, "grad_norm": 1.0365670919418335, "learning_rate": 9.618719420286191e-05, "loss": 0.0689, "step": 11820 }, { "epoch": 0.36213916184528727, "grad_norm": 2.72540545463562, "learning_rate": 9.617814195316411e-05, "loss": 0.0654, "step": 11830 }, { "epoch": 0.3624452811706003, "grad_norm": 1.6201409101486206, "learning_rate": 9.61690793974007e-05, "loss": 0.0742, "step": 11840 }, { "epoch": 0.3627514004959133, "grad_norm": 1.1613458395004272, "learning_rate": 9.616000653759433e-05, "loss": 0.0893, "step": 11850 }, { "epoch": 0.3630575198212263, "grad_norm": 1.7210279703140259, "learning_rate": 9.615092337576988e-05, "loss": 0.0848, "step": 11860 }, { "epoch": 0.3633636391465393, "grad_norm": 0.822706937789917, "learning_rate": 9.614182991395448e-05, "loss": 0.0795, "step": 11870 }, { "epoch": 0.36366975847185234, "grad_norm": 1.5478063821792603, "learning_rate": 9.613272615417769e-05, "loss": 0.0678, "step": 11880 }, { "epoch": 0.36397587779716534, "grad_norm": 1.801391363143921, "learning_rate": 9.612361209847125e-05, "loss": 0.1297, "step": 11890 }, { "epoch": 0.36428199712247833, "grad_norm": 0.4378199279308319, "learning_rate": 9.611448774886924e-05, "loss": 0.1292, "step": 11900 }, { "epoch": 0.3645881164477913, "grad_norm": 3.1130027770996094, "learning_rate": 9.610535310740805e-05, "loss": 0.0884, "step": 11910 }, { "epoch": 0.36489423577310437, "grad_norm": 1.1158876419067383, "learning_rate": 9.609620817612634e-05, "loss": 0.0827, "step": 11920 }, { "epoch": 0.36520035509841736, "grad_norm": 1.9424448013305664, "learning_rate": 9.60870529570651e-05, "loss": 0.0767, "step": 11930 }, { "epoch": 0.36550647442373035, "grad_norm": 1.3918670415878296, "learning_rate": 9.60778874522676e-05, "loss": 0.122, "step": 11940 }, { "epoch": 0.3658125937490434, "grad_norm": 3.9453015327453613, "learning_rate": 9.606871166377938e-05, "loss": 0.0767, "step": 11950 }, { "epoch": 0.3661187130743564, "grad_norm": 1.0377246141433716, "learning_rate": 9.605952559364833e-05, "loss": 0.1208, "step": 11960 }, { "epoch": 0.3664248323996694, "grad_norm": 1.8074084520339966, "learning_rate": 9.605032924392457e-05, "loss": 0.0786, "step": 11970 }, { "epoch": 0.3667309517249824, "grad_norm": 2.0449681282043457, "learning_rate": 9.604112261666054e-05, "loss": 0.1066, "step": 11980 }, { "epoch": 0.3670370710502954, "grad_norm": 1.9962010383605957, "learning_rate": 9.603190571391102e-05, "loss": 0.072, "step": 11990 }, { "epoch": 0.3673431903756084, "grad_norm": 1.0591236352920532, "learning_rate": 9.602267853773301e-05, "loss": 0.0554, "step": 12000 }, { "epoch": 0.3676493097009214, "grad_norm": 1.539754033088684, "learning_rate": 9.601344109018586e-05, "loss": 0.0739, "step": 12010 }, { "epoch": 0.3679554290262344, "grad_norm": 1.6070619821548462, "learning_rate": 9.60041933733312e-05, "loss": 0.1251, "step": 12020 }, { "epoch": 0.36826154835154745, "grad_norm": 1.6473312377929688, "learning_rate": 9.59949353892329e-05, "loss": 0.0779, "step": 12030 }, { "epoch": 0.36856766767686044, "grad_norm": 0.7518571019172668, "learning_rate": 9.598566713995718e-05, "loss": 0.068, "step": 12040 }, { "epoch": 0.36887378700217344, "grad_norm": 1.5032440423965454, "learning_rate": 9.597638862757255e-05, "loss": 0.0842, "step": 12050 }, { "epoch": 0.36917990632748643, "grad_norm": 1.3396837711334229, "learning_rate": 9.596709985414975e-05, "loss": 0.0449, "step": 12060 }, { "epoch": 0.3694860256527995, "grad_norm": 1.4806891679763794, "learning_rate": 9.595780082176191e-05, "loss": 0.0886, "step": 12070 }, { "epoch": 0.36979214497811247, "grad_norm": 0.9186739921569824, "learning_rate": 9.594849153248438e-05, "loss": 0.0459, "step": 12080 }, { "epoch": 0.37009826430342546, "grad_norm": 3.428168535232544, "learning_rate": 9.593917198839479e-05, "loss": 0.0631, "step": 12090 }, { "epoch": 0.3704043836287385, "grad_norm": 2.652672052383423, "learning_rate": 9.592984219157308e-05, "loss": 0.1508, "step": 12100 }, { "epoch": 0.3707105029540515, "grad_norm": 1.5446950197219849, "learning_rate": 9.59205021441015e-05, "loss": 0.1019, "step": 12110 }, { "epoch": 0.3710166222793645, "grad_norm": 1.2984801530838013, "learning_rate": 9.591115184806456e-05, "loss": 0.0672, "step": 12120 }, { "epoch": 0.3713227416046775, "grad_norm": 3.0230214595794678, "learning_rate": 9.590179130554908e-05, "loss": 0.147, "step": 12130 }, { "epoch": 0.37162886092999053, "grad_norm": 4.982631206512451, "learning_rate": 9.589242051864413e-05, "loss": 0.1013, "step": 12140 }, { "epoch": 0.3719349802553035, "grad_norm": 1.2376598119735718, "learning_rate": 9.588303948944109e-05, "loss": 0.0656, "step": 12150 }, { "epoch": 0.3722410995806165, "grad_norm": 2.863232374191284, "learning_rate": 9.587364822003362e-05, "loss": 0.0935, "step": 12160 }, { "epoch": 0.3725472189059295, "grad_norm": 1.102233648300171, "learning_rate": 9.586424671251769e-05, "loss": 0.0675, "step": 12170 }, { "epoch": 0.37285333823124256, "grad_norm": 1.5799063444137573, "learning_rate": 9.58548349689915e-05, "loss": 0.108, "step": 12180 }, { "epoch": 0.37315945755655555, "grad_norm": 1.2824606895446777, "learning_rate": 9.58454129915556e-05, "loss": 0.0586, "step": 12190 }, { "epoch": 0.37346557688186854, "grad_norm": 5.779458522796631, "learning_rate": 9.58359807823128e-05, "loss": 0.0931, "step": 12200 }, { "epoch": 0.37377169620718154, "grad_norm": 1.5653777122497559, "learning_rate": 9.582653834336815e-05, "loss": 0.2148, "step": 12210 }, { "epoch": 0.3740778155324946, "grad_norm": 0.6316955089569092, "learning_rate": 9.581708567682904e-05, "loss": 0.1117, "step": 12220 }, { "epoch": 0.3743839348578076, "grad_norm": 2.682523250579834, "learning_rate": 9.580762278480513e-05, "loss": 0.1185, "step": 12230 }, { "epoch": 0.37469005418312057, "grad_norm": 1.0376918315887451, "learning_rate": 9.579814966940833e-05, "loss": 0.0693, "step": 12240 }, { "epoch": 0.37499617350843356, "grad_norm": 4.0359930992126465, "learning_rate": 9.578866633275288e-05, "loss": 0.0943, "step": 12250 }, { "epoch": 0.3753022928337466, "grad_norm": 2.0417373180389404, "learning_rate": 9.577917277695525e-05, "loss": 0.0978, "step": 12260 }, { "epoch": 0.3756084121590596, "grad_norm": 2.088052272796631, "learning_rate": 9.576966900413426e-05, "loss": 0.1435, "step": 12270 }, { "epoch": 0.3759145314843726, "grad_norm": 1.6307413578033447, "learning_rate": 9.576015501641091e-05, "loss": 0.0767, "step": 12280 }, { "epoch": 0.37622065080968564, "grad_norm": 1.892928957939148, "learning_rate": 9.575063081590859e-05, "loss": 0.1117, "step": 12290 }, { "epoch": 0.37652677013499863, "grad_norm": 3.6255135536193848, "learning_rate": 9.574109640475288e-05, "loss": 0.0699, "step": 12300 }, { "epoch": 0.3768328894603116, "grad_norm": 2.470275640487671, "learning_rate": 9.573155178507172e-05, "loss": 0.0683, "step": 12310 }, { "epoch": 0.3771390087856246, "grad_norm": 0.6138864755630493, "learning_rate": 9.572199695899522e-05, "loss": 0.0656, "step": 12320 }, { "epoch": 0.37744512811093767, "grad_norm": 2.4065535068511963, "learning_rate": 9.571243192865588e-05, "loss": 0.0867, "step": 12330 }, { "epoch": 0.37775124743625066, "grad_norm": 13.843074798583984, "learning_rate": 9.570285669618842e-05, "loss": 0.06, "step": 12340 }, { "epoch": 0.37805736676156365, "grad_norm": 2.963845729827881, "learning_rate": 9.569327126372984e-05, "loss": 0.1036, "step": 12350 }, { "epoch": 0.37836348608687664, "grad_norm": 1.3915051221847534, "learning_rate": 9.568367563341944e-05, "loss": 0.0601, "step": 12360 }, { "epoch": 0.3786696054121897, "grad_norm": 1.165898084640503, "learning_rate": 9.567406980739875e-05, "loss": 0.0972, "step": 12370 }, { "epoch": 0.3789757247375027, "grad_norm": 3.303825855255127, "learning_rate": 9.566445378781162e-05, "loss": 0.1224, "step": 12380 }, { "epoch": 0.3792818440628157, "grad_norm": 1.8773754835128784, "learning_rate": 9.565482757680415e-05, "loss": 0.1202, "step": 12390 }, { "epoch": 0.37958796338812867, "grad_norm": 1.9032129049301147, "learning_rate": 9.564519117652473e-05, "loss": 0.0767, "step": 12400 }, { "epoch": 0.3798940827134417, "grad_norm": 2.0278472900390625, "learning_rate": 9.563554458912403e-05, "loss": 0.0759, "step": 12410 }, { "epoch": 0.3802002020387547, "grad_norm": 1.1034749746322632, "learning_rate": 9.562588781675495e-05, "loss": 0.1607, "step": 12420 }, { "epoch": 0.3805063213640677, "grad_norm": 3.443660259246826, "learning_rate": 9.561622086157272e-05, "loss": 0.0433, "step": 12430 }, { "epoch": 0.3808124406893807, "grad_norm": 0.5192626118659973, "learning_rate": 9.560654372573481e-05, "loss": 0.1046, "step": 12440 }, { "epoch": 0.38111856001469374, "grad_norm": 2.655177354812622, "learning_rate": 9.559685641140098e-05, "loss": 0.0879, "step": 12450 }, { "epoch": 0.38142467934000673, "grad_norm": 2.477210760116577, "learning_rate": 9.558715892073323e-05, "loss": 0.0724, "step": 12460 }, { "epoch": 0.3817307986653197, "grad_norm": 0.9763163924217224, "learning_rate": 9.557745125589586e-05, "loss": 0.097, "step": 12470 }, { "epoch": 0.3820369179906328, "grad_norm": 0.7471407055854797, "learning_rate": 9.556773341905546e-05, "loss": 0.0493, "step": 12480 }, { "epoch": 0.38234303731594577, "grad_norm": 0.823295533657074, "learning_rate": 9.55580054123808e-05, "loss": 0.0852, "step": 12490 }, { "epoch": 0.38264915664125876, "grad_norm": 6.486001968383789, "learning_rate": 9.554826723804303e-05, "loss": 0.0849, "step": 12500 }, { "epoch": 0.38295527596657175, "grad_norm": 1.891080379486084, "learning_rate": 9.553851889821553e-05, "loss": 0.1107, "step": 12510 }, { "epoch": 0.3832613952918848, "grad_norm": 1.111156702041626, "learning_rate": 9.55287603950739e-05, "loss": 0.0822, "step": 12520 }, { "epoch": 0.3835675146171978, "grad_norm": 2.1680684089660645, "learning_rate": 9.551899173079607e-05, "loss": 0.0989, "step": 12530 }, { "epoch": 0.3838736339425108, "grad_norm": 2.179661512374878, "learning_rate": 9.550921290756222e-05, "loss": 0.0913, "step": 12540 }, { "epoch": 0.3841797532678238, "grad_norm": 1.8815556764602661, "learning_rate": 9.54994239275548e-05, "loss": 0.0887, "step": 12550 }, { "epoch": 0.3844858725931368, "grad_norm": 3.9691805839538574, "learning_rate": 9.548962479295849e-05, "loss": 0.1369, "step": 12560 }, { "epoch": 0.3847919919184498, "grad_norm": 6.041999340057373, "learning_rate": 9.54798155059603e-05, "loss": 0.1097, "step": 12570 }, { "epoch": 0.3850981112437628, "grad_norm": 2.505753755569458, "learning_rate": 9.546999606874947e-05, "loss": 0.081, "step": 12580 }, { "epoch": 0.3854042305690758, "grad_norm": 1.158669352531433, "learning_rate": 9.546016648351748e-05, "loss": 0.0711, "step": 12590 }, { "epoch": 0.38571034989438885, "grad_norm": 0.734747052192688, "learning_rate": 9.545032675245813e-05, "loss": 0.0649, "step": 12600 }, { "epoch": 0.38601646921970184, "grad_norm": 1.0432038307189941, "learning_rate": 9.544047687776745e-05, "loss": 0.0542, "step": 12610 }, { "epoch": 0.38632258854501483, "grad_norm": 2.8592467308044434, "learning_rate": 9.543061686164373e-05, "loss": 0.088, "step": 12620 }, { "epoch": 0.3866287078703279, "grad_norm": 4.251460075378418, "learning_rate": 9.542074670628753e-05, "loss": 0.1453, "step": 12630 }, { "epoch": 0.3869348271956409, "grad_norm": 1.3175849914550781, "learning_rate": 9.54108664139017e-05, "loss": 0.0827, "step": 12640 }, { "epoch": 0.38724094652095387, "grad_norm": 1.7044323682785034, "learning_rate": 9.540097598669134e-05, "loss": 0.1019, "step": 12650 }, { "epoch": 0.38754706584626686, "grad_norm": 2.5961716175079346, "learning_rate": 9.539107542686377e-05, "loss": 0.1084, "step": 12660 }, { "epoch": 0.3878531851715799, "grad_norm": 1.5662176609039307, "learning_rate": 9.538116473662861e-05, "loss": 0.0831, "step": 12670 }, { "epoch": 0.3881593044968929, "grad_norm": 0.9793252348899841, "learning_rate": 9.537124391819774e-05, "loss": 0.1039, "step": 12680 }, { "epoch": 0.3884654238222059, "grad_norm": 1.190667748451233, "learning_rate": 9.536131297378529e-05, "loss": 0.0698, "step": 12690 }, { "epoch": 0.3887715431475189, "grad_norm": 1.5505279302597046, "learning_rate": 9.535137190560766e-05, "loss": 0.1129, "step": 12700 }, { "epoch": 0.38907766247283193, "grad_norm": 1.395700454711914, "learning_rate": 9.534142071588348e-05, "loss": 0.0724, "step": 12710 }, { "epoch": 0.3893837817981449, "grad_norm": 3.3887224197387695, "learning_rate": 9.53314594068337e-05, "loss": 0.0952, "step": 12720 }, { "epoch": 0.3896899011234579, "grad_norm": 2.0047996044158936, "learning_rate": 9.532148798068147e-05, "loss": 0.0694, "step": 12730 }, { "epoch": 0.3899960204487709, "grad_norm": 2.551248073577881, "learning_rate": 9.531150643965223e-05, "loss": 0.1181, "step": 12740 }, { "epoch": 0.39030213977408396, "grad_norm": 2.140549659729004, "learning_rate": 9.530151478597366e-05, "loss": 0.0716, "step": 12750 }, { "epoch": 0.39060825909939695, "grad_norm": 1.8936657905578613, "learning_rate": 9.529151302187569e-05, "loss": 0.0941, "step": 12760 }, { "epoch": 0.39091437842470994, "grad_norm": 2.5999364852905273, "learning_rate": 9.528150114959053e-05, "loss": 0.1057, "step": 12770 }, { "epoch": 0.39122049775002293, "grad_norm": 1.4767500162124634, "learning_rate": 9.527147917135265e-05, "loss": 0.0582, "step": 12780 }, { "epoch": 0.391526617075336, "grad_norm": 1.1517871618270874, "learning_rate": 9.526144708939874e-05, "loss": 0.0735, "step": 12790 }, { "epoch": 0.391832736400649, "grad_norm": 1.3957666158676147, "learning_rate": 9.525140490596778e-05, "loss": 0.081, "step": 12800 }, { "epoch": 0.39213885572596197, "grad_norm": 1.0439659357070923, "learning_rate": 9.524135262330098e-05, "loss": 0.0692, "step": 12810 }, { "epoch": 0.392444975051275, "grad_norm": 0.6107520461082458, "learning_rate": 9.523129024364182e-05, "loss": 0.046, "step": 12820 }, { "epoch": 0.392751094376588, "grad_norm": 1.4503040313720703, "learning_rate": 9.522121776923602e-05, "loss": 0.0526, "step": 12830 }, { "epoch": 0.393057213701901, "grad_norm": 2.0084376335144043, "learning_rate": 9.521113520233158e-05, "loss": 0.1128, "step": 12840 }, { "epoch": 0.393363333027214, "grad_norm": 1.08815598487854, "learning_rate": 9.520104254517872e-05, "loss": 0.1085, "step": 12850 }, { "epoch": 0.39366945235252704, "grad_norm": 1.3055720329284668, "learning_rate": 9.519093980002993e-05, "loss": 0.0524, "step": 12860 }, { "epoch": 0.39397557167784003, "grad_norm": 1.4523377418518066, "learning_rate": 9.518082696913994e-05, "loss": 0.1055, "step": 12870 }, { "epoch": 0.394281691003153, "grad_norm": 1.8215625286102295, "learning_rate": 9.517070405476575e-05, "loss": 0.0595, "step": 12880 }, { "epoch": 0.394587810328466, "grad_norm": 1.8068543672561646, "learning_rate": 9.516057105916658e-05, "loss": 0.0808, "step": 12890 }, { "epoch": 0.39489392965377906, "grad_norm": 0.9998023509979248, "learning_rate": 9.515042798460392e-05, "loss": 0.063, "step": 12900 }, { "epoch": 0.39520004897909206, "grad_norm": 2.6491010189056396, "learning_rate": 9.514027483334154e-05, "loss": 0.0634, "step": 12910 }, { "epoch": 0.39550616830440505, "grad_norm": 2.5022404193878174, "learning_rate": 9.513011160764537e-05, "loss": 0.1449, "step": 12920 }, { "epoch": 0.39581228762971804, "grad_norm": 1.9516102075576782, "learning_rate": 9.51199383097837e-05, "loss": 0.0909, "step": 12930 }, { "epoch": 0.3961184069550311, "grad_norm": 1.0449331998825073, "learning_rate": 9.510975494202698e-05, "loss": 0.0676, "step": 12940 }, { "epoch": 0.3964245262803441, "grad_norm": 1.1806339025497437, "learning_rate": 9.509956150664796e-05, "loss": 0.1059, "step": 12950 }, { "epoch": 0.3967306456056571, "grad_norm": 1.719538927078247, "learning_rate": 9.508935800592159e-05, "loss": 0.0965, "step": 12960 }, { "epoch": 0.39703676493097007, "grad_norm": 1.3198596239089966, "learning_rate": 9.50791444421251e-05, "loss": 0.1446, "step": 12970 }, { "epoch": 0.3973428842562831, "grad_norm": 3.2393815517425537, "learning_rate": 9.5068920817538e-05, "loss": 0.0712, "step": 12980 }, { "epoch": 0.3976490035815961, "grad_norm": 1.5750102996826172, "learning_rate": 9.505868713444197e-05, "loss": 0.0788, "step": 12990 }, { "epoch": 0.3979551229069091, "grad_norm": 3.5020782947540283, "learning_rate": 9.504844339512095e-05, "loss": 0.0699, "step": 13000 }, { "epoch": 0.39826124223222215, "grad_norm": 2.386610269546509, "learning_rate": 9.50381896018612e-05, "loss": 0.1662, "step": 13010 }, { "epoch": 0.39856736155753514, "grad_norm": 1.466571569442749, "learning_rate": 9.502792575695112e-05, "loss": 0.0682, "step": 13020 }, { "epoch": 0.39887348088284813, "grad_norm": 3.2662880420684814, "learning_rate": 9.501765186268142e-05, "loss": 0.1692, "step": 13030 }, { "epoch": 0.3991796002081611, "grad_norm": 2.420297622680664, "learning_rate": 9.500736792134501e-05, "loss": 0.1714, "step": 13040 }, { "epoch": 0.39948571953347417, "grad_norm": 2.0185201168060303, "learning_rate": 9.49970739352371e-05, "loss": 0.0553, "step": 13050 }, { "epoch": 0.39979183885878716, "grad_norm": 1.3933488130569458, "learning_rate": 9.498676990665509e-05, "loss": 0.1362, "step": 13060 }, { "epoch": 0.40009795818410016, "grad_norm": 1.2758276462554932, "learning_rate": 9.497645583789864e-05, "loss": 0.1026, "step": 13070 }, { "epoch": 0.40040407750941315, "grad_norm": 0.7596147656440735, "learning_rate": 9.496613173126968e-05, "loss": 0.1438, "step": 13080 }, { "epoch": 0.4007101968347262, "grad_norm": 1.4819891452789307, "learning_rate": 9.49557975890723e-05, "loss": 0.0675, "step": 13090 }, { "epoch": 0.4010163161600392, "grad_norm": 0.9984959959983826, "learning_rate": 9.494545341361292e-05, "loss": 0.0794, "step": 13100 }, { "epoch": 0.4013224354853522, "grad_norm": 1.6495821475982666, "learning_rate": 9.493509920720012e-05, "loss": 0.0967, "step": 13110 }, { "epoch": 0.4016285548106652, "grad_norm": 1.5241929292678833, "learning_rate": 9.49247349721448e-05, "loss": 0.1611, "step": 13120 }, { "epoch": 0.4019346741359782, "grad_norm": 1.345292568206787, "learning_rate": 9.491436071076004e-05, "loss": 0.0399, "step": 13130 }, { "epoch": 0.4022407934612912, "grad_norm": 3.2791318893432617, "learning_rate": 9.490397642536117e-05, "loss": 0.1626, "step": 13140 }, { "epoch": 0.4025469127866042, "grad_norm": 2.5817465782165527, "learning_rate": 9.489358211826576e-05, "loss": 0.1287, "step": 13150 }, { "epoch": 0.40285303211191725, "grad_norm": 4.349579334259033, "learning_rate": 9.488317779179361e-05, "loss": 0.0873, "step": 13160 }, { "epoch": 0.40315915143723025, "grad_norm": 2.5451393127441406, "learning_rate": 9.487276344826679e-05, "loss": 0.1176, "step": 13170 }, { "epoch": 0.40346527076254324, "grad_norm": 1.2099024057388306, "learning_rate": 9.486233909000957e-05, "loss": 0.0902, "step": 13180 }, { "epoch": 0.40377139008785623, "grad_norm": 3.7079951763153076, "learning_rate": 9.485190471934843e-05, "loss": 0.1449, "step": 13190 }, { "epoch": 0.4040775094131693, "grad_norm": 2.6306815147399902, "learning_rate": 9.484146033861217e-05, "loss": 0.1074, "step": 13200 }, { "epoch": 0.40438362873848227, "grad_norm": 2.4785962104797363, "learning_rate": 9.483100595013175e-05, "loss": 0.0969, "step": 13210 }, { "epoch": 0.40468974806379526, "grad_norm": 1.9351024627685547, "learning_rate": 9.482054155624038e-05, "loss": 0.0867, "step": 13220 }, { "epoch": 0.40499586738910826, "grad_norm": 2.247373580932617, "learning_rate": 9.481006715927351e-05, "loss": 0.0683, "step": 13230 }, { "epoch": 0.4053019867144213, "grad_norm": 3.7536182403564453, "learning_rate": 9.479958276156884e-05, "loss": 0.1162, "step": 13240 }, { "epoch": 0.4056081060397343, "grad_norm": 0.8160581588745117, "learning_rate": 9.478908836546629e-05, "loss": 0.0897, "step": 13250 }, { "epoch": 0.4059142253650473, "grad_norm": 1.2417911291122437, "learning_rate": 9.477858397330796e-05, "loss": 0.089, "step": 13260 }, { "epoch": 0.4062203446903603, "grad_norm": 1.7100048065185547, "learning_rate": 9.476806958743828e-05, "loss": 0.1014, "step": 13270 }, { "epoch": 0.40652646401567333, "grad_norm": 1.4237340688705444, "learning_rate": 9.475754521020381e-05, "loss": 0.086, "step": 13280 }, { "epoch": 0.4068325833409863, "grad_norm": 0.9541651606559753, "learning_rate": 9.474701084395343e-05, "loss": 0.0647, "step": 13290 }, { "epoch": 0.4071387026662993, "grad_norm": 2.0203640460968018, "learning_rate": 9.473646649103818e-05, "loss": 0.1069, "step": 13300 }, { "epoch": 0.4074448219916123, "grad_norm": 0.8138912320137024, "learning_rate": 9.472591215381136e-05, "loss": 0.0545, "step": 13310 }, { "epoch": 0.40775094131692535, "grad_norm": 1.6099324226379395, "learning_rate": 9.471534783462848e-05, "loss": 0.0735, "step": 13320 }, { "epoch": 0.40805706064223835, "grad_norm": 2.371617078781128, "learning_rate": 9.470477353584732e-05, "loss": 0.0921, "step": 13330 }, { "epoch": 0.40836317996755134, "grad_norm": 2.8135159015655518, "learning_rate": 9.469418925982785e-05, "loss": 0.0541, "step": 13340 }, { "epoch": 0.4086692992928644, "grad_norm": 1.23381769657135, "learning_rate": 9.468359500893226e-05, "loss": 0.0572, "step": 13350 }, { "epoch": 0.4089754186181774, "grad_norm": 2.612121343612671, "learning_rate": 9.4672990785525e-05, "loss": 0.0896, "step": 13360 }, { "epoch": 0.40928153794349037, "grad_norm": 0.6103622913360596, "learning_rate": 9.46623765919727e-05, "loss": 0.075, "step": 13370 }, { "epoch": 0.40958765726880336, "grad_norm": 1.5664350986480713, "learning_rate": 9.465175243064428e-05, "loss": 0.072, "step": 13380 }, { "epoch": 0.4098937765941164, "grad_norm": 3.746584177017212, "learning_rate": 9.464111830391081e-05, "loss": 0.0971, "step": 13390 }, { "epoch": 0.4101998959194294, "grad_norm": 1.9048094749450684, "learning_rate": 9.463047421414563e-05, "loss": 0.0918, "step": 13400 }, { "epoch": 0.4105060152447424, "grad_norm": 1.0589948892593384, "learning_rate": 9.461982016372433e-05, "loss": 0.0714, "step": 13410 }, { "epoch": 0.4108121345700554, "grad_norm": 1.4851967096328735, "learning_rate": 9.460915615502464e-05, "loss": 0.0977, "step": 13420 }, { "epoch": 0.41111825389536844, "grad_norm": 1.6384062767028809, "learning_rate": 9.459848219042658e-05, "loss": 0.0988, "step": 13430 }, { "epoch": 0.4114243732206814, "grad_norm": 1.6082104444503784, "learning_rate": 9.458779827231237e-05, "loss": 0.0919, "step": 13440 }, { "epoch": 0.4117304925459944, "grad_norm": 0.7654136419296265, "learning_rate": 9.457710440306645e-05, "loss": 0.0608, "step": 13450 }, { "epoch": 0.4120366118713074, "grad_norm": 0.9802854657173157, "learning_rate": 9.45664005850755e-05, "loss": 0.1267, "step": 13460 }, { "epoch": 0.41234273119662046, "grad_norm": 2.8827450275421143, "learning_rate": 9.455568682072837e-05, "loss": 0.0736, "step": 13470 }, { "epoch": 0.41264885052193345, "grad_norm": 1.0719469785690308, "learning_rate": 9.45449631124162e-05, "loss": 0.0589, "step": 13480 }, { "epoch": 0.41295496984724644, "grad_norm": 3.3022384643554688, "learning_rate": 9.453422946253232e-05, "loss": 0.079, "step": 13490 }, { "epoch": 0.41326108917255944, "grad_norm": 2.9064548015594482, "learning_rate": 9.452348587347224e-05, "loss": 0.0917, "step": 13500 }, { "epoch": 0.4135672084978725, "grad_norm": 5.1654953956604, "learning_rate": 9.451273234763371e-05, "loss": 0.0726, "step": 13510 }, { "epoch": 0.4138733278231855, "grad_norm": 2.657626152038574, "learning_rate": 9.450196888741676e-05, "loss": 0.0547, "step": 13520 }, { "epoch": 0.41417944714849847, "grad_norm": 3.744380235671997, "learning_rate": 9.449119549522357e-05, "loss": 0.1512, "step": 13530 }, { "epoch": 0.4144855664738115, "grad_norm": 0.8218960762023926, "learning_rate": 9.448041217345851e-05, "loss": 0.0661, "step": 13540 }, { "epoch": 0.4147916857991245, "grad_norm": 1.214759349822998, "learning_rate": 9.446961892452824e-05, "loss": 0.0598, "step": 13550 }, { "epoch": 0.4150978051244375, "grad_norm": 2.081840991973877, "learning_rate": 9.445881575084161e-05, "loss": 0.0916, "step": 13560 }, { "epoch": 0.4154039244497505, "grad_norm": 1.7958946228027344, "learning_rate": 9.444800265480967e-05, "loss": 0.1258, "step": 13570 }, { "epoch": 0.41571004377506354, "grad_norm": 1.3391852378845215, "learning_rate": 9.443717963884569e-05, "loss": 0.0605, "step": 13580 }, { "epoch": 0.41601616310037653, "grad_norm": 2.785421371459961, "learning_rate": 9.442634670536516e-05, "loss": 0.123, "step": 13590 }, { "epoch": 0.4163222824256895, "grad_norm": 1.7365342378616333, "learning_rate": 9.441550385678579e-05, "loss": 0.0633, "step": 13600 }, { "epoch": 0.4166284017510025, "grad_norm": 15.269658088684082, "learning_rate": 9.440465109552748e-05, "loss": 0.1079, "step": 13610 }, { "epoch": 0.41693452107631557, "grad_norm": 0.6487070322036743, "learning_rate": 9.439378842401236e-05, "loss": 0.0851, "step": 13620 }, { "epoch": 0.41724064040162856, "grad_norm": 1.4207345247268677, "learning_rate": 9.438291584466479e-05, "loss": 0.1099, "step": 13630 }, { "epoch": 0.41754675972694155, "grad_norm": 2.6239094734191895, "learning_rate": 9.437203335991127e-05, "loss": 0.0913, "step": 13640 }, { "epoch": 0.41785287905225454, "grad_norm": 0.6405467987060547, "learning_rate": 9.43611409721806e-05, "loss": 0.037, "step": 13650 }, { "epoch": 0.4181589983775676, "grad_norm": 0.6811105608940125, "learning_rate": 9.435023868390374e-05, "loss": 0.0952, "step": 13660 }, { "epoch": 0.4184651177028806, "grad_norm": 0.8578160405158997, "learning_rate": 9.433932649751385e-05, "loss": 0.1094, "step": 13670 }, { "epoch": 0.4187712370281936, "grad_norm": 0.8805117607116699, "learning_rate": 9.432840441544634e-05, "loss": 0.0761, "step": 13680 }, { "epoch": 0.41907735635350657, "grad_norm": 1.270601749420166, "learning_rate": 9.431747244013881e-05, "loss": 0.1094, "step": 13690 }, { "epoch": 0.4193834756788196, "grad_norm": 1.5217000246047974, "learning_rate": 9.430653057403105e-05, "loss": 0.0598, "step": 13700 }, { "epoch": 0.4196895950041326, "grad_norm": 1.3255424499511719, "learning_rate": 9.429557881956508e-05, "loss": 0.0919, "step": 13710 }, { "epoch": 0.4199957143294456, "grad_norm": 1.7314304113388062, "learning_rate": 9.428461717918511e-05, "loss": 0.0612, "step": 13720 }, { "epoch": 0.42030183365475865, "grad_norm": 2.3680522441864014, "learning_rate": 9.427364565533758e-05, "loss": 0.1472, "step": 13730 }, { "epoch": 0.42060795298007164, "grad_norm": 2.674236536026001, "learning_rate": 9.426266425047113e-05, "loss": 0.1199, "step": 13740 }, { "epoch": 0.42091407230538463, "grad_norm": 3.1490695476531982, "learning_rate": 9.425167296703655e-05, "loss": 0.0995, "step": 13750 }, { "epoch": 0.4212201916306976, "grad_norm": 0.6207751035690308, "learning_rate": 9.424067180748692e-05, "loss": 0.1297, "step": 13760 }, { "epoch": 0.4215263109560107, "grad_norm": 0.7753440141677856, "learning_rate": 9.422966077427748e-05, "loss": 0.1691, "step": 13770 }, { "epoch": 0.42183243028132367, "grad_norm": 2.0391547679901123, "learning_rate": 9.42186398698657e-05, "loss": 0.1114, "step": 13780 }, { "epoch": 0.42213854960663666, "grad_norm": 0.5031137466430664, "learning_rate": 9.420760909671118e-05, "loss": 0.0453, "step": 13790 }, { "epoch": 0.42244466893194965, "grad_norm": 1.0279449224472046, "learning_rate": 9.419656845727582e-05, "loss": 0.0756, "step": 13800 }, { "epoch": 0.4227507882572627, "grad_norm": 0.6779003739356995, "learning_rate": 9.418551795402365e-05, "loss": 0.0972, "step": 13810 }, { "epoch": 0.4230569075825757, "grad_norm": 2.406759023666382, "learning_rate": 9.417445758942094e-05, "loss": 0.0691, "step": 13820 }, { "epoch": 0.4233630269078887, "grad_norm": 1.2480343580245972, "learning_rate": 9.416338736593613e-05, "loss": 0.0704, "step": 13830 }, { "epoch": 0.4236691462332017, "grad_norm": 2.543140411376953, "learning_rate": 9.415230728603992e-05, "loss": 0.13, "step": 13840 }, { "epoch": 0.4239752655585147, "grad_norm": 1.9612802267074585, "learning_rate": 9.414121735220514e-05, "loss": 0.1429, "step": 13850 }, { "epoch": 0.4242813848838277, "grad_norm": 3.653991222381592, "learning_rate": 9.413011756690685e-05, "loss": 0.1302, "step": 13860 }, { "epoch": 0.4245875042091407, "grad_norm": 4.1847124099731445, "learning_rate": 9.411900793262232e-05, "loss": 0.2326, "step": 13870 }, { "epoch": 0.42489362353445376, "grad_norm": 1.9005354642868042, "learning_rate": 9.410788845183097e-05, "loss": 0.1612, "step": 13880 }, { "epoch": 0.42519974285976675, "grad_norm": 2.5741326808929443, "learning_rate": 9.40967591270145e-05, "loss": 0.094, "step": 13890 }, { "epoch": 0.42550586218507974, "grad_norm": 4.011760711669922, "learning_rate": 9.408561996065672e-05, "loss": 0.1068, "step": 13900 }, { "epoch": 0.42581198151039273, "grad_norm": 1.922353744506836, "learning_rate": 9.407447095524371e-05, "loss": 0.0686, "step": 13910 }, { "epoch": 0.4261181008357058, "grad_norm": 0.8092479705810547, "learning_rate": 9.406331211326367e-05, "loss": 0.1022, "step": 13920 }, { "epoch": 0.4264242201610188, "grad_norm": 4.280985355377197, "learning_rate": 9.405214343720707e-05, "loss": 0.0719, "step": 13930 }, { "epoch": 0.42673033948633177, "grad_norm": 0.6187805533409119, "learning_rate": 9.404096492956656e-05, "loss": 0.1067, "step": 13940 }, { "epoch": 0.42703645881164476, "grad_norm": 0.37273114919662476, "learning_rate": 9.40297765928369e-05, "loss": 0.066, "step": 13950 }, { "epoch": 0.4273425781369578, "grad_norm": 2.5796396732330322, "learning_rate": 9.401857842951518e-05, "loss": 0.0819, "step": 13960 }, { "epoch": 0.4276486974622708, "grad_norm": 1.893586277961731, "learning_rate": 9.400737044210055e-05, "loss": 0.0649, "step": 13970 }, { "epoch": 0.4279548167875838, "grad_norm": 0.680808424949646, "learning_rate": 9.399615263309448e-05, "loss": 0.1062, "step": 13980 }, { "epoch": 0.4282609361128968, "grad_norm": 1.7447322607040405, "learning_rate": 9.398492500500051e-05, "loss": 0.1356, "step": 13990 }, { "epoch": 0.42856705543820983, "grad_norm": 1.7073661088943481, "learning_rate": 9.397368756032445e-05, "loss": 0.0775, "step": 14000 }, { "epoch": 0.4288731747635228, "grad_norm": 1.5063246488571167, "learning_rate": 9.39624403015743e-05, "loss": 0.0878, "step": 14010 }, { "epoch": 0.4291792940888358, "grad_norm": 0.7898034453392029, "learning_rate": 9.395118323126021e-05, "loss": 0.0662, "step": 14020 }, { "epoch": 0.4294854134141488, "grad_norm": 2.2770063877105713, "learning_rate": 9.393991635189455e-05, "loss": 0.0824, "step": 14030 }, { "epoch": 0.42979153273946186, "grad_norm": 2.6705663204193115, "learning_rate": 9.392863966599183e-05, "loss": 0.1214, "step": 14040 }, { "epoch": 0.43009765206477485, "grad_norm": 2.7651596069335938, "learning_rate": 9.391735317606885e-05, "loss": 0.1065, "step": 14050 }, { "epoch": 0.43040377139008784, "grad_norm": 1.8814215660095215, "learning_rate": 9.390605688464449e-05, "loss": 0.1018, "step": 14060 }, { "epoch": 0.4307098907154009, "grad_norm": 1.498242735862732, "learning_rate": 9.389475079423988e-05, "loss": 0.1028, "step": 14070 }, { "epoch": 0.4310160100407139, "grad_norm": 2.6099636554718018, "learning_rate": 9.388343490737832e-05, "loss": 0.1415, "step": 14080 }, { "epoch": 0.4313221293660269, "grad_norm": 0.9792959094047546, "learning_rate": 9.38721092265853e-05, "loss": 0.0805, "step": 14090 }, { "epoch": 0.43162824869133987, "grad_norm": 3.45493221282959, "learning_rate": 9.386077375438848e-05, "loss": 0.0977, "step": 14100 }, { "epoch": 0.4319343680166529, "grad_norm": 1.2913768291473389, "learning_rate": 9.384942849331772e-05, "loss": 0.1332, "step": 14110 }, { "epoch": 0.4322404873419659, "grad_norm": 1.540218472480774, "learning_rate": 9.383807344590507e-05, "loss": 0.0699, "step": 14120 }, { "epoch": 0.4325466066672789, "grad_norm": 1.6141514778137207, "learning_rate": 9.382670861468476e-05, "loss": 0.0933, "step": 14130 }, { "epoch": 0.4328527259925919, "grad_norm": 1.379106879234314, "learning_rate": 9.381533400219318e-05, "loss": 0.1176, "step": 14140 }, { "epoch": 0.43315884531790494, "grad_norm": 1.635521650314331, "learning_rate": 9.380394961096894e-05, "loss": 0.0854, "step": 14150 }, { "epoch": 0.43346496464321793, "grad_norm": 1.8567777872085571, "learning_rate": 9.379255544355282e-05, "loss": 0.0887, "step": 14160 }, { "epoch": 0.4337710839685309, "grad_norm": 1.2173206806182861, "learning_rate": 9.378115150248776e-05, "loss": 0.0627, "step": 14170 }, { "epoch": 0.4340772032938439, "grad_norm": 1.19930899143219, "learning_rate": 9.37697377903189e-05, "loss": 0.1469, "step": 14180 }, { "epoch": 0.43438332261915696, "grad_norm": 1.6889574527740479, "learning_rate": 9.375831430959358e-05, "loss": 0.0826, "step": 14190 }, { "epoch": 0.43468944194446996, "grad_norm": 2.8208742141723633, "learning_rate": 9.374688106286127e-05, "loss": 0.0963, "step": 14200 }, { "epoch": 0.43499556126978295, "grad_norm": 0.9670422077178955, "learning_rate": 9.373543805267368e-05, "loss": 0.1308, "step": 14210 }, { "epoch": 0.43530168059509594, "grad_norm": 1.9587273597717285, "learning_rate": 9.372398528158464e-05, "loss": 0.0646, "step": 14220 }, { "epoch": 0.435607799920409, "grad_norm": 1.7797999382019043, "learning_rate": 9.371252275215019e-05, "loss": 0.1101, "step": 14230 }, { "epoch": 0.435913919245722, "grad_norm": 0.9778621196746826, "learning_rate": 9.370105046692856e-05, "loss": 0.0924, "step": 14240 }, { "epoch": 0.436220038571035, "grad_norm": 1.2256360054016113, "learning_rate": 9.368956842848015e-05, "loss": 0.0922, "step": 14250 }, { "epoch": 0.436526157896348, "grad_norm": 2.736199140548706, "learning_rate": 9.36780766393675e-05, "loss": 0.0901, "step": 14260 }, { "epoch": 0.436832277221661, "grad_norm": 2.243072748184204, "learning_rate": 9.366657510215535e-05, "loss": 0.1169, "step": 14270 }, { "epoch": 0.437138396546974, "grad_norm": 1.1409287452697754, "learning_rate": 9.365506381941066e-05, "loss": 0.1308, "step": 14280 }, { "epoch": 0.437444515872287, "grad_norm": 1.783536672592163, "learning_rate": 9.364354279370249e-05, "loss": 0.0803, "step": 14290 }, { "epoch": 0.43775063519760005, "grad_norm": 1.4142507314682007, "learning_rate": 9.363201202760213e-05, "loss": 0.0747, "step": 14300 }, { "epoch": 0.43805675452291304, "grad_norm": 1.3322628736495972, "learning_rate": 9.3620471523683e-05, "loss": 0.0856, "step": 14310 }, { "epoch": 0.43836287384822603, "grad_norm": 2.206137180328369, "learning_rate": 9.360892128452073e-05, "loss": 0.1063, "step": 14320 }, { "epoch": 0.438668993173539, "grad_norm": 1.3459750413894653, "learning_rate": 9.359736131269312e-05, "loss": 0.0544, "step": 14330 }, { "epoch": 0.43897511249885207, "grad_norm": 0.6650859117507935, "learning_rate": 9.358579161078013e-05, "loss": 0.1076, "step": 14340 }, { "epoch": 0.43928123182416506, "grad_norm": 3.0819754600524902, "learning_rate": 9.357421218136386e-05, "loss": 0.1064, "step": 14350 }, { "epoch": 0.43958735114947806, "grad_norm": 2.5323398113250732, "learning_rate": 9.356262302702866e-05, "loss": 0.082, "step": 14360 }, { "epoch": 0.43989347047479105, "grad_norm": 4.000069618225098, "learning_rate": 9.355102415036098e-05, "loss": 0.1015, "step": 14370 }, { "epoch": 0.4401995898001041, "grad_norm": 2.2962145805358887, "learning_rate": 9.353941555394946e-05, "loss": 0.1009, "step": 14380 }, { "epoch": 0.4405057091254171, "grad_norm": 1.2681316137313843, "learning_rate": 9.352779724038493e-05, "loss": 0.113, "step": 14390 }, { "epoch": 0.4408118284507301, "grad_norm": 22.617420196533203, "learning_rate": 9.351616921226036e-05, "loss": 0.1007, "step": 14400 }, { "epoch": 0.44111794777604313, "grad_norm": 1.7137999534606934, "learning_rate": 9.35045314721709e-05, "loss": 0.0906, "step": 14410 }, { "epoch": 0.4414240671013561, "grad_norm": 3.514897108078003, "learning_rate": 9.349288402271388e-05, "loss": 0.0623, "step": 14420 }, { "epoch": 0.4417301864266691, "grad_norm": 1.462731122970581, "learning_rate": 9.348122686648877e-05, "loss": 0.0786, "step": 14430 }, { "epoch": 0.4420363057519821, "grad_norm": 0.9862880110740662, "learning_rate": 9.346956000609721e-05, "loss": 0.1317, "step": 14440 }, { "epoch": 0.44234242507729515, "grad_norm": 2.2102372646331787, "learning_rate": 9.345788344414306e-05, "loss": 0.0809, "step": 14450 }, { "epoch": 0.44264854440260815, "grad_norm": 3.8262767791748047, "learning_rate": 9.344619718323227e-05, "loss": 0.0873, "step": 14460 }, { "epoch": 0.44295466372792114, "grad_norm": 1.5238415002822876, "learning_rate": 9.3434501225973e-05, "loss": 0.114, "step": 14470 }, { "epoch": 0.44326078305323413, "grad_norm": 3.3903303146362305, "learning_rate": 9.342279557497553e-05, "loss": 0.1036, "step": 14480 }, { "epoch": 0.4435669023785472, "grad_norm": 1.2982937097549438, "learning_rate": 9.341108023285238e-05, "loss": 0.0982, "step": 14490 }, { "epoch": 0.44387302170386017, "grad_norm": 0.8475042581558228, "learning_rate": 9.339935520221816e-05, "loss": 0.0835, "step": 14500 }, { "epoch": 0.44417914102917316, "grad_norm": 0.7169262766838074, "learning_rate": 9.338762048568968e-05, "loss": 0.1165, "step": 14510 }, { "epoch": 0.44448526035448616, "grad_norm": 1.080785870552063, "learning_rate": 9.337587608588588e-05, "loss": 0.0711, "step": 14520 }, { "epoch": 0.4447913796797992, "grad_norm": 1.3339011669158936, "learning_rate": 9.33641220054279e-05, "loss": 0.1462, "step": 14530 }, { "epoch": 0.4450974990051122, "grad_norm": 2.0962753295898438, "learning_rate": 9.335235824693904e-05, "loss": 0.1102, "step": 14540 }, { "epoch": 0.4454036183304252, "grad_norm": 1.2654088735580444, "learning_rate": 9.334058481304471e-05, "loss": 0.0777, "step": 14550 }, { "epoch": 0.4457097376557382, "grad_norm": 4.406511306762695, "learning_rate": 9.332880170637252e-05, "loss": 0.116, "step": 14560 }, { "epoch": 0.44601585698105123, "grad_norm": 1.7384761571884155, "learning_rate": 9.331700892955225e-05, "loss": 0.1035, "step": 14570 }, { "epoch": 0.4463219763063642, "grad_norm": 0.5817931294441223, "learning_rate": 9.330520648521581e-05, "loss": 0.0616, "step": 14580 }, { "epoch": 0.4466280956316772, "grad_norm": 2.2323215007781982, "learning_rate": 9.329339437599726e-05, "loss": 0.0548, "step": 14590 }, { "epoch": 0.44693421495699026, "grad_norm": 1.5175938606262207, "learning_rate": 9.328157260453286e-05, "loss": 0.0836, "step": 14600 }, { "epoch": 0.44724033428230325, "grad_norm": 1.6685996055603027, "learning_rate": 9.326974117346096e-05, "loss": 0.0667, "step": 14610 }, { "epoch": 0.44754645360761625, "grad_norm": 2.5271873474121094, "learning_rate": 9.325790008542215e-05, "loss": 0.0774, "step": 14620 }, { "epoch": 0.44785257293292924, "grad_norm": 1.181317687034607, "learning_rate": 9.32460493430591e-05, "loss": 0.0651, "step": 14630 }, { "epoch": 0.4481586922582423, "grad_norm": 0.8624565005302429, "learning_rate": 9.323418894901669e-05, "loss": 0.2486, "step": 14640 }, { "epoch": 0.4484648115835553, "grad_norm": 1.5864088535308838, "learning_rate": 9.322231890594193e-05, "loss": 0.0372, "step": 14650 }, { "epoch": 0.44877093090886827, "grad_norm": 4.2946953773498535, "learning_rate": 9.321043921648395e-05, "loss": 0.1297, "step": 14660 }, { "epoch": 0.44907705023418126, "grad_norm": 1.4478389024734497, "learning_rate": 9.31985498832941e-05, "loss": 0.1007, "step": 14670 }, { "epoch": 0.4493831695594943, "grad_norm": 1.0578116178512573, "learning_rate": 9.318665090902583e-05, "loss": 0.1147, "step": 14680 }, { "epoch": 0.4496892888848073, "grad_norm": 1.7539995908737183, "learning_rate": 9.317474229633478e-05, "loss": 0.103, "step": 14690 }, { "epoch": 0.4499954082101203, "grad_norm": 1.0728046894073486, "learning_rate": 9.316282404787871e-05, "loss": 0.0599, "step": 14700 }, { "epoch": 0.4503015275354333, "grad_norm": 1.3426088094711304, "learning_rate": 9.315089616631752e-05, "loss": 0.0443, "step": 14710 }, { "epoch": 0.45060764686074634, "grad_norm": 4.468818664550781, "learning_rate": 9.313895865431331e-05, "loss": 0.1605, "step": 14720 }, { "epoch": 0.45091376618605933, "grad_norm": 1.410890817642212, "learning_rate": 9.312701151453029e-05, "loss": 0.0893, "step": 14730 }, { "epoch": 0.4512198855113723, "grad_norm": 2.665668249130249, "learning_rate": 9.311505474963484e-05, "loss": 0.1797, "step": 14740 }, { "epoch": 0.4515260048366853, "grad_norm": 1.6382077932357788, "learning_rate": 9.310308836229549e-05, "loss": 0.0685, "step": 14750 }, { "epoch": 0.45183212416199836, "grad_norm": 2.2502052783966064, "learning_rate": 9.309111235518286e-05, "loss": 0.1324, "step": 14760 }, { "epoch": 0.45213824348731135, "grad_norm": 0.906369686126709, "learning_rate": 9.30791267309698e-05, "loss": 0.1269, "step": 14770 }, { "epoch": 0.45244436281262435, "grad_norm": 0.7296282052993774, "learning_rate": 9.306713149233126e-05, "loss": 0.0496, "step": 14780 }, { "epoch": 0.4527504821379374, "grad_norm": 1.7908471822738647, "learning_rate": 9.305512664194434e-05, "loss": 0.0845, "step": 14790 }, { "epoch": 0.4530566014632504, "grad_norm": 0.7445313930511475, "learning_rate": 9.304311218248828e-05, "loss": 0.0765, "step": 14800 }, { "epoch": 0.4533627207885634, "grad_norm": 1.7820439338684082, "learning_rate": 9.30310881166445e-05, "loss": 0.0964, "step": 14810 }, { "epoch": 0.45366884011387637, "grad_norm": 1.9641543626785278, "learning_rate": 9.301905444709651e-05, "loss": 0.1418, "step": 14820 }, { "epoch": 0.4539749594391894, "grad_norm": 1.9809280633926392, "learning_rate": 9.300701117653004e-05, "loss": 0.1335, "step": 14830 }, { "epoch": 0.4542810787645024, "grad_norm": 0.8709591627120972, "learning_rate": 9.299495830763286e-05, "loss": 0.0909, "step": 14840 }, { "epoch": 0.4545871980898154, "grad_norm": 1.7460039854049683, "learning_rate": 9.298289584309495e-05, "loss": 0.2062, "step": 14850 }, { "epoch": 0.4548933174151284, "grad_norm": 2.2538397312164307, "learning_rate": 9.297082378560844e-05, "loss": 0.2138, "step": 14860 }, { "epoch": 0.45519943674044144, "grad_norm": 1.7560421228408813, "learning_rate": 9.295874213786754e-05, "loss": 0.1007, "step": 14870 }, { "epoch": 0.45550555606575444, "grad_norm": 1.7407946586608887, "learning_rate": 9.29466509025687e-05, "loss": 0.1307, "step": 14880 }, { "epoch": 0.45581167539106743, "grad_norm": 1.424093246459961, "learning_rate": 9.293455008241038e-05, "loss": 0.081, "step": 14890 }, { "epoch": 0.4561177947163804, "grad_norm": 0.7443898916244507, "learning_rate": 9.292243968009331e-05, "loss": 0.0668, "step": 14900 }, { "epoch": 0.45642391404169347, "grad_norm": 1.756664752960205, "learning_rate": 9.291031969832026e-05, "loss": 0.0699, "step": 14910 }, { "epoch": 0.45673003336700646, "grad_norm": 2.7245984077453613, "learning_rate": 9.289819013979616e-05, "loss": 0.1231, "step": 14920 }, { "epoch": 0.45703615269231945, "grad_norm": 1.165811538696289, "learning_rate": 9.288605100722816e-05, "loss": 0.0672, "step": 14930 }, { "epoch": 0.4573422720176325, "grad_norm": 5.172287940979004, "learning_rate": 9.28739023033254e-05, "loss": 0.0958, "step": 14940 }, { "epoch": 0.4576483913429455, "grad_norm": 7.629774570465088, "learning_rate": 9.286174403079928e-05, "loss": 0.211, "step": 14950 }, { "epoch": 0.4579545106682585, "grad_norm": 1.5239241123199463, "learning_rate": 9.284957619236329e-05, "loss": 0.0648, "step": 14960 }, { "epoch": 0.4582606299935715, "grad_norm": 3.7137715816497803, "learning_rate": 9.283739879073303e-05, "loss": 0.1113, "step": 14970 }, { "epoch": 0.4585667493188845, "grad_norm": 0.9761481881141663, "learning_rate": 9.282521182862629e-05, "loss": 0.069, "step": 14980 }, { "epoch": 0.4588728686441975, "grad_norm": 1.5887664556503296, "learning_rate": 9.281301530876295e-05, "loss": 0.1072, "step": 14990 }, { "epoch": 0.4591789879695105, "grad_norm": 5.88398551940918, "learning_rate": 9.280080923386501e-05, "loss": 0.1006, "step": 15000 }, { "epoch": 0.4594851072948235, "grad_norm": 1.66131591796875, "learning_rate": 9.278859360665669e-05, "loss": 0.0813, "step": 15010 }, { "epoch": 0.45979122662013655, "grad_norm": 0.978567361831665, "learning_rate": 9.277636842986421e-05, "loss": 0.0735, "step": 15020 }, { "epoch": 0.46009734594544954, "grad_norm": 2.2793807983398438, "learning_rate": 9.276413370621604e-05, "loss": 0.1143, "step": 15030 }, { "epoch": 0.46040346527076254, "grad_norm": 1.284197211265564, "learning_rate": 9.275188943844273e-05, "loss": 0.0702, "step": 15040 }, { "epoch": 0.4607095845960755, "grad_norm": 1.2438043355941772, "learning_rate": 9.273963562927695e-05, "loss": 0.1603, "step": 15050 }, { "epoch": 0.4610157039213886, "grad_norm": 3.1227340698242188, "learning_rate": 9.272737228145352e-05, "loss": 0.1408, "step": 15060 }, { "epoch": 0.46132182324670157, "grad_norm": 5.024231910705566, "learning_rate": 9.271509939770938e-05, "loss": 0.1408, "step": 15070 }, { "epoch": 0.46162794257201456, "grad_norm": 0.9836534261703491, "learning_rate": 9.27028169807836e-05, "loss": 0.0541, "step": 15080 }, { "epoch": 0.46193406189732755, "grad_norm": 5.579927921295166, "learning_rate": 9.269052503341736e-05, "loss": 0.0869, "step": 15090 }, { "epoch": 0.4622401812226406, "grad_norm": 6.494168281555176, "learning_rate": 9.267822355835402e-05, "loss": 0.0778, "step": 15100 }, { "epoch": 0.4625463005479536, "grad_norm": 1.1752842664718628, "learning_rate": 9.266591255833899e-05, "loss": 0.0823, "step": 15110 }, { "epoch": 0.4628524198732666, "grad_norm": 1.1785664558410645, "learning_rate": 9.265359203611987e-05, "loss": 0.0876, "step": 15120 }, { "epoch": 0.46315853919857963, "grad_norm": 1.1558669805526733, "learning_rate": 9.264126199444638e-05, "loss": 0.0844, "step": 15130 }, { "epoch": 0.4634646585238926, "grad_norm": 1.5937340259552002, "learning_rate": 9.262892243607031e-05, "loss": 0.0852, "step": 15140 }, { "epoch": 0.4637707778492056, "grad_norm": 1.0346592664718628, "learning_rate": 9.261657336374561e-05, "loss": 0.0791, "step": 15150 }, { "epoch": 0.4640768971745186, "grad_norm": 2.403765916824341, "learning_rate": 9.260421478022839e-05, "loss": 0.1405, "step": 15160 }, { "epoch": 0.46438301649983166, "grad_norm": 3.0857882499694824, "learning_rate": 9.25918466882768e-05, "loss": 0.0778, "step": 15170 }, { "epoch": 0.46468913582514465, "grad_norm": 2.6101882457733154, "learning_rate": 9.25794690906512e-05, "loss": 0.1246, "step": 15180 }, { "epoch": 0.46499525515045764, "grad_norm": 1.9436036348342896, "learning_rate": 9.256708199011401e-05, "loss": 0.0789, "step": 15190 }, { "epoch": 0.46530137447577063, "grad_norm": 0.7653971910476685, "learning_rate": 9.255468538942979e-05, "loss": 0.0652, "step": 15200 }, { "epoch": 0.4656074938010837, "grad_norm": 1.7407361268997192, "learning_rate": 9.254227929136522e-05, "loss": 0.1896, "step": 15210 }, { "epoch": 0.4659136131263967, "grad_norm": 1.5176851749420166, "learning_rate": 9.25298636986891e-05, "loss": 0.0604, "step": 15220 }, { "epoch": 0.46621973245170967, "grad_norm": 1.124219536781311, "learning_rate": 9.251743861417238e-05, "loss": 0.0546, "step": 15230 }, { "epoch": 0.46652585177702266, "grad_norm": 1.256282091140747, "learning_rate": 9.250500404058804e-05, "loss": 0.0389, "step": 15240 }, { "epoch": 0.4668319711023357, "grad_norm": 1.4062711000442505, "learning_rate": 9.249255998071126e-05, "loss": 0.0817, "step": 15250 }, { "epoch": 0.4671380904276487, "grad_norm": 0.8326440453529358, "learning_rate": 9.248010643731935e-05, "loss": 0.1294, "step": 15260 }, { "epoch": 0.4674442097529617, "grad_norm": 2.6153132915496826, "learning_rate": 9.246764341319166e-05, "loss": 0.1468, "step": 15270 }, { "epoch": 0.4677503290782747, "grad_norm": 2.3372092247009277, "learning_rate": 9.24551709111097e-05, "loss": 0.0912, "step": 15280 }, { "epoch": 0.46805644840358773, "grad_norm": 1.166774034500122, "learning_rate": 9.244268893385709e-05, "loss": 0.0847, "step": 15290 }, { "epoch": 0.4683625677289007, "grad_norm": 4.578220844268799, "learning_rate": 9.243019748421955e-05, "loss": 0.0751, "step": 15300 }, { "epoch": 0.4686686870542137, "grad_norm": 3.019054412841797, "learning_rate": 9.241769656498498e-05, "loss": 0.1346, "step": 15310 }, { "epoch": 0.46897480637952677, "grad_norm": 0.7566227316856384, "learning_rate": 9.24051861789433e-05, "loss": 0.1147, "step": 15320 }, { "epoch": 0.46928092570483976, "grad_norm": 5.580915451049805, "learning_rate": 9.239266632888659e-05, "loss": 0.102, "step": 15330 }, { "epoch": 0.46958704503015275, "grad_norm": 0.6141241192817688, "learning_rate": 9.238013701760906e-05, "loss": 0.1192, "step": 15340 }, { "epoch": 0.46989316435546574, "grad_norm": 1.1927489042282104, "learning_rate": 9.236759824790699e-05, "loss": 0.0619, "step": 15350 }, { "epoch": 0.4701992836807788, "grad_norm": 2.0818965435028076, "learning_rate": 9.235505002257878e-05, "loss": 0.0763, "step": 15360 }, { "epoch": 0.4705054030060918, "grad_norm": 1.6688851118087769, "learning_rate": 9.234249234442499e-05, "loss": 0.0773, "step": 15370 }, { "epoch": 0.4708115223314048, "grad_norm": 2.15084171295166, "learning_rate": 9.23299252162482e-05, "loss": 0.0736, "step": 15380 }, { "epoch": 0.47111764165671777, "grad_norm": 1.8012754917144775, "learning_rate": 9.231734864085317e-05, "loss": 0.115, "step": 15390 }, { "epoch": 0.4714237609820308, "grad_norm": 0.7692734599113464, "learning_rate": 9.230476262104677e-05, "loss": 0.0919, "step": 15400 }, { "epoch": 0.4717298803073438, "grad_norm": 1.0908902883529663, "learning_rate": 9.229216715963793e-05, "loss": 0.0697, "step": 15410 }, { "epoch": 0.4720359996326568, "grad_norm": 3.916834831237793, "learning_rate": 9.227956225943774e-05, "loss": 0.0879, "step": 15420 }, { "epoch": 0.4723421189579698, "grad_norm": 1.6046714782714844, "learning_rate": 9.226694792325932e-05, "loss": 0.065, "step": 15430 }, { "epoch": 0.47264823828328284, "grad_norm": 0.7772650718688965, "learning_rate": 9.2254324153918e-05, "loss": 0.053, "step": 15440 }, { "epoch": 0.47295435760859583, "grad_norm": 3.121925115585327, "learning_rate": 9.224169095423111e-05, "loss": 0.0764, "step": 15450 }, { "epoch": 0.4732604769339088, "grad_norm": 3.970888137817383, "learning_rate": 9.222904832701817e-05, "loss": 0.1068, "step": 15460 }, { "epoch": 0.4735665962592218, "grad_norm": 2.0810813903808594, "learning_rate": 9.221639627510076e-05, "loss": 0.1098, "step": 15470 }, { "epoch": 0.47387271558453486, "grad_norm": 5.528313159942627, "learning_rate": 9.220373480130258e-05, "loss": 0.0772, "step": 15480 }, { "epoch": 0.47417883490984786, "grad_norm": 1.3197784423828125, "learning_rate": 9.21910639084494e-05, "loss": 0.1011, "step": 15490 }, { "epoch": 0.47448495423516085, "grad_norm": 1.5342094898223877, "learning_rate": 9.217838359936914e-05, "loss": 0.0887, "step": 15500 }, { "epoch": 0.4747910735604739, "grad_norm": 2.7424449920654297, "learning_rate": 9.21656938768918e-05, "loss": 0.116, "step": 15510 }, { "epoch": 0.4750971928857869, "grad_norm": 0.9800657629966736, "learning_rate": 9.215299474384946e-05, "loss": 0.0931, "step": 15520 }, { "epoch": 0.4754033122110999, "grad_norm": 0.6521843671798706, "learning_rate": 9.214028620307636e-05, "loss": 0.0875, "step": 15530 }, { "epoch": 0.4757094315364129, "grad_norm": 1.751487374305725, "learning_rate": 9.212756825740873e-05, "loss": 0.14, "step": 15540 }, { "epoch": 0.4760155508617259, "grad_norm": 1.2336324453353882, "learning_rate": 9.211484090968506e-05, "loss": 0.1303, "step": 15550 }, { "epoch": 0.4763216701870389, "grad_norm": 0.47360196709632874, "learning_rate": 9.210210416274576e-05, "loss": 0.0746, "step": 15560 }, { "epoch": 0.4766277895123519, "grad_norm": 1.7737971544265747, "learning_rate": 9.208935801943348e-05, "loss": 0.1209, "step": 15570 }, { "epoch": 0.4769339088376649, "grad_norm": 1.122165560722351, "learning_rate": 9.20766024825929e-05, "loss": 0.122, "step": 15580 }, { "epoch": 0.47724002816297795, "grad_norm": 1.7407957315444946, "learning_rate": 9.206383755507079e-05, "loss": 0.0524, "step": 15590 }, { "epoch": 0.47754614748829094, "grad_norm": 1.422116994857788, "learning_rate": 9.205106323971608e-05, "loss": 0.1062, "step": 15600 }, { "epoch": 0.47785226681360393, "grad_norm": 1.757582426071167, "learning_rate": 9.20382795393797e-05, "loss": 0.144, "step": 15610 }, { "epoch": 0.4781583861389169, "grad_norm": 1.717265248298645, "learning_rate": 9.202548645691474e-05, "loss": 0.101, "step": 15620 }, { "epoch": 0.47846450546422997, "grad_norm": 1.059262752532959, "learning_rate": 9.201268399517636e-05, "loss": 0.0617, "step": 15630 }, { "epoch": 0.47877062478954296, "grad_norm": 0.7910197973251343, "learning_rate": 9.199987215702184e-05, "loss": 0.0802, "step": 15640 }, { "epoch": 0.47907674411485596, "grad_norm": 0.9525631070137024, "learning_rate": 9.198705094531052e-05, "loss": 0.076, "step": 15650 }, { "epoch": 0.479382863440169, "grad_norm": 3.3649981021881104, "learning_rate": 9.197422036290387e-05, "loss": 0.1043, "step": 15660 }, { "epoch": 0.479688982765482, "grad_norm": 1.6511188745498657, "learning_rate": 9.19613804126654e-05, "loss": 0.0591, "step": 15670 }, { "epoch": 0.479995102090795, "grad_norm": 1.2408367395401, "learning_rate": 9.194853109746074e-05, "loss": 0.154, "step": 15680 }, { "epoch": 0.480301221416108, "grad_norm": 1.5919982194900513, "learning_rate": 9.19356724201576e-05, "loss": 0.0502, "step": 15690 }, { "epoch": 0.48060734074142103, "grad_norm": 0.8484762907028198, "learning_rate": 9.192280438362582e-05, "loss": 0.1057, "step": 15700 }, { "epoch": 0.480913460066734, "grad_norm": 2.8061437606811523, "learning_rate": 9.190992699073727e-05, "loss": 0.0817, "step": 15710 }, { "epoch": 0.481219579392047, "grad_norm": 2.427689552307129, "learning_rate": 9.189704024436594e-05, "loss": 0.0423, "step": 15720 }, { "epoch": 0.48152569871736, "grad_norm": 1.0331453084945679, "learning_rate": 9.18841441473879e-05, "loss": 0.0905, "step": 15730 }, { "epoch": 0.48183181804267305, "grad_norm": 0.7088305354118347, "learning_rate": 9.187123870268133e-05, "loss": 0.0585, "step": 15740 }, { "epoch": 0.48213793736798605, "grad_norm": 2.513693332672119, "learning_rate": 9.185832391312644e-05, "loss": 0.1093, "step": 15750 }, { "epoch": 0.48244405669329904, "grad_norm": 2.15670108795166, "learning_rate": 9.184539978160558e-05, "loss": 0.0863, "step": 15760 }, { "epoch": 0.48275017601861203, "grad_norm": 4.182633399963379, "learning_rate": 9.183246631100316e-05, "loss": 0.0999, "step": 15770 }, { "epoch": 0.4830562953439251, "grad_norm": 1.4030603170394897, "learning_rate": 9.181952350420568e-05, "loss": 0.164, "step": 15780 }, { "epoch": 0.48336241466923807, "grad_norm": 0.8709356188774109, "learning_rate": 9.180657136410174e-05, "loss": 0.1015, "step": 15790 }, { "epoch": 0.48366853399455106, "grad_norm": 1.4507290124893188, "learning_rate": 9.179360989358197e-05, "loss": 0.0762, "step": 15800 }, { "epoch": 0.48397465331986406, "grad_norm": 1.5539132356643677, "learning_rate": 9.178063909553915e-05, "loss": 0.0548, "step": 15810 }, { "epoch": 0.4842807726451771, "grad_norm": 1.1296942234039307, "learning_rate": 9.176765897286813e-05, "loss": 0.0745, "step": 15820 }, { "epoch": 0.4845868919704901, "grad_norm": 0.9780799746513367, "learning_rate": 9.175466952846577e-05, "loss": 0.1279, "step": 15830 }, { "epoch": 0.4848930112958031, "grad_norm": 2.913576602935791, "learning_rate": 9.174167076523108e-05, "loss": 0.1326, "step": 15840 }, { "epoch": 0.48519913062111614, "grad_norm": 2.5375587940216064, "learning_rate": 9.172866268606513e-05, "loss": 0.0889, "step": 15850 }, { "epoch": 0.48550524994642913, "grad_norm": 1.26824152469635, "learning_rate": 9.17156452938711e-05, "loss": 0.0639, "step": 15860 }, { "epoch": 0.4858113692717421, "grad_norm": 3.4552314281463623, "learning_rate": 9.170261859155418e-05, "loss": 0.0945, "step": 15870 }, { "epoch": 0.4861174885970551, "grad_norm": 1.0607167482376099, "learning_rate": 9.168958258202171e-05, "loss": 0.0686, "step": 15880 }, { "epoch": 0.48642360792236816, "grad_norm": 0.6420822739601135, "learning_rate": 9.167653726818305e-05, "loss": 0.0629, "step": 15890 }, { "epoch": 0.48672972724768115, "grad_norm": 0.9560511708259583, "learning_rate": 9.166348265294968e-05, "loss": 0.057, "step": 15900 }, { "epoch": 0.48703584657299415, "grad_norm": 0.9671882390975952, "learning_rate": 9.16504187392351e-05, "loss": 0.0673, "step": 15910 }, { "epoch": 0.48734196589830714, "grad_norm": 1.62807297706604, "learning_rate": 9.163734552995498e-05, "loss": 0.0998, "step": 15920 }, { "epoch": 0.4876480852236202, "grad_norm": 2.454005479812622, "learning_rate": 9.162426302802696e-05, "loss": 0.0792, "step": 15930 }, { "epoch": 0.4879542045489332, "grad_norm": 1.3510428667068481, "learning_rate": 9.161117123637081e-05, "loss": 0.1037, "step": 15940 }, { "epoch": 0.48826032387424617, "grad_norm": 2.2834384441375732, "learning_rate": 9.15980701579084e-05, "loss": 0.0699, "step": 15950 }, { "epoch": 0.48856644319955916, "grad_norm": 0.9039933681488037, "learning_rate": 9.158495979556358e-05, "loss": 0.1494, "step": 15960 }, { "epoch": 0.4888725625248722, "grad_norm": 1.0189439058303833, "learning_rate": 9.157184015226237e-05, "loss": 0.0471, "step": 15970 }, { "epoch": 0.4891786818501852, "grad_norm": 2.2022218704223633, "learning_rate": 9.15587112309328e-05, "loss": 0.0989, "step": 15980 }, { "epoch": 0.4894848011754982, "grad_norm": 1.1715055704116821, "learning_rate": 9.1545573034505e-05, "loss": 0.0545, "step": 15990 }, { "epoch": 0.4897909205008112, "grad_norm": 0.3900102972984314, "learning_rate": 9.153242556591116e-05, "loss": 0.1017, "step": 16000 }, { "epoch": 0.49009703982612424, "grad_norm": 1.704458475112915, "learning_rate": 9.151926882808553e-05, "loss": 0.162, "step": 16010 }, { "epoch": 0.49040315915143723, "grad_norm": 4.580431938171387, "learning_rate": 9.150610282396443e-05, "loss": 0.0651, "step": 16020 }, { "epoch": 0.4907092784767502, "grad_norm": 1.5268687009811401, "learning_rate": 9.14929275564863e-05, "loss": 0.0781, "step": 16030 }, { "epoch": 0.49101539780206327, "grad_norm": 0.7486280202865601, "learning_rate": 9.147974302859157e-05, "loss": 0.0762, "step": 16040 }, { "epoch": 0.49132151712737626, "grad_norm": 9.213862419128418, "learning_rate": 9.146654924322277e-05, "loss": 0.1502, "step": 16050 }, { "epoch": 0.49162763645268925, "grad_norm": 1.9956196546554565, "learning_rate": 9.14533462033245e-05, "loss": 0.0944, "step": 16060 }, { "epoch": 0.49193375577800225, "grad_norm": 3.028981924057007, "learning_rate": 9.144013391184343e-05, "loss": 0.1535, "step": 16070 }, { "epoch": 0.4922398751033153, "grad_norm": 1.6550132036209106, "learning_rate": 9.142691237172829e-05, "loss": 0.1128, "step": 16080 }, { "epoch": 0.4925459944286283, "grad_norm": 1.053426742553711, "learning_rate": 9.141368158592987e-05, "loss": 0.0861, "step": 16090 }, { "epoch": 0.4928521137539413, "grad_norm": 2.4252431392669678, "learning_rate": 9.140044155740101e-05, "loss": 0.0993, "step": 16100 }, { "epoch": 0.49315823307925427, "grad_norm": 3.882795572280884, "learning_rate": 9.138719228909663e-05, "loss": 0.1346, "step": 16110 }, { "epoch": 0.4934643524045673, "grad_norm": 4.0511345863342285, "learning_rate": 9.137393378397374e-05, "loss": 0.0917, "step": 16120 }, { "epoch": 0.4937704717298803, "grad_norm": 1.6706206798553467, "learning_rate": 9.136066604499134e-05, "loss": 0.0444, "step": 16130 }, { "epoch": 0.4940765910551933, "grad_norm": 2.224947452545166, "learning_rate": 9.134738907511055e-05, "loss": 0.1351, "step": 16140 }, { "epoch": 0.4943827103805063, "grad_norm": 1.5902832746505737, "learning_rate": 9.133410287729454e-05, "loss": 0.0904, "step": 16150 }, { "epoch": 0.49468882970581934, "grad_norm": 0.9989720582962036, "learning_rate": 9.132080745450853e-05, "loss": 0.1013, "step": 16160 }, { "epoch": 0.49499494903113234, "grad_norm": 4.21253776550293, "learning_rate": 9.130750280971978e-05, "loss": 0.0815, "step": 16170 }, { "epoch": 0.49530106835644533, "grad_norm": 0.6621524095535278, "learning_rate": 9.129418894589765e-05, "loss": 0.0783, "step": 16180 }, { "epoch": 0.4956071876817584, "grad_norm": 1.3862035274505615, "learning_rate": 9.128086586601353e-05, "loss": 0.063, "step": 16190 }, { "epoch": 0.49591330700707137, "grad_norm": 2.1857151985168457, "learning_rate": 9.126753357304087e-05, "loss": 0.1314, "step": 16200 }, { "epoch": 0.49621942633238436, "grad_norm": 1.950392723083496, "learning_rate": 9.125419206995519e-05, "loss": 0.0678, "step": 16210 }, { "epoch": 0.49652554565769735, "grad_norm": 16.323745727539062, "learning_rate": 9.124084135973403e-05, "loss": 0.0958, "step": 16220 }, { "epoch": 0.4968316649830104, "grad_norm": 2.157759428024292, "learning_rate": 9.122748144535705e-05, "loss": 0.0616, "step": 16230 }, { "epoch": 0.4971377843083234, "grad_norm": 2.214200496673584, "learning_rate": 9.121411232980588e-05, "loss": 0.1399, "step": 16240 }, { "epoch": 0.4974439036336364, "grad_norm": 2.334289789199829, "learning_rate": 9.120073401606426e-05, "loss": 0.0409, "step": 16250 }, { "epoch": 0.4977500229589494, "grad_norm": 1.3024981021881104, "learning_rate": 9.1187346507118e-05, "loss": 0.0776, "step": 16260 }, { "epoch": 0.4980561422842624, "grad_norm": 5.309118270874023, "learning_rate": 9.117394980595489e-05, "loss": 0.0765, "step": 16270 }, { "epoch": 0.4983622616095754, "grad_norm": 7.192727565765381, "learning_rate": 9.116054391556484e-05, "loss": 0.0816, "step": 16280 }, { "epoch": 0.4986683809348884, "grad_norm": 1.780509114265442, "learning_rate": 9.114712883893979e-05, "loss": 0.0688, "step": 16290 }, { "epoch": 0.4989745002602014, "grad_norm": 6.526240825653076, "learning_rate": 9.113370457907368e-05, "loss": 0.1343, "step": 16300 }, { "epoch": 0.49928061958551445, "grad_norm": 1.353118896484375, "learning_rate": 9.112027113896262e-05, "loss": 0.138, "step": 16310 }, { "epoch": 0.49958673891082744, "grad_norm": 1.8733898401260376, "learning_rate": 9.110682852160463e-05, "loss": 0.0717, "step": 16320 }, { "epoch": 0.49989285823614044, "grad_norm": 2.5175721645355225, "learning_rate": 9.109337672999986e-05, "loss": 0.0817, "step": 16330 }, { "epoch": 0.5001989775614535, "grad_norm": 1.42759108543396, "learning_rate": 9.107991576715049e-05, "loss": 0.0859, "step": 16340 }, { "epoch": 0.5005050968867665, "grad_norm": 2.371325969696045, "learning_rate": 9.106644563606075e-05, "loss": 0.0538, "step": 16350 }, { "epoch": 0.5008112162120795, "grad_norm": 2.613302707672119, "learning_rate": 9.10529663397369e-05, "loss": 0.1156, "step": 16360 }, { "epoch": 0.5011173355373925, "grad_norm": 4.252920150756836, "learning_rate": 9.103947788118728e-05, "loss": 0.0848, "step": 16370 }, { "epoch": 0.5014234548627055, "grad_norm": 1.084359049797058, "learning_rate": 9.102598026342222e-05, "loss": 0.105, "step": 16380 }, { "epoch": 0.5017295741880184, "grad_norm": 6.318103313446045, "learning_rate": 9.101247348945416e-05, "loss": 0.0772, "step": 16390 }, { "epoch": 0.5020356935133315, "grad_norm": 6.327723026275635, "learning_rate": 9.099895756229753e-05, "loss": 0.1695, "step": 16400 }, { "epoch": 0.5023418128386445, "grad_norm": 1.8054463863372803, "learning_rate": 9.098543248496884e-05, "loss": 0.109, "step": 16410 }, { "epoch": 0.5026479321639575, "grad_norm": 2.1815361976623535, "learning_rate": 9.09718982604866e-05, "loss": 0.0654, "step": 16420 }, { "epoch": 0.5029540514892705, "grad_norm": 4.1056952476501465, "learning_rate": 9.09583548918714e-05, "loss": 0.1496, "step": 16430 }, { "epoch": 0.5032601708145835, "grad_norm": 3.0715558528900146, "learning_rate": 9.094480238214586e-05, "loss": 0.1004, "step": 16440 }, { "epoch": 0.5035662901398965, "grad_norm": 1.1854883432388306, "learning_rate": 9.093124073433463e-05, "loss": 0.0925, "step": 16450 }, { "epoch": 0.5038724094652095, "grad_norm": 0.6031140685081482, "learning_rate": 9.09176699514644e-05, "loss": 0.0872, "step": 16460 }, { "epoch": 0.5041785287905225, "grad_norm": 1.535892367362976, "learning_rate": 9.090409003656392e-05, "loss": 0.0756, "step": 16470 }, { "epoch": 0.5044846481158356, "grad_norm": 4.648338317871094, "learning_rate": 9.089050099266399e-05, "loss": 0.1536, "step": 16480 }, { "epoch": 0.5047907674411486, "grad_norm": 1.1255533695220947, "learning_rate": 9.087690282279735e-05, "loss": 0.0813, "step": 16490 }, { "epoch": 0.5050968867664616, "grad_norm": 1.450295329093933, "learning_rate": 9.086329552999891e-05, "loss": 0.0888, "step": 16500 }, { "epoch": 0.5054030060917746, "grad_norm": 1.7263022661209106, "learning_rate": 9.084967911730553e-05, "loss": 0.1189, "step": 16510 }, { "epoch": 0.5057091254170876, "grad_norm": 0.8963340520858765, "learning_rate": 9.083605358775612e-05, "loss": 0.0758, "step": 16520 }, { "epoch": 0.5060152447424006, "grad_norm": 1.6494486331939697, "learning_rate": 9.082241894439165e-05, "loss": 0.1148, "step": 16530 }, { "epoch": 0.5063213640677136, "grad_norm": 2.5192174911499023, "learning_rate": 9.08087751902551e-05, "loss": 0.0817, "step": 16540 }, { "epoch": 0.5066274833930267, "grad_norm": 2.619896650314331, "learning_rate": 9.07951223283915e-05, "loss": 0.0899, "step": 16550 }, { "epoch": 0.5069336027183396, "grad_norm": 1.1014113426208496, "learning_rate": 9.07814603618479e-05, "loss": 0.0557, "step": 16560 }, { "epoch": 0.5072397220436526, "grad_norm": 1.4254117012023926, "learning_rate": 9.076778929367338e-05, "loss": 0.0439, "step": 16570 }, { "epoch": 0.5075458413689656, "grad_norm": 2.3857128620147705, "learning_rate": 9.075410912691907e-05, "loss": 0.0985, "step": 16580 }, { "epoch": 0.5078519606942786, "grad_norm": 1.052743911743164, "learning_rate": 9.074041986463808e-05, "loss": 0.0807, "step": 16590 }, { "epoch": 0.5081580800195916, "grad_norm": 2.1825289726257324, "learning_rate": 9.072672150988564e-05, "loss": 0.0447, "step": 16600 }, { "epoch": 0.5084641993449046, "grad_norm": 1.680105447769165, "learning_rate": 9.071301406571892e-05, "loss": 0.0991, "step": 16610 }, { "epoch": 0.5087703186702176, "grad_norm": 3.3460755348205566, "learning_rate": 9.06992975351972e-05, "loss": 0.069, "step": 16620 }, { "epoch": 0.5090764379955307, "grad_norm": 2.022778034210205, "learning_rate": 9.068557192138169e-05, "loss": 0.1547, "step": 16630 }, { "epoch": 0.5093825573208437, "grad_norm": 3.5960042476654053, "learning_rate": 9.06718372273357e-05, "loss": 0.1121, "step": 16640 }, { "epoch": 0.5096886766461567, "grad_norm": 1.610456109046936, "learning_rate": 9.065809345612456e-05, "loss": 0.05, "step": 16650 }, { "epoch": 0.5099947959714697, "grad_norm": 2.957820415496826, "learning_rate": 9.064434061081562e-05, "loss": 0.0868, "step": 16660 }, { "epoch": 0.5103009152967827, "grad_norm": 0.775360643863678, "learning_rate": 9.063057869447819e-05, "loss": 0.0777, "step": 16670 }, { "epoch": 0.5106070346220957, "grad_norm": 0.8857529759407043, "learning_rate": 9.061680771018374e-05, "loss": 0.0493, "step": 16680 }, { "epoch": 0.5109131539474087, "grad_norm": 2.7274482250213623, "learning_rate": 9.060302766100566e-05, "loss": 0.1078, "step": 16690 }, { "epoch": 0.5112192732727217, "grad_norm": 2.817481756210327, "learning_rate": 9.058923855001935e-05, "loss": 0.1433, "step": 16700 }, { "epoch": 0.5115253925980348, "grad_norm": 1.7012343406677246, "learning_rate": 9.057544038030233e-05, "loss": 0.062, "step": 16710 }, { "epoch": 0.5118315119233477, "grad_norm": 2.4234583377838135, "learning_rate": 9.056163315493404e-05, "loss": 0.0795, "step": 16720 }, { "epoch": 0.5121376312486607, "grad_norm": 4.345454692840576, "learning_rate": 9.0547816876996e-05, "loss": 0.1231, "step": 16730 }, { "epoch": 0.5124437505739737, "grad_norm": 0.9105473160743713, "learning_rate": 9.053399154957176e-05, "loss": 0.0498, "step": 16740 }, { "epoch": 0.5127498698992867, "grad_norm": 1.2071871757507324, "learning_rate": 9.052015717574682e-05, "loss": 0.1256, "step": 16750 }, { "epoch": 0.5130559892245997, "grad_norm": 3.550121545791626, "learning_rate": 9.050631375860878e-05, "loss": 0.1181, "step": 16760 }, { "epoch": 0.5133621085499127, "grad_norm": 1.9042247533798218, "learning_rate": 9.04924613012472e-05, "loss": 0.0691, "step": 16770 }, { "epoch": 0.5136682278752258, "grad_norm": 3.303232431411743, "learning_rate": 9.04785998067537e-05, "loss": 0.111, "step": 16780 }, { "epoch": 0.5139743472005388, "grad_norm": 1.8764259815216064, "learning_rate": 9.046472927822187e-05, "loss": 0.0726, "step": 16790 }, { "epoch": 0.5142804665258518, "grad_norm": 2.3412868976593018, "learning_rate": 9.045084971874738e-05, "loss": 0.0768, "step": 16800 }, { "epoch": 0.5145865858511648, "grad_norm": 0.7884442806243896, "learning_rate": 9.043696113142784e-05, "loss": 0.0739, "step": 16810 }, { "epoch": 0.5148927051764778, "grad_norm": 1.5131927728652954, "learning_rate": 9.042306351936294e-05, "loss": 0.077, "step": 16820 }, { "epoch": 0.5151988245017908, "grad_norm": 2.6428468227386475, "learning_rate": 9.040915688565434e-05, "loss": 0.1534, "step": 16830 }, { "epoch": 0.5155049438271038, "grad_norm": 4.112473011016846, "learning_rate": 9.039524123340572e-05, "loss": 0.1088, "step": 16840 }, { "epoch": 0.5158110631524168, "grad_norm": 0.6373800039291382, "learning_rate": 9.038131656572284e-05, "loss": 0.0941, "step": 16850 }, { "epoch": 0.5161171824777299, "grad_norm": 0.6467779278755188, "learning_rate": 9.036738288571335e-05, "loss": 0.1739, "step": 16860 }, { "epoch": 0.5164233018030429, "grad_norm": 0.7952077388763428, "learning_rate": 9.035344019648702e-05, "loss": 0.0464, "step": 16870 }, { "epoch": 0.5167294211283558, "grad_norm": 0.6587779521942139, "learning_rate": 9.033948850115557e-05, "loss": 0.1105, "step": 16880 }, { "epoch": 0.5170355404536688, "grad_norm": 1.5167827606201172, "learning_rate": 9.032552780283274e-05, "loss": 0.059, "step": 16890 }, { "epoch": 0.5173416597789818, "grad_norm": 0.8524138927459717, "learning_rate": 9.031155810463431e-05, "loss": 0.0934, "step": 16900 }, { "epoch": 0.5176477791042948, "grad_norm": 1.4036937952041626, "learning_rate": 9.029757940967803e-05, "loss": 0.0823, "step": 16910 }, { "epoch": 0.5179538984296078, "grad_norm": 2.5998334884643555, "learning_rate": 9.028359172108367e-05, "loss": 0.1449, "step": 16920 }, { "epoch": 0.5182600177549209, "grad_norm": 2.6439948081970215, "learning_rate": 9.026959504197302e-05, "loss": 0.1002, "step": 16930 }, { "epoch": 0.5185661370802339, "grad_norm": 2.2357892990112305, "learning_rate": 9.025558937546988e-05, "loss": 0.0761, "step": 16940 }, { "epoch": 0.5188722564055469, "grad_norm": 1.4890446662902832, "learning_rate": 9.02415747247e-05, "loss": 0.0831, "step": 16950 }, { "epoch": 0.5191783757308599, "grad_norm": 2.0321457386016846, "learning_rate": 9.022755109279122e-05, "loss": 0.0683, "step": 16960 }, { "epoch": 0.5194844950561729, "grad_norm": 3.258251905441284, "learning_rate": 9.021351848287333e-05, "loss": 0.0649, "step": 16970 }, { "epoch": 0.5197906143814859, "grad_norm": 2.071309804916382, "learning_rate": 9.019947689807812e-05, "loss": 0.1289, "step": 16980 }, { "epoch": 0.5200967337067989, "grad_norm": 1.2024916410446167, "learning_rate": 9.018542634153944e-05, "loss": 0.1302, "step": 16990 }, { "epoch": 0.5204028530321119, "grad_norm": 2.8159337043762207, "learning_rate": 9.017136681639307e-05, "loss": 0.1123, "step": 17000 }, { "epoch": 0.520708972357425, "grad_norm": 2.572781562805176, "learning_rate": 9.015729832577681e-05, "loss": 0.1088, "step": 17010 }, { "epoch": 0.521015091682738, "grad_norm": 2.122330904006958, "learning_rate": 9.014322087283052e-05, "loss": 0.0702, "step": 17020 }, { "epoch": 0.521321211008051, "grad_norm": 2.013110876083374, "learning_rate": 9.012913446069597e-05, "loss": 0.0761, "step": 17030 }, { "epoch": 0.521627330333364, "grad_norm": 1.901363492012024, "learning_rate": 9.0115039092517e-05, "loss": 0.1158, "step": 17040 }, { "epoch": 0.5219334496586769, "grad_norm": 1.4622840881347656, "learning_rate": 9.010093477143942e-05, "loss": 0.0921, "step": 17050 }, { "epoch": 0.5222395689839899, "grad_norm": 2.1239676475524902, "learning_rate": 9.008682150061102e-05, "loss": 0.075, "step": 17060 }, { "epoch": 0.5225456883093029, "grad_norm": 0.5313006639480591, "learning_rate": 9.007269928318163e-05, "loss": 0.1362, "step": 17070 }, { "epoch": 0.522851807634616, "grad_norm": 1.055264949798584, "learning_rate": 9.005856812230304e-05, "loss": 0.0871, "step": 17080 }, { "epoch": 0.523157926959929, "grad_norm": 1.2796859741210938, "learning_rate": 9.004442802112909e-05, "loss": 0.1236, "step": 17090 }, { "epoch": 0.523464046285242, "grad_norm": 2.644641160964966, "learning_rate": 9.003027898281552e-05, "loss": 0.0715, "step": 17100 }, { "epoch": 0.523770165610555, "grad_norm": 0.6953494548797607, "learning_rate": 9.001612101052015e-05, "loss": 0.1453, "step": 17110 }, { "epoch": 0.524076284935868, "grad_norm": 2.150362491607666, "learning_rate": 9.000195410740277e-05, "loss": 0.0567, "step": 17120 }, { "epoch": 0.524382404261181, "grad_norm": 0.721895158290863, "learning_rate": 8.998777827662514e-05, "loss": 0.1037, "step": 17130 }, { "epoch": 0.524688523586494, "grad_norm": 0.5438295006752014, "learning_rate": 8.997359352135105e-05, "loss": 0.1, "step": 17140 }, { "epoch": 0.524994642911807, "grad_norm": 1.4209331274032593, "learning_rate": 8.995939984474624e-05, "loss": 0.1246, "step": 17150 }, { "epoch": 0.5253007622371201, "grad_norm": 3.076477289199829, "learning_rate": 8.99451972499785e-05, "loss": 0.1558, "step": 17160 }, { "epoch": 0.5256068815624331, "grad_norm": 1.4353572130203247, "learning_rate": 8.993098574021752e-05, "loss": 0.1172, "step": 17170 }, { "epoch": 0.5259130008877461, "grad_norm": 1.160791039466858, "learning_rate": 8.991676531863508e-05, "loss": 0.1181, "step": 17180 }, { "epoch": 0.526219120213059, "grad_norm": 3.511200428009033, "learning_rate": 8.990253598840489e-05, "loss": 0.0835, "step": 17190 }, { "epoch": 0.526525239538372, "grad_norm": 0.9947602152824402, "learning_rate": 8.988829775270264e-05, "loss": 0.1182, "step": 17200 }, { "epoch": 0.526831358863685, "grad_norm": 3.0478508472442627, "learning_rate": 8.987405061470604e-05, "loss": 0.1006, "step": 17210 }, { "epoch": 0.527137478188998, "grad_norm": 2.298609495162964, "learning_rate": 8.98597945775948e-05, "loss": 0.0716, "step": 17220 }, { "epoch": 0.527443597514311, "grad_norm": 3.530823230743408, "learning_rate": 8.984552964455055e-05, "loss": 0.0626, "step": 17230 }, { "epoch": 0.5277497168396241, "grad_norm": 1.6858326196670532, "learning_rate": 8.983125581875698e-05, "loss": 0.1168, "step": 17240 }, { "epoch": 0.5280558361649371, "grad_norm": 1.0556493997573853, "learning_rate": 8.981697310339972e-05, "loss": 0.0816, "step": 17250 }, { "epoch": 0.5283619554902501, "grad_norm": 4.684512615203857, "learning_rate": 8.980268150166637e-05, "loss": 0.0747, "step": 17260 }, { "epoch": 0.5286680748155631, "grad_norm": 1.9220772981643677, "learning_rate": 8.978838101674658e-05, "loss": 0.092, "step": 17270 }, { "epoch": 0.5289741941408761, "grad_norm": 4.822595119476318, "learning_rate": 8.977407165183191e-05, "loss": 0.0831, "step": 17280 }, { "epoch": 0.5292803134661891, "grad_norm": 1.4362778663635254, "learning_rate": 8.975975341011596e-05, "loss": 0.0495, "step": 17290 }, { "epoch": 0.5295864327915021, "grad_norm": 1.3459266424179077, "learning_rate": 8.974542629479425e-05, "loss": 0.1175, "step": 17300 }, { "epoch": 0.5298925521168152, "grad_norm": 0.9239422082901001, "learning_rate": 8.973109030906433e-05, "loss": 0.0963, "step": 17310 }, { "epoch": 0.5301986714421282, "grad_norm": 0.8661477565765381, "learning_rate": 8.971674545612574e-05, "loss": 0.0807, "step": 17320 }, { "epoch": 0.5305047907674412, "grad_norm": 1.559001088142395, "learning_rate": 8.970239173917993e-05, "loss": 0.0553, "step": 17330 }, { "epoch": 0.5308109100927542, "grad_norm": 7.289370059967041, "learning_rate": 8.968802916143039e-05, "loss": 0.1202, "step": 17340 }, { "epoch": 0.5311170294180672, "grad_norm": 6.027449131011963, "learning_rate": 8.967365772608258e-05, "loss": 0.0931, "step": 17350 }, { "epoch": 0.5314231487433801, "grad_norm": 1.6569733619689941, "learning_rate": 8.965927743634391e-05, "loss": 0.0656, "step": 17360 }, { "epoch": 0.5317292680686931, "grad_norm": 1.6700319051742554, "learning_rate": 8.964488829542377e-05, "loss": 0.0913, "step": 17370 }, { "epoch": 0.5320353873940061, "grad_norm": 1.4534227848052979, "learning_rate": 8.963049030653357e-05, "loss": 0.0586, "step": 17380 }, { "epoch": 0.5323415067193192, "grad_norm": 2.439934253692627, "learning_rate": 8.961608347288663e-05, "loss": 0.1228, "step": 17390 }, { "epoch": 0.5326476260446322, "grad_norm": 5.925632953643799, "learning_rate": 8.96016677976983e-05, "loss": 0.0725, "step": 17400 }, { "epoch": 0.5329537453699452, "grad_norm": 1.224521517753601, "learning_rate": 8.958724328418587e-05, "loss": 0.1044, "step": 17410 }, { "epoch": 0.5332598646952582, "grad_norm": 1.4637165069580078, "learning_rate": 8.957280993556861e-05, "loss": 0.1288, "step": 17420 }, { "epoch": 0.5335659840205712, "grad_norm": 1.5781984329223633, "learning_rate": 8.955836775506776e-05, "loss": 0.1103, "step": 17430 }, { "epoch": 0.5338721033458842, "grad_norm": 1.4218541383743286, "learning_rate": 8.954391674590656e-05, "loss": 0.0619, "step": 17440 }, { "epoch": 0.5341782226711972, "grad_norm": 1.9340968132019043, "learning_rate": 8.952945691131015e-05, "loss": 0.1019, "step": 17450 }, { "epoch": 0.5344843419965103, "grad_norm": 1.944988489151001, "learning_rate": 8.951498825450573e-05, "loss": 0.0834, "step": 17460 }, { "epoch": 0.5347904613218233, "grad_norm": 2.6533987522125244, "learning_rate": 8.950051077872238e-05, "loss": 0.0777, "step": 17470 }, { "epoch": 0.5350965806471363, "grad_norm": 1.45201575756073, "learning_rate": 8.948602448719124e-05, "loss": 0.0743, "step": 17480 }, { "epoch": 0.5354026999724493, "grad_norm": 1.710046648979187, "learning_rate": 8.947152938314532e-05, "loss": 0.0754, "step": 17490 }, { "epoch": 0.5357088192977623, "grad_norm": 1.6024376153945923, "learning_rate": 8.945702546981969e-05, "loss": 0.0973, "step": 17500 }, { "epoch": 0.5360149386230753, "grad_norm": 1.24100923538208, "learning_rate": 8.94425127504513e-05, "loss": 0.098, "step": 17510 }, { "epoch": 0.5363210579483882, "grad_norm": 1.2606449127197266, "learning_rate": 8.942799122827914e-05, "loss": 0.1006, "step": 17520 }, { "epoch": 0.5366271772737012, "grad_norm": 2.159536123275757, "learning_rate": 8.941346090654412e-05, "loss": 0.0875, "step": 17530 }, { "epoch": 0.5369332965990143, "grad_norm": 1.6715996265411377, "learning_rate": 8.939892178848914e-05, "loss": 0.1417, "step": 17540 }, { "epoch": 0.5372394159243273, "grad_norm": 1.404321551322937, "learning_rate": 8.938437387735903e-05, "loss": 0.0986, "step": 17550 }, { "epoch": 0.5375455352496403, "grad_norm": 6.168661117553711, "learning_rate": 8.936981717640061e-05, "loss": 0.0788, "step": 17560 }, { "epoch": 0.5378516545749533, "grad_norm": 2.548712968826294, "learning_rate": 8.935525168886262e-05, "loss": 0.0768, "step": 17570 }, { "epoch": 0.5381577739002663, "grad_norm": 0.9194461703300476, "learning_rate": 8.934067741799587e-05, "loss": 0.0797, "step": 17580 }, { "epoch": 0.5384638932255793, "grad_norm": 1.5519667863845825, "learning_rate": 8.9326094367053e-05, "loss": 0.0581, "step": 17590 }, { "epoch": 0.5387700125508923, "grad_norm": 1.6295015811920166, "learning_rate": 8.931150253928866e-05, "loss": 0.0661, "step": 17600 }, { "epoch": 0.5390761318762054, "grad_norm": 3.6348607540130615, "learning_rate": 8.929690193795948e-05, "loss": 0.1334, "step": 17610 }, { "epoch": 0.5393822512015184, "grad_norm": 4.874104976654053, "learning_rate": 8.928229256632404e-05, "loss": 0.07, "step": 17620 }, { "epoch": 0.5396883705268314, "grad_norm": 1.8207197189331055, "learning_rate": 8.926767442764284e-05, "loss": 0.0722, "step": 17630 }, { "epoch": 0.5399944898521444, "grad_norm": 0.6412423849105835, "learning_rate": 8.92530475251784e-05, "loss": 0.0677, "step": 17640 }, { "epoch": 0.5403006091774574, "grad_norm": 2.106632709503174, "learning_rate": 8.923841186219512e-05, "loss": 0.0778, "step": 17650 }, { "epoch": 0.5406067285027704, "grad_norm": 1.9696195125579834, "learning_rate": 8.922376744195944e-05, "loss": 0.0513, "step": 17660 }, { "epoch": 0.5409128478280834, "grad_norm": 1.9372191429138184, "learning_rate": 8.920911426773967e-05, "loss": 0.0855, "step": 17670 }, { "epoch": 0.5412189671533963, "grad_norm": 1.0542749166488647, "learning_rate": 8.919445234280614e-05, "loss": 0.0837, "step": 17680 }, { "epoch": 0.5415250864787094, "grad_norm": 11.035536766052246, "learning_rate": 8.917978167043111e-05, "loss": 0.1276, "step": 17690 }, { "epoch": 0.5418312058040224, "grad_norm": 1.8971259593963623, "learning_rate": 8.916510225388878e-05, "loss": 0.0769, "step": 17700 }, { "epoch": 0.5421373251293354, "grad_norm": 1.3654263019561768, "learning_rate": 8.91504140964553e-05, "loss": 0.06, "step": 17710 }, { "epoch": 0.5424434444546484, "grad_norm": 1.0783042907714844, "learning_rate": 8.91357172014088e-05, "loss": 0.0857, "step": 17720 }, { "epoch": 0.5427495637799614, "grad_norm": 1.2406238317489624, "learning_rate": 8.912101157202933e-05, "loss": 0.1355, "step": 17730 }, { "epoch": 0.5430556831052744, "grad_norm": 0.5089297890663147, "learning_rate": 8.910629721159892e-05, "loss": 0.0389, "step": 17740 }, { "epoch": 0.5433618024305874, "grad_norm": 3.6994469165802, "learning_rate": 8.90915741234015e-05, "loss": 0.0999, "step": 17750 }, { "epoch": 0.5436679217559004, "grad_norm": 1.8375918865203857, "learning_rate": 8.9076842310723e-05, "loss": 0.1043, "step": 17760 }, { "epoch": 0.5439740410812135, "grad_norm": 31.0684871673584, "learning_rate": 8.906210177685127e-05, "loss": 0.1032, "step": 17770 }, { "epoch": 0.5442801604065265, "grad_norm": 2.4394044876098633, "learning_rate": 8.90473525250761e-05, "loss": 0.075, "step": 17780 }, { "epoch": 0.5445862797318395, "grad_norm": 3.3439793586730957, "learning_rate": 8.903259455868925e-05, "loss": 0.0539, "step": 17790 }, { "epoch": 0.5448923990571525, "grad_norm": 2.1890225410461426, "learning_rate": 8.901782788098442e-05, "loss": 0.1038, "step": 17800 }, { "epoch": 0.5451985183824655, "grad_norm": 3.743161916732788, "learning_rate": 8.900305249525722e-05, "loss": 0.0753, "step": 17810 }, { "epoch": 0.5455046377077785, "grad_norm": 2.369563579559326, "learning_rate": 8.898826840480525e-05, "loss": 0.094, "step": 17820 }, { "epoch": 0.5458107570330915, "grad_norm": 1.2869993448257446, "learning_rate": 8.897347561292803e-05, "loss": 0.0426, "step": 17830 }, { "epoch": 0.5461168763584046, "grad_norm": 1.3060675859451294, "learning_rate": 8.895867412292702e-05, "loss": 0.1006, "step": 17840 }, { "epoch": 0.5464229956837175, "grad_norm": 2.4838740825653076, "learning_rate": 8.894386393810563e-05, "loss": 0.0794, "step": 17850 }, { "epoch": 0.5467291150090305, "grad_norm": 0.33143845200538635, "learning_rate": 8.892904506176919e-05, "loss": 0.0628, "step": 17860 }, { "epoch": 0.5470352343343435, "grad_norm": 1.2539291381835938, "learning_rate": 8.8914217497225e-05, "loss": 0.0704, "step": 17870 }, { "epoch": 0.5473413536596565, "grad_norm": 2.5828073024749756, "learning_rate": 8.88993812477823e-05, "loss": 0.1051, "step": 17880 }, { "epoch": 0.5476474729849695, "grad_norm": 0.9927963614463806, "learning_rate": 8.888453631675222e-05, "loss": 0.0546, "step": 17890 }, { "epoch": 0.5479535923102825, "grad_norm": 1.7073454856872559, "learning_rate": 8.886968270744787e-05, "loss": 0.0606, "step": 17900 }, { "epoch": 0.5482597116355955, "grad_norm": 1.932438850402832, "learning_rate": 8.88548204231843e-05, "loss": 0.0691, "step": 17910 }, { "epoch": 0.5485658309609086, "grad_norm": 3.7842986583709717, "learning_rate": 8.883994946727849e-05, "loss": 0.0632, "step": 17920 }, { "epoch": 0.5488719502862216, "grad_norm": 2.300132989883423, "learning_rate": 8.882506984304933e-05, "loss": 0.0823, "step": 17930 }, { "epoch": 0.5491780696115346, "grad_norm": 6.929584503173828, "learning_rate": 8.881018155381766e-05, "loss": 0.0901, "step": 17940 }, { "epoch": 0.5494841889368476, "grad_norm": 0.9251776933670044, "learning_rate": 8.879528460290627e-05, "loss": 0.1166, "step": 17950 }, { "epoch": 0.5497903082621606, "grad_norm": 2.0602152347564697, "learning_rate": 8.878037899363987e-05, "loss": 0.0855, "step": 17960 }, { "epoch": 0.5500964275874736, "grad_norm": 2.1690666675567627, "learning_rate": 8.87654647293451e-05, "loss": 0.0942, "step": 17970 }, { "epoch": 0.5504025469127866, "grad_norm": 0.9474266171455383, "learning_rate": 8.875054181335054e-05, "loss": 0.1533, "step": 17980 }, { "epoch": 0.5507086662380997, "grad_norm": 1.605456829071045, "learning_rate": 8.873561024898668e-05, "loss": 0.079, "step": 17990 }, { "epoch": 0.5510147855634127, "grad_norm": 1.3590905666351318, "learning_rate": 8.872067003958598e-05, "loss": 0.0709, "step": 18000 }, { "epoch": 0.5513209048887256, "grad_norm": 1.9974632263183594, "learning_rate": 8.870572118848277e-05, "loss": 0.0804, "step": 18010 }, { "epoch": 0.5516270242140386, "grad_norm": 2.2061359882354736, "learning_rate": 8.869076369901338e-05, "loss": 0.1035, "step": 18020 }, { "epoch": 0.5519331435393516, "grad_norm": 0.70917147397995, "learning_rate": 8.867579757451601e-05, "loss": 0.0519, "step": 18030 }, { "epoch": 0.5522392628646646, "grad_norm": 1.362100601196289, "learning_rate": 8.866082281833081e-05, "loss": 0.1333, "step": 18040 }, { "epoch": 0.5525453821899776, "grad_norm": 1.3874975442886353, "learning_rate": 8.864583943379987e-05, "loss": 0.0572, "step": 18050 }, { "epoch": 0.5528515015152906, "grad_norm": 1.4474339485168457, "learning_rate": 8.863084742426719e-05, "loss": 0.0813, "step": 18060 }, { "epoch": 0.5531576208406037, "grad_norm": 1.0422884225845337, "learning_rate": 8.861584679307865e-05, "loss": 0.0696, "step": 18070 }, { "epoch": 0.5534637401659167, "grad_norm": 2.5996367931365967, "learning_rate": 8.860083754358218e-05, "loss": 0.0658, "step": 18080 }, { "epoch": 0.5537698594912297, "grad_norm": 1.2275768518447876, "learning_rate": 8.858581967912749e-05, "loss": 0.0603, "step": 18090 }, { "epoch": 0.5540759788165427, "grad_norm": 2.9202091693878174, "learning_rate": 8.85707932030663e-05, "loss": 0.0914, "step": 18100 }, { "epoch": 0.5543820981418557, "grad_norm": 0.9320257306098938, "learning_rate": 8.85557581187522e-05, "loss": 0.1118, "step": 18110 }, { "epoch": 0.5546882174671687, "grad_norm": 2.806178092956543, "learning_rate": 8.854071442954078e-05, "loss": 0.0772, "step": 18120 }, { "epoch": 0.5549943367924817, "grad_norm": 0.566160261631012, "learning_rate": 8.852566213878947e-05, "loss": 0.0571, "step": 18130 }, { "epoch": 0.5553004561177948, "grad_norm": 2.1974711418151855, "learning_rate": 8.851060124985765e-05, "loss": 0.0827, "step": 18140 }, { "epoch": 0.5556065754431078, "grad_norm": 0.47594401240348816, "learning_rate": 8.849553176610661e-05, "loss": 0.0716, "step": 18150 }, { "epoch": 0.5559126947684208, "grad_norm": 1.5340893268585205, "learning_rate": 8.848045369089958e-05, "loss": 0.0499, "step": 18160 }, { "epoch": 0.5562188140937337, "grad_norm": 0.8226722478866577, "learning_rate": 8.846536702760168e-05, "loss": 0.1263, "step": 18170 }, { "epoch": 0.5565249334190467, "grad_norm": 2.8143818378448486, "learning_rate": 8.845027177958e-05, "loss": 0.1037, "step": 18180 }, { "epoch": 0.5568310527443597, "grad_norm": 0.8161550164222717, "learning_rate": 8.843516795020346e-05, "loss": 0.0995, "step": 18190 }, { "epoch": 0.5571371720696727, "grad_norm": 2.607715368270874, "learning_rate": 8.842005554284296e-05, "loss": 0.0763, "step": 18200 }, { "epoch": 0.5574432913949857, "grad_norm": 2.4699668884277344, "learning_rate": 8.840493456087129e-05, "loss": 0.082, "step": 18210 }, { "epoch": 0.5577494107202988, "grad_norm": 1.0549659729003906, "learning_rate": 8.838980500766317e-05, "loss": 0.0599, "step": 18220 }, { "epoch": 0.5580555300456118, "grad_norm": 0.9297858476638794, "learning_rate": 8.837466688659521e-05, "loss": 0.1019, "step": 18230 }, { "epoch": 0.5583616493709248, "grad_norm": 1.4249616861343384, "learning_rate": 8.835952020104598e-05, "loss": 0.071, "step": 18240 }, { "epoch": 0.5586677686962378, "grad_norm": 7.003759860992432, "learning_rate": 8.834436495439588e-05, "loss": 0.1599, "step": 18250 }, { "epoch": 0.5589738880215508, "grad_norm": 2.0097458362579346, "learning_rate": 8.832920115002729e-05, "loss": 0.1116, "step": 18260 }, { "epoch": 0.5592800073468638, "grad_norm": 0.8163025975227356, "learning_rate": 8.831402879132446e-05, "loss": 0.0905, "step": 18270 }, { "epoch": 0.5595861266721768, "grad_norm": 2.7733302116394043, "learning_rate": 8.829884788167359e-05, "loss": 0.1188, "step": 18280 }, { "epoch": 0.5598922459974898, "grad_norm": 8.863258361816406, "learning_rate": 8.828365842446276e-05, "loss": 0.06, "step": 18290 }, { "epoch": 0.5601983653228029, "grad_norm": 2.663630485534668, "learning_rate": 8.826846042308196e-05, "loss": 0.0971, "step": 18300 }, { "epoch": 0.5605044846481159, "grad_norm": 1.5266823768615723, "learning_rate": 8.825325388092308e-05, "loss": 0.0749, "step": 18310 }, { "epoch": 0.5608106039734289, "grad_norm": 2.306844472885132, "learning_rate": 8.823803880137993e-05, "loss": 0.1988, "step": 18320 }, { "epoch": 0.5611167232987418, "grad_norm": 1.5924192667007446, "learning_rate": 8.822281518784821e-05, "loss": 0.0789, "step": 18330 }, { "epoch": 0.5614228426240548, "grad_norm": 1.0598334074020386, "learning_rate": 8.820758304372557e-05, "loss": 0.0537, "step": 18340 }, { "epoch": 0.5617289619493678, "grad_norm": 3.652139902114868, "learning_rate": 8.819234237241148e-05, "loss": 0.0859, "step": 18350 }, { "epoch": 0.5620350812746808, "grad_norm": 3.107088327407837, "learning_rate": 8.81770931773074e-05, "loss": 0.057, "step": 18360 }, { "epoch": 0.5623412005999939, "grad_norm": 0.4609134793281555, "learning_rate": 8.816183546181666e-05, "loss": 0.083, "step": 18370 }, { "epoch": 0.5626473199253069, "grad_norm": 1.6694644689559937, "learning_rate": 8.814656922934444e-05, "loss": 0.0835, "step": 18380 }, { "epoch": 0.5629534392506199, "grad_norm": 0.5141112804412842, "learning_rate": 8.813129448329791e-05, "loss": 0.1345, "step": 18390 }, { "epoch": 0.5632595585759329, "grad_norm": 1.5268311500549316, "learning_rate": 8.811601122708608e-05, "loss": 0.1234, "step": 18400 }, { "epoch": 0.5635656779012459, "grad_norm": 2.127094030380249, "learning_rate": 8.810071946411989e-05, "loss": 0.0573, "step": 18410 }, { "epoch": 0.5638717972265589, "grad_norm": 1.5593664646148682, "learning_rate": 8.808541919781214e-05, "loss": 0.0633, "step": 18420 }, { "epoch": 0.5641779165518719, "grad_norm": 2.139944553375244, "learning_rate": 8.807011043157758e-05, "loss": 0.1156, "step": 18430 }, { "epoch": 0.5644840358771849, "grad_norm": 2.670586585998535, "learning_rate": 8.80547931688328e-05, "loss": 0.1025, "step": 18440 }, { "epoch": 0.564790155202498, "grad_norm": 1.6591285467147827, "learning_rate": 8.803946741299634e-05, "loss": 0.1566, "step": 18450 }, { "epoch": 0.565096274527811, "grad_norm": 3.873175621032715, "learning_rate": 8.802413316748862e-05, "loss": 0.1198, "step": 18460 }, { "epoch": 0.565402393853124, "grad_norm": 2.5468924045562744, "learning_rate": 8.800879043573191e-05, "loss": 0.083, "step": 18470 }, { "epoch": 0.565708513178437, "grad_norm": 0.5579409003257751, "learning_rate": 8.799343922115044e-05, "loss": 0.067, "step": 18480 }, { "epoch": 0.56601463250375, "grad_norm": 1.506224274635315, "learning_rate": 8.797807952717028e-05, "loss": 0.0852, "step": 18490 }, { "epoch": 0.5663207518290629, "grad_norm": 1.983156681060791, "learning_rate": 8.796271135721944e-05, "loss": 0.0816, "step": 18500 }, { "epoch": 0.5666268711543759, "grad_norm": 1.1202529668807983, "learning_rate": 8.794733471472778e-05, "loss": 0.0866, "step": 18510 }, { "epoch": 0.566932990479689, "grad_norm": 3.253276824951172, "learning_rate": 8.793194960312707e-05, "loss": 0.0754, "step": 18520 }, { "epoch": 0.567239109805002, "grad_norm": 22.81183624267578, "learning_rate": 8.791655602585099e-05, "loss": 0.0986, "step": 18530 }, { "epoch": 0.567545229130315, "grad_norm": 2.8610188961029053, "learning_rate": 8.790115398633507e-05, "loss": 0.1074, "step": 18540 }, { "epoch": 0.567851348455628, "grad_norm": 1.1412922143936157, "learning_rate": 8.788574348801675e-05, "loss": 0.0563, "step": 18550 }, { "epoch": 0.568157467780941, "grad_norm": 0.9019370079040527, "learning_rate": 8.787032453433533e-05, "loss": 0.0712, "step": 18560 }, { "epoch": 0.568463587106254, "grad_norm": 1.1669948101043701, "learning_rate": 8.785489712873206e-05, "loss": 0.0751, "step": 18570 }, { "epoch": 0.568769706431567, "grad_norm": 1.3782049417495728, "learning_rate": 8.783946127465001e-05, "loss": 0.076, "step": 18580 }, { "epoch": 0.56907582575688, "grad_norm": 2.535080671310425, "learning_rate": 8.78240169755342e-05, "loss": 0.1092, "step": 18590 }, { "epoch": 0.5693819450821931, "grad_norm": 2.1479711532592773, "learning_rate": 8.780856423483145e-05, "loss": 0.1296, "step": 18600 }, { "epoch": 0.5696880644075061, "grad_norm": 1.4038639068603516, "learning_rate": 8.779310305599053e-05, "loss": 0.0929, "step": 18610 }, { "epoch": 0.5699941837328191, "grad_norm": 1.7132821083068848, "learning_rate": 8.77776334424621e-05, "loss": 0.0779, "step": 18620 }, { "epoch": 0.5703003030581321, "grad_norm": 2.096501588821411, "learning_rate": 8.776215539769865e-05, "loss": 0.136, "step": 18630 }, { "epoch": 0.570606422383445, "grad_norm": 2.5387356281280518, "learning_rate": 8.774666892515458e-05, "loss": 0.086, "step": 18640 }, { "epoch": 0.570912541708758, "grad_norm": 3.6319987773895264, "learning_rate": 8.773117402828617e-05, "loss": 0.1018, "step": 18650 }, { "epoch": 0.571218661034071, "grad_norm": 1.4638314247131348, "learning_rate": 8.77156707105516e-05, "loss": 0.117, "step": 18660 }, { "epoch": 0.571524780359384, "grad_norm": 4.653579235076904, "learning_rate": 8.77001589754109e-05, "loss": 0.1394, "step": 18670 }, { "epoch": 0.5718308996846971, "grad_norm": 0.5305338501930237, "learning_rate": 8.768463882632598e-05, "loss": 0.0504, "step": 18680 }, { "epoch": 0.5721370190100101, "grad_norm": 3.3701834678649902, "learning_rate": 8.766911026676064e-05, "loss": 0.134, "step": 18690 }, { "epoch": 0.5724431383353231, "grad_norm": 0.9350230097770691, "learning_rate": 8.765357330018056e-05, "loss": 0.0855, "step": 18700 }, { "epoch": 0.5727492576606361, "grad_norm": 1.3407856225967407, "learning_rate": 8.763802793005327e-05, "loss": 0.1677, "step": 18710 }, { "epoch": 0.5730553769859491, "grad_norm": 4.982719421386719, "learning_rate": 8.762247415984823e-05, "loss": 0.0659, "step": 18720 }, { "epoch": 0.5733614963112621, "grad_norm": 1.8883341550827026, "learning_rate": 8.76069119930367e-05, "loss": 0.0883, "step": 18730 }, { "epoch": 0.5736676156365751, "grad_norm": 2.2071568965911865, "learning_rate": 8.759134143309189e-05, "loss": 0.0793, "step": 18740 }, { "epoch": 0.5739737349618882, "grad_norm": 0.8038753271102905, "learning_rate": 8.757576248348882e-05, "loss": 0.0847, "step": 18750 }, { "epoch": 0.5742798542872012, "grad_norm": 4.283339977264404, "learning_rate": 8.756017514770443e-05, "loss": 0.1028, "step": 18760 }, { "epoch": 0.5745859736125142, "grad_norm": 0.7252814173698425, "learning_rate": 8.754457942921748e-05, "loss": 0.0986, "step": 18770 }, { "epoch": 0.5748920929378272, "grad_norm": 1.0732625722885132, "learning_rate": 8.752897533150868e-05, "loss": 0.082, "step": 18780 }, { "epoch": 0.5751982122631402, "grad_norm": 6.191799163818359, "learning_rate": 8.751336285806051e-05, "loss": 0.0758, "step": 18790 }, { "epoch": 0.5755043315884532, "grad_norm": 1.0749508142471313, "learning_rate": 8.74977420123574e-05, "loss": 0.0897, "step": 18800 }, { "epoch": 0.5758104509137661, "grad_norm": 1.7769955396652222, "learning_rate": 8.748211279788563e-05, "loss": 0.0682, "step": 18810 }, { "epoch": 0.5761165702390791, "grad_norm": 1.7770689725875854, "learning_rate": 8.746647521813331e-05, "loss": 0.0576, "step": 18820 }, { "epoch": 0.5764226895643922, "grad_norm": 1.376184344291687, "learning_rate": 8.745082927659047e-05, "loss": 0.0831, "step": 18830 }, { "epoch": 0.5767288088897052, "grad_norm": 0.5502825379371643, "learning_rate": 8.743517497674896e-05, "loss": 0.0732, "step": 18840 }, { "epoch": 0.5770349282150182, "grad_norm": 1.4829846620559692, "learning_rate": 8.741951232210253e-05, "loss": 0.0854, "step": 18850 }, { "epoch": 0.5773410475403312, "grad_norm": 0.7775599360466003, "learning_rate": 8.740384131614676e-05, "loss": 0.0841, "step": 18860 }, { "epoch": 0.5776471668656442, "grad_norm": 1.8778620958328247, "learning_rate": 8.738816196237913e-05, "loss": 0.109, "step": 18870 }, { "epoch": 0.5779532861909572, "grad_norm": 2.107863187789917, "learning_rate": 8.737247426429898e-05, "loss": 0.1507, "step": 18880 }, { "epoch": 0.5782594055162702, "grad_norm": 1.9294424057006836, "learning_rate": 8.735677822540749e-05, "loss": 0.0681, "step": 18890 }, { "epoch": 0.5785655248415833, "grad_norm": 1.2979081869125366, "learning_rate": 8.73410738492077e-05, "loss": 0.1257, "step": 18900 }, { "epoch": 0.5788716441668963, "grad_norm": 1.3356834650039673, "learning_rate": 8.732536113920453e-05, "loss": 0.0537, "step": 18910 }, { "epoch": 0.5791777634922093, "grad_norm": 15.372196197509766, "learning_rate": 8.730964009890475e-05, "loss": 0.1447, "step": 18920 }, { "epoch": 0.5794838828175223, "grad_norm": 1.1194695234298706, "learning_rate": 8.729391073181701e-05, "loss": 0.1039, "step": 18930 }, { "epoch": 0.5797900021428353, "grad_norm": 1.6513419151306152, "learning_rate": 8.727817304145176e-05, "loss": 0.0899, "step": 18940 }, { "epoch": 0.5800961214681483, "grad_norm": 3.5611321926116943, "learning_rate": 8.726242703132137e-05, "loss": 0.1004, "step": 18950 }, { "epoch": 0.5804022407934613, "grad_norm": 0.8349171876907349, "learning_rate": 8.724667270494006e-05, "loss": 0.078, "step": 18960 }, { "epoch": 0.5807083601187742, "grad_norm": 1.830304503440857, "learning_rate": 8.723091006582389e-05, "loss": 0.1166, "step": 18970 }, { "epoch": 0.5810144794440874, "grad_norm": 0.5099964737892151, "learning_rate": 8.721513911749073e-05, "loss": 0.0529, "step": 18980 }, { "epoch": 0.5813205987694003, "grad_norm": 1.1803936958312988, "learning_rate": 8.719935986346039e-05, "loss": 0.1112, "step": 18990 }, { "epoch": 0.5816267180947133, "grad_norm": 3.7399539947509766, "learning_rate": 8.718357230725449e-05, "loss": 0.128, "step": 19000 }, { "epoch": 0.5819328374200263, "grad_norm": 3.4703593254089355, "learning_rate": 8.716777645239649e-05, "loss": 0.107, "step": 19010 }, { "epoch": 0.5822389567453393, "grad_norm": 3.314333915710449, "learning_rate": 8.715197230241173e-05, "loss": 0.1266, "step": 19020 }, { "epoch": 0.5825450760706523, "grad_norm": 2.1478676795959473, "learning_rate": 8.713615986082739e-05, "loss": 0.1151, "step": 19030 }, { "epoch": 0.5828511953959653, "grad_norm": 1.5035841464996338, "learning_rate": 8.71203391311725e-05, "loss": 0.0861, "step": 19040 }, { "epoch": 0.5831573147212784, "grad_norm": 1.1026999950408936, "learning_rate": 8.710451011697793e-05, "loss": 0.1343, "step": 19050 }, { "epoch": 0.5834634340465914, "grad_norm": 0.787921130657196, "learning_rate": 8.708867282177643e-05, "loss": 0.0675, "step": 19060 }, { "epoch": 0.5837695533719044, "grad_norm": 1.542465329170227, "learning_rate": 8.707282724910255e-05, "loss": 0.0542, "step": 19070 }, { "epoch": 0.5840756726972174, "grad_norm": 1.066216230392456, "learning_rate": 8.705697340249275e-05, "loss": 0.0675, "step": 19080 }, { "epoch": 0.5843817920225304, "grad_norm": 3.4143145084381104, "learning_rate": 8.704111128548527e-05, "loss": 0.0652, "step": 19090 }, { "epoch": 0.5846879113478434, "grad_norm": 1.7182910442352295, "learning_rate": 8.702524090162021e-05, "loss": 0.0929, "step": 19100 }, { "epoch": 0.5849940306731564, "grad_norm": 1.3129005432128906, "learning_rate": 8.700936225443959e-05, "loss": 0.0957, "step": 19110 }, { "epoch": 0.5853001499984694, "grad_norm": 1.867177128791809, "learning_rate": 8.699347534748717e-05, "loss": 0.0865, "step": 19120 }, { "epoch": 0.5856062693237825, "grad_norm": 1.4712152481079102, "learning_rate": 8.697758018430861e-05, "loss": 0.1114, "step": 19130 }, { "epoch": 0.5859123886490955, "grad_norm": 1.4765366315841675, "learning_rate": 8.696167676845143e-05, "loss": 0.087, "step": 19140 }, { "epoch": 0.5862185079744084, "grad_norm": 1.649992823600769, "learning_rate": 8.694576510346493e-05, "loss": 0.0719, "step": 19150 }, { "epoch": 0.5865246272997214, "grad_norm": 10.834943771362305, "learning_rate": 8.692984519290028e-05, "loss": 0.1903, "step": 19160 }, { "epoch": 0.5868307466250344, "grad_norm": 0.8024684190750122, "learning_rate": 8.691391704031051e-05, "loss": 0.1021, "step": 19170 }, { "epoch": 0.5871368659503474, "grad_norm": 2.5359370708465576, "learning_rate": 8.689798064925049e-05, "loss": 0.1086, "step": 19180 }, { "epoch": 0.5874429852756604, "grad_norm": 1.5372257232666016, "learning_rate": 8.688203602327689e-05, "loss": 0.0791, "step": 19190 }, { "epoch": 0.5877491046009734, "grad_norm": 0.9397455453872681, "learning_rate": 8.686608316594825e-05, "loss": 0.0551, "step": 19200 }, { "epoch": 0.5880552239262865, "grad_norm": 1.6303670406341553, "learning_rate": 8.685012208082495e-05, "loss": 0.0629, "step": 19210 }, { "epoch": 0.5883613432515995, "grad_norm": 0.6113882064819336, "learning_rate": 8.683415277146916e-05, "loss": 0.0798, "step": 19220 }, { "epoch": 0.5886674625769125, "grad_norm": 0.7448592185974121, "learning_rate": 8.681817524144495e-05, "loss": 0.114, "step": 19230 }, { "epoch": 0.5889735819022255, "grad_norm": 0.6192340850830078, "learning_rate": 8.68021894943182e-05, "loss": 0.0762, "step": 19240 }, { "epoch": 0.5892797012275385, "grad_norm": 1.7990257740020752, "learning_rate": 8.678619553365659e-05, "loss": 0.0817, "step": 19250 }, { "epoch": 0.5895858205528515, "grad_norm": 4.377070903778076, "learning_rate": 8.677019336302967e-05, "loss": 0.1071, "step": 19260 }, { "epoch": 0.5898919398781645, "grad_norm": 1.5486366748809814, "learning_rate": 8.675418298600884e-05, "loss": 0.1222, "step": 19270 }, { "epoch": 0.5901980592034776, "grad_norm": 2.890590190887451, "learning_rate": 8.673816440616729e-05, "loss": 0.0827, "step": 19280 }, { "epoch": 0.5905041785287906, "grad_norm": 1.6971317529678345, "learning_rate": 8.672213762708003e-05, "loss": 0.07, "step": 19290 }, { "epoch": 0.5908102978541035, "grad_norm": 1.8228631019592285, "learning_rate": 8.670610265232397e-05, "loss": 0.1298, "step": 19300 }, { "epoch": 0.5911164171794165, "grad_norm": 2.359498977661133, "learning_rate": 8.66900594854778e-05, "loss": 0.1075, "step": 19310 }, { "epoch": 0.5914225365047295, "grad_norm": 2.463705062866211, "learning_rate": 8.6674008130122e-05, "loss": 0.1646, "step": 19320 }, { "epoch": 0.5917286558300425, "grad_norm": 0.9487640261650085, "learning_rate": 8.665794858983896e-05, "loss": 0.0615, "step": 19330 }, { "epoch": 0.5920347751553555, "grad_norm": 1.3945744037628174, "learning_rate": 8.664188086821284e-05, "loss": 0.1274, "step": 19340 }, { "epoch": 0.5923408944806685, "grad_norm": 2.2580134868621826, "learning_rate": 8.662580496882966e-05, "loss": 0.163, "step": 19350 }, { "epoch": 0.5926470138059816, "grad_norm": 16.101028442382812, "learning_rate": 8.660972089527724e-05, "loss": 0.033, "step": 19360 }, { "epoch": 0.5929531331312946, "grad_norm": 1.0345908403396606, "learning_rate": 8.659362865114524e-05, "loss": 0.0644, "step": 19370 }, { "epoch": 0.5932592524566076, "grad_norm": 3.052868604660034, "learning_rate": 8.657752824002512e-05, "loss": 0.0933, "step": 19380 }, { "epoch": 0.5935653717819206, "grad_norm": 1.345623254776001, "learning_rate": 8.656141966551019e-05, "loss": 0.1009, "step": 19390 }, { "epoch": 0.5938714911072336, "grad_norm": 1.922345519065857, "learning_rate": 8.654530293119558e-05, "loss": 0.1082, "step": 19400 }, { "epoch": 0.5941776104325466, "grad_norm": 2.926276206970215, "learning_rate": 8.652917804067821e-05, "loss": 0.1311, "step": 19410 }, { "epoch": 0.5944837297578596, "grad_norm": 1.1183714866638184, "learning_rate": 8.651304499755687e-05, "loss": 0.0562, "step": 19420 }, { "epoch": 0.5947898490831727, "grad_norm": 0.8935567140579224, "learning_rate": 8.649690380543211e-05, "loss": 0.1015, "step": 19430 }, { "epoch": 0.5950959684084857, "grad_norm": 1.3038612604141235, "learning_rate": 8.648075446790637e-05, "loss": 0.0728, "step": 19440 }, { "epoch": 0.5954020877337987, "grad_norm": 1.1368244886398315, "learning_rate": 8.646459698858386e-05, "loss": 0.0996, "step": 19450 }, { "epoch": 0.5957082070591116, "grad_norm": 2.1968276500701904, "learning_rate": 8.644843137107059e-05, "loss": 0.0578, "step": 19460 }, { "epoch": 0.5960143263844246, "grad_norm": 1.0061569213867188, "learning_rate": 8.643225761897443e-05, "loss": 0.1426, "step": 19470 }, { "epoch": 0.5963204457097376, "grad_norm": 1.111851692199707, "learning_rate": 8.641607573590502e-05, "loss": 0.0886, "step": 19480 }, { "epoch": 0.5966265650350506, "grad_norm": 1.0605820417404175, "learning_rate": 8.639988572547389e-05, "loss": 0.0457, "step": 19490 }, { "epoch": 0.5969326843603636, "grad_norm": 1.17946195602417, "learning_rate": 8.638368759129432e-05, "loss": 0.0906, "step": 19500 }, { "epoch": 0.5972388036856767, "grad_norm": 1.5167421102523804, "learning_rate": 8.63674813369814e-05, "loss": 0.099, "step": 19510 }, { "epoch": 0.5975449230109897, "grad_norm": 1.8802787065505981, "learning_rate": 8.635126696615207e-05, "loss": 0.0849, "step": 19520 }, { "epoch": 0.5978510423363027, "grad_norm": 2.0623974800109863, "learning_rate": 8.633504448242505e-05, "loss": 0.1354, "step": 19530 }, { "epoch": 0.5981571616616157, "grad_norm": 2.4153964519500732, "learning_rate": 8.631881388942089e-05, "loss": 0.0766, "step": 19540 }, { "epoch": 0.5984632809869287, "grad_norm": 2.023933172225952, "learning_rate": 8.630257519076196e-05, "loss": 0.095, "step": 19550 }, { "epoch": 0.5987694003122417, "grad_norm": 0.552020251750946, "learning_rate": 8.62863283900724e-05, "loss": 0.0618, "step": 19560 }, { "epoch": 0.5990755196375547, "grad_norm": 0.8430296778678894, "learning_rate": 8.627007349097818e-05, "loss": 0.0858, "step": 19570 }, { "epoch": 0.5993816389628678, "grad_norm": 0.42809367179870605, "learning_rate": 8.625381049710711e-05, "loss": 0.0441, "step": 19580 }, { "epoch": 0.5996877582881808, "grad_norm": 1.2795302867889404, "learning_rate": 8.623753941208872e-05, "loss": 0.1016, "step": 19590 }, { "epoch": 0.5999938776134938, "grad_norm": 1.3502521514892578, "learning_rate": 8.622126023955446e-05, "loss": 0.1084, "step": 19600 }, { "epoch": 0.6002999969388068, "grad_norm": 1.198207974433899, "learning_rate": 8.620497298313748e-05, "loss": 0.1109, "step": 19610 }, { "epoch": 0.6006061162641197, "grad_norm": 1.1594444513320923, "learning_rate": 8.61886776464728e-05, "loss": 0.0541, "step": 19620 }, { "epoch": 0.6009122355894327, "grad_norm": 1.2304247617721558, "learning_rate": 8.617237423319722e-05, "loss": 0.0494, "step": 19630 }, { "epoch": 0.6012183549147457, "grad_norm": 1.3918805122375488, "learning_rate": 8.615606274694934e-05, "loss": 0.0639, "step": 19640 }, { "epoch": 0.6015244742400587, "grad_norm": 1.0724470615386963, "learning_rate": 8.613974319136958e-05, "loss": 0.0922, "step": 19650 }, { "epoch": 0.6018305935653718, "grad_norm": 1.32346510887146, "learning_rate": 8.612341557010014e-05, "loss": 0.0773, "step": 19660 }, { "epoch": 0.6021367128906848, "grad_norm": 1.9886091947555542, "learning_rate": 8.610707988678503e-05, "loss": 0.0915, "step": 19670 }, { "epoch": 0.6024428322159978, "grad_norm": 1.4628443717956543, "learning_rate": 8.609073614507006e-05, "loss": 0.0413, "step": 19680 }, { "epoch": 0.6027489515413108, "grad_norm": 0.878562331199646, "learning_rate": 8.607438434860283e-05, "loss": 0.063, "step": 19690 }, { "epoch": 0.6030550708666238, "grad_norm": 1.2010884284973145, "learning_rate": 8.605802450103275e-05, "loss": 0.0505, "step": 19700 }, { "epoch": 0.6033611901919368, "grad_norm": 1.821014165878296, "learning_rate": 8.604165660601102e-05, "loss": 0.1473, "step": 19710 }, { "epoch": 0.6036673095172498, "grad_norm": 1.7571609020233154, "learning_rate": 8.602528066719062e-05, "loss": 0.0894, "step": 19720 }, { "epoch": 0.6039734288425628, "grad_norm": 1.0768753290176392, "learning_rate": 8.600889668822636e-05, "loss": 0.0727, "step": 19730 }, { "epoch": 0.6042795481678759, "grad_norm": 3.958505153656006, "learning_rate": 8.599250467277483e-05, "loss": 0.0761, "step": 19740 }, { "epoch": 0.6045856674931889, "grad_norm": 1.0353981256484985, "learning_rate": 8.597610462449442e-05, "loss": 0.0952, "step": 19750 }, { "epoch": 0.6048917868185019, "grad_norm": 1.1674911975860596, "learning_rate": 8.595969654704526e-05, "loss": 0.0653, "step": 19760 }, { "epoch": 0.6051979061438149, "grad_norm": 5.949222564697266, "learning_rate": 8.594328044408934e-05, "loss": 0.1355, "step": 19770 }, { "epoch": 0.6055040254691278, "grad_norm": 1.6935853958129883, "learning_rate": 8.592685631929041e-05, "loss": 0.0393, "step": 19780 }, { "epoch": 0.6058101447944408, "grad_norm": 1.3791170120239258, "learning_rate": 8.591042417631404e-05, "loss": 0.0671, "step": 19790 }, { "epoch": 0.6061162641197538, "grad_norm": 0.9852278828620911, "learning_rate": 8.589398401882755e-05, "loss": 0.0516, "step": 19800 }, { "epoch": 0.6064223834450669, "grad_norm": 0.7238945364952087, "learning_rate": 8.587753585050004e-05, "loss": 0.092, "step": 19810 }, { "epoch": 0.6067285027703799, "grad_norm": 1.3391366004943848, "learning_rate": 8.586107967500246e-05, "loss": 0.1147, "step": 19820 }, { "epoch": 0.6070346220956929, "grad_norm": 1.206836223602295, "learning_rate": 8.584461549600749e-05, "loss": 0.0897, "step": 19830 }, { "epoch": 0.6073407414210059, "grad_norm": 6.419968605041504, "learning_rate": 8.582814331718961e-05, "loss": 0.1511, "step": 19840 }, { "epoch": 0.6076468607463189, "grad_norm": 1.1883189678192139, "learning_rate": 8.58116631422251e-05, "loss": 0.1072, "step": 19850 }, { "epoch": 0.6079529800716319, "grad_norm": 0.4477207064628601, "learning_rate": 8.579517497479203e-05, "loss": 0.0509, "step": 19860 }, { "epoch": 0.6082590993969449, "grad_norm": 1.4621031284332275, "learning_rate": 8.57786788185702e-05, "loss": 0.0738, "step": 19870 }, { "epoch": 0.6085652187222579, "grad_norm": 2.196526288986206, "learning_rate": 8.576217467724128e-05, "loss": 0.0715, "step": 19880 }, { "epoch": 0.608871338047571, "grad_norm": 4.036099910736084, "learning_rate": 8.574566255448864e-05, "loss": 0.1239, "step": 19890 }, { "epoch": 0.609177457372884, "grad_norm": 2.787729263305664, "learning_rate": 8.572914245399748e-05, "loss": 0.0621, "step": 19900 }, { "epoch": 0.609483576698197, "grad_norm": 0.678308367729187, "learning_rate": 8.571261437945475e-05, "loss": 0.0671, "step": 19910 }, { "epoch": 0.60978969602351, "grad_norm": 1.1166934967041016, "learning_rate": 8.569607833454923e-05, "loss": 0.076, "step": 19920 }, { "epoch": 0.610095815348823, "grad_norm": 4.571006774902344, "learning_rate": 8.567953432297138e-05, "loss": 0.0852, "step": 19930 }, { "epoch": 0.610401934674136, "grad_norm": 1.4092068672180176, "learning_rate": 8.566298234841357e-05, "loss": 0.051, "step": 19940 }, { "epoch": 0.6107080539994489, "grad_norm": 1.9017550945281982, "learning_rate": 8.564642241456986e-05, "loss": 0.0806, "step": 19950 }, { "epoch": 0.611014173324762, "grad_norm": 1.3629646301269531, "learning_rate": 8.562985452513609e-05, "loss": 0.0597, "step": 19960 }, { "epoch": 0.611320292650075, "grad_norm": 0.8344027996063232, "learning_rate": 8.561327868380991e-05, "loss": 0.0307, "step": 19970 }, { "epoch": 0.611626411975388, "grad_norm": 6.314414024353027, "learning_rate": 8.55966948942907e-05, "loss": 0.0497, "step": 19980 }, { "epoch": 0.611932531300701, "grad_norm": 1.3347326517105103, "learning_rate": 8.558010316027967e-05, "loss": 0.069, "step": 19990 }, { "epoch": 0.612238650626014, "grad_norm": 2.4904561042785645, "learning_rate": 8.556350348547977e-05, "loss": 0.0622, "step": 20000 } ], "logging_steps": 10, "max_steps": 70000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.233371118912e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }