diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21615 @@ +{ + "best_metric": 0.8167605279211759, + "best_model_checkpoint": "/data/hungnm/unisentiment/roberta-base-sentiment/checkpoint-12296", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 15370, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016265452179570592, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1785, + "step": 5 + }, + { + "epoch": 0.0032530904359141183, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1908, + "step": 10 + }, + { + "epoch": 0.004879635653871178, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 2.1871, + "step": 15 + }, + { + "epoch": 0.006506180871828237, + "grad_norm": 0.5645683407783508, + "learning_rate": 1.2987012987012988e-06, + "loss": 2.1871, + "step": 20 + }, + { + "epoch": 0.008132726089785295, + "grad_norm": 0.5938918590545654, + "learning_rate": 2.922077922077922e-06, + "loss": 2.1877, + "step": 25 + }, + { + "epoch": 0.009759271307742356, + "grad_norm": 0.29931628704071045, + "learning_rate": 4.5454545454545455e-06, + "loss": 2.1764, + "step": 30 + }, + { + "epoch": 0.011385816525699415, + "grad_norm": 0.3661767840385437, + "learning_rate": 6.168831168831169e-06, + "loss": 2.1846, + "step": 35 + }, + { + "epoch": 0.013012361743656473, + "grad_norm": 0.4001053273677826, + "learning_rate": 7.792207792207792e-06, + "loss": 2.1768, + "step": 40 + }, + { + "epoch": 0.014638906961613532, + "grad_norm": 0.9277121424674988, + "learning_rate": 9.415584415584416e-06, + "loss": 2.1666, + "step": 45 + }, + { + "epoch": 0.01626545217957059, + "grad_norm": 1.1185601949691772, + "learning_rate": 1.0714285714285714e-05, + "loss": 2.1125, + "step": 50 + }, + { + "epoch": 0.017891997397527653, + "grad_norm": 2.3510420322418213, + "learning_rate": 1.2337662337662339e-05, + "loss": 2.0339, + "step": 55 + }, + { + "epoch": 0.01951854261548471, + "grad_norm": 3.3014020919799805, + "learning_rate": 1.396103896103896e-05, + "loss": 1.8524, + "step": 60 + }, + { + "epoch": 0.02114508783344177, + "grad_norm": 2.7046000957489014, + "learning_rate": 1.525974025974026e-05, + "loss": 1.686, + "step": 65 + }, + { + "epoch": 0.02277163305139883, + "grad_norm": 3.1015822887420654, + "learning_rate": 1.6883116883116884e-05, + "loss": 1.6389, + "step": 70 + }, + { + "epoch": 0.024398178269355888, + "grad_norm": 7.0122504234313965, + "learning_rate": 1.850649350649351e-05, + "loss": 1.5962, + "step": 75 + }, + { + "epoch": 0.026024723487312947, + "grad_norm": 3.228231191635132, + "learning_rate": 2.012987012987013e-05, + "loss": 1.5237, + "step": 80 + }, + { + "epoch": 0.027651268705270005, + "grad_norm": 2.9056921005249023, + "learning_rate": 2.1753246753246752e-05, + "loss": 1.4676, + "step": 85 + }, + { + "epoch": 0.029277813923227064, + "grad_norm": 4.655855655670166, + "learning_rate": 2.3376623376623376e-05, + "loss": 1.4479, + "step": 90 + }, + { + "epoch": 0.030904359141184126, + "grad_norm": 6.634426116943359, + "learning_rate": 2.5e-05, + "loss": 1.3859, + "step": 95 + }, + { + "epoch": 0.03253090435914118, + "grad_norm": 6.168959617614746, + "learning_rate": 2.6623376623376623e-05, + "loss": 1.3611, + "step": 100 + }, + { + "epoch": 0.034157449577098244, + "grad_norm": 8.923408508300781, + "learning_rate": 2.824675324675325e-05, + "loss": 1.4147, + "step": 105 + }, + { + "epoch": 0.035783994795055306, + "grad_norm": 3.014373779296875, + "learning_rate": 2.9870129870129872e-05, + "loss": 1.3446, + "step": 110 + }, + { + "epoch": 0.03741054001301236, + "grad_norm": 2.305469512939453, + "learning_rate": 3.14935064935065e-05, + "loss": 1.3145, + "step": 115 + }, + { + "epoch": 0.03903708523096942, + "grad_norm": 5.387579917907715, + "learning_rate": 3.311688311688312e-05, + "loss": 1.2636, + "step": 120 + }, + { + "epoch": 0.04066363044892648, + "grad_norm": 2.3187530040740967, + "learning_rate": 3.474025974025974e-05, + "loss": 1.2629, + "step": 125 + }, + { + "epoch": 0.04229017566688354, + "grad_norm": 2.4720473289489746, + "learning_rate": 3.6363636363636364e-05, + "loss": 1.2981, + "step": 130 + }, + { + "epoch": 0.043916720884840596, + "grad_norm": 4.648244857788086, + "learning_rate": 3.798701298701299e-05, + "loss": 1.2667, + "step": 135 + }, + { + "epoch": 0.04554326610279766, + "grad_norm": 3.724539279937744, + "learning_rate": 3.9610389610389614e-05, + "loss": 1.2947, + "step": 140 + }, + { + "epoch": 0.04716981132075472, + "grad_norm": 2.6719841957092285, + "learning_rate": 4.123376623376624e-05, + "loss": 1.2512, + "step": 145 + }, + { + "epoch": 0.048796356538711776, + "grad_norm": 3.474255084991455, + "learning_rate": 4.2857142857142856e-05, + "loss": 1.2281, + "step": 150 + }, + { + "epoch": 0.05042290175666884, + "grad_norm": 2.7938802242279053, + "learning_rate": 4.448051948051948e-05, + "loss": 1.21, + "step": 155 + }, + { + "epoch": 0.05204944697462589, + "grad_norm": 5.464128494262695, + "learning_rate": 4.6103896103896106e-05, + "loss": 1.2085, + "step": 160 + }, + { + "epoch": 0.053675992192582955, + "grad_norm": 2.3026108741760254, + "learning_rate": 4.772727272727273e-05, + "loss": 1.2344, + "step": 165 + }, + { + "epoch": 0.05530253741054001, + "grad_norm": 5.3961181640625, + "learning_rate": 4.9350649350649355e-05, + "loss": 1.2371, + "step": 170 + }, + { + "epoch": 0.05692908262849707, + "grad_norm": 3.639165163040161, + "learning_rate": 4.999999520430831e-05, + "loss": 1.183, + "step": 175 + }, + { + "epoch": 0.05855562784645413, + "grad_norm": 2.9800868034362793, + "learning_rate": 4.999996589731022e-05, + "loss": 1.1735, + "step": 180 + }, + { + "epoch": 0.06018217306441119, + "grad_norm": 4.420463562011719, + "learning_rate": 4.999990994761838e-05, + "loss": 1.1907, + "step": 185 + }, + { + "epoch": 0.06180871828236825, + "grad_norm": 3.3192920684814453, + "learning_rate": 4.999982735529244e-05, + "loss": 1.1585, + "step": 190 + }, + { + "epoch": 0.06343526350032531, + "grad_norm": 2.5585310459136963, + "learning_rate": 4.9999718120420395e-05, + "loss": 1.1875, + "step": 195 + }, + { + "epoch": 0.06506180871828236, + "grad_norm": 2.6861331462860107, + "learning_rate": 4.9999582243118675e-05, + "loss": 1.1392, + "step": 200 + }, + { + "epoch": 0.06668835393623943, + "grad_norm": 2.9083189964294434, + "learning_rate": 4.9999419723532074e-05, + "loss": 1.159, + "step": 205 + }, + { + "epoch": 0.06831489915419649, + "grad_norm": 2.3593695163726807, + "learning_rate": 4.99992305618338e-05, + "loss": 1.1589, + "step": 210 + }, + { + "epoch": 0.06994144437215355, + "grad_norm": 3.1515111923217773, + "learning_rate": 4.999901475822544e-05, + "loss": 1.142, + "step": 215 + }, + { + "epoch": 0.07156798959011061, + "grad_norm": 4.399820804595947, + "learning_rate": 4.9998772312936976e-05, + "loss": 1.1331, + "step": 220 + }, + { + "epoch": 0.07319453480806766, + "grad_norm": 3.1851677894592285, + "learning_rate": 4.999850322622679e-05, + "loss": 1.1209, + "step": 225 + }, + { + "epoch": 0.07482108002602472, + "grad_norm": 2.9607701301574707, + "learning_rate": 4.999820749838164e-05, + "loss": 1.1434, + "step": 230 + }, + { + "epoch": 0.07644762524398178, + "grad_norm": 4.575477600097656, + "learning_rate": 4.9997885129716696e-05, + "loss": 1.1271, + "step": 235 + }, + { + "epoch": 0.07807417046193885, + "grad_norm": 2.9342846870422363, + "learning_rate": 4.999753612057551e-05, + "loss": 1.1429, + "step": 240 + }, + { + "epoch": 0.0797007156798959, + "grad_norm": 3.5558149814605713, + "learning_rate": 4.9997160471330007e-05, + "loss": 1.1603, + "step": 245 + }, + { + "epoch": 0.08132726089785296, + "grad_norm": 2.368305206298828, + "learning_rate": 4.9996758182380546e-05, + "loss": 1.1212, + "step": 250 + }, + { + "epoch": 0.08295380611581002, + "grad_norm": 2.373504161834717, + "learning_rate": 4.999632925415583e-05, + "loss": 1.104, + "step": 255 + }, + { + "epoch": 0.08458035133376708, + "grad_norm": 3.757291793823242, + "learning_rate": 4.999587368711298e-05, + "loss": 1.0807, + "step": 260 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.5040974617004395, + "learning_rate": 4.9995391481737494e-05, + "loss": 1.0852, + "step": 265 + }, + { + "epoch": 0.08783344176968119, + "grad_norm": 2.787508726119995, + "learning_rate": 4.9994882638543264e-05, + "loss": 1.1174, + "step": 270 + }, + { + "epoch": 0.08945998698763825, + "grad_norm": 4.031174182891846, + "learning_rate": 4.9994347158072576e-05, + "loss": 1.1362, + "step": 275 + }, + { + "epoch": 0.09108653220559532, + "grad_norm": 2.270128011703491, + "learning_rate": 4.9993785040896094e-05, + "loss": 1.1217, + "step": 280 + }, + { + "epoch": 0.09271307742355238, + "grad_norm": 3.772061824798584, + "learning_rate": 4.999319628761285e-05, + "loss": 1.0978, + "step": 285 + }, + { + "epoch": 0.09433962264150944, + "grad_norm": 3.2462875843048096, + "learning_rate": 4.9992580898850315e-05, + "loss": 1.0788, + "step": 290 + }, + { + "epoch": 0.09596616785946649, + "grad_norm": 2.688223361968994, + "learning_rate": 4.999193887526431e-05, + "loss": 1.077, + "step": 295 + }, + { + "epoch": 0.09759271307742355, + "grad_norm": 4.434839248657227, + "learning_rate": 4.999127021753902e-05, + "loss": 1.0917, + "step": 300 + }, + { + "epoch": 0.09921925829538061, + "grad_norm": 2.364314079284668, + "learning_rate": 4.9990574926387064e-05, + "loss": 1.0815, + "step": 305 + }, + { + "epoch": 0.10084580351333768, + "grad_norm": 3.0964620113372803, + "learning_rate": 4.998985300254941e-05, + "loss": 1.0536, + "step": 310 + }, + { + "epoch": 0.10247234873129472, + "grad_norm": 2.663844585418701, + "learning_rate": 4.998910444679542e-05, + "loss": 1.0906, + "step": 315 + }, + { + "epoch": 0.10409889394925179, + "grad_norm": 2.6342573165893555, + "learning_rate": 4.9988329259922844e-05, + "loss": 1.0899, + "step": 320 + }, + { + "epoch": 0.10572543916720885, + "grad_norm": 1.6737209558486938, + "learning_rate": 4.99875274427578e-05, + "loss": 1.047, + "step": 325 + }, + { + "epoch": 0.10735198438516591, + "grad_norm": 2.3672590255737305, + "learning_rate": 4.998669899615479e-05, + "loss": 1.1264, + "step": 330 + }, + { + "epoch": 0.10897852960312297, + "grad_norm": 2.4219727516174316, + "learning_rate": 4.9985843920996694e-05, + "loss": 1.0904, + "step": 335 + }, + { + "epoch": 0.11060507482108002, + "grad_norm": 2.6155354976654053, + "learning_rate": 4.9984962218194785e-05, + "loss": 1.0877, + "step": 340 + }, + { + "epoch": 0.11223162003903708, + "grad_norm": 1.8474421501159668, + "learning_rate": 4.998405388868869e-05, + "loss": 1.1057, + "step": 345 + }, + { + "epoch": 0.11385816525699415, + "grad_norm": 2.564061403274536, + "learning_rate": 4.998311893344644e-05, + "loss": 1.0626, + "step": 350 + }, + { + "epoch": 0.11548471047495121, + "grad_norm": 2.4053661823272705, + "learning_rate": 4.9982157353464396e-05, + "loss": 1.0503, + "step": 355 + }, + { + "epoch": 0.11711125569290826, + "grad_norm": 1.725553274154663, + "learning_rate": 4.998116914976736e-05, + "loss": 1.066, + "step": 360 + }, + { + "epoch": 0.11873780091086532, + "grad_norm": 2.982055187225342, + "learning_rate": 4.998015432340845e-05, + "loss": 1.0551, + "step": 365 + }, + { + "epoch": 0.12036434612882238, + "grad_norm": 2.4996230602264404, + "learning_rate": 4.997911287546916e-05, + "loss": 1.0596, + "step": 370 + }, + { + "epoch": 0.12199089134677944, + "grad_norm": 2.708897829055786, + "learning_rate": 4.99780448070594e-05, + "loss": 1.0519, + "step": 375 + }, + { + "epoch": 0.1236174365647365, + "grad_norm": 3.140862226486206, + "learning_rate": 4.9976950119317414e-05, + "loss": 1.0729, + "step": 380 + }, + { + "epoch": 0.12524398178269355, + "grad_norm": 2.259716749191284, + "learning_rate": 4.99758288134098e-05, + "loss": 1.0338, + "step": 385 + }, + { + "epoch": 0.12687052700065063, + "grad_norm": 2.4723799228668213, + "learning_rate": 4.9974680890531565e-05, + "loss": 1.0667, + "step": 390 + }, + { + "epoch": 0.12849707221860768, + "grad_norm": 2.4552574157714844, + "learning_rate": 4.997350635190606e-05, + "loss": 1.0217, + "step": 395 + }, + { + "epoch": 0.13012361743656473, + "grad_norm": 2.4334287643432617, + "learning_rate": 4.997230519878499e-05, + "loss": 1.0493, + "step": 400 + }, + { + "epoch": 0.1317501626545218, + "grad_norm": 1.9064465761184692, + "learning_rate": 4.997107743244844e-05, + "loss": 1.0564, + "step": 405 + }, + { + "epoch": 0.13337670787247885, + "grad_norm": 3.3704562187194824, + "learning_rate": 4.9969823054204864e-05, + "loss": 1.0531, + "step": 410 + }, + { + "epoch": 0.13500325309043593, + "grad_norm": 2.59091854095459, + "learning_rate": 4.996854206539104e-05, + "loss": 1.0458, + "step": 415 + }, + { + "epoch": 0.13662979830839297, + "grad_norm": 2.579805612564087, + "learning_rate": 4.996723446737216e-05, + "loss": 1.0452, + "step": 420 + }, + { + "epoch": 0.13825634352635002, + "grad_norm": 2.606947183609009, + "learning_rate": 4.9965900261541706e-05, + "loss": 1.0419, + "step": 425 + }, + { + "epoch": 0.1398828887443071, + "grad_norm": 1.7824945449829102, + "learning_rate": 4.996453944932159e-05, + "loss": 1.0822, + "step": 430 + }, + { + "epoch": 0.14150943396226415, + "grad_norm": 1.9553674459457397, + "learning_rate": 4.996315203216202e-05, + "loss": 1.0487, + "step": 435 + }, + { + "epoch": 0.14313597918022122, + "grad_norm": 2.531892776489258, + "learning_rate": 4.996173801154159e-05, + "loss": 1.0291, + "step": 440 + }, + { + "epoch": 0.14476252439817827, + "grad_norm": 1.7413674592971802, + "learning_rate": 4.996029738896723e-05, + "loss": 1.0462, + "step": 445 + }, + { + "epoch": 0.14638906961613532, + "grad_norm": 2.1651906967163086, + "learning_rate": 4.995883016597421e-05, + "loss": 1.027, + "step": 450 + }, + { + "epoch": 0.1480156148340924, + "grad_norm": 1.927914023399353, + "learning_rate": 4.995733634412619e-05, + "loss": 1.0418, + "step": 455 + }, + { + "epoch": 0.14964216005204944, + "grad_norm": 1.8429598808288574, + "learning_rate": 4.9955815925015145e-05, + "loss": 1.0417, + "step": 460 + }, + { + "epoch": 0.1512687052700065, + "grad_norm": 1.548966646194458, + "learning_rate": 4.9954268910261385e-05, + "loss": 1.0143, + "step": 465 + }, + { + "epoch": 0.15289525048796357, + "grad_norm": 1.8078657388687134, + "learning_rate": 4.995269530151358e-05, + "loss": 1.0635, + "step": 470 + }, + { + "epoch": 0.15452179570592062, + "grad_norm": 2.4410746097564697, + "learning_rate": 4.9951095100448746e-05, + "loss": 1.0662, + "step": 475 + }, + { + "epoch": 0.1561483409238777, + "grad_norm": 1.6874713897705078, + "learning_rate": 4.994946830877223e-05, + "loss": 1.0465, + "step": 480 + }, + { + "epoch": 0.15777488614183474, + "grad_norm": 3.639892816543579, + "learning_rate": 4.994781492821773e-05, + "loss": 1.0277, + "step": 485 + }, + { + "epoch": 0.1594014313597918, + "grad_norm": 1.8491342067718506, + "learning_rate": 4.994613496054724e-05, + "loss": 1.0302, + "step": 490 + }, + { + "epoch": 0.16102797657774887, + "grad_norm": 4.213651657104492, + "learning_rate": 4.994442840755115e-05, + "loss": 1.0291, + "step": 495 + }, + { + "epoch": 0.16265452179570591, + "grad_norm": 2.1080269813537598, + "learning_rate": 4.994269527104813e-05, + "loss": 1.0189, + "step": 500 + }, + { + "epoch": 0.164281067013663, + "grad_norm": 1.6446752548217773, + "learning_rate": 4.99409355528852e-05, + "loss": 0.9992, + "step": 505 + }, + { + "epoch": 0.16590761223162004, + "grad_norm": 1.7973002195358276, + "learning_rate": 4.993914925493772e-05, + "loss": 1.0542, + "step": 510 + }, + { + "epoch": 0.1675341574495771, + "grad_norm": 1.507645606994629, + "learning_rate": 4.993733637910935e-05, + "loss": 1.0292, + "step": 515 + }, + { + "epoch": 0.16916070266753416, + "grad_norm": 2.0109477043151855, + "learning_rate": 4.993549692733209e-05, + "loss": 1.0325, + "step": 520 + }, + { + "epoch": 0.1707872478854912, + "grad_norm": 1.6091409921646118, + "learning_rate": 4.993363090156628e-05, + "loss": 1.0003, + "step": 525 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.9904284477233887, + "learning_rate": 4.9931738303800536e-05, + "loss": 1.0259, + "step": 530 + }, + { + "epoch": 0.17404033832140534, + "grad_norm": 2.8802828788757324, + "learning_rate": 4.9929819136051824e-05, + "loss": 1.0144, + "step": 535 + }, + { + "epoch": 0.17566688353936238, + "grad_norm": 2.4210402965545654, + "learning_rate": 4.992787340036542e-05, + "loss": 0.9996, + "step": 540 + }, + { + "epoch": 0.17729342875731946, + "grad_norm": 1.6843578815460205, + "learning_rate": 4.992590109881492e-05, + "loss": 1.0209, + "step": 545 + }, + { + "epoch": 0.1789199739752765, + "grad_norm": 2.3898844718933105, + "learning_rate": 4.992390223350222e-05, + "loss": 0.9938, + "step": 550 + }, + { + "epoch": 0.18054651919323358, + "grad_norm": 1.7469401359558105, + "learning_rate": 4.992187680655752e-05, + "loss": 1.0219, + "step": 555 + }, + { + "epoch": 0.18217306441119063, + "grad_norm": 2.327509641647339, + "learning_rate": 4.991982482013935e-05, + "loss": 1.0169, + "step": 560 + }, + { + "epoch": 0.18379960962914768, + "grad_norm": 2.3872838020324707, + "learning_rate": 4.991774627643452e-05, + "loss": 0.9991, + "step": 565 + }, + { + "epoch": 0.18542615484710476, + "grad_norm": 3.121908664703369, + "learning_rate": 4.991564117765816e-05, + "loss": 0.972, + "step": 570 + }, + { + "epoch": 0.1870527000650618, + "grad_norm": 2.5355122089385986, + "learning_rate": 4.991350952605368e-05, + "loss": 0.9952, + "step": 575 + }, + { + "epoch": 0.18867924528301888, + "grad_norm": 1.873481273651123, + "learning_rate": 4.991135132389282e-05, + "loss": 0.9818, + "step": 580 + }, + { + "epoch": 0.19030579050097593, + "grad_norm": 2.0629231929779053, + "learning_rate": 4.9909166573475587e-05, + "loss": 1.0162, + "step": 585 + }, + { + "epoch": 0.19193233571893298, + "grad_norm": 1.8241119384765625, + "learning_rate": 4.990695527713029e-05, + "loss": 0.9645, + "step": 590 + }, + { + "epoch": 0.19355888093689005, + "grad_norm": 2.270092487335205, + "learning_rate": 4.990471743721352e-05, + "loss": 0.9838, + "step": 595 + }, + { + "epoch": 0.1951854261548471, + "grad_norm": 1.664500117301941, + "learning_rate": 4.9902453056110176e-05, + "loss": 1.0356, + "step": 600 + }, + { + "epoch": 0.19681197137280415, + "grad_norm": 1.805112600326538, + "learning_rate": 4.990016213623343e-05, + "loss": 0.9959, + "step": 605 + }, + { + "epoch": 0.19843851659076123, + "grad_norm": 2.4506659507751465, + "learning_rate": 4.989784468002472e-05, + "loss": 1.0026, + "step": 610 + }, + { + "epoch": 0.20006506180871828, + "grad_norm": 1.6195240020751953, + "learning_rate": 4.98955006899538e-05, + "loss": 1.0332, + "step": 615 + }, + { + "epoch": 0.20169160702667535, + "grad_norm": 2.2610089778900146, + "learning_rate": 4.989313016851866e-05, + "loss": 0.9568, + "step": 620 + }, + { + "epoch": 0.2033181522446324, + "grad_norm": 2.934418201446533, + "learning_rate": 4.989073311824561e-05, + "loss": 1.0278, + "step": 625 + }, + { + "epoch": 0.20494469746258945, + "grad_norm": 2.727705955505371, + "learning_rate": 4.9888309541689204e-05, + "loss": 0.9941, + "step": 630 + }, + { + "epoch": 0.20657124268054652, + "grad_norm": 1.622628927230835, + "learning_rate": 4.988585944143226e-05, + "loss": 1.0566, + "step": 635 + }, + { + "epoch": 0.20819778789850357, + "grad_norm": 1.8430848121643066, + "learning_rate": 4.9883382820085876e-05, + "loss": 0.9891, + "step": 640 + }, + { + "epoch": 0.20982433311646065, + "grad_norm": 2.350111961364746, + "learning_rate": 4.988087968028941e-05, + "loss": 0.9954, + "step": 645 + }, + { + "epoch": 0.2114508783344177, + "grad_norm": 1.4554814100265503, + "learning_rate": 4.987835002471048e-05, + "loss": 0.984, + "step": 650 + }, + { + "epoch": 0.21307742355237475, + "grad_norm": 3.922762870788574, + "learning_rate": 4.987579385604497e-05, + "loss": 1.0, + "step": 655 + }, + { + "epoch": 0.21470396877033182, + "grad_norm": 1.7150508165359497, + "learning_rate": 4.9873211177017e-05, + "loss": 1.0098, + "step": 660 + }, + { + "epoch": 0.21633051398828887, + "grad_norm": 1.7583746910095215, + "learning_rate": 4.9870601990378975e-05, + "loss": 1.0127, + "step": 665 + }, + { + "epoch": 0.21795705920624595, + "grad_norm": 1.8868824243545532, + "learning_rate": 4.986796629891151e-05, + "loss": 1.0001, + "step": 670 + }, + { + "epoch": 0.219583604424203, + "grad_norm": 1.6140953302383423, + "learning_rate": 4.9865304105423485e-05, + "loss": 0.988, + "step": 675 + }, + { + "epoch": 0.22121014964216004, + "grad_norm": 1.6653664112091064, + "learning_rate": 4.986261541275204e-05, + "loss": 0.976, + "step": 680 + }, + { + "epoch": 0.22283669486011712, + "grad_norm": 1.789740800857544, + "learning_rate": 4.9859900223762523e-05, + "loss": 0.9568, + "step": 685 + }, + { + "epoch": 0.22446324007807417, + "grad_norm": 1.7127046585083008, + "learning_rate": 4.9857158541348554e-05, + "loss": 0.98, + "step": 690 + }, + { + "epoch": 0.22608978529603124, + "grad_norm": 2.332801103591919, + "learning_rate": 4.985439036843195e-05, + "loss": 1.0021, + "step": 695 + }, + { + "epoch": 0.2277163305139883, + "grad_norm": 2.1749675273895264, + "learning_rate": 4.98515957079628e-05, + "loss": 1.0032, + "step": 700 + }, + { + "epoch": 0.22934287573194534, + "grad_norm": 1.8240071535110474, + "learning_rate": 4.984877456291938e-05, + "loss": 0.9886, + "step": 705 + }, + { + "epoch": 0.23096942094990242, + "grad_norm": 1.8020601272583008, + "learning_rate": 4.984592693630821e-05, + "loss": 1.0177, + "step": 710 + }, + { + "epoch": 0.23259596616785946, + "grad_norm": 1.9537023305892944, + "learning_rate": 4.984305283116404e-05, + "loss": 1.0044, + "step": 715 + }, + { + "epoch": 0.2342225113858165, + "grad_norm": 2.1307947635650635, + "learning_rate": 4.984015225054983e-05, + "loss": 0.9647, + "step": 720 + }, + { + "epoch": 0.2358490566037736, + "grad_norm": 1.86850106716156, + "learning_rate": 4.983722519755676e-05, + "loss": 0.9841, + "step": 725 + }, + { + "epoch": 0.23747560182173064, + "grad_norm": 2.6345553398132324, + "learning_rate": 4.983427167530421e-05, + "loss": 0.9784, + "step": 730 + }, + { + "epoch": 0.2391021470396877, + "grad_norm": 1.8052325248718262, + "learning_rate": 4.983129168693978e-05, + "loss": 1.0085, + "step": 735 + }, + { + "epoch": 0.24072869225764476, + "grad_norm": 1.7532668113708496, + "learning_rate": 4.982828523563926e-05, + "loss": 0.9689, + "step": 740 + }, + { + "epoch": 0.2423552374756018, + "grad_norm": 2.89123797416687, + "learning_rate": 4.9825252324606675e-05, + "loss": 0.9971, + "step": 745 + }, + { + "epoch": 0.24398178269355889, + "grad_norm": 2.188971757888794, + "learning_rate": 4.98221929570742e-05, + "loss": 1.0305, + "step": 750 + }, + { + "epoch": 0.24560832791151593, + "grad_norm": 1.4119281768798828, + "learning_rate": 4.981910713630225e-05, + "loss": 0.959, + "step": 755 + }, + { + "epoch": 0.247234873129473, + "grad_norm": 1.7555522918701172, + "learning_rate": 4.981599486557941e-05, + "loss": 0.9882, + "step": 760 + }, + { + "epoch": 0.24886141834743006, + "grad_norm": 2.206737518310547, + "learning_rate": 4.981285614822244e-05, + "loss": 1.0002, + "step": 765 + }, + { + "epoch": 0.2504879635653871, + "grad_norm": 1.9809949398040771, + "learning_rate": 4.9809690987576324e-05, + "loss": 0.9589, + "step": 770 + }, + { + "epoch": 0.2521145087833442, + "grad_norm": 1.4272880554199219, + "learning_rate": 4.980649938701418e-05, + "loss": 0.9913, + "step": 775 + }, + { + "epoch": 0.25374105400130126, + "grad_norm": 1.9526935815811157, + "learning_rate": 4.980328134993735e-05, + "loss": 0.978, + "step": 780 + }, + { + "epoch": 0.2553675992192583, + "grad_norm": 1.9960428476333618, + "learning_rate": 4.98000368797753e-05, + "loss": 0.9688, + "step": 785 + }, + { + "epoch": 0.25699414443721535, + "grad_norm": 2.43294358253479, + "learning_rate": 4.9796765979985714e-05, + "loss": 0.9756, + "step": 790 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 1.5361756086349487, + "learning_rate": 4.979346865405441e-05, + "loss": 0.9683, + "step": 795 + }, + { + "epoch": 0.26024723487312945, + "grad_norm": 1.903690218925476, + "learning_rate": 4.9790144905495375e-05, + "loss": 0.9818, + "step": 800 + }, + { + "epoch": 0.2618737800910865, + "grad_norm": 2.3576812744140625, + "learning_rate": 4.978679473785076e-05, + "loss": 0.9699, + "step": 805 + }, + { + "epoch": 0.2635003253090436, + "grad_norm": 1.679548978805542, + "learning_rate": 4.9783418154690874e-05, + "loss": 0.9745, + "step": 810 + }, + { + "epoch": 0.2651268705270006, + "grad_norm": 1.5994036197662354, + "learning_rate": 4.978001515961418e-05, + "loss": 0.9959, + "step": 815 + }, + { + "epoch": 0.2667534157449577, + "grad_norm": 1.596286416053772, + "learning_rate": 4.9776585756247264e-05, + "loss": 0.9505, + "step": 820 + }, + { + "epoch": 0.2683799609629148, + "grad_norm": 1.487785816192627, + "learning_rate": 4.977312994824488e-05, + "loss": 0.9764, + "step": 825 + }, + { + "epoch": 0.27000650618087185, + "grad_norm": 1.7366130352020264, + "learning_rate": 4.9769647739289916e-05, + "loss": 0.9893, + "step": 830 + }, + { + "epoch": 0.2716330513988289, + "grad_norm": 2.0338809490203857, + "learning_rate": 4.976613913309339e-05, + "loss": 0.9938, + "step": 835 + }, + { + "epoch": 0.27325959661678595, + "grad_norm": 2.8601138591766357, + "learning_rate": 4.976260413339446e-05, + "loss": 0.9762, + "step": 840 + }, + { + "epoch": 0.274886141834743, + "grad_norm": 2.227843761444092, + "learning_rate": 4.9759042743960405e-05, + "loss": 0.9857, + "step": 845 + }, + { + "epoch": 0.27651268705270005, + "grad_norm": 2.175116539001465, + "learning_rate": 4.975545496858664e-05, + "loss": 1.0037, + "step": 850 + }, + { + "epoch": 0.2781392322706571, + "grad_norm": 1.625112771987915, + "learning_rate": 4.975184081109667e-05, + "loss": 0.9795, + "step": 855 + }, + { + "epoch": 0.2797657774886142, + "grad_norm": 1.7663930654525757, + "learning_rate": 4.974820027534216e-05, + "loss": 0.9686, + "step": 860 + }, + { + "epoch": 0.2813923227065712, + "grad_norm": 1.8421977758407593, + "learning_rate": 4.974453336520285e-05, + "loss": 0.9791, + "step": 865 + }, + { + "epoch": 0.2830188679245283, + "grad_norm": 2.1246590614318848, + "learning_rate": 4.97408400845866e-05, + "loss": 0.9588, + "step": 870 + }, + { + "epoch": 0.28464541314248537, + "grad_norm": 2.4559266567230225, + "learning_rate": 4.9737120437429366e-05, + "loss": 0.9038, + "step": 875 + }, + { + "epoch": 0.28627195836044245, + "grad_norm": 1.9700146913528442, + "learning_rate": 4.973337442769523e-05, + "loss": 1.0133, + "step": 880 + }, + { + "epoch": 0.28789850357839947, + "grad_norm": 1.494170069694519, + "learning_rate": 4.972960205937633e-05, + "loss": 0.9525, + "step": 885 + }, + { + "epoch": 0.28952504879635654, + "grad_norm": 1.7003804445266724, + "learning_rate": 4.972580333649292e-05, + "loss": 0.9523, + "step": 890 + }, + { + "epoch": 0.2911515940143136, + "grad_norm": 1.5958185195922852, + "learning_rate": 4.9721978263093335e-05, + "loss": 0.9596, + "step": 895 + }, + { + "epoch": 0.29277813923227064, + "grad_norm": 1.9930297136306763, + "learning_rate": 4.971812684325399e-05, + "loss": 1.0152, + "step": 900 + }, + { + "epoch": 0.2944046844502277, + "grad_norm": 1.5336381196975708, + "learning_rate": 4.971424908107938e-05, + "loss": 0.9533, + "step": 905 + }, + { + "epoch": 0.2960312296681848, + "grad_norm": 2.03478741645813, + "learning_rate": 4.9710344980702076e-05, + "loss": 0.9777, + "step": 910 + }, + { + "epoch": 0.2976577748861418, + "grad_norm": 2.151768207550049, + "learning_rate": 4.9706414546282706e-05, + "loss": 0.9889, + "step": 915 + }, + { + "epoch": 0.2992843201040989, + "grad_norm": 1.4807366132736206, + "learning_rate": 4.970245778200997e-05, + "loss": 0.9869, + "step": 920 + }, + { + "epoch": 0.30091086532205596, + "grad_norm": 1.772693395614624, + "learning_rate": 4.9698474692100634e-05, + "loss": 0.9569, + "step": 925 + }, + { + "epoch": 0.302537410540013, + "grad_norm": 1.63463294506073, + "learning_rate": 4.969446528079951e-05, + "loss": 0.9671, + "step": 930 + }, + { + "epoch": 0.30416395575797006, + "grad_norm": 1.873739242553711, + "learning_rate": 4.969042955237946e-05, + "loss": 0.9646, + "step": 935 + }, + { + "epoch": 0.30579050097592714, + "grad_norm": 1.5414514541625977, + "learning_rate": 4.9686367511141404e-05, + "loss": 0.9329, + "step": 940 + }, + { + "epoch": 0.3074170461938842, + "grad_norm": 1.5263457298278809, + "learning_rate": 4.9682279161414305e-05, + "loss": 0.9713, + "step": 945 + }, + { + "epoch": 0.30904359141184123, + "grad_norm": 2.059281349182129, + "learning_rate": 4.967816450755513e-05, + "loss": 0.9239, + "step": 950 + }, + { + "epoch": 0.3106701366297983, + "grad_norm": 1.6876810789108276, + "learning_rate": 4.9674023553948924e-05, + "loss": 0.9574, + "step": 955 + }, + { + "epoch": 0.3122966818477554, + "grad_norm": 1.7108839750289917, + "learning_rate": 4.9669856305008734e-05, + "loss": 0.9493, + "step": 960 + }, + { + "epoch": 0.3139232270657124, + "grad_norm": 1.3386033773422241, + "learning_rate": 4.966566276517563e-05, + "loss": 0.9415, + "step": 965 + }, + { + "epoch": 0.3155497722836695, + "grad_norm": 1.6674686670303345, + "learning_rate": 4.966144293891872e-05, + "loss": 0.9597, + "step": 970 + }, + { + "epoch": 0.31717631750162656, + "grad_norm": 2.0637047290802, + "learning_rate": 4.96571968307351e-05, + "loss": 0.9869, + "step": 975 + }, + { + "epoch": 0.3188028627195836, + "grad_norm": 1.587874412536621, + "learning_rate": 4.965292444514989e-05, + "loss": 0.9669, + "step": 980 + }, + { + "epoch": 0.32042940793754066, + "grad_norm": 1.5072038173675537, + "learning_rate": 4.9648625786716205e-05, + "loss": 0.9609, + "step": 985 + }, + { + "epoch": 0.32205595315549773, + "grad_norm": 1.4424433708190918, + "learning_rate": 4.964430086001517e-05, + "loss": 0.9265, + "step": 990 + }, + { + "epoch": 0.3236824983734548, + "grad_norm": 1.8296180963516235, + "learning_rate": 4.9639949669655906e-05, + "loss": 0.9557, + "step": 995 + }, + { + "epoch": 0.32530904359141183, + "grad_norm": 1.2986112833023071, + "learning_rate": 4.9635572220275505e-05, + "loss": 0.9351, + "step": 1000 + }, + { + "epoch": 0.3269355888093689, + "grad_norm": 1.4849457740783691, + "learning_rate": 4.963116851653906e-05, + "loss": 0.9159, + "step": 1005 + }, + { + "epoch": 0.328562134027326, + "grad_norm": 1.598100185394287, + "learning_rate": 4.9626738563139644e-05, + "loss": 0.9639, + "step": 1010 + }, + { + "epoch": 0.330188679245283, + "grad_norm": 2.3685154914855957, + "learning_rate": 4.962228236479828e-05, + "loss": 0.9488, + "step": 1015 + }, + { + "epoch": 0.3318152244632401, + "grad_norm": 3.058476686477661, + "learning_rate": 4.9617799926264e-05, + "loss": 0.9779, + "step": 1020 + }, + { + "epoch": 0.33344176968119715, + "grad_norm": 1.4213464260101318, + "learning_rate": 4.961329125231378e-05, + "loss": 0.9624, + "step": 1025 + }, + { + "epoch": 0.3350683148991542, + "grad_norm": 1.5156611204147339, + "learning_rate": 4.960875634775254e-05, + "loss": 0.9736, + "step": 1030 + }, + { + "epoch": 0.33669486011711125, + "grad_norm": 1.6368368864059448, + "learning_rate": 4.960419521741317e-05, + "loss": 0.9403, + "step": 1035 + }, + { + "epoch": 0.3383214053350683, + "grad_norm": 2.446671962738037, + "learning_rate": 4.959960786615653e-05, + "loss": 0.9694, + "step": 1040 + }, + { + "epoch": 0.33994795055302535, + "grad_norm": 1.5360190868377686, + "learning_rate": 4.959499429887139e-05, + "loss": 0.9903, + "step": 1045 + }, + { + "epoch": 0.3415744957709824, + "grad_norm": 1.5697903633117676, + "learning_rate": 4.959035452047446e-05, + "loss": 0.9546, + "step": 1050 + }, + { + "epoch": 0.3432010409889395, + "grad_norm": 1.5649073123931885, + "learning_rate": 4.958568853591041e-05, + "loss": 0.9627, + "step": 1055 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 1.4789602756500244, + "learning_rate": 4.9580996350151814e-05, + "loss": 0.9507, + "step": 1060 + }, + { + "epoch": 0.3464541314248536, + "grad_norm": 1.6226849555969238, + "learning_rate": 4.957627796819918e-05, + "loss": 0.9526, + "step": 1065 + }, + { + "epoch": 0.34808067664281067, + "grad_norm": 1.798439621925354, + "learning_rate": 4.9571533395080935e-05, + "loss": 0.9786, + "step": 1070 + }, + { + "epoch": 0.34970722186076775, + "grad_norm": 1.8151806592941284, + "learning_rate": 4.9566762635853414e-05, + "loss": 0.9682, + "step": 1075 + }, + { + "epoch": 0.35133376707872477, + "grad_norm": 1.8990837335586548, + "learning_rate": 4.956196569560085e-05, + "loss": 0.931, + "step": 1080 + }, + { + "epoch": 0.35296031229668184, + "grad_norm": 1.5986672639846802, + "learning_rate": 4.95571425794354e-05, + "loss": 0.9767, + "step": 1085 + }, + { + "epoch": 0.3545868575146389, + "grad_norm": 1.6713230609893799, + "learning_rate": 4.955229329249709e-05, + "loss": 0.9576, + "step": 1090 + }, + { + "epoch": 0.35621340273259594, + "grad_norm": 1.281786561012268, + "learning_rate": 4.954741783995386e-05, + "loss": 0.9553, + "step": 1095 + }, + { + "epoch": 0.357839947950553, + "grad_norm": 1.453911304473877, + "learning_rate": 4.954251622700151e-05, + "loss": 0.9321, + "step": 1100 + }, + { + "epoch": 0.3594664931685101, + "grad_norm": 1.748932123184204, + "learning_rate": 4.953758845886375e-05, + "loss": 0.9423, + "step": 1105 + }, + { + "epoch": 0.36109303838646717, + "grad_norm": 1.7616121768951416, + "learning_rate": 4.953263454079214e-05, + "loss": 1.0057, + "step": 1110 + }, + { + "epoch": 0.3627195836044242, + "grad_norm": 1.510319709777832, + "learning_rate": 4.952765447806612e-05, + "loss": 0.981, + "step": 1115 + }, + { + "epoch": 0.36434612882238127, + "grad_norm": 1.7181427478790283, + "learning_rate": 4.952264827599299e-05, + "loss": 0.9534, + "step": 1120 + }, + { + "epoch": 0.36597267404033834, + "grad_norm": 1.621125340461731, + "learning_rate": 4.9517615939907904e-05, + "loss": 0.9439, + "step": 1125 + }, + { + "epoch": 0.36759921925829536, + "grad_norm": 2.2383875846862793, + "learning_rate": 4.951255747517386e-05, + "loss": 0.945, + "step": 1130 + }, + { + "epoch": 0.36922576447625244, + "grad_norm": 1.282774567604065, + "learning_rate": 4.950747288718172e-05, + "loss": 0.9347, + "step": 1135 + }, + { + "epoch": 0.3708523096942095, + "grad_norm": 1.8514864444732666, + "learning_rate": 4.950236218135018e-05, + "loss": 0.9721, + "step": 1140 + }, + { + "epoch": 0.37247885491216653, + "grad_norm": 2.039710283279419, + "learning_rate": 4.949722536312575e-05, + "loss": 0.9077, + "step": 1145 + }, + { + "epoch": 0.3741054001301236, + "grad_norm": 2.0170350074768066, + "learning_rate": 4.94920624379828e-05, + "loss": 0.9389, + "step": 1150 + }, + { + "epoch": 0.3757319453480807, + "grad_norm": 1.74747633934021, + "learning_rate": 4.9486873411423494e-05, + "loss": 0.9458, + "step": 1155 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 2.139087677001953, + "learning_rate": 4.948165828897784e-05, + "loss": 0.9705, + "step": 1160 + }, + { + "epoch": 0.3789850357839948, + "grad_norm": 1.7666411399841309, + "learning_rate": 4.9476417076203644e-05, + "loss": 0.9785, + "step": 1165 + }, + { + "epoch": 0.38061158100195186, + "grad_norm": 1.8385261297225952, + "learning_rate": 4.9471149778686484e-05, + "loss": 0.9533, + "step": 1170 + }, + { + "epoch": 0.38223812621990894, + "grad_norm": 2.0710532665252686, + "learning_rate": 4.946585640203981e-05, + "loss": 0.9396, + "step": 1175 + }, + { + "epoch": 0.38386467143786596, + "grad_norm": 1.8147737979888916, + "learning_rate": 4.946053695190479e-05, + "loss": 0.9389, + "step": 1180 + }, + { + "epoch": 0.38549121665582303, + "grad_norm": 1.547853708267212, + "learning_rate": 4.945519143395042e-05, + "loss": 0.9411, + "step": 1185 + }, + { + "epoch": 0.3871177618737801, + "grad_norm": 1.445509910583496, + "learning_rate": 4.944981985387347e-05, + "loss": 0.9252, + "step": 1190 + }, + { + "epoch": 0.38874430709173713, + "grad_norm": 1.9876288175582886, + "learning_rate": 4.944442221739849e-05, + "loss": 0.9293, + "step": 1195 + }, + { + "epoch": 0.3903708523096942, + "grad_norm": 1.3176074028015137, + "learning_rate": 4.943899853027778e-05, + "loss": 0.9146, + "step": 1200 + }, + { + "epoch": 0.3919973975276513, + "grad_norm": 2.7344281673431396, + "learning_rate": 4.943354879829141e-05, + "loss": 0.9827, + "step": 1205 + }, + { + "epoch": 0.3936239427456083, + "grad_norm": 2.022479295730591, + "learning_rate": 4.942807302724722e-05, + "loss": 0.9495, + "step": 1210 + }, + { + "epoch": 0.3952504879635654, + "grad_norm": 1.3476173877716064, + "learning_rate": 4.9422571222980784e-05, + "loss": 0.9833, + "step": 1215 + }, + { + "epoch": 0.39687703318152245, + "grad_norm": 1.4940625429153442, + "learning_rate": 4.9417043391355435e-05, + "loss": 0.9364, + "step": 1220 + }, + { + "epoch": 0.39850357839947953, + "grad_norm": 1.5108622312545776, + "learning_rate": 4.9411489538262214e-05, + "loss": 0.9561, + "step": 1225 + }, + { + "epoch": 0.40013012361743655, + "grad_norm": 1.8426663875579834, + "learning_rate": 4.940590966961993e-05, + "loss": 0.9433, + "step": 1230 + }, + { + "epoch": 0.4017566688353936, + "grad_norm": 1.4690388441085815, + "learning_rate": 4.9400303791375094e-05, + "loss": 0.9168, + "step": 1235 + }, + { + "epoch": 0.4033832140533507, + "grad_norm": 1.5599989891052246, + "learning_rate": 4.939467190950195e-05, + "loss": 0.9667, + "step": 1240 + }, + { + "epoch": 0.4050097592713077, + "grad_norm": 2.242982864379883, + "learning_rate": 4.938901403000243e-05, + "loss": 0.9348, + "step": 1245 + }, + { + "epoch": 0.4066363044892648, + "grad_norm": 1.5606322288513184, + "learning_rate": 4.938333015890621e-05, + "loss": 0.9529, + "step": 1250 + }, + { + "epoch": 0.4082628497072219, + "grad_norm": 1.5630214214324951, + "learning_rate": 4.937762030227062e-05, + "loss": 0.9196, + "step": 1255 + }, + { + "epoch": 0.4098893949251789, + "grad_norm": 1.516115427017212, + "learning_rate": 4.937188446618073e-05, + "loss": 0.9176, + "step": 1260 + }, + { + "epoch": 0.41151594014313597, + "grad_norm": 1.807707667350769, + "learning_rate": 4.9366122656749254e-05, + "loss": 0.9113, + "step": 1265 + }, + { + "epoch": 0.41314248536109305, + "grad_norm": 1.2315680980682373, + "learning_rate": 4.936033488011661e-05, + "loss": 0.9425, + "step": 1270 + }, + { + "epoch": 0.4147690305790501, + "grad_norm": 1.6471747159957886, + "learning_rate": 4.93545211424509e-05, + "loss": 0.9542, + "step": 1275 + }, + { + "epoch": 0.41639557579700714, + "grad_norm": 1.5492879152297974, + "learning_rate": 4.934868144994785e-05, + "loss": 0.9226, + "step": 1280 + }, + { + "epoch": 0.4180221210149642, + "grad_norm": 1.6206769943237305, + "learning_rate": 4.9342815808830906e-05, + "loss": 0.9298, + "step": 1285 + }, + { + "epoch": 0.4196486662329213, + "grad_norm": 1.3421366214752197, + "learning_rate": 4.9336924225351114e-05, + "loss": 0.9756, + "step": 1290 + }, + { + "epoch": 0.4212752114508783, + "grad_norm": 1.5250636339187622, + "learning_rate": 4.93310067057872e-05, + "loss": 0.9392, + "step": 1295 + }, + { + "epoch": 0.4229017566688354, + "grad_norm": 2.4688832759857178, + "learning_rate": 4.9325063256445505e-05, + "loss": 0.9387, + "step": 1300 + }, + { + "epoch": 0.42452830188679247, + "grad_norm": 1.4313111305236816, + "learning_rate": 4.931909388366002e-05, + "loss": 0.8986, + "step": 1305 + }, + { + "epoch": 0.4261548471047495, + "grad_norm": 1.502111792564392, + "learning_rate": 4.931309859379238e-05, + "loss": 0.9529, + "step": 1310 + }, + { + "epoch": 0.42778139232270657, + "grad_norm": 1.6324115991592407, + "learning_rate": 4.93070773932318e-05, + "loss": 0.9303, + "step": 1315 + }, + { + "epoch": 0.42940793754066364, + "grad_norm": 2.1549460887908936, + "learning_rate": 4.930103028839513e-05, + "loss": 0.9451, + "step": 1320 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 1.3161667585372925, + "learning_rate": 4.929495728572685e-05, + "loss": 0.9504, + "step": 1325 + }, + { + "epoch": 0.43266102797657774, + "grad_norm": 1.6194000244140625, + "learning_rate": 4.928885839169898e-05, + "loss": 0.9301, + "step": 1330 + }, + { + "epoch": 0.4342875731945348, + "grad_norm": 1.5842366218566895, + "learning_rate": 4.928273361281119e-05, + "loss": 0.9309, + "step": 1335 + }, + { + "epoch": 0.4359141184124919, + "grad_norm": 1.420822024345398, + "learning_rate": 4.927658295559071e-05, + "loss": 0.9214, + "step": 1340 + }, + { + "epoch": 0.4375406636304489, + "grad_norm": 1.6249916553497314, + "learning_rate": 4.927040642659234e-05, + "loss": 0.9311, + "step": 1345 + }, + { + "epoch": 0.439167208848406, + "grad_norm": 1.6335654258728027, + "learning_rate": 4.926420403239848e-05, + "loss": 0.9309, + "step": 1350 + }, + { + "epoch": 0.44079375406636306, + "grad_norm": 1.8016791343688965, + "learning_rate": 4.925797577961907e-05, + "loss": 0.9665, + "step": 1355 + }, + { + "epoch": 0.4424202992843201, + "grad_norm": 1.702170729637146, + "learning_rate": 4.925172167489161e-05, + "loss": 0.9547, + "step": 1360 + }, + { + "epoch": 0.44404684450227716, + "grad_norm": 1.4510329961776733, + "learning_rate": 4.924544172488118e-05, + "loss": 0.9724, + "step": 1365 + }, + { + "epoch": 0.44567338972023424, + "grad_norm": 1.3579403162002563, + "learning_rate": 4.9239135936280353e-05, + "loss": 0.949, + "step": 1370 + }, + { + "epoch": 0.44729993493819126, + "grad_norm": 1.373290777206421, + "learning_rate": 4.923280431580928e-05, + "loss": 0.9405, + "step": 1375 + }, + { + "epoch": 0.44892648015614833, + "grad_norm": 1.6023136377334595, + "learning_rate": 4.922644687021563e-05, + "loss": 0.9165, + "step": 1380 + }, + { + "epoch": 0.4505530253741054, + "grad_norm": 1.6051198244094849, + "learning_rate": 4.922006360627458e-05, + "loss": 0.9328, + "step": 1385 + }, + { + "epoch": 0.4521795705920625, + "grad_norm": 1.6767120361328125, + "learning_rate": 4.9213654530788846e-05, + "loss": 0.9082, + "step": 1390 + }, + { + "epoch": 0.4538061158100195, + "grad_norm": 1.3570812940597534, + "learning_rate": 4.920721965058863e-05, + "loss": 0.9247, + "step": 1395 + }, + { + "epoch": 0.4554326610279766, + "grad_norm": 1.2051457166671753, + "learning_rate": 4.920075897253165e-05, + "loss": 0.9028, + "step": 1400 + }, + { + "epoch": 0.45705920624593366, + "grad_norm": 1.7422350645065308, + "learning_rate": 4.919427250350309e-05, + "loss": 0.9215, + "step": 1405 + }, + { + "epoch": 0.4586857514638907, + "grad_norm": 2.972627639770508, + "learning_rate": 4.918776025041567e-05, + "loss": 0.9062, + "step": 1410 + }, + { + "epoch": 0.46031229668184775, + "grad_norm": 2.1310195922851562, + "learning_rate": 4.918122222020954e-05, + "loss": 0.9547, + "step": 1415 + }, + { + "epoch": 0.46193884189980483, + "grad_norm": 2.038820266723633, + "learning_rate": 4.917465841985234e-05, + "loss": 0.9609, + "step": 1420 + }, + { + "epoch": 0.46356538711776185, + "grad_norm": 1.5045058727264404, + "learning_rate": 4.9168068856339176e-05, + "loss": 0.9049, + "step": 1425 + }, + { + "epoch": 0.4651919323357189, + "grad_norm": 1.4390023946762085, + "learning_rate": 4.91614535366926e-05, + "loss": 0.9294, + "step": 1430 + }, + { + "epoch": 0.466818477553676, + "grad_norm": 1.7314965724945068, + "learning_rate": 4.915481246796263e-05, + "loss": 0.93, + "step": 1435 + }, + { + "epoch": 0.468445022771633, + "grad_norm": 1.193915843963623, + "learning_rate": 4.914814565722671e-05, + "loss": 0.9099, + "step": 1440 + }, + { + "epoch": 0.4700715679895901, + "grad_norm": 1.7515811920166016, + "learning_rate": 4.914145311158972e-05, + "loss": 0.9728, + "step": 1445 + }, + { + "epoch": 0.4716981132075472, + "grad_norm": 1.3913501501083374, + "learning_rate": 4.913473483818396e-05, + "loss": 0.8982, + "step": 1450 + }, + { + "epoch": 0.47332465842550425, + "grad_norm": 1.778140902519226, + "learning_rate": 4.912799084416917e-05, + "loss": 0.9418, + "step": 1455 + }, + { + "epoch": 0.4749512036434613, + "grad_norm": 1.871268630027771, + "learning_rate": 4.9121221136732474e-05, + "loss": 0.9771, + "step": 1460 + }, + { + "epoch": 0.47657774886141835, + "grad_norm": 1.234295129776001, + "learning_rate": 4.911442572308842e-05, + "loss": 0.9344, + "step": 1465 + }, + { + "epoch": 0.4782042940793754, + "grad_norm": 1.5811593532562256, + "learning_rate": 4.910760461047894e-05, + "loss": 0.9173, + "step": 1470 + }, + { + "epoch": 0.47983083929733245, + "grad_norm": 1.6737935543060303, + "learning_rate": 4.910075780617336e-05, + "loss": 0.8794, + "step": 1475 + }, + { + "epoch": 0.4814573845152895, + "grad_norm": 1.5899697542190552, + "learning_rate": 4.9093885317468376e-05, + "loss": 0.8952, + "step": 1480 + }, + { + "epoch": 0.4830839297332466, + "grad_norm": 1.5737837553024292, + "learning_rate": 4.908698715168806e-05, + "loss": 0.9671, + "step": 1485 + }, + { + "epoch": 0.4847104749512036, + "grad_norm": 2.055270195007324, + "learning_rate": 4.9080063316183864e-05, + "loss": 0.9181, + "step": 1490 + }, + { + "epoch": 0.4863370201691607, + "grad_norm": 1.5866291522979736, + "learning_rate": 4.907311381833457e-05, + "loss": 0.9083, + "step": 1495 + }, + { + "epoch": 0.48796356538711777, + "grad_norm": 1.3368301391601562, + "learning_rate": 4.906613866554634e-05, + "loss": 0.9312, + "step": 1500 + }, + { + "epoch": 0.48959011060507485, + "grad_norm": 1.6596410274505615, + "learning_rate": 4.9059137865252626e-05, + "loss": 0.9366, + "step": 1505 + }, + { + "epoch": 0.49121665582303187, + "grad_norm": 1.4246739149093628, + "learning_rate": 4.9052111424914275e-05, + "loss": 0.9204, + "step": 1510 + }, + { + "epoch": 0.49284320104098894, + "grad_norm": 1.4149776697158813, + "learning_rate": 4.9045059352019416e-05, + "loss": 0.9055, + "step": 1515 + }, + { + "epoch": 0.494469746258946, + "grad_norm": 1.7604811191558838, + "learning_rate": 4.903798165408351e-05, + "loss": 0.898, + "step": 1520 + }, + { + "epoch": 0.49609629147690304, + "grad_norm": 1.5318808555603027, + "learning_rate": 4.9030878338649334e-05, + "loss": 0.9408, + "step": 1525 + }, + { + "epoch": 0.4977228366948601, + "grad_norm": 2.7622036933898926, + "learning_rate": 4.902374941328695e-05, + "loss": 0.9125, + "step": 1530 + }, + { + "epoch": 0.4993493819128172, + "grad_norm": 1.3908295631408691, + "learning_rate": 4.90165948855937e-05, + "loss": 0.9438, + "step": 1535 + }, + { + "epoch": 0.5009759271307742, + "grad_norm": 1.68529212474823, + "learning_rate": 4.900941476319426e-05, + "loss": 0.9392, + "step": 1540 + }, + { + "epoch": 0.5026024723487313, + "grad_norm": 1.5151035785675049, + "learning_rate": 4.900220905374054e-05, + "loss": 0.9141, + "step": 1545 + }, + { + "epoch": 0.5042290175666884, + "grad_norm": 1.3652807474136353, + "learning_rate": 4.899497776491173e-05, + "loss": 0.9252, + "step": 1550 + }, + { + "epoch": 0.5058555627846454, + "grad_norm": 1.6974607706069946, + "learning_rate": 4.8987720904414286e-05, + "loss": 0.9352, + "step": 1555 + }, + { + "epoch": 0.5074821080026025, + "grad_norm": 1.5016193389892578, + "learning_rate": 4.8980438479981906e-05, + "loss": 0.9327, + "step": 1560 + }, + { + "epoch": 0.5091086532205595, + "grad_norm": 1.5985078811645508, + "learning_rate": 4.8973130499375534e-05, + "loss": 0.917, + "step": 1565 + }, + { + "epoch": 0.5107351984385166, + "grad_norm": 1.5521959066390991, + "learning_rate": 4.896579697038336e-05, + "loss": 0.9102, + "step": 1570 + }, + { + "epoch": 0.5123617436564737, + "grad_norm": 1.8769845962524414, + "learning_rate": 4.895843790082078e-05, + "loss": 0.9067, + "step": 1575 + }, + { + "epoch": 0.5139882888744307, + "grad_norm": 1.608547568321228, + "learning_rate": 4.8951053298530444e-05, + "loss": 0.9488, + "step": 1580 + }, + { + "epoch": 0.5156148340923877, + "grad_norm": 1.9403187036514282, + "learning_rate": 4.894364317138218e-05, + "loss": 0.9225, + "step": 1585 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 1.756372094154358, + "learning_rate": 4.893620752727303e-05, + "loss": 0.9165, + "step": 1590 + }, + { + "epoch": 0.5188679245283019, + "grad_norm": 1.3236535787582397, + "learning_rate": 4.892874637412721e-05, + "loss": 0.9027, + "step": 1595 + }, + { + "epoch": 0.5204944697462589, + "grad_norm": 1.629150390625, + "learning_rate": 4.8921259719896165e-05, + "loss": 0.9068, + "step": 1600 + }, + { + "epoch": 0.522121014964216, + "grad_norm": 1.3580920696258545, + "learning_rate": 4.8913747572558474e-05, + "loss": 0.9554, + "step": 1605 + }, + { + "epoch": 0.523747560182173, + "grad_norm": 1.53166663646698, + "learning_rate": 4.890620994011992e-05, + "loss": 0.9468, + "step": 1610 + }, + { + "epoch": 0.5253741054001301, + "grad_norm": 1.379834532737732, + "learning_rate": 4.88986468306134e-05, + "loss": 0.969, + "step": 1615 + }, + { + "epoch": 0.5270006506180872, + "grad_norm": 1.6708744764328003, + "learning_rate": 4.889105825209902e-05, + "loss": 0.9213, + "step": 1620 + }, + { + "epoch": 0.5286271958360442, + "grad_norm": 1.648132562637329, + "learning_rate": 4.888344421266397e-05, + "loss": 0.9234, + "step": 1625 + }, + { + "epoch": 0.5302537410540012, + "grad_norm": 1.803969383239746, + "learning_rate": 4.887580472042264e-05, + "loss": 0.9353, + "step": 1630 + }, + { + "epoch": 0.5318802862719584, + "grad_norm": 2.2783384323120117, + "learning_rate": 4.886813978351646e-05, + "loss": 0.9195, + "step": 1635 + }, + { + "epoch": 0.5335068314899154, + "grad_norm": 1.956603765487671, + "learning_rate": 4.886044941011405e-05, + "loss": 0.9087, + "step": 1640 + }, + { + "epoch": 0.5351333767078725, + "grad_norm": 1.3895164728164673, + "learning_rate": 4.885273360841112e-05, + "loss": 0.9246, + "step": 1645 + }, + { + "epoch": 0.5367599219258296, + "grad_norm": 1.6388825178146362, + "learning_rate": 4.884499238663045e-05, + "loss": 0.94, + "step": 1650 + }, + { + "epoch": 0.5383864671437866, + "grad_norm": 1.2691999673843384, + "learning_rate": 4.883722575302193e-05, + "loss": 0.925, + "step": 1655 + }, + { + "epoch": 0.5400130123617437, + "grad_norm": 1.6276110410690308, + "learning_rate": 4.882943371586256e-05, + "loss": 0.9182, + "step": 1660 + }, + { + "epoch": 0.5416395575797007, + "grad_norm": 1.5029033422470093, + "learning_rate": 4.882161628345636e-05, + "loss": 0.8791, + "step": 1665 + }, + { + "epoch": 0.5432661027976577, + "grad_norm": 1.3272761106491089, + "learning_rate": 4.881377346413446e-05, + "loss": 0.9284, + "step": 1670 + }, + { + "epoch": 0.5448926480156149, + "grad_norm": 1.3621693849563599, + "learning_rate": 4.8805905266254995e-05, + "loss": 0.9284, + "step": 1675 + }, + { + "epoch": 0.5465191932335719, + "grad_norm": 1.5262432098388672, + "learning_rate": 4.879801169820321e-05, + "loss": 0.9112, + "step": 1680 + }, + { + "epoch": 0.5481457384515289, + "grad_norm": 1.681083083152771, + "learning_rate": 4.879009276839133e-05, + "loss": 0.891, + "step": 1685 + }, + { + "epoch": 0.549772283669486, + "grad_norm": 1.43258798122406, + "learning_rate": 4.8782148485258636e-05, + "loss": 0.8994, + "step": 1690 + }, + { + "epoch": 0.5513988288874431, + "grad_norm": 1.2909198999404907, + "learning_rate": 4.877417885727143e-05, + "loss": 0.9234, + "step": 1695 + }, + { + "epoch": 0.5530253741054001, + "grad_norm": 1.4957412481307983, + "learning_rate": 4.8766183892923025e-05, + "loss": 0.9163, + "step": 1700 + }, + { + "epoch": 0.5546519193233572, + "grad_norm": 1.355197787284851, + "learning_rate": 4.8758163600733705e-05, + "loss": 0.9011, + "step": 1705 + }, + { + "epoch": 0.5562784645413142, + "grad_norm": 1.5381678342819214, + "learning_rate": 4.875011798925078e-05, + "loss": 0.8994, + "step": 1710 + }, + { + "epoch": 0.5579050097592713, + "grad_norm": 1.3496179580688477, + "learning_rate": 4.874204706704854e-05, + "loss": 0.9382, + "step": 1715 + }, + { + "epoch": 0.5595315549772284, + "grad_norm": 1.2696263790130615, + "learning_rate": 4.873395084272823e-05, + "loss": 0.9497, + "step": 1720 + }, + { + "epoch": 0.5611581001951854, + "grad_norm": 1.3558508157730103, + "learning_rate": 4.8725829324918084e-05, + "loss": 0.8997, + "step": 1725 + }, + { + "epoch": 0.5627846454131424, + "grad_norm": 1.562991976737976, + "learning_rate": 4.871768252227326e-05, + "loss": 0.8956, + "step": 1730 + }, + { + "epoch": 0.5644111906310996, + "grad_norm": 1.7173439264297485, + "learning_rate": 4.870951044347589e-05, + "loss": 0.9083, + "step": 1735 + }, + { + "epoch": 0.5660377358490566, + "grad_norm": 1.1349279880523682, + "learning_rate": 4.870131309723505e-05, + "loss": 0.9533, + "step": 1740 + }, + { + "epoch": 0.5676642810670136, + "grad_norm": 1.9187933206558228, + "learning_rate": 4.8693090492286695e-05, + "loss": 0.9323, + "step": 1745 + }, + { + "epoch": 0.5692908262849707, + "grad_norm": 1.462189793586731, + "learning_rate": 4.868484263739375e-05, + "loss": 0.9299, + "step": 1750 + }, + { + "epoch": 0.5709173715029278, + "grad_norm": 1.755936622619629, + "learning_rate": 4.867656954134603e-05, + "loss": 0.9344, + "step": 1755 + }, + { + "epoch": 0.5725439167208849, + "grad_norm": 1.4357781410217285, + "learning_rate": 4.866827121296025e-05, + "loss": 0.8918, + "step": 1760 + }, + { + "epoch": 0.5741704619388419, + "grad_norm": 1.812160849571228, + "learning_rate": 4.865994766108e-05, + "loss": 0.902, + "step": 1765 + }, + { + "epoch": 0.5757970071567989, + "grad_norm": 1.4691888093948364, + "learning_rate": 4.865159889457579e-05, + "loss": 0.906, + "step": 1770 + }, + { + "epoch": 0.5774235523747561, + "grad_norm": 1.464616298675537, + "learning_rate": 4.864322492234495e-05, + "loss": 0.9398, + "step": 1775 + }, + { + "epoch": 0.5790500975927131, + "grad_norm": 1.6250700950622559, + "learning_rate": 4.863482575331173e-05, + "loss": 0.9044, + "step": 1780 + }, + { + "epoch": 0.5806766428106701, + "grad_norm": 1.3317840099334717, + "learning_rate": 4.8626401396427176e-05, + "loss": 0.945, + "step": 1785 + }, + { + "epoch": 0.5823031880286272, + "grad_norm": 1.3712043762207031, + "learning_rate": 4.861795186066922e-05, + "loss": 0.9229, + "step": 1790 + }, + { + "epoch": 0.5839297332465843, + "grad_norm": 1.259316325187683, + "learning_rate": 4.860947715504259e-05, + "loss": 0.9202, + "step": 1795 + }, + { + "epoch": 0.5855562784645413, + "grad_norm": 1.2751063108444214, + "learning_rate": 4.860097728857889e-05, + "loss": 0.942, + "step": 1800 + }, + { + "epoch": 0.5871828236824984, + "grad_norm": 1.4026081562042236, + "learning_rate": 4.859245227033647e-05, + "loss": 0.9305, + "step": 1805 + }, + { + "epoch": 0.5888093689004554, + "grad_norm": 1.7082586288452148, + "learning_rate": 4.858390210940054e-05, + "loss": 0.8954, + "step": 1810 + }, + { + "epoch": 0.5904359141184125, + "grad_norm": 1.4861118793487549, + "learning_rate": 4.857532681488308e-05, + "loss": 0.9268, + "step": 1815 + }, + { + "epoch": 0.5920624593363696, + "grad_norm": 1.7153403759002686, + "learning_rate": 4.8566726395922866e-05, + "loss": 0.9146, + "step": 1820 + }, + { + "epoch": 0.5936890045543266, + "grad_norm": 1.5805591344833374, + "learning_rate": 4.8558100861685443e-05, + "loss": 0.9032, + "step": 1825 + }, + { + "epoch": 0.5953155497722836, + "grad_norm": 1.1711604595184326, + "learning_rate": 4.8549450221363106e-05, + "loss": 0.9127, + "step": 1830 + }, + { + "epoch": 0.5969420949902408, + "grad_norm": 1.3482109308242798, + "learning_rate": 4.8540774484174946e-05, + "loss": 0.9176, + "step": 1835 + }, + { + "epoch": 0.5985686402081978, + "grad_norm": 1.6254630088806152, + "learning_rate": 4.853207365936676e-05, + "loss": 0.9253, + "step": 1840 + }, + { + "epoch": 0.6001951854261548, + "grad_norm": 1.8429731130599976, + "learning_rate": 4.85233477562111e-05, + "loss": 0.9143, + "step": 1845 + }, + { + "epoch": 0.6018217306441119, + "grad_norm": 1.5493820905685425, + "learning_rate": 4.851459678400725e-05, + "loss": 0.8856, + "step": 1850 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 1.4831209182739258, + "learning_rate": 4.850582075208119e-05, + "loss": 0.9438, + "step": 1855 + }, + { + "epoch": 0.605074821080026, + "grad_norm": 1.5420846939086914, + "learning_rate": 4.849701966978562e-05, + "loss": 0.928, + "step": 1860 + }, + { + "epoch": 0.6067013662979831, + "grad_norm": 1.4983383417129517, + "learning_rate": 4.848819354649995e-05, + "loss": 0.9074, + "step": 1865 + }, + { + "epoch": 0.6083279115159401, + "grad_norm": 1.2379560470581055, + "learning_rate": 4.8479342391630233e-05, + "loss": 0.9302, + "step": 1870 + }, + { + "epoch": 0.6099544567338973, + "grad_norm": 1.3859138488769531, + "learning_rate": 4.8470466214609245e-05, + "loss": 0.9336, + "step": 1875 + }, + { + "epoch": 0.6115810019518543, + "grad_norm": 1.684417724609375, + "learning_rate": 4.846156502489641e-05, + "loss": 0.919, + "step": 1880 + }, + { + "epoch": 0.6132075471698113, + "grad_norm": 1.8131024837493896, + "learning_rate": 4.8452638831977806e-05, + "loss": 0.9121, + "step": 1885 + }, + { + "epoch": 0.6148340923877684, + "grad_norm": 1.740172266960144, + "learning_rate": 4.844368764536614e-05, + "loss": 0.9154, + "step": 1890 + }, + { + "epoch": 0.6164606376057254, + "grad_norm": 1.6505630016326904, + "learning_rate": 4.843471147460081e-05, + "loss": 0.9107, + "step": 1895 + }, + { + "epoch": 0.6180871828236825, + "grad_norm": 1.505215048789978, + "learning_rate": 4.842751255582534e-05, + "loss": 0.8945, + "step": 1900 + }, + { + "epoch": 0.6197137280416396, + "grad_norm": 1.4264938831329346, + "learning_rate": 4.841849143770754e-05, + "loss": 0.8896, + "step": 1905 + }, + { + "epoch": 0.6213402732595966, + "grad_norm": 1.25933039188385, + "learning_rate": 4.84094453622879e-05, + "loss": 0.9213, + "step": 1910 + }, + { + "epoch": 0.6229668184775536, + "grad_norm": 1.39175546169281, + "learning_rate": 4.840037433920688e-05, + "loss": 0.9129, + "step": 1915 + }, + { + "epoch": 0.6245933636955108, + "grad_norm": 1.2698136568069458, + "learning_rate": 4.839127837813158e-05, + "loss": 0.9019, + "step": 1920 + }, + { + "epoch": 0.6262199089134678, + "grad_norm": 1.455878734588623, + "learning_rate": 4.838215748875562e-05, + "loss": 0.8827, + "step": 1925 + }, + { + "epoch": 0.6278464541314248, + "grad_norm": 1.2814476490020752, + "learning_rate": 4.837301168079924e-05, + "loss": 0.9246, + "step": 1930 + }, + { + "epoch": 0.629472999349382, + "grad_norm": 1.7256293296813965, + "learning_rate": 4.83638409640092e-05, + "loss": 0.9243, + "step": 1935 + }, + { + "epoch": 0.631099544567339, + "grad_norm": 1.6953538656234741, + "learning_rate": 4.835464534815882e-05, + "loss": 0.8951, + "step": 1940 + }, + { + "epoch": 0.632726089785296, + "grad_norm": 1.900547742843628, + "learning_rate": 4.834542484304795e-05, + "loss": 0.9313, + "step": 1945 + }, + { + "epoch": 0.6343526350032531, + "grad_norm": 1.5444824695587158, + "learning_rate": 4.8336179458502975e-05, + "loss": 0.9305, + "step": 1950 + }, + { + "epoch": 0.6359791802212101, + "grad_norm": 1.6127551794052124, + "learning_rate": 4.8326909204376776e-05, + "loss": 0.8828, + "step": 1955 + }, + { + "epoch": 0.6376057254391672, + "grad_norm": 1.699691891670227, + "learning_rate": 4.8317614090548755e-05, + "loss": 0.9208, + "step": 1960 + }, + { + "epoch": 0.6392322706571243, + "grad_norm": 1.6295160055160522, + "learning_rate": 4.8308294126924794e-05, + "loss": 0.8974, + "step": 1965 + }, + { + "epoch": 0.6408588158750813, + "grad_norm": 1.6393744945526123, + "learning_rate": 4.829894932343727e-05, + "loss": 0.9003, + "step": 1970 + }, + { + "epoch": 0.6424853610930383, + "grad_norm": 1.308160424232483, + "learning_rate": 4.828957969004502e-05, + "loss": 0.9315, + "step": 1975 + }, + { + "epoch": 0.6441119063109955, + "grad_norm": 1.4571892023086548, + "learning_rate": 4.828018523673336e-05, + "loss": 0.8949, + "step": 1980 + }, + { + "epoch": 0.6457384515289525, + "grad_norm": 1.5001887083053589, + "learning_rate": 4.827076597351403e-05, + "loss": 0.9136, + "step": 1985 + }, + { + "epoch": 0.6473649967469096, + "grad_norm": 1.5765280723571777, + "learning_rate": 4.826132191042525e-05, + "loss": 0.9075, + "step": 1990 + }, + { + "epoch": 0.6489915419648666, + "grad_norm": 1.9233174324035645, + "learning_rate": 4.825185305753161e-05, + "loss": 0.9265, + "step": 1995 + }, + { + "epoch": 0.6506180871828237, + "grad_norm": 1.5038241147994995, + "learning_rate": 4.824235942492418e-05, + "loss": 0.9009, + "step": 2000 + }, + { + "epoch": 0.6522446324007808, + "grad_norm": 1.5957181453704834, + "learning_rate": 4.823284102272041e-05, + "loss": 0.8986, + "step": 2005 + }, + { + "epoch": 0.6538711776187378, + "grad_norm": 1.202020287513733, + "learning_rate": 4.822329786106413e-05, + "loss": 0.9246, + "step": 2010 + }, + { + "epoch": 0.6554977228366948, + "grad_norm": 1.4576326608657837, + "learning_rate": 4.82137299501256e-05, + "loss": 0.9389, + "step": 2015 + }, + { + "epoch": 0.657124268054652, + "grad_norm": 1.691660761833191, + "learning_rate": 4.820413730010141e-05, + "loss": 0.8919, + "step": 2020 + }, + { + "epoch": 0.658750813272609, + "grad_norm": 1.5620954036712646, + "learning_rate": 4.819451992121454e-05, + "loss": 0.8708, + "step": 2025 + }, + { + "epoch": 0.660377358490566, + "grad_norm": 1.7448545694351196, + "learning_rate": 4.818487782371434e-05, + "loss": 0.9154, + "step": 2030 + }, + { + "epoch": 0.6620039037085231, + "grad_norm": 1.2183756828308105, + "learning_rate": 4.817521101787646e-05, + "loss": 0.8987, + "step": 2035 + }, + { + "epoch": 0.6636304489264802, + "grad_norm": 1.219459891319275, + "learning_rate": 4.81655195140029e-05, + "loss": 0.892, + "step": 2040 + }, + { + "epoch": 0.6652569941444372, + "grad_norm": 1.7939671277999878, + "learning_rate": 4.815580332242199e-05, + "loss": 0.9229, + "step": 2045 + }, + { + "epoch": 0.6668835393623943, + "grad_norm": 1.5826709270477295, + "learning_rate": 4.814606245348838e-05, + "loss": 0.9057, + "step": 2050 + }, + { + "epoch": 0.6685100845803513, + "grad_norm": 1.5429420471191406, + "learning_rate": 4.813629691758299e-05, + "loss": 0.9043, + "step": 2055 + }, + { + "epoch": 0.6701366297983083, + "grad_norm": 1.361778736114502, + "learning_rate": 4.812650672511305e-05, + "loss": 0.8971, + "step": 2060 + }, + { + "epoch": 0.6717631750162655, + "grad_norm": 1.300718903541565, + "learning_rate": 4.811669188651204e-05, + "loss": 0.8844, + "step": 2065 + }, + { + "epoch": 0.6733897202342225, + "grad_norm": 1.4906117916107178, + "learning_rate": 4.810685241223974e-05, + "loss": 0.8839, + "step": 2070 + }, + { + "epoch": 0.6750162654521795, + "grad_norm": 1.3935993909835815, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.9039, + "step": 2075 + }, + { + "epoch": 0.6766428106701367, + "grad_norm": 1.3832114934921265, + "learning_rate": 4.808709959865159e-05, + "loss": 0.9251, + "step": 2080 + }, + { + "epoch": 0.6782693558880937, + "grad_norm": 1.497290015220642, + "learning_rate": 4.8077186280386475e-05, + "loss": 0.9217, + "step": 2085 + }, + { + "epoch": 0.6798959011060507, + "grad_norm": 1.2372499704360962, + "learning_rate": 4.806724836855157e-05, + "loss": 0.9133, + "step": 2090 + }, + { + "epoch": 0.6815224463240078, + "grad_norm": 1.556992769241333, + "learning_rate": 4.8057285873737765e-05, + "loss": 0.9021, + "step": 2095 + }, + { + "epoch": 0.6831489915419648, + "grad_norm": 1.4471299648284912, + "learning_rate": 4.804729880656221e-05, + "loss": 0.9041, + "step": 2100 + }, + { + "epoch": 0.684775536759922, + "grad_norm": 1.3373253345489502, + "learning_rate": 4.803728717766821e-05, + "loss": 0.9119, + "step": 2105 + }, + { + "epoch": 0.686402081977879, + "grad_norm": 1.2518610954284668, + "learning_rate": 4.8027250997725235e-05, + "loss": 0.9396, + "step": 2110 + }, + { + "epoch": 0.688028627195836, + "grad_norm": 2.017392873764038, + "learning_rate": 4.8017190277428956e-05, + "loss": 0.8935, + "step": 2115 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 1.5191129446029663, + "learning_rate": 4.800710502750115e-05, + "loss": 0.9019, + "step": 2120 + }, + { + "epoch": 0.6912817176317502, + "grad_norm": 1.138543963432312, + "learning_rate": 4.799699525868979e-05, + "loss": 0.8925, + "step": 2125 + }, + { + "epoch": 0.6929082628497072, + "grad_norm": 1.3243154287338257, + "learning_rate": 4.798686098176892e-05, + "loss": 0.9023, + "step": 2130 + }, + { + "epoch": 0.6945348080676643, + "grad_norm": 1.4241605997085571, + "learning_rate": 4.797670220753876e-05, + "loss": 0.9079, + "step": 2135 + }, + { + "epoch": 0.6961613532856213, + "grad_norm": 1.514777660369873, + "learning_rate": 4.796651894682558e-05, + "loss": 0.9397, + "step": 2140 + }, + { + "epoch": 0.6977878985035784, + "grad_norm": 1.6136503219604492, + "learning_rate": 4.79563112104818e-05, + "loss": 0.9236, + "step": 2145 + }, + { + "epoch": 0.6994144437215355, + "grad_norm": 2.230391263961792, + "learning_rate": 4.794607900938589e-05, + "loss": 0.9045, + "step": 2150 + }, + { + "epoch": 0.7010409889394925, + "grad_norm": 1.5815776586532593, + "learning_rate": 4.7935822354442397e-05, + "loss": 0.8906, + "step": 2155 + }, + { + "epoch": 0.7026675341574495, + "grad_norm": 1.461105227470398, + "learning_rate": 4.792554125658193e-05, + "loss": 0.9111, + "step": 2160 + }, + { + "epoch": 0.7042940793754067, + "grad_norm": 1.2527744770050049, + "learning_rate": 4.7915235726761154e-05, + "loss": 0.938, + "step": 2165 + }, + { + "epoch": 0.7059206245933637, + "grad_norm": 1.2054787874221802, + "learning_rate": 4.790490577596277e-05, + "loss": 0.9067, + "step": 2170 + }, + { + "epoch": 0.7075471698113207, + "grad_norm": 1.340711236000061, + "learning_rate": 4.789455141519551e-05, + "loss": 0.8835, + "step": 2175 + }, + { + "epoch": 0.7091737150292778, + "grad_norm": 1.614592432975769, + "learning_rate": 4.7884172655494086e-05, + "loss": 0.8839, + "step": 2180 + }, + { + "epoch": 0.7108002602472349, + "grad_norm": 1.570431113243103, + "learning_rate": 4.7873769507919266e-05, + "loss": 0.9258, + "step": 2185 + }, + { + "epoch": 0.7124268054651919, + "grad_norm": 1.6654767990112305, + "learning_rate": 4.786334198355775e-05, + "loss": 0.9091, + "step": 2190 + }, + { + "epoch": 0.714053350683149, + "grad_norm": 1.2270712852478027, + "learning_rate": 4.785289009352227e-05, + "loss": 0.895, + "step": 2195 + }, + { + "epoch": 0.715679895901106, + "grad_norm": 1.5635367631912231, + "learning_rate": 4.784241384895151e-05, + "loss": 0.88, + "step": 2200 + }, + { + "epoch": 0.7173064411190632, + "grad_norm": 1.700825572013855, + "learning_rate": 4.7831913261010066e-05, + "loss": 0.926, + "step": 2205 + }, + { + "epoch": 0.7189329863370202, + "grad_norm": 1.4447286128997803, + "learning_rate": 4.7821388340888535e-05, + "loss": 0.8864, + "step": 2210 + }, + { + "epoch": 0.7205595315549772, + "grad_norm": 1.4532170295715332, + "learning_rate": 4.781083909980342e-05, + "loss": 0.9093, + "step": 2215 + }, + { + "epoch": 0.7221860767729343, + "grad_norm": 1.4676454067230225, + "learning_rate": 4.7800265548997135e-05, + "loss": 0.9034, + "step": 2220 + }, + { + "epoch": 0.7238126219908914, + "grad_norm": 1.2255409955978394, + "learning_rate": 4.778966769973802e-05, + "loss": 0.9175, + "step": 2225 + }, + { + "epoch": 0.7254391672088484, + "grad_norm": 1.2460826635360718, + "learning_rate": 4.77790455633203e-05, + "loss": 0.9093, + "step": 2230 + }, + { + "epoch": 0.7270657124268055, + "grad_norm": 1.4651199579238892, + "learning_rate": 4.7768399151064076e-05, + "loss": 0.8822, + "step": 2235 + }, + { + "epoch": 0.7286922576447625, + "grad_norm": 1.409850001335144, + "learning_rate": 4.775772847431533e-05, + "loss": 0.9438, + "step": 2240 + }, + { + "epoch": 0.7303188028627196, + "grad_norm": 2.183683395385742, + "learning_rate": 4.774703354444591e-05, + "loss": 0.8875, + "step": 2245 + }, + { + "epoch": 0.7319453480806767, + "grad_norm": 1.7117607593536377, + "learning_rate": 4.7736314372853486e-05, + "loss": 0.9076, + "step": 2250 + }, + { + "epoch": 0.7335718932986337, + "grad_norm": 1.5121268033981323, + "learning_rate": 4.7725570970961586e-05, + "loss": 0.9199, + "step": 2255 + }, + { + "epoch": 0.7351984385165907, + "grad_norm": 1.3727307319641113, + "learning_rate": 4.771480335021955e-05, + "loss": 0.9003, + "step": 2260 + }, + { + "epoch": 0.7368249837345479, + "grad_norm": 1.9547559022903442, + "learning_rate": 4.770401152210253e-05, + "loss": 0.9363, + "step": 2265 + }, + { + "epoch": 0.7384515289525049, + "grad_norm": 1.2419757843017578, + "learning_rate": 4.7693195498111486e-05, + "loss": 0.8862, + "step": 2270 + }, + { + "epoch": 0.7400780741704619, + "grad_norm": 1.3020061254501343, + "learning_rate": 4.768235528977314e-05, + "loss": 0.9147, + "step": 2275 + }, + { + "epoch": 0.741704619388419, + "grad_norm": 1.5069094896316528, + "learning_rate": 4.767149090864002e-05, + "loss": 0.8773, + "step": 2280 + }, + { + "epoch": 0.743331164606376, + "grad_norm": 1.7590210437774658, + "learning_rate": 4.766060236629037e-05, + "loss": 0.9125, + "step": 2285 + }, + { + "epoch": 0.7449577098243331, + "grad_norm": 6.086793899536133, + "learning_rate": 4.764968967432824e-05, + "loss": 0.875, + "step": 2290 + }, + { + "epoch": 0.7465842550422902, + "grad_norm": 1.422428011894226, + "learning_rate": 4.763875284438336e-05, + "loss": 0.8972, + "step": 2295 + }, + { + "epoch": 0.7482108002602472, + "grad_norm": 1.4383183717727661, + "learning_rate": 4.7627791888111227e-05, + "loss": 0.8952, + "step": 2300 + }, + { + "epoch": 0.7498373454782042, + "grad_norm": 1.260046362876892, + "learning_rate": 4.7616806817193024e-05, + "loss": 0.9001, + "step": 2305 + }, + { + "epoch": 0.7514638906961614, + "grad_norm": 1.5630263090133667, + "learning_rate": 4.7605797643335655e-05, + "loss": 0.9195, + "step": 2310 + }, + { + "epoch": 0.7530904359141184, + "grad_norm": 1.5919269323349, + "learning_rate": 4.759476437827168e-05, + "loss": 0.9194, + "step": 2315 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 1.6837332248687744, + "learning_rate": 4.758370703375936e-05, + "loss": 0.9159, + "step": 2320 + }, + { + "epoch": 0.7563435263500325, + "grad_norm": 1.653176188468933, + "learning_rate": 4.757262562158262e-05, + "loss": 0.941, + "step": 2325 + }, + { + "epoch": 0.7579700715679896, + "grad_norm": 2.1222479343414307, + "learning_rate": 4.7561520153550997e-05, + "loss": 0.9049, + "step": 2330 + }, + { + "epoch": 0.7595966167859467, + "grad_norm": 1.3103207349777222, + "learning_rate": 4.7550390641499715e-05, + "loss": 0.9158, + "step": 2335 + }, + { + "epoch": 0.7612231620039037, + "grad_norm": 1.8177686929702759, + "learning_rate": 4.753923709728958e-05, + "loss": 0.9221, + "step": 2340 + }, + { + "epoch": 0.7628497072218607, + "grad_norm": 1.3908177614212036, + "learning_rate": 4.7528059532807045e-05, + "loss": 0.8849, + "step": 2345 + }, + { + "epoch": 0.7644762524398179, + "grad_norm": 1.4398841857910156, + "learning_rate": 4.751685795996413e-05, + "loss": 0.8825, + "step": 2350 + }, + { + "epoch": 0.7661027976577749, + "grad_norm": 1.4527658224105835, + "learning_rate": 4.750563239069845e-05, + "loss": 0.8783, + "step": 2355 + }, + { + "epoch": 0.7677293428757319, + "grad_norm": 1.27535879611969, + "learning_rate": 4.749438283697321e-05, + "loss": 0.8824, + "step": 2360 + }, + { + "epoch": 0.769355888093689, + "grad_norm": 1.118067741394043, + "learning_rate": 4.7483109310777165e-05, + "loss": 0.901, + "step": 2365 + }, + { + "epoch": 0.7709824333116461, + "grad_norm": 1.5649245977401733, + "learning_rate": 4.747181182412459e-05, + "loss": 0.8866, + "step": 2370 + }, + { + "epoch": 0.7726089785296031, + "grad_norm": 1.307242751121521, + "learning_rate": 4.7460490389055355e-05, + "loss": 0.8819, + "step": 2375 + }, + { + "epoch": 0.7742355237475602, + "grad_norm": 1.4124691486358643, + "learning_rate": 4.7449145017634795e-05, + "loss": 0.9327, + "step": 2380 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 1.5891166925430298, + "learning_rate": 4.743777572195378e-05, + "loss": 0.8904, + "step": 2385 + }, + { + "epoch": 0.7774886141834743, + "grad_norm": 1.482954978942871, + "learning_rate": 4.742638251412868e-05, + "loss": 0.8952, + "step": 2390 + }, + { + "epoch": 0.7791151594014314, + "grad_norm": 2.101250171661377, + "learning_rate": 4.741496540630134e-05, + "loss": 0.9599, + "step": 2395 + }, + { + "epoch": 0.7807417046193884, + "grad_norm": 1.6486133337020874, + "learning_rate": 4.7403524410639066e-05, + "loss": 0.9159, + "step": 2400 + }, + { + "epoch": 0.7823682498373454, + "grad_norm": 2.821922779083252, + "learning_rate": 4.739205953933464e-05, + "loss": 0.9168, + "step": 2405 + }, + { + "epoch": 0.7839947950553026, + "grad_norm": 1.8599350452423096, + "learning_rate": 4.738057080460629e-05, + "loss": 0.8834, + "step": 2410 + }, + { + "epoch": 0.7856213402732596, + "grad_norm": 1.3947274684906006, + "learning_rate": 4.736905821869765e-05, + "loss": 0.9196, + "step": 2415 + }, + { + "epoch": 0.7872478854912166, + "grad_norm": 1.0371801853179932, + "learning_rate": 4.73575217938778e-05, + "loss": 0.921, + "step": 2420 + }, + { + "epoch": 0.7888744307091737, + "grad_norm": 1.368464708328247, + "learning_rate": 4.734596154244121e-05, + "loss": 0.8993, + "step": 2425 + }, + { + "epoch": 0.7905009759271308, + "grad_norm": 1.3924031257629395, + "learning_rate": 4.733437747670775e-05, + "loss": 0.8992, + "step": 2430 + }, + { + "epoch": 0.7921275211450879, + "grad_norm": 1.2024319171905518, + "learning_rate": 4.732276960902267e-05, + "loss": 0.9099, + "step": 2435 + }, + { + "epoch": 0.7937540663630449, + "grad_norm": 1.3124393224716187, + "learning_rate": 4.731113795175656e-05, + "loss": 0.8652, + "step": 2440 + }, + { + "epoch": 0.7953806115810019, + "grad_norm": 1.3136018514633179, + "learning_rate": 4.7299482517305404e-05, + "loss": 0.8494, + "step": 2445 + }, + { + "epoch": 0.7970071567989591, + "grad_norm": 1.4578925371170044, + "learning_rate": 4.72878033180905e-05, + "loss": 0.9009, + "step": 2450 + }, + { + "epoch": 0.7986337020169161, + "grad_norm": 1.4511196613311768, + "learning_rate": 4.7276100366558474e-05, + "loss": 0.8783, + "step": 2455 + }, + { + "epoch": 0.8002602472348731, + "grad_norm": 1.3172091245651245, + "learning_rate": 4.726437367518128e-05, + "loss": 0.8729, + "step": 2460 + }, + { + "epoch": 0.8018867924528302, + "grad_norm": 1.4047578573226929, + "learning_rate": 4.7252623256456144e-05, + "loss": 0.9013, + "step": 2465 + }, + { + "epoch": 0.8035133376707873, + "grad_norm": 1.3769360780715942, + "learning_rate": 4.7240849122905616e-05, + "loss": 0.8819, + "step": 2470 + }, + { + "epoch": 0.8051398828887443, + "grad_norm": 1.2437390089035034, + "learning_rate": 4.722905128707749e-05, + "loss": 0.8865, + "step": 2475 + }, + { + "epoch": 0.8067664281067014, + "grad_norm": 1.3973208665847778, + "learning_rate": 4.721722976154481e-05, + "loss": 0.9081, + "step": 2480 + }, + { + "epoch": 0.8083929733246584, + "grad_norm": 1.4897907972335815, + "learning_rate": 4.720538455890591e-05, + "loss": 0.9, + "step": 2485 + }, + { + "epoch": 0.8100195185426154, + "grad_norm": 1.6833552122116089, + "learning_rate": 4.7193515691784326e-05, + "loss": 0.8661, + "step": 2490 + }, + { + "epoch": 0.8116460637605726, + "grad_norm": 1.1261287927627563, + "learning_rate": 4.718162317282882e-05, + "loss": 0.8789, + "step": 2495 + }, + { + "epoch": 0.8132726089785296, + "grad_norm": 1.7293471097946167, + "learning_rate": 4.716970701471334e-05, + "loss": 0.8905, + "step": 2500 + }, + { + "epoch": 0.8148991541964866, + "grad_norm": 1.425553798675537, + "learning_rate": 4.7157767230137064e-05, + "loss": 0.8777, + "step": 2505 + }, + { + "epoch": 0.8165256994144438, + "grad_norm": 1.802399754524231, + "learning_rate": 4.714580383182433e-05, + "loss": 0.9044, + "step": 2510 + }, + { + "epoch": 0.8181522446324008, + "grad_norm": 1.6173362731933594, + "learning_rate": 4.713381683252463e-05, + "loss": 0.8687, + "step": 2515 + }, + { + "epoch": 0.8197787898503578, + "grad_norm": 1.3433735370635986, + "learning_rate": 4.712180624501263e-05, + "loss": 0.8753, + "step": 2520 + }, + { + "epoch": 0.8214053350683149, + "grad_norm": 1.1365931034088135, + "learning_rate": 4.710977208208812e-05, + "loss": 0.9034, + "step": 2525 + }, + { + "epoch": 0.8230318802862719, + "grad_norm": 1.590441346168518, + "learning_rate": 4.709771435657602e-05, + "loss": 0.905, + "step": 2530 + }, + { + "epoch": 0.824658425504229, + "grad_norm": 1.416684627532959, + "learning_rate": 4.708563308132636e-05, + "loss": 0.8948, + "step": 2535 + }, + { + "epoch": 0.8262849707221861, + "grad_norm": 1.2487531900405884, + "learning_rate": 4.707352826921426e-05, + "loss": 0.8734, + "step": 2540 + }, + { + "epoch": 0.8279115159401431, + "grad_norm": 1.228668451309204, + "learning_rate": 4.706139993313994e-05, + "loss": 0.9077, + "step": 2545 + }, + { + "epoch": 0.8295380611581002, + "grad_norm": 1.2547862529754639, + "learning_rate": 4.7049248086028666e-05, + "loss": 0.8753, + "step": 2550 + }, + { + "epoch": 0.8311646063760573, + "grad_norm": 1.5615379810333252, + "learning_rate": 4.7037072740830785e-05, + "loss": 0.8534, + "step": 2555 + }, + { + "epoch": 0.8327911515940143, + "grad_norm": 1.3224072456359863, + "learning_rate": 4.702487391052167e-05, + "loss": 0.8961, + "step": 2560 + }, + { + "epoch": 0.8344176968119714, + "grad_norm": 1.1746350526809692, + "learning_rate": 4.701265160810172e-05, + "loss": 0.8829, + "step": 2565 + }, + { + "epoch": 0.8360442420299284, + "grad_norm": 1.277154803276062, + "learning_rate": 4.7000405846596365e-05, + "loss": 0.9225, + "step": 2570 + }, + { + "epoch": 0.8376707872478855, + "grad_norm": 1.3092966079711914, + "learning_rate": 4.6988136639056025e-05, + "loss": 0.918, + "step": 2575 + }, + { + "epoch": 0.8392973324658426, + "grad_norm": 1.238376498222351, + "learning_rate": 4.69758439985561e-05, + "loss": 0.8788, + "step": 2580 + }, + { + "epoch": 0.8409238776837996, + "grad_norm": 1.7734768390655518, + "learning_rate": 4.696352793819698e-05, + "loss": 0.8749, + "step": 2585 + }, + { + "epoch": 0.8425504229017566, + "grad_norm": 1.8386881351470947, + "learning_rate": 4.6951188471104e-05, + "loss": 0.9006, + "step": 2590 + }, + { + "epoch": 0.8441769681197138, + "grad_norm": 1.5979670286178589, + "learning_rate": 4.693882561042743e-05, + "loss": 0.8943, + "step": 2595 + }, + { + "epoch": 0.8458035133376708, + "grad_norm": 1.3632482290267944, + "learning_rate": 4.6926439369342515e-05, + "loss": 0.8867, + "step": 2600 + }, + { + "epoch": 0.8474300585556278, + "grad_norm": 1.5908359289169312, + "learning_rate": 4.6914029761049357e-05, + "loss": 0.9058, + "step": 2605 + }, + { + "epoch": 0.8490566037735849, + "grad_norm": 1.3819308280944824, + "learning_rate": 4.6901596798772995e-05, + "loss": 0.9116, + "step": 2610 + }, + { + "epoch": 0.850683148991542, + "grad_norm": 1.7333976030349731, + "learning_rate": 4.688914049576337e-05, + "loss": 0.8662, + "step": 2615 + }, + { + "epoch": 0.852309694209499, + "grad_norm": 1.5826621055603027, + "learning_rate": 4.6876660865295264e-05, + "loss": 0.9177, + "step": 2620 + }, + { + "epoch": 0.8539362394274561, + "grad_norm": 1.3125733137130737, + "learning_rate": 4.686415792066833e-05, + "loss": 0.9016, + "step": 2625 + }, + { + "epoch": 0.8555627846454131, + "grad_norm": 1.6679668426513672, + "learning_rate": 4.685163167520708e-05, + "loss": 0.8826, + "step": 2630 + }, + { + "epoch": 0.8571893298633702, + "grad_norm": 1.358363389968872, + "learning_rate": 4.683908214226084e-05, + "loss": 0.8839, + "step": 2635 + }, + { + "epoch": 0.8588158750813273, + "grad_norm": 1.4284957647323608, + "learning_rate": 4.682650933520377e-05, + "loss": 0.9089, + "step": 2640 + }, + { + "epoch": 0.8604424202992843, + "grad_norm": 1.3300113677978516, + "learning_rate": 4.6813913267434835e-05, + "loss": 0.9001, + "step": 2645 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.3424525260925293, + "learning_rate": 4.680129395237776e-05, + "loss": 0.9223, + "step": 2650 + }, + { + "epoch": 0.8636955107351985, + "grad_norm": 1.2451975345611572, + "learning_rate": 4.678865140348108e-05, + "loss": 0.8878, + "step": 2655 + }, + { + "epoch": 0.8653220559531555, + "grad_norm": 1.4667913913726807, + "learning_rate": 4.6775985634218074e-05, + "loss": 0.8799, + "step": 2660 + }, + { + "epoch": 0.8669486011711126, + "grad_norm": 1.3510668277740479, + "learning_rate": 4.676329665808677e-05, + "loss": 0.9041, + "step": 2665 + }, + { + "epoch": 0.8685751463890696, + "grad_norm": 1.3839348554611206, + "learning_rate": 4.675058448860991e-05, + "loss": 0.9221, + "step": 2670 + }, + { + "epoch": 0.8702016916070267, + "grad_norm": 1.2424792051315308, + "learning_rate": 4.673784913933499e-05, + "loss": 0.922, + "step": 2675 + }, + { + "epoch": 0.8718282368249838, + "grad_norm": 1.6038709878921509, + "learning_rate": 4.672509062383418e-05, + "loss": 0.9114, + "step": 2680 + }, + { + "epoch": 0.8734547820429408, + "grad_norm": 1.4350557327270508, + "learning_rate": 4.6712308955704346e-05, + "loss": 0.8949, + "step": 2685 + }, + { + "epoch": 0.8750813272608978, + "grad_norm": 1.5301589965820312, + "learning_rate": 4.669950414856704e-05, + "loss": 0.8669, + "step": 2690 + }, + { + "epoch": 0.876707872478855, + "grad_norm": 1.5579694509506226, + "learning_rate": 4.668667621606845e-05, + "loss": 0.9055, + "step": 2695 + }, + { + "epoch": 0.878334417696812, + "grad_norm": 1.2389270067214966, + "learning_rate": 4.6673825171879426e-05, + "loss": 0.9124, + "step": 2700 + }, + { + "epoch": 0.879960962914769, + "grad_norm": 1.3482732772827148, + "learning_rate": 4.666095102969544e-05, + "loss": 0.8909, + "step": 2705 + }, + { + "epoch": 0.8815875081327261, + "grad_norm": 1.3475229740142822, + "learning_rate": 4.6648053803236603e-05, + "loss": 0.8978, + "step": 2710 + }, + { + "epoch": 0.8832140533506831, + "grad_norm": 1.2835525274276733, + "learning_rate": 4.6635133506247585e-05, + "loss": 0.8675, + "step": 2715 + }, + { + "epoch": 0.8848405985686402, + "grad_norm": 1.702312707901001, + "learning_rate": 4.662219015249768e-05, + "loss": 0.8663, + "step": 2720 + }, + { + "epoch": 0.8864671437865973, + "grad_norm": 1.5453839302062988, + "learning_rate": 4.660922375578073e-05, + "loss": 0.9019, + "step": 2725 + }, + { + "epoch": 0.8880936890045543, + "grad_norm": 1.2763853073120117, + "learning_rate": 4.6596234329915144e-05, + "loss": 0.8898, + "step": 2730 + }, + { + "epoch": 0.8897202342225113, + "grad_norm": 1.4243005514144897, + "learning_rate": 4.658322188874388e-05, + "loss": 0.8721, + "step": 2735 + }, + { + "epoch": 0.8913467794404685, + "grad_norm": 1.681253433227539, + "learning_rate": 4.65701864461344e-05, + "loss": 0.9313, + "step": 2740 + }, + { + "epoch": 0.8929733246584255, + "grad_norm": 1.4784010648727417, + "learning_rate": 4.6557128015978726e-05, + "loss": 0.8784, + "step": 2745 + }, + { + "epoch": 0.8945998698763825, + "grad_norm": 1.2526527643203735, + "learning_rate": 4.654404661219331e-05, + "loss": 0.8943, + "step": 2750 + }, + { + "epoch": 0.8962264150943396, + "grad_norm": 2.029129981994629, + "learning_rate": 4.653094224871916e-05, + "loss": 0.8905, + "step": 2755 + }, + { + "epoch": 0.8978529603122967, + "grad_norm": 1.133832335472107, + "learning_rate": 4.651781493952169e-05, + "loss": 0.874, + "step": 2760 + }, + { + "epoch": 0.8994795055302537, + "grad_norm": 1.1803925037384033, + "learning_rate": 4.650466469859079e-05, + "loss": 0.8811, + "step": 2765 + }, + { + "epoch": 0.9011060507482108, + "grad_norm": 1.043959140777588, + "learning_rate": 4.649149153994082e-05, + "loss": 0.8331, + "step": 2770 + }, + { + "epoch": 0.9027325959661678, + "grad_norm": 1.412017822265625, + "learning_rate": 4.647829547761053e-05, + "loss": 0.9295, + "step": 2775 + }, + { + "epoch": 0.904359141184125, + "grad_norm": 1.296935796737671, + "learning_rate": 4.646507652566307e-05, + "loss": 0.9129, + "step": 2780 + }, + { + "epoch": 0.905985686402082, + "grad_norm": 1.4021939039230347, + "learning_rate": 4.6451834698186e-05, + "loss": 0.9042, + "step": 2785 + }, + { + "epoch": 0.907612231620039, + "grad_norm": 1.2917166948318481, + "learning_rate": 4.643857000929128e-05, + "loss": 0.8817, + "step": 2790 + }, + { + "epoch": 0.9092387768379961, + "grad_norm": 1.6610374450683594, + "learning_rate": 4.642528247311518e-05, + "loss": 0.9261, + "step": 2795 + }, + { + "epoch": 0.9108653220559532, + "grad_norm": 1.1196281909942627, + "learning_rate": 4.6411972103818374e-05, + "loss": 0.8698, + "step": 2800 + }, + { + "epoch": 0.9124918672739102, + "grad_norm": 1.1855905055999756, + "learning_rate": 4.6398638915585835e-05, + "loss": 0.8741, + "step": 2805 + }, + { + "epoch": 0.9141184124918673, + "grad_norm": 1.4766261577606201, + "learning_rate": 4.638528292262686e-05, + "loss": 0.9204, + "step": 2810 + }, + { + "epoch": 0.9157449577098243, + "grad_norm": 1.1754957437515259, + "learning_rate": 4.637190413917506e-05, + "loss": 0.8729, + "step": 2815 + }, + { + "epoch": 0.9173715029277814, + "grad_norm": 1.3844324350357056, + "learning_rate": 4.6358502579488327e-05, + "loss": 0.8404, + "step": 2820 + }, + { + "epoch": 0.9189980481457385, + "grad_norm": 1.7580204010009766, + "learning_rate": 4.634507825784882e-05, + "loss": 0.9071, + "step": 2825 + }, + { + "epoch": 0.9206245933636955, + "grad_norm": 1.3922860622406006, + "learning_rate": 4.633163118856297e-05, + "loss": 0.9232, + "step": 2830 + }, + { + "epoch": 0.9222511385816525, + "grad_norm": 1.4340027570724487, + "learning_rate": 4.631816138596145e-05, + "loss": 0.8808, + "step": 2835 + }, + { + "epoch": 0.9238776837996097, + "grad_norm": 1.350365400314331, + "learning_rate": 4.630466886439914e-05, + "loss": 0.8786, + "step": 2840 + }, + { + "epoch": 0.9255042290175667, + "grad_norm": 1.529891014099121, + "learning_rate": 4.629115363825514e-05, + "loss": 0.8786, + "step": 2845 + }, + { + "epoch": 0.9271307742355237, + "grad_norm": 1.4447596073150635, + "learning_rate": 4.627761572193277e-05, + "loss": 0.8893, + "step": 2850 + }, + { + "epoch": 0.9287573194534808, + "grad_norm": 1.2633144855499268, + "learning_rate": 4.626405512985948e-05, + "loss": 0.8696, + "step": 2855 + }, + { + "epoch": 0.9303838646714379, + "grad_norm": 1.3535503149032593, + "learning_rate": 4.6250471876486954e-05, + "loss": 0.8854, + "step": 2860 + }, + { + "epoch": 0.9320104098893949, + "grad_norm": 1.520047664642334, + "learning_rate": 4.623686597629098e-05, + "loss": 0.8666, + "step": 2865 + }, + { + "epoch": 0.933636955107352, + "grad_norm": 1.3524024486541748, + "learning_rate": 4.6223237443771474e-05, + "loss": 0.8979, + "step": 2870 + }, + { + "epoch": 0.935263500325309, + "grad_norm": 1.3893473148345947, + "learning_rate": 4.62095862934525e-05, + "loss": 0.8978, + "step": 2875 + }, + { + "epoch": 0.936890045543266, + "grad_norm": 1.2136142253875732, + "learning_rate": 4.6195912539882214e-05, + "loss": 0.8805, + "step": 2880 + }, + { + "epoch": 0.9385165907612232, + "grad_norm": 1.6588865518569946, + "learning_rate": 4.618221619763287e-05, + "loss": 0.8845, + "step": 2885 + }, + { + "epoch": 0.9401431359791802, + "grad_norm": 1.3332774639129639, + "learning_rate": 4.616849728130077e-05, + "loss": 0.8937, + "step": 2890 + }, + { + "epoch": 0.9417696811971373, + "grad_norm": 1.2780805826187134, + "learning_rate": 4.6154755805506294e-05, + "loss": 0.8978, + "step": 2895 + }, + { + "epoch": 0.9433962264150944, + "grad_norm": 1.3405190706253052, + "learning_rate": 4.6140991784893864e-05, + "loss": 0.861, + "step": 2900 + }, + { + "epoch": 0.9450227716330514, + "grad_norm": 1.8001176118850708, + "learning_rate": 4.612720523413193e-05, + "loss": 0.9023, + "step": 2905 + }, + { + "epoch": 0.9466493168510085, + "grad_norm": 1.534879446029663, + "learning_rate": 4.6113396167912925e-05, + "loss": 0.8981, + "step": 2910 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 1.270139455795288, + "learning_rate": 4.609956460095332e-05, + "loss": 0.8781, + "step": 2915 + }, + { + "epoch": 0.9499024072869225, + "grad_norm": 1.3285653591156006, + "learning_rate": 4.608571054799353e-05, + "loss": 0.9076, + "step": 2920 + }, + { + "epoch": 0.9515289525048797, + "grad_norm": 1.3763809204101562, + "learning_rate": 4.607183402379794e-05, + "loss": 0.8754, + "step": 2925 + }, + { + "epoch": 0.9531554977228367, + "grad_norm": 1.564855694770813, + "learning_rate": 4.605793504315491e-05, + "loss": 0.8969, + "step": 2930 + }, + { + "epoch": 0.9547820429407937, + "grad_norm": 1.3776639699935913, + "learning_rate": 4.6044013620876706e-05, + "loss": 0.8784, + "step": 2935 + }, + { + "epoch": 0.9564085881587508, + "grad_norm": 1.2027461528778076, + "learning_rate": 4.603006977179951e-05, + "loss": 0.8816, + "step": 2940 + }, + { + "epoch": 0.9580351333767079, + "grad_norm": 1.1574504375457764, + "learning_rate": 4.60161035107834e-05, + "loss": 0.8773, + "step": 2945 + }, + { + "epoch": 0.9596616785946649, + "grad_norm": 1.2480653524398804, + "learning_rate": 4.6002114852712374e-05, + "loss": 0.8967, + "step": 2950 + }, + { + "epoch": 0.961288223812622, + "grad_norm": 1.405286192893982, + "learning_rate": 4.598810381249425e-05, + "loss": 0.911, + "step": 2955 + }, + { + "epoch": 0.962914769030579, + "grad_norm": 1.166619062423706, + "learning_rate": 4.5974070405060746e-05, + "loss": 0.8689, + "step": 2960 + }, + { + "epoch": 0.9645413142485361, + "grad_norm": 1.383713722229004, + "learning_rate": 4.596001464536737e-05, + "loss": 0.887, + "step": 2965 + }, + { + "epoch": 0.9661678594664932, + "grad_norm": 1.1339458227157593, + "learning_rate": 4.5945936548393486e-05, + "loss": 0.8684, + "step": 2970 + }, + { + "epoch": 0.9677944046844502, + "grad_norm": 1.3403124809265137, + "learning_rate": 4.593183612914225e-05, + "loss": 0.8859, + "step": 2975 + }, + { + "epoch": 0.9694209499024072, + "grad_norm": 1.409609317779541, + "learning_rate": 4.591771340264061e-05, + "loss": 0.8816, + "step": 2980 + }, + { + "epoch": 0.9710474951203644, + "grad_norm": 1.3345836400985718, + "learning_rate": 4.5903568383939284e-05, + "loss": 0.9161, + "step": 2985 + }, + { + "epoch": 0.9726740403383214, + "grad_norm": 1.1995890140533447, + "learning_rate": 4.588940108811275e-05, + "loss": 0.8833, + "step": 2990 + }, + { + "epoch": 0.9743005855562785, + "grad_norm": 1.159071922302246, + "learning_rate": 4.587521153025922e-05, + "loss": 0.9093, + "step": 2995 + }, + { + "epoch": 0.9759271307742355, + "grad_norm": 1.2970279455184937, + "learning_rate": 4.5860999725500644e-05, + "loss": 0.8508, + "step": 3000 + }, + { + "epoch": 0.9775536759921926, + "grad_norm": 1.2380355596542358, + "learning_rate": 4.584676568898267e-05, + "loss": 0.8805, + "step": 3005 + }, + { + "epoch": 0.9791802212101497, + "grad_norm": 1.3525652885437012, + "learning_rate": 4.583250943587464e-05, + "loss": 0.8979, + "step": 3010 + }, + { + "epoch": 0.9808067664281067, + "grad_norm": 1.401249647140503, + "learning_rate": 4.5818230981369584e-05, + "loss": 0.8777, + "step": 3015 + }, + { + "epoch": 0.9824333116460637, + "grad_norm": 1.3878058195114136, + "learning_rate": 4.580393034068416e-05, + "loss": 0.8866, + "step": 3020 + }, + { + "epoch": 0.9840598568640209, + "grad_norm": 1.4664883613586426, + "learning_rate": 4.5789607529058715e-05, + "loss": 0.8579, + "step": 3025 + }, + { + "epoch": 0.9856864020819779, + "grad_norm": 1.3563498258590698, + "learning_rate": 4.5775262561757195e-05, + "loss": 0.8889, + "step": 3030 + }, + { + "epoch": 0.9873129472999349, + "grad_norm": 1.3692197799682617, + "learning_rate": 4.5760895454067154e-05, + "loss": 0.8786, + "step": 3035 + }, + { + "epoch": 0.988939492517892, + "grad_norm": 1.6675642728805542, + "learning_rate": 4.574650622129976e-05, + "loss": 0.9113, + "step": 3040 + }, + { + "epoch": 0.9905660377358491, + "grad_norm": 1.5963557958602905, + "learning_rate": 4.5732094878789756e-05, + "loss": 0.8864, + "step": 3045 + }, + { + "epoch": 0.9921925829538061, + "grad_norm": 1.2697241306304932, + "learning_rate": 4.571766144189543e-05, + "loss": 0.8809, + "step": 3050 + }, + { + "epoch": 0.9938191281717632, + "grad_norm": 1.428701400756836, + "learning_rate": 4.570320592599863e-05, + "loss": 0.9056, + "step": 3055 + }, + { + "epoch": 0.9954456733897202, + "grad_norm": 1.18083918094635, + "learning_rate": 4.568872834650474e-05, + "loss": 0.8827, + "step": 3060 + }, + { + "epoch": 0.9970722186076773, + "grad_norm": 1.6958439350128174, + "learning_rate": 4.567422871884265e-05, + "loss": 0.8937, + "step": 3065 + }, + { + "epoch": 0.9986987638256344, + "grad_norm": 1.2595447301864624, + "learning_rate": 4.565970705846474e-05, + "loss": 0.898, + "step": 3070 + }, + { + "epoch": 1.0, + "eval_f1": 0.8047922849548802, + "eval_loss": 0.432373046875, + "eval_precision": 0.8080985921865219, + "eval_recall": 0.8033079374726126, + "eval_runtime": 722.5023, + "eval_samples_per_second": 544.545, + "eval_steps_per_second": 1.064, + "step": 3074 + }, + { + "epoch": 1.0003253090435915, + "grad_norm": 1.2510595321655273, + "learning_rate": 4.564516338084688e-05, + "loss": 0.9067, + "step": 3075 + }, + { + "epoch": 1.0019518542615484, + "grad_norm": 1.3450714349746704, + "learning_rate": 4.5630597701488396e-05, + "loss": 0.8666, + "step": 3080 + }, + { + "epoch": 1.0035783994795056, + "grad_norm": 1.4504094123840332, + "learning_rate": 4.561601003591208e-05, + "loss": 0.8449, + "step": 3085 + }, + { + "epoch": 1.0052049446974627, + "grad_norm": 1.4622325897216797, + "learning_rate": 4.560140039966413e-05, + "loss": 0.8521, + "step": 3090 + }, + { + "epoch": 1.0068314899154196, + "grad_norm": 1.4779932498931885, + "learning_rate": 4.558676880831417e-05, + "loss": 0.8233, + "step": 3095 + }, + { + "epoch": 1.0084580351333767, + "grad_norm": 1.3692481517791748, + "learning_rate": 4.557211527745523e-05, + "loss": 0.7967, + "step": 3100 + }, + { + "epoch": 1.0100845803513339, + "grad_norm": 1.6611404418945312, + "learning_rate": 4.555743982270369e-05, + "loss": 0.8495, + "step": 3105 + }, + { + "epoch": 1.0117111255692908, + "grad_norm": 1.182586431503296, + "learning_rate": 4.554274245969936e-05, + "loss": 0.8484, + "step": 3110 + }, + { + "epoch": 1.013337670787248, + "grad_norm": 1.4083424806594849, + "learning_rate": 4.5528023204105306e-05, + "loss": 0.8341, + "step": 3115 + }, + { + "epoch": 1.014964216005205, + "grad_norm": 1.2453911304473877, + "learning_rate": 4.551328207160801e-05, + "loss": 0.8592, + "step": 3120 + }, + { + "epoch": 1.016590761223162, + "grad_norm": 1.269647240638733, + "learning_rate": 4.549851907791722e-05, + "loss": 0.8551, + "step": 3125 + }, + { + "epoch": 1.018217306441119, + "grad_norm": 1.3350937366485596, + "learning_rate": 4.548373423876598e-05, + "loss": 0.8302, + "step": 3130 + }, + { + "epoch": 1.0198438516590762, + "grad_norm": 1.9106508493423462, + "learning_rate": 4.5468927569910663e-05, + "loss": 0.7995, + "step": 3135 + }, + { + "epoch": 1.0214703968770331, + "grad_norm": 1.26970636844635, + "learning_rate": 4.545409908713084e-05, + "loss": 0.8413, + "step": 3140 + }, + { + "epoch": 1.0230969420949902, + "grad_norm": 1.2396373748779297, + "learning_rate": 4.5439248806229386e-05, + "loss": 0.8022, + "step": 3145 + }, + { + "epoch": 1.0247234873129474, + "grad_norm": 1.313094973564148, + "learning_rate": 4.542437674303236e-05, + "loss": 0.8852, + "step": 3150 + }, + { + "epoch": 1.0263500325309043, + "grad_norm": 1.5670477151870728, + "learning_rate": 4.5409482913389065e-05, + "loss": 0.8411, + "step": 3155 + }, + { + "epoch": 1.0279765777488614, + "grad_norm": 1.6376301050186157, + "learning_rate": 4.5394567333172e-05, + "loss": 0.8556, + "step": 3160 + }, + { + "epoch": 1.0296031229668186, + "grad_norm": 1.2869666814804077, + "learning_rate": 4.5379630018276834e-05, + "loss": 0.8506, + "step": 3165 + }, + { + "epoch": 1.0312296681847755, + "grad_norm": 1.4691636562347412, + "learning_rate": 4.5364670984622385e-05, + "loss": 0.8247, + "step": 3170 + }, + { + "epoch": 1.0328562134027326, + "grad_norm": 1.259082317352295, + "learning_rate": 4.534969024815066e-05, + "loss": 0.8207, + "step": 3175 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 1.3718903064727783, + "learning_rate": 4.533468782482674e-05, + "loss": 0.847, + "step": 3180 + }, + { + "epoch": 1.0361093038386466, + "grad_norm": 1.1277000904083252, + "learning_rate": 4.531966373063886e-05, + "loss": 0.8416, + "step": 3185 + }, + { + "epoch": 1.0377358490566038, + "grad_norm": 1.6853476762771606, + "learning_rate": 4.5304617981598334e-05, + "loss": 0.8592, + "step": 3190 + }, + { + "epoch": 1.039362394274561, + "grad_norm": 1.3323298692703247, + "learning_rate": 4.528955059373956e-05, + "loss": 0.8351, + "step": 3195 + }, + { + "epoch": 1.0409889394925178, + "grad_norm": 1.3430452346801758, + "learning_rate": 4.527446158311998e-05, + "loss": 0.8436, + "step": 3200 + }, + { + "epoch": 1.042615484710475, + "grad_norm": 1.4332150220870972, + "learning_rate": 4.52593509658201e-05, + "loss": 0.8386, + "step": 3205 + }, + { + "epoch": 1.044242029928432, + "grad_norm": 1.3050662279129028, + "learning_rate": 4.5244218757943444e-05, + "loss": 0.8197, + "step": 3210 + }, + { + "epoch": 1.045868575146389, + "grad_norm": 1.4190422296524048, + "learning_rate": 4.522906497561655e-05, + "loss": 0.846, + "step": 3215 + }, + { + "epoch": 1.047495120364346, + "grad_norm": 1.1908092498779297, + "learning_rate": 4.521388963498893e-05, + "loss": 0.8167, + "step": 3220 + }, + { + "epoch": 1.0491216655823032, + "grad_norm": 2.088581085205078, + "learning_rate": 4.519869275223309e-05, + "loss": 0.8361, + "step": 3225 + }, + { + "epoch": 1.0507482108002602, + "grad_norm": 1.436299204826355, + "learning_rate": 4.5183474343544496e-05, + "loss": 0.8373, + "step": 3230 + }, + { + "epoch": 1.0523747560182173, + "grad_norm": 1.2843883037567139, + "learning_rate": 4.516823442514153e-05, + "loss": 0.8516, + "step": 3235 + }, + { + "epoch": 1.0540013012361744, + "grad_norm": 1.4098544120788574, + "learning_rate": 4.5152973013265524e-05, + "loss": 0.8494, + "step": 3240 + }, + { + "epoch": 1.0556278464541313, + "grad_norm": 1.2439136505126953, + "learning_rate": 4.513769012418071e-05, + "loss": 0.8241, + "step": 3245 + }, + { + "epoch": 1.0572543916720885, + "grad_norm": 1.5683567523956299, + "learning_rate": 4.5122385774174194e-05, + "loss": 0.8581, + "step": 3250 + }, + { + "epoch": 1.0588809368900456, + "grad_norm": 1.2416630983352661, + "learning_rate": 4.510705997955596e-05, + "loss": 0.8529, + "step": 3255 + }, + { + "epoch": 1.0605074821080025, + "grad_norm": 1.3455045223236084, + "learning_rate": 4.5091712756658855e-05, + "loss": 0.8609, + "step": 3260 + }, + { + "epoch": 1.0621340273259596, + "grad_norm": 1.5861693620681763, + "learning_rate": 4.507634412183856e-05, + "loss": 0.8577, + "step": 3265 + }, + { + "epoch": 1.0637605725439168, + "grad_norm": 1.406770944595337, + "learning_rate": 4.506095409147356e-05, + "loss": 0.8336, + "step": 3270 + }, + { + "epoch": 1.065387117761874, + "grad_norm": 1.4342951774597168, + "learning_rate": 4.504554268196516e-05, + "loss": 0.8417, + "step": 3275 + }, + { + "epoch": 1.0670136629798308, + "grad_norm": 1.2808282375335693, + "learning_rate": 4.503010990973744e-05, + "loss": 0.8174, + "step": 3280 + }, + { + "epoch": 1.068640208197788, + "grad_norm": 1.5604490041732788, + "learning_rate": 4.5014655791237245e-05, + "loss": 0.8528, + "step": 3285 + }, + { + "epoch": 1.070266753415745, + "grad_norm": 1.4205149412155151, + "learning_rate": 4.499918034293416e-05, + "loss": 0.8441, + "step": 3290 + }, + { + "epoch": 1.071893298633702, + "grad_norm": 1.3421305418014526, + "learning_rate": 4.498368358132055e-05, + "loss": 0.849, + "step": 3295 + }, + { + "epoch": 1.073519843851659, + "grad_norm": 1.5593198537826538, + "learning_rate": 4.496816552291141e-05, + "loss": 0.8556, + "step": 3300 + }, + { + "epoch": 1.0751463890696162, + "grad_norm": 1.2776402235031128, + "learning_rate": 4.4952626184244504e-05, + "loss": 0.8428, + "step": 3305 + }, + { + "epoch": 1.0767729342875731, + "grad_norm": 1.3097907304763794, + "learning_rate": 4.4937065581880245e-05, + "loss": 0.8163, + "step": 3310 + }, + { + "epoch": 1.0783994795055303, + "grad_norm": 1.4089592695236206, + "learning_rate": 4.492148373240171e-05, + "loss": 0.8684, + "step": 3315 + }, + { + "epoch": 1.0800260247234874, + "grad_norm": 1.6047899723052979, + "learning_rate": 4.490588065241461e-05, + "loss": 0.8211, + "step": 3320 + }, + { + "epoch": 1.0816525699414443, + "grad_norm": 1.47470223903656, + "learning_rate": 4.4890256358547304e-05, + "loss": 0.8548, + "step": 3325 + }, + { + "epoch": 1.0832791151594015, + "grad_norm": 1.3604538440704346, + "learning_rate": 4.487461086745074e-05, + "loss": 0.801, + "step": 3330 + }, + { + "epoch": 1.0849056603773586, + "grad_norm": 1.7443616390228271, + "learning_rate": 4.485894419579846e-05, + "loss": 0.8438, + "step": 3335 + }, + { + "epoch": 1.0865322055953155, + "grad_norm": 1.3506377935409546, + "learning_rate": 4.484325636028659e-05, + "loss": 0.8395, + "step": 3340 + }, + { + "epoch": 1.0881587508132726, + "grad_norm": 1.0828458070755005, + "learning_rate": 4.482754737763378e-05, + "loss": 0.8419, + "step": 3345 + }, + { + "epoch": 1.0897852960312298, + "grad_norm": 1.3996878862380981, + "learning_rate": 4.4811817264581254e-05, + "loss": 0.8344, + "step": 3350 + }, + { + "epoch": 1.0914118412491867, + "grad_norm": 1.5588873624801636, + "learning_rate": 4.4796066037892734e-05, + "loss": 0.8371, + "step": 3355 + }, + { + "epoch": 1.0930383864671438, + "grad_norm": 1.4824800491333008, + "learning_rate": 4.478029371435446e-05, + "loss": 0.8414, + "step": 3360 + }, + { + "epoch": 1.094664931685101, + "grad_norm": 1.3720481395721436, + "learning_rate": 4.4764500310775116e-05, + "loss": 0.8583, + "step": 3365 + }, + { + "epoch": 1.0962914769030578, + "grad_norm": 1.8168917894363403, + "learning_rate": 4.47486858439859e-05, + "loss": 0.8103, + "step": 3370 + }, + { + "epoch": 1.097918022121015, + "grad_norm": 1.4291276931762695, + "learning_rate": 4.473285033084043e-05, + "loss": 0.8293, + "step": 3375 + }, + { + "epoch": 1.099544567338972, + "grad_norm": 1.2566916942596436, + "learning_rate": 4.471699378821474e-05, + "loss": 0.8407, + "step": 3380 + }, + { + "epoch": 1.101171112556929, + "grad_norm": 1.4391167163848877, + "learning_rate": 4.4701116233007314e-05, + "loss": 0.8256, + "step": 3385 + }, + { + "epoch": 1.1027976577748861, + "grad_norm": 1.6327389478683472, + "learning_rate": 4.4685217682139e-05, + "loss": 0.8565, + "step": 3390 + }, + { + "epoch": 1.1044242029928433, + "grad_norm": 1.9258527755737305, + "learning_rate": 4.466929815255304e-05, + "loss": 0.8314, + "step": 3395 + }, + { + "epoch": 1.1060507482108002, + "grad_norm": 1.4537739753723145, + "learning_rate": 4.465335766121501e-05, + "loss": 0.8256, + "step": 3400 + }, + { + "epoch": 1.1076772934287573, + "grad_norm": 1.3447164297103882, + "learning_rate": 4.4637396225112846e-05, + "loss": 0.8109, + "step": 3405 + }, + { + "epoch": 1.1093038386467144, + "grad_norm": 1.4910389184951782, + "learning_rate": 4.46214138612568e-05, + "loss": 0.8639, + "step": 3410 + }, + { + "epoch": 1.1109303838646714, + "grad_norm": 3.749124050140381, + "learning_rate": 4.460541058667942e-05, + "loss": 0.8354, + "step": 3415 + }, + { + "epoch": 1.1125569290826285, + "grad_norm": 1.4633839130401611, + "learning_rate": 4.4589386418435535e-05, + "loss": 0.8127, + "step": 3420 + }, + { + "epoch": 1.1141834743005856, + "grad_norm": 1.5497817993164062, + "learning_rate": 4.457334137360226e-05, + "loss": 0.8595, + "step": 3425 + }, + { + "epoch": 1.1158100195185425, + "grad_norm": 1.56538987159729, + "learning_rate": 4.4557275469278946e-05, + "loss": 0.8463, + "step": 3430 + }, + { + "epoch": 1.1174365647364997, + "grad_norm": 1.3224478960037231, + "learning_rate": 4.4541188722587165e-05, + "loss": 0.8667, + "step": 3435 + }, + { + "epoch": 1.1190631099544568, + "grad_norm": 1.2080453634262085, + "learning_rate": 4.452508115067073e-05, + "loss": 0.8666, + "step": 3440 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 1.3278350830078125, + "learning_rate": 4.450895277069561e-05, + "loss": 0.84, + "step": 3445 + }, + { + "epoch": 1.1223162003903708, + "grad_norm": 1.586470365524292, + "learning_rate": 4.449280359984997e-05, + "loss": 0.8461, + "step": 3450 + }, + { + "epoch": 1.123942745608328, + "grad_norm": 1.4312207698822021, + "learning_rate": 4.4476633655344144e-05, + "loss": 0.8463, + "step": 3455 + }, + { + "epoch": 1.1255692908262849, + "grad_norm": 1.8289159536361694, + "learning_rate": 4.446044295441058e-05, + "loss": 0.8556, + "step": 3460 + }, + { + "epoch": 1.127195836044242, + "grad_norm": 1.4957107305526733, + "learning_rate": 4.444423151430386e-05, + "loss": 0.8168, + "step": 3465 + }, + { + "epoch": 1.1288223812621991, + "grad_norm": 1.100389003753662, + "learning_rate": 4.442799935230066e-05, + "loss": 0.8632, + "step": 3470 + }, + { + "epoch": 1.130448926480156, + "grad_norm": 1.3238141536712646, + "learning_rate": 4.4411746485699744e-05, + "loss": 0.8539, + "step": 3475 + }, + { + "epoch": 1.1320754716981132, + "grad_norm": 1.3629249334335327, + "learning_rate": 4.439547293182195e-05, + "loss": 0.8377, + "step": 3480 + }, + { + "epoch": 1.1337020169160703, + "grad_norm": 1.3217734098434448, + "learning_rate": 4.437917870801015e-05, + "loss": 0.816, + "step": 3485 + }, + { + "epoch": 1.1353285621340272, + "grad_norm": 1.5412160158157349, + "learning_rate": 4.4362863831629254e-05, + "loss": 0.8054, + "step": 3490 + }, + { + "epoch": 1.1369551073519844, + "grad_norm": 1.4753262996673584, + "learning_rate": 4.434652832006616e-05, + "loss": 0.8744, + "step": 3495 + }, + { + "epoch": 1.1385816525699415, + "grad_norm": 2.2404487133026123, + "learning_rate": 4.433017219072979e-05, + "loss": 0.8323, + "step": 3500 + }, + { + "epoch": 1.1402081977878984, + "grad_norm": 1.3693082332611084, + "learning_rate": 4.431379546105101e-05, + "loss": 0.8292, + "step": 3505 + }, + { + "epoch": 1.1418347430058555, + "grad_norm": 1.5645606517791748, + "learning_rate": 4.4297398148482655e-05, + "loss": 0.8377, + "step": 3510 + }, + { + "epoch": 1.1434612882238127, + "grad_norm": 1.3587448596954346, + "learning_rate": 4.4280980270499494e-05, + "loss": 0.8229, + "step": 3515 + }, + { + "epoch": 1.1450878334417696, + "grad_norm": 1.3877063989639282, + "learning_rate": 4.426454184459821e-05, + "loss": 0.8611, + "step": 3520 + }, + { + "epoch": 1.1467143786597267, + "grad_norm": 1.9232988357543945, + "learning_rate": 4.424808288829739e-05, + "loss": 0.8351, + "step": 3525 + }, + { + "epoch": 1.1483409238776838, + "grad_norm": 1.283495306968689, + "learning_rate": 4.423160341913748e-05, + "loss": 0.8125, + "step": 3530 + }, + { + "epoch": 1.1499674690956407, + "grad_norm": 1.4220075607299805, + "learning_rate": 4.421510345468082e-05, + "loss": 0.8469, + "step": 3535 + }, + { + "epoch": 1.1515940143135979, + "grad_norm": 1.358188509941101, + "learning_rate": 4.419858301251156e-05, + "loss": 0.8236, + "step": 3540 + }, + { + "epoch": 1.153220559531555, + "grad_norm": 1.2543272972106934, + "learning_rate": 4.4182042110235686e-05, + "loss": 0.8585, + "step": 3545 + }, + { + "epoch": 1.1548471047495121, + "grad_norm": 1.2889257669448853, + "learning_rate": 4.4165480765481016e-05, + "loss": 0.8334, + "step": 3550 + }, + { + "epoch": 1.156473649967469, + "grad_norm": 1.186691403388977, + "learning_rate": 4.414889899589709e-05, + "loss": 0.8158, + "step": 3555 + }, + { + "epoch": 1.1581001951854262, + "grad_norm": 1.4506323337554932, + "learning_rate": 4.4132296819155284e-05, + "loss": 0.8226, + "step": 3560 + }, + { + "epoch": 1.1597267404033833, + "grad_norm": 1.4542236328125, + "learning_rate": 4.411567425294867e-05, + "loss": 0.8328, + "step": 3565 + }, + { + "epoch": 1.1613532856213402, + "grad_norm": 1.5659462213516235, + "learning_rate": 4.4099031314992075e-05, + "loss": 0.8396, + "step": 3570 + }, + { + "epoch": 1.1629798308392973, + "grad_norm": 1.2975887060165405, + "learning_rate": 4.408236802302203e-05, + "loss": 0.8391, + "step": 3575 + }, + { + "epoch": 1.1646063760572545, + "grad_norm": 1.3343780040740967, + "learning_rate": 4.406568439479677e-05, + "loss": 0.8203, + "step": 3580 + }, + { + "epoch": 1.1662329212752114, + "grad_norm": 1.4639620780944824, + "learning_rate": 4.404898044809618e-05, + "loss": 0.8509, + "step": 3585 + }, + { + "epoch": 1.1678594664931685, + "grad_norm": 1.3307031393051147, + "learning_rate": 4.403225620072182e-05, + "loss": 0.8202, + "step": 3590 + }, + { + "epoch": 1.1694860117111257, + "grad_norm": 1.3971885442733765, + "learning_rate": 4.401551167049686e-05, + "loss": 0.8079, + "step": 3595 + }, + { + "epoch": 1.1711125569290826, + "grad_norm": 1.3948359489440918, + "learning_rate": 4.39987468752661e-05, + "loss": 0.8564, + "step": 3600 + }, + { + "epoch": 1.1727391021470397, + "grad_norm": 1.3139369487762451, + "learning_rate": 4.398196183289595e-05, + "loss": 0.8237, + "step": 3605 + }, + { + "epoch": 1.1743656473649968, + "grad_norm": 1.2766032218933105, + "learning_rate": 4.396515656127437e-05, + "loss": 0.8372, + "step": 3610 + }, + { + "epoch": 1.1759921925829537, + "grad_norm": 1.4495149850845337, + "learning_rate": 4.394833107831091e-05, + "loss": 0.8563, + "step": 3615 + }, + { + "epoch": 1.1776187378009109, + "grad_norm": 1.2466148138046265, + "learning_rate": 4.393148540193663e-05, + "loss": 0.8427, + "step": 3620 + }, + { + "epoch": 1.179245283018868, + "grad_norm": 1.3020066022872925, + "learning_rate": 4.3914619550104125e-05, + "loss": 0.827, + "step": 3625 + }, + { + "epoch": 1.180871828236825, + "grad_norm": 1.6559127569198608, + "learning_rate": 4.389773354078749e-05, + "loss": 0.8511, + "step": 3630 + }, + { + "epoch": 1.182498373454782, + "grad_norm": 1.5298069715499878, + "learning_rate": 4.388082739198229e-05, + "loss": 0.832, + "step": 3635 + }, + { + "epoch": 1.1841249186727392, + "grad_norm": 1.4255952835083008, + "learning_rate": 4.386390112170558e-05, + "loss": 0.8397, + "step": 3640 + }, + { + "epoch": 1.185751463890696, + "grad_norm": 1.6214672327041626, + "learning_rate": 4.3846954747995825e-05, + "loss": 0.8624, + "step": 3645 + }, + { + "epoch": 1.1873780091086532, + "grad_norm": 1.3479894399642944, + "learning_rate": 4.382998828891295e-05, + "loss": 0.8316, + "step": 3650 + }, + { + "epoch": 1.1890045543266103, + "grad_norm": 1.3111231327056885, + "learning_rate": 4.381300176253825e-05, + "loss": 0.8317, + "step": 3655 + }, + { + "epoch": 1.1906310995445673, + "grad_norm": 1.4689635038375854, + "learning_rate": 4.379599518697444e-05, + "loss": 0.8333, + "step": 3660 + }, + { + "epoch": 1.1922576447625244, + "grad_norm": 1.4377338886260986, + "learning_rate": 4.377896858034557e-05, + "loss": 0.8883, + "step": 3665 + }, + { + "epoch": 1.1938841899804815, + "grad_norm": 1.1751782894134521, + "learning_rate": 4.376192196079705e-05, + "loss": 0.8125, + "step": 3670 + }, + { + "epoch": 1.1955107351984384, + "grad_norm": 1.3648537397384644, + "learning_rate": 4.374485534649562e-05, + "loss": 0.8555, + "step": 3675 + }, + { + "epoch": 1.1971372804163956, + "grad_norm": 1.4132126569747925, + "learning_rate": 4.372776875562934e-05, + "loss": 0.8604, + "step": 3680 + }, + { + "epoch": 1.1987638256343527, + "grad_norm": 1.4481204748153687, + "learning_rate": 4.371066220640754e-05, + "loss": 0.831, + "step": 3685 + }, + { + "epoch": 1.2003903708523098, + "grad_norm": 1.6251473426818848, + "learning_rate": 4.369353571706082e-05, + "loss": 0.8355, + "step": 3690 + }, + { + "epoch": 1.2020169160702667, + "grad_norm": 1.4322580099105835, + "learning_rate": 4.367638930584105e-05, + "loss": 0.8265, + "step": 3695 + }, + { + "epoch": 1.2036434612882239, + "grad_norm": 1.3358713388442993, + "learning_rate": 4.365922299102131e-05, + "loss": 0.8126, + "step": 3700 + }, + { + "epoch": 1.205270006506181, + "grad_norm": 1.240532398223877, + "learning_rate": 4.36420367908959e-05, + "loss": 0.8367, + "step": 3705 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.4430460929870605, + "learning_rate": 4.3624830723780314e-05, + "loss": 0.8471, + "step": 3710 + }, + { + "epoch": 1.208523096942095, + "grad_norm": 1.4554240703582764, + "learning_rate": 4.3607604808011213e-05, + "loss": 0.8261, + "step": 3715 + }, + { + "epoch": 1.2101496421600522, + "grad_norm": 1.2977465391159058, + "learning_rate": 4.3590359061946416e-05, + "loss": 0.8083, + "step": 3720 + }, + { + "epoch": 1.211776187378009, + "grad_norm": 1.417873501777649, + "learning_rate": 4.357309350396488e-05, + "loss": 0.8542, + "step": 3725 + }, + { + "epoch": 1.2134027325959662, + "grad_norm": 1.3270829916000366, + "learning_rate": 4.355580815246666e-05, + "loss": 0.821, + "step": 3730 + }, + { + "epoch": 1.2150292778139233, + "grad_norm": 1.4509855508804321, + "learning_rate": 4.353850302587291e-05, + "loss": 0.8362, + "step": 3735 + }, + { + "epoch": 1.2166558230318802, + "grad_norm": 1.5028307437896729, + "learning_rate": 4.352117814262587e-05, + "loss": 0.8406, + "step": 3740 + }, + { + "epoch": 1.2182823682498374, + "grad_norm": 1.2940112352371216, + "learning_rate": 4.3503833521188844e-05, + "loss": 0.8227, + "step": 3745 + }, + { + "epoch": 1.2199089134677945, + "grad_norm": 1.4184249639511108, + "learning_rate": 4.3486469180046116e-05, + "loss": 0.8254, + "step": 3750 + }, + { + "epoch": 1.2215354586857514, + "grad_norm": 1.2665643692016602, + "learning_rate": 4.346908513770306e-05, + "loss": 0.8422, + "step": 3755 + }, + { + "epoch": 1.2231620039037086, + "grad_norm": 1.7031304836273193, + "learning_rate": 4.345168141268599e-05, + "loss": 0.8424, + "step": 3760 + }, + { + "epoch": 1.2247885491216657, + "grad_norm": 1.304803729057312, + "learning_rate": 4.343425802354222e-05, + "loss": 0.838, + "step": 3765 + }, + { + "epoch": 1.2264150943396226, + "grad_norm": 1.6106655597686768, + "learning_rate": 4.3416814988840024e-05, + "loss": 0.824, + "step": 3770 + }, + { + "epoch": 1.2280416395575797, + "grad_norm": 1.3643959760665894, + "learning_rate": 4.3399352327168595e-05, + "loss": 0.8159, + "step": 3775 + }, + { + "epoch": 1.2296681847755369, + "grad_norm": 1.7104637622833252, + "learning_rate": 4.3381870057138054e-05, + "loss": 0.8634, + "step": 3780 + }, + { + "epoch": 1.2312947299934938, + "grad_norm": 1.4220479726791382, + "learning_rate": 4.3364368197379426e-05, + "loss": 0.8551, + "step": 3785 + }, + { + "epoch": 1.232921275211451, + "grad_norm": 1.3457602262496948, + "learning_rate": 4.334684676654459e-05, + "loss": 0.8418, + "step": 3790 + }, + { + "epoch": 1.234547820429408, + "grad_norm": 1.1884374618530273, + "learning_rate": 4.33293057833063e-05, + "loss": 0.864, + "step": 3795 + }, + { + "epoch": 1.236174365647365, + "grad_norm": 1.3422157764434814, + "learning_rate": 4.331174526635815e-05, + "loss": 0.8573, + "step": 3800 + }, + { + "epoch": 1.237800910865322, + "grad_norm": 1.1692341566085815, + "learning_rate": 4.329416523441454e-05, + "loss": 0.8425, + "step": 3805 + }, + { + "epoch": 1.2394274560832792, + "grad_norm": 1.550016164779663, + "learning_rate": 4.327656570621067e-05, + "loss": 0.8258, + "step": 3810 + }, + { + "epoch": 1.241054001301236, + "grad_norm": 1.2474884986877441, + "learning_rate": 4.3258946700502535e-05, + "loss": 0.8311, + "step": 3815 + }, + { + "epoch": 1.2426805465191932, + "grad_norm": 1.481498122215271, + "learning_rate": 4.3241308236066846e-05, + "loss": 0.8493, + "step": 3820 + }, + { + "epoch": 1.2443070917371504, + "grad_norm": 1.439204454421997, + "learning_rate": 4.322365033170109e-05, + "loss": 0.8542, + "step": 3825 + }, + { + "epoch": 1.2459336369551073, + "grad_norm": 1.5375816822052002, + "learning_rate": 4.320597300622346e-05, + "loss": 0.8242, + "step": 3830 + }, + { + "epoch": 1.2475601821730644, + "grad_norm": 1.4234498739242554, + "learning_rate": 4.318827627847284e-05, + "loss": 0.8014, + "step": 3835 + }, + { + "epoch": 1.2491867273910215, + "grad_norm": 1.540426254272461, + "learning_rate": 4.31705601673088e-05, + "loss": 0.802, + "step": 3840 + }, + { + "epoch": 1.2508132726089785, + "grad_norm": 1.3604222536087036, + "learning_rate": 4.315282469161156e-05, + "loss": 0.8565, + "step": 3845 + }, + { + "epoch": 1.2524398178269356, + "grad_norm": 1.2253650426864624, + "learning_rate": 4.313506987028198e-05, + "loss": 0.8346, + "step": 3850 + }, + { + "epoch": 1.2540663630448927, + "grad_norm": 1.386781096458435, + "learning_rate": 4.311729572224153e-05, + "loss": 0.8592, + "step": 3855 + }, + { + "epoch": 1.2556929082628496, + "grad_norm": 1.7717795372009277, + "learning_rate": 4.309950226643229e-05, + "loss": 0.8513, + "step": 3860 + }, + { + "epoch": 1.2573194534808068, + "grad_norm": 1.3498862981796265, + "learning_rate": 4.308168952181691e-05, + "loss": 0.829, + "step": 3865 + }, + { + "epoch": 1.258945998698764, + "grad_norm": 1.342957854270935, + "learning_rate": 4.30638575073786e-05, + "loss": 0.8687, + "step": 3870 + }, + { + "epoch": 1.2605725439167208, + "grad_norm": 1.280372977256775, + "learning_rate": 4.304600624212109e-05, + "loss": 0.8402, + "step": 3875 + }, + { + "epoch": 1.262199089134678, + "grad_norm": 1.4032455682754517, + "learning_rate": 4.302813574506866e-05, + "loss": 0.8486, + "step": 3880 + }, + { + "epoch": 1.263825634352635, + "grad_norm": 1.268967866897583, + "learning_rate": 4.301024603526603e-05, + "loss": 0.8215, + "step": 3885 + }, + { + "epoch": 1.265452179570592, + "grad_norm": 1.5919753313064575, + "learning_rate": 4.299233713177845e-05, + "loss": 0.8618, + "step": 3890 + }, + { + "epoch": 1.267078724788549, + "grad_norm": 1.1171088218688965, + "learning_rate": 4.297440905369161e-05, + "loss": 0.8565, + "step": 3895 + }, + { + "epoch": 1.2687052700065062, + "grad_norm": 2.0121335983276367, + "learning_rate": 4.2956461820111605e-05, + "loss": 0.8432, + "step": 3900 + }, + { + "epoch": 1.2703318152244631, + "grad_norm": 1.4732331037521362, + "learning_rate": 4.2938495450164986e-05, + "loss": 0.8659, + "step": 3905 + }, + { + "epoch": 1.2719583604424203, + "grad_norm": 1.3948605060577393, + "learning_rate": 4.2920509962998664e-05, + "loss": 0.8377, + "step": 3910 + }, + { + "epoch": 1.2735849056603774, + "grad_norm": 1.3198820352554321, + "learning_rate": 4.290250537777996e-05, + "loss": 0.8314, + "step": 3915 + }, + { + "epoch": 1.2752114508783343, + "grad_norm": 1.6067476272583008, + "learning_rate": 4.28844817136965e-05, + "loss": 0.8538, + "step": 3920 + }, + { + "epoch": 1.2768379960962914, + "grad_norm": 1.5238882303237915, + "learning_rate": 4.286643898995628e-05, + "loss": 0.8483, + "step": 3925 + }, + { + "epoch": 1.2784645413142486, + "grad_norm": 1.4347761869430542, + "learning_rate": 4.284837722578761e-05, + "loss": 0.8294, + "step": 3930 + }, + { + "epoch": 1.2800910865322055, + "grad_norm": 1.2087209224700928, + "learning_rate": 4.283029644043906e-05, + "loss": 0.8274, + "step": 3935 + }, + { + "epoch": 1.2817176317501626, + "grad_norm": 1.5129261016845703, + "learning_rate": 4.281219665317949e-05, + "loss": 0.8447, + "step": 3940 + }, + { + "epoch": 1.2833441769681198, + "grad_norm": 1.491698145866394, + "learning_rate": 4.2794077883298014e-05, + "loss": 0.8358, + "step": 3945 + }, + { + "epoch": 1.2849707221860767, + "grad_norm": 1.28629469871521, + "learning_rate": 4.277594015010398e-05, + "loss": 0.8308, + "step": 3950 + }, + { + "epoch": 1.2865972674040338, + "grad_norm": 1.4009138345718384, + "learning_rate": 4.275778347292693e-05, + "loss": 0.8018, + "step": 3955 + }, + { + "epoch": 1.288223812621991, + "grad_norm": 1.2688696384429932, + "learning_rate": 4.27396078711166e-05, + "loss": 0.8197, + "step": 3960 + }, + { + "epoch": 1.2898503578399478, + "grad_norm": 1.3794180154800415, + "learning_rate": 4.272141336404289e-05, + "loss": 0.8353, + "step": 3965 + }, + { + "epoch": 1.291476903057905, + "grad_norm": 1.2758440971374512, + "learning_rate": 4.2703199971095876e-05, + "loss": 0.8545, + "step": 3970 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 1.3469489812850952, + "learning_rate": 4.268496771168572e-05, + "loss": 0.8113, + "step": 3975 + }, + { + "epoch": 1.294729993493819, + "grad_norm": 1.6951850652694702, + "learning_rate": 4.2666716605242717e-05, + "loss": 0.846, + "step": 3980 + }, + { + "epoch": 1.2963565387117761, + "grad_norm": 1.342667818069458, + "learning_rate": 4.264844667121723e-05, + "loss": 0.8295, + "step": 3985 + }, + { + "epoch": 1.2979830839297333, + "grad_norm": 1.1020427942276, + "learning_rate": 4.263015792907971e-05, + "loss": 0.8237, + "step": 3990 + }, + { + "epoch": 1.2996096291476902, + "grad_norm": 1.1668118238449097, + "learning_rate": 4.261185039832061e-05, + "loss": 0.8631, + "step": 3995 + }, + { + "epoch": 1.3012361743656473, + "grad_norm": 1.2531483173370361, + "learning_rate": 4.259352409845047e-05, + "loss": 0.855, + "step": 4000 + }, + { + "epoch": 1.3028627195836044, + "grad_norm": 1.3954122066497803, + "learning_rate": 4.2575179048999766e-05, + "loss": 0.8268, + "step": 4005 + }, + { + "epoch": 1.3044892648015614, + "grad_norm": 1.562114953994751, + "learning_rate": 4.2560489522878477e-05, + "loss": 0.8083, + "step": 4010 + }, + { + "epoch": 1.3061158100195185, + "grad_norm": 1.6567217111587524, + "learning_rate": 4.254211077346343e-05, + "loss": 0.8235, + "step": 4015 + }, + { + "epoch": 1.3077423552374756, + "grad_norm": 1.5267730951309204, + "learning_rate": 4.2523713329259484e-05, + "loss": 0.8339, + "step": 4020 + }, + { + "epoch": 1.3093689004554325, + "grad_norm": 1.3425706624984741, + "learning_rate": 4.250529720987297e-05, + "loss": 0.8294, + "step": 4025 + }, + { + "epoch": 1.3109954456733897, + "grad_norm": 1.2095836400985718, + "learning_rate": 4.248686243493012e-05, + "loss": 0.8147, + "step": 4030 + }, + { + "epoch": 1.3126219908913468, + "grad_norm": 1.4347078800201416, + "learning_rate": 4.2468409024077026e-05, + "loss": 0.8352, + "step": 4035 + }, + { + "epoch": 1.3142485361093037, + "grad_norm": 1.145184874534607, + "learning_rate": 4.244993699697968e-05, + "loss": 0.818, + "step": 4040 + }, + { + "epoch": 1.3158750813272608, + "grad_norm": 1.2478063106536865, + "learning_rate": 4.243144637332387e-05, + "loss": 0.8298, + "step": 4045 + }, + { + "epoch": 1.317501626545218, + "grad_norm": 1.221039056777954, + "learning_rate": 4.241293717281522e-05, + "loss": 0.8466, + "step": 4050 + }, + { + "epoch": 1.319128171763175, + "grad_norm": 1.2457643747329712, + "learning_rate": 4.239440941517919e-05, + "loss": 0.817, + "step": 4055 + }, + { + "epoch": 1.320754716981132, + "grad_norm": 1.4972866773605347, + "learning_rate": 4.2375863120160955e-05, + "loss": 0.8471, + "step": 4060 + }, + { + "epoch": 1.3223812621990891, + "grad_norm": 1.7402946949005127, + "learning_rate": 4.2357298307525465e-05, + "loss": 0.8451, + "step": 4065 + }, + { + "epoch": 1.3240078074170463, + "grad_norm": 1.2478011846542358, + "learning_rate": 4.233871499705743e-05, + "loss": 0.8231, + "step": 4070 + }, + { + "epoch": 1.3256343526350032, + "grad_norm": 1.3042181730270386, + "learning_rate": 4.2320113208561254e-05, + "loss": 0.8376, + "step": 4075 + }, + { + "epoch": 1.3272608978529603, + "grad_norm": 1.3558564186096191, + "learning_rate": 4.230149296186102e-05, + "loss": 0.8207, + "step": 4080 + }, + { + "epoch": 1.3288874430709174, + "grad_norm": 1.2882791757583618, + "learning_rate": 4.228285427680052e-05, + "loss": 0.8467, + "step": 4085 + }, + { + "epoch": 1.3305139882888743, + "grad_norm": 1.440674066543579, + "learning_rate": 4.226419717324315e-05, + "loss": 0.8125, + "step": 4090 + }, + { + "epoch": 1.3321405335068315, + "grad_norm": 1.3060364723205566, + "learning_rate": 4.2245521671071954e-05, + "loss": 0.8367, + "step": 4095 + }, + { + "epoch": 1.3337670787247886, + "grad_norm": 1.3582439422607422, + "learning_rate": 4.2226827790189604e-05, + "loss": 0.8479, + "step": 4100 + }, + { + "epoch": 1.3353936239427457, + "grad_norm": 1.2926558256149292, + "learning_rate": 4.220811555051834e-05, + "loss": 0.8255, + "step": 4105 + }, + { + "epoch": 1.3370201691607027, + "grad_norm": 1.4806886911392212, + "learning_rate": 4.2189384971999956e-05, + "loss": 0.838, + "step": 4110 + }, + { + "epoch": 1.3386467143786598, + "grad_norm": 1.191068172454834, + "learning_rate": 4.217063607459581e-05, + "loss": 0.8117, + "step": 4115 + }, + { + "epoch": 1.340273259596617, + "grad_norm": 1.5894582271575928, + "learning_rate": 4.2151868878286774e-05, + "loss": 0.8839, + "step": 4120 + }, + { + "epoch": 1.3418998048145738, + "grad_norm": 1.975527286529541, + "learning_rate": 4.2133083403073217e-05, + "loss": 0.8229, + "step": 4125 + }, + { + "epoch": 1.343526350032531, + "grad_norm": 1.3623355627059937, + "learning_rate": 4.2114279668975e-05, + "loss": 0.841, + "step": 4130 + }, + { + "epoch": 1.345152895250488, + "grad_norm": 1.3781988620758057, + "learning_rate": 4.209545769603143e-05, + "loss": 0.8388, + "step": 4135 + }, + { + "epoch": 1.346779440468445, + "grad_norm": 1.4330931901931763, + "learning_rate": 4.2076617504301254e-05, + "loss": 0.845, + "step": 4140 + }, + { + "epoch": 1.3484059856864021, + "grad_norm": 1.555031418800354, + "learning_rate": 4.2057759113862645e-05, + "loss": 0.825, + "step": 4145 + }, + { + "epoch": 1.3500325309043593, + "grad_norm": 1.5022776126861572, + "learning_rate": 4.2038882544813156e-05, + "loss": 0.8394, + "step": 4150 + }, + { + "epoch": 1.3516590761223162, + "grad_norm": 1.2136030197143555, + "learning_rate": 4.2019987817269726e-05, + "loss": 0.8187, + "step": 4155 + }, + { + "epoch": 1.3532856213402733, + "grad_norm": 1.4881867170333862, + "learning_rate": 4.2001074951368645e-05, + "loss": 0.8369, + "step": 4160 + }, + { + "epoch": 1.3549121665582304, + "grad_norm": 1.3224900960922241, + "learning_rate": 4.198214396726552e-05, + "loss": 0.8523, + "step": 4165 + }, + { + "epoch": 1.3565387117761873, + "grad_norm": 1.3465425968170166, + "learning_rate": 4.196319488513527e-05, + "loss": 0.8621, + "step": 4170 + }, + { + "epoch": 1.3581652569941445, + "grad_norm": 1.471650242805481, + "learning_rate": 4.1944227725172124e-05, + "loss": 0.8576, + "step": 4175 + }, + { + "epoch": 1.3597918022121016, + "grad_norm": 1.381115436553955, + "learning_rate": 4.192524250758953e-05, + "loss": 0.8329, + "step": 4180 + }, + { + "epoch": 1.3614183474300585, + "grad_norm": 1.289157748222351, + "learning_rate": 4.190623925262025e-05, + "loss": 0.821, + "step": 4185 + }, + { + "epoch": 1.3630448926480156, + "grad_norm": 1.4478178024291992, + "learning_rate": 4.188721798051619e-05, + "loss": 0.8257, + "step": 4190 + }, + { + "epoch": 1.3646714378659728, + "grad_norm": 1.5857568979263306, + "learning_rate": 4.186817871154851e-05, + "loss": 0.8496, + "step": 4195 + }, + { + "epoch": 1.3662979830839297, + "grad_norm": 1.2931289672851562, + "learning_rate": 4.184912146600754e-05, + "loss": 0.8449, + "step": 4200 + }, + { + "epoch": 1.3679245283018868, + "grad_norm": 1.4624470472335815, + "learning_rate": 4.1830046264202746e-05, + "loss": 0.8499, + "step": 4205 + }, + { + "epoch": 1.369551073519844, + "grad_norm": 1.3153526782989502, + "learning_rate": 4.181095312646277e-05, + "loss": 0.8533, + "step": 4210 + }, + { + "epoch": 1.3711776187378009, + "grad_norm": 1.1572147607803345, + "learning_rate": 4.179184207313532e-05, + "loss": 0.8327, + "step": 4215 + }, + { + "epoch": 1.372804163955758, + "grad_norm": 1.4946880340576172, + "learning_rate": 4.177271312458724e-05, + "loss": 0.8176, + "step": 4220 + }, + { + "epoch": 1.3744307091737151, + "grad_norm": 1.7683255672454834, + "learning_rate": 4.1753566301204414e-05, + "loss": 0.8026, + "step": 4225 + }, + { + "epoch": 1.376057254391672, + "grad_norm": 1.1509405374526978, + "learning_rate": 4.173440162339179e-05, + "loss": 0.8436, + "step": 4230 + }, + { + "epoch": 1.3776837996096292, + "grad_norm": 1.3632792234420776, + "learning_rate": 4.1715219111573343e-05, + "loss": 0.8262, + "step": 4235 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 1.5997217893600464, + "learning_rate": 4.169601878619206e-05, + "loss": 0.8229, + "step": 4240 + }, + { + "epoch": 1.3809368900455432, + "grad_norm": 1.7572593688964844, + "learning_rate": 4.167680066770989e-05, + "loss": 0.8239, + "step": 4245 + }, + { + "epoch": 1.3825634352635003, + "grad_norm": 1.3744713068008423, + "learning_rate": 4.165756477660777e-05, + "loss": 0.8492, + "step": 4250 + }, + { + "epoch": 1.3841899804814575, + "grad_norm": 1.2562565803527832, + "learning_rate": 4.1638311133385566e-05, + "loss": 0.8584, + "step": 4255 + }, + { + "epoch": 1.3858165256994144, + "grad_norm": 1.3067395687103271, + "learning_rate": 4.161903975856205e-05, + "loss": 0.8177, + "step": 4260 + }, + { + "epoch": 1.3874430709173715, + "grad_norm": 1.3908008337020874, + "learning_rate": 4.1599750672674926e-05, + "loss": 0.8196, + "step": 4265 + }, + { + "epoch": 1.3890696161353286, + "grad_norm": 1.2384636402130127, + "learning_rate": 4.158044389628073e-05, + "loss": 0.8245, + "step": 4270 + }, + { + "epoch": 1.3906961613532856, + "grad_norm": 1.5143787860870361, + "learning_rate": 4.1561119449954875e-05, + "loss": 0.8585, + "step": 4275 + }, + { + "epoch": 1.3923227065712427, + "grad_norm": 1.6442577838897705, + "learning_rate": 4.154177735429161e-05, + "loss": 0.8424, + "step": 4280 + }, + { + "epoch": 1.3939492517891998, + "grad_norm": 1.731024980545044, + "learning_rate": 4.152241762990397e-05, + "loss": 0.843, + "step": 4285 + }, + { + "epoch": 1.3955757970071567, + "grad_norm": 1.2395538091659546, + "learning_rate": 4.150304029742381e-05, + "loss": 0.8278, + "step": 4290 + }, + { + "epoch": 1.3972023422251139, + "grad_norm": 1.2536678314208984, + "learning_rate": 4.148364537750172e-05, + "loss": 0.8194, + "step": 4295 + }, + { + "epoch": 1.398828887443071, + "grad_norm": 1.941697120666504, + "learning_rate": 4.146423289080705e-05, + "loss": 0.8578, + "step": 4300 + }, + { + "epoch": 1.400455432661028, + "grad_norm": 1.2927970886230469, + "learning_rate": 4.144480285802787e-05, + "loss": 0.8306, + "step": 4305 + }, + { + "epoch": 1.402081977878985, + "grad_norm": 1.1343443393707275, + "learning_rate": 4.1425355299870915e-05, + "loss": 0.8425, + "step": 4310 + }, + { + "epoch": 1.4037085230969422, + "grad_norm": 1.457992434501648, + "learning_rate": 4.140589023706166e-05, + "loss": 0.847, + "step": 4315 + }, + { + "epoch": 1.405335068314899, + "grad_norm": 1.468604326248169, + "learning_rate": 4.138640769034419e-05, + "loss": 0.8351, + "step": 4320 + }, + { + "epoch": 1.4069616135328562, + "grad_norm": 1.3238948583602905, + "learning_rate": 4.1366907680481236e-05, + "loss": 0.8486, + "step": 4325 + }, + { + "epoch": 1.4085881587508133, + "grad_norm": 1.2403712272644043, + "learning_rate": 4.1347390228254145e-05, + "loss": 0.8434, + "step": 4330 + }, + { + "epoch": 1.4102147039687702, + "grad_norm": 1.3687196969985962, + "learning_rate": 4.132785535446283e-05, + "loss": 0.8297, + "step": 4335 + }, + { + "epoch": 1.4118412491867274, + "grad_norm": 1.600464940071106, + "learning_rate": 4.130830307992579e-05, + "loss": 0.8698, + "step": 4340 + }, + { + "epoch": 1.4134677944046845, + "grad_norm": 1.4882853031158447, + "learning_rate": 4.128873342548007e-05, + "loss": 0.8138, + "step": 4345 + }, + { + "epoch": 1.4150943396226414, + "grad_norm": 1.3315528631210327, + "learning_rate": 4.126914641198123e-05, + "loss": 0.8139, + "step": 4350 + }, + { + "epoch": 1.4167208848405985, + "grad_norm": 1.479295253753662, + "learning_rate": 4.1249542060303325e-05, + "loss": 0.8653, + "step": 4355 + }, + { + "epoch": 1.4183474300585557, + "grad_norm": 1.3208636045455933, + "learning_rate": 4.1229920391338896e-05, + "loss": 0.8327, + "step": 4360 + }, + { + "epoch": 1.4199739752765126, + "grad_norm": 1.485842227935791, + "learning_rate": 4.121028142599894e-05, + "loss": 0.814, + "step": 4365 + }, + { + "epoch": 1.4216005204944697, + "grad_norm": 1.618328332901001, + "learning_rate": 4.119062518521288e-05, + "loss": 0.8361, + "step": 4370 + }, + { + "epoch": 1.4232270657124269, + "grad_norm": 1.5308271646499634, + "learning_rate": 4.117095168992855e-05, + "loss": 0.8156, + "step": 4375 + }, + { + "epoch": 1.4248536109303838, + "grad_norm": 1.9163178205490112, + "learning_rate": 4.115126096111218e-05, + "loss": 0.8282, + "step": 4380 + }, + { + "epoch": 1.426480156148341, + "grad_norm": 1.2593997716903687, + "learning_rate": 4.113155301974836e-05, + "loss": 0.8408, + "step": 4385 + }, + { + "epoch": 1.428106701366298, + "grad_norm": 1.5695834159851074, + "learning_rate": 4.1111827886840015e-05, + "loss": 0.8313, + "step": 4390 + }, + { + "epoch": 1.429733246584255, + "grad_norm": 1.2713311910629272, + "learning_rate": 4.1092085583408415e-05, + "loss": 0.8165, + "step": 4395 + }, + { + "epoch": 1.431359791802212, + "grad_norm": 1.3176714181900024, + "learning_rate": 4.1072326130493104e-05, + "loss": 0.8675, + "step": 4400 + }, + { + "epoch": 1.4329863370201692, + "grad_norm": 1.207156777381897, + "learning_rate": 4.105254954915192e-05, + "loss": 0.8137, + "step": 4405 + }, + { + "epoch": 1.434612882238126, + "grad_norm": 1.444702386856079, + "learning_rate": 4.103275586046095e-05, + "loss": 0.8551, + "step": 4410 + }, + { + "epoch": 1.4362394274560832, + "grad_norm": 1.3145989179611206, + "learning_rate": 4.10129450855145e-05, + "loss": 0.8506, + "step": 4415 + }, + { + "epoch": 1.4378659726740404, + "grad_norm": 1.3064020872116089, + "learning_rate": 4.099311724542509e-05, + "loss": 0.8253, + "step": 4420 + }, + { + "epoch": 1.4394925178919973, + "grad_norm": 1.4843122959136963, + "learning_rate": 4.097327236132346e-05, + "loss": 0.8539, + "step": 4425 + }, + { + "epoch": 1.4411190631099544, + "grad_norm": 1.3873169422149658, + "learning_rate": 4.0953410454358455e-05, + "loss": 0.8112, + "step": 4430 + }, + { + "epoch": 1.4427456083279115, + "grad_norm": 1.2894911766052246, + "learning_rate": 4.09335315456971e-05, + "loss": 0.8094, + "step": 4435 + }, + { + "epoch": 1.4443721535458685, + "grad_norm": 1.4788933992385864, + "learning_rate": 4.091363565652455e-05, + "loss": 0.8426, + "step": 4440 + }, + { + "epoch": 1.4459986987638256, + "grad_norm": 1.1745434999465942, + "learning_rate": 4.089372280804401e-05, + "loss": 0.8328, + "step": 4445 + }, + { + "epoch": 1.4476252439817827, + "grad_norm": 1.6083126068115234, + "learning_rate": 4.08737930214768e-05, + "loss": 0.8221, + "step": 4450 + }, + { + "epoch": 1.4492517891997396, + "grad_norm": 1.2938541173934937, + "learning_rate": 4.0853846318062285e-05, + "loss": 0.8328, + "step": 4455 + }, + { + "epoch": 1.4508783344176968, + "grad_norm": 1.2787350416183472, + "learning_rate": 4.083388271905783e-05, + "loss": 0.7955, + "step": 4460 + }, + { + "epoch": 1.452504879635654, + "grad_norm": 1.2539377212524414, + "learning_rate": 4.0813902245738845e-05, + "loss": 0.8392, + "step": 4465 + }, + { + "epoch": 1.4541314248536108, + "grad_norm": 1.411559820175171, + "learning_rate": 4.0793904919398685e-05, + "loss": 0.8134, + "step": 4470 + }, + { + "epoch": 1.455757970071568, + "grad_norm": 1.4134405851364136, + "learning_rate": 4.077389076134871e-05, + "loss": 0.8026, + "step": 4475 + }, + { + "epoch": 1.457384515289525, + "grad_norm": 1.3306896686553955, + "learning_rate": 4.075385979291817e-05, + "loss": 0.8456, + "step": 4480 + }, + { + "epoch": 1.459011060507482, + "grad_norm": 1.3529064655303955, + "learning_rate": 4.0733812035454264e-05, + "loss": 0.8125, + "step": 4485 + }, + { + "epoch": 1.460637605725439, + "grad_norm": 1.5420523881912231, + "learning_rate": 4.071374751032206e-05, + "loss": 0.8012, + "step": 4490 + }, + { + "epoch": 1.4622641509433962, + "grad_norm": 1.4212911128997803, + "learning_rate": 4.0693666238904525e-05, + "loss": 0.8644, + "step": 4495 + }, + { + "epoch": 1.4638906961613534, + "grad_norm": 1.2714157104492188, + "learning_rate": 4.067356824260244e-05, + "loss": 0.9037, + "step": 4500 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 1.2137748003005981, + "learning_rate": 4.065345354283444e-05, + "loss": 0.8259, + "step": 4505 + }, + { + "epoch": 1.4671437865972674, + "grad_norm": 2.057116985321045, + "learning_rate": 4.0633322161036924e-05, + "loss": 0.8383, + "step": 4510 + }, + { + "epoch": 1.4687703318152245, + "grad_norm": 1.609216332435608, + "learning_rate": 4.061317411866411e-05, + "loss": 0.8621, + "step": 4515 + }, + { + "epoch": 1.4703968770331814, + "grad_norm": 1.5873533487319946, + "learning_rate": 4.059300943718794e-05, + "loss": 0.8567, + "step": 4520 + }, + { + "epoch": 1.4720234222511386, + "grad_norm": 1.5258110761642456, + "learning_rate": 4.0572828138098106e-05, + "loss": 0.8162, + "step": 4525 + }, + { + "epoch": 1.4736499674690957, + "grad_norm": 1.5315823554992676, + "learning_rate": 4.055263024290201e-05, + "loss": 0.8029, + "step": 4530 + }, + { + "epoch": 1.4752765126870526, + "grad_norm": 1.419977068901062, + "learning_rate": 4.053241577312472e-05, + "loss": 0.8382, + "step": 4535 + }, + { + "epoch": 1.4769030579050098, + "grad_norm": 1.451988697052002, + "learning_rate": 4.051218475030899e-05, + "loss": 0.8394, + "step": 4540 + }, + { + "epoch": 1.4785296031229669, + "grad_norm": 1.3073935508728027, + "learning_rate": 4.0491937196015214e-05, + "loss": 0.8399, + "step": 4545 + }, + { + "epoch": 1.480156148340924, + "grad_norm": 1.4135723114013672, + "learning_rate": 4.0471673131821386e-05, + "loss": 0.8562, + "step": 4550 + }, + { + "epoch": 1.481782693558881, + "grad_norm": 1.13473379611969, + "learning_rate": 4.045139257932311e-05, + "loss": 0.8293, + "step": 4555 + }, + { + "epoch": 1.483409238776838, + "grad_norm": 1.382108449935913, + "learning_rate": 4.043109556013356e-05, + "loss": 0.8185, + "step": 4560 + }, + { + "epoch": 1.4850357839947952, + "grad_norm": 1.0713036060333252, + "learning_rate": 4.041078209588346e-05, + "loss": 0.8278, + "step": 4565 + }, + { + "epoch": 1.486662329212752, + "grad_norm": 1.3231723308563232, + "learning_rate": 4.039045220822104e-05, + "loss": 0.8598, + "step": 4570 + }, + { + "epoch": 1.4882888744307092, + "grad_norm": 1.3532663583755493, + "learning_rate": 4.037010591881206e-05, + "loss": 0.8573, + "step": 4575 + }, + { + "epoch": 1.4899154196486664, + "grad_norm": 1.6196377277374268, + "learning_rate": 4.0349743249339756e-05, + "loss": 0.8474, + "step": 4580 + }, + { + "epoch": 1.4915419648666233, + "grad_norm": 1.2389353513717651, + "learning_rate": 4.0329364221504796e-05, + "loss": 0.8306, + "step": 4585 + }, + { + "epoch": 1.4931685100845804, + "grad_norm": 1.3713908195495605, + "learning_rate": 4.0308968857025296e-05, + "loss": 0.8246, + "step": 4590 + }, + { + "epoch": 1.4947950553025375, + "grad_norm": 1.223692774772644, + "learning_rate": 4.02885571776368e-05, + "loss": 0.81, + "step": 4595 + }, + { + "epoch": 1.4964216005204944, + "grad_norm": 1.6347450017929077, + "learning_rate": 4.026812920509221e-05, + "loss": 0.8298, + "step": 4600 + }, + { + "epoch": 1.4980481457384516, + "grad_norm": 1.1801060438156128, + "learning_rate": 4.0247684961161815e-05, + "loss": 0.8187, + "step": 4605 + }, + { + "epoch": 1.4996746909564087, + "grad_norm": 1.3351877927780151, + "learning_rate": 4.022722446763322e-05, + "loss": 0.8163, + "step": 4610 + }, + { + "epoch": 1.5013012361743656, + "grad_norm": 1.1578738689422607, + "learning_rate": 4.0206747746311376e-05, + "loss": 0.8244, + "step": 4615 + }, + { + "epoch": 1.5029277813923227, + "grad_norm": 1.7494115829467773, + "learning_rate": 4.0186254819018504e-05, + "loss": 0.826, + "step": 4620 + }, + { + "epoch": 1.5045543266102799, + "grad_norm": 1.291858434677124, + "learning_rate": 4.01657457075941e-05, + "loss": 0.8755, + "step": 4625 + }, + { + "epoch": 1.5061808718282368, + "grad_norm": 1.615688681602478, + "learning_rate": 4.014522043389493e-05, + "loss": 0.8504, + "step": 4630 + }, + { + "epoch": 1.507807417046194, + "grad_norm": 1.5270360708236694, + "learning_rate": 4.012467901979496e-05, + "loss": 0.8178, + "step": 4635 + }, + { + "epoch": 1.509433962264151, + "grad_norm": 1.676094889640808, + "learning_rate": 4.010412148718535e-05, + "loss": 0.8614, + "step": 4640 + }, + { + "epoch": 1.511060507482108, + "grad_norm": 1.2096936702728271, + "learning_rate": 4.008354785797447e-05, + "loss": 0.7952, + "step": 4645 + }, + { + "epoch": 1.512687052700065, + "grad_norm": 1.4507110118865967, + "learning_rate": 4.006295815408781e-05, + "loss": 0.8252, + "step": 4650 + }, + { + "epoch": 1.5143135979180222, + "grad_norm": 1.2976456880569458, + "learning_rate": 4.004235239746803e-05, + "loss": 0.834, + "step": 4655 + }, + { + "epoch": 1.5159401431359791, + "grad_norm": 1.3359556198120117, + "learning_rate": 4.0021730610074856e-05, + "loss": 0.8393, + "step": 4660 + }, + { + "epoch": 1.5175666883539363, + "grad_norm": 1.153700351715088, + "learning_rate": 4.0001092813885116e-05, + "loss": 0.8494, + "step": 4665 + }, + { + "epoch": 1.5191932335718934, + "grad_norm": 1.330363154411316, + "learning_rate": 3.99804390308927e-05, + "loss": 0.8004, + "step": 4670 + }, + { + "epoch": 1.5208197787898503, + "grad_norm": 1.576116919517517, + "learning_rate": 3.995976928310855e-05, + "loss": 0.824, + "step": 4675 + }, + { + "epoch": 1.5224463240078074, + "grad_norm": 1.213552474975586, + "learning_rate": 3.9939083592560586e-05, + "loss": 0.833, + "step": 4680 + }, + { + "epoch": 1.5240728692257646, + "grad_norm": 1.1656043529510498, + "learning_rate": 3.991838198129376e-05, + "loss": 0.8479, + "step": 4685 + }, + { + "epoch": 1.5256994144437215, + "grad_norm": 1.242655634880066, + "learning_rate": 3.989766447136995e-05, + "loss": 0.8597, + "step": 4690 + }, + { + "epoch": 1.5273259596616786, + "grad_norm": 1.4139341115951538, + "learning_rate": 3.987693108486801e-05, + "loss": 0.8137, + "step": 4695 + }, + { + "epoch": 1.5289525048796357, + "grad_norm": 1.1662260293960571, + "learning_rate": 3.9856181843883687e-05, + "loss": 0.8177, + "step": 4700 + }, + { + "epoch": 1.5305790500975927, + "grad_norm": 1.1157597303390503, + "learning_rate": 3.9835416770529655e-05, + "loss": 0.8192, + "step": 4705 + }, + { + "epoch": 1.5322055953155498, + "grad_norm": 1.3376294374465942, + "learning_rate": 3.981463588693543e-05, + "loss": 0.8519, + "step": 4710 + }, + { + "epoch": 1.533832140533507, + "grad_norm": 1.4485875368118286, + "learning_rate": 3.9793839215247394e-05, + "loss": 0.8246, + "step": 4715 + }, + { + "epoch": 1.5354586857514638, + "grad_norm": 1.1737607717514038, + "learning_rate": 3.977302677762876e-05, + "loss": 0.8168, + "step": 4720 + }, + { + "epoch": 1.537085230969421, + "grad_norm": 1.2390429973602295, + "learning_rate": 3.975219859625953e-05, + "loss": 0.8274, + "step": 4725 + }, + { + "epoch": 1.538711776187378, + "grad_norm": 1.2670161724090576, + "learning_rate": 3.973135469333647e-05, + "loss": 0.8471, + "step": 4730 + }, + { + "epoch": 1.540338321405335, + "grad_norm": 1.131270408630371, + "learning_rate": 3.971049509107315e-05, + "loss": 0.8471, + "step": 4735 + }, + { + "epoch": 1.5419648666232921, + "grad_norm": 1.4478318691253662, + "learning_rate": 3.968961981169983e-05, + "loss": 0.8508, + "step": 4740 + }, + { + "epoch": 1.5435914118412493, + "grad_norm": 1.3702036142349243, + "learning_rate": 3.9668728877463465e-05, + "loss": 0.8327, + "step": 4745 + }, + { + "epoch": 1.5452179570592062, + "grad_norm": 1.397478461265564, + "learning_rate": 3.9647822310627755e-05, + "loss": 0.832, + "step": 4750 + }, + { + "epoch": 1.5468445022771633, + "grad_norm": 1.1441662311553955, + "learning_rate": 3.962690013347299e-05, + "loss": 0.8482, + "step": 4755 + }, + { + "epoch": 1.5484710474951204, + "grad_norm": 1.500946044921875, + "learning_rate": 3.9605962368296135e-05, + "loss": 0.8245, + "step": 4760 + }, + { + "epoch": 1.5500975927130773, + "grad_norm": 1.5952589511871338, + "learning_rate": 3.958500903741077e-05, + "loss": 0.8214, + "step": 4765 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 1.512475848197937, + "learning_rate": 3.956404016314703e-05, + "loss": 0.8091, + "step": 4770 + }, + { + "epoch": 1.5533506831489916, + "grad_norm": 1.2697346210479736, + "learning_rate": 3.954305576785166e-05, + "loss": 0.8482, + "step": 4775 + }, + { + "epoch": 1.5549772283669485, + "grad_norm": 1.394012689590454, + "learning_rate": 3.9522055873887906e-05, + "loss": 0.8327, + "step": 4780 + }, + { + "epoch": 1.5566037735849056, + "grad_norm": 1.3668144941329956, + "learning_rate": 3.950104050363557e-05, + "loss": 0.8219, + "step": 4785 + }, + { + "epoch": 1.5582303188028628, + "grad_norm": 1.3674423694610596, + "learning_rate": 3.94800096794909e-05, + "loss": 0.848, + "step": 4790 + }, + { + "epoch": 1.5598568640208197, + "grad_norm": 1.5250712633132935, + "learning_rate": 3.945896342386666e-05, + "loss": 0.84, + "step": 4795 + }, + { + "epoch": 1.5614834092387768, + "grad_norm": 1.3539470434188843, + "learning_rate": 3.943790175919201e-05, + "loss": 0.8351, + "step": 4800 + }, + { + "epoch": 1.563109954456734, + "grad_norm": 1.5184630155563354, + "learning_rate": 3.9416824707912594e-05, + "loss": 0.8131, + "step": 4805 + }, + { + "epoch": 1.5647364996746909, + "grad_norm": 1.220706582069397, + "learning_rate": 3.9395732292490404e-05, + "loss": 0.8251, + "step": 4810 + }, + { + "epoch": 1.566363044892648, + "grad_norm": 1.2286885976791382, + "learning_rate": 3.937462453540381e-05, + "loss": 0.8488, + "step": 4815 + }, + { + "epoch": 1.5679895901106051, + "grad_norm": 1.2241997718811035, + "learning_rate": 3.935350145914757e-05, + "loss": 0.8266, + "step": 4820 + }, + { + "epoch": 1.569616135328562, + "grad_norm": 1.3638066053390503, + "learning_rate": 3.933236308623271e-05, + "loss": 0.8189, + "step": 4825 + }, + { + "epoch": 1.5712426805465192, + "grad_norm": 1.3383840322494507, + "learning_rate": 3.931120943918661e-05, + "loss": 0.8471, + "step": 4830 + }, + { + "epoch": 1.5728692257644763, + "grad_norm": 1.1668282747268677, + "learning_rate": 3.92900405405529e-05, + "loss": 0.8179, + "step": 4835 + }, + { + "epoch": 1.5744957709824332, + "grad_norm": 1.277183175086975, + "learning_rate": 3.9268856412891474e-05, + "loss": 0.7808, + "step": 4840 + }, + { + "epoch": 1.5761223162003903, + "grad_norm": 1.4851373434066772, + "learning_rate": 3.9247657078778444e-05, + "loss": 0.8454, + "step": 4845 + }, + { + "epoch": 1.5777488614183475, + "grad_norm": 1.2690775394439697, + "learning_rate": 3.9226442560806145e-05, + "loss": 0.81, + "step": 4850 + }, + { + "epoch": 1.5793754066363044, + "grad_norm": 1.434639811515808, + "learning_rate": 3.9205212881583064e-05, + "loss": 0.8699, + "step": 4855 + }, + { + "epoch": 1.5810019518542615, + "grad_norm": 1.2943447828292847, + "learning_rate": 3.918396806373389e-05, + "loss": 0.8127, + "step": 4860 + }, + { + "epoch": 1.5826284970722186, + "grad_norm": 1.281567096710205, + "learning_rate": 3.9162708129899406e-05, + "loss": 0.8287, + "step": 4865 + }, + { + "epoch": 1.5842550422901756, + "grad_norm": 1.5119740962982178, + "learning_rate": 3.914143310273653e-05, + "loss": 0.8472, + "step": 4870 + }, + { + "epoch": 1.5858815875081327, + "grad_norm": 1.6095927953720093, + "learning_rate": 3.912014300491825e-05, + "loss": 0.7974, + "step": 4875 + }, + { + "epoch": 1.5875081327260898, + "grad_norm": 1.608175277709961, + "learning_rate": 3.9098837859133606e-05, + "loss": 0.8571, + "step": 4880 + }, + { + "epoch": 1.5891346779440467, + "grad_norm": 1.289676547050476, + "learning_rate": 3.907751768808771e-05, + "loss": 0.8177, + "step": 4885 + }, + { + "epoch": 1.5907612231620039, + "grad_norm": 1.3131219148635864, + "learning_rate": 3.905618251450165e-05, + "loss": 0.8088, + "step": 4890 + }, + { + "epoch": 1.592387768379961, + "grad_norm": 1.564332127571106, + "learning_rate": 3.9034832361112516e-05, + "loss": 0.8561, + "step": 4895 + }, + { + "epoch": 1.594014313597918, + "grad_norm": 1.384636402130127, + "learning_rate": 3.9013467250673375e-05, + "loss": 0.8305, + "step": 4900 + }, + { + "epoch": 1.595640858815875, + "grad_norm": 1.1895666122436523, + "learning_rate": 3.8992087205953215e-05, + "loss": 0.8332, + "step": 4905 + }, + { + "epoch": 1.5972674040338322, + "grad_norm": 1.1903373003005981, + "learning_rate": 3.897069224973694e-05, + "loss": 0.8547, + "step": 4910 + }, + { + "epoch": 1.598893949251789, + "grad_norm": 1.3151439428329468, + "learning_rate": 3.894928240482536e-05, + "loss": 0.8264, + "step": 4915 + }, + { + "epoch": 1.6005204944697464, + "grad_norm": 1.277608036994934, + "learning_rate": 3.892785769403514e-05, + "loss": 0.8323, + "step": 4920 + }, + { + "epoch": 1.6021470396877033, + "grad_norm": 1.4774775505065918, + "learning_rate": 3.89064181401988e-05, + "loss": 0.8446, + "step": 4925 + }, + { + "epoch": 1.6037735849056602, + "grad_norm": 1.4679292440414429, + "learning_rate": 3.888496376616466e-05, + "loss": 0.8476, + "step": 4930 + }, + { + "epoch": 1.6054001301236176, + "grad_norm": 1.5357601642608643, + "learning_rate": 3.886349459479684e-05, + "loss": 0.8318, + "step": 4935 + }, + { + "epoch": 1.6070266753415745, + "grad_norm": 1.3433672189712524, + "learning_rate": 3.8842010648975244e-05, + "loss": 0.8291, + "step": 4940 + }, + { + "epoch": 1.6086532205595314, + "grad_norm": 1.1758679151535034, + "learning_rate": 3.882051195159551e-05, + "loss": 0.8681, + "step": 4945 + }, + { + "epoch": 1.6102797657774888, + "grad_norm": 1.5041158199310303, + "learning_rate": 3.879899852556899e-05, + "loss": 0.8259, + "step": 4950 + }, + { + "epoch": 1.6119063109954457, + "grad_norm": 1.4063795804977417, + "learning_rate": 3.877747039382275e-05, + "loss": 0.8258, + "step": 4955 + }, + { + "epoch": 1.6135328562134026, + "grad_norm": 1.4906984567642212, + "learning_rate": 3.87559275792995e-05, + "loss": 0.8114, + "step": 4960 + }, + { + "epoch": 1.61515940143136, + "grad_norm": 1.3947577476501465, + "learning_rate": 3.873437010495764e-05, + "loss": 0.8248, + "step": 4965 + }, + { + "epoch": 1.6167859466493169, + "grad_norm": 1.2514725923538208, + "learning_rate": 3.871279799377116e-05, + "loss": 0.8271, + "step": 4970 + }, + { + "epoch": 1.6184124918672738, + "grad_norm": 1.3585410118103027, + "learning_rate": 3.869121126872964e-05, + "loss": 0.8354, + "step": 4975 + }, + { + "epoch": 1.6200390370852311, + "grad_norm": 1.5782438516616821, + "learning_rate": 3.8669609952838284e-05, + "loss": 0.8374, + "step": 4980 + }, + { + "epoch": 1.621665582303188, + "grad_norm": 1.5710558891296387, + "learning_rate": 3.864799406911779e-05, + "loss": 0.8411, + "step": 4985 + }, + { + "epoch": 1.623292127521145, + "grad_norm": 1.3906829357147217, + "learning_rate": 3.86263636406044e-05, + "loss": 0.8632, + "step": 4990 + }, + { + "epoch": 1.6249186727391023, + "grad_norm": 1.2946661710739136, + "learning_rate": 3.860471869034987e-05, + "loss": 0.8418, + "step": 4995 + }, + { + "epoch": 1.6265452179570592, + "grad_norm": 1.2655375003814697, + "learning_rate": 3.8583059241421433e-05, + "loss": 0.8245, + "step": 5000 + }, + { + "epoch": 1.628171763175016, + "grad_norm": 1.3955624103546143, + "learning_rate": 3.856138531690175e-05, + "loss": 0.8417, + "step": 5005 + }, + { + "epoch": 1.6297983083929735, + "grad_norm": 1.0718348026275635, + "learning_rate": 3.853969693988892e-05, + "loss": 0.8498, + "step": 5010 + }, + { + "epoch": 1.6314248536109304, + "grad_norm": 1.3488701581954956, + "learning_rate": 3.851799413349646e-05, + "loss": 0.8493, + "step": 5015 + }, + { + "epoch": 1.6330513988288873, + "grad_norm": 1.3999229669570923, + "learning_rate": 3.849627692085324e-05, + "loss": 0.8223, + "step": 5020 + }, + { + "epoch": 1.6346779440468446, + "grad_norm": 1.2206923961639404, + "learning_rate": 3.8474545325103485e-05, + "loss": 0.8176, + "step": 5025 + }, + { + "epoch": 1.6363044892648015, + "grad_norm": 1.539080262184143, + "learning_rate": 3.845279936940678e-05, + "loss": 0.8484, + "step": 5030 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 1.1388968229293823, + "learning_rate": 3.8431039076937966e-05, + "loss": 0.8328, + "step": 5035 + }, + { + "epoch": 1.6395575797007158, + "grad_norm": 1.2527915239334106, + "learning_rate": 3.8409264470887204e-05, + "loss": 0.8093, + "step": 5040 + }, + { + "epoch": 1.6411841249186727, + "grad_norm": 1.4909052848815918, + "learning_rate": 3.838747557445988e-05, + "loss": 0.8025, + "step": 5045 + }, + { + "epoch": 1.6428106701366298, + "grad_norm": 1.6596057415008545, + "learning_rate": 3.836567241087663e-05, + "loss": 0.8203, + "step": 5050 + }, + { + "epoch": 1.644437215354587, + "grad_norm": 1.1326504945755005, + "learning_rate": 3.8343855003373286e-05, + "loss": 0.8291, + "step": 5055 + }, + { + "epoch": 1.6460637605725439, + "grad_norm": 1.1924946308135986, + "learning_rate": 3.832202337520085e-05, + "loss": 0.8493, + "step": 5060 + }, + { + "epoch": 1.647690305790501, + "grad_norm": 1.4593422412872314, + "learning_rate": 3.8300177549625504e-05, + "loss": 0.8007, + "step": 5065 + }, + { + "epoch": 1.6493168510084582, + "grad_norm": 1.308897614479065, + "learning_rate": 3.8278317549928534e-05, + "loss": 0.8316, + "step": 5070 + }, + { + "epoch": 1.650943396226415, + "grad_norm": 1.8221460580825806, + "learning_rate": 3.8256443399406344e-05, + "loss": 0.8402, + "step": 5075 + }, + { + "epoch": 1.6525699414443722, + "grad_norm": 1.2264111042022705, + "learning_rate": 3.823455512137042e-05, + "loss": 0.8438, + "step": 5080 + }, + { + "epoch": 1.6541964866623293, + "grad_norm": 1.6410421133041382, + "learning_rate": 3.8212652739147306e-05, + "loss": 0.8298, + "step": 5085 + }, + { + "epoch": 1.6558230318802862, + "grad_norm": 1.4165881872177124, + "learning_rate": 3.819073627607856e-05, + "loss": 0.8152, + "step": 5090 + }, + { + "epoch": 1.6574495770982434, + "grad_norm": 1.1489217281341553, + "learning_rate": 3.8168805755520775e-05, + "loss": 0.8453, + "step": 5095 + }, + { + "epoch": 1.6590761223162005, + "grad_norm": 1.309434175491333, + "learning_rate": 3.81468612008455e-05, + "loss": 0.8284, + "step": 5100 + }, + { + "epoch": 1.6607026675341574, + "grad_norm": 1.3349792957305908, + "learning_rate": 3.8124902635439235e-05, + "loss": 0.832, + "step": 5105 + }, + { + "epoch": 1.6623292127521145, + "grad_norm": 1.2414830923080444, + "learning_rate": 3.810293008270345e-05, + "loss": 0.7964, + "step": 5110 + }, + { + "epoch": 1.6639557579700717, + "grad_norm": 1.2398914098739624, + "learning_rate": 3.808094356605448e-05, + "loss": 0.8146, + "step": 5115 + }, + { + "epoch": 1.6655823031880286, + "grad_norm": 1.3887569904327393, + "learning_rate": 3.8058943108923565e-05, + "loss": 0.8282, + "step": 5120 + }, + { + "epoch": 1.6672088484059857, + "grad_norm": 1.4989736080169678, + "learning_rate": 3.80369287347568e-05, + "loss": 0.8079, + "step": 5125 + }, + { + "epoch": 1.6688353936239428, + "grad_norm": 1.3330059051513672, + "learning_rate": 3.801490046701509e-05, + "loss": 0.8142, + "step": 5130 + }, + { + "epoch": 1.6704619388418998, + "grad_norm": 1.6739710569381714, + "learning_rate": 3.799285832917417e-05, + "loss": 0.8297, + "step": 5135 + }, + { + "epoch": 1.6720884840598569, + "grad_norm": 1.429032325744629, + "learning_rate": 3.797080234472456e-05, + "loss": 0.81, + "step": 5140 + }, + { + "epoch": 1.673715029277814, + "grad_norm": 1.1783761978149414, + "learning_rate": 3.7948732537171516e-05, + "loss": 0.8264, + "step": 5145 + }, + { + "epoch": 1.675341574495771, + "grad_norm": 1.207680344581604, + "learning_rate": 3.792664893003503e-05, + "loss": 0.8432, + "step": 5150 + }, + { + "epoch": 1.676968119713728, + "grad_norm": 1.3401858806610107, + "learning_rate": 3.7904551546849806e-05, + "loss": 0.8592, + "step": 5155 + }, + { + "epoch": 1.6785946649316852, + "grad_norm": 1.4058512449264526, + "learning_rate": 3.788244041116525e-05, + "loss": 0.8186, + "step": 5160 + }, + { + "epoch": 1.680221210149642, + "grad_norm": 1.4915868043899536, + "learning_rate": 3.7860315546545375e-05, + "loss": 0.8519, + "step": 5165 + }, + { + "epoch": 1.6818477553675992, + "grad_norm": 1.1779934167861938, + "learning_rate": 3.783817697656887e-05, + "loss": 0.825, + "step": 5170 + }, + { + "epoch": 1.6834743005855564, + "grad_norm": 1.1503974199295044, + "learning_rate": 3.7816024724829e-05, + "loss": 0.825, + "step": 5175 + }, + { + "epoch": 1.6851008458035133, + "grad_norm": 1.4627374410629272, + "learning_rate": 3.779385881493364e-05, + "loss": 0.82, + "step": 5180 + }, + { + "epoch": 1.6867273910214704, + "grad_norm": 1.2746937274932861, + "learning_rate": 3.777167927050519e-05, + "loss": 0.8276, + "step": 5185 + }, + { + "epoch": 1.6883539362394275, + "grad_norm": 1.3548493385314941, + "learning_rate": 3.77494861151806e-05, + "loss": 0.807, + "step": 5190 + }, + { + "epoch": 1.6899804814573844, + "grad_norm": 1.1232099533081055, + "learning_rate": 3.772727937261132e-05, + "loss": 0.8581, + "step": 5195 + }, + { + "epoch": 1.6916070266753416, + "grad_norm": 1.2118384838104248, + "learning_rate": 3.7705059066463274e-05, + "loss": 0.8276, + "step": 5200 + }, + { + "epoch": 1.6932335718932987, + "grad_norm": 1.2310107946395874, + "learning_rate": 3.7682825220416865e-05, + "loss": 0.8481, + "step": 5205 + }, + { + "epoch": 1.6948601171112556, + "grad_norm": 1.42371666431427, + "learning_rate": 3.766057785816688e-05, + "loss": 0.8388, + "step": 5210 + }, + { + "epoch": 1.6964866623292127, + "grad_norm": 1.5081219673156738, + "learning_rate": 3.7638317003422564e-05, + "loss": 0.8317, + "step": 5215 + }, + { + "epoch": 1.6981132075471699, + "grad_norm": 1.2987513542175293, + "learning_rate": 3.7616042679907494e-05, + "loss": 0.8517, + "step": 5220 + }, + { + "epoch": 1.6997397527651268, + "grad_norm": 1.0817666053771973, + "learning_rate": 3.759375491135964e-05, + "loss": 0.8392, + "step": 5225 + }, + { + "epoch": 1.701366297983084, + "grad_norm": 1.1084290742874146, + "learning_rate": 3.7571453721531256e-05, + "loss": 0.8164, + "step": 5230 + }, + { + "epoch": 1.702992843201041, + "grad_norm": 1.2885674238204956, + "learning_rate": 3.7549139134188954e-05, + "loss": 0.8279, + "step": 5235 + }, + { + "epoch": 1.704619388418998, + "grad_norm": 1.1124091148376465, + "learning_rate": 3.752681117311358e-05, + "loss": 0.8154, + "step": 5240 + }, + { + "epoch": 1.706245933636955, + "grad_norm": 1.2188146114349365, + "learning_rate": 3.7504469862100256e-05, + "loss": 0.8165, + "step": 5245 + }, + { + "epoch": 1.7078724788549122, + "grad_norm": 1.3769359588623047, + "learning_rate": 3.748211522495831e-05, + "loss": 0.8514, + "step": 5250 + }, + { + "epoch": 1.7094990240728691, + "grad_norm": 1.3241037130355835, + "learning_rate": 3.745974728551129e-05, + "loss": 0.8496, + "step": 5255 + }, + { + "epoch": 1.7111255692908263, + "grad_norm": 1.1701422929763794, + "learning_rate": 3.7437366067596924e-05, + "loss": 0.8329, + "step": 5260 + }, + { + "epoch": 1.7127521145087834, + "grad_norm": 1.4167858362197876, + "learning_rate": 3.7414971595067074e-05, + "loss": 0.8121, + "step": 5265 + }, + { + "epoch": 1.7143786597267403, + "grad_norm": 1.4011586904525757, + "learning_rate": 3.7392563891787726e-05, + "loss": 0.856, + "step": 5270 + }, + { + "epoch": 1.7160052049446974, + "grad_norm": 1.2024650573730469, + "learning_rate": 3.7370142981638996e-05, + "loss": 0.7888, + "step": 5275 + }, + { + "epoch": 1.7176317501626546, + "grad_norm": 1.2074370384216309, + "learning_rate": 3.734770888851504e-05, + "loss": 0.7993, + "step": 5280 + }, + { + "epoch": 1.7192582953806115, + "grad_norm": 1.3105443716049194, + "learning_rate": 3.732526163632408e-05, + "loss": 0.8398, + "step": 5285 + }, + { + "epoch": 1.7208848405985686, + "grad_norm": 1.3848609924316406, + "learning_rate": 3.7302801248988365e-05, + "loss": 0.8425, + "step": 5290 + }, + { + "epoch": 1.7225113858165257, + "grad_norm": 1.2775307893753052, + "learning_rate": 3.728032775044413e-05, + "loss": 0.8457, + "step": 5295 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 1.3581163883209229, + "learning_rate": 3.7257841164641595e-05, + "loss": 0.8227, + "step": 5300 + }, + { + "epoch": 1.7257644762524398, + "grad_norm": 1.2594901323318481, + "learning_rate": 3.723534151554492e-05, + "loss": 0.8108, + "step": 5305 + }, + { + "epoch": 1.727391021470397, + "grad_norm": 1.307614803314209, + "learning_rate": 3.721282882713218e-05, + "loss": 0.8172, + "step": 5310 + }, + { + "epoch": 1.7290175666883538, + "grad_norm": 1.093103289604187, + "learning_rate": 3.719030312339536e-05, + "loss": 0.8525, + "step": 5315 + }, + { + "epoch": 1.730644111906311, + "grad_norm": 1.2865791320800781, + "learning_rate": 3.7167764428340314e-05, + "loss": 0.8428, + "step": 5320 + }, + { + "epoch": 1.732270657124268, + "grad_norm": 1.4712680578231812, + "learning_rate": 3.7145212765986725e-05, + "loss": 0.8434, + "step": 5325 + }, + { + "epoch": 1.733897202342225, + "grad_norm": 1.481542706489563, + "learning_rate": 3.7122648160368125e-05, + "loss": 0.8259, + "step": 5330 + }, + { + "epoch": 1.7355237475601821, + "grad_norm": 1.4224997758865356, + "learning_rate": 3.71000706355318e-05, + "loss": 0.8625, + "step": 5335 + }, + { + "epoch": 1.7371502927781393, + "grad_norm": 1.240639567375183, + "learning_rate": 3.7077480215538854e-05, + "loss": 0.808, + "step": 5340 + }, + { + "epoch": 1.7387768379960962, + "grad_norm": 1.1594419479370117, + "learning_rate": 3.7054876924464075e-05, + "loss": 0.8381, + "step": 5345 + }, + { + "epoch": 1.7404033832140533, + "grad_norm": 1.259247064590454, + "learning_rate": 3.7032260786396025e-05, + "loss": 0.8485, + "step": 5350 + }, + { + "epoch": 1.7420299284320104, + "grad_norm": 1.2307862043380737, + "learning_rate": 3.700963182543691e-05, + "loss": 0.811, + "step": 5355 + }, + { + "epoch": 1.7436564736499673, + "grad_norm": 1.3291717767715454, + "learning_rate": 3.698699006570263e-05, + "loss": 0.819, + "step": 5360 + }, + { + "epoch": 1.7452830188679245, + "grad_norm": 1.530931830406189, + "learning_rate": 3.696433553132271e-05, + "loss": 0.8199, + "step": 5365 + }, + { + "epoch": 1.7469095640858816, + "grad_norm": 1.9178639650344849, + "learning_rate": 3.694166824644032e-05, + "loss": 0.8388, + "step": 5370 + }, + { + "epoch": 1.7485361093038385, + "grad_norm": 1.7192288637161255, + "learning_rate": 3.691898823521216e-05, + "loss": 0.8042, + "step": 5375 + }, + { + "epoch": 1.7501626545217959, + "grad_norm": 1.356046199798584, + "learning_rate": 3.6896295521808556e-05, + "loss": 0.8565, + "step": 5380 + }, + { + "epoch": 1.7517891997397528, + "grad_norm": 1.194600224494934, + "learning_rate": 3.6873590130413324e-05, + "loss": 0.8731, + "step": 5385 + }, + { + "epoch": 1.7534157449577097, + "grad_norm": 1.3915358781814575, + "learning_rate": 3.685087208522381e-05, + "loss": 0.8066, + "step": 5390 + }, + { + "epoch": 1.755042290175667, + "grad_norm": 1.3240704536437988, + "learning_rate": 3.682814141045085e-05, + "loss": 0.8341, + "step": 5395 + }, + { + "epoch": 1.756668835393624, + "grad_norm": 1.393480658531189, + "learning_rate": 3.6805398130318736e-05, + "loss": 0.8046, + "step": 5400 + }, + { + "epoch": 1.7582953806115809, + "grad_norm": 1.2181992530822754, + "learning_rate": 3.6782642269065177e-05, + "loss": 0.8375, + "step": 5405 + }, + { + "epoch": 1.7599219258295382, + "grad_norm": 1.2869491577148438, + "learning_rate": 3.675987385094131e-05, + "loss": 0.8052, + "step": 5410 + }, + { + "epoch": 1.7615484710474951, + "grad_norm": 1.1566784381866455, + "learning_rate": 3.673709290021166e-05, + "loss": 0.8293, + "step": 5415 + }, + { + "epoch": 1.763175016265452, + "grad_norm": 1.1554367542266846, + "learning_rate": 3.6714299441154084e-05, + "loss": 0.8528, + "step": 5420 + }, + { + "epoch": 1.7648015614834094, + "grad_norm": 1.191815972328186, + "learning_rate": 3.669149349805978e-05, + "loss": 0.8327, + "step": 5425 + }, + { + "epoch": 1.7664281067013663, + "grad_norm": 1.8547695875167847, + "learning_rate": 3.666867509523325e-05, + "loss": 0.8167, + "step": 5430 + }, + { + "epoch": 1.7680546519193232, + "grad_norm": 1.2833627462387085, + "learning_rate": 3.664584425699229e-05, + "loss": 0.8171, + "step": 5435 + }, + { + "epoch": 1.7696811971372806, + "grad_norm": 1.3713700771331787, + "learning_rate": 3.6623001007667926e-05, + "loss": 0.8223, + "step": 5440 + }, + { + "epoch": 1.7713077423552375, + "grad_norm": 1.2113951444625854, + "learning_rate": 3.660014537160441e-05, + "loss": 0.82, + "step": 5445 + }, + { + "epoch": 1.7729342875731944, + "grad_norm": 1.2681775093078613, + "learning_rate": 3.6577277373159216e-05, + "loss": 0.8192, + "step": 5450 + }, + { + "epoch": 1.7745608327911517, + "grad_norm": 1.2323765754699707, + "learning_rate": 3.6554397036702976e-05, + "loss": 0.8418, + "step": 5455 + }, + { + "epoch": 1.7761873780091086, + "grad_norm": 1.29707670211792, + "learning_rate": 3.6531504386619466e-05, + "loss": 0.7979, + "step": 5460 + }, + { + "epoch": 1.7778139232270656, + "grad_norm": 1.3160725831985474, + "learning_rate": 3.650859944730561e-05, + "loss": 0.8388, + "step": 5465 + }, + { + "epoch": 1.779440468445023, + "grad_norm": 1.3094955682754517, + "learning_rate": 3.648568224317141e-05, + "loss": 0.8094, + "step": 5470 + }, + { + "epoch": 1.7810670136629798, + "grad_norm": 1.5002540349960327, + "learning_rate": 3.646275279863993e-05, + "loss": 0.8532, + "step": 5475 + }, + { + "epoch": 1.7826935588809367, + "grad_norm": 1.4463096857070923, + "learning_rate": 3.6439811138147306e-05, + "loss": 0.8279, + "step": 5480 + }, + { + "epoch": 1.784320104098894, + "grad_norm": 1.274701476097107, + "learning_rate": 3.641685728614266e-05, + "loss": 0.8462, + "step": 5485 + }, + { + "epoch": 1.785946649316851, + "grad_norm": 1.5319132804870605, + "learning_rate": 3.639389126708813e-05, + "loss": 0.8364, + "step": 5490 + }, + { + "epoch": 1.7875731945348081, + "grad_norm": 1.2064110040664673, + "learning_rate": 3.637091310545882e-05, + "loss": 0.8131, + "step": 5495 + }, + { + "epoch": 1.7891997397527653, + "grad_norm": 1.426894187927246, + "learning_rate": 3.6347922825742766e-05, + "loss": 0.8498, + "step": 5500 + }, + { + "epoch": 1.7908262849707222, + "grad_norm": 1.027532935142517, + "learning_rate": 3.6324920452440904e-05, + "loss": 0.887, + "step": 5505 + }, + { + "epoch": 1.7924528301886793, + "grad_norm": 1.3164483308792114, + "learning_rate": 3.63019060100671e-05, + "loss": 0.8356, + "step": 5510 + }, + { + "epoch": 1.7940793754066364, + "grad_norm": 2.5683417320251465, + "learning_rate": 3.6278879523148045e-05, + "loss": 0.8399, + "step": 5515 + }, + { + "epoch": 1.7957059206245933, + "grad_norm": 1.281097412109375, + "learning_rate": 3.625584101622328e-05, + "loss": 0.8435, + "step": 5520 + }, + { + "epoch": 1.7973324658425505, + "grad_norm": 1.2374613285064697, + "learning_rate": 3.623279051384515e-05, + "loss": 0.8279, + "step": 5525 + }, + { + "epoch": 1.7989590110605076, + "grad_norm": 1.4182496070861816, + "learning_rate": 3.62097280405788e-05, + "loss": 0.8166, + "step": 5530 + }, + { + "epoch": 1.8005855562784645, + "grad_norm": 1.4572012424468994, + "learning_rate": 3.618665362100209e-05, + "loss": 0.7618, + "step": 5535 + }, + { + "epoch": 1.8022121014964216, + "grad_norm": 1.3700534105300903, + "learning_rate": 3.616356727970567e-05, + "loss": 0.8001, + "step": 5540 + }, + { + "epoch": 1.8038386467143788, + "grad_norm": 1.166533350944519, + "learning_rate": 3.614046904129286e-05, + "loss": 0.819, + "step": 5545 + }, + { + "epoch": 1.8054651919323357, + "grad_norm": 1.4602328538894653, + "learning_rate": 3.611735893037967e-05, + "loss": 0.8187, + "step": 5550 + }, + { + "epoch": 1.8070917371502928, + "grad_norm": 1.256103515625, + "learning_rate": 3.609423697159474e-05, + "loss": 0.8211, + "step": 5555 + }, + { + "epoch": 1.80871828236825, + "grad_norm": 1.5523267984390259, + "learning_rate": 3.607110318957937e-05, + "loss": 0.8395, + "step": 5560 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 1.3885524272918701, + "learning_rate": 3.604795760898745e-05, + "loss": 0.8329, + "step": 5565 + }, + { + "epoch": 1.811971372804164, + "grad_norm": 1.4500535726547241, + "learning_rate": 3.602480025448541e-05, + "loss": 0.8328, + "step": 5570 + }, + { + "epoch": 1.8135979180221211, + "grad_norm": 1.1918100118637085, + "learning_rate": 3.600163115075229e-05, + "loss": 0.8399, + "step": 5575 + }, + { + "epoch": 1.815224463240078, + "grad_norm": 1.3094122409820557, + "learning_rate": 3.5978450322479596e-05, + "loss": 0.8389, + "step": 5580 + }, + { + "epoch": 1.8168510084580352, + "grad_norm": 1.2879383563995361, + "learning_rate": 3.595525779437135e-05, + "loss": 0.8334, + "step": 5585 + }, + { + "epoch": 1.8184775536759923, + "grad_norm": 1.3823051452636719, + "learning_rate": 3.5932053591144056e-05, + "loss": 0.8306, + "step": 5590 + }, + { + "epoch": 1.8201040988939492, + "grad_norm": 1.3814623355865479, + "learning_rate": 3.5908837737526636e-05, + "loss": 0.8153, + "step": 5595 + }, + { + "epoch": 1.8217306441119063, + "grad_norm": 1.2130191326141357, + "learning_rate": 3.5885610258260435e-05, + "loss": 0.827, + "step": 5600 + }, + { + "epoch": 1.8233571893298635, + "grad_norm": 1.1492528915405273, + "learning_rate": 3.5862371178099206e-05, + "loss": 0.8401, + "step": 5605 + }, + { + "epoch": 1.8249837345478204, + "grad_norm": 1.5023655891418457, + "learning_rate": 3.583912052180903e-05, + "loss": 0.829, + "step": 5610 + }, + { + "epoch": 1.8266102797657775, + "grad_norm": 1.248105764389038, + "learning_rate": 3.581585831416837e-05, + "loss": 0.8157, + "step": 5615 + }, + { + "epoch": 1.8282368249837346, + "grad_norm": 1.2280492782592773, + "learning_rate": 3.579258457996796e-05, + "loss": 0.8378, + "step": 5620 + }, + { + "epoch": 1.8298633702016915, + "grad_norm": 1.1267839670181274, + "learning_rate": 3.576929934401084e-05, + "loss": 0.8353, + "step": 5625 + }, + { + "epoch": 1.8314899154196487, + "grad_norm": 1.1950007677078247, + "learning_rate": 3.5746002631112294e-05, + "loss": 0.796, + "step": 5630 + }, + { + "epoch": 1.8331164606376058, + "grad_norm": 1.6472572088241577, + "learning_rate": 3.5722694466099835e-05, + "loss": 0.8088, + "step": 5635 + }, + { + "epoch": 1.8347430058555627, + "grad_norm": 1.6403762102127075, + "learning_rate": 3.569937487381321e-05, + "loss": 0.8586, + "step": 5640 + }, + { + "epoch": 1.8363695510735198, + "grad_norm": 1.199302315711975, + "learning_rate": 3.5676043879104304e-05, + "loss": 0.8093, + "step": 5645 + }, + { + "epoch": 1.837996096291477, + "grad_norm": 1.1643785238265991, + "learning_rate": 3.565270150683718e-05, + "loss": 0.8444, + "step": 5650 + }, + { + "epoch": 1.8396226415094339, + "grad_norm": 1.1825166940689087, + "learning_rate": 3.5629347781888026e-05, + "loss": 0.7964, + "step": 5655 + }, + { + "epoch": 1.841249186727391, + "grad_norm": 1.3321552276611328, + "learning_rate": 3.560598272914512e-05, + "loss": 0.8056, + "step": 5660 + }, + { + "epoch": 1.8428757319453482, + "grad_norm": 1.3552656173706055, + "learning_rate": 3.55826063735088e-05, + "loss": 0.8356, + "step": 5665 + }, + { + "epoch": 1.844502277163305, + "grad_norm": 1.163568139076233, + "learning_rate": 3.555921873989148e-05, + "loss": 0.8251, + "step": 5670 + }, + { + "epoch": 1.8461288223812622, + "grad_norm": 1.2719063758850098, + "learning_rate": 3.5535819853217567e-05, + "loss": 0.7959, + "step": 5675 + }, + { + "epoch": 1.8477553675992193, + "grad_norm": 1.2836542129516602, + "learning_rate": 3.551240973842346e-05, + "loss": 0.8531, + "step": 5680 + }, + { + "epoch": 1.8493819128171762, + "grad_norm": 1.463977575302124, + "learning_rate": 3.548898842045756e-05, + "loss": 0.8953, + "step": 5685 + }, + { + "epoch": 1.8510084580351334, + "grad_norm": 1.1124452352523804, + "learning_rate": 3.546555592428016e-05, + "loss": 0.8182, + "step": 5690 + }, + { + "epoch": 1.8526350032530905, + "grad_norm": 1.2463699579238892, + "learning_rate": 3.5442112274863496e-05, + "loss": 0.8311, + "step": 5695 + }, + { + "epoch": 1.8542615484710474, + "grad_norm": 1.4317057132720947, + "learning_rate": 3.541865749719167e-05, + "loss": 0.8264, + "step": 5700 + }, + { + "epoch": 1.8558880936890045, + "grad_norm": 1.3176183700561523, + "learning_rate": 3.539519161626068e-05, + "loss": 0.8242, + "step": 5705 + }, + { + "epoch": 1.8575146389069617, + "grad_norm": 1.1687026023864746, + "learning_rate": 3.5371714657078305e-05, + "loss": 0.8278, + "step": 5710 + }, + { + "epoch": 1.8591411841249186, + "grad_norm": 1.219484567642212, + "learning_rate": 3.5348226644664165e-05, + "loss": 0.8141, + "step": 5715 + }, + { + "epoch": 1.8607677293428757, + "grad_norm": 1.347642183303833, + "learning_rate": 3.532472760404966e-05, + "loss": 0.8005, + "step": 5720 + }, + { + "epoch": 1.8623942745608328, + "grad_norm": 1.4095207452774048, + "learning_rate": 3.530121756027791e-05, + "loss": 0.8598, + "step": 5725 + }, + { + "epoch": 1.8640208197787898, + "grad_norm": 1.235945701599121, + "learning_rate": 3.5277696538403806e-05, + "loss": 0.8438, + "step": 5730 + }, + { + "epoch": 1.8656473649967469, + "grad_norm": 1.2110533714294434, + "learning_rate": 3.525416456349392e-05, + "loss": 0.8438, + "step": 5735 + }, + { + "epoch": 1.867273910214704, + "grad_norm": 1.4699175357818604, + "learning_rate": 3.5230621660626486e-05, + "loss": 0.8308, + "step": 5740 + }, + { + "epoch": 1.868900455432661, + "grad_norm": 1.6243605613708496, + "learning_rate": 3.520706785489139e-05, + "loss": 0.8455, + "step": 5745 + }, + { + "epoch": 1.870527000650618, + "grad_norm": 1.3754017353057861, + "learning_rate": 3.518350317139013e-05, + "loss": 0.8364, + "step": 5750 + }, + { + "epoch": 1.8721535458685752, + "grad_norm": 1.485510230064392, + "learning_rate": 3.515992763523584e-05, + "loss": 0.8504, + "step": 5755 + }, + { + "epoch": 1.873780091086532, + "grad_norm": 1.2265267372131348, + "learning_rate": 3.513634127155314e-05, + "loss": 0.8163, + "step": 5760 + }, + { + "epoch": 1.8754066363044892, + "grad_norm": 1.2929047346115112, + "learning_rate": 3.5112744105478276e-05, + "loss": 0.8324, + "step": 5765 + }, + { + "epoch": 1.8770331815224464, + "grad_norm": 1.1692301034927368, + "learning_rate": 3.508913616215894e-05, + "loss": 0.8042, + "step": 5770 + }, + { + "epoch": 1.8786597267404033, + "grad_norm": 1.1555005311965942, + "learning_rate": 3.507024206479406e-05, + "loss": 0.8208, + "step": 5775 + }, + { + "epoch": 1.8802862719583604, + "grad_norm": 1.3043735027313232, + "learning_rate": 3.504661478584359e-05, + "loss": 0.8242, + "step": 5780 + }, + { + "epoch": 1.8819128171763175, + "grad_norm": 1.2373569011688232, + "learning_rate": 3.502297680012327e-05, + "loss": 0.8169, + "step": 5785 + }, + { + "epoch": 1.8835393623942744, + "grad_norm": 1.0609958171844482, + "learning_rate": 3.4999328132824326e-05, + "loss": 0.8029, + "step": 5790 + }, + { + "epoch": 1.8851659076122316, + "grad_norm": 1.2642152309417725, + "learning_rate": 3.4975668809149375e-05, + "loss": 0.808, + "step": 5795 + }, + { + "epoch": 1.8867924528301887, + "grad_norm": 1.2736371755599976, + "learning_rate": 3.495199885431236e-05, + "loss": 0.7965, + "step": 5800 + }, + { + "epoch": 1.8884189980481456, + "grad_norm": 1.4471628665924072, + "learning_rate": 3.492831829353857e-05, + "loss": 0.8274, + "step": 5805 + }, + { + "epoch": 1.8900455432661027, + "grad_norm": 1.122707724571228, + "learning_rate": 3.4904627152064605e-05, + "loss": 0.836, + "step": 5810 + }, + { + "epoch": 1.8916720884840599, + "grad_norm": 1.312488317489624, + "learning_rate": 3.488092545513833e-05, + "loss": 0.8097, + "step": 5815 + }, + { + "epoch": 1.8932986337020168, + "grad_norm": 1.3100965023040771, + "learning_rate": 3.485721322801886e-05, + "loss": 0.8386, + "step": 5820 + }, + { + "epoch": 1.8949251789199741, + "grad_norm": 1.2820485830307007, + "learning_rate": 3.483349049597653e-05, + "loss": 0.8399, + "step": 5825 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.0979855060577393, + "learning_rate": 3.480975728429288e-05, + "loss": 0.8268, + "step": 5830 + }, + { + "epoch": 1.898178269355888, + "grad_norm": 1.330854058265686, + "learning_rate": 3.4786013618260615e-05, + "loss": 0.8239, + "step": 5835 + }, + { + "epoch": 1.8998048145738453, + "grad_norm": 1.3029650449752808, + "learning_rate": 3.476225952318356e-05, + "loss": 0.8208, + "step": 5840 + }, + { + "epoch": 1.9014313597918022, + "grad_norm": 1.558043360710144, + "learning_rate": 3.47384950243767e-05, + "loss": 0.8283, + "step": 5845 + }, + { + "epoch": 1.9030579050097591, + "grad_norm": 1.3287074565887451, + "learning_rate": 3.471472014716606e-05, + "loss": 0.8343, + "step": 5850 + }, + { + "epoch": 1.9046844502277165, + "grad_norm": 1.2479119300842285, + "learning_rate": 3.4690934916888754e-05, + "loss": 0.8606, + "step": 5855 + }, + { + "epoch": 1.9063109954456734, + "grad_norm": 1.2990680932998657, + "learning_rate": 3.4667139358892914e-05, + "loss": 0.85, + "step": 5860 + }, + { + "epoch": 1.9079375406636303, + "grad_norm": 1.369714617729187, + "learning_rate": 3.464333349853769e-05, + "loss": 0.8377, + "step": 5865 + }, + { + "epoch": 1.9095640858815877, + "grad_norm": 1.3558005094528198, + "learning_rate": 3.461951736119321e-05, + "loss": 0.8, + "step": 5870 + }, + { + "epoch": 1.9111906310995446, + "grad_norm": 1.611104965209961, + "learning_rate": 3.459569097224054e-05, + "loss": 0.8254, + "step": 5875 + }, + { + "epoch": 1.9128171763175015, + "grad_norm": 1.4185853004455566, + "learning_rate": 3.45718543570717e-05, + "loss": 0.8215, + "step": 5880 + }, + { + "epoch": 1.9144437215354588, + "grad_norm": 1.2258068323135376, + "learning_rate": 3.454800754108957e-05, + "loss": 0.823, + "step": 5885 + }, + { + "epoch": 1.9160702667534157, + "grad_norm": 1.2691441774368286, + "learning_rate": 3.452415054970793e-05, + "loss": 0.8076, + "step": 5890 + }, + { + "epoch": 1.9176968119713727, + "grad_norm": 1.3791495561599731, + "learning_rate": 3.45002834083514e-05, + "loss": 0.825, + "step": 5895 + }, + { + "epoch": 1.91932335718933, + "grad_norm": 1.2337132692337036, + "learning_rate": 3.4476406142455394e-05, + "loss": 0.8509, + "step": 5900 + }, + { + "epoch": 1.920949902407287, + "grad_norm": 1.3338109254837036, + "learning_rate": 3.445251877746616e-05, + "loss": 0.8261, + "step": 5905 + }, + { + "epoch": 1.9225764476252438, + "grad_norm": 1.1126174926757812, + "learning_rate": 3.442862133884067e-05, + "loss": 0.8357, + "step": 5910 + }, + { + "epoch": 1.9242029928432012, + "grad_norm": 1.359473466873169, + "learning_rate": 3.440471385204664e-05, + "loss": 0.8082, + "step": 5915 + }, + { + "epoch": 1.925829538061158, + "grad_norm": 1.277681589126587, + "learning_rate": 3.43807963425625e-05, + "loss": 0.8566, + "step": 5920 + }, + { + "epoch": 1.927456083279115, + "grad_norm": 1.444916844367981, + "learning_rate": 3.4356868835877376e-05, + "loss": 0.8023, + "step": 5925 + }, + { + "epoch": 1.9290826284970723, + "grad_norm": 1.6084500551223755, + "learning_rate": 3.433293135749101e-05, + "loss": 0.8095, + "step": 5930 + }, + { + "epoch": 1.9307091737150293, + "grad_norm": 1.2378942966461182, + "learning_rate": 3.430898393291381e-05, + "loss": 0.8054, + "step": 5935 + }, + { + "epoch": 1.9323357189329864, + "grad_norm": 1.2593529224395752, + "learning_rate": 3.4285026587666755e-05, + "loss": 0.8659, + "step": 5940 + }, + { + "epoch": 1.9339622641509435, + "grad_norm": 1.36800217628479, + "learning_rate": 3.426105934728141e-05, + "loss": 0.8067, + "step": 5945 + }, + { + "epoch": 1.9355888093689004, + "grad_norm": 1.403613567352295, + "learning_rate": 3.423708223729988e-05, + "loss": 0.8867, + "step": 5950 + }, + { + "epoch": 1.9372153545868576, + "grad_norm": 1.60515558719635, + "learning_rate": 3.4213095283274807e-05, + "loss": 0.8075, + "step": 5955 + }, + { + "epoch": 1.9388418998048147, + "grad_norm": 1.1290634870529175, + "learning_rate": 3.41890985107693e-05, + "loss": 0.8112, + "step": 5960 + }, + { + "epoch": 1.9404684450227716, + "grad_norm": 1.2028326988220215, + "learning_rate": 3.416509194535693e-05, + "loss": 0.8008, + "step": 5965 + }, + { + "epoch": 1.9420949902407287, + "grad_norm": 1.232129693031311, + "learning_rate": 3.414107561262173e-05, + "loss": 0.8055, + "step": 5970 + }, + { + "epoch": 1.9437215354586859, + "grad_norm": 1.546708345413208, + "learning_rate": 3.411704953815813e-05, + "loss": 0.8426, + "step": 5975 + }, + { + "epoch": 1.9453480806766428, + "grad_norm": 1.1460624933242798, + "learning_rate": 3.409301374757092e-05, + "loss": 0.8195, + "step": 5980 + }, + { + "epoch": 1.9469746258946, + "grad_norm": 1.1917885541915894, + "learning_rate": 3.406896826647528e-05, + "loss": 0.8214, + "step": 5985 + }, + { + "epoch": 1.948601171112557, + "grad_norm": 1.2794747352600098, + "learning_rate": 3.404491312049669e-05, + "loss": 0.8172, + "step": 5990 + }, + { + "epoch": 1.950227716330514, + "grad_norm": 1.2147552967071533, + "learning_rate": 3.4020848335270944e-05, + "loss": 0.8141, + "step": 5995 + }, + { + "epoch": 1.951854261548471, + "grad_norm": 1.4158474206924438, + "learning_rate": 3.3996773936444104e-05, + "loss": 0.8231, + "step": 6000 + }, + { + "epoch": 1.9534808067664282, + "grad_norm": 1.9261717796325684, + "learning_rate": 3.397268994967248e-05, + "loss": 0.8031, + "step": 6005 + }, + { + "epoch": 1.9551073519843851, + "grad_norm": 1.176213026046753, + "learning_rate": 3.39485964006226e-05, + "loss": 0.8255, + "step": 6010 + }, + { + "epoch": 1.9567338972023423, + "grad_norm": 1.2920970916748047, + "learning_rate": 3.392449331497117e-05, + "loss": 0.8478, + "step": 6015 + }, + { + "epoch": 1.9583604424202994, + "grad_norm": 1.188717007637024, + "learning_rate": 3.3900380718405096e-05, + "loss": 0.8296, + "step": 6020 + }, + { + "epoch": 1.9599869876382563, + "grad_norm": 1.333640217781067, + "learning_rate": 3.387625863662137e-05, + "loss": 0.7855, + "step": 6025 + }, + { + "epoch": 1.9616135328562134, + "grad_norm": 1.4747451543807983, + "learning_rate": 3.3852127095327115e-05, + "loss": 0.8323, + "step": 6030 + }, + { + "epoch": 1.9632400780741706, + "grad_norm": 1.3423820734024048, + "learning_rate": 3.3827986120239556e-05, + "loss": 0.8398, + "step": 6035 + }, + { + "epoch": 1.9648666232921275, + "grad_norm": 1.2037580013275146, + "learning_rate": 3.380383573708594e-05, + "loss": 0.8035, + "step": 6040 + }, + { + "epoch": 1.9664931685100846, + "grad_norm": 1.476690649986267, + "learning_rate": 3.377967597160355e-05, + "loss": 0.8277, + "step": 6045 + }, + { + "epoch": 1.9681197137280417, + "grad_norm": 1.291995882987976, + "learning_rate": 3.375550684953968e-05, + "loss": 0.8112, + "step": 6050 + }, + { + "epoch": 1.9697462589459986, + "grad_norm": 1.1101301908493042, + "learning_rate": 3.373132839665159e-05, + "loss": 0.8272, + "step": 6055 + }, + { + "epoch": 1.9713728041639558, + "grad_norm": 1.3454889059066772, + "learning_rate": 3.3707140638706445e-05, + "loss": 0.8229, + "step": 6060 + }, + { + "epoch": 1.972999349381913, + "grad_norm": 1.2014589309692383, + "learning_rate": 3.368294360148141e-05, + "loss": 0.7982, + "step": 6065 + }, + { + "epoch": 1.9746258945998698, + "grad_norm": 1.3191430568695068, + "learning_rate": 3.365873731076346e-05, + "loss": 0.8082, + "step": 6070 + }, + { + "epoch": 1.976252439817827, + "grad_norm": 1.420417308807373, + "learning_rate": 3.363452179234946e-05, + "loss": 0.8231, + "step": 6075 + }, + { + "epoch": 1.977878985035784, + "grad_norm": 1.7952125072479248, + "learning_rate": 3.3610297072046126e-05, + "loss": 0.8244, + "step": 6080 + }, + { + "epoch": 1.979505530253741, + "grad_norm": 1.126236081123352, + "learning_rate": 3.3586063175669957e-05, + "loss": 0.8134, + "step": 6085 + }, + { + "epoch": 1.9811320754716981, + "grad_norm": 1.29691481590271, + "learning_rate": 3.356182012904725e-05, + "loss": 0.8207, + "step": 6090 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 1.0842036008834839, + "learning_rate": 3.353756795801402e-05, + "loss": 0.8333, + "step": 6095 + }, + { + "epoch": 1.9843851659076122, + "grad_norm": 1.2627893686294556, + "learning_rate": 3.351330668841605e-05, + "loss": 0.8067, + "step": 6100 + }, + { + "epoch": 1.9860117111255693, + "grad_norm": 1.2320276498794556, + "learning_rate": 3.348903634610879e-05, + "loss": 0.8363, + "step": 6105 + }, + { + "epoch": 1.9876382563435264, + "grad_norm": 1.286724328994751, + "learning_rate": 3.346475695695737e-05, + "loss": 0.8569, + "step": 6110 + }, + { + "epoch": 1.9892648015614833, + "grad_norm": 1.47573721408844, + "learning_rate": 3.344046854683656e-05, + "loss": 0.7946, + "step": 6115 + }, + { + "epoch": 1.9908913467794405, + "grad_norm": 1.3220078945159912, + "learning_rate": 3.341617114163074e-05, + "loss": 0.8516, + "step": 6120 + }, + { + "epoch": 1.9925178919973976, + "grad_norm": 1.486553430557251, + "learning_rate": 3.3391864767233874e-05, + "loss": 0.8164, + "step": 6125 + }, + { + "epoch": 1.9941444372153545, + "grad_norm": 1.2705063819885254, + "learning_rate": 3.33675494495495e-05, + "loss": 0.7905, + "step": 6130 + }, + { + "epoch": 1.9957709824333116, + "grad_norm": 1.6260697841644287, + "learning_rate": 3.334322521449066e-05, + "loss": 0.8164, + "step": 6135 + }, + { + "epoch": 1.9973975276512688, + "grad_norm": 1.4485939741134644, + "learning_rate": 3.331889208797992e-05, + "loss": 0.8037, + "step": 6140 + }, + { + "epoch": 1.9990240728692257, + "grad_norm": 1.350242257118225, + "learning_rate": 3.3294550095949325e-05, + "loss": 0.8305, + "step": 6145 + }, + { + "epoch": 2.0, + "eval_f1": 0.8146817323620569, + "eval_loss": 0.41748046875, + "eval_precision": 0.8151118436115431, + "eval_recall": 0.8143230204193971, + "eval_runtime": 388.8818, + "eval_samples_per_second": 1011.708, + "eval_steps_per_second": 1.977, + "step": 6148 + }, + { + "epoch": 2.000650618087183, + "grad_norm": 1.0202901363372803, + "learning_rate": 3.327019926434036e-05, + "loss": 0.8055, + "step": 6150 + }, + { + "epoch": 2.00227716330514, + "grad_norm": 1.112686276435852, + "learning_rate": 3.3245839619103916e-05, + "loss": 0.7646, + "step": 6155 + }, + { + "epoch": 2.003903708523097, + "grad_norm": 1.3123199939727783, + "learning_rate": 3.3221471186200306e-05, + "loss": 0.7602, + "step": 6160 + }, + { + "epoch": 2.005530253741054, + "grad_norm": 1.5367430448532104, + "learning_rate": 3.319709399159919e-05, + "loss": 0.7892, + "step": 6165 + }, + { + "epoch": 2.007156798959011, + "grad_norm": 1.2061059474945068, + "learning_rate": 3.3172708061279564e-05, + "loss": 0.7487, + "step": 6170 + }, + { + "epoch": 2.008783344176968, + "grad_norm": 1.4139245748519897, + "learning_rate": 3.314831342122974e-05, + "loss": 0.7592, + "step": 6175 + }, + { + "epoch": 2.0104098893949254, + "grad_norm": 1.3764551877975464, + "learning_rate": 3.312391009744732e-05, + "loss": 0.775, + "step": 6180 + }, + { + "epoch": 2.0120364346128823, + "grad_norm": 1.278091311454773, + "learning_rate": 3.309949811593914e-05, + "loss": 0.7744, + "step": 6185 + }, + { + "epoch": 2.013662979830839, + "grad_norm": 1.315973162651062, + "learning_rate": 3.3075077502721266e-05, + "loss": 0.7733, + "step": 6190 + }, + { + "epoch": 2.0152895250487965, + "grad_norm": 1.5155901908874512, + "learning_rate": 3.3050648283818985e-05, + "loss": 0.769, + "step": 6195 + }, + { + "epoch": 2.0169160702667535, + "grad_norm": 1.4194787740707397, + "learning_rate": 3.3026210485266734e-05, + "loss": 0.7778, + "step": 6200 + }, + { + "epoch": 2.0185426154847104, + "grad_norm": 1.4269171953201294, + "learning_rate": 3.30017641331081e-05, + "loss": 0.7709, + "step": 6205 + }, + { + "epoch": 2.0201691607026677, + "grad_norm": 1.2979527711868286, + "learning_rate": 3.2977309253395786e-05, + "loss": 0.7731, + "step": 6210 + }, + { + "epoch": 2.0217957059206246, + "grad_norm": 1.4972949028015137, + "learning_rate": 3.295284587219159e-05, + "loss": 0.7926, + "step": 6215 + }, + { + "epoch": 2.0234222511385815, + "grad_norm": 1.3243732452392578, + "learning_rate": 3.292837401556635e-05, + "loss": 0.7917, + "step": 6220 + }, + { + "epoch": 2.025048796356539, + "grad_norm": 1.310394048690796, + "learning_rate": 3.290389370959995e-05, + "loss": 0.7628, + "step": 6225 + }, + { + "epoch": 2.026675341574496, + "grad_norm": 1.4164220094680786, + "learning_rate": 3.287940498038129e-05, + "loss": 0.7696, + "step": 6230 + }, + { + "epoch": 2.0283018867924527, + "grad_norm": 1.3164575099945068, + "learning_rate": 3.285490785400822e-05, + "loss": 0.7616, + "step": 6235 + }, + { + "epoch": 2.02992843201041, + "grad_norm": 1.3442150354385376, + "learning_rate": 3.283040235658756e-05, + "loss": 0.7425, + "step": 6240 + }, + { + "epoch": 2.031554977228367, + "grad_norm": 1.3715566396713257, + "learning_rate": 3.280588851423504e-05, + "loss": 0.7672, + "step": 6245 + }, + { + "epoch": 2.033181522446324, + "grad_norm": 1.5307257175445557, + "learning_rate": 3.27813663530753e-05, + "loss": 0.7613, + "step": 6250 + }, + { + "epoch": 2.0348080676642812, + "grad_norm": 1.4891488552093506, + "learning_rate": 3.275683589924181e-05, + "loss": 0.7574, + "step": 6255 + }, + { + "epoch": 2.036434612882238, + "grad_norm": 1.347203254699707, + "learning_rate": 3.273229717887692e-05, + "loss": 0.7754, + "step": 6260 + }, + { + "epoch": 2.038061158100195, + "grad_norm": 1.494616985321045, + "learning_rate": 3.270775021813177e-05, + "loss": 0.7735, + "step": 6265 + }, + { + "epoch": 2.0396877033181524, + "grad_norm": 1.3829678297042847, + "learning_rate": 3.268319504316627e-05, + "loss": 0.7305, + "step": 6270 + }, + { + "epoch": 2.0413142485361093, + "grad_norm": 1.4462776184082031, + "learning_rate": 3.26586316801491e-05, + "loss": 0.7744, + "step": 6275 + }, + { + "epoch": 2.0429407937540662, + "grad_norm": 1.400621771812439, + "learning_rate": 3.263406015525767e-05, + "loss": 0.7626, + "step": 6280 + }, + { + "epoch": 2.0445673389720236, + "grad_norm": 1.2794369459152222, + "learning_rate": 3.2609480494678055e-05, + "loss": 0.765, + "step": 6285 + }, + { + "epoch": 2.0461938841899805, + "grad_norm": 1.3490339517593384, + "learning_rate": 3.258489272460507e-05, + "loss": 0.7571, + "step": 6290 + }, + { + "epoch": 2.0478204294079374, + "grad_norm": 1.6398358345031738, + "learning_rate": 3.256029687124209e-05, + "loss": 0.7769, + "step": 6295 + }, + { + "epoch": 2.0494469746258948, + "grad_norm": 1.278414249420166, + "learning_rate": 3.2535692960801147e-05, + "loss": 0.7941, + "step": 6300 + }, + { + "epoch": 2.0510735198438517, + "grad_norm": 1.3432247638702393, + "learning_rate": 3.2511081019502875e-05, + "loss": 0.7568, + "step": 6305 + }, + { + "epoch": 2.0527000650618086, + "grad_norm": 1.3420158624649048, + "learning_rate": 3.248646107357643e-05, + "loss": 0.7802, + "step": 6310 + }, + { + "epoch": 2.054326610279766, + "grad_norm": 1.3872815370559692, + "learning_rate": 3.2461833149259516e-05, + "loss": 0.7779, + "step": 6315 + }, + { + "epoch": 2.055953155497723, + "grad_norm": 1.3763517141342163, + "learning_rate": 3.243719727279834e-05, + "loss": 0.7696, + "step": 6320 + }, + { + "epoch": 2.0575797007156797, + "grad_norm": 1.6830278635025024, + "learning_rate": 3.241255347044759e-05, + "loss": 0.7639, + "step": 6325 + }, + { + "epoch": 2.059206245933637, + "grad_norm": 1.3106756210327148, + "learning_rate": 3.2387901768470375e-05, + "loss": 0.7801, + "step": 6330 + }, + { + "epoch": 2.060832791151594, + "grad_norm": 1.5052073001861572, + "learning_rate": 3.236324219313826e-05, + "loss": 0.7492, + "step": 6335 + }, + { + "epoch": 2.062459336369551, + "grad_norm": 1.6413931846618652, + "learning_rate": 3.2338574770731174e-05, + "loss": 0.7791, + "step": 6340 + }, + { + "epoch": 2.0640858815875083, + "grad_norm": 1.4201804399490356, + "learning_rate": 3.231389952753742e-05, + "loss": 0.7586, + "step": 6345 + }, + { + "epoch": 2.065712426805465, + "grad_norm": 1.3374663591384888, + "learning_rate": 3.2289216489853613e-05, + "loss": 0.7553, + "step": 6350 + }, + { + "epoch": 2.067338972023422, + "grad_norm": 1.3363286256790161, + "learning_rate": 3.226452568398471e-05, + "loss": 0.7688, + "step": 6355 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.3866255283355713, + "learning_rate": 3.223982713624394e-05, + "loss": 0.7743, + "step": 6360 + }, + { + "epoch": 2.0705920624593364, + "grad_norm": 1.2702984809875488, + "learning_rate": 3.221512087295275e-05, + "loss": 0.7722, + "step": 6365 + }, + { + "epoch": 2.0722186076772933, + "grad_norm": 1.357702374458313, + "learning_rate": 3.2190406920440847e-05, + "loss": 0.7707, + "step": 6370 + }, + { + "epoch": 2.0738451528952506, + "grad_norm": 1.490100622177124, + "learning_rate": 3.216568530504611e-05, + "loss": 0.789, + "step": 6375 + }, + { + "epoch": 2.0754716981132075, + "grad_norm": 1.7559356689453125, + "learning_rate": 3.21409560531146e-05, + "loss": 0.765, + "step": 6380 + }, + { + "epoch": 2.0770982433311644, + "grad_norm": 1.53473699092865, + "learning_rate": 3.21162191910005e-05, + "loss": 0.7707, + "step": 6385 + }, + { + "epoch": 2.078724788549122, + "grad_norm": 1.3503605127334595, + "learning_rate": 3.2091474745066116e-05, + "loss": 0.8001, + "step": 6390 + }, + { + "epoch": 2.0803513337670787, + "grad_norm": 1.383765459060669, + "learning_rate": 3.2066722741681845e-05, + "loss": 0.7642, + "step": 6395 + }, + { + "epoch": 2.0819778789850356, + "grad_norm": 1.4698325395584106, + "learning_rate": 3.204196320722609e-05, + "loss": 0.7624, + "step": 6400 + }, + { + "epoch": 2.083604424202993, + "grad_norm": 1.5627089738845825, + "learning_rate": 3.2017196168085345e-05, + "loss": 0.7769, + "step": 6405 + }, + { + "epoch": 2.08523096942095, + "grad_norm": 1.467020869255066, + "learning_rate": 3.199242165065406e-05, + "loss": 0.7824, + "step": 6410 + }, + { + "epoch": 2.086857514638907, + "grad_norm": 1.406431794166565, + "learning_rate": 3.196763968133466e-05, + "loss": 0.768, + "step": 6415 + }, + { + "epoch": 2.088484059856864, + "grad_norm": 1.4981839656829834, + "learning_rate": 3.194285028653754e-05, + "loss": 0.747, + "step": 6420 + }, + { + "epoch": 2.090110605074821, + "grad_norm": 1.7578871250152588, + "learning_rate": 3.191805349268097e-05, + "loss": 0.7725, + "step": 6425 + }, + { + "epoch": 2.091737150292778, + "grad_norm": 1.316992998123169, + "learning_rate": 3.189324932619112e-05, + "loss": 0.7939, + "step": 6430 + }, + { + "epoch": 2.0933636955107353, + "grad_norm": 1.180167555809021, + "learning_rate": 3.1868437813502026e-05, + "loss": 0.7981, + "step": 6435 + }, + { + "epoch": 2.094990240728692, + "grad_norm": 3.879301071166992, + "learning_rate": 3.184361898105554e-05, + "loss": 0.7876, + "step": 6440 + }, + { + "epoch": 2.096616785946649, + "grad_norm": 1.503963589668274, + "learning_rate": 3.1818792855301316e-05, + "loss": 0.7608, + "step": 6445 + }, + { + "epoch": 2.0982433311646065, + "grad_norm": 1.274816870689392, + "learning_rate": 3.17939594626968e-05, + "loss": 0.7573, + "step": 6450 + }, + { + "epoch": 2.0998698763825634, + "grad_norm": 1.4709075689315796, + "learning_rate": 3.1769118829707156e-05, + "loss": 0.7764, + "step": 6455 + }, + { + "epoch": 2.1014964216005203, + "grad_norm": 1.880231499671936, + "learning_rate": 3.1744270982805266e-05, + "loss": 0.7731, + "step": 6460 + }, + { + "epoch": 2.1031229668184777, + "grad_norm": 1.441886067390442, + "learning_rate": 3.171941594847173e-05, + "loss": 0.7842, + "step": 6465 + }, + { + "epoch": 2.1047495120364346, + "grad_norm": 1.3664748668670654, + "learning_rate": 3.169455375319478e-05, + "loss": 0.7877, + "step": 6470 + }, + { + "epoch": 2.1063760572543915, + "grad_norm": 1.3562390804290771, + "learning_rate": 3.1669684423470275e-05, + "loss": 0.7369, + "step": 6475 + }, + { + "epoch": 2.108002602472349, + "grad_norm": 1.3197569847106934, + "learning_rate": 3.16448079858017e-05, + "loss": 0.7615, + "step": 6480 + }, + { + "epoch": 2.1096291476903057, + "grad_norm": 1.430830717086792, + "learning_rate": 3.16199244667001e-05, + "loss": 0.7672, + "step": 6485 + }, + { + "epoch": 2.1112556929082626, + "grad_norm": 1.397912621498108, + "learning_rate": 3.159503389268407e-05, + "loss": 0.7835, + "step": 6490 + }, + { + "epoch": 2.11288223812622, + "grad_norm": 1.385737419128418, + "learning_rate": 3.157013629027972e-05, + "loss": 0.7468, + "step": 6495 + }, + { + "epoch": 2.114508783344177, + "grad_norm": 1.3258286714553833, + "learning_rate": 3.154523168602066e-05, + "loss": 0.771, + "step": 6500 + }, + { + "epoch": 2.116135328562134, + "grad_norm": 1.2190338373184204, + "learning_rate": 3.152032010644796e-05, + "loss": 0.7459, + "step": 6505 + }, + { + "epoch": 2.117761873780091, + "grad_norm": 1.2382704019546509, + "learning_rate": 3.149540157811011e-05, + "loss": 0.7651, + "step": 6510 + }, + { + "epoch": 2.119388418998048, + "grad_norm": 1.2130050659179688, + "learning_rate": 3.147047612756302e-05, + "loss": 0.7533, + "step": 6515 + }, + { + "epoch": 2.121014964216005, + "grad_norm": 1.4269800186157227, + "learning_rate": 3.144554378136997e-05, + "loss": 0.7784, + "step": 6520 + }, + { + "epoch": 2.1226415094339623, + "grad_norm": 1.5174479484558105, + "learning_rate": 3.142060456610159e-05, + "loss": 0.7537, + "step": 6525 + }, + { + "epoch": 2.1242680546519193, + "grad_norm": 1.4713554382324219, + "learning_rate": 3.139565850833584e-05, + "loss": 0.7862, + "step": 6530 + }, + { + "epoch": 2.125894599869876, + "grad_norm": 1.3499360084533691, + "learning_rate": 3.137070563465796e-05, + "loss": 0.7704, + "step": 6535 + }, + { + "epoch": 2.1275211450878335, + "grad_norm": 1.244152307510376, + "learning_rate": 3.134574597166043e-05, + "loss": 0.7573, + "step": 6540 + }, + { + "epoch": 2.1291476903057904, + "grad_norm": 1.4201141595840454, + "learning_rate": 3.1320779545943034e-05, + "loss": 0.7885, + "step": 6545 + }, + { + "epoch": 2.130774235523748, + "grad_norm": 1.6688637733459473, + "learning_rate": 3.12958063841127e-05, + "loss": 0.7666, + "step": 6550 + }, + { + "epoch": 2.1324007807417047, + "grad_norm": 1.5988080501556396, + "learning_rate": 3.127082651278357e-05, + "loss": 0.7832, + "step": 6555 + }, + { + "epoch": 2.1340273259596616, + "grad_norm": 1.5584568977355957, + "learning_rate": 3.12458399585769e-05, + "loss": 0.7672, + "step": 6560 + }, + { + "epoch": 2.1356538711776185, + "grad_norm": 1.364772081375122, + "learning_rate": 3.1220846748121105e-05, + "loss": 0.7477, + "step": 6565 + }, + { + "epoch": 2.137280416395576, + "grad_norm": 1.3926069736480713, + "learning_rate": 3.1195846908051686e-05, + "loss": 0.7415, + "step": 6570 + }, + { + "epoch": 2.1389069616135328, + "grad_norm": 1.7917803525924683, + "learning_rate": 3.117084046501119e-05, + "loss": 0.7855, + "step": 6575 + }, + { + "epoch": 2.14053350683149, + "grad_norm": 1.430078148841858, + "learning_rate": 3.114582744564922e-05, + "loss": 0.7791, + "step": 6580 + }, + { + "epoch": 2.142160052049447, + "grad_norm": 1.5274559259414673, + "learning_rate": 3.112080787662237e-05, + "loss": 0.7539, + "step": 6585 + }, + { + "epoch": 2.143786597267404, + "grad_norm": 1.3352601528167725, + "learning_rate": 3.109578178459424e-05, + "loss": 0.7731, + "step": 6590 + }, + { + "epoch": 2.145413142485361, + "grad_norm": 1.4311776161193848, + "learning_rate": 3.107074919623536e-05, + "loss": 0.7864, + "step": 6595 + }, + { + "epoch": 2.147039687703318, + "grad_norm": 1.2369918823242188, + "learning_rate": 3.10457101382232e-05, + "loss": 0.7962, + "step": 6600 + }, + { + "epoch": 2.148666232921275, + "grad_norm": 1.3263933658599854, + "learning_rate": 3.102066463724209e-05, + "loss": 0.7338, + "step": 6605 + }, + { + "epoch": 2.1502927781392325, + "grad_norm": 1.1727105379104614, + "learning_rate": 3.0995612719983275e-05, + "loss": 0.7797, + "step": 6610 + }, + { + "epoch": 2.1519193233571894, + "grad_norm": 1.3748154640197754, + "learning_rate": 3.0970554413144805e-05, + "loss": 0.7504, + "step": 6615 + }, + { + "epoch": 2.1535458685751463, + "grad_norm": 1.4959580898284912, + "learning_rate": 3.094548974343154e-05, + "loss": 0.7727, + "step": 6620 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 1.3238462209701538, + "learning_rate": 3.0920418737555144e-05, + "loss": 0.7682, + "step": 6625 + }, + { + "epoch": 2.1567989590110606, + "grad_norm": 1.4884978532791138, + "learning_rate": 3.0895341422234006e-05, + "loss": 0.7932, + "step": 6630 + }, + { + "epoch": 2.1584255042290175, + "grad_norm": 1.130836009979248, + "learning_rate": 3.0870257824193263e-05, + "loss": 0.7547, + "step": 6635 + }, + { + "epoch": 2.160052049446975, + "grad_norm": 1.7785542011260986, + "learning_rate": 3.084516797016473e-05, + "loss": 0.7969, + "step": 6640 + }, + { + "epoch": 2.1616785946649317, + "grad_norm": 1.2669305801391602, + "learning_rate": 3.08200718868869e-05, + "loss": 0.7707, + "step": 6645 + }, + { + "epoch": 2.1633051398828886, + "grad_norm": 1.3831796646118164, + "learning_rate": 3.0794969601104894e-05, + "loss": 0.7602, + "step": 6650 + }, + { + "epoch": 2.164931685100846, + "grad_norm": 1.629812240600586, + "learning_rate": 3.076986113957044e-05, + "loss": 0.7896, + "step": 6655 + }, + { + "epoch": 2.166558230318803, + "grad_norm": 1.4540612697601318, + "learning_rate": 3.074474652904189e-05, + "loss": 0.8113, + "step": 6660 + }, + { + "epoch": 2.16818477553676, + "grad_norm": 1.4621168375015259, + "learning_rate": 3.071962579628408e-05, + "loss": 0.7947, + "step": 6665 + }, + { + "epoch": 2.169811320754717, + "grad_norm": 1.3441826105117798, + "learning_rate": 3.0694498968068416e-05, + "loss": 0.7777, + "step": 6670 + }, + { + "epoch": 2.171437865972674, + "grad_norm": 1.4090392589569092, + "learning_rate": 3.066936607117279e-05, + "loss": 0.7676, + "step": 6675 + }, + { + "epoch": 2.173064411190631, + "grad_norm": 1.3409441709518433, + "learning_rate": 3.064422713238158e-05, + "loss": 0.802, + "step": 6680 + }, + { + "epoch": 2.1746909564085883, + "grad_norm": 1.452314853668213, + "learning_rate": 3.061908217848556e-05, + "loss": 0.7889, + "step": 6685 + }, + { + "epoch": 2.1763175016265452, + "grad_norm": 1.2857537269592285, + "learning_rate": 3.0593931236281945e-05, + "loss": 0.7666, + "step": 6690 + }, + { + "epoch": 2.177944046844502, + "grad_norm": 1.4280742406845093, + "learning_rate": 3.056877433257434e-05, + "loss": 0.7542, + "step": 6695 + }, + { + "epoch": 2.1795705920624595, + "grad_norm": 1.506604552268982, + "learning_rate": 3.054361149417266e-05, + "loss": 0.7979, + "step": 6700 + }, + { + "epoch": 2.1811971372804164, + "grad_norm": 1.1827101707458496, + "learning_rate": 3.051844274789321e-05, + "loss": 0.7779, + "step": 6705 + }, + { + "epoch": 2.1828236824983733, + "grad_norm": 1.4131433963775635, + "learning_rate": 3.0493268120558525e-05, + "loss": 0.7589, + "step": 6710 + }, + { + "epoch": 2.1844502277163307, + "grad_norm": 1.4985051155090332, + "learning_rate": 3.046808763899745e-05, + "loss": 0.764, + "step": 6715 + }, + { + "epoch": 2.1860767729342876, + "grad_norm": 1.250264286994934, + "learning_rate": 3.0442901330045055e-05, + "loss": 0.7451, + "step": 6720 + }, + { + "epoch": 2.1877033181522445, + "grad_norm": 1.598387360572815, + "learning_rate": 3.041770922054262e-05, + "loss": 0.8075, + "step": 6725 + }, + { + "epoch": 2.189329863370202, + "grad_norm": 1.2596818208694458, + "learning_rate": 3.0392511337337608e-05, + "loss": 0.7644, + "step": 6730 + }, + { + "epoch": 2.1909564085881588, + "grad_norm": 1.2911043167114258, + "learning_rate": 3.0367307707283626e-05, + "loss": 0.757, + "step": 6735 + }, + { + "epoch": 2.1925829538061157, + "grad_norm": 1.3745313882827759, + "learning_rate": 3.0342098357240435e-05, + "loss": 0.7757, + "step": 6740 + }, + { + "epoch": 2.194209499024073, + "grad_norm": 1.5250049829483032, + "learning_rate": 3.031688331407386e-05, + "loss": 0.7696, + "step": 6745 + }, + { + "epoch": 2.19583604424203, + "grad_norm": 1.8375773429870605, + "learning_rate": 3.029166260465579e-05, + "loss": 0.7891, + "step": 6750 + }, + { + "epoch": 2.197462589459987, + "grad_norm": 1.2324512004852295, + "learning_rate": 3.0266436255864195e-05, + "loss": 0.7756, + "step": 6755 + }, + { + "epoch": 2.199089134677944, + "grad_norm": 1.3040711879730225, + "learning_rate": 3.0241204294583008e-05, + "loss": 0.7789, + "step": 6760 + }, + { + "epoch": 2.200715679895901, + "grad_norm": 1.5040578842163086, + "learning_rate": 3.0215966747702156e-05, + "loss": 0.7731, + "step": 6765 + }, + { + "epoch": 2.202342225113858, + "grad_norm": 1.648367166519165, + "learning_rate": 3.0190723642117553e-05, + "loss": 0.771, + "step": 6770 + }, + { + "epoch": 2.2039687703318154, + "grad_norm": 1.4410743713378906, + "learning_rate": 3.0165475004730993e-05, + "loss": 0.7742, + "step": 6775 + }, + { + "epoch": 2.2055953155497723, + "grad_norm": 1.4250338077545166, + "learning_rate": 3.0140220862450175e-05, + "loss": 0.765, + "step": 6780 + }, + { + "epoch": 2.207221860767729, + "grad_norm": 1.4945993423461914, + "learning_rate": 3.0114961242188677e-05, + "loss": 0.7396, + "step": 6785 + }, + { + "epoch": 2.2088484059856865, + "grad_norm": 1.232087254524231, + "learning_rate": 3.0089696170865912e-05, + "loss": 0.7611, + "step": 6790 + }, + { + "epoch": 2.2104749512036435, + "grad_norm": 1.6387182474136353, + "learning_rate": 3.0064425675407092e-05, + "loss": 0.7591, + "step": 6795 + }, + { + "epoch": 2.2121014964216004, + "grad_norm": 1.5776211023330688, + "learning_rate": 3.003914978274322e-05, + "loss": 0.8021, + "step": 6800 + }, + { + "epoch": 2.2137280416395577, + "grad_norm": 1.4892339706420898, + "learning_rate": 3.001386851981105e-05, + "loss": 0.7878, + "step": 6805 + }, + { + "epoch": 2.2153545868575146, + "grad_norm": 1.4057245254516602, + "learning_rate": 2.9988581913553054e-05, + "loss": 0.7594, + "step": 6810 + }, + { + "epoch": 2.2169811320754715, + "grad_norm": 1.1302647590637207, + "learning_rate": 2.99632899909174e-05, + "loss": 0.7576, + "step": 6815 + }, + { + "epoch": 2.218607677293429, + "grad_norm": 1.5885183811187744, + "learning_rate": 2.9937992778857927e-05, + "loss": 0.7442, + "step": 6820 + }, + { + "epoch": 2.220234222511386, + "grad_norm": 1.4258043766021729, + "learning_rate": 2.99126903043341e-05, + "loss": 0.7959, + "step": 6825 + }, + { + "epoch": 2.2218607677293427, + "grad_norm": 1.3153363466262817, + "learning_rate": 2.9887382594311003e-05, + "loss": 0.8074, + "step": 6830 + }, + { + "epoch": 2.2234873129473, + "grad_norm": 1.410018801689148, + "learning_rate": 2.9862069675759296e-05, + "loss": 0.7837, + "step": 6835 + }, + { + "epoch": 2.225113858165257, + "grad_norm": 1.5136847496032715, + "learning_rate": 2.9836751575655186e-05, + "loss": 0.7341, + "step": 6840 + }, + { + "epoch": 2.226740403383214, + "grad_norm": 1.5181306600570679, + "learning_rate": 2.9816493382985712e-05, + "loss": 0.7832, + "step": 6845 + }, + { + "epoch": 2.2283669486011712, + "grad_norm": 1.4423621892929077, + "learning_rate": 2.9791166024084942e-05, + "loss": 0.7519, + "step": 6850 + }, + { + "epoch": 2.229993493819128, + "grad_norm": 1.5381733179092407, + "learning_rate": 2.976583355919444e-05, + "loss": 0.7893, + "step": 6855 + }, + { + "epoch": 2.231620039037085, + "grad_norm": 1.4135688543319702, + "learning_rate": 2.974049601531126e-05, + "loss": 0.7854, + "step": 6860 + }, + { + "epoch": 2.2332465842550424, + "grad_norm": 1.1766645908355713, + "learning_rate": 2.9715153419437845e-05, + "loss": 0.771, + "step": 6865 + }, + { + "epoch": 2.2348731294729993, + "grad_norm": 1.2250036001205444, + "learning_rate": 2.968980579858203e-05, + "loss": 0.7846, + "step": 6870 + }, + { + "epoch": 2.2364996746909562, + "grad_norm": 1.6647272109985352, + "learning_rate": 2.9664453179757022e-05, + "loss": 0.773, + "step": 6875 + }, + { + "epoch": 2.2381262199089136, + "grad_norm": 1.4135332107543945, + "learning_rate": 2.963909558998133e-05, + "loss": 0.7241, + "step": 6880 + }, + { + "epoch": 2.2397527651268705, + "grad_norm": 1.2721209526062012, + "learning_rate": 2.961373305627877e-05, + "loss": 0.7553, + "step": 6885 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 1.3459265232086182, + "learning_rate": 2.9588365605678438e-05, + "loss": 0.7425, + "step": 6890 + }, + { + "epoch": 2.2430058555627848, + "grad_norm": 1.242512822151184, + "learning_rate": 2.9562993265214644e-05, + "loss": 0.7626, + "step": 6895 + }, + { + "epoch": 2.2446324007807417, + "grad_norm": 1.3302488327026367, + "learning_rate": 2.9537616061926938e-05, + "loss": 0.7514, + "step": 6900 + }, + { + "epoch": 2.2462589459986986, + "grad_norm": 1.547134280204773, + "learning_rate": 2.9512234022860018e-05, + "loss": 0.7551, + "step": 6905 + }, + { + "epoch": 2.247885491216656, + "grad_norm": 1.7118721008300781, + "learning_rate": 2.9486847175063786e-05, + "loss": 0.7646, + "step": 6910 + }, + { + "epoch": 2.249512036434613, + "grad_norm": 1.275580883026123, + "learning_rate": 2.9461455545593226e-05, + "loss": 0.7768, + "step": 6915 + }, + { + "epoch": 2.2511385816525697, + "grad_norm": 1.6420084238052368, + "learning_rate": 2.9436059161508423e-05, + "loss": 0.8, + "step": 6920 + }, + { + "epoch": 2.252765126870527, + "grad_norm": 1.4558192491531372, + "learning_rate": 2.9410658049874558e-05, + "loss": 0.7752, + "step": 6925 + }, + { + "epoch": 2.254391672088484, + "grad_norm": 1.368084192276001, + "learning_rate": 2.9385252237761806e-05, + "loss": 0.8031, + "step": 6930 + }, + { + "epoch": 2.2560182173064414, + "grad_norm": 1.5736048221588135, + "learning_rate": 2.935984175224539e-05, + "loss": 0.7476, + "step": 6935 + }, + { + "epoch": 2.2576447625243983, + "grad_norm": 1.3301265239715576, + "learning_rate": 2.933442662040549e-05, + "loss": 0.7854, + "step": 6940 + }, + { + "epoch": 2.259271307742355, + "grad_norm": 1.5797492265701294, + "learning_rate": 2.9309006869327254e-05, + "loss": 0.7662, + "step": 6945 + }, + { + "epoch": 2.260897852960312, + "grad_norm": 1.3316903114318848, + "learning_rate": 2.9283582526100738e-05, + "loss": 0.7569, + "step": 6950 + }, + { + "epoch": 2.2625243981782694, + "grad_norm": 1.4439982175827026, + "learning_rate": 2.92581536178209e-05, + "loss": 0.7604, + "step": 6955 + }, + { + "epoch": 2.2641509433962264, + "grad_norm": 1.4118748903274536, + "learning_rate": 2.9232720171587564e-05, + "loss": 0.7842, + "step": 6960 + }, + { + "epoch": 2.2657774886141837, + "grad_norm": 1.2566323280334473, + "learning_rate": 2.9207282214505383e-05, + "loss": 0.7779, + "step": 6965 + }, + { + "epoch": 2.2674040338321406, + "grad_norm": 1.4547405242919922, + "learning_rate": 2.9181839773683827e-05, + "loss": 0.7523, + "step": 6970 + }, + { + "epoch": 2.2690305790500975, + "grad_norm": 1.4687365293502808, + "learning_rate": 2.9156392876237138e-05, + "loss": 0.7969, + "step": 6975 + }, + { + "epoch": 2.2706571242680544, + "grad_norm": 1.6208761930465698, + "learning_rate": 2.9130941549284307e-05, + "loss": 0.7715, + "step": 6980 + }, + { + "epoch": 2.272283669486012, + "grad_norm": 1.4808614253997803, + "learning_rate": 2.9105485819949045e-05, + "loss": 0.771, + "step": 6985 + }, + { + "epoch": 2.2739102147039687, + "grad_norm": 1.253281593322754, + "learning_rate": 2.9080025715359773e-05, + "loss": 0.7629, + "step": 6990 + }, + { + "epoch": 2.275536759921926, + "grad_norm": 1.4285131692886353, + "learning_rate": 2.905456126264954e-05, + "loss": 0.7498, + "step": 6995 + }, + { + "epoch": 2.277163305139883, + "grad_norm": 1.8890243768692017, + "learning_rate": 2.9029092488956045e-05, + "loss": 0.7639, + "step": 7000 + }, + { + "epoch": 2.27878985035784, + "grad_norm": 1.6777067184448242, + "learning_rate": 2.9003619421421612e-05, + "loss": 0.7872, + "step": 7005 + }, + { + "epoch": 2.280416395575797, + "grad_norm": 1.4272592067718506, + "learning_rate": 2.8978142087193112e-05, + "loss": 0.7625, + "step": 7010 + }, + { + "epoch": 2.282042940793754, + "grad_norm": 1.3542779684066772, + "learning_rate": 2.8952660513421976e-05, + "loss": 0.7348, + "step": 7015 + }, + { + "epoch": 2.283669486011711, + "grad_norm": 1.5800836086273193, + "learning_rate": 2.8927174727264154e-05, + "loss": 0.7773, + "step": 7020 + }, + { + "epoch": 2.2852960312296684, + "grad_norm": 1.4163938760757446, + "learning_rate": 2.8901684755880087e-05, + "loss": 0.7899, + "step": 7025 + }, + { + "epoch": 2.2869225764476253, + "grad_norm": 1.5095348358154297, + "learning_rate": 2.8876190626434664e-05, + "loss": 0.759, + "step": 7030 + }, + { + "epoch": 2.288549121665582, + "grad_norm": 1.256325125694275, + "learning_rate": 2.885069236609722e-05, + "loss": 0.7839, + "step": 7035 + }, + { + "epoch": 2.290175666883539, + "grad_norm": 1.219807744026184, + "learning_rate": 2.8825190002041474e-05, + "loss": 0.782, + "step": 7040 + }, + { + "epoch": 2.2918022121014965, + "grad_norm": 1.44439697265625, + "learning_rate": 2.8799683561445545e-05, + "loss": 0.7896, + "step": 7045 + }, + { + "epoch": 2.2934287573194534, + "grad_norm": 1.5239802598953247, + "learning_rate": 2.8774173071491874e-05, + "loss": 0.7701, + "step": 7050 + }, + { + "epoch": 2.2950553025374107, + "grad_norm": 1.197445273399353, + "learning_rate": 2.874865855936722e-05, + "loss": 0.7838, + "step": 7055 + }, + { + "epoch": 2.2966818477553677, + "grad_norm": 1.3942207098007202, + "learning_rate": 2.8723140052262647e-05, + "loss": 0.7583, + "step": 7060 + }, + { + "epoch": 2.2983083929733246, + "grad_norm": 1.6031618118286133, + "learning_rate": 2.8697617577373447e-05, + "loss": 0.7612, + "step": 7065 + }, + { + "epoch": 2.2999349381912815, + "grad_norm": 1.6670364141464233, + "learning_rate": 2.8672091161899172e-05, + "loss": 0.7627, + "step": 7070 + }, + { + "epoch": 2.301561483409239, + "grad_norm": 1.3343385457992554, + "learning_rate": 2.864656083304355e-05, + "loss": 0.7521, + "step": 7075 + }, + { + "epoch": 2.3031880286271957, + "grad_norm": 1.286812663078308, + "learning_rate": 2.8621026618014483e-05, + "loss": 0.8077, + "step": 7080 + }, + { + "epoch": 2.304814573845153, + "grad_norm": 1.4556792974472046, + "learning_rate": 2.859548854402403e-05, + "loss": 0.7829, + "step": 7085 + }, + { + "epoch": 2.30644111906311, + "grad_norm": 1.4524141550064087, + "learning_rate": 2.8569946638288343e-05, + "loss": 0.7752, + "step": 7090 + }, + { + "epoch": 2.308067664281067, + "grad_norm": 1.3135497570037842, + "learning_rate": 2.8544400928027665e-05, + "loss": 0.8064, + "step": 7095 + }, + { + "epoch": 2.3096942094990243, + "grad_norm": 1.3401880264282227, + "learning_rate": 2.85188514404663e-05, + "loss": 0.7887, + "step": 7100 + }, + { + "epoch": 2.311320754716981, + "grad_norm": 1.3075878620147705, + "learning_rate": 2.8493298202832568e-05, + "loss": 0.7416, + "step": 7105 + }, + { + "epoch": 2.312947299934938, + "grad_norm": 1.2680739164352417, + "learning_rate": 2.8467741242358794e-05, + "loss": 0.7735, + "step": 7110 + }, + { + "epoch": 2.3145738451528954, + "grad_norm": 1.4084503650665283, + "learning_rate": 2.844218058628126e-05, + "loss": 0.7556, + "step": 7115 + }, + { + "epoch": 2.3162003903708523, + "grad_norm": 1.3460830450057983, + "learning_rate": 2.84166162618402e-05, + "loss": 0.7649, + "step": 7120 + }, + { + "epoch": 2.3178269355888093, + "grad_norm": 1.4924503564834595, + "learning_rate": 2.8391048296279742e-05, + "loss": 0.7803, + "step": 7125 + }, + { + "epoch": 2.3194534808067666, + "grad_norm": 1.4632081985473633, + "learning_rate": 2.8365476716847906e-05, + "loss": 0.7857, + "step": 7130 + }, + { + "epoch": 2.3210800260247235, + "grad_norm": 1.2531808614730835, + "learning_rate": 2.833990155079656e-05, + "loss": 0.7755, + "step": 7135 + }, + { + "epoch": 2.3227065712426804, + "grad_norm": 1.5589675903320312, + "learning_rate": 2.8314322825381394e-05, + "loss": 0.765, + "step": 7140 + }, + { + "epoch": 2.324333116460638, + "grad_norm": 1.3784013986587524, + "learning_rate": 2.8288740567861888e-05, + "loss": 0.7674, + "step": 7145 + }, + { + "epoch": 2.3259596616785947, + "grad_norm": 1.4949363470077515, + "learning_rate": 2.8263154805501297e-05, + "loss": 0.8119, + "step": 7150 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 1.3017836809158325, + "learning_rate": 2.8237565565566592e-05, + "loss": 0.7847, + "step": 7155 + }, + { + "epoch": 2.329212752114509, + "grad_norm": 1.428378939628601, + "learning_rate": 2.821197287532847e-05, + "loss": 0.7624, + "step": 7160 + }, + { + "epoch": 2.330839297332466, + "grad_norm": 1.4479926824569702, + "learning_rate": 2.8186376762061288e-05, + "loss": 0.7432, + "step": 7165 + }, + { + "epoch": 2.3324658425504228, + "grad_norm": 1.3771941661834717, + "learning_rate": 2.8160777253043074e-05, + "loss": 0.7773, + "step": 7170 + }, + { + "epoch": 2.33409238776838, + "grad_norm": 1.3941649198532104, + "learning_rate": 2.8135174375555446e-05, + "loss": 0.7566, + "step": 7175 + }, + { + "epoch": 2.335718932986337, + "grad_norm": 1.5908803939819336, + "learning_rate": 2.8109568156883633e-05, + "loss": 0.7433, + "step": 7180 + }, + { + "epoch": 2.337345478204294, + "grad_norm": 1.4220364093780518, + "learning_rate": 2.808395862431642e-05, + "loss": 0.7554, + "step": 7185 + }, + { + "epoch": 2.3389720234222513, + "grad_norm": 1.4723397493362427, + "learning_rate": 2.8058345805146113e-05, + "loss": 0.7498, + "step": 7190 + }, + { + "epoch": 2.340598568640208, + "grad_norm": 1.3126842975616455, + "learning_rate": 2.8032729726668538e-05, + "loss": 0.7445, + "step": 7195 + }, + { + "epoch": 2.342225113858165, + "grad_norm": 1.2721545696258545, + "learning_rate": 2.800711041618298e-05, + "loss": 0.7759, + "step": 7200 + }, + { + "epoch": 2.3438516590761225, + "grad_norm": 1.54392409324646, + "learning_rate": 2.7981487900992182e-05, + "loss": 0.7454, + "step": 7205 + }, + { + "epoch": 2.3454782042940794, + "grad_norm": 1.5344245433807373, + "learning_rate": 2.7955862208402283e-05, + "loss": 0.7856, + "step": 7210 + }, + { + "epoch": 2.3471047495120363, + "grad_norm": 1.541072130203247, + "learning_rate": 2.7930233365722825e-05, + "loss": 0.7947, + "step": 7215 + }, + { + "epoch": 2.3487312947299936, + "grad_norm": 1.413663387298584, + "learning_rate": 2.7904601400266707e-05, + "loss": 0.7722, + "step": 7220 + }, + { + "epoch": 2.3503578399479506, + "grad_norm": 1.239256739616394, + "learning_rate": 2.7878966339350132e-05, + "loss": 0.7598, + "step": 7225 + }, + { + "epoch": 2.3519843851659075, + "grad_norm": 1.4928994178771973, + "learning_rate": 2.7853328210292646e-05, + "loss": 0.7673, + "step": 7230 + }, + { + "epoch": 2.353610930383865, + "grad_norm": 1.2077562808990479, + "learning_rate": 2.7827687040417023e-05, + "loss": 0.7461, + "step": 7235 + }, + { + "epoch": 2.3552374756018217, + "grad_norm": 1.4283638000488281, + "learning_rate": 2.7802042857049292e-05, + "loss": 0.7524, + "step": 7240 + }, + { + "epoch": 2.3568640208197786, + "grad_norm": 1.4223048686981201, + "learning_rate": 2.7776395687518703e-05, + "loss": 0.775, + "step": 7245 + }, + { + "epoch": 2.358490566037736, + "grad_norm": 1.2481826543807983, + "learning_rate": 2.7750745559157682e-05, + "loss": 0.7735, + "step": 7250 + }, + { + "epoch": 2.360117111255693, + "grad_norm": 1.3343746662139893, + "learning_rate": 2.7725092499301797e-05, + "loss": 0.7394, + "step": 7255 + }, + { + "epoch": 2.36174365647365, + "grad_norm": 1.1968649625778198, + "learning_rate": 2.769943653528976e-05, + "loss": 0.7667, + "step": 7260 + }, + { + "epoch": 2.363370201691607, + "grad_norm": 1.3382525444030762, + "learning_rate": 2.767377769446336e-05, + "loss": 0.7656, + "step": 7265 + }, + { + "epoch": 2.364996746909564, + "grad_norm": 1.3819516897201538, + "learning_rate": 2.7648116004167457e-05, + "loss": 0.7739, + "step": 7270 + }, + { + "epoch": 2.366623292127521, + "grad_norm": 1.404935598373413, + "learning_rate": 2.7622451491749966e-05, + "loss": 0.7684, + "step": 7275 + }, + { + "epoch": 2.3682498373454783, + "grad_norm": 1.56549870967865, + "learning_rate": 2.7596784184561787e-05, + "loss": 0.8385, + "step": 7280 + }, + { + "epoch": 2.3698763825634352, + "grad_norm": 1.2172797918319702, + "learning_rate": 2.75711141099568e-05, + "loss": 0.7678, + "step": 7285 + }, + { + "epoch": 2.371502927781392, + "grad_norm": 1.7972877025604248, + "learning_rate": 2.754544129529184e-05, + "loss": 0.7614, + "step": 7290 + }, + { + "epoch": 2.3731294729993495, + "grad_norm": 1.8294897079467773, + "learning_rate": 2.7519765767926668e-05, + "loss": 0.806, + "step": 7295 + }, + { + "epoch": 2.3747560182173064, + "grad_norm": 1.380858302116394, + "learning_rate": 2.749408755522393e-05, + "loss": 0.7849, + "step": 7300 + }, + { + "epoch": 2.3763825634352633, + "grad_norm": 1.4070088863372803, + "learning_rate": 2.7468406684549123e-05, + "loss": 0.8108, + "step": 7305 + }, + { + "epoch": 2.3780091086532207, + "grad_norm": 1.400122880935669, + "learning_rate": 2.7442723183270598e-05, + "loss": 0.7843, + "step": 7310 + }, + { + "epoch": 2.3796356538711776, + "grad_norm": 1.3813008069992065, + "learning_rate": 2.7417037078759495e-05, + "loss": 0.7493, + "step": 7315 + }, + { + "epoch": 2.3812621990891345, + "grad_norm": 1.5987757444381714, + "learning_rate": 2.7391348398389734e-05, + "loss": 0.7702, + "step": 7320 + }, + { + "epoch": 2.382888744307092, + "grad_norm": 1.4951194524765015, + "learning_rate": 2.736565716953797e-05, + "loss": 0.7516, + "step": 7325 + }, + { + "epoch": 2.3845152895250488, + "grad_norm": 1.5555793046951294, + "learning_rate": 2.7339963419583603e-05, + "loss": 0.75, + "step": 7330 + }, + { + "epoch": 2.3861418347430057, + "grad_norm": 1.5632519721984863, + "learning_rate": 2.7314267175908675e-05, + "loss": 0.7513, + "step": 7335 + }, + { + "epoch": 2.387768379960963, + "grad_norm": 1.68852698802948, + "learning_rate": 2.7288568465897917e-05, + "loss": 0.7543, + "step": 7340 + }, + { + "epoch": 2.38939492517892, + "grad_norm": 1.4951677322387695, + "learning_rate": 2.7262867316938688e-05, + "loss": 0.7776, + "step": 7345 + }, + { + "epoch": 2.391021470396877, + "grad_norm": 1.3430877923965454, + "learning_rate": 2.723716375642093e-05, + "loss": 0.753, + "step": 7350 + }, + { + "epoch": 2.392648015614834, + "grad_norm": 1.3241239786148071, + "learning_rate": 2.7211457811737167e-05, + "loss": 0.767, + "step": 7355 + }, + { + "epoch": 2.394274560832791, + "grad_norm": 1.3422410488128662, + "learning_rate": 2.7185749510282467e-05, + "loss": 0.7746, + "step": 7360 + }, + { + "epoch": 2.395901106050748, + "grad_norm": 1.2942992448806763, + "learning_rate": 2.7160038879454392e-05, + "loss": 0.743, + "step": 7365 + }, + { + "epoch": 2.3975276512687054, + "grad_norm": 1.4469120502471924, + "learning_rate": 2.7134325946653e-05, + "loss": 0.7753, + "step": 7370 + }, + { + "epoch": 2.3991541964866623, + "grad_norm": 1.383994698524475, + "learning_rate": 2.710861073928081e-05, + "loss": 0.7385, + "step": 7375 + }, + { + "epoch": 2.4007807417046196, + "grad_norm": 1.3670079708099365, + "learning_rate": 2.7082893284742748e-05, + "loss": 0.7686, + "step": 7380 + }, + { + "epoch": 2.4024072869225765, + "grad_norm": 1.4173766374588013, + "learning_rate": 2.705717361044614e-05, + "loss": 0.7463, + "step": 7385 + }, + { + "epoch": 2.4040338321405335, + "grad_norm": 1.337874174118042, + "learning_rate": 2.7031451743800684e-05, + "loss": 0.7722, + "step": 7390 + }, + { + "epoch": 2.4056603773584904, + "grad_norm": 1.4389969110488892, + "learning_rate": 2.7005727712218416e-05, + "loss": 0.7374, + "step": 7395 + }, + { + "epoch": 2.4072869225764477, + "grad_norm": 1.385742425918579, + "learning_rate": 2.6980001543113652e-05, + "loss": 0.7763, + "step": 7400 + }, + { + "epoch": 2.4089134677944046, + "grad_norm": 1.436094045639038, + "learning_rate": 2.6954273263903028e-05, + "loss": 0.7908, + "step": 7405 + }, + { + "epoch": 2.410540013012362, + "grad_norm": 1.4230499267578125, + "learning_rate": 2.6928542902005406e-05, + "loss": 0.7282, + "step": 7410 + }, + { + "epoch": 2.412166558230319, + "grad_norm": 1.3393175601959229, + "learning_rate": 2.6902810484841856e-05, + "loss": 0.7507, + "step": 7415 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 1.6187865734100342, + "learning_rate": 2.6877076039835663e-05, + "loss": 0.7831, + "step": 7420 + }, + { + "epoch": 2.4154196486662327, + "grad_norm": 1.4352965354919434, + "learning_rate": 2.685133959441226e-05, + "loss": 0.7922, + "step": 7425 + }, + { + "epoch": 2.41704619388419, + "grad_norm": 1.4569371938705444, + "learning_rate": 2.682560117599921e-05, + "loss": 0.7701, + "step": 7430 + }, + { + "epoch": 2.418672739102147, + "grad_norm": 1.296837329864502, + "learning_rate": 2.6799860812026188e-05, + "loss": 0.7744, + "step": 7435 + }, + { + "epoch": 2.4202992843201043, + "grad_norm": 1.6270610094070435, + "learning_rate": 2.6774118529924934e-05, + "loss": 0.7878, + "step": 7440 + }, + { + "epoch": 2.4219258295380612, + "grad_norm": 1.3578767776489258, + "learning_rate": 2.674837435712923e-05, + "loss": 0.7567, + "step": 7445 + }, + { + "epoch": 2.423552374756018, + "grad_norm": 1.294484257698059, + "learning_rate": 2.6722628321074883e-05, + "loss": 0.7759, + "step": 7450 + }, + { + "epoch": 2.425178919973975, + "grad_norm": 1.2326703071594238, + "learning_rate": 2.6696880449199685e-05, + "loss": 0.7791, + "step": 7455 + }, + { + "epoch": 2.4268054651919324, + "grad_norm": 1.334506869316101, + "learning_rate": 2.6671130768943375e-05, + "loss": 0.7758, + "step": 7460 + }, + { + "epoch": 2.4284320104098893, + "grad_norm": 1.3414207696914673, + "learning_rate": 2.6645379307747625e-05, + "loss": 0.751, + "step": 7465 + }, + { + "epoch": 2.4300585556278467, + "grad_norm": 1.3746579885482788, + "learning_rate": 2.6619626093056005e-05, + "loss": 0.7558, + "step": 7470 + }, + { + "epoch": 2.4316851008458036, + "grad_norm": 1.4226535558700562, + "learning_rate": 2.659387115231395e-05, + "loss": 0.7692, + "step": 7475 + }, + { + "epoch": 2.4333116460637605, + "grad_norm": 1.333991289138794, + "learning_rate": 2.6568114512968732e-05, + "loss": 0.7466, + "step": 7480 + }, + { + "epoch": 2.4349381912817174, + "grad_norm": 1.268318772315979, + "learning_rate": 2.6542356202469455e-05, + "loss": 0.7569, + "step": 7485 + }, + { + "epoch": 2.4365647364996748, + "grad_norm": 1.5921881198883057, + "learning_rate": 2.651659624826698e-05, + "loss": 0.7807, + "step": 7490 + }, + { + "epoch": 2.4381912817176317, + "grad_norm": 1.4532439708709717, + "learning_rate": 2.6490834677813915e-05, + "loss": 0.7846, + "step": 7495 + }, + { + "epoch": 2.439817826935589, + "grad_norm": 1.5919448137283325, + "learning_rate": 2.646507151856462e-05, + "loss": 0.7657, + "step": 7500 + }, + { + "epoch": 2.441444372153546, + "grad_norm": 1.4111067056655884, + "learning_rate": 2.6439306797975126e-05, + "loss": 0.772, + "step": 7505 + }, + { + "epoch": 2.443070917371503, + "grad_norm": 1.2800402641296387, + "learning_rate": 2.641354054350313e-05, + "loss": 0.7976, + "step": 7510 + }, + { + "epoch": 2.4446974625894597, + "grad_norm": 1.3687649965286255, + "learning_rate": 2.6387772782607962e-05, + "loss": 0.794, + "step": 7515 + }, + { + "epoch": 2.446324007807417, + "grad_norm": 1.6004873514175415, + "learning_rate": 2.6362003542750568e-05, + "loss": 0.7706, + "step": 7520 + }, + { + "epoch": 2.447950553025374, + "grad_norm": 1.4323501586914062, + "learning_rate": 2.633623285139347e-05, + "loss": 0.7557, + "step": 7525 + }, + { + "epoch": 2.4495770982433314, + "grad_norm": 1.5661638975143433, + "learning_rate": 2.631046073600072e-05, + "loss": 0.7663, + "step": 7530 + }, + { + "epoch": 2.4512036434612883, + "grad_norm": 1.2775802612304688, + "learning_rate": 2.6284687224037908e-05, + "loss": 0.7737, + "step": 7535 + }, + { + "epoch": 2.452830188679245, + "grad_norm": 1.444387674331665, + "learning_rate": 2.625891234297209e-05, + "loss": 0.7594, + "step": 7540 + }, + { + "epoch": 2.4544567338972025, + "grad_norm": 1.5663361549377441, + "learning_rate": 2.6233136120271807e-05, + "loss": 0.7428, + "step": 7545 + }, + { + "epoch": 2.4560832791151594, + "grad_norm": 1.4844396114349365, + "learning_rate": 2.620735858340701e-05, + "loss": 0.7761, + "step": 7550 + }, + { + "epoch": 2.4577098243331164, + "grad_norm": 1.392500638961792, + "learning_rate": 2.6181579759849062e-05, + "loss": 0.7493, + "step": 7555 + }, + { + "epoch": 2.4593363695510737, + "grad_norm": 1.3076505661010742, + "learning_rate": 2.615579967707068e-05, + "loss": 0.7535, + "step": 7560 + }, + { + "epoch": 2.4609629147690306, + "grad_norm": 1.2808064222335815, + "learning_rate": 2.6130018362545956e-05, + "loss": 0.7775, + "step": 7565 + }, + { + "epoch": 2.4625894599869875, + "grad_norm": 1.6727182865142822, + "learning_rate": 2.6104235843750263e-05, + "loss": 0.7571, + "step": 7570 + }, + { + "epoch": 2.464216005204945, + "grad_norm": 1.3842878341674805, + "learning_rate": 2.607845214816026e-05, + "loss": 0.7851, + "step": 7575 + }, + { + "epoch": 2.465842550422902, + "grad_norm": 1.4012354612350464, + "learning_rate": 2.6052667303253887e-05, + "loss": 0.8006, + "step": 7580 + }, + { + "epoch": 2.4674690956408587, + "grad_norm": 1.151585578918457, + "learning_rate": 2.6026881336510267e-05, + "loss": 0.7838, + "step": 7585 + }, + { + "epoch": 2.469095640858816, + "grad_norm": 1.207098126411438, + "learning_rate": 2.6001094275409772e-05, + "loss": 0.7702, + "step": 7590 + }, + { + "epoch": 2.470722186076773, + "grad_norm": 1.322762131690979, + "learning_rate": 2.5975306147433882e-05, + "loss": 0.7688, + "step": 7595 + }, + { + "epoch": 2.47234873129473, + "grad_norm": 1.4287461042404175, + "learning_rate": 2.594951698006526e-05, + "loss": 0.7605, + "step": 7600 + }, + { + "epoch": 2.4739752765126872, + "grad_norm": 1.5663272142410278, + "learning_rate": 2.5923726800787657e-05, + "loss": 0.768, + "step": 7605 + }, + { + "epoch": 2.475601821730644, + "grad_norm": 1.4309062957763672, + "learning_rate": 2.589793563708589e-05, + "loss": 0.781, + "step": 7610 + }, + { + "epoch": 2.477228366948601, + "grad_norm": 1.3493362665176392, + "learning_rate": 2.587214351644586e-05, + "loss": 0.75, + "step": 7615 + }, + { + "epoch": 2.4788549121665584, + "grad_norm": 1.6738557815551758, + "learning_rate": 2.5846350466354457e-05, + "loss": 0.7859, + "step": 7620 + }, + { + "epoch": 2.4804814573845153, + "grad_norm": 1.623167634010315, + "learning_rate": 2.5820556514299572e-05, + "loss": 0.7457, + "step": 7625 + }, + { + "epoch": 2.482108002602472, + "grad_norm": 1.3531709909439087, + "learning_rate": 2.5794761687770058e-05, + "loss": 0.766, + "step": 7630 + }, + { + "epoch": 2.4837345478204296, + "grad_norm": 1.6604338884353638, + "learning_rate": 2.57689660142557e-05, + "loss": 0.777, + "step": 7635 + }, + { + "epoch": 2.4853610930383865, + "grad_norm": 1.460084319114685, + "learning_rate": 2.574316952124718e-05, + "loss": 0.7606, + "step": 7640 + }, + { + "epoch": 2.4869876382563434, + "grad_norm": 1.262992262840271, + "learning_rate": 2.5717372236236065e-05, + "loss": 0.7791, + "step": 7645 + }, + { + "epoch": 2.4886141834743007, + "grad_norm": 1.318466067314148, + "learning_rate": 2.5691574186714758e-05, + "loss": 0.7841, + "step": 7650 + }, + { + "epoch": 2.4902407286922577, + "grad_norm": 1.4100533723831177, + "learning_rate": 2.5665775400176466e-05, + "loss": 0.7715, + "step": 7655 + }, + { + "epoch": 2.4918672739102146, + "grad_norm": 1.2835301160812378, + "learning_rate": 2.563997590411521e-05, + "loss": 0.7464, + "step": 7660 + }, + { + "epoch": 2.493493819128172, + "grad_norm": 1.3279248476028442, + "learning_rate": 2.561417572602574e-05, + "loss": 0.8049, + "step": 7665 + }, + { + "epoch": 2.495120364346129, + "grad_norm": 1.3721544742584229, + "learning_rate": 2.558837489340355e-05, + "loss": 0.7525, + "step": 7670 + }, + { + "epoch": 2.4967469095640857, + "grad_norm": 1.3791251182556152, + "learning_rate": 2.5562573433744818e-05, + "loss": 0.7721, + "step": 7675 + }, + { + "epoch": 2.498373454782043, + "grad_norm": 1.3697301149368286, + "learning_rate": 2.5536771374546402e-05, + "loss": 0.7515, + "step": 7680 + }, + { + "epoch": 2.5, + "grad_norm": 1.5030325651168823, + "learning_rate": 2.5510968743305797e-05, + "loss": 0.7472, + "step": 7685 + }, + { + "epoch": 2.501626545217957, + "grad_norm": 1.443269968032837, + "learning_rate": 2.5485165567521086e-05, + "loss": 0.7678, + "step": 7690 + }, + { + "epoch": 2.5032530904359143, + "grad_norm": 1.55005943775177, + "learning_rate": 2.5459361874690974e-05, + "loss": 0.7786, + "step": 7695 + }, + { + "epoch": 2.504879635653871, + "grad_norm": 1.421657919883728, + "learning_rate": 2.5433557692314687e-05, + "loss": 0.7827, + "step": 7700 + }, + { + "epoch": 2.506506180871828, + "grad_norm": 1.3606561422348022, + "learning_rate": 2.540775304789197e-05, + "loss": 0.7739, + "step": 7705 + }, + { + "epoch": 2.5081327260897854, + "grad_norm": 1.2665021419525146, + "learning_rate": 2.5381947968923083e-05, + "loss": 0.7761, + "step": 7710 + }, + { + "epoch": 2.5097592713077423, + "grad_norm": 1.4803599119186401, + "learning_rate": 2.5356142482908724e-05, + "loss": 0.7591, + "step": 7715 + }, + { + "epoch": 2.5113858165256993, + "grad_norm": 1.4428778886795044, + "learning_rate": 2.5330336617350035e-05, + "loss": 0.7875, + "step": 7720 + }, + { + "epoch": 2.5130123617436566, + "grad_norm": 1.2702828645706177, + "learning_rate": 2.5304530399748587e-05, + "loss": 0.7794, + "step": 7725 + }, + { + "epoch": 2.5146389069616135, + "grad_norm": 1.3777793645858765, + "learning_rate": 2.527872385760629e-05, + "loss": 0.7983, + "step": 7730 + }, + { + "epoch": 2.5162654521795704, + "grad_norm": 1.3720378875732422, + "learning_rate": 2.5252917018425416e-05, + "loss": 0.7763, + "step": 7735 + }, + { + "epoch": 2.517891997397528, + "grad_norm": 1.3934985399246216, + "learning_rate": 2.5227109909708536e-05, + "loss": 0.7409, + "step": 7740 + }, + { + "epoch": 2.5195185426154847, + "grad_norm": 1.5128051042556763, + "learning_rate": 2.520130255895854e-05, + "loss": 0.7809, + "step": 7745 + }, + { + "epoch": 2.5211450878334416, + "grad_norm": 1.4889494180679321, + "learning_rate": 2.5175494993678555e-05, + "loss": 0.7883, + "step": 7750 + }, + { + "epoch": 2.522771633051399, + "grad_norm": 1.5664595365524292, + "learning_rate": 2.5149687241371937e-05, + "loss": 0.7528, + "step": 7755 + }, + { + "epoch": 2.524398178269356, + "grad_norm": 1.3623387813568115, + "learning_rate": 2.5123879329542255e-05, + "loss": 0.7521, + "step": 7760 + }, + { + "epoch": 2.526024723487313, + "grad_norm": 1.6652363538742065, + "learning_rate": 2.5098071285693226e-05, + "loss": 0.7718, + "step": 7765 + }, + { + "epoch": 2.52765126870527, + "grad_norm": 1.3834195137023926, + "learning_rate": 2.5072263137328723e-05, + "loss": 0.7615, + "step": 7770 + }, + { + "epoch": 2.529277813923227, + "grad_norm": 1.5325889587402344, + "learning_rate": 2.504645491195274e-05, + "loss": 0.745, + "step": 7775 + }, + { + "epoch": 2.530904359141184, + "grad_norm": 1.6550880670547485, + "learning_rate": 2.5020646637069324e-05, + "loss": 0.7786, + "step": 7780 + }, + { + "epoch": 2.5325309043591413, + "grad_norm": 1.6033899784088135, + "learning_rate": 2.4994838340182588e-05, + "loss": 0.7799, + "step": 7785 + }, + { + "epoch": 2.534157449577098, + "grad_norm": 1.3278917074203491, + "learning_rate": 2.496903004879669e-05, + "loss": 0.7665, + "step": 7790 + }, + { + "epoch": 2.5357839947950556, + "grad_norm": 1.3834562301635742, + "learning_rate": 2.494322179041575e-05, + "loss": 0.7458, + "step": 7795 + }, + { + "epoch": 2.5374105400130125, + "grad_norm": 1.3784668445587158, + "learning_rate": 2.4917413592543872e-05, + "loss": 0.7285, + "step": 7800 + }, + { + "epoch": 2.5390370852309694, + "grad_norm": 1.4264081716537476, + "learning_rate": 2.4891605482685087e-05, + "loss": 0.7477, + "step": 7805 + }, + { + "epoch": 2.5406636304489263, + "grad_norm": 1.4146618843078613, + "learning_rate": 2.4865797488343344e-05, + "loss": 0.7641, + "step": 7810 + }, + { + "epoch": 2.5422901756668836, + "grad_norm": 1.4429621696472168, + "learning_rate": 2.4839989637022447e-05, + "loss": 0.7557, + "step": 7815 + }, + { + "epoch": 2.5439167208848406, + "grad_norm": 1.4784046411514282, + "learning_rate": 2.4814181956226067e-05, + "loss": 0.7583, + "step": 7820 + }, + { + "epoch": 2.545543266102798, + "grad_norm": 1.2326250076293945, + "learning_rate": 2.4788374473457686e-05, + "loss": 0.7875, + "step": 7825 + }, + { + "epoch": 2.547169811320755, + "grad_norm": 1.62208092212677, + "learning_rate": 2.4762567216220575e-05, + "loss": 0.8161, + "step": 7830 + }, + { + "epoch": 2.5487963565387117, + "grad_norm": 1.449559211730957, + "learning_rate": 2.4736760212017765e-05, + "loss": 0.7459, + "step": 7835 + }, + { + "epoch": 2.5504229017566686, + "grad_norm": 1.5942325592041016, + "learning_rate": 2.471095348835203e-05, + "loss": 0.7268, + "step": 7840 + }, + { + "epoch": 2.552049446974626, + "grad_norm": 1.5314955711364746, + "learning_rate": 2.4685147072725816e-05, + "loss": 0.7847, + "step": 7845 + }, + { + "epoch": 2.553675992192583, + "grad_norm": 1.4110380411148071, + "learning_rate": 2.4659340992641264e-05, + "loss": 0.7562, + "step": 7850 + }, + { + "epoch": 2.5553025374105403, + "grad_norm": 1.387182593345642, + "learning_rate": 2.4633535275600146e-05, + "loss": 0.7495, + "step": 7855 + }, + { + "epoch": 2.556929082628497, + "grad_norm": 1.4227756261825562, + "learning_rate": 2.460772994910387e-05, + "loss": 0.7799, + "step": 7860 + }, + { + "epoch": 2.558555627846454, + "grad_norm": 1.8554261922836304, + "learning_rate": 2.4581925040653385e-05, + "loss": 0.7276, + "step": 7865 + }, + { + "epoch": 2.560182173064411, + "grad_norm": 1.3811417818069458, + "learning_rate": 2.4556120577749242e-05, + "loss": 0.7412, + "step": 7870 + }, + { + "epoch": 2.5618087182823683, + "grad_norm": 1.4243059158325195, + "learning_rate": 2.453031658789148e-05, + "loss": 0.7591, + "step": 7875 + }, + { + "epoch": 2.5634352635003252, + "grad_norm": 1.4668538570404053, + "learning_rate": 2.450451309857965e-05, + "loss": 0.7375, + "step": 7880 + }, + { + "epoch": 2.5650618087182826, + "grad_norm": 1.405526041984558, + "learning_rate": 2.4478710137312773e-05, + "loss": 0.7696, + "step": 7885 + }, + { + "epoch": 2.5666883539362395, + "grad_norm": 1.356564998626709, + "learning_rate": 2.4452907731589306e-05, + "loss": 0.7604, + "step": 7890 + }, + { + "epoch": 2.5683148991541964, + "grad_norm": 1.3351854085922241, + "learning_rate": 2.4427105908907097e-05, + "loss": 0.7292, + "step": 7895 + }, + { + "epoch": 2.5699414443721533, + "grad_norm": 1.331409215927124, + "learning_rate": 2.4401304696763397e-05, + "loss": 0.7868, + "step": 7900 + }, + { + "epoch": 2.5715679895901107, + "grad_norm": 1.5373462438583374, + "learning_rate": 2.4375504122654784e-05, + "loss": 0.787, + "step": 7905 + }, + { + "epoch": 2.5731945348080676, + "grad_norm": 1.63596510887146, + "learning_rate": 2.4349704214077185e-05, + "loss": 0.766, + "step": 7910 + }, + { + "epoch": 2.574821080026025, + "grad_norm": 1.4256339073181152, + "learning_rate": 2.4323904998525783e-05, + "loss": 0.7794, + "step": 7915 + }, + { + "epoch": 2.576447625243982, + "grad_norm": 1.401673436164856, + "learning_rate": 2.4298106503495046e-05, + "loss": 0.7452, + "step": 7920 + }, + { + "epoch": 2.5780741704619388, + "grad_norm": 1.5896552801132202, + "learning_rate": 2.4272308756478677e-05, + "loss": 0.7735, + "step": 7925 + }, + { + "epoch": 2.5797007156798957, + "grad_norm": 1.6745907068252563, + "learning_rate": 2.424651178496955e-05, + "loss": 0.7708, + "step": 7930 + }, + { + "epoch": 2.581327260897853, + "grad_norm": 1.3708373308181763, + "learning_rate": 2.4220715616459765e-05, + "loss": 0.7689, + "step": 7935 + }, + { + "epoch": 2.58295380611581, + "grad_norm": 1.6794244050979614, + "learning_rate": 2.4194920278440508e-05, + "loss": 0.7839, + "step": 7940 + }, + { + "epoch": 2.5845803513337673, + "grad_norm": 1.4708285331726074, + "learning_rate": 2.4169125798402127e-05, + "loss": 0.7917, + "step": 7945 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 1.3197507858276367, + "learning_rate": 2.4143332203834017e-05, + "loss": 0.7687, + "step": 7950 + }, + { + "epoch": 2.587833441769681, + "grad_norm": 1.431296706199646, + "learning_rate": 2.411753952222468e-05, + "loss": 0.7567, + "step": 7955 + }, + { + "epoch": 2.589459986987638, + "grad_norm": 1.161069631576538, + "learning_rate": 2.409174778106158e-05, + "loss": 0.7453, + "step": 7960 + }, + { + "epoch": 2.5910865322055954, + "grad_norm": 1.5825971364974976, + "learning_rate": 2.4065957007831235e-05, + "loss": 0.7435, + "step": 7965 + }, + { + "epoch": 2.5927130774235523, + "grad_norm": 1.4699854850769043, + "learning_rate": 2.4040167230019102e-05, + "loss": 0.7798, + "step": 7970 + }, + { + "epoch": 2.5943396226415096, + "grad_norm": 1.438014030456543, + "learning_rate": 2.4014378475109587e-05, + "loss": 0.7799, + "step": 7975 + }, + { + "epoch": 2.5959661678594665, + "grad_norm": 1.1449774503707886, + "learning_rate": 2.3988590770585993e-05, + "loss": 0.7306, + "step": 7980 + }, + { + "epoch": 2.5975927130774235, + "grad_norm": 1.258499264717102, + "learning_rate": 2.3962804143930543e-05, + "loss": 0.7258, + "step": 7985 + }, + { + "epoch": 2.5992192582953804, + "grad_norm": 1.4304072856903076, + "learning_rate": 2.3937018622624247e-05, + "loss": 0.7615, + "step": 7990 + }, + { + "epoch": 2.6008458035133377, + "grad_norm": 1.5716931819915771, + "learning_rate": 2.3911234234146993e-05, + "loss": 0.7814, + "step": 7995 + }, + { + "epoch": 2.6024723487312946, + "grad_norm": 1.3027786016464233, + "learning_rate": 2.388545100597743e-05, + "loss": 0.7495, + "step": 8000 + }, + { + "epoch": 2.604098893949252, + "grad_norm": 1.321679949760437, + "learning_rate": 2.3859668965592993e-05, + "loss": 0.7509, + "step": 8005 + }, + { + "epoch": 2.605725439167209, + "grad_norm": 1.5215293169021606, + "learning_rate": 2.383388814046983e-05, + "loss": 0.7502, + "step": 8010 + }, + { + "epoch": 2.607351984385166, + "grad_norm": 1.2832412719726562, + "learning_rate": 2.3808108558082818e-05, + "loss": 0.7703, + "step": 8015 + }, + { + "epoch": 2.6089785296031227, + "grad_norm": 1.1952447891235352, + "learning_rate": 2.3782330245905475e-05, + "loss": 0.749, + "step": 8020 + }, + { + "epoch": 2.61060507482108, + "grad_norm": 1.41889226436615, + "learning_rate": 2.3756553231409998e-05, + "loss": 0.7808, + "step": 8025 + }, + { + "epoch": 2.612231620039037, + "grad_norm": 1.2736446857452393, + "learning_rate": 2.3730777542067185e-05, + "loss": 0.7645, + "step": 8030 + }, + { + "epoch": 2.6138581652569943, + "grad_norm": 1.4455156326293945, + "learning_rate": 2.3705003205346434e-05, + "loss": 0.7864, + "step": 8035 + }, + { + "epoch": 2.6154847104749512, + "grad_norm": 1.2804951667785645, + "learning_rate": 2.367923024871567e-05, + "loss": 0.7729, + "step": 8040 + }, + { + "epoch": 2.617111255692908, + "grad_norm": 1.50819993019104, + "learning_rate": 2.36534586996414e-05, + "loss": 0.8001, + "step": 8045 + }, + { + "epoch": 2.618737800910865, + "grad_norm": 1.3382127285003662, + "learning_rate": 2.362768858558858e-05, + "loss": 0.7639, + "step": 8050 + }, + { + "epoch": 2.6203643461288224, + "grad_norm": 1.3726097345352173, + "learning_rate": 2.3601919934020667e-05, + "loss": 0.7515, + "step": 8055 + }, + { + "epoch": 2.6219908913467793, + "grad_norm": 1.6643213033676147, + "learning_rate": 2.357615277239954e-05, + "loss": 0.7628, + "step": 8060 + }, + { + "epoch": 2.6236174365647367, + "grad_norm": 1.6360995769500732, + "learning_rate": 2.355038712818551e-05, + "loss": 0.7542, + "step": 8065 + }, + { + "epoch": 2.6252439817826936, + "grad_norm": 1.389898419380188, + "learning_rate": 2.352462302883727e-05, + "loss": 0.7455, + "step": 8070 + }, + { + "epoch": 2.6268705270006505, + "grad_norm": 1.3939239978790283, + "learning_rate": 2.349886050181183e-05, + "loss": 0.7572, + "step": 8075 + }, + { + "epoch": 2.6284970722186074, + "grad_norm": 1.5887764692306519, + "learning_rate": 2.3473099574564584e-05, + "loss": 0.7715, + "step": 8080 + }, + { + "epoch": 2.6301236174365648, + "grad_norm": 1.3532639741897583, + "learning_rate": 2.3447340274549163e-05, + "loss": 0.7684, + "step": 8085 + }, + { + "epoch": 2.6317501626545217, + "grad_norm": 1.5505973100662231, + "learning_rate": 2.3421582629217507e-05, + "loss": 0.7801, + "step": 8090 + }, + { + "epoch": 2.633376707872479, + "grad_norm": 1.2806953191757202, + "learning_rate": 2.3395826666019762e-05, + "loss": 0.7356, + "step": 8095 + }, + { + "epoch": 2.635003253090436, + "grad_norm": 1.4974077939987183, + "learning_rate": 2.3370072412404318e-05, + "loss": 0.7319, + "step": 8100 + }, + { + "epoch": 2.636629798308393, + "grad_norm": 1.2973668575286865, + "learning_rate": 2.3344319895817686e-05, + "loss": 0.8082, + "step": 8105 + }, + { + "epoch": 2.63825634352635, + "grad_norm": 1.4212173223495483, + "learning_rate": 2.3318569143704597e-05, + "loss": 0.7938, + "step": 8110 + }, + { + "epoch": 2.639882888744307, + "grad_norm": 1.1959309577941895, + "learning_rate": 2.329282018350783e-05, + "loss": 0.7707, + "step": 8115 + }, + { + "epoch": 2.641509433962264, + "grad_norm": 1.3446601629257202, + "learning_rate": 2.3267073042668318e-05, + "loss": 0.77, + "step": 8120 + }, + { + "epoch": 2.6431359791802214, + "grad_norm": 1.1607987880706787, + "learning_rate": 2.3241327748625003e-05, + "loss": 0.7605, + "step": 8125 + }, + { + "epoch": 2.6447625243981783, + "grad_norm": 1.2222195863723755, + "learning_rate": 2.3215584328814905e-05, + "loss": 0.7877, + "step": 8130 + }, + { + "epoch": 2.646389069616135, + "grad_norm": 1.3480989933013916, + "learning_rate": 2.3189842810673e-05, + "loss": 0.742, + "step": 8135 + }, + { + "epoch": 2.6480156148340925, + "grad_norm": 1.417054533958435, + "learning_rate": 2.316410322163227e-05, + "loss": 0.7438, + "step": 8140 + }, + { + "epoch": 2.6496421600520494, + "grad_norm": 1.5921944379806519, + "learning_rate": 2.3138365589123635e-05, + "loss": 0.7614, + "step": 8145 + }, + { + "epoch": 2.6512687052700064, + "grad_norm": 1.4190773963928223, + "learning_rate": 2.3112629940575928e-05, + "loss": 0.769, + "step": 8150 + }, + { + "epoch": 2.6528952504879637, + "grad_norm": 1.4252692461013794, + "learning_rate": 2.3086896303415858e-05, + "loss": 0.7882, + "step": 8155 + }, + { + "epoch": 2.6545217957059206, + "grad_norm": 1.2739335298538208, + "learning_rate": 2.3061164705068016e-05, + "loss": 0.7826, + "step": 8160 + }, + { + "epoch": 2.6561483409238775, + "grad_norm": 1.4117742776870728, + "learning_rate": 2.3035435172954782e-05, + "loss": 0.7598, + "step": 8165 + }, + { + "epoch": 2.657774886141835, + "grad_norm": 1.5920547246932983, + "learning_rate": 2.3009707734496367e-05, + "loss": 0.7613, + "step": 8170 + }, + { + "epoch": 2.659401431359792, + "grad_norm": 1.9303627014160156, + "learning_rate": 2.298398241711073e-05, + "loss": 0.7703, + "step": 8175 + }, + { + "epoch": 2.6610279765777487, + "grad_norm": 1.5395746231079102, + "learning_rate": 2.2958259248213594e-05, + "loss": 0.7547, + "step": 8180 + }, + { + "epoch": 2.662654521795706, + "grad_norm": 1.4378993511199951, + "learning_rate": 2.293253825521836e-05, + "loss": 0.7803, + "step": 8185 + }, + { + "epoch": 2.664281067013663, + "grad_norm": 1.5083246231079102, + "learning_rate": 2.290681946553615e-05, + "loss": 0.7818, + "step": 8190 + }, + { + "epoch": 2.66590761223162, + "grad_norm": 1.3597829341888428, + "learning_rate": 2.2881102906575686e-05, + "loss": 0.7499, + "step": 8195 + }, + { + "epoch": 2.6675341574495772, + "grad_norm": 1.3747528791427612, + "learning_rate": 2.2855388605743356e-05, + "loss": 0.7632, + "step": 8200 + }, + { + "epoch": 2.669160702667534, + "grad_norm": 1.4150460958480835, + "learning_rate": 2.282967659044313e-05, + "loss": 0.7325, + "step": 8205 + }, + { + "epoch": 2.6707872478854915, + "grad_norm": 1.2788463830947876, + "learning_rate": 2.2803966888076533e-05, + "loss": 0.7477, + "step": 8210 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 1.2472187280654907, + "learning_rate": 2.2778259526042635e-05, + "loss": 0.7476, + "step": 8215 + }, + { + "epoch": 2.6740403383214053, + "grad_norm": 1.292850136756897, + "learning_rate": 2.2752554531738008e-05, + "loss": 0.7732, + "step": 8220 + }, + { + "epoch": 2.675666883539362, + "grad_norm": 1.2652130126953125, + "learning_rate": 2.272685193255671e-05, + "loss": 0.7688, + "step": 8225 + }, + { + "epoch": 2.6772934287573196, + "grad_norm": 1.4194849729537964, + "learning_rate": 2.2701151755890215e-05, + "loss": 0.8083, + "step": 8230 + }, + { + "epoch": 2.6789199739752765, + "grad_norm": 1.3642696142196655, + "learning_rate": 2.2675454029127458e-05, + "loss": 0.7539, + "step": 8235 + }, + { + "epoch": 2.680546519193234, + "grad_norm": 1.624185562133789, + "learning_rate": 2.264975877965473e-05, + "loss": 0.7476, + "step": 8240 + }, + { + "epoch": 2.6821730644111907, + "grad_norm": 1.5707036256790161, + "learning_rate": 2.262406603485571e-05, + "loss": 0.7593, + "step": 8245 + }, + { + "epoch": 2.6837996096291477, + "grad_norm": 1.309596061706543, + "learning_rate": 2.2598375822111366e-05, + "loss": 0.7608, + "step": 8250 + }, + { + "epoch": 2.6854261548471046, + "grad_norm": 1.4106091260910034, + "learning_rate": 2.257268816880003e-05, + "loss": 0.7842, + "step": 8255 + }, + { + "epoch": 2.687052700065062, + "grad_norm": 1.4927936792373657, + "learning_rate": 2.254700310229724e-05, + "loss": 0.7589, + "step": 8260 + }, + { + "epoch": 2.688679245283019, + "grad_norm": 1.2755460739135742, + "learning_rate": 2.2521320649975823e-05, + "loss": 0.7866, + "step": 8265 + }, + { + "epoch": 2.690305790500976, + "grad_norm": 1.5741503238677979, + "learning_rate": 2.2495640839205802e-05, + "loss": 0.769, + "step": 8270 + }, + { + "epoch": 2.691932335718933, + "grad_norm": 1.3726242780685425, + "learning_rate": 2.2469963697354394e-05, + "loss": 0.7783, + "step": 8275 + }, + { + "epoch": 2.69355888093689, + "grad_norm": 1.4581364393234253, + "learning_rate": 2.2444289251785942e-05, + "loss": 0.7804, + "step": 8280 + }, + { + "epoch": 2.695185426154847, + "grad_norm": 1.456614375114441, + "learning_rate": 2.241861752986197e-05, + "loss": 0.7308, + "step": 8285 + }, + { + "epoch": 2.6968119713728043, + "grad_norm": 1.4411842823028564, + "learning_rate": 2.2392948558941045e-05, + "loss": 0.7696, + "step": 8290 + }, + { + "epoch": 2.698438516590761, + "grad_norm": 1.5765049457550049, + "learning_rate": 2.2367282366378842e-05, + "loss": 0.7537, + "step": 8295 + }, + { + "epoch": 2.7000650618087185, + "grad_norm": 1.2976040840148926, + "learning_rate": 2.2341618979528042e-05, + "loss": 0.7524, + "step": 8300 + }, + { + "epoch": 2.7016916070266754, + "grad_norm": 1.2431724071502686, + "learning_rate": 2.2315958425738373e-05, + "loss": 0.7577, + "step": 8305 + }, + { + "epoch": 2.7033181522446323, + "grad_norm": 1.474848747253418, + "learning_rate": 2.2290300732356503e-05, + "loss": 0.7655, + "step": 8310 + }, + { + "epoch": 2.7049446974625893, + "grad_norm": 1.5613243579864502, + "learning_rate": 2.2264645926726086e-05, + "loss": 0.733, + "step": 8315 + }, + { + "epoch": 2.7065712426805466, + "grad_norm": 1.5631688833236694, + "learning_rate": 2.2238994036187672e-05, + "loss": 0.7957, + "step": 8320 + }, + { + "epoch": 2.7081977878985035, + "grad_norm": 1.510170578956604, + "learning_rate": 2.221334508807873e-05, + "loss": 0.7634, + "step": 8325 + }, + { + "epoch": 2.709824333116461, + "grad_norm": 1.3474400043487549, + "learning_rate": 2.2187699109733567e-05, + "loss": 0.7509, + "step": 8330 + }, + { + "epoch": 2.711450878334418, + "grad_norm": 1.3957786560058594, + "learning_rate": 2.2162056128483347e-05, + "loss": 0.7676, + "step": 8335 + }, + { + "epoch": 2.7130774235523747, + "grad_norm": 1.3940699100494385, + "learning_rate": 2.2136416171656042e-05, + "loss": 0.773, + "step": 8340 + }, + { + "epoch": 2.7147039687703316, + "grad_norm": 1.459333896636963, + "learning_rate": 2.2110779266576364e-05, + "loss": 0.7627, + "step": 8345 + }, + { + "epoch": 2.716330513988289, + "grad_norm": 1.4141310453414917, + "learning_rate": 2.208514544056582e-05, + "loss": 0.7929, + "step": 8350 + }, + { + "epoch": 2.717957059206246, + "grad_norm": 1.5461808443069458, + "learning_rate": 2.20595147209426e-05, + "loss": 0.7935, + "step": 8355 + }, + { + "epoch": 2.719583604424203, + "grad_norm": 1.2824581861495972, + "learning_rate": 2.2033887135021606e-05, + "loss": 0.767, + "step": 8360 + }, + { + "epoch": 2.72121014964216, + "grad_norm": 1.4507813453674316, + "learning_rate": 2.2008262710114384e-05, + "loss": 0.776, + "step": 8365 + }, + { + "epoch": 2.722836694860117, + "grad_norm": 1.439858317375183, + "learning_rate": 2.1982641473529142e-05, + "loss": 0.7719, + "step": 8370 + }, + { + "epoch": 2.724463240078074, + "grad_norm": 1.5216050148010254, + "learning_rate": 2.1957023452570642e-05, + "loss": 0.787, + "step": 8375 + }, + { + "epoch": 2.7260897852960313, + "grad_norm": 1.4557849168777466, + "learning_rate": 2.1931408674540254e-05, + "loss": 0.7783, + "step": 8380 + }, + { + "epoch": 2.727716330513988, + "grad_norm": 1.3647226095199585, + "learning_rate": 2.1905797166735882e-05, + "loss": 0.7478, + "step": 8385 + }, + { + "epoch": 2.7293428757319456, + "grad_norm": 1.3849550485610962, + "learning_rate": 2.1880188956451947e-05, + "loss": 0.759, + "step": 8390 + }, + { + "epoch": 2.7309694209499025, + "grad_norm": 1.3714096546173096, + "learning_rate": 2.185458407097935e-05, + "loss": 0.7929, + "step": 8395 + }, + { + "epoch": 2.7325959661678594, + "grad_norm": 1.6565088033676147, + "learning_rate": 2.182898253760547e-05, + "loss": 0.7811, + "step": 8400 + }, + { + "epoch": 2.7342225113858163, + "grad_norm": 1.3324366807937622, + "learning_rate": 2.180338438361407e-05, + "loss": 0.7459, + "step": 8405 + }, + { + "epoch": 2.7358490566037736, + "grad_norm": 1.6535967588424683, + "learning_rate": 2.177778963628536e-05, + "loss": 0.7741, + "step": 8410 + }, + { + "epoch": 2.7374756018217306, + "grad_norm": 1.7948744297027588, + "learning_rate": 2.175219832289588e-05, + "loss": 0.7705, + "step": 8415 + }, + { + "epoch": 2.739102147039688, + "grad_norm": 1.394195318222046, + "learning_rate": 2.1726610470718554e-05, + "loss": 0.7597, + "step": 8420 + }, + { + "epoch": 2.740728692257645, + "grad_norm": 1.5796781778335571, + "learning_rate": 2.1701026107022554e-05, + "loss": 0.7821, + "step": 8425 + }, + { + "epoch": 2.7423552374756017, + "grad_norm": 1.5026342868804932, + "learning_rate": 2.167544525907341e-05, + "loss": 0.736, + "step": 8430 + }, + { + "epoch": 2.7439817826935586, + "grad_norm": 1.3287760019302368, + "learning_rate": 2.1649867954132847e-05, + "loss": 0.7715, + "step": 8435 + }, + { + "epoch": 2.745608327911516, + "grad_norm": 1.2638212442398071, + "learning_rate": 2.1624294219458836e-05, + "loss": 0.8018, + "step": 8440 + }, + { + "epoch": 2.747234873129473, + "grad_norm": 1.5260733366012573, + "learning_rate": 2.1598724082305537e-05, + "loss": 0.7461, + "step": 8445 + }, + { + "epoch": 2.7488614183474303, + "grad_norm": 1.5431984663009644, + "learning_rate": 2.15731575699233e-05, + "loss": 0.7822, + "step": 8450 + }, + { + "epoch": 2.750487963565387, + "grad_norm": 1.8251315355300903, + "learning_rate": 2.154759470955857e-05, + "loss": 0.7747, + "step": 8455 + }, + { + "epoch": 2.752114508783344, + "grad_norm": 1.4617520570755005, + "learning_rate": 2.1522035528453936e-05, + "loss": 0.7617, + "step": 8460 + }, + { + "epoch": 2.753741054001301, + "grad_norm": 1.4723654985427856, + "learning_rate": 2.1496480053848038e-05, + "loss": 0.7836, + "step": 8465 + }, + { + "epoch": 2.7553675992192583, + "grad_norm": 1.2941076755523682, + "learning_rate": 2.1470928312975596e-05, + "loss": 0.775, + "step": 8470 + }, + { + "epoch": 2.7569941444372152, + "grad_norm": 1.3632558584213257, + "learning_rate": 2.1445380333067336e-05, + "loss": 0.7607, + "step": 8475 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.3414629697799683, + "learning_rate": 2.141983614134996e-05, + "loss": 0.7437, + "step": 8480 + }, + { + "epoch": 2.7602472348731295, + "grad_norm": 1.4272958040237427, + "learning_rate": 2.139429576504617e-05, + "loss": 0.7875, + "step": 8485 + }, + { + "epoch": 2.7618737800910864, + "grad_norm": 1.4554578065872192, + "learning_rate": 2.1368759231374553e-05, + "loss": 0.73, + "step": 8490 + }, + { + "epoch": 2.7635003253090433, + "grad_norm": 1.6653311252593994, + "learning_rate": 2.1343226567549656e-05, + "loss": 0.7698, + "step": 8495 + }, + { + "epoch": 2.7651268705270007, + "grad_norm": 1.8493320941925049, + "learning_rate": 2.131769780078185e-05, + "loss": 0.7766, + "step": 8500 + }, + { + "epoch": 2.7667534157449576, + "grad_norm": 1.2956358194351196, + "learning_rate": 2.1292172958277395e-05, + "loss": 0.7835, + "step": 8505 + }, + { + "epoch": 2.768379960962915, + "grad_norm": 1.11705482006073, + "learning_rate": 2.1266652067238335e-05, + "loss": 0.7666, + "step": 8510 + }, + { + "epoch": 2.770006506180872, + "grad_norm": 1.3041105270385742, + "learning_rate": 2.124113515486254e-05, + "loss": 0.7521, + "step": 8515 + }, + { + "epoch": 2.7716330513988288, + "grad_norm": 1.2648659944534302, + "learning_rate": 2.1215622248343593e-05, + "loss": 0.7893, + "step": 8520 + }, + { + "epoch": 2.7732595966167857, + "grad_norm": 1.29115891456604, + "learning_rate": 2.1190113374870844e-05, + "loss": 0.7718, + "step": 8525 + }, + { + "epoch": 2.774886141834743, + "grad_norm": 1.3295314311981201, + "learning_rate": 2.116460856162934e-05, + "loss": 0.7874, + "step": 8530 + }, + { + "epoch": 2.7765126870527, + "grad_norm": 1.231979489326477, + "learning_rate": 2.1139107835799787e-05, + "loss": 0.7584, + "step": 8535 + }, + { + "epoch": 2.7781392322706573, + "grad_norm": 1.2370855808258057, + "learning_rate": 2.1113611224558545e-05, + "loss": 0.7465, + "step": 8540 + }, + { + "epoch": 2.779765777488614, + "grad_norm": 1.519385814666748, + "learning_rate": 2.10881187550776e-05, + "loss": 0.7512, + "step": 8545 + }, + { + "epoch": 2.781392322706571, + "grad_norm": 1.2954163551330566, + "learning_rate": 2.106263045452449e-05, + "loss": 0.7647, + "step": 8550 + }, + { + "epoch": 2.7830188679245285, + "grad_norm": 1.5285433530807495, + "learning_rate": 2.1037146350062344e-05, + "loss": 0.7305, + "step": 8555 + }, + { + "epoch": 2.7846454131424854, + "grad_norm": 1.34639310836792, + "learning_rate": 2.1011666468849797e-05, + "loss": 0.7748, + "step": 8560 + }, + { + "epoch": 2.7862719583604423, + "grad_norm": 1.4454941749572754, + "learning_rate": 2.098619083804101e-05, + "loss": 0.76, + "step": 8565 + }, + { + "epoch": 2.7878985035783996, + "grad_norm": 1.531281590461731, + "learning_rate": 2.0960719484785578e-05, + "loss": 0.796, + "step": 8570 + }, + { + "epoch": 2.7895250487963565, + "grad_norm": 1.3614991903305054, + "learning_rate": 2.093525243622858e-05, + "loss": 0.7884, + "step": 8575 + }, + { + "epoch": 2.7911515940143135, + "grad_norm": 1.235312581062317, + "learning_rate": 2.0909789719510457e-05, + "loss": 0.7841, + "step": 8580 + }, + { + "epoch": 2.792778139232271, + "grad_norm": 1.4854750633239746, + "learning_rate": 2.0884331361767078e-05, + "loss": 0.7692, + "step": 8585 + }, + { + "epoch": 2.7944046844502277, + "grad_norm": 1.2958935499191284, + "learning_rate": 2.085887739012964e-05, + "loss": 0.7777, + "step": 8590 + }, + { + "epoch": 2.7960312296681846, + "grad_norm": 1.3739302158355713, + "learning_rate": 2.083342783172469e-05, + "loss": 0.8185, + "step": 8595 + }, + { + "epoch": 2.797657774886142, + "grad_norm": 1.1716203689575195, + "learning_rate": 2.0807982713674036e-05, + "loss": 0.7891, + "step": 8600 + }, + { + "epoch": 2.799284320104099, + "grad_norm": 1.1605089902877808, + "learning_rate": 2.078254206309478e-05, + "loss": 0.7548, + "step": 8605 + }, + { + "epoch": 2.800910865322056, + "grad_norm": 1.4513792991638184, + "learning_rate": 2.0757105907099278e-05, + "loss": 0.7883, + "step": 8610 + }, + { + "epoch": 2.802537410540013, + "grad_norm": 1.3878464698791504, + "learning_rate": 2.0731674272795047e-05, + "loss": 0.7744, + "step": 8615 + }, + { + "epoch": 2.80416395575797, + "grad_norm": 1.3584715127944946, + "learning_rate": 2.0706247187284836e-05, + "loss": 0.8021, + "step": 8620 + }, + { + "epoch": 2.805790500975927, + "grad_norm": 1.3374097347259521, + "learning_rate": 2.0680824677666506e-05, + "loss": 0.7187, + "step": 8625 + }, + { + "epoch": 2.8074170461938843, + "grad_norm": 1.449511170387268, + "learning_rate": 2.0655406771033077e-05, + "loss": 0.762, + "step": 8630 + }, + { + "epoch": 2.8090435914118412, + "grad_norm": 1.5614320039749146, + "learning_rate": 2.0629993494472625e-05, + "loss": 0.7522, + "step": 8635 + }, + { + "epoch": 2.810670136629798, + "grad_norm": 1.3400728702545166, + "learning_rate": 2.0604584875068337e-05, + "loss": 0.8134, + "step": 8640 + }, + { + "epoch": 2.8122966818477555, + "grad_norm": 1.3312265872955322, + "learning_rate": 2.057918093989839e-05, + "loss": 0.7542, + "step": 8645 + }, + { + "epoch": 2.8139232270657124, + "grad_norm": 1.4635919332504272, + "learning_rate": 2.0553781716036006e-05, + "loss": 0.7413, + "step": 8650 + }, + { + "epoch": 2.8155497722836698, + "grad_norm": 1.461859107017517, + "learning_rate": 2.0528387230549364e-05, + "loss": 0.7878, + "step": 8655 + }, + { + "epoch": 2.8171763175016267, + "grad_norm": 1.3515243530273438, + "learning_rate": 2.0502997510501616e-05, + "loss": 0.7968, + "step": 8660 + }, + { + "epoch": 2.8188028627195836, + "grad_norm": 1.3076084852218628, + "learning_rate": 2.047761258295079e-05, + "loss": 0.766, + "step": 8665 + }, + { + "epoch": 2.8204294079375405, + "grad_norm": 1.3640028238296509, + "learning_rate": 2.045223247494987e-05, + "loss": 0.7517, + "step": 8670 + }, + { + "epoch": 2.822055953155498, + "grad_norm": 1.1475551128387451, + "learning_rate": 2.0426857213546647e-05, + "loss": 0.7639, + "step": 8675 + }, + { + "epoch": 2.8236824983734548, + "grad_norm": 1.51615571975708, + "learning_rate": 2.0401486825783784e-05, + "loss": 0.7728, + "step": 8680 + }, + { + "epoch": 2.825309043591412, + "grad_norm": 1.2827743291854858, + "learning_rate": 2.0376121338698728e-05, + "loss": 0.7774, + "step": 8685 + }, + { + "epoch": 2.826935588809369, + "grad_norm": 1.4386907815933228, + "learning_rate": 2.0350760779323728e-05, + "loss": 0.7389, + "step": 8690 + }, + { + "epoch": 2.828562134027326, + "grad_norm": 1.5587327480316162, + "learning_rate": 2.032540517468574e-05, + "loss": 0.7841, + "step": 8695 + }, + { + "epoch": 2.830188679245283, + "grad_norm": 1.6727339029312134, + "learning_rate": 2.0300054551806488e-05, + "loss": 0.7571, + "step": 8700 + }, + { + "epoch": 2.83181522446324, + "grad_norm": 1.5247676372528076, + "learning_rate": 2.027470893770235e-05, + "loss": 0.7696, + "step": 8705 + }, + { + "epoch": 2.833441769681197, + "grad_norm": 1.493812084197998, + "learning_rate": 2.024936835938439e-05, + "loss": 0.7424, + "step": 8710 + }, + { + "epoch": 2.8350683148991545, + "grad_norm": 1.3727971315383911, + "learning_rate": 2.0224032843858284e-05, + "loss": 0.7759, + "step": 8715 + }, + { + "epoch": 2.8366948601171114, + "grad_norm": 1.431779146194458, + "learning_rate": 2.0198702418124342e-05, + "loss": 0.7614, + "step": 8720 + }, + { + "epoch": 2.8383214053350683, + "grad_norm": 1.2456868886947632, + "learning_rate": 2.017337710917741e-05, + "loss": 0.7602, + "step": 8725 + }, + { + "epoch": 2.839947950553025, + "grad_norm": 1.4105199575424194, + "learning_rate": 2.014805694400692e-05, + "loss": 0.7433, + "step": 8730 + }, + { + "epoch": 2.8415744957709825, + "grad_norm": 1.2181018590927124, + "learning_rate": 2.0122741949596797e-05, + "loss": 0.7342, + "step": 8735 + }, + { + "epoch": 2.8432010409889394, + "grad_norm": 1.342274785041809, + "learning_rate": 2.0097432152925462e-05, + "loss": 0.76, + "step": 8740 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 1.2392679452896118, + "learning_rate": 2.0072127580965805e-05, + "loss": 0.7533, + "step": 8745 + }, + { + "epoch": 2.8464541314248537, + "grad_norm": 1.5799916982650757, + "learning_rate": 2.0046828260685136e-05, + "loss": 0.7564, + "step": 8750 + }, + { + "epoch": 2.8480806766428106, + "grad_norm": 1.3523061275482178, + "learning_rate": 2.0021534219045184e-05, + "loss": 0.7264, + "step": 8755 + }, + { + "epoch": 2.8497072218607675, + "grad_norm": 1.616676926612854, + "learning_rate": 1.9996245483002025e-05, + "loss": 0.7913, + "step": 8760 + }, + { + "epoch": 2.851333767078725, + "grad_norm": 1.553757667541504, + "learning_rate": 1.9970962079506106e-05, + "loss": 0.7684, + "step": 8765 + }, + { + "epoch": 2.852960312296682, + "grad_norm": 2.267834186553955, + "learning_rate": 1.9945684035502184e-05, + "loss": 0.7529, + "step": 8770 + }, + { + "epoch": 2.854586857514639, + "grad_norm": 1.3689544200897217, + "learning_rate": 1.9920411377929303e-05, + "loss": 0.7524, + "step": 8775 + }, + { + "epoch": 2.856213402732596, + "grad_norm": 1.270214319229126, + "learning_rate": 1.989514413372076e-05, + "loss": 0.752, + "step": 8780 + }, + { + "epoch": 2.857839947950553, + "grad_norm": 1.531227469444275, + "learning_rate": 1.9869882329804108e-05, + "loss": 0.7722, + "step": 8785 + }, + { + "epoch": 2.85946649316851, + "grad_norm": 1.3616315126419067, + "learning_rate": 1.9844625993101056e-05, + "loss": 0.7597, + "step": 8790 + }, + { + "epoch": 2.8610930383864672, + "grad_norm": 1.3281629085540771, + "learning_rate": 1.981937515052754e-05, + "loss": 0.7364, + "step": 8795 + }, + { + "epoch": 2.862719583604424, + "grad_norm": 1.376363754272461, + "learning_rate": 1.97941298289936e-05, + "loss": 0.754, + "step": 8800 + }, + { + "epoch": 2.8643461288223815, + "grad_norm": 1.386327862739563, + "learning_rate": 1.976889005540342e-05, + "loss": 0.7732, + "step": 8805 + }, + { + "epoch": 2.8659726740403384, + "grad_norm": 1.2972625494003296, + "learning_rate": 1.9743655856655237e-05, + "loss": 0.7544, + "step": 8810 + }, + { + "epoch": 2.8675992192582953, + "grad_norm": 1.3431020975112915, + "learning_rate": 1.97184272596414e-05, + "loss": 0.7421, + "step": 8815 + }, + { + "epoch": 2.869225764476252, + "grad_norm": 1.2134712934494019, + "learning_rate": 1.969320429124823e-05, + "loss": 0.7459, + "step": 8820 + }, + { + "epoch": 2.8708523096942096, + "grad_norm": 1.4390586614608765, + "learning_rate": 1.9667986978356092e-05, + "loss": 0.7812, + "step": 8825 + }, + { + "epoch": 2.8724788549121665, + "grad_norm": 1.7135803699493408, + "learning_rate": 1.9642775347839297e-05, + "loss": 0.744, + "step": 8830 + }, + { + "epoch": 2.874105400130124, + "grad_norm": 1.330793023109436, + "learning_rate": 1.961756942656613e-05, + "loss": 0.7381, + "step": 8835 + }, + { + "epoch": 2.8757319453480807, + "grad_norm": 1.3416228294372559, + "learning_rate": 1.9592369241398746e-05, + "loss": 0.7417, + "step": 8840 + }, + { + "epoch": 2.8773584905660377, + "grad_norm": 1.8280797004699707, + "learning_rate": 1.9567174819193244e-05, + "loss": 0.778, + "step": 8845 + }, + { + "epoch": 2.8789850357839946, + "grad_norm": 1.3129626512527466, + "learning_rate": 1.9547023448804738e-05, + "loss": 0.7946, + "step": 8850 + }, + { + "epoch": 2.880611581001952, + "grad_norm": 1.4983445405960083, + "learning_rate": 1.952183946758826e-05, + "loss": 0.757, + "step": 8855 + }, + { + "epoch": 2.882238126219909, + "grad_norm": 1.1888790130615234, + "learning_rate": 1.9496661324497888e-05, + "loss": 0.7562, + "step": 8860 + }, + { + "epoch": 2.883864671437866, + "grad_norm": 1.1900570392608643, + "learning_rate": 1.9471489046366185e-05, + "loss": 0.784, + "step": 8865 + }, + { + "epoch": 2.885491216655823, + "grad_norm": 1.1963016986846924, + "learning_rate": 1.9446322660019488e-05, + "loss": 0.7719, + "step": 8870 + }, + { + "epoch": 2.88711776187378, + "grad_norm": 1.5933735370635986, + "learning_rate": 1.942116219227784e-05, + "loss": 0.7601, + "step": 8875 + }, + { + "epoch": 2.888744307091737, + "grad_norm": 1.905726671218872, + "learning_rate": 1.9396007669954985e-05, + "loss": 0.7739, + "step": 8880 + }, + { + "epoch": 2.8903708523096943, + "grad_norm": 1.3557093143463135, + "learning_rate": 1.937085911985834e-05, + "loss": 0.7399, + "step": 8885 + }, + { + "epoch": 2.891997397527651, + "grad_norm": 1.4264521598815918, + "learning_rate": 1.934571656878893e-05, + "loss": 0.7355, + "step": 8890 + }, + { + "epoch": 2.8936239427456085, + "grad_norm": 1.7798237800598145, + "learning_rate": 1.9320580043541425e-05, + "loss": 0.7988, + "step": 8895 + }, + { + "epoch": 2.8952504879635654, + "grad_norm": 1.564153790473938, + "learning_rate": 1.9295449570904024e-05, + "loss": 0.7534, + "step": 8900 + }, + { + "epoch": 2.8968770331815223, + "grad_norm": 1.2349964380264282, + "learning_rate": 1.9270325177658523e-05, + "loss": 0.7411, + "step": 8905 + }, + { + "epoch": 2.8985035783994793, + "grad_norm": 1.5775413513183594, + "learning_rate": 1.92452068905802e-05, + "loss": 0.7682, + "step": 8910 + }, + { + "epoch": 2.9001301236174366, + "grad_norm": 1.3158737421035767, + "learning_rate": 1.922009473643787e-05, + "loss": 0.7696, + "step": 8915 + }, + { + "epoch": 2.9017566688353935, + "grad_norm": 1.4552288055419922, + "learning_rate": 1.919498874199377e-05, + "loss": 0.7365, + "step": 8920 + }, + { + "epoch": 2.903383214053351, + "grad_norm": 1.4623161554336548, + "learning_rate": 1.9169888934003598e-05, + "loss": 0.7674, + "step": 8925 + }, + { + "epoch": 2.905009759271308, + "grad_norm": 1.2894598245620728, + "learning_rate": 1.9144795339216437e-05, + "loss": 0.7713, + "step": 8930 + }, + { + "epoch": 2.9066363044892647, + "grad_norm": 1.3721530437469482, + "learning_rate": 1.9119707984374774e-05, + "loss": 0.7511, + "step": 8935 + }, + { + "epoch": 2.9082628497072216, + "grad_norm": 1.347119927406311, + "learning_rate": 1.909462689621443e-05, + "loss": 0.7909, + "step": 8940 + }, + { + "epoch": 2.909889394925179, + "grad_norm": 1.3229573965072632, + "learning_rate": 1.9069552101464552e-05, + "loss": 0.7551, + "step": 8945 + }, + { + "epoch": 2.911515940143136, + "grad_norm": 1.1502162218093872, + "learning_rate": 1.9044483626847577e-05, + "loss": 0.7243, + "step": 8950 + }, + { + "epoch": 2.913142485361093, + "grad_norm": 1.2690176963806152, + "learning_rate": 1.901942149907922e-05, + "loss": 0.7821, + "step": 8955 + }, + { + "epoch": 2.91476903057905, + "grad_norm": 1.4652591943740845, + "learning_rate": 1.8994365744868404e-05, + "loss": 0.7814, + "step": 8960 + }, + { + "epoch": 2.916395575797007, + "grad_norm": 1.5496482849121094, + "learning_rate": 1.8969316390917288e-05, + "loss": 0.7822, + "step": 8965 + }, + { + "epoch": 2.918022121014964, + "grad_norm": 1.1814756393432617, + "learning_rate": 1.8944273463921192e-05, + "loss": 0.7681, + "step": 8970 + }, + { + "epoch": 2.9196486662329213, + "grad_norm": 1.6355721950531006, + "learning_rate": 1.891923699056861e-05, + "loss": 0.802, + "step": 8975 + }, + { + "epoch": 2.921275211450878, + "grad_norm": 1.267281174659729, + "learning_rate": 1.8894206997541113e-05, + "loss": 0.767, + "step": 8980 + }, + { + "epoch": 2.9229017566688356, + "grad_norm": 1.460801601409912, + "learning_rate": 1.886918351151343e-05, + "loss": 0.777, + "step": 8985 + }, + { + "epoch": 2.9245283018867925, + "grad_norm": 1.2017325162887573, + "learning_rate": 1.884416655915329e-05, + "loss": 0.7499, + "step": 8990 + }, + { + "epoch": 2.9261548471047494, + "grad_norm": 1.3612703084945679, + "learning_rate": 1.881915616712151e-05, + "loss": 0.7694, + "step": 8995 + }, + { + "epoch": 2.9277813923227067, + "grad_norm": 1.5361329317092896, + "learning_rate": 1.8794152362071883e-05, + "loss": 0.777, + "step": 9000 + }, + { + "epoch": 2.9294079375406636, + "grad_norm": 1.4652985334396362, + "learning_rate": 1.8769155170651203e-05, + "loss": 0.7805, + "step": 9005 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 1.3386067152023315, + "learning_rate": 1.874416461949919e-05, + "loss": 0.7561, + "step": 9010 + }, + { + "epoch": 2.932661027976578, + "grad_norm": 1.2517653703689575, + "learning_rate": 1.8719180735248522e-05, + "loss": 0.7546, + "step": 9015 + }, + { + "epoch": 2.934287573194535, + "grad_norm": 1.5450550317764282, + "learning_rate": 1.869420354452476e-05, + "loss": 0.7565, + "step": 9020 + }, + { + "epoch": 2.9359141184124917, + "grad_norm": 1.20124351978302, + "learning_rate": 1.8669233073946303e-05, + "loss": 0.7501, + "step": 9025 + }, + { + "epoch": 2.937540663630449, + "grad_norm": 1.2083044052124023, + "learning_rate": 1.864426935012443e-05, + "loss": 0.7698, + "step": 9030 + }, + { + "epoch": 2.939167208848406, + "grad_norm": 1.3604949712753296, + "learning_rate": 1.86193123996632e-05, + "loss": 0.75, + "step": 9035 + }, + { + "epoch": 2.940793754066363, + "grad_norm": 1.3318531513214111, + "learning_rate": 1.8594362249159473e-05, + "loss": 0.7736, + "step": 9040 + }, + { + "epoch": 2.9424202992843203, + "grad_norm": 1.3299968242645264, + "learning_rate": 1.856941892520284e-05, + "loss": 0.7672, + "step": 9045 + }, + { + "epoch": 2.944046844502277, + "grad_norm": 1.2953205108642578, + "learning_rate": 1.8544482454375655e-05, + "loss": 0.7692, + "step": 9050 + }, + { + "epoch": 2.945673389720234, + "grad_norm": 1.3176677227020264, + "learning_rate": 1.851955286325292e-05, + "loss": 0.7703, + "step": 9055 + }, + { + "epoch": 2.9472999349381914, + "grad_norm": 1.3802894353866577, + "learning_rate": 1.849463017840235e-05, + "loss": 0.7705, + "step": 9060 + }, + { + "epoch": 2.9489264801561483, + "grad_norm": 1.6382813453674316, + "learning_rate": 1.846971442638426e-05, + "loss": 0.7966, + "step": 9065 + }, + { + "epoch": 2.9505530253741052, + "grad_norm": 1.4489498138427734, + "learning_rate": 1.8444805633751617e-05, + "loss": 0.7471, + "step": 9070 + }, + { + "epoch": 2.9521795705920626, + "grad_norm": 1.488726258277893, + "learning_rate": 1.841990382704993e-05, + "loss": 0.7343, + "step": 9075 + }, + { + "epoch": 2.9538061158100195, + "grad_norm": 1.3678256273269653, + "learning_rate": 1.83950090328173e-05, + "loss": 0.7582, + "step": 9080 + }, + { + "epoch": 2.9554326610279764, + "grad_norm": 1.3807744979858398, + "learning_rate": 1.8370121277584325e-05, + "loss": 0.7446, + "step": 9085 + }, + { + "epoch": 2.9570592062459338, + "grad_norm": 1.3815174102783203, + "learning_rate": 1.8345240587874125e-05, + "loss": 0.7658, + "step": 9090 + }, + { + "epoch": 2.9586857514638907, + "grad_norm": 1.1353662014007568, + "learning_rate": 1.8320366990202276e-05, + "loss": 0.7353, + "step": 9095 + }, + { + "epoch": 2.960312296681848, + "grad_norm": 1.390992283821106, + "learning_rate": 1.829550051107681e-05, + "loss": 0.7432, + "step": 9100 + }, + { + "epoch": 2.961938841899805, + "grad_norm": 1.4276349544525146, + "learning_rate": 1.827064117699814e-05, + "loss": 0.7455, + "step": 9105 + }, + { + "epoch": 2.963565387117762, + "grad_norm": 1.3312143087387085, + "learning_rate": 1.8245789014459104e-05, + "loss": 0.7516, + "step": 9110 + }, + { + "epoch": 2.9651919323357188, + "grad_norm": 1.2436290979385376, + "learning_rate": 1.822094404994487e-05, + "loss": 0.7594, + "step": 9115 + }, + { + "epoch": 2.966818477553676, + "grad_norm": 1.2839477062225342, + "learning_rate": 1.819610630993296e-05, + "loss": 0.7681, + "step": 9120 + }, + { + "epoch": 2.968445022771633, + "grad_norm": 1.4155715703964233, + "learning_rate": 1.817127582089317e-05, + "loss": 0.7608, + "step": 9125 + }, + { + "epoch": 2.9700715679895904, + "grad_norm": 1.5015283823013306, + "learning_rate": 1.8146452609287592e-05, + "loss": 0.7548, + "step": 9130 + }, + { + "epoch": 2.9716981132075473, + "grad_norm": 1.4017950296401978, + "learning_rate": 1.8121636701570537e-05, + "loss": 0.7784, + "step": 9135 + }, + { + "epoch": 2.973324658425504, + "grad_norm": 1.3168200254440308, + "learning_rate": 1.8096828124188555e-05, + "loss": 0.7885, + "step": 9140 + }, + { + "epoch": 2.974951203643461, + "grad_norm": 1.4054969549179077, + "learning_rate": 1.807202690358037e-05, + "loss": 0.7759, + "step": 9145 + }, + { + "epoch": 2.9765777488614185, + "grad_norm": 1.470202922821045, + "learning_rate": 1.804723306617687e-05, + "loss": 0.7485, + "step": 9150 + }, + { + "epoch": 2.9782042940793754, + "grad_norm": 1.391217589378357, + "learning_rate": 1.802244663840109e-05, + "loss": 0.78, + "step": 9155 + }, + { + "epoch": 2.9798308392973327, + "grad_norm": 1.7147318124771118, + "learning_rate": 1.7997667646668136e-05, + "loss": 0.733, + "step": 9160 + }, + { + "epoch": 2.9814573845152896, + "grad_norm": 1.486747145652771, + "learning_rate": 1.797289611738523e-05, + "loss": 0.7699, + "step": 9165 + }, + { + "epoch": 2.9830839297332465, + "grad_norm": 1.5375494956970215, + "learning_rate": 1.7948132076951594e-05, + "loss": 0.7869, + "step": 9170 + }, + { + "epoch": 2.9847104749512035, + "grad_norm": 1.274763584136963, + "learning_rate": 1.7923375551758505e-05, + "loss": 0.774, + "step": 9175 + }, + { + "epoch": 2.986337020169161, + "grad_norm": 1.2824516296386719, + "learning_rate": 1.7898626568189216e-05, + "loss": 0.7377, + "step": 9180 + }, + { + "epoch": 2.9879635653871177, + "grad_norm": 1.6717973947525024, + "learning_rate": 1.7873885152618956e-05, + "loss": 0.7782, + "step": 9185 + }, + { + "epoch": 2.989590110605075, + "grad_norm": 1.6405168771743774, + "learning_rate": 1.7849151331414866e-05, + "loss": 0.8018, + "step": 9190 + }, + { + "epoch": 2.991216655823032, + "grad_norm": 1.3563889265060425, + "learning_rate": 1.7824425130936023e-05, + "loss": 0.7617, + "step": 9195 + }, + { + "epoch": 2.992843201040989, + "grad_norm": 1.3543968200683594, + "learning_rate": 1.7799706577533347e-05, + "loss": 0.7547, + "step": 9200 + }, + { + "epoch": 2.994469746258946, + "grad_norm": 1.3570406436920166, + "learning_rate": 1.7774995697549645e-05, + "loss": 0.766, + "step": 9205 + }, + { + "epoch": 2.996096291476903, + "grad_norm": 1.5409280061721802, + "learning_rate": 1.7750292517319518e-05, + "loss": 0.7303, + "step": 9210 + }, + { + "epoch": 2.99772283669486, + "grad_norm": 1.5350112915039062, + "learning_rate": 1.7725597063169386e-05, + "loss": 0.8049, + "step": 9215 + }, + { + "epoch": 2.9993493819128174, + "grad_norm": 1.3614444732666016, + "learning_rate": 1.7700909361417395e-05, + "loss": 0.7187, + "step": 9220 + }, + { + "epoch": 3.0, + "eval_f1": 0.8162116648274761, + "eval_loss": 0.421875, + "eval_precision": 0.8166944823792163, + "eval_recall": 0.8158176957095531, + "eval_runtime": 389.5727, + "eval_samples_per_second": 1009.914, + "eval_steps_per_second": 1.974, + "step": 9222 + }, + { + "epoch": 3.0009759271307743, + "grad_norm": 1.440277099609375, + "learning_rate": 1.767622943837349e-05, + "loss": 0.7135, + "step": 9225 + }, + { + "epoch": 3.0026024723487312, + "grad_norm": 1.4324421882629395, + "learning_rate": 1.7651557320339266e-05, + "loss": 0.7043, + "step": 9230 + }, + { + "epoch": 3.004229017566688, + "grad_norm": 1.406107783317566, + "learning_rate": 1.7626893033608038e-05, + "loss": 0.7287, + "step": 9235 + }, + { + "epoch": 3.0058555627846455, + "grad_norm": 1.3286113739013672, + "learning_rate": 1.7602236604464762e-05, + "loss": 0.6989, + "step": 9240 + }, + { + "epoch": 3.0074821080026024, + "grad_norm": 1.582288146018982, + "learning_rate": 1.7582517136271616e-05, + "loss": 0.7011, + "step": 9245 + }, + { + "epoch": 3.0091086532205593, + "grad_norm": 1.3919352293014526, + "learning_rate": 1.7557874916997996e-05, + "loss": 0.7073, + "step": 9250 + }, + { + "epoch": 3.0107351984385167, + "grad_norm": 1.5728718042373657, + "learning_rate": 1.7533240628865567e-05, + "loss": 0.7265, + "step": 9255 + }, + { + "epoch": 3.0123617436564736, + "grad_norm": 1.4800955057144165, + "learning_rate": 1.7508614298127322e-05, + "loss": 0.7146, + "step": 9260 + }, + { + "epoch": 3.0139882888744305, + "grad_norm": 1.6532727479934692, + "learning_rate": 1.7483995951027767e-05, + "loss": 0.7028, + "step": 9265 + }, + { + "epoch": 3.015614834092388, + "grad_norm": 1.5154070854187012, + "learning_rate": 1.7459385613802903e-05, + "loss": 0.7303, + "step": 9270 + }, + { + "epoch": 3.0172413793103448, + "grad_norm": 1.3430280685424805, + "learning_rate": 1.743478331268018e-05, + "loss": 0.7367, + "step": 9275 + }, + { + "epoch": 3.018867924528302, + "grad_norm": 1.5188809633255005, + "learning_rate": 1.7410189073878513e-05, + "loss": 0.6923, + "step": 9280 + }, + { + "epoch": 3.020494469746259, + "grad_norm": 1.4097915887832642, + "learning_rate": 1.7385602923608192e-05, + "loss": 0.6938, + "step": 9285 + }, + { + "epoch": 3.022121014964216, + "grad_norm": 1.7351887226104736, + "learning_rate": 1.736102488807092e-05, + "loss": 0.7279, + "step": 9290 + }, + { + "epoch": 3.0237475601821733, + "grad_norm": 1.4884587526321411, + "learning_rate": 1.7336454993459726e-05, + "loss": 0.7277, + "step": 9295 + }, + { + "epoch": 3.02537410540013, + "grad_norm": 1.481894612312317, + "learning_rate": 1.7311893265958974e-05, + "loss": 0.7353, + "step": 9300 + }, + { + "epoch": 3.027000650618087, + "grad_norm": 1.51095712184906, + "learning_rate": 1.7287339731744336e-05, + "loss": 0.7069, + "step": 9305 + }, + { + "epoch": 3.0286271958360445, + "grad_norm": 1.621412754058838, + "learning_rate": 1.7262794416982716e-05, + "loss": 0.7309, + "step": 9310 + }, + { + "epoch": 3.0302537410540014, + "grad_norm": 1.413838267326355, + "learning_rate": 1.72382573478323e-05, + "loss": 0.7157, + "step": 9315 + }, + { + "epoch": 3.0318802862719583, + "grad_norm": 1.4564160108566284, + "learning_rate": 1.721372855044246e-05, + "loss": 0.7194, + "step": 9320 + }, + { + "epoch": 3.0335068314899156, + "grad_norm": 1.5395921468734741, + "learning_rate": 1.7189208050953765e-05, + "loss": 0.7176, + "step": 9325 + }, + { + "epoch": 3.0351333767078725, + "grad_norm": 1.7604949474334717, + "learning_rate": 1.7164695875497928e-05, + "loss": 0.7065, + "step": 9330 + }, + { + "epoch": 3.0367599219258294, + "grad_norm": 1.7284990549087524, + "learning_rate": 1.714019205019782e-05, + "loss": 0.6965, + "step": 9335 + }, + { + "epoch": 3.038386467143787, + "grad_norm": 1.7822333574295044, + "learning_rate": 1.711569660116737e-05, + "loss": 0.7249, + "step": 9340 + }, + { + "epoch": 3.0400130123617437, + "grad_norm": 1.4060776233673096, + "learning_rate": 1.709120955451162e-05, + "loss": 0.7531, + "step": 9345 + }, + { + "epoch": 3.0416395575797006, + "grad_norm": 1.657284140586853, + "learning_rate": 1.706673093632663e-05, + "loss": 0.7365, + "step": 9350 + }, + { + "epoch": 3.043266102797658, + "grad_norm": 1.3833869695663452, + "learning_rate": 1.70422607726995e-05, + "loss": 0.7132, + "step": 9355 + }, + { + "epoch": 3.044892648015615, + "grad_norm": 1.7730997800827026, + "learning_rate": 1.7017799089708293e-05, + "loss": 0.7189, + "step": 9360 + }, + { + "epoch": 3.046519193233572, + "grad_norm": 1.628828525543213, + "learning_rate": 1.699334591342207e-05, + "loss": 0.7286, + "step": 9365 + }, + { + "epoch": 3.048145738451529, + "grad_norm": 1.514278531074524, + "learning_rate": 1.696890126990079e-05, + "loss": 0.7089, + "step": 9370 + }, + { + "epoch": 3.049772283669486, + "grad_norm": 1.7661864757537842, + "learning_rate": 1.694446518519534e-05, + "loss": 0.7476, + "step": 9375 + }, + { + "epoch": 3.051398828887443, + "grad_norm": 1.5344783067703247, + "learning_rate": 1.692003768534747e-05, + "loss": 0.7084, + "step": 9380 + }, + { + "epoch": 3.0530253741054003, + "grad_norm": 1.4687203168869019, + "learning_rate": 1.689561879638982e-05, + "loss": 0.7333, + "step": 9385 + }, + { + "epoch": 3.0546519193233572, + "grad_norm": 1.5589638948440552, + "learning_rate": 1.687120854434579e-05, + "loss": 0.7238, + "step": 9390 + }, + { + "epoch": 3.056278464541314, + "grad_norm": 1.5215935707092285, + "learning_rate": 1.684680695522964e-05, + "loss": 0.7229, + "step": 9395 + }, + { + "epoch": 3.0579050097592715, + "grad_norm": 1.4687856435775757, + "learning_rate": 1.682241405504634e-05, + "loss": 0.7165, + "step": 9400 + }, + { + "epoch": 3.0595315549772284, + "grad_norm": 1.579168677330017, + "learning_rate": 1.679802986979165e-05, + "loss": 0.7145, + "step": 9405 + }, + { + "epoch": 3.0611581001951853, + "grad_norm": 1.6092942953109741, + "learning_rate": 1.6773654425452007e-05, + "loss": 0.716, + "step": 9410 + }, + { + "epoch": 3.0627846454131427, + "grad_norm": 1.3213398456573486, + "learning_rate": 1.6749287748004567e-05, + "loss": 0.7242, + "step": 9415 + }, + { + "epoch": 3.0644111906310996, + "grad_norm": 1.4730993509292603, + "learning_rate": 1.6724929863417094e-05, + "loss": 0.6825, + "step": 9420 + }, + { + "epoch": 3.0660377358490565, + "grad_norm": 1.4565410614013672, + "learning_rate": 1.670058079764802e-05, + "loss": 0.7152, + "step": 9425 + }, + { + "epoch": 3.067664281067014, + "grad_norm": 1.575098991394043, + "learning_rate": 1.6676240576646387e-05, + "loss": 0.7002, + "step": 9430 + }, + { + "epoch": 3.0692908262849707, + "grad_norm": 1.5681147575378418, + "learning_rate": 1.665190922635177e-05, + "loss": 0.7148, + "step": 9435 + }, + { + "epoch": 3.0709173715029277, + "grad_norm": 1.5109776258468628, + "learning_rate": 1.662758677269432e-05, + "loss": 0.7242, + "step": 9440 + }, + { + "epoch": 3.072543916720885, + "grad_norm": 1.438925862312317, + "learning_rate": 1.66032732415947e-05, + "loss": 0.7292, + "step": 9445 + }, + { + "epoch": 3.074170461938842, + "grad_norm": 1.3795181512832642, + "learning_rate": 1.657896865896407e-05, + "loss": 0.717, + "step": 9450 + }, + { + "epoch": 3.075797007156799, + "grad_norm": 1.152470588684082, + "learning_rate": 1.6554673050704038e-05, + "loss": 0.7302, + "step": 9455 + }, + { + "epoch": 3.077423552374756, + "grad_norm": 1.3653312921524048, + "learning_rate": 1.6530386442706664e-05, + "loss": 0.7028, + "step": 9460 + }, + { + "epoch": 3.079050097592713, + "grad_norm": 1.8764070272445679, + "learning_rate": 1.65061088608544e-05, + "loss": 0.7025, + "step": 9465 + }, + { + "epoch": 3.08067664281067, + "grad_norm": 1.467470407485962, + "learning_rate": 1.6481840331020098e-05, + "loss": 0.7574, + "step": 9470 + }, + { + "epoch": 3.0823031880286273, + "grad_norm": 1.9174879789352417, + "learning_rate": 1.645758087906695e-05, + "loss": 0.7546, + "step": 9475 + }, + { + "epoch": 3.0839297332465843, + "grad_norm": 1.4528872966766357, + "learning_rate": 1.6433330530848487e-05, + "loss": 0.7013, + "step": 9480 + }, + { + "epoch": 3.085556278464541, + "grad_norm": 1.3309197425842285, + "learning_rate": 1.640908931220851e-05, + "loss": 0.7051, + "step": 9485 + }, + { + "epoch": 3.0871828236824985, + "grad_norm": 1.552443027496338, + "learning_rate": 1.638485724898112e-05, + "loss": 0.7213, + "step": 9490 + }, + { + "epoch": 3.0888093689004554, + "grad_norm": 1.6178815364837646, + "learning_rate": 1.636063436699064e-05, + "loss": 0.7246, + "step": 9495 + }, + { + "epoch": 3.0904359141184123, + "grad_norm": 1.3050587177276611, + "learning_rate": 1.6336420692051637e-05, + "loss": 0.709, + "step": 9500 + }, + { + "epoch": 3.0920624593363697, + "grad_norm": 1.7693572044372559, + "learning_rate": 1.6312216249968828e-05, + "loss": 0.6662, + "step": 9505 + }, + { + "epoch": 3.0936890045543266, + "grad_norm": 1.6759785413742065, + "learning_rate": 1.628802106653713e-05, + "loss": 0.7034, + "step": 9510 + }, + { + "epoch": 3.0953155497722835, + "grad_norm": 1.7418941259384155, + "learning_rate": 1.626383516754155e-05, + "loss": 0.7416, + "step": 9515 + }, + { + "epoch": 3.096942094990241, + "grad_norm": 1.5033810138702393, + "learning_rate": 1.6239658578757238e-05, + "loss": 0.7287, + "step": 9520 + }, + { + "epoch": 3.0985686402081978, + "grad_norm": 1.4363847970962524, + "learning_rate": 1.6215491325949397e-05, + "loss": 0.7175, + "step": 9525 + }, + { + "epoch": 3.1001951854261547, + "grad_norm": 1.450146198272705, + "learning_rate": 1.61913334348733e-05, + "loss": 0.7148, + "step": 9530 + }, + { + "epoch": 3.101821730644112, + "grad_norm": 1.4933956861495972, + "learning_rate": 1.6167184931274232e-05, + "loss": 0.7225, + "step": 9535 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 1.725346565246582, + "learning_rate": 1.614304584088748e-05, + "loss": 0.7342, + "step": 9540 + }, + { + "epoch": 3.105074821080026, + "grad_norm": 1.6251167058944702, + "learning_rate": 1.6118916189438278e-05, + "loss": 0.6935, + "step": 9545 + }, + { + "epoch": 3.106701366297983, + "grad_norm": 1.5891484022140503, + "learning_rate": 1.6094796002641834e-05, + "loss": 0.7322, + "step": 9550 + }, + { + "epoch": 3.10832791151594, + "grad_norm": 1.561183214187622, + "learning_rate": 1.607068530620324e-05, + "loss": 0.7118, + "step": 9555 + }, + { + "epoch": 3.109954456733897, + "grad_norm": 1.5726548433303833, + "learning_rate": 1.60465841258175e-05, + "loss": 0.6956, + "step": 9560 + }, + { + "epoch": 3.1115810019518544, + "grad_norm": 1.6114755868911743, + "learning_rate": 1.602249248716946e-05, + "loss": 0.7313, + "step": 9565 + }, + { + "epoch": 3.1132075471698113, + "grad_norm": 1.5556243658065796, + "learning_rate": 1.5998410415933794e-05, + "loss": 0.7301, + "step": 9570 + }, + { + "epoch": 3.114834092387768, + "grad_norm": 1.5923330783843994, + "learning_rate": 1.597433793777501e-05, + "loss": 0.746, + "step": 9575 + }, + { + "epoch": 3.1164606376057256, + "grad_norm": 1.611331582069397, + "learning_rate": 1.5950275078347333e-05, + "loss": 0.7099, + "step": 9580 + }, + { + "epoch": 3.1180871828236825, + "grad_norm": 1.255745530128479, + "learning_rate": 1.5926221863294798e-05, + "loss": 0.6956, + "step": 9585 + }, + { + "epoch": 3.1197137280416394, + "grad_norm": 1.7463656663894653, + "learning_rate": 1.590217831825113e-05, + "loss": 0.704, + "step": 9590 + }, + { + "epoch": 3.1213402732595967, + "grad_norm": 1.6205796003341675, + "learning_rate": 1.5878144468839767e-05, + "loss": 0.7323, + "step": 9595 + }, + { + "epoch": 3.1229668184775536, + "grad_norm": 1.6915699243545532, + "learning_rate": 1.585412034067378e-05, + "loss": 0.7254, + "step": 9600 + }, + { + "epoch": 3.1245933636955106, + "grad_norm": 1.576432466506958, + "learning_rate": 1.5830105959355933e-05, + "loss": 0.7253, + "step": 9605 + }, + { + "epoch": 3.126219908913468, + "grad_norm": 1.4804075956344604, + "learning_rate": 1.5806101350478554e-05, + "loss": 0.7332, + "step": 9610 + }, + { + "epoch": 3.127846454131425, + "grad_norm": 1.4172931909561157, + "learning_rate": 1.5782106539623578e-05, + "loss": 0.6734, + "step": 9615 + }, + { + "epoch": 3.1294729993493817, + "grad_norm": 1.5690916776657104, + "learning_rate": 1.57581215523625e-05, + "loss": 0.7135, + "step": 9620 + }, + { + "epoch": 3.131099544567339, + "grad_norm": 1.5965441465377808, + "learning_rate": 1.5734146414256338e-05, + "loss": 0.7126, + "step": 9625 + }, + { + "epoch": 3.132726089785296, + "grad_norm": 1.7947732210159302, + "learning_rate": 1.5710181150855618e-05, + "loss": 0.7448, + "step": 9630 + }, + { + "epoch": 3.134352635003253, + "grad_norm": 1.4908939599990845, + "learning_rate": 1.5686225787700347e-05, + "loss": 0.7187, + "step": 9635 + }, + { + "epoch": 3.1359791802212102, + "grad_norm": 1.5316404104232788, + "learning_rate": 1.566228035031997e-05, + "loss": 0.7143, + "step": 9640 + }, + { + "epoch": 3.137605725439167, + "grad_norm": 1.3814529180526733, + "learning_rate": 1.5638344864233363e-05, + "loss": 0.7219, + "step": 9645 + }, + { + "epoch": 3.139232270657124, + "grad_norm": 1.5012269020080566, + "learning_rate": 1.5614419354948783e-05, + "loss": 0.7183, + "step": 9650 + }, + { + "epoch": 3.1408588158750814, + "grad_norm": 1.751840591430664, + "learning_rate": 1.559050384796389e-05, + "loss": 0.7056, + "step": 9655 + }, + { + "epoch": 3.1424853610930383, + "grad_norm": 1.5153909921646118, + "learning_rate": 1.5566598368765635e-05, + "loss": 0.6984, + "step": 9660 + }, + { + "epoch": 3.1441119063109952, + "grad_norm": 1.6611508131027222, + "learning_rate": 1.5542702942830322e-05, + "loss": 0.6832, + "step": 9665 + }, + { + "epoch": 3.1457384515289526, + "grad_norm": 1.7145307064056396, + "learning_rate": 1.5518817595623514e-05, + "loss": 0.7647, + "step": 9670 + }, + { + "epoch": 3.1473649967469095, + "grad_norm": 1.5652216672897339, + "learning_rate": 1.549494235260006e-05, + "loss": 0.7373, + "step": 9675 + }, + { + "epoch": 3.1489915419648664, + "grad_norm": 1.4932833909988403, + "learning_rate": 1.5471077239204014e-05, + "loss": 0.6988, + "step": 9680 + }, + { + "epoch": 3.1506180871828238, + "grad_norm": 1.3038312196731567, + "learning_rate": 1.544722228086866e-05, + "loss": 0.729, + "step": 9685 + }, + { + "epoch": 3.1522446324007807, + "grad_norm": 2.2358272075653076, + "learning_rate": 1.542337750301643e-05, + "loss": 0.7147, + "step": 9690 + }, + { + "epoch": 3.153871177618738, + "grad_norm": 1.4131916761398315, + "learning_rate": 1.5399542931058933e-05, + "loss": 0.6692, + "step": 9695 + }, + { + "epoch": 3.155497722836695, + "grad_norm": 1.5478293895721436, + "learning_rate": 1.5375718590396894e-05, + "loss": 0.7173, + "step": 9700 + }, + { + "epoch": 3.157124268054652, + "grad_norm": 1.4857743978500366, + "learning_rate": 1.5351904506420124e-05, + "loss": 0.6882, + "step": 9705 + }, + { + "epoch": 3.1587508132726088, + "grad_norm": 1.4926530122756958, + "learning_rate": 1.532810070450752e-05, + "loss": 0.7346, + "step": 9710 + }, + { + "epoch": 3.160377358490566, + "grad_norm": 1.4504636526107788, + "learning_rate": 1.5304307210027004e-05, + "loss": 0.7018, + "step": 9715 + }, + { + "epoch": 3.162003903708523, + "grad_norm": 1.6056586503982544, + "learning_rate": 1.5280524048335538e-05, + "loss": 0.699, + "step": 9720 + }, + { + "epoch": 3.1636304489264804, + "grad_norm": 1.6431245803833008, + "learning_rate": 1.5256751244779036e-05, + "loss": 0.7005, + "step": 9725 + }, + { + "epoch": 3.1652569941444373, + "grad_norm": 1.7279828786849976, + "learning_rate": 1.5232988824692406e-05, + "loss": 0.7023, + "step": 9730 + }, + { + "epoch": 3.166883539362394, + "grad_norm": 1.5771671533584595, + "learning_rate": 1.520923681339947e-05, + "loss": 0.7033, + "step": 9735 + }, + { + "epoch": 3.168510084580351, + "grad_norm": 1.526728868484497, + "learning_rate": 1.5185495236212976e-05, + "loss": 0.7188, + "step": 9740 + }, + { + "epoch": 3.1701366297983085, + "grad_norm": 1.3081923723220825, + "learning_rate": 1.516176411843453e-05, + "loss": 0.7044, + "step": 9745 + }, + { + "epoch": 3.1717631750162654, + "grad_norm": 1.4636553525924683, + "learning_rate": 1.5138043485354614e-05, + "loss": 0.7076, + "step": 9750 + }, + { + "epoch": 3.1733897202342227, + "grad_norm": 1.5434120893478394, + "learning_rate": 1.5114333362252509e-05, + "loss": 0.7056, + "step": 9755 + }, + { + "epoch": 3.1750162654521796, + "grad_norm": 1.5241512060165405, + "learning_rate": 1.5090633774396329e-05, + "loss": 0.7086, + "step": 9760 + }, + { + "epoch": 3.1766428106701365, + "grad_norm": 1.4584996700286865, + "learning_rate": 1.5066944747042927e-05, + "loss": 0.6882, + "step": 9765 + }, + { + "epoch": 3.178269355888094, + "grad_norm": 1.4083778858184814, + "learning_rate": 1.5043266305437937e-05, + "loss": 0.7296, + "step": 9770 + }, + { + "epoch": 3.179895901106051, + "grad_norm": 1.4902408123016357, + "learning_rate": 1.5019598474815666e-05, + "loss": 0.7253, + "step": 9775 + }, + { + "epoch": 3.1815224463240077, + "grad_norm": 1.5519683361053467, + "learning_rate": 1.4995941280399166e-05, + "loss": 0.7133, + "step": 9780 + }, + { + "epoch": 3.183148991541965, + "grad_norm": 1.5642268657684326, + "learning_rate": 1.4972294747400106e-05, + "loss": 0.754, + "step": 9785 + }, + { + "epoch": 3.184775536759922, + "grad_norm": 1.6030256748199463, + "learning_rate": 1.4948658901018826e-05, + "loss": 0.737, + "step": 9790 + }, + { + "epoch": 3.186402081977879, + "grad_norm": 1.6638157367706299, + "learning_rate": 1.4925033766444255e-05, + "loss": 0.7423, + "step": 9795 + }, + { + "epoch": 3.1880286271958362, + "grad_norm": 1.5237773656845093, + "learning_rate": 1.4901419368853934e-05, + "loss": 0.7358, + "step": 9800 + }, + { + "epoch": 3.189655172413793, + "grad_norm": 1.5982376337051392, + "learning_rate": 1.4877815733413919e-05, + "loss": 0.6896, + "step": 9805 + }, + { + "epoch": 3.19128171763175, + "grad_norm": 1.4923803806304932, + "learning_rate": 1.4854222885278842e-05, + "loss": 0.7027, + "step": 9810 + }, + { + "epoch": 3.1929082628497074, + "grad_norm": 1.449155569076538, + "learning_rate": 1.4830640849591806e-05, + "loss": 0.6938, + "step": 9815 + }, + { + "epoch": 3.1945348080676643, + "grad_norm": 1.8611520528793335, + "learning_rate": 1.4807069651484415e-05, + "loss": 0.7134, + "step": 9820 + }, + { + "epoch": 3.1961613532856212, + "grad_norm": 1.682241439819336, + "learning_rate": 1.4783509316076698e-05, + "loss": 0.7016, + "step": 9825 + }, + { + "epoch": 3.1977878985035786, + "grad_norm": 1.8355181217193604, + "learning_rate": 1.475995986847713e-05, + "loss": 0.6952, + "step": 9830 + }, + { + "epoch": 3.1994144437215355, + "grad_norm": 1.4858391284942627, + "learning_rate": 1.4736421333782582e-05, + "loss": 0.6999, + "step": 9835 + }, + { + "epoch": 3.2010409889394924, + "grad_norm": 1.5187908411026, + "learning_rate": 1.4712893737078262e-05, + "loss": 0.7081, + "step": 9840 + }, + { + "epoch": 3.2026675341574498, + "grad_norm": 1.5705649852752686, + "learning_rate": 1.4689377103437779e-05, + "loss": 0.7319, + "step": 9845 + }, + { + "epoch": 3.2042940793754067, + "grad_norm": 1.2782399654388428, + "learning_rate": 1.4665871457922998e-05, + "loss": 0.7318, + "step": 9850 + }, + { + "epoch": 3.2059206245933636, + "grad_norm": 1.452333688735962, + "learning_rate": 1.4642376825584115e-05, + "loss": 0.6869, + "step": 9855 + }, + { + "epoch": 3.207547169811321, + "grad_norm": 1.5658483505249023, + "learning_rate": 1.4618893231459572e-05, + "loss": 0.7342, + "step": 9860 + }, + { + "epoch": 3.209173715029278, + "grad_norm": 1.7879657745361328, + "learning_rate": 1.4595420700576062e-05, + "loss": 0.731, + "step": 9865 + }, + { + "epoch": 3.2108002602472347, + "grad_norm": 1.459113359451294, + "learning_rate": 1.457195925794846e-05, + "loss": 0.686, + "step": 9870 + }, + { + "epoch": 3.212426805465192, + "grad_norm": 1.5036383867263794, + "learning_rate": 1.4548508928579855e-05, + "loss": 0.7254, + "step": 9875 + }, + { + "epoch": 3.214053350683149, + "grad_norm": 1.637881875038147, + "learning_rate": 1.4525069737461456e-05, + "loss": 0.7572, + "step": 9880 + }, + { + "epoch": 3.215679895901106, + "grad_norm": 1.921654462814331, + "learning_rate": 1.450164170957266e-05, + "loss": 0.6887, + "step": 9885 + }, + { + "epoch": 3.2173064411190633, + "grad_norm": 1.5535845756530762, + "learning_rate": 1.4478224869880908e-05, + "loss": 0.6997, + "step": 9890 + }, + { + "epoch": 3.21893298633702, + "grad_norm": 1.6989622116088867, + "learning_rate": 1.4454819243341752e-05, + "loss": 0.679, + "step": 9895 + }, + { + "epoch": 3.220559531554977, + "grad_norm": 1.5417860746383667, + "learning_rate": 1.4431424854898772e-05, + "loss": 0.7005, + "step": 9900 + }, + { + "epoch": 3.2221860767729344, + "grad_norm": 1.6961115598678589, + "learning_rate": 1.44080417294836e-05, + "loss": 0.7189, + "step": 9905 + }, + { + "epoch": 3.2238126219908914, + "grad_norm": 1.4906436204910278, + "learning_rate": 1.4384669892015828e-05, + "loss": 0.7128, + "step": 9910 + }, + { + "epoch": 3.2254391672088483, + "grad_norm": 1.4389100074768066, + "learning_rate": 1.436130936740305e-05, + "loss": 0.7186, + "step": 9915 + }, + { + "epoch": 3.2270657124268056, + "grad_norm": 1.4868855476379395, + "learning_rate": 1.4337960180540788e-05, + "loss": 0.6769, + "step": 9920 + }, + { + "epoch": 3.2286922576447625, + "grad_norm": 1.4868018627166748, + "learning_rate": 1.4314622356312502e-05, + "loss": 0.704, + "step": 9925 + }, + { + "epoch": 3.2303188028627194, + "grad_norm": 1.5681591033935547, + "learning_rate": 1.4291295919589498e-05, + "loss": 0.729, + "step": 9930 + }, + { + "epoch": 3.231945348080677, + "grad_norm": 1.8189630508422852, + "learning_rate": 1.4267980895230997e-05, + "loss": 0.7329, + "step": 9935 + }, + { + "epoch": 3.2335718932986337, + "grad_norm": 1.4947065114974976, + "learning_rate": 1.424467730808402e-05, + "loss": 0.7111, + "step": 9940 + }, + { + "epoch": 3.2351984385165906, + "grad_norm": 1.5021048784255981, + "learning_rate": 1.4221385182983418e-05, + "loss": 0.7117, + "step": 9945 + }, + { + "epoch": 3.236824983734548, + "grad_norm": 1.5461513996124268, + "learning_rate": 1.4198104544751828e-05, + "loss": 0.6998, + "step": 9950 + }, + { + "epoch": 3.238451528952505, + "grad_norm": 1.6981432437896729, + "learning_rate": 1.4174835418199645e-05, + "loss": 0.7258, + "step": 9955 + }, + { + "epoch": 3.240078074170462, + "grad_norm": 1.5343623161315918, + "learning_rate": 1.4151577828124977e-05, + "loss": 0.7141, + "step": 9960 + }, + { + "epoch": 3.241704619388419, + "grad_norm": 1.4762821197509766, + "learning_rate": 1.4128331799313657e-05, + "loss": 0.6988, + "step": 9965 + }, + { + "epoch": 3.243331164606376, + "grad_norm": 1.9735140800476074, + "learning_rate": 1.4105097356539203e-05, + "loss": 0.7378, + "step": 9970 + }, + { + "epoch": 3.244957709824333, + "grad_norm": 1.6543573141098022, + "learning_rate": 1.408187452456276e-05, + "loss": 0.6871, + "step": 9975 + }, + { + "epoch": 3.2465842550422903, + "grad_norm": 1.4793490171432495, + "learning_rate": 1.4058663328133115e-05, + "loss": 0.7165, + "step": 9980 + }, + { + "epoch": 3.248210800260247, + "grad_norm": 1.5775221586227417, + "learning_rate": 1.4035463791986661e-05, + "loss": 0.7368, + "step": 9985 + }, + { + "epoch": 3.249837345478204, + "grad_norm": 1.4512039422988892, + "learning_rate": 1.4012275940847363e-05, + "loss": 0.7334, + "step": 9990 + }, + { + "epoch": 3.2514638906961615, + "grad_norm": 1.3824808597564697, + "learning_rate": 1.3989099799426708e-05, + "loss": 0.7144, + "step": 9995 + }, + { + "epoch": 3.2530904359141184, + "grad_norm": 1.5353810787200928, + "learning_rate": 1.3965935392423746e-05, + "loss": 0.7229, + "step": 10000 + }, + { + "epoch": 3.2547169811320753, + "grad_norm": 1.6290570497512817, + "learning_rate": 1.3942782744524973e-05, + "loss": 0.7428, + "step": 10005 + }, + { + "epoch": 3.2563435263500327, + "grad_norm": 1.511370301246643, + "learning_rate": 1.3919641880404394e-05, + "loss": 0.7106, + "step": 10010 + }, + { + "epoch": 3.2579700715679896, + "grad_norm": 1.5856270790100098, + "learning_rate": 1.3896512824723437e-05, + "loss": 0.6765, + "step": 10015 + }, + { + "epoch": 3.2595966167859465, + "grad_norm": 1.5825022459030151, + "learning_rate": 1.3873395602130961e-05, + "loss": 0.715, + "step": 10020 + }, + { + "epoch": 3.261223162003904, + "grad_norm": 1.6221950054168701, + "learning_rate": 1.385029023726319e-05, + "loss": 0.7099, + "step": 10025 + }, + { + "epoch": 3.2628497072218607, + "grad_norm": 1.632330298423767, + "learning_rate": 1.3827196754743732e-05, + "loss": 0.7432, + "step": 10030 + }, + { + "epoch": 3.2644762524398176, + "grad_norm": 1.530735969543457, + "learning_rate": 1.3804115179183511e-05, + "loss": 0.6988, + "step": 10035 + }, + { + "epoch": 3.266102797657775, + "grad_norm": 1.9401394128799438, + "learning_rate": 1.3781045535180782e-05, + "loss": 0.7309, + "step": 10040 + }, + { + "epoch": 3.267729342875732, + "grad_norm": 1.458641767501831, + "learning_rate": 1.375798784732108e-05, + "loss": 0.7389, + "step": 10045 + }, + { + "epoch": 3.269355888093689, + "grad_norm": 1.3258392810821533, + "learning_rate": 1.3734942140177201e-05, + "loss": 0.7348, + "step": 10050 + }, + { + "epoch": 3.270982433311646, + "grad_norm": 1.3521080017089844, + "learning_rate": 1.371190843830915e-05, + "loss": 0.7185, + "step": 10055 + }, + { + "epoch": 3.272608978529603, + "grad_norm": 1.5230739116668701, + "learning_rate": 1.3688886766264175e-05, + "loss": 0.7026, + "step": 10060 + }, + { + "epoch": 3.27423552374756, + "grad_norm": 1.4361578226089478, + "learning_rate": 1.3665877148576661e-05, + "loss": 0.7088, + "step": 10065 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 1.3476760387420654, + "learning_rate": 1.3642879609768184e-05, + "loss": 0.721, + "step": 10070 + }, + { + "epoch": 3.2774886141834743, + "grad_norm": 1.4184231758117676, + "learning_rate": 1.3619894174347428e-05, + "loss": 0.7005, + "step": 10075 + }, + { + "epoch": 3.279115159401431, + "grad_norm": 1.4210221767425537, + "learning_rate": 1.3596920866810197e-05, + "loss": 0.7403, + "step": 10080 + }, + { + "epoch": 3.2807417046193885, + "grad_norm": 1.6710317134857178, + "learning_rate": 1.3573959711639334e-05, + "loss": 0.7061, + "step": 10085 + }, + { + "epoch": 3.2823682498373454, + "grad_norm": 1.3709286451339722, + "learning_rate": 1.3551010733304773e-05, + "loss": 0.6942, + "step": 10090 + }, + { + "epoch": 3.2839947950553023, + "grad_norm": 1.5083563327789307, + "learning_rate": 1.3528073956263432e-05, + "loss": 0.7214, + "step": 10095 + }, + { + "epoch": 3.2856213402732597, + "grad_norm": 1.6538763046264648, + "learning_rate": 1.3505149404959255e-05, + "loss": 0.7046, + "step": 10100 + }, + { + "epoch": 3.2872478854912166, + "grad_norm": 1.6021710634231567, + "learning_rate": 1.348223710382315e-05, + "loss": 0.7247, + "step": 10105 + }, + { + "epoch": 3.288874430709174, + "grad_norm": 1.453261375427246, + "learning_rate": 1.3459337077272965e-05, + "loss": 0.757, + "step": 10110 + }, + { + "epoch": 3.290500975927131, + "grad_norm": 1.5504785776138306, + "learning_rate": 1.3436449349713478e-05, + "loss": 0.71, + "step": 10115 + }, + { + "epoch": 3.2921275211450878, + "grad_norm": 1.4742834568023682, + "learning_rate": 1.3413573945536334e-05, + "loss": 0.7361, + "step": 10120 + }, + { + "epoch": 3.2937540663630447, + "grad_norm": 1.8467600345611572, + "learning_rate": 1.3390710889120077e-05, + "loss": 0.7144, + "step": 10125 + }, + { + "epoch": 3.295380611581002, + "grad_norm": 1.5694597959518433, + "learning_rate": 1.3367860204830063e-05, + "loss": 0.7177, + "step": 10130 + }, + { + "epoch": 3.297007156798959, + "grad_norm": 1.5384880304336548, + "learning_rate": 1.334502191701848e-05, + "loss": 0.6938, + "step": 10135 + }, + { + "epoch": 3.2986337020169163, + "grad_norm": 1.4981285333633423, + "learning_rate": 1.3322196050024307e-05, + "loss": 0.6952, + "step": 10140 + }, + { + "epoch": 3.300260247234873, + "grad_norm": 1.676553726196289, + "learning_rate": 1.3299382628173287e-05, + "loss": 0.7158, + "step": 10145 + }, + { + "epoch": 3.30188679245283, + "grad_norm": 1.5886269807815552, + "learning_rate": 1.3276581675777877e-05, + "loss": 0.6863, + "step": 10150 + }, + { + "epoch": 3.303513337670787, + "grad_norm": 1.6923164129257202, + "learning_rate": 1.3253793217137275e-05, + "loss": 0.6976, + "step": 10155 + }, + { + "epoch": 3.3051398828887444, + "grad_norm": 1.584494948387146, + "learning_rate": 1.3231017276537339e-05, + "loss": 0.7109, + "step": 10160 + }, + { + "epoch": 3.3067664281067013, + "grad_norm": 1.5413905382156372, + "learning_rate": 1.3208253878250604e-05, + "loss": 0.7154, + "step": 10165 + }, + { + "epoch": 3.3083929733246586, + "grad_norm": 1.5632292032241821, + "learning_rate": 1.3185503046536235e-05, + "loss": 0.6953, + "step": 10170 + }, + { + "epoch": 3.3100195185426156, + "grad_norm": 1.340214729309082, + "learning_rate": 1.316276480564001e-05, + "loss": 0.6667, + "step": 10175 + }, + { + "epoch": 3.3116460637605725, + "grad_norm": 1.4377366304397583, + "learning_rate": 1.3140039179794267e-05, + "loss": 0.7057, + "step": 10180 + }, + { + "epoch": 3.3132726089785294, + "grad_norm": 1.8054593801498413, + "learning_rate": 1.3117326193217927e-05, + "loss": 0.6993, + "step": 10185 + }, + { + "epoch": 3.3148991541964867, + "grad_norm": 1.7788589000701904, + "learning_rate": 1.3094625870116418e-05, + "loss": 0.7208, + "step": 10190 + }, + { + "epoch": 3.3165256994144436, + "grad_norm": 1.6707552671432495, + "learning_rate": 1.3071938234681686e-05, + "loss": 0.7208, + "step": 10195 + }, + { + "epoch": 3.318152244632401, + "grad_norm": 1.4484069347381592, + "learning_rate": 1.3049263311092153e-05, + "loss": 0.698, + "step": 10200 + }, + { + "epoch": 3.319778789850358, + "grad_norm": 1.6301907300949097, + "learning_rate": 1.3026601123512707e-05, + "loss": 0.7264, + "step": 10205 + }, + { + "epoch": 3.321405335068315, + "grad_norm": 1.7185864448547363, + "learning_rate": 1.300395169609463e-05, + "loss": 0.7273, + "step": 10210 + }, + { + "epoch": 3.3230318802862717, + "grad_norm": 1.5145241022109985, + "learning_rate": 1.2981315052975643e-05, + "loss": 0.7335, + "step": 10215 + }, + { + "epoch": 3.324658425504229, + "grad_norm": 1.6701593399047852, + "learning_rate": 1.2958691218279807e-05, + "loss": 0.7495, + "step": 10220 + }, + { + "epoch": 3.326284970722186, + "grad_norm": 1.5697754621505737, + "learning_rate": 1.2936080216117568e-05, + "loss": 0.7159, + "step": 10225 + }, + { + "epoch": 3.3279115159401433, + "grad_norm": 1.5273504257202148, + "learning_rate": 1.2913482070585653e-05, + "loss": 0.7231, + "step": 10230 + }, + { + "epoch": 3.3295380611581002, + "grad_norm": 1.4350838661193848, + "learning_rate": 1.2890896805767138e-05, + "loss": 0.7127, + "step": 10235 + }, + { + "epoch": 3.331164606376057, + "grad_norm": 1.387499213218689, + "learning_rate": 1.2868324445731355e-05, + "loss": 0.7209, + "step": 10240 + }, + { + "epoch": 3.332791151594014, + "grad_norm": 1.4423376321792603, + "learning_rate": 1.2845765014533851e-05, + "loss": 0.7224, + "step": 10245 + }, + { + "epoch": 3.3344176968119714, + "grad_norm": 1.6776084899902344, + "learning_rate": 1.2823218536216442e-05, + "loss": 0.705, + "step": 10250 + }, + { + "epoch": 3.3360442420299283, + "grad_norm": 1.4317584037780762, + "learning_rate": 1.2800685034807091e-05, + "loss": 0.6849, + "step": 10255 + }, + { + "epoch": 3.3376707872478857, + "grad_norm": 1.462085485458374, + "learning_rate": 1.2778164534319986e-05, + "loss": 0.6955, + "step": 10260 + }, + { + "epoch": 3.3392973324658426, + "grad_norm": 1.4515974521636963, + "learning_rate": 1.2755657058755399e-05, + "loss": 0.6959, + "step": 10265 + }, + { + "epoch": 3.3409238776837995, + "grad_norm": 1.6273915767669678, + "learning_rate": 1.273316263209979e-05, + "loss": 0.7104, + "step": 10270 + }, + { + "epoch": 3.342550422901757, + "grad_norm": 1.5296517610549927, + "learning_rate": 1.2710681278325653e-05, + "loss": 0.7004, + "step": 10275 + }, + { + "epoch": 3.3441769681197138, + "grad_norm": 1.442663550376892, + "learning_rate": 1.2688213021391587e-05, + "loss": 0.7464, + "step": 10280 + }, + { + "epoch": 3.3458035133376707, + "grad_norm": 1.6764130592346191, + "learning_rate": 1.2665757885242208e-05, + "loss": 0.7421, + "step": 10285 + }, + { + "epoch": 3.347430058555628, + "grad_norm": 1.6246837377548218, + "learning_rate": 1.2643315893808172e-05, + "loss": 0.7045, + "step": 10290 + }, + { + "epoch": 3.349056603773585, + "grad_norm": 1.4812027215957642, + "learning_rate": 1.2620887071006104e-05, + "loss": 0.712, + "step": 10295 + }, + { + "epoch": 3.350683148991542, + "grad_norm": 1.2883481979370117, + "learning_rate": 1.2598471440738612e-05, + "loss": 0.7078, + "step": 10300 + }, + { + "epoch": 3.352309694209499, + "grad_norm": 1.635571002960205, + "learning_rate": 1.2576069026894238e-05, + "loss": 0.6837, + "step": 10305 + }, + { + "epoch": 3.353936239427456, + "grad_norm": 1.4900321960449219, + "learning_rate": 1.2553679853347458e-05, + "loss": 0.732, + "step": 10310 + }, + { + "epoch": 3.355562784645413, + "grad_norm": 1.5119543075561523, + "learning_rate": 1.2531303943958595e-05, + "loss": 0.7075, + "step": 10315 + }, + { + "epoch": 3.3571893298633704, + "grad_norm": 1.4920324087142944, + "learning_rate": 1.250894132257388e-05, + "loss": 0.7563, + "step": 10320 + }, + { + "epoch": 3.3588158750813273, + "grad_norm": 1.6020745038986206, + "learning_rate": 1.248659201302535e-05, + "loss": 0.7194, + "step": 10325 + }, + { + "epoch": 3.360442420299284, + "grad_norm": 1.659667730331421, + "learning_rate": 1.2464256039130876e-05, + "loss": 0.7261, + "step": 10330 + }, + { + "epoch": 3.3620689655172415, + "grad_norm": 1.8238872289657593, + "learning_rate": 1.244193342469411e-05, + "loss": 0.75, + "step": 10335 + }, + { + "epoch": 3.3636955107351985, + "grad_norm": 1.4221556186676025, + "learning_rate": 1.2419624193504481e-05, + "loss": 0.7191, + "step": 10340 + }, + { + "epoch": 3.3653220559531554, + "grad_norm": 1.6134623289108276, + "learning_rate": 1.239732836933712e-05, + "loss": 0.695, + "step": 10345 + }, + { + "epoch": 3.3669486011711127, + "grad_norm": 1.579270601272583, + "learning_rate": 1.237504597595291e-05, + "loss": 0.6977, + "step": 10350 + }, + { + "epoch": 3.3685751463890696, + "grad_norm": 1.424869179725647, + "learning_rate": 1.2352777037098386e-05, + "loss": 0.7302, + "step": 10355 + }, + { + "epoch": 3.3702016916070265, + "grad_norm": 1.791737675666809, + "learning_rate": 1.2330521576505771e-05, + "loss": 0.7342, + "step": 10360 + }, + { + "epoch": 3.371828236824984, + "grad_norm": 1.5047688484191895, + "learning_rate": 1.2308279617892915e-05, + "loss": 0.7156, + "step": 10365 + }, + { + "epoch": 3.373454782042941, + "grad_norm": 1.4820597171783447, + "learning_rate": 1.2286051184963273e-05, + "loss": 0.6901, + "step": 10370 + }, + { + "epoch": 3.3750813272608977, + "grad_norm": 1.4850261211395264, + "learning_rate": 1.2263836301405906e-05, + "loss": 0.721, + "step": 10375 + }, + { + "epoch": 3.376707872478855, + "grad_norm": 1.51651930809021, + "learning_rate": 1.2241634990895396e-05, + "loss": 0.6978, + "step": 10380 + }, + { + "epoch": 3.378334417696812, + "grad_norm": 1.579532504081726, + "learning_rate": 1.2219447277091906e-05, + "loss": 0.7238, + "step": 10385 + }, + { + "epoch": 3.379960962914769, + "grad_norm": 1.5791518688201904, + "learning_rate": 1.2197273183641067e-05, + "loss": 0.6893, + "step": 10390 + }, + { + "epoch": 3.3815875081327262, + "grad_norm": 1.5441879034042358, + "learning_rate": 1.2175112734174024e-05, + "loss": 0.7019, + "step": 10395 + }, + { + "epoch": 3.383214053350683, + "grad_norm": 1.655837893486023, + "learning_rate": 1.2152965952307372e-05, + "loss": 0.7004, + "step": 10400 + }, + { + "epoch": 3.38484059856864, + "grad_norm": 1.7848845720291138, + "learning_rate": 1.213083286164315e-05, + "loss": 0.7092, + "step": 10405 + }, + { + "epoch": 3.3864671437865974, + "grad_norm": 1.4102035760879517, + "learning_rate": 1.2108713485768774e-05, + "loss": 0.6816, + "step": 10410 + }, + { + "epoch": 3.3880936890045543, + "grad_norm": 1.450148105621338, + "learning_rate": 1.2086607848257092e-05, + "loss": 0.7409, + "step": 10415 + }, + { + "epoch": 3.3897202342225112, + "grad_norm": 1.5566368103027344, + "learning_rate": 1.2064515972666262e-05, + "loss": 0.6832, + "step": 10420 + }, + { + "epoch": 3.3913467794404686, + "grad_norm": 1.4712780714035034, + "learning_rate": 1.2042437882539809e-05, + "loss": 0.727, + "step": 10425 + }, + { + "epoch": 3.3929733246584255, + "grad_norm": 1.4727650880813599, + "learning_rate": 1.2020373601406556e-05, + "loss": 0.7397, + "step": 10430 + }, + { + "epoch": 3.3945998698763824, + "grad_norm": 1.4990061521530151, + "learning_rate": 1.1998323152780621e-05, + "loss": 0.7217, + "step": 10435 + }, + { + "epoch": 3.3962264150943398, + "grad_norm": 1.4598941802978516, + "learning_rate": 1.1976286560161354e-05, + "loss": 0.7256, + "step": 10440 + }, + { + "epoch": 3.3978529603122967, + "grad_norm": 1.5940728187561035, + "learning_rate": 1.195426384703337e-05, + "loss": 0.6982, + "step": 10445 + }, + { + "epoch": 3.3994795055302536, + "grad_norm": 1.4404159784317017, + "learning_rate": 1.1932255036866458e-05, + "loss": 0.7389, + "step": 10450 + }, + { + "epoch": 3.401106050748211, + "grad_norm": 1.8194462060928345, + "learning_rate": 1.1910260153115619e-05, + "loss": 0.7232, + "step": 10455 + }, + { + "epoch": 3.402732595966168, + "grad_norm": 1.5840704441070557, + "learning_rate": 1.1888279219221004e-05, + "loss": 0.7327, + "step": 10460 + }, + { + "epoch": 3.4043591411841247, + "grad_norm": 1.384757399559021, + "learning_rate": 1.1866312258607908e-05, + "loss": 0.6981, + "step": 10465 + }, + { + "epoch": 3.405985686402082, + "grad_norm": 1.5484743118286133, + "learning_rate": 1.1844359294686702e-05, + "loss": 0.7377, + "step": 10470 + }, + { + "epoch": 3.407612231620039, + "grad_norm": 1.4628351926803589, + "learning_rate": 1.1822420350852878e-05, + "loss": 0.7273, + "step": 10475 + }, + { + "epoch": 3.409238776837996, + "grad_norm": 1.4812062978744507, + "learning_rate": 1.1800495450486956e-05, + "loss": 0.7043, + "step": 10480 + }, + { + "epoch": 3.4108653220559533, + "grad_norm": 1.6957182884216309, + "learning_rate": 1.1778584616954513e-05, + "loss": 0.6996, + "step": 10485 + }, + { + "epoch": 3.41249186727391, + "grad_norm": 1.6092971563339233, + "learning_rate": 1.1756687873606122e-05, + "loss": 0.7321, + "step": 10490 + }, + { + "epoch": 3.414118412491867, + "grad_norm": 1.8958309888839722, + "learning_rate": 1.1734805243777361e-05, + "loss": 0.7272, + "step": 10495 + }, + { + "epoch": 3.4157449577098244, + "grad_norm": 1.4432017803192139, + "learning_rate": 1.1712936750788723e-05, + "loss": 0.7123, + "step": 10500 + }, + { + "epoch": 3.4173715029277814, + "grad_norm": 1.6514073610305786, + "learning_rate": 1.1691082417945673e-05, + "loss": 0.7423, + "step": 10505 + }, + { + "epoch": 3.4189980481457383, + "grad_norm": 1.6459007263183594, + "learning_rate": 1.1669242268538589e-05, + "loss": 0.7416, + "step": 10510 + }, + { + "epoch": 3.4206245933636956, + "grad_norm": 1.4266551733016968, + "learning_rate": 1.1647416325842694e-05, + "loss": 0.7463, + "step": 10515 + }, + { + "epoch": 3.4222511385816525, + "grad_norm": 1.4602348804473877, + "learning_rate": 1.1625604613118111e-05, + "loss": 0.7061, + "step": 10520 + }, + { + "epoch": 3.4238776837996094, + "grad_norm": 1.6422368288040161, + "learning_rate": 1.160380715360978e-05, + "loss": 0.7195, + "step": 10525 + }, + { + "epoch": 3.425504229017567, + "grad_norm": 1.3674519062042236, + "learning_rate": 1.1582023970547464e-05, + "loss": 0.7053, + "step": 10530 + }, + { + "epoch": 3.4271307742355237, + "grad_norm": 1.6518824100494385, + "learning_rate": 1.1560255087145686e-05, + "loss": 0.7295, + "step": 10535 + }, + { + "epoch": 3.4287573194534806, + "grad_norm": 1.490520715713501, + "learning_rate": 1.153850052660376e-05, + "loss": 0.707, + "step": 10540 + }, + { + "epoch": 3.430383864671438, + "grad_norm": 1.8190025091171265, + "learning_rate": 1.1516760312105702e-05, + "loss": 0.682, + "step": 10545 + }, + { + "epoch": 3.432010409889395, + "grad_norm": 1.6653496026992798, + "learning_rate": 1.149503446682027e-05, + "loss": 0.748, + "step": 10550 + }, + { + "epoch": 3.4336369551073522, + "grad_norm": 1.7003514766693115, + "learning_rate": 1.14733230139009e-05, + "loss": 0.7389, + "step": 10555 + }, + { + "epoch": 3.435263500325309, + "grad_norm": 2.0321991443634033, + "learning_rate": 1.1451625976485691e-05, + "loss": 0.7263, + "step": 10560 + }, + { + "epoch": 3.436890045543266, + "grad_norm": 1.838392734527588, + "learning_rate": 1.1429943377697363e-05, + "loss": 0.6925, + "step": 10565 + }, + { + "epoch": 3.438516590761223, + "grad_norm": 1.4092440605163574, + "learning_rate": 1.1408275240643274e-05, + "loss": 0.731, + "step": 10570 + }, + { + "epoch": 3.4401431359791803, + "grad_norm": 1.6057584285736084, + "learning_rate": 1.1386621588415341e-05, + "loss": 0.7068, + "step": 10575 + }, + { + "epoch": 3.441769681197137, + "grad_norm": 1.5403319597244263, + "learning_rate": 1.136498244409007e-05, + "loss": 0.6894, + "step": 10580 + }, + { + "epoch": 3.4433962264150946, + "grad_norm": 1.563175916671753, + "learning_rate": 1.1343357830728496e-05, + "loss": 0.6911, + "step": 10585 + }, + { + "epoch": 3.4450227716330515, + "grad_norm": 1.5636881589889526, + "learning_rate": 1.1321747771376177e-05, + "loss": 0.7058, + "step": 10590 + }, + { + "epoch": 3.4466493168510084, + "grad_norm": 1.7510802745819092, + "learning_rate": 1.1300152289063135e-05, + "loss": 0.7321, + "step": 10595 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.7262320518493652, + "learning_rate": 1.127857140680389e-05, + "loss": 0.7346, + "step": 10600 + }, + { + "epoch": 3.4499024072869227, + "grad_norm": 1.5443428754806519, + "learning_rate": 1.1257005147597371e-05, + "loss": 0.7077, + "step": 10605 + }, + { + "epoch": 3.4515289525048796, + "grad_norm": 1.6872402429580688, + "learning_rate": 1.123545353442696e-05, + "loss": 0.7375, + "step": 10610 + }, + { + "epoch": 3.453155497722837, + "grad_norm": 1.5397756099700928, + "learning_rate": 1.1213916590260376e-05, + "loss": 0.7125, + "step": 10615 + }, + { + "epoch": 3.454782042940794, + "grad_norm": 1.5730502605438232, + "learning_rate": 1.1192394338049777e-05, + "loss": 0.6936, + "step": 10620 + }, + { + "epoch": 3.4564085881587507, + "grad_norm": 1.488749623298645, + "learning_rate": 1.11708868007316e-05, + "loss": 0.7139, + "step": 10625 + }, + { + "epoch": 3.4580351333767076, + "grad_norm": 1.4544713497161865, + "learning_rate": 1.114939400122664e-05, + "loss": 0.705, + "step": 10630 + }, + { + "epoch": 3.459661678594665, + "grad_norm": 1.5768717527389526, + "learning_rate": 1.1127915962439958e-05, + "loss": 0.6907, + "step": 10635 + }, + { + "epoch": 3.461288223812622, + "grad_norm": 1.412430763244629, + "learning_rate": 1.1106452707260903e-05, + "loss": 0.6892, + "step": 10640 + }, + { + "epoch": 3.4629147690305793, + "grad_norm": 1.574110984802246, + "learning_rate": 1.1085004258563076e-05, + "loss": 0.6767, + "step": 10645 + }, + { + "epoch": 3.464541314248536, + "grad_norm": 1.7304850816726685, + "learning_rate": 1.1063570639204254e-05, + "loss": 0.6999, + "step": 10650 + }, + { + "epoch": 3.466167859466493, + "grad_norm": 1.688988447189331, + "learning_rate": 1.1042151872026482e-05, + "loss": 0.7273, + "step": 10655 + }, + { + "epoch": 3.46779440468445, + "grad_norm": 1.464310884475708, + "learning_rate": 1.102074797985591e-05, + "loss": 0.7321, + "step": 10660 + }, + { + "epoch": 3.4694209499024073, + "grad_norm": 1.3652915954589844, + "learning_rate": 1.0999358985502883e-05, + "loss": 0.6896, + "step": 10665 + }, + { + "epoch": 3.4710474951203643, + "grad_norm": 1.5867336988449097, + "learning_rate": 1.0977984911761826e-05, + "loss": 0.7469, + "step": 10670 + }, + { + "epoch": 3.4726740403383216, + "grad_norm": 1.4255341291427612, + "learning_rate": 1.0956625781411305e-05, + "loss": 0.7027, + "step": 10675 + }, + { + "epoch": 3.4743005855562785, + "grad_norm": 1.4923709630966187, + "learning_rate": 1.0935281617213918e-05, + "loss": 0.7498, + "step": 10680 + }, + { + "epoch": 3.4759271307742354, + "grad_norm": 1.4815912246704102, + "learning_rate": 1.0913952441916375e-05, + "loss": 0.7361, + "step": 10685 + }, + { + "epoch": 3.4775536759921923, + "grad_norm": 1.5375128984451294, + "learning_rate": 1.089263827824934e-05, + "loss": 0.7016, + "step": 10690 + }, + { + "epoch": 3.4791802212101497, + "grad_norm": 1.5304309129714966, + "learning_rate": 1.0871339148927537e-05, + "loss": 0.7195, + "step": 10695 + }, + { + "epoch": 3.4808067664281066, + "grad_norm": 1.3942852020263672, + "learning_rate": 1.0850055076649624e-05, + "loss": 0.6803, + "step": 10700 + }, + { + "epoch": 3.482433311646064, + "grad_norm": 1.5246343612670898, + "learning_rate": 1.0828786084098253e-05, + "loss": 0.7181, + "step": 10705 + }, + { + "epoch": 3.484059856864021, + "grad_norm": 1.6288559436798096, + "learning_rate": 1.0807532193939966e-05, + "loss": 0.7027, + "step": 10710 + }, + { + "epoch": 3.4856864020819778, + "grad_norm": 1.7364447116851807, + "learning_rate": 1.0786293428825234e-05, + "loss": 0.7243, + "step": 10715 + }, + { + "epoch": 3.487312947299935, + "grad_norm": 1.4793058633804321, + "learning_rate": 1.076506981138841e-05, + "loss": 0.7183, + "step": 10720 + }, + { + "epoch": 3.488939492517892, + "grad_norm": 1.7521905899047852, + "learning_rate": 1.0743861364247707e-05, + "loss": 0.6934, + "step": 10725 + }, + { + "epoch": 3.490566037735849, + "grad_norm": 1.63874089717865, + "learning_rate": 1.072266811000514e-05, + "loss": 0.7156, + "step": 10730 + }, + { + "epoch": 3.4921925829538063, + "grad_norm": 1.5781902074813843, + "learning_rate": 1.070149007124658e-05, + "loss": 0.7175, + "step": 10735 + }, + { + "epoch": 3.493819128171763, + "grad_norm": 1.4869858026504517, + "learning_rate": 1.0680327270541634e-05, + "loss": 0.6958, + "step": 10740 + }, + { + "epoch": 3.49544567338972, + "grad_norm": 1.7733018398284912, + "learning_rate": 1.0659179730443706e-05, + "loss": 0.7371, + "step": 10745 + }, + { + "epoch": 3.4970722186076775, + "grad_norm": 1.657753586769104, + "learning_rate": 1.063804747348992e-05, + "loss": 0.7112, + "step": 10750 + }, + { + "epoch": 3.4986987638256344, + "grad_norm": 1.668204426765442, + "learning_rate": 1.061693052220113e-05, + "loss": 0.7099, + "step": 10755 + }, + { + "epoch": 3.5003253090435913, + "grad_norm": 1.5889633893966675, + "learning_rate": 1.0595828899081844e-05, + "loss": 0.6905, + "step": 10760 + }, + { + "epoch": 3.5019518542615486, + "grad_norm": 1.5939786434173584, + "learning_rate": 1.0574742626620277e-05, + "loss": 0.7177, + "step": 10765 + }, + { + "epoch": 3.5035783994795056, + "grad_norm": 1.5347907543182373, + "learning_rate": 1.0553671727288243e-05, + "loss": 0.7034, + "step": 10770 + }, + { + "epoch": 3.5052049446974625, + "grad_norm": 1.4950356483459473, + "learning_rate": 1.0532616223541202e-05, + "loss": 0.7016, + "step": 10775 + }, + { + "epoch": 3.5068314899154194, + "grad_norm": 1.7475671768188477, + "learning_rate": 1.0511576137818202e-05, + "loss": 0.694, + "step": 10780 + }, + { + "epoch": 3.5084580351333767, + "grad_norm": 1.471077561378479, + "learning_rate": 1.0490551492541856e-05, + "loss": 0.698, + "step": 10785 + }, + { + "epoch": 3.5100845803513336, + "grad_norm": 1.5810470581054688, + "learning_rate": 1.0469542310118324e-05, + "loss": 0.716, + "step": 10790 + }, + { + "epoch": 3.511711125569291, + "grad_norm": 1.5249967575073242, + "learning_rate": 1.0448548612937272e-05, + "loss": 0.6923, + "step": 10795 + }, + { + "epoch": 3.513337670787248, + "grad_norm": 1.6547253131866455, + "learning_rate": 1.0427570423371896e-05, + "loss": 0.6937, + "step": 10800 + }, + { + "epoch": 3.514964216005205, + "grad_norm": 1.4754985570907593, + "learning_rate": 1.0406607763778825e-05, + "loss": 0.725, + "step": 10805 + }, + { + "epoch": 3.516590761223162, + "grad_norm": 1.5849754810333252, + "learning_rate": 1.038566065649817e-05, + "loss": 0.708, + "step": 10810 + }, + { + "epoch": 3.518217306441119, + "grad_norm": 1.5983434915542603, + "learning_rate": 1.036472912385345e-05, + "loss": 0.7304, + "step": 10815 + }, + { + "epoch": 3.519843851659076, + "grad_norm": 1.6020256280899048, + "learning_rate": 1.0343813188151608e-05, + "loss": 0.7113, + "step": 10820 + }, + { + "epoch": 3.5214703968770333, + "grad_norm": 1.7454357147216797, + "learning_rate": 1.0322912871682928e-05, + "loss": 0.7335, + "step": 10825 + }, + { + "epoch": 3.5230969420949902, + "grad_norm": 1.7060421705245972, + "learning_rate": 1.0302028196721087e-05, + "loss": 0.7337, + "step": 10830 + }, + { + "epoch": 3.524723487312947, + "grad_norm": 1.7001293897628784, + "learning_rate": 1.0281159185523057e-05, + "loss": 0.7161, + "step": 10835 + }, + { + "epoch": 3.5263500325309045, + "grad_norm": 1.522144079208374, + "learning_rate": 1.0260305860329145e-05, + "loss": 0.7163, + "step": 10840 + }, + { + "epoch": 3.5279765777488614, + "grad_norm": 1.8119834661483765, + "learning_rate": 1.0239468243362932e-05, + "loss": 0.7424, + "step": 10845 + }, + { + "epoch": 3.5296031229668183, + "grad_norm": 1.5742155313491821, + "learning_rate": 1.0218646356831269e-05, + "loss": 0.696, + "step": 10850 + }, + { + "epoch": 3.5312296681847757, + "grad_norm": 1.5806254148483276, + "learning_rate": 1.019784022292421e-05, + "loss": 0.7425, + "step": 10855 + }, + { + "epoch": 3.5328562134027326, + "grad_norm": 1.4441313743591309, + "learning_rate": 1.0177049863815064e-05, + "loss": 0.7169, + "step": 10860 + }, + { + "epoch": 3.5344827586206895, + "grad_norm": 1.6235036849975586, + "learning_rate": 1.0156275301660289e-05, + "loss": 0.7074, + "step": 10865 + }, + { + "epoch": 3.536109303838647, + "grad_norm": 1.671635389328003, + "learning_rate": 1.0135516558599537e-05, + "loss": 0.696, + "step": 10870 + }, + { + "epoch": 3.5377358490566038, + "grad_norm": 1.348893642425537, + "learning_rate": 1.0114773656755591e-05, + "loss": 0.723, + "step": 10875 + }, + { + "epoch": 3.5393623942745607, + "grad_norm": 1.5773969888687134, + "learning_rate": 1.0094046618234362e-05, + "loss": 0.7092, + "step": 10880 + }, + { + "epoch": 3.540988939492518, + "grad_norm": 1.5765646696090698, + "learning_rate": 1.0073335465124827e-05, + "loss": 0.703, + "step": 10885 + }, + { + "epoch": 3.542615484710475, + "grad_norm": 1.6599916219711304, + "learning_rate": 1.0052640219499073e-05, + "loss": 0.6936, + "step": 10890 + }, + { + "epoch": 3.544242029928432, + "grad_norm": 1.5221357345581055, + "learning_rate": 1.003196090341219e-05, + "loss": 0.708, + "step": 10895 + }, + { + "epoch": 3.545868575146389, + "grad_norm": 1.9498372077941895, + "learning_rate": 1.0011297538902331e-05, + "loss": 0.7201, + "step": 10900 + }, + { + "epoch": 3.547495120364346, + "grad_norm": 1.7263884544372559, + "learning_rate": 9.99065014799063e-06, + "loss": 0.716, + "step": 10905 + }, + { + "epoch": 3.5491216655823035, + "grad_norm": 1.6290454864501953, + "learning_rate": 9.970018752681212e-06, + "loss": 0.687, + "step": 10910 + }, + { + "epoch": 3.5507482108002604, + "grad_norm": 1.834154486656189, + "learning_rate": 9.949403374961125e-06, + "loss": 0.7009, + "step": 10915 + }, + { + "epoch": 3.5523747560182173, + "grad_norm": 1.2725188732147217, + "learning_rate": 9.928804036800376e-06, + "loss": 0.7077, + "step": 10920 + }, + { + "epoch": 3.554001301236174, + "grad_norm": 1.5033130645751953, + "learning_rate": 9.908220760151879e-06, + "loss": 0.694, + "step": 10925 + }, + { + "epoch": 3.5556278464541315, + "grad_norm": 1.5326581001281738, + "learning_rate": 9.887653566951405e-06, + "loss": 0.706, + "step": 10930 + }, + { + "epoch": 3.5572543916720885, + "grad_norm": 1.638375997543335, + "learning_rate": 9.867102479117606e-06, + "loss": 0.7194, + "step": 10935 + }, + { + "epoch": 3.558880936890046, + "grad_norm": 1.496351718902588, + "learning_rate": 9.846567518551972e-06, + "loss": 0.6969, + "step": 10940 + }, + { + "epoch": 3.5605074821080027, + "grad_norm": 1.7245603799819946, + "learning_rate": 9.826048707138802e-06, + "loss": 0.7163, + "step": 10945 + }, + { + "epoch": 3.5621340273259596, + "grad_norm": 1.4130351543426514, + "learning_rate": 9.805546066745167e-06, + "loss": 0.6869, + "step": 10950 + }, + { + "epoch": 3.5637605725439165, + "grad_norm": 1.4424623250961304, + "learning_rate": 9.785059619220937e-06, + "loss": 0.7284, + "step": 10955 + }, + { + "epoch": 3.565387117761874, + "grad_norm": 1.5214426517486572, + "learning_rate": 9.76458938639869e-06, + "loss": 0.7201, + "step": 10960 + }, + { + "epoch": 3.567013662979831, + "grad_norm": 1.5227466821670532, + "learning_rate": 9.74413539009375e-06, + "loss": 0.7147, + "step": 10965 + }, + { + "epoch": 3.568640208197788, + "grad_norm": 1.5004678964614868, + "learning_rate": 9.72369765210413e-06, + "loss": 0.7204, + "step": 10970 + }, + { + "epoch": 3.570266753415745, + "grad_norm": 1.437030553817749, + "learning_rate": 9.703276194210523e-06, + "loss": 0.7542, + "step": 10975 + }, + { + "epoch": 3.571893298633702, + "grad_norm": 1.3358042240142822, + "learning_rate": 9.682871038176247e-06, + "loss": 0.7111, + "step": 10980 + }, + { + "epoch": 3.573519843851659, + "grad_norm": 1.4909240007400513, + "learning_rate": 9.662482205747286e-06, + "loss": 0.7229, + "step": 10985 + }, + { + "epoch": 3.5751463890696162, + "grad_norm": 1.5553544759750366, + "learning_rate": 9.642109718652183e-06, + "loss": 0.7231, + "step": 10990 + }, + { + "epoch": 3.576772934287573, + "grad_norm": 1.4255069494247437, + "learning_rate": 9.621753598602107e-06, + "loss": 0.7021, + "step": 10995 + }, + { + "epoch": 3.5783994795055305, + "grad_norm": 1.4210271835327148, + "learning_rate": 9.601413867290734e-06, + "loss": 0.7395, + "step": 11000 + }, + { + "epoch": 3.5800260247234874, + "grad_norm": 1.4358595609664917, + "learning_rate": 9.581090546394337e-06, + "loss": 0.732, + "step": 11005 + }, + { + "epoch": 3.5816525699414443, + "grad_norm": 1.89461350440979, + "learning_rate": 9.560783657571642e-06, + "loss": 0.7073, + "step": 11010 + }, + { + "epoch": 3.5832791151594012, + "grad_norm": 1.55374014377594, + "learning_rate": 9.540493222463905e-06, + "loss": 0.7347, + "step": 11015 + }, + { + "epoch": 3.5849056603773586, + "grad_norm": 1.6205248832702637, + "learning_rate": 9.52021926269481e-06, + "loss": 0.6887, + "step": 11020 + }, + { + "epoch": 3.5865322055953155, + "grad_norm": 1.6707634925842285, + "learning_rate": 9.499961799870521e-06, + "loss": 0.7344, + "step": 11025 + }, + { + "epoch": 3.588158750813273, + "grad_norm": 1.648051142692566, + "learning_rate": 9.479720855579586e-06, + "loss": 0.6981, + "step": 11030 + }, + { + "epoch": 3.5897852960312298, + "grad_norm": 1.5489128828048706, + "learning_rate": 9.45949645139297e-06, + "loss": 0.7281, + "step": 11035 + }, + { + "epoch": 3.5914118412491867, + "grad_norm": 1.468323826789856, + "learning_rate": 9.439288608864013e-06, + "loss": 0.7361, + "step": 11040 + }, + { + "epoch": 3.5930383864671436, + "grad_norm": 1.420743703842163, + "learning_rate": 9.419097349528406e-06, + "loss": 0.7072, + "step": 11045 + }, + { + "epoch": 3.594664931685101, + "grad_norm": 1.663732886314392, + "learning_rate": 9.398922694904139e-06, + "loss": 0.7336, + "step": 11050 + }, + { + "epoch": 3.596291476903058, + "grad_norm": 1.593814730644226, + "learning_rate": 9.378764666491538e-06, + "loss": 0.6991, + "step": 11055 + }, + { + "epoch": 3.597918022121015, + "grad_norm": 1.5049411058425903, + "learning_rate": 9.358623285773207e-06, + "loss": 0.7166, + "step": 11060 + }, + { + "epoch": 3.599544567338972, + "grad_norm": 1.529930591583252, + "learning_rate": 9.338498574213977e-06, + "loss": 0.6839, + "step": 11065 + }, + { + "epoch": 3.601171112556929, + "grad_norm": 1.4036911725997925, + "learning_rate": 9.318390553260972e-06, + "loss": 0.6808, + "step": 11070 + }, + { + "epoch": 3.602797657774886, + "grad_norm": 1.6711106300354004, + "learning_rate": 9.29829924434347e-06, + "loss": 0.6842, + "step": 11075 + }, + { + "epoch": 3.6044242029928433, + "grad_norm": 1.5226213932037354, + "learning_rate": 9.278224668872976e-06, + "loss": 0.7293, + "step": 11080 + }, + { + "epoch": 3.6060507482108, + "grad_norm": 1.6881130933761597, + "learning_rate": 9.258166848243138e-06, + "loss": 0.7121, + "step": 11085 + }, + { + "epoch": 3.6076772934287575, + "grad_norm": 1.5739160776138306, + "learning_rate": 9.238125803829775e-06, + "loss": 0.7261, + "step": 11090 + }, + { + "epoch": 3.6093038386467144, + "grad_norm": 1.5207170248031616, + "learning_rate": 9.218101556990799e-06, + "loss": 0.6994, + "step": 11095 + }, + { + "epoch": 3.6109303838646714, + "grad_norm": 1.6722067594528198, + "learning_rate": 9.198094129066237e-06, + "loss": 0.6848, + "step": 11100 + }, + { + "epoch": 3.6125569290826283, + "grad_norm": 1.4655790328979492, + "learning_rate": 9.178103541378192e-06, + "loss": 0.7279, + "step": 11105 + }, + { + "epoch": 3.6141834743005856, + "grad_norm": 1.6617612838745117, + "learning_rate": 9.158129815230827e-06, + "loss": 0.7335, + "step": 11110 + }, + { + "epoch": 3.6158100195185425, + "grad_norm": 1.3803026676177979, + "learning_rate": 9.138172971910305e-06, + "loss": 0.6761, + "step": 11115 + }, + { + "epoch": 3.6174365647365, + "grad_norm": 1.6932843923568726, + "learning_rate": 9.118233032684839e-06, + "loss": 0.7244, + "step": 11120 + }, + { + "epoch": 3.619063109954457, + "grad_norm": 1.5689126253128052, + "learning_rate": 9.098310018804585e-06, + "loss": 0.7227, + "step": 11125 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 1.5847350358963013, + "learning_rate": 9.078403951501694e-06, + "loss": 0.7361, + "step": 11130 + }, + { + "epoch": 3.6223162003903706, + "grad_norm": 1.612509846687317, + "learning_rate": 9.058514851990241e-06, + "loss": 0.7171, + "step": 11135 + }, + { + "epoch": 3.623942745608328, + "grad_norm": 1.5795437097549438, + "learning_rate": 9.038642741466235e-06, + "loss": 0.7151, + "step": 11140 + }, + { + "epoch": 3.625569290826285, + "grad_norm": 1.480151891708374, + "learning_rate": 9.018787641107549e-06, + "loss": 0.7249, + "step": 11145 + }, + { + "epoch": 3.6271958360442422, + "grad_norm": 1.5246944427490234, + "learning_rate": 8.99894957207396e-06, + "loss": 0.7208, + "step": 11150 + }, + { + "epoch": 3.628822381262199, + "grad_norm": 1.4257066249847412, + "learning_rate": 8.979128555507072e-06, + "loss": 0.689, + "step": 11155 + }, + { + "epoch": 3.630448926480156, + "grad_norm": 1.8311058282852173, + "learning_rate": 8.959324612530324e-06, + "loss": 0.6831, + "step": 11160 + }, + { + "epoch": 3.632075471698113, + "grad_norm": 1.6707180738449097, + "learning_rate": 8.939537764248962e-06, + "loss": 0.7263, + "step": 11165 + }, + { + "epoch": 3.6337020169160703, + "grad_norm": 1.5278527736663818, + "learning_rate": 8.919768031750025e-06, + "loss": 0.7046, + "step": 11170 + }, + { + "epoch": 3.635328562134027, + "grad_norm": 1.432439923286438, + "learning_rate": 8.900015436102283e-06, + "loss": 0.7218, + "step": 11175 + }, + { + "epoch": 3.6369551073519846, + "grad_norm": 1.6748601198196411, + "learning_rate": 8.880279998356267e-06, + "loss": 0.7009, + "step": 11180 + }, + { + "epoch": 3.6385816525699415, + "grad_norm": 1.6934770345687866, + "learning_rate": 8.860561739544207e-06, + "loss": 0.6957, + "step": 11185 + }, + { + "epoch": 3.6402081977878984, + "grad_norm": 1.4138067960739136, + "learning_rate": 8.840860680680036e-06, + "loss": 0.7357, + "step": 11190 + }, + { + "epoch": 3.6418347430058553, + "grad_norm": 1.5827653408050537, + "learning_rate": 8.821176842759355e-06, + "loss": 0.711, + "step": 11195 + }, + { + "epoch": 3.6434612882238127, + "grad_norm": 1.699881672859192, + "learning_rate": 8.801510246759411e-06, + "loss": 0.7058, + "step": 11200 + }, + { + "epoch": 3.6450878334417696, + "grad_norm": 1.8827296495437622, + "learning_rate": 8.78186091363909e-06, + "loss": 0.7211, + "step": 11205 + }, + { + "epoch": 3.646714378659727, + "grad_norm": 1.634993314743042, + "learning_rate": 8.762228864338842e-06, + "loss": 0.699, + "step": 11210 + }, + { + "epoch": 3.648340923877684, + "grad_norm": 1.676653265953064, + "learning_rate": 8.742614119780743e-06, + "loss": 0.7251, + "step": 11215 + }, + { + "epoch": 3.6499674690956407, + "grad_norm": 1.5447890758514404, + "learning_rate": 8.723016700868391e-06, + "loss": 0.6831, + "step": 11220 + }, + { + "epoch": 3.6515940143135976, + "grad_norm": 1.5870860815048218, + "learning_rate": 8.703436628486945e-06, + "loss": 0.7185, + "step": 11225 + }, + { + "epoch": 3.653220559531555, + "grad_norm": 1.86679208278656, + "learning_rate": 8.683873923503064e-06, + "loss": 0.6815, + "step": 11230 + }, + { + "epoch": 3.654847104749512, + "grad_norm": 1.460684061050415, + "learning_rate": 8.664328606764916e-06, + "loss": 0.6851, + "step": 11235 + }, + { + "epoch": 3.6564736499674693, + "grad_norm": 1.7665942907333374, + "learning_rate": 8.644800699102101e-06, + "loss": 0.7026, + "step": 11240 + }, + { + "epoch": 3.658100195185426, + "grad_norm": 1.4680864810943604, + "learning_rate": 8.629190921491773e-06, + "loss": 0.706, + "step": 11245 + }, + { + "epoch": 3.659726740403383, + "grad_norm": 1.711676001548767, + "learning_rate": 8.609694402596e-06, + "loss": 0.7329, + "step": 11250 + }, + { + "epoch": 3.6613532856213404, + "grad_norm": 1.7062443494796753, + "learning_rate": 8.59021535099975e-06, + "loss": 0.7563, + "step": 11255 + }, + { + "epoch": 3.6629798308392973, + "grad_norm": 1.5526551008224487, + "learning_rate": 8.570753787462032e-06, + "loss": 0.6702, + "step": 11260 + }, + { + "epoch": 3.6646063760572543, + "grad_norm": 1.6249076128005981, + "learning_rate": 8.551309732723198e-06, + "loss": 0.7077, + "step": 11265 + }, + { + "epoch": 3.6662329212752116, + "grad_norm": 1.4946502447128296, + "learning_rate": 8.531883207504973e-06, + "loss": 0.6921, + "step": 11270 + }, + { + "epoch": 3.6678594664931685, + "grad_norm": 1.6216143369674683, + "learning_rate": 8.512474232510367e-06, + "loss": 0.705, + "step": 11275 + }, + { + "epoch": 3.6694860117111254, + "grad_norm": 1.7009637355804443, + "learning_rate": 8.493082828423712e-06, + "loss": 0.724, + "step": 11280 + }, + { + "epoch": 3.671112556929083, + "grad_norm": 1.5824236869812012, + "learning_rate": 8.473709015910605e-06, + "loss": 0.7415, + "step": 11285 + }, + { + "epoch": 3.6727391021470397, + "grad_norm": 1.5868396759033203, + "learning_rate": 8.454352815617905e-06, + "loss": 0.7215, + "step": 11290 + }, + { + "epoch": 3.6743656473649966, + "grad_norm": 1.588746190071106, + "learning_rate": 8.435014248173679e-06, + "loss": 0.7194, + "step": 11295 + }, + { + "epoch": 3.675992192582954, + "grad_norm": 1.5329923629760742, + "learning_rate": 8.415693334187224e-06, + "loss": 0.7153, + "step": 11300 + }, + { + "epoch": 3.677618737800911, + "grad_norm": 1.4592005014419556, + "learning_rate": 8.39639009424901e-06, + "loss": 0.7061, + "step": 11305 + }, + { + "epoch": 3.6792452830188678, + "grad_norm": 1.4733586311340332, + "learning_rate": 8.377104548930678e-06, + "loss": 0.7202, + "step": 11310 + }, + { + "epoch": 3.680871828236825, + "grad_norm": 1.5324994325637817, + "learning_rate": 8.357836718785017e-06, + "loss": 0.7092, + "step": 11315 + }, + { + "epoch": 3.682498373454782, + "grad_norm": 1.8415261507034302, + "learning_rate": 8.338586624345926e-06, + "loss": 0.705, + "step": 11320 + }, + { + "epoch": 3.684124918672739, + "grad_norm": 1.4567241668701172, + "learning_rate": 8.31935428612842e-06, + "loss": 0.7219, + "step": 11325 + }, + { + "epoch": 3.6857514638906963, + "grad_norm": 1.5034760236740112, + "learning_rate": 8.300139724628559e-06, + "loss": 0.6919, + "step": 11330 + }, + { + "epoch": 3.687378009108653, + "grad_norm": 1.7857074737548828, + "learning_rate": 8.280942960323496e-06, + "loss": 0.6974, + "step": 11335 + }, + { + "epoch": 3.68900455432661, + "grad_norm": 1.5064575672149658, + "learning_rate": 8.261764013671384e-06, + "loss": 0.7164, + "step": 11340 + }, + { + "epoch": 3.6906310995445675, + "grad_norm": 1.8192105293273926, + "learning_rate": 8.242602905111408e-06, + "loss": 0.6909, + "step": 11345 + }, + { + "epoch": 3.6922576447625244, + "grad_norm": 1.5833736658096313, + "learning_rate": 8.22345965506374e-06, + "loss": 0.7132, + "step": 11350 + }, + { + "epoch": 3.6938841899804813, + "grad_norm": 1.5906457901000977, + "learning_rate": 8.204334283929527e-06, + "loss": 0.727, + "step": 11355 + }, + { + "epoch": 3.6955107351984386, + "grad_norm": 1.7350749969482422, + "learning_rate": 8.185226812090834e-06, + "loss": 0.6895, + "step": 11360 + }, + { + "epoch": 3.6971372804163956, + "grad_norm": 1.5793412923812866, + "learning_rate": 8.166137259910689e-06, + "loss": 0.721, + "step": 11365 + }, + { + "epoch": 3.6987638256343525, + "grad_norm": 1.5128093957901, + "learning_rate": 8.147065647732984e-06, + "loss": 0.7131, + "step": 11370 + }, + { + "epoch": 3.70039037085231, + "grad_norm": 1.4426097869873047, + "learning_rate": 8.12801199588253e-06, + "loss": 0.7236, + "step": 11375 + }, + { + "epoch": 3.7020169160702667, + "grad_norm": 1.4467498064041138, + "learning_rate": 8.108976324664955e-06, + "loss": 0.7097, + "step": 11380 + }, + { + "epoch": 3.703643461288224, + "grad_norm": 1.621412754058838, + "learning_rate": 8.089958654366781e-06, + "loss": 0.7465, + "step": 11385 + }, + { + "epoch": 3.705270006506181, + "grad_norm": 1.6890511512756348, + "learning_rate": 8.070959005255288e-06, + "loss": 0.743, + "step": 11390 + }, + { + "epoch": 3.706896551724138, + "grad_norm": 1.5129393339157104, + "learning_rate": 8.051977397578594e-06, + "loss": 0.7237, + "step": 11395 + }, + { + "epoch": 3.708523096942095, + "grad_norm": 1.5517621040344238, + "learning_rate": 8.033013851565561e-06, + "loss": 0.7062, + "step": 11400 + }, + { + "epoch": 3.710149642160052, + "grad_norm": 1.579030990600586, + "learning_rate": 8.014068387425824e-06, + "loss": 0.7029, + "step": 11405 + }, + { + "epoch": 3.711776187378009, + "grad_norm": 1.5877450704574585, + "learning_rate": 7.995141025349717e-06, + "loss": 0.6958, + "step": 11410 + }, + { + "epoch": 3.7134027325959664, + "grad_norm": 1.4702614545822144, + "learning_rate": 7.97623178550834e-06, + "loss": 0.7081, + "step": 11415 + }, + { + "epoch": 3.7150292778139233, + "grad_norm": 1.5619736909866333, + "learning_rate": 7.957340688053413e-06, + "loss": 0.7184, + "step": 11420 + }, + { + "epoch": 3.7166558230318802, + "grad_norm": 1.4940590858459473, + "learning_rate": 7.938467753117373e-06, + "loss": 0.7201, + "step": 11425 + }, + { + "epoch": 3.718282368249837, + "grad_norm": 1.5410562753677368, + "learning_rate": 7.919613000813264e-06, + "loss": 0.6996, + "step": 11430 + }, + { + "epoch": 3.7199089134677945, + "grad_norm": 1.5216858386993408, + "learning_rate": 7.900776451234785e-06, + "loss": 0.7198, + "step": 11435 + }, + { + "epoch": 3.7215354586857514, + "grad_norm": 1.4659316539764404, + "learning_rate": 7.881958124456201e-06, + "loss": 0.7338, + "step": 11440 + }, + { + "epoch": 3.7231620039037088, + "grad_norm": 1.5941298007965088, + "learning_rate": 7.86315804053239e-06, + "loss": 0.7064, + "step": 11445 + }, + { + "epoch": 3.7247885491216657, + "grad_norm": 1.5526405572891235, + "learning_rate": 7.844376219498766e-06, + "loss": 0.6827, + "step": 11450 + }, + { + "epoch": 3.7264150943396226, + "grad_norm": 1.4719443321228027, + "learning_rate": 7.825612681371309e-06, + "loss": 0.7077, + "step": 11455 + }, + { + "epoch": 3.7280416395575795, + "grad_norm": 1.470279335975647, + "learning_rate": 7.806867446146466e-06, + "loss": 0.713, + "step": 11460 + }, + { + "epoch": 3.729668184775537, + "grad_norm": 1.5371547937393188, + "learning_rate": 7.788140533801219e-06, + "loss": 0.6979, + "step": 11465 + }, + { + "epoch": 3.7312947299934938, + "grad_norm": 1.9076155424118042, + "learning_rate": 7.769431964293023e-06, + "loss": 0.7506, + "step": 11470 + }, + { + "epoch": 3.732921275211451, + "grad_norm": 1.4369025230407715, + "learning_rate": 7.750741757559751e-06, + "loss": 0.724, + "step": 11475 + }, + { + "epoch": 3.734547820429408, + "grad_norm": 1.5313704013824463, + "learning_rate": 7.73206993351974e-06, + "loss": 0.6726, + "step": 11480 + }, + { + "epoch": 3.736174365647365, + "grad_norm": 1.5152792930603027, + "learning_rate": 7.713416512071722e-06, + "loss": 0.7423, + "step": 11485 + }, + { + "epoch": 3.737800910865322, + "grad_norm": 1.65264093875885, + "learning_rate": 7.694781513094828e-06, + "loss": 0.743, + "step": 11490 + }, + { + "epoch": 3.739427456083279, + "grad_norm": 1.4614166021347046, + "learning_rate": 7.676164956448534e-06, + "loss": 0.7137, + "step": 11495 + }, + { + "epoch": 3.741054001301236, + "grad_norm": 1.5228296518325806, + "learning_rate": 7.657566861972692e-06, + "loss": 0.6776, + "step": 11500 + }, + { + "epoch": 3.7426805465191935, + "grad_norm": 1.4110604524612427, + "learning_rate": 7.638987249487444e-06, + "loss": 0.7084, + "step": 11505 + }, + { + "epoch": 3.7443070917371504, + "grad_norm": 1.5986486673355103, + "learning_rate": 7.6204261387932655e-06, + "loss": 0.72, + "step": 11510 + }, + { + "epoch": 3.7459336369551073, + "grad_norm": 1.6505364179611206, + "learning_rate": 7.601883549670899e-06, + "loss": 0.7295, + "step": 11515 + }, + { + "epoch": 3.747560182173064, + "grad_norm": 1.6019734144210815, + "learning_rate": 7.583359501881363e-06, + "loss": 0.7244, + "step": 11520 + }, + { + "epoch": 3.7491867273910215, + "grad_norm": 1.7245863676071167, + "learning_rate": 7.564854015165887e-06, + "loss": 0.7155, + "step": 11525 + }, + { + "epoch": 3.7508132726089785, + "grad_norm": 1.7682193517684937, + "learning_rate": 7.546367109245955e-06, + "loss": 0.7222, + "step": 11530 + }, + { + "epoch": 3.752439817826936, + "grad_norm": 1.466137409210205, + "learning_rate": 7.5278988038232215e-06, + "loss": 0.7041, + "step": 11535 + }, + { + "epoch": 3.7540663630448927, + "grad_norm": 1.8637816905975342, + "learning_rate": 7.509449118579534e-06, + "loss": 0.7005, + "step": 11540 + }, + { + "epoch": 3.7556929082628496, + "grad_norm": 1.4678994417190552, + "learning_rate": 7.4910180731768916e-06, + "loss": 0.7322, + "step": 11545 + }, + { + "epoch": 3.7573194534808065, + "grad_norm": 1.4559235572814941, + "learning_rate": 7.472605687257436e-06, + "loss": 0.6871, + "step": 11550 + }, + { + "epoch": 3.758945998698764, + "grad_norm": 1.483712911605835, + "learning_rate": 7.454211980443404e-06, + "loss": 0.7194, + "step": 11555 + }, + { + "epoch": 3.760572543916721, + "grad_norm": 1.6858371496200562, + "learning_rate": 7.435836972337151e-06, + "loss": 0.7441, + "step": 11560 + }, + { + "epoch": 3.762199089134678, + "grad_norm": 1.4982420206069946, + "learning_rate": 7.4174806825210775e-06, + "loss": 0.7266, + "step": 11565 + }, + { + "epoch": 3.763825634352635, + "grad_norm": 1.4680901765823364, + "learning_rate": 7.399143130557659e-06, + "loss": 0.7165, + "step": 11570 + }, + { + "epoch": 3.765452179570592, + "grad_norm": 1.756365418434143, + "learning_rate": 7.380824335989392e-06, + "loss": 0.7343, + "step": 11575 + }, + { + "epoch": 3.767078724788549, + "grad_norm": 1.412775993347168, + "learning_rate": 7.3625243183387926e-06, + "loss": 0.7099, + "step": 11580 + }, + { + "epoch": 3.7687052700065062, + "grad_norm": 1.686211109161377, + "learning_rate": 7.344243097108341e-06, + "loss": 0.7111, + "step": 11585 + }, + { + "epoch": 3.770331815224463, + "grad_norm": 1.5744112730026245, + "learning_rate": 7.325980691780523e-06, + "loss": 0.7085, + "step": 11590 + }, + { + "epoch": 3.7719583604424205, + "grad_norm": 1.4835525751113892, + "learning_rate": 7.307737121817734e-06, + "loss": 0.7188, + "step": 11595 + }, + { + "epoch": 3.7735849056603774, + "grad_norm": 1.5784283876419067, + "learning_rate": 7.289512406662322e-06, + "loss": 0.7051, + "step": 11600 + }, + { + "epoch": 3.7752114508783343, + "grad_norm": 1.65445077419281, + "learning_rate": 7.2713065657365326e-06, + "loss": 0.6916, + "step": 11605 + }, + { + "epoch": 3.7768379960962912, + "grad_norm": 1.6914763450622559, + "learning_rate": 7.2531196184424975e-06, + "loss": 0.7208, + "step": 11610 + }, + { + "epoch": 3.7784645413142486, + "grad_norm": 1.4508914947509766, + "learning_rate": 7.234951584162225e-06, + "loss": 0.7081, + "step": 11615 + }, + { + "epoch": 3.7800910865322055, + "grad_norm": 1.6250535249710083, + "learning_rate": 7.21680248225754e-06, + "loss": 0.7187, + "step": 11620 + }, + { + "epoch": 3.781717631750163, + "grad_norm": 1.6406818628311157, + "learning_rate": 7.198672332070119e-06, + "loss": 0.6991, + "step": 11625 + }, + { + "epoch": 3.7833441769681198, + "grad_norm": 1.6318637132644653, + "learning_rate": 7.180561152921419e-06, + "loss": 0.6538, + "step": 11630 + }, + { + "epoch": 3.7849707221860767, + "grad_norm": 1.812267541885376, + "learning_rate": 7.162468964112698e-06, + "loss": 0.6928, + "step": 11635 + }, + { + "epoch": 3.7865972674040336, + "grad_norm": 2.0272045135498047, + "learning_rate": 7.144395784924965e-06, + "loss": 0.7211, + "step": 11640 + }, + { + "epoch": 3.788223812621991, + "grad_norm": 1.5524481534957886, + "learning_rate": 7.126341634618983e-06, + "loss": 0.6818, + "step": 11645 + }, + { + "epoch": 3.789850357839948, + "grad_norm": 1.5196950435638428, + "learning_rate": 7.108306532435208e-06, + "loss": 0.7085, + "step": 11650 + }, + { + "epoch": 3.791476903057905, + "grad_norm": 1.493674397468567, + "learning_rate": 7.090290497593835e-06, + "loss": 0.7017, + "step": 11655 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 1.4521366357803345, + "learning_rate": 7.072293549294698e-06, + "loss": 0.6889, + "step": 11660 + }, + { + "epoch": 3.794729993493819, + "grad_norm": 1.8838039636611938, + "learning_rate": 7.054315706717324e-06, + "loss": 0.6941, + "step": 11665 + }, + { + "epoch": 3.796356538711776, + "grad_norm": 1.545961856842041, + "learning_rate": 7.036356989020856e-06, + "loss": 0.7089, + "step": 11670 + }, + { + "epoch": 3.7979830839297333, + "grad_norm": 1.4653252363204956, + "learning_rate": 7.018417415344081e-06, + "loss": 0.688, + "step": 11675 + }, + { + "epoch": 3.79960962914769, + "grad_norm": 1.4264947175979614, + "learning_rate": 7.000497004805348e-06, + "loss": 0.6874, + "step": 11680 + }, + { + "epoch": 3.8012361743656475, + "grad_norm": 1.7496637105941772, + "learning_rate": 6.982595776502621e-06, + "loss": 0.6798, + "step": 11685 + }, + { + "epoch": 3.8028627195836044, + "grad_norm": 1.6161344051361084, + "learning_rate": 6.964713749513388e-06, + "loss": 0.6976, + "step": 11690 + }, + { + "epoch": 3.8044892648015614, + "grad_norm": 1.6683776378631592, + "learning_rate": 6.946850942894695e-06, + "loss": 0.7002, + "step": 11695 + }, + { + "epoch": 3.8061158100195187, + "grad_norm": 1.6606576442718506, + "learning_rate": 6.9290073756831e-06, + "loss": 0.7125, + "step": 11700 + }, + { + "epoch": 3.8077423552374756, + "grad_norm": 1.6164546012878418, + "learning_rate": 6.9111830668946694e-06, + "loss": 0.7005, + "step": 11705 + }, + { + "epoch": 3.8093689004554325, + "grad_norm": 1.8829686641693115, + "learning_rate": 6.893378035524911e-06, + "loss": 0.729, + "step": 11710 + }, + { + "epoch": 3.81099544567339, + "grad_norm": 1.6989054679870605, + "learning_rate": 6.87559230054883e-06, + "loss": 0.7005, + "step": 11715 + }, + { + "epoch": 3.812621990891347, + "grad_norm": 1.5152456760406494, + "learning_rate": 6.857825880920832e-06, + "loss": 0.6924, + "step": 11720 + }, + { + "epoch": 3.8142485361093037, + "grad_norm": 1.5547714233398438, + "learning_rate": 6.840078795574767e-06, + "loss": 0.7038, + "step": 11725 + }, + { + "epoch": 3.815875081327261, + "grad_norm": 1.5432192087173462, + "learning_rate": 6.822351063423849e-06, + "loss": 0.7078, + "step": 11730 + }, + { + "epoch": 3.817501626545218, + "grad_norm": 1.5756508111953735, + "learning_rate": 6.8046427033607e-06, + "loss": 0.6813, + "step": 11735 + }, + { + "epoch": 3.819128171763175, + "grad_norm": 1.4717226028442383, + "learning_rate": 6.786953734257293e-06, + "loss": 0.7053, + "step": 11740 + }, + { + "epoch": 3.8207547169811322, + "grad_norm": 1.6005703210830688, + "learning_rate": 6.769284174964902e-06, + "loss": 0.6857, + "step": 11745 + }, + { + "epoch": 3.822381262199089, + "grad_norm": 1.5539947748184204, + "learning_rate": 6.751634044314156e-06, + "loss": 0.7289, + "step": 11750 + }, + { + "epoch": 3.824007807417046, + "grad_norm": 1.5078961849212646, + "learning_rate": 6.734003361114946e-06, + "loss": 0.719, + "step": 11755 + }, + { + "epoch": 3.8256343526350034, + "grad_norm": 1.6385048627853394, + "learning_rate": 6.716392144156464e-06, + "loss": 0.7305, + "step": 11760 + }, + { + "epoch": 3.8272608978529603, + "grad_norm": 1.628980278968811, + "learning_rate": 6.698800412207146e-06, + "loss": 0.6847, + "step": 11765 + }, + { + "epoch": 3.828887443070917, + "grad_norm": 1.4770705699920654, + "learning_rate": 6.681228184014665e-06, + "loss": 0.6997, + "step": 11770 + }, + { + "epoch": 3.8305139882888746, + "grad_norm": 1.4534417390823364, + "learning_rate": 6.6636754783058945e-06, + "loss": 0.7068, + "step": 11775 + }, + { + "epoch": 3.8321405335068315, + "grad_norm": 1.4955569505691528, + "learning_rate": 6.6461423137869295e-06, + "loss": 0.7149, + "step": 11780 + }, + { + "epoch": 3.8337670787247884, + "grad_norm": 1.6264514923095703, + "learning_rate": 6.6286287091430116e-06, + "loss": 0.7132, + "step": 11785 + }, + { + "epoch": 3.8353936239427457, + "grad_norm": 1.7065529823303223, + "learning_rate": 6.611134683038561e-06, + "loss": 0.7466, + "step": 11790 + }, + { + "epoch": 3.8370201691607027, + "grad_norm": 1.5052491426467896, + "learning_rate": 6.593660254117104e-06, + "loss": 0.7223, + "step": 11795 + }, + { + "epoch": 3.8386467143786596, + "grad_norm": 1.423587679862976, + "learning_rate": 6.576205441001329e-06, + "loss": 0.7065, + "step": 11800 + }, + { + "epoch": 3.840273259596617, + "grad_norm": 1.4672945737838745, + "learning_rate": 6.558770262292968e-06, + "loss": 0.6886, + "step": 11805 + }, + { + "epoch": 3.841899804814574, + "grad_norm": 1.5300358533859253, + "learning_rate": 6.541354736572866e-06, + "loss": 0.7323, + "step": 11810 + }, + { + "epoch": 3.8435263500325307, + "grad_norm": 1.803086280822754, + "learning_rate": 6.523958882400893e-06, + "loss": 0.7569, + "step": 11815 + }, + { + "epoch": 3.845152895250488, + "grad_norm": 1.5379537343978882, + "learning_rate": 6.50658271831599e-06, + "loss": 0.703, + "step": 11820 + }, + { + "epoch": 3.846779440468445, + "grad_norm": 1.797277569770813, + "learning_rate": 6.4892262628360785e-06, + "loss": 0.7221, + "step": 11825 + }, + { + "epoch": 3.8484059856864024, + "grad_norm": 1.563128113746643, + "learning_rate": 6.471889534458098e-06, + "loss": 0.7123, + "step": 11830 + }, + { + "epoch": 3.8500325309043593, + "grad_norm": 1.9174617528915405, + "learning_rate": 6.454572551657962e-06, + "loss": 0.7001, + "step": 11835 + }, + { + "epoch": 3.851659076122316, + "grad_norm": 1.6336548328399658, + "learning_rate": 6.437275332890541e-06, + "loss": 0.7353, + "step": 11840 + }, + { + "epoch": 3.853285621340273, + "grad_norm": 1.5880738496780396, + "learning_rate": 6.419997896589633e-06, + "loss": 0.7202, + "step": 11845 + }, + { + "epoch": 3.8549121665582304, + "grad_norm": 1.8489093780517578, + "learning_rate": 6.402740261167969e-06, + "loss": 0.7291, + "step": 11850 + }, + { + "epoch": 3.8565387117761873, + "grad_norm": 1.558556318283081, + "learning_rate": 6.3855024450171604e-06, + "loss": 0.717, + "step": 11855 + }, + { + "epoch": 3.8581652569941447, + "grad_norm": 1.674322485923767, + "learning_rate": 6.36828446650771e-06, + "loss": 0.7045, + "step": 11860 + }, + { + "epoch": 3.8597918022121016, + "grad_norm": 1.4663958549499512, + "learning_rate": 6.351086343988976e-06, + "loss": 0.7032, + "step": 11865 + }, + { + "epoch": 3.8614183474300585, + "grad_norm": 1.6179548501968384, + "learning_rate": 6.333908095789162e-06, + "loss": 0.7078, + "step": 11870 + }, + { + "epoch": 3.8630448926480154, + "grad_norm": 1.6068236827850342, + "learning_rate": 6.316749740215283e-06, + "loss": 0.6999, + "step": 11875 + }, + { + "epoch": 3.864671437865973, + "grad_norm": 1.4371055364608765, + "learning_rate": 6.299611295553149e-06, + "loss": 0.7464, + "step": 11880 + }, + { + "epoch": 3.8662979830839297, + "grad_norm": 1.5478278398513794, + "learning_rate": 6.28249278006737e-06, + "loss": 0.7116, + "step": 11885 + }, + { + "epoch": 3.867924528301887, + "grad_norm": 1.5324428081512451, + "learning_rate": 6.265394212001291e-06, + "loss": 0.7075, + "step": 11890 + }, + { + "epoch": 3.869551073519844, + "grad_norm": 1.499799370765686, + "learning_rate": 6.248315609577021e-06, + "loss": 0.7046, + "step": 11895 + }, + { + "epoch": 3.871177618737801, + "grad_norm": 1.73545503616333, + "learning_rate": 6.231256990995385e-06, + "loss": 0.717, + "step": 11900 + }, + { + "epoch": 3.8728041639557578, + "grad_norm": 1.564713716506958, + "learning_rate": 6.214218374435915e-06, + "loss": 0.7652, + "step": 11905 + }, + { + "epoch": 3.874430709173715, + "grad_norm": 1.5255908966064453, + "learning_rate": 6.197199778056806e-06, + "loss": 0.7386, + "step": 11910 + }, + { + "epoch": 3.876057254391672, + "grad_norm": 1.5205202102661133, + "learning_rate": 6.1802012199949495e-06, + "loss": 0.7216, + "step": 11915 + }, + { + "epoch": 3.8776837996096294, + "grad_norm": 1.5822677612304688, + "learning_rate": 6.163222718365852e-06, + "loss": 0.7265, + "step": 11920 + }, + { + "epoch": 3.8793103448275863, + "grad_norm": 1.4400520324707031, + "learning_rate": 6.1462642912636645e-06, + "loss": 0.7047, + "step": 11925 + }, + { + "epoch": 3.880936890045543, + "grad_norm": 1.3026463985443115, + "learning_rate": 6.129325956761139e-06, + "loss": 0.7341, + "step": 11930 + }, + { + "epoch": 3.8825634352635, + "grad_norm": 1.372084379196167, + "learning_rate": 6.11240773290962e-06, + "loss": 0.6976, + "step": 11935 + }, + { + "epoch": 3.8841899804814575, + "grad_norm": 1.5613969564437866, + "learning_rate": 6.095509637739003e-06, + "loss": 0.6958, + "step": 11940 + }, + { + "epoch": 3.8858165256994144, + "grad_norm": 1.609868049621582, + "learning_rate": 6.078631689257755e-06, + "loss": 0.7089, + "step": 11945 + }, + { + "epoch": 3.8874430709173717, + "grad_norm": 1.5152796506881714, + "learning_rate": 6.06177390545285e-06, + "loss": 0.7226, + "step": 11950 + }, + { + "epoch": 3.8890696161353286, + "grad_norm": 1.6927896738052368, + "learning_rate": 6.044936304289786e-06, + "loss": 0.7152, + "step": 11955 + }, + { + "epoch": 3.8906961613532856, + "grad_norm": 1.4628791809082031, + "learning_rate": 6.028118903712554e-06, + "loss": 0.6842, + "step": 11960 + }, + { + "epoch": 3.8923227065712425, + "grad_norm": 1.5437496900558472, + "learning_rate": 6.011321721643617e-06, + "loss": 0.701, + "step": 11965 + }, + { + "epoch": 3.8939492517892, + "grad_norm": 1.3414350748062134, + "learning_rate": 5.994544775983874e-06, + "loss": 0.7195, + "step": 11970 + }, + { + "epoch": 3.8955757970071567, + "grad_norm": 1.5080596208572388, + "learning_rate": 5.9777880846126845e-06, + "loss": 0.7068, + "step": 11975 + }, + { + "epoch": 3.897202342225114, + "grad_norm": 1.7566637992858887, + "learning_rate": 5.96105166538779e-06, + "loss": 0.7136, + "step": 11980 + }, + { + "epoch": 3.898828887443071, + "grad_norm": 1.7713772058486938, + "learning_rate": 5.944335536145362e-06, + "loss": 0.7253, + "step": 11985 + }, + { + "epoch": 3.900455432661028, + "grad_norm": 1.6474518775939941, + "learning_rate": 5.927639714699926e-06, + "loss": 0.7461, + "step": 11990 + }, + { + "epoch": 3.902081977878985, + "grad_norm": 1.4776532649993896, + "learning_rate": 5.910964218844384e-06, + "loss": 0.6956, + "step": 11995 + }, + { + "epoch": 3.903708523096942, + "grad_norm": 1.4372047185897827, + "learning_rate": 5.894309066349946e-06, + "loss": 0.6923, + "step": 12000 + }, + { + "epoch": 3.905335068314899, + "grad_norm": 1.8111002445220947, + "learning_rate": 5.877674274966174e-06, + "loss": 0.7254, + "step": 12005 + }, + { + "epoch": 3.9069616135328564, + "grad_norm": 1.4235066175460815, + "learning_rate": 5.861059862420925e-06, + "loss": 0.6995, + "step": 12010 + }, + { + "epoch": 3.9085881587508133, + "grad_norm": 1.3693865537643433, + "learning_rate": 5.8444658464203135e-06, + "loss": 0.7257, + "step": 12015 + }, + { + "epoch": 3.9102147039687702, + "grad_norm": 1.5525527000427246, + "learning_rate": 5.827892244648744e-06, + "loss": 0.7, + "step": 12020 + }, + { + "epoch": 3.911841249186727, + "grad_norm": 1.5768146514892578, + "learning_rate": 5.811339074768857e-06, + "loss": 0.7166, + "step": 12025 + }, + { + "epoch": 3.9134677944046845, + "grad_norm": 1.7078166007995605, + "learning_rate": 5.794806354421525e-06, + "loss": 0.7426, + "step": 12030 + }, + { + "epoch": 3.9150943396226414, + "grad_norm": 1.4564528465270996, + "learning_rate": 5.778294101225806e-06, + "loss": 0.6962, + "step": 12035 + }, + { + "epoch": 3.9167208848405988, + "grad_norm": 1.7343370914459229, + "learning_rate": 5.761802332778973e-06, + "loss": 0.7143, + "step": 12040 + }, + { + "epoch": 3.9183474300585557, + "grad_norm": 1.4146348237991333, + "learning_rate": 5.745331066656443e-06, + "loss": 0.7127, + "step": 12045 + }, + { + "epoch": 3.9199739752765126, + "grad_norm": 1.3861075639724731, + "learning_rate": 5.7288803204118e-06, + "loss": 0.6876, + "step": 12050 + }, + { + "epoch": 3.9216005204944695, + "grad_norm": 1.3959790468215942, + "learning_rate": 5.712450111576762e-06, + "loss": 0.7163, + "step": 12055 + }, + { + "epoch": 3.923227065712427, + "grad_norm": 1.6086056232452393, + "learning_rate": 5.696040457661153e-06, + "loss": 0.689, + "step": 12060 + }, + { + "epoch": 3.9248536109303838, + "grad_norm": 1.4251943826675415, + "learning_rate": 5.679651376152883e-06, + "loss": 0.6937, + "step": 12065 + }, + { + "epoch": 3.926480156148341, + "grad_norm": 1.4546692371368408, + "learning_rate": 5.663282884517962e-06, + "loss": 0.675, + "step": 12070 + }, + { + "epoch": 3.928106701366298, + "grad_norm": 1.5735273361206055, + "learning_rate": 5.646935000200423e-06, + "loss": 0.6852, + "step": 12075 + }, + { + "epoch": 3.929733246584255, + "grad_norm": 1.6226511001586914, + "learning_rate": 5.630607740622368e-06, + "loss": 0.6968, + "step": 12080 + }, + { + "epoch": 3.931359791802212, + "grad_norm": 1.5431448221206665, + "learning_rate": 5.6143011231839075e-06, + "loss": 0.6906, + "step": 12085 + }, + { + "epoch": 3.932986337020169, + "grad_norm": 1.466394066810608, + "learning_rate": 5.59801516526316e-06, + "loss": 0.6715, + "step": 12090 + }, + { + "epoch": 3.934612882238126, + "grad_norm": 1.6077519655227661, + "learning_rate": 5.5817498842162055e-06, + "loss": 0.7011, + "step": 12095 + }, + { + "epoch": 3.9362394274560835, + "grad_norm": 1.498256802558899, + "learning_rate": 5.565505297377119e-06, + "loss": 0.7002, + "step": 12100 + }, + { + "epoch": 3.9378659726740404, + "grad_norm": 1.627827763557434, + "learning_rate": 5.549281422057892e-06, + "loss": 0.6984, + "step": 12105 + }, + { + "epoch": 3.9394925178919973, + "grad_norm": 1.4427131414413452, + "learning_rate": 5.533078275548473e-06, + "loss": 0.6891, + "step": 12110 + }, + { + "epoch": 3.941119063109954, + "grad_norm": 1.510719895362854, + "learning_rate": 5.516895875116681e-06, + "loss": 0.7155, + "step": 12115 + }, + { + "epoch": 3.9427456083279115, + "grad_norm": 1.4542162418365479, + "learning_rate": 5.5007342380082785e-06, + "loss": 0.7184, + "step": 12120 + }, + { + "epoch": 3.9443721535458685, + "grad_norm": 1.451937198638916, + "learning_rate": 5.484593381446851e-06, + "loss": 0.7488, + "step": 12125 + }, + { + "epoch": 3.945998698763826, + "grad_norm": 1.4829756021499634, + "learning_rate": 5.4684733226338685e-06, + "loss": 0.7235, + "step": 12130 + }, + { + "epoch": 3.9476252439817827, + "grad_norm": 1.4456208944320679, + "learning_rate": 5.452374078748615e-06, + "loss": 0.6865, + "step": 12135 + }, + { + "epoch": 3.9492517891997396, + "grad_norm": 1.521122932434082, + "learning_rate": 5.43629566694821e-06, + "loss": 0.7179, + "step": 12140 + }, + { + "epoch": 3.9508783344176965, + "grad_norm": 1.46628737449646, + "learning_rate": 5.420238104367562e-06, + "loss": 0.6759, + "step": 12145 + }, + { + "epoch": 3.952504879635654, + "grad_norm": 1.6070935726165771, + "learning_rate": 5.404201408119366e-06, + "loss": 0.702, + "step": 12150 + }, + { + "epoch": 3.954131424853611, + "grad_norm": 1.6403086185455322, + "learning_rate": 5.388185595294082e-06, + "loss": 0.7103, + "step": 12155 + }, + { + "epoch": 3.955757970071568, + "grad_norm": 1.6057466268539429, + "learning_rate": 5.372190682959896e-06, + "loss": 0.6918, + "step": 12160 + }, + { + "epoch": 3.957384515289525, + "grad_norm": 1.5387924909591675, + "learning_rate": 5.356216688162749e-06, + "loss": 0.7277, + "step": 12165 + }, + { + "epoch": 3.959011060507482, + "grad_norm": 1.5061837434768677, + "learning_rate": 5.340263627926257e-06, + "loss": 0.7221, + "step": 12170 + }, + { + "epoch": 3.9606376057254393, + "grad_norm": 1.4618033170700073, + "learning_rate": 5.32433151925176e-06, + "loss": 0.7281, + "step": 12175 + }, + { + "epoch": 3.9622641509433962, + "grad_norm": 1.704401969909668, + "learning_rate": 5.308420379118228e-06, + "loss": 0.6974, + "step": 12180 + }, + { + "epoch": 3.963890696161353, + "grad_norm": 1.5502952337265015, + "learning_rate": 5.292530224482345e-06, + "loss": 0.7088, + "step": 12185 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 1.6927815675735474, + "learning_rate": 5.276661072278363e-06, + "loss": 0.6933, + "step": 12190 + }, + { + "epoch": 3.9671437865972674, + "grad_norm": 1.5949292182922363, + "learning_rate": 5.260812939418208e-06, + "loss": 0.7141, + "step": 12195 + }, + { + "epoch": 3.9687703318152243, + "grad_norm": 1.5294796228408813, + "learning_rate": 5.2449858427913616e-06, + "loss": 0.6958, + "step": 12200 + }, + { + "epoch": 3.9703968770331817, + "grad_norm": 1.7828904390335083, + "learning_rate": 5.229179799264919e-06, + "loss": 0.7042, + "step": 12205 + }, + { + "epoch": 3.9720234222511386, + "grad_norm": 1.4922099113464355, + "learning_rate": 5.213394825683518e-06, + "loss": 0.728, + "step": 12210 + }, + { + "epoch": 3.9736499674690955, + "grad_norm": 1.5130531787872314, + "learning_rate": 5.1976309388693525e-06, + "loss": 0.7363, + "step": 12215 + }, + { + "epoch": 3.975276512687053, + "grad_norm": 1.5120097398757935, + "learning_rate": 5.181888155622141e-06, + "loss": 0.7018, + "step": 12220 + }, + { + "epoch": 3.9769030579050098, + "grad_norm": 1.7935912609100342, + "learning_rate": 5.166166492719124e-06, + "loss": 0.7087, + "step": 12225 + }, + { + "epoch": 3.9785296031229667, + "grad_norm": 1.5667980909347534, + "learning_rate": 5.150465966915005e-06, + "loss": 0.6957, + "step": 12230 + }, + { + "epoch": 3.980156148340924, + "grad_norm": 1.5784883499145508, + "learning_rate": 5.134786594941995e-06, + "loss": 0.71, + "step": 12235 + }, + { + "epoch": 3.981782693558881, + "grad_norm": 1.3857823610305786, + "learning_rate": 5.119128393509728e-06, + "loss": 0.6908, + "step": 12240 + }, + { + "epoch": 3.983409238776838, + "grad_norm": 1.8749308586120605, + "learning_rate": 5.1066170863677375e-06, + "loss": 0.6745, + "step": 12245 + }, + { + "epoch": 3.985035783994795, + "grad_norm": 1.6782424449920654, + "learning_rate": 5.090997033945124e-06, + "loss": 0.7409, + "step": 12250 + }, + { + "epoch": 3.986662329212752, + "grad_norm": 1.6530330181121826, + "learning_rate": 5.075398198730194e-06, + "loss": 0.7031, + "step": 12255 + }, + { + "epoch": 3.988288874430709, + "grad_norm": 1.542108178138733, + "learning_rate": 5.059820597346784e-06, + "loss": 0.6974, + "step": 12260 + }, + { + "epoch": 3.9899154196486664, + "grad_norm": 1.5486468076705933, + "learning_rate": 5.044264246396071e-06, + "loss": 0.7215, + "step": 12265 + }, + { + "epoch": 3.9915419648666233, + "grad_norm": 1.5157986879348755, + "learning_rate": 5.02872916245661e-06, + "loss": 0.7054, + "step": 12270 + }, + { + "epoch": 3.9931685100845806, + "grad_norm": 1.6133856773376465, + "learning_rate": 5.013215362084283e-06, + "loss": 0.7146, + "step": 12275 + }, + { + "epoch": 3.9947950553025375, + "grad_norm": 1.4904115200042725, + "learning_rate": 4.9977228618122905e-06, + "loss": 0.7032, + "step": 12280 + }, + { + "epoch": 3.9964216005204944, + "grad_norm": 1.3834036588668823, + "learning_rate": 4.982251678151137e-06, + "loss": 0.7094, + "step": 12285 + }, + { + "epoch": 3.9980481457384514, + "grad_norm": 1.5233303308486938, + "learning_rate": 4.966801827588591e-06, + "loss": 0.7222, + "step": 12290 + }, + { + "epoch": 3.9996746909564087, + "grad_norm": 1.5705431699752808, + "learning_rate": 4.95137332658972e-06, + "loss": 0.7263, + "step": 12295 + }, + { + "epoch": 4.0, + "eval_f1": 0.8167605279211759, + "eval_loss": 0.426025390625, + "eval_precision": 0.8178058937030933, + "eval_recall": 0.8160241347563627, + "eval_runtime": 387.9471, + "eval_samples_per_second": 1014.146, + "eval_steps_per_second": 1.982, + "step": 12296 + }, + { + "epoch": 4.001301236174366, + "grad_norm": 1.7015174627304077, + "learning_rate": 4.9359661915968e-06, + "loss": 0.6928, + "step": 12300 + }, + { + "epoch": 4.002927781392323, + "grad_norm": 1.6386970281600952, + "learning_rate": 4.920580439029366e-06, + "loss": 0.6909, + "step": 12305 + }, + { + "epoch": 4.00455432661028, + "grad_norm": 1.5570523738861084, + "learning_rate": 4.905216085284156e-06, + "loss": 0.6442, + "step": 12310 + }, + { + "epoch": 4.006180871828237, + "grad_norm": 1.861940622329712, + "learning_rate": 4.8898731467351104e-06, + "loss": 0.6816, + "step": 12315 + }, + { + "epoch": 4.007807417046194, + "grad_norm": 1.485962152481079, + "learning_rate": 4.874551639733324e-06, + "loss": 0.6891, + "step": 12320 + }, + { + "epoch": 4.009433962264151, + "grad_norm": 1.7311413288116455, + "learning_rate": 4.859251580607082e-06, + "loss": 0.664, + "step": 12325 + }, + { + "epoch": 4.011060507482108, + "grad_norm": 1.646789789199829, + "learning_rate": 4.843972985661788e-06, + "loss": 0.686, + "step": 12330 + }, + { + "epoch": 4.012687052700065, + "grad_norm": 1.6874957084655762, + "learning_rate": 4.828715871179984e-06, + "loss": 0.6852, + "step": 12335 + }, + { + "epoch": 4.014313597918022, + "grad_norm": 1.880491018295288, + "learning_rate": 4.813480253421321e-06, + "loss": 0.6793, + "step": 12340 + }, + { + "epoch": 4.015940143135979, + "grad_norm": 1.5796757936477661, + "learning_rate": 4.79826614862254e-06, + "loss": 0.6662, + "step": 12345 + }, + { + "epoch": 4.017566688353936, + "grad_norm": 1.7009522914886475, + "learning_rate": 4.78307357299744e-06, + "loss": 0.7018, + "step": 12350 + }, + { + "epoch": 4.019193233571893, + "grad_norm": 1.6670128107070923, + "learning_rate": 4.767902542736905e-06, + "loss": 0.6838, + "step": 12355 + }, + { + "epoch": 4.020819778789851, + "grad_norm": 1.648879885673523, + "learning_rate": 4.7527530740088255e-06, + "loss": 0.677, + "step": 12360 + }, + { + "epoch": 4.022446324007808, + "grad_norm": 1.5904170274734497, + "learning_rate": 4.7376251829581385e-06, + "loss": 0.6683, + "step": 12365 + }, + { + "epoch": 4.024072869225765, + "grad_norm": 1.6361606121063232, + "learning_rate": 4.722518885706773e-06, + "loss": 0.6888, + "step": 12370 + }, + { + "epoch": 4.0256994144437215, + "grad_norm": 1.4905328750610352, + "learning_rate": 4.707434198353658e-06, + "loss": 0.6693, + "step": 12375 + }, + { + "epoch": 4.027325959661678, + "grad_norm": 1.4023780822753906, + "learning_rate": 4.692371136974671e-06, + "loss": 0.668, + "step": 12380 + }, + { + "epoch": 4.028952504879635, + "grad_norm": 1.4039663076400757, + "learning_rate": 4.677329717622667e-06, + "loss": 0.6866, + "step": 12385 + }, + { + "epoch": 4.030579050097593, + "grad_norm": 1.559519648551941, + "learning_rate": 4.6623099563274125e-06, + "loss": 0.6819, + "step": 12390 + }, + { + "epoch": 4.03220559531555, + "grad_norm": 1.4499599933624268, + "learning_rate": 4.647311869095613e-06, + "loss": 0.6688, + "step": 12395 + }, + { + "epoch": 4.033832140533507, + "grad_norm": 1.6000298261642456, + "learning_rate": 4.632335471910867e-06, + "loss": 0.679, + "step": 12400 + }, + { + "epoch": 4.035458685751464, + "grad_norm": 1.6708033084869385, + "learning_rate": 4.617380780733665e-06, + "loss": 0.7138, + "step": 12405 + }, + { + "epoch": 4.037085230969421, + "grad_norm": 1.495922327041626, + "learning_rate": 4.602447811501345e-06, + "loss": 0.6859, + "step": 12410 + }, + { + "epoch": 4.038711776187378, + "grad_norm": 1.5014092922210693, + "learning_rate": 4.587536580128121e-06, + "loss": 0.6644, + "step": 12415 + }, + { + "epoch": 4.040338321405335, + "grad_norm": 1.7722117900848389, + "learning_rate": 4.572647102505029e-06, + "loss": 0.6997, + "step": 12420 + }, + { + "epoch": 4.041964866623292, + "grad_norm": 1.671643853187561, + "learning_rate": 4.557779394499914e-06, + "loss": 0.6805, + "step": 12425 + }, + { + "epoch": 4.043591411841249, + "grad_norm": 1.5817434787750244, + "learning_rate": 4.542933471957436e-06, + "loss": 0.696, + "step": 12430 + }, + { + "epoch": 4.045217957059206, + "grad_norm": 1.5921458005905151, + "learning_rate": 4.528109350699028e-06, + "loss": 0.6538, + "step": 12435 + }, + { + "epoch": 4.046844502277163, + "grad_norm": 1.6459001302719116, + "learning_rate": 4.513307046522902e-06, + "loss": 0.7125, + "step": 12440 + }, + { + "epoch": 4.04847104749512, + "grad_norm": 2.0896899700164795, + "learning_rate": 4.498526575203996e-06, + "loss": 0.6705, + "step": 12445 + }, + { + "epoch": 4.050097592713078, + "grad_norm": 1.6820528507232666, + "learning_rate": 4.483767952494008e-06, + "loss": 0.6886, + "step": 12450 + }, + { + "epoch": 4.051724137931035, + "grad_norm": 1.5964027643203735, + "learning_rate": 4.469031194121323e-06, + "loss": 0.6726, + "step": 12455 + }, + { + "epoch": 4.053350683148992, + "grad_norm": 1.8637484312057495, + "learning_rate": 4.4543163157910465e-06, + "loss": 0.6888, + "step": 12460 + }, + { + "epoch": 4.0549772283669485, + "grad_norm": 1.617895245552063, + "learning_rate": 4.439623333184961e-06, + "loss": 0.7402, + "step": 12465 + }, + { + "epoch": 4.056603773584905, + "grad_norm": 1.5728648900985718, + "learning_rate": 4.424952261961521e-06, + "loss": 0.6766, + "step": 12470 + }, + { + "epoch": 4.058230318802862, + "grad_norm": 1.448233723640442, + "learning_rate": 4.410303117755809e-06, + "loss": 0.6576, + "step": 12475 + }, + { + "epoch": 4.05985686402082, + "grad_norm": 1.7988523244857788, + "learning_rate": 4.395675916179562e-06, + "loss": 0.6662, + "step": 12480 + }, + { + "epoch": 4.061483409238777, + "grad_norm": 1.5474234819412231, + "learning_rate": 4.3810706728211135e-06, + "loss": 0.6625, + "step": 12485 + }, + { + "epoch": 4.063109954456734, + "grad_norm": 1.4681956768035889, + "learning_rate": 4.3664874032454115e-06, + "loss": 0.6412, + "step": 12490 + }, + { + "epoch": 4.064736499674691, + "grad_norm": 1.8372201919555664, + "learning_rate": 4.3519261229939825e-06, + "loss": 0.702, + "step": 12495 + }, + { + "epoch": 4.066363044892648, + "grad_norm": 1.63862943649292, + "learning_rate": 4.337386847584921e-06, + "loss": 0.6799, + "step": 12500 + }, + { + "epoch": 4.067989590110605, + "grad_norm": 1.62037193775177, + "learning_rate": 4.322869592512854e-06, + "loss": 0.6988, + "step": 12505 + }, + { + "epoch": 4.0696161353285625, + "grad_norm": 1.5762559175491333, + "learning_rate": 4.308374373248966e-06, + "loss": 0.6541, + "step": 12510 + }, + { + "epoch": 4.071242680546519, + "grad_norm": 1.5794283151626587, + "learning_rate": 4.293901205240936e-06, + "loss": 0.6667, + "step": 12515 + }, + { + "epoch": 4.072869225764476, + "grad_norm": 1.7357828617095947, + "learning_rate": 4.27945010391296e-06, + "loss": 0.6567, + "step": 12520 + }, + { + "epoch": 4.074495770982433, + "grad_norm": 1.5217994451522827, + "learning_rate": 4.265021084665696e-06, + "loss": 0.6766, + "step": 12525 + }, + { + "epoch": 4.07612231620039, + "grad_norm": 1.6336015462875366, + "learning_rate": 4.250614162876304e-06, + "loss": 0.6696, + "step": 12530 + }, + { + "epoch": 4.077748861418347, + "grad_norm": 1.6641258001327515, + "learning_rate": 4.2362293538983575e-06, + "loss": 0.6812, + "step": 12535 + }, + { + "epoch": 4.079375406636305, + "grad_norm": 1.5502071380615234, + "learning_rate": 4.221866673061889e-06, + "loss": 0.6646, + "step": 12540 + }, + { + "epoch": 4.081001951854262, + "grad_norm": 1.5271575450897217, + "learning_rate": 4.207526135673329e-06, + "loss": 0.6598, + "step": 12545 + }, + { + "epoch": 4.082628497072219, + "grad_norm": 1.5592507123947144, + "learning_rate": 4.193207757015527e-06, + "loss": 0.6755, + "step": 12550 + }, + { + "epoch": 4.0842550422901756, + "grad_norm": 1.4440104961395264, + "learning_rate": 4.178911552347714e-06, + "loss": 0.676, + "step": 12555 + }, + { + "epoch": 4.0858815875081325, + "grad_norm": 1.7511436939239502, + "learning_rate": 4.164637536905472e-06, + "loss": 0.6758, + "step": 12560 + }, + { + "epoch": 4.087508132726089, + "grad_norm": 1.8090898990631104, + "learning_rate": 4.150385725900774e-06, + "loss": 0.7302, + "step": 12565 + }, + { + "epoch": 4.089134677944047, + "grad_norm": 1.6882518529891968, + "learning_rate": 4.136156134521887e-06, + "loss": 0.6903, + "step": 12570 + }, + { + "epoch": 4.090761223162004, + "grad_norm": 1.533145785331726, + "learning_rate": 4.121948777933432e-06, + "loss": 0.6957, + "step": 12575 + }, + { + "epoch": 4.092387768379961, + "grad_norm": 1.574063777923584, + "learning_rate": 4.107763671276305e-06, + "loss": 0.6796, + "step": 12580 + }, + { + "epoch": 4.094014313597918, + "grad_norm": 1.4989714622497559, + "learning_rate": 4.093600829667718e-06, + "loss": 0.6977, + "step": 12585 + }, + { + "epoch": 4.095640858815875, + "grad_norm": 1.5422461032867432, + "learning_rate": 4.079460268201127e-06, + "loss": 0.6912, + "step": 12590 + }, + { + "epoch": 4.097267404033832, + "grad_norm": 1.477959156036377, + "learning_rate": 4.065342001946268e-06, + "loss": 0.6517, + "step": 12595 + }, + { + "epoch": 4.0988939492517895, + "grad_norm": 1.6347063779830933, + "learning_rate": 4.051246045949106e-06, + "loss": 0.6792, + "step": 12600 + }, + { + "epoch": 4.100520494469746, + "grad_norm": 1.7770346403121948, + "learning_rate": 4.037172415231838e-06, + "loss": 0.694, + "step": 12605 + }, + { + "epoch": 4.102147039687703, + "grad_norm": 1.654436469078064, + "learning_rate": 4.023121124792847e-06, + "loss": 0.6822, + "step": 12610 + }, + { + "epoch": 4.10377358490566, + "grad_norm": 1.5574926137924194, + "learning_rate": 4.009092189606739e-06, + "loss": 0.6852, + "step": 12615 + }, + { + "epoch": 4.105400130123617, + "grad_norm": 1.5083457231521606, + "learning_rate": 3.995085624624262e-06, + "loss": 0.6528, + "step": 12620 + }, + { + "epoch": 4.107026675341574, + "grad_norm": 1.529775857925415, + "learning_rate": 3.9811014447723505e-06, + "loss": 0.6692, + "step": 12625 + }, + { + "epoch": 4.108653220559532, + "grad_norm": 1.7209826707839966, + "learning_rate": 3.9671396649540755e-06, + "loss": 0.7088, + "step": 12630 + }, + { + "epoch": 4.110279765777489, + "grad_norm": 1.4956921339035034, + "learning_rate": 3.9532003000486344e-06, + "loss": 0.6697, + "step": 12635 + }, + { + "epoch": 4.111906310995446, + "grad_norm": 1.6645127534866333, + "learning_rate": 3.939283364911331e-06, + "loss": 0.6959, + "step": 12640 + }, + { + "epoch": 4.113532856213403, + "grad_norm": 1.7171512842178345, + "learning_rate": 3.925388874373579e-06, + "loss": 0.6751, + "step": 12645 + }, + { + "epoch": 4.1151594014313595, + "grad_norm": 1.639894723892212, + "learning_rate": 3.911516843242852e-06, + "loss": 0.6951, + "step": 12650 + }, + { + "epoch": 4.116785946649317, + "grad_norm": 1.4588055610656738, + "learning_rate": 3.897667286302711e-06, + "loss": 0.6718, + "step": 12655 + }, + { + "epoch": 4.118412491867274, + "grad_norm": 1.5775295495986938, + "learning_rate": 3.883840218312757e-06, + "loss": 0.7049, + "step": 12660 + }, + { + "epoch": 4.120039037085231, + "grad_norm": 1.58664071559906, + "learning_rate": 3.870035654008625e-06, + "loss": 0.6862, + "step": 12665 + }, + { + "epoch": 4.121665582303188, + "grad_norm": 1.6246001720428467, + "learning_rate": 3.856253608101957e-06, + "loss": 0.6578, + "step": 12670 + }, + { + "epoch": 4.123292127521145, + "grad_norm": 1.4375513792037964, + "learning_rate": 3.8424940952804165e-06, + "loss": 0.6937, + "step": 12675 + }, + { + "epoch": 4.124918672739102, + "grad_norm": 1.8211374282836914, + "learning_rate": 3.828757130207633e-06, + "loss": 0.6993, + "step": 12680 + }, + { + "epoch": 4.126545217957059, + "grad_norm": 1.3433880805969238, + "learning_rate": 3.815042727523221e-06, + "loss": 0.6557, + "step": 12685 + }, + { + "epoch": 4.1281717631750166, + "grad_norm": 1.612414002418518, + "learning_rate": 3.8013509018427455e-06, + "loss": 0.7026, + "step": 12690 + }, + { + "epoch": 4.1297983083929735, + "grad_norm": 1.5452624559402466, + "learning_rate": 3.7876816677577144e-06, + "loss": 0.6676, + "step": 12695 + }, + { + "epoch": 4.13142485361093, + "grad_norm": 1.7117056846618652, + "learning_rate": 3.7740350398355577e-06, + "loss": 0.6686, + "step": 12700 + }, + { + "epoch": 4.133051398828887, + "grad_norm": 1.6139566898345947, + "learning_rate": 3.760411032619604e-06, + "loss": 0.6728, + "step": 12705 + }, + { + "epoch": 4.134677944046844, + "grad_norm": 1.7520240545272827, + "learning_rate": 3.746809660629094e-06, + "loss": 0.6787, + "step": 12710 + }, + { + "epoch": 4.136304489264802, + "grad_norm": 1.6776567697525024, + "learning_rate": 3.7332309383591224e-06, + "loss": 0.6762, + "step": 12715 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.7078745365142822, + "learning_rate": 3.719674880280663e-06, + "loss": 0.675, + "step": 12720 + }, + { + "epoch": 4.139557579700716, + "grad_norm": 1.7358143329620361, + "learning_rate": 3.7061415008405347e-06, + "loss": 0.6843, + "step": 12725 + }, + { + "epoch": 4.141184124918673, + "grad_norm": 1.8416589498519897, + "learning_rate": 3.6926308144613896e-06, + "loss": 0.6814, + "step": 12730 + }, + { + "epoch": 4.14281067013663, + "grad_norm": 1.640164852142334, + "learning_rate": 3.67914283554168e-06, + "loss": 0.67, + "step": 12735 + }, + { + "epoch": 4.1444372153545865, + "grad_norm": 1.4602364301681519, + "learning_rate": 3.665677578455676e-06, + "loss": 0.6582, + "step": 12740 + }, + { + "epoch": 4.146063760572544, + "grad_norm": 1.843189001083374, + "learning_rate": 3.6522350575534187e-06, + "loss": 0.6871, + "step": 12745 + }, + { + "epoch": 4.147690305790501, + "grad_norm": 1.5945450067520142, + "learning_rate": 3.6388152871607324e-06, + "loss": 0.6978, + "step": 12750 + }, + { + "epoch": 4.149316851008458, + "grad_norm": 1.604054570198059, + "learning_rate": 3.625418281579185e-06, + "loss": 0.6918, + "step": 12755 + }, + { + "epoch": 4.150943396226415, + "grad_norm": 1.4195245504379272, + "learning_rate": 3.6120440550861e-06, + "loss": 0.7106, + "step": 12760 + }, + { + "epoch": 4.152569941444372, + "grad_norm": 1.5902831554412842, + "learning_rate": 3.598692621934502e-06, + "loss": 0.6659, + "step": 12765 + }, + { + "epoch": 4.154196486662329, + "grad_norm": 1.5757604837417603, + "learning_rate": 3.5853639963531444e-06, + "loss": 0.6886, + "step": 12770 + }, + { + "epoch": 4.155823031880287, + "grad_norm": 1.5209938287734985, + "learning_rate": 3.572058192546454e-06, + "loss": 0.6813, + "step": 12775 + }, + { + "epoch": 4.157449577098244, + "grad_norm": 1.7000783681869507, + "learning_rate": 3.5587752246945577e-06, + "loss": 0.6629, + "step": 12780 + }, + { + "epoch": 4.1590761223162005, + "grad_norm": 1.6796540021896362, + "learning_rate": 3.5455151069532345e-06, + "loss": 0.6973, + "step": 12785 + }, + { + "epoch": 4.160702667534157, + "grad_norm": 1.5791364908218384, + "learning_rate": 3.532277853453922e-06, + "loss": 0.6699, + "step": 12790 + }, + { + "epoch": 4.162329212752114, + "grad_norm": 1.551390290260315, + "learning_rate": 3.519063478303669e-06, + "loss": 0.712, + "step": 12795 + }, + { + "epoch": 4.163955757970071, + "grad_norm": 1.550437092781067, + "learning_rate": 3.5058719955851667e-06, + "loss": 0.6854, + "step": 12800 + }, + { + "epoch": 4.165582303188029, + "grad_norm": 1.6224336624145508, + "learning_rate": 3.492703419356694e-06, + "loss": 0.6666, + "step": 12805 + }, + { + "epoch": 4.167208848405986, + "grad_norm": 1.5481473207473755, + "learning_rate": 3.4795577636521246e-06, + "loss": 0.7026, + "step": 12810 + }, + { + "epoch": 4.168835393623943, + "grad_norm": 1.5013996362686157, + "learning_rate": 3.466435042480903e-06, + "loss": 0.6982, + "step": 12815 + }, + { + "epoch": 4.1704619388419, + "grad_norm": 1.5081018209457397, + "learning_rate": 3.4533352698280424e-06, + "loss": 0.6961, + "step": 12820 + }, + { + "epoch": 4.172088484059857, + "grad_norm": 1.5642473697662354, + "learning_rate": 3.4402584596540887e-06, + "loss": 0.6906, + "step": 12825 + }, + { + "epoch": 4.173715029277814, + "grad_norm": 1.4824109077453613, + "learning_rate": 3.4272046258951086e-06, + "loss": 0.6728, + "step": 12830 + }, + { + "epoch": 4.175341574495771, + "grad_norm": 1.471656084060669, + "learning_rate": 3.414173782462707e-06, + "loss": 0.7055, + "step": 12835 + }, + { + "epoch": 4.176968119713728, + "grad_norm": 1.5673131942749023, + "learning_rate": 3.401165943243964e-06, + "loss": 0.6661, + "step": 12840 + }, + { + "epoch": 4.178594664931685, + "grad_norm": 1.6731101274490356, + "learning_rate": 3.3881811221014525e-06, + "loss": 0.6797, + "step": 12845 + }, + { + "epoch": 4.180221210149642, + "grad_norm": 1.9920939207077026, + "learning_rate": 3.3752193328732225e-06, + "loss": 0.6783, + "step": 12850 + }, + { + "epoch": 4.181847755367599, + "grad_norm": 1.5620946884155273, + "learning_rate": 3.362280589372774e-06, + "loss": 0.6647, + "step": 12855 + }, + { + "epoch": 4.183474300585556, + "grad_norm": 1.5220149755477905, + "learning_rate": 3.3493649053890326e-06, + "loss": 0.6741, + "step": 12860 + }, + { + "epoch": 4.185100845803514, + "grad_norm": 1.7352159023284912, + "learning_rate": 3.336472294686377e-06, + "loss": 0.6754, + "step": 12865 + }, + { + "epoch": 4.186727391021471, + "grad_norm": 1.6692588329315186, + "learning_rate": 3.323602771004569e-06, + "loss": 0.6874, + "step": 12870 + }, + { + "epoch": 4.1883539362394275, + "grad_norm": 1.8202842473983765, + "learning_rate": 3.31075634805878e-06, + "loss": 0.6784, + "step": 12875 + }, + { + "epoch": 4.189980481457384, + "grad_norm": 1.427772045135498, + "learning_rate": 3.2979330395395662e-06, + "loss": 0.6722, + "step": 12880 + }, + { + "epoch": 4.191607026675341, + "grad_norm": 1.79413640499115, + "learning_rate": 3.2851328591128493e-06, + "loss": 0.6751, + "step": 12885 + }, + { + "epoch": 4.193233571893298, + "grad_norm": 1.6928439140319824, + "learning_rate": 3.2723558204198866e-06, + "loss": 0.6955, + "step": 12890 + }, + { + "epoch": 4.194860117111256, + "grad_norm": 1.5578070878982544, + "learning_rate": 3.2596019370773e-06, + "loss": 0.6869, + "step": 12895 + }, + { + "epoch": 4.196486662329213, + "grad_norm": 1.6566860675811768, + "learning_rate": 3.246871222677006e-06, + "loss": 0.6877, + "step": 12900 + }, + { + "epoch": 4.19811320754717, + "grad_norm": 1.5405158996582031, + "learning_rate": 3.234163690786257e-06, + "loss": 0.7, + "step": 12905 + }, + { + "epoch": 4.199739752765127, + "grad_norm": 1.5821462869644165, + "learning_rate": 3.221479354947568e-06, + "loss": 0.6885, + "step": 12910 + }, + { + "epoch": 4.201366297983084, + "grad_norm": 1.687434434890747, + "learning_rate": 3.208818228678778e-06, + "loss": 0.6917, + "step": 12915 + }, + { + "epoch": 4.202992843201041, + "grad_norm": 1.7101229429244995, + "learning_rate": 3.1961803254729473e-06, + "loss": 0.6789, + "step": 12920 + }, + { + "epoch": 4.204619388418998, + "grad_norm": 1.6556411981582642, + "learning_rate": 3.183565658798418e-06, + "loss": 0.699, + "step": 12925 + }, + { + "epoch": 4.206245933636955, + "grad_norm": 1.503229022026062, + "learning_rate": 3.1709742420987426e-06, + "loss": 0.6705, + "step": 12930 + }, + { + "epoch": 4.207872478854912, + "grad_norm": 1.6503989696502686, + "learning_rate": 3.158406088792723e-06, + "loss": 0.6771, + "step": 12935 + }, + { + "epoch": 4.209499024072869, + "grad_norm": 1.6764299869537354, + "learning_rate": 3.1458612122743452e-06, + "loss": 0.6784, + "step": 12940 + }, + { + "epoch": 4.211125569290826, + "grad_norm": 1.7612955570220947, + "learning_rate": 3.133339625912804e-06, + "loss": 0.6811, + "step": 12945 + }, + { + "epoch": 4.212752114508783, + "grad_norm": 1.9285688400268555, + "learning_rate": 3.120841343052469e-06, + "loss": 0.6687, + "step": 12950 + }, + { + "epoch": 4.214378659726741, + "grad_norm": 1.5360966920852661, + "learning_rate": 3.108366377012875e-06, + "loss": 0.6783, + "step": 12955 + }, + { + "epoch": 4.216005204944698, + "grad_norm": 1.5546157360076904, + "learning_rate": 3.095914741088715e-06, + "loss": 0.6649, + "step": 12960 + }, + { + "epoch": 4.217631750162655, + "grad_norm": 1.6157512664794922, + "learning_rate": 3.083486448549802e-06, + "loss": 0.6477, + "step": 12965 + }, + { + "epoch": 4.2192582953806115, + "grad_norm": 1.4710103273391724, + "learning_rate": 3.0710815126410887e-06, + "loss": 0.7134, + "step": 12970 + }, + { + "epoch": 4.220884840598568, + "grad_norm": 1.7620134353637695, + "learning_rate": 3.058699946582619e-06, + "loss": 0.6677, + "step": 12975 + }, + { + "epoch": 4.222511385816525, + "grad_norm": 1.7053090333938599, + "learning_rate": 3.046341763569557e-06, + "loss": 0.6647, + "step": 12980 + }, + { + "epoch": 4.224137931034483, + "grad_norm": 1.6642110347747803, + "learning_rate": 3.0340069767721192e-06, + "loss": 0.6872, + "step": 12985 + }, + { + "epoch": 4.22576447625244, + "grad_norm": 1.7863682508468628, + "learning_rate": 3.021695599335611e-06, + "loss": 0.6627, + "step": 12990 + }, + { + "epoch": 4.227391021470397, + "grad_norm": 1.7297356128692627, + "learning_rate": 3.0094076443803713e-06, + "loss": 0.6604, + "step": 12995 + }, + { + "epoch": 4.229017566688354, + "grad_norm": 1.6169915199279785, + "learning_rate": 2.997143125001797e-06, + "loss": 0.7297, + "step": 13000 + }, + { + "epoch": 4.230644111906311, + "grad_norm": 1.3918139934539795, + "learning_rate": 2.9849020542702887e-06, + "loss": 0.6545, + "step": 13005 + }, + { + "epoch": 4.232270657124268, + "grad_norm": 1.5246433019638062, + "learning_rate": 2.9726844452312744e-06, + "loss": 0.6715, + "step": 13010 + }, + { + "epoch": 4.233897202342225, + "grad_norm": 1.703620195388794, + "learning_rate": 2.96049031090517e-06, + "loss": 0.6544, + "step": 13015 + }, + { + "epoch": 4.235523747560182, + "grad_norm": 1.4148470163345337, + "learning_rate": 2.948319664287383e-06, + "loss": 0.6612, + "step": 13020 + }, + { + "epoch": 4.237150292778139, + "grad_norm": 1.7176105976104736, + "learning_rate": 2.9361725183482762e-06, + "loss": 0.6744, + "step": 13025 + }, + { + "epoch": 4.238776837996096, + "grad_norm": 1.5496314764022827, + "learning_rate": 2.9240488860331787e-06, + "loss": 0.6879, + "step": 13030 + }, + { + "epoch": 4.240403383214053, + "grad_norm": 1.7216888666152954, + "learning_rate": 2.9119487802623523e-06, + "loss": 0.6661, + "step": 13035 + }, + { + "epoch": 4.24202992843201, + "grad_norm": 1.8513258695602417, + "learning_rate": 2.8998722139309933e-06, + "loss": 0.6886, + "step": 13040 + }, + { + "epoch": 4.243656473649968, + "grad_norm": 1.6031893491744995, + "learning_rate": 2.8878191999092115e-06, + "loss": 0.7155, + "step": 13045 + }, + { + "epoch": 4.245283018867925, + "grad_norm": 1.5605480670928955, + "learning_rate": 2.8757897510420186e-06, + "loss": 0.6706, + "step": 13050 + }, + { + "epoch": 4.246909564085882, + "grad_norm": 1.5444579124450684, + "learning_rate": 2.863783880149301e-06, + "loss": 0.6837, + "step": 13055 + }, + { + "epoch": 4.2485361093038385, + "grad_norm": 1.830214262008667, + "learning_rate": 2.85180160002583e-06, + "loss": 0.6836, + "step": 13060 + }, + { + "epoch": 4.250162654521795, + "grad_norm": 1.885754108428955, + "learning_rate": 2.83984292344123e-06, + "loss": 0.6788, + "step": 13065 + }, + { + "epoch": 4.251789199739752, + "grad_norm": 1.5409435033798218, + "learning_rate": 2.8279078631399725e-06, + "loss": 0.6589, + "step": 13070 + }, + { + "epoch": 4.25341574495771, + "grad_norm": 1.5811169147491455, + "learning_rate": 2.8159964318413613e-06, + "loss": 0.6891, + "step": 13075 + }, + { + "epoch": 4.255042290175667, + "grad_norm": 1.5375542640686035, + "learning_rate": 2.804108642239525e-06, + "loss": 0.6548, + "step": 13080 + }, + { + "epoch": 4.256668835393624, + "grad_norm": 1.6110589504241943, + "learning_rate": 2.7922445070033804e-06, + "loss": 0.6294, + "step": 13085 + }, + { + "epoch": 4.258295380611581, + "grad_norm": 1.8343173265457153, + "learning_rate": 2.780404038776649e-06, + "loss": 0.6765, + "step": 13090 + }, + { + "epoch": 4.259921925829538, + "grad_norm": 1.6019445657730103, + "learning_rate": 2.7685872501778314e-06, + "loss": 0.679, + "step": 13095 + }, + { + "epoch": 4.261548471047496, + "grad_norm": 1.5864222049713135, + "learning_rate": 2.7567941538001796e-06, + "loss": 0.6807, + "step": 13100 + }, + { + "epoch": 4.2631750162654525, + "grad_norm": 1.6361010074615479, + "learning_rate": 2.745024762211712e-06, + "loss": 0.6275, + "step": 13105 + }, + { + "epoch": 4.264801561483409, + "grad_norm": 1.7175207138061523, + "learning_rate": 2.7332790879551734e-06, + "loss": 0.6688, + "step": 13110 + }, + { + "epoch": 4.266428106701366, + "grad_norm": 1.6481674909591675, + "learning_rate": 2.721557143548045e-06, + "loss": 0.7097, + "step": 13115 + }, + { + "epoch": 4.268054651919323, + "grad_norm": 1.5934703350067139, + "learning_rate": 2.709858941482496e-06, + "loss": 0.6725, + "step": 13120 + }, + { + "epoch": 4.26968119713728, + "grad_norm": 1.622652530670166, + "learning_rate": 2.6981844942254253e-06, + "loss": 0.6568, + "step": 13125 + }, + { + "epoch": 4.271307742355237, + "grad_norm": 1.6894246339797974, + "learning_rate": 2.686533814218384e-06, + "loss": 0.6772, + "step": 13130 + }, + { + "epoch": 4.272934287573195, + "grad_norm": 1.4382003545761108, + "learning_rate": 2.6749069138776146e-06, + "loss": 0.6631, + "step": 13135 + }, + { + "epoch": 4.274560832791152, + "grad_norm": 1.7184326648712158, + "learning_rate": 2.663303805594011e-06, + "loss": 0.6847, + "step": 13140 + }, + { + "epoch": 4.276187378009109, + "grad_norm": 1.667479157447815, + "learning_rate": 2.6517245017331187e-06, + "loss": 0.6603, + "step": 13145 + }, + { + "epoch": 4.2778139232270656, + "grad_norm": 1.7459298372268677, + "learning_rate": 2.640169014635102e-06, + "loss": 0.6909, + "step": 13150 + }, + { + "epoch": 4.2794404684450225, + "grad_norm": 1.7656573057174683, + "learning_rate": 2.628637356614752e-06, + "loss": 0.6805, + "step": 13155 + }, + { + "epoch": 4.28106701366298, + "grad_norm": 1.9437636137008667, + "learning_rate": 2.6171295399614603e-06, + "loss": 0.7043, + "step": 13160 + }, + { + "epoch": 4.282693558880937, + "grad_norm": 1.6186453104019165, + "learning_rate": 2.605645576939211e-06, + "loss": 0.7114, + "step": 13165 + }, + { + "epoch": 4.284320104098894, + "grad_norm": 1.577893614768982, + "learning_rate": 2.594185479786576e-06, + "loss": 0.6812, + "step": 13170 + }, + { + "epoch": 4.285946649316851, + "grad_norm": 1.8926020860671997, + "learning_rate": 2.582749260716688e-06, + "loss": 0.6735, + "step": 13175 + }, + { + "epoch": 4.287573194534808, + "grad_norm": 1.4763965606689453, + "learning_rate": 2.5713369319172203e-06, + "loss": 0.6719, + "step": 13180 + }, + { + "epoch": 4.289199739752765, + "grad_norm": 1.4994641542434692, + "learning_rate": 2.5599485055504053e-06, + "loss": 0.6661, + "step": 13185 + }, + { + "epoch": 4.290826284970722, + "grad_norm": 1.6598994731903076, + "learning_rate": 2.5485839937529897e-06, + "loss": 0.7234, + "step": 13190 + }, + { + "epoch": 4.2924528301886795, + "grad_norm": 1.6748179197311401, + "learning_rate": 2.537243408636239e-06, + "loss": 0.695, + "step": 13195 + }, + { + "epoch": 4.294079375406636, + "grad_norm": 1.7361656427383423, + "learning_rate": 2.5259267622859182e-06, + "loss": 0.6742, + "step": 13200 + }, + { + "epoch": 4.295705920624593, + "grad_norm": 1.6123602390289307, + "learning_rate": 2.514634066762289e-06, + "loss": 0.701, + "step": 13205 + }, + { + "epoch": 4.29733246584255, + "grad_norm": 1.5134855508804321, + "learning_rate": 2.503365334100069e-06, + "loss": 0.7204, + "step": 13210 + }, + { + "epoch": 4.298959011060507, + "grad_norm": 1.6295850276947021, + "learning_rate": 2.4921205763084626e-06, + "loss": 0.6671, + "step": 13215 + }, + { + "epoch": 4.300585556278465, + "grad_norm": 1.7750049829483032, + "learning_rate": 2.4808998053711007e-06, + "loss": 0.7067, + "step": 13220 + }, + { + "epoch": 4.302212101496422, + "grad_norm": 1.5359071493148804, + "learning_rate": 2.4697030332460653e-06, + "loss": 0.678, + "step": 13225 + }, + { + "epoch": 4.303838646714379, + "grad_norm": 1.6256874799728394, + "learning_rate": 2.458530271865861e-06, + "loss": 0.6976, + "step": 13230 + }, + { + "epoch": 4.305465191932336, + "grad_norm": 1.589576005935669, + "learning_rate": 2.447381533137405e-06, + "loss": 0.6561, + "step": 13235 + }, + { + "epoch": 4.307091737150293, + "grad_norm": 1.5652520656585693, + "learning_rate": 2.4362568289420103e-06, + "loss": 0.6601, + "step": 13240 + }, + { + "epoch": 4.3087182823682495, + "grad_norm": 1.7298879623413086, + "learning_rate": 2.427374378417388e-06, + "loss": 0.6649, + "step": 13245 + }, + { + "epoch": 4.310344827586207, + "grad_norm": 1.5562372207641602, + "learning_rate": 2.416292966240641e-06, + "loss": 0.6751, + "step": 13250 + }, + { + "epoch": 4.311971372804164, + "grad_norm": 1.5756971836090088, + "learning_rate": 2.405235621728322e-06, + "loss": 0.6635, + "step": 13255 + }, + { + "epoch": 4.313597918022121, + "grad_norm": 1.8689061403274536, + "learning_rate": 2.394202356664349e-06, + "loss": 0.6921, + "step": 13260 + }, + { + "epoch": 4.315224463240078, + "grad_norm": 1.9085664749145508, + "learning_rate": 2.383193182806978e-06, + "loss": 0.683, + "step": 13265 + }, + { + "epoch": 4.316851008458035, + "grad_norm": 1.6282198429107666, + "learning_rate": 2.3722081118887767e-06, + "loss": 0.6982, + "step": 13270 + }, + { + "epoch": 4.318477553675992, + "grad_norm": 1.935289740562439, + "learning_rate": 2.3612471556166442e-06, + "loss": 0.6637, + "step": 13275 + }, + { + "epoch": 4.32010409889395, + "grad_norm": 1.777384877204895, + "learning_rate": 2.3503103256717673e-06, + "loss": 0.6863, + "step": 13280 + }, + { + "epoch": 4.3217306441119065, + "grad_norm": 1.5307095050811768, + "learning_rate": 2.3393976337096334e-06, + "loss": 0.7005, + "step": 13285 + }, + { + "epoch": 4.3233571893298635, + "grad_norm": 1.6356438398361206, + "learning_rate": 2.328509091359984e-06, + "loss": 0.6843, + "step": 13290 + }, + { + "epoch": 4.32498373454782, + "grad_norm": 1.5878732204437256, + "learning_rate": 2.3176447102268602e-06, + "loss": 0.6621, + "step": 13295 + }, + { + "epoch": 4.326610279765777, + "grad_norm": 1.491715908050537, + "learning_rate": 2.3068045018885153e-06, + "loss": 0.7134, + "step": 13300 + }, + { + "epoch": 4.328236824983734, + "grad_norm": 1.5836570262908936, + "learning_rate": 2.2959884778974735e-06, + "loss": 0.6724, + "step": 13305 + }, + { + "epoch": 4.329863370201692, + "grad_norm": 1.6826726198196411, + "learning_rate": 2.285196649780455e-06, + "loss": 0.6944, + "step": 13310 + }, + { + "epoch": 4.331489915419649, + "grad_norm": 2.0284595489501953, + "learning_rate": 2.2744290290384247e-06, + "loss": 0.6715, + "step": 13315 + }, + { + "epoch": 4.333116460637606, + "grad_norm": 1.5435397624969482, + "learning_rate": 2.2636856271465194e-06, + "loss": 0.6789, + "step": 13320 + }, + { + "epoch": 4.334743005855563, + "grad_norm": 1.5644055604934692, + "learning_rate": 2.252966455554101e-06, + "loss": 0.7005, + "step": 13325 + }, + { + "epoch": 4.33636955107352, + "grad_norm": 2.4014153480529785, + "learning_rate": 2.242271525684672e-06, + "loss": 0.6558, + "step": 13330 + }, + { + "epoch": 4.3379960962914765, + "grad_norm": 1.8670786619186401, + "learning_rate": 2.2316008489359304e-06, + "loss": 0.6984, + "step": 13335 + }, + { + "epoch": 4.339622641509434, + "grad_norm": 1.5707929134368896, + "learning_rate": 2.2209544366797066e-06, + "loss": 0.6626, + "step": 13340 + }, + { + "epoch": 4.341249186727391, + "grad_norm": 1.4436675310134888, + "learning_rate": 2.2103323002619857e-06, + "loss": 0.6597, + "step": 13345 + }, + { + "epoch": 4.342875731945348, + "grad_norm": 1.5216563940048218, + "learning_rate": 2.199734451002869e-06, + "loss": 0.6822, + "step": 13350 + }, + { + "epoch": 4.344502277163305, + "grad_norm": 1.6805729866027832, + "learning_rate": 2.189160900196585e-06, + "loss": 0.6738, + "step": 13355 + }, + { + "epoch": 4.346128822381262, + "grad_norm": 1.7852559089660645, + "learning_rate": 2.178611659111468e-06, + "loss": 0.6557, + "step": 13360 + }, + { + "epoch": 4.347755367599219, + "grad_norm": 1.955796718597412, + "learning_rate": 2.1680867389899355e-06, + "loss": 0.675, + "step": 13365 + }, + { + "epoch": 4.349381912817177, + "grad_norm": 1.8595960140228271, + "learning_rate": 2.1575861510485017e-06, + "loss": 0.7121, + "step": 13370 + }, + { + "epoch": 4.351008458035134, + "grad_norm": 2.8343262672424316, + "learning_rate": 2.147109906477726e-06, + "loss": 0.6618, + "step": 13375 + }, + { + "epoch": 4.3526350032530905, + "grad_norm": 1.6237424612045288, + "learning_rate": 2.136658016442253e-06, + "loss": 0.7207, + "step": 13380 + }, + { + "epoch": 4.354261548471047, + "grad_norm": 1.6495102643966675, + "learning_rate": 2.126230492080744e-06, + "loss": 0.6919, + "step": 13385 + }, + { + "epoch": 4.355888093689004, + "grad_norm": 1.836276888847351, + "learning_rate": 2.1158273445059135e-06, + "loss": 0.6972, + "step": 13390 + }, + { + "epoch": 4.357514638906961, + "grad_norm": 1.5071443319320679, + "learning_rate": 2.1054485848044952e-06, + "loss": 0.6625, + "step": 13395 + }, + { + "epoch": 4.359141184124919, + "grad_norm": 1.6386994123458862, + "learning_rate": 2.095094224037228e-06, + "loss": 0.7104, + "step": 13400 + }, + { + "epoch": 4.360767729342876, + "grad_norm": 1.5281263589859009, + "learning_rate": 2.0847642732388457e-06, + "loss": 0.6701, + "step": 13405 + }, + { + "epoch": 4.362394274560833, + "grad_norm": 1.653648853302002, + "learning_rate": 2.0744587434180757e-06, + "loss": 0.6973, + "step": 13410 + }, + { + "epoch": 4.36402081977879, + "grad_norm": 1.5188273191452026, + "learning_rate": 2.0641776455576105e-06, + "loss": 0.6845, + "step": 13415 + }, + { + "epoch": 4.365647364996747, + "grad_norm": 1.5206401348114014, + "learning_rate": 2.0539209906141167e-06, + "loss": 0.6819, + "step": 13420 + }, + { + "epoch": 4.367273910214704, + "grad_norm": 1.7888633012771606, + "learning_rate": 2.0436887895182e-06, + "loss": 0.672, + "step": 13425 + }, + { + "epoch": 4.368900455432661, + "grad_norm": 1.8444963693618774, + "learning_rate": 2.0334810531744213e-06, + "loss": 0.6761, + "step": 13430 + }, + { + "epoch": 4.370527000650618, + "grad_norm": 1.7280285358428955, + "learning_rate": 2.0232977924612457e-06, + "loss": 0.683, + "step": 13435 + }, + { + "epoch": 4.372153545868575, + "grad_norm": 1.6709339618682861, + "learning_rate": 2.0131390182310805e-06, + "loss": 0.696, + "step": 13440 + }, + { + "epoch": 4.373780091086532, + "grad_norm": 1.9296367168426514, + "learning_rate": 2.003004741310216e-06, + "loss": 0.684, + "step": 13445 + }, + { + "epoch": 4.375406636304489, + "grad_norm": 1.607478380203247, + "learning_rate": 1.992894972498846e-06, + "loss": 0.6868, + "step": 13450 + }, + { + "epoch": 4.377033181522446, + "grad_norm": 1.5199337005615234, + "learning_rate": 1.982809722571047e-06, + "loss": 0.6747, + "step": 13455 + }, + { + "epoch": 4.378659726740404, + "grad_norm": 1.7574974298477173, + "learning_rate": 1.972749002274765e-06, + "loss": 0.6894, + "step": 13460 + }, + { + "epoch": 4.380286271958361, + "grad_norm": 1.6400377750396729, + "learning_rate": 1.9627128223317942e-06, + "loss": 0.7086, + "step": 13465 + }, + { + "epoch": 4.3819128171763175, + "grad_norm": 3.752514123916626, + "learning_rate": 1.952701193437792e-06, + "loss": 0.7106, + "step": 13470 + }, + { + "epoch": 4.383539362394274, + "grad_norm": 1.859137773513794, + "learning_rate": 1.942714126262238e-06, + "loss": 0.6602, + "step": 13475 + }, + { + "epoch": 4.385165907612231, + "grad_norm": 1.9155802726745605, + "learning_rate": 1.93275163144844e-06, + "loss": 0.6831, + "step": 13480 + }, + { + "epoch": 4.386792452830189, + "grad_norm": 1.5806576013565063, + "learning_rate": 1.9228137196135254e-06, + "loss": 0.6831, + "step": 13485 + }, + { + "epoch": 4.388418998048146, + "grad_norm": 1.658105731010437, + "learning_rate": 1.912900401348422e-06, + "loss": 0.6775, + "step": 13490 + }, + { + "epoch": 4.390045543266103, + "grad_norm": 1.6631839275360107, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.6689, + "step": 13495 + }, + { + "epoch": 4.39167208848406, + "grad_norm": 1.5634475946426392, + "learning_rate": 1.8931475877602579e-06, + "loss": 0.6974, + "step": 13500 + }, + { + "epoch": 4.393298633702017, + "grad_norm": 1.4149178266525269, + "learning_rate": 1.8833081134879637e-06, + "loss": 0.6884, + "step": 13505 + }, + { + "epoch": 4.394925178919974, + "grad_norm": 1.7998847961425781, + "learning_rate": 1.8734932748869588e-06, + "loss": 0.6625, + "step": 13510 + }, + { + "epoch": 4.396551724137931, + "grad_norm": 1.500913381576538, + "learning_rate": 1.8637030824170121e-06, + "loss": 0.6588, + "step": 13515 + }, + { + "epoch": 4.398178269355888, + "grad_norm": 1.8569172620773315, + "learning_rate": 1.8539375465116193e-06, + "loss": 0.6976, + "step": 13520 + }, + { + "epoch": 4.399804814573845, + "grad_norm": 1.6474556922912598, + "learning_rate": 1.8441966775780112e-06, + "loss": 0.6597, + "step": 13525 + }, + { + "epoch": 4.401431359791802, + "grad_norm": 1.8232332468032837, + "learning_rate": 1.834480485997106e-06, + "loss": 0.6937, + "step": 13530 + }, + { + "epoch": 4.403057905009759, + "grad_norm": 1.4316027164459229, + "learning_rate": 1.8247889821235543e-06, + "loss": 0.6689, + "step": 13535 + }, + { + "epoch": 4.404684450227716, + "grad_norm": 1.717432975769043, + "learning_rate": 1.815122176285669e-06, + "loss": 0.687, + "step": 13540 + }, + { + "epoch": 4.406310995445674, + "grad_norm": 1.7138704061508179, + "learning_rate": 1.8054800787854569e-06, + "loss": 0.6816, + "step": 13545 + }, + { + "epoch": 4.407937540663631, + "grad_norm": 1.5330016613006592, + "learning_rate": 1.7958626998985928e-06, + "loss": 0.6954, + "step": 13550 + }, + { + "epoch": 4.409564085881588, + "grad_norm": 1.5293231010437012, + "learning_rate": 1.7862700498744085e-06, + "loss": 0.661, + "step": 13555 + }, + { + "epoch": 4.411190631099545, + "grad_norm": 1.6859644651412964, + "learning_rate": 1.7767021389358706e-06, + "loss": 0.6796, + "step": 13560 + }, + { + "epoch": 4.4128171763175015, + "grad_norm": 1.5425148010253906, + "learning_rate": 1.767158977279601e-06, + "loss": 0.6904, + "step": 13565 + }, + { + "epoch": 4.414443721535458, + "grad_norm": 1.5657821893692017, + "learning_rate": 1.7576405750758224e-06, + "loss": 0.6667, + "step": 13570 + }, + { + "epoch": 4.416070266753415, + "grad_norm": 1.6440789699554443, + "learning_rate": 1.74814694246839e-06, + "loss": 0.6737, + "step": 13575 + }, + { + "epoch": 4.417696811971373, + "grad_norm": 1.7115478515625, + "learning_rate": 1.7386780895747578e-06, + "loss": 0.7053, + "step": 13580 + }, + { + "epoch": 4.41932335718933, + "grad_norm": 1.529528021812439, + "learning_rate": 1.729234026485968e-06, + "loss": 0.6682, + "step": 13585 + }, + { + "epoch": 4.420949902407287, + "grad_norm": 1.57627534866333, + "learning_rate": 1.7198147632666416e-06, + "loss": 0.6664, + "step": 13590 + }, + { + "epoch": 4.422576447625244, + "grad_norm": 1.5823880434036255, + "learning_rate": 1.7104203099549827e-06, + "loss": 0.637, + "step": 13595 + }, + { + "epoch": 4.424202992843201, + "grad_norm": 1.6674782037734985, + "learning_rate": 1.701050676562735e-06, + "loss": 0.6721, + "step": 13600 + }, + { + "epoch": 4.4258295380611585, + "grad_norm": 1.5833678245544434, + "learning_rate": 1.691705873075211e-06, + "loss": 0.67, + "step": 13605 + }, + { + "epoch": 4.427456083279115, + "grad_norm": 1.635013222694397, + "learning_rate": 1.6823859094512507e-06, + "loss": 0.6944, + "step": 13610 + }, + { + "epoch": 4.429082628497072, + "grad_norm": 1.5148203372955322, + "learning_rate": 1.6730907956232306e-06, + "loss": 0.6621, + "step": 13615 + }, + { + "epoch": 4.430709173715029, + "grad_norm": 1.5766314268112183, + "learning_rate": 1.6638205414970298e-06, + "loss": 0.6618, + "step": 13620 + }, + { + "epoch": 4.432335718932986, + "grad_norm": 1.7416802644729614, + "learning_rate": 1.654575156952054e-06, + "loss": 0.6925, + "step": 13625 + }, + { + "epoch": 4.433962264150943, + "grad_norm": 1.5971283912658691, + "learning_rate": 1.6453546518411855e-06, + "loss": 0.6747, + "step": 13630 + }, + { + "epoch": 4.4355888093689, + "grad_norm": 1.6883677244186401, + "learning_rate": 1.636159035990803e-06, + "loss": 0.6991, + "step": 13635 + }, + { + "epoch": 4.437215354586858, + "grad_norm": 1.7667531967163086, + "learning_rate": 1.6269883192007618e-06, + "loss": 0.662, + "step": 13640 + }, + { + "epoch": 4.438841899804815, + "grad_norm": 1.6016697883605957, + "learning_rate": 1.6178425112443774e-06, + "loss": 0.6743, + "step": 13645 + }, + { + "epoch": 4.440468445022772, + "grad_norm": 1.6263352632522583, + "learning_rate": 1.608721621868428e-06, + "loss": 0.7229, + "step": 13650 + }, + { + "epoch": 4.4420949902407285, + "grad_norm": 1.7048295736312866, + "learning_rate": 1.5996256607931193e-06, + "loss": 0.6607, + "step": 13655 + }, + { + "epoch": 4.443721535458685, + "grad_norm": 1.7446348667144775, + "learning_rate": 1.5905546377121077e-06, + "loss": 0.687, + "step": 13660 + }, + { + "epoch": 4.445348080676643, + "grad_norm": 1.7615159749984741, + "learning_rate": 1.5815085622924607e-06, + "loss": 0.6777, + "step": 13665 + }, + { + "epoch": 4.4469746258946, + "grad_norm": 1.926862120628357, + "learning_rate": 1.5724874441746696e-06, + "loss": 0.6835, + "step": 13670 + }, + { + "epoch": 4.448601171112557, + "grad_norm": 1.694735050201416, + "learning_rate": 1.5634912929726131e-06, + "loss": 0.7306, + "step": 13675 + }, + { + "epoch": 4.450227716330514, + "grad_norm": 1.7926830053329468, + "learning_rate": 1.5545201182735891e-06, + "loss": 0.6911, + "step": 13680 + }, + { + "epoch": 4.451854261548471, + "grad_norm": 1.59516441822052, + "learning_rate": 1.5455739296382442e-06, + "loss": 0.6875, + "step": 13685 + }, + { + "epoch": 4.453480806766428, + "grad_norm": 1.608954906463623, + "learning_rate": 1.5366527366006272e-06, + "loss": 0.6785, + "step": 13690 + }, + { + "epoch": 4.455107351984386, + "grad_norm": 1.6335598230361938, + "learning_rate": 1.5277565486681272e-06, + "loss": 0.6706, + "step": 13695 + }, + { + "epoch": 4.4567338972023425, + "grad_norm": 1.6038471460342407, + "learning_rate": 1.5188853753214966e-06, + "loss": 0.6683, + "step": 13700 + }, + { + "epoch": 4.458360442420299, + "grad_norm": 1.7724075317382812, + "learning_rate": 1.51003922601482e-06, + "loss": 0.7016, + "step": 13705 + }, + { + "epoch": 4.459986987638256, + "grad_norm": 1.7537798881530762, + "learning_rate": 1.501218110175534e-06, + "loss": 0.6639, + "step": 13710 + }, + { + "epoch": 4.461613532856213, + "grad_norm": 1.569793939590454, + "learning_rate": 1.4924220372043718e-06, + "loss": 0.6899, + "step": 13715 + }, + { + "epoch": 4.46324007807417, + "grad_norm": 1.8291469812393188, + "learning_rate": 1.483651016475393e-06, + "loss": 0.7068, + "step": 13720 + }, + { + "epoch": 4.464866623292128, + "grad_norm": 1.8672168254852295, + "learning_rate": 1.474905057335954e-06, + "loss": 0.6502, + "step": 13725 + }, + { + "epoch": 4.466493168510085, + "grad_norm": 1.4979305267333984, + "learning_rate": 1.4661841691067074e-06, + "loss": 0.6711, + "step": 13730 + }, + { + "epoch": 4.468119713728042, + "grad_norm": 1.5918117761611938, + "learning_rate": 1.457488361081577e-06, + "loss": 0.6541, + "step": 13735 + }, + { + "epoch": 4.469746258945999, + "grad_norm": 1.4842528104782104, + "learning_rate": 1.4488176425277693e-06, + "loss": 0.6809, + "step": 13740 + }, + { + "epoch": 4.4713728041639555, + "grad_norm": 1.9275486469268799, + "learning_rate": 1.4401720226857485e-06, + "loss": 0.695, + "step": 13745 + }, + { + "epoch": 4.4729993493819125, + "grad_norm": 1.7275476455688477, + "learning_rate": 1.4315515107692385e-06, + "loss": 0.6852, + "step": 13750 + }, + { + "epoch": 4.47462589459987, + "grad_norm": 1.4187053442001343, + "learning_rate": 1.422956115965185e-06, + "loss": 0.6944, + "step": 13755 + }, + { + "epoch": 4.476252439817827, + "grad_norm": 1.8387137651443481, + "learning_rate": 1.4143858474337912e-06, + "loss": 0.7114, + "step": 13760 + }, + { + "epoch": 4.477878985035784, + "grad_norm": 1.4862850904464722, + "learning_rate": 1.4058407143084596e-06, + "loss": 0.6943, + "step": 13765 + }, + { + "epoch": 4.479505530253741, + "grad_norm": 1.6738651990890503, + "learning_rate": 1.3973207256958277e-06, + "loss": 0.6705, + "step": 13770 + }, + { + "epoch": 4.481132075471698, + "grad_norm": 1.5694196224212646, + "learning_rate": 1.3888258906757185e-06, + "loss": 0.6723, + "step": 13775 + }, + { + "epoch": 4.482758620689655, + "grad_norm": 1.5761380195617676, + "learning_rate": 1.3803562183011598e-06, + "loss": 0.6751, + "step": 13780 + }, + { + "epoch": 4.484385165907613, + "grad_norm": 1.5624138116836548, + "learning_rate": 1.371911717598362e-06, + "loss": 0.6858, + "step": 13785 + }, + { + "epoch": 4.4860117111255695, + "grad_norm": 1.7549540996551514, + "learning_rate": 1.3634923975667013e-06, + "loss": 0.6778, + "step": 13790 + }, + { + "epoch": 4.487638256343526, + "grad_norm": 1.3703219890594482, + "learning_rate": 1.3550982671787283e-06, + "loss": 0.6908, + "step": 13795 + }, + { + "epoch": 4.489264801561483, + "grad_norm": 1.7416917085647583, + "learning_rate": 1.3467293353801424e-06, + "loss": 0.6981, + "step": 13800 + }, + { + "epoch": 4.49089134677944, + "grad_norm": 1.4844809770584106, + "learning_rate": 1.3383856110897901e-06, + "loss": 0.6882, + "step": 13805 + }, + { + "epoch": 4.492517891997397, + "grad_norm": 1.662513256072998, + "learning_rate": 1.3300671031996586e-06, + "loss": 0.6723, + "step": 13810 + }, + { + "epoch": 4.494144437215355, + "grad_norm": 1.6496549844741821, + "learning_rate": 1.3217738205748619e-06, + "loss": 0.7103, + "step": 13815 + }, + { + "epoch": 4.495770982433312, + "grad_norm": 1.6217796802520752, + "learning_rate": 1.3135057720536193e-06, + "loss": 0.6716, + "step": 13820 + }, + { + "epoch": 4.497397527651269, + "grad_norm": 1.6400377750396729, + "learning_rate": 1.3052629664472738e-06, + "loss": 0.6653, + "step": 13825 + }, + { + "epoch": 4.499024072869226, + "grad_norm": 1.5758782625198364, + "learning_rate": 1.2970454125402514e-06, + "loss": 0.6778, + "step": 13830 + }, + { + "epoch": 4.500650618087183, + "grad_norm": 1.8863537311553955, + "learning_rate": 1.2888531190900777e-06, + "loss": 0.6912, + "step": 13835 + }, + { + "epoch": 4.5022771633051395, + "grad_norm": 1.5520035028457642, + "learning_rate": 1.2806860948273575e-06, + "loss": 0.6888, + "step": 13840 + }, + { + "epoch": 4.503903708523097, + "grad_norm": 1.7430062294006348, + "learning_rate": 1.2725443484557675e-06, + "loss": 0.6698, + "step": 13845 + }, + { + "epoch": 4.505530253741054, + "grad_norm": 1.6013933420181274, + "learning_rate": 1.264427888652031e-06, + "loss": 0.6749, + "step": 13850 + }, + { + "epoch": 4.507156798959011, + "grad_norm": 1.6851446628570557, + "learning_rate": 1.256336724065943e-06, + "loss": 0.6939, + "step": 13855 + }, + { + "epoch": 4.508783344176968, + "grad_norm": 1.4689491987228394, + "learning_rate": 1.2482708633203278e-06, + "loss": 0.7121, + "step": 13860 + }, + { + "epoch": 4.510409889394925, + "grad_norm": 2.6870551109313965, + "learning_rate": 1.2402303150110455e-06, + "loss": 0.661, + "step": 13865 + }, + { + "epoch": 4.512036434612883, + "grad_norm": 1.8838740587234497, + "learning_rate": 1.2322150877069894e-06, + "loss": 0.6829, + "step": 13870 + }, + { + "epoch": 4.51366297983084, + "grad_norm": 1.5556480884552002, + "learning_rate": 1.2242251899500568e-06, + "loss": 0.6709, + "step": 13875 + }, + { + "epoch": 4.5152895250487965, + "grad_norm": 1.86298668384552, + "learning_rate": 1.2162606302551532e-06, + "loss": 0.681, + "step": 13880 + }, + { + "epoch": 4.5169160702667535, + "grad_norm": 1.6995937824249268, + "learning_rate": 1.2083214171101893e-06, + "loss": 0.7031, + "step": 13885 + }, + { + "epoch": 4.51854261548471, + "grad_norm": 1.7769548892974854, + "learning_rate": 1.2004075589760495e-06, + "loss": 0.6769, + "step": 13890 + }, + { + "epoch": 4.520169160702667, + "grad_norm": 1.8099085092544556, + "learning_rate": 1.1925190642866096e-06, + "loss": 0.6624, + "step": 13895 + }, + { + "epoch": 4.521795705920624, + "grad_norm": 1.5483301877975464, + "learning_rate": 1.184655941448712e-06, + "loss": 0.6627, + "step": 13900 + }, + { + "epoch": 4.523422251138582, + "grad_norm": 1.6125638484954834, + "learning_rate": 1.1768181988421583e-06, + "loss": 0.6681, + "step": 13905 + }, + { + "epoch": 4.525048796356539, + "grad_norm": 1.6662535667419434, + "learning_rate": 1.1690058448197038e-06, + "loss": 0.7225, + "step": 13910 + }, + { + "epoch": 4.526675341574496, + "grad_norm": 1.6829675436019897, + "learning_rate": 1.161218887707044e-06, + "loss": 0.7157, + "step": 13915 + }, + { + "epoch": 4.528301886792453, + "grad_norm": 1.623538851737976, + "learning_rate": 1.1534573358028127e-06, + "loss": 0.6658, + "step": 13920 + }, + { + "epoch": 4.52992843201041, + "grad_norm": 1.629623293876648, + "learning_rate": 1.1457211973785632e-06, + "loss": 0.6512, + "step": 13925 + }, + { + "epoch": 4.531554977228367, + "grad_norm": 1.6995491981506348, + "learning_rate": 1.1380104806787722e-06, + "loss": 0.7059, + "step": 13930 + }, + { + "epoch": 4.533181522446324, + "grad_norm": 1.6409274339675903, + "learning_rate": 1.1303251939208227e-06, + "loss": 0.6799, + "step": 13935 + }, + { + "epoch": 4.534808067664281, + "grad_norm": 1.5322301387786865, + "learning_rate": 1.1226653452949987e-06, + "loss": 0.706, + "step": 13940 + }, + { + "epoch": 4.536434612882238, + "grad_norm": 1.5808583498001099, + "learning_rate": 1.1150309429644623e-06, + "loss": 0.6644, + "step": 13945 + }, + { + "epoch": 4.538061158100195, + "grad_norm": 1.4785289764404297, + "learning_rate": 1.1074219950652747e-06, + "loss": 0.6956, + "step": 13950 + }, + { + "epoch": 4.539687703318152, + "grad_norm": 1.5430021286010742, + "learning_rate": 1.0998385097063553e-06, + "loss": 0.6651, + "step": 13955 + }, + { + "epoch": 4.541314248536109, + "grad_norm": 1.736583948135376, + "learning_rate": 1.0922804949694943e-06, + "loss": 0.6847, + "step": 13960 + }, + { + "epoch": 4.542940793754067, + "grad_norm": 1.5016318559646606, + "learning_rate": 1.0847479589093435e-06, + "loss": 0.6949, + "step": 13965 + }, + { + "epoch": 4.544567338972024, + "grad_norm": 1.565224528312683, + "learning_rate": 1.0772409095533953e-06, + "loss": 0.6952, + "step": 13970 + }, + { + "epoch": 4.5461938841899805, + "grad_norm": 1.649155855178833, + "learning_rate": 1.0697593549019752e-06, + "loss": 0.6531, + "step": 13975 + }, + { + "epoch": 4.547820429407937, + "grad_norm": 1.6376577615737915, + "learning_rate": 1.0623033029282525e-06, + "loss": 0.6913, + "step": 13980 + }, + { + "epoch": 4.549446974625894, + "grad_norm": 1.5706044435501099, + "learning_rate": 1.0548727615782017e-06, + "loss": 0.6803, + "step": 13985 + }, + { + "epoch": 4.551073519843852, + "grad_norm": 1.6474052667617798, + "learning_rate": 1.0474677387706244e-06, + "loss": 0.7117, + "step": 13990 + }, + { + "epoch": 4.552700065061809, + "grad_norm": 1.6361408233642578, + "learning_rate": 1.040088242397122e-06, + "loss": 0.6728, + "step": 13995 + }, + { + "epoch": 4.554326610279766, + "grad_norm": 1.6147422790527344, + "learning_rate": 1.0327342803220925e-06, + "loss": 0.6864, + "step": 14000 + }, + { + "epoch": 4.555953155497723, + "grad_norm": 1.5160025358200073, + "learning_rate": 1.0254058603827139e-06, + "loss": 0.7029, + "step": 14005 + }, + { + "epoch": 4.55757970071568, + "grad_norm": 1.6433602571487427, + "learning_rate": 1.018102990388961e-06, + "loss": 0.6838, + "step": 14010 + }, + { + "epoch": 4.559206245933637, + "grad_norm": 1.6100993156433105, + "learning_rate": 1.0108256781235615e-06, + "loss": 0.6543, + "step": 14015 + }, + { + "epoch": 4.560832791151594, + "grad_norm": 2.2205612659454346, + "learning_rate": 1.0035739313420194e-06, + "loss": 0.7032, + "step": 14020 + }, + { + "epoch": 4.562459336369551, + "grad_norm": 1.7252569198608398, + "learning_rate": 9.96347757772581e-07, + "loss": 0.6625, + "step": 14025 + }, + { + "epoch": 4.564085881587508, + "grad_norm": 1.7936674356460571, + "learning_rate": 9.891471651162554e-07, + "loss": 0.7072, + "step": 14030 + }, + { + "epoch": 4.565712426805465, + "grad_norm": 1.66959547996521, + "learning_rate": 9.819721610467736e-07, + "loss": 0.6815, + "step": 14035 + }, + { + "epoch": 4.567338972023422, + "grad_norm": 1.6491159200668335, + "learning_rate": 9.748227532106025e-07, + "loss": 0.701, + "step": 14040 + }, + { + "epoch": 4.568965517241379, + "grad_norm": 1.7318546772003174, + "learning_rate": 9.676989492269417e-07, + "loss": 0.6505, + "step": 14045 + }, + { + "epoch": 4.570592062459337, + "grad_norm": 1.3802294731140137, + "learning_rate": 9.606007566876823e-07, + "loss": 0.6852, + "step": 14050 + }, + { + "epoch": 4.572218607677294, + "grad_norm": 1.8392539024353027, + "learning_rate": 9.535281831574399e-07, + "loss": 0.6434, + "step": 14055 + }, + { + "epoch": 4.573845152895251, + "grad_norm": 1.9678912162780762, + "learning_rate": 9.464812361735214e-07, + "loss": 0.6707, + "step": 14060 + }, + { + "epoch": 4.5754716981132075, + "grad_norm": 1.5375745296478271, + "learning_rate": 9.394599232459222e-07, + "loss": 0.6876, + "step": 14065 + }, + { + "epoch": 4.577098243331164, + "grad_norm": 1.559077262878418, + "learning_rate": 9.324642518573179e-07, + "loss": 0.6771, + "step": 14070 + }, + { + "epoch": 4.578724788549121, + "grad_norm": 1.6018946170806885, + "learning_rate": 9.25494229463067e-07, + "loss": 0.6985, + "step": 14075 + }, + { + "epoch": 4.580351333767078, + "grad_norm": 1.7922332286834717, + "learning_rate": 9.185498634911749e-07, + "loss": 0.6878, + "step": 14080 + }, + { + "epoch": 4.581977878985036, + "grad_norm": 1.9412555694580078, + "learning_rate": 9.116311613423272e-07, + "loss": 0.7295, + "step": 14085 + }, + { + "epoch": 4.583604424202993, + "grad_norm": 1.6968659162521362, + "learning_rate": 9.047381303898372e-07, + "loss": 0.6567, + "step": 14090 + }, + { + "epoch": 4.58523096942095, + "grad_norm": 1.54911208152771, + "learning_rate": 8.978707779796869e-07, + "loss": 0.6685, + "step": 14095 + }, + { + "epoch": 4.586857514638907, + "grad_norm": 1.735830545425415, + "learning_rate": 8.910291114304637e-07, + "loss": 0.7142, + "step": 14100 + }, + { + "epoch": 4.588484059856864, + "grad_norm": 1.7051962614059448, + "learning_rate": 8.842131380334017e-07, + "loss": 0.6875, + "step": 14105 + }, + { + "epoch": 4.5901106050748215, + "grad_norm": 1.6530320644378662, + "learning_rate": 8.774228650523408e-07, + "loss": 0.6891, + "step": 14110 + }, + { + "epoch": 4.591737150292778, + "grad_norm": 1.6566858291625977, + "learning_rate": 8.706582997237422e-07, + "loss": 0.6427, + "step": 14115 + }, + { + "epoch": 4.593363695510735, + "grad_norm": 1.5007658004760742, + "learning_rate": 8.639194492566616e-07, + "loss": 0.6416, + "step": 14120 + }, + { + "epoch": 4.594990240728692, + "grad_norm": 1.4796278476715088, + "learning_rate": 8.572063208327568e-07, + "loss": 0.6817, + "step": 14125 + }, + { + "epoch": 4.596616785946649, + "grad_norm": 1.6929032802581787, + "learning_rate": 8.50518921606272e-07, + "loss": 0.6714, + "step": 14130 + }, + { + "epoch": 4.598243331164606, + "grad_norm": 1.5221062898635864, + "learning_rate": 8.438572587040283e-07, + "loss": 0.6792, + "step": 14135 + }, + { + "epoch": 4.599869876382563, + "grad_norm": 1.6462249755859375, + "learning_rate": 8.372213392254191e-07, + "loss": 0.7065, + "step": 14140 + }, + { + "epoch": 4.601496421600521, + "grad_norm": 1.749974250793457, + "learning_rate": 8.306111702424069e-07, + "loss": 0.6995, + "step": 14145 + }, + { + "epoch": 4.603122966818478, + "grad_norm": 1.7774195671081543, + "learning_rate": 8.240267587995093e-07, + "loss": 0.6761, + "step": 14150 + }, + { + "epoch": 4.604749512036435, + "grad_norm": 1.839493989944458, + "learning_rate": 8.174681119137939e-07, + "loss": 0.6883, + "step": 14155 + }, + { + "epoch": 4.6063760572543915, + "grad_norm": 1.6923067569732666, + "learning_rate": 8.109352365748696e-07, + "loss": 0.6793, + "step": 14160 + }, + { + "epoch": 4.608002602472348, + "grad_norm": 1.507960557937622, + "learning_rate": 8.044281397448894e-07, + "loss": 0.6885, + "step": 14165 + }, + { + "epoch": 4.609629147690306, + "grad_norm": 1.4883050918579102, + "learning_rate": 7.979468283585145e-07, + "loss": 0.6711, + "step": 14170 + }, + { + "epoch": 4.611255692908263, + "grad_norm": 1.5881962776184082, + "learning_rate": 7.914913093229447e-07, + "loss": 0.6727, + "step": 14175 + }, + { + "epoch": 4.61288223812622, + "grad_norm": 2.5120699405670166, + "learning_rate": 7.850615895178909e-07, + "loss": 0.6889, + "step": 14180 + }, + { + "epoch": 4.614508783344177, + "grad_norm": 1.4840761423110962, + "learning_rate": 7.786576757955521e-07, + "loss": 0.6713, + "step": 14185 + }, + { + "epoch": 4.616135328562134, + "grad_norm": 1.8393685817718506, + "learning_rate": 7.722795749806472e-07, + "loss": 0.6808, + "step": 14190 + }, + { + "epoch": 4.617761873780091, + "grad_norm": 2.054499387741089, + "learning_rate": 7.659272938703749e-07, + "loss": 0.6885, + "step": 14195 + }, + { + "epoch": 4.6193884189980485, + "grad_norm": 1.917855978012085, + "learning_rate": 7.596008392344228e-07, + "loss": 0.6871, + "step": 14200 + }, + { + "epoch": 4.621014964216005, + "grad_norm": 1.9141509532928467, + "learning_rate": 7.533002178149451e-07, + "loss": 0.6656, + "step": 14205 + }, + { + "epoch": 4.622641509433962, + "grad_norm": 1.6884684562683105, + "learning_rate": 7.470254363265788e-07, + "loss": 0.6765, + "step": 14210 + }, + { + "epoch": 4.624268054651919, + "grad_norm": 1.6262927055358887, + "learning_rate": 7.407765014564111e-07, + "loss": 0.6792, + "step": 14215 + }, + { + "epoch": 4.625894599869876, + "grad_norm": 1.6845659017562866, + "learning_rate": 7.345534198639897e-07, + "loss": 0.675, + "step": 14220 + }, + { + "epoch": 4.627521145087833, + "grad_norm": 1.8522858619689941, + "learning_rate": 7.283561981813125e-07, + "loss": 0.6897, + "step": 14225 + }, + { + "epoch": 4.629147690305791, + "grad_norm": 1.466511607170105, + "learning_rate": 7.221848430128158e-07, + "loss": 0.6746, + "step": 14230 + }, + { + "epoch": 4.630774235523748, + "grad_norm": 2.0542683601379395, + "learning_rate": 7.160393609353694e-07, + "loss": 0.6991, + "step": 14235 + }, + { + "epoch": 4.632400780741705, + "grad_norm": 1.9120912551879883, + "learning_rate": 7.099197584982675e-07, + "loss": 0.6865, + "step": 14240 + }, + { + "epoch": 4.634027325959662, + "grad_norm": 1.398518443107605, + "learning_rate": 7.050427142731547e-07, + "loss": 0.6792, + "step": 14245 + }, + { + "epoch": 4.6356538711776185, + "grad_norm": 1.6294313669204712, + "learning_rate": 6.989697116048633e-07, + "loss": 0.6986, + "step": 14250 + }, + { + "epoch": 4.637280416395575, + "grad_norm": 1.673263430595398, + "learning_rate": 6.929226067682037e-07, + "loss": 0.6728, + "step": 14255 + }, + { + "epoch": 4.638906961613533, + "grad_norm": 1.6511849164962769, + "learning_rate": 6.86901406207624e-07, + "loss": 0.7071, + "step": 14260 + }, + { + "epoch": 4.64053350683149, + "grad_norm": 1.5854378938674927, + "learning_rate": 6.809061163399827e-07, + "loss": 0.6687, + "step": 14265 + }, + { + "epoch": 4.642160052049447, + "grad_norm": 1.608728051185608, + "learning_rate": 6.749367435545024e-07, + "loss": 0.6588, + "step": 14270 + }, + { + "epoch": 4.643786597267404, + "grad_norm": 1.547501564025879, + "learning_rate": 6.689932942128108e-07, + "loss": 0.6728, + "step": 14275 + }, + { + "epoch": 4.645413142485361, + "grad_norm": 1.7333050966262817, + "learning_rate": 6.630757746488886e-07, + "loss": 0.6711, + "step": 14280 + }, + { + "epoch": 4.647039687703318, + "grad_norm": 1.66715669631958, + "learning_rate": 6.571841911690968e-07, + "loss": 0.7231, + "step": 14285 + }, + { + "epoch": 4.648666232921276, + "grad_norm": 1.5106160640716553, + "learning_rate": 6.513185500521463e-07, + "loss": 0.6685, + "step": 14290 + }, + { + "epoch": 4.6502927781392325, + "grad_norm": 1.5364242792129517, + "learning_rate": 6.454788575491061e-07, + "loss": 0.6995, + "step": 14295 + }, + { + "epoch": 4.651919323357189, + "grad_norm": 1.6903712749481201, + "learning_rate": 6.396651198833897e-07, + "loss": 0.6618, + "step": 14300 + }, + { + "epoch": 4.653545868575146, + "grad_norm": 1.806966781616211, + "learning_rate": 6.338773432507494e-07, + "loss": 0.6866, + "step": 14305 + }, + { + "epoch": 4.655172413793103, + "grad_norm": 1.7275676727294922, + "learning_rate": 6.281155338192762e-07, + "loss": 0.6309, + "step": 14310 + }, + { + "epoch": 4.656798959011061, + "grad_norm": 1.6055309772491455, + "learning_rate": 6.223796977293777e-07, + "loss": 0.6603, + "step": 14315 + }, + { + "epoch": 4.658425504229018, + "grad_norm": 1.8119957447052002, + "learning_rate": 6.166698410937949e-07, + "loss": 0.6818, + "step": 14320 + }, + { + "epoch": 4.660052049446975, + "grad_norm": 1.6336417198181152, + "learning_rate": 6.109859699975684e-07, + "loss": 0.666, + "step": 14325 + }, + { + "epoch": 4.661678594664932, + "grad_norm": 1.4614338874816895, + "learning_rate": 6.053280904980557e-07, + "loss": 0.6361, + "step": 14330 + }, + { + "epoch": 4.663305139882889, + "grad_norm": 1.5490102767944336, + "learning_rate": 5.996962086249058e-07, + "loss": 0.6899, + "step": 14335 + }, + { + "epoch": 4.6649316851008455, + "grad_norm": 1.6667866706848145, + "learning_rate": 5.940903303800705e-07, + "loss": 0.6662, + "step": 14340 + }, + { + "epoch": 4.6665582303188025, + "grad_norm": 1.591328740119934, + "learning_rate": 5.885104617377873e-07, + "loss": 0.6772, + "step": 14345 + }, + { + "epoch": 4.66818477553676, + "grad_norm": 1.7102192640304565, + "learning_rate": 5.829566086445721e-07, + "loss": 0.7084, + "step": 14350 + }, + { + "epoch": 4.669811320754717, + "grad_norm": 2.091461181640625, + "learning_rate": 5.774287770192149e-07, + "loss": 0.6814, + "step": 14355 + }, + { + "epoch": 4.671437865972674, + "grad_norm": 1.3679436445236206, + "learning_rate": 5.719269727527843e-07, + "loss": 0.6548, + "step": 14360 + }, + { + "epoch": 4.673064411190631, + "grad_norm": 1.683988332748413, + "learning_rate": 5.664512017085926e-07, + "loss": 0.6855, + "step": 14365 + }, + { + "epoch": 4.674690956408588, + "grad_norm": 1.6595364809036255, + "learning_rate": 5.610014697222249e-07, + "loss": 0.7018, + "step": 14370 + }, + { + "epoch": 4.676317501626546, + "grad_norm": 1.4635215997695923, + "learning_rate": 5.555777826015129e-07, + "loss": 0.6972, + "step": 14375 + }, + { + "epoch": 4.677944046844503, + "grad_norm": 1.5773056745529175, + "learning_rate": 5.501801461265304e-07, + "loss": 0.6357, + "step": 14380 + }, + { + "epoch": 4.6795705920624595, + "grad_norm": 1.783153772354126, + "learning_rate": 5.448085660495816e-07, + "loss": 0.6882, + "step": 14385 + }, + { + "epoch": 4.681197137280416, + "grad_norm": 1.6693134307861328, + "learning_rate": 5.394630480952178e-07, + "loss": 0.6552, + "step": 14390 + }, + { + "epoch": 4.682823682498373, + "grad_norm": 1.7608495950698853, + "learning_rate": 5.341435979601988e-07, + "loss": 0.6498, + "step": 14395 + }, + { + "epoch": 4.68445022771633, + "grad_norm": 1.5767661333084106, + "learning_rate": 5.288502213135149e-07, + "loss": 0.7165, + "step": 14400 + }, + { + "epoch": 4.686076772934287, + "grad_norm": 1.6951254606246948, + "learning_rate": 5.235829237963646e-07, + "loss": 0.6742, + "step": 14405 + }, + { + "epoch": 4.687703318152245, + "grad_norm": 1.5949219465255737, + "learning_rate": 5.183417110221606e-07, + "loss": 0.69, + "step": 14410 + }, + { + "epoch": 4.689329863370202, + "grad_norm": 1.536463975906372, + "learning_rate": 5.131265885765041e-07, + "loss": 0.7035, + "step": 14415 + }, + { + "epoch": 4.690956408588159, + "grad_norm": 1.710235834121704, + "learning_rate": 5.07937562017205e-07, + "loss": 0.665, + "step": 14420 + }, + { + "epoch": 4.692582953806116, + "grad_norm": 1.563416600227356, + "learning_rate": 5.027746368742536e-07, + "loss": 0.6943, + "step": 14425 + }, + { + "epoch": 4.694209499024073, + "grad_norm": 1.478869080543518, + "learning_rate": 4.976378186498293e-07, + "loss": 0.6664, + "step": 14430 + }, + { + "epoch": 4.69583604424203, + "grad_norm": 1.5734919309616089, + "learning_rate": 4.925271128182807e-07, + "loss": 0.6516, + "step": 14435 + }, + { + "epoch": 4.697462589459987, + "grad_norm": 1.8956400156021118, + "learning_rate": 4.874425248261428e-07, + "loss": 0.6905, + "step": 14440 + }, + { + "epoch": 4.699089134677944, + "grad_norm": 1.747822880744934, + "learning_rate": 4.823840600921003e-07, + "loss": 0.6788, + "step": 14445 + }, + { + "epoch": 4.700715679895901, + "grad_norm": 1.4453195333480835, + "learning_rate": 4.773517240070108e-07, + "loss": 0.6663, + "step": 14450 + }, + { + "epoch": 4.702342225113858, + "grad_norm": 1.6447715759277344, + "learning_rate": 4.7234552193387846e-07, + "loss": 0.6794, + "step": 14455 + }, + { + "epoch": 4.703968770331815, + "grad_norm": 1.8819068670272827, + "learning_rate": 4.67365459207858e-07, + "loss": 0.6844, + "step": 14460 + }, + { + "epoch": 4.705595315549772, + "grad_norm": 1.676971197128296, + "learning_rate": 4.624115411362512e-07, + "loss": 0.6997, + "step": 14465 + }, + { + "epoch": 4.70722186076773, + "grad_norm": 1.4313923120498657, + "learning_rate": 4.5748377299849045e-07, + "loss": 0.6858, + "step": 14470 + }, + { + "epoch": 4.7088484059856865, + "grad_norm": 1.509230613708496, + "learning_rate": 4.525821600461472e-07, + "loss": 0.6579, + "step": 14475 + }, + { + "epoch": 4.7104749512036435, + "grad_norm": 1.3929016590118408, + "learning_rate": 4.477067075029123e-07, + "loss": 0.6855, + "step": 14480 + }, + { + "epoch": 4.7121014964216, + "grad_norm": 1.5677586793899536, + "learning_rate": 4.428574205646047e-07, + "loss": 0.6703, + "step": 14485 + }, + { + "epoch": 4.713728041639557, + "grad_norm": 1.5566221475601196, + "learning_rate": 4.3803430439915137e-07, + "loss": 0.6625, + "step": 14490 + }, + { + "epoch": 4.715354586857515, + "grad_norm": 1.4790152311325073, + "learning_rate": 4.332373641465909e-07, + "loss": 0.6596, + "step": 14495 + }, + { + "epoch": 4.716981132075472, + "grad_norm": 1.4039454460144043, + "learning_rate": 4.284666049190644e-07, + "loss": 0.6691, + "step": 14500 + }, + { + "epoch": 4.718607677293429, + "grad_norm": 1.6368210315704346, + "learning_rate": 4.2372203180081893e-07, + "loss": 0.6812, + "step": 14505 + }, + { + "epoch": 4.720234222511386, + "grad_norm": 1.5727107524871826, + "learning_rate": 4.1900364984818754e-07, + "loss": 0.6791, + "step": 14510 + }, + { + "epoch": 4.721860767729343, + "grad_norm": 1.4456034898757935, + "learning_rate": 4.143114640895951e-07, + "loss": 0.6806, + "step": 14515 + }, + { + "epoch": 4.7234873129473, + "grad_norm": 1.680716872215271, + "learning_rate": 4.0964547952554443e-07, + "loss": 0.6604, + "step": 14520 + }, + { + "epoch": 4.7251138581652565, + "grad_norm": 1.6163851022720337, + "learning_rate": 4.05005701128619e-07, + "loss": 0.7087, + "step": 14525 + }, + { + "epoch": 4.726740403383214, + "grad_norm": 1.5150792598724365, + "learning_rate": 4.0039213384347187e-07, + "loss": 0.6758, + "step": 14530 + }, + { + "epoch": 4.728366948601171, + "grad_norm": 1.7528713941574097, + "learning_rate": 3.958047825868283e-07, + "loss": 0.6683, + "step": 14535 + }, + { + "epoch": 4.729993493819128, + "grad_norm": 1.5113881826400757, + "learning_rate": 3.912436522474666e-07, + "loss": 0.6857, + "step": 14540 + }, + { + "epoch": 4.731620039037085, + "grad_norm": 1.7433273792266846, + "learning_rate": 3.867087476862291e-07, + "loss": 0.6853, + "step": 14545 + }, + { + "epoch": 4.733246584255042, + "grad_norm": 1.6659798622131348, + "learning_rate": 3.822000737360026e-07, + "loss": 0.6621, + "step": 14550 + }, + { + "epoch": 4.734873129473, + "grad_norm": 1.6939740180969238, + "learning_rate": 3.777176352017242e-07, + "loss": 0.6749, + "step": 14555 + }, + { + "epoch": 4.736499674690957, + "grad_norm": 1.7440557479858398, + "learning_rate": 3.7326143686036706e-07, + "loss": 0.69, + "step": 14560 + }, + { + "epoch": 4.738126219908914, + "grad_norm": 1.7185252904891968, + "learning_rate": 3.6883148346094356e-07, + "loss": 0.699, + "step": 14565 + }, + { + "epoch": 4.7397527651268705, + "grad_norm": 1.528664469718933, + "learning_rate": 3.644277797244966e-07, + "loss": 0.7128, + "step": 14570 + }, + { + "epoch": 4.741379310344827, + "grad_norm": 1.7487074136734009, + "learning_rate": 3.600503303440972e-07, + "loss": 0.6843, + "step": 14575 + }, + { + "epoch": 4.743005855562784, + "grad_norm": 1.7359619140625, + "learning_rate": 3.556991399848275e-07, + "loss": 0.6735, + "step": 14580 + }, + { + "epoch": 4.744632400780741, + "grad_norm": 1.862557053565979, + "learning_rate": 3.5137421328379493e-07, + "loss": 0.6587, + "step": 14585 + }, + { + "epoch": 4.746258945998699, + "grad_norm": 1.6317099332809448, + "learning_rate": 3.4707555485011533e-07, + "loss": 0.6921, + "step": 14590 + }, + { + "epoch": 4.747885491216656, + "grad_norm": 1.6553255319595337, + "learning_rate": 3.4280316926490196e-07, + "loss": 0.6499, + "step": 14595 + }, + { + "epoch": 4.749512036434613, + "grad_norm": 1.649868369102478, + "learning_rate": 3.385570610812794e-07, + "loss": 0.6763, + "step": 14600 + }, + { + "epoch": 4.75113858165257, + "grad_norm": 1.6531949043273926, + "learning_rate": 3.3433723482436676e-07, + "loss": 0.6762, + "step": 14605 + }, + { + "epoch": 4.752765126870527, + "grad_norm": 1.613049864768982, + "learning_rate": 3.3014369499126675e-07, + "loss": 0.6587, + "step": 14610 + }, + { + "epoch": 4.7543916720884845, + "grad_norm": 1.719322681427002, + "learning_rate": 3.259764460510767e-07, + "loss": 0.6862, + "step": 14615 + }, + { + "epoch": 4.756018217306441, + "grad_norm": 1.531546950340271, + "learning_rate": 3.218354924448719e-07, + "loss": 0.6581, + "step": 14620 + }, + { + "epoch": 4.757644762524398, + "grad_norm": 1.5294305086135864, + "learning_rate": 3.177208385857028e-07, + "loss": 0.666, + "step": 14625 + }, + { + "epoch": 4.759271307742355, + "grad_norm": 1.5790424346923828, + "learning_rate": 3.1363248885859506e-07, + "loss": 0.6703, + "step": 14630 + }, + { + "epoch": 4.760897852960312, + "grad_norm": 1.7488723993301392, + "learning_rate": 3.0957044762054133e-07, + "loss": 0.6917, + "step": 14635 + }, + { + "epoch": 4.762524398178269, + "grad_norm": 1.6882662773132324, + "learning_rate": 3.055347192004954e-07, + "loss": 0.707, + "step": 14640 + }, + { + "epoch": 4.764150943396227, + "grad_norm": 1.5606118440628052, + "learning_rate": 3.0152530789936963e-07, + "loss": 0.6469, + "step": 14645 + }, + { + "epoch": 4.765777488614184, + "grad_norm": 1.596451997756958, + "learning_rate": 2.9754221799003503e-07, + "loss": 0.6667, + "step": 14650 + }, + { + "epoch": 4.767404033832141, + "grad_norm": 1.7899802923202515, + "learning_rate": 2.9358545371729883e-07, + "loss": 0.6638, + "step": 14655 + }, + { + "epoch": 4.7690305790500975, + "grad_norm": 1.6344199180603027, + "learning_rate": 2.8965501929792695e-07, + "loss": 0.6872, + "step": 14660 + }, + { + "epoch": 4.770657124268054, + "grad_norm": 1.7451854944229126, + "learning_rate": 2.857509189206187e-07, + "loss": 0.6985, + "step": 14665 + }, + { + "epoch": 4.772283669486011, + "grad_norm": 1.6508374214172363, + "learning_rate": 2.818731567460098e-07, + "loss": 0.6798, + "step": 14670 + }, + { + "epoch": 4.773910214703969, + "grad_norm": 1.5053751468658447, + "learning_rate": 2.7802173690666676e-07, + "loss": 0.6521, + "step": 14675 + }, + { + "epoch": 4.775536759921926, + "grad_norm": 1.5697300434112549, + "learning_rate": 2.741966635070842e-07, + "loss": 0.6857, + "step": 14680 + }, + { + "epoch": 4.777163305139883, + "grad_norm": 1.6234354972839355, + "learning_rate": 2.7039794062367616e-07, + "loss": 0.6569, + "step": 14685 + }, + { + "epoch": 4.77878985035784, + "grad_norm": 1.5883147716522217, + "learning_rate": 2.6662557230477667e-07, + "loss": 0.6339, + "step": 14690 + }, + { + "epoch": 4.780416395575797, + "grad_norm": 1.713050127029419, + "learning_rate": 2.6287956257063374e-07, + "loss": 0.6568, + "step": 14695 + }, + { + "epoch": 4.782042940793754, + "grad_norm": 1.929298996925354, + "learning_rate": 2.5915991541340667e-07, + "loss": 0.6939, + "step": 14700 + }, + { + "epoch": 4.7836694860117115, + "grad_norm": 1.5586493015289307, + "learning_rate": 2.5546663479715236e-07, + "loss": 0.6885, + "step": 14705 + }, + { + "epoch": 4.785296031229668, + "grad_norm": 1.44963800907135, + "learning_rate": 2.5179972465784186e-07, + "loss": 0.6865, + "step": 14710 + }, + { + "epoch": 4.786922576447625, + "grad_norm": 1.6566404104232788, + "learning_rate": 2.481591889033269e-07, + "loss": 0.6637, + "step": 14715 + }, + { + "epoch": 4.788549121665582, + "grad_norm": 1.6075341701507568, + "learning_rate": 2.4454503141336513e-07, + "loss": 0.6707, + "step": 14720 + }, + { + "epoch": 4.790175666883539, + "grad_norm": 1.7933886051177979, + "learning_rate": 2.409572560395951e-07, + "loss": 0.708, + "step": 14725 + }, + { + "epoch": 4.791802212101496, + "grad_norm": 1.563027262687683, + "learning_rate": 2.3739586660554148e-07, + "loss": 0.6885, + "step": 14730 + }, + { + "epoch": 4.793428757319454, + "grad_norm": 1.8409078121185303, + "learning_rate": 2.338608669066128e-07, + "loss": 0.6521, + "step": 14735 + }, + { + "epoch": 4.795055302537411, + "grad_norm": 1.6207574605941772, + "learning_rate": 2.3035226071008997e-07, + "loss": 0.6789, + "step": 14740 + }, + { + "epoch": 4.796681847755368, + "grad_norm": 1.9524937868118286, + "learning_rate": 2.2687005175512642e-07, + "loss": 0.6617, + "step": 14745 + }, + { + "epoch": 4.798308392973325, + "grad_norm": 1.8599200248718262, + "learning_rate": 2.2341424375274256e-07, + "loss": 0.6806, + "step": 14750 + }, + { + "epoch": 4.7999349381912815, + "grad_norm": 1.6578569412231445, + "learning_rate": 2.1998484038582567e-07, + "loss": 0.6666, + "step": 14755 + }, + { + "epoch": 4.801561483409239, + "grad_norm": 1.6024216413497925, + "learning_rate": 2.165818453091245e-07, + "loss": 0.7018, + "step": 14760 + }, + { + "epoch": 4.803188028627196, + "grad_norm": 1.7053841352462769, + "learning_rate": 2.1320526214924086e-07, + "loss": 0.68, + "step": 14765 + }, + { + "epoch": 4.804814573845153, + "grad_norm": 1.632737636566162, + "learning_rate": 2.098550945046268e-07, + "loss": 0.6814, + "step": 14770 + }, + { + "epoch": 4.80644111906311, + "grad_norm": 1.546188235282898, + "learning_rate": 2.0653134594559586e-07, + "loss": 0.6874, + "step": 14775 + }, + { + "epoch": 4.808067664281067, + "grad_norm": 1.5504343509674072, + "learning_rate": 2.0323402001428682e-07, + "loss": 0.673, + "step": 14780 + }, + { + "epoch": 4.809694209499024, + "grad_norm": 1.7148295640945435, + "learning_rate": 1.999631202246971e-07, + "loss": 0.6881, + "step": 14785 + }, + { + "epoch": 4.811320754716981, + "grad_norm": 1.4756275415420532, + "learning_rate": 1.9671865006265223e-07, + "loss": 0.7046, + "step": 14790 + }, + { + "epoch": 4.8129472999349385, + "grad_norm": 1.7910751104354858, + "learning_rate": 1.935006129858169e-07, + "loss": 0.6888, + "step": 14795 + }, + { + "epoch": 4.814573845152895, + "grad_norm": 1.4888908863067627, + "learning_rate": 1.9030901242367837e-07, + "loss": 0.6412, + "step": 14800 + }, + { + "epoch": 4.816200390370852, + "grad_norm": 1.5730111598968506, + "learning_rate": 1.8714385177756032e-07, + "loss": 0.6676, + "step": 14805 + }, + { + "epoch": 4.817826935588809, + "grad_norm": 1.5474718809127808, + "learning_rate": 1.8400513442059786e-07, + "loss": 0.6649, + "step": 14810 + }, + { + "epoch": 4.819453480806766, + "grad_norm": 1.6302613019943237, + "learning_rate": 1.8089286369775415e-07, + "loss": 0.7375, + "step": 14815 + }, + { + "epoch": 4.821080026024724, + "grad_norm": 1.4501763582229614, + "learning_rate": 1.7780704292580107e-07, + "loss": 0.6485, + "step": 14820 + }, + { + "epoch": 4.822706571242681, + "grad_norm": 1.771649718284607, + "learning_rate": 1.7474767539333302e-07, + "loss": 0.7074, + "step": 14825 + }, + { + "epoch": 4.824333116460638, + "grad_norm": 1.6281365156173706, + "learning_rate": 1.717147643607392e-07, + "loss": 0.6904, + "step": 14830 + }, + { + "epoch": 4.825959661678595, + "grad_norm": 1.714962363243103, + "learning_rate": 1.687083130602257e-07, + "loss": 0.6807, + "step": 14835 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.717561960220337, + "learning_rate": 1.6572832469579357e-07, + "loss": 0.6781, + "step": 14840 + }, + { + "epoch": 4.8292127521145085, + "grad_norm": 1.8593811988830566, + "learning_rate": 1.6277480244324127e-07, + "loss": 0.6716, + "step": 14845 + }, + { + "epoch": 4.830839297332465, + "grad_norm": 1.4186038970947266, + "learning_rate": 1.5984774945017044e-07, + "loss": 0.6667, + "step": 14850 + }, + { + "epoch": 4.832465842550423, + "grad_norm": 1.5681532621383667, + "learning_rate": 1.5694716883596083e-07, + "loss": 0.6667, + "step": 14855 + }, + { + "epoch": 4.83409238776838, + "grad_norm": 1.7162848711013794, + "learning_rate": 1.540730636917953e-07, + "loss": 0.6847, + "step": 14860 + }, + { + "epoch": 4.835718932986337, + "grad_norm": 1.569232702255249, + "learning_rate": 1.512254370806293e-07, + "loss": 0.6873, + "step": 14865 + }, + { + "epoch": 4.837345478204294, + "grad_norm": 1.7480008602142334, + "learning_rate": 1.4840429203720752e-07, + "loss": 0.666, + "step": 14870 + }, + { + "epoch": 4.838972023422251, + "grad_norm": 1.588944911956787, + "learning_rate": 1.4560963156804997e-07, + "loss": 0.6922, + "step": 14875 + }, + { + "epoch": 4.840598568640209, + "grad_norm": 1.6101741790771484, + "learning_rate": 1.4284145865144928e-07, + "loss": 0.6941, + "step": 14880 + }, + { + "epoch": 4.842225113858166, + "grad_norm": 1.6242797374725342, + "learning_rate": 1.4009977623747617e-07, + "loss": 0.6481, + "step": 14885 + }, + { + "epoch": 4.8438516590761225, + "grad_norm": 1.7038553953170776, + "learning_rate": 1.3738458724796288e-07, + "loss": 0.6873, + "step": 14890 + }, + { + "epoch": 4.845478204294079, + "grad_norm": 1.6171497106552124, + "learning_rate": 1.3469589457651422e-07, + "loss": 0.6293, + "step": 14895 + }, + { + "epoch": 4.847104749512036, + "grad_norm": 1.5528895854949951, + "learning_rate": 1.3203370108849644e-07, + "loss": 0.6475, + "step": 14900 + }, + { + "epoch": 4.848731294729993, + "grad_norm": 1.7914743423461914, + "learning_rate": 1.2939800962103176e-07, + "loss": 0.6771, + "step": 14905 + }, + { + "epoch": 4.85035783994795, + "grad_norm": 1.3965297937393188, + "learning_rate": 1.2678882298299833e-07, + "loss": 0.6676, + "step": 14910 + }, + { + "epoch": 4.851984385165908, + "grad_norm": 1.5735267400741577, + "learning_rate": 1.2420614395503294e-07, + "loss": 0.6703, + "step": 14915 + }, + { + "epoch": 4.853610930383865, + "grad_norm": 1.6437585353851318, + "learning_rate": 1.2164997528952004e-07, + "loss": 0.6764, + "step": 14920 + }, + { + "epoch": 4.855237475601822, + "grad_norm": 1.5509518384933472, + "learning_rate": 1.1912031971059168e-07, + "loss": 0.6643, + "step": 14925 + }, + { + "epoch": 4.856864020819779, + "grad_norm": 1.6096965074539185, + "learning_rate": 1.1661717991412746e-07, + "loss": 0.668, + "step": 14930 + }, + { + "epoch": 4.8584905660377355, + "grad_norm": 1.4271329641342163, + "learning_rate": 1.1414055856774075e-07, + "loss": 0.6872, + "step": 14935 + }, + { + "epoch": 4.860117111255693, + "grad_norm": 1.771170735359192, + "learning_rate": 1.1169045831079805e-07, + "loss": 0.6842, + "step": 14940 + }, + { + "epoch": 4.86174365647365, + "grad_norm": 1.6611682176589966, + "learning_rate": 1.0926688175438571e-07, + "loss": 0.6148, + "step": 14945 + }, + { + "epoch": 4.863370201691607, + "grad_norm": 1.7018705606460571, + "learning_rate": 1.0686983148133489e-07, + "loss": 0.6784, + "step": 14950 + }, + { + "epoch": 4.864996746909564, + "grad_norm": 1.6583181619644165, + "learning_rate": 1.0449931004620495e-07, + "loss": 0.6657, + "step": 14955 + }, + { + "epoch": 4.866623292127521, + "grad_norm": 1.769606113433838, + "learning_rate": 1.0215531997528338e-07, + "loss": 0.6823, + "step": 14960 + }, + { + "epoch": 4.868249837345478, + "grad_norm": 1.6844284534454346, + "learning_rate": 9.983786376657755e-08, + "loss": 0.674, + "step": 14965 + }, + { + "epoch": 4.869876382563435, + "grad_norm": 1.5215201377868652, + "learning_rate": 9.754694388982854e-08, + "loss": 0.6846, + "step": 14970 + }, + { + "epoch": 4.871502927781393, + "grad_norm": 1.4498556852340698, + "learning_rate": 9.52825627864834e-08, + "loss": 0.6973, + "step": 14975 + }, + { + "epoch": 4.8731294729993495, + "grad_norm": 1.6475036144256592, + "learning_rate": 9.304472286971733e-08, + "loss": 0.6876, + "step": 14980 + }, + { + "epoch": 4.874756018217306, + "grad_norm": 1.6651227474212646, + "learning_rate": 9.083342652441706e-08, + "loss": 0.6719, + "step": 14985 + }, + { + "epoch": 4.876382563435263, + "grad_norm": 1.6878098249435425, + "learning_rate": 8.864867610718363e-08, + "loss": 0.7013, + "step": 14990 + }, + { + "epoch": 4.87800910865322, + "grad_norm": 1.4566644430160522, + "learning_rate": 8.649047394632126e-08, + "loss": 0.679, + "step": 14995 + }, + { + "epoch": 4.879635653871178, + "grad_norm": 1.5212668180465698, + "learning_rate": 8.435882234184844e-08, + "loss": 0.6675, + "step": 15000 + }, + { + "epoch": 4.881262199089135, + "grad_norm": 1.6233844757080078, + "learning_rate": 8.225372356548689e-08, + "loss": 0.6562, + "step": 15005 + }, + { + "epoch": 4.882888744307092, + "grad_norm": 1.6228147745132446, + "learning_rate": 8.017517986065593e-08, + "loss": 0.6888, + "step": 15010 + }, + { + "epoch": 4.884515289525049, + "grad_norm": 1.5216223001480103, + "learning_rate": 7.812319344248365e-08, + "loss": 0.6865, + "step": 15015 + }, + { + "epoch": 4.886141834743006, + "grad_norm": 1.8619071245193481, + "learning_rate": 7.609776649778466e-08, + "loss": 0.6783, + "step": 15020 + }, + { + "epoch": 4.887768379960963, + "grad_norm": 1.5881717205047607, + "learning_rate": 7.409890118508234e-08, + "loss": 0.6796, + "step": 15025 + }, + { + "epoch": 4.8893949251789195, + "grad_norm": 1.6474624872207642, + "learning_rate": 7.212659963458101e-08, + "loss": 0.6859, + "step": 15030 + }, + { + "epoch": 4.891021470396877, + "grad_norm": 1.60321843624115, + "learning_rate": 7.018086394817991e-08, + "loss": 0.6553, + "step": 15035 + }, + { + "epoch": 4.892648015614834, + "grad_norm": 2.054914712905884, + "learning_rate": 6.826169619947032e-08, + "loss": 0.6646, + "step": 15040 + }, + { + "epoch": 4.894274560832791, + "grad_norm": 1.41323721408844, + "learning_rate": 6.63690984337273e-08, + "loss": 0.6778, + "step": 15045 + }, + { + "epoch": 4.895901106050748, + "grad_norm": 1.72225022315979, + "learning_rate": 6.45030726679069e-08, + "loss": 0.6591, + "step": 15050 + }, + { + "epoch": 4.897527651268705, + "grad_norm": 1.6134376525878906, + "learning_rate": 6.266362089065448e-08, + "loss": 0.6877, + "step": 15055 + }, + { + "epoch": 4.899154196486663, + "grad_norm": 1.64195716381073, + "learning_rate": 6.085074506228528e-08, + "loss": 0.6952, + "step": 15060 + }, + { + "epoch": 4.90078074170462, + "grad_norm": 1.468338966369629, + "learning_rate": 5.90644471147983e-08, + "loss": 0.6529, + "step": 15065 + }, + { + "epoch": 4.9024072869225765, + "grad_norm": 1.8615585565567017, + "learning_rate": 5.730472895187355e-08, + "loss": 0.6856, + "step": 15070 + }, + { + "epoch": 4.9040338321405335, + "grad_norm": 1.8940926790237427, + "learning_rate": 5.557159244885257e-08, + "loss": 0.6927, + "step": 15075 + }, + { + "epoch": 4.90566037735849, + "grad_norm": 1.7590283155441284, + "learning_rate": 5.386503945275789e-08, + "loss": 0.7002, + "step": 15080 + }, + { + "epoch": 4.907286922576447, + "grad_norm": 2.323991537094116, + "learning_rate": 5.2185071782276385e-08, + "loss": 0.6403, + "step": 15085 + }, + { + "epoch": 4.908913467794405, + "grad_norm": 1.6552611589431763, + "learning_rate": 5.053169122776757e-08, + "loss": 0.6861, + "step": 15090 + }, + { + "epoch": 4.910540013012362, + "grad_norm": 1.6465176343917847, + "learning_rate": 4.8904899551255326e-08, + "loss": 0.693, + "step": 15095 + }, + { + "epoch": 4.912166558230319, + "grad_norm": 1.6236457824707031, + "learning_rate": 4.730469848642505e-08, + "loss": 0.6818, + "step": 15100 + }, + { + "epoch": 4.913793103448276, + "grad_norm": 1.8764091730117798, + "learning_rate": 4.573108973862095e-08, + "loss": 0.6943, + "step": 15105 + }, + { + "epoch": 4.915419648666233, + "grad_norm": 1.6069153547286987, + "learning_rate": 4.418407498485988e-08, + "loss": 0.6681, + "step": 15110 + }, + { + "epoch": 4.91704619388419, + "grad_norm": 1.7984602451324463, + "learning_rate": 4.2663655873806385e-08, + "loss": 0.6844, + "step": 15115 + }, + { + "epoch": 4.918672739102147, + "grad_norm": 1.534196376800537, + "learning_rate": 4.116983402578656e-08, + "loss": 0.661, + "step": 15120 + }, + { + "epoch": 4.920299284320104, + "grad_norm": 1.567634105682373, + "learning_rate": 3.9702611032776946e-08, + "loss": 0.6886, + "step": 15125 + }, + { + "epoch": 4.921925829538061, + "grad_norm": 1.7674660682678223, + "learning_rate": 3.826198845841289e-08, + "loss": 0.6858, + "step": 15130 + }, + { + "epoch": 4.923552374756018, + "grad_norm": 1.7355701923370361, + "learning_rate": 3.684796783798017e-08, + "loss": 0.6923, + "step": 15135 + }, + { + "epoch": 4.925178919973975, + "grad_norm": 1.398328185081482, + "learning_rate": 3.546055067840948e-08, + "loss": 0.6593, + "step": 15140 + }, + { + "epoch": 4.926805465191932, + "grad_norm": 1.6259276866912842, + "learning_rate": 3.409973845829029e-08, + "loss": 0.6868, + "step": 15145 + }, + { + "epoch": 4.92843201040989, + "grad_norm": 1.5454919338226318, + "learning_rate": 3.2765532627845874e-08, + "loss": 0.6805, + "step": 15150 + }, + { + "epoch": 4.930058555627847, + "grad_norm": 1.6514472961425781, + "learning_rate": 3.145793460895552e-08, + "loss": 0.6958, + "step": 15155 + }, + { + "epoch": 4.931685100845804, + "grad_norm": 1.774691104888916, + "learning_rate": 3.017694579514063e-08, + "loss": 0.6961, + "step": 15160 + }, + { + "epoch": 4.9333116460637605, + "grad_norm": 1.5681709051132202, + "learning_rate": 2.8922567551556424e-08, + "loss": 0.6692, + "step": 15165 + }, + { + "epoch": 4.934938191281717, + "grad_norm": 1.7506740093231201, + "learning_rate": 2.7694801215011333e-08, + "loss": 0.6931, + "step": 15170 + }, + { + "epoch": 4.936564736499674, + "grad_norm": 1.5380946397781372, + "learning_rate": 2.6493648093942058e-08, + "loss": 0.6855, + "step": 15175 + }, + { + "epoch": 4.938191281717632, + "grad_norm": 1.6914947032928467, + "learning_rate": 2.531910946843574e-08, + "loss": 0.6872, + "step": 15180 + }, + { + "epoch": 4.939817826935589, + "grad_norm": 1.5922038555145264, + "learning_rate": 2.4171186590202233e-08, + "loss": 0.6905, + "step": 15185 + }, + { + "epoch": 4.941444372153546, + "grad_norm": 1.5917595624923706, + "learning_rate": 2.3049880682593504e-08, + "loss": 0.7, + "step": 15190 + }, + { + "epoch": 4.943070917371503, + "grad_norm": 1.6863962411880493, + "learning_rate": 2.1955192940600887e-08, + "loss": 0.6881, + "step": 15195 + }, + { + "epoch": 4.94469746258946, + "grad_norm": 1.6738801002502441, + "learning_rate": 2.0887124530841183e-08, + "loss": 0.6734, + "step": 15200 + }, + { + "epoch": 4.9463240078074175, + "grad_norm": 1.877226710319519, + "learning_rate": 1.9845676591559446e-08, + "loss": 0.6921, + "step": 15205 + }, + { + "epoch": 4.9479505530253745, + "grad_norm": 1.529873251914978, + "learning_rate": 1.8830850232645636e-08, + "loss": 0.6762, + "step": 15210 + }, + { + "epoch": 4.949577098243331, + "grad_norm": 1.6177641153335571, + "learning_rate": 1.7842646535601305e-08, + "loss": 0.682, + "step": 15215 + }, + { + "epoch": 4.951203643461288, + "grad_norm": 2.247166872024536, + "learning_rate": 1.688106655356736e-08, + "loss": 0.6719, + "step": 15220 + }, + { + "epoch": 4.952830188679245, + "grad_norm": 1.631149172782898, + "learning_rate": 1.5946111311310186e-08, + "loss": 0.6686, + "step": 15225 + }, + { + "epoch": 4.954456733897202, + "grad_norm": 1.5437954664230347, + "learning_rate": 1.5037781805218863e-08, + "loss": 0.6903, + "step": 15230 + }, + { + "epoch": 4.956083279115159, + "grad_norm": 1.5984909534454346, + "learning_rate": 1.4156079003307953e-08, + "loss": 0.6467, + "step": 15235 + }, + { + "epoch": 4.957709824333117, + "grad_norm": 1.57529878616333, + "learning_rate": 1.330100384521471e-08, + "loss": 0.692, + "step": 15240 + }, + { + "epoch": 4.959336369551074, + "grad_norm": 1.506515622138977, + "learning_rate": 1.247255724220464e-08, + "loss": 0.6751, + "step": 15245 + }, + { + "epoch": 4.960962914769031, + "grad_norm": 1.7199063301086426, + "learning_rate": 1.1828973113703346e-08, + "loss": 0.6851, + "step": 15250 + }, + { + "epoch": 4.9625894599869875, + "grad_norm": 1.7767671346664429, + "learning_rate": 1.1048460115634096e-08, + "loss": 0.6711, + "step": 15255 + }, + { + "epoch": 4.964216005204944, + "grad_norm": 1.551937460899353, + "learning_rate": 1.02945780732e-08, + "loss": 0.696, + "step": 15260 + }, + { + "epoch": 4.965842550422902, + "grad_norm": 1.822651743888855, + "learning_rate": 9.567327789825054e-09, + "loss": 0.701, + "step": 15265 + }, + { + "epoch": 4.967469095640859, + "grad_norm": 1.716488003730774, + "learning_rate": 8.86671004054207e-09, + "loss": 0.6651, + "step": 15270 + }, + { + "epoch": 4.969095640858816, + "grad_norm": 1.6750794649124146, + "learning_rate": 8.192725572006565e-09, + "loss": 0.6647, + "step": 15275 + }, + { + "epoch": 4.970722186076773, + "grad_norm": 1.744114637374878, + "learning_rate": 7.545375102493979e-09, + "loss": 0.7004, + "step": 15280 + }, + { + "epoch": 4.97234873129473, + "grad_norm": 1.8388464450836182, + "learning_rate": 6.924659321888571e-09, + "loss": 0.7087, + "step": 15285 + }, + { + "epoch": 4.973975276512687, + "grad_norm": 1.7136541604995728, + "learning_rate": 6.330578891691752e-09, + "loss": 0.6878, + "step": 15290 + }, + { + "epoch": 4.975601821730644, + "grad_norm": 1.7347222566604614, + "learning_rate": 5.763134445022078e-09, + "loss": 0.6769, + "step": 15295 + }, + { + "epoch": 4.9772283669486015, + "grad_norm": 1.5208088159561157, + "learning_rate": 5.222326586609705e-09, + "loss": 0.6898, + "step": 15300 + }, + { + "epoch": 4.978854912166558, + "grad_norm": 1.480334997177124, + "learning_rate": 4.7081558927991594e-09, + "loss": 0.6731, + "step": 15305 + }, + { + "epoch": 4.980481457384515, + "grad_norm": 2.0022387504577637, + "learning_rate": 4.220622911546568e-09, + "loss": 0.6542, + "step": 15310 + }, + { + "epoch": 4.982108002602472, + "grad_norm": 1.5774517059326172, + "learning_rate": 3.759728162422427e-09, + "loss": 0.6694, + "step": 15315 + }, + { + "epoch": 4.983734547820429, + "grad_norm": 1.596854567527771, + "learning_rate": 3.3254721366032805e-09, + "loss": 0.631, + "step": 15320 + }, + { + "epoch": 4.985361093038387, + "grad_norm": 1.559390664100647, + "learning_rate": 2.9178552968800454e-09, + "loss": 0.6747, + "step": 15325 + }, + { + "epoch": 4.986987638256344, + "grad_norm": 1.5390856266021729, + "learning_rate": 2.536878077655236e-09, + "loss": 0.6581, + "step": 15330 + }, + { + "epoch": 4.988614183474301, + "grad_norm": 1.662405014038086, + "learning_rate": 2.1825408849401873e-09, + "loss": 0.7099, + "step": 15335 + }, + { + "epoch": 4.990240728692258, + "grad_norm": 1.6493418216705322, + "learning_rate": 1.8548440963522818e-09, + "loss": 0.6682, + "step": 15340 + }, + { + "epoch": 4.991867273910215, + "grad_norm": 1.6311626434326172, + "learning_rate": 1.5537880611260491e-09, + "loss": 0.6963, + "step": 15345 + }, + { + "epoch": 4.9934938191281715, + "grad_norm": 1.7533777952194214, + "learning_rate": 1.2793731000937393e-09, + "loss": 0.6723, + "step": 15350 + }, + { + "epoch": 4.995120364346128, + "grad_norm": 1.5436981916427612, + "learning_rate": 1.0315995057075256e-09, + "loss": 0.7002, + "step": 15355 + }, + { + "epoch": 4.996746909564086, + "grad_norm": 1.6257444620132446, + "learning_rate": 8.104675420173014e-10, + "loss": 0.6751, + "step": 15360 + }, + { + "epoch": 4.998373454782043, + "grad_norm": 1.6246846914291382, + "learning_rate": 6.159774446901079e-10, + "loss": 0.6744, + "step": 15365 + }, + { + "epoch": 5.0, + "grad_norm": 2.0969643592834473, + "learning_rate": 4.481294209907061e-10, + "loss": 0.6721, + "step": 15370 + }, + { + "epoch": 5.0, + "eval_f1": 0.8167575437095372, + "eval_loss": 0.433349609375, + "eval_precision": 0.8175113443828662, + "eval_recall": 0.8161877020334489, + "eval_runtime": 637.2031, + "eval_samples_per_second": 617.441, + "eval_steps_per_second": 1.207, + "step": 15370 + }, + { + "epoch": 5.0, + "step": 15370, + "total_flos": 4.1404949424494346e+18, + "train_loss": 0.7951339374212142, + "train_runtime": 80422.989, + "train_samples_per_second": 195.683, + "train_steps_per_second": 0.191 + } + ], + "logging_steps": 5, + "max_steps": 15370, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 5.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.1404949424494346e+18, + "train_batch_size": 512, + "trial_name": null, + "trial_params": null +}