diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12965 @@ +{ + "best_metric": 0.6779661016949152, + "best_model_checkpoint": "DF_Image_VIT_V1/checkpoint-13812", + "epoch": 4.0, + "eval_steps": 500, + "global_step": 18416, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002172024326672459, + "grad_norm": 0.530893087387085, + "learning_rate": 4.9972849695916595e-05, + "loss": 0.3852, + "step": 10 + }, + { + "epoch": 0.004344048653344918, + "grad_norm": 0.3032895624637604, + "learning_rate": 4.994569939183319e-05, + "loss": 0.1074, + "step": 20 + }, + { + "epoch": 0.006516072980017376, + "grad_norm": 0.9604411125183105, + "learning_rate": 4.991854908774978e-05, + "loss": 0.1413, + "step": 30 + }, + { + "epoch": 0.008688097306689836, + "grad_norm": 0.1610596477985382, + "learning_rate": 4.989139878366638e-05, + "loss": 0.0231, + "step": 40 + }, + { + "epoch": 0.010860121633362294, + "grad_norm": 0.4416898488998413, + "learning_rate": 4.9864248479582974e-05, + "loss": 0.068, + "step": 50 + }, + { + "epoch": 0.013032145960034752, + "grad_norm": 0.19819538295269012, + "learning_rate": 4.983709817549957e-05, + "loss": 0.1456, + "step": 60 + }, + { + "epoch": 0.015204170286707211, + "grad_norm": 0.18559947609901428, + "learning_rate": 4.980994787141616e-05, + "loss": 0.071, + "step": 70 + }, + { + "epoch": 0.01737619461337967, + "grad_norm": 0.13225772976875305, + "learning_rate": 4.978279756733275e-05, + "loss": 0.0423, + "step": 80 + }, + { + "epoch": 0.01954821894005213, + "grad_norm": 0.10720467567443848, + "learning_rate": 4.975564726324935e-05, + "loss": 0.014, + "step": 90 + }, + { + "epoch": 0.021720243266724587, + "grad_norm": 0.09248513728380203, + "learning_rate": 4.9728496959165946e-05, + "loss": 0.0398, + "step": 100 + }, + { + "epoch": 0.023892267593397045, + "grad_norm": 0.16907687485218048, + "learning_rate": 4.970134665508254e-05, + "loss": 0.1245, + "step": 110 + }, + { + "epoch": 0.026064291920069503, + "grad_norm": 0.2754700183868408, + "learning_rate": 4.967419635099914e-05, + "loss": 0.1191, + "step": 120 + }, + { + "epoch": 0.028236316246741965, + "grad_norm": 0.11275039613246918, + "learning_rate": 4.964704604691573e-05, + "loss": 0.0192, + "step": 130 + }, + { + "epoch": 0.030408340573414423, + "grad_norm": 0.10382300615310669, + "learning_rate": 4.9619895742832325e-05, + "loss": 0.0674, + "step": 140 + }, + { + "epoch": 0.03258036490008688, + "grad_norm": 0.10827434808015823, + "learning_rate": 4.959274543874892e-05, + "loss": 0.0674, + "step": 150 + }, + { + "epoch": 0.03475238922675934, + "grad_norm": 0.09914221614599228, + "learning_rate": 4.956559513466551e-05, + "loss": 0.0125, + "step": 160 + }, + { + "epoch": 0.0369244135534318, + "grad_norm": 0.08502601832151413, + "learning_rate": 4.9538444830582104e-05, + "loss": 0.039, + "step": 170 + }, + { + "epoch": 0.03909643788010426, + "grad_norm": 0.0816200003027916, + "learning_rate": 4.95112945264987e-05, + "loss": 0.0388, + "step": 180 + }, + { + "epoch": 0.04126846220677671, + "grad_norm": 0.09381763637065887, + "learning_rate": 4.948414422241529e-05, + "loss": 0.0663, + "step": 190 + }, + { + "epoch": 0.043440486533449174, + "grad_norm": 0.08192573487758636, + "learning_rate": 4.945699391833189e-05, + "loss": 0.0679, + "step": 200 + }, + { + "epoch": 0.045612510860121636, + "grad_norm": 0.10738043487071991, + "learning_rate": 4.942984361424848e-05, + "loss": 0.0959, + "step": 210 + }, + { + "epoch": 0.04778453518679409, + "grad_norm": 0.10316260159015656, + "learning_rate": 4.9402693310165076e-05, + "loss": 0.04, + "step": 220 + }, + { + "epoch": 0.04995655951346655, + "grad_norm": 0.08651253581047058, + "learning_rate": 4.937554300608167e-05, + "loss": 0.0113, + "step": 230 + }, + { + "epoch": 0.052128583840139006, + "grad_norm": 0.46742257475852966, + "learning_rate": 4.934839270199826e-05, + "loss": 0.097, + "step": 240 + }, + { + "epoch": 0.05430060816681147, + "grad_norm": 0.10922794789075851, + "learning_rate": 4.9321242397914855e-05, + "loss": 0.0958, + "step": 250 + }, + { + "epoch": 0.05647263249348393, + "grad_norm": 0.09597119688987732, + "learning_rate": 4.929409209383145e-05, + "loss": 0.0126, + "step": 260 + }, + { + "epoch": 0.058644656820156384, + "grad_norm": 0.08410635590553284, + "learning_rate": 4.926694178974805e-05, + "loss": 0.01, + "step": 270 + }, + { + "epoch": 0.060816681146828845, + "grad_norm": 0.07707684487104416, + "learning_rate": 4.923979148566464e-05, + "loss": 0.0678, + "step": 280 + }, + { + "epoch": 0.0629887054735013, + "grad_norm": 0.06932105869054794, + "learning_rate": 4.9212641181581234e-05, + "loss": 0.0085, + "step": 290 + }, + { + "epoch": 0.06516072980017376, + "grad_norm": 0.07078186422586441, + "learning_rate": 4.9185490877497834e-05, + "loss": 0.0983, + "step": 300 + }, + { + "epoch": 0.06733275412684622, + "grad_norm": 0.08902157843112946, + "learning_rate": 4.9158340573414427e-05, + "loss": 0.0957, + "step": 310 + }, + { + "epoch": 0.06950477845351868, + "grad_norm": 0.09481088072061539, + "learning_rate": 4.913119026933102e-05, + "loss": 0.0396, + "step": 320 + }, + { + "epoch": 0.07167680278019113, + "grad_norm": 1.0230822563171387, + "learning_rate": 4.910403996524761e-05, + "loss": 0.0947, + "step": 330 + }, + { + "epoch": 0.0738488271068636, + "grad_norm": 0.19595967233181, + "learning_rate": 4.9076889661164206e-05, + "loss": 0.0646, + "step": 340 + }, + { + "epoch": 0.07602085143353605, + "grad_norm": 0.09209536015987396, + "learning_rate": 4.9049739357080805e-05, + "loss": 0.0148, + "step": 350 + }, + { + "epoch": 0.07819287576020852, + "grad_norm": 0.0737241730093956, + "learning_rate": 4.90225890529974e-05, + "loss": 0.0098, + "step": 360 + }, + { + "epoch": 0.08036490008688098, + "grad_norm": 0.0739888846874237, + "learning_rate": 4.899543874891399e-05, + "loss": 0.0683, + "step": 370 + }, + { + "epoch": 0.08253692441355343, + "grad_norm": 0.07659421861171722, + "learning_rate": 4.8968288444830584e-05, + "loss": 0.0686, + "step": 380 + }, + { + "epoch": 0.08470894874022589, + "grad_norm": 1.0171195268630981, + "learning_rate": 4.894113814074718e-05, + "loss": 0.0956, + "step": 390 + }, + { + "epoch": 0.08688097306689835, + "grad_norm": 0.12161863595247269, + "learning_rate": 4.891398783666377e-05, + "loss": 0.0384, + "step": 400 + }, + { + "epoch": 0.08905299739357081, + "grad_norm": 0.1082439124584198, + "learning_rate": 4.8886837532580364e-05, + "loss": 0.0682, + "step": 410 + }, + { + "epoch": 0.09122502172024327, + "grad_norm": 0.10778769105672836, + "learning_rate": 4.885968722849696e-05, + "loss": 0.0662, + "step": 420 + }, + { + "epoch": 0.09339704604691572, + "grad_norm": 0.12607441842556, + "learning_rate": 4.8832536924413556e-05, + "loss": 0.0369, + "step": 430 + }, + { + "epoch": 0.09556907037358818, + "grad_norm": 0.08641614764928818, + "learning_rate": 4.880538662033015e-05, + "loss": 0.0969, + "step": 440 + }, + { + "epoch": 0.09774109470026064, + "grad_norm": 0.12377568334341049, + "learning_rate": 4.877823631624674e-05, + "loss": 0.0665, + "step": 450 + }, + { + "epoch": 0.0999131190269331, + "grad_norm": 0.09685255587100983, + "learning_rate": 4.8751086012163335e-05, + "loss": 0.0413, + "step": 460 + }, + { + "epoch": 0.10208514335360556, + "grad_norm": 0.13495175540447235, + "learning_rate": 4.8723935708079935e-05, + "loss": 0.0958, + "step": 470 + }, + { + "epoch": 0.10425716768027801, + "grad_norm": 0.09797213226556778, + "learning_rate": 4.869678540399653e-05, + "loss": 0.04, + "step": 480 + }, + { + "epoch": 0.10642919200695047, + "grad_norm": 0.11449749022722244, + "learning_rate": 4.866963509991312e-05, + "loss": 0.0941, + "step": 490 + }, + { + "epoch": 0.10860121633362294, + "grad_norm": 0.11653515696525574, + "learning_rate": 4.864248479582972e-05, + "loss": 0.0394, + "step": 500 + }, + { + "epoch": 0.1107732406602954, + "grad_norm": 0.4893771708011627, + "learning_rate": 4.8615334491746314e-05, + "loss": 0.0394, + "step": 510 + }, + { + "epoch": 0.11294526498696786, + "grad_norm": 0.07817152887582779, + "learning_rate": 4.858818418766291e-05, + "loss": 0.0377, + "step": 520 + }, + { + "epoch": 0.11511728931364032, + "grad_norm": 0.07076311856508255, + "learning_rate": 4.85610338835795e-05, + "loss": 0.0358, + "step": 530 + }, + { + "epoch": 0.11728931364031277, + "grad_norm": 0.4873422086238861, + "learning_rate": 4.853388357949609e-05, + "loss": 0.0392, + "step": 540 + }, + { + "epoch": 0.11946133796698523, + "grad_norm": 0.06412825733423233, + "learning_rate": 4.8506733275412686e-05, + "loss": 0.0079, + "step": 550 + }, + { + "epoch": 0.12163336229365769, + "grad_norm": 0.06440827250480652, + "learning_rate": 4.847958297132928e-05, + "loss": 0.0378, + "step": 560 + }, + { + "epoch": 0.12380538662033015, + "grad_norm": 0.0619625598192215, + "learning_rate": 4.845243266724587e-05, + "loss": 0.0376, + "step": 570 + }, + { + "epoch": 0.1259774109470026, + "grad_norm": 0.060499969869852066, + "learning_rate": 4.842528236316247e-05, + "loss": 0.0073, + "step": 580 + }, + { + "epoch": 0.12814943527367506, + "grad_norm": 0.08101051300764084, + "learning_rate": 4.8398132059079065e-05, + "loss": 0.067, + "step": 590 + }, + { + "epoch": 0.13032145960034752, + "grad_norm": 0.6294172406196594, + "learning_rate": 4.837098175499566e-05, + "loss": 0.0336, + "step": 600 + }, + { + "epoch": 0.13249348392701998, + "grad_norm": 0.44123587012290955, + "learning_rate": 4.834383145091225e-05, + "loss": 0.1469, + "step": 610 + }, + { + "epoch": 0.13466550825369245, + "grad_norm": 0.06616009771823883, + "learning_rate": 4.8316681146828844e-05, + "loss": 0.0364, + "step": 620 + }, + { + "epoch": 0.1368375325803649, + "grad_norm": 0.06032127887010574, + "learning_rate": 4.828953084274544e-05, + "loss": 0.0079, + "step": 630 + }, + { + "epoch": 0.13900955690703737, + "grad_norm": 0.055732373148202896, + "learning_rate": 4.826238053866203e-05, + "loss": 0.0379, + "step": 640 + }, + { + "epoch": 0.14118158123370983, + "grad_norm": 0.09018450975418091, + "learning_rate": 4.823523023457863e-05, + "loss": 0.0682, + "step": 650 + }, + { + "epoch": 0.14335360556038226, + "grad_norm": 0.07363690435886383, + "learning_rate": 4.820807993049522e-05, + "loss": 0.0366, + "step": 660 + }, + { + "epoch": 0.14552562988705472, + "grad_norm": 0.15481476485729218, + "learning_rate": 4.818092962641182e-05, + "loss": 0.0702, + "step": 670 + }, + { + "epoch": 0.1476976542137272, + "grad_norm": 0.15850311517715454, + "learning_rate": 4.8153779322328416e-05, + "loss": 0.1018, + "step": 680 + }, + { + "epoch": 0.14986967854039965, + "grad_norm": 0.10273230075836182, + "learning_rate": 4.812662901824501e-05, + "loss": 0.0699, + "step": 690 + }, + { + "epoch": 0.1520417028670721, + "grad_norm": 0.13007517158985138, + "learning_rate": 4.80994787141616e-05, + "loss": 0.07, + "step": 700 + }, + { + "epoch": 0.15421372719374457, + "grad_norm": 0.0926077589392662, + "learning_rate": 4.8072328410078195e-05, + "loss": 0.0397, + "step": 710 + }, + { + "epoch": 0.15638575152041703, + "grad_norm": 0.05907576531171799, + "learning_rate": 4.804517810599479e-05, + "loss": 0.008, + "step": 720 + }, + { + "epoch": 0.1585577758470895, + "grad_norm": 0.055849187076091766, + "learning_rate": 4.801802780191139e-05, + "loss": 0.0382, + "step": 730 + }, + { + "epoch": 0.16072980017376196, + "grad_norm": 0.06694773584604263, + "learning_rate": 4.799087749782798e-05, + "loss": 0.069, + "step": 740 + }, + { + "epoch": 0.16290182450043442, + "grad_norm": 0.06179416552186012, + "learning_rate": 4.7963727193744574e-05, + "loss": 0.0081, + "step": 750 + }, + { + "epoch": 0.16507384882710685, + "grad_norm": 0.06511629372835159, + "learning_rate": 4.7936576889661167e-05, + "loss": 0.0689, + "step": 760 + }, + { + "epoch": 0.1672458731537793, + "grad_norm": 0.05806345120072365, + "learning_rate": 4.790942658557776e-05, + "loss": 0.0073, + "step": 770 + }, + { + "epoch": 0.16941789748045177, + "grad_norm": 0.05196061730384827, + "learning_rate": 4.788227628149435e-05, + "loss": 0.0371, + "step": 780 + }, + { + "epoch": 0.17158992180712423, + "grad_norm": 0.057679492980241776, + "learning_rate": 4.7855125977410946e-05, + "loss": 0.0062, + "step": 790 + }, + { + "epoch": 0.1737619461337967, + "grad_norm": 0.04627954214811325, + "learning_rate": 4.782797567332754e-05, + "loss": 0.0057, + "step": 800 + }, + { + "epoch": 0.17593397046046916, + "grad_norm": 0.04635784775018692, + "learning_rate": 4.780082536924414e-05, + "loss": 0.0707, + "step": 810 + }, + { + "epoch": 0.17810599478714162, + "grad_norm": 0.6465384364128113, + "learning_rate": 4.777367506516073e-05, + "loss": 0.1365, + "step": 820 + }, + { + "epoch": 0.18027801911381408, + "grad_norm": 0.0622185543179512, + "learning_rate": 4.7746524761077324e-05, + "loss": 0.0364, + "step": 830 + }, + { + "epoch": 0.18245004344048654, + "grad_norm": 0.49819082021713257, + "learning_rate": 4.771937445699392e-05, + "loss": 0.0702, + "step": 840 + }, + { + "epoch": 0.184622067767159, + "grad_norm": 0.07075995206832886, + "learning_rate": 4.769222415291052e-05, + "loss": 0.0383, + "step": 850 + }, + { + "epoch": 0.18679409209383144, + "grad_norm": 0.06961839646100998, + "learning_rate": 4.766507384882711e-05, + "loss": 0.0685, + "step": 860 + }, + { + "epoch": 0.1889661164205039, + "grad_norm": 0.08239472657442093, + "learning_rate": 4.76379235447437e-05, + "loss": 0.0386, + "step": 870 + }, + { + "epoch": 0.19113814074717636, + "grad_norm": 0.5006256699562073, + "learning_rate": 4.76107732406603e-05, + "loss": 0.0383, + "step": 880 + }, + { + "epoch": 0.19331016507384882, + "grad_norm": 0.05663491412997246, + "learning_rate": 4.7583622936576896e-05, + "loss": 0.0067, + "step": 890 + }, + { + "epoch": 0.19548218940052128, + "grad_norm": 0.061523787677288055, + "learning_rate": 4.755647263249349e-05, + "loss": 0.068, + "step": 900 + }, + { + "epoch": 0.19765421372719374, + "grad_norm": 0.09269333630800247, + "learning_rate": 4.752932232841008e-05, + "loss": 0.0674, + "step": 910 + }, + { + "epoch": 0.1998262380538662, + "grad_norm": 0.0798824205994606, + "learning_rate": 4.7502172024326675e-05, + "loss": 0.0379, + "step": 920 + }, + { + "epoch": 0.20199826238053867, + "grad_norm": 0.13433697819709778, + "learning_rate": 4.747502172024327e-05, + "loss": 0.0907, + "step": 930 + }, + { + "epoch": 0.20417028670721113, + "grad_norm": 0.07293786108493805, + "learning_rate": 4.744787141615986e-05, + "loss": 0.0323, + "step": 940 + }, + { + "epoch": 0.2063423110338836, + "grad_norm": 0.15695428848266602, + "learning_rate": 4.7420721112076454e-05, + "loss": 0.0631, + "step": 950 + }, + { + "epoch": 0.20851433536055602, + "grad_norm": 0.23731303215026855, + "learning_rate": 4.7393570807993054e-05, + "loss": 0.0594, + "step": 960 + }, + { + "epoch": 0.21068635968722849, + "grad_norm": 0.21904346346855164, + "learning_rate": 4.736642050390965e-05, + "loss": 0.0978, + "step": 970 + }, + { + "epoch": 0.21285838401390095, + "grad_norm": 0.05626118555665016, + "learning_rate": 4.733927019982624e-05, + "loss": 0.0243, + "step": 980 + }, + { + "epoch": 0.2150304083405734, + "grad_norm": 0.04844866693019867, + "learning_rate": 4.731211989574283e-05, + "loss": 0.0059, + "step": 990 + }, + { + "epoch": 0.21720243266724587, + "grad_norm": 0.5048914551734924, + "learning_rate": 4.7284969591659426e-05, + "loss": 0.102, + "step": 1000 + }, + { + "epoch": 0.21937445699391833, + "grad_norm": 0.06435242295265198, + "learning_rate": 4.725781928757602e-05, + "loss": 0.0381, + "step": 1010 + }, + { + "epoch": 0.2215464813205908, + "grad_norm": 0.08498011529445648, + "learning_rate": 4.723066898349261e-05, + "loss": 0.0916, + "step": 1020 + }, + { + "epoch": 0.22371850564726325, + "grad_norm": 0.07478222995996475, + "learning_rate": 4.720351867940921e-05, + "loss": 0.0118, + "step": 1030 + }, + { + "epoch": 0.22589052997393572, + "grad_norm": 0.05415387079119682, + "learning_rate": 4.7176368375325805e-05, + "loss": 0.0087, + "step": 1040 + }, + { + "epoch": 0.22806255430060818, + "grad_norm": 0.08049948513507843, + "learning_rate": 4.7149218071242405e-05, + "loss": 0.0652, + "step": 1050 + }, + { + "epoch": 0.23023457862728064, + "grad_norm": 0.19372807443141937, + "learning_rate": 4.7122067767159e-05, + "loss": 0.0472, + "step": 1060 + }, + { + "epoch": 0.23240660295395307, + "grad_norm": 0.04606153070926666, + "learning_rate": 4.709491746307559e-05, + "loss": 0.03, + "step": 1070 + }, + { + "epoch": 0.23457862728062553, + "grad_norm": 0.05246344953775406, + "learning_rate": 4.7067767158992184e-05, + "loss": 0.0208, + "step": 1080 + }, + { + "epoch": 0.236750651607298, + "grad_norm": 0.4381423890590668, + "learning_rate": 4.704061685490878e-05, + "loss": 0.1184, + "step": 1090 + }, + { + "epoch": 0.23892267593397046, + "grad_norm": 0.05256995931267738, + "learning_rate": 4.701346655082537e-05, + "loss": 0.046, + "step": 1100 + }, + { + "epoch": 0.24109470026064292, + "grad_norm": 0.05329786613583565, + "learning_rate": 4.698631624674197e-05, + "loss": 0.0228, + "step": 1110 + }, + { + "epoch": 0.24326672458731538, + "grad_norm": 0.04372464492917061, + "learning_rate": 4.695916594265856e-05, + "loss": 0.0219, + "step": 1120 + }, + { + "epoch": 0.24543874891398784, + "grad_norm": 0.03980829194188118, + "learning_rate": 4.6932015638575156e-05, + "loss": 0.0369, + "step": 1130 + }, + { + "epoch": 0.2476107732406603, + "grad_norm": 0.055453334003686905, + "learning_rate": 4.690486533449175e-05, + "loss": 0.0341, + "step": 1140 + }, + { + "epoch": 0.24978279756733276, + "grad_norm": 0.22770388424396515, + "learning_rate": 4.687771503040834e-05, + "loss": 0.0413, + "step": 1150 + }, + { + "epoch": 0.2519548218940052, + "grad_norm": 0.8776328563690186, + "learning_rate": 4.6850564726324935e-05, + "loss": 0.083, + "step": 1160 + }, + { + "epoch": 0.25412684622067766, + "grad_norm": 0.16721822321414948, + "learning_rate": 4.682341442224153e-05, + "loss": 0.0813, + "step": 1170 + }, + { + "epoch": 0.2562988705473501, + "grad_norm": 0.07600212097167969, + "learning_rate": 4.679626411815812e-05, + "loss": 0.0387, + "step": 1180 + }, + { + "epoch": 0.2584708948740226, + "grad_norm": 0.049551110714673996, + "learning_rate": 4.676911381407472e-05, + "loss": 0.0067, + "step": 1190 + }, + { + "epoch": 0.26064291920069504, + "grad_norm": 0.058788660913705826, + "learning_rate": 4.6741963509991314e-05, + "loss": 0.0715, + "step": 1200 + }, + { + "epoch": 0.2628149435273675, + "grad_norm": 0.10510104894638062, + "learning_rate": 4.6714813205907907e-05, + "loss": 0.0718, + "step": 1210 + }, + { + "epoch": 0.26498696785403997, + "grad_norm": 0.06653593480587006, + "learning_rate": 4.66876629018245e-05, + "loss": 0.039, + "step": 1220 + }, + { + "epoch": 0.26715899218071243, + "grad_norm": 0.04870177060365677, + "learning_rate": 4.66605125977411e-05, + "loss": 0.0068, + "step": 1230 + }, + { + "epoch": 0.2693310165073849, + "grad_norm": 0.04352603852748871, + "learning_rate": 4.663336229365769e-05, + "loss": 0.0384, + "step": 1240 + }, + { + "epoch": 0.27150304083405735, + "grad_norm": 0.044990599155426025, + "learning_rate": 4.6606211989574285e-05, + "loss": 0.0383, + "step": 1250 + }, + { + "epoch": 0.2736750651607298, + "grad_norm": 0.09989727288484573, + "learning_rate": 4.6579061685490885e-05, + "loss": 0.1971, + "step": 1260 + }, + { + "epoch": 0.2758470894874023, + "grad_norm": 0.26683279871940613, + "learning_rate": 4.655191138140748e-05, + "loss": 0.1433, + "step": 1270 + }, + { + "epoch": 0.27801911381407474, + "grad_norm": 0.13336573541164398, + "learning_rate": 4.652476107732407e-05, + "loss": 0.0504, + "step": 1280 + }, + { + "epoch": 0.2801911381407472, + "grad_norm": 0.09248033910989761, + "learning_rate": 4.6497610773240664e-05, + "loss": 0.0371, + "step": 1290 + }, + { + "epoch": 0.28236316246741966, + "grad_norm": 0.067159004509449, + "learning_rate": 4.647046046915726e-05, + "loss": 0.039, + "step": 1300 + }, + { + "epoch": 0.2845351867940921, + "grad_norm": 0.4917806386947632, + "learning_rate": 4.644331016507385e-05, + "loss": 0.0648, + "step": 1310 + }, + { + "epoch": 0.2867072111207645, + "grad_norm": 0.08391285687685013, + "learning_rate": 4.641615986099044e-05, + "loss": 0.0401, + "step": 1320 + }, + { + "epoch": 0.288879235447437, + "grad_norm": 0.16138538718223572, + "learning_rate": 4.6389009556907036e-05, + "loss": 0.0595, + "step": 1330 + }, + { + "epoch": 0.29105125977410945, + "grad_norm": 0.05601629614830017, + "learning_rate": 4.6361859252823636e-05, + "loss": 0.0146, + "step": 1340 + }, + { + "epoch": 0.2932232841007819, + "grad_norm": 0.05195131152868271, + "learning_rate": 4.633470894874023e-05, + "loss": 0.0651, + "step": 1350 + }, + { + "epoch": 0.2953953084274544, + "grad_norm": 0.49680355191230774, + "learning_rate": 4.630755864465682e-05, + "loss": 0.0639, + "step": 1360 + }, + { + "epoch": 0.29756733275412683, + "grad_norm": 0.2788536250591278, + "learning_rate": 4.6280408340573415e-05, + "loss": 0.1024, + "step": 1370 + }, + { + "epoch": 0.2997393570807993, + "grad_norm": 0.21596238017082214, + "learning_rate": 4.625325803649001e-05, + "loss": 0.0613, + "step": 1380 + }, + { + "epoch": 0.30191138140747176, + "grad_norm": 0.3460381031036377, + "learning_rate": 4.62261077324066e-05, + "loss": 0.0602, + "step": 1390 + }, + { + "epoch": 0.3040834057341442, + "grad_norm": 1.0655152797698975, + "learning_rate": 4.6198957428323194e-05, + "loss": 0.1124, + "step": 1400 + }, + { + "epoch": 0.3062554300608167, + "grad_norm": 0.4805503487586975, + "learning_rate": 4.6171807124239794e-05, + "loss": 0.0272, + "step": 1410 + }, + { + "epoch": 0.30842745438748914, + "grad_norm": 0.05258309841156006, + "learning_rate": 4.614465682015639e-05, + "loss": 0.0132, + "step": 1420 + }, + { + "epoch": 0.3105994787141616, + "grad_norm": 0.056971024721860886, + "learning_rate": 4.611750651607299e-05, + "loss": 0.08, + "step": 1430 + }, + { + "epoch": 0.31277150304083406, + "grad_norm": 0.05958361178636551, + "learning_rate": 4.609035621198958e-05, + "loss": 0.0699, + "step": 1440 + }, + { + "epoch": 0.3149435273675065, + "grad_norm": 0.05827038362622261, + "learning_rate": 4.606320590790617e-05, + "loss": 0.0069, + "step": 1450 + }, + { + "epoch": 0.317115551694179, + "grad_norm": 0.06229276955127716, + "learning_rate": 4.6036055603822766e-05, + "loss": 0.0696, + "step": 1460 + }, + { + "epoch": 0.31928757602085145, + "grad_norm": 0.06447760760784149, + "learning_rate": 4.600890529973936e-05, + "loss": 0.0381, + "step": 1470 + }, + { + "epoch": 0.3214596003475239, + "grad_norm": 0.09773342311382294, + "learning_rate": 4.598175499565595e-05, + "loss": 0.1581, + "step": 1480 + }, + { + "epoch": 0.32363162467419637, + "grad_norm": 0.1255279779434204, + "learning_rate": 4.595460469157255e-05, + "loss": 0.0407, + "step": 1490 + }, + { + "epoch": 0.32580364900086883, + "grad_norm": 0.1384665071964264, + "learning_rate": 4.5927454387489145e-05, + "loss": 0.1221, + "step": 1500 + }, + { + "epoch": 0.3279756733275413, + "grad_norm": 0.11321841925382614, + "learning_rate": 4.590030408340574e-05, + "loss": 0.0506, + "step": 1510 + }, + { + "epoch": 0.3301476976542137, + "grad_norm": 0.4507741332054138, + "learning_rate": 4.587315377932233e-05, + "loss": 0.0664, + "step": 1520 + }, + { + "epoch": 0.33231972198088616, + "grad_norm": 0.10015634447336197, + "learning_rate": 4.5846003475238924e-05, + "loss": 0.0401, + "step": 1530 + }, + { + "epoch": 0.3344917463075586, + "grad_norm": 0.0798087790608406, + "learning_rate": 4.581885317115552e-05, + "loss": 0.0391, + "step": 1540 + }, + { + "epoch": 0.3366637706342311, + "grad_norm": 0.08314526081085205, + "learning_rate": 4.579170286707211e-05, + "loss": 0.0676, + "step": 1550 + }, + { + "epoch": 0.33883579496090355, + "grad_norm": 0.07247483730316162, + "learning_rate": 4.57645525629887e-05, + "loss": 0.0381, + "step": 1560 + }, + { + "epoch": 0.341007819287576, + "grad_norm": 0.0960722267627716, + "learning_rate": 4.57374022589053e-05, + "loss": 0.0578, + "step": 1570 + }, + { + "epoch": 0.34317984361424847, + "grad_norm": 0.1308896243572235, + "learning_rate": 4.5710251954821896e-05, + "loss": 0.0681, + "step": 1580 + }, + { + "epoch": 0.34535186794092093, + "grad_norm": 0.6395774483680725, + "learning_rate": 4.568310165073849e-05, + "loss": 0.0489, + "step": 1590 + }, + { + "epoch": 0.3475238922675934, + "grad_norm": 0.07124833017587662, + "learning_rate": 4.565595134665508e-05, + "loss": 0.0915, + "step": 1600 + }, + { + "epoch": 0.34969591659426585, + "grad_norm": 0.06098182126879692, + "learning_rate": 4.562880104257168e-05, + "loss": 0.0384, + "step": 1610 + }, + { + "epoch": 0.3518679409209383, + "grad_norm": 0.055936507880687714, + "learning_rate": 4.5601650738488274e-05, + "loss": 0.0067, + "step": 1620 + }, + { + "epoch": 0.3540399652476108, + "grad_norm": 0.05517324060201645, + "learning_rate": 4.557450043440487e-05, + "loss": 0.0698, + "step": 1630 + }, + { + "epoch": 0.35621198957428324, + "grad_norm": 0.059903811663389206, + "learning_rate": 4.554735013032147e-05, + "loss": 0.0381, + "step": 1640 + }, + { + "epoch": 0.3583840139009557, + "grad_norm": 0.453118234872818, + "learning_rate": 4.552019982623806e-05, + "loss": 0.1588, + "step": 1650 + }, + { + "epoch": 0.36055603822762816, + "grad_norm": 0.10766888409852982, + "learning_rate": 4.549304952215465e-05, + "loss": 0.013, + "step": 1660 + }, + { + "epoch": 0.3627280625543006, + "grad_norm": 0.4284590482711792, + "learning_rate": 4.5465899218071246e-05, + "loss": 0.1186, + "step": 1670 + }, + { + "epoch": 0.3649000868809731, + "grad_norm": 0.1976746916770935, + "learning_rate": 4.543874891398784e-05, + "loss": 0.0951, + "step": 1680 + }, + { + "epoch": 0.36707211120764555, + "grad_norm": 0.4342038929462433, + "learning_rate": 4.541159860990443e-05, + "loss": 0.0699, + "step": 1690 + }, + { + "epoch": 0.369244135534318, + "grad_norm": 0.09489353746175766, + "learning_rate": 4.5384448305821025e-05, + "loss": 0.068, + "step": 1700 + }, + { + "epoch": 0.37141615986099047, + "grad_norm": 0.09361070394515991, + "learning_rate": 4.535729800173762e-05, + "loss": 0.0676, + "step": 1710 + }, + { + "epoch": 0.3735881841876629, + "grad_norm": 0.10838571935892105, + "learning_rate": 4.533014769765422e-05, + "loss": 0.058, + "step": 1720 + }, + { + "epoch": 0.37576020851433534, + "grad_norm": 0.08518973737955093, + "learning_rate": 4.530299739357081e-05, + "loss": 0.0118, + "step": 1730 + }, + { + "epoch": 0.3779322328410078, + "grad_norm": 0.0637778714299202, + "learning_rate": 4.5275847089487404e-05, + "loss": 0.0088, + "step": 1740 + }, + { + "epoch": 0.38010425716768026, + "grad_norm": 0.1527567058801651, + "learning_rate": 4.5248696785404e-05, + "loss": 0.0343, + "step": 1750 + }, + { + "epoch": 0.3822762814943527, + "grad_norm": 0.05855727940797806, + "learning_rate": 4.522154648132059e-05, + "loss": 0.0374, + "step": 1760 + }, + { + "epoch": 0.3844483058210252, + "grad_norm": 0.10861743986606598, + "learning_rate": 4.519439617723718e-05, + "loss": 0.0705, + "step": 1770 + }, + { + "epoch": 0.38662033014769764, + "grad_norm": 0.22820153832435608, + "learning_rate": 4.5167245873153776e-05, + "loss": 0.0659, + "step": 1780 + }, + { + "epoch": 0.3887923544743701, + "grad_norm": 0.9262331128120422, + "learning_rate": 4.5140095569070376e-05, + "loss": 0.0662, + "step": 1790 + }, + { + "epoch": 0.39096437880104257, + "grad_norm": 0.07286658883094788, + "learning_rate": 4.511294526498697e-05, + "loss": 0.053, + "step": 1800 + }, + { + "epoch": 0.39313640312771503, + "grad_norm": 0.04754822701215744, + "learning_rate": 4.508579496090357e-05, + "loss": 0.0305, + "step": 1810 + }, + { + "epoch": 0.3953084274543875, + "grad_norm": 0.2825242280960083, + "learning_rate": 4.505864465682016e-05, + "loss": 0.0465, + "step": 1820 + }, + { + "epoch": 0.39748045178105995, + "grad_norm": 0.6193529963493347, + "learning_rate": 4.5031494352736755e-05, + "loss": 0.1629, + "step": 1830 + }, + { + "epoch": 0.3996524761077324, + "grad_norm": 0.05920102447271347, + "learning_rate": 4.500434404865335e-05, + "loss": 0.0269, + "step": 1840 + }, + { + "epoch": 0.4018245004344049, + "grad_norm": 0.06066862866282463, + "learning_rate": 4.497719374456994e-05, + "loss": 0.0329, + "step": 1850 + }, + { + "epoch": 0.40399652476107734, + "grad_norm": 0.046323712915182114, + "learning_rate": 4.4950043440486534e-05, + "loss": 0.0056, + "step": 1860 + }, + { + "epoch": 0.4061685490877498, + "grad_norm": 0.06750129908323288, + "learning_rate": 4.4922893136403134e-05, + "loss": 0.058, + "step": 1870 + }, + { + "epoch": 0.40834057341442226, + "grad_norm": 0.045560941100120544, + "learning_rate": 4.489574283231973e-05, + "loss": 0.0075, + "step": 1880 + }, + { + "epoch": 0.4105125977410947, + "grad_norm": 0.03991026058793068, + "learning_rate": 4.486859252823632e-05, + "loss": 0.0052, + "step": 1890 + }, + { + "epoch": 0.4126846220677672, + "grad_norm": 0.03694348409771919, + "learning_rate": 4.484144222415291e-05, + "loss": 0.0045, + "step": 1900 + }, + { + "epoch": 0.41485664639443964, + "grad_norm": 2.1510820388793945, + "learning_rate": 4.4814291920069506e-05, + "loss": 0.0821, + "step": 1910 + }, + { + "epoch": 0.41702867072111205, + "grad_norm": 0.18489497900009155, + "learning_rate": 4.47871416159861e-05, + "loss": 0.0415, + "step": 1920 + }, + { + "epoch": 0.4192006950477845, + "grad_norm": 0.7301527857780457, + "learning_rate": 4.475999131190269e-05, + "loss": 0.0817, + "step": 1930 + }, + { + "epoch": 0.42137271937445697, + "grad_norm": 0.038684993982315063, + "learning_rate": 4.4732841007819285e-05, + "loss": 0.049, + "step": 1940 + }, + { + "epoch": 0.42354474370112943, + "grad_norm": 0.9332627654075623, + "learning_rate": 4.4705690703735885e-05, + "loss": 0.0219, + "step": 1950 + }, + { + "epoch": 0.4257167680278019, + "grad_norm": 0.14876317977905273, + "learning_rate": 4.467854039965248e-05, + "loss": 0.0206, + "step": 1960 + }, + { + "epoch": 0.42788879235447436, + "grad_norm": 0.25213751196861267, + "learning_rate": 4.465139009556907e-05, + "loss": 0.0104, + "step": 1970 + }, + { + "epoch": 0.4300608166811468, + "grad_norm": 0.04049533233046532, + "learning_rate": 4.4624239791485664e-05, + "loss": 0.0306, + "step": 1980 + }, + { + "epoch": 0.4322328410078193, + "grad_norm": 0.04084054008126259, + "learning_rate": 4.4597089487402264e-05, + "loss": 0.1077, + "step": 1990 + }, + { + "epoch": 0.43440486533449174, + "grad_norm": 0.07672455906867981, + "learning_rate": 4.4569939183318857e-05, + "loss": 0.0064, + "step": 2000 + }, + { + "epoch": 0.4365768896611642, + "grad_norm": 0.5209121704101562, + "learning_rate": 4.454278887923545e-05, + "loss": 0.0407, + "step": 2010 + }, + { + "epoch": 0.43874891398783666, + "grad_norm": 0.046432483941316605, + "learning_rate": 4.451563857515205e-05, + "loss": 0.0362, + "step": 2020 + }, + { + "epoch": 0.4409209383145091, + "grad_norm": 0.045268259942531586, + "learning_rate": 4.448848827106864e-05, + "loss": 0.0487, + "step": 2030 + }, + { + "epoch": 0.4430929626411816, + "grad_norm": 0.0382797010242939, + "learning_rate": 4.4461337966985235e-05, + "loss": 0.0173, + "step": 2040 + }, + { + "epoch": 0.44526498696785405, + "grad_norm": 0.06607993692159653, + "learning_rate": 4.443418766290183e-05, + "loss": 0.0283, + "step": 2050 + }, + { + "epoch": 0.4474370112945265, + "grad_norm": 0.13242176175117493, + "learning_rate": 4.440703735881842e-05, + "loss": 0.065, + "step": 2060 + }, + { + "epoch": 0.44960903562119897, + "grad_norm": 0.5223703384399414, + "learning_rate": 4.4379887054735014e-05, + "loss": 0.0592, + "step": 2070 + }, + { + "epoch": 0.45178105994787143, + "grad_norm": 0.032611947506666183, + "learning_rate": 4.435273675065161e-05, + "loss": 0.0496, + "step": 2080 + }, + { + "epoch": 0.4539530842745439, + "grad_norm": 0.5175477266311646, + "learning_rate": 4.43255864465682e-05, + "loss": 0.0709, + "step": 2090 + }, + { + "epoch": 0.45612510860121636, + "grad_norm": 0.12021242827177048, + "learning_rate": 4.42984361424848e-05, + "loss": 0.0696, + "step": 2100 + }, + { + "epoch": 0.4582971329278888, + "grad_norm": 0.36755773425102234, + "learning_rate": 4.427128583840139e-05, + "loss": 0.0735, + "step": 2110 + }, + { + "epoch": 0.4604691572545613, + "grad_norm": 0.05745413899421692, + "learning_rate": 4.4244135534317986e-05, + "loss": 0.0344, + "step": 2120 + }, + { + "epoch": 0.4626411815812337, + "grad_norm": 0.040968943387269974, + "learning_rate": 4.421698523023458e-05, + "loss": 0.0475, + "step": 2130 + }, + { + "epoch": 0.46481320590790615, + "grad_norm": 0.04184969514608383, + "learning_rate": 4.418983492615117e-05, + "loss": 0.0383, + "step": 2140 + }, + { + "epoch": 0.4669852302345786, + "grad_norm": 0.049214381724596024, + "learning_rate": 4.4162684622067765e-05, + "loss": 0.0702, + "step": 2150 + }, + { + "epoch": 0.46915725456125107, + "grad_norm": 0.05034701153635979, + "learning_rate": 4.413553431798436e-05, + "loss": 0.0058, + "step": 2160 + }, + { + "epoch": 0.47132927888792353, + "grad_norm": 0.1388736218214035, + "learning_rate": 4.410838401390096e-05, + "loss": 0.0641, + "step": 2170 + }, + { + "epoch": 0.473501303214596, + "grad_norm": 0.052767496556043625, + "learning_rate": 4.408123370981755e-05, + "loss": 0.0076, + "step": 2180 + }, + { + "epoch": 0.47567332754126845, + "grad_norm": 0.04132657125592232, + "learning_rate": 4.405408340573415e-05, + "loss": 0.1008, + "step": 2190 + }, + { + "epoch": 0.4778453518679409, + "grad_norm": 0.054085779935121536, + "learning_rate": 4.4026933101650744e-05, + "loss": 0.038, + "step": 2200 + }, + { + "epoch": 0.4800173761946134, + "grad_norm": 0.22452211380004883, + "learning_rate": 4.399978279756734e-05, + "loss": 0.0699, + "step": 2210 + }, + { + "epoch": 0.48218940052128584, + "grad_norm": 0.04294193536043167, + "learning_rate": 4.397263249348393e-05, + "loss": 0.0406, + "step": 2220 + }, + { + "epoch": 0.4843614248479583, + "grad_norm": 0.04693491756916046, + "learning_rate": 4.394548218940052e-05, + "loss": 0.0344, + "step": 2230 + }, + { + "epoch": 0.48653344917463076, + "grad_norm": 0.06575354188680649, + "learning_rate": 4.3918331885317116e-05, + "loss": 0.0333, + "step": 2240 + }, + { + "epoch": 0.4887054735013032, + "grad_norm": 0.09756498783826828, + "learning_rate": 4.3891181581233716e-05, + "loss": 0.0091, + "step": 2250 + }, + { + "epoch": 0.4908774978279757, + "grad_norm": 0.03621898964047432, + "learning_rate": 4.386403127715031e-05, + "loss": 0.0655, + "step": 2260 + }, + { + "epoch": 0.49304952215464815, + "grad_norm": 0.33425211906433105, + "learning_rate": 4.38368809730669e-05, + "loss": 0.141, + "step": 2270 + }, + { + "epoch": 0.4952215464813206, + "grad_norm": 0.4894958734512329, + "learning_rate": 4.3809730668983495e-05, + "loss": 0.0667, + "step": 2280 + }, + { + "epoch": 0.49739357080799307, + "grad_norm": 0.6559067368507385, + "learning_rate": 4.378258036490009e-05, + "loss": 0.0452, + "step": 2290 + }, + { + "epoch": 0.49956559513466553, + "grad_norm": 0.04155660793185234, + "learning_rate": 4.375543006081668e-05, + "loss": 0.0369, + "step": 2300 + }, + { + "epoch": 0.501737619461338, + "grad_norm": 0.04563942924141884, + "learning_rate": 4.3728279756733274e-05, + "loss": 0.0297, + "step": 2310 + }, + { + "epoch": 0.5039096437880104, + "grad_norm": 0.04393638297915459, + "learning_rate": 4.370112945264987e-05, + "loss": 0.0052, + "step": 2320 + }, + { + "epoch": 0.5060816681146829, + "grad_norm": 0.03522152081131935, + "learning_rate": 4.367397914856647e-05, + "loss": 0.0057, + "step": 2330 + }, + { + "epoch": 0.5082536924413553, + "grad_norm": 0.04149174690246582, + "learning_rate": 4.364682884448306e-05, + "loss": 0.0327, + "step": 2340 + }, + { + "epoch": 0.5104257167680278, + "grad_norm": 0.05420512333512306, + "learning_rate": 4.361967854039965e-05, + "loss": 0.0405, + "step": 2350 + }, + { + "epoch": 0.5125977410947002, + "grad_norm": 0.29514026641845703, + "learning_rate": 4.3592528236316246e-05, + "loss": 0.0221, + "step": 2360 + }, + { + "epoch": 0.5147697654213728, + "grad_norm": 0.04918777570128441, + "learning_rate": 4.3565377932232846e-05, + "loss": 0.0374, + "step": 2370 + }, + { + "epoch": 0.5169417897480452, + "grad_norm": 0.769515872001648, + "learning_rate": 4.353822762814944e-05, + "loss": 0.061, + "step": 2380 + }, + { + "epoch": 0.5191138140747177, + "grad_norm": 0.058411117643117905, + "learning_rate": 4.351107732406603e-05, + "loss": 0.0053, + "step": 2390 + }, + { + "epoch": 0.5212858384013901, + "grad_norm": 0.04015062376856804, + "learning_rate": 4.3483927019982625e-05, + "loss": 0.0061, + "step": 2400 + }, + { + "epoch": 0.5234578627280626, + "grad_norm": 0.06921057403087616, + "learning_rate": 4.3456776715899224e-05, + "loss": 0.02, + "step": 2410 + }, + { + "epoch": 0.525629887054735, + "grad_norm": 1.794832706451416, + "learning_rate": 4.342962641181582e-05, + "loss": 0.0521, + "step": 2420 + }, + { + "epoch": 0.5278019113814074, + "grad_norm": 0.09245938807725906, + "learning_rate": 4.340247610773241e-05, + "loss": 0.0446, + "step": 2430 + }, + { + "epoch": 0.5299739357080799, + "grad_norm": 0.06291071325540543, + "learning_rate": 4.3375325803649004e-05, + "loss": 0.0616, + "step": 2440 + }, + { + "epoch": 0.5321459600347523, + "grad_norm": 0.9209924936294556, + "learning_rate": 4.3348175499565597e-05, + "loss": 0.0683, + "step": 2450 + }, + { + "epoch": 0.5343179843614249, + "grad_norm": 0.05425272881984711, + "learning_rate": 4.332102519548219e-05, + "loss": 0.0065, + "step": 2460 + }, + { + "epoch": 0.5364900086880973, + "grad_norm": 0.13152609765529633, + "learning_rate": 4.329387489139878e-05, + "loss": 0.0236, + "step": 2470 + }, + { + "epoch": 0.5386620330147698, + "grad_norm": 0.1100706234574318, + "learning_rate": 4.326672458731538e-05, + "loss": 0.0379, + "step": 2480 + }, + { + "epoch": 0.5408340573414422, + "grad_norm": 0.06025245040655136, + "learning_rate": 4.3239574283231975e-05, + "loss": 0.0097, + "step": 2490 + }, + { + "epoch": 0.5430060816681147, + "grad_norm": 0.02963717095553875, + "learning_rate": 4.321242397914857e-05, + "loss": 0.0037, + "step": 2500 + }, + { + "epoch": 0.5451781059947871, + "grad_norm": 0.08883394300937653, + "learning_rate": 4.318527367506516e-05, + "loss": 0.1101, + "step": 2510 + }, + { + "epoch": 0.5473501303214596, + "grad_norm": 0.03146001324057579, + "learning_rate": 4.3158123370981754e-05, + "loss": 0.0062, + "step": 2520 + }, + { + "epoch": 0.549522154648132, + "grad_norm": 0.06231880933046341, + "learning_rate": 4.313097306689835e-05, + "loss": 0.0592, + "step": 2530 + }, + { + "epoch": 0.5516941789748045, + "grad_norm": 0.05751520395278931, + "learning_rate": 4.310382276281494e-05, + "loss": 0.0111, + "step": 2540 + }, + { + "epoch": 0.553866203301477, + "grad_norm": 0.060239288955926895, + "learning_rate": 4.307667245873154e-05, + "loss": 0.0086, + "step": 2550 + }, + { + "epoch": 0.5560382276281495, + "grad_norm": 0.027065258473157883, + "learning_rate": 4.304952215464813e-05, + "loss": 0.0403, + "step": 2560 + }, + { + "epoch": 0.5582102519548219, + "grad_norm": 0.02755948156118393, + "learning_rate": 4.302237185056473e-05, + "loss": 0.0316, + "step": 2570 + }, + { + "epoch": 0.5603822762814944, + "grad_norm": 0.0682368353009224, + "learning_rate": 4.2995221546481326e-05, + "loss": 0.0856, + "step": 2580 + }, + { + "epoch": 0.5625543006081668, + "grad_norm": 0.3988370895385742, + "learning_rate": 4.296807124239792e-05, + "loss": 0.0503, + "step": 2590 + }, + { + "epoch": 0.5647263249348393, + "grad_norm": 0.05351010710000992, + "learning_rate": 4.294092093831451e-05, + "loss": 0.046, + "step": 2600 + }, + { + "epoch": 0.5668983492615117, + "grad_norm": 0.06630027294158936, + "learning_rate": 4.2913770634231105e-05, + "loss": 0.0683, + "step": 2610 + }, + { + "epoch": 0.5690703735881842, + "grad_norm": 0.5351850390434265, + "learning_rate": 4.28866203301477e-05, + "loss": 0.0789, + "step": 2620 + }, + { + "epoch": 0.5712423979148566, + "grad_norm": 0.9941126108169556, + "learning_rate": 4.28594700260643e-05, + "loss": 0.0497, + "step": 2630 + }, + { + "epoch": 0.573414422241529, + "grad_norm": 0.06872381269931793, + "learning_rate": 4.283231972198089e-05, + "loss": 0.0076, + "step": 2640 + }, + { + "epoch": 0.5755864465682016, + "grad_norm": 0.04038708284497261, + "learning_rate": 4.2805169417897484e-05, + "loss": 0.008, + "step": 2650 + }, + { + "epoch": 0.577758470894874, + "grad_norm": 1.9096306562423706, + "learning_rate": 4.277801911381408e-05, + "loss": 0.0591, + "step": 2660 + }, + { + "epoch": 0.5799304952215465, + "grad_norm": 0.030831623822450638, + "learning_rate": 4.275086880973067e-05, + "loss": 0.0055, + "step": 2670 + }, + { + "epoch": 0.5821025195482189, + "grad_norm": 0.02636183612048626, + "learning_rate": 4.272371850564726e-05, + "loss": 0.0079, + "step": 2680 + }, + { + "epoch": 0.5842745438748914, + "grad_norm": 0.153968945145607, + "learning_rate": 4.2696568201563856e-05, + "loss": 0.1446, + "step": 2690 + }, + { + "epoch": 0.5864465682015638, + "grad_norm": 0.1505323201417923, + "learning_rate": 4.266941789748045e-05, + "loss": 0.1777, + "step": 2700 + }, + { + "epoch": 0.5886185925282363, + "grad_norm": 0.06393560767173767, + "learning_rate": 4.264226759339705e-05, + "loss": 0.0447, + "step": 2710 + }, + { + "epoch": 0.5907906168549087, + "grad_norm": 0.1916840672492981, + "learning_rate": 4.261511728931364e-05, + "loss": 0.0545, + "step": 2720 + }, + { + "epoch": 0.5929626411815813, + "grad_norm": 0.039754655212163925, + "learning_rate": 4.2587966985230235e-05, + "loss": 0.0179, + "step": 2730 + }, + { + "epoch": 0.5951346655082537, + "grad_norm": 0.03142647072672844, + "learning_rate": 4.256081668114683e-05, + "loss": 0.085, + "step": 2740 + }, + { + "epoch": 0.5973066898349262, + "grad_norm": 0.22314947843551636, + "learning_rate": 4.253366637706343e-05, + "loss": 0.055, + "step": 2750 + }, + { + "epoch": 0.5994787141615986, + "grad_norm": 0.8706423044204712, + "learning_rate": 4.250651607298002e-05, + "loss": 0.0515, + "step": 2760 + }, + { + "epoch": 0.6016507384882711, + "grad_norm": 0.17597514390945435, + "learning_rate": 4.2479365768896614e-05, + "loss": 0.0316, + "step": 2770 + }, + { + "epoch": 0.6038227628149435, + "grad_norm": 0.02838090807199478, + "learning_rate": 4.245221546481321e-05, + "loss": 0.0559, + "step": 2780 + }, + { + "epoch": 0.605994787141616, + "grad_norm": 0.028333989903330803, + "learning_rate": 4.2425065160729807e-05, + "loss": 0.0058, + "step": 2790 + }, + { + "epoch": 0.6081668114682884, + "grad_norm": 0.5321160554885864, + "learning_rate": 4.23979148566464e-05, + "loss": 0.0544, + "step": 2800 + }, + { + "epoch": 0.610338835794961, + "grad_norm": 3.2245936393737793, + "learning_rate": 4.237076455256299e-05, + "loss": 0.0498, + "step": 2810 + }, + { + "epoch": 0.6125108601216334, + "grad_norm": 0.029928002506494522, + "learning_rate": 4.2343614248479586e-05, + "loss": 0.0181, + "step": 2820 + }, + { + "epoch": 0.6146828844483058, + "grad_norm": 0.02794519253075123, + "learning_rate": 4.231646394439618e-05, + "loss": 0.0318, + "step": 2830 + }, + { + "epoch": 0.6168549087749783, + "grad_norm": 0.02685512974858284, + "learning_rate": 4.228931364031277e-05, + "loss": 0.015, + "step": 2840 + }, + { + "epoch": 0.6190269331016507, + "grad_norm": 0.029942605644464493, + "learning_rate": 4.2262163336229365e-05, + "loss": 0.0033, + "step": 2850 + }, + { + "epoch": 0.6211989574283232, + "grad_norm": 0.02547876164317131, + "learning_rate": 4.2235013032145964e-05, + "loss": 0.0389, + "step": 2860 + }, + { + "epoch": 0.6233709817549956, + "grad_norm": 2.500945568084717, + "learning_rate": 4.220786272806256e-05, + "loss": 0.0626, + "step": 2870 + }, + { + "epoch": 0.6255430060816681, + "grad_norm": 0.3283202350139618, + "learning_rate": 4.218071242397915e-05, + "loss": 0.0393, + "step": 2880 + }, + { + "epoch": 0.6277150304083405, + "grad_norm": 1.1086935997009277, + "learning_rate": 4.2153562119895744e-05, + "loss": 0.036, + "step": 2890 + }, + { + "epoch": 0.629887054735013, + "grad_norm": 0.02584236115217209, + "learning_rate": 4.2126411815812337e-05, + "loss": 0.0096, + "step": 2900 + }, + { + "epoch": 0.6320590790616855, + "grad_norm": 0.02778252400457859, + "learning_rate": 4.209926151172893e-05, + "loss": 0.0478, + "step": 2910 + }, + { + "epoch": 0.634231103388358, + "grad_norm": 0.026364050805568695, + "learning_rate": 4.207211120764552e-05, + "loss": 0.0028, + "step": 2920 + }, + { + "epoch": 0.6364031277150304, + "grad_norm": 0.025249965488910675, + "learning_rate": 4.204496090356212e-05, + "loss": 0.0526, + "step": 2930 + }, + { + "epoch": 0.6385751520417029, + "grad_norm": 0.029704120010137558, + "learning_rate": 4.2017810599478715e-05, + "loss": 0.029, + "step": 2940 + }, + { + "epoch": 0.6407471763683753, + "grad_norm": 0.041738465428352356, + "learning_rate": 4.1990660295395315e-05, + "loss": 0.0514, + "step": 2950 + }, + { + "epoch": 0.6429192006950478, + "grad_norm": 0.02726450003683567, + "learning_rate": 4.196350999131191e-05, + "loss": 0.0566, + "step": 2960 + }, + { + "epoch": 0.6450912250217202, + "grad_norm": 0.027154915034770966, + "learning_rate": 4.19363596872285e-05, + "loss": 0.0075, + "step": 2970 + }, + { + "epoch": 0.6472632493483927, + "grad_norm": 0.02589614875614643, + "learning_rate": 4.1909209383145094e-05, + "loss": 0.0032, + "step": 2980 + }, + { + "epoch": 0.6494352736750652, + "grad_norm": 0.03165976330637932, + "learning_rate": 4.188205907906169e-05, + "loss": 0.0331, + "step": 2990 + }, + { + "epoch": 0.6516072980017377, + "grad_norm": 0.46476975083351135, + "learning_rate": 4.185490877497828e-05, + "loss": 0.0548, + "step": 3000 + }, + { + "epoch": 0.6537793223284101, + "grad_norm": 0.7368276119232178, + "learning_rate": 4.182775847089487e-05, + "loss": 0.0981, + "step": 3010 + }, + { + "epoch": 0.6559513466550826, + "grad_norm": 0.03922104462981224, + "learning_rate": 4.180060816681147e-05, + "loss": 0.0145, + "step": 3020 + }, + { + "epoch": 0.658123370981755, + "grad_norm": 0.03890977427363396, + "learning_rate": 4.1773457862728066e-05, + "loss": 0.026, + "step": 3030 + }, + { + "epoch": 0.6602953953084274, + "grad_norm": 0.05434383079409599, + "learning_rate": 4.174630755864466e-05, + "loss": 0.0045, + "step": 3040 + }, + { + "epoch": 0.6624674196350999, + "grad_norm": 0.032415471971035004, + "learning_rate": 4.171915725456125e-05, + "loss": 0.0997, + "step": 3050 + }, + { + "epoch": 0.6646394439617723, + "grad_norm": 0.024642497301101685, + "learning_rate": 4.1692006950477845e-05, + "loss": 0.016, + "step": 3060 + }, + { + "epoch": 0.6668114682884448, + "grad_norm": 0.023403340950608253, + "learning_rate": 4.166485664639444e-05, + "loss": 0.0055, + "step": 3070 + }, + { + "epoch": 0.6689834926151172, + "grad_norm": 0.07261425256729126, + "learning_rate": 4.163770634231103e-05, + "loss": 0.1822, + "step": 3080 + }, + { + "epoch": 0.6711555169417898, + "grad_norm": 0.13217313587665558, + "learning_rate": 4.161055603822763e-05, + "loss": 0.0597, + "step": 3090 + }, + { + "epoch": 0.6733275412684622, + "grad_norm": 0.026420656591653824, + "learning_rate": 4.1583405734144224e-05, + "loss": 0.0204, + "step": 3100 + }, + { + "epoch": 0.6754995655951347, + "grad_norm": 0.02567846141755581, + "learning_rate": 4.155625543006082e-05, + "loss": 0.0439, + "step": 3110 + }, + { + "epoch": 0.6776715899218071, + "grad_norm": 0.19673630595207214, + "learning_rate": 4.152910512597741e-05, + "loss": 0.0767, + "step": 3120 + }, + { + "epoch": 0.6798436142484796, + "grad_norm": 0.03708234429359436, + "learning_rate": 4.150195482189401e-05, + "loss": 0.0228, + "step": 3130 + }, + { + "epoch": 0.682015638575152, + "grad_norm": 0.026442553848028183, + "learning_rate": 4.14748045178106e-05, + "loss": 0.0036, + "step": 3140 + }, + { + "epoch": 0.6841876629018245, + "grad_norm": 0.05390092357993126, + "learning_rate": 4.1447654213727196e-05, + "loss": 0.0936, + "step": 3150 + }, + { + "epoch": 0.6863596872284969, + "grad_norm": 0.20121973752975464, + "learning_rate": 4.142050390964379e-05, + "loss": 0.0275, + "step": 3160 + }, + { + "epoch": 0.6885317115551695, + "grad_norm": 0.027467237785458565, + "learning_rate": 4.139335360556039e-05, + "loss": 0.0046, + "step": 3170 + }, + { + "epoch": 0.6907037358818419, + "grad_norm": 0.022788554430007935, + "learning_rate": 4.136620330147698e-05, + "loss": 0.0329, + "step": 3180 + }, + { + "epoch": 0.6928757602085144, + "grad_norm": 0.04069753736257553, + "learning_rate": 4.1339052997393575e-05, + "loss": 0.0684, + "step": 3190 + }, + { + "epoch": 0.6950477845351868, + "grad_norm": 0.046147171407938004, + "learning_rate": 4.131190269331017e-05, + "loss": 0.0289, + "step": 3200 + }, + { + "epoch": 0.6972198088618593, + "grad_norm": 0.03701859340071678, + "learning_rate": 4.128475238922676e-05, + "loss": 0.0634, + "step": 3210 + }, + { + "epoch": 0.6993918331885317, + "grad_norm": 0.053171392530202866, + "learning_rate": 4.1257602085143354e-05, + "loss": 0.016, + "step": 3220 + }, + { + "epoch": 0.7015638575152042, + "grad_norm": 0.06105630844831467, + "learning_rate": 4.123045178105995e-05, + "loss": 0.047, + "step": 3230 + }, + { + "epoch": 0.7037358818418766, + "grad_norm": 0.058090176433324814, + "learning_rate": 4.1203301476976547e-05, + "loss": 0.0345, + "step": 3240 + }, + { + "epoch": 0.705907906168549, + "grad_norm": 1.5910778045654297, + "learning_rate": 4.117615117289314e-05, + "loss": 0.0487, + "step": 3250 + }, + { + "epoch": 0.7080799304952216, + "grad_norm": 0.11721807718276978, + "learning_rate": 4.114900086880973e-05, + "loss": 0.0201, + "step": 3260 + }, + { + "epoch": 0.710251954821894, + "grad_norm": 1.5974066257476807, + "learning_rate": 4.1121850564726326e-05, + "loss": 0.0277, + "step": 3270 + }, + { + "epoch": 0.7124239791485665, + "grad_norm": 0.023307811468839645, + "learning_rate": 4.109470026064292e-05, + "loss": 0.0028, + "step": 3280 + }, + { + "epoch": 0.7145960034752389, + "grad_norm": 0.04362959787249565, + "learning_rate": 4.106754995655951e-05, + "loss": 0.0195, + "step": 3290 + }, + { + "epoch": 0.7167680278019114, + "grad_norm": 0.2800048291683197, + "learning_rate": 4.104039965247611e-05, + "loss": 0.0792, + "step": 3300 + }, + { + "epoch": 0.7189400521285838, + "grad_norm": 0.05572018399834633, + "learning_rate": 4.1013249348392704e-05, + "loss": 0.0295, + "step": 3310 + }, + { + "epoch": 0.7211120764552563, + "grad_norm": 0.024460218846797943, + "learning_rate": 4.09860990443093e-05, + "loss": 0.0026, + "step": 3320 + }, + { + "epoch": 0.7232841007819287, + "grad_norm": 0.022767340764403343, + "learning_rate": 4.09589487402259e-05, + "loss": 0.0825, + "step": 3330 + }, + { + "epoch": 0.7254561251086012, + "grad_norm": 0.03402335196733475, + "learning_rate": 4.093179843614249e-05, + "loss": 0.0394, + "step": 3340 + }, + { + "epoch": 0.7276281494352737, + "grad_norm": 0.11153494566679001, + "learning_rate": 4.090464813205908e-05, + "loss": 0.0049, + "step": 3350 + }, + { + "epoch": 0.7298001737619462, + "grad_norm": 1.4000017642974854, + "learning_rate": 4.0877497827975676e-05, + "loss": 0.0656, + "step": 3360 + }, + { + "epoch": 0.7319721980886186, + "grad_norm": 0.10694686323404312, + "learning_rate": 4.085034752389227e-05, + "loss": 0.0139, + "step": 3370 + }, + { + "epoch": 0.7341442224152911, + "grad_norm": 0.022809429094195366, + "learning_rate": 4.082319721980886e-05, + "loss": 0.0089, + "step": 3380 + }, + { + "epoch": 0.7363162467419635, + "grad_norm": 0.024037901312112808, + "learning_rate": 4.0796046915725455e-05, + "loss": 0.0489, + "step": 3390 + }, + { + "epoch": 0.738488271068636, + "grad_norm": 0.024476533755660057, + "learning_rate": 4.0768896611642055e-05, + "loss": 0.0344, + "step": 3400 + }, + { + "epoch": 0.7406602953953084, + "grad_norm": 0.05012943968176842, + "learning_rate": 4.074174630755865e-05, + "loss": 0.0374, + "step": 3410 + }, + { + "epoch": 0.7428323197219809, + "grad_norm": 0.1134481132030487, + "learning_rate": 4.071459600347524e-05, + "loss": 0.0406, + "step": 3420 + }, + { + "epoch": 0.7450043440486533, + "grad_norm": 0.04325913265347481, + "learning_rate": 4.0687445699391834e-05, + "loss": 0.0226, + "step": 3430 + }, + { + "epoch": 0.7471763683753258, + "grad_norm": 0.029155496507883072, + "learning_rate": 4.066029539530843e-05, + "loss": 0.0342, + "step": 3440 + }, + { + "epoch": 0.7493483927019983, + "grad_norm": 0.030118346214294434, + "learning_rate": 4.063314509122502e-05, + "loss": 0.0036, + "step": 3450 + }, + { + "epoch": 0.7515204170286707, + "grad_norm": 0.024459168314933777, + "learning_rate": 4.060599478714161e-05, + "loss": 0.0027, + "step": 3460 + }, + { + "epoch": 0.7536924413553432, + "grad_norm": 0.06704209744930267, + "learning_rate": 4.057884448305821e-05, + "loss": 0.0283, + "step": 3470 + }, + { + "epoch": 0.7558644656820156, + "grad_norm": 0.02914384752511978, + "learning_rate": 4.0551694178974806e-05, + "loss": 0.0316, + "step": 3480 + }, + { + "epoch": 0.7580364900086881, + "grad_norm": 0.26781970262527466, + "learning_rate": 4.05245438748914e-05, + "loss": 0.0396, + "step": 3490 + }, + { + "epoch": 0.7602085143353605, + "grad_norm": 0.19622226059436798, + "learning_rate": 4.049739357080799e-05, + "loss": 0.0151, + "step": 3500 + }, + { + "epoch": 0.762380538662033, + "grad_norm": 0.04174257442355156, + "learning_rate": 4.047024326672459e-05, + "loss": 0.0105, + "step": 3510 + }, + { + "epoch": 0.7645525629887054, + "grad_norm": 1.6611101627349854, + "learning_rate": 4.0443092962641185e-05, + "loss": 0.0353, + "step": 3520 + }, + { + "epoch": 0.766724587315378, + "grad_norm": 1.0151467323303223, + "learning_rate": 4.041594265855778e-05, + "loss": 0.0443, + "step": 3530 + }, + { + "epoch": 0.7688966116420504, + "grad_norm": 0.18949908018112183, + "learning_rate": 4.038879235447437e-05, + "loss": 0.006, + "step": 3540 + }, + { + "epoch": 0.7710686359687229, + "grad_norm": 0.019808197394013405, + "learning_rate": 4.036164205039097e-05, + "loss": 0.0038, + "step": 3550 + }, + { + "epoch": 0.7732406602953953, + "grad_norm": 0.05713880434632301, + "learning_rate": 4.0334491746307564e-05, + "loss": 0.0373, + "step": 3560 + }, + { + "epoch": 0.7754126846220678, + "grad_norm": 0.029946787282824516, + "learning_rate": 4.030734144222416e-05, + "loss": 0.0299, + "step": 3570 + }, + { + "epoch": 0.7775847089487402, + "grad_norm": 0.07558543980121613, + "learning_rate": 4.028019113814075e-05, + "loss": 0.0565, + "step": 3580 + }, + { + "epoch": 0.7797567332754127, + "grad_norm": 0.03241603448987007, + "learning_rate": 4.025304083405734e-05, + "loss": 0.0051, + "step": 3590 + }, + { + "epoch": 0.7819287576020851, + "grad_norm": 0.02400217391550541, + "learning_rate": 4.0225890529973936e-05, + "loss": 0.0363, + "step": 3600 + }, + { + "epoch": 0.7841007819287577, + "grad_norm": 0.13099756836891174, + "learning_rate": 4.019874022589053e-05, + "loss": 0.0175, + "step": 3610 + }, + { + "epoch": 0.7862728062554301, + "grad_norm": 0.7141011357307434, + "learning_rate": 4.017158992180712e-05, + "loss": 0.0122, + "step": 3620 + }, + { + "epoch": 0.7884448305821026, + "grad_norm": 0.04486560821533203, + "learning_rate": 4.014443961772372e-05, + "loss": 0.0279, + "step": 3630 + }, + { + "epoch": 0.790616854908775, + "grad_norm": 0.02008930593729019, + "learning_rate": 4.0117289313640315e-05, + "loss": 0.0497, + "step": 3640 + }, + { + "epoch": 0.7927888792354474, + "grad_norm": 0.1151106208562851, + "learning_rate": 4.009013900955691e-05, + "loss": 0.0327, + "step": 3650 + }, + { + "epoch": 0.7949609035621199, + "grad_norm": 0.02791333571076393, + "learning_rate": 4.00629887054735e-05, + "loss": 0.0057, + "step": 3660 + }, + { + "epoch": 0.7971329278887923, + "grad_norm": 1.612856388092041, + "learning_rate": 4.0035838401390094e-05, + "loss": 0.0368, + "step": 3670 + }, + { + "epoch": 0.7993049522154648, + "grad_norm": 0.03310969099402428, + "learning_rate": 4.0008688097306694e-05, + "loss": 0.0023, + "step": 3680 + }, + { + "epoch": 0.8014769765421372, + "grad_norm": 0.021099913865327835, + "learning_rate": 3.9981537793223287e-05, + "loss": 0.0265, + "step": 3690 + }, + { + "epoch": 0.8036490008688097, + "grad_norm": 0.03919641301035881, + "learning_rate": 3.995438748913988e-05, + "loss": 0.0393, + "step": 3700 + }, + { + "epoch": 0.8058210251954822, + "grad_norm": 2.8318557739257812, + "learning_rate": 3.992723718505648e-05, + "loss": 0.0158, + "step": 3710 + }, + { + "epoch": 0.8079930495221547, + "grad_norm": 0.07735323160886765, + "learning_rate": 3.990008688097307e-05, + "loss": 0.0029, + "step": 3720 + }, + { + "epoch": 0.8101650738488271, + "grad_norm": 0.0240157600492239, + "learning_rate": 3.9872936576889665e-05, + "loss": 0.0233, + "step": 3730 + }, + { + "epoch": 0.8123370981754996, + "grad_norm": 0.04485835134983063, + "learning_rate": 3.984578627280626e-05, + "loss": 0.0169, + "step": 3740 + }, + { + "epoch": 0.814509122502172, + "grad_norm": 5.289847373962402, + "learning_rate": 3.981863596872285e-05, + "loss": 0.0335, + "step": 3750 + }, + { + "epoch": 0.8166811468288445, + "grad_norm": 0.018795961514115334, + "learning_rate": 3.9791485664639444e-05, + "loss": 0.0383, + "step": 3760 + }, + { + "epoch": 0.8188531711555169, + "grad_norm": 0.18189352750778198, + "learning_rate": 3.976433536055604e-05, + "loss": 0.0718, + "step": 3770 + }, + { + "epoch": 0.8210251954821894, + "grad_norm": 0.050412606447935104, + "learning_rate": 3.973718505647264e-05, + "loss": 0.0136, + "step": 3780 + }, + { + "epoch": 0.8231972198088618, + "grad_norm": 0.21731217205524445, + "learning_rate": 3.971003475238923e-05, + "loss": 0.0607, + "step": 3790 + }, + { + "epoch": 0.8253692441355344, + "grad_norm": 0.04902643337845802, + "learning_rate": 3.968288444830582e-05, + "loss": 0.0061, + "step": 3800 + }, + { + "epoch": 0.8275412684622068, + "grad_norm": 0.015945184975862503, + "learning_rate": 3.9655734144222416e-05, + "loss": 0.0021, + "step": 3810 + }, + { + "epoch": 0.8297132927888793, + "grad_norm": 0.05196581408381462, + "learning_rate": 3.962858384013901e-05, + "loss": 0.0584, + "step": 3820 + }, + { + "epoch": 0.8318853171155517, + "grad_norm": 1.6970964670181274, + "learning_rate": 3.96014335360556e-05, + "loss": 0.0235, + "step": 3830 + }, + { + "epoch": 0.8340573414422241, + "grad_norm": 0.01579507440328598, + "learning_rate": 3.9574283231972195e-05, + "loss": 0.0141, + "step": 3840 + }, + { + "epoch": 0.8362293657688966, + "grad_norm": 0.015089421533048153, + "learning_rate": 3.9547132927888795e-05, + "loss": 0.0338, + "step": 3850 + }, + { + "epoch": 0.838401390095569, + "grad_norm": 1.4259192943572998, + "learning_rate": 3.951998262380539e-05, + "loss": 0.0454, + "step": 3860 + }, + { + "epoch": 0.8405734144222415, + "grad_norm": 0.6754148602485657, + "learning_rate": 3.949283231972198e-05, + "loss": 0.08, + "step": 3870 + }, + { + "epoch": 0.8427454387489139, + "grad_norm": 0.045398175716400146, + "learning_rate": 3.946568201563858e-05, + "loss": 0.043, + "step": 3880 + }, + { + "epoch": 0.8449174630755865, + "grad_norm": 0.023143045604228973, + "learning_rate": 3.9438531711555174e-05, + "loss": 0.0036, + "step": 3890 + }, + { + "epoch": 0.8470894874022589, + "grad_norm": 0.015820972621440887, + "learning_rate": 3.941138140747177e-05, + "loss": 0.0062, + "step": 3900 + }, + { + "epoch": 0.8492615117289314, + "grad_norm": 0.014995508827269077, + "learning_rate": 3.938423110338836e-05, + "loss": 0.0318, + "step": 3910 + }, + { + "epoch": 0.8514335360556038, + "grad_norm": 0.01640624739229679, + "learning_rate": 3.935708079930495e-05, + "loss": 0.04, + "step": 3920 + }, + { + "epoch": 0.8536055603822763, + "grad_norm": 0.09035991877317429, + "learning_rate": 3.932993049522155e-05, + "loss": 0.0473, + "step": 3930 + }, + { + "epoch": 0.8557775847089487, + "grad_norm": 0.1076781302690506, + "learning_rate": 3.9302780191138146e-05, + "loss": 0.0227, + "step": 3940 + }, + { + "epoch": 0.8579496090356212, + "grad_norm": 0.017277223989367485, + "learning_rate": 3.927562988705474e-05, + "loss": 0.0039, + "step": 3950 + }, + { + "epoch": 0.8601216333622936, + "grad_norm": 0.01418287679553032, + "learning_rate": 3.924847958297133e-05, + "loss": 0.0016, + "step": 3960 + }, + { + "epoch": 0.8622936576889662, + "grad_norm": 0.013834814541041851, + "learning_rate": 3.9221329278887925e-05, + "loss": 0.0017, + "step": 3970 + }, + { + "epoch": 0.8644656820156386, + "grad_norm": 0.013640154153108597, + "learning_rate": 3.919417897480452e-05, + "loss": 0.0423, + "step": 3980 + }, + { + "epoch": 0.8666377063423111, + "grad_norm": 0.09465904533863068, + "learning_rate": 3.916702867072111e-05, + "loss": 0.0823, + "step": 3990 + }, + { + "epoch": 0.8688097306689835, + "grad_norm": 0.07072905451059341, + "learning_rate": 3.9139878366637704e-05, + "loss": 0.0544, + "step": 4000 + }, + { + "epoch": 0.870981754995656, + "grad_norm": 0.04354293271899223, + "learning_rate": 3.9112728062554304e-05, + "loss": 0.0084, + "step": 4010 + }, + { + "epoch": 0.8731537793223284, + "grad_norm": 0.017259210348129272, + "learning_rate": 3.90855777584709e-05, + "loss": 0.0019, + "step": 4020 + }, + { + "epoch": 0.8753258036490009, + "grad_norm": 0.018902383744716644, + "learning_rate": 3.905842745438749e-05, + "loss": 0.0355, + "step": 4030 + }, + { + "epoch": 0.8774978279756733, + "grad_norm": 0.022762592881917953, + "learning_rate": 3.903127715030408e-05, + "loss": 0.0376, + "step": 4040 + }, + { + "epoch": 0.8796698523023457, + "grad_norm": 0.810174822807312, + "learning_rate": 3.9004126846220676e-05, + "loss": 0.1255, + "step": 4050 + }, + { + "epoch": 0.8818418766290183, + "grad_norm": 0.024319645017385483, + "learning_rate": 3.8976976542137276e-05, + "loss": 0.0112, + "step": 4060 + }, + { + "epoch": 0.8840139009556907, + "grad_norm": 0.01799897663295269, + "learning_rate": 3.894982623805387e-05, + "loss": 0.0022, + "step": 4070 + }, + { + "epoch": 0.8861859252823632, + "grad_norm": 0.016442058607935905, + "learning_rate": 3.892267593397046e-05, + "loss": 0.0018, + "step": 4080 + }, + { + "epoch": 0.8883579496090356, + "grad_norm": 0.015257969498634338, + "learning_rate": 3.889552562988706e-05, + "loss": 0.064, + "step": 4090 + }, + { + "epoch": 0.8905299739357081, + "grad_norm": 0.03301286697387695, + "learning_rate": 3.8868375325803654e-05, + "loss": 0.0188, + "step": 4100 + }, + { + "epoch": 0.8927019982623805, + "grad_norm": 0.09779565036296844, + "learning_rate": 3.884122502172025e-05, + "loss": 0.0401, + "step": 4110 + }, + { + "epoch": 0.894874022589053, + "grad_norm": 0.10050684213638306, + "learning_rate": 3.881407471763684e-05, + "loss": 0.0068, + "step": 4120 + }, + { + "epoch": 0.8970460469157254, + "grad_norm": 1.210329294204712, + "learning_rate": 3.8786924413553433e-05, + "loss": 0.0995, + "step": 4130 + }, + { + "epoch": 0.8992180712423979, + "grad_norm": 2.245310068130493, + "learning_rate": 3.876248913987837e-05, + "loss": 0.1687, + "step": 4140 + }, + { + "epoch": 0.9013900955690703, + "grad_norm": 0.03958917781710625, + "learning_rate": 3.873533883579497e-05, + "loss": 0.0064, + "step": 4150 + }, + { + "epoch": 0.9035621198957429, + "grad_norm": 0.018245557323098183, + "learning_rate": 3.870818853171156e-05, + "loss": 0.03, + "step": 4160 + }, + { + "epoch": 0.9057341442224153, + "grad_norm": 0.017429698258638382, + "learning_rate": 3.868103822762815e-05, + "loss": 0.0024, + "step": 4170 + }, + { + "epoch": 0.9079061685490878, + "grad_norm": 0.029029618948698044, + "learning_rate": 3.8653887923544746e-05, + "loss": 0.0026, + "step": 4180 + }, + { + "epoch": 0.9100781928757602, + "grad_norm": 0.034047931432724, + "learning_rate": 3.862673761946134e-05, + "loss": 0.0189, + "step": 4190 + }, + { + "epoch": 0.9122502172024327, + "grad_norm": 0.06103040277957916, + "learning_rate": 3.859958731537793e-05, + "loss": 0.0151, + "step": 4200 + }, + { + "epoch": 0.9144222415291051, + "grad_norm": 0.018298335373401642, + "learning_rate": 3.8572437011294525e-05, + "loss": 0.0399, + "step": 4210 + }, + { + "epoch": 0.9165942658557776, + "grad_norm": 0.018746716901659966, + "learning_rate": 3.854528670721112e-05, + "loss": 0.0136, + "step": 4220 + }, + { + "epoch": 0.91876629018245, + "grad_norm": 2.7724406719207764, + "learning_rate": 3.851813640312772e-05, + "loss": 0.0163, + "step": 4230 + }, + { + "epoch": 0.9209383145091226, + "grad_norm": 0.03073030896484852, + "learning_rate": 3.849098609904431e-05, + "loss": 0.0025, + "step": 4240 + }, + { + "epoch": 0.923110338835795, + "grad_norm": 0.06461982429027557, + "learning_rate": 3.8463835794960904e-05, + "loss": 0.0488, + "step": 4250 + }, + { + "epoch": 0.9252823631624674, + "grad_norm": 0.023927874863147736, + "learning_rate": 3.84366854908775e-05, + "loss": 0.0078, + "step": 4260 + }, + { + "epoch": 0.9274543874891399, + "grad_norm": 0.022562723606824875, + "learning_rate": 3.840953518679409e-05, + "loss": 0.0357, + "step": 4270 + }, + { + "epoch": 0.9296264118158123, + "grad_norm": 0.10135874897241592, + "learning_rate": 3.838238488271069e-05, + "loss": 0.0118, + "step": 4280 + }, + { + "epoch": 0.9317984361424848, + "grad_norm": 0.014547540806233883, + "learning_rate": 3.835523457862728e-05, + "loss": 0.022, + "step": 4290 + }, + { + "epoch": 0.9339704604691572, + "grad_norm": 0.013702181167900562, + "learning_rate": 3.8328084274543876e-05, + "loss": 0.0181, + "step": 4300 + }, + { + "epoch": 0.9361424847958297, + "grad_norm": 0.022185347974300385, + "learning_rate": 3.8300933970460476e-05, + "loss": 0.0714, + "step": 4310 + }, + { + "epoch": 0.9383145091225021, + "grad_norm": 0.04213215410709381, + "learning_rate": 3.827378366637707e-05, + "loss": 0.0432, + "step": 4320 + }, + { + "epoch": 0.9404865334491747, + "grad_norm": 1.9515026807785034, + "learning_rate": 3.824663336229366e-05, + "loss": 0.0362, + "step": 4330 + }, + { + "epoch": 0.9426585577758471, + "grad_norm": 0.03761598840355873, + "learning_rate": 3.8219483058210255e-05, + "loss": 0.0458, + "step": 4340 + }, + { + "epoch": 0.9448305821025196, + "grad_norm": 0.0594487339258194, + "learning_rate": 3.819233275412685e-05, + "loss": 0.0042, + "step": 4350 + }, + { + "epoch": 0.947002606429192, + "grad_norm": 0.01828021928668022, + "learning_rate": 3.816518245004344e-05, + "loss": 0.0374, + "step": 4360 + }, + { + "epoch": 0.9491746307558645, + "grad_norm": 0.01723085157573223, + "learning_rate": 3.8138032145960034e-05, + "loss": 0.0033, + "step": 4370 + }, + { + "epoch": 0.9513466550825369, + "grad_norm": 1.6202086210250854, + "learning_rate": 3.8110881841876633e-05, + "loss": 0.0281, + "step": 4380 + }, + { + "epoch": 0.9535186794092094, + "grad_norm": 1.0708816051483154, + "learning_rate": 3.8083731537793227e-05, + "loss": 0.0467, + "step": 4390 + }, + { + "epoch": 0.9556907037358818, + "grad_norm": 0.08704803138971329, + "learning_rate": 3.805658123370982e-05, + "loss": 0.0152, + "step": 4400 + }, + { + "epoch": 0.9578627280625543, + "grad_norm": 0.014155888929963112, + "learning_rate": 3.802943092962641e-05, + "loss": 0.0148, + "step": 4410 + }, + { + "epoch": 0.9600347523892268, + "grad_norm": 0.02981710433959961, + "learning_rate": 3.8002280625543006e-05, + "loss": 0.0496, + "step": 4420 + }, + { + "epoch": 0.9622067767158993, + "grad_norm": 0.05970924347639084, + "learning_rate": 3.79751303214596e-05, + "loss": 0.0223, + "step": 4430 + }, + { + "epoch": 0.9643788010425717, + "grad_norm": 0.03441289812326431, + "learning_rate": 3.794798001737619e-05, + "loss": 0.0024, + "step": 4440 + }, + { + "epoch": 0.9665508253692441, + "grad_norm": 0.024355070665478706, + "learning_rate": 3.792082971329279e-05, + "loss": 0.0502, + "step": 4450 + }, + { + "epoch": 0.9687228496959166, + "grad_norm": 0.10018374770879745, + "learning_rate": 3.7893679409209384e-05, + "loss": 0.0919, + "step": 4460 + }, + { + "epoch": 0.970894874022589, + "grad_norm": 0.0919993445277214, + "learning_rate": 3.786652910512598e-05, + "loss": 0.0154, + "step": 4470 + }, + { + "epoch": 0.9730668983492615, + "grad_norm": 0.03763913735747337, + "learning_rate": 3.783937880104258e-05, + "loss": 0.0172, + "step": 4480 + }, + { + "epoch": 0.9752389226759339, + "grad_norm": 0.17443352937698364, + "learning_rate": 3.781222849695917e-05, + "loss": 0.0353, + "step": 4490 + }, + { + "epoch": 0.9774109470026064, + "grad_norm": 0.08582145720720291, + "learning_rate": 3.778507819287576e-05, + "loss": 0.0328, + "step": 4500 + }, + { + "epoch": 0.9795829713292789, + "grad_norm": 0.017119983211159706, + "learning_rate": 3.7757927888792356e-05, + "loss": 0.0056, + "step": 4510 + }, + { + "epoch": 0.9817549956559514, + "grad_norm": 0.014104608446359634, + "learning_rate": 3.773077758470895e-05, + "loss": 0.0532, + "step": 4520 + }, + { + "epoch": 0.9839270199826238, + "grad_norm": 0.13239271938800812, + "learning_rate": 3.770362728062555e-05, + "loss": 0.0027, + "step": 4530 + }, + { + "epoch": 0.9860990443092963, + "grad_norm": 0.020031528547406197, + "learning_rate": 3.767647697654214e-05, + "loss": 0.0028, + "step": 4540 + }, + { + "epoch": 0.9882710686359687, + "grad_norm": 0.014798794873058796, + "learning_rate": 3.7649326672458735e-05, + "loss": 0.0385, + "step": 4550 + }, + { + "epoch": 0.9904430929626412, + "grad_norm": 0.16992108523845673, + "learning_rate": 3.762217636837533e-05, + "loss": 0.0496, + "step": 4560 + }, + { + "epoch": 0.9926151172893136, + "grad_norm": 0.06289473921060562, + "learning_rate": 3.759502606429192e-05, + "loss": 0.0077, + "step": 4570 + }, + { + "epoch": 0.9947871416159861, + "grad_norm": 0.03464280068874359, + "learning_rate": 3.7567875760208514e-05, + "loss": 0.0296, + "step": 4580 + }, + { + "epoch": 0.9969591659426585, + "grad_norm": 0.01265657227486372, + "learning_rate": 3.754072545612511e-05, + "loss": 0.0051, + "step": 4590 + }, + { + "epoch": 0.9991311902693311, + "grad_norm": 0.8480960130691528, + "learning_rate": 3.75135751520417e-05, + "loss": 0.0462, + "step": 4600 + }, + { + "epoch": 1.0, + "eval_f1": 0.4793388429752066, + "eval_loss": 0.05696646869182587, + "eval_runtime": 83.2949, + "eval_samples_per_second": 119.755, + "eval_steps_per_second": 7.491, + "step": 4604 + }, + { + "epoch": 1.0013032145960035, + "grad_norm": 2.0061440467834473, + "learning_rate": 3.74864248479583e-05, + "loss": 0.0313, + "step": 4610 + }, + { + "epoch": 1.003475238922676, + "grad_norm": 0.05946578085422516, + "learning_rate": 3.745927454387489e-05, + "loss": 0.0033, + "step": 4620 + }, + { + "epoch": 1.0056472632493485, + "grad_norm": 0.04357343912124634, + "learning_rate": 3.7432124239791486e-05, + "loss": 0.0493, + "step": 4630 + }, + { + "epoch": 1.0078192875760208, + "grad_norm": 0.03947281464934349, + "learning_rate": 3.740497393570808e-05, + "loss": 0.0024, + "step": 4640 + }, + { + "epoch": 1.0099913119026933, + "grad_norm": 0.016838541254401207, + "learning_rate": 3.737782363162468e-05, + "loss": 0.0163, + "step": 4650 + }, + { + "epoch": 1.0121633362293658, + "grad_norm": 0.06586115807294846, + "learning_rate": 3.735067332754127e-05, + "loss": 0.0603, + "step": 4660 + }, + { + "epoch": 1.0143353605560381, + "grad_norm": 1.1860322952270508, + "learning_rate": 3.7323523023457865e-05, + "loss": 0.0339, + "step": 4670 + }, + { + "epoch": 1.0165073848827106, + "grad_norm": 0.16159577667713165, + "learning_rate": 3.729637271937446e-05, + "loss": 0.0216, + "step": 4680 + }, + { + "epoch": 1.0186794092093832, + "grad_norm": 0.01223788969218731, + "learning_rate": 3.726922241529106e-05, + "loss": 0.0045, + "step": 4690 + }, + { + "epoch": 1.0208514335360557, + "grad_norm": 0.011866359040141106, + "learning_rate": 3.724207211120765e-05, + "loss": 0.0014, + "step": 4700 + }, + { + "epoch": 1.023023457862728, + "grad_norm": 0.011750188656151295, + "learning_rate": 3.7214921807124244e-05, + "loss": 0.0295, + "step": 4710 + }, + { + "epoch": 1.0251954821894005, + "grad_norm": 0.028903665021061897, + "learning_rate": 3.718777150304084e-05, + "loss": 0.0051, + "step": 4720 + }, + { + "epoch": 1.027367506516073, + "grad_norm": 0.034531235694885254, + "learning_rate": 3.716062119895743e-05, + "loss": 0.0242, + "step": 4730 + }, + { + "epoch": 1.0295395308427455, + "grad_norm": 0.078121617436409, + "learning_rate": 3.713347089487402e-05, + "loss": 0.0477, + "step": 4740 + }, + { + "epoch": 1.0317115551694178, + "grad_norm": 0.06223325431346893, + "learning_rate": 3.7106320590790616e-05, + "loss": 0.0186, + "step": 4750 + }, + { + "epoch": 1.0338835794960903, + "grad_norm": 0.052579279989004135, + "learning_rate": 3.7079170286707216e-05, + "loss": 0.0125, + "step": 4760 + }, + { + "epoch": 1.0360556038227628, + "grad_norm": 1.9749809503555298, + "learning_rate": 3.705201998262381e-05, + "loss": 0.0553, + "step": 4770 + }, + { + "epoch": 1.0382276281494354, + "grad_norm": 0.20051714777946472, + "learning_rate": 3.70248696785404e-05, + "loss": 0.0504, + "step": 4780 + }, + { + "epoch": 1.0403996524761077, + "grad_norm": 0.011600709520280361, + "learning_rate": 3.6997719374456995e-05, + "loss": 0.0343, + "step": 4790 + }, + { + "epoch": 1.0425716768027802, + "grad_norm": 4.0308122634887695, + "learning_rate": 3.697056907037359e-05, + "loss": 0.0265, + "step": 4800 + }, + { + "epoch": 1.0447437011294527, + "grad_norm": 1.1246380805969238, + "learning_rate": 3.694341876629018e-05, + "loss": 0.0407, + "step": 4810 + }, + { + "epoch": 1.0469157254561252, + "grad_norm": 0.07232671976089478, + "learning_rate": 3.6916268462206774e-05, + "loss": 0.0088, + "step": 4820 + }, + { + "epoch": 1.0490877497827975, + "grad_norm": 0.03161951154470444, + "learning_rate": 3.6889118158123373e-05, + "loss": 0.038, + "step": 4830 + }, + { + "epoch": 1.05125977410947, + "grad_norm": 0.08089262247085571, + "learning_rate": 3.6861967854039967e-05, + "loss": 0.0022, + "step": 4840 + }, + { + "epoch": 1.0534317984361425, + "grad_norm": 0.019513120874762535, + "learning_rate": 3.683481754995656e-05, + "loss": 0.0115, + "step": 4850 + }, + { + "epoch": 1.0556038227628148, + "grad_norm": 0.010743933729827404, + "learning_rate": 3.680766724587316e-05, + "loss": 0.0013, + "step": 4860 + }, + { + "epoch": 1.0577758470894874, + "grad_norm": 0.010847543366253376, + "learning_rate": 3.678051694178975e-05, + "loss": 0.0013, + "step": 4870 + }, + { + "epoch": 1.0599478714161599, + "grad_norm": 0.010532297194004059, + "learning_rate": 3.6753366637706345e-05, + "loss": 0.0092, + "step": 4880 + }, + { + "epoch": 1.0621198957428324, + "grad_norm": 0.010408210568130016, + "learning_rate": 3.672621633362294e-05, + "loss": 0.0424, + "step": 4890 + }, + { + "epoch": 1.0642919200695047, + "grad_norm": 0.055000320076942444, + "learning_rate": 3.669906602953953e-05, + "loss": 0.0496, + "step": 4900 + }, + { + "epoch": 1.0664639443961772, + "grad_norm": 0.03006312996149063, + "learning_rate": 3.667191572545613e-05, + "loss": 0.0073, + "step": 4910 + }, + { + "epoch": 1.0686359687228497, + "grad_norm": 0.02324208803474903, + "learning_rate": 3.6644765421372724e-05, + "loss": 0.0053, + "step": 4920 + }, + { + "epoch": 1.0708079930495222, + "grad_norm": 0.015525688417255878, + "learning_rate": 3.661761511728932e-05, + "loss": 0.052, + "step": 4930 + }, + { + "epoch": 1.0729800173761945, + "grad_norm": 0.5552154779434204, + "learning_rate": 3.659046481320591e-05, + "loss": 0.0397, + "step": 4940 + }, + { + "epoch": 1.075152041702867, + "grad_norm": 0.14947287738323212, + "learning_rate": 3.65633145091225e-05, + "loss": 0.0036, + "step": 4950 + }, + { + "epoch": 1.0773240660295396, + "grad_norm": 0.018405791372060776, + "learning_rate": 3.6536164205039096e-05, + "loss": 0.0019, + "step": 4960 + }, + { + "epoch": 1.079496090356212, + "grad_norm": 0.012576512061059475, + "learning_rate": 3.650901390095569e-05, + "loss": 0.0031, + "step": 4970 + }, + { + "epoch": 1.0816681146828844, + "grad_norm": 0.4258500933647156, + "learning_rate": 3.648186359687228e-05, + "loss": 0.0888, + "step": 4980 + }, + { + "epoch": 1.083840139009557, + "grad_norm": 0.11915474385023117, + "learning_rate": 3.645471329278888e-05, + "loss": 0.0081, + "step": 4990 + }, + { + "epoch": 1.0860121633362294, + "grad_norm": 0.011587638407945633, + "learning_rate": 3.6427562988705475e-05, + "loss": 0.0148, + "step": 5000 + }, + { + "epoch": 1.088184187662902, + "grad_norm": 0.014459837228059769, + "learning_rate": 3.640041268462207e-05, + "loss": 0.0026, + "step": 5010 + }, + { + "epoch": 1.0903562119895742, + "grad_norm": 0.010975486598908901, + "learning_rate": 3.637326238053866e-05, + "loss": 0.0292, + "step": 5020 + }, + { + "epoch": 1.0925282363162467, + "grad_norm": 0.1021094024181366, + "learning_rate": 3.634611207645526e-05, + "loss": 0.0045, + "step": 5030 + }, + { + "epoch": 1.0947002606429193, + "grad_norm": 0.030722634866833687, + "learning_rate": 3.6318961772371854e-05, + "loss": 0.0021, + "step": 5040 + }, + { + "epoch": 1.0968722849695918, + "grad_norm": 0.010209214873611927, + "learning_rate": 3.629181146828845e-05, + "loss": 0.0021, + "step": 5050 + }, + { + "epoch": 1.099044309296264, + "grad_norm": 0.009853394702076912, + "learning_rate": 3.626466116420504e-05, + "loss": 0.0017, + "step": 5060 + }, + { + "epoch": 1.1012163336229366, + "grad_norm": 0.009726290591061115, + "learning_rate": 3.623751086012164e-05, + "loss": 0.0319, + "step": 5070 + }, + { + "epoch": 1.103388357949609, + "grad_norm": 0.010656571947038174, + "learning_rate": 3.621036055603823e-05, + "loss": 0.0017, + "step": 5080 + }, + { + "epoch": 1.1055603822762814, + "grad_norm": 0.009740895591676235, + "learning_rate": 3.6183210251954826e-05, + "loss": 0.0016, + "step": 5090 + }, + { + "epoch": 1.107732406602954, + "grad_norm": 0.009546713903546333, + "learning_rate": 3.615605994787142e-05, + "loss": 0.0027, + "step": 5100 + }, + { + "epoch": 1.1099044309296264, + "grad_norm": 0.11545804142951965, + "learning_rate": 3.612890964378801e-05, + "loss": 0.032, + "step": 5110 + }, + { + "epoch": 1.112076455256299, + "grad_norm": 0.23652075231075287, + "learning_rate": 3.6101759339704605e-05, + "loss": 0.0037, + "step": 5120 + }, + { + "epoch": 1.1142484795829712, + "grad_norm": 0.009880350902676582, + "learning_rate": 3.60746090356212e-05, + "loss": 0.0279, + "step": 5130 + }, + { + "epoch": 1.1164205039096438, + "grad_norm": 0.057632774114608765, + "learning_rate": 3.60474587315378e-05, + "loss": 0.0849, + "step": 5140 + }, + { + "epoch": 1.1185925282363163, + "grad_norm": 0.07699901610612869, + "learning_rate": 3.602030842745439e-05, + "loss": 0.005, + "step": 5150 + }, + { + "epoch": 1.1207645525629888, + "grad_norm": 0.5865737795829773, + "learning_rate": 3.5993158123370984e-05, + "loss": 0.0401, + "step": 5160 + }, + { + "epoch": 1.122936576889661, + "grad_norm": 0.03473491966724396, + "learning_rate": 3.596600781928758e-05, + "loss": 0.0057, + "step": 5170 + }, + { + "epoch": 1.1251086012163336, + "grad_norm": 3.234090805053711, + "learning_rate": 3.593885751520417e-05, + "loss": 0.045, + "step": 5180 + }, + { + "epoch": 1.1272806255430061, + "grad_norm": 0.038007259368896484, + "learning_rate": 3.591170721112076e-05, + "loss": 0.0038, + "step": 5190 + }, + { + "epoch": 1.1294526498696786, + "grad_norm": 8.481440544128418, + "learning_rate": 3.5884556907037356e-05, + "loss": 0.0237, + "step": 5200 + }, + { + "epoch": 1.131624674196351, + "grad_norm": 0.010371362790465355, + "learning_rate": 3.5857406602953956e-05, + "loss": 0.0243, + "step": 5210 + }, + { + "epoch": 1.1337966985230234, + "grad_norm": 0.013176986016333103, + "learning_rate": 3.583025629887055e-05, + "loss": 0.032, + "step": 5220 + }, + { + "epoch": 1.135968722849696, + "grad_norm": 0.0369015671312809, + "learning_rate": 3.580310599478715e-05, + "loss": 0.0279, + "step": 5230 + }, + { + "epoch": 1.1381407471763683, + "grad_norm": 0.18637622892856598, + "learning_rate": 3.577595569070374e-05, + "loss": 0.0033, + "step": 5240 + }, + { + "epoch": 1.1403127715030408, + "grad_norm": 0.05641024932265282, + "learning_rate": 3.5748805386620334e-05, + "loss": 0.014, + "step": 5250 + }, + { + "epoch": 1.1424847958297133, + "grad_norm": 0.19416458904743195, + "learning_rate": 3.572165508253693e-05, + "loss": 0.004, + "step": 5260 + }, + { + "epoch": 1.1446568201563858, + "grad_norm": 0.009675233624875546, + "learning_rate": 3.569450477845352e-05, + "loss": 0.0154, + "step": 5270 + }, + { + "epoch": 1.1468288444830583, + "grad_norm": 0.019023172557353973, + "learning_rate": 3.5667354474370113e-05, + "loss": 0.0136, + "step": 5280 + }, + { + "epoch": 1.1490008688097306, + "grad_norm": 0.02198546566069126, + "learning_rate": 3.5640204170286706e-05, + "loss": 0.0453, + "step": 5290 + }, + { + "epoch": 1.1511728931364031, + "grad_norm": 0.47042351961135864, + "learning_rate": 3.5613053866203306e-05, + "loss": 0.0341, + "step": 5300 + }, + { + "epoch": 1.1533449174630757, + "grad_norm": 0.008669150061905384, + "learning_rate": 3.55859035621199e-05, + "loss": 0.0225, + "step": 5310 + }, + { + "epoch": 1.155516941789748, + "grad_norm": 0.008939978666603565, + "learning_rate": 3.555875325803649e-05, + "loss": 0.0277, + "step": 5320 + }, + { + "epoch": 1.1576889661164205, + "grad_norm": 2.583348512649536, + "learning_rate": 3.5531602953953085e-05, + "loss": 0.147, + "step": 5330 + }, + { + "epoch": 1.159860990443093, + "grad_norm": 0.8401560187339783, + "learning_rate": 3.550445264986968e-05, + "loss": 0.0502, + "step": 5340 + }, + { + "epoch": 1.1620330147697655, + "grad_norm": 0.07709191739559174, + "learning_rate": 3.547730234578627e-05, + "loss": 0.0124, + "step": 5350 + }, + { + "epoch": 1.1642050390964378, + "grad_norm": 0.010382450185716152, + "learning_rate": 3.5450152041702864e-05, + "loss": 0.0038, + "step": 5360 + }, + { + "epoch": 1.1663770634231103, + "grad_norm": 0.2649940252304077, + "learning_rate": 3.5423001737619464e-05, + "loss": 0.0034, + "step": 5370 + }, + { + "epoch": 1.1685490877497828, + "grad_norm": 0.013010102324187756, + "learning_rate": 3.539585143353606e-05, + "loss": 0.0011, + "step": 5380 + }, + { + "epoch": 1.1707211120764554, + "grad_norm": 0.014443314634263515, + "learning_rate": 3.536870112945265e-05, + "loss": 0.0625, + "step": 5390 + }, + { + "epoch": 1.1728931364031276, + "grad_norm": 1.5826534032821655, + "learning_rate": 3.534155082536924e-05, + "loss": 0.0399, + "step": 5400 + }, + { + "epoch": 1.1750651607298002, + "grad_norm": 0.1101546585559845, + "learning_rate": 3.531440052128584e-05, + "loss": 0.0501, + "step": 5410 + }, + { + "epoch": 1.1772371850564727, + "grad_norm": 0.4018704891204834, + "learning_rate": 3.5287250217202436e-05, + "loss": 0.0336, + "step": 5420 + }, + { + "epoch": 1.1794092093831452, + "grad_norm": 0.009711096063256264, + "learning_rate": 3.526009991311903e-05, + "loss": 0.005, + "step": 5430 + }, + { + "epoch": 1.1815812337098175, + "grad_norm": 0.01917942240834236, + "learning_rate": 3.523294960903562e-05, + "loss": 0.1212, + "step": 5440 + }, + { + "epoch": 1.18375325803649, + "grad_norm": 0.02508840151131153, + "learning_rate": 3.520579930495222e-05, + "loss": 0.0028, + "step": 5450 + }, + { + "epoch": 1.1859252823631625, + "grad_norm": 0.0388546884059906, + "learning_rate": 3.5178649000868815e-05, + "loss": 0.0312, + "step": 5460 + }, + { + "epoch": 1.1880973066898348, + "grad_norm": 0.06240135803818703, + "learning_rate": 3.515149869678541e-05, + "loss": 0.0353, + "step": 5470 + }, + { + "epoch": 1.1902693310165073, + "grad_norm": 0.061889566481113434, + "learning_rate": 3.5124348392702e-05, + "loss": 0.0255, + "step": 5480 + }, + { + "epoch": 1.1924413553431799, + "grad_norm": 0.07585006207227707, + "learning_rate": 3.5097198088618594e-05, + "loss": 0.004, + "step": 5490 + }, + { + "epoch": 1.1946133796698524, + "grad_norm": 0.039220329374074936, + "learning_rate": 3.507004778453519e-05, + "loss": 0.024, + "step": 5500 + }, + { + "epoch": 1.1967854039965247, + "grad_norm": 0.17422033846378326, + "learning_rate": 3.504289748045178e-05, + "loss": 0.0257, + "step": 5510 + }, + { + "epoch": 1.1989574283231972, + "grad_norm": 0.02887544222176075, + "learning_rate": 3.501574717636838e-05, + "loss": 0.0038, + "step": 5520 + }, + { + "epoch": 1.2011294526498697, + "grad_norm": 0.044750556349754333, + "learning_rate": 3.498859687228497e-05, + "loss": 0.0329, + "step": 5530 + }, + { + "epoch": 1.2033014769765422, + "grad_norm": 0.00962216965854168, + "learning_rate": 3.4961446568201566e-05, + "loss": 0.0027, + "step": 5540 + }, + { + "epoch": 1.2054735013032145, + "grad_norm": 0.008415351621806622, + "learning_rate": 3.493429626411816e-05, + "loss": 0.0022, + "step": 5550 + }, + { + "epoch": 1.207645525629887, + "grad_norm": 0.1523517519235611, + "learning_rate": 3.490714596003475e-05, + "loss": 0.0168, + "step": 5560 + }, + { + "epoch": 1.2098175499565595, + "grad_norm": 0.008230285719037056, + "learning_rate": 3.4879995655951345e-05, + "loss": 0.0202, + "step": 5570 + }, + { + "epoch": 1.211989574283232, + "grad_norm": 0.008194214664399624, + "learning_rate": 3.485284535186794e-05, + "loss": 0.0053, + "step": 5580 + }, + { + "epoch": 1.2141615986099044, + "grad_norm": 0.008157436735928059, + "learning_rate": 3.482569504778454e-05, + "loss": 0.008, + "step": 5590 + }, + { + "epoch": 1.2163336229365769, + "grad_norm": 0.008164693601429462, + "learning_rate": 3.479854474370113e-05, + "loss": 0.0326, + "step": 5600 + }, + { + "epoch": 1.2185056472632494, + "grad_norm": 0.01076839491724968, + "learning_rate": 3.477139443961773e-05, + "loss": 0.0011, + "step": 5610 + }, + { + "epoch": 1.2206776715899217, + "grad_norm": 0.007989328354597092, + "learning_rate": 3.4744244135534323e-05, + "loss": 0.0058, + "step": 5620 + }, + { + "epoch": 1.2228496959165942, + "grad_norm": 0.08358601480722427, + "learning_rate": 3.4717093831450917e-05, + "loss": 0.045, + "step": 5630 + }, + { + "epoch": 1.2250217202432667, + "grad_norm": 0.08348576724529266, + "learning_rate": 3.468994352736751e-05, + "loss": 0.0024, + "step": 5640 + }, + { + "epoch": 1.2271937445699392, + "grad_norm": 0.008693045936524868, + "learning_rate": 3.46627932232841e-05, + "loss": 0.0026, + "step": 5650 + }, + { + "epoch": 1.2293657688966118, + "grad_norm": 0.007211147341877222, + "learning_rate": 3.4635642919200696e-05, + "loss": 0.0096, + "step": 5660 + }, + { + "epoch": 1.231537793223284, + "grad_norm": 0.007140511646866798, + "learning_rate": 3.460849261511729e-05, + "loss": 0.0008, + "step": 5670 + }, + { + "epoch": 1.2337098175499566, + "grad_norm": 0.8093301653862, + "learning_rate": 3.458134231103389e-05, + "loss": 0.0773, + "step": 5680 + }, + { + "epoch": 1.235881841876629, + "grad_norm": 0.04519034922122955, + "learning_rate": 3.455419200695048e-05, + "loss": 0.0012, + "step": 5690 + }, + { + "epoch": 1.2380538662033014, + "grad_norm": 1.5662466287612915, + "learning_rate": 3.4527041702867074e-05, + "loss": 0.0354, + "step": 5700 + }, + { + "epoch": 1.240225890529974, + "grad_norm": 0.051535408943891525, + "learning_rate": 3.449989139878367e-05, + "loss": 0.0272, + "step": 5710 + }, + { + "epoch": 1.2423979148566464, + "grad_norm": 0.008189026266336441, + "learning_rate": 3.447274109470026e-05, + "loss": 0.0048, + "step": 5720 + }, + { + "epoch": 1.244569939183319, + "grad_norm": 0.007900476455688477, + "learning_rate": 3.4445590790616853e-05, + "loss": 0.0009, + "step": 5730 + }, + { + "epoch": 1.2467419635099912, + "grad_norm": 0.00759408064186573, + "learning_rate": 3.4418440486533446e-05, + "loss": 0.0008, + "step": 5740 + }, + { + "epoch": 1.2489139878366637, + "grad_norm": 0.7654576301574707, + "learning_rate": 3.4391290182450046e-05, + "loss": 0.0588, + "step": 5750 + }, + { + "epoch": 1.2510860121633363, + "grad_norm": 0.008799172006547451, + "learning_rate": 3.436413987836664e-05, + "loss": 0.0014, + "step": 5760 + }, + { + "epoch": 1.2532580364900086, + "grad_norm": 0.011267498135566711, + "learning_rate": 3.433698957428323e-05, + "loss": 0.0523, + "step": 5770 + }, + { + "epoch": 1.255430060816681, + "grad_norm": 0.029857400804758072, + "learning_rate": 3.4309839270199825e-05, + "loss": 0.0369, + "step": 5780 + }, + { + "epoch": 1.2576020851433536, + "grad_norm": 0.024623876437544823, + "learning_rate": 3.4282688966116425e-05, + "loss": 0.0054, + "step": 5790 + }, + { + "epoch": 1.259774109470026, + "grad_norm": 0.012604706920683384, + "learning_rate": 3.425553866203302e-05, + "loss": 0.0031, + "step": 5800 + }, + { + "epoch": 1.2619461337966986, + "grad_norm": 0.009479483589529991, + "learning_rate": 3.422838835794961e-05, + "loss": 0.0117, + "step": 5810 + }, + { + "epoch": 1.264118158123371, + "grad_norm": 0.018399232998490334, + "learning_rate": 3.4201238053866204e-05, + "loss": 0.0335, + "step": 5820 + }, + { + "epoch": 1.2662901824500434, + "grad_norm": 0.024806447327136993, + "learning_rate": 3.4174087749782804e-05, + "loss": 0.0021, + "step": 5830 + }, + { + "epoch": 1.268462206776716, + "grad_norm": 0.01572875864803791, + "learning_rate": 3.41469374456994e-05, + "loss": 0.0192, + "step": 5840 + }, + { + "epoch": 1.2706342311033882, + "grad_norm": 0.009287680499255657, + "learning_rate": 3.411978714161599e-05, + "loss": 0.0046, + "step": 5850 + }, + { + "epoch": 1.2728062554300608, + "grad_norm": 3.3354904651641846, + "learning_rate": 3.409263683753258e-05, + "loss": 0.0236, + "step": 5860 + }, + { + "epoch": 1.2749782797567333, + "grad_norm": 0.01085092592984438, + "learning_rate": 3.4065486533449176e-05, + "loss": 0.0758, + "step": 5870 + }, + { + "epoch": 1.2771503040834058, + "grad_norm": 0.06153455376625061, + "learning_rate": 3.403833622936577e-05, + "loss": 0.0023, + "step": 5880 + }, + { + "epoch": 1.2793223284100783, + "grad_norm": 0.08027364313602448, + "learning_rate": 3.401118592528236e-05, + "loss": 0.0242, + "step": 5890 + }, + { + "epoch": 1.2814943527367506, + "grad_norm": 0.026004912331700325, + "learning_rate": 3.3984035621198955e-05, + "loss": 0.0135, + "step": 5900 + }, + { + "epoch": 1.2836663770634231, + "grad_norm": 0.007019513752311468, + "learning_rate": 3.3956885317115555e-05, + "loss": 0.0018, + "step": 5910 + }, + { + "epoch": 1.2858384013900956, + "grad_norm": 0.007902990095317364, + "learning_rate": 3.392973501303215e-05, + "loss": 0.0461, + "step": 5920 + }, + { + "epoch": 1.288010425716768, + "grad_norm": 0.008273580111563206, + "learning_rate": 3.390258470894874e-05, + "loss": 0.0015, + "step": 5930 + }, + { + "epoch": 1.2901824500434405, + "grad_norm": 0.008654528297483921, + "learning_rate": 3.3875434404865334e-05, + "loss": 0.0637, + "step": 5940 + }, + { + "epoch": 1.292354474370113, + "grad_norm": 0.3597409129142761, + "learning_rate": 3.384828410078193e-05, + "loss": 0.0082, + "step": 5950 + }, + { + "epoch": 1.2945264986967855, + "grad_norm": 0.014476552605628967, + "learning_rate": 3.382113379669852e-05, + "loss": 0.0055, + "step": 5960 + }, + { + "epoch": 1.2966985230234578, + "grad_norm": 3.3513331413269043, + "learning_rate": 3.379398349261512e-05, + "loss": 0.0815, + "step": 5970 + }, + { + "epoch": 1.2988705473501303, + "grad_norm": 0.008756415918469429, + "learning_rate": 3.376683318853171e-05, + "loss": 0.0009, + "step": 5980 + }, + { + "epoch": 1.3010425716768028, + "grad_norm": 0.01109595037996769, + "learning_rate": 3.373968288444831e-05, + "loss": 0.0193, + "step": 5990 + }, + { + "epoch": 1.3032145960034751, + "grad_norm": 0.05595744401216507, + "learning_rate": 3.3712532580364906e-05, + "loss": 0.0018, + "step": 6000 + }, + { + "epoch": 1.3053866203301476, + "grad_norm": 0.008088390342891216, + "learning_rate": 3.36853822762815e-05, + "loss": 0.0552, + "step": 6010 + }, + { + "epoch": 1.3075586446568201, + "grad_norm": 0.012108515948057175, + "learning_rate": 3.365823197219809e-05, + "loss": 0.0019, + "step": 6020 + }, + { + "epoch": 1.3097306689834927, + "grad_norm": 4.250258922576904, + "learning_rate": 3.3631081668114685e-05, + "loss": 0.0069, + "step": 6030 + }, + { + "epoch": 1.3119026933101652, + "grad_norm": 0.018538329750299454, + "learning_rate": 3.360393136403128e-05, + "loss": 0.0014, + "step": 6040 + }, + { + "epoch": 1.3140747176368375, + "grad_norm": 0.05477520078420639, + "learning_rate": 3.357678105994787e-05, + "loss": 0.0172, + "step": 6050 + }, + { + "epoch": 1.31624674196351, + "grad_norm": 0.4852977991104126, + "learning_rate": 3.354963075586447e-05, + "loss": 0.0555, + "step": 6060 + }, + { + "epoch": 1.3184187662901825, + "grad_norm": 0.013744533993303776, + "learning_rate": 3.3522480451781063e-05, + "loss": 0.0072, + "step": 6070 + }, + { + "epoch": 1.3205907906168548, + "grad_norm": 0.008594054728746414, + "learning_rate": 3.3495330147697656e-05, + "loss": 0.0135, + "step": 6080 + }, + { + "epoch": 1.3227628149435273, + "grad_norm": 0.008579927496612072, + "learning_rate": 3.346817984361425e-05, + "loss": 0.0484, + "step": 6090 + }, + { + "epoch": 1.3249348392701998, + "grad_norm": 0.02037675306200981, + "learning_rate": 3.344102953953084e-05, + "loss": 0.0017, + "step": 6100 + }, + { + "epoch": 1.3271068635968724, + "grad_norm": 0.040782492607831955, + "learning_rate": 3.3413879235447436e-05, + "loss": 0.014, + "step": 6110 + }, + { + "epoch": 1.3292788879235449, + "grad_norm": 0.04753238335251808, + "learning_rate": 3.338672893136403e-05, + "loss": 0.0246, + "step": 6120 + }, + { + "epoch": 1.3314509122502172, + "grad_norm": 0.049100227653980255, + "learning_rate": 3.335957862728063e-05, + "loss": 0.0279, + "step": 6130 + }, + { + "epoch": 1.3336229365768897, + "grad_norm": 0.008865290321409702, + "learning_rate": 3.333242832319722e-05, + "loss": 0.0015, + "step": 6140 + }, + { + "epoch": 1.3357949609035622, + "grad_norm": 0.4136160910129547, + "learning_rate": 3.3305278019113814e-05, + "loss": 0.0028, + "step": 6150 + }, + { + "epoch": 1.3379669852302345, + "grad_norm": 0.9577689170837402, + "learning_rate": 3.327812771503041e-05, + "loss": 0.0394, + "step": 6160 + }, + { + "epoch": 1.340139009556907, + "grad_norm": 0.04995536804199219, + "learning_rate": 3.325097741094701e-05, + "loss": 0.0231, + "step": 6170 + }, + { + "epoch": 1.3423110338835795, + "grad_norm": 0.03455106168985367, + "learning_rate": 3.32238271068636e-05, + "loss": 0.0118, + "step": 6180 + }, + { + "epoch": 1.344483058210252, + "grad_norm": 0.4952455163002014, + "learning_rate": 3.319667680278019e-05, + "loss": 0.0239, + "step": 6190 + }, + { + "epoch": 1.3466550825369243, + "grad_norm": 0.006735064554959536, + "learning_rate": 3.3169526498696786e-05, + "loss": 0.0028, + "step": 6200 + }, + { + "epoch": 1.3488271068635969, + "grad_norm": 0.0067639597691595554, + "learning_rate": 3.3142376194613386e-05, + "loss": 0.0008, + "step": 6210 + }, + { + "epoch": 1.3509991311902694, + "grad_norm": 0.0066012111492455006, + "learning_rate": 3.311522589052998e-05, + "loss": 0.0506, + "step": 6220 + }, + { + "epoch": 1.3531711555169417, + "grad_norm": 1.9521390199661255, + "learning_rate": 3.308807558644657e-05, + "loss": 0.0134, + "step": 6230 + }, + { + "epoch": 1.3553431798436142, + "grad_norm": 0.346284419298172, + "learning_rate": 3.3060925282363165e-05, + "loss": 0.0114, + "step": 6240 + }, + { + "epoch": 1.3575152041702867, + "grad_norm": 0.006885781418532133, + "learning_rate": 3.303377497827976e-05, + "loss": 0.0011, + "step": 6250 + }, + { + "epoch": 1.3596872284969592, + "grad_norm": 0.006113228388130665, + "learning_rate": 3.300662467419635e-05, + "loss": 0.0017, + "step": 6260 + }, + { + "epoch": 1.3618592528236317, + "grad_norm": 0.006051701493561268, + "learning_rate": 3.2979474370112944e-05, + "loss": 0.0032, + "step": 6270 + }, + { + "epoch": 1.364031277150304, + "grad_norm": 0.0061963628977537155, + "learning_rate": 3.295232406602954e-05, + "loss": 0.0423, + "step": 6280 + }, + { + "epoch": 1.3662033014769766, + "grad_norm": 0.006955439690500498, + "learning_rate": 3.292517376194614e-05, + "loss": 0.0011, + "step": 6290 + }, + { + "epoch": 1.368375325803649, + "grad_norm": 0.04997705668210983, + "learning_rate": 3.289802345786273e-05, + "loss": 0.0041, + "step": 6300 + }, + { + "epoch": 1.3705473501303214, + "grad_norm": 2.2008578777313232, + "learning_rate": 3.287087315377932e-05, + "loss": 0.017, + "step": 6310 + }, + { + "epoch": 1.3727193744569939, + "grad_norm": 0.046700820326805115, + "learning_rate": 3.2843722849695916e-05, + "loss": 0.0021, + "step": 6320 + }, + { + "epoch": 1.3748913987836664, + "grad_norm": 0.005715570878237486, + "learning_rate": 3.281657254561251e-05, + "loss": 0.0402, + "step": 6330 + }, + { + "epoch": 1.377063423110339, + "grad_norm": 0.7199205160140991, + "learning_rate": 3.27894222415291e-05, + "loss": 0.0022, + "step": 6340 + }, + { + "epoch": 1.3792354474370114, + "grad_norm": 0.007373593281954527, + "learning_rate": 3.27622719374457e-05, + "loss": 0.023, + "step": 6350 + }, + { + "epoch": 1.3814074717636837, + "grad_norm": 0.020932350307703018, + "learning_rate": 3.2735121633362295e-05, + "loss": 0.0097, + "step": 6360 + }, + { + "epoch": 1.3835794960903562, + "grad_norm": 0.010742595419287682, + "learning_rate": 3.2707971329278895e-05, + "loss": 0.0029, + "step": 6370 + }, + { + "epoch": 1.3857515204170285, + "grad_norm": 0.006945365574210882, + "learning_rate": 3.268082102519549e-05, + "loss": 0.0162, + "step": 6380 + }, + { + "epoch": 1.387923544743701, + "grad_norm": 0.048244886100292206, + "learning_rate": 3.265367072111208e-05, + "loss": 0.0015, + "step": 6390 + }, + { + "epoch": 1.3900955690703736, + "grad_norm": 0.005674061365425587, + "learning_rate": 3.2626520417028674e-05, + "loss": 0.0245, + "step": 6400 + }, + { + "epoch": 1.392267593397046, + "grad_norm": 0.06562032550573349, + "learning_rate": 3.259937011294527e-05, + "loss": 0.0474, + "step": 6410 + }, + { + "epoch": 1.3944396177237186, + "grad_norm": 0.07147029787302017, + "learning_rate": 3.257221980886186e-05, + "loss": 0.0056, + "step": 6420 + }, + { + "epoch": 1.396611642050391, + "grad_norm": 0.009265787899494171, + "learning_rate": 3.254506950477845e-05, + "loss": 0.0161, + "step": 6430 + }, + { + "epoch": 1.3987836663770634, + "grad_norm": 0.011842530220746994, + "learning_rate": 3.252063423110339e-05, + "loss": 0.0094, + "step": 6440 + }, + { + "epoch": 1.400955690703736, + "grad_norm": 6.259805679321289, + "learning_rate": 3.2493483927019986e-05, + "loss": 0.0427, + "step": 6450 + }, + { + "epoch": 1.4031277150304082, + "grad_norm": 0.017940033227205276, + "learning_rate": 3.246633362293658e-05, + "loss": 0.0049, + "step": 6460 + }, + { + "epoch": 1.4052997393570807, + "grad_norm": 0.05736970901489258, + "learning_rate": 3.243918331885317e-05, + "loss": 0.0303, + "step": 6470 + }, + { + "epoch": 1.4074717636837533, + "grad_norm": 0.006364389322698116, + "learning_rate": 3.2412033014769765e-05, + "loss": 0.0033, + "step": 6480 + }, + { + "epoch": 1.4096437880104258, + "grad_norm": 0.006534558720886707, + "learning_rate": 3.238488271068636e-05, + "loss": 0.0095, + "step": 6490 + }, + { + "epoch": 1.4118158123370983, + "grad_norm": 0.01259972807019949, + "learning_rate": 3.235773240660295e-05, + "loss": 0.036, + "step": 6500 + }, + { + "epoch": 1.4139878366637706, + "grad_norm": 0.08793149143457413, + "learning_rate": 3.233058210251955e-05, + "loss": 0.0258, + "step": 6510 + }, + { + "epoch": 1.416159860990443, + "grad_norm": 0.007453892845660448, + "learning_rate": 3.2303431798436144e-05, + "loss": 0.0432, + "step": 6520 + }, + { + "epoch": 1.4183318853171156, + "grad_norm": 0.008104286156594753, + "learning_rate": 3.227628149435274e-05, + "loss": 0.0372, + "step": 6530 + }, + { + "epoch": 1.420503909643788, + "grad_norm": 0.16239531338214874, + "learning_rate": 3.224913119026933e-05, + "loss": 0.0201, + "step": 6540 + }, + { + "epoch": 1.4226759339704604, + "grad_norm": 0.014620975591242313, + "learning_rate": 3.222198088618592e-05, + "loss": 0.0251, + "step": 6550 + }, + { + "epoch": 1.424847958297133, + "grad_norm": 0.006428881548345089, + "learning_rate": 3.219483058210252e-05, + "loss": 0.0058, + "step": 6560 + }, + { + "epoch": 1.4270199826238055, + "grad_norm": 0.006015344522893429, + "learning_rate": 3.2167680278019116e-05, + "loss": 0.0249, + "step": 6570 + }, + { + "epoch": 1.4291920069504778, + "grad_norm": 0.00819337647408247, + "learning_rate": 3.214052997393571e-05, + "loss": 0.0389, + "step": 6580 + }, + { + "epoch": 1.4313640312771503, + "grad_norm": 0.0471440814435482, + "learning_rate": 3.211337966985231e-05, + "loss": 0.0275, + "step": 6590 + }, + { + "epoch": 1.4335360556038228, + "grad_norm": 0.005635848734527826, + "learning_rate": 3.20862293657689e-05, + "loss": 0.0028, + "step": 6600 + }, + { + "epoch": 1.435708079930495, + "grad_norm": 0.005153193604201078, + "learning_rate": 3.2059079061685495e-05, + "loss": 0.0015, + "step": 6610 + }, + { + "epoch": 1.4378801042571676, + "grad_norm": 0.0051053185015916824, + "learning_rate": 3.203192875760209e-05, + "loss": 0.0015, + "step": 6620 + }, + { + "epoch": 1.4400521285838401, + "grad_norm": 0.005034744273871183, + "learning_rate": 3.200477845351868e-05, + "loss": 0.0006, + "step": 6630 + }, + { + "epoch": 1.4422241529105126, + "grad_norm": 0.6263902187347412, + "learning_rate": 3.1977628149435274e-05, + "loss": 0.0472, + "step": 6640 + }, + { + "epoch": 1.4443961772371852, + "grad_norm": 0.009957981295883656, + "learning_rate": 3.195047784535187e-05, + "loss": 0.0027, + "step": 6650 + }, + { + "epoch": 1.4465682015638575, + "grad_norm": 0.007074551656842232, + "learning_rate": 3.192332754126847e-05, + "loss": 0.0009, + "step": 6660 + }, + { + "epoch": 1.44874022589053, + "grad_norm": 0.01237443182617426, + "learning_rate": 3.189617723718506e-05, + "loss": 0.0315, + "step": 6670 + }, + { + "epoch": 1.4509122502172025, + "grad_norm": 0.020478179678320885, + "learning_rate": 3.1871741963509993e-05, + "loss": 0.0585, + "step": 6680 + }, + { + "epoch": 1.4530842745438748, + "grad_norm": 0.06119263172149658, + "learning_rate": 3.1844591659426586e-05, + "loss": 0.0347, + "step": 6690 + }, + { + "epoch": 1.4552562988705473, + "grad_norm": 0.20599764585494995, + "learning_rate": 3.181744135534318e-05, + "loss": 0.0266, + "step": 6700 + }, + { + "epoch": 1.4574283231972198, + "grad_norm": 0.024101588875055313, + "learning_rate": 3.179029105125977e-05, + "loss": 0.013, + "step": 6710 + }, + { + "epoch": 1.4596003475238923, + "grad_norm": 1.3978750705718994, + "learning_rate": 3.1763140747176366e-05, + "loss": 0.0416, + "step": 6720 + }, + { + "epoch": 1.4617723718505649, + "grad_norm": 0.008877950720489025, + "learning_rate": 3.1735990443092965e-05, + "loss": 0.0457, + "step": 6730 + }, + { + "epoch": 1.4639443961772372, + "grad_norm": 0.026043567806482315, + "learning_rate": 3.170884013900956e-05, + "loss": 0.0263, + "step": 6740 + }, + { + "epoch": 1.4661164205039097, + "grad_norm": 0.0366000272333622, + "learning_rate": 3.168168983492615e-05, + "loss": 0.0043, + "step": 6750 + }, + { + "epoch": 1.4682884448305822, + "grad_norm": 0.02416500821709633, + "learning_rate": 3.1654539530842744e-05, + "loss": 0.0028, + "step": 6760 + }, + { + "epoch": 1.4704604691572545, + "grad_norm": 0.12652349472045898, + "learning_rate": 3.1627389226759344e-05, + "loss": 0.0322, + "step": 6770 + }, + { + "epoch": 1.472632493483927, + "grad_norm": 0.017712270841002464, + "learning_rate": 3.160023892267594e-05, + "loss": 0.0192, + "step": 6780 + }, + { + "epoch": 1.4748045178105995, + "grad_norm": 2.6074209213256836, + "learning_rate": 3.157308861859253e-05, + "loss": 0.0178, + "step": 6790 + }, + { + "epoch": 1.476976542137272, + "grad_norm": 0.03420431539416313, + "learning_rate": 3.154593831450912e-05, + "loss": 0.0362, + "step": 6800 + }, + { + "epoch": 1.4791485664639443, + "grad_norm": 0.028371965512633324, + "learning_rate": 3.151878801042572e-05, + "loss": 0.0052, + "step": 6810 + }, + { + "epoch": 1.4813205907906168, + "grad_norm": 0.019446449354290962, + "learning_rate": 3.1491637706342316e-05, + "loss": 0.001, + "step": 6820 + }, + { + "epoch": 1.4834926151172894, + "grad_norm": 0.022431597113609314, + "learning_rate": 3.146448740225891e-05, + "loss": 0.015, + "step": 6830 + }, + { + "epoch": 1.4856646394439617, + "grad_norm": 0.0063674296252429485, + "learning_rate": 3.14373370981755e-05, + "loss": 0.0399, + "step": 6840 + }, + { + "epoch": 1.4878366637706342, + "grad_norm": 0.02433244325220585, + "learning_rate": 3.1410186794092095e-05, + "loss": 0.0308, + "step": 6850 + }, + { + "epoch": 1.4900086880973067, + "grad_norm": 0.12433426082134247, + "learning_rate": 3.138303649000869e-05, + "loss": 0.0901, + "step": 6860 + }, + { + "epoch": 1.4921807124239792, + "grad_norm": 0.027635198086500168, + "learning_rate": 3.135588618592528e-05, + "loss": 0.0095, + "step": 6870 + }, + { + "epoch": 1.4943527367506517, + "grad_norm": 0.01609298586845398, + "learning_rate": 3.132873588184188e-05, + "loss": 0.021, + "step": 6880 + }, + { + "epoch": 1.496524761077324, + "grad_norm": 0.005982758477330208, + "learning_rate": 3.1301585577758474e-05, + "loss": 0.0007, + "step": 6890 + }, + { + "epoch": 1.4986967854039965, + "grad_norm": 0.12338759750127792, + "learning_rate": 3.127443527367507e-05, + "loss": 0.0042, + "step": 6900 + }, + { + "epoch": 1.5008688097306688, + "grad_norm": 0.009839468635618687, + "learning_rate": 3.124728496959166e-05, + "loss": 0.0007, + "step": 6910 + }, + { + "epoch": 1.5030408340573413, + "grad_norm": 0.00818830356001854, + "learning_rate": 3.122013466550825e-05, + "loss": 0.0014, + "step": 6920 + }, + { + "epoch": 1.5052128583840139, + "grad_norm": 0.0049653262831270695, + "learning_rate": 3.1192984361424846e-05, + "loss": 0.0248, + "step": 6930 + }, + { + "epoch": 1.5073848827106864, + "grad_norm": 0.017730310559272766, + "learning_rate": 3.116583405734144e-05, + "loss": 0.0364, + "step": 6940 + }, + { + "epoch": 1.509556907037359, + "grad_norm": 0.07770511507987976, + "learning_rate": 3.113868375325804e-05, + "loss": 0.0154, + "step": 6950 + }, + { + "epoch": 1.5117289313640314, + "grad_norm": 0.00605663051828742, + "learning_rate": 3.111153344917463e-05, + "loss": 0.0015, + "step": 6960 + }, + { + "epoch": 1.5139009556907037, + "grad_norm": 0.01187584176659584, + "learning_rate": 3.1084383145091225e-05, + "loss": 0.0025, + "step": 6970 + }, + { + "epoch": 1.5160729800173762, + "grad_norm": 0.004929904360324144, + "learning_rate": 3.1057232841007825e-05, + "loss": 0.0097, + "step": 6980 + }, + { + "epoch": 1.5182450043440485, + "grad_norm": 0.0053964219987392426, + "learning_rate": 3.103008253692442e-05, + "loss": 0.0145, + "step": 6990 + }, + { + "epoch": 1.520417028670721, + "grad_norm": 0.004978001583367586, + "learning_rate": 3.100293223284101e-05, + "loss": 0.046, + "step": 7000 + }, + { + "epoch": 1.5225890529973936, + "grad_norm": 0.09007082879543304, + "learning_rate": 3.0975781928757604e-05, + "loss": 0.0418, + "step": 7010 + }, + { + "epoch": 1.524761077324066, + "grad_norm": 0.11491013318300247, + "learning_rate": 3.09486316246742e-05, + "loss": 0.03, + "step": 7020 + }, + { + "epoch": 1.5269331016507386, + "grad_norm": 0.005492928437888622, + "learning_rate": 3.092148132059079e-05, + "loss": 0.0038, + "step": 7030 + }, + { + "epoch": 1.529105125977411, + "grad_norm": 0.005056523717939854, + "learning_rate": 3.089433101650739e-05, + "loss": 0.0238, + "step": 7040 + }, + { + "epoch": 1.5312771503040834, + "grad_norm": 0.006425573956221342, + "learning_rate": 3.086718071242398e-05, + "loss": 0.0015, + "step": 7050 + }, + { + "epoch": 1.533449174630756, + "grad_norm": 0.005666010081768036, + "learning_rate": 3.0840030408340576e-05, + "loss": 0.0023, + "step": 7060 + }, + { + "epoch": 1.5356211989574282, + "grad_norm": 0.004847542382776737, + "learning_rate": 3.081288010425717e-05, + "loss": 0.0033, + "step": 7070 + }, + { + "epoch": 1.5377932232841007, + "grad_norm": 0.00474773533642292, + "learning_rate": 3.078572980017376e-05, + "loss": 0.0306, + "step": 7080 + }, + { + "epoch": 1.5399652476107732, + "grad_norm": 0.11169460415840149, + "learning_rate": 3.0758579496090355e-05, + "loss": 0.0159, + "step": 7090 + }, + { + "epoch": 1.5421372719374458, + "grad_norm": 4.95255708694458, + "learning_rate": 3.073142919200695e-05, + "loss": 0.0364, + "step": 7100 + }, + { + "epoch": 1.5443092962641183, + "grad_norm": 0.26123908162117004, + "learning_rate": 3.070427888792355e-05, + "loss": 0.0052, + "step": 7110 + }, + { + "epoch": 1.5464813205907906, + "grad_norm": 0.004803699441254139, + "learning_rate": 3.067712858384014e-05, + "loss": 0.0012, + "step": 7120 + }, + { + "epoch": 1.548653344917463, + "grad_norm": 0.004582292400300503, + "learning_rate": 3.0649978279756733e-05, + "loss": 0.0005, + "step": 7130 + }, + { + "epoch": 1.5508253692441354, + "grad_norm": 0.004967282060533762, + "learning_rate": 3.0622827975673326e-05, + "loss": 0.0453, + "step": 7140 + }, + { + "epoch": 1.552997393570808, + "grad_norm": 3.779393196105957, + "learning_rate": 3.0595677671589926e-05, + "loss": 0.0148, + "step": 7150 + }, + { + "epoch": 1.5551694178974804, + "grad_norm": 0.08057697117328644, + "learning_rate": 3.056852736750652e-05, + "loss": 0.0015, + "step": 7160 + }, + { + "epoch": 1.557341442224153, + "grad_norm": 0.020298315212130547, + "learning_rate": 3.054137706342311e-05, + "loss": 0.0022, + "step": 7170 + }, + { + "epoch": 1.5595134665508255, + "grad_norm": 0.005744563415646553, + "learning_rate": 3.0514226759339702e-05, + "loss": 0.0013, + "step": 7180 + }, + { + "epoch": 1.561685490877498, + "grad_norm": 0.005050037521868944, + "learning_rate": 3.0487076455256302e-05, + "loss": 0.0365, + "step": 7190 + }, + { + "epoch": 1.5638575152041703, + "grad_norm": 0.02648780681192875, + "learning_rate": 3.0459926151172895e-05, + "loss": 0.0017, + "step": 7200 + }, + { + "epoch": 1.5660295395308428, + "grad_norm": 1.0146911144256592, + "learning_rate": 3.043277584708949e-05, + "loss": 0.0024, + "step": 7210 + }, + { + "epoch": 1.568201563857515, + "grad_norm": 0.005274607799947262, + "learning_rate": 3.0405625543006084e-05, + "loss": 0.0475, + "step": 7220 + }, + { + "epoch": 1.5703735881841876, + "grad_norm": 0.013579404912889004, + "learning_rate": 3.0378475238922677e-05, + "loss": 0.0108, + "step": 7230 + }, + { + "epoch": 1.5725456125108601, + "grad_norm": 0.015852799639105797, + "learning_rate": 3.035132493483927e-05, + "loss": 0.0227, + "step": 7240 + }, + { + "epoch": 1.5747176368375326, + "grad_norm": 0.014161293394863605, + "learning_rate": 3.0324174630755863e-05, + "loss": 0.0028, + "step": 7250 + }, + { + "epoch": 1.5768896611642051, + "grad_norm": 0.005616712383925915, + "learning_rate": 3.0297024326672463e-05, + "loss": 0.0012, + "step": 7260 + }, + { + "epoch": 1.5790616854908774, + "grad_norm": 0.008326984010636806, + "learning_rate": 3.0269874022589056e-05, + "loss": 0.001, + "step": 7270 + }, + { + "epoch": 1.58123370981755, + "grad_norm": 0.016356853768229485, + "learning_rate": 3.024272371850565e-05, + "loss": 0.0765, + "step": 7280 + }, + { + "epoch": 1.5834057341442223, + "grad_norm": 0.020525842905044556, + "learning_rate": 3.0215573414422242e-05, + "loss": 0.0195, + "step": 7290 + }, + { + "epoch": 1.5855777584708948, + "grad_norm": 0.012340064160525799, + "learning_rate": 3.018842311033884e-05, + "loss": 0.003, + "step": 7300 + }, + { + "epoch": 1.5877497827975673, + "grad_norm": 0.008134052157402039, + "learning_rate": 3.016127280625543e-05, + "loss": 0.0016, + "step": 7310 + }, + { + "epoch": 1.5899218071242398, + "grad_norm": 0.8095653057098389, + "learning_rate": 3.0134122502172024e-05, + "loss": 0.0154, + "step": 7320 + }, + { + "epoch": 1.5920938314509123, + "grad_norm": 0.007166721858084202, + "learning_rate": 3.0106972198088617e-05, + "loss": 0.0307, + "step": 7330 + }, + { + "epoch": 1.5942658557775848, + "grad_norm": 0.036194682121276855, + "learning_rate": 3.0079821894005217e-05, + "loss": 0.0414, + "step": 7340 + }, + { + "epoch": 1.5964378801042571, + "grad_norm": 1.6515989303588867, + "learning_rate": 3.005267158992181e-05, + "loss": 0.0297, + "step": 7350 + }, + { + "epoch": 1.5986099044309297, + "grad_norm": 0.015606805682182312, + "learning_rate": 3.0025521285838403e-05, + "loss": 0.0033, + "step": 7360 + }, + { + "epoch": 1.600781928757602, + "grad_norm": 0.010313029401004314, + "learning_rate": 2.9998370981754996e-05, + "loss": 0.0069, + "step": 7370 + }, + { + "epoch": 1.6029539530842745, + "grad_norm": 0.2324070781469345, + "learning_rate": 2.997122067767159e-05, + "loss": 0.0293, + "step": 7380 + }, + { + "epoch": 1.605125977410947, + "grad_norm": 2.7912869453430176, + "learning_rate": 2.9944070373588186e-05, + "loss": 0.0525, + "step": 7390 + }, + { + "epoch": 1.6072980017376195, + "grad_norm": 0.10823327302932739, + "learning_rate": 2.991692006950478e-05, + "loss": 0.0319, + "step": 7400 + }, + { + "epoch": 1.609470026064292, + "grad_norm": 0.3346640169620514, + "learning_rate": 2.9889769765421372e-05, + "loss": 0.0528, + "step": 7410 + }, + { + "epoch": 1.6116420503909645, + "grad_norm": 0.4158894121646881, + "learning_rate": 2.986261946133797e-05, + "loss": 0.031, + "step": 7420 + }, + { + "epoch": 1.6138140747176368, + "grad_norm": 0.005024017300456762, + "learning_rate": 2.9835469157254565e-05, + "loss": 0.0017, + "step": 7430 + }, + { + "epoch": 1.6159860990443093, + "grad_norm": 0.00963117741048336, + "learning_rate": 2.9808318853171158e-05, + "loss": 0.0482, + "step": 7440 + }, + { + "epoch": 1.6181581233709816, + "grad_norm": 0.011439714580774307, + "learning_rate": 2.978116854908775e-05, + "loss": 0.0035, + "step": 7450 + }, + { + "epoch": 1.6203301476976542, + "grad_norm": 0.004979605786502361, + "learning_rate": 2.9754018245004344e-05, + "loss": 0.0055, + "step": 7460 + }, + { + "epoch": 1.6225021720243267, + "grad_norm": 0.005157648120075464, + "learning_rate": 2.9726867940920937e-05, + "loss": 0.0006, + "step": 7470 + }, + { + "epoch": 1.6246741963509992, + "grad_norm": 0.004680620972067118, + "learning_rate": 2.9699717636837533e-05, + "loss": 0.0241, + "step": 7480 + }, + { + "epoch": 1.6268462206776717, + "grad_norm": 0.004479921422898769, + "learning_rate": 2.967256733275413e-05, + "loss": 0.0006, + "step": 7490 + }, + { + "epoch": 1.629018245004344, + "grad_norm": 0.00438233558088541, + "learning_rate": 2.9645417028670726e-05, + "loss": 0.0464, + "step": 7500 + }, + { + "epoch": 1.6311902693310165, + "grad_norm": 0.29184991121292114, + "learning_rate": 2.961826672458732e-05, + "loss": 0.0013, + "step": 7510 + }, + { + "epoch": 1.6333622936576888, + "grad_norm": 3.4082252979278564, + "learning_rate": 2.9591116420503912e-05, + "loss": 0.0357, + "step": 7520 + }, + { + "epoch": 1.6355343179843613, + "grad_norm": 0.11537332832813263, + "learning_rate": 2.9563966116420505e-05, + "loss": 0.003, + "step": 7530 + }, + { + "epoch": 1.6377063423110338, + "grad_norm": 0.08221070468425751, + "learning_rate": 2.9536815812337098e-05, + "loss": 0.0031, + "step": 7540 + }, + { + "epoch": 1.6398783666377064, + "grad_norm": 0.00624883221462369, + "learning_rate": 2.950966550825369e-05, + "loss": 0.0018, + "step": 7550 + }, + { + "epoch": 1.6420503909643789, + "grad_norm": 0.004528459627181292, + "learning_rate": 2.9482515204170284e-05, + "loss": 0.0006, + "step": 7560 + }, + { + "epoch": 1.6442224152910514, + "grad_norm": 0.007416573353111744, + "learning_rate": 2.9455364900086884e-05, + "loss": 0.0349, + "step": 7570 + }, + { + "epoch": 1.6463944396177237, + "grad_norm": 0.020044928416609764, + "learning_rate": 2.9428214596003477e-05, + "loss": 0.004, + "step": 7580 + }, + { + "epoch": 1.6485664639443962, + "grad_norm": 0.01522949431091547, + "learning_rate": 2.9401064291920073e-05, + "loss": 0.0012, + "step": 7590 + }, + { + "epoch": 1.6507384882710685, + "grad_norm": 0.012193024158477783, + "learning_rate": 2.9373913987836666e-05, + "loss": 0.0382, + "step": 7600 + }, + { + "epoch": 1.652910512597741, + "grad_norm": 0.004249626770615578, + "learning_rate": 2.934676368375326e-05, + "loss": 0.0019, + "step": 7610 + }, + { + "epoch": 1.6550825369244135, + "grad_norm": 0.0253335889428854, + "learning_rate": 2.9319613379669852e-05, + "loss": 0.0213, + "step": 7620 + }, + { + "epoch": 1.657254561251086, + "grad_norm": 0.0044834488071501255, + "learning_rate": 2.9292463075586445e-05, + "loss": 0.0022, + "step": 7630 + }, + { + "epoch": 1.6594265855777586, + "grad_norm": 0.1263313889503479, + "learning_rate": 2.9265312771503038e-05, + "loss": 0.0162, + "step": 7640 + }, + { + "epoch": 1.661598609904431, + "grad_norm": 0.46258336305618286, + "learning_rate": 2.9238162467419638e-05, + "loss": 0.0038, + "step": 7650 + }, + { + "epoch": 1.6637706342311034, + "grad_norm": 2.930629014968872, + "learning_rate": 2.921101216333623e-05, + "loss": 0.0198, + "step": 7660 + }, + { + "epoch": 1.665942658557776, + "grad_norm": 0.0038381244521588087, + "learning_rate": 2.9183861859252824e-05, + "loss": 0.0004, + "step": 7670 + }, + { + "epoch": 1.6681146828844482, + "grad_norm": 0.013386134058237076, + "learning_rate": 2.915671155516942e-05, + "loss": 0.0656, + "step": 7680 + }, + { + "epoch": 1.6702867072111207, + "grad_norm": 0.00613776408135891, + "learning_rate": 2.9129561251086014e-05, + "loss": 0.0009, + "step": 7690 + }, + { + "epoch": 1.6724587315377932, + "grad_norm": 0.005761744920164347, + "learning_rate": 2.9102410947002607e-05, + "loss": 0.0017, + "step": 7700 + }, + { + "epoch": 1.6746307558644657, + "grad_norm": 0.01169886626303196, + "learning_rate": 2.90752606429192e-05, + "loss": 0.0161, + "step": 7710 + }, + { + "epoch": 1.6768027801911383, + "grad_norm": 0.005424773786216974, + "learning_rate": 2.90481103388358e-05, + "loss": 0.0016, + "step": 7720 + }, + { + "epoch": 1.6789748045178106, + "grad_norm": 0.004224838223308325, + "learning_rate": 2.9020960034752392e-05, + "loss": 0.0016, + "step": 7730 + }, + { + "epoch": 1.681146828844483, + "grad_norm": 0.004218948073685169, + "learning_rate": 2.8993809730668985e-05, + "loss": 0.0005, + "step": 7740 + }, + { + "epoch": 1.6833188531711554, + "grad_norm": 0.030858902260661125, + "learning_rate": 2.896665942658558e-05, + "loss": 0.0009, + "step": 7750 + }, + { + "epoch": 1.6854908774978279, + "grad_norm": 0.0144006023183465, + "learning_rate": 2.893950912250217e-05, + "loss": 0.0419, + "step": 7760 + }, + { + "epoch": 1.6876629018245004, + "grad_norm": 0.00441219424828887, + "learning_rate": 2.8912358818418768e-05, + "loss": 0.0011, + "step": 7770 + }, + { + "epoch": 1.689834926151173, + "grad_norm": 0.004708564840257168, + "learning_rate": 2.888520851433536e-05, + "loss": 0.0049, + "step": 7780 + }, + { + "epoch": 1.6920069504778454, + "grad_norm": 0.004145478829741478, + "learning_rate": 2.8858058210251954e-05, + "loss": 0.0154, + "step": 7790 + }, + { + "epoch": 1.694178974804518, + "grad_norm": 0.012138472869992256, + "learning_rate": 2.8830907906168554e-05, + "loss": 0.0174, + "step": 7800 + }, + { + "epoch": 1.6963509991311903, + "grad_norm": 0.03401601314544678, + "learning_rate": 2.8803757602085147e-05, + "loss": 0.0013, + "step": 7810 + }, + { + "epoch": 1.6985230234578628, + "grad_norm": 0.7554841041564941, + "learning_rate": 2.877660729800174e-05, + "loss": 0.0083, + "step": 7820 + }, + { + "epoch": 1.700695047784535, + "grad_norm": 0.0041222646832466125, + "learning_rate": 2.8749456993918333e-05, + "loss": 0.0012, + "step": 7830 + }, + { + "epoch": 1.7028670721112076, + "grad_norm": 0.003780083265155554, + "learning_rate": 2.8722306689834926e-05, + "loss": 0.0007, + "step": 7840 + }, + { + "epoch": 1.70503909643788, + "grad_norm": 4.778973579406738, + "learning_rate": 2.869515638575152e-05, + "loss": 0.0362, + "step": 7850 + }, + { + "epoch": 1.7072111207645526, + "grad_norm": 0.005503606982529163, + "learning_rate": 2.8668006081668115e-05, + "loss": 0.0258, + "step": 7860 + }, + { + "epoch": 1.7093831450912251, + "grad_norm": 0.003981790505349636, + "learning_rate": 2.864085577758471e-05, + "loss": 0.0149, + "step": 7870 + }, + { + "epoch": 1.7115551694178974, + "grad_norm": 0.003801483428105712, + "learning_rate": 2.8613705473501308e-05, + "loss": 0.0013, + "step": 7880 + }, + { + "epoch": 1.71372719374457, + "grad_norm": 0.0036825397983193398, + "learning_rate": 2.85865551694179e-05, + "loss": 0.0068, + "step": 7890 + }, + { + "epoch": 1.7158992180712422, + "grad_norm": 0.004739957861602306, + "learning_rate": 2.8559404865334494e-05, + "loss": 0.0004, + "step": 7900 + }, + { + "epoch": 1.7180712423979148, + "grad_norm": 8.044977188110352, + "learning_rate": 2.8532254561251087e-05, + "loss": 0.0233, + "step": 7910 + }, + { + "epoch": 1.7202432667245873, + "grad_norm": 3.087222099304199, + "learning_rate": 2.850510425716768e-05, + "loss": 0.0223, + "step": 7920 + }, + { + "epoch": 1.7224152910512598, + "grad_norm": 0.003672607010230422, + "learning_rate": 2.8477953953084273e-05, + "loss": 0.0361, + "step": 7930 + }, + { + "epoch": 1.7245873153779323, + "grad_norm": 0.004437604453414679, + "learning_rate": 2.845080364900087e-05, + "loss": 0.0008, + "step": 7940 + }, + { + "epoch": 1.7267593397046048, + "grad_norm": 0.006330874748528004, + "learning_rate": 2.8423653344917466e-05, + "loss": 0.0191, + "step": 7950 + }, + { + "epoch": 1.7289313640312771, + "grad_norm": 0.3795784115791321, + "learning_rate": 2.839650304083406e-05, + "loss": 0.0023, + "step": 7960 + }, + { + "epoch": 1.7311033883579496, + "grad_norm": 0.00351201300509274, + "learning_rate": 2.8369352736750655e-05, + "loss": 0.0163, + "step": 7970 + }, + { + "epoch": 1.733275412684622, + "grad_norm": 1.7485036849975586, + "learning_rate": 2.834220243266725e-05, + "loss": 0.0028, + "step": 7980 + }, + { + "epoch": 1.7354474370112944, + "grad_norm": 0.0035417363978922367, + "learning_rate": 2.831505212858384e-05, + "loss": 0.0199, + "step": 7990 + }, + { + "epoch": 1.737619461337967, + "grad_norm": 0.0039270068518817425, + "learning_rate": 2.8287901824500434e-05, + "loss": 0.0013, + "step": 8000 + }, + { + "epoch": 1.7397914856646395, + "grad_norm": 0.004426254890859127, + "learning_rate": 2.8260751520417027e-05, + "loss": 0.0074, + "step": 8010 + }, + { + "epoch": 1.741963509991312, + "grad_norm": 0.0035433934535831213, + "learning_rate": 2.823360121633362e-05, + "loss": 0.0154, + "step": 8020 + }, + { + "epoch": 1.7441355343179845, + "grad_norm": 0.003466078545898199, + "learning_rate": 2.820645091225022e-05, + "loss": 0.011, + "step": 8030 + }, + { + "epoch": 1.7463075586446568, + "grad_norm": 0.1723119467496872, + "learning_rate": 2.8179300608166813e-05, + "loss": 0.0241, + "step": 8040 + }, + { + "epoch": 1.7484795829713293, + "grad_norm": 0.005867532454431057, + "learning_rate": 2.8152150304083406e-05, + "loss": 0.0022, + "step": 8050 + }, + { + "epoch": 1.7506516072980016, + "grad_norm": 0.005134823732078075, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.0006, + "step": 8060 + }, + { + "epoch": 1.7528236316246741, + "grad_norm": 0.003376134904101491, + "learning_rate": 2.8097849695916596e-05, + "loss": 0.0021, + "step": 8070 + }, + { + "epoch": 1.7549956559513467, + "grad_norm": 0.007784237619489431, + "learning_rate": 2.807069939183319e-05, + "loss": 0.0128, + "step": 8080 + }, + { + "epoch": 1.7571676802780192, + "grad_norm": 0.5563734173774719, + "learning_rate": 2.804354908774978e-05, + "loss": 0.0192, + "step": 8090 + }, + { + "epoch": 1.7593397046046917, + "grad_norm": 0.0036883733700960875, + "learning_rate": 2.801639878366638e-05, + "loss": 0.0062, + "step": 8100 + }, + { + "epoch": 1.761511728931364, + "grad_norm": 1.4399293661117554, + "learning_rate": 2.7989248479582974e-05, + "loss": 0.0084, + "step": 8110 + }, + { + "epoch": 1.7636837532580365, + "grad_norm": 0.003304542973637581, + "learning_rate": 2.7962098175499567e-05, + "loss": 0.0004, + "step": 8120 + }, + { + "epoch": 1.7658557775847088, + "grad_norm": 0.003243118291720748, + "learning_rate": 2.793494787141616e-05, + "loss": 0.0088, + "step": 8130 + }, + { + "epoch": 1.7680278019113813, + "grad_norm": 0.12104818224906921, + "learning_rate": 2.7907797567332754e-05, + "loss": 0.0015, + "step": 8140 + }, + { + "epoch": 1.7701998262380538, + "grad_norm": 0.0031961267814040184, + "learning_rate": 2.788064726324935e-05, + "loss": 0.0205, + "step": 8150 + }, + { + "epoch": 1.7723718505647263, + "grad_norm": 0.03292842581868172, + "learning_rate": 2.7853496959165943e-05, + "loss": 0.0036, + "step": 8160 + }, + { + "epoch": 1.7745438748913989, + "grad_norm": 0.003530114656314254, + "learning_rate": 2.7826346655082536e-05, + "loss": 0.0016, + "step": 8170 + }, + { + "epoch": 1.7767158992180714, + "grad_norm": 0.012900624424219131, + "learning_rate": 2.7799196350999136e-05, + "loss": 0.0022, + "step": 8180 + }, + { + "epoch": 1.7788879235447437, + "grad_norm": 0.003065924858674407, + "learning_rate": 2.777204604691573e-05, + "loss": 0.0013, + "step": 8190 + }, + { + "epoch": 1.7810599478714162, + "grad_norm": 0.0030597923323512077, + "learning_rate": 2.7744895742832322e-05, + "loss": 0.0009, + "step": 8200 + }, + { + "epoch": 1.7832319721980885, + "grad_norm": 0.0034443363547325134, + "learning_rate": 2.7717745438748915e-05, + "loss": 0.0933, + "step": 8210 + }, + { + "epoch": 1.785403996524761, + "grad_norm": 0.00810242909938097, + "learning_rate": 2.7690595134665508e-05, + "loss": 0.0394, + "step": 8220 + }, + { + "epoch": 1.7875760208514335, + "grad_norm": 2.509730339050293, + "learning_rate": 2.7663444830582104e-05, + "loss": 0.0056, + "step": 8230 + }, + { + "epoch": 1.789748045178106, + "grad_norm": 0.024317113682627678, + "learning_rate": 2.7636294526498697e-05, + "loss": 0.0033, + "step": 8240 + }, + { + "epoch": 1.7919200695047786, + "grad_norm": 0.18328578770160675, + "learning_rate": 2.760914422241529e-05, + "loss": 0.0531, + "step": 8250 + }, + { + "epoch": 1.794092093831451, + "grad_norm": 0.008703135885298252, + "learning_rate": 2.758199391833189e-05, + "loss": 0.0014, + "step": 8260 + }, + { + "epoch": 1.7962641181581234, + "grad_norm": 0.005929219536483288, + "learning_rate": 2.7554843614248483e-05, + "loss": 0.0204, + "step": 8270 + }, + { + "epoch": 1.7984361424847957, + "grad_norm": 0.11236939579248428, + "learning_rate": 2.7527693310165076e-05, + "loss": 0.0121, + "step": 8280 + }, + { + "epoch": 1.8006081668114682, + "grad_norm": 0.00475434260442853, + "learning_rate": 2.750054300608167e-05, + "loss": 0.01, + "step": 8290 + }, + { + "epoch": 1.8027801911381407, + "grad_norm": 0.004544104915112257, + "learning_rate": 2.7473392701998262e-05, + "loss": 0.0007, + "step": 8300 + }, + { + "epoch": 1.8049522154648132, + "grad_norm": 0.0046235802583396435, + "learning_rate": 2.7446242397914855e-05, + "loss": 0.0018, + "step": 8310 + }, + { + "epoch": 1.8071242397914857, + "grad_norm": 4.546655654907227, + "learning_rate": 2.741909209383145e-05, + "loss": 0.0212, + "step": 8320 + }, + { + "epoch": 1.8092962641181582, + "grad_norm": 0.25990429520606995, + "learning_rate": 2.7391941789748048e-05, + "loss": 0.0025, + "step": 8330 + }, + { + "epoch": 1.8114682884448305, + "grad_norm": 0.003434494836255908, + "learning_rate": 2.736479148566464e-05, + "loss": 0.0004, + "step": 8340 + }, + { + "epoch": 1.813640312771503, + "grad_norm": 0.0053975642658770084, + "learning_rate": 2.7337641181581237e-05, + "loss": 0.0004, + "step": 8350 + }, + { + "epoch": 1.8158123370981754, + "grad_norm": 0.003612579545006156, + "learning_rate": 2.731049087749783e-05, + "loss": 0.0004, + "step": 8360 + }, + { + "epoch": 1.8179843614248479, + "grad_norm": 0.004297258798032999, + "learning_rate": 2.7283340573414423e-05, + "loss": 0.0133, + "step": 8370 + }, + { + "epoch": 1.8201563857515204, + "grad_norm": 0.00486018368974328, + "learning_rate": 2.7256190269331016e-05, + "loss": 0.0007, + "step": 8380 + }, + { + "epoch": 1.822328410078193, + "grad_norm": 0.00850472692400217, + "learning_rate": 2.722903996524761e-05, + "loss": 0.0006, + "step": 8390 + }, + { + "epoch": 1.8245004344048654, + "grad_norm": 4.226654529571533, + "learning_rate": 2.7201889661164202e-05, + "loss": 0.0146, + "step": 8400 + }, + { + "epoch": 1.826672458731538, + "grad_norm": 0.0532936193048954, + "learning_rate": 2.7174739357080802e-05, + "loss": 0.0008, + "step": 8410 + }, + { + "epoch": 1.8288444830582102, + "grad_norm": 0.0031467361841350794, + "learning_rate": 2.7147589052997395e-05, + "loss": 0.002, + "step": 8420 + }, + { + "epoch": 1.8310165073848828, + "grad_norm": 0.0035019817296415567, + "learning_rate": 2.7120438748913988e-05, + "loss": 0.0013, + "step": 8430 + }, + { + "epoch": 1.833188531711555, + "grad_norm": 0.15777327120304108, + "learning_rate": 2.7093288444830585e-05, + "loss": 0.0053, + "step": 8440 + }, + { + "epoch": 1.8353605560382276, + "grad_norm": 0.0041242060251533985, + "learning_rate": 2.7066138140747178e-05, + "loss": 0.0009, + "step": 8450 + }, + { + "epoch": 1.8375325803649, + "grad_norm": 0.004727715160697699, + "learning_rate": 2.703898783666377e-05, + "loss": 0.0372, + "step": 8460 + }, + { + "epoch": 1.8397046046915726, + "grad_norm": 0.2560582160949707, + "learning_rate": 2.7011837532580364e-05, + "loss": 0.0013, + "step": 8470 + }, + { + "epoch": 1.8418766290182451, + "grad_norm": 0.002973441733047366, + "learning_rate": 2.6984687228496964e-05, + "loss": 0.0075, + "step": 8480 + }, + { + "epoch": 1.8440486533449174, + "grad_norm": 0.03613729402422905, + "learning_rate": 2.6957536924413557e-05, + "loss": 0.0368, + "step": 8490 + }, + { + "epoch": 1.84622067767159, + "grad_norm": 0.006184692494571209, + "learning_rate": 2.693038662033015e-05, + "loss": 0.0405, + "step": 8500 + }, + { + "epoch": 1.8483927019982622, + "grad_norm": 0.0032991603948175907, + "learning_rate": 2.6903236316246743e-05, + "loss": 0.0159, + "step": 8510 + }, + { + "epoch": 1.8505647263249347, + "grad_norm": 0.0034411856904625893, + "learning_rate": 2.687608601216334e-05, + "loss": 0.0012, + "step": 8520 + }, + { + "epoch": 1.8527367506516073, + "grad_norm": 0.016189776360988617, + "learning_rate": 2.6848935708079932e-05, + "loss": 0.1094, + "step": 8530 + }, + { + "epoch": 1.8549087749782798, + "grad_norm": 0.45711269974708557, + "learning_rate": 2.6821785403996525e-05, + "loss": 0.0633, + "step": 8540 + }, + { + "epoch": 1.8570807993049523, + "grad_norm": 2.4776110649108887, + "learning_rate": 2.6794635099913118e-05, + "loss": 0.0258, + "step": 8550 + }, + { + "epoch": 1.8592528236316248, + "grad_norm": 0.008575936779379845, + "learning_rate": 2.6767484795829718e-05, + "loss": 0.0021, + "step": 8560 + }, + { + "epoch": 1.861424847958297, + "grad_norm": 1.326846957206726, + "learning_rate": 2.674033449174631e-05, + "loss": 0.0428, + "step": 8570 + }, + { + "epoch": 1.8635968722849696, + "grad_norm": 0.011363858357071877, + "learning_rate": 2.6713184187662904e-05, + "loss": 0.0029, + "step": 8580 + }, + { + "epoch": 1.865768896611642, + "grad_norm": 0.015298294834792614, + "learning_rate": 2.6686033883579497e-05, + "loss": 0.0035, + "step": 8590 + }, + { + "epoch": 1.8679409209383144, + "grad_norm": 0.014568958431482315, + "learning_rate": 2.665888357949609e-05, + "loss": 0.001, + "step": 8600 + }, + { + "epoch": 1.870112945264987, + "grad_norm": 1.8582741022109985, + "learning_rate": 2.6631733275412686e-05, + "loss": 0.048, + "step": 8610 + }, + { + "epoch": 1.8722849695916595, + "grad_norm": 0.08303183317184448, + "learning_rate": 2.660458297132928e-05, + "loss": 0.0029, + "step": 8620 + }, + { + "epoch": 1.874456993918332, + "grad_norm": 1.4877798557281494, + "learning_rate": 2.6577432667245872e-05, + "loss": 0.0439, + "step": 8630 + }, + { + "epoch": 1.8766290182450045, + "grad_norm": 0.13910454511642456, + "learning_rate": 2.6550282363162472e-05, + "loss": 0.0043, + "step": 8640 + }, + { + "epoch": 1.8788010425716768, + "grad_norm": 0.07800116389989853, + "learning_rate": 2.6523132059079065e-05, + "loss": 0.0185, + "step": 8650 + }, + { + "epoch": 1.8809730668983493, + "grad_norm": 1.8513163328170776, + "learning_rate": 2.6495981754995658e-05, + "loss": 0.0057, + "step": 8660 + }, + { + "epoch": 1.8831450912250216, + "grad_norm": 0.006727566011250019, + "learning_rate": 2.646883145091225e-05, + "loss": 0.0011, + "step": 8670 + }, + { + "epoch": 1.8853171155516941, + "grad_norm": 0.004973508417606354, + "learning_rate": 2.6441681146828844e-05, + "loss": 0.0011, + "step": 8680 + }, + { + "epoch": 1.8874891398783666, + "grad_norm": 0.011856785044074059, + "learning_rate": 2.6414530842745437e-05, + "loss": 0.0356, + "step": 8690 + }, + { + "epoch": 1.8896611642050392, + "grad_norm": 0.025869742035865784, + "learning_rate": 2.6387380538662034e-05, + "loss": 0.0203, + "step": 8700 + }, + { + "epoch": 1.8918331885317117, + "grad_norm": 0.010730310343205929, + "learning_rate": 2.636023023457863e-05, + "loss": 0.018, + "step": 8710 + }, + { + "epoch": 1.894005212858384, + "grad_norm": 0.009472482837736607, + "learning_rate": 2.6333079930495223e-05, + "loss": 0.0025, + "step": 8720 + }, + { + "epoch": 1.8961772371850565, + "grad_norm": 0.0053679742850363255, + "learning_rate": 2.630592962641182e-05, + "loss": 0.0017, + "step": 8730 + }, + { + "epoch": 1.8983492615117288, + "grad_norm": 0.004012218210846186, + "learning_rate": 2.6278779322328412e-05, + "loss": 0.0005, + "step": 8740 + }, + { + "epoch": 1.9005212858384013, + "grad_norm": 0.019520413130521774, + "learning_rate": 2.6251629018245006e-05, + "loss": 0.0337, + "step": 8750 + }, + { + "epoch": 1.9026933101650738, + "grad_norm": 0.014791909605264664, + "learning_rate": 2.62244787141616e-05, + "loss": 0.0047, + "step": 8760 + }, + { + "epoch": 1.9048653344917463, + "grad_norm": 0.00672262255102396, + "learning_rate": 2.619732841007819e-05, + "loss": 0.0015, + "step": 8770 + }, + { + "epoch": 1.9070373588184188, + "grad_norm": 0.005134627688676119, + "learning_rate": 2.6170178105994785e-05, + "loss": 0.0008, + "step": 8780 + }, + { + "epoch": 1.9092093831450914, + "grad_norm": 0.009598666802048683, + "learning_rate": 2.6143027801911384e-05, + "loss": 0.0689, + "step": 8790 + }, + { + "epoch": 1.9113814074717637, + "grad_norm": 0.010584644973278046, + "learning_rate": 2.6115877497827977e-05, + "loss": 0.0013, + "step": 8800 + }, + { + "epoch": 1.9135534317984362, + "grad_norm": 0.003948741592466831, + "learning_rate": 2.6088727193744574e-05, + "loss": 0.0026, + "step": 8810 + }, + { + "epoch": 1.9157254561251085, + "grad_norm": 0.02011152356863022, + "learning_rate": 2.6061576889661167e-05, + "loss": 0.017, + "step": 8820 + }, + { + "epoch": 1.917897480451781, + "grad_norm": 0.013354657217860222, + "learning_rate": 2.603442658557776e-05, + "loss": 0.0027, + "step": 8830 + }, + { + "epoch": 1.9200695047784535, + "grad_norm": 0.003748238319531083, + "learning_rate": 2.6007276281494353e-05, + "loss": 0.0152, + "step": 8840 + }, + { + "epoch": 1.922241529105126, + "grad_norm": 0.01269373670220375, + "learning_rate": 2.5980125977410946e-05, + "loss": 0.0008, + "step": 8850 + }, + { + "epoch": 1.9244135534317985, + "grad_norm": 0.00882689282298088, + "learning_rate": 2.595297567332754e-05, + "loss": 0.0025, + "step": 8860 + }, + { + "epoch": 1.926585577758471, + "grad_norm": 0.005238520447164774, + "learning_rate": 2.592582536924414e-05, + "loss": 0.004, + "step": 8870 + }, + { + "epoch": 1.9287576020851434, + "grad_norm": 0.06422706693410873, + "learning_rate": 2.589867506516073e-05, + "loss": 0.0325, + "step": 8880 + }, + { + "epoch": 1.9309296264118156, + "grad_norm": 0.0035638187546283007, + "learning_rate": 2.5871524761077325e-05, + "loss": 0.0205, + "step": 8890 + }, + { + "epoch": 1.9331016507384882, + "grad_norm": 0.1594795435667038, + "learning_rate": 2.584437445699392e-05, + "loss": 0.0606, + "step": 8900 + }, + { + "epoch": 1.9352736750651607, + "grad_norm": 0.012897428125143051, + "learning_rate": 2.5817224152910514e-05, + "loss": 0.0529, + "step": 8910 + }, + { + "epoch": 1.9374456993918332, + "grad_norm": 0.00483663659542799, + "learning_rate": 2.5790073848827107e-05, + "loss": 0.004, + "step": 8920 + }, + { + "epoch": 1.9396177237185057, + "grad_norm": 0.004800234921276569, + "learning_rate": 2.57629235447437e-05, + "loss": 0.0017, + "step": 8930 + }, + { + "epoch": 1.9417897480451782, + "grad_norm": 0.09830110520124435, + "learning_rate": 2.57357732406603e-05, + "loss": 0.0016, + "step": 8940 + }, + { + "epoch": 1.9439617723718505, + "grad_norm": 0.0042363316752016544, + "learning_rate": 2.5708622936576893e-05, + "loss": 0.0442, + "step": 8950 + }, + { + "epoch": 1.946133796698523, + "grad_norm": 0.014568965882062912, + "learning_rate": 2.5681472632493486e-05, + "loss": 0.0379, + "step": 8960 + }, + { + "epoch": 1.9483058210251953, + "grad_norm": 0.010344590991735458, + "learning_rate": 2.565432232841008e-05, + "loss": 0.0207, + "step": 8970 + }, + { + "epoch": 1.9504778453518679, + "grad_norm": 0.017466790974140167, + "learning_rate": 2.5627172024326672e-05, + "loss": 0.0023, + "step": 8980 + }, + { + "epoch": 1.9526498696785404, + "grad_norm": 0.027372797951102257, + "learning_rate": 2.560002172024327e-05, + "loss": 0.0014, + "step": 8990 + }, + { + "epoch": 1.954821894005213, + "grad_norm": 0.049418918788433075, + "learning_rate": 2.557287141615986e-05, + "loss": 0.0149, + "step": 9000 + }, + { + "epoch": 1.9569939183318854, + "grad_norm": 0.002828997327014804, + "learning_rate": 2.5545721112076454e-05, + "loss": 0.0004, + "step": 9010 + }, + { + "epoch": 1.959165942658558, + "grad_norm": 0.0028039535973221064, + "learning_rate": 2.5518570807993054e-05, + "loss": 0.0168, + "step": 9020 + }, + { + "epoch": 1.9613379669852302, + "grad_norm": 0.01572626270353794, + "learning_rate": 2.5491420503909647e-05, + "loss": 0.0564, + "step": 9030 + }, + { + "epoch": 1.9635099913119027, + "grad_norm": 0.08528812974691391, + "learning_rate": 2.546427019982624e-05, + "loss": 0.0128, + "step": 9040 + }, + { + "epoch": 1.965682015638575, + "grad_norm": 0.018535811454057693, + "learning_rate": 2.5437119895742833e-05, + "loss": 0.0102, + "step": 9050 + }, + { + "epoch": 1.9678540399652475, + "grad_norm": 0.007433717139065266, + "learning_rate": 2.5409969591659426e-05, + "loss": 0.0039, + "step": 9060 + }, + { + "epoch": 1.97002606429192, + "grad_norm": 0.0035428928676992655, + "learning_rate": 2.538281928757602e-05, + "loss": 0.0226, + "step": 9070 + }, + { + "epoch": 1.9721980886185926, + "grad_norm": 0.011540411040186882, + "learning_rate": 2.5355668983492616e-05, + "loss": 0.0093, + "step": 9080 + }, + { + "epoch": 1.974370112945265, + "grad_norm": 1.2338663339614868, + "learning_rate": 2.5328518679409212e-05, + "loss": 0.0123, + "step": 9090 + }, + { + "epoch": 1.9765421372719374, + "grad_norm": 0.003352933330461383, + "learning_rate": 2.5301368375325805e-05, + "loss": 0.0008, + "step": 9100 + }, + { + "epoch": 1.97871416159861, + "grad_norm": 0.0044760508462786674, + "learning_rate": 2.52742180712424e-05, + "loss": 0.0374, + "step": 9110 + }, + { + "epoch": 1.9808861859252822, + "grad_norm": 0.0031131410505622625, + "learning_rate": 2.5247067767158995e-05, + "loss": 0.0009, + "step": 9120 + }, + { + "epoch": 1.9830582102519547, + "grad_norm": 0.1342082917690277, + "learning_rate": 2.5219917463075588e-05, + "loss": 0.0052, + "step": 9130 + }, + { + "epoch": 1.9852302345786272, + "grad_norm": 0.005118147935718298, + "learning_rate": 2.519276715899218e-05, + "loss": 0.0006, + "step": 9140 + }, + { + "epoch": 1.9874022589052998, + "grad_norm": 0.0030676417518407106, + "learning_rate": 2.5165616854908774e-05, + "loss": 0.0006, + "step": 9150 + }, + { + "epoch": 1.9895742832319723, + "grad_norm": 0.005789658520370722, + "learning_rate": 2.5138466550825367e-05, + "loss": 0.0044, + "step": 9160 + }, + { + "epoch": 1.9917463075586448, + "grad_norm": 0.025706937536597252, + "learning_rate": 2.5111316246741966e-05, + "loss": 0.0048, + "step": 9170 + }, + { + "epoch": 1.993918331885317, + "grad_norm": 0.002730116480961442, + "learning_rate": 2.508416594265856e-05, + "loss": 0.0008, + "step": 9180 + }, + { + "epoch": 1.9960903562119896, + "grad_norm": 0.0026551971677690744, + "learning_rate": 2.5057015638575156e-05, + "loss": 0.001, + "step": 9190 + }, + { + "epoch": 1.998262380538662, + "grad_norm": 0.0026213659439235926, + "learning_rate": 2.502986533449175e-05, + "loss": 0.0006, + "step": 9200 + }, + { + "epoch": 2.0, + "eval_f1": 0.4260869565217391, + "eval_loss": 0.0783366709947586, + "eval_runtime": 82.7104, + "eval_samples_per_second": 120.601, + "eval_steps_per_second": 7.544, + "step": 9208 + }, + { + "epoch": 2.0004344048653344, + "grad_norm": 0.0034962480422109365, + "learning_rate": 2.5002715030408342e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 2.002606429192007, + "grad_norm": 0.0033830376341938972, + "learning_rate": 2.4975564726324935e-05, + "loss": 0.0013, + "step": 9220 + }, + { + "epoch": 2.0047784535186794, + "grad_norm": 0.020555773749947548, + "learning_rate": 2.494841442224153e-05, + "loss": 0.0007, + "step": 9230 + }, + { + "epoch": 2.006950477845352, + "grad_norm": 0.00492148706689477, + "learning_rate": 2.4921264118158124e-05, + "loss": 0.0037, + "step": 9240 + }, + { + "epoch": 2.0091225021720245, + "grad_norm": 0.06383500248193741, + "learning_rate": 2.4894113814074717e-05, + "loss": 0.0013, + "step": 9250 + }, + { + "epoch": 2.011294526498697, + "grad_norm": 0.007955756969749928, + "learning_rate": 2.486696350999131e-05, + "loss": 0.0005, + "step": 9260 + }, + { + "epoch": 2.013466550825369, + "grad_norm": 0.002853860380128026, + "learning_rate": 2.4839813205907907e-05, + "loss": 0.0223, + "step": 9270 + }, + { + "epoch": 2.0156385751520416, + "grad_norm": 0.0032983573619276285, + "learning_rate": 2.4812662901824503e-05, + "loss": 0.0005, + "step": 9280 + }, + { + "epoch": 2.017810599478714, + "grad_norm": 0.015170352533459663, + "learning_rate": 2.4785512597741096e-05, + "loss": 0.0023, + "step": 9290 + }, + { + "epoch": 2.0199826238053866, + "grad_norm": 0.014421436935663223, + "learning_rate": 2.4758362293657693e-05, + "loss": 0.001, + "step": 9300 + }, + { + "epoch": 2.022154648132059, + "grad_norm": 0.7703703045845032, + "learning_rate": 2.4731211989574286e-05, + "loss": 0.0185, + "step": 9310 + }, + { + "epoch": 2.0243266724587317, + "grad_norm": 0.003181320382282138, + "learning_rate": 2.470406168549088e-05, + "loss": 0.0006, + "step": 9320 + }, + { + "epoch": 2.026498696785404, + "grad_norm": 0.004758354276418686, + "learning_rate": 2.467691138140747e-05, + "loss": 0.0023, + "step": 9330 + }, + { + "epoch": 2.0286707211120762, + "grad_norm": 0.02517046220600605, + "learning_rate": 2.4649761077324068e-05, + "loss": 0.0263, + "step": 9340 + }, + { + "epoch": 2.0308427454387488, + "grad_norm": 0.012879346497356892, + "learning_rate": 2.462261077324066e-05, + "loss": 0.0165, + "step": 9350 + }, + { + "epoch": 2.0330147697654213, + "grad_norm": 0.06726440042257309, + "learning_rate": 2.4595460469157254e-05, + "loss": 0.0007, + "step": 9360 + }, + { + "epoch": 2.035186794092094, + "grad_norm": 0.0029317401349544525, + "learning_rate": 2.456831016507385e-05, + "loss": 0.0176, + "step": 9370 + }, + { + "epoch": 2.0373588184187663, + "grad_norm": 0.0026387099642306566, + "learning_rate": 2.4541159860990447e-05, + "loss": 0.001, + "step": 9380 + }, + { + "epoch": 2.039530842745439, + "grad_norm": 12.957719802856445, + "learning_rate": 2.451400955690704e-05, + "loss": 0.0049, + "step": 9390 + }, + { + "epoch": 2.0417028670721113, + "grad_norm": 0.012641023844480515, + "learning_rate": 2.4486859252823633e-05, + "loss": 0.0017, + "step": 9400 + }, + { + "epoch": 2.043874891398784, + "grad_norm": 0.004745169542729855, + "learning_rate": 2.4459708948740226e-05, + "loss": 0.0157, + "step": 9410 + }, + { + "epoch": 2.046046915725456, + "grad_norm": 0.008116445504128933, + "learning_rate": 2.4432558644656822e-05, + "loss": 0.0014, + "step": 9420 + }, + { + "epoch": 2.0482189400521285, + "grad_norm": 0.005568632390350103, + "learning_rate": 2.4405408340573415e-05, + "loss": 0.0014, + "step": 9430 + }, + { + "epoch": 2.050390964378801, + "grad_norm": 0.01662755198776722, + "learning_rate": 2.437825803649001e-05, + "loss": 0.0308, + "step": 9440 + }, + { + "epoch": 2.0525629887054735, + "grad_norm": 0.46532508730888367, + "learning_rate": 2.43511077324066e-05, + "loss": 0.0008, + "step": 9450 + }, + { + "epoch": 2.054735013032146, + "grad_norm": 0.05777544528245926, + "learning_rate": 2.4323957428323198e-05, + "loss": 0.0006, + "step": 9460 + }, + { + "epoch": 2.0569070373588185, + "grad_norm": 0.003521420992910862, + "learning_rate": 2.4296807124239794e-05, + "loss": 0.0006, + "step": 9470 + }, + { + "epoch": 2.059079061685491, + "grad_norm": 0.0035868764389306307, + "learning_rate": 2.4269656820156387e-05, + "loss": 0.0012, + "step": 9480 + }, + { + "epoch": 2.061251086012163, + "grad_norm": 0.002437378978356719, + "learning_rate": 2.4242506516072984e-05, + "loss": 0.0023, + "step": 9490 + }, + { + "epoch": 2.0634231103388356, + "grad_norm": 0.0028121236246079206, + "learning_rate": 2.4215356211989577e-05, + "loss": 0.0259, + "step": 9500 + }, + { + "epoch": 2.065595134665508, + "grad_norm": 0.0023697256110608578, + "learning_rate": 2.418820590790617e-05, + "loss": 0.0139, + "step": 9510 + }, + { + "epoch": 2.0677671589921807, + "grad_norm": 0.003545396961271763, + "learning_rate": 2.4161055603822763e-05, + "loss": 0.0173, + "step": 9520 + }, + { + "epoch": 2.069939183318853, + "grad_norm": 0.0023652813397347927, + "learning_rate": 2.413390529973936e-05, + "loss": 0.0002, + "step": 9530 + }, + { + "epoch": 2.0721112076455257, + "grad_norm": 0.0023803082294762135, + "learning_rate": 2.4106754995655952e-05, + "loss": 0.0003, + "step": 9540 + }, + { + "epoch": 2.074283231972198, + "grad_norm": 3.721370220184326, + "learning_rate": 2.4079604691572545e-05, + "loss": 0.0289, + "step": 9550 + }, + { + "epoch": 2.0764552562988707, + "grad_norm": 0.0023493345361202955, + "learning_rate": 2.405245438748914e-05, + "loss": 0.0017, + "step": 9560 + }, + { + "epoch": 2.078627280625543, + "grad_norm": 0.0024111897218972445, + "learning_rate": 2.4025304083405738e-05, + "loss": 0.0007, + "step": 9570 + }, + { + "epoch": 2.0807993049522153, + "grad_norm": 0.00238398858346045, + "learning_rate": 2.399815377932233e-05, + "loss": 0.0007, + "step": 9580 + }, + { + "epoch": 2.082971329278888, + "grad_norm": 0.0024116358254104853, + "learning_rate": 2.3971003475238924e-05, + "loss": 0.0009, + "step": 9590 + }, + { + "epoch": 2.0851433536055604, + "grad_norm": 0.0063408599235117435, + "learning_rate": 2.3943853171155517e-05, + "loss": 0.0205, + "step": 9600 + }, + { + "epoch": 2.087315377932233, + "grad_norm": 0.21969419717788696, + "learning_rate": 2.3916702867072113e-05, + "loss": 0.0012, + "step": 9610 + }, + { + "epoch": 2.0894874022589054, + "grad_norm": 0.003895159810781479, + "learning_rate": 2.3889552562988706e-05, + "loss": 0.0006, + "step": 9620 + }, + { + "epoch": 2.091659426585578, + "grad_norm": 0.005609441548585892, + "learning_rate": 2.386511728931364e-05, + "loss": 0.0084, + "step": 9630 + }, + { + "epoch": 2.0938314509122504, + "grad_norm": 0.002843148773536086, + "learning_rate": 2.3837966985230237e-05, + "loss": 0.0007, + "step": 9640 + }, + { + "epoch": 2.0960034752389225, + "grad_norm": 0.0022862793412059546, + "learning_rate": 2.381081668114683e-05, + "loss": 0.0018, + "step": 9650 + }, + { + "epoch": 2.098175499565595, + "grad_norm": 0.007815422490239143, + "learning_rate": 2.3783666377063423e-05, + "loss": 0.0011, + "step": 9660 + }, + { + "epoch": 2.1003475238922675, + "grad_norm": 0.0026114368811249733, + "learning_rate": 2.375651607298002e-05, + "loss": 0.0004, + "step": 9670 + }, + { + "epoch": 2.10251954821894, + "grad_norm": 0.002247220603749156, + "learning_rate": 2.3729365768896612e-05, + "loss": 0.0276, + "step": 9680 + }, + { + "epoch": 2.1046915725456126, + "grad_norm": 0.0022397038992494345, + "learning_rate": 2.370221546481321e-05, + "loss": 0.0016, + "step": 9690 + }, + { + "epoch": 2.106863596872285, + "grad_norm": 0.003445403417572379, + "learning_rate": 2.36750651607298e-05, + "loss": 0.0244, + "step": 9700 + }, + { + "epoch": 2.1090356211989576, + "grad_norm": 0.002695605391636491, + "learning_rate": 2.3647914856646394e-05, + "loss": 0.0002, + "step": 9710 + }, + { + "epoch": 2.1112076455256297, + "grad_norm": 0.00220641796477139, + "learning_rate": 2.362076455256299e-05, + "loss": 0.042, + "step": 9720 + }, + { + "epoch": 2.113379669852302, + "grad_norm": 0.0028647775761783123, + "learning_rate": 2.3593614248479584e-05, + "loss": 0.0146, + "step": 9730 + }, + { + "epoch": 2.1155516941789747, + "grad_norm": 0.00231398968026042, + "learning_rate": 2.3566463944396177e-05, + "loss": 0.0007, + "step": 9740 + }, + { + "epoch": 2.1177237185056472, + "grad_norm": 0.003436851780861616, + "learning_rate": 2.3539313640312773e-05, + "loss": 0.0083, + "step": 9750 + }, + { + "epoch": 2.1198957428323197, + "grad_norm": 0.01339609082788229, + "learning_rate": 2.3512163336229366e-05, + "loss": 0.0007, + "step": 9760 + }, + { + "epoch": 2.1220677671589923, + "grad_norm": 0.035412754863500595, + "learning_rate": 2.3485013032145963e-05, + "loss": 0.0165, + "step": 9770 + }, + { + "epoch": 2.1242397914856648, + "grad_norm": 0.012655205093324184, + "learning_rate": 2.3457862728062556e-05, + "loss": 0.0003, + "step": 9780 + }, + { + "epoch": 2.1264118158123373, + "grad_norm": 0.10211930423974991, + "learning_rate": 2.3430712423979152e-05, + "loss": 0.0015, + "step": 9790 + }, + { + "epoch": 2.1285838401390094, + "grad_norm": 0.11602195352315903, + "learning_rate": 2.3403562119895745e-05, + "loss": 0.0007, + "step": 9800 + }, + { + "epoch": 2.130755864465682, + "grad_norm": 0.0032904541585594416, + "learning_rate": 2.3376411815812338e-05, + "loss": 0.0307, + "step": 9810 + }, + { + "epoch": 2.1329278887923544, + "grad_norm": 0.0022344952449202538, + "learning_rate": 2.334926151172893e-05, + "loss": 0.0494, + "step": 9820 + }, + { + "epoch": 2.135099913119027, + "grad_norm": 0.008065508678555489, + "learning_rate": 2.3322111207645528e-05, + "loss": 0.0079, + "step": 9830 + }, + { + "epoch": 2.1372719374456994, + "grad_norm": 0.0043975287117064, + "learning_rate": 2.329496090356212e-05, + "loss": 0.0031, + "step": 9840 + }, + { + "epoch": 2.139443961772372, + "grad_norm": 0.003410003613680601, + "learning_rate": 2.3267810599478714e-05, + "loss": 0.0006, + "step": 9850 + }, + { + "epoch": 2.1416159860990445, + "grad_norm": 0.005292691756039858, + "learning_rate": 2.324066029539531e-05, + "loss": 0.0144, + "step": 9860 + }, + { + "epoch": 2.143788010425717, + "grad_norm": 0.04600781202316284, + "learning_rate": 2.3213509991311903e-05, + "loss": 0.0011, + "step": 9870 + }, + { + "epoch": 2.145960034752389, + "grad_norm": 0.06467035412788391, + "learning_rate": 2.31863596872285e-05, + "loss": 0.0013, + "step": 9880 + }, + { + "epoch": 2.1481320590790616, + "grad_norm": 0.0022334170062094927, + "learning_rate": 2.3159209383145092e-05, + "loss": 0.0015, + "step": 9890 + }, + { + "epoch": 2.150304083405734, + "grad_norm": 0.0021759674418717623, + "learning_rate": 2.3132059079061685e-05, + "loss": 0.0004, + "step": 9900 + }, + { + "epoch": 2.1524761077324066, + "grad_norm": 0.002184486947953701, + "learning_rate": 2.3104908774978282e-05, + "loss": 0.0003, + "step": 9910 + }, + { + "epoch": 2.154648132059079, + "grad_norm": 0.02917756699025631, + "learning_rate": 2.3077758470894875e-05, + "loss": 0.0203, + "step": 9920 + }, + { + "epoch": 2.1568201563857516, + "grad_norm": 0.005450095981359482, + "learning_rate": 2.3050608166811468e-05, + "loss": 0.0004, + "step": 9930 + }, + { + "epoch": 2.158992180712424, + "grad_norm": 0.002119156066328287, + "learning_rate": 2.3023457862728064e-05, + "loss": 0.0077, + "step": 9940 + }, + { + "epoch": 2.1611642050390962, + "grad_norm": 0.0021557000000029802, + "learning_rate": 2.2996307558644657e-05, + "loss": 0.0003, + "step": 9950 + }, + { + "epoch": 2.1633362293657687, + "grad_norm": 0.0028954967856407166, + "learning_rate": 2.2969157254561254e-05, + "loss": 0.0337, + "step": 9960 + }, + { + "epoch": 2.1655082536924413, + "grad_norm": 0.0021395536605268717, + "learning_rate": 2.2942006950477847e-05, + "loss": 0.0006, + "step": 9970 + }, + { + "epoch": 2.167680278019114, + "grad_norm": 0.005348121747374535, + "learning_rate": 2.2914856646394443e-05, + "loss": 0.0235, + "step": 9980 + }, + { + "epoch": 2.1698523023457863, + "grad_norm": 0.05934173986315727, + "learning_rate": 2.2887706342311036e-05, + "loss": 0.0008, + "step": 9990 + }, + { + "epoch": 2.172024326672459, + "grad_norm": 0.00287908548489213, + "learning_rate": 2.286327106863597e-05, + "loss": 0.026, + "step": 10000 + }, + { + "epoch": 2.1741963509991313, + "grad_norm": 0.0036688735708594322, + "learning_rate": 2.2836120764552566e-05, + "loss": 0.0004, + "step": 10010 + }, + { + "epoch": 2.176368375325804, + "grad_norm": 0.002232051221653819, + "learning_rate": 2.280897046046916e-05, + "loss": 0.0028, + "step": 10020 + }, + { + "epoch": 2.178540399652476, + "grad_norm": 0.0030915099196135998, + "learning_rate": 2.2781820156385752e-05, + "loss": 0.0045, + "step": 10030 + }, + { + "epoch": 2.1807124239791484, + "grad_norm": 0.019997352734208107, + "learning_rate": 2.2754669852302345e-05, + "loss": 0.0019, + "step": 10040 + }, + { + "epoch": 2.182884448305821, + "grad_norm": 0.010031803511083126, + "learning_rate": 2.2727519548218942e-05, + "loss": 0.0034, + "step": 10050 + }, + { + "epoch": 2.1850564726324935, + "grad_norm": 0.007232017815113068, + "learning_rate": 2.2700369244135535e-05, + "loss": 0.0005, + "step": 10060 + }, + { + "epoch": 2.187228496959166, + "grad_norm": 0.0020421240478754044, + "learning_rate": 2.2673218940052128e-05, + "loss": 0.0002, + "step": 10070 + }, + { + "epoch": 2.1894005212858385, + "grad_norm": 0.00201141694560647, + "learning_rate": 2.2646068635968724e-05, + "loss": 0.0003, + "step": 10080 + }, + { + "epoch": 2.191572545612511, + "grad_norm": 0.0020108462776988745, + "learning_rate": 2.261891833188532e-05, + "loss": 0.0002, + "step": 10090 + }, + { + "epoch": 2.1937445699391835, + "grad_norm": 0.010289808735251427, + "learning_rate": 2.2591768027801914e-05, + "loss": 0.0355, + "step": 10100 + }, + { + "epoch": 2.1959165942658556, + "grad_norm": 0.0021487479098141193, + "learning_rate": 2.2564617723718507e-05, + "loss": 0.0006, + "step": 10110 + }, + { + "epoch": 2.198088618592528, + "grad_norm": 0.0020929924212396145, + "learning_rate": 2.25374674196351e-05, + "loss": 0.0004, + "step": 10120 + }, + { + "epoch": 2.2002606429192006, + "grad_norm": 0.0024244049564003944, + "learning_rate": 2.2510317115551696e-05, + "loss": 0.0004, + "step": 10130 + }, + { + "epoch": 2.202432667245873, + "grad_norm": 0.007566337939351797, + "learning_rate": 2.248316681146829e-05, + "loss": 0.0008, + "step": 10140 + }, + { + "epoch": 2.2046046915725457, + "grad_norm": 0.0022506555542349815, + "learning_rate": 2.2456016507384882e-05, + "loss": 0.0002, + "step": 10150 + }, + { + "epoch": 2.206776715899218, + "grad_norm": 0.002210445236414671, + "learning_rate": 2.2428866203301475e-05, + "loss": 0.0006, + "step": 10160 + }, + { + "epoch": 2.2089487402258907, + "grad_norm": 0.0020521217957139015, + "learning_rate": 2.240171589921807e-05, + "loss": 0.0005, + "step": 10170 + }, + { + "epoch": 2.211120764552563, + "grad_norm": 0.015064552426338196, + "learning_rate": 2.2374565595134668e-05, + "loss": 0.0154, + "step": 10180 + }, + { + "epoch": 2.2132927888792353, + "grad_norm": 0.0021721182856708765, + "learning_rate": 2.234741529105126e-05, + "loss": 0.0005, + "step": 10190 + }, + { + "epoch": 2.215464813205908, + "grad_norm": 0.001961242873221636, + "learning_rate": 2.2320264986967854e-05, + "loss": 0.0004, + "step": 10200 + }, + { + "epoch": 2.2176368375325803, + "grad_norm": 0.005401493050158024, + "learning_rate": 2.229311468288445e-05, + "loss": 0.0003, + "step": 10210 + }, + { + "epoch": 2.219808861859253, + "grad_norm": 0.004179791547358036, + "learning_rate": 2.2265964378801043e-05, + "loss": 0.0005, + "step": 10220 + }, + { + "epoch": 2.2219808861859254, + "grad_norm": 0.003323676297441125, + "learning_rate": 2.2238814074717636e-05, + "loss": 0.0003, + "step": 10230 + }, + { + "epoch": 2.224152910512598, + "grad_norm": 0.01833084411919117, + "learning_rate": 2.2211663770634233e-05, + "loss": 0.0269, + "step": 10240 + }, + { + "epoch": 2.22632493483927, + "grad_norm": 0.018702253699302673, + "learning_rate": 2.2184513466550826e-05, + "loss": 0.0007, + "step": 10250 + }, + { + "epoch": 2.2284969591659425, + "grad_norm": 0.04174269735813141, + "learning_rate": 2.215736316246742e-05, + "loss": 0.0005, + "step": 10260 + }, + { + "epoch": 2.230668983492615, + "grad_norm": 0.005739922169595957, + "learning_rate": 2.2130212858384015e-05, + "loss": 0.0023, + "step": 10270 + }, + { + "epoch": 2.2328410078192875, + "grad_norm": 0.019641762599349022, + "learning_rate": 2.210306255430061e-05, + "loss": 0.0068, + "step": 10280 + }, + { + "epoch": 2.23501303214596, + "grad_norm": 0.0034746015444397926, + "learning_rate": 2.2075912250217205e-05, + "loss": 0.0003, + "step": 10290 + }, + { + "epoch": 2.2371850564726325, + "grad_norm": 0.008811332285404205, + "learning_rate": 2.2048761946133798e-05, + "loss": 0.0291, + "step": 10300 + }, + { + "epoch": 2.239357080799305, + "grad_norm": 0.014716439880430698, + "learning_rate": 2.202161164205039e-05, + "loss": 0.0004, + "step": 10310 + }, + { + "epoch": 2.2415291051259776, + "grad_norm": 0.013416060246527195, + "learning_rate": 2.1994461337966987e-05, + "loss": 0.001, + "step": 10320 + }, + { + "epoch": 2.24370112945265, + "grad_norm": 0.0019183940021321177, + "learning_rate": 2.196731103388358e-05, + "loss": 0.0003, + "step": 10330 + }, + { + "epoch": 2.245873153779322, + "grad_norm": 0.002004083478823304, + "learning_rate": 2.1940160729800173e-05, + "loss": 0.0004, + "step": 10340 + }, + { + "epoch": 2.2480451781059947, + "grad_norm": 0.01202855259180069, + "learning_rate": 2.191301042571677e-05, + "loss": 0.0005, + "step": 10350 + }, + { + "epoch": 2.250217202432667, + "grad_norm": 0.0019037555903196335, + "learning_rate": 2.1885860121633363e-05, + "loss": 0.0006, + "step": 10360 + }, + { + "epoch": 2.2523892267593397, + "grad_norm": 0.0023449785076081753, + "learning_rate": 2.185870981754996e-05, + "loss": 0.0003, + "step": 10370 + }, + { + "epoch": 2.2545612510860122, + "grad_norm": 0.007250937633216381, + "learning_rate": 2.1831559513466552e-05, + "loss": 0.0004, + "step": 10380 + }, + { + "epoch": 2.2567332754126848, + "grad_norm": 0.0018250870052725077, + "learning_rate": 2.1804409209383145e-05, + "loss": 0.0004, + "step": 10390 + }, + { + "epoch": 2.2589052997393573, + "grad_norm": 0.00181575957685709, + "learning_rate": 2.177725890529974e-05, + "loss": 0.0002, + "step": 10400 + }, + { + "epoch": 2.2610773240660293, + "grad_norm": 0.0019296440295875072, + "learning_rate": 2.1750108601216334e-05, + "loss": 0.0102, + "step": 10410 + }, + { + "epoch": 2.263249348392702, + "grad_norm": 0.008913841098546982, + "learning_rate": 2.1722958297132927e-05, + "loss": 0.0157, + "step": 10420 + }, + { + "epoch": 2.2654213727193744, + "grad_norm": 0.001792456954717636, + "learning_rate": 2.1695807993049524e-05, + "loss": 0.0002, + "step": 10430 + }, + { + "epoch": 2.267593397046047, + "grad_norm": 0.0018146372167393565, + "learning_rate": 2.1668657688966117e-05, + "loss": 0.0106, + "step": 10440 + }, + { + "epoch": 2.2697654213727194, + "grad_norm": 0.53955078125, + "learning_rate": 2.164150738488271e-05, + "loss": 0.0019, + "step": 10450 + }, + { + "epoch": 2.271937445699392, + "grad_norm": 0.0025254616048187017, + "learning_rate": 2.1614357080799306e-05, + "loss": 0.0327, + "step": 10460 + }, + { + "epoch": 2.2741094700260645, + "grad_norm": 0.05029534921050072, + "learning_rate": 2.1587206776715903e-05, + "loss": 0.0236, + "step": 10470 + }, + { + "epoch": 2.2762814943527365, + "grad_norm": 0.006896906066685915, + "learning_rate": 2.1560056472632496e-05, + "loss": 0.0014, + "step": 10480 + }, + { + "epoch": 2.278453518679409, + "grad_norm": 0.007253032643347979, + "learning_rate": 2.153290616854909e-05, + "loss": 0.0004, + "step": 10490 + }, + { + "epoch": 2.2806255430060816, + "grad_norm": 0.002225355012342334, + "learning_rate": 2.1505755864465682e-05, + "loss": 0.0007, + "step": 10500 + }, + { + "epoch": 2.282797567332754, + "grad_norm": 0.0017743059433996677, + "learning_rate": 2.1478605560382278e-05, + "loss": 0.0295, + "step": 10510 + }, + { + "epoch": 2.2849695916594266, + "grad_norm": 0.001814755261875689, + "learning_rate": 2.145145525629887e-05, + "loss": 0.0002, + "step": 10520 + }, + { + "epoch": 2.287141615986099, + "grad_norm": 0.002265684073790908, + "learning_rate": 2.1424304952215464e-05, + "loss": 0.0332, + "step": 10530 + }, + { + "epoch": 2.2893136403127716, + "grad_norm": 0.004274342674762011, + "learning_rate": 2.139715464813206e-05, + "loss": 0.0009, + "step": 10540 + }, + { + "epoch": 2.291485664639444, + "grad_norm": 0.0074489060789346695, + "learning_rate": 2.1370004344048654e-05, + "loss": 0.0005, + "step": 10550 + }, + { + "epoch": 2.2936576889661167, + "grad_norm": 0.0019445127109065652, + "learning_rate": 2.134285403996525e-05, + "loss": 0.0006, + "step": 10560 + }, + { + "epoch": 2.2958297132927887, + "grad_norm": 0.0019618631340563297, + "learning_rate": 2.1315703735881843e-05, + "loss": 0.0421, + "step": 10570 + }, + { + "epoch": 2.2980017376194612, + "grad_norm": 0.001958042150363326, + "learning_rate": 2.1288553431798436e-05, + "loss": 0.0002, + "step": 10580 + }, + { + "epoch": 2.3001737619461338, + "grad_norm": 2.4066147804260254, + "learning_rate": 2.1261403127715032e-05, + "loss": 0.0734, + "step": 10590 + }, + { + "epoch": 2.3023457862728063, + "grad_norm": 0.020740212872624397, + "learning_rate": 2.1234252823631625e-05, + "loss": 0.0007, + "step": 10600 + }, + { + "epoch": 2.304517810599479, + "grad_norm": 0.20123733580112457, + "learning_rate": 2.120710251954822e-05, + "loss": 0.0017, + "step": 10610 + }, + { + "epoch": 2.3066898349261513, + "grad_norm": 0.5305845141410828, + "learning_rate": 2.1179952215464815e-05, + "loss": 0.0378, + "step": 10620 + }, + { + "epoch": 2.308861859252824, + "grad_norm": 0.007456798106431961, + "learning_rate": 2.1152801911381408e-05, + "loss": 0.0003, + "step": 10630 + }, + { + "epoch": 2.311033883579496, + "grad_norm": 0.0020253192633390427, + "learning_rate": 2.1125651607298004e-05, + "loss": 0.0047, + "step": 10640 + }, + { + "epoch": 2.3132059079061684, + "grad_norm": 0.0039650010876357555, + "learning_rate": 2.1098501303214597e-05, + "loss": 0.0023, + "step": 10650 + }, + { + "epoch": 2.315377932232841, + "grad_norm": 0.015204093419015408, + "learning_rate": 2.1071350999131194e-05, + "loss": 0.0017, + "step": 10660 + }, + { + "epoch": 2.3175499565595135, + "grad_norm": 0.001984816510230303, + "learning_rate": 2.1044200695047787e-05, + "loss": 0.0015, + "step": 10670 + }, + { + "epoch": 2.319721980886186, + "grad_norm": 0.0019000971224159002, + "learning_rate": 2.101705039096438e-05, + "loss": 0.0012, + "step": 10680 + }, + { + "epoch": 2.3218940052128585, + "grad_norm": 0.0021752913016825914, + "learning_rate": 2.0989900086880973e-05, + "loss": 0.0027, + "step": 10690 + }, + { + "epoch": 2.324066029539531, + "grad_norm": 0.003734230063855648, + "learning_rate": 2.096274978279757e-05, + "loss": 0.0017, + "step": 10700 + }, + { + "epoch": 2.326238053866203, + "grad_norm": 0.007717654574662447, + "learning_rate": 2.0935599478714162e-05, + "loss": 0.0165, + "step": 10710 + }, + { + "epoch": 2.3284100781928756, + "grad_norm": 0.12917552888393402, + "learning_rate": 2.0908449174630755e-05, + "loss": 0.0012, + "step": 10720 + }, + { + "epoch": 2.330582102519548, + "grad_norm": 0.0019150119042024016, + "learning_rate": 2.088129887054735e-05, + "loss": 0.005, + "step": 10730 + }, + { + "epoch": 2.3327541268462206, + "grad_norm": 0.0018117213621735573, + "learning_rate": 2.0854148566463945e-05, + "loss": 0.0003, + "step": 10740 + }, + { + "epoch": 2.334926151172893, + "grad_norm": 0.003375353990122676, + "learning_rate": 2.082699826238054e-05, + "loss": 0.0007, + "step": 10750 + }, + { + "epoch": 2.3370981754995657, + "grad_norm": 0.00251543871127069, + "learning_rate": 2.0799847958297134e-05, + "loss": 0.0011, + "step": 10760 + }, + { + "epoch": 2.339270199826238, + "grad_norm": 0.0018188374815508723, + "learning_rate": 2.0772697654213727e-05, + "loss": 0.0002, + "step": 10770 + }, + { + "epoch": 2.3414422241529107, + "grad_norm": 0.0017927787266671658, + "learning_rate": 2.0745547350130324e-05, + "loss": 0.0113, + "step": 10780 + }, + { + "epoch": 2.343614248479583, + "grad_norm": 0.15189605951309204, + "learning_rate": 2.0718397046046917e-05, + "loss": 0.0196, + "step": 10790 + }, + { + "epoch": 2.3457862728062553, + "grad_norm": 0.0038990580942481756, + "learning_rate": 2.069124674196351e-05, + "loss": 0.0374, + "step": 10800 + }, + { + "epoch": 2.347958297132928, + "grad_norm": 0.07599227130413055, + "learning_rate": 2.0664096437880106e-05, + "loss": 0.0011, + "step": 10810 + }, + { + "epoch": 2.3501303214596003, + "grad_norm": 0.3914039731025696, + "learning_rate": 2.06369461337967e-05, + "loss": 0.0038, + "step": 10820 + }, + { + "epoch": 2.352302345786273, + "grad_norm": 0.0017740110633894801, + "learning_rate": 2.0609795829713295e-05, + "loss": 0.0292, + "step": 10830 + }, + { + "epoch": 2.3544743701129454, + "grad_norm": 0.047251638025045395, + "learning_rate": 2.058264552562989e-05, + "loss": 0.0012, + "step": 10840 + }, + { + "epoch": 2.356646394439618, + "grad_norm": 0.0017566693713888526, + "learning_rate": 2.0555495221546485e-05, + "loss": 0.0008, + "step": 10850 + }, + { + "epoch": 2.3588184187662904, + "grad_norm": 0.006062482949346304, + "learning_rate": 2.0528344917463078e-05, + "loss": 0.0021, + "step": 10860 + }, + { + "epoch": 2.3609904430929625, + "grad_norm": 0.0026041665114462376, + "learning_rate": 2.050119461337967e-05, + "loss": 0.0009, + "step": 10870 + }, + { + "epoch": 2.363162467419635, + "grad_norm": 0.0018312680767849088, + "learning_rate": 2.0474044309296264e-05, + "loss": 0.0007, + "step": 10880 + }, + { + "epoch": 2.3653344917463075, + "grad_norm": 0.0016842596232891083, + "learning_rate": 2.044689400521286e-05, + "loss": 0.0002, + "step": 10890 + }, + { + "epoch": 2.36750651607298, + "grad_norm": 9.34298038482666, + "learning_rate": 2.0419743701129453e-05, + "loss": 0.0413, + "step": 10900 + }, + { + "epoch": 2.3696785403996525, + "grad_norm": 0.6906639337539673, + "learning_rate": 2.0392593397046046e-05, + "loss": 0.0139, + "step": 10910 + }, + { + "epoch": 2.371850564726325, + "grad_norm": 0.019118599593639374, + "learning_rate": 2.0365443092962643e-05, + "loss": 0.0012, + "step": 10920 + }, + { + "epoch": 2.3740225890529976, + "grad_norm": 0.0017927911831066012, + "learning_rate": 2.0338292788879236e-05, + "loss": 0.0018, + "step": 10930 + }, + { + "epoch": 2.3761946133796696, + "grad_norm": 0.0017185697797685862, + "learning_rate": 2.0311142484795832e-05, + "loss": 0.0003, + "step": 10940 + }, + { + "epoch": 2.378366637706342, + "grad_norm": 0.0017069017048925161, + "learning_rate": 2.0283992180712425e-05, + "loss": 0.0005, + "step": 10950 + }, + { + "epoch": 2.3805386620330147, + "grad_norm": 0.0016620549140498042, + "learning_rate": 2.0256841876629018e-05, + "loss": 0.0006, + "step": 10960 + }, + { + "epoch": 2.382710686359687, + "grad_norm": 0.013684243895113468, + "learning_rate": 2.0229691572545615e-05, + "loss": 0.0007, + "step": 10970 + }, + { + "epoch": 2.3848827106863597, + "grad_norm": 0.013956604525446892, + "learning_rate": 2.0202541268462208e-05, + "loss": 0.0016, + "step": 10980 + }, + { + "epoch": 2.3870547350130322, + "grad_norm": 0.003464424517005682, + "learning_rate": 2.01753909643788e-05, + "loss": 0.0065, + "step": 10990 + }, + { + "epoch": 2.3892267593397047, + "grad_norm": 0.0017165833851322532, + "learning_rate": 2.0148240660295394e-05, + "loss": 0.0028, + "step": 11000 + }, + { + "epoch": 2.391398783666377, + "grad_norm": 0.0019415366696193814, + "learning_rate": 2.012109035621199e-05, + "loss": 0.0002, + "step": 11010 + }, + { + "epoch": 2.3935708079930493, + "grad_norm": 0.022299442440271378, + "learning_rate": 2.0093940052128586e-05, + "loss": 0.0004, + "step": 11020 + }, + { + "epoch": 2.395742832319722, + "grad_norm": 0.0016049507539719343, + "learning_rate": 2.006678974804518e-05, + "loss": 0.0002, + "step": 11030 + }, + { + "epoch": 2.3979148566463944, + "grad_norm": 0.012664725072681904, + "learning_rate": 2.0039639443961776e-05, + "loss": 0.0019, + "step": 11040 + }, + { + "epoch": 2.400086880973067, + "grad_norm": 0.010130131617188454, + "learning_rate": 2.001248913987837e-05, + "loss": 0.0004, + "step": 11050 + }, + { + "epoch": 2.4022589052997394, + "grad_norm": 0.049954500049352646, + "learning_rate": 1.9985338835794962e-05, + "loss": 0.003, + "step": 11060 + }, + { + "epoch": 2.404430929626412, + "grad_norm": 0.01277677807956934, + "learning_rate": 1.9958188531711555e-05, + "loss": 0.0014, + "step": 11070 + }, + { + "epoch": 2.4066029539530844, + "grad_norm": 0.0015594464493915439, + "learning_rate": 1.993103822762815e-05, + "loss": 0.0002, + "step": 11080 + }, + { + "epoch": 2.408774978279757, + "grad_norm": 0.0015805740840733051, + "learning_rate": 1.9903887923544744e-05, + "loss": 0.0005, + "step": 11090 + }, + { + "epoch": 2.410947002606429, + "grad_norm": 0.001530050183646381, + "learning_rate": 1.9876737619461337e-05, + "loss": 0.0002, + "step": 11100 + }, + { + "epoch": 2.4131190269331015, + "grad_norm": 0.001565085374750197, + "learning_rate": 1.9849587315377934e-05, + "loss": 0.0485, + "step": 11110 + }, + { + "epoch": 2.415291051259774, + "grad_norm": 0.0015722399111837149, + "learning_rate": 1.982243701129453e-05, + "loss": 0.0003, + "step": 11120 + }, + { + "epoch": 2.4174630755864466, + "grad_norm": 0.019924765452742577, + "learning_rate": 1.9795286707211123e-05, + "loss": 0.015, + "step": 11130 + }, + { + "epoch": 2.419635099913119, + "grad_norm": 0.001612838706932962, + "learning_rate": 1.9768136403127716e-05, + "loss": 0.0002, + "step": 11140 + }, + { + "epoch": 2.4218071242397916, + "grad_norm": 0.00153868249617517, + "learning_rate": 1.974098609904431e-05, + "loss": 0.0002, + "step": 11150 + }, + { + "epoch": 2.423979148566464, + "grad_norm": 2.775703191757202, + "learning_rate": 1.9713835794960906e-05, + "loss": 0.046, + "step": 11160 + }, + { + "epoch": 2.426151172893136, + "grad_norm": 0.012296248227357864, + "learning_rate": 1.96866854908775e-05, + "loss": 0.0005, + "step": 11170 + }, + { + "epoch": 2.4283231972198087, + "grad_norm": 0.0023132723290473223, + "learning_rate": 1.965953518679409e-05, + "loss": 0.0003, + "step": 11180 + }, + { + "epoch": 2.4304952215464812, + "grad_norm": 0.0025435942225158215, + "learning_rate": 1.9632384882710685e-05, + "loss": 0.0003, + "step": 11190 + }, + { + "epoch": 2.4326672458731537, + "grad_norm": 0.0021016327664256096, + "learning_rate": 1.960523457862728e-05, + "loss": 0.0177, + "step": 11200 + }, + { + "epoch": 2.4348392701998263, + "grad_norm": 0.002213849686086178, + "learning_rate": 1.9578084274543877e-05, + "loss": 0.0006, + "step": 11210 + }, + { + "epoch": 2.437011294526499, + "grad_norm": 0.0018159413011744618, + "learning_rate": 1.955093397046047e-05, + "loss": 0.0002, + "step": 11220 + }, + { + "epoch": 2.4391833188531713, + "grad_norm": 0.001797963515855372, + "learning_rate": 1.9523783666377067e-05, + "loss": 0.0005, + "step": 11230 + }, + { + "epoch": 2.4413553431798434, + "grad_norm": 0.0016879733884707093, + "learning_rate": 1.949663336229366e-05, + "loss": 0.0002, + "step": 11240 + }, + { + "epoch": 2.443527367506516, + "grad_norm": 0.0022847780492156744, + "learning_rate": 1.9469483058210253e-05, + "loss": 0.0244, + "step": 11250 + }, + { + "epoch": 2.4456993918331884, + "grad_norm": 0.001833285903558135, + "learning_rate": 1.9442332754126846e-05, + "loss": 0.019, + "step": 11260 + }, + { + "epoch": 2.447871416159861, + "grad_norm": 0.04164701700210571, + "learning_rate": 1.9415182450043442e-05, + "loss": 0.0115, + "step": 11270 + }, + { + "epoch": 2.4500434404865334, + "grad_norm": 0.001846460741944611, + "learning_rate": 1.9388032145960035e-05, + "loss": 0.0224, + "step": 11280 + }, + { + "epoch": 2.452215464813206, + "grad_norm": 0.012611893005669117, + "learning_rate": 1.936088184187663e-05, + "loss": 0.0024, + "step": 11290 + }, + { + "epoch": 2.4543874891398785, + "grad_norm": 0.0033035138621926308, + "learning_rate": 1.9333731537793225e-05, + "loss": 0.0013, + "step": 11300 + }, + { + "epoch": 2.456559513466551, + "grad_norm": 0.0016401956090703607, + "learning_rate": 1.930658123370982e-05, + "loss": 0.0008, + "step": 11310 + }, + { + "epoch": 2.4587315377932235, + "grad_norm": 0.046950288116931915, + "learning_rate": 1.9279430929626414e-05, + "loss": 0.0022, + "step": 11320 + }, + { + "epoch": 2.4609035621198956, + "grad_norm": 0.004596000071614981, + "learning_rate": 1.9252280625543007e-05, + "loss": 0.0018, + "step": 11330 + }, + { + "epoch": 2.463075586446568, + "grad_norm": 0.0016806930070742965, + "learning_rate": 1.92251303214596e-05, + "loss": 0.0022, + "step": 11340 + }, + { + "epoch": 2.4652476107732406, + "grad_norm": 0.001617814414203167, + "learning_rate": 1.9197980017376197e-05, + "loss": 0.0022, + "step": 11350 + }, + { + "epoch": 2.467419635099913, + "grad_norm": 0.004318607039749622, + "learning_rate": 1.917082971329279e-05, + "loss": 0.0017, + "step": 11360 + }, + { + "epoch": 2.4695916594265857, + "grad_norm": 0.0050076707266271114, + "learning_rate": 1.9143679409209383e-05, + "loss": 0.0012, + "step": 11370 + }, + { + "epoch": 2.471763683753258, + "grad_norm": 0.21344320476055145, + "learning_rate": 1.9116529105125976e-05, + "loss": 0.0012, + "step": 11380 + }, + { + "epoch": 2.4739357080799307, + "grad_norm": 0.0015156455338001251, + "learning_rate": 1.9089378801042572e-05, + "loss": 0.0006, + "step": 11390 + }, + { + "epoch": 2.4761077324066028, + "grad_norm": 0.02697795256972313, + "learning_rate": 1.906222849695917e-05, + "loss": 0.0007, + "step": 11400 + }, + { + "epoch": 2.4782797567332753, + "grad_norm": 0.00153002655133605, + "learning_rate": 1.903507819287576e-05, + "loss": 0.0007, + "step": 11410 + }, + { + "epoch": 2.480451781059948, + "grad_norm": 0.0015049789799377322, + "learning_rate": 1.9007927888792358e-05, + "loss": 0.0015, + "step": 11420 + }, + { + "epoch": 2.4826238053866203, + "grad_norm": 0.024290762841701508, + "learning_rate": 1.898077758470895e-05, + "loss": 0.002, + "step": 11430 + }, + { + "epoch": 2.484795829713293, + "grad_norm": 0.0015076440759003162, + "learning_rate": 1.8953627280625544e-05, + "loss": 0.0003, + "step": 11440 + }, + { + "epoch": 2.4869678540399653, + "grad_norm": 0.0014670953387394547, + "learning_rate": 1.8926476976542137e-05, + "loss": 0.0365, + "step": 11450 + }, + { + "epoch": 2.489139878366638, + "grad_norm": 0.0019197690999135375, + "learning_rate": 1.8899326672458733e-05, + "loss": 0.0003, + "step": 11460 + }, + { + "epoch": 2.49131190269331, + "grad_norm": 0.014209013432264328, + "learning_rate": 1.8872176368375326e-05, + "loss": 0.0036, + "step": 11470 + }, + { + "epoch": 2.4934839270199824, + "grad_norm": 0.0019741549622267485, + "learning_rate": 1.884502606429192e-05, + "loss": 0.014, + "step": 11480 + }, + { + "epoch": 2.495655951346655, + "grad_norm": 0.0015247397823259234, + "learning_rate": 1.8817875760208516e-05, + "loss": 0.0006, + "step": 11490 + }, + { + "epoch": 2.4978279756733275, + "grad_norm": 0.0014463032130151987, + "learning_rate": 1.8790725456125112e-05, + "loss": 0.0004, + "step": 11500 + }, + { + "epoch": 2.5, + "grad_norm": 7.1138739585876465, + "learning_rate": 1.8763575152041705e-05, + "loss": 0.0264, + "step": 11510 + }, + { + "epoch": 2.5021720243266725, + "grad_norm": 0.009552753530442715, + "learning_rate": 1.8736424847958298e-05, + "loss": 0.0139, + "step": 11520 + }, + { + "epoch": 2.504344048653345, + "grad_norm": 0.014069234021008015, + "learning_rate": 1.870927454387489e-05, + "loss": 0.0005, + "step": 11530 + }, + { + "epoch": 2.506516072980017, + "grad_norm": 0.23029930889606476, + "learning_rate": 1.8682124239791488e-05, + "loss": 0.001, + "step": 11540 + }, + { + "epoch": 2.50868809730669, + "grad_norm": 0.006573742721229792, + "learning_rate": 1.865497393570808e-05, + "loss": 0.0013, + "step": 11550 + }, + { + "epoch": 2.510860121633362, + "grad_norm": 0.0014153916854411364, + "learning_rate": 1.8627823631624674e-05, + "loss": 0.0011, + "step": 11560 + }, + { + "epoch": 2.5130321459600347, + "grad_norm": 0.0014535776572301984, + "learning_rate": 1.8600673327541267e-05, + "loss": 0.0344, + "step": 11570 + }, + { + "epoch": 2.515204170286707, + "grad_norm": 0.002279409673064947, + "learning_rate": 1.8573523023457863e-05, + "loss": 0.008, + "step": 11580 + }, + { + "epoch": 2.5173761946133797, + "grad_norm": 0.01143862958997488, + "learning_rate": 1.854637271937446e-05, + "loss": 0.004, + "step": 11590 + }, + { + "epoch": 2.519548218940052, + "grad_norm": 0.0021647445391863585, + "learning_rate": 1.8519222415291053e-05, + "loss": 0.0281, + "step": 11600 + }, + { + "epoch": 2.5217202432667247, + "grad_norm": 0.011500898748636246, + "learning_rate": 1.8492072111207646e-05, + "loss": 0.051, + "step": 11610 + }, + { + "epoch": 2.5238922675933972, + "grad_norm": 0.07578958570957184, + "learning_rate": 1.8464921807124242e-05, + "loss": 0.0236, + "step": 11620 + }, + { + "epoch": 2.5260642919200693, + "grad_norm": 0.002254678402096033, + "learning_rate": 1.8437771503040835e-05, + "loss": 0.0007, + "step": 11630 + }, + { + "epoch": 2.528236316246742, + "grad_norm": 0.009960656054317951, + "learning_rate": 1.8410621198957428e-05, + "loss": 0.0019, + "step": 11640 + }, + { + "epoch": 2.5304083405734143, + "grad_norm": 4.388426303863525, + "learning_rate": 1.8383470894874024e-05, + "loss": 0.0246, + "step": 11650 + }, + { + "epoch": 2.532580364900087, + "grad_norm": 0.002279053907841444, + "learning_rate": 1.8356320590790617e-05, + "loss": 0.0014, + "step": 11660 + }, + { + "epoch": 2.5347523892267594, + "grad_norm": 0.009489334188401699, + "learning_rate": 1.832917028670721e-05, + "loss": 0.0121, + "step": 11670 + }, + { + "epoch": 2.536924413553432, + "grad_norm": 0.0027339665684849024, + "learning_rate": 1.8302019982623807e-05, + "loss": 0.0015, + "step": 11680 + }, + { + "epoch": 2.5390964378801044, + "grad_norm": 0.003309717169031501, + "learning_rate": 1.8274869678540403e-05, + "loss": 0.0003, + "step": 11690 + }, + { + "epoch": 2.5412684622067765, + "grad_norm": 0.0023564095608890057, + "learning_rate": 1.8247719374456996e-05, + "loss": 0.0009, + "step": 11700 + }, + { + "epoch": 2.5434404865334495, + "grad_norm": 0.0017603716114535928, + "learning_rate": 1.822056907037359e-05, + "loss": 0.0005, + "step": 11710 + }, + { + "epoch": 2.5456125108601215, + "grad_norm": 0.0017267238581553102, + "learning_rate": 1.8193418766290182e-05, + "loss": 0.0005, + "step": 11720 + }, + { + "epoch": 2.547784535186794, + "grad_norm": 0.0017208755016326904, + "learning_rate": 1.816626846220678e-05, + "loss": 0.0004, + "step": 11730 + }, + { + "epoch": 2.5499565595134666, + "grad_norm": 0.0018419224070385098, + "learning_rate": 1.8139118158123372e-05, + "loss": 0.0004, + "step": 11740 + }, + { + "epoch": 2.552128583840139, + "grad_norm": 0.003102607326582074, + "learning_rate": 1.8111967854039965e-05, + "loss": 0.0292, + "step": 11750 + }, + { + "epoch": 2.5543006081668116, + "grad_norm": 0.0016968741547316313, + "learning_rate": 1.8084817549956558e-05, + "loss": 0.0006, + "step": 11760 + }, + { + "epoch": 2.5564726324934837, + "grad_norm": 0.016231173649430275, + "learning_rate": 1.8057667245873154e-05, + "loss": 0.001, + "step": 11770 + }, + { + "epoch": 2.5586446568201566, + "grad_norm": 0.00396195612847805, + "learning_rate": 1.803051694178975e-05, + "loss": 0.0004, + "step": 11780 + }, + { + "epoch": 2.5608166811468287, + "grad_norm": 0.017558127641677856, + "learning_rate": 1.8003366637706344e-05, + "loss": 0.0363, + "step": 11790 + }, + { + "epoch": 2.562988705473501, + "grad_norm": 0.032345548272132874, + "learning_rate": 1.7976216333622937e-05, + "loss": 0.0136, + "step": 11800 + }, + { + "epoch": 2.5651607298001737, + "grad_norm": 0.040637094527482986, + "learning_rate": 1.7949066029539533e-05, + "loss": 0.0043, + "step": 11810 + }, + { + "epoch": 2.5673327541268463, + "grad_norm": 0.10168937593698502, + "learning_rate": 1.7921915725456126e-05, + "loss": 0.0263, + "step": 11820 + }, + { + "epoch": 2.5695047784535188, + "grad_norm": 0.0077156987972557545, + "learning_rate": 1.789476542137272e-05, + "loss": 0.0072, + "step": 11830 + }, + { + "epoch": 2.5716768027801913, + "grad_norm": 0.003105215495452285, + "learning_rate": 1.7867615117289315e-05, + "loss": 0.0009, + "step": 11840 + }, + { + "epoch": 2.573848827106864, + "grad_norm": 0.0035061310045421124, + "learning_rate": 1.784046481320591e-05, + "loss": 0.0005, + "step": 11850 + }, + { + "epoch": 2.576020851433536, + "grad_norm": 0.0113242631778121, + "learning_rate": 1.78133145091225e-05, + "loss": 0.0259, + "step": 11860 + }, + { + "epoch": 2.5781928757602084, + "grad_norm": 0.0015477265696972609, + "learning_rate": 1.7786164205039098e-05, + "loss": 0.0239, + "step": 11870 + }, + { + "epoch": 2.580364900086881, + "grad_norm": 0.004106955602765083, + "learning_rate": 1.7759013900955694e-05, + "loss": 0.001, + "step": 11880 + }, + { + "epoch": 2.5825369244135534, + "grad_norm": 0.0014083506539463997, + "learning_rate": 1.7731863596872287e-05, + "loss": 0.0003, + "step": 11890 + }, + { + "epoch": 2.584708948740226, + "grad_norm": 0.0013759738067165017, + "learning_rate": 1.770471329278888e-05, + "loss": 0.0057, + "step": 11900 + }, + { + "epoch": 2.5868809730668985, + "grad_norm": 0.0014395932666957378, + "learning_rate": 1.7677562988705473e-05, + "loss": 0.0244, + "step": 11910 + }, + { + "epoch": 2.589052997393571, + "grad_norm": 0.018697045743465424, + "learning_rate": 1.765041268462207e-05, + "loss": 0.0114, + "step": 11920 + }, + { + "epoch": 2.591225021720243, + "grad_norm": 0.008916974999010563, + "learning_rate": 1.7623262380538663e-05, + "loss": 0.0015, + "step": 11930 + }, + { + "epoch": 2.5933970460469156, + "grad_norm": 0.06151333078742027, + "learning_rate": 1.7596112076455256e-05, + "loss": 0.0012, + "step": 11940 + }, + { + "epoch": 2.595569070373588, + "grad_norm": 0.011603694409132004, + "learning_rate": 1.756896177237185e-05, + "loss": 0.0009, + "step": 11950 + }, + { + "epoch": 2.5977410947002606, + "grad_norm": 0.0036000856198370457, + "learning_rate": 1.7541811468288445e-05, + "loss": 0.0002, + "step": 11960 + }, + { + "epoch": 2.599913119026933, + "grad_norm": 0.008364981971681118, + "learning_rate": 1.751466116420504e-05, + "loss": 0.0063, + "step": 11970 + }, + { + "epoch": 2.6020851433536056, + "grad_norm": 0.0018292396562173963, + "learning_rate": 1.7487510860121635e-05, + "loss": 0.0014, + "step": 11980 + }, + { + "epoch": 2.604257167680278, + "grad_norm": 0.0016355229308828712, + "learning_rate": 1.7460360556038228e-05, + "loss": 0.0004, + "step": 11990 + }, + { + "epoch": 2.6064291920069502, + "grad_norm": 0.035449955612421036, + "learning_rate": 1.7433210251954824e-05, + "loss": 0.0015, + "step": 12000 + }, + { + "epoch": 2.608601216333623, + "grad_norm": 0.0014802763471379876, + "learning_rate": 1.7406059947871417e-05, + "loss": 0.0003, + "step": 12010 + }, + { + "epoch": 2.6107732406602953, + "grad_norm": 0.0014913080958649516, + "learning_rate": 1.737890964378801e-05, + "loss": 0.0003, + "step": 12020 + }, + { + "epoch": 2.6129452649869678, + "grad_norm": 0.001743799657560885, + "learning_rate": 1.7351759339704607e-05, + "loss": 0.0006, + "step": 12030 + }, + { + "epoch": 2.6151172893136403, + "grad_norm": 0.001439134357497096, + "learning_rate": 1.73246090356212e-05, + "loss": 0.0002, + "step": 12040 + }, + { + "epoch": 2.617289313640313, + "grad_norm": 0.0013170883757993579, + "learning_rate": 1.7297458731537793e-05, + "loss": 0.0006, + "step": 12050 + }, + { + "epoch": 2.6194613379669853, + "grad_norm": 0.0016408158699050546, + "learning_rate": 1.727030842745439e-05, + "loss": 0.0004, + "step": 12060 + }, + { + "epoch": 2.621633362293658, + "grad_norm": 0.006257723551243544, + "learning_rate": 1.7243158123370985e-05, + "loss": 0.0002, + "step": 12070 + }, + { + "epoch": 2.6238053866203304, + "grad_norm": 0.0013448239769786596, + "learning_rate": 1.721600781928758e-05, + "loss": 0.0003, + "step": 12080 + }, + { + "epoch": 2.6259774109470024, + "grad_norm": 0.0014338934561237693, + "learning_rate": 1.718885751520417e-05, + "loss": 0.0002, + "step": 12090 + }, + { + "epoch": 2.628149435273675, + "grad_norm": 0.0013018847675994039, + "learning_rate": 1.7161707211120764e-05, + "loss": 0.0002, + "step": 12100 + }, + { + "epoch": 2.6303214596003475, + "grad_norm": 0.0013456381857395172, + "learning_rate": 1.713455690703736e-05, + "loss": 0.0003, + "step": 12110 + }, + { + "epoch": 2.63249348392702, + "grad_norm": 0.0014625930925831199, + "learning_rate": 1.7107406602953954e-05, + "loss": 0.0076, + "step": 12120 + }, + { + "epoch": 2.6346655082536925, + "grad_norm": 0.00126743467990309, + "learning_rate": 1.7080256298870547e-05, + "loss": 0.0005, + "step": 12130 + }, + { + "epoch": 2.636837532580365, + "grad_norm": 0.0012669408461079001, + "learning_rate": 1.705310599478714e-05, + "loss": 0.0004, + "step": 12140 + }, + { + "epoch": 2.6390095569070375, + "grad_norm": 0.001318922615610063, + "learning_rate": 1.7025955690703736e-05, + "loss": 0.0188, + "step": 12150 + }, + { + "epoch": 2.6411815812337096, + "grad_norm": 0.09571494907140732, + "learning_rate": 1.6998805386620333e-05, + "loss": 0.0244, + "step": 12160 + }, + { + "epoch": 2.643353605560382, + "grad_norm": 0.024946704506874084, + "learning_rate": 1.6971655082536926e-05, + "loss": 0.001, + "step": 12170 + }, + { + "epoch": 2.6455256298870546, + "grad_norm": 0.001351698418147862, + "learning_rate": 1.694450477845352e-05, + "loss": 0.0003, + "step": 12180 + }, + { + "epoch": 2.647697654213727, + "grad_norm": 6.923243522644043, + "learning_rate": 1.6917354474370115e-05, + "loss": 0.0254, + "step": 12190 + }, + { + "epoch": 2.6498696785403997, + "grad_norm": 0.0013446049997583032, + "learning_rate": 1.6890204170286708e-05, + "loss": 0.0013, + "step": 12200 + }, + { + "epoch": 2.652041702867072, + "grad_norm": 0.0017462641699239612, + "learning_rate": 1.68630538662033e-05, + "loss": 0.0004, + "step": 12210 + }, + { + "epoch": 2.6542137271937447, + "grad_norm": 0.0650092363357544, + "learning_rate": 1.6835903562119894e-05, + "loss": 0.0002, + "step": 12220 + }, + { + "epoch": 2.656385751520417, + "grad_norm": 0.0013285756576806307, + "learning_rate": 1.680875325803649e-05, + "loss": 0.0135, + "step": 12230 + }, + { + "epoch": 2.6585577758470897, + "grad_norm": 0.001253599999472499, + "learning_rate": 1.6781602953953084e-05, + "loss": 0.0022, + "step": 12240 + }, + { + "epoch": 2.660729800173762, + "grad_norm": 0.001297818380407989, + "learning_rate": 1.675445264986968e-05, + "loss": 0.0004, + "step": 12250 + }, + { + "epoch": 2.6629018245004343, + "grad_norm": 2.883939266204834, + "learning_rate": 1.6727302345786276e-05, + "loss": 0.0135, + "step": 12260 + }, + { + "epoch": 2.665073848827107, + "grad_norm": 0.001262718578800559, + "learning_rate": 1.670015204170287e-05, + "loss": 0.0003, + "step": 12270 + }, + { + "epoch": 2.6672458731537794, + "grad_norm": 0.0012341596884652972, + "learning_rate": 1.6673001737619462e-05, + "loss": 0.0024, + "step": 12280 + }, + { + "epoch": 2.669417897480452, + "grad_norm": 0.0012327926233410835, + "learning_rate": 1.6645851433536055e-05, + "loss": 0.039, + "step": 12290 + }, + { + "epoch": 2.6715899218071244, + "grad_norm": 0.0012394741643220186, + "learning_rate": 1.6618701129452652e-05, + "loss": 0.0004, + "step": 12300 + }, + { + "epoch": 2.673761946133797, + "grad_norm": 5.620744705200195, + "learning_rate": 1.6591550825369245e-05, + "loss": 0.0317, + "step": 12310 + }, + { + "epoch": 2.675933970460469, + "grad_norm": 1.9395980834960938, + "learning_rate": 1.6564400521285838e-05, + "loss": 0.0132, + "step": 12320 + }, + { + "epoch": 2.6781059947871415, + "grad_norm": 0.07889677584171295, + "learning_rate": 1.653725021720243e-05, + "loss": 0.0091, + "step": 12330 + }, + { + "epoch": 2.680278019113814, + "grad_norm": 0.20382428169250488, + "learning_rate": 1.6510099913119027e-05, + "loss": 0.0017, + "step": 12340 + }, + { + "epoch": 2.6824500434404865, + "grad_norm": 0.03647405281662941, + "learning_rate": 1.6482949609035624e-05, + "loss": 0.0042, + "step": 12350 + }, + { + "epoch": 2.684622067767159, + "grad_norm": 0.002080442849546671, + "learning_rate": 1.6455799304952217e-05, + "loss": 0.0023, + "step": 12360 + }, + { + "epoch": 2.6867940920938316, + "grad_norm": 0.001212194561958313, + "learning_rate": 1.642864900086881e-05, + "loss": 0.0004, + "step": 12370 + }, + { + "epoch": 2.688966116420504, + "grad_norm": 0.0034903050400316715, + "learning_rate": 1.6401498696785406e-05, + "loss": 0.0497, + "step": 12380 + }, + { + "epoch": 2.691138140747176, + "grad_norm": 0.004165531136095524, + "learning_rate": 1.6374348392702e-05, + "loss": 0.0005, + "step": 12390 + }, + { + "epoch": 2.6933101650738487, + "grad_norm": 0.003915696870535612, + "learning_rate": 1.6347198088618592e-05, + "loss": 0.0005, + "step": 12400 + }, + { + "epoch": 2.695482189400521, + "grad_norm": 0.07809191197156906, + "learning_rate": 1.6320047784535185e-05, + "loss": 0.0027, + "step": 12410 + }, + { + "epoch": 2.6976542137271937, + "grad_norm": 0.007462525740265846, + "learning_rate": 1.629289748045178e-05, + "loss": 0.0018, + "step": 12420 + }, + { + "epoch": 2.6998262380538662, + "grad_norm": 0.02216045930981636, + "learning_rate": 1.6265747176368375e-05, + "loss": 0.0028, + "step": 12430 + }, + { + "epoch": 2.7019982623805388, + "grad_norm": 0.02276870794594288, + "learning_rate": 1.623859687228497e-05, + "loss": 0.0003, + "step": 12440 + }, + { + "epoch": 2.7041702867072113, + "grad_norm": 0.002578067360445857, + "learning_rate": 1.6211446568201567e-05, + "loss": 0.0007, + "step": 12450 + }, + { + "epoch": 2.7063423110338833, + "grad_norm": 0.001441303757019341, + "learning_rate": 1.618429626411816e-05, + "loss": 0.0186, + "step": 12460 + }, + { + "epoch": 2.7085143353605563, + "grad_norm": 0.0013689674669876695, + "learning_rate": 1.6157145960034753e-05, + "loss": 0.0005, + "step": 12470 + }, + { + "epoch": 2.7106863596872284, + "grad_norm": 0.0017949125031009316, + "learning_rate": 1.6129995655951346e-05, + "loss": 0.0002, + "step": 12480 + }, + { + "epoch": 2.712858384013901, + "grad_norm": 0.0020256515126675367, + "learning_rate": 1.6102845351867943e-05, + "loss": 0.017, + "step": 12490 + }, + { + "epoch": 2.7150304083405734, + "grad_norm": 0.0013759853318333626, + "learning_rate": 1.6075695047784536e-05, + "loss": 0.032, + "step": 12500 + }, + { + "epoch": 2.717202432667246, + "grad_norm": 0.0013345404295250773, + "learning_rate": 1.604854474370113e-05, + "loss": 0.0009, + "step": 12510 + }, + { + "epoch": 2.7193744569939184, + "grad_norm": 0.0013303733430802822, + "learning_rate": 1.6021394439617722e-05, + "loss": 0.0008, + "step": 12520 + }, + { + "epoch": 2.7215464813205905, + "grad_norm": 0.0013412077678367496, + "learning_rate": 1.599424413553432e-05, + "loss": 0.0005, + "step": 12530 + }, + { + "epoch": 2.7237185056472635, + "grad_norm": 0.0020882273092865944, + "learning_rate": 1.5967093831450915e-05, + "loss": 0.0004, + "step": 12540 + }, + { + "epoch": 2.7258905299739355, + "grad_norm": 0.27230900526046753, + "learning_rate": 1.5939943527367508e-05, + "loss": 0.0013, + "step": 12550 + }, + { + "epoch": 2.728062554300608, + "grad_norm": 0.0014832447050139308, + "learning_rate": 1.59127932232841e-05, + "loss": 0.0011, + "step": 12560 + }, + { + "epoch": 2.7302345786272806, + "grad_norm": 0.0016118614003062248, + "learning_rate": 1.5885642919200697e-05, + "loss": 0.0008, + "step": 12570 + }, + { + "epoch": 2.732406602953953, + "grad_norm": 0.02013046108186245, + "learning_rate": 1.585849261511729e-05, + "loss": 0.0127, + "step": 12580 + }, + { + "epoch": 2.7345786272806256, + "grad_norm": 0.0024779075756669044, + "learning_rate": 1.5831342311033883e-05, + "loss": 0.0002, + "step": 12590 + }, + { + "epoch": 2.736750651607298, + "grad_norm": 0.005045650061219931, + "learning_rate": 1.5804192006950476e-05, + "loss": 0.0002, + "step": 12600 + }, + { + "epoch": 2.7389226759339707, + "grad_norm": 0.007579619996249676, + "learning_rate": 1.5777041702867073e-05, + "loss": 0.0003, + "step": 12610 + }, + { + "epoch": 2.7410947002606427, + "grad_norm": 0.0014458505902439356, + "learning_rate": 1.5749891398783666e-05, + "loss": 0.0001, + "step": 12620 + }, + { + "epoch": 2.7432667245873152, + "grad_norm": 0.0013349404325708747, + "learning_rate": 1.5722741094700262e-05, + "loss": 0.0001, + "step": 12630 + }, + { + "epoch": 2.7454387489139878, + "grad_norm": 0.002168971812352538, + "learning_rate": 1.569559079061686e-05, + "loss": 0.0421, + "step": 12640 + }, + { + "epoch": 2.7476107732406603, + "grad_norm": 0.01694057695567608, + "learning_rate": 1.566844048653345e-05, + "loss": 0.0023, + "step": 12650 + }, + { + "epoch": 2.749782797567333, + "grad_norm": 0.004285480361431837, + "learning_rate": 1.5641290182450045e-05, + "loss": 0.0023, + "step": 12660 + }, + { + "epoch": 2.7519548218940053, + "grad_norm": 0.00815389771014452, + "learning_rate": 1.5614139878366638e-05, + "loss": 0.0007, + "step": 12670 + }, + { + "epoch": 2.754126846220678, + "grad_norm": 0.012979789637029171, + "learning_rate": 1.5586989574283234e-05, + "loss": 0.004, + "step": 12680 + }, + { + "epoch": 2.75629887054735, + "grad_norm": 0.009059431962668896, + "learning_rate": 1.5559839270199827e-05, + "loss": 0.0068, + "step": 12690 + }, + { + "epoch": 2.758470894874023, + "grad_norm": 0.007445912342518568, + "learning_rate": 1.553268896611642e-05, + "loss": 0.0046, + "step": 12700 + }, + { + "epoch": 2.760642919200695, + "grad_norm": 0.0011629179352894425, + "learning_rate": 1.5505538662033013e-05, + "loss": 0.0298, + "step": 12710 + }, + { + "epoch": 2.7628149435273675, + "grad_norm": 0.001427669427357614, + "learning_rate": 1.547838835794961e-05, + "loss": 0.0275, + "step": 12720 + }, + { + "epoch": 2.76498696785404, + "grad_norm": 0.022871850058436394, + "learning_rate": 1.5451238053866206e-05, + "loss": 0.0004, + "step": 12730 + }, + { + "epoch": 2.7671589921807125, + "grad_norm": 0.003048468613997102, + "learning_rate": 1.54240877497828e-05, + "loss": 0.0006, + "step": 12740 + }, + { + "epoch": 2.769331016507385, + "grad_norm": 0.0029503628611564636, + "learning_rate": 1.5396937445699392e-05, + "loss": 0.0006, + "step": 12750 + }, + { + "epoch": 2.771503040834057, + "grad_norm": 2.5482583045959473, + "learning_rate": 1.5369787141615988e-05, + "loss": 0.0159, + "step": 12760 + }, + { + "epoch": 2.77367506516073, + "grad_norm": 0.003224568674340844, + "learning_rate": 1.534263683753258e-05, + "loss": 0.0105, + "step": 12770 + }, + { + "epoch": 2.775847089487402, + "grad_norm": 0.03141826391220093, + "learning_rate": 1.5315486533449174e-05, + "loss": 0.0021, + "step": 12780 + }, + { + "epoch": 2.7780191138140746, + "grad_norm": 0.010639526881277561, + "learning_rate": 1.5288336229365767e-05, + "loss": 0.0133, + "step": 12790 + }, + { + "epoch": 2.780191138140747, + "grad_norm": 0.001194779179058969, + "learning_rate": 1.5261185925282364e-05, + "loss": 0.0143, + "step": 12800 + }, + { + "epoch": 2.7823631624674197, + "grad_norm": 0.0038152916822582483, + "learning_rate": 1.5234035621198958e-05, + "loss": 0.0112, + "step": 12810 + }, + { + "epoch": 2.784535186794092, + "grad_norm": 0.001374902785755694, + "learning_rate": 1.5206885317115551e-05, + "loss": 0.0003, + "step": 12820 + }, + { + "epoch": 2.7867072111207647, + "grad_norm": 0.004674356430768967, + "learning_rate": 1.5179735013032148e-05, + "loss": 0.0007, + "step": 12830 + }, + { + "epoch": 2.788879235447437, + "grad_norm": 0.011177301406860352, + "learning_rate": 1.5152584708948741e-05, + "loss": 0.0006, + "step": 12840 + }, + { + "epoch": 2.7910512597741093, + "grad_norm": 0.006574731785804033, + "learning_rate": 1.5125434404865336e-05, + "loss": 0.0002, + "step": 12850 + }, + { + "epoch": 2.793223284100782, + "grad_norm": 0.0032284893095493317, + "learning_rate": 1.5098284100781929e-05, + "loss": 0.0045, + "step": 12860 + }, + { + "epoch": 2.7953953084274543, + "grad_norm": 0.00666830176487565, + "learning_rate": 1.5071133796698525e-05, + "loss": 0.0006, + "step": 12870 + }, + { + "epoch": 2.797567332754127, + "grad_norm": 0.006095957476645708, + "learning_rate": 1.5043983492615118e-05, + "loss": 0.0139, + "step": 12880 + }, + { + "epoch": 2.7997393570807994, + "grad_norm": 0.0036753893364220858, + "learning_rate": 1.5016833188531713e-05, + "loss": 0.006, + "step": 12890 + }, + { + "epoch": 2.801911381407472, + "grad_norm": 0.0015879254788160324, + "learning_rate": 1.4989682884448306e-05, + "loss": 0.0004, + "step": 12900 + }, + { + "epoch": 2.8040834057341444, + "grad_norm": 0.004430914297699928, + "learning_rate": 1.4962532580364902e-05, + "loss": 0.0003, + "step": 12910 + }, + { + "epoch": 2.8062554300608165, + "grad_norm": 0.001315233763307333, + "learning_rate": 1.4935382276281495e-05, + "loss": 0.0002, + "step": 12920 + }, + { + "epoch": 2.808427454387489, + "grad_norm": 0.23426829278469086, + "learning_rate": 1.4908231972198088e-05, + "loss": 0.0159, + "step": 12930 + }, + { + "epoch": 2.8105994787141615, + "grad_norm": 0.001154038356617093, + "learning_rate": 1.4881081668114683e-05, + "loss": 0.0001, + "step": 12940 + }, + { + "epoch": 2.812771503040834, + "grad_norm": 0.0013015108415856957, + "learning_rate": 1.485393136403128e-05, + "loss": 0.0004, + "step": 12950 + }, + { + "epoch": 2.8149435273675065, + "grad_norm": 0.0011772330617532134, + "learning_rate": 1.4826781059947872e-05, + "loss": 0.0003, + "step": 12960 + }, + { + "epoch": 2.817115551694179, + "grad_norm": 0.0015416668029502034, + "learning_rate": 1.4799630755864465e-05, + "loss": 0.0048, + "step": 12970 + }, + { + "epoch": 2.8192875760208516, + "grad_norm": 0.07705456763505936, + "learning_rate": 1.477248045178106e-05, + "loss": 0.0054, + "step": 12980 + }, + { + "epoch": 2.8214596003475236, + "grad_norm": 4.77557897567749, + "learning_rate": 1.4745330147697656e-05, + "loss": 0.0108, + "step": 12990 + }, + { + "epoch": 2.8236316246741966, + "grad_norm": 0.0011944122379645705, + "learning_rate": 1.471817984361425e-05, + "loss": 0.0004, + "step": 13000 + }, + { + "epoch": 2.8258036490008687, + "grad_norm": 0.004810866434127092, + "learning_rate": 1.4691029539530842e-05, + "loss": 0.0006, + "step": 13010 + }, + { + "epoch": 2.827975673327541, + "grad_norm": 0.0011162413284182549, + "learning_rate": 1.4663879235447437e-05, + "loss": 0.0005, + "step": 13020 + }, + { + "epoch": 2.8301476976542137, + "grad_norm": 0.0012644167291000485, + "learning_rate": 1.4636728931364032e-05, + "loss": 0.0001, + "step": 13030 + }, + { + "epoch": 2.832319721980886, + "grad_norm": 0.050831083208322525, + "learning_rate": 1.4609578627280627e-05, + "loss": 0.0133, + "step": 13040 + }, + { + "epoch": 2.8344917463075587, + "grad_norm": 0.0011091434862464666, + "learning_rate": 1.458242832319722e-05, + "loss": 0.0005, + "step": 13050 + }, + { + "epoch": 2.8366637706342313, + "grad_norm": 0.0010839985916391015, + "learning_rate": 1.4555278019113816e-05, + "loss": 0.003, + "step": 13060 + }, + { + "epoch": 2.8388357949609038, + "grad_norm": 0.0010779522126540542, + "learning_rate": 1.4528127715030409e-05, + "loss": 0.0284, + "step": 13070 + }, + { + "epoch": 2.841007819287576, + "grad_norm": 0.0010752989910542965, + "learning_rate": 1.4500977410947004e-05, + "loss": 0.0001, + "step": 13080 + }, + { + "epoch": 2.8431798436142484, + "grad_norm": 0.0011290594702586532, + "learning_rate": 1.4473827106863597e-05, + "loss": 0.0001, + "step": 13090 + }, + { + "epoch": 2.845351867940921, + "grad_norm": 0.001103981863707304, + "learning_rate": 1.4446676802780193e-05, + "loss": 0.0002, + "step": 13100 + }, + { + "epoch": 2.8475238922675934, + "grad_norm": 0.009349314495921135, + "learning_rate": 1.4419526498696786e-05, + "loss": 0.0002, + "step": 13110 + }, + { + "epoch": 2.849695916594266, + "grad_norm": 0.0011064874706789851, + "learning_rate": 1.439237619461338e-05, + "loss": 0.0273, + "step": 13120 + }, + { + "epoch": 2.8518679409209384, + "grad_norm": 0.001436484744772315, + "learning_rate": 1.4365225890529974e-05, + "loss": 0.0067, + "step": 13130 + }, + { + "epoch": 2.854039965247611, + "grad_norm": 0.001617782050743699, + "learning_rate": 1.433807558644657e-05, + "loss": 0.0096, + "step": 13140 + }, + { + "epoch": 2.856211989574283, + "grad_norm": 0.0010622652480378747, + "learning_rate": 1.4310925282363163e-05, + "loss": 0.0012, + "step": 13150 + }, + { + "epoch": 2.8583840139009555, + "grad_norm": 0.001210720743983984, + "learning_rate": 1.4283774978279756e-05, + "loss": 0.0003, + "step": 13160 + }, + { + "epoch": 2.860556038227628, + "grad_norm": 0.001125740702264011, + "learning_rate": 1.4256624674196351e-05, + "loss": 0.0006, + "step": 13170 + }, + { + "epoch": 2.8627280625543006, + "grad_norm": 2.121816396713257, + "learning_rate": 1.4229474370112947e-05, + "loss": 0.0072, + "step": 13180 + }, + { + "epoch": 2.864900086880973, + "grad_norm": 0.001073643914423883, + "learning_rate": 1.420232406602954e-05, + "loss": 0.0774, + "step": 13190 + }, + { + "epoch": 2.8670721112076456, + "grad_norm": 0.001115081482566893, + "learning_rate": 1.4175173761946134e-05, + "loss": 0.0003, + "step": 13200 + }, + { + "epoch": 2.869244135534318, + "grad_norm": 1.6094557046890259, + "learning_rate": 1.4148023457862728e-05, + "loss": 0.0354, + "step": 13210 + }, + { + "epoch": 2.87141615986099, + "grad_norm": 0.0011969703482463956, + "learning_rate": 1.4120873153779323e-05, + "loss": 0.001, + "step": 13220 + }, + { + "epoch": 2.873588184187663, + "grad_norm": 0.0011109106708317995, + "learning_rate": 1.4093722849695918e-05, + "loss": 0.0088, + "step": 13230 + }, + { + "epoch": 2.8757602085143352, + "grad_norm": 0.01138853095471859, + "learning_rate": 1.406657254561251e-05, + "loss": 0.0008, + "step": 13240 + }, + { + "epoch": 2.8779322328410077, + "grad_norm": 0.001634183689020574, + "learning_rate": 1.4039422241529107e-05, + "loss": 0.005, + "step": 13250 + }, + { + "epoch": 2.8801042571676803, + "grad_norm": 0.003161991247907281, + "learning_rate": 1.40122719374457e-05, + "loss": 0.0153, + "step": 13260 + }, + { + "epoch": 2.8822762814943528, + "grad_norm": 0.001149641931988299, + "learning_rate": 1.3985121633362295e-05, + "loss": 0.0003, + "step": 13270 + }, + { + "epoch": 2.8844483058210253, + "grad_norm": 0.001772301853634417, + "learning_rate": 1.3957971329278888e-05, + "loss": 0.065, + "step": 13280 + }, + { + "epoch": 2.886620330147698, + "grad_norm": 0.004322749096900225, + "learning_rate": 1.3930821025195484e-05, + "loss": 0.0025, + "step": 13290 + }, + { + "epoch": 2.8887923544743703, + "grad_norm": 0.009627276100218296, + "learning_rate": 1.3903670721112077e-05, + "loss": 0.0009, + "step": 13300 + }, + { + "epoch": 2.8909643788010424, + "grad_norm": 0.0011137340916320682, + "learning_rate": 1.3876520417028672e-05, + "loss": 0.0003, + "step": 13310 + }, + { + "epoch": 2.893136403127715, + "grad_norm": 5.180811405181885, + "learning_rate": 1.3849370112945265e-05, + "loss": 0.0053, + "step": 13320 + }, + { + "epoch": 2.8953084274543874, + "grad_norm": 0.0012574810534715652, + "learning_rate": 1.3822219808861861e-05, + "loss": 0.0007, + "step": 13330 + }, + { + "epoch": 2.89748045178106, + "grad_norm": 0.11162279546260834, + "learning_rate": 1.3795069504778454e-05, + "loss": 0.0017, + "step": 13340 + }, + { + "epoch": 2.8996524761077325, + "grad_norm": 0.0011239303275942802, + "learning_rate": 1.3767919200695047e-05, + "loss": 0.0002, + "step": 13350 + }, + { + "epoch": 2.901824500434405, + "grad_norm": 0.0021542648319154978, + "learning_rate": 1.3740768896611642e-05, + "loss": 0.0287, + "step": 13360 + }, + { + "epoch": 2.9039965247610775, + "grad_norm": 0.0015336337964981794, + "learning_rate": 1.3713618592528239e-05, + "loss": 0.0005, + "step": 13370 + }, + { + "epoch": 2.9061685490877496, + "grad_norm": 0.28698936104774475, + "learning_rate": 1.3686468288444832e-05, + "loss": 0.036, + "step": 13380 + }, + { + "epoch": 2.908340573414422, + "grad_norm": 0.0016295438399538398, + "learning_rate": 1.3659317984361425e-05, + "loss": 0.0305, + "step": 13390 + }, + { + "epoch": 2.9105125977410946, + "grad_norm": 0.0020589695777744055, + "learning_rate": 1.363216768027802e-05, + "loss": 0.0005, + "step": 13400 + }, + { + "epoch": 2.912684622067767, + "grad_norm": 0.0016789559740573168, + "learning_rate": 1.3605017376194614e-05, + "loss": 0.0033, + "step": 13410 + }, + { + "epoch": 2.9148566463944396, + "grad_norm": 0.10769294947385788, + "learning_rate": 1.3577867072111209e-05, + "loss": 0.0008, + "step": 13420 + }, + { + "epoch": 2.917028670721112, + "grad_norm": 0.005054687615483999, + "learning_rate": 1.3550716768027802e-05, + "loss": 0.0012, + "step": 13430 + }, + { + "epoch": 2.9192006950477847, + "grad_norm": 1.0613359212875366, + "learning_rate": 1.3523566463944398e-05, + "loss": 0.0084, + "step": 13440 + }, + { + "epoch": 2.9213727193744568, + "grad_norm": 0.0010838696034625173, + "learning_rate": 1.3496416159860991e-05, + "loss": 0.0002, + "step": 13450 + }, + { + "epoch": 2.9235447437011297, + "grad_norm": 0.0011025476269423962, + "learning_rate": 1.3469265855777586e-05, + "loss": 0.0008, + "step": 13460 + }, + { + "epoch": 2.925716768027802, + "grad_norm": 0.001397005165927112, + "learning_rate": 1.3442115551694179e-05, + "loss": 0.0004, + "step": 13470 + }, + { + "epoch": 2.9278887923544743, + "grad_norm": 0.027524210512638092, + "learning_rate": 1.3414965247610775e-05, + "loss": 0.0003, + "step": 13480 + }, + { + "epoch": 2.930060816681147, + "grad_norm": 0.0011223404435440898, + "learning_rate": 1.3387814943527368e-05, + "loss": 0.0004, + "step": 13490 + }, + { + "epoch": 2.9322328410078193, + "grad_norm": 0.0010830480605363846, + "learning_rate": 1.3360664639443963e-05, + "loss": 0.0002, + "step": 13500 + }, + { + "epoch": 2.934404865334492, + "grad_norm": 0.0011122282594442368, + "learning_rate": 1.3333514335360556e-05, + "loss": 0.0002, + "step": 13510 + }, + { + "epoch": 2.9365768896611644, + "grad_norm": 0.0010628863237798214, + "learning_rate": 1.3306364031277152e-05, + "loss": 0.0002, + "step": 13520 + }, + { + "epoch": 2.938748913987837, + "grad_norm": 0.0013200805988162756, + "learning_rate": 1.3279213727193745e-05, + "loss": 0.0518, + "step": 13530 + }, + { + "epoch": 2.940920938314509, + "grad_norm": 0.0017253914847970009, + "learning_rate": 1.3252063423110338e-05, + "loss": 0.0002, + "step": 13540 + }, + { + "epoch": 2.9430929626411815, + "grad_norm": 0.007514182478189468, + "learning_rate": 1.3224913119026933e-05, + "loss": 0.0004, + "step": 13550 + }, + { + "epoch": 2.945264986967854, + "grad_norm": 0.038544125854969025, + "learning_rate": 1.319776281494353e-05, + "loss": 0.0003, + "step": 13560 + }, + { + "epoch": 2.9474370112945265, + "grad_norm": 0.01065383106470108, + "learning_rate": 1.3170612510860123e-05, + "loss": 0.0232, + "step": 13570 + }, + { + "epoch": 2.949609035621199, + "grad_norm": 0.027386289089918137, + "learning_rate": 1.3143462206776716e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "epoch": 2.9517810599478715, + "grad_norm": 0.00448612542822957, + "learning_rate": 1.311631190269331e-05, + "loss": 0.0043, + "step": 13590 + }, + { + "epoch": 2.953953084274544, + "grad_norm": 0.004412383772432804, + "learning_rate": 1.3089161598609907e-05, + "loss": 0.0184, + "step": 13600 + }, + { + "epoch": 2.956125108601216, + "grad_norm": 0.012439540587365627, + "learning_rate": 1.30620112945265e-05, + "loss": 0.0044, + "step": 13610 + }, + { + "epoch": 2.9582971329278887, + "grad_norm": 0.008847391232848167, + "learning_rate": 1.3034860990443093e-05, + "loss": 0.0004, + "step": 13620 + }, + { + "epoch": 2.960469157254561, + "grad_norm": 0.0029101003892719746, + "learning_rate": 1.3007710686359686e-05, + "loss": 0.0009, + "step": 13630 + }, + { + "epoch": 2.9626411815812337, + "grad_norm": 0.0015134647255763412, + "learning_rate": 1.2980560382276282e-05, + "loss": 0.0006, + "step": 13640 + }, + { + "epoch": 2.964813205907906, + "grad_norm": 0.0011704802745953202, + "learning_rate": 1.2953410078192877e-05, + "loss": 0.0002, + "step": 13650 + }, + { + "epoch": 2.9669852302345787, + "grad_norm": 0.0011589336208999157, + "learning_rate": 1.292625977410947e-05, + "loss": 0.0003, + "step": 13660 + }, + { + "epoch": 2.9691572545612512, + "grad_norm": 0.002994675189256668, + "learning_rate": 1.2899109470026066e-05, + "loss": 0.0002, + "step": 13670 + }, + { + "epoch": 2.9713292788879233, + "grad_norm": 0.0010929458076134324, + "learning_rate": 1.287195916594266e-05, + "loss": 0.0003, + "step": 13680 + }, + { + "epoch": 2.9735013032145963, + "grad_norm": 0.0018180428305640817, + "learning_rate": 1.2844808861859254e-05, + "loss": 0.0002, + "step": 13690 + }, + { + "epoch": 2.9756733275412683, + "grad_norm": 0.0012238157214596868, + "learning_rate": 1.2817658557775847e-05, + "loss": 0.0002, + "step": 13700 + }, + { + "epoch": 2.977845351867941, + "grad_norm": 0.0010512187145650387, + "learning_rate": 1.2790508253692443e-05, + "loss": 0.0002, + "step": 13710 + }, + { + "epoch": 2.9800173761946134, + "grad_norm": 0.0022942163050174713, + "learning_rate": 1.2763357949609036e-05, + "loss": 0.0006, + "step": 13720 + }, + { + "epoch": 2.982189400521286, + "grad_norm": 0.0010182593250647187, + "learning_rate": 1.273620764552563e-05, + "loss": 0.0334, + "step": 13730 + }, + { + "epoch": 2.9843614248479584, + "grad_norm": 0.0010593519546091557, + "learning_rate": 1.2709057341442224e-05, + "loss": 0.0001, + "step": 13740 + }, + { + "epoch": 2.9865334491746305, + "grad_norm": 0.001036479021422565, + "learning_rate": 1.268190703735882e-05, + "loss": 0.0007, + "step": 13750 + }, + { + "epoch": 2.9887054735013034, + "grad_norm": 0.014912799000740051, + "learning_rate": 1.2654756733275414e-05, + "loss": 0.0003, + "step": 13760 + }, + { + "epoch": 2.9908774978279755, + "grad_norm": 0.0010219586547464132, + "learning_rate": 1.2627606429192007e-05, + "loss": 0.0007, + "step": 13770 + }, + { + "epoch": 2.993049522154648, + "grad_norm": 0.001014968380331993, + "learning_rate": 1.2600456125108601e-05, + "loss": 0.0153, + "step": 13780 + }, + { + "epoch": 2.9952215464813206, + "grad_norm": 0.001060599577613175, + "learning_rate": 1.2573305821025198e-05, + "loss": 0.0003, + "step": 13790 + }, + { + "epoch": 2.997393570807993, + "grad_norm": 0.005443928763270378, + "learning_rate": 1.254615551694179e-05, + "loss": 0.0002, + "step": 13800 + }, + { + "epoch": 2.9995655951346656, + "grad_norm": 0.0010318290442228317, + "learning_rate": 1.2519005212858384e-05, + "loss": 0.0006, + "step": 13810 + }, + { + "epoch": 3.0, + "eval_f1": 0.6779661016949152, + "eval_loss": 0.04916631802916527, + "eval_runtime": 82.7215, + "eval_samples_per_second": 120.585, + "eval_steps_per_second": 7.543, + "step": 13812 + }, + { + "epoch": 3.001737619461338, + "grad_norm": 0.0010593549814075232, + "learning_rate": 1.2491854908774979e-05, + "loss": 0.0002, + "step": 13820 + }, + { + "epoch": 3.0039096437880106, + "grad_norm": 0.06401921808719635, + "learning_rate": 1.2464704604691573e-05, + "loss": 0.0006, + "step": 13830 + }, + { + "epoch": 3.0060816681146827, + "grad_norm": 0.0010438418248668313, + "learning_rate": 1.2437554300608168e-05, + "loss": 0.0004, + "step": 13840 + }, + { + "epoch": 3.008253692441355, + "grad_norm": 0.003679616143926978, + "learning_rate": 1.2410403996524763e-05, + "loss": 0.0003, + "step": 13850 + }, + { + "epoch": 3.0104257167680277, + "grad_norm": 0.044540684670209885, + "learning_rate": 1.2383253692441356e-05, + "loss": 0.0008, + "step": 13860 + }, + { + "epoch": 3.0125977410947002, + "grad_norm": 0.002288431627675891, + "learning_rate": 1.235610338835795e-05, + "loss": 0.0001, + "step": 13870 + }, + { + "epoch": 3.0147697654213728, + "grad_norm": 0.0010103103704750538, + "learning_rate": 1.2328953084274545e-05, + "loss": 0.0001, + "step": 13880 + }, + { + "epoch": 3.0169417897480453, + "grad_norm": 0.001005678903311491, + "learning_rate": 1.230180278019114e-05, + "loss": 0.0003, + "step": 13890 + }, + { + "epoch": 3.019113814074718, + "grad_norm": 0.000996905262582004, + "learning_rate": 1.2274652476107733e-05, + "loss": 0.0005, + "step": 13900 + }, + { + "epoch": 3.0212858384013903, + "grad_norm": 0.0010406819637864828, + "learning_rate": 1.2247502172024328e-05, + "loss": 0.0005, + "step": 13910 + }, + { + "epoch": 3.0234578627280624, + "grad_norm": 0.0011022110702469945, + "learning_rate": 1.222035186794092e-05, + "loss": 0.0003, + "step": 13920 + }, + { + "epoch": 3.025629887054735, + "grad_norm": 0.0009952927939593792, + "learning_rate": 1.2193201563857517e-05, + "loss": 0.0003, + "step": 13930 + }, + { + "epoch": 3.0278019113814074, + "grad_norm": 0.007508167065680027, + "learning_rate": 1.216605125977411e-05, + "loss": 0.0001, + "step": 13940 + }, + { + "epoch": 3.02997393570808, + "grad_norm": 0.0009940717136487365, + "learning_rate": 1.2138900955690705e-05, + "loss": 0.0002, + "step": 13950 + }, + { + "epoch": 3.0321459600347525, + "grad_norm": 0.009310873225331306, + "learning_rate": 1.2111750651607298e-05, + "loss": 0.0004, + "step": 13960 + }, + { + "epoch": 3.034317984361425, + "grad_norm": 0.0009831355419009924, + "learning_rate": 1.2084600347523892e-05, + "loss": 0.0002, + "step": 13970 + }, + { + "epoch": 3.0364900086880975, + "grad_norm": 0.001007686834782362, + "learning_rate": 1.2057450043440487e-05, + "loss": 0.0002, + "step": 13980 + }, + { + "epoch": 3.0386620330147696, + "grad_norm": 0.0009837389225140214, + "learning_rate": 1.2030299739357082e-05, + "loss": 0.0001, + "step": 13990 + }, + { + "epoch": 3.040834057341442, + "grad_norm": 0.0009638213668949902, + "learning_rate": 1.2003149435273675e-05, + "loss": 0.0001, + "step": 14000 + }, + { + "epoch": 3.0430060816681146, + "grad_norm": 0.0009538141894154251, + "learning_rate": 1.197599913119027e-05, + "loss": 0.0005, + "step": 14010 + }, + { + "epoch": 3.045178105994787, + "grad_norm": 0.0009638393530622125, + "learning_rate": 1.1948848827106864e-05, + "loss": 0.0002, + "step": 14020 + }, + { + "epoch": 3.0473501303214596, + "grad_norm": 0.00662460969761014, + "learning_rate": 1.1921698523023459e-05, + "loss": 0.0003, + "step": 14030 + }, + { + "epoch": 3.049522154648132, + "grad_norm": 0.0009571721311658621, + "learning_rate": 1.1894548218940054e-05, + "loss": 0.0002, + "step": 14040 + }, + { + "epoch": 3.0516941789748047, + "grad_norm": 0.0009538009180687368, + "learning_rate": 1.1867397914856647e-05, + "loss": 0.0001, + "step": 14050 + }, + { + "epoch": 3.053866203301477, + "grad_norm": 0.00587887316942215, + "learning_rate": 1.1840247610773241e-05, + "loss": 0.0003, + "step": 14060 + }, + { + "epoch": 3.0560382276281493, + "grad_norm": 0.00492137623950839, + "learning_rate": 1.1813097306689836e-05, + "loss": 0.0003, + "step": 14070 + }, + { + "epoch": 3.0582102519548218, + "grad_norm": 0.0009409641497768462, + "learning_rate": 1.178594700260643e-05, + "loss": 0.0002, + "step": 14080 + }, + { + "epoch": 3.0603822762814943, + "grad_norm": 0.0009386486490257084, + "learning_rate": 1.1758796698523024e-05, + "loss": 0.0003, + "step": 14090 + }, + { + "epoch": 3.062554300608167, + "grad_norm": 0.005429286975413561, + "learning_rate": 1.1731646394439619e-05, + "loss": 0.0002, + "step": 14100 + }, + { + "epoch": 3.0647263249348393, + "grad_norm": 0.005310894921422005, + "learning_rate": 1.1704496090356212e-05, + "loss": 0.0002, + "step": 14110 + }, + { + "epoch": 3.066898349261512, + "grad_norm": 0.38952526450157166, + "learning_rate": 1.1677345786272808e-05, + "loss": 0.0008, + "step": 14120 + }, + { + "epoch": 3.0690703735881844, + "grad_norm": 0.0009406186873093247, + "learning_rate": 1.1650195482189401e-05, + "loss": 0.0001, + "step": 14130 + }, + { + "epoch": 3.0712423979148564, + "grad_norm": 0.01129077933728695, + "learning_rate": 1.1623045178105996e-05, + "loss": 0.0023, + "step": 14140 + }, + { + "epoch": 3.073414422241529, + "grad_norm": 0.0024124332703649998, + "learning_rate": 1.1595894874022589e-05, + "loss": 0.0002, + "step": 14150 + }, + { + "epoch": 3.0755864465682015, + "grad_norm": 0.006113865878432989, + "learning_rate": 1.1568744569939183e-05, + "loss": 0.0004, + "step": 14160 + }, + { + "epoch": 3.077758470894874, + "grad_norm": 0.0014659430598840117, + "learning_rate": 1.1541594265855778e-05, + "loss": 0.0002, + "step": 14170 + }, + { + "epoch": 3.0799304952215465, + "grad_norm": 0.004594105761498213, + "learning_rate": 1.1514443961772373e-05, + "loss": 0.0003, + "step": 14180 + }, + { + "epoch": 3.082102519548219, + "grad_norm": 0.0009500061278231442, + "learning_rate": 1.1487293657688966e-05, + "loss": 0.025, + "step": 14190 + }, + { + "epoch": 3.0842745438748915, + "grad_norm": 0.0009136287262663245, + "learning_rate": 1.146014335360556e-05, + "loss": 0.0218, + "step": 14200 + }, + { + "epoch": 3.086446568201564, + "grad_norm": 0.01176744606345892, + "learning_rate": 1.1432993049522155e-05, + "loss": 0.0002, + "step": 14210 + }, + { + "epoch": 3.088618592528236, + "grad_norm": 0.0009232366573996842, + "learning_rate": 1.140584274543875e-05, + "loss": 0.026, + "step": 14220 + }, + { + "epoch": 3.0907906168549086, + "grad_norm": 0.0009313057526014745, + "learning_rate": 1.1378692441355343e-05, + "loss": 0.0401, + "step": 14230 + }, + { + "epoch": 3.092962641181581, + "grad_norm": 0.0009547146037220955, + "learning_rate": 1.1351542137271938e-05, + "loss": 0.0001, + "step": 14240 + }, + { + "epoch": 3.0951346655082537, + "grad_norm": 0.0010204818099737167, + "learning_rate": 1.1324391833188532e-05, + "loss": 0.001, + "step": 14250 + }, + { + "epoch": 3.097306689834926, + "grad_norm": 0.0015138749731704593, + "learning_rate": 1.1297241529105127e-05, + "loss": 0.0002, + "step": 14260 + }, + { + "epoch": 3.0994787141615987, + "grad_norm": 0.0009715591440908611, + "learning_rate": 1.1270091225021722e-05, + "loss": 0.0002, + "step": 14270 + }, + { + "epoch": 3.101650738488271, + "grad_norm": 0.001034111832268536, + "learning_rate": 1.1242940920938315e-05, + "loss": 0.0002, + "step": 14280 + }, + { + "epoch": 3.1038227628149437, + "grad_norm": 0.09780330210924149, + "learning_rate": 1.121579061685491e-05, + "loss": 0.0008, + "step": 14290 + }, + { + "epoch": 3.105994787141616, + "grad_norm": 0.012254135683178902, + "learning_rate": 1.1188640312771503e-05, + "loss": 0.0002, + "step": 14300 + }, + { + "epoch": 3.1081668114682883, + "grad_norm": 0.0010856038425117731, + "learning_rate": 1.1161490008688099e-05, + "loss": 0.0002, + "step": 14310 + }, + { + "epoch": 3.110338835794961, + "grad_norm": 0.0009265311527997255, + "learning_rate": 1.1134339704604692e-05, + "loss": 0.0003, + "step": 14320 + }, + { + "epoch": 3.1125108601216334, + "grad_norm": 0.0013762598391622305, + "learning_rate": 1.1107189400521287e-05, + "loss": 0.0003, + "step": 14330 + }, + { + "epoch": 3.114682884448306, + "grad_norm": 0.0012818133691325784, + "learning_rate": 1.108003909643788e-05, + "loss": 0.0037, + "step": 14340 + }, + { + "epoch": 3.1168549087749784, + "grad_norm": 0.000965480983722955, + "learning_rate": 1.1052888792354474e-05, + "loss": 0.0198, + "step": 14350 + }, + { + "epoch": 3.119026933101651, + "grad_norm": 0.0009158066241070628, + "learning_rate": 1.102573848827107e-05, + "loss": 0.013, + "step": 14360 + }, + { + "epoch": 3.121198957428323, + "grad_norm": 0.005339405033737421, + "learning_rate": 1.0998588184187664e-05, + "loss": 0.0004, + "step": 14370 + }, + { + "epoch": 3.1233709817549955, + "grad_norm": 0.000903356762137264, + "learning_rate": 1.0971437880104257e-05, + "loss": 0.0002, + "step": 14380 + }, + { + "epoch": 3.125543006081668, + "grad_norm": 0.0009056212729774415, + "learning_rate": 1.0944287576020852e-05, + "loss": 0.0094, + "step": 14390 + }, + { + "epoch": 3.1277150304083405, + "grad_norm": 0.004547704942524433, + "learning_rate": 1.0917137271937446e-05, + "loss": 0.0001, + "step": 14400 + }, + { + "epoch": 3.129887054735013, + "grad_norm": 0.006993583869189024, + "learning_rate": 1.0889986967854041e-05, + "loss": 0.0014, + "step": 14410 + }, + { + "epoch": 3.1320590790616856, + "grad_norm": 0.0009027125779539347, + "learning_rate": 1.0862836663770634e-05, + "loss": 0.0055, + "step": 14420 + }, + { + "epoch": 3.134231103388358, + "grad_norm": 2.2096879482269287, + "learning_rate": 1.0835686359687229e-05, + "loss": 0.0412, + "step": 14430 + }, + { + "epoch": 3.1364031277150306, + "grad_norm": 0.0009054460097104311, + "learning_rate": 1.0808536055603822e-05, + "loss": 0.0001, + "step": 14440 + }, + { + "epoch": 3.1385751520417027, + "grad_norm": 0.0009118215530179441, + "learning_rate": 1.0781385751520418e-05, + "loss": 0.0004, + "step": 14450 + }, + { + "epoch": 3.140747176368375, + "grad_norm": 0.03558327257633209, + "learning_rate": 1.0754235447437013e-05, + "loss": 0.0004, + "step": 14460 + }, + { + "epoch": 3.1429192006950477, + "grad_norm": 0.0009161134366877377, + "learning_rate": 1.0727085143353606e-05, + "loss": 0.0002, + "step": 14470 + }, + { + "epoch": 3.1450912250217202, + "grad_norm": 0.0009112968109548092, + "learning_rate": 1.06999348392702e-05, + "loss": 0.0001, + "step": 14480 + }, + { + "epoch": 3.1472632493483927, + "grad_norm": 0.26400279998779297, + "learning_rate": 1.0672784535186794e-05, + "loss": 0.0054, + "step": 14490 + }, + { + "epoch": 3.1494352736750653, + "grad_norm": 0.0011174866231158376, + "learning_rate": 1.064563423110339e-05, + "loss": 0.0062, + "step": 14500 + }, + { + "epoch": 3.151607298001738, + "grad_norm": 0.0009248264250345528, + "learning_rate": 1.0618483927019983e-05, + "loss": 0.0044, + "step": 14510 + }, + { + "epoch": 3.1537793223284103, + "grad_norm": 0.00093687983462587, + "learning_rate": 1.0591333622936578e-05, + "loss": 0.0021, + "step": 14520 + }, + { + "epoch": 3.1559513466550824, + "grad_norm": 0.0012292963219806552, + "learning_rate": 1.056418331885317e-05, + "loss": 0.0016, + "step": 14530 + }, + { + "epoch": 3.158123370981755, + "grad_norm": 0.1130121722817421, + "learning_rate": 1.0537033014769766e-05, + "loss": 0.0007, + "step": 14540 + }, + { + "epoch": 3.1602953953084274, + "grad_norm": 0.001622872776351869, + "learning_rate": 1.0512597741094701e-05, + "loss": 0.0102, + "step": 14550 + }, + { + "epoch": 3.1624674196351, + "grad_norm": 0.01409598533064127, + "learning_rate": 1.0485447437011296e-05, + "loss": 0.0116, + "step": 14560 + }, + { + "epoch": 3.1646394439617724, + "grad_norm": 0.000908426649402827, + "learning_rate": 1.045829713292789e-05, + "loss": 0.0182, + "step": 14570 + }, + { + "epoch": 3.166811468288445, + "grad_norm": 0.0009166007512249053, + "learning_rate": 1.0431146828844483e-05, + "loss": 0.0003, + "step": 14580 + }, + { + "epoch": 3.1689834926151175, + "grad_norm": 0.0008866732241585851, + "learning_rate": 1.0403996524761078e-05, + "loss": 0.0005, + "step": 14590 + }, + { + "epoch": 3.1711555169417895, + "grad_norm": 0.0011180889559909701, + "learning_rate": 1.0376846220677671e-05, + "loss": 0.0127, + "step": 14600 + }, + { + "epoch": 3.173327541268462, + "grad_norm": 0.011834468692541122, + "learning_rate": 1.0349695916594268e-05, + "loss": 0.0003, + "step": 14610 + }, + { + "epoch": 3.1754995655951346, + "grad_norm": 0.0009702285169623792, + "learning_rate": 1.032254561251086e-05, + "loss": 0.0097, + "step": 14620 + }, + { + "epoch": 3.177671589921807, + "grad_norm": 0.000917663041036576, + "learning_rate": 1.0295395308427455e-05, + "loss": 0.0001, + "step": 14630 + }, + { + "epoch": 3.1798436142484796, + "grad_norm": 0.0103968670591712, + "learning_rate": 1.0268245004344048e-05, + "loss": 0.0003, + "step": 14640 + }, + { + "epoch": 3.182015638575152, + "grad_norm": 0.0010382416658103466, + "learning_rate": 1.0241094700260643e-05, + "loss": 0.0019, + "step": 14650 + }, + { + "epoch": 3.1841876629018246, + "grad_norm": 0.009599311277270317, + "learning_rate": 1.0213944396177238e-05, + "loss": 0.0009, + "step": 14660 + }, + { + "epoch": 3.186359687228497, + "grad_norm": 0.0008943171706050634, + "learning_rate": 1.0186794092093832e-05, + "loss": 0.0015, + "step": 14670 + }, + { + "epoch": 3.1885317115551692, + "grad_norm": 0.0009029650827869773, + "learning_rate": 1.0159643788010425e-05, + "loss": 0.0002, + "step": 14680 + }, + { + "epoch": 3.1907037358818418, + "grad_norm": 0.0009432806400582194, + "learning_rate": 1.013249348392702e-05, + "loss": 0.0002, + "step": 14690 + }, + { + "epoch": 3.1928757602085143, + "grad_norm": 0.00372710800729692, + "learning_rate": 1.0105343179843615e-05, + "loss": 0.0005, + "step": 14700 + }, + { + "epoch": 3.195047784535187, + "grad_norm": 0.0008843187242746353, + "learning_rate": 1.007819287576021e-05, + "loss": 0.0003, + "step": 14710 + }, + { + "epoch": 3.1972198088618593, + "grad_norm": 0.0008964376756921411, + "learning_rate": 1.0051042571676804e-05, + "loss": 0.0002, + "step": 14720 + }, + { + "epoch": 3.199391833188532, + "grad_norm": 0.0008904538117349148, + "learning_rate": 1.0023892267593397e-05, + "loss": 0.0002, + "step": 14730 + }, + { + "epoch": 3.2015638575152043, + "grad_norm": 0.0008782143704593182, + "learning_rate": 9.996741963509992e-06, + "loss": 0.0073, + "step": 14740 + }, + { + "epoch": 3.203735881841877, + "grad_norm": 0.0008750375709496439, + "learning_rate": 9.969591659426587e-06, + "loss": 0.0002, + "step": 14750 + }, + { + "epoch": 3.205907906168549, + "grad_norm": 8.284111022949219, + "learning_rate": 9.942441355343181e-06, + "loss": 0.002, + "step": 14760 + }, + { + "epoch": 3.2080799304952214, + "grad_norm": 0.001080102170817554, + "learning_rate": 9.915291051259774e-06, + "loss": 0.0001, + "step": 14770 + }, + { + "epoch": 3.210251954821894, + "grad_norm": 0.0008838811190798879, + "learning_rate": 9.888140747176369e-06, + "loss": 0.0002, + "step": 14780 + }, + { + "epoch": 3.2124239791485665, + "grad_norm": 0.0008833123138174415, + "learning_rate": 9.860990443092962e-06, + "loss": 0.0003, + "step": 14790 + }, + { + "epoch": 3.214596003475239, + "grad_norm": 0.01015100721269846, + "learning_rate": 9.833840139009559e-06, + "loss": 0.0003, + "step": 14800 + }, + { + "epoch": 3.2167680278019115, + "grad_norm": 0.0013419504975900054, + "learning_rate": 9.806689834926152e-06, + "loss": 0.0003, + "step": 14810 + }, + { + "epoch": 3.218940052128584, + "grad_norm": 0.0012950883246958256, + "learning_rate": 9.779539530842746e-06, + "loss": 0.0003, + "step": 14820 + }, + { + "epoch": 3.221112076455256, + "grad_norm": 0.0009361078846268356, + "learning_rate": 9.75238922675934e-06, + "loss": 0.0001, + "step": 14830 + }, + { + "epoch": 3.2232841007819286, + "grad_norm": 0.0008646132191643119, + "learning_rate": 9.725238922675934e-06, + "loss": 0.0002, + "step": 14840 + }, + { + "epoch": 3.225456125108601, + "grad_norm": 0.0008529993938282132, + "learning_rate": 9.698088618592529e-06, + "loss": 0.0018, + "step": 14850 + }, + { + "epoch": 3.2276281494352737, + "grad_norm": 0.000858749495819211, + "learning_rate": 9.670938314509123e-06, + "loss": 0.0001, + "step": 14860 + }, + { + "epoch": 3.229800173761946, + "grad_norm": 0.0008768062107264996, + "learning_rate": 9.643788010425716e-06, + "loss": 0.0002, + "step": 14870 + }, + { + "epoch": 3.2319721980886187, + "grad_norm": 0.0008445355342701077, + "learning_rate": 9.616637706342311e-06, + "loss": 0.0002, + "step": 14880 + }, + { + "epoch": 3.234144222415291, + "grad_norm": 0.0014718384481966496, + "learning_rate": 9.589487402258906e-06, + "loss": 0.0033, + "step": 14890 + }, + { + "epoch": 3.2363162467419633, + "grad_norm": 0.0008432904141955078, + "learning_rate": 9.5623370981755e-06, + "loss": 0.0002, + "step": 14900 + }, + { + "epoch": 3.238488271068636, + "grad_norm": 0.0008551370119675994, + "learning_rate": 9.535186794092095e-06, + "loss": 0.0137, + "step": 14910 + }, + { + "epoch": 3.2406602953953083, + "grad_norm": 0.0009141720947809517, + "learning_rate": 9.508036490008688e-06, + "loss": 0.0338, + "step": 14920 + }, + { + "epoch": 3.242832319721981, + "grad_norm": 0.0008974664960987866, + "learning_rate": 9.480886185925283e-06, + "loss": 0.0002, + "step": 14930 + }, + { + "epoch": 3.2450043440486533, + "grad_norm": 1.5477900505065918, + "learning_rate": 9.453735881841878e-06, + "loss": 0.018, + "step": 14940 + }, + { + "epoch": 3.247176368375326, + "grad_norm": 0.0009042550809681416, + "learning_rate": 9.426585577758472e-06, + "loss": 0.0003, + "step": 14950 + }, + { + "epoch": 3.2493483927019984, + "grad_norm": 0.22629916667938232, + "learning_rate": 9.399435273675065e-06, + "loss": 0.0014, + "step": 14960 + }, + { + "epoch": 3.251520417028671, + "grad_norm": 0.0008919961983337998, + "learning_rate": 9.37228496959166e-06, + "loss": 0.0005, + "step": 14970 + }, + { + "epoch": 3.2536924413553434, + "grad_norm": 0.0008801415096968412, + "learning_rate": 9.345134665508253e-06, + "loss": 0.0022, + "step": 14980 + }, + { + "epoch": 3.2558644656820155, + "grad_norm": 0.24160538613796234, + "learning_rate": 9.31798436142485e-06, + "loss": 0.0119, + "step": 14990 + }, + { + "epoch": 3.258036490008688, + "grad_norm": 0.019302744418382645, + "learning_rate": 9.290834057341443e-06, + "loss": 0.0008, + "step": 15000 + }, + { + "epoch": 3.2602085143353605, + "grad_norm": 0.0008701402693986893, + "learning_rate": 9.263683753258037e-06, + "loss": 0.0055, + "step": 15010 + }, + { + "epoch": 3.262380538662033, + "grad_norm": 0.0008556676330044866, + "learning_rate": 9.23653344917463e-06, + "loss": 0.0007, + "step": 15020 + }, + { + "epoch": 3.2645525629887056, + "grad_norm": 0.0008663336047902703, + "learning_rate": 9.209383145091225e-06, + "loss": 0.0006, + "step": 15030 + }, + { + "epoch": 3.266724587315378, + "grad_norm": 0.1436619907617569, + "learning_rate": 9.18223284100782e-06, + "loss": 0.0038, + "step": 15040 + }, + { + "epoch": 3.2688966116420506, + "grad_norm": 0.0009035149705596268, + "learning_rate": 9.155082536924414e-06, + "loss": 0.0002, + "step": 15050 + }, + { + "epoch": 3.2710686359687227, + "grad_norm": 0.0008320367196574807, + "learning_rate": 9.127932232841007e-06, + "loss": 0.0001, + "step": 15060 + }, + { + "epoch": 3.273240660295395, + "grad_norm": 0.0010443541686981916, + "learning_rate": 9.100781928757602e-06, + "loss": 0.0002, + "step": 15070 + }, + { + "epoch": 3.2754126846220677, + "grad_norm": 0.0015021273866295815, + "learning_rate": 9.073631624674197e-06, + "loss": 0.0002, + "step": 15080 + }, + { + "epoch": 3.27758470894874, + "grad_norm": 0.0009816536912694573, + "learning_rate": 9.046481320590792e-06, + "loss": 0.0001, + "step": 15090 + }, + { + "epoch": 3.2797567332754127, + "grad_norm": 0.0013045056257396936, + "learning_rate": 9.019331016507385e-06, + "loss": 0.0107, + "step": 15100 + }, + { + "epoch": 3.2819287576020852, + "grad_norm": 0.002013902645558119, + "learning_rate": 8.99218071242398e-06, + "loss": 0.0004, + "step": 15110 + }, + { + "epoch": 3.2841007819287578, + "grad_norm": 1.3247151374816895, + "learning_rate": 8.965030408340574e-06, + "loss": 0.0074, + "step": 15120 + }, + { + "epoch": 3.28627280625543, + "grad_norm": 0.0020623058080673218, + "learning_rate": 8.937880104257169e-06, + "loss": 0.0002, + "step": 15130 + }, + { + "epoch": 3.2884448305821024, + "grad_norm": 0.003490692237392068, + "learning_rate": 8.910729800173763e-06, + "loss": 0.0003, + "step": 15140 + }, + { + "epoch": 3.290616854908775, + "grad_norm": 0.0008333229343406856, + "learning_rate": 8.883579496090357e-06, + "loss": 0.0001, + "step": 15150 + }, + { + "epoch": 3.2927888792354474, + "grad_norm": 0.0008364542154595256, + "learning_rate": 8.856429192006951e-06, + "loss": 0.0028, + "step": 15160 + }, + { + "epoch": 3.29496090356212, + "grad_norm": 0.005969099700450897, + "learning_rate": 8.829278887923544e-06, + "loss": 0.0058, + "step": 15170 + }, + { + "epoch": 3.2971329278887924, + "grad_norm": 0.0008359033381566405, + "learning_rate": 8.80212858384014e-06, + "loss": 0.0001, + "step": 15180 + }, + { + "epoch": 3.299304952215465, + "grad_norm": 0.004270035773515701, + "learning_rate": 8.774978279756734e-06, + "loss": 0.0001, + "step": 15190 + }, + { + "epoch": 3.3014769765421375, + "grad_norm": 0.0008729678229428828, + "learning_rate": 8.747827975673328e-06, + "loss": 0.0003, + "step": 15200 + }, + { + "epoch": 3.30364900086881, + "grad_norm": 5.316149711608887, + "learning_rate": 8.720677671589921e-06, + "loss": 0.0102, + "step": 15210 + }, + { + "epoch": 3.305821025195482, + "grad_norm": 0.0008195647387765348, + "learning_rate": 8.693527367506516e-06, + "loss": 0.0003, + "step": 15220 + }, + { + "epoch": 3.3079930495221546, + "grad_norm": 0.0008153934613801539, + "learning_rate": 8.66637706342311e-06, + "loss": 0.0001, + "step": 15230 + }, + { + "epoch": 3.310165073848827, + "grad_norm": 0.022830063477158546, + "learning_rate": 8.639226759339706e-06, + "loss": 0.0004, + "step": 15240 + }, + { + "epoch": 3.3123370981754996, + "grad_norm": 0.0008046124130487442, + "learning_rate": 8.612076455256299e-06, + "loss": 0.0002, + "step": 15250 + }, + { + "epoch": 3.314509122502172, + "grad_norm": 0.00419920589774847, + "learning_rate": 8.584926151172893e-06, + "loss": 0.0001, + "step": 15260 + }, + { + "epoch": 3.3166811468288446, + "grad_norm": 0.008058223873376846, + "learning_rate": 8.557775847089488e-06, + "loss": 0.0179, + "step": 15270 + }, + { + "epoch": 3.318853171155517, + "grad_norm": 0.0008202812750823796, + "learning_rate": 8.530625543006083e-06, + "loss": 0.0002, + "step": 15280 + }, + { + "epoch": 3.321025195482189, + "grad_norm": 0.0008240279275923967, + "learning_rate": 8.503475238922676e-06, + "loss": 0.0041, + "step": 15290 + }, + { + "epoch": 3.3231972198088617, + "grad_norm": 0.00086255744099617, + "learning_rate": 8.47632493483927e-06, + "loss": 0.0001, + "step": 15300 + }, + { + "epoch": 3.3253692441355343, + "grad_norm": 0.0007995104533620179, + "learning_rate": 8.449174630755863e-06, + "loss": 0.001, + "step": 15310 + }, + { + "epoch": 3.3275412684622068, + "grad_norm": 0.0010926051763817668, + "learning_rate": 8.42202432667246e-06, + "loss": 0.0001, + "step": 15320 + }, + { + "epoch": 3.3297132927888793, + "grad_norm": 0.0010417832527309656, + "learning_rate": 8.394874022589055e-06, + "loss": 0.0002, + "step": 15330 + }, + { + "epoch": 3.331885317115552, + "grad_norm": 0.0009360710973851383, + "learning_rate": 8.367723718505648e-06, + "loss": 0.0002, + "step": 15340 + }, + { + "epoch": 3.3340573414422243, + "grad_norm": 0.0008018920780159533, + "learning_rate": 8.340573414422242e-06, + "loss": 0.0042, + "step": 15350 + }, + { + "epoch": 3.3362293657688964, + "grad_norm": 0.028158850967884064, + "learning_rate": 8.313423110338835e-06, + "loss": 0.0049, + "step": 15360 + }, + { + "epoch": 3.338401390095569, + "grad_norm": 0.0008068516617640853, + "learning_rate": 8.286272806255432e-06, + "loss": 0.0002, + "step": 15370 + }, + { + "epoch": 3.3405734144222414, + "grad_norm": 0.000881023530382663, + "learning_rate": 8.259122502172025e-06, + "loss": 0.0002, + "step": 15380 + }, + { + "epoch": 3.342745438748914, + "grad_norm": 0.0008173162932507694, + "learning_rate": 8.23197219808862e-06, + "loss": 0.0001, + "step": 15390 + }, + { + "epoch": 3.3449174630755865, + "grad_norm": 0.000863785739056766, + "learning_rate": 8.204821894005212e-06, + "loss": 0.0003, + "step": 15400 + }, + { + "epoch": 3.347089487402259, + "grad_norm": 0.03187748044729233, + "learning_rate": 8.177671589921807e-06, + "loss": 0.0003, + "step": 15410 + }, + { + "epoch": 3.3492615117289315, + "grad_norm": 0.0008539878181181848, + "learning_rate": 8.150521285838402e-06, + "loss": 0.0047, + "step": 15420 + }, + { + "epoch": 3.351433536055604, + "grad_norm": 0.0008244166965596378, + "learning_rate": 8.123370981754997e-06, + "loss": 0.0001, + "step": 15430 + }, + { + "epoch": 3.3536055603822765, + "grad_norm": 0.0009109236998483539, + "learning_rate": 8.09622067767159e-06, + "loss": 0.0117, + "step": 15440 + }, + { + "epoch": 3.3557775847089486, + "grad_norm": 0.0009838317055255175, + "learning_rate": 8.069070373588184e-06, + "loss": 0.0002, + "step": 15450 + }, + { + "epoch": 3.357949609035621, + "grad_norm": 0.0014414238976314664, + "learning_rate": 8.041920069504779e-06, + "loss": 0.0001, + "step": 15460 + }, + { + "epoch": 3.3601216333622936, + "grad_norm": 0.0042699044570326805, + "learning_rate": 8.014769765421374e-06, + "loss": 0.0001, + "step": 15470 + }, + { + "epoch": 3.362293657688966, + "grad_norm": 0.048958804458379745, + "learning_rate": 7.987619461337967e-06, + "loss": 0.0057, + "step": 15480 + }, + { + "epoch": 3.3644656820156387, + "grad_norm": 0.0007931030704639852, + "learning_rate": 7.960469157254561e-06, + "loss": 0.0003, + "step": 15490 + }, + { + "epoch": 3.366637706342311, + "grad_norm": 0.0008370107971131802, + "learning_rate": 7.933318853171154e-06, + "loss": 0.0058, + "step": 15500 + }, + { + "epoch": 3.3688097306689837, + "grad_norm": 0.0008241998148150742, + "learning_rate": 7.906168549087751e-06, + "loss": 0.0001, + "step": 15510 + }, + { + "epoch": 3.3709817549956558, + "grad_norm": 0.0010215636575594544, + "learning_rate": 7.879018245004346e-06, + "loss": 0.0054, + "step": 15520 + }, + { + "epoch": 3.3731537793223283, + "grad_norm": 0.0008759453776292503, + "learning_rate": 7.851867940920939e-06, + "loss": 0.0003, + "step": 15530 + }, + { + "epoch": 3.375325803649001, + "grad_norm": 0.0008560109417885542, + "learning_rate": 7.824717636837533e-06, + "loss": 0.0002, + "step": 15540 + }, + { + "epoch": 3.3774978279756733, + "grad_norm": 0.0008002111571840942, + "learning_rate": 7.797567332754126e-06, + "loss": 0.0004, + "step": 15550 + }, + { + "epoch": 3.379669852302346, + "grad_norm": 0.0007969232392497361, + "learning_rate": 7.770417028670723e-06, + "loss": 0.0002, + "step": 15560 + }, + { + "epoch": 3.3818418766290184, + "grad_norm": 0.000970225315541029, + "learning_rate": 7.743266724587316e-06, + "loss": 0.0002, + "step": 15570 + }, + { + "epoch": 3.384013900955691, + "grad_norm": 0.0008663604385219514, + "learning_rate": 7.71611642050391e-06, + "loss": 0.0039, + "step": 15580 + }, + { + "epoch": 3.386185925282363, + "grad_norm": 0.0031977728940546513, + "learning_rate": 7.688966116420503e-06, + "loss": 0.0002, + "step": 15590 + }, + { + "epoch": 3.3883579496090355, + "grad_norm": 0.0008602018351666629, + "learning_rate": 7.661815812337098e-06, + "loss": 0.0169, + "step": 15600 + }, + { + "epoch": 3.390529973935708, + "grad_norm": 0.0009347721934318542, + "learning_rate": 7.634665508253693e-06, + "loss": 0.0056, + "step": 15610 + }, + { + "epoch": 3.3927019982623805, + "grad_norm": 0.0007914070738479495, + "learning_rate": 7.607515204170288e-06, + "loss": 0.0001, + "step": 15620 + }, + { + "epoch": 3.394874022589053, + "grad_norm": 0.0007933730375953019, + "learning_rate": 7.580364900086881e-06, + "loss": 0.0099, + "step": 15630 + }, + { + "epoch": 3.3970460469157255, + "grad_norm": 0.000815921404864639, + "learning_rate": 7.553214596003476e-06, + "loss": 0.0101, + "step": 15640 + }, + { + "epoch": 3.399218071242398, + "grad_norm": 0.0013794410042464733, + "learning_rate": 7.526064291920069e-06, + "loss": 0.0002, + "step": 15650 + }, + { + "epoch": 3.40139009556907, + "grad_norm": 0.0015276771737262607, + "learning_rate": 7.498913987836664e-06, + "loss": 0.0024, + "step": 15660 + }, + { + "epoch": 3.4035621198957426, + "grad_norm": 0.0007931669242680073, + "learning_rate": 7.471763683753258e-06, + "loss": 0.0001, + "step": 15670 + }, + { + "epoch": 3.405734144222415, + "grad_norm": 0.0007988855941221118, + "learning_rate": 7.4446133796698525e-06, + "loss": 0.0005, + "step": 15680 + }, + { + "epoch": 3.4079061685490877, + "grad_norm": 0.0007956427871249616, + "learning_rate": 7.417463075586446e-06, + "loss": 0.0001, + "step": 15690 + }, + { + "epoch": 3.41007819287576, + "grad_norm": 0.8127291202545166, + "learning_rate": 7.390312771503041e-06, + "loss": 0.005, + "step": 15700 + }, + { + "epoch": 3.4122502172024327, + "grad_norm": 0.0007987445569597185, + "learning_rate": 7.363162467419635e-06, + "loss": 0.0001, + "step": 15710 + }, + { + "epoch": 3.4144222415291052, + "grad_norm": 0.0007936052861623466, + "learning_rate": 7.33601216333623e-06, + "loss": 0.0038, + "step": 15720 + }, + { + "epoch": 3.4165942658557777, + "grad_norm": 0.0008497874368913472, + "learning_rate": 7.308861859252824e-06, + "loss": 0.0001, + "step": 15730 + }, + { + "epoch": 3.4187662901824503, + "grad_norm": 0.0007960118819028139, + "learning_rate": 7.281711555169418e-06, + "loss": 0.0001, + "step": 15740 + }, + { + "epoch": 3.4209383145091223, + "grad_norm": 1.809430480003357, + "learning_rate": 7.254561251086013e-06, + "loss": 0.0114, + "step": 15750 + }, + { + "epoch": 3.423110338835795, + "grad_norm": 0.003907696343958378, + "learning_rate": 7.227410947002607e-06, + "loss": 0.0001, + "step": 15760 + }, + { + "epoch": 3.4252823631624674, + "grad_norm": 0.0008068581810221076, + "learning_rate": 7.2002606429192015e-06, + "loss": 0.0001, + "step": 15770 + }, + { + "epoch": 3.42745438748914, + "grad_norm": 0.0009195163147523999, + "learning_rate": 7.173110338835795e-06, + "loss": 0.0001, + "step": 15780 + }, + { + "epoch": 3.4296264118158124, + "grad_norm": 0.00783773697912693, + "learning_rate": 7.14596003475239e-06, + "loss": 0.0002, + "step": 15790 + }, + { + "epoch": 3.431798436142485, + "grad_norm": 0.0008429814479313791, + "learning_rate": 7.118809730668984e-06, + "loss": 0.0001, + "step": 15800 + }, + { + "epoch": 3.4339704604691574, + "grad_norm": 0.0007842437480576336, + "learning_rate": 7.091659426585579e-06, + "loss": 0.0002, + "step": 15810 + }, + { + "epoch": 3.4361424847958295, + "grad_norm": 0.0007843258208595216, + "learning_rate": 7.064509122502172e-06, + "loss": 0.0001, + "step": 15820 + }, + { + "epoch": 3.438314509122502, + "grad_norm": 0.0032298804726451635, + "learning_rate": 7.037358818418767e-06, + "loss": 0.0003, + "step": 15830 + }, + { + "epoch": 3.4404865334491745, + "grad_norm": 0.0035422821529209614, + "learning_rate": 7.01020851433536e-06, + "loss": 0.0002, + "step": 15840 + }, + { + "epoch": 3.442658557775847, + "grad_norm": 0.0007916768663562834, + "learning_rate": 6.983058210251956e-06, + "loss": 0.0001, + "step": 15850 + }, + { + "epoch": 3.4448305821025196, + "grad_norm": 0.003323981538414955, + "learning_rate": 6.955907906168549e-06, + "loss": 0.0006, + "step": 15860 + }, + { + "epoch": 3.447002606429192, + "grad_norm": 0.0007974683539941907, + "learning_rate": 6.9287576020851435e-06, + "loss": 0.0003, + "step": 15870 + }, + { + "epoch": 3.4491746307558646, + "grad_norm": 0.002854996593669057, + "learning_rate": 6.901607298001737e-06, + "loss": 0.0001, + "step": 15880 + }, + { + "epoch": 3.4513466550825367, + "grad_norm": 0.0008318639011122286, + "learning_rate": 6.874456993918332e-06, + "loss": 0.0001, + "step": 15890 + }, + { + "epoch": 3.453518679409209, + "grad_norm": 0.0008181575685739517, + "learning_rate": 6.847306689834926e-06, + "loss": 0.0001, + "step": 15900 + }, + { + "epoch": 3.4556907037358817, + "grad_norm": 0.000765151169616729, + "learning_rate": 6.820156385751521e-06, + "loss": 0.0001, + "step": 15910 + }, + { + "epoch": 3.4578627280625542, + "grad_norm": 0.0007698533590883017, + "learning_rate": 6.793006081668115e-06, + "loss": 0.0026, + "step": 15920 + }, + { + "epoch": 3.4600347523892268, + "grad_norm": 0.0137395691126585, + "learning_rate": 6.765855777584709e-06, + "loss": 0.0002, + "step": 15930 + }, + { + "epoch": 3.4622067767158993, + "grad_norm": 0.010664197616279125, + "learning_rate": 6.738705473501304e-06, + "loss": 0.0001, + "step": 15940 + }, + { + "epoch": 3.464378801042572, + "grad_norm": 0.001031374093145132, + "learning_rate": 6.711555169417898e-06, + "loss": 0.0001, + "step": 15950 + }, + { + "epoch": 3.4665508253692443, + "grad_norm": 0.0018270538421347737, + "learning_rate": 6.6844048653344925e-06, + "loss": 0.0418, + "step": 15960 + }, + { + "epoch": 3.468722849695917, + "grad_norm": 0.0007723259041085839, + "learning_rate": 6.657254561251086e-06, + "loss": 0.0001, + "step": 15970 + }, + { + "epoch": 3.470894874022589, + "grad_norm": 0.0008250788087025285, + "learning_rate": 6.630104257167681e-06, + "loss": 0.0002, + "step": 15980 + }, + { + "epoch": 3.4730668983492614, + "grad_norm": 0.0007811134564690292, + "learning_rate": 6.602953953084275e-06, + "loss": 0.0006, + "step": 15990 + }, + { + "epoch": 3.475238922675934, + "grad_norm": 0.000832605641335249, + "learning_rate": 6.57580364900087e-06, + "loss": 0.0002, + "step": 16000 + }, + { + "epoch": 3.4774109470026064, + "grad_norm": 0.0007671408238820732, + "learning_rate": 6.548653344917463e-06, + "loss": 0.0002, + "step": 16010 + }, + { + "epoch": 3.479582971329279, + "grad_norm": 0.0007610549218952656, + "learning_rate": 6.521503040834058e-06, + "loss": 0.0003, + "step": 16020 + }, + { + "epoch": 3.4817549956559515, + "grad_norm": 0.0007617721566930413, + "learning_rate": 6.494352736750651e-06, + "loss": 0.0011, + "step": 16030 + }, + { + "epoch": 3.483927019982624, + "grad_norm": 0.001039681606926024, + "learning_rate": 6.467202432667247e-06, + "loss": 0.0001, + "step": 16040 + }, + { + "epoch": 3.486099044309296, + "grad_norm": 0.0007607974112033844, + "learning_rate": 6.44005212858384e-06, + "loss": 0.0001, + "step": 16050 + }, + { + "epoch": 3.4882710686359686, + "grad_norm": 0.0007605382124893367, + "learning_rate": 6.4129018245004346e-06, + "loss": 0.0001, + "step": 16060 + }, + { + "epoch": 3.490443092962641, + "grad_norm": 0.0007544245454482734, + "learning_rate": 6.3857515204170284e-06, + "loss": 0.0002, + "step": 16070 + }, + { + "epoch": 3.4926151172893136, + "grad_norm": 0.0007656171219423413, + "learning_rate": 6.358601216333623e-06, + "loss": 0.0001, + "step": 16080 + }, + { + "epoch": 3.494787141615986, + "grad_norm": 0.0007810737006366253, + "learning_rate": 6.331450912250217e-06, + "loss": 0.0001, + "step": 16090 + }, + { + "epoch": 3.4969591659426587, + "grad_norm": 0.1456904411315918, + "learning_rate": 6.304300608166812e-06, + "loss": 0.0003, + "step": 16100 + }, + { + "epoch": 3.499131190269331, + "grad_norm": 0.0007609643507748842, + "learning_rate": 6.277150304083406e-06, + "loss": 0.0001, + "step": 16110 + }, + { + "epoch": 3.5013032145960032, + "grad_norm": 0.0034587134141474962, + "learning_rate": 6.25e-06, + "loss": 0.0002, + "step": 16120 + }, + { + "epoch": 3.503475238922676, + "grad_norm": 0.0008425491396337748, + "learning_rate": 6.222849695916594e-06, + "loss": 0.0001, + "step": 16130 + }, + { + "epoch": 3.5056472632493483, + "grad_norm": 0.0007616893853992224, + "learning_rate": 6.195699391833189e-06, + "loss": 0.001, + "step": 16140 + }, + { + "epoch": 3.507819287576021, + "grad_norm": 0.000824123912025243, + "learning_rate": 6.168549087749783e-06, + "loss": 0.0001, + "step": 16150 + }, + { + "epoch": 3.5099913119026933, + "grad_norm": 0.0007427196251228452, + "learning_rate": 6.1413987836663775e-06, + "loss": 0.0001, + "step": 16160 + }, + { + "epoch": 3.512163336229366, + "grad_norm": 0.0027246661484241486, + "learning_rate": 6.114248479582971e-06, + "loss": 0.0001, + "step": 16170 + }, + { + "epoch": 3.5143353605560383, + "grad_norm": 0.0007525637629441917, + "learning_rate": 6.087098175499566e-06, + "loss": 0.0001, + "step": 16180 + }, + { + "epoch": 3.5165073848827104, + "grad_norm": 0.17882095277309418, + "learning_rate": 6.05994787141616e-06, + "loss": 0.0003, + "step": 16190 + }, + { + "epoch": 3.5186794092093834, + "grad_norm": 0.0022536071483045816, + "learning_rate": 6.032797567332754e-06, + "loss": 0.0002, + "step": 16200 + }, + { + "epoch": 3.5208514335360555, + "grad_norm": 0.0007445691735483706, + "learning_rate": 6.0056472632493485e-06, + "loss": 0.0001, + "step": 16210 + }, + { + "epoch": 3.523023457862728, + "grad_norm": 0.0007559970254078507, + "learning_rate": 5.978496959165942e-06, + "loss": 0.0194, + "step": 16220 + }, + { + "epoch": 3.5251954821894005, + "grad_norm": 0.001018458278849721, + "learning_rate": 5.951346655082538e-06, + "loss": 0.0001, + "step": 16230 + }, + { + "epoch": 3.527367506516073, + "grad_norm": 0.0007447813986800611, + "learning_rate": 5.924196350999132e-06, + "loss": 0.0001, + "step": 16240 + }, + { + "epoch": 3.5295395308427455, + "grad_norm": 0.0007549290312454104, + "learning_rate": 5.897046046915726e-06, + "loss": 0.0001, + "step": 16250 + }, + { + "epoch": 3.531711555169418, + "grad_norm": 0.0007536158664152026, + "learning_rate": 5.86989574283232e-06, + "loss": 0.0054, + "step": 16260 + }, + { + "epoch": 3.5338835794960906, + "grad_norm": 0.000750150007661432, + "learning_rate": 5.842745438748914e-06, + "loss": 0.0001, + "step": 16270 + }, + { + "epoch": 3.5360556038227626, + "grad_norm": 0.0007642016862519085, + "learning_rate": 5.815595134665509e-06, + "loss": 0.0046, + "step": 16280 + }, + { + "epoch": 3.538227628149435, + "grad_norm": 0.0008390732109546661, + "learning_rate": 5.788444830582103e-06, + "loss": 0.0001, + "step": 16290 + }, + { + "epoch": 3.5403996524761077, + "grad_norm": 0.0008663103799335659, + "learning_rate": 5.7612945264986975e-06, + "loss": 0.0001, + "step": 16300 + }, + { + "epoch": 3.54257167680278, + "grad_norm": 0.004686756059527397, + "learning_rate": 5.734144222415291e-06, + "loss": 0.0001, + "step": 16310 + }, + { + "epoch": 3.5447437011294527, + "grad_norm": 0.000735687674023211, + "learning_rate": 5.706993918331885e-06, + "loss": 0.0001, + "step": 16320 + }, + { + "epoch": 3.546915725456125, + "grad_norm": 0.2715161442756653, + "learning_rate": 5.67984361424848e-06, + "loss": 0.0003, + "step": 16330 + }, + { + "epoch": 3.5490877497827977, + "grad_norm": 0.0007402179180644453, + "learning_rate": 5.652693310165074e-06, + "loss": 0.0001, + "step": 16340 + }, + { + "epoch": 3.55125977410947, + "grad_norm": 0.0007502386579290032, + "learning_rate": 5.6255430060816685e-06, + "loss": 0.0001, + "step": 16350 + }, + { + "epoch": 3.5534317984361423, + "grad_norm": 0.0010584397241473198, + "learning_rate": 5.598392701998262e-06, + "loss": 0.0001, + "step": 16360 + }, + { + "epoch": 3.555603822762815, + "grad_norm": 0.0007463762303814292, + "learning_rate": 5.571242397914857e-06, + "loss": 0.0035, + "step": 16370 + }, + { + "epoch": 3.5577758470894874, + "grad_norm": 0.0025062961503863335, + "learning_rate": 5.544092093831451e-06, + "loss": 0.0001, + "step": 16380 + }, + { + "epoch": 3.55994787141616, + "grad_norm": 0.0007336509297601879, + "learning_rate": 5.516941789748045e-06, + "loss": 0.0009, + "step": 16390 + }, + { + "epoch": 3.5621198957428324, + "grad_norm": 0.004578218795359135, + "learning_rate": 5.4897914856646395e-06, + "loss": 0.0002, + "step": 16400 + }, + { + "epoch": 3.564291920069505, + "grad_norm": 0.0007287193438969553, + "learning_rate": 5.462641181581233e-06, + "loss": 0.0106, + "step": 16410 + }, + { + "epoch": 3.566463944396177, + "grad_norm": 0.000773219857364893, + "learning_rate": 5.435490877497828e-06, + "loss": 0.0002, + "step": 16420 + }, + { + "epoch": 3.56863596872285, + "grad_norm": 0.0025072686839848757, + "learning_rate": 5.408340573414423e-06, + "loss": 0.0002, + "step": 16430 + }, + { + "epoch": 3.570807993049522, + "grad_norm": 0.0008311655255965889, + "learning_rate": 5.381190269331017e-06, + "loss": 0.0022, + "step": 16440 + }, + { + "epoch": 3.5729800173761945, + "grad_norm": 0.19069628417491913, + "learning_rate": 5.354039965247611e-06, + "loss": 0.0034, + "step": 16450 + }, + { + "epoch": 3.575152041702867, + "grad_norm": 0.09645688533782959, + "learning_rate": 5.326889661164205e-06, + "loss": 0.031, + "step": 16460 + }, + { + "epoch": 3.5773240660295396, + "grad_norm": 0.005101743154227734, + "learning_rate": 5.2997393570808e-06, + "loss": 0.0354, + "step": 16470 + }, + { + "epoch": 3.579496090356212, + "grad_norm": 0.000746962963603437, + "learning_rate": 5.272589052997394e-06, + "loss": 0.0001, + "step": 16480 + }, + { + "epoch": 3.5816681146828846, + "grad_norm": 0.003772861324250698, + "learning_rate": 5.2454387489139885e-06, + "loss": 0.0002, + "step": 16490 + }, + { + "epoch": 3.583840139009557, + "grad_norm": 0.0007960237562656403, + "learning_rate": 5.218288444830582e-06, + "loss": 0.0001, + "step": 16500 + }, + { + "epoch": 3.586012163336229, + "grad_norm": 0.1657443791627884, + "learning_rate": 5.191138140747176e-06, + "loss": 0.0026, + "step": 16510 + }, + { + "epoch": 3.5881841876629017, + "grad_norm": 0.0007543342071585357, + "learning_rate": 5.163987836663771e-06, + "loss": 0.0001, + "step": 16520 + }, + { + "epoch": 3.590356211989574, + "grad_norm": 0.0007493611774407327, + "learning_rate": 5.136837532580365e-06, + "loss": 0.0001, + "step": 16530 + }, + { + "epoch": 3.5925282363162467, + "grad_norm": 0.0041643306612968445, + "learning_rate": 5.1096872284969595e-06, + "loss": 0.0002, + "step": 16540 + }, + { + "epoch": 3.5947002606429193, + "grad_norm": 0.0007641764241270721, + "learning_rate": 5.082536924413553e-06, + "loss": 0.0008, + "step": 16550 + }, + { + "epoch": 3.5968722849695918, + "grad_norm": 0.003648051293566823, + "learning_rate": 5.055386620330148e-06, + "loss": 0.0001, + "step": 16560 + }, + { + "epoch": 3.5990443092962643, + "grad_norm": 0.0033129567746073008, + "learning_rate": 5.028236316246742e-06, + "loss": 0.0002, + "step": 16570 + }, + { + "epoch": 3.6012163336229364, + "grad_norm": 0.000875050260219723, + "learning_rate": 5.001086012163337e-06, + "loss": 0.0003, + "step": 16580 + }, + { + "epoch": 3.603388357949609, + "grad_norm": 0.0007452957797795534, + "learning_rate": 4.9739357080799306e-06, + "loss": 0.0001, + "step": 16590 + }, + { + "epoch": 3.6055603822762814, + "grad_norm": 0.0007511350559070706, + "learning_rate": 4.946785403996524e-06, + "loss": 0.0001, + "step": 16600 + }, + { + "epoch": 3.607732406602954, + "grad_norm": 0.003435454098507762, + "learning_rate": 4.919635099913119e-06, + "loss": 0.0001, + "step": 16610 + }, + { + "epoch": 3.6099044309296264, + "grad_norm": 0.0033019285183399916, + "learning_rate": 4.892484795829713e-06, + "loss": 0.0002, + "step": 16620 + }, + { + "epoch": 3.612076455256299, + "grad_norm": 0.0009856983087956905, + "learning_rate": 4.865334491746308e-06, + "loss": 0.0001, + "step": 16630 + }, + { + "epoch": 3.6142484795829715, + "grad_norm": 0.0007532949093729258, + "learning_rate": 4.838184187662902e-06, + "loss": 0.017, + "step": 16640 + }, + { + "epoch": 3.6164205039096435, + "grad_norm": 0.0007319801952689886, + "learning_rate": 4.811033883579496e-06, + "loss": 0.0001, + "step": 16650 + }, + { + "epoch": 3.6185925282363165, + "grad_norm": 0.001139196683652699, + "learning_rate": 4.783883579496091e-06, + "loss": 0.0002, + "step": 16660 + }, + { + "epoch": 3.6207645525629886, + "grad_norm": 0.0007473634323105216, + "learning_rate": 4.756733275412685e-06, + "loss": 0.0001, + "step": 16670 + }, + { + "epoch": 3.622936576889661, + "grad_norm": 0.006157098803669214, + "learning_rate": 4.7295829713292796e-06, + "loss": 0.0002, + "step": 16680 + }, + { + "epoch": 3.6251086012163336, + "grad_norm": 0.0007382711628451943, + "learning_rate": 4.7024326672458734e-06, + "loss": 0.0036, + "step": 16690 + }, + { + "epoch": 3.627280625543006, + "grad_norm": 0.0007937946356832981, + "learning_rate": 4.675282363162468e-06, + "loss": 0.0001, + "step": 16700 + }, + { + "epoch": 3.6294526498696786, + "grad_norm": 0.0007356005371548235, + "learning_rate": 4.648132059079062e-06, + "loss": 0.0034, + "step": 16710 + }, + { + "epoch": 3.631624674196351, + "grad_norm": 0.0007657075184397399, + "learning_rate": 4.620981754995656e-06, + "loss": 0.0001, + "step": 16720 + }, + { + "epoch": 3.6337966985230237, + "grad_norm": 0.0008099807891994715, + "learning_rate": 4.593831450912251e-06, + "loss": 0.0001, + "step": 16730 + }, + { + "epoch": 3.6359687228496957, + "grad_norm": 0.0007494213059544563, + "learning_rate": 4.5666811468288444e-06, + "loss": 0.0001, + "step": 16740 + }, + { + "epoch": 3.6381407471763683, + "grad_norm": 0.0016505821840837598, + "learning_rate": 4.539530842745439e-06, + "loss": 0.0001, + "step": 16750 + }, + { + "epoch": 3.640312771503041, + "grad_norm": 0.0035423533990979195, + "learning_rate": 4.512380538662033e-06, + "loss": 0.0002, + "step": 16760 + }, + { + "epoch": 3.6424847958297133, + "grad_norm": 2.324589490890503, + "learning_rate": 4.485230234578628e-06, + "loss": 0.0019, + "step": 16770 + }, + { + "epoch": 3.644656820156386, + "grad_norm": 0.0007355398265644908, + "learning_rate": 4.458079930495222e-06, + "loss": 0.0288, + "step": 16780 + }, + { + "epoch": 3.6468288444830583, + "grad_norm": 0.0007384142372757196, + "learning_rate": 4.4309296264118155e-06, + "loss": 0.007, + "step": 16790 + }, + { + "epoch": 3.649000868809731, + "grad_norm": 0.01005201693624258, + "learning_rate": 4.40377932232841e-06, + "loss": 0.0001, + "step": 16800 + }, + { + "epoch": 3.651172893136403, + "grad_norm": 0.0007379804737865925, + "learning_rate": 4.376629018245004e-06, + "loss": 0.0087, + "step": 16810 + }, + { + "epoch": 3.6533449174630754, + "grad_norm": 0.0007344160694628954, + "learning_rate": 4.349478714161599e-06, + "loss": 0.0001, + "step": 16820 + }, + { + "epoch": 3.655516941789748, + "grad_norm": 0.9092646837234497, + "learning_rate": 4.3223284100781935e-06, + "loss": 0.0101, + "step": 16830 + }, + { + "epoch": 3.6576889661164205, + "grad_norm": 0.006105160806328058, + "learning_rate": 4.295178105994787e-06, + "loss": 0.0002, + "step": 16840 + }, + { + "epoch": 3.659860990443093, + "grad_norm": 0.0007417987799271941, + "learning_rate": 4.268027801911382e-06, + "loss": 0.0002, + "step": 16850 + }, + { + "epoch": 3.6620330147697655, + "grad_norm": 0.004440960939973593, + "learning_rate": 4.240877497827976e-06, + "loss": 0.0002, + "step": 16860 + }, + { + "epoch": 3.664205039096438, + "grad_norm": 0.0007366131176240742, + "learning_rate": 4.213727193744571e-06, + "loss": 0.0001, + "step": 16870 + }, + { + "epoch": 3.66637706342311, + "grad_norm": 0.0007839969475753605, + "learning_rate": 4.1865768896611645e-06, + "loss": 0.0066, + "step": 16880 + }, + { + "epoch": 3.668549087749783, + "grad_norm": 0.0007367105572484434, + "learning_rate": 4.159426585577759e-06, + "loss": 0.0002, + "step": 16890 + }, + { + "epoch": 3.670721112076455, + "grad_norm": 0.0007469954434782267, + "learning_rate": 4.132276281494353e-06, + "loss": 0.0001, + "step": 16900 + }, + { + "epoch": 3.6728931364031276, + "grad_norm": 0.000735281384550035, + "learning_rate": 4.105125977410947e-06, + "loss": 0.0002, + "step": 16910 + }, + { + "epoch": 3.6750651607298, + "grad_norm": 0.0007407576194964349, + "learning_rate": 4.077975673327542e-06, + "loss": 0.0001, + "step": 16920 + }, + { + "epoch": 3.6772371850564727, + "grad_norm": 0.0008425369742326438, + "learning_rate": 4.0508253692441355e-06, + "loss": 0.0001, + "step": 16930 + }, + { + "epoch": 3.679409209383145, + "grad_norm": 0.0007419844623655081, + "learning_rate": 4.02367506516073e-06, + "loss": 0.0001, + "step": 16940 + }, + { + "epoch": 3.6815812337098177, + "grad_norm": 0.0007465860689990222, + "learning_rate": 3.996524761077324e-06, + "loss": 0.0003, + "step": 16950 + }, + { + "epoch": 3.6837532580364902, + "grad_norm": 0.0007462946814484894, + "learning_rate": 3.969374456993919e-06, + "loss": 0.0002, + "step": 16960 + }, + { + "epoch": 3.6859252823631623, + "grad_norm": 0.0007361106108874083, + "learning_rate": 3.942224152910513e-06, + "loss": 0.0001, + "step": 16970 + }, + { + "epoch": 3.688097306689835, + "grad_norm": 0.0007370809908024967, + "learning_rate": 3.9150738488271065e-06, + "loss": 0.0001, + "step": 16980 + }, + { + "epoch": 3.6902693310165073, + "grad_norm": 0.0007324148900806904, + "learning_rate": 3.887923544743701e-06, + "loss": 0.0001, + "step": 16990 + }, + { + "epoch": 3.69244135534318, + "grad_norm": 0.015564072877168655, + "learning_rate": 3.860773240660295e-06, + "loss": 0.0127, + "step": 17000 + }, + { + "epoch": 3.6946133796698524, + "grad_norm": 0.0007292072405107319, + "learning_rate": 3.83362293657689e-06, + "loss": 0.0001, + "step": 17010 + }, + { + "epoch": 3.696785403996525, + "grad_norm": 0.000941729755140841, + "learning_rate": 3.8064726324934837e-06, + "loss": 0.0002, + "step": 17020 + }, + { + "epoch": 3.6989574283231974, + "grad_norm": 0.0020503199193626642, + "learning_rate": 3.779322328410079e-06, + "loss": 0.0002, + "step": 17030 + }, + { + "epoch": 3.7011294526498695, + "grad_norm": 0.0007638138486072421, + "learning_rate": 3.752172024326673e-06, + "loss": 0.0001, + "step": 17040 + }, + { + "epoch": 3.703301476976542, + "grad_norm": 0.0007398283923976123, + "learning_rate": 3.7250217202432674e-06, + "loss": 0.0001, + "step": 17050 + }, + { + "epoch": 3.7054735013032145, + "grad_norm": 0.44359394907951355, + "learning_rate": 3.6978714161598612e-06, + "loss": 0.0032, + "step": 17060 + }, + { + "epoch": 3.707645525629887, + "grad_norm": 0.0008260589092969894, + "learning_rate": 3.6707211120764555e-06, + "loss": 0.0002, + "step": 17070 + }, + { + "epoch": 3.7098175499565595, + "grad_norm": 0.0007681497954763472, + "learning_rate": 3.64357080799305e-06, + "loss": 0.0016, + "step": 17080 + }, + { + "epoch": 3.711989574283232, + "grad_norm": 0.00076832011109218, + "learning_rate": 3.616420503909644e-06, + "loss": 0.0004, + "step": 17090 + }, + { + "epoch": 3.7141615986099046, + "grad_norm": 0.0007327235070988536, + "learning_rate": 3.5892701998262384e-06, + "loss": 0.0003, + "step": 17100 + }, + { + "epoch": 3.7163336229365767, + "grad_norm": 0.0007373854168690741, + "learning_rate": 3.5621198957428327e-06, + "loss": 0.0002, + "step": 17110 + }, + { + "epoch": 3.7185056472632496, + "grad_norm": 0.0007295843679457903, + "learning_rate": 3.534969591659427e-06, + "loss": 0.003, + "step": 17120 + }, + { + "epoch": 3.7206776715899217, + "grad_norm": 0.000977756455540657, + "learning_rate": 3.507819287576021e-06, + "loss": 0.0001, + "step": 17130 + }, + { + "epoch": 3.722849695916594, + "grad_norm": 0.0007806509966030717, + "learning_rate": 3.480668983492615e-06, + "loss": 0.0001, + "step": 17140 + }, + { + "epoch": 3.7250217202432667, + "grad_norm": 0.0007476311875507236, + "learning_rate": 3.4535186794092094e-06, + "loss": 0.0269, + "step": 17150 + }, + { + "epoch": 3.7271937445699392, + "grad_norm": 0.014178342185914516, + "learning_rate": 3.4263683753258037e-06, + "loss": 0.0042, + "step": 17160 + }, + { + "epoch": 3.7293657688966118, + "grad_norm": 0.000731293112039566, + "learning_rate": 3.399218071242398e-06, + "loss": 0.0003, + "step": 17170 + }, + { + "epoch": 3.731537793223284, + "grad_norm": 0.0008929009782150388, + "learning_rate": 3.3720677671589923e-06, + "loss": 0.0001, + "step": 17180 + }, + { + "epoch": 3.733709817549957, + "grad_norm": 0.0007993864710442722, + "learning_rate": 3.3449174630755865e-06, + "loss": 0.0001, + "step": 17190 + }, + { + "epoch": 3.735881841876629, + "grad_norm": 0.0007400406175293028, + "learning_rate": 3.317767158992181e-06, + "loss": 0.0003, + "step": 17200 + }, + { + "epoch": 3.7380538662033014, + "grad_norm": 0.0007384680793620646, + "learning_rate": 3.2906168549087747e-06, + "loss": 0.0002, + "step": 17210 + }, + { + "epoch": 3.740225890529974, + "grad_norm": 0.0007810071110725403, + "learning_rate": 3.263466550825369e-06, + "loss": 0.0001, + "step": 17220 + }, + { + "epoch": 3.7423979148566464, + "grad_norm": 0.0007533092866651714, + "learning_rate": 3.2363162467419633e-06, + "loss": 0.007, + "step": 17230 + }, + { + "epoch": 3.744569939183319, + "grad_norm": 0.0007717570406384766, + "learning_rate": 3.2091659426585584e-06, + "loss": 0.0003, + "step": 17240 + }, + { + "epoch": 3.7467419635099914, + "grad_norm": 0.01403042022138834, + "learning_rate": 3.1820156385751523e-06, + "loss": 0.0003, + "step": 17250 + }, + { + "epoch": 3.748913987836664, + "grad_norm": 0.0007486468530260026, + "learning_rate": 3.1548653344917466e-06, + "loss": 0.0004, + "step": 17260 + }, + { + "epoch": 3.751086012163336, + "grad_norm": 0.000728482089471072, + "learning_rate": 3.127715030408341e-06, + "loss": 0.0001, + "step": 17270 + }, + { + "epoch": 3.7532580364900086, + "grad_norm": 0.0007304223254323006, + "learning_rate": 3.1005647263249347e-06, + "loss": 0.0002, + "step": 17280 + }, + { + "epoch": 3.755430060816681, + "grad_norm": 0.0015727184945717454, + "learning_rate": 3.0734144222415294e-06, + "loss": 0.0001, + "step": 17290 + }, + { + "epoch": 3.7576020851433536, + "grad_norm": 0.0007460480555891991, + "learning_rate": 3.0462641181581237e-06, + "loss": 0.0001, + "step": 17300 + }, + { + "epoch": 3.759774109470026, + "grad_norm": 0.0007649777107872069, + "learning_rate": 3.019113814074718e-06, + "loss": 0.0001, + "step": 17310 + }, + { + "epoch": 3.7619461337966986, + "grad_norm": 0.005170903634279966, + "learning_rate": 2.9919635099913123e-06, + "loss": 0.0002, + "step": 17320 + }, + { + "epoch": 3.764118158123371, + "grad_norm": 0.0007338092545978725, + "learning_rate": 2.964813205907906e-06, + "loss": 0.0003, + "step": 17330 + }, + { + "epoch": 3.766290182450043, + "grad_norm": 0.0008042194531299174, + "learning_rate": 2.9376629018245004e-06, + "loss": 0.0001, + "step": 17340 + }, + { + "epoch": 3.768462206776716, + "grad_norm": 0.0007178256055340171, + "learning_rate": 2.9105125977410947e-06, + "loss": 0.0007, + "step": 17350 + }, + { + "epoch": 3.7706342311033882, + "grad_norm": 0.0007388383965007961, + "learning_rate": 2.883362293657689e-06, + "loss": 0.0004, + "step": 17360 + }, + { + "epoch": 3.7728062554300608, + "grad_norm": 0.0007393441046588123, + "learning_rate": 2.8562119895742833e-06, + "loss": 0.0002, + "step": 17370 + }, + { + "epoch": 3.7749782797567333, + "grad_norm": 0.0007252399227581918, + "learning_rate": 2.8290616854908776e-06, + "loss": 0.0002, + "step": 17380 + }, + { + "epoch": 3.777150304083406, + "grad_norm": 0.0008031509933061898, + "learning_rate": 2.801911381407472e-06, + "loss": 0.0002, + "step": 17390 + }, + { + "epoch": 3.7793223284100783, + "grad_norm": 0.0007574482006020844, + "learning_rate": 2.774761077324066e-06, + "loss": 0.012, + "step": 17400 + }, + { + "epoch": 3.7814943527367504, + "grad_norm": 0.0009023218881338835, + "learning_rate": 2.7476107732406605e-06, + "loss": 0.0003, + "step": 17410 + }, + { + "epoch": 3.7836663770634233, + "grad_norm": 0.0007265750900842249, + "learning_rate": 2.7204604691572547e-06, + "loss": 0.0298, + "step": 17420 + }, + { + "epoch": 3.7858384013900954, + "grad_norm": 0.0007209655013866723, + "learning_rate": 2.693310165073849e-06, + "loss": 0.0002, + "step": 17430 + }, + { + "epoch": 3.788010425716768, + "grad_norm": 0.0007346943602897227, + "learning_rate": 2.6661598609904433e-06, + "loss": 0.0006, + "step": 17440 + }, + { + "epoch": 3.7901824500434405, + "grad_norm": 0.000738593575078994, + "learning_rate": 2.6390095569070376e-06, + "loss": 0.0004, + "step": 17450 + }, + { + "epoch": 3.792354474370113, + "grad_norm": 0.0007343738689087331, + "learning_rate": 2.6118592528236315e-06, + "loss": 0.0003, + "step": 17460 + }, + { + "epoch": 3.7945264986967855, + "grad_norm": 0.007034082897007465, + "learning_rate": 2.5847089487402258e-06, + "loss": 0.0003, + "step": 17470 + }, + { + "epoch": 3.796698523023458, + "grad_norm": 0.0068852780386805534, + "learning_rate": 2.55755864465682e-06, + "loss": 0.0084, + "step": 17480 + }, + { + "epoch": 3.7988705473501305, + "grad_norm": 0.022389423102140427, + "learning_rate": 2.5304083405734148e-06, + "loss": 0.0004, + "step": 17490 + }, + { + "epoch": 3.8010425716768026, + "grad_norm": 0.0008509616018272936, + "learning_rate": 2.503258036490009e-06, + "loss": 0.0001, + "step": 17500 + }, + { + "epoch": 3.803214596003475, + "grad_norm": 0.006721413694322109, + "learning_rate": 2.4761077324066033e-06, + "loss": 0.0004, + "step": 17510 + }, + { + "epoch": 3.8053866203301476, + "grad_norm": 0.000736792222596705, + "learning_rate": 2.448957428323197e-06, + "loss": 0.0001, + "step": 17520 + }, + { + "epoch": 3.80755864465682, + "grad_norm": 0.001253137830644846, + "learning_rate": 2.4218071242397915e-06, + "loss": 0.0002, + "step": 17530 + }, + { + "epoch": 3.8097306689834927, + "grad_norm": 0.0007209026953205466, + "learning_rate": 2.3946568201563858e-06, + "loss": 0.0001, + "step": 17540 + }, + { + "epoch": 3.811902693310165, + "grad_norm": 0.0007342109456658363, + "learning_rate": 2.36750651607298e-06, + "loss": 0.0001, + "step": 17550 + }, + { + "epoch": 3.8140747176368377, + "grad_norm": 0.0008827606798149645, + "learning_rate": 2.3403562119895743e-06, + "loss": 0.0067, + "step": 17560 + }, + { + "epoch": 3.8162467419635098, + "grad_norm": 0.0007216184167191386, + "learning_rate": 2.3132059079061686e-06, + "loss": 0.0001, + "step": 17570 + }, + { + "epoch": 3.8184187662901823, + "grad_norm": 0.0007301006116904318, + "learning_rate": 2.286055603822763e-06, + "loss": 0.0002, + "step": 17580 + }, + { + "epoch": 3.820590790616855, + "grad_norm": 0.0007518759812228382, + "learning_rate": 2.258905299739357e-06, + "loss": 0.0001, + "step": 17590 + }, + { + "epoch": 3.8227628149435273, + "grad_norm": 0.000788421428296715, + "learning_rate": 2.2317549956559515e-06, + "loss": 0.0001, + "step": 17600 + }, + { + "epoch": 3.8249348392702, + "grad_norm": 0.0007337273564189672, + "learning_rate": 2.2046046915725458e-06, + "loss": 0.0001, + "step": 17610 + }, + { + "epoch": 3.8271068635968724, + "grad_norm": 0.0007254479569382966, + "learning_rate": 2.17745438748914e-06, + "loss": 0.0002, + "step": 17620 + }, + { + "epoch": 3.829278887923545, + "grad_norm": 0.0007236094097606838, + "learning_rate": 2.1503040834057344e-06, + "loss": 0.0062, + "step": 17630 + }, + { + "epoch": 3.831450912250217, + "grad_norm": 0.005401818081736565, + "learning_rate": 2.1231537793223286e-06, + "loss": 0.0003, + "step": 17640 + }, + { + "epoch": 3.83362293657689, + "grad_norm": 0.0007210998446680605, + "learning_rate": 2.0960034752389225e-06, + "loss": 0.0005, + "step": 17650 + }, + { + "epoch": 3.835794960903562, + "grad_norm": 0.005460330750793219, + "learning_rate": 2.068853171155517e-06, + "loss": 0.0002, + "step": 17660 + }, + { + "epoch": 3.8379669852302345, + "grad_norm": 0.00539832329377532, + "learning_rate": 2.041702867072111e-06, + "loss": 0.0002, + "step": 17670 + }, + { + "epoch": 3.840139009556907, + "grad_norm": 0.0007603506674058735, + "learning_rate": 2.0145525629887054e-06, + "loss": 0.0001, + "step": 17680 + }, + { + "epoch": 3.8423110338835795, + "grad_norm": 0.0007322979508899152, + "learning_rate": 1.9874022589053e-06, + "loss": 0.0003, + "step": 17690 + }, + { + "epoch": 3.844483058210252, + "grad_norm": 0.0007165221031755209, + "learning_rate": 1.9602519548218944e-06, + "loss": 0.0206, + "step": 17700 + }, + { + "epoch": 3.8466550825369246, + "grad_norm": 0.0007267245091497898, + "learning_rate": 1.9331016507384887e-06, + "loss": 0.0003, + "step": 17710 + }, + { + "epoch": 3.848827106863597, + "grad_norm": 0.0007173445192165673, + "learning_rate": 1.9059513466550827e-06, + "loss": 0.0001, + "step": 17720 + }, + { + "epoch": 3.850999131190269, + "grad_norm": 0.0007399597088806331, + "learning_rate": 1.8788010425716768e-06, + "loss": 0.001, + "step": 17730 + }, + { + "epoch": 3.8531711555169417, + "grad_norm": 0.0007208718452602625, + "learning_rate": 1.851650738488271e-06, + "loss": 0.0001, + "step": 17740 + }, + { + "epoch": 3.855343179843614, + "grad_norm": 0.8270201086997986, + "learning_rate": 1.8245004344048654e-06, + "loss": 0.0172, + "step": 17750 + }, + { + "epoch": 3.8575152041702867, + "grad_norm": 0.0016780218575149775, + "learning_rate": 1.7973501303214597e-06, + "loss": 0.0001, + "step": 17760 + }, + { + "epoch": 3.859687228496959, + "grad_norm": 0.8392778038978577, + "learning_rate": 1.7701998262380538e-06, + "loss": 0.01, + "step": 17770 + }, + { + "epoch": 3.8618592528236317, + "grad_norm": 0.0007236091187223792, + "learning_rate": 1.743049522154648e-06, + "loss": 0.0002, + "step": 17780 + }, + { + "epoch": 3.8640312771503043, + "grad_norm": 0.05827876180410385, + "learning_rate": 1.7158992180712425e-06, + "loss": 0.0005, + "step": 17790 + }, + { + "epoch": 3.8662033014769763, + "grad_norm": 0.0007236993405967951, + "learning_rate": 1.6887489139878368e-06, + "loss": 0.0001, + "step": 17800 + }, + { + "epoch": 3.868375325803649, + "grad_norm": 0.0007266022148542106, + "learning_rate": 1.6615986099044311e-06, + "loss": 0.0009, + "step": 17810 + }, + { + "epoch": 3.8705473501303214, + "grad_norm": 0.007087093777954578, + "learning_rate": 1.6344483058210254e-06, + "loss": 0.0054, + "step": 17820 + }, + { + "epoch": 3.872719374456994, + "grad_norm": 0.0007367506041191518, + "learning_rate": 1.6072980017376195e-06, + "loss": 0.0003, + "step": 17830 + }, + { + "epoch": 3.8748913987836664, + "grad_norm": 0.0007232032367028296, + "learning_rate": 1.5801476976542138e-06, + "loss": 0.0004, + "step": 17840 + }, + { + "epoch": 3.877063423110339, + "grad_norm": 0.0007434978033415973, + "learning_rate": 1.552997393570808e-06, + "loss": 0.0002, + "step": 17850 + }, + { + "epoch": 3.8792354474370114, + "grad_norm": 0.0007194494246505201, + "learning_rate": 1.5258470894874023e-06, + "loss": 0.0001, + "step": 17860 + }, + { + "epoch": 3.8814074717636835, + "grad_norm": 0.0007267376640811563, + "learning_rate": 1.4986967854039966e-06, + "loss": 0.0001, + "step": 17870 + }, + { + "epoch": 3.8835794960903565, + "grad_norm": 0.0007254155352711678, + "learning_rate": 1.471546481320591e-06, + "loss": 0.0002, + "step": 17880 + }, + { + "epoch": 3.8857515204170285, + "grad_norm": 0.0007285097963176668, + "learning_rate": 1.4443961772371852e-06, + "loss": 0.0099, + "step": 17890 + }, + { + "epoch": 3.887923544743701, + "grad_norm": 0.0007246483583003283, + "learning_rate": 1.4172458731537793e-06, + "loss": 0.0001, + "step": 17900 + }, + { + "epoch": 3.8900955690703736, + "grad_norm": 0.0007502794032916427, + "learning_rate": 1.3900955690703736e-06, + "loss": 0.0003, + "step": 17910 + }, + { + "epoch": 3.892267593397046, + "grad_norm": 0.00836797896772623, + "learning_rate": 1.362945264986968e-06, + "loss": 0.0001, + "step": 17920 + }, + { + "epoch": 3.8944396177237186, + "grad_norm": 0.0007197211962193251, + "learning_rate": 1.3357949609035621e-06, + "loss": 0.0001, + "step": 17930 + }, + { + "epoch": 3.896611642050391, + "grad_norm": 0.008673655800521374, + "learning_rate": 1.3086446568201564e-06, + "loss": 0.0005, + "step": 17940 + }, + { + "epoch": 3.8987836663770636, + "grad_norm": 0.0007204540306702256, + "learning_rate": 1.2814943527367507e-06, + "loss": 0.0003, + "step": 17950 + }, + { + "epoch": 3.9009556907037357, + "grad_norm": 0.0007173253106884658, + "learning_rate": 1.2543440486533448e-06, + "loss": 0.0001, + "step": 17960 + }, + { + "epoch": 3.9031277150304082, + "grad_norm": 0.0007573234033770859, + "learning_rate": 1.2271937445699393e-06, + "loss": 0.0009, + "step": 17970 + }, + { + "epoch": 3.9052997393570807, + "grad_norm": 0.0007323205936700106, + "learning_rate": 1.2000434404865336e-06, + "loss": 0.0055, + "step": 17980 + }, + { + "epoch": 3.9074717636837533, + "grad_norm": 0.0007362981559708714, + "learning_rate": 1.1728931364031277e-06, + "loss": 0.0092, + "step": 17990 + }, + { + "epoch": 3.909643788010426, + "grad_norm": 0.00073151447577402, + "learning_rate": 1.145742832319722e-06, + "loss": 0.0001, + "step": 18000 + }, + { + "epoch": 3.9118158123370983, + "grad_norm": 0.0007298584096133709, + "learning_rate": 1.1185925282363162e-06, + "loss": 0.0002, + "step": 18010 + }, + { + "epoch": 3.913987836663771, + "grad_norm": 0.0007194079225882888, + "learning_rate": 1.0914422241529105e-06, + "loss": 0.0037, + "step": 18020 + }, + { + "epoch": 3.916159860990443, + "grad_norm": 0.0007370563107542694, + "learning_rate": 1.0642919200695048e-06, + "loss": 0.0049, + "step": 18030 + }, + { + "epoch": 3.9183318853171154, + "grad_norm": 0.0009322396363131702, + "learning_rate": 1.037141615986099e-06, + "loss": 0.0038, + "step": 18040 + }, + { + "epoch": 3.920503909643788, + "grad_norm": 0.000749268860090524, + "learning_rate": 1.0099913119026934e-06, + "loss": 0.0001, + "step": 18050 + }, + { + "epoch": 3.9226759339704604, + "grad_norm": 0.0007187744486145675, + "learning_rate": 9.828410078192875e-07, + "loss": 0.0002, + "step": 18060 + }, + { + "epoch": 3.924847958297133, + "grad_norm": 0.0009798071114346385, + "learning_rate": 9.55690703735882e-07, + "loss": 0.0001, + "step": 18070 + }, + { + "epoch": 3.9270199826238055, + "grad_norm": 0.000750518636777997, + "learning_rate": 9.285403996524761e-07, + "loss": 0.0001, + "step": 18080 + }, + { + "epoch": 3.929192006950478, + "grad_norm": 0.0007136166095733643, + "learning_rate": 9.013900955690704e-07, + "loss": 0.0001, + "step": 18090 + }, + { + "epoch": 3.93136403127715, + "grad_norm": 0.0007220849511213601, + "learning_rate": 8.742397914856646e-07, + "loss": 0.0001, + "step": 18100 + }, + { + "epoch": 3.933536055603823, + "grad_norm": 0.007109349127858877, + "learning_rate": 8.470894874022589e-07, + "loss": 0.0004, + "step": 18110 + }, + { + "epoch": 3.935708079930495, + "grad_norm": 0.0007139624794945121, + "learning_rate": 8.199391833188533e-07, + "loss": 0.0002, + "step": 18120 + }, + { + "epoch": 3.9378801042571676, + "grad_norm": 0.0007277438417077065, + "learning_rate": 7.927888792354475e-07, + "loss": 0.0063, + "step": 18130 + }, + { + "epoch": 3.94005212858384, + "grad_norm": 0.0007169364835135639, + "learning_rate": 7.656385751520418e-07, + "loss": 0.0001, + "step": 18140 + }, + { + "epoch": 3.9422241529105126, + "grad_norm": 0.000733832479454577, + "learning_rate": 7.384882710686359e-07, + "loss": 0.0001, + "step": 18150 + }, + { + "epoch": 3.944396177237185, + "grad_norm": 0.0007185288704931736, + "learning_rate": 7.113379669852302e-07, + "loss": 0.0002, + "step": 18160 + }, + { + "epoch": 3.9465682015638577, + "grad_norm": 0.000867704045958817, + "learning_rate": 6.841876629018245e-07, + "loss": 0.0003, + "step": 18170 + }, + { + "epoch": 3.94874022589053, + "grad_norm": 0.0007935749599710107, + "learning_rate": 6.570373588184188e-07, + "loss": 0.0002, + "step": 18180 + }, + { + "epoch": 3.9509122502172023, + "grad_norm": 0.0007953582680784166, + "learning_rate": 6.298870547350131e-07, + "loss": 0.0001, + "step": 18190 + }, + { + "epoch": 3.953084274543875, + "grad_norm": 0.000717403250746429, + "learning_rate": 6.027367506516073e-07, + "loss": 0.0001, + "step": 18200 + }, + { + "epoch": 3.9552562988705473, + "grad_norm": 0.0007746173650957644, + "learning_rate": 5.755864465682016e-07, + "loss": 0.0003, + "step": 18210 + }, + { + "epoch": 3.95742832319722, + "grad_norm": 0.0007847067317925394, + "learning_rate": 5.484361424847959e-07, + "loss": 0.0001, + "step": 18220 + }, + { + "epoch": 3.9596003475238923, + "grad_norm": 0.005621789488941431, + "learning_rate": 5.212858384013901e-07, + "loss": 0.0001, + "step": 18230 + }, + { + "epoch": 3.961772371850565, + "grad_norm": 0.0007133004837669432, + "learning_rate": 4.941355343179844e-07, + "loss": 0.0004, + "step": 18240 + }, + { + "epoch": 3.9639443961772374, + "grad_norm": 0.00072172109503299, + "learning_rate": 4.669852302345786e-07, + "loss": 0.0002, + "step": 18250 + }, + { + "epoch": 3.9661164205039094, + "grad_norm": 0.0070555261336266994, + "learning_rate": 4.3983492615117295e-07, + "loss": 0.0002, + "step": 18260 + }, + { + "epoch": 3.968288444830582, + "grad_norm": 0.0007262382423505187, + "learning_rate": 4.126846220677672e-07, + "loss": 0.0002, + "step": 18270 + }, + { + "epoch": 3.9704604691572545, + "grad_norm": 0.0007373811677098274, + "learning_rate": 3.855343179843614e-07, + "loss": 0.0008, + "step": 18280 + }, + { + "epoch": 3.972632493483927, + "grad_norm": 0.0007229727343656123, + "learning_rate": 3.583840139009557e-07, + "loss": 0.0001, + "step": 18290 + }, + { + "epoch": 3.9748045178105995, + "grad_norm": 0.0007267961045727134, + "learning_rate": 3.3123370981755e-07, + "loss": 0.0001, + "step": 18300 + }, + { + "epoch": 3.976976542137272, + "grad_norm": 0.17784090340137482, + "learning_rate": 3.0408340573414423e-07, + "loss": 0.0035, + "step": 18310 + }, + { + "epoch": 3.9791485664639445, + "grad_norm": 0.0007294813403859735, + "learning_rate": 2.7693310165073847e-07, + "loss": 0.0002, + "step": 18320 + }, + { + "epoch": 3.9813205907906166, + "grad_norm": 0.0007423889474011958, + "learning_rate": 2.4978279756733275e-07, + "loss": 0.0001, + "step": 18330 + }, + { + "epoch": 3.9834926151172896, + "grad_norm": 0.0007235651719383895, + "learning_rate": 2.2263249348392704e-07, + "loss": 0.0001, + "step": 18340 + }, + { + "epoch": 3.9856646394439617, + "grad_norm": 0.0007269734633155167, + "learning_rate": 1.954821894005213e-07, + "loss": 0.0001, + "step": 18350 + }, + { + "epoch": 3.987836663770634, + "grad_norm": 0.3467176854610443, + "learning_rate": 1.6833188531711556e-07, + "loss": 0.0022, + "step": 18360 + }, + { + "epoch": 3.9900086880973067, + "grad_norm": 0.0007139446679502726, + "learning_rate": 1.4118158123370983e-07, + "loss": 0.0001, + "step": 18370 + }, + { + "epoch": 3.992180712423979, + "grad_norm": 0.000727241684217006, + "learning_rate": 1.1403127715030409e-07, + "loss": 0.0002, + "step": 18380 + }, + { + "epoch": 3.9943527367506517, + "grad_norm": 0.000711127242539078, + "learning_rate": 8.688097306689835e-08, + "loss": 0.0001, + "step": 18390 + }, + { + "epoch": 3.996524761077324, + "grad_norm": 0.0008922716369852424, + "learning_rate": 5.973066898349262e-08, + "loss": 0.0001, + "step": 18400 + }, + { + "epoch": 3.9986967854039968, + "grad_norm": 0.0007189746247604489, + "learning_rate": 3.2580364900086884e-08, + "loss": 0.0002, + "step": 18410 + }, + { + "epoch": 4.0, + "eval_f1": 0.6523297491039427, + "eval_loss": 0.06827918440103531, + "eval_runtime": 83.7948, + "eval_samples_per_second": 119.041, + "eval_steps_per_second": 7.447, + "step": 18416 + }, + { + "epoch": 4.0, + "step": 18416, + "total_flos": 2.2832239820043387e+19, + "train_loss": 0.018179216772772535, + "train_runtime": 6460.7853, + "train_samples_per_second": 45.604, + "train_steps_per_second": 2.85 + } + ], + "logging_steps": 10, + "max_steps": 18416, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2832239820043387e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}