diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,104333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9998993322371732, + "eval_steps": 500, + "global_step": 29800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013422368376900105, + "grad_norm": 171.1130828857422, + "learning_rate": 2.2371364653243848e-07, + "loss": 31.6684, + "step": 2 + }, + { + "epoch": 0.0002684473675380021, + "grad_norm": 176.14602661132812, + "learning_rate": 4.4742729306487696e-07, + "loss": 26.0574, + "step": 4 + }, + { + "epoch": 0.00040267105130700314, + "grad_norm": 161.99838256835938, + "learning_rate": 6.711409395973154e-07, + "loss": 25.5986, + "step": 6 + }, + { + "epoch": 0.0005368947350760042, + "grad_norm": 160.26161193847656, + "learning_rate": 8.948545861297539e-07, + "loss": 25.8844, + "step": 8 + }, + { + "epoch": 0.0006711184188450052, + "grad_norm": 135.05043029785156, + "learning_rate": 1.1185682326621925e-06, + "loss": 24.5937, + "step": 10 + }, + { + "epoch": 0.0008053421026140063, + "grad_norm": 141.6531524658203, + "learning_rate": 1.3422818791946309e-06, + "loss": 23.9209, + "step": 12 + }, + { + "epoch": 0.0009395657863830073, + "grad_norm": 136.20071411132812, + "learning_rate": 1.5659955257270695e-06, + "loss": 22.2859, + "step": 14 + }, + { + "epoch": 0.0010737894701520084, + "grad_norm": 125.7350082397461, + "learning_rate": 1.7897091722595078e-06, + "loss": 19.8666, + "step": 16 + }, + { + "epoch": 0.0012080131539210094, + "grad_norm": 115.17154693603516, + "learning_rate": 2.013422818791946e-06, + "loss": 19.1472, + "step": 18 + }, + { + "epoch": 0.0013422368376900104, + "grad_norm": 71.52286529541016, + "learning_rate": 2.237136465324385e-06, + "loss": 15.6429, + "step": 20 + }, + { + "epoch": 0.0014764605214590114, + "grad_norm": 59.182003021240234, + "learning_rate": 2.4608501118568234e-06, + "loss": 14.5513, + "step": 22 + }, + { + "epoch": 0.0016106842052280126, + "grad_norm": 49.64590072631836, + "learning_rate": 2.6845637583892617e-06, + "loss": 13.8686, + "step": 24 + }, + { + "epoch": 0.0017449078889970136, + "grad_norm": 32.98600387573242, + "learning_rate": 2.9082774049217e-06, + "loss": 12.4748, + "step": 26 + }, + { + "epoch": 0.0018791315727660146, + "grad_norm": 31.01237678527832, + "learning_rate": 3.131991051454139e-06, + "loss": 12.0216, + "step": 28 + }, + { + "epoch": 0.0020133552565350155, + "grad_norm": 21.60193634033203, + "learning_rate": 3.3557046979865773e-06, + "loss": 11.4164, + "step": 30 + }, + { + "epoch": 0.0021475789403040168, + "grad_norm": 19.610647201538086, + "learning_rate": 3.5794183445190157e-06, + "loss": 11.1246, + "step": 32 + }, + { + "epoch": 0.0022818026240730175, + "grad_norm": 16.299062728881836, + "learning_rate": 3.803131991051454e-06, + "loss": 10.7795, + "step": 34 + }, + { + "epoch": 0.0024160263078420187, + "grad_norm": 13.230311393737793, + "learning_rate": 4.026845637583892e-06, + "loss": 10.2409, + "step": 36 + }, + { + "epoch": 0.00255024999161102, + "grad_norm": 18.888996124267578, + "learning_rate": 4.250559284116332e-06, + "loss": 10.4125, + "step": 38 + }, + { + "epoch": 0.0026844736753800207, + "grad_norm": 12.800553321838379, + "learning_rate": 4.47427293064877e-06, + "loss": 10.0694, + "step": 40 + }, + { + "epoch": 0.002818697359149022, + "grad_norm": 13.493648529052734, + "learning_rate": 4.697986577181209e-06, + "loss": 9.8657, + "step": 42 + }, + { + "epoch": 0.0029529210429180227, + "grad_norm": 14.25110912322998, + "learning_rate": 4.921700223713647e-06, + "loss": 9.6782, + "step": 44 + }, + { + "epoch": 0.003087144726687024, + "grad_norm": 11.908875465393066, + "learning_rate": 5.1454138702460855e-06, + "loss": 9.365, + "step": 46 + }, + { + "epoch": 0.003221368410456025, + "grad_norm": 10.207379341125488, + "learning_rate": 5.3691275167785235e-06, + "loss": 9.2452, + "step": 48 + }, + { + "epoch": 0.003355592094225026, + "grad_norm": 16.149885177612305, + "learning_rate": 5.592841163310962e-06, + "loss": 9.0995, + "step": 50 + }, + { + "epoch": 0.003489815777994027, + "grad_norm": 14.250184059143066, + "learning_rate": 5.8165548098434e-06, + "loss": 8.9436, + "step": 52 + }, + { + "epoch": 0.003624039461763028, + "grad_norm": 17.353586196899414, + "learning_rate": 6.04026845637584e-06, + "loss": 8.798, + "step": 54 + }, + { + "epoch": 0.003758263145532029, + "grad_norm": 12.49612808227539, + "learning_rate": 6.263982102908278e-06, + "loss": 8.5168, + "step": 56 + }, + { + "epoch": 0.0038924868293010303, + "grad_norm": 13.603919982910156, + "learning_rate": 6.487695749440716e-06, + "loss": 8.5533, + "step": 58 + }, + { + "epoch": 0.004026710513070031, + "grad_norm": 15.124484062194824, + "learning_rate": 6.7114093959731546e-06, + "loss": 8.3539, + "step": 60 + }, + { + "epoch": 0.004160934196839032, + "grad_norm": 12.35384464263916, + "learning_rate": 6.935123042505594e-06, + "loss": 8.3328, + "step": 62 + }, + { + "epoch": 0.0042951578806080335, + "grad_norm": 10.686278343200684, + "learning_rate": 7.158836689038031e-06, + "loss": 8.1974, + "step": 64 + }, + { + "epoch": 0.004429381564377034, + "grad_norm": 11.038646697998047, + "learning_rate": 7.382550335570471e-06, + "loss": 7.9084, + "step": 66 + }, + { + "epoch": 0.004563605248146035, + "grad_norm": 11.763455390930176, + "learning_rate": 7.606263982102908e-06, + "loss": 7.8376, + "step": 68 + }, + { + "epoch": 0.004697828931915037, + "grad_norm": 10.757540702819824, + "learning_rate": 7.829977628635348e-06, + "loss": 7.6839, + "step": 70 + }, + { + "epoch": 0.0048320526156840375, + "grad_norm": 13.832391738891602, + "learning_rate": 8.053691275167785e-06, + "loss": 7.3858, + "step": 72 + }, + { + "epoch": 0.004966276299453038, + "grad_norm": 13.143087387084961, + "learning_rate": 8.277404921700224e-06, + "loss": 7.1794, + "step": 74 + }, + { + "epoch": 0.00510049998322204, + "grad_norm": 13.321001052856445, + "learning_rate": 8.501118568232664e-06, + "loss": 6.9488, + "step": 76 + }, + { + "epoch": 0.005234723666991041, + "grad_norm": 14.340903282165527, + "learning_rate": 8.724832214765101e-06, + "loss": 6.8885, + "step": 78 + }, + { + "epoch": 0.0053689473507600415, + "grad_norm": 12.12249755859375, + "learning_rate": 8.94854586129754e-06, + "loss": 6.5723, + "step": 80 + }, + { + "epoch": 0.005503171034529042, + "grad_norm": 13.309209823608398, + "learning_rate": 9.172259507829977e-06, + "loss": 6.3054, + "step": 82 + }, + { + "epoch": 0.005637394718298044, + "grad_norm": 11.140517234802246, + "learning_rate": 9.395973154362418e-06, + "loss": 6.2513, + "step": 84 + }, + { + "epoch": 0.005771618402067045, + "grad_norm": 11.450445175170898, + "learning_rate": 9.619686800894855e-06, + "loss": 6.0974, + "step": 86 + }, + { + "epoch": 0.005905842085836045, + "grad_norm": 12.40820026397705, + "learning_rate": 9.843400447427293e-06, + "loss": 5.9598, + "step": 88 + }, + { + "epoch": 0.006040065769605047, + "grad_norm": 10.447242736816406, + "learning_rate": 1.006711409395973e-05, + "loss": 5.9528, + "step": 90 + }, + { + "epoch": 0.006174289453374048, + "grad_norm": 9.825074195861816, + "learning_rate": 1.0290827740492171e-05, + "loss": 5.8049, + "step": 92 + }, + { + "epoch": 0.006308513137143049, + "grad_norm": 9.361897468566895, + "learning_rate": 1.051454138702461e-05, + "loss": 5.7073, + "step": 94 + }, + { + "epoch": 0.00644273682091205, + "grad_norm": 10.115484237670898, + "learning_rate": 1.0738255033557047e-05, + "loss": 5.8558, + "step": 96 + }, + { + "epoch": 0.006576960504681051, + "grad_norm": 9.615787506103516, + "learning_rate": 1.0961968680089486e-05, + "loss": 5.5608, + "step": 98 + }, + { + "epoch": 0.006711184188450052, + "grad_norm": 9.042574882507324, + "learning_rate": 1.1185682326621925e-05, + "loss": 5.4409, + "step": 100 + }, + { + "epoch": 0.0068454078722190535, + "grad_norm": 8.968979835510254, + "learning_rate": 1.1409395973154363e-05, + "loss": 5.4345, + "step": 102 + }, + { + "epoch": 0.006979631555988054, + "grad_norm": 9.127655982971191, + "learning_rate": 1.16331096196868e-05, + "loss": 5.2327, + "step": 104 + }, + { + "epoch": 0.007113855239757055, + "grad_norm": 9.799625396728516, + "learning_rate": 1.185682326621924e-05, + "loss": 5.3695, + "step": 106 + }, + { + "epoch": 0.007248078923526056, + "grad_norm": 8.575567245483398, + "learning_rate": 1.208053691275168e-05, + "loss": 5.137, + "step": 108 + }, + { + "epoch": 0.007382302607295057, + "grad_norm": 10.603007316589355, + "learning_rate": 1.2304250559284117e-05, + "loss": 5.0931, + "step": 110 + }, + { + "epoch": 0.007516526291064058, + "grad_norm": 9.567828178405762, + "learning_rate": 1.2527964205816556e-05, + "loss": 5.0452, + "step": 112 + }, + { + "epoch": 0.007650749974833059, + "grad_norm": 8.963132858276367, + "learning_rate": 1.2751677852348994e-05, + "loss": 5.4306, + "step": 114 + }, + { + "epoch": 0.007784973658602061, + "grad_norm": 9.023338317871094, + "learning_rate": 1.2975391498881432e-05, + "loss": 5.26, + "step": 116 + }, + { + "epoch": 0.007919197342371061, + "grad_norm": 8.850415229797363, + "learning_rate": 1.319910514541387e-05, + "loss": 5.1219, + "step": 118 + }, + { + "epoch": 0.008053421026140062, + "grad_norm": 8.642598152160645, + "learning_rate": 1.3422818791946309e-05, + "loss": 5.0104, + "step": 120 + }, + { + "epoch": 0.008187644709909063, + "grad_norm": 9.182424545288086, + "learning_rate": 1.3646532438478748e-05, + "loss": 5.0709, + "step": 122 + }, + { + "epoch": 0.008321868393678064, + "grad_norm": 9.25109577178955, + "learning_rate": 1.3870246085011188e-05, + "loss": 4.7935, + "step": 124 + }, + { + "epoch": 0.008456092077447066, + "grad_norm": 10.07085132598877, + "learning_rate": 1.4093959731543624e-05, + "loss": 5.1152, + "step": 126 + }, + { + "epoch": 0.008590315761216067, + "grad_norm": 8.49173355102539, + "learning_rate": 1.4317673378076063e-05, + "loss": 5.0015, + "step": 128 + }, + { + "epoch": 0.008724539444985068, + "grad_norm": 9.74452018737793, + "learning_rate": 1.4541387024608501e-05, + "loss": 4.7356, + "step": 130 + }, + { + "epoch": 0.008858763128754069, + "grad_norm": 8.353522300720215, + "learning_rate": 1.4765100671140942e-05, + "loss": 4.7962, + "step": 132 + }, + { + "epoch": 0.00899298681252307, + "grad_norm": 10.006658554077148, + "learning_rate": 1.4988814317673377e-05, + "loss": 4.7493, + "step": 134 + }, + { + "epoch": 0.00912721049629207, + "grad_norm": 8.943411827087402, + "learning_rate": 1.5212527964205816e-05, + "loss": 4.8692, + "step": 136 + }, + { + "epoch": 0.009261434180061073, + "grad_norm": 8.465804100036621, + "learning_rate": 1.5436241610738255e-05, + "loss": 4.6024, + "step": 138 + }, + { + "epoch": 0.009395657863830073, + "grad_norm": 8.696969985961914, + "learning_rate": 1.5659955257270695e-05, + "loss": 4.6939, + "step": 140 + }, + { + "epoch": 0.009529881547599074, + "grad_norm": 8.411632537841797, + "learning_rate": 1.5883668903803133e-05, + "loss": 4.6008, + "step": 142 + }, + { + "epoch": 0.009664105231368075, + "grad_norm": 7.8183722496032715, + "learning_rate": 1.610738255033557e-05, + "loss": 4.6418, + "step": 144 + }, + { + "epoch": 0.009798328915137076, + "grad_norm": 8.37464714050293, + "learning_rate": 1.633109619686801e-05, + "loss": 4.7527, + "step": 146 + }, + { + "epoch": 0.009932552598906077, + "grad_norm": 8.137810707092285, + "learning_rate": 1.6554809843400447e-05, + "loss": 4.6414, + "step": 148 + }, + { + "epoch": 0.010066776282675077, + "grad_norm": 8.560234069824219, + "learning_rate": 1.6778523489932888e-05, + "loss": 4.5897, + "step": 150 + }, + { + "epoch": 0.01020099996644408, + "grad_norm": 8.440449714660645, + "learning_rate": 1.7002237136465328e-05, + "loss": 4.6773, + "step": 152 + }, + { + "epoch": 0.01033522365021308, + "grad_norm": 8.208459854125977, + "learning_rate": 1.7225950782997762e-05, + "loss": 4.4353, + "step": 154 + }, + { + "epoch": 0.010469447333982081, + "grad_norm": 8.27092170715332, + "learning_rate": 1.7449664429530202e-05, + "loss": 4.5743, + "step": 156 + }, + { + "epoch": 0.010603671017751082, + "grad_norm": 8.993355751037598, + "learning_rate": 1.767337807606264e-05, + "loss": 4.481, + "step": 158 + }, + { + "epoch": 0.010737894701520083, + "grad_norm": 8.263463973999023, + "learning_rate": 1.789709172259508e-05, + "loss": 4.3719, + "step": 160 + }, + { + "epoch": 0.010872118385289084, + "grad_norm": 7.953629016876221, + "learning_rate": 1.8120805369127517e-05, + "loss": 4.4273, + "step": 162 + }, + { + "epoch": 0.011006342069058084, + "grad_norm": 7.4534993171691895, + "learning_rate": 1.8344519015659954e-05, + "loss": 4.4569, + "step": 164 + }, + { + "epoch": 0.011140565752827087, + "grad_norm": 8.82858657836914, + "learning_rate": 1.8568232662192395e-05, + "loss": 4.6424, + "step": 166 + }, + { + "epoch": 0.011274789436596088, + "grad_norm": 8.202412605285645, + "learning_rate": 1.8791946308724835e-05, + "loss": 4.5939, + "step": 168 + }, + { + "epoch": 0.011409013120365089, + "grad_norm": 8.504288673400879, + "learning_rate": 1.9015659955257272e-05, + "loss": 4.2646, + "step": 170 + }, + { + "epoch": 0.01154323680413409, + "grad_norm": 8.530281066894531, + "learning_rate": 1.923937360178971e-05, + "loss": 4.6194, + "step": 172 + }, + { + "epoch": 0.01167746048790309, + "grad_norm": 8.400354385375977, + "learning_rate": 1.946308724832215e-05, + "loss": 4.5271, + "step": 174 + }, + { + "epoch": 0.01181168417167209, + "grad_norm": 7.966704845428467, + "learning_rate": 1.9686800894854587e-05, + "loss": 4.0851, + "step": 176 + }, + { + "epoch": 0.011945907855441093, + "grad_norm": 8.812636375427246, + "learning_rate": 1.9910514541387027e-05, + "loss": 4.3592, + "step": 178 + }, + { + "epoch": 0.012080131539210094, + "grad_norm": 7.87779426574707, + "learning_rate": 2.013422818791946e-05, + "loss": 4.3825, + "step": 180 + }, + { + "epoch": 0.012214355222979095, + "grad_norm": 8.381267547607422, + "learning_rate": 2.03579418344519e-05, + "loss": 4.4824, + "step": 182 + }, + { + "epoch": 0.012348578906748096, + "grad_norm": 8.175226211547852, + "learning_rate": 2.0581655480984342e-05, + "loss": 4.3559, + "step": 184 + }, + { + "epoch": 0.012482802590517096, + "grad_norm": 8.237099647521973, + "learning_rate": 2.080536912751678e-05, + "loss": 4.4196, + "step": 186 + }, + { + "epoch": 0.012617026274286097, + "grad_norm": 8.391732215881348, + "learning_rate": 2.102908277404922e-05, + "loss": 4.3984, + "step": 188 + }, + { + "epoch": 0.012751249958055098, + "grad_norm": 7.575904846191406, + "learning_rate": 2.1252796420581657e-05, + "loss": 4.4591, + "step": 190 + }, + { + "epoch": 0.0128854736418241, + "grad_norm": 7.631547451019287, + "learning_rate": 2.1476510067114094e-05, + "loss": 4.2241, + "step": 192 + }, + { + "epoch": 0.013019697325593101, + "grad_norm": 7.607235431671143, + "learning_rate": 2.1700223713646534e-05, + "loss": 4.4159, + "step": 194 + }, + { + "epoch": 0.013153921009362102, + "grad_norm": 8.409624099731445, + "learning_rate": 2.192393736017897e-05, + "loss": 4.5411, + "step": 196 + }, + { + "epoch": 0.013288144693131103, + "grad_norm": 9.032918930053711, + "learning_rate": 2.2147651006711412e-05, + "loss": 4.4841, + "step": 198 + }, + { + "epoch": 0.013422368376900104, + "grad_norm": 7.431969165802002, + "learning_rate": 2.237136465324385e-05, + "loss": 4.2217, + "step": 200 + }, + { + "epoch": 0.013556592060669104, + "grad_norm": 7.810758113861084, + "learning_rate": 2.2595078299776286e-05, + "loss": 4.5364, + "step": 202 + }, + { + "epoch": 0.013690815744438107, + "grad_norm": 7.845270156860352, + "learning_rate": 2.2818791946308727e-05, + "loss": 4.1384, + "step": 204 + }, + { + "epoch": 0.013825039428207108, + "grad_norm": 8.138202667236328, + "learning_rate": 2.3042505592841164e-05, + "loss": 4.273, + "step": 206 + }, + { + "epoch": 0.013959263111976108, + "grad_norm": 7.94398307800293, + "learning_rate": 2.32662192393736e-05, + "loss": 4.0684, + "step": 208 + }, + { + "epoch": 0.01409348679574511, + "grad_norm": 7.332570552825928, + "learning_rate": 2.348993288590604e-05, + "loss": 3.9733, + "step": 210 + }, + { + "epoch": 0.01422771047951411, + "grad_norm": 7.195709705352783, + "learning_rate": 2.371364653243848e-05, + "loss": 4.1796, + "step": 212 + }, + { + "epoch": 0.01436193416328311, + "grad_norm": 7.754658222198486, + "learning_rate": 2.393736017897092e-05, + "loss": 4.1152, + "step": 214 + }, + { + "epoch": 0.014496157847052112, + "grad_norm": 7.562770366668701, + "learning_rate": 2.416107382550336e-05, + "loss": 4.1977, + "step": 216 + }, + { + "epoch": 0.014630381530821114, + "grad_norm": 7.157684803009033, + "learning_rate": 2.4384787472035793e-05, + "loss": 4.0194, + "step": 218 + }, + { + "epoch": 0.014764605214590115, + "grad_norm": 7.642800331115723, + "learning_rate": 2.4608501118568234e-05, + "loss": 4.1962, + "step": 220 + }, + { + "epoch": 0.014898828898359116, + "grad_norm": 8.98104476928711, + "learning_rate": 2.4832214765100674e-05, + "loss": 4.1585, + "step": 222 + }, + { + "epoch": 0.015033052582128116, + "grad_norm": 7.569408416748047, + "learning_rate": 2.505592841163311e-05, + "loss": 4.0645, + "step": 224 + }, + { + "epoch": 0.015167276265897117, + "grad_norm": 8.704858779907227, + "learning_rate": 2.527964205816555e-05, + "loss": 4.3407, + "step": 226 + }, + { + "epoch": 0.015301499949666118, + "grad_norm": 8.129752159118652, + "learning_rate": 2.550335570469799e-05, + "loss": 4.269, + "step": 228 + }, + { + "epoch": 0.01543572363343512, + "grad_norm": 7.098968029022217, + "learning_rate": 2.5727069351230426e-05, + "loss": 4.239, + "step": 230 + }, + { + "epoch": 0.015569947317204121, + "grad_norm": 7.034340858459473, + "learning_rate": 2.5950782997762863e-05, + "loss": 4.0018, + "step": 232 + }, + { + "epoch": 0.01570417100097312, + "grad_norm": 7.63564920425415, + "learning_rate": 2.6174496644295304e-05, + "loss": 4.1808, + "step": 234 + }, + { + "epoch": 0.015838394684742123, + "grad_norm": 21.066104888916016, + "learning_rate": 2.639821029082774e-05, + "loss": 4.2819, + "step": 236 + }, + { + "epoch": 0.015972618368511125, + "grad_norm": 9.885736465454102, + "learning_rate": 2.662192393736018e-05, + "loss": 4.0867, + "step": 238 + }, + { + "epoch": 0.016106842052280124, + "grad_norm": 37.06069564819336, + "learning_rate": 2.6845637583892618e-05, + "loss": 4.0657, + "step": 240 + }, + { + "epoch": 0.016241065736049127, + "grad_norm": 8.000858306884766, + "learning_rate": 2.7069351230425055e-05, + "loss": 4.2649, + "step": 242 + }, + { + "epoch": 0.016375289419818126, + "grad_norm": 7.614824295043945, + "learning_rate": 2.7293064876957496e-05, + "loss": 3.9866, + "step": 244 + }, + { + "epoch": 0.01650951310358713, + "grad_norm": 8.951519012451172, + "learning_rate": 2.7516778523489933e-05, + "loss": 4.0407, + "step": 246 + }, + { + "epoch": 0.016643736787356127, + "grad_norm": 7.866461277008057, + "learning_rate": 2.7740492170022377e-05, + "loss": 4.2786, + "step": 248 + }, + { + "epoch": 0.01677796047112513, + "grad_norm": 7.071528911590576, + "learning_rate": 2.796420581655481e-05, + "loss": 3.9093, + "step": 250 + }, + { + "epoch": 0.016912184154894132, + "grad_norm": 7.511150360107422, + "learning_rate": 2.8187919463087248e-05, + "loss": 4.2382, + "step": 252 + }, + { + "epoch": 0.01704640783866313, + "grad_norm": 6.744004249572754, + "learning_rate": 2.8411633109619688e-05, + "loss": 3.9463, + "step": 254 + }, + { + "epoch": 0.017180631522432134, + "grad_norm": 7.476906776428223, + "learning_rate": 2.8635346756152125e-05, + "loss": 4.0695, + "step": 256 + }, + { + "epoch": 0.017314855206201133, + "grad_norm": 7.356006145477295, + "learning_rate": 2.885906040268457e-05, + "loss": 4.1326, + "step": 258 + }, + { + "epoch": 0.017449078889970136, + "grad_norm": 8.37822151184082, + "learning_rate": 2.9082774049217003e-05, + "loss": 3.9508, + "step": 260 + }, + { + "epoch": 0.017583302573739135, + "grad_norm": 7.0157599449157715, + "learning_rate": 2.930648769574944e-05, + "loss": 4.1541, + "step": 262 + }, + { + "epoch": 0.017717526257508137, + "grad_norm": 7.196605205535889, + "learning_rate": 2.9530201342281884e-05, + "loss": 3.7216, + "step": 264 + }, + { + "epoch": 0.01785174994127714, + "grad_norm": 7.066977024078369, + "learning_rate": 2.9753914988814318e-05, + "loss": 3.9887, + "step": 266 + }, + { + "epoch": 0.01798597362504614, + "grad_norm": 7.5818376541137695, + "learning_rate": 2.9977628635346755e-05, + "loss": 3.9404, + "step": 268 + }, + { + "epoch": 0.01812019730881514, + "grad_norm": 13.544082641601562, + "learning_rate": 3.02013422818792e-05, + "loss": 4.0594, + "step": 270 + }, + { + "epoch": 0.01825442099258414, + "grad_norm": 7.587532997131348, + "learning_rate": 3.0425055928411632e-05, + "loss": 4.0585, + "step": 272 + }, + { + "epoch": 0.018388644676353143, + "grad_norm": 6.641517162322998, + "learning_rate": 3.0648769574944076e-05, + "loss": 4.0138, + "step": 274 + }, + { + "epoch": 0.018522868360122145, + "grad_norm": 7.253713607788086, + "learning_rate": 3.087248322147651e-05, + "loss": 4.2718, + "step": 276 + }, + { + "epoch": 0.018657092043891144, + "grad_norm": 7.163355827331543, + "learning_rate": 3.109619686800895e-05, + "loss": 3.8726, + "step": 278 + }, + { + "epoch": 0.018791315727660147, + "grad_norm": 7.897916316986084, + "learning_rate": 3.131991051454139e-05, + "loss": 4.1464, + "step": 280 + }, + { + "epoch": 0.018925539411429146, + "grad_norm": 7.485755920410156, + "learning_rate": 3.1543624161073825e-05, + "loss": 4.1225, + "step": 282 + }, + { + "epoch": 0.01905976309519815, + "grad_norm": 9.443805694580078, + "learning_rate": 3.1767337807606265e-05, + "loss": 3.9947, + "step": 284 + }, + { + "epoch": 0.019193986778967147, + "grad_norm": 7.174338340759277, + "learning_rate": 3.1991051454138706e-05, + "loss": 3.9088, + "step": 286 + }, + { + "epoch": 0.01932821046273615, + "grad_norm": 6.906972885131836, + "learning_rate": 3.221476510067114e-05, + "loss": 3.8667, + "step": 288 + }, + { + "epoch": 0.019462434146505152, + "grad_norm": 7.4788594245910645, + "learning_rate": 3.243847874720358e-05, + "loss": 3.9111, + "step": 290 + }, + { + "epoch": 0.01959665783027415, + "grad_norm": 11.140477180480957, + "learning_rate": 3.266219239373602e-05, + "loss": 4.3426, + "step": 292 + }, + { + "epoch": 0.019730881514043154, + "grad_norm": 7.505558013916016, + "learning_rate": 3.288590604026846e-05, + "loss": 3.9657, + "step": 294 + }, + { + "epoch": 0.019865105197812153, + "grad_norm": 7.082154750823975, + "learning_rate": 3.3109619686800894e-05, + "loss": 3.6794, + "step": 296 + }, + { + "epoch": 0.019999328881581156, + "grad_norm": 7.729694366455078, + "learning_rate": 3.3333333333333335e-05, + "loss": 3.7422, + "step": 298 + }, + { + "epoch": 0.020133552565350155, + "grad_norm": 7.517658710479736, + "learning_rate": 3.3557046979865775e-05, + "loss": 4.0164, + "step": 300 + }, + { + "epoch": 0.020267776249119157, + "grad_norm": 6.784401893615723, + "learning_rate": 3.378076062639821e-05, + "loss": 3.8938, + "step": 302 + }, + { + "epoch": 0.02040199993288816, + "grad_norm": 6.954875469207764, + "learning_rate": 3.4004474272930656e-05, + "loss": 3.8334, + "step": 304 + }, + { + "epoch": 0.02053622361665716, + "grad_norm": 6.979121685028076, + "learning_rate": 3.422818791946309e-05, + "loss": 4.0774, + "step": 306 + }, + { + "epoch": 0.02067044730042616, + "grad_norm": 6.839256286621094, + "learning_rate": 3.4451901565995524e-05, + "loss": 3.8882, + "step": 308 + }, + { + "epoch": 0.02080467098419516, + "grad_norm": 6.747851371765137, + "learning_rate": 3.4675615212527964e-05, + "loss": 4.144, + "step": 310 + }, + { + "epoch": 0.020938894667964163, + "grad_norm": 17.720966339111328, + "learning_rate": 3.4899328859060405e-05, + "loss": 4.0772, + "step": 312 + }, + { + "epoch": 0.021073118351733162, + "grad_norm": 7.7228569984436035, + "learning_rate": 3.512304250559284e-05, + "loss": 4.0468, + "step": 314 + }, + { + "epoch": 0.021207342035502164, + "grad_norm": 6.525973796844482, + "learning_rate": 3.534675615212528e-05, + "loss": 3.7354, + "step": 316 + }, + { + "epoch": 0.021341565719271167, + "grad_norm": 10.763757705688477, + "learning_rate": 3.557046979865772e-05, + "loss": 3.8835, + "step": 318 + }, + { + "epoch": 0.021475789403040166, + "grad_norm": 7.908733367919922, + "learning_rate": 3.579418344519016e-05, + "loss": 4.1263, + "step": 320 + }, + { + "epoch": 0.02161001308680917, + "grad_norm": 8.188868522644043, + "learning_rate": 3.6017897091722594e-05, + "loss": 3.6327, + "step": 322 + }, + { + "epoch": 0.021744236770578167, + "grad_norm": 9.370588302612305, + "learning_rate": 3.6241610738255034e-05, + "loss": 3.8051, + "step": 324 + }, + { + "epoch": 0.02187846045434717, + "grad_norm": 7.457039833068848, + "learning_rate": 3.6465324384787475e-05, + "loss": 3.9466, + "step": 326 + }, + { + "epoch": 0.02201268413811617, + "grad_norm": 7.0435404777526855, + "learning_rate": 3.668903803131991e-05, + "loss": 4.1573, + "step": 328 + }, + { + "epoch": 0.02214690782188517, + "grad_norm": 7.235965728759766, + "learning_rate": 3.6912751677852356e-05, + "loss": 3.8049, + "step": 330 + }, + { + "epoch": 0.022281131505654174, + "grad_norm": 7.083849906921387, + "learning_rate": 3.713646532438479e-05, + "loss": 3.89, + "step": 332 + }, + { + "epoch": 0.022415355189423173, + "grad_norm": 6.842654228210449, + "learning_rate": 3.736017897091722e-05, + "loss": 3.7809, + "step": 334 + }, + { + "epoch": 0.022549578873192175, + "grad_norm": 8.721003532409668, + "learning_rate": 3.758389261744967e-05, + "loss": 3.672, + "step": 336 + }, + { + "epoch": 0.022683802556961175, + "grad_norm": 6.654012680053711, + "learning_rate": 3.7807606263982104e-05, + "loss": 3.7298, + "step": 338 + }, + { + "epoch": 0.022818026240730177, + "grad_norm": 12.028807640075684, + "learning_rate": 3.8031319910514545e-05, + "loss": 4.183, + "step": 340 + }, + { + "epoch": 0.02295224992449918, + "grad_norm": 7.003884792327881, + "learning_rate": 3.8255033557046985e-05, + "loss": 3.8528, + "step": 342 + }, + { + "epoch": 0.02308647360826818, + "grad_norm": 6.5270586013793945, + "learning_rate": 3.847874720357942e-05, + "loss": 3.7357, + "step": 344 + }, + { + "epoch": 0.02322069729203718, + "grad_norm": 16.928237915039062, + "learning_rate": 3.870246085011186e-05, + "loss": 3.8991, + "step": 346 + }, + { + "epoch": 0.02335492097580618, + "grad_norm": 7.216508388519287, + "learning_rate": 3.89261744966443e-05, + "loss": 3.6292, + "step": 348 + }, + { + "epoch": 0.023489144659575183, + "grad_norm": 6.936648368835449, + "learning_rate": 3.914988814317674e-05, + "loss": 3.8135, + "step": 350 + }, + { + "epoch": 0.02362336834334418, + "grad_norm": 7.404638290405273, + "learning_rate": 3.9373601789709174e-05, + "loss": 3.8391, + "step": 352 + }, + { + "epoch": 0.023757592027113184, + "grad_norm": 7.5172648429870605, + "learning_rate": 3.959731543624161e-05, + "loss": 3.8111, + "step": 354 + }, + { + "epoch": 0.023891815710882187, + "grad_norm": 6.857153415679932, + "learning_rate": 3.9821029082774055e-05, + "loss": 3.6017, + "step": 356 + }, + { + "epoch": 0.024026039394651186, + "grad_norm": 6.878764629364014, + "learning_rate": 4.004474272930649e-05, + "loss": 3.5724, + "step": 358 + }, + { + "epoch": 0.02416026307842019, + "grad_norm": 6.8217315673828125, + "learning_rate": 4.026845637583892e-05, + "loss": 3.8698, + "step": 360 + }, + { + "epoch": 0.024294486762189187, + "grad_norm": 7.0202178955078125, + "learning_rate": 4.049217002237137e-05, + "loss": 3.5676, + "step": 362 + }, + { + "epoch": 0.02442871044595819, + "grad_norm": 7.347701072692871, + "learning_rate": 4.07158836689038e-05, + "loss": 3.9088, + "step": 364 + }, + { + "epoch": 0.02456293412972719, + "grad_norm": 9.914051055908203, + "learning_rate": 4.0939597315436244e-05, + "loss": 4.0511, + "step": 366 + }, + { + "epoch": 0.02469715781349619, + "grad_norm": 16.037700653076172, + "learning_rate": 4.1163310961968684e-05, + "loss": 3.9389, + "step": 368 + }, + { + "epoch": 0.024831381497265194, + "grad_norm": 6.9445271492004395, + "learning_rate": 4.138702460850112e-05, + "loss": 3.8469, + "step": 370 + }, + { + "epoch": 0.024965605181034193, + "grad_norm": 15.266164779663086, + "learning_rate": 4.161073825503356e-05, + "loss": 3.853, + "step": 372 + }, + { + "epoch": 0.025099828864803195, + "grad_norm": 7.2589111328125, + "learning_rate": 4.1834451901566e-05, + "loss": 4.013, + "step": 374 + }, + { + "epoch": 0.025234052548572194, + "grad_norm": 6.08740234375, + "learning_rate": 4.205816554809844e-05, + "loss": 3.6531, + "step": 376 + }, + { + "epoch": 0.025368276232341197, + "grad_norm": 6.87108039855957, + "learning_rate": 4.228187919463087e-05, + "loss": 3.727, + "step": 378 + }, + { + "epoch": 0.025502499916110196, + "grad_norm": 6.207707405090332, + "learning_rate": 4.2505592841163314e-05, + "loss": 3.8917, + "step": 380 + }, + { + "epoch": 0.0256367235998792, + "grad_norm": 7.019343852996826, + "learning_rate": 4.2729306487695754e-05, + "loss": 3.8243, + "step": 382 + }, + { + "epoch": 0.0257709472836482, + "grad_norm": 7.028400897979736, + "learning_rate": 4.295302013422819e-05, + "loss": 3.9234, + "step": 384 + }, + { + "epoch": 0.0259051709674172, + "grad_norm": 5.866400241851807, + "learning_rate": 4.317673378076063e-05, + "loss": 3.7391, + "step": 386 + }, + { + "epoch": 0.026039394651186203, + "grad_norm": 14.94155216217041, + "learning_rate": 4.340044742729307e-05, + "loss": 4.0892, + "step": 388 + }, + { + "epoch": 0.0261736183349552, + "grad_norm": 6.1532816886901855, + "learning_rate": 4.36241610738255e-05, + "loss": 3.759, + "step": 390 + }, + { + "epoch": 0.026307842018724204, + "grad_norm": 7.094605922698975, + "learning_rate": 4.384787472035794e-05, + "loss": 4.0304, + "step": 392 + }, + { + "epoch": 0.026442065702493203, + "grad_norm": 6.5840840339660645, + "learning_rate": 4.4071588366890384e-05, + "loss": 3.7616, + "step": 394 + }, + { + "epoch": 0.026576289386262206, + "grad_norm": 7.627487659454346, + "learning_rate": 4.4295302013422824e-05, + "loss": 3.7793, + "step": 396 + }, + { + "epoch": 0.026710513070031208, + "grad_norm": 7.650868892669678, + "learning_rate": 4.451901565995526e-05, + "loss": 3.8192, + "step": 398 + }, + { + "epoch": 0.026844736753800207, + "grad_norm": 6.1228814125061035, + "learning_rate": 4.47427293064877e-05, + "loss": 3.6282, + "step": 400 + }, + { + "epoch": 0.02697896043756921, + "grad_norm": 7.545034408569336, + "learning_rate": 4.496644295302014e-05, + "loss": 3.8926, + "step": 402 + }, + { + "epoch": 0.02711318412133821, + "grad_norm": 10.36417007446289, + "learning_rate": 4.519015659955257e-05, + "loss": 3.6484, + "step": 404 + }, + { + "epoch": 0.02724740780510721, + "grad_norm": 6.767966270446777, + "learning_rate": 4.541387024608501e-05, + "loss": 3.8392, + "step": 406 + }, + { + "epoch": 0.027381631488876214, + "grad_norm": 7.342782974243164, + "learning_rate": 4.5637583892617453e-05, + "loss": 3.7989, + "step": 408 + }, + { + "epoch": 0.027515855172645213, + "grad_norm": 7.797837734222412, + "learning_rate": 4.586129753914989e-05, + "loss": 3.6473, + "step": 410 + }, + { + "epoch": 0.027650078856414215, + "grad_norm": 11.83535385131836, + "learning_rate": 4.608501118568233e-05, + "loss": 3.7866, + "step": 412 + }, + { + "epoch": 0.027784302540183214, + "grad_norm": 6.518899917602539, + "learning_rate": 4.630872483221477e-05, + "loss": 3.7616, + "step": 414 + }, + { + "epoch": 0.027918526223952217, + "grad_norm": 6.7666544914245605, + "learning_rate": 4.65324384787472e-05, + "loss": 3.72, + "step": 416 + }, + { + "epoch": 0.028052749907721216, + "grad_norm": 6.393723964691162, + "learning_rate": 4.675615212527964e-05, + "loss": 3.5969, + "step": 418 + }, + { + "epoch": 0.02818697359149022, + "grad_norm": 6.654068946838379, + "learning_rate": 4.697986577181208e-05, + "loss": 3.5377, + "step": 420 + }, + { + "epoch": 0.02832119727525922, + "grad_norm": 6.415705680847168, + "learning_rate": 4.720357941834452e-05, + "loss": 3.7649, + "step": 422 + }, + { + "epoch": 0.02845542095902822, + "grad_norm": 6.588418483734131, + "learning_rate": 4.742729306487696e-05, + "loss": 3.772, + "step": 424 + }, + { + "epoch": 0.028589644642797223, + "grad_norm": 6.276059150695801, + "learning_rate": 4.76510067114094e-05, + "loss": 3.7094, + "step": 426 + }, + { + "epoch": 0.02872386832656622, + "grad_norm": 11.73387622833252, + "learning_rate": 4.787472035794184e-05, + "loss": 3.7788, + "step": 428 + }, + { + "epoch": 0.028858092010335224, + "grad_norm": 7.142545700073242, + "learning_rate": 4.809843400447427e-05, + "loss": 3.8028, + "step": 430 + }, + { + "epoch": 0.028992315694104223, + "grad_norm": 6.749697685241699, + "learning_rate": 4.832214765100672e-05, + "loss": 3.6969, + "step": 432 + }, + { + "epoch": 0.029126539377873226, + "grad_norm": 7.347177505493164, + "learning_rate": 4.854586129753915e-05, + "loss": 3.8187, + "step": 434 + }, + { + "epoch": 0.029260763061642228, + "grad_norm": 7.12244987487793, + "learning_rate": 4.8769574944071586e-05, + "loss": 3.8582, + "step": 436 + }, + { + "epoch": 0.029394986745411227, + "grad_norm": 6.025589942932129, + "learning_rate": 4.8993288590604034e-05, + "loss": 3.5271, + "step": 438 + }, + { + "epoch": 0.02952921042918023, + "grad_norm": 6.692173480987549, + "learning_rate": 4.921700223713647e-05, + "loss": 3.7877, + "step": 440 + }, + { + "epoch": 0.02966343411294923, + "grad_norm": 7.137646198272705, + "learning_rate": 4.944071588366891e-05, + "loss": 3.7105, + "step": 442 + }, + { + "epoch": 0.02979765779671823, + "grad_norm": 10.763802528381348, + "learning_rate": 4.966442953020135e-05, + "loss": 4.2047, + "step": 444 + }, + { + "epoch": 0.02993188148048723, + "grad_norm": 12.43513298034668, + "learning_rate": 4.988814317673378e-05, + "loss": 3.794, + "step": 446 + }, + { + "epoch": 0.030066105164256233, + "grad_norm": 6.897948265075684, + "learning_rate": 5.011185682326622e-05, + "loss": 3.6728, + "step": 448 + }, + { + "epoch": 0.030200328848025235, + "grad_norm": 18.368371963500977, + "learning_rate": 5.033557046979866e-05, + "loss": 3.9377, + "step": 450 + }, + { + "epoch": 0.030334552531794234, + "grad_norm": 6.893589973449707, + "learning_rate": 5.05592841163311e-05, + "loss": 3.7882, + "step": 452 + }, + { + "epoch": 0.030468776215563237, + "grad_norm": 6.960872650146484, + "learning_rate": 5.078299776286354e-05, + "loss": 3.9439, + "step": 454 + }, + { + "epoch": 0.030602999899332236, + "grad_norm": 12.221426010131836, + "learning_rate": 5.100671140939598e-05, + "loss": 3.9909, + "step": 456 + }, + { + "epoch": 0.03073722358310124, + "grad_norm": 9.257293701171875, + "learning_rate": 5.123042505592841e-05, + "loss": 3.7877, + "step": 458 + }, + { + "epoch": 0.03087144726687024, + "grad_norm": 13.20697021484375, + "learning_rate": 5.145413870246085e-05, + "loss": 3.7724, + "step": 460 + }, + { + "epoch": 0.03100567095063924, + "grad_norm": 10.374574661254883, + "learning_rate": 5.167785234899329e-05, + "loss": 4.0389, + "step": 462 + }, + { + "epoch": 0.031139894634408243, + "grad_norm": 13.057169914245605, + "learning_rate": 5.1901565995525726e-05, + "loss": 3.7931, + "step": 464 + }, + { + "epoch": 0.031274118318177245, + "grad_norm": 10.55159854888916, + "learning_rate": 5.212527964205817e-05, + "loss": 3.8399, + "step": 466 + }, + { + "epoch": 0.03140834200194624, + "grad_norm": 5.971856594085693, + "learning_rate": 5.234899328859061e-05, + "loss": 3.5588, + "step": 468 + }, + { + "epoch": 0.03154256568571524, + "grad_norm": 6.693935394287109, + "learning_rate": 5.257270693512305e-05, + "loss": 3.8607, + "step": 470 + }, + { + "epoch": 0.031676789369484246, + "grad_norm": 10.626763343811035, + "learning_rate": 5.279642058165548e-05, + "loss": 3.6853, + "step": 472 + }, + { + "epoch": 0.03181101305325325, + "grad_norm": 8.273348808288574, + "learning_rate": 5.302013422818792e-05, + "loss": 3.7549, + "step": 474 + }, + { + "epoch": 0.03194523673702225, + "grad_norm": 12.269926071166992, + "learning_rate": 5.324384787472036e-05, + "loss": 3.6712, + "step": 476 + }, + { + "epoch": 0.032079460420791246, + "grad_norm": 12.251240730285645, + "learning_rate": 5.3467561521252796e-05, + "loss": 3.483, + "step": 478 + }, + { + "epoch": 0.03221368410456025, + "grad_norm": 6.750390529632568, + "learning_rate": 5.3691275167785237e-05, + "loss": 3.8403, + "step": 480 + }, + { + "epoch": 0.03234790778832925, + "grad_norm": 6.331452369689941, + "learning_rate": 5.391498881431768e-05, + "loss": 3.4432, + "step": 482 + }, + { + "epoch": 0.032482131472098254, + "grad_norm": 6.544020652770996, + "learning_rate": 5.413870246085011e-05, + "loss": 3.5818, + "step": 484 + }, + { + "epoch": 0.03261635515586725, + "grad_norm": 8.156982421875, + "learning_rate": 5.436241610738255e-05, + "loss": 3.7511, + "step": 486 + }, + { + "epoch": 0.03275057883963625, + "grad_norm": 5.622401237487793, + "learning_rate": 5.458612975391499e-05, + "loss": 3.7074, + "step": 488 + }, + { + "epoch": 0.032884802523405254, + "grad_norm": 8.30151653289795, + "learning_rate": 5.4809843400447426e-05, + "loss": 3.5401, + "step": 490 + }, + { + "epoch": 0.03301902620717426, + "grad_norm": 6.8552727699279785, + "learning_rate": 5.5033557046979866e-05, + "loss": 3.9042, + "step": 492 + }, + { + "epoch": 0.03315324989094326, + "grad_norm": 6.769829750061035, + "learning_rate": 5.5257270693512306e-05, + "loss": 3.58, + "step": 494 + }, + { + "epoch": 0.033287473574712255, + "grad_norm": 7.658302307128906, + "learning_rate": 5.5480984340044754e-05, + "loss": 3.5769, + "step": 496 + }, + { + "epoch": 0.03342169725848126, + "grad_norm": 6.169486999511719, + "learning_rate": 5.570469798657718e-05, + "loss": 3.682, + "step": 498 + }, + { + "epoch": 0.03355592094225026, + "grad_norm": 6.343045711517334, + "learning_rate": 5.592841163310962e-05, + "loss": 3.6239, + "step": 500 + }, + { + "epoch": 0.03369014462601926, + "grad_norm": 7.797464370727539, + "learning_rate": 5.615212527964206e-05, + "loss": 3.4007, + "step": 502 + }, + { + "epoch": 0.033824368309788265, + "grad_norm": 6.705879211425781, + "learning_rate": 5.6375838926174495e-05, + "loss": 3.6614, + "step": 504 + }, + { + "epoch": 0.03395859199355726, + "grad_norm": 7.328771591186523, + "learning_rate": 5.6599552572706936e-05, + "loss": 3.9532, + "step": 506 + }, + { + "epoch": 0.03409281567732626, + "grad_norm": 5.835031986236572, + "learning_rate": 5.6823266219239376e-05, + "loss": 3.9121, + "step": 508 + }, + { + "epoch": 0.034227039361095266, + "grad_norm": 5.903189659118652, + "learning_rate": 5.704697986577181e-05, + "loss": 3.5588, + "step": 510 + }, + { + "epoch": 0.03436126304486427, + "grad_norm": 8.89779281616211, + "learning_rate": 5.727069351230425e-05, + "loss": 3.5968, + "step": 512 + }, + { + "epoch": 0.03449548672863327, + "grad_norm": 8.59582233428955, + "learning_rate": 5.749440715883669e-05, + "loss": 3.8228, + "step": 514 + }, + { + "epoch": 0.034629710412402266, + "grad_norm": 6.839929103851318, + "learning_rate": 5.771812080536914e-05, + "loss": 3.7921, + "step": 516 + }, + { + "epoch": 0.03476393409617127, + "grad_norm": 7.3084797859191895, + "learning_rate": 5.7941834451901565e-05, + "loss": 3.5803, + "step": 518 + }, + { + "epoch": 0.03489815777994027, + "grad_norm": 19.94928741455078, + "learning_rate": 5.8165548098434006e-05, + "loss": 3.7166, + "step": 520 + }, + { + "epoch": 0.035032381463709274, + "grad_norm": 6.431812286376953, + "learning_rate": 5.838926174496645e-05, + "loss": 3.6818, + "step": 522 + }, + { + "epoch": 0.03516660514747827, + "grad_norm": 8.97736644744873, + "learning_rate": 5.861297539149888e-05, + "loss": 3.4825, + "step": 524 + }, + { + "epoch": 0.03530082883124727, + "grad_norm": 19.04999542236328, + "learning_rate": 5.883668903803132e-05, + "loss": 3.6485, + "step": 526 + }, + { + "epoch": 0.035435052515016274, + "grad_norm": 7.413877010345459, + "learning_rate": 5.906040268456377e-05, + "loss": 3.6837, + "step": 528 + }, + { + "epoch": 0.03556927619878528, + "grad_norm": 6.229156494140625, + "learning_rate": 5.9284116331096195e-05, + "loss": 3.5182, + "step": 530 + }, + { + "epoch": 0.03570349988255428, + "grad_norm": 9.691214561462402, + "learning_rate": 5.9507829977628635e-05, + "loss": 3.8224, + "step": 532 + }, + { + "epoch": 0.035837723566323275, + "grad_norm": 10.384590148925781, + "learning_rate": 5.973154362416108e-05, + "loss": 3.5419, + "step": 534 + }, + { + "epoch": 0.03597194725009228, + "grad_norm": 6.141333103179932, + "learning_rate": 5.995525727069351e-05, + "loss": 3.3365, + "step": 536 + }, + { + "epoch": 0.03610617093386128, + "grad_norm": 6.892976760864258, + "learning_rate": 6.017897091722595e-05, + "loss": 3.9954, + "step": 538 + }, + { + "epoch": 0.03624039461763028, + "grad_norm": 6.330560207366943, + "learning_rate": 6.04026845637584e-05, + "loss": 3.766, + "step": 540 + }, + { + "epoch": 0.036374618301399285, + "grad_norm": 8.203278541564941, + "learning_rate": 6.062639821029084e-05, + "loss": 3.5777, + "step": 542 + }, + { + "epoch": 0.03650884198516828, + "grad_norm": 9.398039817810059, + "learning_rate": 6.0850111856823265e-05, + "loss": 3.6125, + "step": 544 + }, + { + "epoch": 0.03664306566893728, + "grad_norm": 8.135908126831055, + "learning_rate": 6.107382550335571e-05, + "loss": 3.9842, + "step": 546 + }, + { + "epoch": 0.036777289352706286, + "grad_norm": 10.252396583557129, + "learning_rate": 6.129753914988815e-05, + "loss": 3.6789, + "step": 548 + }, + { + "epoch": 0.03691151303647529, + "grad_norm": 6.439090251922607, + "learning_rate": 6.152125279642058e-05, + "loss": 3.636, + "step": 550 + }, + { + "epoch": 0.03704573672024429, + "grad_norm": 7.835175037384033, + "learning_rate": 6.174496644295302e-05, + "loss": 3.6272, + "step": 552 + }, + { + "epoch": 0.037179960404013286, + "grad_norm": 43.03024673461914, + "learning_rate": 6.196868008948546e-05, + "loss": 3.6347, + "step": 554 + }, + { + "epoch": 0.03731418408778229, + "grad_norm": 6.875336647033691, + "learning_rate": 6.21923937360179e-05, + "loss": 3.6689, + "step": 556 + }, + { + "epoch": 0.03744840777155129, + "grad_norm": 6.688826560974121, + "learning_rate": 6.241610738255034e-05, + "loss": 3.8033, + "step": 558 + }, + { + "epoch": 0.037582631455320294, + "grad_norm": 21.09029197692871, + "learning_rate": 6.263982102908278e-05, + "loss": 3.5633, + "step": 560 + }, + { + "epoch": 0.03771685513908929, + "grad_norm": 11.56898021697998, + "learning_rate": 6.286353467561522e-05, + "loss": 3.7086, + "step": 562 + }, + { + "epoch": 0.03785107882285829, + "grad_norm": 6.911277770996094, + "learning_rate": 6.308724832214765e-05, + "loss": 3.7694, + "step": 564 + }, + { + "epoch": 0.037985302506627294, + "grad_norm": 6.0444560050964355, + "learning_rate": 6.331096196868009e-05, + "loss": 3.3953, + "step": 566 + }, + { + "epoch": 0.0381195261903963, + "grad_norm": 7.797275066375732, + "learning_rate": 6.353467561521253e-05, + "loss": 3.8707, + "step": 568 + }, + { + "epoch": 0.0382537498741653, + "grad_norm": 6.242194652557373, + "learning_rate": 6.375838926174497e-05, + "loss": 3.5228, + "step": 570 + }, + { + "epoch": 0.038387973557934295, + "grad_norm": 7.708515644073486, + "learning_rate": 6.398210290827741e-05, + "loss": 3.2691, + "step": 572 + }, + { + "epoch": 0.0385221972417033, + "grad_norm": 6.023196697235107, + "learning_rate": 6.420581655480985e-05, + "loss": 3.581, + "step": 574 + }, + { + "epoch": 0.0386564209254723, + "grad_norm": 6.546248912811279, + "learning_rate": 6.442953020134228e-05, + "loss": 3.4443, + "step": 576 + }, + { + "epoch": 0.0387906446092413, + "grad_norm": 6.326827049255371, + "learning_rate": 6.465324384787472e-05, + "loss": 3.5929, + "step": 578 + }, + { + "epoch": 0.038924868293010305, + "grad_norm": 27.494388580322266, + "learning_rate": 6.487695749440716e-05, + "loss": 3.3452, + "step": 580 + }, + { + "epoch": 0.0390590919767793, + "grad_norm": 6.618257522583008, + "learning_rate": 6.51006711409396e-05, + "loss": 3.6339, + "step": 582 + }, + { + "epoch": 0.0391933156605483, + "grad_norm": 7.49738073348999, + "learning_rate": 6.532438478747204e-05, + "loss": 3.6682, + "step": 584 + }, + { + "epoch": 0.039327539344317305, + "grad_norm": 254.7956085205078, + "learning_rate": 6.554809843400448e-05, + "loss": 3.6769, + "step": 586 + }, + { + "epoch": 0.03946176302808631, + "grad_norm": 7.109992027282715, + "learning_rate": 6.577181208053692e-05, + "loss": 3.5133, + "step": 588 + }, + { + "epoch": 0.039595986711855304, + "grad_norm": 11.50060749053955, + "learning_rate": 6.599552572706935e-05, + "loss": 3.4971, + "step": 590 + }, + { + "epoch": 0.039730210395624306, + "grad_norm": 14.20955753326416, + "learning_rate": 6.621923937360179e-05, + "loss": 3.6074, + "step": 592 + }, + { + "epoch": 0.03986443407939331, + "grad_norm": 12.47415542602539, + "learning_rate": 6.644295302013423e-05, + "loss": 3.8254, + "step": 594 + }, + { + "epoch": 0.03999865776316231, + "grad_norm": 7.203413486480713, + "learning_rate": 6.666666666666667e-05, + "loss": 3.5427, + "step": 596 + }, + { + "epoch": 0.040132881446931314, + "grad_norm": 6.353034019470215, + "learning_rate": 6.689038031319911e-05, + "loss": 3.7011, + "step": 598 + }, + { + "epoch": 0.04026710513070031, + "grad_norm": 5.875802040100098, + "learning_rate": 6.711409395973155e-05, + "loss": 3.5086, + "step": 600 + }, + { + "epoch": 0.04040132881446931, + "grad_norm": 7.2121195793151855, + "learning_rate": 6.733780760626398e-05, + "loss": 3.5549, + "step": 602 + }, + { + "epoch": 0.040535552498238314, + "grad_norm": 5.9159722328186035, + "learning_rate": 6.756152125279642e-05, + "loss": 3.5046, + "step": 604 + }, + { + "epoch": 0.04066977618200732, + "grad_norm": 20.871990203857422, + "learning_rate": 6.778523489932886e-05, + "loss": 3.7367, + "step": 606 + }, + { + "epoch": 0.04080399986577632, + "grad_norm": 9.327224731445312, + "learning_rate": 6.800894854586131e-05, + "loss": 3.6809, + "step": 608 + }, + { + "epoch": 0.040938223549545315, + "grad_norm": 6.717700481414795, + "learning_rate": 6.823266219239374e-05, + "loss": 3.3567, + "step": 610 + }, + { + "epoch": 0.04107244723331432, + "grad_norm": 7.617694854736328, + "learning_rate": 6.845637583892618e-05, + "loss": 3.679, + "step": 612 + }, + { + "epoch": 0.04120667091708332, + "grad_norm": 8.056676864624023, + "learning_rate": 6.868008948545862e-05, + "loss": 3.5921, + "step": 614 + }, + { + "epoch": 0.04134089460085232, + "grad_norm": 7.336218357086182, + "learning_rate": 6.890380313199105e-05, + "loss": 3.612, + "step": 616 + }, + { + "epoch": 0.041475118284621325, + "grad_norm": 6.059606075286865, + "learning_rate": 6.912751677852349e-05, + "loss": 3.5374, + "step": 618 + }, + { + "epoch": 0.04160934196839032, + "grad_norm": 6.771276950836182, + "learning_rate": 6.935123042505593e-05, + "loss": 3.9388, + "step": 620 + }, + { + "epoch": 0.04174356565215932, + "grad_norm": 5.5659308433532715, + "learning_rate": 6.957494407158837e-05, + "loss": 3.3201, + "step": 622 + }, + { + "epoch": 0.041877789335928325, + "grad_norm": 6.125942230224609, + "learning_rate": 6.979865771812081e-05, + "loss": 3.4779, + "step": 624 + }, + { + "epoch": 0.04201201301969733, + "grad_norm": 18.244232177734375, + "learning_rate": 7.002237136465325e-05, + "loss": 3.5218, + "step": 626 + }, + { + "epoch": 0.042146236703466324, + "grad_norm": 16.113792419433594, + "learning_rate": 7.024608501118568e-05, + "loss": 3.6077, + "step": 628 + }, + { + "epoch": 0.042280460387235326, + "grad_norm": 7.761129379272461, + "learning_rate": 7.046979865771812e-05, + "loss": 3.5406, + "step": 630 + }, + { + "epoch": 0.04241468407100433, + "grad_norm": 6.223979473114014, + "learning_rate": 7.069351230425056e-05, + "loss": 3.7111, + "step": 632 + }, + { + "epoch": 0.04254890775477333, + "grad_norm": 10.040348052978516, + "learning_rate": 7.091722595078301e-05, + "loss": 4.0278, + "step": 634 + }, + { + "epoch": 0.042683131438542334, + "grad_norm": 8.123263359069824, + "learning_rate": 7.114093959731544e-05, + "loss": 3.4017, + "step": 636 + }, + { + "epoch": 0.04281735512231133, + "grad_norm": 6.905566692352295, + "learning_rate": 7.136465324384788e-05, + "loss": 3.4533, + "step": 638 + }, + { + "epoch": 0.04295157880608033, + "grad_norm": 7.479582786560059, + "learning_rate": 7.158836689038032e-05, + "loss": 3.4485, + "step": 640 + }, + { + "epoch": 0.043085802489849334, + "grad_norm": 7.876314163208008, + "learning_rate": 7.181208053691275e-05, + "loss": 3.5333, + "step": 642 + }, + { + "epoch": 0.04322002617361834, + "grad_norm": 21.732248306274414, + "learning_rate": 7.203579418344519e-05, + "loss": 3.4896, + "step": 644 + }, + { + "epoch": 0.04335424985738734, + "grad_norm": 8.48886489868164, + "learning_rate": 7.225950782997764e-05, + "loss": 3.6481, + "step": 646 + }, + { + "epoch": 0.043488473541156335, + "grad_norm": 6.066891670227051, + "learning_rate": 7.248322147651007e-05, + "loss": 3.4602, + "step": 648 + }, + { + "epoch": 0.04362269722492534, + "grad_norm": 6.900521755218506, + "learning_rate": 7.270693512304251e-05, + "loss": 3.2799, + "step": 650 + }, + { + "epoch": 0.04375692090869434, + "grad_norm": 6.809779644012451, + "learning_rate": 7.293064876957495e-05, + "loss": 3.5911, + "step": 652 + }, + { + "epoch": 0.04389114459246334, + "grad_norm": 5.950530529022217, + "learning_rate": 7.315436241610739e-05, + "loss": 3.5741, + "step": 654 + }, + { + "epoch": 0.04402536827623234, + "grad_norm": 5.8318190574646, + "learning_rate": 7.337807606263982e-05, + "loss": 3.5756, + "step": 656 + }, + { + "epoch": 0.04415959196000134, + "grad_norm": 5.912139415740967, + "learning_rate": 7.360178970917227e-05, + "loss": 3.4419, + "step": 658 + }, + { + "epoch": 0.04429381564377034, + "grad_norm": 7.128172397613525, + "learning_rate": 7.382550335570471e-05, + "loss": 3.5884, + "step": 660 + }, + { + "epoch": 0.044428039327539345, + "grad_norm": 6.197318077087402, + "learning_rate": 7.404921700223714e-05, + "loss": 3.4301, + "step": 662 + }, + { + "epoch": 0.04456226301130835, + "grad_norm": 6.563938140869141, + "learning_rate": 7.427293064876958e-05, + "loss": 3.5526, + "step": 664 + }, + { + "epoch": 0.04469648669507734, + "grad_norm": 5.898063659667969, + "learning_rate": 7.449664429530202e-05, + "loss": 3.4291, + "step": 666 + }, + { + "epoch": 0.044830710378846346, + "grad_norm": 7.36962366104126, + "learning_rate": 7.472035794183445e-05, + "loss": 3.5298, + "step": 668 + }, + { + "epoch": 0.04496493406261535, + "grad_norm": 6.431079387664795, + "learning_rate": 7.494407158836689e-05, + "loss": 3.7087, + "step": 670 + }, + { + "epoch": 0.04509915774638435, + "grad_norm": 14.708699226379395, + "learning_rate": 7.516778523489934e-05, + "loss": 3.4532, + "step": 672 + }, + { + "epoch": 0.045233381430153353, + "grad_norm": 6.959427833557129, + "learning_rate": 7.539149888143177e-05, + "loss": 3.8553, + "step": 674 + }, + { + "epoch": 0.04536760511392235, + "grad_norm": 17.458847045898438, + "learning_rate": 7.561521252796421e-05, + "loss": 3.6052, + "step": 676 + }, + { + "epoch": 0.04550182879769135, + "grad_norm": 6.124471664428711, + "learning_rate": 7.583892617449665e-05, + "loss": 3.6101, + "step": 678 + }, + { + "epoch": 0.045636052481460354, + "grad_norm": 58.917232513427734, + "learning_rate": 7.606263982102909e-05, + "loss": 3.5562, + "step": 680 + }, + { + "epoch": 0.04577027616522936, + "grad_norm": 5.872721195220947, + "learning_rate": 7.628635346756152e-05, + "loss": 3.6709, + "step": 682 + }, + { + "epoch": 0.04590449984899836, + "grad_norm": 10.595252990722656, + "learning_rate": 7.651006711409397e-05, + "loss": 3.531, + "step": 684 + }, + { + "epoch": 0.046038723532767355, + "grad_norm": 18.059101104736328, + "learning_rate": 7.673378076062641e-05, + "loss": 3.4885, + "step": 686 + }, + { + "epoch": 0.04617294721653636, + "grad_norm": 5.316922664642334, + "learning_rate": 7.695749440715884e-05, + "loss": 3.2766, + "step": 688 + }, + { + "epoch": 0.04630717090030536, + "grad_norm": 5.4431915283203125, + "learning_rate": 7.718120805369128e-05, + "loss": 3.412, + "step": 690 + }, + { + "epoch": 0.04644139458407436, + "grad_norm": 6.910205364227295, + "learning_rate": 7.740492170022372e-05, + "loss": 3.4436, + "step": 692 + }, + { + "epoch": 0.04657561826784336, + "grad_norm": 11.197047233581543, + "learning_rate": 7.762863534675615e-05, + "loss": 3.7823, + "step": 694 + }, + { + "epoch": 0.04670984195161236, + "grad_norm": 8.21236801147461, + "learning_rate": 7.78523489932886e-05, + "loss": 3.7158, + "step": 696 + }, + { + "epoch": 0.04684406563538136, + "grad_norm": 6.315340042114258, + "learning_rate": 7.807606263982104e-05, + "loss": 3.6055, + "step": 698 + }, + { + "epoch": 0.046978289319150365, + "grad_norm": 7.295384883880615, + "learning_rate": 7.829977628635348e-05, + "loss": 3.4397, + "step": 700 + }, + { + "epoch": 0.04711251300291937, + "grad_norm": 6.277502536773682, + "learning_rate": 7.852348993288591e-05, + "loss": 3.6332, + "step": 702 + }, + { + "epoch": 0.04724673668668836, + "grad_norm": 8.296098709106445, + "learning_rate": 7.874720357941835e-05, + "loss": 3.621, + "step": 704 + }, + { + "epoch": 0.047380960370457366, + "grad_norm": 19.555770874023438, + "learning_rate": 7.897091722595079e-05, + "loss": 3.5076, + "step": 706 + }, + { + "epoch": 0.04751518405422637, + "grad_norm": 7.423375129699707, + "learning_rate": 7.919463087248322e-05, + "loss": 3.8212, + "step": 708 + }, + { + "epoch": 0.04764940773799537, + "grad_norm": 14.805453300476074, + "learning_rate": 7.941834451901567e-05, + "loss": 3.4138, + "step": 710 + }, + { + "epoch": 0.04778363142176437, + "grad_norm": 7.334309101104736, + "learning_rate": 7.964205816554811e-05, + "loss": 3.5886, + "step": 712 + }, + { + "epoch": 0.04791785510553337, + "grad_norm": 6.199641227722168, + "learning_rate": 7.986577181208054e-05, + "loss": 3.5527, + "step": 714 + }, + { + "epoch": 0.04805207878930237, + "grad_norm": 5.70994234085083, + "learning_rate": 8.008948545861298e-05, + "loss": 3.3417, + "step": 716 + }, + { + "epoch": 0.048186302473071374, + "grad_norm": 7.336976528167725, + "learning_rate": 8.031319910514542e-05, + "loss": 3.7149, + "step": 718 + }, + { + "epoch": 0.04832052615684038, + "grad_norm": 10.913148880004883, + "learning_rate": 8.053691275167784e-05, + "loss": 3.642, + "step": 720 + }, + { + "epoch": 0.04845474984060937, + "grad_norm": 16.541223526000977, + "learning_rate": 8.07606263982103e-05, + "loss": 3.5684, + "step": 722 + }, + { + "epoch": 0.048588973524378375, + "grad_norm": 5.663680553436279, + "learning_rate": 8.098434004474274e-05, + "loss": 3.8074, + "step": 724 + }, + { + "epoch": 0.04872319720814738, + "grad_norm": 5.889994144439697, + "learning_rate": 8.120805369127518e-05, + "loss": 3.418, + "step": 726 + }, + { + "epoch": 0.04885742089191638, + "grad_norm": 11.671000480651855, + "learning_rate": 8.14317673378076e-05, + "loss": 3.5682, + "step": 728 + }, + { + "epoch": 0.04899164457568538, + "grad_norm": 22.093782424926758, + "learning_rate": 8.165548098434005e-05, + "loss": 3.4729, + "step": 730 + }, + { + "epoch": 0.04912586825945438, + "grad_norm": 9.847548484802246, + "learning_rate": 8.187919463087249e-05, + "loss": 3.4667, + "step": 732 + }, + { + "epoch": 0.04926009194322338, + "grad_norm": 7.368834018707275, + "learning_rate": 8.210290827740493e-05, + "loss": 3.6529, + "step": 734 + }, + { + "epoch": 0.04939431562699238, + "grad_norm": 7.640537261962891, + "learning_rate": 8.232662192393737e-05, + "loss": 3.4283, + "step": 736 + }, + { + "epoch": 0.049528539310761385, + "grad_norm": 6.394989490509033, + "learning_rate": 8.255033557046981e-05, + "loss": 3.6375, + "step": 738 + }, + { + "epoch": 0.04966276299453039, + "grad_norm": 6.245489120483398, + "learning_rate": 8.277404921700224e-05, + "loss": 3.4821, + "step": 740 + }, + { + "epoch": 0.04979698667829938, + "grad_norm": 5.876097202301025, + "learning_rate": 8.299776286353468e-05, + "loss": 3.4258, + "step": 742 + }, + { + "epoch": 0.049931210362068386, + "grad_norm": 8.523154258728027, + "learning_rate": 8.322147651006712e-05, + "loss": 3.1519, + "step": 744 + }, + { + "epoch": 0.05006543404583739, + "grad_norm": 7.3169684410095215, + "learning_rate": 8.344519015659956e-05, + "loss": 3.8159, + "step": 746 + }, + { + "epoch": 0.05019965772960639, + "grad_norm": 7.223337650299072, + "learning_rate": 8.3668903803132e-05, + "loss": 3.467, + "step": 748 + }, + { + "epoch": 0.05033388141337539, + "grad_norm": 9.149331092834473, + "learning_rate": 8.389261744966444e-05, + "loss": 3.4041, + "step": 750 + }, + { + "epoch": 0.05046810509714439, + "grad_norm": 8.020407676696777, + "learning_rate": 8.411633109619688e-05, + "loss": 3.6859, + "step": 752 + }, + { + "epoch": 0.05060232878091339, + "grad_norm": 5.433152675628662, + "learning_rate": 8.43400447427293e-05, + "loss": 3.4937, + "step": 754 + }, + { + "epoch": 0.050736552464682394, + "grad_norm": 28.608104705810547, + "learning_rate": 8.456375838926175e-05, + "loss": 3.4541, + "step": 756 + }, + { + "epoch": 0.050870776148451397, + "grad_norm": 5.212295055389404, + "learning_rate": 8.478747203579419e-05, + "loss": 3.354, + "step": 758 + }, + { + "epoch": 0.05100499983222039, + "grad_norm": 6.356689929962158, + "learning_rate": 8.501118568232663e-05, + "loss": 3.6705, + "step": 760 + }, + { + "epoch": 0.051139223515989395, + "grad_norm": 7.921032905578613, + "learning_rate": 8.523489932885907e-05, + "loss": 3.5634, + "step": 762 + }, + { + "epoch": 0.0512734471997584, + "grad_norm": 7.109432220458984, + "learning_rate": 8.545861297539151e-05, + "loss": 3.4999, + "step": 764 + }, + { + "epoch": 0.0514076708835274, + "grad_norm": 11.956075668334961, + "learning_rate": 8.568232662192394e-05, + "loss": 3.6819, + "step": 766 + }, + { + "epoch": 0.0515418945672964, + "grad_norm": 10.776528358459473, + "learning_rate": 8.590604026845638e-05, + "loss": 3.3118, + "step": 768 + }, + { + "epoch": 0.0516761182510654, + "grad_norm": 7.396949291229248, + "learning_rate": 8.612975391498882e-05, + "loss": 3.5331, + "step": 770 + }, + { + "epoch": 0.0518103419348344, + "grad_norm": 7.392943382263184, + "learning_rate": 8.635346756152126e-05, + "loss": 3.7768, + "step": 772 + }, + { + "epoch": 0.0519445656186034, + "grad_norm": 11.591886520385742, + "learning_rate": 8.65771812080537e-05, + "loss": 3.5958, + "step": 774 + }, + { + "epoch": 0.052078789302372405, + "grad_norm": 6.83457088470459, + "learning_rate": 8.680089485458614e-05, + "loss": 3.5063, + "step": 776 + }, + { + "epoch": 0.05221301298614141, + "grad_norm": 6.991634368896484, + "learning_rate": 8.702460850111858e-05, + "loss": 3.4712, + "step": 778 + }, + { + "epoch": 0.0523472366699104, + "grad_norm": 5.948524475097656, + "learning_rate": 8.7248322147651e-05, + "loss": 3.592, + "step": 780 + }, + { + "epoch": 0.052481460353679406, + "grad_norm": 10.283416748046875, + "learning_rate": 8.747203579418345e-05, + "loss": 3.572, + "step": 782 + }, + { + "epoch": 0.05261568403744841, + "grad_norm": 6.051855564117432, + "learning_rate": 8.769574944071589e-05, + "loss": 3.6918, + "step": 784 + }, + { + "epoch": 0.05274990772121741, + "grad_norm": 7.013720989227295, + "learning_rate": 8.791946308724833e-05, + "loss": 3.91, + "step": 786 + }, + { + "epoch": 0.052884131404986406, + "grad_norm": 21.16366958618164, + "learning_rate": 8.814317673378077e-05, + "loss": 3.6744, + "step": 788 + }, + { + "epoch": 0.05301835508875541, + "grad_norm": 6.167827606201172, + "learning_rate": 8.836689038031321e-05, + "loss": 3.6626, + "step": 790 + }, + { + "epoch": 0.05315257877252441, + "grad_norm": 6.08787202835083, + "learning_rate": 8.859060402684565e-05, + "loss": 3.6071, + "step": 792 + }, + { + "epoch": 0.053286802456293414, + "grad_norm": 6.658585548400879, + "learning_rate": 8.881431767337808e-05, + "loss": 3.6567, + "step": 794 + }, + { + "epoch": 0.053421026140062416, + "grad_norm": 5.592415809631348, + "learning_rate": 8.903803131991052e-05, + "loss": 3.4031, + "step": 796 + }, + { + "epoch": 0.05355524982383141, + "grad_norm": 6.692835807800293, + "learning_rate": 8.926174496644296e-05, + "loss": 3.4361, + "step": 798 + }, + { + "epoch": 0.053689473507600415, + "grad_norm": 6.528738975524902, + "learning_rate": 8.94854586129754e-05, + "loss": 3.5052, + "step": 800 + }, + { + "epoch": 0.05382369719136942, + "grad_norm": 5.62727165222168, + "learning_rate": 8.970917225950784e-05, + "loss": 3.2618, + "step": 802 + }, + { + "epoch": 0.05395792087513842, + "grad_norm": 6.9318108558654785, + "learning_rate": 8.993288590604028e-05, + "loss": 3.5657, + "step": 804 + }, + { + "epoch": 0.05409214455890742, + "grad_norm": 7.904621601104736, + "learning_rate": 9.01565995525727e-05, + "loss": 3.7319, + "step": 806 + }, + { + "epoch": 0.05422636824267642, + "grad_norm": 8.11798095703125, + "learning_rate": 9.038031319910515e-05, + "loss": 3.5067, + "step": 808 + }, + { + "epoch": 0.05436059192644542, + "grad_norm": 5.375821113586426, + "learning_rate": 9.060402684563759e-05, + "loss": 3.3674, + "step": 810 + }, + { + "epoch": 0.05449481561021442, + "grad_norm": 6.205749034881592, + "learning_rate": 9.082774049217003e-05, + "loss": 3.6622, + "step": 812 + }, + { + "epoch": 0.054629039293983425, + "grad_norm": 6.343143939971924, + "learning_rate": 9.105145413870247e-05, + "loss": 3.3511, + "step": 814 + }, + { + "epoch": 0.05476326297775243, + "grad_norm": 5.7758378982543945, + "learning_rate": 9.127516778523491e-05, + "loss": 3.313, + "step": 816 + }, + { + "epoch": 0.05489748666152142, + "grad_norm": 5.52503776550293, + "learning_rate": 9.149888143176735e-05, + "loss": 3.5756, + "step": 818 + }, + { + "epoch": 0.055031710345290426, + "grad_norm": 5.814706802368164, + "learning_rate": 9.172259507829977e-05, + "loss": 3.5954, + "step": 820 + }, + { + "epoch": 0.05516593402905943, + "grad_norm": 7.0166916847229, + "learning_rate": 9.194630872483221e-05, + "loss": 3.6314, + "step": 822 + }, + { + "epoch": 0.05530015771282843, + "grad_norm": 5.3512091636657715, + "learning_rate": 9.217002237136466e-05, + "loss": 3.571, + "step": 824 + }, + { + "epoch": 0.055434381396597426, + "grad_norm": 7.410608768463135, + "learning_rate": 9.23937360178971e-05, + "loss": 3.4967, + "step": 826 + }, + { + "epoch": 0.05556860508036643, + "grad_norm": 6.141826152801514, + "learning_rate": 9.261744966442954e-05, + "loss": 3.57, + "step": 828 + }, + { + "epoch": 0.05570282876413543, + "grad_norm": 6.035155773162842, + "learning_rate": 9.284116331096198e-05, + "loss": 3.4498, + "step": 830 + }, + { + "epoch": 0.055837052447904434, + "grad_norm": 6.0198774337768555, + "learning_rate": 9.30648769574944e-05, + "loss": 3.4591, + "step": 832 + }, + { + "epoch": 0.055971276131673436, + "grad_norm": 13.252030372619629, + "learning_rate": 9.328859060402684e-05, + "loss": 3.4182, + "step": 834 + }, + { + "epoch": 0.05610549981544243, + "grad_norm": 9.277962684631348, + "learning_rate": 9.351230425055928e-05, + "loss": 3.6953, + "step": 836 + }, + { + "epoch": 0.056239723499211434, + "grad_norm": 6.543147563934326, + "learning_rate": 9.373601789709174e-05, + "loss": 3.4536, + "step": 838 + }, + { + "epoch": 0.05637394718298044, + "grad_norm": 6.048597812652588, + "learning_rate": 9.395973154362417e-05, + "loss": 3.49, + "step": 840 + }, + { + "epoch": 0.05650817086674944, + "grad_norm": 6.587677955627441, + "learning_rate": 9.41834451901566e-05, + "loss": 3.7087, + "step": 842 + }, + { + "epoch": 0.05664239455051844, + "grad_norm": 14.014129638671875, + "learning_rate": 9.440715883668905e-05, + "loss": 3.7608, + "step": 844 + }, + { + "epoch": 0.05677661823428744, + "grad_norm": 6.882875442504883, + "learning_rate": 9.463087248322147e-05, + "loss": 3.7145, + "step": 846 + }, + { + "epoch": 0.05691084191805644, + "grad_norm": 8.141765594482422, + "learning_rate": 9.485458612975391e-05, + "loss": 3.3836, + "step": 848 + }, + { + "epoch": 0.05704506560182544, + "grad_norm": 6.149584770202637, + "learning_rate": 9.507829977628635e-05, + "loss": 3.438, + "step": 850 + }, + { + "epoch": 0.057179289285594445, + "grad_norm": 8.708868980407715, + "learning_rate": 9.53020134228188e-05, + "loss": 3.5711, + "step": 852 + }, + { + "epoch": 0.05731351296936345, + "grad_norm": 9.75905990600586, + "learning_rate": 9.552572706935124e-05, + "loss": 3.4675, + "step": 854 + }, + { + "epoch": 0.05744773665313244, + "grad_norm": 7.875243663787842, + "learning_rate": 9.574944071588368e-05, + "loss": 3.5555, + "step": 856 + }, + { + "epoch": 0.057581960336901446, + "grad_norm": 7.794872760772705, + "learning_rate": 9.59731543624161e-05, + "loss": 3.4774, + "step": 858 + }, + { + "epoch": 0.05771618402067045, + "grad_norm": 5.2783098220825195, + "learning_rate": 9.619686800894854e-05, + "loss": 3.3929, + "step": 860 + }, + { + "epoch": 0.05785040770443945, + "grad_norm": 7.097997188568115, + "learning_rate": 9.642058165548098e-05, + "loss": 3.7978, + "step": 862 + }, + { + "epoch": 0.057984631388208446, + "grad_norm": 5.594595909118652, + "learning_rate": 9.664429530201344e-05, + "loss": 3.6068, + "step": 864 + }, + { + "epoch": 0.05811885507197745, + "grad_norm": 5.124762058258057, + "learning_rate": 9.686800894854587e-05, + "loss": 3.3773, + "step": 866 + }, + { + "epoch": 0.05825307875574645, + "grad_norm": 6.03474235534668, + "learning_rate": 9.70917225950783e-05, + "loss": 3.6807, + "step": 868 + }, + { + "epoch": 0.058387302439515454, + "grad_norm": 9.455263137817383, + "learning_rate": 9.731543624161075e-05, + "loss": 3.6407, + "step": 870 + }, + { + "epoch": 0.058521526123284456, + "grad_norm": 5.564420700073242, + "learning_rate": 9.753914988814317e-05, + "loss": 3.2585, + "step": 872 + }, + { + "epoch": 0.05865574980705345, + "grad_norm": 6.151849269866943, + "learning_rate": 9.776286353467561e-05, + "loss": 3.5467, + "step": 874 + }, + { + "epoch": 0.058789973490822454, + "grad_norm": 6.962841987609863, + "learning_rate": 9.798657718120807e-05, + "loss": 3.4256, + "step": 876 + }, + { + "epoch": 0.05892419717459146, + "grad_norm": 5.577215671539307, + "learning_rate": 9.82102908277405e-05, + "loss": 3.6446, + "step": 878 + }, + { + "epoch": 0.05905842085836046, + "grad_norm": 6.033496379852295, + "learning_rate": 9.843400447427293e-05, + "loss": 3.7447, + "step": 880 + }, + { + "epoch": 0.05919264454212946, + "grad_norm": 7.093349456787109, + "learning_rate": 9.865771812080538e-05, + "loss": 3.6522, + "step": 882 + }, + { + "epoch": 0.05932686822589846, + "grad_norm": 5.829392910003662, + "learning_rate": 9.888143176733782e-05, + "loss": 3.5359, + "step": 884 + }, + { + "epoch": 0.05946109190966746, + "grad_norm": 5.967329502105713, + "learning_rate": 9.910514541387024e-05, + "loss": 3.5288, + "step": 886 + }, + { + "epoch": 0.05959531559343646, + "grad_norm": 5.496152400970459, + "learning_rate": 9.93288590604027e-05, + "loss": 3.3902, + "step": 888 + }, + { + "epoch": 0.059729539277205465, + "grad_norm": 5.928604602813721, + "learning_rate": 9.955257270693514e-05, + "loss": 3.5896, + "step": 890 + }, + { + "epoch": 0.05986376296097446, + "grad_norm": 11.239786148071289, + "learning_rate": 9.977628635346756e-05, + "loss": 3.4004, + "step": 892 + }, + { + "epoch": 0.05999798664474346, + "grad_norm": 6.084554672241211, + "learning_rate": 0.0001, + "loss": 3.6935, + "step": 894 + }, + { + "epoch": 0.060132210328512466, + "grad_norm": 5.8398566246032715, + "learning_rate": 9.999999881879917e-05, + "loss": 3.3129, + "step": 896 + }, + { + "epoch": 0.06026643401228147, + "grad_norm": 7.668843746185303, + "learning_rate": 9.99999952751967e-05, + "loss": 3.5885, + "step": 898 + }, + { + "epoch": 0.06040065769605047, + "grad_norm": 5.525981903076172, + "learning_rate": 9.999998936919278e-05, + "loss": 3.429, + "step": 900 + }, + { + "epoch": 0.060534881379819466, + "grad_norm": 6.0813093185424805, + "learning_rate": 9.999998110078769e-05, + "loss": 3.4128, + "step": 902 + }, + { + "epoch": 0.06066910506358847, + "grad_norm": 6.08104133605957, + "learning_rate": 9.99999704699818e-05, + "loss": 3.6065, + "step": 904 + }, + { + "epoch": 0.06080332874735747, + "grad_norm": 5.642941474914551, + "learning_rate": 9.999995747677564e-05, + "loss": 3.2405, + "step": 906 + }, + { + "epoch": 0.060937552431126474, + "grad_norm": 8.11876106262207, + "learning_rate": 9.99999421211698e-05, + "loss": 3.3717, + "step": 908 + }, + { + "epoch": 0.061071776114895476, + "grad_norm": 11.752470016479492, + "learning_rate": 9.999992440316502e-05, + "loss": 3.4968, + "step": 910 + }, + { + "epoch": 0.06120599979866447, + "grad_norm": 9.452066421508789, + "learning_rate": 9.999990432276214e-05, + "loss": 3.5023, + "step": 912 + }, + { + "epoch": 0.061340223482433474, + "grad_norm": 6.29379415512085, + "learning_rate": 9.999988187996208e-05, + "loss": 3.6219, + "step": 914 + }, + { + "epoch": 0.06147444716620248, + "grad_norm": 8.226164817810059, + "learning_rate": 9.999985707476594e-05, + "loss": 3.5981, + "step": 916 + }, + { + "epoch": 0.06160867084997148, + "grad_norm": 9.218595504760742, + "learning_rate": 9.999982990717487e-05, + "loss": 3.5708, + "step": 918 + }, + { + "epoch": 0.06174289453374048, + "grad_norm": 6.205141067504883, + "learning_rate": 9.999980037719016e-05, + "loss": 3.4458, + "step": 920 + }, + { + "epoch": 0.06187711821750948, + "grad_norm": 6.289495468139648, + "learning_rate": 9.99997684848132e-05, + "loss": 3.6472, + "step": 922 + }, + { + "epoch": 0.06201134190127848, + "grad_norm": 6.385005474090576, + "learning_rate": 9.99997342300455e-05, + "loss": 3.5303, + "step": 924 + }, + { + "epoch": 0.06214556558504748, + "grad_norm": 11.002243041992188, + "learning_rate": 9.999969761288868e-05, + "loss": 3.5445, + "step": 926 + }, + { + "epoch": 0.062279789268816485, + "grad_norm": 5.7877984046936035, + "learning_rate": 9.999965863334445e-05, + "loss": 3.3564, + "step": 928 + }, + { + "epoch": 0.06241401295258548, + "grad_norm": 6.728155612945557, + "learning_rate": 9.99996172914147e-05, + "loss": 3.6947, + "step": 930 + }, + { + "epoch": 0.06254823663635449, + "grad_norm": 5.656542778015137, + "learning_rate": 9.999957358710132e-05, + "loss": 3.4287, + "step": 932 + }, + { + "epoch": 0.06268246032012349, + "grad_norm": 8.826468467712402, + "learning_rate": 9.999952752040643e-05, + "loss": 3.7097, + "step": 934 + }, + { + "epoch": 0.06281668400389248, + "grad_norm": 6.318328380584717, + "learning_rate": 9.999947909133219e-05, + "loss": 3.4995, + "step": 936 + }, + { + "epoch": 0.06295090768766148, + "grad_norm": 5.421375274658203, + "learning_rate": 9.999942829988086e-05, + "loss": 3.225, + "step": 938 + }, + { + "epoch": 0.06308513137143049, + "grad_norm": 8.352094650268555, + "learning_rate": 9.999937514605486e-05, + "loss": 3.335, + "step": 940 + }, + { + "epoch": 0.06321935505519949, + "grad_norm": 6.500336647033691, + "learning_rate": 9.999931962985674e-05, + "loss": 3.448, + "step": 942 + }, + { + "epoch": 0.06335357873896849, + "grad_norm": 12.906319618225098, + "learning_rate": 9.999926175128905e-05, + "loss": 3.5281, + "step": 944 + }, + { + "epoch": 0.0634878024227375, + "grad_norm": 11.743858337402344, + "learning_rate": 9.999920151035458e-05, + "loss": 3.7445, + "step": 946 + }, + { + "epoch": 0.0636220261065065, + "grad_norm": 5.939197063446045, + "learning_rate": 9.999913890705616e-05, + "loss": 3.4796, + "step": 948 + }, + { + "epoch": 0.0637562497902755, + "grad_norm": 7.059786796569824, + "learning_rate": 9.999907394139674e-05, + "loss": 3.4885, + "step": 950 + }, + { + "epoch": 0.0638904734740445, + "grad_norm": 7.132233142852783, + "learning_rate": 9.99990066133794e-05, + "loss": 3.5528, + "step": 952 + }, + { + "epoch": 0.06402469715781349, + "grad_norm": 6.480830192565918, + "learning_rate": 9.99989369230073e-05, + "loss": 3.3878, + "step": 954 + }, + { + "epoch": 0.06415892084158249, + "grad_norm": 6.6425957679748535, + "learning_rate": 9.999886487028376e-05, + "loss": 3.6726, + "step": 956 + }, + { + "epoch": 0.0642931445253515, + "grad_norm": 5.976042747497559, + "learning_rate": 9.999879045521218e-05, + "loss": 3.6266, + "step": 958 + }, + { + "epoch": 0.0644273682091205, + "grad_norm": 5.397177219390869, + "learning_rate": 9.999871367779606e-05, + "loss": 3.3703, + "step": 960 + }, + { + "epoch": 0.0645615918928895, + "grad_norm": 5.917455673217773, + "learning_rate": 9.999863453803904e-05, + "loss": 3.4946, + "step": 962 + }, + { + "epoch": 0.0646958155766585, + "grad_norm": 6.064168453216553, + "learning_rate": 9.999855303594485e-05, + "loss": 3.3582, + "step": 964 + }, + { + "epoch": 0.0648300392604275, + "grad_norm": 6.026656627655029, + "learning_rate": 9.999846917151737e-05, + "loss": 3.4174, + "step": 966 + }, + { + "epoch": 0.06496426294419651, + "grad_norm": 8.024328231811523, + "learning_rate": 9.999838294476051e-05, + "loss": 3.4529, + "step": 968 + }, + { + "epoch": 0.06509848662796551, + "grad_norm": 6.611804485321045, + "learning_rate": 9.99982943556784e-05, + "loss": 3.5304, + "step": 970 + }, + { + "epoch": 0.0652327103117345, + "grad_norm": 10.23029613494873, + "learning_rate": 9.999820340427517e-05, + "loss": 3.4491, + "step": 972 + }, + { + "epoch": 0.0653669339955035, + "grad_norm": 6.020142078399658, + "learning_rate": 9.999811009055518e-05, + "loss": 3.4011, + "step": 974 + }, + { + "epoch": 0.0655011576792725, + "grad_norm": 5.6800971031188965, + "learning_rate": 9.999801441452278e-05, + "loss": 3.1802, + "step": 976 + }, + { + "epoch": 0.0656353813630415, + "grad_norm": 6.159589767456055, + "learning_rate": 9.999791637618252e-05, + "loss": 3.3775, + "step": 978 + }, + { + "epoch": 0.06576960504681051, + "grad_norm": 6.277538299560547, + "learning_rate": 9.999781597553903e-05, + "loss": 3.3866, + "step": 980 + }, + { + "epoch": 0.06590382873057951, + "grad_norm": 10.414388656616211, + "learning_rate": 9.999771321259705e-05, + "loss": 3.5538, + "step": 982 + }, + { + "epoch": 0.06603805241434851, + "grad_norm": 5.688206195831299, + "learning_rate": 9.999760808736145e-05, + "loss": 3.5343, + "step": 984 + }, + { + "epoch": 0.06617227609811752, + "grad_norm": 9.720704078674316, + "learning_rate": 9.999750059983716e-05, + "loss": 3.3255, + "step": 986 + }, + { + "epoch": 0.06630649978188652, + "grad_norm": 6.03059196472168, + "learning_rate": 9.999739075002931e-05, + "loss": 3.6596, + "step": 988 + }, + { + "epoch": 0.06644072346565552, + "grad_norm": 6.467611312866211, + "learning_rate": 9.999727853794305e-05, + "loss": 3.7195, + "step": 990 + }, + { + "epoch": 0.06657494714942451, + "grad_norm": 6.139322757720947, + "learning_rate": 9.999716396358369e-05, + "loss": 3.5535, + "step": 992 + }, + { + "epoch": 0.06670917083319351, + "grad_norm": 8.892382621765137, + "learning_rate": 9.999704702695664e-05, + "loss": 3.6081, + "step": 994 + }, + { + "epoch": 0.06684339451696251, + "grad_norm": 7.700809478759766, + "learning_rate": 9.999692772806746e-05, + "loss": 3.8583, + "step": 996 + }, + { + "epoch": 0.06697761820073152, + "grad_norm": 8.791764259338379, + "learning_rate": 9.999680606692174e-05, + "loss": 3.4335, + "step": 998 + }, + { + "epoch": 0.06711184188450052, + "grad_norm": 5.53908109664917, + "learning_rate": 9.999668204352526e-05, + "loss": 3.3987, + "step": 1000 + }, + { + "epoch": 0.06724606556826952, + "grad_norm": 6.682313442230225, + "learning_rate": 9.999655565788385e-05, + "loss": 3.456, + "step": 1002 + }, + { + "epoch": 0.06738028925203852, + "grad_norm": 6.062136173248291, + "learning_rate": 9.99964269100035e-05, + "loss": 3.1766, + "step": 1004 + }, + { + "epoch": 0.06751451293580753, + "grad_norm": 8.87801742553711, + "learning_rate": 9.999629579989032e-05, + "loss": 3.2709, + "step": 1006 + }, + { + "epoch": 0.06764873661957653, + "grad_norm": 7.134387016296387, + "learning_rate": 9.999616232755045e-05, + "loss": 3.4257, + "step": 1008 + }, + { + "epoch": 0.06778296030334552, + "grad_norm": 7.788958549499512, + "learning_rate": 9.999602649299022e-05, + "loss": 3.3612, + "step": 1010 + }, + { + "epoch": 0.06791718398711452, + "grad_norm": 8.099238395690918, + "learning_rate": 9.999588829621606e-05, + "loss": 3.6442, + "step": 1012 + }, + { + "epoch": 0.06805140767088352, + "grad_norm": 5.430387020111084, + "learning_rate": 9.99957477372345e-05, + "loss": 3.4412, + "step": 1014 + }, + { + "epoch": 0.06818563135465253, + "grad_norm": 5.201018810272217, + "learning_rate": 9.999560481605217e-05, + "loss": 3.1863, + "step": 1016 + }, + { + "epoch": 0.06831985503842153, + "grad_norm": 6.469707012176514, + "learning_rate": 9.999545953267582e-05, + "loss": 3.3892, + "step": 1018 + }, + { + "epoch": 0.06845407872219053, + "grad_norm": 5.899304389953613, + "learning_rate": 9.999531188711232e-05, + "loss": 3.6244, + "step": 1020 + }, + { + "epoch": 0.06858830240595953, + "grad_norm": 6.014455318450928, + "learning_rate": 9.999516187936864e-05, + "loss": 3.2323, + "step": 1022 + }, + { + "epoch": 0.06872252608972854, + "grad_norm": 5.102768898010254, + "learning_rate": 9.999500950945188e-05, + "loss": 3.4378, + "step": 1024 + }, + { + "epoch": 0.06885674977349754, + "grad_norm": 5.54885721206665, + "learning_rate": 9.999485477736923e-05, + "loss": 3.3678, + "step": 1026 + }, + { + "epoch": 0.06899097345726654, + "grad_norm": 7.176478862762451, + "learning_rate": 9.999469768312799e-05, + "loss": 3.6779, + "step": 1028 + }, + { + "epoch": 0.06912519714103553, + "grad_norm": 6.093576908111572, + "learning_rate": 9.99945382267356e-05, + "loss": 3.5615, + "step": 1030 + }, + { + "epoch": 0.06925942082480453, + "grad_norm": 5.526006698608398, + "learning_rate": 9.999437640819959e-05, + "loss": 3.5801, + "step": 1032 + }, + { + "epoch": 0.06939364450857353, + "grad_norm": 5.540441989898682, + "learning_rate": 9.999421222752763e-05, + "loss": 3.4977, + "step": 1034 + }, + { + "epoch": 0.06952786819234254, + "grad_norm": 5.6648478507995605, + "learning_rate": 9.999404568472742e-05, + "loss": 3.5008, + "step": 1036 + }, + { + "epoch": 0.06966209187611154, + "grad_norm": 6.011509895324707, + "learning_rate": 9.999387677980687e-05, + "loss": 3.4769, + "step": 1038 + }, + { + "epoch": 0.06979631555988054, + "grad_norm": 9.40377426147461, + "learning_rate": 9.999370551277395e-05, + "loss": 3.3829, + "step": 1040 + }, + { + "epoch": 0.06993053924364954, + "grad_norm": 6.709582805633545, + "learning_rate": 9.999353188363676e-05, + "loss": 3.5491, + "step": 1042 + }, + { + "epoch": 0.07006476292741855, + "grad_norm": 11.932124137878418, + "learning_rate": 9.999335589240348e-05, + "loss": 3.4272, + "step": 1044 + }, + { + "epoch": 0.07019898661118755, + "grad_norm": 5.807068824768066, + "learning_rate": 9.999317753908246e-05, + "loss": 3.439, + "step": 1046 + }, + { + "epoch": 0.07033321029495654, + "grad_norm": 7.362298011779785, + "learning_rate": 9.999299682368211e-05, + "loss": 3.4947, + "step": 1048 + }, + { + "epoch": 0.07046743397872554, + "grad_norm": 6.588179111480713, + "learning_rate": 9.999281374621095e-05, + "loss": 3.2589, + "step": 1050 + }, + { + "epoch": 0.07060165766249454, + "grad_norm": 5.922714710235596, + "learning_rate": 9.999262830667766e-05, + "loss": 3.395, + "step": 1052 + }, + { + "epoch": 0.07073588134626355, + "grad_norm": 5.773919582366943, + "learning_rate": 9.999244050509098e-05, + "loss": 3.5153, + "step": 1054 + }, + { + "epoch": 0.07087010503003255, + "grad_norm": 9.401991844177246, + "learning_rate": 9.999225034145979e-05, + "loss": 3.1471, + "step": 1056 + }, + { + "epoch": 0.07100432871380155, + "grad_norm": 5.781289577484131, + "learning_rate": 9.999205781579309e-05, + "loss": 3.3965, + "step": 1058 + }, + { + "epoch": 0.07113855239757055, + "grad_norm": 5.418633460998535, + "learning_rate": 9.999186292809995e-05, + "loss": 3.4152, + "step": 1060 + }, + { + "epoch": 0.07127277608133956, + "grad_norm": 4.763847827911377, + "learning_rate": 9.99916656783896e-05, + "loss": 3.1999, + "step": 1062 + }, + { + "epoch": 0.07140699976510856, + "grad_norm": 5.772603511810303, + "learning_rate": 9.999146606667135e-05, + "loss": 3.3026, + "step": 1064 + }, + { + "epoch": 0.07154122344887756, + "grad_norm": 5.7941083908081055, + "learning_rate": 9.999126409295463e-05, + "loss": 3.4628, + "step": 1066 + }, + { + "epoch": 0.07167544713264655, + "grad_norm": 6.987176895141602, + "learning_rate": 9.999105975724898e-05, + "loss": 3.3658, + "step": 1068 + }, + { + "epoch": 0.07180967081641555, + "grad_norm": 5.47424840927124, + "learning_rate": 9.999085305956406e-05, + "loss": 3.2916, + "step": 1070 + }, + { + "epoch": 0.07194389450018455, + "grad_norm": 6.670589923858643, + "learning_rate": 9.999064399990964e-05, + "loss": 3.42, + "step": 1072 + }, + { + "epoch": 0.07207811818395356, + "grad_norm": 6.116762161254883, + "learning_rate": 9.999043257829561e-05, + "loss": 3.1421, + "step": 1074 + }, + { + "epoch": 0.07221234186772256, + "grad_norm": 6.477542400360107, + "learning_rate": 9.999021879473192e-05, + "loss": 3.1747, + "step": 1076 + }, + { + "epoch": 0.07234656555149156, + "grad_norm": 8.998074531555176, + "learning_rate": 9.99900026492287e-05, + "loss": 3.3105, + "step": 1078 + }, + { + "epoch": 0.07248078923526056, + "grad_norm": 8.048651695251465, + "learning_rate": 9.998978414179617e-05, + "loss": 3.3722, + "step": 1080 + }, + { + "epoch": 0.07261501291902957, + "grad_norm": 5.405158042907715, + "learning_rate": 9.998956327244462e-05, + "loss": 3.6694, + "step": 1082 + }, + { + "epoch": 0.07274923660279857, + "grad_norm": 5.328011512756348, + "learning_rate": 9.998934004118452e-05, + "loss": 3.2059, + "step": 1084 + }, + { + "epoch": 0.07288346028656756, + "grad_norm": 5.2418999671936035, + "learning_rate": 9.99891144480264e-05, + "loss": 3.2338, + "step": 1086 + }, + { + "epoch": 0.07301768397033656, + "grad_norm": 5.666139602661133, + "learning_rate": 9.99888864929809e-05, + "loss": 3.2501, + "step": 1088 + }, + { + "epoch": 0.07315190765410556, + "grad_norm": 5.636970520019531, + "learning_rate": 9.998865617605883e-05, + "loss": 3.3643, + "step": 1090 + }, + { + "epoch": 0.07328613133787457, + "grad_norm": 4.762028217315674, + "learning_rate": 9.998842349727107e-05, + "loss": 3.2604, + "step": 1092 + }, + { + "epoch": 0.07342035502164357, + "grad_norm": 5.527797222137451, + "learning_rate": 9.998818845662859e-05, + "loss": 3.3189, + "step": 1094 + }, + { + "epoch": 0.07355457870541257, + "grad_norm": 5.613569736480713, + "learning_rate": 9.998795105414248e-05, + "loss": 3.3428, + "step": 1096 + }, + { + "epoch": 0.07368880238918157, + "grad_norm": 6.939001083374023, + "learning_rate": 9.998771128982399e-05, + "loss": 3.4748, + "step": 1098 + }, + { + "epoch": 0.07382302607295058, + "grad_norm": 6.65857458114624, + "learning_rate": 9.998746916368444e-05, + "loss": 3.5409, + "step": 1100 + }, + { + "epoch": 0.07395724975671958, + "grad_norm": 5.536454677581787, + "learning_rate": 9.998722467573528e-05, + "loss": 3.4545, + "step": 1102 + }, + { + "epoch": 0.07409147344048858, + "grad_norm": 5.921494960784912, + "learning_rate": 9.998697782598804e-05, + "loss": 3.1985, + "step": 1104 + }, + { + "epoch": 0.07422569712425757, + "grad_norm": 8.644729614257812, + "learning_rate": 9.998672861445439e-05, + "loss": 3.5976, + "step": 1106 + }, + { + "epoch": 0.07435992080802657, + "grad_norm": 11.374105453491211, + "learning_rate": 9.998647704114612e-05, + "loss": 3.3512, + "step": 1108 + }, + { + "epoch": 0.07449414449179557, + "grad_norm": 5.59948205947876, + "learning_rate": 9.998622310607508e-05, + "loss": 3.2591, + "step": 1110 + }, + { + "epoch": 0.07462836817556458, + "grad_norm": 6.104866981506348, + "learning_rate": 9.998596680925331e-05, + "loss": 3.4759, + "step": 1112 + }, + { + "epoch": 0.07476259185933358, + "grad_norm": 6.387207508087158, + "learning_rate": 9.99857081506929e-05, + "loss": 3.3342, + "step": 1114 + }, + { + "epoch": 0.07489681554310258, + "grad_norm": 49.64759826660156, + "learning_rate": 9.998544713040608e-05, + "loss": 3.1628, + "step": 1116 + }, + { + "epoch": 0.07503103922687158, + "grad_norm": 6.476597309112549, + "learning_rate": 9.998518374840515e-05, + "loss": 3.5386, + "step": 1118 + }, + { + "epoch": 0.07516526291064059, + "grad_norm": 8.075818061828613, + "learning_rate": 9.998491800470259e-05, + "loss": 3.3858, + "step": 1120 + }, + { + "epoch": 0.07529948659440959, + "grad_norm": 7.310550689697266, + "learning_rate": 9.998464989931097e-05, + "loss": 3.3698, + "step": 1122 + }, + { + "epoch": 0.07543371027817858, + "grad_norm": 5.388381481170654, + "learning_rate": 9.998437943224292e-05, + "loss": 3.48, + "step": 1124 + }, + { + "epoch": 0.07556793396194758, + "grad_norm": 6.972514629364014, + "learning_rate": 9.998410660351121e-05, + "loss": 3.2821, + "step": 1126 + }, + { + "epoch": 0.07570215764571658, + "grad_norm": 5.622326374053955, + "learning_rate": 9.998383141312877e-05, + "loss": 3.359, + "step": 1128 + }, + { + "epoch": 0.07583638132948559, + "grad_norm": 10.619815826416016, + "learning_rate": 9.99835538611086e-05, + "loss": 3.3315, + "step": 1130 + }, + { + "epoch": 0.07597060501325459, + "grad_norm": 6.039434432983398, + "learning_rate": 9.998327394746378e-05, + "loss": 3.5254, + "step": 1132 + }, + { + "epoch": 0.07610482869702359, + "grad_norm": 7.552164554595947, + "learning_rate": 9.998299167220755e-05, + "loss": 3.5483, + "step": 1134 + }, + { + "epoch": 0.0762390523807926, + "grad_norm": 6.412265777587891, + "learning_rate": 9.998270703535326e-05, + "loss": 3.1456, + "step": 1136 + }, + { + "epoch": 0.0763732760645616, + "grad_norm": 5.766594886779785, + "learning_rate": 9.998242003691434e-05, + "loss": 3.6138, + "step": 1138 + }, + { + "epoch": 0.0765074997483306, + "grad_norm": 6.081409931182861, + "learning_rate": 9.998213067690436e-05, + "loss": 3.5893, + "step": 1140 + }, + { + "epoch": 0.07664172343209959, + "grad_norm": 5.574845314025879, + "learning_rate": 9.998183895533701e-05, + "loss": 3.5834, + "step": 1142 + }, + { + "epoch": 0.07677594711586859, + "grad_norm": 6.0622663497924805, + "learning_rate": 9.998154487222602e-05, + "loss": 3.4972, + "step": 1144 + }, + { + "epoch": 0.07691017079963759, + "grad_norm": 5.6902618408203125, + "learning_rate": 9.998124842758535e-05, + "loss": 3.4169, + "step": 1146 + }, + { + "epoch": 0.0770443944834066, + "grad_norm": 5.422920227050781, + "learning_rate": 9.998094962142897e-05, + "loss": 3.3244, + "step": 1148 + }, + { + "epoch": 0.0771786181671756, + "grad_norm": 5.074063301086426, + "learning_rate": 9.9980648453771e-05, + "loss": 3.2075, + "step": 1150 + }, + { + "epoch": 0.0773128418509446, + "grad_norm": 5.441277503967285, + "learning_rate": 9.998034492462567e-05, + "loss": 3.5534, + "step": 1152 + }, + { + "epoch": 0.0774470655347136, + "grad_norm": 6.959360599517822, + "learning_rate": 9.998003903400732e-05, + "loss": 3.3297, + "step": 1154 + }, + { + "epoch": 0.0775812892184826, + "grad_norm": 5.1406731605529785, + "learning_rate": 9.997973078193041e-05, + "loss": 3.5922, + "step": 1156 + }, + { + "epoch": 0.07771551290225161, + "grad_norm": 11.094526290893555, + "learning_rate": 9.99794201684095e-05, + "loss": 3.3475, + "step": 1158 + }, + { + "epoch": 0.07784973658602061, + "grad_norm": 5.445409297943115, + "learning_rate": 9.997910719345928e-05, + "loss": 3.4549, + "step": 1160 + }, + { + "epoch": 0.0779839602697896, + "grad_norm": 5.577290058135986, + "learning_rate": 9.997879185709453e-05, + "loss": 3.3619, + "step": 1162 + }, + { + "epoch": 0.0781181839535586, + "grad_norm": 11.976070404052734, + "learning_rate": 9.997847415933012e-05, + "loss": 3.0764, + "step": 1164 + }, + { + "epoch": 0.0782524076373276, + "grad_norm": 6.134497165679932, + "learning_rate": 9.997815410018111e-05, + "loss": 3.6408, + "step": 1166 + }, + { + "epoch": 0.0783866313210966, + "grad_norm": 9.704249382019043, + "learning_rate": 9.997783167966258e-05, + "loss": 3.5082, + "step": 1168 + }, + { + "epoch": 0.07852085500486561, + "grad_norm": 8.681211471557617, + "learning_rate": 9.997750689778978e-05, + "loss": 3.4357, + "step": 1170 + }, + { + "epoch": 0.07865507868863461, + "grad_norm": 8.367074966430664, + "learning_rate": 9.997717975457807e-05, + "loss": 3.4653, + "step": 1172 + }, + { + "epoch": 0.07878930237240361, + "grad_norm": 5.973531723022461, + "learning_rate": 9.997685025004288e-05, + "loss": 3.3465, + "step": 1174 + }, + { + "epoch": 0.07892352605617262, + "grad_norm": 9.62775993347168, + "learning_rate": 9.997651838419979e-05, + "loss": 3.3291, + "step": 1176 + }, + { + "epoch": 0.07905774973994162, + "grad_norm": 4.801080226898193, + "learning_rate": 9.997618415706448e-05, + "loss": 3.2543, + "step": 1178 + }, + { + "epoch": 0.07919197342371061, + "grad_norm": 37.14448928833008, + "learning_rate": 9.997584756865274e-05, + "loss": 3.4335, + "step": 1180 + }, + { + "epoch": 0.07932619710747961, + "grad_norm": 7.281774997711182, + "learning_rate": 9.997550861898049e-05, + "loss": 3.6716, + "step": 1182 + }, + { + "epoch": 0.07946042079124861, + "grad_norm": 9.717472076416016, + "learning_rate": 9.997516730806372e-05, + "loss": 3.4658, + "step": 1184 + }, + { + "epoch": 0.07959464447501761, + "grad_norm": 6.860538482666016, + "learning_rate": 9.997482363591857e-05, + "loss": 3.4278, + "step": 1186 + }, + { + "epoch": 0.07972886815878662, + "grad_norm": 9.284693717956543, + "learning_rate": 9.997447760256126e-05, + "loss": 3.4455, + "step": 1188 + }, + { + "epoch": 0.07986309184255562, + "grad_norm": 6.0754714012146, + "learning_rate": 9.997412920800817e-05, + "loss": 3.1804, + "step": 1190 + }, + { + "epoch": 0.07999731552632462, + "grad_norm": 5.935841083526611, + "learning_rate": 9.997377845227576e-05, + "loss": 3.4543, + "step": 1192 + }, + { + "epoch": 0.08013153921009362, + "grad_norm": 6.062462329864502, + "learning_rate": 9.997342533538056e-05, + "loss": 3.5291, + "step": 1194 + }, + { + "epoch": 0.08026576289386263, + "grad_norm": 7.05542516708374, + "learning_rate": 9.99730698573393e-05, + "loss": 3.3654, + "step": 1196 + }, + { + "epoch": 0.08039998657763163, + "grad_norm": 6.461819171905518, + "learning_rate": 9.997271201816873e-05, + "loss": 3.3347, + "step": 1198 + }, + { + "epoch": 0.08053421026140062, + "grad_norm": 5.816255569458008, + "learning_rate": 9.99723518178858e-05, + "loss": 3.2055, + "step": 1200 + }, + { + "epoch": 0.08066843394516962, + "grad_norm": 8.016623497009277, + "learning_rate": 9.997198925650753e-05, + "loss": 3.5794, + "step": 1202 + }, + { + "epoch": 0.08080265762893862, + "grad_norm": 5.3279266357421875, + "learning_rate": 9.9971624334051e-05, + "loss": 3.3944, + "step": 1204 + }, + { + "epoch": 0.08093688131270763, + "grad_norm": 6.095082759857178, + "learning_rate": 9.997125705053352e-05, + "loss": 3.3988, + "step": 1206 + }, + { + "epoch": 0.08107110499647663, + "grad_norm": 6.014826774597168, + "learning_rate": 9.997088740597237e-05, + "loss": 3.3655, + "step": 1208 + }, + { + "epoch": 0.08120532868024563, + "grad_norm": 5.735154151916504, + "learning_rate": 9.997051540038508e-05, + "loss": 3.227, + "step": 1210 + }, + { + "epoch": 0.08133955236401463, + "grad_norm": 4.937158107757568, + "learning_rate": 9.997014103378921e-05, + "loss": 3.2831, + "step": 1212 + }, + { + "epoch": 0.08147377604778364, + "grad_norm": 7.227793216705322, + "learning_rate": 9.996976430620241e-05, + "loss": 3.1704, + "step": 1214 + }, + { + "epoch": 0.08160799973155264, + "grad_norm": 5.940042495727539, + "learning_rate": 9.996938521764254e-05, + "loss": 3.266, + "step": 1216 + }, + { + "epoch": 0.08174222341532163, + "grad_norm": 9.42092514038086, + "learning_rate": 9.996900376812746e-05, + "loss": 3.5492, + "step": 1218 + }, + { + "epoch": 0.08187644709909063, + "grad_norm": 7.032357215881348, + "learning_rate": 9.996861995767522e-05, + "loss": 3.444, + "step": 1220 + }, + { + "epoch": 0.08201067078285963, + "grad_norm": 14.4237642288208, + "learning_rate": 9.996823378630393e-05, + "loss": 3.4667, + "step": 1222 + }, + { + "epoch": 0.08214489446662863, + "grad_norm": 5.706676483154297, + "learning_rate": 9.996784525403186e-05, + "loss": 3.2002, + "step": 1224 + }, + { + "epoch": 0.08227911815039764, + "grad_norm": 5.385760307312012, + "learning_rate": 9.996745436087736e-05, + "loss": 3.4557, + "step": 1226 + }, + { + "epoch": 0.08241334183416664, + "grad_norm": 6.752831935882568, + "learning_rate": 9.99670611068589e-05, + "loss": 3.5313, + "step": 1228 + }, + { + "epoch": 0.08254756551793564, + "grad_norm": 6.346020698547363, + "learning_rate": 9.996666549199505e-05, + "loss": 3.5123, + "step": 1230 + }, + { + "epoch": 0.08268178920170464, + "grad_norm": 6.008730411529541, + "learning_rate": 9.996626751630453e-05, + "loss": 3.4447, + "step": 1232 + }, + { + "epoch": 0.08281601288547365, + "grad_norm": 6.893509864807129, + "learning_rate": 9.996586717980611e-05, + "loss": 3.2355, + "step": 1234 + }, + { + "epoch": 0.08295023656924265, + "grad_norm": 6.833907604217529, + "learning_rate": 9.996546448251871e-05, + "loss": 3.4422, + "step": 1236 + }, + { + "epoch": 0.08308446025301164, + "grad_norm": 5.142963409423828, + "learning_rate": 9.996505942446139e-05, + "loss": 3.2837, + "step": 1238 + }, + { + "epoch": 0.08321868393678064, + "grad_norm": 5.610093593597412, + "learning_rate": 9.996465200565324e-05, + "loss": 3.7045, + "step": 1240 + }, + { + "epoch": 0.08335290762054964, + "grad_norm": 5.881671905517578, + "learning_rate": 9.996424222611356e-05, + "loss": 3.1158, + "step": 1242 + }, + { + "epoch": 0.08348713130431865, + "grad_norm": 7.472585201263428, + "learning_rate": 9.996383008586165e-05, + "loss": 3.2884, + "step": 1244 + }, + { + "epoch": 0.08362135498808765, + "grad_norm": 6.149244785308838, + "learning_rate": 9.996341558491706e-05, + "loss": 3.6347, + "step": 1246 + }, + { + "epoch": 0.08375557867185665, + "grad_norm": 6.998498439788818, + "learning_rate": 9.996299872329931e-05, + "loss": 3.2658, + "step": 1248 + }, + { + "epoch": 0.08388980235562565, + "grad_norm": 4.736637592315674, + "learning_rate": 9.996257950102811e-05, + "loss": 3.0909, + "step": 1250 + }, + { + "epoch": 0.08402402603939466, + "grad_norm": 6.211285591125488, + "learning_rate": 9.996215791812328e-05, + "loss": 3.224, + "step": 1252 + }, + { + "epoch": 0.08415824972316366, + "grad_norm": 17.857206344604492, + "learning_rate": 9.996173397460475e-05, + "loss": 3.5145, + "step": 1254 + }, + { + "epoch": 0.08429247340693265, + "grad_norm": 9.011795043945312, + "learning_rate": 9.996130767049252e-05, + "loss": 3.0956, + "step": 1256 + }, + { + "epoch": 0.08442669709070165, + "grad_norm": 6.894383430480957, + "learning_rate": 9.996087900580675e-05, + "loss": 3.2413, + "step": 1258 + }, + { + "epoch": 0.08456092077447065, + "grad_norm": 5.806680202484131, + "learning_rate": 9.996044798056769e-05, + "loss": 3.423, + "step": 1260 + }, + { + "epoch": 0.08469514445823965, + "grad_norm": 7.171674728393555, + "learning_rate": 9.996001459479572e-05, + "loss": 3.2966, + "step": 1262 + }, + { + "epoch": 0.08482936814200866, + "grad_norm": 5.4411702156066895, + "learning_rate": 9.995957884851129e-05, + "loss": 3.3403, + "step": 1264 + }, + { + "epoch": 0.08496359182577766, + "grad_norm": 4.708560466766357, + "learning_rate": 9.995914074173501e-05, + "loss": 3.0728, + "step": 1266 + }, + { + "epoch": 0.08509781550954666, + "grad_norm": 5.711141109466553, + "learning_rate": 9.995870027448756e-05, + "loss": 3.3948, + "step": 1268 + }, + { + "epoch": 0.08523203919331566, + "grad_norm": 5.8343658447265625, + "learning_rate": 9.995825744678976e-05, + "loss": 3.471, + "step": 1270 + }, + { + "epoch": 0.08536626287708467, + "grad_norm": 4.717342853546143, + "learning_rate": 9.995781225866254e-05, + "loss": 3.2019, + "step": 1272 + }, + { + "epoch": 0.08550048656085366, + "grad_norm": 6.064172744750977, + "learning_rate": 9.995736471012693e-05, + "loss": 3.5814, + "step": 1274 + }, + { + "epoch": 0.08563471024462266, + "grad_norm": 4.9020304679870605, + "learning_rate": 9.995691480120408e-05, + "loss": 3.1876, + "step": 1276 + }, + { + "epoch": 0.08576893392839166, + "grad_norm": 5.300848484039307, + "learning_rate": 9.995646253191522e-05, + "loss": 3.5007, + "step": 1278 + }, + { + "epoch": 0.08590315761216066, + "grad_norm": 6.633376598358154, + "learning_rate": 9.995600790228176e-05, + "loss": 3.2068, + "step": 1280 + }, + { + "epoch": 0.08603738129592967, + "grad_norm": 5.815533638000488, + "learning_rate": 9.995555091232516e-05, + "loss": 3.1289, + "step": 1282 + }, + { + "epoch": 0.08617160497969867, + "grad_norm": 5.219507217407227, + "learning_rate": 9.995509156206701e-05, + "loss": 3.211, + "step": 1284 + }, + { + "epoch": 0.08630582866346767, + "grad_norm": 5.926776885986328, + "learning_rate": 9.995462985152902e-05, + "loss": 3.3846, + "step": 1286 + }, + { + "epoch": 0.08644005234723667, + "grad_norm": 6.506069660186768, + "learning_rate": 9.995416578073299e-05, + "loss": 3.0443, + "step": 1288 + }, + { + "epoch": 0.08657427603100568, + "grad_norm": 8.011123657226562, + "learning_rate": 9.995369934970085e-05, + "loss": 3.1737, + "step": 1290 + }, + { + "epoch": 0.08670849971477468, + "grad_norm": 7.4286370277404785, + "learning_rate": 9.995323055845466e-05, + "loss": 3.0934, + "step": 1292 + }, + { + "epoch": 0.08684272339854367, + "grad_norm": 7.9222540855407715, + "learning_rate": 9.995275940701657e-05, + "loss": 3.3185, + "step": 1294 + }, + { + "epoch": 0.08697694708231267, + "grad_norm": 5.114498615264893, + "learning_rate": 9.995228589540881e-05, + "loss": 3.2255, + "step": 1296 + }, + { + "epoch": 0.08711117076608167, + "grad_norm": 5.586244583129883, + "learning_rate": 9.995181002365376e-05, + "loss": 3.3593, + "step": 1298 + }, + { + "epoch": 0.08724539444985067, + "grad_norm": 7.384498596191406, + "learning_rate": 9.995133179177391e-05, + "loss": 3.071, + "step": 1300 + }, + { + "epoch": 0.08737961813361968, + "grad_norm": 13.150132179260254, + "learning_rate": 9.995085119979189e-05, + "loss": 3.1267, + "step": 1302 + }, + { + "epoch": 0.08751384181738868, + "grad_norm": 5.484643459320068, + "learning_rate": 9.995036824773034e-05, + "loss": 3.1915, + "step": 1304 + }, + { + "epoch": 0.08764806550115768, + "grad_norm": 6.346102237701416, + "learning_rate": 9.994988293561213e-05, + "loss": 3.4418, + "step": 1306 + }, + { + "epoch": 0.08778228918492668, + "grad_norm": 5.941910266876221, + "learning_rate": 9.994939526346016e-05, + "loss": 3.4018, + "step": 1308 + }, + { + "epoch": 0.08791651286869569, + "grad_norm": 6.629932403564453, + "learning_rate": 9.99489052312975e-05, + "loss": 2.9861, + "step": 1310 + }, + { + "epoch": 0.08805073655246468, + "grad_norm": 5.917767524719238, + "learning_rate": 9.99484128391473e-05, + "loss": 3.1348, + "step": 1312 + }, + { + "epoch": 0.08818496023623368, + "grad_norm": 5.490838050842285, + "learning_rate": 9.994791808703279e-05, + "loss": 3.0756, + "step": 1314 + }, + { + "epoch": 0.08831918392000268, + "grad_norm": 5.195504665374756, + "learning_rate": 9.994742097497737e-05, + "loss": 3.6349, + "step": 1316 + }, + { + "epoch": 0.08845340760377168, + "grad_norm": 5.061168670654297, + "learning_rate": 9.994692150300453e-05, + "loss": 3.3093, + "step": 1318 + }, + { + "epoch": 0.08858763128754069, + "grad_norm": 5.497848033905029, + "learning_rate": 9.994641967113787e-05, + "loss": 3.2055, + "step": 1320 + }, + { + "epoch": 0.08872185497130969, + "grad_norm": 5.648294448852539, + "learning_rate": 9.994591547940109e-05, + "loss": 2.9543, + "step": 1322 + }, + { + "epoch": 0.08885607865507869, + "grad_norm": 4.940004348754883, + "learning_rate": 9.994540892781802e-05, + "loss": 3.4539, + "step": 1324 + }, + { + "epoch": 0.0889903023388477, + "grad_norm": 5.000607967376709, + "learning_rate": 9.994490001641258e-05, + "loss": 3.2719, + "step": 1326 + }, + { + "epoch": 0.0891245260226167, + "grad_norm": 4.678399562835693, + "learning_rate": 9.994438874520885e-05, + "loss": 3.2593, + "step": 1328 + }, + { + "epoch": 0.0892587497063857, + "grad_norm": 5.138744831085205, + "learning_rate": 9.994387511423096e-05, + "loss": 3.2957, + "step": 1330 + }, + { + "epoch": 0.08939297339015469, + "grad_norm": 5.105295658111572, + "learning_rate": 9.994335912350317e-05, + "loss": 3.3528, + "step": 1332 + }, + { + "epoch": 0.08952719707392369, + "grad_norm": 4.650461196899414, + "learning_rate": 9.994284077304987e-05, + "loss": 3.2752, + "step": 1334 + }, + { + "epoch": 0.08966142075769269, + "grad_norm": 4.988102912902832, + "learning_rate": 9.994232006289554e-05, + "loss": 3.3209, + "step": 1336 + }, + { + "epoch": 0.0897956444414617, + "grad_norm": 4.811791896820068, + "learning_rate": 9.994179699306483e-05, + "loss": 3.1127, + "step": 1338 + }, + { + "epoch": 0.0899298681252307, + "grad_norm": 5.037345886230469, + "learning_rate": 9.99412715635824e-05, + "loss": 3.7285, + "step": 1340 + }, + { + "epoch": 0.0900640918089997, + "grad_norm": 12.744126319885254, + "learning_rate": 9.994074377447309e-05, + "loss": 3.3142, + "step": 1342 + }, + { + "epoch": 0.0901983154927687, + "grad_norm": 5.314663887023926, + "learning_rate": 9.994021362576184e-05, + "loss": 3.1447, + "step": 1344 + }, + { + "epoch": 0.0903325391765377, + "grad_norm": 5.708115100860596, + "learning_rate": 9.99396811174737e-05, + "loss": 3.3273, + "step": 1346 + }, + { + "epoch": 0.09046676286030671, + "grad_norm": 4.934871196746826, + "learning_rate": 9.993914624963383e-05, + "loss": 3.1997, + "step": 1348 + }, + { + "epoch": 0.0906009865440757, + "grad_norm": 5.502106189727783, + "learning_rate": 9.99386090222675e-05, + "loss": 3.1939, + "step": 1350 + }, + { + "epoch": 0.0907352102278447, + "grad_norm": 5.3581976890563965, + "learning_rate": 9.993806943540009e-05, + "loss": 3.3245, + "step": 1352 + }, + { + "epoch": 0.0908694339116137, + "grad_norm": 11.048656463623047, + "learning_rate": 9.993752748905712e-05, + "loss": 3.2456, + "step": 1354 + }, + { + "epoch": 0.0910036575953827, + "grad_norm": 5.652985095977783, + "learning_rate": 9.993698318326416e-05, + "loss": 3.2566, + "step": 1356 + }, + { + "epoch": 0.0911378812791517, + "grad_norm": 5.679229736328125, + "learning_rate": 9.993643651804694e-05, + "loss": 3.4141, + "step": 1358 + }, + { + "epoch": 0.09127210496292071, + "grad_norm": 5.4820356369018555, + "learning_rate": 9.99358874934313e-05, + "loss": 3.4955, + "step": 1360 + }, + { + "epoch": 0.09140632864668971, + "grad_norm": 5.1083173751831055, + "learning_rate": 9.993533610944315e-05, + "loss": 3.3104, + "step": 1362 + }, + { + "epoch": 0.09154055233045871, + "grad_norm": 6.494790077209473, + "learning_rate": 9.993478236610858e-05, + "loss": 3.3376, + "step": 1364 + }, + { + "epoch": 0.09167477601422772, + "grad_norm": 6.635143280029297, + "learning_rate": 9.993422626345373e-05, + "loss": 3.4073, + "step": 1366 + }, + { + "epoch": 0.09180899969799672, + "grad_norm": 5.743031978607178, + "learning_rate": 9.993366780150488e-05, + "loss": 3.5089, + "step": 1368 + }, + { + "epoch": 0.0919432233817657, + "grad_norm": 8.172161102294922, + "learning_rate": 9.993310698028842e-05, + "loss": 3.1719, + "step": 1370 + }, + { + "epoch": 0.09207744706553471, + "grad_norm": 7.374875068664551, + "learning_rate": 9.993254379983084e-05, + "loss": 3.323, + "step": 1372 + }, + { + "epoch": 0.09221167074930371, + "grad_norm": 6.428849697113037, + "learning_rate": 9.993197826015874e-05, + "loss": 3.5799, + "step": 1374 + }, + { + "epoch": 0.09234589443307271, + "grad_norm": 4.769272804260254, + "learning_rate": 9.993141036129887e-05, + "loss": 3.1663, + "step": 1376 + }, + { + "epoch": 0.09248011811684172, + "grad_norm": 7.206545352935791, + "learning_rate": 9.993084010327804e-05, + "loss": 3.308, + "step": 1378 + }, + { + "epoch": 0.09261434180061072, + "grad_norm": 5.589379787445068, + "learning_rate": 9.993026748612322e-05, + "loss": 3.2205, + "step": 1380 + }, + { + "epoch": 0.09274856548437972, + "grad_norm": 10.826541900634766, + "learning_rate": 9.992969250986142e-05, + "loss": 3.4514, + "step": 1382 + }, + { + "epoch": 0.09288278916814872, + "grad_norm": 7.835822582244873, + "learning_rate": 9.992911517451985e-05, + "loss": 3.4218, + "step": 1384 + }, + { + "epoch": 0.09301701285191773, + "grad_norm": 6.128800868988037, + "learning_rate": 9.992853548012576e-05, + "loss": 3.4036, + "step": 1386 + }, + { + "epoch": 0.09315123653568672, + "grad_norm": 5.190601348876953, + "learning_rate": 9.992795342670656e-05, + "loss": 3.148, + "step": 1388 + }, + { + "epoch": 0.09328546021945572, + "grad_norm": 6.64279317855835, + "learning_rate": 9.992736901428971e-05, + "loss": 3.15, + "step": 1390 + }, + { + "epoch": 0.09341968390322472, + "grad_norm": 5.452963829040527, + "learning_rate": 9.992678224290288e-05, + "loss": 3.1168, + "step": 1392 + }, + { + "epoch": 0.09355390758699372, + "grad_norm": 8.23900318145752, + "learning_rate": 9.992619311257376e-05, + "loss": 3.2054, + "step": 1394 + }, + { + "epoch": 0.09368813127076273, + "grad_norm": 5.5644683837890625, + "learning_rate": 9.992560162333019e-05, + "loss": 3.2592, + "step": 1396 + }, + { + "epoch": 0.09382235495453173, + "grad_norm": 7.396058559417725, + "learning_rate": 9.992500777520011e-05, + "loss": 3.4178, + "step": 1398 + }, + { + "epoch": 0.09395657863830073, + "grad_norm": 6.023173809051514, + "learning_rate": 9.99244115682116e-05, + "loss": 3.4235, + "step": 1400 + }, + { + "epoch": 0.09409080232206973, + "grad_norm": 6.518898010253906, + "learning_rate": 9.992381300239281e-05, + "loss": 3.2812, + "step": 1402 + }, + { + "epoch": 0.09422502600583874, + "grad_norm": 5.112799167633057, + "learning_rate": 9.992321207777202e-05, + "loss": 3.236, + "step": 1404 + }, + { + "epoch": 0.09435924968960774, + "grad_norm": 6.0553364753723145, + "learning_rate": 9.992260879437763e-05, + "loss": 2.9559, + "step": 1406 + }, + { + "epoch": 0.09449347337337673, + "grad_norm": 5.628736972808838, + "learning_rate": 9.992200315223815e-05, + "loss": 3.2496, + "step": 1408 + }, + { + "epoch": 0.09462769705714573, + "grad_norm": 5.198064804077148, + "learning_rate": 9.992139515138219e-05, + "loss": 3.3915, + "step": 1410 + }, + { + "epoch": 0.09476192074091473, + "grad_norm": 7.422445774078369, + "learning_rate": 9.992078479183847e-05, + "loss": 3.34, + "step": 1412 + }, + { + "epoch": 0.09489614442468373, + "grad_norm": 8.453798294067383, + "learning_rate": 9.992017207363584e-05, + "loss": 3.2566, + "step": 1414 + }, + { + "epoch": 0.09503036810845274, + "grad_norm": 5.395920753479004, + "learning_rate": 9.991955699680322e-05, + "loss": 3.1437, + "step": 1416 + }, + { + "epoch": 0.09516459179222174, + "grad_norm": 5.582540512084961, + "learning_rate": 9.991893956136973e-05, + "loss": 3.6004, + "step": 1418 + }, + { + "epoch": 0.09529881547599074, + "grad_norm": 8.468682289123535, + "learning_rate": 9.991831976736447e-05, + "loss": 3.4174, + "step": 1420 + }, + { + "epoch": 0.09543303915975974, + "grad_norm": 5.373665809631348, + "learning_rate": 9.99176976148168e-05, + "loss": 3.3952, + "step": 1422 + }, + { + "epoch": 0.09556726284352875, + "grad_norm": 5.65083646774292, + "learning_rate": 9.991707310375604e-05, + "loss": 3.121, + "step": 1424 + }, + { + "epoch": 0.09570148652729774, + "grad_norm": 8.831767082214355, + "learning_rate": 9.991644623421176e-05, + "loss": 3.1193, + "step": 1426 + }, + { + "epoch": 0.09583571021106674, + "grad_norm": 6.369810104370117, + "learning_rate": 9.991581700621355e-05, + "loss": 3.2959, + "step": 1428 + }, + { + "epoch": 0.09596993389483574, + "grad_norm": 5.503120422363281, + "learning_rate": 9.991518541979113e-05, + "loss": 3.3179, + "step": 1430 + }, + { + "epoch": 0.09610415757860474, + "grad_norm": 5.957647800445557, + "learning_rate": 9.991455147497435e-05, + "loss": 3.3284, + "step": 1432 + }, + { + "epoch": 0.09623838126237375, + "grad_norm": 5.326055526733398, + "learning_rate": 9.991391517179318e-05, + "loss": 3.4781, + "step": 1434 + }, + { + "epoch": 0.09637260494614275, + "grad_norm": 6.5234198570251465, + "learning_rate": 9.991327651027765e-05, + "loss": 3.6498, + "step": 1436 + }, + { + "epoch": 0.09650682862991175, + "grad_norm": 5.438503742218018, + "learning_rate": 9.991263549045797e-05, + "loss": 3.214, + "step": 1438 + }, + { + "epoch": 0.09664105231368075, + "grad_norm": 5.163165092468262, + "learning_rate": 9.991199211236442e-05, + "loss": 3.224, + "step": 1440 + }, + { + "epoch": 0.09677527599744976, + "grad_norm": 5.548779010772705, + "learning_rate": 9.991134637602737e-05, + "loss": 3.1428, + "step": 1442 + }, + { + "epoch": 0.09690949968121874, + "grad_norm": 4.302369117736816, + "learning_rate": 9.991069828147737e-05, + "loss": 2.9269, + "step": 1444 + }, + { + "epoch": 0.09704372336498775, + "grad_norm": 13.575542449951172, + "learning_rate": 9.9910047828745e-05, + "loss": 3.3788, + "step": 1446 + }, + { + "epoch": 0.09717794704875675, + "grad_norm": 5.76348876953125, + "learning_rate": 9.990939501786103e-05, + "loss": 3.0906, + "step": 1448 + }, + { + "epoch": 0.09731217073252575, + "grad_norm": 5.5641303062438965, + "learning_rate": 9.990873984885629e-05, + "loss": 3.3252, + "step": 1450 + }, + { + "epoch": 0.09744639441629475, + "grad_norm": 5.386458873748779, + "learning_rate": 9.990808232176172e-05, + "loss": 3.3987, + "step": 1452 + }, + { + "epoch": 0.09758061810006376, + "grad_norm": 5.445592880249023, + "learning_rate": 9.99074224366084e-05, + "loss": 3.2361, + "step": 1454 + }, + { + "epoch": 0.09771484178383276, + "grad_norm": 5.548768043518066, + "learning_rate": 9.990676019342752e-05, + "loss": 3.1857, + "step": 1456 + }, + { + "epoch": 0.09784906546760176, + "grad_norm": 7.846155166625977, + "learning_rate": 9.990609559225036e-05, + "loss": 3.2907, + "step": 1458 + }, + { + "epoch": 0.09798328915137076, + "grad_norm": 4.604873180389404, + "learning_rate": 9.990542863310831e-05, + "loss": 2.8918, + "step": 1460 + }, + { + "epoch": 0.09811751283513977, + "grad_norm": 5.658130168914795, + "learning_rate": 9.990475931603289e-05, + "loss": 3.478, + "step": 1462 + }, + { + "epoch": 0.09825173651890876, + "grad_norm": 4.8975725173950195, + "learning_rate": 9.990408764105575e-05, + "loss": 3.297, + "step": 1464 + }, + { + "epoch": 0.09838596020267776, + "grad_norm": 7.686820030212402, + "learning_rate": 9.990341360820856e-05, + "loss": 3.433, + "step": 1466 + }, + { + "epoch": 0.09852018388644676, + "grad_norm": 10.522367477416992, + "learning_rate": 9.990273721752324e-05, + "loss": 3.4131, + "step": 1468 + }, + { + "epoch": 0.09865440757021576, + "grad_norm": 6.470432758331299, + "learning_rate": 9.99020584690317e-05, + "loss": 3.1754, + "step": 1470 + }, + { + "epoch": 0.09878863125398477, + "grad_norm": 7.206165790557861, + "learning_rate": 9.990137736276604e-05, + "loss": 3.2327, + "step": 1472 + }, + { + "epoch": 0.09892285493775377, + "grad_norm": 5.666351795196533, + "learning_rate": 9.990069389875843e-05, + "loss": 3.1697, + "step": 1474 + }, + { + "epoch": 0.09905707862152277, + "grad_norm": 5.71097993850708, + "learning_rate": 9.990000807704114e-05, + "loss": 3.1172, + "step": 1476 + }, + { + "epoch": 0.09919130230529177, + "grad_norm": 5.661677360534668, + "learning_rate": 9.98993198976466e-05, + "loss": 3.3257, + "step": 1478 + }, + { + "epoch": 0.09932552598906078, + "grad_norm": 8.711248397827148, + "learning_rate": 9.989862936060731e-05, + "loss": 3.2827, + "step": 1480 + }, + { + "epoch": 0.09945974967282976, + "grad_norm": 9.67746639251709, + "learning_rate": 9.989793646595591e-05, + "loss": 3.2182, + "step": 1482 + }, + { + "epoch": 0.09959397335659877, + "grad_norm": 7.961912631988525, + "learning_rate": 9.989724121372514e-05, + "loss": 3.0409, + "step": 1484 + }, + { + "epoch": 0.09972819704036777, + "grad_norm": 10.580859184265137, + "learning_rate": 9.989654360394782e-05, + "loss": 3.3434, + "step": 1486 + }, + { + "epoch": 0.09986242072413677, + "grad_norm": 6.013082027435303, + "learning_rate": 9.989584363665696e-05, + "loss": 3.31, + "step": 1488 + }, + { + "epoch": 0.09999664440790577, + "grad_norm": 6.003145217895508, + "learning_rate": 9.989514131188559e-05, + "loss": 3.1238, + "step": 1490 + }, + { + "epoch": 0.10013086809167478, + "grad_norm": 5.291478157043457, + "learning_rate": 9.989443662966691e-05, + "loss": 3.1066, + "step": 1492 + }, + { + "epoch": 0.10026509177544378, + "grad_norm": 6.97996187210083, + "learning_rate": 9.989372959003421e-05, + "loss": 3.3493, + "step": 1494 + }, + { + "epoch": 0.10039931545921278, + "grad_norm": 5.54588508605957, + "learning_rate": 9.98930201930209e-05, + "loss": 3.1271, + "step": 1496 + }, + { + "epoch": 0.10053353914298178, + "grad_norm": 5.932539463043213, + "learning_rate": 9.989230843866049e-05, + "loss": 3.2617, + "step": 1498 + }, + { + "epoch": 0.10066776282675079, + "grad_norm": 5.6464338302612305, + "learning_rate": 9.989159432698663e-05, + "loss": 3.0507, + "step": 1500 + }, + { + "epoch": 0.10080198651051978, + "grad_norm": 7.3423333168029785, + "learning_rate": 9.989087785803303e-05, + "loss": 3.2649, + "step": 1502 + }, + { + "epoch": 0.10093621019428878, + "grad_norm": 7.415172100067139, + "learning_rate": 9.989015903183357e-05, + "loss": 3.1304, + "step": 1504 + }, + { + "epoch": 0.10107043387805778, + "grad_norm": 7.95955753326416, + "learning_rate": 9.98894378484222e-05, + "loss": 3.5012, + "step": 1506 + }, + { + "epoch": 0.10120465756182678, + "grad_norm": 5.592349529266357, + "learning_rate": 9.988871430783298e-05, + "loss": 3.2036, + "step": 1508 + }, + { + "epoch": 0.10133888124559579, + "grad_norm": 5.054538726806641, + "learning_rate": 9.988798841010012e-05, + "loss": 3.2655, + "step": 1510 + }, + { + "epoch": 0.10147310492936479, + "grad_norm": 5.887115478515625, + "learning_rate": 9.98872601552579e-05, + "loss": 3.0772, + "step": 1512 + }, + { + "epoch": 0.10160732861313379, + "grad_norm": 5.638282299041748, + "learning_rate": 9.988652954334076e-05, + "loss": 3.1401, + "step": 1514 + }, + { + "epoch": 0.10174155229690279, + "grad_norm": 5.734525203704834, + "learning_rate": 9.988579657438317e-05, + "loss": 3.3598, + "step": 1516 + }, + { + "epoch": 0.1018757759806718, + "grad_norm": 5.234301567077637, + "learning_rate": 9.988506124841981e-05, + "loss": 3.1324, + "step": 1518 + }, + { + "epoch": 0.10200999966444078, + "grad_norm": 7.676272869110107, + "learning_rate": 9.98843235654854e-05, + "loss": 3.1265, + "step": 1520 + }, + { + "epoch": 0.10214422334820979, + "grad_norm": 4.645020961761475, + "learning_rate": 9.988358352561478e-05, + "loss": 2.8253, + "step": 1522 + }, + { + "epoch": 0.10227844703197879, + "grad_norm": 5.660773754119873, + "learning_rate": 9.988284112884294e-05, + "loss": 3.0385, + "step": 1524 + }, + { + "epoch": 0.10241267071574779, + "grad_norm": 5.834421634674072, + "learning_rate": 9.988209637520494e-05, + "loss": 3.3706, + "step": 1526 + }, + { + "epoch": 0.1025468943995168, + "grad_norm": 5.760612487792969, + "learning_rate": 9.988134926473598e-05, + "loss": 3.3838, + "step": 1528 + }, + { + "epoch": 0.1026811180832858, + "grad_norm": 4.664913654327393, + "learning_rate": 9.988059979747135e-05, + "loss": 3.1069, + "step": 1530 + }, + { + "epoch": 0.1028153417670548, + "grad_norm": 5.396612167358398, + "learning_rate": 9.987984797344648e-05, + "loss": 3.3132, + "step": 1532 + }, + { + "epoch": 0.1029495654508238, + "grad_norm": 5.569342136383057, + "learning_rate": 9.987909379269686e-05, + "loss": 3.2089, + "step": 1534 + }, + { + "epoch": 0.1030837891345928, + "grad_norm": 5.627209663391113, + "learning_rate": 9.987833725525815e-05, + "loss": 3.4794, + "step": 1536 + }, + { + "epoch": 0.1032180128183618, + "grad_norm": 5.886993408203125, + "learning_rate": 9.987757836116608e-05, + "loss": 3.15, + "step": 1538 + }, + { + "epoch": 0.1033522365021308, + "grad_norm": 5.083450794219971, + "learning_rate": 9.987681711045652e-05, + "loss": 3.2183, + "step": 1540 + }, + { + "epoch": 0.1034864601858998, + "grad_norm": 4.452432155609131, + "learning_rate": 9.987605350316542e-05, + "loss": 3.3367, + "step": 1542 + }, + { + "epoch": 0.1036206838696688, + "grad_norm": 5.707921504974365, + "learning_rate": 9.987528753932888e-05, + "loss": 3.2372, + "step": 1544 + }, + { + "epoch": 0.1037549075534378, + "grad_norm": 5.265520095825195, + "learning_rate": 9.987451921898307e-05, + "loss": 3.0654, + "step": 1546 + }, + { + "epoch": 0.1038891312372068, + "grad_norm": 6.164703369140625, + "learning_rate": 9.987374854216431e-05, + "loss": 3.3766, + "step": 1548 + }, + { + "epoch": 0.10402335492097581, + "grad_norm": 4.548923015594482, + "learning_rate": 9.9872975508909e-05, + "loss": 3.1964, + "step": 1550 + }, + { + "epoch": 0.10415757860474481, + "grad_norm": 5.389708995819092, + "learning_rate": 9.987220011925367e-05, + "loss": 2.8238, + "step": 1552 + }, + { + "epoch": 0.10429180228851381, + "grad_norm": 5.1769914627075195, + "learning_rate": 9.987142237323495e-05, + "loss": 3.1626, + "step": 1554 + }, + { + "epoch": 0.10442602597228282, + "grad_norm": 6.4823689460754395, + "learning_rate": 9.98706422708896e-05, + "loss": 3.3188, + "step": 1556 + }, + { + "epoch": 0.1045602496560518, + "grad_norm": 5.855801582336426, + "learning_rate": 9.986985981225445e-05, + "loss": 3.1819, + "step": 1558 + }, + { + "epoch": 0.1046944733398208, + "grad_norm": 5.281543731689453, + "learning_rate": 9.98690749973665e-05, + "loss": 3.1424, + "step": 1560 + }, + { + "epoch": 0.10482869702358981, + "grad_norm": 11.356325149536133, + "learning_rate": 9.986828782626282e-05, + "loss": 2.906, + "step": 1562 + }, + { + "epoch": 0.10496292070735881, + "grad_norm": 5.213841915130615, + "learning_rate": 9.986749829898061e-05, + "loss": 3.0625, + "step": 1564 + }, + { + "epoch": 0.10509714439112781, + "grad_norm": 4.866334915161133, + "learning_rate": 9.986670641555715e-05, + "loss": 3.3868, + "step": 1566 + }, + { + "epoch": 0.10523136807489682, + "grad_norm": 5.219162940979004, + "learning_rate": 9.986591217602988e-05, + "loss": 3.055, + "step": 1568 + }, + { + "epoch": 0.10536559175866582, + "grad_norm": 5.018749713897705, + "learning_rate": 9.986511558043631e-05, + "loss": 3.0454, + "step": 1570 + }, + { + "epoch": 0.10549981544243482, + "grad_norm": 4.99350118637085, + "learning_rate": 9.98643166288141e-05, + "loss": 2.9681, + "step": 1572 + }, + { + "epoch": 0.10563403912620382, + "grad_norm": 6.966550350189209, + "learning_rate": 9.986351532120097e-05, + "loss": 3.4073, + "step": 1574 + }, + { + "epoch": 0.10576826280997281, + "grad_norm": 5.450279712677002, + "learning_rate": 9.98627116576348e-05, + "loss": 2.9659, + "step": 1576 + }, + { + "epoch": 0.10590248649374182, + "grad_norm": 5.239212512969971, + "learning_rate": 9.986190563815355e-05, + "loss": 3.0306, + "step": 1578 + }, + { + "epoch": 0.10603671017751082, + "grad_norm": 6.291440486907959, + "learning_rate": 9.986109726279531e-05, + "loss": 2.9706, + "step": 1580 + }, + { + "epoch": 0.10617093386127982, + "grad_norm": 5.843376159667969, + "learning_rate": 9.986028653159826e-05, + "loss": 3.2687, + "step": 1582 + }, + { + "epoch": 0.10630515754504882, + "grad_norm": 5.656061172485352, + "learning_rate": 9.985947344460074e-05, + "loss": 3.456, + "step": 1584 + }, + { + "epoch": 0.10643938122881783, + "grad_norm": 6.17963171005249, + "learning_rate": 9.985865800184113e-05, + "loss": 3.2986, + "step": 1586 + }, + { + "epoch": 0.10657360491258683, + "grad_norm": 8.181617736816406, + "learning_rate": 9.985784020335798e-05, + "loss": 3.2845, + "step": 1588 + }, + { + "epoch": 0.10670782859635583, + "grad_norm": 4.9603681564331055, + "learning_rate": 9.985702004918992e-05, + "loss": 3.3383, + "step": 1590 + }, + { + "epoch": 0.10684205228012483, + "grad_norm": 5.438014030456543, + "learning_rate": 9.98561975393757e-05, + "loss": 3.4234, + "step": 1592 + }, + { + "epoch": 0.10697627596389384, + "grad_norm": 6.849715232849121, + "learning_rate": 9.985537267395418e-05, + "loss": 3.0628, + "step": 1594 + }, + { + "epoch": 0.10711049964766282, + "grad_norm": 5.137056350708008, + "learning_rate": 9.985454545296434e-05, + "loss": 3.1541, + "step": 1596 + }, + { + "epoch": 0.10724472333143183, + "grad_norm": 5.612728118896484, + "learning_rate": 9.985371587644526e-05, + "loss": 3.4082, + "step": 1598 + }, + { + "epoch": 0.10737894701520083, + "grad_norm": 4.71220064163208, + "learning_rate": 9.985288394443615e-05, + "loss": 3.1991, + "step": 1600 + }, + { + "epoch": 0.10751317069896983, + "grad_norm": 6.874771595001221, + "learning_rate": 9.98520496569763e-05, + "loss": 3.0938, + "step": 1602 + }, + { + "epoch": 0.10764739438273883, + "grad_norm": 7.147547245025635, + "learning_rate": 9.985121301410511e-05, + "loss": 3.0746, + "step": 1604 + }, + { + "epoch": 0.10778161806650784, + "grad_norm": 5.196357727050781, + "learning_rate": 9.985037401586217e-05, + "loss": 3.5965, + "step": 1606 + }, + { + "epoch": 0.10791584175027684, + "grad_norm": 5.115869045257568, + "learning_rate": 9.984953266228707e-05, + "loss": 3.2399, + "step": 1608 + }, + { + "epoch": 0.10805006543404584, + "grad_norm": 5.130259037017822, + "learning_rate": 9.984868895341957e-05, + "loss": 3.1369, + "step": 1610 + }, + { + "epoch": 0.10818428911781484, + "grad_norm": 4.5213942527771, + "learning_rate": 9.984784288929953e-05, + "loss": 3.0072, + "step": 1612 + }, + { + "epoch": 0.10831851280158383, + "grad_norm": 7.015380382537842, + "learning_rate": 9.984699446996697e-05, + "loss": 3.3073, + "step": 1614 + }, + { + "epoch": 0.10845273648535284, + "grad_norm": 11.1233549118042, + "learning_rate": 9.984614369546191e-05, + "loss": 3.4081, + "step": 1616 + }, + { + "epoch": 0.10858696016912184, + "grad_norm": 7.172938346862793, + "learning_rate": 9.984529056582459e-05, + "loss": 3.2607, + "step": 1618 + }, + { + "epoch": 0.10872118385289084, + "grad_norm": 5.0229105949401855, + "learning_rate": 9.984443508109531e-05, + "loss": 3.1651, + "step": 1620 + }, + { + "epoch": 0.10885540753665984, + "grad_norm": 6.559587478637695, + "learning_rate": 9.984357724131448e-05, + "loss": 3.2985, + "step": 1622 + }, + { + "epoch": 0.10898963122042885, + "grad_norm": 5.519924640655518, + "learning_rate": 9.984271704652263e-05, + "loss": 3.0166, + "step": 1624 + }, + { + "epoch": 0.10912385490419785, + "grad_norm": 5.887812614440918, + "learning_rate": 9.984185449676044e-05, + "loss": 3.0336, + "step": 1626 + }, + { + "epoch": 0.10925807858796685, + "grad_norm": 7.605010986328125, + "learning_rate": 9.984098959206863e-05, + "loss": 3.0836, + "step": 1628 + }, + { + "epoch": 0.10939230227173585, + "grad_norm": 8.086282730102539, + "learning_rate": 9.984012233248805e-05, + "loss": 3.0804, + "step": 1630 + }, + { + "epoch": 0.10952652595550486, + "grad_norm": 5.2925639152526855, + "learning_rate": 9.98392527180597e-05, + "loss": 3.1248, + "step": 1632 + }, + { + "epoch": 0.10966074963927384, + "grad_norm": 5.293577194213867, + "learning_rate": 9.983838074882467e-05, + "loss": 3.2394, + "step": 1634 + }, + { + "epoch": 0.10979497332304285, + "grad_norm": 8.610538482666016, + "learning_rate": 9.983750642482414e-05, + "loss": 3.1456, + "step": 1636 + }, + { + "epoch": 0.10992919700681185, + "grad_norm": 7.117098808288574, + "learning_rate": 9.983662974609945e-05, + "loss": 3.3048, + "step": 1638 + }, + { + "epoch": 0.11006342069058085, + "grad_norm": 7.877905368804932, + "learning_rate": 9.9835750712692e-05, + "loss": 3.0914, + "step": 1640 + }, + { + "epoch": 0.11019764437434985, + "grad_norm": 4.8839545249938965, + "learning_rate": 9.983486932464332e-05, + "loss": 3.0918, + "step": 1642 + }, + { + "epoch": 0.11033186805811886, + "grad_norm": 5.791496276855469, + "learning_rate": 9.983398558199506e-05, + "loss": 3.0983, + "step": 1644 + }, + { + "epoch": 0.11046609174188786, + "grad_norm": 6.990381717681885, + "learning_rate": 9.983309948478898e-05, + "loss": 3.3216, + "step": 1646 + }, + { + "epoch": 0.11060031542565686, + "grad_norm": 5.609792709350586, + "learning_rate": 9.983221103306695e-05, + "loss": 2.9153, + "step": 1648 + }, + { + "epoch": 0.11073453910942586, + "grad_norm": 9.12102222442627, + "learning_rate": 9.983132022687093e-05, + "loss": 3.0755, + "step": 1650 + }, + { + "epoch": 0.11086876279319485, + "grad_norm": 7.948161602020264, + "learning_rate": 9.983042706624302e-05, + "loss": 3.4237, + "step": 1652 + }, + { + "epoch": 0.11100298647696386, + "grad_norm": 4.841409683227539, + "learning_rate": 9.982953155122542e-05, + "loss": 3.0898, + "step": 1654 + }, + { + "epoch": 0.11113721016073286, + "grad_norm": 4.723064422607422, + "learning_rate": 9.982863368186044e-05, + "loss": 3.1947, + "step": 1656 + }, + { + "epoch": 0.11127143384450186, + "grad_norm": 4.872771739959717, + "learning_rate": 9.98277334581905e-05, + "loss": 3.1103, + "step": 1658 + }, + { + "epoch": 0.11140565752827086, + "grad_norm": 5.240569591522217, + "learning_rate": 9.982683088025813e-05, + "loss": 3.1447, + "step": 1660 + }, + { + "epoch": 0.11153988121203987, + "grad_norm": 5.995388984680176, + "learning_rate": 9.982592594810599e-05, + "loss": 3.3921, + "step": 1662 + }, + { + "epoch": 0.11167410489580887, + "grad_norm": 6.137365341186523, + "learning_rate": 9.982501866177682e-05, + "loss": 3.3363, + "step": 1664 + }, + { + "epoch": 0.11180832857957787, + "grad_norm": 6.419748306274414, + "learning_rate": 9.98241090213135e-05, + "loss": 3.1357, + "step": 1666 + }, + { + "epoch": 0.11194255226334687, + "grad_norm": 6.381359577178955, + "learning_rate": 9.982319702675901e-05, + "loss": 3.2538, + "step": 1668 + }, + { + "epoch": 0.11207677594711588, + "grad_norm": 9.51447582244873, + "learning_rate": 9.982228267815643e-05, + "loss": 3.0938, + "step": 1670 + }, + { + "epoch": 0.11221099963088486, + "grad_norm": 7.221292495727539, + "learning_rate": 9.982136597554896e-05, + "loss": 3.241, + "step": 1672 + }, + { + "epoch": 0.11234522331465387, + "grad_norm": 5.564941883087158, + "learning_rate": 9.982044691897991e-05, + "loss": 2.9077, + "step": 1674 + }, + { + "epoch": 0.11247944699842287, + "grad_norm": 4.436361789703369, + "learning_rate": 9.981952550849273e-05, + "loss": 3.0312, + "step": 1676 + }, + { + "epoch": 0.11261367068219187, + "grad_norm": 4.935223579406738, + "learning_rate": 9.981860174413092e-05, + "loss": 3.2345, + "step": 1678 + }, + { + "epoch": 0.11274789436596087, + "grad_norm": 5.594458103179932, + "learning_rate": 9.981767562593815e-05, + "loss": 2.8401, + "step": 1680 + }, + { + "epoch": 0.11288211804972988, + "grad_norm": 4.991801738739014, + "learning_rate": 9.981674715395816e-05, + "loss": 3.1998, + "step": 1682 + }, + { + "epoch": 0.11301634173349888, + "grad_norm": 5.285741806030273, + "learning_rate": 9.981581632823485e-05, + "loss": 3.2872, + "step": 1684 + }, + { + "epoch": 0.11315056541726788, + "grad_norm": 5.913918495178223, + "learning_rate": 9.981488314881215e-05, + "loss": 3.3428, + "step": 1686 + }, + { + "epoch": 0.11328478910103688, + "grad_norm": 4.796611785888672, + "learning_rate": 9.981394761573419e-05, + "loss": 3.0236, + "step": 1688 + }, + { + "epoch": 0.11341901278480587, + "grad_norm": 5.216083526611328, + "learning_rate": 9.981300972904515e-05, + "loss": 3.2191, + "step": 1690 + }, + { + "epoch": 0.11355323646857488, + "grad_norm": 6.207251071929932, + "learning_rate": 9.981206948878937e-05, + "loss": 3.0901, + "step": 1692 + }, + { + "epoch": 0.11368746015234388, + "grad_norm": 4.577812194824219, + "learning_rate": 9.981112689501126e-05, + "loss": 3.2282, + "step": 1694 + }, + { + "epoch": 0.11382168383611288, + "grad_norm": 7.139584064483643, + "learning_rate": 9.981018194775533e-05, + "loss": 3.2101, + "step": 1696 + }, + { + "epoch": 0.11395590751988188, + "grad_norm": 6.850059509277344, + "learning_rate": 9.980923464706627e-05, + "loss": 3.5516, + "step": 1698 + }, + { + "epoch": 0.11409013120365089, + "grad_norm": 5.1064653396606445, + "learning_rate": 9.980828499298882e-05, + "loss": 3.4808, + "step": 1700 + }, + { + "epoch": 0.11422435488741989, + "grad_norm": 5.6978759765625, + "learning_rate": 9.980733298556783e-05, + "loss": 3.2386, + "step": 1702 + }, + { + "epoch": 0.11435857857118889, + "grad_norm": 4.97546911239624, + "learning_rate": 9.980637862484832e-05, + "loss": 3.2591, + "step": 1704 + }, + { + "epoch": 0.11449280225495789, + "grad_norm": 11.24573802947998, + "learning_rate": 9.980542191087535e-05, + "loss": 3.2182, + "step": 1706 + }, + { + "epoch": 0.1146270259387269, + "grad_norm": 6.866262912750244, + "learning_rate": 9.980446284369413e-05, + "loss": 3.1405, + "step": 1708 + }, + { + "epoch": 0.11476124962249588, + "grad_norm": 5.927145481109619, + "learning_rate": 9.980350142334998e-05, + "loss": 3.161, + "step": 1710 + }, + { + "epoch": 0.11489547330626489, + "grad_norm": 4.782253742218018, + "learning_rate": 9.980253764988832e-05, + "loss": 3.3736, + "step": 1712 + }, + { + "epoch": 0.11502969699003389, + "grad_norm": 6.354815483093262, + "learning_rate": 9.980157152335467e-05, + "loss": 3.2203, + "step": 1714 + }, + { + "epoch": 0.11516392067380289, + "grad_norm": 4.765363693237305, + "learning_rate": 9.980060304379472e-05, + "loss": 3.1808, + "step": 1716 + }, + { + "epoch": 0.1152981443575719, + "grad_norm": 4.743289947509766, + "learning_rate": 9.979963221125421e-05, + "loss": 2.8858, + "step": 1718 + }, + { + "epoch": 0.1154323680413409, + "grad_norm": 5.87974739074707, + "learning_rate": 9.9798659025779e-05, + "loss": 3.0633, + "step": 1720 + }, + { + "epoch": 0.1155665917251099, + "grad_norm": 4.994908332824707, + "learning_rate": 9.979768348741507e-05, + "loss": 3.0816, + "step": 1722 + }, + { + "epoch": 0.1157008154088789, + "grad_norm": 5.046123027801514, + "learning_rate": 9.979670559620851e-05, + "loss": 3.0777, + "step": 1724 + }, + { + "epoch": 0.1158350390926479, + "grad_norm": 5.084259986877441, + "learning_rate": 9.979572535220555e-05, + "loss": 3.1045, + "step": 1726 + }, + { + "epoch": 0.11596926277641689, + "grad_norm": 5.742770195007324, + "learning_rate": 9.979474275545248e-05, + "loss": 3.1058, + "step": 1728 + }, + { + "epoch": 0.1161034864601859, + "grad_norm": 4.458430290222168, + "learning_rate": 9.979375780599573e-05, + "loss": 3.0327, + "step": 1730 + }, + { + "epoch": 0.1162377101439549, + "grad_norm": 5.71297025680542, + "learning_rate": 9.979277050388183e-05, + "loss": 2.8942, + "step": 1732 + }, + { + "epoch": 0.1163719338277239, + "grad_norm": 4.460875511169434, + "learning_rate": 9.979178084915745e-05, + "loss": 3.0289, + "step": 1734 + }, + { + "epoch": 0.1165061575114929, + "grad_norm": 6.321335315704346, + "learning_rate": 9.979078884186933e-05, + "loss": 3.0473, + "step": 1736 + }, + { + "epoch": 0.1166403811952619, + "grad_norm": 4.691104888916016, + "learning_rate": 9.978979448206434e-05, + "loss": 3.2932, + "step": 1738 + }, + { + "epoch": 0.11677460487903091, + "grad_norm": 8.167805671691895, + "learning_rate": 9.978879776978949e-05, + "loss": 3.5283, + "step": 1740 + }, + { + "epoch": 0.11690882856279991, + "grad_norm": 7.626429080963135, + "learning_rate": 9.978779870509182e-05, + "loss": 3.1929, + "step": 1742 + }, + { + "epoch": 0.11704305224656891, + "grad_norm": 7.401897430419922, + "learning_rate": 9.978679728801859e-05, + "loss": 3.326, + "step": 1744 + }, + { + "epoch": 0.1171772759303379, + "grad_norm": 5.91472864151001, + "learning_rate": 9.978579351861707e-05, + "loss": 3.2485, + "step": 1746 + }, + { + "epoch": 0.1173114996141069, + "grad_norm": 5.109776020050049, + "learning_rate": 9.978478739693473e-05, + "loss": 3.0723, + "step": 1748 + }, + { + "epoch": 0.1174457232978759, + "grad_norm": 7.09427547454834, + "learning_rate": 9.978377892301906e-05, + "loss": 2.8806, + "step": 1750 + }, + { + "epoch": 0.11757994698164491, + "grad_norm": 11.201136589050293, + "learning_rate": 9.978276809691776e-05, + "loss": 3.2434, + "step": 1752 + }, + { + "epoch": 0.11771417066541391, + "grad_norm": 9.921677589416504, + "learning_rate": 9.978175491867854e-05, + "loss": 3.2075, + "step": 1754 + }, + { + "epoch": 0.11784839434918291, + "grad_norm": 4.348266124725342, + "learning_rate": 9.978073938834929e-05, + "loss": 3.0315, + "step": 1756 + }, + { + "epoch": 0.11798261803295192, + "grad_norm": 5.276819229125977, + "learning_rate": 9.977972150597799e-05, + "loss": 3.276, + "step": 1758 + }, + { + "epoch": 0.11811684171672092, + "grad_norm": 4.815385818481445, + "learning_rate": 9.977870127161275e-05, + "loss": 3.0742, + "step": 1760 + }, + { + "epoch": 0.11825106540048992, + "grad_norm": 5.617897987365723, + "learning_rate": 9.977767868530176e-05, + "loss": 3.287, + "step": 1762 + }, + { + "epoch": 0.11838528908425892, + "grad_norm": 9.503907203674316, + "learning_rate": 9.977665374709333e-05, + "loss": 3.3061, + "step": 1764 + }, + { + "epoch": 0.11851951276802791, + "grad_norm": 5.873293399810791, + "learning_rate": 9.977562645703589e-05, + "loss": 3.076, + "step": 1766 + }, + { + "epoch": 0.11865373645179692, + "grad_norm": 6.136275291442871, + "learning_rate": 9.977459681517798e-05, + "loss": 3.3982, + "step": 1768 + }, + { + "epoch": 0.11878796013556592, + "grad_norm": 4.490494728088379, + "learning_rate": 9.977356482156825e-05, + "loss": 3.0276, + "step": 1770 + }, + { + "epoch": 0.11892218381933492, + "grad_norm": 4.948819160461426, + "learning_rate": 9.977253047625546e-05, + "loss": 3.0442, + "step": 1772 + }, + { + "epoch": 0.11905640750310392, + "grad_norm": 5.110283374786377, + "learning_rate": 9.977149377928847e-05, + "loss": 3.1532, + "step": 1774 + }, + { + "epoch": 0.11919063118687293, + "grad_norm": 5.60075044631958, + "learning_rate": 9.977045473071627e-05, + "loss": 2.9403, + "step": 1776 + }, + { + "epoch": 0.11932485487064193, + "grad_norm": 5.411577224731445, + "learning_rate": 9.976941333058796e-05, + "loss": 3.3183, + "step": 1778 + }, + { + "epoch": 0.11945907855441093, + "grad_norm": 5.953779697418213, + "learning_rate": 9.976836957895275e-05, + "loss": 3.2636, + "step": 1780 + }, + { + "epoch": 0.11959330223817993, + "grad_norm": 8.72913932800293, + "learning_rate": 9.976732347585993e-05, + "loss": 3.3967, + "step": 1782 + }, + { + "epoch": 0.11972752592194892, + "grad_norm": 5.251708030700684, + "learning_rate": 9.976627502135894e-05, + "loss": 3.467, + "step": 1784 + }, + { + "epoch": 0.11986174960571792, + "grad_norm": 4.776645183563232, + "learning_rate": 9.976522421549932e-05, + "loss": 2.6968, + "step": 1786 + }, + { + "epoch": 0.11999597328948693, + "grad_norm": 5.268375873565674, + "learning_rate": 9.97641710583307e-05, + "loss": 3.245, + "step": 1788 + }, + { + "epoch": 0.12013019697325593, + "grad_norm": 5.311893939971924, + "learning_rate": 9.976311554990287e-05, + "loss": 2.7908, + "step": 1790 + }, + { + "epoch": 0.12026442065702493, + "grad_norm": 5.1271281242370605, + "learning_rate": 9.976205769026568e-05, + "loss": 3.0556, + "step": 1792 + }, + { + "epoch": 0.12039864434079393, + "grad_norm": 4.692525386810303, + "learning_rate": 9.976099747946912e-05, + "loss": 2.8955, + "step": 1794 + }, + { + "epoch": 0.12053286802456294, + "grad_norm": 4.815884113311768, + "learning_rate": 9.975993491756328e-05, + "loss": 3.3716, + "step": 1796 + }, + { + "epoch": 0.12066709170833194, + "grad_norm": 6.628861427307129, + "learning_rate": 9.975887000459835e-05, + "loss": 3.4252, + "step": 1798 + }, + { + "epoch": 0.12080131539210094, + "grad_norm": 6.095422267913818, + "learning_rate": 9.975780274062468e-05, + "loss": 3.0695, + "step": 1800 + }, + { + "epoch": 0.12093553907586994, + "grad_norm": 4.718890190124512, + "learning_rate": 9.975673312569267e-05, + "loss": 2.9096, + "step": 1802 + }, + { + "epoch": 0.12106976275963893, + "grad_norm": 5.122056484222412, + "learning_rate": 9.975566115985284e-05, + "loss": 3.1438, + "step": 1804 + }, + { + "epoch": 0.12120398644340794, + "grad_norm": 4.7924346923828125, + "learning_rate": 9.975458684315588e-05, + "loss": 3.0023, + "step": 1806 + }, + { + "epoch": 0.12133821012717694, + "grad_norm": 4.440341472625732, + "learning_rate": 9.975351017565253e-05, + "loss": 2.893, + "step": 1808 + }, + { + "epoch": 0.12147243381094594, + "grad_norm": 4.992955684661865, + "learning_rate": 9.975243115739366e-05, + "loss": 3.0508, + "step": 1810 + }, + { + "epoch": 0.12160665749471494, + "grad_norm": 5.225953102111816, + "learning_rate": 9.975134978843026e-05, + "loss": 3.2141, + "step": 1812 + }, + { + "epoch": 0.12174088117848395, + "grad_norm": 4.577791213989258, + "learning_rate": 9.97502660688134e-05, + "loss": 3.0212, + "step": 1814 + }, + { + "epoch": 0.12187510486225295, + "grad_norm": 6.57459831237793, + "learning_rate": 9.97491799985943e-05, + "loss": 3.3442, + "step": 1816 + }, + { + "epoch": 0.12200932854602195, + "grad_norm": 5.210793972015381, + "learning_rate": 9.974809157782427e-05, + "loss": 3.1931, + "step": 1818 + }, + { + "epoch": 0.12214355222979095, + "grad_norm": 5.453722953796387, + "learning_rate": 9.974700080655475e-05, + "loss": 2.9984, + "step": 1820 + }, + { + "epoch": 0.12227777591355994, + "grad_norm": 5.96092414855957, + "learning_rate": 9.974590768483725e-05, + "loss": 3.063, + "step": 1822 + }, + { + "epoch": 0.12241199959732894, + "grad_norm": 4.682698726654053, + "learning_rate": 9.974481221272345e-05, + "loss": 3.1477, + "step": 1824 + }, + { + "epoch": 0.12254622328109795, + "grad_norm": 4.147486209869385, + "learning_rate": 9.974371439026508e-05, + "loss": 3.0456, + "step": 1826 + }, + { + "epoch": 0.12268044696486695, + "grad_norm": 5.037916660308838, + "learning_rate": 9.974261421751403e-05, + "loss": 3.042, + "step": 1828 + }, + { + "epoch": 0.12281467064863595, + "grad_norm": 4.71373987197876, + "learning_rate": 9.974151169452226e-05, + "loss": 3.2291, + "step": 1830 + }, + { + "epoch": 0.12294889433240495, + "grad_norm": 4.648612022399902, + "learning_rate": 9.974040682134189e-05, + "loss": 3.2147, + "step": 1832 + }, + { + "epoch": 0.12308311801617396, + "grad_norm": 5.1071882247924805, + "learning_rate": 9.97392995980251e-05, + "loss": 3.188, + "step": 1834 + }, + { + "epoch": 0.12321734169994296, + "grad_norm": 4.938551902770996, + "learning_rate": 9.973819002462421e-05, + "loss": 3.276, + "step": 1836 + }, + { + "epoch": 0.12335156538371196, + "grad_norm": 5.290544509887695, + "learning_rate": 9.973707810119165e-05, + "loss": 3.0569, + "step": 1838 + }, + { + "epoch": 0.12348578906748096, + "grad_norm": 5.244781017303467, + "learning_rate": 9.973596382777995e-05, + "loss": 3.2609, + "step": 1840 + }, + { + "epoch": 0.12362001275124995, + "grad_norm": 5.339991569519043, + "learning_rate": 9.973484720444178e-05, + "loss": 3.1848, + "step": 1842 + }, + { + "epoch": 0.12375423643501895, + "grad_norm": 4.960624694824219, + "learning_rate": 9.973372823122985e-05, + "loss": 3.2744, + "step": 1844 + }, + { + "epoch": 0.12388846011878796, + "grad_norm": 5.458547115325928, + "learning_rate": 9.973260690819708e-05, + "loss": 3.0862, + "step": 1846 + }, + { + "epoch": 0.12402268380255696, + "grad_norm": 5.38363790512085, + "learning_rate": 9.973148323539641e-05, + "loss": 3.1478, + "step": 1848 + }, + { + "epoch": 0.12415690748632596, + "grad_norm": 5.067777156829834, + "learning_rate": 9.973035721288096e-05, + "loss": 3.1349, + "step": 1850 + }, + { + "epoch": 0.12429113117009497, + "grad_norm": 4.486639976501465, + "learning_rate": 9.972922884070392e-05, + "loss": 3.0198, + "step": 1852 + }, + { + "epoch": 0.12442535485386397, + "grad_norm": 4.554307460784912, + "learning_rate": 9.972809811891861e-05, + "loss": 3.1377, + "step": 1854 + }, + { + "epoch": 0.12455957853763297, + "grad_norm": 7.610952854156494, + "learning_rate": 9.972696504757846e-05, + "loss": 3.2731, + "step": 1856 + }, + { + "epoch": 0.12469380222140197, + "grad_norm": 12.380949974060059, + "learning_rate": 9.972582962673698e-05, + "loss": 3.0574, + "step": 1858 + }, + { + "epoch": 0.12482802590517096, + "grad_norm": 5.415958881378174, + "learning_rate": 9.972469185644783e-05, + "loss": 3.0909, + "step": 1860 + }, + { + "epoch": 0.12496224958893996, + "grad_norm": 7.094273567199707, + "learning_rate": 9.972355173676478e-05, + "loss": 3.2325, + "step": 1862 + }, + { + "epoch": 0.12509647327270898, + "grad_norm": 5.19990873336792, + "learning_rate": 9.972240926774168e-05, + "loss": 3.0171, + "step": 1864 + }, + { + "epoch": 0.12523069695647798, + "grad_norm": 4.704460144042969, + "learning_rate": 9.972126444943252e-05, + "loss": 3.0189, + "step": 1866 + }, + { + "epoch": 0.12536492064024699, + "grad_norm": 4.520235538482666, + "learning_rate": 9.97201172818914e-05, + "loss": 3.0877, + "step": 1868 + }, + { + "epoch": 0.12549914432401596, + "grad_norm": 5.759894847869873, + "learning_rate": 9.97189677651725e-05, + "loss": 3.1987, + "step": 1870 + }, + { + "epoch": 0.12563336800778496, + "grad_norm": 5.287220001220703, + "learning_rate": 9.971781589933012e-05, + "loss": 2.9765, + "step": 1872 + }, + { + "epoch": 0.12576759169155396, + "grad_norm": 4.922494411468506, + "learning_rate": 9.971666168441872e-05, + "loss": 3.2145, + "step": 1874 + }, + { + "epoch": 0.12590181537532297, + "grad_norm": 4.91917085647583, + "learning_rate": 9.971550512049281e-05, + "loss": 3.0725, + "step": 1876 + }, + { + "epoch": 0.12603603905909197, + "grad_norm": 5.373570919036865, + "learning_rate": 9.971434620760707e-05, + "loss": 2.9508, + "step": 1878 + }, + { + "epoch": 0.12617026274286097, + "grad_norm": 5.828456401824951, + "learning_rate": 9.97131849458162e-05, + "loss": 3.0998, + "step": 1880 + }, + { + "epoch": 0.12630448642662997, + "grad_norm": 4.458247661590576, + "learning_rate": 9.971202133517512e-05, + "loss": 2.8528, + "step": 1882 + }, + { + "epoch": 0.12643871011039898, + "grad_norm": 4.787200450897217, + "learning_rate": 9.971085537573879e-05, + "loss": 3.3618, + "step": 1884 + }, + { + "epoch": 0.12657293379416798, + "grad_norm": 4.6539483070373535, + "learning_rate": 9.970968706756227e-05, + "loss": 2.9551, + "step": 1886 + }, + { + "epoch": 0.12670715747793698, + "grad_norm": 4.9112749099731445, + "learning_rate": 9.970851641070081e-05, + "loss": 3.2089, + "step": 1888 + }, + { + "epoch": 0.12684138116170598, + "grad_norm": 5.147674560546875, + "learning_rate": 9.970734340520969e-05, + "loss": 2.8562, + "step": 1890 + }, + { + "epoch": 0.126975604845475, + "grad_norm": 5.440029144287109, + "learning_rate": 9.970616805114434e-05, + "loss": 3.2027, + "step": 1892 + }, + { + "epoch": 0.127109828529244, + "grad_norm": 5.192463397979736, + "learning_rate": 9.970499034856029e-05, + "loss": 3.2872, + "step": 1894 + }, + { + "epoch": 0.127244052213013, + "grad_norm": 4.445255756378174, + "learning_rate": 9.970381029751319e-05, + "loss": 3.1898, + "step": 1896 + }, + { + "epoch": 0.127378275896782, + "grad_norm": 5.454135417938232, + "learning_rate": 9.970262789805878e-05, + "loss": 3.144, + "step": 1898 + }, + { + "epoch": 0.127512499580551, + "grad_norm": 4.853320121765137, + "learning_rate": 9.970144315025296e-05, + "loss": 3.1673, + "step": 1900 + }, + { + "epoch": 0.12764672326432, + "grad_norm": 6.060484409332275, + "learning_rate": 9.970025605415166e-05, + "loss": 2.8958, + "step": 1902 + }, + { + "epoch": 0.127780946948089, + "grad_norm": 4.723323822021484, + "learning_rate": 9.9699066609811e-05, + "loss": 3.2904, + "step": 1904 + }, + { + "epoch": 0.127915170631858, + "grad_norm": 4.771556377410889, + "learning_rate": 9.969787481728718e-05, + "loss": 3.0759, + "step": 1906 + }, + { + "epoch": 0.12804939431562698, + "grad_norm": 4.131943225860596, + "learning_rate": 9.969668067663652e-05, + "loss": 2.9299, + "step": 1908 + }, + { + "epoch": 0.12818361799939598, + "grad_norm": 5.0996174812316895, + "learning_rate": 9.969548418791539e-05, + "loss": 3.0828, + "step": 1910 + }, + { + "epoch": 0.12831784168316498, + "grad_norm": 5.363957405090332, + "learning_rate": 9.969428535118036e-05, + "loss": 3.0313, + "step": 1912 + }, + { + "epoch": 0.128452065366934, + "grad_norm": 5.529791355133057, + "learning_rate": 9.969308416648807e-05, + "loss": 3.1248, + "step": 1914 + }, + { + "epoch": 0.128586289050703, + "grad_norm": 4.932836532592773, + "learning_rate": 9.969188063389528e-05, + "loss": 3.3056, + "step": 1916 + }, + { + "epoch": 0.128720512734472, + "grad_norm": 4.576365947723389, + "learning_rate": 9.969067475345884e-05, + "loss": 3.0261, + "step": 1918 + }, + { + "epoch": 0.128854736418241, + "grad_norm": 4.493544578552246, + "learning_rate": 9.968946652523572e-05, + "loss": 3.1441, + "step": 1920 + }, + { + "epoch": 0.12898896010201, + "grad_norm": 6.04786491394043, + "learning_rate": 9.968825594928302e-05, + "loss": 3.4698, + "step": 1922 + }, + { + "epoch": 0.129123183785779, + "grad_norm": 5.773447513580322, + "learning_rate": 9.968704302565794e-05, + "loss": 3.1163, + "step": 1924 + }, + { + "epoch": 0.129257407469548, + "grad_norm": 4.874556064605713, + "learning_rate": 9.968582775441778e-05, + "loss": 3.1339, + "step": 1926 + }, + { + "epoch": 0.129391631153317, + "grad_norm": 5.087906837463379, + "learning_rate": 9.968461013561995e-05, + "loss": 3.0395, + "step": 1928 + }, + { + "epoch": 0.129525854837086, + "grad_norm": 5.196646213531494, + "learning_rate": 9.968339016932202e-05, + "loss": 2.9947, + "step": 1930 + }, + { + "epoch": 0.129660078520855, + "grad_norm": 5.235856533050537, + "learning_rate": 9.968216785558158e-05, + "loss": 3.159, + "step": 1932 + }, + { + "epoch": 0.129794302204624, + "grad_norm": 4.910450458526611, + "learning_rate": 9.968094319445642e-05, + "loss": 3.0338, + "step": 1934 + }, + { + "epoch": 0.12992852588839301, + "grad_norm": 7.49721622467041, + "learning_rate": 9.967971618600437e-05, + "loss": 2.9504, + "step": 1936 + }, + { + "epoch": 0.13006274957216202, + "grad_norm": 5.889185428619385, + "learning_rate": 9.967848683028343e-05, + "loss": 3.3514, + "step": 1938 + }, + { + "epoch": 0.13019697325593102, + "grad_norm": 4.6525983810424805, + "learning_rate": 9.967725512735169e-05, + "loss": 2.8184, + "step": 1940 + }, + { + "epoch": 0.13033119693970002, + "grad_norm": 4.371654033660889, + "learning_rate": 9.96760210772673e-05, + "loss": 3.138, + "step": 1942 + }, + { + "epoch": 0.130465420623469, + "grad_norm": 5.1929473876953125, + "learning_rate": 9.967478468008861e-05, + "loss": 3.3237, + "step": 1944 + }, + { + "epoch": 0.130599644307238, + "grad_norm": 5.243816375732422, + "learning_rate": 9.967354593587403e-05, + "loss": 3.279, + "step": 1946 + }, + { + "epoch": 0.130733867991007, + "grad_norm": 4.860206604003906, + "learning_rate": 9.96723048446821e-05, + "loss": 2.9855, + "step": 1948 + }, + { + "epoch": 0.130868091674776, + "grad_norm": 5.074206352233887, + "learning_rate": 9.967106140657143e-05, + "loss": 3.317, + "step": 1950 + }, + { + "epoch": 0.131002315358545, + "grad_norm": 5.155564785003662, + "learning_rate": 9.966981562160077e-05, + "loss": 3.2585, + "step": 1952 + }, + { + "epoch": 0.131136539042314, + "grad_norm": 5.171399116516113, + "learning_rate": 9.9668567489829e-05, + "loss": 3.2023, + "step": 1954 + }, + { + "epoch": 0.131270762726083, + "grad_norm": 4.879687786102295, + "learning_rate": 9.966731701131509e-05, + "loss": 3.0168, + "step": 1956 + }, + { + "epoch": 0.13140498640985201, + "grad_norm": 5.2717132568359375, + "learning_rate": 9.966606418611811e-05, + "loss": 2.8676, + "step": 1958 + }, + { + "epoch": 0.13153921009362102, + "grad_norm": 5.726935386657715, + "learning_rate": 9.966480901429727e-05, + "loss": 3.1697, + "step": 1960 + }, + { + "epoch": 0.13167343377739002, + "grad_norm": 4.999803066253662, + "learning_rate": 9.966355149591187e-05, + "loss": 3.4354, + "step": 1962 + }, + { + "epoch": 0.13180765746115902, + "grad_norm": 5.106264114379883, + "learning_rate": 9.96622916310213e-05, + "loss": 2.9982, + "step": 1964 + }, + { + "epoch": 0.13194188114492802, + "grad_norm": 5.01404333114624, + "learning_rate": 9.966102941968512e-05, + "loss": 2.9673, + "step": 1966 + }, + { + "epoch": 0.13207610482869703, + "grad_norm": 4.781093120574951, + "learning_rate": 9.965976486196295e-05, + "loss": 2.8827, + "step": 1968 + }, + { + "epoch": 0.13221032851246603, + "grad_norm": 4.669822692871094, + "learning_rate": 9.965849795791455e-05, + "loss": 2.9975, + "step": 1970 + }, + { + "epoch": 0.13234455219623503, + "grad_norm": 4.6353912353515625, + "learning_rate": 9.965722870759977e-05, + "loss": 3.1204, + "step": 1972 + }, + { + "epoch": 0.13247877588000403, + "grad_norm": 5.496473789215088, + "learning_rate": 9.965595711107858e-05, + "loss": 3.3388, + "step": 1974 + }, + { + "epoch": 0.13261299956377304, + "grad_norm": 5.193242073059082, + "learning_rate": 9.965468316841106e-05, + "loss": 3.226, + "step": 1976 + }, + { + "epoch": 0.13274722324754204, + "grad_norm": 4.238823890686035, + "learning_rate": 9.96534068796574e-05, + "loss": 3.0708, + "step": 1978 + }, + { + "epoch": 0.13288144693131104, + "grad_norm": 7.229121685028076, + "learning_rate": 9.965212824487791e-05, + "loss": 3.0822, + "step": 1980 + }, + { + "epoch": 0.13301567061508002, + "grad_norm": 4.837655067443848, + "learning_rate": 9.965084726413298e-05, + "loss": 2.9978, + "step": 1982 + }, + { + "epoch": 0.13314989429884902, + "grad_norm": 4.577108383178711, + "learning_rate": 9.964956393748317e-05, + "loss": 2.8916, + "step": 1984 + }, + { + "epoch": 0.13328411798261802, + "grad_norm": 6.443302154541016, + "learning_rate": 9.964827826498909e-05, + "loss": 3.1264, + "step": 1986 + }, + { + "epoch": 0.13341834166638702, + "grad_norm": 7.785131454467773, + "learning_rate": 9.964699024671148e-05, + "loss": 3.262, + "step": 1988 + }, + { + "epoch": 0.13355256535015603, + "grad_norm": 4.875034809112549, + "learning_rate": 9.964569988271122e-05, + "loss": 2.938, + "step": 1990 + }, + { + "epoch": 0.13368678903392503, + "grad_norm": 4.388699054718018, + "learning_rate": 9.964440717304926e-05, + "loss": 3.0336, + "step": 1992 + }, + { + "epoch": 0.13382101271769403, + "grad_norm": 4.761597633361816, + "learning_rate": 9.964311211778667e-05, + "loss": 3.0545, + "step": 1994 + }, + { + "epoch": 0.13395523640146303, + "grad_norm": 5.361051559448242, + "learning_rate": 9.964181471698469e-05, + "loss": 3.1133, + "step": 1996 + }, + { + "epoch": 0.13408946008523204, + "grad_norm": 4.326852321624756, + "learning_rate": 9.964051497070455e-05, + "loss": 2.8921, + "step": 1998 + }, + { + "epoch": 0.13422368376900104, + "grad_norm": 5.271337985992432, + "learning_rate": 9.963921287900769e-05, + "loss": 3.2232, + "step": 2000 + }, + { + "epoch": 0.13435790745277004, + "grad_norm": 4.657649040222168, + "learning_rate": 9.963790844195563e-05, + "loss": 2.8665, + "step": 2002 + }, + { + "epoch": 0.13449213113653904, + "grad_norm": 4.908298492431641, + "learning_rate": 9.963660165961002e-05, + "loss": 3.0068, + "step": 2004 + }, + { + "epoch": 0.13462635482030805, + "grad_norm": 5.049971103668213, + "learning_rate": 9.963529253203259e-05, + "loss": 3.1529, + "step": 2006 + }, + { + "epoch": 0.13476057850407705, + "grad_norm": 4.741759777069092, + "learning_rate": 9.963398105928519e-05, + "loss": 3.1009, + "step": 2008 + }, + { + "epoch": 0.13489480218784605, + "grad_norm": 5.248902797698975, + "learning_rate": 9.963266724142976e-05, + "loss": 3.159, + "step": 2010 + }, + { + "epoch": 0.13502902587161505, + "grad_norm": 5.0422210693359375, + "learning_rate": 9.963135107852844e-05, + "loss": 2.8982, + "step": 2012 + }, + { + "epoch": 0.13516324955538406, + "grad_norm": 4.27108907699585, + "learning_rate": 9.963003257064336e-05, + "loss": 3.0873, + "step": 2014 + }, + { + "epoch": 0.13529747323915306, + "grad_norm": 6.209035873413086, + "learning_rate": 9.962871171783684e-05, + "loss": 3.179, + "step": 2016 + }, + { + "epoch": 0.13543169692292206, + "grad_norm": 5.149410724639893, + "learning_rate": 9.962738852017126e-05, + "loss": 3.1459, + "step": 2018 + }, + { + "epoch": 0.13556592060669104, + "grad_norm": 5.5774335861206055, + "learning_rate": 9.962606297770917e-05, + "loss": 3.2999, + "step": 2020 + }, + { + "epoch": 0.13570014429046004, + "grad_norm": 5.572885513305664, + "learning_rate": 9.962473509051319e-05, + "loss": 3.1215, + "step": 2022 + }, + { + "epoch": 0.13583436797422904, + "grad_norm": 6.514729976654053, + "learning_rate": 9.962340485864608e-05, + "loss": 3.1387, + "step": 2024 + }, + { + "epoch": 0.13596859165799804, + "grad_norm": 6.528088092803955, + "learning_rate": 9.962207228217066e-05, + "loss": 3.161, + "step": 2026 + }, + { + "epoch": 0.13610281534176705, + "grad_norm": 4.789805889129639, + "learning_rate": 9.962073736114989e-05, + "loss": 2.9032, + "step": 2028 + }, + { + "epoch": 0.13623703902553605, + "grad_norm": 4.954381465911865, + "learning_rate": 9.961940009564688e-05, + "loss": 3.0583, + "step": 2030 + }, + { + "epoch": 0.13637126270930505, + "grad_norm": 4.447454452514648, + "learning_rate": 9.961806048572477e-05, + "loss": 2.9241, + "step": 2032 + }, + { + "epoch": 0.13650548639307405, + "grad_norm": 7.249116897583008, + "learning_rate": 9.961671853144687e-05, + "loss": 2.9347, + "step": 2034 + }, + { + "epoch": 0.13663971007684306, + "grad_norm": 5.301792144775391, + "learning_rate": 9.96153742328766e-05, + "loss": 3.3187, + "step": 2036 + }, + { + "epoch": 0.13677393376061206, + "grad_norm": 5.316348075866699, + "learning_rate": 9.961402759007742e-05, + "loss": 3.091, + "step": 2038 + }, + { + "epoch": 0.13690815744438106, + "grad_norm": 5.183095932006836, + "learning_rate": 9.961267860311305e-05, + "loss": 3.032, + "step": 2040 + }, + { + "epoch": 0.13704238112815006, + "grad_norm": 5.164398670196533, + "learning_rate": 9.961132727204716e-05, + "loss": 3.2217, + "step": 2042 + }, + { + "epoch": 0.13717660481191907, + "grad_norm": 6.8557257652282715, + "learning_rate": 9.96099735969436e-05, + "loss": 3.4091, + "step": 2044 + }, + { + "epoch": 0.13731082849568807, + "grad_norm": 5.4604268074035645, + "learning_rate": 9.960861757786634e-05, + "loss": 3.0971, + "step": 2046 + }, + { + "epoch": 0.13744505217945707, + "grad_norm": 4.615115642547607, + "learning_rate": 9.960725921487947e-05, + "loss": 3.3456, + "step": 2048 + }, + { + "epoch": 0.13757927586322607, + "grad_norm": 7.403223037719727, + "learning_rate": 9.960589850804713e-05, + "loss": 3.2093, + "step": 2050 + }, + { + "epoch": 0.13771349954699508, + "grad_norm": 4.958242893218994, + "learning_rate": 9.960453545743365e-05, + "loss": 2.9987, + "step": 2052 + }, + { + "epoch": 0.13784772323076408, + "grad_norm": 4.5686354637146, + "learning_rate": 9.96031700631034e-05, + "loss": 3.0824, + "step": 2054 + }, + { + "epoch": 0.13798194691453308, + "grad_norm": 5.309609413146973, + "learning_rate": 9.96018023251209e-05, + "loss": 3.07, + "step": 2056 + }, + { + "epoch": 0.13811617059830206, + "grad_norm": 7.003293037414551, + "learning_rate": 9.96004322435508e-05, + "loss": 2.9864, + "step": 2058 + }, + { + "epoch": 0.13825039428207106, + "grad_norm": 4.931795120239258, + "learning_rate": 9.959905981845781e-05, + "loss": 3.0395, + "step": 2060 + }, + { + "epoch": 0.13838461796584006, + "grad_norm": 5.000448703765869, + "learning_rate": 9.959768504990675e-05, + "loss": 3.1655, + "step": 2062 + }, + { + "epoch": 0.13851884164960906, + "grad_norm": 4.859556198120117, + "learning_rate": 9.959630793796262e-05, + "loss": 3.1993, + "step": 2064 + }, + { + "epoch": 0.13865306533337807, + "grad_norm": 4.763759613037109, + "learning_rate": 9.959492848269047e-05, + "loss": 3.186, + "step": 2066 + }, + { + "epoch": 0.13878728901714707, + "grad_norm": 11.302384376525879, + "learning_rate": 9.959354668415546e-05, + "loss": 3.0651, + "step": 2068 + }, + { + "epoch": 0.13892151270091607, + "grad_norm": 4.843883514404297, + "learning_rate": 9.95921625424229e-05, + "loss": 3.047, + "step": 2070 + }, + { + "epoch": 0.13905573638468507, + "grad_norm": 5.9107441902160645, + "learning_rate": 9.959077605755818e-05, + "loss": 3.0265, + "step": 2072 + }, + { + "epoch": 0.13918996006845408, + "grad_norm": 6.591708183288574, + "learning_rate": 9.95893872296268e-05, + "loss": 2.8207, + "step": 2074 + }, + { + "epoch": 0.13932418375222308, + "grad_norm": 4.742521286010742, + "learning_rate": 9.958799605869438e-05, + "loss": 2.9444, + "step": 2076 + }, + { + "epoch": 0.13945840743599208, + "grad_norm": 6.578078746795654, + "learning_rate": 9.958660254482667e-05, + "loss": 2.9721, + "step": 2078 + }, + { + "epoch": 0.13959263111976108, + "grad_norm": 4.9574971199035645, + "learning_rate": 9.95852066880895e-05, + "loss": 3.3906, + "step": 2080 + }, + { + "epoch": 0.1397268548035301, + "grad_norm": 4.410763740539551, + "learning_rate": 9.95838084885488e-05, + "loss": 3.0561, + "step": 2082 + }, + { + "epoch": 0.1398610784872991, + "grad_norm": 7.5792741775512695, + "learning_rate": 9.958240794627067e-05, + "loss": 3.0398, + "step": 2084 + }, + { + "epoch": 0.1399953021710681, + "grad_norm": 7.25602912902832, + "learning_rate": 9.958100506132127e-05, + "loss": 3.4314, + "step": 2086 + }, + { + "epoch": 0.1401295258548371, + "grad_norm": 5.873611927032471, + "learning_rate": 9.957959983376686e-05, + "loss": 3.0289, + "step": 2088 + }, + { + "epoch": 0.1402637495386061, + "grad_norm": 4.511819839477539, + "learning_rate": 9.957819226367385e-05, + "loss": 3.0818, + "step": 2090 + }, + { + "epoch": 0.1403979732223751, + "grad_norm": 4.869277000427246, + "learning_rate": 9.957678235110877e-05, + "loss": 3.0101, + "step": 2092 + }, + { + "epoch": 0.1405321969061441, + "grad_norm": 4.9718852043151855, + "learning_rate": 9.957537009613819e-05, + "loss": 3.106, + "step": 2094 + }, + { + "epoch": 0.14066642058991308, + "grad_norm": 5.964816093444824, + "learning_rate": 9.957395549882887e-05, + "loss": 3.2948, + "step": 2096 + }, + { + "epoch": 0.14080064427368208, + "grad_norm": 5.46800422668457, + "learning_rate": 9.957253855924761e-05, + "loss": 3.0637, + "step": 2098 + }, + { + "epoch": 0.14093486795745108, + "grad_norm": 5.4803786277771, + "learning_rate": 9.957111927746143e-05, + "loss": 3.2227, + "step": 2100 + }, + { + "epoch": 0.14106909164122008, + "grad_norm": 4.213554859161377, + "learning_rate": 9.956969765353731e-05, + "loss": 2.9772, + "step": 2102 + }, + { + "epoch": 0.1412033153249891, + "grad_norm": 5.628570079803467, + "learning_rate": 9.956827368754246e-05, + "loss": 3.2328, + "step": 2104 + }, + { + "epoch": 0.1413375390087581, + "grad_norm": 4.814405918121338, + "learning_rate": 9.956684737954414e-05, + "loss": 3.0682, + "step": 2106 + }, + { + "epoch": 0.1414717626925271, + "grad_norm": 4.658204555511475, + "learning_rate": 9.956541872960976e-05, + "loss": 3.0486, + "step": 2108 + }, + { + "epoch": 0.1416059863762961, + "grad_norm": 4.766931533813477, + "learning_rate": 9.956398773780682e-05, + "loss": 3.1754, + "step": 2110 + }, + { + "epoch": 0.1417402100600651, + "grad_norm": 5.739643096923828, + "learning_rate": 9.95625544042029e-05, + "loss": 3.0355, + "step": 2112 + }, + { + "epoch": 0.1418744337438341, + "grad_norm": 5.019379138946533, + "learning_rate": 9.956111872886576e-05, + "loss": 2.9841, + "step": 2114 + }, + { + "epoch": 0.1420086574276031, + "grad_norm": 5.025475025177002, + "learning_rate": 9.95596807118632e-05, + "loss": 2.6247, + "step": 2116 + }, + { + "epoch": 0.1421428811113721, + "grad_norm": 4.756045341491699, + "learning_rate": 9.955824035326321e-05, + "loss": 2.9542, + "step": 2118 + }, + { + "epoch": 0.1422771047951411, + "grad_norm": 5.124791622161865, + "learning_rate": 9.95567976531338e-05, + "loss": 3.2348, + "step": 2120 + }, + { + "epoch": 0.1424113284789101, + "grad_norm": 4.631430625915527, + "learning_rate": 9.955535261154316e-05, + "loss": 3.2264, + "step": 2122 + }, + { + "epoch": 0.1425455521626791, + "grad_norm": 5.531032085418701, + "learning_rate": 9.955390522855954e-05, + "loss": 3.1407, + "step": 2124 + }, + { + "epoch": 0.14267977584644811, + "grad_norm": 6.330652236938477, + "learning_rate": 9.955245550425135e-05, + "loss": 3.1993, + "step": 2126 + }, + { + "epoch": 0.14281399953021712, + "grad_norm": 5.368353843688965, + "learning_rate": 9.955100343868709e-05, + "loss": 2.9807, + "step": 2128 + }, + { + "epoch": 0.14294822321398612, + "grad_norm": 11.636331558227539, + "learning_rate": 9.954954903193533e-05, + "loss": 3.0383, + "step": 2130 + }, + { + "epoch": 0.14308244689775512, + "grad_norm": 6.560414791107178, + "learning_rate": 9.954809228406483e-05, + "loss": 3.0078, + "step": 2132 + }, + { + "epoch": 0.1432166705815241, + "grad_norm": 8.455137252807617, + "learning_rate": 9.954663319514439e-05, + "loss": 3.3259, + "step": 2134 + }, + { + "epoch": 0.1433508942652931, + "grad_norm": 5.098543167114258, + "learning_rate": 9.954517176524298e-05, + "loss": 2.934, + "step": 2136 + }, + { + "epoch": 0.1434851179490621, + "grad_norm": 5.203482627868652, + "learning_rate": 9.954370799442961e-05, + "loss": 3.3467, + "step": 2138 + }, + { + "epoch": 0.1436193416328311, + "grad_norm": 8.075535774230957, + "learning_rate": 9.954224188277347e-05, + "loss": 2.7958, + "step": 2140 + }, + { + "epoch": 0.1437535653166001, + "grad_norm": 4.626559734344482, + "learning_rate": 9.954077343034383e-05, + "loss": 2.8518, + "step": 2142 + }, + { + "epoch": 0.1438877890003691, + "grad_norm": 5.467393398284912, + "learning_rate": 9.953930263721003e-05, + "loss": 3.2087, + "step": 2144 + }, + { + "epoch": 0.1440220126841381, + "grad_norm": 4.648910999298096, + "learning_rate": 9.953782950344164e-05, + "loss": 3.0577, + "step": 2146 + }, + { + "epoch": 0.14415623636790711, + "grad_norm": 4.951671123504639, + "learning_rate": 9.95363540291082e-05, + "loss": 3.2448, + "step": 2148 + }, + { + "epoch": 0.14429046005167612, + "grad_norm": 4.981507301330566, + "learning_rate": 9.953487621427942e-05, + "loss": 3.0338, + "step": 2150 + }, + { + "epoch": 0.14442468373544512, + "grad_norm": 6.454351425170898, + "learning_rate": 9.953339605902517e-05, + "loss": 3.1979, + "step": 2152 + }, + { + "epoch": 0.14455890741921412, + "grad_norm": 5.815209865570068, + "learning_rate": 9.953191356341535e-05, + "loss": 3.2559, + "step": 2154 + }, + { + "epoch": 0.14469313110298312, + "grad_norm": 4.497643947601318, + "learning_rate": 9.953042872752003e-05, + "loss": 3.0176, + "step": 2156 + }, + { + "epoch": 0.14482735478675213, + "grad_norm": 5.619792461395264, + "learning_rate": 9.952894155140932e-05, + "loss": 2.9797, + "step": 2158 + }, + { + "epoch": 0.14496157847052113, + "grad_norm": 4.946806907653809, + "learning_rate": 9.952745203515354e-05, + "loss": 3.1102, + "step": 2160 + }, + { + "epoch": 0.14509580215429013, + "grad_norm": 5.714873790740967, + "learning_rate": 9.952596017882304e-05, + "loss": 3.068, + "step": 2162 + }, + { + "epoch": 0.14523002583805913, + "grad_norm": 5.328080177307129, + "learning_rate": 9.95244659824883e-05, + "loss": 3.0938, + "step": 2164 + }, + { + "epoch": 0.14536424952182814, + "grad_norm": 4.8951239585876465, + "learning_rate": 9.952296944621994e-05, + "loss": 2.9826, + "step": 2166 + }, + { + "epoch": 0.14549847320559714, + "grad_norm": 4.647072792053223, + "learning_rate": 9.952147057008864e-05, + "loss": 2.8317, + "step": 2168 + }, + { + "epoch": 0.14563269688936614, + "grad_norm": 4.668970108032227, + "learning_rate": 9.951996935416525e-05, + "loss": 3.0014, + "step": 2170 + }, + { + "epoch": 0.14576692057313512, + "grad_norm": 9.910264015197754, + "learning_rate": 9.951846579852069e-05, + "loss": 2.9596, + "step": 2172 + }, + { + "epoch": 0.14590114425690412, + "grad_norm": 4.813793659210205, + "learning_rate": 9.951695990322598e-05, + "loss": 3.1976, + "step": 2174 + }, + { + "epoch": 0.14603536794067312, + "grad_norm": 5.294146537780762, + "learning_rate": 9.95154516683523e-05, + "loss": 2.9053, + "step": 2176 + }, + { + "epoch": 0.14616959162444212, + "grad_norm": 4.478919982910156, + "learning_rate": 9.951394109397088e-05, + "loss": 3.0204, + "step": 2178 + }, + { + "epoch": 0.14630381530821113, + "grad_norm": 4.557251930236816, + "learning_rate": 9.951242818015312e-05, + "loss": 3.2491, + "step": 2180 + }, + { + "epoch": 0.14643803899198013, + "grad_norm": 6.604088306427002, + "learning_rate": 9.951091292697048e-05, + "loss": 3.0078, + "step": 2182 + }, + { + "epoch": 0.14657226267574913, + "grad_norm": 4.533780574798584, + "learning_rate": 9.950939533449458e-05, + "loss": 3.145, + "step": 2184 + }, + { + "epoch": 0.14670648635951813, + "grad_norm": 4.2600531578063965, + "learning_rate": 9.950787540279711e-05, + "loss": 2.9899, + "step": 2186 + }, + { + "epoch": 0.14684071004328714, + "grad_norm": 4.438211441040039, + "learning_rate": 9.950635313194986e-05, + "loss": 3.1028, + "step": 2188 + }, + { + "epoch": 0.14697493372705614, + "grad_norm": 3.9078826904296875, + "learning_rate": 9.950482852202478e-05, + "loss": 2.7989, + "step": 2190 + }, + { + "epoch": 0.14710915741082514, + "grad_norm": 5.061270713806152, + "learning_rate": 9.95033015730939e-05, + "loss": 3.3481, + "step": 2192 + }, + { + "epoch": 0.14724338109459414, + "grad_norm": 4.728822231292725, + "learning_rate": 9.950177228522937e-05, + "loss": 3.1094, + "step": 2194 + }, + { + "epoch": 0.14737760477836315, + "grad_norm": 5.271886348724365, + "learning_rate": 9.950024065850343e-05, + "loss": 2.8751, + "step": 2196 + }, + { + "epoch": 0.14751182846213215, + "grad_norm": 7.036325931549072, + "learning_rate": 9.949870669298846e-05, + "loss": 3.0656, + "step": 2198 + }, + { + "epoch": 0.14764605214590115, + "grad_norm": 4.801326274871826, + "learning_rate": 9.949717038875695e-05, + "loss": 2.9964, + "step": 2200 + }, + { + "epoch": 0.14778027582967015, + "grad_norm": 4.732778549194336, + "learning_rate": 9.949563174588146e-05, + "loss": 2.8615, + "step": 2202 + }, + { + "epoch": 0.14791449951343916, + "grad_norm": 4.801909446716309, + "learning_rate": 9.949409076443468e-05, + "loss": 3.1174, + "step": 2204 + }, + { + "epoch": 0.14804872319720816, + "grad_norm": 4.568469047546387, + "learning_rate": 9.949254744448946e-05, + "loss": 3.3157, + "step": 2206 + }, + { + "epoch": 0.14818294688097716, + "grad_norm": 6.887981414794922, + "learning_rate": 9.94910017861187e-05, + "loss": 3.0442, + "step": 2208 + }, + { + "epoch": 0.14831717056474614, + "grad_norm": 4.546311855316162, + "learning_rate": 9.948945378939542e-05, + "loss": 2.8858, + "step": 2210 + }, + { + "epoch": 0.14845139424851514, + "grad_norm": 4.9952521324157715, + "learning_rate": 9.948790345439276e-05, + "loss": 3.3749, + "step": 2212 + }, + { + "epoch": 0.14858561793228414, + "grad_norm": 5.643448829650879, + "learning_rate": 9.948635078118398e-05, + "loss": 3.2754, + "step": 2214 + }, + { + "epoch": 0.14871984161605314, + "grad_norm": 4.379649639129639, + "learning_rate": 9.948479576984242e-05, + "loss": 3.0028, + "step": 2216 + }, + { + "epoch": 0.14885406529982215, + "grad_norm": 4.558875560760498, + "learning_rate": 9.948323842044159e-05, + "loss": 3.066, + "step": 2218 + }, + { + "epoch": 0.14898828898359115, + "grad_norm": 5.105526447296143, + "learning_rate": 9.948167873305503e-05, + "loss": 2.8952, + "step": 2220 + }, + { + "epoch": 0.14912251266736015, + "grad_norm": 5.437850475311279, + "learning_rate": 9.948011670775647e-05, + "loss": 3.3548, + "step": 2222 + }, + { + "epoch": 0.14925673635112915, + "grad_norm": 5.870114326477051, + "learning_rate": 9.94785523446197e-05, + "loss": 3.1708, + "step": 2224 + }, + { + "epoch": 0.14939096003489816, + "grad_norm": 4.705777645111084, + "learning_rate": 9.947698564371859e-05, + "loss": 3.0829, + "step": 2226 + }, + { + "epoch": 0.14952518371866716, + "grad_norm": 4.50447416305542, + "learning_rate": 9.947541660512722e-05, + "loss": 2.9822, + "step": 2228 + }, + { + "epoch": 0.14965940740243616, + "grad_norm": 5.573049068450928, + "learning_rate": 9.947384522891972e-05, + "loss": 3.225, + "step": 2230 + }, + { + "epoch": 0.14979363108620516, + "grad_norm": 5.45938777923584, + "learning_rate": 9.947227151517032e-05, + "loss": 3.2157, + "step": 2232 + }, + { + "epoch": 0.14992785476997417, + "grad_norm": 4.7953972816467285, + "learning_rate": 9.947069546395336e-05, + "loss": 3.1028, + "step": 2234 + }, + { + "epoch": 0.15006207845374317, + "grad_norm": 4.933053493499756, + "learning_rate": 9.946911707534331e-05, + "loss": 3.131, + "step": 2236 + }, + { + "epoch": 0.15019630213751217, + "grad_norm": 9.141112327575684, + "learning_rate": 9.946753634941477e-05, + "loss": 3.2776, + "step": 2238 + }, + { + "epoch": 0.15033052582128117, + "grad_norm": 4.522568702697754, + "learning_rate": 9.94659532862424e-05, + "loss": 2.8591, + "step": 2240 + }, + { + "epoch": 0.15046474950505018, + "grad_norm": 4.454549312591553, + "learning_rate": 9.9464367885901e-05, + "loss": 2.9154, + "step": 2242 + }, + { + "epoch": 0.15059897318881918, + "grad_norm": 5.7813615798950195, + "learning_rate": 9.946278014846551e-05, + "loss": 3.2354, + "step": 2244 + }, + { + "epoch": 0.15073319687258815, + "grad_norm": 4.963947772979736, + "learning_rate": 9.946119007401091e-05, + "loss": 2.9292, + "step": 2246 + }, + { + "epoch": 0.15086742055635716, + "grad_norm": 4.720251560211182, + "learning_rate": 9.945959766261235e-05, + "loss": 2.9656, + "step": 2248 + }, + { + "epoch": 0.15100164424012616, + "grad_norm": 4.670527458190918, + "learning_rate": 9.945800291434504e-05, + "loss": 3.0976, + "step": 2250 + }, + { + "epoch": 0.15113586792389516, + "grad_norm": 8.313284873962402, + "learning_rate": 9.945640582928437e-05, + "loss": 2.6532, + "step": 2252 + }, + { + "epoch": 0.15127009160766416, + "grad_norm": 5.388286113739014, + "learning_rate": 9.945480640750577e-05, + "loss": 3.1246, + "step": 2254 + }, + { + "epoch": 0.15140431529143317, + "grad_norm": 4.8239006996154785, + "learning_rate": 9.945320464908481e-05, + "loss": 3.0721, + "step": 2256 + }, + { + "epoch": 0.15153853897520217, + "grad_norm": 5.749651908874512, + "learning_rate": 9.945160055409717e-05, + "loss": 3.2055, + "step": 2258 + }, + { + "epoch": 0.15167276265897117, + "grad_norm": 5.973934173583984, + "learning_rate": 9.944999412261866e-05, + "loss": 3.1061, + "step": 2260 + }, + { + "epoch": 0.15180698634274017, + "grad_norm": 6.607673645019531, + "learning_rate": 9.944838535472515e-05, + "loss": 3.1193, + "step": 2262 + }, + { + "epoch": 0.15194121002650918, + "grad_norm": 4.988908290863037, + "learning_rate": 9.944677425049268e-05, + "loss": 3.1267, + "step": 2264 + }, + { + "epoch": 0.15207543371027818, + "grad_norm": 5.186244010925293, + "learning_rate": 9.944516080999735e-05, + "loss": 3.1206, + "step": 2266 + }, + { + "epoch": 0.15220965739404718, + "grad_norm": 5.0459885597229, + "learning_rate": 9.944354503331541e-05, + "loss": 2.8403, + "step": 2268 + }, + { + "epoch": 0.15234388107781618, + "grad_norm": 5.559090614318848, + "learning_rate": 9.94419269205232e-05, + "loss": 2.9991, + "step": 2270 + }, + { + "epoch": 0.1524781047615852, + "grad_norm": 4.958273410797119, + "learning_rate": 9.944030647169715e-05, + "loss": 3.0723, + "step": 2272 + }, + { + "epoch": 0.1526123284453542, + "grad_norm": 4.9088454246521, + "learning_rate": 9.943868368691386e-05, + "loss": 2.7799, + "step": 2274 + }, + { + "epoch": 0.1527465521291232, + "grad_norm": 4.8308796882629395, + "learning_rate": 9.943705856624996e-05, + "loss": 2.9946, + "step": 2276 + }, + { + "epoch": 0.1528807758128922, + "grad_norm": 4.8122639656066895, + "learning_rate": 9.943543110978227e-05, + "loss": 3.1531, + "step": 2278 + }, + { + "epoch": 0.1530149994966612, + "grad_norm": 5.635534286499023, + "learning_rate": 9.943380131758768e-05, + "loss": 2.9922, + "step": 2280 + }, + { + "epoch": 0.1531492231804302, + "grad_norm": 5.36400842666626, + "learning_rate": 9.943216918974317e-05, + "loss": 3.2108, + "step": 2282 + }, + { + "epoch": 0.15328344686419917, + "grad_norm": 4.865900993347168, + "learning_rate": 9.943053472632587e-05, + "loss": 3.1425, + "step": 2284 + }, + { + "epoch": 0.15341767054796818, + "grad_norm": 5.648663520812988, + "learning_rate": 9.942889792741302e-05, + "loss": 2.9768, + "step": 2286 + }, + { + "epoch": 0.15355189423173718, + "grad_norm": 5.230109691619873, + "learning_rate": 9.942725879308192e-05, + "loss": 2.8769, + "step": 2288 + }, + { + "epoch": 0.15368611791550618, + "grad_norm": 5.364927768707275, + "learning_rate": 9.942561732341005e-05, + "loss": 3.176, + "step": 2290 + }, + { + "epoch": 0.15382034159927518, + "grad_norm": 6.132948875427246, + "learning_rate": 9.942397351847494e-05, + "loss": 2.7477, + "step": 2292 + }, + { + "epoch": 0.1539545652830442, + "grad_norm": 5.106632709503174, + "learning_rate": 9.942232737835428e-05, + "loss": 3.1448, + "step": 2294 + }, + { + "epoch": 0.1540887889668132, + "grad_norm": 4.970794677734375, + "learning_rate": 9.942067890312584e-05, + "loss": 3.2298, + "step": 2296 + }, + { + "epoch": 0.1542230126505822, + "grad_norm": 5.55429744720459, + "learning_rate": 9.94190280928675e-05, + "loss": 3.1498, + "step": 2298 + }, + { + "epoch": 0.1543572363343512, + "grad_norm": 5.4858527183532715, + "learning_rate": 9.941737494765725e-05, + "loss": 2.7382, + "step": 2300 + }, + { + "epoch": 0.1544914600181202, + "grad_norm": 6.06125020980835, + "learning_rate": 9.941571946757321e-05, + "loss": 3.0835, + "step": 2302 + }, + { + "epoch": 0.1546256837018892, + "grad_norm": 7.500847339630127, + "learning_rate": 9.941406165269362e-05, + "loss": 2.9686, + "step": 2304 + }, + { + "epoch": 0.1547599073856582, + "grad_norm": 6.030723571777344, + "learning_rate": 9.941240150309676e-05, + "loss": 3.3509, + "step": 2306 + }, + { + "epoch": 0.1548941310694272, + "grad_norm": 4.083286762237549, + "learning_rate": 9.94107390188611e-05, + "loss": 2.7602, + "step": 2308 + }, + { + "epoch": 0.1550283547531962, + "grad_norm": 4.838588714599609, + "learning_rate": 9.94090742000652e-05, + "loss": 3.0031, + "step": 2310 + }, + { + "epoch": 0.1551625784369652, + "grad_norm": 3.6877191066741943, + "learning_rate": 9.940740704678768e-05, + "loss": 3.0656, + "step": 2312 + }, + { + "epoch": 0.1552968021207342, + "grad_norm": 6.273148536682129, + "learning_rate": 9.940573755910735e-05, + "loss": 3.1004, + "step": 2314 + }, + { + "epoch": 0.15543102580450321, + "grad_norm": 5.302018165588379, + "learning_rate": 9.940406573710306e-05, + "loss": 3.1525, + "step": 2316 + }, + { + "epoch": 0.15556524948827222, + "grad_norm": 5.849557399749756, + "learning_rate": 9.940239158085382e-05, + "loss": 3.4426, + "step": 2318 + }, + { + "epoch": 0.15569947317204122, + "grad_norm": 4.6106414794921875, + "learning_rate": 9.940071509043872e-05, + "loss": 2.9759, + "step": 2320 + }, + { + "epoch": 0.1558336968558102, + "grad_norm": 5.277981758117676, + "learning_rate": 9.939903626593698e-05, + "loss": 3.2483, + "step": 2322 + }, + { + "epoch": 0.1559679205395792, + "grad_norm": 5.463323593139648, + "learning_rate": 9.939735510742792e-05, + "loss": 3.0998, + "step": 2324 + }, + { + "epoch": 0.1561021442233482, + "grad_norm": 4.913983345031738, + "learning_rate": 9.939567161499095e-05, + "loss": 3.2109, + "step": 2326 + }, + { + "epoch": 0.1562363679071172, + "grad_norm": 4.99582576751709, + "learning_rate": 9.939398578870563e-05, + "loss": 3.083, + "step": 2328 + }, + { + "epoch": 0.1563705915908862, + "grad_norm": 5.657858371734619, + "learning_rate": 9.939229762865164e-05, + "loss": 2.9409, + "step": 2330 + }, + { + "epoch": 0.1565048152746552, + "grad_norm": 5.819788932800293, + "learning_rate": 9.939060713490868e-05, + "loss": 2.6312, + "step": 2332 + }, + { + "epoch": 0.1566390389584242, + "grad_norm": 4.889979839324951, + "learning_rate": 9.938891430755666e-05, + "loss": 3.0574, + "step": 2334 + }, + { + "epoch": 0.1567732626421932, + "grad_norm": 4.752182960510254, + "learning_rate": 9.938721914667557e-05, + "loss": 2.9542, + "step": 2336 + }, + { + "epoch": 0.15690748632596221, + "grad_norm": 5.525217056274414, + "learning_rate": 9.938552165234548e-05, + "loss": 2.9656, + "step": 2338 + }, + { + "epoch": 0.15704171000973122, + "grad_norm": 5.320490837097168, + "learning_rate": 9.93838218246466e-05, + "loss": 2.9948, + "step": 2340 + }, + { + "epoch": 0.15717593369350022, + "grad_norm": 5.844310283660889, + "learning_rate": 9.938211966365926e-05, + "loss": 3.1865, + "step": 2342 + }, + { + "epoch": 0.15731015737726922, + "grad_norm": 5.152303218841553, + "learning_rate": 9.938041516946389e-05, + "loss": 2.8293, + "step": 2344 + }, + { + "epoch": 0.15744438106103822, + "grad_norm": 5.031563758850098, + "learning_rate": 9.937870834214097e-05, + "loss": 2.8561, + "step": 2346 + }, + { + "epoch": 0.15757860474480723, + "grad_norm": 4.260949611663818, + "learning_rate": 9.93769991817712e-05, + "loss": 2.8603, + "step": 2348 + }, + { + "epoch": 0.15771282842857623, + "grad_norm": 5.0424113273620605, + "learning_rate": 9.93752876884353e-05, + "loss": 2.9822, + "step": 2350 + }, + { + "epoch": 0.15784705211234523, + "grad_norm": 4.565359592437744, + "learning_rate": 9.937357386221416e-05, + "loss": 3.1702, + "step": 2352 + }, + { + "epoch": 0.15798127579611423, + "grad_norm": 5.830257892608643, + "learning_rate": 9.937185770318874e-05, + "loss": 3.024, + "step": 2354 + }, + { + "epoch": 0.15811549947988324, + "grad_norm": 5.120675086975098, + "learning_rate": 9.937013921144014e-05, + "loss": 2.971, + "step": 2356 + }, + { + "epoch": 0.15824972316365224, + "grad_norm": 5.247325897216797, + "learning_rate": 9.936841838704953e-05, + "loss": 3.0857, + "step": 2358 + }, + { + "epoch": 0.15838394684742121, + "grad_norm": 4.855215072631836, + "learning_rate": 9.936669523009823e-05, + "loss": 3.0612, + "step": 2360 + }, + { + "epoch": 0.15851817053119022, + "grad_norm": 5.022877216339111, + "learning_rate": 9.936496974066767e-05, + "loss": 2.9488, + "step": 2362 + }, + { + "epoch": 0.15865239421495922, + "grad_norm": 4.8562331199646, + "learning_rate": 9.936324191883935e-05, + "loss": 2.9434, + "step": 2364 + }, + { + "epoch": 0.15878661789872822, + "grad_norm": 4.913610935211182, + "learning_rate": 9.936151176469493e-05, + "loss": 3.1906, + "step": 2366 + }, + { + "epoch": 0.15892084158249722, + "grad_norm": 6.747122764587402, + "learning_rate": 9.935977927831612e-05, + "loss": 2.8481, + "step": 2368 + }, + { + "epoch": 0.15905506526626623, + "grad_norm": 4.916810035705566, + "learning_rate": 9.935804445978483e-05, + "loss": 3.518, + "step": 2370 + }, + { + "epoch": 0.15918928895003523, + "grad_norm": 4.627390384674072, + "learning_rate": 9.935630730918297e-05, + "loss": 2.7573, + "step": 2372 + }, + { + "epoch": 0.15932351263380423, + "grad_norm": 5.630057334899902, + "learning_rate": 9.935456782659267e-05, + "loss": 3.2307, + "step": 2374 + }, + { + "epoch": 0.15945773631757323, + "grad_norm": 6.760767459869385, + "learning_rate": 9.935282601209608e-05, + "loss": 3.2004, + "step": 2376 + }, + { + "epoch": 0.15959196000134224, + "grad_norm": 4.73686408996582, + "learning_rate": 9.93510818657755e-05, + "loss": 3.0383, + "step": 2378 + }, + { + "epoch": 0.15972618368511124, + "grad_norm": 5.315295696258545, + "learning_rate": 9.934933538771336e-05, + "loss": 2.9158, + "step": 2380 + }, + { + "epoch": 0.15986040736888024, + "grad_norm": 4.9172539710998535, + "learning_rate": 9.934758657799218e-05, + "loss": 3.3249, + "step": 2382 + }, + { + "epoch": 0.15999463105264924, + "grad_norm": 4.389949321746826, + "learning_rate": 9.934583543669453e-05, + "loss": 3.0406, + "step": 2384 + }, + { + "epoch": 0.16012885473641825, + "grad_norm": 4.958530902862549, + "learning_rate": 9.934408196390322e-05, + "loss": 2.9617, + "step": 2386 + }, + { + "epoch": 0.16026307842018725, + "grad_norm": 4.353750228881836, + "learning_rate": 9.934232615970107e-05, + "loss": 2.9918, + "step": 2388 + }, + { + "epoch": 0.16039730210395625, + "grad_norm": 6.040163516998291, + "learning_rate": 9.934056802417101e-05, + "loss": 3.1947, + "step": 2390 + }, + { + "epoch": 0.16053152578772525, + "grad_norm": 5.366269111633301, + "learning_rate": 9.933880755739616e-05, + "loss": 3.43, + "step": 2392 + }, + { + "epoch": 0.16066574947149426, + "grad_norm": 5.06095027923584, + "learning_rate": 9.933704475945966e-05, + "loss": 3.2431, + "step": 2394 + }, + { + "epoch": 0.16079997315526326, + "grad_norm": 6.0037994384765625, + "learning_rate": 9.933527963044483e-05, + "loss": 2.8886, + "step": 2396 + }, + { + "epoch": 0.16093419683903223, + "grad_norm": 5.0638322830200195, + "learning_rate": 9.933351217043504e-05, + "loss": 2.9755, + "step": 2398 + }, + { + "epoch": 0.16106842052280124, + "grad_norm": 4.628988265991211, + "learning_rate": 9.933174237951381e-05, + "loss": 2.9084, + "step": 2400 + }, + { + "epoch": 0.16120264420657024, + "grad_norm": 5.4450273513793945, + "learning_rate": 9.932997025776475e-05, + "loss": 3.1714, + "step": 2402 + }, + { + "epoch": 0.16133686789033924, + "grad_norm": 5.10666561126709, + "learning_rate": 9.932819580527162e-05, + "loss": 3.0647, + "step": 2404 + }, + { + "epoch": 0.16147109157410824, + "grad_norm": 3.9153714179992676, + "learning_rate": 9.932641902211821e-05, + "loss": 2.9163, + "step": 2406 + }, + { + "epoch": 0.16160531525787725, + "grad_norm": 4.312791347503662, + "learning_rate": 9.932463990838851e-05, + "loss": 2.8301, + "step": 2408 + }, + { + "epoch": 0.16173953894164625, + "grad_norm": 5.8141961097717285, + "learning_rate": 9.932285846416658e-05, + "loss": 3.0901, + "step": 2410 + }, + { + "epoch": 0.16187376262541525, + "grad_norm": 4.593001365661621, + "learning_rate": 9.932107468953656e-05, + "loss": 3.1745, + "step": 2412 + }, + { + "epoch": 0.16200798630918425, + "grad_norm": 5.0655927658081055, + "learning_rate": 9.931928858458276e-05, + "loss": 3.1401, + "step": 2414 + }, + { + "epoch": 0.16214220999295326, + "grad_norm": 4.262302875518799, + "learning_rate": 9.931750014938954e-05, + "loss": 2.8477, + "step": 2416 + }, + { + "epoch": 0.16227643367672226, + "grad_norm": 4.612324237823486, + "learning_rate": 9.931570938404144e-05, + "loss": 2.9769, + "step": 2418 + }, + { + "epoch": 0.16241065736049126, + "grad_norm": 4.949276924133301, + "learning_rate": 9.931391628862304e-05, + "loss": 2.8647, + "step": 2420 + }, + { + "epoch": 0.16254488104426026, + "grad_norm": 8.669781684875488, + "learning_rate": 9.931212086321905e-05, + "loss": 3.1894, + "step": 2422 + }, + { + "epoch": 0.16267910472802927, + "grad_norm": 4.579480171203613, + "learning_rate": 9.931032310791432e-05, + "loss": 3.057, + "step": 2424 + }, + { + "epoch": 0.16281332841179827, + "grad_norm": 4.533434867858887, + "learning_rate": 9.930852302279381e-05, + "loss": 3.0226, + "step": 2426 + }, + { + "epoch": 0.16294755209556727, + "grad_norm": 5.411552429199219, + "learning_rate": 9.930672060794253e-05, + "loss": 2.855, + "step": 2428 + }, + { + "epoch": 0.16308177577933627, + "grad_norm": 4.528864860534668, + "learning_rate": 9.930491586344565e-05, + "loss": 2.9357, + "step": 2430 + }, + { + "epoch": 0.16321599946310528, + "grad_norm": 4.312304496765137, + "learning_rate": 9.930310878938846e-05, + "loss": 2.9842, + "step": 2432 + }, + { + "epoch": 0.16335022314687428, + "grad_norm": 4.301158428192139, + "learning_rate": 9.930129938585633e-05, + "loss": 3.0489, + "step": 2434 + }, + { + "epoch": 0.16348444683064325, + "grad_norm": 4.562753677368164, + "learning_rate": 9.929948765293477e-05, + "loss": 2.6979, + "step": 2436 + }, + { + "epoch": 0.16361867051441226, + "grad_norm": 5.258896350860596, + "learning_rate": 9.929767359070934e-05, + "loss": 2.7191, + "step": 2438 + }, + { + "epoch": 0.16375289419818126, + "grad_norm": 5.779174327850342, + "learning_rate": 9.929585719926577e-05, + "loss": 2.9883, + "step": 2440 + }, + { + "epoch": 0.16388711788195026, + "grad_norm": 5.4019622802734375, + "learning_rate": 9.92940384786899e-05, + "loss": 2.9399, + "step": 2442 + }, + { + "epoch": 0.16402134156571926, + "grad_norm": 4.617805480957031, + "learning_rate": 9.929221742906763e-05, + "loss": 3.2752, + "step": 2444 + }, + { + "epoch": 0.16415556524948827, + "grad_norm": 4.539881229400635, + "learning_rate": 9.929039405048501e-05, + "loss": 2.9559, + "step": 2446 + }, + { + "epoch": 0.16428978893325727, + "grad_norm": 4.392913818359375, + "learning_rate": 9.928856834302823e-05, + "loss": 3.1516, + "step": 2448 + }, + { + "epoch": 0.16442401261702627, + "grad_norm": 4.3720622062683105, + "learning_rate": 9.928674030678348e-05, + "loss": 3.0162, + "step": 2450 + }, + { + "epoch": 0.16455823630079527, + "grad_norm": 4.535030841827393, + "learning_rate": 9.928490994183719e-05, + "loss": 3.1109, + "step": 2452 + }, + { + "epoch": 0.16469245998456428, + "grad_norm": 5.183037757873535, + "learning_rate": 9.928307724827581e-05, + "loss": 2.9675, + "step": 2454 + }, + { + "epoch": 0.16482668366833328, + "grad_norm": 4.375551700592041, + "learning_rate": 9.928124222618594e-05, + "loss": 2.7317, + "step": 2456 + }, + { + "epoch": 0.16496090735210228, + "grad_norm": 4.85524320602417, + "learning_rate": 9.92794048756543e-05, + "loss": 2.9456, + "step": 2458 + }, + { + "epoch": 0.16509513103587128, + "grad_norm": 4.994555473327637, + "learning_rate": 9.927756519676765e-05, + "loss": 2.9717, + "step": 2460 + }, + { + "epoch": 0.1652293547196403, + "grad_norm": 4.660478115081787, + "learning_rate": 9.927572318961299e-05, + "loss": 2.975, + "step": 2462 + }, + { + "epoch": 0.1653635784034093, + "grad_norm": 5.854766845703125, + "learning_rate": 9.927387885427726e-05, + "loss": 2.7832, + "step": 2464 + }, + { + "epoch": 0.1654978020871783, + "grad_norm": 4.598071575164795, + "learning_rate": 9.927203219084766e-05, + "loss": 3.0666, + "step": 2466 + }, + { + "epoch": 0.1656320257709473, + "grad_norm": 5.329703330993652, + "learning_rate": 9.927018319941145e-05, + "loss": 3.1059, + "step": 2468 + }, + { + "epoch": 0.1657662494547163, + "grad_norm": 4.5437798500061035, + "learning_rate": 9.926833188005595e-05, + "loss": 3.1284, + "step": 2470 + }, + { + "epoch": 0.1659004731384853, + "grad_norm": 4.357038974761963, + "learning_rate": 9.926647823286865e-05, + "loss": 2.8121, + "step": 2472 + }, + { + "epoch": 0.16603469682225427, + "grad_norm": 5.938759803771973, + "learning_rate": 9.926462225793714e-05, + "loss": 3.1282, + "step": 2474 + }, + { + "epoch": 0.16616892050602328, + "grad_norm": 4.481321811676025, + "learning_rate": 9.926276395534911e-05, + "loss": 2.8818, + "step": 2476 + }, + { + "epoch": 0.16630314418979228, + "grad_norm": 5.025930881500244, + "learning_rate": 9.926090332519234e-05, + "loss": 3.2001, + "step": 2478 + }, + { + "epoch": 0.16643736787356128, + "grad_norm": 4.8085761070251465, + "learning_rate": 9.925904036755475e-05, + "loss": 3.4289, + "step": 2480 + }, + { + "epoch": 0.16657159155733028, + "grad_norm": 4.481333255767822, + "learning_rate": 9.92571750825244e-05, + "loss": 2.9439, + "step": 2482 + }, + { + "epoch": 0.1667058152410993, + "grad_norm": 6.288950443267822, + "learning_rate": 9.925530747018935e-05, + "loss": 3.0349, + "step": 2484 + }, + { + "epoch": 0.1668400389248683, + "grad_norm": 4.04398775100708, + "learning_rate": 9.92534375306379e-05, + "loss": 2.8086, + "step": 2486 + }, + { + "epoch": 0.1669742626086373, + "grad_norm": 4.912535190582275, + "learning_rate": 9.925156526395836e-05, + "loss": 3.1634, + "step": 2488 + }, + { + "epoch": 0.1671084862924063, + "grad_norm": 5.039572715759277, + "learning_rate": 9.924969067023922e-05, + "loss": 3.0078, + "step": 2490 + }, + { + "epoch": 0.1672427099761753, + "grad_norm": 6.615666389465332, + "learning_rate": 9.924781374956905e-05, + "loss": 2.9979, + "step": 2492 + }, + { + "epoch": 0.1673769336599443, + "grad_norm": 4.174526214599609, + "learning_rate": 9.924593450203652e-05, + "loss": 2.8617, + "step": 2494 + }, + { + "epoch": 0.1675111573437133, + "grad_norm": 5.304354667663574, + "learning_rate": 9.924405292773041e-05, + "loss": 3.147, + "step": 2496 + }, + { + "epoch": 0.1676453810274823, + "grad_norm": 4.149040699005127, + "learning_rate": 9.924216902673964e-05, + "loss": 3.0662, + "step": 2498 + }, + { + "epoch": 0.1677796047112513, + "grad_norm": 4.638777732849121, + "learning_rate": 9.924028279915323e-05, + "loss": 2.945, + "step": 2500 + }, + { + "epoch": 0.1679138283950203, + "grad_norm": 5.038109302520752, + "learning_rate": 9.923839424506026e-05, + "loss": 2.9244, + "step": 2502 + }, + { + "epoch": 0.1680480520787893, + "grad_norm": 4.374161720275879, + "learning_rate": 9.923650336454999e-05, + "loss": 2.63, + "step": 2504 + }, + { + "epoch": 0.16818227576255831, + "grad_norm": 4.272861480712891, + "learning_rate": 9.923461015771177e-05, + "loss": 2.9861, + "step": 2506 + }, + { + "epoch": 0.16831649944632732, + "grad_norm": 4.438540935516357, + "learning_rate": 9.923271462463502e-05, + "loss": 2.8278, + "step": 2508 + }, + { + "epoch": 0.16845072313009632, + "grad_norm": 5.675797939300537, + "learning_rate": 9.923081676540933e-05, + "loss": 2.8895, + "step": 2510 + }, + { + "epoch": 0.1685849468138653, + "grad_norm": 4.798285484313965, + "learning_rate": 9.922891658012435e-05, + "loss": 2.9409, + "step": 2512 + }, + { + "epoch": 0.1687191704976343, + "grad_norm": 4.3139848709106445, + "learning_rate": 9.922701406886987e-05, + "loss": 2.9047, + "step": 2514 + }, + { + "epoch": 0.1688533941814033, + "grad_norm": 7.094779014587402, + "learning_rate": 9.922510923173576e-05, + "loss": 3.0341, + "step": 2516 + }, + { + "epoch": 0.1689876178651723, + "grad_norm": 4.535288333892822, + "learning_rate": 9.922320206881205e-05, + "loss": 3.093, + "step": 2518 + }, + { + "epoch": 0.1691218415489413, + "grad_norm": 4.081509590148926, + "learning_rate": 9.922129258018883e-05, + "loss": 3.0159, + "step": 2520 + }, + { + "epoch": 0.1692560652327103, + "grad_norm": 4.549274921417236, + "learning_rate": 9.921938076595632e-05, + "loss": 2.9773, + "step": 2522 + }, + { + "epoch": 0.1693902889164793, + "grad_norm": 5.63139009475708, + "learning_rate": 9.921746662620488e-05, + "loss": 3.1701, + "step": 2524 + }, + { + "epoch": 0.1695245126002483, + "grad_norm": 5.4546356201171875, + "learning_rate": 9.92155501610249e-05, + "loss": 3.1156, + "step": 2526 + }, + { + "epoch": 0.16965873628401731, + "grad_norm": 4.83322811126709, + "learning_rate": 9.921363137050696e-05, + "loss": 3.0001, + "step": 2528 + }, + { + "epoch": 0.16979295996778632, + "grad_norm": 4.705216407775879, + "learning_rate": 9.921171025474171e-05, + "loss": 2.8138, + "step": 2530 + }, + { + "epoch": 0.16992718365155532, + "grad_norm": 4.900602340698242, + "learning_rate": 9.920978681381991e-05, + "loss": 3.1554, + "step": 2532 + }, + { + "epoch": 0.17006140733532432, + "grad_norm": 9.108877182006836, + "learning_rate": 9.920786104783247e-05, + "loss": 3.1165, + "step": 2534 + }, + { + "epoch": 0.17019563101909332, + "grad_norm": 5.6285014152526855, + "learning_rate": 9.920593295687035e-05, + "loss": 3.0027, + "step": 2536 + }, + { + "epoch": 0.17032985470286233, + "grad_norm": 4.69816255569458, + "learning_rate": 9.920400254102466e-05, + "loss": 3.1349, + "step": 2538 + }, + { + "epoch": 0.17046407838663133, + "grad_norm": 7.903632164001465, + "learning_rate": 9.92020698003866e-05, + "loss": 2.8657, + "step": 2540 + }, + { + "epoch": 0.17059830207040033, + "grad_norm": 7.024686813354492, + "learning_rate": 9.92001347350475e-05, + "loss": 2.9817, + "step": 2542 + }, + { + "epoch": 0.17073252575416933, + "grad_norm": 5.035988807678223, + "learning_rate": 9.919819734509878e-05, + "loss": 2.8708, + "step": 2544 + }, + { + "epoch": 0.17086674943793834, + "grad_norm": 4.611110210418701, + "learning_rate": 9.919625763063197e-05, + "loss": 2.8601, + "step": 2546 + }, + { + "epoch": 0.1710009731217073, + "grad_norm": 4.338691234588623, + "learning_rate": 9.919431559173874e-05, + "loss": 3.1669, + "step": 2548 + }, + { + "epoch": 0.17113519680547631, + "grad_norm": 6.397730827331543, + "learning_rate": 9.919237122851084e-05, + "loss": 3.0887, + "step": 2550 + }, + { + "epoch": 0.17126942048924532, + "grad_norm": 4.744366645812988, + "learning_rate": 9.91904245410401e-05, + "loss": 2.9591, + "step": 2552 + }, + { + "epoch": 0.17140364417301432, + "grad_norm": 6.40416955947876, + "learning_rate": 9.918847552941856e-05, + "loss": 3.1684, + "step": 2554 + }, + { + "epoch": 0.17153786785678332, + "grad_norm": 5.252598285675049, + "learning_rate": 9.918652419373827e-05, + "loss": 3.1063, + "step": 2556 + }, + { + "epoch": 0.17167209154055232, + "grad_norm": 4.403224945068359, + "learning_rate": 9.918457053409142e-05, + "loss": 3.0782, + "step": 2558 + }, + { + "epoch": 0.17180631522432133, + "grad_norm": 4.804382801055908, + "learning_rate": 9.918261455057036e-05, + "loss": 2.7089, + "step": 2560 + }, + { + "epoch": 0.17194053890809033, + "grad_norm": 4.620754718780518, + "learning_rate": 9.918065624326745e-05, + "loss": 3.1844, + "step": 2562 + }, + { + "epoch": 0.17207476259185933, + "grad_norm": 3.6761586666107178, + "learning_rate": 9.917869561227524e-05, + "loss": 2.7122, + "step": 2564 + }, + { + "epoch": 0.17220898627562833, + "grad_norm": 4.333477020263672, + "learning_rate": 9.917673265768639e-05, + "loss": 2.7142, + "step": 2566 + }, + { + "epoch": 0.17234320995939734, + "grad_norm": 5.1761016845703125, + "learning_rate": 9.917476737959361e-05, + "loss": 3.0675, + "step": 2568 + }, + { + "epoch": 0.17247743364316634, + "grad_norm": 7.395604610443115, + "learning_rate": 9.917279977808976e-05, + "loss": 2.9373, + "step": 2570 + }, + { + "epoch": 0.17261165732693534, + "grad_norm": 4.505858898162842, + "learning_rate": 9.917082985326782e-05, + "loss": 3.0737, + "step": 2572 + }, + { + "epoch": 0.17274588101070434, + "grad_norm": 4.485294818878174, + "learning_rate": 9.916885760522087e-05, + "loss": 2.9131, + "step": 2574 + }, + { + "epoch": 0.17288010469447335, + "grad_norm": 4.653338432312012, + "learning_rate": 9.916688303404208e-05, + "loss": 3.0085, + "step": 2576 + }, + { + "epoch": 0.17301432837824235, + "grad_norm": 4.831528663635254, + "learning_rate": 9.916490613982474e-05, + "loss": 2.8484, + "step": 2578 + }, + { + "epoch": 0.17314855206201135, + "grad_norm": 4.890402793884277, + "learning_rate": 9.916292692266228e-05, + "loss": 3.1733, + "step": 2580 + }, + { + "epoch": 0.17328277574578035, + "grad_norm": 5.323947906494141, + "learning_rate": 9.91609453826482e-05, + "loss": 3.0336, + "step": 2582 + }, + { + "epoch": 0.17341699942954936, + "grad_norm": 4.720778465270996, + "learning_rate": 9.915896151987609e-05, + "loss": 2.7515, + "step": 2584 + }, + { + "epoch": 0.17355122311331833, + "grad_norm": 5.045989036560059, + "learning_rate": 9.915697533443976e-05, + "loss": 2.9349, + "step": 2586 + }, + { + "epoch": 0.17368544679708733, + "grad_norm": 5.105770587921143, + "learning_rate": 9.915498682643297e-05, + "loss": 3.0847, + "step": 2588 + }, + { + "epoch": 0.17381967048085634, + "grad_norm": 4.349035739898682, + "learning_rate": 9.915299599594973e-05, + "loss": 2.8527, + "step": 2590 + }, + { + "epoch": 0.17395389416462534, + "grad_norm": 4.372335433959961, + "learning_rate": 9.915100284308407e-05, + "loss": 3.0656, + "step": 2592 + }, + { + "epoch": 0.17408811784839434, + "grad_norm": 5.08442497253418, + "learning_rate": 9.91490073679302e-05, + "loss": 2.9787, + "step": 2594 + }, + { + "epoch": 0.17422234153216334, + "grad_norm": 4.246547698974609, + "learning_rate": 9.914700957058235e-05, + "loss": 2.8209, + "step": 2596 + }, + { + "epoch": 0.17435656521593235, + "grad_norm": 4.797031879425049, + "learning_rate": 9.914500945113496e-05, + "loss": 2.9574, + "step": 2598 + }, + { + "epoch": 0.17449078889970135, + "grad_norm": 4.417155742645264, + "learning_rate": 9.91430070096825e-05, + "loss": 2.8201, + "step": 2600 + }, + { + "epoch": 0.17462501258347035, + "grad_norm": 4.563340187072754, + "learning_rate": 9.914100224631962e-05, + "loss": 2.938, + "step": 2602 + }, + { + "epoch": 0.17475923626723935, + "grad_norm": 4.95414924621582, + "learning_rate": 9.9138995161141e-05, + "loss": 2.8519, + "step": 2604 + }, + { + "epoch": 0.17489345995100836, + "grad_norm": 5.963942050933838, + "learning_rate": 9.913698575424148e-05, + "loss": 2.6706, + "step": 2606 + }, + { + "epoch": 0.17502768363477736, + "grad_norm": 5.098476886749268, + "learning_rate": 9.913497402571602e-05, + "loss": 2.9292, + "step": 2608 + }, + { + "epoch": 0.17516190731854636, + "grad_norm": 5.4268012046813965, + "learning_rate": 9.913295997565966e-05, + "loss": 2.741, + "step": 2610 + }, + { + "epoch": 0.17529613100231536, + "grad_norm": 4.903132438659668, + "learning_rate": 9.913094360416755e-05, + "loss": 3.0616, + "step": 2612 + }, + { + "epoch": 0.17543035468608437, + "grad_norm": 5.198611736297607, + "learning_rate": 9.912892491133496e-05, + "loss": 3.0996, + "step": 2614 + }, + { + "epoch": 0.17556457836985337, + "grad_norm": 4.728588104248047, + "learning_rate": 9.912690389725727e-05, + "loss": 2.8964, + "step": 2616 + }, + { + "epoch": 0.17569880205362237, + "grad_norm": 5.030994892120361, + "learning_rate": 9.912488056202998e-05, + "loss": 3.1506, + "step": 2618 + }, + { + "epoch": 0.17583302573739137, + "grad_norm": 4.8778839111328125, + "learning_rate": 9.91228549057487e-05, + "loss": 3.0298, + "step": 2620 + }, + { + "epoch": 0.17596724942116038, + "grad_norm": 4.678228855133057, + "learning_rate": 9.91208269285091e-05, + "loss": 3.2421, + "step": 2622 + }, + { + "epoch": 0.17610147310492935, + "grad_norm": 5.158640384674072, + "learning_rate": 9.911879663040704e-05, + "loss": 3.1435, + "step": 2624 + }, + { + "epoch": 0.17623569678869835, + "grad_norm": 4.335305690765381, + "learning_rate": 9.911676401153842e-05, + "loss": 2.9415, + "step": 2626 + }, + { + "epoch": 0.17636992047246736, + "grad_norm": 5.130254745483398, + "learning_rate": 9.911472907199928e-05, + "loss": 2.9838, + "step": 2628 + }, + { + "epoch": 0.17650414415623636, + "grad_norm": 4.540406227111816, + "learning_rate": 9.911269181188575e-05, + "loss": 2.9873, + "step": 2630 + }, + { + "epoch": 0.17663836784000536, + "grad_norm": 4.806802749633789, + "learning_rate": 9.911065223129414e-05, + "loss": 2.8901, + "step": 2632 + }, + { + "epoch": 0.17677259152377436, + "grad_norm": 12.860517501831055, + "learning_rate": 9.910861033032079e-05, + "loss": 3.1693, + "step": 2634 + }, + { + "epoch": 0.17690681520754337, + "grad_norm": 4.569222927093506, + "learning_rate": 9.910656610906214e-05, + "loss": 3.0086, + "step": 2636 + }, + { + "epoch": 0.17704103889131237, + "grad_norm": 4.903293609619141, + "learning_rate": 9.910451956761482e-05, + "loss": 3.2535, + "step": 2638 + }, + { + "epoch": 0.17717526257508137, + "grad_norm": 5.008412837982178, + "learning_rate": 9.910247070607552e-05, + "loss": 3.0327, + "step": 2640 + }, + { + "epoch": 0.17730948625885037, + "grad_norm": 9.792829513549805, + "learning_rate": 9.910041952454103e-05, + "loss": 2.8796, + "step": 2642 + }, + { + "epoch": 0.17744370994261938, + "grad_norm": 7.832380294799805, + "learning_rate": 9.909836602310824e-05, + "loss": 2.9112, + "step": 2644 + }, + { + "epoch": 0.17757793362638838, + "grad_norm": 4.565872669219971, + "learning_rate": 9.909631020187424e-05, + "loss": 3.1241, + "step": 2646 + }, + { + "epoch": 0.17771215731015738, + "grad_norm": 4.389710903167725, + "learning_rate": 9.909425206093613e-05, + "loss": 2.969, + "step": 2648 + }, + { + "epoch": 0.17784638099392638, + "grad_norm": 4.301173210144043, + "learning_rate": 9.909219160039112e-05, + "loss": 2.8983, + "step": 2650 + }, + { + "epoch": 0.1779806046776954, + "grad_norm": 5.061441898345947, + "learning_rate": 9.90901288203366e-05, + "loss": 3.0081, + "step": 2652 + }, + { + "epoch": 0.1781148283614644, + "grad_norm": 5.468021869659424, + "learning_rate": 9.908806372087002e-05, + "loss": 3.018, + "step": 2654 + }, + { + "epoch": 0.1782490520452334, + "grad_norm": 5.737773895263672, + "learning_rate": 9.9085996302089e-05, + "loss": 3.5082, + "step": 2656 + }, + { + "epoch": 0.1783832757290024, + "grad_norm": 4.621213436126709, + "learning_rate": 9.908392656409113e-05, + "loss": 2.7489, + "step": 2658 + }, + { + "epoch": 0.1785174994127714, + "grad_norm": 5.166444301605225, + "learning_rate": 9.908185450697428e-05, + "loss": 3.0265, + "step": 2660 + }, + { + "epoch": 0.17865172309654037, + "grad_norm": 4.766532897949219, + "learning_rate": 9.907978013083629e-05, + "loss": 3.1038, + "step": 2662 + }, + { + "epoch": 0.17878594678030937, + "grad_norm": 4.324908256530762, + "learning_rate": 9.907770343577522e-05, + "loss": 3.1761, + "step": 2664 + }, + { + "epoch": 0.17892017046407838, + "grad_norm": 5.4798903465271, + "learning_rate": 9.907562442188916e-05, + "loss": 3.1491, + "step": 2666 + }, + { + "epoch": 0.17905439414784738, + "grad_norm": 4.882856845855713, + "learning_rate": 9.907354308927635e-05, + "loss": 3.1869, + "step": 2668 + }, + { + "epoch": 0.17918861783161638, + "grad_norm": 4.844232082366943, + "learning_rate": 9.907145943803513e-05, + "loss": 3.152, + "step": 2670 + }, + { + "epoch": 0.17932284151538538, + "grad_norm": 4.772347450256348, + "learning_rate": 9.906937346826395e-05, + "loss": 3.1003, + "step": 2672 + }, + { + "epoch": 0.1794570651991544, + "grad_norm": 5.155459403991699, + "learning_rate": 9.906728518006136e-05, + "loss": 3.0124, + "step": 2674 + }, + { + "epoch": 0.1795912888829234, + "grad_norm": 5.141105651855469, + "learning_rate": 9.906519457352605e-05, + "loss": 2.953, + "step": 2676 + }, + { + "epoch": 0.1797255125666924, + "grad_norm": 5.1469573974609375, + "learning_rate": 9.906310164875676e-05, + "loss": 2.902, + "step": 2678 + }, + { + "epoch": 0.1798597362504614, + "grad_norm": 4.808437824249268, + "learning_rate": 9.90610064058524e-05, + "loss": 2.7625, + "step": 2680 + }, + { + "epoch": 0.1799939599342304, + "grad_norm": 4.775310039520264, + "learning_rate": 9.905890884491195e-05, + "loss": 3.1228, + "step": 2682 + }, + { + "epoch": 0.1801281836179994, + "grad_norm": 4.711374282836914, + "learning_rate": 9.905680896603455e-05, + "loss": 2.795, + "step": 2684 + }, + { + "epoch": 0.1802624073017684, + "grad_norm": 4.63525915145874, + "learning_rate": 9.905470676931938e-05, + "loss": 3.1443, + "step": 2686 + }, + { + "epoch": 0.1803966309855374, + "grad_norm": 4.4490790367126465, + "learning_rate": 9.905260225486577e-05, + "loss": 2.8346, + "step": 2688 + }, + { + "epoch": 0.1805308546693064, + "grad_norm": 5.343776226043701, + "learning_rate": 9.905049542277318e-05, + "loss": 3.0441, + "step": 2690 + }, + { + "epoch": 0.1806650783530754, + "grad_norm": 4.965149879455566, + "learning_rate": 9.904838627314112e-05, + "loss": 3.0645, + "step": 2692 + }, + { + "epoch": 0.1807993020368444, + "grad_norm": 4.992354393005371, + "learning_rate": 9.904627480606926e-05, + "loss": 3.0905, + "step": 2694 + }, + { + "epoch": 0.18093352572061341, + "grad_norm": 5.3002166748046875, + "learning_rate": 9.904416102165736e-05, + "loss": 2.7724, + "step": 2696 + }, + { + "epoch": 0.18106774940438242, + "grad_norm": 15.152284622192383, + "learning_rate": 9.90420449200053e-05, + "loss": 2.9806, + "step": 2698 + }, + { + "epoch": 0.1812019730881514, + "grad_norm": 6.101254463195801, + "learning_rate": 9.903992650121306e-05, + "loss": 2.7564, + "step": 2700 + }, + { + "epoch": 0.1813361967719204, + "grad_norm": 4.714508056640625, + "learning_rate": 9.903780576538071e-05, + "loss": 3.0622, + "step": 2702 + }, + { + "epoch": 0.1814704204556894, + "grad_norm": 8.488800048828125, + "learning_rate": 9.903568271260847e-05, + "loss": 3.4396, + "step": 2704 + }, + { + "epoch": 0.1816046441394584, + "grad_norm": 6.979100704193115, + "learning_rate": 9.903355734299664e-05, + "loss": 2.8045, + "step": 2706 + }, + { + "epoch": 0.1817388678232274, + "grad_norm": 7.527510166168213, + "learning_rate": 9.903142965664566e-05, + "loss": 3.0148, + "step": 2708 + }, + { + "epoch": 0.1818730915069964, + "grad_norm": 4.804107189178467, + "learning_rate": 9.902929965365603e-05, + "loss": 3.0704, + "step": 2710 + }, + { + "epoch": 0.1820073151907654, + "grad_norm": 4.122807502746582, + "learning_rate": 9.902716733412842e-05, + "loss": 2.8526, + "step": 2712 + }, + { + "epoch": 0.1821415388745344, + "grad_norm": 5.3552398681640625, + "learning_rate": 9.902503269816356e-05, + "loss": 2.8483, + "step": 2714 + }, + { + "epoch": 0.1822757625583034, + "grad_norm": 4.559593200683594, + "learning_rate": 9.90228957458623e-05, + "loss": 3.0815, + "step": 2716 + }, + { + "epoch": 0.1824099862420724, + "grad_norm": 6.4086012840271, + "learning_rate": 9.902075647732563e-05, + "loss": 2.861, + "step": 2718 + }, + { + "epoch": 0.18254420992584142, + "grad_norm": 5.807387351989746, + "learning_rate": 9.90186148926546e-05, + "loss": 2.9596, + "step": 2720 + }, + { + "epoch": 0.18267843360961042, + "grad_norm": 5.999558925628662, + "learning_rate": 9.90164709919504e-05, + "loss": 3.2973, + "step": 2722 + }, + { + "epoch": 0.18281265729337942, + "grad_norm": 4.5873942375183105, + "learning_rate": 9.901432477531433e-05, + "loss": 2.8861, + "step": 2724 + }, + { + "epoch": 0.18294688097714842, + "grad_norm": 4.400182723999023, + "learning_rate": 9.901217624284782e-05, + "loss": 3.2204, + "step": 2726 + }, + { + "epoch": 0.18308110466091743, + "grad_norm": 5.39105749130249, + "learning_rate": 9.901002539465234e-05, + "loss": 3.3566, + "step": 2728 + }, + { + "epoch": 0.18321532834468643, + "grad_norm": 4.405644416809082, + "learning_rate": 9.900787223082955e-05, + "loss": 3.1483, + "step": 2730 + }, + { + "epoch": 0.18334955202845543, + "grad_norm": 4.848171710968018, + "learning_rate": 9.900571675148116e-05, + "loss": 2.7969, + "step": 2732 + }, + { + "epoch": 0.18348377571222443, + "grad_norm": 4.875325679779053, + "learning_rate": 9.9003558956709e-05, + "loss": 3.0175, + "step": 2734 + }, + { + "epoch": 0.18361799939599344, + "grad_norm": 4.2971601486206055, + "learning_rate": 9.900139884661507e-05, + "loss": 3.1017, + "step": 2736 + }, + { + "epoch": 0.1837522230797624, + "grad_norm": 6.6973466873168945, + "learning_rate": 9.899923642130139e-05, + "loss": 3.0386, + "step": 2738 + }, + { + "epoch": 0.1838864467635314, + "grad_norm": 5.369362831115723, + "learning_rate": 9.899707168087013e-05, + "loss": 2.9801, + "step": 2740 + }, + { + "epoch": 0.18402067044730042, + "grad_norm": 4.55104398727417, + "learning_rate": 9.89949046254236e-05, + "loss": 3.0322, + "step": 2742 + }, + { + "epoch": 0.18415489413106942, + "grad_norm": 5.05248498916626, + "learning_rate": 9.899273525506417e-05, + "loss": 3.0482, + "step": 2744 + }, + { + "epoch": 0.18428911781483842, + "grad_norm": 4.862440586090088, + "learning_rate": 9.899056356989434e-05, + "loss": 2.7865, + "step": 2746 + }, + { + "epoch": 0.18442334149860742, + "grad_norm": 4.306432247161865, + "learning_rate": 9.89883895700167e-05, + "loss": 2.7666, + "step": 2748 + }, + { + "epoch": 0.18455756518237643, + "grad_norm": 5.790725231170654, + "learning_rate": 9.8986213255534e-05, + "loss": 3.4177, + "step": 2750 + }, + { + "epoch": 0.18469178886614543, + "grad_norm": 5.963059902191162, + "learning_rate": 9.898403462654904e-05, + "loss": 3.2053, + "step": 2752 + }, + { + "epoch": 0.18482601254991443, + "grad_norm": 5.231216907501221, + "learning_rate": 9.898185368316477e-05, + "loss": 3.084, + "step": 2754 + }, + { + "epoch": 0.18496023623368343, + "grad_norm": 4.580977916717529, + "learning_rate": 9.897967042548424e-05, + "loss": 3.0843, + "step": 2756 + }, + { + "epoch": 0.18509445991745244, + "grad_norm": 4.761484622955322, + "learning_rate": 9.897748485361059e-05, + "loss": 2.9444, + "step": 2758 + }, + { + "epoch": 0.18522868360122144, + "grad_norm": 4.556168556213379, + "learning_rate": 9.89752969676471e-05, + "loss": 2.8191, + "step": 2760 + }, + { + "epoch": 0.18536290728499044, + "grad_norm": 4.885586738586426, + "learning_rate": 9.897310676769712e-05, + "loss": 3.2436, + "step": 2762 + }, + { + "epoch": 0.18549713096875944, + "grad_norm": 4.555342197418213, + "learning_rate": 9.897091425386415e-05, + "loss": 3.0212, + "step": 2764 + }, + { + "epoch": 0.18563135465252845, + "grad_norm": 5.180994033813477, + "learning_rate": 9.896871942625179e-05, + "loss": 3.0865, + "step": 2766 + }, + { + "epoch": 0.18576557833629745, + "grad_norm": 5.284283638000488, + "learning_rate": 9.896652228496372e-05, + "loss": 3.1816, + "step": 2768 + }, + { + "epoch": 0.18589980202006645, + "grad_norm": 4.591383934020996, + "learning_rate": 9.896432283010376e-05, + "loss": 2.9231, + "step": 2770 + }, + { + "epoch": 0.18603402570383545, + "grad_norm": 4.555316925048828, + "learning_rate": 9.896212106177583e-05, + "loss": 3.0008, + "step": 2772 + }, + { + "epoch": 0.18616824938760446, + "grad_norm": 8.6260404586792, + "learning_rate": 9.895991698008397e-05, + "loss": 3.0706, + "step": 2774 + }, + { + "epoch": 0.18630247307137343, + "grad_norm": 4.872500419616699, + "learning_rate": 9.89577105851323e-05, + "loss": 3.1808, + "step": 2776 + }, + { + "epoch": 0.18643669675514243, + "grad_norm": 5.334842681884766, + "learning_rate": 9.895550187702506e-05, + "loss": 3.1294, + "step": 2778 + }, + { + "epoch": 0.18657092043891144, + "grad_norm": 4.260520935058594, + "learning_rate": 9.895329085586667e-05, + "loss": 3.0935, + "step": 2780 + }, + { + "epoch": 0.18670514412268044, + "grad_norm": 4.5289812088012695, + "learning_rate": 9.895107752176152e-05, + "loss": 3.3009, + "step": 2782 + }, + { + "epoch": 0.18683936780644944, + "grad_norm": 4.24552583694458, + "learning_rate": 9.894886187481421e-05, + "loss": 3.2435, + "step": 2784 + }, + { + "epoch": 0.18697359149021844, + "grad_norm": 4.825685024261475, + "learning_rate": 9.894664391512943e-05, + "loss": 3.1724, + "step": 2786 + }, + { + "epoch": 0.18710781517398745, + "grad_norm": 5.19764518737793, + "learning_rate": 9.894442364281197e-05, + "loss": 2.9545, + "step": 2788 + }, + { + "epoch": 0.18724203885775645, + "grad_norm": 5.114936351776123, + "learning_rate": 9.894220105796676e-05, + "loss": 3.2709, + "step": 2790 + }, + { + "epoch": 0.18737626254152545, + "grad_norm": 4.323258399963379, + "learning_rate": 9.893997616069878e-05, + "loss": 3.0974, + "step": 2792 + }, + { + "epoch": 0.18751048622529445, + "grad_norm": 4.974935531616211, + "learning_rate": 9.893774895111317e-05, + "loss": 2.9185, + "step": 2794 + }, + { + "epoch": 0.18764470990906346, + "grad_norm": 4.615647315979004, + "learning_rate": 9.893551942931514e-05, + "loss": 3.1645, + "step": 2796 + }, + { + "epoch": 0.18777893359283246, + "grad_norm": 4.176204204559326, + "learning_rate": 9.893328759541003e-05, + "loss": 3.2093, + "step": 2798 + }, + { + "epoch": 0.18791315727660146, + "grad_norm": 5.691295146942139, + "learning_rate": 9.893105344950333e-05, + "loss": 2.7254, + "step": 2800 + }, + { + "epoch": 0.18804738096037046, + "grad_norm": 4.316233158111572, + "learning_rate": 9.892881699170058e-05, + "loss": 2.6145, + "step": 2802 + }, + { + "epoch": 0.18818160464413947, + "grad_norm": 4.690177917480469, + "learning_rate": 9.892657822210742e-05, + "loss": 2.7959, + "step": 2804 + }, + { + "epoch": 0.18831582832790847, + "grad_norm": 5.248259544372559, + "learning_rate": 9.892433714082966e-05, + "loss": 2.963, + "step": 2806 + }, + { + "epoch": 0.18845005201167747, + "grad_norm": 4.938899517059326, + "learning_rate": 9.892209374797318e-05, + "loss": 3.1232, + "step": 2808 + }, + { + "epoch": 0.18858427569544647, + "grad_norm": 4.6278276443481445, + "learning_rate": 9.891984804364395e-05, + "loss": 2.9296, + "step": 2810 + }, + { + "epoch": 0.18871849937921548, + "grad_norm": 4.560853481292725, + "learning_rate": 9.891760002794812e-05, + "loss": 2.9692, + "step": 2812 + }, + { + "epoch": 0.18885272306298445, + "grad_norm": 4.186822414398193, + "learning_rate": 9.891534970099188e-05, + "loss": 3.1278, + "step": 2814 + }, + { + "epoch": 0.18898694674675345, + "grad_norm": 4.2081170082092285, + "learning_rate": 9.891309706288154e-05, + "loss": 2.8261, + "step": 2816 + }, + { + "epoch": 0.18912117043052246, + "grad_norm": 4.672083854675293, + "learning_rate": 9.891084211372356e-05, + "loss": 3.1821, + "step": 2818 + }, + { + "epoch": 0.18925539411429146, + "grad_norm": 9.166268348693848, + "learning_rate": 9.890858485362447e-05, + "loss": 3.0306, + "step": 2820 + }, + { + "epoch": 0.18938961779806046, + "grad_norm": 5.412232875823975, + "learning_rate": 9.89063252826909e-05, + "loss": 3.0113, + "step": 2822 + }, + { + "epoch": 0.18952384148182946, + "grad_norm": 5.974574565887451, + "learning_rate": 9.890406340102964e-05, + "loss": 3.125, + "step": 2824 + }, + { + "epoch": 0.18965806516559847, + "grad_norm": 5.323157787322998, + "learning_rate": 9.890179920874756e-05, + "loss": 2.8618, + "step": 2826 + }, + { + "epoch": 0.18979228884936747, + "grad_norm": 4.565852165222168, + "learning_rate": 9.889953270595162e-05, + "loss": 3.0025, + "step": 2828 + }, + { + "epoch": 0.18992651253313647, + "grad_norm": 4.1333770751953125, + "learning_rate": 9.889726389274892e-05, + "loss": 2.8268, + "step": 2830 + }, + { + "epoch": 0.19006073621690547, + "grad_norm": 5.104335784912109, + "learning_rate": 9.889499276924666e-05, + "loss": 3.1633, + "step": 2832 + }, + { + "epoch": 0.19019495990067448, + "grad_norm": 6.353046417236328, + "learning_rate": 9.889271933555213e-05, + "loss": 3.1672, + "step": 2834 + }, + { + "epoch": 0.19032918358444348, + "grad_norm": 4.872444152832031, + "learning_rate": 9.889044359177277e-05, + "loss": 2.8665, + "step": 2836 + }, + { + "epoch": 0.19046340726821248, + "grad_norm": 5.431187629699707, + "learning_rate": 9.888816553801608e-05, + "loss": 2.9538, + "step": 2838 + }, + { + "epoch": 0.19059763095198148, + "grad_norm": 4.387043476104736, + "learning_rate": 9.888588517438968e-05, + "loss": 2.9044, + "step": 2840 + }, + { + "epoch": 0.1907318546357505, + "grad_norm": 4.429694175720215, + "learning_rate": 9.888360250100137e-05, + "loss": 2.8731, + "step": 2842 + }, + { + "epoch": 0.1908660783195195, + "grad_norm": 5.1301398277282715, + "learning_rate": 9.888131751795895e-05, + "loss": 3.0307, + "step": 2844 + }, + { + "epoch": 0.1910003020032885, + "grad_norm": 5.835402488708496, + "learning_rate": 9.88790302253704e-05, + "loss": 3.1002, + "step": 2846 + }, + { + "epoch": 0.1911345256870575, + "grad_norm": 4.891978740692139, + "learning_rate": 9.887674062334377e-05, + "loss": 3.2357, + "step": 2848 + }, + { + "epoch": 0.19126874937082647, + "grad_norm": 5.5543646812438965, + "learning_rate": 9.88744487119873e-05, + "loss": 2.991, + "step": 2850 + }, + { + "epoch": 0.19140297305459547, + "grad_norm": 5.036186695098877, + "learning_rate": 9.88721544914092e-05, + "loss": 2.873, + "step": 2852 + }, + { + "epoch": 0.19153719673836447, + "grad_norm": 4.820505619049072, + "learning_rate": 9.886985796171792e-05, + "loss": 3.1428, + "step": 2854 + }, + { + "epoch": 0.19167142042213348, + "grad_norm": 4.456747055053711, + "learning_rate": 9.886755912302194e-05, + "loss": 2.7074, + "step": 2856 + }, + { + "epoch": 0.19180564410590248, + "grad_norm": 4.537261962890625, + "learning_rate": 9.886525797542989e-05, + "loss": 3.0265, + "step": 2858 + }, + { + "epoch": 0.19193986778967148, + "grad_norm": 4.514282703399658, + "learning_rate": 9.88629545190505e-05, + "loss": 3.1259, + "step": 2860 + }, + { + "epoch": 0.19207409147344048, + "grad_norm": 4.908985137939453, + "learning_rate": 9.88606487539926e-05, + "loss": 3.1397, + "step": 2862 + }, + { + "epoch": 0.1922083151572095, + "grad_norm": 4.455459117889404, + "learning_rate": 9.88583406803651e-05, + "loss": 3.014, + "step": 2864 + }, + { + "epoch": 0.1923425388409785, + "grad_norm": 4.096607208251953, + "learning_rate": 9.885603029827707e-05, + "loss": 2.8655, + "step": 2866 + }, + { + "epoch": 0.1924767625247475, + "grad_norm": 4.936752796173096, + "learning_rate": 9.885371760783772e-05, + "loss": 2.9668, + "step": 2868 + }, + { + "epoch": 0.1926109862085165, + "grad_norm": 4.402291297912598, + "learning_rate": 9.885140260915625e-05, + "loss": 2.9565, + "step": 2870 + }, + { + "epoch": 0.1927452098922855, + "grad_norm": 5.832319259643555, + "learning_rate": 9.884908530234208e-05, + "loss": 3.0515, + "step": 2872 + }, + { + "epoch": 0.1928794335760545, + "grad_norm": 4.646378993988037, + "learning_rate": 9.884676568750469e-05, + "loss": 2.9138, + "step": 2874 + }, + { + "epoch": 0.1930136572598235, + "grad_norm": 4.520015716552734, + "learning_rate": 9.884444376475367e-05, + "loss": 3.0548, + "step": 2876 + }, + { + "epoch": 0.1931478809435925, + "grad_norm": 4.29709529876709, + "learning_rate": 9.884211953419873e-05, + "loss": 2.6654, + "step": 2878 + }, + { + "epoch": 0.1932821046273615, + "grad_norm": 5.964015007019043, + "learning_rate": 9.883979299594969e-05, + "loss": 2.9208, + "step": 2880 + }, + { + "epoch": 0.1934163283111305, + "grad_norm": 4.348883628845215, + "learning_rate": 9.883746415011646e-05, + "loss": 2.8494, + "step": 2882 + }, + { + "epoch": 0.1935505519948995, + "grad_norm": 4.637997627258301, + "learning_rate": 9.88351329968091e-05, + "loss": 3.025, + "step": 2884 + }, + { + "epoch": 0.1936847756786685, + "grad_norm": 4.242954254150391, + "learning_rate": 9.883279953613771e-05, + "loss": 2.8031, + "step": 2886 + }, + { + "epoch": 0.1938189993624375, + "grad_norm": 4.646054744720459, + "learning_rate": 9.88304637682126e-05, + "loss": 2.926, + "step": 2888 + }, + { + "epoch": 0.1939532230462065, + "grad_norm": 4.189169406890869, + "learning_rate": 9.882812569314408e-05, + "loss": 2.748, + "step": 2890 + }, + { + "epoch": 0.1940874467299755, + "grad_norm": 6.062061309814453, + "learning_rate": 9.882578531104263e-05, + "loss": 3.0497, + "step": 2892 + }, + { + "epoch": 0.1942216704137445, + "grad_norm": 4.167173862457275, + "learning_rate": 9.882344262201884e-05, + "loss": 2.887, + "step": 2894 + }, + { + "epoch": 0.1943558940975135, + "grad_norm": 13.352838516235352, + "learning_rate": 9.882109762618342e-05, + "loss": 2.8955, + "step": 2896 + }, + { + "epoch": 0.1944901177812825, + "grad_norm": 4.54686975479126, + "learning_rate": 9.88187503236471e-05, + "loss": 3.2579, + "step": 2898 + }, + { + "epoch": 0.1946243414650515, + "grad_norm": 4.379400253295898, + "learning_rate": 9.881640071452085e-05, + "loss": 2.9556, + "step": 2900 + }, + { + "epoch": 0.1947585651488205, + "grad_norm": 4.808958053588867, + "learning_rate": 9.881404879891565e-05, + "loss": 3.1112, + "step": 2902 + }, + { + "epoch": 0.1948927888325895, + "grad_norm": 5.711093902587891, + "learning_rate": 9.881169457694263e-05, + "loss": 2.9822, + "step": 2904 + }, + { + "epoch": 0.1950270125163585, + "grad_norm": 8.140791893005371, + "learning_rate": 9.880933804871304e-05, + "loss": 3.2018, + "step": 2906 + }, + { + "epoch": 0.1951612362001275, + "grad_norm": 5.479485988616943, + "learning_rate": 9.88069792143382e-05, + "loss": 2.9921, + "step": 2908 + }, + { + "epoch": 0.19529545988389652, + "grad_norm": 4.84016227722168, + "learning_rate": 9.880461807392956e-05, + "loss": 2.8461, + "step": 2910 + }, + { + "epoch": 0.19542968356766552, + "grad_norm": 4.35216760635376, + "learning_rate": 9.880225462759869e-05, + "loss": 3.0878, + "step": 2912 + }, + { + "epoch": 0.19556390725143452, + "grad_norm": 5.248281478881836, + "learning_rate": 9.879988887545726e-05, + "loss": 3.0106, + "step": 2914 + }, + { + "epoch": 0.19569813093520352, + "grad_norm": 4.720247745513916, + "learning_rate": 9.879752081761704e-05, + "loss": 2.8837, + "step": 2916 + }, + { + "epoch": 0.19583235461897253, + "grad_norm": 5.087040424346924, + "learning_rate": 9.87951504541899e-05, + "loss": 3.1059, + "step": 2918 + }, + { + "epoch": 0.19596657830274153, + "grad_norm": 6.601572513580322, + "learning_rate": 9.879277778528788e-05, + "loss": 3.1033, + "step": 2920 + }, + { + "epoch": 0.19610080198651053, + "grad_norm": 4.676246643066406, + "learning_rate": 9.879040281102303e-05, + "loss": 2.8641, + "step": 2922 + }, + { + "epoch": 0.19623502567027953, + "grad_norm": 5.660369873046875, + "learning_rate": 9.878802553150762e-05, + "loss": 3.0561, + "step": 2924 + }, + { + "epoch": 0.1963692493540485, + "grad_norm": 4.581969738006592, + "learning_rate": 9.87856459468539e-05, + "loss": 2.9039, + "step": 2926 + }, + { + "epoch": 0.1965034730378175, + "grad_norm": 5.679072380065918, + "learning_rate": 9.878326405717438e-05, + "loss": 3.1084, + "step": 2928 + }, + { + "epoch": 0.1966376967215865, + "grad_norm": 4.841191291809082, + "learning_rate": 9.878087986258156e-05, + "loss": 2.9364, + "step": 2930 + }, + { + "epoch": 0.19677192040535552, + "grad_norm": 4.380975723266602, + "learning_rate": 9.877849336318807e-05, + "loss": 2.9225, + "step": 2932 + }, + { + "epoch": 0.19690614408912452, + "grad_norm": 5.682320594787598, + "learning_rate": 9.877610455910668e-05, + "loss": 3.097, + "step": 2934 + }, + { + "epoch": 0.19704036777289352, + "grad_norm": 4.730079174041748, + "learning_rate": 9.877371345045029e-05, + "loss": 3.0194, + "step": 2936 + }, + { + "epoch": 0.19717459145666252, + "grad_norm": 5.038098335266113, + "learning_rate": 9.877132003733182e-05, + "loss": 2.7863, + "step": 2938 + }, + { + "epoch": 0.19730881514043153, + "grad_norm": 4.949071884155273, + "learning_rate": 9.876892431986442e-05, + "loss": 2.983, + "step": 2940 + }, + { + "epoch": 0.19744303882420053, + "grad_norm": 5.0759172439575195, + "learning_rate": 9.876652629816122e-05, + "loss": 2.8861, + "step": 2942 + }, + { + "epoch": 0.19757726250796953, + "grad_norm": 4.650147438049316, + "learning_rate": 9.876412597233555e-05, + "loss": 3.233, + "step": 2944 + }, + { + "epoch": 0.19771148619173853, + "grad_norm": 5.366354942321777, + "learning_rate": 9.876172334250082e-05, + "loss": 2.8143, + "step": 2946 + }, + { + "epoch": 0.19784570987550754, + "grad_norm": 4.915742874145508, + "learning_rate": 9.875931840877055e-05, + "loss": 3.038, + "step": 2948 + }, + { + "epoch": 0.19797993355927654, + "grad_norm": 4.63529109954834, + "learning_rate": 9.875691117125837e-05, + "loss": 3.0048, + "step": 2950 + }, + { + "epoch": 0.19811415724304554, + "grad_norm": 4.820679664611816, + "learning_rate": 9.875450163007801e-05, + "loss": 3.0712, + "step": 2952 + }, + { + "epoch": 0.19824838092681454, + "grad_norm": 7.678104400634766, + "learning_rate": 9.875208978534331e-05, + "loss": 2.7191, + "step": 2954 + }, + { + "epoch": 0.19838260461058355, + "grad_norm": 4.032191276550293, + "learning_rate": 9.874967563716826e-05, + "loss": 2.6864, + "step": 2956 + }, + { + "epoch": 0.19851682829435255, + "grad_norm": 5.204829216003418, + "learning_rate": 9.874725918566687e-05, + "loss": 2.9366, + "step": 2958 + }, + { + "epoch": 0.19865105197812155, + "grad_norm": 6.055318832397461, + "learning_rate": 9.874484043095336e-05, + "loss": 2.8951, + "step": 2960 + }, + { + "epoch": 0.19878527566189055, + "grad_norm": 4.718400478363037, + "learning_rate": 9.874241937314199e-05, + "loss": 2.6789, + "step": 2962 + }, + { + "epoch": 0.19891949934565953, + "grad_norm": 4.888490676879883, + "learning_rate": 9.873999601234715e-05, + "loss": 3.0525, + "step": 2964 + }, + { + "epoch": 0.19905372302942853, + "grad_norm": 6.307872772216797, + "learning_rate": 9.873757034868333e-05, + "loss": 2.842, + "step": 2966 + }, + { + "epoch": 0.19918794671319753, + "grad_norm": 5.280684947967529, + "learning_rate": 9.873514238226515e-05, + "loss": 2.9871, + "step": 2968 + }, + { + "epoch": 0.19932217039696654, + "grad_norm": 4.8313679695129395, + "learning_rate": 9.873271211320735e-05, + "loss": 2.9292, + "step": 2970 + }, + { + "epoch": 0.19945639408073554, + "grad_norm": 4.769979953765869, + "learning_rate": 9.873027954162471e-05, + "loss": 3.0119, + "step": 2972 + }, + { + "epoch": 0.19959061776450454, + "grad_norm": 4.911874771118164, + "learning_rate": 9.87278446676322e-05, + "loss": 3.06, + "step": 2974 + }, + { + "epoch": 0.19972484144827354, + "grad_norm": 4.960622787475586, + "learning_rate": 9.872540749134484e-05, + "loss": 3.2491, + "step": 2976 + }, + { + "epoch": 0.19985906513204255, + "grad_norm": 6.709426403045654, + "learning_rate": 9.872296801287779e-05, + "loss": 3.1919, + "step": 2978 + }, + { + "epoch": 0.19999328881581155, + "grad_norm": 4.409572601318359, + "learning_rate": 9.872052623234632e-05, + "loss": 2.9556, + "step": 2980 + }, + { + "epoch": 0.20012751249958055, + "grad_norm": 5.028787612915039, + "learning_rate": 9.871808214986578e-05, + "loss": 2.77, + "step": 2982 + }, + { + "epoch": 0.20026173618334955, + "grad_norm": 5.187703609466553, + "learning_rate": 9.871563576555165e-05, + "loss": 2.9415, + "step": 2984 + }, + { + "epoch": 0.20039595986711856, + "grad_norm": 5.913765907287598, + "learning_rate": 9.871318707951953e-05, + "loss": 2.9506, + "step": 2986 + }, + { + "epoch": 0.20053018355088756, + "grad_norm": 4.607470989227295, + "learning_rate": 9.871073609188513e-05, + "loss": 3.2737, + "step": 2988 + }, + { + "epoch": 0.20066440723465656, + "grad_norm": 4.7657856941223145, + "learning_rate": 9.87082828027642e-05, + "loss": 2.8081, + "step": 2990 + }, + { + "epoch": 0.20079863091842556, + "grad_norm": 4.646143913269043, + "learning_rate": 9.870582721227273e-05, + "loss": 3.2008, + "step": 2992 + }, + { + "epoch": 0.20093285460219457, + "grad_norm": 4.492005825042725, + "learning_rate": 9.870336932052667e-05, + "loss": 2.8083, + "step": 2994 + }, + { + "epoch": 0.20106707828596357, + "grad_norm": 4.593507289886475, + "learning_rate": 9.87009091276422e-05, + "loss": 2.9462, + "step": 2996 + }, + { + "epoch": 0.20120130196973257, + "grad_norm": 6.412387371063232, + "learning_rate": 9.869844663373553e-05, + "loss": 2.9482, + "step": 2998 + }, + { + "epoch": 0.20133552565350157, + "grad_norm": 5.73430061340332, + "learning_rate": 9.8695981838923e-05, + "loss": 2.9716, + "step": 3000 + }, + { + "epoch": 0.20146974933727055, + "grad_norm": 4.69178581237793, + "learning_rate": 9.869351474332111e-05, + "loss": 2.9309, + "step": 3002 + }, + { + "epoch": 0.20160397302103955, + "grad_norm": 4.473366737365723, + "learning_rate": 9.869104534704641e-05, + "loss": 2.8653, + "step": 3004 + }, + { + "epoch": 0.20173819670480855, + "grad_norm": 4.58015775680542, + "learning_rate": 9.868857365021553e-05, + "loss": 3.2293, + "step": 3006 + }, + { + "epoch": 0.20187242038857756, + "grad_norm": 5.450071334838867, + "learning_rate": 9.86860996529453e-05, + "loss": 2.9203, + "step": 3008 + }, + { + "epoch": 0.20200664407234656, + "grad_norm": 4.470168113708496, + "learning_rate": 9.868362335535262e-05, + "loss": 3.1081, + "step": 3010 + }, + { + "epoch": 0.20214086775611556, + "grad_norm": 4.281466960906982, + "learning_rate": 9.868114475755445e-05, + "loss": 2.8112, + "step": 3012 + }, + { + "epoch": 0.20227509143988456, + "grad_norm": 4.289178371429443, + "learning_rate": 9.867866385966791e-05, + "loss": 2.923, + "step": 3014 + }, + { + "epoch": 0.20240931512365357, + "grad_norm": 4.964844703674316, + "learning_rate": 9.867618066181023e-05, + "loss": 3.1304, + "step": 3016 + }, + { + "epoch": 0.20254353880742257, + "grad_norm": 7.420295238494873, + "learning_rate": 9.867369516409874e-05, + "loss": 2.9144, + "step": 3018 + }, + { + "epoch": 0.20267776249119157, + "grad_norm": 5.811273097991943, + "learning_rate": 9.867120736665087e-05, + "loss": 2.9201, + "step": 3020 + }, + { + "epoch": 0.20281198617496057, + "grad_norm": 5.532215118408203, + "learning_rate": 9.866871726958415e-05, + "loss": 3.3769, + "step": 3022 + }, + { + "epoch": 0.20294620985872958, + "grad_norm": 5.784740924835205, + "learning_rate": 9.866622487301624e-05, + "loss": 2.6821, + "step": 3024 + }, + { + "epoch": 0.20308043354249858, + "grad_norm": 4.862562656402588, + "learning_rate": 9.866373017706492e-05, + "loss": 3.1841, + "step": 3026 + }, + { + "epoch": 0.20321465722626758, + "grad_norm": 4.08389949798584, + "learning_rate": 9.866123318184803e-05, + "loss": 2.6044, + "step": 3028 + }, + { + "epoch": 0.20334888091003658, + "grad_norm": 5.807723522186279, + "learning_rate": 9.865873388748354e-05, + "loss": 3.0873, + "step": 3030 + }, + { + "epoch": 0.20348310459380559, + "grad_norm": 4.190324783325195, + "learning_rate": 9.865623229408959e-05, + "loss": 2.8872, + "step": 3032 + }, + { + "epoch": 0.2036173282775746, + "grad_norm": 4.082070827484131, + "learning_rate": 9.865372840178433e-05, + "loss": 3.0798, + "step": 3034 + }, + { + "epoch": 0.2037515519613436, + "grad_norm": 4.487260818481445, + "learning_rate": 9.865122221068608e-05, + "loss": 3.0651, + "step": 3036 + }, + { + "epoch": 0.2038857756451126, + "grad_norm": 5.233713626861572, + "learning_rate": 9.864871372091324e-05, + "loss": 2.8496, + "step": 3038 + }, + { + "epoch": 0.20401999932888157, + "grad_norm": 5.118753433227539, + "learning_rate": 9.864620293258434e-05, + "loss": 2.9149, + "step": 3040 + }, + { + "epoch": 0.20415422301265057, + "grad_norm": 5.081789016723633, + "learning_rate": 9.864368984581803e-05, + "loss": 2.7138, + "step": 3042 + }, + { + "epoch": 0.20428844669641957, + "grad_norm": 5.064399242401123, + "learning_rate": 9.8641174460733e-05, + "loss": 2.9081, + "step": 3044 + }, + { + "epoch": 0.20442267038018858, + "grad_norm": 4.803860187530518, + "learning_rate": 9.863865677744814e-05, + "loss": 2.9386, + "step": 3046 + }, + { + "epoch": 0.20455689406395758, + "grad_norm": 4.2439374923706055, + "learning_rate": 9.863613679608239e-05, + "loss": 2.7423, + "step": 3048 + }, + { + "epoch": 0.20469111774772658, + "grad_norm": 4.583193302154541, + "learning_rate": 9.863361451675481e-05, + "loss": 3.2513, + "step": 3050 + }, + { + "epoch": 0.20482534143149558, + "grad_norm": 4.706700801849365, + "learning_rate": 9.863108993958459e-05, + "loss": 3.0364, + "step": 3052 + }, + { + "epoch": 0.20495956511526459, + "grad_norm": 4.859589576721191, + "learning_rate": 9.862856306469099e-05, + "loss": 3.037, + "step": 3054 + }, + { + "epoch": 0.2050937887990336, + "grad_norm": 4.992952346801758, + "learning_rate": 9.86260338921934e-05, + "loss": 2.7897, + "step": 3056 + }, + { + "epoch": 0.2052280124828026, + "grad_norm": 4.215695381164551, + "learning_rate": 9.862350242221135e-05, + "loss": 3.1884, + "step": 3058 + }, + { + "epoch": 0.2053622361665716, + "grad_norm": 4.713428497314453, + "learning_rate": 9.862096865486441e-05, + "loss": 2.8889, + "step": 3060 + }, + { + "epoch": 0.2054964598503406, + "grad_norm": 4.492865562438965, + "learning_rate": 9.861843259027233e-05, + "loss": 2.8674, + "step": 3062 + }, + { + "epoch": 0.2056306835341096, + "grad_norm": 4.709263324737549, + "learning_rate": 9.861589422855488e-05, + "loss": 3.057, + "step": 3064 + }, + { + "epoch": 0.2057649072178786, + "grad_norm": 4.3539838790893555, + "learning_rate": 9.861335356983206e-05, + "loss": 2.8442, + "step": 3066 + }, + { + "epoch": 0.2058991309016476, + "grad_norm": 5.1753458976745605, + "learning_rate": 9.861081061422386e-05, + "loss": 3.1212, + "step": 3068 + }, + { + "epoch": 0.2060333545854166, + "grad_norm": 5.75732946395874, + "learning_rate": 9.860826536185044e-05, + "loss": 2.9136, + "step": 3070 + }, + { + "epoch": 0.2061675782691856, + "grad_norm": 3.810210704803467, + "learning_rate": 9.860571781283208e-05, + "loss": 2.8801, + "step": 3072 + }, + { + "epoch": 0.2063018019529546, + "grad_norm": 5.917092800140381, + "learning_rate": 9.860316796728912e-05, + "loss": 3.1506, + "step": 3074 + }, + { + "epoch": 0.2064360256367236, + "grad_norm": 4.960634231567383, + "learning_rate": 9.860061582534205e-05, + "loss": 2.8865, + "step": 3076 + }, + { + "epoch": 0.2065702493204926, + "grad_norm": 5.972437858581543, + "learning_rate": 9.859806138711148e-05, + "loss": 3.1422, + "step": 3078 + }, + { + "epoch": 0.2067044730042616, + "grad_norm": 11.929288864135742, + "learning_rate": 9.859550465271804e-05, + "loss": 2.6299, + "step": 3080 + }, + { + "epoch": 0.2068386966880306, + "grad_norm": 5.1269378662109375, + "learning_rate": 9.859294562228258e-05, + "loss": 3.2633, + "step": 3082 + }, + { + "epoch": 0.2069729203717996, + "grad_norm": 4.171808242797852, + "learning_rate": 9.859038429592599e-05, + "loss": 2.9585, + "step": 3084 + }, + { + "epoch": 0.2071071440555686, + "grad_norm": 4.3812947273254395, + "learning_rate": 9.858782067376928e-05, + "loss": 2.6922, + "step": 3086 + }, + { + "epoch": 0.2072413677393376, + "grad_norm": 5.0925445556640625, + "learning_rate": 9.85852547559336e-05, + "loss": 3.3227, + "step": 3088 + }, + { + "epoch": 0.2073755914231066, + "grad_norm": 4.524392127990723, + "learning_rate": 9.858268654254017e-05, + "loss": 2.9197, + "step": 3090 + }, + { + "epoch": 0.2075098151068756, + "grad_norm": 5.856257438659668, + "learning_rate": 9.858011603371033e-05, + "loss": 2.8195, + "step": 3092 + }, + { + "epoch": 0.2076440387906446, + "grad_norm": 4.258291721343994, + "learning_rate": 9.857754322956554e-05, + "loss": 2.7859, + "step": 3094 + }, + { + "epoch": 0.2077782624744136, + "grad_norm": 4.55875301361084, + "learning_rate": 9.857496813022735e-05, + "loss": 2.9629, + "step": 3096 + }, + { + "epoch": 0.2079124861581826, + "grad_norm": 5.416093349456787, + "learning_rate": 9.857239073581743e-05, + "loss": 3.1102, + "step": 3098 + }, + { + "epoch": 0.20804670984195162, + "grad_norm": 4.387839317321777, + "learning_rate": 9.856981104645757e-05, + "loss": 2.9312, + "step": 3100 + }, + { + "epoch": 0.20818093352572062, + "grad_norm": 4.711963176727295, + "learning_rate": 9.856722906226965e-05, + "loss": 3.0957, + "step": 3102 + }, + { + "epoch": 0.20831515720948962, + "grad_norm": 4.83768367767334, + "learning_rate": 9.856464478337566e-05, + "loss": 3.0969, + "step": 3104 + }, + { + "epoch": 0.20844938089325862, + "grad_norm": 4.408140182495117, + "learning_rate": 9.85620582098977e-05, + "loss": 2.9563, + "step": 3106 + }, + { + "epoch": 0.20858360457702763, + "grad_norm": 4.3398756980896, + "learning_rate": 9.855946934195799e-05, + "loss": 3.1224, + "step": 3108 + }, + { + "epoch": 0.20871782826079663, + "grad_norm": 6.5346856117248535, + "learning_rate": 9.855687817967882e-05, + "loss": 3.0357, + "step": 3110 + }, + { + "epoch": 0.20885205194456563, + "grad_norm": 4.619713306427002, + "learning_rate": 9.855428472318267e-05, + "loss": 3.0359, + "step": 3112 + }, + { + "epoch": 0.20898627562833463, + "grad_norm": 5.982784271240234, + "learning_rate": 9.855168897259202e-05, + "loss": 2.9542, + "step": 3114 + }, + { + "epoch": 0.2091204993121036, + "grad_norm": 5.458873748779297, + "learning_rate": 9.854909092802955e-05, + "loss": 2.8819, + "step": 3116 + }, + { + "epoch": 0.2092547229958726, + "grad_norm": 4.327225685119629, + "learning_rate": 9.854649058961799e-05, + "loss": 2.9382, + "step": 3118 + }, + { + "epoch": 0.2093889466796416, + "grad_norm": 5.588174819946289, + "learning_rate": 9.854388795748022e-05, + "loss": 2.9088, + "step": 3120 + }, + { + "epoch": 0.20952317036341062, + "grad_norm": 4.043487548828125, + "learning_rate": 9.854128303173919e-05, + "loss": 2.7337, + "step": 3122 + }, + { + "epoch": 0.20965739404717962, + "grad_norm": 4.739105701446533, + "learning_rate": 9.8538675812518e-05, + "loss": 3.0553, + "step": 3124 + }, + { + "epoch": 0.20979161773094862, + "grad_norm": 6.621822357177734, + "learning_rate": 9.853606629993983e-05, + "loss": 3.0532, + "step": 3126 + }, + { + "epoch": 0.20992584141471762, + "grad_norm": 4.173099040985107, + "learning_rate": 9.853345449412796e-05, + "loss": 2.6463, + "step": 3128 + }, + { + "epoch": 0.21006006509848663, + "grad_norm": 4.807617664337158, + "learning_rate": 9.853084039520581e-05, + "loss": 3.1227, + "step": 3130 + }, + { + "epoch": 0.21019428878225563, + "grad_norm": 5.022834777832031, + "learning_rate": 9.852822400329688e-05, + "loss": 2.9445, + "step": 3132 + }, + { + "epoch": 0.21032851246602463, + "grad_norm": 4.744380474090576, + "learning_rate": 9.852560531852479e-05, + "loss": 2.9372, + "step": 3134 + }, + { + "epoch": 0.21046273614979363, + "grad_norm": 4.575165748596191, + "learning_rate": 9.852298434101328e-05, + "loss": 2.8397, + "step": 3136 + }, + { + "epoch": 0.21059695983356264, + "grad_norm": 5.041839599609375, + "learning_rate": 9.852036107088617e-05, + "loss": 2.9368, + "step": 3138 + }, + { + "epoch": 0.21073118351733164, + "grad_norm": 4.542657852172852, + "learning_rate": 9.851773550826742e-05, + "loss": 2.8069, + "step": 3140 + }, + { + "epoch": 0.21086540720110064, + "grad_norm": 4.615137100219727, + "learning_rate": 9.851510765328105e-05, + "loss": 2.9071, + "step": 3142 + }, + { + "epoch": 0.21099963088486964, + "grad_norm": 4.4661712646484375, + "learning_rate": 9.851247750605126e-05, + "loss": 2.9178, + "step": 3144 + }, + { + "epoch": 0.21113385456863865, + "grad_norm": 6.805563449859619, + "learning_rate": 9.85098450667023e-05, + "loss": 3.0035, + "step": 3146 + }, + { + "epoch": 0.21126807825240765, + "grad_norm": 6.8029561042785645, + "learning_rate": 9.850721033535854e-05, + "loss": 3.1095, + "step": 3148 + }, + { + "epoch": 0.21140230193617665, + "grad_norm": 4.767754077911377, + "learning_rate": 9.85045733121445e-05, + "loss": 2.8336, + "step": 3150 + }, + { + "epoch": 0.21153652561994563, + "grad_norm": 4.567541599273682, + "learning_rate": 9.850193399718475e-05, + "loss": 2.9405, + "step": 3152 + }, + { + "epoch": 0.21167074930371463, + "grad_norm": 4.145812511444092, + "learning_rate": 9.849929239060398e-05, + "loss": 2.8604, + "step": 3154 + }, + { + "epoch": 0.21180497298748363, + "grad_norm": 4.918109893798828, + "learning_rate": 9.849664849252701e-05, + "loss": 2.807, + "step": 3156 + }, + { + "epoch": 0.21193919667125263, + "grad_norm": 4.160008907318115, + "learning_rate": 9.849400230307877e-05, + "loss": 2.9697, + "step": 3158 + }, + { + "epoch": 0.21207342035502164, + "grad_norm": 4.261327743530273, + "learning_rate": 9.849135382238428e-05, + "loss": 2.8545, + "step": 3160 + }, + { + "epoch": 0.21220764403879064, + "grad_norm": 4.802911281585693, + "learning_rate": 9.848870305056867e-05, + "loss": 2.9989, + "step": 3162 + }, + { + "epoch": 0.21234186772255964, + "grad_norm": 4.546838283538818, + "learning_rate": 9.848604998775721e-05, + "loss": 3.1321, + "step": 3164 + }, + { + "epoch": 0.21247609140632864, + "grad_norm": 4.2616801261901855, + "learning_rate": 9.848339463407521e-05, + "loss": 2.4788, + "step": 3166 + }, + { + "epoch": 0.21261031509009765, + "grad_norm": 5.034578800201416, + "learning_rate": 9.848073698964817e-05, + "loss": 2.7673, + "step": 3168 + }, + { + "epoch": 0.21274453877386665, + "grad_norm": 4.598456859588623, + "learning_rate": 9.847807705460163e-05, + "loss": 3.0042, + "step": 3170 + }, + { + "epoch": 0.21287876245763565, + "grad_norm": 4.923595905303955, + "learning_rate": 9.847541482906129e-05, + "loss": 3.0857, + "step": 3172 + }, + { + "epoch": 0.21301298614140465, + "grad_norm": 6.1189866065979, + "learning_rate": 9.84727503131529e-05, + "loss": 3.0402, + "step": 3174 + }, + { + "epoch": 0.21314720982517366, + "grad_norm": 5.022345066070557, + "learning_rate": 9.847008350700239e-05, + "loss": 3.0227, + "step": 3176 + }, + { + "epoch": 0.21328143350894266, + "grad_norm": 4.516480445861816, + "learning_rate": 9.846741441073574e-05, + "loss": 3.0322, + "step": 3178 + }, + { + "epoch": 0.21341565719271166, + "grad_norm": 4.745030879974365, + "learning_rate": 9.846474302447907e-05, + "loss": 3.1041, + "step": 3180 + }, + { + "epoch": 0.21354988087648066, + "grad_norm": 8.465485572814941, + "learning_rate": 9.846206934835859e-05, + "loss": 2.8489, + "step": 3182 + }, + { + "epoch": 0.21368410456024967, + "grad_norm": 5.875777244567871, + "learning_rate": 9.845939338250063e-05, + "loss": 2.836, + "step": 3184 + }, + { + "epoch": 0.21381832824401867, + "grad_norm": 4.333359718322754, + "learning_rate": 9.845671512703163e-05, + "loss": 2.948, + "step": 3186 + }, + { + "epoch": 0.21395255192778767, + "grad_norm": 4.516866683959961, + "learning_rate": 9.845403458207813e-05, + "loss": 3.061, + "step": 3188 + }, + { + "epoch": 0.21408677561155665, + "grad_norm": 8.358100891113281, + "learning_rate": 9.845135174776676e-05, + "loss": 3.1022, + "step": 3190 + }, + { + "epoch": 0.21422099929532565, + "grad_norm": 6.638658046722412, + "learning_rate": 9.844866662422432e-05, + "loss": 3.0056, + "step": 3192 + }, + { + "epoch": 0.21435522297909465, + "grad_norm": 4.624916076660156, + "learning_rate": 9.844597921157764e-05, + "loss": 2.7997, + "step": 3194 + }, + { + "epoch": 0.21448944666286365, + "grad_norm": 4.655485153198242, + "learning_rate": 9.84432895099537e-05, + "loss": 3.309, + "step": 3196 + }, + { + "epoch": 0.21462367034663266, + "grad_norm": 4.707542419433594, + "learning_rate": 9.844059751947959e-05, + "loss": 3.1159, + "step": 3198 + }, + { + "epoch": 0.21475789403040166, + "grad_norm": 6.209988117218018, + "learning_rate": 9.84379032402825e-05, + "loss": 2.743, + "step": 3200 + }, + { + "epoch": 0.21489211771417066, + "grad_norm": 4.734823703765869, + "learning_rate": 9.843520667248974e-05, + "loss": 2.7783, + "step": 3202 + }, + { + "epoch": 0.21502634139793966, + "grad_norm": 4.565969944000244, + "learning_rate": 9.84325078162287e-05, + "loss": 2.8353, + "step": 3204 + }, + { + "epoch": 0.21516056508170867, + "grad_norm": 4.411762714385986, + "learning_rate": 9.84298066716269e-05, + "loss": 2.8188, + "step": 3206 + }, + { + "epoch": 0.21529478876547767, + "grad_norm": 4.151341915130615, + "learning_rate": 9.842710323881199e-05, + "loss": 3.0261, + "step": 3208 + }, + { + "epoch": 0.21542901244924667, + "grad_norm": 3.9552001953125, + "learning_rate": 9.842439751791169e-05, + "loss": 2.8251, + "step": 3210 + }, + { + "epoch": 0.21556323613301567, + "grad_norm": 25.90407371520996, + "learning_rate": 9.842168950905379e-05, + "loss": 2.7112, + "step": 3212 + }, + { + "epoch": 0.21569745981678468, + "grad_norm": 4.168037414550781, + "learning_rate": 9.84189792123663e-05, + "loss": 2.7934, + "step": 3214 + }, + { + "epoch": 0.21583168350055368, + "grad_norm": 4.90933084487915, + "learning_rate": 9.841626662797725e-05, + "loss": 2.9671, + "step": 3216 + }, + { + "epoch": 0.21596590718432268, + "grad_norm": 7.482623100280762, + "learning_rate": 9.841355175601481e-05, + "loss": 3.0095, + "step": 3218 + }, + { + "epoch": 0.21610013086809168, + "grad_norm": 6.469265937805176, + "learning_rate": 9.841083459660725e-05, + "loss": 3.0178, + "step": 3220 + }, + { + "epoch": 0.21623435455186069, + "grad_norm": 4.870399475097656, + "learning_rate": 9.840811514988294e-05, + "loss": 3.0245, + "step": 3222 + }, + { + "epoch": 0.2163685782356297, + "grad_norm": 4.6951212882995605, + "learning_rate": 9.840539341597039e-05, + "loss": 2.8885, + "step": 3224 + }, + { + "epoch": 0.2165028019193987, + "grad_norm": 4.333229064941406, + "learning_rate": 9.840266939499818e-05, + "loss": 2.9237, + "step": 3226 + }, + { + "epoch": 0.21663702560316767, + "grad_norm": 8.162107467651367, + "learning_rate": 9.839994308709504e-05, + "loss": 2.7184, + "step": 3228 + }, + { + "epoch": 0.21677124928693667, + "grad_norm": 5.619515419006348, + "learning_rate": 9.839721449238974e-05, + "loss": 2.6794, + "step": 3230 + }, + { + "epoch": 0.21690547297070567, + "grad_norm": 4.345789909362793, + "learning_rate": 9.839448361101124e-05, + "loss": 2.8856, + "step": 3232 + }, + { + "epoch": 0.21703969665447467, + "grad_norm": 4.711472511291504, + "learning_rate": 9.839175044308854e-05, + "loss": 2.9071, + "step": 3234 + }, + { + "epoch": 0.21717392033824368, + "grad_norm": 4.306356430053711, + "learning_rate": 9.838901498875081e-05, + "loss": 2.808, + "step": 3236 + }, + { + "epoch": 0.21730814402201268, + "grad_norm": 4.499025344848633, + "learning_rate": 9.838627724812725e-05, + "loss": 2.7973, + "step": 3238 + }, + { + "epoch": 0.21744236770578168, + "grad_norm": 9.268061637878418, + "learning_rate": 9.838353722134725e-05, + "loss": 3.019, + "step": 3240 + }, + { + "epoch": 0.21757659138955068, + "grad_norm": 4.392327785491943, + "learning_rate": 9.838079490854027e-05, + "loss": 2.8354, + "step": 3242 + }, + { + "epoch": 0.21771081507331969, + "grad_norm": 5.304717063903809, + "learning_rate": 9.837805030983585e-05, + "loss": 3.089, + "step": 3244 + }, + { + "epoch": 0.2178450387570887, + "grad_norm": 7.626925945281982, + "learning_rate": 9.837530342536368e-05, + "loss": 3.0413, + "step": 3246 + }, + { + "epoch": 0.2179792624408577, + "grad_norm": 4.418500900268555, + "learning_rate": 9.837255425525356e-05, + "loss": 2.8767, + "step": 3248 + }, + { + "epoch": 0.2181134861246267, + "grad_norm": 5.249969482421875, + "learning_rate": 9.836980279963537e-05, + "loss": 3.0477, + "step": 3250 + }, + { + "epoch": 0.2182477098083957, + "grad_norm": 4.925004959106445, + "learning_rate": 9.836704905863911e-05, + "loss": 2.8013, + "step": 3252 + }, + { + "epoch": 0.2183819334921647, + "grad_norm": 95.86943817138672, + "learning_rate": 9.836429303239491e-05, + "loss": 3.0286, + "step": 3254 + }, + { + "epoch": 0.2185161571759337, + "grad_norm": 6.368263244628906, + "learning_rate": 9.836153472103296e-05, + "loss": 3.0907, + "step": 3256 + }, + { + "epoch": 0.2186503808597027, + "grad_norm": 6.23747444152832, + "learning_rate": 9.835877412468357e-05, + "loss": 3.0702, + "step": 3258 + }, + { + "epoch": 0.2187846045434717, + "grad_norm": 4.674152374267578, + "learning_rate": 9.835601124347722e-05, + "loss": 2.7118, + "step": 3260 + }, + { + "epoch": 0.2189188282272407, + "grad_norm": 4.093385219573975, + "learning_rate": 9.835324607754442e-05, + "loss": 2.762, + "step": 3262 + }, + { + "epoch": 0.2190530519110097, + "grad_norm": 5.249168395996094, + "learning_rate": 9.835047862701583e-05, + "loss": 3.2016, + "step": 3264 + }, + { + "epoch": 0.21918727559477869, + "grad_norm": 4.444293975830078, + "learning_rate": 9.834770889202219e-05, + "loss": 3.0379, + "step": 3266 + }, + { + "epoch": 0.2193214992785477, + "grad_norm": 5.560075759887695, + "learning_rate": 9.834493687269438e-05, + "loss": 2.8766, + "step": 3268 + }, + { + "epoch": 0.2194557229623167, + "grad_norm": 5.051884651184082, + "learning_rate": 9.834216256916337e-05, + "loss": 2.9895, + "step": 3270 + }, + { + "epoch": 0.2195899466460857, + "grad_norm": 5.459766387939453, + "learning_rate": 9.833938598156025e-05, + "loss": 2.9902, + "step": 3272 + }, + { + "epoch": 0.2197241703298547, + "grad_norm": 4.386317253112793, + "learning_rate": 9.833660711001619e-05, + "loss": 3.0621, + "step": 3274 + }, + { + "epoch": 0.2198583940136237, + "grad_norm": 5.564157962799072, + "learning_rate": 9.833382595466249e-05, + "loss": 2.6571, + "step": 3276 + }, + { + "epoch": 0.2199926176973927, + "grad_norm": 5.5588297843933105, + "learning_rate": 9.833104251563056e-05, + "loss": 2.8146, + "step": 3278 + }, + { + "epoch": 0.2201268413811617, + "grad_norm": 4.767416954040527, + "learning_rate": 9.832825679305191e-05, + "loss": 2.9313, + "step": 3280 + }, + { + "epoch": 0.2202610650649307, + "grad_norm": 5.7216477394104, + "learning_rate": 9.832546878705817e-05, + "loss": 2.8147, + "step": 3282 + }, + { + "epoch": 0.2203952887486997, + "grad_norm": 4.740909576416016, + "learning_rate": 9.832267849778106e-05, + "loss": 2.9829, + "step": 3284 + }, + { + "epoch": 0.2205295124324687, + "grad_norm": 5.116589546203613, + "learning_rate": 9.83198859253524e-05, + "loss": 3.0341, + "step": 3286 + }, + { + "epoch": 0.2206637361162377, + "grad_norm": 4.384726524353027, + "learning_rate": 9.831709106990414e-05, + "loss": 2.8806, + "step": 3288 + }, + { + "epoch": 0.22079795980000672, + "grad_norm": 5.068042755126953, + "learning_rate": 9.831429393156834e-05, + "loss": 2.7781, + "step": 3290 + }, + { + "epoch": 0.22093218348377572, + "grad_norm": 5.084854602813721, + "learning_rate": 9.831149451047718e-05, + "loss": 3.1386, + "step": 3292 + }, + { + "epoch": 0.22106640716754472, + "grad_norm": 4.81299352645874, + "learning_rate": 9.83086928067629e-05, + "loss": 2.8578, + "step": 3294 + }, + { + "epoch": 0.22120063085131372, + "grad_norm": 5.942620754241943, + "learning_rate": 9.830588882055786e-05, + "loss": 2.8865, + "step": 3296 + }, + { + "epoch": 0.22133485453508273, + "grad_norm": 6.289276123046875, + "learning_rate": 9.830308255199457e-05, + "loss": 2.9529, + "step": 3298 + }, + { + "epoch": 0.22146907821885173, + "grad_norm": 4.506661891937256, + "learning_rate": 9.830027400120561e-05, + "loss": 2.9582, + "step": 3300 + }, + { + "epoch": 0.22160330190262073, + "grad_norm": 4.655487537384033, + "learning_rate": 9.82974631683237e-05, + "loss": 3.0006, + "step": 3302 + }, + { + "epoch": 0.2217375255863897, + "grad_norm": 5.373042106628418, + "learning_rate": 9.829465005348162e-05, + "loss": 2.9525, + "step": 3304 + }, + { + "epoch": 0.2218717492701587, + "grad_norm": 3.8952243328094482, + "learning_rate": 9.829183465681229e-05, + "loss": 2.7836, + "step": 3306 + }, + { + "epoch": 0.2220059729539277, + "grad_norm": 6.103254318237305, + "learning_rate": 9.828901697844872e-05, + "loss": 2.6345, + "step": 3308 + }, + { + "epoch": 0.2221401966376967, + "grad_norm": 5.9213972091674805, + "learning_rate": 9.828619701852407e-05, + "loss": 3.2005, + "step": 3310 + }, + { + "epoch": 0.22227442032146572, + "grad_norm": 5.214477062225342, + "learning_rate": 9.828337477717157e-05, + "loss": 2.6755, + "step": 3312 + }, + { + "epoch": 0.22240864400523472, + "grad_norm": 9.626215934753418, + "learning_rate": 9.828055025452454e-05, + "loss": 3.3089, + "step": 3314 + }, + { + "epoch": 0.22254286768900372, + "grad_norm": 6.347432613372803, + "learning_rate": 9.827772345071647e-05, + "loss": 2.8049, + "step": 3316 + }, + { + "epoch": 0.22267709137277272, + "grad_norm": 5.2829132080078125, + "learning_rate": 9.827489436588088e-05, + "loss": 3.0039, + "step": 3318 + }, + { + "epoch": 0.22281131505654173, + "grad_norm": 5.1854472160339355, + "learning_rate": 9.827206300015147e-05, + "loss": 3.1362, + "step": 3320 + }, + { + "epoch": 0.22294553874031073, + "grad_norm": 5.282427787780762, + "learning_rate": 9.8269229353662e-05, + "loss": 3.1745, + "step": 3322 + }, + { + "epoch": 0.22307976242407973, + "grad_norm": 4.748351573944092, + "learning_rate": 9.826639342654636e-05, + "loss": 2.9188, + "step": 3324 + }, + { + "epoch": 0.22321398610784873, + "grad_norm": 5.004763126373291, + "learning_rate": 9.826355521893855e-05, + "loss": 2.8189, + "step": 3326 + }, + { + "epoch": 0.22334820979161774, + "grad_norm": 5.269996166229248, + "learning_rate": 9.826071473097265e-05, + "loss": 3.1897, + "step": 3328 + }, + { + "epoch": 0.22348243347538674, + "grad_norm": 5.000327110290527, + "learning_rate": 9.82578719627829e-05, + "loss": 3.0209, + "step": 3330 + }, + { + "epoch": 0.22361665715915574, + "grad_norm": 4.801200866699219, + "learning_rate": 9.825502691450357e-05, + "loss": 2.746, + "step": 3332 + }, + { + "epoch": 0.22375088084292474, + "grad_norm": 5.693075180053711, + "learning_rate": 9.825217958626913e-05, + "loss": 2.9851, + "step": 3334 + }, + { + "epoch": 0.22388510452669375, + "grad_norm": 27.153553009033203, + "learning_rate": 9.824932997821408e-05, + "loss": 2.766, + "step": 3336 + }, + { + "epoch": 0.22401932821046275, + "grad_norm": 4.795072078704834, + "learning_rate": 9.824647809047306e-05, + "loss": 2.9686, + "step": 3338 + }, + { + "epoch": 0.22415355189423175, + "grad_norm": 5.893414497375488, + "learning_rate": 9.824362392318082e-05, + "loss": 2.6333, + "step": 3340 + }, + { + "epoch": 0.22428777557800073, + "grad_norm": 4.632457733154297, + "learning_rate": 9.824076747647223e-05, + "loss": 2.8212, + "step": 3342 + }, + { + "epoch": 0.22442199926176973, + "grad_norm": 5.015408992767334, + "learning_rate": 9.823790875048224e-05, + "loss": 3.1354, + "step": 3344 + }, + { + "epoch": 0.22455622294553873, + "grad_norm": 4.613503456115723, + "learning_rate": 9.823504774534591e-05, + "loss": 2.9801, + "step": 3346 + }, + { + "epoch": 0.22469044662930773, + "grad_norm": 5.118015766143799, + "learning_rate": 9.823218446119842e-05, + "loss": 2.7213, + "step": 3348 + }, + { + "epoch": 0.22482467031307674, + "grad_norm": 8.232337951660156, + "learning_rate": 9.822931889817506e-05, + "loss": 3.0829, + "step": 3350 + }, + { + "epoch": 0.22495889399684574, + "grad_norm": 4.7350897789001465, + "learning_rate": 9.822645105641123e-05, + "loss": 2.8886, + "step": 3352 + }, + { + "epoch": 0.22509311768061474, + "grad_norm": 4.655916213989258, + "learning_rate": 9.822358093604242e-05, + "loss": 2.9391, + "step": 3354 + }, + { + "epoch": 0.22522734136438374, + "grad_norm": 7.8251566886901855, + "learning_rate": 9.822070853720421e-05, + "loss": 2.7644, + "step": 3356 + }, + { + "epoch": 0.22536156504815275, + "grad_norm": 4.598936080932617, + "learning_rate": 9.821783386003239e-05, + "loss": 2.7775, + "step": 3358 + }, + { + "epoch": 0.22549578873192175, + "grad_norm": 9.644124984741211, + "learning_rate": 9.821495690466272e-05, + "loss": 2.9779, + "step": 3360 + }, + { + "epoch": 0.22563001241569075, + "grad_norm": 5.089818000793457, + "learning_rate": 9.821207767123113e-05, + "loss": 3.2477, + "step": 3362 + }, + { + "epoch": 0.22576423609945975, + "grad_norm": 5.280829906463623, + "learning_rate": 9.820919615987368e-05, + "loss": 2.7776, + "step": 3364 + }, + { + "epoch": 0.22589845978322876, + "grad_norm": 4.773318290710449, + "learning_rate": 9.820631237072652e-05, + "loss": 3.315, + "step": 3366 + }, + { + "epoch": 0.22603268346699776, + "grad_norm": 4.601736545562744, + "learning_rate": 9.82034263039259e-05, + "loss": 2.6916, + "step": 3368 + }, + { + "epoch": 0.22616690715076676, + "grad_norm": 4.649728775024414, + "learning_rate": 9.820053795960815e-05, + "loss": 3.1986, + "step": 3370 + }, + { + "epoch": 0.22630113083453576, + "grad_norm": 5.045429229736328, + "learning_rate": 9.819764733790979e-05, + "loss": 2.766, + "step": 3372 + }, + { + "epoch": 0.22643535451830477, + "grad_norm": 4.7845778465271, + "learning_rate": 9.819475443896736e-05, + "loss": 2.7421, + "step": 3374 + }, + { + "epoch": 0.22656957820207377, + "grad_norm": 4.862174034118652, + "learning_rate": 9.819185926291754e-05, + "loss": 2.9889, + "step": 3376 + }, + { + "epoch": 0.22670380188584277, + "grad_norm": 4.2407073974609375, + "learning_rate": 9.818896180989716e-05, + "loss": 2.8838, + "step": 3378 + }, + { + "epoch": 0.22683802556961175, + "grad_norm": 4.588068962097168, + "learning_rate": 9.818606208004309e-05, + "loss": 2.8999, + "step": 3380 + }, + { + "epoch": 0.22697224925338075, + "grad_norm": 5.135111331939697, + "learning_rate": 9.818316007349232e-05, + "loss": 3.2078, + "step": 3382 + }, + { + "epoch": 0.22710647293714975, + "grad_norm": 4.265831470489502, + "learning_rate": 9.8180255790382e-05, + "loss": 2.8628, + "step": 3384 + }, + { + "epoch": 0.22724069662091875, + "grad_norm": 4.77894926071167, + "learning_rate": 9.817734923084934e-05, + "loss": 2.8696, + "step": 3386 + }, + { + "epoch": 0.22737492030468776, + "grad_norm": 7.407833576202393, + "learning_rate": 9.817444039503165e-05, + "loss": 2.879, + "step": 3388 + }, + { + "epoch": 0.22750914398845676, + "grad_norm": 4.333123683929443, + "learning_rate": 9.817152928306638e-05, + "loss": 2.8917, + "step": 3390 + }, + { + "epoch": 0.22764336767222576, + "grad_norm": 6.175624370574951, + "learning_rate": 9.81686158950911e-05, + "loss": 2.8375, + "step": 3392 + }, + { + "epoch": 0.22777759135599476, + "grad_norm": 5.103316783905029, + "learning_rate": 9.816570023124342e-05, + "loss": 3.3558, + "step": 3394 + }, + { + "epoch": 0.22791181503976377, + "grad_norm": 6.074088096618652, + "learning_rate": 9.816278229166114e-05, + "loss": 3.0318, + "step": 3396 + }, + { + "epoch": 0.22804603872353277, + "grad_norm": 4.2534403800964355, + "learning_rate": 9.815986207648208e-05, + "loss": 3.1109, + "step": 3398 + }, + { + "epoch": 0.22818026240730177, + "grad_norm": 6.565694808959961, + "learning_rate": 9.815693958584424e-05, + "loss": 2.8822, + "step": 3400 + }, + { + "epoch": 0.22831448609107077, + "grad_norm": 4.798267364501953, + "learning_rate": 9.815401481988571e-05, + "loss": 2.921, + "step": 3402 + }, + { + "epoch": 0.22844870977483978, + "grad_norm": 4.792600631713867, + "learning_rate": 9.815108777874467e-05, + "loss": 3.0415, + "step": 3404 + }, + { + "epoch": 0.22858293345860878, + "grad_norm": 5.162966728210449, + "learning_rate": 9.814815846255942e-05, + "loss": 3.2873, + "step": 3406 + }, + { + "epoch": 0.22871715714237778, + "grad_norm": 4.5035929679870605, + "learning_rate": 9.814522687146837e-05, + "loss": 2.671, + "step": 3408 + }, + { + "epoch": 0.22885138082614678, + "grad_norm": 5.536206245422363, + "learning_rate": 9.814229300560999e-05, + "loss": 2.9134, + "step": 3410 + }, + { + "epoch": 0.22898560450991579, + "grad_norm": 4.492737770080566, + "learning_rate": 9.813935686512297e-05, + "loss": 2.884, + "step": 3412 + }, + { + "epoch": 0.2291198281936848, + "grad_norm": 6.110756874084473, + "learning_rate": 9.813641845014599e-05, + "loss": 3.2012, + "step": 3414 + }, + { + "epoch": 0.2292540518774538, + "grad_norm": 4.220027446746826, + "learning_rate": 9.813347776081789e-05, + "loss": 2.7361, + "step": 3416 + }, + { + "epoch": 0.22938827556122277, + "grad_norm": 5.251460075378418, + "learning_rate": 9.813053479727761e-05, + "loss": 2.8165, + "step": 3418 + }, + { + "epoch": 0.22952249924499177, + "grad_norm": 4.458982467651367, + "learning_rate": 9.812758955966421e-05, + "loss": 2.7622, + "step": 3420 + }, + { + "epoch": 0.22965672292876077, + "grad_norm": 5.481340408325195, + "learning_rate": 9.812464204811686e-05, + "loss": 2.9603, + "step": 3422 + }, + { + "epoch": 0.22979094661252977, + "grad_norm": 4.935278415679932, + "learning_rate": 9.812169226277479e-05, + "loss": 2.8247, + "step": 3424 + }, + { + "epoch": 0.22992517029629878, + "grad_norm": 5.654877185821533, + "learning_rate": 9.811874020377738e-05, + "loss": 3.0099, + "step": 3426 + }, + { + "epoch": 0.23005939398006778, + "grad_norm": 5.353259563446045, + "learning_rate": 9.811578587126413e-05, + "loss": 2.79, + "step": 3428 + }, + { + "epoch": 0.23019361766383678, + "grad_norm": 5.858390808105469, + "learning_rate": 9.81128292653746e-05, + "loss": 2.7487, + "step": 3430 + }, + { + "epoch": 0.23032784134760578, + "grad_norm": 5.0995259284973145, + "learning_rate": 9.810987038624851e-05, + "loss": 3.0735, + "step": 3432 + }, + { + "epoch": 0.23046206503137479, + "grad_norm": 8.109825134277344, + "learning_rate": 9.810690923402566e-05, + "loss": 2.8517, + "step": 3434 + }, + { + "epoch": 0.2305962887151438, + "grad_norm": 5.124128341674805, + "learning_rate": 9.810394580884592e-05, + "loss": 2.9086, + "step": 3436 + }, + { + "epoch": 0.2307305123989128, + "grad_norm": 4.235292911529541, + "learning_rate": 9.810098011084935e-05, + "loss": 3.0547, + "step": 3438 + }, + { + "epoch": 0.2308647360826818, + "grad_norm": 4.973613739013672, + "learning_rate": 9.809801214017604e-05, + "loss": 2.8458, + "step": 3440 + }, + { + "epoch": 0.2309989597664508, + "grad_norm": 4.519331455230713, + "learning_rate": 9.809504189696626e-05, + "loss": 2.8676, + "step": 3442 + }, + { + "epoch": 0.2311331834502198, + "grad_norm": 4.856176853179932, + "learning_rate": 9.809206938136031e-05, + "loss": 2.8747, + "step": 3444 + }, + { + "epoch": 0.2312674071339888, + "grad_norm": 5.011623382568359, + "learning_rate": 9.808909459349865e-05, + "loss": 3.0321, + "step": 3446 + }, + { + "epoch": 0.2314016308177578, + "grad_norm": 4.687420845031738, + "learning_rate": 9.808611753352184e-05, + "loss": 2.9261, + "step": 3448 + }, + { + "epoch": 0.2315358545015268, + "grad_norm": 4.80540132522583, + "learning_rate": 9.808313820157052e-05, + "loss": 2.8343, + "step": 3450 + }, + { + "epoch": 0.2316700781852958, + "grad_norm": 4.12592077255249, + "learning_rate": 9.808015659778549e-05, + "loss": 2.9922, + "step": 3452 + }, + { + "epoch": 0.23180430186906478, + "grad_norm": 4.31397008895874, + "learning_rate": 9.80771727223076e-05, + "loss": 2.7783, + "step": 3454 + }, + { + "epoch": 0.23193852555283379, + "grad_norm": 4.0939459800720215, + "learning_rate": 9.807418657527782e-05, + "loss": 2.9206, + "step": 3456 + }, + { + "epoch": 0.2320727492366028, + "grad_norm": 4.101994037628174, + "learning_rate": 9.807119815683728e-05, + "loss": 2.7761, + "step": 3458 + }, + { + "epoch": 0.2322069729203718, + "grad_norm": 5.084845542907715, + "learning_rate": 9.806820746712716e-05, + "loss": 2.9254, + "step": 3460 + }, + { + "epoch": 0.2323411966041408, + "grad_norm": 5.000672817230225, + "learning_rate": 9.806521450628875e-05, + "loss": 3.0981, + "step": 3462 + }, + { + "epoch": 0.2324754202879098, + "grad_norm": 4.513111591339111, + "learning_rate": 9.806221927446347e-05, + "loss": 3.1585, + "step": 3464 + }, + { + "epoch": 0.2326096439716788, + "grad_norm": 19.119462966918945, + "learning_rate": 9.805922177179283e-05, + "loss": 3.1355, + "step": 3466 + }, + { + "epoch": 0.2327438676554478, + "grad_norm": 4.71213436126709, + "learning_rate": 9.805622199841848e-05, + "loss": 2.8161, + "step": 3468 + }, + { + "epoch": 0.2328780913392168, + "grad_norm": 4.117844581604004, + "learning_rate": 9.805321995448214e-05, + "loss": 2.8879, + "step": 3470 + }, + { + "epoch": 0.2330123150229858, + "grad_norm": 5.2103962898254395, + "learning_rate": 9.805021564012564e-05, + "loss": 3.051, + "step": 3472 + }, + { + "epoch": 0.2331465387067548, + "grad_norm": 5.274440765380859, + "learning_rate": 9.804720905549094e-05, + "loss": 2.8093, + "step": 3474 + }, + { + "epoch": 0.2332807623905238, + "grad_norm": 5.404548168182373, + "learning_rate": 9.80442002007201e-05, + "loss": 2.7168, + "step": 3476 + }, + { + "epoch": 0.2334149860742928, + "grad_norm": 4.877373695373535, + "learning_rate": 9.804118907595527e-05, + "loss": 2.6926, + "step": 3478 + }, + { + "epoch": 0.23354920975806182, + "grad_norm": 5.964360237121582, + "learning_rate": 9.803817568133872e-05, + "loss": 2.8838, + "step": 3480 + }, + { + "epoch": 0.23368343344183082, + "grad_norm": 5.101726055145264, + "learning_rate": 9.803516001701286e-05, + "loss": 3.1395, + "step": 3482 + }, + { + "epoch": 0.23381765712559982, + "grad_norm": 4.697463512420654, + "learning_rate": 9.80321420831201e-05, + "loss": 2.8899, + "step": 3484 + }, + { + "epoch": 0.23395188080936882, + "grad_norm": 4.481101036071777, + "learning_rate": 9.80291218798031e-05, + "loss": 2.9105, + "step": 3486 + }, + { + "epoch": 0.23408610449313783, + "grad_norm": 4.796844482421875, + "learning_rate": 9.802609940720455e-05, + "loss": 3.0705, + "step": 3488 + }, + { + "epoch": 0.23422032817690683, + "grad_norm": 4.4849419593811035, + "learning_rate": 9.802307466546723e-05, + "loss": 3.0303, + "step": 3490 + }, + { + "epoch": 0.2343545518606758, + "grad_norm": 6.117046356201172, + "learning_rate": 9.802004765473407e-05, + "loss": 2.9064, + "step": 3492 + }, + { + "epoch": 0.2344887755444448, + "grad_norm": 5.3497209548950195, + "learning_rate": 9.801701837514808e-05, + "loss": 2.7396, + "step": 3494 + }, + { + "epoch": 0.2346229992282138, + "grad_norm": 5.660746097564697, + "learning_rate": 9.801398682685238e-05, + "loss": 2.9593, + "step": 3496 + }, + { + "epoch": 0.2347572229119828, + "grad_norm": 4.422527313232422, + "learning_rate": 9.801095300999024e-05, + "loss": 2.9238, + "step": 3498 + }, + { + "epoch": 0.2348914465957518, + "grad_norm": 3.960125207901001, + "learning_rate": 9.800791692470497e-05, + "loss": 2.6124, + "step": 3500 + }, + { + "epoch": 0.23502567027952082, + "grad_norm": 4.9334845542907715, + "learning_rate": 9.800487857114004e-05, + "loss": 2.8895, + "step": 3502 + }, + { + "epoch": 0.23515989396328982, + "grad_norm": 4.580432415008545, + "learning_rate": 9.800183794943898e-05, + "loss": 2.9001, + "step": 3504 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 7.876648902893066, + "learning_rate": 9.799879505974548e-05, + "loss": 3.0433, + "step": 3506 + }, + { + "epoch": 0.23542834133082782, + "grad_norm": 6.6552958488464355, + "learning_rate": 9.799574990220328e-05, + "loss": 2.8002, + "step": 3508 + }, + { + "epoch": 0.23556256501459683, + "grad_norm": 4.133473873138428, + "learning_rate": 9.79927024769563e-05, + "loss": 2.5329, + "step": 3510 + }, + { + "epoch": 0.23569678869836583, + "grad_norm": 5.062989234924316, + "learning_rate": 9.798965278414849e-05, + "loss": 2.9299, + "step": 3512 + }, + { + "epoch": 0.23583101238213483, + "grad_norm": 4.45814847946167, + "learning_rate": 9.798660082392396e-05, + "loss": 2.7678, + "step": 3514 + }, + { + "epoch": 0.23596523606590383, + "grad_norm": 4.339692115783691, + "learning_rate": 9.798354659642691e-05, + "loss": 3.0466, + "step": 3516 + }, + { + "epoch": 0.23609945974967284, + "grad_norm": 5.498777866363525, + "learning_rate": 9.798049010180161e-05, + "loss": 2.9428, + "step": 3518 + }, + { + "epoch": 0.23623368343344184, + "grad_norm": 5.010158061981201, + "learning_rate": 9.797743134019253e-05, + "loss": 3.2064, + "step": 3520 + }, + { + "epoch": 0.23636790711721084, + "grad_norm": 4.360749244689941, + "learning_rate": 9.797437031174414e-05, + "loss": 2.9555, + "step": 3522 + }, + { + "epoch": 0.23650213080097984, + "grad_norm": 4.602811813354492, + "learning_rate": 9.797130701660111e-05, + "loss": 2.9388, + "step": 3524 + }, + { + "epoch": 0.23663635448474885, + "grad_norm": 5.362203121185303, + "learning_rate": 9.796824145490815e-05, + "loss": 2.8615, + "step": 3526 + }, + { + "epoch": 0.23677057816851785, + "grad_norm": 4.440452575683594, + "learning_rate": 9.79651736268101e-05, + "loss": 2.8202, + "step": 3528 + }, + { + "epoch": 0.23690480185228682, + "grad_norm": 4.728034019470215, + "learning_rate": 9.796210353245192e-05, + "loss": 2.864, + "step": 3530 + }, + { + "epoch": 0.23703902553605583, + "grad_norm": 4.578831672668457, + "learning_rate": 9.795903117197867e-05, + "loss": 2.8794, + "step": 3532 + }, + { + "epoch": 0.23717324921982483, + "grad_norm": 5.103178024291992, + "learning_rate": 9.795595654553548e-05, + "loss": 2.824, + "step": 3534 + }, + { + "epoch": 0.23730747290359383, + "grad_norm": 4.332368850708008, + "learning_rate": 9.795287965326767e-05, + "loss": 2.9129, + "step": 3536 + }, + { + "epoch": 0.23744169658736283, + "grad_norm": 4.664647579193115, + "learning_rate": 9.794980049532058e-05, + "loss": 2.8136, + "step": 3538 + }, + { + "epoch": 0.23757592027113184, + "grad_norm": 5.427055358886719, + "learning_rate": 9.79467190718397e-05, + "loss": 3.1247, + "step": 3540 + }, + { + "epoch": 0.23771014395490084, + "grad_norm": 4.3742804527282715, + "learning_rate": 9.794363538297065e-05, + "loss": 3.0551, + "step": 3542 + }, + { + "epoch": 0.23784436763866984, + "grad_norm": 4.9423956871032715, + "learning_rate": 9.794054942885909e-05, + "loss": 2.9589, + "step": 3544 + }, + { + "epoch": 0.23797859132243884, + "grad_norm": 4.380127906799316, + "learning_rate": 9.793746120965083e-05, + "loss": 2.8619, + "step": 3546 + }, + { + "epoch": 0.23811281500620785, + "grad_norm": 5.554233074188232, + "learning_rate": 9.793437072549181e-05, + "loss": 2.6947, + "step": 3548 + }, + { + "epoch": 0.23824703868997685, + "grad_norm": 4.242538928985596, + "learning_rate": 9.793127797652801e-05, + "loss": 2.7202, + "step": 3550 + }, + { + "epoch": 0.23838126237374585, + "grad_norm": 4.816305637359619, + "learning_rate": 9.79281829629056e-05, + "loss": 3.1049, + "step": 3552 + }, + { + "epoch": 0.23851548605751485, + "grad_norm": 5.510799407958984, + "learning_rate": 9.792508568477078e-05, + "loss": 2.8664, + "step": 3554 + }, + { + "epoch": 0.23864970974128386, + "grad_norm": 5.219315052032471, + "learning_rate": 9.792198614226992e-05, + "loss": 2.8473, + "step": 3556 + }, + { + "epoch": 0.23878393342505286, + "grad_norm": 5.73204231262207, + "learning_rate": 9.791888433554943e-05, + "loss": 2.8926, + "step": 3558 + }, + { + "epoch": 0.23891815710882186, + "grad_norm": 4.188295364379883, + "learning_rate": 9.79157802647559e-05, + "loss": 2.8064, + "step": 3560 + }, + { + "epoch": 0.23905238079259086, + "grad_norm": 5.148331165313721, + "learning_rate": 9.791267393003596e-05, + "loss": 2.9285, + "step": 3562 + }, + { + "epoch": 0.23918660447635987, + "grad_norm": 5.505407810211182, + "learning_rate": 9.79095653315364e-05, + "loss": 3.1796, + "step": 3564 + }, + { + "epoch": 0.23932082816012887, + "grad_norm": 7.9292216300964355, + "learning_rate": 9.790645446940408e-05, + "loss": 2.9113, + "step": 3566 + }, + { + "epoch": 0.23945505184389784, + "grad_norm": 5.547903537750244, + "learning_rate": 9.7903341343786e-05, + "loss": 2.8473, + "step": 3568 + }, + { + "epoch": 0.23958927552766685, + "grad_norm": 4.837677478790283, + "learning_rate": 9.790022595482924e-05, + "loss": 3.0799, + "step": 3570 + }, + { + "epoch": 0.23972349921143585, + "grad_norm": 4.681033611297607, + "learning_rate": 9.789710830268099e-05, + "loss": 2.7969, + "step": 3572 + }, + { + "epoch": 0.23985772289520485, + "grad_norm": 5.2082390785217285, + "learning_rate": 9.789398838748856e-05, + "loss": 2.904, + "step": 3574 + }, + { + "epoch": 0.23999194657897385, + "grad_norm": 4.519204616546631, + "learning_rate": 9.789086620939936e-05, + "loss": 3.1759, + "step": 3576 + }, + { + "epoch": 0.24012617026274286, + "grad_norm": 4.622751712799072, + "learning_rate": 9.78877417685609e-05, + "loss": 3.0018, + "step": 3578 + }, + { + "epoch": 0.24026039394651186, + "grad_norm": 5.534416198730469, + "learning_rate": 9.788461506512081e-05, + "loss": 2.7665, + "step": 3580 + }, + { + "epoch": 0.24039461763028086, + "grad_norm": 6.328052520751953, + "learning_rate": 9.788148609922682e-05, + "loss": 2.9221, + "step": 3582 + }, + { + "epoch": 0.24052884131404986, + "grad_norm": 10.008332252502441, + "learning_rate": 9.787835487102677e-05, + "loss": 3.1349, + "step": 3584 + }, + { + "epoch": 0.24066306499781887, + "grad_norm": 5.157011985778809, + "learning_rate": 9.78752213806686e-05, + "loss": 2.8738, + "step": 3586 + }, + { + "epoch": 0.24079728868158787, + "grad_norm": 13.359930992126465, + "learning_rate": 9.787208562830036e-05, + "loss": 2.6853, + "step": 3588 + }, + { + "epoch": 0.24093151236535687, + "grad_norm": 4.898461818695068, + "learning_rate": 9.786894761407021e-05, + "loss": 2.7027, + "step": 3590 + }, + { + "epoch": 0.24106573604912587, + "grad_norm": 5.66255521774292, + "learning_rate": 9.786580733812643e-05, + "loss": 3.0953, + "step": 3592 + }, + { + "epoch": 0.24119995973289488, + "grad_norm": 4.488663196563721, + "learning_rate": 9.786266480061737e-05, + "loss": 2.8454, + "step": 3594 + }, + { + "epoch": 0.24133418341666388, + "grad_norm": 5.843430995941162, + "learning_rate": 9.78595200016915e-05, + "loss": 2.8533, + "step": 3596 + }, + { + "epoch": 0.24146840710043288, + "grad_norm": 4.478718280792236, + "learning_rate": 9.785637294149743e-05, + "loss": 3.0258, + "step": 3598 + }, + { + "epoch": 0.24160263078420188, + "grad_norm": 5.787358283996582, + "learning_rate": 9.785322362018385e-05, + "loss": 2.9713, + "step": 3600 + }, + { + "epoch": 0.24173685446797089, + "grad_norm": 4.742392539978027, + "learning_rate": 9.785007203789955e-05, + "loss": 2.9978, + "step": 3602 + }, + { + "epoch": 0.2418710781517399, + "grad_norm": 4.89331579208374, + "learning_rate": 9.784691819479343e-05, + "loss": 2.87, + "step": 3604 + }, + { + "epoch": 0.24200530183550886, + "grad_norm": 4.644773960113525, + "learning_rate": 9.784376209101454e-05, + "loss": 3.0617, + "step": 3606 + }, + { + "epoch": 0.24213952551927787, + "grad_norm": 4.111745834350586, + "learning_rate": 9.784060372671195e-05, + "loss": 2.8524, + "step": 3608 + }, + { + "epoch": 0.24227374920304687, + "grad_norm": 5.595155239105225, + "learning_rate": 9.783744310203491e-05, + "loss": 2.8704, + "step": 3610 + }, + { + "epoch": 0.24240797288681587, + "grad_norm": 8.37452507019043, + "learning_rate": 9.783428021713274e-05, + "loss": 2.8235, + "step": 3612 + }, + { + "epoch": 0.24254219657058487, + "grad_norm": 4.796763896942139, + "learning_rate": 9.783111507215491e-05, + "loss": 2.9581, + "step": 3614 + }, + { + "epoch": 0.24267642025435388, + "grad_norm": 5.138809680938721, + "learning_rate": 9.782794766725094e-05, + "loss": 3.1909, + "step": 3616 + }, + { + "epoch": 0.24281064393812288, + "grad_norm": 4.548169136047363, + "learning_rate": 9.78247780025705e-05, + "loss": 2.8526, + "step": 3618 + }, + { + "epoch": 0.24294486762189188, + "grad_norm": 7.975538730621338, + "learning_rate": 9.782160607826334e-05, + "loss": 3.0038, + "step": 3620 + }, + { + "epoch": 0.24307909130566088, + "grad_norm": 4.368995666503906, + "learning_rate": 9.781843189447933e-05, + "loss": 2.8162, + "step": 3622 + }, + { + "epoch": 0.24321331498942989, + "grad_norm": 4.0763115882873535, + "learning_rate": 9.781525545136844e-05, + "loss": 2.8441, + "step": 3624 + }, + { + "epoch": 0.2433475386731989, + "grad_norm": 5.2212677001953125, + "learning_rate": 9.781207674908076e-05, + "loss": 3.0824, + "step": 3626 + }, + { + "epoch": 0.2434817623569679, + "grad_norm": 5.029758930206299, + "learning_rate": 9.780889578776647e-05, + "loss": 2.7206, + "step": 3628 + }, + { + "epoch": 0.2436159860407369, + "grad_norm": 4.820970058441162, + "learning_rate": 9.780571256757587e-05, + "loss": 2.6908, + "step": 3630 + }, + { + "epoch": 0.2437502097245059, + "grad_norm": 4.490484714508057, + "learning_rate": 9.780252708865936e-05, + "loss": 2.7523, + "step": 3632 + }, + { + "epoch": 0.2438844334082749, + "grad_norm": 4.445520401000977, + "learning_rate": 9.779933935116742e-05, + "loss": 2.6361, + "step": 3634 + }, + { + "epoch": 0.2440186570920439, + "grad_norm": 4.620863437652588, + "learning_rate": 9.779614935525073e-05, + "loss": 3.1276, + "step": 3636 + }, + { + "epoch": 0.2441528807758129, + "grad_norm": 12.859939575195312, + "learning_rate": 9.779295710105993e-05, + "loss": 3.1293, + "step": 3638 + }, + { + "epoch": 0.2442871044595819, + "grad_norm": 4.186205863952637, + "learning_rate": 9.77897625887459e-05, + "loss": 2.7418, + "step": 3640 + }, + { + "epoch": 0.2444213281433509, + "grad_norm": 4.115688323974609, + "learning_rate": 9.778656581845958e-05, + "loss": 2.7426, + "step": 3642 + }, + { + "epoch": 0.24455555182711988, + "grad_norm": 4.15122652053833, + "learning_rate": 9.778336679035197e-05, + "loss": 2.7877, + "step": 3644 + }, + { + "epoch": 0.24468977551088889, + "grad_norm": 4.656614303588867, + "learning_rate": 9.778016550457425e-05, + "loss": 2.7754, + "step": 3646 + }, + { + "epoch": 0.2448239991946579, + "grad_norm": 4.11848258972168, + "learning_rate": 9.777696196127766e-05, + "loss": 2.7426, + "step": 3648 + }, + { + "epoch": 0.2449582228784269, + "grad_norm": 4.677850723266602, + "learning_rate": 9.777375616061359e-05, + "loss": 3.0094, + "step": 3650 + }, + { + "epoch": 0.2450924465621959, + "grad_norm": 4.39869499206543, + "learning_rate": 9.777054810273345e-05, + "loss": 2.8813, + "step": 3652 + }, + { + "epoch": 0.2452266702459649, + "grad_norm": 4.820565700531006, + "learning_rate": 9.776733778778888e-05, + "loss": 2.8945, + "step": 3654 + }, + { + "epoch": 0.2453608939297339, + "grad_norm": 4.5989179611206055, + "learning_rate": 9.776412521593152e-05, + "loss": 2.6907, + "step": 3656 + }, + { + "epoch": 0.2454951176135029, + "grad_norm": 4.423440456390381, + "learning_rate": 9.776091038731317e-05, + "loss": 2.8724, + "step": 3658 + }, + { + "epoch": 0.2456293412972719, + "grad_norm": 4.464333534240723, + "learning_rate": 9.775769330208571e-05, + "loss": 2.7016, + "step": 3660 + }, + { + "epoch": 0.2457635649810409, + "grad_norm": 4.1726884841918945, + "learning_rate": 9.775447396040116e-05, + "loss": 2.9203, + "step": 3662 + }, + { + "epoch": 0.2458977886648099, + "grad_norm": 4.4997148513793945, + "learning_rate": 9.775125236241161e-05, + "loss": 2.8846, + "step": 3664 + }, + { + "epoch": 0.2460320123485789, + "grad_norm": 7.820681571960449, + "learning_rate": 9.77480285082693e-05, + "loss": 2.98, + "step": 3666 + }, + { + "epoch": 0.2461662360323479, + "grad_norm": 4.404434680938721, + "learning_rate": 9.774480239812653e-05, + "loss": 2.8448, + "step": 3668 + }, + { + "epoch": 0.24630045971611692, + "grad_norm": 4.326044082641602, + "learning_rate": 9.774157403213573e-05, + "loss": 2.8689, + "step": 3670 + }, + { + "epoch": 0.24643468339988592, + "grad_norm": 4.071447372436523, + "learning_rate": 9.773834341044944e-05, + "loss": 2.9589, + "step": 3672 + }, + { + "epoch": 0.24656890708365492, + "grad_norm": 4.817883014678955, + "learning_rate": 9.77351105332203e-05, + "loss": 2.9119, + "step": 3674 + }, + { + "epoch": 0.24670313076742392, + "grad_norm": 7.239350318908691, + "learning_rate": 9.773187540060105e-05, + "loss": 3.2546, + "step": 3676 + }, + { + "epoch": 0.24683735445119293, + "grad_norm": 5.1940178871154785, + "learning_rate": 9.772863801274455e-05, + "loss": 3.0253, + "step": 3678 + }, + { + "epoch": 0.24697157813496193, + "grad_norm": 4.614957809448242, + "learning_rate": 9.772539836980376e-05, + "loss": 3.1114, + "step": 3680 + }, + { + "epoch": 0.2471058018187309, + "grad_norm": 3.920725107192993, + "learning_rate": 9.772215647193174e-05, + "loss": 2.5826, + "step": 3682 + }, + { + "epoch": 0.2472400255024999, + "grad_norm": 4.401264190673828, + "learning_rate": 9.771891231928167e-05, + "loss": 2.6898, + "step": 3684 + }, + { + "epoch": 0.2473742491862689, + "grad_norm": 4.035146236419678, + "learning_rate": 9.771566591200682e-05, + "loss": 2.6026, + "step": 3686 + }, + { + "epoch": 0.2475084728700379, + "grad_norm": 7.408110618591309, + "learning_rate": 9.77124172502606e-05, + "loss": 3.1643, + "step": 3688 + }, + { + "epoch": 0.2476426965538069, + "grad_norm": 4.725294589996338, + "learning_rate": 9.77091663341965e-05, + "loss": 2.838, + "step": 3690 + }, + { + "epoch": 0.24777692023757591, + "grad_norm": 4.667789459228516, + "learning_rate": 9.770591316396807e-05, + "loss": 3.0964, + "step": 3692 + }, + { + "epoch": 0.24791114392134492, + "grad_norm": 6.781662464141846, + "learning_rate": 9.770265773972906e-05, + "loss": 2.9273, + "step": 3694 + }, + { + "epoch": 0.24804536760511392, + "grad_norm": 4.207139015197754, + "learning_rate": 9.769940006163329e-05, + "loss": 3.19, + "step": 3696 + }, + { + "epoch": 0.24817959128888292, + "grad_norm": 5.004262447357178, + "learning_rate": 9.769614012983465e-05, + "loss": 3.2087, + "step": 3698 + }, + { + "epoch": 0.24831381497265193, + "grad_norm": 5.153347015380859, + "learning_rate": 9.769287794448721e-05, + "loss": 2.6927, + "step": 3700 + }, + { + "epoch": 0.24844803865642093, + "grad_norm": 5.593570709228516, + "learning_rate": 9.768961350574503e-05, + "loss": 2.7504, + "step": 3702 + }, + { + "epoch": 0.24858226234018993, + "grad_norm": 5.1197190284729, + "learning_rate": 9.768634681376243e-05, + "loss": 2.696, + "step": 3704 + }, + { + "epoch": 0.24871648602395893, + "grad_norm": 5.623376846313477, + "learning_rate": 9.768307786869369e-05, + "loss": 2.9229, + "step": 3706 + }, + { + "epoch": 0.24885070970772794, + "grad_norm": 4.783009052276611, + "learning_rate": 9.767980667069328e-05, + "loss": 3.0498, + "step": 3708 + }, + { + "epoch": 0.24898493339149694, + "grad_norm": 4.269865036010742, + "learning_rate": 9.767653321991578e-05, + "loss": 2.8573, + "step": 3710 + }, + { + "epoch": 0.24911915707526594, + "grad_norm": 4.768504619598389, + "learning_rate": 9.767325751651583e-05, + "loss": 2.7766, + "step": 3712 + }, + { + "epoch": 0.24925338075903494, + "grad_norm": 4.686526775360107, + "learning_rate": 9.76699795606482e-05, + "loss": 2.7388, + "step": 3714 + }, + { + "epoch": 0.24938760444280395, + "grad_norm": 5.842690944671631, + "learning_rate": 9.766669935246778e-05, + "loss": 2.724, + "step": 3716 + }, + { + "epoch": 0.24952182812657295, + "grad_norm": 4.456333160400391, + "learning_rate": 9.766341689212956e-05, + "loss": 2.8092, + "step": 3718 + }, + { + "epoch": 0.24965605181034192, + "grad_norm": 5.577649116516113, + "learning_rate": 9.76601321797886e-05, + "loss": 2.7848, + "step": 3720 + }, + { + "epoch": 0.24979027549411092, + "grad_norm": 4.571791648864746, + "learning_rate": 9.765684521560012e-05, + "loss": 2.9778, + "step": 3722 + }, + { + "epoch": 0.24992449917787993, + "grad_norm": 4.296677589416504, + "learning_rate": 9.765355599971942e-05, + "loss": 2.5825, + "step": 3724 + }, + { + "epoch": 0.25005872286164893, + "grad_norm": 4.985516548156738, + "learning_rate": 9.765026453230191e-05, + "loss": 2.7191, + "step": 3726 + }, + { + "epoch": 0.25019294654541796, + "grad_norm": 4.327792644500732, + "learning_rate": 9.76469708135031e-05, + "loss": 2.7083, + "step": 3728 + }, + { + "epoch": 0.25032717022918693, + "grad_norm": 4.927600860595703, + "learning_rate": 9.764367484347861e-05, + "loss": 2.6934, + "step": 3730 + }, + { + "epoch": 0.25046139391295597, + "grad_norm": 4.861480712890625, + "learning_rate": 9.764037662238417e-05, + "loss": 2.9762, + "step": 3732 + }, + { + "epoch": 0.25059561759672494, + "grad_norm": 4.824362277984619, + "learning_rate": 9.763707615037561e-05, + "loss": 3.0472, + "step": 3734 + }, + { + "epoch": 0.25072984128049397, + "grad_norm": 4.678927421569824, + "learning_rate": 9.763377342760888e-05, + "loss": 3.0329, + "step": 3736 + }, + { + "epoch": 0.25086406496426294, + "grad_norm": 7.954378604888916, + "learning_rate": 9.763046845424002e-05, + "loss": 3.2291, + "step": 3738 + }, + { + "epoch": 0.2509982886480319, + "grad_norm": 4.2465057373046875, + "learning_rate": 9.762716123042519e-05, + "loss": 2.752, + "step": 3740 + }, + { + "epoch": 0.25113251233180095, + "grad_norm": 5.1916584968566895, + "learning_rate": 9.762385175632065e-05, + "loss": 2.8252, + "step": 3742 + }, + { + "epoch": 0.2512667360155699, + "grad_norm": 5.199374675750732, + "learning_rate": 9.762054003208276e-05, + "loss": 3.1804, + "step": 3744 + }, + { + "epoch": 0.25140095969933896, + "grad_norm": 6.191391944885254, + "learning_rate": 9.761722605786799e-05, + "loss": 2.9779, + "step": 3746 + }, + { + "epoch": 0.25153518338310793, + "grad_norm": 4.924802303314209, + "learning_rate": 9.761390983383294e-05, + "loss": 2.6129, + "step": 3748 + }, + { + "epoch": 0.25166940706687696, + "grad_norm": 4.1155524253845215, + "learning_rate": 9.761059136013426e-05, + "loss": 2.8446, + "step": 3750 + }, + { + "epoch": 0.25180363075064593, + "grad_norm": 4.534238815307617, + "learning_rate": 9.760727063692878e-05, + "loss": 3.2621, + "step": 3752 + }, + { + "epoch": 0.25193785443441497, + "grad_norm": 4.668457508087158, + "learning_rate": 9.760394766437335e-05, + "loss": 2.9074, + "step": 3754 + }, + { + "epoch": 0.25207207811818394, + "grad_norm": 6.318287372589111, + "learning_rate": 9.760062244262502e-05, + "loss": 2.8603, + "step": 3756 + }, + { + "epoch": 0.25220630180195297, + "grad_norm": 4.42996883392334, + "learning_rate": 9.759729497184089e-05, + "loss": 3.0149, + "step": 3758 + }, + { + "epoch": 0.25234052548572194, + "grad_norm": 4.502966403961182, + "learning_rate": 9.759396525217817e-05, + "loss": 3.0097, + "step": 3760 + }, + { + "epoch": 0.252474749169491, + "grad_norm": 11.476615905761719, + "learning_rate": 9.759063328379416e-05, + "loss": 2.9576, + "step": 3762 + }, + { + "epoch": 0.25260897285325995, + "grad_norm": 3.6005799770355225, + "learning_rate": 9.758729906684632e-05, + "loss": 2.4614, + "step": 3764 + }, + { + "epoch": 0.252743196537029, + "grad_norm": 4.021541118621826, + "learning_rate": 9.758396260149219e-05, + "loss": 2.7723, + "step": 3766 + }, + { + "epoch": 0.25287742022079795, + "grad_norm": 5.030503749847412, + "learning_rate": 9.758062388788937e-05, + "loss": 2.8716, + "step": 3768 + }, + { + "epoch": 0.253011643904567, + "grad_norm": 5.309235572814941, + "learning_rate": 9.757728292619566e-05, + "loss": 2.8525, + "step": 3770 + }, + { + "epoch": 0.25314586758833596, + "grad_norm": 4.047428131103516, + "learning_rate": 9.757393971656888e-05, + "loss": 2.8719, + "step": 3772 + }, + { + "epoch": 0.253280091272105, + "grad_norm": 6.008095741271973, + "learning_rate": 9.7570594259167e-05, + "loss": 2.7927, + "step": 3774 + }, + { + "epoch": 0.25341431495587396, + "grad_norm": 4.450249671936035, + "learning_rate": 9.756724655414807e-05, + "loss": 2.8215, + "step": 3776 + }, + { + "epoch": 0.25354853863964294, + "grad_norm": 4.348392009735107, + "learning_rate": 9.75638966016703e-05, + "loss": 2.9823, + "step": 3778 + }, + { + "epoch": 0.25368276232341197, + "grad_norm": 4.385859489440918, + "learning_rate": 9.756054440189191e-05, + "loss": 2.7279, + "step": 3780 + }, + { + "epoch": 0.25381698600718094, + "grad_norm": 5.505998611450195, + "learning_rate": 9.755718995497136e-05, + "loss": 2.7651, + "step": 3782 + }, + { + "epoch": 0.25395120969095, + "grad_norm": 5.81832218170166, + "learning_rate": 9.755383326106709e-05, + "loss": 2.9004, + "step": 3784 + }, + { + "epoch": 0.25408543337471895, + "grad_norm": 5.072657108306885, + "learning_rate": 9.75504743203377e-05, + "loss": 2.8567, + "step": 3786 + }, + { + "epoch": 0.254219657058488, + "grad_norm": 4.730710506439209, + "learning_rate": 9.75471131329419e-05, + "loss": 2.7587, + "step": 3788 + }, + { + "epoch": 0.25435388074225695, + "grad_norm": 4.394271373748779, + "learning_rate": 9.754374969903852e-05, + "loss": 2.6392, + "step": 3790 + }, + { + "epoch": 0.254488104426026, + "grad_norm": 4.492122650146484, + "learning_rate": 9.754038401878645e-05, + "loss": 2.9621, + "step": 3792 + }, + { + "epoch": 0.25462232810979496, + "grad_norm": 7.814239501953125, + "learning_rate": 9.753701609234471e-05, + "loss": 3.0051, + "step": 3794 + }, + { + "epoch": 0.254756551793564, + "grad_norm": 4.79547119140625, + "learning_rate": 9.753364591987244e-05, + "loss": 2.9443, + "step": 3796 + }, + { + "epoch": 0.25489077547733296, + "grad_norm": 4.384224891662598, + "learning_rate": 9.753027350152888e-05, + "loss": 3.1902, + "step": 3798 + }, + { + "epoch": 0.255024999161102, + "grad_norm": 4.503432750701904, + "learning_rate": 9.752689883747335e-05, + "loss": 2.9395, + "step": 3800 + }, + { + "epoch": 0.25515922284487097, + "grad_norm": 4.598775863647461, + "learning_rate": 9.752352192786531e-05, + "loss": 2.9314, + "step": 3802 + }, + { + "epoch": 0.25529344652864, + "grad_norm": 4.5040669441223145, + "learning_rate": 9.752014277286432e-05, + "loss": 2.6463, + "step": 3804 + }, + { + "epoch": 0.255427670212409, + "grad_norm": 11.628575325012207, + "learning_rate": 9.751676137263002e-05, + "loss": 2.7508, + "step": 3806 + }, + { + "epoch": 0.255561893896178, + "grad_norm": 7.8853983879089355, + "learning_rate": 9.751337772732218e-05, + "loss": 3.0338, + "step": 3808 + }, + { + "epoch": 0.255696117579947, + "grad_norm": 4.61055850982666, + "learning_rate": 9.750999183710068e-05, + "loss": 2.9111, + "step": 3810 + }, + { + "epoch": 0.255830341263716, + "grad_norm": 5.367734909057617, + "learning_rate": 9.750660370212549e-05, + "loss": 2.8714, + "step": 3812 + }, + { + "epoch": 0.255964564947485, + "grad_norm": 4.552687644958496, + "learning_rate": 9.75032133225567e-05, + "loss": 2.8826, + "step": 3814 + }, + { + "epoch": 0.25609878863125396, + "grad_norm": 6.952396869659424, + "learning_rate": 9.749982069855448e-05, + "loss": 2.6692, + "step": 3816 + }, + { + "epoch": 0.256233012315023, + "grad_norm": 5.241904258728027, + "learning_rate": 9.749642583027914e-05, + "loss": 2.9713, + "step": 3818 + }, + { + "epoch": 0.25636723599879196, + "grad_norm": 4.3984575271606445, + "learning_rate": 9.749302871789107e-05, + "loss": 2.673, + "step": 3820 + }, + { + "epoch": 0.256501459682561, + "grad_norm": 9.191459655761719, + "learning_rate": 9.748962936155079e-05, + "loss": 3.2357, + "step": 3822 + }, + { + "epoch": 0.25663568336632997, + "grad_norm": 4.205466270446777, + "learning_rate": 9.748622776141892e-05, + "loss": 2.9447, + "step": 3824 + }, + { + "epoch": 0.256769907050099, + "grad_norm": 4.94639253616333, + "learning_rate": 9.748282391765615e-05, + "loss": 2.7192, + "step": 3826 + }, + { + "epoch": 0.256904130733868, + "grad_norm": 5.0135979652404785, + "learning_rate": 9.747941783042332e-05, + "loss": 2.8835, + "step": 3828 + }, + { + "epoch": 0.257038354417637, + "grad_norm": 16.02644157409668, + "learning_rate": 9.747600949988136e-05, + "loss": 3.0616, + "step": 3830 + }, + { + "epoch": 0.257172578101406, + "grad_norm": 4.721651554107666, + "learning_rate": 9.747259892619132e-05, + "loss": 2.7941, + "step": 3832 + }, + { + "epoch": 0.257306801785175, + "grad_norm": 5.691426753997803, + "learning_rate": 9.746918610951433e-05, + "loss": 2.8792, + "step": 3834 + }, + { + "epoch": 0.257441025468944, + "grad_norm": 8.217187881469727, + "learning_rate": 9.746577105001163e-05, + "loss": 2.7986, + "step": 3836 + }, + { + "epoch": 0.257575249152713, + "grad_norm": 5.78372859954834, + "learning_rate": 9.74623537478446e-05, + "loss": 3.059, + "step": 3838 + }, + { + "epoch": 0.257709472836482, + "grad_norm": 6.568265438079834, + "learning_rate": 9.745893420317469e-05, + "loss": 2.5614, + "step": 3840 + }, + { + "epoch": 0.257843696520251, + "grad_norm": 11.304030418395996, + "learning_rate": 9.745551241616344e-05, + "loss": 3.0028, + "step": 3842 + }, + { + "epoch": 0.25797792020402, + "grad_norm": 7.653731822967529, + "learning_rate": 9.745208838697255e-05, + "loss": 2.9094, + "step": 3844 + }, + { + "epoch": 0.258112143887789, + "grad_norm": 4.855002403259277, + "learning_rate": 9.744866211576381e-05, + "loss": 2.8979, + "step": 3846 + }, + { + "epoch": 0.258246367571558, + "grad_norm": 6.184627532958984, + "learning_rate": 9.744523360269909e-05, + "loss": 2.5042, + "step": 3848 + }, + { + "epoch": 0.25838059125532703, + "grad_norm": 4.907862186431885, + "learning_rate": 9.744180284794035e-05, + "loss": 3.0158, + "step": 3850 + }, + { + "epoch": 0.258514814939096, + "grad_norm": 5.3828959465026855, + "learning_rate": 9.743836985164974e-05, + "loss": 2.8671, + "step": 3852 + }, + { + "epoch": 0.258649038622865, + "grad_norm": 7.80665922164917, + "learning_rate": 9.743493461398942e-05, + "loss": 2.8939, + "step": 3854 + }, + { + "epoch": 0.258783262306634, + "grad_norm": 4.868162155151367, + "learning_rate": 9.743149713512175e-05, + "loss": 2.9265, + "step": 3856 + }, + { + "epoch": 0.258917485990403, + "grad_norm": 4.65509033203125, + "learning_rate": 9.742805741520908e-05, + "loss": 3.2036, + "step": 3858 + }, + { + "epoch": 0.259051709674172, + "grad_norm": 4.194815158843994, + "learning_rate": 9.742461545441398e-05, + "loss": 2.8988, + "step": 3860 + }, + { + "epoch": 0.259185933357941, + "grad_norm": 36.96123123168945, + "learning_rate": 9.742117125289904e-05, + "loss": 3.0697, + "step": 3862 + }, + { + "epoch": 0.25932015704171, + "grad_norm": 10.22326374053955, + "learning_rate": 9.741772481082702e-05, + "loss": 2.7539, + "step": 3864 + }, + { + "epoch": 0.259454380725479, + "grad_norm": 4.319609642028809, + "learning_rate": 9.741427612836074e-05, + "loss": 2.8561, + "step": 3866 + }, + { + "epoch": 0.259588604409248, + "grad_norm": 5.122544765472412, + "learning_rate": 9.741082520566314e-05, + "loss": 3.011, + "step": 3868 + }, + { + "epoch": 0.259722828093017, + "grad_norm": 4.805135250091553, + "learning_rate": 9.740737204289729e-05, + "loss": 3.0242, + "step": 3870 + }, + { + "epoch": 0.25985705177678603, + "grad_norm": 5.441457748413086, + "learning_rate": 9.740391664022633e-05, + "loss": 2.8182, + "step": 3872 + }, + { + "epoch": 0.259991275460555, + "grad_norm": 50.61954879760742, + "learning_rate": 9.740045899781352e-05, + "loss": 3.0208, + "step": 3874 + }, + { + "epoch": 0.26012549914432403, + "grad_norm": 7.911048412322998, + "learning_rate": 9.739699911582225e-05, + "loss": 3.2954, + "step": 3876 + }, + { + "epoch": 0.260259722828093, + "grad_norm": 5.471540451049805, + "learning_rate": 9.739353699441596e-05, + "loss": 2.7159, + "step": 3878 + }, + { + "epoch": 0.26039394651186204, + "grad_norm": 4.384070873260498, + "learning_rate": 9.739007263375823e-05, + "loss": 2.9516, + "step": 3880 + }, + { + "epoch": 0.260528170195631, + "grad_norm": 4.7725067138671875, + "learning_rate": 9.738660603401277e-05, + "loss": 2.941, + "step": 3882 + }, + { + "epoch": 0.26066239387940005, + "grad_norm": 4.251208782196045, + "learning_rate": 9.738313719534337e-05, + "loss": 3.1586, + "step": 3884 + }, + { + "epoch": 0.260796617563169, + "grad_norm": 4.268898963928223, + "learning_rate": 9.73796661179139e-05, + "loss": 2.7005, + "step": 3886 + }, + { + "epoch": 0.260930841246938, + "grad_norm": 5.078932285308838, + "learning_rate": 9.737619280188837e-05, + "loss": 3.0199, + "step": 3888 + }, + { + "epoch": 0.261065064930707, + "grad_norm": 8.791038513183594, + "learning_rate": 9.737271724743088e-05, + "loss": 2.5965, + "step": 3890 + }, + { + "epoch": 0.261199288614476, + "grad_norm": 9.152828216552734, + "learning_rate": 9.736923945470568e-05, + "loss": 2.9545, + "step": 3892 + }, + { + "epoch": 0.26133351229824503, + "grad_norm": 5.3709635734558105, + "learning_rate": 9.736575942387706e-05, + "loss": 2.9114, + "step": 3894 + }, + { + "epoch": 0.261467735982014, + "grad_norm": 4.703250885009766, + "learning_rate": 9.736227715510944e-05, + "loss": 2.8461, + "step": 3896 + }, + { + "epoch": 0.26160195966578303, + "grad_norm": 4.87571907043457, + "learning_rate": 9.735879264856736e-05, + "loss": 2.8391, + "step": 3898 + }, + { + "epoch": 0.261736183349552, + "grad_norm": 5.0557026863098145, + "learning_rate": 9.735530590441545e-05, + "loss": 2.643, + "step": 3900 + }, + { + "epoch": 0.26187040703332104, + "grad_norm": 5.071584224700928, + "learning_rate": 9.735181692281846e-05, + "loss": 2.993, + "step": 3902 + }, + { + "epoch": 0.26200463071709, + "grad_norm": 4.648756980895996, + "learning_rate": 9.734832570394124e-05, + "loss": 2.8004, + "step": 3904 + }, + { + "epoch": 0.26213885440085904, + "grad_norm": 4.742566108703613, + "learning_rate": 9.734483224794872e-05, + "loss": 3.0349, + "step": 3906 + }, + { + "epoch": 0.262273078084628, + "grad_norm": 5.388330936431885, + "learning_rate": 9.7341336555006e-05, + "loss": 3.1334, + "step": 3908 + }, + { + "epoch": 0.26240730176839705, + "grad_norm": 6.746428966522217, + "learning_rate": 9.73378386252782e-05, + "loss": 2.8106, + "step": 3910 + }, + { + "epoch": 0.262541525452166, + "grad_norm": 4.934975624084473, + "learning_rate": 9.73343384589306e-05, + "loss": 2.7082, + "step": 3912 + }, + { + "epoch": 0.26267574913593505, + "grad_norm": 4.19777250289917, + "learning_rate": 9.733083605612863e-05, + "loss": 2.9045, + "step": 3914 + }, + { + "epoch": 0.26280997281970403, + "grad_norm": 4.641584873199463, + "learning_rate": 9.732733141703769e-05, + "loss": 2.7457, + "step": 3916 + }, + { + "epoch": 0.26294419650347306, + "grad_norm": 5.426604270935059, + "learning_rate": 9.732382454182343e-05, + "loss": 3.2045, + "step": 3918 + }, + { + "epoch": 0.26307842018724203, + "grad_norm": 3.7249510288238525, + "learning_rate": 9.73203154306515e-05, + "loss": 2.5263, + "step": 3920 + }, + { + "epoch": 0.26321264387101106, + "grad_norm": 4.395806789398193, + "learning_rate": 9.731680408368772e-05, + "loss": 2.7632, + "step": 3922 + }, + { + "epoch": 0.26334686755478004, + "grad_norm": 4.738161563873291, + "learning_rate": 9.7313290501098e-05, + "loss": 2.9799, + "step": 3924 + }, + { + "epoch": 0.263481091238549, + "grad_norm": 5.362348556518555, + "learning_rate": 9.730977468304834e-05, + "loss": 3.1575, + "step": 3926 + }, + { + "epoch": 0.26361531492231804, + "grad_norm": 4.340371608734131, + "learning_rate": 9.730625662970485e-05, + "loss": 2.9956, + "step": 3928 + }, + { + "epoch": 0.263749538606087, + "grad_norm": 5.128086566925049, + "learning_rate": 9.730273634123377e-05, + "loss": 2.6242, + "step": 3930 + }, + { + "epoch": 0.26388376228985605, + "grad_norm": 5.090345859527588, + "learning_rate": 9.72992138178014e-05, + "loss": 2.7916, + "step": 3932 + }, + { + "epoch": 0.264017985973625, + "grad_norm": 11.786833763122559, + "learning_rate": 9.72956890595742e-05, + "loss": 2.5576, + "step": 3934 + }, + { + "epoch": 0.26415220965739405, + "grad_norm": 9.224922180175781, + "learning_rate": 9.729216206671868e-05, + "loss": 2.8392, + "step": 3936 + }, + { + "epoch": 0.26428643334116303, + "grad_norm": 5.0883917808532715, + "learning_rate": 9.728863283940151e-05, + "loss": 2.9469, + "step": 3938 + }, + { + "epoch": 0.26442065702493206, + "grad_norm": 4.107527732849121, + "learning_rate": 9.728510137778944e-05, + "loss": 2.6257, + "step": 3940 + }, + { + "epoch": 0.26455488070870103, + "grad_norm": 4.484808444976807, + "learning_rate": 9.728156768204928e-05, + "loss": 2.9356, + "step": 3942 + }, + { + "epoch": 0.26468910439247006, + "grad_norm": 4.970802307128906, + "learning_rate": 9.727803175234804e-05, + "loss": 2.8978, + "step": 3944 + }, + { + "epoch": 0.26482332807623904, + "grad_norm": 4.273073673248291, + "learning_rate": 9.727449358885276e-05, + "loss": 3.0461, + "step": 3946 + }, + { + "epoch": 0.26495755176000807, + "grad_norm": 6.181250095367432, + "learning_rate": 9.727095319173065e-05, + "loss": 3.0763, + "step": 3948 + }, + { + "epoch": 0.26509177544377704, + "grad_norm": 4.704627513885498, + "learning_rate": 9.726741056114892e-05, + "loss": 3.2268, + "step": 3950 + }, + { + "epoch": 0.2652259991275461, + "grad_norm": 4.599483966827393, + "learning_rate": 9.726386569727501e-05, + "loss": 2.6831, + "step": 3952 + }, + { + "epoch": 0.26536022281131505, + "grad_norm": 4.5531206130981445, + "learning_rate": 9.726031860027637e-05, + "loss": 3.0074, + "step": 3954 + }, + { + "epoch": 0.2654944464950841, + "grad_norm": 4.613763332366943, + "learning_rate": 9.725676927032061e-05, + "loss": 2.7482, + "step": 3956 + }, + { + "epoch": 0.26562867017885305, + "grad_norm": 11.522418975830078, + "learning_rate": 9.725321770757545e-05, + "loss": 2.9847, + "step": 3958 + }, + { + "epoch": 0.2657628938626221, + "grad_norm": 4.713412284851074, + "learning_rate": 9.724966391220865e-05, + "loss": 3.2084, + "step": 3960 + }, + { + "epoch": 0.26589711754639106, + "grad_norm": 7.225686073303223, + "learning_rate": 9.724610788438815e-05, + "loss": 2.8622, + "step": 3962 + }, + { + "epoch": 0.26603134123016003, + "grad_norm": 4.989211082458496, + "learning_rate": 9.724254962428196e-05, + "loss": 2.7118, + "step": 3964 + }, + { + "epoch": 0.26616556491392906, + "grad_norm": 3.8185431957244873, + "learning_rate": 9.72389891320582e-05, + "loss": 2.7416, + "step": 3966 + }, + { + "epoch": 0.26629978859769804, + "grad_norm": 4.335028648376465, + "learning_rate": 9.723542640788509e-05, + "loss": 2.8403, + "step": 3968 + }, + { + "epoch": 0.26643401228146707, + "grad_norm": 4.759063720703125, + "learning_rate": 9.723186145193097e-05, + "loss": 2.9363, + "step": 3970 + }, + { + "epoch": 0.26656823596523604, + "grad_norm": 5.227819919586182, + "learning_rate": 9.722829426436427e-05, + "loss": 2.9752, + "step": 3972 + }, + { + "epoch": 0.2667024596490051, + "grad_norm": 3.7737274169921875, + "learning_rate": 9.722472484535354e-05, + "loss": 2.8307, + "step": 3974 + }, + { + "epoch": 0.26683668333277405, + "grad_norm": 4.53120756149292, + "learning_rate": 9.722115319506743e-05, + "loss": 3.0238, + "step": 3976 + }, + { + "epoch": 0.2669709070165431, + "grad_norm": 6.027231216430664, + "learning_rate": 9.721757931367468e-05, + "loss": 2.9482, + "step": 3978 + }, + { + "epoch": 0.26710513070031205, + "grad_norm": 4.465987205505371, + "learning_rate": 9.721400320134415e-05, + "loss": 3.1955, + "step": 3980 + }, + { + "epoch": 0.2672393543840811, + "grad_norm": 4.473721504211426, + "learning_rate": 9.721042485824483e-05, + "loss": 2.9479, + "step": 3982 + }, + { + "epoch": 0.26737357806785006, + "grad_norm": 5.463778972625732, + "learning_rate": 9.720684428454576e-05, + "loss": 2.7186, + "step": 3984 + }, + { + "epoch": 0.2675078017516191, + "grad_norm": 5.678593158721924, + "learning_rate": 9.720326148041612e-05, + "loss": 2.8956, + "step": 3986 + }, + { + "epoch": 0.26764202543538806, + "grad_norm": 4.487136363983154, + "learning_rate": 9.719967644602521e-05, + "loss": 2.8297, + "step": 3988 + }, + { + "epoch": 0.2677762491191571, + "grad_norm": 4.16965389251709, + "learning_rate": 9.71960891815424e-05, + "loss": 2.5569, + "step": 3990 + }, + { + "epoch": 0.26791047280292607, + "grad_norm": 4.42320442199707, + "learning_rate": 9.719249968713717e-05, + "loss": 2.7291, + "step": 3992 + }, + { + "epoch": 0.2680446964866951, + "grad_norm": 4.414381504058838, + "learning_rate": 9.718890796297914e-05, + "loss": 2.9322, + "step": 3994 + }, + { + "epoch": 0.2681789201704641, + "grad_norm": 4.346126556396484, + "learning_rate": 9.7185314009238e-05, + "loss": 2.5523, + "step": 3996 + }, + { + "epoch": 0.2683131438542331, + "grad_norm": 5.720865249633789, + "learning_rate": 9.718171782608356e-05, + "loss": 2.9651, + "step": 3998 + }, + { + "epoch": 0.2684473675380021, + "grad_norm": 4.567556858062744, + "learning_rate": 9.717811941368574e-05, + "loss": 2.8128, + "step": 4000 + }, + { + "epoch": 0.26858159122177105, + "grad_norm": 5.082077980041504, + "learning_rate": 9.717451877221453e-05, + "loss": 3.0047, + "step": 4002 + }, + { + "epoch": 0.2687158149055401, + "grad_norm": 4.736049652099609, + "learning_rate": 9.717091590184008e-05, + "loss": 3.0069, + "step": 4004 + }, + { + "epoch": 0.26885003858930906, + "grad_norm": 4.209043979644775, + "learning_rate": 9.71673108027326e-05, + "loss": 2.8355, + "step": 4006 + }, + { + "epoch": 0.2689842622730781, + "grad_norm": 6.464731693267822, + "learning_rate": 9.716370347506247e-05, + "loss": 2.9225, + "step": 4008 + }, + { + "epoch": 0.26911848595684706, + "grad_norm": 5.049082279205322, + "learning_rate": 9.716009391900006e-05, + "loss": 2.6866, + "step": 4010 + }, + { + "epoch": 0.2692527096406161, + "grad_norm": 5.675856590270996, + "learning_rate": 9.715648213471597e-05, + "loss": 3.1129, + "step": 4012 + }, + { + "epoch": 0.26938693332438507, + "grad_norm": 4.8798604011535645, + "learning_rate": 9.715286812238082e-05, + "loss": 2.6135, + "step": 4014 + }, + { + "epoch": 0.2695211570081541, + "grad_norm": 4.656592845916748, + "learning_rate": 9.714925188216537e-05, + "loss": 3.1532, + "step": 4016 + }, + { + "epoch": 0.2696553806919231, + "grad_norm": 5.303769588470459, + "learning_rate": 9.714563341424048e-05, + "loss": 2.8507, + "step": 4018 + }, + { + "epoch": 0.2697896043756921, + "grad_norm": 4.53091287612915, + "learning_rate": 9.714201271877713e-05, + "loss": 2.9376, + "step": 4020 + }, + { + "epoch": 0.2699238280594611, + "grad_norm": 4.468153953552246, + "learning_rate": 9.713838979594638e-05, + "loss": 2.7845, + "step": 4022 + }, + { + "epoch": 0.2700580517432301, + "grad_norm": 4.170029163360596, + "learning_rate": 9.71347646459194e-05, + "loss": 2.7015, + "step": 4024 + }, + { + "epoch": 0.2701922754269991, + "grad_norm": 4.247119903564453, + "learning_rate": 9.713113726886747e-05, + "loss": 2.6321, + "step": 4026 + }, + { + "epoch": 0.2703264991107681, + "grad_norm": 4.74530029296875, + "learning_rate": 9.712750766496201e-05, + "loss": 2.738, + "step": 4028 + }, + { + "epoch": 0.2704607227945371, + "grad_norm": 4.521475315093994, + "learning_rate": 9.712387583437445e-05, + "loss": 2.9147, + "step": 4030 + }, + { + "epoch": 0.2705949464783061, + "grad_norm": 4.623838424682617, + "learning_rate": 9.712024177727645e-05, + "loss": 2.9856, + "step": 4032 + }, + { + "epoch": 0.2707291701620751, + "grad_norm": 4.910146713256836, + "learning_rate": 9.711660549383967e-05, + "loss": 2.9839, + "step": 4034 + }, + { + "epoch": 0.2708633938458441, + "grad_norm": 15.759222030639648, + "learning_rate": 9.711296698423593e-05, + "loss": 2.8493, + "step": 4036 + }, + { + "epoch": 0.2709976175296131, + "grad_norm": 8.270707130432129, + "learning_rate": 9.710932624863715e-05, + "loss": 2.7943, + "step": 4038 + }, + { + "epoch": 0.2711318412133821, + "grad_norm": 5.88360595703125, + "learning_rate": 9.710568328721534e-05, + "loss": 2.8633, + "step": 4040 + }, + { + "epoch": 0.2712660648971511, + "grad_norm": 169.24310302734375, + "learning_rate": 9.710203810014262e-05, + "loss": 3.1246, + "step": 4042 + }, + { + "epoch": 0.2714002885809201, + "grad_norm": 5.594478130340576, + "learning_rate": 9.709839068759123e-05, + "loss": 2.7809, + "step": 4044 + }, + { + "epoch": 0.2715345122646891, + "grad_norm": 5.170252799987793, + "learning_rate": 9.70947410497335e-05, + "loss": 2.851, + "step": 4046 + }, + { + "epoch": 0.2716687359484581, + "grad_norm": 5.185520648956299, + "learning_rate": 9.709108918674185e-05, + "loss": 2.9856, + "step": 4048 + }, + { + "epoch": 0.2718029596322271, + "grad_norm": 4.54862642288208, + "learning_rate": 9.708743509878884e-05, + "loss": 3.0046, + "step": 4050 + }, + { + "epoch": 0.2719371833159961, + "grad_norm": 4.384169578552246, + "learning_rate": 9.70837787860471e-05, + "loss": 2.9905, + "step": 4052 + }, + { + "epoch": 0.2720714069997651, + "grad_norm": 4.191932678222656, + "learning_rate": 9.708012024868942e-05, + "loss": 2.7531, + "step": 4054 + }, + { + "epoch": 0.2722056306835341, + "grad_norm": 5.2822771072387695, + "learning_rate": 9.707645948688863e-05, + "loss": 3.0735, + "step": 4056 + }, + { + "epoch": 0.2723398543673031, + "grad_norm": 4.294942378997803, + "learning_rate": 9.70727965008177e-05, + "loss": 2.743, + "step": 4058 + }, + { + "epoch": 0.2724740780510721, + "grad_norm": 8.210320472717285, + "learning_rate": 9.706913129064971e-05, + "loss": 2.8249, + "step": 4060 + }, + { + "epoch": 0.27260830173484113, + "grad_norm": 4.383756160736084, + "learning_rate": 9.706546385655781e-05, + "loss": 2.8831, + "step": 4062 + }, + { + "epoch": 0.2727425254186101, + "grad_norm": 4.694242477416992, + "learning_rate": 9.706179419871531e-05, + "loss": 2.7986, + "step": 4064 + }, + { + "epoch": 0.27287674910237913, + "grad_norm": 4.497794151306152, + "learning_rate": 9.705812231729557e-05, + "loss": 2.8603, + "step": 4066 + }, + { + "epoch": 0.2730109727861481, + "grad_norm": 4.399491310119629, + "learning_rate": 9.705444821247208e-05, + "loss": 3.0649, + "step": 4068 + }, + { + "epoch": 0.27314519646991714, + "grad_norm": 4.574599266052246, + "learning_rate": 9.705077188441844e-05, + "loss": 2.8201, + "step": 4070 + }, + { + "epoch": 0.2732794201536861, + "grad_norm": 4.3626179695129395, + "learning_rate": 9.704709333330836e-05, + "loss": 2.8861, + "step": 4072 + }, + { + "epoch": 0.27341364383745514, + "grad_norm": 4.452908515930176, + "learning_rate": 9.704341255931562e-05, + "loss": 2.7583, + "step": 4074 + }, + { + "epoch": 0.2735478675212241, + "grad_norm": 7.966529369354248, + "learning_rate": 9.703972956261416e-05, + "loss": 2.9913, + "step": 4076 + }, + { + "epoch": 0.2736820912049931, + "grad_norm": 11.209407806396484, + "learning_rate": 9.703604434337797e-05, + "loss": 2.8997, + "step": 4078 + }, + { + "epoch": 0.2738163148887621, + "grad_norm": 5.107242584228516, + "learning_rate": 9.703235690178118e-05, + "loss": 2.9209, + "step": 4080 + }, + { + "epoch": 0.2739505385725311, + "grad_norm": 4.261386871337891, + "learning_rate": 9.7028667237998e-05, + "loss": 2.7775, + "step": 4082 + }, + { + "epoch": 0.27408476225630013, + "grad_norm": 5.284154415130615, + "learning_rate": 9.702497535220278e-05, + "loss": 3.088, + "step": 4084 + }, + { + "epoch": 0.2742189859400691, + "grad_norm": 4.708865642547607, + "learning_rate": 9.702128124456996e-05, + "loss": 2.9112, + "step": 4086 + }, + { + "epoch": 0.27435320962383813, + "grad_norm": 4.617551803588867, + "learning_rate": 9.701758491527404e-05, + "loss": 3.1388, + "step": 4088 + }, + { + "epoch": 0.2744874333076071, + "grad_norm": 4.412621021270752, + "learning_rate": 9.701388636448969e-05, + "loss": 3.0211, + "step": 4090 + }, + { + "epoch": 0.27462165699137614, + "grad_norm": 7.436208248138428, + "learning_rate": 9.701018559239167e-05, + "loss": 2.8558, + "step": 4092 + }, + { + "epoch": 0.2747558806751451, + "grad_norm": 5.102291107177734, + "learning_rate": 9.700648259915481e-05, + "loss": 2.6445, + "step": 4094 + }, + { + "epoch": 0.27489010435891414, + "grad_norm": 5.127889156341553, + "learning_rate": 9.700277738495409e-05, + "loss": 3.0786, + "step": 4096 + }, + { + "epoch": 0.2750243280426831, + "grad_norm": 5.14064884185791, + "learning_rate": 9.699906994996457e-05, + "loss": 2.9958, + "step": 4098 + }, + { + "epoch": 0.27515855172645215, + "grad_norm": 5.03084659576416, + "learning_rate": 9.69953602943614e-05, + "loss": 2.8666, + "step": 4100 + }, + { + "epoch": 0.2752927754102211, + "grad_norm": 4.460030555725098, + "learning_rate": 9.699164841831989e-05, + "loss": 2.8817, + "step": 4102 + }, + { + "epoch": 0.27542699909399015, + "grad_norm": 5.121056079864502, + "learning_rate": 9.698793432201538e-05, + "loss": 3.1211, + "step": 4104 + }, + { + "epoch": 0.27556122277775913, + "grad_norm": 4.521562576293945, + "learning_rate": 9.698421800562338e-05, + "loss": 2.9042, + "step": 4106 + }, + { + "epoch": 0.27569544646152816, + "grad_norm": 4.717555046081543, + "learning_rate": 9.698049946931947e-05, + "loss": 2.9158, + "step": 4108 + }, + { + "epoch": 0.27582967014529713, + "grad_norm": 4.48073673248291, + "learning_rate": 9.697677871327933e-05, + "loss": 2.7932, + "step": 4110 + }, + { + "epoch": 0.27596389382906616, + "grad_norm": 5.835611820220947, + "learning_rate": 9.697305573767879e-05, + "loss": 2.8972, + "step": 4112 + }, + { + "epoch": 0.27609811751283514, + "grad_norm": 5.249105930328369, + "learning_rate": 9.696933054269372e-05, + "loss": 2.5913, + "step": 4114 + }, + { + "epoch": 0.2762323411966041, + "grad_norm": 4.106356620788574, + "learning_rate": 9.696560312850015e-05, + "loss": 2.928, + "step": 4116 + }, + { + "epoch": 0.27636656488037314, + "grad_norm": 4.8609514236450195, + "learning_rate": 9.69618734952742e-05, + "loss": 2.8677, + "step": 4118 + }, + { + "epoch": 0.2765007885641421, + "grad_norm": 5.048997402191162, + "learning_rate": 9.695814164319204e-05, + "loss": 3.0378, + "step": 4120 + }, + { + "epoch": 0.27663501224791115, + "grad_norm": 4.015932559967041, + "learning_rate": 9.695440757243005e-05, + "loss": 2.6085, + "step": 4122 + }, + { + "epoch": 0.2767692359316801, + "grad_norm": 4.388291358947754, + "learning_rate": 9.695067128316463e-05, + "loss": 2.7588, + "step": 4124 + }, + { + "epoch": 0.27690345961544915, + "grad_norm": 4.349859714508057, + "learning_rate": 9.694693277557232e-05, + "loss": 2.7503, + "step": 4126 + }, + { + "epoch": 0.27703768329921813, + "grad_norm": 5.0706634521484375, + "learning_rate": 9.694319204982974e-05, + "loss": 2.9206, + "step": 4128 + }, + { + "epoch": 0.27717190698298716, + "grad_norm": 4.432368278503418, + "learning_rate": 9.693944910611365e-05, + "loss": 2.7092, + "step": 4130 + }, + { + "epoch": 0.27730613066675613, + "grad_norm": 4.009401798248291, + "learning_rate": 9.693570394460091e-05, + "loss": 2.5669, + "step": 4132 + }, + { + "epoch": 0.27744035435052516, + "grad_norm": 5.182618141174316, + "learning_rate": 9.693195656546843e-05, + "loss": 2.7832, + "step": 4134 + }, + { + "epoch": 0.27757457803429414, + "grad_norm": 5.331184387207031, + "learning_rate": 9.692820696889331e-05, + "loss": 2.9276, + "step": 4136 + }, + { + "epoch": 0.27770880171806317, + "grad_norm": 4.570873260498047, + "learning_rate": 9.692445515505268e-05, + "loss": 2.6841, + "step": 4138 + }, + { + "epoch": 0.27784302540183214, + "grad_norm": 30.049213409423828, + "learning_rate": 9.692070112412382e-05, + "loss": 2.6601, + "step": 4140 + }, + { + "epoch": 0.2779772490856012, + "grad_norm": 9.012673377990723, + "learning_rate": 9.69169448762841e-05, + "loss": 2.9335, + "step": 4142 + }, + { + "epoch": 0.27811147276937015, + "grad_norm": 4.412790775299072, + "learning_rate": 9.691318641171099e-05, + "loss": 2.9204, + "step": 4144 + }, + { + "epoch": 0.2782456964531392, + "grad_norm": 10.712197303771973, + "learning_rate": 9.690942573058207e-05, + "loss": 2.7294, + "step": 4146 + }, + { + "epoch": 0.27837992013690815, + "grad_norm": 5.417295932769775, + "learning_rate": 9.690566283307503e-05, + "loss": 2.8046, + "step": 4148 + }, + { + "epoch": 0.2785141438206772, + "grad_norm": 4.383596420288086, + "learning_rate": 9.690189771936766e-05, + "loss": 2.7957, + "step": 4150 + }, + { + "epoch": 0.27864836750444616, + "grad_norm": 4.120327472686768, + "learning_rate": 9.689813038963784e-05, + "loss": 2.5869, + "step": 4152 + }, + { + "epoch": 0.27878259118821513, + "grad_norm": 5.765247344970703, + "learning_rate": 9.68943608440636e-05, + "loss": 2.9769, + "step": 4154 + }, + { + "epoch": 0.27891681487198416, + "grad_norm": 4.293210506439209, + "learning_rate": 9.6890589082823e-05, + "loss": 2.8665, + "step": 4156 + }, + { + "epoch": 0.27905103855575314, + "grad_norm": 4.791554927825928, + "learning_rate": 9.68868151060943e-05, + "loss": 2.9854, + "step": 4158 + }, + { + "epoch": 0.27918526223952217, + "grad_norm": 5.720062732696533, + "learning_rate": 9.688303891405576e-05, + "loss": 3.1373, + "step": 4160 + }, + { + "epoch": 0.27931948592329114, + "grad_norm": 4.095503330230713, + "learning_rate": 9.687926050688583e-05, + "loss": 2.9991, + "step": 4162 + }, + { + "epoch": 0.2794537096070602, + "grad_norm": 6.757335186004639, + "learning_rate": 9.687547988476303e-05, + "loss": 2.8988, + "step": 4164 + }, + { + "epoch": 0.27958793329082915, + "grad_norm": 3.922107458114624, + "learning_rate": 9.687169704786599e-05, + "loss": 2.7128, + "step": 4166 + }, + { + "epoch": 0.2797221569745982, + "grad_norm": 4.5588483810424805, + "learning_rate": 9.686791199637342e-05, + "loss": 2.7556, + "step": 4168 + }, + { + "epoch": 0.27985638065836715, + "grad_norm": 4.572213649749756, + "learning_rate": 9.686412473046418e-05, + "loss": 2.9467, + "step": 4170 + }, + { + "epoch": 0.2799906043421362, + "grad_norm": 5.05283784866333, + "learning_rate": 9.686033525031719e-05, + "loss": 2.9973, + "step": 4172 + }, + { + "epoch": 0.28012482802590516, + "grad_norm": 12.720797538757324, + "learning_rate": 9.685654355611151e-05, + "loss": 3.0451, + "step": 4174 + }, + { + "epoch": 0.2802590517096742, + "grad_norm": 4.63034725189209, + "learning_rate": 9.685274964802629e-05, + "loss": 3.0651, + "step": 4176 + }, + { + "epoch": 0.28039327539344316, + "grad_norm": 4.01109504699707, + "learning_rate": 9.684895352624077e-05, + "loss": 2.7133, + "step": 4178 + }, + { + "epoch": 0.2805274990772122, + "grad_norm": 9.731619834899902, + "learning_rate": 9.684515519093433e-05, + "loss": 2.6891, + "step": 4180 + }, + { + "epoch": 0.28066172276098117, + "grad_norm": 5.961690425872803, + "learning_rate": 9.684135464228643e-05, + "loss": 3.0509, + "step": 4182 + }, + { + "epoch": 0.2807959464447502, + "grad_norm": 4.275909423828125, + "learning_rate": 9.683755188047663e-05, + "loss": 2.8081, + "step": 4184 + }, + { + "epoch": 0.2809301701285192, + "grad_norm": 5.812361717224121, + "learning_rate": 9.68337469056846e-05, + "loss": 2.8378, + "step": 4186 + }, + { + "epoch": 0.2810643938122882, + "grad_norm": 4.45614767074585, + "learning_rate": 9.682993971809012e-05, + "loss": 3.1243, + "step": 4188 + }, + { + "epoch": 0.2811986174960572, + "grad_norm": 4.391541481018066, + "learning_rate": 9.682613031787308e-05, + "loss": 2.9247, + "step": 4190 + }, + { + "epoch": 0.28133284117982615, + "grad_norm": 6.787690162658691, + "learning_rate": 9.682231870521347e-05, + "loss": 2.9535, + "step": 4192 + }, + { + "epoch": 0.2814670648635952, + "grad_norm": 5.025696754455566, + "learning_rate": 9.681850488029136e-05, + "loss": 2.7302, + "step": 4194 + }, + { + "epoch": 0.28160128854736416, + "grad_norm": 3.9815497398376465, + "learning_rate": 9.681468884328697e-05, + "loss": 2.599, + "step": 4196 + }, + { + "epoch": 0.2817355122311332, + "grad_norm": 4.538339138031006, + "learning_rate": 9.681087059438059e-05, + "loss": 2.5946, + "step": 4198 + }, + { + "epoch": 0.28186973591490216, + "grad_norm": 4.356814861297607, + "learning_rate": 9.680705013375261e-05, + "loss": 2.9767, + "step": 4200 + }, + { + "epoch": 0.2820039595986712, + "grad_norm": 4.2917938232421875, + "learning_rate": 9.680322746158357e-05, + "loss": 2.9385, + "step": 4202 + }, + { + "epoch": 0.28213818328244017, + "grad_norm": 4.0296525955200195, + "learning_rate": 9.679940257805404e-05, + "loss": 2.9261, + "step": 4204 + }, + { + "epoch": 0.2822724069662092, + "grad_norm": 4.678405284881592, + "learning_rate": 9.679557548334479e-05, + "loss": 2.7669, + "step": 4206 + }, + { + "epoch": 0.2824066306499782, + "grad_norm": 4.662387371063232, + "learning_rate": 9.679174617763662e-05, + "loss": 2.9637, + "step": 4208 + }, + { + "epoch": 0.2825408543337472, + "grad_norm": 6.621274948120117, + "learning_rate": 9.678791466111044e-05, + "loss": 3.2062, + "step": 4210 + }, + { + "epoch": 0.2826750780175162, + "grad_norm": 4.3010640144348145, + "learning_rate": 9.67840809339473e-05, + "loss": 2.8514, + "step": 4212 + }, + { + "epoch": 0.2828093017012852, + "grad_norm": 4.508181571960449, + "learning_rate": 9.678024499632834e-05, + "loss": 2.9384, + "step": 4214 + }, + { + "epoch": 0.2829435253850542, + "grad_norm": 4.91469144821167, + "learning_rate": 9.677640684843478e-05, + "loss": 3.0442, + "step": 4216 + }, + { + "epoch": 0.2830777490688232, + "grad_norm": 4.502353668212891, + "learning_rate": 9.6772566490448e-05, + "loss": 2.5879, + "step": 4218 + }, + { + "epoch": 0.2832119727525922, + "grad_norm": 4.58046817779541, + "learning_rate": 9.676872392254941e-05, + "loss": 2.7815, + "step": 4220 + }, + { + "epoch": 0.2833461964363612, + "grad_norm": 5.134589195251465, + "learning_rate": 9.67648791449206e-05, + "loss": 2.8397, + "step": 4222 + }, + { + "epoch": 0.2834804201201302, + "grad_norm": 3.7888898849487305, + "learning_rate": 9.676103215774319e-05, + "loss": 2.5533, + "step": 4224 + }, + { + "epoch": 0.2836146438038992, + "grad_norm": 4.6854023933410645, + "learning_rate": 9.675718296119898e-05, + "loss": 2.9232, + "step": 4226 + }, + { + "epoch": 0.2837488674876682, + "grad_norm": 4.38947868347168, + "learning_rate": 9.675333155546979e-05, + "loss": 2.687, + "step": 4228 + }, + { + "epoch": 0.2838830911714372, + "grad_norm": 4.7861552238464355, + "learning_rate": 9.674947794073765e-05, + "loss": 2.8218, + "step": 4230 + }, + { + "epoch": 0.2840173148552062, + "grad_norm": 3.9444420337677, + "learning_rate": 9.674562211718459e-05, + "loss": 2.5504, + "step": 4232 + }, + { + "epoch": 0.2841515385389752, + "grad_norm": 17.840564727783203, + "learning_rate": 9.674176408499281e-05, + "loss": 2.616, + "step": 4234 + }, + { + "epoch": 0.2842857622227442, + "grad_norm": 4.470780849456787, + "learning_rate": 9.67379038443446e-05, + "loss": 2.8291, + "step": 4236 + }, + { + "epoch": 0.2844199859065132, + "grad_norm": 4.559418678283691, + "learning_rate": 9.673404139542233e-05, + "loss": 2.8704, + "step": 4238 + }, + { + "epoch": 0.2845542095902822, + "grad_norm": 4.235703945159912, + "learning_rate": 9.67301767384085e-05, + "loss": 2.8871, + "step": 4240 + }, + { + "epoch": 0.2846884332740512, + "grad_norm": 4.498559474945068, + "learning_rate": 9.672630987348573e-05, + "loss": 2.8378, + "step": 4242 + }, + { + "epoch": 0.2848226569578202, + "grad_norm": 4.169527053833008, + "learning_rate": 9.672244080083668e-05, + "loss": 2.8379, + "step": 4244 + }, + { + "epoch": 0.2849568806415892, + "grad_norm": 5.160027980804443, + "learning_rate": 9.671856952064418e-05, + "loss": 2.805, + "step": 4246 + }, + { + "epoch": 0.2850911043253582, + "grad_norm": 4.435372352600098, + "learning_rate": 9.671469603309113e-05, + "loss": 2.4769, + "step": 4248 + }, + { + "epoch": 0.2852253280091272, + "grad_norm": 7.1346893310546875, + "learning_rate": 9.671082033836057e-05, + "loss": 3.077, + "step": 4250 + }, + { + "epoch": 0.28535955169289623, + "grad_norm": 4.583145618438721, + "learning_rate": 9.67069424366356e-05, + "loss": 2.8965, + "step": 4252 + }, + { + "epoch": 0.2854937753766652, + "grad_norm": 4.895559787750244, + "learning_rate": 9.670306232809945e-05, + "loss": 2.9422, + "step": 4254 + }, + { + "epoch": 0.28562799906043423, + "grad_norm": 3.9592697620391846, + "learning_rate": 9.669918001293543e-05, + "loss": 3.0156, + "step": 4256 + }, + { + "epoch": 0.2857622227442032, + "grad_norm": 4.613564968109131, + "learning_rate": 9.669529549132699e-05, + "loss": 2.762, + "step": 4258 + }, + { + "epoch": 0.28589644642797224, + "grad_norm": 4.822320461273193, + "learning_rate": 9.669140876345765e-05, + "loss": 2.9974, + "step": 4260 + }, + { + "epoch": 0.2860306701117412, + "grad_norm": 4.790377616882324, + "learning_rate": 9.668751982951108e-05, + "loss": 2.8906, + "step": 4262 + }, + { + "epoch": 0.28616489379551024, + "grad_norm": 5.874161243438721, + "learning_rate": 9.6683628689671e-05, + "loss": 2.8784, + "step": 4264 + }, + { + "epoch": 0.2862991174792792, + "grad_norm": 4.480892181396484, + "learning_rate": 9.667973534412125e-05, + "loss": 2.7342, + "step": 4266 + }, + { + "epoch": 0.2864333411630482, + "grad_norm": 4.273677349090576, + "learning_rate": 9.66758397930458e-05, + "loss": 2.9031, + "step": 4268 + }, + { + "epoch": 0.2865675648468172, + "grad_norm": 4.243147373199463, + "learning_rate": 9.667194203662873e-05, + "loss": 2.8031, + "step": 4270 + }, + { + "epoch": 0.2867017885305862, + "grad_norm": 4.375511169433594, + "learning_rate": 9.666804207505414e-05, + "loss": 2.9516, + "step": 4272 + }, + { + "epoch": 0.28683601221435523, + "grad_norm": 5.284012794494629, + "learning_rate": 9.666413990850635e-05, + "loss": 2.636, + "step": 4274 + }, + { + "epoch": 0.2869702358981242, + "grad_norm": 4.121740818023682, + "learning_rate": 9.666023553716971e-05, + "loss": 2.7856, + "step": 4276 + }, + { + "epoch": 0.28710445958189323, + "grad_norm": 5.697823524475098, + "learning_rate": 9.665632896122869e-05, + "loss": 2.8435, + "step": 4278 + }, + { + "epoch": 0.2872386832656622, + "grad_norm": 4.427298545837402, + "learning_rate": 9.665242018086786e-05, + "loss": 2.8956, + "step": 4280 + }, + { + "epoch": 0.28737290694943124, + "grad_norm": 5.039431571960449, + "learning_rate": 9.664850919627193e-05, + "loss": 2.7659, + "step": 4282 + }, + { + "epoch": 0.2875071306332002, + "grad_norm": 3.896700859069824, + "learning_rate": 9.664459600762568e-05, + "loss": 2.9325, + "step": 4284 + }, + { + "epoch": 0.28764135431696924, + "grad_norm": 4.021005153656006, + "learning_rate": 9.664068061511397e-05, + "loss": 2.8066, + "step": 4286 + }, + { + "epoch": 0.2877755780007382, + "grad_norm": 4.565979957580566, + "learning_rate": 9.663676301892182e-05, + "loss": 2.7862, + "step": 4288 + }, + { + "epoch": 0.28790980168450725, + "grad_norm": 5.946723937988281, + "learning_rate": 9.663284321923434e-05, + "loss": 2.8577, + "step": 4290 + }, + { + "epoch": 0.2880440253682762, + "grad_norm": 5.3086371421813965, + "learning_rate": 9.66289212162367e-05, + "loss": 3.1177, + "step": 4292 + }, + { + "epoch": 0.28817824905204525, + "grad_norm": 5.295441150665283, + "learning_rate": 9.662499701011424e-05, + "loss": 2.809, + "step": 4294 + }, + { + "epoch": 0.28831247273581423, + "grad_norm": 4.057580471038818, + "learning_rate": 9.662107060105234e-05, + "loss": 2.6903, + "step": 4296 + }, + { + "epoch": 0.28844669641958326, + "grad_norm": 4.606744289398193, + "learning_rate": 9.661714198923654e-05, + "loss": 2.8072, + "step": 4298 + }, + { + "epoch": 0.28858092010335223, + "grad_norm": 5.737088680267334, + "learning_rate": 9.661321117485244e-05, + "loss": 2.9362, + "step": 4300 + }, + { + "epoch": 0.28871514378712126, + "grad_norm": 4.509100914001465, + "learning_rate": 9.66092781580858e-05, + "loss": 2.7189, + "step": 4302 + }, + { + "epoch": 0.28884936747089024, + "grad_norm": 6.721002101898193, + "learning_rate": 9.66053429391224e-05, + "loss": 3.1925, + "step": 4304 + }, + { + "epoch": 0.2889835911546592, + "grad_norm": 5.095534801483154, + "learning_rate": 9.660140551814817e-05, + "loss": 2.8098, + "step": 4306 + }, + { + "epoch": 0.28911781483842824, + "grad_norm": 6.2896833419799805, + "learning_rate": 9.65974658953492e-05, + "loss": 2.5557, + "step": 4308 + }, + { + "epoch": 0.2892520385221972, + "grad_norm": 4.803857803344727, + "learning_rate": 9.659352407091159e-05, + "loss": 2.7089, + "step": 4310 + }, + { + "epoch": 0.28938626220596625, + "grad_norm": 4.55495023727417, + "learning_rate": 9.658958004502158e-05, + "loss": 2.8863, + "step": 4312 + }, + { + "epoch": 0.2895204858897352, + "grad_norm": 4.019484043121338, + "learning_rate": 9.658563381786554e-05, + "loss": 2.4644, + "step": 4314 + }, + { + "epoch": 0.28965470957350425, + "grad_norm": 4.205108642578125, + "learning_rate": 9.65816853896299e-05, + "loss": 2.495, + "step": 4316 + }, + { + "epoch": 0.28978893325727323, + "grad_norm": 4.480343341827393, + "learning_rate": 9.657773476050123e-05, + "loss": 2.8414, + "step": 4318 + }, + { + "epoch": 0.28992315694104226, + "grad_norm": 4.256911754608154, + "learning_rate": 9.657378193066617e-05, + "loss": 2.8397, + "step": 4320 + }, + { + "epoch": 0.29005738062481123, + "grad_norm": 11.298532485961914, + "learning_rate": 9.65698269003115e-05, + "loss": 2.7417, + "step": 4322 + }, + { + "epoch": 0.29019160430858026, + "grad_norm": 4.165858268737793, + "learning_rate": 9.656586966962411e-05, + "loss": 2.6164, + "step": 4324 + }, + { + "epoch": 0.29032582799234924, + "grad_norm": 6.106349468231201, + "learning_rate": 9.656191023879092e-05, + "loss": 2.8016, + "step": 4326 + }, + { + "epoch": 0.29046005167611827, + "grad_norm": 4.305108070373535, + "learning_rate": 9.655794860799904e-05, + "loss": 2.9212, + "step": 4328 + }, + { + "epoch": 0.29059427535988724, + "grad_norm": 6.89378023147583, + "learning_rate": 9.655398477743565e-05, + "loss": 2.8581, + "step": 4330 + }, + { + "epoch": 0.2907284990436563, + "grad_norm": 5.793878078460693, + "learning_rate": 9.6550018747288e-05, + "loss": 2.8761, + "step": 4332 + }, + { + "epoch": 0.29086272272742525, + "grad_norm": 4.622072696685791, + "learning_rate": 9.654605051774352e-05, + "loss": 2.5835, + "step": 4334 + }, + { + "epoch": 0.2909969464111943, + "grad_norm": 4.1598801612854, + "learning_rate": 9.654208008898968e-05, + "loss": 2.8829, + "step": 4336 + }, + { + "epoch": 0.29113117009496325, + "grad_norm": 5.081852436065674, + "learning_rate": 9.653810746121407e-05, + "loss": 2.7901, + "step": 4338 + }, + { + "epoch": 0.2912653937787323, + "grad_norm": 4.310352325439453, + "learning_rate": 9.65341326346044e-05, + "loss": 2.7889, + "step": 4340 + }, + { + "epoch": 0.29139961746250126, + "grad_norm": 4.451131820678711, + "learning_rate": 9.653015560934846e-05, + "loss": 2.8015, + "step": 4342 + }, + { + "epoch": 0.29153384114627023, + "grad_norm": 4.252593517303467, + "learning_rate": 9.652617638563417e-05, + "loss": 3.2431, + "step": 4344 + }, + { + "epoch": 0.29166806483003926, + "grad_norm": 4.317137718200684, + "learning_rate": 9.652219496364954e-05, + "loss": 2.8835, + "step": 4346 + }, + { + "epoch": 0.29180228851380824, + "grad_norm": 4.864560604095459, + "learning_rate": 9.651821134358268e-05, + "loss": 2.8122, + "step": 4348 + }, + { + "epoch": 0.29193651219757727, + "grad_norm": 4.486721515655518, + "learning_rate": 9.651422552562181e-05, + "loss": 2.7973, + "step": 4350 + }, + { + "epoch": 0.29207073588134624, + "grad_norm": 5.228052139282227, + "learning_rate": 9.651023750995525e-05, + "loss": 2.8821, + "step": 4352 + }, + { + "epoch": 0.2922049595651153, + "grad_norm": 4.414638996124268, + "learning_rate": 9.650624729677141e-05, + "loss": 2.7877, + "step": 4354 + }, + { + "epoch": 0.29233918324888425, + "grad_norm": 4.660495758056641, + "learning_rate": 9.650225488625886e-05, + "loss": 2.995, + "step": 4356 + }, + { + "epoch": 0.2924734069326533, + "grad_norm": 4.510606288909912, + "learning_rate": 9.649826027860619e-05, + "loss": 2.7683, + "step": 4358 + }, + { + "epoch": 0.29260763061642225, + "grad_norm": 4.000579833984375, + "learning_rate": 9.649426347400217e-05, + "loss": 2.6826, + "step": 4360 + }, + { + "epoch": 0.2927418543001913, + "grad_norm": 4.54395866394043, + "learning_rate": 9.649026447263561e-05, + "loss": 2.8395, + "step": 4362 + }, + { + "epoch": 0.29287607798396026, + "grad_norm": 4.416308879852295, + "learning_rate": 9.648626327469549e-05, + "loss": 2.785, + "step": 4364 + }, + { + "epoch": 0.2930103016677293, + "grad_norm": 5.317094802856445, + "learning_rate": 9.648225988037083e-05, + "loss": 2.7927, + "step": 4366 + }, + { + "epoch": 0.29314452535149826, + "grad_norm": 4.843416690826416, + "learning_rate": 9.64782542898508e-05, + "loss": 2.798, + "step": 4368 + }, + { + "epoch": 0.2932787490352673, + "grad_norm": 4.4379801750183105, + "learning_rate": 9.647424650332467e-05, + "loss": 2.6519, + "step": 4370 + }, + { + "epoch": 0.29341297271903627, + "grad_norm": 4.21823263168335, + "learning_rate": 9.647023652098174e-05, + "loss": 2.8164, + "step": 4372 + }, + { + "epoch": 0.2935471964028053, + "grad_norm": 4.426904678344727, + "learning_rate": 9.646622434301154e-05, + "loss": 3.1133, + "step": 4374 + }, + { + "epoch": 0.2936814200865743, + "grad_norm": 5.63826847076416, + "learning_rate": 9.64622099696036e-05, + "loss": 2.8423, + "step": 4376 + }, + { + "epoch": 0.2938156437703433, + "grad_norm": 4.365314960479736, + "learning_rate": 9.645819340094762e-05, + "loss": 2.7352, + "step": 4378 + }, + { + "epoch": 0.2939498674541123, + "grad_norm": 4.459978103637695, + "learning_rate": 9.645417463723335e-05, + "loss": 3.0145, + "step": 4380 + }, + { + "epoch": 0.29408409113788125, + "grad_norm": 4.389591693878174, + "learning_rate": 9.645015367865067e-05, + "loss": 2.8081, + "step": 4382 + }, + { + "epoch": 0.2942183148216503, + "grad_norm": 4.4773054122924805, + "learning_rate": 9.644613052538957e-05, + "loss": 2.9834, + "step": 4384 + }, + { + "epoch": 0.29435253850541926, + "grad_norm": 4.6308135986328125, + "learning_rate": 9.644210517764014e-05, + "loss": 2.7416, + "step": 4386 + }, + { + "epoch": 0.2944867621891883, + "grad_norm": 4.584836483001709, + "learning_rate": 9.643807763559258e-05, + "loss": 2.6876, + "step": 4388 + }, + { + "epoch": 0.29462098587295726, + "grad_norm": 4.036896705627441, + "learning_rate": 9.643404789943713e-05, + "loss": 2.8005, + "step": 4390 + }, + { + "epoch": 0.2947552095567263, + "grad_norm": 4.252262592315674, + "learning_rate": 9.643001596936427e-05, + "loss": 2.7134, + "step": 4392 + }, + { + "epoch": 0.29488943324049527, + "grad_norm": 6.459151268005371, + "learning_rate": 9.642598184556442e-05, + "loss": 2.9496, + "step": 4394 + }, + { + "epoch": 0.2950236569242643, + "grad_norm": 4.69563102722168, + "learning_rate": 9.642194552822823e-05, + "loss": 3.1036, + "step": 4396 + }, + { + "epoch": 0.2951578806080333, + "grad_norm": 4.204712390899658, + "learning_rate": 9.64179070175464e-05, + "loss": 2.8721, + "step": 4398 + }, + { + "epoch": 0.2952921042918023, + "grad_norm": 4.495340824127197, + "learning_rate": 9.641386631370976e-05, + "loss": 2.9372, + "step": 4400 + }, + { + "epoch": 0.2954263279755713, + "grad_norm": 6.331702709197998, + "learning_rate": 9.640982341690918e-05, + "loss": 2.7097, + "step": 4402 + }, + { + "epoch": 0.2955605516593403, + "grad_norm": 4.468221187591553, + "learning_rate": 9.640577832733571e-05, + "loss": 3.0273, + "step": 4404 + }, + { + "epoch": 0.2956947753431093, + "grad_norm": 5.569672584533691, + "learning_rate": 9.640173104518047e-05, + "loss": 2.7471, + "step": 4406 + }, + { + "epoch": 0.2958289990268783, + "grad_norm": 4.7355732917785645, + "learning_rate": 9.63976815706347e-05, + "loss": 2.9711, + "step": 4408 + }, + { + "epoch": 0.2959632227106473, + "grad_norm": 4.640005588531494, + "learning_rate": 9.639362990388969e-05, + "loss": 2.7965, + "step": 4410 + }, + { + "epoch": 0.2960974463944163, + "grad_norm": 4.5016865730285645, + "learning_rate": 9.63895760451369e-05, + "loss": 2.829, + "step": 4412 + }, + { + "epoch": 0.2962316700781853, + "grad_norm": 4.592218399047852, + "learning_rate": 9.638551999456786e-05, + "loss": 2.8954, + "step": 4414 + }, + { + "epoch": 0.2963658937619543, + "grad_norm": 4.330235481262207, + "learning_rate": 9.638146175237421e-05, + "loss": 2.868, + "step": 4416 + }, + { + "epoch": 0.2965001174457233, + "grad_norm": 4.879384517669678, + "learning_rate": 9.637740131874771e-05, + "loss": 2.8953, + "step": 4418 + }, + { + "epoch": 0.2966343411294923, + "grad_norm": 4.742946624755859, + "learning_rate": 9.63733386938802e-05, + "loss": 2.8201, + "step": 4420 + }, + { + "epoch": 0.2967685648132613, + "grad_norm": 4.391503810882568, + "learning_rate": 9.636927387796361e-05, + "loss": 2.7408, + "step": 4422 + }, + { + "epoch": 0.2969027884970303, + "grad_norm": 4.507451057434082, + "learning_rate": 9.636520687119002e-05, + "loss": 2.8048, + "step": 4424 + }, + { + "epoch": 0.2970370121807993, + "grad_norm": 4.08432149887085, + "learning_rate": 9.636113767375158e-05, + "loss": 2.6405, + "step": 4426 + }, + { + "epoch": 0.2971712358645683, + "grad_norm": 4.181814193725586, + "learning_rate": 9.635706628584054e-05, + "loss": 2.5257, + "step": 4428 + }, + { + "epoch": 0.2973054595483373, + "grad_norm": 4.254319667816162, + "learning_rate": 9.63529927076493e-05, + "loss": 2.5481, + "step": 4430 + }, + { + "epoch": 0.2974396832321063, + "grad_norm": 4.968878746032715, + "learning_rate": 9.634891693937026e-05, + "loss": 2.6539, + "step": 4432 + }, + { + "epoch": 0.2975739069158753, + "grad_norm": 4.721351623535156, + "learning_rate": 9.634483898119608e-05, + "loss": 2.5796, + "step": 4434 + }, + { + "epoch": 0.2977081305996443, + "grad_norm": 4.361441135406494, + "learning_rate": 9.634075883331937e-05, + "loss": 2.9684, + "step": 4436 + }, + { + "epoch": 0.2978423542834133, + "grad_norm": 4.122551918029785, + "learning_rate": 9.633667649593294e-05, + "loss": 2.7484, + "step": 4438 + }, + { + "epoch": 0.2979765779671823, + "grad_norm": 4.220254421234131, + "learning_rate": 9.633259196922966e-05, + "loss": 2.6596, + "step": 4440 + }, + { + "epoch": 0.29811080165095133, + "grad_norm": 4.917792797088623, + "learning_rate": 9.632850525340251e-05, + "loss": 2.9604, + "step": 4442 + }, + { + "epoch": 0.2982450253347203, + "grad_norm": 4.152742385864258, + "learning_rate": 9.63244163486446e-05, + "loss": 2.7229, + "step": 4444 + }, + { + "epoch": 0.29837924901848933, + "grad_norm": 4.582280158996582, + "learning_rate": 9.63203252551491e-05, + "loss": 2.6641, + "step": 4446 + }, + { + "epoch": 0.2985134727022583, + "grad_norm": 5.116814136505127, + "learning_rate": 9.63162319731093e-05, + "loss": 3.2308, + "step": 4448 + }, + { + "epoch": 0.29864769638602734, + "grad_norm": 4.871245861053467, + "learning_rate": 9.631213650271864e-05, + "loss": 2.8902, + "step": 4450 + }, + { + "epoch": 0.2987819200697963, + "grad_norm": 3.4879517555236816, + "learning_rate": 9.630803884417061e-05, + "loss": 2.7054, + "step": 4452 + }, + { + "epoch": 0.29891614375356534, + "grad_norm": 4.9700398445129395, + "learning_rate": 9.630393899765878e-05, + "loss": 3.0752, + "step": 4454 + }, + { + "epoch": 0.2990503674373343, + "grad_norm": 4.45244026184082, + "learning_rate": 9.62998369633769e-05, + "loss": 2.7945, + "step": 4456 + }, + { + "epoch": 0.2991845911211033, + "grad_norm": 4.513127326965332, + "learning_rate": 9.629573274151876e-05, + "loss": 2.6998, + "step": 4458 + }, + { + "epoch": 0.2993188148048723, + "grad_norm": 4.69876766204834, + "learning_rate": 9.62916263322783e-05, + "loss": 2.8585, + "step": 4460 + }, + { + "epoch": 0.2994530384886413, + "grad_norm": 4.1242876052856445, + "learning_rate": 9.628751773584951e-05, + "loss": 2.7014, + "step": 4462 + }, + { + "epoch": 0.29958726217241033, + "grad_norm": 4.260964870452881, + "learning_rate": 9.628340695242652e-05, + "loss": 2.8091, + "step": 4464 + }, + { + "epoch": 0.2997214858561793, + "grad_norm": 4.401588439941406, + "learning_rate": 9.627929398220358e-05, + "loss": 2.5774, + "step": 4466 + }, + { + "epoch": 0.29985570953994833, + "grad_norm": 4.387324333190918, + "learning_rate": 9.6275178825375e-05, + "loss": 2.7146, + "step": 4468 + }, + { + "epoch": 0.2999899332237173, + "grad_norm": 4.455936908721924, + "learning_rate": 9.627106148213522e-05, + "loss": 2.7175, + "step": 4470 + }, + { + "epoch": 0.30012415690748634, + "grad_norm": 5.413829803466797, + "learning_rate": 9.626694195267876e-05, + "loss": 2.709, + "step": 4472 + }, + { + "epoch": 0.3002583805912553, + "grad_norm": 4.5451836585998535, + "learning_rate": 9.626282023720028e-05, + "loss": 2.7874, + "step": 4474 + }, + { + "epoch": 0.30039260427502434, + "grad_norm": 4.0828022956848145, + "learning_rate": 9.625869633589453e-05, + "loss": 2.7683, + "step": 4476 + }, + { + "epoch": 0.3005268279587933, + "grad_norm": 4.733001708984375, + "learning_rate": 9.625457024895632e-05, + "loss": 2.8524, + "step": 4478 + }, + { + "epoch": 0.30066105164256235, + "grad_norm": 4.87908935546875, + "learning_rate": 9.625044197658063e-05, + "loss": 2.484, + "step": 4480 + }, + { + "epoch": 0.3007952753263313, + "grad_norm": 4.06143045425415, + "learning_rate": 9.624631151896251e-05, + "loss": 2.6534, + "step": 4482 + }, + { + "epoch": 0.30092949901010035, + "grad_norm": 5.013394355773926, + "learning_rate": 9.62421788762971e-05, + "loss": 2.7053, + "step": 4484 + }, + { + "epoch": 0.30106372269386933, + "grad_norm": 4.7068023681640625, + "learning_rate": 9.623804404877967e-05, + "loss": 3.1, + "step": 4486 + }, + { + "epoch": 0.30119794637763836, + "grad_norm": 3.9088642597198486, + "learning_rate": 9.623390703660559e-05, + "loss": 3.0131, + "step": 4488 + }, + { + "epoch": 0.30133217006140733, + "grad_norm": 5.358076095581055, + "learning_rate": 9.62297678399703e-05, + "loss": 3.2312, + "step": 4490 + }, + { + "epoch": 0.3014663937451763, + "grad_norm": 4.62116003036499, + "learning_rate": 9.62256264590694e-05, + "loss": 2.6818, + "step": 4492 + }, + { + "epoch": 0.30160061742894534, + "grad_norm": 4.490265846252441, + "learning_rate": 9.622148289409855e-05, + "loss": 2.7669, + "step": 4494 + }, + { + "epoch": 0.3017348411127143, + "grad_norm": 4.117717742919922, + "learning_rate": 9.621733714525353e-05, + "loss": 2.7245, + "step": 4496 + }, + { + "epoch": 0.30186906479648334, + "grad_norm": 5.409145355224609, + "learning_rate": 9.621318921273021e-05, + "loss": 2.8862, + "step": 4498 + }, + { + "epoch": 0.3020032884802523, + "grad_norm": 5.360276222229004, + "learning_rate": 9.620903909672457e-05, + "loss": 2.6924, + "step": 4500 + }, + { + "epoch": 0.30213751216402135, + "grad_norm": 4.707224369049072, + "learning_rate": 9.620488679743269e-05, + "loss": 2.795, + "step": 4502 + }, + { + "epoch": 0.3022717358477903, + "grad_norm": 5.54187536239624, + "learning_rate": 9.620073231505078e-05, + "loss": 2.4445, + "step": 4504 + }, + { + "epoch": 0.30240595953155935, + "grad_norm": 4.667269706726074, + "learning_rate": 9.61965756497751e-05, + "loss": 2.6875, + "step": 4506 + }, + { + "epoch": 0.30254018321532833, + "grad_norm": 5.11043643951416, + "learning_rate": 9.619241680180209e-05, + "loss": 2.5907, + "step": 4508 + }, + { + "epoch": 0.30267440689909736, + "grad_norm": 4.776096343994141, + "learning_rate": 9.61882557713282e-05, + "loss": 2.9425, + "step": 4510 + }, + { + "epoch": 0.30280863058286633, + "grad_norm": 3.99296236038208, + "learning_rate": 9.618409255855006e-05, + "loss": 2.4919, + "step": 4512 + }, + { + "epoch": 0.30294285426663536, + "grad_norm": 4.668543815612793, + "learning_rate": 9.617992716366435e-05, + "loss": 2.8931, + "step": 4514 + }, + { + "epoch": 0.30307707795040434, + "grad_norm": 6.861340045928955, + "learning_rate": 9.61757595868679e-05, + "loss": 2.9397, + "step": 4516 + }, + { + "epoch": 0.30321130163417337, + "grad_norm": 4.314986705780029, + "learning_rate": 9.617158982835761e-05, + "loss": 2.835, + "step": 4518 + }, + { + "epoch": 0.30334552531794234, + "grad_norm": 3.8005197048187256, + "learning_rate": 9.61674178883305e-05, + "loss": 2.7943, + "step": 4520 + }, + { + "epoch": 0.3034797490017114, + "grad_norm": 5.241741180419922, + "learning_rate": 9.616324376698366e-05, + "loss": 2.7522, + "step": 4522 + }, + { + "epoch": 0.30361397268548035, + "grad_norm": 4.293467998504639, + "learning_rate": 9.615906746451435e-05, + "loss": 2.5473, + "step": 4524 + }, + { + "epoch": 0.3037481963692494, + "grad_norm": 4.571163654327393, + "learning_rate": 9.615488898111985e-05, + "loss": 2.7669, + "step": 4526 + }, + { + "epoch": 0.30388242005301835, + "grad_norm": 4.5359272956848145, + "learning_rate": 9.615070831699762e-05, + "loss": 2.682, + "step": 4528 + }, + { + "epoch": 0.30401664373678733, + "grad_norm": 11.191313743591309, + "learning_rate": 9.614652547234516e-05, + "loss": 2.9159, + "step": 4530 + }, + { + "epoch": 0.30415086742055636, + "grad_norm": 4.646055698394775, + "learning_rate": 9.614234044736012e-05, + "loss": 2.8608, + "step": 4532 + }, + { + "epoch": 0.30428509110432533, + "grad_norm": 4.6511101722717285, + "learning_rate": 9.613815324224023e-05, + "loss": 2.9769, + "step": 4534 + }, + { + "epoch": 0.30441931478809436, + "grad_norm": 4.581108570098877, + "learning_rate": 9.613396385718334e-05, + "loss": 2.9311, + "step": 4536 + }, + { + "epoch": 0.30455353847186334, + "grad_norm": 4.487287521362305, + "learning_rate": 9.612977229238735e-05, + "loss": 2.7775, + "step": 4538 + }, + { + "epoch": 0.30468776215563237, + "grad_norm": 6.247684001922607, + "learning_rate": 9.612557854805036e-05, + "loss": 3.0886, + "step": 4540 + }, + { + "epoch": 0.30482198583940134, + "grad_norm": 5.305548667907715, + "learning_rate": 9.612138262437046e-05, + "loss": 3.0034, + "step": 4542 + }, + { + "epoch": 0.3049562095231704, + "grad_norm": 4.936919212341309, + "learning_rate": 9.611718452154594e-05, + "loss": 2.976, + "step": 4544 + }, + { + "epoch": 0.30509043320693935, + "grad_norm": 5.227365970611572, + "learning_rate": 9.611298423977512e-05, + "loss": 2.8578, + "step": 4546 + }, + { + "epoch": 0.3052246568907084, + "grad_norm": 4.523658275604248, + "learning_rate": 9.610878177925648e-05, + "loss": 2.9038, + "step": 4548 + }, + { + "epoch": 0.30535888057447735, + "grad_norm": 4.566622257232666, + "learning_rate": 9.610457714018857e-05, + "loss": 2.8884, + "step": 4550 + }, + { + "epoch": 0.3054931042582464, + "grad_norm": 4.658285140991211, + "learning_rate": 9.610037032277007e-05, + "loss": 2.6862, + "step": 4552 + }, + { + "epoch": 0.30562732794201536, + "grad_norm": 5.1031880378723145, + "learning_rate": 9.609616132719971e-05, + "loss": 3.0147, + "step": 4554 + }, + { + "epoch": 0.3057615516257844, + "grad_norm": 5.341793060302734, + "learning_rate": 9.609195015367636e-05, + "loss": 2.6541, + "step": 4556 + }, + { + "epoch": 0.30589577530955336, + "grad_norm": 5.561375617980957, + "learning_rate": 9.608773680239902e-05, + "loss": 3.1148, + "step": 4558 + }, + { + "epoch": 0.3060299989933224, + "grad_norm": 5.0951642990112305, + "learning_rate": 9.608352127356672e-05, + "loss": 2.7487, + "step": 4560 + }, + { + "epoch": 0.30616422267709137, + "grad_norm": 3.8317627906799316, + "learning_rate": 9.607930356737869e-05, + "loss": 2.7281, + "step": 4562 + }, + { + "epoch": 0.3062984463608604, + "grad_norm": 4.597611427307129, + "learning_rate": 9.607508368403415e-05, + "loss": 2.6501, + "step": 4564 + }, + { + "epoch": 0.3064326700446294, + "grad_norm": 8.3587007522583, + "learning_rate": 9.607086162373253e-05, + "loss": 2.9334, + "step": 4566 + }, + { + "epoch": 0.30656689372839835, + "grad_norm": 5.170097351074219, + "learning_rate": 9.606663738667328e-05, + "loss": 2.8807, + "step": 4568 + }, + { + "epoch": 0.3067011174121674, + "grad_norm": 5.480592250823975, + "learning_rate": 9.6062410973056e-05, + "loss": 2.6911, + "step": 4570 + }, + { + "epoch": 0.30683534109593635, + "grad_norm": 4.340033054351807, + "learning_rate": 9.605818238308038e-05, + "loss": 2.6087, + "step": 4572 + }, + { + "epoch": 0.3069695647797054, + "grad_norm": 4.567822456359863, + "learning_rate": 9.605395161694621e-05, + "loss": 2.8826, + "step": 4574 + }, + { + "epoch": 0.30710378846347436, + "grad_norm": 6.017644882202148, + "learning_rate": 9.60497186748534e-05, + "loss": 3.2873, + "step": 4576 + }, + { + "epoch": 0.3072380121472434, + "grad_norm": 4.851518154144287, + "learning_rate": 9.604548355700194e-05, + "loss": 2.9505, + "step": 4578 + }, + { + "epoch": 0.30737223583101236, + "grad_norm": 4.505639553070068, + "learning_rate": 9.60412462635919e-05, + "loss": 2.5102, + "step": 4580 + }, + { + "epoch": 0.3075064595147814, + "grad_norm": 5.116810321807861, + "learning_rate": 9.603700679482352e-05, + "loss": 2.6088, + "step": 4582 + }, + { + "epoch": 0.30764068319855037, + "grad_norm": 4.021944046020508, + "learning_rate": 9.603276515089711e-05, + "loss": 2.7644, + "step": 4584 + }, + { + "epoch": 0.3077749068823194, + "grad_norm": 4.237162113189697, + "learning_rate": 9.602852133201305e-05, + "loss": 2.7789, + "step": 4586 + }, + { + "epoch": 0.3079091305660884, + "grad_norm": 4.570884704589844, + "learning_rate": 9.602427533837188e-05, + "loss": 3.1294, + "step": 4588 + }, + { + "epoch": 0.3080433542498574, + "grad_norm": 4.388261795043945, + "learning_rate": 9.60200271701742e-05, + "loss": 2.9495, + "step": 4590 + }, + { + "epoch": 0.3081775779336264, + "grad_norm": 8.241074562072754, + "learning_rate": 9.601577682762072e-05, + "loss": 2.692, + "step": 4592 + }, + { + "epoch": 0.3083118016173954, + "grad_norm": 5.576723575592041, + "learning_rate": 9.60115243109123e-05, + "loss": 2.941, + "step": 4594 + }, + { + "epoch": 0.3084460253011644, + "grad_norm": 4.4936113357543945, + "learning_rate": 9.60072696202498e-05, + "loss": 2.9334, + "step": 4596 + }, + { + "epoch": 0.3085802489849334, + "grad_norm": 4.247500896453857, + "learning_rate": 9.60030127558343e-05, + "loss": 2.6563, + "step": 4598 + }, + { + "epoch": 0.3087144726687024, + "grad_norm": 4.405754089355469, + "learning_rate": 9.59987537178669e-05, + "loss": 2.7311, + "step": 4600 + }, + { + "epoch": 0.3088486963524714, + "grad_norm": 4.56029748916626, + "learning_rate": 9.599449250654884e-05, + "loss": 2.761, + "step": 4602 + }, + { + "epoch": 0.3089829200362404, + "grad_norm": 4.169101715087891, + "learning_rate": 9.599022912208145e-05, + "loss": 2.7336, + "step": 4604 + }, + { + "epoch": 0.30911714372000937, + "grad_norm": 4.1185078620910645, + "learning_rate": 9.598596356466618e-05, + "loss": 2.7237, + "step": 4606 + }, + { + "epoch": 0.3092513674037784, + "grad_norm": 4.468047618865967, + "learning_rate": 9.598169583450455e-05, + "loss": 2.6378, + "step": 4608 + }, + { + "epoch": 0.3093855910875474, + "grad_norm": 4.505421161651611, + "learning_rate": 9.597742593179822e-05, + "loss": 2.6516, + "step": 4610 + }, + { + "epoch": 0.3095198147713164, + "grad_norm": 4.539307117462158, + "learning_rate": 9.597315385674893e-05, + "loss": 2.7067, + "step": 4612 + }, + { + "epoch": 0.3096540384550854, + "grad_norm": 5.020122528076172, + "learning_rate": 9.596887960955849e-05, + "loss": 2.6182, + "step": 4614 + }, + { + "epoch": 0.3097882621388544, + "grad_norm": 5.121588230133057, + "learning_rate": 9.596460319042891e-05, + "loss": 2.7869, + "step": 4616 + }, + { + "epoch": 0.3099224858226234, + "grad_norm": 4.746420860290527, + "learning_rate": 9.596032459956222e-05, + "loss": 2.7738, + "step": 4618 + }, + { + "epoch": 0.3100567095063924, + "grad_norm": 4.5703816413879395, + "learning_rate": 9.595604383716055e-05, + "loss": 2.8652, + "step": 4620 + }, + { + "epoch": 0.3101909331901614, + "grad_norm": 4.282858371734619, + "learning_rate": 9.59517609034262e-05, + "loss": 2.8398, + "step": 4622 + }, + { + "epoch": 0.3103251568739304, + "grad_norm": 4.363923072814941, + "learning_rate": 9.594747579856149e-05, + "loss": 2.9201, + "step": 4624 + }, + { + "epoch": 0.3104593805576994, + "grad_norm": 4.344200134277344, + "learning_rate": 9.59431885227689e-05, + "loss": 2.6822, + "step": 4626 + }, + { + "epoch": 0.3105936042414684, + "grad_norm": 5.004021167755127, + "learning_rate": 9.5938899076251e-05, + "loss": 2.7559, + "step": 4628 + }, + { + "epoch": 0.3107278279252374, + "grad_norm": 6.682697296142578, + "learning_rate": 9.593460745921046e-05, + "loss": 2.8774, + "step": 4630 + }, + { + "epoch": 0.31086205160900643, + "grad_norm": 4.345556259155273, + "learning_rate": 9.593031367185003e-05, + "loss": 2.7919, + "step": 4632 + }, + { + "epoch": 0.3109962752927754, + "grad_norm": 4.5160722732543945, + "learning_rate": 9.592601771437261e-05, + "loss": 2.8116, + "step": 4634 + }, + { + "epoch": 0.31113049897654443, + "grad_norm": 4.608706474304199, + "learning_rate": 9.592171958698115e-05, + "loss": 3.0399, + "step": 4636 + }, + { + "epoch": 0.3112647226603134, + "grad_norm": 4.54157829284668, + "learning_rate": 9.591741928987876e-05, + "loss": 2.9618, + "step": 4638 + }, + { + "epoch": 0.31139894634408244, + "grad_norm": 4.718753814697266, + "learning_rate": 9.591311682326859e-05, + "loss": 2.874, + "step": 4640 + }, + { + "epoch": 0.3115331700278514, + "grad_norm": 4.489236831665039, + "learning_rate": 9.590881218735394e-05, + "loss": 2.5411, + "step": 4642 + }, + { + "epoch": 0.3116673937116204, + "grad_norm": 6.23002290725708, + "learning_rate": 9.590450538233817e-05, + "loss": 2.7616, + "step": 4644 + }, + { + "epoch": 0.3118016173953894, + "grad_norm": 4.560533046722412, + "learning_rate": 9.590019640842482e-05, + "loss": 2.6763, + "step": 4646 + }, + { + "epoch": 0.3119358410791584, + "grad_norm": 4.451091766357422, + "learning_rate": 9.589588526581741e-05, + "loss": 2.9028, + "step": 4648 + }, + { + "epoch": 0.3120700647629274, + "grad_norm": 4.500238418579102, + "learning_rate": 9.58915719547197e-05, + "loss": 2.9401, + "step": 4650 + }, + { + "epoch": 0.3122042884466964, + "grad_norm": 4.431452751159668, + "learning_rate": 9.588725647533545e-05, + "loss": 2.8282, + "step": 4652 + }, + { + "epoch": 0.31233851213046543, + "grad_norm": 4.606666564941406, + "learning_rate": 9.588293882786857e-05, + "loss": 2.7313, + "step": 4654 + }, + { + "epoch": 0.3124727358142344, + "grad_norm": 6.21956205368042, + "learning_rate": 9.587861901252305e-05, + "loss": 2.5568, + "step": 4656 + }, + { + "epoch": 0.31260695949800343, + "grad_norm": 7.46677827835083, + "learning_rate": 9.5874297029503e-05, + "loss": 2.9861, + "step": 4658 + }, + { + "epoch": 0.3127411831817724, + "grad_norm": 4.036979675292969, + "learning_rate": 9.586997287901262e-05, + "loss": 2.9225, + "step": 4660 + }, + { + "epoch": 0.31287540686554144, + "grad_norm": 29.86538314819336, + "learning_rate": 9.586564656125623e-05, + "loss": 2.851, + "step": 4662 + }, + { + "epoch": 0.3130096305493104, + "grad_norm": 4.825428009033203, + "learning_rate": 9.586131807643822e-05, + "loss": 2.6576, + "step": 4664 + }, + { + "epoch": 0.31314385423307944, + "grad_norm": 3.9345617294311523, + "learning_rate": 9.585698742476311e-05, + "loss": 2.6048, + "step": 4666 + }, + { + "epoch": 0.3132780779168484, + "grad_norm": 4.144377708435059, + "learning_rate": 9.585265460643553e-05, + "loss": 2.8836, + "step": 4668 + }, + { + "epoch": 0.31341230160061745, + "grad_norm": 4.160819053649902, + "learning_rate": 9.584831962166017e-05, + "loss": 2.7412, + "step": 4670 + }, + { + "epoch": 0.3135465252843864, + "grad_norm": 4.792141914367676, + "learning_rate": 9.584398247064188e-05, + "loss": 2.8614, + "step": 4672 + }, + { + "epoch": 0.31368074896815545, + "grad_norm": 5.140069961547852, + "learning_rate": 9.583964315358555e-05, + "loss": 2.7085, + "step": 4674 + }, + { + "epoch": 0.31381497265192443, + "grad_norm": 5.181036472320557, + "learning_rate": 9.583530167069626e-05, + "loss": 2.9415, + "step": 4676 + }, + { + "epoch": 0.31394919633569346, + "grad_norm": 4.17869234085083, + "learning_rate": 9.583095802217905e-05, + "loss": 2.669, + "step": 4678 + }, + { + "epoch": 0.31408342001946243, + "grad_norm": 4.385260105133057, + "learning_rate": 9.582661220823922e-05, + "loss": 2.6062, + "step": 4680 + }, + { + "epoch": 0.3142176437032314, + "grad_norm": 9.437397003173828, + "learning_rate": 9.582226422908207e-05, + "loss": 2.8381, + "step": 4682 + }, + { + "epoch": 0.31435186738700044, + "grad_norm": 4.309237957000732, + "learning_rate": 9.581791408491305e-05, + "loss": 2.4641, + "step": 4684 + }, + { + "epoch": 0.3144860910707694, + "grad_norm": 5.118946075439453, + "learning_rate": 9.581356177593767e-05, + "loss": 2.9729, + "step": 4686 + }, + { + "epoch": 0.31462031475453844, + "grad_norm": 4.7912445068359375, + "learning_rate": 9.58092073023616e-05, + "loss": 2.8461, + "step": 4688 + }, + { + "epoch": 0.3147545384383074, + "grad_norm": 4.65407133102417, + "learning_rate": 9.580485066439056e-05, + "loss": 2.8251, + "step": 4690 + }, + { + "epoch": 0.31488876212207645, + "grad_norm": 4.348706245422363, + "learning_rate": 9.58004918622304e-05, + "loss": 2.7085, + "step": 4692 + }, + { + "epoch": 0.3150229858058454, + "grad_norm": 4.739401817321777, + "learning_rate": 9.579613089608705e-05, + "loss": 2.7164, + "step": 4694 + }, + { + "epoch": 0.31515720948961445, + "grad_norm": 4.470139503479004, + "learning_rate": 9.579176776616658e-05, + "loss": 2.6974, + "step": 4696 + }, + { + "epoch": 0.31529143317338343, + "grad_norm": 4.64754581451416, + "learning_rate": 9.578740247267514e-05, + "loss": 2.8518, + "step": 4698 + }, + { + "epoch": 0.31542565685715246, + "grad_norm": 4.090550899505615, + "learning_rate": 9.578303501581895e-05, + "loss": 2.6144, + "step": 4700 + }, + { + "epoch": 0.31555988054092143, + "grad_norm": 4.283535480499268, + "learning_rate": 9.57786653958044e-05, + "loss": 2.752, + "step": 4702 + }, + { + "epoch": 0.31569410422469046, + "grad_norm": 4.33030366897583, + "learning_rate": 9.577429361283792e-05, + "loss": 2.7063, + "step": 4704 + }, + { + "epoch": 0.31582832790845944, + "grad_norm": 4.208527088165283, + "learning_rate": 9.576991966712607e-05, + "loss": 2.7503, + "step": 4706 + }, + { + "epoch": 0.31596255159222847, + "grad_norm": 4.507981300354004, + "learning_rate": 9.576554355887554e-05, + "loss": 2.7464, + "step": 4708 + }, + { + "epoch": 0.31609677527599744, + "grad_norm": 7.0039567947387695, + "learning_rate": 9.576116528829306e-05, + "loss": 2.8894, + "step": 4710 + }, + { + "epoch": 0.3162309989597665, + "grad_norm": 4.224391460418701, + "learning_rate": 9.575678485558551e-05, + "loss": 3.0016, + "step": 4712 + }, + { + "epoch": 0.31636522264353545, + "grad_norm": 4.631462574005127, + "learning_rate": 9.575240226095984e-05, + "loss": 2.8124, + "step": 4714 + }, + { + "epoch": 0.3164994463273045, + "grad_norm": 4.526212215423584, + "learning_rate": 9.574801750462315e-05, + "loss": 2.9319, + "step": 4716 + }, + { + "epoch": 0.31663367001107345, + "grad_norm": 4.702991485595703, + "learning_rate": 9.574363058678257e-05, + "loss": 2.994, + "step": 4718 + }, + { + "epoch": 0.31676789369484243, + "grad_norm": 4.410007953643799, + "learning_rate": 9.573924150764541e-05, + "loss": 2.7626, + "step": 4720 + }, + { + "epoch": 0.31690211737861146, + "grad_norm": 4.21797513961792, + "learning_rate": 9.573485026741902e-05, + "loss": 2.7804, + "step": 4722 + }, + { + "epoch": 0.31703634106238043, + "grad_norm": 4.420831680297852, + "learning_rate": 9.57304568663109e-05, + "loss": 2.6485, + "step": 4724 + }, + { + "epoch": 0.31717056474614946, + "grad_norm": 4.200890064239502, + "learning_rate": 9.572606130452862e-05, + "loss": 2.7091, + "step": 4726 + }, + { + "epoch": 0.31730478842991844, + "grad_norm": 4.35244607925415, + "learning_rate": 9.572166358227985e-05, + "loss": 2.8004, + "step": 4728 + }, + { + "epoch": 0.31743901211368747, + "grad_norm": 5.082586765289307, + "learning_rate": 9.571726369977239e-05, + "loss": 3.0312, + "step": 4730 + }, + { + "epoch": 0.31757323579745644, + "grad_norm": 4.479050636291504, + "learning_rate": 9.571286165721412e-05, + "loss": 2.8266, + "step": 4732 + }, + { + "epoch": 0.3177074594812255, + "grad_norm": 4.166759967803955, + "learning_rate": 9.570845745481303e-05, + "loss": 2.8249, + "step": 4734 + }, + { + "epoch": 0.31784168316499445, + "grad_norm": 3.3819527626037598, + "learning_rate": 9.570405109277719e-05, + "loss": 2.4459, + "step": 4736 + }, + { + "epoch": 0.3179759068487635, + "grad_norm": 4.992671966552734, + "learning_rate": 9.569964257131484e-05, + "loss": 3.1127, + "step": 4738 + }, + { + "epoch": 0.31811013053253245, + "grad_norm": 4.490841388702393, + "learning_rate": 9.56952318906342e-05, + "loss": 2.9532, + "step": 4740 + }, + { + "epoch": 0.3182443542163015, + "grad_norm": 4.70282506942749, + "learning_rate": 9.569081905094375e-05, + "loss": 2.6404, + "step": 4742 + }, + { + "epoch": 0.31837857790007046, + "grad_norm": 3.8591148853302, + "learning_rate": 9.568640405245192e-05, + "loss": 2.5941, + "step": 4744 + }, + { + "epoch": 0.3185128015838395, + "grad_norm": 5.643902778625488, + "learning_rate": 9.568198689536734e-05, + "loss": 2.842, + "step": 4746 + }, + { + "epoch": 0.31864702526760846, + "grad_norm": 5.011003494262695, + "learning_rate": 9.567756757989872e-05, + "loss": 2.7419, + "step": 4748 + }, + { + "epoch": 0.3187812489513775, + "grad_norm": 4.517385959625244, + "learning_rate": 9.567314610625485e-05, + "loss": 2.7479, + "step": 4750 + }, + { + "epoch": 0.31891547263514647, + "grad_norm": 3.9494786262512207, + "learning_rate": 9.566872247464464e-05, + "loss": 2.6473, + "step": 4752 + }, + { + "epoch": 0.3190496963189155, + "grad_norm": 4.160868167877197, + "learning_rate": 9.56642966852771e-05, + "loss": 2.8063, + "step": 4754 + }, + { + "epoch": 0.3191839200026845, + "grad_norm": 4.55134391784668, + "learning_rate": 9.565986873836132e-05, + "loss": 2.9603, + "step": 4756 + }, + { + "epoch": 0.31931814368645345, + "grad_norm": 5.166600227355957, + "learning_rate": 9.565543863410654e-05, + "loss": 2.6055, + "step": 4758 + }, + { + "epoch": 0.3194523673702225, + "grad_norm": 4.276504993438721, + "learning_rate": 9.565100637272206e-05, + "loss": 2.7229, + "step": 4760 + }, + { + "epoch": 0.31958659105399145, + "grad_norm": 4.418055057525635, + "learning_rate": 9.564657195441731e-05, + "loss": 2.7384, + "step": 4762 + }, + { + "epoch": 0.3197208147377605, + "grad_norm": 5.944407939910889, + "learning_rate": 9.564213537940177e-05, + "loss": 3.1485, + "step": 4764 + }, + { + "epoch": 0.31985503842152946, + "grad_norm": 20.827974319458008, + "learning_rate": 9.563769664788511e-05, + "loss": 2.7624, + "step": 4766 + }, + { + "epoch": 0.3199892621052985, + "grad_norm": 5.2699875831604, + "learning_rate": 9.563325576007701e-05, + "loss": 2.9964, + "step": 4768 + }, + { + "epoch": 0.32012348578906746, + "grad_norm": 4.551694869995117, + "learning_rate": 9.562881271618732e-05, + "loss": 2.6591, + "step": 4770 + }, + { + "epoch": 0.3202577094728365, + "grad_norm": 5.028125762939453, + "learning_rate": 9.562436751642593e-05, + "loss": 2.7933, + "step": 4772 + }, + { + "epoch": 0.32039193315660547, + "grad_norm": 3.953355550765991, + "learning_rate": 9.561992016100293e-05, + "loss": 2.6693, + "step": 4774 + }, + { + "epoch": 0.3205261568403745, + "grad_norm": 4.487157821655273, + "learning_rate": 9.561547065012839e-05, + "loss": 3.0046, + "step": 4776 + }, + { + "epoch": 0.3206603805241435, + "grad_norm": 4.485952377319336, + "learning_rate": 9.561101898401255e-05, + "loss": 3.0006, + "step": 4778 + }, + { + "epoch": 0.3207946042079125, + "grad_norm": 4.878428936004639, + "learning_rate": 9.560656516286577e-05, + "loss": 2.8177, + "step": 4780 + }, + { + "epoch": 0.3209288278916815, + "grad_norm": 4.614159107208252, + "learning_rate": 9.560210918689847e-05, + "loss": 2.9703, + "step": 4782 + }, + { + "epoch": 0.3210630515754505, + "grad_norm": 5.311845302581787, + "learning_rate": 9.559765105632117e-05, + "loss": 2.7765, + "step": 4784 + }, + { + "epoch": 0.3211972752592195, + "grad_norm": 6.025974750518799, + "learning_rate": 9.559319077134453e-05, + "loss": 2.6088, + "step": 4786 + }, + { + "epoch": 0.3213314989429885, + "grad_norm": 4.583115100860596, + "learning_rate": 9.558872833217927e-05, + "loss": 2.6978, + "step": 4788 + }, + { + "epoch": 0.3214657226267575, + "grad_norm": 4.147791385650635, + "learning_rate": 9.558426373903626e-05, + "loss": 2.7702, + "step": 4790 + }, + { + "epoch": 0.3215999463105265, + "grad_norm": 4.178574562072754, + "learning_rate": 9.557979699212642e-05, + "loss": 2.6456, + "step": 4792 + }, + { + "epoch": 0.3217341699942955, + "grad_norm": 5.9982829093933105, + "learning_rate": 9.557532809166079e-05, + "loss": 2.8206, + "step": 4794 + }, + { + "epoch": 0.32186839367806447, + "grad_norm": 4.453436374664307, + "learning_rate": 9.557085703785054e-05, + "loss": 3.152, + "step": 4796 + }, + { + "epoch": 0.3220026173618335, + "grad_norm": 4.6709208488464355, + "learning_rate": 9.55663838309069e-05, + "loss": 2.7787, + "step": 4798 + }, + { + "epoch": 0.3221368410456025, + "grad_norm": 4.28563117980957, + "learning_rate": 9.556190847104123e-05, + "loss": 2.9227, + "step": 4800 + }, + { + "epoch": 0.3222710647293715, + "grad_norm": 5.123575687408447, + "learning_rate": 9.555743095846497e-05, + "loss": 2.659, + "step": 4802 + }, + { + "epoch": 0.3224052884131405, + "grad_norm": 4.528499126434326, + "learning_rate": 9.555295129338969e-05, + "loss": 2.534, + "step": 4804 + }, + { + "epoch": 0.3225395120969095, + "grad_norm": 4.492948055267334, + "learning_rate": 9.554846947602704e-05, + "loss": 2.7698, + "step": 4806 + }, + { + "epoch": 0.3226737357806785, + "grad_norm": 4.553065299987793, + "learning_rate": 9.554398550658876e-05, + "loss": 2.7167, + "step": 4808 + }, + { + "epoch": 0.3228079594644475, + "grad_norm": 4.366567134857178, + "learning_rate": 9.553949938528675e-05, + "loss": 2.6404, + "step": 4810 + }, + { + "epoch": 0.3229421831482165, + "grad_norm": 4.57440710067749, + "learning_rate": 9.553501111233292e-05, + "loss": 2.7944, + "step": 4812 + }, + { + "epoch": 0.3230764068319855, + "grad_norm": 4.641818523406982, + "learning_rate": 9.553052068793937e-05, + "loss": 2.8811, + "step": 4814 + }, + { + "epoch": 0.3232106305157545, + "grad_norm": 4.205483436584473, + "learning_rate": 9.552602811231824e-05, + "loss": 2.8528, + "step": 4816 + }, + { + "epoch": 0.3233448541995235, + "grad_norm": 4.704224109649658, + "learning_rate": 9.552153338568181e-05, + "loss": 3.183, + "step": 4818 + }, + { + "epoch": 0.3234790778832925, + "grad_norm": 7.043558597564697, + "learning_rate": 9.551703650824243e-05, + "loss": 3.028, + "step": 4820 + }, + { + "epoch": 0.32361330156706153, + "grad_norm": 4.305962562561035, + "learning_rate": 9.551253748021259e-05, + "loss": 2.8371, + "step": 4822 + }, + { + "epoch": 0.3237475252508305, + "grad_norm": 4.649087905883789, + "learning_rate": 9.550803630180485e-05, + "loss": 2.8463, + "step": 4824 + }, + { + "epoch": 0.32388174893459953, + "grad_norm": 4.486215114593506, + "learning_rate": 9.55035329732319e-05, + "loss": 2.667, + "step": 4826 + }, + { + "epoch": 0.3240159726183685, + "grad_norm": 4.441422462463379, + "learning_rate": 9.549902749470646e-05, + "loss": 2.6491, + "step": 4828 + }, + { + "epoch": 0.32415019630213754, + "grad_norm": 4.837339401245117, + "learning_rate": 9.549451986644147e-05, + "loss": 2.7542, + "step": 4830 + }, + { + "epoch": 0.3242844199859065, + "grad_norm": 4.634901523590088, + "learning_rate": 9.549001008864987e-05, + "loss": 2.8391, + "step": 4832 + }, + { + "epoch": 0.3244186436696755, + "grad_norm": 4.521147727966309, + "learning_rate": 9.548549816154473e-05, + "loss": 2.9727, + "step": 4834 + }, + { + "epoch": 0.3245528673534445, + "grad_norm": 3.5688681602478027, + "learning_rate": 9.548098408533926e-05, + "loss": 2.4701, + "step": 4836 + }, + { + "epoch": 0.3246870910372135, + "grad_norm": 4.50125789642334, + "learning_rate": 9.547646786024673e-05, + "loss": 2.8135, + "step": 4838 + }, + { + "epoch": 0.3248213147209825, + "grad_norm": 5.272185802459717, + "learning_rate": 9.547194948648051e-05, + "loss": 2.7469, + "step": 4840 + }, + { + "epoch": 0.3249555384047515, + "grad_norm": 4.6086344718933105, + "learning_rate": 9.546742896425409e-05, + "loss": 3.0953, + "step": 4842 + }, + { + "epoch": 0.32508976208852053, + "grad_norm": 4.41192626953125, + "learning_rate": 9.546290629378107e-05, + "loss": 2.7742, + "step": 4844 + }, + { + "epoch": 0.3252239857722895, + "grad_norm": 3.9594902992248535, + "learning_rate": 9.545838147527512e-05, + "loss": 2.7775, + "step": 4846 + }, + { + "epoch": 0.32535820945605853, + "grad_norm": 5.814505577087402, + "learning_rate": 9.545385450895003e-05, + "loss": 2.8, + "step": 4848 + }, + { + "epoch": 0.3254924331398275, + "grad_norm": 4.324177265167236, + "learning_rate": 9.544932539501971e-05, + "loss": 2.671, + "step": 4850 + }, + { + "epoch": 0.32562665682359654, + "grad_norm": 5.848516464233398, + "learning_rate": 9.544479413369814e-05, + "loss": 2.8049, + "step": 4852 + }, + { + "epoch": 0.3257608805073655, + "grad_norm": 4.612092018127441, + "learning_rate": 9.54402607251994e-05, + "loss": 2.893, + "step": 4854 + }, + { + "epoch": 0.32589510419113454, + "grad_norm": 6.4037957191467285, + "learning_rate": 9.543572516973769e-05, + "loss": 2.4562, + "step": 4856 + }, + { + "epoch": 0.3260293278749035, + "grad_norm": 4.8241167068481445, + "learning_rate": 9.543118746752733e-05, + "loss": 2.6758, + "step": 4858 + }, + { + "epoch": 0.32616355155867255, + "grad_norm": 4.255184650421143, + "learning_rate": 9.542664761878269e-05, + "loss": 2.7484, + "step": 4860 + }, + { + "epoch": 0.3262977752424415, + "grad_norm": 4.2574005126953125, + "learning_rate": 9.542210562371828e-05, + "loss": 2.7609, + "step": 4862 + }, + { + "epoch": 0.32643199892621055, + "grad_norm": 4.476006984710693, + "learning_rate": 9.541756148254867e-05, + "loss": 2.689, + "step": 4864 + }, + { + "epoch": 0.32656622260997953, + "grad_norm": 5.071721076965332, + "learning_rate": 9.541301519548861e-05, + "loss": 2.8171, + "step": 4866 + }, + { + "epoch": 0.32670044629374856, + "grad_norm": 4.256276607513428, + "learning_rate": 9.54084667627529e-05, + "loss": 2.5153, + "step": 4868 + }, + { + "epoch": 0.32683466997751753, + "grad_norm": 4.756156921386719, + "learning_rate": 9.540391618455641e-05, + "loss": 2.841, + "step": 4870 + }, + { + "epoch": 0.3269688936612865, + "grad_norm": 4.809201717376709, + "learning_rate": 9.539936346111416e-05, + "loss": 3.0205, + "step": 4872 + }, + { + "epoch": 0.32710311734505554, + "grad_norm": 5.182834625244141, + "learning_rate": 9.539480859264128e-05, + "loss": 2.93, + "step": 4874 + }, + { + "epoch": 0.3272373410288245, + "grad_norm": 4.0786333084106445, + "learning_rate": 9.539025157935292e-05, + "loss": 2.9198, + "step": 4876 + }, + { + "epoch": 0.32737156471259354, + "grad_norm": 11.014899253845215, + "learning_rate": 9.538569242146447e-05, + "loss": 2.9465, + "step": 4878 + }, + { + "epoch": 0.3275057883963625, + "grad_norm": 4.420095443725586, + "learning_rate": 9.53811311191913e-05, + "loss": 2.8203, + "step": 4880 + }, + { + "epoch": 0.32764001208013155, + "grad_norm": 4.948727130889893, + "learning_rate": 9.53765676727489e-05, + "loss": 2.8129, + "step": 4882 + }, + { + "epoch": 0.3277742357639005, + "grad_norm": 4.2386698722839355, + "learning_rate": 9.537200208235291e-05, + "loss": 2.8553, + "step": 4884 + }, + { + "epoch": 0.32790845944766955, + "grad_norm": 5.146722316741943, + "learning_rate": 9.536743434821904e-05, + "loss": 2.6904, + "step": 4886 + }, + { + "epoch": 0.32804268313143853, + "grad_norm": 4.915933609008789, + "learning_rate": 9.536286447056311e-05, + "loss": 3.0697, + "step": 4888 + }, + { + "epoch": 0.32817690681520756, + "grad_norm": 4.858584880828857, + "learning_rate": 9.535829244960104e-05, + "loss": 2.8818, + "step": 4890 + }, + { + "epoch": 0.32831113049897653, + "grad_norm": 4.175931453704834, + "learning_rate": 9.535371828554884e-05, + "loss": 2.82, + "step": 4892 + }, + { + "epoch": 0.32844535418274556, + "grad_norm": 4.182652950286865, + "learning_rate": 9.534914197862266e-05, + "loss": 2.6842, + "step": 4894 + }, + { + "epoch": 0.32857957786651454, + "grad_norm": 4.424933910369873, + "learning_rate": 9.534456352903866e-05, + "loss": 2.7674, + "step": 4896 + }, + { + "epoch": 0.32871380155028357, + "grad_norm": 4.7122883796691895, + "learning_rate": 9.533998293701323e-05, + "loss": 2.6934, + "step": 4898 + }, + { + "epoch": 0.32884802523405254, + "grad_norm": 4.621822834014893, + "learning_rate": 9.533540020276274e-05, + "loss": 2.9068, + "step": 4900 + }, + { + "epoch": 0.3289822489178216, + "grad_norm": 6.582426071166992, + "learning_rate": 9.533081532650375e-05, + "loss": 2.643, + "step": 4902 + }, + { + "epoch": 0.32911647260159055, + "grad_norm": 4.253739356994629, + "learning_rate": 9.532622830845287e-05, + "loss": 2.7006, + "step": 4904 + }, + { + "epoch": 0.3292506962853596, + "grad_norm": 4.99995231628418, + "learning_rate": 9.532163914882685e-05, + "loss": 2.7146, + "step": 4906 + }, + { + "epoch": 0.32938491996912855, + "grad_norm": 3.9720499515533447, + "learning_rate": 9.531704784784248e-05, + "loss": 2.5664, + "step": 4908 + }, + { + "epoch": 0.32951914365289753, + "grad_norm": 4.1222147941589355, + "learning_rate": 9.531245440571672e-05, + "loss": 2.5348, + "step": 4910 + }, + { + "epoch": 0.32965336733666656, + "grad_norm": 5.0071892738342285, + "learning_rate": 9.53078588226666e-05, + "loss": 2.698, + "step": 4912 + }, + { + "epoch": 0.32978759102043553, + "grad_norm": 4.019351959228516, + "learning_rate": 9.530326109890924e-05, + "loss": 2.5263, + "step": 4914 + }, + { + "epoch": 0.32992181470420456, + "grad_norm": 6.58957576751709, + "learning_rate": 9.529866123466187e-05, + "loss": 2.7356, + "step": 4916 + }, + { + "epoch": 0.33005603838797354, + "grad_norm": 4.0144853591918945, + "learning_rate": 9.529405923014183e-05, + "loss": 2.4392, + "step": 4918 + }, + { + "epoch": 0.33019026207174257, + "grad_norm": 4.585607528686523, + "learning_rate": 9.528945508556656e-05, + "loss": 2.8313, + "step": 4920 + }, + { + "epoch": 0.33032448575551154, + "grad_norm": 4.080796241760254, + "learning_rate": 9.528484880115361e-05, + "loss": 2.6604, + "step": 4922 + }, + { + "epoch": 0.3304587094392806, + "grad_norm": 5.4585723876953125, + "learning_rate": 9.52802403771206e-05, + "loss": 3.4678, + "step": 4924 + }, + { + "epoch": 0.33059293312304955, + "grad_norm": 4.997231483459473, + "learning_rate": 9.527562981368525e-05, + "loss": 2.7336, + "step": 4926 + }, + { + "epoch": 0.3307271568068186, + "grad_norm": 4.641803741455078, + "learning_rate": 9.527101711106546e-05, + "loss": 2.6091, + "step": 4928 + }, + { + "epoch": 0.33086138049058755, + "grad_norm": 4.6295270919799805, + "learning_rate": 9.52664022694791e-05, + "loss": 2.9819, + "step": 4930 + }, + { + "epoch": 0.3309956041743566, + "grad_norm": 5.65504789352417, + "learning_rate": 9.526178528914425e-05, + "loss": 2.7159, + "step": 4932 + }, + { + "epoch": 0.33112982785812556, + "grad_norm": 4.392491340637207, + "learning_rate": 9.525716617027906e-05, + "loss": 2.7255, + "step": 4934 + }, + { + "epoch": 0.3312640515418946, + "grad_norm": 4.2046356201171875, + "learning_rate": 9.525254491310176e-05, + "loss": 2.5555, + "step": 4936 + }, + { + "epoch": 0.33139827522566356, + "grad_norm": 4.140669822692871, + "learning_rate": 9.524792151783069e-05, + "loss": 2.5779, + "step": 4938 + }, + { + "epoch": 0.3315324989094326, + "grad_norm": 3.931997537612915, + "learning_rate": 9.524329598468431e-05, + "loss": 2.5234, + "step": 4940 + }, + { + "epoch": 0.33166672259320157, + "grad_norm": 4.813669204711914, + "learning_rate": 9.523866831388116e-05, + "loss": 2.944, + "step": 4942 + }, + { + "epoch": 0.3318009462769706, + "grad_norm": 4.3032050132751465, + "learning_rate": 9.52340385056399e-05, + "loss": 2.6268, + "step": 4944 + }, + { + "epoch": 0.3319351699607396, + "grad_norm": 4.668279647827148, + "learning_rate": 9.522940656017926e-05, + "loss": 2.7964, + "step": 4946 + }, + { + "epoch": 0.33206939364450855, + "grad_norm": 4.316313743591309, + "learning_rate": 9.52247724777181e-05, + "loss": 2.5879, + "step": 4948 + }, + { + "epoch": 0.3322036173282776, + "grad_norm": 3.8818910121917725, + "learning_rate": 9.522013625847537e-05, + "loss": 2.8235, + "step": 4950 + }, + { + "epoch": 0.33233784101204655, + "grad_norm": 4.990545272827148, + "learning_rate": 9.521549790267013e-05, + "loss": 2.6953, + "step": 4952 + }, + { + "epoch": 0.3324720646958156, + "grad_norm": 4.406045436859131, + "learning_rate": 9.521085741052152e-05, + "loss": 2.6551, + "step": 4954 + }, + { + "epoch": 0.33260628837958456, + "grad_norm": 4.863401889801025, + "learning_rate": 9.52062147822488e-05, + "loss": 2.7477, + "step": 4956 + }, + { + "epoch": 0.3327405120633536, + "grad_norm": 4.337003707885742, + "learning_rate": 9.520157001807133e-05, + "loss": 2.9053, + "step": 4958 + }, + { + "epoch": 0.33287473574712256, + "grad_norm": 4.516232490539551, + "learning_rate": 9.519692311820856e-05, + "loss": 2.5521, + "step": 4960 + }, + { + "epoch": 0.3330089594308916, + "grad_norm": 4.095492839813232, + "learning_rate": 9.519227408288006e-05, + "loss": 2.7516, + "step": 4962 + }, + { + "epoch": 0.33314318311466057, + "grad_norm": 5.608027458190918, + "learning_rate": 9.518762291230546e-05, + "loss": 2.7992, + "step": 4964 + }, + { + "epoch": 0.3332774067984296, + "grad_norm": 4.126688003540039, + "learning_rate": 9.518296960670455e-05, + "loss": 2.7674, + "step": 4966 + }, + { + "epoch": 0.3334116304821986, + "grad_norm": 5.609880447387695, + "learning_rate": 9.517831416629716e-05, + "loss": 2.7771, + "step": 4968 + }, + { + "epoch": 0.3335458541659676, + "grad_norm": 4.962559223175049, + "learning_rate": 9.517365659130326e-05, + "loss": 2.6753, + "step": 4970 + }, + { + "epoch": 0.3336800778497366, + "grad_norm": 4.403448581695557, + "learning_rate": 9.516899688194294e-05, + "loss": 2.6169, + "step": 4972 + }, + { + "epoch": 0.3338143015335056, + "grad_norm": 4.363509654998779, + "learning_rate": 9.516433503843631e-05, + "loss": 2.8962, + "step": 4974 + }, + { + "epoch": 0.3339485252172746, + "grad_norm": 4.22034215927124, + "learning_rate": 9.515967106100368e-05, + "loss": 2.9095, + "step": 4976 + }, + { + "epoch": 0.3340827489010436, + "grad_norm": 4.660727500915527, + "learning_rate": 9.515500494986541e-05, + "loss": 2.8022, + "step": 4978 + }, + { + "epoch": 0.3342169725848126, + "grad_norm": 4.701873302459717, + "learning_rate": 9.515033670524192e-05, + "loss": 2.7648, + "step": 4980 + }, + { + "epoch": 0.3343511962685816, + "grad_norm": 4.948964595794678, + "learning_rate": 9.514566632735382e-05, + "loss": 2.6042, + "step": 4982 + }, + { + "epoch": 0.3344854199523506, + "grad_norm": 4.810503959655762, + "learning_rate": 9.514099381642175e-05, + "loss": 2.7674, + "step": 4984 + }, + { + "epoch": 0.33461964363611957, + "grad_norm": 5.058028697967529, + "learning_rate": 9.51363191726665e-05, + "loss": 2.9123, + "step": 4986 + }, + { + "epoch": 0.3347538673198886, + "grad_norm": 3.9013426303863525, + "learning_rate": 9.513164239630891e-05, + "loss": 2.5291, + "step": 4988 + }, + { + "epoch": 0.3348880910036576, + "grad_norm": 4.769268035888672, + "learning_rate": 9.512696348756997e-05, + "loss": 2.7862, + "step": 4990 + }, + { + "epoch": 0.3350223146874266, + "grad_norm": 4.717803478240967, + "learning_rate": 9.512228244667076e-05, + "loss": 2.7518, + "step": 4992 + }, + { + "epoch": 0.3351565383711956, + "grad_norm": 4.679978370666504, + "learning_rate": 9.511759927383243e-05, + "loss": 2.7379, + "step": 4994 + }, + { + "epoch": 0.3352907620549646, + "grad_norm": 4.145959377288818, + "learning_rate": 9.511291396927625e-05, + "loss": 2.7921, + "step": 4996 + }, + { + "epoch": 0.3354249857387336, + "grad_norm": 4.466618061065674, + "learning_rate": 9.510822653322359e-05, + "loss": 2.6485, + "step": 4998 + }, + { + "epoch": 0.3355592094225026, + "grad_norm": 4.304198741912842, + "learning_rate": 9.510353696589593e-05, + "loss": 2.6605, + "step": 5000 + }, + { + "epoch": 0.3356934331062716, + "grad_norm": 5.271279811859131, + "learning_rate": 9.509884526751485e-05, + "loss": 2.8201, + "step": 5002 + }, + { + "epoch": 0.3358276567900406, + "grad_norm": 5.154038429260254, + "learning_rate": 9.5094151438302e-05, + "loss": 2.9473, + "step": 5004 + }, + { + "epoch": 0.3359618804738096, + "grad_norm": 4.2331719398498535, + "learning_rate": 9.508945547847916e-05, + "loss": 2.9103, + "step": 5006 + }, + { + "epoch": 0.3360961041575786, + "grad_norm": 6.93991756439209, + "learning_rate": 9.508475738826823e-05, + "loss": 2.9778, + "step": 5008 + }, + { + "epoch": 0.3362303278413476, + "grad_norm": 8.204298973083496, + "learning_rate": 9.508005716789117e-05, + "loss": 2.8712, + "step": 5010 + }, + { + "epoch": 0.33636455152511663, + "grad_norm": 4.0471296310424805, + "learning_rate": 9.507535481757005e-05, + "loss": 3.0377, + "step": 5012 + }, + { + "epoch": 0.3364987752088856, + "grad_norm": 5.4011430740356445, + "learning_rate": 9.507065033752704e-05, + "loss": 2.9144, + "step": 5014 + }, + { + "epoch": 0.33663299889265463, + "grad_norm": 4.217594623565674, + "learning_rate": 9.506594372798446e-05, + "loss": 2.8843, + "step": 5016 + }, + { + "epoch": 0.3367672225764236, + "grad_norm": 4.35903787612915, + "learning_rate": 9.506123498916463e-05, + "loss": 2.538, + "step": 5018 + }, + { + "epoch": 0.33690144626019264, + "grad_norm": 6.514352798461914, + "learning_rate": 9.505652412129008e-05, + "loss": 2.5834, + "step": 5020 + }, + { + "epoch": 0.3370356699439616, + "grad_norm": 5.0609517097473145, + "learning_rate": 9.505181112458335e-05, + "loss": 2.8621, + "step": 5022 + }, + { + "epoch": 0.3371698936277306, + "grad_norm": 5.007692813873291, + "learning_rate": 9.504709599926712e-05, + "loss": 2.7016, + "step": 5024 + }, + { + "epoch": 0.3373041173114996, + "grad_norm": 4.268667221069336, + "learning_rate": 9.504237874556421e-05, + "loss": 2.7879, + "step": 5026 + }, + { + "epoch": 0.3374383409952686, + "grad_norm": 3.9460203647613525, + "learning_rate": 9.503765936369746e-05, + "loss": 2.6661, + "step": 5028 + }, + { + "epoch": 0.3375725646790376, + "grad_norm": 4.544965744018555, + "learning_rate": 9.503293785388987e-05, + "loss": 2.7891, + "step": 5030 + }, + { + "epoch": 0.3377067883628066, + "grad_norm": 4.450967311859131, + "learning_rate": 9.502821421636454e-05, + "loss": 2.8056, + "step": 5032 + }, + { + "epoch": 0.33784101204657563, + "grad_norm": 4.375290393829346, + "learning_rate": 9.502348845134461e-05, + "loss": 2.6507, + "step": 5034 + }, + { + "epoch": 0.3379752357303446, + "grad_norm": 4.243753910064697, + "learning_rate": 9.501876055905339e-05, + "loss": 2.8748, + "step": 5036 + }, + { + "epoch": 0.33810945941411363, + "grad_norm": 4.907501220703125, + "learning_rate": 9.501403053971427e-05, + "loss": 2.7849, + "step": 5038 + }, + { + "epoch": 0.3382436830978826, + "grad_norm": 4.474222660064697, + "learning_rate": 9.500929839355071e-05, + "loss": 2.4631, + "step": 5040 + }, + { + "epoch": 0.33837790678165164, + "grad_norm": 4.687340259552002, + "learning_rate": 9.500456412078631e-05, + "loss": 2.8247, + "step": 5042 + }, + { + "epoch": 0.3385121304654206, + "grad_norm": 4.601787567138672, + "learning_rate": 9.499982772164475e-05, + "loss": 2.8404, + "step": 5044 + }, + { + "epoch": 0.33864635414918964, + "grad_norm": 4.496009349822998, + "learning_rate": 9.499508919634983e-05, + "loss": 2.7328, + "step": 5046 + }, + { + "epoch": 0.3387805778329586, + "grad_norm": 4.428960800170898, + "learning_rate": 9.499034854512542e-05, + "loss": 2.8412, + "step": 5048 + }, + { + "epoch": 0.33891480151672765, + "grad_norm": 4.580824851989746, + "learning_rate": 9.49856057681955e-05, + "loss": 2.8881, + "step": 5050 + }, + { + "epoch": 0.3390490252004966, + "grad_norm": 6.0238471031188965, + "learning_rate": 9.498086086578418e-05, + "loss": 2.9743, + "step": 5052 + }, + { + "epoch": 0.33918324888426565, + "grad_norm": 4.326607704162598, + "learning_rate": 9.497611383811564e-05, + "loss": 2.7712, + "step": 5054 + }, + { + "epoch": 0.33931747256803463, + "grad_norm": 4.26543664932251, + "learning_rate": 9.497136468541415e-05, + "loss": 2.5032, + "step": 5056 + }, + { + "epoch": 0.33945169625180366, + "grad_norm": 4.853246212005615, + "learning_rate": 9.496661340790411e-05, + "loss": 2.8087, + "step": 5058 + }, + { + "epoch": 0.33958591993557263, + "grad_norm": 4.0450215339660645, + "learning_rate": 9.496186000581002e-05, + "loss": 2.8986, + "step": 5060 + }, + { + "epoch": 0.3397201436193416, + "grad_norm": 5.164757251739502, + "learning_rate": 9.495710447935646e-05, + "loss": 2.7072, + "step": 5062 + }, + { + "epoch": 0.33985436730311064, + "grad_norm": 4.453521251678467, + "learning_rate": 9.49523468287681e-05, + "loss": 3.0229, + "step": 5064 + }, + { + "epoch": 0.3399885909868796, + "grad_norm": 7.14592981338501, + "learning_rate": 9.494758705426978e-05, + "loss": 2.829, + "step": 5066 + }, + { + "epoch": 0.34012281467064864, + "grad_norm": 4.829479217529297, + "learning_rate": 9.494282515608632e-05, + "loss": 2.761, + "step": 5068 + }, + { + "epoch": 0.3402570383544176, + "grad_norm": 4.307732105255127, + "learning_rate": 9.493806113444277e-05, + "loss": 2.9306, + "step": 5070 + }, + { + "epoch": 0.34039126203818665, + "grad_norm": 6.544069766998291, + "learning_rate": 9.493329498956421e-05, + "loss": 2.5722, + "step": 5072 + }, + { + "epoch": 0.3405254857219556, + "grad_norm": 4.5266194343566895, + "learning_rate": 9.492852672167578e-05, + "loss": 2.6071, + "step": 5074 + }, + { + "epoch": 0.34065970940572465, + "grad_norm": 5.099391937255859, + "learning_rate": 9.492375633100283e-05, + "loss": 2.5488, + "step": 5076 + }, + { + "epoch": 0.34079393308949363, + "grad_norm": 4.947330951690674, + "learning_rate": 9.491898381777074e-05, + "loss": 2.6016, + "step": 5078 + }, + { + "epoch": 0.34092815677326266, + "grad_norm": 4.224489212036133, + "learning_rate": 9.4914209182205e-05, + "loss": 2.8744, + "step": 5080 + }, + { + "epoch": 0.34106238045703163, + "grad_norm": 4.396609306335449, + "learning_rate": 9.490943242453118e-05, + "loss": 2.7889, + "step": 5082 + }, + { + "epoch": 0.34119660414080066, + "grad_norm": 4.186410903930664, + "learning_rate": 9.4904653544975e-05, + "loss": 2.5274, + "step": 5084 + }, + { + "epoch": 0.34133082782456964, + "grad_norm": 4.43904972076416, + "learning_rate": 9.489987254376222e-05, + "loss": 2.8087, + "step": 5086 + }, + { + "epoch": 0.34146505150833867, + "grad_norm": 4.645698547363281, + "learning_rate": 9.489508942111878e-05, + "loss": 2.7804, + "step": 5088 + }, + { + "epoch": 0.34159927519210764, + "grad_norm": 5.521573066711426, + "learning_rate": 9.489030417727063e-05, + "loss": 2.8211, + "step": 5090 + }, + { + "epoch": 0.3417334988758767, + "grad_norm": 17.015960693359375, + "learning_rate": 9.488551681244388e-05, + "loss": 2.7495, + "step": 5092 + }, + { + "epoch": 0.34186772255964565, + "grad_norm": 4.375091552734375, + "learning_rate": 9.488072732686474e-05, + "loss": 2.71, + "step": 5094 + }, + { + "epoch": 0.3420019462434146, + "grad_norm": 4.099365711212158, + "learning_rate": 9.487593572075948e-05, + "loss": 2.6125, + "step": 5096 + }, + { + "epoch": 0.34213616992718365, + "grad_norm": 5.044370651245117, + "learning_rate": 9.48711419943545e-05, + "loss": 2.944, + "step": 5098 + }, + { + "epoch": 0.34227039361095263, + "grad_norm": 4.389590263366699, + "learning_rate": 9.486634614787631e-05, + "loss": 2.6168, + "step": 5100 + }, + { + "epoch": 0.34240461729472166, + "grad_norm": 4.647300720214844, + "learning_rate": 9.486154818155146e-05, + "loss": 2.8035, + "step": 5102 + }, + { + "epoch": 0.34253884097849063, + "grad_norm": 3.9734420776367188, + "learning_rate": 9.485674809560669e-05, + "loss": 2.6524, + "step": 5104 + }, + { + "epoch": 0.34267306466225966, + "grad_norm": 4.570618629455566, + "learning_rate": 9.485194589026878e-05, + "loss": 2.7589, + "step": 5106 + }, + { + "epoch": 0.34280728834602864, + "grad_norm": 3.9199256896972656, + "learning_rate": 9.484714156576464e-05, + "loss": 2.7801, + "step": 5108 + }, + { + "epoch": 0.34294151202979767, + "grad_norm": 4.850030899047852, + "learning_rate": 9.484233512232123e-05, + "loss": 2.9329, + "step": 5110 + }, + { + "epoch": 0.34307573571356664, + "grad_norm": 4.383432865142822, + "learning_rate": 9.483752656016567e-05, + "loss": 3.0824, + "step": 5112 + }, + { + "epoch": 0.3432099593973357, + "grad_norm": 4.5393218994140625, + "learning_rate": 9.483271587952515e-05, + "loss": 2.7734, + "step": 5114 + }, + { + "epoch": 0.34334418308110465, + "grad_norm": 4.235030651092529, + "learning_rate": 9.482790308062697e-05, + "loss": 3.0433, + "step": 5116 + }, + { + "epoch": 0.3434784067648737, + "grad_norm": 4.200363636016846, + "learning_rate": 9.48230881636985e-05, + "loss": 3.0068, + "step": 5118 + }, + { + "epoch": 0.34361263044864265, + "grad_norm": 5.202736854553223, + "learning_rate": 9.481827112896727e-05, + "loss": 2.9725, + "step": 5120 + }, + { + "epoch": 0.3437468541324117, + "grad_norm": 3.952399492263794, + "learning_rate": 9.481345197666087e-05, + "loss": 2.6147, + "step": 5122 + }, + { + "epoch": 0.34388107781618066, + "grad_norm": 7.118772029876709, + "learning_rate": 9.480863070700696e-05, + "loss": 2.6495, + "step": 5124 + }, + { + "epoch": 0.3440153014999497, + "grad_norm": 4.474292755126953, + "learning_rate": 9.480380732023338e-05, + "loss": 2.8797, + "step": 5126 + }, + { + "epoch": 0.34414952518371866, + "grad_norm": 5.272939682006836, + "learning_rate": 9.479898181656801e-05, + "loss": 2.7917, + "step": 5128 + }, + { + "epoch": 0.3442837488674877, + "grad_norm": 4.653070449829102, + "learning_rate": 9.479415419623883e-05, + "loss": 3.1653, + "step": 5130 + }, + { + "epoch": 0.34441797255125667, + "grad_norm": 4.605733871459961, + "learning_rate": 9.478932445947395e-05, + "loss": 2.6711, + "step": 5132 + }, + { + "epoch": 0.34455219623502564, + "grad_norm": 4.8246989250183105, + "learning_rate": 9.478449260650158e-05, + "loss": 2.8562, + "step": 5134 + }, + { + "epoch": 0.3446864199187947, + "grad_norm": 5.191328525543213, + "learning_rate": 9.477965863754998e-05, + "loss": 2.7466, + "step": 5136 + }, + { + "epoch": 0.34482064360256365, + "grad_norm": 4.576777458190918, + "learning_rate": 9.477482255284757e-05, + "loss": 3.0467, + "step": 5138 + }, + { + "epoch": 0.3449548672863327, + "grad_norm": 4.5528059005737305, + "learning_rate": 9.476998435262284e-05, + "loss": 2.5898, + "step": 5140 + }, + { + "epoch": 0.34508909097010165, + "grad_norm": 4.129106044769287, + "learning_rate": 9.476514403710439e-05, + "loss": 2.9768, + "step": 5142 + }, + { + "epoch": 0.3452233146538707, + "grad_norm": 4.683798313140869, + "learning_rate": 9.476030160652091e-05, + "loss": 2.4687, + "step": 5144 + }, + { + "epoch": 0.34535753833763966, + "grad_norm": 4.225332260131836, + "learning_rate": 9.475545706110119e-05, + "loss": 2.988, + "step": 5146 + }, + { + "epoch": 0.3454917620214087, + "grad_norm": 4.068180561065674, + "learning_rate": 9.475061040107414e-05, + "loss": 2.8428, + "step": 5148 + }, + { + "epoch": 0.34562598570517766, + "grad_norm": 5.385563373565674, + "learning_rate": 9.474576162666874e-05, + "loss": 2.7471, + "step": 5150 + }, + { + "epoch": 0.3457602093889467, + "grad_norm": 5.344685077667236, + "learning_rate": 9.474091073811409e-05, + "loss": 2.7372, + "step": 5152 + }, + { + "epoch": 0.34589443307271567, + "grad_norm": 5.769327640533447, + "learning_rate": 9.47360577356394e-05, + "loss": 2.8958, + "step": 5154 + }, + { + "epoch": 0.3460286567564847, + "grad_norm": 4.722472190856934, + "learning_rate": 9.473120261947395e-05, + "loss": 2.8665, + "step": 5156 + }, + { + "epoch": 0.3461628804402537, + "grad_norm": 3.978144407272339, + "learning_rate": 9.472634538984712e-05, + "loss": 2.7455, + "step": 5158 + }, + { + "epoch": 0.3462971041240227, + "grad_norm": 4.771278381347656, + "learning_rate": 9.472148604698843e-05, + "loss": 3.1132, + "step": 5160 + }, + { + "epoch": 0.3464313278077917, + "grad_norm": 4.193169593811035, + "learning_rate": 9.471662459112747e-05, + "loss": 2.9934, + "step": 5162 + }, + { + "epoch": 0.3465655514915607, + "grad_norm": 4.225020885467529, + "learning_rate": 9.471176102249393e-05, + "loss": 2.3135, + "step": 5164 + }, + { + "epoch": 0.3466997751753297, + "grad_norm": 4.955338478088379, + "learning_rate": 9.47068953413176e-05, + "loss": 2.8721, + "step": 5166 + }, + { + "epoch": 0.3468339988590987, + "grad_norm": 4.997450828552246, + "learning_rate": 9.470202754782837e-05, + "loss": 2.5393, + "step": 5168 + }, + { + "epoch": 0.3469682225428677, + "grad_norm": 4.580684661865234, + "learning_rate": 9.469715764225626e-05, + "loss": 2.7071, + "step": 5170 + }, + { + "epoch": 0.34710244622663666, + "grad_norm": 3.8439831733703613, + "learning_rate": 9.469228562483132e-05, + "loss": 2.5096, + "step": 5172 + }, + { + "epoch": 0.3472366699104057, + "grad_norm": 4.156217575073242, + "learning_rate": 9.468741149578379e-05, + "loss": 2.7572, + "step": 5174 + }, + { + "epoch": 0.34737089359417467, + "grad_norm": 4.6566290855407715, + "learning_rate": 9.468253525534393e-05, + "loss": 2.9012, + "step": 5176 + }, + { + "epoch": 0.3475051172779437, + "grad_norm": 4.5853657722473145, + "learning_rate": 9.467765690374214e-05, + "loss": 2.6721, + "step": 5178 + }, + { + "epoch": 0.3476393409617127, + "grad_norm": 4.8563232421875, + "learning_rate": 9.467277644120893e-05, + "loss": 2.9283, + "step": 5180 + }, + { + "epoch": 0.3477735646454817, + "grad_norm": 5.039611339569092, + "learning_rate": 9.466789386797486e-05, + "loss": 2.82, + "step": 5182 + }, + { + "epoch": 0.3479077883292507, + "grad_norm": 5.415084362030029, + "learning_rate": 9.466300918427065e-05, + "loss": 2.9119, + "step": 5184 + }, + { + "epoch": 0.3480420120130197, + "grad_norm": 4.202699661254883, + "learning_rate": 9.465812239032708e-05, + "loss": 2.5768, + "step": 5186 + }, + { + "epoch": 0.3481762356967887, + "grad_norm": 4.253652572631836, + "learning_rate": 9.465323348637505e-05, + "loss": 2.6247, + "step": 5188 + }, + { + "epoch": 0.3483104593805577, + "grad_norm": 5.695315361022949, + "learning_rate": 9.464834247264553e-05, + "loss": 2.6897, + "step": 5190 + }, + { + "epoch": 0.3484446830643267, + "grad_norm": 4.938042640686035, + "learning_rate": 9.464344934936964e-05, + "loss": 2.651, + "step": 5192 + }, + { + "epoch": 0.3485789067480957, + "grad_norm": 4.47199010848999, + "learning_rate": 9.463855411677856e-05, + "loss": 2.7808, + "step": 5194 + }, + { + "epoch": 0.3487131304318647, + "grad_norm": 4.494866847991943, + "learning_rate": 9.463365677510357e-05, + "loss": 3.0175, + "step": 5196 + }, + { + "epoch": 0.3488473541156337, + "grad_norm": 4.403122901916504, + "learning_rate": 9.462875732457606e-05, + "loss": 2.7012, + "step": 5198 + }, + { + "epoch": 0.3489815777994027, + "grad_norm": 4.369345188140869, + "learning_rate": 9.462385576542752e-05, + "loss": 2.6787, + "step": 5200 + }, + { + "epoch": 0.34911580148317173, + "grad_norm": 4.274954795837402, + "learning_rate": 9.461895209788956e-05, + "loss": 2.8005, + "step": 5202 + }, + { + "epoch": 0.3492500251669407, + "grad_norm": 4.075732231140137, + "learning_rate": 9.461404632219384e-05, + "loss": 2.508, + "step": 5204 + }, + { + "epoch": 0.34938424885070973, + "grad_norm": 11.497817039489746, + "learning_rate": 9.460913843857217e-05, + "loss": 2.9328, + "step": 5206 + }, + { + "epoch": 0.3495184725344787, + "grad_norm": 4.008476257324219, + "learning_rate": 9.460422844725642e-05, + "loss": 2.7772, + "step": 5208 + }, + { + "epoch": 0.3496526962182477, + "grad_norm": 5.075708389282227, + "learning_rate": 9.459931634847859e-05, + "loss": 3.0062, + "step": 5210 + }, + { + "epoch": 0.3497869199020167, + "grad_norm": 4.063309192657471, + "learning_rate": 9.459440214247077e-05, + "loss": 2.4124, + "step": 5212 + }, + { + "epoch": 0.3499211435857857, + "grad_norm": 3.785775899887085, + "learning_rate": 9.458948582946514e-05, + "loss": 2.62, + "step": 5214 + }, + { + "epoch": 0.3500553672695547, + "grad_norm": 4.124805927276611, + "learning_rate": 9.458456740969397e-05, + "loss": 2.5921, + "step": 5216 + }, + { + "epoch": 0.3501895909533237, + "grad_norm": 3.904207944869995, + "learning_rate": 9.457964688338967e-05, + "loss": 2.6288, + "step": 5218 + }, + { + "epoch": 0.3503238146370927, + "grad_norm": 5.318083763122559, + "learning_rate": 9.457472425078473e-05, + "loss": 2.9289, + "step": 5220 + }, + { + "epoch": 0.3504580383208617, + "grad_norm": 4.559274673461914, + "learning_rate": 9.456979951211172e-05, + "loss": 2.3351, + "step": 5222 + }, + { + "epoch": 0.35059226200463073, + "grad_norm": 4.2297139167785645, + "learning_rate": 9.45648726676033e-05, + "loss": 2.7949, + "step": 5224 + }, + { + "epoch": 0.3507264856883997, + "grad_norm": 4.51598596572876, + "learning_rate": 9.455994371749231e-05, + "loss": 2.5876, + "step": 5226 + }, + { + "epoch": 0.35086070937216873, + "grad_norm": 7.814706325531006, + "learning_rate": 9.45550126620116e-05, + "loss": 2.8794, + "step": 5228 + }, + { + "epoch": 0.3509949330559377, + "grad_norm": 17.153701782226562, + "learning_rate": 9.455007950139412e-05, + "loss": 2.795, + "step": 5230 + }, + { + "epoch": 0.35112915673970674, + "grad_norm": 4.0703349113464355, + "learning_rate": 9.454514423587301e-05, + "loss": 2.652, + "step": 5232 + }, + { + "epoch": 0.3512633804234757, + "grad_norm": 14.859977722167969, + "learning_rate": 9.454020686568143e-05, + "loss": 2.9188, + "step": 5234 + }, + { + "epoch": 0.35139760410724474, + "grad_norm": 4.886653423309326, + "learning_rate": 9.453526739105267e-05, + "loss": 2.7369, + "step": 5236 + }, + { + "epoch": 0.3515318277910137, + "grad_norm": 4.016383647918701, + "learning_rate": 9.45303258122201e-05, + "loss": 2.4834, + "step": 5238 + }, + { + "epoch": 0.35166605147478275, + "grad_norm": 7.229006767272949, + "learning_rate": 9.452538212941719e-05, + "loss": 2.7814, + "step": 5240 + }, + { + "epoch": 0.3518002751585517, + "grad_norm": 3.78702974319458, + "learning_rate": 9.452043634287753e-05, + "loss": 2.8475, + "step": 5242 + }, + { + "epoch": 0.35193449884232075, + "grad_norm": 4.467276573181152, + "learning_rate": 9.45154884528348e-05, + "loss": 2.8422, + "step": 5244 + }, + { + "epoch": 0.35206872252608973, + "grad_norm": 6.788466930389404, + "learning_rate": 9.451053845952278e-05, + "loss": 2.7141, + "step": 5246 + }, + { + "epoch": 0.3522029462098587, + "grad_norm": 4.399141788482666, + "learning_rate": 9.450558636317533e-05, + "loss": 2.6074, + "step": 5248 + }, + { + "epoch": 0.35233716989362773, + "grad_norm": 4.6371235847473145, + "learning_rate": 9.450063216402644e-05, + "loss": 3.1104, + "step": 5250 + }, + { + "epoch": 0.3524713935773967, + "grad_norm": 4.552755832672119, + "learning_rate": 9.44956758623102e-05, + "loss": 2.66, + "step": 5252 + }, + { + "epoch": 0.35260561726116574, + "grad_norm": 4.298757553100586, + "learning_rate": 9.44907174582608e-05, + "loss": 2.8818, + "step": 5254 + }, + { + "epoch": 0.3527398409449347, + "grad_norm": 4.821155548095703, + "learning_rate": 9.448575695211244e-05, + "loss": 2.7901, + "step": 5256 + }, + { + "epoch": 0.35287406462870374, + "grad_norm": 5.045724868774414, + "learning_rate": 9.448079434409956e-05, + "loss": 2.5796, + "step": 5258 + }, + { + "epoch": 0.3530082883124727, + "grad_norm": 5.395529270172119, + "learning_rate": 9.447582963445663e-05, + "loss": 2.6229, + "step": 5260 + }, + { + "epoch": 0.35314251199624175, + "grad_norm": 7.896290302276611, + "learning_rate": 9.447086282341818e-05, + "loss": 2.5701, + "step": 5262 + }, + { + "epoch": 0.3532767356800107, + "grad_norm": 7.765769004821777, + "learning_rate": 9.446589391121893e-05, + "loss": 2.844, + "step": 5264 + }, + { + "epoch": 0.35341095936377975, + "grad_norm": 4.298630714416504, + "learning_rate": 9.446092289809361e-05, + "loss": 2.617, + "step": 5266 + }, + { + "epoch": 0.35354518304754873, + "grad_norm": 4.214929580688477, + "learning_rate": 9.445594978427714e-05, + "loss": 2.6354, + "step": 5268 + }, + { + "epoch": 0.35367940673131776, + "grad_norm": 4.784689903259277, + "learning_rate": 9.445097457000444e-05, + "loss": 3.0112, + "step": 5270 + }, + { + "epoch": 0.35381363041508673, + "grad_norm": 4.239730358123779, + "learning_rate": 9.444599725551061e-05, + "loss": 2.8854, + "step": 5272 + }, + { + "epoch": 0.35394785409885576, + "grad_norm": 4.278818607330322, + "learning_rate": 9.444101784103082e-05, + "loss": 2.7838, + "step": 5274 + }, + { + "epoch": 0.35408207778262474, + "grad_norm": 4.070528984069824, + "learning_rate": 9.443603632680031e-05, + "loss": 2.5584, + "step": 5276 + }, + { + "epoch": 0.35421630146639377, + "grad_norm": 10.919440269470215, + "learning_rate": 9.443105271305445e-05, + "loss": 2.6862, + "step": 5278 + }, + { + "epoch": 0.35435052515016274, + "grad_norm": 4.905244827270508, + "learning_rate": 9.442606700002874e-05, + "loss": 2.6465, + "step": 5280 + }, + { + "epoch": 0.3544847488339318, + "grad_norm": 5.141050338745117, + "learning_rate": 9.442107918795873e-05, + "loss": 2.8772, + "step": 5282 + }, + { + "epoch": 0.35461897251770075, + "grad_norm": 6.065023422241211, + "learning_rate": 9.441608927708006e-05, + "loss": 2.9803, + "step": 5284 + }, + { + "epoch": 0.3547531962014697, + "grad_norm": 6.407088279724121, + "learning_rate": 9.441109726762852e-05, + "loss": 2.9665, + "step": 5286 + }, + { + "epoch": 0.35488741988523875, + "grad_norm": 4.3577656745910645, + "learning_rate": 9.440610315983998e-05, + "loss": 2.5671, + "step": 5288 + }, + { + "epoch": 0.3550216435690077, + "grad_norm": 4.18439245223999, + "learning_rate": 9.440110695395037e-05, + "loss": 2.7449, + "step": 5290 + }, + { + "epoch": 0.35515586725277676, + "grad_norm": 4.321678638458252, + "learning_rate": 9.439610865019577e-05, + "loss": 2.5397, + "step": 5292 + }, + { + "epoch": 0.35529009093654573, + "grad_norm": 4.571523189544678, + "learning_rate": 9.439110824881232e-05, + "loss": 2.9376, + "step": 5294 + }, + { + "epoch": 0.35542431462031476, + "grad_norm": 5.326371192932129, + "learning_rate": 9.438610575003632e-05, + "loss": 2.8537, + "step": 5296 + }, + { + "epoch": 0.35555853830408374, + "grad_norm": 5.828704833984375, + "learning_rate": 9.43811011541041e-05, + "loss": 2.4207, + "step": 5298 + }, + { + "epoch": 0.35569276198785277, + "grad_norm": 5.072452068328857, + "learning_rate": 9.437609446125211e-05, + "loss": 2.9669, + "step": 5300 + }, + { + "epoch": 0.35582698567162174, + "grad_norm": 5.229039669036865, + "learning_rate": 9.437108567171693e-05, + "loss": 2.9767, + "step": 5302 + }, + { + "epoch": 0.3559612093553908, + "grad_norm": 12.244040489196777, + "learning_rate": 9.436607478573522e-05, + "loss": 2.7013, + "step": 5304 + }, + { + "epoch": 0.35609543303915975, + "grad_norm": 4.607955455780029, + "learning_rate": 9.436106180354369e-05, + "loss": 2.8572, + "step": 5306 + }, + { + "epoch": 0.3562296567229288, + "grad_norm": 4.649220943450928, + "learning_rate": 9.435604672537924e-05, + "loss": 2.5325, + "step": 5308 + }, + { + "epoch": 0.35636388040669775, + "grad_norm": 4.859424591064453, + "learning_rate": 9.43510295514788e-05, + "loss": 2.4455, + "step": 5310 + }, + { + "epoch": 0.3564981040904668, + "grad_norm": 9.003073692321777, + "learning_rate": 9.434601028207942e-05, + "loss": 2.3665, + "step": 5312 + }, + { + "epoch": 0.35663232777423576, + "grad_norm": 5.521931171417236, + "learning_rate": 9.434098891741827e-05, + "loss": 2.5769, + "step": 5314 + }, + { + "epoch": 0.3567665514580048, + "grad_norm": 4.240405559539795, + "learning_rate": 9.433596545773258e-05, + "loss": 2.85, + "step": 5316 + }, + { + "epoch": 0.35690077514177376, + "grad_norm": 4.506983757019043, + "learning_rate": 9.43309399032597e-05, + "loss": 2.8872, + "step": 5318 + }, + { + "epoch": 0.3570349988255428, + "grad_norm": 4.64597225189209, + "learning_rate": 9.432591225423708e-05, + "loss": 2.9849, + "step": 5320 + }, + { + "epoch": 0.35716922250931177, + "grad_norm": 4.196255207061768, + "learning_rate": 9.432088251090228e-05, + "loss": 2.776, + "step": 5322 + }, + { + "epoch": 0.35730344619308074, + "grad_norm": 5.30235481262207, + "learning_rate": 9.431585067349293e-05, + "loss": 2.6993, + "step": 5324 + }, + { + "epoch": 0.3574376698768498, + "grad_norm": 4.140336513519287, + "learning_rate": 9.431081674224677e-05, + "loss": 2.6918, + "step": 5326 + }, + { + "epoch": 0.35757189356061875, + "grad_norm": 4.132028102874756, + "learning_rate": 9.430578071740167e-05, + "loss": 2.697, + "step": 5328 + }, + { + "epoch": 0.3577061172443878, + "grad_norm": 4.670660495758057, + "learning_rate": 9.430074259919554e-05, + "loss": 2.6838, + "step": 5330 + }, + { + "epoch": 0.35784034092815675, + "grad_norm": 4.381897926330566, + "learning_rate": 9.429570238786645e-05, + "loss": 2.6966, + "step": 5332 + }, + { + "epoch": 0.3579745646119258, + "grad_norm": 5.25192928314209, + "learning_rate": 9.429066008365251e-05, + "loss": 2.8415, + "step": 5334 + }, + { + "epoch": 0.35810878829569476, + "grad_norm": 4.575705528259277, + "learning_rate": 9.428561568679199e-05, + "loss": 2.9814, + "step": 5336 + }, + { + "epoch": 0.3582430119794638, + "grad_norm": 4.663041591644287, + "learning_rate": 9.428056919752319e-05, + "loss": 2.9783, + "step": 5338 + }, + { + "epoch": 0.35837723566323276, + "grad_norm": 4.3741936683654785, + "learning_rate": 9.42755206160846e-05, + "loss": 2.5498, + "step": 5340 + }, + { + "epoch": 0.3585114593470018, + "grad_norm": 4.2484941482543945, + "learning_rate": 9.427046994271471e-05, + "loss": 2.9414, + "step": 5342 + }, + { + "epoch": 0.35864568303077077, + "grad_norm": 4.594083309173584, + "learning_rate": 9.426541717765216e-05, + "loss": 2.5455, + "step": 5344 + }, + { + "epoch": 0.3587799067145398, + "grad_norm": 4.94602632522583, + "learning_rate": 9.426036232113571e-05, + "loss": 2.8315, + "step": 5346 + }, + { + "epoch": 0.3589141303983088, + "grad_norm": 4.249922275543213, + "learning_rate": 9.425530537340417e-05, + "loss": 2.7731, + "step": 5348 + }, + { + "epoch": 0.3590483540820778, + "grad_norm": 4.714175224304199, + "learning_rate": 9.425024633469647e-05, + "loss": 2.6257, + "step": 5350 + }, + { + "epoch": 0.3591825777658468, + "grad_norm": 4.461861610412598, + "learning_rate": 9.424518520525165e-05, + "loss": 2.8914, + "step": 5352 + }, + { + "epoch": 0.3593168014496158, + "grad_norm": 4.436424255371094, + "learning_rate": 9.424012198530882e-05, + "loss": 2.6946, + "step": 5354 + }, + { + "epoch": 0.3594510251333848, + "grad_norm": 4.146971225738525, + "learning_rate": 9.423505667510724e-05, + "loss": 2.5102, + "step": 5356 + }, + { + "epoch": 0.3595852488171538, + "grad_norm": 4.524332046508789, + "learning_rate": 9.42299892748862e-05, + "loss": 2.6842, + "step": 5358 + }, + { + "epoch": 0.3597194725009228, + "grad_norm": 3.9621152877807617, + "learning_rate": 9.422491978488515e-05, + "loss": 2.8363, + "step": 5360 + }, + { + "epoch": 0.35985369618469176, + "grad_norm": 4.059809684753418, + "learning_rate": 9.42198482053436e-05, + "loss": 2.558, + "step": 5362 + }, + { + "epoch": 0.3599879198684608, + "grad_norm": 8.49139404296875, + "learning_rate": 9.421477453650118e-05, + "loss": 2.3881, + "step": 5364 + }, + { + "epoch": 0.36012214355222977, + "grad_norm": 4.614034175872803, + "learning_rate": 9.420969877859761e-05, + "loss": 2.6541, + "step": 5366 + }, + { + "epoch": 0.3602563672359988, + "grad_norm": 6.361608982086182, + "learning_rate": 9.420462093187271e-05, + "loss": 2.4399, + "step": 5368 + }, + { + "epoch": 0.3603905909197678, + "grad_norm": 4.8697004318237305, + "learning_rate": 9.419954099656638e-05, + "loss": 2.9167, + "step": 5370 + }, + { + "epoch": 0.3605248146035368, + "grad_norm": 4.3515400886535645, + "learning_rate": 9.419445897291867e-05, + "loss": 2.825, + "step": 5372 + }, + { + "epoch": 0.3606590382873058, + "grad_norm": 6.429574012756348, + "learning_rate": 9.418937486116968e-05, + "loss": 2.742, + "step": 5374 + }, + { + "epoch": 0.3607932619710748, + "grad_norm": 4.384714126586914, + "learning_rate": 9.418428866155961e-05, + "loss": 2.942, + "step": 5376 + }, + { + "epoch": 0.3609274856548438, + "grad_norm": 3.786713123321533, + "learning_rate": 9.417920037432879e-05, + "loss": 2.268, + "step": 5378 + }, + { + "epoch": 0.3610617093386128, + "grad_norm": 4.212123394012451, + "learning_rate": 9.417410999971762e-05, + "loss": 2.7629, + "step": 5380 + }, + { + "epoch": 0.3611959330223818, + "grad_norm": 3.9216935634613037, + "learning_rate": 9.416901753796663e-05, + "loss": 2.6065, + "step": 5382 + }, + { + "epoch": 0.3613301567061508, + "grad_norm": 4.684676647186279, + "learning_rate": 9.41639229893164e-05, + "loss": 3.0384, + "step": 5384 + }, + { + "epoch": 0.3614643803899198, + "grad_norm": 4.611850738525391, + "learning_rate": 9.415882635400768e-05, + "loss": 2.8341, + "step": 5386 + }, + { + "epoch": 0.3615986040736888, + "grad_norm": 5.564373970031738, + "learning_rate": 9.415372763228123e-05, + "loss": 2.9635, + "step": 5388 + }, + { + "epoch": 0.3617328277574578, + "grad_norm": 4.18248176574707, + "learning_rate": 9.414862682437797e-05, + "loss": 2.6391, + "step": 5390 + }, + { + "epoch": 0.36186705144122683, + "grad_norm": 4.781639575958252, + "learning_rate": 9.414352393053891e-05, + "loss": 2.5082, + "step": 5392 + }, + { + "epoch": 0.3620012751249958, + "grad_norm": 6.180868148803711, + "learning_rate": 9.413841895100515e-05, + "loss": 2.6318, + "step": 5394 + }, + { + "epoch": 0.36213549880876483, + "grad_norm": 4.035216331481934, + "learning_rate": 9.413331188601791e-05, + "loss": 2.8241, + "step": 5396 + }, + { + "epoch": 0.3622697224925338, + "grad_norm": 5.181272029876709, + "learning_rate": 9.412820273581844e-05, + "loss": 2.6645, + "step": 5398 + }, + { + "epoch": 0.3624039461763028, + "grad_norm": 4.726448059082031, + "learning_rate": 9.412309150064817e-05, + "loss": 2.584, + "step": 5400 + }, + { + "epoch": 0.3625381698600718, + "grad_norm": 4.2401299476623535, + "learning_rate": 9.411797818074861e-05, + "loss": 2.7674, + "step": 5402 + }, + { + "epoch": 0.3626723935438408, + "grad_norm": 5.462427139282227, + "learning_rate": 9.411286277636131e-05, + "loss": 2.7576, + "step": 5404 + }, + { + "epoch": 0.3628066172276098, + "grad_norm": 4.240049362182617, + "learning_rate": 9.410774528772802e-05, + "loss": 2.7103, + "step": 5406 + }, + { + "epoch": 0.3629408409113788, + "grad_norm": 4.462477684020996, + "learning_rate": 9.410262571509046e-05, + "loss": 2.8199, + "step": 5408 + }, + { + "epoch": 0.3630750645951478, + "grad_norm": 4.264585018157959, + "learning_rate": 9.409750405869058e-05, + "loss": 2.7549, + "step": 5410 + }, + { + "epoch": 0.3632092882789168, + "grad_norm": 5.325710296630859, + "learning_rate": 9.409238031877034e-05, + "loss": 3.097, + "step": 5412 + }, + { + "epoch": 0.36334351196268583, + "grad_norm": 4.437282085418701, + "learning_rate": 9.408725449557184e-05, + "loss": 2.9302, + "step": 5414 + }, + { + "epoch": 0.3634777356464548, + "grad_norm": 4.237143516540527, + "learning_rate": 9.408212658933726e-05, + "loss": 2.7891, + "step": 5416 + }, + { + "epoch": 0.36361195933022383, + "grad_norm": 4.302945137023926, + "learning_rate": 9.407699660030888e-05, + "loss": 2.7739, + "step": 5418 + }, + { + "epoch": 0.3637461830139928, + "grad_norm": 5.079391002655029, + "learning_rate": 9.407186452872908e-05, + "loss": 2.8064, + "step": 5420 + }, + { + "epoch": 0.36388040669776184, + "grad_norm": 4.50987434387207, + "learning_rate": 9.406673037484035e-05, + "loss": 2.8566, + "step": 5422 + }, + { + "epoch": 0.3640146303815308, + "grad_norm": 4.055134296417236, + "learning_rate": 9.406159413888527e-05, + "loss": 2.5428, + "step": 5424 + }, + { + "epoch": 0.36414885406529984, + "grad_norm": 4.3460612297058105, + "learning_rate": 9.40564558211065e-05, + "loss": 2.8035, + "step": 5426 + }, + { + "epoch": 0.3642830777490688, + "grad_norm": 4.214920520782471, + "learning_rate": 9.405131542174684e-05, + "loss": 2.8739, + "step": 5428 + }, + { + "epoch": 0.36441730143283785, + "grad_norm": 5.096232891082764, + "learning_rate": 9.404617294104911e-05, + "loss": 3.3897, + "step": 5430 + }, + { + "epoch": 0.3645515251166068, + "grad_norm": 4.486722469329834, + "learning_rate": 9.404102837925637e-05, + "loss": 2.9047, + "step": 5432 + }, + { + "epoch": 0.36468574880037585, + "grad_norm": 4.998208045959473, + "learning_rate": 9.403588173661162e-05, + "loss": 2.74, + "step": 5434 + }, + { + "epoch": 0.3648199724841448, + "grad_norm": 4.114593982696533, + "learning_rate": 9.403073301335805e-05, + "loss": 2.8158, + "step": 5436 + }, + { + "epoch": 0.3649541961679138, + "grad_norm": 4.261693954467773, + "learning_rate": 9.402558220973892e-05, + "loss": 2.7313, + "step": 5438 + }, + { + "epoch": 0.36508841985168283, + "grad_norm": 4.467926502227783, + "learning_rate": 9.402042932599762e-05, + "loss": 2.7996, + "step": 5440 + }, + { + "epoch": 0.3652226435354518, + "grad_norm": 4.971966743469238, + "learning_rate": 9.401527436237758e-05, + "loss": 2.8536, + "step": 5442 + }, + { + "epoch": 0.36535686721922084, + "grad_norm": 4.139760971069336, + "learning_rate": 9.40101173191224e-05, + "loss": 2.4344, + "step": 5444 + }, + { + "epoch": 0.3654910909029898, + "grad_norm": 4.401684284210205, + "learning_rate": 9.40049581964757e-05, + "loss": 2.6912, + "step": 5446 + }, + { + "epoch": 0.36562531458675884, + "grad_norm": 4.1803975105285645, + "learning_rate": 9.399979699468126e-05, + "loss": 2.4442, + "step": 5448 + }, + { + "epoch": 0.3657595382705278, + "grad_norm": 5.040067672729492, + "learning_rate": 9.399463371398295e-05, + "loss": 2.9074, + "step": 5450 + }, + { + "epoch": 0.36589376195429685, + "grad_norm": 4.011792182922363, + "learning_rate": 9.398946835462469e-05, + "loss": 2.5192, + "step": 5452 + }, + { + "epoch": 0.3660279856380658, + "grad_norm": 4.007993698120117, + "learning_rate": 9.398430091685056e-05, + "loss": 2.7319, + "step": 5454 + }, + { + "epoch": 0.36616220932183485, + "grad_norm": 4.506133079528809, + "learning_rate": 9.397913140090471e-05, + "loss": 2.7681, + "step": 5456 + }, + { + "epoch": 0.3662964330056038, + "grad_norm": 3.9589080810546875, + "learning_rate": 9.397395980703137e-05, + "loss": 2.8992, + "step": 5458 + }, + { + "epoch": 0.36643065668937286, + "grad_norm": 4.035534381866455, + "learning_rate": 9.39687861354749e-05, + "loss": 2.6831, + "step": 5460 + }, + { + "epoch": 0.36656488037314183, + "grad_norm": 4.617319107055664, + "learning_rate": 9.396361038647976e-05, + "loss": 3.1176, + "step": 5462 + }, + { + "epoch": 0.36669910405691086, + "grad_norm": 5.421172618865967, + "learning_rate": 9.395843256029047e-05, + "loss": 2.9154, + "step": 5464 + }, + { + "epoch": 0.36683332774067984, + "grad_norm": 4.61995792388916, + "learning_rate": 9.39532526571517e-05, + "loss": 3.0221, + "step": 5466 + }, + { + "epoch": 0.36696755142444887, + "grad_norm": 4.960544586181641, + "learning_rate": 9.394807067730814e-05, + "loss": 2.644, + "step": 5468 + }, + { + "epoch": 0.36710177510821784, + "grad_norm": 4.394144058227539, + "learning_rate": 9.394288662100467e-05, + "loss": 2.8, + "step": 5470 + }, + { + "epoch": 0.3672359987919869, + "grad_norm": 3.9708635807037354, + "learning_rate": 9.393770048848622e-05, + "loss": 2.5532, + "step": 5472 + }, + { + "epoch": 0.36737022247575585, + "grad_norm": 5.630776882171631, + "learning_rate": 9.393251227999784e-05, + "loss": 2.6979, + "step": 5474 + }, + { + "epoch": 0.3675044461595248, + "grad_norm": 4.429136276245117, + "learning_rate": 9.392732199578462e-05, + "loss": 2.5521, + "step": 5476 + }, + { + "epoch": 0.36763866984329385, + "grad_norm": 4.337031841278076, + "learning_rate": 9.392212963609183e-05, + "loss": 2.7518, + "step": 5478 + }, + { + "epoch": 0.3677728935270628, + "grad_norm": 5.05925989151001, + "learning_rate": 9.391693520116477e-05, + "loss": 2.7781, + "step": 5480 + }, + { + "epoch": 0.36790711721083186, + "grad_norm": 4.7606329917907715, + "learning_rate": 9.391173869124889e-05, + "loss": 2.6782, + "step": 5482 + }, + { + "epoch": 0.36804134089460083, + "grad_norm": 4.245934963226318, + "learning_rate": 9.390654010658971e-05, + "loss": 2.646, + "step": 5484 + }, + { + "epoch": 0.36817556457836986, + "grad_norm": 4.612819194793701, + "learning_rate": 9.390133944743284e-05, + "loss": 2.5541, + "step": 5486 + }, + { + "epoch": 0.36830978826213884, + "grad_norm": 5.987905979156494, + "learning_rate": 9.389613671402402e-05, + "loss": 2.5047, + "step": 5488 + }, + { + "epoch": 0.36844401194590787, + "grad_norm": 4.373660564422607, + "learning_rate": 9.389093190660905e-05, + "loss": 2.8173, + "step": 5490 + }, + { + "epoch": 0.36857823562967684, + "grad_norm": 5.018381595611572, + "learning_rate": 9.388572502543384e-05, + "loss": 2.6468, + "step": 5492 + }, + { + "epoch": 0.3687124593134459, + "grad_norm": 4.259917259216309, + "learning_rate": 9.388051607074445e-05, + "loss": 2.926, + "step": 5494 + }, + { + "epoch": 0.36884668299721485, + "grad_norm": 4.378512859344482, + "learning_rate": 9.387530504278695e-05, + "loss": 2.615, + "step": 5496 + }, + { + "epoch": 0.3689809066809839, + "grad_norm": 4.563418865203857, + "learning_rate": 9.387009194180755e-05, + "loss": 2.7862, + "step": 5498 + }, + { + "epoch": 0.36911513036475285, + "grad_norm": 6.261932373046875, + "learning_rate": 9.38648767680526e-05, + "loss": 2.7616, + "step": 5500 + }, + { + "epoch": 0.3692493540485219, + "grad_norm": 4.391724586486816, + "learning_rate": 9.385965952176847e-05, + "loss": 2.7228, + "step": 5502 + }, + { + "epoch": 0.36938357773229086, + "grad_norm": 4.309655666351318, + "learning_rate": 9.385444020320166e-05, + "loss": 2.7209, + "step": 5504 + }, + { + "epoch": 0.3695178014160599, + "grad_norm": 3.7818496227264404, + "learning_rate": 9.38492188125988e-05, + "loss": 2.6661, + "step": 5506 + }, + { + "epoch": 0.36965202509982886, + "grad_norm": 4.204699993133545, + "learning_rate": 9.384399535020657e-05, + "loss": 2.5679, + "step": 5508 + }, + { + "epoch": 0.3697862487835979, + "grad_norm": 4.796344757080078, + "learning_rate": 9.383876981627178e-05, + "loss": 2.8827, + "step": 5510 + }, + { + "epoch": 0.36992047246736687, + "grad_norm": 4.481595993041992, + "learning_rate": 9.383354221104132e-05, + "loss": 2.9821, + "step": 5512 + }, + { + "epoch": 0.37005469615113584, + "grad_norm": 6.102607250213623, + "learning_rate": 9.382831253476219e-05, + "loss": 2.6519, + "step": 5514 + }, + { + "epoch": 0.3701889198349049, + "grad_norm": 4.505462646484375, + "learning_rate": 9.382308078768146e-05, + "loss": 2.601, + "step": 5516 + }, + { + "epoch": 0.37032314351867385, + "grad_norm": 3.762302875518799, + "learning_rate": 9.381784697004636e-05, + "loss": 2.7514, + "step": 5518 + }, + { + "epoch": 0.3704573672024429, + "grad_norm": 4.143299579620361, + "learning_rate": 9.381261108210412e-05, + "loss": 2.5488, + "step": 5520 + }, + { + "epoch": 0.37059159088621185, + "grad_norm": 3.8981800079345703, + "learning_rate": 9.380737312410219e-05, + "loss": 2.3342, + "step": 5522 + }, + { + "epoch": 0.3707258145699809, + "grad_norm": 4.863278865814209, + "learning_rate": 9.380213309628803e-05, + "loss": 2.7947, + "step": 5524 + }, + { + "epoch": 0.37086003825374986, + "grad_norm": 4.210094928741455, + "learning_rate": 9.379689099890921e-05, + "loss": 2.6129, + "step": 5526 + }, + { + "epoch": 0.3709942619375189, + "grad_norm": 4.936619758605957, + "learning_rate": 9.37916468322134e-05, + "loss": 2.354, + "step": 5528 + }, + { + "epoch": 0.37112848562128786, + "grad_norm": 4.915158748626709, + "learning_rate": 9.378640059644839e-05, + "loss": 2.7932, + "step": 5530 + }, + { + "epoch": 0.3712627093050569, + "grad_norm": 4.147298812866211, + "learning_rate": 9.378115229186207e-05, + "loss": 2.6523, + "step": 5532 + }, + { + "epoch": 0.37139693298882587, + "grad_norm": 4.360683917999268, + "learning_rate": 9.377590191870236e-05, + "loss": 2.4761, + "step": 5534 + }, + { + "epoch": 0.3715311566725949, + "grad_norm": 3.597799062728882, + "learning_rate": 9.37706494772174e-05, + "loss": 2.6255, + "step": 5536 + }, + { + "epoch": 0.3716653803563639, + "grad_norm": 6.678796291351318, + "learning_rate": 9.376539496765531e-05, + "loss": 2.4901, + "step": 5538 + }, + { + "epoch": 0.3717996040401329, + "grad_norm": 5.41901159286499, + "learning_rate": 9.376013839026437e-05, + "loss": 2.5678, + "step": 5540 + }, + { + "epoch": 0.3719338277239019, + "grad_norm": 4.086419105529785, + "learning_rate": 9.375487974529296e-05, + "loss": 2.723, + "step": 5542 + }, + { + "epoch": 0.3720680514076709, + "grad_norm": 4.867722034454346, + "learning_rate": 9.37496190329895e-05, + "loss": 2.6755, + "step": 5544 + }, + { + "epoch": 0.3722022750914399, + "grad_norm": 3.9583323001861572, + "learning_rate": 9.374435625360259e-05, + "loss": 2.7547, + "step": 5546 + }, + { + "epoch": 0.3723364987752089, + "grad_norm": 4.093666076660156, + "learning_rate": 9.373909140738084e-05, + "loss": 2.5878, + "step": 5548 + }, + { + "epoch": 0.3724707224589779, + "grad_norm": 4.402645587921143, + "learning_rate": 9.373382449457304e-05, + "loss": 2.926, + "step": 5550 + }, + { + "epoch": 0.37260494614274686, + "grad_norm": 4.495758533477783, + "learning_rate": 9.372855551542805e-05, + "loss": 2.81, + "step": 5552 + }, + { + "epoch": 0.3727391698265159, + "grad_norm": 4.273186206817627, + "learning_rate": 9.372328447019478e-05, + "loss": 2.8712, + "step": 5554 + }, + { + "epoch": 0.37287339351028487, + "grad_norm": 4.8527092933654785, + "learning_rate": 9.37180113591223e-05, + "loss": 3.0374, + "step": 5556 + }, + { + "epoch": 0.3730076171940539, + "grad_norm": 4.852907657623291, + "learning_rate": 9.371273618245976e-05, + "loss": 2.7286, + "step": 5558 + }, + { + "epoch": 0.3731418408778229, + "grad_norm": 4.303847312927246, + "learning_rate": 9.370745894045639e-05, + "loss": 2.5822, + "step": 5560 + }, + { + "epoch": 0.3732760645615919, + "grad_norm": 3.6403069496154785, + "learning_rate": 9.370217963336152e-05, + "loss": 2.6562, + "step": 5562 + }, + { + "epoch": 0.3734102882453609, + "grad_norm": 4.582805633544922, + "learning_rate": 9.36968982614246e-05, + "loss": 2.9862, + "step": 5564 + }, + { + "epoch": 0.3735445119291299, + "grad_norm": 4.722568035125732, + "learning_rate": 9.369161482489519e-05, + "loss": 2.7377, + "step": 5566 + }, + { + "epoch": 0.3736787356128989, + "grad_norm": 5.47265625, + "learning_rate": 9.368632932402287e-05, + "loss": 2.8856, + "step": 5568 + }, + { + "epoch": 0.3738129592966679, + "grad_norm": 4.891671180725098, + "learning_rate": 9.368104175905741e-05, + "loss": 2.9501, + "step": 5570 + }, + { + "epoch": 0.3739471829804369, + "grad_norm": 5.590705871582031, + "learning_rate": 9.367575213024861e-05, + "loss": 2.849, + "step": 5572 + }, + { + "epoch": 0.3740814066642059, + "grad_norm": 4.53803825378418, + "learning_rate": 9.36704604378464e-05, + "loss": 3.035, + "step": 5574 + }, + { + "epoch": 0.3742156303479749, + "grad_norm": 5.699151992797852, + "learning_rate": 9.366516668210083e-05, + "loss": 3.0478, + "step": 5576 + }, + { + "epoch": 0.3743498540317439, + "grad_norm": 5.226761817932129, + "learning_rate": 9.365987086326198e-05, + "loss": 2.6398, + "step": 5578 + }, + { + "epoch": 0.3744840777155129, + "grad_norm": 3.952197551727295, + "learning_rate": 9.365457298158009e-05, + "loss": 2.7097, + "step": 5580 + }, + { + "epoch": 0.3746183013992819, + "grad_norm": 4.444697380065918, + "learning_rate": 9.364927303730549e-05, + "loss": 2.82, + "step": 5582 + }, + { + "epoch": 0.3747525250830509, + "grad_norm": 4.448780059814453, + "learning_rate": 9.364397103068854e-05, + "loss": 2.355, + "step": 5584 + }, + { + "epoch": 0.37488674876681993, + "grad_norm": 4.430472373962402, + "learning_rate": 9.36386669619798e-05, + "loss": 2.7573, + "step": 5586 + }, + { + "epoch": 0.3750209724505889, + "grad_norm": 4.136151313781738, + "learning_rate": 9.363336083142986e-05, + "loss": 2.6327, + "step": 5588 + }, + { + "epoch": 0.3751551961343579, + "grad_norm": 3.9574267864227295, + "learning_rate": 9.36280526392894e-05, + "loss": 2.7043, + "step": 5590 + }, + { + "epoch": 0.3752894198181269, + "grad_norm": 4.086404323577881, + "learning_rate": 9.362274238580926e-05, + "loss": 2.5787, + "step": 5592 + }, + { + "epoch": 0.3754236435018959, + "grad_norm": 4.672739505767822, + "learning_rate": 9.361743007124032e-05, + "loss": 2.8147, + "step": 5594 + }, + { + "epoch": 0.3755578671856649, + "grad_norm": 4.694242000579834, + "learning_rate": 9.36121156958336e-05, + "loss": 3.2229, + "step": 5596 + }, + { + "epoch": 0.3756920908694339, + "grad_norm": 4.497357368469238, + "learning_rate": 9.360679925984013e-05, + "loss": 2.7128, + "step": 5598 + }, + { + "epoch": 0.3758263145532029, + "grad_norm": 4.168105125427246, + "learning_rate": 9.360148076351117e-05, + "loss": 2.3243, + "step": 5600 + }, + { + "epoch": 0.3759605382369719, + "grad_norm": 4.222919464111328, + "learning_rate": 9.359616020709798e-05, + "loss": 2.4977, + "step": 5602 + }, + { + "epoch": 0.3760947619207409, + "grad_norm": 4.423033237457275, + "learning_rate": 9.359083759085195e-05, + "loss": 2.7142, + "step": 5604 + }, + { + "epoch": 0.3762289856045099, + "grad_norm": 4.268894672393799, + "learning_rate": 9.358551291502456e-05, + "loss": 2.5994, + "step": 5606 + }, + { + "epoch": 0.37636320928827893, + "grad_norm": 6.241854667663574, + "learning_rate": 9.358018617986739e-05, + "loss": 2.5802, + "step": 5608 + }, + { + "epoch": 0.3764974329720479, + "grad_norm": 4.564337253570557, + "learning_rate": 9.357485738563212e-05, + "loss": 2.677, + "step": 5610 + }, + { + "epoch": 0.37663165665581694, + "grad_norm": 5.293037414550781, + "learning_rate": 9.356952653257051e-05, + "loss": 2.6076, + "step": 5612 + }, + { + "epoch": 0.3767658803395859, + "grad_norm": 4.307677268981934, + "learning_rate": 9.356419362093449e-05, + "loss": 2.8783, + "step": 5614 + }, + { + "epoch": 0.37690010402335494, + "grad_norm": 4.066353797912598, + "learning_rate": 9.355885865097595e-05, + "loss": 2.5431, + "step": 5616 + }, + { + "epoch": 0.3770343277071239, + "grad_norm": 5.424474239349365, + "learning_rate": 9.3553521622947e-05, + "loss": 2.9835, + "step": 5618 + }, + { + "epoch": 0.37716855139089295, + "grad_norm": 4.530653953552246, + "learning_rate": 9.354818253709981e-05, + "loss": 2.8293, + "step": 5620 + }, + { + "epoch": 0.3773027750746619, + "grad_norm": 4.2697906494140625, + "learning_rate": 9.354284139368662e-05, + "loss": 2.8686, + "step": 5622 + }, + { + "epoch": 0.37743699875843095, + "grad_norm": 5.059792995452881, + "learning_rate": 9.35374981929598e-05, + "loss": 2.7791, + "step": 5624 + }, + { + "epoch": 0.3775712224421999, + "grad_norm": 8.896444320678711, + "learning_rate": 9.35321529351718e-05, + "loss": 2.5261, + "step": 5626 + }, + { + "epoch": 0.3777054461259689, + "grad_norm": 4.408978462219238, + "learning_rate": 9.352680562057516e-05, + "loss": 2.7391, + "step": 5628 + }, + { + "epoch": 0.37783966980973793, + "grad_norm": 4.437492847442627, + "learning_rate": 9.352145624942256e-05, + "loss": 2.7169, + "step": 5630 + }, + { + "epoch": 0.3779738934935069, + "grad_norm": 5.360891819000244, + "learning_rate": 9.351610482196676e-05, + "loss": 2.5824, + "step": 5632 + }, + { + "epoch": 0.37810811717727594, + "grad_norm": 5.207115173339844, + "learning_rate": 9.351075133846054e-05, + "loss": 2.917, + "step": 5634 + }, + { + "epoch": 0.3782423408610449, + "grad_norm": 4.933265209197998, + "learning_rate": 9.35053957991569e-05, + "loss": 2.4044, + "step": 5636 + }, + { + "epoch": 0.37837656454481394, + "grad_norm": 3.8754594326019287, + "learning_rate": 9.350003820430885e-05, + "loss": 2.7023, + "step": 5638 + }, + { + "epoch": 0.3785107882285829, + "grad_norm": 4.274329662322998, + "learning_rate": 9.349467855416953e-05, + "loss": 2.5298, + "step": 5640 + }, + { + "epoch": 0.37864501191235195, + "grad_norm": 7.3154754638671875, + "learning_rate": 9.348931684899219e-05, + "loss": 2.5434, + "step": 5642 + }, + { + "epoch": 0.3787792355961209, + "grad_norm": 4.45452880859375, + "learning_rate": 9.348395308903014e-05, + "loss": 2.8986, + "step": 5644 + }, + { + "epoch": 0.37891345927988995, + "grad_norm": 4.631004333496094, + "learning_rate": 9.347858727453682e-05, + "loss": 2.8903, + "step": 5646 + }, + { + "epoch": 0.3790476829636589, + "grad_norm": 4.7028303146362305, + "learning_rate": 9.347321940576575e-05, + "loss": 2.3982, + "step": 5648 + }, + { + "epoch": 0.37918190664742796, + "grad_norm": 4.799648761749268, + "learning_rate": 9.346784948297054e-05, + "loss": 2.72, + "step": 5650 + }, + { + "epoch": 0.37931613033119693, + "grad_norm": 4.521987438201904, + "learning_rate": 9.346247750640491e-05, + "loss": 2.7418, + "step": 5652 + }, + { + "epoch": 0.37945035401496596, + "grad_norm": 4.554125785827637, + "learning_rate": 9.34571034763227e-05, + "loss": 2.6266, + "step": 5654 + }, + { + "epoch": 0.37958457769873494, + "grad_norm": 4.32382345199585, + "learning_rate": 9.34517273929778e-05, + "loss": 2.8568, + "step": 5656 + }, + { + "epoch": 0.37971880138250397, + "grad_norm": 4.376607418060303, + "learning_rate": 9.34463492566242e-05, + "loss": 2.8922, + "step": 5658 + }, + { + "epoch": 0.37985302506627294, + "grad_norm": 4.694797515869141, + "learning_rate": 9.344096906751607e-05, + "loss": 2.6888, + "step": 5660 + }, + { + "epoch": 0.379987248750042, + "grad_norm": 4.937413692474365, + "learning_rate": 9.343558682590756e-05, + "loss": 2.849, + "step": 5662 + }, + { + "epoch": 0.38012147243381095, + "grad_norm": 4.679848670959473, + "learning_rate": 9.343020253205298e-05, + "loss": 2.585, + "step": 5664 + }, + { + "epoch": 0.3802556961175799, + "grad_norm": 4.659168720245361, + "learning_rate": 9.342481618620673e-05, + "loss": 2.6381, + "step": 5666 + }, + { + "epoch": 0.38038991980134895, + "grad_norm": 4.362579822540283, + "learning_rate": 9.341942778862331e-05, + "loss": 2.7055, + "step": 5668 + }, + { + "epoch": 0.3805241434851179, + "grad_norm": 6.501589298248291, + "learning_rate": 9.341403733955732e-05, + "loss": 2.8444, + "step": 5670 + }, + { + "epoch": 0.38065836716888696, + "grad_norm": 4.2455949783325195, + "learning_rate": 9.340864483926343e-05, + "loss": 2.9843, + "step": 5672 + }, + { + "epoch": 0.38079259085265593, + "grad_norm": 5.221848487854004, + "learning_rate": 9.340325028799642e-05, + "loss": 2.7173, + "step": 5674 + }, + { + "epoch": 0.38092681453642496, + "grad_norm": 4.243717670440674, + "learning_rate": 9.339785368601119e-05, + "loss": 2.46, + "step": 5676 + }, + { + "epoch": 0.38106103822019394, + "grad_norm": 4.025247573852539, + "learning_rate": 9.339245503356271e-05, + "loss": 2.6798, + "step": 5678 + }, + { + "epoch": 0.38119526190396297, + "grad_norm": 4.779268264770508, + "learning_rate": 9.338705433090607e-05, + "loss": 2.7539, + "step": 5680 + }, + { + "epoch": 0.38132948558773194, + "grad_norm": 5.455138206481934, + "learning_rate": 9.338165157829641e-05, + "loss": 2.6765, + "step": 5682 + }, + { + "epoch": 0.381463709271501, + "grad_norm": 3.359126091003418, + "learning_rate": 9.337624677598903e-05, + "loss": 2.4256, + "step": 5684 + }, + { + "epoch": 0.38159793295526995, + "grad_norm": 4.026283264160156, + "learning_rate": 9.337083992423927e-05, + "loss": 2.6001, + "step": 5686 + }, + { + "epoch": 0.381732156639039, + "grad_norm": 4.992631912231445, + "learning_rate": 9.336543102330263e-05, + "loss": 2.6052, + "step": 5688 + }, + { + "epoch": 0.38186638032280795, + "grad_norm": 4.368664741516113, + "learning_rate": 9.336002007343464e-05, + "loss": 2.5892, + "step": 5690 + }, + { + "epoch": 0.382000604006577, + "grad_norm": 4.5046210289001465, + "learning_rate": 9.335460707489097e-05, + "loss": 2.8472, + "step": 5692 + }, + { + "epoch": 0.38213482769034596, + "grad_norm": 3.551159620285034, + "learning_rate": 9.334919202792736e-05, + "loss": 2.508, + "step": 5694 + }, + { + "epoch": 0.382269051374115, + "grad_norm": 4.415622711181641, + "learning_rate": 9.334377493279968e-05, + "loss": 2.6782, + "step": 5696 + }, + { + "epoch": 0.38240327505788396, + "grad_norm": 3.806196928024292, + "learning_rate": 9.333835578976385e-05, + "loss": 2.4003, + "step": 5698 + }, + { + "epoch": 0.38253749874165294, + "grad_norm": 5.350992202758789, + "learning_rate": 9.333293459907595e-05, + "loss": 2.7656, + "step": 5700 + }, + { + "epoch": 0.38267172242542197, + "grad_norm": 6.6068925857543945, + "learning_rate": 9.33275113609921e-05, + "loss": 2.5725, + "step": 5702 + }, + { + "epoch": 0.38280594610919094, + "grad_norm": 4.245990753173828, + "learning_rate": 9.332208607576851e-05, + "loss": 2.6219, + "step": 5704 + }, + { + "epoch": 0.38294016979296, + "grad_norm": 4.277673721313477, + "learning_rate": 9.331665874366156e-05, + "loss": 2.9046, + "step": 5706 + }, + { + "epoch": 0.38307439347672895, + "grad_norm": 4.076413154602051, + "learning_rate": 9.331122936492766e-05, + "loss": 2.5249, + "step": 5708 + }, + { + "epoch": 0.383208617160498, + "grad_norm": 4.771399974822998, + "learning_rate": 9.330579793982335e-05, + "loss": 3.0067, + "step": 5710 + }, + { + "epoch": 0.38334284084426695, + "grad_norm": 4.2639641761779785, + "learning_rate": 9.330036446860524e-05, + "loss": 3.0474, + "step": 5712 + }, + { + "epoch": 0.383477064528036, + "grad_norm": 6.921311855316162, + "learning_rate": 9.329492895153006e-05, + "loss": 2.6941, + "step": 5714 + }, + { + "epoch": 0.38361128821180496, + "grad_norm": 4.977880477905273, + "learning_rate": 9.328949138885461e-05, + "loss": 2.7889, + "step": 5716 + }, + { + "epoch": 0.383745511895574, + "grad_norm": 4.34611701965332, + "learning_rate": 9.328405178083584e-05, + "loss": 2.6162, + "step": 5718 + }, + { + "epoch": 0.38387973557934296, + "grad_norm": 4.522355079650879, + "learning_rate": 9.327861012773071e-05, + "loss": 2.8766, + "step": 5720 + }, + { + "epoch": 0.384013959263112, + "grad_norm": 4.85167932510376, + "learning_rate": 9.327316642979638e-05, + "loss": 3.0779, + "step": 5722 + }, + { + "epoch": 0.38414818294688097, + "grad_norm": 4.171302318572998, + "learning_rate": 9.326772068729001e-05, + "loss": 2.597, + "step": 5724 + }, + { + "epoch": 0.38428240663065, + "grad_norm": 4.132660865783691, + "learning_rate": 9.326227290046892e-05, + "loss": 2.735, + "step": 5726 + }, + { + "epoch": 0.384416630314419, + "grad_norm": 3.913954973220825, + "learning_rate": 9.325682306959051e-05, + "loss": 2.483, + "step": 5728 + }, + { + "epoch": 0.384550853998188, + "grad_norm": 4.612536907196045, + "learning_rate": 9.325137119491227e-05, + "loss": 2.8368, + "step": 5730 + }, + { + "epoch": 0.384685077681957, + "grad_norm": 3.9966089725494385, + "learning_rate": 9.324591727669181e-05, + "loss": 2.6604, + "step": 5732 + }, + { + "epoch": 0.384819301365726, + "grad_norm": 4.591483116149902, + "learning_rate": 9.324046131518678e-05, + "loss": 2.83, + "step": 5734 + }, + { + "epoch": 0.384953525049495, + "grad_norm": 4.3875885009765625, + "learning_rate": 9.323500331065498e-05, + "loss": 2.5778, + "step": 5736 + }, + { + "epoch": 0.38508774873326396, + "grad_norm": 4.370931625366211, + "learning_rate": 9.322954326335429e-05, + "loss": 2.8998, + "step": 5738 + }, + { + "epoch": 0.385221972417033, + "grad_norm": 5.691708087921143, + "learning_rate": 9.322408117354271e-05, + "loss": 2.5093, + "step": 5740 + }, + { + "epoch": 0.38535619610080196, + "grad_norm": 4.014068603515625, + "learning_rate": 9.321861704147827e-05, + "loss": 2.6271, + "step": 5742 + }, + { + "epoch": 0.385490419784571, + "grad_norm": 4.609500408172607, + "learning_rate": 9.321315086741916e-05, + "loss": 2.655, + "step": 5744 + }, + { + "epoch": 0.38562464346833997, + "grad_norm": 7.620419025421143, + "learning_rate": 9.320768265162366e-05, + "loss": 2.7585, + "step": 5746 + }, + { + "epoch": 0.385758867152109, + "grad_norm": 6.504104137420654, + "learning_rate": 9.320221239435012e-05, + "loss": 2.4384, + "step": 5748 + }, + { + "epoch": 0.38589309083587797, + "grad_norm": 3.966221332550049, + "learning_rate": 9.319674009585699e-05, + "loss": 2.3578, + "step": 5750 + }, + { + "epoch": 0.386027314519647, + "grad_norm": 3.9148831367492676, + "learning_rate": 9.319126575640283e-05, + "loss": 2.559, + "step": 5752 + }, + { + "epoch": 0.386161538203416, + "grad_norm": 4.401569843292236, + "learning_rate": 9.318578937624629e-05, + "loss": 2.6032, + "step": 5754 + }, + { + "epoch": 0.386295761887185, + "grad_norm": 4.615095138549805, + "learning_rate": 9.318031095564613e-05, + "loss": 2.7138, + "step": 5756 + }, + { + "epoch": 0.386429985570954, + "grad_norm": 4.335453987121582, + "learning_rate": 9.31748304948612e-05, + "loss": 2.563, + "step": 5758 + }, + { + "epoch": 0.386564209254723, + "grad_norm": 3.991868495941162, + "learning_rate": 9.316934799415041e-05, + "loss": 2.6794, + "step": 5760 + }, + { + "epoch": 0.386698432938492, + "grad_norm": 4.968031883239746, + "learning_rate": 9.316386345377281e-05, + "loss": 2.656, + "step": 5762 + }, + { + "epoch": 0.386832656622261, + "grad_norm": 4.014264106750488, + "learning_rate": 9.315837687398756e-05, + "loss": 2.7443, + "step": 5764 + }, + { + "epoch": 0.38696688030603, + "grad_norm": 4.425012111663818, + "learning_rate": 9.315288825505387e-05, + "loss": 2.8173, + "step": 5766 + }, + { + "epoch": 0.387101103989799, + "grad_norm": 5.143444538116455, + "learning_rate": 9.314739759723105e-05, + "loss": 2.9666, + "step": 5768 + }, + { + "epoch": 0.387235327673568, + "grad_norm": 4.464515209197998, + "learning_rate": 9.314190490077857e-05, + "loss": 2.5184, + "step": 5770 + }, + { + "epoch": 0.387369551357337, + "grad_norm": 4.531096458435059, + "learning_rate": 9.313641016595588e-05, + "loss": 2.6395, + "step": 5772 + }, + { + "epoch": 0.387503775041106, + "grad_norm": 4.660670757293701, + "learning_rate": 9.313091339302267e-05, + "loss": 2.9391, + "step": 5774 + }, + { + "epoch": 0.387637998724875, + "grad_norm": 4.459737300872803, + "learning_rate": 9.312541458223858e-05, + "loss": 3.0327, + "step": 5776 + }, + { + "epoch": 0.387772222408644, + "grad_norm": 5.1107306480407715, + "learning_rate": 9.311991373386349e-05, + "loss": 2.7852, + "step": 5778 + }, + { + "epoch": 0.387906446092413, + "grad_norm": 3.998610496520996, + "learning_rate": 9.311441084815724e-05, + "loss": 2.6899, + "step": 5780 + }, + { + "epoch": 0.388040669776182, + "grad_norm": 4.146368980407715, + "learning_rate": 9.310890592537987e-05, + "loss": 2.6531, + "step": 5782 + }, + { + "epoch": 0.388174893459951, + "grad_norm": 6.052302837371826, + "learning_rate": 9.310339896579145e-05, + "loss": 2.6712, + "step": 5784 + }, + { + "epoch": 0.38830911714372, + "grad_norm": 4.555976390838623, + "learning_rate": 9.30978899696522e-05, + "loss": 2.4785, + "step": 5786 + }, + { + "epoch": 0.388443340827489, + "grad_norm": 4.505791664123535, + "learning_rate": 9.30923789372224e-05, + "loss": 2.6944, + "step": 5788 + }, + { + "epoch": 0.388577564511258, + "grad_norm": 5.865919589996338, + "learning_rate": 9.308686586876243e-05, + "loss": 2.526, + "step": 5790 + }, + { + "epoch": 0.388711788195027, + "grad_norm": 6.511723518371582, + "learning_rate": 9.308135076453277e-05, + "loss": 2.4342, + "step": 5792 + }, + { + "epoch": 0.388846011878796, + "grad_norm": 4.842666149139404, + "learning_rate": 9.307583362479402e-05, + "loss": 2.728, + "step": 5794 + }, + { + "epoch": 0.388980235562565, + "grad_norm": 4.971519470214844, + "learning_rate": 9.307031444980681e-05, + "loss": 2.6905, + "step": 5796 + }, + { + "epoch": 0.38911445924633403, + "grad_norm": 4.388241767883301, + "learning_rate": 9.306479323983195e-05, + "loss": 2.6547, + "step": 5798 + }, + { + "epoch": 0.389248682930103, + "grad_norm": 4.385958671569824, + "learning_rate": 9.305926999513029e-05, + "loss": 2.9425, + "step": 5800 + }, + { + "epoch": 0.38938290661387204, + "grad_norm": 4.742867946624756, + "learning_rate": 9.30537447159628e-05, + "loss": 2.7317, + "step": 5802 + }, + { + "epoch": 0.389517130297641, + "grad_norm": 3.931878089904785, + "learning_rate": 9.304821740259053e-05, + "loss": 2.7121, + "step": 5804 + }, + { + "epoch": 0.38965135398141004, + "grad_norm": 16.072484970092773, + "learning_rate": 9.304268805527464e-05, + "loss": 2.8311, + "step": 5806 + }, + { + "epoch": 0.389785577665179, + "grad_norm": 4.543520450592041, + "learning_rate": 9.303715667427639e-05, + "loss": 2.6834, + "step": 5808 + }, + { + "epoch": 0.38991980134894805, + "grad_norm": 4.456310749053955, + "learning_rate": 9.30316232598571e-05, + "loss": 2.8021, + "step": 5810 + }, + { + "epoch": 0.390054025032717, + "grad_norm": 4.1256937980651855, + "learning_rate": 9.302608781227823e-05, + "loss": 2.8051, + "step": 5812 + }, + { + "epoch": 0.390188248716486, + "grad_norm": 4.321413040161133, + "learning_rate": 9.302055033180133e-05, + "loss": 2.6012, + "step": 5814 + }, + { + "epoch": 0.390322472400255, + "grad_norm": 5.2868733406066895, + "learning_rate": 9.3015010818688e-05, + "loss": 2.5966, + "step": 5816 + }, + { + "epoch": 0.390456696084024, + "grad_norm": 4.310025215148926, + "learning_rate": 9.300946927320001e-05, + "loss": 2.3595, + "step": 5818 + }, + { + "epoch": 0.39059091976779303, + "grad_norm": 4.5966362953186035, + "learning_rate": 9.300392569559917e-05, + "loss": 2.8635, + "step": 5820 + }, + { + "epoch": 0.390725143451562, + "grad_norm": 4.711021900177002, + "learning_rate": 9.29983800861474e-05, + "loss": 2.8327, + "step": 5822 + }, + { + "epoch": 0.39085936713533104, + "grad_norm": 4.551838397979736, + "learning_rate": 9.299283244510674e-05, + "loss": 2.8338, + "step": 5824 + }, + { + "epoch": 0.3909935908191, + "grad_norm": 4.3667826652526855, + "learning_rate": 9.298728277273927e-05, + "loss": 2.8059, + "step": 5826 + }, + { + "epoch": 0.39112781450286904, + "grad_norm": 4.459073543548584, + "learning_rate": 9.298173106930723e-05, + "loss": 2.4408, + "step": 5828 + }, + { + "epoch": 0.391262038186638, + "grad_norm": 3.848027229309082, + "learning_rate": 9.297617733507291e-05, + "loss": 2.9918, + "step": 5830 + }, + { + "epoch": 0.39139626187040705, + "grad_norm": 7.309458255767822, + "learning_rate": 9.297062157029872e-05, + "loss": 2.5915, + "step": 5832 + }, + { + "epoch": 0.391530485554176, + "grad_norm": 4.231155872344971, + "learning_rate": 9.296506377524716e-05, + "loss": 2.8185, + "step": 5834 + }, + { + "epoch": 0.39166470923794505, + "grad_norm": 4.240152359008789, + "learning_rate": 9.295950395018084e-05, + "loss": 2.9408, + "step": 5836 + }, + { + "epoch": 0.391798932921714, + "grad_norm": 4.538527965545654, + "learning_rate": 9.295394209536242e-05, + "loss": 3.0459, + "step": 5838 + }, + { + "epoch": 0.39193315660548306, + "grad_norm": 4.03680944442749, + "learning_rate": 9.29483782110547e-05, + "loss": 2.8519, + "step": 5840 + }, + { + "epoch": 0.39206738028925203, + "grad_norm": 4.715655326843262, + "learning_rate": 9.294281229752057e-05, + "loss": 2.6281, + "step": 5842 + }, + { + "epoch": 0.39220160397302106, + "grad_norm": 4.012216091156006, + "learning_rate": 9.2937244355023e-05, + "loss": 2.7697, + "step": 5844 + }, + { + "epoch": 0.39233582765679004, + "grad_norm": 4.309658527374268, + "learning_rate": 9.293167438382506e-05, + "loss": 2.8553, + "step": 5846 + }, + { + "epoch": 0.39247005134055907, + "grad_norm": 4.805260181427002, + "learning_rate": 9.292610238418992e-05, + "loss": 2.6743, + "step": 5848 + }, + { + "epoch": 0.39260427502432804, + "grad_norm": 4.281696319580078, + "learning_rate": 9.292052835638088e-05, + "loss": 3.0242, + "step": 5850 + }, + { + "epoch": 0.392738498708097, + "grad_norm": 5.395972728729248, + "learning_rate": 9.291495230066125e-05, + "loss": 2.6847, + "step": 5852 + }, + { + "epoch": 0.39287272239186605, + "grad_norm": 4.76246976852417, + "learning_rate": 9.290937421729454e-05, + "loss": 2.4489, + "step": 5854 + }, + { + "epoch": 0.393006946075635, + "grad_norm": 5.166550159454346, + "learning_rate": 9.290379410654425e-05, + "loss": 2.708, + "step": 5856 + }, + { + "epoch": 0.39314116975940405, + "grad_norm": 5.182868480682373, + "learning_rate": 9.289821196867405e-05, + "loss": 2.8996, + "step": 5858 + }, + { + "epoch": 0.393275393443173, + "grad_norm": 6.137669086456299, + "learning_rate": 9.289262780394772e-05, + "loss": 2.6759, + "step": 5860 + }, + { + "epoch": 0.39340961712694206, + "grad_norm": 4.473116874694824, + "learning_rate": 9.288704161262904e-05, + "loss": 2.6622, + "step": 5862 + }, + { + "epoch": 0.39354384081071103, + "grad_norm": 8.986534118652344, + "learning_rate": 9.2881453394982e-05, + "loss": 2.6754, + "step": 5864 + }, + { + "epoch": 0.39367806449448006, + "grad_norm": 6.508819103240967, + "learning_rate": 9.28758631512706e-05, + "loss": 2.7252, + "step": 5866 + }, + { + "epoch": 0.39381228817824904, + "grad_norm": 4.679440498352051, + "learning_rate": 9.287027088175898e-05, + "loss": 2.8259, + "step": 5868 + }, + { + "epoch": 0.39394651186201807, + "grad_norm": 4.634162425994873, + "learning_rate": 9.286467658671136e-05, + "loss": 2.6362, + "step": 5870 + }, + { + "epoch": 0.39408073554578704, + "grad_norm": 4.989544868469238, + "learning_rate": 9.285908026639207e-05, + "loss": 2.8164, + "step": 5872 + }, + { + "epoch": 0.3942149592295561, + "grad_norm": 4.820374011993408, + "learning_rate": 9.285348192106551e-05, + "loss": 3.0088, + "step": 5874 + }, + { + "epoch": 0.39434918291332505, + "grad_norm": 4.143151760101318, + "learning_rate": 9.28478815509962e-05, + "loss": 2.6469, + "step": 5876 + }, + { + "epoch": 0.3944834065970941, + "grad_norm": 3.816375970840454, + "learning_rate": 9.284227915644872e-05, + "loss": 2.8506, + "step": 5878 + }, + { + "epoch": 0.39461763028086305, + "grad_norm": 4.560688018798828, + "learning_rate": 9.283667473768782e-05, + "loss": 2.6398, + "step": 5880 + }, + { + "epoch": 0.3947518539646321, + "grad_norm": 5.178072452545166, + "learning_rate": 9.283106829497828e-05, + "loss": 2.9165, + "step": 5882 + }, + { + "epoch": 0.39488607764840106, + "grad_norm": 5.071436405181885, + "learning_rate": 9.282545982858496e-05, + "loss": 2.6345, + "step": 5884 + }, + { + "epoch": 0.3950203013321701, + "grad_norm": 3.9842870235443115, + "learning_rate": 9.28198493387729e-05, + "loss": 2.7103, + "step": 5886 + }, + { + "epoch": 0.39515452501593906, + "grad_norm": 4.58811616897583, + "learning_rate": 9.281423682580714e-05, + "loss": 2.9151, + "step": 5888 + }, + { + "epoch": 0.39528874869970804, + "grad_norm": 4.418056011199951, + "learning_rate": 9.280862228995291e-05, + "loss": 2.9551, + "step": 5890 + }, + { + "epoch": 0.39542297238347707, + "grad_norm": 4.349847316741943, + "learning_rate": 9.280300573147542e-05, + "loss": 2.6025, + "step": 5892 + }, + { + "epoch": 0.39555719606724604, + "grad_norm": 4.248231410980225, + "learning_rate": 9.27973871506401e-05, + "loss": 2.8111, + "step": 5894 + }, + { + "epoch": 0.39569141975101507, + "grad_norm": 3.9297103881835938, + "learning_rate": 9.27917665477124e-05, + "loss": 2.7148, + "step": 5896 + }, + { + "epoch": 0.39582564343478405, + "grad_norm": 3.9927327632904053, + "learning_rate": 9.278614392295786e-05, + "loss": 2.7066, + "step": 5898 + }, + { + "epoch": 0.3959598671185531, + "grad_norm": 4.586557388305664, + "learning_rate": 9.278051927664217e-05, + "loss": 2.4255, + "step": 5900 + }, + { + "epoch": 0.39609409080232205, + "grad_norm": 4.964137077331543, + "learning_rate": 9.277489260903104e-05, + "loss": 2.7669, + "step": 5902 + }, + { + "epoch": 0.3962283144860911, + "grad_norm": 4.071866512298584, + "learning_rate": 9.276926392039038e-05, + "loss": 2.7627, + "step": 5904 + }, + { + "epoch": 0.39636253816986006, + "grad_norm": 4.686675071716309, + "learning_rate": 9.276363321098609e-05, + "loss": 2.7581, + "step": 5906 + }, + { + "epoch": 0.3964967618536291, + "grad_norm": 4.247570514678955, + "learning_rate": 9.275800048108423e-05, + "loss": 2.697, + "step": 5908 + }, + { + "epoch": 0.39663098553739806, + "grad_norm": 4.465463161468506, + "learning_rate": 9.27523657309509e-05, + "loss": 2.8186, + "step": 5910 + }, + { + "epoch": 0.3967652092211671, + "grad_norm": 4.53939151763916, + "learning_rate": 9.27467289608524e-05, + "loss": 2.7176, + "step": 5912 + }, + { + "epoch": 0.39689943290493607, + "grad_norm": 4.430271625518799, + "learning_rate": 9.274109017105497e-05, + "loss": 2.628, + "step": 5914 + }, + { + "epoch": 0.3970336565887051, + "grad_norm": 3.9117445945739746, + "learning_rate": 9.27354493618251e-05, + "loss": 2.7225, + "step": 5916 + }, + { + "epoch": 0.39716788027247407, + "grad_norm": 4.276066780090332, + "learning_rate": 9.272980653342929e-05, + "loss": 2.6653, + "step": 5918 + }, + { + "epoch": 0.3973021039562431, + "grad_norm": 3.9940185546875, + "learning_rate": 9.272416168613414e-05, + "loss": 2.5093, + "step": 5920 + }, + { + "epoch": 0.3974363276400121, + "grad_norm": 4.22068452835083, + "learning_rate": 9.271851482020635e-05, + "loss": 2.6847, + "step": 5922 + }, + { + "epoch": 0.3975705513237811, + "grad_norm": 4.614835262298584, + "learning_rate": 9.271286593591275e-05, + "loss": 2.8607, + "step": 5924 + }, + { + "epoch": 0.3977047750075501, + "grad_norm": 4.325474262237549, + "learning_rate": 9.270721503352022e-05, + "loss": 2.9162, + "step": 5926 + }, + { + "epoch": 0.39783899869131906, + "grad_norm": 5.324969291687012, + "learning_rate": 9.270156211329578e-05, + "loss": 2.7407, + "step": 5928 + }, + { + "epoch": 0.3979732223750881, + "grad_norm": 5.9494194984436035, + "learning_rate": 9.269590717550647e-05, + "loss": 2.6137, + "step": 5930 + }, + { + "epoch": 0.39810744605885706, + "grad_norm": 4.551085948944092, + "learning_rate": 9.269025022041953e-05, + "loss": 2.7456, + "step": 5932 + }, + { + "epoch": 0.3982416697426261, + "grad_norm": 5.164455890655518, + "learning_rate": 9.268459124830218e-05, + "loss": 2.7276, + "step": 5934 + }, + { + "epoch": 0.39837589342639507, + "grad_norm": 4.128621578216553, + "learning_rate": 9.267893025942186e-05, + "loss": 2.6942, + "step": 5936 + }, + { + "epoch": 0.3985101171101641, + "grad_norm": 4.4194207191467285, + "learning_rate": 9.267326725404599e-05, + "loss": 2.8466, + "step": 5938 + }, + { + "epoch": 0.39864434079393307, + "grad_norm": 4.274546146392822, + "learning_rate": 9.266760223244218e-05, + "loss": 2.6496, + "step": 5940 + }, + { + "epoch": 0.3987785644777021, + "grad_norm": 4.288792133331299, + "learning_rate": 9.266193519487805e-05, + "loss": 2.7863, + "step": 5942 + }, + { + "epoch": 0.3989127881614711, + "grad_norm": 4.364317893981934, + "learning_rate": 9.265626614162137e-05, + "loss": 2.7845, + "step": 5944 + }, + { + "epoch": 0.3990470118452401, + "grad_norm": 5.46148681640625, + "learning_rate": 9.265059507294001e-05, + "loss": 2.9042, + "step": 5946 + }, + { + "epoch": 0.3991812355290091, + "grad_norm": 4.8145928382873535, + "learning_rate": 9.264492198910189e-05, + "loss": 2.8279, + "step": 5948 + }, + { + "epoch": 0.3993154592127781, + "grad_norm": 5.112297058105469, + "learning_rate": 9.263924689037505e-05, + "loss": 2.6142, + "step": 5950 + }, + { + "epoch": 0.3994496828965471, + "grad_norm": 4.367063999176025, + "learning_rate": 9.263356977702766e-05, + "loss": 2.8228, + "step": 5952 + }, + { + "epoch": 0.3995839065803161, + "grad_norm": 4.239729404449463, + "learning_rate": 9.262789064932794e-05, + "loss": 2.6518, + "step": 5954 + }, + { + "epoch": 0.3997181302640851, + "grad_norm": 4.603468418121338, + "learning_rate": 9.262220950754419e-05, + "loss": 2.6897, + "step": 5956 + }, + { + "epoch": 0.3998523539478541, + "grad_norm": 4.360147953033447, + "learning_rate": 9.261652635194487e-05, + "loss": 2.7835, + "step": 5958 + }, + { + "epoch": 0.3999865776316231, + "grad_norm": 3.91550350189209, + "learning_rate": 9.261084118279847e-05, + "loss": 2.7811, + "step": 5960 + }, + { + "epoch": 0.4001208013153921, + "grad_norm": 3.762937545776367, + "learning_rate": 9.260515400037362e-05, + "loss": 2.7176, + "step": 5962 + }, + { + "epoch": 0.4002550249991611, + "grad_norm": 4.352243423461914, + "learning_rate": 9.259946480493902e-05, + "loss": 2.7409, + "step": 5964 + }, + { + "epoch": 0.4003892486829301, + "grad_norm": 4.552089214324951, + "learning_rate": 9.259377359676348e-05, + "loss": 2.9776, + "step": 5966 + }, + { + "epoch": 0.4005234723666991, + "grad_norm": 4.194084167480469, + "learning_rate": 9.25880803761159e-05, + "loss": 2.3633, + "step": 5968 + }, + { + "epoch": 0.4006576960504681, + "grad_norm": 4.258677005767822, + "learning_rate": 9.258238514326525e-05, + "loss": 2.4174, + "step": 5970 + }, + { + "epoch": 0.4007919197342371, + "grad_norm": 5.8834147453308105, + "learning_rate": 9.257668789848067e-05, + "loss": 2.6645, + "step": 5972 + }, + { + "epoch": 0.4009261434180061, + "grad_norm": 5.271775722503662, + "learning_rate": 9.257098864203128e-05, + "loss": 2.7409, + "step": 5974 + }, + { + "epoch": 0.4010603671017751, + "grad_norm": 4.356500625610352, + "learning_rate": 9.25652873741864e-05, + "loss": 2.6316, + "step": 5976 + }, + { + "epoch": 0.4011945907855441, + "grad_norm": 4.010976314544678, + "learning_rate": 9.255958409521538e-05, + "loss": 2.441, + "step": 5978 + }, + { + "epoch": 0.4013288144693131, + "grad_norm": 4.302550792694092, + "learning_rate": 9.25538788053877e-05, + "loss": 2.3741, + "step": 5980 + }, + { + "epoch": 0.4014630381530821, + "grad_norm": 4.270500659942627, + "learning_rate": 9.254817150497295e-05, + "loss": 2.6327, + "step": 5982 + }, + { + "epoch": 0.4015972618368511, + "grad_norm": 3.924363136291504, + "learning_rate": 9.254246219424075e-05, + "loss": 2.6893, + "step": 5984 + }, + { + "epoch": 0.4017314855206201, + "grad_norm": 4.30816650390625, + "learning_rate": 9.253675087346087e-05, + "loss": 2.5394, + "step": 5986 + }, + { + "epoch": 0.40186570920438913, + "grad_norm": 4.848104000091553, + "learning_rate": 9.253103754290315e-05, + "loss": 2.5762, + "step": 5988 + }, + { + "epoch": 0.4019999328881581, + "grad_norm": 4.170071125030518, + "learning_rate": 9.252532220283754e-05, + "loss": 2.4078, + "step": 5990 + }, + { + "epoch": 0.40213415657192714, + "grad_norm": 3.3773891925811768, + "learning_rate": 9.251960485353408e-05, + "loss": 2.6219, + "step": 5992 + }, + { + "epoch": 0.4022683802556961, + "grad_norm": 4.772397041320801, + "learning_rate": 9.251388549526292e-05, + "loss": 2.7285, + "step": 5994 + }, + { + "epoch": 0.40240260393946514, + "grad_norm": 4.530271530151367, + "learning_rate": 9.250816412829425e-05, + "loss": 2.9633, + "step": 5996 + }, + { + "epoch": 0.4025368276232341, + "grad_norm": 4.300594806671143, + "learning_rate": 9.250244075289843e-05, + "loss": 2.6042, + "step": 5998 + }, + { + "epoch": 0.40267105130700315, + "grad_norm": 4.331608772277832, + "learning_rate": 9.249671536934585e-05, + "loss": 2.6149, + "step": 6000 + }, + { + "epoch": 0.4028052749907721, + "grad_norm": 3.9991989135742188, + "learning_rate": 9.249098797790702e-05, + "loss": 2.595, + "step": 6002 + }, + { + "epoch": 0.4029394986745411, + "grad_norm": 3.954796075820923, + "learning_rate": 9.248525857885259e-05, + "loss": 2.3826, + "step": 6004 + }, + { + "epoch": 0.4030737223583101, + "grad_norm": 4.176741123199463, + "learning_rate": 9.247952717245321e-05, + "loss": 2.8648, + "step": 6006 + }, + { + "epoch": 0.4032079460420791, + "grad_norm": 10.131089210510254, + "learning_rate": 9.247379375897974e-05, + "loss": 2.9669, + "step": 6008 + }, + { + "epoch": 0.40334216972584813, + "grad_norm": 4.783845901489258, + "learning_rate": 9.2468058338703e-05, + "loss": 3.0577, + "step": 6010 + }, + { + "epoch": 0.4034763934096171, + "grad_norm": 5.06345796585083, + "learning_rate": 9.246232091189402e-05, + "loss": 2.7134, + "step": 6012 + }, + { + "epoch": 0.40361061709338614, + "grad_norm": 4.116209983825684, + "learning_rate": 9.245658147882388e-05, + "loss": 2.5298, + "step": 6014 + }, + { + "epoch": 0.4037448407771551, + "grad_norm": 4.584890365600586, + "learning_rate": 9.245084003976377e-05, + "loss": 2.913, + "step": 6016 + }, + { + "epoch": 0.40387906446092414, + "grad_norm": 3.9450948238372803, + "learning_rate": 9.244509659498493e-05, + "loss": 2.8173, + "step": 6018 + }, + { + "epoch": 0.4040132881446931, + "grad_norm": 3.854506254196167, + "learning_rate": 9.243935114475872e-05, + "loss": 2.6219, + "step": 6020 + }, + { + "epoch": 0.40414751182846215, + "grad_norm": 4.416873455047607, + "learning_rate": 9.243360368935666e-05, + "loss": 2.6194, + "step": 6022 + }, + { + "epoch": 0.4042817355122311, + "grad_norm": 5.134489059448242, + "learning_rate": 9.242785422905025e-05, + "loss": 2.6788, + "step": 6024 + }, + { + "epoch": 0.40441595919600015, + "grad_norm": 4.307210445404053, + "learning_rate": 9.242210276411115e-05, + "loss": 2.6628, + "step": 6026 + }, + { + "epoch": 0.4045501828797691, + "grad_norm": 4.462407112121582, + "learning_rate": 9.241634929481112e-05, + "loss": 2.7002, + "step": 6028 + }, + { + "epoch": 0.40468440656353816, + "grad_norm": 4.424509525299072, + "learning_rate": 9.2410593821422e-05, + "loss": 2.5719, + "step": 6030 + }, + { + "epoch": 0.40481863024730713, + "grad_norm": 4.921133995056152, + "learning_rate": 9.24048363442157e-05, + "loss": 2.6645, + "step": 6032 + }, + { + "epoch": 0.40495285393107616, + "grad_norm": 4.752201557159424, + "learning_rate": 9.239907686346429e-05, + "loss": 2.5966, + "step": 6034 + }, + { + "epoch": 0.40508707761484514, + "grad_norm": 4.548332691192627, + "learning_rate": 9.239331537943987e-05, + "loss": 2.5355, + "step": 6036 + }, + { + "epoch": 0.40522130129861417, + "grad_norm": 4.283113479614258, + "learning_rate": 9.238755189241466e-05, + "loss": 2.8698, + "step": 6038 + }, + { + "epoch": 0.40535552498238314, + "grad_norm": 3.8970086574554443, + "learning_rate": 9.238178640266096e-05, + "loss": 2.5767, + "step": 6040 + }, + { + "epoch": 0.4054897486661521, + "grad_norm": 3.9884033203125, + "learning_rate": 9.23760189104512e-05, + "loss": 2.5337, + "step": 6042 + }, + { + "epoch": 0.40562397234992115, + "grad_norm": 4.715690612792969, + "learning_rate": 9.237024941605789e-05, + "loss": 2.6969, + "step": 6044 + }, + { + "epoch": 0.4057581960336901, + "grad_norm": 5.577376365661621, + "learning_rate": 9.23644779197536e-05, + "loss": 2.6054, + "step": 6046 + }, + { + "epoch": 0.40589241971745915, + "grad_norm": 4.383141040802002, + "learning_rate": 9.235870442181104e-05, + "loss": 2.7219, + "step": 6048 + }, + { + "epoch": 0.4060266434012281, + "grad_norm": 4.7122392654418945, + "learning_rate": 9.235292892250298e-05, + "loss": 3.0281, + "step": 6050 + }, + { + "epoch": 0.40616086708499716, + "grad_norm": 4.64913272857666, + "learning_rate": 9.234715142210233e-05, + "loss": 2.5, + "step": 6052 + }, + { + "epoch": 0.40629509076876613, + "grad_norm": 4.033514976501465, + "learning_rate": 9.234137192088202e-05, + "loss": 2.5057, + "step": 6054 + }, + { + "epoch": 0.40642931445253516, + "grad_norm": 4.268181324005127, + "learning_rate": 9.233559041911517e-05, + "loss": 2.6734, + "step": 6056 + }, + { + "epoch": 0.40656353813630414, + "grad_norm": 4.861144542694092, + "learning_rate": 9.232980691707491e-05, + "loss": 2.4216, + "step": 6058 + }, + { + "epoch": 0.40669776182007317, + "grad_norm": 4.990074634552002, + "learning_rate": 9.232402141503452e-05, + "loss": 2.8837, + "step": 6060 + }, + { + "epoch": 0.40683198550384214, + "grad_norm": 4.339177131652832, + "learning_rate": 9.231823391326734e-05, + "loss": 2.5415, + "step": 6062 + }, + { + "epoch": 0.40696620918761117, + "grad_norm": 4.251555442810059, + "learning_rate": 9.231244441204683e-05, + "loss": 2.5901, + "step": 6064 + }, + { + "epoch": 0.40710043287138015, + "grad_norm": 4.391055583953857, + "learning_rate": 9.230665291164652e-05, + "loss": 2.769, + "step": 6066 + }, + { + "epoch": 0.4072346565551492, + "grad_norm": 4.436460971832275, + "learning_rate": 9.230085941234006e-05, + "loss": 2.8083, + "step": 6068 + }, + { + "epoch": 0.40736888023891815, + "grad_norm": 4.593411445617676, + "learning_rate": 9.229506391440115e-05, + "loss": 2.9014, + "step": 6070 + }, + { + "epoch": 0.4075031039226872, + "grad_norm": 4.514215469360352, + "learning_rate": 9.228926641810367e-05, + "loss": 2.6097, + "step": 6072 + }, + { + "epoch": 0.40763732760645616, + "grad_norm": 4.321610927581787, + "learning_rate": 9.22834669237215e-05, + "loss": 2.6554, + "step": 6074 + }, + { + "epoch": 0.4077715512902252, + "grad_norm": 3.9874820709228516, + "learning_rate": 9.227766543152864e-05, + "loss": 2.4764, + "step": 6076 + }, + { + "epoch": 0.40790577497399416, + "grad_norm": 4.242318630218506, + "learning_rate": 9.227186194179925e-05, + "loss": 2.6142, + "step": 6078 + }, + { + "epoch": 0.40803999865776314, + "grad_norm": 4.796761512756348, + "learning_rate": 9.22660564548075e-05, + "loss": 3.1161, + "step": 6080 + }, + { + "epoch": 0.40817422234153217, + "grad_norm": 5.005732536315918, + "learning_rate": 9.22602489708277e-05, + "loss": 2.4537, + "step": 6082 + }, + { + "epoch": 0.40830844602530114, + "grad_norm": 4.526998043060303, + "learning_rate": 9.225443949013424e-05, + "loss": 2.6166, + "step": 6084 + }, + { + "epoch": 0.40844266970907017, + "grad_norm": 7.221223831176758, + "learning_rate": 9.224862801300159e-05, + "loss": 2.8215, + "step": 6086 + }, + { + "epoch": 0.40857689339283915, + "grad_norm": 4.31970739364624, + "learning_rate": 9.224281453970436e-05, + "loss": 2.6501, + "step": 6088 + }, + { + "epoch": 0.4087111170766082, + "grad_norm": 4.30662202835083, + "learning_rate": 9.22369990705172e-05, + "loss": 2.5139, + "step": 6090 + }, + { + "epoch": 0.40884534076037715, + "grad_norm": 3.7805721759796143, + "learning_rate": 9.223118160571489e-05, + "loss": 2.5635, + "step": 6092 + }, + { + "epoch": 0.4089795644441462, + "grad_norm": 3.9343883991241455, + "learning_rate": 9.22253621455723e-05, + "loss": 2.5466, + "step": 6094 + }, + { + "epoch": 0.40911378812791516, + "grad_norm": 6.975900650024414, + "learning_rate": 9.221954069036438e-05, + "loss": 2.4712, + "step": 6096 + }, + { + "epoch": 0.4092480118116842, + "grad_norm": 3.880152702331543, + "learning_rate": 9.221371724036619e-05, + "loss": 2.7583, + "step": 6098 + }, + { + "epoch": 0.40938223549545316, + "grad_norm": 5.132516384124756, + "learning_rate": 9.220789179585286e-05, + "loss": 2.6575, + "step": 6100 + }, + { + "epoch": 0.4095164591792222, + "grad_norm": 4.046781063079834, + "learning_rate": 9.220206435709963e-05, + "loss": 2.7076, + "step": 6102 + }, + { + "epoch": 0.40965068286299117, + "grad_norm": 4.085521697998047, + "learning_rate": 9.219623492438185e-05, + "loss": 2.4099, + "step": 6104 + }, + { + "epoch": 0.4097849065467602, + "grad_norm": 5.025508403778076, + "learning_rate": 9.219040349797495e-05, + "loss": 2.8031, + "step": 6106 + }, + { + "epoch": 0.40991913023052917, + "grad_norm": 4.20902156829834, + "learning_rate": 9.218457007815447e-05, + "loss": 2.6411, + "step": 6108 + }, + { + "epoch": 0.4100533539142982, + "grad_norm": 4.620366096496582, + "learning_rate": 9.217873466519597e-05, + "loss": 2.9737, + "step": 6110 + }, + { + "epoch": 0.4101875775980672, + "grad_norm": 4.000386714935303, + "learning_rate": 9.217289725937521e-05, + "loss": 2.4043, + "step": 6112 + }, + { + "epoch": 0.4103218012818362, + "grad_norm": 4.5831379890441895, + "learning_rate": 9.2167057860968e-05, + "loss": 2.7419, + "step": 6114 + }, + { + "epoch": 0.4104560249656052, + "grad_norm": 4.877871513366699, + "learning_rate": 9.216121647025021e-05, + "loss": 3.1355, + "step": 6116 + }, + { + "epoch": 0.41059024864937416, + "grad_norm": 4.255468845367432, + "learning_rate": 9.215537308749784e-05, + "loss": 2.6124, + "step": 6118 + }, + { + "epoch": 0.4107244723331432, + "grad_norm": 4.50504207611084, + "learning_rate": 9.214952771298701e-05, + "loss": 2.7365, + "step": 6120 + }, + { + "epoch": 0.41085869601691216, + "grad_norm": 4.121947765350342, + "learning_rate": 9.214368034699387e-05, + "loss": 2.6513, + "step": 6122 + }, + { + "epoch": 0.4109929197006812, + "grad_norm": 4.089262962341309, + "learning_rate": 9.213783098979469e-05, + "loss": 2.7967, + "step": 6124 + }, + { + "epoch": 0.41112714338445017, + "grad_norm": 4.061792373657227, + "learning_rate": 9.213197964166587e-05, + "loss": 2.6969, + "step": 6126 + }, + { + "epoch": 0.4112613670682192, + "grad_norm": 4.236400127410889, + "learning_rate": 9.212612630288386e-05, + "loss": 2.6534, + "step": 6128 + }, + { + "epoch": 0.41139559075198817, + "grad_norm": 4.173422813415527, + "learning_rate": 9.212027097372522e-05, + "loss": 2.8354, + "step": 6130 + }, + { + "epoch": 0.4115298144357572, + "grad_norm": 4.337775230407715, + "learning_rate": 9.21144136544666e-05, + "loss": 2.6782, + "step": 6132 + }, + { + "epoch": 0.4116640381195262, + "grad_norm": 3.644167423248291, + "learning_rate": 9.210855434538477e-05, + "loss": 2.4178, + "step": 6134 + }, + { + "epoch": 0.4117982618032952, + "grad_norm": 4.925889015197754, + "learning_rate": 9.210269304675652e-05, + "loss": 2.8401, + "step": 6136 + }, + { + "epoch": 0.4119324854870642, + "grad_norm": 4.520608425140381, + "learning_rate": 9.209682975885882e-05, + "loss": 2.7393, + "step": 6138 + }, + { + "epoch": 0.4120667091708332, + "grad_norm": 4.5182108879089355, + "learning_rate": 9.209096448196872e-05, + "loss": 2.7167, + "step": 6140 + }, + { + "epoch": 0.4122009328546022, + "grad_norm": 4.787085056304932, + "learning_rate": 9.208509721636328e-05, + "loss": 2.6585, + "step": 6142 + }, + { + "epoch": 0.4123351565383712, + "grad_norm": 4.024620056152344, + "learning_rate": 9.207922796231977e-05, + "loss": 2.7648, + "step": 6144 + }, + { + "epoch": 0.4124693802221402, + "grad_norm": 4.7871904373168945, + "learning_rate": 9.20733567201155e-05, + "loss": 2.7329, + "step": 6146 + }, + { + "epoch": 0.4126036039059092, + "grad_norm": 4.047810077667236, + "learning_rate": 9.206748349002782e-05, + "loss": 2.6119, + "step": 6148 + }, + { + "epoch": 0.4127378275896782, + "grad_norm": 3.857339382171631, + "learning_rate": 9.20616082723343e-05, + "loss": 2.7145, + "step": 6150 + }, + { + "epoch": 0.4128720512734472, + "grad_norm": 4.447113513946533, + "learning_rate": 9.20557310673125e-05, + "loss": 2.8339, + "step": 6152 + }, + { + "epoch": 0.4130062749572162, + "grad_norm": 5.958827495574951, + "learning_rate": 9.20498518752401e-05, + "loss": 2.3957, + "step": 6154 + }, + { + "epoch": 0.4131404986409852, + "grad_norm": 3.4225313663482666, + "learning_rate": 9.204397069639486e-05, + "loss": 2.6393, + "step": 6156 + }, + { + "epoch": 0.4132747223247542, + "grad_norm": 4.0693440437316895, + "learning_rate": 9.203808753105471e-05, + "loss": 2.6933, + "step": 6158 + }, + { + "epoch": 0.4134089460085232, + "grad_norm": 4.4387688636779785, + "learning_rate": 9.203220237949758e-05, + "loss": 2.6958, + "step": 6160 + }, + { + "epoch": 0.4135431696922922, + "grad_norm": 4.160854339599609, + "learning_rate": 9.202631524200153e-05, + "loss": 2.2314, + "step": 6162 + }, + { + "epoch": 0.4136773933760612, + "grad_norm": 5.242966651916504, + "learning_rate": 9.202042611884475e-05, + "loss": 2.6834, + "step": 6164 + }, + { + "epoch": 0.4138116170598302, + "grad_norm": 3.5562796592712402, + "learning_rate": 9.201453501030546e-05, + "loss": 2.5367, + "step": 6166 + }, + { + "epoch": 0.4139458407435992, + "grad_norm": 4.497880935668945, + "learning_rate": 9.200864191666199e-05, + "loss": 2.5125, + "step": 6168 + }, + { + "epoch": 0.4140800644273682, + "grad_norm": 4.1113128662109375, + "learning_rate": 9.200274683819282e-05, + "loss": 2.5963, + "step": 6170 + }, + { + "epoch": 0.4142142881111372, + "grad_norm": 4.225329399108887, + "learning_rate": 9.199684977517645e-05, + "loss": 2.7167, + "step": 6172 + }, + { + "epoch": 0.4143485117949062, + "grad_norm": 5.921441555023193, + "learning_rate": 9.199095072789149e-05, + "loss": 2.8421, + "step": 6174 + }, + { + "epoch": 0.4144827354786752, + "grad_norm": 3.9740912914276123, + "learning_rate": 9.19850496966167e-05, + "loss": 2.5148, + "step": 6176 + }, + { + "epoch": 0.41461695916244423, + "grad_norm": 4.889580726623535, + "learning_rate": 9.197914668163085e-05, + "loss": 2.7107, + "step": 6178 + }, + { + "epoch": 0.4147511828462132, + "grad_norm": 5.635776042938232, + "learning_rate": 9.19732416832129e-05, + "loss": 2.7687, + "step": 6180 + }, + { + "epoch": 0.41488540652998224, + "grad_norm": 4.3424835205078125, + "learning_rate": 9.19673347016418e-05, + "loss": 2.3408, + "step": 6182 + }, + { + "epoch": 0.4150196302137512, + "grad_norm": 4.705587387084961, + "learning_rate": 9.196142573719666e-05, + "loss": 2.9642, + "step": 6184 + }, + { + "epoch": 0.41515385389752024, + "grad_norm": 4.102505207061768, + "learning_rate": 9.195551479015667e-05, + "loss": 2.5804, + "step": 6186 + }, + { + "epoch": 0.4152880775812892, + "grad_norm": 4.497350692749023, + "learning_rate": 9.19496018608011e-05, + "loss": 2.7508, + "step": 6188 + }, + { + "epoch": 0.41542230126505825, + "grad_norm": 4.154359817504883, + "learning_rate": 9.194368694940935e-05, + "loss": 2.7926, + "step": 6190 + }, + { + "epoch": 0.4155565249488272, + "grad_norm": 4.524932861328125, + "learning_rate": 9.193777005626086e-05, + "loss": 2.7, + "step": 6192 + }, + { + "epoch": 0.4156907486325962, + "grad_norm": 4.621452808380127, + "learning_rate": 9.193185118163521e-05, + "loss": 2.5041, + "step": 6194 + }, + { + "epoch": 0.4158249723163652, + "grad_norm": 4.308021068572998, + "learning_rate": 9.192593032581203e-05, + "loss": 2.776, + "step": 6196 + }, + { + "epoch": 0.4159591960001342, + "grad_norm": 3.980311870574951, + "learning_rate": 9.19200074890711e-05, + "loss": 2.6878, + "step": 6198 + }, + { + "epoch": 0.41609341968390323, + "grad_norm": 4.921126842498779, + "learning_rate": 9.191408267169226e-05, + "loss": 2.8516, + "step": 6200 + }, + { + "epoch": 0.4162276433676722, + "grad_norm": 3.763077974319458, + "learning_rate": 9.19081558739554e-05, + "loss": 2.5387, + "step": 6202 + }, + { + "epoch": 0.41636186705144124, + "grad_norm": 6.35998010635376, + "learning_rate": 9.190222709614061e-05, + "loss": 2.5482, + "step": 6204 + }, + { + "epoch": 0.4164960907352102, + "grad_norm": 5.924992084503174, + "learning_rate": 9.189629633852799e-05, + "loss": 2.709, + "step": 6206 + }, + { + "epoch": 0.41663031441897924, + "grad_norm": 4.4666008949279785, + "learning_rate": 9.189036360139773e-05, + "loss": 2.5562, + "step": 6208 + }, + { + "epoch": 0.4167645381027482, + "grad_norm": 4.742714881896973, + "learning_rate": 9.188442888503018e-05, + "loss": 2.6291, + "step": 6210 + }, + { + "epoch": 0.41689876178651725, + "grad_norm": 5.034817218780518, + "learning_rate": 9.187849218970572e-05, + "loss": 2.9213, + "step": 6212 + }, + { + "epoch": 0.4170329854702862, + "grad_norm": 4.200456619262695, + "learning_rate": 9.187255351570487e-05, + "loss": 2.5703, + "step": 6214 + }, + { + "epoch": 0.41716720915405525, + "grad_norm": 9.1509428024292, + "learning_rate": 9.18666128633082e-05, + "loss": 2.7172, + "step": 6216 + }, + { + "epoch": 0.4173014328378242, + "grad_norm": 4.7845458984375, + "learning_rate": 9.186067023279639e-05, + "loss": 2.8061, + "step": 6218 + }, + { + "epoch": 0.41743565652159326, + "grad_norm": 4.0122575759887695, + "learning_rate": 9.185472562445022e-05, + "loss": 2.6262, + "step": 6220 + }, + { + "epoch": 0.41756988020536223, + "grad_norm": 6.589612007141113, + "learning_rate": 9.184877903855058e-05, + "loss": 2.8272, + "step": 6222 + }, + { + "epoch": 0.41770410388913126, + "grad_norm": 4.486132621765137, + "learning_rate": 9.184283047537843e-05, + "loss": 2.5911, + "step": 6224 + }, + { + "epoch": 0.41783832757290024, + "grad_norm": 4.57509183883667, + "learning_rate": 9.18368799352148e-05, + "loss": 2.5289, + "step": 6226 + }, + { + "epoch": 0.41797255125666927, + "grad_norm": 5.151191234588623, + "learning_rate": 9.183092741834087e-05, + "loss": 2.7736, + "step": 6228 + }, + { + "epoch": 0.41810677494043824, + "grad_norm": 4.145803928375244, + "learning_rate": 9.182497292503789e-05, + "loss": 2.6167, + "step": 6230 + }, + { + "epoch": 0.4182409986242072, + "grad_norm": 4.389992713928223, + "learning_rate": 9.181901645558717e-05, + "loss": 2.5767, + "step": 6232 + }, + { + "epoch": 0.41837522230797625, + "grad_norm": 4.060533046722412, + "learning_rate": 9.181305801027015e-05, + "loss": 2.3937, + "step": 6234 + }, + { + "epoch": 0.4185094459917452, + "grad_norm": 5.16310977935791, + "learning_rate": 9.180709758936839e-05, + "loss": 2.5, + "step": 6236 + }, + { + "epoch": 0.41864366967551425, + "grad_norm": 6.350131511688232, + "learning_rate": 9.180113519316345e-05, + "loss": 2.9629, + "step": 6238 + }, + { + "epoch": 0.4187778933592832, + "grad_norm": 5.0895233154296875, + "learning_rate": 9.179517082193709e-05, + "loss": 2.7015, + "step": 6240 + }, + { + "epoch": 0.41891211704305226, + "grad_norm": 5.020917892456055, + "learning_rate": 9.178920447597108e-05, + "loss": 2.4973, + "step": 6242 + }, + { + "epoch": 0.41904634072682123, + "grad_norm": 5.3406572341918945, + "learning_rate": 9.178323615554733e-05, + "loss": 2.4534, + "step": 6244 + }, + { + "epoch": 0.41918056441059026, + "grad_norm": 3.8323302268981934, + "learning_rate": 9.177726586094785e-05, + "loss": 2.4021, + "step": 6246 + }, + { + "epoch": 0.41931478809435924, + "grad_norm": 6.14771032333374, + "learning_rate": 9.177129359245471e-05, + "loss": 2.6364, + "step": 6248 + }, + { + "epoch": 0.41944901177812827, + "grad_norm": 4.685268878936768, + "learning_rate": 9.176531935035009e-05, + "loss": 2.444, + "step": 6250 + }, + { + "epoch": 0.41958323546189724, + "grad_norm": 5.385211944580078, + "learning_rate": 9.175934313491625e-05, + "loss": 2.8177, + "step": 6252 + }, + { + "epoch": 0.41971745914566627, + "grad_norm": 4.64099645614624, + "learning_rate": 9.175336494643557e-05, + "loss": 2.8978, + "step": 6254 + }, + { + "epoch": 0.41985168282943525, + "grad_norm": 4.3928399085998535, + "learning_rate": 9.174738478519047e-05, + "loss": 2.4548, + "step": 6256 + }, + { + "epoch": 0.4199859065132043, + "grad_norm": 4.474331855773926, + "learning_rate": 9.174140265146356e-05, + "loss": 2.6339, + "step": 6258 + }, + { + "epoch": 0.42012013019697325, + "grad_norm": 5.734708309173584, + "learning_rate": 9.173541854553745e-05, + "loss": 2.6393, + "step": 6260 + }, + { + "epoch": 0.4202543538807423, + "grad_norm": 4.316348552703857, + "learning_rate": 9.172943246769489e-05, + "loss": 2.8166, + "step": 6262 + }, + { + "epoch": 0.42038857756451126, + "grad_norm": 3.9207425117492676, + "learning_rate": 9.17234444182187e-05, + "loss": 2.5672, + "step": 6264 + }, + { + "epoch": 0.42052280124828023, + "grad_norm": 4.109706401824951, + "learning_rate": 9.17174543973918e-05, + "loss": 2.5178, + "step": 6266 + }, + { + "epoch": 0.42065702493204926, + "grad_norm": 6.928643226623535, + "learning_rate": 9.171146240549722e-05, + "loss": 2.835, + "step": 6268 + }, + { + "epoch": 0.42079124861581824, + "grad_norm": 5.22552490234375, + "learning_rate": 9.170546844281807e-05, + "loss": 2.8837, + "step": 6270 + }, + { + "epoch": 0.42092547229958727, + "grad_norm": 4.671639442443848, + "learning_rate": 9.169947250963753e-05, + "loss": 2.6705, + "step": 6272 + }, + { + "epoch": 0.42105969598335624, + "grad_norm": 4.095703125, + "learning_rate": 9.169347460623892e-05, + "loss": 2.3746, + "step": 6274 + }, + { + "epoch": 0.42119391966712527, + "grad_norm": 5.061890602111816, + "learning_rate": 9.168747473290562e-05, + "loss": 2.9126, + "step": 6276 + }, + { + "epoch": 0.42132814335089425, + "grad_norm": 44.30119323730469, + "learning_rate": 9.168147288992112e-05, + "loss": 2.8764, + "step": 6278 + }, + { + "epoch": 0.4214623670346633, + "grad_norm": 4.376634120941162, + "learning_rate": 9.167546907756898e-05, + "loss": 2.7328, + "step": 6280 + }, + { + "epoch": 0.42159659071843225, + "grad_norm": 5.508876800537109, + "learning_rate": 9.166946329613288e-05, + "loss": 2.5956, + "step": 6282 + }, + { + "epoch": 0.4217308144022013, + "grad_norm": 3.944815158843994, + "learning_rate": 9.166345554589658e-05, + "loss": 2.3305, + "step": 6284 + }, + { + "epoch": 0.42186503808597026, + "grad_norm": 4.360310077667236, + "learning_rate": 9.165744582714393e-05, + "loss": 2.5057, + "step": 6286 + }, + { + "epoch": 0.4219992617697393, + "grad_norm": 4.8339643478393555, + "learning_rate": 9.165143414015889e-05, + "loss": 2.8526, + "step": 6288 + }, + { + "epoch": 0.42213348545350826, + "grad_norm": 4.663270950317383, + "learning_rate": 9.164542048522549e-05, + "loss": 2.4291, + "step": 6290 + }, + { + "epoch": 0.4222677091372773, + "grad_norm": 4.59062385559082, + "learning_rate": 9.163940486262785e-05, + "loss": 2.7683, + "step": 6292 + }, + { + "epoch": 0.42240193282104627, + "grad_norm": 4.1641526222229, + "learning_rate": 9.163338727265022e-05, + "loss": 2.5817, + "step": 6294 + }, + { + "epoch": 0.4225361565048153, + "grad_norm": 5.35437536239624, + "learning_rate": 9.162736771557692e-05, + "loss": 2.987, + "step": 6296 + }, + { + "epoch": 0.42267038018858427, + "grad_norm": 5.962706565856934, + "learning_rate": 9.162134619169233e-05, + "loss": 2.6481, + "step": 6298 + }, + { + "epoch": 0.4228046038723533, + "grad_norm": 4.411477088928223, + "learning_rate": 9.161532270128099e-05, + "loss": 2.7417, + "step": 6300 + }, + { + "epoch": 0.4229388275561223, + "grad_norm": 6.189830780029297, + "learning_rate": 9.16092972446275e-05, + "loss": 2.8017, + "step": 6302 + }, + { + "epoch": 0.42307305123989125, + "grad_norm": 4.4966840744018555, + "learning_rate": 9.160326982201652e-05, + "loss": 2.7329, + "step": 6304 + }, + { + "epoch": 0.4232072749236603, + "grad_norm": 4.5483503341674805, + "learning_rate": 9.159724043373284e-05, + "loss": 2.7011, + "step": 6306 + }, + { + "epoch": 0.42334149860742926, + "grad_norm": 4.229082107543945, + "learning_rate": 9.159120908006135e-05, + "loss": 2.5478, + "step": 6308 + }, + { + "epoch": 0.4234757222911983, + "grad_norm": 4.093816757202148, + "learning_rate": 9.158517576128705e-05, + "loss": 2.5086, + "step": 6310 + }, + { + "epoch": 0.42360994597496726, + "grad_norm": 4.416231155395508, + "learning_rate": 9.157914047769493e-05, + "loss": 2.8173, + "step": 6312 + }, + { + "epoch": 0.4237441696587363, + "grad_norm": 4.335710048675537, + "learning_rate": 9.15731032295702e-05, + "loss": 2.3931, + "step": 6314 + }, + { + "epoch": 0.42387839334250527, + "grad_norm": 4.770411491394043, + "learning_rate": 9.15670640171981e-05, + "loss": 2.7392, + "step": 6316 + }, + { + "epoch": 0.4240126170262743, + "grad_norm": 4.34832239151001, + "learning_rate": 9.156102284086394e-05, + "loss": 2.8575, + "step": 6318 + }, + { + "epoch": 0.42414684071004327, + "grad_norm": 10.871674537658691, + "learning_rate": 9.15549797008532e-05, + "loss": 2.5903, + "step": 6320 + }, + { + "epoch": 0.4242810643938123, + "grad_norm": 3.9406650066375732, + "learning_rate": 9.154893459745138e-05, + "loss": 2.4867, + "step": 6322 + }, + { + "epoch": 0.4244152880775813, + "grad_norm": 5.917113304138184, + "learning_rate": 9.154288753094408e-05, + "loss": 2.4241, + "step": 6324 + }, + { + "epoch": 0.4245495117613503, + "grad_norm": 5.126455783843994, + "learning_rate": 9.153683850161706e-05, + "loss": 2.8495, + "step": 6326 + }, + { + "epoch": 0.4246837354451193, + "grad_norm": 4.3944621086120605, + "learning_rate": 9.15307875097561e-05, + "loss": 2.6111, + "step": 6328 + }, + { + "epoch": 0.4248179591288883, + "grad_norm": 6.753971099853516, + "learning_rate": 9.152473455564708e-05, + "loss": 2.8723, + "step": 6330 + }, + { + "epoch": 0.4249521828126573, + "grad_norm": 4.258637428283691, + "learning_rate": 9.151867963957601e-05, + "loss": 2.6648, + "step": 6332 + }, + { + "epoch": 0.4250864064964263, + "grad_norm": 4.1473917961120605, + "learning_rate": 9.151262276182898e-05, + "loss": 2.6951, + "step": 6334 + }, + { + "epoch": 0.4252206301801953, + "grad_norm": 4.248603820800781, + "learning_rate": 9.150656392269215e-05, + "loss": 2.8352, + "step": 6336 + }, + { + "epoch": 0.4253548538639643, + "grad_norm": 4.686061382293701, + "learning_rate": 9.15005031224518e-05, + "loss": 2.7906, + "step": 6338 + }, + { + "epoch": 0.4254890775477333, + "grad_norm": 6.2163214683532715, + "learning_rate": 9.149444036139427e-05, + "loss": 2.6037, + "step": 6340 + }, + { + "epoch": 0.42562330123150227, + "grad_norm": 4.5133843421936035, + "learning_rate": 9.148837563980606e-05, + "loss": 2.6674, + "step": 6342 + }, + { + "epoch": 0.4257575249152713, + "grad_norm": 5.479149341583252, + "learning_rate": 9.148230895797366e-05, + "loss": 2.7208, + "step": 6344 + }, + { + "epoch": 0.4258917485990403, + "grad_norm": 4.975689888000488, + "learning_rate": 9.147624031618373e-05, + "loss": 2.5408, + "step": 6346 + }, + { + "epoch": 0.4260259722828093, + "grad_norm": 3.9083433151245117, + "learning_rate": 9.147016971472299e-05, + "loss": 2.5587, + "step": 6348 + }, + { + "epoch": 0.4261601959665783, + "grad_norm": 4.565997123718262, + "learning_rate": 9.146409715387832e-05, + "loss": 2.6963, + "step": 6350 + }, + { + "epoch": 0.4262944196503473, + "grad_norm": 4.239004135131836, + "learning_rate": 9.145802263393657e-05, + "loss": 2.4018, + "step": 6352 + }, + { + "epoch": 0.4264286433341163, + "grad_norm": 6.083830833435059, + "learning_rate": 9.145194615518477e-05, + "loss": 2.7329, + "step": 6354 + }, + { + "epoch": 0.4265628670178853, + "grad_norm": 3.928145170211792, + "learning_rate": 9.144586771791003e-05, + "loss": 2.4773, + "step": 6356 + }, + { + "epoch": 0.4266970907016543, + "grad_norm": 4.5316877365112305, + "learning_rate": 9.143978732239955e-05, + "loss": 2.8888, + "step": 6358 + }, + { + "epoch": 0.4268313143854233, + "grad_norm": 5.3037824630737305, + "learning_rate": 9.143370496894061e-05, + "loss": 2.872, + "step": 6360 + }, + { + "epoch": 0.4269655380691923, + "grad_norm": 7.860236167907715, + "learning_rate": 9.142762065782058e-05, + "loss": 2.6658, + "step": 6362 + }, + { + "epoch": 0.4270997617529613, + "grad_norm": 4.193946361541748, + "learning_rate": 9.142153438932693e-05, + "loss": 2.6577, + "step": 6364 + }, + { + "epoch": 0.4272339854367303, + "grad_norm": 3.849970579147339, + "learning_rate": 9.141544616374724e-05, + "loss": 2.6203, + "step": 6366 + }, + { + "epoch": 0.42736820912049933, + "grad_norm": 4.2176337242126465, + "learning_rate": 9.140935598136914e-05, + "loss": 2.7638, + "step": 6368 + }, + { + "epoch": 0.4275024328042683, + "grad_norm": 4.727967262268066, + "learning_rate": 9.140326384248042e-05, + "loss": 2.8057, + "step": 6370 + }, + { + "epoch": 0.42763665648803734, + "grad_norm": 4.96689510345459, + "learning_rate": 9.139716974736889e-05, + "loss": 2.7638, + "step": 6372 + }, + { + "epoch": 0.4277708801718063, + "grad_norm": 5.766088008880615, + "learning_rate": 9.13910736963225e-05, + "loss": 2.6752, + "step": 6374 + }, + { + "epoch": 0.42790510385557534, + "grad_norm": 4.663301467895508, + "learning_rate": 9.138497568962927e-05, + "loss": 2.6183, + "step": 6376 + }, + { + "epoch": 0.4280393275393443, + "grad_norm": 4.447956085205078, + "learning_rate": 9.137887572757732e-05, + "loss": 2.5537, + "step": 6378 + }, + { + "epoch": 0.4281735512231133, + "grad_norm": 7.009554386138916, + "learning_rate": 9.137277381045486e-05, + "loss": 2.2799, + "step": 6380 + }, + { + "epoch": 0.4283077749068823, + "grad_norm": 4.310940265655518, + "learning_rate": 9.136666993855018e-05, + "loss": 2.7128, + "step": 6382 + }, + { + "epoch": 0.4284419985906513, + "grad_norm": 4.0918450355529785, + "learning_rate": 9.13605641121517e-05, + "loss": 2.7671, + "step": 6384 + }, + { + "epoch": 0.4285762222744203, + "grad_norm": 4.550774574279785, + "learning_rate": 9.135445633154789e-05, + "loss": 2.7536, + "step": 6386 + }, + { + "epoch": 0.4287104459581893, + "grad_norm": 4.2373270988464355, + "learning_rate": 9.134834659702736e-05, + "loss": 2.9832, + "step": 6388 + }, + { + "epoch": 0.42884466964195833, + "grad_norm": 4.460054397583008, + "learning_rate": 9.134223490887875e-05, + "loss": 2.5986, + "step": 6390 + }, + { + "epoch": 0.4289788933257273, + "grad_norm": 4.287836074829102, + "learning_rate": 9.133612126739082e-05, + "loss": 2.7188, + "step": 6392 + }, + { + "epoch": 0.42911311700949634, + "grad_norm": 6.468466758728027, + "learning_rate": 9.133000567285245e-05, + "loss": 2.528, + "step": 6394 + }, + { + "epoch": 0.4292473406932653, + "grad_norm": 4.1939496994018555, + "learning_rate": 9.13238881255526e-05, + "loss": 2.9736, + "step": 6396 + }, + { + "epoch": 0.42938156437703434, + "grad_norm": 3.956188917160034, + "learning_rate": 9.131776862578027e-05, + "loss": 2.7896, + "step": 6398 + }, + { + "epoch": 0.4295157880608033, + "grad_norm": 4.563364505767822, + "learning_rate": 9.131164717382466e-05, + "loss": 3.1239, + "step": 6400 + }, + { + "epoch": 0.42965001174457235, + "grad_norm": 4.1328840255737305, + "learning_rate": 9.130552376997492e-05, + "loss": 2.4734, + "step": 6402 + }, + { + "epoch": 0.4297842354283413, + "grad_norm": 3.9669103622436523, + "learning_rate": 9.129939841452042e-05, + "loss": 2.6004, + "step": 6404 + }, + { + "epoch": 0.42991845911211035, + "grad_norm": 3.781691789627075, + "learning_rate": 9.129327110775056e-05, + "loss": 2.2185, + "step": 6406 + }, + { + "epoch": 0.4300526827958793, + "grad_norm": 4.150212287902832, + "learning_rate": 9.128714184995483e-05, + "loss": 2.4393, + "step": 6408 + }, + { + "epoch": 0.43018690647964836, + "grad_norm": 4.927125453948975, + "learning_rate": 9.128101064142285e-05, + "loss": 2.5648, + "step": 6410 + }, + { + "epoch": 0.43032113016341733, + "grad_norm": 5.423046588897705, + "learning_rate": 9.127487748244427e-05, + "loss": 2.8188, + "step": 6412 + }, + { + "epoch": 0.43045535384718636, + "grad_norm": 4.546079635620117, + "learning_rate": 9.126874237330891e-05, + "loss": 2.8708, + "step": 6414 + }, + { + "epoch": 0.43058957753095534, + "grad_norm": 4.018931865692139, + "learning_rate": 9.126260531430662e-05, + "loss": 2.6003, + "step": 6416 + }, + { + "epoch": 0.4307238012147243, + "grad_norm": 4.744691371917725, + "learning_rate": 9.125646630572737e-05, + "loss": 2.7246, + "step": 6418 + }, + { + "epoch": 0.43085802489849334, + "grad_norm": 4.1431565284729, + "learning_rate": 9.125032534786122e-05, + "loss": 2.5295, + "step": 6420 + }, + { + "epoch": 0.4309922485822623, + "grad_norm": 4.817676544189453, + "learning_rate": 9.12441824409983e-05, + "loss": 2.6693, + "step": 6422 + }, + { + "epoch": 0.43112647226603135, + "grad_norm": 3.685617208480835, + "learning_rate": 9.123803758542888e-05, + "loss": 2.3923, + "step": 6424 + }, + { + "epoch": 0.4312606959498003, + "grad_norm": 3.9089114665985107, + "learning_rate": 9.123189078144326e-05, + "loss": 2.4161, + "step": 6426 + }, + { + "epoch": 0.43139491963356935, + "grad_norm": 4.6746087074279785, + "learning_rate": 9.122574202933188e-05, + "loss": 2.8383, + "step": 6428 + }, + { + "epoch": 0.4315291433173383, + "grad_norm": 4.261181831359863, + "learning_rate": 9.121959132938527e-05, + "loss": 2.6123, + "step": 6430 + }, + { + "epoch": 0.43166336700110736, + "grad_norm": 3.7131378650665283, + "learning_rate": 9.1213438681894e-05, + "loss": 2.5137, + "step": 6432 + }, + { + "epoch": 0.43179759068487633, + "grad_norm": 3.4865028858184814, + "learning_rate": 9.120728408714882e-05, + "loss": 2.5043, + "step": 6434 + }, + { + "epoch": 0.43193181436864536, + "grad_norm": 4.125328063964844, + "learning_rate": 9.120112754544047e-05, + "loss": 2.6068, + "step": 6436 + }, + { + "epoch": 0.43206603805241434, + "grad_norm": 4.203118324279785, + "learning_rate": 9.119496905705989e-05, + "loss": 2.3767, + "step": 6438 + }, + { + "epoch": 0.43220026173618337, + "grad_norm": 4.893166542053223, + "learning_rate": 9.118880862229802e-05, + "loss": 2.8074, + "step": 6440 + }, + { + "epoch": 0.43233448541995234, + "grad_norm": 4.6311750411987305, + "learning_rate": 9.118264624144594e-05, + "loss": 2.6245, + "step": 6442 + }, + { + "epoch": 0.43246870910372137, + "grad_norm": 4.471560955047607, + "learning_rate": 9.11764819147948e-05, + "loss": 2.5908, + "step": 6444 + }, + { + "epoch": 0.43260293278749035, + "grad_norm": 4.3428263664245605, + "learning_rate": 9.117031564263584e-05, + "loss": 2.8904, + "step": 6446 + }, + { + "epoch": 0.4327371564712594, + "grad_norm": 4.013436794281006, + "learning_rate": 9.116414742526047e-05, + "loss": 2.6994, + "step": 6448 + }, + { + "epoch": 0.43287138015502835, + "grad_norm": 5.5789408683776855, + "learning_rate": 9.115797726296004e-05, + "loss": 2.7295, + "step": 6450 + }, + { + "epoch": 0.4330056038387974, + "grad_norm": 3.9403886795043945, + "learning_rate": 9.115180515602614e-05, + "loss": 2.4838, + "step": 6452 + }, + { + "epoch": 0.43313982752256636, + "grad_norm": 15.493501663208008, + "learning_rate": 9.114563110475036e-05, + "loss": 2.5905, + "step": 6454 + }, + { + "epoch": 0.43327405120633533, + "grad_norm": 3.833239793777466, + "learning_rate": 9.113945510942443e-05, + "loss": 2.7013, + "step": 6456 + }, + { + "epoch": 0.43340827489010436, + "grad_norm": 4.413332939147949, + "learning_rate": 9.113327717034013e-05, + "loss": 2.6676, + "step": 6458 + }, + { + "epoch": 0.43354249857387334, + "grad_norm": 3.9899258613586426, + "learning_rate": 9.112709728778937e-05, + "loss": 2.6315, + "step": 6460 + }, + { + "epoch": 0.43367672225764237, + "grad_norm": 5.115573883056641, + "learning_rate": 9.112091546206414e-05, + "loss": 2.37, + "step": 6462 + }, + { + "epoch": 0.43381094594141134, + "grad_norm": 4.203201770782471, + "learning_rate": 9.111473169345652e-05, + "loss": 2.7736, + "step": 6464 + }, + { + "epoch": 0.43394516962518037, + "grad_norm": 6.207315921783447, + "learning_rate": 9.110854598225867e-05, + "loss": 2.5405, + "step": 6466 + }, + { + "epoch": 0.43407939330894935, + "grad_norm": 4.178586006164551, + "learning_rate": 9.110235832876286e-05, + "loss": 2.6397, + "step": 6468 + }, + { + "epoch": 0.4342136169927184, + "grad_norm": 5.387241840362549, + "learning_rate": 9.109616873326144e-05, + "loss": 2.822, + "step": 6470 + }, + { + "epoch": 0.43434784067648735, + "grad_norm": 4.200450897216797, + "learning_rate": 9.108997719604687e-05, + "loss": 2.6894, + "step": 6472 + }, + { + "epoch": 0.4344820643602564, + "grad_norm": 3.8428444862365723, + "learning_rate": 9.108378371741167e-05, + "loss": 2.4065, + "step": 6474 + }, + { + "epoch": 0.43461628804402536, + "grad_norm": 3.638162136077881, + "learning_rate": 9.107758829764848e-05, + "loss": 2.4627, + "step": 6476 + }, + { + "epoch": 0.4347505117277944, + "grad_norm": 3.329681634902954, + "learning_rate": 9.107139093705001e-05, + "loss": 2.4447, + "step": 6478 + }, + { + "epoch": 0.43488473541156336, + "grad_norm": 4.292613506317139, + "learning_rate": 9.106519163590907e-05, + "loss": 2.9783, + "step": 6480 + }, + { + "epoch": 0.4350189590953324, + "grad_norm": 4.124265193939209, + "learning_rate": 9.10589903945186e-05, + "loss": 2.7321, + "step": 6482 + }, + { + "epoch": 0.43515318277910137, + "grad_norm": 3.864797353744507, + "learning_rate": 9.105278721317157e-05, + "loss": 2.5006, + "step": 6484 + }, + { + "epoch": 0.4352874064628704, + "grad_norm": 4.452638626098633, + "learning_rate": 9.104658209216108e-05, + "loss": 2.7607, + "step": 6486 + }, + { + "epoch": 0.43542163014663937, + "grad_norm": 4.6523566246032715, + "learning_rate": 9.104037503178028e-05, + "loss": 2.7419, + "step": 6488 + }, + { + "epoch": 0.4355558538304084, + "grad_norm": 5.29513692855835, + "learning_rate": 9.103416603232246e-05, + "loss": 2.7671, + "step": 6490 + }, + { + "epoch": 0.4356900775141774, + "grad_norm": 3.851041793823242, + "learning_rate": 9.102795509408099e-05, + "loss": 2.5037, + "step": 6492 + }, + { + "epoch": 0.43582430119794635, + "grad_norm": 4.577882289886475, + "learning_rate": 9.102174221734934e-05, + "loss": 2.8371, + "step": 6494 + }, + { + "epoch": 0.4359585248817154, + "grad_norm": 3.9537527561187744, + "learning_rate": 9.101552740242102e-05, + "loss": 2.4467, + "step": 6496 + }, + { + "epoch": 0.43609274856548436, + "grad_norm": 4.178625106811523, + "learning_rate": 9.100931064958968e-05, + "loss": 2.6085, + "step": 6498 + }, + { + "epoch": 0.4362269722492534, + "grad_norm": 4.015218734741211, + "learning_rate": 9.100309195914907e-05, + "loss": 2.5397, + "step": 6500 + }, + { + "epoch": 0.43636119593302236, + "grad_norm": 3.5989022254943848, + "learning_rate": 9.099687133139298e-05, + "loss": 2.3262, + "step": 6502 + }, + { + "epoch": 0.4364954196167914, + "grad_norm": 4.110573768615723, + "learning_rate": 9.099064876661533e-05, + "loss": 2.6493, + "step": 6504 + }, + { + "epoch": 0.43662964330056037, + "grad_norm": 4.963046073913574, + "learning_rate": 9.098442426511014e-05, + "loss": 2.7172, + "step": 6506 + }, + { + "epoch": 0.4367638669843294, + "grad_norm": 4.188169002532959, + "learning_rate": 9.09781978271715e-05, + "loss": 2.7326, + "step": 6508 + }, + { + "epoch": 0.43689809066809837, + "grad_norm": 3.790184259414673, + "learning_rate": 9.097196945309359e-05, + "loss": 2.5498, + "step": 6510 + }, + { + "epoch": 0.4370323143518674, + "grad_norm": 4.878396987915039, + "learning_rate": 9.096573914317068e-05, + "loss": 2.6773, + "step": 6512 + }, + { + "epoch": 0.4371665380356364, + "grad_norm": 4.108456611633301, + "learning_rate": 9.095950689769716e-05, + "loss": 2.7564, + "step": 6514 + }, + { + "epoch": 0.4373007617194054, + "grad_norm": 4.105154514312744, + "learning_rate": 9.095327271696749e-05, + "loss": 2.5453, + "step": 6516 + }, + { + "epoch": 0.4374349854031744, + "grad_norm": 6.492742538452148, + "learning_rate": 9.094703660127622e-05, + "loss": 2.6365, + "step": 6518 + }, + { + "epoch": 0.4375692090869434, + "grad_norm": 4.645877361297607, + "learning_rate": 9.094079855091797e-05, + "loss": 2.9546, + "step": 6520 + }, + { + "epoch": 0.4377034327707124, + "grad_norm": 4.500748634338379, + "learning_rate": 9.09345585661875e-05, + "loss": 2.5619, + "step": 6522 + }, + { + "epoch": 0.4378376564544814, + "grad_norm": 4.078969955444336, + "learning_rate": 9.092831664737964e-05, + "loss": 2.9366, + "step": 6524 + }, + { + "epoch": 0.4379718801382504, + "grad_norm": 4.2843451499938965, + "learning_rate": 9.092207279478929e-05, + "loss": 2.5992, + "step": 6526 + }, + { + "epoch": 0.4381061038220194, + "grad_norm": 3.9586129188537598, + "learning_rate": 9.091582700871148e-05, + "loss": 2.492, + "step": 6528 + }, + { + "epoch": 0.4382403275057884, + "grad_norm": 4.229062080383301, + "learning_rate": 9.090957928944129e-05, + "loss": 2.6626, + "step": 6530 + }, + { + "epoch": 0.43837455118955737, + "grad_norm": 4.400057792663574, + "learning_rate": 9.090332963727393e-05, + "loss": 2.737, + "step": 6532 + }, + { + "epoch": 0.4385087748733264, + "grad_norm": 4.599862575531006, + "learning_rate": 9.089707805250468e-05, + "loss": 2.6565, + "step": 6534 + }, + { + "epoch": 0.4386429985570954, + "grad_norm": 3.6752023696899414, + "learning_rate": 9.089082453542891e-05, + "loss": 2.6051, + "step": 6536 + }, + { + "epoch": 0.4387772222408644, + "grad_norm": 3.979030132293701, + "learning_rate": 9.088456908634209e-05, + "loss": 2.7021, + "step": 6538 + }, + { + "epoch": 0.4389114459246334, + "grad_norm": 4.347792148590088, + "learning_rate": 9.087831170553978e-05, + "loss": 2.5932, + "step": 6540 + }, + { + "epoch": 0.4390456696084024, + "grad_norm": 3.8072409629821777, + "learning_rate": 9.087205239331762e-05, + "loss": 2.5557, + "step": 6542 + }, + { + "epoch": 0.4391798932921714, + "grad_norm": 4.370699405670166, + "learning_rate": 9.086579114997136e-05, + "loss": 2.7811, + "step": 6544 + }, + { + "epoch": 0.4393141169759404, + "grad_norm": 4.128059387207031, + "learning_rate": 9.085952797579682e-05, + "loss": 2.8291, + "step": 6546 + }, + { + "epoch": 0.4394483406597094, + "grad_norm": 4.426694869995117, + "learning_rate": 9.085326287108995e-05, + "loss": 2.8633, + "step": 6548 + }, + { + "epoch": 0.4395825643434784, + "grad_norm": 4.578085422515869, + "learning_rate": 9.084699583614673e-05, + "loss": 2.8095, + "step": 6550 + }, + { + "epoch": 0.4397167880272474, + "grad_norm": 4.76617956161499, + "learning_rate": 9.084072687126327e-05, + "loss": 2.9523, + "step": 6552 + }, + { + "epoch": 0.4398510117110164, + "grad_norm": 3.9182968139648438, + "learning_rate": 9.083445597673578e-05, + "loss": 2.3865, + "step": 6554 + }, + { + "epoch": 0.4399852353947854, + "grad_norm": 4.2658257484436035, + "learning_rate": 9.082818315286055e-05, + "loss": 2.4603, + "step": 6556 + }, + { + "epoch": 0.44011945907855443, + "grad_norm": 19.688993453979492, + "learning_rate": 9.082190839993395e-05, + "loss": 2.4279, + "step": 6558 + }, + { + "epoch": 0.4402536827623234, + "grad_norm": 4.599429130554199, + "learning_rate": 9.081563171825245e-05, + "loss": 2.6318, + "step": 6560 + }, + { + "epoch": 0.44038790644609244, + "grad_norm": 5.2143754959106445, + "learning_rate": 9.08093531081126e-05, + "loss": 2.8037, + "step": 6562 + }, + { + "epoch": 0.4405221301298614, + "grad_norm": 4.374457359313965, + "learning_rate": 9.080307256981109e-05, + "loss": 2.5882, + "step": 6564 + }, + { + "epoch": 0.44065635381363044, + "grad_norm": 4.059348106384277, + "learning_rate": 9.079679010364461e-05, + "loss": 2.6768, + "step": 6566 + }, + { + "epoch": 0.4407905774973994, + "grad_norm": 3.738992691040039, + "learning_rate": 9.079050570991004e-05, + "loss": 2.4322, + "step": 6568 + }, + { + "epoch": 0.4409248011811684, + "grad_norm": 4.001020431518555, + "learning_rate": 9.078421938890426e-05, + "loss": 2.519, + "step": 6570 + }, + { + "epoch": 0.4410590248649374, + "grad_norm": 4.492617607116699, + "learning_rate": 9.077793114092435e-05, + "loss": 2.5848, + "step": 6572 + }, + { + "epoch": 0.4411932485487064, + "grad_norm": 4.844489097595215, + "learning_rate": 9.077164096626736e-05, + "loss": 2.8137, + "step": 6574 + }, + { + "epoch": 0.4413274722324754, + "grad_norm": 4.454866409301758, + "learning_rate": 9.07653488652305e-05, + "loss": 2.6514, + "step": 6576 + }, + { + "epoch": 0.4414616959162444, + "grad_norm": 6.936885356903076, + "learning_rate": 9.075905483811109e-05, + "loss": 2.7796, + "step": 6578 + }, + { + "epoch": 0.44159591960001343, + "grad_norm": 5.623012065887451, + "learning_rate": 9.075275888520647e-05, + "loss": 2.5, + "step": 6580 + }, + { + "epoch": 0.4417301432837824, + "grad_norm": 6.060022354125977, + "learning_rate": 9.074646100681413e-05, + "loss": 2.7845, + "step": 6582 + }, + { + "epoch": 0.44186436696755144, + "grad_norm": 4.07180643081665, + "learning_rate": 9.074016120323163e-05, + "loss": 2.4926, + "step": 6584 + }, + { + "epoch": 0.4419985906513204, + "grad_norm": 4.583629608154297, + "learning_rate": 9.073385947475664e-05, + "loss": 2.7696, + "step": 6586 + }, + { + "epoch": 0.44213281433508944, + "grad_norm": 5.631913661956787, + "learning_rate": 9.072755582168688e-05, + "loss": 2.6125, + "step": 6588 + }, + { + "epoch": 0.4422670380188584, + "grad_norm": 4.197836875915527, + "learning_rate": 9.07212502443202e-05, + "loss": 2.4635, + "step": 6590 + }, + { + "epoch": 0.44240126170262745, + "grad_norm": 5.654948711395264, + "learning_rate": 9.071494274295452e-05, + "loss": 2.5995, + "step": 6592 + }, + { + "epoch": 0.4425354853863964, + "grad_norm": 4.764624118804932, + "learning_rate": 9.070863331788785e-05, + "loss": 2.6811, + "step": 6594 + }, + { + "epoch": 0.44266970907016545, + "grad_norm": 4.810792446136475, + "learning_rate": 9.07023219694183e-05, + "loss": 2.6695, + "step": 6596 + }, + { + "epoch": 0.4428039327539344, + "grad_norm": 4.770355224609375, + "learning_rate": 9.06960086978441e-05, + "loss": 2.6653, + "step": 6598 + }, + { + "epoch": 0.44293815643770346, + "grad_norm": 4.356984615325928, + "learning_rate": 9.068969350346349e-05, + "loss": 2.7572, + "step": 6600 + }, + { + "epoch": 0.44307238012147243, + "grad_norm": 3.9566965103149414, + "learning_rate": 9.068337638657489e-05, + "loss": 2.4583, + "step": 6602 + }, + { + "epoch": 0.44320660380524146, + "grad_norm": 3.6533048152923584, + "learning_rate": 9.067705734747674e-05, + "loss": 2.4589, + "step": 6604 + }, + { + "epoch": 0.44334082748901044, + "grad_norm": 4.520003795623779, + "learning_rate": 9.067073638646763e-05, + "loss": 2.8761, + "step": 6606 + }, + { + "epoch": 0.4434750511727794, + "grad_norm": 4.033199310302734, + "learning_rate": 9.06644135038462e-05, + "loss": 2.5832, + "step": 6608 + }, + { + "epoch": 0.44360927485654844, + "grad_norm": 4.658405780792236, + "learning_rate": 9.06580886999112e-05, + "loss": 2.8035, + "step": 6610 + }, + { + "epoch": 0.4437434985403174, + "grad_norm": 4.566067695617676, + "learning_rate": 9.065176197496146e-05, + "loss": 2.7147, + "step": 6612 + }, + { + "epoch": 0.44387772222408645, + "grad_norm": 4.514727592468262, + "learning_rate": 9.064543332929589e-05, + "loss": 2.7506, + "step": 6614 + }, + { + "epoch": 0.4440119459078554, + "grad_norm": 4.587108612060547, + "learning_rate": 9.063910276321354e-05, + "loss": 2.6453, + "step": 6616 + }, + { + "epoch": 0.44414616959162445, + "grad_norm": 6.032138824462891, + "learning_rate": 9.063277027701349e-05, + "loss": 2.545, + "step": 6618 + }, + { + "epoch": 0.4442803932753934, + "grad_norm": 4.398451805114746, + "learning_rate": 9.062643587099495e-05, + "loss": 2.5099, + "step": 6620 + }, + { + "epoch": 0.44441461695916246, + "grad_norm": 4.60685396194458, + "learning_rate": 9.062009954545719e-05, + "loss": 2.3242, + "step": 6622 + }, + { + "epoch": 0.44454884064293143, + "grad_norm": 4.390097141265869, + "learning_rate": 9.061376130069961e-05, + "loss": 2.7472, + "step": 6624 + }, + { + "epoch": 0.44468306432670046, + "grad_norm": 4.587697505950928, + "learning_rate": 9.060742113702168e-05, + "loss": 2.552, + "step": 6626 + }, + { + "epoch": 0.44481728801046944, + "grad_norm": 5.473755359649658, + "learning_rate": 9.060107905472294e-05, + "loss": 2.815, + "step": 6628 + }, + { + "epoch": 0.44495151169423847, + "grad_norm": 7.815277576446533, + "learning_rate": 9.059473505410305e-05, + "loss": 2.6194, + "step": 6630 + }, + { + "epoch": 0.44508573537800744, + "grad_norm": 4.130092620849609, + "learning_rate": 9.058838913546178e-05, + "loss": 2.626, + "step": 6632 + }, + { + "epoch": 0.44521995906177647, + "grad_norm": 4.458771705627441, + "learning_rate": 9.058204129909891e-05, + "loss": 2.6919, + "step": 6634 + }, + { + "epoch": 0.44535418274554545, + "grad_norm": 4.6821746826171875, + "learning_rate": 9.05756915453144e-05, + "loss": 2.7655, + "step": 6636 + }, + { + "epoch": 0.4454884064293145, + "grad_norm": 6.82846736907959, + "learning_rate": 9.056933987440825e-05, + "loss": 2.9728, + "step": 6638 + }, + { + "epoch": 0.44562263011308345, + "grad_norm": 10.000629425048828, + "learning_rate": 9.056298628668056e-05, + "loss": 2.8112, + "step": 6640 + }, + { + "epoch": 0.4457568537968525, + "grad_norm": 4.687206745147705, + "learning_rate": 9.055663078243156e-05, + "loss": 2.5437, + "step": 6642 + }, + { + "epoch": 0.44589107748062146, + "grad_norm": 5.1216721534729, + "learning_rate": 9.055027336196146e-05, + "loss": 2.6745, + "step": 6644 + }, + { + "epoch": 0.44602530116439043, + "grad_norm": 4.584372520446777, + "learning_rate": 9.054391402557072e-05, + "loss": 2.5324, + "step": 6646 + }, + { + "epoch": 0.44615952484815946, + "grad_norm": 6.411717414855957, + "learning_rate": 9.053755277355976e-05, + "loss": 2.5254, + "step": 6648 + }, + { + "epoch": 0.44629374853192844, + "grad_norm": 5.526740074157715, + "learning_rate": 9.053118960622915e-05, + "loss": 2.858, + "step": 6650 + }, + { + "epoch": 0.44642797221569747, + "grad_norm": 4.861766815185547, + "learning_rate": 9.052482452387953e-05, + "loss": 2.3393, + "step": 6652 + }, + { + "epoch": 0.44656219589946644, + "grad_norm": 3.7491087913513184, + "learning_rate": 9.051845752681163e-05, + "loss": 2.566, + "step": 6654 + }, + { + "epoch": 0.44669641958323547, + "grad_norm": 4.369933128356934, + "learning_rate": 9.051208861532629e-05, + "loss": 2.7129, + "step": 6656 + }, + { + "epoch": 0.44683064326700445, + "grad_norm": 4.5769853591918945, + "learning_rate": 9.050571778972443e-05, + "loss": 2.923, + "step": 6658 + }, + { + "epoch": 0.4469648669507735, + "grad_norm": 4.643664360046387, + "learning_rate": 9.049934505030705e-05, + "loss": 2.4005, + "step": 6660 + }, + { + "epoch": 0.44709909063454245, + "grad_norm": 4.306925296783447, + "learning_rate": 9.049297039737528e-05, + "loss": 2.5411, + "step": 6662 + }, + { + "epoch": 0.4472333143183115, + "grad_norm": 4.559482097625732, + "learning_rate": 9.048659383123026e-05, + "loss": 2.5749, + "step": 6664 + }, + { + "epoch": 0.44736753800208046, + "grad_norm": 4.816279888153076, + "learning_rate": 9.04802153521733e-05, + "loss": 2.7913, + "step": 6666 + }, + { + "epoch": 0.4475017616858495, + "grad_norm": 5.0893096923828125, + "learning_rate": 9.047383496050576e-05, + "loss": 2.7953, + "step": 6668 + }, + { + "epoch": 0.44763598536961846, + "grad_norm": 4.2626051902771, + "learning_rate": 9.046745265652912e-05, + "loss": 2.7643, + "step": 6670 + }, + { + "epoch": 0.4477702090533875, + "grad_norm": 3.9255287647247314, + "learning_rate": 9.046106844054491e-05, + "loss": 2.532, + "step": 6672 + }, + { + "epoch": 0.44790443273715647, + "grad_norm": 4.304283618927002, + "learning_rate": 9.045468231285477e-05, + "loss": 2.5629, + "step": 6674 + }, + { + "epoch": 0.4480386564209255, + "grad_norm": 4.485901355743408, + "learning_rate": 9.044829427376046e-05, + "loss": 2.7834, + "step": 6676 + }, + { + "epoch": 0.44817288010469447, + "grad_norm": 6.120778560638428, + "learning_rate": 9.044190432356377e-05, + "loss": 2.5367, + "step": 6678 + }, + { + "epoch": 0.4483071037884635, + "grad_norm": 5.08696174621582, + "learning_rate": 9.043551246256664e-05, + "loss": 2.6025, + "step": 6680 + }, + { + "epoch": 0.4484413274722325, + "grad_norm": 4.3109540939331055, + "learning_rate": 9.042911869107105e-05, + "loss": 2.4308, + "step": 6682 + }, + { + "epoch": 0.44857555115600145, + "grad_norm": 4.394944190979004, + "learning_rate": 9.04227230093791e-05, + "loss": 2.6897, + "step": 6684 + }, + { + "epoch": 0.4487097748397705, + "grad_norm": 4.893490791320801, + "learning_rate": 9.041632541779298e-05, + "loss": 2.4432, + "step": 6686 + }, + { + "epoch": 0.44884399852353946, + "grad_norm": 3.7863852977752686, + "learning_rate": 9.040992591661495e-05, + "loss": 2.6138, + "step": 6688 + }, + { + "epoch": 0.4489782222073085, + "grad_norm": 4.083008289337158, + "learning_rate": 9.04035245061474e-05, + "loss": 2.7538, + "step": 6690 + }, + { + "epoch": 0.44911244589107746, + "grad_norm": 5.002460479736328, + "learning_rate": 9.039712118669276e-05, + "loss": 2.8411, + "step": 6692 + }, + { + "epoch": 0.4492466695748465, + "grad_norm": 4.351027965545654, + "learning_rate": 9.039071595855357e-05, + "loss": 2.8402, + "step": 6694 + }, + { + "epoch": 0.44938089325861547, + "grad_norm": 4.9319024085998535, + "learning_rate": 9.038430882203249e-05, + "loss": 2.6389, + "step": 6696 + }, + { + "epoch": 0.4495151169423845, + "grad_norm": 4.402589797973633, + "learning_rate": 9.037789977743223e-05, + "loss": 2.6949, + "step": 6698 + }, + { + "epoch": 0.44964934062615347, + "grad_norm": 3.931483745574951, + "learning_rate": 9.03714888250556e-05, + "loss": 2.7888, + "step": 6700 + }, + { + "epoch": 0.4497835643099225, + "grad_norm": 4.293480396270752, + "learning_rate": 9.036507596520551e-05, + "loss": 2.7274, + "step": 6702 + }, + { + "epoch": 0.4499177879936915, + "grad_norm": 4.882685661315918, + "learning_rate": 9.035866119818495e-05, + "loss": 2.5124, + "step": 6704 + }, + { + "epoch": 0.4500520116774605, + "grad_norm": 7.182562351226807, + "learning_rate": 9.035224452429703e-05, + "loss": 2.7153, + "step": 6706 + }, + { + "epoch": 0.4501862353612295, + "grad_norm": 4.290604114532471, + "learning_rate": 9.034582594384488e-05, + "loss": 2.5535, + "step": 6708 + }, + { + "epoch": 0.4503204590449985, + "grad_norm": 4.762977123260498, + "learning_rate": 9.033940545713182e-05, + "loss": 2.5706, + "step": 6710 + }, + { + "epoch": 0.4504546827287675, + "grad_norm": 4.55588960647583, + "learning_rate": 9.033298306446115e-05, + "loss": 2.4721, + "step": 6712 + }, + { + "epoch": 0.4505889064125365, + "grad_norm": 4.404652118682861, + "learning_rate": 9.032655876613636e-05, + "loss": 2.8203, + "step": 6714 + }, + { + "epoch": 0.4507231300963055, + "grad_norm": 4.846052169799805, + "learning_rate": 9.032013256246094e-05, + "loss": 2.625, + "step": 6716 + }, + { + "epoch": 0.4508573537800745, + "grad_norm": 4.408112049102783, + "learning_rate": 9.031370445373856e-05, + "loss": 2.752, + "step": 6718 + }, + { + "epoch": 0.4509915774638435, + "grad_norm": 5.196068286895752, + "learning_rate": 9.030727444027294e-05, + "loss": 2.5054, + "step": 6720 + }, + { + "epoch": 0.45112580114761247, + "grad_norm": 4.7484846115112305, + "learning_rate": 9.030084252236783e-05, + "loss": 2.6094, + "step": 6722 + }, + { + "epoch": 0.4512600248313815, + "grad_norm": 5.227911472320557, + "learning_rate": 9.029440870032718e-05, + "loss": 2.7575, + "step": 6724 + }, + { + "epoch": 0.4513942485151505, + "grad_norm": 4.22373104095459, + "learning_rate": 9.028797297445495e-05, + "loss": 2.3918, + "step": 6726 + }, + { + "epoch": 0.4515284721989195, + "grad_norm": 4.601612091064453, + "learning_rate": 9.028153534505522e-05, + "loss": 2.5987, + "step": 6728 + }, + { + "epoch": 0.4516626958826885, + "grad_norm": 3.9957544803619385, + "learning_rate": 9.027509581243214e-05, + "loss": 2.6174, + "step": 6730 + }, + { + "epoch": 0.4517969195664575, + "grad_norm": 7.065196514129639, + "learning_rate": 9.026865437688998e-05, + "loss": 2.6802, + "step": 6732 + }, + { + "epoch": 0.4519311432502265, + "grad_norm": 3.995753049850464, + "learning_rate": 9.026221103873312e-05, + "loss": 2.6517, + "step": 6734 + }, + { + "epoch": 0.4520653669339955, + "grad_norm": 5.514827251434326, + "learning_rate": 9.025576579826593e-05, + "loss": 2.6061, + "step": 6736 + }, + { + "epoch": 0.4521995906177645, + "grad_norm": 4.080893516540527, + "learning_rate": 9.024931865579296e-05, + "loss": 2.474, + "step": 6738 + }, + { + "epoch": 0.4523338143015335, + "grad_norm": 4.804905891418457, + "learning_rate": 9.024286961161885e-05, + "loss": 2.5522, + "step": 6740 + }, + { + "epoch": 0.4524680379853025, + "grad_norm": 6.218737602233887, + "learning_rate": 9.023641866604829e-05, + "loss": 2.7608, + "step": 6742 + }, + { + "epoch": 0.4526022616690715, + "grad_norm": 5.229629039764404, + "learning_rate": 9.022996581938605e-05, + "loss": 2.7859, + "step": 6744 + }, + { + "epoch": 0.4527364853528405, + "grad_norm": 3.769021987915039, + "learning_rate": 9.022351107193704e-05, + "loss": 2.5806, + "step": 6746 + }, + { + "epoch": 0.45287070903660953, + "grad_norm": 4.068995952606201, + "learning_rate": 9.021705442400623e-05, + "loss": 2.5174, + "step": 6748 + }, + { + "epoch": 0.4530049327203785, + "grad_norm": 3.8388078212738037, + "learning_rate": 9.021059587589869e-05, + "loss": 2.557, + "step": 6750 + }, + { + "epoch": 0.45313915640414754, + "grad_norm": 4.125687599182129, + "learning_rate": 9.020413542791955e-05, + "loss": 2.7312, + "step": 6752 + }, + { + "epoch": 0.4532733800879165, + "grad_norm": 3.9784047603607178, + "learning_rate": 9.019767308037407e-05, + "loss": 2.2627, + "step": 6754 + }, + { + "epoch": 0.45340760377168554, + "grad_norm": 6.201930046081543, + "learning_rate": 9.01912088335676e-05, + "loss": 2.7463, + "step": 6756 + }, + { + "epoch": 0.4535418274554545, + "grad_norm": 4.326287269592285, + "learning_rate": 9.018474268780553e-05, + "loss": 2.4359, + "step": 6758 + }, + { + "epoch": 0.4536760511392235, + "grad_norm": 4.2895731925964355, + "learning_rate": 9.017827464339338e-05, + "loss": 2.5211, + "step": 6760 + }, + { + "epoch": 0.4538102748229925, + "grad_norm": 5.394578456878662, + "learning_rate": 9.017180470063679e-05, + "loss": 2.7416, + "step": 6762 + }, + { + "epoch": 0.4539444985067615, + "grad_norm": 4.3471999168396, + "learning_rate": 9.01653328598414e-05, + "loss": 2.7005, + "step": 6764 + }, + { + "epoch": 0.4540787221905305, + "grad_norm": 4.809871673583984, + "learning_rate": 9.015885912131302e-05, + "loss": 2.8108, + "step": 6766 + }, + { + "epoch": 0.4542129458742995, + "grad_norm": 5.145501136779785, + "learning_rate": 9.015238348535751e-05, + "loss": 2.8414, + "step": 6768 + }, + { + "epoch": 0.45434716955806853, + "grad_norm": 4.633159160614014, + "learning_rate": 9.014590595228086e-05, + "loss": 2.6689, + "step": 6770 + }, + { + "epoch": 0.4544813932418375, + "grad_norm": 4.192873954772949, + "learning_rate": 9.013942652238908e-05, + "loss": 2.7034, + "step": 6772 + }, + { + "epoch": 0.45461561692560654, + "grad_norm": 4.1083269119262695, + "learning_rate": 9.01329451959883e-05, + "loss": 2.5779, + "step": 6774 + }, + { + "epoch": 0.4547498406093755, + "grad_norm": 5.247707366943359, + "learning_rate": 9.012646197338481e-05, + "loss": 2.6471, + "step": 6776 + }, + { + "epoch": 0.45488406429314454, + "grad_norm": 3.8821425437927246, + "learning_rate": 9.011997685488489e-05, + "loss": 2.5332, + "step": 6778 + }, + { + "epoch": 0.4550182879769135, + "grad_norm": 4.408045768737793, + "learning_rate": 9.011348984079496e-05, + "loss": 2.6481, + "step": 6780 + }, + { + "epoch": 0.45515251166068255, + "grad_norm": 4.207502841949463, + "learning_rate": 9.010700093142151e-05, + "loss": 2.4966, + "step": 6782 + }, + { + "epoch": 0.4552867353444515, + "grad_norm": 4.335634231567383, + "learning_rate": 9.010051012707114e-05, + "loss": 2.7953, + "step": 6784 + }, + { + "epoch": 0.45542095902822055, + "grad_norm": 4.272940158843994, + "learning_rate": 9.009401742805052e-05, + "loss": 2.6259, + "step": 6786 + }, + { + "epoch": 0.4555551827119895, + "grad_norm": 3.9337854385375977, + "learning_rate": 9.008752283466641e-05, + "loss": 2.5318, + "step": 6788 + }, + { + "epoch": 0.45568940639575856, + "grad_norm": 5.345792293548584, + "learning_rate": 9.008102634722568e-05, + "loss": 2.4717, + "step": 6790 + }, + { + "epoch": 0.45582363007952753, + "grad_norm": 3.8838188648223877, + "learning_rate": 9.007452796603526e-05, + "loss": 2.2141, + "step": 6792 + }, + { + "epoch": 0.45595785376329656, + "grad_norm": 4.516840934753418, + "learning_rate": 9.006802769140221e-05, + "loss": 2.7082, + "step": 6794 + }, + { + "epoch": 0.45609207744706554, + "grad_norm": 4.274863243103027, + "learning_rate": 9.006152552363363e-05, + "loss": 2.9694, + "step": 6796 + }, + { + "epoch": 0.4562263011308345, + "grad_norm": 4.761748790740967, + "learning_rate": 9.005502146303676e-05, + "loss": 2.3637, + "step": 6798 + }, + { + "epoch": 0.45636052481460354, + "grad_norm": 4.448204517364502, + "learning_rate": 9.004851550991888e-05, + "loss": 2.7428, + "step": 6800 + }, + { + "epoch": 0.4564947484983725, + "grad_norm": 4.481865406036377, + "learning_rate": 9.004200766458742e-05, + "loss": 2.5298, + "step": 6802 + }, + { + "epoch": 0.45662897218214155, + "grad_norm": 4.08547830581665, + "learning_rate": 9.00354979273498e-05, + "loss": 2.5185, + "step": 6804 + }, + { + "epoch": 0.4567631958659105, + "grad_norm": 4.783926010131836, + "learning_rate": 9.002898629851364e-05, + "loss": 2.4765, + "step": 6806 + }, + { + "epoch": 0.45689741954967955, + "grad_norm": 7.594779014587402, + "learning_rate": 9.00224727783866e-05, + "loss": 2.5663, + "step": 6808 + }, + { + "epoch": 0.4570316432334485, + "grad_norm": 4.276284694671631, + "learning_rate": 9.001595736727642e-05, + "loss": 2.8098, + "step": 6810 + }, + { + "epoch": 0.45716586691721756, + "grad_norm": 4.903249740600586, + "learning_rate": 9.000944006549095e-05, + "loss": 2.9752, + "step": 6812 + }, + { + "epoch": 0.45730009060098653, + "grad_norm": 4.316068172454834, + "learning_rate": 9.00029208733381e-05, + "loss": 2.6601, + "step": 6814 + }, + { + "epoch": 0.45743431428475556, + "grad_norm": 4.385031700134277, + "learning_rate": 8.99963997911259e-05, + "loss": 3.0387, + "step": 6816 + }, + { + "epoch": 0.45756853796852454, + "grad_norm": 4.358438491821289, + "learning_rate": 8.998987681916246e-05, + "loss": 2.5556, + "step": 6818 + }, + { + "epoch": 0.45770276165229357, + "grad_norm": 5.080533981323242, + "learning_rate": 8.998335195775599e-05, + "loss": 2.7031, + "step": 6820 + }, + { + "epoch": 0.45783698533606254, + "grad_norm": 4.4694061279296875, + "learning_rate": 8.997682520721476e-05, + "loss": 2.5452, + "step": 6822 + }, + { + "epoch": 0.45797120901983157, + "grad_norm": 5.221470832824707, + "learning_rate": 8.997029656784715e-05, + "loss": 2.6395, + "step": 6824 + }, + { + "epoch": 0.45810543270360055, + "grad_norm": 4.103508472442627, + "learning_rate": 8.996376603996161e-05, + "loss": 2.6415, + "step": 6826 + }, + { + "epoch": 0.4582396563873696, + "grad_norm": 4.024650573730469, + "learning_rate": 8.995723362386672e-05, + "loss": 2.5514, + "step": 6828 + }, + { + "epoch": 0.45837388007113855, + "grad_norm": 4.450990200042725, + "learning_rate": 8.995069931987113e-05, + "loss": 2.7797, + "step": 6830 + }, + { + "epoch": 0.4585081037549076, + "grad_norm": 4.203819751739502, + "learning_rate": 8.994416312828354e-05, + "loss": 2.4798, + "step": 6832 + }, + { + "epoch": 0.45864232743867656, + "grad_norm": 4.118832588195801, + "learning_rate": 8.993762504941277e-05, + "loss": 2.828, + "step": 6834 + }, + { + "epoch": 0.45877655112244553, + "grad_norm": 4.2487473487854, + "learning_rate": 8.993108508356779e-05, + "loss": 2.5813, + "step": 6836 + }, + { + "epoch": 0.45891077480621456, + "grad_norm": 4.102899074554443, + "learning_rate": 8.992454323105752e-05, + "loss": 2.9093, + "step": 6838 + }, + { + "epoch": 0.45904499848998354, + "grad_norm": 4.096856117248535, + "learning_rate": 8.991799949219112e-05, + "loss": 2.4868, + "step": 6840 + }, + { + "epoch": 0.45917922217375257, + "grad_norm": 4.39479923248291, + "learning_rate": 8.991145386727773e-05, + "loss": 2.581, + "step": 6842 + }, + { + "epoch": 0.45931344585752154, + "grad_norm": 4.10569953918457, + "learning_rate": 8.990490635662663e-05, + "loss": 2.6896, + "step": 6844 + }, + { + "epoch": 0.45944766954129057, + "grad_norm": 4.980190753936768, + "learning_rate": 8.989835696054718e-05, + "loss": 2.8446, + "step": 6846 + }, + { + "epoch": 0.45958189322505955, + "grad_norm": 4.979743957519531, + "learning_rate": 8.989180567934881e-05, + "loss": 2.8417, + "step": 6848 + }, + { + "epoch": 0.4597161169088286, + "grad_norm": 6.015244483947754, + "learning_rate": 8.988525251334106e-05, + "loss": 2.4785, + "step": 6850 + }, + { + "epoch": 0.45985034059259755, + "grad_norm": 4.301202774047852, + "learning_rate": 8.987869746283358e-05, + "loss": 2.6052, + "step": 6852 + }, + { + "epoch": 0.4599845642763666, + "grad_norm": 5.396670818328857, + "learning_rate": 8.987214052813604e-05, + "loss": 2.4915, + "step": 6854 + }, + { + "epoch": 0.46011878796013556, + "grad_norm": 4.186859607696533, + "learning_rate": 8.986558170955828e-05, + "loss": 2.6431, + "step": 6856 + }, + { + "epoch": 0.4602530116439046, + "grad_norm": 3.97515869140625, + "learning_rate": 8.985902100741018e-05, + "loss": 2.2817, + "step": 6858 + }, + { + "epoch": 0.46038723532767356, + "grad_norm": 3.857466220855713, + "learning_rate": 8.98524584220017e-05, + "loss": 2.7273, + "step": 6860 + }, + { + "epoch": 0.4605214590114426, + "grad_norm": 4.097855091094971, + "learning_rate": 8.984589395364294e-05, + "loss": 2.5774, + "step": 6862 + }, + { + "epoch": 0.46065568269521157, + "grad_norm": 4.678231239318848, + "learning_rate": 8.983932760264405e-05, + "loss": 2.6723, + "step": 6864 + }, + { + "epoch": 0.4607899063789806, + "grad_norm": 3.9446685314178467, + "learning_rate": 8.983275936931526e-05, + "loss": 2.8893, + "step": 6866 + }, + { + "epoch": 0.46092413006274957, + "grad_norm": 3.811579465866089, + "learning_rate": 8.982618925396691e-05, + "loss": 2.5396, + "step": 6868 + }, + { + "epoch": 0.46105835374651855, + "grad_norm": 4.469409942626953, + "learning_rate": 8.981961725690943e-05, + "loss": 2.5653, + "step": 6870 + }, + { + "epoch": 0.4611925774302876, + "grad_norm": 4.08332633972168, + "learning_rate": 8.981304337845337e-05, + "loss": 2.3335, + "step": 6872 + }, + { + "epoch": 0.46132680111405655, + "grad_norm": 4.423959732055664, + "learning_rate": 8.980646761890928e-05, + "loss": 2.6204, + "step": 6874 + }, + { + "epoch": 0.4614610247978256, + "grad_norm": 4.722589015960693, + "learning_rate": 8.979988997858785e-05, + "loss": 2.7507, + "step": 6876 + }, + { + "epoch": 0.46159524848159456, + "grad_norm": 4.000850677490234, + "learning_rate": 8.97933104577999e-05, + "loss": 2.665, + "step": 6878 + }, + { + "epoch": 0.4617294721653636, + "grad_norm": 4.3229146003723145, + "learning_rate": 8.978672905685629e-05, + "loss": 2.7495, + "step": 6880 + }, + { + "epoch": 0.46186369584913256, + "grad_norm": 4.127862930297852, + "learning_rate": 8.978014577606797e-05, + "loss": 2.7347, + "step": 6882 + }, + { + "epoch": 0.4619979195329016, + "grad_norm": 4.071578025817871, + "learning_rate": 8.977356061574597e-05, + "loss": 2.6125, + "step": 6884 + }, + { + "epoch": 0.46213214321667057, + "grad_norm": 6.170718669891357, + "learning_rate": 8.976697357620145e-05, + "loss": 2.665, + "step": 6886 + }, + { + "epoch": 0.4622663669004396, + "grad_norm": 4.352163314819336, + "learning_rate": 8.976038465774563e-05, + "loss": 2.4306, + "step": 6888 + }, + { + "epoch": 0.46240059058420857, + "grad_norm": 4.4100260734558105, + "learning_rate": 8.975379386068981e-05, + "loss": 2.847, + "step": 6890 + }, + { + "epoch": 0.4625348142679776, + "grad_norm": 3.9729838371276855, + "learning_rate": 8.974720118534541e-05, + "loss": 2.521, + "step": 6892 + }, + { + "epoch": 0.4626690379517466, + "grad_norm": 4.072219371795654, + "learning_rate": 8.974060663202392e-05, + "loss": 2.8013, + "step": 6894 + }, + { + "epoch": 0.4628032616355156, + "grad_norm": 4.898754119873047, + "learning_rate": 8.97340102010369e-05, + "loss": 2.8053, + "step": 6896 + }, + { + "epoch": 0.4629374853192846, + "grad_norm": 4.588500022888184, + "learning_rate": 8.972741189269605e-05, + "loss": 2.6472, + "step": 6898 + }, + { + "epoch": 0.4630717090030536, + "grad_norm": 5.003199100494385, + "learning_rate": 8.972081170731307e-05, + "loss": 2.5246, + "step": 6900 + }, + { + "epoch": 0.4632059326868226, + "grad_norm": 4.10089111328125, + "learning_rate": 8.971420964519988e-05, + "loss": 2.707, + "step": 6902 + }, + { + "epoch": 0.4633401563705916, + "grad_norm": 4.348474979400635, + "learning_rate": 8.970760570666839e-05, + "loss": 2.9159, + "step": 6904 + }, + { + "epoch": 0.4634743800543606, + "grad_norm": 3.7568275928497314, + "learning_rate": 8.970099989203058e-05, + "loss": 2.5911, + "step": 6906 + }, + { + "epoch": 0.46360860373812957, + "grad_norm": 4.679823398590088, + "learning_rate": 8.96943922015986e-05, + "loss": 2.7516, + "step": 6908 + }, + { + "epoch": 0.4637428274218986, + "grad_norm": 4.490511894226074, + "learning_rate": 8.968778263568465e-05, + "loss": 2.4443, + "step": 6910 + }, + { + "epoch": 0.46387705110566757, + "grad_norm": 4.38399076461792, + "learning_rate": 8.968117119460103e-05, + "loss": 2.9781, + "step": 6912 + }, + { + "epoch": 0.4640112747894366, + "grad_norm": 5.079002857208252, + "learning_rate": 8.967455787866007e-05, + "loss": 2.7158, + "step": 6914 + }, + { + "epoch": 0.4641454984732056, + "grad_norm": 4.453191757202148, + "learning_rate": 8.966794268817427e-05, + "loss": 2.627, + "step": 6916 + }, + { + "epoch": 0.4642797221569746, + "grad_norm": 4.414239883422852, + "learning_rate": 8.96613256234562e-05, + "loss": 2.8078, + "step": 6918 + }, + { + "epoch": 0.4644139458407436, + "grad_norm": 4.319516658782959, + "learning_rate": 8.965470668481848e-05, + "loss": 2.6808, + "step": 6920 + }, + { + "epoch": 0.4645481695245126, + "grad_norm": 4.981371879577637, + "learning_rate": 8.964808587257386e-05, + "loss": 2.5935, + "step": 6922 + }, + { + "epoch": 0.4646823932082816, + "grad_norm": 5.345224380493164, + "learning_rate": 8.964146318703512e-05, + "loss": 2.3121, + "step": 6924 + }, + { + "epoch": 0.4648166168920506, + "grad_norm": 4.819183349609375, + "learning_rate": 8.96348386285152e-05, + "loss": 2.81, + "step": 6926 + }, + { + "epoch": 0.4649508405758196, + "grad_norm": 3.991220474243164, + "learning_rate": 8.962821219732711e-05, + "loss": 2.5692, + "step": 6928 + }, + { + "epoch": 0.4650850642595886, + "grad_norm": 4.394068717956543, + "learning_rate": 8.96215838937839e-05, + "loss": 2.4979, + "step": 6930 + }, + { + "epoch": 0.4652192879433576, + "grad_norm": 4.551640510559082, + "learning_rate": 8.961495371819877e-05, + "loss": 2.8771, + "step": 6932 + }, + { + "epoch": 0.4653535116271266, + "grad_norm": 3.6698877811431885, + "learning_rate": 8.960832167088498e-05, + "loss": 2.5351, + "step": 6934 + }, + { + "epoch": 0.4654877353108956, + "grad_norm": 3.922799587249756, + "learning_rate": 8.960168775215588e-05, + "loss": 2.5333, + "step": 6936 + }, + { + "epoch": 0.46562195899466463, + "grad_norm": 4.217718124389648, + "learning_rate": 8.95950519623249e-05, + "loss": 2.5563, + "step": 6938 + }, + { + "epoch": 0.4657561826784336, + "grad_norm": 4.376163482666016, + "learning_rate": 8.95884143017056e-05, + "loss": 2.6018, + "step": 6940 + }, + { + "epoch": 0.46589040636220264, + "grad_norm": 3.6110143661499023, + "learning_rate": 8.958177477061154e-05, + "loss": 2.5155, + "step": 6942 + }, + { + "epoch": 0.4660246300459716, + "grad_norm": 4.253117084503174, + "learning_rate": 8.957513336935646e-05, + "loss": 2.7444, + "step": 6944 + }, + { + "epoch": 0.4661588537297406, + "grad_norm": 4.066983699798584, + "learning_rate": 8.956849009825417e-05, + "loss": 2.5445, + "step": 6946 + }, + { + "epoch": 0.4662930774135096, + "grad_norm": 4.551822185516357, + "learning_rate": 8.95618449576185e-05, + "loss": 2.9186, + "step": 6948 + }, + { + "epoch": 0.4664273010972786, + "grad_norm": 4.886963844299316, + "learning_rate": 8.955519794776348e-05, + "loss": 2.7368, + "step": 6950 + }, + { + "epoch": 0.4665615247810476, + "grad_norm": 4.153200149536133, + "learning_rate": 8.954854906900312e-05, + "loss": 2.5721, + "step": 6952 + }, + { + "epoch": 0.4666957484648166, + "grad_norm": 3.972158193588257, + "learning_rate": 8.954189832165159e-05, + "loss": 2.7321, + "step": 6954 + }, + { + "epoch": 0.4668299721485856, + "grad_norm": 4.347879886627197, + "learning_rate": 8.953524570602313e-05, + "loss": 2.8392, + "step": 6956 + }, + { + "epoch": 0.4669641958323546, + "grad_norm": 4.2411298751831055, + "learning_rate": 8.952859122243204e-05, + "loss": 2.7777, + "step": 6958 + }, + { + "epoch": 0.46709841951612363, + "grad_norm": 4.0564093589782715, + "learning_rate": 8.952193487119276e-05, + "loss": 2.4627, + "step": 6960 + }, + { + "epoch": 0.4672326431998926, + "grad_norm": 4.698480606079102, + "learning_rate": 8.951527665261976e-05, + "loss": 2.758, + "step": 6962 + }, + { + "epoch": 0.46736686688366164, + "grad_norm": 4.490565299987793, + "learning_rate": 8.950861656702764e-05, + "loss": 3.0621, + "step": 6964 + }, + { + "epoch": 0.4675010905674306, + "grad_norm": 4.246215343475342, + "learning_rate": 8.950195461473109e-05, + "loss": 2.6648, + "step": 6966 + }, + { + "epoch": 0.46763531425119964, + "grad_norm": 4.62151575088501, + "learning_rate": 8.949529079604485e-05, + "loss": 2.6217, + "step": 6968 + }, + { + "epoch": 0.4677695379349686, + "grad_norm": 4.33922815322876, + "learning_rate": 8.94886251112838e-05, + "loss": 2.7134, + "step": 6970 + }, + { + "epoch": 0.46790376161873765, + "grad_norm": 4.222580909729004, + "learning_rate": 8.948195756076285e-05, + "loss": 2.9432, + "step": 6972 + }, + { + "epoch": 0.4680379853025066, + "grad_norm": 4.332396984100342, + "learning_rate": 8.947528814479704e-05, + "loss": 2.4451, + "step": 6974 + }, + { + "epoch": 0.46817220898627565, + "grad_norm": 4.2198615074157715, + "learning_rate": 8.94686168637015e-05, + "loss": 2.6525, + "step": 6976 + }, + { + "epoch": 0.4683064326700446, + "grad_norm": 4.377411365509033, + "learning_rate": 8.946194371779142e-05, + "loss": 2.588, + "step": 6978 + }, + { + "epoch": 0.46844065635381366, + "grad_norm": 4.003425121307373, + "learning_rate": 8.94552687073821e-05, + "loss": 2.6873, + "step": 6980 + }, + { + "epoch": 0.46857488003758263, + "grad_norm": 4.475175380706787, + "learning_rate": 8.944859183278891e-05, + "loss": 2.5125, + "step": 6982 + }, + { + "epoch": 0.4687091037213516, + "grad_norm": 4.574103355407715, + "learning_rate": 8.944191309432735e-05, + "loss": 2.7465, + "step": 6984 + }, + { + "epoch": 0.46884332740512064, + "grad_norm": 4.210731029510498, + "learning_rate": 8.943523249231293e-05, + "loss": 2.696, + "step": 6986 + }, + { + "epoch": 0.4689775510888896, + "grad_norm": 4.213034152984619, + "learning_rate": 8.942855002706134e-05, + "loss": 2.7537, + "step": 6988 + }, + { + "epoch": 0.46911177477265864, + "grad_norm": 3.6739556789398193, + "learning_rate": 8.942186569888829e-05, + "loss": 2.4047, + "step": 6990 + }, + { + "epoch": 0.4692459984564276, + "grad_norm": 4.227363586425781, + "learning_rate": 8.94151795081096e-05, + "loss": 2.6766, + "step": 6992 + }, + { + "epoch": 0.46938022214019665, + "grad_norm": 4.360649108886719, + "learning_rate": 8.940849145504118e-05, + "loss": 2.7139, + "step": 6994 + }, + { + "epoch": 0.4695144458239656, + "grad_norm": 4.168656349182129, + "learning_rate": 8.940180153999904e-05, + "loss": 2.5623, + "step": 6996 + }, + { + "epoch": 0.46964866950773465, + "grad_norm": 4.466777801513672, + "learning_rate": 8.939510976329927e-05, + "loss": 2.7151, + "step": 6998 + }, + { + "epoch": 0.4697828931915036, + "grad_norm": 4.756308078765869, + "learning_rate": 8.938841612525801e-05, + "loss": 2.5756, + "step": 7000 + }, + { + "epoch": 0.46991711687527266, + "grad_norm": 4.931173324584961, + "learning_rate": 8.938172062619155e-05, + "loss": 2.5536, + "step": 7002 + }, + { + "epoch": 0.47005134055904163, + "grad_norm": 3.8782262802124023, + "learning_rate": 8.937502326641622e-05, + "loss": 3.1283, + "step": 7004 + }, + { + "epoch": 0.47018556424281066, + "grad_norm": 3.470226526260376, + "learning_rate": 8.936832404624848e-05, + "loss": 2.5945, + "step": 7006 + }, + { + "epoch": 0.47031978792657964, + "grad_norm": 5.099259376525879, + "learning_rate": 8.936162296600486e-05, + "loss": 2.7787, + "step": 7008 + }, + { + "epoch": 0.47045401161034867, + "grad_norm": 4.218259334564209, + "learning_rate": 8.935492002600194e-05, + "loss": 2.3241, + "step": 7010 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 5.012670040130615, + "learning_rate": 8.934821522655642e-05, + "loss": 2.8535, + "step": 7012 + }, + { + "epoch": 0.47072245897788667, + "grad_norm": 4.3207292556762695, + "learning_rate": 8.934150856798514e-05, + "loss": 2.4347, + "step": 7014 + }, + { + "epoch": 0.47085668266165565, + "grad_norm": 3.9788949489593506, + "learning_rate": 8.933480005060492e-05, + "loss": 2.8566, + "step": 7016 + }, + { + "epoch": 0.4709909063454247, + "grad_norm": 4.4518842697143555, + "learning_rate": 8.932808967473274e-05, + "loss": 2.565, + "step": 7018 + }, + { + "epoch": 0.47112513002919365, + "grad_norm": 3.775743007659912, + "learning_rate": 8.932137744068567e-05, + "loss": 2.4319, + "step": 7020 + }, + { + "epoch": 0.4712593537129626, + "grad_norm": 4.122305393218994, + "learning_rate": 8.931466334878085e-05, + "loss": 2.56, + "step": 7022 + }, + { + "epoch": 0.47139357739673166, + "grad_norm": 4.579588890075684, + "learning_rate": 8.930794739933547e-05, + "loss": 2.5523, + "step": 7024 + }, + { + "epoch": 0.47152780108050063, + "grad_norm": 4.340974807739258, + "learning_rate": 8.930122959266689e-05, + "loss": 2.5857, + "step": 7026 + }, + { + "epoch": 0.47166202476426966, + "grad_norm": 3.930934190750122, + "learning_rate": 8.929450992909248e-05, + "loss": 2.8183, + "step": 7028 + }, + { + "epoch": 0.47179624844803864, + "grad_norm": 4.25164270401001, + "learning_rate": 8.928778840892975e-05, + "loss": 2.4909, + "step": 7030 + }, + { + "epoch": 0.47193047213180767, + "grad_norm": 4.129549503326416, + "learning_rate": 8.928106503249628e-05, + "loss": 2.7331, + "step": 7032 + }, + { + "epoch": 0.47206469581557664, + "grad_norm": 4.424439907073975, + "learning_rate": 8.927433980010973e-05, + "loss": 2.3614, + "step": 7034 + }, + { + "epoch": 0.47219891949934567, + "grad_norm": 4.904636383056641, + "learning_rate": 8.926761271208785e-05, + "loss": 2.8096, + "step": 7036 + }, + { + "epoch": 0.47233314318311465, + "grad_norm": 8.651076316833496, + "learning_rate": 8.926088376874849e-05, + "loss": 2.7303, + "step": 7038 + }, + { + "epoch": 0.4724673668668837, + "grad_norm": 4.241939544677734, + "learning_rate": 8.925415297040957e-05, + "loss": 2.7252, + "step": 7040 + }, + { + "epoch": 0.47260159055065265, + "grad_norm": 4.663801193237305, + "learning_rate": 8.924742031738911e-05, + "loss": 2.3995, + "step": 7042 + }, + { + "epoch": 0.4727358142344217, + "grad_norm": 4.041371822357178, + "learning_rate": 8.924068581000521e-05, + "loss": 2.4959, + "step": 7044 + }, + { + "epoch": 0.47287003791819066, + "grad_norm": 4.038695812225342, + "learning_rate": 8.923394944857609e-05, + "loss": 2.4274, + "step": 7046 + }, + { + "epoch": 0.4730042616019597, + "grad_norm": 3.9315481185913086, + "learning_rate": 8.922721123341999e-05, + "loss": 2.6692, + "step": 7048 + }, + { + "epoch": 0.47313848528572866, + "grad_norm": 3.900315999984741, + "learning_rate": 8.922047116485532e-05, + "loss": 2.7247, + "step": 7050 + }, + { + "epoch": 0.4732727089694977, + "grad_norm": 4.914180278778076, + "learning_rate": 8.921372924320048e-05, + "loss": 2.6178, + "step": 7052 + }, + { + "epoch": 0.47340693265326667, + "grad_norm": 5.2734761238098145, + "learning_rate": 8.920698546877406e-05, + "loss": 2.5651, + "step": 7054 + }, + { + "epoch": 0.4735411563370357, + "grad_norm": 3.7270963191986084, + "learning_rate": 8.920023984189468e-05, + "loss": 2.6258, + "step": 7056 + }, + { + "epoch": 0.47367538002080467, + "grad_norm": 9.499377250671387, + "learning_rate": 8.919349236288105e-05, + "loss": 2.5663, + "step": 7058 + }, + { + "epoch": 0.47380960370457365, + "grad_norm": 4.34774112701416, + "learning_rate": 8.918674303205197e-05, + "loss": 2.8656, + "step": 7060 + }, + { + "epoch": 0.4739438273883427, + "grad_norm": 4.147979736328125, + "learning_rate": 8.917999184972634e-05, + "loss": 2.6422, + "step": 7062 + }, + { + "epoch": 0.47407805107211165, + "grad_norm": 4.299435615539551, + "learning_rate": 8.917323881622314e-05, + "loss": 2.7793, + "step": 7064 + }, + { + "epoch": 0.4742122747558807, + "grad_norm": 4.994565010070801, + "learning_rate": 8.916648393186143e-05, + "loss": 2.394, + "step": 7066 + }, + { + "epoch": 0.47434649843964966, + "grad_norm": 4.384877681732178, + "learning_rate": 8.915972719696037e-05, + "loss": 2.5494, + "step": 7068 + }, + { + "epoch": 0.4744807221234187, + "grad_norm": 4.041708946228027, + "learning_rate": 8.915296861183923e-05, + "loss": 2.7972, + "step": 7070 + }, + { + "epoch": 0.47461494580718766, + "grad_norm": 4.151302814483643, + "learning_rate": 8.914620817681729e-05, + "loss": 2.7517, + "step": 7072 + }, + { + "epoch": 0.4747491694909567, + "grad_norm": 5.232569694519043, + "learning_rate": 8.9139445892214e-05, + "loss": 2.5117, + "step": 7074 + }, + { + "epoch": 0.47488339317472567, + "grad_norm": 4.84133768081665, + "learning_rate": 8.913268175834886e-05, + "loss": 3.0138, + "step": 7076 + }, + { + "epoch": 0.4750176168584947, + "grad_norm": 4.289444923400879, + "learning_rate": 8.912591577554143e-05, + "loss": 2.5905, + "step": 7078 + }, + { + "epoch": 0.47515184054226367, + "grad_norm": 10.42679214477539, + "learning_rate": 8.911914794411144e-05, + "loss": 2.5108, + "step": 7080 + }, + { + "epoch": 0.4752860642260327, + "grad_norm": 5.153686046600342, + "learning_rate": 8.911237826437865e-05, + "loss": 2.4559, + "step": 7082 + }, + { + "epoch": 0.4754202879098017, + "grad_norm": 4.183183193206787, + "learning_rate": 8.910560673666289e-05, + "loss": 2.83, + "step": 7084 + }, + { + "epoch": 0.4755545115935707, + "grad_norm": 4.56895637512207, + "learning_rate": 8.909883336128408e-05, + "loss": 2.5659, + "step": 7086 + }, + { + "epoch": 0.4756887352773397, + "grad_norm": 4.402456283569336, + "learning_rate": 8.909205813856232e-05, + "loss": 2.4954, + "step": 7088 + }, + { + "epoch": 0.4758229589611087, + "grad_norm": 4.156567573547363, + "learning_rate": 8.908528106881765e-05, + "loss": 2.559, + "step": 7090 + }, + { + "epoch": 0.4759571826448777, + "grad_norm": 4.818496227264404, + "learning_rate": 8.907850215237032e-05, + "loss": 3.0014, + "step": 7092 + }, + { + "epoch": 0.4760914063286467, + "grad_norm": 3.9951303005218506, + "learning_rate": 8.907172138954061e-05, + "loss": 2.4544, + "step": 7094 + }, + { + "epoch": 0.4762256300124157, + "grad_norm": 4.7877936363220215, + "learning_rate": 8.90649387806489e-05, + "loss": 2.5682, + "step": 7096 + }, + { + "epoch": 0.47635985369618467, + "grad_norm": 4.162332057952881, + "learning_rate": 8.905815432601566e-05, + "loss": 2.4665, + "step": 7098 + }, + { + "epoch": 0.4764940773799537, + "grad_norm": 4.497354507446289, + "learning_rate": 8.905136802596142e-05, + "loss": 2.4651, + "step": 7100 + }, + { + "epoch": 0.47662830106372267, + "grad_norm": 4.425973415374756, + "learning_rate": 8.904457988080681e-05, + "loss": 2.6097, + "step": 7102 + }, + { + "epoch": 0.4767625247474917, + "grad_norm": 4.430517196655273, + "learning_rate": 8.90377898908726e-05, + "loss": 2.7147, + "step": 7104 + }, + { + "epoch": 0.4768967484312607, + "grad_norm": 4.83636474609375, + "learning_rate": 8.903099805647959e-05, + "loss": 2.7745, + "step": 7106 + }, + { + "epoch": 0.4770309721150297, + "grad_norm": 4.290573596954346, + "learning_rate": 8.902420437794865e-05, + "loss": 2.5328, + "step": 7108 + }, + { + "epoch": 0.4771651957987987, + "grad_norm": 4.112292766571045, + "learning_rate": 8.901740885560082e-05, + "loss": 2.6319, + "step": 7110 + }, + { + "epoch": 0.4772994194825677, + "grad_norm": 4.194301128387451, + "learning_rate": 8.901061148975711e-05, + "loss": 2.7102, + "step": 7112 + }, + { + "epoch": 0.4774336431663367, + "grad_norm": 3.9754135608673096, + "learning_rate": 8.900381228073875e-05, + "loss": 2.6409, + "step": 7114 + }, + { + "epoch": 0.4775678668501057, + "grad_norm": 4.550083160400391, + "learning_rate": 8.899701122886695e-05, + "loss": 2.7104, + "step": 7116 + }, + { + "epoch": 0.4777020905338747, + "grad_norm": 4.325358867645264, + "learning_rate": 8.899020833446304e-05, + "loss": 2.5454, + "step": 7118 + }, + { + "epoch": 0.4778363142176437, + "grad_norm": 4.065746784210205, + "learning_rate": 8.898340359784847e-05, + "loss": 2.3406, + "step": 7120 + }, + { + "epoch": 0.4779705379014127, + "grad_norm": 4.200448036193848, + "learning_rate": 8.897659701934474e-05, + "loss": 2.6497, + "step": 7122 + }, + { + "epoch": 0.4781047615851817, + "grad_norm": 9.11935043334961, + "learning_rate": 8.896978859927343e-05, + "loss": 2.6637, + "step": 7124 + }, + { + "epoch": 0.4782389852689507, + "grad_norm": 4.336365222930908, + "learning_rate": 8.896297833795625e-05, + "loss": 2.3339, + "step": 7126 + }, + { + "epoch": 0.47837320895271973, + "grad_norm": 5.01307487487793, + "learning_rate": 8.895616623571497e-05, + "loss": 2.6536, + "step": 7128 + }, + { + "epoch": 0.4785074326364887, + "grad_norm": 4.196367263793945, + "learning_rate": 8.894935229287142e-05, + "loss": 2.6863, + "step": 7130 + }, + { + "epoch": 0.47864165632025774, + "grad_norm": 4.020771503448486, + "learning_rate": 8.894253650974757e-05, + "loss": 2.5721, + "step": 7132 + }, + { + "epoch": 0.4787758800040267, + "grad_norm": 4.471967697143555, + "learning_rate": 8.893571888666545e-05, + "loss": 2.5459, + "step": 7134 + }, + { + "epoch": 0.4789101036877957, + "grad_norm": 4.176780700683594, + "learning_rate": 8.892889942394719e-05, + "loss": 2.6522, + "step": 7136 + }, + { + "epoch": 0.4790443273715647, + "grad_norm": 3.639908790588379, + "learning_rate": 8.892207812191497e-05, + "loss": 2.2976, + "step": 7138 + }, + { + "epoch": 0.4791785510553337, + "grad_norm": 4.095739841461182, + "learning_rate": 8.89152549808911e-05, + "loss": 2.6945, + "step": 7140 + }, + { + "epoch": 0.4793127747391027, + "grad_norm": 3.5340707302093506, + "learning_rate": 8.890843000119795e-05, + "loss": 2.5057, + "step": 7142 + }, + { + "epoch": 0.4794469984228717, + "grad_norm": 5.841238021850586, + "learning_rate": 8.890160318315798e-05, + "loss": 2.9293, + "step": 7144 + }, + { + "epoch": 0.4795812221066407, + "grad_norm": 4.015960216522217, + "learning_rate": 8.889477452709378e-05, + "loss": 2.733, + "step": 7146 + }, + { + "epoch": 0.4797154457904097, + "grad_norm": 4.297277927398682, + "learning_rate": 8.888794403332797e-05, + "loss": 2.2863, + "step": 7148 + }, + { + "epoch": 0.47984966947417873, + "grad_norm": 4.9667205810546875, + "learning_rate": 8.888111170218325e-05, + "loss": 2.4524, + "step": 7150 + }, + { + "epoch": 0.4799838931579477, + "grad_norm": 4.611047744750977, + "learning_rate": 8.887427753398248e-05, + "loss": 2.6309, + "step": 7152 + }, + { + "epoch": 0.48011811684171674, + "grad_norm": 4.468472957611084, + "learning_rate": 8.886744152904851e-05, + "loss": 2.5474, + "step": 7154 + }, + { + "epoch": 0.4802523405254857, + "grad_norm": 3.620321273803711, + "learning_rate": 8.886060368770439e-05, + "loss": 2.5933, + "step": 7156 + }, + { + "epoch": 0.48038656420925474, + "grad_norm": 4.279125690460205, + "learning_rate": 8.885376401027315e-05, + "loss": 2.9492, + "step": 7158 + }, + { + "epoch": 0.4805207878930237, + "grad_norm": 4.083330154418945, + "learning_rate": 8.884692249707795e-05, + "loss": 2.5083, + "step": 7160 + }, + { + "epoch": 0.48065501157679275, + "grad_norm": 3.6691131591796875, + "learning_rate": 8.884007914844208e-05, + "loss": 2.6959, + "step": 7162 + }, + { + "epoch": 0.4807892352605617, + "grad_norm": 9.744839668273926, + "learning_rate": 8.883323396468882e-05, + "loss": 2.287, + "step": 7164 + }, + { + "epoch": 0.48092345894433075, + "grad_norm": 4.636609077453613, + "learning_rate": 8.882638694614163e-05, + "loss": 2.4582, + "step": 7166 + }, + { + "epoch": 0.4810576826280997, + "grad_norm": 4.281426906585693, + "learning_rate": 8.8819538093124e-05, + "loss": 2.7384, + "step": 7168 + }, + { + "epoch": 0.48119190631186876, + "grad_norm": 4.758574962615967, + "learning_rate": 8.881268740595954e-05, + "loss": 2.715, + "step": 7170 + }, + { + "epoch": 0.48132612999563773, + "grad_norm": 4.1711039543151855, + "learning_rate": 8.880583488497192e-05, + "loss": 2.559, + "step": 7172 + }, + { + "epoch": 0.4814603536794067, + "grad_norm": 4.605276584625244, + "learning_rate": 8.87989805304849e-05, + "loss": 2.3397, + "step": 7174 + }, + { + "epoch": 0.48159457736317574, + "grad_norm": 4.368406772613525, + "learning_rate": 8.879212434282235e-05, + "loss": 2.6512, + "step": 7176 + }, + { + "epoch": 0.4817288010469447, + "grad_norm": 4.211768627166748, + "learning_rate": 8.878526632230819e-05, + "loss": 2.5264, + "step": 7178 + }, + { + "epoch": 0.48186302473071374, + "grad_norm": 4.499227046966553, + "learning_rate": 8.87784064692665e-05, + "loss": 2.5037, + "step": 7180 + }, + { + "epoch": 0.4819972484144827, + "grad_norm": 3.967230796813965, + "learning_rate": 8.877154478402131e-05, + "loss": 2.4586, + "step": 7182 + }, + { + "epoch": 0.48213147209825175, + "grad_norm": 4.356146335601807, + "learning_rate": 8.876468126689692e-05, + "loss": 2.9337, + "step": 7184 + }, + { + "epoch": 0.4822656957820207, + "grad_norm": 3.6149747371673584, + "learning_rate": 8.875781591821754e-05, + "loss": 2.5838, + "step": 7186 + }, + { + "epoch": 0.48239991946578975, + "grad_norm": 4.355716228485107, + "learning_rate": 8.875094873830758e-05, + "loss": 2.6256, + "step": 7188 + }, + { + "epoch": 0.4825341431495587, + "grad_norm": 4.324185371398926, + "learning_rate": 8.87440797274915e-05, + "loss": 2.7633, + "step": 7190 + }, + { + "epoch": 0.48266836683332776, + "grad_norm": 4.585443019866943, + "learning_rate": 8.873720888609382e-05, + "loss": 2.51, + "step": 7192 + }, + { + "epoch": 0.48280259051709673, + "grad_norm": 5.006647109985352, + "learning_rate": 8.873033621443921e-05, + "loss": 2.6196, + "step": 7194 + }, + { + "epoch": 0.48293681420086576, + "grad_norm": 3.8665144443511963, + "learning_rate": 8.872346171285237e-05, + "loss": 2.4378, + "step": 7196 + }, + { + "epoch": 0.48307103788463474, + "grad_norm": 8.824873924255371, + "learning_rate": 8.871658538165811e-05, + "loss": 2.6473, + "step": 7198 + }, + { + "epoch": 0.48320526156840377, + "grad_norm": 4.950484752655029, + "learning_rate": 8.870970722118132e-05, + "loss": 2.5462, + "step": 7200 + }, + { + "epoch": 0.48333948525217274, + "grad_norm": 4.06249475479126, + "learning_rate": 8.870282723174699e-05, + "loss": 2.3414, + "step": 7202 + }, + { + "epoch": 0.48347370893594177, + "grad_norm": 5.5365424156188965, + "learning_rate": 8.869594541368017e-05, + "loss": 2.5774, + "step": 7204 + }, + { + "epoch": 0.48360793261971075, + "grad_norm": 4.662693023681641, + "learning_rate": 8.868906176730602e-05, + "loss": 2.863, + "step": 7206 + }, + { + "epoch": 0.4837421563034798, + "grad_norm": 4.086627960205078, + "learning_rate": 8.868217629294979e-05, + "loss": 2.5895, + "step": 7208 + }, + { + "epoch": 0.48387637998724875, + "grad_norm": 4.505926609039307, + "learning_rate": 8.86752889909368e-05, + "loss": 2.5329, + "step": 7210 + }, + { + "epoch": 0.4840106036710177, + "grad_norm": 3.620770215988159, + "learning_rate": 8.866839986159244e-05, + "loss": 2.4379, + "step": 7212 + }, + { + "epoch": 0.48414482735478676, + "grad_norm": 4.196841239929199, + "learning_rate": 8.866150890524224e-05, + "loss": 2.5144, + "step": 7214 + }, + { + "epoch": 0.48427905103855573, + "grad_norm": 4.062780857086182, + "learning_rate": 8.865461612221176e-05, + "loss": 2.5825, + "step": 7216 + }, + { + "epoch": 0.48441327472232476, + "grad_norm": 5.748974323272705, + "learning_rate": 8.864772151282668e-05, + "loss": 2.67, + "step": 7218 + }, + { + "epoch": 0.48454749840609374, + "grad_norm": 4.198263645172119, + "learning_rate": 8.864082507741276e-05, + "loss": 2.4246, + "step": 7220 + }, + { + "epoch": 0.48468172208986277, + "grad_norm": 3.8544650077819824, + "learning_rate": 8.863392681629583e-05, + "loss": 2.3023, + "step": 7222 + }, + { + "epoch": 0.48481594577363174, + "grad_norm": 4.148684978485107, + "learning_rate": 8.862702672980185e-05, + "loss": 2.4788, + "step": 7224 + }, + { + "epoch": 0.48495016945740077, + "grad_norm": 4.001841068267822, + "learning_rate": 8.862012481825679e-05, + "loss": 2.4807, + "step": 7226 + }, + { + "epoch": 0.48508439314116975, + "grad_norm": 4.812371253967285, + "learning_rate": 8.861322108198678e-05, + "loss": 2.5021, + "step": 7228 + }, + { + "epoch": 0.4852186168249388, + "grad_norm": 4.368526458740234, + "learning_rate": 8.860631552131801e-05, + "loss": 2.6885, + "step": 7230 + }, + { + "epoch": 0.48535284050870775, + "grad_norm": 4.1001176834106445, + "learning_rate": 8.859940813657675e-05, + "loss": 2.5116, + "step": 7232 + }, + { + "epoch": 0.4854870641924768, + "grad_norm": 4.034219264984131, + "learning_rate": 8.859249892808935e-05, + "loss": 2.7522, + "step": 7234 + }, + { + "epoch": 0.48562128787624576, + "grad_norm": 4.254793167114258, + "learning_rate": 8.858558789618228e-05, + "loss": 2.7122, + "step": 7236 + }, + { + "epoch": 0.4857555115600148, + "grad_norm": 4.518510818481445, + "learning_rate": 8.857867504118204e-05, + "loss": 2.4749, + "step": 7238 + }, + { + "epoch": 0.48588973524378376, + "grad_norm": 3.966928243637085, + "learning_rate": 8.857176036341526e-05, + "loss": 2.4623, + "step": 7240 + }, + { + "epoch": 0.4860239589275528, + "grad_norm": 4.670034408569336, + "learning_rate": 8.856484386320867e-05, + "loss": 2.6908, + "step": 7242 + }, + { + "epoch": 0.48615818261132177, + "grad_norm": 4.137224197387695, + "learning_rate": 8.855792554088903e-05, + "loss": 2.4177, + "step": 7244 + }, + { + "epoch": 0.4862924062950908, + "grad_norm": 3.921180248260498, + "learning_rate": 8.855100539678324e-05, + "loss": 2.6965, + "step": 7246 + }, + { + "epoch": 0.48642662997885977, + "grad_norm": 4.062808990478516, + "learning_rate": 8.854408343121824e-05, + "loss": 2.6392, + "step": 7248 + }, + { + "epoch": 0.48656085366262875, + "grad_norm": 4.586909294128418, + "learning_rate": 8.85371596445211e-05, + "loss": 2.9198, + "step": 7250 + }, + { + "epoch": 0.4866950773463978, + "grad_norm": 4.539356708526611, + "learning_rate": 8.853023403701894e-05, + "loss": 2.877, + "step": 7252 + }, + { + "epoch": 0.48682930103016675, + "grad_norm": 4.287798881530762, + "learning_rate": 8.852330660903899e-05, + "loss": 2.4574, + "step": 7254 + }, + { + "epoch": 0.4869635247139358, + "grad_norm": 4.679342746734619, + "learning_rate": 8.851637736090857e-05, + "loss": 2.649, + "step": 7256 + }, + { + "epoch": 0.48709774839770476, + "grad_norm": 3.814764976501465, + "learning_rate": 8.850944629295503e-05, + "loss": 2.6646, + "step": 7258 + }, + { + "epoch": 0.4872319720814738, + "grad_norm": 4.791277885437012, + "learning_rate": 8.850251340550591e-05, + "loss": 2.6773, + "step": 7260 + }, + { + "epoch": 0.48736619576524276, + "grad_norm": 4.596240997314453, + "learning_rate": 8.849557869888872e-05, + "loss": 2.5976, + "step": 7262 + }, + { + "epoch": 0.4875004194490118, + "grad_norm": 4.075602054595947, + "learning_rate": 8.848864217343114e-05, + "loss": 2.2844, + "step": 7264 + }, + { + "epoch": 0.48763464313278077, + "grad_norm": 4.275961875915527, + "learning_rate": 8.848170382946091e-05, + "loss": 2.4443, + "step": 7266 + }, + { + "epoch": 0.4877688668165498, + "grad_norm": 4.558645248413086, + "learning_rate": 8.847476366730584e-05, + "loss": 2.548, + "step": 7268 + }, + { + "epoch": 0.48790309050031877, + "grad_norm": 3.7694294452667236, + "learning_rate": 8.846782168729384e-05, + "loss": 2.2822, + "step": 7270 + }, + { + "epoch": 0.4880373141840878, + "grad_norm": 4.77918815612793, + "learning_rate": 8.846087788975292e-05, + "loss": 2.8864, + "step": 7272 + }, + { + "epoch": 0.4881715378678568, + "grad_norm": 4.352509021759033, + "learning_rate": 8.845393227501114e-05, + "loss": 2.6307, + "step": 7274 + }, + { + "epoch": 0.4883057615516258, + "grad_norm": 5.291930198669434, + "learning_rate": 8.844698484339668e-05, + "loss": 2.7275, + "step": 7276 + }, + { + "epoch": 0.4884399852353948, + "grad_norm": 4.3260297775268555, + "learning_rate": 8.844003559523779e-05, + "loss": 2.7075, + "step": 7278 + }, + { + "epoch": 0.4885742089191638, + "grad_norm": 4.2062788009643555, + "learning_rate": 8.84330845308628e-05, + "loss": 2.5085, + "step": 7280 + }, + { + "epoch": 0.4887084326029328, + "grad_norm": 4.816370487213135, + "learning_rate": 8.842613165060015e-05, + "loss": 2.627, + "step": 7282 + }, + { + "epoch": 0.4888426562867018, + "grad_norm": 4.675765514373779, + "learning_rate": 8.841917695477835e-05, + "loss": 2.6003, + "step": 7284 + }, + { + "epoch": 0.4889768799704708, + "grad_norm": 4.0935468673706055, + "learning_rate": 8.841222044372597e-05, + "loss": 2.5212, + "step": 7286 + }, + { + "epoch": 0.48911110365423976, + "grad_norm": 3.9285390377044678, + "learning_rate": 8.840526211777172e-05, + "loss": 2.6756, + "step": 7288 + }, + { + "epoch": 0.4892453273380088, + "grad_norm": 4.629647254943848, + "learning_rate": 8.839830197724435e-05, + "loss": 2.313, + "step": 7290 + }, + { + "epoch": 0.48937955102177777, + "grad_norm": 4.355985641479492, + "learning_rate": 8.839134002247272e-05, + "loss": 2.5554, + "step": 7292 + }, + { + "epoch": 0.4895137747055468, + "grad_norm": 3.805363893508911, + "learning_rate": 8.838437625378575e-05, + "loss": 2.5341, + "step": 7294 + }, + { + "epoch": 0.4896479983893158, + "grad_norm": 4.074311256408691, + "learning_rate": 8.83774106715125e-05, + "loss": 2.6239, + "step": 7296 + }, + { + "epoch": 0.4897822220730848, + "grad_norm": 4.8023858070373535, + "learning_rate": 8.837044327598206e-05, + "loss": 2.5575, + "step": 7298 + }, + { + "epoch": 0.4899164457568538, + "grad_norm": 4.360156536102295, + "learning_rate": 8.836347406752363e-05, + "loss": 2.4957, + "step": 7300 + }, + { + "epoch": 0.4900506694406228, + "grad_norm": 5.964948654174805, + "learning_rate": 8.835650304646648e-05, + "loss": 2.6705, + "step": 7302 + }, + { + "epoch": 0.4901848931243918, + "grad_norm": 4.420108795166016, + "learning_rate": 8.834953021313999e-05, + "loss": 2.6595, + "step": 7304 + }, + { + "epoch": 0.4903191168081608, + "grad_norm": 4.500249862670898, + "learning_rate": 8.834255556787361e-05, + "loss": 2.8269, + "step": 7306 + }, + { + "epoch": 0.4904533404919298, + "grad_norm": 4.5006561279296875, + "learning_rate": 8.833557911099688e-05, + "loss": 3.0141, + "step": 7308 + }, + { + "epoch": 0.4905875641756988, + "grad_norm": 3.531846523284912, + "learning_rate": 8.832860084283942e-05, + "loss": 2.3614, + "step": 7310 + }, + { + "epoch": 0.4907217878594678, + "grad_norm": 4.034825801849365, + "learning_rate": 8.832162076373094e-05, + "loss": 2.5797, + "step": 7312 + }, + { + "epoch": 0.4908560115432368, + "grad_norm": 3.82961368560791, + "learning_rate": 8.831463887400122e-05, + "loss": 2.4478, + "step": 7314 + }, + { + "epoch": 0.4909902352270058, + "grad_norm": 4.268989562988281, + "learning_rate": 8.830765517398017e-05, + "loss": 2.5791, + "step": 7316 + }, + { + "epoch": 0.49112445891077483, + "grad_norm": 4.757140636444092, + "learning_rate": 8.830066966399775e-05, + "loss": 2.8206, + "step": 7318 + }, + { + "epoch": 0.4912586825945438, + "grad_norm": 4.309196472167969, + "learning_rate": 8.829368234438397e-05, + "loss": 2.3189, + "step": 7320 + }, + { + "epoch": 0.49139290627831284, + "grad_norm": 6.056865215301514, + "learning_rate": 8.828669321546902e-05, + "loss": 2.61, + "step": 7322 + }, + { + "epoch": 0.4915271299620818, + "grad_norm": 4.676591873168945, + "learning_rate": 8.827970227758313e-05, + "loss": 2.8818, + "step": 7324 + }, + { + "epoch": 0.4916613536458508, + "grad_norm": 4.237356185913086, + "learning_rate": 8.827270953105655e-05, + "loss": 2.8814, + "step": 7326 + }, + { + "epoch": 0.4917955773296198, + "grad_norm": 4.38384485244751, + "learning_rate": 8.826571497621972e-05, + "loss": 2.6045, + "step": 7328 + }, + { + "epoch": 0.4919298010133888, + "grad_norm": 3.7271828651428223, + "learning_rate": 8.825871861340308e-05, + "loss": 2.3893, + "step": 7330 + }, + { + "epoch": 0.4920640246971578, + "grad_norm": 5.184826374053955, + "learning_rate": 8.825172044293725e-05, + "loss": 2.6662, + "step": 7332 + }, + { + "epoch": 0.4921982483809268, + "grad_norm": 4.592067241668701, + "learning_rate": 8.824472046515283e-05, + "loss": 2.6992, + "step": 7334 + }, + { + "epoch": 0.4923324720646958, + "grad_norm": 4.587413311004639, + "learning_rate": 8.823771868038058e-05, + "loss": 2.6919, + "step": 7336 + }, + { + "epoch": 0.4924666957484648, + "grad_norm": 4.302164077758789, + "learning_rate": 8.823071508895131e-05, + "loss": 2.3992, + "step": 7338 + }, + { + "epoch": 0.49260091943223383, + "grad_norm": 6.215792655944824, + "learning_rate": 8.822370969119592e-05, + "loss": 2.7843, + "step": 7340 + }, + { + "epoch": 0.4927351431160028, + "grad_norm": 4.385227203369141, + "learning_rate": 8.821670248744542e-05, + "loss": 2.5618, + "step": 7342 + }, + { + "epoch": 0.49286936679977184, + "grad_norm": 4.651440620422363, + "learning_rate": 8.820969347803088e-05, + "loss": 2.9295, + "step": 7344 + }, + { + "epoch": 0.4930035904835408, + "grad_norm": 3.9608006477355957, + "learning_rate": 8.820268266328345e-05, + "loss": 2.6823, + "step": 7346 + }, + { + "epoch": 0.49313781416730984, + "grad_norm": 4.744499683380127, + "learning_rate": 8.81956700435344e-05, + "loss": 2.6736, + "step": 7348 + }, + { + "epoch": 0.4932720378510788, + "grad_norm": 5.9010844230651855, + "learning_rate": 8.818865561911504e-05, + "loss": 2.4995, + "step": 7350 + }, + { + "epoch": 0.49340626153484785, + "grad_norm": 4.187682628631592, + "learning_rate": 8.81816393903568e-05, + "loss": 2.8577, + "step": 7352 + }, + { + "epoch": 0.4935404852186168, + "grad_norm": 4.1631269454956055, + "learning_rate": 8.817462135759117e-05, + "loss": 2.5869, + "step": 7354 + }, + { + "epoch": 0.49367470890238585, + "grad_norm": 4.486316680908203, + "learning_rate": 8.816760152114976e-05, + "loss": 2.728, + "step": 7356 + }, + { + "epoch": 0.4938089325861548, + "grad_norm": 4.365041255950928, + "learning_rate": 8.816057988136422e-05, + "loss": 2.4348, + "step": 7358 + }, + { + "epoch": 0.49394315626992386, + "grad_norm": 4.537342548370361, + "learning_rate": 8.815355643856633e-05, + "loss": 2.7402, + "step": 7360 + }, + { + "epoch": 0.49407737995369283, + "grad_norm": 3.941340208053589, + "learning_rate": 8.814653119308794e-05, + "loss": 2.5429, + "step": 7362 + }, + { + "epoch": 0.4942116036374618, + "grad_norm": 5.055961608886719, + "learning_rate": 8.813950414526093e-05, + "loss": 2.7675, + "step": 7364 + }, + { + "epoch": 0.49434582732123084, + "grad_norm": 3.6938276290893555, + "learning_rate": 8.813247529541737e-05, + "loss": 2.5853, + "step": 7366 + }, + { + "epoch": 0.4944800510049998, + "grad_norm": 4.35965633392334, + "learning_rate": 8.812544464388932e-05, + "loss": 3.0063, + "step": 7368 + }, + { + "epoch": 0.49461427468876884, + "grad_norm": 3.495882034301758, + "learning_rate": 8.8118412191009e-05, + "loss": 2.3619, + "step": 7370 + }, + { + "epoch": 0.4947484983725378, + "grad_norm": 3.9517219066619873, + "learning_rate": 8.811137793710863e-05, + "loss": 2.4843, + "step": 7372 + }, + { + "epoch": 0.49488272205630685, + "grad_norm": 4.577459335327148, + "learning_rate": 8.810434188252063e-05, + "loss": 3.0213, + "step": 7374 + }, + { + "epoch": 0.4950169457400758, + "grad_norm": 4.833443641662598, + "learning_rate": 8.809730402757739e-05, + "loss": 2.4209, + "step": 7376 + }, + { + "epoch": 0.49515116942384485, + "grad_norm": 4.76648473739624, + "learning_rate": 8.809026437261145e-05, + "loss": 2.5358, + "step": 7378 + }, + { + "epoch": 0.4952853931076138, + "grad_norm": 4.333948135375977, + "learning_rate": 8.80832229179554e-05, + "loss": 2.5925, + "step": 7380 + }, + { + "epoch": 0.49541961679138286, + "grad_norm": 4.8047685623168945, + "learning_rate": 8.8076179663942e-05, + "loss": 2.6311, + "step": 7382 + }, + { + "epoch": 0.49555384047515183, + "grad_norm": 4.282135963439941, + "learning_rate": 8.806913461090395e-05, + "loss": 2.6859, + "step": 7384 + }, + { + "epoch": 0.49568806415892086, + "grad_norm": 4.136877536773682, + "learning_rate": 8.806208775917417e-05, + "loss": 2.6077, + "step": 7386 + }, + { + "epoch": 0.49582228784268984, + "grad_norm": 8.161808967590332, + "learning_rate": 8.805503910908557e-05, + "loss": 2.6385, + "step": 7388 + }, + { + "epoch": 0.49595651152645887, + "grad_norm": 5.1308417320251465, + "learning_rate": 8.804798866097121e-05, + "loss": 2.5717, + "step": 7390 + }, + { + "epoch": 0.49609073521022784, + "grad_norm": 3.809506416320801, + "learning_rate": 8.80409364151642e-05, + "loss": 2.7605, + "step": 7392 + }, + { + "epoch": 0.49622495889399687, + "grad_norm": 5.569212913513184, + "learning_rate": 8.803388237199776e-05, + "loss": 2.8064, + "step": 7394 + }, + { + "epoch": 0.49635918257776585, + "grad_norm": 3.905700922012329, + "learning_rate": 8.802682653180516e-05, + "loss": 2.2991, + "step": 7396 + }, + { + "epoch": 0.4964934062615349, + "grad_norm": 4.65444278717041, + "learning_rate": 8.801976889491979e-05, + "loss": 2.6327, + "step": 7398 + }, + { + "epoch": 0.49662762994530385, + "grad_norm": 4.846920967102051, + "learning_rate": 8.80127094616751e-05, + "loss": 2.6678, + "step": 7400 + }, + { + "epoch": 0.4967618536290728, + "grad_norm": 4.1511430740356445, + "learning_rate": 8.800564823240464e-05, + "loss": 2.602, + "step": 7402 + }, + { + "epoch": 0.49689607731284186, + "grad_norm": 8.277396202087402, + "learning_rate": 8.799858520744201e-05, + "loss": 2.4806, + "step": 7404 + }, + { + "epoch": 0.49703030099661083, + "grad_norm": 4.194830894470215, + "learning_rate": 8.799152038712099e-05, + "loss": 2.3866, + "step": 7406 + }, + { + "epoch": 0.49716452468037986, + "grad_norm": 5.199011325836182, + "learning_rate": 8.798445377177531e-05, + "loss": 2.5777, + "step": 7408 + }, + { + "epoch": 0.49729874836414883, + "grad_norm": 4.007650852203369, + "learning_rate": 8.79773853617389e-05, + "loss": 2.4144, + "step": 7410 + }, + { + "epoch": 0.49743297204791787, + "grad_norm": 4.554393291473389, + "learning_rate": 8.797031515734571e-05, + "loss": 2.6692, + "step": 7412 + }, + { + "epoch": 0.49756719573168684, + "grad_norm": 6.9764909744262695, + "learning_rate": 8.796324315892978e-05, + "loss": 2.4622, + "step": 7414 + }, + { + "epoch": 0.49770141941545587, + "grad_norm": 4.2294087409973145, + "learning_rate": 8.795616936682528e-05, + "loss": 2.4618, + "step": 7416 + }, + { + "epoch": 0.49783564309922484, + "grad_norm": 4.353862762451172, + "learning_rate": 8.794909378136639e-05, + "loss": 2.599, + "step": 7418 + }, + { + "epoch": 0.4979698667829939, + "grad_norm": 3.8220903873443604, + "learning_rate": 8.794201640288746e-05, + "loss": 2.5144, + "step": 7420 + }, + { + "epoch": 0.49810409046676285, + "grad_norm": 4.469327926635742, + "learning_rate": 8.793493723172286e-05, + "loss": 2.4891, + "step": 7422 + }, + { + "epoch": 0.4982383141505319, + "grad_norm": 3.768348455429077, + "learning_rate": 8.792785626820708e-05, + "loss": 2.6009, + "step": 7424 + }, + { + "epoch": 0.49837253783430085, + "grad_norm": 4.019043922424316, + "learning_rate": 8.792077351267466e-05, + "loss": 2.78, + "step": 7426 + }, + { + "epoch": 0.4985067615180699, + "grad_norm": 5.577075004577637, + "learning_rate": 8.791368896546027e-05, + "loss": 2.7303, + "step": 7428 + }, + { + "epoch": 0.49864098520183886, + "grad_norm": 5.177112102508545, + "learning_rate": 8.790660262689863e-05, + "loss": 3.1277, + "step": 7430 + }, + { + "epoch": 0.4987752088856079, + "grad_norm": 4.220139026641846, + "learning_rate": 8.789951449732454e-05, + "loss": 2.6234, + "step": 7432 + }, + { + "epoch": 0.49890943256937687, + "grad_norm": 4.71269416809082, + "learning_rate": 8.789242457707294e-05, + "loss": 2.6911, + "step": 7434 + }, + { + "epoch": 0.4990436562531459, + "grad_norm": 3.9555888175964355, + "learning_rate": 8.788533286647878e-05, + "loss": 2.3261, + "step": 7436 + }, + { + "epoch": 0.49917787993691487, + "grad_norm": 4.8989152908325195, + "learning_rate": 8.787823936587714e-05, + "loss": 2.6015, + "step": 7438 + }, + { + "epoch": 0.49931210362068384, + "grad_norm": 4.181760311126709, + "learning_rate": 8.787114407560317e-05, + "loss": 2.5427, + "step": 7440 + }, + { + "epoch": 0.4994463273044529, + "grad_norm": 4.241029262542725, + "learning_rate": 8.786404699599211e-05, + "loss": 2.8278, + "step": 7442 + }, + { + "epoch": 0.49958055098822185, + "grad_norm": 4.158200740814209, + "learning_rate": 8.78569481273793e-05, + "loss": 2.7832, + "step": 7444 + }, + { + "epoch": 0.4997147746719909, + "grad_norm": 4.230244159698486, + "learning_rate": 8.784984747010012e-05, + "loss": 2.6833, + "step": 7446 + }, + { + "epoch": 0.49984899835575985, + "grad_norm": 5.388680934906006, + "learning_rate": 8.784274502449009e-05, + "loss": 2.6264, + "step": 7448 + }, + { + "epoch": 0.4999832220395289, + "grad_norm": 4.317330360412598, + "learning_rate": 8.783564079088477e-05, + "loss": 2.4306, + "step": 7450 + }, + { + "epoch": 0.5001174457232979, + "grad_norm": 4.286252021789551, + "learning_rate": 8.782853476961981e-05, + "loss": 2.4738, + "step": 7452 + }, + { + "epoch": 0.5002516694070669, + "grad_norm": 4.267755508422852, + "learning_rate": 8.782142696103098e-05, + "loss": 2.7039, + "step": 7454 + }, + { + "epoch": 0.5003858930908359, + "grad_norm": 11.01051139831543, + "learning_rate": 8.781431736545409e-05, + "loss": 2.4731, + "step": 7456 + }, + { + "epoch": 0.5005201167746048, + "grad_norm": 3.8400440216064453, + "learning_rate": 8.780720598322507e-05, + "loss": 2.4483, + "step": 7458 + }, + { + "epoch": 0.5006543404583739, + "grad_norm": 5.083245754241943, + "learning_rate": 8.780009281467991e-05, + "loss": 2.6762, + "step": 7460 + }, + { + "epoch": 0.5007885641421429, + "grad_norm": 5.250662803649902, + "learning_rate": 8.77929778601547e-05, + "loss": 2.5807, + "step": 7462 + }, + { + "epoch": 0.5009227878259119, + "grad_norm": 5.7152252197265625, + "learning_rate": 8.778586111998561e-05, + "loss": 2.3978, + "step": 7464 + }, + { + "epoch": 0.5010570115096808, + "grad_norm": 4.1584696769714355, + "learning_rate": 8.777874259450887e-05, + "loss": 2.5731, + "step": 7466 + }, + { + "epoch": 0.5011912351934499, + "grad_norm": 3.997542142868042, + "learning_rate": 8.777162228406083e-05, + "loss": 2.4529, + "step": 7468 + }, + { + "epoch": 0.5013254588772189, + "grad_norm": 4.072284698486328, + "learning_rate": 8.776450018897792e-05, + "loss": 2.6079, + "step": 7470 + }, + { + "epoch": 0.5014596825609879, + "grad_norm": 5.22924280166626, + "learning_rate": 8.775737630959662e-05, + "loss": 2.5828, + "step": 7472 + }, + { + "epoch": 0.5015939062447569, + "grad_norm": 4.336501598358154, + "learning_rate": 8.775025064625355e-05, + "loss": 2.5567, + "step": 7474 + }, + { + "epoch": 0.5017281299285259, + "grad_norm": 4.498725414276123, + "learning_rate": 8.774312319928537e-05, + "loss": 2.6124, + "step": 7476 + }, + { + "epoch": 0.5018623536122949, + "grad_norm": 4.192162036895752, + "learning_rate": 8.773599396902886e-05, + "loss": 2.5602, + "step": 7478 + }, + { + "epoch": 0.5019965772960638, + "grad_norm": 3.5663812160491943, + "learning_rate": 8.772886295582079e-05, + "loss": 2.2215, + "step": 7480 + }, + { + "epoch": 0.5021308009798329, + "grad_norm": 3.8979668617248535, + "learning_rate": 8.772173015999816e-05, + "loss": 2.3116, + "step": 7482 + }, + { + "epoch": 0.5022650246636019, + "grad_norm": 3.70542049407959, + "learning_rate": 8.771459558189796e-05, + "loss": 2.4325, + "step": 7484 + }, + { + "epoch": 0.5023992483473709, + "grad_norm": 5.162866115570068, + "learning_rate": 8.770745922185728e-05, + "loss": 2.5629, + "step": 7486 + }, + { + "epoch": 0.5025334720311398, + "grad_norm": 4.339237213134766, + "learning_rate": 8.770032108021331e-05, + "loss": 2.6591, + "step": 7488 + }, + { + "epoch": 0.5026676957149089, + "grad_norm": 5.1962666511535645, + "learning_rate": 8.76931811573033e-05, + "loss": 2.6903, + "step": 7490 + }, + { + "epoch": 0.5028019193986779, + "grad_norm": 3.8429229259490967, + "learning_rate": 8.768603945346458e-05, + "loss": 2.5508, + "step": 7492 + }, + { + "epoch": 0.5029361430824469, + "grad_norm": 4.64263916015625, + "learning_rate": 8.767889596903462e-05, + "loss": 2.7528, + "step": 7494 + }, + { + "epoch": 0.5030703667662159, + "grad_norm": 4.290610313415527, + "learning_rate": 8.767175070435092e-05, + "loss": 2.5088, + "step": 7496 + }, + { + "epoch": 0.5032045904499849, + "grad_norm": 3.783186912536621, + "learning_rate": 8.766460365975107e-05, + "loss": 2.3388, + "step": 7498 + }, + { + "epoch": 0.5033388141337539, + "grad_norm": 4.33715295791626, + "learning_rate": 8.765745483557276e-05, + "loss": 2.6039, + "step": 7500 + }, + { + "epoch": 0.503473037817523, + "grad_norm": 4.383300304412842, + "learning_rate": 8.765030423215377e-05, + "loss": 2.4584, + "step": 7502 + }, + { + "epoch": 0.5036072615012919, + "grad_norm": 4.270421981811523, + "learning_rate": 8.764315184983193e-05, + "loss": 2.6612, + "step": 7504 + }, + { + "epoch": 0.5037414851850609, + "grad_norm": 5.638134479522705, + "learning_rate": 8.76359976889452e-05, + "loss": 2.7484, + "step": 7506 + }, + { + "epoch": 0.5038757088688299, + "grad_norm": 4.157181739807129, + "learning_rate": 8.762884174983158e-05, + "loss": 2.6454, + "step": 7508 + }, + { + "epoch": 0.504009932552599, + "grad_norm": 5.201225280761719, + "learning_rate": 8.762168403282917e-05, + "loss": 2.858, + "step": 7510 + }, + { + "epoch": 0.5041441562363679, + "grad_norm": 5.3665642738342285, + "learning_rate": 8.761452453827618e-05, + "loss": 2.6975, + "step": 7512 + }, + { + "epoch": 0.5042783799201369, + "grad_norm": 3.9251575469970703, + "learning_rate": 8.760736326651087e-05, + "loss": 2.4424, + "step": 7514 + }, + { + "epoch": 0.5044126036039059, + "grad_norm": 3.6599321365356445, + "learning_rate": 8.760020021787158e-05, + "loss": 2.4696, + "step": 7516 + }, + { + "epoch": 0.5045468272876749, + "grad_norm": 4.856175899505615, + "learning_rate": 8.759303539269679e-05, + "loss": 2.5651, + "step": 7518 + }, + { + "epoch": 0.5046810509714439, + "grad_norm": 4.286593914031982, + "learning_rate": 8.7585868791325e-05, + "loss": 2.604, + "step": 7520 + }, + { + "epoch": 0.5048152746552129, + "grad_norm": 3.9354991912841797, + "learning_rate": 8.757870041409481e-05, + "loss": 2.5533, + "step": 7522 + }, + { + "epoch": 0.504949498338982, + "grad_norm": 4.769345760345459, + "learning_rate": 8.757153026134493e-05, + "loss": 2.6625, + "step": 7524 + }, + { + "epoch": 0.5050837220227509, + "grad_norm": 4.495894908905029, + "learning_rate": 8.756435833341412e-05, + "loss": 2.9584, + "step": 7526 + }, + { + "epoch": 0.5052179457065199, + "grad_norm": 4.759127616882324, + "learning_rate": 8.755718463064125e-05, + "loss": 2.394, + "step": 7528 + }, + { + "epoch": 0.5053521693902889, + "grad_norm": 4.178998947143555, + "learning_rate": 8.755000915336527e-05, + "loss": 2.628, + "step": 7530 + }, + { + "epoch": 0.505486393074058, + "grad_norm": 5.259588241577148, + "learning_rate": 8.754283190192516e-05, + "loss": 2.4386, + "step": 7532 + }, + { + "epoch": 0.5056206167578269, + "grad_norm": 4.368453025817871, + "learning_rate": 8.75356528766601e-05, + "loss": 2.4249, + "step": 7534 + }, + { + "epoch": 0.5057548404415959, + "grad_norm": 4.659669876098633, + "learning_rate": 8.752847207790924e-05, + "loss": 2.7448, + "step": 7536 + }, + { + "epoch": 0.5058890641253649, + "grad_norm": 4.26934814453125, + "learning_rate": 8.752128950601189e-05, + "loss": 2.4918, + "step": 7538 + }, + { + "epoch": 0.506023287809134, + "grad_norm": 5.034293174743652, + "learning_rate": 8.751410516130739e-05, + "loss": 2.3753, + "step": 7540 + }, + { + "epoch": 0.5061575114929029, + "grad_norm": 4.237463474273682, + "learning_rate": 8.750691904413515e-05, + "loss": 2.8974, + "step": 7542 + }, + { + "epoch": 0.5062917351766719, + "grad_norm": 4.295358180999756, + "learning_rate": 8.749973115483478e-05, + "loss": 2.6244, + "step": 7544 + }, + { + "epoch": 0.506425958860441, + "grad_norm": 4.369450092315674, + "learning_rate": 8.749254149374583e-05, + "loss": 2.8105, + "step": 7546 + }, + { + "epoch": 0.50656018254421, + "grad_norm": 4.1012349128723145, + "learning_rate": 8.748535006120804e-05, + "loss": 2.6226, + "step": 7548 + }, + { + "epoch": 0.5066944062279789, + "grad_norm": 4.369060039520264, + "learning_rate": 8.747815685756115e-05, + "loss": 2.4882, + "step": 7550 + }, + { + "epoch": 0.5068286299117479, + "grad_norm": 4.100227355957031, + "learning_rate": 8.747096188314506e-05, + "loss": 2.6042, + "step": 7552 + }, + { + "epoch": 0.506962853595517, + "grad_norm": 4.369357585906982, + "learning_rate": 8.74637651382997e-05, + "loss": 2.7427, + "step": 7554 + }, + { + "epoch": 0.5070970772792859, + "grad_norm": 4.246026515960693, + "learning_rate": 8.745656662336511e-05, + "loss": 2.4502, + "step": 7556 + }, + { + "epoch": 0.5072313009630549, + "grad_norm": 4.855943202972412, + "learning_rate": 8.744936633868139e-05, + "loss": 2.663, + "step": 7558 + }, + { + "epoch": 0.5073655246468239, + "grad_norm": 4.0846028327941895, + "learning_rate": 8.744216428458878e-05, + "loss": 2.6357, + "step": 7560 + }, + { + "epoch": 0.507499748330593, + "grad_norm": 4.579471111297607, + "learning_rate": 8.743496046142752e-05, + "loss": 2.5816, + "step": 7562 + }, + { + "epoch": 0.5076339720143619, + "grad_norm": 4.676353454589844, + "learning_rate": 8.742775486953799e-05, + "loss": 2.3426, + "step": 7564 + }, + { + "epoch": 0.5077681956981309, + "grad_norm": 5.120142459869385, + "learning_rate": 8.742054750926063e-05, + "loss": 2.7192, + "step": 7566 + }, + { + "epoch": 0.5079024193819, + "grad_norm": 4.468045234680176, + "learning_rate": 8.7413338380936e-05, + "loss": 2.9643, + "step": 7568 + }, + { + "epoch": 0.508036643065669, + "grad_norm": 4.359709739685059, + "learning_rate": 8.740612748490468e-05, + "loss": 2.7799, + "step": 7570 + }, + { + "epoch": 0.5081708667494379, + "grad_norm": 4.042903900146484, + "learning_rate": 8.739891482150741e-05, + "loss": 2.4568, + "step": 7572 + }, + { + "epoch": 0.5083050904332069, + "grad_norm": 4.3763837814331055, + "learning_rate": 8.739170039108495e-05, + "loss": 2.6907, + "step": 7574 + }, + { + "epoch": 0.508439314116976, + "grad_norm": 4.92977237701416, + "learning_rate": 8.738448419397818e-05, + "loss": 2.7495, + "step": 7576 + }, + { + "epoch": 0.508573537800745, + "grad_norm": 3.997642993927002, + "learning_rate": 8.737726623052805e-05, + "loss": 2.5555, + "step": 7578 + }, + { + "epoch": 0.5087077614845139, + "grad_norm": 4.705076694488525, + "learning_rate": 8.737004650107557e-05, + "loss": 2.5254, + "step": 7580 + }, + { + "epoch": 0.5088419851682829, + "grad_norm": 4.045025825500488, + "learning_rate": 8.736282500596188e-05, + "loss": 2.551, + "step": 7582 + }, + { + "epoch": 0.508976208852052, + "grad_norm": 4.482827186584473, + "learning_rate": 8.735560174552821e-05, + "loss": 2.8424, + "step": 7584 + }, + { + "epoch": 0.509110432535821, + "grad_norm": 4.296957492828369, + "learning_rate": 8.73483767201158e-05, + "loss": 2.9321, + "step": 7586 + }, + { + "epoch": 0.5092446562195899, + "grad_norm": 6.290749549865723, + "learning_rate": 8.734114993006602e-05, + "loss": 2.6032, + "step": 7588 + }, + { + "epoch": 0.509378879903359, + "grad_norm": 4.689025402069092, + "learning_rate": 8.733392137572033e-05, + "loss": 2.519, + "step": 7590 + }, + { + "epoch": 0.509513103587128, + "grad_norm": 4.394416809082031, + "learning_rate": 8.732669105742028e-05, + "loss": 2.6412, + "step": 7592 + }, + { + "epoch": 0.5096473272708969, + "grad_norm": 4.349326133728027, + "learning_rate": 8.731945897550748e-05, + "loss": 2.6666, + "step": 7594 + }, + { + "epoch": 0.5097815509546659, + "grad_norm": 4.277975082397461, + "learning_rate": 8.731222513032362e-05, + "loss": 2.4098, + "step": 7596 + }, + { + "epoch": 0.509915774638435, + "grad_norm": 4.268492221832275, + "learning_rate": 8.730498952221051e-05, + "loss": 2.6572, + "step": 7598 + }, + { + "epoch": 0.510049998322204, + "grad_norm": 4.008342742919922, + "learning_rate": 8.729775215151e-05, + "loss": 2.45, + "step": 7600 + }, + { + "epoch": 0.5101842220059729, + "grad_norm": 4.441450119018555, + "learning_rate": 8.729051301856402e-05, + "loss": 2.3465, + "step": 7602 + }, + { + "epoch": 0.5103184456897419, + "grad_norm": 4.787481307983398, + "learning_rate": 8.728327212371466e-05, + "loss": 2.7769, + "step": 7604 + }, + { + "epoch": 0.510452669373511, + "grad_norm": 4.317994594573975, + "learning_rate": 8.727602946730397e-05, + "loss": 2.4682, + "step": 7606 + }, + { + "epoch": 0.51058689305728, + "grad_norm": 5.2388811111450195, + "learning_rate": 8.726878504967421e-05, + "loss": 2.6553, + "step": 7608 + }, + { + "epoch": 0.5107211167410489, + "grad_norm": 3.8276124000549316, + "learning_rate": 8.726153887116766e-05, + "loss": 2.5801, + "step": 7610 + }, + { + "epoch": 0.510855340424818, + "grad_norm": 4.252018928527832, + "learning_rate": 8.725429093212664e-05, + "loss": 2.4858, + "step": 7612 + }, + { + "epoch": 0.510989564108587, + "grad_norm": 4.37074089050293, + "learning_rate": 8.724704123289365e-05, + "loss": 2.5736, + "step": 7614 + }, + { + "epoch": 0.511123787792356, + "grad_norm": 3.892350435256958, + "learning_rate": 8.723978977381119e-05, + "loss": 2.4296, + "step": 7616 + }, + { + "epoch": 0.5112580114761249, + "grad_norm": 4.678012371063232, + "learning_rate": 8.72325365552219e-05, + "loss": 2.7042, + "step": 7618 + }, + { + "epoch": 0.511392235159894, + "grad_norm": 4.389869213104248, + "learning_rate": 8.722528157746847e-05, + "loss": 2.6639, + "step": 7620 + }, + { + "epoch": 0.511526458843663, + "grad_norm": 5.003087043762207, + "learning_rate": 8.721802484089368e-05, + "loss": 2.7067, + "step": 7622 + }, + { + "epoch": 0.511660682527432, + "grad_norm": 5.437467575073242, + "learning_rate": 8.721076634584042e-05, + "loss": 2.5388, + "step": 7624 + }, + { + "epoch": 0.5117949062112009, + "grad_norm": 4.276947975158691, + "learning_rate": 8.72035060926516e-05, + "loss": 2.459, + "step": 7626 + }, + { + "epoch": 0.51192912989497, + "grad_norm": 3.9904770851135254, + "learning_rate": 8.719624408167029e-05, + "loss": 2.3909, + "step": 7628 + }, + { + "epoch": 0.512063353578739, + "grad_norm": 3.91359806060791, + "learning_rate": 8.71889803132396e-05, + "loss": 2.5954, + "step": 7630 + }, + { + "epoch": 0.5121975772625079, + "grad_norm": 3.795306444168091, + "learning_rate": 8.718171478770269e-05, + "loss": 2.6696, + "step": 7632 + }, + { + "epoch": 0.512331800946277, + "grad_norm": 4.307861328125, + "learning_rate": 8.71744475054029e-05, + "loss": 2.759, + "step": 7634 + }, + { + "epoch": 0.512466024630046, + "grad_norm": 4.762552738189697, + "learning_rate": 8.716717846668354e-05, + "loss": 2.4894, + "step": 7636 + }, + { + "epoch": 0.512600248313815, + "grad_norm": 4.160583019256592, + "learning_rate": 8.715990767188811e-05, + "loss": 2.4493, + "step": 7638 + }, + { + "epoch": 0.5127344719975839, + "grad_norm": 4.169168949127197, + "learning_rate": 8.71526351213601e-05, + "loss": 2.5601, + "step": 7640 + }, + { + "epoch": 0.512868695681353, + "grad_norm": 4.148977279663086, + "learning_rate": 8.714536081544314e-05, + "loss": 2.3593, + "step": 7642 + }, + { + "epoch": 0.513002919365122, + "grad_norm": 4.210081100463867, + "learning_rate": 8.71380847544809e-05, + "loss": 2.5531, + "step": 7644 + }, + { + "epoch": 0.513137143048891, + "grad_norm": 4.51228141784668, + "learning_rate": 8.713080693881723e-05, + "loss": 2.6426, + "step": 7646 + }, + { + "epoch": 0.5132713667326599, + "grad_norm": 4.383159637451172, + "learning_rate": 8.712352736879592e-05, + "loss": 2.8626, + "step": 7648 + }, + { + "epoch": 0.513405590416429, + "grad_norm": 3.8582780361175537, + "learning_rate": 8.711624604476094e-05, + "loss": 2.6985, + "step": 7650 + }, + { + "epoch": 0.513539814100198, + "grad_norm": 5.451298236846924, + "learning_rate": 8.710896296705634e-05, + "loss": 2.5685, + "step": 7652 + }, + { + "epoch": 0.513674037783967, + "grad_norm": 4.7417683601379395, + "learning_rate": 8.710167813602619e-05, + "loss": 2.7466, + "step": 7654 + }, + { + "epoch": 0.513808261467736, + "grad_norm": 3.7915444374084473, + "learning_rate": 8.70943915520147e-05, + "loss": 2.3706, + "step": 7656 + }, + { + "epoch": 0.513942485151505, + "grad_norm": 4.156683444976807, + "learning_rate": 8.708710321536617e-05, + "loss": 2.3886, + "step": 7658 + }, + { + "epoch": 0.514076708835274, + "grad_norm": 4.2623982429504395, + "learning_rate": 8.707981312642494e-05, + "loss": 2.5911, + "step": 7660 + }, + { + "epoch": 0.514210932519043, + "grad_norm": 3.970834493637085, + "learning_rate": 8.707252128553544e-05, + "loss": 2.4906, + "step": 7662 + }, + { + "epoch": 0.514345156202812, + "grad_norm": 6.389592170715332, + "learning_rate": 8.706522769304224e-05, + "loss": 2.5142, + "step": 7664 + }, + { + "epoch": 0.514479379886581, + "grad_norm": 4.095816612243652, + "learning_rate": 8.70579323492899e-05, + "loss": 2.5821, + "step": 7666 + }, + { + "epoch": 0.51461360357035, + "grad_norm": 5.122433662414551, + "learning_rate": 8.705063525462312e-05, + "loss": 2.3988, + "step": 7668 + }, + { + "epoch": 0.5147478272541189, + "grad_norm": 6.592900276184082, + "learning_rate": 8.704333640938669e-05, + "loss": 2.5171, + "step": 7670 + }, + { + "epoch": 0.514882050937888, + "grad_norm": 4.208958625793457, + "learning_rate": 8.703603581392546e-05, + "loss": 2.6358, + "step": 7672 + }, + { + "epoch": 0.515016274621657, + "grad_norm": 9.402848243713379, + "learning_rate": 8.702873346858434e-05, + "loss": 2.5366, + "step": 7674 + }, + { + "epoch": 0.515150498305426, + "grad_norm": 3.7950775623321533, + "learning_rate": 8.70214293737084e-05, + "loss": 2.6058, + "step": 7676 + }, + { + "epoch": 0.515284721989195, + "grad_norm": 4.716952323913574, + "learning_rate": 8.701412352964274e-05, + "loss": 2.6619, + "step": 7678 + }, + { + "epoch": 0.515418945672964, + "grad_norm": 5.911996364593506, + "learning_rate": 8.70068159367325e-05, + "loss": 2.574, + "step": 7680 + }, + { + "epoch": 0.515553169356733, + "grad_norm": 4.496488094329834, + "learning_rate": 8.699950659532298e-05, + "loss": 2.5985, + "step": 7682 + }, + { + "epoch": 0.515687393040502, + "grad_norm": 4.189420223236084, + "learning_rate": 8.699219550575953e-05, + "loss": 2.4226, + "step": 7684 + }, + { + "epoch": 0.515821616724271, + "grad_norm": 4.857367992401123, + "learning_rate": 8.698488266838759e-05, + "loss": 2.4796, + "step": 7686 + }, + { + "epoch": 0.51595584040804, + "grad_norm": 4.714810371398926, + "learning_rate": 8.697756808355267e-05, + "loss": 2.5351, + "step": 7688 + }, + { + "epoch": 0.516090064091809, + "grad_norm": 4.04406213760376, + "learning_rate": 8.697025175160039e-05, + "loss": 2.296, + "step": 7690 + }, + { + "epoch": 0.516224287775578, + "grad_norm": 4.754507541656494, + "learning_rate": 8.696293367287638e-05, + "loss": 2.8079, + "step": 7692 + }, + { + "epoch": 0.516358511459347, + "grad_norm": 4.2741618156433105, + "learning_rate": 8.695561384772646e-05, + "loss": 2.5228, + "step": 7694 + }, + { + "epoch": 0.516492735143116, + "grad_norm": 3.8869478702545166, + "learning_rate": 8.694829227649644e-05, + "loss": 2.668, + "step": 7696 + }, + { + "epoch": 0.516626958826885, + "grad_norm": 4.7568864822387695, + "learning_rate": 8.694096895953227e-05, + "loss": 2.6572, + "step": 7698 + }, + { + "epoch": 0.5167611825106541, + "grad_norm": 3.909097671508789, + "learning_rate": 8.693364389717998e-05, + "loss": 2.4038, + "step": 7700 + }, + { + "epoch": 0.516895406194423, + "grad_norm": 4.057075023651123, + "learning_rate": 8.692631708978562e-05, + "loss": 2.5879, + "step": 7702 + }, + { + "epoch": 0.517029629878192, + "grad_norm": 4.75908899307251, + "learning_rate": 8.69189885376954e-05, + "loss": 2.5147, + "step": 7704 + }, + { + "epoch": 0.517163853561961, + "grad_norm": 4.300357341766357, + "learning_rate": 8.691165824125556e-05, + "loss": 2.6272, + "step": 7706 + }, + { + "epoch": 0.51729807724573, + "grad_norm": 4.365426063537598, + "learning_rate": 8.690432620081246e-05, + "loss": 2.5889, + "step": 7708 + }, + { + "epoch": 0.517432300929499, + "grad_norm": 4.774388790130615, + "learning_rate": 8.689699241671252e-05, + "loss": 2.4285, + "step": 7710 + }, + { + "epoch": 0.517566524613268, + "grad_norm": 4.306029796600342, + "learning_rate": 8.688965688930224e-05, + "loss": 2.3479, + "step": 7712 + }, + { + "epoch": 0.517700748297037, + "grad_norm": 4.200129508972168, + "learning_rate": 8.68823196189282e-05, + "loss": 2.7346, + "step": 7714 + }, + { + "epoch": 0.517834971980806, + "grad_norm": 4.222865104675293, + "learning_rate": 8.68749806059371e-05, + "loss": 2.9167, + "step": 7716 + }, + { + "epoch": 0.517969195664575, + "grad_norm": 4.387313365936279, + "learning_rate": 8.686763985067568e-05, + "loss": 2.7648, + "step": 7718 + }, + { + "epoch": 0.518103419348344, + "grad_norm": 4.233602523803711, + "learning_rate": 8.686029735349075e-05, + "loss": 2.4876, + "step": 7720 + }, + { + "epoch": 0.5182376430321131, + "grad_norm": 4.773935794830322, + "learning_rate": 8.685295311472927e-05, + "loss": 2.7023, + "step": 7722 + }, + { + "epoch": 0.518371866715882, + "grad_norm": 4.431504249572754, + "learning_rate": 8.684560713473822e-05, + "loss": 2.6229, + "step": 7724 + }, + { + "epoch": 0.518506090399651, + "grad_norm": 5.188981056213379, + "learning_rate": 8.68382594138647e-05, + "loss": 2.4367, + "step": 7726 + }, + { + "epoch": 0.51864031408342, + "grad_norm": 4.081497669219971, + "learning_rate": 8.683090995245584e-05, + "loss": 2.7612, + "step": 7728 + }, + { + "epoch": 0.5187745377671891, + "grad_norm": 5.153810024261475, + "learning_rate": 8.682355875085893e-05, + "loss": 2.3953, + "step": 7730 + }, + { + "epoch": 0.518908761450958, + "grad_norm": 3.790618419647217, + "learning_rate": 8.681620580942127e-05, + "loss": 2.4205, + "step": 7732 + }, + { + "epoch": 0.519042985134727, + "grad_norm": 4.093502521514893, + "learning_rate": 8.680885112849028e-05, + "loss": 2.5853, + "step": 7734 + }, + { + "epoch": 0.519177208818496, + "grad_norm": 4.206640720367432, + "learning_rate": 8.680149470841346e-05, + "loss": 2.5618, + "step": 7736 + }, + { + "epoch": 0.5193114325022651, + "grad_norm": 4.019861221313477, + "learning_rate": 8.679413654953837e-05, + "loss": 2.5481, + "step": 7738 + }, + { + "epoch": 0.519445656186034, + "grad_norm": 4.103980541229248, + "learning_rate": 8.678677665221268e-05, + "loss": 2.3164, + "step": 7740 + }, + { + "epoch": 0.519579879869803, + "grad_norm": 4.419374942779541, + "learning_rate": 8.677941501678415e-05, + "loss": 2.5201, + "step": 7742 + }, + { + "epoch": 0.5197141035535721, + "grad_norm": 4.712435245513916, + "learning_rate": 8.677205164360059e-05, + "loss": 3.0306, + "step": 7744 + }, + { + "epoch": 0.519848327237341, + "grad_norm": 3.7718887329101562, + "learning_rate": 8.676468653300987e-05, + "loss": 2.5192, + "step": 7746 + }, + { + "epoch": 0.51998255092111, + "grad_norm": 4.155655384063721, + "learning_rate": 8.675731968536002e-05, + "loss": 2.8078, + "step": 7748 + }, + { + "epoch": 0.520116774604879, + "grad_norm": 4.051815032958984, + "learning_rate": 8.674995110099911e-05, + "loss": 2.6438, + "step": 7750 + }, + { + "epoch": 0.5202509982886481, + "grad_norm": 4.111629486083984, + "learning_rate": 8.674258078027525e-05, + "loss": 2.6628, + "step": 7752 + }, + { + "epoch": 0.520385221972417, + "grad_norm": 4.951748371124268, + "learning_rate": 8.673520872353671e-05, + "loss": 2.6485, + "step": 7754 + }, + { + "epoch": 0.520519445656186, + "grad_norm": 5.134395122528076, + "learning_rate": 8.672783493113179e-05, + "loss": 2.9125, + "step": 7756 + }, + { + "epoch": 0.520653669339955, + "grad_norm": 3.8216757774353027, + "learning_rate": 8.672045940340889e-05, + "loss": 2.6321, + "step": 7758 + }, + { + "epoch": 0.5207878930237241, + "grad_norm": 11.177483558654785, + "learning_rate": 8.671308214071652e-05, + "loss": 2.5881, + "step": 7760 + }, + { + "epoch": 0.520922116707493, + "grad_norm": 4.693935394287109, + "learning_rate": 8.670570314340318e-05, + "loss": 2.5436, + "step": 7762 + }, + { + "epoch": 0.521056340391262, + "grad_norm": 4.31471061706543, + "learning_rate": 8.669832241181756e-05, + "loss": 2.6876, + "step": 7764 + }, + { + "epoch": 0.5211905640750311, + "grad_norm": 4.5565385818481445, + "learning_rate": 8.669093994630835e-05, + "loss": 2.8467, + "step": 7766 + }, + { + "epoch": 0.5213247877588001, + "grad_norm": 4.538492679595947, + "learning_rate": 8.66835557472244e-05, + "loss": 2.7616, + "step": 7768 + }, + { + "epoch": 0.521459011442569, + "grad_norm": 4.518410682678223, + "learning_rate": 8.667616981491458e-05, + "loss": 2.849, + "step": 7770 + }, + { + "epoch": 0.521593235126338, + "grad_norm": 5.218977928161621, + "learning_rate": 8.666878214972783e-05, + "loss": 2.6335, + "step": 7772 + }, + { + "epoch": 0.5217274588101071, + "grad_norm": 3.982905626296997, + "learning_rate": 8.666139275201325e-05, + "loss": 2.5744, + "step": 7774 + }, + { + "epoch": 0.521861682493876, + "grad_norm": 4.416606426239014, + "learning_rate": 8.665400162211995e-05, + "loss": 2.5901, + "step": 7776 + }, + { + "epoch": 0.521995906177645, + "grad_norm": 5.926548480987549, + "learning_rate": 8.664660876039715e-05, + "loss": 2.5403, + "step": 7778 + }, + { + "epoch": 0.522130129861414, + "grad_norm": 4.510757923126221, + "learning_rate": 8.663921416719415e-05, + "loss": 2.9672, + "step": 7780 + }, + { + "epoch": 0.5222643535451831, + "grad_norm": 3.9402148723602295, + "learning_rate": 8.663181784286032e-05, + "loss": 2.7806, + "step": 7782 + }, + { + "epoch": 0.522398577228952, + "grad_norm": 9.779068946838379, + "learning_rate": 8.662441978774514e-05, + "loss": 2.4038, + "step": 7784 + }, + { + "epoch": 0.522532800912721, + "grad_norm": 4.330663681030273, + "learning_rate": 8.661702000219814e-05, + "loss": 2.6648, + "step": 7786 + }, + { + "epoch": 0.5226670245964901, + "grad_norm": 4.18076753616333, + "learning_rate": 8.660961848656895e-05, + "loss": 2.427, + "step": 7788 + }, + { + "epoch": 0.5228012482802591, + "grad_norm": 4.143764495849609, + "learning_rate": 8.660221524120727e-05, + "loss": 2.7626, + "step": 7790 + }, + { + "epoch": 0.522935471964028, + "grad_norm": 4.345688343048096, + "learning_rate": 8.65948102664629e-05, + "loss": 2.4092, + "step": 7792 + }, + { + "epoch": 0.523069695647797, + "grad_norm": 4.829495429992676, + "learning_rate": 8.658740356268571e-05, + "loss": 2.6023, + "step": 7794 + }, + { + "epoch": 0.5232039193315661, + "grad_norm": 4.245131969451904, + "learning_rate": 8.657999513022563e-05, + "loss": 2.5413, + "step": 7796 + }, + { + "epoch": 0.5233381430153351, + "grad_norm": 4.898064136505127, + "learning_rate": 8.657258496943274e-05, + "loss": 2.4862, + "step": 7798 + }, + { + "epoch": 0.523472366699104, + "grad_norm": 4.1950297355651855, + "learning_rate": 8.65651730806571e-05, + "loss": 2.6887, + "step": 7800 + }, + { + "epoch": 0.523606590382873, + "grad_norm": 4.158627986907959, + "learning_rate": 8.655775946424895e-05, + "loss": 2.2618, + "step": 7802 + }, + { + "epoch": 0.5237408140666421, + "grad_norm": 5.042837619781494, + "learning_rate": 8.655034412055856e-05, + "loss": 2.6582, + "step": 7804 + }, + { + "epoch": 0.5238750377504111, + "grad_norm": 4.598635196685791, + "learning_rate": 8.654292704993627e-05, + "loss": 2.5118, + "step": 7806 + }, + { + "epoch": 0.52400926143418, + "grad_norm": 3.8862454891204834, + "learning_rate": 8.653550825273253e-05, + "loss": 2.5077, + "step": 7808 + }, + { + "epoch": 0.5241434851179491, + "grad_norm": 4.829532146453857, + "learning_rate": 8.652808772929788e-05, + "loss": 2.9614, + "step": 7810 + }, + { + "epoch": 0.5242777088017181, + "grad_norm": 4.586523532867432, + "learning_rate": 8.652066547998292e-05, + "loss": 2.6114, + "step": 7812 + }, + { + "epoch": 0.524411932485487, + "grad_norm": 7.152475357055664, + "learning_rate": 8.651324150513833e-05, + "loss": 2.5168, + "step": 7814 + }, + { + "epoch": 0.524546156169256, + "grad_norm": 4.570213794708252, + "learning_rate": 8.650581580511487e-05, + "loss": 2.6234, + "step": 7816 + }, + { + "epoch": 0.5246803798530251, + "grad_norm": 4.530813694000244, + "learning_rate": 8.64983883802634e-05, + "loss": 2.4293, + "step": 7818 + }, + { + "epoch": 0.5248146035367941, + "grad_norm": 3.3239657878875732, + "learning_rate": 8.649095923093484e-05, + "loss": 2.1217, + "step": 7820 + }, + { + "epoch": 0.524948827220563, + "grad_norm": 4.191898345947266, + "learning_rate": 8.648352835748024e-05, + "loss": 2.5192, + "step": 7822 + }, + { + "epoch": 0.525083050904332, + "grad_norm": 3.9366202354431152, + "learning_rate": 8.647609576025064e-05, + "loss": 2.4018, + "step": 7824 + }, + { + "epoch": 0.5252172745881011, + "grad_norm": 4.59829568862915, + "learning_rate": 8.646866143959725e-05, + "loss": 2.6877, + "step": 7826 + }, + { + "epoch": 0.5253514982718701, + "grad_norm": 3.878995418548584, + "learning_rate": 8.646122539587133e-05, + "loss": 2.3951, + "step": 7828 + }, + { + "epoch": 0.525485721955639, + "grad_norm": 4.590762138366699, + "learning_rate": 8.64537876294242e-05, + "loss": 2.6999, + "step": 7830 + }, + { + "epoch": 0.5256199456394081, + "grad_norm": 4.366131782531738, + "learning_rate": 8.644634814060728e-05, + "loss": 2.3999, + "step": 7832 + }, + { + "epoch": 0.5257541693231771, + "grad_norm": 4.579868316650391, + "learning_rate": 8.643890692977209e-05, + "loss": 2.7246, + "step": 7834 + }, + { + "epoch": 0.5258883930069461, + "grad_norm": 4.333804607391357, + "learning_rate": 8.64314639972702e-05, + "loss": 2.4991, + "step": 7836 + }, + { + "epoch": 0.526022616690715, + "grad_norm": 4.387228965759277, + "learning_rate": 8.642401934345328e-05, + "loss": 2.7469, + "step": 7838 + }, + { + "epoch": 0.5261568403744841, + "grad_norm": 4.322113990783691, + "learning_rate": 8.641657296867306e-05, + "loss": 2.4529, + "step": 7840 + }, + { + "epoch": 0.5262910640582531, + "grad_norm": 4.518538951873779, + "learning_rate": 8.640912487328139e-05, + "loss": 2.5449, + "step": 7842 + }, + { + "epoch": 0.5264252877420221, + "grad_norm": 3.9182779788970947, + "learning_rate": 8.640167505763014e-05, + "loss": 2.3911, + "step": 7844 + }, + { + "epoch": 0.526559511425791, + "grad_norm": 3.991652011871338, + "learning_rate": 8.639422352207136e-05, + "loss": 2.8132, + "step": 7846 + }, + { + "epoch": 0.5266937351095601, + "grad_norm": 8.387751579284668, + "learning_rate": 8.638677026695707e-05, + "loss": 2.4719, + "step": 7848 + }, + { + "epoch": 0.5268279587933291, + "grad_norm": 4.986452102661133, + "learning_rate": 8.637931529263943e-05, + "loss": 2.7852, + "step": 7850 + }, + { + "epoch": 0.526962182477098, + "grad_norm": 3.9898791313171387, + "learning_rate": 8.637185859947067e-05, + "loss": 2.3864, + "step": 7852 + }, + { + "epoch": 0.5270964061608671, + "grad_norm": 4.326294898986816, + "learning_rate": 8.636440018780314e-05, + "loss": 2.5898, + "step": 7854 + }, + { + "epoch": 0.5272306298446361, + "grad_norm": 4.389654636383057, + "learning_rate": 8.635694005798919e-05, + "loss": 2.5991, + "step": 7856 + }, + { + "epoch": 0.5273648535284051, + "grad_norm": 4.385763168334961, + "learning_rate": 8.634947821038132e-05, + "loss": 2.9905, + "step": 7858 + }, + { + "epoch": 0.527499077212174, + "grad_norm": 4.243071556091309, + "learning_rate": 8.634201464533207e-05, + "loss": 2.6927, + "step": 7860 + }, + { + "epoch": 0.5276333008959431, + "grad_norm": 4.06496000289917, + "learning_rate": 8.633454936319411e-05, + "loss": 2.5673, + "step": 7862 + }, + { + "epoch": 0.5277675245797121, + "grad_norm": 4.197615146636963, + "learning_rate": 8.632708236432015e-05, + "loss": 2.6029, + "step": 7864 + }, + { + "epoch": 0.5279017482634811, + "grad_norm": 4.647643566131592, + "learning_rate": 8.631961364906296e-05, + "loss": 2.509, + "step": 7866 + }, + { + "epoch": 0.52803597194725, + "grad_norm": 4.320682525634766, + "learning_rate": 8.631214321777546e-05, + "loss": 2.8207, + "step": 7868 + }, + { + "epoch": 0.5281701956310191, + "grad_norm": 3.9764275550842285, + "learning_rate": 8.630467107081057e-05, + "loss": 2.6107, + "step": 7870 + }, + { + "epoch": 0.5283044193147881, + "grad_norm": 4.257374286651611, + "learning_rate": 8.629719720852138e-05, + "loss": 2.4409, + "step": 7872 + }, + { + "epoch": 0.5284386429985571, + "grad_norm": 4.278438568115234, + "learning_rate": 8.628972163126101e-05, + "loss": 2.4687, + "step": 7874 + }, + { + "epoch": 0.5285728666823261, + "grad_norm": 3.953315019607544, + "learning_rate": 8.628224433938263e-05, + "loss": 2.4945, + "step": 7876 + }, + { + "epoch": 0.5287070903660951, + "grad_norm": 4.087108135223389, + "learning_rate": 8.627476533323957e-05, + "loss": 2.5739, + "step": 7878 + }, + { + "epoch": 0.5288413140498641, + "grad_norm": 9.083980560302734, + "learning_rate": 8.626728461318518e-05, + "loss": 2.6826, + "step": 7880 + }, + { + "epoch": 0.5289755377336331, + "grad_norm": 4.440384387969971, + "learning_rate": 8.62598021795729e-05, + "loss": 2.5707, + "step": 7882 + }, + { + "epoch": 0.5291097614174021, + "grad_norm": 4.4943528175354, + "learning_rate": 8.625231803275627e-05, + "loss": 2.7634, + "step": 7884 + }, + { + "epoch": 0.5292439851011711, + "grad_norm": 3.400726795196533, + "learning_rate": 8.62448321730889e-05, + "loss": 2.4414, + "step": 7886 + }, + { + "epoch": 0.5293782087849401, + "grad_norm": 5.8532023429870605, + "learning_rate": 8.623734460092449e-05, + "loss": 2.483, + "step": 7888 + }, + { + "epoch": 0.529512432468709, + "grad_norm": 4.663845062255859, + "learning_rate": 8.62298553166168e-05, + "loss": 2.6216, + "step": 7890 + }, + { + "epoch": 0.5296466561524781, + "grad_norm": 3.9895107746124268, + "learning_rate": 8.622236432051969e-05, + "loss": 2.5717, + "step": 7892 + }, + { + "epoch": 0.5297808798362471, + "grad_norm": 5.88865327835083, + "learning_rate": 8.621487161298709e-05, + "loss": 2.4854, + "step": 7894 + }, + { + "epoch": 0.5299151035200161, + "grad_norm": 4.466606616973877, + "learning_rate": 8.620737719437302e-05, + "loss": 2.6799, + "step": 7896 + }, + { + "epoch": 0.5300493272037851, + "grad_norm": 4.0327301025390625, + "learning_rate": 8.619988106503157e-05, + "loss": 2.0806, + "step": 7898 + }, + { + "epoch": 0.5301835508875541, + "grad_norm": 3.9066245555877686, + "learning_rate": 8.619238322531694e-05, + "loss": 2.5709, + "step": 7900 + }, + { + "epoch": 0.5303177745713231, + "grad_norm": 3.9463694095611572, + "learning_rate": 8.618488367558335e-05, + "loss": 2.508, + "step": 7902 + }, + { + "epoch": 0.5304519982550921, + "grad_norm": 5.188057899475098, + "learning_rate": 8.617738241618518e-05, + "loss": 2.4754, + "step": 7904 + }, + { + "epoch": 0.5305862219388611, + "grad_norm": 4.426109790802002, + "learning_rate": 8.616987944747683e-05, + "loss": 2.55, + "step": 7906 + }, + { + "epoch": 0.5307204456226301, + "grad_norm": 3.9903974533081055, + "learning_rate": 8.616237476981279e-05, + "loss": 2.4372, + "step": 7908 + }, + { + "epoch": 0.5308546693063991, + "grad_norm": 7.278976917266846, + "learning_rate": 8.615486838354765e-05, + "loss": 2.3931, + "step": 7910 + }, + { + "epoch": 0.5309888929901682, + "grad_norm": 4.570991039276123, + "learning_rate": 8.61473602890361e-05, + "loss": 2.7201, + "step": 7912 + }, + { + "epoch": 0.5311231166739371, + "grad_norm": 4.029073238372803, + "learning_rate": 8.613985048663282e-05, + "loss": 2.5726, + "step": 7914 + }, + { + "epoch": 0.5312573403577061, + "grad_norm": 4.330550670623779, + "learning_rate": 8.613233897669271e-05, + "loss": 2.4154, + "step": 7916 + }, + { + "epoch": 0.5313915640414751, + "grad_norm": 4.407141208648682, + "learning_rate": 8.612482575957062e-05, + "loss": 2.6854, + "step": 7918 + }, + { + "epoch": 0.5315257877252442, + "grad_norm": 4.084245204925537, + "learning_rate": 8.611731083562154e-05, + "loss": 2.3102, + "step": 7920 + }, + { + "epoch": 0.5316600114090131, + "grad_norm": 4.506310939788818, + "learning_rate": 8.610979420520053e-05, + "loss": 2.7083, + "step": 7922 + }, + { + "epoch": 0.5317942350927821, + "grad_norm": 4.71737813949585, + "learning_rate": 8.610227586866278e-05, + "loss": 2.3992, + "step": 7924 + }, + { + "epoch": 0.5319284587765511, + "grad_norm": 3.3069674968719482, + "learning_rate": 8.609475582636348e-05, + "loss": 2.3179, + "step": 7926 + }, + { + "epoch": 0.5320626824603201, + "grad_norm": 5.165279865264893, + "learning_rate": 8.608723407865792e-05, + "loss": 2.7187, + "step": 7928 + }, + { + "epoch": 0.5321969061440891, + "grad_norm": 4.771925449371338, + "learning_rate": 8.607971062590155e-05, + "loss": 2.5056, + "step": 7930 + }, + { + "epoch": 0.5323311298278581, + "grad_norm": 4.870205402374268, + "learning_rate": 8.607218546844979e-05, + "loss": 2.5591, + "step": 7932 + }, + { + "epoch": 0.5324653535116272, + "grad_norm": 4.264636039733887, + "learning_rate": 8.606465860665819e-05, + "loss": 2.5409, + "step": 7934 + }, + { + "epoch": 0.5325995771953961, + "grad_norm": 4.230488300323486, + "learning_rate": 8.605713004088238e-05, + "loss": 2.8461, + "step": 7936 + }, + { + "epoch": 0.5327338008791651, + "grad_norm": 4.155982494354248, + "learning_rate": 8.60495997714781e-05, + "loss": 2.5138, + "step": 7938 + }, + { + "epoch": 0.5328680245629341, + "grad_norm": 3.9188036918640137, + "learning_rate": 8.60420677988011e-05, + "loss": 2.5341, + "step": 7940 + }, + { + "epoch": 0.5330022482467032, + "grad_norm": 4.2725372314453125, + "learning_rate": 8.60345341232073e-05, + "loss": 2.4795, + "step": 7942 + }, + { + "epoch": 0.5331364719304721, + "grad_norm": 4.599841594696045, + "learning_rate": 8.60269987450526e-05, + "loss": 2.5429, + "step": 7944 + }, + { + "epoch": 0.5332706956142411, + "grad_norm": 4.052094459533691, + "learning_rate": 8.601946166469305e-05, + "loss": 2.4683, + "step": 7946 + }, + { + "epoch": 0.5334049192980101, + "grad_norm": 3.761728286743164, + "learning_rate": 8.601192288248478e-05, + "loss": 2.5079, + "step": 7948 + }, + { + "epoch": 0.5335391429817792, + "grad_norm": 4.451366901397705, + "learning_rate": 8.600438239878394e-05, + "loss": 2.4978, + "step": 7950 + }, + { + "epoch": 0.5336733666655481, + "grad_norm": 4.719310283660889, + "learning_rate": 8.599684021394686e-05, + "loss": 3.0114, + "step": 7952 + }, + { + "epoch": 0.5338075903493171, + "grad_norm": 4.4590606689453125, + "learning_rate": 8.598929632832985e-05, + "loss": 2.5896, + "step": 7954 + }, + { + "epoch": 0.5339418140330862, + "grad_norm": 3.4980359077453613, + "learning_rate": 8.598175074228938e-05, + "loss": 2.2694, + "step": 7956 + }, + { + "epoch": 0.5340760377168552, + "grad_norm": 4.550145149230957, + "learning_rate": 8.597420345618193e-05, + "loss": 2.4754, + "step": 7958 + }, + { + "epoch": 0.5342102614006241, + "grad_norm": 4.040860652923584, + "learning_rate": 8.596665447036409e-05, + "loss": 2.5747, + "step": 7960 + }, + { + "epoch": 0.5343444850843931, + "grad_norm": 4.391819953918457, + "learning_rate": 8.595910378519257e-05, + "loss": 2.6966, + "step": 7962 + }, + { + "epoch": 0.5344787087681622, + "grad_norm": 3.9462764263153076, + "learning_rate": 8.59515514010241e-05, + "loss": 2.44, + "step": 7964 + }, + { + "epoch": 0.5346129324519311, + "grad_norm": 4.971057415008545, + "learning_rate": 8.594399731821552e-05, + "loss": 2.5661, + "step": 7966 + }, + { + "epoch": 0.5347471561357001, + "grad_norm": 4.2162299156188965, + "learning_rate": 8.593644153712374e-05, + "loss": 2.5175, + "step": 7968 + }, + { + "epoch": 0.5348813798194691, + "grad_norm": 4.711593151092529, + "learning_rate": 8.592888405810578e-05, + "loss": 2.5439, + "step": 7970 + }, + { + "epoch": 0.5350156035032382, + "grad_norm": 5.139729022979736, + "learning_rate": 8.59213248815187e-05, + "loss": 2.7122, + "step": 7972 + }, + { + "epoch": 0.5351498271870071, + "grad_norm": 4.613955020904541, + "learning_rate": 8.591376400771964e-05, + "loss": 2.3277, + "step": 7974 + }, + { + "epoch": 0.5352840508707761, + "grad_norm": 5.149893760681152, + "learning_rate": 8.590620143706587e-05, + "loss": 2.8251, + "step": 7976 + }, + { + "epoch": 0.5354182745545452, + "grad_norm": 4.8103814125061035, + "learning_rate": 8.589863716991469e-05, + "loss": 2.8251, + "step": 7978 + }, + { + "epoch": 0.5355524982383142, + "grad_norm": 3.9946272373199463, + "learning_rate": 8.589107120662348e-05, + "loss": 2.3991, + "step": 7980 + }, + { + "epoch": 0.5356867219220831, + "grad_norm": 11.562091827392578, + "learning_rate": 8.588350354754973e-05, + "loss": 2.5604, + "step": 7982 + }, + { + "epoch": 0.5358209456058521, + "grad_norm": 4.426586151123047, + "learning_rate": 8.587593419305101e-05, + "loss": 2.6303, + "step": 7984 + }, + { + "epoch": 0.5359551692896212, + "grad_norm": 3.5754129886627197, + "learning_rate": 8.586836314348494e-05, + "loss": 2.6859, + "step": 7986 + }, + { + "epoch": 0.5360893929733902, + "grad_norm": 3.9783775806427, + "learning_rate": 8.586079039920924e-05, + "loss": 2.4879, + "step": 7988 + }, + { + "epoch": 0.5362236166571591, + "grad_norm": 6.910389423370361, + "learning_rate": 8.585321596058174e-05, + "loss": 2.8797, + "step": 7990 + }, + { + "epoch": 0.5363578403409281, + "grad_norm": 6.6749725341796875, + "learning_rate": 8.584563982796026e-05, + "loss": 2.5065, + "step": 7992 + }, + { + "epoch": 0.5364920640246972, + "grad_norm": 4.4966607093811035, + "learning_rate": 8.583806200170279e-05, + "loss": 2.6714, + "step": 7994 + }, + { + "epoch": 0.5366262877084662, + "grad_norm": 5.05345344543457, + "learning_rate": 8.583048248216736e-05, + "loss": 2.6603, + "step": 7996 + }, + { + "epoch": 0.5367605113922351, + "grad_norm": 4.12758207321167, + "learning_rate": 8.582290126971209e-05, + "loss": 2.4507, + "step": 7998 + }, + { + "epoch": 0.5368947350760042, + "grad_norm": 3.6378579139709473, + "learning_rate": 8.581531836469518e-05, + "loss": 2.3878, + "step": 8000 + }, + { + "epoch": 0.5370289587597732, + "grad_norm": 4.1027021408081055, + "learning_rate": 8.580773376747492e-05, + "loss": 2.4425, + "step": 8002 + }, + { + "epoch": 0.5371631824435421, + "grad_norm": 4.127890110015869, + "learning_rate": 8.580014747840964e-05, + "loss": 2.4977, + "step": 8004 + }, + { + "epoch": 0.5372974061273111, + "grad_norm": 4.317221641540527, + "learning_rate": 8.579255949785779e-05, + "loss": 2.4934, + "step": 8006 + }, + { + "epoch": 0.5374316298110802, + "grad_norm": 4.497485637664795, + "learning_rate": 8.578496982617788e-05, + "loss": 2.6716, + "step": 8008 + }, + { + "epoch": 0.5375658534948492, + "grad_norm": 4.33066463470459, + "learning_rate": 8.577737846372853e-05, + "loss": 2.7603, + "step": 8010 + }, + { + "epoch": 0.5377000771786181, + "grad_norm": 4.210849761962891, + "learning_rate": 8.576978541086838e-05, + "loss": 2.4898, + "step": 8012 + }, + { + "epoch": 0.5378343008623871, + "grad_norm": 4.391765594482422, + "learning_rate": 8.576219066795622e-05, + "loss": 2.3638, + "step": 8014 + }, + { + "epoch": 0.5379685245461562, + "grad_norm": 3.5945048332214355, + "learning_rate": 8.575459423535089e-05, + "loss": 2.5099, + "step": 8016 + }, + { + "epoch": 0.5381027482299252, + "grad_norm": 4.290977954864502, + "learning_rate": 8.574699611341125e-05, + "loss": 2.4592, + "step": 8018 + }, + { + "epoch": 0.5382369719136941, + "grad_norm": 3.9793906211853027, + "learning_rate": 8.573939630249638e-05, + "loss": 2.6812, + "step": 8020 + }, + { + "epoch": 0.5383711955974632, + "grad_norm": 4.765017032623291, + "learning_rate": 8.573179480296529e-05, + "loss": 2.7961, + "step": 8022 + }, + { + "epoch": 0.5385054192812322, + "grad_norm": 4.4390363693237305, + "learning_rate": 8.572419161517716e-05, + "loss": 2.6257, + "step": 8024 + }, + { + "epoch": 0.5386396429650012, + "grad_norm": 4.646846771240234, + "learning_rate": 8.571658673949124e-05, + "loss": 2.6642, + "step": 8026 + }, + { + "epoch": 0.5387738666487701, + "grad_norm": 9.733538627624512, + "learning_rate": 8.570898017626681e-05, + "loss": 2.479, + "step": 8028 + }, + { + "epoch": 0.5389080903325392, + "grad_norm": 4.450980186462402, + "learning_rate": 8.570137192586329e-05, + "loss": 2.7121, + "step": 8030 + }, + { + "epoch": 0.5390423140163082, + "grad_norm": 3.727104663848877, + "learning_rate": 8.569376198864015e-05, + "loss": 2.4955, + "step": 8032 + }, + { + "epoch": 0.5391765377000772, + "grad_norm": 4.16594934463501, + "learning_rate": 8.568615036495696e-05, + "loss": 2.5656, + "step": 8034 + }, + { + "epoch": 0.5393107613838461, + "grad_norm": 3.636723756790161, + "learning_rate": 8.567853705517332e-05, + "loss": 2.332, + "step": 8036 + }, + { + "epoch": 0.5394449850676152, + "grad_norm": 4.95151948928833, + "learning_rate": 8.567092205964897e-05, + "loss": 2.4277, + "step": 8038 + }, + { + "epoch": 0.5395792087513842, + "grad_norm": 5.008978366851807, + "learning_rate": 8.566330537874369e-05, + "loss": 2.5309, + "step": 8040 + }, + { + "epoch": 0.5397134324351531, + "grad_norm": 4.234550952911377, + "learning_rate": 8.565568701281738e-05, + "loss": 2.2145, + "step": 8042 + }, + { + "epoch": 0.5398476561189222, + "grad_norm": 3.5870344638824463, + "learning_rate": 8.564806696222995e-05, + "loss": 2.0369, + "step": 8044 + }, + { + "epoch": 0.5399818798026912, + "grad_norm": 4.443338871002197, + "learning_rate": 8.564044522734147e-05, + "loss": 2.5245, + "step": 8046 + }, + { + "epoch": 0.5401161034864602, + "grad_norm": 4.957966327667236, + "learning_rate": 8.563282180851203e-05, + "loss": 2.7679, + "step": 8048 + }, + { + "epoch": 0.5402503271702291, + "grad_norm": 5.453382968902588, + "learning_rate": 8.562519670610183e-05, + "loss": 2.7301, + "step": 8050 + }, + { + "epoch": 0.5403845508539982, + "grad_norm": 4.377908706665039, + "learning_rate": 8.561756992047114e-05, + "loss": 2.6769, + "step": 8052 + }, + { + "epoch": 0.5405187745377672, + "grad_norm": 4.1069536209106445, + "learning_rate": 8.560994145198031e-05, + "loss": 2.4965, + "step": 8054 + }, + { + "epoch": 0.5406529982215362, + "grad_norm": 4.0189361572265625, + "learning_rate": 8.560231130098977e-05, + "loss": 2.7421, + "step": 8056 + }, + { + "epoch": 0.5407872219053051, + "grad_norm": 5.241302490234375, + "learning_rate": 8.559467946786002e-05, + "loss": 2.7436, + "step": 8058 + }, + { + "epoch": 0.5409214455890742, + "grad_norm": 3.989299774169922, + "learning_rate": 8.558704595295168e-05, + "loss": 2.4371, + "step": 8060 + }, + { + "epoch": 0.5410556692728432, + "grad_norm": 4.198920726776123, + "learning_rate": 8.557941075662538e-05, + "loss": 2.601, + "step": 8062 + }, + { + "epoch": 0.5411898929566122, + "grad_norm": 4.102448463439941, + "learning_rate": 8.55717738792419e-05, + "loss": 2.5383, + "step": 8064 + }, + { + "epoch": 0.5413241166403812, + "grad_norm": 6.239486217498779, + "learning_rate": 8.556413532116204e-05, + "loss": 2.4194, + "step": 8066 + }, + { + "epoch": 0.5414583403241502, + "grad_norm": 5.557333469390869, + "learning_rate": 8.555649508274672e-05, + "loss": 2.743, + "step": 8068 + }, + { + "epoch": 0.5415925640079192, + "grad_norm": 4.180229663848877, + "learning_rate": 8.554885316435694e-05, + "loss": 2.7462, + "step": 8070 + }, + { + "epoch": 0.5417267876916882, + "grad_norm": 3.9027676582336426, + "learning_rate": 8.554120956635375e-05, + "loss": 2.4961, + "step": 8072 + }, + { + "epoch": 0.5418610113754572, + "grad_norm": 4.273139953613281, + "learning_rate": 8.553356428909827e-05, + "loss": 2.5783, + "step": 8074 + }, + { + "epoch": 0.5419952350592262, + "grad_norm": 3.908167839050293, + "learning_rate": 8.552591733295179e-05, + "loss": 2.6274, + "step": 8076 + }, + { + "epoch": 0.5421294587429952, + "grad_norm": 4.3872270584106445, + "learning_rate": 8.551826869827555e-05, + "loss": 2.8294, + "step": 8078 + }, + { + "epoch": 0.5422636824267641, + "grad_norm": 4.477575778961182, + "learning_rate": 8.551061838543097e-05, + "loss": 2.7391, + "step": 8080 + }, + { + "epoch": 0.5423979061105332, + "grad_norm": 5.517204284667969, + "learning_rate": 8.550296639477948e-05, + "loss": 2.6704, + "step": 8082 + }, + { + "epoch": 0.5425321297943022, + "grad_norm": 4.290626525878906, + "learning_rate": 8.549531272668266e-05, + "loss": 2.533, + "step": 8084 + }, + { + "epoch": 0.5426663534780712, + "grad_norm": 5.084266662597656, + "learning_rate": 8.54876573815021e-05, + "loss": 2.7561, + "step": 8086 + }, + { + "epoch": 0.5428005771618402, + "grad_norm": 4.279355049133301, + "learning_rate": 8.548000035959953e-05, + "loss": 2.5227, + "step": 8088 + }, + { + "epoch": 0.5429348008456092, + "grad_norm": 4.231164455413818, + "learning_rate": 8.547234166133671e-05, + "loss": 2.5146, + "step": 8090 + }, + { + "epoch": 0.5430690245293782, + "grad_norm": 4.726373672485352, + "learning_rate": 8.546468128707548e-05, + "loss": 2.3581, + "step": 8092 + }, + { + "epoch": 0.5432032482131472, + "grad_norm": 4.160212993621826, + "learning_rate": 8.545701923717781e-05, + "loss": 2.9179, + "step": 8094 + }, + { + "epoch": 0.5433374718969162, + "grad_norm": 3.619513750076294, + "learning_rate": 8.54493555120057e-05, + "loss": 2.2703, + "step": 8096 + }, + { + "epoch": 0.5434716955806852, + "grad_norm": 4.557647705078125, + "learning_rate": 8.544169011192125e-05, + "loss": 2.8888, + "step": 8098 + }, + { + "epoch": 0.5436059192644542, + "grad_norm": 4.125056743621826, + "learning_rate": 8.543402303728665e-05, + "loss": 2.6946, + "step": 8100 + }, + { + "epoch": 0.5437401429482233, + "grad_norm": 4.06985330581665, + "learning_rate": 8.542635428846413e-05, + "loss": 2.4114, + "step": 8102 + }, + { + "epoch": 0.5438743666319922, + "grad_norm": 4.345375061035156, + "learning_rate": 8.541868386581601e-05, + "loss": 2.6558, + "step": 8104 + }, + { + "epoch": 0.5440085903157612, + "grad_norm": 4.179208755493164, + "learning_rate": 8.541101176970476e-05, + "loss": 2.8875, + "step": 8106 + }, + { + "epoch": 0.5441428139995302, + "grad_norm": 4.244075775146484, + "learning_rate": 8.540333800049282e-05, + "loss": 2.7773, + "step": 8108 + }, + { + "epoch": 0.5442770376832993, + "grad_norm": 3.676865816116333, + "learning_rate": 8.539566255854277e-05, + "loss": 2.6402, + "step": 8110 + }, + { + "epoch": 0.5444112613670682, + "grad_norm": 4.1005401611328125, + "learning_rate": 8.538798544421729e-05, + "loss": 2.4639, + "step": 8112 + }, + { + "epoch": 0.5445454850508372, + "grad_norm": 3.9115328788757324, + "learning_rate": 8.538030665787906e-05, + "loss": 2.252, + "step": 8114 + }, + { + "epoch": 0.5446797087346062, + "grad_norm": 5.012974262237549, + "learning_rate": 8.537262619989093e-05, + "loss": 2.9147, + "step": 8116 + }, + { + "epoch": 0.5448139324183752, + "grad_norm": 8.417624473571777, + "learning_rate": 8.536494407061575e-05, + "loss": 2.5004, + "step": 8118 + }, + { + "epoch": 0.5449481561021442, + "grad_norm": 4.0051774978637695, + "learning_rate": 8.535726027041652e-05, + "loss": 2.8997, + "step": 8120 + }, + { + "epoch": 0.5450823797859132, + "grad_norm": 4.453825950622559, + "learning_rate": 8.534957479965629e-05, + "loss": 2.47, + "step": 8122 + }, + { + "epoch": 0.5452166034696823, + "grad_norm": 3.924103021621704, + "learning_rate": 8.534188765869813e-05, + "loss": 2.4944, + "step": 8124 + }, + { + "epoch": 0.5453508271534512, + "grad_norm": 4.378748893737793, + "learning_rate": 8.533419884790528e-05, + "loss": 2.4826, + "step": 8126 + }, + { + "epoch": 0.5454850508372202, + "grad_norm": 5.203148365020752, + "learning_rate": 8.532650836764102e-05, + "loss": 2.7763, + "step": 8128 + }, + { + "epoch": 0.5456192745209892, + "grad_norm": 4.275012969970703, + "learning_rate": 8.531881621826871e-05, + "loss": 2.683, + "step": 8130 + }, + { + "epoch": 0.5457534982047583, + "grad_norm": 4.2090277671813965, + "learning_rate": 8.531112240015178e-05, + "loss": 2.6628, + "step": 8132 + }, + { + "epoch": 0.5458877218885272, + "grad_norm": 5.175638675689697, + "learning_rate": 8.530342691365377e-05, + "loss": 2.573, + "step": 8134 + }, + { + "epoch": 0.5460219455722962, + "grad_norm": 5.116296768188477, + "learning_rate": 8.529572975913824e-05, + "loss": 2.2855, + "step": 8136 + }, + { + "epoch": 0.5461561692560652, + "grad_norm": 4.279474258422852, + "learning_rate": 8.528803093696889e-05, + "loss": 2.2895, + "step": 8138 + }, + { + "epoch": 0.5462903929398343, + "grad_norm": 4.436120986938477, + "learning_rate": 8.528033044750947e-05, + "loss": 2.6675, + "step": 8140 + }, + { + "epoch": 0.5464246166236032, + "grad_norm": 4.406123161315918, + "learning_rate": 8.527262829112382e-05, + "loss": 2.8858, + "step": 8142 + }, + { + "epoch": 0.5465588403073722, + "grad_norm": 4.271512508392334, + "learning_rate": 8.526492446817583e-05, + "loss": 2.6279, + "step": 8144 + }, + { + "epoch": 0.5466930639911413, + "grad_norm": 4.273552417755127, + "learning_rate": 8.52572189790295e-05, + "loss": 2.4079, + "step": 8146 + }, + { + "epoch": 0.5468272876749103, + "grad_norm": 3.876375913619995, + "learning_rate": 8.524951182404892e-05, + "loss": 2.3574, + "step": 8148 + }, + { + "epoch": 0.5469615113586792, + "grad_norm": 3.983973503112793, + "learning_rate": 8.52418030035982e-05, + "loss": 2.7919, + "step": 8150 + }, + { + "epoch": 0.5470957350424482, + "grad_norm": 5.272140026092529, + "learning_rate": 8.523409251804163e-05, + "loss": 2.5509, + "step": 8152 + }, + { + "epoch": 0.5472299587262173, + "grad_norm": 5.105844497680664, + "learning_rate": 8.522638036774345e-05, + "loss": 2.6468, + "step": 8154 + }, + { + "epoch": 0.5473641824099862, + "grad_norm": 3.7335832118988037, + "learning_rate": 8.521866655306805e-05, + "loss": 2.7396, + "step": 8156 + }, + { + "epoch": 0.5474984060937552, + "grad_norm": 5.03615665435791, + "learning_rate": 8.521095107437993e-05, + "loss": 2.5291, + "step": 8158 + }, + { + "epoch": 0.5476326297775242, + "grad_norm": 3.9458134174346924, + "learning_rate": 8.52032339320436e-05, + "loss": 2.4699, + "step": 8160 + }, + { + "epoch": 0.5477668534612933, + "grad_norm": 4.119931221008301, + "learning_rate": 8.51955151264237e-05, + "loss": 2.3864, + "step": 8162 + }, + { + "epoch": 0.5479010771450622, + "grad_norm": 4.583990573883057, + "learning_rate": 8.518779465788491e-05, + "loss": 2.8692, + "step": 8164 + }, + { + "epoch": 0.5480353008288312, + "grad_norm": 3.8425562381744385, + "learning_rate": 8.518007252679202e-05, + "loss": 2.5118, + "step": 8166 + }, + { + "epoch": 0.5481695245126003, + "grad_norm": 3.9390852451324463, + "learning_rate": 8.517234873350987e-05, + "loss": 2.4117, + "step": 8168 + }, + { + "epoch": 0.5483037481963693, + "grad_norm": 3.9448883533477783, + "learning_rate": 8.516462327840343e-05, + "loss": 2.3101, + "step": 8170 + }, + { + "epoch": 0.5484379718801382, + "grad_norm": 5.998781204223633, + "learning_rate": 8.515689616183769e-05, + "loss": 2.5359, + "step": 8172 + }, + { + "epoch": 0.5485721955639072, + "grad_norm": 4.809030055999756, + "learning_rate": 8.51491673841777e-05, + "loss": 2.6064, + "step": 8174 + }, + { + "epoch": 0.5487064192476763, + "grad_norm": 4.268282890319824, + "learning_rate": 8.51414369457887e-05, + "loss": 2.5089, + "step": 8176 + }, + { + "epoch": 0.5488406429314453, + "grad_norm": 4.691493034362793, + "learning_rate": 8.513370484703591e-05, + "loss": 2.648, + "step": 8178 + }, + { + "epoch": 0.5489748666152142, + "grad_norm": 4.372832298278809, + "learning_rate": 8.512597108828464e-05, + "loss": 2.694, + "step": 8180 + }, + { + "epoch": 0.5491090902989832, + "grad_norm": 4.148030757904053, + "learning_rate": 8.511823566990031e-05, + "loss": 2.8166, + "step": 8182 + }, + { + "epoch": 0.5492433139827523, + "grad_norm": 4.948546409606934, + "learning_rate": 8.511049859224842e-05, + "loss": 2.8792, + "step": 8184 + }, + { + "epoch": 0.5493775376665213, + "grad_norm": 4.280914306640625, + "learning_rate": 8.510275985569449e-05, + "loss": 2.6019, + "step": 8186 + }, + { + "epoch": 0.5495117613502902, + "grad_norm": 4.22799825668335, + "learning_rate": 8.509501946060421e-05, + "loss": 2.5373, + "step": 8188 + }, + { + "epoch": 0.5496459850340593, + "grad_norm": 5.222815990447998, + "learning_rate": 8.508727740734324e-05, + "loss": 2.6236, + "step": 8190 + }, + { + "epoch": 0.5497802087178283, + "grad_norm": 4.463929176330566, + "learning_rate": 8.507953369627743e-05, + "loss": 2.6484, + "step": 8192 + }, + { + "epoch": 0.5499144324015972, + "grad_norm": 3.9856793880462646, + "learning_rate": 8.507178832777263e-05, + "loss": 2.528, + "step": 8194 + }, + { + "epoch": 0.5500486560853662, + "grad_norm": 4.271596908569336, + "learning_rate": 8.506404130219479e-05, + "loss": 2.6224, + "step": 8196 + }, + { + "epoch": 0.5501828797691353, + "grad_norm": 4.0262932777404785, + "learning_rate": 8.505629261990995e-05, + "loss": 2.7584, + "step": 8198 + }, + { + "epoch": 0.5503171034529043, + "grad_norm": 3.9979381561279297, + "learning_rate": 8.504854228128422e-05, + "loss": 2.605, + "step": 8200 + }, + { + "epoch": 0.5504513271366732, + "grad_norm": 4.106181621551514, + "learning_rate": 8.504079028668379e-05, + "loss": 2.5342, + "step": 8202 + }, + { + "epoch": 0.5505855508204422, + "grad_norm": 4.400891304016113, + "learning_rate": 8.503303663647492e-05, + "loss": 2.3276, + "step": 8204 + }, + { + "epoch": 0.5507197745042113, + "grad_norm": 4.194718360900879, + "learning_rate": 8.502528133102397e-05, + "loss": 2.201, + "step": 8206 + }, + { + "epoch": 0.5508539981879803, + "grad_norm": 4.0377726554870605, + "learning_rate": 8.501752437069734e-05, + "loss": 2.517, + "step": 8208 + }, + { + "epoch": 0.5509882218717492, + "grad_norm": 4.208590507507324, + "learning_rate": 8.500976575586155e-05, + "loss": 2.4684, + "step": 8210 + }, + { + "epoch": 0.5511224455555183, + "grad_norm": 4.272232532501221, + "learning_rate": 8.500200548688318e-05, + "loss": 2.8585, + "step": 8212 + }, + { + "epoch": 0.5512566692392873, + "grad_norm": 4.240617275238037, + "learning_rate": 8.499424356412886e-05, + "loss": 2.5566, + "step": 8214 + }, + { + "epoch": 0.5513908929230563, + "grad_norm": 4.24310302734375, + "learning_rate": 8.498647998796534e-05, + "loss": 2.3877, + "step": 8216 + }, + { + "epoch": 0.5515251166068252, + "grad_norm": 4.352355003356934, + "learning_rate": 8.497871475875946e-05, + "loss": 2.5585, + "step": 8218 + }, + { + "epoch": 0.5516593402905943, + "grad_norm": 4.210171699523926, + "learning_rate": 8.497094787687807e-05, + "loss": 2.8299, + "step": 8220 + }, + { + "epoch": 0.5517935639743633, + "grad_norm": 4.8140363693237305, + "learning_rate": 8.496317934268818e-05, + "loss": 2.6559, + "step": 8222 + }, + { + "epoch": 0.5519277876581323, + "grad_norm": 4.127819061279297, + "learning_rate": 8.49554091565568e-05, + "loss": 2.4813, + "step": 8224 + }, + { + "epoch": 0.5520620113419012, + "grad_norm": 4.640266418457031, + "learning_rate": 8.49476373188511e-05, + "loss": 2.6515, + "step": 8226 + }, + { + "epoch": 0.5521962350256703, + "grad_norm": 4.110575199127197, + "learning_rate": 8.493986382993823e-05, + "loss": 2.5545, + "step": 8228 + }, + { + "epoch": 0.5523304587094393, + "grad_norm": 3.9333462715148926, + "learning_rate": 8.49320886901855e-05, + "loss": 2.4639, + "step": 8230 + }, + { + "epoch": 0.5524646823932082, + "grad_norm": 4.189158916473389, + "learning_rate": 8.492431189996029e-05, + "loss": 2.1858, + "step": 8232 + }, + { + "epoch": 0.5525989060769773, + "grad_norm": 3.9591896533966064, + "learning_rate": 8.491653345963002e-05, + "loss": 2.3846, + "step": 8234 + }, + { + "epoch": 0.5527331297607463, + "grad_norm": 4.166201114654541, + "learning_rate": 8.490875336956219e-05, + "loss": 2.5285, + "step": 8236 + }, + { + "epoch": 0.5528673534445153, + "grad_norm": 4.008368015289307, + "learning_rate": 8.49009716301244e-05, + "loss": 2.6618, + "step": 8238 + }, + { + "epoch": 0.5530015771282842, + "grad_norm": 4.083521842956543, + "learning_rate": 8.489318824168435e-05, + "loss": 2.6512, + "step": 8240 + }, + { + "epoch": 0.5531358008120533, + "grad_norm": 4.338555335998535, + "learning_rate": 8.488540320460975e-05, + "loss": 2.4152, + "step": 8242 + }, + { + "epoch": 0.5532700244958223, + "grad_norm": 3.9184563159942627, + "learning_rate": 8.487761651926848e-05, + "loss": 2.5963, + "step": 8244 + }, + { + "epoch": 0.5534042481795913, + "grad_norm": 4.642265319824219, + "learning_rate": 8.48698281860284e-05, + "loss": 2.5857, + "step": 8246 + }, + { + "epoch": 0.5535384718633602, + "grad_norm": 4.204073905944824, + "learning_rate": 8.486203820525748e-05, + "loss": 2.372, + "step": 8248 + }, + { + "epoch": 0.5536726955471293, + "grad_norm": 4.36286735534668, + "learning_rate": 8.485424657732384e-05, + "loss": 2.6788, + "step": 8250 + }, + { + "epoch": 0.5538069192308983, + "grad_norm": 4.360098361968994, + "learning_rate": 8.484645330259557e-05, + "loss": 2.6764, + "step": 8252 + }, + { + "epoch": 0.5539411429146673, + "grad_norm": 4.394487380981445, + "learning_rate": 8.483865838144092e-05, + "loss": 2.3705, + "step": 8254 + }, + { + "epoch": 0.5540753665984363, + "grad_norm": 4.692635536193848, + "learning_rate": 8.483086181422816e-05, + "loss": 2.4384, + "step": 8256 + }, + { + "epoch": 0.5542095902822053, + "grad_norm": 4.492144584655762, + "learning_rate": 8.482306360132567e-05, + "loss": 2.6362, + "step": 8258 + }, + { + "epoch": 0.5543438139659743, + "grad_norm": 5.204732894897461, + "learning_rate": 8.48152637431019e-05, + "loss": 2.5046, + "step": 8260 + }, + { + "epoch": 0.5544780376497433, + "grad_norm": 4.233471393585205, + "learning_rate": 8.480746223992539e-05, + "loss": 2.7991, + "step": 8262 + }, + { + "epoch": 0.5546122613335123, + "grad_norm": 4.026937007904053, + "learning_rate": 8.479965909216471e-05, + "loss": 2.7436, + "step": 8264 + }, + { + "epoch": 0.5547464850172813, + "grad_norm": 4.113080978393555, + "learning_rate": 8.479185430018858e-05, + "loss": 2.4756, + "step": 8266 + }, + { + "epoch": 0.5548807087010503, + "grad_norm": 4.42962646484375, + "learning_rate": 8.478404786436577e-05, + "loss": 2.7085, + "step": 8268 + }, + { + "epoch": 0.5550149323848192, + "grad_norm": 4.322906970977783, + "learning_rate": 8.477623978506508e-05, + "loss": 2.5586, + "step": 8270 + }, + { + "epoch": 0.5551491560685883, + "grad_norm": 3.9850378036499023, + "learning_rate": 8.476843006265545e-05, + "loss": 2.3505, + "step": 8272 + }, + { + "epoch": 0.5552833797523573, + "grad_norm": 4.793756008148193, + "learning_rate": 8.476061869750586e-05, + "loss": 2.7832, + "step": 8274 + }, + { + "epoch": 0.5554176034361263, + "grad_norm": 4.213423252105713, + "learning_rate": 8.475280568998541e-05, + "loss": 2.4237, + "step": 8276 + }, + { + "epoch": 0.5555518271198953, + "grad_norm": 4.991631507873535, + "learning_rate": 8.474499104046322e-05, + "loss": 2.4687, + "step": 8278 + }, + { + "epoch": 0.5556860508036643, + "grad_norm": 8.072646141052246, + "learning_rate": 8.473717474930851e-05, + "loss": 2.7893, + "step": 8280 + }, + { + "epoch": 0.5558202744874333, + "grad_norm": 4.3776702880859375, + "learning_rate": 8.472935681689063e-05, + "loss": 2.3155, + "step": 8282 + }, + { + "epoch": 0.5559544981712023, + "grad_norm": 4.341430187225342, + "learning_rate": 8.472153724357892e-05, + "loss": 2.9293, + "step": 8284 + }, + { + "epoch": 0.5560887218549713, + "grad_norm": 4.99183464050293, + "learning_rate": 8.471371602974284e-05, + "loss": 2.691, + "step": 8286 + }, + { + "epoch": 0.5562229455387403, + "grad_norm": 3.7314751148223877, + "learning_rate": 8.470589317575194e-05, + "loss": 2.2903, + "step": 8288 + }, + { + "epoch": 0.5563571692225093, + "grad_norm": 5.280544281005859, + "learning_rate": 8.469806868197585e-05, + "loss": 2.6645, + "step": 8290 + }, + { + "epoch": 0.5564913929062784, + "grad_norm": 4.331841468811035, + "learning_rate": 8.469024254878424e-05, + "loss": 2.8022, + "step": 8292 + }, + { + "epoch": 0.5566256165900473, + "grad_norm": 4.209014415740967, + "learning_rate": 8.468241477654688e-05, + "loss": 2.4629, + "step": 8294 + }, + { + "epoch": 0.5567598402738163, + "grad_norm": 4.676037788391113, + "learning_rate": 8.467458536563362e-05, + "loss": 2.6709, + "step": 8296 + }, + { + "epoch": 0.5568940639575853, + "grad_norm": 4.397090435028076, + "learning_rate": 8.46667543164144e-05, + "loss": 2.5735, + "step": 8298 + }, + { + "epoch": 0.5570282876413544, + "grad_norm": 4.1258978843688965, + "learning_rate": 8.465892162925919e-05, + "loss": 2.5969, + "step": 8300 + }, + { + "epoch": 0.5571625113251233, + "grad_norm": 4.182967662811279, + "learning_rate": 8.46510873045381e-05, + "loss": 2.5525, + "step": 8302 + }, + { + "epoch": 0.5572967350088923, + "grad_norm": 22.151844024658203, + "learning_rate": 8.464325134262126e-05, + "loss": 2.6987, + "step": 8304 + }, + { + "epoch": 0.5574309586926613, + "grad_norm": 4.016932487487793, + "learning_rate": 8.463541374387894e-05, + "loss": 2.8158, + "step": 8306 + }, + { + "epoch": 0.5575651823764303, + "grad_norm": 3.887678623199463, + "learning_rate": 8.462757450868139e-05, + "loss": 2.5556, + "step": 8308 + }, + { + "epoch": 0.5576994060601993, + "grad_norm": 3.991913080215454, + "learning_rate": 8.461973363739906e-05, + "loss": 2.62, + "step": 8310 + }, + { + "epoch": 0.5578336297439683, + "grad_norm": 4.259491443634033, + "learning_rate": 8.46118911304024e-05, + "loss": 2.6139, + "step": 8312 + }, + { + "epoch": 0.5579678534277374, + "grad_norm": 3.8207497596740723, + "learning_rate": 8.460404698806194e-05, + "loss": 2.3962, + "step": 8314 + }, + { + "epoch": 0.5581020771115063, + "grad_norm": 3.395413637161255, + "learning_rate": 8.45962012107483e-05, + "loss": 2.3819, + "step": 8316 + }, + { + "epoch": 0.5582363007952753, + "grad_norm": 11.139334678649902, + "learning_rate": 8.458835379883219e-05, + "loss": 2.665, + "step": 8318 + }, + { + "epoch": 0.5583705244790443, + "grad_norm": 4.46119499206543, + "learning_rate": 8.458050475268437e-05, + "loss": 2.6025, + "step": 8320 + }, + { + "epoch": 0.5585047481628134, + "grad_norm": 4.3873443603515625, + "learning_rate": 8.457265407267572e-05, + "loss": 2.5785, + "step": 8322 + }, + { + "epoch": 0.5586389718465823, + "grad_norm": 4.617380619049072, + "learning_rate": 8.456480175917713e-05, + "loss": 2.5747, + "step": 8324 + }, + { + "epoch": 0.5587731955303513, + "grad_norm": 4.257845878601074, + "learning_rate": 8.455694781255963e-05, + "loss": 2.7323, + "step": 8326 + }, + { + "epoch": 0.5589074192141203, + "grad_norm": 3.8678760528564453, + "learning_rate": 8.454909223319429e-05, + "loss": 2.3128, + "step": 8328 + }, + { + "epoch": 0.5590416428978894, + "grad_norm": 4.195471286773682, + "learning_rate": 8.45412350214523e-05, + "loss": 2.5229, + "step": 8330 + }, + { + "epoch": 0.5591758665816583, + "grad_norm": 4.51228141784668, + "learning_rate": 8.453337617770486e-05, + "loss": 2.5916, + "step": 8332 + }, + { + "epoch": 0.5593100902654273, + "grad_norm": 4.481307506561279, + "learning_rate": 8.452551570232331e-05, + "loss": 2.4085, + "step": 8334 + }, + { + "epoch": 0.5594443139491964, + "grad_norm": 3.9786770343780518, + "learning_rate": 8.451765359567903e-05, + "loss": 2.5577, + "step": 8336 + }, + { + "epoch": 0.5595785376329654, + "grad_norm": 5.480557918548584, + "learning_rate": 8.45097898581435e-05, + "loss": 2.6114, + "step": 8338 + }, + { + "epoch": 0.5597127613167343, + "grad_norm": 4.712765693664551, + "learning_rate": 8.450192449008825e-05, + "loss": 2.5922, + "step": 8340 + }, + { + "epoch": 0.5598469850005033, + "grad_norm": 6.5508713722229, + "learning_rate": 8.449405749188493e-05, + "loss": 2.2825, + "step": 8342 + }, + { + "epoch": 0.5599812086842724, + "grad_norm": 4.411618232727051, + "learning_rate": 8.448618886390522e-05, + "loss": 2.5152, + "step": 8344 + }, + { + "epoch": 0.5601154323680413, + "grad_norm": 4.3906049728393555, + "learning_rate": 8.447831860652089e-05, + "loss": 2.5162, + "step": 8346 + }, + { + "epoch": 0.5602496560518103, + "grad_norm": 4.357546806335449, + "learning_rate": 8.447044672010382e-05, + "loss": 2.6264, + "step": 8348 + }, + { + "epoch": 0.5603838797355793, + "grad_norm": 4.4480438232421875, + "learning_rate": 8.44625732050259e-05, + "loss": 2.5656, + "step": 8350 + }, + { + "epoch": 0.5605181034193484, + "grad_norm": 3.934579849243164, + "learning_rate": 8.445469806165918e-05, + "loss": 2.4851, + "step": 8352 + }, + { + "epoch": 0.5606523271031173, + "grad_norm": 4.053318023681641, + "learning_rate": 8.444682129037574e-05, + "loss": 2.4429, + "step": 8354 + }, + { + "epoch": 0.5607865507868863, + "grad_norm": 3.8829972743988037, + "learning_rate": 8.44389428915477e-05, + "loss": 2.5364, + "step": 8356 + }, + { + "epoch": 0.5609207744706554, + "grad_norm": 4.518396377563477, + "learning_rate": 8.443106286554737e-05, + "loss": 2.5288, + "step": 8358 + }, + { + "epoch": 0.5610549981544244, + "grad_norm": 4.240685939788818, + "learning_rate": 8.4423181212747e-05, + "loss": 2.5954, + "step": 8360 + }, + { + "epoch": 0.5611892218381933, + "grad_norm": 6.756836414337158, + "learning_rate": 8.4415297933519e-05, + "loss": 2.7178, + "step": 8362 + }, + { + "epoch": 0.5613234455219623, + "grad_norm": 4.246984481811523, + "learning_rate": 8.440741302823585e-05, + "loss": 2.446, + "step": 8364 + }, + { + "epoch": 0.5614576692057314, + "grad_norm": 4.002936840057373, + "learning_rate": 8.43995264972701e-05, + "loss": 2.3386, + "step": 8366 + }, + { + "epoch": 0.5615918928895004, + "grad_norm": 4.119433879852295, + "learning_rate": 8.439163834099437e-05, + "loss": 2.7297, + "step": 8368 + }, + { + "epoch": 0.5617261165732693, + "grad_norm": 4.4096174240112305, + "learning_rate": 8.438374855978136e-05, + "loss": 2.3638, + "step": 8370 + }, + { + "epoch": 0.5618603402570383, + "grad_norm": 4.135191440582275, + "learning_rate": 8.437585715400384e-05, + "loss": 2.3131, + "step": 8372 + }, + { + "epoch": 0.5619945639408074, + "grad_norm": 3.688091516494751, + "learning_rate": 8.436796412403466e-05, + "loss": 2.1646, + "step": 8374 + }, + { + "epoch": 0.5621287876245764, + "grad_norm": 3.876673936843872, + "learning_rate": 8.436006947024677e-05, + "loss": 2.348, + "step": 8376 + }, + { + "epoch": 0.5622630113083453, + "grad_norm": 3.9576992988586426, + "learning_rate": 8.435217319301315e-05, + "loss": 2.3277, + "step": 8378 + }, + { + "epoch": 0.5623972349921144, + "grad_norm": 6.287901401519775, + "learning_rate": 8.43442752927069e-05, + "loss": 2.6974, + "step": 8380 + }, + { + "epoch": 0.5625314586758834, + "grad_norm": 4.201307773590088, + "learning_rate": 8.433637576970119e-05, + "loss": 2.5573, + "step": 8382 + }, + { + "epoch": 0.5626656823596523, + "grad_norm": 5.210993766784668, + "learning_rate": 8.432847462436924e-05, + "loss": 2.6776, + "step": 8384 + }, + { + "epoch": 0.5627999060434213, + "grad_norm": 5.100184917449951, + "learning_rate": 8.432057185708436e-05, + "loss": 2.3866, + "step": 8386 + }, + { + "epoch": 0.5629341297271904, + "grad_norm": 4.129569053649902, + "learning_rate": 8.431266746821995e-05, + "loss": 2.5417, + "step": 8388 + }, + { + "epoch": 0.5630683534109594, + "grad_norm": 4.025933265686035, + "learning_rate": 8.430476145814948e-05, + "loss": 2.5443, + "step": 8390 + }, + { + "epoch": 0.5632025770947283, + "grad_norm": 4.495929718017578, + "learning_rate": 8.429685382724648e-05, + "loss": 2.5528, + "step": 8392 + }, + { + "epoch": 0.5633368007784973, + "grad_norm": 5.3260273933410645, + "learning_rate": 8.428894457588458e-05, + "loss": 2.5957, + "step": 8394 + }, + { + "epoch": 0.5634710244622664, + "grad_norm": 4.064072608947754, + "learning_rate": 8.428103370443748e-05, + "loss": 2.4995, + "step": 8396 + }, + { + "epoch": 0.5636052481460354, + "grad_norm": 3.9863500595092773, + "learning_rate": 8.427312121327895e-05, + "loss": 2.656, + "step": 8398 + }, + { + "epoch": 0.5637394718298043, + "grad_norm": 4.975805282592773, + "learning_rate": 8.426520710278283e-05, + "loss": 2.5169, + "step": 8400 + }, + { + "epoch": 0.5638736955135734, + "grad_norm": 4.088968753814697, + "learning_rate": 8.425729137332306e-05, + "loss": 2.4654, + "step": 8402 + }, + { + "epoch": 0.5640079191973424, + "grad_norm": 4.080051898956299, + "learning_rate": 8.424937402527363e-05, + "loss": 2.5062, + "step": 8404 + }, + { + "epoch": 0.5641421428811114, + "grad_norm": 4.824666500091553, + "learning_rate": 8.424145505900863e-05, + "loss": 2.8069, + "step": 8406 + }, + { + "epoch": 0.5642763665648803, + "grad_norm": 7.949369430541992, + "learning_rate": 8.423353447490221e-05, + "loss": 2.4425, + "step": 8408 + }, + { + "epoch": 0.5644105902486494, + "grad_norm": 3.898083209991455, + "learning_rate": 8.422561227332861e-05, + "loss": 2.4562, + "step": 8410 + }, + { + "epoch": 0.5645448139324184, + "grad_norm": 4.708263397216797, + "learning_rate": 8.421768845466213e-05, + "loss": 2.4962, + "step": 8412 + }, + { + "epoch": 0.5646790376161874, + "grad_norm": 3.4602713584899902, + "learning_rate": 8.420976301927716e-05, + "loss": 2.6309, + "step": 8414 + }, + { + "epoch": 0.5648132612999563, + "grad_norm": 4.393330097198486, + "learning_rate": 8.420183596754816e-05, + "loss": 2.4019, + "step": 8416 + }, + { + "epoch": 0.5649474849837254, + "grad_norm": 3.8166189193725586, + "learning_rate": 8.419390729984966e-05, + "loss": 2.2988, + "step": 8418 + }, + { + "epoch": 0.5650817086674944, + "grad_norm": 27.76455307006836, + "learning_rate": 8.418597701655628e-05, + "loss": 2.6648, + "step": 8420 + }, + { + "epoch": 0.5652159323512633, + "grad_norm": 4.22926139831543, + "learning_rate": 8.41780451180427e-05, + "loss": 2.4733, + "step": 8422 + }, + { + "epoch": 0.5653501560350324, + "grad_norm": 4.157386302947998, + "learning_rate": 8.417011160468371e-05, + "loss": 2.2998, + "step": 8424 + }, + { + "epoch": 0.5654843797188014, + "grad_norm": 4.235902309417725, + "learning_rate": 8.416217647685411e-05, + "loss": 2.6926, + "step": 8426 + }, + { + "epoch": 0.5656186034025704, + "grad_norm": 3.975883960723877, + "learning_rate": 8.415423973492888e-05, + "loss": 2.3574, + "step": 8428 + }, + { + "epoch": 0.5657528270863393, + "grad_norm": 3.9211628437042236, + "learning_rate": 8.414630137928298e-05, + "loss": 2.5335, + "step": 8430 + }, + { + "epoch": 0.5658870507701084, + "grad_norm": 3.765695810317993, + "learning_rate": 8.413836141029148e-05, + "loss": 2.456, + "step": 8432 + }, + { + "epoch": 0.5660212744538774, + "grad_norm": 4.240943908691406, + "learning_rate": 8.413041982832954e-05, + "loss": 2.784, + "step": 8434 + }, + { + "epoch": 0.5661554981376464, + "grad_norm": 4.515347957611084, + "learning_rate": 8.412247663377237e-05, + "loss": 2.7356, + "step": 8436 + }, + { + "epoch": 0.5662897218214153, + "grad_norm": 4.863190650939941, + "learning_rate": 8.411453182699529e-05, + "loss": 2.4103, + "step": 8438 + }, + { + "epoch": 0.5664239455051844, + "grad_norm": 4.531732559204102, + "learning_rate": 8.410658540837364e-05, + "loss": 2.6903, + "step": 8440 + }, + { + "epoch": 0.5665581691889534, + "grad_norm": 4.3706183433532715, + "learning_rate": 8.409863737828292e-05, + "loss": 2.7017, + "step": 8442 + }, + { + "epoch": 0.5666923928727224, + "grad_norm": 4.277491569519043, + "learning_rate": 8.409068773709862e-05, + "loss": 2.8401, + "step": 8444 + }, + { + "epoch": 0.5668266165564914, + "grad_norm": 4.800031661987305, + "learning_rate": 8.408273648519638e-05, + "loss": 2.8235, + "step": 8446 + }, + { + "epoch": 0.5669608402402604, + "grad_norm": 4.412444114685059, + "learning_rate": 8.407478362295184e-05, + "loss": 2.613, + "step": 8448 + }, + { + "epoch": 0.5670950639240294, + "grad_norm": 6.609631061553955, + "learning_rate": 8.40668291507408e-05, + "loss": 2.683, + "step": 8450 + }, + { + "epoch": 0.5672292876077984, + "grad_norm": 4.35358190536499, + "learning_rate": 8.405887306893906e-05, + "loss": 2.6162, + "step": 8452 + }, + { + "epoch": 0.5673635112915674, + "grad_norm": 4.389922618865967, + "learning_rate": 8.405091537792253e-05, + "loss": 2.5983, + "step": 8454 + }, + { + "epoch": 0.5674977349753364, + "grad_norm": 4.410494804382324, + "learning_rate": 8.404295607806723e-05, + "loss": 2.4812, + "step": 8456 + }, + { + "epoch": 0.5676319586591054, + "grad_norm": 4.589960098266602, + "learning_rate": 8.40349951697492e-05, + "loss": 2.8749, + "step": 8458 + }, + { + "epoch": 0.5677661823428743, + "grad_norm": 4.545804977416992, + "learning_rate": 8.402703265334455e-05, + "loss": 2.6096, + "step": 8460 + }, + { + "epoch": 0.5679004060266434, + "grad_norm": 4.109602928161621, + "learning_rate": 8.401906852922954e-05, + "loss": 2.5356, + "step": 8462 + }, + { + "epoch": 0.5680346297104124, + "grad_norm": 4.219510078430176, + "learning_rate": 8.401110279778043e-05, + "loss": 2.4889, + "step": 8464 + }, + { + "epoch": 0.5681688533941814, + "grad_norm": 4.685168743133545, + "learning_rate": 8.400313545937359e-05, + "loss": 2.6495, + "step": 8466 + }, + { + "epoch": 0.5683030770779504, + "grad_norm": 4.974390983581543, + "learning_rate": 8.399516651438546e-05, + "loss": 2.5963, + "step": 8468 + }, + { + "epoch": 0.5684373007617194, + "grad_norm": 4.574667453765869, + "learning_rate": 8.398719596319258e-05, + "loss": 2.7611, + "step": 8470 + }, + { + "epoch": 0.5685715244454884, + "grad_norm": 4.16183614730835, + "learning_rate": 8.39792238061715e-05, + "loss": 2.3687, + "step": 8472 + }, + { + "epoch": 0.5687057481292574, + "grad_norm": 3.926473617553711, + "learning_rate": 8.397125004369892e-05, + "loss": 2.2634, + "step": 8474 + }, + { + "epoch": 0.5688399718130264, + "grad_norm": 4.685787677764893, + "learning_rate": 8.396327467615158e-05, + "loss": 2.4388, + "step": 8476 + }, + { + "epoch": 0.5689741954967954, + "grad_norm": 3.5876762866973877, + "learning_rate": 8.39552977039063e-05, + "loss": 2.5395, + "step": 8478 + }, + { + "epoch": 0.5691084191805644, + "grad_norm": 4.026362895965576, + "learning_rate": 8.394731912733998e-05, + "loss": 2.4738, + "step": 8480 + }, + { + "epoch": 0.5692426428643335, + "grad_norm": 4.062354564666748, + "learning_rate": 8.393933894682957e-05, + "loss": 2.4356, + "step": 8482 + }, + { + "epoch": 0.5693768665481024, + "grad_norm": 3.5353667736053467, + "learning_rate": 8.393135716275215e-05, + "loss": 2.2539, + "step": 8484 + }, + { + "epoch": 0.5695110902318714, + "grad_norm": 4.320888519287109, + "learning_rate": 8.392337377548481e-05, + "loss": 2.5617, + "step": 8486 + }, + { + "epoch": 0.5696453139156404, + "grad_norm": 3.9936816692352295, + "learning_rate": 8.391538878540477e-05, + "loss": 2.5126, + "step": 8488 + }, + { + "epoch": 0.5697795375994095, + "grad_norm": 4.313360214233398, + "learning_rate": 8.390740219288931e-05, + "loss": 2.5896, + "step": 8490 + }, + { + "epoch": 0.5699137612831784, + "grad_norm": 4.123593330383301, + "learning_rate": 8.389941399831578e-05, + "loss": 2.4386, + "step": 8492 + }, + { + "epoch": 0.5700479849669474, + "grad_norm": 4.6988983154296875, + "learning_rate": 8.389142420206158e-05, + "loss": 2.8902, + "step": 8494 + }, + { + "epoch": 0.5701822086507164, + "grad_norm": 5.335660934448242, + "learning_rate": 8.388343280450424e-05, + "loss": 2.7408, + "step": 8496 + }, + { + "epoch": 0.5703164323344854, + "grad_norm": 3.777940034866333, + "learning_rate": 8.387543980602133e-05, + "loss": 2.3638, + "step": 8498 + }, + { + "epoch": 0.5704506560182544, + "grad_norm": 4.334600448608398, + "learning_rate": 8.38674452069905e-05, + "loss": 2.7369, + "step": 8500 + }, + { + "epoch": 0.5705848797020234, + "grad_norm": 4.0694098472595215, + "learning_rate": 8.385944900778948e-05, + "loss": 2.46, + "step": 8502 + }, + { + "epoch": 0.5707191033857925, + "grad_norm": 4.403539180755615, + "learning_rate": 8.385145120879607e-05, + "loss": 2.5679, + "step": 8504 + }, + { + "epoch": 0.5708533270695614, + "grad_norm": 5.599013805389404, + "learning_rate": 8.384345181038818e-05, + "loss": 2.608, + "step": 8506 + }, + { + "epoch": 0.5709875507533304, + "grad_norm": 4.419242858886719, + "learning_rate": 8.383545081294371e-05, + "loss": 2.4312, + "step": 8508 + }, + { + "epoch": 0.5711217744370994, + "grad_norm": 3.1814043521881104, + "learning_rate": 8.382744821684074e-05, + "loss": 2.1077, + "step": 8510 + }, + { + "epoch": 0.5712559981208685, + "grad_norm": 4.393637180328369, + "learning_rate": 8.381944402245736e-05, + "loss": 2.4158, + "step": 8512 + }, + { + "epoch": 0.5713902218046374, + "grad_norm": 4.4806742668151855, + "learning_rate": 8.381143823017176e-05, + "loss": 2.2392, + "step": 8514 + }, + { + "epoch": 0.5715244454884064, + "grad_norm": 3.9448065757751465, + "learning_rate": 8.380343084036218e-05, + "loss": 2.7342, + "step": 8516 + }, + { + "epoch": 0.5716586691721754, + "grad_norm": 4.226758003234863, + "learning_rate": 8.379542185340696e-05, + "loss": 2.5123, + "step": 8518 + }, + { + "epoch": 0.5717928928559445, + "grad_norm": 7.436162948608398, + "learning_rate": 8.378741126968453e-05, + "loss": 2.6883, + "step": 8520 + }, + { + "epoch": 0.5719271165397134, + "grad_norm": 4.708135604858398, + "learning_rate": 8.377939908957336e-05, + "loss": 2.5972, + "step": 8522 + }, + { + "epoch": 0.5720613402234824, + "grad_norm": 4.468167304992676, + "learning_rate": 8.377138531345198e-05, + "loss": 2.5316, + "step": 8524 + }, + { + "epoch": 0.5721955639072515, + "grad_norm": 3.896035671234131, + "learning_rate": 8.376336994169908e-05, + "loss": 2.6359, + "step": 8526 + }, + { + "epoch": 0.5723297875910205, + "grad_norm": 5.716073036193848, + "learning_rate": 8.375535297469332e-05, + "loss": 2.6066, + "step": 8528 + }, + { + "epoch": 0.5724640112747894, + "grad_norm": 4.237614154815674, + "learning_rate": 8.374733441281353e-05, + "loss": 2.4735, + "step": 8530 + }, + { + "epoch": 0.5725982349585584, + "grad_norm": 4.0519232749938965, + "learning_rate": 8.373931425643855e-05, + "loss": 2.5378, + "step": 8532 + }, + { + "epoch": 0.5727324586423275, + "grad_norm": 4.237734317779541, + "learning_rate": 8.37312925059473e-05, + "loss": 2.494, + "step": 8534 + }, + { + "epoch": 0.5728666823260964, + "grad_norm": 4.746260643005371, + "learning_rate": 8.372326916171882e-05, + "loss": 2.4607, + "step": 8536 + }, + { + "epoch": 0.5730009060098654, + "grad_norm": 4.260883808135986, + "learning_rate": 8.371524422413218e-05, + "loss": 2.4111, + "step": 8538 + }, + { + "epoch": 0.5731351296936344, + "grad_norm": 4.387004852294922, + "learning_rate": 8.370721769356656e-05, + "loss": 2.5375, + "step": 8540 + }, + { + "epoch": 0.5732693533774035, + "grad_norm": 4.607949256896973, + "learning_rate": 8.369918957040116e-05, + "loss": 2.5962, + "step": 8542 + }, + { + "epoch": 0.5734035770611724, + "grad_norm": 3.9089224338531494, + "learning_rate": 8.369115985501534e-05, + "loss": 2.4241, + "step": 8544 + }, + { + "epoch": 0.5735378007449414, + "grad_norm": 4.255965709686279, + "learning_rate": 8.368312854778848e-05, + "loss": 2.4483, + "step": 8546 + }, + { + "epoch": 0.5736720244287105, + "grad_norm": 4.306459903717041, + "learning_rate": 8.367509564910001e-05, + "loss": 2.519, + "step": 8548 + }, + { + "epoch": 0.5738062481124795, + "grad_norm": 3.9240145683288574, + "learning_rate": 8.366706115932951e-05, + "loss": 2.6215, + "step": 8550 + }, + { + "epoch": 0.5739404717962484, + "grad_norm": 6.927346229553223, + "learning_rate": 8.365902507885656e-05, + "loss": 2.7789, + "step": 8552 + }, + { + "epoch": 0.5740746954800174, + "grad_norm": 3.7573180198669434, + "learning_rate": 8.365098740806086e-05, + "loss": 2.501, + "step": 8554 + }, + { + "epoch": 0.5742089191637865, + "grad_norm": 4.458992958068848, + "learning_rate": 8.364294814732218e-05, + "loss": 2.4046, + "step": 8556 + }, + { + "epoch": 0.5743431428475555, + "grad_norm": 3.7318577766418457, + "learning_rate": 8.363490729702034e-05, + "loss": 2.3466, + "step": 8558 + }, + { + "epoch": 0.5744773665313244, + "grad_norm": 5.027885913848877, + "learning_rate": 8.362686485753531e-05, + "loss": 2.5407, + "step": 8560 + }, + { + "epoch": 0.5746115902150934, + "grad_norm": 3.6790857315063477, + "learning_rate": 8.361882082924702e-05, + "loss": 2.2477, + "step": 8562 + }, + { + "epoch": 0.5747458138988625, + "grad_norm": 3.9506261348724365, + "learning_rate": 8.361077521253553e-05, + "loss": 2.6256, + "step": 8564 + }, + { + "epoch": 0.5748800375826315, + "grad_norm": 4.100966930389404, + "learning_rate": 8.360272800778103e-05, + "loss": 2.4909, + "step": 8566 + }, + { + "epoch": 0.5750142612664004, + "grad_norm": 3.781608819961548, + "learning_rate": 8.359467921536371e-05, + "loss": 2.5526, + "step": 8568 + }, + { + "epoch": 0.5751484849501695, + "grad_norm": 4.2817277908325195, + "learning_rate": 8.358662883566383e-05, + "loss": 2.5454, + "step": 8570 + }, + { + "epoch": 0.5752827086339385, + "grad_norm": 4.558470249176025, + "learning_rate": 8.357857686906182e-05, + "loss": 2.784, + "step": 8572 + }, + { + "epoch": 0.5754169323177074, + "grad_norm": 3.7852673530578613, + "learning_rate": 8.357052331593807e-05, + "loss": 2.4493, + "step": 8574 + }, + { + "epoch": 0.5755511560014764, + "grad_norm": 4.660189151763916, + "learning_rate": 8.35624681766731e-05, + "loss": 2.2978, + "step": 8576 + }, + { + "epoch": 0.5756853796852455, + "grad_norm": 4.3616719245910645, + "learning_rate": 8.355441145164751e-05, + "loss": 2.5713, + "step": 8578 + }, + { + "epoch": 0.5758196033690145, + "grad_norm": 4.003500938415527, + "learning_rate": 8.354635314124195e-05, + "loss": 2.2576, + "step": 8580 + }, + { + "epoch": 0.5759538270527834, + "grad_norm": 6.738184452056885, + "learning_rate": 8.35382932458372e-05, + "loss": 2.656, + "step": 8582 + }, + { + "epoch": 0.5760880507365524, + "grad_norm": 4.545653820037842, + "learning_rate": 8.353023176581401e-05, + "loss": 2.5384, + "step": 8584 + }, + { + "epoch": 0.5762222744203215, + "grad_norm": 4.314968109130859, + "learning_rate": 8.352216870155331e-05, + "loss": 3.0218, + "step": 8586 + }, + { + "epoch": 0.5763564981040905, + "grad_norm": 4.749731540679932, + "learning_rate": 8.351410405343607e-05, + "loss": 2.4679, + "step": 8588 + }, + { + "epoch": 0.5764907217878594, + "grad_norm": 3.6533219814300537, + "learning_rate": 8.350603782184332e-05, + "loss": 2.5919, + "step": 8590 + }, + { + "epoch": 0.5766249454716285, + "grad_norm": 4.030946731567383, + "learning_rate": 8.349797000715616e-05, + "loss": 2.6267, + "step": 8592 + }, + { + "epoch": 0.5767591691553975, + "grad_norm": 4.420132160186768, + "learning_rate": 8.348990060975578e-05, + "loss": 2.5683, + "step": 8594 + }, + { + "epoch": 0.5768933928391665, + "grad_norm": 3.9198834896087646, + "learning_rate": 8.348182963002346e-05, + "loss": 2.4575, + "step": 8596 + }, + { + "epoch": 0.5770276165229354, + "grad_norm": 5.079920768737793, + "learning_rate": 8.347375706834055e-05, + "loss": 2.4305, + "step": 8598 + }, + { + "epoch": 0.5771618402067045, + "grad_norm": 3.9008002281188965, + "learning_rate": 8.34656829250884e-05, + "loss": 2.4422, + "step": 8600 + }, + { + "epoch": 0.5772960638904735, + "grad_norm": 4.498838901519775, + "learning_rate": 8.345760720064856e-05, + "loss": 3.0537, + "step": 8602 + }, + { + "epoch": 0.5774302875742425, + "grad_norm": 4.210926055908203, + "learning_rate": 8.344952989540259e-05, + "loss": 2.3762, + "step": 8604 + }, + { + "epoch": 0.5775645112580114, + "grad_norm": 5.135854721069336, + "learning_rate": 8.344145100973209e-05, + "loss": 2.2649, + "step": 8606 + }, + { + "epoch": 0.5776987349417805, + "grad_norm": 4.51980447769165, + "learning_rate": 8.343337054401878e-05, + "loss": 2.608, + "step": 8608 + }, + { + "epoch": 0.5778329586255495, + "grad_norm": 5.514434337615967, + "learning_rate": 8.342528849864447e-05, + "loss": 2.4988, + "step": 8610 + }, + { + "epoch": 0.5779671823093184, + "grad_norm": 4.676685333251953, + "learning_rate": 8.3417204873991e-05, + "loss": 2.5481, + "step": 8612 + }, + { + "epoch": 0.5781014059930875, + "grad_norm": 3.964831590652466, + "learning_rate": 8.340911967044032e-05, + "loss": 2.7685, + "step": 8614 + }, + { + "epoch": 0.5782356296768565, + "grad_norm": 3.800869941711426, + "learning_rate": 8.340103288837443e-05, + "loss": 2.5576, + "step": 8616 + }, + { + "epoch": 0.5783698533606255, + "grad_norm": 4.21420431137085, + "learning_rate": 8.33929445281754e-05, + "loss": 2.6522, + "step": 8618 + }, + { + "epoch": 0.5785040770443944, + "grad_norm": 4.266425132751465, + "learning_rate": 8.338485459022544e-05, + "loss": 2.33, + "step": 8620 + }, + { + "epoch": 0.5786383007281635, + "grad_norm": 4.065768718719482, + "learning_rate": 8.337676307490671e-05, + "loss": 2.4784, + "step": 8622 + }, + { + "epoch": 0.5787725244119325, + "grad_norm": 10.085619926452637, + "learning_rate": 8.336866998260159e-05, + "loss": 2.4139, + "step": 8624 + }, + { + "epoch": 0.5789067480957015, + "grad_norm": 3.9824628829956055, + "learning_rate": 8.336057531369241e-05, + "loss": 2.358, + "step": 8626 + }, + { + "epoch": 0.5790409717794704, + "grad_norm": 4.087893009185791, + "learning_rate": 8.335247906856165e-05, + "loss": 2.7788, + "step": 8628 + }, + { + "epoch": 0.5791751954632395, + "grad_norm": 4.205395221710205, + "learning_rate": 8.334438124759184e-05, + "loss": 2.4679, + "step": 8630 + }, + { + "epoch": 0.5793094191470085, + "grad_norm": 4.44117546081543, + "learning_rate": 8.333628185116559e-05, + "loss": 2.604, + "step": 8632 + }, + { + "epoch": 0.5794436428307775, + "grad_norm": 4.060374736785889, + "learning_rate": 8.332818087966556e-05, + "loss": 2.6416, + "step": 8634 + }, + { + "epoch": 0.5795778665145465, + "grad_norm": 4.626859664916992, + "learning_rate": 8.332007833347454e-05, + "loss": 2.5992, + "step": 8636 + }, + { + "epoch": 0.5797120901983155, + "grad_norm": 4.269913196563721, + "learning_rate": 8.331197421297534e-05, + "loss": 2.2761, + "step": 8638 + }, + { + "epoch": 0.5798463138820845, + "grad_norm": 4.410906791687012, + "learning_rate": 8.330386851855086e-05, + "loss": 2.2996, + "step": 8640 + }, + { + "epoch": 0.5799805375658535, + "grad_norm": 4.210602760314941, + "learning_rate": 8.329576125058406e-05, + "loss": 2.675, + "step": 8642 + }, + { + "epoch": 0.5801147612496225, + "grad_norm": 4.73867130279541, + "learning_rate": 8.328765240945803e-05, + "loss": 2.5312, + "step": 8644 + }, + { + "epoch": 0.5802489849333915, + "grad_norm": 3.791243553161621, + "learning_rate": 8.327954199555587e-05, + "loss": 2.5011, + "step": 8646 + }, + { + "epoch": 0.5803832086171605, + "grad_norm": 3.884016752243042, + "learning_rate": 8.327143000926082e-05, + "loss": 2.4041, + "step": 8648 + }, + { + "epoch": 0.5805174323009294, + "grad_norm": 4.305232524871826, + "learning_rate": 8.32633164509561e-05, + "loss": 2.5893, + "step": 8650 + }, + { + "epoch": 0.5806516559846985, + "grad_norm": 4.345340728759766, + "learning_rate": 8.32552013210251e-05, + "loss": 2.3638, + "step": 8652 + }, + { + "epoch": 0.5807858796684675, + "grad_norm": 3.7943947315216064, + "learning_rate": 8.324708461985124e-05, + "loss": 2.5394, + "step": 8654 + }, + { + "epoch": 0.5809201033522365, + "grad_norm": 4.334877014160156, + "learning_rate": 8.323896634781799e-05, + "loss": 2.4824, + "step": 8656 + }, + { + "epoch": 0.5810543270360055, + "grad_norm": 3.9206814765930176, + "learning_rate": 8.323084650530895e-05, + "loss": 2.4619, + "step": 8658 + }, + { + "epoch": 0.5811885507197745, + "grad_norm": 4.215569019317627, + "learning_rate": 8.322272509270777e-05, + "loss": 2.1477, + "step": 8660 + }, + { + "epoch": 0.5813227744035435, + "grad_norm": 8.379711151123047, + "learning_rate": 8.321460211039814e-05, + "loss": 2.5784, + "step": 8662 + }, + { + "epoch": 0.5814569980873125, + "grad_norm": 3.680039405822754, + "learning_rate": 8.320647755876389e-05, + "loss": 2.5239, + "step": 8664 + }, + { + "epoch": 0.5815912217710815, + "grad_norm": 4.599070072174072, + "learning_rate": 8.319835143818887e-05, + "loss": 2.3703, + "step": 8666 + }, + { + "epoch": 0.5817254454548505, + "grad_norm": 4.846008777618408, + "learning_rate": 8.319022374905701e-05, + "loss": 2.5077, + "step": 8668 + }, + { + "epoch": 0.5818596691386195, + "grad_norm": 4.042200565338135, + "learning_rate": 8.318209449175236e-05, + "loss": 2.3553, + "step": 8670 + }, + { + "epoch": 0.5819938928223886, + "grad_norm": 4.261415958404541, + "learning_rate": 8.317396366665899e-05, + "loss": 2.3828, + "step": 8672 + }, + { + "epoch": 0.5821281165061575, + "grad_norm": 4.352478504180908, + "learning_rate": 8.316583127416107e-05, + "loss": 2.5205, + "step": 8674 + }, + { + "epoch": 0.5822623401899265, + "grad_norm": 4.7064738273620605, + "learning_rate": 8.315769731464284e-05, + "loss": 2.7663, + "step": 8676 + }, + { + "epoch": 0.5823965638736955, + "grad_norm": 4.8127121925354, + "learning_rate": 8.314956178848861e-05, + "loss": 2.4564, + "step": 8678 + }, + { + "epoch": 0.5825307875574646, + "grad_norm": 4.631601333618164, + "learning_rate": 8.314142469608278e-05, + "loss": 2.8721, + "step": 8680 + }, + { + "epoch": 0.5826650112412335, + "grad_norm": 4.055510997772217, + "learning_rate": 8.313328603780979e-05, + "loss": 2.4575, + "step": 8682 + }, + { + "epoch": 0.5827992349250025, + "grad_norm": 9.857450485229492, + "learning_rate": 8.31251458140542e-05, + "loss": 2.4936, + "step": 8684 + }, + { + "epoch": 0.5829334586087715, + "grad_norm": 3.949652910232544, + "learning_rate": 8.311700402520062e-05, + "loss": 2.4, + "step": 8686 + }, + { + "epoch": 0.5830676822925405, + "grad_norm": 4.358405590057373, + "learning_rate": 8.31088606716337e-05, + "loss": 2.5583, + "step": 8688 + }, + { + "epoch": 0.5832019059763095, + "grad_norm": 3.9306118488311768, + "learning_rate": 8.310071575373823e-05, + "loss": 2.4351, + "step": 8690 + }, + { + "epoch": 0.5833361296600785, + "grad_norm": 4.126506328582764, + "learning_rate": 8.309256927189903e-05, + "loss": 2.3379, + "step": 8692 + }, + { + "epoch": 0.5834703533438476, + "grad_norm": 3.90529727935791, + "learning_rate": 8.308442122650099e-05, + "loss": 2.3769, + "step": 8694 + }, + { + "epoch": 0.5836045770276165, + "grad_norm": 4.4760661125183105, + "learning_rate": 8.307627161792912e-05, + "loss": 2.4236, + "step": 8696 + }, + { + "epoch": 0.5837388007113855, + "grad_norm": 4.036798477172852, + "learning_rate": 8.306812044656846e-05, + "loss": 2.4819, + "step": 8698 + }, + { + "epoch": 0.5838730243951545, + "grad_norm": 4.164179801940918, + "learning_rate": 8.305996771280413e-05, + "loss": 2.5526, + "step": 8700 + }, + { + "epoch": 0.5840072480789236, + "grad_norm": 4.522403240203857, + "learning_rate": 8.305181341702134e-05, + "loss": 2.5695, + "step": 8702 + }, + { + "epoch": 0.5841414717626925, + "grad_norm": 4.463257312774658, + "learning_rate": 8.304365755960534e-05, + "loss": 2.95, + "step": 8704 + }, + { + "epoch": 0.5842756954464615, + "grad_norm": 4.180798530578613, + "learning_rate": 8.303550014094153e-05, + "loss": 2.4061, + "step": 8706 + }, + { + "epoch": 0.5844099191302305, + "grad_norm": 3.828770637512207, + "learning_rate": 8.302734116141527e-05, + "loss": 2.301, + "step": 8708 + }, + { + "epoch": 0.5845441428139996, + "grad_norm": 4.361574649810791, + "learning_rate": 8.30191806214121e-05, + "loss": 2.5314, + "step": 8710 + }, + { + "epoch": 0.5846783664977685, + "grad_norm": 4.372044086456299, + "learning_rate": 8.301101852131757e-05, + "loss": 2.7754, + "step": 8712 + }, + { + "epoch": 0.5848125901815375, + "grad_norm": 4.624102592468262, + "learning_rate": 8.300285486151734e-05, + "loss": 2.3357, + "step": 8714 + }, + { + "epoch": 0.5849468138653066, + "grad_norm": 4.4027419090271, + "learning_rate": 8.299468964239709e-05, + "loss": 2.5692, + "step": 8716 + }, + { + "epoch": 0.5850810375490756, + "grad_norm": 4.74369478225708, + "learning_rate": 8.298652286434265e-05, + "loss": 2.5859, + "step": 8718 + }, + { + "epoch": 0.5852152612328445, + "grad_norm": 3.945875406265259, + "learning_rate": 8.297835452773988e-05, + "loss": 2.386, + "step": 8720 + }, + { + "epoch": 0.5853494849166135, + "grad_norm": 4.752105236053467, + "learning_rate": 8.297018463297471e-05, + "loss": 2.5209, + "step": 8722 + }, + { + "epoch": 0.5854837086003826, + "grad_norm": 4.107145309448242, + "learning_rate": 8.296201318043313e-05, + "loss": 2.5579, + "step": 8724 + }, + { + "epoch": 0.5856179322841515, + "grad_norm": 3.9321653842926025, + "learning_rate": 8.295384017050125e-05, + "loss": 2.4065, + "step": 8726 + }, + { + "epoch": 0.5857521559679205, + "grad_norm": 4.4222235679626465, + "learning_rate": 8.294566560356522e-05, + "loss": 2.6541, + "step": 8728 + }, + { + "epoch": 0.5858863796516895, + "grad_norm": 3.9056284427642822, + "learning_rate": 8.293748948001129e-05, + "loss": 2.5617, + "step": 8730 + }, + { + "epoch": 0.5860206033354586, + "grad_norm": 4.759344100952148, + "learning_rate": 8.292931180022574e-05, + "loss": 2.466, + "step": 8732 + }, + { + "epoch": 0.5861548270192275, + "grad_norm": 4.147807598114014, + "learning_rate": 8.292113256459495e-05, + "loss": 2.5297, + "step": 8734 + }, + { + "epoch": 0.5862890507029965, + "grad_norm": 4.365340709686279, + "learning_rate": 8.29129517735054e-05, + "loss": 2.4532, + "step": 8736 + }, + { + "epoch": 0.5864232743867656, + "grad_norm": 4.021247863769531, + "learning_rate": 8.29047694273436e-05, + "loss": 2.5349, + "step": 8738 + }, + { + "epoch": 0.5865574980705346, + "grad_norm": 4.6124372482299805, + "learning_rate": 8.289658552649614e-05, + "loss": 2.615, + "step": 8740 + }, + { + "epoch": 0.5866917217543035, + "grad_norm": 4.122811317443848, + "learning_rate": 8.28884000713497e-05, + "loss": 2.4493, + "step": 8742 + }, + { + "epoch": 0.5868259454380725, + "grad_norm": 3.848012685775757, + "learning_rate": 8.288021306229103e-05, + "loss": 2.536, + "step": 8744 + }, + { + "epoch": 0.5869601691218416, + "grad_norm": 4.305703163146973, + "learning_rate": 8.287202449970695e-05, + "loss": 2.6003, + "step": 8746 + }, + { + "epoch": 0.5870943928056106, + "grad_norm": 9.836258888244629, + "learning_rate": 8.286383438398437e-05, + "loss": 2.5883, + "step": 8748 + }, + { + "epoch": 0.5872286164893795, + "grad_norm": 5.791433334350586, + "learning_rate": 8.285564271551023e-05, + "loss": 2.868, + "step": 8750 + }, + { + "epoch": 0.5873628401731485, + "grad_norm": 4.404978275299072, + "learning_rate": 8.284744949467156e-05, + "loss": 2.319, + "step": 8752 + }, + { + "epoch": 0.5874970638569176, + "grad_norm": 4.417796611785889, + "learning_rate": 8.283925472185552e-05, + "loss": 2.7275, + "step": 8754 + }, + { + "epoch": 0.5876312875406866, + "grad_norm": 3.9615683555603027, + "learning_rate": 8.283105839744925e-05, + "loss": 2.1653, + "step": 8756 + }, + { + "epoch": 0.5877655112244555, + "grad_norm": 3.981778621673584, + "learning_rate": 8.282286052184005e-05, + "loss": 2.3737, + "step": 8758 + }, + { + "epoch": 0.5878997349082246, + "grad_norm": 4.731372833251953, + "learning_rate": 8.281466109541521e-05, + "loss": 2.5268, + "step": 8760 + }, + { + "epoch": 0.5880339585919936, + "grad_norm": 4.534754276275635, + "learning_rate": 8.280646011856218e-05, + "loss": 2.5068, + "step": 8762 + }, + { + "epoch": 0.5881681822757625, + "grad_norm": 4.3109450340271, + "learning_rate": 8.279825759166842e-05, + "loss": 2.5453, + "step": 8764 + }, + { + "epoch": 0.5883024059595315, + "grad_norm": 4.540164947509766, + "learning_rate": 8.279005351512146e-05, + "loss": 2.6207, + "step": 8766 + }, + { + "epoch": 0.5884366296433006, + "grad_norm": 4.531014919281006, + "learning_rate": 8.278184788930897e-05, + "loss": 2.6197, + "step": 8768 + }, + { + "epoch": 0.5885708533270696, + "grad_norm": 4.114141941070557, + "learning_rate": 8.277364071461862e-05, + "loss": 2.425, + "step": 8770 + }, + { + "epoch": 0.5887050770108385, + "grad_norm": 4.054774284362793, + "learning_rate": 8.27654319914382e-05, + "loss": 2.4545, + "step": 8772 + }, + { + "epoch": 0.5888393006946075, + "grad_norm": 4.6349968910217285, + "learning_rate": 8.275722172015555e-05, + "loss": 2.554, + "step": 8774 + }, + { + "epoch": 0.5889735243783766, + "grad_norm": 4.556373596191406, + "learning_rate": 8.274900990115859e-05, + "loss": 2.6353, + "step": 8776 + }, + { + "epoch": 0.5891077480621456, + "grad_norm": 4.9533371925354, + "learning_rate": 8.274079653483531e-05, + "loss": 2.6336, + "step": 8778 + }, + { + "epoch": 0.5892419717459145, + "grad_norm": 3.828535318374634, + "learning_rate": 8.273258162157377e-05, + "loss": 2.5169, + "step": 8780 + }, + { + "epoch": 0.5893761954296836, + "grad_norm": 4.479168891906738, + "learning_rate": 8.272436516176212e-05, + "loss": 2.5588, + "step": 8782 + }, + { + "epoch": 0.5895104191134526, + "grad_norm": 4.448615074157715, + "learning_rate": 8.271614715578856e-05, + "loss": 2.6996, + "step": 8784 + }, + { + "epoch": 0.5896446427972216, + "grad_norm": 4.319115161895752, + "learning_rate": 8.270792760404138e-05, + "loss": 2.7004, + "step": 8786 + }, + { + "epoch": 0.5897788664809905, + "grad_norm": 4.303225994110107, + "learning_rate": 8.269970650690894e-05, + "loss": 2.4805, + "step": 8788 + }, + { + "epoch": 0.5899130901647596, + "grad_norm": 4.342695236206055, + "learning_rate": 8.269148386477968e-05, + "loss": 2.4268, + "step": 8790 + }, + { + "epoch": 0.5900473138485286, + "grad_norm": 3.994988203048706, + "learning_rate": 8.268325967804207e-05, + "loss": 2.6514, + "step": 8792 + }, + { + "epoch": 0.5901815375322976, + "grad_norm": 4.990392684936523, + "learning_rate": 8.267503394708472e-05, + "loss": 2.4891, + "step": 8794 + }, + { + "epoch": 0.5903157612160665, + "grad_norm": 4.199056148529053, + "learning_rate": 8.266680667229627e-05, + "loss": 2.4172, + "step": 8796 + }, + { + "epoch": 0.5904499848998356, + "grad_norm": 4.1956000328063965, + "learning_rate": 8.265857785406544e-05, + "loss": 2.3185, + "step": 8798 + }, + { + "epoch": 0.5905842085836046, + "grad_norm": 5.026490211486816, + "learning_rate": 8.265034749278103e-05, + "loss": 2.809, + "step": 8800 + }, + { + "epoch": 0.5907184322673735, + "grad_norm": 4.155625343322754, + "learning_rate": 8.264211558883191e-05, + "loss": 2.4502, + "step": 8802 + }, + { + "epoch": 0.5908526559511426, + "grad_norm": 3.9899210929870605, + "learning_rate": 8.263388214260702e-05, + "loss": 2.5653, + "step": 8804 + }, + { + "epoch": 0.5909868796349116, + "grad_norm": 3.9169070720672607, + "learning_rate": 8.262564715449535e-05, + "loss": 2.5573, + "step": 8806 + }, + { + "epoch": 0.5911211033186806, + "grad_norm": 3.759997606277466, + "learning_rate": 8.261741062488602e-05, + "loss": 2.6143, + "step": 8808 + }, + { + "epoch": 0.5912553270024495, + "grad_norm": 7.896114826202393, + "learning_rate": 8.260917255416817e-05, + "loss": 2.541, + "step": 8810 + }, + { + "epoch": 0.5913895506862186, + "grad_norm": 4.104400157928467, + "learning_rate": 8.260093294273103e-05, + "loss": 2.4731, + "step": 8812 + }, + { + "epoch": 0.5915237743699876, + "grad_norm": 4.641652584075928, + "learning_rate": 8.259269179096393e-05, + "loss": 2.6404, + "step": 8814 + }, + { + "epoch": 0.5916579980537566, + "grad_norm": 4.0738606452941895, + "learning_rate": 8.258444909925624e-05, + "loss": 2.7038, + "step": 8816 + }, + { + "epoch": 0.5917922217375255, + "grad_norm": 3.9715116024017334, + "learning_rate": 8.25762048679974e-05, + "loss": 2.2818, + "step": 8818 + }, + { + "epoch": 0.5919264454212946, + "grad_norm": 4.339904308319092, + "learning_rate": 8.256795909757694e-05, + "loss": 2.5335, + "step": 8820 + }, + { + "epoch": 0.5920606691050636, + "grad_norm": 4.661920547485352, + "learning_rate": 8.255971178838445e-05, + "loss": 2.4602, + "step": 8822 + }, + { + "epoch": 0.5921948927888326, + "grad_norm": 4.37688684463501, + "learning_rate": 8.25514629408096e-05, + "loss": 2.7001, + "step": 8824 + }, + { + "epoch": 0.5923291164726016, + "grad_norm": 7.555542945861816, + "learning_rate": 8.254321255524215e-05, + "loss": 2.6541, + "step": 8826 + }, + { + "epoch": 0.5924633401563706, + "grad_norm": 4.340180397033691, + "learning_rate": 8.25349606320719e-05, + "loss": 2.2997, + "step": 8828 + }, + { + "epoch": 0.5925975638401396, + "grad_norm": 4.247691631317139, + "learning_rate": 8.252670717168872e-05, + "loss": 2.56, + "step": 8830 + }, + { + "epoch": 0.5927317875239086, + "grad_norm": 4.623472213745117, + "learning_rate": 8.25184521744826e-05, + "loss": 2.3346, + "step": 8832 + }, + { + "epoch": 0.5928660112076776, + "grad_norm": 4.127546787261963, + "learning_rate": 8.251019564084355e-05, + "loss": 2.4708, + "step": 8834 + }, + { + "epoch": 0.5930002348914466, + "grad_norm": 4.249456882476807, + "learning_rate": 8.250193757116169e-05, + "loss": 2.6228, + "step": 8836 + }, + { + "epoch": 0.5931344585752156, + "grad_norm": 4.1076979637146, + "learning_rate": 8.24936779658272e-05, + "loss": 2.4825, + "step": 8838 + }, + { + "epoch": 0.5932686822589845, + "grad_norm": 3.9312238693237305, + "learning_rate": 8.248541682523032e-05, + "loss": 2.6657, + "step": 8840 + }, + { + "epoch": 0.5934029059427536, + "grad_norm": 4.476598739624023, + "learning_rate": 8.247715414976136e-05, + "loss": 2.5861, + "step": 8842 + }, + { + "epoch": 0.5935371296265226, + "grad_norm": 4.599727630615234, + "learning_rate": 8.246888993981074e-05, + "loss": 2.4715, + "step": 8844 + }, + { + "epoch": 0.5936713533102916, + "grad_norm": 3.807175874710083, + "learning_rate": 8.246062419576892e-05, + "loss": 2.5472, + "step": 8846 + }, + { + "epoch": 0.5938055769940606, + "grad_norm": 4.137506008148193, + "learning_rate": 8.245235691802644e-05, + "loss": 2.5123, + "step": 8848 + }, + { + "epoch": 0.5939398006778296, + "grad_norm": 20.516651153564453, + "learning_rate": 8.24440881069739e-05, + "loss": 2.4184, + "step": 8850 + }, + { + "epoch": 0.5940740243615986, + "grad_norm": 4.08819580078125, + "learning_rate": 8.2435817763002e-05, + "loss": 2.4422, + "step": 8852 + }, + { + "epoch": 0.5942082480453676, + "grad_norm": 4.767317295074463, + "learning_rate": 8.24275458865015e-05, + "loss": 2.5156, + "step": 8854 + }, + { + "epoch": 0.5943424717291366, + "grad_norm": 3.7627360820770264, + "learning_rate": 8.24192724778632e-05, + "loss": 2.6512, + "step": 8856 + }, + { + "epoch": 0.5944766954129056, + "grad_norm": 4.243515968322754, + "learning_rate": 8.241099753747805e-05, + "loss": 2.6322, + "step": 8858 + }, + { + "epoch": 0.5946109190966746, + "grad_norm": 4.289247512817383, + "learning_rate": 8.240272106573699e-05, + "loss": 2.3952, + "step": 8860 + }, + { + "epoch": 0.5947451427804437, + "grad_norm": 4.001808166503906, + "learning_rate": 8.239444306303109e-05, + "loss": 2.585, + "step": 8862 + }, + { + "epoch": 0.5948793664642126, + "grad_norm": 4.271307945251465, + "learning_rate": 8.238616352975143e-05, + "loss": 2.7436, + "step": 8864 + }, + { + "epoch": 0.5950135901479816, + "grad_norm": 4.3449177742004395, + "learning_rate": 8.237788246628925e-05, + "loss": 2.3765, + "step": 8866 + }, + { + "epoch": 0.5951478138317506, + "grad_norm": 4.211949348449707, + "learning_rate": 8.236959987303579e-05, + "loss": 2.7284, + "step": 8868 + }, + { + "epoch": 0.5952820375155197, + "grad_norm": 4.3124918937683105, + "learning_rate": 8.236131575038238e-05, + "loss": 2.7566, + "step": 8870 + }, + { + "epoch": 0.5954162611992886, + "grad_norm": 4.061512470245361, + "learning_rate": 8.235303009872043e-05, + "loss": 2.7471, + "step": 8872 + }, + { + "epoch": 0.5955504848830576, + "grad_norm": 8.871076583862305, + "learning_rate": 8.234474291844144e-05, + "loss": 2.6149, + "step": 8874 + }, + { + "epoch": 0.5956847085668266, + "grad_norm": 5.126734733581543, + "learning_rate": 8.233645420993695e-05, + "loss": 2.4663, + "step": 8876 + }, + { + "epoch": 0.5958189322505956, + "grad_norm": 4.589522838592529, + "learning_rate": 8.232816397359858e-05, + "loss": 2.4705, + "step": 8878 + }, + { + "epoch": 0.5959531559343646, + "grad_norm": 4.163862228393555, + "learning_rate": 8.231987220981804e-05, + "loss": 2.4501, + "step": 8880 + }, + { + "epoch": 0.5960873796181336, + "grad_norm": 4.5732574462890625, + "learning_rate": 8.231157891898708e-05, + "loss": 2.5883, + "step": 8882 + }, + { + "epoch": 0.5962216033019027, + "grad_norm": 4.857141017913818, + "learning_rate": 8.230328410149756e-05, + "loss": 2.4698, + "step": 8884 + }, + { + "epoch": 0.5963558269856716, + "grad_norm": 3.893690586090088, + "learning_rate": 8.22949877577414e-05, + "loss": 2.6003, + "step": 8886 + }, + { + "epoch": 0.5964900506694406, + "grad_norm": 6.929137706756592, + "learning_rate": 8.228668988811055e-05, + "loss": 2.5006, + "step": 8888 + }, + { + "epoch": 0.5966242743532096, + "grad_norm": 4.750836372375488, + "learning_rate": 8.227839049299711e-05, + "loss": 2.5349, + "step": 8890 + }, + { + "epoch": 0.5967584980369787, + "grad_norm": 3.8297767639160156, + "learning_rate": 8.227008957279319e-05, + "loss": 2.4723, + "step": 8892 + }, + { + "epoch": 0.5968927217207476, + "grad_norm": 4.638721466064453, + "learning_rate": 8.2261787127891e-05, + "loss": 2.3645, + "step": 8894 + }, + { + "epoch": 0.5970269454045166, + "grad_norm": 4.643172264099121, + "learning_rate": 8.22534831586828e-05, + "loss": 2.3017, + "step": 8896 + }, + { + "epoch": 0.5971611690882856, + "grad_norm": 3.9855916500091553, + "learning_rate": 8.224517766556094e-05, + "loss": 2.4005, + "step": 8898 + }, + { + "epoch": 0.5972953927720547, + "grad_norm": 4.492776393890381, + "learning_rate": 8.223687064891785e-05, + "loss": 2.5525, + "step": 8900 + }, + { + "epoch": 0.5974296164558236, + "grad_norm": 5.372023582458496, + "learning_rate": 8.222856210914601e-05, + "loss": 2.5462, + "step": 8902 + }, + { + "epoch": 0.5975638401395926, + "grad_norm": 3.9104228019714355, + "learning_rate": 8.222025204663799e-05, + "loss": 2.7046, + "step": 8904 + }, + { + "epoch": 0.5976980638233617, + "grad_norm": 4.160070419311523, + "learning_rate": 8.221194046178641e-05, + "loss": 2.4438, + "step": 8906 + }, + { + "epoch": 0.5978322875071307, + "grad_norm": 4.126678466796875, + "learning_rate": 8.220362735498399e-05, + "loss": 2.408, + "step": 8908 + }, + { + "epoch": 0.5979665111908996, + "grad_norm": 3.922210216522217, + "learning_rate": 8.21953127266235e-05, + "loss": 2.6763, + "step": 8910 + }, + { + "epoch": 0.5981007348746686, + "grad_norm": 4.421837329864502, + "learning_rate": 8.21869965770978e-05, + "loss": 2.6527, + "step": 8912 + }, + { + "epoch": 0.5982349585584377, + "grad_norm": 4.134332656860352, + "learning_rate": 8.21786789067998e-05, + "loss": 2.6399, + "step": 8914 + }, + { + "epoch": 0.5983691822422066, + "grad_norm": 4.121059894561768, + "learning_rate": 8.21703597161225e-05, + "loss": 2.535, + "step": 8916 + }, + { + "epoch": 0.5985034059259756, + "grad_norm": 3.9294402599334717, + "learning_rate": 8.216203900545895e-05, + "loss": 2.5182, + "step": 8918 + }, + { + "epoch": 0.5986376296097446, + "grad_norm": 3.4122824668884277, + "learning_rate": 8.215371677520231e-05, + "loss": 2.3394, + "step": 8920 + }, + { + "epoch": 0.5987718532935137, + "grad_norm": 3.8868610858917236, + "learning_rate": 8.21453930257458e-05, + "loss": 2.2634, + "step": 8922 + }, + { + "epoch": 0.5989060769772826, + "grad_norm": 4.277468681335449, + "learning_rate": 8.213706775748265e-05, + "loss": 2.3323, + "step": 8924 + }, + { + "epoch": 0.5990403006610516, + "grad_norm": 4.558352470397949, + "learning_rate": 8.212874097080626e-05, + "loss": 2.8577, + "step": 8926 + }, + { + "epoch": 0.5991745243448207, + "grad_norm": 4.838447570800781, + "learning_rate": 8.212041266611003e-05, + "loss": 2.4875, + "step": 8928 + }, + { + "epoch": 0.5993087480285897, + "grad_norm": 4.7096476554870605, + "learning_rate": 8.211208284378747e-05, + "loss": 2.7445, + "step": 8930 + }, + { + "epoch": 0.5994429717123586, + "grad_norm": 4.332677841186523, + "learning_rate": 8.210375150423214e-05, + "loss": 2.4125, + "step": 8932 + }, + { + "epoch": 0.5995771953961276, + "grad_norm": 4.12073278427124, + "learning_rate": 8.209541864783769e-05, + "loss": 2.4716, + "step": 8934 + }, + { + "epoch": 0.5997114190798967, + "grad_norm": 4.195922374725342, + "learning_rate": 8.20870842749978e-05, + "loss": 2.6955, + "step": 8936 + }, + { + "epoch": 0.5998456427636657, + "grad_norm": 3.9287028312683105, + "learning_rate": 8.20787483861063e-05, + "loss": 2.4971, + "step": 8938 + }, + { + "epoch": 0.5999798664474346, + "grad_norm": 4.323615074157715, + "learning_rate": 8.2070410981557e-05, + "loss": 2.4225, + "step": 8940 + }, + { + "epoch": 0.6001140901312036, + "grad_norm": 4.464348316192627, + "learning_rate": 8.206207206174386e-05, + "loss": 2.4591, + "step": 8942 + }, + { + "epoch": 0.6002483138149727, + "grad_norm": 5.980128765106201, + "learning_rate": 8.205373162706085e-05, + "loss": 2.7141, + "step": 8944 + }, + { + "epoch": 0.6003825374987416, + "grad_norm": 5.478582859039307, + "learning_rate": 8.204538967790205e-05, + "loss": 2.7969, + "step": 8946 + }, + { + "epoch": 0.6005167611825106, + "grad_norm": 3.949402093887329, + "learning_rate": 8.20370462146616e-05, + "loss": 2.6109, + "step": 8948 + }, + { + "epoch": 0.6006509848662797, + "grad_norm": 4.461920261383057, + "learning_rate": 8.202870123773371e-05, + "loss": 2.7867, + "step": 8950 + }, + { + "epoch": 0.6007852085500487, + "grad_norm": 4.190479278564453, + "learning_rate": 8.20203547475127e-05, + "loss": 2.481, + "step": 8952 + }, + { + "epoch": 0.6009194322338176, + "grad_norm": 3.613729238510132, + "learning_rate": 8.201200674439287e-05, + "loss": 2.3612, + "step": 8954 + }, + { + "epoch": 0.6010536559175866, + "grad_norm": 4.1229119300842285, + "learning_rate": 8.200365722876868e-05, + "loss": 2.4447, + "step": 8956 + }, + { + "epoch": 0.6011878796013557, + "grad_norm": 4.488452911376953, + "learning_rate": 8.199530620103461e-05, + "loss": 2.3404, + "step": 8958 + }, + { + "epoch": 0.6013221032851247, + "grad_norm": 4.630026340484619, + "learning_rate": 8.198695366158523e-05, + "loss": 2.38, + "step": 8960 + }, + { + "epoch": 0.6014563269688936, + "grad_norm": 6.354121208190918, + "learning_rate": 8.197859961081522e-05, + "loss": 2.6374, + "step": 8962 + }, + { + "epoch": 0.6015905506526626, + "grad_norm": 4.0342302322387695, + "learning_rate": 8.197024404911924e-05, + "loss": 2.464, + "step": 8964 + }, + { + "epoch": 0.6017247743364317, + "grad_norm": 4.258775234222412, + "learning_rate": 8.196188697689209e-05, + "loss": 2.8326, + "step": 8966 + }, + { + "epoch": 0.6018589980202007, + "grad_norm": 4.816757678985596, + "learning_rate": 8.195352839452863e-05, + "loss": 2.3929, + "step": 8968 + }, + { + "epoch": 0.6019932217039696, + "grad_norm": 3.9922780990600586, + "learning_rate": 8.19451683024238e-05, + "loss": 2.6539, + "step": 8970 + }, + { + "epoch": 0.6021274453877387, + "grad_norm": 4.559800148010254, + "learning_rate": 8.193680670097257e-05, + "loss": 2.6657, + "step": 8972 + }, + { + "epoch": 0.6022616690715077, + "grad_norm": 4.175164699554443, + "learning_rate": 8.192844359057003e-05, + "loss": 2.3953, + "step": 8974 + }, + { + "epoch": 0.6023958927552767, + "grad_norm": 3.6620821952819824, + "learning_rate": 8.192007897161133e-05, + "loss": 2.3151, + "step": 8976 + }, + { + "epoch": 0.6025301164390456, + "grad_norm": 3.905653476715088, + "learning_rate": 8.191171284449166e-05, + "loss": 2.4051, + "step": 8978 + }, + { + "epoch": 0.6026643401228147, + "grad_norm": 8.385147094726562, + "learning_rate": 8.19033452096063e-05, + "loss": 2.57, + "step": 8980 + }, + { + "epoch": 0.6027985638065837, + "grad_norm": 3.7588188648223877, + "learning_rate": 8.189497606735061e-05, + "loss": 2.5564, + "step": 8982 + }, + { + "epoch": 0.6029327874903526, + "grad_norm": 4.010897636413574, + "learning_rate": 8.188660541812004e-05, + "loss": 2.6832, + "step": 8984 + }, + { + "epoch": 0.6030670111741216, + "grad_norm": 7.395690441131592, + "learning_rate": 8.187823326231005e-05, + "loss": 2.6514, + "step": 8986 + }, + { + "epoch": 0.6032012348578907, + "grad_norm": 4.573713302612305, + "learning_rate": 8.186985960031623e-05, + "loss": 2.812, + "step": 8988 + }, + { + "epoch": 0.6033354585416597, + "grad_norm": 5.045195579528809, + "learning_rate": 8.18614844325342e-05, + "loss": 2.5235, + "step": 8990 + }, + { + "epoch": 0.6034696822254286, + "grad_norm": 5.21574068069458, + "learning_rate": 8.185310775935971e-05, + "loss": 2.5084, + "step": 8992 + }, + { + "epoch": 0.6036039059091977, + "grad_norm": 3.7024238109588623, + "learning_rate": 8.184472958118851e-05, + "loss": 2.5578, + "step": 8994 + }, + { + "epoch": 0.6037381295929667, + "grad_norm": 3.9225690364837646, + "learning_rate": 8.183634989841644e-05, + "loss": 2.6286, + "step": 8996 + }, + { + "epoch": 0.6038723532767357, + "grad_norm": 3.2778637409210205, + "learning_rate": 8.182796871143945e-05, + "loss": 2.1463, + "step": 8998 + }, + { + "epoch": 0.6040065769605046, + "grad_norm": 3.9872758388519287, + "learning_rate": 8.181958602065351e-05, + "loss": 2.4153, + "step": 9000 + }, + { + "epoch": 0.6041408006442737, + "grad_norm": 3.9885640144348145, + "learning_rate": 8.181120182645473e-05, + "loss": 2.5411, + "step": 9002 + }, + { + "epoch": 0.6042750243280427, + "grad_norm": 4.221307754516602, + "learning_rate": 8.18028161292392e-05, + "loss": 2.5613, + "step": 9004 + }, + { + "epoch": 0.6044092480118117, + "grad_norm": 4.800970554351807, + "learning_rate": 8.179442892940314e-05, + "loss": 2.5007, + "step": 9006 + }, + { + "epoch": 0.6045434716955806, + "grad_norm": 4.236454963684082, + "learning_rate": 8.178604022734287e-05, + "loss": 2.8678, + "step": 9008 + }, + { + "epoch": 0.6046776953793497, + "grad_norm": 3.9555976390838623, + "learning_rate": 8.177765002345466e-05, + "loss": 2.6268, + "step": 9010 + }, + { + "epoch": 0.6048119190631187, + "grad_norm": 3.724667549133301, + "learning_rate": 8.176925831813499e-05, + "loss": 2.5199, + "step": 9012 + }, + { + "epoch": 0.6049461427468877, + "grad_norm": 3.742142915725708, + "learning_rate": 8.176086511178034e-05, + "loss": 2.5321, + "step": 9014 + }, + { + "epoch": 0.6050803664306567, + "grad_norm": 4.037389755249023, + "learning_rate": 8.175247040478727e-05, + "loss": 2.4599, + "step": 9016 + }, + { + "epoch": 0.6052145901144257, + "grad_norm": 4.698034763336182, + "learning_rate": 8.17440741975524e-05, + "loss": 2.4274, + "step": 9018 + }, + { + "epoch": 0.6053488137981947, + "grad_norm": 4.194585800170898, + "learning_rate": 8.173567649047246e-05, + "loss": 2.468, + "step": 9020 + }, + { + "epoch": 0.6054830374819636, + "grad_norm": 5.341063022613525, + "learning_rate": 8.17272772839442e-05, + "loss": 2.5194, + "step": 9022 + }, + { + "epoch": 0.6056172611657327, + "grad_norm": 4.198864936828613, + "learning_rate": 8.171887657836448e-05, + "loss": 2.5811, + "step": 9024 + }, + { + "epoch": 0.6057514848495017, + "grad_norm": 5.401327133178711, + "learning_rate": 8.171047437413022e-05, + "loss": 2.5638, + "step": 9026 + }, + { + "epoch": 0.6058857085332707, + "grad_norm": 4.066832542419434, + "learning_rate": 8.17020706716384e-05, + "loss": 2.4428, + "step": 9028 + }, + { + "epoch": 0.6060199322170396, + "grad_norm": 3.860799789428711, + "learning_rate": 8.169366547128607e-05, + "loss": 2.3719, + "step": 9030 + }, + { + "epoch": 0.6061541559008087, + "grad_norm": 3.8259668350219727, + "learning_rate": 8.16852587734704e-05, + "loss": 2.338, + "step": 9032 + }, + { + "epoch": 0.6062883795845777, + "grad_norm": 4.700682163238525, + "learning_rate": 8.167685057858853e-05, + "loss": 2.5884, + "step": 9034 + }, + { + "epoch": 0.6064226032683467, + "grad_norm": 8.170792579650879, + "learning_rate": 8.166844088703777e-05, + "loss": 2.2966, + "step": 9036 + }, + { + "epoch": 0.6065568269521157, + "grad_norm": 4.495944499969482, + "learning_rate": 8.166002969921545e-05, + "loss": 2.32, + "step": 9038 + }, + { + "epoch": 0.6066910506358847, + "grad_norm": 4.270186424255371, + "learning_rate": 8.165161701551898e-05, + "loss": 2.5372, + "step": 9040 + }, + { + "epoch": 0.6068252743196537, + "grad_norm": 4.89624547958374, + "learning_rate": 8.164320283634585e-05, + "loss": 2.5425, + "step": 9042 + }, + { + "epoch": 0.6069594980034227, + "grad_norm": 4.5904011726379395, + "learning_rate": 8.16347871620936e-05, + "loss": 2.6572, + "step": 9044 + }, + { + "epoch": 0.6070937216871917, + "grad_norm": 3.807624101638794, + "learning_rate": 8.162636999315987e-05, + "loss": 2.3467, + "step": 9046 + }, + { + "epoch": 0.6072279453709607, + "grad_norm": 5.034989833831787, + "learning_rate": 8.161795132994235e-05, + "loss": 2.4336, + "step": 9048 + }, + { + "epoch": 0.6073621690547297, + "grad_norm": 6.169510841369629, + "learning_rate": 8.160953117283881e-05, + "loss": 2.4117, + "step": 9050 + }, + { + "epoch": 0.6074963927384988, + "grad_norm": 4.1171698570251465, + "learning_rate": 8.160110952224707e-05, + "loss": 2.4815, + "step": 9052 + }, + { + "epoch": 0.6076306164222677, + "grad_norm": 3.558795928955078, + "learning_rate": 8.159268637856506e-05, + "loss": 2.5662, + "step": 9054 + }, + { + "epoch": 0.6077648401060367, + "grad_norm": 3.9375669956207275, + "learning_rate": 8.158426174219074e-05, + "loss": 2.6396, + "step": 9056 + }, + { + "epoch": 0.6078990637898057, + "grad_norm": 4.415063381195068, + "learning_rate": 8.157583561352213e-05, + "loss": 2.505, + "step": 9058 + }, + { + "epoch": 0.6080332874735747, + "grad_norm": 3.8389272689819336, + "learning_rate": 8.156740799295741e-05, + "loss": 2.4385, + "step": 9060 + }, + { + "epoch": 0.6081675111573437, + "grad_norm": 3.5941693782806396, + "learning_rate": 8.155897888089473e-05, + "loss": 2.334, + "step": 9062 + }, + { + "epoch": 0.6083017348411127, + "grad_norm": 4.396344184875488, + "learning_rate": 8.155054827773237e-05, + "loss": 2.4266, + "step": 9064 + }, + { + "epoch": 0.6084359585248817, + "grad_norm": 4.584478855133057, + "learning_rate": 8.154211618386862e-05, + "loss": 2.4301, + "step": 9066 + }, + { + "epoch": 0.6085701822086507, + "grad_norm": 4.67272424697876, + "learning_rate": 8.153368259970191e-05, + "loss": 2.8888, + "step": 9068 + }, + { + "epoch": 0.6087044058924197, + "grad_norm": 4.081058025360107, + "learning_rate": 8.15252475256307e-05, + "loss": 2.408, + "step": 9070 + }, + { + "epoch": 0.6088386295761887, + "grad_norm": 4.057193756103516, + "learning_rate": 8.151681096205356e-05, + "loss": 2.7144, + "step": 9072 + }, + { + "epoch": 0.6089728532599578, + "grad_norm": 4.5054497718811035, + "learning_rate": 8.150837290936905e-05, + "loss": 2.4219, + "step": 9074 + }, + { + "epoch": 0.6091070769437267, + "grad_norm": 4.6952972412109375, + "learning_rate": 8.14999333679759e-05, + "loss": 2.4724, + "step": 9076 + }, + { + "epoch": 0.6092413006274957, + "grad_norm": 3.952958106994629, + "learning_rate": 8.149149233827285e-05, + "loss": 2.2442, + "step": 9078 + }, + { + "epoch": 0.6093755243112647, + "grad_norm": 4.196749687194824, + "learning_rate": 8.148304982065869e-05, + "loss": 2.5439, + "step": 9080 + }, + { + "epoch": 0.6095097479950338, + "grad_norm": 4.3267011642456055, + "learning_rate": 8.147460581553233e-05, + "loss": 2.3828, + "step": 9082 + }, + { + "epoch": 0.6096439716788027, + "grad_norm": 5.130125522613525, + "learning_rate": 8.146616032329275e-05, + "loss": 2.7516, + "step": 9084 + }, + { + "epoch": 0.6097781953625717, + "grad_norm": 4.089123725891113, + "learning_rate": 8.145771334433896e-05, + "loss": 2.4982, + "step": 9086 + }, + { + "epoch": 0.6099124190463407, + "grad_norm": 4.136356830596924, + "learning_rate": 8.144926487907009e-05, + "loss": 2.8069, + "step": 9088 + }, + { + "epoch": 0.6100466427301098, + "grad_norm": 3.7694504261016846, + "learning_rate": 8.144081492788528e-05, + "loss": 2.5655, + "step": 9090 + }, + { + "epoch": 0.6101808664138787, + "grad_norm": 5.003740310668945, + "learning_rate": 8.143236349118381e-05, + "loss": 2.6199, + "step": 9092 + }, + { + "epoch": 0.6103150900976477, + "grad_norm": 4.394735336303711, + "learning_rate": 8.142391056936495e-05, + "loss": 3.0748, + "step": 9094 + }, + { + "epoch": 0.6104493137814168, + "grad_norm": 4.072969913482666, + "learning_rate": 8.141545616282812e-05, + "loss": 2.5728, + "step": 9096 + }, + { + "epoch": 0.6105835374651857, + "grad_norm": 4.271207809448242, + "learning_rate": 8.140700027197277e-05, + "loss": 2.4379, + "step": 9098 + }, + { + "epoch": 0.6107177611489547, + "grad_norm": 4.130514144897461, + "learning_rate": 8.139854289719841e-05, + "loss": 2.4433, + "step": 9100 + }, + { + "epoch": 0.6108519848327237, + "grad_norm": 4.972227096557617, + "learning_rate": 8.139008403890465e-05, + "loss": 2.6144, + "step": 9102 + }, + { + "epoch": 0.6109862085164928, + "grad_norm": 4.2085957527160645, + "learning_rate": 8.138162369749114e-05, + "loss": 2.612, + "step": 9104 + }, + { + "epoch": 0.6111204322002617, + "grad_norm": 4.33076810836792, + "learning_rate": 8.137316187335761e-05, + "loss": 2.5171, + "step": 9106 + }, + { + "epoch": 0.6112546558840307, + "grad_norm": 4.153900146484375, + "learning_rate": 8.136469856690387e-05, + "loss": 2.7074, + "step": 9108 + }, + { + "epoch": 0.6113888795677997, + "grad_norm": 4.441685676574707, + "learning_rate": 8.135623377852982e-05, + "loss": 2.703, + "step": 9110 + }, + { + "epoch": 0.6115231032515688, + "grad_norm": 4.255147933959961, + "learning_rate": 8.134776750863535e-05, + "loss": 2.5123, + "step": 9112 + }, + { + "epoch": 0.6116573269353377, + "grad_norm": 3.865961790084839, + "learning_rate": 8.133929975762053e-05, + "loss": 2.1755, + "step": 9114 + }, + { + "epoch": 0.6117915506191067, + "grad_norm": 4.764133930206299, + "learning_rate": 8.133083052588543e-05, + "loss": 2.5787, + "step": 9116 + }, + { + "epoch": 0.6119257743028758, + "grad_norm": 4.488648414611816, + "learning_rate": 8.132235981383018e-05, + "loss": 2.7307, + "step": 9118 + }, + { + "epoch": 0.6120599979866448, + "grad_norm": 4.363330841064453, + "learning_rate": 8.131388762185503e-05, + "loss": 2.7205, + "step": 9120 + }, + { + "epoch": 0.6121942216704137, + "grad_norm": 4.300917625427246, + "learning_rate": 8.130541395036027e-05, + "loss": 2.3792, + "step": 9122 + }, + { + "epoch": 0.6123284453541827, + "grad_norm": 4.350119113922119, + "learning_rate": 8.129693879974626e-05, + "loss": 2.6001, + "step": 9124 + }, + { + "epoch": 0.6124626690379518, + "grad_norm": 4.002981185913086, + "learning_rate": 8.128846217041344e-05, + "loss": 2.4321, + "step": 9126 + }, + { + "epoch": 0.6125968927217208, + "grad_norm": 5.337876319885254, + "learning_rate": 8.12799840627623e-05, + "loss": 2.3506, + "step": 9128 + }, + { + "epoch": 0.6127311164054897, + "grad_norm": 4.049530506134033, + "learning_rate": 8.127150447719342e-05, + "loss": 2.3553, + "step": 9130 + }, + { + "epoch": 0.6128653400892587, + "grad_norm": 3.926036834716797, + "learning_rate": 8.126302341410744e-05, + "loss": 2.2581, + "step": 9132 + }, + { + "epoch": 0.6129995637730278, + "grad_norm": 4.575514316558838, + "learning_rate": 8.125454087390509e-05, + "loss": 2.6306, + "step": 9134 + }, + { + "epoch": 0.6131337874567967, + "grad_norm": 4.305856227874756, + "learning_rate": 8.124605685698714e-05, + "loss": 2.5999, + "step": 9136 + }, + { + "epoch": 0.6132680111405657, + "grad_norm": 4.576911926269531, + "learning_rate": 8.123757136375445e-05, + "loss": 2.3955, + "step": 9138 + }, + { + "epoch": 0.6134022348243348, + "grad_norm": 3.791184902191162, + "learning_rate": 8.122908439460794e-05, + "loss": 2.5496, + "step": 9140 + }, + { + "epoch": 0.6135364585081038, + "grad_norm": 4.40363073348999, + "learning_rate": 8.12205959499486e-05, + "loss": 2.4772, + "step": 9142 + }, + { + "epoch": 0.6136706821918727, + "grad_norm": 3.837270498275757, + "learning_rate": 8.121210603017748e-05, + "loss": 2.3749, + "step": 9144 + }, + { + "epoch": 0.6138049058756417, + "grad_norm": 4.759726047515869, + "learning_rate": 8.120361463569575e-05, + "loss": 2.4175, + "step": 9146 + }, + { + "epoch": 0.6139391295594108, + "grad_norm": 6.092769145965576, + "learning_rate": 8.119512176690455e-05, + "loss": 2.6347, + "step": 9148 + }, + { + "epoch": 0.6140733532431798, + "grad_norm": 4.399352073669434, + "learning_rate": 8.118662742420523e-05, + "loss": 2.6058, + "step": 9150 + }, + { + "epoch": 0.6142075769269487, + "grad_norm": 3.663189172744751, + "learning_rate": 8.117813160799908e-05, + "loss": 2.2993, + "step": 9152 + }, + { + "epoch": 0.6143418006107177, + "grad_norm": 4.024352550506592, + "learning_rate": 8.116963431868751e-05, + "loss": 2.3472, + "step": 9154 + }, + { + "epoch": 0.6144760242944868, + "grad_norm": 4.17489767074585, + "learning_rate": 8.116113555667203e-05, + "loss": 2.8893, + "step": 9156 + }, + { + "epoch": 0.6146102479782558, + "grad_norm": 3.6512746810913086, + "learning_rate": 8.115263532235416e-05, + "loss": 2.5128, + "step": 9158 + }, + { + "epoch": 0.6147444716620247, + "grad_norm": 4.6630754470825195, + "learning_rate": 8.114413361613551e-05, + "loss": 2.1024, + "step": 9160 + }, + { + "epoch": 0.6148786953457938, + "grad_norm": 4.214949131011963, + "learning_rate": 8.113563043841781e-05, + "loss": 2.4285, + "step": 9162 + }, + { + "epoch": 0.6150129190295628, + "grad_norm": 3.9457476139068604, + "learning_rate": 8.11271257896028e-05, + "loss": 2.3736, + "step": 9164 + }, + { + "epoch": 0.6151471427133318, + "grad_norm": 4.510122776031494, + "learning_rate": 8.111861967009229e-05, + "loss": 2.4622, + "step": 9166 + }, + { + "epoch": 0.6152813663971007, + "grad_norm": 4.670661449432373, + "learning_rate": 8.111011208028821e-05, + "loss": 2.6998, + "step": 9168 + }, + { + "epoch": 0.6154155900808698, + "grad_norm": 3.816521167755127, + "learning_rate": 8.11016030205925e-05, + "loss": 2.4881, + "step": 9170 + }, + { + "epoch": 0.6155498137646388, + "grad_norm": 3.920358657836914, + "learning_rate": 8.109309249140721e-05, + "loss": 2.8295, + "step": 9172 + }, + { + "epoch": 0.6156840374484077, + "grad_norm": 3.963780403137207, + "learning_rate": 8.108458049313443e-05, + "loss": 2.4023, + "step": 9174 + }, + { + "epoch": 0.6158182611321767, + "grad_norm": 4.407249450683594, + "learning_rate": 8.107606702617636e-05, + "loss": 2.5201, + "step": 9176 + }, + { + "epoch": 0.6159524848159458, + "grad_norm": 3.9000496864318848, + "learning_rate": 8.106755209093523e-05, + "loss": 2.4026, + "step": 9178 + }, + { + "epoch": 0.6160867084997148, + "grad_norm": 4.593108654022217, + "learning_rate": 8.105903568781335e-05, + "loss": 2.5248, + "step": 9180 + }, + { + "epoch": 0.6162209321834837, + "grad_norm": 6.260143280029297, + "learning_rate": 8.105051781721312e-05, + "loss": 2.4829, + "step": 9182 + }, + { + "epoch": 0.6163551558672528, + "grad_norm": 4.274314880371094, + "learning_rate": 8.104199847953696e-05, + "loss": 2.5821, + "step": 9184 + }, + { + "epoch": 0.6164893795510218, + "grad_norm": 4.316870212554932, + "learning_rate": 8.103347767518743e-05, + "loss": 2.557, + "step": 9186 + }, + { + "epoch": 0.6166236032347908, + "grad_norm": 4.170542240142822, + "learning_rate": 8.102495540456711e-05, + "loss": 2.4017, + "step": 9188 + }, + { + "epoch": 0.6167578269185597, + "grad_norm": 3.7514233589172363, + "learning_rate": 8.101643166807864e-05, + "loss": 2.3675, + "step": 9190 + }, + { + "epoch": 0.6168920506023288, + "grad_norm": 3.850247383117676, + "learning_rate": 8.100790646612477e-05, + "loss": 2.381, + "step": 9192 + }, + { + "epoch": 0.6170262742860978, + "grad_norm": 4.556551933288574, + "learning_rate": 8.09993797991083e-05, + "loss": 2.5007, + "step": 9194 + }, + { + "epoch": 0.6171604979698668, + "grad_norm": 4.473758697509766, + "learning_rate": 8.099085166743208e-05, + "loss": 2.4983, + "step": 9196 + }, + { + "epoch": 0.6172947216536357, + "grad_norm": 4.928914546966553, + "learning_rate": 8.098232207149907e-05, + "loss": 2.6329, + "step": 9198 + }, + { + "epoch": 0.6174289453374048, + "grad_norm": 4.065887451171875, + "learning_rate": 8.097379101171225e-05, + "loss": 2.5516, + "step": 9200 + }, + { + "epoch": 0.6175631690211738, + "grad_norm": 3.9749040603637695, + "learning_rate": 8.096525848847473e-05, + "loss": 2.5832, + "step": 9202 + }, + { + "epoch": 0.6176973927049428, + "grad_norm": 3.646329879760742, + "learning_rate": 8.095672450218964e-05, + "loss": 2.0489, + "step": 9204 + }, + { + "epoch": 0.6178316163887118, + "grad_norm": 4.094140529632568, + "learning_rate": 8.094818905326019e-05, + "loss": 2.5117, + "step": 9206 + }, + { + "epoch": 0.6179658400724808, + "grad_norm": 5.7615485191345215, + "learning_rate": 8.093965214208964e-05, + "loss": 2.5358, + "step": 9208 + }, + { + "epoch": 0.6181000637562498, + "grad_norm": 4.09364652633667, + "learning_rate": 8.09311137690814e-05, + "loss": 2.6483, + "step": 9210 + }, + { + "epoch": 0.6182342874400187, + "grad_norm": 3.7803707122802734, + "learning_rate": 8.092257393463882e-05, + "loss": 2.4514, + "step": 9212 + }, + { + "epoch": 0.6183685111237878, + "grad_norm": 4.557623386383057, + "learning_rate": 8.091403263916546e-05, + "loss": 2.6301, + "step": 9214 + }, + { + "epoch": 0.6185027348075568, + "grad_norm": 3.7583541870117188, + "learning_rate": 8.090548988306483e-05, + "loss": 2.3319, + "step": 9216 + }, + { + "epoch": 0.6186369584913258, + "grad_norm": 4.801093578338623, + "learning_rate": 8.08969456667406e-05, + "loss": 2.7094, + "step": 9218 + }, + { + "epoch": 0.6187711821750947, + "grad_norm": 4.031949996948242, + "learning_rate": 8.088839999059642e-05, + "loss": 2.8114, + "step": 9220 + }, + { + "epoch": 0.6189054058588638, + "grad_norm": 4.149991989135742, + "learning_rate": 8.087985285503609e-05, + "loss": 2.3211, + "step": 9222 + }, + { + "epoch": 0.6190396295426328, + "grad_norm": 4.101548194885254, + "learning_rate": 8.087130426046343e-05, + "loss": 2.2708, + "step": 9224 + }, + { + "epoch": 0.6191738532264018, + "grad_norm": 4.2072319984436035, + "learning_rate": 8.086275420728235e-05, + "loss": 2.4102, + "step": 9226 + }, + { + "epoch": 0.6193080769101708, + "grad_norm": 3.980923891067505, + "learning_rate": 8.085420269589682e-05, + "loss": 2.2091, + "step": 9228 + }, + { + "epoch": 0.6194423005939398, + "grad_norm": 5.351043701171875, + "learning_rate": 8.08456497267109e-05, + "loss": 2.7141, + "step": 9230 + }, + { + "epoch": 0.6195765242777088, + "grad_norm": 4.446384906768799, + "learning_rate": 8.083709530012867e-05, + "loss": 2.4744, + "step": 9232 + }, + { + "epoch": 0.6197107479614778, + "grad_norm": 4.167870998382568, + "learning_rate": 8.082853941655433e-05, + "loss": 2.5855, + "step": 9234 + }, + { + "epoch": 0.6198449716452468, + "grad_norm": 4.381982803344727, + "learning_rate": 8.081998207639212e-05, + "loss": 2.6089, + "step": 9236 + }, + { + "epoch": 0.6199791953290158, + "grad_norm": 11.192785263061523, + "learning_rate": 8.081142328004637e-05, + "loss": 2.5443, + "step": 9238 + }, + { + "epoch": 0.6201134190127848, + "grad_norm": 3.9837169647216797, + "learning_rate": 8.080286302792146e-05, + "loss": 2.7552, + "step": 9240 + }, + { + "epoch": 0.6202476426965539, + "grad_norm": 4.570556640625, + "learning_rate": 8.079430132042183e-05, + "loss": 2.2639, + "step": 9242 + }, + { + "epoch": 0.6203818663803228, + "grad_norm": 3.8627026081085205, + "learning_rate": 8.078573815795203e-05, + "loss": 2.7416, + "step": 9244 + }, + { + "epoch": 0.6205160900640918, + "grad_norm": 3.639782190322876, + "learning_rate": 8.077717354091663e-05, + "loss": 2.3016, + "step": 9246 + }, + { + "epoch": 0.6206503137478608, + "grad_norm": 3.8367433547973633, + "learning_rate": 8.07686074697203e-05, + "loss": 2.3277, + "step": 9248 + }, + { + "epoch": 0.6207845374316298, + "grad_norm": 16.4312744140625, + "learning_rate": 8.076003994476778e-05, + "loss": 2.6499, + "step": 9250 + }, + { + "epoch": 0.6209187611153988, + "grad_norm": 5.410732269287109, + "learning_rate": 8.075147096646387e-05, + "loss": 2.574, + "step": 9252 + }, + { + "epoch": 0.6210529847991678, + "grad_norm": 4.034197807312012, + "learning_rate": 8.074290053521341e-05, + "loss": 2.3711, + "step": 9254 + }, + { + "epoch": 0.6211872084829368, + "grad_norm": 4.532978057861328, + "learning_rate": 8.073432865142135e-05, + "loss": 2.7534, + "step": 9256 + }, + { + "epoch": 0.6213214321667058, + "grad_norm": 4.570117950439453, + "learning_rate": 8.07257553154927e-05, + "loss": 2.6578, + "step": 9258 + }, + { + "epoch": 0.6214556558504748, + "grad_norm": 3.7817113399505615, + "learning_rate": 8.071718052783253e-05, + "loss": 2.4789, + "step": 9260 + }, + { + "epoch": 0.6215898795342438, + "grad_norm": 3.852466106414795, + "learning_rate": 8.070860428884599e-05, + "loss": 2.3371, + "step": 9262 + }, + { + "epoch": 0.6217241032180129, + "grad_norm": 4.30131196975708, + "learning_rate": 8.070002659893829e-05, + "loss": 2.3021, + "step": 9264 + }, + { + "epoch": 0.6218583269017818, + "grad_norm": 4.073439598083496, + "learning_rate": 8.069144745851469e-05, + "loss": 2.3607, + "step": 9266 + }, + { + "epoch": 0.6219925505855508, + "grad_norm": 4.053984642028809, + "learning_rate": 8.068286686798054e-05, + "loss": 2.4579, + "step": 9268 + }, + { + "epoch": 0.6221267742693198, + "grad_norm": 3.6464343070983887, + "learning_rate": 8.067428482774129e-05, + "loss": 2.788, + "step": 9270 + }, + { + "epoch": 0.6222609979530889, + "grad_norm": 4.2018208503723145, + "learning_rate": 8.06657013382024e-05, + "loss": 2.4198, + "step": 9272 + }, + { + "epoch": 0.6223952216368578, + "grad_norm": 4.467110633850098, + "learning_rate": 8.065711639976939e-05, + "loss": 2.4752, + "step": 9274 + }, + { + "epoch": 0.6225294453206268, + "grad_norm": 6.980469226837158, + "learning_rate": 8.064853001284793e-05, + "loss": 3.0093, + "step": 9276 + }, + { + "epoch": 0.6226636690043958, + "grad_norm": 4.0777482986450195, + "learning_rate": 8.063994217784372e-05, + "loss": 2.5327, + "step": 9278 + }, + { + "epoch": 0.6227978926881649, + "grad_norm": 4.442088603973389, + "learning_rate": 8.063135289516245e-05, + "loss": 2.522, + "step": 9280 + }, + { + "epoch": 0.6229321163719338, + "grad_norm": 5.462404251098633, + "learning_rate": 8.062276216521003e-05, + "loss": 2.466, + "step": 9282 + }, + { + "epoch": 0.6230663400557028, + "grad_norm": 3.824766159057617, + "learning_rate": 8.061416998839231e-05, + "loss": 2.4338, + "step": 9284 + }, + { + "epoch": 0.6232005637394719, + "grad_norm": 3.9678351879119873, + "learning_rate": 8.060557636511523e-05, + "loss": 2.3745, + "step": 9286 + }, + { + "epoch": 0.6233347874232408, + "grad_norm": 4.467328071594238, + "learning_rate": 8.059698129578486e-05, + "loss": 2.4897, + "step": 9288 + }, + { + "epoch": 0.6234690111070098, + "grad_norm": 4.755666255950928, + "learning_rate": 8.058838478080731e-05, + "loss": 2.4956, + "step": 9290 + }, + { + "epoch": 0.6236032347907788, + "grad_norm": 4.179197311401367, + "learning_rate": 8.05797868205887e-05, + "loss": 2.6838, + "step": 9292 + }, + { + "epoch": 0.6237374584745479, + "grad_norm": 3.495025157928467, + "learning_rate": 8.057118741553533e-05, + "loss": 2.2205, + "step": 9294 + }, + { + "epoch": 0.6238716821583168, + "grad_norm": 4.280924320220947, + "learning_rate": 8.056258656605344e-05, + "loss": 2.4781, + "step": 9296 + }, + { + "epoch": 0.6240059058420858, + "grad_norm": 4.744064807891846, + "learning_rate": 8.055398427254945e-05, + "loss": 2.5059, + "step": 9298 + }, + { + "epoch": 0.6241401295258548, + "grad_norm": 4.356153964996338, + "learning_rate": 8.054538053542978e-05, + "loss": 2.4885, + "step": 9300 + }, + { + "epoch": 0.6242743532096239, + "grad_norm": 5.05357027053833, + "learning_rate": 8.053677535510094e-05, + "loss": 2.4824, + "step": 9302 + }, + { + "epoch": 0.6244085768933928, + "grad_norm": 3.808603286743164, + "learning_rate": 8.052816873196952e-05, + "loss": 2.3591, + "step": 9304 + }, + { + "epoch": 0.6245428005771618, + "grad_norm": 4.230475902557373, + "learning_rate": 8.051956066644217e-05, + "loss": 2.4937, + "step": 9306 + }, + { + "epoch": 0.6246770242609309, + "grad_norm": 3.849309206008911, + "learning_rate": 8.051095115892557e-05, + "loss": 2.7045, + "step": 9308 + }, + { + "epoch": 0.6248112479446999, + "grad_norm": 4.075511932373047, + "learning_rate": 8.050234020982653e-05, + "loss": 2.6007, + "step": 9310 + }, + { + "epoch": 0.6249454716284688, + "grad_norm": 3.6101858615875244, + "learning_rate": 8.04937278195519e-05, + "loss": 2.4012, + "step": 9312 + }, + { + "epoch": 0.6250796953122378, + "grad_norm": 4.278129577636719, + "learning_rate": 8.04851139885086e-05, + "loss": 2.3744, + "step": 9314 + }, + { + "epoch": 0.6252139189960069, + "grad_norm": 3.817182779312134, + "learning_rate": 8.047649871710359e-05, + "loss": 2.6773, + "step": 9316 + }, + { + "epoch": 0.6253481426797759, + "grad_norm": 4.634020805358887, + "learning_rate": 8.046788200574395e-05, + "loss": 2.7671, + "step": 9318 + }, + { + "epoch": 0.6254823663635448, + "grad_norm": 4.505612373352051, + "learning_rate": 8.045926385483682e-05, + "loss": 2.6756, + "step": 9320 + }, + { + "epoch": 0.6256165900473138, + "grad_norm": 3.840406656265259, + "learning_rate": 8.045064426478935e-05, + "loss": 2.6635, + "step": 9322 + }, + { + "epoch": 0.6257508137310829, + "grad_norm": 3.9256153106689453, + "learning_rate": 8.044202323600882e-05, + "loss": 2.2672, + "step": 9324 + }, + { + "epoch": 0.6258850374148518, + "grad_norm": 4.101027965545654, + "learning_rate": 8.043340076890256e-05, + "loss": 2.4367, + "step": 9326 + }, + { + "epoch": 0.6260192610986208, + "grad_norm": 4.525516033172607, + "learning_rate": 8.042477686387794e-05, + "loss": 2.3154, + "step": 9328 + }, + { + "epoch": 0.6261534847823899, + "grad_norm": 4.143069267272949, + "learning_rate": 8.041615152134247e-05, + "loss": 2.4331, + "step": 9330 + }, + { + "epoch": 0.6262877084661589, + "grad_norm": 4.256640434265137, + "learning_rate": 8.040752474170364e-05, + "loss": 2.3915, + "step": 9332 + }, + { + "epoch": 0.6264219321499278, + "grad_norm": 4.3398542404174805, + "learning_rate": 8.039889652536905e-05, + "loss": 2.2939, + "step": 9334 + }, + { + "epoch": 0.6265561558336968, + "grad_norm": 4.245126724243164, + "learning_rate": 8.039026687274638e-05, + "loss": 2.3981, + "step": 9336 + }, + { + "epoch": 0.6266903795174659, + "grad_norm": 4.001992225646973, + "learning_rate": 8.038163578424336e-05, + "loss": 2.6281, + "step": 9338 + }, + { + "epoch": 0.6268246032012349, + "grad_norm": 4.49854040145874, + "learning_rate": 8.037300326026779e-05, + "loss": 2.7282, + "step": 9340 + }, + { + "epoch": 0.6269588268850038, + "grad_norm": 4.579887390136719, + "learning_rate": 8.036436930122754e-05, + "loss": 2.5799, + "step": 9342 + }, + { + "epoch": 0.6270930505687728, + "grad_norm": 4.254425525665283, + "learning_rate": 8.035573390753054e-05, + "loss": 2.4442, + "step": 9344 + }, + { + "epoch": 0.6272272742525419, + "grad_norm": 4.500899791717529, + "learning_rate": 8.034709707958483e-05, + "loss": 2.8233, + "step": 9346 + }, + { + "epoch": 0.6273614979363109, + "grad_norm": 4.100552082061768, + "learning_rate": 8.033845881779845e-05, + "loss": 2.4384, + "step": 9348 + }, + { + "epoch": 0.6274957216200798, + "grad_norm": 4.370426177978516, + "learning_rate": 8.032981912257955e-05, + "loss": 2.5465, + "step": 9350 + }, + { + "epoch": 0.6276299453038489, + "grad_norm": 3.549281597137451, + "learning_rate": 8.032117799433634e-05, + "loss": 2.3638, + "step": 9352 + }, + { + "epoch": 0.6277641689876179, + "grad_norm": 4.308472633361816, + "learning_rate": 8.031253543347708e-05, + "loss": 2.9694, + "step": 9354 + }, + { + "epoch": 0.6278983926713869, + "grad_norm": 3.8870790004730225, + "learning_rate": 8.030389144041014e-05, + "loss": 2.3204, + "step": 9356 + }, + { + "epoch": 0.6280326163551558, + "grad_norm": 3.9747726917266846, + "learning_rate": 8.029524601554392e-05, + "loss": 2.3589, + "step": 9358 + }, + { + "epoch": 0.6281668400389249, + "grad_norm": 4.659850120544434, + "learning_rate": 8.028659915928689e-05, + "loss": 2.5505, + "step": 9360 + }, + { + "epoch": 0.6283010637226939, + "grad_norm": 3.6822569370269775, + "learning_rate": 8.027795087204761e-05, + "loss": 2.4061, + "step": 9362 + }, + { + "epoch": 0.6284352874064628, + "grad_norm": 3.8036937713623047, + "learning_rate": 8.026930115423469e-05, + "loss": 2.4698, + "step": 9364 + }, + { + "epoch": 0.6285695110902318, + "grad_norm": 4.725591659545898, + "learning_rate": 8.026065000625681e-05, + "loss": 2.1441, + "step": 9366 + }, + { + "epoch": 0.6287037347740009, + "grad_norm": 4.165973663330078, + "learning_rate": 8.025199742852272e-05, + "loss": 2.645, + "step": 9368 + }, + { + "epoch": 0.6288379584577699, + "grad_norm": 3.7307209968566895, + "learning_rate": 8.024334342144124e-05, + "loss": 2.5469, + "step": 9370 + }, + { + "epoch": 0.6289721821415388, + "grad_norm": 3.941166400909424, + "learning_rate": 8.023468798542127e-05, + "loss": 2.2567, + "step": 9372 + }, + { + "epoch": 0.6291064058253079, + "grad_norm": 4.061511516571045, + "learning_rate": 8.022603112087174e-05, + "loss": 2.3791, + "step": 9374 + }, + { + "epoch": 0.6292406295090769, + "grad_norm": 4.037644386291504, + "learning_rate": 8.021737282820167e-05, + "loss": 2.3999, + "step": 9376 + }, + { + "epoch": 0.6293748531928459, + "grad_norm": 4.442147731781006, + "learning_rate": 8.020871310782015e-05, + "loss": 2.6047, + "step": 9378 + }, + { + "epoch": 0.6295090768766148, + "grad_norm": 4.299973487854004, + "learning_rate": 8.020005196013636e-05, + "loss": 2.6416, + "step": 9380 + }, + { + "epoch": 0.6296433005603839, + "grad_norm": 4.174050807952881, + "learning_rate": 8.019138938555947e-05, + "loss": 2.5022, + "step": 9382 + }, + { + "epoch": 0.6297775242441529, + "grad_norm": 4.1872358322143555, + "learning_rate": 8.018272538449882e-05, + "loss": 2.4575, + "step": 9384 + }, + { + "epoch": 0.6299117479279219, + "grad_norm": 4.186075210571289, + "learning_rate": 8.017405995736374e-05, + "loss": 2.5613, + "step": 9386 + }, + { + "epoch": 0.6300459716116908, + "grad_norm": 3.8855340480804443, + "learning_rate": 8.016539310456367e-05, + "loss": 2.2519, + "step": 9388 + }, + { + "epoch": 0.6301801952954599, + "grad_norm": 3.8435158729553223, + "learning_rate": 8.01567248265081e-05, + "loss": 2.4713, + "step": 9390 + }, + { + "epoch": 0.6303144189792289, + "grad_norm": 6.876553535461426, + "learning_rate": 8.014805512360655e-05, + "loss": 2.2506, + "step": 9392 + }, + { + "epoch": 0.6304486426629979, + "grad_norm": 5.10235595703125, + "learning_rate": 8.01393839962687e-05, + "loss": 2.5857, + "step": 9394 + }, + { + "epoch": 0.6305828663467669, + "grad_norm": 3.86175537109375, + "learning_rate": 8.013071144490423e-05, + "loss": 2.2823, + "step": 9396 + }, + { + "epoch": 0.6307170900305359, + "grad_norm": 3.6441540718078613, + "learning_rate": 8.012203746992288e-05, + "loss": 2.5069, + "step": 9398 + }, + { + "epoch": 0.6308513137143049, + "grad_norm": 3.650243043899536, + "learning_rate": 8.01133620717345e-05, + "loss": 2.209, + "step": 9400 + }, + { + "epoch": 0.6309855373980738, + "grad_norm": 5.420849323272705, + "learning_rate": 8.010468525074897e-05, + "loss": 2.7766, + "step": 9402 + }, + { + "epoch": 0.6311197610818429, + "grad_norm": 4.422460079193115, + "learning_rate": 8.009600700737627e-05, + "loss": 2.3327, + "step": 9404 + }, + { + "epoch": 0.6312539847656119, + "grad_norm": 5.004990100860596, + "learning_rate": 8.008732734202642e-05, + "loss": 2.6519, + "step": 9406 + }, + { + "epoch": 0.6313882084493809, + "grad_norm": 4.267537593841553, + "learning_rate": 8.007864625510951e-05, + "loss": 2.5225, + "step": 9408 + }, + { + "epoch": 0.6315224321331498, + "grad_norm": 4.220572471618652, + "learning_rate": 8.006996374703572e-05, + "loss": 2.2272, + "step": 9410 + }, + { + "epoch": 0.6316566558169189, + "grad_norm": 4.380600929260254, + "learning_rate": 8.006127981821526e-05, + "loss": 2.2293, + "step": 9412 + }, + { + "epoch": 0.6317908795006879, + "grad_norm": 4.441895008087158, + "learning_rate": 8.005259446905845e-05, + "loss": 2.3583, + "step": 9414 + }, + { + "epoch": 0.6319251031844569, + "grad_norm": 4.247764587402344, + "learning_rate": 8.004390769997565e-05, + "loss": 2.9793, + "step": 9416 + }, + { + "epoch": 0.6320593268682259, + "grad_norm": 4.3163957595825195, + "learning_rate": 8.003521951137728e-05, + "loss": 2.5302, + "step": 9418 + }, + { + "epoch": 0.6321935505519949, + "grad_norm": 4.522477149963379, + "learning_rate": 8.002652990367385e-05, + "loss": 2.5217, + "step": 9420 + }, + { + "epoch": 0.6323277742357639, + "grad_norm": 6.229316234588623, + "learning_rate": 8.001783887727594e-05, + "loss": 2.6598, + "step": 9422 + }, + { + "epoch": 0.632461997919533, + "grad_norm": 4.097538471221924, + "learning_rate": 8.000914643259416e-05, + "loss": 2.7025, + "step": 9424 + }, + { + "epoch": 0.6325962216033019, + "grad_norm": 4.399979591369629, + "learning_rate": 8.000045257003923e-05, + "loss": 2.5217, + "step": 9426 + }, + { + "epoch": 0.6327304452870709, + "grad_norm": 4.077821731567383, + "learning_rate": 7.999175729002191e-05, + "loss": 2.4088, + "step": 9428 + }, + { + "epoch": 0.6328646689708399, + "grad_norm": 4.405796527862549, + "learning_rate": 7.998306059295301e-05, + "loss": 2.4534, + "step": 9430 + }, + { + "epoch": 0.632998892654609, + "grad_norm": 3.882195234298706, + "learning_rate": 7.997436247924348e-05, + "loss": 2.3328, + "step": 9432 + }, + { + "epoch": 0.6331331163383779, + "grad_norm": 4.063603401184082, + "learning_rate": 7.996566294930428e-05, + "loss": 2.4795, + "step": 9434 + }, + { + "epoch": 0.6332673400221469, + "grad_norm": 4.9280290603637695, + "learning_rate": 7.99569620035464e-05, + "loss": 2.5797, + "step": 9436 + }, + { + "epoch": 0.6334015637059159, + "grad_norm": 4.13392972946167, + "learning_rate": 7.994825964238099e-05, + "loss": 2.8288, + "step": 9438 + }, + { + "epoch": 0.6335357873896849, + "grad_norm": 4.155078887939453, + "learning_rate": 7.993955586621918e-05, + "loss": 2.6033, + "step": 9440 + }, + { + "epoch": 0.6336700110734539, + "grad_norm": 4.058880805969238, + "learning_rate": 7.993085067547226e-05, + "loss": 2.4616, + "step": 9442 + }, + { + "epoch": 0.6338042347572229, + "grad_norm": 4.63592004776001, + "learning_rate": 7.992214407055148e-05, + "loss": 2.5441, + "step": 9444 + }, + { + "epoch": 0.633938458440992, + "grad_norm": 4.5437164306640625, + "learning_rate": 7.991343605186826e-05, + "loss": 2.4853, + "step": 9446 + }, + { + "epoch": 0.6340726821247609, + "grad_norm": 4.211022853851318, + "learning_rate": 7.990472661983398e-05, + "loss": 2.6395, + "step": 9448 + }, + { + "epoch": 0.6342069058085299, + "grad_norm": 3.8164844512939453, + "learning_rate": 7.989601577486017e-05, + "loss": 2.3864, + "step": 9450 + }, + { + "epoch": 0.6343411294922989, + "grad_norm": 4.575959205627441, + "learning_rate": 7.988730351735843e-05, + "loss": 2.5408, + "step": 9452 + }, + { + "epoch": 0.634475353176068, + "grad_norm": 5.870684623718262, + "learning_rate": 7.987858984774035e-05, + "loss": 2.4985, + "step": 9454 + }, + { + "epoch": 0.6346095768598369, + "grad_norm": 4.374214172363281, + "learning_rate": 7.986987476641764e-05, + "loss": 3.0478, + "step": 9456 + }, + { + "epoch": 0.6347438005436059, + "grad_norm": 4.228574275970459, + "learning_rate": 7.98611582738021e-05, + "loss": 2.3916, + "step": 9458 + }, + { + "epoch": 0.6348780242273749, + "grad_norm": 4.101493835449219, + "learning_rate": 7.985244037030556e-05, + "loss": 2.224, + "step": 9460 + }, + { + "epoch": 0.635012247911144, + "grad_norm": 3.4139280319213867, + "learning_rate": 7.984372105633991e-05, + "loss": 2.4691, + "step": 9462 + }, + { + "epoch": 0.6351464715949129, + "grad_norm": 4.085309982299805, + "learning_rate": 7.983500033231711e-05, + "loss": 2.343, + "step": 9464 + }, + { + "epoch": 0.6352806952786819, + "grad_norm": 4.260854721069336, + "learning_rate": 7.982627819864923e-05, + "loss": 2.5984, + "step": 9466 + }, + { + "epoch": 0.635414918962451, + "grad_norm": 4.156835556030273, + "learning_rate": 7.981755465574834e-05, + "loss": 2.3087, + "step": 9468 + }, + { + "epoch": 0.63554914264622, + "grad_norm": 5.836766719818115, + "learning_rate": 7.980882970402664e-05, + "loss": 2.421, + "step": 9470 + }, + { + "epoch": 0.6356833663299889, + "grad_norm": 4.085309982299805, + "learning_rate": 7.980010334389636e-05, + "loss": 2.5827, + "step": 9472 + }, + { + "epoch": 0.6358175900137579, + "grad_norm": 4.191207408905029, + "learning_rate": 7.979137557576978e-05, + "loss": 2.8001, + "step": 9474 + }, + { + "epoch": 0.635951813697527, + "grad_norm": 3.5483057498931885, + "learning_rate": 7.978264640005928e-05, + "loss": 2.296, + "step": 9476 + }, + { + "epoch": 0.6360860373812959, + "grad_norm": 3.8344576358795166, + "learning_rate": 7.977391581717733e-05, + "loss": 2.2078, + "step": 9478 + }, + { + "epoch": 0.6362202610650649, + "grad_norm": 5.572956562042236, + "learning_rate": 7.976518382753637e-05, + "loss": 2.4494, + "step": 9480 + }, + { + "epoch": 0.6363544847488339, + "grad_norm": 4.167756080627441, + "learning_rate": 7.975645043154903e-05, + "loss": 2.5183, + "step": 9482 + }, + { + "epoch": 0.636488708432603, + "grad_norm": 4.781607627868652, + "learning_rate": 7.974771562962791e-05, + "loss": 2.7102, + "step": 9484 + }, + { + "epoch": 0.6366229321163719, + "grad_norm": 4.014424800872803, + "learning_rate": 7.973897942218573e-05, + "loss": 2.6892, + "step": 9486 + }, + { + "epoch": 0.6367571558001409, + "grad_norm": 3.578322172164917, + "learning_rate": 7.973024180963526e-05, + "loss": 2.4178, + "step": 9488 + }, + { + "epoch": 0.63689137948391, + "grad_norm": 6.678563594818115, + "learning_rate": 7.97215027923893e-05, + "loss": 2.4371, + "step": 9490 + }, + { + "epoch": 0.637025603167679, + "grad_norm": 4.042203426361084, + "learning_rate": 7.97127623708608e-05, + "loss": 2.3325, + "step": 9492 + }, + { + "epoch": 0.6371598268514479, + "grad_norm": 4.701484203338623, + "learning_rate": 7.97040205454627e-05, + "loss": 2.6546, + "step": 9494 + }, + { + "epoch": 0.6372940505352169, + "grad_norm": 4.146543979644775, + "learning_rate": 7.969527731660805e-05, + "loss": 2.573, + "step": 9496 + }, + { + "epoch": 0.637428274218986, + "grad_norm": 4.032986164093018, + "learning_rate": 7.968653268470992e-05, + "loss": 2.6528, + "step": 9498 + }, + { + "epoch": 0.637562497902755, + "grad_norm": 4.569640159606934, + "learning_rate": 7.967778665018151e-05, + "loss": 2.354, + "step": 9500 + }, + { + "epoch": 0.6376967215865239, + "grad_norm": 3.2581264972686768, + "learning_rate": 7.966903921343603e-05, + "loss": 2.3191, + "step": 9502 + }, + { + "epoch": 0.6378309452702929, + "grad_norm": 5.032060146331787, + "learning_rate": 7.966029037488681e-05, + "loss": 2.6928, + "step": 9504 + }, + { + "epoch": 0.637965168954062, + "grad_norm": 3.672466516494751, + "learning_rate": 7.965154013494717e-05, + "loss": 2.4766, + "step": 9506 + }, + { + "epoch": 0.638099392637831, + "grad_norm": 4.299150466918945, + "learning_rate": 7.964278849403057e-05, + "loss": 2.5789, + "step": 9508 + }, + { + "epoch": 0.6382336163215999, + "grad_norm": 4.194480895996094, + "learning_rate": 7.96340354525505e-05, + "loss": 2.3736, + "step": 9510 + }, + { + "epoch": 0.638367840005369, + "grad_norm": 4.198761940002441, + "learning_rate": 7.962528101092054e-05, + "loss": 2.405, + "step": 9512 + }, + { + "epoch": 0.638502063689138, + "grad_norm": 4.252949237823486, + "learning_rate": 7.96165251695543e-05, + "loss": 2.4748, + "step": 9514 + }, + { + "epoch": 0.6386362873729069, + "grad_norm": 3.6461381912231445, + "learning_rate": 7.96077679288655e-05, + "loss": 2.3031, + "step": 9516 + }, + { + "epoch": 0.6387705110566759, + "grad_norm": 4.829996585845947, + "learning_rate": 7.959900928926788e-05, + "loss": 2.4415, + "step": 9518 + }, + { + "epoch": 0.638904734740445, + "grad_norm": 4.1353373527526855, + "learning_rate": 7.959024925117527e-05, + "loss": 2.5658, + "step": 9520 + }, + { + "epoch": 0.639038958424214, + "grad_norm": 5.272538661956787, + "learning_rate": 7.958148781500156e-05, + "loss": 2.9014, + "step": 9522 + }, + { + "epoch": 0.6391731821079829, + "grad_norm": 4.20519495010376, + "learning_rate": 7.957272498116073e-05, + "loss": 2.425, + "step": 9524 + }, + { + "epoch": 0.6393074057917519, + "grad_norm": 9.202614784240723, + "learning_rate": 7.956396075006681e-05, + "loss": 2.5366, + "step": 9526 + }, + { + "epoch": 0.639441629475521, + "grad_norm": 3.842808961868286, + "learning_rate": 7.955519512213386e-05, + "loss": 2.534, + "step": 9528 + }, + { + "epoch": 0.63957585315929, + "grad_norm": 7.609048843383789, + "learning_rate": 7.954642809777606e-05, + "loss": 2.5545, + "step": 9530 + }, + { + "epoch": 0.6397100768430589, + "grad_norm": 5.3016462326049805, + "learning_rate": 7.953765967740764e-05, + "loss": 2.2747, + "step": 9532 + }, + { + "epoch": 0.639844300526828, + "grad_norm": 4.231348991394043, + "learning_rate": 7.952888986144288e-05, + "loss": 2.5054, + "step": 9534 + }, + { + "epoch": 0.639978524210597, + "grad_norm": 4.0735626220703125, + "learning_rate": 7.952011865029614e-05, + "loss": 2.4711, + "step": 9536 + }, + { + "epoch": 0.640112747894366, + "grad_norm": 4.737496376037598, + "learning_rate": 7.951134604438183e-05, + "loss": 2.3616, + "step": 9538 + }, + { + "epoch": 0.6402469715781349, + "grad_norm": 3.87019681930542, + "learning_rate": 7.950257204411448e-05, + "loss": 2.5297, + "step": 9540 + }, + { + "epoch": 0.640381195261904, + "grad_norm": 4.12528133392334, + "learning_rate": 7.949379664990859e-05, + "loss": 2.2126, + "step": 9542 + }, + { + "epoch": 0.640515418945673, + "grad_norm": 4.036653518676758, + "learning_rate": 7.948501986217883e-05, + "loss": 2.2454, + "step": 9544 + }, + { + "epoch": 0.640649642629442, + "grad_norm": 5.073432445526123, + "learning_rate": 7.947624168133984e-05, + "loss": 2.8245, + "step": 9546 + }, + { + "epoch": 0.6407838663132109, + "grad_norm": 4.996280193328857, + "learning_rate": 7.94674621078064e-05, + "loss": 2.4986, + "step": 9548 + }, + { + "epoch": 0.64091808999698, + "grad_norm": 4.797095775604248, + "learning_rate": 7.945868114199332e-05, + "loss": 2.5961, + "step": 9550 + }, + { + "epoch": 0.641052313680749, + "grad_norm": 4.3009490966796875, + "learning_rate": 7.944989878431548e-05, + "loss": 2.2691, + "step": 9552 + }, + { + "epoch": 0.6411865373645179, + "grad_norm": 3.7648074626922607, + "learning_rate": 7.944111503518782e-05, + "loss": 2.2263, + "step": 9554 + }, + { + "epoch": 0.641320761048287, + "grad_norm": 4.57593297958374, + "learning_rate": 7.943232989502541e-05, + "loss": 2.4929, + "step": 9556 + }, + { + "epoch": 0.641454984732056, + "grad_norm": 4.440011024475098, + "learning_rate": 7.942354336424325e-05, + "loss": 2.5827, + "step": 9558 + }, + { + "epoch": 0.641589208415825, + "grad_norm": 4.309257984161377, + "learning_rate": 7.941475544325654e-05, + "loss": 2.321, + "step": 9560 + }, + { + "epoch": 0.6417234320995939, + "grad_norm": 4.4259114265441895, + "learning_rate": 7.940596613248048e-05, + "loss": 2.6444, + "step": 9562 + }, + { + "epoch": 0.641857655783363, + "grad_norm": 3.782869577407837, + "learning_rate": 7.939717543233034e-05, + "loss": 2.6536, + "step": 9564 + }, + { + "epoch": 0.641991879467132, + "grad_norm": 4.188439846038818, + "learning_rate": 7.938838334322147e-05, + "loss": 2.4968, + "step": 9566 + }, + { + "epoch": 0.642126103150901, + "grad_norm": 4.07708215713501, + "learning_rate": 7.937958986556927e-05, + "loss": 2.5138, + "step": 9568 + }, + { + "epoch": 0.6422603268346699, + "grad_norm": 5.017003059387207, + "learning_rate": 7.937079499978923e-05, + "loss": 2.3489, + "step": 9570 + }, + { + "epoch": 0.642394550518439, + "grad_norm": 3.858734607696533, + "learning_rate": 7.936199874629689e-05, + "loss": 2.6037, + "step": 9572 + }, + { + "epoch": 0.642528774202208, + "grad_norm": 4.179551601409912, + "learning_rate": 7.935320110550783e-05, + "loss": 2.2769, + "step": 9574 + }, + { + "epoch": 0.642662997885977, + "grad_norm": 5.47299861907959, + "learning_rate": 7.934440207783773e-05, + "loss": 2.4851, + "step": 9576 + }, + { + "epoch": 0.642797221569746, + "grad_norm": 4.137773513793945, + "learning_rate": 7.933560166370235e-05, + "loss": 2.3742, + "step": 9578 + }, + { + "epoch": 0.642931445253515, + "grad_norm": 4.7056660652160645, + "learning_rate": 7.932679986351748e-05, + "loss": 2.501, + "step": 9580 + }, + { + "epoch": 0.643065668937284, + "grad_norm": 4.263729095458984, + "learning_rate": 7.931799667769899e-05, + "loss": 2.4784, + "step": 9582 + }, + { + "epoch": 0.643199892621053, + "grad_norm": 3.685204267501831, + "learning_rate": 7.93091921066628e-05, + "loss": 2.5429, + "step": 9584 + }, + { + "epoch": 0.643334116304822, + "grad_norm": 3.981276273727417, + "learning_rate": 7.930038615082491e-05, + "loss": 2.3056, + "step": 9586 + }, + { + "epoch": 0.643468339988591, + "grad_norm": 4.582936763763428, + "learning_rate": 7.92915788106014e-05, + "loss": 2.4387, + "step": 9588 + }, + { + "epoch": 0.64360256367236, + "grad_norm": 4.945670127868652, + "learning_rate": 7.928277008640838e-05, + "loss": 2.2671, + "step": 9590 + }, + { + "epoch": 0.6437367873561289, + "grad_norm": 6.517011642456055, + "learning_rate": 7.927395997866207e-05, + "loss": 2.2875, + "step": 9592 + }, + { + "epoch": 0.643871011039898, + "grad_norm": 5.871518135070801, + "learning_rate": 7.926514848777871e-05, + "loss": 2.4965, + "step": 9594 + }, + { + "epoch": 0.644005234723667, + "grad_norm": 5.556583404541016, + "learning_rate": 7.925633561417462e-05, + "loss": 2.3854, + "step": 9596 + }, + { + "epoch": 0.644139458407436, + "grad_norm": 4.378159999847412, + "learning_rate": 7.924752135826623e-05, + "loss": 2.41, + "step": 9598 + }, + { + "epoch": 0.644273682091205, + "grad_norm": 6.121932029724121, + "learning_rate": 7.923870572046994e-05, + "loss": 2.3893, + "step": 9600 + }, + { + "epoch": 0.644407905774974, + "grad_norm": 4.871433258056641, + "learning_rate": 7.922988870120232e-05, + "loss": 2.689, + "step": 9602 + }, + { + "epoch": 0.644542129458743, + "grad_norm": 4.098337650299072, + "learning_rate": 7.922107030087992e-05, + "loss": 2.3227, + "step": 9604 + }, + { + "epoch": 0.644676353142512, + "grad_norm": 4.302072048187256, + "learning_rate": 7.921225051991942e-05, + "loss": 2.425, + "step": 9606 + }, + { + "epoch": 0.644810576826281, + "grad_norm": 5.249427795410156, + "learning_rate": 7.920342935873752e-05, + "loss": 2.7733, + "step": 9608 + }, + { + "epoch": 0.64494480051005, + "grad_norm": 4.137930870056152, + "learning_rate": 7.919460681775101e-05, + "loss": 2.409, + "step": 9610 + }, + { + "epoch": 0.645079024193819, + "grad_norm": 3.85347580909729, + "learning_rate": 7.918578289737675e-05, + "loss": 2.4217, + "step": 9612 + }, + { + "epoch": 0.645213247877588, + "grad_norm": 4.481478214263916, + "learning_rate": 7.917695759803163e-05, + "loss": 2.4965, + "step": 9614 + }, + { + "epoch": 0.645347471561357, + "grad_norm": 4.322864532470703, + "learning_rate": 7.916813092013264e-05, + "loss": 2.33, + "step": 9616 + }, + { + "epoch": 0.645481695245126, + "grad_norm": 3.873746871948242, + "learning_rate": 7.91593028640968e-05, + "loss": 2.4718, + "step": 9618 + }, + { + "epoch": 0.645615918928895, + "grad_norm": 4.588845252990723, + "learning_rate": 7.915047343034128e-05, + "loss": 2.5103, + "step": 9620 + }, + { + "epoch": 0.6457501426126641, + "grad_norm": 4.11820125579834, + "learning_rate": 7.914164261928318e-05, + "loss": 2.3787, + "step": 9622 + }, + { + "epoch": 0.645884366296433, + "grad_norm": 3.3415396213531494, + "learning_rate": 7.913281043133978e-05, + "loss": 2.2161, + "step": 9624 + }, + { + "epoch": 0.646018589980202, + "grad_norm": 4.5580830574035645, + "learning_rate": 7.912397686692837e-05, + "loss": 2.4119, + "step": 9626 + }, + { + "epoch": 0.646152813663971, + "grad_norm": 8.497093200683594, + "learning_rate": 7.911514192646632e-05, + "loss": 2.424, + "step": 9628 + }, + { + "epoch": 0.64628703734774, + "grad_norm": 3.9596495628356934, + "learning_rate": 7.910630561037109e-05, + "loss": 2.6232, + "step": 9630 + }, + { + "epoch": 0.646421261031509, + "grad_norm": 5.552451133728027, + "learning_rate": 7.909746791906013e-05, + "loss": 2.4431, + "step": 9632 + }, + { + "epoch": 0.646555484715278, + "grad_norm": 5.001251220703125, + "learning_rate": 7.908862885295103e-05, + "loss": 2.5478, + "step": 9634 + }, + { + "epoch": 0.646689708399047, + "grad_norm": 5.973057746887207, + "learning_rate": 7.907978841246144e-05, + "loss": 2.2617, + "step": 9636 + }, + { + "epoch": 0.646823932082816, + "grad_norm": 6.307474613189697, + "learning_rate": 7.907094659800902e-05, + "loss": 2.467, + "step": 9638 + }, + { + "epoch": 0.646958155766585, + "grad_norm": 5.672025203704834, + "learning_rate": 7.906210341001152e-05, + "loss": 2.2793, + "step": 9640 + }, + { + "epoch": 0.647092379450354, + "grad_norm": 5.464870452880859, + "learning_rate": 7.905325884888679e-05, + "loss": 2.6466, + "step": 9642 + }, + { + "epoch": 0.6472266031341231, + "grad_norm": 3.6963400840759277, + "learning_rate": 7.90444129150527e-05, + "loss": 2.3165, + "step": 9644 + }, + { + "epoch": 0.647360826817892, + "grad_norm": 5.015190124511719, + "learning_rate": 7.903556560892724e-05, + "loss": 2.747, + "step": 9646 + }, + { + "epoch": 0.647495050501661, + "grad_norm": 4.473080635070801, + "learning_rate": 7.90267169309284e-05, + "loss": 2.5956, + "step": 9648 + }, + { + "epoch": 0.64762927418543, + "grad_norm": 4.296324253082275, + "learning_rate": 7.901786688147426e-05, + "loss": 2.4782, + "step": 9650 + }, + { + "epoch": 0.6477634978691991, + "grad_norm": 4.915452003479004, + "learning_rate": 7.900901546098296e-05, + "loss": 2.6375, + "step": 9652 + }, + { + "epoch": 0.647897721552968, + "grad_norm": 7.077152729034424, + "learning_rate": 7.900016266987274e-05, + "loss": 2.597, + "step": 9654 + }, + { + "epoch": 0.648031945236737, + "grad_norm": 4.372560501098633, + "learning_rate": 7.899130850856184e-05, + "loss": 2.5271, + "step": 9656 + }, + { + "epoch": 0.648166168920506, + "grad_norm": 4.174579620361328, + "learning_rate": 7.898245297746863e-05, + "loss": 2.3691, + "step": 9658 + }, + { + "epoch": 0.6483003926042751, + "grad_norm": 5.6975626945495605, + "learning_rate": 7.897359607701151e-05, + "loss": 2.4429, + "step": 9660 + }, + { + "epoch": 0.648434616288044, + "grad_norm": 4.263298034667969, + "learning_rate": 7.896473780760895e-05, + "loss": 2.3293, + "step": 9662 + }, + { + "epoch": 0.648568839971813, + "grad_norm": 4.074979782104492, + "learning_rate": 7.895587816967948e-05, + "loss": 2.4497, + "step": 9664 + }, + { + "epoch": 0.6487030636555821, + "grad_norm": 4.58725643157959, + "learning_rate": 7.89470171636417e-05, + "loss": 2.4818, + "step": 9666 + }, + { + "epoch": 0.648837287339351, + "grad_norm": 4.3168416023254395, + "learning_rate": 7.89381547899143e-05, + "loss": 2.418, + "step": 9668 + }, + { + "epoch": 0.64897151102312, + "grad_norm": 4.72337007522583, + "learning_rate": 7.892929104891598e-05, + "loss": 2.4134, + "step": 9670 + }, + { + "epoch": 0.649105734706889, + "grad_norm": 3.903315544128418, + "learning_rate": 7.892042594106555e-05, + "loss": 2.2947, + "step": 9672 + }, + { + "epoch": 0.6492399583906581, + "grad_norm": 4.561180591583252, + "learning_rate": 7.891155946678185e-05, + "loss": 2.4594, + "step": 9674 + }, + { + "epoch": 0.649374182074427, + "grad_norm": 4.0424981117248535, + "learning_rate": 7.890269162648382e-05, + "loss": 2.4619, + "step": 9676 + }, + { + "epoch": 0.649508405758196, + "grad_norm": 4.079989910125732, + "learning_rate": 7.889382242059044e-05, + "loss": 2.3836, + "step": 9678 + }, + { + "epoch": 0.649642629441965, + "grad_norm": 5.402851104736328, + "learning_rate": 7.888495184952079e-05, + "loss": 2.5605, + "step": 9680 + }, + { + "epoch": 0.6497768531257341, + "grad_norm": 3.8966150283813477, + "learning_rate": 7.887607991369396e-05, + "loss": 2.5055, + "step": 9682 + }, + { + "epoch": 0.649911076809503, + "grad_norm": 4.805270195007324, + "learning_rate": 7.886720661352913e-05, + "loss": 2.4599, + "step": 9684 + }, + { + "epoch": 0.650045300493272, + "grad_norm": 3.857311248779297, + "learning_rate": 7.885833194944555e-05, + "loss": 2.3665, + "step": 9686 + }, + { + "epoch": 0.6501795241770411, + "grad_norm": 4.817582130432129, + "learning_rate": 7.884945592186255e-05, + "loss": 2.6373, + "step": 9688 + }, + { + "epoch": 0.6503137478608101, + "grad_norm": 4.35537052154541, + "learning_rate": 7.884057853119947e-05, + "loss": 2.359, + "step": 9690 + }, + { + "epoch": 0.650447971544579, + "grad_norm": 4.783344268798828, + "learning_rate": 7.883169977787577e-05, + "loss": 2.5657, + "step": 9692 + }, + { + "epoch": 0.650582195228348, + "grad_norm": 21.795448303222656, + "learning_rate": 7.882281966231094e-05, + "loss": 2.4934, + "step": 9694 + }, + { + "epoch": 0.6507164189121171, + "grad_norm": 4.528155326843262, + "learning_rate": 7.881393818492457e-05, + "loss": 2.2529, + "step": 9696 + }, + { + "epoch": 0.6508506425958861, + "grad_norm": 6.1442389488220215, + "learning_rate": 7.880505534613629e-05, + "loss": 2.4488, + "step": 9698 + }, + { + "epoch": 0.650984866279655, + "grad_norm": 4.287701606750488, + "learning_rate": 7.879617114636577e-05, + "loss": 2.7434, + "step": 9700 + }, + { + "epoch": 0.651119089963424, + "grad_norm": 3.8347396850585938, + "learning_rate": 7.87872855860328e-05, + "loss": 2.4022, + "step": 9702 + }, + { + "epoch": 0.6512533136471931, + "grad_norm": 3.844906806945801, + "learning_rate": 7.877839866555719e-05, + "loss": 2.5909, + "step": 9704 + }, + { + "epoch": 0.651387537330962, + "grad_norm": 3.7077279090881348, + "learning_rate": 7.876951038535883e-05, + "loss": 2.666, + "step": 9706 + }, + { + "epoch": 0.651521761014731, + "grad_norm": 4.342477798461914, + "learning_rate": 7.876062074585768e-05, + "loss": 2.505, + "step": 9708 + }, + { + "epoch": 0.6516559846985001, + "grad_norm": 4.148163795471191, + "learning_rate": 7.875172974747376e-05, + "loss": 2.314, + "step": 9710 + }, + { + "epoch": 0.6517902083822691, + "grad_norm": 3.899130344390869, + "learning_rate": 7.874283739062715e-05, + "loss": 2.0719, + "step": 9712 + }, + { + "epoch": 0.651924432066038, + "grad_norm": 4.345756530761719, + "learning_rate": 7.873394367573798e-05, + "loss": 2.3458, + "step": 9714 + }, + { + "epoch": 0.652058655749807, + "grad_norm": 5.149659633636475, + "learning_rate": 7.87250486032265e-05, + "loss": 2.3458, + "step": 9716 + }, + { + "epoch": 0.6521928794335761, + "grad_norm": 5.543517112731934, + "learning_rate": 7.871615217351294e-05, + "loss": 2.7175, + "step": 9718 + }, + { + "epoch": 0.6523271031173451, + "grad_norm": 5.265548229217529, + "learning_rate": 7.870725438701765e-05, + "loss": 2.6686, + "step": 9720 + }, + { + "epoch": 0.652461326801114, + "grad_norm": 4.000052452087402, + "learning_rate": 7.869835524416104e-05, + "loss": 2.5067, + "step": 9722 + }, + { + "epoch": 0.652595550484883, + "grad_norm": 4.537476062774658, + "learning_rate": 7.86894547453636e-05, + "loss": 2.4677, + "step": 9724 + }, + { + "epoch": 0.6527297741686521, + "grad_norm": 3.6201343536376953, + "learning_rate": 7.868055289104581e-05, + "loss": 2.1954, + "step": 9726 + }, + { + "epoch": 0.6528639978524211, + "grad_norm": 4.2078142166137695, + "learning_rate": 7.867164968162833e-05, + "loss": 2.5474, + "step": 9728 + }, + { + "epoch": 0.65299822153619, + "grad_norm": 4.035112380981445, + "learning_rate": 7.866274511753175e-05, + "loss": 2.5281, + "step": 9730 + }, + { + "epoch": 0.6531324452199591, + "grad_norm": 5.351541519165039, + "learning_rate": 7.865383919917683e-05, + "loss": 2.3404, + "step": 9732 + }, + { + "epoch": 0.6532666689037281, + "grad_norm": 4.342325210571289, + "learning_rate": 7.864493192698437e-05, + "loss": 2.2985, + "step": 9734 + }, + { + "epoch": 0.6534008925874971, + "grad_norm": 4.292960166931152, + "learning_rate": 7.863602330137519e-05, + "loss": 2.441, + "step": 9736 + }, + { + "epoch": 0.653535116271266, + "grad_norm": 3.8541793823242188, + "learning_rate": 7.862711332277023e-05, + "loss": 2.2936, + "step": 9738 + }, + { + "epoch": 0.6536693399550351, + "grad_norm": 3.866708755493164, + "learning_rate": 7.861820199159044e-05, + "loss": 2.3939, + "step": 9740 + }, + { + "epoch": 0.6538035636388041, + "grad_norm": 4.384768486022949, + "learning_rate": 7.860928930825691e-05, + "loss": 2.3454, + "step": 9742 + }, + { + "epoch": 0.653937787322573, + "grad_norm": 4.171535968780518, + "learning_rate": 7.86003752731907e-05, + "loss": 2.603, + "step": 9744 + }, + { + "epoch": 0.654072011006342, + "grad_norm": 4.376555442810059, + "learning_rate": 7.8591459886813e-05, + "loss": 2.5463, + "step": 9746 + }, + { + "epoch": 0.6542062346901111, + "grad_norm": 4.902318477630615, + "learning_rate": 7.858254314954505e-05, + "loss": 2.3787, + "step": 9748 + }, + { + "epoch": 0.6543404583738801, + "grad_norm": 4.393580913543701, + "learning_rate": 7.857362506180813e-05, + "loss": 2.633, + "step": 9750 + }, + { + "epoch": 0.654474682057649, + "grad_norm": 3.803201198577881, + "learning_rate": 7.856470562402362e-05, + "loss": 2.3142, + "step": 9752 + }, + { + "epoch": 0.6546089057414181, + "grad_norm": 4.352086067199707, + "learning_rate": 7.855578483661293e-05, + "loss": 2.4513, + "step": 9754 + }, + { + "epoch": 0.6547431294251871, + "grad_norm": 4.884162902832031, + "learning_rate": 7.854686269999757e-05, + "loss": 2.3829, + "step": 9756 + }, + { + "epoch": 0.6548773531089561, + "grad_norm": 4.209322452545166, + "learning_rate": 7.853793921459909e-05, + "loss": 2.547, + "step": 9758 + }, + { + "epoch": 0.655011576792725, + "grad_norm": 4.284951686859131, + "learning_rate": 7.852901438083908e-05, + "loss": 2.7002, + "step": 9760 + }, + { + "epoch": 0.6551458004764941, + "grad_norm": 3.7635223865509033, + "learning_rate": 7.852008819913925e-05, + "loss": 2.5692, + "step": 9762 + }, + { + "epoch": 0.6552800241602631, + "grad_norm": 4.520583629608154, + "learning_rate": 7.851116066992133e-05, + "loss": 2.313, + "step": 9764 + }, + { + "epoch": 0.6554142478440321, + "grad_norm": 4.254071235656738, + "learning_rate": 7.850223179360714e-05, + "loss": 2.5633, + "step": 9766 + }, + { + "epoch": 0.655548471527801, + "grad_norm": 3.9166440963745117, + "learning_rate": 7.849330157061854e-05, + "loss": 2.3821, + "step": 9768 + }, + { + "epoch": 0.6556826952115701, + "grad_norm": 4.1478495597839355, + "learning_rate": 7.848437000137747e-05, + "loss": 2.4904, + "step": 9770 + }, + { + "epoch": 0.6558169188953391, + "grad_norm": 4.113980293273926, + "learning_rate": 7.847543708630593e-05, + "loss": 2.4345, + "step": 9772 + }, + { + "epoch": 0.6559511425791081, + "grad_norm": 3.9628117084503174, + "learning_rate": 7.8466502825826e-05, + "loss": 2.291, + "step": 9774 + }, + { + "epoch": 0.6560853662628771, + "grad_norm": 3.842691421508789, + "learning_rate": 7.845756722035978e-05, + "loss": 2.3382, + "step": 9776 + }, + { + "epoch": 0.6562195899466461, + "grad_norm": 4.233767986297607, + "learning_rate": 7.844863027032945e-05, + "loss": 2.316, + "step": 9778 + }, + { + "epoch": 0.6563538136304151, + "grad_norm": 4.430943965911865, + "learning_rate": 7.84396919761573e-05, + "loss": 2.6291, + "step": 9780 + }, + { + "epoch": 0.656488037314184, + "grad_norm": 4.0105180740356445, + "learning_rate": 7.843075233826561e-05, + "loss": 2.3963, + "step": 9782 + }, + { + "epoch": 0.6566222609979531, + "grad_norm": 4.167514801025391, + "learning_rate": 7.84218113570768e-05, + "loss": 2.2559, + "step": 9784 + }, + { + "epoch": 0.6567564846817221, + "grad_norm": 4.088459014892578, + "learning_rate": 7.841286903301328e-05, + "loss": 2.1982, + "step": 9786 + }, + { + "epoch": 0.6568907083654911, + "grad_norm": 3.836155652999878, + "learning_rate": 7.840392536649758e-05, + "loss": 2.3375, + "step": 9788 + }, + { + "epoch": 0.65702493204926, + "grad_norm": 4.645664215087891, + "learning_rate": 7.839498035795224e-05, + "loss": 2.4946, + "step": 9790 + }, + { + "epoch": 0.6571591557330291, + "grad_norm": 4.452833652496338, + "learning_rate": 7.838603400779993e-05, + "loss": 2.5287, + "step": 9792 + }, + { + "epoch": 0.6572933794167981, + "grad_norm": 4.2831549644470215, + "learning_rate": 7.837708631646333e-05, + "loss": 2.4744, + "step": 9794 + }, + { + "epoch": 0.6574276031005671, + "grad_norm": 4.9046101570129395, + "learning_rate": 7.836813728436521e-05, + "loss": 2.5162, + "step": 9796 + }, + { + "epoch": 0.6575618267843361, + "grad_norm": 3.9372682571411133, + "learning_rate": 7.835918691192837e-05, + "loss": 2.4678, + "step": 9798 + }, + { + "epoch": 0.6576960504681051, + "grad_norm": 4.020133018493652, + "learning_rate": 7.835023519957571e-05, + "loss": 2.5014, + "step": 9800 + }, + { + "epoch": 0.6578302741518741, + "grad_norm": 4.535752296447754, + "learning_rate": 7.834128214773022e-05, + "loss": 2.4102, + "step": 9802 + }, + { + "epoch": 0.6579644978356431, + "grad_norm": 4.347311019897461, + "learning_rate": 7.833232775681485e-05, + "loss": 2.604, + "step": 9804 + }, + { + "epoch": 0.6580987215194121, + "grad_norm": 3.8814072608947754, + "learning_rate": 7.83233720272527e-05, + "loss": 2.4562, + "step": 9806 + }, + { + "epoch": 0.6582329452031811, + "grad_norm": 5.8235931396484375, + "learning_rate": 7.831441495946694e-05, + "loss": 2.5089, + "step": 9808 + }, + { + "epoch": 0.6583671688869501, + "grad_norm": 4.036945819854736, + "learning_rate": 7.830545655388075e-05, + "loss": 2.3634, + "step": 9810 + }, + { + "epoch": 0.6585013925707192, + "grad_norm": 3.8493521213531494, + "learning_rate": 7.829649681091738e-05, + "loss": 2.1658, + "step": 9812 + }, + { + "epoch": 0.6586356162544881, + "grad_norm": 4.315537929534912, + "learning_rate": 7.828753573100019e-05, + "loss": 2.458, + "step": 9814 + }, + { + "epoch": 0.6587698399382571, + "grad_norm": 3.616145372390747, + "learning_rate": 7.827857331455256e-05, + "loss": 2.3201, + "step": 9816 + }, + { + "epoch": 0.6589040636220261, + "grad_norm": 3.9414212703704834, + "learning_rate": 7.826960956199794e-05, + "loss": 2.2255, + "step": 9818 + }, + { + "epoch": 0.6590382873057951, + "grad_norm": 4.253956317901611, + "learning_rate": 7.826064447375987e-05, + "loss": 2.37, + "step": 9820 + }, + { + "epoch": 0.6591725109895641, + "grad_norm": 4.019162178039551, + "learning_rate": 7.825167805026193e-05, + "loss": 2.3692, + "step": 9822 + }, + { + "epoch": 0.6593067346733331, + "grad_norm": 4.183456897735596, + "learning_rate": 7.824271029192773e-05, + "loss": 2.7097, + "step": 9824 + }, + { + "epoch": 0.6594409583571021, + "grad_norm": 4.2609758377075195, + "learning_rate": 7.823374119918103e-05, + "loss": 2.1258, + "step": 9826 + }, + { + "epoch": 0.6595751820408711, + "grad_norm": 3.8337132930755615, + "learning_rate": 7.822477077244557e-05, + "loss": 2.5794, + "step": 9828 + }, + { + "epoch": 0.6597094057246401, + "grad_norm": 4.226895332336426, + "learning_rate": 7.821579901214518e-05, + "loss": 2.3634, + "step": 9830 + }, + { + "epoch": 0.6598436294084091, + "grad_norm": 3.7317821979522705, + "learning_rate": 7.820682591870378e-05, + "loss": 2.449, + "step": 9832 + }, + { + "epoch": 0.6599778530921782, + "grad_norm": 4.741488933563232, + "learning_rate": 7.819785149254532e-05, + "loss": 2.4929, + "step": 9834 + }, + { + "epoch": 0.6601120767759471, + "grad_norm": 5.786540508270264, + "learning_rate": 7.818887573409383e-05, + "loss": 2.5971, + "step": 9836 + }, + { + "epoch": 0.6602463004597161, + "grad_norm": 4.244748592376709, + "learning_rate": 7.817989864377339e-05, + "loss": 2.2805, + "step": 9838 + }, + { + "epoch": 0.6603805241434851, + "grad_norm": 3.5480146408081055, + "learning_rate": 7.817092022200816e-05, + "loss": 2.0683, + "step": 9840 + }, + { + "epoch": 0.6605147478272542, + "grad_norm": 3.810404062271118, + "learning_rate": 7.816194046922234e-05, + "loss": 2.3825, + "step": 9842 + }, + { + "epoch": 0.6606489715110231, + "grad_norm": 4.937906742095947, + "learning_rate": 7.815295938584021e-05, + "loss": 2.6785, + "step": 9844 + }, + { + "epoch": 0.6607831951947921, + "grad_norm": 3.825792074203491, + "learning_rate": 7.81439769722861e-05, + "loss": 2.5063, + "step": 9846 + }, + { + "epoch": 0.6609174188785611, + "grad_norm": 3.1793365478515625, + "learning_rate": 7.813499322898443e-05, + "loss": 2.0573, + "step": 9848 + }, + { + "epoch": 0.6610516425623302, + "grad_norm": 4.30018949508667, + "learning_rate": 7.812600815635967e-05, + "loss": 2.5246, + "step": 9850 + }, + { + "epoch": 0.6611858662460991, + "grad_norm": 3.890578269958496, + "learning_rate": 7.81170217548363e-05, + "loss": 2.4231, + "step": 9852 + }, + { + "epoch": 0.6613200899298681, + "grad_norm": 4.524956703186035, + "learning_rate": 7.810803402483897e-05, + "loss": 2.308, + "step": 9854 + }, + { + "epoch": 0.6614543136136372, + "grad_norm": 3.8929316997528076, + "learning_rate": 7.80990449667923e-05, + "loss": 2.9336, + "step": 9856 + }, + { + "epoch": 0.6615885372974061, + "grad_norm": 3.993377685546875, + "learning_rate": 7.8090054581121e-05, + "loss": 2.3833, + "step": 9858 + }, + { + "epoch": 0.6617227609811751, + "grad_norm": 4.200159549713135, + "learning_rate": 7.808106286824986e-05, + "loss": 2.4533, + "step": 9860 + }, + { + "epoch": 0.6618569846649441, + "grad_norm": 5.0328688621521, + "learning_rate": 7.807206982860371e-05, + "loss": 2.38, + "step": 9862 + }, + { + "epoch": 0.6619912083487132, + "grad_norm": 4.661871433258057, + "learning_rate": 7.806307546260748e-05, + "loss": 2.4247, + "step": 9864 + }, + { + "epoch": 0.6621254320324821, + "grad_norm": 6.494414806365967, + "learning_rate": 7.80540797706861e-05, + "loss": 2.6287, + "step": 9866 + }, + { + "epoch": 0.6622596557162511, + "grad_norm": 4.4238762855529785, + "learning_rate": 7.804508275326462e-05, + "loss": 2.814, + "step": 9868 + }, + { + "epoch": 0.6623938794000201, + "grad_norm": 3.729238986968994, + "learning_rate": 7.803608441076815e-05, + "loss": 2.2061, + "step": 9870 + }, + { + "epoch": 0.6625281030837892, + "grad_norm": 4.877319812774658, + "learning_rate": 7.80270847436218e-05, + "loss": 2.5977, + "step": 9872 + }, + { + "epoch": 0.6626623267675581, + "grad_norm": 3.866612672805786, + "learning_rate": 7.801808375225082e-05, + "loss": 2.5865, + "step": 9874 + }, + { + "epoch": 0.6627965504513271, + "grad_norm": 4.238446235656738, + "learning_rate": 7.800908143708047e-05, + "loss": 2.2594, + "step": 9876 + }, + { + "epoch": 0.6629307741350962, + "grad_norm": 4.119851589202881, + "learning_rate": 7.800007779853611e-05, + "loss": 2.2887, + "step": 9878 + }, + { + "epoch": 0.6630649978188652, + "grad_norm": 3.9656014442443848, + "learning_rate": 7.799107283704312e-05, + "loss": 2.6085, + "step": 9880 + }, + { + "epoch": 0.6631992215026341, + "grad_norm": 4.121694564819336, + "learning_rate": 7.798206655302698e-05, + "loss": 2.3611, + "step": 9882 + }, + { + "epoch": 0.6633334451864031, + "grad_norm": 5.2972941398620605, + "learning_rate": 7.797305894691322e-05, + "loss": 2.6596, + "step": 9884 + }, + { + "epoch": 0.6634676688701722, + "grad_norm": 4.310673236846924, + "learning_rate": 7.796405001912746e-05, + "loss": 2.3751, + "step": 9886 + }, + { + "epoch": 0.6636018925539412, + "grad_norm": 4.95958137512207, + "learning_rate": 7.795503977009528e-05, + "loss": 2.5701, + "step": 9888 + }, + { + "epoch": 0.6637361162377101, + "grad_norm": 3.933764934539795, + "learning_rate": 7.794602820024248e-05, + "loss": 2.3547, + "step": 9890 + }, + { + "epoch": 0.6638703399214791, + "grad_norm": 3.7402541637420654, + "learning_rate": 7.793701530999478e-05, + "loss": 2.3371, + "step": 9892 + }, + { + "epoch": 0.6640045636052482, + "grad_norm": 3.992349624633789, + "learning_rate": 7.792800109977804e-05, + "loss": 2.3622, + "step": 9894 + }, + { + "epoch": 0.6641387872890171, + "grad_norm": 4.274453639984131, + "learning_rate": 7.791898557001818e-05, + "loss": 2.6285, + "step": 9896 + }, + { + "epoch": 0.6642730109727861, + "grad_norm": 3.3397836685180664, + "learning_rate": 7.790996872114116e-05, + "loss": 2.4879, + "step": 9898 + }, + { + "epoch": 0.6644072346565552, + "grad_norm": 4.453442573547363, + "learning_rate": 7.790095055357298e-05, + "loss": 2.5206, + "step": 9900 + }, + { + "epoch": 0.6645414583403242, + "grad_norm": 4.385496139526367, + "learning_rate": 7.789193106773976e-05, + "loss": 2.5207, + "step": 9902 + }, + { + "epoch": 0.6646756820240931, + "grad_norm": 4.306093215942383, + "learning_rate": 7.788291026406764e-05, + "loss": 2.2604, + "step": 9904 + }, + { + "epoch": 0.6648099057078621, + "grad_norm": 4.210898399353027, + "learning_rate": 7.787388814298284e-05, + "loss": 2.3955, + "step": 9906 + }, + { + "epoch": 0.6649441293916312, + "grad_norm": 4.045642375946045, + "learning_rate": 7.786486470491165e-05, + "loss": 2.6073, + "step": 9908 + }, + { + "epoch": 0.6650783530754002, + "grad_norm": 4.572116374969482, + "learning_rate": 7.785583995028038e-05, + "loss": 2.4074, + "step": 9910 + }, + { + "epoch": 0.6652125767591691, + "grad_norm": 4.639484405517578, + "learning_rate": 7.784681387951546e-05, + "loss": 2.4391, + "step": 9912 + }, + { + "epoch": 0.6653468004429381, + "grad_norm": 4.22584867477417, + "learning_rate": 7.783778649304333e-05, + "loss": 2.2756, + "step": 9914 + }, + { + "epoch": 0.6654810241267072, + "grad_norm": 4.009024620056152, + "learning_rate": 7.782875779129055e-05, + "loss": 2.247, + "step": 9916 + }, + { + "epoch": 0.6656152478104762, + "grad_norm": 4.1454901695251465, + "learning_rate": 7.781972777468367e-05, + "loss": 2.6123, + "step": 9918 + }, + { + "epoch": 0.6657494714942451, + "grad_norm": 3.7668566703796387, + "learning_rate": 7.781069644364936e-05, + "loss": 2.5681, + "step": 9920 + }, + { + "epoch": 0.6658836951780142, + "grad_norm": 4.353116512298584, + "learning_rate": 7.780166379861432e-05, + "loss": 2.4611, + "step": 9922 + }, + { + "epoch": 0.6660179188617832, + "grad_norm": 3.8577747344970703, + "learning_rate": 7.779262984000536e-05, + "loss": 2.4553, + "step": 9924 + }, + { + "epoch": 0.6661521425455522, + "grad_norm": 4.385298252105713, + "learning_rate": 7.778359456824929e-05, + "loss": 2.5397, + "step": 9926 + }, + { + "epoch": 0.6662863662293211, + "grad_norm": 5.144322395324707, + "learning_rate": 7.777455798377297e-05, + "loss": 2.6573, + "step": 9928 + }, + { + "epoch": 0.6664205899130902, + "grad_norm": 6.5314040184021, + "learning_rate": 7.776552008700344e-05, + "loss": 2.5012, + "step": 9930 + }, + { + "epoch": 0.6665548135968592, + "grad_norm": 4.772058486938477, + "learning_rate": 7.775648087836768e-05, + "loss": 2.2479, + "step": 9932 + }, + { + "epoch": 0.6666890372806281, + "grad_norm": 4.686263084411621, + "learning_rate": 7.774744035829277e-05, + "loss": 2.9271, + "step": 9934 + }, + { + "epoch": 0.6668232609643971, + "grad_norm": 4.8289923667907715, + "learning_rate": 7.773839852720589e-05, + "loss": 2.5602, + "step": 9936 + }, + { + "epoch": 0.6669574846481662, + "grad_norm": 6.3527727127075195, + "learning_rate": 7.772935538553418e-05, + "loss": 2.6389, + "step": 9938 + }, + { + "epoch": 0.6670917083319352, + "grad_norm": 4.026030540466309, + "learning_rate": 7.772031093370499e-05, + "loss": 2.7078, + "step": 9940 + }, + { + "epoch": 0.6672259320157041, + "grad_norm": 4.107556343078613, + "learning_rate": 7.771126517214561e-05, + "loss": 2.649, + "step": 9942 + }, + { + "epoch": 0.6673601556994732, + "grad_norm": 3.995701313018799, + "learning_rate": 7.770221810128343e-05, + "loss": 2.4676, + "step": 9944 + }, + { + "epoch": 0.6674943793832422, + "grad_norm": 3.854656934738159, + "learning_rate": 7.769316972154594e-05, + "loss": 2.3988, + "step": 9946 + }, + { + "epoch": 0.6676286030670112, + "grad_norm": 4.2461042404174805, + "learning_rate": 7.768412003336064e-05, + "loss": 2.6073, + "step": 9948 + }, + { + "epoch": 0.6677628267507801, + "grad_norm": 8.255500793457031, + "learning_rate": 7.767506903715509e-05, + "loss": 2.511, + "step": 9950 + }, + { + "epoch": 0.6678970504345492, + "grad_norm": 4.787571430206299, + "learning_rate": 7.766601673335694e-05, + "loss": 2.6185, + "step": 9952 + }, + { + "epoch": 0.6680312741183182, + "grad_norm": 3.9394097328186035, + "learning_rate": 7.765696312239392e-05, + "loss": 2.0797, + "step": 9954 + }, + { + "epoch": 0.6681654978020872, + "grad_norm": 4.173615455627441, + "learning_rate": 7.764790820469377e-05, + "loss": 2.3556, + "step": 9956 + }, + { + "epoch": 0.6682997214858561, + "grad_norm": 3.835442066192627, + "learning_rate": 7.763885198068433e-05, + "loss": 2.1834, + "step": 9958 + }, + { + "epoch": 0.6684339451696252, + "grad_norm": 4.033315658569336, + "learning_rate": 7.762979445079348e-05, + "loss": 2.5756, + "step": 9960 + }, + { + "epoch": 0.6685681688533942, + "grad_norm": 4.095593452453613, + "learning_rate": 7.762073561544918e-05, + "loss": 2.5175, + "step": 9962 + }, + { + "epoch": 0.6687023925371632, + "grad_norm": 3.7405357360839844, + "learning_rate": 7.761167547507942e-05, + "loss": 2.5243, + "step": 9964 + }, + { + "epoch": 0.6688366162209322, + "grad_norm": 3.7441294193267822, + "learning_rate": 7.760261403011231e-05, + "loss": 2.4177, + "step": 9966 + }, + { + "epoch": 0.6689708399047012, + "grad_norm": 4.713369369506836, + "learning_rate": 7.759355128097595e-05, + "loss": 2.6393, + "step": 9968 + }, + { + "epoch": 0.6691050635884702, + "grad_norm": 6.948410511016846, + "learning_rate": 7.758448722809856e-05, + "loss": 2.4817, + "step": 9970 + }, + { + "epoch": 0.6692392872722391, + "grad_norm": 4.262134552001953, + "learning_rate": 7.757542187190838e-05, + "loss": 2.4867, + "step": 9972 + }, + { + "epoch": 0.6693735109560082, + "grad_norm": 4.167291641235352, + "learning_rate": 7.756635521283375e-05, + "loss": 2.5481, + "step": 9974 + }, + { + "epoch": 0.6695077346397772, + "grad_norm": 4.315824508666992, + "learning_rate": 7.755728725130304e-05, + "loss": 2.3259, + "step": 9976 + }, + { + "epoch": 0.6696419583235462, + "grad_norm": 9.065067291259766, + "learning_rate": 7.754821798774471e-05, + "loss": 2.67, + "step": 9978 + }, + { + "epoch": 0.6697761820073151, + "grad_norm": 3.779745578765869, + "learning_rate": 7.753914742258724e-05, + "loss": 2.3354, + "step": 9980 + }, + { + "epoch": 0.6699104056910842, + "grad_norm": 5.070758819580078, + "learning_rate": 7.75300755562592e-05, + "loss": 2.6658, + "step": 9982 + }, + { + "epoch": 0.6700446293748532, + "grad_norm": 4.45797872543335, + "learning_rate": 7.752100238918925e-05, + "loss": 2.8177, + "step": 9984 + }, + { + "epoch": 0.6701788530586222, + "grad_norm": 3.844167470932007, + "learning_rate": 7.751192792180604e-05, + "loss": 2.3377, + "step": 9986 + }, + { + "epoch": 0.6703130767423912, + "grad_norm": 4.048766136169434, + "learning_rate": 7.750285215453833e-05, + "loss": 2.2202, + "step": 9988 + }, + { + "epoch": 0.6704473004261602, + "grad_norm": 4.1248273849487305, + "learning_rate": 7.749377508781495e-05, + "loss": 2.4106, + "step": 9990 + }, + { + "epoch": 0.6705815241099292, + "grad_norm": 4.301931858062744, + "learning_rate": 7.748469672206476e-05, + "loss": 2.2755, + "step": 9992 + }, + { + "epoch": 0.6707157477936982, + "grad_norm": 4.459712982177734, + "learning_rate": 7.747561705771669e-05, + "loss": 2.5351, + "step": 9994 + }, + { + "epoch": 0.6708499714774672, + "grad_norm": 4.025245189666748, + "learning_rate": 7.746653609519973e-05, + "loss": 2.4011, + "step": 9996 + }, + { + "epoch": 0.6709841951612362, + "grad_norm": 3.97322940826416, + "learning_rate": 7.745745383494296e-05, + "loss": 2.352, + "step": 9998 + }, + { + "epoch": 0.6711184188450052, + "grad_norm": 4.022007465362549, + "learning_rate": 7.74483702773755e-05, + "loss": 2.4902, + "step": 10000 + }, + { + "epoch": 0.6712526425287743, + "grad_norm": 4.934265613555908, + "learning_rate": 7.74392854229265e-05, + "loss": 2.3414, + "step": 10002 + }, + { + "epoch": 0.6713868662125432, + "grad_norm": 4.257050037384033, + "learning_rate": 7.743019927202524e-05, + "loss": 2.5035, + "step": 10004 + }, + { + "epoch": 0.6715210898963122, + "grad_norm": 3.4540767669677734, + "learning_rate": 7.7421111825101e-05, + "loss": 2.3069, + "step": 10006 + }, + { + "epoch": 0.6716553135800812, + "grad_norm": 3.909991502761841, + "learning_rate": 7.741202308258314e-05, + "loss": 2.3287, + "step": 10008 + }, + { + "epoch": 0.6717895372638502, + "grad_norm": 4.233033180236816, + "learning_rate": 7.74029330449011e-05, + "loss": 2.3385, + "step": 10010 + }, + { + "epoch": 0.6719237609476192, + "grad_norm": 4.764827728271484, + "learning_rate": 7.739384171248435e-05, + "loss": 2.6277, + "step": 10012 + }, + { + "epoch": 0.6720579846313882, + "grad_norm": 4.094422817230225, + "learning_rate": 7.738474908576246e-05, + "loss": 2.4945, + "step": 10014 + }, + { + "epoch": 0.6721922083151572, + "grad_norm": 7.86492919921875, + "learning_rate": 7.737565516516501e-05, + "loss": 2.4155, + "step": 10016 + }, + { + "epoch": 0.6723264319989262, + "grad_norm": 4.100944519042969, + "learning_rate": 7.73665599511217e-05, + "loss": 2.6091, + "step": 10018 + }, + { + "epoch": 0.6724606556826952, + "grad_norm": 3.6147449016571045, + "learning_rate": 7.735746344406223e-05, + "loss": 2.3332, + "step": 10020 + }, + { + "epoch": 0.6725948793664642, + "grad_norm": 4.489572048187256, + "learning_rate": 7.734836564441642e-05, + "loss": 2.6162, + "step": 10022 + }, + { + "epoch": 0.6727291030502333, + "grad_norm": 4.093594551086426, + "learning_rate": 7.733926655261411e-05, + "loss": 2.4555, + "step": 10024 + }, + { + "epoch": 0.6728633267340022, + "grad_norm": 4.191204071044922, + "learning_rate": 7.733016616908521e-05, + "loss": 2.4828, + "step": 10026 + }, + { + "epoch": 0.6729975504177712, + "grad_norm": 3.9796581268310547, + "learning_rate": 7.73210644942597e-05, + "loss": 2.4629, + "step": 10028 + }, + { + "epoch": 0.6731317741015402, + "grad_norm": 3.5244030952453613, + "learning_rate": 7.731196152856763e-05, + "loss": 2.3974, + "step": 10030 + }, + { + "epoch": 0.6732659977853093, + "grad_norm": 4.239780426025391, + "learning_rate": 7.730285727243907e-05, + "loss": 2.6515, + "step": 10032 + }, + { + "epoch": 0.6734002214690782, + "grad_norm": 3.7465124130249023, + "learning_rate": 7.72937517263042e-05, + "loss": 2.3112, + "step": 10034 + }, + { + "epoch": 0.6735344451528472, + "grad_norm": 4.189391136169434, + "learning_rate": 7.728464489059324e-05, + "loss": 2.7592, + "step": 10036 + }, + { + "epoch": 0.6736686688366162, + "grad_norm": 4.216996192932129, + "learning_rate": 7.727553676573644e-05, + "loss": 2.4521, + "step": 10038 + }, + { + "epoch": 0.6738028925203853, + "grad_norm": 5.243020534515381, + "learning_rate": 7.726642735216418e-05, + "loss": 2.315, + "step": 10040 + }, + { + "epoch": 0.6739371162041542, + "grad_norm": 4.440161228179932, + "learning_rate": 7.725731665030684e-05, + "loss": 2.5375, + "step": 10042 + }, + { + "epoch": 0.6740713398879232, + "grad_norm": 5.126798152923584, + "learning_rate": 7.72482046605949e-05, + "loss": 2.7955, + "step": 10044 + }, + { + "epoch": 0.6742055635716923, + "grad_norm": 4.297595977783203, + "learning_rate": 7.723909138345886e-05, + "loss": 2.7305, + "step": 10046 + }, + { + "epoch": 0.6743397872554612, + "grad_norm": 4.033557415008545, + "learning_rate": 7.722997681932931e-05, + "loss": 2.5505, + "step": 10048 + }, + { + "epoch": 0.6744740109392302, + "grad_norm": 3.964308023452759, + "learning_rate": 7.722086096863692e-05, + "loss": 2.3954, + "step": 10050 + }, + { + "epoch": 0.6746082346229992, + "grad_norm": 4.498386859893799, + "learning_rate": 7.721174383181235e-05, + "loss": 2.4826, + "step": 10052 + }, + { + "epoch": 0.6747424583067683, + "grad_norm": 3.794327735900879, + "learning_rate": 7.720262540928641e-05, + "loss": 2.4373, + "step": 10054 + }, + { + "epoch": 0.6748766819905372, + "grad_norm": 4.222204208374023, + "learning_rate": 7.719350570148991e-05, + "loss": 2.3065, + "step": 10056 + }, + { + "epoch": 0.6750109056743062, + "grad_norm": 4.450692176818848, + "learning_rate": 7.718438470885375e-05, + "loss": 2.4077, + "step": 10058 + }, + { + "epoch": 0.6751451293580752, + "grad_norm": 3.9556899070739746, + "learning_rate": 7.717526243180883e-05, + "loss": 2.361, + "step": 10060 + }, + { + "epoch": 0.6752793530418443, + "grad_norm": 4.0093159675598145, + "learning_rate": 7.716613887078623e-05, + "loss": 2.5069, + "step": 10062 + }, + { + "epoch": 0.6754135767256132, + "grad_norm": 4.799712181091309, + "learning_rate": 7.715701402621699e-05, + "loss": 2.7205, + "step": 10064 + }, + { + "epoch": 0.6755478004093822, + "grad_norm": 5.138760566711426, + "learning_rate": 7.714788789853223e-05, + "loss": 2.3339, + "step": 10066 + }, + { + "epoch": 0.6756820240931513, + "grad_norm": 4.441573143005371, + "learning_rate": 7.713876048816316e-05, + "loss": 2.5126, + "step": 10068 + }, + { + "epoch": 0.6758162477769203, + "grad_norm": 6.177773475646973, + "learning_rate": 7.7129631795541e-05, + "loss": 2.5405, + "step": 10070 + }, + { + "epoch": 0.6759504714606892, + "grad_norm": 4.028964042663574, + "learning_rate": 7.712050182109711e-05, + "loss": 2.3547, + "step": 10072 + }, + { + "epoch": 0.6760846951444582, + "grad_norm": 5.477880477905273, + "learning_rate": 7.711137056526283e-05, + "loss": 2.4945, + "step": 10074 + }, + { + "epoch": 0.6762189188282273, + "grad_norm": 5.170097827911377, + "learning_rate": 7.71022380284696e-05, + "loss": 2.4918, + "step": 10076 + }, + { + "epoch": 0.6763531425119963, + "grad_norm": 4.041374683380127, + "learning_rate": 7.709310421114892e-05, + "loss": 2.4825, + "step": 10078 + }, + { + "epoch": 0.6764873661957652, + "grad_norm": 4.1755900382995605, + "learning_rate": 7.708396911373233e-05, + "loss": 2.2941, + "step": 10080 + }, + { + "epoch": 0.6766215898795342, + "grad_norm": 4.487412929534912, + "learning_rate": 7.707483273665149e-05, + "loss": 2.567, + "step": 10082 + }, + { + "epoch": 0.6767558135633033, + "grad_norm": 8.20638656616211, + "learning_rate": 7.706569508033801e-05, + "loss": 2.5414, + "step": 10084 + }, + { + "epoch": 0.6768900372470722, + "grad_norm": 4.502035140991211, + "learning_rate": 7.705655614522367e-05, + "loss": 2.6481, + "step": 10086 + }, + { + "epoch": 0.6770242609308412, + "grad_norm": 3.417861223220825, + "learning_rate": 7.704741593174026e-05, + "loss": 2.0966, + "step": 10088 + }, + { + "epoch": 0.6771584846146103, + "grad_norm": 3.879194498062134, + "learning_rate": 7.703827444031963e-05, + "loss": 2.3486, + "step": 10090 + }, + { + "epoch": 0.6772927082983793, + "grad_norm": 4.076622009277344, + "learning_rate": 7.702913167139372e-05, + "loss": 2.4463, + "step": 10092 + }, + { + "epoch": 0.6774269319821482, + "grad_norm": 3.5914101600646973, + "learning_rate": 7.701998762539446e-05, + "loss": 2.4417, + "step": 10094 + }, + { + "epoch": 0.6775611556659172, + "grad_norm": 4.51848030090332, + "learning_rate": 7.701084230275392e-05, + "loss": 2.4358, + "step": 10096 + }, + { + "epoch": 0.6776953793496863, + "grad_norm": 4.330906867980957, + "learning_rate": 7.700169570390422e-05, + "loss": 2.42, + "step": 10098 + }, + { + "epoch": 0.6778296030334553, + "grad_norm": 5.030096054077148, + "learning_rate": 7.699254782927749e-05, + "loss": 2.5547, + "step": 10100 + }, + { + "epoch": 0.6779638267172242, + "grad_norm": 3.8967878818511963, + "learning_rate": 7.698339867930592e-05, + "loss": 2.4915, + "step": 10102 + }, + { + "epoch": 0.6780980504009932, + "grad_norm": 4.279300689697266, + "learning_rate": 7.697424825442187e-05, + "loss": 2.3062, + "step": 10104 + }, + { + "epoch": 0.6782322740847623, + "grad_norm": 3.955096483230591, + "learning_rate": 7.69650965550576e-05, + "loss": 2.3973, + "step": 10106 + }, + { + "epoch": 0.6783664977685313, + "grad_norm": 4.169755935668945, + "learning_rate": 7.695594358164557e-05, + "loss": 2.5873, + "step": 10108 + }, + { + "epoch": 0.6785007214523002, + "grad_norm": 4.304602146148682, + "learning_rate": 7.694678933461818e-05, + "loss": 2.3866, + "step": 10110 + }, + { + "epoch": 0.6786349451360693, + "grad_norm": 4.050364017486572, + "learning_rate": 7.6937633814408e-05, + "loss": 2.6004, + "step": 10112 + }, + { + "epoch": 0.6787691688198383, + "grad_norm": 3.796499729156494, + "learning_rate": 7.692847702144759e-05, + "loss": 2.6268, + "step": 10114 + }, + { + "epoch": 0.6789033925036073, + "grad_norm": 4.623426914215088, + "learning_rate": 7.69193189561696e-05, + "loss": 2.4347, + "step": 10116 + }, + { + "epoch": 0.6790376161873762, + "grad_norm": 4.770562171936035, + "learning_rate": 7.691015961900671e-05, + "loss": 2.2909, + "step": 10118 + }, + { + "epoch": 0.6791718398711453, + "grad_norm": 4.003107070922852, + "learning_rate": 7.69009990103917e-05, + "loss": 2.3247, + "step": 10120 + }, + { + "epoch": 0.6793060635549143, + "grad_norm": 4.5514092445373535, + "learning_rate": 7.689183713075741e-05, + "loss": 2.4737, + "step": 10122 + }, + { + "epoch": 0.6794402872386832, + "grad_norm": 4.309668064117432, + "learning_rate": 7.688267398053665e-05, + "loss": 2.5334, + "step": 10124 + }, + { + "epoch": 0.6795745109224522, + "grad_norm": 3.6899683475494385, + "learning_rate": 7.687350956016244e-05, + "loss": 2.3946, + "step": 10126 + }, + { + "epoch": 0.6797087346062213, + "grad_norm": 4.069836139678955, + "learning_rate": 7.686434387006773e-05, + "loss": 2.4852, + "step": 10128 + }, + { + "epoch": 0.6798429582899903, + "grad_norm": 4.6230998039245605, + "learning_rate": 7.685517691068563e-05, + "loss": 2.4538, + "step": 10130 + }, + { + "epoch": 0.6799771819737592, + "grad_norm": 3.792956590652466, + "learning_rate": 7.68460086824492e-05, + "loss": 2.4063, + "step": 10132 + }, + { + "epoch": 0.6801114056575283, + "grad_norm": 4.2054877281188965, + "learning_rate": 7.683683918579165e-05, + "loss": 2.2781, + "step": 10134 + }, + { + "epoch": 0.6802456293412973, + "grad_norm": 4.290092468261719, + "learning_rate": 7.682766842114622e-05, + "loss": 2.4865, + "step": 10136 + }, + { + "epoch": 0.6803798530250663, + "grad_norm": 3.8959639072418213, + "learning_rate": 7.681849638894623e-05, + "loss": 2.352, + "step": 10138 + }, + { + "epoch": 0.6805140767088352, + "grad_norm": 4.049508571624756, + "learning_rate": 7.6809323089625e-05, + "loss": 2.503, + "step": 10140 + }, + { + "epoch": 0.6806483003926043, + "grad_norm": 4.301867961883545, + "learning_rate": 7.680014852361598e-05, + "loss": 2.2958, + "step": 10142 + }, + { + "epoch": 0.6807825240763733, + "grad_norm": 4.32470178604126, + "learning_rate": 7.679097269135266e-05, + "loss": 2.6288, + "step": 10144 + }, + { + "epoch": 0.6809167477601423, + "grad_norm": 4.135592937469482, + "learning_rate": 7.678179559326855e-05, + "loss": 2.4815, + "step": 10146 + }, + { + "epoch": 0.6810509714439112, + "grad_norm": 7.036674499511719, + "learning_rate": 7.677261722979725e-05, + "loss": 2.3559, + "step": 10148 + }, + { + "epoch": 0.6811851951276803, + "grad_norm": 4.306659698486328, + "learning_rate": 7.676343760137244e-05, + "loss": 2.4845, + "step": 10150 + }, + { + "epoch": 0.6813194188114493, + "grad_norm": 3.4131529331207275, + "learning_rate": 7.675425670842783e-05, + "loss": 2.3944, + "step": 10152 + }, + { + "epoch": 0.6814536424952182, + "grad_norm": 4.208308696746826, + "learning_rate": 7.674507455139721e-05, + "loss": 2.6401, + "step": 10154 + }, + { + "epoch": 0.6815878661789873, + "grad_norm": 4.569714069366455, + "learning_rate": 7.673589113071442e-05, + "loss": 2.5042, + "step": 10156 + }, + { + "epoch": 0.6817220898627563, + "grad_norm": 4.430919647216797, + "learning_rate": 7.672670644681332e-05, + "loss": 2.3535, + "step": 10158 + }, + { + "epoch": 0.6818563135465253, + "grad_norm": 4.089615345001221, + "learning_rate": 7.671752050012792e-05, + "loss": 2.3328, + "step": 10160 + }, + { + "epoch": 0.6819905372302942, + "grad_norm": 4.196087837219238, + "learning_rate": 7.670833329109219e-05, + "loss": 2.4742, + "step": 10162 + }, + { + "epoch": 0.6821247609140633, + "grad_norm": 3.9801979064941406, + "learning_rate": 7.669914482014025e-05, + "loss": 2.5595, + "step": 10164 + }, + { + "epoch": 0.6822589845978323, + "grad_norm": 4.004915237426758, + "learning_rate": 7.668995508770621e-05, + "loss": 2.4412, + "step": 10166 + }, + { + "epoch": 0.6823932082816013, + "grad_norm": 4.4228715896606445, + "learning_rate": 7.668076409422427e-05, + "loss": 2.4928, + "step": 10168 + }, + { + "epoch": 0.6825274319653702, + "grad_norm": 4.750372886657715, + "learning_rate": 7.667157184012871e-05, + "loss": 2.5675, + "step": 10170 + }, + { + "epoch": 0.6826616556491393, + "grad_norm": 4.334416389465332, + "learning_rate": 7.666237832585382e-05, + "loss": 2.4006, + "step": 10172 + }, + { + "epoch": 0.6827958793329083, + "grad_norm": 3.8024954795837402, + "learning_rate": 7.665318355183398e-05, + "loss": 2.1775, + "step": 10174 + }, + { + "epoch": 0.6829301030166773, + "grad_norm": 4.326840877532959, + "learning_rate": 7.664398751850363e-05, + "loss": 2.7129, + "step": 10176 + }, + { + "epoch": 0.6830643267004463, + "grad_norm": 4.4982171058654785, + "learning_rate": 7.663479022629727e-05, + "loss": 2.4956, + "step": 10178 + }, + { + "epoch": 0.6831985503842153, + "grad_norm": 4.07792329788208, + "learning_rate": 7.662559167564944e-05, + "loss": 2.5656, + "step": 10180 + }, + { + "epoch": 0.6833327740679843, + "grad_norm": 4.513014316558838, + "learning_rate": 7.661639186699474e-05, + "loss": 2.3537, + "step": 10182 + }, + { + "epoch": 0.6834669977517533, + "grad_norm": 3.834482431411743, + "learning_rate": 7.66071908007679e-05, + "loss": 2.2217, + "step": 10184 + }, + { + "epoch": 0.6836012214355223, + "grad_norm": 4.266119480133057, + "learning_rate": 7.659798847740359e-05, + "loss": 2.7731, + "step": 10186 + }, + { + "epoch": 0.6837354451192913, + "grad_norm": 5.053428649902344, + "learning_rate": 7.658878489733664e-05, + "loss": 2.4479, + "step": 10188 + }, + { + "epoch": 0.6838696688030603, + "grad_norm": 4.810938358306885, + "learning_rate": 7.657958006100188e-05, + "loss": 2.4343, + "step": 10190 + }, + { + "epoch": 0.6840038924868292, + "grad_norm": 4.057448387145996, + "learning_rate": 7.657037396883423e-05, + "loss": 2.4027, + "step": 10192 + }, + { + "epoch": 0.6841381161705983, + "grad_norm": 4.3940510749816895, + "learning_rate": 7.656116662126866e-05, + "loss": 2.5526, + "step": 10194 + }, + { + "epoch": 0.6842723398543673, + "grad_norm": 4.452503204345703, + "learning_rate": 7.65519580187402e-05, + "loss": 2.2662, + "step": 10196 + }, + { + "epoch": 0.6844065635381363, + "grad_norm": 4.2132487297058105, + "learning_rate": 7.654274816168396e-05, + "loss": 2.1274, + "step": 10198 + }, + { + "epoch": 0.6845407872219053, + "grad_norm": 4.2607550621032715, + "learning_rate": 7.653353705053503e-05, + "loss": 2.3895, + "step": 10200 + }, + { + "epoch": 0.6846750109056743, + "grad_norm": 3.9814321994781494, + "learning_rate": 7.652432468572865e-05, + "loss": 2.2879, + "step": 10202 + }, + { + "epoch": 0.6848092345894433, + "grad_norm": 4.220925331115723, + "learning_rate": 7.65151110677001e-05, + "loss": 2.3019, + "step": 10204 + }, + { + "epoch": 0.6849434582732123, + "grad_norm": 4.531630992889404, + "learning_rate": 7.650589619688469e-05, + "loss": 2.4668, + "step": 10206 + }, + { + "epoch": 0.6850776819569813, + "grad_norm": 3.727102756500244, + "learning_rate": 7.64966800737178e-05, + "loss": 2.3751, + "step": 10208 + }, + { + "epoch": 0.6852119056407503, + "grad_norm": 4.336957931518555, + "learning_rate": 7.648746269863487e-05, + "loss": 2.3486, + "step": 10210 + }, + { + "epoch": 0.6853461293245193, + "grad_norm": 4.5283403396606445, + "learning_rate": 7.647824407207144e-05, + "loss": 2.437, + "step": 10212 + }, + { + "epoch": 0.6854803530082884, + "grad_norm": 3.668177604675293, + "learning_rate": 7.646902419446302e-05, + "loss": 2.2924, + "step": 10214 + }, + { + "epoch": 0.6856145766920573, + "grad_norm": 3.611708164215088, + "learning_rate": 7.645980306624528e-05, + "loss": 2.4637, + "step": 10216 + }, + { + "epoch": 0.6857488003758263, + "grad_norm": 4.079258918762207, + "learning_rate": 7.645058068785386e-05, + "loss": 2.4696, + "step": 10218 + }, + { + "epoch": 0.6858830240595953, + "grad_norm": 3.8769989013671875, + "learning_rate": 7.64413570597245e-05, + "loss": 2.5449, + "step": 10220 + }, + { + "epoch": 0.6860172477433644, + "grad_norm": 4.513350486755371, + "learning_rate": 7.643213218229305e-05, + "loss": 2.4567, + "step": 10222 + }, + { + "epoch": 0.6861514714271333, + "grad_norm": 4.617779731750488, + "learning_rate": 7.642290605599531e-05, + "loss": 2.6232, + "step": 10224 + }, + { + "epoch": 0.6862856951109023, + "grad_norm": 4.076568126678467, + "learning_rate": 7.64136786812672e-05, + "loss": 2.3309, + "step": 10226 + }, + { + "epoch": 0.6864199187946713, + "grad_norm": 4.0758161544799805, + "learning_rate": 7.640445005854475e-05, + "loss": 2.5231, + "step": 10228 + }, + { + "epoch": 0.6865541424784403, + "grad_norm": 4.297593593597412, + "learning_rate": 7.639522018826393e-05, + "loss": 2.8017, + "step": 10230 + }, + { + "epoch": 0.6866883661622093, + "grad_norm": 5.1603899002075195, + "learning_rate": 7.638598907086088e-05, + "loss": 2.5319, + "step": 10232 + }, + { + "epoch": 0.6868225898459783, + "grad_norm": 4.406987190246582, + "learning_rate": 7.637675670677171e-05, + "loss": 2.5746, + "step": 10234 + }, + { + "epoch": 0.6869568135297474, + "grad_norm": 3.784359931945801, + "learning_rate": 7.636752309643267e-05, + "loss": 2.4762, + "step": 10236 + }, + { + "epoch": 0.6870910372135163, + "grad_norm": 4.021445274353027, + "learning_rate": 7.635828824028001e-05, + "loss": 2.4264, + "step": 10238 + }, + { + "epoch": 0.6872252608972853, + "grad_norm": 4.038495063781738, + "learning_rate": 7.634905213875006e-05, + "loss": 2.1512, + "step": 10240 + }, + { + "epoch": 0.6873594845810543, + "grad_norm": 4.370901107788086, + "learning_rate": 7.633981479227921e-05, + "loss": 2.5739, + "step": 10242 + }, + { + "epoch": 0.6874937082648234, + "grad_norm": 4.1967363357543945, + "learning_rate": 7.633057620130391e-05, + "loss": 2.6192, + "step": 10244 + }, + { + "epoch": 0.6876279319485923, + "grad_norm": 3.7612032890319824, + "learning_rate": 7.632133636626064e-05, + "loss": 2.2812, + "step": 10246 + }, + { + "epoch": 0.6877621556323613, + "grad_norm": 4.187586784362793, + "learning_rate": 7.6312095287586e-05, + "loss": 2.5124, + "step": 10248 + }, + { + "epoch": 0.6878963793161303, + "grad_norm": 4.302048683166504, + "learning_rate": 7.630285296571661e-05, + "loss": 2.5674, + "step": 10250 + }, + { + "epoch": 0.6880306029998994, + "grad_norm": 4.557802677154541, + "learning_rate": 7.629360940108913e-05, + "loss": 2.3934, + "step": 10252 + }, + { + "epoch": 0.6881648266836683, + "grad_norm": 4.257068157196045, + "learning_rate": 7.62843645941403e-05, + "loss": 2.2474, + "step": 10254 + }, + { + "epoch": 0.6882990503674373, + "grad_norm": 4.054049015045166, + "learning_rate": 7.627511854530695e-05, + "loss": 2.5217, + "step": 10256 + }, + { + "epoch": 0.6884332740512064, + "grad_norm": 4.2540388107299805, + "learning_rate": 7.62658712550259e-05, + "loss": 2.2741, + "step": 10258 + }, + { + "epoch": 0.6885674977349754, + "grad_norm": 3.5929436683654785, + "learning_rate": 7.625662272373409e-05, + "loss": 2.2851, + "step": 10260 + }, + { + "epoch": 0.6887017214187443, + "grad_norm": 4.626030445098877, + "learning_rate": 7.624737295186849e-05, + "loss": 2.3433, + "step": 10262 + }, + { + "epoch": 0.6888359451025133, + "grad_norm": 4.171584129333496, + "learning_rate": 7.623812193986612e-05, + "loss": 2.5867, + "step": 10264 + }, + { + "epoch": 0.6889701687862824, + "grad_norm": 4.088718891143799, + "learning_rate": 7.62288696881641e-05, + "loss": 2.256, + "step": 10266 + }, + { + "epoch": 0.6891043924700513, + "grad_norm": 3.911005973815918, + "learning_rate": 7.621961619719954e-05, + "loss": 2.4362, + "step": 10268 + }, + { + "epoch": 0.6892386161538203, + "grad_norm": 3.9225382804870605, + "learning_rate": 7.621036146740972e-05, + "loss": 2.311, + "step": 10270 + }, + { + "epoch": 0.6893728398375893, + "grad_norm": 4.149808406829834, + "learning_rate": 7.620110549923181e-05, + "loss": 2.4072, + "step": 10272 + }, + { + "epoch": 0.6895070635213584, + "grad_norm": 4.469205379486084, + "learning_rate": 7.619184829310322e-05, + "loss": 2.603, + "step": 10274 + }, + { + "epoch": 0.6896412872051273, + "grad_norm": 4.303328990936279, + "learning_rate": 7.618258984946128e-05, + "loss": 2.3823, + "step": 10276 + }, + { + "epoch": 0.6897755108888963, + "grad_norm": 4.1894989013671875, + "learning_rate": 7.617333016874346e-05, + "loss": 2.5695, + "step": 10278 + }, + { + "epoch": 0.6899097345726654, + "grad_norm": 4.950192451477051, + "learning_rate": 7.616406925138727e-05, + "loss": 2.43, + "step": 10280 + }, + { + "epoch": 0.6900439582564344, + "grad_norm": 3.967576742172241, + "learning_rate": 7.615480709783025e-05, + "loss": 2.364, + "step": 10282 + }, + { + "epoch": 0.6901781819402033, + "grad_norm": 4.634243965148926, + "learning_rate": 7.614554370851003e-05, + "loss": 2.7208, + "step": 10284 + }, + { + "epoch": 0.6903124056239723, + "grad_norm": 4.516168117523193, + "learning_rate": 7.613627908386427e-05, + "loss": 2.716, + "step": 10286 + }, + { + "epoch": 0.6904466293077414, + "grad_norm": 3.734182596206665, + "learning_rate": 7.612701322433074e-05, + "loss": 2.2862, + "step": 10288 + }, + { + "epoch": 0.6905808529915104, + "grad_norm": 3.8726351261138916, + "learning_rate": 7.61177461303472e-05, + "loss": 2.4995, + "step": 10290 + }, + { + "epoch": 0.6907150766752793, + "grad_norm": 3.9013051986694336, + "learning_rate": 7.610847780235149e-05, + "loss": 2.3737, + "step": 10292 + }, + { + "epoch": 0.6908493003590483, + "grad_norm": 4.049952507019043, + "learning_rate": 7.609920824078157e-05, + "loss": 2.2756, + "step": 10294 + }, + { + "epoch": 0.6909835240428174, + "grad_norm": 4.092665672302246, + "learning_rate": 7.608993744607538e-05, + "loss": 2.6362, + "step": 10296 + }, + { + "epoch": 0.6911177477265864, + "grad_norm": 4.030460357666016, + "learning_rate": 7.608066541867093e-05, + "loss": 2.3434, + "step": 10298 + }, + { + "epoch": 0.6912519714103553, + "grad_norm": 4.222945213317871, + "learning_rate": 7.607139215900636e-05, + "loss": 2.4419, + "step": 10300 + }, + { + "epoch": 0.6913861950941244, + "grad_norm": 3.979375123977661, + "learning_rate": 7.606211766751976e-05, + "loss": 2.4924, + "step": 10302 + }, + { + "epoch": 0.6915204187778934, + "grad_norm": 3.94038462638855, + "learning_rate": 7.605284194464934e-05, + "loss": 2.6859, + "step": 10304 + }, + { + "epoch": 0.6916546424616623, + "grad_norm": 5.725574016571045, + "learning_rate": 7.604356499083338e-05, + "loss": 2.4843, + "step": 10306 + }, + { + "epoch": 0.6917888661454313, + "grad_norm": 5.625984191894531, + "learning_rate": 7.603428680651019e-05, + "loss": 2.6468, + "step": 10308 + }, + { + "epoch": 0.6919230898292004, + "grad_norm": 4.454188823699951, + "learning_rate": 7.602500739211813e-05, + "loss": 2.4681, + "step": 10310 + }, + { + "epoch": 0.6920573135129694, + "grad_norm": 15.70682144165039, + "learning_rate": 7.601572674809565e-05, + "loss": 2.3122, + "step": 10312 + }, + { + "epoch": 0.6921915371967383, + "grad_norm": 3.694979429244995, + "learning_rate": 7.600644487488124e-05, + "loss": 2.5083, + "step": 10314 + }, + { + "epoch": 0.6923257608805073, + "grad_norm": 3.884251356124878, + "learning_rate": 7.599716177291345e-05, + "loss": 2.3708, + "step": 10316 + }, + { + "epoch": 0.6924599845642764, + "grad_norm": 3.836459159851074, + "learning_rate": 7.59878774426309e-05, + "loss": 2.359, + "step": 10318 + }, + { + "epoch": 0.6925942082480454, + "grad_norm": 4.6417012214660645, + "learning_rate": 7.597859188447223e-05, + "loss": 2.5216, + "step": 10320 + }, + { + "epoch": 0.6927284319318143, + "grad_norm": 3.8124120235443115, + "learning_rate": 7.596930509887618e-05, + "loss": 2.1761, + "step": 10322 + }, + { + "epoch": 0.6928626556155834, + "grad_norm": 4.074049472808838, + "learning_rate": 7.596001708628152e-05, + "loss": 2.5667, + "step": 10324 + }, + { + "epoch": 0.6929968792993524, + "grad_norm": 4.2731451988220215, + "learning_rate": 7.595072784712711e-05, + "loss": 2.6523, + "step": 10326 + }, + { + "epoch": 0.6931311029831214, + "grad_norm": 6.0567803382873535, + "learning_rate": 7.594143738185184e-05, + "loss": 2.5021, + "step": 10328 + }, + { + "epoch": 0.6932653266668903, + "grad_norm": 4.403380393981934, + "learning_rate": 7.593214569089467e-05, + "loss": 2.4687, + "step": 10330 + }, + { + "epoch": 0.6933995503506594, + "grad_norm": 4.109714031219482, + "learning_rate": 7.59228527746946e-05, + "loss": 2.5203, + "step": 10332 + }, + { + "epoch": 0.6935337740344284, + "grad_norm": 4.108243465423584, + "learning_rate": 7.591355863369071e-05, + "loss": 2.4328, + "step": 10334 + }, + { + "epoch": 0.6936679977181974, + "grad_norm": 4.398876667022705, + "learning_rate": 7.590426326832217e-05, + "loss": 2.7046, + "step": 10336 + }, + { + "epoch": 0.6938022214019663, + "grad_norm": 3.720494031906128, + "learning_rate": 7.589496667902809e-05, + "loss": 2.2081, + "step": 10338 + }, + { + "epoch": 0.6939364450857354, + "grad_norm": 3.906531572341919, + "learning_rate": 7.588566886624774e-05, + "loss": 2.4453, + "step": 10340 + }, + { + "epoch": 0.6940706687695044, + "grad_norm": 4.2388153076171875, + "learning_rate": 7.587636983042048e-05, + "loss": 2.445, + "step": 10342 + }, + { + "epoch": 0.6942048924532733, + "grad_norm": 4.742746353149414, + "learning_rate": 7.586706957198562e-05, + "loss": 2.5355, + "step": 10344 + }, + { + "epoch": 0.6943391161370424, + "grad_norm": 4.868573188781738, + "learning_rate": 7.585776809138257e-05, + "loss": 2.3943, + "step": 10346 + }, + { + "epoch": 0.6944733398208114, + "grad_norm": 4.6798882484436035, + "learning_rate": 7.584846538905083e-05, + "loss": 2.5811, + "step": 10348 + }, + { + "epoch": 0.6946075635045804, + "grad_norm": 4.686613082885742, + "learning_rate": 7.583916146542992e-05, + "loss": 2.6075, + "step": 10350 + }, + { + "epoch": 0.6947417871883493, + "grad_norm": 3.775581121444702, + "learning_rate": 7.582985632095946e-05, + "loss": 2.2971, + "step": 10352 + }, + { + "epoch": 0.6948760108721184, + "grad_norm": 4.799238204956055, + "learning_rate": 7.582054995607908e-05, + "loss": 2.6713, + "step": 10354 + }, + { + "epoch": 0.6950102345558874, + "grad_norm": 4.156506061553955, + "learning_rate": 7.581124237122848e-05, + "loss": 2.1265, + "step": 10356 + }, + { + "epoch": 0.6951444582396564, + "grad_norm": 4.145840644836426, + "learning_rate": 7.580193356684743e-05, + "loss": 2.3391, + "step": 10358 + }, + { + "epoch": 0.6952786819234253, + "grad_norm": 3.9857993125915527, + "learning_rate": 7.579262354337577e-05, + "loss": 2.2932, + "step": 10360 + }, + { + "epoch": 0.6954129056071944, + "grad_norm": 5.030476093292236, + "learning_rate": 7.578331230125336e-05, + "loss": 2.5146, + "step": 10362 + }, + { + "epoch": 0.6955471292909634, + "grad_norm": 3.69381046295166, + "learning_rate": 7.577399984092015e-05, + "loss": 2.2153, + "step": 10364 + }, + { + "epoch": 0.6956813529747324, + "grad_norm": 3.9217896461486816, + "learning_rate": 7.576468616281612e-05, + "loss": 2.3734, + "step": 10366 + }, + { + "epoch": 0.6958155766585014, + "grad_norm": 4.454031467437744, + "learning_rate": 7.575537126738132e-05, + "loss": 2.3083, + "step": 10368 + }, + { + "epoch": 0.6959498003422704, + "grad_norm": 4.0883378982543945, + "learning_rate": 7.57460551550559e-05, + "loss": 2.2967, + "step": 10370 + }, + { + "epoch": 0.6960840240260394, + "grad_norm": 5.048201084136963, + "learning_rate": 7.573673782628e-05, + "loss": 2.1915, + "step": 10372 + }, + { + "epoch": 0.6962182477098084, + "grad_norm": 4.33798885345459, + "learning_rate": 7.572741928149384e-05, + "loss": 2.3865, + "step": 10374 + }, + { + "epoch": 0.6963524713935774, + "grad_norm": 4.150729179382324, + "learning_rate": 7.57180995211377e-05, + "loss": 2.1748, + "step": 10376 + }, + { + "epoch": 0.6964866950773464, + "grad_norm": 4.274837493896484, + "learning_rate": 7.570877854565195e-05, + "loss": 2.3331, + "step": 10378 + }, + { + "epoch": 0.6966209187611154, + "grad_norm": 4.607488632202148, + "learning_rate": 7.569945635547695e-05, + "loss": 2.6656, + "step": 10380 + }, + { + "epoch": 0.6967551424448843, + "grad_norm": 3.9426257610321045, + "learning_rate": 7.569013295105318e-05, + "loss": 2.3009, + "step": 10382 + }, + { + "epoch": 0.6968893661286534, + "grad_norm": 4.256702423095703, + "learning_rate": 7.568080833282114e-05, + "loss": 2.5833, + "step": 10384 + }, + { + "epoch": 0.6970235898124224, + "grad_norm": 3.9030344486236572, + "learning_rate": 7.56714825012214e-05, + "loss": 2.1919, + "step": 10386 + }, + { + "epoch": 0.6971578134961914, + "grad_norm": 4.171705722808838, + "learning_rate": 7.566215545669462e-05, + "loss": 2.4967, + "step": 10388 + }, + { + "epoch": 0.6972920371799604, + "grad_norm": 4.0048980712890625, + "learning_rate": 7.565282719968143e-05, + "loss": 2.291, + "step": 10390 + }, + { + "epoch": 0.6974262608637294, + "grad_norm": 4.116173267364502, + "learning_rate": 7.564349773062262e-05, + "loss": 2.3831, + "step": 10392 + }, + { + "epoch": 0.6975604845474984, + "grad_norm": 4.100427150726318, + "learning_rate": 7.563416704995894e-05, + "loss": 2.4702, + "step": 10394 + }, + { + "epoch": 0.6976947082312674, + "grad_norm": 4.27834939956665, + "learning_rate": 7.56248351581313e-05, + "loss": 2.346, + "step": 10396 + }, + { + "epoch": 0.6978289319150364, + "grad_norm": 5.703613758087158, + "learning_rate": 7.561550205558058e-05, + "loss": 2.247, + "step": 10398 + }, + { + "epoch": 0.6979631555988054, + "grad_norm": 3.657223701477051, + "learning_rate": 7.560616774274775e-05, + "loss": 2.0474, + "step": 10400 + }, + { + "epoch": 0.6980973792825744, + "grad_norm": 4.607489585876465, + "learning_rate": 7.559683222007386e-05, + "loss": 2.5701, + "step": 10402 + }, + { + "epoch": 0.6982316029663435, + "grad_norm": 4.1589674949646, + "learning_rate": 7.558749548799997e-05, + "loss": 2.3506, + "step": 10404 + }, + { + "epoch": 0.6983658266501124, + "grad_norm": 5.002182483673096, + "learning_rate": 7.557815754696724e-05, + "loss": 2.565, + "step": 10406 + }, + { + "epoch": 0.6985000503338814, + "grad_norm": 4.845500946044922, + "learning_rate": 7.556881839741687e-05, + "loss": 2.2809, + "step": 10408 + }, + { + "epoch": 0.6986342740176504, + "grad_norm": 4.1446123123168945, + "learning_rate": 7.55594780397901e-05, + "loss": 2.5107, + "step": 10410 + }, + { + "epoch": 0.6987684977014195, + "grad_norm": 3.9348325729370117, + "learning_rate": 7.555013647452826e-05, + "loss": 2.3757, + "step": 10412 + }, + { + "epoch": 0.6989027213851884, + "grad_norm": 3.905123710632324, + "learning_rate": 7.554079370207269e-05, + "loss": 2.3182, + "step": 10414 + }, + { + "epoch": 0.6990369450689574, + "grad_norm": 4.367519855499268, + "learning_rate": 7.553144972286488e-05, + "loss": 2.2741, + "step": 10416 + }, + { + "epoch": 0.6991711687527264, + "grad_norm": 3.8974897861480713, + "learning_rate": 7.552210453734625e-05, + "loss": 2.3073, + "step": 10418 + }, + { + "epoch": 0.6993053924364954, + "grad_norm": 3.921881914138794, + "learning_rate": 7.551275814595837e-05, + "loss": 2.3947, + "step": 10420 + }, + { + "epoch": 0.6994396161202644, + "grad_norm": 4.407809734344482, + "learning_rate": 7.550341054914284e-05, + "loss": 2.6342, + "step": 10422 + }, + { + "epoch": 0.6995738398040334, + "grad_norm": 4.541403293609619, + "learning_rate": 7.549406174734132e-05, + "loss": 2.3033, + "step": 10424 + }, + { + "epoch": 0.6997080634878025, + "grad_norm": 3.8274130821228027, + "learning_rate": 7.548471174099551e-05, + "loss": 2.1347, + "step": 10426 + }, + { + "epoch": 0.6998422871715714, + "grad_norm": 5.4157233238220215, + "learning_rate": 7.547536053054718e-05, + "loss": 2.4552, + "step": 10428 + }, + { + "epoch": 0.6999765108553404, + "grad_norm": 4.519705772399902, + "learning_rate": 7.546600811643816e-05, + "loss": 2.4191, + "step": 10430 + }, + { + "epoch": 0.7001107345391094, + "grad_norm": 4.171374320983887, + "learning_rate": 7.545665449911032e-05, + "loss": 2.3953, + "step": 10432 + }, + { + "epoch": 0.7002449582228785, + "grad_norm": 4.07275390625, + "learning_rate": 7.544729967900563e-05, + "loss": 2.3263, + "step": 10434 + }, + { + "epoch": 0.7003791819066474, + "grad_norm": 4.000622272491455, + "learning_rate": 7.543794365656609e-05, + "loss": 2.4808, + "step": 10436 + }, + { + "epoch": 0.7005134055904164, + "grad_norm": 3.8615615367889404, + "learning_rate": 7.542858643223369e-05, + "loss": 2.4284, + "step": 10438 + }, + { + "epoch": 0.7006476292741854, + "grad_norm": 4.672926902770996, + "learning_rate": 7.541922800645061e-05, + "loss": 2.4729, + "step": 10440 + }, + { + "epoch": 0.7007818529579545, + "grad_norm": 4.046049118041992, + "learning_rate": 7.540986837965899e-05, + "loss": 2.4289, + "step": 10442 + }, + { + "epoch": 0.7009160766417234, + "grad_norm": 4.123379230499268, + "learning_rate": 7.540050755230104e-05, + "loss": 2.3859, + "step": 10444 + }, + { + "epoch": 0.7010503003254924, + "grad_norm": 4.474039077758789, + "learning_rate": 7.539114552481908e-05, + "loss": 2.2417, + "step": 10446 + }, + { + "epoch": 0.7011845240092615, + "grad_norm": 4.218482494354248, + "learning_rate": 7.53817822976554e-05, + "loss": 2.4733, + "step": 10448 + }, + { + "epoch": 0.7013187476930305, + "grad_norm": 4.551057815551758, + "learning_rate": 7.537241787125245e-05, + "loss": 2.7144, + "step": 10450 + }, + { + "epoch": 0.7014529713767994, + "grad_norm": 4.612653732299805, + "learning_rate": 7.536305224605261e-05, + "loss": 2.598, + "step": 10452 + }, + { + "epoch": 0.7015871950605684, + "grad_norm": 4.110091209411621, + "learning_rate": 7.535368542249846e-05, + "loss": 2.5428, + "step": 10454 + }, + { + "epoch": 0.7017214187443375, + "grad_norm": 3.9299237728118896, + "learning_rate": 7.534431740103249e-05, + "loss": 2.4342, + "step": 10456 + }, + { + "epoch": 0.7018556424281064, + "grad_norm": 4.1085429191589355, + "learning_rate": 7.53349481820974e-05, + "loss": 2.2853, + "step": 10458 + }, + { + "epoch": 0.7019898661118754, + "grad_norm": 4.016427040100098, + "learning_rate": 7.53255777661358e-05, + "loss": 2.3837, + "step": 10460 + }, + { + "epoch": 0.7021240897956444, + "grad_norm": 3.726989984512329, + "learning_rate": 7.531620615359046e-05, + "loss": 2.7055, + "step": 10462 + }, + { + "epoch": 0.7022583134794135, + "grad_norm": 4.052381992340088, + "learning_rate": 7.530683334490416e-05, + "loss": 2.2791, + "step": 10464 + }, + { + "epoch": 0.7023925371631824, + "grad_norm": 4.320976734161377, + "learning_rate": 7.529745934051976e-05, + "loss": 2.467, + "step": 10466 + }, + { + "epoch": 0.7025267608469514, + "grad_norm": 3.9716713428497314, + "learning_rate": 7.528808414088015e-05, + "loss": 2.3338, + "step": 10468 + }, + { + "epoch": 0.7026609845307205, + "grad_norm": 4.216366767883301, + "learning_rate": 7.527870774642828e-05, + "loss": 2.2804, + "step": 10470 + }, + { + "epoch": 0.7027952082144895, + "grad_norm": 3.9913816452026367, + "learning_rate": 7.526933015760717e-05, + "loss": 2.3534, + "step": 10472 + }, + { + "epoch": 0.7029294318982584, + "grad_norm": 4.353600025177002, + "learning_rate": 7.525995137485993e-05, + "loss": 2.4693, + "step": 10474 + }, + { + "epoch": 0.7030636555820274, + "grad_norm": 4.392929553985596, + "learning_rate": 7.525057139862964e-05, + "loss": 2.4712, + "step": 10476 + }, + { + "epoch": 0.7031978792657965, + "grad_norm": 4.091434955596924, + "learning_rate": 7.524119022935949e-05, + "loss": 2.0507, + "step": 10478 + }, + { + "epoch": 0.7033321029495655, + "grad_norm": 4.223429203033447, + "learning_rate": 7.523180786749276e-05, + "loss": 2.37, + "step": 10480 + }, + { + "epoch": 0.7034663266333344, + "grad_norm": 4.722282886505127, + "learning_rate": 7.522242431347272e-05, + "loss": 2.3314, + "step": 10482 + }, + { + "epoch": 0.7036005503171034, + "grad_norm": 4.061368942260742, + "learning_rate": 7.521303956774273e-05, + "loss": 2.2652, + "step": 10484 + }, + { + "epoch": 0.7037347740008725, + "grad_norm": 4.159390926361084, + "learning_rate": 7.52036536307462e-05, + "loss": 2.5213, + "step": 10486 + }, + { + "epoch": 0.7038689976846415, + "grad_norm": 4.889634132385254, + "learning_rate": 7.519426650292658e-05, + "loss": 2.5897, + "step": 10488 + }, + { + "epoch": 0.7040032213684104, + "grad_norm": 3.6511359214782715, + "learning_rate": 7.518487818472743e-05, + "loss": 2.5912, + "step": 10490 + }, + { + "epoch": 0.7041374450521795, + "grad_norm": 5.239933490753174, + "learning_rate": 7.51754886765923e-05, + "loss": 2.5261, + "step": 10492 + }, + { + "epoch": 0.7042716687359485, + "grad_norm": 3.8524367809295654, + "learning_rate": 7.516609797896483e-05, + "loss": 2.4782, + "step": 10494 + }, + { + "epoch": 0.7044058924197174, + "grad_norm": 4.020973205566406, + "learning_rate": 7.515670609228873e-05, + "loss": 2.2794, + "step": 10496 + }, + { + "epoch": 0.7045401161034864, + "grad_norm": 4.012627601623535, + "learning_rate": 7.514731301700773e-05, + "loss": 2.6311, + "step": 10498 + }, + { + "epoch": 0.7046743397872555, + "grad_norm": 4.4972920417785645, + "learning_rate": 7.513791875356564e-05, + "loss": 2.5391, + "step": 10500 + }, + { + "epoch": 0.7048085634710245, + "grad_norm": 4.50946569442749, + "learning_rate": 7.512852330240632e-05, + "loss": 2.4301, + "step": 10502 + }, + { + "epoch": 0.7049427871547934, + "grad_norm": 4.472950458526611, + "learning_rate": 7.51191266639737e-05, + "loss": 2.7733, + "step": 10504 + }, + { + "epoch": 0.7050770108385624, + "grad_norm": 4.731194972991943, + "learning_rate": 7.51097288387117e-05, + "loss": 2.2696, + "step": 10506 + }, + { + "epoch": 0.7052112345223315, + "grad_norm": 4.232707500457764, + "learning_rate": 7.510032982706443e-05, + "loss": 2.5066, + "step": 10508 + }, + { + "epoch": 0.7053454582061005, + "grad_norm": 4.277015209197998, + "learning_rate": 7.509092962947591e-05, + "loss": 2.4531, + "step": 10510 + }, + { + "epoch": 0.7054796818898694, + "grad_norm": 3.9417624473571777, + "learning_rate": 7.508152824639032e-05, + "loss": 2.7072, + "step": 10512 + }, + { + "epoch": 0.7056139055736385, + "grad_norm": 4.360600471496582, + "learning_rate": 7.507212567825184e-05, + "loss": 2.4733, + "step": 10514 + }, + { + "epoch": 0.7057481292574075, + "grad_norm": 3.952209949493408, + "learning_rate": 7.506272192550472e-05, + "loss": 2.4297, + "step": 10516 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 4.206145286560059, + "learning_rate": 7.505331698859325e-05, + "loss": 2.3405, + "step": 10518 + }, + { + "epoch": 0.7060165766249454, + "grad_norm": 4.253728866577148, + "learning_rate": 7.504391086796186e-05, + "loss": 2.4267, + "step": 10520 + }, + { + "epoch": 0.7061508003087145, + "grad_norm": 4.390504837036133, + "learning_rate": 7.50345035640549e-05, + "loss": 2.4915, + "step": 10522 + }, + { + "epoch": 0.7062850239924835, + "grad_norm": 4.670053005218506, + "learning_rate": 7.502509507731688e-05, + "loss": 2.2865, + "step": 10524 + }, + { + "epoch": 0.7064192476762525, + "grad_norm": 4.8236260414123535, + "learning_rate": 7.501568540819233e-05, + "loss": 2.6274, + "step": 10526 + }, + { + "epoch": 0.7065534713600214, + "grad_norm": 4.4007697105407715, + "learning_rate": 7.500627455712583e-05, + "loss": 2.5421, + "step": 10528 + }, + { + "epoch": 0.7066876950437905, + "grad_norm": 4.444860458374023, + "learning_rate": 7.499686252456205e-05, + "loss": 2.2995, + "step": 10530 + }, + { + "epoch": 0.7068219187275595, + "grad_norm": 4.387526988983154, + "learning_rate": 7.498744931094565e-05, + "loss": 2.4864, + "step": 10532 + }, + { + "epoch": 0.7069561424113284, + "grad_norm": 4.624956130981445, + "learning_rate": 7.497803491672141e-05, + "loss": 2.4401, + "step": 10534 + }, + { + "epoch": 0.7070903660950975, + "grad_norm": 4.780411720275879, + "learning_rate": 7.496861934233414e-05, + "loss": 2.4887, + "step": 10536 + }, + { + "epoch": 0.7072245897788665, + "grad_norm": 4.557697772979736, + "learning_rate": 7.495920258822869e-05, + "loss": 2.5738, + "step": 10538 + }, + { + "epoch": 0.7073588134626355, + "grad_norm": 4.41593599319458, + "learning_rate": 7.494978465485002e-05, + "loss": 2.6504, + "step": 10540 + }, + { + "epoch": 0.7074930371464044, + "grad_norm": 3.5026121139526367, + "learning_rate": 7.494036554264308e-05, + "loss": 2.2839, + "step": 10542 + }, + { + "epoch": 0.7076272608301735, + "grad_norm": 4.216272830963135, + "learning_rate": 7.49309452520529e-05, + "loss": 2.3689, + "step": 10544 + }, + { + "epoch": 0.7077614845139425, + "grad_norm": 4.400782108306885, + "learning_rate": 7.49215237835246e-05, + "loss": 2.584, + "step": 10546 + }, + { + "epoch": 0.7078957081977115, + "grad_norm": 3.6906657218933105, + "learning_rate": 7.49121011375033e-05, + "loss": 2.263, + "step": 10548 + }, + { + "epoch": 0.7080299318814804, + "grad_norm": 4.257205486297607, + "learning_rate": 7.490267731443422e-05, + "loss": 2.2863, + "step": 10550 + }, + { + "epoch": 0.7081641555652495, + "grad_norm": 4.505555629730225, + "learning_rate": 7.489325231476258e-05, + "loss": 2.375, + "step": 10552 + }, + { + "epoch": 0.7082983792490185, + "grad_norm": 4.901331901550293, + "learning_rate": 7.488382613893372e-05, + "loss": 2.501, + "step": 10554 + }, + { + "epoch": 0.7084326029327875, + "grad_norm": 3.9549202919006348, + "learning_rate": 7.487439878739303e-05, + "loss": 2.3995, + "step": 10556 + }, + { + "epoch": 0.7085668266165565, + "grad_norm": 5.026314735412598, + "learning_rate": 7.486497026058588e-05, + "loss": 2.4103, + "step": 10558 + }, + { + "epoch": 0.7087010503003255, + "grad_norm": 3.8978047370910645, + "learning_rate": 7.485554055895781e-05, + "loss": 2.0687, + "step": 10560 + }, + { + "epoch": 0.7088352739840945, + "grad_norm": 4.228473663330078, + "learning_rate": 7.484610968295431e-05, + "loss": 2.4663, + "step": 10562 + }, + { + "epoch": 0.7089694976678635, + "grad_norm": 4.546695232391357, + "learning_rate": 7.4836677633021e-05, + "loss": 2.3484, + "step": 10564 + }, + { + "epoch": 0.7091037213516325, + "grad_norm": 5.828413486480713, + "learning_rate": 7.48272444096035e-05, + "loss": 2.3411, + "step": 10566 + }, + { + "epoch": 0.7092379450354015, + "grad_norm": 3.75494122505188, + "learning_rate": 7.481781001314751e-05, + "loss": 2.4714, + "step": 10568 + }, + { + "epoch": 0.7093721687191705, + "grad_norm": 3.827033519744873, + "learning_rate": 7.480837444409882e-05, + "loss": 2.5135, + "step": 10570 + }, + { + "epoch": 0.7095063924029394, + "grad_norm": 4.053363800048828, + "learning_rate": 7.479893770290321e-05, + "loss": 2.4661, + "step": 10572 + }, + { + "epoch": 0.7096406160867085, + "grad_norm": 3.6383848190307617, + "learning_rate": 7.478949979000656e-05, + "loss": 2.2284, + "step": 10574 + }, + { + "epoch": 0.7097748397704775, + "grad_norm": 4.280341148376465, + "learning_rate": 7.47800607058548e-05, + "loss": 2.3708, + "step": 10576 + }, + { + "epoch": 0.7099090634542465, + "grad_norm": 4.358154296875, + "learning_rate": 7.477062045089389e-05, + "loss": 2.5245, + "step": 10578 + }, + { + "epoch": 0.7100432871380155, + "grad_norm": 4.734317302703857, + "learning_rate": 7.476117902556987e-05, + "loss": 2.6667, + "step": 10580 + }, + { + "epoch": 0.7101775108217845, + "grad_norm": 5.22210693359375, + "learning_rate": 7.475173643032882e-05, + "loss": 2.6097, + "step": 10582 + }, + { + "epoch": 0.7103117345055535, + "grad_norm": 4.767031192779541, + "learning_rate": 7.474229266561692e-05, + "loss": 2.7387, + "step": 10584 + }, + { + "epoch": 0.7104459581893225, + "grad_norm": 4.32905387878418, + "learning_rate": 7.473284773188034e-05, + "loss": 2.5586, + "step": 10586 + }, + { + "epoch": 0.7105801818730915, + "grad_norm": 3.8542752265930176, + "learning_rate": 7.472340162956534e-05, + "loss": 2.7024, + "step": 10588 + }, + { + "epoch": 0.7107144055568605, + "grad_norm": 4.212634086608887, + "learning_rate": 7.471395435911822e-05, + "loss": 2.2766, + "step": 10590 + }, + { + "epoch": 0.7108486292406295, + "grad_norm": 8.887304306030273, + "learning_rate": 7.470450592098537e-05, + "loss": 2.3204, + "step": 10592 + }, + { + "epoch": 0.7109828529243986, + "grad_norm": 4.036746025085449, + "learning_rate": 7.469505631561317e-05, + "loss": 2.6044, + "step": 10594 + }, + { + "epoch": 0.7111170766081675, + "grad_norm": 4.680142879486084, + "learning_rate": 7.468560554344814e-05, + "loss": 2.2386, + "step": 10596 + }, + { + "epoch": 0.7112513002919365, + "grad_norm": 3.915767192840576, + "learning_rate": 7.46761536049368e-05, + "loss": 2.3628, + "step": 10598 + }, + { + "epoch": 0.7113855239757055, + "grad_norm": 3.643148899078369, + "learning_rate": 7.46667005005257e-05, + "loss": 2.4249, + "step": 10600 + }, + { + "epoch": 0.7115197476594746, + "grad_norm": 3.8178822994232178, + "learning_rate": 7.465724623066153e-05, + "loss": 2.0555, + "step": 10602 + }, + { + "epoch": 0.7116539713432435, + "grad_norm": 4.450192451477051, + "learning_rate": 7.464779079579094e-05, + "loss": 2.2442, + "step": 10604 + }, + { + "epoch": 0.7117881950270125, + "grad_norm": 3.594052314758301, + "learning_rate": 7.463833419636072e-05, + "loss": 2.2427, + "step": 10606 + }, + { + "epoch": 0.7119224187107815, + "grad_norm": 4.674749851226807, + "learning_rate": 7.462887643281764e-05, + "loss": 2.3401, + "step": 10608 + }, + { + "epoch": 0.7120566423945505, + "grad_norm": 4.26141357421875, + "learning_rate": 7.46194175056086e-05, + "loss": 2.1943, + "step": 10610 + }, + { + "epoch": 0.7121908660783195, + "grad_norm": 4.00946044921875, + "learning_rate": 7.460995741518049e-05, + "loss": 2.4506, + "step": 10612 + }, + { + "epoch": 0.7123250897620885, + "grad_norm": 4.676577568054199, + "learning_rate": 7.460049616198027e-05, + "loss": 2.3817, + "step": 10614 + }, + { + "epoch": 0.7124593134458576, + "grad_norm": 4.068159103393555, + "learning_rate": 7.4591033746455e-05, + "loss": 2.3598, + "step": 10616 + }, + { + "epoch": 0.7125935371296265, + "grad_norm": 3.657719373703003, + "learning_rate": 7.458157016905173e-05, + "loss": 2.393, + "step": 10618 + }, + { + "epoch": 0.7127277608133955, + "grad_norm": 3.7355082035064697, + "learning_rate": 7.45721054302176e-05, + "loss": 2.2839, + "step": 10620 + }, + { + "epoch": 0.7128619844971645, + "grad_norm": 5.574342250823975, + "learning_rate": 7.456263953039984e-05, + "loss": 2.523, + "step": 10622 + }, + { + "epoch": 0.7129962081809336, + "grad_norm": 4.034138202667236, + "learning_rate": 7.455317247004563e-05, + "loss": 2.3185, + "step": 10624 + }, + { + "epoch": 0.7131304318647025, + "grad_norm": 4.019510269165039, + "learning_rate": 7.454370424960231e-05, + "loss": 2.5146, + "step": 10626 + }, + { + "epoch": 0.7132646555484715, + "grad_norm": 4.504369258880615, + "learning_rate": 7.453423486951723e-05, + "loss": 2.6264, + "step": 10628 + }, + { + "epoch": 0.7133988792322405, + "grad_norm": 4.075549125671387, + "learning_rate": 7.45247643302378e-05, + "loss": 2.4659, + "step": 10630 + }, + { + "epoch": 0.7135331029160096, + "grad_norm": 4.263719081878662, + "learning_rate": 7.451529263221147e-05, + "loss": 2.552, + "step": 10632 + }, + { + "epoch": 0.7136673265997785, + "grad_norm": 3.98980450630188, + "learning_rate": 7.450581977588577e-05, + "loss": 2.508, + "step": 10634 + }, + { + "epoch": 0.7138015502835475, + "grad_norm": 8.877470016479492, + "learning_rate": 7.44963457617083e-05, + "loss": 2.4991, + "step": 10636 + }, + { + "epoch": 0.7139357739673166, + "grad_norm": 3.8702170848846436, + "learning_rate": 7.448687059012665e-05, + "loss": 2.4122, + "step": 10638 + }, + { + "epoch": 0.7140699976510856, + "grad_norm": 3.7633845806121826, + "learning_rate": 7.44773942615885e-05, + "loss": 2.2158, + "step": 10640 + }, + { + "epoch": 0.7142042213348545, + "grad_norm": 4.470631122589111, + "learning_rate": 7.446791677654162e-05, + "loss": 2.6121, + "step": 10642 + }, + { + "epoch": 0.7143384450186235, + "grad_norm": 3.879288673400879, + "learning_rate": 7.445843813543379e-05, + "loss": 2.2024, + "step": 10644 + }, + { + "epoch": 0.7144726687023926, + "grad_norm": 3.9044554233551025, + "learning_rate": 7.444895833871283e-05, + "loss": 2.8168, + "step": 10646 + }, + { + "epoch": 0.7146068923861615, + "grad_norm": 4.369875907897949, + "learning_rate": 7.443947738682668e-05, + "loss": 2.9716, + "step": 10648 + }, + { + "epoch": 0.7147411160699305, + "grad_norm": 4.864446640014648, + "learning_rate": 7.44299952802233e-05, + "loss": 2.6239, + "step": 10650 + }, + { + "epoch": 0.7148753397536995, + "grad_norm": 4.130701065063477, + "learning_rate": 7.442051201935065e-05, + "loss": 2.4306, + "step": 10652 + }, + { + "epoch": 0.7150095634374686, + "grad_norm": 4.29925537109375, + "learning_rate": 7.441102760465686e-05, + "loss": 2.4521, + "step": 10654 + }, + { + "epoch": 0.7151437871212375, + "grad_norm": 5.307042598724365, + "learning_rate": 7.440154203658999e-05, + "loss": 2.6073, + "step": 10656 + }, + { + "epoch": 0.7152780108050065, + "grad_norm": 3.968083620071411, + "learning_rate": 7.439205531559825e-05, + "loss": 2.5378, + "step": 10658 + }, + { + "epoch": 0.7154122344887756, + "grad_norm": 3.539463520050049, + "learning_rate": 7.438256744212987e-05, + "loss": 2.4452, + "step": 10660 + }, + { + "epoch": 0.7155464581725446, + "grad_norm": 3.9558801651000977, + "learning_rate": 7.437307841663312e-05, + "loss": 2.4684, + "step": 10662 + }, + { + "epoch": 0.7156806818563135, + "grad_norm": 3.8848776817321777, + "learning_rate": 7.436358823955634e-05, + "loss": 2.3796, + "step": 10664 + }, + { + "epoch": 0.7158149055400825, + "grad_norm": 4.504702091217041, + "learning_rate": 7.435409691134792e-05, + "loss": 2.3581, + "step": 10666 + }, + { + "epoch": 0.7159491292238516, + "grad_norm": 5.330410480499268, + "learning_rate": 7.434460443245632e-05, + "loss": 2.6184, + "step": 10668 + }, + { + "epoch": 0.7160833529076206, + "grad_norm": 4.467921257019043, + "learning_rate": 7.433511080333004e-05, + "loss": 2.4418, + "step": 10670 + }, + { + "epoch": 0.7162175765913895, + "grad_norm": 4.01969575881958, + "learning_rate": 7.43256160244176e-05, + "loss": 2.235, + "step": 10672 + }, + { + "epoch": 0.7163518002751585, + "grad_norm": 3.606708288192749, + "learning_rate": 7.431612009616767e-05, + "loss": 2.3923, + "step": 10674 + }, + { + "epoch": 0.7164860239589276, + "grad_norm": 4.676477909088135, + "learning_rate": 7.430662301902885e-05, + "loss": 2.4956, + "step": 10676 + }, + { + "epoch": 0.7166202476426966, + "grad_norm": 4.6230597496032715, + "learning_rate": 7.429712479344992e-05, + "loss": 2.3475, + "step": 10678 + }, + { + "epoch": 0.7167544713264655, + "grad_norm": 4.161017417907715, + "learning_rate": 7.42876254198796e-05, + "loss": 2.6736, + "step": 10680 + }, + { + "epoch": 0.7168886950102346, + "grad_norm": 3.6622507572174072, + "learning_rate": 7.427812489876674e-05, + "loss": 2.196, + "step": 10682 + }, + { + "epoch": 0.7170229186940036, + "grad_norm": 5.00488805770874, + "learning_rate": 7.426862323056023e-05, + "loss": 2.5027, + "step": 10684 + }, + { + "epoch": 0.7171571423777725, + "grad_norm": 3.742720365524292, + "learning_rate": 7.425912041570899e-05, + "loss": 2.304, + "step": 10686 + }, + { + "epoch": 0.7172913660615415, + "grad_norm": 4.101393222808838, + "learning_rate": 7.424961645466202e-05, + "loss": 2.4603, + "step": 10688 + }, + { + "epoch": 0.7174255897453106, + "grad_norm": 4.340503692626953, + "learning_rate": 7.424011134786835e-05, + "loss": 2.3578, + "step": 10690 + }, + { + "epoch": 0.7175598134290796, + "grad_norm": 4.045363903045654, + "learning_rate": 7.423060509577707e-05, + "loss": 2.4132, + "step": 10692 + }, + { + "epoch": 0.7176940371128485, + "grad_norm": 3.9351680278778076, + "learning_rate": 7.422109769883738e-05, + "loss": 2.2412, + "step": 10694 + }, + { + "epoch": 0.7178282607966175, + "grad_norm": 3.7238423824310303, + "learning_rate": 7.421158915749842e-05, + "loss": 2.3876, + "step": 10696 + }, + { + "epoch": 0.7179624844803866, + "grad_norm": 3.994783878326416, + "learning_rate": 7.42020794722095e-05, + "loss": 2.2071, + "step": 10698 + }, + { + "epoch": 0.7180967081641556, + "grad_norm": 3.810760259628296, + "learning_rate": 7.419256864341992e-05, + "loss": 2.399, + "step": 10700 + }, + { + "epoch": 0.7182309318479245, + "grad_norm": 3.9990618228912354, + "learning_rate": 7.418305667157903e-05, + "loss": 2.3401, + "step": 10702 + }, + { + "epoch": 0.7183651555316936, + "grad_norm": 4.310705184936523, + "learning_rate": 7.417354355713627e-05, + "loss": 2.6845, + "step": 10704 + }, + { + "epoch": 0.7184993792154626, + "grad_norm": 4.311987400054932, + "learning_rate": 7.416402930054111e-05, + "loss": 2.4088, + "step": 10706 + }, + { + "epoch": 0.7186336028992316, + "grad_norm": 4.549762725830078, + "learning_rate": 7.415451390224309e-05, + "loss": 2.343, + "step": 10708 + }, + { + "epoch": 0.7187678265830005, + "grad_norm": 3.7910077571868896, + "learning_rate": 7.414499736269178e-05, + "loss": 2.3643, + "step": 10710 + }, + { + "epoch": 0.7189020502667696, + "grad_norm": 3.948997974395752, + "learning_rate": 7.413547968233684e-05, + "loss": 2.3316, + "step": 10712 + }, + { + "epoch": 0.7190362739505386, + "grad_norm": 3.9703781604766846, + "learning_rate": 7.412596086162793e-05, + "loss": 2.7161, + "step": 10714 + }, + { + "epoch": 0.7191704976343076, + "grad_norm": 3.899873733520508, + "learning_rate": 7.411644090101481e-05, + "loss": 2.2835, + "step": 10716 + }, + { + "epoch": 0.7193047213180765, + "grad_norm": 3.772862672805786, + "learning_rate": 7.410691980094728e-05, + "loss": 2.4461, + "step": 10718 + }, + { + "epoch": 0.7194389450018456, + "grad_norm": 4.3151116371154785, + "learning_rate": 7.409739756187519e-05, + "loss": 2.4873, + "step": 10720 + }, + { + "epoch": 0.7195731686856146, + "grad_norm": 4.18092679977417, + "learning_rate": 7.408787418424848e-05, + "loss": 2.5711, + "step": 10722 + }, + { + "epoch": 0.7197073923693835, + "grad_norm": 4.09240198135376, + "learning_rate": 7.407834966851705e-05, + "loss": 2.3022, + "step": 10724 + }, + { + "epoch": 0.7198416160531526, + "grad_norm": 4.113024711608887, + "learning_rate": 7.406882401513096e-05, + "loss": 2.4251, + "step": 10726 + }, + { + "epoch": 0.7199758397369216, + "grad_norm": 4.046052932739258, + "learning_rate": 7.405929722454026e-05, + "loss": 2.618, + "step": 10728 + }, + { + "epoch": 0.7201100634206906, + "grad_norm": 4.328642845153809, + "learning_rate": 7.404976929719507e-05, + "loss": 2.3123, + "step": 10730 + }, + { + "epoch": 0.7202442871044595, + "grad_norm": 3.8210508823394775, + "learning_rate": 7.404024023354558e-05, + "loss": 2.5126, + "step": 10732 + }, + { + "epoch": 0.7203785107882286, + "grad_norm": 4.068430423736572, + "learning_rate": 7.4030710034042e-05, + "loss": 2.3137, + "step": 10734 + }, + { + "epoch": 0.7205127344719976, + "grad_norm": 4.189883232116699, + "learning_rate": 7.402117869913465e-05, + "loss": 2.5813, + "step": 10736 + }, + { + "epoch": 0.7206469581557666, + "grad_norm": 4.224363803863525, + "learning_rate": 7.401164622927382e-05, + "loss": 2.4847, + "step": 10738 + }, + { + "epoch": 0.7207811818395355, + "grad_norm": 4.162548542022705, + "learning_rate": 7.400211262490994e-05, + "loss": 2.5527, + "step": 10740 + }, + { + "epoch": 0.7209154055233046, + "grad_norm": 4.02898645401001, + "learning_rate": 7.399257788649342e-05, + "loss": 2.3966, + "step": 10742 + }, + { + "epoch": 0.7210496292070736, + "grad_norm": 4.710583209991455, + "learning_rate": 7.398304201447478e-05, + "loss": 2.3055, + "step": 10744 + }, + { + "epoch": 0.7211838528908426, + "grad_norm": 4.615324974060059, + "learning_rate": 7.397350500930458e-05, + "loss": 2.3232, + "step": 10746 + }, + { + "epoch": 0.7213180765746116, + "grad_norm": 4.242539405822754, + "learning_rate": 7.39639668714334e-05, + "loss": 2.4382, + "step": 10748 + }, + { + "epoch": 0.7214523002583806, + "grad_norm": 4.6566162109375, + "learning_rate": 7.395442760131192e-05, + "loss": 2.6285, + "step": 10750 + }, + { + "epoch": 0.7215865239421496, + "grad_norm": 3.800720691680908, + "learning_rate": 7.394488719939081e-05, + "loss": 2.4278, + "step": 10752 + }, + { + "epoch": 0.7217207476259186, + "grad_norm": 4.202616214752197, + "learning_rate": 7.393534566612089e-05, + "loss": 2.5596, + "step": 10754 + }, + { + "epoch": 0.7218549713096876, + "grad_norm": 3.974564790725708, + "learning_rate": 7.392580300195296e-05, + "loss": 2.2352, + "step": 10756 + }, + { + "epoch": 0.7219891949934566, + "grad_norm": 4.149641513824463, + "learning_rate": 7.391625920733786e-05, + "loss": 2.4271, + "step": 10758 + }, + { + "epoch": 0.7221234186772256, + "grad_norm": 4.326314926147461, + "learning_rate": 7.390671428272655e-05, + "loss": 2.5679, + "step": 10760 + }, + { + "epoch": 0.7222576423609945, + "grad_norm": 3.510146379470825, + "learning_rate": 7.389716822857e-05, + "loss": 2.2699, + "step": 10762 + }, + { + "epoch": 0.7223918660447636, + "grad_norm": 4.29338264465332, + "learning_rate": 7.388762104531925e-05, + "loss": 2.5263, + "step": 10764 + }, + { + "epoch": 0.7225260897285326, + "grad_norm": 3.9356846809387207, + "learning_rate": 7.387807273342539e-05, + "loss": 2.4411, + "step": 10766 + }, + { + "epoch": 0.7226603134123016, + "grad_norm": 3.748566150665283, + "learning_rate": 7.386852329333953e-05, + "loss": 2.434, + "step": 10768 + }, + { + "epoch": 0.7227945370960706, + "grad_norm": 4.548520088195801, + "learning_rate": 7.385897272551287e-05, + "loss": 2.303, + "step": 10770 + }, + { + "epoch": 0.7229287607798396, + "grad_norm": 3.7700064182281494, + "learning_rate": 7.38494210303967e-05, + "loss": 2.4461, + "step": 10772 + }, + { + "epoch": 0.7230629844636086, + "grad_norm": 4.502312183380127, + "learning_rate": 7.383986820844226e-05, + "loss": 2.3954, + "step": 10774 + }, + { + "epoch": 0.7231972081473776, + "grad_norm": 4.548856258392334, + "learning_rate": 7.383031426010092e-05, + "loss": 2.4336, + "step": 10776 + }, + { + "epoch": 0.7233314318311466, + "grad_norm": 4.230014324188232, + "learning_rate": 7.38207591858241e-05, + "loss": 2.3543, + "step": 10778 + }, + { + "epoch": 0.7234656555149156, + "grad_norm": 4.921919822692871, + "learning_rate": 7.381120298606325e-05, + "loss": 2.2988, + "step": 10780 + }, + { + "epoch": 0.7235998791986846, + "grad_norm": 4.318753719329834, + "learning_rate": 7.380164566126989e-05, + "loss": 2.34, + "step": 10782 + }, + { + "epoch": 0.7237341028824537, + "grad_norm": 4.386026859283447, + "learning_rate": 7.379208721189557e-05, + "loss": 2.517, + "step": 10784 + }, + { + "epoch": 0.7238683265662226, + "grad_norm": 4.080481052398682, + "learning_rate": 7.37825276383919e-05, + "loss": 2.3861, + "step": 10786 + }, + { + "epoch": 0.7240025502499916, + "grad_norm": 3.7138774394989014, + "learning_rate": 7.377296694121058e-05, + "loss": 2.3513, + "step": 10788 + }, + { + "epoch": 0.7241367739337606, + "grad_norm": 3.6630241870880127, + "learning_rate": 7.376340512080334e-05, + "loss": 2.613, + "step": 10790 + }, + { + "epoch": 0.7242709976175297, + "grad_norm": 3.7827651500701904, + "learning_rate": 7.375384217762191e-05, + "loss": 2.3283, + "step": 10792 + }, + { + "epoch": 0.7244052213012986, + "grad_norm": 5.365560054779053, + "learning_rate": 7.374427811211815e-05, + "loss": 2.5332, + "step": 10794 + }, + { + "epoch": 0.7245394449850676, + "grad_norm": 4.338688373565674, + "learning_rate": 7.373471292474393e-05, + "loss": 2.5846, + "step": 10796 + }, + { + "epoch": 0.7246736686688366, + "grad_norm": 3.666334629058838, + "learning_rate": 7.372514661595122e-05, + "loss": 2.1723, + "step": 10798 + }, + { + "epoch": 0.7248078923526056, + "grad_norm": 4.048376560211182, + "learning_rate": 7.371557918619198e-05, + "loss": 2.2656, + "step": 10800 + }, + { + "epoch": 0.7249421160363746, + "grad_norm": 4.2108073234558105, + "learning_rate": 7.370601063591824e-05, + "loss": 2.4335, + "step": 10802 + }, + { + "epoch": 0.7250763397201436, + "grad_norm": 3.78383469581604, + "learning_rate": 7.369644096558213e-05, + "loss": 2.3186, + "step": 10804 + }, + { + "epoch": 0.7252105634039127, + "grad_norm": 4.10956335067749, + "learning_rate": 7.368687017563578e-05, + "loss": 2.5118, + "step": 10806 + }, + { + "epoch": 0.7253447870876816, + "grad_norm": 3.7950096130371094, + "learning_rate": 7.36772982665314e-05, + "loss": 2.4203, + "step": 10808 + }, + { + "epoch": 0.7254790107714506, + "grad_norm": 3.9439592361450195, + "learning_rate": 7.366772523872122e-05, + "loss": 2.4542, + "step": 10810 + }, + { + "epoch": 0.7256132344552196, + "grad_norm": 4.466374397277832, + "learning_rate": 7.365815109265757e-05, + "loss": 2.5833, + "step": 10812 + }, + { + "epoch": 0.7257474581389887, + "grad_norm": 3.8481647968292236, + "learning_rate": 7.36485758287928e-05, + "loss": 2.2637, + "step": 10814 + }, + { + "epoch": 0.7258816818227576, + "grad_norm": 4.284858226776123, + "learning_rate": 7.363899944757935e-05, + "loss": 2.2915, + "step": 10816 + }, + { + "epoch": 0.7260159055065266, + "grad_norm": 4.342136859893799, + "learning_rate": 7.362942194946962e-05, + "loss": 2.4316, + "step": 10818 + }, + { + "epoch": 0.7261501291902956, + "grad_norm": 5.854531288146973, + "learning_rate": 7.361984333491618e-05, + "loss": 2.3215, + "step": 10820 + }, + { + "epoch": 0.7262843528740647, + "grad_norm": 3.8914694786071777, + "learning_rate": 7.361026360437162e-05, + "loss": 2.2961, + "step": 10822 + }, + { + "epoch": 0.7264185765578336, + "grad_norm": 3.6100566387176514, + "learning_rate": 7.360068275828849e-05, + "loss": 2.2794, + "step": 10824 + }, + { + "epoch": 0.7265528002416026, + "grad_norm": 4.6159892082214355, + "learning_rate": 7.359110079711953e-05, + "loss": 2.5313, + "step": 10826 + }, + { + "epoch": 0.7266870239253717, + "grad_norm": 3.967874526977539, + "learning_rate": 7.358151772131744e-05, + "loss": 2.4356, + "step": 10828 + }, + { + "epoch": 0.7268212476091407, + "grad_norm": 4.602555751800537, + "learning_rate": 7.357193353133503e-05, + "loss": 2.6472, + "step": 10830 + }, + { + "epoch": 0.7269554712929096, + "grad_norm": 4.193561553955078, + "learning_rate": 7.356234822762508e-05, + "loss": 2.3831, + "step": 10832 + }, + { + "epoch": 0.7270896949766786, + "grad_norm": 4.506980895996094, + "learning_rate": 7.355276181064052e-05, + "loss": 2.2939, + "step": 10834 + }, + { + "epoch": 0.7272239186604477, + "grad_norm": 5.14094877243042, + "learning_rate": 7.354317428083428e-05, + "loss": 2.7552, + "step": 10836 + }, + { + "epoch": 0.7273581423442166, + "grad_norm": 4.040049076080322, + "learning_rate": 7.353358563865935e-05, + "loss": 2.3443, + "step": 10838 + }, + { + "epoch": 0.7274923660279856, + "grad_norm": 4.344991683959961, + "learning_rate": 7.35239958845688e-05, + "loss": 2.3336, + "step": 10840 + }, + { + "epoch": 0.7276265897117546, + "grad_norm": 4.047455787658691, + "learning_rate": 7.351440501901567e-05, + "loss": 2.1983, + "step": 10842 + }, + { + "epoch": 0.7277608133955237, + "grad_norm": 4.834317684173584, + "learning_rate": 7.350481304245315e-05, + "loss": 2.5797, + "step": 10844 + }, + { + "epoch": 0.7278950370792926, + "grad_norm": 4.029971599578857, + "learning_rate": 7.349521995533444e-05, + "loss": 2.1772, + "step": 10846 + }, + { + "epoch": 0.7280292607630616, + "grad_norm": 4.007789611816406, + "learning_rate": 7.348562575811279e-05, + "loss": 2.6131, + "step": 10848 + }, + { + "epoch": 0.7281634844468307, + "grad_norm": 7.776440143585205, + "learning_rate": 7.347603045124149e-05, + "loss": 2.3371, + "step": 10850 + }, + { + "epoch": 0.7282977081305997, + "grad_norm": 7.322795391082764, + "learning_rate": 7.346643403517394e-05, + "loss": 2.4333, + "step": 10852 + }, + { + "epoch": 0.7284319318143686, + "grad_norm": 4.771570205688477, + "learning_rate": 7.345683651036351e-05, + "loss": 2.5278, + "step": 10854 + }, + { + "epoch": 0.7285661554981376, + "grad_norm": 3.980994462966919, + "learning_rate": 7.344723787726368e-05, + "loss": 2.1994, + "step": 10856 + }, + { + "epoch": 0.7287003791819067, + "grad_norm": 4.092049598693848, + "learning_rate": 7.343763813632798e-05, + "loss": 2.3809, + "step": 10858 + }, + { + "epoch": 0.7288346028656757, + "grad_norm": 4.523285388946533, + "learning_rate": 7.342803728800995e-05, + "loss": 2.5612, + "step": 10860 + }, + { + "epoch": 0.7289688265494446, + "grad_norm": 4.068786144256592, + "learning_rate": 7.341843533276323e-05, + "loss": 2.225, + "step": 10862 + }, + { + "epoch": 0.7291030502332136, + "grad_norm": 4.431085109710693, + "learning_rate": 7.340883227104151e-05, + "loss": 2.3359, + "step": 10864 + }, + { + "epoch": 0.7292372739169827, + "grad_norm": 3.9813270568847656, + "learning_rate": 7.339922810329846e-05, + "loss": 2.166, + "step": 10866 + }, + { + "epoch": 0.7293714976007517, + "grad_norm": 4.519051551818848, + "learning_rate": 7.338962282998794e-05, + "loss": 2.3678, + "step": 10868 + }, + { + "epoch": 0.7295057212845206, + "grad_norm": 6.508392810821533, + "learning_rate": 7.33800164515637e-05, + "loss": 2.2911, + "step": 10870 + }, + { + "epoch": 0.7296399449682897, + "grad_norm": 4.4188313484191895, + "learning_rate": 7.337040896847967e-05, + "loss": 2.2801, + "step": 10872 + }, + { + "epoch": 0.7297741686520587, + "grad_norm": 4.097393035888672, + "learning_rate": 7.336080038118978e-05, + "loss": 2.255, + "step": 10874 + }, + { + "epoch": 0.7299083923358276, + "grad_norm": 3.981466770172119, + "learning_rate": 7.335119069014798e-05, + "loss": 2.3566, + "step": 10876 + }, + { + "epoch": 0.7300426160195966, + "grad_norm": 3.910905599594116, + "learning_rate": 7.334157989580838e-05, + "loss": 2.3852, + "step": 10878 + }, + { + "epoch": 0.7301768397033657, + "grad_norm": 6.127138137817383, + "learning_rate": 7.333196799862499e-05, + "loss": 2.4788, + "step": 10880 + }, + { + "epoch": 0.7303110633871347, + "grad_norm": 3.6178231239318848, + "learning_rate": 7.332235499905202e-05, + "loss": 2.4269, + "step": 10882 + }, + { + "epoch": 0.7304452870709036, + "grad_norm": 3.6302647590637207, + "learning_rate": 7.331274089754363e-05, + "loss": 2.3051, + "step": 10884 + }, + { + "epoch": 0.7305795107546726, + "grad_norm": 4.403247356414795, + "learning_rate": 7.330312569455408e-05, + "loss": 2.2963, + "step": 10886 + }, + { + "epoch": 0.7307137344384417, + "grad_norm": 3.771132230758667, + "learning_rate": 7.329350939053766e-05, + "loss": 2.1423, + "step": 10888 + }, + { + "epoch": 0.7308479581222107, + "grad_norm": 4.342030048370361, + "learning_rate": 7.328389198594872e-05, + "loss": 2.4648, + "step": 10890 + }, + { + "epoch": 0.7309821818059796, + "grad_norm": 4.26757287979126, + "learning_rate": 7.327427348124167e-05, + "loss": 2.4298, + "step": 10892 + }, + { + "epoch": 0.7311164054897487, + "grad_norm": 4.003725051879883, + "learning_rate": 7.326465387687097e-05, + "loss": 2.1302, + "step": 10894 + }, + { + "epoch": 0.7312506291735177, + "grad_norm": 4.64120626449585, + "learning_rate": 7.325503317329112e-05, + "loss": 2.171, + "step": 10896 + }, + { + "epoch": 0.7313848528572867, + "grad_norm": 3.9790279865264893, + "learning_rate": 7.324541137095669e-05, + "loss": 2.4607, + "step": 10898 + }, + { + "epoch": 0.7315190765410556, + "grad_norm": 4.35110330581665, + "learning_rate": 7.323578847032226e-05, + "loss": 2.5719, + "step": 10900 + }, + { + "epoch": 0.7316533002248247, + "grad_norm": 4.294532299041748, + "learning_rate": 7.322616447184254e-05, + "loss": 2.6803, + "step": 10902 + }, + { + "epoch": 0.7317875239085937, + "grad_norm": 3.8902876377105713, + "learning_rate": 7.321653937597222e-05, + "loss": 2.1858, + "step": 10904 + }, + { + "epoch": 0.7319217475923627, + "grad_norm": 4.2639641761779785, + "learning_rate": 7.320691318316606e-05, + "loss": 2.4674, + "step": 10906 + }, + { + "epoch": 0.7320559712761316, + "grad_norm": 4.816429615020752, + "learning_rate": 7.319728589387888e-05, + "loss": 2.4746, + "step": 10908 + }, + { + "epoch": 0.7321901949599007, + "grad_norm": 3.569833517074585, + "learning_rate": 7.318765750856555e-05, + "loss": 2.4532, + "step": 10910 + }, + { + "epoch": 0.7323244186436697, + "grad_norm": 4.893139839172363, + "learning_rate": 7.317802802768102e-05, + "loss": 2.4305, + "step": 10912 + }, + { + "epoch": 0.7324586423274386, + "grad_norm": 3.766443967819214, + "learning_rate": 7.316839745168024e-05, + "loss": 2.3146, + "step": 10914 + }, + { + "epoch": 0.7325928660112077, + "grad_norm": 4.885905742645264, + "learning_rate": 7.315876578101823e-05, + "loss": 2.4498, + "step": 10916 + }, + { + "epoch": 0.7327270896949767, + "grad_norm": 4.003500461578369, + "learning_rate": 7.314913301615008e-05, + "loss": 2.3216, + "step": 10918 + }, + { + "epoch": 0.7328613133787457, + "grad_norm": 3.874864101409912, + "learning_rate": 7.313949915753093e-05, + "loss": 2.1915, + "step": 10920 + }, + { + "epoch": 0.7329955370625146, + "grad_norm": 5.9848151206970215, + "learning_rate": 7.312986420561593e-05, + "loss": 2.1931, + "step": 10922 + }, + { + "epoch": 0.7331297607462837, + "grad_norm": 3.9558935165405273, + "learning_rate": 7.312022816086033e-05, + "loss": 2.3592, + "step": 10924 + }, + { + "epoch": 0.7332639844300527, + "grad_norm": 3.498685836791992, + "learning_rate": 7.311059102371942e-05, + "loss": 2.2091, + "step": 10926 + }, + { + "epoch": 0.7333982081138217, + "grad_norm": 4.744442462921143, + "learning_rate": 7.310095279464852e-05, + "loss": 2.5766, + "step": 10928 + }, + { + "epoch": 0.7335324317975906, + "grad_norm": 4.492681980133057, + "learning_rate": 7.309131347410303e-05, + "loss": 2.4674, + "step": 10930 + }, + { + "epoch": 0.7336666554813597, + "grad_norm": 3.8054258823394775, + "learning_rate": 7.308167306253839e-05, + "loss": 2.5546, + "step": 10932 + }, + { + "epoch": 0.7338008791651287, + "grad_norm": 4.617919921875, + "learning_rate": 7.30720315604101e-05, + "loss": 2.5489, + "step": 10934 + }, + { + "epoch": 0.7339351028488977, + "grad_norm": 4.0439558029174805, + "learning_rate": 7.306238896817366e-05, + "loss": 2.2752, + "step": 10936 + }, + { + "epoch": 0.7340693265326667, + "grad_norm": 4.25284481048584, + "learning_rate": 7.305274528628469e-05, + "loss": 2.3661, + "step": 10938 + }, + { + "epoch": 0.7342035502164357, + "grad_norm": 4.237573146820068, + "learning_rate": 7.304310051519886e-05, + "loss": 2.5364, + "step": 10940 + }, + { + "epoch": 0.7343377739002047, + "grad_norm": 4.189595699310303, + "learning_rate": 7.303345465537184e-05, + "loss": 2.3883, + "step": 10942 + }, + { + "epoch": 0.7344719975839737, + "grad_norm": 3.8806605339050293, + "learning_rate": 7.302380770725937e-05, + "loss": 2.5074, + "step": 10944 + }, + { + "epoch": 0.7346062212677427, + "grad_norm": 4.180690288543701, + "learning_rate": 7.301415967131727e-05, + "loss": 2.7015, + "step": 10946 + }, + { + "epoch": 0.7347404449515117, + "grad_norm": 4.185053825378418, + "learning_rate": 7.300451054800137e-05, + "loss": 2.7117, + "step": 10948 + }, + { + "epoch": 0.7348746686352807, + "grad_norm": 4.32503604888916, + "learning_rate": 7.29948603377676e-05, + "loss": 2.5089, + "step": 10950 + }, + { + "epoch": 0.7350088923190496, + "grad_norm": 4.06556510925293, + "learning_rate": 7.298520904107187e-05, + "loss": 2.1503, + "step": 10952 + }, + { + "epoch": 0.7351431160028187, + "grad_norm": 3.818847179412842, + "learning_rate": 7.297555665837024e-05, + "loss": 2.5723, + "step": 10954 + }, + { + "epoch": 0.7352773396865877, + "grad_norm": 4.0330705642700195, + "learning_rate": 7.296590319011871e-05, + "loss": 2.3972, + "step": 10956 + }, + { + "epoch": 0.7354115633703567, + "grad_norm": 3.859311819076538, + "learning_rate": 7.295624863677343e-05, + "loss": 2.5587, + "step": 10958 + }, + { + "epoch": 0.7355457870541257, + "grad_norm": 3.879464864730835, + "learning_rate": 7.294659299879054e-05, + "loss": 2.2533, + "step": 10960 + }, + { + "epoch": 0.7356800107378947, + "grad_norm": 4.655669689178467, + "learning_rate": 7.293693627662625e-05, + "loss": 2.1137, + "step": 10962 + }, + { + "epoch": 0.7358142344216637, + "grad_norm": 4.182253837585449, + "learning_rate": 7.292727847073684e-05, + "loss": 2.3016, + "step": 10964 + }, + { + "epoch": 0.7359484581054327, + "grad_norm": 4.5778069496154785, + "learning_rate": 7.29176195815786e-05, + "loss": 2.2696, + "step": 10966 + }, + { + "epoch": 0.7360826817892017, + "grad_norm": 4.306759834289551, + "learning_rate": 7.29079596096079e-05, + "loss": 2.4431, + "step": 10968 + }, + { + "epoch": 0.7362169054729707, + "grad_norm": 3.8768017292022705, + "learning_rate": 7.289829855528114e-05, + "loss": 2.3681, + "step": 10970 + }, + { + "epoch": 0.7363511291567397, + "grad_norm": 4.522663116455078, + "learning_rate": 7.288863641905481e-05, + "loss": 2.2571, + "step": 10972 + }, + { + "epoch": 0.7364853528405088, + "grad_norm": 3.3957276344299316, + "learning_rate": 7.287897320138542e-05, + "loss": 2.2477, + "step": 10974 + }, + { + "epoch": 0.7366195765242777, + "grad_norm": 4.140124797821045, + "learning_rate": 7.286930890272954e-05, + "loss": 2.3892, + "step": 10976 + }, + { + "epoch": 0.7367538002080467, + "grad_norm": 4.415163993835449, + "learning_rate": 7.285964352354378e-05, + "loss": 2.2949, + "step": 10978 + }, + { + "epoch": 0.7368880238918157, + "grad_norm": 3.912407875061035, + "learning_rate": 7.284997706428482e-05, + "loss": 2.4809, + "step": 10980 + }, + { + "epoch": 0.7370222475755848, + "grad_norm": 4.295978546142578, + "learning_rate": 7.284030952540937e-05, + "loss": 2.445, + "step": 10982 + }, + { + "epoch": 0.7371564712593537, + "grad_norm": 4.556746006011963, + "learning_rate": 7.28306409073742e-05, + "loss": 2.4859, + "step": 10984 + }, + { + "epoch": 0.7372906949431227, + "grad_norm": 3.715505838394165, + "learning_rate": 7.282097121063616e-05, + "loss": 2.4846, + "step": 10986 + }, + { + "epoch": 0.7374249186268917, + "grad_norm": 3.9189414978027344, + "learning_rate": 7.28113004356521e-05, + "loss": 2.3724, + "step": 10988 + }, + { + "epoch": 0.7375591423106607, + "grad_norm": 4.327106475830078, + "learning_rate": 7.280162858287894e-05, + "loss": 2.5282, + "step": 10990 + }, + { + "epoch": 0.7376933659944297, + "grad_norm": 3.9742214679718018, + "learning_rate": 7.279195565277369e-05, + "loss": 2.4264, + "step": 10992 + }, + { + "epoch": 0.7378275896781987, + "grad_norm": 4.13374137878418, + "learning_rate": 7.278228164579333e-05, + "loss": 2.4845, + "step": 10994 + }, + { + "epoch": 0.7379618133619678, + "grad_norm": 4.329884052276611, + "learning_rate": 7.277260656239497e-05, + "loss": 2.5174, + "step": 10996 + }, + { + "epoch": 0.7380960370457367, + "grad_norm": 3.9397120475769043, + "learning_rate": 7.276293040303573e-05, + "loss": 2.4219, + "step": 10998 + }, + { + "epoch": 0.7382302607295057, + "grad_norm": 4.054099082946777, + "learning_rate": 7.275325316817279e-05, + "loss": 2.4492, + "step": 11000 + }, + { + "epoch": 0.7383644844132747, + "grad_norm": 4.295323371887207, + "learning_rate": 7.274357485826339e-05, + "loss": 2.4201, + "step": 11002 + }, + { + "epoch": 0.7384987080970438, + "grad_norm": 4.2904744148254395, + "learning_rate": 7.27338954737648e-05, + "loss": 2.5129, + "step": 11004 + }, + { + "epoch": 0.7386329317808127, + "grad_norm": 4.5226359367370605, + "learning_rate": 7.272421501513434e-05, + "loss": 2.5123, + "step": 11006 + }, + { + "epoch": 0.7387671554645817, + "grad_norm": 4.290727138519287, + "learning_rate": 7.27145334828294e-05, + "loss": 2.4532, + "step": 11008 + }, + { + "epoch": 0.7389013791483507, + "grad_norm": 4.06302547454834, + "learning_rate": 7.270485087730744e-05, + "loss": 2.3576, + "step": 11010 + }, + { + "epoch": 0.7390356028321198, + "grad_norm": 5.573270320892334, + "learning_rate": 7.269516719902591e-05, + "loss": 2.1942, + "step": 11012 + }, + { + "epoch": 0.7391698265158887, + "grad_norm": 3.5449423789978027, + "learning_rate": 7.268548244844236e-05, + "loss": 2.4714, + "step": 11014 + }, + { + "epoch": 0.7393040501996577, + "grad_norm": 3.8782951831817627, + "learning_rate": 7.267579662601439e-05, + "loss": 2.35, + "step": 11016 + }, + { + "epoch": 0.7394382738834268, + "grad_norm": 3.753293514251709, + "learning_rate": 7.266610973219959e-05, + "loss": 2.2992, + "step": 11018 + }, + { + "epoch": 0.7395724975671958, + "grad_norm": 4.448056221008301, + "learning_rate": 7.265642176745571e-05, + "loss": 2.5667, + "step": 11020 + }, + { + "epoch": 0.7397067212509647, + "grad_norm": 5.364463806152344, + "learning_rate": 7.264673273224042e-05, + "loss": 2.5219, + "step": 11022 + }, + { + "epoch": 0.7398409449347337, + "grad_norm": 4.09712553024292, + "learning_rate": 7.263704262701157e-05, + "loss": 2.5206, + "step": 11024 + }, + { + "epoch": 0.7399751686185028, + "grad_norm": 4.2858452796936035, + "learning_rate": 7.262735145222696e-05, + "loss": 2.4326, + "step": 11026 + }, + { + "epoch": 0.7401093923022717, + "grad_norm": 4.155178546905518, + "learning_rate": 7.261765920834447e-05, + "loss": 2.1703, + "step": 11028 + }, + { + "epoch": 0.7402436159860407, + "grad_norm": 3.807370185852051, + "learning_rate": 7.260796589582208e-05, + "loss": 2.2917, + "step": 11030 + }, + { + "epoch": 0.7403778396698097, + "grad_norm": 4.3051252365112305, + "learning_rate": 7.259827151511775e-05, + "loss": 2.4949, + "step": 11032 + }, + { + "epoch": 0.7405120633535788, + "grad_norm": 4.613729000091553, + "learning_rate": 7.258857606668951e-05, + "loss": 2.4687, + "step": 11034 + }, + { + "epoch": 0.7406462870373477, + "grad_norm": 4.275402545928955, + "learning_rate": 7.257887955099551e-05, + "loss": 2.6357, + "step": 11036 + }, + { + "epoch": 0.7407805107211167, + "grad_norm": 3.7861268520355225, + "learning_rate": 7.256918196849382e-05, + "loss": 2.1003, + "step": 11038 + }, + { + "epoch": 0.7409147344048858, + "grad_norm": 4.989249229431152, + "learning_rate": 7.255948331964268e-05, + "loss": 2.4991, + "step": 11040 + }, + { + "epoch": 0.7410489580886548, + "grad_norm": 4.795454502105713, + "learning_rate": 7.25497836049003e-05, + "loss": 2.4985, + "step": 11042 + }, + { + "epoch": 0.7411831817724237, + "grad_norm": 3.9073293209075928, + "learning_rate": 7.2540082824725e-05, + "loss": 2.4688, + "step": 11044 + }, + { + "epoch": 0.7413174054561927, + "grad_norm": 4.53556489944458, + "learning_rate": 7.25303809795751e-05, + "loss": 2.3787, + "step": 11046 + }, + { + "epoch": 0.7414516291399618, + "grad_norm": 4.3630571365356445, + "learning_rate": 7.2520678069909e-05, + "loss": 2.2331, + "step": 11048 + }, + { + "epoch": 0.7415858528237308, + "grad_norm": 4.053533554077148, + "learning_rate": 7.251097409618515e-05, + "loss": 2.6886, + "step": 11050 + }, + { + "epoch": 0.7417200765074997, + "grad_norm": 4.029368877410889, + "learning_rate": 7.250126905886204e-05, + "loss": 2.3022, + "step": 11052 + }, + { + "epoch": 0.7418543001912687, + "grad_norm": 3.671145439147949, + "learning_rate": 7.249156295839824e-05, + "loss": 2.1892, + "step": 11054 + }, + { + "epoch": 0.7419885238750378, + "grad_norm": 4.149932384490967, + "learning_rate": 7.248185579525228e-05, + "loss": 2.2478, + "step": 11056 + }, + { + "epoch": 0.7421227475588068, + "grad_norm": 3.7836241722106934, + "learning_rate": 7.247214756988285e-05, + "loss": 2.3313, + "step": 11058 + }, + { + "epoch": 0.7422569712425757, + "grad_norm": 3.595994234085083, + "learning_rate": 7.246243828274863e-05, + "loss": 2.1348, + "step": 11060 + }, + { + "epoch": 0.7423911949263448, + "grad_norm": 5.218435287475586, + "learning_rate": 7.24527279343084e-05, + "loss": 2.3169, + "step": 11062 + }, + { + "epoch": 0.7425254186101138, + "grad_norm": 4.83737325668335, + "learning_rate": 7.24430165250209e-05, + "loss": 2.2874, + "step": 11064 + }, + { + "epoch": 0.7426596422938827, + "grad_norm": 3.88678240776062, + "learning_rate": 7.2433304055345e-05, + "loss": 2.6483, + "step": 11066 + }, + { + "epoch": 0.7427938659776517, + "grad_norm": 7.5890350341796875, + "learning_rate": 7.242359052573963e-05, + "loss": 2.5544, + "step": 11068 + }, + { + "epoch": 0.7429280896614208, + "grad_norm": 4.342431545257568, + "learning_rate": 7.241387593666368e-05, + "loss": 2.2923, + "step": 11070 + }, + { + "epoch": 0.7430623133451898, + "grad_norm": 4.528657913208008, + "learning_rate": 7.240416028857617e-05, + "loss": 2.3605, + "step": 11072 + }, + { + "epoch": 0.7431965370289587, + "grad_norm": 3.6290102005004883, + "learning_rate": 7.239444358193613e-05, + "loss": 2.5441, + "step": 11074 + }, + { + "epoch": 0.7433307607127277, + "grad_norm": 4.2957682609558105, + "learning_rate": 7.238472581720268e-05, + "loss": 2.5723, + "step": 11076 + }, + { + "epoch": 0.7434649843964968, + "grad_norm": 3.911288261413574, + "learning_rate": 7.237500699483495e-05, + "loss": 2.4119, + "step": 11078 + }, + { + "epoch": 0.7435992080802658, + "grad_norm": 3.708458662033081, + "learning_rate": 7.236528711529213e-05, + "loss": 2.1132, + "step": 11080 + }, + { + "epoch": 0.7437334317640347, + "grad_norm": 3.9929935932159424, + "learning_rate": 7.23555661790335e-05, + "loss": 2.1868, + "step": 11082 + }, + { + "epoch": 0.7438676554478038, + "grad_norm": 3.1504969596862793, + "learning_rate": 7.23458441865183e-05, + "loss": 1.9861, + "step": 11084 + }, + { + "epoch": 0.7440018791315728, + "grad_norm": 4.499558448791504, + "learning_rate": 7.233612113820592e-05, + "loss": 2.6979, + "step": 11086 + }, + { + "epoch": 0.7441361028153418, + "grad_norm": 4.638678550720215, + "learning_rate": 7.232639703455573e-05, + "loss": 2.3383, + "step": 11088 + }, + { + "epoch": 0.7442703264991107, + "grad_norm": 3.7643091678619385, + "learning_rate": 7.231667187602718e-05, + "loss": 2.4827, + "step": 11090 + }, + { + "epoch": 0.7444045501828798, + "grad_norm": 3.954303741455078, + "learning_rate": 7.230694566307978e-05, + "loss": 2.1746, + "step": 11092 + }, + { + "epoch": 0.7445387738666488, + "grad_norm": 4.310286998748779, + "learning_rate": 7.229721839617306e-05, + "loss": 2.3933, + "step": 11094 + }, + { + "epoch": 0.7446729975504178, + "grad_norm": 3.9358222484588623, + "learning_rate": 7.228749007576661e-05, + "loss": 2.6339, + "step": 11096 + }, + { + "epoch": 0.7448072212341867, + "grad_norm": 4.704927921295166, + "learning_rate": 7.227776070232008e-05, + "loss": 2.9006, + "step": 11098 + }, + { + "epoch": 0.7449414449179558, + "grad_norm": 4.024145603179932, + "learning_rate": 7.226803027629316e-05, + "loss": 2.0225, + "step": 11100 + }, + { + "epoch": 0.7450756686017248, + "grad_norm": 4.292037010192871, + "learning_rate": 7.225829879814561e-05, + "loss": 2.3414, + "step": 11102 + }, + { + "epoch": 0.7452098922854937, + "grad_norm": 4.171245098114014, + "learning_rate": 7.22485662683372e-05, + "loss": 2.4089, + "step": 11104 + }, + { + "epoch": 0.7453441159692628, + "grad_norm": 3.7214810848236084, + "learning_rate": 7.223883268732779e-05, + "loss": 2.1963, + "step": 11106 + }, + { + "epoch": 0.7454783396530318, + "grad_norm": 3.96905779838562, + "learning_rate": 7.222909805557726e-05, + "loss": 2.5637, + "step": 11108 + }, + { + "epoch": 0.7456125633368008, + "grad_norm": 4.069817543029785, + "learning_rate": 7.221936237354557e-05, + "loss": 2.3955, + "step": 11110 + }, + { + "epoch": 0.7457467870205697, + "grad_norm": 4.1629319190979, + "learning_rate": 7.22096256416927e-05, + "loss": 2.3483, + "step": 11112 + }, + { + "epoch": 0.7458810107043388, + "grad_norm": 3.936802864074707, + "learning_rate": 7.219988786047866e-05, + "loss": 2.0831, + "step": 11114 + }, + { + "epoch": 0.7460152343881078, + "grad_norm": 4.886227607727051, + "learning_rate": 7.219014903036361e-05, + "loss": 2.4273, + "step": 11116 + }, + { + "epoch": 0.7461494580718768, + "grad_norm": 3.959627151489258, + "learning_rate": 7.218040915180764e-05, + "loss": 2.4124, + "step": 11118 + }, + { + "epoch": 0.7462836817556457, + "grad_norm": 5.122107982635498, + "learning_rate": 7.217066822527096e-05, + "loss": 2.3112, + "step": 11120 + }, + { + "epoch": 0.7464179054394148, + "grad_norm": 4.318151950836182, + "learning_rate": 7.216092625121379e-05, + "loss": 2.3432, + "step": 11122 + }, + { + "epoch": 0.7465521291231838, + "grad_norm": 4.502048015594482, + "learning_rate": 7.215118323009643e-05, + "loss": 2.2016, + "step": 11124 + }, + { + "epoch": 0.7466863528069528, + "grad_norm": 4.706890106201172, + "learning_rate": 7.214143916237925e-05, + "loss": 2.3835, + "step": 11126 + }, + { + "epoch": 0.7468205764907218, + "grad_norm": 5.1026105880737305, + "learning_rate": 7.213169404852258e-05, + "loss": 2.4878, + "step": 11128 + }, + { + "epoch": 0.7469548001744908, + "grad_norm": 3.8253159523010254, + "learning_rate": 7.21219478889869e-05, + "loss": 2.5126, + "step": 11130 + }, + { + "epoch": 0.7470890238582598, + "grad_norm": 4.234988212585449, + "learning_rate": 7.211220068423266e-05, + "loss": 2.4305, + "step": 11132 + }, + { + "epoch": 0.7472232475420288, + "grad_norm": 3.7908647060394287, + "learning_rate": 7.210245243472046e-05, + "loss": 2.2605, + "step": 11134 + }, + { + "epoch": 0.7473574712257978, + "grad_norm": 5.244839668273926, + "learning_rate": 7.209270314091081e-05, + "loss": 2.5089, + "step": 11136 + }, + { + "epoch": 0.7474916949095668, + "grad_norm": 4.3923563957214355, + "learning_rate": 7.208295280326439e-05, + "loss": 2.3379, + "step": 11138 + }, + { + "epoch": 0.7476259185933358, + "grad_norm": 4.177116394042969, + "learning_rate": 7.207320142224188e-05, + "loss": 2.5802, + "step": 11140 + }, + { + "epoch": 0.7477601422771047, + "grad_norm": 4.0173115730285645, + "learning_rate": 7.206344899830401e-05, + "loss": 2.4063, + "step": 11142 + }, + { + "epoch": 0.7478943659608738, + "grad_norm": 4.382330894470215, + "learning_rate": 7.205369553191156e-05, + "loss": 2.0669, + "step": 11144 + }, + { + "epoch": 0.7480285896446428, + "grad_norm": 4.262269973754883, + "learning_rate": 7.204394102352535e-05, + "loss": 2.5939, + "step": 11146 + }, + { + "epoch": 0.7481628133284118, + "grad_norm": 3.7809271812438965, + "learning_rate": 7.20341854736063e-05, + "loss": 2.2101, + "step": 11148 + }, + { + "epoch": 0.7482970370121808, + "grad_norm": 3.585874080657959, + "learning_rate": 7.20244288826153e-05, + "loss": 2.2994, + "step": 11150 + }, + { + "epoch": 0.7484312606959498, + "grad_norm": 4.25922155380249, + "learning_rate": 7.201467125101332e-05, + "loss": 2.3149, + "step": 11152 + }, + { + "epoch": 0.7485654843797188, + "grad_norm": 7.503423690795898, + "learning_rate": 7.200491257926145e-05, + "loss": 2.551, + "step": 11154 + }, + { + "epoch": 0.7486997080634878, + "grad_norm": 4.369760036468506, + "learning_rate": 7.19951528678207e-05, + "loss": 2.4491, + "step": 11156 + }, + { + "epoch": 0.7488339317472568, + "grad_norm": 4.345922946929932, + "learning_rate": 7.198539211715226e-05, + "loss": 2.5407, + "step": 11158 + }, + { + "epoch": 0.7489681554310258, + "grad_norm": 3.7949564456939697, + "learning_rate": 7.197563032771727e-05, + "loss": 2.3246, + "step": 11160 + }, + { + "epoch": 0.7491023791147948, + "grad_norm": 4.7387614250183105, + "learning_rate": 7.196586749997694e-05, + "loss": 2.7899, + "step": 11162 + }, + { + "epoch": 0.7492366027985639, + "grad_norm": 4.251669406890869, + "learning_rate": 7.195610363439259e-05, + "loss": 2.2983, + "step": 11164 + }, + { + "epoch": 0.7493708264823328, + "grad_norm": 5.807317733764648, + "learning_rate": 7.194633873142548e-05, + "loss": 2.5744, + "step": 11166 + }, + { + "epoch": 0.7495050501661018, + "grad_norm": 4.935104846954346, + "learning_rate": 7.193657279153706e-05, + "loss": 2.4975, + "step": 11168 + }, + { + "epoch": 0.7496392738498708, + "grad_norm": 4.213870048522949, + "learning_rate": 7.19268058151887e-05, + "loss": 2.3686, + "step": 11170 + }, + { + "epoch": 0.7497734975336399, + "grad_norm": 5.528947353363037, + "learning_rate": 7.191703780284187e-05, + "loss": 2.3971, + "step": 11172 + }, + { + "epoch": 0.7499077212174088, + "grad_norm": 3.873913288116455, + "learning_rate": 7.190726875495812e-05, + "loss": 2.5515, + "step": 11174 + }, + { + "epoch": 0.7500419449011778, + "grad_norm": 4.165197372436523, + "learning_rate": 7.189749867199899e-05, + "loss": 2.3357, + "step": 11176 + }, + { + "epoch": 0.7501761685849468, + "grad_norm": 4.195569038391113, + "learning_rate": 7.188772755442611e-05, + "loss": 2.3059, + "step": 11178 + }, + { + "epoch": 0.7503103922687158, + "grad_norm": 4.233708381652832, + "learning_rate": 7.187795540270114e-05, + "loss": 2.4536, + "step": 11180 + }, + { + "epoch": 0.7504446159524848, + "grad_norm": 4.363933086395264, + "learning_rate": 7.18681822172858e-05, + "loss": 2.3045, + "step": 11182 + }, + { + "epoch": 0.7505788396362538, + "grad_norm": 4.2642951011657715, + "learning_rate": 7.185840799864186e-05, + "loss": 2.7472, + "step": 11184 + }, + { + "epoch": 0.7507130633200229, + "grad_norm": 3.9238505363464355, + "learning_rate": 7.184863274723111e-05, + "loss": 2.5465, + "step": 11186 + }, + { + "epoch": 0.7508472870037918, + "grad_norm": 4.437404155731201, + "learning_rate": 7.183885646351542e-05, + "loss": 2.4563, + "step": 11188 + }, + { + "epoch": 0.7509815106875608, + "grad_norm": 3.8917899131774902, + "learning_rate": 7.182907914795672e-05, + "loss": 2.7818, + "step": 11190 + }, + { + "epoch": 0.7511157343713298, + "grad_norm": 3.8974177837371826, + "learning_rate": 7.181930080101696e-05, + "loss": 2.4209, + "step": 11192 + }, + { + "epoch": 0.7512499580550989, + "grad_norm": 4.532040596008301, + "learning_rate": 7.180952142315813e-05, + "loss": 2.4029, + "step": 11194 + }, + { + "epoch": 0.7513841817388678, + "grad_norm": 4.161258220672607, + "learning_rate": 7.179974101484232e-05, + "loss": 2.3101, + "step": 11196 + }, + { + "epoch": 0.7515184054226368, + "grad_norm": 4.389341354370117, + "learning_rate": 7.17899595765316e-05, + "loss": 2.019, + "step": 11198 + }, + { + "epoch": 0.7516526291064058, + "grad_norm": 4.369332790374756, + "learning_rate": 7.178017710868814e-05, + "loss": 2.2422, + "step": 11200 + }, + { + "epoch": 0.7517868527901749, + "grad_norm": 4.484252452850342, + "learning_rate": 7.177039361177413e-05, + "loss": 2.5128, + "step": 11202 + }, + { + "epoch": 0.7519210764739438, + "grad_norm": 3.9748198986053467, + "learning_rate": 7.176060908625184e-05, + "loss": 2.3103, + "step": 11204 + }, + { + "epoch": 0.7520553001577128, + "grad_norm": 4.388058185577393, + "learning_rate": 7.175082353258358e-05, + "loss": 2.8087, + "step": 11206 + }, + { + "epoch": 0.7521895238414819, + "grad_norm": 3.581876516342163, + "learning_rate": 7.174103695123166e-05, + "loss": 2.0318, + "step": 11208 + }, + { + "epoch": 0.7523237475252509, + "grad_norm": 3.841101884841919, + "learning_rate": 7.17312493426585e-05, + "loss": 2.1238, + "step": 11210 + }, + { + "epoch": 0.7524579712090198, + "grad_norm": 5.706211566925049, + "learning_rate": 7.172146070732652e-05, + "loss": 2.558, + "step": 11212 + }, + { + "epoch": 0.7525921948927888, + "grad_norm": 4.372034072875977, + "learning_rate": 7.171167104569826e-05, + "loss": 2.1673, + "step": 11214 + }, + { + "epoch": 0.7527264185765579, + "grad_norm": 4.441688537597656, + "learning_rate": 7.170188035823624e-05, + "loss": 2.4112, + "step": 11216 + }, + { + "epoch": 0.7528606422603268, + "grad_norm": 3.590373992919922, + "learning_rate": 7.169208864540303e-05, + "loss": 2.1518, + "step": 11218 + }, + { + "epoch": 0.7529948659440958, + "grad_norm": 3.4404401779174805, + "learning_rate": 7.16822959076613e-05, + "loss": 2.2603, + "step": 11220 + }, + { + "epoch": 0.7531290896278648, + "grad_norm": 4.942688941955566, + "learning_rate": 7.167250214547372e-05, + "loss": 2.4411, + "step": 11222 + }, + { + "epoch": 0.7532633133116339, + "grad_norm": 3.2718775272369385, + "learning_rate": 7.166270735930304e-05, + "loss": 2.1443, + "step": 11224 + }, + { + "epoch": 0.7533975369954028, + "grad_norm": 3.274251699447632, + "learning_rate": 7.165291154961202e-05, + "loss": 2.1561, + "step": 11226 + }, + { + "epoch": 0.7535317606791718, + "grad_norm": 4.556865692138672, + "learning_rate": 7.164311471686352e-05, + "loss": 2.5381, + "step": 11228 + }, + { + "epoch": 0.7536659843629409, + "grad_norm": 3.9423232078552246, + "learning_rate": 7.163331686152042e-05, + "loss": 2.6433, + "step": 11230 + }, + { + "epoch": 0.7538002080467099, + "grad_norm": 4.029872894287109, + "learning_rate": 7.16235179840456e-05, + "loss": 2.6149, + "step": 11232 + }, + { + "epoch": 0.7539344317304788, + "grad_norm": 4.319665431976318, + "learning_rate": 7.161371808490212e-05, + "loss": 2.3194, + "step": 11234 + }, + { + "epoch": 0.7540686554142478, + "grad_norm": 4.080962181091309, + "learning_rate": 7.160391716455292e-05, + "loss": 2.4187, + "step": 11236 + }, + { + "epoch": 0.7542028790980169, + "grad_norm": 4.30687952041626, + "learning_rate": 7.159411522346115e-05, + "loss": 2.2809, + "step": 11238 + }, + { + "epoch": 0.7543371027817859, + "grad_norm": 3.9152417182922363, + "learning_rate": 7.158431226208988e-05, + "loss": 2.3327, + "step": 11240 + }, + { + "epoch": 0.7544713264655548, + "grad_norm": 9.128992080688477, + "learning_rate": 7.157450828090231e-05, + "loss": 2.3412, + "step": 11242 + }, + { + "epoch": 0.7546055501493238, + "grad_norm": 5.4534010887146, + "learning_rate": 7.156470328036165e-05, + "loss": 2.4263, + "step": 11244 + }, + { + "epoch": 0.7547397738330929, + "grad_norm": 3.91579270362854, + "learning_rate": 7.155489726093114e-05, + "loss": 2.0808, + "step": 11246 + }, + { + "epoch": 0.7548739975168619, + "grad_norm": 4.122446537017822, + "learning_rate": 7.154509022307415e-05, + "loss": 2.6913, + "step": 11248 + }, + { + "epoch": 0.7550082212006308, + "grad_norm": 4.175485610961914, + "learning_rate": 7.1535282167254e-05, + "loss": 2.3805, + "step": 11250 + }, + { + "epoch": 0.7551424448843999, + "grad_norm": 4.073541641235352, + "learning_rate": 7.152547309393411e-05, + "loss": 2.5724, + "step": 11252 + }, + { + "epoch": 0.7552766685681689, + "grad_norm": 4.104366302490234, + "learning_rate": 7.151566300357796e-05, + "loss": 2.1686, + "step": 11254 + }, + { + "epoch": 0.7554108922519378, + "grad_norm": 4.431308269500732, + "learning_rate": 7.150585189664902e-05, + "loss": 2.3199, + "step": 11256 + }, + { + "epoch": 0.7555451159357068, + "grad_norm": 4.226772308349609, + "learning_rate": 7.14960397736109e-05, + "loss": 2.4893, + "step": 11258 + }, + { + "epoch": 0.7556793396194759, + "grad_norm": 4.147597312927246, + "learning_rate": 7.148622663492715e-05, + "loss": 2.2196, + "step": 11260 + }, + { + "epoch": 0.7558135633032449, + "grad_norm": 4.089413166046143, + "learning_rate": 7.147641248106142e-05, + "loss": 2.3646, + "step": 11262 + }, + { + "epoch": 0.7559477869870138, + "grad_norm": 4.161861419677734, + "learning_rate": 7.146659731247747e-05, + "loss": 2.5156, + "step": 11264 + }, + { + "epoch": 0.7560820106707828, + "grad_norm": 3.936002016067505, + "learning_rate": 7.1456781129639e-05, + "loss": 2.2801, + "step": 11266 + }, + { + "epoch": 0.7562162343545519, + "grad_norm": 3.904479742050171, + "learning_rate": 7.144696393300981e-05, + "loss": 2.1277, + "step": 11268 + }, + { + "epoch": 0.7563504580383209, + "grad_norm": 5.420644760131836, + "learning_rate": 7.143714572305374e-05, + "loss": 2.3204, + "step": 11270 + }, + { + "epoch": 0.7564846817220898, + "grad_norm": 4.554372787475586, + "learning_rate": 7.14273265002347e-05, + "loss": 2.6611, + "step": 11272 + }, + { + "epoch": 0.7566189054058589, + "grad_norm": 4.332543849945068, + "learning_rate": 7.141750626501661e-05, + "loss": 2.495, + "step": 11274 + }, + { + "epoch": 0.7567531290896279, + "grad_norm": 3.7059528827667236, + "learning_rate": 7.140768501786347e-05, + "loss": 2.3857, + "step": 11276 + }, + { + "epoch": 0.7568873527733969, + "grad_norm": 4.5286545753479, + "learning_rate": 7.13978627592393e-05, + "loss": 2.2532, + "step": 11278 + }, + { + "epoch": 0.7570215764571658, + "grad_norm": 3.7809102535247803, + "learning_rate": 7.13880394896082e-05, + "loss": 2.1552, + "step": 11280 + }, + { + "epoch": 0.7571558001409349, + "grad_norm": 3.9972620010375977, + "learning_rate": 7.13782152094343e-05, + "loss": 2.3241, + "step": 11282 + }, + { + "epoch": 0.7572900238247039, + "grad_norm": 4.098164081573486, + "learning_rate": 7.136838991918175e-05, + "loss": 2.8863, + "step": 11284 + }, + { + "epoch": 0.7574242475084729, + "grad_norm": 4.495838642120361, + "learning_rate": 7.135856361931482e-05, + "loss": 2.4604, + "step": 11286 + }, + { + "epoch": 0.7575584711922418, + "grad_norm": 4.044468879699707, + "learning_rate": 7.134873631029775e-05, + "loss": 2.3123, + "step": 11288 + }, + { + "epoch": 0.7576926948760109, + "grad_norm": 4.508501052856445, + "learning_rate": 7.133890799259486e-05, + "loss": 2.4652, + "step": 11290 + }, + { + "epoch": 0.7578269185597799, + "grad_norm": 4.1003737449646, + "learning_rate": 7.132907866667053e-05, + "loss": 2.2709, + "step": 11292 + }, + { + "epoch": 0.7579611422435488, + "grad_norm": 4.297711372375488, + "learning_rate": 7.131924833298918e-05, + "loss": 2.4198, + "step": 11294 + }, + { + "epoch": 0.7580953659273179, + "grad_norm": 4.100171089172363, + "learning_rate": 7.130941699201528e-05, + "loss": 2.4633, + "step": 11296 + }, + { + "epoch": 0.7582295896110869, + "grad_norm": 9.093619346618652, + "learning_rate": 7.129958464421331e-05, + "loss": 2.2747, + "step": 11298 + }, + { + "epoch": 0.7583638132948559, + "grad_norm": 4.300504207611084, + "learning_rate": 7.128975129004786e-05, + "loss": 2.3136, + "step": 11300 + }, + { + "epoch": 0.7584980369786248, + "grad_norm": 5.112210273742676, + "learning_rate": 7.127991692998353e-05, + "loss": 2.4761, + "step": 11302 + }, + { + "epoch": 0.7586322606623939, + "grad_norm": 4.667318820953369, + "learning_rate": 7.127008156448496e-05, + "loss": 2.8811, + "step": 11304 + }, + { + "epoch": 0.7587664843461629, + "grad_norm": 4.018482208251953, + "learning_rate": 7.126024519401687e-05, + "loss": 2.4153, + "step": 11306 + }, + { + "epoch": 0.7589007080299319, + "grad_norm": 4.252175807952881, + "learning_rate": 7.1250407819044e-05, + "loss": 2.4657, + "step": 11308 + }, + { + "epoch": 0.7590349317137008, + "grad_norm": 3.798412799835205, + "learning_rate": 7.124056944003114e-05, + "loss": 2.1205, + "step": 11310 + }, + { + "epoch": 0.7591691553974699, + "grad_norm": 4.108224391937256, + "learning_rate": 7.123073005744314e-05, + "loss": 2.2915, + "step": 11312 + }, + { + "epoch": 0.7593033790812389, + "grad_norm": 3.7943153381347656, + "learning_rate": 7.12208896717449e-05, + "loss": 2.336, + "step": 11314 + }, + { + "epoch": 0.7594376027650079, + "grad_norm": 3.9298410415649414, + "learning_rate": 7.121104828340136e-05, + "loss": 2.2497, + "step": 11316 + }, + { + "epoch": 0.7595718264487769, + "grad_norm": 3.9611244201660156, + "learning_rate": 7.120120589287749e-05, + "loss": 2.3291, + "step": 11318 + }, + { + "epoch": 0.7597060501325459, + "grad_norm": 3.842162609100342, + "learning_rate": 7.119136250063833e-05, + "loss": 2.1429, + "step": 11320 + }, + { + "epoch": 0.7598402738163149, + "grad_norm": 4.364062786102295, + "learning_rate": 7.118151810714896e-05, + "loss": 2.2752, + "step": 11322 + }, + { + "epoch": 0.759974497500084, + "grad_norm": 4.21122932434082, + "learning_rate": 7.117167271287453e-05, + "loss": 2.6932, + "step": 11324 + }, + { + "epoch": 0.7601087211838529, + "grad_norm": 4.056561470031738, + "learning_rate": 7.116182631828016e-05, + "loss": 2.5354, + "step": 11326 + }, + { + "epoch": 0.7602429448676219, + "grad_norm": 5.775080680847168, + "learning_rate": 7.115197892383114e-05, + "loss": 2.6319, + "step": 11328 + }, + { + "epoch": 0.7603771685513909, + "grad_norm": 4.0575408935546875, + "learning_rate": 7.11421305299927e-05, + "loss": 2.7827, + "step": 11330 + }, + { + "epoch": 0.7605113922351598, + "grad_norm": 5.925610065460205, + "learning_rate": 7.113228113723016e-05, + "loss": 2.4155, + "step": 11332 + }, + { + "epoch": 0.7606456159189289, + "grad_norm": 4.080082893371582, + "learning_rate": 7.11224307460089e-05, + "loss": 2.0217, + "step": 11334 + }, + { + "epoch": 0.7607798396026979, + "grad_norm": 3.577495574951172, + "learning_rate": 7.111257935679433e-05, + "loss": 2.0585, + "step": 11336 + }, + { + "epoch": 0.7609140632864669, + "grad_norm": 4.217135906219482, + "learning_rate": 7.110272697005189e-05, + "loss": 2.413, + "step": 11338 + }, + { + "epoch": 0.7610482869702359, + "grad_norm": 4.11265754699707, + "learning_rate": 7.10928735862471e-05, + "loss": 2.6288, + "step": 11340 + }, + { + "epoch": 0.7611825106540049, + "grad_norm": 4.358964920043945, + "learning_rate": 7.108301920584552e-05, + "loss": 2.3651, + "step": 11342 + }, + { + "epoch": 0.7613167343377739, + "grad_norm": 3.7061753273010254, + "learning_rate": 7.107316382931272e-05, + "loss": 2.5994, + "step": 11344 + }, + { + "epoch": 0.761450958021543, + "grad_norm": 4.161567211151123, + "learning_rate": 7.106330745711438e-05, + "loss": 2.5095, + "step": 11346 + }, + { + "epoch": 0.7615851817053119, + "grad_norm": 4.006433010101318, + "learning_rate": 7.105345008971619e-05, + "loss": 2.4561, + "step": 11348 + }, + { + "epoch": 0.7617194053890809, + "grad_norm": 4.928308963775635, + "learning_rate": 7.104359172758387e-05, + "loss": 2.6566, + "step": 11350 + }, + { + "epoch": 0.7618536290728499, + "grad_norm": 4.330458164215088, + "learning_rate": 7.103373237118321e-05, + "loss": 2.2823, + "step": 11352 + }, + { + "epoch": 0.761987852756619, + "grad_norm": 4.0525431632995605, + "learning_rate": 7.102387202098008e-05, + "loss": 2.2292, + "step": 11354 + }, + { + "epoch": 0.7621220764403879, + "grad_norm": 4.118330001831055, + "learning_rate": 7.101401067744033e-05, + "loss": 2.4861, + "step": 11356 + }, + { + "epoch": 0.7622563001241569, + "grad_norm": 3.4864912033081055, + "learning_rate": 7.10041483410299e-05, + "loss": 2.2176, + "step": 11358 + }, + { + "epoch": 0.7623905238079259, + "grad_norm": 3.616830587387085, + "learning_rate": 7.099428501221476e-05, + "loss": 2.4008, + "step": 11360 + }, + { + "epoch": 0.7625247474916949, + "grad_norm": 3.5534772872924805, + "learning_rate": 7.098442069146095e-05, + "loss": 2.1751, + "step": 11362 + }, + { + "epoch": 0.7626589711754639, + "grad_norm": 4.280544757843018, + "learning_rate": 7.09745553792345e-05, + "loss": 2.5798, + "step": 11364 + }, + { + "epoch": 0.7627931948592329, + "grad_norm": 4.433603286743164, + "learning_rate": 7.096468907600157e-05, + "loss": 2.469, + "step": 11366 + }, + { + "epoch": 0.762927418543002, + "grad_norm": 3.8000266551971436, + "learning_rate": 7.095482178222832e-05, + "loss": 2.5999, + "step": 11368 + }, + { + "epoch": 0.7630616422267709, + "grad_norm": 3.8679873943328857, + "learning_rate": 7.094495349838092e-05, + "loss": 2.2456, + "step": 11370 + }, + { + "epoch": 0.7631958659105399, + "grad_norm": 4.378355026245117, + "learning_rate": 7.093508422492568e-05, + "loss": 2.3315, + "step": 11372 + }, + { + "epoch": 0.7633300895943089, + "grad_norm": 4.040618896484375, + "learning_rate": 7.092521396232887e-05, + "loss": 2.0327, + "step": 11374 + }, + { + "epoch": 0.763464313278078, + "grad_norm": 4.7639079093933105, + "learning_rate": 7.091534271105682e-05, + "loss": 2.4405, + "step": 11376 + }, + { + "epoch": 0.7635985369618469, + "grad_norm": 12.425516128540039, + "learning_rate": 7.090547047157599e-05, + "loss": 2.0484, + "step": 11378 + }, + { + "epoch": 0.7637327606456159, + "grad_norm": 4.630842208862305, + "learning_rate": 7.089559724435277e-05, + "loss": 2.6327, + "step": 11380 + }, + { + "epoch": 0.7638669843293849, + "grad_norm": 4.203490734100342, + "learning_rate": 7.088572302985368e-05, + "loss": 2.4243, + "step": 11382 + }, + { + "epoch": 0.764001208013154, + "grad_norm": 4.418346405029297, + "learning_rate": 7.087584782854525e-05, + "loss": 2.4067, + "step": 11384 + }, + { + "epoch": 0.7641354316969229, + "grad_norm": 6.152133464813232, + "learning_rate": 7.086597164089404e-05, + "loss": 2.3286, + "step": 11386 + }, + { + "epoch": 0.7642696553806919, + "grad_norm": 4.210158348083496, + "learning_rate": 7.085609446736671e-05, + "loss": 2.5305, + "step": 11388 + }, + { + "epoch": 0.764403879064461, + "grad_norm": 7.721442222595215, + "learning_rate": 7.084621630842993e-05, + "loss": 2.521, + "step": 11390 + }, + { + "epoch": 0.76453810274823, + "grad_norm": 4.331101894378662, + "learning_rate": 7.083633716455043e-05, + "loss": 2.0844, + "step": 11392 + }, + { + "epoch": 0.7646723264319989, + "grad_norm": 4.861443996429443, + "learning_rate": 7.082645703619496e-05, + "loss": 2.5607, + "step": 11394 + }, + { + "epoch": 0.7648065501157679, + "grad_norm": 4.3593268394470215, + "learning_rate": 7.081657592383035e-05, + "loss": 2.2934, + "step": 11396 + }, + { + "epoch": 0.764940773799537, + "grad_norm": 4.047499179840088, + "learning_rate": 7.080669382792346e-05, + "loss": 2.2455, + "step": 11398 + }, + { + "epoch": 0.7650749974833059, + "grad_norm": 3.7151589393615723, + "learning_rate": 7.079681074894123e-05, + "loss": 2.3628, + "step": 11400 + }, + { + "epoch": 0.7652092211670749, + "grad_norm": 4.752010345458984, + "learning_rate": 7.078692668735054e-05, + "loss": 2.3357, + "step": 11402 + }, + { + "epoch": 0.7653434448508439, + "grad_norm": 4.275126934051514, + "learning_rate": 7.077704164361848e-05, + "loss": 2.4005, + "step": 11404 + }, + { + "epoch": 0.765477668534613, + "grad_norm": 4.897658824920654, + "learning_rate": 7.076715561821204e-05, + "loss": 2.3771, + "step": 11406 + }, + { + "epoch": 0.7656118922183819, + "grad_norm": 3.824275016784668, + "learning_rate": 7.075726861159832e-05, + "loss": 2.3163, + "step": 11408 + }, + { + "epoch": 0.7657461159021509, + "grad_norm": 4.442480564117432, + "learning_rate": 7.07473806242445e-05, + "loss": 2.2043, + "step": 11410 + }, + { + "epoch": 0.76588033958592, + "grad_norm": 4.133688926696777, + "learning_rate": 7.073749165661773e-05, + "loss": 2.4009, + "step": 11412 + }, + { + "epoch": 0.766014563269689, + "grad_norm": 4.073807239532471, + "learning_rate": 7.072760170918526e-05, + "loss": 2.556, + "step": 11414 + }, + { + "epoch": 0.7661487869534579, + "grad_norm": 3.937955379486084, + "learning_rate": 7.071771078241438e-05, + "loss": 2.237, + "step": 11416 + }, + { + "epoch": 0.7662830106372269, + "grad_norm": 4.574248790740967, + "learning_rate": 7.070781887677239e-05, + "loss": 2.3585, + "step": 11418 + }, + { + "epoch": 0.766417234320996, + "grad_norm": 4.322278022766113, + "learning_rate": 7.069792599272669e-05, + "loss": 2.4696, + "step": 11420 + }, + { + "epoch": 0.766551458004765, + "grad_norm": 5.871338844299316, + "learning_rate": 7.068803213074468e-05, + "loss": 2.4904, + "step": 11422 + }, + { + "epoch": 0.7666856816885339, + "grad_norm": 5.657440185546875, + "learning_rate": 7.067813729129384e-05, + "loss": 2.381, + "step": 11424 + }, + { + "epoch": 0.7668199053723029, + "grad_norm": 3.992628574371338, + "learning_rate": 7.066824147484165e-05, + "loss": 2.2861, + "step": 11426 + }, + { + "epoch": 0.766954129056072, + "grad_norm": 4.2963385581970215, + "learning_rate": 7.065834468185573e-05, + "loss": 2.4364, + "step": 11428 + }, + { + "epoch": 0.767088352739841, + "grad_norm": 3.668748617172241, + "learning_rate": 7.064844691280362e-05, + "loss": 2.4193, + "step": 11430 + }, + { + "epoch": 0.7672225764236099, + "grad_norm": 3.5948545932769775, + "learning_rate": 7.063854816815301e-05, + "loss": 2.2, + "step": 11432 + }, + { + "epoch": 0.767356800107379, + "grad_norm": 4.092372894287109, + "learning_rate": 7.062864844837159e-05, + "loss": 2.5144, + "step": 11434 + }, + { + "epoch": 0.767491023791148, + "grad_norm": 3.9353220462799072, + "learning_rate": 7.061874775392709e-05, + "loss": 2.3438, + "step": 11436 + }, + { + "epoch": 0.7676252474749169, + "grad_norm": 4.584843158721924, + "learning_rate": 7.060884608528728e-05, + "loss": 2.7813, + "step": 11438 + }, + { + "epoch": 0.7677594711586859, + "grad_norm": 4.553761959075928, + "learning_rate": 7.059894344292004e-05, + "loss": 2.3227, + "step": 11440 + }, + { + "epoch": 0.767893694842455, + "grad_norm": 4.157339096069336, + "learning_rate": 7.058903982729322e-05, + "loss": 2.5099, + "step": 11442 + }, + { + "epoch": 0.768027918526224, + "grad_norm": 4.407057285308838, + "learning_rate": 7.057913523887478e-05, + "loss": 2.3595, + "step": 11444 + }, + { + "epoch": 0.7681621422099929, + "grad_norm": 5.548940658569336, + "learning_rate": 7.056922967813263e-05, + "loss": 2.4501, + "step": 11446 + }, + { + "epoch": 0.7682963658937619, + "grad_norm": 4.603552341461182, + "learning_rate": 7.055932314553485e-05, + "loss": 2.5621, + "step": 11448 + }, + { + "epoch": 0.768430589577531, + "grad_norm": 4.348165988922119, + "learning_rate": 7.054941564154946e-05, + "loss": 2.3751, + "step": 11450 + }, + { + "epoch": 0.7685648132613, + "grad_norm": 4.173041820526123, + "learning_rate": 7.05395071666446e-05, + "loss": 2.4164, + "step": 11452 + }, + { + "epoch": 0.7686990369450689, + "grad_norm": 4.223103046417236, + "learning_rate": 7.052959772128841e-05, + "loss": 2.447, + "step": 11454 + }, + { + "epoch": 0.768833260628838, + "grad_norm": 3.7018229961395264, + "learning_rate": 7.051968730594911e-05, + "loss": 2.3216, + "step": 11456 + }, + { + "epoch": 0.768967484312607, + "grad_norm": 4.249338150024414, + "learning_rate": 7.050977592109494e-05, + "loss": 2.3342, + "step": 11458 + }, + { + "epoch": 0.769101707996376, + "grad_norm": 4.3667144775390625, + "learning_rate": 7.049986356719417e-05, + "loss": 2.4545, + "step": 11460 + }, + { + "epoch": 0.7692359316801449, + "grad_norm": 4.275052547454834, + "learning_rate": 7.048995024471517e-05, + "loss": 2.5679, + "step": 11462 + }, + { + "epoch": 0.769370155363914, + "grad_norm": 4.919400691986084, + "learning_rate": 7.048003595412632e-05, + "loss": 2.5159, + "step": 11464 + }, + { + "epoch": 0.769504379047683, + "grad_norm": 3.8241994380950928, + "learning_rate": 7.047012069589601e-05, + "loss": 2.3341, + "step": 11466 + }, + { + "epoch": 0.769638602731452, + "grad_norm": 4.200124740600586, + "learning_rate": 7.046020447049277e-05, + "loss": 2.5125, + "step": 11468 + }, + { + "epoch": 0.7697728264152209, + "grad_norm": 3.8301353454589844, + "learning_rate": 7.045028727838511e-05, + "loss": 2.3869, + "step": 11470 + }, + { + "epoch": 0.76990705009899, + "grad_norm": 4.540897369384766, + "learning_rate": 7.044036912004159e-05, + "loss": 2.7281, + "step": 11472 + }, + { + "epoch": 0.770041273782759, + "grad_norm": 4.516687393188477, + "learning_rate": 7.04304499959308e-05, + "loss": 2.2524, + "step": 11474 + }, + { + "epoch": 0.7701754974665279, + "grad_norm": 4.597642421722412, + "learning_rate": 7.042052990652146e-05, + "loss": 2.4117, + "step": 11476 + }, + { + "epoch": 0.7703097211502969, + "grad_norm": 4.4047064781188965, + "learning_rate": 7.041060885228222e-05, + "loss": 2.528, + "step": 11478 + }, + { + "epoch": 0.770443944834066, + "grad_norm": 3.611037254333496, + "learning_rate": 7.040068683368181e-05, + "loss": 2.2441, + "step": 11480 + }, + { + "epoch": 0.770578168517835, + "grad_norm": 4.4341559410095215, + "learning_rate": 7.039076385118911e-05, + "loss": 2.1938, + "step": 11482 + }, + { + "epoch": 0.7707123922016039, + "grad_norm": 3.943108558654785, + "learning_rate": 7.03808399052729e-05, + "loss": 2.4068, + "step": 11484 + }, + { + "epoch": 0.770846615885373, + "grad_norm": 4.038671016693115, + "learning_rate": 7.03709149964021e-05, + "loss": 2.0114, + "step": 11486 + }, + { + "epoch": 0.770980839569142, + "grad_norm": 4.739253520965576, + "learning_rate": 7.036098912504559e-05, + "loss": 2.3807, + "step": 11488 + }, + { + "epoch": 0.771115063252911, + "grad_norm": 4.359121799468994, + "learning_rate": 7.035106229167241e-05, + "loss": 2.4745, + "step": 11490 + }, + { + "epoch": 0.7712492869366799, + "grad_norm": 4.096681594848633, + "learning_rate": 7.034113449675154e-05, + "loss": 2.3572, + "step": 11492 + }, + { + "epoch": 0.771383510620449, + "grad_norm": 4.139388084411621, + "learning_rate": 7.033120574075206e-05, + "loss": 2.5654, + "step": 11494 + }, + { + "epoch": 0.771517734304218, + "grad_norm": 5.21331787109375, + "learning_rate": 7.032127602414311e-05, + "loss": 2.3886, + "step": 11496 + }, + { + "epoch": 0.771651957987987, + "grad_norm": 4.290503025054932, + "learning_rate": 7.031134534739381e-05, + "loss": 2.6905, + "step": 11498 + }, + { + "epoch": 0.7717861816717559, + "grad_norm": 3.941718816757202, + "learning_rate": 7.030141371097339e-05, + "loss": 2.2912, + "step": 11500 + }, + { + "epoch": 0.771920405355525, + "grad_norm": 4.082904815673828, + "learning_rate": 7.029148111535109e-05, + "loss": 2.3593, + "step": 11502 + }, + { + "epoch": 0.772054629039294, + "grad_norm": 4.21201753616333, + "learning_rate": 7.02815475609962e-05, + "loss": 2.3883, + "step": 11504 + }, + { + "epoch": 0.772188852723063, + "grad_norm": 3.8474233150482178, + "learning_rate": 7.02716130483781e-05, + "loss": 2.6895, + "step": 11506 + }, + { + "epoch": 0.772323076406832, + "grad_norm": 3.849520444869995, + "learning_rate": 7.026167757796612e-05, + "loss": 2.3069, + "step": 11508 + }, + { + "epoch": 0.772457300090601, + "grad_norm": 4.205483436584473, + "learning_rate": 7.025174115022972e-05, + "loss": 2.3535, + "step": 11510 + }, + { + "epoch": 0.77259152377437, + "grad_norm": 4.057200908660889, + "learning_rate": 7.024180376563838e-05, + "loss": 2.2243, + "step": 11512 + }, + { + "epoch": 0.7727257474581389, + "grad_norm": 4.535856246948242, + "learning_rate": 7.02318654246616e-05, + "loss": 2.3728, + "step": 11514 + }, + { + "epoch": 0.772859971141908, + "grad_norm": 3.383394241333008, + "learning_rate": 7.022192612776899e-05, + "loss": 2.5767, + "step": 11516 + }, + { + "epoch": 0.772994194825677, + "grad_norm": 3.99257493019104, + "learning_rate": 7.021198587543012e-05, + "loss": 2.2402, + "step": 11518 + }, + { + "epoch": 0.773128418509446, + "grad_norm": 4.46248722076416, + "learning_rate": 7.020204466811467e-05, + "loss": 2.7501, + "step": 11520 + }, + { + "epoch": 0.7732626421932149, + "grad_norm": 3.844619035720825, + "learning_rate": 7.019210250629231e-05, + "loss": 2.2347, + "step": 11522 + }, + { + "epoch": 0.773396865876984, + "grad_norm": 3.690890073776245, + "learning_rate": 7.018215939043285e-05, + "loss": 2.5834, + "step": 11524 + }, + { + "epoch": 0.773531089560753, + "grad_norm": 4.4122090339660645, + "learning_rate": 7.017221532100601e-05, + "loss": 2.2467, + "step": 11526 + }, + { + "epoch": 0.773665313244522, + "grad_norm": 4.2802205085754395, + "learning_rate": 7.016227029848169e-05, + "loss": 2.3185, + "step": 11528 + }, + { + "epoch": 0.773799536928291, + "grad_norm": 4.0430216789245605, + "learning_rate": 7.015232432332974e-05, + "loss": 2.2797, + "step": 11530 + }, + { + "epoch": 0.77393376061206, + "grad_norm": 4.7422261238098145, + "learning_rate": 7.014237739602008e-05, + "loss": 2.1578, + "step": 11532 + }, + { + "epoch": 0.774067984295829, + "grad_norm": 4.300187110900879, + "learning_rate": 7.01324295170227e-05, + "loss": 2.3355, + "step": 11534 + }, + { + "epoch": 0.774202207979598, + "grad_norm": 3.6634624004364014, + "learning_rate": 7.012248068680762e-05, + "loss": 2.3008, + "step": 11536 + }, + { + "epoch": 0.774336431663367, + "grad_norm": 4.25755500793457, + "learning_rate": 7.01125309058449e-05, + "loss": 2.4008, + "step": 11538 + }, + { + "epoch": 0.774470655347136, + "grad_norm": 4.340115547180176, + "learning_rate": 7.010258017460463e-05, + "loss": 2.3379, + "step": 11540 + }, + { + "epoch": 0.774604879030905, + "grad_norm": 4.023719787597656, + "learning_rate": 7.0092628493557e-05, + "loss": 2.5981, + "step": 11542 + }, + { + "epoch": 0.774739102714674, + "grad_norm": 4.092066764831543, + "learning_rate": 7.008267586317216e-05, + "loss": 2.4207, + "step": 11544 + }, + { + "epoch": 0.774873326398443, + "grad_norm": 4.019627094268799, + "learning_rate": 7.007272228392039e-05, + "loss": 2.5359, + "step": 11546 + }, + { + "epoch": 0.775007550082212, + "grad_norm": 4.110751152038574, + "learning_rate": 7.006276775627196e-05, + "loss": 2.4719, + "step": 11548 + }, + { + "epoch": 0.775141773765981, + "grad_norm": 3.713867664337158, + "learning_rate": 7.005281228069721e-05, + "loss": 2.3306, + "step": 11550 + }, + { + "epoch": 0.77527599744975, + "grad_norm": 4.178574562072754, + "learning_rate": 7.004285585766651e-05, + "loss": 2.4504, + "step": 11552 + }, + { + "epoch": 0.775410221133519, + "grad_norm": 4.442382335662842, + "learning_rate": 7.003289848765028e-05, + "loss": 2.3454, + "step": 11554 + }, + { + "epoch": 0.775544444817288, + "grad_norm": 4.147940158843994, + "learning_rate": 7.002294017111899e-05, + "loss": 2.2849, + "step": 11556 + }, + { + "epoch": 0.775678668501057, + "grad_norm": 4.430386066436768, + "learning_rate": 7.001298090854316e-05, + "loss": 2.4446, + "step": 11558 + }, + { + "epoch": 0.775812892184826, + "grad_norm": 4.518462657928467, + "learning_rate": 7.000302070039332e-05, + "loss": 2.3029, + "step": 11560 + }, + { + "epoch": 0.775947115868595, + "grad_norm": 5.51124382019043, + "learning_rate": 6.999305954714009e-05, + "loss": 2.5066, + "step": 11562 + }, + { + "epoch": 0.776081339552364, + "grad_norm": 4.7067646980285645, + "learning_rate": 6.998309744925411e-05, + "loss": 2.5777, + "step": 11564 + }, + { + "epoch": 0.776215563236133, + "grad_norm": 4.3481855392456055, + "learning_rate": 6.997313440720608e-05, + "loss": 2.6018, + "step": 11566 + }, + { + "epoch": 0.776349786919902, + "grad_norm": 3.910787343978882, + "learning_rate": 6.996317042146671e-05, + "loss": 2.2105, + "step": 11568 + }, + { + "epoch": 0.776484010603671, + "grad_norm": 4.258603096008301, + "learning_rate": 6.995320549250681e-05, + "loss": 2.3539, + "step": 11570 + }, + { + "epoch": 0.77661823428744, + "grad_norm": 4.032443523406982, + "learning_rate": 6.99432396207972e-05, + "loss": 2.1471, + "step": 11572 + }, + { + "epoch": 0.7767524579712091, + "grad_norm": 4.176078796386719, + "learning_rate": 6.99332728068087e-05, + "loss": 2.1491, + "step": 11574 + }, + { + "epoch": 0.776886681654978, + "grad_norm": 4.071287631988525, + "learning_rate": 6.992330505101228e-05, + "loss": 2.2145, + "step": 11576 + }, + { + "epoch": 0.777020905338747, + "grad_norm": 3.9985711574554443, + "learning_rate": 6.991333635387886e-05, + "loss": 2.4206, + "step": 11578 + }, + { + "epoch": 0.777155129022516, + "grad_norm": 3.9831907749176025, + "learning_rate": 6.990336671587946e-05, + "loss": 2.5373, + "step": 11580 + }, + { + "epoch": 0.7772893527062851, + "grad_norm": 4.035172939300537, + "learning_rate": 6.989339613748512e-05, + "loss": 2.3723, + "step": 11582 + }, + { + "epoch": 0.777423576390054, + "grad_norm": 3.843107223510742, + "learning_rate": 6.988342461916693e-05, + "loss": 2.3922, + "step": 11584 + }, + { + "epoch": 0.777557800073823, + "grad_norm": 4.41557502746582, + "learning_rate": 6.987345216139604e-05, + "loss": 2.3215, + "step": 11586 + }, + { + "epoch": 0.777692023757592, + "grad_norm": 4.031645774841309, + "learning_rate": 6.98634787646436e-05, + "loss": 2.4639, + "step": 11588 + }, + { + "epoch": 0.777826247441361, + "grad_norm": 3.741028308868408, + "learning_rate": 6.985350442938084e-05, + "loss": 2.0545, + "step": 11590 + }, + { + "epoch": 0.77796047112513, + "grad_norm": 3.874918222427368, + "learning_rate": 6.984352915607906e-05, + "loss": 2.3176, + "step": 11592 + }, + { + "epoch": 0.778094694808899, + "grad_norm": 3.5494930744171143, + "learning_rate": 6.983355294520952e-05, + "loss": 2.1289, + "step": 11594 + }, + { + "epoch": 0.7782289184926681, + "grad_norm": 3.910691022872925, + "learning_rate": 6.982357579724364e-05, + "loss": 2.346, + "step": 11596 + }, + { + "epoch": 0.778363142176437, + "grad_norm": 3.4379429817199707, + "learning_rate": 6.981359771265276e-05, + "loss": 2.1714, + "step": 11598 + }, + { + "epoch": 0.778497365860206, + "grad_norm": 4.418918609619141, + "learning_rate": 6.980361869190836e-05, + "loss": 2.5762, + "step": 11600 + }, + { + "epoch": 0.778631589543975, + "grad_norm": 3.8664233684539795, + "learning_rate": 6.97936387354819e-05, + "loss": 2.1421, + "step": 11602 + }, + { + "epoch": 0.7787658132277441, + "grad_norm": 3.5825388431549072, + "learning_rate": 6.978365784384494e-05, + "loss": 2.1876, + "step": 11604 + }, + { + "epoch": 0.778900036911513, + "grad_norm": 4.422878742218018, + "learning_rate": 6.977367601746907e-05, + "loss": 2.6012, + "step": 11606 + }, + { + "epoch": 0.779034260595282, + "grad_norm": 3.889528274536133, + "learning_rate": 6.976369325682586e-05, + "loss": 2.2947, + "step": 11608 + }, + { + "epoch": 0.779168484279051, + "grad_norm": 6.0385260581970215, + "learning_rate": 6.975370956238703e-05, + "loss": 2.237, + "step": 11610 + }, + { + "epoch": 0.7793027079628201, + "grad_norm": 4.52658748626709, + "learning_rate": 6.974372493462427e-05, + "loss": 2.4917, + "step": 11612 + }, + { + "epoch": 0.779436931646589, + "grad_norm": 4.705216407775879, + "learning_rate": 6.973373937400932e-05, + "loss": 2.3862, + "step": 11614 + }, + { + "epoch": 0.779571155330358, + "grad_norm": 4.305156230926514, + "learning_rate": 6.9723752881014e-05, + "loss": 2.3936, + "step": 11616 + }, + { + "epoch": 0.7797053790141271, + "grad_norm": 4.189755916595459, + "learning_rate": 6.971376545611012e-05, + "loss": 2.4117, + "step": 11618 + }, + { + "epoch": 0.7798396026978961, + "grad_norm": 11.95207691192627, + "learning_rate": 6.97037770997696e-05, + "loss": 2.2495, + "step": 11620 + }, + { + "epoch": 0.779973826381665, + "grad_norm": 3.6579291820526123, + "learning_rate": 6.969378781246436e-05, + "loss": 2.341, + "step": 11622 + }, + { + "epoch": 0.780108050065434, + "grad_norm": 4.072883129119873, + "learning_rate": 6.968379759466638e-05, + "loss": 2.7239, + "step": 11624 + }, + { + "epoch": 0.7802422737492031, + "grad_norm": 4.044282913208008, + "learning_rate": 6.967380644684765e-05, + "loss": 2.3953, + "step": 11626 + }, + { + "epoch": 0.780376497432972, + "grad_norm": 5.477477073669434, + "learning_rate": 6.966381436948027e-05, + "loss": 2.7891, + "step": 11628 + }, + { + "epoch": 0.780510721116741, + "grad_norm": 4.130548000335693, + "learning_rate": 6.965382136303632e-05, + "loss": 2.652, + "step": 11630 + }, + { + "epoch": 0.78064494480051, + "grad_norm": 4.0945353507995605, + "learning_rate": 6.964382742798797e-05, + "loss": 2.3253, + "step": 11632 + }, + { + "epoch": 0.7807791684842791, + "grad_norm": 4.452800273895264, + "learning_rate": 6.963383256480738e-05, + "loss": 2.1801, + "step": 11634 + }, + { + "epoch": 0.780913392168048, + "grad_norm": 4.523156642913818, + "learning_rate": 6.962383677396682e-05, + "loss": 2.5392, + "step": 11636 + }, + { + "epoch": 0.781047615851817, + "grad_norm": 4.522687911987305, + "learning_rate": 6.961384005593856e-05, + "loss": 2.6318, + "step": 11638 + }, + { + "epoch": 0.7811818395355861, + "grad_norm": 4.909079551696777, + "learning_rate": 6.960384241119494e-05, + "loss": 2.2286, + "step": 11640 + }, + { + "epoch": 0.7813160632193551, + "grad_norm": 3.94197940826416, + "learning_rate": 6.95938438402083e-05, + "loss": 2.2472, + "step": 11642 + }, + { + "epoch": 0.781450286903124, + "grad_norm": 4.146164894104004, + "learning_rate": 6.958384434345107e-05, + "loss": 2.4853, + "step": 11644 + }, + { + "epoch": 0.781584510586893, + "grad_norm": 4.126928806304932, + "learning_rate": 6.95738439213957e-05, + "loss": 2.5334, + "step": 11646 + }, + { + "epoch": 0.7817187342706621, + "grad_norm": 3.8468308448791504, + "learning_rate": 6.956384257451471e-05, + "loss": 2.2385, + "step": 11648 + }, + { + "epoch": 0.7818529579544311, + "grad_norm": 4.691030502319336, + "learning_rate": 6.955384030328063e-05, + "loss": 2.4898, + "step": 11650 + }, + { + "epoch": 0.7819871816382, + "grad_norm": 4.135719299316406, + "learning_rate": 6.954383710816604e-05, + "loss": 2.2743, + "step": 11652 + }, + { + "epoch": 0.782121405321969, + "grad_norm": 4.869410991668701, + "learning_rate": 6.953383298964357e-05, + "loss": 2.4817, + "step": 11654 + }, + { + "epoch": 0.7822556290057381, + "grad_norm": 4.050382614135742, + "learning_rate": 6.95238279481859e-05, + "loss": 2.3205, + "step": 11656 + }, + { + "epoch": 0.7823898526895071, + "grad_norm": 3.8741040229797363, + "learning_rate": 6.951382198426577e-05, + "loss": 2.4796, + "step": 11658 + }, + { + "epoch": 0.782524076373276, + "grad_norm": 3.742560625076294, + "learning_rate": 6.95038150983559e-05, + "loss": 2.0026, + "step": 11660 + }, + { + "epoch": 0.7826583000570451, + "grad_norm": 8.755746841430664, + "learning_rate": 6.949380729092914e-05, + "loss": 2.3849, + "step": 11662 + }, + { + "epoch": 0.7827925237408141, + "grad_norm": 4.821051120758057, + "learning_rate": 6.948379856245832e-05, + "loss": 2.1645, + "step": 11664 + }, + { + "epoch": 0.782926747424583, + "grad_norm": 4.7184906005859375, + "learning_rate": 6.947378891341631e-05, + "loss": 2.117, + "step": 11666 + }, + { + "epoch": 0.783060971108352, + "grad_norm": 3.589005708694458, + "learning_rate": 6.946377834427608e-05, + "loss": 2.1912, + "step": 11668 + }, + { + "epoch": 0.7831951947921211, + "grad_norm": 4.387725830078125, + "learning_rate": 6.945376685551061e-05, + "loss": 2.4374, + "step": 11670 + }, + { + "epoch": 0.7833294184758901, + "grad_norm": 4.167393684387207, + "learning_rate": 6.94437544475929e-05, + "loss": 2.4832, + "step": 11672 + }, + { + "epoch": 0.783463642159659, + "grad_norm": 5.531542778015137, + "learning_rate": 6.9433741120996e-05, + "loss": 2.2661, + "step": 11674 + }, + { + "epoch": 0.783597865843428, + "grad_norm": 4.332960605621338, + "learning_rate": 6.94237268761931e-05, + "loss": 2.0781, + "step": 11676 + }, + { + "epoch": 0.7837320895271971, + "grad_norm": 4.665246963500977, + "learning_rate": 6.941371171365725e-05, + "loss": 2.3761, + "step": 11678 + }, + { + "epoch": 0.7838663132109661, + "grad_norm": 4.249281406402588, + "learning_rate": 6.940369563386172e-05, + "loss": 2.3528, + "step": 11680 + }, + { + "epoch": 0.784000536894735, + "grad_norm": 4.420426845550537, + "learning_rate": 6.939367863727973e-05, + "loss": 2.3651, + "step": 11682 + }, + { + "epoch": 0.7841347605785041, + "grad_norm": 4.291654109954834, + "learning_rate": 6.938366072438456e-05, + "loss": 2.4635, + "step": 11684 + }, + { + "epoch": 0.7842689842622731, + "grad_norm": 4.709523677825928, + "learning_rate": 6.937364189564954e-05, + "loss": 2.5032, + "step": 11686 + }, + { + "epoch": 0.7844032079460421, + "grad_norm": 4.51956033706665, + "learning_rate": 6.936362215154802e-05, + "loss": 2.4919, + "step": 11688 + }, + { + "epoch": 0.784537431629811, + "grad_norm": 6.963902950286865, + "learning_rate": 6.935360149255345e-05, + "loss": 2.3575, + "step": 11690 + }, + { + "epoch": 0.7846716553135801, + "grad_norm": 3.844845771789551, + "learning_rate": 6.934357991913924e-05, + "loss": 2.5086, + "step": 11692 + }, + { + "epoch": 0.7848058789973491, + "grad_norm": 6.134592533111572, + "learning_rate": 6.933355743177894e-05, + "loss": 2.3952, + "step": 11694 + }, + { + "epoch": 0.7849401026811181, + "grad_norm": 3.9616620540618896, + "learning_rate": 6.932353403094605e-05, + "loss": 2.4393, + "step": 11696 + }, + { + "epoch": 0.785074326364887, + "grad_norm": 4.282020092010498, + "learning_rate": 6.931350971711418e-05, + "loss": 2.2089, + "step": 11698 + }, + { + "epoch": 0.7852085500486561, + "grad_norm": 3.778949499130249, + "learning_rate": 6.930348449075699e-05, + "loss": 2.4859, + "step": 11700 + }, + { + "epoch": 0.7853427737324251, + "grad_norm": 4.037566661834717, + "learning_rate": 6.929345835234808e-05, + "loss": 2.6026, + "step": 11702 + }, + { + "epoch": 0.785476997416194, + "grad_norm": 3.910252809524536, + "learning_rate": 6.928343130236121e-05, + "loss": 2.4148, + "step": 11704 + }, + { + "epoch": 0.7856112210999631, + "grad_norm": 3.845367431640625, + "learning_rate": 6.927340334127013e-05, + "loss": 2.4208, + "step": 11706 + }, + { + "epoch": 0.7857454447837321, + "grad_norm": 4.048295021057129, + "learning_rate": 6.926337446954864e-05, + "loss": 2.3414, + "step": 11708 + }, + { + "epoch": 0.7858796684675011, + "grad_norm": 4.449352264404297, + "learning_rate": 6.92533446876706e-05, + "loss": 2.5768, + "step": 11710 + }, + { + "epoch": 0.78601389215127, + "grad_norm": 4.327541828155518, + "learning_rate": 6.924331399610986e-05, + "loss": 2.5276, + "step": 11712 + }, + { + "epoch": 0.7861481158350391, + "grad_norm": 4.533745765686035, + "learning_rate": 6.92332823953404e-05, + "loss": 2.3138, + "step": 11714 + }, + { + "epoch": 0.7862823395188081, + "grad_norm": 3.51767635345459, + "learning_rate": 6.922324988583616e-05, + "loss": 2.0114, + "step": 11716 + }, + { + "epoch": 0.7864165632025771, + "grad_norm": 4.33095121383667, + "learning_rate": 6.921321646807113e-05, + "loss": 2.3545, + "step": 11718 + }, + { + "epoch": 0.786550786886346, + "grad_norm": 4.167200565338135, + "learning_rate": 6.920318214251945e-05, + "loss": 2.2865, + "step": 11720 + }, + { + "epoch": 0.7866850105701151, + "grad_norm": 3.974395751953125, + "learning_rate": 6.919314690965514e-05, + "loss": 2.0917, + "step": 11722 + }, + { + "epoch": 0.7868192342538841, + "grad_norm": 4.0500922203063965, + "learning_rate": 6.91831107699524e-05, + "loss": 2.1107, + "step": 11724 + }, + { + "epoch": 0.7869534579376531, + "grad_norm": 4.344975471496582, + "learning_rate": 6.917307372388539e-05, + "loss": 2.282, + "step": 11726 + }, + { + "epoch": 0.7870876816214221, + "grad_norm": 3.2461459636688232, + "learning_rate": 6.916303577192835e-05, + "loss": 2.1755, + "step": 11728 + }, + { + "epoch": 0.7872219053051911, + "grad_norm": 3.713268995285034, + "learning_rate": 6.915299691455555e-05, + "loss": 2.5807, + "step": 11730 + }, + { + "epoch": 0.7873561289889601, + "grad_norm": 3.5151681900024414, + "learning_rate": 6.914295715224132e-05, + "loss": 2.322, + "step": 11732 + }, + { + "epoch": 0.7874903526727292, + "grad_norm": 4.5565080642700195, + "learning_rate": 6.913291648546001e-05, + "loss": 2.427, + "step": 11734 + }, + { + "epoch": 0.7876245763564981, + "grad_norm": 3.833242177963257, + "learning_rate": 6.9122874914686e-05, + "loss": 2.3863, + "step": 11736 + }, + { + "epoch": 0.7877588000402671, + "grad_norm": 4.0461201667785645, + "learning_rate": 6.911283244039377e-05, + "loss": 2.3058, + "step": 11738 + }, + { + "epoch": 0.7878930237240361, + "grad_norm": 4.19537353515625, + "learning_rate": 6.910278906305778e-05, + "loss": 2.2784, + "step": 11740 + }, + { + "epoch": 0.788027247407805, + "grad_norm": 4.800963401794434, + "learning_rate": 6.909274478315257e-05, + "loss": 2.2815, + "step": 11742 + }, + { + "epoch": 0.7881614710915741, + "grad_norm": 3.820190668106079, + "learning_rate": 6.908269960115273e-05, + "loss": 2.428, + "step": 11744 + }, + { + "epoch": 0.7882956947753431, + "grad_norm": 6.782271385192871, + "learning_rate": 6.907265351753283e-05, + "loss": 2.3689, + "step": 11746 + }, + { + "epoch": 0.7884299184591121, + "grad_norm": 4.376456260681152, + "learning_rate": 6.906260653276758e-05, + "loss": 2.4339, + "step": 11748 + }, + { + "epoch": 0.7885641421428811, + "grad_norm": 4.613930702209473, + "learning_rate": 6.905255864733164e-05, + "loss": 2.2436, + "step": 11750 + }, + { + "epoch": 0.7886983658266501, + "grad_norm": 4.127045631408691, + "learning_rate": 6.90425098616998e-05, + "loss": 2.2539, + "step": 11752 + }, + { + "epoch": 0.7888325895104191, + "grad_norm": 4.0000739097595215, + "learning_rate": 6.903246017634677e-05, + "loss": 2.3186, + "step": 11754 + }, + { + "epoch": 0.7889668131941882, + "grad_norm": 4.156532287597656, + "learning_rate": 6.902240959174745e-05, + "loss": 2.4092, + "step": 11756 + }, + { + "epoch": 0.7891010368779571, + "grad_norm": 4.697082996368408, + "learning_rate": 6.901235810837669e-05, + "loss": 2.1325, + "step": 11758 + }, + { + "epoch": 0.7892352605617261, + "grad_norm": 4.503077983856201, + "learning_rate": 6.900230572670938e-05, + "loss": 2.5138, + "step": 11760 + }, + { + "epoch": 0.7893694842454951, + "grad_norm": 4.127632141113281, + "learning_rate": 6.89922524472205e-05, + "loss": 2.3626, + "step": 11762 + }, + { + "epoch": 0.7895037079292642, + "grad_norm": 3.816030502319336, + "learning_rate": 6.898219827038503e-05, + "loss": 2.0497, + "step": 11764 + }, + { + "epoch": 0.7896379316130331, + "grad_norm": 3.9024815559387207, + "learning_rate": 6.897214319667802e-05, + "loss": 2.2516, + "step": 11766 + }, + { + "epoch": 0.7897721552968021, + "grad_norm": 4.270203113555908, + "learning_rate": 6.896208722657455e-05, + "loss": 2.5766, + "step": 11768 + }, + { + "epoch": 0.7899063789805711, + "grad_norm": 3.540478467941284, + "learning_rate": 6.895203036054974e-05, + "loss": 2.2852, + "step": 11770 + }, + { + "epoch": 0.7900406026643402, + "grad_norm": 4.405488014221191, + "learning_rate": 6.894197259907879e-05, + "loss": 2.1134, + "step": 11772 + }, + { + "epoch": 0.7901748263481091, + "grad_norm": 3.8472824096679688, + "learning_rate": 6.893191394263684e-05, + "loss": 2.1531, + "step": 11774 + }, + { + "epoch": 0.7903090500318781, + "grad_norm": 3.6740059852600098, + "learning_rate": 6.892185439169922e-05, + "loss": 2.5515, + "step": 11776 + }, + { + "epoch": 0.7904432737156472, + "grad_norm": 4.199147701263428, + "learning_rate": 6.891179394674119e-05, + "loss": 2.0712, + "step": 11778 + }, + { + "epoch": 0.7905774973994161, + "grad_norm": 5.967792987823486, + "learning_rate": 6.890173260823807e-05, + "loss": 2.2457, + "step": 11780 + }, + { + "epoch": 0.7907117210831851, + "grad_norm": 3.915254592895508, + "learning_rate": 6.889167037666525e-05, + "loss": 2.1696, + "step": 11782 + }, + { + "epoch": 0.7908459447669541, + "grad_norm": 4.980099201202393, + "learning_rate": 6.888160725249816e-05, + "loss": 2.7172, + "step": 11784 + }, + { + "epoch": 0.7909801684507232, + "grad_norm": 6.468454837799072, + "learning_rate": 6.887154323621225e-05, + "loss": 2.4503, + "step": 11786 + }, + { + "epoch": 0.7911143921344921, + "grad_norm": 3.393594264984131, + "learning_rate": 6.886147832828303e-05, + "loss": 2.4619, + "step": 11788 + }, + { + "epoch": 0.7912486158182611, + "grad_norm": 4.082221508026123, + "learning_rate": 6.885141252918607e-05, + "loss": 2.4783, + "step": 11790 + }, + { + "epoch": 0.7913828395020301, + "grad_norm": 3.90731143951416, + "learning_rate": 6.884134583939692e-05, + "loss": 2.4365, + "step": 11792 + }, + { + "epoch": 0.7915170631857992, + "grad_norm": 4.254923343658447, + "learning_rate": 6.883127825939122e-05, + "loss": 2.6291, + "step": 11794 + }, + { + "epoch": 0.7916512868695681, + "grad_norm": 4.045051574707031, + "learning_rate": 6.882120978964466e-05, + "loss": 2.2295, + "step": 11796 + }, + { + "epoch": 0.7917855105533371, + "grad_norm": 4.0003533363342285, + "learning_rate": 6.881114043063296e-05, + "loss": 2.4193, + "step": 11798 + }, + { + "epoch": 0.7919197342371062, + "grad_norm": 4.750698566436768, + "learning_rate": 6.880107018283186e-05, + "loss": 2.2609, + "step": 11800 + }, + { + "epoch": 0.7920539579208752, + "grad_norm": 4.315789699554443, + "learning_rate": 6.879099904671715e-05, + "loss": 2.3327, + "step": 11802 + }, + { + "epoch": 0.7921881816046441, + "grad_norm": 5.336950778961182, + "learning_rate": 6.87809270227647e-05, + "loss": 1.9614, + "step": 11804 + }, + { + "epoch": 0.7923224052884131, + "grad_norm": 4.06541633605957, + "learning_rate": 6.877085411145038e-05, + "loss": 2.3611, + "step": 11806 + }, + { + "epoch": 0.7924566289721822, + "grad_norm": 4.463405132293701, + "learning_rate": 6.87607803132501e-05, + "loss": 2.2412, + "step": 11808 + }, + { + "epoch": 0.7925908526559512, + "grad_norm": 4.139978885650635, + "learning_rate": 6.875070562863986e-05, + "loss": 2.4751, + "step": 11810 + }, + { + "epoch": 0.7927250763397201, + "grad_norm": 4.0918354988098145, + "learning_rate": 6.874063005809563e-05, + "loss": 2.2608, + "step": 11812 + }, + { + "epoch": 0.7928593000234891, + "grad_norm": 4.152354717254639, + "learning_rate": 6.87305536020935e-05, + "loss": 2.417, + "step": 11814 + }, + { + "epoch": 0.7929935237072582, + "grad_norm": 4.1407318115234375, + "learning_rate": 6.872047626110955e-05, + "loss": 2.3668, + "step": 11816 + }, + { + "epoch": 0.7931277473910271, + "grad_norm": 7.112698078155518, + "learning_rate": 6.87103980356199e-05, + "loss": 2.2458, + "step": 11818 + }, + { + "epoch": 0.7932619710747961, + "grad_norm": 5.437074184417725, + "learning_rate": 6.870031892610073e-05, + "loss": 2.2344, + "step": 11820 + }, + { + "epoch": 0.7933961947585652, + "grad_norm": 4.076390743255615, + "learning_rate": 6.869023893302826e-05, + "loss": 2.3287, + "step": 11822 + }, + { + "epoch": 0.7935304184423342, + "grad_norm": 4.208154201507568, + "learning_rate": 6.868015805687877e-05, + "loss": 2.1845, + "step": 11824 + }, + { + "epoch": 0.7936646421261031, + "grad_norm": 4.688246250152588, + "learning_rate": 6.867007629812852e-05, + "loss": 2.9112, + "step": 11826 + }, + { + "epoch": 0.7937988658098721, + "grad_norm": 3.865987777709961, + "learning_rate": 6.865999365725391e-05, + "loss": 2.3098, + "step": 11828 + }, + { + "epoch": 0.7939330894936412, + "grad_norm": 4.318973064422607, + "learning_rate": 6.864991013473125e-05, + "loss": 2.3456, + "step": 11830 + }, + { + "epoch": 0.7940673131774102, + "grad_norm": 3.8203606605529785, + "learning_rate": 6.863982573103704e-05, + "loss": 2.2495, + "step": 11832 + }, + { + "epoch": 0.7942015368611791, + "grad_norm": 4.348165035247803, + "learning_rate": 6.862974044664772e-05, + "loss": 2.1852, + "step": 11834 + }, + { + "epoch": 0.7943357605449481, + "grad_norm": 4.1135053634643555, + "learning_rate": 6.861965428203978e-05, + "loss": 2.4131, + "step": 11836 + }, + { + "epoch": 0.7944699842287172, + "grad_norm": 4.3015899658203125, + "learning_rate": 6.860956723768981e-05, + "loss": 2.583, + "step": 11838 + }, + { + "epoch": 0.7946042079124862, + "grad_norm": 4.018304347991943, + "learning_rate": 6.859947931407436e-05, + "loss": 2.0528, + "step": 11840 + }, + { + "epoch": 0.7947384315962551, + "grad_norm": 3.769784927368164, + "learning_rate": 6.858939051167011e-05, + "loss": 2.4433, + "step": 11842 + }, + { + "epoch": 0.7948726552800242, + "grad_norm": 4.60347318649292, + "learning_rate": 6.85793008309537e-05, + "loss": 2.4962, + "step": 11844 + }, + { + "epoch": 0.7950068789637932, + "grad_norm": 4.397321701049805, + "learning_rate": 6.856921027240187e-05, + "loss": 2.3377, + "step": 11846 + }, + { + "epoch": 0.7951411026475622, + "grad_norm": 5.161647796630859, + "learning_rate": 6.855911883649137e-05, + "loss": 2.4645, + "step": 11848 + }, + { + "epoch": 0.7952753263313311, + "grad_norm": 4.812028884887695, + "learning_rate": 6.854902652369898e-05, + "loss": 2.9441, + "step": 11850 + }, + { + "epoch": 0.7954095500151002, + "grad_norm": 3.5488829612731934, + "learning_rate": 6.853893333450158e-05, + "loss": 2.2668, + "step": 11852 + }, + { + "epoch": 0.7955437736988692, + "grad_norm": 4.017467975616455, + "learning_rate": 6.852883926937602e-05, + "loss": 2.4292, + "step": 11854 + }, + { + "epoch": 0.7956779973826381, + "grad_norm": 3.7664098739624023, + "learning_rate": 6.851874432879925e-05, + "loss": 2.6504, + "step": 11856 + }, + { + "epoch": 0.7958122210664071, + "grad_norm": 4.338728904724121, + "learning_rate": 6.850864851324823e-05, + "loss": 2.2092, + "step": 11858 + }, + { + "epoch": 0.7959464447501762, + "grad_norm": 3.915762186050415, + "learning_rate": 6.849855182319995e-05, + "loss": 2.4129, + "step": 11860 + }, + { + "epoch": 0.7960806684339452, + "grad_norm": 3.823099136352539, + "learning_rate": 6.848845425913149e-05, + "loss": 2.656, + "step": 11862 + }, + { + "epoch": 0.7962148921177141, + "grad_norm": 4.269783020019531, + "learning_rate": 6.84783558215199e-05, + "loss": 2.6346, + "step": 11864 + }, + { + "epoch": 0.7963491158014832, + "grad_norm": 4.26074743270874, + "learning_rate": 6.846825651084236e-05, + "loss": 2.5069, + "step": 11866 + }, + { + "epoch": 0.7964833394852522, + "grad_norm": 3.3621456623077393, + "learning_rate": 6.8458156327576e-05, + "loss": 2.2395, + "step": 11868 + }, + { + "epoch": 0.7966175631690212, + "grad_norm": 3.664923906326294, + "learning_rate": 6.844805527219804e-05, + "loss": 2.3853, + "step": 11870 + }, + { + "epoch": 0.7967517868527901, + "grad_norm": 4.099206924438477, + "learning_rate": 6.843795334518576e-05, + "loss": 2.3313, + "step": 11872 + }, + { + "epoch": 0.7968860105365592, + "grad_norm": 3.848264455795288, + "learning_rate": 6.842785054701643e-05, + "loss": 2.691, + "step": 11874 + }, + { + "epoch": 0.7970202342203282, + "grad_norm": 4.262717247009277, + "learning_rate": 6.84177468781674e-05, + "loss": 2.315, + "step": 11876 + }, + { + "epoch": 0.7971544579040972, + "grad_norm": 4.188451766967773, + "learning_rate": 6.840764233911606e-05, + "loss": 2.2925, + "step": 11878 + }, + { + "epoch": 0.7972886815878661, + "grad_norm": 3.999814748764038, + "learning_rate": 6.83975369303398e-05, + "loss": 2.1512, + "step": 11880 + }, + { + "epoch": 0.7974229052716352, + "grad_norm": 4.679488182067871, + "learning_rate": 6.838743065231612e-05, + "loss": 2.6007, + "step": 11882 + }, + { + "epoch": 0.7975571289554042, + "grad_norm": 3.8606324195861816, + "learning_rate": 6.837732350552249e-05, + "loss": 2.375, + "step": 11884 + }, + { + "epoch": 0.7976913526391732, + "grad_norm": 4.720124244689941, + "learning_rate": 6.836721549043645e-05, + "loss": 2.3763, + "step": 11886 + }, + { + "epoch": 0.7978255763229422, + "grad_norm": 3.9931371212005615, + "learning_rate": 6.835710660753561e-05, + "loss": 2.3091, + "step": 11888 + }, + { + "epoch": 0.7979598000067112, + "grad_norm": 3.984501361846924, + "learning_rate": 6.834699685729757e-05, + "loss": 2.3606, + "step": 11890 + }, + { + "epoch": 0.7980940236904802, + "grad_norm": 3.475846529006958, + "learning_rate": 6.83368862402e-05, + "loss": 2.2178, + "step": 11892 + }, + { + "epoch": 0.7982282473742491, + "grad_norm": 3.840435743331909, + "learning_rate": 6.832677475672063e-05, + "loss": 2.4674, + "step": 11894 + }, + { + "epoch": 0.7983624710580182, + "grad_norm": 4.302377223968506, + "learning_rate": 6.831666240733718e-05, + "loss": 2.4395, + "step": 11896 + }, + { + "epoch": 0.7984966947417872, + "grad_norm": 4.756796360015869, + "learning_rate": 6.830654919252745e-05, + "loss": 2.5434, + "step": 11898 + }, + { + "epoch": 0.7986309184255562, + "grad_norm": 4.250802516937256, + "learning_rate": 6.829643511276929e-05, + "loss": 2.442, + "step": 11900 + }, + { + "epoch": 0.7987651421093251, + "grad_norm": 4.008729457855225, + "learning_rate": 6.828632016854051e-05, + "loss": 2.1341, + "step": 11902 + }, + { + "epoch": 0.7988993657930942, + "grad_norm": 3.9569737911224365, + "learning_rate": 6.82762043603191e-05, + "loss": 2.3481, + "step": 11904 + }, + { + "epoch": 0.7990335894768632, + "grad_norm": 3.598179578781128, + "learning_rate": 6.826608768858294e-05, + "loss": 2.3718, + "step": 11906 + }, + { + "epoch": 0.7991678131606322, + "grad_norm": 4.281187534332275, + "learning_rate": 6.825597015381007e-05, + "loss": 2.2684, + "step": 11908 + }, + { + "epoch": 0.7993020368444012, + "grad_norm": 3.9689600467681885, + "learning_rate": 6.824585175647852e-05, + "loss": 2.4667, + "step": 11910 + }, + { + "epoch": 0.7994362605281702, + "grad_norm": 3.585904121398926, + "learning_rate": 6.823573249706634e-05, + "loss": 2.3351, + "step": 11912 + }, + { + "epoch": 0.7995704842119392, + "grad_norm": 4.427435398101807, + "learning_rate": 6.822561237605167e-05, + "loss": 2.2338, + "step": 11914 + }, + { + "epoch": 0.7997047078957082, + "grad_norm": 4.686459064483643, + "learning_rate": 6.821549139391264e-05, + "loss": 2.67, + "step": 11916 + }, + { + "epoch": 0.7998389315794772, + "grad_norm": 3.6180546283721924, + "learning_rate": 6.820536955112747e-05, + "loss": 2.2918, + "step": 11918 + }, + { + "epoch": 0.7999731552632462, + "grad_norm": 4.427609920501709, + "learning_rate": 6.819524684817438e-05, + "loss": 2.6522, + "step": 11920 + }, + { + "epoch": 0.8001073789470152, + "grad_norm": 4.231048583984375, + "learning_rate": 6.818512328553166e-05, + "loss": 2.4479, + "step": 11922 + }, + { + "epoch": 0.8002416026307843, + "grad_norm": 4.1897993087768555, + "learning_rate": 6.817499886367763e-05, + "loss": 2.3876, + "step": 11924 + }, + { + "epoch": 0.8003758263145532, + "grad_norm": 4.174452304840088, + "learning_rate": 6.816487358309064e-05, + "loss": 2.5764, + "step": 11926 + }, + { + "epoch": 0.8005100499983222, + "grad_norm": 4.236385822296143, + "learning_rate": 6.815474744424908e-05, + "loss": 2.3369, + "step": 11928 + }, + { + "epoch": 0.8006442736820912, + "grad_norm": 4.439067840576172, + "learning_rate": 6.814462044763143e-05, + "loss": 2.367, + "step": 11930 + }, + { + "epoch": 0.8007784973658602, + "grad_norm": 6.180463790893555, + "learning_rate": 6.813449259371611e-05, + "loss": 2.4381, + "step": 11932 + }, + { + "epoch": 0.8009127210496292, + "grad_norm": 4.448161602020264, + "learning_rate": 6.81243638829817e-05, + "loss": 2.4421, + "step": 11934 + }, + { + "epoch": 0.8010469447333982, + "grad_norm": 4.206192970275879, + "learning_rate": 6.811423431590672e-05, + "loss": 2.5335, + "step": 11936 + }, + { + "epoch": 0.8011811684171672, + "grad_norm": 4.1249284744262695, + "learning_rate": 6.81041038929698e-05, + "loss": 2.296, + "step": 11938 + }, + { + "epoch": 0.8013153921009362, + "grad_norm": 4.6043381690979, + "learning_rate": 6.809397261464957e-05, + "loss": 2.1677, + "step": 11940 + }, + { + "epoch": 0.8014496157847052, + "grad_norm": 4.640593528747559, + "learning_rate": 6.808384048142472e-05, + "loss": 2.358, + "step": 11942 + }, + { + "epoch": 0.8015838394684742, + "grad_norm": 7.808780193328857, + "learning_rate": 6.807370749377396e-05, + "loss": 2.2745, + "step": 11944 + }, + { + "epoch": 0.8017180631522433, + "grad_norm": 3.72843599319458, + "learning_rate": 6.806357365217606e-05, + "loss": 2.3462, + "step": 11946 + }, + { + "epoch": 0.8018522868360122, + "grad_norm": 4.356242656707764, + "learning_rate": 6.805343895710983e-05, + "loss": 2.3593, + "step": 11948 + }, + { + "epoch": 0.8019865105197812, + "grad_norm": 3.7849576473236084, + "learning_rate": 6.80433034090541e-05, + "loss": 2.1373, + "step": 11950 + }, + { + "epoch": 0.8021207342035502, + "grad_norm": 4.395554542541504, + "learning_rate": 6.803316700848779e-05, + "loss": 2.617, + "step": 11952 + }, + { + "epoch": 0.8022549578873193, + "grad_norm": 3.9441540241241455, + "learning_rate": 6.802302975588976e-05, + "loss": 2.3527, + "step": 11954 + }, + { + "epoch": 0.8023891815710882, + "grad_norm": 4.612701892852783, + "learning_rate": 6.801289165173905e-05, + "loss": 2.1606, + "step": 11956 + }, + { + "epoch": 0.8025234052548572, + "grad_norm": 4.569306373596191, + "learning_rate": 6.800275269651462e-05, + "loss": 2.9041, + "step": 11958 + }, + { + "epoch": 0.8026576289386262, + "grad_norm": 4.055098533630371, + "learning_rate": 6.79926128906955e-05, + "loss": 2.318, + "step": 11960 + }, + { + "epoch": 0.8027918526223953, + "grad_norm": 4.485448360443115, + "learning_rate": 6.798247223476084e-05, + "loss": 2.3853, + "step": 11962 + }, + { + "epoch": 0.8029260763061642, + "grad_norm": 4.309789657592773, + "learning_rate": 6.79723307291897e-05, + "loss": 2.6057, + "step": 11964 + }, + { + "epoch": 0.8030602999899332, + "grad_norm": 4.194309234619141, + "learning_rate": 6.79621883744613e-05, + "loss": 2.3082, + "step": 11966 + }, + { + "epoch": 0.8031945236737023, + "grad_norm": 3.722787857055664, + "learning_rate": 6.79520451710548e-05, + "loss": 2.365, + "step": 11968 + }, + { + "epoch": 0.8033287473574712, + "grad_norm": 4.540039539337158, + "learning_rate": 6.794190111944948e-05, + "loss": 2.3128, + "step": 11970 + }, + { + "epoch": 0.8034629710412402, + "grad_norm": 3.7989771366119385, + "learning_rate": 6.79317562201246e-05, + "loss": 2.4401, + "step": 11972 + }, + { + "epoch": 0.8035971947250092, + "grad_norm": 3.887632369995117, + "learning_rate": 6.792161047355951e-05, + "loss": 2.3287, + "step": 11974 + }, + { + "epoch": 0.8037314184087783, + "grad_norm": 4.816518783569336, + "learning_rate": 6.791146388023356e-05, + "loss": 2.4146, + "step": 11976 + }, + { + "epoch": 0.8038656420925472, + "grad_norm": 5.422640800476074, + "learning_rate": 6.790131644062616e-05, + "loss": 2.5722, + "step": 11978 + }, + { + "epoch": 0.8039998657763162, + "grad_norm": 3.807874917984009, + "learning_rate": 6.789116815521678e-05, + "loss": 2.5366, + "step": 11980 + }, + { + "epoch": 0.8041340894600852, + "grad_norm": 4.698153495788574, + "learning_rate": 6.788101902448486e-05, + "loss": 2.0801, + "step": 11982 + }, + { + "epoch": 0.8042683131438543, + "grad_norm": 4.7957282066345215, + "learning_rate": 6.787086904890998e-05, + "loss": 2.3335, + "step": 11984 + }, + { + "epoch": 0.8044025368276232, + "grad_norm": 3.832075595855713, + "learning_rate": 6.786071822897166e-05, + "loss": 2.2171, + "step": 11986 + }, + { + "epoch": 0.8045367605113922, + "grad_norm": 3.8857131004333496, + "learning_rate": 6.785056656514953e-05, + "loss": 2.3594, + "step": 11988 + }, + { + "epoch": 0.8046709841951613, + "grad_norm": 4.381795406341553, + "learning_rate": 6.784041405792324e-05, + "loss": 2.2969, + "step": 11990 + }, + { + "epoch": 0.8048052078789303, + "grad_norm": 3.3679425716400146, + "learning_rate": 6.783026070777245e-05, + "loss": 2.1462, + "step": 11992 + }, + { + "epoch": 0.8049394315626992, + "grad_norm": 3.9914650917053223, + "learning_rate": 6.782010651517691e-05, + "loss": 2.3682, + "step": 11994 + }, + { + "epoch": 0.8050736552464682, + "grad_norm": 4.742506504058838, + "learning_rate": 6.780995148061638e-05, + "loss": 2.2224, + "step": 11996 + }, + { + "epoch": 0.8052078789302373, + "grad_norm": 4.59517765045166, + "learning_rate": 6.779979560457066e-05, + "loss": 2.4717, + "step": 11998 + }, + { + "epoch": 0.8053421026140063, + "grad_norm": 3.966500759124756, + "learning_rate": 6.778963888751961e-05, + "loss": 2.4603, + "step": 12000 + }, + { + "epoch": 0.8054763262977752, + "grad_norm": 4.63678503036499, + "learning_rate": 6.77794813299431e-05, + "loss": 2.2993, + "step": 12002 + }, + { + "epoch": 0.8056105499815442, + "grad_norm": 3.736947774887085, + "learning_rate": 6.776932293232106e-05, + "loss": 2.19, + "step": 12004 + }, + { + "epoch": 0.8057447736653133, + "grad_norm": 4.289041042327881, + "learning_rate": 6.775916369513344e-05, + "loss": 2.5548, + "step": 12006 + }, + { + "epoch": 0.8058789973490822, + "grad_norm": 3.8926138877868652, + "learning_rate": 6.774900361886028e-05, + "loss": 2.1093, + "step": 12008 + }, + { + "epoch": 0.8060132210328512, + "grad_norm": 4.012953758239746, + "learning_rate": 6.773884270398158e-05, + "loss": 2.5518, + "step": 12010 + }, + { + "epoch": 0.8061474447166203, + "grad_norm": 4.055338382720947, + "learning_rate": 6.772868095097745e-05, + "loss": 2.2775, + "step": 12012 + }, + { + "epoch": 0.8062816684003893, + "grad_norm": 3.680119514465332, + "learning_rate": 6.771851836032801e-05, + "loss": 2.0337, + "step": 12014 + }, + { + "epoch": 0.8064158920841582, + "grad_norm": 4.594735145568848, + "learning_rate": 6.770835493251342e-05, + "loss": 2.4778, + "step": 12016 + }, + { + "epoch": 0.8065501157679272, + "grad_norm": 3.751901626586914, + "learning_rate": 6.769819066801388e-05, + "loss": 2.2095, + "step": 12018 + }, + { + "epoch": 0.8066843394516963, + "grad_norm": 4.165175914764404, + "learning_rate": 6.768802556730964e-05, + "loss": 2.431, + "step": 12020 + }, + { + "epoch": 0.8068185631354653, + "grad_norm": 4.9608540534973145, + "learning_rate": 6.767785963088096e-05, + "loss": 2.3088, + "step": 12022 + }, + { + "epoch": 0.8069527868192342, + "grad_norm": 4.269825458526611, + "learning_rate": 6.766769285920819e-05, + "loss": 2.3633, + "step": 12024 + }, + { + "epoch": 0.8070870105030032, + "grad_norm": 4.02180814743042, + "learning_rate": 6.765752525277168e-05, + "loss": 2.1543, + "step": 12026 + }, + { + "epoch": 0.8072212341867723, + "grad_norm": 3.6681110858917236, + "learning_rate": 6.76473568120518e-05, + "loss": 2.154, + "step": 12028 + }, + { + "epoch": 0.8073554578705413, + "grad_norm": 3.8722779750823975, + "learning_rate": 6.763718753752901e-05, + "loss": 2.1763, + "step": 12030 + }, + { + "epoch": 0.8074896815543102, + "grad_norm": 4.26720666885376, + "learning_rate": 6.762701742968382e-05, + "loss": 2.6846, + "step": 12032 + }, + { + "epoch": 0.8076239052380793, + "grad_norm": 3.3866233825683594, + "learning_rate": 6.761684648899669e-05, + "loss": 2.3694, + "step": 12034 + }, + { + "epoch": 0.8077581289218483, + "grad_norm": 4.008025169372559, + "learning_rate": 6.760667471594821e-05, + "loss": 2.5075, + "step": 12036 + }, + { + "epoch": 0.8078923526056173, + "grad_norm": 4.6622185707092285, + "learning_rate": 6.7596502111019e-05, + "loss": 2.3859, + "step": 12038 + }, + { + "epoch": 0.8080265762893862, + "grad_norm": 3.6876566410064697, + "learning_rate": 6.758632867468964e-05, + "loss": 2.1359, + "step": 12040 + }, + { + "epoch": 0.8081607999731553, + "grad_norm": 4.1992082595825195, + "learning_rate": 6.757615440744084e-05, + "loss": 2.1775, + "step": 12042 + }, + { + "epoch": 0.8082950236569243, + "grad_norm": 3.654109001159668, + "learning_rate": 6.756597930975331e-05, + "loss": 2.2458, + "step": 12044 + }, + { + "epoch": 0.8084292473406932, + "grad_norm": 4.374342918395996, + "learning_rate": 6.75558033821078e-05, + "loss": 2.359, + "step": 12046 + }, + { + "epoch": 0.8085634710244622, + "grad_norm": 4.213740348815918, + "learning_rate": 6.754562662498509e-05, + "loss": 2.402, + "step": 12048 + }, + { + "epoch": 0.8086976947082313, + "grad_norm": 4.254446506500244, + "learning_rate": 6.753544903886602e-05, + "loss": 2.342, + "step": 12050 + }, + { + "epoch": 0.8088319183920003, + "grad_norm": 3.503413677215576, + "learning_rate": 6.75252706242315e-05, + "loss": 2.2339, + "step": 12052 + }, + { + "epoch": 0.8089661420757692, + "grad_norm": 4.212827682495117, + "learning_rate": 6.751509138156239e-05, + "loss": 2.3024, + "step": 12054 + }, + { + "epoch": 0.8091003657595383, + "grad_norm": 3.824294328689575, + "learning_rate": 6.750491131133962e-05, + "loss": 2.5268, + "step": 12056 + }, + { + "epoch": 0.8092345894433073, + "grad_norm": 4.160386562347412, + "learning_rate": 6.749473041404424e-05, + "loss": 2.1362, + "step": 12058 + }, + { + "epoch": 0.8093688131270763, + "grad_norm": 3.9776995182037354, + "learning_rate": 6.748454869015725e-05, + "loss": 2.2656, + "step": 12060 + }, + { + "epoch": 0.8095030368108452, + "grad_norm": 4.443406105041504, + "learning_rate": 6.747436614015972e-05, + "loss": 2.5201, + "step": 12062 + }, + { + "epoch": 0.8096372604946143, + "grad_norm": 3.691879987716675, + "learning_rate": 6.746418276453275e-05, + "loss": 2.2681, + "step": 12064 + }, + { + "epoch": 0.8097714841783833, + "grad_norm": 4.406975269317627, + "learning_rate": 6.745399856375749e-05, + "loss": 2.5514, + "step": 12066 + }, + { + "epoch": 0.8099057078621523, + "grad_norm": 3.550677537918091, + "learning_rate": 6.74438135383151e-05, + "loss": 2.2832, + "step": 12068 + }, + { + "epoch": 0.8100399315459212, + "grad_norm": 3.780902862548828, + "learning_rate": 6.743362768868682e-05, + "loss": 2.4096, + "step": 12070 + }, + { + "epoch": 0.8101741552296903, + "grad_norm": 3.585832118988037, + "learning_rate": 6.742344101535394e-05, + "loss": 2.0656, + "step": 12072 + }, + { + "epoch": 0.8103083789134593, + "grad_norm": 4.807201862335205, + "learning_rate": 6.741325351879771e-05, + "loss": 2.3043, + "step": 12074 + }, + { + "epoch": 0.8104426025972283, + "grad_norm": 4.071023941040039, + "learning_rate": 6.740306519949952e-05, + "loss": 2.3193, + "step": 12076 + }, + { + "epoch": 0.8105768262809973, + "grad_norm": 3.998645305633545, + "learning_rate": 6.739287605794069e-05, + "loss": 2.2049, + "step": 12078 + }, + { + "epoch": 0.8107110499647663, + "grad_norm": 4.352407455444336, + "learning_rate": 6.73826860946027e-05, + "loss": 2.8518, + "step": 12080 + }, + { + "epoch": 0.8108452736485353, + "grad_norm": 4.5442070960998535, + "learning_rate": 6.737249530996694e-05, + "loss": 2.2908, + "step": 12082 + }, + { + "epoch": 0.8109794973323042, + "grad_norm": 4.067252159118652, + "learning_rate": 6.736230370451496e-05, + "loss": 2.5378, + "step": 12084 + }, + { + "epoch": 0.8111137210160733, + "grad_norm": 3.537820816040039, + "learning_rate": 6.735211127872827e-05, + "loss": 2.0715, + "step": 12086 + }, + { + "epoch": 0.8112479446998423, + "grad_norm": 3.7288403511047363, + "learning_rate": 6.734191803308842e-05, + "loss": 2.2351, + "step": 12088 + }, + { + "epoch": 0.8113821683836113, + "grad_norm": 4.353682041168213, + "learning_rate": 6.733172396807708e-05, + "loss": 2.3525, + "step": 12090 + }, + { + "epoch": 0.8115163920673802, + "grad_norm": 3.9554054737091064, + "learning_rate": 6.732152908417583e-05, + "loss": 2.2037, + "step": 12092 + }, + { + "epoch": 0.8116506157511493, + "grad_norm": 5.100489616394043, + "learning_rate": 6.731133338186643e-05, + "loss": 2.8148, + "step": 12094 + }, + { + "epoch": 0.8117848394349183, + "grad_norm": 3.750927686691284, + "learning_rate": 6.730113686163055e-05, + "loss": 2.5577, + "step": 12096 + }, + { + "epoch": 0.8119190631186873, + "grad_norm": 4.187867641448975, + "learning_rate": 6.729093952394996e-05, + "loss": 2.3867, + "step": 12098 + }, + { + "epoch": 0.8120532868024563, + "grad_norm": 5.361365795135498, + "learning_rate": 6.72807413693065e-05, + "loss": 2.5561, + "step": 12100 + }, + { + "epoch": 0.8121875104862253, + "grad_norm": 4.458645343780518, + "learning_rate": 6.727054239818198e-05, + "loss": 2.3769, + "step": 12102 + }, + { + "epoch": 0.8123217341699943, + "grad_norm": 4.313767433166504, + "learning_rate": 6.72603426110583e-05, + "loss": 2.4508, + "step": 12104 + }, + { + "epoch": 0.8124559578537633, + "grad_norm": 4.2462286949157715, + "learning_rate": 6.725014200841738e-05, + "loss": 2.2251, + "step": 12106 + }, + { + "epoch": 0.8125901815375323, + "grad_norm": 3.971339464187622, + "learning_rate": 6.723994059074114e-05, + "loss": 2.3469, + "step": 12108 + }, + { + "epoch": 0.8127244052213013, + "grad_norm": 3.531235933303833, + "learning_rate": 6.722973835851162e-05, + "loss": 2.4569, + "step": 12110 + }, + { + "epoch": 0.8128586289050703, + "grad_norm": 6.26817512512207, + "learning_rate": 6.721953531221085e-05, + "loss": 2.3304, + "step": 12112 + }, + { + "epoch": 0.8129928525888394, + "grad_norm": 4.0628533363342285, + "learning_rate": 6.720933145232091e-05, + "loss": 2.2839, + "step": 12114 + }, + { + "epoch": 0.8131270762726083, + "grad_norm": 4.1753830909729, + "learning_rate": 6.719912677932389e-05, + "loss": 2.3906, + "step": 12116 + }, + { + "epoch": 0.8132612999563773, + "grad_norm": 4.501725196838379, + "learning_rate": 6.718892129370195e-05, + "loss": 2.1791, + "step": 12118 + }, + { + "epoch": 0.8133955236401463, + "grad_norm": 4.501204490661621, + "learning_rate": 6.717871499593728e-05, + "loss": 2.5419, + "step": 12120 + }, + { + "epoch": 0.8135297473239153, + "grad_norm": 3.9756031036376953, + "learning_rate": 6.71685078865121e-05, + "loss": 2.1278, + "step": 12122 + }, + { + "epoch": 0.8136639710076843, + "grad_norm": 4.483238697052002, + "learning_rate": 6.71582999659087e-05, + "loss": 2.4031, + "step": 12124 + }, + { + "epoch": 0.8137981946914533, + "grad_norm": 4.175754070281982, + "learning_rate": 6.714809123460935e-05, + "loss": 2.388, + "step": 12126 + }, + { + "epoch": 0.8139324183752223, + "grad_norm": 4.078599452972412, + "learning_rate": 6.713788169309641e-05, + "loss": 2.4835, + "step": 12128 + }, + { + "epoch": 0.8140666420589913, + "grad_norm": 4.107693672180176, + "learning_rate": 6.712767134185228e-05, + "loss": 2.1247, + "step": 12130 + }, + { + "epoch": 0.8142008657427603, + "grad_norm": 5.214900016784668, + "learning_rate": 6.711746018135933e-05, + "loss": 2.3069, + "step": 12132 + }, + { + "epoch": 0.8143350894265293, + "grad_norm": 4.954100131988525, + "learning_rate": 6.710724821210006e-05, + "loss": 2.4044, + "step": 12134 + }, + { + "epoch": 0.8144693131102984, + "grad_norm": 4.007582187652588, + "learning_rate": 6.709703543455695e-05, + "loss": 2.2437, + "step": 12136 + }, + { + "epoch": 0.8146035367940673, + "grad_norm": 4.103234767913818, + "learning_rate": 6.708682184921255e-05, + "loss": 2.3792, + "step": 12138 + }, + { + "epoch": 0.8147377604778363, + "grad_norm": 4.207164287567139, + "learning_rate": 6.70766074565494e-05, + "loss": 2.5038, + "step": 12140 + }, + { + "epoch": 0.8148719841616053, + "grad_norm": 5.270178318023682, + "learning_rate": 6.706639225705014e-05, + "loss": 2.693, + "step": 12142 + }, + { + "epoch": 0.8150062078453744, + "grad_norm": 3.9722371101379395, + "learning_rate": 6.705617625119738e-05, + "loss": 2.4645, + "step": 12144 + }, + { + "epoch": 0.8151404315291433, + "grad_norm": 4.305671215057373, + "learning_rate": 6.704595943947385e-05, + "loss": 2.4885, + "step": 12146 + }, + { + "epoch": 0.8152746552129123, + "grad_norm": 4.424032211303711, + "learning_rate": 6.703574182236226e-05, + "loss": 2.4169, + "step": 12148 + }, + { + "epoch": 0.8154088788966813, + "grad_norm": 4.1208977699279785, + "learning_rate": 6.702552340034535e-05, + "loss": 2.7071, + "step": 12150 + }, + { + "epoch": 0.8155431025804504, + "grad_norm": 5.488428115844727, + "learning_rate": 6.701530417390597e-05, + "loss": 2.1647, + "step": 12152 + }, + { + "epoch": 0.8156773262642193, + "grad_norm": 3.8692030906677246, + "learning_rate": 6.70050841435269e-05, + "loss": 2.3197, + "step": 12154 + }, + { + "epoch": 0.8158115499479883, + "grad_norm": 4.113739013671875, + "learning_rate": 6.699486330969106e-05, + "loss": 2.5397, + "step": 12156 + }, + { + "epoch": 0.8159457736317574, + "grad_norm": 4.363931655883789, + "learning_rate": 6.698464167288133e-05, + "loss": 2.5893, + "step": 12158 + }, + { + "epoch": 0.8160799973155263, + "grad_norm": 3.524725914001465, + "learning_rate": 6.697441923358068e-05, + "loss": 2.343, + "step": 12160 + }, + { + "epoch": 0.8162142209992953, + "grad_norm": 3.6724839210510254, + "learning_rate": 6.696419599227213e-05, + "loss": 2.2751, + "step": 12162 + }, + { + "epoch": 0.8163484446830643, + "grad_norm": 3.5649564266204834, + "learning_rate": 6.695397194943864e-05, + "loss": 2.0539, + "step": 12164 + }, + { + "epoch": 0.8164826683668334, + "grad_norm": 5.119905471801758, + "learning_rate": 6.694374710556335e-05, + "loss": 2.46, + "step": 12166 + }, + { + "epoch": 0.8166168920506023, + "grad_norm": 4.477723598480225, + "learning_rate": 6.69335214611293e-05, + "loss": 2.5698, + "step": 12168 + }, + { + "epoch": 0.8167511157343713, + "grad_norm": 5.36578893661499, + "learning_rate": 6.692329501661966e-05, + "loss": 2.4763, + "step": 12170 + }, + { + "epoch": 0.8168853394181403, + "grad_norm": 4.171503067016602, + "learning_rate": 6.691306777251762e-05, + "loss": 2.7111, + "step": 12172 + }, + { + "epoch": 0.8170195631019094, + "grad_norm": 4.299825191497803, + "learning_rate": 6.690283972930639e-05, + "loss": 2.5374, + "step": 12174 + }, + { + "epoch": 0.8171537867856783, + "grad_norm": 4.065514087677002, + "learning_rate": 6.689261088746921e-05, + "loss": 2.5355, + "step": 12176 + }, + { + "epoch": 0.8172880104694473, + "grad_norm": 4.045982837677002, + "learning_rate": 6.688238124748939e-05, + "loss": 2.3861, + "step": 12178 + }, + { + "epoch": 0.8174222341532164, + "grad_norm": 4.3706769943237305, + "learning_rate": 6.687215080985025e-05, + "loss": 2.5542, + "step": 12180 + }, + { + "epoch": 0.8175564578369854, + "grad_norm": 4.2047600746154785, + "learning_rate": 6.686191957503517e-05, + "loss": 2.3242, + "step": 12182 + }, + { + "epoch": 0.8176906815207543, + "grad_norm": 5.149896144866943, + "learning_rate": 6.685168754352754e-05, + "loss": 2.4218, + "step": 12184 + }, + { + "epoch": 0.8178249052045233, + "grad_norm": 4.221558570861816, + "learning_rate": 6.684145471581081e-05, + "loss": 2.0379, + "step": 12186 + }, + { + "epoch": 0.8179591288882924, + "grad_norm": 4.023637771606445, + "learning_rate": 6.683122109236845e-05, + "loss": 2.5477, + "step": 12188 + }, + { + "epoch": 0.8180933525720614, + "grad_norm": 3.964062452316284, + "learning_rate": 6.682098667368403e-05, + "loss": 2.5959, + "step": 12190 + }, + { + "epoch": 0.8182275762558303, + "grad_norm": 4.3608293533325195, + "learning_rate": 6.681075146024104e-05, + "loss": 2.1815, + "step": 12192 + }, + { + "epoch": 0.8183617999395993, + "grad_norm": 8.469134330749512, + "learning_rate": 6.68005154525231e-05, + "loss": 2.4811, + "step": 12194 + }, + { + "epoch": 0.8184960236233684, + "grad_norm": 3.5421078205108643, + "learning_rate": 6.679027865101383e-05, + "loss": 2.5033, + "step": 12196 + }, + { + "epoch": 0.8186302473071373, + "grad_norm": 4.10988712310791, + "learning_rate": 6.678004105619693e-05, + "loss": 2.2995, + "step": 12198 + }, + { + "epoch": 0.8187644709909063, + "grad_norm": 3.6743173599243164, + "learning_rate": 6.676980266855608e-05, + "loss": 2.2513, + "step": 12200 + }, + { + "epoch": 0.8188986946746754, + "grad_norm": 3.790402889251709, + "learning_rate": 6.675956348857504e-05, + "loss": 2.1483, + "step": 12202 + }, + { + "epoch": 0.8190329183584444, + "grad_norm": 4.197056293487549, + "learning_rate": 6.674932351673758e-05, + "loss": 2.7135, + "step": 12204 + }, + { + "epoch": 0.8191671420422133, + "grad_norm": 4.827541828155518, + "learning_rate": 6.67390827535275e-05, + "loss": 2.3438, + "step": 12206 + }, + { + "epoch": 0.8193013657259823, + "grad_norm": 4.331939220428467, + "learning_rate": 6.672884119942868e-05, + "loss": 2.4267, + "step": 12208 + }, + { + "epoch": 0.8194355894097514, + "grad_norm": 4.927274227142334, + "learning_rate": 6.671859885492502e-05, + "loss": 2.48, + "step": 12210 + }, + { + "epoch": 0.8195698130935204, + "grad_norm": 4.325490474700928, + "learning_rate": 6.670835572050043e-05, + "loss": 2.2924, + "step": 12212 + }, + { + "epoch": 0.8197040367772893, + "grad_norm": 3.68344783782959, + "learning_rate": 6.669811179663891e-05, + "loss": 2.2859, + "step": 12214 + }, + { + "epoch": 0.8198382604610583, + "grad_norm": 4.01980447769165, + "learning_rate": 6.668786708382441e-05, + "loss": 2.0717, + "step": 12216 + }, + { + "epoch": 0.8199724841448274, + "grad_norm": 4.366873741149902, + "learning_rate": 6.667762158254104e-05, + "loss": 2.4595, + "step": 12218 + }, + { + "epoch": 0.8201067078285964, + "grad_norm": 4.293093204498291, + "learning_rate": 6.666737529327282e-05, + "loss": 2.3963, + "step": 12220 + }, + { + "epoch": 0.8202409315123653, + "grad_norm": 4.267847537994385, + "learning_rate": 6.66571282165039e-05, + "loss": 2.7272, + "step": 12222 + }, + { + "epoch": 0.8203751551961344, + "grad_norm": 4.015305519104004, + "learning_rate": 6.664688035271843e-05, + "loss": 2.4136, + "step": 12224 + }, + { + "epoch": 0.8205093788799034, + "grad_norm": 3.8808066844940186, + "learning_rate": 6.66366317024006e-05, + "loss": 2.1557, + "step": 12226 + }, + { + "epoch": 0.8206436025636724, + "grad_norm": 4.428849697113037, + "learning_rate": 6.662638226603463e-05, + "loss": 2.1745, + "step": 12228 + }, + { + "epoch": 0.8207778262474413, + "grad_norm": 3.9925777912139893, + "learning_rate": 6.661613204410479e-05, + "loss": 2.389, + "step": 12230 + }, + { + "epoch": 0.8209120499312104, + "grad_norm": 4.185019493103027, + "learning_rate": 6.66058810370954e-05, + "loss": 2.1874, + "step": 12232 + }, + { + "epoch": 0.8210462736149794, + "grad_norm": 4.374430179595947, + "learning_rate": 6.659562924549076e-05, + "loss": 2.1074, + "step": 12234 + }, + { + "epoch": 0.8211804972987483, + "grad_norm": 4.644327640533447, + "learning_rate": 6.658537666977529e-05, + "loss": 2.6376, + "step": 12236 + }, + { + "epoch": 0.8213147209825173, + "grad_norm": 4.416599750518799, + "learning_rate": 6.657512331043339e-05, + "loss": 2.5466, + "step": 12238 + }, + { + "epoch": 0.8214489446662864, + "grad_norm": 3.8600032329559326, + "learning_rate": 6.65648691679495e-05, + "loss": 2.3075, + "step": 12240 + }, + { + "epoch": 0.8215831683500554, + "grad_norm": 4.793914794921875, + "learning_rate": 6.65546142428081e-05, + "loss": 2.7288, + "step": 12242 + }, + { + "epoch": 0.8217173920338243, + "grad_norm": 5.383265018463135, + "learning_rate": 6.654435853549375e-05, + "loss": 2.3846, + "step": 12244 + }, + { + "epoch": 0.8218516157175934, + "grad_norm": 4.518656253814697, + "learning_rate": 6.653410204649099e-05, + "loss": 2.5316, + "step": 12246 + }, + { + "epoch": 0.8219858394013624, + "grad_norm": 3.948580741882324, + "learning_rate": 6.652384477628442e-05, + "loss": 2.4109, + "step": 12248 + }, + { + "epoch": 0.8221200630851314, + "grad_norm": 4.008388042449951, + "learning_rate": 6.651358672535868e-05, + "loss": 2.5197, + "step": 12250 + }, + { + "epoch": 0.8222542867689003, + "grad_norm": 4.135456085205078, + "learning_rate": 6.650332789419844e-05, + "loss": 2.4276, + "step": 12252 + }, + { + "epoch": 0.8223885104526694, + "grad_norm": 3.9457364082336426, + "learning_rate": 6.64930682832884e-05, + "loss": 2.5246, + "step": 12254 + }, + { + "epoch": 0.8225227341364384, + "grad_norm": 4.037944316864014, + "learning_rate": 6.648280789311332e-05, + "loss": 2.6116, + "step": 12256 + }, + { + "epoch": 0.8226569578202074, + "grad_norm": 3.746868133544922, + "learning_rate": 6.6472546724158e-05, + "loss": 2.3013, + "step": 12258 + }, + { + "epoch": 0.8227911815039763, + "grad_norm": 4.420881271362305, + "learning_rate": 6.646228477690722e-05, + "loss": 2.7024, + "step": 12260 + }, + { + "epoch": 0.8229254051877454, + "grad_norm": 3.724362850189209, + "learning_rate": 6.645202205184584e-05, + "loss": 1.9816, + "step": 12262 + }, + { + "epoch": 0.8230596288715144, + "grad_norm": 4.266753673553467, + "learning_rate": 6.644175854945878e-05, + "loss": 2.475, + "step": 12264 + }, + { + "epoch": 0.8231938525552834, + "grad_norm": 3.9063615798950195, + "learning_rate": 6.643149427023097e-05, + "loss": 2.4515, + "step": 12266 + }, + { + "epoch": 0.8233280762390524, + "grad_norm": 5.489950656890869, + "learning_rate": 6.642122921464736e-05, + "loss": 2.6128, + "step": 12268 + }, + { + "epoch": 0.8234622999228214, + "grad_norm": 3.725987434387207, + "learning_rate": 6.641096338319297e-05, + "loss": 2.2314, + "step": 12270 + }, + { + "epoch": 0.8235965236065904, + "grad_norm": 5.513998508453369, + "learning_rate": 6.640069677635282e-05, + "loss": 2.391, + "step": 12272 + }, + { + "epoch": 0.8237307472903593, + "grad_norm": 4.486222267150879, + "learning_rate": 6.6390429394612e-05, + "loss": 2.4994, + "step": 12274 + }, + { + "epoch": 0.8238649709741284, + "grad_norm": 4.494882106781006, + "learning_rate": 6.638016123845562e-05, + "loss": 2.421, + "step": 12276 + }, + { + "epoch": 0.8239991946578974, + "grad_norm": 4.757532119750977, + "learning_rate": 6.636989230836884e-05, + "loss": 2.3781, + "step": 12278 + }, + { + "epoch": 0.8241334183416664, + "grad_norm": 4.679840564727783, + "learning_rate": 6.635962260483683e-05, + "loss": 2.3506, + "step": 12280 + }, + { + "epoch": 0.8242676420254353, + "grad_norm": 4.06081485748291, + "learning_rate": 6.634935212834483e-05, + "loss": 1.9943, + "step": 12282 + }, + { + "epoch": 0.8244018657092044, + "grad_norm": 4.084214687347412, + "learning_rate": 6.63390808793781e-05, + "loss": 2.4161, + "step": 12284 + }, + { + "epoch": 0.8245360893929734, + "grad_norm": 4.192774772644043, + "learning_rate": 6.63288088584219e-05, + "loss": 2.4192, + "step": 12286 + }, + { + "epoch": 0.8246703130767424, + "grad_norm": 5.120491027832031, + "learning_rate": 6.63185360659616e-05, + "loss": 2.2073, + "step": 12288 + }, + { + "epoch": 0.8248045367605114, + "grad_norm": 3.9698641300201416, + "learning_rate": 6.630826250248256e-05, + "loss": 2.2244, + "step": 12290 + }, + { + "epoch": 0.8249387604442804, + "grad_norm": 3.7444067001342773, + "learning_rate": 6.629798816847019e-05, + "loss": 2.037, + "step": 12292 + }, + { + "epoch": 0.8250729841280494, + "grad_norm": 4.0939507484436035, + "learning_rate": 6.628771306440994e-05, + "loss": 2.2781, + "step": 12294 + }, + { + "epoch": 0.8252072078118184, + "grad_norm": 5.866815567016602, + "learning_rate": 6.627743719078725e-05, + "loss": 2.3106, + "step": 12296 + }, + { + "epoch": 0.8253414314955874, + "grad_norm": 4.578657150268555, + "learning_rate": 6.626716054808768e-05, + "loss": 2.538, + "step": 12298 + }, + { + "epoch": 0.8254756551793564, + "grad_norm": 3.695403575897217, + "learning_rate": 6.625688313679676e-05, + "loss": 2.3558, + "step": 12300 + }, + { + "epoch": 0.8256098788631254, + "grad_norm": 4.383969306945801, + "learning_rate": 6.624660495740007e-05, + "loss": 2.4218, + "step": 12302 + }, + { + "epoch": 0.8257441025468945, + "grad_norm": 3.864387273788452, + "learning_rate": 6.623632601038325e-05, + "loss": 2.3698, + "step": 12304 + }, + { + "epoch": 0.8258783262306634, + "grad_norm": 3.755929708480835, + "learning_rate": 6.622604629623196e-05, + "loss": 2.6467, + "step": 12306 + }, + { + "epoch": 0.8260125499144324, + "grad_norm": 3.8158481121063232, + "learning_rate": 6.621576581543189e-05, + "loss": 2.2788, + "step": 12308 + }, + { + "epoch": 0.8261467735982014, + "grad_norm": 5.101398944854736, + "learning_rate": 6.620548456846876e-05, + "loss": 2.1868, + "step": 12310 + }, + { + "epoch": 0.8262809972819704, + "grad_norm": 4.16798734664917, + "learning_rate": 6.619520255582834e-05, + "loss": 2.5105, + "step": 12312 + }, + { + "epoch": 0.8264152209657394, + "grad_norm": 4.561279296875, + "learning_rate": 6.618491977799648e-05, + "loss": 2.2824, + "step": 12314 + }, + { + "epoch": 0.8265494446495084, + "grad_norm": 4.481846332550049, + "learning_rate": 6.617463623545895e-05, + "loss": 2.4035, + "step": 12316 + }, + { + "epoch": 0.8266836683332774, + "grad_norm": 4.176505088806152, + "learning_rate": 6.61643519287017e-05, + "loss": 2.6136, + "step": 12318 + }, + { + "epoch": 0.8268178920170464, + "grad_norm": 4.552618980407715, + "learning_rate": 6.615406685821058e-05, + "loss": 2.3941, + "step": 12320 + }, + { + "epoch": 0.8269521157008154, + "grad_norm": 4.570150375366211, + "learning_rate": 6.614378102447158e-05, + "loss": 2.4801, + "step": 12322 + }, + { + "epoch": 0.8270863393845844, + "grad_norm": 3.988173484802246, + "learning_rate": 6.613349442797066e-05, + "loss": 2.2483, + "step": 12324 + }, + { + "epoch": 0.8272205630683535, + "grad_norm": 4.044221878051758, + "learning_rate": 6.612320706919387e-05, + "loss": 2.2139, + "step": 12326 + }, + { + "epoch": 0.8273547867521224, + "grad_norm": 3.770693778991699, + "learning_rate": 6.611291894862726e-05, + "loss": 2.3951, + "step": 12328 + }, + { + "epoch": 0.8274890104358914, + "grad_norm": 4.457366943359375, + "learning_rate": 6.610263006675688e-05, + "loss": 2.3244, + "step": 12330 + }, + { + "epoch": 0.8276232341196604, + "grad_norm": 3.984312057495117, + "learning_rate": 6.609234042406892e-05, + "loss": 2.2482, + "step": 12332 + }, + { + "epoch": 0.8277574578034295, + "grad_norm": 4.414451599121094, + "learning_rate": 6.60820500210495e-05, + "loss": 2.5196, + "step": 12334 + }, + { + "epoch": 0.8278916814871984, + "grad_norm": 4.278848171234131, + "learning_rate": 6.607175885818485e-05, + "loss": 2.3785, + "step": 12336 + }, + { + "epoch": 0.8280259051709674, + "grad_norm": 3.971078395843506, + "learning_rate": 6.60614669359612e-05, + "loss": 2.4399, + "step": 12338 + }, + { + "epoch": 0.8281601288547364, + "grad_norm": 3.788719654083252, + "learning_rate": 6.605117425486482e-05, + "loss": 2.2136, + "step": 12340 + }, + { + "epoch": 0.8282943525385055, + "grad_norm": 4.0413947105407715, + "learning_rate": 6.604088081538203e-05, + "loss": 2.3101, + "step": 12342 + }, + { + "epoch": 0.8284285762222744, + "grad_norm": 3.8304154872894287, + "learning_rate": 6.603058661799915e-05, + "loss": 2.1936, + "step": 12344 + }, + { + "epoch": 0.8285627999060434, + "grad_norm": 5.874207973480225, + "learning_rate": 6.602029166320258e-05, + "loss": 2.4613, + "step": 12346 + }, + { + "epoch": 0.8286970235898125, + "grad_norm": 4.188701629638672, + "learning_rate": 6.600999595147872e-05, + "loss": 2.2656, + "step": 12348 + }, + { + "epoch": 0.8288312472735814, + "grad_norm": 4.529519081115723, + "learning_rate": 6.599969948331403e-05, + "loss": 2.4881, + "step": 12350 + }, + { + "epoch": 0.8289654709573504, + "grad_norm": 4.031110763549805, + "learning_rate": 6.598940225919504e-05, + "loss": 2.1678, + "step": 12352 + }, + { + "epoch": 0.8290996946411194, + "grad_norm": 3.9845433235168457, + "learning_rate": 6.59791042796082e-05, + "loss": 2.1235, + "step": 12354 + }, + { + "epoch": 0.8292339183248885, + "grad_norm": 4.89502477645874, + "learning_rate": 6.596880554504011e-05, + "loss": 2.146, + "step": 12356 + }, + { + "epoch": 0.8293681420086574, + "grad_norm": 4.253124237060547, + "learning_rate": 6.595850605597736e-05, + "loss": 2.3067, + "step": 12358 + }, + { + "epoch": 0.8295023656924264, + "grad_norm": 5.683328151702881, + "learning_rate": 6.594820581290659e-05, + "loss": 2.835, + "step": 12360 + }, + { + "epoch": 0.8296365893761954, + "grad_norm": 3.843940258026123, + "learning_rate": 6.593790481631445e-05, + "loss": 2.3485, + "step": 12362 + }, + { + "epoch": 0.8297708130599645, + "grad_norm": 4.190773963928223, + "learning_rate": 6.592760306668763e-05, + "loss": 2.2006, + "step": 12364 + }, + { + "epoch": 0.8299050367437334, + "grad_norm": 4.197145938873291, + "learning_rate": 6.591730056451292e-05, + "loss": 2.3457, + "step": 12366 + }, + { + "epoch": 0.8300392604275024, + "grad_norm": 4.325713634490967, + "learning_rate": 6.590699731027703e-05, + "loss": 2.0396, + "step": 12368 + }, + { + "epoch": 0.8301734841112715, + "grad_norm": 3.9802017211914062, + "learning_rate": 6.589669330446682e-05, + "loss": 2.4611, + "step": 12370 + }, + { + "epoch": 0.8303077077950405, + "grad_norm": 4.3565144538879395, + "learning_rate": 6.58863885475691e-05, + "loss": 2.295, + "step": 12372 + }, + { + "epoch": 0.8304419314788094, + "grad_norm": 4.69005823135376, + "learning_rate": 6.587608304007075e-05, + "loss": 2.2813, + "step": 12374 + }, + { + "epoch": 0.8305761551625784, + "grad_norm": 4.541662216186523, + "learning_rate": 6.586577678245872e-05, + "loss": 2.3955, + "step": 12376 + }, + { + "epoch": 0.8307103788463475, + "grad_norm": 3.9838244915008545, + "learning_rate": 6.58554697752199e-05, + "loss": 2.5086, + "step": 12378 + }, + { + "epoch": 0.8308446025301165, + "grad_norm": 4.057279109954834, + "learning_rate": 6.584516201884134e-05, + "loss": 2.3336, + "step": 12380 + }, + { + "epoch": 0.8309788262138854, + "grad_norm": 4.138448715209961, + "learning_rate": 6.583485351381002e-05, + "loss": 2.4857, + "step": 12382 + }, + { + "epoch": 0.8311130498976544, + "grad_norm": 3.944072723388672, + "learning_rate": 6.582454426061302e-05, + "loss": 2.3095, + "step": 12384 + }, + { + "epoch": 0.8312472735814235, + "grad_norm": 4.922465801239014, + "learning_rate": 6.581423425973741e-05, + "loss": 2.6321, + "step": 12386 + }, + { + "epoch": 0.8313814972651924, + "grad_norm": 4.128320693969727, + "learning_rate": 6.580392351167033e-05, + "loss": 2.6602, + "step": 12388 + }, + { + "epoch": 0.8315157209489614, + "grad_norm": 3.937958240509033, + "learning_rate": 6.579361201689895e-05, + "loss": 2.4461, + "step": 12390 + }, + { + "epoch": 0.8316499446327305, + "grad_norm": 4.767930030822754, + "learning_rate": 6.578329977591045e-05, + "loss": 2.4018, + "step": 12392 + }, + { + "epoch": 0.8317841683164995, + "grad_norm": 4.144962787628174, + "learning_rate": 6.577298678919209e-05, + "loss": 2.2325, + "step": 12394 + }, + { + "epoch": 0.8319183920002684, + "grad_norm": 4.773271083831787, + "learning_rate": 6.576267305723111e-05, + "loss": 2.1338, + "step": 12396 + }, + { + "epoch": 0.8320526156840374, + "grad_norm": 3.87656569480896, + "learning_rate": 6.575235858051481e-05, + "loss": 2.5854, + "step": 12398 + }, + { + "epoch": 0.8321868393678065, + "grad_norm": 4.232511520385742, + "learning_rate": 6.574204335953056e-05, + "loss": 2.2508, + "step": 12400 + }, + { + "epoch": 0.8323210630515755, + "grad_norm": 17.29722785949707, + "learning_rate": 6.57317273947657e-05, + "loss": 2.4671, + "step": 12402 + }, + { + "epoch": 0.8324552867353444, + "grad_norm": 3.9943947792053223, + "learning_rate": 6.572141068670768e-05, + "loss": 2.1579, + "step": 12404 + }, + { + "epoch": 0.8325895104191134, + "grad_norm": 3.745868682861328, + "learning_rate": 6.57110932358439e-05, + "loss": 2.3336, + "step": 12406 + }, + { + "epoch": 0.8327237341028825, + "grad_norm": 4.1321187019348145, + "learning_rate": 6.570077504266188e-05, + "loss": 2.2651, + "step": 12408 + }, + { + "epoch": 0.8328579577866515, + "grad_norm": 3.8748979568481445, + "learning_rate": 6.569045610764909e-05, + "loss": 2.2416, + "step": 12410 + }, + { + "epoch": 0.8329921814704204, + "grad_norm": 5.4878106117248535, + "learning_rate": 6.568013643129311e-05, + "loss": 2.3992, + "step": 12412 + }, + { + "epoch": 0.8331264051541895, + "grad_norm": 3.937441110610962, + "learning_rate": 6.566981601408153e-05, + "loss": 2.1806, + "step": 12414 + }, + { + "epoch": 0.8332606288379585, + "grad_norm": 3.918555736541748, + "learning_rate": 6.565949485650193e-05, + "loss": 2.3829, + "step": 12416 + }, + { + "epoch": 0.8333948525217275, + "grad_norm": 4.289477348327637, + "learning_rate": 6.564917295904201e-05, + "loss": 2.3792, + "step": 12418 + }, + { + "epoch": 0.8335290762054964, + "grad_norm": 4.174271583557129, + "learning_rate": 6.563885032218943e-05, + "loss": 2.5228, + "step": 12420 + }, + { + "epoch": 0.8336632998892655, + "grad_norm": 4.806816577911377, + "learning_rate": 6.562852694643194e-05, + "loss": 2.1337, + "step": 12422 + }, + { + "epoch": 0.8337975235730345, + "grad_norm": 7.162111759185791, + "learning_rate": 6.561820283225726e-05, + "loss": 2.3283, + "step": 12424 + }, + { + "epoch": 0.8339317472568034, + "grad_norm": 3.468047857284546, + "learning_rate": 6.560787798015323e-05, + "loss": 2.2646, + "step": 12426 + }, + { + "epoch": 0.8340659709405724, + "grad_norm": 4.713889122009277, + "learning_rate": 6.559755239060765e-05, + "loss": 2.5064, + "step": 12428 + }, + { + "epoch": 0.8342001946243415, + "grad_norm": 4.3931684494018555, + "learning_rate": 6.558722606410839e-05, + "loss": 2.1819, + "step": 12430 + }, + { + "epoch": 0.8343344183081105, + "grad_norm": 3.880244255065918, + "learning_rate": 6.557689900114337e-05, + "loss": 2.5382, + "step": 12432 + }, + { + "epoch": 0.8344686419918794, + "grad_norm": 5.756272792816162, + "learning_rate": 6.556657120220046e-05, + "loss": 2.3187, + "step": 12434 + }, + { + "epoch": 0.8346028656756485, + "grad_norm": 3.887141466140747, + "learning_rate": 6.555624266776771e-05, + "loss": 2.0641, + "step": 12436 + }, + { + "epoch": 0.8347370893594175, + "grad_norm": 4.206582546234131, + "learning_rate": 6.554591339833307e-05, + "loss": 2.018, + "step": 12438 + }, + { + "epoch": 0.8348713130431865, + "grad_norm": 4.100188255310059, + "learning_rate": 6.553558339438457e-05, + "loss": 2.1552, + "step": 12440 + }, + { + "epoch": 0.8350055367269554, + "grad_norm": 4.3288798332214355, + "learning_rate": 6.552525265641034e-05, + "loss": 2.3467, + "step": 12442 + }, + { + "epoch": 0.8351397604107245, + "grad_norm": 4.296035289764404, + "learning_rate": 6.551492118489843e-05, + "loss": 2.4768, + "step": 12444 + }, + { + "epoch": 0.8352739840944935, + "grad_norm": 4.481319427490234, + "learning_rate": 6.550458898033701e-05, + "loss": 2.2597, + "step": 12446 + }, + { + "epoch": 0.8354082077782625, + "grad_norm": 3.536719799041748, + "learning_rate": 6.549425604321424e-05, + "loss": 2.3018, + "step": 12448 + }, + { + "epoch": 0.8355424314620314, + "grad_norm": 4.264576435089111, + "learning_rate": 6.548392237401836e-05, + "loss": 2.3111, + "step": 12450 + }, + { + "epoch": 0.8356766551458005, + "grad_norm": 4.516921043395996, + "learning_rate": 6.547358797323758e-05, + "loss": 2.4993, + "step": 12452 + }, + { + "epoch": 0.8358108788295695, + "grad_norm": 4.059057235717773, + "learning_rate": 6.546325284136019e-05, + "loss": 2.5054, + "step": 12454 + }, + { + "epoch": 0.8359451025133385, + "grad_norm": 4.279839515686035, + "learning_rate": 6.545291697887453e-05, + "loss": 2.5482, + "step": 12456 + }, + { + "epoch": 0.8360793261971075, + "grad_norm": 4.994450569152832, + "learning_rate": 6.544258038626891e-05, + "loss": 2.494, + "step": 12458 + }, + { + "epoch": 0.8362135498808765, + "grad_norm": 3.9286320209503174, + "learning_rate": 6.543224306403174e-05, + "loss": 2.2492, + "step": 12460 + }, + { + "epoch": 0.8363477735646455, + "grad_norm": 4.060963153839111, + "learning_rate": 6.542190501265144e-05, + "loss": 2.2235, + "step": 12462 + }, + { + "epoch": 0.8364819972484144, + "grad_norm": 3.7335662841796875, + "learning_rate": 6.541156623261646e-05, + "loss": 2.6808, + "step": 12464 + }, + { + "epoch": 0.8366162209321835, + "grad_norm": 8.624871253967285, + "learning_rate": 6.540122672441526e-05, + "loss": 2.5018, + "step": 12466 + }, + { + "epoch": 0.8367504446159525, + "grad_norm": 4.238193511962891, + "learning_rate": 6.53908864885364e-05, + "loss": 2.6812, + "step": 12468 + }, + { + "epoch": 0.8368846682997215, + "grad_norm": 4.334986686706543, + "learning_rate": 6.538054552546841e-05, + "loss": 2.4281, + "step": 12470 + }, + { + "epoch": 0.8370188919834904, + "grad_norm": 3.9105429649353027, + "learning_rate": 6.537020383569988e-05, + "loss": 2.2319, + "step": 12472 + }, + { + "epoch": 0.8371531156672595, + "grad_norm": 3.429661750793457, + "learning_rate": 6.535986141971946e-05, + "loss": 2.0768, + "step": 12474 + }, + { + "epoch": 0.8372873393510285, + "grad_norm": 4.185454368591309, + "learning_rate": 6.534951827801579e-05, + "loss": 2.5052, + "step": 12476 + }, + { + "epoch": 0.8374215630347975, + "grad_norm": 4.344455718994141, + "learning_rate": 6.533917441107755e-05, + "loss": 2.257, + "step": 12478 + }, + { + "epoch": 0.8375557867185665, + "grad_norm": 4.156767845153809, + "learning_rate": 6.532882981939349e-05, + "loss": 2.332, + "step": 12480 + }, + { + "epoch": 0.8376900104023355, + "grad_norm": 4.473140716552734, + "learning_rate": 6.531848450345236e-05, + "loss": 2.4439, + "step": 12482 + }, + { + "epoch": 0.8378242340861045, + "grad_norm": 4.035686016082764, + "learning_rate": 6.530813846374297e-05, + "loss": 2.0925, + "step": 12484 + }, + { + "epoch": 0.8379584577698735, + "grad_norm": 4.611408710479736, + "learning_rate": 6.529779170075413e-05, + "loss": 2.4631, + "step": 12486 + }, + { + "epoch": 0.8380926814536425, + "grad_norm": 4.749829292297363, + "learning_rate": 6.528744421497471e-05, + "loss": 2.8513, + "step": 12488 + }, + { + "epoch": 0.8382269051374115, + "grad_norm": 3.561922550201416, + "learning_rate": 6.527709600689363e-05, + "loss": 2.3569, + "step": 12490 + }, + { + "epoch": 0.8383611288211805, + "grad_norm": 4.075413703918457, + "learning_rate": 6.526674707699979e-05, + "loss": 2.3929, + "step": 12492 + }, + { + "epoch": 0.8384953525049496, + "grad_norm": 4.686333656311035, + "learning_rate": 6.525639742578218e-05, + "loss": 2.1929, + "step": 12494 + }, + { + "epoch": 0.8386295761887185, + "grad_norm": 3.8635823726654053, + "learning_rate": 6.524604705372979e-05, + "loss": 2.3561, + "step": 12496 + }, + { + "epoch": 0.8387637998724875, + "grad_norm": 3.914000988006592, + "learning_rate": 6.523569596133165e-05, + "loss": 2.0558, + "step": 12498 + }, + { + "epoch": 0.8388980235562565, + "grad_norm": 4.657124042510986, + "learning_rate": 6.522534414907684e-05, + "loss": 2.2131, + "step": 12500 + }, + { + "epoch": 0.8390322472400255, + "grad_norm": 3.939862012863159, + "learning_rate": 6.521499161745445e-05, + "loss": 2.1503, + "step": 12502 + }, + { + "epoch": 0.8391664709237945, + "grad_norm": 4.01451301574707, + "learning_rate": 6.520463836695364e-05, + "loss": 2.6882, + "step": 12504 + }, + { + "epoch": 0.8393006946075635, + "grad_norm": 6.307074546813965, + "learning_rate": 6.519428439806356e-05, + "loss": 2.5137, + "step": 12506 + }, + { + "epoch": 0.8394349182913325, + "grad_norm": 4.346617698669434, + "learning_rate": 6.518392971127341e-05, + "loss": 2.3436, + "step": 12508 + }, + { + "epoch": 0.8395691419751015, + "grad_norm": 4.316330909729004, + "learning_rate": 6.517357430707245e-05, + "loss": 2.4943, + "step": 12510 + }, + { + "epoch": 0.8397033656588705, + "grad_norm": 4.290022850036621, + "learning_rate": 6.516321818594995e-05, + "loss": 2.3949, + "step": 12512 + }, + { + "epoch": 0.8398375893426395, + "grad_norm": 4.723174571990967, + "learning_rate": 6.51528613483952e-05, + "loss": 2.5103, + "step": 12514 + }, + { + "epoch": 0.8399718130264086, + "grad_norm": 4.512057781219482, + "learning_rate": 6.514250379489753e-05, + "loss": 2.2436, + "step": 12516 + }, + { + "epoch": 0.8401060367101775, + "grad_norm": 4.49254035949707, + "learning_rate": 6.513214552594637e-05, + "loss": 2.2172, + "step": 12518 + }, + { + "epoch": 0.8402402603939465, + "grad_norm": 4.7141900062561035, + "learning_rate": 6.512178654203105e-05, + "loss": 2.2037, + "step": 12520 + }, + { + "epoch": 0.8403744840777155, + "grad_norm": 5.026899814605713, + "learning_rate": 6.511142684364109e-05, + "loss": 2.3468, + "step": 12522 + }, + { + "epoch": 0.8405087077614846, + "grad_norm": 3.876502752304077, + "learning_rate": 6.51010664312659e-05, + "loss": 2.459, + "step": 12524 + }, + { + "epoch": 0.8406429314452535, + "grad_norm": 3.958247184753418, + "learning_rate": 6.509070530539502e-05, + "loss": 2.2256, + "step": 12526 + }, + { + "epoch": 0.8407771551290225, + "grad_norm": 4.557751178741455, + "learning_rate": 6.508034346651798e-05, + "loss": 2.6517, + "step": 12528 + }, + { + "epoch": 0.8409113788127915, + "grad_norm": 3.791175603866577, + "learning_rate": 6.506998091512438e-05, + "loss": 2.2467, + "step": 12530 + }, + { + "epoch": 0.8410456024965605, + "grad_norm": 3.859926462173462, + "learning_rate": 6.505961765170383e-05, + "loss": 1.9772, + "step": 12532 + }, + { + "epoch": 0.8411798261803295, + "grad_norm": 3.983001232147217, + "learning_rate": 6.504925367674594e-05, + "loss": 2.4814, + "step": 12534 + }, + { + "epoch": 0.8413140498640985, + "grad_norm": 4.0371222496032715, + "learning_rate": 6.503888899074041e-05, + "loss": 2.2233, + "step": 12536 + }, + { + "epoch": 0.8414482735478676, + "grad_norm": 3.5069406032562256, + "learning_rate": 6.502852359417696e-05, + "loss": 2.0, + "step": 12538 + }, + { + "epoch": 0.8415824972316365, + "grad_norm": 4.392330169677734, + "learning_rate": 6.50181574875453e-05, + "loss": 2.1611, + "step": 12540 + }, + { + "epoch": 0.8417167209154055, + "grad_norm": 4.290402889251709, + "learning_rate": 6.500779067133524e-05, + "loss": 2.3637, + "step": 12542 + }, + { + "epoch": 0.8418509445991745, + "grad_norm": 4.430279731750488, + "learning_rate": 6.499742314603659e-05, + "loss": 2.3966, + "step": 12544 + }, + { + "epoch": 0.8419851682829436, + "grad_norm": 4.125484466552734, + "learning_rate": 6.498705491213917e-05, + "loss": 2.4938, + "step": 12546 + }, + { + "epoch": 0.8421193919667125, + "grad_norm": 3.670572280883789, + "learning_rate": 6.497668597013289e-05, + "loss": 2.6102, + "step": 12548 + }, + { + "epoch": 0.8422536156504815, + "grad_norm": 4.533646106719971, + "learning_rate": 6.496631632050763e-05, + "loss": 2.4887, + "step": 12550 + }, + { + "epoch": 0.8423878393342505, + "grad_norm": 5.545773983001709, + "learning_rate": 6.495594596375338e-05, + "loss": 2.1748, + "step": 12552 + }, + { + "epoch": 0.8425220630180196, + "grad_norm": 5.627823829650879, + "learning_rate": 6.494557490036009e-05, + "loss": 2.344, + "step": 12554 + }, + { + "epoch": 0.8426562867017885, + "grad_norm": 3.9791643619537354, + "learning_rate": 6.493520313081775e-05, + "loss": 2.5633, + "step": 12556 + }, + { + "epoch": 0.8427905103855575, + "grad_norm": 5.267456531524658, + "learning_rate": 6.492483065561645e-05, + "loss": 2.1715, + "step": 12558 + }, + { + "epoch": 0.8429247340693266, + "grad_norm": 3.6140096187591553, + "learning_rate": 6.491445747524627e-05, + "loss": 2.1731, + "step": 12560 + }, + { + "epoch": 0.8430589577530956, + "grad_norm": 3.763373851776123, + "learning_rate": 6.490408359019726e-05, + "loss": 2.405, + "step": 12562 + }, + { + "epoch": 0.8431931814368645, + "grad_norm": 3.909391403198242, + "learning_rate": 6.489370900095963e-05, + "loss": 2.3084, + "step": 12564 + }, + { + "epoch": 0.8433274051206335, + "grad_norm": 4.457961559295654, + "learning_rate": 6.488333370802354e-05, + "loss": 2.4616, + "step": 12566 + }, + { + "epoch": 0.8434616288044026, + "grad_norm": 4.102262020111084, + "learning_rate": 6.487295771187919e-05, + "loss": 2.2291, + "step": 12568 + }, + { + "epoch": 0.8435958524881715, + "grad_norm": 3.9554293155670166, + "learning_rate": 6.486258101301685e-05, + "loss": 2.4934, + "step": 12570 + }, + { + "epoch": 0.8437300761719405, + "grad_norm": 3.7523815631866455, + "learning_rate": 6.485220361192677e-05, + "loss": 2.0268, + "step": 12572 + }, + { + "epoch": 0.8438642998557095, + "grad_norm": 4.708202362060547, + "learning_rate": 6.484182550909927e-05, + "loss": 2.2552, + "step": 12574 + }, + { + "epoch": 0.8439985235394786, + "grad_norm": 4.448557376861572, + "learning_rate": 6.48314467050247e-05, + "loss": 2.1063, + "step": 12576 + }, + { + "epoch": 0.8441327472232475, + "grad_norm": 4.077234268188477, + "learning_rate": 6.482106720019344e-05, + "loss": 2.346, + "step": 12578 + }, + { + "epoch": 0.8442669709070165, + "grad_norm": 3.7490530014038086, + "learning_rate": 6.481068699509591e-05, + "loss": 2.4436, + "step": 12580 + }, + { + "epoch": 0.8444011945907856, + "grad_norm": 4.367644309997559, + "learning_rate": 6.480030609022253e-05, + "loss": 2.0942, + "step": 12582 + }, + { + "epoch": 0.8445354182745546, + "grad_norm": 4.164281368255615, + "learning_rate": 6.478992448606381e-05, + "loss": 2.169, + "step": 12584 + }, + { + "epoch": 0.8446696419583235, + "grad_norm": 4.2117085456848145, + "learning_rate": 6.477954218311021e-05, + "loss": 2.2817, + "step": 12586 + }, + { + "epoch": 0.8448038656420925, + "grad_norm": 4.355278015136719, + "learning_rate": 6.476915918185234e-05, + "loss": 2.333, + "step": 12588 + }, + { + "epoch": 0.8449380893258616, + "grad_norm": 3.9745237827301025, + "learning_rate": 6.475877548278073e-05, + "loss": 2.2981, + "step": 12590 + }, + { + "epoch": 0.8450723130096306, + "grad_norm": 3.9635584354400635, + "learning_rate": 6.474839108638598e-05, + "loss": 2.4108, + "step": 12592 + }, + { + "epoch": 0.8452065366933995, + "grad_norm": 3.6157350540161133, + "learning_rate": 6.473800599315878e-05, + "loss": 2.1574, + "step": 12594 + }, + { + "epoch": 0.8453407603771685, + "grad_norm": 3.9338250160217285, + "learning_rate": 6.472762020358976e-05, + "loss": 2.1176, + "step": 12596 + }, + { + "epoch": 0.8454749840609376, + "grad_norm": 4.706528186798096, + "learning_rate": 6.471723371816965e-05, + "loss": 2.5489, + "step": 12598 + }, + { + "epoch": 0.8456092077447066, + "grad_norm": 3.6421804428100586, + "learning_rate": 6.470684653738919e-05, + "loss": 2.2251, + "step": 12600 + }, + { + "epoch": 0.8457434314284755, + "grad_norm": 4.404399394989014, + "learning_rate": 6.469645866173916e-05, + "loss": 2.3887, + "step": 12602 + }, + { + "epoch": 0.8458776551122446, + "grad_norm": 3.4706668853759766, + "learning_rate": 6.468607009171035e-05, + "loss": 2.2738, + "step": 12604 + }, + { + "epoch": 0.8460118787960136, + "grad_norm": 4.379974842071533, + "learning_rate": 6.46756808277936e-05, + "loss": 2.526, + "step": 12606 + }, + { + "epoch": 0.8461461024797825, + "grad_norm": 4.268222332000732, + "learning_rate": 6.46652908704798e-05, + "loss": 2.4134, + "step": 12608 + }, + { + "epoch": 0.8462803261635515, + "grad_norm": 4.0430145263671875, + "learning_rate": 6.465490022025984e-05, + "loss": 2.1537, + "step": 12610 + }, + { + "epoch": 0.8464145498473206, + "grad_norm": 3.9597086906433105, + "learning_rate": 6.464450887762465e-05, + "loss": 2.2919, + "step": 12612 + }, + { + "epoch": 0.8465487735310896, + "grad_norm": 4.240558624267578, + "learning_rate": 6.463411684306522e-05, + "loss": 2.176, + "step": 12614 + }, + { + "epoch": 0.8466829972148585, + "grad_norm": 3.7688443660736084, + "learning_rate": 6.462372411707255e-05, + "loss": 2.2005, + "step": 12616 + }, + { + "epoch": 0.8468172208986275, + "grad_norm": 3.752406597137451, + "learning_rate": 6.461333070013767e-05, + "loss": 2.0988, + "step": 12618 + }, + { + "epoch": 0.8469514445823966, + "grad_norm": 3.8309779167175293, + "learning_rate": 6.460293659275165e-05, + "loss": 2.3756, + "step": 12620 + }, + { + "epoch": 0.8470856682661656, + "grad_norm": 4.7403693199157715, + "learning_rate": 6.45925417954056e-05, + "loss": 2.1706, + "step": 12622 + }, + { + "epoch": 0.8472198919499345, + "grad_norm": 4.432013511657715, + "learning_rate": 6.458214630859064e-05, + "loss": 2.1867, + "step": 12624 + }, + { + "epoch": 0.8473541156337036, + "grad_norm": 3.8922617435455322, + "learning_rate": 6.457175013279792e-05, + "loss": 2.2961, + "step": 12626 + }, + { + "epoch": 0.8474883393174726, + "grad_norm": 4.582221508026123, + "learning_rate": 6.45613532685187e-05, + "loss": 2.6208, + "step": 12628 + }, + { + "epoch": 0.8476225630012416, + "grad_norm": 9.179282188415527, + "learning_rate": 6.455095571624414e-05, + "loss": 2.5128, + "step": 12630 + }, + { + "epoch": 0.8477567866850105, + "grad_norm": 4.201467990875244, + "learning_rate": 6.454055747646555e-05, + "loss": 2.4789, + "step": 12632 + }, + { + "epoch": 0.8478910103687796, + "grad_norm": 4.183443546295166, + "learning_rate": 6.45301585496742e-05, + "loss": 2.3214, + "step": 12634 + }, + { + "epoch": 0.8480252340525486, + "grad_norm": 3.732389450073242, + "learning_rate": 6.451975893636144e-05, + "loss": 2.4093, + "step": 12636 + }, + { + "epoch": 0.8481594577363176, + "grad_norm": 4.0802788734436035, + "learning_rate": 6.450935863701863e-05, + "loss": 2.2323, + "step": 12638 + }, + { + "epoch": 0.8482936814200865, + "grad_norm": 4.310392379760742, + "learning_rate": 6.449895765213713e-05, + "loss": 2.4456, + "step": 12640 + }, + { + "epoch": 0.8484279051038556, + "grad_norm": 4.244978904724121, + "learning_rate": 6.448855598220842e-05, + "loss": 2.3747, + "step": 12642 + }, + { + "epoch": 0.8485621287876246, + "grad_norm": 3.871551036834717, + "learning_rate": 6.447815362772392e-05, + "loss": 2.2821, + "step": 12644 + }, + { + "epoch": 0.8486963524713935, + "grad_norm": 4.896049499511719, + "learning_rate": 6.446775058917514e-05, + "loss": 2.4048, + "step": 12646 + }, + { + "epoch": 0.8488305761551626, + "grad_norm": 4.348114967346191, + "learning_rate": 6.445734686705358e-05, + "loss": 2.5416, + "step": 12648 + }, + { + "epoch": 0.8489647998389316, + "grad_norm": 4.32326602935791, + "learning_rate": 6.444694246185082e-05, + "loss": 2.4614, + "step": 12650 + }, + { + "epoch": 0.8490990235227006, + "grad_norm": 3.9539294242858887, + "learning_rate": 6.443653737405842e-05, + "loss": 2.1553, + "step": 12652 + }, + { + "epoch": 0.8492332472064695, + "grad_norm": 4.4254865646362305, + "learning_rate": 6.442613160416803e-05, + "loss": 2.2244, + "step": 12654 + }, + { + "epoch": 0.8493674708902386, + "grad_norm": 4.304101467132568, + "learning_rate": 6.44157251526713e-05, + "loss": 2.2637, + "step": 12656 + }, + { + "epoch": 0.8495016945740076, + "grad_norm": 3.897646188735962, + "learning_rate": 6.440531802005988e-05, + "loss": 2.1268, + "step": 12658 + }, + { + "epoch": 0.8496359182577766, + "grad_norm": 4.306998252868652, + "learning_rate": 6.439491020682553e-05, + "loss": 2.3078, + "step": 12660 + }, + { + "epoch": 0.8497701419415455, + "grad_norm": 4.224643230438232, + "learning_rate": 6.438450171345996e-05, + "loss": 2.0588, + "step": 12662 + }, + { + "epoch": 0.8499043656253146, + "grad_norm": 3.7235498428344727, + "learning_rate": 6.437409254045499e-05, + "loss": 2.0265, + "step": 12664 + }, + { + "epoch": 0.8500385893090836, + "grad_norm": 4.675958633422852, + "learning_rate": 6.436368268830241e-05, + "loss": 2.4191, + "step": 12666 + }, + { + "epoch": 0.8501728129928526, + "grad_norm": 4.289715766906738, + "learning_rate": 6.435327215749405e-05, + "loss": 2.2887, + "step": 12668 + }, + { + "epoch": 0.8503070366766216, + "grad_norm": 4.180143356323242, + "learning_rate": 6.434286094852184e-05, + "loss": 2.3673, + "step": 12670 + }, + { + "epoch": 0.8504412603603906, + "grad_norm": 4.1665120124816895, + "learning_rate": 6.433244906187763e-05, + "loss": 2.323, + "step": 12672 + }, + { + "epoch": 0.8505754840441596, + "grad_norm": 4.474100589752197, + "learning_rate": 6.432203649805337e-05, + "loss": 2.5603, + "step": 12674 + }, + { + "epoch": 0.8507097077279286, + "grad_norm": 4.592491149902344, + "learning_rate": 6.431162325754108e-05, + "loss": 2.3454, + "step": 12676 + }, + { + "epoch": 0.8508439314116976, + "grad_norm": 4.294625759124756, + "learning_rate": 6.430120934083271e-05, + "loss": 2.5283, + "step": 12678 + }, + { + "epoch": 0.8509781550954666, + "grad_norm": 4.015101909637451, + "learning_rate": 6.429079474842035e-05, + "loss": 2.1192, + "step": 12680 + }, + { + "epoch": 0.8511123787792356, + "grad_norm": 4.132538795471191, + "learning_rate": 6.428037948079602e-05, + "loss": 2.3123, + "step": 12682 + }, + { + "epoch": 0.8512466024630045, + "grad_norm": 4.363884449005127, + "learning_rate": 6.426996353845184e-05, + "loss": 2.2976, + "step": 12684 + }, + { + "epoch": 0.8513808261467736, + "grad_norm": 3.938227891921997, + "learning_rate": 6.425954692187995e-05, + "loss": 2.2373, + "step": 12686 + }, + { + "epoch": 0.8515150498305426, + "grad_norm": 3.8049468994140625, + "learning_rate": 6.424912963157249e-05, + "loss": 2.1432, + "step": 12688 + }, + { + "epoch": 0.8516492735143116, + "grad_norm": 4.373707294464111, + "learning_rate": 6.423871166802169e-05, + "loss": 2.3439, + "step": 12690 + }, + { + "epoch": 0.8517834971980806, + "grad_norm": 3.947650194168091, + "learning_rate": 6.422829303171976e-05, + "loss": 2.0457, + "step": 12692 + }, + { + "epoch": 0.8519177208818496, + "grad_norm": 3.753072738647461, + "learning_rate": 6.421787372315897e-05, + "loss": 2.4337, + "step": 12694 + }, + { + "epoch": 0.8520519445656186, + "grad_norm": 4.575463771820068, + "learning_rate": 6.420745374283159e-05, + "loss": 2.5284, + "step": 12696 + }, + { + "epoch": 0.8521861682493876, + "grad_norm": 4.253490924835205, + "learning_rate": 6.419703309122997e-05, + "loss": 2.1453, + "step": 12698 + }, + { + "epoch": 0.8523203919331566, + "grad_norm": 4.012346267700195, + "learning_rate": 6.418661176884644e-05, + "loss": 2.302, + "step": 12700 + }, + { + "epoch": 0.8524546156169256, + "grad_norm": 4.21397590637207, + "learning_rate": 6.417618977617342e-05, + "loss": 2.1547, + "step": 12702 + }, + { + "epoch": 0.8525888393006946, + "grad_norm": 3.876781463623047, + "learning_rate": 6.41657671137033e-05, + "loss": 2.4858, + "step": 12704 + }, + { + "epoch": 0.8527230629844637, + "grad_norm": 3.90132737159729, + "learning_rate": 6.415534378192853e-05, + "loss": 2.5547, + "step": 12706 + }, + { + "epoch": 0.8528572866682326, + "grad_norm": 3.9456212520599365, + "learning_rate": 6.414491978134161e-05, + "loss": 2.3376, + "step": 12708 + }, + { + "epoch": 0.8529915103520016, + "grad_norm": 3.632577657699585, + "learning_rate": 6.413449511243505e-05, + "loss": 2.2119, + "step": 12710 + }, + { + "epoch": 0.8531257340357706, + "grad_norm": 4.282527923583984, + "learning_rate": 6.41240697757014e-05, + "loss": 2.6478, + "step": 12712 + }, + { + "epoch": 0.8532599577195397, + "grad_norm": 3.854275941848755, + "learning_rate": 6.41136437716332e-05, + "loss": 2.4608, + "step": 12714 + }, + { + "epoch": 0.8533941814033086, + "grad_norm": 3.577906608581543, + "learning_rate": 6.41032171007231e-05, + "loss": 2.2729, + "step": 12716 + }, + { + "epoch": 0.8535284050870776, + "grad_norm": 4.199882984161377, + "learning_rate": 6.409278976346373e-05, + "loss": 2.2949, + "step": 12718 + }, + { + "epoch": 0.8536626287708466, + "grad_norm": 4.2782511711120605, + "learning_rate": 6.408236176034774e-05, + "loss": 2.5334, + "step": 12720 + }, + { + "epoch": 0.8537968524546156, + "grad_norm": 4.53331995010376, + "learning_rate": 6.407193309186787e-05, + "loss": 2.0636, + "step": 12722 + }, + { + "epoch": 0.8539310761383846, + "grad_norm": 4.630396842956543, + "learning_rate": 6.406150375851682e-05, + "loss": 2.5368, + "step": 12724 + }, + { + "epoch": 0.8540652998221536, + "grad_norm": 4.021825313568115, + "learning_rate": 6.405107376078737e-05, + "loss": 2.5015, + "step": 12726 + }, + { + "epoch": 0.8541995235059227, + "grad_norm": 3.6902551651000977, + "learning_rate": 6.404064309917231e-05, + "loss": 2.3229, + "step": 12728 + }, + { + "epoch": 0.8543337471896916, + "grad_norm": 4.478460311889648, + "learning_rate": 6.403021177416447e-05, + "loss": 2.4693, + "step": 12730 + }, + { + "epoch": 0.8544679708734606, + "grad_norm": 3.3805034160614014, + "learning_rate": 6.401977978625672e-05, + "loss": 2.2752, + "step": 12732 + }, + { + "epoch": 0.8546021945572296, + "grad_norm": 3.8685388565063477, + "learning_rate": 6.400934713594194e-05, + "loss": 2.4816, + "step": 12734 + }, + { + "epoch": 0.8547364182409987, + "grad_norm": 4.433788299560547, + "learning_rate": 6.399891382371308e-05, + "loss": 2.1877, + "step": 12736 + }, + { + "epoch": 0.8548706419247676, + "grad_norm": 3.803598403930664, + "learning_rate": 6.398847985006304e-05, + "loss": 2.1073, + "step": 12738 + }, + { + "epoch": 0.8550048656085366, + "grad_norm": 4.550588607788086, + "learning_rate": 6.397804521548486e-05, + "loss": 2.4537, + "step": 12740 + }, + { + "epoch": 0.8551390892923056, + "grad_norm": 4.970055103302002, + "learning_rate": 6.39676099204715e-05, + "loss": 2.3361, + "step": 12742 + }, + { + "epoch": 0.8552733129760747, + "grad_norm": 4.8406453132629395, + "learning_rate": 6.395717396551606e-05, + "loss": 2.4178, + "step": 12744 + }, + { + "epoch": 0.8554075366598436, + "grad_norm": 4.321325302124023, + "learning_rate": 6.394673735111158e-05, + "loss": 2.1812, + "step": 12746 + }, + { + "epoch": 0.8555417603436126, + "grad_norm": 4.572945594787598, + "learning_rate": 6.39363000777512e-05, + "loss": 2.4812, + "step": 12748 + }, + { + "epoch": 0.8556759840273817, + "grad_norm": 4.330101013183594, + "learning_rate": 6.392586214592804e-05, + "loss": 2.5928, + "step": 12750 + }, + { + "epoch": 0.8558102077111507, + "grad_norm": 3.9266068935394287, + "learning_rate": 6.391542355613526e-05, + "loss": 2.3228, + "step": 12752 + }, + { + "epoch": 0.8559444313949196, + "grad_norm": 4.817348480224609, + "learning_rate": 6.390498430886611e-05, + "loss": 2.3413, + "step": 12754 + }, + { + "epoch": 0.8560786550786886, + "grad_norm": 4.0713605880737305, + "learning_rate": 6.389454440461378e-05, + "loss": 2.2928, + "step": 12756 + }, + { + "epoch": 0.8562128787624577, + "grad_norm": 5.578426837921143, + "learning_rate": 6.388410384387154e-05, + "loss": 2.3742, + "step": 12758 + }, + { + "epoch": 0.8563471024462266, + "grad_norm": 4.819778919219971, + "learning_rate": 6.387366262713272e-05, + "loss": 2.432, + "step": 12760 + }, + { + "epoch": 0.8564813261299956, + "grad_norm": 3.7090606689453125, + "learning_rate": 6.38632207548906e-05, + "loss": 2.2368, + "step": 12762 + }, + { + "epoch": 0.8566155498137646, + "grad_norm": 4.21394157409668, + "learning_rate": 6.385277822763857e-05, + "loss": 2.3933, + "step": 12764 + }, + { + "epoch": 0.8567497734975337, + "grad_norm": 5.089909553527832, + "learning_rate": 6.384233504587001e-05, + "loss": 2.7282, + "step": 12766 + }, + { + "epoch": 0.8568839971813026, + "grad_norm": 4.241338729858398, + "learning_rate": 6.383189121007833e-05, + "loss": 2.2534, + "step": 12768 + }, + { + "epoch": 0.8570182208650716, + "grad_norm": 4.93939208984375, + "learning_rate": 6.382144672075701e-05, + "loss": 2.3597, + "step": 12770 + }, + { + "epoch": 0.8571524445488407, + "grad_norm": 4.200321674346924, + "learning_rate": 6.381100157839948e-05, + "loss": 2.1153, + "step": 12772 + }, + { + "epoch": 0.8572866682326097, + "grad_norm": 3.922708749771118, + "learning_rate": 6.380055578349931e-05, + "loss": 2.6511, + "step": 12774 + }, + { + "epoch": 0.8574208919163786, + "grad_norm": 3.9964544773101807, + "learning_rate": 6.379010933655001e-05, + "loss": 2.1527, + "step": 12776 + }, + { + "epoch": 0.8575551156001476, + "grad_norm": 4.775642395019531, + "learning_rate": 6.377966223804515e-05, + "loss": 2.4182, + "step": 12778 + }, + { + "epoch": 0.8576893392839167, + "grad_norm": 3.578030586242676, + "learning_rate": 6.376921448847836e-05, + "loss": 2.047, + "step": 12780 + }, + { + "epoch": 0.8578235629676857, + "grad_norm": 10.069700241088867, + "learning_rate": 6.375876608834324e-05, + "loss": 2.3916, + "step": 12782 + }, + { + "epoch": 0.8579577866514546, + "grad_norm": 3.8084893226623535, + "learning_rate": 6.374831703813351e-05, + "loss": 2.2912, + "step": 12784 + }, + { + "epoch": 0.8580920103352236, + "grad_norm": 3.94189190864563, + "learning_rate": 6.37378673383428e-05, + "loss": 2.4025, + "step": 12786 + }, + { + "epoch": 0.8582262340189927, + "grad_norm": 3.863218307495117, + "learning_rate": 6.372741698946491e-05, + "loss": 2.1823, + "step": 12788 + }, + { + "epoch": 0.8583604577027617, + "grad_norm": 4.549015998840332, + "learning_rate": 6.371696599199353e-05, + "loss": 2.6684, + "step": 12790 + }, + { + "epoch": 0.8584946813865306, + "grad_norm": 4.179776191711426, + "learning_rate": 6.370651434642248e-05, + "loss": 2.6498, + "step": 12792 + }, + { + "epoch": 0.8586289050702997, + "grad_norm": 4.062543869018555, + "learning_rate": 6.36960620532456e-05, + "loss": 2.2298, + "step": 12794 + }, + { + "epoch": 0.8587631287540687, + "grad_norm": 3.9268996715545654, + "learning_rate": 6.368560911295671e-05, + "loss": 2.5774, + "step": 12796 + }, + { + "epoch": 0.8588973524378376, + "grad_norm": 4.598140239715576, + "learning_rate": 6.367515552604971e-05, + "loss": 2.2309, + "step": 12798 + }, + { + "epoch": 0.8590315761216066, + "grad_norm": 4.02744197845459, + "learning_rate": 6.36647012930185e-05, + "loss": 2.1498, + "step": 12800 + }, + { + "epoch": 0.8591657998053757, + "grad_norm": 3.8978030681610107, + "learning_rate": 6.365424641435702e-05, + "loss": 2.6012, + "step": 12802 + }, + { + "epoch": 0.8593000234891447, + "grad_norm": 4.505646705627441, + "learning_rate": 6.364379089055923e-05, + "loss": 2.4149, + "step": 12804 + }, + { + "epoch": 0.8594342471729136, + "grad_norm": 3.7733664512634277, + "learning_rate": 6.363333472211917e-05, + "loss": 2.1643, + "step": 12806 + }, + { + "epoch": 0.8595684708566826, + "grad_norm": 3.7341387271881104, + "learning_rate": 6.362287790953086e-05, + "loss": 2.0197, + "step": 12808 + }, + { + "epoch": 0.8597026945404517, + "grad_norm": 4.361305236816406, + "learning_rate": 6.361242045328834e-05, + "loss": 2.178, + "step": 12810 + }, + { + "epoch": 0.8598369182242207, + "grad_norm": 4.021759510040283, + "learning_rate": 6.360196235388574e-05, + "loss": 2.2598, + "step": 12812 + }, + { + "epoch": 0.8599711419079896, + "grad_norm": 4.20520544052124, + "learning_rate": 6.359150361181715e-05, + "loss": 2.2549, + "step": 12814 + }, + { + "epoch": 0.8601053655917587, + "grad_norm": 4.409783840179443, + "learning_rate": 6.358104422757673e-05, + "loss": 2.4228, + "step": 12816 + }, + { + "epoch": 0.8602395892755277, + "grad_norm": 3.938746213912964, + "learning_rate": 6.357058420165872e-05, + "loss": 2.2875, + "step": 12818 + }, + { + "epoch": 0.8603738129592967, + "grad_norm": 4.219440937042236, + "learning_rate": 6.356012353455725e-05, + "loss": 2.3278, + "step": 12820 + }, + { + "epoch": 0.8605080366430656, + "grad_norm": 4.332894802093506, + "learning_rate": 6.354966222676661e-05, + "loss": 2.1723, + "step": 12822 + }, + { + "epoch": 0.8606422603268347, + "grad_norm": 3.9280569553375244, + "learning_rate": 6.353920027878108e-05, + "loss": 2.0896, + "step": 12824 + }, + { + "epoch": 0.8607764840106037, + "grad_norm": 4.0482587814331055, + "learning_rate": 6.352873769109498e-05, + "loss": 2.2756, + "step": 12826 + }, + { + "epoch": 0.8609107076943727, + "grad_norm": 4.482020378112793, + "learning_rate": 6.351827446420261e-05, + "loss": 2.2794, + "step": 12828 + }, + { + "epoch": 0.8610449313781416, + "grad_norm": 4.308781147003174, + "learning_rate": 6.350781059859835e-05, + "loss": 2.4557, + "step": 12830 + }, + { + "epoch": 0.8611791550619107, + "grad_norm": 3.9765515327453613, + "learning_rate": 6.349734609477661e-05, + "loss": 2.2231, + "step": 12832 + }, + { + "epoch": 0.8613133787456797, + "grad_norm": 4.183588981628418, + "learning_rate": 6.348688095323181e-05, + "loss": 2.1329, + "step": 12834 + }, + { + "epoch": 0.8614476024294486, + "grad_norm": 4.5689167976379395, + "learning_rate": 6.34764151744584e-05, + "loss": 2.1444, + "step": 12836 + }, + { + "epoch": 0.8615818261132177, + "grad_norm": 3.7060606479644775, + "learning_rate": 6.346594875895088e-05, + "loss": 2.0484, + "step": 12838 + }, + { + "epoch": 0.8617160497969867, + "grad_norm": 4.002459526062012, + "learning_rate": 6.345548170720376e-05, + "loss": 2.1947, + "step": 12840 + }, + { + "epoch": 0.8618502734807557, + "grad_norm": 3.6220836639404297, + "learning_rate": 6.344501401971157e-05, + "loss": 2.157, + "step": 12842 + }, + { + "epoch": 0.8619844971645246, + "grad_norm": 5.049450874328613, + "learning_rate": 6.343454569696891e-05, + "loss": 2.4175, + "step": 12844 + }, + { + "epoch": 0.8621187208482937, + "grad_norm": 3.8821232318878174, + "learning_rate": 6.34240767394704e-05, + "loss": 2.1814, + "step": 12846 + }, + { + "epoch": 0.8622529445320627, + "grad_norm": 4.407138347625732, + "learning_rate": 6.341360714771066e-05, + "loss": 2.3204, + "step": 12848 + }, + { + "epoch": 0.8623871682158317, + "grad_norm": 4.432413101196289, + "learning_rate": 6.340313692218435e-05, + "loss": 2.5639, + "step": 12850 + }, + { + "epoch": 0.8625213918996006, + "grad_norm": 4.4644269943237305, + "learning_rate": 6.339266606338619e-05, + "loss": 2.6189, + "step": 12852 + }, + { + "epoch": 0.8626556155833697, + "grad_norm": 3.8751749992370605, + "learning_rate": 6.338219457181088e-05, + "loss": 2.0959, + "step": 12854 + }, + { + "epoch": 0.8627898392671387, + "grad_norm": 3.8894295692443848, + "learning_rate": 6.33717224479532e-05, + "loss": 2.6485, + "step": 12856 + }, + { + "epoch": 0.8629240629509077, + "grad_norm": 4.533567905426025, + "learning_rate": 6.336124969230792e-05, + "loss": 2.2849, + "step": 12858 + }, + { + "epoch": 0.8630582866346767, + "grad_norm": 3.70389461517334, + "learning_rate": 6.335077630536988e-05, + "loss": 2.089, + "step": 12860 + }, + { + "epoch": 0.8631925103184457, + "grad_norm": 3.854749917984009, + "learning_rate": 6.33403022876339e-05, + "loss": 2.3793, + "step": 12862 + }, + { + "epoch": 0.8633267340022147, + "grad_norm": 4.170308589935303, + "learning_rate": 6.332982763959487e-05, + "loss": 2.4327, + "step": 12864 + }, + { + "epoch": 0.8634609576859837, + "grad_norm": 3.6572630405426025, + "learning_rate": 6.33193523617477e-05, + "loss": 2.1932, + "step": 12866 + }, + { + "epoch": 0.8635951813697527, + "grad_norm": 4.113853454589844, + "learning_rate": 6.330887645458732e-05, + "loss": 2.2968, + "step": 12868 + }, + { + "epoch": 0.8637294050535217, + "grad_norm": 4.1171135902404785, + "learning_rate": 6.32983999186087e-05, + "loss": 2.3379, + "step": 12870 + }, + { + "epoch": 0.8638636287372907, + "grad_norm": 3.8346290588378906, + "learning_rate": 6.328792275430682e-05, + "loss": 2.1244, + "step": 12872 + }, + { + "epoch": 0.8639978524210596, + "grad_norm": 4.442126274108887, + "learning_rate": 6.327744496217675e-05, + "loss": 2.1214, + "step": 12874 + }, + { + "epoch": 0.8641320761048287, + "grad_norm": 4.262237548828125, + "learning_rate": 6.32669665427135e-05, + "loss": 2.2752, + "step": 12876 + }, + { + "epoch": 0.8642662997885977, + "grad_norm": 3.6756014823913574, + "learning_rate": 6.325648749641217e-05, + "loss": 2.2865, + "step": 12878 + }, + { + "epoch": 0.8644005234723667, + "grad_norm": 3.6892428398132324, + "learning_rate": 6.324600782376788e-05, + "loss": 2.2783, + "step": 12880 + }, + { + "epoch": 0.8645347471561357, + "grad_norm": 4.4488372802734375, + "learning_rate": 6.323552752527575e-05, + "loss": 2.3452, + "step": 12882 + }, + { + "epoch": 0.8646689708399047, + "grad_norm": 4.012686729431152, + "learning_rate": 6.322504660143099e-05, + "loss": 2.1623, + "step": 12884 + }, + { + "epoch": 0.8648031945236737, + "grad_norm": 4.144957542419434, + "learning_rate": 6.321456505272876e-05, + "loss": 2.3102, + "step": 12886 + }, + { + "epoch": 0.8649374182074427, + "grad_norm": 3.7943077087402344, + "learning_rate": 6.320408287966433e-05, + "loss": 2.6226, + "step": 12888 + }, + { + "epoch": 0.8650716418912117, + "grad_norm": 4.986623287200928, + "learning_rate": 6.319360008273294e-05, + "loss": 2.3121, + "step": 12890 + }, + { + "epoch": 0.8652058655749807, + "grad_norm": 3.961261510848999, + "learning_rate": 6.31831166624299e-05, + "loss": 2.1224, + "step": 12892 + }, + { + "epoch": 0.8653400892587497, + "grad_norm": 4.666899681091309, + "learning_rate": 6.317263261925052e-05, + "loss": 2.4383, + "step": 12894 + }, + { + "epoch": 0.8654743129425188, + "grad_norm": 4.334089756011963, + "learning_rate": 6.316214795369016e-05, + "loss": 2.3499, + "step": 12896 + }, + { + "epoch": 0.8656085366262877, + "grad_norm": 4.3817572593688965, + "learning_rate": 6.315166266624418e-05, + "loss": 2.1624, + "step": 12898 + }, + { + "epoch": 0.8657427603100567, + "grad_norm": 4.173321723937988, + "learning_rate": 6.314117675740801e-05, + "loss": 2.499, + "step": 12900 + }, + { + "epoch": 0.8658769839938257, + "grad_norm": 4.039221286773682, + "learning_rate": 6.313069022767707e-05, + "loss": 2.6311, + "step": 12902 + }, + { + "epoch": 0.8660112076775948, + "grad_norm": 3.9388949871063232, + "learning_rate": 6.312020307754684e-05, + "loss": 2.5086, + "step": 12904 + }, + { + "epoch": 0.8661454313613637, + "grad_norm": 3.898528575897217, + "learning_rate": 6.31097153075128e-05, + "loss": 2.4208, + "step": 12906 + }, + { + "epoch": 0.8662796550451327, + "grad_norm": 4.038965225219727, + "learning_rate": 6.30992269180705e-05, + "loss": 2.4655, + "step": 12908 + }, + { + "epoch": 0.8664138787289017, + "grad_norm": 5.388798236846924, + "learning_rate": 6.30887379097155e-05, + "loss": 2.4155, + "step": 12910 + }, + { + "epoch": 0.8665481024126707, + "grad_norm": 4.02509069442749, + "learning_rate": 6.307824828294338e-05, + "loss": 2.5828, + "step": 12912 + }, + { + "epoch": 0.8666823260964397, + "grad_norm": 4.605691909790039, + "learning_rate": 6.306775803824971e-05, + "loss": 2.2408, + "step": 12914 + }, + { + "epoch": 0.8668165497802087, + "grad_norm": 4.648369312286377, + "learning_rate": 6.30572671761302e-05, + "loss": 2.2432, + "step": 12916 + }, + { + "epoch": 0.8669507734639778, + "grad_norm": 3.8884549140930176, + "learning_rate": 6.304677569708047e-05, + "loss": 2.2112, + "step": 12918 + }, + { + "epoch": 0.8670849971477467, + "grad_norm": 4.132173538208008, + "learning_rate": 6.303628360159625e-05, + "loss": 2.1968, + "step": 12920 + }, + { + "epoch": 0.8672192208315157, + "grad_norm": 3.9699339866638184, + "learning_rate": 6.302579089017327e-05, + "loss": 2.3709, + "step": 12922 + }, + { + "epoch": 0.8673534445152847, + "grad_norm": 3.889127492904663, + "learning_rate": 6.301529756330728e-05, + "loss": 2.3515, + "step": 12924 + }, + { + "epoch": 0.8674876681990538, + "grad_norm": 5.266239166259766, + "learning_rate": 6.300480362149409e-05, + "loss": 2.2461, + "step": 12926 + }, + { + "epoch": 0.8676218918828227, + "grad_norm": 3.201749086380005, + "learning_rate": 6.299430906522948e-05, + "loss": 2.1446, + "step": 12928 + }, + { + "epoch": 0.8677561155665917, + "grad_norm": 3.893454074859619, + "learning_rate": 6.298381389500933e-05, + "loss": 2.2865, + "step": 12930 + }, + { + "epoch": 0.8678903392503607, + "grad_norm": 4.085938453674316, + "learning_rate": 6.297331811132951e-05, + "loss": 2.457, + "step": 12932 + }, + { + "epoch": 0.8680245629341298, + "grad_norm": 4.0466742515563965, + "learning_rate": 6.29628217146859e-05, + "loss": 2.403, + "step": 12934 + }, + { + "epoch": 0.8681587866178987, + "grad_norm": 3.7475316524505615, + "learning_rate": 6.295232470557447e-05, + "loss": 2.8161, + "step": 12936 + }, + { + "epoch": 0.8682930103016677, + "grad_norm": 4.365174293518066, + "learning_rate": 6.294182708449117e-05, + "loss": 2.414, + "step": 12938 + }, + { + "epoch": 0.8684272339854368, + "grad_norm": 4.526188850402832, + "learning_rate": 6.293132885193198e-05, + "loss": 2.4666, + "step": 12940 + }, + { + "epoch": 0.8685614576692058, + "grad_norm": 3.817545175552368, + "learning_rate": 6.292083000839292e-05, + "loss": 2.1174, + "step": 12942 + }, + { + "epoch": 0.8686956813529747, + "grad_norm": 4.437409400939941, + "learning_rate": 6.291033055437008e-05, + "loss": 2.345, + "step": 12944 + }, + { + "epoch": 0.8688299050367437, + "grad_norm": 3.9439468383789062, + "learning_rate": 6.28998304903595e-05, + "loss": 2.2601, + "step": 12946 + }, + { + "epoch": 0.8689641287205128, + "grad_norm": 3.933319568634033, + "learning_rate": 6.288932981685728e-05, + "loss": 2.3672, + "step": 12948 + }, + { + "epoch": 0.8690983524042817, + "grad_norm": 5.8179121017456055, + "learning_rate": 6.287882853435959e-05, + "loss": 2.3196, + "step": 12950 + }, + { + "epoch": 0.8692325760880507, + "grad_norm": 4.29022741317749, + "learning_rate": 6.286832664336255e-05, + "loss": 2.3655, + "step": 12952 + }, + { + "epoch": 0.8693667997718197, + "grad_norm": 4.129533767700195, + "learning_rate": 6.28578241443624e-05, + "loss": 2.2465, + "step": 12954 + }, + { + "epoch": 0.8695010234555888, + "grad_norm": 3.9288742542266846, + "learning_rate": 6.284732103785535e-05, + "loss": 2.2121, + "step": 12956 + }, + { + "epoch": 0.8696352471393577, + "grad_norm": 3.560307502746582, + "learning_rate": 6.283681732433762e-05, + "loss": 2.3452, + "step": 12958 + }, + { + "epoch": 0.8697694708231267, + "grad_norm": 3.8282310962677, + "learning_rate": 6.282631300430554e-05, + "loss": 2.4334, + "step": 12960 + }, + { + "epoch": 0.8699036945068958, + "grad_norm": 4.113216400146484, + "learning_rate": 6.281580807825538e-05, + "loss": 2.3782, + "step": 12962 + }, + { + "epoch": 0.8700379181906648, + "grad_norm": 4.397510528564453, + "learning_rate": 6.280530254668349e-05, + "loss": 2.1181, + "step": 12964 + }, + { + "epoch": 0.8701721418744337, + "grad_norm": 3.718649387359619, + "learning_rate": 6.279479641008623e-05, + "loss": 2.592, + "step": 12966 + }, + { + "epoch": 0.8703063655582027, + "grad_norm": 4.227586269378662, + "learning_rate": 6.278428966896e-05, + "loss": 2.4077, + "step": 12968 + }, + { + "epoch": 0.8704405892419718, + "grad_norm": 3.934403657913208, + "learning_rate": 6.277378232380123e-05, + "loss": 2.3161, + "step": 12970 + }, + { + "epoch": 0.8705748129257408, + "grad_norm": 4.304378509521484, + "learning_rate": 6.276327437510636e-05, + "loss": 2.4691, + "step": 12972 + }, + { + "epoch": 0.8707090366095097, + "grad_norm": 3.860569477081299, + "learning_rate": 6.275276582337188e-05, + "loss": 2.1756, + "step": 12974 + }, + { + "epoch": 0.8708432602932787, + "grad_norm": 3.677579879760742, + "learning_rate": 6.274225666909428e-05, + "loss": 2.3206, + "step": 12976 + }, + { + "epoch": 0.8709774839770478, + "grad_norm": 4.061268329620361, + "learning_rate": 6.273174691277012e-05, + "loss": 2.1596, + "step": 12978 + }, + { + "epoch": 0.8711117076608168, + "grad_norm": 5.121475696563721, + "learning_rate": 6.272123655489595e-05, + "loss": 2.2173, + "step": 12980 + }, + { + "epoch": 0.8712459313445857, + "grad_norm": 4.893365383148193, + "learning_rate": 6.271072559596836e-05, + "loss": 2.7166, + "step": 12982 + }, + { + "epoch": 0.8713801550283548, + "grad_norm": 13.57975959777832, + "learning_rate": 6.270021403648399e-05, + "loss": 2.3925, + "step": 12984 + }, + { + "epoch": 0.8715143787121238, + "grad_norm": 3.8788199424743652, + "learning_rate": 6.268970187693947e-05, + "loss": 2.1665, + "step": 12986 + }, + { + "epoch": 0.8716486023958927, + "grad_norm": 3.67877459526062, + "learning_rate": 6.267918911783152e-05, + "loss": 2.3414, + "step": 12988 + }, + { + "epoch": 0.8717828260796617, + "grad_norm": 3.671874761581421, + "learning_rate": 6.266867575965679e-05, + "loss": 2.2904, + "step": 12990 + }, + { + "epoch": 0.8719170497634308, + "grad_norm": 4.053514003753662, + "learning_rate": 6.265816180291205e-05, + "loss": 2.3778, + "step": 12992 + }, + { + "epoch": 0.8720512734471998, + "grad_norm": 3.9955756664276123, + "learning_rate": 6.264764724809404e-05, + "loss": 2.3423, + "step": 12994 + }, + { + "epoch": 0.8721854971309687, + "grad_norm": 3.7492175102233887, + "learning_rate": 6.263713209569958e-05, + "loss": 2.2935, + "step": 12996 + }, + { + "epoch": 0.8723197208147377, + "grad_norm": 4.108679294586182, + "learning_rate": 6.262661634622549e-05, + "loss": 2.1514, + "step": 12998 + }, + { + "epoch": 0.8724539444985068, + "grad_norm": 3.73331880569458, + "learning_rate": 6.26161000001686e-05, + "loss": 2.2163, + "step": 13000 + }, + { + "epoch": 0.8725881681822758, + "grad_norm": 4.351728916168213, + "learning_rate": 6.260558305802578e-05, + "loss": 2.1707, + "step": 13002 + }, + { + "epoch": 0.8727223918660447, + "grad_norm": 4.453310966491699, + "learning_rate": 6.259506552029396e-05, + "loss": 2.7524, + "step": 13004 + }, + { + "epoch": 0.8728566155498138, + "grad_norm": 4.526618480682373, + "learning_rate": 6.258454738747006e-05, + "loss": 2.3539, + "step": 13006 + }, + { + "epoch": 0.8729908392335828, + "grad_norm": 4.155110836029053, + "learning_rate": 6.257402866005105e-05, + "loss": 2.2164, + "step": 13008 + }, + { + "epoch": 0.8731250629173518, + "grad_norm": 5.068065166473389, + "learning_rate": 6.25635093385339e-05, + "loss": 2.2708, + "step": 13010 + }, + { + "epoch": 0.8732592866011207, + "grad_norm": 4.3249192237854, + "learning_rate": 6.255298942341564e-05, + "loss": 2.1798, + "step": 13012 + }, + { + "epoch": 0.8733935102848898, + "grad_norm": 3.929267406463623, + "learning_rate": 6.254246891519332e-05, + "loss": 2.4551, + "step": 13014 + }, + { + "epoch": 0.8735277339686588, + "grad_norm": 4.439569473266602, + "learning_rate": 6.2531947814364e-05, + "loss": 2.1396, + "step": 13016 + }, + { + "epoch": 0.8736619576524278, + "grad_norm": 4.392258167266846, + "learning_rate": 6.25214261214248e-05, + "loss": 2.3657, + "step": 13018 + }, + { + "epoch": 0.8737961813361967, + "grad_norm": 4.312061786651611, + "learning_rate": 6.251090383687283e-05, + "loss": 2.2657, + "step": 13020 + }, + { + "epoch": 0.8739304050199658, + "grad_norm": 4.252840042114258, + "learning_rate": 6.250038096120526e-05, + "loss": 2.4117, + "step": 13022 + }, + { + "epoch": 0.8740646287037348, + "grad_norm": 3.9253506660461426, + "learning_rate": 6.248985749491926e-05, + "loss": 2.4054, + "step": 13024 + }, + { + "epoch": 0.8741988523875037, + "grad_norm": 4.6473259925842285, + "learning_rate": 6.247933343851207e-05, + "loss": 2.4869, + "step": 13026 + }, + { + "epoch": 0.8743330760712728, + "grad_norm": 4.150829315185547, + "learning_rate": 6.246880879248089e-05, + "loss": 2.3548, + "step": 13028 + }, + { + "epoch": 0.8744672997550418, + "grad_norm": 3.9089043140411377, + "learning_rate": 6.245828355732303e-05, + "loss": 2.0943, + "step": 13030 + }, + { + "epoch": 0.8746015234388108, + "grad_norm": 4.780176162719727, + "learning_rate": 6.244775773353577e-05, + "loss": 2.3192, + "step": 13032 + }, + { + "epoch": 0.8747357471225797, + "grad_norm": 4.125881195068359, + "learning_rate": 6.243723132161643e-05, + "loss": 2.2667, + "step": 13034 + }, + { + "epoch": 0.8748699708063488, + "grad_norm": 3.9088571071624756, + "learning_rate": 6.242670432206238e-05, + "loss": 2.1037, + "step": 13036 + }, + { + "epoch": 0.8750041944901178, + "grad_norm": 3.8567380905151367, + "learning_rate": 6.241617673537097e-05, + "loss": 2.4802, + "step": 13038 + }, + { + "epoch": 0.8751384181738868, + "grad_norm": 3.972968578338623, + "learning_rate": 6.240564856203964e-05, + "loss": 2.2342, + "step": 13040 + }, + { + "epoch": 0.8752726418576557, + "grad_norm": 4.012211322784424, + "learning_rate": 6.239511980256579e-05, + "loss": 2.4629, + "step": 13042 + }, + { + "epoch": 0.8754068655414248, + "grad_norm": 3.8562111854553223, + "learning_rate": 6.238459045744691e-05, + "loss": 2.3386, + "step": 13044 + }, + { + "epoch": 0.8755410892251938, + "grad_norm": 3.7143337726593018, + "learning_rate": 6.23740605271805e-05, + "loss": 2.0967, + "step": 13046 + }, + { + "epoch": 0.8756753129089628, + "grad_norm": 4.270926475524902, + "learning_rate": 6.236353001226404e-05, + "loss": 2.4063, + "step": 13048 + }, + { + "epoch": 0.8758095365927318, + "grad_norm": 4.161036014556885, + "learning_rate": 6.235299891319512e-05, + "loss": 2.2753, + "step": 13050 + }, + { + "epoch": 0.8759437602765008, + "grad_norm": 3.899073600769043, + "learning_rate": 6.234246723047128e-05, + "loss": 2.354, + "step": 13052 + }, + { + "epoch": 0.8760779839602698, + "grad_norm": 4.301528453826904, + "learning_rate": 6.233193496459015e-05, + "loss": 2.5084, + "step": 13054 + }, + { + "epoch": 0.8762122076440388, + "grad_norm": 3.6793136596679688, + "learning_rate": 6.232140211604932e-05, + "loss": 2.1883, + "step": 13056 + }, + { + "epoch": 0.8763464313278078, + "grad_norm": 4.149114608764648, + "learning_rate": 6.231086868534647e-05, + "loss": 2.0607, + "step": 13058 + }, + { + "epoch": 0.8764806550115768, + "grad_norm": 3.748345375061035, + "learning_rate": 6.23003346729793e-05, + "loss": 2.3316, + "step": 13060 + }, + { + "epoch": 0.8766148786953458, + "grad_norm": 3.856954574584961, + "learning_rate": 6.22898000794455e-05, + "loss": 2.1514, + "step": 13062 + }, + { + "epoch": 0.8767491023791147, + "grad_norm": 4.414962291717529, + "learning_rate": 6.22792649052428e-05, + "loss": 2.3404, + "step": 13064 + }, + { + "epoch": 0.8768833260628838, + "grad_norm": 4.718215465545654, + "learning_rate": 6.226872915086899e-05, + "loss": 2.2654, + "step": 13066 + }, + { + "epoch": 0.8770175497466528, + "grad_norm": 3.807535171508789, + "learning_rate": 6.225819281682186e-05, + "loss": 2.1964, + "step": 13068 + }, + { + "epoch": 0.8771517734304218, + "grad_norm": 4.429305553436279, + "learning_rate": 6.22476559035992e-05, + "loss": 2.4306, + "step": 13070 + }, + { + "epoch": 0.8772859971141908, + "grad_norm": 4.112966060638428, + "learning_rate": 6.22371184116989e-05, + "loss": 2.3174, + "step": 13072 + }, + { + "epoch": 0.8774202207979598, + "grad_norm": 4.34882116317749, + "learning_rate": 6.22265803416188e-05, + "loss": 2.4342, + "step": 13074 + }, + { + "epoch": 0.8775544444817288, + "grad_norm": 3.953726053237915, + "learning_rate": 6.221604169385685e-05, + "loss": 2.2475, + "step": 13076 + }, + { + "epoch": 0.8776886681654978, + "grad_norm": 4.277991771697998, + "learning_rate": 6.220550246891095e-05, + "loss": 2.6058, + "step": 13078 + }, + { + "epoch": 0.8778228918492668, + "grad_norm": 4.548774719238281, + "learning_rate": 6.219496266727904e-05, + "loss": 2.5989, + "step": 13080 + }, + { + "epoch": 0.8779571155330358, + "grad_norm": 3.8690361976623535, + "learning_rate": 6.218442228945914e-05, + "loss": 2.1348, + "step": 13082 + }, + { + "epoch": 0.8780913392168048, + "grad_norm": 4.311750888824463, + "learning_rate": 6.217388133594925e-05, + "loss": 2.4669, + "step": 13084 + }, + { + "epoch": 0.8782255629005739, + "grad_norm": 3.6855945587158203, + "learning_rate": 6.216333980724738e-05, + "loss": 2.3872, + "step": 13086 + }, + { + "epoch": 0.8783597865843428, + "grad_norm": 3.3315882682800293, + "learning_rate": 6.215279770385165e-05, + "loss": 2.1577, + "step": 13088 + }, + { + "epoch": 0.8784940102681118, + "grad_norm": 4.064833641052246, + "learning_rate": 6.21422550262601e-05, + "loss": 2.3979, + "step": 13090 + }, + { + "epoch": 0.8786282339518808, + "grad_norm": 3.9736480712890625, + "learning_rate": 6.213171177497089e-05, + "loss": 2.1786, + "step": 13092 + }, + { + "epoch": 0.8787624576356499, + "grad_norm": 4.758360385894775, + "learning_rate": 6.212116795048216e-05, + "loss": 2.3694, + "step": 13094 + }, + { + "epoch": 0.8788966813194188, + "grad_norm": 3.942960023880005, + "learning_rate": 6.211062355329206e-05, + "loss": 2.1932, + "step": 13096 + }, + { + "epoch": 0.8790309050031878, + "grad_norm": 4.282471656799316, + "learning_rate": 6.210007858389885e-05, + "loss": 2.313, + "step": 13098 + }, + { + "epoch": 0.8791651286869568, + "grad_norm": 4.52429723739624, + "learning_rate": 6.208953304280069e-05, + "loss": 2.5491, + "step": 13100 + }, + { + "epoch": 0.8792993523707258, + "grad_norm": 5.728999614715576, + "learning_rate": 6.207898693049588e-05, + "loss": 2.1989, + "step": 13102 + }, + { + "epoch": 0.8794335760544948, + "grad_norm": 3.9560635089874268, + "learning_rate": 6.206844024748268e-05, + "loss": 2.1554, + "step": 13104 + }, + { + "epoch": 0.8795677997382638, + "grad_norm": 3.7752740383148193, + "learning_rate": 6.205789299425941e-05, + "loss": 2.212, + "step": 13106 + }, + { + "epoch": 0.8797020234220329, + "grad_norm": 4.457380771636963, + "learning_rate": 6.20473451713244e-05, + "loss": 2.3564, + "step": 13108 + }, + { + "epoch": 0.8798362471058018, + "grad_norm": 3.979569673538208, + "learning_rate": 6.203679677917603e-05, + "loss": 2.232, + "step": 13110 + }, + { + "epoch": 0.8799704707895708, + "grad_norm": 4.5428571701049805, + "learning_rate": 6.202624781831268e-05, + "loss": 2.6088, + "step": 13112 + }, + { + "epoch": 0.8801046944733398, + "grad_norm": 4.2008161544799805, + "learning_rate": 6.201569828923277e-05, + "loss": 2.5849, + "step": 13114 + }, + { + "epoch": 0.8802389181571089, + "grad_norm": 4.175887584686279, + "learning_rate": 6.200514819243476e-05, + "loss": 2.3157, + "step": 13116 + }, + { + "epoch": 0.8803731418408778, + "grad_norm": 4.271469593048096, + "learning_rate": 6.199459752841709e-05, + "loss": 2.3287, + "step": 13118 + }, + { + "epoch": 0.8805073655246468, + "grad_norm": 4.521378993988037, + "learning_rate": 6.198404629767825e-05, + "loss": 2.2668, + "step": 13120 + }, + { + "epoch": 0.8806415892084158, + "grad_norm": 4.365108489990234, + "learning_rate": 6.197349450071683e-05, + "loss": 2.2061, + "step": 13122 + }, + { + "epoch": 0.8807758128921849, + "grad_norm": 3.904893636703491, + "learning_rate": 6.196294213803131e-05, + "loss": 2.2052, + "step": 13124 + }, + { + "epoch": 0.8809100365759538, + "grad_norm": 3.965838670730591, + "learning_rate": 6.195238921012032e-05, + "loss": 2.3312, + "step": 13126 + }, + { + "epoch": 0.8810442602597228, + "grad_norm": 4.1790056228637695, + "learning_rate": 6.194183571748242e-05, + "loss": 2.2158, + "step": 13128 + }, + { + "epoch": 0.8811784839434919, + "grad_norm": 3.9538629055023193, + "learning_rate": 6.193128166061629e-05, + "loss": 2.3829, + "step": 13130 + }, + { + "epoch": 0.8813127076272609, + "grad_norm": 4.048318386077881, + "learning_rate": 6.192072704002054e-05, + "loss": 1.986, + "step": 13132 + }, + { + "epoch": 0.8814469313110298, + "grad_norm": 3.6938421726226807, + "learning_rate": 6.191017185619389e-05, + "loss": 2.0511, + "step": 13134 + }, + { + "epoch": 0.8815811549947988, + "grad_norm": 4.507421016693115, + "learning_rate": 6.189961610963504e-05, + "loss": 2.2733, + "step": 13136 + }, + { + "epoch": 0.8817153786785679, + "grad_norm": 4.0056471824646, + "learning_rate": 6.188905980084273e-05, + "loss": 2.1883, + "step": 13138 + }, + { + "epoch": 0.8818496023623368, + "grad_norm": 4.427181243896484, + "learning_rate": 6.187850293031571e-05, + "loss": 2.538, + "step": 13140 + }, + { + "epoch": 0.8819838260461058, + "grad_norm": 4.065192222595215, + "learning_rate": 6.18679454985528e-05, + "loss": 2.1228, + "step": 13142 + }, + { + "epoch": 0.8821180497298748, + "grad_norm": 4.004032135009766, + "learning_rate": 6.185738750605281e-05, + "loss": 2.3336, + "step": 13144 + }, + { + "epoch": 0.8822522734136439, + "grad_norm": 3.719531297683716, + "learning_rate": 6.184682895331456e-05, + "loss": 2.0156, + "step": 13146 + }, + { + "epoch": 0.8823864970974128, + "grad_norm": 4.068393707275391, + "learning_rate": 6.183626984083694e-05, + "loss": 2.1771, + "step": 13148 + }, + { + "epoch": 0.8825207207811818, + "grad_norm": 4.188860893249512, + "learning_rate": 6.182571016911886e-05, + "loss": 2.4988, + "step": 13150 + }, + { + "epoch": 0.8826549444649509, + "grad_norm": 4.527996063232422, + "learning_rate": 6.181514993865922e-05, + "loss": 2.1633, + "step": 13152 + }, + { + "epoch": 0.8827891681487199, + "grad_norm": 4.156023979187012, + "learning_rate": 6.180458914995696e-05, + "loss": 2.5425, + "step": 13154 + }, + { + "epoch": 0.8829233918324888, + "grad_norm": 4.266063690185547, + "learning_rate": 6.17940278035111e-05, + "loss": 2.3646, + "step": 13156 + }, + { + "epoch": 0.8830576155162578, + "grad_norm": 3.9170989990234375, + "learning_rate": 6.178346589982061e-05, + "loss": 2.2449, + "step": 13158 + }, + { + "epoch": 0.8831918392000269, + "grad_norm": 3.952340602874756, + "learning_rate": 6.177290343938452e-05, + "loss": 2.5391, + "step": 13160 + }, + { + "epoch": 0.8833260628837959, + "grad_norm": 3.9893417358398438, + "learning_rate": 6.17623404227019e-05, + "loss": 2.2556, + "step": 13162 + }, + { + "epoch": 0.8834602865675648, + "grad_norm": 3.44345760345459, + "learning_rate": 6.175177685027183e-05, + "loss": 1.8746, + "step": 13164 + }, + { + "epoch": 0.8835945102513338, + "grad_norm": 4.172008037567139, + "learning_rate": 6.17412127225934e-05, + "loss": 2.2588, + "step": 13166 + }, + { + "epoch": 0.8837287339351029, + "grad_norm": 4.204370021820068, + "learning_rate": 6.173064804016577e-05, + "loss": 2.217, + "step": 13168 + }, + { + "epoch": 0.8838629576188719, + "grad_norm": 3.8463590145111084, + "learning_rate": 6.172008280348808e-05, + "loss": 2.5062, + "step": 13170 + }, + { + "epoch": 0.8839971813026408, + "grad_norm": 5.019901275634766, + "learning_rate": 6.170951701305951e-05, + "loss": 2.072, + "step": 13172 + }, + { + "epoch": 0.8841314049864099, + "grad_norm": 4.78465461730957, + "learning_rate": 6.169895066937932e-05, + "loss": 2.3086, + "step": 13174 + }, + { + "epoch": 0.8842656286701789, + "grad_norm": 4.270925521850586, + "learning_rate": 6.168838377294669e-05, + "loss": 2.3035, + "step": 13176 + }, + { + "epoch": 0.8843998523539478, + "grad_norm": 3.9790561199188232, + "learning_rate": 6.167781632426093e-05, + "loss": 2.8907, + "step": 13178 + }, + { + "epoch": 0.8845340760377168, + "grad_norm": 4.213494300842285, + "learning_rate": 6.16672483238213e-05, + "loss": 2.5118, + "step": 13180 + }, + { + "epoch": 0.8846682997214859, + "grad_norm": 4.4090256690979, + "learning_rate": 6.165667977212712e-05, + "loss": 2.275, + "step": 13182 + }, + { + "epoch": 0.8848025234052549, + "grad_norm": 3.8691108226776123, + "learning_rate": 6.164611066967775e-05, + "loss": 2.2793, + "step": 13184 + }, + { + "epoch": 0.8849367470890238, + "grad_norm": 3.4996747970581055, + "learning_rate": 6.163554101697256e-05, + "loss": 2.3881, + "step": 13186 + }, + { + "epoch": 0.8850709707727928, + "grad_norm": 4.049676418304443, + "learning_rate": 6.162497081451093e-05, + "loss": 2.2937, + "step": 13188 + }, + { + "epoch": 0.8852051944565619, + "grad_norm": 4.600838661193848, + "learning_rate": 6.161440006279228e-05, + "loss": 2.3365, + "step": 13190 + }, + { + "epoch": 0.8853394181403309, + "grad_norm": 3.9883792400360107, + "learning_rate": 6.16038287623161e-05, + "loss": 2.3538, + "step": 13192 + }, + { + "epoch": 0.8854736418240998, + "grad_norm": 4.384263515472412, + "learning_rate": 6.15932569135818e-05, + "loss": 2.3709, + "step": 13194 + }, + { + "epoch": 0.8856078655078689, + "grad_norm": 4.16885232925415, + "learning_rate": 6.15826845170889e-05, + "loss": 2.6145, + "step": 13196 + }, + { + "epoch": 0.8857420891916379, + "grad_norm": 3.9720451831817627, + "learning_rate": 6.157211157333695e-05, + "loss": 2.3557, + "step": 13198 + }, + { + "epoch": 0.8858763128754069, + "grad_norm": 3.9394371509552, + "learning_rate": 6.156153808282548e-05, + "loss": 2.3185, + "step": 13200 + }, + { + "epoch": 0.8860105365591758, + "grad_norm": 4.3481125831604, + "learning_rate": 6.155096404605408e-05, + "loss": 2.4358, + "step": 13202 + }, + { + "epoch": 0.8861447602429449, + "grad_norm": 4.103490829467773, + "learning_rate": 6.154038946352232e-05, + "loss": 2.4217, + "step": 13204 + }, + { + "epoch": 0.8862789839267139, + "grad_norm": 4.118835926055908, + "learning_rate": 6.152981433572987e-05, + "loss": 2.247, + "step": 13206 + }, + { + "epoch": 0.8864132076104829, + "grad_norm": 4.125152587890625, + "learning_rate": 6.151923866317636e-05, + "loss": 2.5173, + "step": 13208 + }, + { + "epoch": 0.8865474312942518, + "grad_norm": 5.010989189147949, + "learning_rate": 6.150866244636146e-05, + "loss": 2.5141, + "step": 13210 + }, + { + "epoch": 0.8866816549780209, + "grad_norm": 3.808407783508301, + "learning_rate": 6.149808568578492e-05, + "loss": 2.3919, + "step": 13212 + }, + { + "epoch": 0.8868158786617899, + "grad_norm": 6.039849281311035, + "learning_rate": 6.148750838194642e-05, + "loss": 2.2414, + "step": 13214 + }, + { + "epoch": 0.8869501023455588, + "grad_norm": 4.121319770812988, + "learning_rate": 6.147693053534575e-05, + "loss": 2.1995, + "step": 13216 + }, + { + "epoch": 0.8870843260293279, + "grad_norm": 4.451784133911133, + "learning_rate": 6.146635214648267e-05, + "loss": 2.4589, + "step": 13218 + }, + { + "epoch": 0.8872185497130969, + "grad_norm": 4.32796573638916, + "learning_rate": 6.145577321585701e-05, + "loss": 2.4127, + "step": 13220 + }, + { + "epoch": 0.8873527733968659, + "grad_norm": 3.855429172515869, + "learning_rate": 6.144519374396859e-05, + "loss": 2.1908, + "step": 13222 + }, + { + "epoch": 0.8874869970806348, + "grad_norm": 7.813572406768799, + "learning_rate": 6.143461373131725e-05, + "loss": 2.285, + "step": 13224 + }, + { + "epoch": 0.8876212207644039, + "grad_norm": 3.7997872829437256, + "learning_rate": 6.142403317840292e-05, + "loss": 2.1727, + "step": 13226 + }, + { + "epoch": 0.8877554444481729, + "grad_norm": 4.220510482788086, + "learning_rate": 6.14134520857255e-05, + "loss": 2.6659, + "step": 13228 + }, + { + "epoch": 0.8878896681319419, + "grad_norm": 3.3481054306030273, + "learning_rate": 6.140287045378488e-05, + "loss": 2.2143, + "step": 13230 + }, + { + "epoch": 0.8880238918157108, + "grad_norm": 4.451102256774902, + "learning_rate": 6.139228828308107e-05, + "loss": 2.3425, + "step": 13232 + }, + { + "epoch": 0.8881581154994799, + "grad_norm": 3.6766891479492188, + "learning_rate": 6.138170557411403e-05, + "loss": 2.7465, + "step": 13234 + }, + { + "epoch": 0.8882923391832489, + "grad_norm": 4.314380168914795, + "learning_rate": 6.13711223273838e-05, + "loss": 2.2472, + "step": 13236 + }, + { + "epoch": 0.8884265628670179, + "grad_norm": 4.298898220062256, + "learning_rate": 6.136053854339039e-05, + "loss": 2.2132, + "step": 13238 + }, + { + "epoch": 0.8885607865507869, + "grad_norm": 4.198291778564453, + "learning_rate": 6.134995422263388e-05, + "loss": 2.4562, + "step": 13240 + }, + { + "epoch": 0.8886950102345559, + "grad_norm": 4.318065166473389, + "learning_rate": 6.133936936561432e-05, + "loss": 2.2347, + "step": 13242 + }, + { + "epoch": 0.8888292339183249, + "grad_norm": 4.383955478668213, + "learning_rate": 6.132878397283189e-05, + "loss": 2.2262, + "step": 13244 + }, + { + "epoch": 0.8889634576020939, + "grad_norm": 4.014040946960449, + "learning_rate": 6.131819804478669e-05, + "loss": 2.2242, + "step": 13246 + }, + { + "epoch": 0.8890976812858629, + "grad_norm": 4.983264923095703, + "learning_rate": 6.130761158197888e-05, + "loss": 2.3858, + "step": 13248 + }, + { + "epoch": 0.8892319049696319, + "grad_norm": 4.300611972808838, + "learning_rate": 6.129702458490867e-05, + "loss": 2.3027, + "step": 13250 + }, + { + "epoch": 0.8893661286534009, + "grad_norm": 4.0076470375061035, + "learning_rate": 6.128643705407625e-05, + "loss": 2.1781, + "step": 13252 + }, + { + "epoch": 0.8895003523371698, + "grad_norm": 4.521521091461182, + "learning_rate": 6.127584898998188e-05, + "loss": 2.5481, + "step": 13254 + }, + { + "epoch": 0.8896345760209389, + "grad_norm": 4.740964412689209, + "learning_rate": 6.126526039312581e-05, + "loss": 2.4285, + "step": 13256 + }, + { + "epoch": 0.8897687997047079, + "grad_norm": 4.708981037139893, + "learning_rate": 6.125467126400835e-05, + "loss": 2.4136, + "step": 13258 + }, + { + "epoch": 0.8899030233884769, + "grad_norm": 4.320923805236816, + "learning_rate": 6.12440816031298e-05, + "loss": 2.5356, + "step": 13260 + }, + { + "epoch": 0.8900372470722459, + "grad_norm": 4.111133098602295, + "learning_rate": 6.12334914109905e-05, + "loss": 2.4414, + "step": 13262 + }, + { + "epoch": 0.8901714707560149, + "grad_norm": 5.300046920776367, + "learning_rate": 6.122290068809083e-05, + "loss": 2.4225, + "step": 13264 + }, + { + "epoch": 0.8903056944397839, + "grad_norm": 7.118680953979492, + "learning_rate": 6.121230943493117e-05, + "loss": 2.2957, + "step": 13266 + }, + { + "epoch": 0.8904399181235529, + "grad_norm": 4.099699020385742, + "learning_rate": 6.120171765201194e-05, + "loss": 2.0853, + "step": 13268 + }, + { + "epoch": 0.8905741418073219, + "grad_norm": 4.244811058044434, + "learning_rate": 6.119112533983355e-05, + "loss": 2.4572, + "step": 13270 + }, + { + "epoch": 0.8907083654910909, + "grad_norm": 4.186366558074951, + "learning_rate": 6.118053249889652e-05, + "loss": 2.1978, + "step": 13272 + }, + { + "epoch": 0.8908425891748599, + "grad_norm": 3.906271457672119, + "learning_rate": 6.116993912970132e-05, + "loss": 2.1855, + "step": 13274 + }, + { + "epoch": 0.890976812858629, + "grad_norm": 3.947033166885376, + "learning_rate": 6.115934523274845e-05, + "loss": 2.2494, + "step": 13276 + }, + { + "epoch": 0.8911110365423979, + "grad_norm": 4.268795013427734, + "learning_rate": 6.114875080853846e-05, + "loss": 2.2204, + "step": 13278 + }, + { + "epoch": 0.8912452602261669, + "grad_norm": 4.324015140533447, + "learning_rate": 6.113815585757192e-05, + "loss": 2.3243, + "step": 13280 + }, + { + "epoch": 0.8913794839099359, + "grad_norm": 3.731476306915283, + "learning_rate": 6.112756038034942e-05, + "loss": 2.3213, + "step": 13282 + }, + { + "epoch": 0.891513707593705, + "grad_norm": 4.086124897003174, + "learning_rate": 6.111696437737157e-05, + "loss": 2.2508, + "step": 13284 + }, + { + "epoch": 0.8916479312774739, + "grad_norm": 3.869391918182373, + "learning_rate": 6.110636784913901e-05, + "loss": 2.3354, + "step": 13286 + }, + { + "epoch": 0.8917821549612429, + "grad_norm": 4.377526760101318, + "learning_rate": 6.109577079615243e-05, + "loss": 2.5813, + "step": 13288 + }, + { + "epoch": 0.8919163786450119, + "grad_norm": 4.4040398597717285, + "learning_rate": 6.108517321891247e-05, + "loss": 2.5135, + "step": 13290 + }, + { + "epoch": 0.8920506023287809, + "grad_norm": 4.5383687019348145, + "learning_rate": 6.10745751179199e-05, + "loss": 2.396, + "step": 13292 + }, + { + "epoch": 0.8921848260125499, + "grad_norm": 4.427144527435303, + "learning_rate": 6.106397649367541e-05, + "loss": 2.3296, + "step": 13294 + }, + { + "epoch": 0.8923190496963189, + "grad_norm": 3.836575984954834, + "learning_rate": 6.10533773466798e-05, + "loss": 2.2094, + "step": 13296 + }, + { + "epoch": 0.892453273380088, + "grad_norm": 4.33811616897583, + "learning_rate": 6.104277767743385e-05, + "loss": 2.2315, + "step": 13298 + }, + { + "epoch": 0.8925874970638569, + "grad_norm": 4.002844333648682, + "learning_rate": 6.1032177486438355e-05, + "loss": 2.2014, + "step": 13300 + }, + { + "epoch": 0.8927217207476259, + "grad_norm": 4.15293550491333, + "learning_rate": 6.102157677419418e-05, + "loss": 2.1473, + "step": 13302 + }, + { + "epoch": 0.8928559444313949, + "grad_norm": 4.260342597961426, + "learning_rate": 6.101097554120216e-05, + "loss": 2.1527, + "step": 13304 + }, + { + "epoch": 0.892990168115164, + "grad_norm": 4.442800998687744, + "learning_rate": 6.100037378796321e-05, + "loss": 2.6121, + "step": 13306 + }, + { + "epoch": 0.8931243917989329, + "grad_norm": 4.241450309753418, + "learning_rate": 6.0989771514978235e-05, + "loss": 2.417, + "step": 13308 + }, + { + "epoch": 0.8932586154827019, + "grad_norm": 7.889102935791016, + "learning_rate": 6.097916872274815e-05, + "loss": 2.1236, + "step": 13310 + }, + { + "epoch": 0.8933928391664709, + "grad_norm": 4.231535911560059, + "learning_rate": 6.096856541177395e-05, + "loss": 2.159, + "step": 13312 + }, + { + "epoch": 0.89352706285024, + "grad_norm": 4.214563369750977, + "learning_rate": 6.095796158255659e-05, + "loss": 2.1695, + "step": 13314 + }, + { + "epoch": 0.8936612865340089, + "grad_norm": 4.202199935913086, + "learning_rate": 6.09473572355971e-05, + "loss": 2.0925, + "step": 13316 + }, + { + "epoch": 0.8937955102177779, + "grad_norm": 4.456860065460205, + "learning_rate": 6.09367523713965e-05, + "loss": 2.5292, + "step": 13318 + }, + { + "epoch": 0.893929733901547, + "grad_norm": 4.651003360748291, + "learning_rate": 6.092614699045587e-05, + "loss": 2.2749, + "step": 13320 + }, + { + "epoch": 0.894063957585316, + "grad_norm": 4.153751373291016, + "learning_rate": 6.091554109327626e-05, + "loss": 2.4193, + "step": 13322 + }, + { + "epoch": 0.8941981812690849, + "grad_norm": 7.508519172668457, + "learning_rate": 6.090493468035882e-05, + "loss": 2.1572, + "step": 13324 + }, + { + "epoch": 0.8943324049528539, + "grad_norm": 3.822075366973877, + "learning_rate": 6.089432775220465e-05, + "loss": 2.5715, + "step": 13326 + }, + { + "epoch": 0.894466628636623, + "grad_norm": 3.283507823944092, + "learning_rate": 6.088372030931491e-05, + "loss": 2.0331, + "step": 13328 + }, + { + "epoch": 0.8946008523203919, + "grad_norm": 4.005886554718018, + "learning_rate": 6.0873112352190795e-05, + "loss": 2.6068, + "step": 13330 + }, + { + "epoch": 0.8947350760041609, + "grad_norm": 5.832508563995361, + "learning_rate": 6.0862503881333496e-05, + "loss": 2.1932, + "step": 13332 + }, + { + "epoch": 0.8948692996879299, + "grad_norm": 11.952640533447266, + "learning_rate": 6.085189489724426e-05, + "loss": 2.3196, + "step": 13334 + }, + { + "epoch": 0.895003523371699, + "grad_norm": 4.042935848236084, + "learning_rate": 6.084128540042432e-05, + "loss": 2.2712, + "step": 13336 + }, + { + "epoch": 0.8951377470554679, + "grad_norm": 3.702185869216919, + "learning_rate": 6.083067539137497e-05, + "loss": 2.3408, + "step": 13338 + }, + { + "epoch": 0.8952719707392369, + "grad_norm": 3.5819528102874756, + "learning_rate": 6.0820064870597504e-05, + "loss": 2.1375, + "step": 13340 + }, + { + "epoch": 0.895406194423006, + "grad_norm": 4.577406883239746, + "learning_rate": 6.0809453838593246e-05, + "loss": 2.5056, + "step": 13342 + }, + { + "epoch": 0.895540418106775, + "grad_norm": 4.23300838470459, + "learning_rate": 6.079884229586355e-05, + "loss": 2.2245, + "step": 13344 + }, + { + "epoch": 0.8956746417905439, + "grad_norm": 4.525956153869629, + "learning_rate": 6.0788230242909795e-05, + "loss": 2.3892, + "step": 13346 + }, + { + "epoch": 0.8958088654743129, + "grad_norm": 3.868999719619751, + "learning_rate": 6.077761768023337e-05, + "loss": 2.1828, + "step": 13348 + }, + { + "epoch": 0.895943089158082, + "grad_norm": 3.705273389816284, + "learning_rate": 6.076700460833571e-05, + "loss": 2.1787, + "step": 13350 + }, + { + "epoch": 0.896077312841851, + "grad_norm": 3.623911142349243, + "learning_rate": 6.075639102771824e-05, + "loss": 1.9522, + "step": 13352 + }, + { + "epoch": 0.8962115365256199, + "grad_norm": 4.5167999267578125, + "learning_rate": 6.074577693888246e-05, + "loss": 2.3702, + "step": 13354 + }, + { + "epoch": 0.8963457602093889, + "grad_norm": 3.978541612625122, + "learning_rate": 6.073516234232985e-05, + "loss": 2.3744, + "step": 13356 + }, + { + "epoch": 0.896479983893158, + "grad_norm": 4.193647861480713, + "learning_rate": 6.072454723856192e-05, + "loss": 2.1171, + "step": 13358 + }, + { + "epoch": 0.896614207576927, + "grad_norm": 5.019547939300537, + "learning_rate": 6.0713931628080236e-05, + "loss": 2.2123, + "step": 13360 + }, + { + "epoch": 0.8967484312606959, + "grad_norm": 4.222206115722656, + "learning_rate": 6.0703315511386336e-05, + "loss": 2.2713, + "step": 13362 + }, + { + "epoch": 0.896882654944465, + "grad_norm": 3.8558502197265625, + "learning_rate": 6.069269888898184e-05, + "loss": 2.304, + "step": 13364 + }, + { + "epoch": 0.897016878628234, + "grad_norm": 4.171947956085205, + "learning_rate": 6.0682081761368325e-05, + "loss": 2.3355, + "step": 13366 + }, + { + "epoch": 0.8971511023120029, + "grad_norm": 4.860604286193848, + "learning_rate": 6.0671464129047474e-05, + "loss": 2.2188, + "step": 13368 + }, + { + "epoch": 0.8972853259957719, + "grad_norm": 4.26322078704834, + "learning_rate": 6.066084599252092e-05, + "loss": 2.0704, + "step": 13370 + }, + { + "epoch": 0.897419549679541, + "grad_norm": 4.082518100738525, + "learning_rate": 6.0650227352290345e-05, + "loss": 2.2921, + "step": 13372 + }, + { + "epoch": 0.89755377336331, + "grad_norm": 4.871400356292725, + "learning_rate": 6.063960820885749e-05, + "loss": 2.3174, + "step": 13374 + }, + { + "epoch": 0.8976879970470789, + "grad_norm": 3.9868781566619873, + "learning_rate": 6.0628988562724054e-05, + "loss": 2.3242, + "step": 13376 + }, + { + "epoch": 0.8978222207308479, + "grad_norm": 4.919647216796875, + "learning_rate": 6.061836841439182e-05, + "loss": 2.246, + "step": 13378 + }, + { + "epoch": 0.897956444414617, + "grad_norm": 4.377017021179199, + "learning_rate": 6.060774776436255e-05, + "loss": 2.3733, + "step": 13380 + }, + { + "epoch": 0.898090668098386, + "grad_norm": 3.971627950668335, + "learning_rate": 6.059712661313807e-05, + "loss": 2.1867, + "step": 13382 + }, + { + "epoch": 0.8982248917821549, + "grad_norm": 4.018701076507568, + "learning_rate": 6.058650496122018e-05, + "loss": 2.1401, + "step": 13384 + }, + { + "epoch": 0.898359115465924, + "grad_norm": 4.340066432952881, + "learning_rate": 6.057588280911075e-05, + "loss": 2.0344, + "step": 13386 + }, + { + "epoch": 0.898493339149693, + "grad_norm": 4.207492828369141, + "learning_rate": 6.056526015731166e-05, + "loss": 2.3062, + "step": 13388 + }, + { + "epoch": 0.898627562833462, + "grad_norm": 5.015026569366455, + "learning_rate": 6.05546370063248e-05, + "loss": 1.9506, + "step": 13390 + }, + { + "epoch": 0.8987617865172309, + "grad_norm": 4.39638614654541, + "learning_rate": 6.054401335665211e-05, + "loss": 2.2557, + "step": 13392 + }, + { + "epoch": 0.898896010201, + "grad_norm": 4.386972427368164, + "learning_rate": 6.05333892087955e-05, + "loss": 2.383, + "step": 13394 + }, + { + "epoch": 0.899030233884769, + "grad_norm": 4.316920757293701, + "learning_rate": 6.0522764563256985e-05, + "loss": 2.3901, + "step": 13396 + }, + { + "epoch": 0.899164457568538, + "grad_norm": 3.8605875968933105, + "learning_rate": 6.0512139420538515e-05, + "loss": 2.364, + "step": 13398 + }, + { + "epoch": 0.8992986812523069, + "grad_norm": 4.000301837921143, + "learning_rate": 6.050151378114214e-05, + "loss": 2.475, + "step": 13400 + }, + { + "epoch": 0.899432904936076, + "grad_norm": 4.351308345794678, + "learning_rate": 6.049088764556989e-05, + "loss": 2.4521, + "step": 13402 + }, + { + "epoch": 0.899567128619845, + "grad_norm": 4.104516983032227, + "learning_rate": 6.0480261014323826e-05, + "loss": 2.1791, + "step": 13404 + }, + { + "epoch": 0.8997013523036139, + "grad_norm": 3.79512095451355, + "learning_rate": 6.046963388790604e-05, + "loss": 2.3587, + "step": 13406 + }, + { + "epoch": 0.899835575987383, + "grad_norm": 4.282802581787109, + "learning_rate": 6.045900626681864e-05, + "loss": 2.1685, + "step": 13408 + }, + { + "epoch": 0.899969799671152, + "grad_norm": 4.021341800689697, + "learning_rate": 6.044837815156377e-05, + "loss": 2.1095, + "step": 13410 + }, + { + "epoch": 0.900104023354921, + "grad_norm": 4.227400302886963, + "learning_rate": 6.043774954264355e-05, + "loss": 2.3182, + "step": 13412 + }, + { + "epoch": 0.9002382470386899, + "grad_norm": 4.156959056854248, + "learning_rate": 6.0427120440560204e-05, + "loss": 2.3334, + "step": 13414 + }, + { + "epoch": 0.900372470722459, + "grad_norm": 3.6131718158721924, + "learning_rate": 6.041649084581593e-05, + "loss": 2.0791, + "step": 13416 + }, + { + "epoch": 0.900506694406228, + "grad_norm": 4.9014997482299805, + "learning_rate": 6.040586075891293e-05, + "loss": 2.4963, + "step": 13418 + }, + { + "epoch": 0.900640918089997, + "grad_norm": 3.753248929977417, + "learning_rate": 6.0395230180353504e-05, + "loss": 2.4562, + "step": 13420 + }, + { + "epoch": 0.9007751417737659, + "grad_norm": 4.504335880279541, + "learning_rate": 6.038459911063986e-05, + "loss": 2.232, + "step": 13422 + }, + { + "epoch": 0.900909365457535, + "grad_norm": 3.553931951522827, + "learning_rate": 6.0373967550274336e-05, + "loss": 2.2407, + "step": 13424 + }, + { + "epoch": 0.901043589141304, + "grad_norm": 4.214244365692139, + "learning_rate": 6.0363335499759265e-05, + "loss": 2.3213, + "step": 13426 + }, + { + "epoch": 0.901177812825073, + "grad_norm": 4.319566249847412, + "learning_rate": 6.035270295959695e-05, + "loss": 2.331, + "step": 13428 + }, + { + "epoch": 0.901312036508842, + "grad_norm": 3.9772353172302246, + "learning_rate": 6.034206993028979e-05, + "loss": 2.2817, + "step": 13430 + }, + { + "epoch": 0.901446260192611, + "grad_norm": 3.5625922679901123, + "learning_rate": 6.0331436412340147e-05, + "loss": 2.4407, + "step": 13432 + }, + { + "epoch": 0.90158048387638, + "grad_norm": 4.041510105133057, + "learning_rate": 6.032080240625045e-05, + "loss": 2.2258, + "step": 13434 + }, + { + "epoch": 0.901714707560149, + "grad_norm": 4.135049343109131, + "learning_rate": 6.031016791252315e-05, + "loss": 2.5187, + "step": 13436 + }, + { + "epoch": 0.901848931243918, + "grad_norm": 3.934309959411621, + "learning_rate": 6.0299532931660675e-05, + "loss": 2.1582, + "step": 13438 + }, + { + "epoch": 0.901983154927687, + "grad_norm": 4.560545921325684, + "learning_rate": 6.028889746416553e-05, + "loss": 2.2538, + "step": 13440 + }, + { + "epoch": 0.902117378611456, + "grad_norm": 3.7169275283813477, + "learning_rate": 6.0278261510540214e-05, + "loss": 2.2823, + "step": 13442 + }, + { + "epoch": 0.9022516022952249, + "grad_norm": 4.499866962432861, + "learning_rate": 6.026762507128725e-05, + "loss": 2.1933, + "step": 13444 + }, + { + "epoch": 0.902385825978994, + "grad_norm": 4.100047588348389, + "learning_rate": 6.025698814690919e-05, + "loss": 2.3605, + "step": 13446 + }, + { + "epoch": 0.902520049662763, + "grad_norm": 3.975118398666382, + "learning_rate": 6.0246350737908605e-05, + "loss": 2.4752, + "step": 13448 + }, + { + "epoch": 0.902654273346532, + "grad_norm": 4.160134792327881, + "learning_rate": 6.02357128447881e-05, + "loss": 2.2449, + "step": 13450 + }, + { + "epoch": 0.902788497030301, + "grad_norm": 3.575411558151245, + "learning_rate": 6.022507446805029e-05, + "loss": 2.1023, + "step": 13452 + }, + { + "epoch": 0.90292272071407, + "grad_norm": 3.8838207721710205, + "learning_rate": 6.0214435608197825e-05, + "loss": 2.129, + "step": 13454 + }, + { + "epoch": 0.903056944397839, + "grad_norm": 4.564263820648193, + "learning_rate": 6.020379626573336e-05, + "loss": 2.1825, + "step": 13456 + }, + { + "epoch": 0.903191168081608, + "grad_norm": 4.129507064819336, + "learning_rate": 6.019315644115959e-05, + "loss": 2.2482, + "step": 13458 + }, + { + "epoch": 0.903325391765377, + "grad_norm": 4.343836784362793, + "learning_rate": 6.018251613497922e-05, + "loss": 2.3966, + "step": 13460 + }, + { + "epoch": 0.903459615449146, + "grad_norm": 4.247714996337891, + "learning_rate": 6.0171875347694974e-05, + "loss": 2.0404, + "step": 13462 + }, + { + "epoch": 0.903593839132915, + "grad_norm": 3.4892818927764893, + "learning_rate": 6.0161234079809635e-05, + "loss": 2.1299, + "step": 13464 + }, + { + "epoch": 0.903728062816684, + "grad_norm": 4.072977066040039, + "learning_rate": 6.015059233182596e-05, + "loss": 2.445, + "step": 13466 + }, + { + "epoch": 0.903862286500453, + "grad_norm": 3.726490020751953, + "learning_rate": 6.013995010424676e-05, + "loss": 2.1805, + "step": 13468 + }, + { + "epoch": 0.903996510184222, + "grad_norm": 3.5326225757598877, + "learning_rate": 6.012930739757485e-05, + "loss": 2.1996, + "step": 13470 + }, + { + "epoch": 0.904130733867991, + "grad_norm": 5.278005123138428, + "learning_rate": 6.011866421231309e-05, + "loss": 2.3808, + "step": 13472 + }, + { + "epoch": 0.9042649575517601, + "grad_norm": 4.261924743652344, + "learning_rate": 6.010802054896435e-05, + "loss": 2.4544, + "step": 13474 + }, + { + "epoch": 0.904399181235529, + "grad_norm": 3.962660312652588, + "learning_rate": 6.0097376408031504e-05, + "loss": 2.1137, + "step": 13476 + }, + { + "epoch": 0.904533404919298, + "grad_norm": 4.232475280761719, + "learning_rate": 6.008673179001748e-05, + "loss": 2.6358, + "step": 13478 + }, + { + "epoch": 0.904667628603067, + "grad_norm": 4.1019134521484375, + "learning_rate": 6.007608669542522e-05, + "loss": 2.3451, + "step": 13480 + }, + { + "epoch": 0.904801852286836, + "grad_norm": 4.18292236328125, + "learning_rate": 6.006544112475767e-05, + "loss": 2.323, + "step": 13482 + }, + { + "epoch": 0.904936075970605, + "grad_norm": 4.085184097290039, + "learning_rate": 6.0054795078517826e-05, + "loss": 2.4053, + "step": 13484 + }, + { + "epoch": 0.905070299654374, + "grad_norm": 4.547171592712402, + "learning_rate": 6.004414855720869e-05, + "loss": 2.3852, + "step": 13486 + }, + { + "epoch": 0.905204523338143, + "grad_norm": 4.070449352264404, + "learning_rate": 6.003350156133327e-05, + "loss": 2.2591, + "step": 13488 + }, + { + "epoch": 0.905338747021912, + "grad_norm": 4.119770526885986, + "learning_rate": 6.002285409139464e-05, + "loss": 2.4502, + "step": 13490 + }, + { + "epoch": 0.905472970705681, + "grad_norm": 4.280753135681152, + "learning_rate": 6.001220614789587e-05, + "loss": 2.4275, + "step": 13492 + }, + { + "epoch": 0.90560719438945, + "grad_norm": 4.115443706512451, + "learning_rate": 6.000155773134005e-05, + "loss": 2.0496, + "step": 13494 + }, + { + "epoch": 0.9057414180732191, + "grad_norm": 12.904505729675293, + "learning_rate": 5.999090884223029e-05, + "loss": 2.5109, + "step": 13496 + }, + { + "epoch": 0.905875641756988, + "grad_norm": 3.807137966156006, + "learning_rate": 5.998025948106973e-05, + "loss": 2.0573, + "step": 13498 + }, + { + "epoch": 0.906009865440757, + "grad_norm": 4.1935834884643555, + "learning_rate": 5.9969609648361526e-05, + "loss": 2.3148, + "step": 13500 + }, + { + "epoch": 0.906144089124526, + "grad_norm": 5.712697982788086, + "learning_rate": 5.995895934460889e-05, + "loss": 2.7601, + "step": 13502 + }, + { + "epoch": 0.9062783128082951, + "grad_norm": 3.845592975616455, + "learning_rate": 5.994830857031499e-05, + "loss": 1.9567, + "step": 13504 + }, + { + "epoch": 0.906412536492064, + "grad_norm": 3.248807191848755, + "learning_rate": 5.9937657325983086e-05, + "loss": 1.9753, + "step": 13506 + }, + { + "epoch": 0.906546760175833, + "grad_norm": 4.83198356628418, + "learning_rate": 5.992700561211641e-05, + "loss": 2.5385, + "step": 13508 + }, + { + "epoch": 0.906680983859602, + "grad_norm": 3.867241621017456, + "learning_rate": 5.991635342921823e-05, + "loss": 2.4016, + "step": 13510 + }, + { + "epoch": 0.9068152075433711, + "grad_norm": 4.353923320770264, + "learning_rate": 5.9905700777791864e-05, + "loss": 2.2816, + "step": 13512 + }, + { + "epoch": 0.90694943122714, + "grad_norm": 4.232877731323242, + "learning_rate": 5.989504765834061e-05, + "loss": 2.3509, + "step": 13514 + }, + { + "epoch": 0.907083654910909, + "grad_norm": 3.6561412811279297, + "learning_rate": 5.9884394071367814e-05, + "loss": 2.1905, + "step": 13516 + }, + { + "epoch": 0.9072178785946781, + "grad_norm": 3.8235747814178467, + "learning_rate": 5.9873740017376825e-05, + "loss": 2.2089, + "step": 13518 + }, + { + "epoch": 0.907352102278447, + "grad_norm": 4.373332500457764, + "learning_rate": 5.986308549687105e-05, + "loss": 2.2311, + "step": 13520 + }, + { + "epoch": 0.907486325962216, + "grad_norm": 4.137012004852295, + "learning_rate": 5.9852430510353876e-05, + "loss": 2.3306, + "step": 13522 + }, + { + "epoch": 0.907620549645985, + "grad_norm": 3.9338815212249756, + "learning_rate": 5.984177505832872e-05, + "loss": 2.4551, + "step": 13524 + }, + { + "epoch": 0.9077547733297541, + "grad_norm": 4.565598011016846, + "learning_rate": 5.9831119141299064e-05, + "loss": 2.4273, + "step": 13526 + }, + { + "epoch": 0.907888997013523, + "grad_norm": 3.9043869972229004, + "learning_rate": 5.982046275976836e-05, + "loss": 2.1582, + "step": 13528 + }, + { + "epoch": 0.908023220697292, + "grad_norm": 6.767940998077393, + "learning_rate": 5.9809805914240104e-05, + "loss": 2.3203, + "step": 13530 + }, + { + "epoch": 0.908157444381061, + "grad_norm": 4.5475263595581055, + "learning_rate": 5.979914860521779e-05, + "loss": 2.4044, + "step": 13532 + }, + { + "epoch": 0.9082916680648301, + "grad_norm": 3.91835618019104, + "learning_rate": 5.978849083320499e-05, + "loss": 2.021, + "step": 13534 + }, + { + "epoch": 0.908425891748599, + "grad_norm": 3.988319158554077, + "learning_rate": 5.977783259870524e-05, + "loss": 2.2506, + "step": 13536 + }, + { + "epoch": 0.908560115432368, + "grad_norm": 4.419792652130127, + "learning_rate": 5.9767173902222116e-05, + "loss": 2.4348, + "step": 13538 + }, + { + "epoch": 0.9086943391161371, + "grad_norm": 3.9547274112701416, + "learning_rate": 5.975651474425925e-05, + "loss": 2.2706, + "step": 13540 + }, + { + "epoch": 0.9088285627999061, + "grad_norm": 4.760572910308838, + "learning_rate": 5.9745855125320236e-05, + "loss": 2.1561, + "step": 13542 + }, + { + "epoch": 0.908962786483675, + "grad_norm": 3.9083075523376465, + "learning_rate": 5.973519504590874e-05, + "loss": 2.2292, + "step": 13544 + }, + { + "epoch": 0.909097010167444, + "grad_norm": 4.651097774505615, + "learning_rate": 5.972453450652842e-05, + "loss": 2.6824, + "step": 13546 + }, + { + "epoch": 0.9092312338512131, + "grad_norm": 4.664394378662109, + "learning_rate": 5.971387350768297e-05, + "loss": 2.3147, + "step": 13548 + }, + { + "epoch": 0.9093654575349821, + "grad_norm": 4.1343817710876465, + "learning_rate": 5.970321204987609e-05, + "loss": 2.141, + "step": 13550 + }, + { + "epoch": 0.909499681218751, + "grad_norm": 4.4833550453186035, + "learning_rate": 5.969255013361153e-05, + "loss": 2.5484, + "step": 13552 + }, + { + "epoch": 0.90963390490252, + "grad_norm": 4.352199554443359, + "learning_rate": 5.968188775939303e-05, + "loss": 2.2362, + "step": 13554 + }, + { + "epoch": 0.9097681285862891, + "grad_norm": 4.237645626068115, + "learning_rate": 5.9671224927724366e-05, + "loss": 2.0872, + "step": 13556 + }, + { + "epoch": 0.909902352270058, + "grad_norm": 3.9625790119171143, + "learning_rate": 5.966056163910936e-05, + "loss": 2.5127, + "step": 13558 + }, + { + "epoch": 0.910036575953827, + "grad_norm": 3.9941585063934326, + "learning_rate": 5.9649897894051785e-05, + "loss": 2.3106, + "step": 13560 + }, + { + "epoch": 0.9101707996375961, + "grad_norm": 6.847405910491943, + "learning_rate": 5.963923369305554e-05, + "loss": 2.2784, + "step": 13562 + }, + { + "epoch": 0.9103050233213651, + "grad_norm": 4.190040111541748, + "learning_rate": 5.9628569036624446e-05, + "loss": 2.3382, + "step": 13564 + }, + { + "epoch": 0.910439247005134, + "grad_norm": 4.367335319519043, + "learning_rate": 5.9617903925262395e-05, + "loss": 2.2592, + "step": 13566 + }, + { + "epoch": 0.910573470688903, + "grad_norm": 4.572558879852295, + "learning_rate": 5.960723835947331e-05, + "loss": 2.3933, + "step": 13568 + }, + { + "epoch": 0.9107076943726721, + "grad_norm": 7.520634651184082, + "learning_rate": 5.959657233976108e-05, + "loss": 2.1449, + "step": 13570 + }, + { + "epoch": 0.9108419180564411, + "grad_norm": 3.3707127571105957, + "learning_rate": 5.9585905866629687e-05, + "loss": 2.0629, + "step": 13572 + }, + { + "epoch": 0.91097614174021, + "grad_norm": 4.41083288192749, + "learning_rate": 5.95752389405831e-05, + "loss": 2.2035, + "step": 13574 + }, + { + "epoch": 0.911110365423979, + "grad_norm": 4.6220197677612305, + "learning_rate": 5.9564571562125294e-05, + "loss": 2.2829, + "step": 13576 + }, + { + "epoch": 0.9112445891077481, + "grad_norm": 3.9355082511901855, + "learning_rate": 5.9553903731760295e-05, + "loss": 2.3318, + "step": 13578 + }, + { + "epoch": 0.9113788127915171, + "grad_norm": 4.325434684753418, + "learning_rate": 5.954323544999213e-05, + "loss": 2.3677, + "step": 13580 + }, + { + "epoch": 0.911513036475286, + "grad_norm": 3.920098066329956, + "learning_rate": 5.953256671732487e-05, + "loss": 2.3351, + "step": 13582 + }, + { + "epoch": 0.9116472601590551, + "grad_norm": 3.9939496517181396, + "learning_rate": 5.952189753426255e-05, + "loss": 2.3929, + "step": 13584 + }, + { + "epoch": 0.9117814838428241, + "grad_norm": 4.201869964599609, + "learning_rate": 5.951122790130931e-05, + "loss": 2.2783, + "step": 13586 + }, + { + "epoch": 0.9119157075265931, + "grad_norm": 3.9465856552124023, + "learning_rate": 5.950055781896926e-05, + "loss": 2.1678, + "step": 13588 + }, + { + "epoch": 0.912049931210362, + "grad_norm": 3.8133621215820312, + "learning_rate": 5.948988728774652e-05, + "loss": 2.2734, + "step": 13590 + }, + { + "epoch": 0.9121841548941311, + "grad_norm": 4.266514301300049, + "learning_rate": 5.947921630814528e-05, + "loss": 2.022, + "step": 13592 + }, + { + "epoch": 0.9123183785779001, + "grad_norm": 3.828042507171631, + "learning_rate": 5.94685448806697e-05, + "loss": 2.428, + "step": 13594 + }, + { + "epoch": 0.912452602261669, + "grad_norm": 8.485233306884766, + "learning_rate": 5.945787300582401e-05, + "loss": 2.4057, + "step": 13596 + }, + { + "epoch": 0.912586825945438, + "grad_norm": 6.4911017417907715, + "learning_rate": 5.944720068411239e-05, + "loss": 2.3532, + "step": 13598 + }, + { + "epoch": 0.9127210496292071, + "grad_norm": 4.032436370849609, + "learning_rate": 5.943652791603913e-05, + "loss": 2.1182, + "step": 13600 + }, + { + "epoch": 0.9128552733129761, + "grad_norm": 4.570639610290527, + "learning_rate": 5.9425854702108494e-05, + "loss": 2.2898, + "step": 13602 + }, + { + "epoch": 0.912989496996745, + "grad_norm": 3.730602741241455, + "learning_rate": 5.941518104282474e-05, + "loss": 2.1097, + "step": 13604 + }, + { + "epoch": 0.9131237206805141, + "grad_norm": 4.323294639587402, + "learning_rate": 5.940450693869222e-05, + "loss": 2.4026, + "step": 13606 + }, + { + "epoch": 0.9132579443642831, + "grad_norm": 4.856786727905273, + "learning_rate": 5.939383239021521e-05, + "loss": 2.3215, + "step": 13608 + }, + { + "epoch": 0.9133921680480521, + "grad_norm": 4.278216361999512, + "learning_rate": 5.938315739789812e-05, + "loss": 2.1445, + "step": 13610 + }, + { + "epoch": 0.913526391731821, + "grad_norm": 3.765312910079956, + "learning_rate": 5.9372481962245274e-05, + "loss": 2.1593, + "step": 13612 + }, + { + "epoch": 0.9136606154155901, + "grad_norm": 5.51054573059082, + "learning_rate": 5.9361806083761084e-05, + "loss": 2.3857, + "step": 13614 + }, + { + "epoch": 0.9137948390993591, + "grad_norm": 3.8711538314819336, + "learning_rate": 5.935112976294997e-05, + "loss": 2.4336, + "step": 13616 + }, + { + "epoch": 0.9139290627831281, + "grad_norm": 5.119199275970459, + "learning_rate": 5.934045300031637e-05, + "loss": 2.1157, + "step": 13618 + }, + { + "epoch": 0.914063286466897, + "grad_norm": 4.118533611297607, + "learning_rate": 5.932977579636474e-05, + "loss": 2.063, + "step": 13620 + }, + { + "epoch": 0.9141975101506661, + "grad_norm": 13.589522361755371, + "learning_rate": 5.9319098151599525e-05, + "loss": 2.1901, + "step": 13622 + }, + { + "epoch": 0.9143317338344351, + "grad_norm": 4.788952827453613, + "learning_rate": 5.9308420066525274e-05, + "loss": 2.3024, + "step": 13624 + }, + { + "epoch": 0.9144659575182041, + "grad_norm": 4.584630489349365, + "learning_rate": 5.9297741541646465e-05, + "loss": 2.2547, + "step": 13626 + }, + { + "epoch": 0.9146001812019731, + "grad_norm": 3.8070483207702637, + "learning_rate": 5.9287062577467646e-05, + "loss": 2.1324, + "step": 13628 + }, + { + "epoch": 0.9147344048857421, + "grad_norm": 3.936797618865967, + "learning_rate": 5.92763831744934e-05, + "loss": 2.503, + "step": 13630 + }, + { + "epoch": 0.9148686285695111, + "grad_norm": 4.151053428649902, + "learning_rate": 5.926570333322828e-05, + "loss": 2.3311, + "step": 13632 + }, + { + "epoch": 0.91500285225328, + "grad_norm": 4.402261734008789, + "learning_rate": 5.9255023054176895e-05, + "loss": 2.3734, + "step": 13634 + }, + { + "epoch": 0.9151370759370491, + "grad_norm": 4.320906162261963, + "learning_rate": 5.924434233784388e-05, + "loss": 2.7299, + "step": 13636 + }, + { + "epoch": 0.9152712996208181, + "grad_norm": 3.9467384815216064, + "learning_rate": 5.9233661184733856e-05, + "loss": 2.2778, + "step": 13638 + }, + { + "epoch": 0.9154055233045871, + "grad_norm": 3.9730517864227295, + "learning_rate": 5.92229795953515e-05, + "loss": 2.0809, + "step": 13640 + }, + { + "epoch": 0.915539746988356, + "grad_norm": 3.6215879917144775, + "learning_rate": 5.92122975702015e-05, + "loss": 1.88, + "step": 13642 + }, + { + "epoch": 0.9156739706721251, + "grad_norm": 3.6553962230682373, + "learning_rate": 5.9201615109788555e-05, + "loss": 2.0622, + "step": 13644 + }, + { + "epoch": 0.9158081943558941, + "grad_norm": 4.450201034545898, + "learning_rate": 5.9190932214617376e-05, + "loss": 2.1514, + "step": 13646 + }, + { + "epoch": 0.9159424180396631, + "grad_norm": 4.429653644561768, + "learning_rate": 5.9180248885192735e-05, + "loss": 2.6192, + "step": 13648 + }, + { + "epoch": 0.9160766417234321, + "grad_norm": 6.913113594055176, + "learning_rate": 5.9169565122019386e-05, + "loss": 2.2301, + "step": 13650 + }, + { + "epoch": 0.9162108654072011, + "grad_norm": 4.134647369384766, + "learning_rate": 5.915888092560212e-05, + "loss": 2.2385, + "step": 13652 + }, + { + "epoch": 0.9163450890909701, + "grad_norm": 3.836021900177002, + "learning_rate": 5.914819629644574e-05, + "loss": 2.3306, + "step": 13654 + }, + { + "epoch": 0.9164793127747392, + "grad_norm": 3.8315556049346924, + "learning_rate": 5.913751123505506e-05, + "loss": 2.4219, + "step": 13656 + }, + { + "epoch": 0.9166135364585081, + "grad_norm": 4.580314636230469, + "learning_rate": 5.912682574193497e-05, + "loss": 2.4719, + "step": 13658 + }, + { + "epoch": 0.9167477601422771, + "grad_norm": 3.7787375450134277, + "learning_rate": 5.911613981759029e-05, + "loss": 2.2204, + "step": 13660 + }, + { + "epoch": 0.9168819838260461, + "grad_norm": 4.543973445892334, + "learning_rate": 5.9105453462525915e-05, + "loss": 2.3548, + "step": 13662 + }, + { + "epoch": 0.9170162075098152, + "grad_norm": 6.096688270568848, + "learning_rate": 5.90947666772468e-05, + "loss": 2.3424, + "step": 13664 + }, + { + "epoch": 0.9171504311935841, + "grad_norm": 4.0211262702941895, + "learning_rate": 5.9084079462257824e-05, + "loss": 2.3055, + "step": 13666 + }, + { + "epoch": 0.9172846548773531, + "grad_norm": 4.428564548492432, + "learning_rate": 5.907339181806397e-05, + "loss": 2.298, + "step": 13668 + }, + { + "epoch": 0.9174188785611221, + "grad_norm": 4.08590030670166, + "learning_rate": 5.906270374517019e-05, + "loss": 2.1561, + "step": 13670 + }, + { + "epoch": 0.9175531022448911, + "grad_norm": 4.163895606994629, + "learning_rate": 5.905201524408148e-05, + "loss": 2.4161, + "step": 13672 + }, + { + "epoch": 0.9176873259286601, + "grad_norm": 3.930108070373535, + "learning_rate": 5.9041326315302835e-05, + "loss": 2.3627, + "step": 13674 + }, + { + "epoch": 0.9178215496124291, + "grad_norm": 4.580862522125244, + "learning_rate": 5.903063695933931e-05, + "loss": 2.4779, + "step": 13676 + }, + { + "epoch": 0.9179557732961982, + "grad_norm": 4.047738552093506, + "learning_rate": 5.9019947176695954e-05, + "loss": 2.5681, + "step": 13678 + }, + { + "epoch": 0.9180899969799671, + "grad_norm": 4.236265659332275, + "learning_rate": 5.900925696787783e-05, + "loss": 2.395, + "step": 13680 + }, + { + "epoch": 0.9182242206637361, + "grad_norm": 3.8624672889709473, + "learning_rate": 5.899856633339003e-05, + "loss": 2.3268, + "step": 13682 + }, + { + "epoch": 0.9183584443475051, + "grad_norm": 4.354371547698975, + "learning_rate": 5.898787527373766e-05, + "loss": 2.1824, + "step": 13684 + }, + { + "epoch": 0.9184926680312742, + "grad_norm": 3.6684389114379883, + "learning_rate": 5.897718378942586e-05, + "loss": 2.1541, + "step": 13686 + }, + { + "epoch": 0.9186268917150431, + "grad_norm": 3.898897647857666, + "learning_rate": 5.8966491880959775e-05, + "loss": 2.3158, + "step": 13688 + }, + { + "epoch": 0.9187611153988121, + "grad_norm": 5.716151714324951, + "learning_rate": 5.895579954884458e-05, + "loss": 2.1231, + "step": 13690 + }, + { + "epoch": 0.9188953390825811, + "grad_norm": 4.028767108917236, + "learning_rate": 5.894510679358547e-05, + "loss": 2.0301, + "step": 13692 + }, + { + "epoch": 0.9190295627663502, + "grad_norm": 3.729539155960083, + "learning_rate": 5.8934413615687655e-05, + "loss": 2.2034, + "step": 13694 + }, + { + "epoch": 0.9191637864501191, + "grad_norm": 3.4697389602661133, + "learning_rate": 5.892372001565637e-05, + "loss": 2.3777, + "step": 13696 + }, + { + "epoch": 0.9192980101338881, + "grad_norm": 4.457888126373291, + "learning_rate": 5.891302599399685e-05, + "loss": 2.1608, + "step": 13698 + }, + { + "epoch": 0.9194322338176572, + "grad_norm": 3.8556292057037354, + "learning_rate": 5.8902331551214387e-05, + "loss": 2.228, + "step": 13700 + }, + { + "epoch": 0.9195664575014262, + "grad_norm": 4.238424301147461, + "learning_rate": 5.8891636687814276e-05, + "loss": 2.25, + "step": 13702 + }, + { + "epoch": 0.9197006811851951, + "grad_norm": 4.138828754425049, + "learning_rate": 5.8880941404301795e-05, + "loss": 2.2608, + "step": 13704 + }, + { + "epoch": 0.9198349048689641, + "grad_norm": 4.411657810211182, + "learning_rate": 5.887024570118231e-05, + "loss": 2.3145, + "step": 13706 + }, + { + "epoch": 0.9199691285527332, + "grad_norm": 3.5891294479370117, + "learning_rate": 5.885954957896115e-05, + "loss": 2.1134, + "step": 13708 + }, + { + "epoch": 0.9201033522365021, + "grad_norm": 3.615011215209961, + "learning_rate": 5.884885303814369e-05, + "loss": 2.1883, + "step": 13710 + }, + { + "epoch": 0.9202375759202711, + "grad_norm": 3.998955726623535, + "learning_rate": 5.8838156079235326e-05, + "loss": 2.1482, + "step": 13712 + }, + { + "epoch": 0.9203717996040401, + "grad_norm": 4.143151760101318, + "learning_rate": 5.8827458702741465e-05, + "loss": 2.3615, + "step": 13714 + }, + { + "epoch": 0.9205060232878092, + "grad_norm": 3.611841917037964, + "learning_rate": 5.881676090916756e-05, + "loss": 2.2538, + "step": 13716 + }, + { + "epoch": 0.9206402469715781, + "grad_norm": 4.0479512214660645, + "learning_rate": 5.880606269901902e-05, + "loss": 2.2712, + "step": 13718 + }, + { + "epoch": 0.9207744706553471, + "grad_norm": 4.313901424407959, + "learning_rate": 5.879536407280134e-05, + "loss": 2.228, + "step": 13720 + }, + { + "epoch": 0.9209086943391162, + "grad_norm": 5.103628158569336, + "learning_rate": 5.8784665031020004e-05, + "loss": 2.4081, + "step": 13722 + }, + { + "epoch": 0.9210429180228852, + "grad_norm": 3.8826887607574463, + "learning_rate": 5.87739655741805e-05, + "loss": 2.0933, + "step": 13724 + }, + { + "epoch": 0.9211771417066541, + "grad_norm": 3.4727020263671875, + "learning_rate": 5.8763265702788404e-05, + "loss": 1.9149, + "step": 13726 + }, + { + "epoch": 0.9213113653904231, + "grad_norm": 4.1810150146484375, + "learning_rate": 5.8752565417349215e-05, + "loss": 2.2471, + "step": 13728 + }, + { + "epoch": 0.9214455890741922, + "grad_norm": 3.8833372592926025, + "learning_rate": 5.874186471836854e-05, + "loss": 2.5587, + "step": 13730 + }, + { + "epoch": 0.9215798127579612, + "grad_norm": 4.3609113693237305, + "learning_rate": 5.8731163606351933e-05, + "loss": 2.5211, + "step": 13732 + }, + { + "epoch": 0.9217140364417301, + "grad_norm": 4.081363677978516, + "learning_rate": 5.872046208180503e-05, + "loss": 2.2117, + "step": 13734 + }, + { + "epoch": 0.9218482601254991, + "grad_norm": 4.0930094718933105, + "learning_rate": 5.8709760145233416e-05, + "loss": 2.3947, + "step": 13736 + }, + { + "epoch": 0.9219824838092682, + "grad_norm": 3.9564521312713623, + "learning_rate": 5.869905779714278e-05, + "loss": 2.1821, + "step": 13738 + }, + { + "epoch": 0.9221167074930371, + "grad_norm": 5.1200852394104, + "learning_rate": 5.868835503803877e-05, + "loss": 2.4971, + "step": 13740 + }, + { + "epoch": 0.9222509311768061, + "grad_norm": 7.13456916809082, + "learning_rate": 5.867765186842706e-05, + "loss": 2.2628, + "step": 13742 + }, + { + "epoch": 0.9223851548605752, + "grad_norm": 3.8519351482391357, + "learning_rate": 5.866694828881337e-05, + "loss": 2.249, + "step": 13744 + }, + { + "epoch": 0.9225193785443442, + "grad_norm": 4.050134181976318, + "learning_rate": 5.8656244299703414e-05, + "loss": 2.3747, + "step": 13746 + }, + { + "epoch": 0.9226536022281131, + "grad_norm": 4.573965549468994, + "learning_rate": 5.864553990160294e-05, + "loss": 2.5102, + "step": 13748 + }, + { + "epoch": 0.9227878259118821, + "grad_norm": 4.552481651306152, + "learning_rate": 5.86348350950177e-05, + "loss": 2.4408, + "step": 13750 + }, + { + "epoch": 0.9229220495956512, + "grad_norm": 4.3213791847229, + "learning_rate": 5.8624129880453485e-05, + "loss": 2.3098, + "step": 13752 + }, + { + "epoch": 0.9230562732794202, + "grad_norm": 3.98532772064209, + "learning_rate": 5.8613424258416094e-05, + "loss": 2.4451, + "step": 13754 + }, + { + "epoch": 0.9231904969631891, + "grad_norm": 4.16300630569458, + "learning_rate": 5.8602718229411335e-05, + "loss": 2.1051, + "step": 13756 + }, + { + "epoch": 0.9233247206469581, + "grad_norm": 3.833183526992798, + "learning_rate": 5.859201179394508e-05, + "loss": 2.1599, + "step": 13758 + }, + { + "epoch": 0.9234589443307272, + "grad_norm": 3.6441497802734375, + "learning_rate": 5.858130495252314e-05, + "loss": 2.3437, + "step": 13760 + }, + { + "epoch": 0.9235931680144962, + "grad_norm": 4.569154262542725, + "learning_rate": 5.857059770565142e-05, + "loss": 2.3654, + "step": 13762 + }, + { + "epoch": 0.9237273916982651, + "grad_norm": 3.9722697734832764, + "learning_rate": 5.855989005383581e-05, + "loss": 2.15, + "step": 13764 + }, + { + "epoch": 0.9238616153820342, + "grad_norm": 4.607711315155029, + "learning_rate": 5.8549181997582225e-05, + "loss": 2.5987, + "step": 13766 + }, + { + "epoch": 0.9239958390658032, + "grad_norm": 5.998827934265137, + "learning_rate": 5.853847353739663e-05, + "loss": 2.1168, + "step": 13768 + }, + { + "epoch": 0.9241300627495722, + "grad_norm": 4.024709224700928, + "learning_rate": 5.852776467378492e-05, + "loss": 2.4834, + "step": 13770 + }, + { + "epoch": 0.9242642864333411, + "grad_norm": 4.089205265045166, + "learning_rate": 5.8517055407253115e-05, + "loss": 2.3923, + "step": 13772 + }, + { + "epoch": 0.9243985101171102, + "grad_norm": 4.150171279907227, + "learning_rate": 5.850634573830718e-05, + "loss": 2.0743, + "step": 13774 + }, + { + "epoch": 0.9245327338008792, + "grad_norm": 4.649496555328369, + "learning_rate": 5.849563566745313e-05, + "loss": 2.2868, + "step": 13776 + }, + { + "epoch": 0.9246669574846481, + "grad_norm": 3.8351352214813232, + "learning_rate": 5.8484925195197016e-05, + "loss": 2.1306, + "step": 13778 + }, + { + "epoch": 0.9248011811684171, + "grad_norm": 3.8750414848327637, + "learning_rate": 5.847421432204486e-05, + "loss": 2.1749, + "step": 13780 + }, + { + "epoch": 0.9249354048521862, + "grad_norm": 5.1706390380859375, + "learning_rate": 5.846350304850274e-05, + "loss": 2.2998, + "step": 13782 + }, + { + "epoch": 0.9250696285359552, + "grad_norm": 3.796574115753174, + "learning_rate": 5.845279137507675e-05, + "loss": 2.1537, + "step": 13784 + }, + { + "epoch": 0.9252038522197241, + "grad_norm": 7.882443904876709, + "learning_rate": 5.8442079302273e-05, + "loss": 2.1407, + "step": 13786 + }, + { + "epoch": 0.9253380759034932, + "grad_norm": 5.377003192901611, + "learning_rate": 5.843136683059758e-05, + "loss": 2.3275, + "step": 13788 + }, + { + "epoch": 0.9254722995872622, + "grad_norm": 4.5544209480285645, + "learning_rate": 5.842065396055667e-05, + "loss": 2.3385, + "step": 13790 + }, + { + "epoch": 0.9256065232710312, + "grad_norm": 3.98335337638855, + "learning_rate": 5.8409940692656416e-05, + "loss": 2.3706, + "step": 13792 + }, + { + "epoch": 0.9257407469548001, + "grad_norm": 3.712745189666748, + "learning_rate": 5.8399227027403e-05, + "loss": 2.1918, + "step": 13794 + }, + { + "epoch": 0.9258749706385692, + "grad_norm": 6.450337886810303, + "learning_rate": 5.838851296530263e-05, + "loss": 2.0591, + "step": 13796 + }, + { + "epoch": 0.9260091943223382, + "grad_norm": 3.9920616149902344, + "learning_rate": 5.837779850686152e-05, + "loss": 2.3145, + "step": 13798 + }, + { + "epoch": 0.9261434180061072, + "grad_norm": 4.054947376251221, + "learning_rate": 5.836708365258589e-05, + "loss": 2.4483, + "step": 13800 + }, + { + "epoch": 0.9262776416898761, + "grad_norm": 3.609114408493042, + "learning_rate": 5.835636840298202e-05, + "loss": 2.1967, + "step": 13802 + }, + { + "epoch": 0.9264118653736452, + "grad_norm": 3.6542861461639404, + "learning_rate": 5.834565275855617e-05, + "loss": 2.1719, + "step": 13804 + }, + { + "epoch": 0.9265460890574142, + "grad_norm": 4.137095928192139, + "learning_rate": 5.833493671981465e-05, + "loss": 2.0669, + "step": 13806 + }, + { + "epoch": 0.9266803127411832, + "grad_norm": 3.5536000728607178, + "learning_rate": 5.832422028726375e-05, + "loss": 1.9429, + "step": 13808 + }, + { + "epoch": 0.9268145364249522, + "grad_norm": 4.263477325439453, + "learning_rate": 5.8313503461409826e-05, + "loss": 2.1951, + "step": 13810 + }, + { + "epoch": 0.9269487601087212, + "grad_norm": 4.169956684112549, + "learning_rate": 5.830278624275919e-05, + "loss": 2.1549, + "step": 13812 + }, + { + "epoch": 0.9270829837924902, + "grad_norm": 4.922275066375732, + "learning_rate": 5.829206863181823e-05, + "loss": 2.2071, + "step": 13814 + }, + { + "epoch": 0.9272172074762591, + "grad_norm": 3.9834465980529785, + "learning_rate": 5.8281350629093346e-05, + "loss": 2.2396, + "step": 13816 + }, + { + "epoch": 0.9273514311600282, + "grad_norm": 4.072121620178223, + "learning_rate": 5.8270632235090916e-05, + "loss": 2.2771, + "step": 13818 + }, + { + "epoch": 0.9274856548437972, + "grad_norm": 4.308944225311279, + "learning_rate": 5.825991345031739e-05, + "loss": 2.2806, + "step": 13820 + }, + { + "epoch": 0.9276198785275662, + "grad_norm": 4.049676418304443, + "learning_rate": 5.82491942752792e-05, + "loss": 2.4103, + "step": 13822 + }, + { + "epoch": 0.9277541022113351, + "grad_norm": 3.8209712505340576, + "learning_rate": 5.823847471048279e-05, + "loss": 2.1356, + "step": 13824 + }, + { + "epoch": 0.9278883258951042, + "grad_norm": 3.9184651374816895, + "learning_rate": 5.822775475643465e-05, + "loss": 2.9138, + "step": 13826 + }, + { + "epoch": 0.9280225495788732, + "grad_norm": 3.9393651485443115, + "learning_rate": 5.821703441364128e-05, + "loss": 2.5299, + "step": 13828 + }, + { + "epoch": 0.9281567732626422, + "grad_norm": 3.746563673019409, + "learning_rate": 5.820631368260919e-05, + "loss": 2.2275, + "step": 13830 + }, + { + "epoch": 0.9282909969464112, + "grad_norm": 4.460071563720703, + "learning_rate": 5.819559256384492e-05, + "loss": 2.0919, + "step": 13832 + }, + { + "epoch": 0.9284252206301802, + "grad_norm": 3.757009267807007, + "learning_rate": 5.818487105785502e-05, + "loss": 2.2286, + "step": 13834 + }, + { + "epoch": 0.9285594443139492, + "grad_norm": 4.2085161209106445, + "learning_rate": 5.8174149165146044e-05, + "loss": 2.081, + "step": 13836 + }, + { + "epoch": 0.9286936679977182, + "grad_norm": 4.202446460723877, + "learning_rate": 5.816342688622462e-05, + "loss": 2.3873, + "step": 13838 + }, + { + "epoch": 0.9288278916814872, + "grad_norm": 5.572206497192383, + "learning_rate": 5.815270422159731e-05, + "loss": 2.2524, + "step": 13840 + }, + { + "epoch": 0.9289621153652562, + "grad_norm": 6.615719795227051, + "learning_rate": 5.8141981171770755e-05, + "loss": 2.5016, + "step": 13842 + }, + { + "epoch": 0.9290963390490252, + "grad_norm": 3.6652913093566895, + "learning_rate": 5.81312577372516e-05, + "loss": 2.3985, + "step": 13844 + }, + { + "epoch": 0.9292305627327943, + "grad_norm": 4.178828716278076, + "learning_rate": 5.8120533918546506e-05, + "loss": 2.2836, + "step": 13846 + }, + { + "epoch": 0.9293647864165632, + "grad_norm": 4.516983985900879, + "learning_rate": 5.8109809716162164e-05, + "loss": 2.3656, + "step": 13848 + }, + { + "epoch": 0.9294990101003322, + "grad_norm": 3.776489734649658, + "learning_rate": 5.809908513060524e-05, + "loss": 2.3268, + "step": 13850 + }, + { + "epoch": 0.9296332337841012, + "grad_norm": 4.629979610443115, + "learning_rate": 5.8088360162382486e-05, + "loss": 2.1628, + "step": 13852 + }, + { + "epoch": 0.9297674574678702, + "grad_norm": 3.750661849975586, + "learning_rate": 5.8077634812000614e-05, + "loss": 2.2691, + "step": 13854 + }, + { + "epoch": 0.9299016811516392, + "grad_norm": 3.9753196239471436, + "learning_rate": 5.806690907996638e-05, + "loss": 2.2479, + "step": 13856 + }, + { + "epoch": 0.9300359048354082, + "grad_norm": 4.585379600524902, + "learning_rate": 5.8056182966786566e-05, + "loss": 2.4501, + "step": 13858 + }, + { + "epoch": 0.9301701285191772, + "grad_norm": 4.432864189147949, + "learning_rate": 5.804545647296793e-05, + "loss": 2.2161, + "step": 13860 + }, + { + "epoch": 0.9303043522029462, + "grad_norm": 6.650144577026367, + "learning_rate": 5.803472959901731e-05, + "loss": 2.322, + "step": 13862 + }, + { + "epoch": 0.9304385758867152, + "grad_norm": 4.385673999786377, + "learning_rate": 5.802400234544152e-05, + "loss": 2.6436, + "step": 13864 + }, + { + "epoch": 0.9305727995704842, + "grad_norm": 4.319478988647461, + "learning_rate": 5.801327471274738e-05, + "loss": 2.328, + "step": 13866 + }, + { + "epoch": 0.9307070232542533, + "grad_norm": 4.189038276672363, + "learning_rate": 5.8002546701441785e-05, + "loss": 2.5122, + "step": 13868 + }, + { + "epoch": 0.9308412469380222, + "grad_norm": 4.06777811050415, + "learning_rate": 5.7991818312031575e-05, + "loss": 2.1702, + "step": 13870 + }, + { + "epoch": 0.9309754706217912, + "grad_norm": 3.855466604232788, + "learning_rate": 5.798108954502368e-05, + "loss": 2.1388, + "step": 13872 + }, + { + "epoch": 0.9311096943055602, + "grad_norm": 3.5025246143341064, + "learning_rate": 5.7970360400924994e-05, + "loss": 2.0749, + "step": 13874 + }, + { + "epoch": 0.9312439179893293, + "grad_norm": 4.122305393218994, + "learning_rate": 5.795963088024247e-05, + "loss": 2.318, + "step": 13876 + }, + { + "epoch": 0.9313781416730982, + "grad_norm": 4.128359794616699, + "learning_rate": 5.794890098348301e-05, + "loss": 2.3936, + "step": 13878 + }, + { + "epoch": 0.9315123653568672, + "grad_norm": 4.260269641876221, + "learning_rate": 5.7938170711153614e-05, + "loss": 2.3867, + "step": 13880 + }, + { + "epoch": 0.9316465890406362, + "grad_norm": 4.132872581481934, + "learning_rate": 5.792744006376127e-05, + "loss": 2.1855, + "step": 13882 + }, + { + "epoch": 0.9317808127244053, + "grad_norm": 4.139801979064941, + "learning_rate": 5.791670904181297e-05, + "loss": 2.1836, + "step": 13884 + }, + { + "epoch": 0.9319150364081742, + "grad_norm": 4.336869239807129, + "learning_rate": 5.7905977645815745e-05, + "loss": 2.3176, + "step": 13886 + }, + { + "epoch": 0.9320492600919432, + "grad_norm": 4.261433124542236, + "learning_rate": 5.789524587627661e-05, + "loss": 2.0469, + "step": 13888 + }, + { + "epoch": 0.9321834837757123, + "grad_norm": 3.8923168182373047, + "learning_rate": 5.788451373370263e-05, + "loss": 2.3449, + "step": 13890 + }, + { + "epoch": 0.9323177074594812, + "grad_norm": 4.081993103027344, + "learning_rate": 5.78737812186009e-05, + "loss": 2.4665, + "step": 13892 + }, + { + "epoch": 0.9324519311432502, + "grad_norm": 4.048374652862549, + "learning_rate": 5.7863048331478466e-05, + "loss": 2.3849, + "step": 13894 + }, + { + "epoch": 0.9325861548270192, + "grad_norm": 3.8340957164764404, + "learning_rate": 5.785231507284248e-05, + "loss": 2.4203, + "step": 13896 + }, + { + "epoch": 0.9327203785107883, + "grad_norm": 4.695769309997559, + "learning_rate": 5.7841581443200035e-05, + "loss": 2.3357, + "step": 13898 + }, + { + "epoch": 0.9328546021945572, + "grad_norm": 3.701050043106079, + "learning_rate": 5.783084744305829e-05, + "loss": 2.1667, + "step": 13900 + }, + { + "epoch": 0.9329888258783262, + "grad_norm": 3.493070125579834, + "learning_rate": 5.7820113072924395e-05, + "loss": 2.1013, + "step": 13902 + }, + { + "epoch": 0.9331230495620952, + "grad_norm": 4.144317150115967, + "learning_rate": 5.780937833330554e-05, + "loss": 2.3286, + "step": 13904 + }, + { + "epoch": 0.9332572732458643, + "grad_norm": 4.1045355796813965, + "learning_rate": 5.779864322470894e-05, + "loss": 2.4273, + "step": 13906 + }, + { + "epoch": 0.9333914969296332, + "grad_norm": 4.455148220062256, + "learning_rate": 5.778790774764176e-05, + "loss": 2.0796, + "step": 13908 + }, + { + "epoch": 0.9335257206134022, + "grad_norm": 4.103682041168213, + "learning_rate": 5.777717190261125e-05, + "loss": 2.3588, + "step": 13910 + }, + { + "epoch": 0.9336599442971713, + "grad_norm": 3.987415313720703, + "learning_rate": 5.7766435690124667e-05, + "loss": 2.1929, + "step": 13912 + }, + { + "epoch": 0.9337941679809403, + "grad_norm": 3.8775322437286377, + "learning_rate": 5.775569911068925e-05, + "loss": 1.9314, + "step": 13914 + }, + { + "epoch": 0.9339283916647092, + "grad_norm": 4.492028713226318, + "learning_rate": 5.774496216481233e-05, + "loss": 2.2462, + "step": 13916 + }, + { + "epoch": 0.9340626153484782, + "grad_norm": 3.76678729057312, + "learning_rate": 5.773422485300116e-05, + "loss": 2.3506, + "step": 13918 + }, + { + "epoch": 0.9341968390322473, + "grad_norm": 4.1852617263793945, + "learning_rate": 5.772348717576309e-05, + "loss": 2.4187, + "step": 13920 + }, + { + "epoch": 0.9343310627160163, + "grad_norm": 4.061026096343994, + "learning_rate": 5.771274913360543e-05, + "loss": 2.3023, + "step": 13922 + }, + { + "epoch": 0.9344652863997852, + "grad_norm": 3.8341317176818848, + "learning_rate": 5.7702010727035536e-05, + "loss": 2.0414, + "step": 13924 + }, + { + "epoch": 0.9345995100835542, + "grad_norm": 6.66243314743042, + "learning_rate": 5.769127195656079e-05, + "loss": 2.355, + "step": 13926 + }, + { + "epoch": 0.9347337337673233, + "grad_norm": 3.9116806983947754, + "learning_rate": 5.768053282268855e-05, + "loss": 2.2606, + "step": 13928 + }, + { + "epoch": 0.9348679574510922, + "grad_norm": 3.8069653511047363, + "learning_rate": 5.766979332592626e-05, + "loss": 2.0819, + "step": 13930 + }, + { + "epoch": 0.9350021811348612, + "grad_norm": 3.7005391120910645, + "learning_rate": 5.76590534667813e-05, + "loss": 2.0937, + "step": 13932 + }, + { + "epoch": 0.9351364048186303, + "grad_norm": 4.167251110076904, + "learning_rate": 5.764831324576113e-05, + "loss": 2.3364, + "step": 13934 + }, + { + "epoch": 0.9352706285023993, + "grad_norm": 4.1704182624816895, + "learning_rate": 5.7637572663373194e-05, + "loss": 2.2625, + "step": 13936 + }, + { + "epoch": 0.9354048521861682, + "grad_norm": 3.919283390045166, + "learning_rate": 5.762683172012498e-05, + "loss": 1.8597, + "step": 13938 + }, + { + "epoch": 0.9355390758699372, + "grad_norm": 4.124593257904053, + "learning_rate": 5.761609041652396e-05, + "loss": 2.1747, + "step": 13940 + }, + { + "epoch": 0.9356732995537063, + "grad_norm": 3.7897355556488037, + "learning_rate": 5.7605348753077634e-05, + "loss": 2.0229, + "step": 13942 + }, + { + "epoch": 0.9358075232374753, + "grad_norm": 4.040798187255859, + "learning_rate": 5.7594606730293554e-05, + "loss": 2.3241, + "step": 13944 + }, + { + "epoch": 0.9359417469212442, + "grad_norm": 4.085429668426514, + "learning_rate": 5.7583864348679226e-05, + "loss": 2.4577, + "step": 13946 + }, + { + "epoch": 0.9360759706050132, + "grad_norm": 4.521383285522461, + "learning_rate": 5.7573121608742234e-05, + "loss": 2.4016, + "step": 13948 + }, + { + "epoch": 0.9362101942887823, + "grad_norm": 4.624905586242676, + "learning_rate": 5.7562378510990125e-05, + "loss": 2.3275, + "step": 13950 + }, + { + "epoch": 0.9363444179725513, + "grad_norm": 4.138852596282959, + "learning_rate": 5.755163505593051e-05, + "loss": 2.1589, + "step": 13952 + }, + { + "epoch": 0.9364786416563202, + "grad_norm": 4.621596336364746, + "learning_rate": 5.754089124407097e-05, + "loss": 2.3324, + "step": 13954 + }, + { + "epoch": 0.9366128653400893, + "grad_norm": 3.9945695400238037, + "learning_rate": 5.753014707591916e-05, + "loss": 2.1712, + "step": 13956 + }, + { + "epoch": 0.9367470890238583, + "grad_norm": 4.886146068572998, + "learning_rate": 5.751940255198272e-05, + "loss": 2.4344, + "step": 13958 + }, + { + "epoch": 0.9368813127076273, + "grad_norm": 4.4729084968566895, + "learning_rate": 5.750865767276927e-05, + "loss": 2.1681, + "step": 13960 + }, + { + "epoch": 0.9370155363913962, + "grad_norm": 4.156130790710449, + "learning_rate": 5.7497912438786536e-05, + "loss": 2.6165, + "step": 13962 + }, + { + "epoch": 0.9371497600751653, + "grad_norm": 4.416032314300537, + "learning_rate": 5.7487166850542165e-05, + "loss": 2.288, + "step": 13964 + }, + { + "epoch": 0.9372839837589343, + "grad_norm": 4.019629955291748, + "learning_rate": 5.74764209085439e-05, + "loss": 2.1355, + "step": 13966 + }, + { + "epoch": 0.9374182074427032, + "grad_norm": 4.030394554138184, + "learning_rate": 5.746567461329943e-05, + "loss": 2.155, + "step": 13968 + }, + { + "epoch": 0.9375524311264722, + "grad_norm": 3.9533560276031494, + "learning_rate": 5.7454927965316516e-05, + "loss": 2.0205, + "step": 13970 + }, + { + "epoch": 0.9376866548102413, + "grad_norm": 3.8851256370544434, + "learning_rate": 5.7444180965102936e-05, + "loss": 2.2272, + "step": 13972 + }, + { + "epoch": 0.9378208784940103, + "grad_norm": 4.209317207336426, + "learning_rate": 5.743343361316644e-05, + "loss": 2.4312, + "step": 13974 + }, + { + "epoch": 0.9379551021777792, + "grad_norm": 4.623584270477295, + "learning_rate": 5.742268591001481e-05, + "loss": 2.3998, + "step": 13976 + }, + { + "epoch": 0.9380893258615483, + "grad_norm": 4.329966068267822, + "learning_rate": 5.741193785615587e-05, + "loss": 2.3809, + "step": 13978 + }, + { + "epoch": 0.9382235495453173, + "grad_norm": 7.111311435699463, + "learning_rate": 5.740118945209744e-05, + "loss": 2.0472, + "step": 13980 + }, + { + "epoch": 0.9383577732290863, + "grad_norm": 3.563483476638794, + "learning_rate": 5.739044069834737e-05, + "loss": 2.1527, + "step": 13982 + }, + { + "epoch": 0.9384919969128552, + "grad_norm": 4.6202545166015625, + "learning_rate": 5.73796915954135e-05, + "loss": 2.3795, + "step": 13984 + }, + { + "epoch": 0.9386262205966243, + "grad_norm": 4.171895980834961, + "learning_rate": 5.7368942143803725e-05, + "loss": 2.5062, + "step": 13986 + }, + { + "epoch": 0.9387604442803933, + "grad_norm": 3.7840776443481445, + "learning_rate": 5.735819234402591e-05, + "loss": 2.2051, + "step": 13988 + }, + { + "epoch": 0.9388946679641623, + "grad_norm": 5.402760982513428, + "learning_rate": 5.7347442196587986e-05, + "loss": 2.2627, + "step": 13990 + }, + { + "epoch": 0.9390288916479312, + "grad_norm": 3.8733627796173096, + "learning_rate": 5.7336691701997866e-05, + "loss": 2.3522, + "step": 13992 + }, + { + "epoch": 0.9391631153317003, + "grad_norm": 3.777519941329956, + "learning_rate": 5.732594086076348e-05, + "loss": 2.3813, + "step": 13994 + }, + { + "epoch": 0.9392973390154693, + "grad_norm": 3.776413917541504, + "learning_rate": 5.731518967339281e-05, + "loss": 2.6689, + "step": 13996 + }, + { + "epoch": 0.9394315626992383, + "grad_norm": 4.160248279571533, + "learning_rate": 5.730443814039379e-05, + "loss": 2.3024, + "step": 13998 + }, + { + "epoch": 0.9395657863830073, + "grad_norm": 4.402238368988037, + "learning_rate": 5.729368626227446e-05, + "loss": 2.3807, + "step": 14000 + }, + { + "epoch": 0.9397000100667763, + "grad_norm": 3.461923360824585, + "learning_rate": 5.728293403954278e-05, + "loss": 2.2478, + "step": 14002 + }, + { + "epoch": 0.9398342337505453, + "grad_norm": 3.9442498683929443, + "learning_rate": 5.727218147270678e-05, + "loss": 2.5891, + "step": 14004 + }, + { + "epoch": 0.9399684574343142, + "grad_norm": 4.885064125061035, + "learning_rate": 5.726142856227452e-05, + "loss": 2.3523, + "step": 14006 + }, + { + "epoch": 0.9401026811180833, + "grad_norm": 3.919591188430786, + "learning_rate": 5.725067530875403e-05, + "loss": 2.2822, + "step": 14008 + }, + { + "epoch": 0.9402369048018523, + "grad_norm": 4.069267272949219, + "learning_rate": 5.72399217126534e-05, + "loss": 2.1065, + "step": 14010 + }, + { + "epoch": 0.9403711284856213, + "grad_norm": 3.734842538833618, + "learning_rate": 5.722916777448069e-05, + "loss": 2.3273, + "step": 14012 + }, + { + "epoch": 0.9405053521693902, + "grad_norm": 3.8212363719940186, + "learning_rate": 5.721841349474404e-05, + "loss": 2.2173, + "step": 14014 + }, + { + "epoch": 0.9406395758531593, + "grad_norm": 3.73728084564209, + "learning_rate": 5.720765887395153e-05, + "loss": 2.3783, + "step": 14016 + }, + { + "epoch": 0.9407737995369283, + "grad_norm": 4.2850236892700195, + "learning_rate": 5.719690391261131e-05, + "loss": 2.3815, + "step": 14018 + }, + { + "epoch": 0.9409080232206973, + "grad_norm": 4.111204624176025, + "learning_rate": 5.718614861123155e-05, + "loss": 2.3525, + "step": 14020 + }, + { + "epoch": 0.9410422469044663, + "grad_norm": 7.803344249725342, + "learning_rate": 5.717539297032039e-05, + "loss": 2.6423, + "step": 14022 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 4.1928391456604, + "learning_rate": 5.716463699038602e-05, + "loss": 2.2549, + "step": 14024 + }, + { + "epoch": 0.9413106942720043, + "grad_norm": 4.40097188949585, + "learning_rate": 5.7153880671936635e-05, + "loss": 2.7675, + "step": 14026 + }, + { + "epoch": 0.9414449179557733, + "grad_norm": 4.632482528686523, + "learning_rate": 5.7143124015480466e-05, + "loss": 2.6447, + "step": 14028 + }, + { + "epoch": 0.9415791416395423, + "grad_norm": 6.084924697875977, + "learning_rate": 5.713236702152572e-05, + "loss": 2.4924, + "step": 14030 + }, + { + "epoch": 0.9417133653233113, + "grad_norm": 3.6600189208984375, + "learning_rate": 5.7121609690580666e-05, + "loss": 2.0261, + "step": 14032 + }, + { + "epoch": 0.9418475890070803, + "grad_norm": 7.827619552612305, + "learning_rate": 5.711085202315356e-05, + "loss": 2.3025, + "step": 14034 + }, + { + "epoch": 0.9419818126908494, + "grad_norm": 3.56774640083313, + "learning_rate": 5.710009401975268e-05, + "loss": 2.2894, + "step": 14036 + }, + { + "epoch": 0.9421160363746183, + "grad_norm": 3.9378530979156494, + "learning_rate": 5.708933568088632e-05, + "loss": 2.3216, + "step": 14038 + }, + { + "epoch": 0.9422502600583873, + "grad_norm": 3.8983917236328125, + "learning_rate": 5.707857700706278e-05, + "loss": 2.2571, + "step": 14040 + }, + { + "epoch": 0.9423844837421563, + "grad_norm": 4.43137264251709, + "learning_rate": 5.706781799879041e-05, + "loss": 2.4317, + "step": 14042 + }, + { + "epoch": 0.9425187074259253, + "grad_norm": 4.026076316833496, + "learning_rate": 5.705705865657753e-05, + "loss": 2.0856, + "step": 14044 + }, + { + "epoch": 0.9426529311096943, + "grad_norm": 4.095816135406494, + "learning_rate": 5.704629898093251e-05, + "loss": 2.014, + "step": 14046 + }, + { + "epoch": 0.9427871547934633, + "grad_norm": 3.9125959873199463, + "learning_rate": 5.703553897236372e-05, + "loss": 2.3443, + "step": 14048 + }, + { + "epoch": 0.9429213784772323, + "grad_norm": 4.094547748565674, + "learning_rate": 5.702477863137954e-05, + "loss": 2.1064, + "step": 14050 + }, + { + "epoch": 0.9430556021610013, + "grad_norm": 3.859708786010742, + "learning_rate": 5.7014017958488375e-05, + "loss": 2.0644, + "step": 14052 + }, + { + "epoch": 0.9431898258447703, + "grad_norm": 4.1251068115234375, + "learning_rate": 5.700325695419868e-05, + "loss": 2.359, + "step": 14054 + }, + { + "epoch": 0.9433240495285393, + "grad_norm": 3.6594369411468506, + "learning_rate": 5.699249561901884e-05, + "loss": 2.3048, + "step": 14056 + }, + { + "epoch": 0.9434582732123084, + "grad_norm": 4.406859874725342, + "learning_rate": 5.698173395345735e-05, + "loss": 2.7895, + "step": 14058 + }, + { + "epoch": 0.9435924968960773, + "grad_norm": 4.152685642242432, + "learning_rate": 5.6970971958022644e-05, + "loss": 2.2957, + "step": 14060 + }, + { + "epoch": 0.9437267205798463, + "grad_norm": 4.281815528869629, + "learning_rate": 5.696020963322324e-05, + "loss": 2.2464, + "step": 14062 + }, + { + "epoch": 0.9438609442636153, + "grad_norm": 3.7686400413513184, + "learning_rate": 5.69494469795676e-05, + "loss": 2.404, + "step": 14064 + }, + { + "epoch": 0.9439951679473844, + "grad_norm": 4.03279447555542, + "learning_rate": 5.693868399756426e-05, + "loss": 2.4692, + "step": 14066 + }, + { + "epoch": 0.9441293916311533, + "grad_norm": 3.59029221534729, + "learning_rate": 5.692792068772176e-05, + "loss": 2.0376, + "step": 14068 + }, + { + "epoch": 0.9442636153149223, + "grad_norm": 3.663301706314087, + "learning_rate": 5.691715705054861e-05, + "loss": 2.2002, + "step": 14070 + }, + { + "epoch": 0.9443978389986913, + "grad_norm": 3.9416494369506836, + "learning_rate": 5.69063930865534e-05, + "loss": 2.4276, + "step": 14072 + }, + { + "epoch": 0.9445320626824604, + "grad_norm": 4.283918380737305, + "learning_rate": 5.689562879624469e-05, + "loss": 2.0603, + "step": 14074 + }, + { + "epoch": 0.9446662863662293, + "grad_norm": 4.517065048217773, + "learning_rate": 5.68848641801311e-05, + "loss": 2.1412, + "step": 14076 + }, + { + "epoch": 0.9448005100499983, + "grad_norm": 5.047810077667236, + "learning_rate": 5.68740992387212e-05, + "loss": 2.2514, + "step": 14078 + }, + { + "epoch": 0.9449347337337674, + "grad_norm": 4.55042028427124, + "learning_rate": 5.686333397252363e-05, + "loss": 2.2888, + "step": 14080 + }, + { + "epoch": 0.9450689574175363, + "grad_norm": 4.092801094055176, + "learning_rate": 5.6852568382047026e-05, + "loss": 2.3825, + "step": 14082 + }, + { + "epoch": 0.9452031811013053, + "grad_norm": 3.4427595138549805, + "learning_rate": 5.684180246780004e-05, + "loss": 2.0525, + "step": 14084 + }, + { + "epoch": 0.9453374047850743, + "grad_norm": 4.196927547454834, + "learning_rate": 5.683103623029135e-05, + "loss": 2.3335, + "step": 14086 + }, + { + "epoch": 0.9454716284688434, + "grad_norm": 4.979678630828857, + "learning_rate": 5.6820269670029615e-05, + "loss": 2.34, + "step": 14088 + }, + { + "epoch": 0.9456058521526123, + "grad_norm": 3.7227132320404053, + "learning_rate": 5.680950278752356e-05, + "loss": 2.3445, + "step": 14090 + }, + { + "epoch": 0.9457400758363813, + "grad_norm": 4.468086242675781, + "learning_rate": 5.679873558328188e-05, + "loss": 2.1207, + "step": 14092 + }, + { + "epoch": 0.9458742995201503, + "grad_norm": 3.8448846340179443, + "learning_rate": 5.6787968057813324e-05, + "loss": 2.5039, + "step": 14094 + }, + { + "epoch": 0.9460085232039194, + "grad_norm": 4.3266425132751465, + "learning_rate": 5.6777200211626624e-05, + "loss": 2.5194, + "step": 14096 + }, + { + "epoch": 0.9461427468876883, + "grad_norm": 4.41290807723999, + "learning_rate": 5.6766432045230536e-05, + "loss": 2.525, + "step": 14098 + }, + { + "epoch": 0.9462769705714573, + "grad_norm": 5.153083801269531, + "learning_rate": 5.675566355913384e-05, + "loss": 2.4996, + "step": 14100 + }, + { + "epoch": 0.9464111942552264, + "grad_norm": 3.660865306854248, + "learning_rate": 5.6744894753845326e-05, + "loss": 1.9523, + "step": 14102 + }, + { + "epoch": 0.9465454179389954, + "grad_norm": 4.091307163238525, + "learning_rate": 5.67341256298738e-05, + "loss": 2.3732, + "step": 14104 + }, + { + "epoch": 0.9466796416227643, + "grad_norm": 4.580510139465332, + "learning_rate": 5.672335618772808e-05, + "loss": 2.4314, + "step": 14106 + }, + { + "epoch": 0.9468138653065333, + "grad_norm": 4.876501083374023, + "learning_rate": 5.671258642791699e-05, + "loss": 2.2695, + "step": 14108 + }, + { + "epoch": 0.9469480889903024, + "grad_norm": 3.9316112995147705, + "learning_rate": 5.670181635094941e-05, + "loss": 2.1296, + "step": 14110 + }, + { + "epoch": 0.9470823126740714, + "grad_norm": 3.7948806285858154, + "learning_rate": 5.669104595733419e-05, + "loss": 2.2842, + "step": 14112 + }, + { + "epoch": 0.9472165363578403, + "grad_norm": 4.867766857147217, + "learning_rate": 5.668027524758019e-05, + "loss": 2.678, + "step": 14114 + }, + { + "epoch": 0.9473507600416093, + "grad_norm": 4.934841632843018, + "learning_rate": 5.6669504222196327e-05, + "loss": 2.1131, + "step": 14116 + }, + { + "epoch": 0.9474849837253784, + "grad_norm": 3.915083646774292, + "learning_rate": 5.665873288169149e-05, + "loss": 2.2867, + "step": 14118 + }, + { + "epoch": 0.9476192074091473, + "grad_norm": 3.6081702709198, + "learning_rate": 5.664796122657463e-05, + "loss": 2.4281, + "step": 14120 + }, + { + "epoch": 0.9477534310929163, + "grad_norm": 4.4091691970825195, + "learning_rate": 5.6637189257354675e-05, + "loss": 2.5855, + "step": 14122 + }, + { + "epoch": 0.9478876547766854, + "grad_norm": 3.7294723987579346, + "learning_rate": 5.6626416974540585e-05, + "loss": 1.9953, + "step": 14124 + }, + { + "epoch": 0.9480218784604544, + "grad_norm": 4.179866313934326, + "learning_rate": 5.6615644378641306e-05, + "loss": 2.2184, + "step": 14126 + }, + { + "epoch": 0.9481561021442233, + "grad_norm": 3.8817501068115234, + "learning_rate": 5.660487147016584e-05, + "loss": 2.2282, + "step": 14128 + }, + { + "epoch": 0.9482903258279923, + "grad_norm": 7.4222187995910645, + "learning_rate": 5.659409824962321e-05, + "loss": 2.0802, + "step": 14130 + }, + { + "epoch": 0.9484245495117614, + "grad_norm": 9.955772399902344, + "learning_rate": 5.658332471752239e-05, + "loss": 2.0321, + "step": 14132 + }, + { + "epoch": 0.9485587731955304, + "grad_norm": 3.82814884185791, + "learning_rate": 5.6572550874372424e-05, + "loss": 2.4772, + "step": 14134 + }, + { + "epoch": 0.9486929968792993, + "grad_norm": 4.761017322540283, + "learning_rate": 5.656177672068235e-05, + "loss": 2.627, + "step": 14136 + }, + { + "epoch": 0.9488272205630683, + "grad_norm": 3.708085060119629, + "learning_rate": 5.655100225696123e-05, + "loss": 2.2777, + "step": 14138 + }, + { + "epoch": 0.9489614442468374, + "grad_norm": 4.762210369110107, + "learning_rate": 5.654022748371813e-05, + "loss": 2.1856, + "step": 14140 + }, + { + "epoch": 0.9490956679306064, + "grad_norm": 4.970183372497559, + "learning_rate": 5.6529452401462145e-05, + "loss": 2.3994, + "step": 14142 + }, + { + "epoch": 0.9492298916143753, + "grad_norm": 3.95869779586792, + "learning_rate": 5.651867701070238e-05, + "loss": 2.5355, + "step": 14144 + }, + { + "epoch": 0.9493641152981444, + "grad_norm": 7.246446132659912, + "learning_rate": 5.650790131194794e-05, + "loss": 2.3321, + "step": 14146 + }, + { + "epoch": 0.9494983389819134, + "grad_norm": 4.4275288581848145, + "learning_rate": 5.649712530570797e-05, + "loss": 2.3881, + "step": 14148 + }, + { + "epoch": 0.9496325626656824, + "grad_norm": 4.761242866516113, + "learning_rate": 5.648634899249159e-05, + "loss": 2.2876, + "step": 14150 + }, + { + "epoch": 0.9497667863494513, + "grad_norm": 4.264746189117432, + "learning_rate": 5.647557237280798e-05, + "loss": 2.4029, + "step": 14152 + }, + { + "epoch": 0.9499010100332204, + "grad_norm": 4.1296257972717285, + "learning_rate": 5.6464795447166306e-05, + "loss": 2.2488, + "step": 14154 + }, + { + "epoch": 0.9500352337169894, + "grad_norm": 4.243205547332764, + "learning_rate": 5.645401821607577e-05, + "loss": 2.1816, + "step": 14156 + }, + { + "epoch": 0.9501694574007583, + "grad_norm": 4.770181655883789, + "learning_rate": 5.644324068004556e-05, + "loss": 2.4071, + "step": 14158 + }, + { + "epoch": 0.9503036810845273, + "grad_norm": 3.8598718643188477, + "learning_rate": 5.6432462839584886e-05, + "loss": 2.0809, + "step": 14160 + }, + { + "epoch": 0.9504379047682964, + "grad_norm": 4.11749267578125, + "learning_rate": 5.6421684695203015e-05, + "loss": 2.2308, + "step": 14162 + }, + { + "epoch": 0.9505721284520654, + "grad_norm": 4.15256929397583, + "learning_rate": 5.6410906247409144e-05, + "loss": 2.1382, + "step": 14164 + }, + { + "epoch": 0.9507063521358343, + "grad_norm": 3.6801416873931885, + "learning_rate": 5.6400127496712585e-05, + "loss": 2.3821, + "step": 14166 + }, + { + "epoch": 0.9508405758196034, + "grad_norm": 4.0785698890686035, + "learning_rate": 5.6389348443622566e-05, + "loss": 2.2417, + "step": 14168 + }, + { + "epoch": 0.9509747995033724, + "grad_norm": 5.067655563354492, + "learning_rate": 5.6378569088648406e-05, + "loss": 2.2857, + "step": 14170 + }, + { + "epoch": 0.9511090231871414, + "grad_norm": 4.022231101989746, + "learning_rate": 5.63677894322994e-05, + "loss": 2.2701, + "step": 14172 + }, + { + "epoch": 0.9512432468709103, + "grad_norm": 4.0595784187316895, + "learning_rate": 5.635700947508486e-05, + "loss": 2.3088, + "step": 14174 + }, + { + "epoch": 0.9513774705546794, + "grad_norm": 3.711423635482788, + "learning_rate": 5.6346229217514136e-05, + "loss": 2.1111, + "step": 14176 + }, + { + "epoch": 0.9515116942384484, + "grad_norm": 5.866116046905518, + "learning_rate": 5.6335448660096544e-05, + "loss": 2.4056, + "step": 14178 + }, + { + "epoch": 0.9516459179222174, + "grad_norm": 4.50927734375, + "learning_rate": 5.632466780334148e-05, + "loss": 2.3421, + "step": 14180 + }, + { + "epoch": 0.9517801416059863, + "grad_norm": 3.8313252925872803, + "learning_rate": 5.6313886647758294e-05, + "loss": 2.0668, + "step": 14182 + }, + { + "epoch": 0.9519143652897554, + "grad_norm": 9.641461372375488, + "learning_rate": 5.6303105193856365e-05, + "loss": 2.2556, + "step": 14184 + }, + { + "epoch": 0.9520485889735244, + "grad_norm": 4.703955173492432, + "learning_rate": 5.6292323442145126e-05, + "loss": 2.3426, + "step": 14186 + }, + { + "epoch": 0.9521828126572934, + "grad_norm": 4.262161731719971, + "learning_rate": 5.6281541393133976e-05, + "loss": 2.2604, + "step": 14188 + }, + { + "epoch": 0.9523170363410624, + "grad_norm": 3.966033935546875, + "learning_rate": 5.6270759047332334e-05, + "loss": 2.6233, + "step": 14190 + }, + { + "epoch": 0.9524512600248314, + "grad_norm": 4.056666374206543, + "learning_rate": 5.625997640524967e-05, + "loss": 2.0849, + "step": 14192 + }, + { + "epoch": 0.9525854837086004, + "grad_norm": 3.5071873664855957, + "learning_rate": 5.624919346739542e-05, + "loss": 2.457, + "step": 14194 + }, + { + "epoch": 0.9527197073923693, + "grad_norm": 3.885369062423706, + "learning_rate": 5.623841023427908e-05, + "loss": 2.1727, + "step": 14196 + }, + { + "epoch": 0.9528539310761384, + "grad_norm": 3.6435248851776123, + "learning_rate": 5.622762670641011e-05, + "loss": 1.9625, + "step": 14198 + }, + { + "epoch": 0.9529881547599074, + "grad_norm": 9.07417106628418, + "learning_rate": 5.6216842884298035e-05, + "loss": 2.2968, + "step": 14200 + }, + { + "epoch": 0.9531223784436764, + "grad_norm": 3.9441187381744385, + "learning_rate": 5.6206058768452355e-05, + "loss": 2.2522, + "step": 14202 + }, + { + "epoch": 0.9532566021274453, + "grad_norm": 5.436535358428955, + "learning_rate": 5.61952743593826e-05, + "loss": 2.4369, + "step": 14204 + }, + { + "epoch": 0.9533908258112144, + "grad_norm": 4.8568620681762695, + "learning_rate": 5.618448965759832e-05, + "loss": 2.4055, + "step": 14206 + }, + { + "epoch": 0.9535250494949834, + "grad_norm": 4.113932132720947, + "learning_rate": 5.617370466360905e-05, + "loss": 2.1693, + "step": 14208 + }, + { + "epoch": 0.9536592731787524, + "grad_norm": 4.359194755554199, + "learning_rate": 5.616291937792439e-05, + "loss": 2.2471, + "step": 14210 + }, + { + "epoch": 0.9537934968625214, + "grad_norm": 3.947134256362915, + "learning_rate": 5.615213380105391e-05, + "loss": 2.0437, + "step": 14212 + }, + { + "epoch": 0.9539277205462904, + "grad_norm": 3.999145746231079, + "learning_rate": 5.6141347933507204e-05, + "loss": 2.296, + "step": 14214 + }, + { + "epoch": 0.9540619442300594, + "grad_norm": 3.9625802040100098, + "learning_rate": 5.613056177579388e-05, + "loss": 2.3554, + "step": 14216 + }, + { + "epoch": 0.9541961679138284, + "grad_norm": 3.9728870391845703, + "learning_rate": 5.611977532842355e-05, + "loss": 2.1104, + "step": 14218 + }, + { + "epoch": 0.9543303915975974, + "grad_norm": 3.702415943145752, + "learning_rate": 5.61089885919059e-05, + "loss": 1.9957, + "step": 14220 + }, + { + "epoch": 0.9544646152813664, + "grad_norm": 3.874826431274414, + "learning_rate": 5.609820156675053e-05, + "loss": 2.3829, + "step": 14222 + }, + { + "epoch": 0.9545988389651354, + "grad_norm": 4.26115083694458, + "learning_rate": 5.608741425346714e-05, + "loss": 2.372, + "step": 14224 + }, + { + "epoch": 0.9547330626489045, + "grad_norm": 6.250181198120117, + "learning_rate": 5.607662665256539e-05, + "loss": 2.3434, + "step": 14226 + }, + { + "epoch": 0.9548672863326734, + "grad_norm": 4.136202335357666, + "learning_rate": 5.606583876455499e-05, + "loss": 2.3071, + "step": 14228 + }, + { + "epoch": 0.9550015100164424, + "grad_norm": 4.175271511077881, + "learning_rate": 5.605505058994562e-05, + "loss": 2.2639, + "step": 14230 + }, + { + "epoch": 0.9551357337002114, + "grad_norm": 4.165344715118408, + "learning_rate": 5.604426212924703e-05, + "loss": 2.3179, + "step": 14232 + }, + { + "epoch": 0.9552699573839804, + "grad_norm": 4.286163330078125, + "learning_rate": 5.6033473382968936e-05, + "loss": 2.4661, + "step": 14234 + }, + { + "epoch": 0.9554041810677494, + "grad_norm": 3.943777084350586, + "learning_rate": 5.6022684351621094e-05, + "loss": 2.2449, + "step": 14236 + }, + { + "epoch": 0.9555384047515184, + "grad_norm": 3.99908709526062, + "learning_rate": 5.601189503571326e-05, + "loss": 2.069, + "step": 14238 + }, + { + "epoch": 0.9556726284352874, + "grad_norm": 3.662431240081787, + "learning_rate": 5.6001105435755194e-05, + "loss": 2.135, + "step": 14240 + }, + { + "epoch": 0.9558068521190564, + "grad_norm": 3.8820736408233643, + "learning_rate": 5.599031555225671e-05, + "loss": 2.3225, + "step": 14242 + }, + { + "epoch": 0.9559410758028254, + "grad_norm": 4.123410224914551, + "learning_rate": 5.597952538572758e-05, + "loss": 2.338, + "step": 14244 + }, + { + "epoch": 0.9560752994865944, + "grad_norm": 3.4044694900512695, + "learning_rate": 5.596873493667765e-05, + "loss": 2.0701, + "step": 14246 + }, + { + "epoch": 0.9562095231703635, + "grad_norm": 4.028061866760254, + "learning_rate": 5.595794420561673e-05, + "loss": 1.9158, + "step": 14248 + }, + { + "epoch": 0.9563437468541324, + "grad_norm": 4.147693157196045, + "learning_rate": 5.5947153193054655e-05, + "loss": 2.2769, + "step": 14250 + }, + { + "epoch": 0.9564779705379014, + "grad_norm": 5.683887481689453, + "learning_rate": 5.59363618995013e-05, + "loss": 2.1417, + "step": 14252 + }, + { + "epoch": 0.9566121942216704, + "grad_norm": 3.8702781200408936, + "learning_rate": 5.5925570325466504e-05, + "loss": 2.2062, + "step": 14254 + }, + { + "epoch": 0.9567464179054395, + "grad_norm": 4.2953691482543945, + "learning_rate": 5.591477847146016e-05, + "loss": 2.4599, + "step": 14256 + }, + { + "epoch": 0.9568806415892084, + "grad_norm": 4.147792816162109, + "learning_rate": 5.5903986337992174e-05, + "loss": 2.3259, + "step": 14258 + }, + { + "epoch": 0.9570148652729774, + "grad_norm": 4.174890041351318, + "learning_rate": 5.589319392557244e-05, + "loss": 2.1034, + "step": 14260 + }, + { + "epoch": 0.9571490889567464, + "grad_norm": 4.53040075302124, + "learning_rate": 5.588240123471088e-05, + "loss": 2.3264, + "step": 14262 + }, + { + "epoch": 0.9572833126405155, + "grad_norm": 4.244524002075195, + "learning_rate": 5.587160826591743e-05, + "loss": 2.2907, + "step": 14264 + }, + { + "epoch": 0.9574175363242844, + "grad_norm": 4.900374889373779, + "learning_rate": 5.586081501970203e-05, + "loss": 2.3597, + "step": 14266 + }, + { + "epoch": 0.9575517600080534, + "grad_norm": 3.943157911300659, + "learning_rate": 5.585002149657466e-05, + "loss": 2.0931, + "step": 14268 + }, + { + "epoch": 0.9576859836918225, + "grad_norm": 4.510315418243408, + "learning_rate": 5.583922769704526e-05, + "loss": 2.2517, + "step": 14270 + }, + { + "epoch": 0.9578202073755914, + "grad_norm": 4.292592525482178, + "learning_rate": 5.5828433621623845e-05, + "loss": 2.3338, + "step": 14272 + }, + { + "epoch": 0.9579544310593604, + "grad_norm": 4.137983322143555, + "learning_rate": 5.5817639270820386e-05, + "loss": 2.3765, + "step": 14274 + }, + { + "epoch": 0.9580886547431294, + "grad_norm": 3.941842555999756, + "learning_rate": 5.580684464514494e-05, + "loss": 2.2649, + "step": 14276 + }, + { + "epoch": 0.9582228784268985, + "grad_norm": 4.082207202911377, + "learning_rate": 5.579604974510748e-05, + "loss": 2.1012, + "step": 14278 + }, + { + "epoch": 0.9583571021106674, + "grad_norm": 3.864945411682129, + "learning_rate": 5.578525457121807e-05, + "loss": 2.453, + "step": 14280 + }, + { + "epoch": 0.9584913257944364, + "grad_norm": 4.082785606384277, + "learning_rate": 5.5774459123986766e-05, + "loss": 2.1272, + "step": 14282 + }, + { + "epoch": 0.9586255494782054, + "grad_norm": 4.984314441680908, + "learning_rate": 5.5763663403923614e-05, + "loss": 2.2793, + "step": 14284 + }, + { + "epoch": 0.9587597731619745, + "grad_norm": 4.244630336761475, + "learning_rate": 5.575286741153871e-05, + "loss": 2.2963, + "step": 14286 + }, + { + "epoch": 0.9588939968457434, + "grad_norm": 3.8869500160217285, + "learning_rate": 5.574207114734212e-05, + "loss": 2.0414, + "step": 14288 + }, + { + "epoch": 0.9590282205295124, + "grad_norm": 12.646753311157227, + "learning_rate": 5.573127461184398e-05, + "loss": 2.5899, + "step": 14290 + }, + { + "epoch": 0.9591624442132815, + "grad_norm": 4.072728633880615, + "learning_rate": 5.5720477805554374e-05, + "loss": 1.9879, + "step": 14292 + }, + { + "epoch": 0.9592966678970505, + "grad_norm": 8.019472122192383, + "learning_rate": 5.570968072898343e-05, + "loss": 2.3724, + "step": 14294 + }, + { + "epoch": 0.9594308915808194, + "grad_norm": 3.7489840984344482, + "learning_rate": 5.569888338264131e-05, + "loss": 2.384, + "step": 14296 + }, + { + "epoch": 0.9595651152645884, + "grad_norm": 3.79803729057312, + "learning_rate": 5.568808576703816e-05, + "loss": 2.2596, + "step": 14298 + }, + { + "epoch": 0.9596993389483575, + "grad_norm": 5.132706642150879, + "learning_rate": 5.567728788268414e-05, + "loss": 2.1819, + "step": 14300 + }, + { + "epoch": 0.9598335626321265, + "grad_norm": 5.356688022613525, + "learning_rate": 5.566648973008942e-05, + "loss": 2.2244, + "step": 14302 + }, + { + "epoch": 0.9599677863158954, + "grad_norm": 8.215302467346191, + "learning_rate": 5.565569130976422e-05, + "loss": 2.1276, + "step": 14304 + }, + { + "epoch": 0.9601020099996644, + "grad_norm": 3.538175344467163, + "learning_rate": 5.564489262221871e-05, + "loss": 2.2204, + "step": 14306 + }, + { + "epoch": 0.9602362336834335, + "grad_norm": 4.21329402923584, + "learning_rate": 5.563409366796314e-05, + "loss": 2.1968, + "step": 14308 + }, + { + "epoch": 0.9603704573672024, + "grad_norm": 4.270647048950195, + "learning_rate": 5.562329444750771e-05, + "loss": 2.1273, + "step": 14310 + }, + { + "epoch": 0.9605046810509714, + "grad_norm": 4.219348907470703, + "learning_rate": 5.561249496136268e-05, + "loss": 2.5593, + "step": 14312 + }, + { + "epoch": 0.9606389047347405, + "grad_norm": 3.9671599864959717, + "learning_rate": 5.5601695210038306e-05, + "loss": 2.1917, + "step": 14314 + }, + { + "epoch": 0.9607731284185095, + "grad_norm": 4.409279823303223, + "learning_rate": 5.559089519404484e-05, + "loss": 2.3222, + "step": 14316 + }, + { + "epoch": 0.9609073521022784, + "grad_norm": 3.9485771656036377, + "learning_rate": 5.5580094913892575e-05, + "loss": 2.3243, + "step": 14318 + }, + { + "epoch": 0.9610415757860474, + "grad_norm": 4.064247131347656, + "learning_rate": 5.556929437009181e-05, + "loss": 2.3011, + "step": 14320 + }, + { + "epoch": 0.9611757994698165, + "grad_norm": 4.3832688331604, + "learning_rate": 5.555849356315281e-05, + "loss": 2.1066, + "step": 14322 + }, + { + "epoch": 0.9613100231535855, + "grad_norm": 4.322977066040039, + "learning_rate": 5.554769249358595e-05, + "loss": 2.2031, + "step": 14324 + }, + { + "epoch": 0.9614442468373544, + "grad_norm": 4.09542989730835, + "learning_rate": 5.553689116190151e-05, + "loss": 2.151, + "step": 14326 + }, + { + "epoch": 0.9615784705211234, + "grad_norm": 4.376242160797119, + "learning_rate": 5.552608956860985e-05, + "loss": 2.0942, + "step": 14328 + }, + { + "epoch": 0.9617126942048925, + "grad_norm": 3.455543041229248, + "learning_rate": 5.551528771422133e-05, + "loss": 1.9869, + "step": 14330 + }, + { + "epoch": 0.9618469178886615, + "grad_norm": 4.252726078033447, + "learning_rate": 5.55044855992463e-05, + "loss": 2.2736, + "step": 14332 + }, + { + "epoch": 0.9619811415724304, + "grad_norm": 3.960014820098877, + "learning_rate": 5.549368322419517e-05, + "loss": 2.1671, + "step": 14334 + }, + { + "epoch": 0.9621153652561995, + "grad_norm": 4.0929083824157715, + "learning_rate": 5.548288058957829e-05, + "loss": 2.2422, + "step": 14336 + }, + { + "epoch": 0.9622495889399685, + "grad_norm": 4.965394973754883, + "learning_rate": 5.54720776959061e-05, + "loss": 2.0608, + "step": 14338 + }, + { + "epoch": 0.9623838126237375, + "grad_norm": 4.052628993988037, + "learning_rate": 5.546127454368898e-05, + "loss": 2.0789, + "step": 14340 + }, + { + "epoch": 0.9625180363075064, + "grad_norm": 4.079805374145508, + "learning_rate": 5.54504711334374e-05, + "loss": 2.3247, + "step": 14342 + }, + { + "epoch": 0.9626522599912755, + "grad_norm": 4.881062030792236, + "learning_rate": 5.543966746566176e-05, + "loss": 2.0718, + "step": 14344 + }, + { + "epoch": 0.9627864836750445, + "grad_norm": 3.6743955612182617, + "learning_rate": 5.542886354087252e-05, + "loss": 2.216, + "step": 14346 + }, + { + "epoch": 0.9629207073588134, + "grad_norm": 3.841184139251709, + "learning_rate": 5.5418059359580175e-05, + "loss": 2.0892, + "step": 14348 + }, + { + "epoch": 0.9630549310425824, + "grad_norm": 7.325196266174316, + "learning_rate": 5.5407254922295174e-05, + "loss": 2.5195, + "step": 14350 + }, + { + "epoch": 0.9631891547263515, + "grad_norm": 3.9021213054656982, + "learning_rate": 5.5396450229528006e-05, + "loss": 2.2852, + "step": 14352 + }, + { + "epoch": 0.9633233784101205, + "grad_norm": 4.046391487121582, + "learning_rate": 5.5385645281789176e-05, + "loss": 2.2083, + "step": 14354 + }, + { + "epoch": 0.9634576020938894, + "grad_norm": 3.9723658561706543, + "learning_rate": 5.537484007958921e-05, + "loss": 2.1932, + "step": 14356 + }, + { + "epoch": 0.9635918257776585, + "grad_norm": 3.93508243560791, + "learning_rate": 5.53640346234386e-05, + "loss": 2.2257, + "step": 14358 + }, + { + "epoch": 0.9637260494614275, + "grad_norm": 3.7024824619293213, + "learning_rate": 5.535322891384791e-05, + "loss": 2.0154, + "step": 14360 + }, + { + "epoch": 0.9638602731451965, + "grad_norm": 4.73423433303833, + "learning_rate": 5.534242295132769e-05, + "loss": 2.203, + "step": 14362 + }, + { + "epoch": 0.9639944968289654, + "grad_norm": 3.972707509994507, + "learning_rate": 5.533161673638847e-05, + "loss": 2.0854, + "step": 14364 + }, + { + "epoch": 0.9641287205127345, + "grad_norm": 4.467918872833252, + "learning_rate": 5.532081026954087e-05, + "loss": 2.0035, + "step": 14366 + }, + { + "epoch": 0.9642629441965035, + "grad_norm": 3.896301746368408, + "learning_rate": 5.531000355129543e-05, + "loss": 2.0325, + "step": 14368 + }, + { + "epoch": 0.9643971678802725, + "grad_norm": 3.9608750343322754, + "learning_rate": 5.529919658216276e-05, + "loss": 2.6109, + "step": 14370 + }, + { + "epoch": 0.9645313915640414, + "grad_norm": 4.198347568511963, + "learning_rate": 5.5288389362653484e-05, + "loss": 2.4232, + "step": 14372 + }, + { + "epoch": 0.9646656152478105, + "grad_norm": 3.630444288253784, + "learning_rate": 5.52775818932782e-05, + "loss": 2.1915, + "step": 14374 + }, + { + "epoch": 0.9647998389315795, + "grad_norm": 5.895638942718506, + "learning_rate": 5.5266774174547564e-05, + "loss": 2.5013, + "step": 14376 + }, + { + "epoch": 0.9649340626153485, + "grad_norm": 3.949514389038086, + "learning_rate": 5.525596620697219e-05, + "loss": 2.1224, + "step": 14378 + }, + { + "epoch": 0.9650682862991175, + "grad_norm": 3.9288530349731445, + "learning_rate": 5.5245157991062755e-05, + "loss": 1.9612, + "step": 14380 + }, + { + "epoch": 0.9652025099828865, + "grad_norm": 4.177463531494141, + "learning_rate": 5.523434952732991e-05, + "loss": 2.2432, + "step": 14382 + }, + { + "epoch": 0.9653367336666555, + "grad_norm": 4.585700988769531, + "learning_rate": 5.522354081628435e-05, + "loss": 2.3195, + "step": 14384 + }, + { + "epoch": 0.9654709573504244, + "grad_norm": 4.275617599487305, + "learning_rate": 5.5212731858436774e-05, + "loss": 2.404, + "step": 14386 + }, + { + "epoch": 0.9656051810341935, + "grad_norm": 3.871185302734375, + "learning_rate": 5.520192265429784e-05, + "loss": 2.0887, + "step": 14388 + }, + { + "epoch": 0.9657394047179625, + "grad_norm": 4.735138416290283, + "learning_rate": 5.519111320437832e-05, + "loss": 2.7004, + "step": 14390 + }, + { + "epoch": 0.9658736284017315, + "grad_norm": 4.224652290344238, + "learning_rate": 5.518030350918888e-05, + "loss": 2.3499, + "step": 14392 + }, + { + "epoch": 0.9660078520855004, + "grad_norm": 4.322037220001221, + "learning_rate": 5.5169493569240295e-05, + "loss": 2.4251, + "step": 14394 + }, + { + "epoch": 0.9661420757692695, + "grad_norm": 4.046513557434082, + "learning_rate": 5.5158683385043307e-05, + "loss": 2.5467, + "step": 14396 + }, + { + "epoch": 0.9662762994530385, + "grad_norm": 3.862236499786377, + "learning_rate": 5.514787295710867e-05, + "loss": 2.2143, + "step": 14398 + }, + { + "epoch": 0.9664105231368075, + "grad_norm": 4.132143020629883, + "learning_rate": 5.513706228594717e-05, + "loss": 2.0341, + "step": 14400 + }, + { + "epoch": 0.9665447468205765, + "grad_norm": 3.763434886932373, + "learning_rate": 5.512625137206957e-05, + "loss": 2.3222, + "step": 14402 + }, + { + "epoch": 0.9666789705043455, + "grad_norm": 4.585814952850342, + "learning_rate": 5.5115440215986666e-05, + "loss": 2.2246, + "step": 14404 + }, + { + "epoch": 0.9668131941881145, + "grad_norm": 3.560659170150757, + "learning_rate": 5.510462881820928e-05, + "loss": 2.0618, + "step": 14406 + }, + { + "epoch": 0.9669474178718835, + "grad_norm": 3.9561479091644287, + "learning_rate": 5.509381717924822e-05, + "loss": 2.0391, + "step": 14408 + }, + { + "epoch": 0.9670816415556525, + "grad_norm": 4.006033897399902, + "learning_rate": 5.508300529961431e-05, + "loss": 2.3953, + "step": 14410 + }, + { + "epoch": 0.9672158652394215, + "grad_norm": 3.983616352081299, + "learning_rate": 5.50721931798184e-05, + "loss": 2.2488, + "step": 14412 + }, + { + "epoch": 0.9673500889231905, + "grad_norm": 4.648655891418457, + "learning_rate": 5.506138082037133e-05, + "loss": 2.4061, + "step": 14414 + }, + { + "epoch": 0.9674843126069596, + "grad_norm": 3.7832252979278564, + "learning_rate": 5.505056822178397e-05, + "loss": 2.1139, + "step": 14416 + }, + { + "epoch": 0.9676185362907285, + "grad_norm": 3.767975330352783, + "learning_rate": 5.5039755384567207e-05, + "loss": 2.3864, + "step": 14418 + }, + { + "epoch": 0.9677527599744975, + "grad_norm": 3.7016282081604004, + "learning_rate": 5.502894230923189e-05, + "loss": 2.2808, + "step": 14420 + }, + { + "epoch": 0.9678869836582665, + "grad_norm": 3.6796257495880127, + "learning_rate": 5.501812899628895e-05, + "loss": 2.0009, + "step": 14422 + }, + { + "epoch": 0.9680212073420355, + "grad_norm": 4.522967338562012, + "learning_rate": 5.5007315446249285e-05, + "loss": 2.4498, + "step": 14424 + }, + { + "epoch": 0.9681554310258045, + "grad_norm": 3.6321310997009277, + "learning_rate": 5.49965016596238e-05, + "loss": 2.2384, + "step": 14426 + }, + { + "epoch": 0.9682896547095735, + "grad_norm": 3.9669430255889893, + "learning_rate": 5.498568763692345e-05, + "loss": 2.4865, + "step": 14428 + }, + { + "epoch": 0.9684238783933425, + "grad_norm": 3.7853329181671143, + "learning_rate": 5.497487337865916e-05, + "loss": 2.071, + "step": 14430 + }, + { + "epoch": 0.9685581020771115, + "grad_norm": 3.557818651199341, + "learning_rate": 5.4964058885341886e-05, + "loss": 2.227, + "step": 14432 + }, + { + "epoch": 0.9686923257608805, + "grad_norm": 4.739255905151367, + "learning_rate": 5.495324415748259e-05, + "loss": 2.2382, + "step": 14434 + }, + { + "epoch": 0.9688265494446495, + "grad_norm": 3.9423372745513916, + "learning_rate": 5.494242919559224e-05, + "loss": 2.2904, + "step": 14436 + }, + { + "epoch": 0.9689607731284186, + "grad_norm": 6.941054344177246, + "learning_rate": 5.493161400018184e-05, + "loss": 2.2481, + "step": 14438 + }, + { + "epoch": 0.9690949968121875, + "grad_norm": 4.002125263214111, + "learning_rate": 5.492079857176236e-05, + "loss": 2.2749, + "step": 14440 + }, + { + "epoch": 0.9692292204959565, + "grad_norm": 4.736929893493652, + "learning_rate": 5.490998291084485e-05, + "loss": 2.5159, + "step": 14442 + }, + { + "epoch": 0.9693634441797255, + "grad_norm": 3.717266082763672, + "learning_rate": 5.489916701794028e-05, + "loss": 2.1324, + "step": 14444 + }, + { + "epoch": 0.9694976678634946, + "grad_norm": 3.9420924186706543, + "learning_rate": 5.488835089355971e-05, + "loss": 2.2114, + "step": 14446 + }, + { + "epoch": 0.9696318915472635, + "grad_norm": 4.340596675872803, + "learning_rate": 5.487753453821418e-05, + "loss": 2.5262, + "step": 14448 + }, + { + "epoch": 0.9697661152310325, + "grad_norm": 3.824708938598633, + "learning_rate": 5.4866717952414716e-05, + "loss": 2.0791, + "step": 14450 + }, + { + "epoch": 0.9699003389148015, + "grad_norm": 3.668595314025879, + "learning_rate": 5.485590113667242e-05, + "loss": 2.1693, + "step": 14452 + }, + { + "epoch": 0.9700345625985706, + "grad_norm": 4.093000411987305, + "learning_rate": 5.484508409149833e-05, + "loss": 2.2291, + "step": 14454 + }, + { + "epoch": 0.9701687862823395, + "grad_norm": 3.9892940521240234, + "learning_rate": 5.483426681740356e-05, + "loss": 2.053, + "step": 14456 + }, + { + "epoch": 0.9703030099661085, + "grad_norm": 4.018230438232422, + "learning_rate": 5.482344931489918e-05, + "loss": 1.9145, + "step": 14458 + }, + { + "epoch": 0.9704372336498776, + "grad_norm": 4.201679229736328, + "learning_rate": 5.48126315844963e-05, + "loss": 2.1264, + "step": 14460 + }, + { + "epoch": 0.9705714573336465, + "grad_norm": 3.9351541996002197, + "learning_rate": 5.480181362670605e-05, + "loss": 2.3922, + "step": 14462 + }, + { + "epoch": 0.9707056810174155, + "grad_norm": 3.5191173553466797, + "learning_rate": 5.4790995442039537e-05, + "loss": 2.3429, + "step": 14464 + }, + { + "epoch": 0.9708399047011845, + "grad_norm": 3.5705740451812744, + "learning_rate": 5.4780177031007916e-05, + "loss": 1.9656, + "step": 14466 + }, + { + "epoch": 0.9709741283849536, + "grad_norm": 4.181313991546631, + "learning_rate": 5.4769358394122326e-05, + "loss": 2.3476, + "step": 14468 + }, + { + "epoch": 0.9711083520687225, + "grad_norm": 4.2336554527282715, + "learning_rate": 5.475853953189393e-05, + "loss": 2.3609, + "step": 14470 + }, + { + "epoch": 0.9712425757524915, + "grad_norm": 4.196955680847168, + "learning_rate": 5.474772044483391e-05, + "loss": 2.4, + "step": 14472 + }, + { + "epoch": 0.9713767994362605, + "grad_norm": 3.709608793258667, + "learning_rate": 5.473690113345342e-05, + "loss": 2.118, + "step": 14474 + }, + { + "epoch": 0.9715110231200296, + "grad_norm": 9.165778160095215, + "learning_rate": 5.472608159826368e-05, + "loss": 2.3136, + "step": 14476 + }, + { + "epoch": 0.9716452468037985, + "grad_norm": 6.551671028137207, + "learning_rate": 5.471526183977587e-05, + "loss": 2.3586, + "step": 14478 + }, + { + "epoch": 0.9717794704875675, + "grad_norm": 3.9764087200164795, + "learning_rate": 5.470444185850121e-05, + "loss": 2.2415, + "step": 14480 + }, + { + "epoch": 0.9719136941713366, + "grad_norm": 3.9285309314727783, + "learning_rate": 5.4693621654950925e-05, + "loss": 2.5336, + "step": 14482 + }, + { + "epoch": 0.9720479178551056, + "grad_norm": 3.9203789234161377, + "learning_rate": 5.4682801229636236e-05, + "loss": 2.4367, + "step": 14484 + }, + { + "epoch": 0.9721821415388745, + "grad_norm": 3.5382332801818848, + "learning_rate": 5.467198058306842e-05, + "loss": 1.9324, + "step": 14486 + }, + { + "epoch": 0.9723163652226435, + "grad_norm": 3.983454942703247, + "learning_rate": 5.466115971575869e-05, + "loss": 2.3049, + "step": 14488 + }, + { + "epoch": 0.9724505889064126, + "grad_norm": 4.4090352058410645, + "learning_rate": 5.465033862821835e-05, + "loss": 2.4538, + "step": 14490 + }, + { + "epoch": 0.9725848125901816, + "grad_norm": 3.426849842071533, + "learning_rate": 5.463951732095862e-05, + "loss": 2.056, + "step": 14492 + }, + { + "epoch": 0.9727190362739505, + "grad_norm": 3.614067792892456, + "learning_rate": 5.462869579449085e-05, + "loss": 2.0827, + "step": 14494 + }, + { + "epoch": 0.9728532599577195, + "grad_norm": 3.9077634811401367, + "learning_rate": 5.461787404932629e-05, + "loss": 2.5013, + "step": 14496 + }, + { + "epoch": 0.9729874836414886, + "grad_norm": 3.9692931175231934, + "learning_rate": 5.460705208597626e-05, + "loss": 2.1749, + "step": 14498 + }, + { + "epoch": 0.9731217073252575, + "grad_norm": 4.400686740875244, + "learning_rate": 5.459622990495209e-05, + "loss": 2.545, + "step": 14500 + }, + { + "epoch": 0.9732559310090265, + "grad_norm": 5.655430793762207, + "learning_rate": 5.458540750676509e-05, + "loss": 2.3103, + "step": 14502 + }, + { + "epoch": 0.9733901546927956, + "grad_norm": 3.987758159637451, + "learning_rate": 5.457458489192661e-05, + "loss": 2.2009, + "step": 14504 + }, + { + "epoch": 0.9735243783765646, + "grad_norm": 4.2144927978515625, + "learning_rate": 5.4563762060947975e-05, + "loss": 2.2824, + "step": 14506 + }, + { + "epoch": 0.9736586020603335, + "grad_norm": 4.046241283416748, + "learning_rate": 5.455293901434056e-05, + "loss": 2.4713, + "step": 14508 + }, + { + "epoch": 0.9737928257441025, + "grad_norm": 4.068577766418457, + "learning_rate": 5.454211575261574e-05, + "loss": 2.3787, + "step": 14510 + }, + { + "epoch": 0.9739270494278716, + "grad_norm": 4.051711082458496, + "learning_rate": 5.453129227628487e-05, + "loss": 2.2475, + "step": 14512 + }, + { + "epoch": 0.9740612731116406, + "grad_norm": 3.5607621669769287, + "learning_rate": 5.4520468585859364e-05, + "loss": 2.007, + "step": 14514 + }, + { + "epoch": 0.9741954967954095, + "grad_norm": 4.121545314788818, + "learning_rate": 5.450964468185059e-05, + "loss": 2.1185, + "step": 14516 + }, + { + "epoch": 0.9743297204791785, + "grad_norm": 3.7789535522460938, + "learning_rate": 5.4498820564769994e-05, + "loss": 2.3118, + "step": 14518 + }, + { + "epoch": 0.9744639441629476, + "grad_norm": 4.209509372711182, + "learning_rate": 5.4487996235128946e-05, + "loss": 2.459, + "step": 14520 + }, + { + "epoch": 0.9745981678467166, + "grad_norm": 4.4189629554748535, + "learning_rate": 5.447717169343892e-05, + "loss": 2.3622, + "step": 14522 + }, + { + "epoch": 0.9747323915304855, + "grad_norm": 3.72898006439209, + "learning_rate": 5.4466346940211345e-05, + "loss": 2.0057, + "step": 14524 + }, + { + "epoch": 0.9748666152142546, + "grad_norm": 4.059386253356934, + "learning_rate": 5.4455521975957635e-05, + "loss": 2.247, + "step": 14526 + }, + { + "epoch": 0.9750008388980236, + "grad_norm": 3.9437596797943115, + "learning_rate": 5.444469680118929e-05, + "loss": 2.5692, + "step": 14528 + }, + { + "epoch": 0.9751350625817926, + "grad_norm": 3.6488869190216064, + "learning_rate": 5.443387141641775e-05, + "loss": 2.0263, + "step": 14530 + }, + { + "epoch": 0.9752692862655615, + "grad_norm": 4.623715400695801, + "learning_rate": 5.4423045822154506e-05, + "loss": 2.2049, + "step": 14532 + }, + { + "epoch": 0.9754035099493306, + "grad_norm": 5.509363174438477, + "learning_rate": 5.4412220018911056e-05, + "loss": 2.4486, + "step": 14534 + }, + { + "epoch": 0.9755377336330996, + "grad_norm": 4.240229606628418, + "learning_rate": 5.4401394007198866e-05, + "loss": 2.1417, + "step": 14536 + }, + { + "epoch": 0.9756719573168685, + "grad_norm": 4.209182262420654, + "learning_rate": 5.4390567787529476e-05, + "loss": 2.276, + "step": 14538 + }, + { + "epoch": 0.9758061810006375, + "grad_norm": 3.735595464706421, + "learning_rate": 5.437974136041439e-05, + "loss": 2.1615, + "step": 14540 + }, + { + "epoch": 0.9759404046844066, + "grad_norm": 5.800134181976318, + "learning_rate": 5.436891472636515e-05, + "loss": 2.3816, + "step": 14542 + }, + { + "epoch": 0.9760746283681756, + "grad_norm": 4.148758411407471, + "learning_rate": 5.435808788589326e-05, + "loss": 2.3264, + "step": 14544 + }, + { + "epoch": 0.9762088520519445, + "grad_norm": 3.9701175689697266, + "learning_rate": 5.4347260839510296e-05, + "loss": 2.1028, + "step": 14546 + }, + { + "epoch": 0.9763430757357136, + "grad_norm": 3.69547963142395, + "learning_rate": 5.433643358772781e-05, + "loss": 2.1854, + "step": 14548 + }, + { + "epoch": 0.9764772994194826, + "grad_norm": 4.002964973449707, + "learning_rate": 5.432560613105736e-05, + "loss": 2.3913, + "step": 14550 + }, + { + "epoch": 0.9766115231032516, + "grad_norm": 4.09747314453125, + "learning_rate": 5.431477847001053e-05, + "loss": 2.1581, + "step": 14552 + }, + { + "epoch": 0.9767457467870205, + "grad_norm": 4.438412189483643, + "learning_rate": 5.4303950605098894e-05, + "loss": 2.2384, + "step": 14554 + }, + { + "epoch": 0.9768799704707896, + "grad_norm": 4.581920623779297, + "learning_rate": 5.429312253683406e-05, + "loss": 2.0745, + "step": 14556 + }, + { + "epoch": 0.9770141941545586, + "grad_norm": 4.343719482421875, + "learning_rate": 5.428229426572763e-05, + "loss": 2.2939, + "step": 14558 + }, + { + "epoch": 0.9771484178383276, + "grad_norm": 3.8534672260284424, + "learning_rate": 5.427146579229121e-05, + "loss": 2.4031, + "step": 14560 + }, + { + "epoch": 0.9772826415220965, + "grad_norm": 4.172222137451172, + "learning_rate": 5.426063711703644e-05, + "loss": 2.0977, + "step": 14562 + }, + { + "epoch": 0.9774168652058656, + "grad_norm": 4.282873630523682, + "learning_rate": 5.424980824047493e-05, + "loss": 1.9751, + "step": 14564 + }, + { + "epoch": 0.9775510888896346, + "grad_norm": 3.37589955329895, + "learning_rate": 5.4238979163118355e-05, + "loss": 2.185, + "step": 14566 + }, + { + "epoch": 0.9776853125734036, + "grad_norm": 3.9945433139801025, + "learning_rate": 5.422814988547834e-05, + "loss": 2.0808, + "step": 14568 + }, + { + "epoch": 0.9778195362571726, + "grad_norm": 5.015462398529053, + "learning_rate": 5.421732040806656e-05, + "loss": 2.1529, + "step": 14570 + }, + { + "epoch": 0.9779537599409416, + "grad_norm": 3.8443875312805176, + "learning_rate": 5.420649073139469e-05, + "loss": 2.1557, + "step": 14572 + }, + { + "epoch": 0.9780879836247106, + "grad_norm": 4.350151538848877, + "learning_rate": 5.419566085597439e-05, + "loss": 2.237, + "step": 14574 + }, + { + "epoch": 0.9782222073084795, + "grad_norm": 4.711680889129639, + "learning_rate": 5.418483078231737e-05, + "loss": 2.2407, + "step": 14576 + }, + { + "epoch": 0.9783564309922486, + "grad_norm": 3.9837307929992676, + "learning_rate": 5.417400051093533e-05, + "loss": 2.2621, + "step": 14578 + }, + { + "epoch": 0.9784906546760176, + "grad_norm": 4.356902599334717, + "learning_rate": 5.416317004233997e-05, + "loss": 2.3896, + "step": 14580 + }, + { + "epoch": 0.9786248783597866, + "grad_norm": 4.295605659484863, + "learning_rate": 5.4152339377043015e-05, + "loss": 2.4578, + "step": 14582 + }, + { + "epoch": 0.9787591020435555, + "grad_norm": 4.2992472648620605, + "learning_rate": 5.41415085155562e-05, + "loss": 2.2646, + "step": 14584 + }, + { + "epoch": 0.9788933257273246, + "grad_norm": 4.095040321350098, + "learning_rate": 5.413067745839123e-05, + "loss": 2.2137, + "step": 14586 + }, + { + "epoch": 0.9790275494110936, + "grad_norm": 4.521244525909424, + "learning_rate": 5.4119846206059876e-05, + "loss": 1.8965, + "step": 14588 + }, + { + "epoch": 0.9791617730948626, + "grad_norm": 3.975860118865967, + "learning_rate": 5.410901475907391e-05, + "loss": 2.4366, + "step": 14590 + }, + { + "epoch": 0.9792959967786316, + "grad_norm": 4.146352291107178, + "learning_rate": 5.409818311794506e-05, + "loss": 2.1553, + "step": 14592 + }, + { + "epoch": 0.9794302204624006, + "grad_norm": 3.8948819637298584, + "learning_rate": 5.4087351283185116e-05, + "loss": 2.205, + "step": 14594 + }, + { + "epoch": 0.9795644441461696, + "grad_norm": 4.614405632019043, + "learning_rate": 5.407651925530587e-05, + "loss": 2.3126, + "step": 14596 + }, + { + "epoch": 0.9796986678299386, + "grad_norm": 4.718899726867676, + "learning_rate": 5.406568703481909e-05, + "loss": 2.0952, + "step": 14598 + }, + { + "epoch": 0.9798328915137076, + "grad_norm": 4.16251802444458, + "learning_rate": 5.4054854622236614e-05, + "loss": 2.4418, + "step": 14600 + }, + { + "epoch": 0.9799671151974766, + "grad_norm": 4.376885890960693, + "learning_rate": 5.4044022018070214e-05, + "loss": 2.4252, + "step": 14602 + }, + { + "epoch": 0.9801013388812456, + "grad_norm": 4.133791446685791, + "learning_rate": 5.4033189222831735e-05, + "loss": 2.2955, + "step": 14604 + }, + { + "epoch": 0.9802355625650147, + "grad_norm": 4.6475677490234375, + "learning_rate": 5.402235623703299e-05, + "loss": 2.263, + "step": 14606 + }, + { + "epoch": 0.9803697862487836, + "grad_norm": 4.115394592285156, + "learning_rate": 5.401152306118582e-05, + "loss": 2.2719, + "step": 14608 + }, + { + "epoch": 0.9805040099325526, + "grad_norm": 3.8109521865844727, + "learning_rate": 5.400068969580209e-05, + "loss": 2.3133, + "step": 14610 + }, + { + "epoch": 0.9806382336163216, + "grad_norm": 4.8505778312683105, + "learning_rate": 5.398985614139361e-05, + "loss": 2.2181, + "step": 14612 + }, + { + "epoch": 0.9807724573000905, + "grad_norm": 3.9344584941864014, + "learning_rate": 5.3979022398472304e-05, + "loss": 2.2292, + "step": 14614 + }, + { + "epoch": 0.9809066809838596, + "grad_norm": 3.987954616546631, + "learning_rate": 5.396818846754999e-05, + "loss": 2.0903, + "step": 14616 + }, + { + "epoch": 0.9810409046676286, + "grad_norm": 4.737724304199219, + "learning_rate": 5.39573543491386e-05, + "loss": 2.3885, + "step": 14618 + }, + { + "epoch": 0.9811751283513976, + "grad_norm": 4.250235557556152, + "learning_rate": 5.3946520043749974e-05, + "loss": 2.2418, + "step": 14620 + }, + { + "epoch": 0.9813093520351666, + "grad_norm": 3.988820791244507, + "learning_rate": 5.3935685551896045e-05, + "loss": 2.3543, + "step": 14622 + }, + { + "epoch": 0.9814435757189356, + "grad_norm": 4.301534175872803, + "learning_rate": 5.392485087408872e-05, + "loss": 2.2727, + "step": 14624 + }, + { + "epoch": 0.9815777994027046, + "grad_norm": 4.12412691116333, + "learning_rate": 5.39140160108399e-05, + "loss": 2.3113, + "step": 14626 + }, + { + "epoch": 0.9817120230864737, + "grad_norm": 4.3144683837890625, + "learning_rate": 5.390318096266152e-05, + "loss": 2.3569, + "step": 14628 + }, + { + "epoch": 0.9818462467702426, + "grad_norm": 3.347561836242676, + "learning_rate": 5.3892345730065506e-05, + "loss": 2.3377, + "step": 14630 + }, + { + "epoch": 0.9819804704540116, + "grad_norm": 3.7607944011688232, + "learning_rate": 5.3881510313563835e-05, + "loss": 2.2383, + "step": 14632 + }, + { + "epoch": 0.9821146941377806, + "grad_norm": 3.6575162410736084, + "learning_rate": 5.387067471366841e-05, + "loss": 2.2693, + "step": 14634 + }, + { + "epoch": 0.9822489178215497, + "grad_norm": 7.373117446899414, + "learning_rate": 5.385983893089123e-05, + "loss": 2.1705, + "step": 14636 + }, + { + "epoch": 0.9823831415053186, + "grad_norm": 4.237438201904297, + "learning_rate": 5.384900296574425e-05, + "loss": 2.5077, + "step": 14638 + }, + { + "epoch": 0.9825173651890876, + "grad_norm": 3.779155731201172, + "learning_rate": 5.383816681873944e-05, + "loss": 2.1408, + "step": 14640 + }, + { + "epoch": 0.9826515888728566, + "grad_norm": 4.888134956359863, + "learning_rate": 5.3827330490388804e-05, + "loss": 2.1211, + "step": 14642 + }, + { + "epoch": 0.9827858125566257, + "grad_norm": 3.7501444816589355, + "learning_rate": 5.381649398120433e-05, + "loss": 2.1335, + "step": 14644 + }, + { + "epoch": 0.9829200362403946, + "grad_norm": 3.8702239990234375, + "learning_rate": 5.380565729169802e-05, + "loss": 2.2733, + "step": 14646 + }, + { + "epoch": 0.9830542599241636, + "grad_norm": 4.0145463943481445, + "learning_rate": 5.379482042238187e-05, + "loss": 2.0712, + "step": 14648 + }, + { + "epoch": 0.9831884836079327, + "grad_norm": 4.332498073577881, + "learning_rate": 5.378398337376793e-05, + "loss": 2.4332, + "step": 14650 + }, + { + "epoch": 0.9833227072917016, + "grad_norm": 4.038681507110596, + "learning_rate": 5.377314614636822e-05, + "loss": 2.2614, + "step": 14652 + }, + { + "epoch": 0.9834569309754706, + "grad_norm": 4.23737907409668, + "learning_rate": 5.3762308740694765e-05, + "loss": 2.1258, + "step": 14654 + }, + { + "epoch": 0.9835911546592396, + "grad_norm": 4.3989152908325195, + "learning_rate": 5.375147115725964e-05, + "loss": 2.2248, + "step": 14656 + }, + { + "epoch": 0.9837253783430087, + "grad_norm": 4.5203986167907715, + "learning_rate": 5.374063339657486e-05, + "loss": 2.186, + "step": 14658 + }, + { + "epoch": 0.9838596020267776, + "grad_norm": 4.075854778289795, + "learning_rate": 5.372979545915252e-05, + "loss": 1.9008, + "step": 14660 + }, + { + "epoch": 0.9839938257105466, + "grad_norm": 3.1221203804016113, + "learning_rate": 5.371895734550467e-05, + "loss": 2.1437, + "step": 14662 + }, + { + "epoch": 0.9841280493943156, + "grad_norm": 3.687537670135498, + "learning_rate": 5.3708119056143404e-05, + "loss": 2.5555, + "step": 14664 + }, + { + "epoch": 0.9842622730780847, + "grad_norm": 4.118560314178467, + "learning_rate": 5.3697280591580804e-05, + "loss": 2.2826, + "step": 14666 + }, + { + "epoch": 0.9843964967618536, + "grad_norm": 4.375885486602783, + "learning_rate": 5.368644195232896e-05, + "loss": 2.2793, + "step": 14668 + }, + { + "epoch": 0.9845307204456226, + "grad_norm": 3.796051025390625, + "learning_rate": 5.367560313889998e-05, + "loss": 2.2699, + "step": 14670 + }, + { + "epoch": 0.9846649441293917, + "grad_norm": 3.6039083003997803, + "learning_rate": 5.366476415180599e-05, + "loss": 2.2225, + "step": 14672 + }, + { + "epoch": 0.9847991678131607, + "grad_norm": 3.9677748680114746, + "learning_rate": 5.365392499155909e-05, + "loss": 2.1994, + "step": 14674 + }, + { + "epoch": 0.9849333914969296, + "grad_norm": 4.272703170776367, + "learning_rate": 5.3643085658671435e-05, + "loss": 2.039, + "step": 14676 + }, + { + "epoch": 0.9850676151806986, + "grad_norm": 4.3886284828186035, + "learning_rate": 5.363224615365513e-05, + "loss": 2.1974, + "step": 14678 + }, + { + "epoch": 0.9852018388644677, + "grad_norm": 4.730482578277588, + "learning_rate": 5.362140647702235e-05, + "loss": 2.1523, + "step": 14680 + }, + { + "epoch": 0.9853360625482367, + "grad_norm": 6.450814247131348, + "learning_rate": 5.361056662928522e-05, + "loss": 2.2492, + "step": 14682 + }, + { + "epoch": 0.9854702862320056, + "grad_norm": 3.818128824234009, + "learning_rate": 5.3599726610955926e-05, + "loss": 2.3523, + "step": 14684 + }, + { + "epoch": 0.9856045099157746, + "grad_norm": 4.164072513580322, + "learning_rate": 5.358888642254663e-05, + "loss": 2.0055, + "step": 14686 + }, + { + "epoch": 0.9857387335995437, + "grad_norm": 4.263340473175049, + "learning_rate": 5.35780460645695e-05, + "loss": 1.9385, + "step": 14688 + }, + { + "epoch": 0.9858729572833126, + "grad_norm": 4.678997993469238, + "learning_rate": 5.3567205537536735e-05, + "loss": 2.4008, + "step": 14690 + }, + { + "epoch": 0.9860071809670816, + "grad_norm": 3.738867998123169, + "learning_rate": 5.3556364841960524e-05, + "loss": 2.398, + "step": 14692 + }, + { + "epoch": 0.9861414046508507, + "grad_norm": 3.961005449295044, + "learning_rate": 5.3545523978353073e-05, + "loss": 2.2888, + "step": 14694 + }, + { + "epoch": 0.9862756283346197, + "grad_norm": 4.564472198486328, + "learning_rate": 5.3534682947226576e-05, + "loss": 2.2419, + "step": 14696 + }, + { + "epoch": 0.9864098520183886, + "grad_norm": 3.618561029434204, + "learning_rate": 5.352384174909325e-05, + "loss": 2.039, + "step": 14698 + }, + { + "epoch": 0.9865440757021576, + "grad_norm": 4.633356094360352, + "learning_rate": 5.351300038446535e-05, + "loss": 2.3377, + "step": 14700 + }, + { + "epoch": 0.9866782993859267, + "grad_norm": 3.9564881324768066, + "learning_rate": 5.350215885385509e-05, + "loss": 2.4243, + "step": 14702 + }, + { + "epoch": 0.9868125230696957, + "grad_norm": 4.06054162979126, + "learning_rate": 5.34913171577747e-05, + "loss": 2.2609, + "step": 14704 + }, + { + "epoch": 0.9869467467534646, + "grad_norm": 4.7519145011901855, + "learning_rate": 5.348047529673644e-05, + "loss": 2.2338, + "step": 14706 + }, + { + "epoch": 0.9870809704372336, + "grad_norm": 4.16979455947876, + "learning_rate": 5.346963327125258e-05, + "loss": 2.5169, + "step": 14708 + }, + { + "epoch": 0.9872151941210027, + "grad_norm": 3.913973569869995, + "learning_rate": 5.3458791081835356e-05, + "loss": 2.2423, + "step": 14710 + }, + { + "epoch": 0.9873494178047717, + "grad_norm": 4.800822734832764, + "learning_rate": 5.3447948728997054e-05, + "loss": 2.4512, + "step": 14712 + }, + { + "epoch": 0.9874836414885406, + "grad_norm": 4.1757636070251465, + "learning_rate": 5.3437106213249964e-05, + "loss": 2.5273, + "step": 14714 + }, + { + "epoch": 0.9876178651723097, + "grad_norm": 4.275753021240234, + "learning_rate": 5.3426263535106355e-05, + "loss": 2.4145, + "step": 14716 + }, + { + "epoch": 0.9877520888560787, + "grad_norm": 3.9549317359924316, + "learning_rate": 5.3415420695078555e-05, + "loss": 2.1598, + "step": 14718 + }, + { + "epoch": 0.9878863125398477, + "grad_norm": 3.4823126792907715, + "learning_rate": 5.3404577693678815e-05, + "loss": 2.2446, + "step": 14720 + }, + { + "epoch": 0.9880205362236166, + "grad_norm": 3.540247917175293, + "learning_rate": 5.3393734531419496e-05, + "loss": 2.0715, + "step": 14722 + }, + { + "epoch": 0.9881547599073857, + "grad_norm": 3.8210275173187256, + "learning_rate": 5.338289120881287e-05, + "loss": 2.12, + "step": 14724 + }, + { + "epoch": 0.9882889835911547, + "grad_norm": 4.4707489013671875, + "learning_rate": 5.33720477263713e-05, + "loss": 2.4876, + "step": 14726 + }, + { + "epoch": 0.9884232072749236, + "grad_norm": 4.236754417419434, + "learning_rate": 5.336120408460711e-05, + "loss": 2.0975, + "step": 14728 + }, + { + "epoch": 0.9885574309586926, + "grad_norm": 4.110838413238525, + "learning_rate": 5.335036028403264e-05, + "loss": 1.9883, + "step": 14730 + }, + { + "epoch": 0.9886916546424617, + "grad_norm": 3.9260473251342773, + "learning_rate": 5.333951632516023e-05, + "loss": 2.2217, + "step": 14732 + }, + { + "epoch": 0.9888258783262307, + "grad_norm": 4.145524501800537, + "learning_rate": 5.332867220850223e-05, + "loss": 2.2853, + "step": 14734 + }, + { + "epoch": 0.9889601020099996, + "grad_norm": 5.046534061431885, + "learning_rate": 5.331782793457102e-05, + "loss": 2.3716, + "step": 14736 + }, + { + "epoch": 0.9890943256937687, + "grad_norm": 3.9444806575775146, + "learning_rate": 5.330698350387897e-05, + "loss": 1.9637, + "step": 14738 + }, + { + "epoch": 0.9892285493775377, + "grad_norm": 3.9877827167510986, + "learning_rate": 5.329613891693843e-05, + "loss": 2.1944, + "step": 14740 + }, + { + "epoch": 0.9893627730613067, + "grad_norm": 3.609973907470703, + "learning_rate": 5.3285294174261836e-05, + "loss": 2.2399, + "step": 14742 + }, + { + "epoch": 0.9894969967450756, + "grad_norm": 3.420736312866211, + "learning_rate": 5.3274449276361535e-05, + "loss": 2.1215, + "step": 14744 + }, + { + "epoch": 0.9896312204288447, + "grad_norm": 3.9124064445495605, + "learning_rate": 5.3263604223749954e-05, + "loss": 2.1603, + "step": 14746 + }, + { + "epoch": 0.9897654441126137, + "grad_norm": 3.940370798110962, + "learning_rate": 5.3252759016939494e-05, + "loss": 2.0313, + "step": 14748 + }, + { + "epoch": 0.9898996677963827, + "grad_norm": 4.218758583068848, + "learning_rate": 5.3241913656442557e-05, + "loss": 2.238, + "step": 14750 + }, + { + "epoch": 0.9900338914801516, + "grad_norm": 3.6178126335144043, + "learning_rate": 5.3231068142771576e-05, + "loss": 2.1066, + "step": 14752 + }, + { + "epoch": 0.9901681151639207, + "grad_norm": 4.30337381362915, + "learning_rate": 5.322022247643897e-05, + "loss": 2.2316, + "step": 14754 + }, + { + "epoch": 0.9903023388476897, + "grad_norm": 4.217620372772217, + "learning_rate": 5.320937665795721e-05, + "loss": 2.2936, + "step": 14756 + }, + { + "epoch": 0.9904365625314587, + "grad_norm": 3.9145500659942627, + "learning_rate": 5.31985306878387e-05, + "loss": 2.1169, + "step": 14758 + }, + { + "epoch": 0.9905707862152276, + "grad_norm": 4.329912185668945, + "learning_rate": 5.31876845665959e-05, + "loss": 2.4195, + "step": 14760 + }, + { + "epoch": 0.9907050098989967, + "grad_norm": 5.430179119110107, + "learning_rate": 5.317683829474127e-05, + "loss": 2.2453, + "step": 14762 + }, + { + "epoch": 0.9908392335827657, + "grad_norm": 4.243077754974365, + "learning_rate": 5.3165991872787276e-05, + "loss": 2.1987, + "step": 14764 + }, + { + "epoch": 0.9909734572665346, + "grad_norm": 3.877866506576538, + "learning_rate": 5.315514530124641e-05, + "loss": 2.0543, + "step": 14766 + }, + { + "epoch": 0.9911076809503037, + "grad_norm": 3.537794351577759, + "learning_rate": 5.3144298580631124e-05, + "loss": 2.0557, + "step": 14768 + }, + { + "epoch": 0.9912419046340727, + "grad_norm": 4.0214104652404785, + "learning_rate": 5.313345171145392e-05, + "loss": 2.3239, + "step": 14770 + }, + { + "epoch": 0.9913761283178417, + "grad_norm": 3.747116804122925, + "learning_rate": 5.3122604694227265e-05, + "loss": 2.0961, + "step": 14772 + }, + { + "epoch": 0.9915103520016106, + "grad_norm": 4.000688552856445, + "learning_rate": 5.311175752946369e-05, + "loss": 2.2331, + "step": 14774 + }, + { + "epoch": 0.9916445756853797, + "grad_norm": 3.526806592941284, + "learning_rate": 5.310091021767569e-05, + "loss": 2.2442, + "step": 14776 + }, + { + "epoch": 0.9917787993691487, + "grad_norm": 4.1016974449157715, + "learning_rate": 5.309006275937578e-05, + "loss": 2.282, + "step": 14778 + }, + { + "epoch": 0.9919130230529177, + "grad_norm": 3.8488059043884277, + "learning_rate": 5.307921515507649e-05, + "loss": 2.3643, + "step": 14780 + }, + { + "epoch": 0.9920472467366866, + "grad_norm": 3.943321466445923, + "learning_rate": 5.306836740529032e-05, + "loss": 2.309, + "step": 14782 + }, + { + "epoch": 0.9921814704204557, + "grad_norm": 5.4918646812438965, + "learning_rate": 5.305751951052984e-05, + "loss": 2.2037, + "step": 14784 + }, + { + "epoch": 0.9923156941042247, + "grad_norm": 4.354452133178711, + "learning_rate": 5.3046671471307566e-05, + "loss": 2.3999, + "step": 14786 + }, + { + "epoch": 0.9924499177879937, + "grad_norm": 3.555433750152588, + "learning_rate": 5.303582328813605e-05, + "loss": 2.047, + "step": 14788 + }, + { + "epoch": 0.9925841414717627, + "grad_norm": 4.474516868591309, + "learning_rate": 5.302497496152787e-05, + "loss": 2.1984, + "step": 14790 + }, + { + "epoch": 0.9927183651555317, + "grad_norm": 4.117682456970215, + "learning_rate": 5.3014126491995566e-05, + "loss": 2.139, + "step": 14792 + }, + { + "epoch": 0.9928525888393007, + "grad_norm": 3.8125007152557373, + "learning_rate": 5.300327788005171e-05, + "loss": 2.1357, + "step": 14794 + }, + { + "epoch": 0.9929868125230698, + "grad_norm": 4.144960880279541, + "learning_rate": 5.2992429126208875e-05, + "loss": 2.399, + "step": 14796 + }, + { + "epoch": 0.9931210362068387, + "grad_norm": 3.9722814559936523, + "learning_rate": 5.2981580230979664e-05, + "loss": 2.1718, + "step": 14798 + }, + { + "epoch": 0.9932552598906077, + "grad_norm": 4.150750160217285, + "learning_rate": 5.297073119487664e-05, + "loss": 2.281, + "step": 14800 + }, + { + "epoch": 0.9933894835743767, + "grad_norm": 3.536834239959717, + "learning_rate": 5.295988201841241e-05, + "loss": 2.1582, + "step": 14802 + }, + { + "epoch": 0.9935237072581456, + "grad_norm": 5.947303295135498, + "learning_rate": 5.294903270209958e-05, + "loss": 2.4298, + "step": 14804 + }, + { + "epoch": 0.9936579309419147, + "grad_norm": 3.787609577178955, + "learning_rate": 5.2938183246450735e-05, + "loss": 2.0718, + "step": 14806 + }, + { + "epoch": 0.9937921546256837, + "grad_norm": 3.7389376163482666, + "learning_rate": 5.2927333651978526e-05, + "loss": 2.0316, + "step": 14808 + }, + { + "epoch": 0.9939263783094527, + "grad_norm": 4.4572343826293945, + "learning_rate": 5.291648391919555e-05, + "loss": 2.29, + "step": 14810 + }, + { + "epoch": 0.9940606019932217, + "grad_norm": 4.3909687995910645, + "learning_rate": 5.290563404861445e-05, + "loss": 2.2996, + "step": 14812 + }, + { + "epoch": 0.9941948256769907, + "grad_norm": 5.230108737945557, + "learning_rate": 5.289478404074786e-05, + "loss": 2.1512, + "step": 14814 + }, + { + "epoch": 0.9943290493607597, + "grad_norm": 3.871673822402954, + "learning_rate": 5.28839338961084e-05, + "loss": 1.9237, + "step": 14816 + }, + { + "epoch": 0.9944632730445288, + "grad_norm": 3.636181592941284, + "learning_rate": 5.287308361520875e-05, + "loss": 2.1011, + "step": 14818 + }, + { + "epoch": 0.9945974967282977, + "grad_norm": 3.7406210899353027, + "learning_rate": 5.2862233198561525e-05, + "loss": 2.3483, + "step": 14820 + }, + { + "epoch": 0.9947317204120667, + "grad_norm": 4.094602584838867, + "learning_rate": 5.285138264667944e-05, + "loss": 2.0078, + "step": 14822 + }, + { + "epoch": 0.9948659440958357, + "grad_norm": 3.2757327556610107, + "learning_rate": 5.284053196007511e-05, + "loss": 2.2798, + "step": 14824 + }, + { + "epoch": 0.9950001677796048, + "grad_norm": 3.8077611923217773, + "learning_rate": 5.282968113926123e-05, + "loss": 2.1551, + "step": 14826 + }, + { + "epoch": 0.9951343914633737, + "grad_norm": 4.552893161773682, + "learning_rate": 5.28188301847505e-05, + "loss": 2.3631, + "step": 14828 + }, + { + "epoch": 0.9952686151471427, + "grad_norm": 3.6551778316497803, + "learning_rate": 5.280797909705557e-05, + "loss": 2.1509, + "step": 14830 + }, + { + "epoch": 0.9954028388309117, + "grad_norm": 4.5980682373046875, + "learning_rate": 5.279712787668917e-05, + "loss": 2.4153, + "step": 14832 + }, + { + "epoch": 0.9955370625146808, + "grad_norm": 4.048924922943115, + "learning_rate": 5.278627652416396e-05, + "loss": 2.2039, + "step": 14834 + }, + { + "epoch": 0.9956712861984497, + "grad_norm": 4.603737831115723, + "learning_rate": 5.277542503999265e-05, + "loss": 2.517, + "step": 14836 + }, + { + "epoch": 0.9958055098822187, + "grad_norm": 4.602569103240967, + "learning_rate": 5.2764573424688e-05, + "loss": 2.2501, + "step": 14838 + }, + { + "epoch": 0.9959397335659878, + "grad_norm": 4.268650054931641, + "learning_rate": 5.275372167876267e-05, + "loss": 2.1334, + "step": 14840 + }, + { + "epoch": 0.9960739572497567, + "grad_norm": 3.8503894805908203, + "learning_rate": 5.274286980272941e-05, + "loss": 2.2108, + "step": 14842 + }, + { + "epoch": 0.9962081809335257, + "grad_norm": 3.8951098918914795, + "learning_rate": 5.273201779710094e-05, + "loss": 2.3311, + "step": 14844 + }, + { + "epoch": 0.9963424046172947, + "grad_norm": 4.303333282470703, + "learning_rate": 5.2721165662390014e-05, + "loss": 2.3684, + "step": 14846 + }, + { + "epoch": 0.9964766283010638, + "grad_norm": 5.184800148010254, + "learning_rate": 5.2710313399109346e-05, + "loss": 2.17, + "step": 14848 + }, + { + "epoch": 0.9966108519848327, + "grad_norm": 4.565430641174316, + "learning_rate": 5.26994610077717e-05, + "loss": 2.3824, + "step": 14850 + }, + { + "epoch": 0.9967450756686017, + "grad_norm": 4.496313571929932, + "learning_rate": 5.2688608488889844e-05, + "loss": 2.3424, + "step": 14852 + }, + { + "epoch": 0.9968792993523707, + "grad_norm": 3.4321541786193848, + "learning_rate": 5.2677755842976516e-05, + "loss": 2.2644, + "step": 14854 + }, + { + "epoch": 0.9970135230361398, + "grad_norm": 3.925825834274292, + "learning_rate": 5.266690307054449e-05, + "loss": 1.9217, + "step": 14856 + }, + { + "epoch": 0.9971477467199087, + "grad_norm": 5.212893009185791, + "learning_rate": 5.265605017210654e-05, + "loss": 2.7019, + "step": 14858 + }, + { + "epoch": 0.9972819704036777, + "grad_norm": 4.050665855407715, + "learning_rate": 5.264519714817544e-05, + "loss": 2.2567, + "step": 14860 + }, + { + "epoch": 0.9974161940874468, + "grad_norm": 3.9940710067749023, + "learning_rate": 5.263434399926398e-05, + "loss": 2.1964, + "step": 14862 + }, + { + "epoch": 0.9975504177712158, + "grad_norm": 4.763870716094971, + "learning_rate": 5.2623490725884936e-05, + "loss": 2.5678, + "step": 14864 + }, + { + "epoch": 0.9976846414549847, + "grad_norm": 4.367804527282715, + "learning_rate": 5.261263732855113e-05, + "loss": 2.2004, + "step": 14866 + }, + { + "epoch": 0.9978188651387537, + "grad_norm": 4.216090202331543, + "learning_rate": 5.260178380777534e-05, + "loss": 2.1682, + "step": 14868 + }, + { + "epoch": 0.9979530888225228, + "grad_norm": 3.684403419494629, + "learning_rate": 5.2590930164070384e-05, + "loss": 2.1828, + "step": 14870 + }, + { + "epoch": 0.9980873125062918, + "grad_norm": 4.386778831481934, + "learning_rate": 5.258007639794907e-05, + "loss": 2.3102, + "step": 14872 + }, + { + "epoch": 0.9982215361900607, + "grad_norm": 3.964515209197998, + "learning_rate": 5.256922250992421e-05, + "loss": 2.1111, + "step": 14874 + }, + { + "epoch": 0.9983557598738297, + "grad_norm": 6.140744686126709, + "learning_rate": 5.255836850050866e-05, + "loss": 2.238, + "step": 14876 + }, + { + "epoch": 0.9984899835575988, + "grad_norm": 6.098419666290283, + "learning_rate": 5.2547514370215214e-05, + "loss": 2.2016, + "step": 14878 + }, + { + "epoch": 0.9986242072413677, + "grad_norm": 3.80700945854187, + "learning_rate": 5.253666011955673e-05, + "loss": 2.034, + "step": 14880 + }, + { + "epoch": 0.9987584309251367, + "grad_norm": 4.243361949920654, + "learning_rate": 5.252580574904603e-05, + "loss": 2.1983, + "step": 14882 + }, + { + "epoch": 0.9988926546089058, + "grad_norm": 4.22919225692749, + "learning_rate": 5.2514951259195985e-05, + "loss": 2.2504, + "step": 14884 + }, + { + "epoch": 0.9990268782926748, + "grad_norm": 4.203723907470703, + "learning_rate": 5.250409665051944e-05, + "loss": 2.7237, + "step": 14886 + }, + { + "epoch": 0.9991611019764437, + "grad_norm": 4.104478359222412, + "learning_rate": 5.2493241923529245e-05, + "loss": 2.401, + "step": 14888 + }, + { + "epoch": 0.9992953256602127, + "grad_norm": 3.648685932159424, + "learning_rate": 5.248238707873828e-05, + "loss": 2.1404, + "step": 14890 + }, + { + "epoch": 0.9994295493439818, + "grad_norm": 4.020578861236572, + "learning_rate": 5.24715321166594e-05, + "loss": 2.1743, + "step": 14892 + }, + { + "epoch": 0.9995637730277508, + "grad_norm": 4.597911834716797, + "learning_rate": 5.246067703780551e-05, + "loss": 2.3279, + "step": 14894 + }, + { + "epoch": 0.9996979967115197, + "grad_norm": 3.9140162467956543, + "learning_rate": 5.244982184268944e-05, + "loss": 2.1018, + "step": 14896 + }, + { + "epoch": 0.9998322203952887, + "grad_norm": 3.53041410446167, + "learning_rate": 5.243896653182412e-05, + "loss": 2.1042, + "step": 14898 + }, + { + "epoch": 0.9999664440790578, + "grad_norm": 3.5951576232910156, + "learning_rate": 5.242811110572242e-05, + "loss": 2.1256, + "step": 14900 + }, + { + "epoch": 1.0000671118418845, + "grad_norm": 3.7789435386657715, + "learning_rate": 5.241725556489724e-05, + "loss": 1.6218, + "step": 14902 + }, + { + "epoch": 1.0002013355256536, + "grad_norm": 3.9234747886657715, + "learning_rate": 5.2406399909861505e-05, + "loss": 2.2949, + "step": 14904 + }, + { + "epoch": 1.0003355592094225, + "grad_norm": 3.874363660812378, + "learning_rate": 5.239554414112809e-05, + "loss": 2.1896, + "step": 14906 + }, + { + "epoch": 1.0004697828931914, + "grad_norm": 3.7774875164031982, + "learning_rate": 5.238468825920994e-05, + "loss": 2.0438, + "step": 14908 + }, + { + "epoch": 1.0006040065769606, + "grad_norm": 4.080749034881592, + "learning_rate": 5.237383226461995e-05, + "loss": 2.3081, + "step": 14910 + }, + { + "epoch": 1.0007382302607295, + "grad_norm": 3.674964427947998, + "learning_rate": 5.236297615787106e-05, + "loss": 2.0098, + "step": 14912 + }, + { + "epoch": 1.0008724539444984, + "grad_norm": 6.775343894958496, + "learning_rate": 5.235211993947617e-05, + "loss": 2.1932, + "step": 14914 + }, + { + "epoch": 1.0010066776282676, + "grad_norm": 4.198841094970703, + "learning_rate": 5.2341263609948254e-05, + "loss": 2.0425, + "step": 14916 + }, + { + "epoch": 1.0011409013120365, + "grad_norm": 3.7309439182281494, + "learning_rate": 5.2330407169800245e-05, + "loss": 1.9539, + "step": 14918 + }, + { + "epoch": 1.0012751249958054, + "grad_norm": 3.954735040664673, + "learning_rate": 5.2319550619545074e-05, + "loss": 2.0932, + "step": 14920 + }, + { + "epoch": 1.0014093486795745, + "grad_norm": 4.426879405975342, + "learning_rate": 5.23086939596957e-05, + "loss": 2.1145, + "step": 14922 + }, + { + "epoch": 1.0015435723633435, + "grad_norm": 3.740556478500366, + "learning_rate": 5.229783719076506e-05, + "loss": 2.0384, + "step": 14924 + }, + { + "epoch": 1.0016777960471126, + "grad_norm": 5.016883850097656, + "learning_rate": 5.2286980313266145e-05, + "loss": 2.1433, + "step": 14926 + }, + { + "epoch": 1.0018120197308815, + "grad_norm": 4.734241008758545, + "learning_rate": 5.227612332771191e-05, + "loss": 2.1396, + "step": 14928 + }, + { + "epoch": 1.0019462434146504, + "grad_norm": 3.696647882461548, + "learning_rate": 5.2265266234615315e-05, + "loss": 2.0758, + "step": 14930 + }, + { + "epoch": 1.0020804670984196, + "grad_norm": 4.279547691345215, + "learning_rate": 5.225440903448936e-05, + "loss": 1.9865, + "step": 14932 + }, + { + "epoch": 1.0022146907821885, + "grad_norm": 4.368074417114258, + "learning_rate": 5.2243551727847e-05, + "loss": 2.0586, + "step": 14934 + }, + { + "epoch": 1.0023489144659574, + "grad_norm": 4.016139030456543, + "learning_rate": 5.223269431520124e-05, + "loss": 2.1004, + "step": 14936 + }, + { + "epoch": 1.0024831381497266, + "grad_norm": 3.96114444732666, + "learning_rate": 5.222183679706506e-05, + "loss": 2.1818, + "step": 14938 + }, + { + "epoch": 1.0026173618334955, + "grad_norm": 3.78045654296875, + "learning_rate": 5.221097917395146e-05, + "loss": 2.1877, + "step": 14940 + }, + { + "epoch": 1.0027515855172646, + "grad_norm": 4.152057647705078, + "learning_rate": 5.220012144637344e-05, + "loss": 1.917, + "step": 14942 + }, + { + "epoch": 1.0028858092010335, + "grad_norm": 4.107029438018799, + "learning_rate": 5.2189263614843995e-05, + "loss": 2.3795, + "step": 14944 + }, + { + "epoch": 1.0030200328848025, + "grad_norm": 4.472391605377197, + "learning_rate": 5.2178405679876174e-05, + "loss": 2.0003, + "step": 14946 + }, + { + "epoch": 1.0031542565685716, + "grad_norm": 3.9355342388153076, + "learning_rate": 5.216754764198295e-05, + "loss": 2.1854, + "step": 14948 + }, + { + "epoch": 1.0032884802523405, + "grad_norm": 4.068041801452637, + "learning_rate": 5.215668950167737e-05, + "loss": 2.2375, + "step": 14950 + }, + { + "epoch": 1.0034227039361094, + "grad_norm": 3.9176363945007324, + "learning_rate": 5.214583125947245e-05, + "loss": 2.0202, + "step": 14952 + }, + { + "epoch": 1.0035569276198786, + "grad_norm": 3.5777463912963867, + "learning_rate": 5.213497291588122e-05, + "loss": 2.139, + "step": 14954 + }, + { + "epoch": 1.0036911513036475, + "grad_norm": 4.177278518676758, + "learning_rate": 5.212411447141673e-05, + "loss": 2.1063, + "step": 14956 + }, + { + "epoch": 1.0038253749874164, + "grad_norm": 3.874234437942505, + "learning_rate": 5.2113255926592e-05, + "loss": 2.1612, + "step": 14958 + }, + { + "epoch": 1.0039595986711856, + "grad_norm": 6.875918865203857, + "learning_rate": 5.210239728192008e-05, + "loss": 2.2583, + "step": 14960 + }, + { + "epoch": 1.0040938223549545, + "grad_norm": 6.063951015472412, + "learning_rate": 5.209153853791402e-05, + "loss": 2.3546, + "step": 14962 + }, + { + "epoch": 1.0042280460387236, + "grad_norm": 4.126222133636475, + "learning_rate": 5.208067969508688e-05, + "loss": 2.2629, + "step": 14964 + }, + { + "epoch": 1.0043622697224925, + "grad_norm": 3.7655539512634277, + "learning_rate": 5.2069820753951726e-05, + "loss": 2.1611, + "step": 14966 + }, + { + "epoch": 1.0044964934062615, + "grad_norm": 4.678374290466309, + "learning_rate": 5.205896171502159e-05, + "loss": 2.4313, + "step": 14968 + }, + { + "epoch": 1.0046307170900306, + "grad_norm": 3.9586234092712402, + "learning_rate": 5.204810257880959e-05, + "loss": 1.8975, + "step": 14970 + }, + { + "epoch": 1.0047649407737995, + "grad_norm": 4.360969543457031, + "learning_rate": 5.203724334582875e-05, + "loss": 2.3749, + "step": 14972 + }, + { + "epoch": 1.0048991644575684, + "grad_norm": 3.9427073001861572, + "learning_rate": 5.202638401659218e-05, + "loss": 2.134, + "step": 14974 + }, + { + "epoch": 1.0050333881413376, + "grad_norm": 3.643672227859497, + "learning_rate": 5.201552459161293e-05, + "loss": 2.0937, + "step": 14976 + }, + { + "epoch": 1.0051676118251065, + "grad_norm": 4.271526336669922, + "learning_rate": 5.200466507140411e-05, + "loss": 2.0707, + "step": 14978 + }, + { + "epoch": 1.0053018355088756, + "grad_norm": 4.330949783325195, + "learning_rate": 5.1993805456478826e-05, + "loss": 2.0611, + "step": 14980 + }, + { + "epoch": 1.0054360591926446, + "grad_norm": 4.171957015991211, + "learning_rate": 5.198294574735014e-05, + "loss": 2.1301, + "step": 14982 + }, + { + "epoch": 1.0055702828764135, + "grad_norm": 3.5762083530426025, + "learning_rate": 5.197208594453118e-05, + "loss": 2.0927, + "step": 14984 + }, + { + "epoch": 1.0057045065601826, + "grad_norm": 3.868055582046509, + "learning_rate": 5.1961226048535026e-05, + "loss": 2.2641, + "step": 14986 + }, + { + "epoch": 1.0058387302439515, + "grad_norm": 3.982924461364746, + "learning_rate": 5.195036605987481e-05, + "loss": 2.2762, + "step": 14988 + }, + { + "epoch": 1.0059729539277205, + "grad_norm": 3.9412505626678467, + "learning_rate": 5.193950597906362e-05, + "loss": 2.3729, + "step": 14990 + }, + { + "epoch": 1.0061071776114896, + "grad_norm": 4.011491298675537, + "learning_rate": 5.192864580661459e-05, + "loss": 2.0071, + "step": 14992 + }, + { + "epoch": 1.0062414012952585, + "grad_norm": 4.337045669555664, + "learning_rate": 5.1917785543040844e-05, + "loss": 2.1596, + "step": 14994 + }, + { + "epoch": 1.0063756249790274, + "grad_norm": 4.364109039306641, + "learning_rate": 5.1906925188855494e-05, + "loss": 2.1398, + "step": 14996 + }, + { + "epoch": 1.0065098486627966, + "grad_norm": 3.9319875240325928, + "learning_rate": 5.189606474457168e-05, + "loss": 2.1866, + "step": 14998 + }, + { + "epoch": 1.0066440723465655, + "grad_norm": 3.9170334339141846, + "learning_rate": 5.188520421070253e-05, + "loss": 2.1519, + "step": 15000 + }, + { + "epoch": 1.0067782960303346, + "grad_norm": 6.434230327606201, + "learning_rate": 5.187434358776121e-05, + "loss": 2.2863, + "step": 15002 + }, + { + "epoch": 1.0069125197141036, + "grad_norm": 4.163902759552002, + "learning_rate": 5.186348287626083e-05, + "loss": 2.3349, + "step": 15004 + }, + { + "epoch": 1.0070467433978725, + "grad_norm": 4.111404895782471, + "learning_rate": 5.185262207671454e-05, + "loss": 2.0599, + "step": 15006 + }, + { + "epoch": 1.0071809670816416, + "grad_norm": 4.061774730682373, + "learning_rate": 5.184176118963551e-05, + "loss": 2.0933, + "step": 15008 + }, + { + "epoch": 1.0073151907654105, + "grad_norm": 4.100977897644043, + "learning_rate": 5.183090021553687e-05, + "loss": 1.8708, + "step": 15010 + }, + { + "epoch": 1.0074494144491795, + "grad_norm": 4.274810314178467, + "learning_rate": 5.182003915493181e-05, + "loss": 1.9338, + "step": 15012 + }, + { + "epoch": 1.0075836381329486, + "grad_norm": 4.0171074867248535, + "learning_rate": 5.180917800833347e-05, + "loss": 2.2501, + "step": 15014 + }, + { + "epoch": 1.0077178618167175, + "grad_norm": 3.9079110622406006, + "learning_rate": 5.179831677625503e-05, + "loss": 2.2229, + "step": 15016 + }, + { + "epoch": 1.0078520855004867, + "grad_norm": 4.131248474121094, + "learning_rate": 5.1787455459209665e-05, + "loss": 2.1998, + "step": 15018 + }, + { + "epoch": 1.0079863091842556, + "grad_norm": 4.423278331756592, + "learning_rate": 5.1776594057710535e-05, + "loss": 2.2538, + "step": 15020 + }, + { + "epoch": 1.0081205328680245, + "grad_norm": 4.568587779998779, + "learning_rate": 5.176573257227084e-05, + "loss": 2.0352, + "step": 15022 + }, + { + "epoch": 1.0082547565517936, + "grad_norm": 3.881840229034424, + "learning_rate": 5.175487100340374e-05, + "loss": 1.9803, + "step": 15024 + }, + { + "epoch": 1.0083889802355626, + "grad_norm": 3.2862401008605957, + "learning_rate": 5.1744009351622434e-05, + "loss": 1.8953, + "step": 15026 + }, + { + "epoch": 1.0085232039193315, + "grad_norm": 4.118152141571045, + "learning_rate": 5.173314761744014e-05, + "loss": 1.8925, + "step": 15028 + }, + { + "epoch": 1.0086574276031006, + "grad_norm": 3.794447660446167, + "learning_rate": 5.172228580137001e-05, + "loss": 2.1778, + "step": 15030 + }, + { + "epoch": 1.0087916512868695, + "grad_norm": 4.024848937988281, + "learning_rate": 5.171142390392526e-05, + "loss": 2.2366, + "step": 15032 + }, + { + "epoch": 1.0089258749706387, + "grad_norm": 3.941161632537842, + "learning_rate": 5.1700561925619096e-05, + "loss": 2.1842, + "step": 15034 + }, + { + "epoch": 1.0090600986544076, + "grad_norm": 3.5762217044830322, + "learning_rate": 5.1689699866964735e-05, + "loss": 2.0157, + "step": 15036 + }, + { + "epoch": 1.0091943223381765, + "grad_norm": 4.067358016967773, + "learning_rate": 5.167883772847537e-05, + "loss": 1.9601, + "step": 15038 + }, + { + "epoch": 1.0093285460219457, + "grad_norm": 4.197263240814209, + "learning_rate": 5.166797551066422e-05, + "loss": 2.248, + "step": 15040 + }, + { + "epoch": 1.0094627697057146, + "grad_norm": 3.96802020072937, + "learning_rate": 5.165711321404453e-05, + "loss": 2.0425, + "step": 15042 + }, + { + "epoch": 1.0095969933894835, + "grad_norm": 3.69063138961792, + "learning_rate": 5.1646250839129476e-05, + "loss": 2.1248, + "step": 15044 + }, + { + "epoch": 1.0097312170732526, + "grad_norm": 3.940708637237549, + "learning_rate": 5.163538838643232e-05, + "loss": 2.0274, + "step": 15046 + }, + { + "epoch": 1.0098654407570216, + "grad_norm": 4.0582170486450195, + "learning_rate": 5.162452585646627e-05, + "loss": 2.1725, + "step": 15048 + }, + { + "epoch": 1.0099996644407905, + "grad_norm": 4.901898384094238, + "learning_rate": 5.161366324974457e-05, + "loss": 2.1162, + "step": 15050 + }, + { + "epoch": 1.0101338881245596, + "grad_norm": 3.9881627559661865, + "learning_rate": 5.160280056678045e-05, + "loss": 2.2635, + "step": 15052 + }, + { + "epoch": 1.0102681118083285, + "grad_norm": 6.271773815155029, + "learning_rate": 5.159193780808716e-05, + "loss": 1.9765, + "step": 15054 + }, + { + "epoch": 1.0104023354920977, + "grad_norm": 4.270569324493408, + "learning_rate": 5.158107497417795e-05, + "loss": 2.1277, + "step": 15056 + }, + { + "epoch": 1.0105365591758666, + "grad_norm": 3.7646403312683105, + "learning_rate": 5.157021206556604e-05, + "loss": 2.2623, + "step": 15058 + }, + { + "epoch": 1.0106707828596355, + "grad_norm": 3.206523895263672, + "learning_rate": 5.155934908276471e-05, + "loss": 1.964, + "step": 15060 + }, + { + "epoch": 1.0108050065434047, + "grad_norm": 3.994358777999878, + "learning_rate": 5.154848602628719e-05, + "loss": 2.105, + "step": 15062 + }, + { + "epoch": 1.0109392302271736, + "grad_norm": 4.04884672164917, + "learning_rate": 5.153762289664676e-05, + "loss": 2.1797, + "step": 15064 + }, + { + "epoch": 1.0110734539109425, + "grad_norm": 5.685296058654785, + "learning_rate": 5.1526759694356665e-05, + "loss": 2.2835, + "step": 15066 + }, + { + "epoch": 1.0112076775947116, + "grad_norm": 3.559008836746216, + "learning_rate": 5.151589641993018e-05, + "loss": 2.1835, + "step": 15068 + }, + { + "epoch": 1.0113419012784806, + "grad_norm": 5.286598205566406, + "learning_rate": 5.150503307388057e-05, + "loss": 2.2262, + "step": 15070 + }, + { + "epoch": 1.0114761249622495, + "grad_norm": 4.023536682128906, + "learning_rate": 5.1494169656721104e-05, + "loss": 2.0911, + "step": 15072 + }, + { + "epoch": 1.0116103486460186, + "grad_norm": 4.345673084259033, + "learning_rate": 5.148330616896506e-05, + "loss": 2.1693, + "step": 15074 + }, + { + "epoch": 1.0117445723297875, + "grad_norm": 4.032179832458496, + "learning_rate": 5.147244261112573e-05, + "loss": 2.1846, + "step": 15076 + }, + { + "epoch": 1.0118787960135567, + "grad_norm": 4.561160087585449, + "learning_rate": 5.1461578983716365e-05, + "loss": 1.9116, + "step": 15078 + }, + { + "epoch": 1.0120130196973256, + "grad_norm": 4.047983646392822, + "learning_rate": 5.145071528725027e-05, + "loss": 2.0343, + "step": 15080 + }, + { + "epoch": 1.0121472433810945, + "grad_norm": 4.015536308288574, + "learning_rate": 5.143985152224073e-05, + "loss": 2.042, + "step": 15082 + }, + { + "epoch": 1.0122814670648637, + "grad_norm": 3.962599515914917, + "learning_rate": 5.1428987689201046e-05, + "loss": 2.046, + "step": 15084 + }, + { + "epoch": 1.0124156907486326, + "grad_norm": 4.230006694793701, + "learning_rate": 5.1418123788644504e-05, + "loss": 2.3499, + "step": 15086 + }, + { + "epoch": 1.0125499144324015, + "grad_norm": 4.5382609367370605, + "learning_rate": 5.1407259821084384e-05, + "loss": 2.2338, + "step": 15088 + }, + { + "epoch": 1.0126841381161706, + "grad_norm": 3.7899622917175293, + "learning_rate": 5.139639578703403e-05, + "loss": 2.3633, + "step": 15090 + }, + { + "epoch": 1.0128183617999396, + "grad_norm": 4.742870807647705, + "learning_rate": 5.1385531687006695e-05, + "loss": 2.1233, + "step": 15092 + }, + { + "epoch": 1.0129525854837087, + "grad_norm": 4.271642684936523, + "learning_rate": 5.137466752151573e-05, + "loss": 2.1601, + "step": 15094 + }, + { + "epoch": 1.0130868091674776, + "grad_norm": 3.7382326126098633, + "learning_rate": 5.136380329107442e-05, + "loss": 2.0497, + "step": 15096 + }, + { + "epoch": 1.0132210328512465, + "grad_norm": 4.1051025390625, + "learning_rate": 5.1352938996196084e-05, + "loss": 2.1474, + "step": 15098 + }, + { + "epoch": 1.0133552565350157, + "grad_norm": 3.694725513458252, + "learning_rate": 5.134207463739404e-05, + "loss": 2.1601, + "step": 15100 + }, + { + "epoch": 1.0134894802187846, + "grad_norm": 4.2169904708862305, + "learning_rate": 5.1331210215181613e-05, + "loss": 2.1553, + "step": 15102 + }, + { + "epoch": 1.0136237039025535, + "grad_norm": 6.213113784790039, + "learning_rate": 5.132034573007213e-05, + "loss": 1.9693, + "step": 15104 + }, + { + "epoch": 1.0137579275863227, + "grad_norm": 4.288989067077637, + "learning_rate": 5.130948118257889e-05, + "loss": 1.9184, + "step": 15106 + }, + { + "epoch": 1.0138921512700916, + "grad_norm": 4.2804718017578125, + "learning_rate": 5.129861657321525e-05, + "loss": 2.2127, + "step": 15108 + }, + { + "epoch": 1.0140263749538605, + "grad_norm": 4.5691070556640625, + "learning_rate": 5.128775190249453e-05, + "loss": 2.1294, + "step": 15110 + }, + { + "epoch": 1.0141605986376296, + "grad_norm": 3.944629669189453, + "learning_rate": 5.1276887170930064e-05, + "loss": 2.2621, + "step": 15112 + }, + { + "epoch": 1.0142948223213986, + "grad_norm": 4.299734592437744, + "learning_rate": 5.1266022379035196e-05, + "loss": 2.1085, + "step": 15114 + }, + { + "epoch": 1.0144290460051677, + "grad_norm": 4.502388954162598, + "learning_rate": 5.125515752732324e-05, + "loss": 2.1254, + "step": 15116 + }, + { + "epoch": 1.0145632696889366, + "grad_norm": 4.3462233543396, + "learning_rate": 5.1244292616307574e-05, + "loss": 2.2892, + "step": 15118 + }, + { + "epoch": 1.0146974933727055, + "grad_norm": 3.6943812370300293, + "learning_rate": 5.123342764650153e-05, + "loss": 1.8669, + "step": 15120 + }, + { + "epoch": 1.0148317170564747, + "grad_norm": 3.6406171321868896, + "learning_rate": 5.122256261841846e-05, + "loss": 2.0014, + "step": 15122 + }, + { + "epoch": 1.0149659407402436, + "grad_norm": 3.9462993144989014, + "learning_rate": 5.12116975325717e-05, + "loss": 2.2203, + "step": 15124 + }, + { + "epoch": 1.0151001644240125, + "grad_norm": 5.532958507537842, + "learning_rate": 5.120083238947462e-05, + "loss": 2.0148, + "step": 15126 + }, + { + "epoch": 1.0152343881077817, + "grad_norm": 4.074473857879639, + "learning_rate": 5.1189967189640566e-05, + "loss": 2.0141, + "step": 15128 + }, + { + "epoch": 1.0153686117915506, + "grad_norm": 4.195079326629639, + "learning_rate": 5.11791019335829e-05, + "loss": 2.2557, + "step": 15130 + }, + { + "epoch": 1.0155028354753197, + "grad_norm": 4.107151985168457, + "learning_rate": 5.116823662181499e-05, + "loss": 2.0548, + "step": 15132 + }, + { + "epoch": 1.0156370591590886, + "grad_norm": 3.293262243270874, + "learning_rate": 5.11573712548502e-05, + "loss": 1.9895, + "step": 15134 + }, + { + "epoch": 1.0157712828428576, + "grad_norm": 4.368526935577393, + "learning_rate": 5.11465058332019e-05, + "loss": 2.0716, + "step": 15136 + }, + { + "epoch": 1.0159055065266267, + "grad_norm": 4.015213489532471, + "learning_rate": 5.113564035738343e-05, + "loss": 2.1546, + "step": 15138 + }, + { + "epoch": 1.0160397302103956, + "grad_norm": 3.9897592067718506, + "learning_rate": 5.1124774827908215e-05, + "loss": 1.9923, + "step": 15140 + }, + { + "epoch": 1.0161739538941645, + "grad_norm": 4.000401020050049, + "learning_rate": 5.111390924528959e-05, + "loss": 2.1636, + "step": 15142 + }, + { + "epoch": 1.0163081775779337, + "grad_norm": 3.7139692306518555, + "learning_rate": 5.1103043610040934e-05, + "loss": 2.0906, + "step": 15144 + }, + { + "epoch": 1.0164424012617026, + "grad_norm": 4.225245475769043, + "learning_rate": 5.1092177922675655e-05, + "loss": 2.2162, + "step": 15146 + }, + { + "epoch": 1.0165766249454715, + "grad_norm": 3.5453405380249023, + "learning_rate": 5.1081312183707095e-05, + "loss": 2.0781, + "step": 15148 + }, + { + "epoch": 1.0167108486292407, + "grad_norm": 4.067018985748291, + "learning_rate": 5.107044639364867e-05, + "loss": 2.1328, + "step": 15150 + }, + { + "epoch": 1.0168450723130096, + "grad_norm": 4.9223246574401855, + "learning_rate": 5.1059580553013764e-05, + "loss": 2.0916, + "step": 15152 + }, + { + "epoch": 1.0169792959967787, + "grad_norm": 4.342743873596191, + "learning_rate": 5.104871466231575e-05, + "loss": 2.0673, + "step": 15154 + }, + { + "epoch": 1.0171135196805476, + "grad_norm": 3.741887331008911, + "learning_rate": 5.103784872206805e-05, + "loss": 2.0502, + "step": 15156 + }, + { + "epoch": 1.0172477433643166, + "grad_norm": 4.045413494110107, + "learning_rate": 5.1026982732784014e-05, + "loss": 2.1068, + "step": 15158 + }, + { + "epoch": 1.0173819670480857, + "grad_norm": 4.908628463745117, + "learning_rate": 5.101611669497708e-05, + "loss": 2.3752, + "step": 15160 + }, + { + "epoch": 1.0175161907318546, + "grad_norm": 3.6025471687316895, + "learning_rate": 5.1005250609160625e-05, + "loss": 1.7549, + "step": 15162 + }, + { + "epoch": 1.0176504144156235, + "grad_norm": 3.5920145511627197, + "learning_rate": 5.099438447584804e-05, + "loss": 2.0603, + "step": 15164 + }, + { + "epoch": 1.0177846380993927, + "grad_norm": 4.061990737915039, + "learning_rate": 5.098351829555277e-05, + "loss": 2.0256, + "step": 15166 + }, + { + "epoch": 1.0179188617831616, + "grad_norm": 3.749736785888672, + "learning_rate": 5.097265206878817e-05, + "loss": 2.041, + "step": 15168 + }, + { + "epoch": 1.0180530854669307, + "grad_norm": 4.597357749938965, + "learning_rate": 5.0961785796067696e-05, + "loss": 2.2221, + "step": 15170 + }, + { + "epoch": 1.0181873091506997, + "grad_norm": 3.9112017154693604, + "learning_rate": 5.095091947790472e-05, + "loss": 2.2188, + "step": 15172 + }, + { + "epoch": 1.0183215328344686, + "grad_norm": 3.883202075958252, + "learning_rate": 5.094005311481268e-05, + "loss": 2.4362, + "step": 15174 + }, + { + "epoch": 1.0184557565182377, + "grad_norm": 3.903162956237793, + "learning_rate": 5.0929186707304964e-05, + "loss": 2.2389, + "step": 15176 + }, + { + "epoch": 1.0185899802020066, + "grad_norm": 3.7419517040252686, + "learning_rate": 5.0918320255895014e-05, + "loss": 2.1386, + "step": 15178 + }, + { + "epoch": 1.0187242038857756, + "grad_norm": 7.164471626281738, + "learning_rate": 5.0907453761096235e-05, + "loss": 2.4574, + "step": 15180 + }, + { + "epoch": 1.0188584275695447, + "grad_norm": 3.6520111560821533, + "learning_rate": 5.0896587223422044e-05, + "loss": 2.1546, + "step": 15182 + }, + { + "epoch": 1.0189926512533136, + "grad_norm": 4.410410404205322, + "learning_rate": 5.088572064338588e-05, + "loss": 2.1898, + "step": 15184 + }, + { + "epoch": 1.0191268749370828, + "grad_norm": 4.306136608123779, + "learning_rate": 5.087485402150115e-05, + "loss": 2.0279, + "step": 15186 + }, + { + "epoch": 1.0192610986208517, + "grad_norm": 4.022566318511963, + "learning_rate": 5.086398735828129e-05, + "loss": 2.1553, + "step": 15188 + }, + { + "epoch": 1.0193953223046206, + "grad_norm": 4.156010150909424, + "learning_rate": 5.085312065423973e-05, + "loss": 2.0854, + "step": 15190 + }, + { + "epoch": 1.0195295459883897, + "grad_norm": 3.8781228065490723, + "learning_rate": 5.084225390988989e-05, + "loss": 1.9698, + "step": 15192 + }, + { + "epoch": 1.0196637696721587, + "grad_norm": 4.226541519165039, + "learning_rate": 5.083138712574522e-05, + "loss": 1.9985, + "step": 15194 + }, + { + "epoch": 1.0197979933559276, + "grad_norm": 4.131373405456543, + "learning_rate": 5.082052030231913e-05, + "loss": 2.1926, + "step": 15196 + }, + { + "epoch": 1.0199322170396967, + "grad_norm": 4.207480430603027, + "learning_rate": 5.080965344012508e-05, + "loss": 2.0202, + "step": 15198 + }, + { + "epoch": 1.0200664407234656, + "grad_norm": 4.251650333404541, + "learning_rate": 5.079878653967649e-05, + "loss": 2.1687, + "step": 15200 + }, + { + "epoch": 1.0202006644072346, + "grad_norm": 4.0163960456848145, + "learning_rate": 5.078791960148681e-05, + "loss": 2.3605, + "step": 15202 + }, + { + "epoch": 1.0203348880910037, + "grad_norm": 3.885429620742798, + "learning_rate": 5.0777052626069476e-05, + "loss": 1.9413, + "step": 15204 + }, + { + "epoch": 1.0204691117747726, + "grad_norm": 3.7201356887817383, + "learning_rate": 5.076618561393793e-05, + "loss": 2.2776, + "step": 15206 + }, + { + "epoch": 1.0206033354585418, + "grad_norm": 3.928723096847534, + "learning_rate": 5.075531856560563e-05, + "loss": 2.3369, + "step": 15208 + }, + { + "epoch": 1.0207375591423107, + "grad_norm": 6.387029647827148, + "learning_rate": 5.074445148158601e-05, + "loss": 2.0826, + "step": 15210 + }, + { + "epoch": 1.0208717828260796, + "grad_norm": 3.990041732788086, + "learning_rate": 5.073358436239252e-05, + "loss": 2.043, + "step": 15212 + }, + { + "epoch": 1.0210060065098487, + "grad_norm": 3.8444838523864746, + "learning_rate": 5.072271720853862e-05, + "loss": 2.2627, + "step": 15214 + }, + { + "epoch": 1.0211402301936177, + "grad_norm": 4.0435686111450195, + "learning_rate": 5.071185002053773e-05, + "loss": 2.1033, + "step": 15216 + }, + { + "epoch": 1.0212744538773866, + "grad_norm": 3.560354709625244, + "learning_rate": 5.0700982798903355e-05, + "loss": 2.0583, + "step": 15218 + }, + { + "epoch": 1.0214086775611557, + "grad_norm": 4.237874984741211, + "learning_rate": 5.06901155441489e-05, + "loss": 2.2901, + "step": 15220 + }, + { + "epoch": 1.0215429012449246, + "grad_norm": 4.011465072631836, + "learning_rate": 5.067924825678785e-05, + "loss": 1.8434, + "step": 15222 + }, + { + "epoch": 1.0216771249286936, + "grad_norm": 3.853816032409668, + "learning_rate": 5.066838093733365e-05, + "loss": 2.0349, + "step": 15224 + }, + { + "epoch": 1.0218113486124627, + "grad_norm": 3.628704071044922, + "learning_rate": 5.065751358629976e-05, + "loss": 1.8952, + "step": 15226 + }, + { + "epoch": 1.0219455722962316, + "grad_norm": 4.217268943786621, + "learning_rate": 5.0646646204199663e-05, + "loss": 1.9516, + "step": 15228 + }, + { + "epoch": 1.0220797959800008, + "grad_norm": 3.798882484436035, + "learning_rate": 5.0635778791546794e-05, + "loss": 1.7713, + "step": 15230 + }, + { + "epoch": 1.0222140196637697, + "grad_norm": 4.619846343994141, + "learning_rate": 5.062491134885463e-05, + "loss": 2.1659, + "step": 15232 + }, + { + "epoch": 1.0223482433475386, + "grad_norm": 3.8267903327941895, + "learning_rate": 5.061404387663663e-05, + "loss": 1.9112, + "step": 15234 + }, + { + "epoch": 1.0224824670313077, + "grad_norm": 5.676225662231445, + "learning_rate": 5.060317637540628e-05, + "loss": 2.2383, + "step": 15236 + }, + { + "epoch": 1.0226166907150767, + "grad_norm": 4.246944904327393, + "learning_rate": 5.059230884567702e-05, + "loss": 2.0435, + "step": 15238 + }, + { + "epoch": 1.0227509143988456, + "grad_norm": 4.0126729011535645, + "learning_rate": 5.058144128796232e-05, + "loss": 2.3204, + "step": 15240 + }, + { + "epoch": 1.0228851380826147, + "grad_norm": 4.181591033935547, + "learning_rate": 5.057057370277568e-05, + "loss": 2.0933, + "step": 15242 + }, + { + "epoch": 1.0230193617663836, + "grad_norm": 4.174458980560303, + "learning_rate": 5.0559706090630546e-05, + "loss": 2.1233, + "step": 15244 + }, + { + "epoch": 1.0231535854501528, + "grad_norm": 4.131515026092529, + "learning_rate": 5.0548838452040407e-05, + "loss": 2.1611, + "step": 15246 + }, + { + "epoch": 1.0232878091339217, + "grad_norm": 4.514529228210449, + "learning_rate": 5.053797078751872e-05, + "loss": 2.3386, + "step": 15248 + }, + { + "epoch": 1.0234220328176906, + "grad_norm": 3.905099391937256, + "learning_rate": 5.052710309757899e-05, + "loss": 2.1385, + "step": 15250 + }, + { + "epoch": 1.0235562565014598, + "grad_norm": 4.074185848236084, + "learning_rate": 5.0516235382734654e-05, + "loss": 1.9881, + "step": 15252 + }, + { + "epoch": 1.0236904801852287, + "grad_norm": 4.548555374145508, + "learning_rate": 5.050536764349921e-05, + "loss": 2.2044, + "step": 15254 + }, + { + "epoch": 1.0238247038689976, + "grad_norm": 3.8043835163116455, + "learning_rate": 5.049449988038616e-05, + "loss": 1.7933, + "step": 15256 + }, + { + "epoch": 1.0239589275527667, + "grad_norm": 4.136840343475342, + "learning_rate": 5.048363209390895e-05, + "loss": 2.2219, + "step": 15258 + }, + { + "epoch": 1.0240931512365357, + "grad_norm": 4.128140449523926, + "learning_rate": 5.047276428458108e-05, + "loss": 2.0897, + "step": 15260 + }, + { + "epoch": 1.0242273749203046, + "grad_norm": 4.94468355178833, + "learning_rate": 5.046189645291601e-05, + "loss": 2.5902, + "step": 15262 + }, + { + "epoch": 1.0243615986040737, + "grad_norm": 4.156492710113525, + "learning_rate": 5.0451028599427265e-05, + "loss": 2.2552, + "step": 15264 + }, + { + "epoch": 1.0244958222878426, + "grad_norm": 4.653423309326172, + "learning_rate": 5.044016072462829e-05, + "loss": 2.1809, + "step": 15266 + }, + { + "epoch": 1.0246300459716118, + "grad_norm": 3.6627097129821777, + "learning_rate": 5.0429292829032596e-05, + "loss": 2.2331, + "step": 15268 + }, + { + "epoch": 1.0247642696553807, + "grad_norm": 3.9174551963806152, + "learning_rate": 5.041842491315365e-05, + "loss": 2.0309, + "step": 15270 + }, + { + "epoch": 1.0248984933391496, + "grad_norm": 4.071387767791748, + "learning_rate": 5.040755697750496e-05, + "loss": 2.1018, + "step": 15272 + }, + { + "epoch": 1.0250327170229188, + "grad_norm": 4.438292980194092, + "learning_rate": 5.03966890226e-05, + "loss": 2.1022, + "step": 15274 + }, + { + "epoch": 1.0251669407066877, + "grad_norm": 4.131013870239258, + "learning_rate": 5.0385821048952255e-05, + "loss": 2.0957, + "step": 15276 + }, + { + "epoch": 1.0253011643904566, + "grad_norm": 3.7589871883392334, + "learning_rate": 5.037495305707524e-05, + "loss": 2.0154, + "step": 15278 + }, + { + "epoch": 1.0254353880742257, + "grad_norm": 3.9743192195892334, + "learning_rate": 5.0364085047482424e-05, + "loss": 2.1152, + "step": 15280 + }, + { + "epoch": 1.0255696117579947, + "grad_norm": 4.399693489074707, + "learning_rate": 5.035321702068731e-05, + "loss": 2.2291, + "step": 15282 + }, + { + "epoch": 1.0257038354417638, + "grad_norm": 3.869724988937378, + "learning_rate": 5.034234897720338e-05, + "loss": 2.2352, + "step": 15284 + }, + { + "epoch": 1.0258380591255327, + "grad_norm": 4.563023090362549, + "learning_rate": 5.0331480917544126e-05, + "loss": 2.0439, + "step": 15286 + }, + { + "epoch": 1.0259722828093016, + "grad_norm": 4.848474502563477, + "learning_rate": 5.032061284222307e-05, + "loss": 2.0449, + "step": 15288 + }, + { + "epoch": 1.0261065064930708, + "grad_norm": 3.8348677158355713, + "learning_rate": 5.0309744751753664e-05, + "loss": 2.1787, + "step": 15290 + }, + { + "epoch": 1.0262407301768397, + "grad_norm": 4.884356498718262, + "learning_rate": 5.029887664664944e-05, + "loss": 2.2166, + "step": 15292 + }, + { + "epoch": 1.0263749538606086, + "grad_norm": 4.260405540466309, + "learning_rate": 5.028800852742388e-05, + "loss": 1.8958, + "step": 15294 + }, + { + "epoch": 1.0265091775443778, + "grad_norm": 4.203830242156982, + "learning_rate": 5.0277140394590484e-05, + "loss": 2.1649, + "step": 15296 + }, + { + "epoch": 1.0266434012281467, + "grad_norm": 3.6228580474853516, + "learning_rate": 5.026627224866276e-05, + "loss": 2.0838, + "step": 15298 + }, + { + "epoch": 1.0267776249119156, + "grad_norm": 4.006748199462891, + "learning_rate": 5.025540409015418e-05, + "loss": 2.0693, + "step": 15300 + }, + { + "epoch": 1.0269118485956847, + "grad_norm": 3.9555771350860596, + "learning_rate": 5.024453591957826e-05, + "loss": 2.5317, + "step": 15302 + }, + { + "epoch": 1.0270460722794537, + "grad_norm": 3.994753122329712, + "learning_rate": 5.023366773744851e-05, + "loss": 2.1266, + "step": 15304 + }, + { + "epoch": 1.0271802959632228, + "grad_norm": 4.043581962585449, + "learning_rate": 5.022279954427841e-05, + "loss": 2.1709, + "step": 15306 + }, + { + "epoch": 1.0273145196469917, + "grad_norm": 3.7223033905029297, + "learning_rate": 5.0211931340581476e-05, + "loss": 2.3031, + "step": 15308 + }, + { + "epoch": 1.0274487433307606, + "grad_norm": 3.6280319690704346, + "learning_rate": 5.020106312687119e-05, + "loss": 1.9959, + "step": 15310 + }, + { + "epoch": 1.0275829670145298, + "grad_norm": 3.8471567630767822, + "learning_rate": 5.0190194903661094e-05, + "loss": 2.1177, + "step": 15312 + }, + { + "epoch": 1.0277171906982987, + "grad_norm": 4.761835098266602, + "learning_rate": 5.0179326671464634e-05, + "loss": 2.3411, + "step": 15314 + }, + { + "epoch": 1.0278514143820676, + "grad_norm": 3.8329663276672363, + "learning_rate": 5.016845843079535e-05, + "loss": 2.2198, + "step": 15316 + }, + { + "epoch": 1.0279856380658368, + "grad_norm": 3.8405768871307373, + "learning_rate": 5.015759018216675e-05, + "loss": 2.0519, + "step": 15318 + }, + { + "epoch": 1.0281198617496057, + "grad_norm": 4.021745681762695, + "learning_rate": 5.01467219260923e-05, + "loss": 2.0349, + "step": 15320 + }, + { + "epoch": 1.0282540854333748, + "grad_norm": 4.016420841217041, + "learning_rate": 5.0135853663085554e-05, + "loss": 1.7857, + "step": 15322 + }, + { + "epoch": 1.0283883091171437, + "grad_norm": 5.6464524269104, + "learning_rate": 5.012498539365996e-05, + "loss": 1.9768, + "step": 15324 + }, + { + "epoch": 1.0285225328009127, + "grad_norm": 4.059455394744873, + "learning_rate": 5.0114117118329075e-05, + "loss": 2.1382, + "step": 15326 + }, + { + "epoch": 1.0286567564846818, + "grad_norm": 3.851208448410034, + "learning_rate": 5.0103248837606375e-05, + "loss": 2.0828, + "step": 15328 + }, + { + "epoch": 1.0287909801684507, + "grad_norm": 3.965332269668579, + "learning_rate": 5.0092380552005355e-05, + "loss": 2.3413, + "step": 15330 + }, + { + "epoch": 1.0289252038522196, + "grad_norm": 4.351446151733398, + "learning_rate": 5.008151226203957e-05, + "loss": 2.3371, + "step": 15332 + }, + { + "epoch": 1.0290594275359888, + "grad_norm": 4.988500118255615, + "learning_rate": 5.007064396822245e-05, + "loss": 2.0442, + "step": 15334 + }, + { + "epoch": 1.0291936512197577, + "grad_norm": 4.033559322357178, + "learning_rate": 5.0059775671067575e-05, + "loss": 2.1593, + "step": 15336 + }, + { + "epoch": 1.0293278749035266, + "grad_norm": 3.4467878341674805, + "learning_rate": 5.0048907371088394e-05, + "loss": 2.1881, + "step": 15338 + }, + { + "epoch": 1.0294620985872958, + "grad_norm": 3.8561317920684814, + "learning_rate": 5.0038039068798446e-05, + "loss": 2.069, + "step": 15340 + }, + { + "epoch": 1.0295963222710647, + "grad_norm": 4.662243366241455, + "learning_rate": 5.002717076471123e-05, + "loss": 2.0826, + "step": 15342 + }, + { + "epoch": 1.0297305459548338, + "grad_norm": 3.840770721435547, + "learning_rate": 5.001630245934024e-05, + "loss": 1.9599, + "step": 15344 + }, + { + "epoch": 1.0298647696386027, + "grad_norm": 3.737920045852661, + "learning_rate": 5.000543415319901e-05, + "loss": 2.1205, + "step": 15346 + }, + { + "epoch": 1.0299989933223717, + "grad_norm": 3.777724266052246, + "learning_rate": 4.999456584680101e-05, + "loss": 1.7585, + "step": 15348 + }, + { + "epoch": 1.0301332170061408, + "grad_norm": 4.478520393371582, + "learning_rate": 4.998369754065976e-05, + "loss": 2.1643, + "step": 15350 + }, + { + "epoch": 1.0302674406899097, + "grad_norm": 3.454615831375122, + "learning_rate": 4.997282923528878e-05, + "loss": 1.9269, + "step": 15352 + }, + { + "epoch": 1.0304016643736786, + "grad_norm": 4.354729175567627, + "learning_rate": 4.996196093120156e-05, + "loss": 1.9991, + "step": 15354 + }, + { + "epoch": 1.0305358880574478, + "grad_norm": 3.9641172885894775, + "learning_rate": 4.995109262891162e-05, + "loss": 1.9811, + "step": 15356 + }, + { + "epoch": 1.0306701117412167, + "grad_norm": 4.126335144042969, + "learning_rate": 4.994022432893243e-05, + "loss": 2.04, + "step": 15358 + }, + { + "epoch": 1.0308043354249858, + "grad_norm": 4.078865051269531, + "learning_rate": 4.9929356031777555e-05, + "loss": 1.8943, + "step": 15360 + }, + { + "epoch": 1.0309385591087548, + "grad_norm": 3.904869556427002, + "learning_rate": 4.991848773796044e-05, + "loss": 2.1199, + "step": 15362 + }, + { + "epoch": 1.0310727827925237, + "grad_norm": 4.379369735717773, + "learning_rate": 4.990761944799465e-05, + "loss": 2.1709, + "step": 15364 + }, + { + "epoch": 1.0312070064762928, + "grad_norm": 3.9980309009552, + "learning_rate": 4.989675116239363e-05, + "loss": 2.0541, + "step": 15366 + }, + { + "epoch": 1.0313412301600617, + "grad_norm": 4.026772499084473, + "learning_rate": 4.988588288167093e-05, + "loss": 2.2639, + "step": 15368 + }, + { + "epoch": 1.0314754538438307, + "grad_norm": 4.847091197967529, + "learning_rate": 4.9875014606340035e-05, + "loss": 2.41, + "step": 15370 + }, + { + "epoch": 1.0316096775275998, + "grad_norm": 4.74475622177124, + "learning_rate": 4.9864146336914465e-05, + "loss": 2.0651, + "step": 15372 + }, + { + "epoch": 1.0317439012113687, + "grad_norm": 4.670109748840332, + "learning_rate": 4.98532780739077e-05, + "loss": 2.3475, + "step": 15374 + }, + { + "epoch": 1.0318781248951376, + "grad_norm": 3.9878427982330322, + "learning_rate": 4.984240981783327e-05, + "loss": 2.1018, + "step": 15376 + }, + { + "epoch": 1.0320123485789068, + "grad_norm": 3.786717653274536, + "learning_rate": 4.9831541569204654e-05, + "loss": 2.0891, + "step": 15378 + }, + { + "epoch": 1.0321465722626757, + "grad_norm": 3.4211676120758057, + "learning_rate": 4.982067332853538e-05, + "loss": 2.0397, + "step": 15380 + }, + { + "epoch": 1.0322807959464448, + "grad_norm": 5.650674343109131, + "learning_rate": 4.980980509633894e-05, + "loss": 2.1039, + "step": 15382 + }, + { + "epoch": 1.0324150196302138, + "grad_norm": 4.127651691436768, + "learning_rate": 4.979893687312881e-05, + "loss": 2.1543, + "step": 15384 + }, + { + "epoch": 1.0325492433139827, + "grad_norm": 3.7530641555786133, + "learning_rate": 4.978806865941855e-05, + "loss": 2.1307, + "step": 15386 + }, + { + "epoch": 1.0326834669977518, + "grad_norm": 4.240579128265381, + "learning_rate": 4.97772004557216e-05, + "loss": 2.0086, + "step": 15388 + }, + { + "epoch": 1.0328176906815207, + "grad_norm": 4.224952697753906, + "learning_rate": 4.9766332262551515e-05, + "loss": 2.4284, + "step": 15390 + }, + { + "epoch": 1.0329519143652897, + "grad_norm": 6.547316551208496, + "learning_rate": 4.975546408042175e-05, + "loss": 1.9714, + "step": 15392 + }, + { + "epoch": 1.0330861380490588, + "grad_norm": 4.187970161437988, + "learning_rate": 4.9744595909845845e-05, + "loss": 2.0739, + "step": 15394 + }, + { + "epoch": 1.0332203617328277, + "grad_norm": 3.8209917545318604, + "learning_rate": 4.973372775133725e-05, + "loss": 2.3734, + "step": 15396 + }, + { + "epoch": 1.0333545854165969, + "grad_norm": 4.269017696380615, + "learning_rate": 4.972285960540954e-05, + "loss": 1.9871, + "step": 15398 + }, + { + "epoch": 1.0334888091003658, + "grad_norm": 3.3950741291046143, + "learning_rate": 4.9711991472576124e-05, + "loss": 1.8377, + "step": 15400 + }, + { + "epoch": 1.0336230327841347, + "grad_norm": 4.268519401550293, + "learning_rate": 4.9701123353350575e-05, + "loss": 2.3262, + "step": 15402 + }, + { + "epoch": 1.0337572564679038, + "grad_norm": 4.897393226623535, + "learning_rate": 4.969025524824634e-05, + "loss": 2.1653, + "step": 15404 + }, + { + "epoch": 1.0338914801516728, + "grad_norm": 3.733123540878296, + "learning_rate": 4.967938715777696e-05, + "loss": 1.9846, + "step": 15406 + }, + { + "epoch": 1.0340257038354417, + "grad_norm": 4.231604099273682, + "learning_rate": 4.9668519082455885e-05, + "loss": 1.8428, + "step": 15408 + }, + { + "epoch": 1.0341599275192108, + "grad_norm": 3.983473062515259, + "learning_rate": 4.965765102279664e-05, + "loss": 2.1671, + "step": 15410 + }, + { + "epoch": 1.0342941512029797, + "grad_norm": 4.038753509521484, + "learning_rate": 4.964678297931271e-05, + "loss": 2.0095, + "step": 15412 + }, + { + "epoch": 1.0344283748867489, + "grad_norm": 4.525862216949463, + "learning_rate": 4.9635914952517594e-05, + "loss": 2.034, + "step": 15414 + }, + { + "epoch": 1.0345625985705178, + "grad_norm": 4.7311811447143555, + "learning_rate": 4.9625046942924774e-05, + "loss": 2.4199, + "step": 15416 + }, + { + "epoch": 1.0346968222542867, + "grad_norm": 3.714322805404663, + "learning_rate": 4.9614178951047756e-05, + "loss": 2.1346, + "step": 15418 + }, + { + "epoch": 1.0348310459380559, + "grad_norm": 4.774411201477051, + "learning_rate": 4.9603310977400006e-05, + "loss": 1.9691, + "step": 15420 + }, + { + "epoch": 1.0349652696218248, + "grad_norm": 4.516318321228027, + "learning_rate": 4.959244302249506e-05, + "loss": 2.1161, + "step": 15422 + }, + { + "epoch": 1.0350994933055937, + "grad_norm": 3.686032772064209, + "learning_rate": 4.958157508684635e-05, + "loss": 1.9209, + "step": 15424 + }, + { + "epoch": 1.0352337169893628, + "grad_norm": 3.8797056674957275, + "learning_rate": 4.957070717096742e-05, + "loss": 1.8913, + "step": 15426 + }, + { + "epoch": 1.0353679406731318, + "grad_norm": 3.7188751697540283, + "learning_rate": 4.955983927537171e-05, + "loss": 2.0816, + "step": 15428 + }, + { + "epoch": 1.0355021643569007, + "grad_norm": 4.194751262664795, + "learning_rate": 4.9548971400572747e-05, + "loss": 2.1484, + "step": 15430 + }, + { + "epoch": 1.0356363880406698, + "grad_norm": 3.823251247406006, + "learning_rate": 4.9538103547083985e-05, + "loss": 2.1887, + "step": 15432 + }, + { + "epoch": 1.0357706117244387, + "grad_norm": 3.9138033390045166, + "learning_rate": 4.952723571541893e-05, + "loss": 2.0965, + "step": 15434 + }, + { + "epoch": 1.0359048354082079, + "grad_norm": 4.174932479858398, + "learning_rate": 4.9516367906091055e-05, + "loss": 2.1204, + "step": 15436 + }, + { + "epoch": 1.0360390590919768, + "grad_norm": 3.5628175735473633, + "learning_rate": 4.950550011961385e-05, + "loss": 2.066, + "step": 15438 + }, + { + "epoch": 1.0361732827757457, + "grad_norm": 4.478567123413086, + "learning_rate": 4.9494632356500785e-05, + "loss": 2.1513, + "step": 15440 + }, + { + "epoch": 1.0363075064595149, + "grad_norm": 3.57936692237854, + "learning_rate": 4.948376461726536e-05, + "loss": 2.1889, + "step": 15442 + }, + { + "epoch": 1.0364417301432838, + "grad_norm": 3.6369903087615967, + "learning_rate": 4.947289690242102e-05, + "loss": 2.2716, + "step": 15444 + }, + { + "epoch": 1.0365759538270527, + "grad_norm": 4.141934871673584, + "learning_rate": 4.9462029212481286e-05, + "loss": 2.2033, + "step": 15446 + }, + { + "epoch": 1.0367101775108218, + "grad_norm": 3.3453400135040283, + "learning_rate": 4.945116154795959e-05, + "loss": 2.0041, + "step": 15448 + }, + { + "epoch": 1.0368444011945908, + "grad_norm": 3.8211312294006348, + "learning_rate": 4.9440293909369465e-05, + "loss": 1.8952, + "step": 15450 + }, + { + "epoch": 1.0369786248783597, + "grad_norm": 3.9080748558044434, + "learning_rate": 4.942942629722434e-05, + "loss": 2.0339, + "step": 15452 + }, + { + "epoch": 1.0371128485621288, + "grad_norm": 3.8182132244110107, + "learning_rate": 4.941855871203769e-05, + "loss": 2.1915, + "step": 15454 + }, + { + "epoch": 1.0372470722458977, + "grad_norm": 3.8758277893066406, + "learning_rate": 4.940769115432301e-05, + "loss": 1.9731, + "step": 15456 + }, + { + "epoch": 1.0373812959296669, + "grad_norm": 4.084555149078369, + "learning_rate": 4.939682362459374e-05, + "loss": 2.0862, + "step": 15458 + }, + { + "epoch": 1.0375155196134358, + "grad_norm": 3.995432138442993, + "learning_rate": 4.938595612336339e-05, + "loss": 2.398, + "step": 15460 + }, + { + "epoch": 1.0376497432972047, + "grad_norm": 3.8904337882995605, + "learning_rate": 4.937508865114537e-05, + "loss": 1.8826, + "step": 15462 + }, + { + "epoch": 1.0377839669809739, + "grad_norm": 3.816039562225342, + "learning_rate": 4.936422120845322e-05, + "loss": 2.2759, + "step": 15464 + }, + { + "epoch": 1.0379181906647428, + "grad_norm": 4.458072185516357, + "learning_rate": 4.935335379580034e-05, + "loss": 1.9181, + "step": 15466 + }, + { + "epoch": 1.0380524143485117, + "grad_norm": 4.13263463973999, + "learning_rate": 4.934248641370025e-05, + "loss": 2.0325, + "step": 15468 + }, + { + "epoch": 1.0381866380322808, + "grad_norm": 3.2751383781433105, + "learning_rate": 4.933161906266636e-05, + "loss": 1.9842, + "step": 15470 + }, + { + "epoch": 1.0383208617160498, + "grad_norm": 3.8670008182525635, + "learning_rate": 4.9320751743212176e-05, + "loss": 2.1101, + "step": 15472 + }, + { + "epoch": 1.038455085399819, + "grad_norm": 4.156954288482666, + "learning_rate": 4.930988445585112e-05, + "loss": 2.2947, + "step": 15474 + }, + { + "epoch": 1.0385893090835878, + "grad_norm": 3.838139057159424, + "learning_rate": 4.929901720109668e-05, + "loss": 2.0697, + "step": 15476 + }, + { + "epoch": 1.0387235327673567, + "grad_norm": 3.9270994663238525, + "learning_rate": 4.928814997946228e-05, + "loss": 2.0991, + "step": 15478 + }, + { + "epoch": 1.0388577564511259, + "grad_norm": 4.311946392059326, + "learning_rate": 4.927728279146141e-05, + "loss": 2.1412, + "step": 15480 + }, + { + "epoch": 1.0389919801348948, + "grad_norm": 4.199912071228027, + "learning_rate": 4.9266415637607494e-05, + "loss": 2.2712, + "step": 15482 + }, + { + "epoch": 1.0391262038186637, + "grad_norm": 3.8418102264404297, + "learning_rate": 4.925554851841401e-05, + "loss": 2.1009, + "step": 15484 + }, + { + "epoch": 1.0392604275024329, + "grad_norm": 3.3836328983306885, + "learning_rate": 4.924468143439437e-05, + "loss": 1.8758, + "step": 15486 + }, + { + "epoch": 1.0393946511862018, + "grad_norm": 4.149127960205078, + "learning_rate": 4.923381438606208e-05, + "loss": 1.9655, + "step": 15488 + }, + { + "epoch": 1.039528874869971, + "grad_norm": 3.404996633529663, + "learning_rate": 4.922294737393053e-05, + "loss": 2.1329, + "step": 15490 + }, + { + "epoch": 1.0396630985537398, + "grad_norm": 4.136187553405762, + "learning_rate": 4.92120803985132e-05, + "loss": 2.104, + "step": 15492 + }, + { + "epoch": 1.0397973222375088, + "grad_norm": 4.4718098640441895, + "learning_rate": 4.920121346032351e-05, + "loss": 2.0791, + "step": 15494 + }, + { + "epoch": 1.039931545921278, + "grad_norm": 4.284339904785156, + "learning_rate": 4.919034655987493e-05, + "loss": 2.0197, + "step": 15496 + }, + { + "epoch": 1.0400657696050468, + "grad_norm": 4.070452690124512, + "learning_rate": 4.917947969768087e-05, + "loss": 2.288, + "step": 15498 + }, + { + "epoch": 1.0401999932888157, + "grad_norm": 4.20653772354126, + "learning_rate": 4.916861287425479e-05, + "loss": 1.9479, + "step": 15500 + }, + { + "epoch": 1.0403342169725849, + "grad_norm": 3.9454939365386963, + "learning_rate": 4.915774609011011e-05, + "loss": 2.0128, + "step": 15502 + }, + { + "epoch": 1.0404684406563538, + "grad_norm": 4.578378677368164, + "learning_rate": 4.914687934576028e-05, + "loss": 2.4327, + "step": 15504 + }, + { + "epoch": 1.0406026643401227, + "grad_norm": 4.033146381378174, + "learning_rate": 4.913601264171871e-05, + "loss": 2.1623, + "step": 15506 + }, + { + "epoch": 1.0407368880238919, + "grad_norm": 4.196930885314941, + "learning_rate": 4.912514597849886e-05, + "loss": 2.0269, + "step": 15508 + }, + { + "epoch": 1.0408711117076608, + "grad_norm": 4.078706741333008, + "learning_rate": 4.911427935661411e-05, + "loss": 1.902, + "step": 15510 + }, + { + "epoch": 1.04100533539143, + "grad_norm": 4.119913101196289, + "learning_rate": 4.910341277657796e-05, + "loss": 2.1752, + "step": 15512 + }, + { + "epoch": 1.0411395590751988, + "grad_norm": 5.234511375427246, + "learning_rate": 4.909254623890376e-05, + "loss": 2.2054, + "step": 15514 + }, + { + "epoch": 1.0412737827589678, + "grad_norm": 3.945582628250122, + "learning_rate": 4.9081679744105e-05, + "loss": 2.2321, + "step": 15516 + }, + { + "epoch": 1.041408006442737, + "grad_norm": 3.949432134628296, + "learning_rate": 4.907081329269503e-05, + "loss": 1.9453, + "step": 15518 + }, + { + "epoch": 1.0415422301265058, + "grad_norm": 4.151355266571045, + "learning_rate": 4.905994688518733e-05, + "loss": 2.3667, + "step": 15520 + }, + { + "epoch": 1.0416764538102747, + "grad_norm": 3.7584006786346436, + "learning_rate": 4.9049080522095296e-05, + "loss": 2.0125, + "step": 15522 + }, + { + "epoch": 1.0418106774940439, + "grad_norm": 3.729097843170166, + "learning_rate": 4.903821420393231e-05, + "loss": 1.9023, + "step": 15524 + }, + { + "epoch": 1.0419449011778128, + "grad_norm": 4.1840667724609375, + "learning_rate": 4.902734793121183e-05, + "loss": 2.0752, + "step": 15526 + }, + { + "epoch": 1.0420791248615817, + "grad_norm": 3.9712769985198975, + "learning_rate": 4.9016481704447245e-05, + "loss": 2.0035, + "step": 15528 + }, + { + "epoch": 1.0422133485453509, + "grad_norm": 4.076865196228027, + "learning_rate": 4.900561552415197e-05, + "loss": 2.0787, + "step": 15530 + }, + { + "epoch": 1.0423475722291198, + "grad_norm": 4.183685779571533, + "learning_rate": 4.899474939083939e-05, + "loss": 2.12, + "step": 15532 + }, + { + "epoch": 1.042481795912889, + "grad_norm": 4.286768436431885, + "learning_rate": 4.8983883305022945e-05, + "loss": 2.4678, + "step": 15534 + }, + { + "epoch": 1.0426160195966578, + "grad_norm": 4.080395221710205, + "learning_rate": 4.8973017267216e-05, + "loss": 2.0815, + "step": 15536 + }, + { + "epoch": 1.0427502432804268, + "grad_norm": 4.091171741485596, + "learning_rate": 4.896215127793198e-05, + "loss": 2.0114, + "step": 15538 + }, + { + "epoch": 1.042884466964196, + "grad_norm": 4.098770618438721, + "learning_rate": 4.895128533768426e-05, + "loss": 2.0338, + "step": 15540 + }, + { + "epoch": 1.0430186906479648, + "grad_norm": 3.953237771987915, + "learning_rate": 4.894041944698626e-05, + "loss": 2.3192, + "step": 15542 + }, + { + "epoch": 1.0431529143317337, + "grad_norm": 4.5695013999938965, + "learning_rate": 4.892955360635134e-05, + "loss": 1.9646, + "step": 15544 + }, + { + "epoch": 1.0432871380155029, + "grad_norm": 4.310586452484131, + "learning_rate": 4.891868781629292e-05, + "loss": 2.0613, + "step": 15546 + }, + { + "epoch": 1.0434213616992718, + "grad_norm": 4.080277442932129, + "learning_rate": 4.890782207732436e-05, + "loss": 2.0956, + "step": 15548 + }, + { + "epoch": 1.043555585383041, + "grad_norm": 4.097259044647217, + "learning_rate": 4.8896956389959084e-05, + "loss": 2.1277, + "step": 15550 + }, + { + "epoch": 1.0436898090668099, + "grad_norm": 4.145259857177734, + "learning_rate": 4.888609075471042e-05, + "loss": 2.2351, + "step": 15552 + }, + { + "epoch": 1.0438240327505788, + "grad_norm": 3.857907772064209, + "learning_rate": 4.88752251720918e-05, + "loss": 1.9815, + "step": 15554 + }, + { + "epoch": 1.043958256434348, + "grad_norm": 3.6020021438598633, + "learning_rate": 4.886435964261656e-05, + "loss": 2.1662, + "step": 15556 + }, + { + "epoch": 1.0440924801181168, + "grad_norm": 3.945493698120117, + "learning_rate": 4.885349416679812e-05, + "loss": 2.2693, + "step": 15558 + }, + { + "epoch": 1.0442267038018858, + "grad_norm": 4.463277339935303, + "learning_rate": 4.884262874514981e-05, + "loss": 2.1762, + "step": 15560 + }, + { + "epoch": 1.044360927485655, + "grad_norm": 4.312330722808838, + "learning_rate": 4.883176337818502e-05, + "loss": 1.9556, + "step": 15562 + }, + { + "epoch": 1.0444951511694238, + "grad_norm": 3.6011910438537598, + "learning_rate": 4.88208980664171e-05, + "loss": 2.0922, + "step": 15564 + }, + { + "epoch": 1.044629374853193, + "grad_norm": 5.511405944824219, + "learning_rate": 4.8810032810359446e-05, + "loss": 2.0603, + "step": 15566 + }, + { + "epoch": 1.0447635985369619, + "grad_norm": 4.087583065032959, + "learning_rate": 4.879916761052539e-05, + "loss": 2.2142, + "step": 15568 + }, + { + "epoch": 1.0448978222207308, + "grad_norm": 4.014772415161133, + "learning_rate": 4.8788302467428316e-05, + "loss": 2.1405, + "step": 15570 + }, + { + "epoch": 1.0450320459045, + "grad_norm": 4.132350444793701, + "learning_rate": 4.877743738158155e-05, + "loss": 2.0685, + "step": 15572 + }, + { + "epoch": 1.0451662695882689, + "grad_norm": 4.32765007019043, + "learning_rate": 4.876657235349848e-05, + "loss": 2.2501, + "step": 15574 + }, + { + "epoch": 1.0453004932720378, + "grad_norm": 3.8142316341400146, + "learning_rate": 4.875570738369242e-05, + "loss": 1.9072, + "step": 15576 + }, + { + "epoch": 1.045434716955807, + "grad_norm": 4.614214897155762, + "learning_rate": 4.8744842472676764e-05, + "loss": 2.3136, + "step": 15578 + }, + { + "epoch": 1.0455689406395758, + "grad_norm": 3.6418204307556152, + "learning_rate": 4.873397762096481e-05, + "loss": 2.0053, + "step": 15580 + }, + { + "epoch": 1.0457031643233448, + "grad_norm": 3.5680627822875977, + "learning_rate": 4.872311282906994e-05, + "loss": 2.0913, + "step": 15582 + }, + { + "epoch": 1.045837388007114, + "grad_norm": 3.9714295864105225, + "learning_rate": 4.871224809750547e-05, + "loss": 1.9334, + "step": 15584 + }, + { + "epoch": 1.0459716116908828, + "grad_norm": 4.48379373550415, + "learning_rate": 4.8701383426784755e-05, + "loss": 1.976, + "step": 15586 + }, + { + "epoch": 1.046105835374652, + "grad_norm": 4.14409065246582, + "learning_rate": 4.869051881742112e-05, + "loss": 2.3383, + "step": 15588 + }, + { + "epoch": 1.0462400590584209, + "grad_norm": 4.3121819496154785, + "learning_rate": 4.867965426992788e-05, + "loss": 2.0512, + "step": 15590 + }, + { + "epoch": 1.0463742827421898, + "grad_norm": 3.4940998554229736, + "learning_rate": 4.86687897848184e-05, + "loss": 2.3349, + "step": 15592 + }, + { + "epoch": 1.046508506425959, + "grad_norm": 3.9161088466644287, + "learning_rate": 4.8657925362605964e-05, + "loss": 2.2324, + "step": 15594 + }, + { + "epoch": 1.0466427301097279, + "grad_norm": 3.5878028869628906, + "learning_rate": 4.8647061003803934e-05, + "loss": 2.1051, + "step": 15596 + }, + { + "epoch": 1.0467769537934968, + "grad_norm": 3.6115376949310303, + "learning_rate": 4.86361967089256e-05, + "loss": 2.1217, + "step": 15598 + }, + { + "epoch": 1.046911177477266, + "grad_norm": 3.987236499786377, + "learning_rate": 4.8625332478484295e-05, + "loss": 1.908, + "step": 15600 + }, + { + "epoch": 1.0470454011610348, + "grad_norm": 3.9282565116882324, + "learning_rate": 4.861446831299332e-05, + "loss": 2.208, + "step": 15602 + }, + { + "epoch": 1.0471796248448038, + "grad_norm": 3.720571517944336, + "learning_rate": 4.8603604212966005e-05, + "loss": 2.2428, + "step": 15604 + }, + { + "epoch": 1.047313848528573, + "grad_norm": 3.9952914714813232, + "learning_rate": 4.859274017891563e-05, + "loss": 2.2128, + "step": 15606 + }, + { + "epoch": 1.0474480722123418, + "grad_norm": 3.8690054416656494, + "learning_rate": 4.858187621135553e-05, + "loss": 2.0364, + "step": 15608 + }, + { + "epoch": 1.047582295896111, + "grad_norm": 3.4409756660461426, + "learning_rate": 4.8571012310798966e-05, + "loss": 1.7695, + "step": 15610 + }, + { + "epoch": 1.0477165195798799, + "grad_norm": 3.682298183441162, + "learning_rate": 4.8560148477759285e-05, + "loss": 2.0395, + "step": 15612 + }, + { + "epoch": 1.0478507432636488, + "grad_norm": 3.4032254219055176, + "learning_rate": 4.854928471274974e-05, + "loss": 1.9651, + "step": 15614 + }, + { + "epoch": 1.047984966947418, + "grad_norm": 5.938292026519775, + "learning_rate": 4.8538421016283646e-05, + "loss": 2.0293, + "step": 15616 + }, + { + "epoch": 1.0481191906311869, + "grad_norm": 3.8988993167877197, + "learning_rate": 4.8527557388874285e-05, + "loss": 2.0594, + "step": 15618 + }, + { + "epoch": 1.0482534143149558, + "grad_norm": 4.267573833465576, + "learning_rate": 4.851669383103495e-05, + "loss": 2.1009, + "step": 15620 + }, + { + "epoch": 1.048387637998725, + "grad_norm": 4.39330530166626, + "learning_rate": 4.85058303432789e-05, + "loss": 2.2097, + "step": 15622 + }, + { + "epoch": 1.0485218616824938, + "grad_norm": 3.7814483642578125, + "learning_rate": 4.849496692611945e-05, + "loss": 2.1945, + "step": 15624 + }, + { + "epoch": 1.048656085366263, + "grad_norm": 3.7469918727874756, + "learning_rate": 4.8484103580069826e-05, + "loss": 2.3376, + "step": 15626 + }, + { + "epoch": 1.048790309050032, + "grad_norm": 3.7638373374938965, + "learning_rate": 4.8473240305643346e-05, + "loss": 2.0636, + "step": 15628 + }, + { + "epoch": 1.0489245327338008, + "grad_norm": 4.420450687408447, + "learning_rate": 4.846237710335325e-05, + "loss": 2.2494, + "step": 15630 + }, + { + "epoch": 1.04905875641757, + "grad_norm": 3.866508722305298, + "learning_rate": 4.845151397371283e-05, + "loss": 2.087, + "step": 15632 + }, + { + "epoch": 1.0491929801013389, + "grad_norm": 3.5889501571655273, + "learning_rate": 4.8440650917235304e-05, + "loss": 1.9877, + "step": 15634 + }, + { + "epoch": 1.0493272037851078, + "grad_norm": 3.9181160926818848, + "learning_rate": 4.8429787934433974e-05, + "loss": 2.1164, + "step": 15636 + }, + { + "epoch": 1.049461427468877, + "grad_norm": 3.6128861904144287, + "learning_rate": 4.841892502582206e-05, + "loss": 2.1756, + "step": 15638 + }, + { + "epoch": 1.0495956511526459, + "grad_norm": 3.6779160499572754, + "learning_rate": 4.840806219191285e-05, + "loss": 1.8238, + "step": 15640 + }, + { + "epoch": 1.0497298748364148, + "grad_norm": 4.083311557769775, + "learning_rate": 4.839719943321954e-05, + "loss": 1.8517, + "step": 15642 + }, + { + "epoch": 1.049864098520184, + "grad_norm": 6.04585599899292, + "learning_rate": 4.8386336750255435e-05, + "loss": 1.9687, + "step": 15644 + }, + { + "epoch": 1.0499983222039528, + "grad_norm": 4.202354431152344, + "learning_rate": 4.837547414353373e-05, + "loss": 2.2516, + "step": 15646 + }, + { + "epoch": 1.050132545887722, + "grad_norm": 4.26218318939209, + "learning_rate": 4.8364611613567686e-05, + "loss": 2.215, + "step": 15648 + }, + { + "epoch": 1.050266769571491, + "grad_norm": 4.045238971710205, + "learning_rate": 4.835374916087053e-05, + "loss": 2.0957, + "step": 15650 + }, + { + "epoch": 1.0504009932552598, + "grad_norm": 7.1992387771606445, + "learning_rate": 4.8342886785955484e-05, + "loss": 2.3783, + "step": 15652 + }, + { + "epoch": 1.050535216939029, + "grad_norm": 3.7214248180389404, + "learning_rate": 4.8332024489335775e-05, + "loss": 2.1296, + "step": 15654 + }, + { + "epoch": 1.0506694406227979, + "grad_norm": 3.5150132179260254, + "learning_rate": 4.8321162271524634e-05, + "loss": 2.1256, + "step": 15656 + }, + { + "epoch": 1.0508036643065668, + "grad_norm": 4.00270938873291, + "learning_rate": 4.831030013303528e-05, + "loss": 2.0943, + "step": 15658 + }, + { + "epoch": 1.050937887990336, + "grad_norm": 4.227464199066162, + "learning_rate": 4.829943807438091e-05, + "loss": 1.9823, + "step": 15660 + }, + { + "epoch": 1.0510721116741049, + "grad_norm": 3.9993624687194824, + "learning_rate": 4.828857609607476e-05, + "loss": 2.0479, + "step": 15662 + }, + { + "epoch": 1.051206335357874, + "grad_norm": 3.91644024848938, + "learning_rate": 4.827771419863001e-05, + "loss": 2.0291, + "step": 15664 + }, + { + "epoch": 1.051340559041643, + "grad_norm": 4.23579216003418, + "learning_rate": 4.826685238255989e-05, + "loss": 2.0038, + "step": 15666 + }, + { + "epoch": 1.0514747827254118, + "grad_norm": 4.557195663452148, + "learning_rate": 4.825599064837758e-05, + "loss": 2.302, + "step": 15668 + }, + { + "epoch": 1.051609006409181, + "grad_norm": 4.210406303405762, + "learning_rate": 4.824512899659628e-05, + "loss": 2.0747, + "step": 15670 + }, + { + "epoch": 1.05174323009295, + "grad_norm": 3.9077773094177246, + "learning_rate": 4.823426742772917e-05, + "loss": 2.2019, + "step": 15672 + }, + { + "epoch": 1.0518774537767188, + "grad_norm": 4.158426284790039, + "learning_rate": 4.822340594228948e-05, + "loss": 1.9691, + "step": 15674 + }, + { + "epoch": 1.052011677460488, + "grad_norm": 4.798840045928955, + "learning_rate": 4.821254454079035e-05, + "loss": 2.2957, + "step": 15676 + }, + { + "epoch": 1.0521459011442569, + "grad_norm": 4.056008815765381, + "learning_rate": 4.820168322374498e-05, + "loss": 2.1922, + "step": 15678 + }, + { + "epoch": 1.0522801248280258, + "grad_norm": 3.7606358528137207, + "learning_rate": 4.819082199166654e-05, + "loss": 1.8764, + "step": 15680 + }, + { + "epoch": 1.052414348511795, + "grad_norm": 4.802450180053711, + "learning_rate": 4.8179960845068204e-05, + "loss": 2.2057, + "step": 15682 + }, + { + "epoch": 1.0525485721955639, + "grad_norm": 3.3192920684814453, + "learning_rate": 4.816909978446314e-05, + "loss": 1.8396, + "step": 15684 + }, + { + "epoch": 1.052682795879333, + "grad_norm": 4.2731757164001465, + "learning_rate": 4.815823881036451e-05, + "loss": 2.0238, + "step": 15686 + }, + { + "epoch": 1.052817019563102, + "grad_norm": 3.8090009689331055, + "learning_rate": 4.814737792328547e-05, + "loss": 2.0724, + "step": 15688 + }, + { + "epoch": 1.0529512432468708, + "grad_norm": 4.001204490661621, + "learning_rate": 4.8136517123739197e-05, + "loss": 2.065, + "step": 15690 + }, + { + "epoch": 1.05308546693064, + "grad_norm": 4.312192916870117, + "learning_rate": 4.8125656412238805e-05, + "loss": 2.3294, + "step": 15692 + }, + { + "epoch": 1.053219690614409, + "grad_norm": 6.070675373077393, + "learning_rate": 4.8114795789297476e-05, + "loss": 1.9615, + "step": 15694 + }, + { + "epoch": 1.0533539142981778, + "grad_norm": 4.198002815246582, + "learning_rate": 4.810393525542833e-05, + "loss": 2.1217, + "step": 15696 + }, + { + "epoch": 1.053488137981947, + "grad_norm": 4.027871608734131, + "learning_rate": 4.8093074811144525e-05, + "loss": 2.1549, + "step": 15698 + }, + { + "epoch": 1.0536223616657159, + "grad_norm": 4.299331188201904, + "learning_rate": 4.808221445695916e-05, + "loss": 2.3281, + "step": 15700 + }, + { + "epoch": 1.053756585349485, + "grad_norm": 2.911587953567505, + "learning_rate": 4.8071354193385424e-05, + "loss": 1.8729, + "step": 15702 + }, + { + "epoch": 1.053890809033254, + "grad_norm": 4.913629531860352, + "learning_rate": 4.806049402093638e-05, + "loss": 1.9534, + "step": 15704 + }, + { + "epoch": 1.0540250327170229, + "grad_norm": 3.598148822784424, + "learning_rate": 4.80496339401252e-05, + "loss": 2.1912, + "step": 15706 + }, + { + "epoch": 1.054159256400792, + "grad_norm": 3.693141222000122, + "learning_rate": 4.803877395146497e-05, + "loss": 1.8693, + "step": 15708 + }, + { + "epoch": 1.054293480084561, + "grad_norm": 4.772548675537109, + "learning_rate": 4.8027914055468826e-05, + "loss": 2.2446, + "step": 15710 + }, + { + "epoch": 1.0544277037683298, + "grad_norm": 3.9823038578033447, + "learning_rate": 4.8017054252649856e-05, + "loss": 2.0814, + "step": 15712 + }, + { + "epoch": 1.054561927452099, + "grad_norm": 4.2692437171936035, + "learning_rate": 4.800619454352118e-05, + "loss": 2.1008, + "step": 15714 + }, + { + "epoch": 1.054696151135868, + "grad_norm": 3.717489242553711, + "learning_rate": 4.799533492859588e-05, + "loss": 2.0688, + "step": 15716 + }, + { + "epoch": 1.0548303748196368, + "grad_norm": 3.780982494354248, + "learning_rate": 4.798447540838708e-05, + "loss": 2.3295, + "step": 15718 + }, + { + "epoch": 1.054964598503406, + "grad_norm": 4.572207927703857, + "learning_rate": 4.797361598340784e-05, + "loss": 2.2083, + "step": 15720 + }, + { + "epoch": 1.0550988221871749, + "grad_norm": 3.372490644454956, + "learning_rate": 4.7962756654171264e-05, + "loss": 1.9661, + "step": 15722 + }, + { + "epoch": 1.055233045870944, + "grad_norm": 4.061065673828125, + "learning_rate": 4.795189742119043e-05, + "loss": 1.9884, + "step": 15724 + }, + { + "epoch": 1.055367269554713, + "grad_norm": 3.8447861671447754, + "learning_rate": 4.794103828497842e-05, + "loss": 2.3456, + "step": 15726 + }, + { + "epoch": 1.0555014932384819, + "grad_norm": 4.166133880615234, + "learning_rate": 4.79301792460483e-05, + "loss": 1.901, + "step": 15728 + }, + { + "epoch": 1.055635716922251, + "grad_norm": 4.022586822509766, + "learning_rate": 4.791932030491313e-05, + "loss": 2.176, + "step": 15730 + }, + { + "epoch": 1.05576994060602, + "grad_norm": 4.096128940582275, + "learning_rate": 4.7908461462086e-05, + "loss": 2.1676, + "step": 15732 + }, + { + "epoch": 1.0559041642897888, + "grad_norm": 3.1035640239715576, + "learning_rate": 4.789760271807993e-05, + "loss": 2.1243, + "step": 15734 + }, + { + "epoch": 1.056038387973558, + "grad_norm": 4.163211822509766, + "learning_rate": 4.788674407340803e-05, + "loss": 2.0864, + "step": 15736 + }, + { + "epoch": 1.056172611657327, + "grad_norm": 4.042501449584961, + "learning_rate": 4.787588552858328e-05, + "loss": 2.1384, + "step": 15738 + }, + { + "epoch": 1.056306835341096, + "grad_norm": 3.730881452560425, + "learning_rate": 4.7865027084118805e-05, + "loss": 1.7871, + "step": 15740 + }, + { + "epoch": 1.056441059024865, + "grad_norm": 12.899774551391602, + "learning_rate": 4.785416874052756e-05, + "loss": 1.946, + "step": 15742 + }, + { + "epoch": 1.0565752827086339, + "grad_norm": 3.888071298599243, + "learning_rate": 4.7843310498322646e-05, + "loss": 1.9867, + "step": 15744 + }, + { + "epoch": 1.056709506392403, + "grad_norm": 5.069614887237549, + "learning_rate": 4.783245235801706e-05, + "loss": 2.1491, + "step": 15746 + }, + { + "epoch": 1.056843730076172, + "grad_norm": 3.620002031326294, + "learning_rate": 4.7821594320123844e-05, + "loss": 1.9213, + "step": 15748 + }, + { + "epoch": 1.0569779537599409, + "grad_norm": 4.183504104614258, + "learning_rate": 4.781073638515601e-05, + "loss": 2.0615, + "step": 15750 + }, + { + "epoch": 1.05711217744371, + "grad_norm": 4.347829818725586, + "learning_rate": 4.779987855362658e-05, + "loss": 2.2938, + "step": 15752 + }, + { + "epoch": 1.057246401127479, + "grad_norm": 4.149467468261719, + "learning_rate": 4.778902082604856e-05, + "loss": 2.0133, + "step": 15754 + }, + { + "epoch": 1.0573806248112478, + "grad_norm": 3.802290916442871, + "learning_rate": 4.777816320293496e-05, + "loss": 1.9547, + "step": 15756 + }, + { + "epoch": 1.057514848495017, + "grad_norm": 3.6006202697753906, + "learning_rate": 4.7767305684798775e-05, + "loss": 2.1805, + "step": 15758 + }, + { + "epoch": 1.057649072178786, + "grad_norm": 4.01051664352417, + "learning_rate": 4.7756448272153014e-05, + "loss": 2.1167, + "step": 15760 + }, + { + "epoch": 1.057783295862555, + "grad_norm": 3.8275437355041504, + "learning_rate": 4.774559096551064e-05, + "loss": 2.1626, + "step": 15762 + }, + { + "epoch": 1.057917519546324, + "grad_norm": 4.206202507019043, + "learning_rate": 4.773473376538469e-05, + "loss": 2.0954, + "step": 15764 + }, + { + "epoch": 1.0580517432300929, + "grad_norm": 4.646594524383545, + "learning_rate": 4.772387667228808e-05, + "loss": 2.0631, + "step": 15766 + }, + { + "epoch": 1.058185966913862, + "grad_norm": 3.927537441253662, + "learning_rate": 4.771301968673386e-05, + "loss": 1.952, + "step": 15768 + }, + { + "epoch": 1.058320190597631, + "grad_norm": 4.06552791595459, + "learning_rate": 4.770216280923493e-05, + "loss": 2.0579, + "step": 15770 + }, + { + "epoch": 1.0584544142813999, + "grad_norm": 3.8255834579467773, + "learning_rate": 4.7691306040304306e-05, + "loss": 1.7994, + "step": 15772 + }, + { + "epoch": 1.058588637965169, + "grad_norm": 3.8319809436798096, + "learning_rate": 4.768044938045493e-05, + "loss": 2.0371, + "step": 15774 + }, + { + "epoch": 1.058722861648938, + "grad_norm": 4.235959529876709, + "learning_rate": 4.766959283019976e-05, + "loss": 2.0587, + "step": 15776 + }, + { + "epoch": 1.058857085332707, + "grad_norm": 4.212389945983887, + "learning_rate": 4.765873639005174e-05, + "loss": 2.0474, + "step": 15778 + }, + { + "epoch": 1.058991309016476, + "grad_norm": 4.077500820159912, + "learning_rate": 4.764788006052383e-05, + "loss": 2.1307, + "step": 15780 + }, + { + "epoch": 1.059125532700245, + "grad_norm": 4.176285743713379, + "learning_rate": 4.763702384212896e-05, + "loss": 2.127, + "step": 15782 + }, + { + "epoch": 1.059259756384014, + "grad_norm": 4.045559883117676, + "learning_rate": 4.762616773538007e-05, + "loss": 2.1178, + "step": 15784 + }, + { + "epoch": 1.059393980067783, + "grad_norm": 4.212821006774902, + "learning_rate": 4.7615311740790074e-05, + "loss": 1.9854, + "step": 15786 + }, + { + "epoch": 1.0595282037515519, + "grad_norm": 3.772578477859497, + "learning_rate": 4.760445585887192e-05, + "loss": 2.0737, + "step": 15788 + }, + { + "epoch": 1.059662427435321, + "grad_norm": 4.630712985992432, + "learning_rate": 4.759360009013849e-05, + "loss": 1.982, + "step": 15790 + }, + { + "epoch": 1.05979665111909, + "grad_norm": 4.045752048492432, + "learning_rate": 4.7582744435102774e-05, + "loss": 1.9086, + "step": 15792 + }, + { + "epoch": 1.0599308748028589, + "grad_norm": 4.309189319610596, + "learning_rate": 4.7571888894277604e-05, + "loss": 2.0715, + "step": 15794 + }, + { + "epoch": 1.060065098486628, + "grad_norm": 4.363103866577148, + "learning_rate": 4.756103346817589e-05, + "loss": 2.0334, + "step": 15796 + }, + { + "epoch": 1.060199322170397, + "grad_norm": 4.649811267852783, + "learning_rate": 4.755017815731059e-05, + "loss": 2.3607, + "step": 15798 + }, + { + "epoch": 1.060333545854166, + "grad_norm": 4.820392608642578, + "learning_rate": 4.753932296219451e-05, + "loss": 2.3547, + "step": 15800 + }, + { + "epoch": 1.060467769537935, + "grad_norm": 4.342525005340576, + "learning_rate": 4.7528467883340615e-05, + "loss": 2.1319, + "step": 15802 + }, + { + "epoch": 1.060601993221704, + "grad_norm": 4.308598518371582, + "learning_rate": 4.751761292126173e-05, + "loss": 2.2346, + "step": 15804 + }, + { + "epoch": 1.060736216905473, + "grad_norm": 4.611044406890869, + "learning_rate": 4.750675807647077e-05, + "loss": 1.9597, + "step": 15806 + }, + { + "epoch": 1.060870440589242, + "grad_norm": 4.135756015777588, + "learning_rate": 4.749590334948057e-05, + "loss": 2.0548, + "step": 15808 + }, + { + "epoch": 1.0610046642730109, + "grad_norm": 4.3338303565979, + "learning_rate": 4.7485048740804026e-05, + "loss": 2.0284, + "step": 15810 + }, + { + "epoch": 1.06113888795678, + "grad_norm": 4.429969787597656, + "learning_rate": 4.747419425095398e-05, + "loss": 1.9782, + "step": 15812 + }, + { + "epoch": 1.061273111640549, + "grad_norm": 3.9365615844726562, + "learning_rate": 4.74633398804433e-05, + "loss": 1.847, + "step": 15814 + }, + { + "epoch": 1.061407335324318, + "grad_norm": 3.62101674079895, + "learning_rate": 4.7452485629784804e-05, + "loss": 1.9251, + "step": 15816 + }, + { + "epoch": 1.061541559008087, + "grad_norm": 4.554169654846191, + "learning_rate": 4.744163149949136e-05, + "loss": 2.1271, + "step": 15818 + }, + { + "epoch": 1.061675782691856, + "grad_norm": 4.532477378845215, + "learning_rate": 4.74307774900758e-05, + "loss": 2.079, + "step": 15820 + }, + { + "epoch": 1.061810006375625, + "grad_norm": 4.738977909088135, + "learning_rate": 4.7419923602050954e-05, + "loss": 2.1788, + "step": 15822 + }, + { + "epoch": 1.061944230059394, + "grad_norm": 4.03950834274292, + "learning_rate": 4.740906983592962e-05, + "loss": 2.0628, + "step": 15824 + }, + { + "epoch": 1.062078453743163, + "grad_norm": 4.568528652191162, + "learning_rate": 4.739821619222468e-05, + "loss": 2.0124, + "step": 15826 + }, + { + "epoch": 1.062212677426932, + "grad_norm": 4.180507183074951, + "learning_rate": 4.738736267144887e-05, + "loss": 2.0417, + "step": 15828 + }, + { + "epoch": 1.062346901110701, + "grad_norm": 3.924548864364624, + "learning_rate": 4.7376509274115075e-05, + "loss": 2.128, + "step": 15830 + }, + { + "epoch": 1.0624811247944699, + "grad_norm": 5.29783296585083, + "learning_rate": 4.736565600073602e-05, + "loss": 1.8158, + "step": 15832 + }, + { + "epoch": 1.062615348478239, + "grad_norm": 4.58465051651001, + "learning_rate": 4.735480285182457e-05, + "loss": 2.1153, + "step": 15834 + }, + { + "epoch": 1.062749572162008, + "grad_norm": 4.009093284606934, + "learning_rate": 4.7343949827893465e-05, + "loss": 2.1705, + "step": 15836 + }, + { + "epoch": 1.062883795845777, + "grad_norm": 4.356628894805908, + "learning_rate": 4.733309692945552e-05, + "loss": 2.2422, + "step": 15838 + }, + { + "epoch": 1.063018019529546, + "grad_norm": 3.8464438915252686, + "learning_rate": 4.732224415702349e-05, + "loss": 2.0549, + "step": 15840 + }, + { + "epoch": 1.063152243213315, + "grad_norm": 3.8532023429870605, + "learning_rate": 4.731139151111017e-05, + "loss": 2.0818, + "step": 15842 + }, + { + "epoch": 1.063286466897084, + "grad_norm": 4.187089443206787, + "learning_rate": 4.7300538992228295e-05, + "loss": 1.9758, + "step": 15844 + }, + { + "epoch": 1.063420690580853, + "grad_norm": 3.507282257080078, + "learning_rate": 4.7289686600890666e-05, + "loss": 2.0085, + "step": 15846 + }, + { + "epoch": 1.063554914264622, + "grad_norm": 4.108996391296387, + "learning_rate": 4.727883433761e-05, + "loss": 2.1659, + "step": 15848 + }, + { + "epoch": 1.063689137948391, + "grad_norm": 3.9412834644317627, + "learning_rate": 4.726798220289907e-05, + "loss": 2.2691, + "step": 15850 + }, + { + "epoch": 1.06382336163216, + "grad_norm": 3.9574170112609863, + "learning_rate": 4.7257130197270586e-05, + "loss": 2.0423, + "step": 15852 + }, + { + "epoch": 1.063957585315929, + "grad_norm": 4.022784233093262, + "learning_rate": 4.724627832123735e-05, + "loss": 2.3129, + "step": 15854 + }, + { + "epoch": 1.064091808999698, + "grad_norm": 3.6794145107269287, + "learning_rate": 4.7235426575312004e-05, + "loss": 2.0925, + "step": 15856 + }, + { + "epoch": 1.064226032683467, + "grad_norm": 4.147182941436768, + "learning_rate": 4.722457496000735e-05, + "loss": 1.946, + "step": 15858 + }, + { + "epoch": 1.064360256367236, + "grad_norm": 4.102267265319824, + "learning_rate": 4.721372347583604e-05, + "loss": 2.074, + "step": 15860 + }, + { + "epoch": 1.064494480051005, + "grad_norm": 4.1602606773376465, + "learning_rate": 4.7202872123310845e-05, + "loss": 2.0632, + "step": 15862 + }, + { + "epoch": 1.064628703734774, + "grad_norm": 4.436313152313232, + "learning_rate": 4.7192020902944446e-05, + "loss": 2.1524, + "step": 15864 + }, + { + "epoch": 1.064762927418543, + "grad_norm": 3.904359817504883, + "learning_rate": 4.71811698152495e-05, + "loss": 1.9484, + "step": 15866 + }, + { + "epoch": 1.064897151102312, + "grad_norm": 4.041442394256592, + "learning_rate": 4.7170318860738774e-05, + "loss": 2.0379, + "step": 15868 + }, + { + "epoch": 1.0650313747860811, + "grad_norm": 3.9930484294891357, + "learning_rate": 4.71594680399249e-05, + "loss": 2.2358, + "step": 15870 + }, + { + "epoch": 1.06516559846985, + "grad_norm": 4.076245307922363, + "learning_rate": 4.714861735332058e-05, + "loss": 1.9263, + "step": 15872 + }, + { + "epoch": 1.065299822153619, + "grad_norm": 3.5305583477020264, + "learning_rate": 4.713776680143848e-05, + "loss": 2.0212, + "step": 15874 + }, + { + "epoch": 1.065434045837388, + "grad_norm": 3.9087092876434326, + "learning_rate": 4.7126916384791274e-05, + "loss": 1.8878, + "step": 15876 + }, + { + "epoch": 1.065568269521157, + "grad_norm": 4.453912258148193, + "learning_rate": 4.7116066103891615e-05, + "loss": 2.0356, + "step": 15878 + }, + { + "epoch": 1.065702493204926, + "grad_norm": 3.913515329360962, + "learning_rate": 4.710521595925217e-05, + "loss": 2.1189, + "step": 15880 + }, + { + "epoch": 1.065836716888695, + "grad_norm": 4.5253119468688965, + "learning_rate": 4.709436595138557e-05, + "loss": 2.0431, + "step": 15882 + }, + { + "epoch": 1.065970940572464, + "grad_norm": 4.283291339874268, + "learning_rate": 4.708351608080447e-05, + "loss": 2.3691, + "step": 15884 + }, + { + "epoch": 1.066105164256233, + "grad_norm": 4.159745216369629, + "learning_rate": 4.7072666348021485e-05, + "loss": 2.1864, + "step": 15886 + }, + { + "epoch": 1.066239387940002, + "grad_norm": 4.70057487487793, + "learning_rate": 4.706181675354929e-05, + "loss": 2.141, + "step": 15888 + }, + { + "epoch": 1.066373611623771, + "grad_norm": 3.3206496238708496, + "learning_rate": 4.7050967297900434e-05, + "loss": 1.9411, + "step": 15890 + }, + { + "epoch": 1.0665078353075401, + "grad_norm": 4.210582733154297, + "learning_rate": 4.704011798158762e-05, + "loss": 2.1405, + "step": 15892 + }, + { + "epoch": 1.066642058991309, + "grad_norm": 4.958913803100586, + "learning_rate": 4.702926880512337e-05, + "loss": 2.2655, + "step": 15894 + }, + { + "epoch": 1.066776282675078, + "grad_norm": 3.712484836578369, + "learning_rate": 4.701841976902035e-05, + "loss": 1.9542, + "step": 15896 + }, + { + "epoch": 1.066910506358847, + "grad_norm": 3.7328202724456787, + "learning_rate": 4.700757087379112e-05, + "loss": 1.8636, + "step": 15898 + }, + { + "epoch": 1.067044730042616, + "grad_norm": 4.027070045471191, + "learning_rate": 4.69967221199483e-05, + "loss": 2.3697, + "step": 15900 + }, + { + "epoch": 1.067178953726385, + "grad_norm": 3.8679592609405518, + "learning_rate": 4.698587350800444e-05, + "loss": 2.0148, + "step": 15902 + }, + { + "epoch": 1.067313177410154, + "grad_norm": 4.067104339599609, + "learning_rate": 4.697502503847214e-05, + "loss": 1.992, + "step": 15904 + }, + { + "epoch": 1.067447401093923, + "grad_norm": 3.475123643875122, + "learning_rate": 4.696417671186395e-05, + "loss": 2.0485, + "step": 15906 + }, + { + "epoch": 1.067581624777692, + "grad_norm": 4.200191974639893, + "learning_rate": 4.6953328528692446e-05, + "loss": 2.1778, + "step": 15908 + }, + { + "epoch": 1.067715848461461, + "grad_norm": 4.255040168762207, + "learning_rate": 4.694248048947017e-05, + "loss": 1.818, + "step": 15910 + }, + { + "epoch": 1.06785007214523, + "grad_norm": 3.97406005859375, + "learning_rate": 4.693163259470969e-05, + "loss": 2.0223, + "step": 15912 + }, + { + "epoch": 1.0679842958289991, + "grad_norm": 4.189444541931152, + "learning_rate": 4.692078484492351e-05, + "loss": 2.0169, + "step": 15914 + }, + { + "epoch": 1.068118519512768, + "grad_norm": 7.686283111572266, + "learning_rate": 4.6909937240624236e-05, + "loss": 2.2024, + "step": 15916 + }, + { + "epoch": 1.068252743196537, + "grad_norm": 4.441576957702637, + "learning_rate": 4.689908978232431e-05, + "loss": 2.1852, + "step": 15918 + }, + { + "epoch": 1.068386966880306, + "grad_norm": 4.466763973236084, + "learning_rate": 4.688824247053633e-05, + "loss": 1.977, + "step": 15920 + }, + { + "epoch": 1.068521190564075, + "grad_norm": 3.7961270809173584, + "learning_rate": 4.6877395305772733e-05, + "loss": 1.8774, + "step": 15922 + }, + { + "epoch": 1.068655414247844, + "grad_norm": 4.037917613983154, + "learning_rate": 4.68665482885461e-05, + "loss": 2.2129, + "step": 15924 + }, + { + "epoch": 1.068789637931613, + "grad_norm": 4.45274543762207, + "learning_rate": 4.685570141936888e-05, + "loss": 2.3233, + "step": 15926 + }, + { + "epoch": 1.068923861615382, + "grad_norm": 4.633610725402832, + "learning_rate": 4.684485469875359e-05, + "loss": 2.1946, + "step": 15928 + }, + { + "epoch": 1.069058085299151, + "grad_norm": 4.184717655181885, + "learning_rate": 4.6834008127212715e-05, + "loss": 1.8802, + "step": 15930 + }, + { + "epoch": 1.06919230898292, + "grad_norm": 4.695326328277588, + "learning_rate": 4.682316170525874e-05, + "loss": 2.1451, + "step": 15932 + }, + { + "epoch": 1.069326532666689, + "grad_norm": 4.053454875946045, + "learning_rate": 4.681231543340412e-05, + "loss": 1.996, + "step": 15934 + }, + { + "epoch": 1.0694607563504581, + "grad_norm": 3.867274522781372, + "learning_rate": 4.680146931216132e-05, + "loss": 2.0832, + "step": 15936 + }, + { + "epoch": 1.069594980034227, + "grad_norm": 4.766475200653076, + "learning_rate": 4.679062334204281e-05, + "loss": 2.0481, + "step": 15938 + }, + { + "epoch": 1.069729203717996, + "grad_norm": 3.9486889839172363, + "learning_rate": 4.677977752356103e-05, + "loss": 2.0719, + "step": 15940 + }, + { + "epoch": 1.069863427401765, + "grad_norm": 5.9075493812561035, + "learning_rate": 4.676893185722844e-05, + "loss": 2.1417, + "step": 15942 + }, + { + "epoch": 1.069997651085534, + "grad_norm": 4.4630327224731445, + "learning_rate": 4.675808634355746e-05, + "loss": 2.1196, + "step": 15944 + }, + { + "epoch": 1.0701318747693032, + "grad_norm": 4.508720397949219, + "learning_rate": 4.674724098306053e-05, + "loss": 2.0761, + "step": 15946 + }, + { + "epoch": 1.070266098453072, + "grad_norm": 3.895442247390747, + "learning_rate": 4.6736395776250065e-05, + "loss": 1.994, + "step": 15948 + }, + { + "epoch": 1.070400322136841, + "grad_norm": 3.8768327236175537, + "learning_rate": 4.6725550723638484e-05, + "loss": 2.1635, + "step": 15950 + }, + { + "epoch": 1.0705345458206101, + "grad_norm": 4.503876209259033, + "learning_rate": 4.671470582573817e-05, + "loss": 2.0571, + "step": 15952 + }, + { + "epoch": 1.070668769504379, + "grad_norm": 4.252975940704346, + "learning_rate": 4.670386108306159e-05, + "loss": 2.1863, + "step": 15954 + }, + { + "epoch": 1.070802993188148, + "grad_norm": 3.711691379547119, + "learning_rate": 4.669301649612105e-05, + "loss": 2.027, + "step": 15956 + }, + { + "epoch": 1.0709372168719171, + "grad_norm": 4.1056647300720215, + "learning_rate": 4.6682172065428994e-05, + "loss": 2.1972, + "step": 15958 + }, + { + "epoch": 1.071071440555686, + "grad_norm": 4.414778232574463, + "learning_rate": 4.667132779149778e-05, + "loss": 2.28, + "step": 15960 + }, + { + "epoch": 1.071205664239455, + "grad_norm": 4.00264310836792, + "learning_rate": 4.6660483674839794e-05, + "loss": 1.9793, + "step": 15962 + }, + { + "epoch": 1.071339887923224, + "grad_norm": 4.029606342315674, + "learning_rate": 4.664963971596737e-05, + "loss": 2.1417, + "step": 15964 + }, + { + "epoch": 1.071474111606993, + "grad_norm": 4.130021572113037, + "learning_rate": 4.66387959153929e-05, + "loss": 2.1224, + "step": 15966 + }, + { + "epoch": 1.0716083352907622, + "grad_norm": 4.1108174324035645, + "learning_rate": 4.6627952273628705e-05, + "loss": 2.0009, + "step": 15968 + }, + { + "epoch": 1.071742558974531, + "grad_norm": 4.342737197875977, + "learning_rate": 4.6617108791187137e-05, + "loss": 2.2877, + "step": 15970 + }, + { + "epoch": 1.0718767826583, + "grad_norm": 4.155842304229736, + "learning_rate": 4.6606265468580516e-05, + "loss": 2.0337, + "step": 15972 + }, + { + "epoch": 1.0720110063420691, + "grad_norm": 4.086789608001709, + "learning_rate": 4.6595422306321196e-05, + "loss": 2.1034, + "step": 15974 + }, + { + "epoch": 1.072145230025838, + "grad_norm": 3.8458282947540283, + "learning_rate": 4.658457930492145e-05, + "loss": 2.2384, + "step": 15976 + }, + { + "epoch": 1.072279453709607, + "grad_norm": 3.6733222007751465, + "learning_rate": 4.657373646489365e-05, + "loss": 2.0758, + "step": 15978 + }, + { + "epoch": 1.0724136773933761, + "grad_norm": 4.611215114593506, + "learning_rate": 4.656289378675003e-05, + "loss": 1.9678, + "step": 15980 + }, + { + "epoch": 1.072547901077145, + "grad_norm": 3.85707950592041, + "learning_rate": 4.655205127100296e-05, + "loss": 1.885, + "step": 15982 + }, + { + "epoch": 1.072682124760914, + "grad_norm": 4.147252559661865, + "learning_rate": 4.654120891816464e-05, + "loss": 2.214, + "step": 15984 + }, + { + "epoch": 1.072816348444683, + "grad_norm": 4.394071102142334, + "learning_rate": 4.653036672874743e-05, + "loss": 2.2422, + "step": 15986 + }, + { + "epoch": 1.072950572128452, + "grad_norm": 3.7963624000549316, + "learning_rate": 4.651952470326356e-05, + "loss": 1.8308, + "step": 15988 + }, + { + "epoch": 1.0730847958122212, + "grad_norm": 5.06090784072876, + "learning_rate": 4.6508682842225305e-05, + "loss": 2.1534, + "step": 15990 + }, + { + "epoch": 1.07321901949599, + "grad_norm": 4.297322750091553, + "learning_rate": 4.6497841146144916e-05, + "loss": 2.2357, + "step": 15992 + }, + { + "epoch": 1.073353243179759, + "grad_norm": 4.325428009033203, + "learning_rate": 4.6486999615534656e-05, + "loss": 2.1343, + "step": 15994 + }, + { + "epoch": 1.0734874668635281, + "grad_norm": 4.4975457191467285, + "learning_rate": 4.6476158250906743e-05, + "loss": 2.2396, + "step": 15996 + }, + { + "epoch": 1.073621690547297, + "grad_norm": 4.3018059730529785, + "learning_rate": 4.6465317052773436e-05, + "loss": 2.0226, + "step": 15998 + }, + { + "epoch": 1.073755914231066, + "grad_norm": 6.7828288078308105, + "learning_rate": 4.645447602164695e-05, + "loss": 2.0585, + "step": 16000 + }, + { + "epoch": 1.0738901379148351, + "grad_norm": 4.454293727874756, + "learning_rate": 4.644363515803949e-05, + "loss": 2.0074, + "step": 16002 + }, + { + "epoch": 1.074024361598604, + "grad_norm": 4.367403030395508, + "learning_rate": 4.643279446246328e-05, + "loss": 2.296, + "step": 16004 + }, + { + "epoch": 1.074158585282373, + "grad_norm": 4.316980361938477, + "learning_rate": 4.6421953935430514e-05, + "loss": 2.0003, + "step": 16006 + }, + { + "epoch": 1.074292808966142, + "grad_norm": 4.091136932373047, + "learning_rate": 4.641111357745339e-05, + "loss": 2.1415, + "step": 16008 + }, + { + "epoch": 1.074427032649911, + "grad_norm": 4.111385822296143, + "learning_rate": 4.6400273389044086e-05, + "loss": 2.0301, + "step": 16010 + }, + { + "epoch": 1.0745612563336802, + "grad_norm": 4.62166166305542, + "learning_rate": 4.6389433370714795e-05, + "loss": 2.0852, + "step": 16012 + }, + { + "epoch": 1.074695480017449, + "grad_norm": 7.592520236968994, + "learning_rate": 4.637859352297766e-05, + "loss": 2.1842, + "step": 16014 + }, + { + "epoch": 1.074829703701218, + "grad_norm": 4.153733253479004, + "learning_rate": 4.636775384634489e-05, + "loss": 1.936, + "step": 16016 + }, + { + "epoch": 1.0749639273849871, + "grad_norm": 4.116402626037598, + "learning_rate": 4.635691434132858e-05, + "loss": 1.9987, + "step": 16018 + }, + { + "epoch": 1.075098151068756, + "grad_norm": 3.7315685749053955, + "learning_rate": 4.634607500844092e-05, + "loss": 2.048, + "step": 16020 + }, + { + "epoch": 1.0752323747525252, + "grad_norm": 3.902836799621582, + "learning_rate": 4.633523584819402e-05, + "loss": 2.1504, + "step": 16022 + }, + { + "epoch": 1.0753665984362941, + "grad_norm": 3.8383994102478027, + "learning_rate": 4.632439686110003e-05, + "loss": 1.8692, + "step": 16024 + }, + { + "epoch": 1.075500822120063, + "grad_norm": 4.082756519317627, + "learning_rate": 4.631355804767105e-05, + "loss": 2.0555, + "step": 16026 + }, + { + "epoch": 1.0756350458038322, + "grad_norm": 3.8638052940368652, + "learning_rate": 4.6302719408419214e-05, + "loss": 1.9952, + "step": 16028 + }, + { + "epoch": 1.075769269487601, + "grad_norm": 4.210520267486572, + "learning_rate": 4.629188094385661e-05, + "loss": 2.2862, + "step": 16030 + }, + { + "epoch": 1.07590349317137, + "grad_norm": 3.621293783187866, + "learning_rate": 4.6281042654495346e-05, + "loss": 2.1072, + "step": 16032 + }, + { + "epoch": 1.0760377168551392, + "grad_norm": 3.527984619140625, + "learning_rate": 4.627020454084749e-05, + "loss": 2.1104, + "step": 16034 + }, + { + "epoch": 1.076171940538908, + "grad_norm": 4.147447109222412, + "learning_rate": 4.6259366603425153e-05, + "loss": 1.9708, + "step": 16036 + }, + { + "epoch": 1.076306164222677, + "grad_norm": 4.1989898681640625, + "learning_rate": 4.624852884274037e-05, + "loss": 2.0998, + "step": 16038 + }, + { + "epoch": 1.0764403879064461, + "grad_norm": 4.3323655128479, + "learning_rate": 4.623769125930524e-05, + "loss": 2.0228, + "step": 16040 + }, + { + "epoch": 1.076574611590215, + "grad_norm": 4.142357349395752, + "learning_rate": 4.6226853853631775e-05, + "loss": 2.1275, + "step": 16042 + }, + { + "epoch": 1.0767088352739842, + "grad_norm": 4.487778663635254, + "learning_rate": 4.621601662623208e-05, + "loss": 2.0733, + "step": 16044 + }, + { + "epoch": 1.0768430589577531, + "grad_norm": 3.597740650177002, + "learning_rate": 4.6205179577618125e-05, + "loss": 1.9806, + "step": 16046 + }, + { + "epoch": 1.076977282641522, + "grad_norm": 4.385641098022461, + "learning_rate": 4.619434270830199e-05, + "loss": 1.9591, + "step": 16048 + }, + { + "epoch": 1.0771115063252912, + "grad_norm": 3.9902114868164062, + "learning_rate": 4.6183506018795674e-05, + "loss": 1.9798, + "step": 16050 + }, + { + "epoch": 1.07724573000906, + "grad_norm": 3.869234561920166, + "learning_rate": 4.61726695096112e-05, + "loss": 1.749, + "step": 16052 + }, + { + "epoch": 1.077379953692829, + "grad_norm": 4.26558256149292, + "learning_rate": 4.616183318126056e-05, + "loss": 1.945, + "step": 16054 + }, + { + "epoch": 1.0775141773765982, + "grad_norm": 5.865242958068848, + "learning_rate": 4.615099703425576e-05, + "loss": 2.0644, + "step": 16056 + }, + { + "epoch": 1.077648401060367, + "grad_norm": 3.6878573894500732, + "learning_rate": 4.614016106910877e-05, + "loss": 2.1004, + "step": 16058 + }, + { + "epoch": 1.077782624744136, + "grad_norm": 3.414153814315796, + "learning_rate": 4.6129325286331594e-05, + "loss": 2.0623, + "step": 16060 + }, + { + "epoch": 1.0779168484279051, + "grad_norm": 4.130207061767578, + "learning_rate": 4.611848968643618e-05, + "loss": 2.0506, + "step": 16062 + }, + { + "epoch": 1.078051072111674, + "grad_norm": 4.593921661376953, + "learning_rate": 4.61076542699345e-05, + "loss": 2.1254, + "step": 16064 + }, + { + "epoch": 1.0781852957954432, + "grad_norm": 4.419042587280273, + "learning_rate": 4.609681903733848e-05, + "loss": 2.0687, + "step": 16066 + }, + { + "epoch": 1.0783195194792121, + "grad_norm": 3.912442207336426, + "learning_rate": 4.608598398916012e-05, + "loss": 2.1223, + "step": 16068 + }, + { + "epoch": 1.078453743162981, + "grad_norm": 4.849279403686523, + "learning_rate": 4.607514912591131e-05, + "loss": 2.1971, + "step": 16070 + }, + { + "epoch": 1.0785879668467502, + "grad_norm": 4.650923252105713, + "learning_rate": 4.6064314448103974e-05, + "loss": 1.9707, + "step": 16072 + }, + { + "epoch": 1.078722190530519, + "grad_norm": 4.055800914764404, + "learning_rate": 4.605347995625005e-05, + "loss": 1.9861, + "step": 16074 + }, + { + "epoch": 1.078856414214288, + "grad_norm": 4.487636566162109, + "learning_rate": 4.604264565086141e-05, + "loss": 1.8674, + "step": 16076 + }, + { + "epoch": 1.0789906378980572, + "grad_norm": 4.130455017089844, + "learning_rate": 4.6031811532450025e-05, + "loss": 2.4515, + "step": 16078 + }, + { + "epoch": 1.079124861581826, + "grad_norm": 4.0925068855285645, + "learning_rate": 4.602097760152771e-05, + "loss": 2.2096, + "step": 16080 + }, + { + "epoch": 1.079259085265595, + "grad_norm": 3.8733909130096436, + "learning_rate": 4.601014385860639e-05, + "loss": 1.8768, + "step": 16082 + }, + { + "epoch": 1.0793933089493641, + "grad_norm": 3.964332342147827, + "learning_rate": 4.599931030419793e-05, + "loss": 2.2433, + "step": 16084 + }, + { + "epoch": 1.079527532633133, + "grad_norm": 4.327771186828613, + "learning_rate": 4.598847693881419e-05, + "loss": 2.2434, + "step": 16086 + }, + { + "epoch": 1.0796617563169022, + "grad_norm": 3.7350053787231445, + "learning_rate": 4.5977643762967024e-05, + "loss": 2.0027, + "step": 16088 + }, + { + "epoch": 1.0797959800006711, + "grad_norm": 3.9047751426696777, + "learning_rate": 4.596681077716828e-05, + "loss": 2.069, + "step": 16090 + }, + { + "epoch": 1.07993020368444, + "grad_norm": 4.003503799438477, + "learning_rate": 4.59559779819298e-05, + "loss": 2.1063, + "step": 16092 + }, + { + "epoch": 1.0800644273682092, + "grad_norm": 4.227535724639893, + "learning_rate": 4.594514537776341e-05, + "loss": 2.3219, + "step": 16094 + }, + { + "epoch": 1.080198651051978, + "grad_norm": 3.9154019355773926, + "learning_rate": 4.5934312965180916e-05, + "loss": 2.1904, + "step": 16096 + }, + { + "epoch": 1.0803328747357472, + "grad_norm": 3.527905225753784, + "learning_rate": 4.5923480744694156e-05, + "loss": 2.1051, + "step": 16098 + }, + { + "epoch": 1.0804670984195162, + "grad_norm": 3.8489246368408203, + "learning_rate": 4.5912648716814896e-05, + "loss": 2.3325, + "step": 16100 + }, + { + "epoch": 1.080601322103285, + "grad_norm": 4.081704139709473, + "learning_rate": 4.590181688205496e-05, + "loss": 1.8343, + "step": 16102 + }, + { + "epoch": 1.0807355457870542, + "grad_norm": 4.001219749450684, + "learning_rate": 4.58909852409261e-05, + "loss": 2.0632, + "step": 16104 + }, + { + "epoch": 1.0808697694708231, + "grad_norm": 3.9102137088775635, + "learning_rate": 4.5880153793940136e-05, + "loss": 2.1048, + "step": 16106 + }, + { + "epoch": 1.081003993154592, + "grad_norm": 4.188068866729736, + "learning_rate": 4.5869322541608775e-05, + "loss": 2.051, + "step": 16108 + }, + { + "epoch": 1.0811382168383612, + "grad_norm": 3.4991886615753174, + "learning_rate": 4.5858491484443815e-05, + "loss": 1.7848, + "step": 16110 + }, + { + "epoch": 1.0812724405221301, + "grad_norm": 3.9602808952331543, + "learning_rate": 4.584766062295698e-05, + "loss": 2.0227, + "step": 16112 + }, + { + "epoch": 1.081406664205899, + "grad_norm": 4.29440450668335, + "learning_rate": 4.583682995766003e-05, + "loss": 2.1695, + "step": 16114 + }, + { + "epoch": 1.0815408878896682, + "grad_norm": 4.1367506980896, + "learning_rate": 4.5825999489064676e-05, + "loss": 2.2515, + "step": 16116 + }, + { + "epoch": 1.081675111573437, + "grad_norm": 4.15057373046875, + "learning_rate": 4.581516921768263e-05, + "loss": 2.0967, + "step": 16118 + }, + { + "epoch": 1.0818093352572062, + "grad_norm": 4.406524181365967, + "learning_rate": 4.580433914402562e-05, + "loss": 2.0171, + "step": 16120 + }, + { + "epoch": 1.0819435589409752, + "grad_norm": 4.077412128448486, + "learning_rate": 4.579350926860533e-05, + "loss": 2.2588, + "step": 16122 + }, + { + "epoch": 1.082077782624744, + "grad_norm": 3.8568649291992188, + "learning_rate": 4.5782679591933444e-05, + "loss": 1.9682, + "step": 16124 + }, + { + "epoch": 1.0822120063085132, + "grad_norm": 3.855736494064331, + "learning_rate": 4.577185011452168e-05, + "loss": 2.0174, + "step": 16126 + }, + { + "epoch": 1.0823462299922821, + "grad_norm": 5.432003021240234, + "learning_rate": 4.576102083688165e-05, + "loss": 1.8708, + "step": 16128 + }, + { + "epoch": 1.082480453676051, + "grad_norm": 4.003406047821045, + "learning_rate": 4.575019175952508e-05, + "loss": 2.3092, + "step": 16130 + }, + { + "epoch": 1.0826146773598202, + "grad_norm": 3.9854254722595215, + "learning_rate": 4.573936288296356e-05, + "loss": 2.0951, + "step": 16132 + }, + { + "epoch": 1.0827489010435891, + "grad_norm": 4.104317665100098, + "learning_rate": 4.57285342077088e-05, + "loss": 1.9629, + "step": 16134 + }, + { + "epoch": 1.082883124727358, + "grad_norm": 4.287196159362793, + "learning_rate": 4.5717705734272375e-05, + "loss": 2.0522, + "step": 16136 + }, + { + "epoch": 1.0830173484111272, + "grad_norm": 3.4868221282958984, + "learning_rate": 4.570687746316595e-05, + "loss": 2.1319, + "step": 16138 + }, + { + "epoch": 1.083151572094896, + "grad_norm": 4.06925106048584, + "learning_rate": 4.569604939490113e-05, + "loss": 2.1136, + "step": 16140 + }, + { + "epoch": 1.0832857957786652, + "grad_norm": 5.180134296417236, + "learning_rate": 4.568522152998948e-05, + "loss": 2.2278, + "step": 16142 + }, + { + "epoch": 1.0834200194624342, + "grad_norm": 3.8926148414611816, + "learning_rate": 4.5674393868942666e-05, + "loss": 2.0402, + "step": 16144 + }, + { + "epoch": 1.083554243146203, + "grad_norm": 3.8267669677734375, + "learning_rate": 4.56635664122722e-05, + "loss": 2.0627, + "step": 16146 + }, + { + "epoch": 1.0836884668299722, + "grad_norm": 4.634647369384766, + "learning_rate": 4.5652739160489715e-05, + "loss": 2.2955, + "step": 16148 + }, + { + "epoch": 1.0838226905137411, + "grad_norm": 4.331087589263916, + "learning_rate": 4.564191211410675e-05, + "loss": 2.2665, + "step": 16150 + }, + { + "epoch": 1.08395691419751, + "grad_norm": 3.430710792541504, + "learning_rate": 4.563108527363487e-05, + "loss": 1.9029, + "step": 16152 + }, + { + "epoch": 1.0840911378812792, + "grad_norm": 3.612557888031006, + "learning_rate": 4.562025863958562e-05, + "loss": 2.0848, + "step": 16154 + }, + { + "epoch": 1.0842253615650481, + "grad_norm": 4.5212931632995605, + "learning_rate": 4.5609432212470536e-05, + "loss": 2.4318, + "step": 16156 + }, + { + "epoch": 1.084359585248817, + "grad_norm": 3.536191463470459, + "learning_rate": 4.5598605992801145e-05, + "loss": 2.0787, + "step": 16158 + }, + { + "epoch": 1.0844938089325862, + "grad_norm": 3.981351375579834, + "learning_rate": 4.5587779981088976e-05, + "loss": 2.0287, + "step": 16160 + }, + { + "epoch": 1.084628032616355, + "grad_norm": 4.238088607788086, + "learning_rate": 4.557695417784551e-05, + "loss": 1.901, + "step": 16162 + }, + { + "epoch": 1.0847622563001242, + "grad_norm": 4.1630988121032715, + "learning_rate": 4.5566128583582276e-05, + "loss": 1.9122, + "step": 16164 + }, + { + "epoch": 1.0848964799838932, + "grad_norm": 3.804776906967163, + "learning_rate": 4.5555303198810724e-05, + "loss": 1.8601, + "step": 16166 + }, + { + "epoch": 1.085030703667662, + "grad_norm": 4.377157688140869, + "learning_rate": 4.5544478024042384e-05, + "loss": 2.2534, + "step": 16168 + }, + { + "epoch": 1.0851649273514312, + "grad_norm": 5.318038463592529, + "learning_rate": 4.553365305978867e-05, + "loss": 2.269, + "step": 16170 + }, + { + "epoch": 1.0852991510352001, + "grad_norm": 3.6524598598480225, + "learning_rate": 4.5522828306561085e-05, + "loss": 1.9167, + "step": 16172 + }, + { + "epoch": 1.0854333747189693, + "grad_norm": 4.168848037719727, + "learning_rate": 4.551200376487105e-05, + "loss": 2.0301, + "step": 16174 + }, + { + "epoch": 1.0855675984027382, + "grad_norm": 4.386323928833008, + "learning_rate": 4.5501179435230024e-05, + "loss": 2.0699, + "step": 16176 + }, + { + "epoch": 1.0857018220865071, + "grad_norm": 3.952589273452759, + "learning_rate": 4.549035531814941e-05, + "loss": 2.0036, + "step": 16178 + }, + { + "epoch": 1.0858360457702763, + "grad_norm": 4.402552604675293, + "learning_rate": 4.5479531414140654e-05, + "loss": 2.2061, + "step": 16180 + }, + { + "epoch": 1.0859702694540452, + "grad_norm": 3.600811004638672, + "learning_rate": 4.5468707723715135e-05, + "loss": 1.988, + "step": 16182 + }, + { + "epoch": 1.086104493137814, + "grad_norm": 4.307168483734131, + "learning_rate": 4.545788424738427e-05, + "loss": 2.1912, + "step": 16184 + }, + { + "epoch": 1.0862387168215832, + "grad_norm": 3.225856304168701, + "learning_rate": 4.544706098565944e-05, + "loss": 2.0295, + "step": 16186 + }, + { + "epoch": 1.0863729405053522, + "grad_norm": 4.797672748565674, + "learning_rate": 4.543623793905204e-05, + "loss": 2.1673, + "step": 16188 + }, + { + "epoch": 1.086507164189121, + "grad_norm": 4.119539260864258, + "learning_rate": 4.54254151080734e-05, + "loss": 2.1452, + "step": 16190 + }, + { + "epoch": 1.0866413878728902, + "grad_norm": 3.9214460849761963, + "learning_rate": 4.541459249323492e-05, + "loss": 2.1588, + "step": 16192 + }, + { + "epoch": 1.0867756115566591, + "grad_norm": 3.9125235080718994, + "learning_rate": 4.54037700950479e-05, + "loss": 1.9408, + "step": 16194 + }, + { + "epoch": 1.0869098352404283, + "grad_norm": 4.25, + "learning_rate": 4.5392947914023745e-05, + "loss": 2.0832, + "step": 16196 + }, + { + "epoch": 1.0870440589241972, + "grad_norm": 3.47479510307312, + "learning_rate": 4.538212595067371e-05, + "loss": 1.833, + "step": 16198 + }, + { + "epoch": 1.0871782826079661, + "grad_norm": 3.7709293365478516, + "learning_rate": 4.5371304205509154e-05, + "loss": 1.839, + "step": 16200 + }, + { + "epoch": 1.0873125062917353, + "grad_norm": 4.504134178161621, + "learning_rate": 4.5360482679041375e-05, + "loss": 2.0771, + "step": 16202 + }, + { + "epoch": 1.0874467299755042, + "grad_norm": 3.962679624557495, + "learning_rate": 4.534966137178167e-05, + "loss": 2.0622, + "step": 16204 + }, + { + "epoch": 1.087580953659273, + "grad_norm": 4.1247239112854, + "learning_rate": 4.533884028424133e-05, + "loss": 2.2284, + "step": 16206 + }, + { + "epoch": 1.0877151773430422, + "grad_norm": 4.378789901733398, + "learning_rate": 4.5328019416931594e-05, + "loss": 2.012, + "step": 16208 + }, + { + "epoch": 1.0878494010268112, + "grad_norm": 3.6422479152679443, + "learning_rate": 4.531719877036377e-05, + "loss": 1.8382, + "step": 16210 + }, + { + "epoch": 1.08798362471058, + "grad_norm": 3.860154390335083, + "learning_rate": 4.530637834504908e-05, + "loss": 2.0289, + "step": 16212 + }, + { + "epoch": 1.0881178483943492, + "grad_norm": 22.060632705688477, + "learning_rate": 4.5295558141498804e-05, + "loss": 2.2805, + "step": 16214 + }, + { + "epoch": 1.0882520720781181, + "grad_norm": 3.9485065937042236, + "learning_rate": 4.528473816022414e-05, + "loss": 1.9486, + "step": 16216 + }, + { + "epoch": 1.0883862957618873, + "grad_norm": 3.8161160945892334, + "learning_rate": 4.5273918401736346e-05, + "loss": 1.847, + "step": 16218 + }, + { + "epoch": 1.0885205194456562, + "grad_norm": 4.2019124031066895, + "learning_rate": 4.5263098866546586e-05, + "loss": 2.2143, + "step": 16220 + }, + { + "epoch": 1.0886547431294251, + "grad_norm": 3.9935922622680664, + "learning_rate": 4.525227955516612e-05, + "loss": 2.1427, + "step": 16222 + }, + { + "epoch": 1.0887889668131943, + "grad_norm": 4.1661176681518555, + "learning_rate": 4.524146046810608e-05, + "loss": 2.4065, + "step": 16224 + }, + { + "epoch": 1.0889231904969632, + "grad_norm": 4.330832004547119, + "learning_rate": 4.523064160587769e-05, + "loss": 2.1527, + "step": 16226 + }, + { + "epoch": 1.089057414180732, + "grad_norm": 4.299709320068359, + "learning_rate": 4.5219822968992096e-05, + "loss": 2.0619, + "step": 16228 + }, + { + "epoch": 1.0891916378645012, + "grad_norm": 6.89337158203125, + "learning_rate": 4.520900455796049e-05, + "loss": 2.0469, + "step": 16230 + }, + { + "epoch": 1.0893258615482702, + "grad_norm": 3.8501763343811035, + "learning_rate": 4.5198186373293964e-05, + "loss": 1.8971, + "step": 16232 + }, + { + "epoch": 1.089460085232039, + "grad_norm": 4.7745771408081055, + "learning_rate": 4.518736841550373e-05, + "loss": 1.9636, + "step": 16234 + }, + { + "epoch": 1.0895943089158082, + "grad_norm": 4.014798641204834, + "learning_rate": 4.5176550685100834e-05, + "loss": 2.0927, + "step": 16236 + }, + { + "epoch": 1.0897285325995771, + "grad_norm": 3.9597039222717285, + "learning_rate": 4.516573318259646e-05, + "loss": 2.012, + "step": 16238 + }, + { + "epoch": 1.0898627562833463, + "grad_norm": 3.569164752960205, + "learning_rate": 4.515491590850167e-05, + "loss": 2.0848, + "step": 16240 + }, + { + "epoch": 1.0899969799671152, + "grad_norm": 4.26318883895874, + "learning_rate": 4.5144098863327586e-05, + "loss": 2.0882, + "step": 16242 + }, + { + "epoch": 1.0901312036508841, + "grad_norm": 3.86757755279541, + "learning_rate": 4.513328204758528e-05, + "loss": 2.1853, + "step": 16244 + }, + { + "epoch": 1.0902654273346533, + "grad_norm": 4.0429205894470215, + "learning_rate": 4.5122465461785833e-05, + "loss": 2.0861, + "step": 16246 + }, + { + "epoch": 1.0903996510184222, + "grad_norm": 4.285201549530029, + "learning_rate": 4.511164910644029e-05, + "loss": 2.2624, + "step": 16248 + }, + { + "epoch": 1.0905338747021913, + "grad_norm": 4.116120338439941, + "learning_rate": 4.5100832982059724e-05, + "loss": 2.0992, + "step": 16250 + }, + { + "epoch": 1.0906680983859602, + "grad_norm": 4.144741058349609, + "learning_rate": 4.509001708915516e-05, + "loss": 2.032, + "step": 16252 + }, + { + "epoch": 1.0908023220697292, + "grad_norm": 3.357253074645996, + "learning_rate": 4.507920142823764e-05, + "loss": 1.9481, + "step": 16254 + }, + { + "epoch": 1.0909365457534983, + "grad_norm": 3.4082210063934326, + "learning_rate": 4.506838599981816e-05, + "loss": 2.0617, + "step": 16256 + }, + { + "epoch": 1.0910707694372672, + "grad_norm": 4.099382400512695, + "learning_rate": 4.5057570804407774e-05, + "loss": 1.9906, + "step": 16258 + }, + { + "epoch": 1.0912049931210361, + "grad_norm": 4.178055763244629, + "learning_rate": 4.5046755842517415e-05, + "loss": 2.0688, + "step": 16260 + }, + { + "epoch": 1.0913392168048053, + "grad_norm": 4.000502109527588, + "learning_rate": 4.503594111465812e-05, + "loss": 1.8983, + "step": 16262 + }, + { + "epoch": 1.0914734404885742, + "grad_norm": 4.566054344177246, + "learning_rate": 4.5025126621340844e-05, + "loss": 2.0057, + "step": 16264 + }, + { + "epoch": 1.0916076641723431, + "grad_norm": 4.108896255493164, + "learning_rate": 4.501431236307655e-05, + "loss": 2.0584, + "step": 16266 + }, + { + "epoch": 1.0917418878561123, + "grad_norm": 4.420854091644287, + "learning_rate": 4.500349834037619e-05, + "loss": 2.3313, + "step": 16268 + }, + { + "epoch": 1.0918761115398812, + "grad_norm": 5.056260585784912, + "learning_rate": 4.499268455375073e-05, + "loss": 2.0857, + "step": 16270 + }, + { + "epoch": 1.0920103352236503, + "grad_norm": 3.8459439277648926, + "learning_rate": 4.498187100371105e-05, + "loss": 2.0539, + "step": 16272 + }, + { + "epoch": 1.0921445589074192, + "grad_norm": 3.474440336227417, + "learning_rate": 4.497105769076812e-05, + "loss": 1.9055, + "step": 16274 + }, + { + "epoch": 1.0922787825911882, + "grad_norm": 3.6422317028045654, + "learning_rate": 4.496024461543282e-05, + "loss": 2.0709, + "step": 16276 + }, + { + "epoch": 1.0924130062749573, + "grad_norm": 4.052451133728027, + "learning_rate": 4.494943177821604e-05, + "loss": 2.1539, + "step": 16278 + }, + { + "epoch": 1.0925472299587262, + "grad_norm": 4.379003047943115, + "learning_rate": 4.493861917962869e-05, + "loss": 2.0087, + "step": 16280 + }, + { + "epoch": 1.0926814536424951, + "grad_norm": 4.128014087677002, + "learning_rate": 4.492780682018162e-05, + "loss": 2.1387, + "step": 16282 + }, + { + "epoch": 1.0928156773262643, + "grad_norm": 3.5295164585113525, + "learning_rate": 4.4916994700385714e-05, + "loss": 1.915, + "step": 16284 + }, + { + "epoch": 1.0929499010100332, + "grad_norm": 3.970154285430908, + "learning_rate": 4.4906182820751796e-05, + "loss": 1.843, + "step": 16286 + }, + { + "epoch": 1.0930841246938021, + "grad_norm": 3.640730857849121, + "learning_rate": 4.489537118179074e-05, + "loss": 2.0482, + "step": 16288 + }, + { + "epoch": 1.0932183483775713, + "grad_norm": 4.726868629455566, + "learning_rate": 4.488455978401334e-05, + "loss": 2.1831, + "step": 16290 + }, + { + "epoch": 1.0933525720613402, + "grad_norm": 3.750082492828369, + "learning_rate": 4.4873748627930455e-05, + "loss": 2.3729, + "step": 16292 + }, + { + "epoch": 1.0934867957451093, + "grad_norm": 3.942878007888794, + "learning_rate": 4.4862937714052835e-05, + "loss": 2.2156, + "step": 16294 + }, + { + "epoch": 1.0936210194288782, + "grad_norm": 3.8557422161102295, + "learning_rate": 4.4852127042891354e-05, + "loss": 2.1956, + "step": 16296 + }, + { + "epoch": 1.0937552431126472, + "grad_norm": 3.4760513305664062, + "learning_rate": 4.48413166149567e-05, + "loss": 1.7741, + "step": 16298 + }, + { + "epoch": 1.0938894667964163, + "grad_norm": 3.478456735610962, + "learning_rate": 4.483050643075972e-05, + "loss": 1.7791, + "step": 16300 + }, + { + "epoch": 1.0940236904801852, + "grad_norm": 3.7245514392852783, + "learning_rate": 4.481969649081112e-05, + "loss": 2.0559, + "step": 16302 + }, + { + "epoch": 1.0941579141639541, + "grad_norm": 6.613199710845947, + "learning_rate": 4.4808886795621705e-05, + "loss": 2.2169, + "step": 16304 + }, + { + "epoch": 1.0942921378477233, + "grad_norm": 3.845038890838623, + "learning_rate": 4.479807734570216e-05, + "loss": 2.0821, + "step": 16306 + }, + { + "epoch": 1.0944263615314922, + "grad_norm": 3.4482052326202393, + "learning_rate": 4.478726814156325e-05, + "loss": 2.0525, + "step": 16308 + }, + { + "epoch": 1.0945605852152611, + "grad_norm": 3.4711790084838867, + "learning_rate": 4.4776459183715654e-05, + "loss": 1.8632, + "step": 16310 + }, + { + "epoch": 1.0946948088990303, + "grad_norm": 4.234772205352783, + "learning_rate": 4.4765650472670094e-05, + "loss": 1.928, + "step": 16312 + }, + { + "epoch": 1.0948290325827992, + "grad_norm": 3.9699790477752686, + "learning_rate": 4.475484200893725e-05, + "loss": 1.9813, + "step": 16314 + }, + { + "epoch": 1.0949632562665683, + "grad_norm": 4.101773262023926, + "learning_rate": 4.474403379302782e-05, + "loss": 2.0192, + "step": 16316 + }, + { + "epoch": 1.0950974799503372, + "grad_norm": 4.171555042266846, + "learning_rate": 4.473322582545244e-05, + "loss": 2.3452, + "step": 16318 + }, + { + "epoch": 1.0952317036341062, + "grad_norm": 4.194767475128174, + "learning_rate": 4.472241810672181e-05, + "loss": 2.2971, + "step": 16320 + }, + { + "epoch": 1.0953659273178753, + "grad_norm": 4.043852806091309, + "learning_rate": 4.471161063734652e-05, + "loss": 2.2202, + "step": 16322 + }, + { + "epoch": 1.0955001510016442, + "grad_norm": 4.1191864013671875, + "learning_rate": 4.470080341783726e-05, + "loss": 2.3918, + "step": 16324 + }, + { + "epoch": 1.0956343746854131, + "grad_norm": 4.666961669921875, + "learning_rate": 4.4689996448704574e-05, + "loss": 2.0336, + "step": 16326 + }, + { + "epoch": 1.0957685983691823, + "grad_norm": 3.5494906902313232, + "learning_rate": 4.4679189730459144e-05, + "loss": 2.2916, + "step": 16328 + }, + { + "epoch": 1.0959028220529512, + "grad_norm": 3.8350954055786133, + "learning_rate": 4.466838326361152e-05, + "loss": 2.0768, + "step": 16330 + }, + { + "epoch": 1.0960370457367203, + "grad_norm": 4.000762462615967, + "learning_rate": 4.4657577048672324e-05, + "loss": 2.1677, + "step": 16332 + }, + { + "epoch": 1.0961712694204893, + "grad_norm": 3.8005549907684326, + "learning_rate": 4.464677108615209e-05, + "loss": 1.9674, + "step": 16334 + }, + { + "epoch": 1.0963054931042582, + "grad_norm": 3.848801374435425, + "learning_rate": 4.463596537656141e-05, + "loss": 2.1065, + "step": 16336 + }, + { + "epoch": 1.0964397167880273, + "grad_norm": 4.347054481506348, + "learning_rate": 4.4625159920410804e-05, + "loss": 2.0455, + "step": 16338 + }, + { + "epoch": 1.0965739404717962, + "grad_norm": 3.91281795501709, + "learning_rate": 4.4614354718210835e-05, + "loss": 2.2434, + "step": 16340 + }, + { + "epoch": 1.0967081641555652, + "grad_norm": 4.575713634490967, + "learning_rate": 4.4603549770472e-05, + "loss": 2.1821, + "step": 16342 + }, + { + "epoch": 1.0968423878393343, + "grad_norm": 3.7771549224853516, + "learning_rate": 4.459274507770484e-05, + "loss": 2.0198, + "step": 16344 + }, + { + "epoch": 1.0969766115231032, + "grad_norm": 5.592887878417969, + "learning_rate": 4.4581940640419836e-05, + "loss": 1.9787, + "step": 16346 + }, + { + "epoch": 1.0971108352068724, + "grad_norm": 3.6779849529266357, + "learning_rate": 4.457113645912748e-05, + "loss": 2.0902, + "step": 16348 + }, + { + "epoch": 1.0972450588906413, + "grad_norm": 4.045385837554932, + "learning_rate": 4.456033253433826e-05, + "loss": 2.1187, + "step": 16350 + }, + { + "epoch": 1.0973792825744102, + "grad_norm": 3.7244064807891846, + "learning_rate": 4.4549528866562614e-05, + "loss": 2.2184, + "step": 16352 + }, + { + "epoch": 1.0975135062581793, + "grad_norm": 4.658431529998779, + "learning_rate": 4.453872545631104e-05, + "loss": 2.1181, + "step": 16354 + }, + { + "epoch": 1.0976477299419483, + "grad_norm": 4.378142833709717, + "learning_rate": 4.452792230409392e-05, + "loss": 2.3784, + "step": 16356 + }, + { + "epoch": 1.0977819536257172, + "grad_norm": 3.5649895668029785, + "learning_rate": 4.451711941042173e-05, + "loss": 1.9197, + "step": 16358 + }, + { + "epoch": 1.0979161773094863, + "grad_norm": 4.1912360191345215, + "learning_rate": 4.4506316775804844e-05, + "loss": 1.9712, + "step": 16360 + }, + { + "epoch": 1.0980504009932552, + "grad_norm": 3.9509899616241455, + "learning_rate": 4.449551440075371e-05, + "loss": 1.8693, + "step": 16362 + }, + { + "epoch": 1.0981846246770242, + "grad_norm": 3.8228883743286133, + "learning_rate": 4.448471228577868e-05, + "loss": 1.8263, + "step": 16364 + }, + { + "epoch": 1.0983188483607933, + "grad_norm": 3.971315622329712, + "learning_rate": 4.447391043139016e-05, + "loss": 1.9376, + "step": 16366 + }, + { + "epoch": 1.0984530720445622, + "grad_norm": 3.9491147994995117, + "learning_rate": 4.4463108838098504e-05, + "loss": 2.1462, + "step": 16368 + }, + { + "epoch": 1.0985872957283314, + "grad_norm": 4.625417232513428, + "learning_rate": 4.445230750641407e-05, + "loss": 2.1008, + "step": 16370 + }, + { + "epoch": 1.0987215194121003, + "grad_norm": 4.067205905914307, + "learning_rate": 4.4441506436847194e-05, + "loss": 2.4415, + "step": 16372 + }, + { + "epoch": 1.0988557430958692, + "grad_norm": 4.013747215270996, + "learning_rate": 4.443070562990821e-05, + "loss": 1.8879, + "step": 16374 + }, + { + "epoch": 1.0989899667796383, + "grad_norm": 3.913594961166382, + "learning_rate": 4.441990508610743e-05, + "loss": 2.1445, + "step": 16376 + }, + { + "epoch": 1.0991241904634073, + "grad_norm": 3.797631025314331, + "learning_rate": 4.440910480595517e-05, + "loss": 1.9551, + "step": 16378 + }, + { + "epoch": 1.0992584141471762, + "grad_norm": 4.257224082946777, + "learning_rate": 4.439830478996169e-05, + "loss": 1.935, + "step": 16380 + }, + { + "epoch": 1.0993926378309453, + "grad_norm": 3.8111395835876465, + "learning_rate": 4.438750503863733e-05, + "loss": 2.1427, + "step": 16382 + }, + { + "epoch": 1.0995268615147142, + "grad_norm": 4.064173221588135, + "learning_rate": 4.437670555249228e-05, + "loss": 2.2301, + "step": 16384 + }, + { + "epoch": 1.0996610851984832, + "grad_norm": 3.917475938796997, + "learning_rate": 4.436590633203688e-05, + "loss": 1.9877, + "step": 16386 + }, + { + "epoch": 1.0997953088822523, + "grad_norm": 3.6399834156036377, + "learning_rate": 4.4355107377781286e-05, + "loss": 2.2547, + "step": 16388 + }, + { + "epoch": 1.0999295325660212, + "grad_norm": 3.744269371032715, + "learning_rate": 4.434430869023579e-05, + "loss": 2.107, + "step": 16390 + }, + { + "epoch": 1.1000637562497904, + "grad_norm": 3.6787819862365723, + "learning_rate": 4.433351026991058e-05, + "loss": 2.0563, + "step": 16392 + }, + { + "epoch": 1.1001979799335593, + "grad_norm": 3.9946179389953613, + "learning_rate": 4.4322712117315874e-05, + "loss": 2.1133, + "step": 16394 + }, + { + "epoch": 1.1003322036173282, + "grad_norm": 3.3855764865875244, + "learning_rate": 4.4311914232961847e-05, + "loss": 1.762, + "step": 16396 + }, + { + "epoch": 1.1004664273010973, + "grad_norm": 3.9167473316192627, + "learning_rate": 4.4301116617358695e-05, + "loss": 1.8712, + "step": 16398 + }, + { + "epoch": 1.1006006509848663, + "grad_norm": 3.9625093936920166, + "learning_rate": 4.429031927101657e-05, + "loss": 2.0629, + "step": 16400 + }, + { + "epoch": 1.1007348746686352, + "grad_norm": 3.7112090587615967, + "learning_rate": 4.4279522194445644e-05, + "loss": 1.9624, + "step": 16402 + }, + { + "epoch": 1.1008690983524043, + "grad_norm": 4.842213153839111, + "learning_rate": 4.4268725388156026e-05, + "loss": 2.2747, + "step": 16404 + }, + { + "epoch": 1.1010033220361732, + "grad_norm": 4.336490631103516, + "learning_rate": 4.425792885265789e-05, + "loss": 2.1795, + "step": 16406 + }, + { + "epoch": 1.1011375457199424, + "grad_norm": 4.648943901062012, + "learning_rate": 4.4247132588461286e-05, + "loss": 2.2486, + "step": 16408 + }, + { + "epoch": 1.1012717694037113, + "grad_norm": 3.478628635406494, + "learning_rate": 4.423633659607639e-05, + "loss": 2.044, + "step": 16410 + }, + { + "epoch": 1.1014059930874802, + "grad_norm": 5.960132122039795, + "learning_rate": 4.422554087601325e-05, + "loss": 1.9382, + "step": 16412 + }, + { + "epoch": 1.1015402167712494, + "grad_norm": 4.245546817779541, + "learning_rate": 4.421474542878195e-05, + "loss": 2.283, + "step": 16414 + }, + { + "epoch": 1.1016744404550183, + "grad_norm": 4.410426616668701, + "learning_rate": 4.4203950254892545e-05, + "loss": 1.9113, + "step": 16416 + }, + { + "epoch": 1.1018086641387872, + "grad_norm": 4.082060813903809, + "learning_rate": 4.4193155354855074e-05, + "loss": 1.8849, + "step": 16418 + }, + { + "epoch": 1.1019428878225563, + "grad_norm": 4.019155979156494, + "learning_rate": 4.4182360729179625e-05, + "loss": 1.9073, + "step": 16420 + }, + { + "epoch": 1.1020771115063253, + "grad_norm": 4.328770160675049, + "learning_rate": 4.4171566378376166e-05, + "loss": 1.9164, + "step": 16422 + }, + { + "epoch": 1.1022113351900944, + "grad_norm": 3.9981138706207275, + "learning_rate": 4.416077230295475e-05, + "loss": 1.9721, + "step": 16424 + }, + { + "epoch": 1.1023455588738633, + "grad_norm": 4.624845504760742, + "learning_rate": 4.414997850342536e-05, + "loss": 2.1088, + "step": 16426 + }, + { + "epoch": 1.1024797825576322, + "grad_norm": 3.932976007461548, + "learning_rate": 4.4139184980297985e-05, + "loss": 1.9964, + "step": 16428 + }, + { + "epoch": 1.1026140062414014, + "grad_norm": 3.8753459453582764, + "learning_rate": 4.4128391734082586e-05, + "loss": 2.0035, + "step": 16430 + }, + { + "epoch": 1.1027482299251703, + "grad_norm": 4.000641345977783, + "learning_rate": 4.4117598765289145e-05, + "loss": 2.0515, + "step": 16432 + }, + { + "epoch": 1.1028824536089392, + "grad_norm": 3.8130247592926025, + "learning_rate": 4.410680607442758e-05, + "loss": 1.9899, + "step": 16434 + }, + { + "epoch": 1.1030166772927084, + "grad_norm": 4.402415752410889, + "learning_rate": 4.4096013662007844e-05, + "loss": 1.9906, + "step": 16436 + }, + { + "epoch": 1.1031509009764773, + "grad_norm": 3.550349235534668, + "learning_rate": 4.408522152853985e-05, + "loss": 2.206, + "step": 16438 + }, + { + "epoch": 1.1032851246602462, + "grad_norm": 3.966979742050171, + "learning_rate": 4.407442967453352e-05, + "loss": 2.119, + "step": 16440 + }, + { + "epoch": 1.1034193483440153, + "grad_norm": 4.233673095703125, + "learning_rate": 4.406363810049871e-05, + "loss": 2.2474, + "step": 16442 + }, + { + "epoch": 1.1035535720277843, + "grad_norm": 4.863281726837158, + "learning_rate": 4.405284680694536e-05, + "loss": 2.0703, + "step": 16444 + }, + { + "epoch": 1.1036877957115534, + "grad_norm": 3.605381727218628, + "learning_rate": 4.4042055794383277e-05, + "loss": 1.9798, + "step": 16446 + }, + { + "epoch": 1.1038220193953223, + "grad_norm": 4.316192626953125, + "learning_rate": 4.4031265063322364e-05, + "loss": 2.0975, + "step": 16448 + }, + { + "epoch": 1.1039562430790912, + "grad_norm": 4.835350036621094, + "learning_rate": 4.402047461427241e-05, + "loss": 2.0845, + "step": 16450 + }, + { + "epoch": 1.1040904667628604, + "grad_norm": 4.111530780792236, + "learning_rate": 4.40096844477433e-05, + "loss": 2.1104, + "step": 16452 + }, + { + "epoch": 1.1042246904466293, + "grad_norm": 3.495011568069458, + "learning_rate": 4.399889456424481e-05, + "loss": 1.7704, + "step": 16454 + }, + { + "epoch": 1.1043589141303982, + "grad_norm": 4.284898281097412, + "learning_rate": 4.398810496428675e-05, + "loss": 2.159, + "step": 16456 + }, + { + "epoch": 1.1044931378141674, + "grad_norm": 4.406925201416016, + "learning_rate": 4.397731564837891e-05, + "loss": 2.1659, + "step": 16458 + }, + { + "epoch": 1.1046273614979363, + "grad_norm": 4.501222610473633, + "learning_rate": 4.396652661703107e-05, + "loss": 2.2527, + "step": 16460 + }, + { + "epoch": 1.1047615851817052, + "grad_norm": 4.074746131896973, + "learning_rate": 4.395573787075297e-05, + "loss": 2.4979, + "step": 16462 + }, + { + "epoch": 1.1048958088654743, + "grad_norm": 4.859315395355225, + "learning_rate": 4.394494941005438e-05, + "loss": 2.2409, + "step": 16464 + }, + { + "epoch": 1.1050300325492433, + "grad_norm": 3.9316399097442627, + "learning_rate": 4.3934161235445015e-05, + "loss": 2.1489, + "step": 16466 + }, + { + "epoch": 1.1051642562330124, + "grad_norm": 3.738996982574463, + "learning_rate": 4.392337334743461e-05, + "loss": 1.7631, + "step": 16468 + }, + { + "epoch": 1.1052984799167813, + "grad_norm": 3.9726734161376953, + "learning_rate": 4.391258574653285e-05, + "loss": 1.869, + "step": 16470 + }, + { + "epoch": 1.1054327036005502, + "grad_norm": 3.611335277557373, + "learning_rate": 4.390179843324947e-05, + "loss": 1.9232, + "step": 16472 + }, + { + "epoch": 1.1055669272843194, + "grad_norm": 4.218809127807617, + "learning_rate": 4.3891011408094105e-05, + "loss": 2.0528, + "step": 16474 + }, + { + "epoch": 1.1057011509680883, + "grad_norm": 3.779365062713623, + "learning_rate": 4.3880224671576455e-05, + "loss": 1.9826, + "step": 16476 + }, + { + "epoch": 1.1058353746518572, + "grad_norm": 4.3851704597473145, + "learning_rate": 4.3869438224206126e-05, + "loss": 2.3478, + "step": 16478 + }, + { + "epoch": 1.1059695983356264, + "grad_norm": 4.468615531921387, + "learning_rate": 4.3858652066492814e-05, + "loss": 2.1217, + "step": 16480 + }, + { + "epoch": 1.1061038220193953, + "grad_norm": 3.836181402206421, + "learning_rate": 4.3847866198946116e-05, + "loss": 1.9398, + "step": 16482 + }, + { + "epoch": 1.1062380457031644, + "grad_norm": 4.168747425079346, + "learning_rate": 4.383708062207562e-05, + "loss": 2.1474, + "step": 16484 + }, + { + "epoch": 1.1063722693869333, + "grad_norm": 3.8097712993621826, + "learning_rate": 4.382629533639095e-05, + "loss": 2.0353, + "step": 16486 + }, + { + "epoch": 1.1065064930707023, + "grad_norm": 3.469424247741699, + "learning_rate": 4.381551034240169e-05, + "loss": 1.9128, + "step": 16488 + }, + { + "epoch": 1.1066407167544714, + "grad_norm": 4.2904372215271, + "learning_rate": 4.3804725640617414e-05, + "loss": 1.9872, + "step": 16490 + }, + { + "epoch": 1.1067749404382403, + "grad_norm": 3.8326869010925293, + "learning_rate": 4.379394123154766e-05, + "loss": 2.2904, + "step": 16492 + }, + { + "epoch": 1.1069091641220092, + "grad_norm": 3.7710776329040527, + "learning_rate": 4.3783157115701984e-05, + "loss": 2.2757, + "step": 16494 + }, + { + "epoch": 1.1070433878057784, + "grad_norm": 4.176615238189697, + "learning_rate": 4.3772373293589894e-05, + "loss": 2.0243, + "step": 16496 + }, + { + "epoch": 1.1071776114895473, + "grad_norm": 4.286485195159912, + "learning_rate": 4.376158976572094e-05, + "loss": 2.0318, + "step": 16498 + }, + { + "epoch": 1.1073118351733164, + "grad_norm": 4.190331935882568, + "learning_rate": 4.375080653260459e-05, + "loss": 1.9696, + "step": 16500 + }, + { + "epoch": 1.1074460588570854, + "grad_norm": 3.83244252204895, + "learning_rate": 4.374002359475036e-05, + "loss": 1.868, + "step": 16502 + }, + { + "epoch": 1.1075802825408543, + "grad_norm": 4.049759387969971, + "learning_rate": 4.3729240952667684e-05, + "loss": 1.9524, + "step": 16504 + }, + { + "epoch": 1.1077145062246234, + "grad_norm": 5.367188930511475, + "learning_rate": 4.3718458606866056e-05, + "loss": 1.9654, + "step": 16506 + }, + { + "epoch": 1.1078487299083923, + "grad_norm": 4.181414604187012, + "learning_rate": 4.370767655785488e-05, + "loss": 2.2743, + "step": 16508 + }, + { + "epoch": 1.1079829535921613, + "grad_norm": 4.492886543273926, + "learning_rate": 4.3696894806143653e-05, + "loss": 1.9294, + "step": 16510 + }, + { + "epoch": 1.1081171772759304, + "grad_norm": 3.7320139408111572, + "learning_rate": 4.368611335224172e-05, + "loss": 2.2214, + "step": 16512 + }, + { + "epoch": 1.1082514009596993, + "grad_norm": 4.763078212738037, + "learning_rate": 4.367533219665853e-05, + "loss": 2.2541, + "step": 16514 + }, + { + "epoch": 1.1083856246434682, + "grad_norm": 3.5161185264587402, + "learning_rate": 4.3664551339903454e-05, + "loss": 1.7515, + "step": 16516 + }, + { + "epoch": 1.1085198483272374, + "grad_norm": 3.911031484603882, + "learning_rate": 4.3653770782485876e-05, + "loss": 2.0808, + "step": 16518 + }, + { + "epoch": 1.1086540720110063, + "grad_norm": 4.575017929077148, + "learning_rate": 4.364299052491514e-05, + "loss": 1.9468, + "step": 16520 + }, + { + "epoch": 1.1087882956947754, + "grad_norm": 4.397642612457275, + "learning_rate": 4.36322105677006e-05, + "loss": 1.9507, + "step": 16522 + }, + { + "epoch": 1.1089225193785444, + "grad_norm": 4.071949005126953, + "learning_rate": 4.36214309113516e-05, + "loss": 1.9988, + "step": 16524 + }, + { + "epoch": 1.1090567430623133, + "grad_norm": 5.259698867797852, + "learning_rate": 4.361065155637744e-05, + "loss": 2.1689, + "step": 16526 + }, + { + "epoch": 1.1091909667460824, + "grad_norm": 4.076085090637207, + "learning_rate": 4.359987250328743e-05, + "loss": 2.1113, + "step": 16528 + }, + { + "epoch": 1.1093251904298513, + "grad_norm": 4.326290130615234, + "learning_rate": 4.358909375259086e-05, + "loss": 2.0211, + "step": 16530 + }, + { + "epoch": 1.1094594141136203, + "grad_norm": 3.914807081222534, + "learning_rate": 4.357831530479699e-05, + "loss": 1.9913, + "step": 16532 + }, + { + "epoch": 1.1095936377973894, + "grad_norm": 3.7238662242889404, + "learning_rate": 4.3567537160415125e-05, + "loss": 1.9089, + "step": 16534 + }, + { + "epoch": 1.1097278614811583, + "grad_norm": 3.8789045810699463, + "learning_rate": 4.355675931995444e-05, + "loss": 2.3844, + "step": 16536 + }, + { + "epoch": 1.1098620851649272, + "grad_norm": 4.381396770477295, + "learning_rate": 4.354598178392425e-05, + "loss": 2.1429, + "step": 16538 + }, + { + "epoch": 1.1099963088486964, + "grad_norm": 3.976555824279785, + "learning_rate": 4.353520455283369e-05, + "loss": 2.2739, + "step": 16540 + }, + { + "epoch": 1.1101305325324653, + "grad_norm": 3.722306728363037, + "learning_rate": 4.3524427627192024e-05, + "loss": 1.9279, + "step": 16542 + }, + { + "epoch": 1.1102647562162344, + "grad_norm": 3.6047775745391846, + "learning_rate": 4.3513651007508414e-05, + "loss": 1.9676, + "step": 16544 + }, + { + "epoch": 1.1103989799000034, + "grad_norm": 3.820068836212158, + "learning_rate": 4.3502874694292045e-05, + "loss": 2.0766, + "step": 16546 + }, + { + "epoch": 1.1105332035837723, + "grad_norm": 4.106356143951416, + "learning_rate": 4.3492098688052064e-05, + "loss": 1.9751, + "step": 16548 + }, + { + "epoch": 1.1106674272675414, + "grad_norm": 3.8885321617126465, + "learning_rate": 4.3481322989297626e-05, + "loss": 1.9978, + "step": 16550 + }, + { + "epoch": 1.1108016509513103, + "grad_norm": 3.728210926055908, + "learning_rate": 4.347054759853787e-05, + "loss": 1.8837, + "step": 16552 + }, + { + "epoch": 1.1109358746350793, + "grad_norm": 3.6546225547790527, + "learning_rate": 4.3459772516281886e-05, + "loss": 2.0598, + "step": 16554 + }, + { + "epoch": 1.1110700983188484, + "grad_norm": 4.089712619781494, + "learning_rate": 4.344899774303879e-05, + "loss": 1.738, + "step": 16556 + }, + { + "epoch": 1.1112043220026173, + "grad_norm": 3.8910715579986572, + "learning_rate": 4.343822327931767e-05, + "loss": 2.2006, + "step": 16558 + }, + { + "epoch": 1.1113385456863865, + "grad_norm": 4.18520450592041, + "learning_rate": 4.34274491256276e-05, + "loss": 2.0638, + "step": 16560 + }, + { + "epoch": 1.1114727693701554, + "grad_norm": 4.036118030548096, + "learning_rate": 4.3416675282477626e-05, + "loss": 2.013, + "step": 16562 + }, + { + "epoch": 1.1116069930539243, + "grad_norm": 3.9574904441833496, + "learning_rate": 4.340590175037681e-05, + "loss": 2.1072, + "step": 16564 + }, + { + "epoch": 1.1117412167376934, + "grad_norm": 3.906524896621704, + "learning_rate": 4.339512852983416e-05, + "loss": 2.0457, + "step": 16566 + }, + { + "epoch": 1.1118754404214624, + "grad_norm": 4.055656433105469, + "learning_rate": 4.3384355621358706e-05, + "loss": 1.8565, + "step": 16568 + }, + { + "epoch": 1.1120096641052313, + "grad_norm": 3.479572057723999, + "learning_rate": 4.3373583025459426e-05, + "loss": 2.0549, + "step": 16570 + }, + { + "epoch": 1.1121438877890004, + "grad_norm": 4.189981460571289, + "learning_rate": 4.3362810742645344e-05, + "loss": 2.2458, + "step": 16572 + }, + { + "epoch": 1.1122781114727693, + "grad_norm": 3.428611993789673, + "learning_rate": 4.3352038773425376e-05, + "loss": 1.8434, + "step": 16574 + }, + { + "epoch": 1.1124123351565385, + "grad_norm": 3.9249868392944336, + "learning_rate": 4.334126711830852e-05, + "loss": 1.8531, + "step": 16576 + }, + { + "epoch": 1.1125465588403074, + "grad_norm": 3.9113237857818604, + "learning_rate": 4.3330495777803685e-05, + "loss": 2.0481, + "step": 16578 + }, + { + "epoch": 1.1126807825240763, + "grad_norm": 3.856219530105591, + "learning_rate": 4.3319724752419825e-05, + "loss": 1.9207, + "step": 16580 + }, + { + "epoch": 1.1128150062078455, + "grad_norm": 4.2292280197143555, + "learning_rate": 4.330895404266583e-05, + "loss": 1.8697, + "step": 16582 + }, + { + "epoch": 1.1129492298916144, + "grad_norm": 5.9807515144348145, + "learning_rate": 4.32981836490506e-05, + "loss": 1.9765, + "step": 16584 + }, + { + "epoch": 1.1130834535753833, + "grad_norm": 4.729543209075928, + "learning_rate": 4.3287413572083e-05, + "loss": 2.001, + "step": 16586 + }, + { + "epoch": 1.1132176772591524, + "grad_norm": 5.590975284576416, + "learning_rate": 4.327664381227193e-05, + "loss": 2.24, + "step": 16588 + }, + { + "epoch": 1.1133519009429214, + "grad_norm": 3.7069525718688965, + "learning_rate": 4.32658743701262e-05, + "loss": 2.1761, + "step": 16590 + }, + { + "epoch": 1.1134861246266903, + "grad_norm": 3.945838212966919, + "learning_rate": 4.325510524615468e-05, + "loss": 1.8961, + "step": 16592 + }, + { + "epoch": 1.1136203483104594, + "grad_norm": 4.368412494659424, + "learning_rate": 4.324433644086617e-05, + "loss": 2.2105, + "step": 16594 + }, + { + "epoch": 1.1137545719942283, + "grad_norm": 5.354978084564209, + "learning_rate": 4.3233567954769475e-05, + "loss": 2.0446, + "step": 16596 + }, + { + "epoch": 1.1138887956779975, + "grad_norm": 4.028286457061768, + "learning_rate": 4.3222799788373374e-05, + "loss": 2.0221, + "step": 16598 + }, + { + "epoch": 1.1140230193617664, + "grad_norm": 4.329638957977295, + "learning_rate": 4.3212031942186694e-05, + "loss": 2.2989, + "step": 16600 + }, + { + "epoch": 1.1141572430455353, + "grad_norm": 3.5613839626312256, + "learning_rate": 4.320126441671811e-05, + "loss": 2.0317, + "step": 16602 + }, + { + "epoch": 1.1142914667293045, + "grad_norm": 3.8305747509002686, + "learning_rate": 4.3190497212476444e-05, + "loss": 1.9549, + "step": 16604 + }, + { + "epoch": 1.1144256904130734, + "grad_norm": 4.040568828582764, + "learning_rate": 4.317973032997038e-05, + "loss": 1.9734, + "step": 16606 + }, + { + "epoch": 1.1145599140968423, + "grad_norm": 4.2270917892456055, + "learning_rate": 4.316896376970866e-05, + "loss": 2.2482, + "step": 16608 + }, + { + "epoch": 1.1146941377806114, + "grad_norm": 3.8564345836639404, + "learning_rate": 4.3158197532199964e-05, + "loss": 1.9707, + "step": 16610 + }, + { + "epoch": 1.1148283614643804, + "grad_norm": 4.155436992645264, + "learning_rate": 4.314743161795298e-05, + "loss": 2.1585, + "step": 16612 + }, + { + "epoch": 1.1149625851481493, + "grad_norm": 3.8940186500549316, + "learning_rate": 4.313666602747638e-05, + "loss": 2.0976, + "step": 16614 + }, + { + "epoch": 1.1150968088319184, + "grad_norm": 3.891310453414917, + "learning_rate": 4.3125900761278817e-05, + "loss": 1.9568, + "step": 16616 + }, + { + "epoch": 1.1152310325156873, + "grad_norm": 3.646873950958252, + "learning_rate": 4.3115135819868925e-05, + "loss": 1.8941, + "step": 16618 + }, + { + "epoch": 1.1153652561994565, + "grad_norm": 3.9759597778320312, + "learning_rate": 4.3104371203755315e-05, + "loss": 1.8357, + "step": 16620 + }, + { + "epoch": 1.1154994798832254, + "grad_norm": 4.231314659118652, + "learning_rate": 4.309360691344662e-05, + "loss": 2.1101, + "step": 16622 + }, + { + "epoch": 1.1156337035669943, + "grad_norm": 4.109768390655518, + "learning_rate": 4.30828429494514e-05, + "loss": 1.9731, + "step": 16624 + }, + { + "epoch": 1.1157679272507635, + "grad_norm": 4.008238792419434, + "learning_rate": 4.3072079312278266e-05, + "loss": 2.1682, + "step": 16626 + }, + { + "epoch": 1.1159021509345324, + "grad_norm": 4.449732303619385, + "learning_rate": 4.306131600243575e-05, + "loss": 2.3373, + "step": 16628 + }, + { + "epoch": 1.1160363746183013, + "grad_norm": 4.461875915527344, + "learning_rate": 4.305055302043242e-05, + "loss": 2.1926, + "step": 16630 + }, + { + "epoch": 1.1161705983020704, + "grad_norm": 4.191856861114502, + "learning_rate": 4.303979036677677e-05, + "loss": 2.0346, + "step": 16632 + }, + { + "epoch": 1.1163048219858394, + "grad_norm": 3.5036747455596924, + "learning_rate": 4.302902804197737e-05, + "loss": 1.6813, + "step": 16634 + }, + { + "epoch": 1.1164390456696085, + "grad_norm": 3.9801554679870605, + "learning_rate": 4.301826604654266e-05, + "loss": 2.161, + "step": 16636 + }, + { + "epoch": 1.1165732693533774, + "grad_norm": 3.962695837020874, + "learning_rate": 4.3007504380981165e-05, + "loss": 2.2205, + "step": 16638 + }, + { + "epoch": 1.1167074930371463, + "grad_norm": 3.6572437286376953, + "learning_rate": 4.2996743045801334e-05, + "loss": 2.0011, + "step": 16640 + }, + { + "epoch": 1.1168417167209155, + "grad_norm": 3.800454616546631, + "learning_rate": 4.298598204151163e-05, + "loss": 2.2301, + "step": 16642 + }, + { + "epoch": 1.1169759404046844, + "grad_norm": 4.216679573059082, + "learning_rate": 4.297522136862047e-05, + "loss": 2.1406, + "step": 16644 + }, + { + "epoch": 1.1171101640884533, + "grad_norm": 3.894667148590088, + "learning_rate": 4.2964461027636305e-05, + "loss": 2.0786, + "step": 16646 + }, + { + "epoch": 1.1172443877722225, + "grad_norm": 3.9634909629821777, + "learning_rate": 4.29537010190675e-05, + "loss": 2.214, + "step": 16648 + }, + { + "epoch": 1.1173786114559914, + "grad_norm": 4.768835544586182, + "learning_rate": 4.2942941343422485e-05, + "loss": 2.0524, + "step": 16650 + }, + { + "epoch": 1.1175128351397605, + "grad_norm": 5.612378120422363, + "learning_rate": 4.2932182001209596e-05, + "loss": 2.2376, + "step": 16652 + }, + { + "epoch": 1.1176470588235294, + "grad_norm": 4.3104023933410645, + "learning_rate": 4.2921422992937224e-05, + "loss": 1.936, + "step": 16654 + }, + { + "epoch": 1.1177812825072984, + "grad_norm": 4.098492622375488, + "learning_rate": 4.291066431911369e-05, + "loss": 1.9973, + "step": 16656 + }, + { + "epoch": 1.1179155061910675, + "grad_norm": 3.9947609901428223, + "learning_rate": 4.2899905980247336e-05, + "loss": 2.0309, + "step": 16658 + }, + { + "epoch": 1.1180497298748364, + "grad_norm": 4.165680408477783, + "learning_rate": 4.2889147976846436e-05, + "loss": 2.2008, + "step": 16660 + }, + { + "epoch": 1.1181839535586053, + "grad_norm": 3.9686574935913086, + "learning_rate": 4.287839030941934e-05, + "loss": 2.0813, + "step": 16662 + }, + { + "epoch": 1.1183181772423745, + "grad_norm": 4.251199722290039, + "learning_rate": 4.286763297847428e-05, + "loss": 2.4575, + "step": 16664 + }, + { + "epoch": 1.1184524009261434, + "grad_norm": 4.328427314758301, + "learning_rate": 4.2856875984519545e-05, + "loss": 2.1179, + "step": 16666 + }, + { + "epoch": 1.1185866246099123, + "grad_norm": 3.858316659927368, + "learning_rate": 4.2846119328063363e-05, + "loss": 2.1455, + "step": 16668 + }, + { + "epoch": 1.1187208482936815, + "grad_norm": 4.434962749481201, + "learning_rate": 4.283536300961399e-05, + "loss": 2.3846, + "step": 16670 + }, + { + "epoch": 1.1188550719774504, + "grad_norm": 3.504774808883667, + "learning_rate": 4.282460702967962e-05, + "loss": 1.7363, + "step": 16672 + }, + { + "epoch": 1.1189892956612195, + "grad_norm": 3.8285796642303467, + "learning_rate": 4.2813851388768464e-05, + "loss": 2.0692, + "step": 16674 + }, + { + "epoch": 1.1191235193449884, + "grad_norm": 3.554232358932495, + "learning_rate": 4.2803096087388687e-05, + "loss": 2.0423, + "step": 16676 + }, + { + "epoch": 1.1192577430287574, + "grad_norm": 3.7297933101654053, + "learning_rate": 4.279234112604848e-05, + "loss": 2.0276, + "step": 16678 + }, + { + "epoch": 1.1193919667125265, + "grad_norm": 3.8941690921783447, + "learning_rate": 4.2781586505255965e-05, + "loss": 2.0139, + "step": 16680 + }, + { + "epoch": 1.1195261903962954, + "grad_norm": 4.294648170471191, + "learning_rate": 4.277083222551931e-05, + "loss": 2.2994, + "step": 16682 + }, + { + "epoch": 1.1196604140800643, + "grad_norm": 4.145288944244385, + "learning_rate": 4.276007828734661e-05, + "loss": 2.1725, + "step": 16684 + }, + { + "epoch": 1.1197946377638335, + "grad_norm": 3.917565107345581, + "learning_rate": 4.2749324691245977e-05, + "loss": 2.2807, + "step": 16686 + }, + { + "epoch": 1.1199288614476024, + "grad_norm": 3.7559027671813965, + "learning_rate": 4.27385714377255e-05, + "loss": 1.8576, + "step": 16688 + }, + { + "epoch": 1.1200630851313713, + "grad_norm": 4.408468246459961, + "learning_rate": 4.2727818527293226e-05, + "loss": 2.0116, + "step": 16690 + }, + { + "epoch": 1.1201973088151405, + "grad_norm": 4.042076587677002, + "learning_rate": 4.271706596045725e-05, + "loss": 2.1591, + "step": 16692 + }, + { + "epoch": 1.1203315324989094, + "grad_norm": 4.1378655433654785, + "learning_rate": 4.270631373772556e-05, + "loss": 1.9618, + "step": 16694 + }, + { + "epoch": 1.1204657561826785, + "grad_norm": 4.6970696449279785, + "learning_rate": 4.269556185960623e-05, + "loss": 2.0662, + "step": 16696 + }, + { + "epoch": 1.1205999798664474, + "grad_norm": 4.799225807189941, + "learning_rate": 4.2684810326607206e-05, + "loss": 2.1776, + "step": 16698 + }, + { + "epoch": 1.1207342035502164, + "grad_norm": 4.415023326873779, + "learning_rate": 4.267405913923654e-05, + "loss": 1.8723, + "step": 16700 + }, + { + "epoch": 1.1208684272339855, + "grad_norm": 4.45332145690918, + "learning_rate": 4.2663308298002146e-05, + "loss": 2.1918, + "step": 16702 + }, + { + "epoch": 1.1210026509177544, + "grad_norm": 3.728452205657959, + "learning_rate": 4.265255780341203e-05, + "loss": 1.9318, + "step": 16704 + }, + { + "epoch": 1.1211368746015233, + "grad_norm": 4.24226188659668, + "learning_rate": 4.2641807655974096e-05, + "loss": 2.4145, + "step": 16706 + }, + { + "epoch": 1.1212710982852925, + "grad_norm": 4.072190284729004, + "learning_rate": 4.26310578561963e-05, + "loss": 2.0638, + "step": 16708 + }, + { + "epoch": 1.1214053219690614, + "grad_norm": 4.718430042266846, + "learning_rate": 4.262030840458651e-05, + "loss": 2.5765, + "step": 16710 + }, + { + "epoch": 1.1215395456528305, + "grad_norm": 4.162775039672852, + "learning_rate": 4.260955930165265e-05, + "loss": 2.0125, + "step": 16712 + }, + { + "epoch": 1.1216737693365995, + "grad_norm": 3.9789397716522217, + "learning_rate": 4.259881054790257e-05, + "loss": 2.0423, + "step": 16714 + }, + { + "epoch": 1.1218079930203684, + "grad_norm": 4.0585737228393555, + "learning_rate": 4.258806214384415e-05, + "loss": 2.0279, + "step": 16716 + }, + { + "epoch": 1.1219422167041375, + "grad_norm": 5.51784086227417, + "learning_rate": 4.2577314089985204e-05, + "loss": 2.1619, + "step": 16718 + }, + { + "epoch": 1.1220764403879064, + "grad_norm": 5.074485778808594, + "learning_rate": 4.2566566386833584e-05, + "loss": 2.1455, + "step": 16720 + }, + { + "epoch": 1.1222106640716754, + "grad_norm": 7.221103191375732, + "learning_rate": 4.255581903489707e-05, + "loss": 2.1998, + "step": 16722 + }, + { + "epoch": 1.1223448877554445, + "grad_norm": 3.4538397789001465, + "learning_rate": 4.254507203468349e-05, + "loss": 1.8515, + "step": 16724 + }, + { + "epoch": 1.1224791114392134, + "grad_norm": 3.814380168914795, + "learning_rate": 4.253432538670057e-05, + "loss": 1.966, + "step": 16726 + }, + { + "epoch": 1.1226133351229826, + "grad_norm": 3.9717326164245605, + "learning_rate": 4.252357909145611e-05, + "loss": 2.0706, + "step": 16728 + }, + { + "epoch": 1.1227475588067515, + "grad_norm": 4.227086067199707, + "learning_rate": 4.251283314945783e-05, + "loss": 2.2245, + "step": 16730 + }, + { + "epoch": 1.1228817824905204, + "grad_norm": 4.046862602233887, + "learning_rate": 4.2502087561213475e-05, + "loss": 1.8814, + "step": 16732 + }, + { + "epoch": 1.1230160061742895, + "grad_norm": 5.919290542602539, + "learning_rate": 4.249134232723073e-05, + "loss": 2.0134, + "step": 16734 + }, + { + "epoch": 1.1231502298580585, + "grad_norm": 3.638331413269043, + "learning_rate": 4.2480597448017294e-05, + "loss": 1.9969, + "step": 16736 + }, + { + "epoch": 1.1232844535418274, + "grad_norm": 4.420238018035889, + "learning_rate": 4.246985292408084e-05, + "loss": 2.0742, + "step": 16738 + }, + { + "epoch": 1.1234186772255965, + "grad_norm": 3.702704429626465, + "learning_rate": 4.2459108755929034e-05, + "loss": 2.1076, + "step": 16740 + }, + { + "epoch": 1.1235529009093654, + "grad_norm": 4.254305362701416, + "learning_rate": 4.244836494406951e-05, + "loss": 1.9471, + "step": 16742 + }, + { + "epoch": 1.1236871245931344, + "grad_norm": 3.59605073928833, + "learning_rate": 4.2437621489009894e-05, + "loss": 2.1187, + "step": 16744 + }, + { + "epoch": 1.1238213482769035, + "grad_norm": 4.114795684814453, + "learning_rate": 4.242687839125778e-05, + "loss": 2.1798, + "step": 16746 + }, + { + "epoch": 1.1239555719606724, + "grad_norm": 3.965398073196411, + "learning_rate": 4.2416135651320785e-05, + "loss": 2.2856, + "step": 16748 + }, + { + "epoch": 1.1240897956444416, + "grad_norm": 3.825330972671509, + "learning_rate": 4.2405393269706444e-05, + "loss": 2.3544, + "step": 16750 + }, + { + "epoch": 1.1242240193282105, + "grad_norm": 3.835932970046997, + "learning_rate": 4.239465124692237e-05, + "loss": 1.7569, + "step": 16752 + }, + { + "epoch": 1.1243582430119794, + "grad_norm": 3.630664110183716, + "learning_rate": 4.238390958347604e-05, + "loss": 1.9996, + "step": 16754 + }, + { + "epoch": 1.1244924666957485, + "grad_norm": 3.7303407192230225, + "learning_rate": 4.237316827987502e-05, + "loss": 1.7689, + "step": 16756 + }, + { + "epoch": 1.1246266903795175, + "grad_norm": 3.144321918487549, + "learning_rate": 4.236242733662682e-05, + "loss": 2.0547, + "step": 16758 + }, + { + "epoch": 1.1247609140632864, + "grad_norm": 3.874851703643799, + "learning_rate": 4.235168675423887e-05, + "loss": 2.0177, + "step": 16760 + }, + { + "epoch": 1.1248951377470555, + "grad_norm": 3.815654754638672, + "learning_rate": 4.2340946533218725e-05, + "loss": 1.9861, + "step": 16762 + }, + { + "epoch": 1.1250293614308244, + "grad_norm": 4.610550403594971, + "learning_rate": 4.233020667407375e-05, + "loss": 2.1217, + "step": 16764 + }, + { + "epoch": 1.1251635851145934, + "grad_norm": 4.2167487144470215, + "learning_rate": 4.2319467177311463e-05, + "loss": 2.027, + "step": 16766 + }, + { + "epoch": 1.1252978087983625, + "grad_norm": 3.620274305343628, + "learning_rate": 4.2308728043439224e-05, + "loss": 1.7635, + "step": 16768 + }, + { + "epoch": 1.1254320324821314, + "grad_norm": 4.087381362915039, + "learning_rate": 4.2297989272964476e-05, + "loss": 2.0458, + "step": 16770 + }, + { + "epoch": 1.1255662561659006, + "grad_norm": 4.161133289337158, + "learning_rate": 4.228725086639458e-05, + "loss": 2.0812, + "step": 16772 + }, + { + "epoch": 1.1257004798496695, + "grad_norm": 3.963663339614868, + "learning_rate": 4.227651282423693e-05, + "loss": 2.0291, + "step": 16774 + }, + { + "epoch": 1.1258347035334384, + "grad_norm": 4.1072187423706055, + "learning_rate": 4.2265775146998844e-05, + "loss": 2.2175, + "step": 16776 + }, + { + "epoch": 1.1259689272172075, + "grad_norm": 4.698759078979492, + "learning_rate": 4.225503783518769e-05, + "loss": 2.2761, + "step": 16778 + }, + { + "epoch": 1.1261031509009765, + "grad_norm": 3.868648052215576, + "learning_rate": 4.224430088931075e-05, + "loss": 2.0158, + "step": 16780 + }, + { + "epoch": 1.1262373745847456, + "grad_norm": 4.00569486618042, + "learning_rate": 4.223356430987536e-05, + "loss": 2.3273, + "step": 16782 + }, + { + "epoch": 1.1263715982685145, + "grad_norm": 4.052145004272461, + "learning_rate": 4.222282809738875e-05, + "loss": 1.8499, + "step": 16784 + }, + { + "epoch": 1.1265058219522834, + "grad_norm": 4.949563026428223, + "learning_rate": 4.221209225235827e-05, + "loss": 2.0948, + "step": 16786 + }, + { + "epoch": 1.1266400456360526, + "grad_norm": 3.9840400218963623, + "learning_rate": 4.220135677529107e-05, + "loss": 1.9981, + "step": 16788 + }, + { + "epoch": 1.1267742693198215, + "grad_norm": 4.051647186279297, + "learning_rate": 4.219062166669447e-05, + "loss": 2.1292, + "step": 16790 + }, + { + "epoch": 1.1269084930035904, + "grad_norm": 4.057977199554443, + "learning_rate": 4.21798869270756e-05, + "loss": 2.064, + "step": 16792 + }, + { + "epoch": 1.1270427166873596, + "grad_norm": 3.8162310123443604, + "learning_rate": 4.216915255694172e-05, + "loss": 2.0319, + "step": 16794 + }, + { + "epoch": 1.1271769403711285, + "grad_norm": 4.510111331939697, + "learning_rate": 4.215841855679997e-05, + "loss": 2.0446, + "step": 16796 + }, + { + "epoch": 1.1273111640548974, + "grad_norm": 4.150903701782227, + "learning_rate": 4.214768492715754e-05, + "loss": 2.3406, + "step": 16798 + }, + { + "epoch": 1.1274453877386665, + "grad_norm": 4.205280303955078, + "learning_rate": 4.213695166852153e-05, + "loss": 2.154, + "step": 16800 + }, + { + "epoch": 1.1275796114224355, + "grad_norm": 4.056186676025391, + "learning_rate": 4.212621878139912e-05, + "loss": 2.4897, + "step": 16802 + }, + { + "epoch": 1.1277138351062046, + "grad_norm": 3.5722360610961914, + "learning_rate": 4.211548626629737e-05, + "loss": 2.0296, + "step": 16804 + }, + { + "epoch": 1.1278480587899735, + "grad_norm": 3.4033408164978027, + "learning_rate": 4.21047541237234e-05, + "loss": 2.0664, + "step": 16806 + }, + { + "epoch": 1.1279822824737424, + "grad_norm": 3.628054618835449, + "learning_rate": 4.2094022354184266e-05, + "loss": 1.8694, + "step": 16808 + }, + { + "epoch": 1.1281165061575116, + "grad_norm": 4.250136852264404, + "learning_rate": 4.208329095818704e-05, + "loss": 2.0696, + "step": 16810 + }, + { + "epoch": 1.1282507298412805, + "grad_norm": 3.786841869354248, + "learning_rate": 4.207255993623872e-05, + "loss": 2.1861, + "step": 16812 + }, + { + "epoch": 1.1283849535250494, + "grad_norm": 3.7504723072052, + "learning_rate": 4.206182928884639e-05, + "loss": 1.8749, + "step": 16814 + }, + { + "epoch": 1.1285191772088186, + "grad_norm": 3.8686182498931885, + "learning_rate": 4.205109901651699e-05, + "loss": 1.9976, + "step": 16816 + }, + { + "epoch": 1.1286534008925875, + "grad_norm": 3.9162709712982178, + "learning_rate": 4.204036911975755e-05, + "loss": 1.916, + "step": 16818 + }, + { + "epoch": 1.1287876245763564, + "grad_norm": 3.802671194076538, + "learning_rate": 4.2029639599075004e-05, + "loss": 2.153, + "step": 16820 + }, + { + "epoch": 1.1289218482601255, + "grad_norm": 4.273474216461182, + "learning_rate": 4.201891045497633e-05, + "loss": 2.2572, + "step": 16822 + }, + { + "epoch": 1.1290560719438945, + "grad_norm": 4.277017116546631, + "learning_rate": 4.200818168796844e-05, + "loss": 1.9852, + "step": 16824 + }, + { + "epoch": 1.1291902956276636, + "grad_norm": 3.3995885848999023, + "learning_rate": 4.199745329855823e-05, + "loss": 1.9643, + "step": 16826 + }, + { + "epoch": 1.1293245193114325, + "grad_norm": 5.581350803375244, + "learning_rate": 4.1986725287252634e-05, + "loss": 2.0039, + "step": 16828 + }, + { + "epoch": 1.1294587429952014, + "grad_norm": 4.130553245544434, + "learning_rate": 4.19759976545585e-05, + "loss": 2.1732, + "step": 16830 + }, + { + "epoch": 1.1295929666789706, + "grad_norm": 4.285623550415039, + "learning_rate": 4.19652704009827e-05, + "loss": 2.022, + "step": 16832 + }, + { + "epoch": 1.1297271903627395, + "grad_norm": 13.83343505859375, + "learning_rate": 4.1954543527032076e-05, + "loss": 1.5741, + "step": 16834 + }, + { + "epoch": 1.1298614140465084, + "grad_norm": 5.234622955322266, + "learning_rate": 4.194381703321346e-05, + "loss": 2.0917, + "step": 16836 + }, + { + "epoch": 1.1299956377302776, + "grad_norm": 3.8458309173583984, + "learning_rate": 4.193309092003363e-05, + "loss": 2.1463, + "step": 16838 + }, + { + "epoch": 1.1301298614140465, + "grad_norm": 4.250682830810547, + "learning_rate": 4.1922365187999404e-05, + "loss": 2.3843, + "step": 16840 + }, + { + "epoch": 1.1302640850978154, + "grad_norm": 3.9670250415802, + "learning_rate": 4.1911639837617526e-05, + "loss": 2.1138, + "step": 16842 + }, + { + "epoch": 1.1303983087815845, + "grad_norm": 4.229076862335205, + "learning_rate": 4.1900914869394775e-05, + "loss": 1.9494, + "step": 16844 + }, + { + "epoch": 1.1305325324653535, + "grad_norm": 4.22276496887207, + "learning_rate": 4.189019028383785e-05, + "loss": 1.8476, + "step": 16846 + }, + { + "epoch": 1.1306667561491226, + "grad_norm": 4.242138385772705, + "learning_rate": 4.187946608145351e-05, + "loss": 2.2882, + "step": 16848 + }, + { + "epoch": 1.1308009798328915, + "grad_norm": 4.0532050132751465, + "learning_rate": 4.18687422627484e-05, + "loss": 2.0467, + "step": 16850 + }, + { + "epoch": 1.1309352035166604, + "grad_norm": 4.391130447387695, + "learning_rate": 4.185801882822927e-05, + "loss": 2.088, + "step": 16852 + }, + { + "epoch": 1.1310694272004296, + "grad_norm": 4.3401618003845215, + "learning_rate": 4.184729577840271e-05, + "loss": 2.2665, + "step": 16854 + }, + { + "epoch": 1.1312036508841985, + "grad_norm": 3.9434096813201904, + "learning_rate": 4.18365731137754e-05, + "loss": 2.033, + "step": 16856 + }, + { + "epoch": 1.1313378745679676, + "grad_norm": 6.322754859924316, + "learning_rate": 4.1825850834853954e-05, + "loss": 1.9051, + "step": 16858 + }, + { + "epoch": 1.1314720982517366, + "grad_norm": 4.150980472564697, + "learning_rate": 4.181512894214499e-05, + "loss": 2.0491, + "step": 16860 + }, + { + "epoch": 1.1316063219355055, + "grad_norm": 4.417309761047363, + "learning_rate": 4.180440743615508e-05, + "loss": 2.1864, + "step": 16862 + }, + { + "epoch": 1.1317405456192744, + "grad_norm": 3.8537983894348145, + "learning_rate": 4.1793686317390815e-05, + "loss": 1.9371, + "step": 16864 + }, + { + "epoch": 1.1318747693030435, + "grad_norm": 3.874706268310547, + "learning_rate": 4.178296558635873e-05, + "loss": 1.8387, + "step": 16866 + }, + { + "epoch": 1.1320089929868125, + "grad_norm": 4.053635597229004, + "learning_rate": 4.1772245243565364e-05, + "loss": 2.0754, + "step": 16868 + }, + { + "epoch": 1.1321432166705816, + "grad_norm": 3.758985996246338, + "learning_rate": 4.1761525289517215e-05, + "loss": 2.0237, + "step": 16870 + }, + { + "epoch": 1.1322774403543505, + "grad_norm": 3.846487283706665, + "learning_rate": 4.175080572472082e-05, + "loss": 2.3355, + "step": 16872 + }, + { + "epoch": 1.1324116640381194, + "grad_norm": 4.2072930335998535, + "learning_rate": 4.1740086549682606e-05, + "loss": 2.0029, + "step": 16874 + }, + { + "epoch": 1.1325458877218886, + "grad_norm": 4.008121490478516, + "learning_rate": 4.172936776490909e-05, + "loss": 2.2456, + "step": 16876 + }, + { + "epoch": 1.1326801114056575, + "grad_norm": 3.8702473640441895, + "learning_rate": 4.171864937090665e-05, + "loss": 1.801, + "step": 16878 + }, + { + "epoch": 1.1328143350894266, + "grad_norm": 4.775311470031738, + "learning_rate": 4.170793136818178e-05, + "loss": 2.1072, + "step": 16880 + }, + { + "epoch": 1.1329485587731956, + "grad_norm": 4.0256571769714355, + "learning_rate": 4.1697213757240814e-05, + "loss": 1.904, + "step": 16882 + }, + { + "epoch": 1.1330827824569645, + "grad_norm": 3.9696686267852783, + "learning_rate": 4.168649653859019e-05, + "loss": 1.9352, + "step": 16884 + }, + { + "epoch": 1.1332170061407336, + "grad_norm": 4.258273601531982, + "learning_rate": 4.1675779712736254e-05, + "loss": 2.0722, + "step": 16886 + }, + { + "epoch": 1.1333512298245025, + "grad_norm": 4.347936153411865, + "learning_rate": 4.1665063280185354e-05, + "loss": 2.0778, + "step": 16888 + }, + { + "epoch": 1.1334854535082715, + "grad_norm": 4.19441556930542, + "learning_rate": 4.165434724144383e-05, + "loss": 1.9712, + "step": 16890 + }, + { + "epoch": 1.1336196771920406, + "grad_norm": 4.101309776306152, + "learning_rate": 4.1643631597017987e-05, + "loss": 2.055, + "step": 16892 + }, + { + "epoch": 1.1337539008758095, + "grad_norm": 4.2385945320129395, + "learning_rate": 4.1632916347414124e-05, + "loss": 1.9943, + "step": 16894 + }, + { + "epoch": 1.1338881245595784, + "grad_norm": 4.803402423858643, + "learning_rate": 4.1622201493138495e-05, + "loss": 1.9879, + "step": 16896 + }, + { + "epoch": 1.1340223482433476, + "grad_norm": 4.139742851257324, + "learning_rate": 4.161148703469739e-05, + "loss": 2.2447, + "step": 16898 + }, + { + "epoch": 1.1341565719271165, + "grad_norm": 3.947314739227295, + "learning_rate": 4.160077297259701e-05, + "loss": 1.9165, + "step": 16900 + }, + { + "epoch": 1.1342907956108856, + "grad_norm": 3.960298538208008, + "learning_rate": 4.1590059307343596e-05, + "loss": 2.1395, + "step": 16902 + }, + { + "epoch": 1.1344250192946546, + "grad_norm": 4.163427352905273, + "learning_rate": 4.1579346039443346e-05, + "loss": 1.8737, + "step": 16904 + }, + { + "epoch": 1.1345592429784235, + "grad_norm": 4.058857440948486, + "learning_rate": 4.156863316940244e-05, + "loss": 2.1333, + "step": 16906 + }, + { + "epoch": 1.1346934666621926, + "grad_norm": 4.088848114013672, + "learning_rate": 4.155792069772702e-05, + "loss": 2.2287, + "step": 16908 + }, + { + "epoch": 1.1348276903459615, + "grad_norm": 4.136159896850586, + "learning_rate": 4.1547208624923266e-05, + "loss": 2.1585, + "step": 16910 + }, + { + "epoch": 1.1349619140297305, + "grad_norm": 4.082432746887207, + "learning_rate": 4.153649695149726e-05, + "loss": 2.0282, + "step": 16912 + }, + { + "epoch": 1.1350961377134996, + "grad_norm": 4.07998514175415, + "learning_rate": 4.152578567795516e-05, + "loss": 2.1022, + "step": 16914 + }, + { + "epoch": 1.1352303613972685, + "grad_norm": 3.5843427181243896, + "learning_rate": 4.1515074804802995e-05, + "loss": 2.1641, + "step": 16916 + }, + { + "epoch": 1.1353645850810374, + "grad_norm": 4.00724458694458, + "learning_rate": 4.150436433254688e-05, + "loss": 1.9135, + "step": 16918 + }, + { + "epoch": 1.1354988087648066, + "grad_norm": 3.861314058303833, + "learning_rate": 4.1493654261692834e-05, + "loss": 1.9619, + "step": 16920 + }, + { + "epoch": 1.1356330324485755, + "grad_norm": 4.073401927947998, + "learning_rate": 4.148294459274691e-05, + "loss": 1.9535, + "step": 16922 + }, + { + "epoch": 1.1357672561323446, + "grad_norm": 4.078757286071777, + "learning_rate": 4.147223532621508e-05, + "loss": 1.9582, + "step": 16924 + }, + { + "epoch": 1.1359014798161136, + "grad_norm": 4.045374870300293, + "learning_rate": 4.146152646260339e-05, + "loss": 2.1009, + "step": 16926 + }, + { + "epoch": 1.1360357034998825, + "grad_norm": 4.657671928405762, + "learning_rate": 4.1450818002417766e-05, + "loss": 2.3121, + "step": 16928 + }, + { + "epoch": 1.1361699271836516, + "grad_norm": 4.17203950881958, + "learning_rate": 4.1440109946164196e-05, + "loss": 1.9728, + "step": 16930 + }, + { + "epoch": 1.1363041508674205, + "grad_norm": 3.503247022628784, + "learning_rate": 4.142940229434858e-05, + "loss": 1.8267, + "step": 16932 + }, + { + "epoch": 1.1364383745511897, + "grad_norm": 3.798213005065918, + "learning_rate": 4.141869504747687e-05, + "loss": 2.0642, + "step": 16934 + }, + { + "epoch": 1.1365725982349586, + "grad_norm": 4.006547927856445, + "learning_rate": 4.140798820605493e-05, + "loss": 1.96, + "step": 16936 + }, + { + "epoch": 1.1367068219187275, + "grad_norm": 3.866708755493164, + "learning_rate": 4.139728177058867e-05, + "loss": 2.0426, + "step": 16938 + }, + { + "epoch": 1.1368410456024964, + "grad_norm": 4.915768623352051, + "learning_rate": 4.1386575741583904e-05, + "loss": 2.1847, + "step": 16940 + }, + { + "epoch": 1.1369752692862656, + "grad_norm": 4.491135597229004, + "learning_rate": 4.1375870119546526e-05, + "loss": 2.3681, + "step": 16942 + }, + { + "epoch": 1.1371094929700345, + "grad_norm": 3.9626123905181885, + "learning_rate": 4.1365164904982296e-05, + "loss": 2.1071, + "step": 16944 + }, + { + "epoch": 1.1372437166538036, + "grad_norm": 3.6313588619232178, + "learning_rate": 4.135446009839707e-05, + "loss": 2.0279, + "step": 16946 + }, + { + "epoch": 1.1373779403375726, + "grad_norm": 3.8355252742767334, + "learning_rate": 4.1343755700296584e-05, + "loss": 2.1876, + "step": 16948 + }, + { + "epoch": 1.1375121640213415, + "grad_norm": 4.079477787017822, + "learning_rate": 4.1333051711186635e-05, + "loss": 2.2645, + "step": 16950 + }, + { + "epoch": 1.1376463877051106, + "grad_norm": 4.018315315246582, + "learning_rate": 4.132234813157294e-05, + "loss": 2.0649, + "step": 16952 + }, + { + "epoch": 1.1377806113888795, + "grad_norm": 3.8483216762542725, + "learning_rate": 4.1311644961961244e-05, + "loss": 2.086, + "step": 16954 + }, + { + "epoch": 1.1379148350726487, + "grad_norm": 4.301693916320801, + "learning_rate": 4.1300942202857226e-05, + "loss": 2.0813, + "step": 16956 + }, + { + "epoch": 1.1380490587564176, + "grad_norm": 4.101228713989258, + "learning_rate": 4.129023985476659e-05, + "loss": 2.3089, + "step": 16958 + }, + { + "epoch": 1.1381832824401865, + "grad_norm": 4.322676658630371, + "learning_rate": 4.127953791819499e-05, + "loss": 1.9706, + "step": 16960 + }, + { + "epoch": 1.1383175061239557, + "grad_norm": 4.090095043182373, + "learning_rate": 4.126883639364808e-05, + "loss": 1.9641, + "step": 16962 + }, + { + "epoch": 1.1384517298077246, + "grad_norm": 4.214702606201172, + "learning_rate": 4.125813528163148e-05, + "loss": 2.163, + "step": 16964 + }, + { + "epoch": 1.1385859534914935, + "grad_norm": 3.808501720428467, + "learning_rate": 4.124743458265079e-05, + "loss": 1.9469, + "step": 16966 + }, + { + "epoch": 1.1387201771752626, + "grad_norm": 3.6474597454071045, + "learning_rate": 4.123673429721162e-05, + "loss": 2.1282, + "step": 16968 + }, + { + "epoch": 1.1388544008590316, + "grad_norm": 4.33784818649292, + "learning_rate": 4.12260344258195e-05, + "loss": 2.4113, + "step": 16970 + }, + { + "epoch": 1.1389886245428005, + "grad_norm": 4.185258865356445, + "learning_rate": 4.121533496898002e-05, + "loss": 1.9519, + "step": 16972 + }, + { + "epoch": 1.1391228482265696, + "grad_norm": 4.2361555099487305, + "learning_rate": 4.120463592719867e-05, + "loss": 1.9051, + "step": 16974 + }, + { + "epoch": 1.1392570719103385, + "grad_norm": 4.316851615905762, + "learning_rate": 4.119393730098101e-05, + "loss": 2.1132, + "step": 16976 + }, + { + "epoch": 1.1393912955941077, + "grad_norm": 3.943042039871216, + "learning_rate": 4.1183239090832455e-05, + "loss": 2.1181, + "step": 16978 + }, + { + "epoch": 1.1395255192778766, + "grad_norm": 3.855246067047119, + "learning_rate": 4.117254129725854e-05, + "loss": 2.028, + "step": 16980 + }, + { + "epoch": 1.1396597429616455, + "grad_norm": 4.195390701293945, + "learning_rate": 4.116184392076468e-05, + "loss": 2.2362, + "step": 16982 + }, + { + "epoch": 1.1397939666454147, + "grad_norm": 4.382492542266846, + "learning_rate": 4.1151146961856326e-05, + "loss": 2.0882, + "step": 16984 + }, + { + "epoch": 1.1399281903291836, + "grad_norm": 9.980362892150879, + "learning_rate": 4.114045042103887e-05, + "loss": 1.9755, + "step": 16986 + }, + { + "epoch": 1.1400624140129525, + "grad_norm": 3.8790841102600098, + "learning_rate": 4.1129754298817714e-05, + "loss": 2.1491, + "step": 16988 + }, + { + "epoch": 1.1401966376967216, + "grad_norm": 3.759721279144287, + "learning_rate": 4.1119058595698217e-05, + "loss": 2.0129, + "step": 16990 + }, + { + "epoch": 1.1403308613804906, + "grad_norm": 4.279345512390137, + "learning_rate": 4.110836331218575e-05, + "loss": 2.0221, + "step": 16992 + }, + { + "epoch": 1.1404650850642595, + "grad_norm": 4.2544050216674805, + "learning_rate": 4.109766844878562e-05, + "loss": 2.3435, + "step": 16994 + }, + { + "epoch": 1.1405993087480286, + "grad_norm": 3.9759767055511475, + "learning_rate": 4.108697400600316e-05, + "loss": 1.7245, + "step": 16996 + }, + { + "epoch": 1.1407335324317975, + "grad_norm": 4.15731954574585, + "learning_rate": 4.1076279984343636e-05, + "loss": 1.9888, + "step": 16998 + }, + { + "epoch": 1.1408677561155667, + "grad_norm": 4.178341865539551, + "learning_rate": 4.106558638431236e-05, + "loss": 2.0183, + "step": 17000 + }, + { + "epoch": 1.1410019797993356, + "grad_norm": 5.006940841674805, + "learning_rate": 4.105489320641452e-05, + "loss": 2.2863, + "step": 17002 + }, + { + "epoch": 1.1411362034831045, + "grad_norm": 4.335559844970703, + "learning_rate": 4.104420045115543e-05, + "loss": 2.2606, + "step": 17004 + }, + { + "epoch": 1.1412704271668737, + "grad_norm": 3.8325324058532715, + "learning_rate": 4.103350811904022e-05, + "loss": 1.7936, + "step": 17006 + }, + { + "epoch": 1.1414046508506426, + "grad_norm": 3.8393518924713135, + "learning_rate": 4.102281621057415e-05, + "loss": 1.9553, + "step": 17008 + }, + { + "epoch": 1.1415388745344117, + "grad_norm": 4.253771781921387, + "learning_rate": 4.1012124726262345e-05, + "loss": 2.0411, + "step": 17010 + }, + { + "epoch": 1.1416730982181806, + "grad_norm": 4.1371965408325195, + "learning_rate": 4.100143366660998e-05, + "loss": 2.1761, + "step": 17012 + }, + { + "epoch": 1.1418073219019496, + "grad_norm": 4.131717205047607, + "learning_rate": 4.099074303212218e-05, + "loss": 2.0679, + "step": 17014 + }, + { + "epoch": 1.1419415455857185, + "grad_norm": 3.862807512283325, + "learning_rate": 4.098005282330406e-05, + "loss": 2.429, + "step": 17016 + }, + { + "epoch": 1.1420757692694876, + "grad_norm": 4.240258693695068, + "learning_rate": 4.0969363040660685e-05, + "loss": 2.1056, + "step": 17018 + }, + { + "epoch": 1.1422099929532565, + "grad_norm": 3.7979557514190674, + "learning_rate": 4.095867368469717e-05, + "loss": 2.1951, + "step": 17020 + }, + { + "epoch": 1.1423442166370257, + "grad_norm": 4.103130340576172, + "learning_rate": 4.094798475591853e-05, + "loss": 2.0486, + "step": 17022 + }, + { + "epoch": 1.1424784403207946, + "grad_norm": 3.9921844005584717, + "learning_rate": 4.093729625482983e-05, + "loss": 2.0756, + "step": 17024 + }, + { + "epoch": 1.1426126640045635, + "grad_norm": 3.7096407413482666, + "learning_rate": 4.092660818193603e-05, + "loss": 2.057, + "step": 17026 + }, + { + "epoch": 1.1427468876883327, + "grad_norm": 4.064582824707031, + "learning_rate": 4.091592053774219e-05, + "loss": 1.8255, + "step": 17028 + }, + { + "epoch": 1.1428811113721016, + "grad_norm": 4.528830528259277, + "learning_rate": 4.0905233322753224e-05, + "loss": 2.3291, + "step": 17030 + }, + { + "epoch": 1.1430153350558707, + "grad_norm": 4.45618200302124, + "learning_rate": 4.089454653747409e-05, + "loss": 2.0758, + "step": 17032 + }, + { + "epoch": 1.1431495587396396, + "grad_norm": 3.847686529159546, + "learning_rate": 4.088386018240974e-05, + "loss": 2.0073, + "step": 17034 + }, + { + "epoch": 1.1432837824234086, + "grad_norm": 4.453803539276123, + "learning_rate": 4.087317425806504e-05, + "loss": 2.1696, + "step": 17036 + }, + { + "epoch": 1.1434180061071777, + "grad_norm": 5.143105506896973, + "learning_rate": 4.0862488764944956e-05, + "loss": 2.1113, + "step": 17038 + }, + { + "epoch": 1.1435522297909466, + "grad_norm": 4.019973278045654, + "learning_rate": 4.085180370355427e-05, + "loss": 2.1349, + "step": 17040 + }, + { + "epoch": 1.1436864534747155, + "grad_norm": 4.524601936340332, + "learning_rate": 4.0841119074397895e-05, + "loss": 2.3447, + "step": 17042 + }, + { + "epoch": 1.1438206771584847, + "grad_norm": 3.8049421310424805, + "learning_rate": 4.083043487798062e-05, + "loss": 2.0377, + "step": 17044 + }, + { + "epoch": 1.1439549008422536, + "grad_norm": 3.7771623134613037, + "learning_rate": 4.081975111480728e-05, + "loss": 2.1989, + "step": 17046 + }, + { + "epoch": 1.1440891245260225, + "grad_norm": 3.9517111778259277, + "learning_rate": 4.080906778538263e-05, + "loss": 1.9469, + "step": 17048 + }, + { + "epoch": 1.1442233482097917, + "grad_norm": 4.090948581695557, + "learning_rate": 4.079838489021146e-05, + "loss": 2.0088, + "step": 17050 + }, + { + "epoch": 1.1443575718935606, + "grad_norm": 3.951122522354126, + "learning_rate": 4.078770242979851e-05, + "loss": 2.2738, + "step": 17052 + }, + { + "epoch": 1.1444917955773297, + "grad_norm": 3.675631046295166, + "learning_rate": 4.077702040464851e-05, + "loss": 1.9729, + "step": 17054 + }, + { + "epoch": 1.1446260192610986, + "grad_norm": 4.05651330947876, + "learning_rate": 4.0766338815266156e-05, + "loss": 1.8737, + "step": 17056 + }, + { + "epoch": 1.1447602429448676, + "grad_norm": 7.78154993057251, + "learning_rate": 4.0755657662156144e-05, + "loss": 1.9368, + "step": 17058 + }, + { + "epoch": 1.1448944666286367, + "grad_norm": 3.416621446609497, + "learning_rate": 4.0744976945823116e-05, + "loss": 2.0143, + "step": 17060 + }, + { + "epoch": 1.1450286903124056, + "grad_norm": 4.106746673583984, + "learning_rate": 4.073429666677174e-05, + "loss": 2.0351, + "step": 17062 + }, + { + "epoch": 1.1451629139961745, + "grad_norm": 4.134082317352295, + "learning_rate": 4.07236168255066e-05, + "loss": 2.0738, + "step": 17064 + }, + { + "epoch": 1.1452971376799437, + "grad_norm": 4.089905738830566, + "learning_rate": 4.0712937422532366e-05, + "loss": 1.8935, + "step": 17066 + }, + { + "epoch": 1.1454313613637126, + "grad_norm": 4.363068103790283, + "learning_rate": 4.070225845835354e-05, + "loss": 2.2391, + "step": 17068 + }, + { + "epoch": 1.1455655850474815, + "grad_norm": 3.5306975841522217, + "learning_rate": 4.069157993347474e-05, + "loss": 1.8949, + "step": 17070 + }, + { + "epoch": 1.1456998087312507, + "grad_norm": 4.1655402183532715, + "learning_rate": 4.068090184840047e-05, + "loss": 1.9249, + "step": 17072 + }, + { + "epoch": 1.1458340324150196, + "grad_norm": 4.03021240234375, + "learning_rate": 4.0670224203635274e-05, + "loss": 2.1069, + "step": 17074 + }, + { + "epoch": 1.1459682560987887, + "grad_norm": 4.010472297668457, + "learning_rate": 4.065954699968363e-05, + "loss": 1.8714, + "step": 17076 + }, + { + "epoch": 1.1461024797825576, + "grad_norm": 3.6152827739715576, + "learning_rate": 4.064887023705003e-05, + "loss": 1.864, + "step": 17078 + }, + { + "epoch": 1.1462367034663266, + "grad_norm": 3.9460878372192383, + "learning_rate": 4.0638193916238914e-05, + "loss": 2.0098, + "step": 17080 + }, + { + "epoch": 1.1463709271500957, + "grad_norm": 3.4690840244293213, + "learning_rate": 4.0627518037754745e-05, + "loss": 1.8325, + "step": 17082 + }, + { + "epoch": 1.1465051508338646, + "grad_norm": 3.548017740249634, + "learning_rate": 4.0616842602101894e-05, + "loss": 1.7702, + "step": 17084 + }, + { + "epoch": 1.1466393745176338, + "grad_norm": 3.9021682739257812, + "learning_rate": 4.060616760978479e-05, + "loss": 1.9723, + "step": 17086 + }, + { + "epoch": 1.1467735982014027, + "grad_norm": 3.7625186443328857, + "learning_rate": 4.05954930613078e-05, + "loss": 2.0167, + "step": 17088 + }, + { + "epoch": 1.1469078218851716, + "grad_norm": 3.2512266635894775, + "learning_rate": 4.058481895717527e-05, + "loss": 2.1311, + "step": 17090 + }, + { + "epoch": 1.1470420455689405, + "grad_norm": 3.6818621158599854, + "learning_rate": 4.057414529789151e-05, + "loss": 1.9452, + "step": 17092 + }, + { + "epoch": 1.1471762692527097, + "grad_norm": 3.6999669075012207, + "learning_rate": 4.056347208396088e-05, + "loss": 2.1374, + "step": 17094 + }, + { + "epoch": 1.1473104929364786, + "grad_norm": 3.8013625144958496, + "learning_rate": 4.05527993158876e-05, + "loss": 1.7983, + "step": 17096 + }, + { + "epoch": 1.1474447166202477, + "grad_norm": 4.312994480133057, + "learning_rate": 4.0542126994176005e-05, + "loss": 1.9856, + "step": 17098 + }, + { + "epoch": 1.1475789403040166, + "grad_norm": 4.3277363777160645, + "learning_rate": 4.053145511933032e-05, + "loss": 2.1192, + "step": 17100 + }, + { + "epoch": 1.1477131639877856, + "grad_norm": 3.9958534240722656, + "learning_rate": 4.052078369185473e-05, + "loss": 2.1602, + "step": 17102 + }, + { + "epoch": 1.1478473876715547, + "grad_norm": 4.1068339347839355, + "learning_rate": 4.051011271225349e-05, + "loss": 2.0101, + "step": 17104 + }, + { + "epoch": 1.1479816113553236, + "grad_norm": 4.385814189910889, + "learning_rate": 4.049944218103076e-05, + "loss": 2.2173, + "step": 17106 + }, + { + "epoch": 1.1481158350390928, + "grad_norm": 4.327518463134766, + "learning_rate": 4.048877209869071e-05, + "loss": 2.0805, + "step": 17108 + }, + { + "epoch": 1.1482500587228617, + "grad_norm": 6.915212631225586, + "learning_rate": 4.047810246573746e-05, + "loss": 2.284, + "step": 17110 + }, + { + "epoch": 1.1483842824066306, + "grad_norm": 4.803452491760254, + "learning_rate": 4.046743328267516e-05, + "loss": 2.2578, + "step": 17112 + }, + { + "epoch": 1.1485185060903997, + "grad_norm": 4.548574447631836, + "learning_rate": 4.045676455000788e-05, + "loss": 2.0202, + "step": 17114 + }, + { + "epoch": 1.1486527297741687, + "grad_norm": 4.075754165649414, + "learning_rate": 4.0446096268239716e-05, + "loss": 1.9424, + "step": 17116 + }, + { + "epoch": 1.1487869534579376, + "grad_norm": 3.594846487045288, + "learning_rate": 4.043542843787472e-05, + "loss": 2.209, + "step": 17118 + }, + { + "epoch": 1.1489211771417067, + "grad_norm": 3.74484920501709, + "learning_rate": 4.0424761059416915e-05, + "loss": 2.198, + "step": 17120 + }, + { + "epoch": 1.1490554008254756, + "grad_norm": 4.094781875610352, + "learning_rate": 4.0414094133370325e-05, + "loss": 2.0725, + "step": 17122 + }, + { + "epoch": 1.1491896245092446, + "grad_norm": 3.5387256145477295, + "learning_rate": 4.040342766023894e-05, + "loss": 1.8944, + "step": 17124 + }, + { + "epoch": 1.1493238481930137, + "grad_norm": 4.256927013397217, + "learning_rate": 4.0392761640526705e-05, + "loss": 1.9018, + "step": 17126 + }, + { + "epoch": 1.1494580718767826, + "grad_norm": 3.711378335952759, + "learning_rate": 4.038209607473763e-05, + "loss": 2.1953, + "step": 17128 + }, + { + "epoch": 1.1495922955605518, + "grad_norm": 4.3256707191467285, + "learning_rate": 4.037143096337556e-05, + "loss": 2.177, + "step": 17130 + }, + { + "epoch": 1.1497265192443207, + "grad_norm": 3.738004684448242, + "learning_rate": 4.0360766306944474e-05, + "loss": 2.1597, + "step": 17132 + }, + { + "epoch": 1.1498607429280896, + "grad_norm": 3.6541125774383545, + "learning_rate": 4.035010210594821e-05, + "loss": 1.9087, + "step": 17134 + }, + { + "epoch": 1.1499949666118587, + "grad_norm": 3.8552310466766357, + "learning_rate": 4.033943836089066e-05, + "loss": 1.9484, + "step": 17136 + }, + { + "epoch": 1.1501291902956277, + "grad_norm": 3.717999219894409, + "learning_rate": 4.032877507227564e-05, + "loss": 1.972, + "step": 17138 + }, + { + "epoch": 1.1502634139793966, + "grad_norm": 5.895840167999268, + "learning_rate": 4.031811224060699e-05, + "loss": 2.0085, + "step": 17140 + }, + { + "epoch": 1.1503976376631657, + "grad_norm": 8.668353080749512, + "learning_rate": 4.030744986638848e-05, + "loss": 1.8586, + "step": 17142 + }, + { + "epoch": 1.1505318613469346, + "grad_norm": 8.167211532592773, + "learning_rate": 4.029678795012392e-05, + "loss": 1.9179, + "step": 17144 + }, + { + "epoch": 1.1506660850307036, + "grad_norm": 3.902292490005493, + "learning_rate": 4.028612649231704e-05, + "loss": 2.0096, + "step": 17146 + }, + { + "epoch": 1.1508003087144727, + "grad_norm": 4.114190101623535, + "learning_rate": 4.027546549347159e-05, + "loss": 1.9994, + "step": 17148 + }, + { + "epoch": 1.1509345323982416, + "grad_norm": 4.284456253051758, + "learning_rate": 4.026480495409127e-05, + "loss": 2.1822, + "step": 17150 + }, + { + "epoch": 1.1510687560820108, + "grad_norm": 4.274032115936279, + "learning_rate": 4.025414487467977e-05, + "loss": 2.2915, + "step": 17152 + }, + { + "epoch": 1.1512029797657797, + "grad_norm": 5.0892767906188965, + "learning_rate": 4.024348525574075e-05, + "loss": 2.0791, + "step": 17154 + }, + { + "epoch": 1.1513372034495486, + "grad_norm": 4.342426300048828, + "learning_rate": 4.0232826097777895e-05, + "loss": 1.9868, + "step": 17156 + }, + { + "epoch": 1.1514714271333177, + "grad_norm": 4.233348846435547, + "learning_rate": 4.0222167401294766e-05, + "loss": 1.7951, + "step": 17158 + }, + { + "epoch": 1.1516056508170867, + "grad_norm": 4.189921855926514, + "learning_rate": 4.021150916679502e-05, + "loss": 1.8565, + "step": 17160 + }, + { + "epoch": 1.1517398745008558, + "grad_norm": 4.558558940887451, + "learning_rate": 4.020085139478221e-05, + "loss": 2.1795, + "step": 17162 + }, + { + "epoch": 1.1518740981846247, + "grad_norm": 3.7284529209136963, + "learning_rate": 4.0190194085759915e-05, + "loss": 1.871, + "step": 17164 + }, + { + "epoch": 1.1520083218683936, + "grad_norm": 3.5783820152282715, + "learning_rate": 4.017953724023165e-05, + "loss": 1.8891, + "step": 17166 + }, + { + "epoch": 1.1521425455521626, + "grad_norm": 4.220331192016602, + "learning_rate": 4.016888085870094e-05, + "loss": 1.8325, + "step": 17168 + }, + { + "epoch": 1.1522767692359317, + "grad_norm": 3.951092004776001, + "learning_rate": 4.0158224941671285e-05, + "loss": 2.1557, + "step": 17170 + }, + { + "epoch": 1.1524109929197006, + "grad_norm": 3.5542142391204834, + "learning_rate": 4.0147569489646135e-05, + "loss": 2.0222, + "step": 17172 + }, + { + "epoch": 1.1525452166034698, + "grad_norm": 3.414698600769043, + "learning_rate": 4.013691450312897e-05, + "loss": 2.367, + "step": 17174 + }, + { + "epoch": 1.1526794402872387, + "grad_norm": 3.5986435413360596, + "learning_rate": 4.012625998262318e-05, + "loss": 1.884, + "step": 17176 + }, + { + "epoch": 1.1528136639710076, + "grad_norm": 4.3097944259643555, + "learning_rate": 4.0115605928632205e-05, + "loss": 1.9672, + "step": 17178 + }, + { + "epoch": 1.1529478876547767, + "grad_norm": 5.024585247039795, + "learning_rate": 4.0104952341659405e-05, + "loss": 2.065, + "step": 17180 + }, + { + "epoch": 1.1530821113385457, + "grad_norm": 4.0181427001953125, + "learning_rate": 4.0094299222208154e-05, + "loss": 2.2981, + "step": 17182 + }, + { + "epoch": 1.1532163350223148, + "grad_norm": 3.7729430198669434, + "learning_rate": 4.008364657078178e-05, + "loss": 1.9314, + "step": 17184 + }, + { + "epoch": 1.1533505587060837, + "grad_norm": 4.368673324584961, + "learning_rate": 4.007299438788362e-05, + "loss": 2.0737, + "step": 17186 + }, + { + "epoch": 1.1534847823898526, + "grad_norm": 3.5447864532470703, + "learning_rate": 4.0062342674016926e-05, + "loss": 2.1128, + "step": 17188 + }, + { + "epoch": 1.1536190060736218, + "grad_norm": 3.5079071521759033, + "learning_rate": 4.005169142968503e-05, + "loss": 2.0484, + "step": 17190 + }, + { + "epoch": 1.1537532297573907, + "grad_norm": 4.368807315826416, + "learning_rate": 4.004104065539112e-05, + "loss": 2.3149, + "step": 17192 + }, + { + "epoch": 1.1538874534411596, + "grad_norm": 5.155821800231934, + "learning_rate": 4.003039035163848e-05, + "loss": 2.063, + "step": 17194 + }, + { + "epoch": 1.1540216771249288, + "grad_norm": 3.9583632946014404, + "learning_rate": 4.0019740518930285e-05, + "loss": 2.0903, + "step": 17196 + }, + { + "epoch": 1.1541559008086977, + "grad_norm": 3.954953670501709, + "learning_rate": 4.000909115776973e-05, + "loss": 2.0802, + "step": 17198 + }, + { + "epoch": 1.1542901244924666, + "grad_norm": 3.9430577754974365, + "learning_rate": 3.999844226865996e-05, + "loss": 2.0655, + "step": 17200 + }, + { + "epoch": 1.1544243481762357, + "grad_norm": 3.9413959980010986, + "learning_rate": 3.998779385210414e-05, + "loss": 1.8381, + "step": 17202 + }, + { + "epoch": 1.1545585718600047, + "grad_norm": 5.140544414520264, + "learning_rate": 3.9977145908605355e-05, + "loss": 1.9098, + "step": 17204 + }, + { + "epoch": 1.1546927955437738, + "grad_norm": 4.027650833129883, + "learning_rate": 3.9966498438666734e-05, + "loss": 2.0863, + "step": 17206 + }, + { + "epoch": 1.1548270192275427, + "grad_norm": 4.412291526794434, + "learning_rate": 3.995585144279132e-05, + "loss": 2.4503, + "step": 17208 + }, + { + "epoch": 1.1549612429113116, + "grad_norm": 4.142503261566162, + "learning_rate": 3.994520492148218e-05, + "loss": 2.0739, + "step": 17210 + }, + { + "epoch": 1.1550954665950808, + "grad_norm": 3.6403908729553223, + "learning_rate": 3.9934558875242337e-05, + "loss": 2.1435, + "step": 17212 + }, + { + "epoch": 1.1552296902788497, + "grad_norm": 4.621021747589111, + "learning_rate": 3.9923913304574794e-05, + "loss": 2.0974, + "step": 17214 + }, + { + "epoch": 1.1553639139626186, + "grad_norm": 3.9361047744750977, + "learning_rate": 3.991326820998251e-05, + "loss": 2.0899, + "step": 17216 + }, + { + "epoch": 1.1554981376463878, + "grad_norm": 4.834806442260742, + "learning_rate": 3.9902623591968514e-05, + "loss": 1.8666, + "step": 17218 + }, + { + "epoch": 1.1556323613301567, + "grad_norm": 3.7355258464813232, + "learning_rate": 3.989197945103565e-05, + "loss": 2.1883, + "step": 17220 + }, + { + "epoch": 1.1557665850139256, + "grad_norm": 4.215298175811768, + "learning_rate": 3.9881335787686915e-05, + "loss": 2.2716, + "step": 17222 + }, + { + "epoch": 1.1559008086976947, + "grad_norm": 3.8984692096710205, + "learning_rate": 3.987069260242515e-05, + "loss": 2.0141, + "step": 17224 + }, + { + "epoch": 1.1560350323814637, + "grad_norm": 6.984610557556152, + "learning_rate": 3.986004989575325e-05, + "loss": 1.9033, + "step": 17226 + }, + { + "epoch": 1.1561692560652328, + "grad_norm": 4.775543212890625, + "learning_rate": 3.9849407668174044e-05, + "loss": 2.1264, + "step": 17228 + }, + { + "epoch": 1.1563034797490017, + "grad_norm": 4.333098888397217, + "learning_rate": 3.9838765920190376e-05, + "loss": 1.9496, + "step": 17230 + }, + { + "epoch": 1.1564377034327706, + "grad_norm": 4.418610095977783, + "learning_rate": 3.9828124652305024e-05, + "loss": 1.9861, + "step": 17232 + }, + { + "epoch": 1.1565719271165398, + "grad_norm": 5.043668746948242, + "learning_rate": 3.98174838650208e-05, + "loss": 2.1385, + "step": 17234 + }, + { + "epoch": 1.1567061508003087, + "grad_norm": 3.5776946544647217, + "learning_rate": 3.9806843558840427e-05, + "loss": 1.9342, + "step": 17236 + }, + { + "epoch": 1.1568403744840778, + "grad_norm": 3.827056646347046, + "learning_rate": 3.979620373426665e-05, + "loss": 2.0183, + "step": 17238 + }, + { + "epoch": 1.1569745981678468, + "grad_norm": 4.8617939949035645, + "learning_rate": 3.978556439180219e-05, + "loss": 1.9891, + "step": 17240 + }, + { + "epoch": 1.1571088218516157, + "grad_norm": 4.320433139801025, + "learning_rate": 3.9774925531949714e-05, + "loss": 2.0634, + "step": 17242 + }, + { + "epoch": 1.1572430455353846, + "grad_norm": 4.587085247039795, + "learning_rate": 3.9764287155211916e-05, + "loss": 2.3658, + "step": 17244 + }, + { + "epoch": 1.1573772692191537, + "grad_norm": 3.948594808578491, + "learning_rate": 3.975364926209141e-05, + "loss": 2.1537, + "step": 17246 + }, + { + "epoch": 1.1575114929029227, + "grad_norm": 4.110910892486572, + "learning_rate": 3.974301185309083e-05, + "loss": 2.0132, + "step": 17248 + }, + { + "epoch": 1.1576457165866918, + "grad_norm": 4.047597408294678, + "learning_rate": 3.973237492871276e-05, + "loss": 2.1966, + "step": 17250 + }, + { + "epoch": 1.1577799402704607, + "grad_norm": 3.9973435401916504, + "learning_rate": 3.972173848945981e-05, + "loss": 2.0346, + "step": 17252 + }, + { + "epoch": 1.1579141639542296, + "grad_norm": 3.726994276046753, + "learning_rate": 3.971110253583448e-05, + "loss": 2.1657, + "step": 17254 + }, + { + "epoch": 1.1580483876379988, + "grad_norm": 4.479213237762451, + "learning_rate": 3.970046706833935e-05, + "loss": 2.1367, + "step": 17256 + }, + { + "epoch": 1.1581826113217677, + "grad_norm": 3.8524513244628906, + "learning_rate": 3.968983208747686e-05, + "loss": 1.9249, + "step": 17258 + }, + { + "epoch": 1.1583168350055368, + "grad_norm": 3.9354114532470703, + "learning_rate": 3.967919759374956e-05, + "loss": 1.9687, + "step": 17260 + }, + { + "epoch": 1.1584510586893058, + "grad_norm": 14.781843185424805, + "learning_rate": 3.9668563587659865e-05, + "loss": 1.8329, + "step": 17262 + }, + { + "epoch": 1.1585852823730747, + "grad_norm": 4.237339973449707, + "learning_rate": 3.9657930069710236e-05, + "loss": 2.2435, + "step": 17264 + }, + { + "epoch": 1.1587195060568438, + "grad_norm": 3.910794734954834, + "learning_rate": 3.9647297040403066e-05, + "loss": 2.1395, + "step": 17266 + }, + { + "epoch": 1.1588537297406127, + "grad_norm": 3.6133859157562256, + "learning_rate": 3.963666450024075e-05, + "loss": 1.8392, + "step": 17268 + }, + { + "epoch": 1.1589879534243817, + "grad_norm": 3.5748751163482666, + "learning_rate": 3.962603244972566e-05, + "loss": 1.8564, + "step": 17270 + }, + { + "epoch": 1.1591221771081508, + "grad_norm": 3.665982484817505, + "learning_rate": 3.9615400889360146e-05, + "loss": 1.8952, + "step": 17272 + }, + { + "epoch": 1.1592564007919197, + "grad_norm": 3.778275728225708, + "learning_rate": 3.9604769819646515e-05, + "loss": 1.798, + "step": 17274 + }, + { + "epoch": 1.1593906244756886, + "grad_norm": 4.133974075317383, + "learning_rate": 3.959413924108707e-05, + "loss": 2.1089, + "step": 17276 + }, + { + "epoch": 1.1595248481594578, + "grad_norm": 4.053686141967773, + "learning_rate": 3.958350915418407e-05, + "loss": 2.0364, + "step": 17278 + }, + { + "epoch": 1.1596590718432267, + "grad_norm": 3.9031856060028076, + "learning_rate": 3.95728795594398e-05, + "loss": 2.0799, + "step": 17280 + }, + { + "epoch": 1.1597932955269958, + "grad_norm": 4.048644542694092, + "learning_rate": 3.9562250457356445e-05, + "loss": 2.2108, + "step": 17282 + }, + { + "epoch": 1.1599275192107648, + "grad_norm": 3.98164701461792, + "learning_rate": 3.955162184843625e-05, + "loss": 2.2849, + "step": 17284 + }, + { + "epoch": 1.1600617428945337, + "grad_norm": 3.491945505142212, + "learning_rate": 3.9540993733181366e-05, + "loss": 1.8978, + "step": 17286 + }, + { + "epoch": 1.1601959665783028, + "grad_norm": 3.6065049171447754, + "learning_rate": 3.953036611209397e-05, + "loss": 1.7282, + "step": 17288 + }, + { + "epoch": 1.1603301902620717, + "grad_norm": 3.4223265647888184, + "learning_rate": 3.951973898567617e-05, + "loss": 1.884, + "step": 17290 + }, + { + "epoch": 1.1604644139458407, + "grad_norm": 4.333828926086426, + "learning_rate": 3.950911235443012e-05, + "loss": 2.2265, + "step": 17292 + }, + { + "epoch": 1.1605986376296098, + "grad_norm": 4.10191011428833, + "learning_rate": 3.949848621885786e-05, + "loss": 1.9859, + "step": 17294 + }, + { + "epoch": 1.1607328613133787, + "grad_norm": 4.021450042724609, + "learning_rate": 3.948786057946149e-05, + "loss": 2.0567, + "step": 17296 + }, + { + "epoch": 1.1608670849971476, + "grad_norm": 3.6615653038024902, + "learning_rate": 3.9477235436743026e-05, + "loss": 2.0346, + "step": 17298 + }, + { + "epoch": 1.1610013086809168, + "grad_norm": 3.8212149143218994, + "learning_rate": 3.9466610791204504e-05, + "loss": 1.9364, + "step": 17300 + }, + { + "epoch": 1.1611355323646857, + "grad_norm": 3.270540952682495, + "learning_rate": 3.94559866433479e-05, + "loss": 1.9069, + "step": 17302 + }, + { + "epoch": 1.1612697560484548, + "grad_norm": 4.03375244140625, + "learning_rate": 3.9445362993675204e-05, + "loss": 2.1036, + "step": 17304 + }, + { + "epoch": 1.1614039797322238, + "grad_norm": 4.091415882110596, + "learning_rate": 3.9434739842688356e-05, + "loss": 2.1266, + "step": 17306 + }, + { + "epoch": 1.1615382034159927, + "grad_norm": 4.416433334350586, + "learning_rate": 3.942411719088926e-05, + "loss": 2.203, + "step": 17308 + }, + { + "epoch": 1.1616724270997618, + "grad_norm": 4.48150634765625, + "learning_rate": 3.941349503877984e-05, + "loss": 2.2732, + "step": 17310 + }, + { + "epoch": 1.1618066507835307, + "grad_norm": 4.414863586425781, + "learning_rate": 3.9402873386861944e-05, + "loss": 2.1457, + "step": 17312 + }, + { + "epoch": 1.1619408744672999, + "grad_norm": 4.345264434814453, + "learning_rate": 3.939225223563747e-05, + "loss": 2.0456, + "step": 17314 + }, + { + "epoch": 1.1620750981510688, + "grad_norm": 3.9269628524780273, + "learning_rate": 3.938163158560819e-05, + "loss": 1.7771, + "step": 17316 + }, + { + "epoch": 1.1622093218348377, + "grad_norm": 4.250432014465332, + "learning_rate": 3.9371011437275965e-05, + "loss": 1.9697, + "step": 17318 + }, + { + "epoch": 1.1623435455186066, + "grad_norm": 3.867906332015991, + "learning_rate": 3.936039179114253e-05, + "loss": 2.1046, + "step": 17320 + }, + { + "epoch": 1.1624777692023758, + "grad_norm": 4.2963337898254395, + "learning_rate": 3.9349772647709666e-05, + "loss": 1.9659, + "step": 17322 + }, + { + "epoch": 1.1626119928861447, + "grad_norm": 3.8769400119781494, + "learning_rate": 3.9339154007479096e-05, + "loss": 2.0049, + "step": 17324 + }, + { + "epoch": 1.1627462165699138, + "grad_norm": 4.117976188659668, + "learning_rate": 3.9328535870952545e-05, + "loss": 2.2156, + "step": 17326 + }, + { + "epoch": 1.1628804402536828, + "grad_norm": 3.889681100845337, + "learning_rate": 3.931791823863168e-05, + "loss": 2.06, + "step": 17328 + }, + { + "epoch": 1.1630146639374517, + "grad_norm": 4.520042896270752, + "learning_rate": 3.930730111101819e-05, + "loss": 2.2939, + "step": 17330 + }, + { + "epoch": 1.1631488876212208, + "grad_norm": 3.8316285610198975, + "learning_rate": 3.9296684488613675e-05, + "loss": 2.0233, + "step": 17332 + }, + { + "epoch": 1.1632831113049897, + "grad_norm": 3.8350720405578613, + "learning_rate": 3.928606837191979e-05, + "loss": 1.9456, + "step": 17334 + }, + { + "epoch": 1.1634173349887589, + "grad_norm": 3.748629331588745, + "learning_rate": 3.9275452761438095e-05, + "loss": 2.0589, + "step": 17336 + }, + { + "epoch": 1.1635515586725278, + "grad_norm": 4.0348358154296875, + "learning_rate": 3.926483765767017e-05, + "loss": 2.0164, + "step": 17338 + }, + { + "epoch": 1.1636857823562967, + "grad_norm": 4.422247886657715, + "learning_rate": 3.925422306111754e-05, + "loss": 2.1122, + "step": 17340 + }, + { + "epoch": 1.1638200060400659, + "grad_norm": 4.1362810134887695, + "learning_rate": 3.924360897228177e-05, + "loss": 2.1666, + "step": 17342 + }, + { + "epoch": 1.1639542297238348, + "grad_norm": 3.805716037750244, + "learning_rate": 3.9232995391664296e-05, + "loss": 1.9172, + "step": 17344 + }, + { + "epoch": 1.1640884534076037, + "grad_norm": 3.8781919479370117, + "learning_rate": 3.922238231976665e-05, + "loss": 1.9853, + "step": 17346 + }, + { + "epoch": 1.1642226770913728, + "grad_norm": 4.186763286590576, + "learning_rate": 3.921176975709021e-05, + "loss": 1.8913, + "step": 17348 + }, + { + "epoch": 1.1643569007751418, + "grad_norm": 3.6779701709747314, + "learning_rate": 3.920115770413646e-05, + "loss": 2.3952, + "step": 17350 + }, + { + "epoch": 1.1644911244589107, + "grad_norm": 4.359996795654297, + "learning_rate": 3.919054616140676e-05, + "loss": 2.076, + "step": 17352 + }, + { + "epoch": 1.1646253481426798, + "grad_norm": 4.4800496101379395, + "learning_rate": 3.917993512940251e-05, + "loss": 1.9541, + "step": 17354 + }, + { + "epoch": 1.1647595718264487, + "grad_norm": 3.88041353225708, + "learning_rate": 3.916932460862504e-05, + "loss": 1.7234, + "step": 17356 + }, + { + "epoch": 1.1648937955102179, + "grad_norm": 3.374702215194702, + "learning_rate": 3.915871459957569e-05, + "loss": 2.0234, + "step": 17358 + }, + { + "epoch": 1.1650280191939868, + "grad_norm": 3.956364154815674, + "learning_rate": 3.914810510275575e-05, + "loss": 1.9336, + "step": 17360 + }, + { + "epoch": 1.1651622428777557, + "grad_norm": 3.9814293384552, + "learning_rate": 3.913749611866651e-05, + "loss": 2.1117, + "step": 17362 + }, + { + "epoch": 1.1652964665615249, + "grad_norm": 4.399720668792725, + "learning_rate": 3.912688764780921e-05, + "loss": 1.7913, + "step": 17364 + }, + { + "epoch": 1.1654306902452938, + "grad_norm": 3.794949769973755, + "learning_rate": 3.91162796906851e-05, + "loss": 1.9711, + "step": 17366 + }, + { + "epoch": 1.1655649139290627, + "grad_norm": 4.257447242736816, + "learning_rate": 3.910567224779535e-05, + "loss": 2.056, + "step": 17368 + }, + { + "epoch": 1.1656991376128318, + "grad_norm": 3.8126637935638428, + "learning_rate": 3.90950653196412e-05, + "loss": 1.9578, + "step": 17370 + }, + { + "epoch": 1.1658333612966008, + "grad_norm": 4.219121932983398, + "learning_rate": 3.908445890672373e-05, + "loss": 2.1594, + "step": 17372 + }, + { + "epoch": 1.1659675849803697, + "grad_norm": 4.0714592933654785, + "learning_rate": 3.907385300954414e-05, + "loss": 1.8287, + "step": 17374 + }, + { + "epoch": 1.1661018086641388, + "grad_norm": 4.3575921058654785, + "learning_rate": 3.906324762860352e-05, + "loss": 2.1882, + "step": 17376 + }, + { + "epoch": 1.1662360323479077, + "grad_norm": 4.236461162567139, + "learning_rate": 3.9052642764402906e-05, + "loss": 2.1902, + "step": 17378 + }, + { + "epoch": 1.1663702560316769, + "grad_norm": 4.111412048339844, + "learning_rate": 3.904203841744343e-05, + "loss": 2.1063, + "step": 17380 + }, + { + "epoch": 1.1665044797154458, + "grad_norm": 3.7571136951446533, + "learning_rate": 3.903143458822606e-05, + "loss": 1.9657, + "step": 17382 + }, + { + "epoch": 1.1666387033992147, + "grad_norm": 4.043102264404297, + "learning_rate": 3.9020831277251863e-05, + "loss": 2.1816, + "step": 17384 + }, + { + "epoch": 1.1667729270829839, + "grad_norm": 3.269176483154297, + "learning_rate": 3.9010228485021784e-05, + "loss": 2.0005, + "step": 17386 + }, + { + "epoch": 1.1669071507667528, + "grad_norm": 4.441411972045898, + "learning_rate": 3.8999626212036805e-05, + "loss": 1.9358, + "step": 17388 + }, + { + "epoch": 1.167041374450522, + "grad_norm": 5.0426483154296875, + "learning_rate": 3.898902445879784e-05, + "loss": 1.8375, + "step": 17390 + }, + { + "epoch": 1.1671755981342908, + "grad_norm": 4.135997772216797, + "learning_rate": 3.897842322580584e-05, + "loss": 2.0294, + "step": 17392 + }, + { + "epoch": 1.1673098218180598, + "grad_norm": 4.274691104888916, + "learning_rate": 3.8967822513561656e-05, + "loss": 2.0919, + "step": 17394 + }, + { + "epoch": 1.1674440455018287, + "grad_norm": 3.9386866092681885, + "learning_rate": 3.895722232256618e-05, + "loss": 2.0598, + "step": 17396 + }, + { + "epoch": 1.1675782691855978, + "grad_norm": 3.8402135372161865, + "learning_rate": 3.8946622653320216e-05, + "loss": 1.9556, + "step": 17398 + }, + { + "epoch": 1.1677124928693667, + "grad_norm": 4.158082962036133, + "learning_rate": 3.89360235063246e-05, + "loss": 1.9917, + "step": 17400 + }, + { + "epoch": 1.1678467165531359, + "grad_norm": 4.135104179382324, + "learning_rate": 3.8925424882080105e-05, + "loss": 1.9625, + "step": 17402 + }, + { + "epoch": 1.1679809402369048, + "grad_norm": 4.281384468078613, + "learning_rate": 3.891482678108754e-05, + "loss": 2.0397, + "step": 17404 + }, + { + "epoch": 1.1681151639206737, + "grad_norm": 4.349688529968262, + "learning_rate": 3.890422920384758e-05, + "loss": 2.2023, + "step": 17406 + }, + { + "epoch": 1.1682493876044429, + "grad_norm": 3.490532398223877, + "learning_rate": 3.8893632150861004e-05, + "loss": 1.9964, + "step": 17408 + }, + { + "epoch": 1.1683836112882118, + "grad_norm": 4.102851867675781, + "learning_rate": 3.888303562262843e-05, + "loss": 2.0898, + "step": 17410 + }, + { + "epoch": 1.168517834971981, + "grad_norm": 4.0127644538879395, + "learning_rate": 3.887243961965059e-05, + "loss": 2.0211, + "step": 17412 + }, + { + "epoch": 1.1686520586557498, + "grad_norm": 4.075071811676025, + "learning_rate": 3.8861844142428083e-05, + "loss": 1.91, + "step": 17414 + }, + { + "epoch": 1.1687862823395188, + "grad_norm": 3.8620247840881348, + "learning_rate": 3.8851249191461544e-05, + "loss": 1.8602, + "step": 17416 + }, + { + "epoch": 1.168920506023288, + "grad_norm": 8.405160903930664, + "learning_rate": 3.884065476725156e-05, + "loss": 2.0219, + "step": 17418 + }, + { + "epoch": 1.1690547297070568, + "grad_norm": 4.67210054397583, + "learning_rate": 3.8830060870298696e-05, + "loss": 2.0777, + "step": 17420 + }, + { + "epoch": 1.1691889533908257, + "grad_norm": 3.9862823486328125, + "learning_rate": 3.881946750110348e-05, + "loss": 2.0563, + "step": 17422 + }, + { + "epoch": 1.1693231770745949, + "grad_norm": 3.8568947315216064, + "learning_rate": 3.880887466016645e-05, + "loss": 2.1141, + "step": 17424 + }, + { + "epoch": 1.1694574007583638, + "grad_norm": 3.6555752754211426, + "learning_rate": 3.879828234798808e-05, + "loss": 1.9744, + "step": 17426 + }, + { + "epoch": 1.1695916244421327, + "grad_norm": 4.603425979614258, + "learning_rate": 3.8787690565068846e-05, + "loss": 2.1507, + "step": 17428 + }, + { + "epoch": 1.1697258481259019, + "grad_norm": 4.4209465980529785, + "learning_rate": 3.877709931190917e-05, + "loss": 2.0197, + "step": 17430 + }, + { + "epoch": 1.1698600718096708, + "grad_norm": 3.9571800231933594, + "learning_rate": 3.876650858900951e-05, + "loss": 2.0902, + "step": 17432 + }, + { + "epoch": 1.16999429549344, + "grad_norm": 4.355197429656982, + "learning_rate": 3.87559183968702e-05, + "loss": 2.1782, + "step": 17434 + }, + { + "epoch": 1.1701285191772088, + "grad_norm": 4.493549823760986, + "learning_rate": 3.8745328735991664e-05, + "loss": 2.052, + "step": 17436 + }, + { + "epoch": 1.1702627428609778, + "grad_norm": 4.084664821624756, + "learning_rate": 3.8734739606874184e-05, + "loss": 1.974, + "step": 17438 + }, + { + "epoch": 1.170396966544747, + "grad_norm": 4.1563520431518555, + "learning_rate": 3.872415101001813e-05, + "loss": 2.0182, + "step": 17440 + }, + { + "epoch": 1.1705311902285158, + "grad_norm": 3.3738532066345215, + "learning_rate": 3.871356294592377e-05, + "loss": 1.8189, + "step": 17442 + }, + { + "epoch": 1.1706654139122847, + "grad_norm": 6.180014133453369, + "learning_rate": 3.870297541509134e-05, + "loss": 1.9543, + "step": 17444 + }, + { + "epoch": 1.1707996375960539, + "grad_norm": 4.494099140167236, + "learning_rate": 3.869238841802113e-05, + "loss": 2.1721, + "step": 17446 + }, + { + "epoch": 1.1709338612798228, + "grad_norm": 4.099746227264404, + "learning_rate": 3.868180195521332e-05, + "loss": 1.9256, + "step": 17448 + }, + { + "epoch": 1.1710680849635917, + "grad_norm": 3.833893299102783, + "learning_rate": 3.867121602716812e-05, + "loss": 1.9874, + "step": 17450 + }, + { + "epoch": 1.1712023086473609, + "grad_norm": 4.308292865753174, + "learning_rate": 3.8660630634385674e-05, + "loss": 2.2203, + "step": 17452 + }, + { + "epoch": 1.1713365323311298, + "grad_norm": 3.6919775009155273, + "learning_rate": 3.865004577736615e-05, + "loss": 1.8124, + "step": 17454 + }, + { + "epoch": 1.171470756014899, + "grad_norm": 4.056827068328857, + "learning_rate": 3.8639461456609626e-05, + "loss": 2.2319, + "step": 17456 + }, + { + "epoch": 1.1716049796986678, + "grad_norm": 4.191446304321289, + "learning_rate": 3.862887767261623e-05, + "loss": 2.0514, + "step": 17458 + }, + { + "epoch": 1.1717392033824368, + "grad_norm": 4.116202354431152, + "learning_rate": 3.861829442588598e-05, + "loss": 2.4102, + "step": 17460 + }, + { + "epoch": 1.171873427066206, + "grad_norm": 3.782087802886963, + "learning_rate": 3.860771171691895e-05, + "loss": 2.0448, + "step": 17462 + }, + { + "epoch": 1.1720076507499748, + "grad_norm": 3.4355621337890625, + "learning_rate": 3.8597129546215135e-05, + "loss": 2.1674, + "step": 17464 + }, + { + "epoch": 1.172141874433744, + "grad_norm": 3.4927384853363037, + "learning_rate": 3.858654791427453e-05, + "loss": 1.9954, + "step": 17466 + }, + { + "epoch": 1.1722760981175129, + "grad_norm": 3.464461326599121, + "learning_rate": 3.8575966821597075e-05, + "loss": 1.9734, + "step": 17468 + }, + { + "epoch": 1.1724103218012818, + "grad_norm": 4.244235038757324, + "learning_rate": 3.856538626868276e-05, + "loss": 2.2104, + "step": 17470 + }, + { + "epoch": 1.1725445454850507, + "grad_norm": 4.270380973815918, + "learning_rate": 3.855480625603142e-05, + "loss": 2.125, + "step": 17472 + }, + { + "epoch": 1.1726787691688199, + "grad_norm": 3.8521955013275146, + "learning_rate": 3.8544226784143e-05, + "loss": 2.0383, + "step": 17474 + }, + { + "epoch": 1.1728129928525888, + "grad_norm": 3.8376636505126953, + "learning_rate": 3.853364785351733e-05, + "loss": 1.9688, + "step": 17476 + }, + { + "epoch": 1.172947216536358, + "grad_norm": 3.5709874629974365, + "learning_rate": 3.852306946465426e-05, + "loss": 1.8823, + "step": 17478 + }, + { + "epoch": 1.1730814402201268, + "grad_norm": 3.955643653869629, + "learning_rate": 3.851249161805358e-05, + "loss": 2.0357, + "step": 17480 + }, + { + "epoch": 1.1732156639038958, + "grad_norm": 4.093015670776367, + "learning_rate": 3.8501914314215095e-05, + "loss": 1.8909, + "step": 17482 + }, + { + "epoch": 1.173349887587665, + "grad_norm": 4.025865077972412, + "learning_rate": 3.849133755363853e-05, + "loss": 1.9103, + "step": 17484 + }, + { + "epoch": 1.1734841112714338, + "grad_norm": 3.8356781005859375, + "learning_rate": 3.8480761336823654e-05, + "loss": 1.8711, + "step": 17486 + }, + { + "epoch": 1.173618334955203, + "grad_norm": 3.934797763824463, + "learning_rate": 3.8470185664270134e-05, + "loss": 2.2301, + "step": 17488 + }, + { + "epoch": 1.1737525586389719, + "grad_norm": 5.698752403259277, + "learning_rate": 3.845961053647768e-05, + "loss": 1.9796, + "step": 17490 + }, + { + "epoch": 1.1738867823227408, + "grad_norm": 4.0713090896606445, + "learning_rate": 3.844903595394592e-05, + "loss": 2.0154, + "step": 17492 + }, + { + "epoch": 1.17402100600651, + "grad_norm": 4.352439880371094, + "learning_rate": 3.843846191717453e-05, + "loss": 2.2978, + "step": 17494 + }, + { + "epoch": 1.1741552296902789, + "grad_norm": 4.233323574066162, + "learning_rate": 3.8427888426663044e-05, + "loss": 2.0223, + "step": 17496 + }, + { + "epoch": 1.1742894533740478, + "grad_norm": 3.906482696533203, + "learning_rate": 3.841731548291111e-05, + "loss": 2.1261, + "step": 17498 + }, + { + "epoch": 1.174423677057817, + "grad_norm": 4.674046039581299, + "learning_rate": 3.840674308641821e-05, + "loss": 2.2569, + "step": 17500 + }, + { + "epoch": 1.1745579007415858, + "grad_norm": 4.026844024658203, + "learning_rate": 3.839617123768392e-05, + "loss": 1.892, + "step": 17502 + }, + { + "epoch": 1.1746921244253548, + "grad_norm": 3.6739447116851807, + "learning_rate": 3.8385599937207713e-05, + "loss": 2.0371, + "step": 17504 + }, + { + "epoch": 1.174826348109124, + "grad_norm": 4.513969421386719, + "learning_rate": 3.837502918548907e-05, + "loss": 1.9559, + "step": 17506 + }, + { + "epoch": 1.1749605717928928, + "grad_norm": 3.790558338165283, + "learning_rate": 3.836445898302745e-05, + "loss": 1.9121, + "step": 17508 + }, + { + "epoch": 1.175094795476662, + "grad_norm": 4.366518974304199, + "learning_rate": 3.835388933032226e-05, + "loss": 2.1889, + "step": 17510 + }, + { + "epoch": 1.1752290191604309, + "grad_norm": 3.814040422439575, + "learning_rate": 3.8343320227872894e-05, + "loss": 1.8491, + "step": 17512 + }, + { + "epoch": 1.1753632428441998, + "grad_norm": 3.581501007080078, + "learning_rate": 3.833275167617872e-05, + "loss": 2.0197, + "step": 17514 + }, + { + "epoch": 1.175497466527969, + "grad_norm": 4.031160831451416, + "learning_rate": 3.83221836757391e-05, + "loss": 1.8939, + "step": 17516 + }, + { + "epoch": 1.1756316902117379, + "grad_norm": 4.381714820861816, + "learning_rate": 3.831161622705332e-05, + "loss": 2.1926, + "step": 17518 + }, + { + "epoch": 1.1757659138955068, + "grad_norm": 3.3912177085876465, + "learning_rate": 3.83010493306207e-05, + "loss": 1.9509, + "step": 17520 + }, + { + "epoch": 1.175900137579276, + "grad_norm": 3.898552656173706, + "learning_rate": 3.829048298694049e-05, + "loss": 2.0927, + "step": 17522 + }, + { + "epoch": 1.1760343612630448, + "grad_norm": 3.5130083560943604, + "learning_rate": 3.827991719651194e-05, + "loss": 1.8009, + "step": 17524 + }, + { + "epoch": 1.1761685849468138, + "grad_norm": 3.4741480350494385, + "learning_rate": 3.826935195983425e-05, + "loss": 1.9741, + "step": 17526 + }, + { + "epoch": 1.176302808630583, + "grad_norm": 4.040469169616699, + "learning_rate": 3.8258787277406615e-05, + "loss": 1.9854, + "step": 17528 + }, + { + "epoch": 1.1764370323143518, + "grad_norm": 4.216850757598877, + "learning_rate": 3.824822314972818e-05, + "loss": 2.0498, + "step": 17530 + }, + { + "epoch": 1.176571255998121, + "grad_norm": 5.427389144897461, + "learning_rate": 3.823765957729813e-05, + "loss": 1.873, + "step": 17532 + }, + { + "epoch": 1.1767054796818899, + "grad_norm": 4.114426136016846, + "learning_rate": 3.8227096560615486e-05, + "loss": 1.989, + "step": 17534 + }, + { + "epoch": 1.1768397033656588, + "grad_norm": 4.068340301513672, + "learning_rate": 3.821653410017941e-05, + "loss": 2.1185, + "step": 17536 + }, + { + "epoch": 1.176973927049428, + "grad_norm": 4.024460315704346, + "learning_rate": 3.820597219648891e-05, + "loss": 2.1607, + "step": 17538 + }, + { + "epoch": 1.1771081507331969, + "grad_norm": 4.9889020919799805, + "learning_rate": 3.819541085004304e-05, + "loss": 2.0628, + "step": 17540 + }, + { + "epoch": 1.177242374416966, + "grad_norm": 3.5026602745056152, + "learning_rate": 3.81848500613408e-05, + "loss": 1.8187, + "step": 17542 + }, + { + "epoch": 1.177376598100735, + "grad_norm": 3.792722702026367, + "learning_rate": 3.817428983088116e-05, + "loss": 1.9882, + "step": 17544 + }, + { + "epoch": 1.1775108217845038, + "grad_norm": 3.4087674617767334, + "learning_rate": 3.8163730159163065e-05, + "loss": 1.7928, + "step": 17546 + }, + { + "epoch": 1.1776450454682728, + "grad_norm": 4.097071170806885, + "learning_rate": 3.8153171046685454e-05, + "loss": 2.1166, + "step": 17548 + }, + { + "epoch": 1.177779269152042, + "grad_norm": 3.9379708766937256, + "learning_rate": 3.8142612493947205e-05, + "loss": 1.8445, + "step": 17550 + }, + { + "epoch": 1.1779134928358108, + "grad_norm": 4.199843883514404, + "learning_rate": 3.8132054501447206e-05, + "loss": 2.1056, + "step": 17552 + }, + { + "epoch": 1.17804771651958, + "grad_norm": 3.9285290241241455, + "learning_rate": 3.8121497069684285e-05, + "loss": 2.178, + "step": 17554 + }, + { + "epoch": 1.1781819402033489, + "grad_norm": 3.7351324558258057, + "learning_rate": 3.8110940199157284e-05, + "loss": 2.0368, + "step": 17556 + }, + { + "epoch": 1.1783161638871178, + "grad_norm": 4.177146911621094, + "learning_rate": 3.8100383890364956e-05, + "loss": 2.0645, + "step": 17558 + }, + { + "epoch": 1.178450387570887, + "grad_norm": 4.0497965812683105, + "learning_rate": 3.808982814380612e-05, + "loss": 2.0253, + "step": 17560 + }, + { + "epoch": 1.1785846112546559, + "grad_norm": 3.929353713989258, + "learning_rate": 3.8079272959979454e-05, + "loss": 2.0755, + "step": 17562 + }, + { + "epoch": 1.178718834938425, + "grad_norm": 3.7800936698913574, + "learning_rate": 3.8068718339383726e-05, + "loss": 1.8719, + "step": 17564 + }, + { + "epoch": 1.178853058622194, + "grad_norm": 3.7215726375579834, + "learning_rate": 3.805816428251757e-05, + "loss": 1.9467, + "step": 17566 + }, + { + "epoch": 1.1789872823059628, + "grad_norm": 3.8158631324768066, + "learning_rate": 3.8047610789879696e-05, + "loss": 2.0317, + "step": 17568 + }, + { + "epoch": 1.179121505989732, + "grad_norm": 3.9939379692077637, + "learning_rate": 3.8037057861968684e-05, + "loss": 2.0006, + "step": 17570 + }, + { + "epoch": 1.179255729673501, + "grad_norm": 4.280476093292236, + "learning_rate": 3.8026505499283184e-05, + "loss": 2.1612, + "step": 17572 + }, + { + "epoch": 1.1793899533572698, + "grad_norm": 4.21048641204834, + "learning_rate": 3.801595370232174e-05, + "loss": 2.1321, + "step": 17574 + }, + { + "epoch": 1.179524177041039, + "grad_norm": 3.8982644081115723, + "learning_rate": 3.800540247158293e-05, + "loss": 1.8925, + "step": 17576 + }, + { + "epoch": 1.1796584007248079, + "grad_norm": 4.83038330078125, + "learning_rate": 3.7994851807565254e-05, + "loss": 1.8711, + "step": 17578 + }, + { + "epoch": 1.1797926244085768, + "grad_norm": 3.78250789642334, + "learning_rate": 3.798430171076723e-05, + "loss": 2.0324, + "step": 17580 + }, + { + "epoch": 1.179926848092346, + "grad_norm": 4.361459732055664, + "learning_rate": 3.7973752181687335e-05, + "loss": 2.1149, + "step": 17582 + }, + { + "epoch": 1.1800610717761149, + "grad_norm": 4.378607273101807, + "learning_rate": 3.796320322082398e-05, + "loss": 2.2427, + "step": 17584 + }, + { + "epoch": 1.180195295459884, + "grad_norm": 3.953510284423828, + "learning_rate": 3.7952654828675616e-05, + "loss": 2.1137, + "step": 17586 + }, + { + "epoch": 1.180329519143653, + "grad_norm": 3.497314691543579, + "learning_rate": 3.794210700574061e-05, + "loss": 2.4574, + "step": 17588 + }, + { + "epoch": 1.1804637428274218, + "grad_norm": 3.580653429031372, + "learning_rate": 3.7931559752517354e-05, + "loss": 2.2216, + "step": 17590 + }, + { + "epoch": 1.180597966511191, + "grad_norm": 5.895651340484619, + "learning_rate": 3.792101306950414e-05, + "loss": 2.308, + "step": 17592 + }, + { + "epoch": 1.18073219019496, + "grad_norm": 3.578716278076172, + "learning_rate": 3.7910466957199336e-05, + "loss": 1.8554, + "step": 17594 + }, + { + "epoch": 1.1808664138787288, + "grad_norm": 4.513785362243652, + "learning_rate": 3.789992141610117e-05, + "loss": 1.9132, + "step": 17596 + }, + { + "epoch": 1.181000637562498, + "grad_norm": 4.265530109405518, + "learning_rate": 3.788937644670794e-05, + "loss": 2.165, + "step": 17598 + }, + { + "epoch": 1.1811348612462669, + "grad_norm": 4.839305400848389, + "learning_rate": 3.7878832049517846e-05, + "loss": 2.1411, + "step": 17600 + }, + { + "epoch": 1.1812690849300358, + "grad_norm": 4.421397686004639, + "learning_rate": 3.786828822502912e-05, + "loss": 1.9841, + "step": 17602 + }, + { + "epoch": 1.181403308613805, + "grad_norm": 4.295802593231201, + "learning_rate": 3.78577449737399e-05, + "loss": 2.1949, + "step": 17604 + }, + { + "epoch": 1.1815375322975739, + "grad_norm": 3.380497932434082, + "learning_rate": 3.784720229614838e-05, + "loss": 2.0554, + "step": 17606 + }, + { + "epoch": 1.181671755981343, + "grad_norm": 4.07744026184082, + "learning_rate": 3.783666019275263e-05, + "loss": 2.256, + "step": 17608 + }, + { + "epoch": 1.181805979665112, + "grad_norm": 4.037672519683838, + "learning_rate": 3.782611866405078e-05, + "loss": 1.8602, + "step": 17610 + }, + { + "epoch": 1.1819402033488808, + "grad_norm": 4.767341613769531, + "learning_rate": 3.7815577710540874e-05, + "loss": 2.137, + "step": 17612 + }, + { + "epoch": 1.18207442703265, + "grad_norm": 3.5512402057647705, + "learning_rate": 3.7805037332720975e-05, + "loss": 2.0253, + "step": 17614 + }, + { + "epoch": 1.182208650716419, + "grad_norm": 4.179635524749756, + "learning_rate": 3.7794497531089066e-05, + "loss": 1.9564, + "step": 17616 + }, + { + "epoch": 1.182342874400188, + "grad_norm": 3.8206984996795654, + "learning_rate": 3.778395830614317e-05, + "loss": 1.9951, + "step": 17618 + }, + { + "epoch": 1.182477098083957, + "grad_norm": 4.33563232421875, + "learning_rate": 3.7773419658381185e-05, + "loss": 1.8306, + "step": 17620 + }, + { + "epoch": 1.1826113217677259, + "grad_norm": 4.057651042938232, + "learning_rate": 3.7762881588301115e-05, + "loss": 1.8743, + "step": 17622 + }, + { + "epoch": 1.1827455454514948, + "grad_norm": 3.601088762283325, + "learning_rate": 3.7752344096400794e-05, + "loss": 2.0946, + "step": 17624 + }, + { + "epoch": 1.182879769135264, + "grad_norm": 4.23453426361084, + "learning_rate": 3.774180718317816e-05, + "loss": 1.8625, + "step": 17626 + }, + { + "epoch": 1.1830139928190329, + "grad_norm": 3.9731812477111816, + "learning_rate": 3.7731270849131014e-05, + "loss": 1.9381, + "step": 17628 + }, + { + "epoch": 1.183148216502802, + "grad_norm": 3.798396348953247, + "learning_rate": 3.772073509475721e-05, + "loss": 2.0561, + "step": 17630 + }, + { + "epoch": 1.183282440186571, + "grad_norm": 4.284620761871338, + "learning_rate": 3.7710199920554513e-05, + "loss": 2.2251, + "step": 17632 + }, + { + "epoch": 1.1834166638703398, + "grad_norm": 4.420039176940918, + "learning_rate": 3.769966532702071e-05, + "loss": 2.0063, + "step": 17634 + }, + { + "epoch": 1.183550887554109, + "grad_norm": 5.418274879455566, + "learning_rate": 3.7689131314653525e-05, + "loss": 1.8041, + "step": 17636 + }, + { + "epoch": 1.183685111237878, + "grad_norm": 3.7930116653442383, + "learning_rate": 3.7678597883950696e-05, + "loss": 1.8781, + "step": 17638 + }, + { + "epoch": 1.183819334921647, + "grad_norm": 4.262333869934082, + "learning_rate": 3.7668065035409864e-05, + "loss": 2.1361, + "step": 17640 + }, + { + "epoch": 1.183953558605416, + "grad_norm": 4.274511337280273, + "learning_rate": 3.765753276952873e-05, + "loss": 2.0133, + "step": 17642 + }, + { + "epoch": 1.1840877822891849, + "grad_norm": 4.286556720733643, + "learning_rate": 3.764700108680489e-05, + "loss": 2.4483, + "step": 17644 + }, + { + "epoch": 1.184222005972954, + "grad_norm": 4.524280548095703, + "learning_rate": 3.763646998773596e-05, + "loss": 1.9087, + "step": 17646 + }, + { + "epoch": 1.184356229656723, + "grad_norm": 3.559089422225952, + "learning_rate": 3.762593947281952e-05, + "loss": 1.6603, + "step": 17648 + }, + { + "epoch": 1.1844904533404919, + "grad_norm": 4.232026100158691, + "learning_rate": 3.7615409542553094e-05, + "loss": 2.1442, + "step": 17650 + }, + { + "epoch": 1.184624677024261, + "grad_norm": 3.106931686401367, + "learning_rate": 3.760488019743422e-05, + "loss": 1.8548, + "step": 17652 + }, + { + "epoch": 1.18475890070803, + "grad_norm": 4.507011890411377, + "learning_rate": 3.759435143796037e-05, + "loss": 2.183, + "step": 17654 + }, + { + "epoch": 1.1848931243917988, + "grad_norm": 6.107194423675537, + "learning_rate": 3.7583823264629045e-05, + "loss": 2.3206, + "step": 17656 + }, + { + "epoch": 1.185027348075568, + "grad_norm": 5.290421485900879, + "learning_rate": 3.7573295677937633e-05, + "loss": 2.0661, + "step": 17658 + }, + { + "epoch": 1.185161571759337, + "grad_norm": 3.862813711166382, + "learning_rate": 3.7562768678383576e-05, + "loss": 1.8583, + "step": 17660 + }, + { + "epoch": 1.185295795443106, + "grad_norm": 4.155878067016602, + "learning_rate": 3.755224226646423e-05, + "loss": 1.9361, + "step": 17662 + }, + { + "epoch": 1.185430019126875, + "grad_norm": 4.159115314483643, + "learning_rate": 3.754171644267698e-05, + "loss": 2.2006, + "step": 17664 + }, + { + "epoch": 1.1855642428106439, + "grad_norm": 4.343072414398193, + "learning_rate": 3.7531191207519114e-05, + "loss": 2.0918, + "step": 17666 + }, + { + "epoch": 1.185698466494413, + "grad_norm": 4.290927886962891, + "learning_rate": 3.752066656148796e-05, + "loss": 2.3145, + "step": 17668 + }, + { + "epoch": 1.185832690178182, + "grad_norm": 3.7341694831848145, + "learning_rate": 3.7510142505080746e-05, + "loss": 2.1194, + "step": 17670 + }, + { + "epoch": 1.1859669138619509, + "grad_norm": 4.458971977233887, + "learning_rate": 3.749961903879477e-05, + "loss": 2.2388, + "step": 17672 + }, + { + "epoch": 1.18610113754572, + "grad_norm": 4.335763931274414, + "learning_rate": 3.748909616312718e-05, + "loss": 1.9513, + "step": 17674 + }, + { + "epoch": 1.186235361229489, + "grad_norm": 3.3947412967681885, + "learning_rate": 3.747857387857522e-05, + "loss": 1.8353, + "step": 17676 + }, + { + "epoch": 1.1863695849132578, + "grad_norm": 4.099704265594482, + "learning_rate": 3.7468052185636e-05, + "loss": 2.1904, + "step": 17678 + }, + { + "epoch": 1.186503808597027, + "grad_norm": 4.274200916290283, + "learning_rate": 3.74575310848067e-05, + "loss": 2.2195, + "step": 17680 + }, + { + "epoch": 1.186638032280796, + "grad_norm": 4.251743793487549, + "learning_rate": 3.744701057658436e-05, + "loss": 1.9537, + "step": 17682 + }, + { + "epoch": 1.186772255964565, + "grad_norm": 4.022055149078369, + "learning_rate": 3.743649066146612e-05, + "loss": 2.3266, + "step": 17684 + }, + { + "epoch": 1.186906479648334, + "grad_norm": 4.13052225112915, + "learning_rate": 3.742597133994896e-05, + "loss": 2.3428, + "step": 17686 + }, + { + "epoch": 1.1870407033321029, + "grad_norm": 3.8645830154418945, + "learning_rate": 3.741545261252994e-05, + "loss": 1.8943, + "step": 17688 + }, + { + "epoch": 1.187174927015872, + "grad_norm": 3.7792184352874756, + "learning_rate": 3.740493447970604e-05, + "loss": 2.0249, + "step": 17690 + }, + { + "epoch": 1.187309150699641, + "grad_norm": 4.169731140136719, + "learning_rate": 3.739441694197422e-05, + "loss": 2.0367, + "step": 17692 + }, + { + "epoch": 1.18744337438341, + "grad_norm": 3.4428770542144775, + "learning_rate": 3.7383899999831406e-05, + "loss": 1.7758, + "step": 17694 + }, + { + "epoch": 1.187577598067179, + "grad_norm": 3.682957887649536, + "learning_rate": 3.737338365377452e-05, + "loss": 1.8572, + "step": 17696 + }, + { + "epoch": 1.187711821750948, + "grad_norm": 4.136600017547607, + "learning_rate": 3.7362867904300415e-05, + "loss": 2.2797, + "step": 17698 + }, + { + "epoch": 1.1878460454347168, + "grad_norm": 3.5561916828155518, + "learning_rate": 3.7352352751905964e-05, + "loss": 1.8715, + "step": 17700 + }, + { + "epoch": 1.187980269118486, + "grad_norm": 3.9872772693634033, + "learning_rate": 3.7341838197087964e-05, + "loss": 2.1599, + "step": 17702 + }, + { + "epoch": 1.188114492802255, + "grad_norm": 3.7501676082611084, + "learning_rate": 3.7331324240343226e-05, + "loss": 2.3001, + "step": 17704 + }, + { + "epoch": 1.188248716486024, + "grad_norm": 4.303133964538574, + "learning_rate": 3.7320810882168495e-05, + "loss": 2.0995, + "step": 17706 + }, + { + "epoch": 1.188382940169793, + "grad_norm": 4.1472296714782715, + "learning_rate": 3.7310298123060527e-05, + "loss": 2.4396, + "step": 17708 + }, + { + "epoch": 1.1885171638535619, + "grad_norm": 4.260064125061035, + "learning_rate": 3.7299785963516e-05, + "loss": 2.1176, + "step": 17710 + }, + { + "epoch": 1.188651387537331, + "grad_norm": 3.907296657562256, + "learning_rate": 3.728927440403165e-05, + "loss": 2.2509, + "step": 17712 + }, + { + "epoch": 1.1887856112211, + "grad_norm": 3.7040207386016846, + "learning_rate": 3.727876344510405e-05, + "loss": 1.8509, + "step": 17714 + }, + { + "epoch": 1.188919834904869, + "grad_norm": 3.918274402618408, + "learning_rate": 3.726825308722989e-05, + "loss": 2.1177, + "step": 17716 + }, + { + "epoch": 1.189054058588638, + "grad_norm": 3.8865373134613037, + "learning_rate": 3.725774333090574e-05, + "loss": 1.7904, + "step": 17718 + }, + { + "epoch": 1.189188282272407, + "grad_norm": 4.097355365753174, + "learning_rate": 3.724723417662813e-05, + "loss": 2.3198, + "step": 17720 + }, + { + "epoch": 1.189322505956176, + "grad_norm": 4.174825668334961, + "learning_rate": 3.723672562489366e-05, + "loss": 1.828, + "step": 17722 + }, + { + "epoch": 1.189456729639945, + "grad_norm": 4.089765548706055, + "learning_rate": 3.7226217676198775e-05, + "loss": 2.2471, + "step": 17724 + }, + { + "epoch": 1.189590953323714, + "grad_norm": 4.480666637420654, + "learning_rate": 3.721571033104001e-05, + "loss": 2.1148, + "step": 17726 + }, + { + "epoch": 1.189725177007483, + "grad_norm": 3.725545883178711, + "learning_rate": 3.720520358991378e-05, + "loss": 1.7731, + "step": 17728 + }, + { + "epoch": 1.189859400691252, + "grad_norm": 4.944480895996094, + "learning_rate": 3.719469745331653e-05, + "loss": 2.3655, + "step": 17730 + }, + { + "epoch": 1.1899936243750209, + "grad_norm": 4.803215503692627, + "learning_rate": 3.718419192174464e-05, + "loss": 1.7333, + "step": 17732 + }, + { + "epoch": 1.19012784805879, + "grad_norm": 3.5096688270568848, + "learning_rate": 3.717368699569448e-05, + "loss": 2.0061, + "step": 17734 + }, + { + "epoch": 1.190262071742559, + "grad_norm": 4.376622676849365, + "learning_rate": 3.716318267566238e-05, + "loss": 1.8162, + "step": 17736 + }, + { + "epoch": 1.190396295426328, + "grad_norm": 5.052789688110352, + "learning_rate": 3.715267896214467e-05, + "loss": 2.3261, + "step": 17738 + }, + { + "epoch": 1.190530519110097, + "grad_norm": 3.973135232925415, + "learning_rate": 3.7142175855637606e-05, + "loss": 1.8587, + "step": 17740 + }, + { + "epoch": 1.190664742793866, + "grad_norm": 4.158870697021484, + "learning_rate": 3.7131673356637464e-05, + "loss": 1.9992, + "step": 17742 + }, + { + "epoch": 1.190798966477635, + "grad_norm": 4.083337783813477, + "learning_rate": 3.7121171465640425e-05, + "loss": 2.0641, + "step": 17744 + }, + { + "epoch": 1.190933190161404, + "grad_norm": 4.14635705947876, + "learning_rate": 3.7110670183142735e-05, + "loss": 2.0831, + "step": 17746 + }, + { + "epoch": 1.191067413845173, + "grad_norm": 3.9749066829681396, + "learning_rate": 3.7100169509640517e-05, + "loss": 2.0682, + "step": 17748 + }, + { + "epoch": 1.191201637528942, + "grad_norm": 4.021341323852539, + "learning_rate": 3.708966944562993e-05, + "loss": 2.1054, + "step": 17750 + }, + { + "epoch": 1.191335861212711, + "grad_norm": 3.5742788314819336, + "learning_rate": 3.707916999160706e-05, + "loss": 1.9403, + "step": 17752 + }, + { + "epoch": 1.1914700848964799, + "grad_norm": 4.376470565795898, + "learning_rate": 3.706867114806803e-05, + "loss": 2.0063, + "step": 17754 + }, + { + "epoch": 1.191604308580249, + "grad_norm": 3.6129579544067383, + "learning_rate": 3.705817291550884e-05, + "loss": 2.1244, + "step": 17756 + }, + { + "epoch": 1.191738532264018, + "grad_norm": 4.625225067138672, + "learning_rate": 3.7047675294425535e-05, + "loss": 2.288, + "step": 17758 + }, + { + "epoch": 1.191872755947787, + "grad_norm": 4.185909748077393, + "learning_rate": 3.70371782853141e-05, + "loss": 1.9201, + "step": 17760 + }, + { + "epoch": 1.192006979631556, + "grad_norm": 4.1606550216674805, + "learning_rate": 3.702668188867051e-05, + "loss": 2.2664, + "step": 17762 + }, + { + "epoch": 1.192141203315325, + "grad_norm": 3.382395029067993, + "learning_rate": 3.701618610499068e-05, + "loss": 1.8881, + "step": 17764 + }, + { + "epoch": 1.192275426999094, + "grad_norm": 3.995638608932495, + "learning_rate": 3.7005690934770534e-05, + "loss": 1.9099, + "step": 17766 + }, + { + "epoch": 1.192409650682863, + "grad_norm": 4.475737571716309, + "learning_rate": 3.6995196378505926e-05, + "loss": 2.1397, + "step": 17768 + }, + { + "epoch": 1.192543874366632, + "grad_norm": 4.248978137969971, + "learning_rate": 3.6984702436692726e-05, + "loss": 2.0223, + "step": 17770 + }, + { + "epoch": 1.192678098050401, + "grad_norm": 5.678727149963379, + "learning_rate": 3.6974209109826726e-05, + "loss": 2.1919, + "step": 17772 + }, + { + "epoch": 1.19281232173417, + "grad_norm": 3.9534339904785156, + "learning_rate": 3.696371639840376e-05, + "loss": 1.9311, + "step": 17774 + }, + { + "epoch": 1.1929465454179389, + "grad_norm": 4.705215930938721, + "learning_rate": 3.695322430291953e-05, + "loss": 2.1831, + "step": 17776 + }, + { + "epoch": 1.193080769101708, + "grad_norm": 3.777242422103882, + "learning_rate": 3.694273282386981e-05, + "loss": 1.9689, + "step": 17778 + }, + { + "epoch": 1.193214992785477, + "grad_norm": 4.271650791168213, + "learning_rate": 3.693224196175029e-05, + "loss": 2.1588, + "step": 17780 + }, + { + "epoch": 1.193349216469246, + "grad_norm": 3.5074336528778076, + "learning_rate": 3.692175171705664e-05, + "loss": 2.0055, + "step": 17782 + }, + { + "epoch": 1.193483440153015, + "grad_norm": 4.476590633392334, + "learning_rate": 3.6911262090284506e-05, + "loss": 2.0212, + "step": 17784 + }, + { + "epoch": 1.193617663836784, + "grad_norm": 4.36214017868042, + "learning_rate": 3.6900773081929496e-05, + "loss": 1.8662, + "step": 17786 + }, + { + "epoch": 1.193751887520553, + "grad_norm": 3.686197519302368, + "learning_rate": 3.689028469248721e-05, + "loss": 1.9718, + "step": 17788 + }, + { + "epoch": 1.193886111204322, + "grad_norm": 4.169187068939209, + "learning_rate": 3.687979692245318e-05, + "loss": 2.2375, + "step": 17790 + }, + { + "epoch": 1.1940203348880911, + "grad_norm": 4.178590774536133, + "learning_rate": 3.6869309772322955e-05, + "loss": 2.1689, + "step": 17792 + }, + { + "epoch": 1.19415455857186, + "grad_norm": 4.229405403137207, + "learning_rate": 3.685882324259201e-05, + "loss": 2.2201, + "step": 17794 + }, + { + "epoch": 1.194288782255629, + "grad_norm": 4.214848518371582, + "learning_rate": 3.684833733375584e-05, + "loss": 1.9454, + "step": 17796 + }, + { + "epoch": 1.194423005939398, + "grad_norm": 3.660121202468872, + "learning_rate": 3.683785204630986e-05, + "loss": 1.979, + "step": 17798 + }, + { + "epoch": 1.194557229623167, + "grad_norm": 4.558324337005615, + "learning_rate": 3.68273673807495e-05, + "loss": 2.0639, + "step": 17800 + }, + { + "epoch": 1.194691453306936, + "grad_norm": 4.12802267074585, + "learning_rate": 3.681688333757011e-05, + "loss": 2.0068, + "step": 17802 + }, + { + "epoch": 1.194825676990705, + "grad_norm": 3.7165751457214355, + "learning_rate": 3.6806399917267074e-05, + "loss": 2.0341, + "step": 17804 + }, + { + "epoch": 1.194959900674474, + "grad_norm": 5.121122360229492, + "learning_rate": 3.679591712033568e-05, + "loss": 1.8846, + "step": 17806 + }, + { + "epoch": 1.195094124358243, + "grad_norm": 4.054849147796631, + "learning_rate": 3.678543494727126e-05, + "loss": 2.3157, + "step": 17808 + }, + { + "epoch": 1.195228348042012, + "grad_norm": 3.6385462284088135, + "learning_rate": 3.677495339856903e-05, + "loss": 2.1569, + "step": 17810 + }, + { + "epoch": 1.195362571725781, + "grad_norm": 4.3738555908203125, + "learning_rate": 3.676447247472428e-05, + "loss": 2.0528, + "step": 17812 + }, + { + "epoch": 1.1954967954095501, + "grad_norm": 3.919443368911743, + "learning_rate": 3.6753992176232136e-05, + "loss": 1.997, + "step": 17814 + }, + { + "epoch": 1.195631019093319, + "grad_norm": 3.963308811187744, + "learning_rate": 3.6743512503587844e-05, + "loss": 1.982, + "step": 17816 + }, + { + "epoch": 1.195765242777088, + "grad_norm": 3.8931446075439453, + "learning_rate": 3.673303345728651e-05, + "loss": 1.9326, + "step": 17818 + }, + { + "epoch": 1.195899466460857, + "grad_norm": 4.30885648727417, + "learning_rate": 3.672255503782326e-05, + "loss": 2.1387, + "step": 17820 + }, + { + "epoch": 1.196033690144626, + "grad_norm": 3.722090005874634, + "learning_rate": 3.671207724569317e-05, + "loss": 2.1407, + "step": 17822 + }, + { + "epoch": 1.196167913828395, + "grad_norm": 3.7632765769958496, + "learning_rate": 3.670160008139131e-05, + "loss": 1.8942, + "step": 17824 + }, + { + "epoch": 1.196302137512164, + "grad_norm": 4.299487113952637, + "learning_rate": 3.669112354541269e-05, + "loss": 2.0221, + "step": 17826 + }, + { + "epoch": 1.196436361195933, + "grad_norm": 4.034973621368408, + "learning_rate": 3.668064763825231e-05, + "loss": 2.2373, + "step": 17828 + }, + { + "epoch": 1.196570584879702, + "grad_norm": 3.851668357849121, + "learning_rate": 3.6670172360405136e-05, + "loss": 1.9042, + "step": 17830 + }, + { + "epoch": 1.196704808563471, + "grad_norm": 3.857797145843506, + "learning_rate": 3.6659697712366116e-05, + "loss": 2.0459, + "step": 17832 + }, + { + "epoch": 1.19683903224724, + "grad_norm": 3.713578939437866, + "learning_rate": 3.664922369463012e-05, + "loss": 1.7979, + "step": 17834 + }, + { + "epoch": 1.1969732559310091, + "grad_norm": 3.684093713760376, + "learning_rate": 3.663875030769209e-05, + "loss": 1.9385, + "step": 17836 + }, + { + "epoch": 1.197107479614778, + "grad_norm": 4.531587600708008, + "learning_rate": 3.66282775520468e-05, + "loss": 1.9564, + "step": 17838 + }, + { + "epoch": 1.197241703298547, + "grad_norm": 4.090007305145264, + "learning_rate": 3.6617805428189135e-05, + "loss": 2.1292, + "step": 17840 + }, + { + "epoch": 1.197375926982316, + "grad_norm": 4.13726806640625, + "learning_rate": 3.660733393661381e-05, + "loss": 2.1374, + "step": 17842 + }, + { + "epoch": 1.197510150666085, + "grad_norm": 3.9824883937835693, + "learning_rate": 3.6596863077815644e-05, + "loss": 2.2203, + "step": 17844 + }, + { + "epoch": 1.197644374349854, + "grad_norm": 4.508298873901367, + "learning_rate": 3.658639285228934e-05, + "loss": 2.4885, + "step": 17846 + }, + { + "epoch": 1.197778598033623, + "grad_norm": 3.415364980697632, + "learning_rate": 3.65759232605296e-05, + "loss": 1.8531, + "step": 17848 + }, + { + "epoch": 1.197912821717392, + "grad_norm": 3.864799976348877, + "learning_rate": 3.656545430303108e-05, + "loss": 1.8725, + "step": 17850 + }, + { + "epoch": 1.198047045401161, + "grad_norm": 4.357699394226074, + "learning_rate": 3.655498598028844e-05, + "loss": 2.1186, + "step": 17852 + }, + { + "epoch": 1.19818126908493, + "grad_norm": 3.8932948112487793, + "learning_rate": 3.654451829279626e-05, + "loss": 1.9567, + "step": 17854 + }, + { + "epoch": 1.198315492768699, + "grad_norm": 4.536191940307617, + "learning_rate": 3.6534051241049137e-05, + "loss": 2.1682, + "step": 17856 + }, + { + "epoch": 1.1984497164524681, + "grad_norm": 3.884850025177002, + "learning_rate": 3.652358482554162e-05, + "loss": 2.1177, + "step": 17858 + }, + { + "epoch": 1.198583940136237, + "grad_norm": 3.78303599357605, + "learning_rate": 3.6513119046768206e-05, + "loss": 2.1459, + "step": 17860 + }, + { + "epoch": 1.198718163820006, + "grad_norm": 4.11902379989624, + "learning_rate": 3.65026539052234e-05, + "loss": 2.2729, + "step": 17862 + }, + { + "epoch": 1.198852387503775, + "grad_norm": 3.8779430389404297, + "learning_rate": 3.6492189401401656e-05, + "loss": 2.1249, + "step": 17864 + }, + { + "epoch": 1.198986611187544, + "grad_norm": 4.560471057891846, + "learning_rate": 3.6481725535797415e-05, + "loss": 2.3232, + "step": 17866 + }, + { + "epoch": 1.1991208348713132, + "grad_norm": 3.6883487701416016, + "learning_rate": 3.647126230890503e-05, + "loss": 2.1459, + "step": 17868 + }, + { + "epoch": 1.199255058555082, + "grad_norm": 3.997070789337158, + "learning_rate": 3.646079972121893e-05, + "loss": 2.0576, + "step": 17870 + }, + { + "epoch": 1.199389282238851, + "grad_norm": 4.284945011138916, + "learning_rate": 3.645033777323339e-05, + "loss": 1.9004, + "step": 17872 + }, + { + "epoch": 1.1995235059226201, + "grad_norm": 3.5917866230010986, + "learning_rate": 3.643987646544278e-05, + "loss": 2.2771, + "step": 17874 + }, + { + "epoch": 1.199657729606389, + "grad_norm": 3.317284345626831, + "learning_rate": 3.642941579834131e-05, + "loss": 1.743, + "step": 17876 + }, + { + "epoch": 1.199791953290158, + "grad_norm": 3.8312883377075195, + "learning_rate": 3.641895577242327e-05, + "loss": 2.2012, + "step": 17878 + }, + { + "epoch": 1.1999261769739271, + "grad_norm": 3.4869935512542725, + "learning_rate": 3.640849638818286e-05, + "loss": 1.9717, + "step": 17880 + }, + { + "epoch": 1.200060400657696, + "grad_norm": 4.318378448486328, + "learning_rate": 3.639803764611428e-05, + "loss": 1.9948, + "step": 17882 + }, + { + "epoch": 1.200194624341465, + "grad_norm": 4.893423557281494, + "learning_rate": 3.6387579546711656e-05, + "loss": 2.2751, + "step": 17884 + }, + { + "epoch": 1.200328848025234, + "grad_norm": 4.9277472496032715, + "learning_rate": 3.637712209046915e-05, + "loss": 1.9907, + "step": 17886 + }, + { + "epoch": 1.200463071709003, + "grad_norm": 3.8176181316375732, + "learning_rate": 3.636666527788083e-05, + "loss": 1.9174, + "step": 17888 + }, + { + "epoch": 1.2005972953927722, + "grad_norm": 4.1089935302734375, + "learning_rate": 3.635620910944077e-05, + "loss": 1.9596, + "step": 17890 + }, + { + "epoch": 1.200731519076541, + "grad_norm": 4.128640174865723, + "learning_rate": 3.6345753585642996e-05, + "loss": 2.1235, + "step": 17892 + }, + { + "epoch": 1.20086574276031, + "grad_norm": 3.8015592098236084, + "learning_rate": 3.6335298706981517e-05, + "loss": 2.0519, + "step": 17894 + }, + { + "epoch": 1.2009999664440791, + "grad_norm": 4.442697048187256, + "learning_rate": 3.632484447395029e-05, + "loss": 2.5651, + "step": 17896 + }, + { + "epoch": 1.201134190127848, + "grad_norm": 4.054257392883301, + "learning_rate": 3.63143908870433e-05, + "loss": 1.9038, + "step": 17898 + }, + { + "epoch": 1.201268413811617, + "grad_norm": 3.8199737071990967, + "learning_rate": 3.63039379467544e-05, + "loss": 2.0854, + "step": 17900 + }, + { + "epoch": 1.2014026374953861, + "grad_norm": 3.6211609840393066, + "learning_rate": 3.629348565357752e-05, + "loss": 1.8347, + "step": 17902 + }, + { + "epoch": 1.201536861179155, + "grad_norm": 3.884718418121338, + "learning_rate": 3.6283034008006465e-05, + "loss": 1.9217, + "step": 17904 + }, + { + "epoch": 1.201671084862924, + "grad_norm": 4.518759727478027, + "learning_rate": 3.627258301053511e-05, + "loss": 2.0338, + "step": 17906 + }, + { + "epoch": 1.201805308546693, + "grad_norm": 4.095243453979492, + "learning_rate": 3.626213266165719e-05, + "loss": 2.2678, + "step": 17908 + }, + { + "epoch": 1.201939532230462, + "grad_norm": 4.125770092010498, + "learning_rate": 3.6251682961866506e-05, + "loss": 2.0668, + "step": 17910 + }, + { + "epoch": 1.2020737559142312, + "grad_norm": 3.9769763946533203, + "learning_rate": 3.6241233911656755e-05, + "loss": 1.8821, + "step": 17912 + }, + { + "epoch": 1.202207979598, + "grad_norm": 4.105252265930176, + "learning_rate": 3.6230785511521656e-05, + "loss": 1.852, + "step": 17914 + }, + { + "epoch": 1.202342203281769, + "grad_norm": 3.982512950897217, + "learning_rate": 3.6220337761954855e-05, + "loss": 1.8081, + "step": 17916 + }, + { + "epoch": 1.2024764269655381, + "grad_norm": 4.255679130554199, + "learning_rate": 3.620989066345001e-05, + "loss": 2.2855, + "step": 17918 + }, + { + "epoch": 1.202610650649307, + "grad_norm": 4.210519790649414, + "learning_rate": 3.61994442165007e-05, + "loss": 1.9993, + "step": 17920 + }, + { + "epoch": 1.202744874333076, + "grad_norm": 3.774225950241089, + "learning_rate": 3.618899842160053e-05, + "loss": 2.0346, + "step": 17922 + }, + { + "epoch": 1.2028790980168451, + "grad_norm": 3.892148017883301, + "learning_rate": 3.617855327924302e-05, + "loss": 2.0314, + "step": 17924 + }, + { + "epoch": 1.203013321700614, + "grad_norm": 5.330074310302734, + "learning_rate": 3.616810878992168e-05, + "loss": 2.1124, + "step": 17926 + }, + { + "epoch": 1.203147545384383, + "grad_norm": 3.8220088481903076, + "learning_rate": 3.6157664954130014e-05, + "loss": 1.9165, + "step": 17928 + }, + { + "epoch": 1.203281769068152, + "grad_norm": 4.1689300537109375, + "learning_rate": 3.6147221772361446e-05, + "loss": 1.9958, + "step": 17930 + }, + { + "epoch": 1.203415992751921, + "grad_norm": 4.179872035980225, + "learning_rate": 3.613677924510942e-05, + "loss": 2.0061, + "step": 17932 + }, + { + "epoch": 1.2035502164356902, + "grad_norm": 4.178359031677246, + "learning_rate": 3.612633737286729e-05, + "loss": 2.2341, + "step": 17934 + }, + { + "epoch": 1.203684440119459, + "grad_norm": 3.993006706237793, + "learning_rate": 3.611589615612847e-05, + "loss": 2.008, + "step": 17936 + }, + { + "epoch": 1.203818663803228, + "grad_norm": 4.027849197387695, + "learning_rate": 3.610545559538623e-05, + "loss": 2.0056, + "step": 17938 + }, + { + "epoch": 1.2039528874869971, + "grad_norm": 3.7758467197418213, + "learning_rate": 3.6095015691133904e-05, + "loss": 2.076, + "step": 17940 + }, + { + "epoch": 1.204087111170766, + "grad_norm": 4.437410831451416, + "learning_rate": 3.608457644386474e-05, + "loss": 2.3369, + "step": 17942 + }, + { + "epoch": 1.2042213348545352, + "grad_norm": 4.246561050415039, + "learning_rate": 3.6074137854071976e-05, + "loss": 2.0602, + "step": 17944 + }, + { + "epoch": 1.2043555585383041, + "grad_norm": 3.7463698387145996, + "learning_rate": 3.606369992224882e-05, + "loss": 1.8258, + "step": 17946 + }, + { + "epoch": 1.204489782222073, + "grad_norm": 3.8675503730773926, + "learning_rate": 3.605326264888843e-05, + "loss": 2.0379, + "step": 17948 + }, + { + "epoch": 1.2046240059058422, + "grad_norm": 4.1065239906311035, + "learning_rate": 3.6042826034483956e-05, + "loss": 2.1607, + "step": 17950 + }, + { + "epoch": 1.204758229589611, + "grad_norm": 4.174368858337402, + "learning_rate": 3.603239007952851e-05, + "loss": 2.0849, + "step": 17952 + }, + { + "epoch": 1.20489245327338, + "grad_norm": 3.9324703216552734, + "learning_rate": 3.602195478451516e-05, + "loss": 2.0222, + "step": 17954 + }, + { + "epoch": 1.2050266769571492, + "grad_norm": 3.6077656745910645, + "learning_rate": 3.6011520149936975e-05, + "loss": 1.8751, + "step": 17956 + }, + { + "epoch": 1.205160900640918, + "grad_norm": 3.5644683837890625, + "learning_rate": 3.600108617628692e-05, + "loss": 2.0749, + "step": 17958 + }, + { + "epoch": 1.205295124324687, + "grad_norm": 4.299354553222656, + "learning_rate": 3.599065286405806e-05, + "loss": 2.2314, + "step": 17960 + }, + { + "epoch": 1.2054293480084561, + "grad_norm": 4.044852256774902, + "learning_rate": 3.598022021374327e-05, + "loss": 1.9851, + "step": 17962 + }, + { + "epoch": 1.205563571692225, + "grad_norm": 4.188790798187256, + "learning_rate": 3.596978822583554e-05, + "loss": 1.981, + "step": 17964 + }, + { + "epoch": 1.2056977953759942, + "grad_norm": 4.294765949249268, + "learning_rate": 3.595935690082769e-05, + "loss": 2.0001, + "step": 17966 + }, + { + "epoch": 1.2058320190597631, + "grad_norm": 4.165108680725098, + "learning_rate": 3.594892623921264e-05, + "loss": 2.1294, + "step": 17968 + }, + { + "epoch": 1.205966242743532, + "grad_norm": 4.550543308258057, + "learning_rate": 3.593849624148319e-05, + "loss": 2.2903, + "step": 17970 + }, + { + "epoch": 1.2061004664273012, + "grad_norm": 4.152347564697266, + "learning_rate": 3.5928066908132144e-05, + "loss": 2.0466, + "step": 17972 + }, + { + "epoch": 1.20623469011107, + "grad_norm": 4.103320598602295, + "learning_rate": 3.591763823965226e-05, + "loss": 2.2701, + "step": 17974 + }, + { + "epoch": 1.206368913794839, + "grad_norm": 3.931725263595581, + "learning_rate": 3.590721023653628e-05, + "loss": 2.057, + "step": 17976 + }, + { + "epoch": 1.2065031374786082, + "grad_norm": 3.9148409366607666, + "learning_rate": 3.5896782899276905e-05, + "loss": 1.8936, + "step": 17978 + }, + { + "epoch": 1.206637361162377, + "grad_norm": 8.704084396362305, + "learning_rate": 3.58863562283668e-05, + "loss": 2.0201, + "step": 17980 + }, + { + "epoch": 1.206771584846146, + "grad_norm": 5.236464977264404, + "learning_rate": 3.5875930224298616e-05, + "loss": 1.8988, + "step": 17982 + }, + { + "epoch": 1.2069058085299151, + "grad_norm": 4.261220455169678, + "learning_rate": 3.586550488756496e-05, + "loss": 2.205, + "step": 17984 + }, + { + "epoch": 1.207040032213684, + "grad_norm": 4.162203788757324, + "learning_rate": 3.585508021865838e-05, + "loss": 2.152, + "step": 17986 + }, + { + "epoch": 1.2071742558974532, + "grad_norm": 3.8919215202331543, + "learning_rate": 3.584465621807148e-05, + "loss": 2.0579, + "step": 17988 + }, + { + "epoch": 1.2073084795812221, + "grad_norm": 3.9070892333984375, + "learning_rate": 3.58342328862967e-05, + "loss": 2.0779, + "step": 17990 + }, + { + "epoch": 1.207442703264991, + "grad_norm": 3.8291478157043457, + "learning_rate": 3.58238102238266e-05, + "loss": 2.18, + "step": 17992 + }, + { + "epoch": 1.2075769269487602, + "grad_norm": 4.049379348754883, + "learning_rate": 3.5813388231153576e-05, + "loss": 2.0384, + "step": 17994 + }, + { + "epoch": 1.207711150632529, + "grad_norm": 3.888669490814209, + "learning_rate": 3.5802966908770044e-05, + "loss": 1.7942, + "step": 17996 + }, + { + "epoch": 1.207845374316298, + "grad_norm": 4.030262470245361, + "learning_rate": 3.579254625716844e-05, + "loss": 2.0044, + "step": 17998 + }, + { + "epoch": 1.2079795980000672, + "grad_norm": 4.60930061340332, + "learning_rate": 3.5782126276841045e-05, + "loss": 2.1648, + "step": 18000 + }, + { + "epoch": 1.208113821683836, + "grad_norm": 4.194060802459717, + "learning_rate": 3.577170696828026e-05, + "loss": 2.0519, + "step": 18002 + }, + { + "epoch": 1.208248045367605, + "grad_norm": 4.078901767730713, + "learning_rate": 3.576128833197832e-05, + "loss": 1.9171, + "step": 18004 + }, + { + "epoch": 1.2083822690513741, + "grad_norm": 4.42930793762207, + "learning_rate": 3.5750870368427526e-05, + "loss": 2.0521, + "step": 18006 + }, + { + "epoch": 1.208516492735143, + "grad_norm": 3.8432860374450684, + "learning_rate": 3.574045307812007e-05, + "loss": 1.9326, + "step": 18008 + }, + { + "epoch": 1.2086507164189122, + "grad_norm": 3.7186338901519775, + "learning_rate": 3.573003646154819e-05, + "loss": 2.0329, + "step": 18010 + }, + { + "epoch": 1.2087849401026811, + "grad_norm": 3.7507078647613525, + "learning_rate": 3.5719620519204e-05, + "loss": 1.8154, + "step": 18012 + }, + { + "epoch": 1.20891916378645, + "grad_norm": 4.605996131896973, + "learning_rate": 3.570920525157968e-05, + "loss": 2.305, + "step": 18014 + }, + { + "epoch": 1.2090533874702192, + "grad_norm": 4.192234516143799, + "learning_rate": 3.569879065916729e-05, + "loss": 2.3569, + "step": 18016 + }, + { + "epoch": 1.209187611153988, + "grad_norm": 3.785813331604004, + "learning_rate": 3.568837674245894e-05, + "loss": 1.9504, + "step": 18018 + }, + { + "epoch": 1.2093218348377572, + "grad_norm": 3.8007378578186035, + "learning_rate": 3.5677963501946634e-05, + "loss": 2.2357, + "step": 18020 + }, + { + "epoch": 1.2094560585215262, + "grad_norm": 4.143321990966797, + "learning_rate": 3.5667550938122396e-05, + "loss": 2.0884, + "step": 18022 + }, + { + "epoch": 1.209590282205295, + "grad_norm": 4.011829376220703, + "learning_rate": 3.565713905147817e-05, + "loss": 2.2101, + "step": 18024 + }, + { + "epoch": 1.2097245058890642, + "grad_norm": 4.020024299621582, + "learning_rate": 3.5646727842505955e-05, + "loss": 2.0597, + "step": 18026 + }, + { + "epoch": 1.2098587295728331, + "grad_norm": 4.461579322814941, + "learning_rate": 3.56363173116976e-05, + "loss": 2.0662, + "step": 18028 + }, + { + "epoch": 1.209992953256602, + "grad_norm": 3.910327434539795, + "learning_rate": 3.562590745954501e-05, + "loss": 2.1366, + "step": 18030 + }, + { + "epoch": 1.2101271769403712, + "grad_norm": 3.89801025390625, + "learning_rate": 3.561549828654003e-05, + "loss": 2.2151, + "step": 18032 + }, + { + "epoch": 1.2102614006241401, + "grad_norm": 4.039394378662109, + "learning_rate": 3.5605089793174485e-05, + "loss": 2.1576, + "step": 18034 + }, + { + "epoch": 1.210395624307909, + "grad_norm": 4.474233150482178, + "learning_rate": 3.559468197994012e-05, + "loss": 1.9171, + "step": 18036 + }, + { + "epoch": 1.2105298479916782, + "grad_norm": 4.317897796630859, + "learning_rate": 3.558427484732872e-05, + "loss": 2.0186, + "step": 18038 + }, + { + "epoch": 1.210664071675447, + "grad_norm": 4.5259108543396, + "learning_rate": 3.5573868395831975e-05, + "loss": 1.898, + "step": 18040 + }, + { + "epoch": 1.2107982953592162, + "grad_norm": 3.964507579803467, + "learning_rate": 3.556346262594159e-05, + "loss": 2.0049, + "step": 18042 + }, + { + "epoch": 1.2109325190429852, + "grad_norm": 3.7565600872039795, + "learning_rate": 3.5553057538149196e-05, + "loss": 2.2189, + "step": 18044 + }, + { + "epoch": 1.211066742726754, + "grad_norm": 3.959792375564575, + "learning_rate": 3.5542653132946435e-05, + "loss": 2.1585, + "step": 18046 + }, + { + "epoch": 1.2112009664105232, + "grad_norm": 4.667056083679199, + "learning_rate": 3.553224941082487e-05, + "loss": 2.3554, + "step": 18048 + }, + { + "epoch": 1.2113351900942921, + "grad_norm": 4.3827996253967285, + "learning_rate": 3.552184637227609e-05, + "loss": 2.0502, + "step": 18050 + }, + { + "epoch": 1.211469413778061, + "grad_norm": 3.7344279289245605, + "learning_rate": 3.5511444017791575e-05, + "loss": 2.2981, + "step": 18052 + }, + { + "epoch": 1.2116036374618302, + "grad_norm": 4.056401252746582, + "learning_rate": 3.550104234786287e-05, + "loss": 2.0938, + "step": 18054 + }, + { + "epoch": 1.2117378611455991, + "grad_norm": 4.38290548324585, + "learning_rate": 3.5490641362981374e-05, + "loss": 1.9953, + "step": 18056 + }, + { + "epoch": 1.211872084829368, + "grad_norm": 4.847411632537842, + "learning_rate": 3.5480241063638566e-05, + "loss": 2.1685, + "step": 18058 + }, + { + "epoch": 1.2120063085131372, + "grad_norm": 4.009082317352295, + "learning_rate": 3.546984145032582e-05, + "loss": 1.6907, + "step": 18060 + }, + { + "epoch": 1.212140532196906, + "grad_norm": 3.831857919692993, + "learning_rate": 3.545944252353446e-05, + "loss": 1.9424, + "step": 18062 + }, + { + "epoch": 1.2122747558806752, + "grad_norm": 4.002022743225098, + "learning_rate": 3.5449044283755874e-05, + "loss": 1.9879, + "step": 18064 + }, + { + "epoch": 1.2124089795644442, + "grad_norm": 4.312160968780518, + "learning_rate": 3.543864673148132e-05, + "loss": 2.0506, + "step": 18066 + }, + { + "epoch": 1.212543203248213, + "grad_norm": 4.112171649932861, + "learning_rate": 3.542824986720209e-05, + "loss": 2.0042, + "step": 18068 + }, + { + "epoch": 1.2126774269319822, + "grad_norm": 3.9992940425872803, + "learning_rate": 3.541785369140938e-05, + "loss": 2.1151, + "step": 18070 + }, + { + "epoch": 1.2128116506157511, + "grad_norm": 4.119078159332275, + "learning_rate": 3.5407458204594426e-05, + "loss": 2.3344, + "step": 18072 + }, + { + "epoch": 1.21294587429952, + "grad_norm": 4.34067964553833, + "learning_rate": 3.5397063407248365e-05, + "loss": 2.1675, + "step": 18074 + }, + { + "epoch": 1.2130800979832892, + "grad_norm": 4.134885787963867, + "learning_rate": 3.5386669299862355e-05, + "loss": 2.0882, + "step": 18076 + }, + { + "epoch": 1.2132143216670581, + "grad_norm": 4.954759120941162, + "learning_rate": 3.5376275882927466e-05, + "loss": 2.116, + "step": 18078 + }, + { + "epoch": 1.213348545350827, + "grad_norm": 4.430233478546143, + "learning_rate": 3.5365883156934795e-05, + "loss": 1.8914, + "step": 18080 + }, + { + "epoch": 1.2134827690345962, + "grad_norm": 3.8565218448638916, + "learning_rate": 3.535549112237537e-05, + "loss": 2.031, + "step": 18082 + }, + { + "epoch": 1.213616992718365, + "grad_norm": 5.4209370613098145, + "learning_rate": 3.534509977974019e-05, + "loss": 1.9859, + "step": 18084 + }, + { + "epoch": 1.2137512164021342, + "grad_norm": 3.9982283115386963, + "learning_rate": 3.533470912952022e-05, + "loss": 2.1124, + "step": 18086 + }, + { + "epoch": 1.2138854400859032, + "grad_norm": 3.8947341442108154, + "learning_rate": 3.532431917220642e-05, + "loss": 1.8169, + "step": 18088 + }, + { + "epoch": 1.214019663769672, + "grad_norm": 3.9325273036956787, + "learning_rate": 3.5313929908289665e-05, + "loss": 2.3713, + "step": 18090 + }, + { + "epoch": 1.2141538874534412, + "grad_norm": 3.6372287273406982, + "learning_rate": 3.5303541338260856e-05, + "loss": 1.9705, + "step": 18092 + }, + { + "epoch": 1.2142881111372101, + "grad_norm": 3.865485429763794, + "learning_rate": 3.529315346261081e-05, + "loss": 2.0219, + "step": 18094 + }, + { + "epoch": 1.2144223348209793, + "grad_norm": 3.6960272789001465, + "learning_rate": 3.528276628183035e-05, + "loss": 1.681, + "step": 18096 + }, + { + "epoch": 1.2145565585047482, + "grad_norm": 4.053394794464111, + "learning_rate": 3.527237979641024e-05, + "loss": 2.048, + "step": 18098 + }, + { + "epoch": 1.2146907821885171, + "grad_norm": 4.067917823791504, + "learning_rate": 3.526199400684124e-05, + "loss": 1.9207, + "step": 18100 + }, + { + "epoch": 1.2148250058722863, + "grad_norm": 4.224887371063232, + "learning_rate": 3.5251608913614014e-05, + "loss": 2.1111, + "step": 18102 + }, + { + "epoch": 1.2149592295560552, + "grad_norm": 4.055767059326172, + "learning_rate": 3.5241224517219286e-05, + "loss": 2.0324, + "step": 18104 + }, + { + "epoch": 1.215093453239824, + "grad_norm": 3.8289730548858643, + "learning_rate": 3.523084081814767e-05, + "loss": 2.1194, + "step": 18106 + }, + { + "epoch": 1.2152276769235932, + "grad_norm": 4.342119216918945, + "learning_rate": 3.5220457816889784e-05, + "loss": 1.8168, + "step": 18108 + }, + { + "epoch": 1.2153619006073622, + "grad_norm": 5.2623701095581055, + "learning_rate": 3.5210075513936206e-05, + "loss": 2.0413, + "step": 18110 + }, + { + "epoch": 1.215496124291131, + "grad_norm": 3.536837577819824, + "learning_rate": 3.519969390977748e-05, + "loss": 1.7045, + "step": 18112 + }, + { + "epoch": 1.2156303479749002, + "grad_norm": 4.168348789215088, + "learning_rate": 3.518931300490409e-05, + "loss": 2.0654, + "step": 18114 + }, + { + "epoch": 1.2157645716586691, + "grad_norm": 4.146714210510254, + "learning_rate": 3.517893279980656e-05, + "loss": 2.1153, + "step": 18116 + }, + { + "epoch": 1.2158987953424383, + "grad_norm": 4.237437725067139, + "learning_rate": 3.516855329497529e-05, + "loss": 1.9211, + "step": 18118 + }, + { + "epoch": 1.2160330190262072, + "grad_norm": 3.7464513778686523, + "learning_rate": 3.5158174490900744e-05, + "loss": 1.9448, + "step": 18120 + }, + { + "epoch": 1.2161672427099761, + "grad_norm": 4.156373023986816, + "learning_rate": 3.514779638807324e-05, + "loss": 2.1674, + "step": 18122 + }, + { + "epoch": 1.2163014663937453, + "grad_norm": 4.404965400695801, + "learning_rate": 3.513741898698317e-05, + "loss": 1.8468, + "step": 18124 + }, + { + "epoch": 1.2164356900775142, + "grad_norm": 3.9857892990112305, + "learning_rate": 3.512704228812082e-05, + "loss": 1.8907, + "step": 18126 + }, + { + "epoch": 1.216569913761283, + "grad_norm": 4.084505081176758, + "learning_rate": 3.5116666291976474e-05, + "loss": 2.1935, + "step": 18128 + }, + { + "epoch": 1.2167041374450522, + "grad_norm": 3.6941978931427, + "learning_rate": 3.510629099904038e-05, + "loss": 1.8854, + "step": 18130 + }, + { + "epoch": 1.2168383611288212, + "grad_norm": 3.756854295730591, + "learning_rate": 3.509591640980274e-05, + "loss": 2.0146, + "step": 18132 + }, + { + "epoch": 1.21697258481259, + "grad_norm": 4.4545512199401855, + "learning_rate": 3.508554252475376e-05, + "loss": 2.1488, + "step": 18134 + }, + { + "epoch": 1.2171068084963592, + "grad_norm": 3.5788888931274414, + "learning_rate": 3.507516934438355e-05, + "loss": 1.8482, + "step": 18136 + }, + { + "epoch": 1.2172410321801281, + "grad_norm": 3.889051914215088, + "learning_rate": 3.506479686918226e-05, + "loss": 2.0908, + "step": 18138 + }, + { + "epoch": 1.2173752558638973, + "grad_norm": 3.914647340774536, + "learning_rate": 3.505442509963993e-05, + "loss": 1.9074, + "step": 18140 + }, + { + "epoch": 1.2175094795476662, + "grad_norm": 3.7343263626098633, + "learning_rate": 3.504405403624664e-05, + "loss": 1.9755, + "step": 18142 + }, + { + "epoch": 1.2176437032314351, + "grad_norm": 4.1946539878845215, + "learning_rate": 3.503368367949237e-05, + "loss": 2.1313, + "step": 18144 + }, + { + "epoch": 1.2177779269152043, + "grad_norm": 4.184709548950195, + "learning_rate": 3.502331402986713e-05, + "loss": 2.0412, + "step": 18146 + }, + { + "epoch": 1.2179121505989732, + "grad_norm": 4.269425392150879, + "learning_rate": 3.501294508786084e-05, + "loss": 1.888, + "step": 18148 + }, + { + "epoch": 1.218046374282742, + "grad_norm": 4.188792705535889, + "learning_rate": 3.500257685396344e-05, + "loss": 2.2126, + "step": 18150 + }, + { + "epoch": 1.2181805979665112, + "grad_norm": 4.429081916809082, + "learning_rate": 3.4992209328664774e-05, + "loss": 2.1266, + "step": 18152 + }, + { + "epoch": 1.2183148216502802, + "grad_norm": 3.890646457672119, + "learning_rate": 3.4981842512454716e-05, + "loss": 2.0224, + "step": 18154 + }, + { + "epoch": 1.218449045334049, + "grad_norm": 4.262169361114502, + "learning_rate": 3.497147640582306e-05, + "loss": 1.9115, + "step": 18156 + }, + { + "epoch": 1.2185832690178182, + "grad_norm": 4.745761394500732, + "learning_rate": 3.4961111009259605e-05, + "loss": 2.2055, + "step": 18158 + }, + { + "epoch": 1.2187174927015871, + "grad_norm": 3.69118595123291, + "learning_rate": 3.495074632325407e-05, + "loss": 2.2088, + "step": 18160 + }, + { + "epoch": 1.2188517163853563, + "grad_norm": 4.56364631652832, + "learning_rate": 3.4940382348296186e-05, + "loss": 2.3146, + "step": 18162 + }, + { + "epoch": 1.2189859400691252, + "grad_norm": 4.006748199462891, + "learning_rate": 3.493001908487561e-05, + "loss": 2.0429, + "step": 18164 + }, + { + "epoch": 1.2191201637528941, + "grad_norm": 4.066694736480713, + "learning_rate": 3.491965653348202e-05, + "loss": 2.1184, + "step": 18166 + }, + { + "epoch": 1.2192543874366633, + "grad_norm": 4.197920799255371, + "learning_rate": 3.490929469460499e-05, + "loss": 2.0634, + "step": 18168 + }, + { + "epoch": 1.2193886111204322, + "grad_norm": 4.530567646026611, + "learning_rate": 3.4898933568734115e-05, + "loss": 1.9795, + "step": 18170 + }, + { + "epoch": 1.2195228348042013, + "grad_norm": 3.5923454761505127, + "learning_rate": 3.488857315635893e-05, + "loss": 1.7609, + "step": 18172 + }, + { + "epoch": 1.2196570584879702, + "grad_norm": 3.8550267219543457, + "learning_rate": 3.487821345796895e-05, + "loss": 1.8506, + "step": 18174 + }, + { + "epoch": 1.2197912821717392, + "grad_norm": 4.29258394241333, + "learning_rate": 3.4867854474053644e-05, + "loss": 1.7146, + "step": 18176 + }, + { + "epoch": 1.2199255058555083, + "grad_norm": 4.217910289764404, + "learning_rate": 3.4857496205102474e-05, + "loss": 2.0065, + "step": 18178 + }, + { + "epoch": 1.2200597295392772, + "grad_norm": 4.14438009262085, + "learning_rate": 3.4847138651604805e-05, + "loss": 2.1646, + "step": 18180 + }, + { + "epoch": 1.2201939532230461, + "grad_norm": 3.339543104171753, + "learning_rate": 3.483678181405006e-05, + "loss": 1.9466, + "step": 18182 + }, + { + "epoch": 1.2203281769068153, + "grad_norm": 4.118716239929199, + "learning_rate": 3.4826425692927545e-05, + "loss": 1.9982, + "step": 18184 + }, + { + "epoch": 1.2204624005905842, + "grad_norm": 3.7889392375946045, + "learning_rate": 3.481607028872659e-05, + "loss": 2.0821, + "step": 18186 + }, + { + "epoch": 1.2205966242743531, + "grad_norm": 4.465428829193115, + "learning_rate": 3.480571560193645e-05, + "loss": 2.3137, + "step": 18188 + }, + { + "epoch": 1.2207308479581223, + "grad_norm": 5.538318634033203, + "learning_rate": 3.479536163304637e-05, + "loss": 1.7924, + "step": 18190 + }, + { + "epoch": 1.2208650716418912, + "grad_norm": 3.715815544128418, + "learning_rate": 3.4785008382545546e-05, + "loss": 2.0513, + "step": 18192 + }, + { + "epoch": 1.2209992953256603, + "grad_norm": 5.23396110534668, + "learning_rate": 3.4774655850923174e-05, + "loss": 2.0741, + "step": 18194 + }, + { + "epoch": 1.2211335190094292, + "grad_norm": 4.059267044067383, + "learning_rate": 3.476430403866836e-05, + "loss": 2.0796, + "step": 18196 + }, + { + "epoch": 1.2212677426931982, + "grad_norm": 3.7343850135803223, + "learning_rate": 3.475395294627023e-05, + "loss": 1.8921, + "step": 18198 + }, + { + "epoch": 1.2214019663769673, + "grad_norm": 3.9103987216949463, + "learning_rate": 3.474360257421784e-05, + "loss": 1.736, + "step": 18200 + }, + { + "epoch": 1.2215361900607362, + "grad_norm": 4.061290264129639, + "learning_rate": 3.4733252923000226e-05, + "loss": 1.904, + "step": 18202 + }, + { + "epoch": 1.2216704137445051, + "grad_norm": 3.739450693130493, + "learning_rate": 3.4722903993106395e-05, + "loss": 2.1447, + "step": 18204 + }, + { + "epoch": 1.2218046374282743, + "grad_norm": 4.405920028686523, + "learning_rate": 3.4712555785025305e-05, + "loss": 2.0736, + "step": 18206 + }, + { + "epoch": 1.2219388611120432, + "grad_norm": 3.752000093460083, + "learning_rate": 3.4702208299245895e-05, + "loss": 1.877, + "step": 18208 + }, + { + "epoch": 1.2220730847958121, + "grad_norm": 3.680269956588745, + "learning_rate": 3.469186153625704e-05, + "loss": 1.997, + "step": 18210 + }, + { + "epoch": 1.2222073084795813, + "grad_norm": 3.911953926086426, + "learning_rate": 3.468151549654766e-05, + "loss": 1.9221, + "step": 18212 + }, + { + "epoch": 1.2223415321633502, + "grad_norm": 3.9955825805664062, + "learning_rate": 3.467117018060652e-05, + "loss": 1.9498, + "step": 18214 + }, + { + "epoch": 1.2224757558471193, + "grad_norm": 3.978137254714966, + "learning_rate": 3.466082558892247e-05, + "loss": 2.0687, + "step": 18216 + }, + { + "epoch": 1.2226099795308882, + "grad_norm": 3.8966643810272217, + "learning_rate": 3.465048172198423e-05, + "loss": 1.8804, + "step": 18218 + }, + { + "epoch": 1.2227442032146572, + "grad_norm": 3.6742348670959473, + "learning_rate": 3.464013858028056e-05, + "loss": 2.0452, + "step": 18220 + }, + { + "epoch": 1.2228784268984263, + "grad_norm": 3.7560653686523438, + "learning_rate": 3.462979616430012e-05, + "loss": 2.041, + "step": 18222 + }, + { + "epoch": 1.2230126505821952, + "grad_norm": 4.104758262634277, + "learning_rate": 3.461945447453161e-05, + "loss": 1.989, + "step": 18224 + }, + { + "epoch": 1.2231468742659641, + "grad_norm": 4.460013389587402, + "learning_rate": 3.4609113511463616e-05, + "loss": 1.9794, + "step": 18226 + }, + { + "epoch": 1.2232810979497333, + "grad_norm": 6.497433185577393, + "learning_rate": 3.459877327558475e-05, + "loss": 2.1075, + "step": 18228 + }, + { + "epoch": 1.2234153216335022, + "grad_norm": 3.9827206134796143, + "learning_rate": 3.458843376738355e-05, + "loss": 2.2625, + "step": 18230 + }, + { + "epoch": 1.2235495453172711, + "grad_norm": 4.415520668029785, + "learning_rate": 3.457809498734857e-05, + "loss": 1.925, + "step": 18232 + }, + { + "epoch": 1.2236837690010403, + "grad_norm": 3.321519374847412, + "learning_rate": 3.456775693596825e-05, + "loss": 1.9535, + "step": 18234 + }, + { + "epoch": 1.2238179926848092, + "grad_norm": 3.7553937435150146, + "learning_rate": 3.455741961373109e-05, + "loss": 2.1054, + "step": 18236 + }, + { + "epoch": 1.2239522163685783, + "grad_norm": 7.965523719787598, + "learning_rate": 3.454708302112547e-05, + "loss": 2.1771, + "step": 18238 + }, + { + "epoch": 1.2240864400523472, + "grad_norm": 3.8028388023376465, + "learning_rate": 3.453674715863982e-05, + "loss": 1.9463, + "step": 18240 + }, + { + "epoch": 1.2242206637361162, + "grad_norm": 3.9713926315307617, + "learning_rate": 3.452641202676242e-05, + "loss": 1.984, + "step": 18242 + }, + { + "epoch": 1.2243548874198853, + "grad_norm": 4.4441986083984375, + "learning_rate": 3.4516077625981644e-05, + "loss": 1.9905, + "step": 18244 + }, + { + "epoch": 1.2244891111036542, + "grad_norm": 3.944491386413574, + "learning_rate": 3.450574395678575e-05, + "loss": 1.935, + "step": 18246 + }, + { + "epoch": 1.2246233347874234, + "grad_norm": 4.191484451293945, + "learning_rate": 3.4495411019662994e-05, + "loss": 1.9548, + "step": 18248 + }, + { + "epoch": 1.2247575584711923, + "grad_norm": 4.321385383605957, + "learning_rate": 3.4485078815101566e-05, + "loss": 2.1066, + "step": 18250 + }, + { + "epoch": 1.2248917821549612, + "grad_norm": 4.35685396194458, + "learning_rate": 3.447474734358967e-05, + "loss": 1.6806, + "step": 18252 + }, + { + "epoch": 1.2250260058387303, + "grad_norm": 3.7657625675201416, + "learning_rate": 3.4464416605615424e-05, + "loss": 2.0029, + "step": 18254 + }, + { + "epoch": 1.2251602295224993, + "grad_norm": 4.35120964050293, + "learning_rate": 3.4454086601666945e-05, + "loss": 2.1031, + "step": 18256 + }, + { + "epoch": 1.2252944532062682, + "grad_norm": 3.9417500495910645, + "learning_rate": 3.44437573322323e-05, + "loss": 1.802, + "step": 18258 + }, + { + "epoch": 1.2254286768900373, + "grad_norm": 3.988286256790161, + "learning_rate": 3.443342879779954e-05, + "loss": 2.1131, + "step": 18260 + }, + { + "epoch": 1.2255629005738062, + "grad_norm": 4.423853874206543, + "learning_rate": 3.442310099885665e-05, + "loss": 1.9415, + "step": 18262 + }, + { + "epoch": 1.2256971242575752, + "grad_norm": 3.942753314971924, + "learning_rate": 3.441277393589162e-05, + "loss": 2.1513, + "step": 18264 + }, + { + "epoch": 1.2258313479413443, + "grad_norm": 3.604342460632324, + "learning_rate": 3.440244760939236e-05, + "loss": 1.8637, + "step": 18266 + }, + { + "epoch": 1.2259655716251132, + "grad_norm": 2.979645252227783, + "learning_rate": 3.439212201984679e-05, + "loss": 1.9994, + "step": 18268 + }, + { + "epoch": 1.2260997953088824, + "grad_norm": 4.1305108070373535, + "learning_rate": 3.438179716774275e-05, + "loss": 2.2356, + "step": 18270 + }, + { + "epoch": 1.2262340189926513, + "grad_norm": 3.774923086166382, + "learning_rate": 3.437147305356807e-05, + "loss": 1.9463, + "step": 18272 + }, + { + "epoch": 1.2263682426764202, + "grad_norm": 3.574517250061035, + "learning_rate": 3.436114967781059e-05, + "loss": 2.2487, + "step": 18274 + }, + { + "epoch": 1.2265024663601893, + "grad_norm": 3.652463436126709, + "learning_rate": 3.4350827040958e-05, + "loss": 1.9794, + "step": 18276 + }, + { + "epoch": 1.2266366900439583, + "grad_norm": 4.927786827087402, + "learning_rate": 3.434050514349809e-05, + "loss": 2.2867, + "step": 18278 + }, + { + "epoch": 1.2267709137277272, + "grad_norm": 4.197466850280762, + "learning_rate": 3.43301839859185e-05, + "loss": 1.8763, + "step": 18280 + }, + { + "epoch": 1.2269051374114963, + "grad_norm": 3.9132347106933594, + "learning_rate": 3.431986356870691e-05, + "loss": 1.9508, + "step": 18282 + }, + { + "epoch": 1.2270393610952652, + "grad_norm": 3.882768154144287, + "learning_rate": 3.430954389235092e-05, + "loss": 1.9432, + "step": 18284 + }, + { + "epoch": 1.2271735847790342, + "grad_norm": 4.072657108306885, + "learning_rate": 3.429922495733815e-05, + "loss": 2.1479, + "step": 18286 + }, + { + "epoch": 1.2273078084628033, + "grad_norm": 3.5437886714935303, + "learning_rate": 3.4288906764156106e-05, + "loss": 1.9199, + "step": 18288 + }, + { + "epoch": 1.2274420321465722, + "grad_norm": 4.936519622802734, + "learning_rate": 3.4278589313292345e-05, + "loss": 2.1664, + "step": 18290 + }, + { + "epoch": 1.2275762558303414, + "grad_norm": 4.134695053100586, + "learning_rate": 3.4268272605234296e-05, + "loss": 1.9755, + "step": 18292 + }, + { + "epoch": 1.2277104795141103, + "grad_norm": 3.833134889602661, + "learning_rate": 3.425795664046946e-05, + "loss": 2.1738, + "step": 18294 + }, + { + "epoch": 1.2278447031978792, + "grad_norm": 4.141845226287842, + "learning_rate": 3.42476414194852e-05, + "loss": 1.9796, + "step": 18296 + }, + { + "epoch": 1.2279789268816483, + "grad_norm": 4.116926670074463, + "learning_rate": 3.423732694276891e-05, + "loss": 2.1996, + "step": 18298 + }, + { + "epoch": 1.2281131505654173, + "grad_norm": 4.313630104064941, + "learning_rate": 3.422701321080791e-05, + "loss": 2.1461, + "step": 18300 + }, + { + "epoch": 1.2282473742491862, + "grad_norm": 3.8677313327789307, + "learning_rate": 3.421670022408956e-05, + "loss": 1.9506, + "step": 18302 + }, + { + "epoch": 1.2283815979329553, + "grad_norm": 4.369185447692871, + "learning_rate": 3.420638798310105e-05, + "loss": 1.9857, + "step": 18304 + }, + { + "epoch": 1.2285158216167242, + "grad_norm": 4.587501525878906, + "learning_rate": 3.419607648832968e-05, + "loss": 2.1618, + "step": 18306 + }, + { + "epoch": 1.2286500453004932, + "grad_norm": 3.7004878520965576, + "learning_rate": 3.4185765740262586e-05, + "loss": 1.8757, + "step": 18308 + }, + { + "epoch": 1.2287842689842623, + "grad_norm": 4.013951301574707, + "learning_rate": 3.417545573938699e-05, + "loss": 2.126, + "step": 18310 + }, + { + "epoch": 1.2289184926680312, + "grad_norm": 4.177886009216309, + "learning_rate": 3.416514648618998e-05, + "loss": 1.938, + "step": 18312 + }, + { + "epoch": 1.2290527163518004, + "grad_norm": 4.218538761138916, + "learning_rate": 3.415483798115867e-05, + "loss": 1.8631, + "step": 18314 + }, + { + "epoch": 1.2291869400355693, + "grad_norm": 4.301321983337402, + "learning_rate": 3.4144530224780095e-05, + "loss": 2.1603, + "step": 18316 + }, + { + "epoch": 1.2293211637193382, + "grad_norm": 4.001043319702148, + "learning_rate": 3.41342232175413e-05, + "loss": 2.0621, + "step": 18318 + }, + { + "epoch": 1.2294553874031073, + "grad_norm": 4.434661865234375, + "learning_rate": 3.412391695992925e-05, + "loss": 2.0358, + "step": 18320 + }, + { + "epoch": 1.2295896110868763, + "grad_norm": 4.191474437713623, + "learning_rate": 3.4113611452430914e-05, + "loss": 2.0261, + "step": 18322 + }, + { + "epoch": 1.2297238347706454, + "grad_norm": 4.1749491691589355, + "learning_rate": 3.410330669553319e-05, + "loss": 2.0069, + "step": 18324 + }, + { + "epoch": 1.2298580584544143, + "grad_norm": 3.934196710586548, + "learning_rate": 3.409300268972298e-05, + "loss": 1.915, + "step": 18326 + }, + { + "epoch": 1.2299922821381832, + "grad_norm": 4.352377891540527, + "learning_rate": 3.4082699435487084e-05, + "loss": 2.1186, + "step": 18328 + }, + { + "epoch": 1.2301265058219524, + "grad_norm": 3.763185501098633, + "learning_rate": 3.4072396933312365e-05, + "loss": 2.217, + "step": 18330 + }, + { + "epoch": 1.2302607295057213, + "grad_norm": 4.123305320739746, + "learning_rate": 3.406209518368555e-05, + "loss": 2.0822, + "step": 18332 + }, + { + "epoch": 1.2303949531894902, + "grad_norm": 4.773456573486328, + "learning_rate": 3.405179418709342e-05, + "loss": 2.0261, + "step": 18334 + }, + { + "epoch": 1.2305291768732594, + "grad_norm": 3.973651885986328, + "learning_rate": 3.4041493944022655e-05, + "loss": 1.8935, + "step": 18336 + }, + { + "epoch": 1.2306634005570283, + "grad_norm": 4.067750930786133, + "learning_rate": 3.40311944549599e-05, + "loss": 2.0815, + "step": 18338 + }, + { + "epoch": 1.2307976242407972, + "grad_norm": 3.5459952354431152, + "learning_rate": 3.402089572039183e-05, + "loss": 1.8167, + "step": 18340 + }, + { + "epoch": 1.2309318479245663, + "grad_norm": 3.8743655681610107, + "learning_rate": 3.401059774080498e-05, + "loss": 1.8905, + "step": 18342 + }, + { + "epoch": 1.2310660716083353, + "grad_norm": 4.118895053863525, + "learning_rate": 3.4000300516685965e-05, + "loss": 1.999, + "step": 18344 + }, + { + "epoch": 1.2312002952921044, + "grad_norm": 4.11130428314209, + "learning_rate": 3.399000404852129e-05, + "loss": 2.1186, + "step": 18346 + }, + { + "epoch": 1.2313345189758733, + "grad_norm": 4.986591815948486, + "learning_rate": 3.397970833679744e-05, + "loss": 1.8104, + "step": 18348 + }, + { + "epoch": 1.2314687426596422, + "grad_norm": 4.227504253387451, + "learning_rate": 3.396941338200087e-05, + "loss": 1.9657, + "step": 18350 + }, + { + "epoch": 1.2316029663434114, + "grad_norm": 4.50209379196167, + "learning_rate": 3.3959119184618e-05, + "loss": 2.0384, + "step": 18352 + }, + { + "epoch": 1.2317371900271803, + "grad_norm": 3.801914930343628, + "learning_rate": 3.394882574513519e-05, + "loss": 1.7939, + "step": 18354 + }, + { + "epoch": 1.2318714137109492, + "grad_norm": 4.779311656951904, + "learning_rate": 3.393853306403881e-05, + "loss": 2.1828, + "step": 18356 + }, + { + "epoch": 1.2320056373947184, + "grad_norm": 3.5665159225463867, + "learning_rate": 3.392824114181516e-05, + "loss": 2.0653, + "step": 18358 + }, + { + "epoch": 1.2321398610784873, + "grad_norm": 3.9684059619903564, + "learning_rate": 3.391794997895051e-05, + "loss": 1.8662, + "step": 18360 + }, + { + "epoch": 1.2322740847622562, + "grad_norm": 5.40781307220459, + "learning_rate": 3.3907659575931096e-05, + "loss": 1.8556, + "step": 18362 + }, + { + "epoch": 1.2324083084460253, + "grad_norm": 3.932274580001831, + "learning_rate": 3.389736993324314e-05, + "loss": 2.0094, + "step": 18364 + }, + { + "epoch": 1.2325425321297943, + "grad_norm": 3.9871485233306885, + "learning_rate": 3.388708105137276e-05, + "loss": 1.8882, + "step": 18366 + }, + { + "epoch": 1.2326767558135634, + "grad_norm": 3.8814890384674072, + "learning_rate": 3.3876792930806144e-05, + "loss": 2.2392, + "step": 18368 + }, + { + "epoch": 1.2328109794973323, + "grad_norm": 4.3915910720825195, + "learning_rate": 3.3866505572029334e-05, + "loss": 1.8034, + "step": 18370 + }, + { + "epoch": 1.2329452031811012, + "grad_norm": 4.735340595245361, + "learning_rate": 3.3856218975528434e-05, + "loss": 2.2586, + "step": 18372 + }, + { + "epoch": 1.2330794268648704, + "grad_norm": 3.9468464851379395, + "learning_rate": 3.384593314178942e-05, + "loss": 1.9859, + "step": 18374 + }, + { + "epoch": 1.2332136505486393, + "grad_norm": 4.015772342681885, + "learning_rate": 3.383564807129832e-05, + "loss": 2.1512, + "step": 18376 + }, + { + "epoch": 1.2333478742324082, + "grad_norm": 4.0438666343688965, + "learning_rate": 3.382536376454104e-05, + "loss": 2.1863, + "step": 18378 + }, + { + "epoch": 1.2334820979161774, + "grad_norm": 3.8281595706939697, + "learning_rate": 3.3815080222003533e-05, + "loss": 1.9303, + "step": 18380 + }, + { + "epoch": 1.2336163215999463, + "grad_norm": 3.6048502922058105, + "learning_rate": 3.3804797444171654e-05, + "loss": 1.9653, + "step": 18382 + }, + { + "epoch": 1.2337505452837152, + "grad_norm": 4.463003635406494, + "learning_rate": 3.379451543153126e-05, + "loss": 2.1495, + "step": 18384 + }, + { + "epoch": 1.2338847689674843, + "grad_norm": 3.899921178817749, + "learning_rate": 3.378423418456813e-05, + "loss": 2.2849, + "step": 18386 + }, + { + "epoch": 1.2340189926512533, + "grad_norm": 4.021352767944336, + "learning_rate": 3.3773953703768055e-05, + "loss": 1.8489, + "step": 18388 + }, + { + "epoch": 1.2341532163350224, + "grad_norm": 4.329668998718262, + "learning_rate": 3.376367398961674e-05, + "loss": 2.1112, + "step": 18390 + }, + { + "epoch": 1.2342874400187913, + "grad_norm": 3.8848860263824463, + "learning_rate": 3.375339504259994e-05, + "loss": 2.0354, + "step": 18392 + }, + { + "epoch": 1.2344216637025602, + "grad_norm": 4.107925891876221, + "learning_rate": 3.3743116863203236e-05, + "loss": 2.1259, + "step": 18394 + }, + { + "epoch": 1.2345558873863294, + "grad_norm": 3.719836473464966, + "learning_rate": 3.3732839451912335e-05, + "loss": 1.8791, + "step": 18396 + }, + { + "epoch": 1.2346901110700983, + "grad_norm": 4.1223578453063965, + "learning_rate": 3.372256280921274e-05, + "loss": 2.0428, + "step": 18398 + }, + { + "epoch": 1.2348243347538674, + "grad_norm": 4.2668585777282715, + "learning_rate": 3.371228693559007e-05, + "loss": 2.2189, + "step": 18400 + }, + { + "epoch": 1.2349585584376364, + "grad_norm": 3.697678804397583, + "learning_rate": 3.3702011831529805e-05, + "loss": 2.0821, + "step": 18402 + }, + { + "epoch": 1.2350927821214053, + "grad_norm": 4.241527557373047, + "learning_rate": 3.369173749751744e-05, + "loss": 1.9078, + "step": 18404 + }, + { + "epoch": 1.2352270058051744, + "grad_norm": 4.69252347946167, + "learning_rate": 3.3681463934038415e-05, + "loss": 1.9806, + "step": 18406 + }, + { + "epoch": 1.2353612294889433, + "grad_norm": 3.7717361450195312, + "learning_rate": 3.3671191141578114e-05, + "loss": 1.9587, + "step": 18408 + }, + { + "epoch": 1.2354954531727123, + "grad_norm": 4.516866683959961, + "learning_rate": 3.366091912062194e-05, + "loss": 1.7964, + "step": 18410 + }, + { + "epoch": 1.2356296768564814, + "grad_norm": 3.656536102294922, + "learning_rate": 3.365064787165518e-05, + "loss": 1.7874, + "step": 18412 + }, + { + "epoch": 1.2357639005402503, + "grad_norm": 4.588382244110107, + "learning_rate": 3.3640377395163185e-05, + "loss": 2.329, + "step": 18414 + }, + { + "epoch": 1.2358981242240192, + "grad_norm": 4.181541919708252, + "learning_rate": 3.3630107691631174e-05, + "loss": 2.2757, + "step": 18416 + }, + { + "epoch": 1.2360323479077884, + "grad_norm": 4.323781490325928, + "learning_rate": 3.361983876154439e-05, + "loss": 1.995, + "step": 18418 + }, + { + "epoch": 1.2361665715915573, + "grad_norm": 3.923442840576172, + "learning_rate": 3.360957060538801e-05, + "loss": 1.7538, + "step": 18420 + }, + { + "epoch": 1.2363007952753264, + "grad_norm": 3.910665512084961, + "learning_rate": 3.35993032236472e-05, + "loss": 2.1071, + "step": 18422 + }, + { + "epoch": 1.2364350189590954, + "grad_norm": 3.833242654800415, + "learning_rate": 3.358903661680703e-05, + "loss": 1.9011, + "step": 18424 + }, + { + "epoch": 1.2365692426428643, + "grad_norm": 3.892305850982666, + "learning_rate": 3.3578770785352654e-05, + "loss": 1.7595, + "step": 18426 + }, + { + "epoch": 1.2367034663266334, + "grad_norm": 3.854322671890259, + "learning_rate": 3.356850572976903e-05, + "loss": 2.197, + "step": 18428 + }, + { + "epoch": 1.2368376900104023, + "grad_norm": 4.260941028594971, + "learning_rate": 3.3558241450541235e-05, + "loss": 1.9558, + "step": 18430 + }, + { + "epoch": 1.2369719136941713, + "grad_norm": 3.923889398574829, + "learning_rate": 3.354797794815416e-05, + "loss": 2.1443, + "step": 18432 + }, + { + "epoch": 1.2371061373779404, + "grad_norm": 3.796133518218994, + "learning_rate": 3.35377152230928e-05, + "loss": 1.9807, + "step": 18434 + }, + { + "epoch": 1.2372403610617093, + "grad_norm": 4.570171356201172, + "learning_rate": 3.352745327584202e-05, + "loss": 2.0757, + "step": 18436 + }, + { + "epoch": 1.2373745847454782, + "grad_norm": 4.757614612579346, + "learning_rate": 3.3517192106886684e-05, + "loss": 2.1839, + "step": 18438 + }, + { + "epoch": 1.2375088084292474, + "grad_norm": 4.415244102478027, + "learning_rate": 3.3506931716711596e-05, + "loss": 1.8785, + "step": 18440 + }, + { + "epoch": 1.2376430321130163, + "grad_norm": 4.433703422546387, + "learning_rate": 3.3496672105801575e-05, + "loss": 1.9621, + "step": 18442 + }, + { + "epoch": 1.2377772557967854, + "grad_norm": 4.34056282043457, + "learning_rate": 3.3486413274641327e-05, + "loss": 2.0377, + "step": 18444 + }, + { + "epoch": 1.2379114794805544, + "grad_norm": 4.311459064483643, + "learning_rate": 3.347615522371559e-05, + "loss": 2.0733, + "step": 18446 + }, + { + "epoch": 1.2380457031643233, + "grad_norm": 4.165348052978516, + "learning_rate": 3.3465897953509006e-05, + "loss": 2.0116, + "step": 18448 + }, + { + "epoch": 1.2381799268480924, + "grad_norm": 4.438968658447266, + "learning_rate": 3.345564146450625e-05, + "loss": 2.0943, + "step": 18450 + }, + { + "epoch": 1.2383141505318613, + "grad_norm": 4.779866695404053, + "learning_rate": 3.3445385757191885e-05, + "loss": 2.0404, + "step": 18452 + }, + { + "epoch": 1.2384483742156303, + "grad_norm": 5.1436991691589355, + "learning_rate": 3.3435130832050517e-05, + "loss": 2.1649, + "step": 18454 + }, + { + "epoch": 1.2385825978993994, + "grad_norm": 3.7376468181610107, + "learning_rate": 3.342487668956661e-05, + "loss": 2.0558, + "step": 18456 + }, + { + "epoch": 1.2387168215831683, + "grad_norm": 3.9314863681793213, + "learning_rate": 3.341462333022472e-05, + "loss": 2.1972, + "step": 18458 + }, + { + "epoch": 1.2388510452669372, + "grad_norm": 4.478096008300781, + "learning_rate": 3.340437075450923e-05, + "loss": 2.069, + "step": 18460 + }, + { + "epoch": 1.2389852689507064, + "grad_norm": 4.483206272125244, + "learning_rate": 3.339411896290462e-05, + "loss": 2.2364, + "step": 18462 + }, + { + "epoch": 1.2391194926344753, + "grad_norm": 4.687709808349609, + "learning_rate": 3.338386795589521e-05, + "loss": 2.2439, + "step": 18464 + }, + { + "epoch": 1.2392537163182444, + "grad_norm": 6.281485557556152, + "learning_rate": 3.337361773396538e-05, + "loss": 1.9218, + "step": 18466 + }, + { + "epoch": 1.2393879400020134, + "grad_norm": 4.098294258117676, + "learning_rate": 3.336336829759941e-05, + "loss": 2.0047, + "step": 18468 + }, + { + "epoch": 1.2395221636857823, + "grad_norm": 4.506698131561279, + "learning_rate": 3.3353119647281575e-05, + "loss": 1.9427, + "step": 18470 + }, + { + "epoch": 1.2396563873695514, + "grad_norm": 4.033741474151611, + "learning_rate": 3.334287178349611e-05, + "loss": 2.0229, + "step": 18472 + }, + { + "epoch": 1.2397906110533203, + "grad_norm": 3.827202081680298, + "learning_rate": 3.333262470672719e-05, + "loss": 1.8195, + "step": 18474 + }, + { + "epoch": 1.2399248347370895, + "grad_norm": 4.571023464202881, + "learning_rate": 3.332237841745898e-05, + "loss": 2.0479, + "step": 18476 + }, + { + "epoch": 1.2400590584208584, + "grad_norm": 4.480169773101807, + "learning_rate": 3.3312132916175586e-05, + "loss": 1.8918, + "step": 18478 + }, + { + "epoch": 1.2401932821046273, + "grad_norm": 4.257112979888916, + "learning_rate": 3.330188820336111e-05, + "loss": 2.3562, + "step": 18480 + }, + { + "epoch": 1.2403275057883965, + "grad_norm": 4.480015277862549, + "learning_rate": 3.329164427949957e-05, + "loss": 2.0511, + "step": 18482 + }, + { + "epoch": 1.2404617294721654, + "grad_norm": 4.046920299530029, + "learning_rate": 3.3281401145075e-05, + "loss": 2.3952, + "step": 18484 + }, + { + "epoch": 1.2405959531559343, + "grad_norm": 3.840654134750366, + "learning_rate": 3.3271158800571326e-05, + "loss": 2.3648, + "step": 18486 + }, + { + "epoch": 1.2407301768397034, + "grad_norm": 4.4285783767700195, + "learning_rate": 3.3260917246472525e-05, + "loss": 2.205, + "step": 18488 + }, + { + "epoch": 1.2408644005234724, + "grad_norm": 4.857621192932129, + "learning_rate": 3.325067648326244e-05, + "loss": 2.103, + "step": 18490 + }, + { + "epoch": 1.2409986242072413, + "grad_norm": 4.3387956619262695, + "learning_rate": 3.324043651142499e-05, + "loss": 1.9056, + "step": 18492 + }, + { + "epoch": 1.2411328478910104, + "grad_norm": 3.9984729290008545, + "learning_rate": 3.323019733144392e-05, + "loss": 2.2325, + "step": 18494 + }, + { + "epoch": 1.2412670715747793, + "grad_norm": 3.9597253799438477, + "learning_rate": 3.3219958943803076e-05, + "loss": 2.1881, + "step": 18496 + }, + { + "epoch": 1.2414012952585485, + "grad_norm": 4.475558757781982, + "learning_rate": 3.3209721348986166e-05, + "loss": 1.9852, + "step": 18498 + }, + { + "epoch": 1.2415355189423174, + "grad_norm": 8.492267608642578, + "learning_rate": 3.3199484547476915e-05, + "loss": 2.087, + "step": 18500 + }, + { + "epoch": 1.2416697426260863, + "grad_norm": 3.9937760829925537, + "learning_rate": 3.3189248539758974e-05, + "loss": 1.8825, + "step": 18502 + }, + { + "epoch": 1.2418039663098555, + "grad_norm": 3.5745458602905273, + "learning_rate": 3.317901332631599e-05, + "loss": 1.9225, + "step": 18504 + }, + { + "epoch": 1.2419381899936244, + "grad_norm": 4.21985387802124, + "learning_rate": 3.3168778907631534e-05, + "loss": 1.9185, + "step": 18506 + }, + { + "epoch": 1.2420724136773933, + "grad_norm": 3.7459282875061035, + "learning_rate": 3.3158545284189204e-05, + "loss": 2.0644, + "step": 18508 + }, + { + "epoch": 1.2422066373611624, + "grad_norm": 4.086984157562256, + "learning_rate": 3.314831245647247e-05, + "loss": 1.9551, + "step": 18510 + }, + { + "epoch": 1.2423408610449314, + "grad_norm": 5.1246657371521, + "learning_rate": 3.3138080424964845e-05, + "loss": 2.081, + "step": 18512 + }, + { + "epoch": 1.2424750847287003, + "grad_norm": 4.365577220916748, + "learning_rate": 3.312784919014974e-05, + "loss": 1.9955, + "step": 18514 + }, + { + "epoch": 1.2426093084124694, + "grad_norm": 4.059395790100098, + "learning_rate": 3.311761875251062e-05, + "loss": 2.3434, + "step": 18516 + }, + { + "epoch": 1.2427435320962383, + "grad_norm": 4.297134876251221, + "learning_rate": 3.3107389112530784e-05, + "loss": 2.0541, + "step": 18518 + }, + { + "epoch": 1.2428777557800075, + "grad_norm": 7.734452724456787, + "learning_rate": 3.309716027069362e-05, + "loss": 2.1599, + "step": 18520 + }, + { + "epoch": 1.2430119794637764, + "grad_norm": 3.518350839614868, + "learning_rate": 3.308693222748237e-05, + "loss": 1.966, + "step": 18522 + }, + { + "epoch": 1.2431462031475453, + "grad_norm": 3.9110054969787598, + "learning_rate": 3.3076704983380334e-05, + "loss": 2.1674, + "step": 18524 + }, + { + "epoch": 1.2432804268313145, + "grad_norm": 4.41225528717041, + "learning_rate": 3.30664785388707e-05, + "loss": 2.224, + "step": 18526 + }, + { + "epoch": 1.2434146505150834, + "grad_norm": 3.8564305305480957, + "learning_rate": 3.305625289443667e-05, + "loss": 1.9781, + "step": 18528 + }, + { + "epoch": 1.2435488741988523, + "grad_norm": 4.047005653381348, + "learning_rate": 3.304602805056135e-05, + "loss": 2.0558, + "step": 18530 + }, + { + "epoch": 1.2436830978826214, + "grad_norm": 3.1130428314208984, + "learning_rate": 3.3035804007727885e-05, + "loss": 1.7712, + "step": 18532 + }, + { + "epoch": 1.2438173215663904, + "grad_norm": 3.9672207832336426, + "learning_rate": 3.302558076641931e-05, + "loss": 1.8723, + "step": 18534 + }, + { + "epoch": 1.2439515452501593, + "grad_norm": 4.30253267288208, + "learning_rate": 3.301535832711867e-05, + "loss": 2.241, + "step": 18536 + }, + { + "epoch": 1.2440857689339284, + "grad_norm": 4.563498020172119, + "learning_rate": 3.300513669030895e-05, + "loss": 2.2638, + "step": 18538 + }, + { + "epoch": 1.2442199926176973, + "grad_norm": 4.372849941253662, + "learning_rate": 3.299491585647311e-05, + "loss": 1.9883, + "step": 18540 + }, + { + "epoch": 1.2443542163014665, + "grad_norm": 4.626254558563232, + "learning_rate": 3.298469582609405e-05, + "loss": 2.2054, + "step": 18542 + }, + { + "epoch": 1.2444884399852354, + "grad_norm": 4.893866062164307, + "learning_rate": 3.2974476599654646e-05, + "loss": 2.248, + "step": 18544 + }, + { + "epoch": 1.2446226636690043, + "grad_norm": 4.153369903564453, + "learning_rate": 3.296425817763776e-05, + "loss": 2.0922, + "step": 18546 + }, + { + "epoch": 1.2447568873527735, + "grad_norm": 4.267186164855957, + "learning_rate": 3.295404056052616e-05, + "loss": 2.2067, + "step": 18548 + }, + { + "epoch": 1.2448911110365424, + "grad_norm": 3.889709949493408, + "learning_rate": 3.2943823748802635e-05, + "loss": 1.9744, + "step": 18550 + }, + { + "epoch": 1.2450253347203115, + "grad_norm": 4.443266868591309, + "learning_rate": 3.2933607742949876e-05, + "loss": 2.0782, + "step": 18552 + }, + { + "epoch": 1.2451595584040804, + "grad_norm": 4.418499946594238, + "learning_rate": 3.292339254345063e-05, + "loss": 1.9067, + "step": 18554 + }, + { + "epoch": 1.2452937820878494, + "grad_norm": 6.839996814727783, + "learning_rate": 3.2913178150787465e-05, + "loss": 2.1783, + "step": 18556 + }, + { + "epoch": 1.2454280057716185, + "grad_norm": 4.000147819519043, + "learning_rate": 3.290296456544306e-05, + "loss": 1.928, + "step": 18558 + }, + { + "epoch": 1.2455622294553874, + "grad_norm": 3.6146044731140137, + "learning_rate": 3.289275178789995e-05, + "loss": 2.1183, + "step": 18560 + }, + { + "epoch": 1.2456964531391563, + "grad_norm": 3.750577926635742, + "learning_rate": 3.288253981864068e-05, + "loss": 2.0556, + "step": 18562 + }, + { + "epoch": 1.2458306768229255, + "grad_norm": 6.6275410652160645, + "learning_rate": 3.287232865814774e-05, + "loss": 1.8244, + "step": 18564 + }, + { + "epoch": 1.2459649005066944, + "grad_norm": 8.017160415649414, + "learning_rate": 3.28621183069036e-05, + "loss": 2.2012, + "step": 18566 + }, + { + "epoch": 1.2460991241904633, + "grad_norm": 4.324739933013916, + "learning_rate": 3.285190876539066e-05, + "loss": 2.1379, + "step": 18568 + }, + { + "epoch": 1.2462333478742325, + "grad_norm": 4.8045220375061035, + "learning_rate": 3.284170003409132e-05, + "loss": 2.0143, + "step": 18570 + }, + { + "epoch": 1.2463675715580014, + "grad_norm": 4.455817222595215, + "learning_rate": 3.2831492113487904e-05, + "loss": 2.1585, + "step": 18572 + }, + { + "epoch": 1.2465017952417705, + "grad_norm": 4.158083438873291, + "learning_rate": 3.282128500406273e-05, + "loss": 2.1828, + "step": 18574 + }, + { + "epoch": 1.2466360189255394, + "grad_norm": 3.4040238857269287, + "learning_rate": 3.281107870629806e-05, + "loss": 1.8677, + "step": 18576 + }, + { + "epoch": 1.2467702426093084, + "grad_norm": 3.766470193862915, + "learning_rate": 3.280087322067612e-05, + "loss": 1.9838, + "step": 18578 + }, + { + "epoch": 1.2469044662930775, + "grad_norm": 4.068746089935303, + "learning_rate": 3.279066854767908e-05, + "loss": 2.0913, + "step": 18580 + }, + { + "epoch": 1.2470386899768464, + "grad_norm": 4.411932945251465, + "learning_rate": 3.2780464687789144e-05, + "loss": 1.9417, + "step": 18582 + }, + { + "epoch": 1.2471729136606153, + "grad_norm": 3.9237425327301025, + "learning_rate": 3.277026164148836e-05, + "loss": 2.2852, + "step": 18584 + }, + { + "epoch": 1.2473071373443845, + "grad_norm": 4.1784586906433105, + "learning_rate": 3.2760059409258855e-05, + "loss": 2.0781, + "step": 18586 + }, + { + "epoch": 1.2474413610281534, + "grad_norm": 3.7892229557037354, + "learning_rate": 3.2749857991582635e-05, + "loss": 1.9846, + "step": 18588 + }, + { + "epoch": 1.2475755847119223, + "grad_norm": 4.219667911529541, + "learning_rate": 3.273965738894171e-05, + "loss": 2.2103, + "step": 18590 + }, + { + "epoch": 1.2477098083956915, + "grad_norm": 3.698237419128418, + "learning_rate": 3.272945760181802e-05, + "loss": 1.8132, + "step": 18592 + }, + { + "epoch": 1.2478440320794604, + "grad_norm": 4.114459991455078, + "learning_rate": 3.271925863069351e-05, + "loss": 2.0503, + "step": 18594 + }, + { + "epoch": 1.2479782557632295, + "grad_norm": 3.8366549015045166, + "learning_rate": 3.270906047605003e-05, + "loss": 1.8798, + "step": 18596 + }, + { + "epoch": 1.2481124794469984, + "grad_norm": 3.911911725997925, + "learning_rate": 3.269886313836947e-05, + "loss": 1.8988, + "step": 18598 + }, + { + "epoch": 1.2482467031307674, + "grad_norm": 4.020765781402588, + "learning_rate": 3.268866661813358e-05, + "loss": 1.9187, + "step": 18600 + }, + { + "epoch": 1.2483809268145365, + "grad_norm": 3.996493339538574, + "learning_rate": 3.2678470915824166e-05, + "loss": 2.0323, + "step": 18602 + }, + { + "epoch": 1.2485151504983054, + "grad_norm": 3.895441770553589, + "learning_rate": 3.266827603192292e-05, + "loss": 2.0544, + "step": 18604 + }, + { + "epoch": 1.2486493741820743, + "grad_norm": 3.5249154567718506, + "learning_rate": 3.265808196691158e-05, + "loss": 1.832, + "step": 18606 + }, + { + "epoch": 1.2487835978658435, + "grad_norm": 4.503840923309326, + "learning_rate": 3.264788872127173e-05, + "loss": 2.0981, + "step": 18608 + }, + { + "epoch": 1.2489178215496124, + "grad_norm": 3.914038896560669, + "learning_rate": 3.2637696295485055e-05, + "loss": 2.1451, + "step": 18610 + }, + { + "epoch": 1.2490520452333813, + "grad_norm": 4.24338960647583, + "learning_rate": 3.262750469003307e-05, + "loss": 1.935, + "step": 18612 + }, + { + "epoch": 1.2491862689171505, + "grad_norm": 4.313173294067383, + "learning_rate": 3.261731390539732e-05, + "loss": 2.1716, + "step": 18614 + }, + { + "epoch": 1.2493204926009194, + "grad_norm": 4.078044891357422, + "learning_rate": 3.260712394205933e-05, + "loss": 1.9017, + "step": 18616 + }, + { + "epoch": 1.2494547162846885, + "grad_norm": 3.5322446823120117, + "learning_rate": 3.25969348005005e-05, + "loss": 2.0023, + "step": 18618 + }, + { + "epoch": 1.2495889399684574, + "grad_norm": 3.8637752532958984, + "learning_rate": 3.2586746481202294e-05, + "loss": 2.1974, + "step": 18620 + }, + { + "epoch": 1.2497231636522264, + "grad_norm": 4.413294315338135, + "learning_rate": 3.2576558984646075e-05, + "loss": 2.2312, + "step": 18622 + }, + { + "epoch": 1.2498573873359955, + "grad_norm": 3.862443447113037, + "learning_rate": 3.256637231131319e-05, + "loss": 2.0313, + "step": 18624 + }, + { + "epoch": 1.2499916110197644, + "grad_norm": 3.5534160137176514, + "learning_rate": 3.255618646168491e-05, + "loss": 2.1062, + "step": 18626 + }, + { + "epoch": 1.2501258347035336, + "grad_norm": 4.067995548248291, + "learning_rate": 3.2546001436242547e-05, + "loss": 2.0592, + "step": 18628 + }, + { + "epoch": 1.2502600583873025, + "grad_norm": 3.7220399379730225, + "learning_rate": 3.253581723546727e-05, + "loss": 1.7091, + "step": 18630 + }, + { + "epoch": 1.2503942820710714, + "grad_norm": 4.334637641906738, + "learning_rate": 3.25256338598403e-05, + "loss": 2.2834, + "step": 18632 + }, + { + "epoch": 1.2505285057548403, + "grad_norm": 3.7186169624328613, + "learning_rate": 3.251545130984276e-05, + "loss": 1.7292, + "step": 18634 + }, + { + "epoch": 1.2506627294386095, + "grad_norm": 4.176265716552734, + "learning_rate": 3.250526958595578e-05, + "loss": 2.1221, + "step": 18636 + }, + { + "epoch": 1.2507969531223784, + "grad_norm": 4.225073337554932, + "learning_rate": 3.249508868866039e-05, + "loss": 1.934, + "step": 18638 + }, + { + "epoch": 1.2509311768061475, + "grad_norm": 4.31087589263916, + "learning_rate": 3.248490861843765e-05, + "loss": 2.0383, + "step": 18640 + }, + { + "epoch": 1.2510654004899164, + "grad_norm": 3.8264026641845703, + "learning_rate": 3.247472937576852e-05, + "loss": 2.1387, + "step": 18642 + }, + { + "epoch": 1.2511996241736854, + "grad_norm": 4.237035274505615, + "learning_rate": 3.246455096113399e-05, + "loss": 2.1679, + "step": 18644 + }, + { + "epoch": 1.2513338478574545, + "grad_norm": 4.32899284362793, + "learning_rate": 3.245437337501491e-05, + "loss": 2.0949, + "step": 18646 + }, + { + "epoch": 1.2514680715412234, + "grad_norm": 4.150428771972656, + "learning_rate": 3.244419661789222e-05, + "loss": 2.3156, + "step": 18648 + }, + { + "epoch": 1.2516022952249926, + "grad_norm": 4.177308082580566, + "learning_rate": 3.24340206902467e-05, + "loss": 1.9909, + "step": 18650 + }, + { + "epoch": 1.2517365189087615, + "grad_norm": 3.603074312210083, + "learning_rate": 3.242384559255917e-05, + "loss": 1.7816, + "step": 18652 + }, + { + "epoch": 1.2518707425925304, + "grad_norm": 4.00182580947876, + "learning_rate": 3.241367132531036e-05, + "loss": 2.1065, + "step": 18654 + }, + { + "epoch": 1.2520049662762995, + "grad_norm": 3.696484327316284, + "learning_rate": 3.2403497888981015e-05, + "loss": 1.9341, + "step": 18656 + }, + { + "epoch": 1.2521391899600685, + "grad_norm": 4.063091278076172, + "learning_rate": 3.2393325284051776e-05, + "loss": 1.9746, + "step": 18658 + }, + { + "epoch": 1.2522734136438374, + "grad_norm": 3.582761764526367, + "learning_rate": 3.238315351100332e-05, + "loss": 2.2328, + "step": 18660 + }, + { + "epoch": 1.2524076373276065, + "grad_norm": 4.113670349121094, + "learning_rate": 3.237298257031619e-05, + "loss": 2.123, + "step": 18662 + }, + { + "epoch": 1.2525418610113754, + "grad_norm": 3.5402016639709473, + "learning_rate": 3.236281246247099e-05, + "loss": 1.9569, + "step": 18664 + }, + { + "epoch": 1.2526760846951444, + "grad_norm": 4.136929988861084, + "learning_rate": 3.235264318794821e-05, + "loss": 2.0769, + "step": 18666 + }, + { + "epoch": 1.2528103083789135, + "grad_norm": 3.8663041591644287, + "learning_rate": 3.234247474722835e-05, + "loss": 1.9608, + "step": 18668 + }, + { + "epoch": 1.2529445320626824, + "grad_norm": 4.6211323738098145, + "learning_rate": 3.2332307140791805e-05, + "loss": 2.0159, + "step": 18670 + }, + { + "epoch": 1.2530787557464516, + "grad_norm": 5.372766017913818, + "learning_rate": 3.2322140369119045e-05, + "loss": 2.0694, + "step": 18672 + }, + { + "epoch": 1.2532129794302205, + "grad_norm": 7.344865322113037, + "learning_rate": 3.231197443269036e-05, + "loss": 1.9622, + "step": 18674 + }, + { + "epoch": 1.2533472031139894, + "grad_norm": 3.9777610301971436, + "learning_rate": 3.230180933198612e-05, + "loss": 1.8982, + "step": 18676 + }, + { + "epoch": 1.2534814267977585, + "grad_norm": 4.319707870483398, + "learning_rate": 3.2291645067486595e-05, + "loss": 2.2397, + "step": 18678 + }, + { + "epoch": 1.2536156504815275, + "grad_norm": 4.050598621368408, + "learning_rate": 3.2281481639672e-05, + "loss": 1.9877, + "step": 18680 + }, + { + "epoch": 1.2537498741652966, + "grad_norm": 4.145373821258545, + "learning_rate": 3.227131904902257e-05, + "loss": 1.9715, + "step": 18682 + }, + { + "epoch": 1.2538840978490655, + "grad_norm": 3.555285692214966, + "learning_rate": 3.226115729601843e-05, + "loss": 1.898, + "step": 18684 + }, + { + "epoch": 1.2540183215328344, + "grad_norm": 4.073456764221191, + "learning_rate": 3.225099638113974e-05, + "loss": 2.1821, + "step": 18686 + }, + { + "epoch": 1.2541525452166034, + "grad_norm": 3.9234986305236816, + "learning_rate": 3.224083630486656e-05, + "loss": 2.193, + "step": 18688 + }, + { + "epoch": 1.2542867689003725, + "grad_norm": 8.502510070800781, + "learning_rate": 3.223067706767896e-05, + "loss": 2.0359, + "step": 18690 + }, + { + "epoch": 1.2544209925841414, + "grad_norm": 4.0523810386657715, + "learning_rate": 3.2220518670056916e-05, + "loss": 2.1798, + "step": 18692 + }, + { + "epoch": 1.2545552162679106, + "grad_norm": 3.891725778579712, + "learning_rate": 3.221036111248041e-05, + "loss": 1.9286, + "step": 18694 + }, + { + "epoch": 1.2546894399516795, + "grad_norm": 4.182806015014648, + "learning_rate": 3.2200204395429344e-05, + "loss": 2.0801, + "step": 18696 + }, + { + "epoch": 1.2548236636354484, + "grad_norm": 3.70468807220459, + "learning_rate": 3.2190048519383635e-05, + "loss": 1.8641, + "step": 18698 + }, + { + "epoch": 1.2549578873192175, + "grad_norm": 3.567793607711792, + "learning_rate": 3.2179893484823106e-05, + "loss": 1.9606, + "step": 18700 + }, + { + "epoch": 1.2550921110029865, + "grad_norm": 4.1514739990234375, + "learning_rate": 3.216973929222757e-05, + "loss": 1.9391, + "step": 18702 + }, + { + "epoch": 1.2552263346867556, + "grad_norm": 4.267300605773926, + "learning_rate": 3.2159585942076775e-05, + "loss": 2.2042, + "step": 18704 + }, + { + "epoch": 1.2553605583705245, + "grad_norm": 4.127199172973633, + "learning_rate": 3.214943343485049e-05, + "loss": 1.8584, + "step": 18706 + }, + { + "epoch": 1.2554947820542934, + "grad_norm": 4.149755001068115, + "learning_rate": 3.213928177102834e-05, + "loss": 1.943, + "step": 18708 + }, + { + "epoch": 1.2556290057380624, + "grad_norm": 4.427430152893066, + "learning_rate": 3.2129130951090036e-05, + "loss": 2.046, + "step": 18710 + }, + { + "epoch": 1.2557632294218315, + "grad_norm": 4.1349992752075195, + "learning_rate": 3.2118980975515134e-05, + "loss": 1.8351, + "step": 18712 + }, + { + "epoch": 1.2558974531056004, + "grad_norm": 3.845427989959717, + "learning_rate": 3.210883184478323e-05, + "loss": 1.8283, + "step": 18714 + }, + { + "epoch": 1.2560316767893696, + "grad_norm": 3.3359134197235107, + "learning_rate": 3.2098683559373835e-05, + "loss": 2.0701, + "step": 18716 + }, + { + "epoch": 1.2561659004731385, + "grad_norm": 3.9165854454040527, + "learning_rate": 3.208853611976644e-05, + "loss": 1.9512, + "step": 18718 + }, + { + "epoch": 1.2563001241569074, + "grad_norm": 4.681850433349609, + "learning_rate": 3.2078389526440495e-05, + "loss": 2.4845, + "step": 18720 + }, + { + "epoch": 1.2564343478406765, + "grad_norm": 4.619125843048096, + "learning_rate": 3.206824377987541e-05, + "loss": 1.9769, + "step": 18722 + }, + { + "epoch": 1.2565685715244455, + "grad_norm": 4.882932186126709, + "learning_rate": 3.205809888055053e-05, + "loss": 2.0501, + "step": 18724 + }, + { + "epoch": 1.2567027952082146, + "grad_norm": 3.931674003601074, + "learning_rate": 3.204795482894521e-05, + "loss": 1.9131, + "step": 18726 + }, + { + "epoch": 1.2568370188919835, + "grad_norm": 3.9970195293426514, + "learning_rate": 3.203781162553871e-05, + "loss": 1.9608, + "step": 18728 + }, + { + "epoch": 1.2569712425757524, + "grad_norm": 3.9448680877685547, + "learning_rate": 3.20276692708103e-05, + "loss": 1.8364, + "step": 18730 + }, + { + "epoch": 1.2571054662595216, + "grad_norm": 3.9265005588531494, + "learning_rate": 3.201752776523917e-05, + "loss": 1.9137, + "step": 18732 + }, + { + "epoch": 1.2572396899432905, + "grad_norm": 4.5435590744018555, + "learning_rate": 3.20073871093045e-05, + "loss": 1.99, + "step": 18734 + }, + { + "epoch": 1.2573739136270594, + "grad_norm": 4.58298921585083, + "learning_rate": 3.199724730348539e-05, + "loss": 1.959, + "step": 18736 + }, + { + "epoch": 1.2575081373108286, + "grad_norm": 4.520092487335205, + "learning_rate": 3.198710834826096e-05, + "loss": 2.1153, + "step": 18738 + }, + { + "epoch": 1.2576423609945975, + "grad_norm": 4.228734493255615, + "learning_rate": 3.1976970244110234e-05, + "loss": 1.8986, + "step": 18740 + }, + { + "epoch": 1.2577765846783664, + "grad_norm": 3.891050338745117, + "learning_rate": 3.196683299151223e-05, + "loss": 1.8781, + "step": 18742 + }, + { + "epoch": 1.2579108083621355, + "grad_norm": 4.065313339233398, + "learning_rate": 3.19566965909459e-05, + "loss": 2.1634, + "step": 18744 + }, + { + "epoch": 1.2580450320459045, + "grad_norm": 3.4540634155273438, + "learning_rate": 3.1946561042890174e-05, + "loss": 1.8047, + "step": 18746 + }, + { + "epoch": 1.2581792557296736, + "grad_norm": 3.7683000564575195, + "learning_rate": 3.1936426347823955e-05, + "loss": 1.8521, + "step": 18748 + }, + { + "epoch": 1.2583134794134425, + "grad_norm": 3.921877145767212, + "learning_rate": 3.1926292506226054e-05, + "loss": 2.0457, + "step": 18750 + }, + { + "epoch": 1.2584477030972114, + "grad_norm": 3.5793063640594482, + "learning_rate": 3.1916159518575294e-05, + "loss": 2.0889, + "step": 18752 + }, + { + "epoch": 1.2585819267809806, + "grad_norm": 4.613918781280518, + "learning_rate": 3.190602738535043e-05, + "loss": 1.9888, + "step": 18754 + }, + { + "epoch": 1.2587161504647495, + "grad_norm": 4.347772121429443, + "learning_rate": 3.189589610703021e-05, + "loss": 1.9684, + "step": 18756 + }, + { + "epoch": 1.2588503741485186, + "grad_norm": 4.0953545570373535, + "learning_rate": 3.188576568409328e-05, + "loss": 2.1568, + "step": 18758 + }, + { + "epoch": 1.2589845978322876, + "grad_norm": 4.1035966873168945, + "learning_rate": 3.187563611701832e-05, + "loss": 1.9582, + "step": 18760 + }, + { + "epoch": 1.2591188215160565, + "grad_norm": 4.122963905334473, + "learning_rate": 3.18655074062839e-05, + "loss": 1.9521, + "step": 18762 + }, + { + "epoch": 1.2592530451998254, + "grad_norm": 4.305302143096924, + "learning_rate": 3.1855379552368605e-05, + "loss": 2.494, + "step": 18764 + }, + { + "epoch": 1.2593872688835945, + "grad_norm": 3.1245181560516357, + "learning_rate": 3.184525255575092e-05, + "loss": 1.8865, + "step": 18766 + }, + { + "epoch": 1.2595214925673635, + "grad_norm": 4.2811503410339355, + "learning_rate": 3.183512641690939e-05, + "loss": 1.9516, + "step": 18768 + }, + { + "epoch": 1.2596557162511326, + "grad_norm": 4.135651588439941, + "learning_rate": 3.182500113632238e-05, + "loss": 2.0204, + "step": 18770 + }, + { + "epoch": 1.2597899399349015, + "grad_norm": 4.310962677001953, + "learning_rate": 3.181487671446836e-05, + "loss": 2.1682, + "step": 18772 + }, + { + "epoch": 1.2599241636186704, + "grad_norm": 4.456271171569824, + "learning_rate": 3.180475315182563e-05, + "loss": 2.0121, + "step": 18774 + }, + { + "epoch": 1.2600583873024396, + "grad_norm": 3.829041004180908, + "learning_rate": 3.1794630448872545e-05, + "loss": 1.8578, + "step": 18776 + }, + { + "epoch": 1.2601926109862085, + "grad_norm": 4.1156744956970215, + "learning_rate": 3.1784508606087374e-05, + "loss": 1.9271, + "step": 18778 + }, + { + "epoch": 1.2603268346699776, + "grad_norm": 3.928471088409424, + "learning_rate": 3.177438762394835e-05, + "loss": 2.0582, + "step": 18780 + }, + { + "epoch": 1.2604610583537466, + "grad_norm": 4.200403213500977, + "learning_rate": 3.1764267502933666e-05, + "loss": 1.9182, + "step": 18782 + }, + { + "epoch": 1.2605952820375155, + "grad_norm": 4.165099143981934, + "learning_rate": 3.17541482435215e-05, + "loss": 2.0032, + "step": 18784 + }, + { + "epoch": 1.2607295057212844, + "grad_norm": 4.032745361328125, + "learning_rate": 3.1744029846189925e-05, + "loss": 2.0789, + "step": 18786 + }, + { + "epoch": 1.2608637294050535, + "grad_norm": 4.092781066894531, + "learning_rate": 3.173391231141706e-05, + "loss": 1.9854, + "step": 18788 + }, + { + "epoch": 1.2609979530888225, + "grad_norm": 4.285175323486328, + "learning_rate": 3.172379563968092e-05, + "loss": 2.0281, + "step": 18790 + }, + { + "epoch": 1.2611321767725916, + "grad_norm": 3.9517650604248047, + "learning_rate": 3.171367983145949e-05, + "loss": 2.111, + "step": 18792 + }, + { + "epoch": 1.2612664004563605, + "grad_norm": 3.9356884956359863, + "learning_rate": 3.170356488723072e-05, + "loss": 1.9727, + "step": 18794 + }, + { + "epoch": 1.2614006241401294, + "grad_norm": 4.940488815307617, + "learning_rate": 3.169345080747256e-05, + "loss": 2.0961, + "step": 18796 + }, + { + "epoch": 1.2615348478238986, + "grad_norm": 5.959822177886963, + "learning_rate": 3.168333759266282e-05, + "loss": 2.2087, + "step": 18798 + }, + { + "epoch": 1.2616690715076675, + "grad_norm": 4.697404384613037, + "learning_rate": 3.167322524327938e-05, + "loss": 2.1623, + "step": 18800 + }, + { + "epoch": 1.2618032951914366, + "grad_norm": 3.9444010257720947, + "learning_rate": 3.166311375979999e-05, + "loss": 1.9111, + "step": 18802 + }, + { + "epoch": 1.2619375188752056, + "grad_norm": 3.9635231494903564, + "learning_rate": 3.1653003142702444e-05, + "loss": 2.1953, + "step": 18804 + }, + { + "epoch": 1.2620717425589745, + "grad_norm": 3.1907169818878174, + "learning_rate": 3.1642893392464404e-05, + "loss": 1.8992, + "step": 18806 + }, + { + "epoch": 1.2622059662427434, + "grad_norm": 4.584324836730957, + "learning_rate": 3.1632784509563565e-05, + "loss": 2.0408, + "step": 18808 + }, + { + "epoch": 1.2623401899265125, + "grad_norm": 3.679699420928955, + "learning_rate": 3.162267649447752e-05, + "loss": 2.0163, + "step": 18810 + }, + { + "epoch": 1.2624744136102815, + "grad_norm": 3.941343307495117, + "learning_rate": 3.1612569347683895e-05, + "loss": 1.8947, + "step": 18812 + }, + { + "epoch": 1.2626086372940506, + "grad_norm": 3.95306396484375, + "learning_rate": 3.160246306966019e-05, + "loss": 2.2256, + "step": 18814 + }, + { + "epoch": 1.2627428609778195, + "grad_norm": 3.807645082473755, + "learning_rate": 3.1592357660883954e-05, + "loss": 1.8832, + "step": 18816 + }, + { + "epoch": 1.2628770846615884, + "grad_norm": 4.2921576499938965, + "learning_rate": 3.158225312183261e-05, + "loss": 2.0916, + "step": 18818 + }, + { + "epoch": 1.2630113083453576, + "grad_norm": 3.5081119537353516, + "learning_rate": 3.157214945298358e-05, + "loss": 1.7833, + "step": 18820 + }, + { + "epoch": 1.2631455320291265, + "grad_norm": 3.8189051151275635, + "learning_rate": 3.156204665481426e-05, + "loss": 1.8956, + "step": 18822 + }, + { + "epoch": 1.2632797557128956, + "grad_norm": 4.224301338195801, + "learning_rate": 3.155194472780197e-05, + "loss": 2.0864, + "step": 18824 + }, + { + "epoch": 1.2634139793966646, + "grad_norm": 4.496669292449951, + "learning_rate": 3.154184367242403e-05, + "loss": 2.1053, + "step": 18826 + }, + { + "epoch": 1.2635482030804335, + "grad_norm": 4.398566722869873, + "learning_rate": 3.1531743489157664e-05, + "loss": 2.126, + "step": 18828 + }, + { + "epoch": 1.2636824267642026, + "grad_norm": 3.8748114109039307, + "learning_rate": 3.152164417848012e-05, + "loss": 2.0343, + "step": 18830 + }, + { + "epoch": 1.2638166504479715, + "grad_norm": 4.154521942138672, + "learning_rate": 3.151154574086853e-05, + "loss": 1.7734, + "step": 18832 + }, + { + "epoch": 1.2639508741317407, + "grad_norm": 4.201342582702637, + "learning_rate": 3.150144817680008e-05, + "loss": 1.9774, + "step": 18834 + }, + { + "epoch": 1.2640850978155096, + "grad_norm": 4.218161106109619, + "learning_rate": 3.149135148675179e-05, + "loss": 2.1172, + "step": 18836 + }, + { + "epoch": 1.2642193214992785, + "grad_norm": 4.107473850250244, + "learning_rate": 3.148125567120076e-05, + "loss": 1.9586, + "step": 18838 + }, + { + "epoch": 1.2643535451830474, + "grad_norm": 3.6544971466064453, + "learning_rate": 3.147116073062399e-05, + "loss": 1.8083, + "step": 18840 + }, + { + "epoch": 1.2644877688668166, + "grad_norm": 4.2606000900268555, + "learning_rate": 3.146106666549844e-05, + "loss": 2.0547, + "step": 18842 + }, + { + "epoch": 1.2646219925505855, + "grad_norm": 3.5727744102478027, + "learning_rate": 3.1450973476301026e-05, + "loss": 1.7779, + "step": 18844 + }, + { + "epoch": 1.2647562162343546, + "grad_norm": 3.916553497314453, + "learning_rate": 3.144088116350866e-05, + "loss": 2.0054, + "step": 18846 + }, + { + "epoch": 1.2648904399181236, + "grad_norm": 3.922372579574585, + "learning_rate": 3.1430789727598145e-05, + "loss": 2.0253, + "step": 18848 + }, + { + "epoch": 1.2650246636018925, + "grad_norm": 3.757260322570801, + "learning_rate": 3.142069916904631e-05, + "loss": 1.8623, + "step": 18850 + }, + { + "epoch": 1.2651588872856616, + "grad_norm": 3.931971311569214, + "learning_rate": 3.14106094883299e-05, + "loss": 2.0518, + "step": 18852 + }, + { + "epoch": 1.2652931109694305, + "grad_norm": 3.986966371536255, + "learning_rate": 3.140052068592564e-05, + "loss": 2.0698, + "step": 18854 + }, + { + "epoch": 1.2654273346531997, + "grad_norm": 4.427046775817871, + "learning_rate": 3.139043276231019e-05, + "loss": 1.9161, + "step": 18856 + }, + { + "epoch": 1.2655615583369686, + "grad_norm": 3.1399269104003906, + "learning_rate": 3.138034571796022e-05, + "loss": 1.7148, + "step": 18858 + }, + { + "epoch": 1.2656957820207375, + "grad_norm": 4.404788970947266, + "learning_rate": 3.1370259553352274e-05, + "loss": 2.0521, + "step": 18860 + }, + { + "epoch": 1.2658300057045064, + "grad_norm": 4.219808578491211, + "learning_rate": 3.1360174268962973e-05, + "loss": 1.9612, + "step": 18862 + }, + { + "epoch": 1.2659642293882756, + "grad_norm": 4.096989631652832, + "learning_rate": 3.135008986526874e-05, + "loss": 1.8176, + "step": 18864 + }, + { + "epoch": 1.2660984530720445, + "grad_norm": 4.18276309967041, + "learning_rate": 3.1340006342746107e-05, + "loss": 2.2129, + "step": 18866 + }, + { + "epoch": 1.2662326767558136, + "grad_norm": 4.388600826263428, + "learning_rate": 3.132992370187148e-05, + "loss": 2.2875, + "step": 18868 + }, + { + "epoch": 1.2663669004395826, + "grad_norm": 4.217967510223389, + "learning_rate": 3.131984194312125e-05, + "loss": 2.0291, + "step": 18870 + }, + { + "epoch": 1.2665011241233515, + "grad_norm": 3.6633007526397705, + "learning_rate": 3.130976106697174e-05, + "loss": 1.6589, + "step": 18872 + }, + { + "epoch": 1.2666353478071206, + "grad_norm": 4.049542427062988, + "learning_rate": 3.1299681073899284e-05, + "loss": 1.7587, + "step": 18874 + }, + { + "epoch": 1.2667695714908895, + "grad_norm": 4.309301853179932, + "learning_rate": 3.128960196438011e-05, + "loss": 2.0048, + "step": 18876 + }, + { + "epoch": 1.2669037951746587, + "grad_norm": 4.2589945793151855, + "learning_rate": 3.127952373889046e-05, + "loss": 1.8948, + "step": 18878 + }, + { + "epoch": 1.2670380188584276, + "grad_norm": 3.863626003265381, + "learning_rate": 3.12694463979065e-05, + "loss": 1.91, + "step": 18880 + }, + { + "epoch": 1.2671722425421965, + "grad_norm": 4.557473182678223, + "learning_rate": 3.1259369941904374e-05, + "loss": 2.2002, + "step": 18882 + }, + { + "epoch": 1.2673064662259654, + "grad_norm": 4.080304145812988, + "learning_rate": 3.124929437136016e-05, + "loss": 1.9064, + "step": 18884 + }, + { + "epoch": 1.2674406899097346, + "grad_norm": 3.812746524810791, + "learning_rate": 3.123921968674991e-05, + "loss": 2.0633, + "step": 18886 + }, + { + "epoch": 1.2675749135935035, + "grad_norm": 3.783416748046875, + "learning_rate": 3.122914588854964e-05, + "loss": 1.9751, + "step": 18888 + }, + { + "epoch": 1.2677091372772726, + "grad_norm": 4.007592678070068, + "learning_rate": 3.1219072977235305e-05, + "loss": 2.1366, + "step": 18890 + }, + { + "epoch": 1.2678433609610416, + "grad_norm": 3.3242316246032715, + "learning_rate": 3.120900095328286e-05, + "loss": 2.0023, + "step": 18892 + }, + { + "epoch": 1.2679775846448105, + "grad_norm": 4.562998294830322, + "learning_rate": 3.1198929817168154e-05, + "loss": 2.0886, + "step": 18894 + }, + { + "epoch": 1.2681118083285796, + "grad_norm": 4.2822980880737305, + "learning_rate": 3.118885956936706e-05, + "loss": 2.0126, + "step": 18896 + }, + { + "epoch": 1.2682460320123485, + "grad_norm": 3.8307061195373535, + "learning_rate": 3.117879021035534e-05, + "loss": 2.0621, + "step": 18898 + }, + { + "epoch": 1.2683802556961177, + "grad_norm": 3.961423397064209, + "learning_rate": 3.116872174060878e-05, + "loss": 2.0366, + "step": 18900 + }, + { + "epoch": 1.2685144793798866, + "grad_norm": 3.7430777549743652, + "learning_rate": 3.1158654160603096e-05, + "loss": 2.065, + "step": 18902 + }, + { + "epoch": 1.2686487030636555, + "grad_norm": 4.23029088973999, + "learning_rate": 3.114858747081395e-05, + "loss": 2.162, + "step": 18904 + }, + { + "epoch": 1.2687829267474247, + "grad_norm": 3.7162673473358154, + "learning_rate": 3.113852167171697e-05, + "loss": 1.9563, + "step": 18906 + }, + { + "epoch": 1.2689171504311936, + "grad_norm": 3.6497740745544434, + "learning_rate": 3.112845676378776e-05, + "loss": 1.7886, + "step": 18908 + }, + { + "epoch": 1.2690513741149627, + "grad_norm": 5.005252361297607, + "learning_rate": 3.1118392747501854e-05, + "loss": 2.0954, + "step": 18910 + }, + { + "epoch": 1.2691855977987316, + "grad_norm": 3.954521417617798, + "learning_rate": 3.1108329623334766e-05, + "loss": 1.9918, + "step": 18912 + }, + { + "epoch": 1.2693198214825006, + "grad_norm": 4.128352165222168, + "learning_rate": 3.109826739176195e-05, + "loss": 1.9492, + "step": 18914 + }, + { + "epoch": 1.2694540451662695, + "grad_norm": 4.561274528503418, + "learning_rate": 3.108820605325883e-05, + "loss": 2.2356, + "step": 18916 + }, + { + "epoch": 1.2695882688500386, + "grad_norm": 4.348280429840088, + "learning_rate": 3.1078145608300776e-05, + "loss": 2.1519, + "step": 18918 + }, + { + "epoch": 1.2697224925338075, + "grad_norm": 3.3822391033172607, + "learning_rate": 3.1068086057363155e-05, + "loss": 1.9655, + "step": 18920 + }, + { + "epoch": 1.2698567162175767, + "grad_norm": 3.9526734352111816, + "learning_rate": 3.105802740092122e-05, + "loss": 2.0012, + "step": 18922 + }, + { + "epoch": 1.2699909399013456, + "grad_norm": 4.23837423324585, + "learning_rate": 3.104796963945027e-05, + "loss": 2.3015, + "step": 18924 + }, + { + "epoch": 1.2701251635851145, + "grad_norm": 4.2359232902526855, + "learning_rate": 3.103791277342545e-05, + "loss": 1.8978, + "step": 18926 + }, + { + "epoch": 1.2702593872688837, + "grad_norm": 4.575779914855957, + "learning_rate": 3.102785680332199e-05, + "loss": 2.2667, + "step": 18928 + }, + { + "epoch": 1.2703936109526526, + "grad_norm": 4.242033958435059, + "learning_rate": 3.101780172961497e-05, + "loss": 2.1436, + "step": 18930 + }, + { + "epoch": 1.2705278346364217, + "grad_norm": 4.271782875061035, + "learning_rate": 3.1007747552779515e-05, + "loss": 2.071, + "step": 18932 + }, + { + "epoch": 1.2706620583201906, + "grad_norm": 3.96649169921875, + "learning_rate": 3.099769427329062e-05, + "loss": 2.127, + "step": 18934 + }, + { + "epoch": 1.2707962820039596, + "grad_norm": 4.230431079864502, + "learning_rate": 3.098764189162332e-05, + "loss": 1.9095, + "step": 18936 + }, + { + "epoch": 1.2709305056877285, + "grad_norm": 4.034116268157959, + "learning_rate": 3.0977590408252546e-05, + "loss": 2.0116, + "step": 18938 + }, + { + "epoch": 1.2710647293714976, + "grad_norm": 3.7883963584899902, + "learning_rate": 3.096753982365323e-05, + "loss": 2.1815, + "step": 18940 + }, + { + "epoch": 1.2711989530552665, + "grad_norm": 3.8921639919281006, + "learning_rate": 3.0957490138300215e-05, + "loss": 2.0558, + "step": 18942 + }, + { + "epoch": 1.2713331767390357, + "grad_norm": 3.63411545753479, + "learning_rate": 3.094744135266836e-05, + "loss": 1.7712, + "step": 18944 + }, + { + "epoch": 1.2714674004228046, + "grad_norm": 4.166527271270752, + "learning_rate": 3.093739346723242e-05, + "loss": 1.8711, + "step": 18946 + }, + { + "epoch": 1.2716016241065735, + "grad_norm": 4.138854026794434, + "learning_rate": 3.092734648246718e-05, + "loss": 2.0377, + "step": 18948 + }, + { + "epoch": 1.2717358477903427, + "grad_norm": 4.083079814910889, + "learning_rate": 3.091730039884728e-05, + "loss": 2.0688, + "step": 18950 + }, + { + "epoch": 1.2718700714741116, + "grad_norm": 4.421020030975342, + "learning_rate": 3.090725521684744e-05, + "loss": 1.8689, + "step": 18952 + }, + { + "epoch": 1.2720042951578807, + "grad_norm": 3.135723829269409, + "learning_rate": 3.089721093694224e-05, + "loss": 1.6504, + "step": 18954 + }, + { + "epoch": 1.2721385188416496, + "grad_norm": 3.712050676345825, + "learning_rate": 3.088716755960624e-05, + "loss": 2.001, + "step": 18956 + }, + { + "epoch": 1.2722727425254186, + "grad_norm": 4.36420202255249, + "learning_rate": 3.087712508531402e-05, + "loss": 2.053, + "step": 18958 + }, + { + "epoch": 1.2724069662091875, + "grad_norm": 4.205966472625732, + "learning_rate": 3.0867083514540004e-05, + "loss": 2.0325, + "step": 18960 + }, + { + "epoch": 1.2725411898929566, + "grad_norm": 3.9151272773742676, + "learning_rate": 3.085704284775869e-05, + "loss": 1.7881, + "step": 18962 + }, + { + "epoch": 1.2726754135767255, + "grad_norm": 4.42035436630249, + "learning_rate": 3.084700308544445e-05, + "loss": 2.1439, + "step": 18964 + }, + { + "epoch": 1.2728096372604947, + "grad_norm": 4.276373386383057, + "learning_rate": 3.083696422807166e-05, + "loss": 2.0083, + "step": 18966 + }, + { + "epoch": 1.2729438609442636, + "grad_norm": 3.183746337890625, + "learning_rate": 3.082692627611462e-05, + "loss": 1.8946, + "step": 18968 + }, + { + "epoch": 1.2730780846280325, + "grad_norm": 4.186310768127441, + "learning_rate": 3.081688923004762e-05, + "loss": 2.0933, + "step": 18970 + }, + { + "epoch": 1.2732123083118017, + "grad_norm": 4.279609203338623, + "learning_rate": 3.080685309034487e-05, + "loss": 2.2754, + "step": 18972 + }, + { + "epoch": 1.2733465319955706, + "grad_norm": 4.168456077575684, + "learning_rate": 3.079681785748057e-05, + "loss": 2.0155, + "step": 18974 + }, + { + "epoch": 1.2734807556793397, + "grad_norm": 3.7859668731689453, + "learning_rate": 3.078678353192887e-05, + "loss": 1.938, + "step": 18976 + }, + { + "epoch": 1.2736149793631086, + "grad_norm": 3.6410880088806152, + "learning_rate": 3.0776750114163876e-05, + "loss": 1.9407, + "step": 18978 + }, + { + "epoch": 1.2737492030468776, + "grad_norm": 4.325371265411377, + "learning_rate": 3.076671760465961e-05, + "loss": 1.9939, + "step": 18980 + }, + { + "epoch": 1.2738834267306467, + "grad_norm": 4.53303337097168, + "learning_rate": 3.0756686003890156e-05, + "loss": 2.1676, + "step": 18982 + }, + { + "epoch": 1.2740176504144156, + "grad_norm": 4.012136459350586, + "learning_rate": 3.074665531232941e-05, + "loss": 1.9326, + "step": 18984 + }, + { + "epoch": 1.2741518740981848, + "grad_norm": 3.574136257171631, + "learning_rate": 3.073662553045138e-05, + "loss": 1.9043, + "step": 18986 + }, + { + "epoch": 1.2742860977819537, + "grad_norm": 4.1038126945495605, + "learning_rate": 3.072659665872988e-05, + "loss": 1.8738, + "step": 18988 + }, + { + "epoch": 1.2744203214657226, + "grad_norm": 4.125857353210449, + "learning_rate": 3.07165686976388e-05, + "loss": 2.3084, + "step": 18990 + }, + { + "epoch": 1.2745545451494915, + "grad_norm": 4.233017444610596, + "learning_rate": 3.070654164765193e-05, + "loss": 2.0386, + "step": 18992 + }, + { + "epoch": 1.2746887688332607, + "grad_norm": 4.088818550109863, + "learning_rate": 3.0696515509243026e-05, + "loss": 1.9291, + "step": 18994 + }, + { + "epoch": 1.2748229925170296, + "grad_norm": 3.776674270629883, + "learning_rate": 3.068649028288581e-05, + "loss": 2.0284, + "step": 18996 + }, + { + "epoch": 1.2749572162007987, + "grad_norm": 4.11598014831543, + "learning_rate": 3.0676465969053946e-05, + "loss": 2.3788, + "step": 18998 + }, + { + "epoch": 1.2750914398845676, + "grad_norm": 4.6984333992004395, + "learning_rate": 3.0666442568221065e-05, + "loss": 2.0572, + "step": 19000 + }, + { + "epoch": 1.2752256635683366, + "grad_norm": 3.6548421382904053, + "learning_rate": 3.065642008086076e-05, + "loss": 1.997, + "step": 19002 + }, + { + "epoch": 1.2753598872521057, + "grad_norm": 4.219428062438965, + "learning_rate": 3.064639850744657e-05, + "loss": 2.2127, + "step": 19004 + }, + { + "epoch": 1.2754941109358746, + "grad_norm": 4.340958595275879, + "learning_rate": 3.063637784845199e-05, + "loss": 2.1991, + "step": 19006 + }, + { + "epoch": 1.2756283346196438, + "grad_norm": 5.010650634765625, + "learning_rate": 3.0626358104350475e-05, + "loss": 1.9188, + "step": 19008 + }, + { + "epoch": 1.2757625583034127, + "grad_norm": 4.037755489349365, + "learning_rate": 3.0616339275615455e-05, + "loss": 2.1374, + "step": 19010 + }, + { + "epoch": 1.2758967819871816, + "grad_norm": 4.043040752410889, + "learning_rate": 3.0606321362720267e-05, + "loss": 1.9496, + "step": 19012 + }, + { + "epoch": 1.2760310056709505, + "grad_norm": 3.92594313621521, + "learning_rate": 3.059630436613829e-05, + "loss": 1.9217, + "step": 19014 + }, + { + "epoch": 1.2761652293547197, + "grad_norm": 3.9588022232055664, + "learning_rate": 3.058628828634274e-05, + "loss": 1.9444, + "step": 19016 + }, + { + "epoch": 1.2762994530384886, + "grad_norm": 3.8772027492523193, + "learning_rate": 3.057627312380692e-05, + "loss": 1.9926, + "step": 19018 + }, + { + "epoch": 1.2764336767222577, + "grad_norm": 4.419902324676514, + "learning_rate": 3.0566258879003986e-05, + "loss": 2.3433, + "step": 19020 + }, + { + "epoch": 1.2765679004060266, + "grad_norm": 4.153539180755615, + "learning_rate": 3.055624555240711e-05, + "loss": 1.8701, + "step": 19022 + }, + { + "epoch": 1.2767021240897956, + "grad_norm": 4.0262651443481445, + "learning_rate": 3.0546233144489414e-05, + "loss": 1.9872, + "step": 19024 + }, + { + "epoch": 1.2768363477735647, + "grad_norm": 4.161719799041748, + "learning_rate": 3.053622165572392e-05, + "loss": 1.968, + "step": 19026 + }, + { + "epoch": 1.2769705714573336, + "grad_norm": 3.3700008392333984, + "learning_rate": 3.05262110865837e-05, + "loss": 1.8212, + "step": 19028 + }, + { + "epoch": 1.2771047951411028, + "grad_norm": 3.7305610179901123, + "learning_rate": 3.0516201437541693e-05, + "loss": 2.0834, + "step": 19030 + }, + { + "epoch": 1.2772390188248717, + "grad_norm": 4.351583957672119, + "learning_rate": 3.0506192709070868e-05, + "loss": 2.1266, + "step": 19032 + }, + { + "epoch": 1.2773732425086406, + "grad_norm": 3.740426778793335, + "learning_rate": 3.04961849016441e-05, + "loss": 1.9647, + "step": 19034 + }, + { + "epoch": 1.2775074661924095, + "grad_norm": 4.19650936126709, + "learning_rate": 3.0486178015734246e-05, + "loss": 2.0599, + "step": 19036 + }, + { + "epoch": 1.2776416898761787, + "grad_norm": 4.088638782501221, + "learning_rate": 3.04761720518141e-05, + "loss": 2.2401, + "step": 19038 + }, + { + "epoch": 1.2777759135599476, + "grad_norm": 3.587409257888794, + "learning_rate": 3.046616701035645e-05, + "loss": 1.8914, + "step": 19040 + }, + { + "epoch": 1.2779101372437167, + "grad_norm": 4.372735023498535, + "learning_rate": 3.0456162891833978e-05, + "loss": 1.8119, + "step": 19042 + }, + { + "epoch": 1.2780443609274856, + "grad_norm": 3.6709156036376953, + "learning_rate": 3.044615969671939e-05, + "loss": 2.0656, + "step": 19044 + }, + { + "epoch": 1.2781785846112546, + "grad_norm": 3.871259927749634, + "learning_rate": 3.043615742548529e-05, + "loss": 1.8349, + "step": 19046 + }, + { + "epoch": 1.2783128082950237, + "grad_norm": 4.432785987854004, + "learning_rate": 3.0426156078604305e-05, + "loss": 1.9456, + "step": 19048 + }, + { + "epoch": 1.2784470319787926, + "grad_norm": 3.9892637729644775, + "learning_rate": 3.0416155656548928e-05, + "loss": 2.2865, + "step": 19050 + }, + { + "epoch": 1.2785812556625618, + "grad_norm": 4.034704208374023, + "learning_rate": 3.040615615979171e-05, + "loss": 2.0581, + "step": 19052 + }, + { + "epoch": 1.2787154793463307, + "grad_norm": 4.22700834274292, + "learning_rate": 3.0396157588805068e-05, + "loss": 2.001, + "step": 19054 + }, + { + "epoch": 1.2788497030300996, + "grad_norm": 3.7653839588165283, + "learning_rate": 3.0386159944061444e-05, + "loss": 1.9149, + "step": 19056 + }, + { + "epoch": 1.2789839267138687, + "grad_norm": 4.250617027282715, + "learning_rate": 3.0376163226033183e-05, + "loss": 1.9562, + "step": 19058 + }, + { + "epoch": 1.2791181503976377, + "grad_norm": 3.893583059310913, + "learning_rate": 3.0366167435192627e-05, + "loss": 2.1435, + "step": 19060 + }, + { + "epoch": 1.2792523740814068, + "grad_norm": 4.737390041351318, + "learning_rate": 3.035617257201204e-05, + "loss": 2.0071, + "step": 19062 + }, + { + "epoch": 1.2793865977651757, + "grad_norm": 3.9258532524108887, + "learning_rate": 3.034617863696369e-05, + "loss": 2.1039, + "step": 19064 + }, + { + "epoch": 1.2795208214489446, + "grad_norm": 5.014461040496826, + "learning_rate": 3.0336185630519737e-05, + "loss": 2.0789, + "step": 19066 + }, + { + "epoch": 1.2796550451327136, + "grad_norm": 3.3917224407196045, + "learning_rate": 3.032619355315236e-05, + "loss": 1.8812, + "step": 19068 + }, + { + "epoch": 1.2797892688164827, + "grad_norm": 4.1063761711120605, + "learning_rate": 3.0316202405333626e-05, + "loss": 2.2349, + "step": 19070 + }, + { + "epoch": 1.2799234925002516, + "grad_norm": 4.29255485534668, + "learning_rate": 3.0306212187535653e-05, + "loss": 2.0195, + "step": 19072 + }, + { + "epoch": 1.2800577161840208, + "grad_norm": 3.8965134620666504, + "learning_rate": 3.0296222900230397e-05, + "loss": 1.9926, + "step": 19074 + }, + { + "epoch": 1.2801919398677897, + "grad_norm": 4.836755275726318, + "learning_rate": 3.0286234543889892e-05, + "loss": 2.5709, + "step": 19076 + }, + { + "epoch": 1.2803261635515586, + "grad_norm": 3.9997928142547607, + "learning_rate": 3.0276247118986013e-05, + "loss": 2.0302, + "step": 19078 + }, + { + "epoch": 1.2804603872353277, + "grad_norm": 3.737091064453125, + "learning_rate": 3.026626062599069e-05, + "loss": 1.7404, + "step": 19080 + }, + { + "epoch": 1.2805946109190967, + "grad_norm": 4.867877960205078, + "learning_rate": 3.025627506537574e-05, + "loss": 2.0554, + "step": 19082 + }, + { + "epoch": 1.2807288346028658, + "grad_norm": 3.5428380966186523, + "learning_rate": 3.0246290437612974e-05, + "loss": 1.8125, + "step": 19084 + }, + { + "epoch": 1.2808630582866347, + "grad_norm": 4.510513782501221, + "learning_rate": 3.0236306743174135e-05, + "loss": 2.1291, + "step": 19086 + }, + { + "epoch": 1.2809972819704036, + "grad_norm": 4.695284366607666, + "learning_rate": 3.0226323982530946e-05, + "loss": 1.8965, + "step": 19088 + }, + { + "epoch": 1.2811315056541726, + "grad_norm": 4.049589157104492, + "learning_rate": 3.0216342156155063e-05, + "loss": 2.0976, + "step": 19090 + }, + { + "epoch": 1.2812657293379417, + "grad_norm": 4.280069828033447, + "learning_rate": 3.0206361264518106e-05, + "loss": 2.2532, + "step": 19092 + }, + { + "epoch": 1.2813999530217106, + "grad_norm": 5.568958759307861, + "learning_rate": 3.019638130809167e-05, + "loss": 1.9403, + "step": 19094 + }, + { + "epoch": 1.2815341767054798, + "grad_norm": 3.9614930152893066, + "learning_rate": 3.018640228734726e-05, + "loss": 2.0251, + "step": 19096 + }, + { + "epoch": 1.2816684003892487, + "grad_norm": 3.7749381065368652, + "learning_rate": 3.0176424202756388e-05, + "loss": 1.8186, + "step": 19098 + }, + { + "epoch": 1.2818026240730176, + "grad_norm": 3.7403717041015625, + "learning_rate": 3.016644705479048e-05, + "loss": 2.0604, + "step": 19100 + }, + { + "epoch": 1.2819368477567867, + "grad_norm": 3.8648834228515625, + "learning_rate": 3.0156470843920965e-05, + "loss": 1.8921, + "step": 19102 + }, + { + "epoch": 1.2820710714405557, + "grad_norm": 4.1476640701293945, + "learning_rate": 3.014649557061917e-05, + "loss": 1.9644, + "step": 19104 + }, + { + "epoch": 1.2822052951243248, + "grad_norm": 4.129083156585693, + "learning_rate": 3.0136521235356425e-05, + "loss": 2.0469, + "step": 19106 + }, + { + "epoch": 1.2823395188080937, + "grad_norm": 4.302798748016357, + "learning_rate": 3.0126547838603976e-05, + "loss": 2.1355, + "step": 19108 + }, + { + "epoch": 1.2824737424918626, + "grad_norm": 4.493409156799316, + "learning_rate": 3.0116575380833088e-05, + "loss": 2.1353, + "step": 19110 + }, + { + "epoch": 1.2826079661756316, + "grad_norm": 4.204872131347656, + "learning_rate": 3.0106603862514882e-05, + "loss": 1.7709, + "step": 19112 + }, + { + "epoch": 1.2827421898594007, + "grad_norm": 3.8701765537261963, + "learning_rate": 3.0096633284120556e-05, + "loss": 1.922, + "step": 19114 + }, + { + "epoch": 1.2828764135431696, + "grad_norm": 3.9896183013916016, + "learning_rate": 3.008666364612115e-05, + "loss": 2.0051, + "step": 19116 + }, + { + "epoch": 1.2830106372269388, + "grad_norm": 3.39762806892395, + "learning_rate": 3.0076694948987738e-05, + "loss": 2.0412, + "step": 19118 + }, + { + "epoch": 1.2831448609107077, + "grad_norm": 3.9156203269958496, + "learning_rate": 3.0066727193191307e-05, + "loss": 1.9299, + "step": 19120 + }, + { + "epoch": 1.2832790845944766, + "grad_norm": 4.457176208496094, + "learning_rate": 3.0056760379202824e-05, + "loss": 2.2114, + "step": 19122 + }, + { + "epoch": 1.2834133082782457, + "grad_norm": 4.012571811676025, + "learning_rate": 3.0046794507493192e-05, + "loss": 1.8644, + "step": 19124 + }, + { + "epoch": 1.2835475319620147, + "grad_norm": 3.7479047775268555, + "learning_rate": 3.0036829578533287e-05, + "loss": 1.7629, + "step": 19126 + }, + { + "epoch": 1.2836817556457838, + "grad_norm": 3.851123332977295, + "learning_rate": 3.0026865592793928e-05, + "loss": 1.8587, + "step": 19128 + }, + { + "epoch": 1.2838159793295527, + "grad_norm": 4.223423957824707, + "learning_rate": 3.0016902550745897e-05, + "loss": 2.1083, + "step": 19130 + }, + { + "epoch": 1.2839502030133216, + "grad_norm": 3.6256182193756104, + "learning_rate": 3.0006940452859916e-05, + "loss": 2.1701, + "step": 19132 + }, + { + "epoch": 1.2840844266970908, + "grad_norm": 4.448319911956787, + "learning_rate": 2.9996979299606697e-05, + "loss": 2.1581, + "step": 19134 + }, + { + "epoch": 1.2842186503808597, + "grad_norm": 3.806687355041504, + "learning_rate": 2.9987019091456846e-05, + "loss": 1.9341, + "step": 19136 + }, + { + "epoch": 1.2843528740646288, + "grad_norm": 4.377286434173584, + "learning_rate": 2.9977059828881027e-05, + "loss": 2.047, + "step": 19138 + }, + { + "epoch": 1.2844870977483978, + "grad_norm": 4.613422870635986, + "learning_rate": 2.996710151234972e-05, + "loss": 2.4014, + "step": 19140 + }, + { + "epoch": 1.2846213214321667, + "grad_norm": 4.07994270324707, + "learning_rate": 2.9957144142333494e-05, + "loss": 1.731, + "step": 19142 + }, + { + "epoch": 1.2847555451159356, + "grad_norm": 3.909926652908325, + "learning_rate": 2.9947187719302792e-05, + "loss": 1.8959, + "step": 19144 + }, + { + "epoch": 1.2848897687997047, + "grad_norm": 4.0475969314575195, + "learning_rate": 2.993723224372804e-05, + "loss": 2.1385, + "step": 19146 + }, + { + "epoch": 1.2850239924834737, + "grad_norm": 4.250983715057373, + "learning_rate": 2.9927277716079605e-05, + "loss": 2.052, + "step": 19148 + }, + { + "epoch": 1.2851582161672428, + "grad_norm": 3.8983678817749023, + "learning_rate": 2.991732413682784e-05, + "loss": 1.9412, + "step": 19150 + }, + { + "epoch": 1.2852924398510117, + "grad_norm": 4.020389556884766, + "learning_rate": 2.9907371506443003e-05, + "loss": 2.1882, + "step": 19152 + }, + { + "epoch": 1.2854266635347806, + "grad_norm": 7.539041519165039, + "learning_rate": 2.9897419825395367e-05, + "loss": 1.9412, + "step": 19154 + }, + { + "epoch": 1.2855608872185498, + "grad_norm": 4.2423577308654785, + "learning_rate": 2.9887469094155108e-05, + "loss": 2.3027, + "step": 19156 + }, + { + "epoch": 1.2856951109023187, + "grad_norm": 4.704409122467041, + "learning_rate": 2.9877519313192386e-05, + "loss": 2.2554, + "step": 19158 + }, + { + "epoch": 1.2858293345860878, + "grad_norm": 4.226144313812256, + "learning_rate": 2.9867570482977316e-05, + "loss": 2.1444, + "step": 19160 + }, + { + "epoch": 1.2859635582698568, + "grad_norm": 4.349644660949707, + "learning_rate": 2.9857622603979933e-05, + "loss": 2.0158, + "step": 19162 + }, + { + "epoch": 1.2860977819536257, + "grad_norm": 3.9742348194122314, + "learning_rate": 2.9847675676670285e-05, + "loss": 2.303, + "step": 19164 + }, + { + "epoch": 1.2862320056373946, + "grad_norm": 4.019457817077637, + "learning_rate": 2.9837729701518325e-05, + "loss": 1.9748, + "step": 19166 + }, + { + "epoch": 1.2863662293211637, + "grad_norm": 4.09002161026001, + "learning_rate": 2.9827784678994003e-05, + "loss": 1.7228, + "step": 19168 + }, + { + "epoch": 1.2865004530049327, + "grad_norm": 4.079833984375, + "learning_rate": 2.9817840609567166e-05, + "loss": 1.9834, + "step": 19170 + }, + { + "epoch": 1.2866346766887018, + "grad_norm": 4.078180313110352, + "learning_rate": 2.9807897493707703e-05, + "loss": 1.9037, + "step": 19172 + }, + { + "epoch": 1.2867689003724707, + "grad_norm": 4.671247959136963, + "learning_rate": 2.9797955331885346e-05, + "loss": 2.0156, + "step": 19174 + }, + { + "epoch": 1.2869031240562396, + "grad_norm": 4.305110454559326, + "learning_rate": 2.9788014124569895e-05, + "loss": 2.1429, + "step": 19176 + }, + { + "epoch": 1.2870373477400088, + "grad_norm": 4.39719295501709, + "learning_rate": 2.977807387223102e-05, + "loss": 1.855, + "step": 19178 + }, + { + "epoch": 1.2871715714237777, + "grad_norm": 3.615467071533203, + "learning_rate": 2.9768134575338402e-05, + "loss": 2.109, + "step": 19180 + }, + { + "epoch": 1.2873057951075468, + "grad_norm": 3.968068838119507, + "learning_rate": 2.975819623436163e-05, + "loss": 2.0463, + "step": 19182 + }, + { + "epoch": 1.2874400187913158, + "grad_norm": 3.869266986846924, + "learning_rate": 2.9748258849770293e-05, + "loss": 2.1574, + "step": 19184 + }, + { + "epoch": 1.2875742424750847, + "grad_norm": 5.285531997680664, + "learning_rate": 2.9738322422033895e-05, + "loss": 1.9714, + "step": 19186 + }, + { + "epoch": 1.2877084661588536, + "grad_norm": 4.664259910583496, + "learning_rate": 2.9728386951621923e-05, + "loss": 2.1031, + "step": 19188 + }, + { + "epoch": 1.2878426898426227, + "grad_norm": 4.490285396575928, + "learning_rate": 2.9718452439003796e-05, + "loss": 1.858, + "step": 19190 + }, + { + "epoch": 1.2879769135263917, + "grad_norm": 4.177495956420898, + "learning_rate": 2.9708518884648923e-05, + "loss": 2.2282, + "step": 19192 + }, + { + "epoch": 1.2881111372101608, + "grad_norm": 3.9812023639678955, + "learning_rate": 2.969858628902662e-05, + "loss": 2.3846, + "step": 19194 + }, + { + "epoch": 1.2882453608939297, + "grad_norm": 3.7651429176330566, + "learning_rate": 2.9688654652606207e-05, + "loss": 1.9689, + "step": 19196 + }, + { + "epoch": 1.2883795845776986, + "grad_norm": 3.6785154342651367, + "learning_rate": 2.967872397585689e-05, + "loss": 2.0837, + "step": 19198 + }, + { + "epoch": 1.2885138082614678, + "grad_norm": 4.144131183624268, + "learning_rate": 2.9668794259247945e-05, + "loss": 2.0395, + "step": 19200 + }, + { + "epoch": 1.2886480319452367, + "grad_norm": 3.935866594314575, + "learning_rate": 2.9658865503248463e-05, + "loss": 2.1031, + "step": 19202 + }, + { + "epoch": 1.2887822556290058, + "grad_norm": 4.17338752746582, + "learning_rate": 2.96489377083276e-05, + "loss": 2.0638, + "step": 19204 + }, + { + "epoch": 1.2889164793127748, + "grad_norm": 4.105684280395508, + "learning_rate": 2.963901087495441e-05, + "loss": 2.0711, + "step": 19206 + }, + { + "epoch": 1.2890507029965437, + "grad_norm": 3.7684223651885986, + "learning_rate": 2.9629085003597918e-05, + "loss": 2.027, + "step": 19208 + }, + { + "epoch": 1.2891849266803128, + "grad_norm": 3.9718220233917236, + "learning_rate": 2.9619160094727093e-05, + "loss": 2.0329, + "step": 19210 + }, + { + "epoch": 1.2893191503640817, + "grad_norm": 9.811216354370117, + "learning_rate": 2.960923614881089e-05, + "loss": 2.1819, + "step": 19212 + }, + { + "epoch": 1.2894533740478509, + "grad_norm": 4.150301933288574, + "learning_rate": 2.9599313166318177e-05, + "loss": 2.0003, + "step": 19214 + }, + { + "epoch": 1.2895875977316198, + "grad_norm": 3.6843812465667725, + "learning_rate": 2.9589391147717803e-05, + "loss": 2.0388, + "step": 19216 + }, + { + "epoch": 1.2897218214153887, + "grad_norm": 4.005952835083008, + "learning_rate": 2.9579470093478558e-05, + "loss": 2.0009, + "step": 19218 + }, + { + "epoch": 1.2898560450991576, + "grad_norm": 3.7149739265441895, + "learning_rate": 2.9569550004069203e-05, + "loss": 1.9357, + "step": 19220 + }, + { + "epoch": 1.2899902687829268, + "grad_norm": 4.0995635986328125, + "learning_rate": 2.9559630879958422e-05, + "loss": 1.7017, + "step": 19222 + }, + { + "epoch": 1.2901244924666957, + "grad_norm": 4.116012096405029, + "learning_rate": 2.9549712721614902e-05, + "loss": 2.1946, + "step": 19224 + }, + { + "epoch": 1.2902587161504648, + "grad_norm": 5.099114418029785, + "learning_rate": 2.953979552950722e-05, + "loss": 2.1649, + "step": 19226 + }, + { + "epoch": 1.2903929398342338, + "grad_norm": 4.180013656616211, + "learning_rate": 2.9529879304103997e-05, + "loss": 1.952, + "step": 19228 + }, + { + "epoch": 1.2905271635180027, + "grad_norm": 4.0111846923828125, + "learning_rate": 2.9519964045873716e-05, + "loss": 2.0733, + "step": 19230 + }, + { + "epoch": 1.2906613872017718, + "grad_norm": 4.547650337219238, + "learning_rate": 2.951004975528484e-05, + "loss": 1.9099, + "step": 19232 + }, + { + "epoch": 1.2907956108855407, + "grad_norm": 3.8617076873779297, + "learning_rate": 2.9500136432805848e-05, + "loss": 2.0752, + "step": 19234 + }, + { + "epoch": 1.2909298345693099, + "grad_norm": 4.5180230140686035, + "learning_rate": 2.949022407890507e-05, + "loss": 2.0183, + "step": 19236 + }, + { + "epoch": 1.2910640582530788, + "grad_norm": 3.9637906551361084, + "learning_rate": 2.9480312694050905e-05, + "loss": 1.8746, + "step": 19238 + }, + { + "epoch": 1.2911982819368477, + "grad_norm": 4.598119258880615, + "learning_rate": 2.9470402278711584e-05, + "loss": 2.0223, + "step": 19240 + }, + { + "epoch": 1.2913325056206166, + "grad_norm": 3.3882668018341064, + "learning_rate": 2.9460492833355407e-05, + "loss": 1.9217, + "step": 19242 + }, + { + "epoch": 1.2914667293043858, + "grad_norm": 4.078577041625977, + "learning_rate": 2.945058435845054e-05, + "loss": 2.2284, + "step": 19244 + }, + { + "epoch": 1.2916009529881547, + "grad_norm": 4.294269561767578, + "learning_rate": 2.9440676854465165e-05, + "loss": 2.0935, + "step": 19246 + }, + { + "epoch": 1.2917351766719238, + "grad_norm": 4.266402721405029, + "learning_rate": 2.9430770321867374e-05, + "loss": 1.9848, + "step": 19248 + }, + { + "epoch": 1.2918694003556928, + "grad_norm": 4.076462745666504, + "learning_rate": 2.942086476112525e-05, + "loss": 1.8638, + "step": 19250 + }, + { + "epoch": 1.2920036240394617, + "grad_norm": 4.0869951248168945, + "learning_rate": 2.941096017270678e-05, + "loss": 2.1426, + "step": 19252 + }, + { + "epoch": 1.2921378477232308, + "grad_norm": 3.8070969581604004, + "learning_rate": 2.9401056557079975e-05, + "loss": 1.8837, + "step": 19254 + }, + { + "epoch": 1.2922720714069997, + "grad_norm": 4.580050468444824, + "learning_rate": 2.9391153914712722e-05, + "loss": 2.2857, + "step": 19256 + }, + { + "epoch": 1.2924062950907689, + "grad_norm": 4.382445335388184, + "learning_rate": 2.938125224607294e-05, + "loss": 2.1413, + "step": 19258 + }, + { + "epoch": 1.2925405187745378, + "grad_norm": 4.0625200271606445, + "learning_rate": 2.937135155162842e-05, + "loss": 2.1452, + "step": 19260 + }, + { + "epoch": 1.2926747424583067, + "grad_norm": 4.0909342765808105, + "learning_rate": 2.9361451831847004e-05, + "loss": 1.9158, + "step": 19262 + }, + { + "epoch": 1.2928089661420756, + "grad_norm": 4.051586151123047, + "learning_rate": 2.935155308719637e-05, + "loss": 2.4122, + "step": 19264 + }, + { + "epoch": 1.2929431898258448, + "grad_norm": 4.01632833480835, + "learning_rate": 2.9341655318144278e-05, + "loss": 2.0101, + "step": 19266 + }, + { + "epoch": 1.2930774135096137, + "grad_norm": 4.027736663818359, + "learning_rate": 2.9331758525158338e-05, + "loss": 2.0444, + "step": 19268 + }, + { + "epoch": 1.2932116371933828, + "grad_norm": 4.102476119995117, + "learning_rate": 2.9321862708706172e-05, + "loss": 2.2027, + "step": 19270 + }, + { + "epoch": 1.2933458608771518, + "grad_norm": 4.090660095214844, + "learning_rate": 2.9311967869255324e-05, + "loss": 2.1279, + "step": 19272 + }, + { + "epoch": 1.2934800845609207, + "grad_norm": 3.9865658283233643, + "learning_rate": 2.9302074007273317e-05, + "loss": 2.1476, + "step": 19274 + }, + { + "epoch": 1.2936143082446898, + "grad_norm": 4.308010101318359, + "learning_rate": 2.9292181123227612e-05, + "loss": 2.0468, + "step": 19276 + }, + { + "epoch": 1.2937485319284587, + "grad_norm": 3.708589553833008, + "learning_rate": 2.9282289217585633e-05, + "loss": 1.7907, + "step": 19278 + }, + { + "epoch": 1.2938827556122279, + "grad_norm": 4.009652137756348, + "learning_rate": 2.927239829081474e-05, + "loss": 2.0006, + "step": 19280 + }, + { + "epoch": 1.2940169792959968, + "grad_norm": 4.020890235900879, + "learning_rate": 2.9262508343382276e-05, + "loss": 1.8532, + "step": 19282 + }, + { + "epoch": 1.2941512029797657, + "grad_norm": 9.195577621459961, + "learning_rate": 2.9252619375755508e-05, + "loss": 1.8038, + "step": 19284 + }, + { + "epoch": 1.2942854266635349, + "grad_norm": 3.6940362453460693, + "learning_rate": 2.9242731388401685e-05, + "loss": 1.8824, + "step": 19286 + }, + { + "epoch": 1.2944196503473038, + "grad_norm": 4.339856147766113, + "learning_rate": 2.9232844381787967e-05, + "loss": 2.0334, + "step": 19288 + }, + { + "epoch": 1.294553874031073, + "grad_norm": 3.691044569015503, + "learning_rate": 2.9222958356381547e-05, + "loss": 2.27, + "step": 19290 + }, + { + "epoch": 1.2946880977148418, + "grad_norm": 4.150519847869873, + "learning_rate": 2.9213073312649452e-05, + "loss": 1.774, + "step": 19292 + }, + { + "epoch": 1.2948223213986108, + "grad_norm": 3.6981959342956543, + "learning_rate": 2.9203189251058792e-05, + "loss": 1.956, + "step": 19294 + }, + { + "epoch": 1.2949565450823797, + "grad_norm": 4.148850440979004, + "learning_rate": 2.9193306172076553e-05, + "loss": 2.0758, + "step": 19296 + }, + { + "epoch": 1.2950907687661488, + "grad_norm": 4.444552898406982, + "learning_rate": 2.9183424076169653e-05, + "loss": 1.977, + "step": 19298 + }, + { + "epoch": 1.2952249924499177, + "grad_norm": 4.866618633270264, + "learning_rate": 2.9173542963805058e-05, + "loss": 1.9327, + "step": 19300 + }, + { + "epoch": 1.2953592161336869, + "grad_norm": 4.020894527435303, + "learning_rate": 2.916366283544959e-05, + "loss": 1.717, + "step": 19302 + }, + { + "epoch": 1.2954934398174558, + "grad_norm": 4.132587432861328, + "learning_rate": 2.915378369157009e-05, + "loss": 2.0967, + "step": 19304 + }, + { + "epoch": 1.2956276635012247, + "grad_norm": 4.08135461807251, + "learning_rate": 2.91439055326333e-05, + "loss": 1.7319, + "step": 19306 + }, + { + "epoch": 1.2957618871849939, + "grad_norm": 4.071516990661621, + "learning_rate": 2.913402835910598e-05, + "loss": 2.132, + "step": 19308 + }, + { + "epoch": 1.2958961108687628, + "grad_norm": 4.395740032196045, + "learning_rate": 2.9124152171454766e-05, + "loss": 2.137, + "step": 19310 + }, + { + "epoch": 1.296030334552532, + "grad_norm": 3.748129367828369, + "learning_rate": 2.9114276970146355e-05, + "loss": 2.0413, + "step": 19312 + }, + { + "epoch": 1.2961645582363008, + "grad_norm": 4.0548505783081055, + "learning_rate": 2.910440275564724e-05, + "loss": 1.9154, + "step": 19314 + }, + { + "epoch": 1.2962987819200698, + "grad_norm": 3.6838791370391846, + "learning_rate": 2.9094529528424032e-05, + "loss": 1.9853, + "step": 19316 + }, + { + "epoch": 1.2964330056038387, + "grad_norm": 4.3000664710998535, + "learning_rate": 2.9084657288943174e-05, + "loss": 2.1221, + "step": 19318 + }, + { + "epoch": 1.2965672292876078, + "grad_norm": 4.293628692626953, + "learning_rate": 2.9074786037671153e-05, + "loss": 2.2144, + "step": 19320 + }, + { + "epoch": 1.2967014529713767, + "grad_norm": 4.087412357330322, + "learning_rate": 2.9064915775074342e-05, + "loss": 2.0498, + "step": 19322 + }, + { + "epoch": 1.2968356766551459, + "grad_norm": 3.8333187103271484, + "learning_rate": 2.905504650161909e-05, + "loss": 1.898, + "step": 19324 + }, + { + "epoch": 1.2969699003389148, + "grad_norm": 4.559422492980957, + "learning_rate": 2.9045178217771684e-05, + "loss": 2.0537, + "step": 19326 + }, + { + "epoch": 1.2971041240226837, + "grad_norm": 3.817640542984009, + "learning_rate": 2.9035310923998427e-05, + "loss": 1.7862, + "step": 19328 + }, + { + "epoch": 1.2972383477064529, + "grad_norm": 3.8986213207244873, + "learning_rate": 2.9025444620765502e-05, + "loss": 1.853, + "step": 19330 + }, + { + "epoch": 1.2973725713902218, + "grad_norm": 3.922086477279663, + "learning_rate": 2.901557930853907e-05, + "loss": 1.8224, + "step": 19332 + }, + { + "epoch": 1.297506795073991, + "grad_norm": 3.808242082595825, + "learning_rate": 2.9005714987785236e-05, + "loss": 2.0299, + "step": 19334 + }, + { + "epoch": 1.2976410187577598, + "grad_norm": 3.796607255935669, + "learning_rate": 2.8995851658970102e-05, + "loss": 1.9799, + "step": 19336 + }, + { + "epoch": 1.2977752424415288, + "grad_norm": 4.816441535949707, + "learning_rate": 2.898598932255966e-05, + "loss": 2.1858, + "step": 19338 + }, + { + "epoch": 1.2979094661252977, + "grad_norm": 3.7440567016601562, + "learning_rate": 2.8976127979019934e-05, + "loss": 2.2101, + "step": 19340 + }, + { + "epoch": 1.2980436898090668, + "grad_norm": 4.886723518371582, + "learning_rate": 2.896626762881678e-05, + "loss": 2.1219, + "step": 19342 + }, + { + "epoch": 1.2981779134928357, + "grad_norm": 3.6328999996185303, + "learning_rate": 2.8956408272416148e-05, + "loss": 1.8492, + "step": 19344 + }, + { + "epoch": 1.2983121371766049, + "grad_norm": 4.022932529449463, + "learning_rate": 2.8946549910283817e-05, + "loss": 2.1664, + "step": 19346 + }, + { + "epoch": 1.2984463608603738, + "grad_norm": 3.9885635375976562, + "learning_rate": 2.8936692542885617e-05, + "loss": 2.0754, + "step": 19348 + }, + { + "epoch": 1.2985805845441427, + "grad_norm": 3.763902187347412, + "learning_rate": 2.8926836170687284e-05, + "loss": 1.9798, + "step": 19350 + }, + { + "epoch": 1.2987148082279119, + "grad_norm": 4.289783477783203, + "learning_rate": 2.8916980794154503e-05, + "loss": 2.1079, + "step": 19352 + }, + { + "epoch": 1.2988490319116808, + "grad_norm": 4.302340507507324, + "learning_rate": 2.8907126413752895e-05, + "loss": 2.2244, + "step": 19354 + }, + { + "epoch": 1.29898325559545, + "grad_norm": 4.550999164581299, + "learning_rate": 2.889727302994811e-05, + "loss": 2.423, + "step": 19356 + }, + { + "epoch": 1.2991174792792188, + "grad_norm": 3.867140531539917, + "learning_rate": 2.8887420643205682e-05, + "loss": 1.9816, + "step": 19358 + }, + { + "epoch": 1.2992517029629878, + "grad_norm": 4.225536346435547, + "learning_rate": 2.887756925399111e-05, + "loss": 1.9552, + "step": 19360 + }, + { + "epoch": 1.299385926646757, + "grad_norm": 3.725884199142456, + "learning_rate": 2.8867718862769834e-05, + "loss": 1.8975, + "step": 19362 + }, + { + "epoch": 1.2995201503305258, + "grad_norm": 4.575340747833252, + "learning_rate": 2.8857869470007302e-05, + "loss": 1.9676, + "step": 19364 + }, + { + "epoch": 1.299654374014295, + "grad_norm": 3.5731313228607178, + "learning_rate": 2.8848021076168875e-05, + "loss": 1.8585, + "step": 19366 + }, + { + "epoch": 1.2997885976980639, + "grad_norm": 4.085967540740967, + "learning_rate": 2.883817368171985e-05, + "loss": 2.0373, + "step": 19368 + }, + { + "epoch": 1.2999228213818328, + "grad_norm": 4.247745513916016, + "learning_rate": 2.882832728712551e-05, + "loss": 2.2001, + "step": 19370 + }, + { + "epoch": 1.3000570450656017, + "grad_norm": 3.774993658065796, + "learning_rate": 2.881848189285105e-05, + "loss": 1.989, + "step": 19372 + }, + { + "epoch": 1.3001912687493709, + "grad_norm": 4.024530410766602, + "learning_rate": 2.880863749936169e-05, + "loss": 2.0888, + "step": 19374 + }, + { + "epoch": 1.3003254924331398, + "grad_norm": 4.596184253692627, + "learning_rate": 2.879879410712252e-05, + "loss": 1.931, + "step": 19376 + }, + { + "epoch": 1.300459716116909, + "grad_norm": 3.822744369506836, + "learning_rate": 2.8788951716598656e-05, + "loss": 1.9905, + "step": 19378 + }, + { + "epoch": 1.3005939398006778, + "grad_norm": 4.463446617126465, + "learning_rate": 2.877911032825511e-05, + "loss": 2.0067, + "step": 19380 + }, + { + "epoch": 1.3007281634844468, + "grad_norm": 4.027451515197754, + "learning_rate": 2.8769269942556875e-05, + "loss": 2.3983, + "step": 19382 + }, + { + "epoch": 1.300862387168216, + "grad_norm": 4.275053024291992, + "learning_rate": 2.875943055996887e-05, + "loss": 2.0944, + "step": 19384 + }, + { + "epoch": 1.3009966108519848, + "grad_norm": 3.929500102996826, + "learning_rate": 2.874959218095602e-05, + "loss": 1.9291, + "step": 19386 + }, + { + "epoch": 1.301130834535754, + "grad_norm": 3.216803789138794, + "learning_rate": 2.873975480598315e-05, + "loss": 1.8357, + "step": 19388 + }, + { + "epoch": 1.3012650582195229, + "grad_norm": 3.6780295372009277, + "learning_rate": 2.8729918435515058e-05, + "loss": 1.8441, + "step": 19390 + }, + { + "epoch": 1.3013992819032918, + "grad_norm": 4.238882064819336, + "learning_rate": 2.872008307001648e-05, + "loss": 1.9705, + "step": 19392 + }, + { + "epoch": 1.3015335055870607, + "grad_norm": 4.198459148406982, + "learning_rate": 2.8710248709952147e-05, + "loss": 2.0431, + "step": 19394 + }, + { + "epoch": 1.3016677292708299, + "grad_norm": 4.139011859893799, + "learning_rate": 2.8700415355786704e-05, + "loss": 2.073, + "step": 19396 + }, + { + "epoch": 1.3018019529545988, + "grad_norm": 3.9025464057922363, + "learning_rate": 2.8690583007984745e-05, + "loss": 2.0144, + "step": 19398 + }, + { + "epoch": 1.301936176638368, + "grad_norm": 3.8844192028045654, + "learning_rate": 2.868075166701082e-05, + "loss": 1.6051, + "step": 19400 + }, + { + "epoch": 1.3020704003221368, + "grad_norm": 4.2549333572387695, + "learning_rate": 2.867092133332947e-05, + "loss": 2.0389, + "step": 19402 + }, + { + "epoch": 1.3022046240059058, + "grad_norm": 3.7714004516601562, + "learning_rate": 2.8661092007405132e-05, + "loss": 1.8204, + "step": 19404 + }, + { + "epoch": 1.302338847689675, + "grad_norm": 3.6088922023773193, + "learning_rate": 2.8651263689702256e-05, + "loss": 1.8679, + "step": 19406 + }, + { + "epoch": 1.3024730713734438, + "grad_norm": 3.846554756164551, + "learning_rate": 2.8641436380685184e-05, + "loss": 1.9012, + "step": 19408 + }, + { + "epoch": 1.302607295057213, + "grad_norm": 3.729757785797119, + "learning_rate": 2.863161008081825e-05, + "loss": 1.8204, + "step": 19410 + }, + { + "epoch": 1.3027415187409819, + "grad_norm": 3.9718923568725586, + "learning_rate": 2.8621784790565696e-05, + "loss": 1.7178, + "step": 19412 + }, + { + "epoch": 1.3028757424247508, + "grad_norm": 4.583271026611328, + "learning_rate": 2.8611960510391795e-05, + "loss": 2.0932, + "step": 19414 + }, + { + "epoch": 1.3030099661085197, + "grad_norm": 4.765818119049072, + "learning_rate": 2.8602137240760695e-05, + "loss": 2.2835, + "step": 19416 + }, + { + "epoch": 1.3031441897922889, + "grad_norm": 3.973057270050049, + "learning_rate": 2.859231498213654e-05, + "loss": 1.9893, + "step": 19418 + }, + { + "epoch": 1.3032784134760578, + "grad_norm": 3.768667459487915, + "learning_rate": 2.8582493734983384e-05, + "loss": 2.0644, + "step": 19420 + }, + { + "epoch": 1.303412637159827, + "grad_norm": 4.04701566696167, + "learning_rate": 2.8572673499765307e-05, + "loss": 1.9205, + "step": 19422 + }, + { + "epoch": 1.3035468608435958, + "grad_norm": 4.398090362548828, + "learning_rate": 2.856285427694627e-05, + "loss": 2.2709, + "step": 19424 + }, + { + "epoch": 1.3036810845273648, + "grad_norm": 4.970524311065674, + "learning_rate": 2.8553036066990214e-05, + "loss": 1.8856, + "step": 19426 + }, + { + "epoch": 1.303815308211134, + "grad_norm": 4.200849533081055, + "learning_rate": 2.854321887036101e-05, + "loss": 1.8713, + "step": 19428 + }, + { + "epoch": 1.3039495318949028, + "grad_norm": 4.018733024597168, + "learning_rate": 2.8533402687522538e-05, + "loss": 1.9286, + "step": 19430 + }, + { + "epoch": 1.304083755578672, + "grad_norm": 4.037346363067627, + "learning_rate": 2.8523587518938554e-05, + "loss": 1.9419, + "step": 19432 + }, + { + "epoch": 1.3042179792624409, + "grad_norm": 3.934601306915283, + "learning_rate": 2.851377336507286e-05, + "loss": 2.0004, + "step": 19434 + }, + { + "epoch": 1.3043522029462098, + "grad_norm": 4.5510640144348145, + "learning_rate": 2.8503960226389136e-05, + "loss": 2.1334, + "step": 19436 + }, + { + "epoch": 1.304486426629979, + "grad_norm": 3.6276636123657227, + "learning_rate": 2.8494148103350983e-05, + "loss": 1.9229, + "step": 19438 + }, + { + "epoch": 1.3046206503137479, + "grad_norm": 4.2859930992126465, + "learning_rate": 2.8484336996422057e-05, + "loss": 2.4469, + "step": 19440 + }, + { + "epoch": 1.304754873997517, + "grad_norm": 4.028028964996338, + "learning_rate": 2.847452690606589e-05, + "loss": 2.191, + "step": 19442 + }, + { + "epoch": 1.304889097681286, + "grad_norm": 3.874722480773926, + "learning_rate": 2.8464717832746014e-05, + "loss": 2.0262, + "step": 19444 + }, + { + "epoch": 1.3050233213650548, + "grad_norm": 4.051851272583008, + "learning_rate": 2.8454909776925865e-05, + "loss": 2.0537, + "step": 19446 + }, + { + "epoch": 1.3051575450488238, + "grad_norm": 4.013634204864502, + "learning_rate": 2.8445102739068873e-05, + "loss": 1.9872, + "step": 19448 + }, + { + "epoch": 1.305291768732593, + "grad_norm": 3.9759674072265625, + "learning_rate": 2.8435296719638366e-05, + "loss": 2.0611, + "step": 19450 + }, + { + "epoch": 1.3054259924163618, + "grad_norm": 3.5337436199188232, + "learning_rate": 2.8425491719097707e-05, + "loss": 1.682, + "step": 19452 + }, + { + "epoch": 1.305560216100131, + "grad_norm": 4.141918659210205, + "learning_rate": 2.8415687737910134e-05, + "loss": 1.9239, + "step": 19454 + }, + { + "epoch": 1.3056944397838999, + "grad_norm": 4.498785018920898, + "learning_rate": 2.8405884776538876e-05, + "loss": 2.1279, + "step": 19456 + }, + { + "epoch": 1.3058286634676688, + "grad_norm": 4.076763153076172, + "learning_rate": 2.839608283544708e-05, + "loss": 2.0151, + "step": 19458 + }, + { + "epoch": 1.305962887151438, + "grad_norm": 3.755805492401123, + "learning_rate": 2.8386281915097907e-05, + "loss": 1.912, + "step": 19460 + }, + { + "epoch": 1.3060971108352069, + "grad_norm": 3.929802656173706, + "learning_rate": 2.8376482015954387e-05, + "loss": 1.7776, + "step": 19462 + }, + { + "epoch": 1.306231334518976, + "grad_norm": 3.8213348388671875, + "learning_rate": 2.836668313847962e-05, + "loss": 2.0366, + "step": 19464 + }, + { + "epoch": 1.306365558202745, + "grad_norm": 4.364765167236328, + "learning_rate": 2.8356885283136485e-05, + "loss": 2.1149, + "step": 19466 + }, + { + "epoch": 1.3064997818865138, + "grad_norm": 3.973921775817871, + "learning_rate": 2.8347088450387986e-05, + "loss": 1.918, + "step": 19468 + }, + { + "epoch": 1.3066340055702828, + "grad_norm": 4.935470104217529, + "learning_rate": 2.833729264069696e-05, + "loss": 1.7854, + "step": 19470 + }, + { + "epoch": 1.306768229254052, + "grad_norm": 3.5938053131103516, + "learning_rate": 2.8327497854526276e-05, + "loss": 1.8994, + "step": 19472 + }, + { + "epoch": 1.3069024529378208, + "grad_norm": 3.6032168865203857, + "learning_rate": 2.8317704092338703e-05, + "loss": 1.5754, + "step": 19474 + }, + { + "epoch": 1.30703667662159, + "grad_norm": 3.9725146293640137, + "learning_rate": 2.8307911354596978e-05, + "loss": 2.1461, + "step": 19476 + }, + { + "epoch": 1.3071709003053589, + "grad_norm": 3.555635929107666, + "learning_rate": 2.8298119641763763e-05, + "loss": 1.9442, + "step": 19478 + }, + { + "epoch": 1.3073051239891278, + "grad_norm": 4.016441345214844, + "learning_rate": 2.828832895430174e-05, + "loss": 2.1246, + "step": 19480 + }, + { + "epoch": 1.307439347672897, + "grad_norm": 3.762857437133789, + "learning_rate": 2.827853929267348e-05, + "loss": 1.7447, + "step": 19482 + }, + { + "epoch": 1.3075735713566659, + "grad_norm": 3.919907808303833, + "learning_rate": 2.8268750657341524e-05, + "loss": 2.0288, + "step": 19484 + }, + { + "epoch": 1.307707795040435, + "grad_norm": 3.5687179565429688, + "learning_rate": 2.825896304876835e-05, + "loss": 1.8292, + "step": 19486 + }, + { + "epoch": 1.307842018724204, + "grad_norm": 4.059016227722168, + "learning_rate": 2.8249176467416438e-05, + "loss": 2.0033, + "step": 19488 + }, + { + "epoch": 1.3079762424079728, + "grad_norm": 4.116788864135742, + "learning_rate": 2.8239390913748144e-05, + "loss": 1.9876, + "step": 19490 + }, + { + "epoch": 1.3081104660917418, + "grad_norm": 3.7241289615631104, + "learning_rate": 2.822960638822588e-05, + "loss": 1.9147, + "step": 19492 + }, + { + "epoch": 1.308244689775511, + "grad_norm": 3.721071720123291, + "learning_rate": 2.8219822891311863e-05, + "loss": 1.8489, + "step": 19494 + }, + { + "epoch": 1.3083789134592798, + "grad_norm": 4.587413311004639, + "learning_rate": 2.8210040423468408e-05, + "loss": 2.0383, + "step": 19496 + }, + { + "epoch": 1.308513137143049, + "grad_norm": 3.767343521118164, + "learning_rate": 2.820025898515768e-05, + "loss": 1.9469, + "step": 19498 + }, + { + "epoch": 1.3086473608268179, + "grad_norm": 4.182920932769775, + "learning_rate": 2.8190478576841862e-05, + "loss": 1.9686, + "step": 19500 + }, + { + "epoch": 1.3087815845105868, + "grad_norm": 4.265953540802002, + "learning_rate": 2.8180699198983062e-05, + "loss": 2.1207, + "step": 19502 + }, + { + "epoch": 1.308915808194356, + "grad_norm": 4.4250030517578125, + "learning_rate": 2.8170920852043286e-05, + "loss": 2.0759, + "step": 19504 + }, + { + "epoch": 1.3090500318781249, + "grad_norm": 4.087188720703125, + "learning_rate": 2.8161143536484592e-05, + "loss": 2.1376, + "step": 19506 + }, + { + "epoch": 1.309184255561894, + "grad_norm": 4.001314640045166, + "learning_rate": 2.81513672527689e-05, + "loss": 1.9233, + "step": 19508 + }, + { + "epoch": 1.309318479245663, + "grad_norm": 3.7607223987579346, + "learning_rate": 2.8141592001358163e-05, + "loss": 1.7876, + "step": 19510 + }, + { + "epoch": 1.3094527029294318, + "grad_norm": 3.669027090072632, + "learning_rate": 2.813181778271422e-05, + "loss": 1.9797, + "step": 19512 + }, + { + "epoch": 1.309586926613201, + "grad_norm": 5.423717498779297, + "learning_rate": 2.8122044597298886e-05, + "loss": 2.178, + "step": 19514 + }, + { + "epoch": 1.30972115029697, + "grad_norm": 3.790292263031006, + "learning_rate": 2.8112272445573905e-05, + "loss": 1.9635, + "step": 19516 + }, + { + "epoch": 1.309855373980739, + "grad_norm": 9.714879035949707, + "learning_rate": 2.810250132800103e-05, + "loss": 2.0408, + "step": 19518 + }, + { + "epoch": 1.309989597664508, + "grad_norm": 4.121469974517822, + "learning_rate": 2.8092731245041903e-05, + "loss": 2.0448, + "step": 19520 + }, + { + "epoch": 1.3101238213482769, + "grad_norm": 4.229625225067139, + "learning_rate": 2.8082962197158148e-05, + "loss": 1.9629, + "step": 19522 + }, + { + "epoch": 1.3102580450320458, + "grad_norm": 8.656262397766113, + "learning_rate": 2.8073194184811314e-05, + "loss": 2.0252, + "step": 19524 + }, + { + "epoch": 1.310392268715815, + "grad_norm": 3.829808235168457, + "learning_rate": 2.8063427208462957e-05, + "loss": 2.0407, + "step": 19526 + }, + { + "epoch": 1.3105264923995839, + "grad_norm": 3.52909517288208, + "learning_rate": 2.8053661268574505e-05, + "loss": 1.7502, + "step": 19528 + }, + { + "epoch": 1.310660716083353, + "grad_norm": 4.292238235473633, + "learning_rate": 2.8043896365607447e-05, + "loss": 2.2015, + "step": 19530 + }, + { + "epoch": 1.310794939767122, + "grad_norm": 3.8827478885650635, + "learning_rate": 2.803413250002307e-05, + "loss": 2.0414, + "step": 19532 + }, + { + "epoch": 1.3109291634508908, + "grad_norm": 4.831069469451904, + "learning_rate": 2.8024369672282756e-05, + "loss": 1.88, + "step": 19534 + }, + { + "epoch": 1.31106338713466, + "grad_norm": 4.562099933624268, + "learning_rate": 2.8014607882847743e-05, + "loss": 1.9426, + "step": 19536 + }, + { + "epoch": 1.311197610818429, + "grad_norm": 3.965757131576538, + "learning_rate": 2.800484713217929e-05, + "loss": 2.0739, + "step": 19538 + }, + { + "epoch": 1.311331834502198, + "grad_norm": 4.1648030281066895, + "learning_rate": 2.7995087420738565e-05, + "loss": 1.9533, + "step": 19540 + }, + { + "epoch": 1.311466058185967, + "grad_norm": 4.126802921295166, + "learning_rate": 2.7985328748986682e-05, + "loss": 2.0893, + "step": 19542 + }, + { + "epoch": 1.3116002818697359, + "grad_norm": 3.810453176498413, + "learning_rate": 2.7975571117384713e-05, + "loss": 1.8481, + "step": 19544 + }, + { + "epoch": 1.3117345055535048, + "grad_norm": 3.8986775875091553, + "learning_rate": 2.7965814526393718e-05, + "loss": 1.9911, + "step": 19546 + }, + { + "epoch": 1.311868729237274, + "grad_norm": 4.19126558303833, + "learning_rate": 2.795605897647466e-05, + "loss": 2.0784, + "step": 19548 + }, + { + "epoch": 1.3120029529210429, + "grad_norm": 4.145925521850586, + "learning_rate": 2.7946304468088463e-05, + "loss": 1.7676, + "step": 19550 + }, + { + "epoch": 1.312137176604812, + "grad_norm": 4.540862083435059, + "learning_rate": 2.7936551001695992e-05, + "loss": 2.0886, + "step": 19552 + }, + { + "epoch": 1.312271400288581, + "grad_norm": 4.415445327758789, + "learning_rate": 2.792679857775813e-05, + "loss": 1.9821, + "step": 19554 + }, + { + "epoch": 1.3124056239723498, + "grad_norm": 4.719698905944824, + "learning_rate": 2.7917047196735602e-05, + "loss": 2.0819, + "step": 19556 + }, + { + "epoch": 1.312539847656119, + "grad_norm": 3.787729024887085, + "learning_rate": 2.790729685908919e-05, + "loss": 2.1011, + "step": 19558 + }, + { + "epoch": 1.312674071339888, + "grad_norm": 4.298770904541016, + "learning_rate": 2.7897547565279557e-05, + "loss": 2.2387, + "step": 19560 + }, + { + "epoch": 1.312808295023657, + "grad_norm": 4.356673240661621, + "learning_rate": 2.788779931576734e-05, + "loss": 2.1347, + "step": 19562 + }, + { + "epoch": 1.312942518707426, + "grad_norm": 3.877899408340454, + "learning_rate": 2.787805211101311e-05, + "loss": 1.9973, + "step": 19564 + }, + { + "epoch": 1.3130767423911949, + "grad_norm": 4.390237331390381, + "learning_rate": 2.7868305951477425e-05, + "loss": 2.1795, + "step": 19566 + }, + { + "epoch": 1.3132109660749638, + "grad_norm": 4.391139507293701, + "learning_rate": 2.7858560837620773e-05, + "loss": 1.8243, + "step": 19568 + }, + { + "epoch": 1.313345189758733, + "grad_norm": 3.806694269180298, + "learning_rate": 2.7848816769903574e-05, + "loss": 2.1076, + "step": 19570 + }, + { + "epoch": 1.3134794134425019, + "grad_norm": 3.8647592067718506, + "learning_rate": 2.783907374878623e-05, + "loss": 1.9743, + "step": 19572 + }, + { + "epoch": 1.313613637126271, + "grad_norm": 4.663578510284424, + "learning_rate": 2.7829331774729056e-05, + "loss": 1.953, + "step": 19574 + }, + { + "epoch": 1.31374786081004, + "grad_norm": 3.8908989429473877, + "learning_rate": 2.781959084819238e-05, + "loss": 1.8158, + "step": 19576 + }, + { + "epoch": 1.3138820844938088, + "grad_norm": 4.309686660766602, + "learning_rate": 2.780985096963641e-05, + "loss": 1.8369, + "step": 19578 + }, + { + "epoch": 1.314016308177578, + "grad_norm": 3.8115804195404053, + "learning_rate": 2.780011213952135e-05, + "loss": 2.0886, + "step": 19580 + }, + { + "epoch": 1.314150531861347, + "grad_norm": 4.144253730773926, + "learning_rate": 2.7790374358307327e-05, + "loss": 1.9819, + "step": 19582 + }, + { + "epoch": 1.314284755545116, + "grad_norm": 4.025634288787842, + "learning_rate": 2.7780637626454452e-05, + "loss": 2.009, + "step": 19584 + }, + { + "epoch": 1.314418979228885, + "grad_norm": 4.04208517074585, + "learning_rate": 2.7770901944422744e-05, + "loss": 1.9523, + "step": 19586 + }, + { + "epoch": 1.3145532029126539, + "grad_norm": 4.2474284172058105, + "learning_rate": 2.7761167312672242e-05, + "loss": 2.071, + "step": 19588 + }, + { + "epoch": 1.314687426596423, + "grad_norm": 3.7500102519989014, + "learning_rate": 2.775143373166281e-05, + "loss": 1.8417, + "step": 19590 + }, + { + "epoch": 1.314821650280192, + "grad_norm": 3.726234197616577, + "learning_rate": 2.7741701201854414e-05, + "loss": 1.9821, + "step": 19592 + }, + { + "epoch": 1.314955873963961, + "grad_norm": 3.656414270401001, + "learning_rate": 2.773196972370684e-05, + "loss": 1.8275, + "step": 19594 + }, + { + "epoch": 1.31509009764773, + "grad_norm": 4.647981643676758, + "learning_rate": 2.772223929767993e-05, + "loss": 2.0308, + "step": 19596 + }, + { + "epoch": 1.315224321331499, + "grad_norm": 3.71774959564209, + "learning_rate": 2.771250992423341e-05, + "loss": 1.8505, + "step": 19598 + }, + { + "epoch": 1.3153585450152678, + "grad_norm": 3.9448516368865967, + "learning_rate": 2.7702781603826965e-05, + "loss": 2.0497, + "step": 19600 + }, + { + "epoch": 1.315492768699037, + "grad_norm": 4.044893741607666, + "learning_rate": 2.7693054336920228e-05, + "loss": 2.0367, + "step": 19602 + }, + { + "epoch": 1.315626992382806, + "grad_norm": 3.6761233806610107, + "learning_rate": 2.7683328123972823e-05, + "loss": 1.9381, + "step": 19604 + }, + { + "epoch": 1.315761216066575, + "grad_norm": 3.790647268295288, + "learning_rate": 2.7673602965444285e-05, + "loss": 2.0387, + "step": 19606 + }, + { + "epoch": 1.315895439750344, + "grad_norm": 3.949280261993408, + "learning_rate": 2.76638788617941e-05, + "loss": 2.0412, + "step": 19608 + }, + { + "epoch": 1.3160296634341129, + "grad_norm": 3.958556890487671, + "learning_rate": 2.76541558134817e-05, + "loss": 1.9601, + "step": 19610 + }, + { + "epoch": 1.316163887117882, + "grad_norm": 3.9539523124694824, + "learning_rate": 2.764443382096652e-05, + "loss": 2.1398, + "step": 19612 + }, + { + "epoch": 1.316298110801651, + "grad_norm": 3.6594769954681396, + "learning_rate": 2.7634712884707852e-05, + "loss": 1.8706, + "step": 19614 + }, + { + "epoch": 1.31643233448542, + "grad_norm": 4.265544414520264, + "learning_rate": 2.7624993005165066e-05, + "loss": 2.1078, + "step": 19616 + }, + { + "epoch": 1.316566558169189, + "grad_norm": 4.2025146484375, + "learning_rate": 2.7615274182797325e-05, + "loss": 1.9149, + "step": 19618 + }, + { + "epoch": 1.316700781852958, + "grad_norm": 3.789443254470825, + "learning_rate": 2.7605556418063877e-05, + "loss": 1.8699, + "step": 19620 + }, + { + "epoch": 1.3168350055367268, + "grad_norm": 4.689788818359375, + "learning_rate": 2.759583971142383e-05, + "loss": 2.2095, + "step": 19622 + }, + { + "epoch": 1.316969229220496, + "grad_norm": 3.832811117172241, + "learning_rate": 2.758612406333633e-05, + "loss": 2.0302, + "step": 19624 + }, + { + "epoch": 1.317103452904265, + "grad_norm": 4.267317771911621, + "learning_rate": 2.7576409474260378e-05, + "loss": 2.1288, + "step": 19626 + }, + { + "epoch": 1.317237676588034, + "grad_norm": 3.799241542816162, + "learning_rate": 2.7566695944654997e-05, + "loss": 1.8919, + "step": 19628 + }, + { + "epoch": 1.317371900271803, + "grad_norm": 3.8705949783325195, + "learning_rate": 2.7556983474979093e-05, + "loss": 1.9819, + "step": 19630 + }, + { + "epoch": 1.3175061239555719, + "grad_norm": 3.9463050365448, + "learning_rate": 2.754727206569161e-05, + "loss": 2.093, + "step": 19632 + }, + { + "epoch": 1.317640347639341, + "grad_norm": 3.982813596725464, + "learning_rate": 2.753756171725137e-05, + "loss": 2.0208, + "step": 19634 + }, + { + "epoch": 1.31777457132311, + "grad_norm": 3.8167197704315186, + "learning_rate": 2.7527852430117167e-05, + "loss": 1.8902, + "step": 19636 + }, + { + "epoch": 1.317908795006879, + "grad_norm": 3.812610626220703, + "learning_rate": 2.7518144204747732e-05, + "loss": 1.9119, + "step": 19638 + }, + { + "epoch": 1.318043018690648, + "grad_norm": 4.143916606903076, + "learning_rate": 2.7508437041601786e-05, + "loss": 1.7364, + "step": 19640 + }, + { + "epoch": 1.318177242374417, + "grad_norm": 4.207032203674316, + "learning_rate": 2.749873094113797e-05, + "loss": 2.1874, + "step": 19642 + }, + { + "epoch": 1.3183114660581858, + "grad_norm": 4.26588249206543, + "learning_rate": 2.748902590381487e-05, + "loss": 1.7723, + "step": 19644 + }, + { + "epoch": 1.318445689741955, + "grad_norm": 3.9035046100616455, + "learning_rate": 2.7479321930091023e-05, + "loss": 1.9304, + "step": 19646 + }, + { + "epoch": 1.318579913425724, + "grad_norm": 4.222590446472168, + "learning_rate": 2.7469619020424913e-05, + "loss": 1.8514, + "step": 19648 + }, + { + "epoch": 1.318714137109493, + "grad_norm": 4.1458516120910645, + "learning_rate": 2.7459917175275018e-05, + "loss": 2.0031, + "step": 19650 + }, + { + "epoch": 1.318848360793262, + "grad_norm": 4.637986660003662, + "learning_rate": 2.7450216395099705e-05, + "loss": 2.302, + "step": 19652 + }, + { + "epoch": 1.3189825844770309, + "grad_norm": 3.8797011375427246, + "learning_rate": 2.7440516680357354e-05, + "loss": 2.022, + "step": 19654 + }, + { + "epoch": 1.3191168081608, + "grad_norm": 3.9623498916625977, + "learning_rate": 2.7430818031506188e-05, + "loss": 1.9702, + "step": 19656 + }, + { + "epoch": 1.319251031844569, + "grad_norm": 4.159145355224609, + "learning_rate": 2.742112044900451e-05, + "loss": 2.0568, + "step": 19658 + }, + { + "epoch": 1.319385255528338, + "grad_norm": 4.03299617767334, + "learning_rate": 2.7411423933310476e-05, + "loss": 1.6704, + "step": 19660 + }, + { + "epoch": 1.319519479212107, + "grad_norm": 4.113770484924316, + "learning_rate": 2.740172848488226e-05, + "loss": 1.9267, + "step": 19662 + }, + { + "epoch": 1.319653702895876, + "grad_norm": 3.725757360458374, + "learning_rate": 2.739203410417794e-05, + "loss": 2.0023, + "step": 19664 + }, + { + "epoch": 1.319787926579645, + "grad_norm": 3.8153207302093506, + "learning_rate": 2.738234079165555e-05, + "loss": 2.0262, + "step": 19666 + }, + { + "epoch": 1.319922150263414, + "grad_norm": 5.273998260498047, + "learning_rate": 2.737264854777306e-05, + "loss": 1.8038, + "step": 19668 + }, + { + "epoch": 1.3200563739471831, + "grad_norm": 3.4621660709381104, + "learning_rate": 2.7362957372988452e-05, + "loss": 1.7075, + "step": 19670 + }, + { + "epoch": 1.320190597630952, + "grad_norm": 4.164013385772705, + "learning_rate": 2.7353267267759587e-05, + "loss": 2.0482, + "step": 19672 + }, + { + "epoch": 1.320324821314721, + "grad_norm": 4.196950435638428, + "learning_rate": 2.734357823254432e-05, + "loss": 2.1843, + "step": 19674 + }, + { + "epoch": 1.3204590449984899, + "grad_norm": 4.455392837524414, + "learning_rate": 2.7333890267800412e-05, + "loss": 2.1238, + "step": 19676 + }, + { + "epoch": 1.320593268682259, + "grad_norm": 4.202584266662598, + "learning_rate": 2.7324203373985626e-05, + "loss": 2.3204, + "step": 19678 + }, + { + "epoch": 1.320727492366028, + "grad_norm": 3.4962143898010254, + "learning_rate": 2.7314517551557627e-05, + "loss": 1.7016, + "step": 19680 + }, + { + "epoch": 1.320861716049797, + "grad_norm": 4.500294208526611, + "learning_rate": 2.7304832800974105e-05, + "loss": 2.0504, + "step": 19682 + }, + { + "epoch": 1.320995939733566, + "grad_norm": 3.782432794570923, + "learning_rate": 2.7295149122692566e-05, + "loss": 1.9026, + "step": 19684 + }, + { + "epoch": 1.321130163417335, + "grad_norm": 3.8127658367156982, + "learning_rate": 2.7285466517170605e-05, + "loss": 2.0216, + "step": 19686 + }, + { + "epoch": 1.321264387101104, + "grad_norm": 8.555583000183105, + "learning_rate": 2.727578498486566e-05, + "loss": 1.9492, + "step": 19688 + }, + { + "epoch": 1.321398610784873, + "grad_norm": 4.145305633544922, + "learning_rate": 2.7266104526235215e-05, + "loss": 2.1439, + "step": 19690 + }, + { + "epoch": 1.3215328344686421, + "grad_norm": 4.4614715576171875, + "learning_rate": 2.725642514173662e-05, + "loss": 1.9085, + "step": 19692 + }, + { + "epoch": 1.321667058152411, + "grad_norm": 3.638500452041626, + "learning_rate": 2.724674683182722e-05, + "loss": 1.9562, + "step": 19694 + }, + { + "epoch": 1.32180128183618, + "grad_norm": 4.2645955085754395, + "learning_rate": 2.7237069596964266e-05, + "loss": 2.0015, + "step": 19696 + }, + { + "epoch": 1.3219355055199489, + "grad_norm": 4.410453796386719, + "learning_rate": 2.722739343760503e-05, + "loss": 2.11, + "step": 19698 + }, + { + "epoch": 1.322069729203718, + "grad_norm": 4.1759772300720215, + "learning_rate": 2.721771835420668e-05, + "loss": 2.2944, + "step": 19700 + }, + { + "epoch": 1.322203952887487, + "grad_norm": 4.299444675445557, + "learning_rate": 2.7208044347226335e-05, + "loss": 1.8915, + "step": 19702 + }, + { + "epoch": 1.322338176571256, + "grad_norm": 3.7404427528381348, + "learning_rate": 2.719837141712106e-05, + "loss": 1.6742, + "step": 19704 + }, + { + "epoch": 1.322472400255025, + "grad_norm": 3.808338165283203, + "learning_rate": 2.718869956434791e-05, + "loss": 1.8181, + "step": 19706 + }, + { + "epoch": 1.322606623938794, + "grad_norm": 4.262500286102295, + "learning_rate": 2.717902878936386e-05, + "loss": 2.2295, + "step": 19708 + }, + { + "epoch": 1.322740847622563, + "grad_norm": 3.700575828552246, + "learning_rate": 2.7169359092625813e-05, + "loss": 1.9662, + "step": 19710 + }, + { + "epoch": 1.322875071306332, + "grad_norm": 3.9964118003845215, + "learning_rate": 2.715969047459066e-05, + "loss": 2.1903, + "step": 19712 + }, + { + "epoch": 1.3230092949901011, + "grad_norm": 3.7277565002441406, + "learning_rate": 2.7150022935715196e-05, + "loss": 2.0878, + "step": 19714 + }, + { + "epoch": 1.32314351867387, + "grad_norm": 4.320284366607666, + "learning_rate": 2.714035647645624e-05, + "loss": 2.0686, + "step": 19716 + }, + { + "epoch": 1.323277742357639, + "grad_norm": 3.3727121353149414, + "learning_rate": 2.7130691097270468e-05, + "loss": 1.8521, + "step": 19718 + }, + { + "epoch": 1.3234119660414079, + "grad_norm": 3.7952492237091064, + "learning_rate": 2.7121026798614583e-05, + "loss": 1.7039, + "step": 19720 + }, + { + "epoch": 1.323546189725177, + "grad_norm": 4.394172191619873, + "learning_rate": 2.7111363580945202e-05, + "loss": 2.1779, + "step": 19722 + }, + { + "epoch": 1.323680413408946, + "grad_norm": 4.11142635345459, + "learning_rate": 2.710170144471888e-05, + "loss": 1.8794, + "step": 19724 + }, + { + "epoch": 1.323814637092715, + "grad_norm": 4.299354553222656, + "learning_rate": 2.7092040390392115e-05, + "loss": 2.1067, + "step": 19726 + }, + { + "epoch": 1.323948860776484, + "grad_norm": 3.895462989807129, + "learning_rate": 2.7082380418421417e-05, + "loss": 2.1204, + "step": 19728 + }, + { + "epoch": 1.324083084460253, + "grad_norm": 3.861283302307129, + "learning_rate": 2.7072721529263177e-05, + "loss": 1.8229, + "step": 19730 + }, + { + "epoch": 1.324217308144022, + "grad_norm": 3.8946332931518555, + "learning_rate": 2.706306372337376e-05, + "loss": 1.8986, + "step": 19732 + }, + { + "epoch": 1.324351531827791, + "grad_norm": 4.212174415588379, + "learning_rate": 2.7053407001209465e-05, + "loss": 1.9837, + "step": 19734 + }, + { + "epoch": 1.3244857555115601, + "grad_norm": 4.768448352813721, + "learning_rate": 2.7043751363226575e-05, + "loss": 1.8751, + "step": 19736 + }, + { + "epoch": 1.324619979195329, + "grad_norm": 4.847423076629639, + "learning_rate": 2.703409680988128e-05, + "loss": 1.8482, + "step": 19738 + }, + { + "epoch": 1.324754202879098, + "grad_norm": 4.209506034851074, + "learning_rate": 2.702444334162979e-05, + "loss": 1.8872, + "step": 19740 + }, + { + "epoch": 1.324888426562867, + "grad_norm": 3.7055504322052, + "learning_rate": 2.7014790958928138e-05, + "loss": 1.8975, + "step": 19742 + }, + { + "epoch": 1.325022650246636, + "grad_norm": 3.56510329246521, + "learning_rate": 2.7005139662232425e-05, + "loss": 1.8546, + "step": 19744 + }, + { + "epoch": 1.3251568739304052, + "grad_norm": 4.563514232635498, + "learning_rate": 2.699548945199863e-05, + "loss": 2.0441, + "step": 19746 + }, + { + "epoch": 1.325291097614174, + "grad_norm": 4.69660758972168, + "learning_rate": 2.6985840328682737e-05, + "loss": 2.0021, + "step": 19748 + }, + { + "epoch": 1.325425321297943, + "grad_norm": 4.163809299468994, + "learning_rate": 2.6976192292740637e-05, + "loss": 2.1186, + "step": 19750 + }, + { + "epoch": 1.325559544981712, + "grad_norm": 4.264693260192871, + "learning_rate": 2.696654534462818e-05, + "loss": 2.1558, + "step": 19752 + }, + { + "epoch": 1.325693768665481, + "grad_norm": 3.873859167098999, + "learning_rate": 2.6956899484801134e-05, + "loss": 2.2775, + "step": 19754 + }, + { + "epoch": 1.32582799234925, + "grad_norm": 3.6060469150543213, + "learning_rate": 2.6947254713715304e-05, + "loss": 1.8501, + "step": 19756 + }, + { + "epoch": 1.3259622160330191, + "grad_norm": 3.31429123878479, + "learning_rate": 2.693761103182635e-05, + "loss": 1.9017, + "step": 19758 + }, + { + "epoch": 1.326096439716788, + "grad_norm": 3.7171554565429688, + "learning_rate": 2.692796843958993e-05, + "loss": 1.8249, + "step": 19760 + }, + { + "epoch": 1.326230663400557, + "grad_norm": 4.211698532104492, + "learning_rate": 2.691832693746161e-05, + "loss": 2.2619, + "step": 19762 + }, + { + "epoch": 1.326364887084326, + "grad_norm": 4.495853900909424, + "learning_rate": 2.6908686525896977e-05, + "loss": 1.9294, + "step": 19764 + }, + { + "epoch": 1.326499110768095, + "grad_norm": 4.258190155029297, + "learning_rate": 2.689904720535147e-05, + "loss": 2.1965, + "step": 19766 + }, + { + "epoch": 1.3266333344518642, + "grad_norm": 4.13493537902832, + "learning_rate": 2.68894089762806e-05, + "loss": 1.9736, + "step": 19768 + }, + { + "epoch": 1.326767558135633, + "grad_norm": 3.799561023712158, + "learning_rate": 2.6879771839139678e-05, + "loss": 1.8153, + "step": 19770 + }, + { + "epoch": 1.326901781819402, + "grad_norm": 4.381650924682617, + "learning_rate": 2.6870135794384084e-05, + "loss": 2.6809, + "step": 19772 + }, + { + "epoch": 1.327036005503171, + "grad_norm": 4.297097682952881, + "learning_rate": 2.686050084246907e-05, + "loss": 1.8903, + "step": 19774 + }, + { + "epoch": 1.32717022918694, + "grad_norm": 3.4506208896636963, + "learning_rate": 2.6850866983849915e-05, + "loss": 1.9139, + "step": 19776 + }, + { + "epoch": 1.327304452870709, + "grad_norm": 3.6232755184173584, + "learning_rate": 2.684123421898179e-05, + "loss": 2.0223, + "step": 19778 + }, + { + "epoch": 1.3274386765544781, + "grad_norm": 4.294733047485352, + "learning_rate": 2.6831602548319773e-05, + "loss": 1.9266, + "step": 19780 + }, + { + "epoch": 1.327572900238247, + "grad_norm": 4.084924697875977, + "learning_rate": 2.6821971972318992e-05, + "loss": 1.8988, + "step": 19782 + }, + { + "epoch": 1.327707123922016, + "grad_norm": 3.220860242843628, + "learning_rate": 2.6812342491434444e-05, + "loss": 1.6553, + "step": 19784 + }, + { + "epoch": 1.327841347605785, + "grad_norm": 4.1886091232299805, + "learning_rate": 2.680271410612113e-05, + "loss": 2.025, + "step": 19786 + }, + { + "epoch": 1.327975571289554, + "grad_norm": 3.2924108505249023, + "learning_rate": 2.6793086816833967e-05, + "loss": 1.9189, + "step": 19788 + }, + { + "epoch": 1.3281097949733232, + "grad_norm": 4.081820487976074, + "learning_rate": 2.6783460624027813e-05, + "loss": 2.1077, + "step": 19790 + }, + { + "epoch": 1.328244018657092, + "grad_norm": 4.030813694000244, + "learning_rate": 2.6773835528157464e-05, + "loss": 1.833, + "step": 19792 + }, + { + "epoch": 1.328378242340861, + "grad_norm": 4.291330814361572, + "learning_rate": 2.676421152967774e-05, + "loss": 2.059, + "step": 19794 + }, + { + "epoch": 1.32851246602463, + "grad_norm": 4.165417194366455, + "learning_rate": 2.675458862904333e-05, + "loss": 1.9044, + "step": 19796 + }, + { + "epoch": 1.328646689708399, + "grad_norm": 4.5716872215271, + "learning_rate": 2.6744966826708906e-05, + "loss": 2.1368, + "step": 19798 + }, + { + "epoch": 1.328780913392168, + "grad_norm": 4.327347755432129, + "learning_rate": 2.673534612312904e-05, + "loss": 1.9772, + "step": 19800 + }, + { + "epoch": 1.3289151370759371, + "grad_norm": 4.529144763946533, + "learning_rate": 2.6725726518758344e-05, + "loss": 2.2377, + "step": 19802 + }, + { + "epoch": 1.329049360759706, + "grad_norm": 3.908935070037842, + "learning_rate": 2.6716108014051282e-05, + "loss": 2.0196, + "step": 19804 + }, + { + "epoch": 1.329183584443475, + "grad_norm": 4.1100921630859375, + "learning_rate": 2.670649060946237e-05, + "loss": 2.0735, + "step": 19806 + }, + { + "epoch": 1.329317808127244, + "grad_norm": 5.4870285987854, + "learning_rate": 2.6696874305445936e-05, + "loss": 2.0471, + "step": 19808 + }, + { + "epoch": 1.329452031811013, + "grad_norm": 4.505583763122559, + "learning_rate": 2.6687259102456386e-05, + "loss": 1.8773, + "step": 19810 + }, + { + "epoch": 1.3295862554947822, + "grad_norm": 4.199224472045898, + "learning_rate": 2.6677645000947982e-05, + "loss": 1.966, + "step": 19812 + }, + { + "epoch": 1.329720479178551, + "grad_norm": 3.7805404663085938, + "learning_rate": 2.6668032001375002e-05, + "loss": 1.9925, + "step": 19814 + }, + { + "epoch": 1.32985470286232, + "grad_norm": 4.1031999588012695, + "learning_rate": 2.665842010419164e-05, + "loss": 1.9246, + "step": 19816 + }, + { + "epoch": 1.3299889265460891, + "grad_norm": 3.993379831314087, + "learning_rate": 2.6648809309852017e-05, + "loss": 1.9958, + "step": 19818 + }, + { + "epoch": 1.330123150229858, + "grad_norm": 3.460287094116211, + "learning_rate": 2.6639199618810228e-05, + "loss": 1.8145, + "step": 19820 + }, + { + "epoch": 1.3302573739136272, + "grad_norm": 3.913473129272461, + "learning_rate": 2.6629591031520334e-05, + "loss": 2.3673, + "step": 19822 + }, + { + "epoch": 1.3303915975973961, + "grad_norm": 4.217419147491455, + "learning_rate": 2.6619983548436313e-05, + "loss": 1.9142, + "step": 19824 + }, + { + "epoch": 1.330525821281165, + "grad_norm": 3.966343402862549, + "learning_rate": 2.661037717001209e-05, + "loss": 2.1748, + "step": 19826 + }, + { + "epoch": 1.330660044964934, + "grad_norm": 4.433574199676514, + "learning_rate": 2.660077189670153e-05, + "loss": 1.8371, + "step": 19828 + }, + { + "epoch": 1.330794268648703, + "grad_norm": 4.0693888664245605, + "learning_rate": 2.659116772895851e-05, + "loss": 2.035, + "step": 19830 + }, + { + "epoch": 1.330928492332472, + "grad_norm": 4.483973979949951, + "learning_rate": 2.6581564667236758e-05, + "loss": 1.8073, + "step": 19832 + }, + { + "epoch": 1.3310627160162412, + "grad_norm": 3.212812662124634, + "learning_rate": 2.6571962711990073e-05, + "loss": 2.0113, + "step": 19834 + }, + { + "epoch": 1.33119693970001, + "grad_norm": 4.276805877685547, + "learning_rate": 2.6562361863672037e-05, + "loss": 1.8149, + "step": 19836 + }, + { + "epoch": 1.331331163383779, + "grad_norm": 4.470346450805664, + "learning_rate": 2.655276212273633e-05, + "loss": 1.968, + "step": 19838 + }, + { + "epoch": 1.3314653870675481, + "grad_norm": 4.298914909362793, + "learning_rate": 2.654316348963649e-05, + "loss": 1.7938, + "step": 19840 + }, + { + "epoch": 1.331599610751317, + "grad_norm": 6.790534019470215, + "learning_rate": 2.653356596482607e-05, + "loss": 1.9995, + "step": 19842 + }, + { + "epoch": 1.3317338344350862, + "grad_norm": 4.1747236251831055, + "learning_rate": 2.652396954875851e-05, + "loss": 2.0777, + "step": 19844 + }, + { + "epoch": 1.3318680581188551, + "grad_norm": 3.539616823196411, + "learning_rate": 2.651437424188723e-05, + "loss": 2.1129, + "step": 19846 + }, + { + "epoch": 1.332002281802624, + "grad_norm": 4.556600570678711, + "learning_rate": 2.6504780044665578e-05, + "loss": 2.0449, + "step": 19848 + }, + { + "epoch": 1.332136505486393, + "grad_norm": 4.058013439178467, + "learning_rate": 2.6495186957546852e-05, + "loss": 2.0721, + "step": 19850 + }, + { + "epoch": 1.332270729170162, + "grad_norm": 4.153592109680176, + "learning_rate": 2.648559498098434e-05, + "loss": 2.215, + "step": 19852 + }, + { + "epoch": 1.332404952853931, + "grad_norm": 3.942617416381836, + "learning_rate": 2.647600411543123e-05, + "loss": 1.9085, + "step": 19854 + }, + { + "epoch": 1.3325391765377002, + "grad_norm": 4.730444431304932, + "learning_rate": 2.6466414361340663e-05, + "loss": 2.2031, + "step": 19856 + }, + { + "epoch": 1.332673400221469, + "grad_norm": 4.242741107940674, + "learning_rate": 2.6456825719165723e-05, + "loss": 2.3529, + "step": 19858 + }, + { + "epoch": 1.332807623905238, + "grad_norm": 3.6508359909057617, + "learning_rate": 2.644723818935949e-05, + "loss": 2.0092, + "step": 19860 + }, + { + "epoch": 1.3329418475890071, + "grad_norm": 3.9898812770843506, + "learning_rate": 2.6437651772374937e-05, + "loss": 1.7687, + "step": 19862 + }, + { + "epoch": 1.333076071272776, + "grad_norm": 3.6996307373046875, + "learning_rate": 2.6428066468665013e-05, + "loss": 1.9286, + "step": 19864 + }, + { + "epoch": 1.3332102949565452, + "grad_norm": 4.381689071655273, + "learning_rate": 2.6418482278682566e-05, + "loss": 1.7847, + "step": 19866 + }, + { + "epoch": 1.3333445186403141, + "grad_norm": 4.000370502471924, + "learning_rate": 2.640889920288049e-05, + "loss": 2.0266, + "step": 19868 + }, + { + "epoch": 1.333478742324083, + "grad_norm": 4.188684940338135, + "learning_rate": 2.6399317241711507e-05, + "loss": 2.0403, + "step": 19870 + }, + { + "epoch": 1.333612966007852, + "grad_norm": 3.846039295196533, + "learning_rate": 2.63897363956284e-05, + "loss": 1.7473, + "step": 19872 + }, + { + "epoch": 1.333747189691621, + "grad_norm": 4.236984729766846, + "learning_rate": 2.638015666508382e-05, + "loss": 1.8523, + "step": 19874 + }, + { + "epoch": 1.33388141337539, + "grad_norm": 4.715508937835693, + "learning_rate": 2.6370578050530392e-05, + "loss": 2.2064, + "step": 19876 + }, + { + "epoch": 1.3340156370591592, + "grad_norm": 4.1012797355651855, + "learning_rate": 2.636100055242067e-05, + "loss": 1.9753, + "step": 19878 + }, + { + "epoch": 1.334149860742928, + "grad_norm": 4.144448280334473, + "learning_rate": 2.63514241712072e-05, + "loss": 2.2175, + "step": 19880 + }, + { + "epoch": 1.334284084426697, + "grad_norm": 4.1569695472717285, + "learning_rate": 2.6341848907342436e-05, + "loss": 1.9598, + "step": 19882 + }, + { + "epoch": 1.3344183081104661, + "grad_norm": 3.7784841060638428, + "learning_rate": 2.6332274761278797e-05, + "loss": 2.1087, + "step": 19884 + }, + { + "epoch": 1.334552531794235, + "grad_norm": 3.882664442062378, + "learning_rate": 2.6322701733468608e-05, + "loss": 1.7791, + "step": 19886 + }, + { + "epoch": 1.3346867554780042, + "grad_norm": 3.639793634414673, + "learning_rate": 2.6313129824364224e-05, + "loss": 2.1964, + "step": 19888 + }, + { + "epoch": 1.3348209791617731, + "grad_norm": 4.043917179107666, + "learning_rate": 2.630355903441788e-05, + "loss": 2.0618, + "step": 19890 + }, + { + "epoch": 1.334955202845542, + "grad_norm": 6.465092182159424, + "learning_rate": 2.6293989364081774e-05, + "loss": 2.0141, + "step": 19892 + }, + { + "epoch": 1.3350894265293112, + "grad_norm": 3.7205379009246826, + "learning_rate": 2.628442081380803e-05, + "loss": 1.8251, + "step": 19894 + }, + { + "epoch": 1.33522365021308, + "grad_norm": 3.631465435028076, + "learning_rate": 2.6274853384048793e-05, + "loss": 1.9761, + "step": 19896 + }, + { + "epoch": 1.3353578738968492, + "grad_norm": 3.9402756690979004, + "learning_rate": 2.6265287075256057e-05, + "loss": 1.7485, + "step": 19898 + }, + { + "epoch": 1.3354920975806182, + "grad_norm": 3.9496965408325195, + "learning_rate": 2.6255721887881857e-05, + "loss": 1.8863, + "step": 19900 + }, + { + "epoch": 1.335626321264387, + "grad_norm": 4.0976481437683105, + "learning_rate": 2.6246157822378104e-05, + "loss": 1.9031, + "step": 19902 + }, + { + "epoch": 1.335760544948156, + "grad_norm": 4.26181173324585, + "learning_rate": 2.6236594879196685e-05, + "loss": 1.9895, + "step": 19904 + }, + { + "epoch": 1.3358947686319251, + "grad_norm": 3.6741137504577637, + "learning_rate": 2.6227033058789408e-05, + "loss": 2.0194, + "step": 19906 + }, + { + "epoch": 1.336028992315694, + "grad_norm": 3.8896117210388184, + "learning_rate": 2.6217472361608094e-05, + "loss": 1.9234, + "step": 19908 + }, + { + "epoch": 1.3361632159994632, + "grad_norm": 4.8473334312438965, + "learning_rate": 2.6207912788104438e-05, + "loss": 2.194, + "step": 19910 + }, + { + "epoch": 1.3362974396832321, + "grad_norm": 3.8867409229278564, + "learning_rate": 2.6198354338730123e-05, + "loss": 1.9074, + "step": 19912 + }, + { + "epoch": 1.336431663367001, + "grad_norm": 4.276253700256348, + "learning_rate": 2.6188797013936762e-05, + "loss": 2.3109, + "step": 19914 + }, + { + "epoch": 1.3365658870507702, + "grad_norm": 4.182199478149414, + "learning_rate": 2.6179240814175897e-05, + "loss": 1.9667, + "step": 19916 + }, + { + "epoch": 1.336700110734539, + "grad_norm": 4.450270175933838, + "learning_rate": 2.6169685739899085e-05, + "loss": 2.2653, + "step": 19918 + }, + { + "epoch": 1.3368343344183082, + "grad_norm": 3.448652982711792, + "learning_rate": 2.6160131791557763e-05, + "loss": 1.8906, + "step": 19920 + }, + { + "epoch": 1.3369685581020772, + "grad_norm": 4.286525249481201, + "learning_rate": 2.6150578969603334e-05, + "loss": 2.1167, + "step": 19922 + }, + { + "epoch": 1.337102781785846, + "grad_norm": 4.22137975692749, + "learning_rate": 2.614102727448713e-05, + "loss": 2.1024, + "step": 19924 + }, + { + "epoch": 1.337237005469615, + "grad_norm": 3.945206642150879, + "learning_rate": 2.6131476706660485e-05, + "loss": 1.9436, + "step": 19926 + }, + { + "epoch": 1.3373712291533841, + "grad_norm": 4.481176853179932, + "learning_rate": 2.612192726657462e-05, + "loss": 1.7248, + "step": 19928 + }, + { + "epoch": 1.337505452837153, + "grad_norm": 3.988888740539551, + "learning_rate": 2.6112378954680773e-05, + "loss": 1.991, + "step": 19930 + }, + { + "epoch": 1.3376396765209222, + "grad_norm": 3.973032236099243, + "learning_rate": 2.6102831771430003e-05, + "loss": 1.9678, + "step": 19932 + }, + { + "epoch": 1.3377739002046911, + "grad_norm": 4.1029181480407715, + "learning_rate": 2.6093285717273465e-05, + "loss": 2.0185, + "step": 19934 + }, + { + "epoch": 1.33790812388846, + "grad_norm": 3.761995553970337, + "learning_rate": 2.6083740792662148e-05, + "loss": 2.1083, + "step": 19936 + }, + { + "epoch": 1.3380423475722292, + "grad_norm": 4.830238342285156, + "learning_rate": 2.607419699804706e-05, + "loss": 2.3267, + "step": 19938 + }, + { + "epoch": 1.338176571255998, + "grad_norm": 3.9792063236236572, + "learning_rate": 2.6064654333879123e-05, + "loss": 2.1124, + "step": 19940 + }, + { + "epoch": 1.3383107949397672, + "grad_norm": 4.524407386779785, + "learning_rate": 2.6055112800609206e-05, + "loss": 2.1318, + "step": 19942 + }, + { + "epoch": 1.3384450186235362, + "grad_norm": 4.153371810913086, + "learning_rate": 2.6045572398688095e-05, + "loss": 1.8524, + "step": 19944 + }, + { + "epoch": 1.338579242307305, + "grad_norm": 4.046847343444824, + "learning_rate": 2.6036033128566608e-05, + "loss": 2.1347, + "step": 19946 + }, + { + "epoch": 1.338713465991074, + "grad_norm": 4.630450248718262, + "learning_rate": 2.6026494990695426e-05, + "loss": 2.0934, + "step": 19948 + }, + { + "epoch": 1.3388476896748431, + "grad_norm": 4.3723320960998535, + "learning_rate": 2.6016957985525227e-05, + "loss": 1.984, + "step": 19950 + }, + { + "epoch": 1.338981913358612, + "grad_norm": 3.7095420360565186, + "learning_rate": 2.6007422113506573e-05, + "loss": 2.2118, + "step": 19952 + }, + { + "epoch": 1.3391161370423812, + "grad_norm": 3.777092933654785, + "learning_rate": 2.599788737509007e-05, + "loss": 2.1409, + "step": 19954 + }, + { + "epoch": 1.3392503607261501, + "grad_norm": 3.8634631633758545, + "learning_rate": 2.5988353770726166e-05, + "loss": 1.8746, + "step": 19956 + }, + { + "epoch": 1.339384584409919, + "grad_norm": 3.8447656631469727, + "learning_rate": 2.5978821300865368e-05, + "loss": 1.9129, + "step": 19958 + }, + { + "epoch": 1.3395188080936882, + "grad_norm": 3.8089959621429443, + "learning_rate": 2.5969289965957988e-05, + "loss": 1.5248, + "step": 19960 + }, + { + "epoch": 1.339653031777457, + "grad_norm": 4.377919673919678, + "learning_rate": 2.5959759766454428e-05, + "loss": 2.0188, + "step": 19962 + }, + { + "epoch": 1.3397872554612262, + "grad_norm": 4.079153537750244, + "learning_rate": 2.5950230702804923e-05, + "loss": 1.9697, + "step": 19964 + }, + { + "epoch": 1.3399214791449952, + "grad_norm": 4.12437629699707, + "learning_rate": 2.5940702775459747e-05, + "loss": 1.9045, + "step": 19966 + }, + { + "epoch": 1.340055702828764, + "grad_norm": 3.482039451599121, + "learning_rate": 2.593117598486905e-05, + "loss": 2.0455, + "step": 19968 + }, + { + "epoch": 1.3401899265125332, + "grad_norm": 4.0327348709106445, + "learning_rate": 2.5921650331482962e-05, + "loss": 1.8494, + "step": 19970 + }, + { + "epoch": 1.3403241501963021, + "grad_norm": 3.8163957595825195, + "learning_rate": 2.591212581575153e-05, + "loss": 1.6757, + "step": 19972 + }, + { + "epoch": 1.3404583738800713, + "grad_norm": 4.794153690338135, + "learning_rate": 2.59026024381248e-05, + "loss": 1.9571, + "step": 19974 + }, + { + "epoch": 1.3405925975638402, + "grad_norm": 3.836858034133911, + "learning_rate": 2.589308019905273e-05, + "loss": 2.2098, + "step": 19976 + }, + { + "epoch": 1.3407268212476091, + "grad_norm": 4.11005163192749, + "learning_rate": 2.5883559098985204e-05, + "loss": 2.1746, + "step": 19978 + }, + { + "epoch": 1.340861044931378, + "grad_norm": 4.051103591918945, + "learning_rate": 2.5874039138372075e-05, + "loss": 1.7887, + "step": 19980 + }, + { + "epoch": 1.3409952686151472, + "grad_norm": 4.024030685424805, + "learning_rate": 2.586452031766317e-05, + "loss": 2.0704, + "step": 19982 + }, + { + "epoch": 1.341129492298916, + "grad_norm": 4.177502155303955, + "learning_rate": 2.5855002637308224e-05, + "loss": 2.1275, + "step": 19984 + }, + { + "epoch": 1.3412637159826852, + "grad_norm": 3.8765268325805664, + "learning_rate": 2.584548609775692e-05, + "loss": 1.9102, + "step": 19986 + }, + { + "epoch": 1.3413979396664542, + "grad_norm": 3.9497992992401123, + "learning_rate": 2.5835970699458906e-05, + "loss": 1.8894, + "step": 19988 + }, + { + "epoch": 1.341532163350223, + "grad_norm": 4.056421279907227, + "learning_rate": 2.582645644286374e-05, + "loss": 1.9471, + "step": 19990 + }, + { + "epoch": 1.3416663870339922, + "grad_norm": 4.34148645401001, + "learning_rate": 2.5816943328420984e-05, + "loss": 2.2325, + "step": 19992 + }, + { + "epoch": 1.3418006107177611, + "grad_norm": 3.871450185775757, + "learning_rate": 2.5807431356580092e-05, + "loss": 1.8218, + "step": 19994 + }, + { + "epoch": 1.3419348344015303, + "grad_norm": 3.974705457687378, + "learning_rate": 2.5797920527790526e-05, + "loss": 1.9728, + "step": 19996 + }, + { + "epoch": 1.3420690580852992, + "grad_norm": 3.60797119140625, + "learning_rate": 2.5788410842501588e-05, + "loss": 2.0974, + "step": 19998 + }, + { + "epoch": 1.3422032817690681, + "grad_norm": 4.0194501876831055, + "learning_rate": 2.577890230116265e-05, + "loss": 1.8273, + "step": 20000 + }, + { + "epoch": 1.342337505452837, + "grad_norm": 3.7801713943481445, + "learning_rate": 2.5769394904222926e-05, + "loss": 1.9828, + "step": 20002 + }, + { + "epoch": 1.3424717291366062, + "grad_norm": 4.189940452575684, + "learning_rate": 2.575988865213167e-05, + "loss": 1.8258, + "step": 20004 + }, + { + "epoch": 1.342605952820375, + "grad_norm": 4.704155921936035, + "learning_rate": 2.5750383545338002e-05, + "loss": 2.0753, + "step": 20006 + }, + { + "epoch": 1.3427401765041442, + "grad_norm": 4.432606220245361, + "learning_rate": 2.5740879584291034e-05, + "loss": 1.9806, + "step": 20008 + }, + { + "epoch": 1.3428744001879132, + "grad_norm": 3.755955934524536, + "learning_rate": 2.5731376769439776e-05, + "loss": 2.0793, + "step": 20010 + }, + { + "epoch": 1.343008623871682, + "grad_norm": 4.2972517013549805, + "learning_rate": 2.572187510123326e-05, + "loss": 1.9808, + "step": 20012 + }, + { + "epoch": 1.3431428475554512, + "grad_norm": 4.140823841094971, + "learning_rate": 2.5712374580120414e-05, + "loss": 2.0334, + "step": 20014 + }, + { + "epoch": 1.3432770712392201, + "grad_norm": 4.179440021514893, + "learning_rate": 2.5702875206550105e-05, + "loss": 2.3127, + "step": 20016 + }, + { + "epoch": 1.3434112949229893, + "grad_norm": 4.06627893447876, + "learning_rate": 2.5693376980971145e-05, + "loss": 1.9733, + "step": 20018 + }, + { + "epoch": 1.3435455186067582, + "grad_norm": 4.492617130279541, + "learning_rate": 2.5683879903832347e-05, + "loss": 1.9739, + "step": 20020 + }, + { + "epoch": 1.3436797422905271, + "grad_norm": 4.214808940887451, + "learning_rate": 2.567438397558239e-05, + "loss": 2.059, + "step": 20022 + }, + { + "epoch": 1.343813965974296, + "grad_norm": 3.747004747390747, + "learning_rate": 2.5664889196669973e-05, + "loss": 1.8377, + "step": 20024 + }, + { + "epoch": 1.3439481896580652, + "grad_norm": 4.361414432525635, + "learning_rate": 2.5655395567543684e-05, + "loss": 2.0779, + "step": 20026 + }, + { + "epoch": 1.344082413341834, + "grad_norm": 4.398056983947754, + "learning_rate": 2.564590308865209e-05, + "loss": 1.8662, + "step": 20028 + }, + { + "epoch": 1.3442166370256032, + "grad_norm": 4.497387409210205, + "learning_rate": 2.5636411760443657e-05, + "loss": 2.1626, + "step": 20030 + }, + { + "epoch": 1.3443508607093722, + "grad_norm": 4.089743614196777, + "learning_rate": 2.5626921583366886e-05, + "loss": 1.9093, + "step": 20032 + }, + { + "epoch": 1.344485084393141, + "grad_norm": 4.53359842300415, + "learning_rate": 2.561743255787014e-05, + "loss": 2.0871, + "step": 20034 + }, + { + "epoch": 1.3446193080769102, + "grad_norm": 3.896087408065796, + "learning_rate": 2.5607944684401764e-05, + "loss": 1.8943, + "step": 20036 + }, + { + "epoch": 1.3447535317606791, + "grad_norm": 3.818002700805664, + "learning_rate": 2.559845796341001e-05, + "loss": 1.8551, + "step": 20038 + }, + { + "epoch": 1.3448877554444483, + "grad_norm": 4.07944393157959, + "learning_rate": 2.5588972395343157e-05, + "loss": 1.8949, + "step": 20040 + }, + { + "epoch": 1.3450219791282172, + "grad_norm": 4.009342670440674, + "learning_rate": 2.557948798064935e-05, + "loss": 2.3636, + "step": 20042 + }, + { + "epoch": 1.3451562028119861, + "grad_norm": 8.096025466918945, + "learning_rate": 2.557000471977673e-05, + "loss": 2.0033, + "step": 20044 + }, + { + "epoch": 1.3452904264957553, + "grad_norm": 4.115297317504883, + "learning_rate": 2.5560522613173314e-05, + "loss": 1.7736, + "step": 20046 + }, + { + "epoch": 1.3454246501795242, + "grad_norm": 4.5466156005859375, + "learning_rate": 2.555104166128717e-05, + "loss": 1.9334, + "step": 20048 + }, + { + "epoch": 1.3455588738632933, + "grad_norm": 4.223916530609131, + "learning_rate": 2.554156186456621e-05, + "loss": 2.2365, + "step": 20050 + }, + { + "epoch": 1.3456930975470622, + "grad_norm": 4.0216803550720215, + "learning_rate": 2.553208322345838e-05, + "loss": 1.9043, + "step": 20052 + }, + { + "epoch": 1.3458273212308312, + "grad_norm": 4.036929130554199, + "learning_rate": 2.552260573841152e-05, + "loss": 2.0582, + "step": 20054 + }, + { + "epoch": 1.3459615449146, + "grad_norm": 4.510562896728516, + "learning_rate": 2.5513129409873372e-05, + "loss": 1.8426, + "step": 20056 + }, + { + "epoch": 1.3460957685983692, + "grad_norm": 3.910784959793091, + "learning_rate": 2.550365423829172e-05, + "loss": 1.8027, + "step": 20058 + }, + { + "epoch": 1.3462299922821381, + "grad_norm": 4.003170013427734, + "learning_rate": 2.5494180224114217e-05, + "loss": 2.1561, + "step": 20060 + }, + { + "epoch": 1.3463642159659073, + "grad_norm": 3.1671559810638428, + "learning_rate": 2.548470736778853e-05, + "loss": 1.8312, + "step": 20062 + }, + { + "epoch": 1.3464984396496762, + "grad_norm": 3.547459125518799, + "learning_rate": 2.547523566976222e-05, + "loss": 2.0264, + "step": 20064 + }, + { + "epoch": 1.3466326633334451, + "grad_norm": 4.020153522491455, + "learning_rate": 2.546576513048279e-05, + "loss": 2.2038, + "step": 20066 + }, + { + "epoch": 1.3467668870172143, + "grad_norm": 3.9972338676452637, + "learning_rate": 2.5456295750397698e-05, + "loss": 1.9029, + "step": 20068 + }, + { + "epoch": 1.3469011107009832, + "grad_norm": 5.0472517013549805, + "learning_rate": 2.5446827529954385e-05, + "loss": 1.9921, + "step": 20070 + }, + { + "epoch": 1.3470353343847523, + "grad_norm": 5.023316860198975, + "learning_rate": 2.543736046960019e-05, + "loss": 2.01, + "step": 20072 + }, + { + "epoch": 1.3471695580685212, + "grad_norm": 3.7701680660247803, + "learning_rate": 2.5427894569782412e-05, + "loss": 1.7361, + "step": 20074 + }, + { + "epoch": 1.3473037817522902, + "grad_norm": 4.273250102996826, + "learning_rate": 2.5418429830948276e-05, + "loss": 2.0064, + "step": 20076 + }, + { + "epoch": 1.347438005436059, + "grad_norm": 3.9943389892578125, + "learning_rate": 2.540896625354502e-05, + "loss": 2.1221, + "step": 20078 + }, + { + "epoch": 1.3475722291198282, + "grad_norm": 4.4119157791137695, + "learning_rate": 2.5399503838019722e-05, + "loss": 1.8844, + "step": 20080 + }, + { + "epoch": 1.3477064528035971, + "grad_norm": 3.5805153846740723, + "learning_rate": 2.539004258481954e-05, + "loss": 1.8997, + "step": 20082 + }, + { + "epoch": 1.3478406764873663, + "grad_norm": 4.0629730224609375, + "learning_rate": 2.5380582494391413e-05, + "loss": 2.0735, + "step": 20084 + }, + { + "epoch": 1.3479749001711352, + "grad_norm": 4.095445156097412, + "learning_rate": 2.5371123567182363e-05, + "loss": 1.8821, + "step": 20086 + }, + { + "epoch": 1.3481091238549041, + "grad_norm": 3.9808719158172607, + "learning_rate": 2.5361665803639282e-05, + "loss": 2.0136, + "step": 20088 + }, + { + "epoch": 1.3482433475386733, + "grad_norm": 4.290801525115967, + "learning_rate": 2.5352209204209065e-05, + "loss": 2.0489, + "step": 20090 + }, + { + "epoch": 1.3483775712224422, + "grad_norm": 4.368113040924072, + "learning_rate": 2.534275376933849e-05, + "loss": 2.0973, + "step": 20092 + }, + { + "epoch": 1.3485117949062113, + "grad_norm": 3.7641093730926514, + "learning_rate": 2.533329949947431e-05, + "loss": 1.7744, + "step": 20094 + }, + { + "epoch": 1.3486460185899802, + "grad_norm": 4.313357830047607, + "learning_rate": 2.532384639506321e-05, + "loss": 1.9115, + "step": 20096 + }, + { + "epoch": 1.3487802422737492, + "grad_norm": 4.0519185066223145, + "learning_rate": 2.531439445655186e-05, + "loss": 1.8395, + "step": 20098 + }, + { + "epoch": 1.348914465957518, + "grad_norm": 4.269370079040527, + "learning_rate": 2.530494368438683e-05, + "loss": 1.8722, + "step": 20100 + }, + { + "epoch": 1.3490486896412872, + "grad_norm": 4.363927364349365, + "learning_rate": 2.5295494079014647e-05, + "loss": 2.0935, + "step": 20102 + }, + { + "epoch": 1.3491829133250561, + "grad_norm": 4.347434997558594, + "learning_rate": 2.528604564088178e-05, + "loss": 2.2014, + "step": 20104 + }, + { + "epoch": 1.3493171370088253, + "grad_norm": 5.326995849609375, + "learning_rate": 2.5276598370434666e-05, + "loss": 1.7636, + "step": 20106 + }, + { + "epoch": 1.3494513606925942, + "grad_norm": 4.095035076141357, + "learning_rate": 2.5267152268119653e-05, + "loss": 2.0268, + "step": 20108 + }, + { + "epoch": 1.3495855843763631, + "grad_norm": 4.137225151062012, + "learning_rate": 2.5257707334383095e-05, + "loss": 2.1055, + "step": 20110 + }, + { + "epoch": 1.3497198080601323, + "grad_norm": 3.7308878898620605, + "learning_rate": 2.524826356967117e-05, + "loss": 1.8882, + "step": 20112 + }, + { + "epoch": 1.3498540317439012, + "grad_norm": 4.203404426574707, + "learning_rate": 2.5238820974430143e-05, + "loss": 1.9087, + "step": 20114 + }, + { + "epoch": 1.3499882554276703, + "grad_norm": 4.099849700927734, + "learning_rate": 2.5229379549106115e-05, + "loss": 2.1091, + "step": 20116 + }, + { + "epoch": 1.3501224791114392, + "grad_norm": 5.899179935455322, + "learning_rate": 2.521993929414521e-05, + "loss": 2.0189, + "step": 20118 + }, + { + "epoch": 1.3502567027952082, + "grad_norm": 4.254183292388916, + "learning_rate": 2.521050020999347e-05, + "loss": 1.9974, + "step": 20120 + }, + { + "epoch": 1.3503909264789773, + "grad_norm": 4.057466983795166, + "learning_rate": 2.5201062297096807e-05, + "loss": 2.2823, + "step": 20122 + }, + { + "epoch": 1.3505251501627462, + "grad_norm": 3.743103504180908, + "learning_rate": 2.51916255559012e-05, + "loss": 2.0983, + "step": 20124 + }, + { + "epoch": 1.3506593738465154, + "grad_norm": 4.097569465637207, + "learning_rate": 2.518218998685249e-05, + "loss": 2.2198, + "step": 20126 + }, + { + "epoch": 1.3507935975302843, + "grad_norm": 3.7391841411590576, + "learning_rate": 2.5172755590396517e-05, + "loss": 2.0033, + "step": 20128 + }, + { + "epoch": 1.3509278212140532, + "grad_norm": 3.800960063934326, + "learning_rate": 2.5163322366979024e-05, + "loss": 2.0677, + "step": 20130 + }, + { + "epoch": 1.3510620448978221, + "grad_norm": 3.857581853866577, + "learning_rate": 2.5153890317045702e-05, + "loss": 1.8823, + "step": 20132 + }, + { + "epoch": 1.3511962685815913, + "grad_norm": 4.398401260375977, + "learning_rate": 2.5144459441042196e-05, + "loss": 2.1738, + "step": 20134 + }, + { + "epoch": 1.3513304922653602, + "grad_norm": 4.372436046600342, + "learning_rate": 2.5135029739414116e-05, + "loss": 1.9629, + "step": 20136 + }, + { + "epoch": 1.3514647159491293, + "grad_norm": 3.786027669906616, + "learning_rate": 2.5125601212606992e-05, + "loss": 1.8517, + "step": 20138 + }, + { + "epoch": 1.3515989396328982, + "grad_norm": 4.390629768371582, + "learning_rate": 2.5116173861066296e-05, + "loss": 1.831, + "step": 20140 + }, + { + "epoch": 1.3517331633166672, + "grad_norm": 3.355832815170288, + "learning_rate": 2.510674768523743e-05, + "loss": 1.8108, + "step": 20142 + }, + { + "epoch": 1.3518673870004363, + "grad_norm": 4.351818561553955, + "learning_rate": 2.509732268556581e-05, + "loss": 1.8892, + "step": 20144 + }, + { + "epoch": 1.3520016106842052, + "grad_norm": 4.208736419677734, + "learning_rate": 2.5087898862496705e-05, + "loss": 1.9182, + "step": 20146 + }, + { + "epoch": 1.3521358343679744, + "grad_norm": 4.517706394195557, + "learning_rate": 2.5078476216475422e-05, + "loss": 1.7985, + "step": 20148 + }, + { + "epoch": 1.3522700580517433, + "grad_norm": 4.084598541259766, + "learning_rate": 2.5069054747947102e-05, + "loss": 2.2219, + "step": 20150 + }, + { + "epoch": 1.3524042817355122, + "grad_norm": 4.067144393920898, + "learning_rate": 2.5059634457356933e-05, + "loss": 1.9044, + "step": 20152 + }, + { + "epoch": 1.3525385054192811, + "grad_norm": 4.41187858581543, + "learning_rate": 2.5050215345149975e-05, + "loss": 2.0495, + "step": 20154 + }, + { + "epoch": 1.3526727291030503, + "grad_norm": 3.922527551651001, + "learning_rate": 2.5040797411771306e-05, + "loss": 1.9594, + "step": 20156 + }, + { + "epoch": 1.3528069527868192, + "grad_norm": 4.269223213195801, + "learning_rate": 2.503138065766587e-05, + "loss": 1.8964, + "step": 20158 + }, + { + "epoch": 1.3529411764705883, + "grad_norm": 4.216597557067871, + "learning_rate": 2.502196508327861e-05, + "loss": 1.9158, + "step": 20160 + }, + { + "epoch": 1.3530754001543572, + "grad_norm": 4.238635063171387, + "learning_rate": 2.5012550689054355e-05, + "loss": 1.806, + "step": 20162 + }, + { + "epoch": 1.3532096238381262, + "grad_norm": 5.1449103355407715, + "learning_rate": 2.5003137475437966e-05, + "loss": 1.8292, + "step": 20164 + }, + { + "epoch": 1.3533438475218953, + "grad_norm": 3.6944522857666016, + "learning_rate": 2.499372544287417e-05, + "loss": 2.0751, + "step": 20166 + }, + { + "epoch": 1.3534780712056642, + "grad_norm": 4.217309474945068, + "learning_rate": 2.4984314591807682e-05, + "loss": 1.9466, + "step": 20168 + }, + { + "epoch": 1.3536122948894334, + "grad_norm": 4.197523593902588, + "learning_rate": 2.4974904922683118e-05, + "loss": 2.0402, + "step": 20170 + }, + { + "epoch": 1.3537465185732023, + "grad_norm": 3.9841456413269043, + "learning_rate": 2.4965496435945106e-05, + "loss": 1.8126, + "step": 20172 + }, + { + "epoch": 1.3538807422569712, + "grad_norm": 4.11088752746582, + "learning_rate": 2.495608913203814e-05, + "loss": 1.7617, + "step": 20174 + }, + { + "epoch": 1.3540149659407401, + "grad_norm": 4.325648307800293, + "learning_rate": 2.4946683011406752e-05, + "loss": 1.9806, + "step": 20176 + }, + { + "epoch": 1.3541491896245093, + "grad_norm": 4.3337860107421875, + "learning_rate": 2.493727807449529e-05, + "loss": 1.8382, + "step": 20178 + }, + { + "epoch": 1.3542834133082782, + "grad_norm": 4.622622966766357, + "learning_rate": 2.4927874321748173e-05, + "loss": 2.0078, + "step": 20180 + }, + { + "epoch": 1.3544176369920473, + "grad_norm": 3.1889686584472656, + "learning_rate": 2.4918471753609675e-05, + "loss": 1.7961, + "step": 20182 + }, + { + "epoch": 1.3545518606758162, + "grad_norm": 3.773325204849243, + "learning_rate": 2.4909070370524083e-05, + "loss": 1.9338, + "step": 20184 + }, + { + "epoch": 1.3546860843595852, + "grad_norm": 4.072450160980225, + "learning_rate": 2.489967017293558e-05, + "loss": 2.0395, + "step": 20186 + }, + { + "epoch": 1.3548203080433543, + "grad_norm": 4.232591152191162, + "learning_rate": 2.4890271161288302e-05, + "loss": 1.6062, + "step": 20188 + }, + { + "epoch": 1.3549545317271232, + "grad_norm": 4.775458335876465, + "learning_rate": 2.4880873336026332e-05, + "loss": 2.0655, + "step": 20190 + }, + { + "epoch": 1.3550887554108924, + "grad_norm": 4.040532112121582, + "learning_rate": 2.4871476697593687e-05, + "loss": 1.9708, + "step": 20192 + }, + { + "epoch": 1.3552229790946613, + "grad_norm": 4.160949230194092, + "learning_rate": 2.4862081246434378e-05, + "loss": 2.0344, + "step": 20194 + }, + { + "epoch": 1.3553572027784302, + "grad_norm": 4.829535484313965, + "learning_rate": 2.485268698299229e-05, + "loss": 2.0605, + "step": 20196 + }, + { + "epoch": 1.3554914264621993, + "grad_norm": 3.829749822616577, + "learning_rate": 2.4843293907711295e-05, + "loss": 1.8854, + "step": 20198 + }, + { + "epoch": 1.3556256501459683, + "grad_norm": 3.98630690574646, + "learning_rate": 2.4833902021035177e-05, + "loss": 1.7792, + "step": 20200 + }, + { + "epoch": 1.3557598738297374, + "grad_norm": 4.009222030639648, + "learning_rate": 2.482451132340772e-05, + "loss": 1.967, + "step": 20202 + }, + { + "epoch": 1.3558940975135063, + "grad_norm": 4.084018230438232, + "learning_rate": 2.4815121815272573e-05, + "loss": 2.0318, + "step": 20204 + }, + { + "epoch": 1.3560283211972752, + "grad_norm": 4.128713607788086, + "learning_rate": 2.480573349707344e-05, + "loss": 1.8751, + "step": 20206 + }, + { + "epoch": 1.3561625448810442, + "grad_norm": 4.2542877197265625, + "learning_rate": 2.479634636925382e-05, + "loss": 1.9201, + "step": 20208 + }, + { + "epoch": 1.3562967685648133, + "grad_norm": 4.379167556762695, + "learning_rate": 2.478696043225729e-05, + "loss": 1.8698, + "step": 20210 + }, + { + "epoch": 1.3564309922485822, + "grad_norm": 4.044027328491211, + "learning_rate": 2.477757568652728e-05, + "loss": 1.966, + "step": 20212 + }, + { + "epoch": 1.3565652159323514, + "grad_norm": 3.671684980392456, + "learning_rate": 2.4768192132507245e-05, + "loss": 1.9929, + "step": 20214 + }, + { + "epoch": 1.3566994396161203, + "grad_norm": 3.7002506256103516, + "learning_rate": 2.475880977064051e-05, + "loss": 1.8913, + "step": 20216 + }, + { + "epoch": 1.3568336632998892, + "grad_norm": 5.551513671875, + "learning_rate": 2.4749428601370384e-05, + "loss": 1.9988, + "step": 20218 + }, + { + "epoch": 1.3569678869836583, + "grad_norm": 3.875223398208618, + "learning_rate": 2.4740048625140078e-05, + "loss": 1.8606, + "step": 20220 + }, + { + "epoch": 1.3571021106674273, + "grad_norm": 4.1057915687561035, + "learning_rate": 2.473066984239282e-05, + "loss": 1.9229, + "step": 20222 + }, + { + "epoch": 1.3572363343511964, + "grad_norm": 4.096020221710205, + "learning_rate": 2.472129225357173e-05, + "loss": 1.8592, + "step": 20224 + }, + { + "epoch": 1.3573705580349653, + "grad_norm": 3.5702366828918457, + "learning_rate": 2.471191585911987e-05, + "loss": 1.959, + "step": 20226 + }, + { + "epoch": 1.3575047817187342, + "grad_norm": 4.603278636932373, + "learning_rate": 2.4702540659480238e-05, + "loss": 1.8812, + "step": 20228 + }, + { + "epoch": 1.3576390054025032, + "grad_norm": 4.481945991516113, + "learning_rate": 2.469316665509584e-05, + "loss": 2.0004, + "step": 20230 + }, + { + "epoch": 1.3577732290862723, + "grad_norm": 3.959070920944214, + "learning_rate": 2.4683793846409524e-05, + "loss": 2.0158, + "step": 20232 + }, + { + "epoch": 1.3579074527700412, + "grad_norm": 3.8968346118927, + "learning_rate": 2.4674422233864213e-05, + "loss": 1.8429, + "step": 20234 + }, + { + "epoch": 1.3580416764538104, + "grad_norm": 3.8128628730773926, + "learning_rate": 2.466505181790261e-05, + "loss": 1.9097, + "step": 20236 + }, + { + "epoch": 1.3581759001375793, + "grad_norm": 4.345405101776123, + "learning_rate": 2.465568259896751e-05, + "loss": 2.0053, + "step": 20238 + }, + { + "epoch": 1.3583101238213482, + "grad_norm": 3.77253794670105, + "learning_rate": 2.464631457750155e-05, + "loss": 1.9304, + "step": 20240 + }, + { + "epoch": 1.3584443475051173, + "grad_norm": 4.378432750701904, + "learning_rate": 2.4636947753947387e-05, + "loss": 1.8883, + "step": 20242 + }, + { + "epoch": 1.3585785711888863, + "grad_norm": 3.8392531871795654, + "learning_rate": 2.4627582128747567e-05, + "loss": 1.8885, + "step": 20244 + }, + { + "epoch": 1.3587127948726554, + "grad_norm": 3.6845905780792236, + "learning_rate": 2.4618217702344603e-05, + "loss": 1.9281, + "step": 20246 + }, + { + "epoch": 1.3588470185564243, + "grad_norm": 4.582029819488525, + "learning_rate": 2.460885447518092e-05, + "loss": 2.1852, + "step": 20248 + }, + { + "epoch": 1.3589812422401932, + "grad_norm": 5.1223320960998535, + "learning_rate": 2.4599492447698958e-05, + "loss": 2.0617, + "step": 20250 + }, + { + "epoch": 1.3591154659239622, + "grad_norm": 4.474033832550049, + "learning_rate": 2.4590131620341023e-05, + "loss": 1.9015, + "step": 20252 + }, + { + "epoch": 1.3592496896077313, + "grad_norm": 4.12116813659668, + "learning_rate": 2.4580771993549407e-05, + "loss": 1.9506, + "step": 20254 + }, + { + "epoch": 1.3593839132915002, + "grad_norm": 3.518620729446411, + "learning_rate": 2.4571413567766305e-05, + "loss": 1.8017, + "step": 20256 + }, + { + "epoch": 1.3595181369752694, + "grad_norm": 4.880042552947998, + "learning_rate": 2.4562056343433933e-05, + "loss": 2.1284, + "step": 20258 + }, + { + "epoch": 1.3596523606590383, + "grad_norm": 3.6160166263580322, + "learning_rate": 2.4552700320994377e-05, + "loss": 1.837, + "step": 20260 + }, + { + "epoch": 1.3597865843428072, + "grad_norm": 3.7176737785339355, + "learning_rate": 2.4543345500889687e-05, + "loss": 2.0601, + "step": 20262 + }, + { + "epoch": 1.3599208080265763, + "grad_norm": 3.8517775535583496, + "learning_rate": 2.4533991883561868e-05, + "loss": 1.6639, + "step": 20264 + }, + { + "epoch": 1.3600550317103453, + "grad_norm": 3.97622013092041, + "learning_rate": 2.4524639469452838e-05, + "loss": 1.891, + "step": 20266 + }, + { + "epoch": 1.3601892553941144, + "grad_norm": 4.392148017883301, + "learning_rate": 2.451528825900451e-05, + "loss": 2.2632, + "step": 20268 + }, + { + "epoch": 1.3603234790778833, + "grad_norm": 3.87080717086792, + "learning_rate": 2.4505938252658682e-05, + "loss": 2.188, + "step": 20270 + }, + { + "epoch": 1.3604577027616522, + "grad_norm": 5.014320373535156, + "learning_rate": 2.449658945085718e-05, + "loss": 1.8802, + "step": 20272 + }, + { + "epoch": 1.3605919264454214, + "grad_norm": 4.433290481567383, + "learning_rate": 2.4487241854041637e-05, + "loss": 2.0027, + "step": 20274 + }, + { + "epoch": 1.3607261501291903, + "grad_norm": 4.652792930603027, + "learning_rate": 2.4477895462653764e-05, + "loss": 1.8972, + "step": 20276 + }, + { + "epoch": 1.3608603738129594, + "grad_norm": 3.472485065460205, + "learning_rate": 2.4468550277135128e-05, + "loss": 1.856, + "step": 20278 + }, + { + "epoch": 1.3609945974967284, + "grad_norm": 4.1001434326171875, + "learning_rate": 2.4459206297927306e-05, + "loss": 1.791, + "step": 20280 + }, + { + "epoch": 1.3611288211804973, + "grad_norm": 3.9814250469207764, + "learning_rate": 2.4449863525471763e-05, + "loss": 1.9131, + "step": 20282 + }, + { + "epoch": 1.3612630448642662, + "grad_norm": 4.600988388061523, + "learning_rate": 2.4440521960209924e-05, + "loss": 2.0849, + "step": 20284 + }, + { + "epoch": 1.3613972685480353, + "grad_norm": 4.180093288421631, + "learning_rate": 2.4431181602583137e-05, + "loss": 2.1123, + "step": 20286 + }, + { + "epoch": 1.3615314922318043, + "grad_norm": 3.7972168922424316, + "learning_rate": 2.442184245303277e-05, + "loss": 1.9363, + "step": 20288 + }, + { + "epoch": 1.3616657159155734, + "grad_norm": 4.2858991622924805, + "learning_rate": 2.441250451200004e-05, + "loss": 1.8214, + "step": 20290 + }, + { + "epoch": 1.3617999395993423, + "grad_norm": 3.973268508911133, + "learning_rate": 2.4403167779926162e-05, + "loss": 2.0249, + "step": 20292 + }, + { + "epoch": 1.3619341632831112, + "grad_norm": 3.3481879234313965, + "learning_rate": 2.4393832257252252e-05, + "loss": 1.8615, + "step": 20294 + }, + { + "epoch": 1.3620683869668804, + "grad_norm": 3.936872720718384, + "learning_rate": 2.438449794441943e-05, + "loss": 1.8973, + "step": 20296 + }, + { + "epoch": 1.3622026106506493, + "grad_norm": 4.178407669067383, + "learning_rate": 2.4375164841868697e-05, + "loss": 1.7618, + "step": 20298 + }, + { + "epoch": 1.3623368343344184, + "grad_norm": 4.042469024658203, + "learning_rate": 2.4365832950041072e-05, + "loss": 1.7589, + "step": 20300 + }, + { + "epoch": 1.3624710580181874, + "grad_norm": 4.48480749130249, + "learning_rate": 2.4356502269377397e-05, + "loss": 2.3569, + "step": 20302 + }, + { + "epoch": 1.3626052817019563, + "grad_norm": 3.59830904006958, + "learning_rate": 2.434717280031858e-05, + "loss": 1.8257, + "step": 20304 + }, + { + "epoch": 1.3627395053857252, + "grad_norm": 6.366357803344727, + "learning_rate": 2.4337844543305387e-05, + "loss": 2.0654, + "step": 20306 + }, + { + "epoch": 1.3628737290694943, + "grad_norm": 4.118037223815918, + "learning_rate": 2.4328517498778596e-05, + "loss": 2.06, + "step": 20308 + }, + { + "epoch": 1.3630079527532633, + "grad_norm": 4.162081241607666, + "learning_rate": 2.431919166717887e-05, + "loss": 1.8177, + "step": 20310 + }, + { + "epoch": 1.3631421764370324, + "grad_norm": 3.673675298690796, + "learning_rate": 2.4309867048946838e-05, + "loss": 1.781, + "step": 20312 + }, + { + "epoch": 1.3632764001208013, + "grad_norm": 4.09054708480835, + "learning_rate": 2.4300543644523056e-05, + "loss": 1.8722, + "step": 20314 + }, + { + "epoch": 1.3634106238045702, + "grad_norm": 4.106335639953613, + "learning_rate": 2.429122145434807e-05, + "loss": 1.8859, + "step": 20316 + }, + { + "epoch": 1.3635448474883394, + "grad_norm": 4.404613971710205, + "learning_rate": 2.4281900478862306e-05, + "loss": 1.8864, + "step": 20318 + }, + { + "epoch": 1.3636790711721083, + "grad_norm": 4.586489677429199, + "learning_rate": 2.427258071850618e-05, + "loss": 2.1512, + "step": 20320 + }, + { + "epoch": 1.3638132948558774, + "grad_norm": 3.8526008129119873, + "learning_rate": 2.4263262173720008e-05, + "loss": 2.1909, + "step": 20322 + }, + { + "epoch": 1.3639475185396464, + "grad_norm": 4.213918209075928, + "learning_rate": 2.4253944844944105e-05, + "loss": 1.9291, + "step": 20324 + }, + { + "epoch": 1.3640817422234153, + "grad_norm": 4.128556728363037, + "learning_rate": 2.424462873261868e-05, + "loss": 1.8595, + "step": 20326 + }, + { + "epoch": 1.3642159659071842, + "grad_norm": 6.943046569824219, + "learning_rate": 2.42353138371839e-05, + "loss": 2.0404, + "step": 20328 + }, + { + "epoch": 1.3643501895909533, + "grad_norm": 3.698884963989258, + "learning_rate": 2.422600015907988e-05, + "loss": 1.5654, + "step": 20330 + }, + { + "epoch": 1.3644844132747223, + "grad_norm": 3.736354112625122, + "learning_rate": 2.421668769874665e-05, + "loss": 1.7706, + "step": 20332 + }, + { + "epoch": 1.3646186369584914, + "grad_norm": 4.012660503387451, + "learning_rate": 2.4207376456624243e-05, + "loss": 2.1202, + "step": 20334 + }, + { + "epoch": 1.3647528606422603, + "grad_norm": 4.226443767547607, + "learning_rate": 2.419806643315256e-05, + "loss": 2.0949, + "step": 20336 + }, + { + "epoch": 1.3648870843260292, + "grad_norm": 4.5535569190979, + "learning_rate": 2.4188757628771524e-05, + "loss": 1.8104, + "step": 20338 + }, + { + "epoch": 1.3650213080097984, + "grad_norm": 4.052158832550049, + "learning_rate": 2.4179450043920933e-05, + "loss": 1.877, + "step": 20340 + }, + { + "epoch": 1.3651555316935673, + "grad_norm": 4.823601722717285, + "learning_rate": 2.4170143679040552e-05, + "loss": 2.1065, + "step": 20342 + }, + { + "epoch": 1.3652897553773364, + "grad_norm": 4.481215476989746, + "learning_rate": 2.4160838534570074e-05, + "loss": 1.7469, + "step": 20344 + }, + { + "epoch": 1.3654239790611054, + "grad_norm": 4.344111919403076, + "learning_rate": 2.4151534610949178e-05, + "loss": 1.9672, + "step": 20346 + }, + { + "epoch": 1.3655582027448743, + "grad_norm": 3.513267993927002, + "learning_rate": 2.4142231908617445e-05, + "loss": 1.8918, + "step": 20348 + }, + { + "epoch": 1.3656924264286434, + "grad_norm": 4.466396331787109, + "learning_rate": 2.413293042801441e-05, + "loss": 2.1229, + "step": 20350 + }, + { + "epoch": 1.3658266501124123, + "grad_norm": 3.8804121017456055, + "learning_rate": 2.4123630169579526e-05, + "loss": 1.826, + "step": 20352 + }, + { + "epoch": 1.3659608737961815, + "grad_norm": 4.17012357711792, + "learning_rate": 2.4114331133752254e-05, + "loss": 1.9436, + "step": 20354 + }, + { + "epoch": 1.3660950974799504, + "grad_norm": 4.170745849609375, + "learning_rate": 2.410503332097193e-05, + "loss": 1.9386, + "step": 20356 + }, + { + "epoch": 1.3662293211637193, + "grad_norm": 5.3151936531066895, + "learning_rate": 2.4095736731677863e-05, + "loss": 2.1705, + "step": 20358 + }, + { + "epoch": 1.3663635448474882, + "grad_norm": 4.810800075531006, + "learning_rate": 2.4086441366309277e-05, + "loss": 1.987, + "step": 20360 + }, + { + "epoch": 1.3664977685312574, + "grad_norm": 3.988229990005493, + "learning_rate": 2.4077147225305403e-05, + "loss": 1.7131, + "step": 20362 + }, + { + "epoch": 1.3666319922150263, + "grad_norm": 3.8846957683563232, + "learning_rate": 2.4067854309105326e-05, + "loss": 1.8461, + "step": 20364 + }, + { + "epoch": 1.3667662158987954, + "grad_norm": 4.258449554443359, + "learning_rate": 2.4058562618148155e-05, + "loss": 1.9916, + "step": 20366 + }, + { + "epoch": 1.3669004395825644, + "grad_norm": 4.234399318695068, + "learning_rate": 2.4049272152872893e-05, + "loss": 2.035, + "step": 20368 + }, + { + "epoch": 1.3670346632663333, + "grad_norm": 4.286736488342285, + "learning_rate": 2.403998291371849e-05, + "loss": 2.0816, + "step": 20370 + }, + { + "epoch": 1.3671688869501024, + "grad_norm": 4.466689109802246, + "learning_rate": 2.4030694901123825e-05, + "loss": 2.1298, + "step": 20372 + }, + { + "epoch": 1.3673031106338713, + "grad_norm": 5.612724781036377, + "learning_rate": 2.402140811552778e-05, + "loss": 1.856, + "step": 20374 + }, + { + "epoch": 1.3674373343176405, + "grad_norm": 3.7365500926971436, + "learning_rate": 2.401212255736912e-05, + "loss": 1.9464, + "step": 20376 + }, + { + "epoch": 1.3675715580014094, + "grad_norm": 3.990419626235962, + "learning_rate": 2.400283822708656e-05, + "loss": 2.0135, + "step": 20378 + }, + { + "epoch": 1.3677057816851783, + "grad_norm": 7.414624214172363, + "learning_rate": 2.399355512511876e-05, + "loss": 1.7826, + "step": 20380 + }, + { + "epoch": 1.3678400053689472, + "grad_norm": 4.923806190490723, + "learning_rate": 2.3984273251904356e-05, + "loss": 1.8879, + "step": 20382 + }, + { + "epoch": 1.3679742290527164, + "grad_norm": 3.856438159942627, + "learning_rate": 2.3974992607881887e-05, + "loss": 1.9309, + "step": 20384 + }, + { + "epoch": 1.3681084527364853, + "grad_norm": 4.120314598083496, + "learning_rate": 2.3965713193489836e-05, + "loss": 1.8805, + "step": 20386 + }, + { + "epoch": 1.3682426764202544, + "grad_norm": 3.859229564666748, + "learning_rate": 2.3956435009166627e-05, + "loss": 1.9554, + "step": 20388 + }, + { + "epoch": 1.3683769001040234, + "grad_norm": 4.492728233337402, + "learning_rate": 2.394715805535067e-05, + "loss": 2.0488, + "step": 20390 + }, + { + "epoch": 1.3685111237877923, + "grad_norm": 4.144999027252197, + "learning_rate": 2.3937882332480243e-05, + "loss": 1.8925, + "step": 20392 + }, + { + "epoch": 1.3686453474715614, + "grad_norm": 4.401748180389404, + "learning_rate": 2.392860784099365e-05, + "loss": 2.0881, + "step": 20394 + }, + { + "epoch": 1.3687795711553303, + "grad_norm": 4.2797532081604, + "learning_rate": 2.3919334581329082e-05, + "loss": 2.0193, + "step": 20396 + }, + { + "epoch": 1.3689137948390995, + "grad_norm": 4.753538608551025, + "learning_rate": 2.391006255392464e-05, + "loss": 2.0429, + "step": 20398 + }, + { + "epoch": 1.3690480185228684, + "grad_norm": 3.4298174381256104, + "learning_rate": 2.390079175921845e-05, + "loss": 1.8965, + "step": 20400 + }, + { + "epoch": 1.3691822422066373, + "grad_norm": 3.978755235671997, + "learning_rate": 2.3891522197648508e-05, + "loss": 1.9905, + "step": 20402 + }, + { + "epoch": 1.3693164658904062, + "grad_norm": 4.8611741065979, + "learning_rate": 2.3882253869652828e-05, + "loss": 2.0253, + "step": 20404 + }, + { + "epoch": 1.3694506895741754, + "grad_norm": 4.677142143249512, + "learning_rate": 2.3872986775669288e-05, + "loss": 1.8036, + "step": 20406 + }, + { + "epoch": 1.3695849132579443, + "grad_norm": 4.083773612976074, + "learning_rate": 2.386372091613575e-05, + "loss": 1.796, + "step": 20408 + }, + { + "epoch": 1.3697191369417134, + "grad_norm": 4.307068347930908, + "learning_rate": 2.3854456291489987e-05, + "loss": 1.8635, + "step": 20410 + }, + { + "epoch": 1.3698533606254824, + "grad_norm": 3.926553964614868, + "learning_rate": 2.3845192902169766e-05, + "loss": 1.9347, + "step": 20412 + }, + { + "epoch": 1.3699875843092513, + "grad_norm": 4.5085296630859375, + "learning_rate": 2.383593074861275e-05, + "loss": 1.9586, + "step": 20414 + }, + { + "epoch": 1.3701218079930204, + "grad_norm": 3.624499559402466, + "learning_rate": 2.3826669831256554e-05, + "loss": 1.8319, + "step": 20416 + }, + { + "epoch": 1.3702560316767893, + "grad_norm": 4.077676296234131, + "learning_rate": 2.3817410150538728e-05, + "loss": 2.1657, + "step": 20418 + }, + { + "epoch": 1.3703902553605585, + "grad_norm": 4.15303373336792, + "learning_rate": 2.3808151706896804e-05, + "loss": 2.0603, + "step": 20420 + }, + { + "epoch": 1.3705244790443274, + "grad_norm": 5.06400728225708, + "learning_rate": 2.3798894500768183e-05, + "loss": 1.8303, + "step": 20422 + }, + { + "epoch": 1.3706587027280963, + "grad_norm": 4.002310276031494, + "learning_rate": 2.378963853259032e-05, + "loss": 1.9566, + "step": 20424 + }, + { + "epoch": 1.3707929264118655, + "grad_norm": 4.060419082641602, + "learning_rate": 2.3780383802800453e-05, + "loss": 1.8363, + "step": 20426 + }, + { + "epoch": 1.3709271500956344, + "grad_norm": 4.02881383895874, + "learning_rate": 2.3771130311835914e-05, + "loss": 2.0043, + "step": 20428 + }, + { + "epoch": 1.3710613737794035, + "grad_norm": 3.9351675510406494, + "learning_rate": 2.3761878060133868e-05, + "loss": 1.626, + "step": 20430 + }, + { + "epoch": 1.3711955974631724, + "grad_norm": 4.16959285736084, + "learning_rate": 2.3752627048131516e-05, + "loss": 2.0429, + "step": 20432 + }, + { + "epoch": 1.3713298211469414, + "grad_norm": 4.18046760559082, + "learning_rate": 2.3743377276265916e-05, + "loss": 1.9867, + "step": 20434 + }, + { + "epoch": 1.3714640448307103, + "grad_norm": 3.7848052978515625, + "learning_rate": 2.373412874497411e-05, + "loss": 1.7321, + "step": 20436 + }, + { + "epoch": 1.3715982685144794, + "grad_norm": 3.8764359951019287, + "learning_rate": 2.3724881454693048e-05, + "loss": 1.9283, + "step": 20438 + }, + { + "epoch": 1.3717324921982483, + "grad_norm": 3.9225387573242188, + "learning_rate": 2.3715635405859692e-05, + "loss": 1.827, + "step": 20440 + }, + { + "epoch": 1.3718667158820175, + "grad_norm": 4.070366859436035, + "learning_rate": 2.3706390598910878e-05, + "loss": 1.8835, + "step": 20442 + }, + { + "epoch": 1.3720009395657864, + "grad_norm": 3.772914171218872, + "learning_rate": 2.3697147034283407e-05, + "loss": 1.7872, + "step": 20444 + }, + { + "epoch": 1.3721351632495553, + "grad_norm": 3.9870400428771973, + "learning_rate": 2.3687904712413996e-05, + "loss": 1.8502, + "step": 20446 + }, + { + "epoch": 1.3722693869333245, + "grad_norm": 4.299996852874756, + "learning_rate": 2.367866363373936e-05, + "loss": 2.0497, + "step": 20448 + }, + { + "epoch": 1.3724036106170934, + "grad_norm": 3.899247884750366, + "learning_rate": 2.3669423798696094e-05, + "loss": 2.2388, + "step": 20450 + }, + { + "epoch": 1.3725378343008625, + "grad_norm": 4.087904930114746, + "learning_rate": 2.3660185207720813e-05, + "loss": 1.958, + "step": 20452 + }, + { + "epoch": 1.3726720579846314, + "grad_norm": 3.7378644943237305, + "learning_rate": 2.365094786124995e-05, + "loss": 1.8743, + "step": 20454 + }, + { + "epoch": 1.3728062816684004, + "grad_norm": 4.507745742797852, + "learning_rate": 2.3641711759720003e-05, + "loss": 2.0151, + "step": 20456 + }, + { + "epoch": 1.3729405053521693, + "grad_norm": 4.474036693572998, + "learning_rate": 2.363247690356733e-05, + "loss": 1.9517, + "step": 20458 + }, + { + "epoch": 1.3730747290359384, + "grad_norm": 4.164031982421875, + "learning_rate": 2.3623243293228287e-05, + "loss": 2.0866, + "step": 20460 + }, + { + "epoch": 1.3732089527197073, + "grad_norm": 3.9810495376586914, + "learning_rate": 2.3614010929139136e-05, + "loss": 1.7639, + "step": 20462 + }, + { + "epoch": 1.3733431764034765, + "grad_norm": 4.002496242523193, + "learning_rate": 2.360477981173608e-05, + "loss": 1.8937, + "step": 20464 + }, + { + "epoch": 1.3734774000872454, + "grad_norm": 3.824721574783325, + "learning_rate": 2.3595549941455276e-05, + "loss": 2.0245, + "step": 20466 + }, + { + "epoch": 1.3736116237710143, + "grad_norm": 6.422678470611572, + "learning_rate": 2.358632131873279e-05, + "loss": 1.8472, + "step": 20468 + }, + { + "epoch": 1.3737458474547835, + "grad_norm": 4.369184970855713, + "learning_rate": 2.3577093944004707e-05, + "loss": 2.1742, + "step": 20470 + }, + { + "epoch": 1.3738800711385524, + "grad_norm": 4.26785945892334, + "learning_rate": 2.3567867817706974e-05, + "loss": 2.2079, + "step": 20472 + }, + { + "epoch": 1.3740142948223215, + "grad_norm": 4.126256942749023, + "learning_rate": 2.355864294027551e-05, + "loss": 1.9003, + "step": 20474 + }, + { + "epoch": 1.3741485185060904, + "grad_norm": 4.244207382202148, + "learning_rate": 2.3549419312146153e-05, + "loss": 1.7416, + "step": 20476 + }, + { + "epoch": 1.3742827421898594, + "grad_norm": 4.296792030334473, + "learning_rate": 2.3540196933754743e-05, + "loss": 1.8148, + "step": 20478 + }, + { + "epoch": 1.3744169658736283, + "grad_norm": 4.616356372833252, + "learning_rate": 2.3530975805536993e-05, + "loss": 1.9659, + "step": 20480 + }, + { + "epoch": 1.3745511895573974, + "grad_norm": 3.9572789669036865, + "learning_rate": 2.352175592792859e-05, + "loss": 1.8365, + "step": 20482 + }, + { + "epoch": 1.3746854132411663, + "grad_norm": 3.599241018295288, + "learning_rate": 2.3512537301365134e-05, + "loss": 1.992, + "step": 20484 + }, + { + "epoch": 1.3748196369249355, + "grad_norm": 3.8939754962921143, + "learning_rate": 2.3503319926282218e-05, + "loss": 1.7512, + "step": 20486 + }, + { + "epoch": 1.3749538606087044, + "grad_norm": 3.904073476791382, + "learning_rate": 2.349410380311532e-05, + "loss": 2.0754, + "step": 20488 + }, + { + "epoch": 1.3750880842924733, + "grad_norm": 4.781492710113525, + "learning_rate": 2.348488893229991e-05, + "loss": 2.1283, + "step": 20490 + }, + { + "epoch": 1.3752223079762425, + "grad_norm": 3.982712507247925, + "learning_rate": 2.3475675314271363e-05, + "loss": 1.7295, + "step": 20492 + }, + { + "epoch": 1.3753565316600114, + "grad_norm": 4.308805465698242, + "learning_rate": 2.3466462949464996e-05, + "loss": 2.0251, + "step": 20494 + }, + { + "epoch": 1.3754907553437805, + "grad_norm": 4.064456939697266, + "learning_rate": 2.345725183831606e-05, + "loss": 2.0575, + "step": 20496 + }, + { + "epoch": 1.3756249790275494, + "grad_norm": 4.050580978393555, + "learning_rate": 2.34480419812598e-05, + "loss": 1.8182, + "step": 20498 + }, + { + "epoch": 1.3757592027113184, + "grad_norm": 3.639702081680298, + "learning_rate": 2.3438833378731338e-05, + "loss": 1.8243, + "step": 20500 + }, + { + "epoch": 1.3758934263950875, + "grad_norm": 4.326784610748291, + "learning_rate": 2.342962603116578e-05, + "loss": 2.0983, + "step": 20502 + }, + { + "epoch": 1.3760276500788564, + "grad_norm": 4.322518348693848, + "learning_rate": 2.342041993899812e-05, + "loss": 2.1263, + "step": 20504 + }, + { + "epoch": 1.3761618737626256, + "grad_norm": 4.400636672973633, + "learning_rate": 2.341121510266337e-05, + "loss": 1.9734, + "step": 20506 + }, + { + "epoch": 1.3762960974463945, + "grad_norm": 3.301471471786499, + "learning_rate": 2.340201152259642e-05, + "loss": 1.8443, + "step": 20508 + }, + { + "epoch": 1.3764303211301634, + "grad_norm": 4.14288330078125, + "learning_rate": 2.3392809199232123e-05, + "loss": 2.0059, + "step": 20510 + }, + { + "epoch": 1.3765645448139323, + "grad_norm": 3.90301251411438, + "learning_rate": 2.3383608133005253e-05, + "loss": 1.9332, + "step": 20512 + }, + { + "epoch": 1.3766987684977015, + "grad_norm": 4.28670072555542, + "learning_rate": 2.337440832435058e-05, + "loss": 1.8284, + "step": 20514 + }, + { + "epoch": 1.3768329921814704, + "grad_norm": 5.688412666320801, + "learning_rate": 2.3365209773702736e-05, + "loss": 2.1448, + "step": 20516 + }, + { + "epoch": 1.3769672158652395, + "grad_norm": 3.4985203742980957, + "learning_rate": 2.335601248149637e-05, + "loss": 1.9363, + "step": 20518 + }, + { + "epoch": 1.3771014395490084, + "grad_norm": 3.4234845638275146, + "learning_rate": 2.3346816448166025e-05, + "loss": 1.8831, + "step": 20520 + }, + { + "epoch": 1.3772356632327774, + "grad_norm": 4.410943508148193, + "learning_rate": 2.3337621674146193e-05, + "loss": 2.265, + "step": 20522 + }, + { + "epoch": 1.3773698869165465, + "grad_norm": 4.076196193695068, + "learning_rate": 2.332842815987129e-05, + "loss": 2.0178, + "step": 20524 + }, + { + "epoch": 1.3775041106003154, + "grad_norm": 4.01806116104126, + "learning_rate": 2.3319235905775727e-05, + "loss": 1.8548, + "step": 20526 + }, + { + "epoch": 1.3776383342840846, + "grad_norm": 4.1463847160339355, + "learning_rate": 2.33100449122938e-05, + "loss": 2.1949, + "step": 20528 + }, + { + "epoch": 1.3777725579678535, + "grad_norm": 4.628542900085449, + "learning_rate": 2.3300855179859765e-05, + "loss": 1.833, + "step": 20530 + }, + { + "epoch": 1.3779067816516224, + "grad_norm": 3.913482904434204, + "learning_rate": 2.3291666708907826e-05, + "loss": 1.9256, + "step": 20532 + }, + { + "epoch": 1.3780410053353913, + "grad_norm": 3.880052328109741, + "learning_rate": 2.3282479499872095e-05, + "loss": 1.9605, + "step": 20534 + }, + { + "epoch": 1.3781752290191605, + "grad_norm": 4.098966598510742, + "learning_rate": 2.3273293553186688e-05, + "loss": 2.0935, + "step": 20536 + }, + { + "epoch": 1.3783094527029294, + "grad_norm": 3.963575601577759, + "learning_rate": 2.3264108869285606e-05, + "loss": 2.0329, + "step": 20538 + }, + { + "epoch": 1.3784436763866985, + "grad_norm": 3.937150716781616, + "learning_rate": 2.3254925448602806e-05, + "loss": 2.0379, + "step": 20540 + }, + { + "epoch": 1.3785779000704674, + "grad_norm": 4.0225725173950195, + "learning_rate": 2.3245743291572164e-05, + "loss": 1.9649, + "step": 20542 + }, + { + "epoch": 1.3787121237542364, + "grad_norm": 4.27218770980835, + "learning_rate": 2.3236562398627566e-05, + "loss": 2.1349, + "step": 20544 + }, + { + "epoch": 1.3788463474380055, + "grad_norm": 4.032973766326904, + "learning_rate": 2.3227382770202744e-05, + "loss": 1.8399, + "step": 20546 + }, + { + "epoch": 1.3789805711217744, + "grad_norm": 4.028371810913086, + "learning_rate": 2.3218204406731474e-05, + "loss": 2.0592, + "step": 20548 + }, + { + "epoch": 1.3791147948055436, + "grad_norm": 3.816420078277588, + "learning_rate": 2.3209027308647352e-05, + "loss": 1.8199, + "step": 20550 + }, + { + "epoch": 1.3792490184893125, + "grad_norm": 4.411160469055176, + "learning_rate": 2.3199851476384022e-05, + "loss": 2.1411, + "step": 20552 + }, + { + "epoch": 1.3793832421730814, + "grad_norm": 4.872894763946533, + "learning_rate": 2.319067691037499e-05, + "loss": 1.9984, + "step": 20554 + }, + { + "epoch": 1.3795174658568503, + "grad_norm": 4.090254306793213, + "learning_rate": 2.318150361105378e-05, + "loss": 2.054, + "step": 20556 + }, + { + "epoch": 1.3796516895406195, + "grad_norm": 3.868131637573242, + "learning_rate": 2.3172331578853785e-05, + "loss": 1.7946, + "step": 20558 + }, + { + "epoch": 1.3797859132243884, + "grad_norm": 4.153293132781982, + "learning_rate": 2.3163160814208367e-05, + "loss": 2.0597, + "step": 20560 + }, + { + "epoch": 1.3799201369081575, + "grad_norm": 4.457843780517578, + "learning_rate": 2.315399131755081e-05, + "loss": 1.9845, + "step": 20562 + }, + { + "epoch": 1.3800543605919264, + "grad_norm": 3.742356061935425, + "learning_rate": 2.3144823089314388e-05, + "loss": 1.8173, + "step": 20564 + }, + { + "epoch": 1.3801885842756954, + "grad_norm": 3.8422133922576904, + "learning_rate": 2.313565612993227e-05, + "loss": 2.0177, + "step": 20566 + }, + { + "epoch": 1.3803228079594645, + "grad_norm": 4.107123374938965, + "learning_rate": 2.3126490439837578e-05, + "loss": 1.9962, + "step": 20568 + }, + { + "epoch": 1.3804570316432334, + "grad_norm": 4.400734901428223, + "learning_rate": 2.311732601946334e-05, + "loss": 2.0122, + "step": 20570 + }, + { + "epoch": 1.3805912553270026, + "grad_norm": 3.827207088470459, + "learning_rate": 2.310816286924261e-05, + "loss": 1.9662, + "step": 20572 + }, + { + "epoch": 1.3807254790107715, + "grad_norm": 4.997955322265625, + "learning_rate": 2.3099000989608288e-05, + "loss": 1.8732, + "step": 20574 + }, + { + "epoch": 1.3808597026945404, + "grad_norm": 4.19305419921875, + "learning_rate": 2.3089840380993305e-05, + "loss": 1.9027, + "step": 20576 + }, + { + "epoch": 1.3809939263783095, + "grad_norm": 4.462586402893066, + "learning_rate": 2.308068104383041e-05, + "loss": 1.8553, + "step": 20578 + }, + { + "epoch": 1.3811281500620785, + "grad_norm": 4.124268054962158, + "learning_rate": 2.3071522978552418e-05, + "loss": 1.8289, + "step": 20580 + }, + { + "epoch": 1.3812623737458474, + "grad_norm": 3.775672435760498, + "learning_rate": 2.3062366185591995e-05, + "loss": 1.8275, + "step": 20582 + }, + { + "epoch": 1.3813965974296165, + "grad_norm": 4.1330766677856445, + "learning_rate": 2.3053210665381818e-05, + "loss": 1.9271, + "step": 20584 + }, + { + "epoch": 1.3815308211133854, + "grad_norm": 4.337868690490723, + "learning_rate": 2.304405641835445e-05, + "loss": 2.0818, + "step": 20586 + }, + { + "epoch": 1.3816650447971544, + "grad_norm": 3.734902858734131, + "learning_rate": 2.3034903444942406e-05, + "loss": 1.8248, + "step": 20588 + }, + { + "epoch": 1.3817992684809235, + "grad_norm": 4.053591728210449, + "learning_rate": 2.3025751745578134e-05, + "loss": 2.0434, + "step": 20590 + }, + { + "epoch": 1.3819334921646924, + "grad_norm": 4.331628322601318, + "learning_rate": 2.3016601320694065e-05, + "loss": 2.1278, + "step": 20592 + }, + { + "epoch": 1.3820677158484616, + "grad_norm": 3.321974277496338, + "learning_rate": 2.3007452170722528e-05, + "loss": 1.8526, + "step": 20594 + }, + { + "epoch": 1.3822019395322305, + "grad_norm": 4.120316028594971, + "learning_rate": 2.299830429609579e-05, + "loss": 1.9227, + "step": 20596 + }, + { + "epoch": 1.3823361632159994, + "grad_norm": 4.087596893310547, + "learning_rate": 2.2989157697246066e-05, + "loss": 2.0104, + "step": 20598 + }, + { + "epoch": 1.3824703868997685, + "grad_norm": 3.686929225921631, + "learning_rate": 2.2980012374605542e-05, + "loss": 1.856, + "step": 20600 + }, + { + "epoch": 1.3826046105835375, + "grad_norm": 4.341070652008057, + "learning_rate": 2.29708683286063e-05, + "loss": 2.1713, + "step": 20602 + }, + { + "epoch": 1.3827388342673066, + "grad_norm": 3.7886455059051514, + "learning_rate": 2.296172555968038e-05, + "loss": 2.0131, + "step": 20604 + }, + { + "epoch": 1.3828730579510755, + "grad_norm": 3.9855289459228516, + "learning_rate": 2.2952584068259757e-05, + "loss": 1.9084, + "step": 20606 + }, + { + "epoch": 1.3830072816348444, + "grad_norm": 4.381679058074951, + "learning_rate": 2.2943443854776335e-05, + "loss": 1.9182, + "step": 20608 + }, + { + "epoch": 1.3831415053186134, + "grad_norm": 4.081211090087891, + "learning_rate": 2.2934304919662003e-05, + "loss": 1.9845, + "step": 20610 + }, + { + "epoch": 1.3832757290023825, + "grad_norm": 3.422076463699341, + "learning_rate": 2.2925167263348525e-05, + "loss": 1.8191, + "step": 20612 + }, + { + "epoch": 1.3834099526861514, + "grad_norm": 3.7705485820770264, + "learning_rate": 2.2916030886267687e-05, + "loss": 1.8404, + "step": 20614 + }, + { + "epoch": 1.3835441763699206, + "grad_norm": 4.419412136077881, + "learning_rate": 2.2906895788851097e-05, + "loss": 2.0106, + "step": 20616 + }, + { + "epoch": 1.3836784000536895, + "grad_norm": 5.882875442504883, + "learning_rate": 2.2897761971530425e-05, + "loss": 1.9534, + "step": 20618 + }, + { + "epoch": 1.3838126237374584, + "grad_norm": 3.760040283203125, + "learning_rate": 2.288862943473718e-05, + "loss": 2.0633, + "step": 20620 + }, + { + "epoch": 1.3839468474212275, + "grad_norm": 3.9979679584503174, + "learning_rate": 2.2879498178902904e-05, + "loss": 1.7554, + "step": 20622 + }, + { + "epoch": 1.3840810711049965, + "grad_norm": 3.9411590099334717, + "learning_rate": 2.2870368204459008e-05, + "loss": 1.8091, + "step": 20624 + }, + { + "epoch": 1.3842152947887656, + "grad_norm": 4.603240489959717, + "learning_rate": 2.2861239511836867e-05, + "loss": 1.9836, + "step": 20626 + }, + { + "epoch": 1.3843495184725345, + "grad_norm": 3.6936700344085693, + "learning_rate": 2.2852112101467778e-05, + "loss": 1.8593, + "step": 20628 + }, + { + "epoch": 1.3844837421563034, + "grad_norm": 4.344094276428223, + "learning_rate": 2.2842985973783026e-05, + "loss": 1.7711, + "step": 20630 + }, + { + "epoch": 1.3846179658400724, + "grad_norm": 3.6886656284332275, + "learning_rate": 2.283386112921378e-05, + "loss": 2.0713, + "step": 20632 + }, + { + "epoch": 1.3847521895238415, + "grad_norm": 3.6638636589050293, + "learning_rate": 2.2824737568191178e-05, + "loss": 2.014, + "step": 20634 + }, + { + "epoch": 1.3848864132076104, + "grad_norm": 4.656457901000977, + "learning_rate": 2.281561529114627e-05, + "loss": 2.0224, + "step": 20636 + }, + { + "epoch": 1.3850206368913796, + "grad_norm": 3.856358051300049, + "learning_rate": 2.28064942985101e-05, + "loss": 2.0983, + "step": 20638 + }, + { + "epoch": 1.3851548605751485, + "grad_norm": 3.8479108810424805, + "learning_rate": 2.279737459071359e-05, + "loss": 1.9174, + "step": 20640 + }, + { + "epoch": 1.3852890842589174, + "grad_norm": 4.100875377655029, + "learning_rate": 2.2788256168187665e-05, + "loss": 1.9594, + "step": 20642 + }, + { + "epoch": 1.3854233079426865, + "grad_norm": 3.911815881729126, + "learning_rate": 2.2779139031363096e-05, + "loss": 1.9423, + "step": 20644 + }, + { + "epoch": 1.3855575316264555, + "grad_norm": 4.530124664306641, + "learning_rate": 2.2770023180670698e-05, + "loss": 2.0893, + "step": 20646 + }, + { + "epoch": 1.3856917553102246, + "grad_norm": 3.893573760986328, + "learning_rate": 2.2760908616541136e-05, + "loss": 2.0536, + "step": 20648 + }, + { + "epoch": 1.3858259789939935, + "grad_norm": 3.8127596378326416, + "learning_rate": 2.2751795339405103e-05, + "loss": 1.8603, + "step": 20650 + }, + { + "epoch": 1.3859602026777624, + "grad_norm": 3.9434235095977783, + "learning_rate": 2.274268334969316e-05, + "loss": 1.9263, + "step": 20652 + }, + { + "epoch": 1.3860944263615316, + "grad_norm": 4.230140209197998, + "learning_rate": 2.2733572647835828e-05, + "loss": 2.0873, + "step": 20654 + }, + { + "epoch": 1.3862286500453005, + "grad_norm": 3.8365478515625, + "learning_rate": 2.2724463234263548e-05, + "loss": 1.9754, + "step": 20656 + }, + { + "epoch": 1.3863628737290694, + "grad_norm": 4.1605963706970215, + "learning_rate": 2.2715355109406766e-05, + "loss": 1.921, + "step": 20658 + }, + { + "epoch": 1.3864970974128386, + "grad_norm": 3.9274790287017822, + "learning_rate": 2.27062482736958e-05, + "loss": 2.1996, + "step": 20660 + }, + { + "epoch": 1.3866313210966075, + "grad_norm": 4.067019939422607, + "learning_rate": 2.2697142727560937e-05, + "loss": 1.9935, + "step": 20662 + }, + { + "epoch": 1.3867655447803764, + "grad_norm": 3.8365941047668457, + "learning_rate": 2.2688038471432373e-05, + "loss": 2.2219, + "step": 20664 + }, + { + "epoch": 1.3868997684641455, + "grad_norm": 4.184869766235352, + "learning_rate": 2.26789355057403e-05, + "loss": 1.9552, + "step": 20666 + }, + { + "epoch": 1.3870339921479145, + "grad_norm": 3.882554769515991, + "learning_rate": 2.2669833830914782e-05, + "loss": 1.891, + "step": 20668 + }, + { + "epoch": 1.3871682158316836, + "grad_norm": 3.792388677597046, + "learning_rate": 2.2660733447385894e-05, + "loss": 1.7244, + "step": 20670 + }, + { + "epoch": 1.3873024395154525, + "grad_norm": 3.6476967334747314, + "learning_rate": 2.2651634355583606e-05, + "loss": 2.005, + "step": 20672 + }, + { + "epoch": 1.3874366631992214, + "grad_norm": 4.153051376342773, + "learning_rate": 2.2642536555937783e-05, + "loss": 1.9683, + "step": 20674 + }, + { + "epoch": 1.3875708868829906, + "grad_norm": 3.880443572998047, + "learning_rate": 2.2633440048878323e-05, + "loss": 1.8875, + "step": 20676 + }, + { + "epoch": 1.3877051105667595, + "grad_norm": 4.296746253967285, + "learning_rate": 2.2624344834834992e-05, + "loss": 1.7639, + "step": 20678 + }, + { + "epoch": 1.3878393342505286, + "grad_norm": 4.03277587890625, + "learning_rate": 2.2615250914237558e-05, + "loss": 2.0262, + "step": 20680 + }, + { + "epoch": 1.3879735579342976, + "grad_norm": 4.319824695587158, + "learning_rate": 2.260615828751566e-05, + "loss": 1.9039, + "step": 20682 + }, + { + "epoch": 1.3881077816180665, + "grad_norm": 3.9740543365478516, + "learning_rate": 2.2597066955098923e-05, + "loss": 1.8805, + "step": 20684 + }, + { + "epoch": 1.3882420053018354, + "grad_norm": 4.263996601104736, + "learning_rate": 2.2587976917416864e-05, + "loss": 1.7738, + "step": 20686 + }, + { + "epoch": 1.3883762289856045, + "grad_norm": 4.381306171417236, + "learning_rate": 2.2578888174899014e-05, + "loss": 2.2843, + "step": 20688 + }, + { + "epoch": 1.3885104526693735, + "grad_norm": 3.733292818069458, + "learning_rate": 2.2569800727974777e-05, + "loss": 1.9636, + "step": 20690 + }, + { + "epoch": 1.3886446763531426, + "grad_norm": 3.7266221046447754, + "learning_rate": 2.2560714577073512e-05, + "loss": 1.753, + "step": 20692 + }, + { + "epoch": 1.3887789000369115, + "grad_norm": 4.52374792098999, + "learning_rate": 2.255162972262451e-05, + "loss": 1.8476, + "step": 20694 + }, + { + "epoch": 1.3889131237206804, + "grad_norm": 4.045181751251221, + "learning_rate": 2.254254616505705e-05, + "loss": 2.1627, + "step": 20696 + }, + { + "epoch": 1.3890473474044496, + "grad_norm": 3.926457405090332, + "learning_rate": 2.2533463904800268e-05, + "loss": 1.8313, + "step": 20698 + }, + { + "epoch": 1.3891815710882185, + "grad_norm": 3.9472129344940186, + "learning_rate": 2.2524382942283345e-05, + "loss": 1.9895, + "step": 20700 + }, + { + "epoch": 1.3893157947719876, + "grad_norm": 5.2707319259643555, + "learning_rate": 2.2515303277935257e-05, + "loss": 2.1413, + "step": 20702 + }, + { + "epoch": 1.3894500184557566, + "grad_norm": 3.576594591140747, + "learning_rate": 2.250622491218507e-05, + "loss": 1.9394, + "step": 20704 + }, + { + "epoch": 1.3895842421395255, + "grad_norm": 4.079716682434082, + "learning_rate": 2.2497147845461668e-05, + "loss": 1.9301, + "step": 20706 + }, + { + "epoch": 1.3897184658232944, + "grad_norm": 3.8863134384155273, + "learning_rate": 2.2488072078193968e-05, + "loss": 1.9396, + "step": 20708 + }, + { + "epoch": 1.3898526895070635, + "grad_norm": 3.584095001220703, + "learning_rate": 2.2478997610810764e-05, + "loss": 1.6324, + "step": 20710 + }, + { + "epoch": 1.3899869131908325, + "grad_norm": 4.115678310394287, + "learning_rate": 2.2469924443740804e-05, + "loss": 2.1547, + "step": 20712 + }, + { + "epoch": 1.3901211368746016, + "grad_norm": 4.4219865798950195, + "learning_rate": 2.2460852577412754e-05, + "loss": 2.2073, + "step": 20714 + }, + { + "epoch": 1.3902553605583705, + "grad_norm": 4.035862922668457, + "learning_rate": 2.2451782012255294e-05, + "loss": 1.9231, + "step": 20716 + }, + { + "epoch": 1.3903895842421394, + "grad_norm": 4.11376953125, + "learning_rate": 2.244271274869696e-05, + "loss": 1.9102, + "step": 20718 + }, + { + "epoch": 1.3905238079259086, + "grad_norm": 4.3340935707092285, + "learning_rate": 2.243364478716626e-05, + "loss": 1.9136, + "step": 20720 + }, + { + "epoch": 1.3906580316096775, + "grad_norm": 4.069490432739258, + "learning_rate": 2.2424578128091617e-05, + "loss": 2.3461, + "step": 20722 + }, + { + "epoch": 1.3907922552934466, + "grad_norm": 3.8842532634735107, + "learning_rate": 2.241551277190145e-05, + "loss": 1.8661, + "step": 20724 + }, + { + "epoch": 1.3909264789772156, + "grad_norm": 3.959810256958008, + "learning_rate": 2.2406448719024044e-05, + "loss": 2.1075, + "step": 20726 + }, + { + "epoch": 1.3910607026609845, + "grad_norm": 4.230012893676758, + "learning_rate": 2.2397385969887713e-05, + "loss": 2.1163, + "step": 20728 + }, + { + "epoch": 1.3911949263447536, + "grad_norm": 4.0240373611450195, + "learning_rate": 2.2388324524920577e-05, + "loss": 1.9219, + "step": 20730 + }, + { + "epoch": 1.3913291500285225, + "grad_norm": 3.994624137878418, + "learning_rate": 2.2379264384550836e-05, + "loss": 1.8939, + "step": 20732 + }, + { + "epoch": 1.3914633737122915, + "grad_norm": 3.6859405040740967, + "learning_rate": 2.237020554920652e-05, + "loss": 1.7568, + "step": 20734 + }, + { + "epoch": 1.3915975973960606, + "grad_norm": 3.736517906188965, + "learning_rate": 2.236114801931567e-05, + "loss": 1.8618, + "step": 20736 + }, + { + "epoch": 1.3917318210798295, + "grad_norm": 4.111973285675049, + "learning_rate": 2.2352091795306252e-05, + "loss": 1.8513, + "step": 20738 + }, + { + "epoch": 1.3918660447635984, + "grad_norm": 4.257200241088867, + "learning_rate": 2.2343036877606094e-05, + "loss": 2.313, + "step": 20740 + }, + { + "epoch": 1.3920002684473676, + "grad_norm": 3.803255081176758, + "learning_rate": 2.233398326664307e-05, + "loss": 1.7831, + "step": 20742 + }, + { + "epoch": 1.3921344921311365, + "grad_norm": 4.175565242767334, + "learning_rate": 2.232493096284492e-05, + "loss": 1.9569, + "step": 20744 + }, + { + "epoch": 1.3922687158149056, + "grad_norm": 4.095466613769531, + "learning_rate": 2.231587996663938e-05, + "loss": 2.1426, + "step": 20746 + }, + { + "epoch": 1.3924029394986746, + "grad_norm": 4.4938578605651855, + "learning_rate": 2.2306830278454072e-05, + "loss": 2.1334, + "step": 20748 + }, + { + "epoch": 1.3925371631824435, + "grad_norm": 3.830733060836792, + "learning_rate": 2.229778189871658e-05, + "loss": 1.9213, + "step": 20750 + }, + { + "epoch": 1.3926713868662126, + "grad_norm": 4.3449602127075195, + "learning_rate": 2.2288734827854395e-05, + "loss": 1.9832, + "step": 20752 + }, + { + "epoch": 1.3928056105499815, + "grad_norm": 4.110617160797119, + "learning_rate": 2.2279689066295022e-05, + "loss": 1.9355, + "step": 20754 + }, + { + "epoch": 1.3929398342337507, + "grad_norm": 4.528824806213379, + "learning_rate": 2.227064461446583e-05, + "loss": 2.1674, + "step": 20756 + }, + { + "epoch": 1.3930740579175196, + "grad_norm": 4.211117744445801, + "learning_rate": 2.2261601472794147e-05, + "loss": 2.0126, + "step": 20758 + }, + { + "epoch": 1.3932082816012885, + "grad_norm": 4.185275554656982, + "learning_rate": 2.2252559641707237e-05, + "loss": 1.6686, + "step": 20760 + }, + { + "epoch": 1.3933425052850574, + "grad_norm": 3.895505905151367, + "learning_rate": 2.2243519121632332e-05, + "loss": 1.9221, + "step": 20762 + }, + { + "epoch": 1.3934767289688266, + "grad_norm": 4.097986698150635, + "learning_rate": 2.2234479912996557e-05, + "loss": 2.1311, + "step": 20764 + }, + { + "epoch": 1.3936109526525955, + "grad_norm": 4.295762062072754, + "learning_rate": 2.2225442016227044e-05, + "loss": 2.4331, + "step": 20766 + }, + { + "epoch": 1.3937451763363646, + "grad_norm": 4.6195292472839355, + "learning_rate": 2.2216405431750735e-05, + "loss": 2.1233, + "step": 20768 + }, + { + "epoch": 1.3938794000201336, + "grad_norm": 4.237769603729248, + "learning_rate": 2.2207370159994662e-05, + "loss": 2.1708, + "step": 20770 + }, + { + "epoch": 1.3940136237039025, + "grad_norm": 3.8090999126434326, + "learning_rate": 2.2198336201385674e-05, + "loss": 1.7955, + "step": 20772 + }, + { + "epoch": 1.3941478473876716, + "grad_norm": 3.656283378601074, + "learning_rate": 2.218930355635065e-05, + "loss": 1.7901, + "step": 20774 + }, + { + "epoch": 1.3942820710714405, + "grad_norm": 4.255315780639648, + "learning_rate": 2.2180272225316346e-05, + "loss": 1.9536, + "step": 20776 + }, + { + "epoch": 1.3944162947552097, + "grad_norm": 3.893555164337158, + "learning_rate": 2.2171242208709474e-05, + "loss": 1.8781, + "step": 20778 + }, + { + "epoch": 1.3945505184389786, + "grad_norm": 4.2845306396484375, + "learning_rate": 2.216221350695667e-05, + "loss": 2.0765, + "step": 20780 + }, + { + "epoch": 1.3946847421227475, + "grad_norm": 6.094376087188721, + "learning_rate": 2.2153186120484546e-05, + "loss": 2.2206, + "step": 20782 + }, + { + "epoch": 1.3948189658065164, + "grad_norm": 4.325292587280273, + "learning_rate": 2.2144160049719632e-05, + "loss": 2.0095, + "step": 20784 + }, + { + "epoch": 1.3949531894902856, + "grad_norm": 4.200784206390381, + "learning_rate": 2.2135135295088373e-05, + "loss": 2.047, + "step": 20786 + }, + { + "epoch": 1.3950874131740545, + "grad_norm": 4.649842739105225, + "learning_rate": 2.2126111857017157e-05, + "loss": 2.1284, + "step": 20788 + }, + { + "epoch": 1.3952216368578236, + "grad_norm": 4.663277626037598, + "learning_rate": 2.2117089735932367e-05, + "loss": 2.3219, + "step": 20790 + }, + { + "epoch": 1.3953558605415926, + "grad_norm": 4.21945858001709, + "learning_rate": 2.2108068932260235e-05, + "loss": 2.0284, + "step": 20792 + }, + { + "epoch": 1.3954900842253615, + "grad_norm": 4.243096828460693, + "learning_rate": 2.2099049446427038e-05, + "loss": 2.0501, + "step": 20794 + }, + { + "epoch": 1.3956243079091306, + "grad_norm": 4.3069658279418945, + "learning_rate": 2.2090031278858854e-05, + "loss": 1.9915, + "step": 20796 + }, + { + "epoch": 1.3957585315928995, + "grad_norm": 3.981990098953247, + "learning_rate": 2.208101442998182e-05, + "loss": 1.8642, + "step": 20798 + }, + { + "epoch": 1.3958927552766687, + "grad_norm": 4.361700534820557, + "learning_rate": 2.2071998900221947e-05, + "loss": 1.9585, + "step": 20800 + }, + { + "epoch": 1.3960269789604376, + "grad_norm": 4.3721818923950195, + "learning_rate": 2.2062984690005224e-05, + "loss": 1.8779, + "step": 20802 + }, + { + "epoch": 1.3961612026442065, + "grad_norm": 3.9552500247955322, + "learning_rate": 2.2053971799757535e-05, + "loss": 1.9277, + "step": 20804 + }, + { + "epoch": 1.3962954263279757, + "grad_norm": 3.8710219860076904, + "learning_rate": 2.2044960229904722e-05, + "loss": 2.0318, + "step": 20806 + }, + { + "epoch": 1.3964296500117446, + "grad_norm": 4.636574745178223, + "learning_rate": 2.2035949980872573e-05, + "loss": 2.0571, + "step": 20808 + }, + { + "epoch": 1.3965638736955135, + "grad_norm": 4.086299419403076, + "learning_rate": 2.2026941053086773e-05, + "loss": 2.2808, + "step": 20810 + }, + { + "epoch": 1.3966980973792826, + "grad_norm": 3.979862689971924, + "learning_rate": 2.2017933446973027e-05, + "loss": 1.8346, + "step": 20812 + }, + { + "epoch": 1.3968323210630516, + "grad_norm": 4.008800029754639, + "learning_rate": 2.20089271629569e-05, + "loss": 1.7964, + "step": 20814 + }, + { + "epoch": 1.3969665447468205, + "grad_norm": 3.7588653564453125, + "learning_rate": 2.199992220146392e-05, + "loss": 1.9688, + "step": 20816 + }, + { + "epoch": 1.3971007684305896, + "grad_norm": 3.8914294242858887, + "learning_rate": 2.199091856291954e-05, + "loss": 1.7811, + "step": 20818 + }, + { + "epoch": 1.3972349921143585, + "grad_norm": 3.981041431427002, + "learning_rate": 2.1981916247749196e-05, + "loss": 1.9234, + "step": 20820 + }, + { + "epoch": 1.3973692157981277, + "grad_norm": 3.778993844985962, + "learning_rate": 2.1972915256378217e-05, + "loss": 1.8278, + "step": 20822 + }, + { + "epoch": 1.3975034394818966, + "grad_norm": 4.563928127288818, + "learning_rate": 2.1963915589231876e-05, + "loss": 1.9997, + "step": 20824 + }, + { + "epoch": 1.3976376631656655, + "grad_norm": 3.7630178928375244, + "learning_rate": 2.195491724673538e-05, + "loss": 1.8102, + "step": 20826 + }, + { + "epoch": 1.3977718868494347, + "grad_norm": 4.7153401374816895, + "learning_rate": 2.194592022931391e-05, + "loss": 1.9882, + "step": 20828 + }, + { + "epoch": 1.3979061105332036, + "grad_norm": 4.428680419921875, + "learning_rate": 2.1936924537392527e-05, + "loss": 1.7508, + "step": 20830 + }, + { + "epoch": 1.3980403342169727, + "grad_norm": 4.349248886108398, + "learning_rate": 2.1927930171396293e-05, + "loss": 2.2387, + "step": 20832 + }, + { + "epoch": 1.3981745579007416, + "grad_norm": 3.878509283065796, + "learning_rate": 2.1918937131750157e-05, + "loss": 2.1484, + "step": 20834 + }, + { + "epoch": 1.3983087815845106, + "grad_norm": 4.44680118560791, + "learning_rate": 2.1909945418879017e-05, + "loss": 1.9057, + "step": 20836 + }, + { + "epoch": 1.3984430052682795, + "grad_norm": 4.1436614990234375, + "learning_rate": 2.190095503320771e-05, + "loss": 2.1906, + "step": 20838 + }, + { + "epoch": 1.3985772289520486, + "grad_norm": 3.284787893295288, + "learning_rate": 2.1891965975161037e-05, + "loss": 1.8924, + "step": 20840 + }, + { + "epoch": 1.3987114526358175, + "grad_norm": 4.637693405151367, + "learning_rate": 2.1882978245163705e-05, + "loss": 1.9133, + "step": 20842 + }, + { + "epoch": 1.3988456763195867, + "grad_norm": 3.798630475997925, + "learning_rate": 2.187399184364035e-05, + "loss": 1.9999, + "step": 20844 + }, + { + "epoch": 1.3989799000033556, + "grad_norm": 4.3002753257751465, + "learning_rate": 2.1865006771015567e-05, + "loss": 1.9284, + "step": 20846 + }, + { + "epoch": 1.3991141236871245, + "grad_norm": 3.662353277206421, + "learning_rate": 2.18560230277139e-05, + "loss": 1.7694, + "step": 20848 + }, + { + "epoch": 1.3992483473708937, + "grad_norm": 4.1159348487854, + "learning_rate": 2.1847040614159804e-05, + "loss": 2.1001, + "step": 20850 + }, + { + "epoch": 1.3993825710546626, + "grad_norm": 4.271273612976074, + "learning_rate": 2.1838059530777677e-05, + "loss": 1.9274, + "step": 20852 + }, + { + "epoch": 1.3995167947384317, + "grad_norm": 3.700115203857422, + "learning_rate": 2.1829079777991844e-05, + "loss": 1.9553, + "step": 20854 + }, + { + "epoch": 1.3996510184222006, + "grad_norm": 3.2946319580078125, + "learning_rate": 2.1820101356226612e-05, + "loss": 1.862, + "step": 20856 + }, + { + "epoch": 1.3997852421059696, + "grad_norm": 3.860799789428711, + "learning_rate": 2.1811124265906157e-05, + "loss": 1.9665, + "step": 20858 + }, + { + "epoch": 1.3999194657897385, + "grad_norm": 3.6631762981414795, + "learning_rate": 2.180214850745467e-05, + "loss": 2.0336, + "step": 20860 + }, + { + "epoch": 1.4000536894735076, + "grad_norm": 4.1038031578063965, + "learning_rate": 2.179317408129622e-05, + "loss": 2.0874, + "step": 20862 + }, + { + "epoch": 1.4001879131572765, + "grad_norm": 3.7889833450317383, + "learning_rate": 2.1784200987854825e-05, + "loss": 1.7887, + "step": 20864 + }, + { + "epoch": 1.4003221368410457, + "grad_norm": 3.6102592945098877, + "learning_rate": 2.1775229227554433e-05, + "loss": 1.753, + "step": 20866 + }, + { + "epoch": 1.4004563605248146, + "grad_norm": 3.8086280822753906, + "learning_rate": 2.1766258800818972e-05, + "loss": 2.2936, + "step": 20868 + }, + { + "epoch": 1.4005905842085835, + "grad_norm": 4.624703407287598, + "learning_rate": 2.1757289708072272e-05, + "loss": 1.889, + "step": 20870 + }, + { + "epoch": 1.4007248078923527, + "grad_norm": 4.277683258056641, + "learning_rate": 2.1748321949738088e-05, + "loss": 1.916, + "step": 20872 + }, + { + "epoch": 1.4008590315761216, + "grad_norm": 5.8949174880981445, + "learning_rate": 2.1739355526240124e-05, + "loss": 2.0244, + "step": 20874 + }, + { + "epoch": 1.4009932552598907, + "grad_norm": 4.794247150421143, + "learning_rate": 2.173039043800206e-05, + "loss": 1.9884, + "step": 20876 + }, + { + "epoch": 1.4011274789436596, + "grad_norm": 4.469595909118652, + "learning_rate": 2.172142668544745e-05, + "loss": 1.8763, + "step": 20878 + }, + { + "epoch": 1.4012617026274286, + "grad_norm": 4.171546936035156, + "learning_rate": 2.1712464268999826e-05, + "loss": 1.9627, + "step": 20880 + }, + { + "epoch": 1.4013959263111977, + "grad_norm": 4.434277534484863, + "learning_rate": 2.1703503189082634e-05, + "loss": 2.1301, + "step": 20882 + }, + { + "epoch": 1.4015301499949666, + "grad_norm": 4.149141788482666, + "learning_rate": 2.1694543446119265e-05, + "loss": 1.7995, + "step": 20884 + }, + { + "epoch": 1.4016643736787355, + "grad_norm": 3.677117347717285, + "learning_rate": 2.1685585040533075e-05, + "loss": 1.9969, + "step": 20886 + }, + { + "epoch": 1.4017985973625047, + "grad_norm": 4.34774112701416, + "learning_rate": 2.167662797274729e-05, + "loss": 2.0363, + "step": 20888 + }, + { + "epoch": 1.4019328210462736, + "grad_norm": 3.974036693572998, + "learning_rate": 2.1667672243185178e-05, + "loss": 1.769, + "step": 20890 + }, + { + "epoch": 1.4020670447300425, + "grad_norm": 3.799644947052002, + "learning_rate": 2.16587178522698e-05, + "loss": 1.8189, + "step": 20892 + }, + { + "epoch": 1.4022012684138117, + "grad_norm": 4.030343055725098, + "learning_rate": 2.1649764800424294e-05, + "loss": 2.1807, + "step": 20894 + }, + { + "epoch": 1.4023354920975806, + "grad_norm": 4.192046165466309, + "learning_rate": 2.1640813088071633e-05, + "loss": 2.179, + "step": 20896 + }, + { + "epoch": 1.4024697157813497, + "grad_norm": 3.6630988121032715, + "learning_rate": 2.163186271563481e-05, + "loss": 1.7486, + "step": 20898 + }, + { + "epoch": 1.4026039394651186, + "grad_norm": 3.9933767318725586, + "learning_rate": 2.1622913683536684e-05, + "loss": 1.794, + "step": 20900 + }, + { + "epoch": 1.4027381631488876, + "grad_norm": 4.082265853881836, + "learning_rate": 2.1613965992200087e-05, + "loss": 1.9705, + "step": 20902 + }, + { + "epoch": 1.4028723868326567, + "grad_norm": 4.447140693664551, + "learning_rate": 2.1605019642047765e-05, + "loss": 2.0495, + "step": 20904 + }, + { + "epoch": 1.4030066105164256, + "grad_norm": 3.9584269523620605, + "learning_rate": 2.159607463350244e-05, + "loss": 1.7299, + "step": 20906 + }, + { + "epoch": 1.4031408342001948, + "grad_norm": 4.283885955810547, + "learning_rate": 2.1587130966986735e-05, + "loss": 1.7594, + "step": 20908 + }, + { + "epoch": 1.4032750578839637, + "grad_norm": 5.2860283851623535, + "learning_rate": 2.1578188642923225e-05, + "loss": 2.2582, + "step": 20910 + }, + { + "epoch": 1.4034092815677326, + "grad_norm": 4.62113618850708, + "learning_rate": 2.1569247661734392e-05, + "loss": 2.2715, + "step": 20912 + }, + { + "epoch": 1.4035435052515015, + "grad_norm": 3.7636706829071045, + "learning_rate": 2.1560308023842717e-05, + "loss": 1.9185, + "step": 20914 + }, + { + "epoch": 1.4036777289352707, + "grad_norm": 4.365087509155273, + "learning_rate": 2.1551369729670546e-05, + "loss": 1.7943, + "step": 20916 + }, + { + "epoch": 1.4038119526190396, + "grad_norm": 3.8688158988952637, + "learning_rate": 2.1542432779640255e-05, + "loss": 1.8782, + "step": 20918 + }, + { + "epoch": 1.4039461763028087, + "grad_norm": 3.990319013595581, + "learning_rate": 2.153349717417401e-05, + "loss": 1.9497, + "step": 20920 + }, + { + "epoch": 1.4040803999865776, + "grad_norm": 3.983217477798462, + "learning_rate": 2.1524562913694074e-05, + "loss": 1.9605, + "step": 20922 + }, + { + "epoch": 1.4042146236703466, + "grad_norm": 3.510162115097046, + "learning_rate": 2.151562999862252e-05, + "loss": 2.0033, + "step": 20924 + }, + { + "epoch": 1.4043488473541157, + "grad_norm": 4.114282608032227, + "learning_rate": 2.150669842938146e-05, + "loss": 2.2316, + "step": 20926 + }, + { + "epoch": 1.4044830710378846, + "grad_norm": 4.172731876373291, + "learning_rate": 2.1497768206392864e-05, + "loss": 2.0063, + "step": 20928 + }, + { + "epoch": 1.4046172947216538, + "grad_norm": 3.819817543029785, + "learning_rate": 2.148883933007868e-05, + "loss": 1.9699, + "step": 20930 + }, + { + "epoch": 1.4047515184054227, + "grad_norm": 3.6852619647979736, + "learning_rate": 2.1479911800860752e-05, + "loss": 1.9318, + "step": 20932 + }, + { + "epoch": 1.4048857420891916, + "grad_norm": 4.814490795135498, + "learning_rate": 2.1470985619160926e-05, + "loss": 1.7574, + "step": 20934 + }, + { + "epoch": 1.4050199657729605, + "grad_norm": 3.3709208965301514, + "learning_rate": 2.146206078540093e-05, + "loss": 1.8136, + "step": 20936 + }, + { + "epoch": 1.4051541894567297, + "grad_norm": 3.8084869384765625, + "learning_rate": 2.1453137300002445e-05, + "loss": 1.8912, + "step": 20938 + }, + { + "epoch": 1.4052884131404986, + "grad_norm": 4.5270209312438965, + "learning_rate": 2.1444215163387067e-05, + "loss": 2.0178, + "step": 20940 + }, + { + "epoch": 1.4054226368242677, + "grad_norm": 4.077504634857178, + "learning_rate": 2.143529437597639e-05, + "loss": 2.1533, + "step": 20942 + }, + { + "epoch": 1.4055568605080366, + "grad_norm": 4.238177299499512, + "learning_rate": 2.1426374938191884e-05, + "loss": 2.0916, + "step": 20944 + }, + { + "epoch": 1.4056910841918056, + "grad_norm": 4.037965297698975, + "learning_rate": 2.1417456850454976e-05, + "loss": 1.8104, + "step": 20946 + }, + { + "epoch": 1.4058253078755747, + "grad_norm": 3.5017757415771484, + "learning_rate": 2.1408540113187027e-05, + "loss": 1.8822, + "step": 20948 + }, + { + "epoch": 1.4059595315593436, + "grad_norm": 5.416930198669434, + "learning_rate": 2.139962472680931e-05, + "loss": 1.8595, + "step": 20950 + }, + { + "epoch": 1.4060937552431128, + "grad_norm": 4.601994037628174, + "learning_rate": 2.139071069174311e-05, + "loss": 2.0158, + "step": 20952 + }, + { + "epoch": 1.4062279789268817, + "grad_norm": 4.2152605056762695, + "learning_rate": 2.1381798008409547e-05, + "loss": 1.9034, + "step": 20954 + }, + { + "epoch": 1.4063622026106506, + "grad_norm": 4.2322564125061035, + "learning_rate": 2.1372886677229775e-05, + "loss": 1.9291, + "step": 20956 + }, + { + "epoch": 1.4064964262944197, + "grad_norm": 4.44310188293457, + "learning_rate": 2.1363976698624815e-05, + "loss": 2.0213, + "step": 20958 + }, + { + "epoch": 1.4066306499781887, + "grad_norm": 3.9292540550231934, + "learning_rate": 2.135506807301565e-05, + "loss": 1.9926, + "step": 20960 + }, + { + "epoch": 1.4067648736619576, + "grad_norm": 3.7675044536590576, + "learning_rate": 2.1346160800823166e-05, + "loss": 1.8734, + "step": 20962 + }, + { + "epoch": 1.4068990973457267, + "grad_norm": 3.879199504852295, + "learning_rate": 2.133725488246826e-05, + "loss": 1.8189, + "step": 20964 + }, + { + "epoch": 1.4070333210294956, + "grad_norm": 3.6913180351257324, + "learning_rate": 2.13283503183717e-05, + "loss": 1.899, + "step": 20966 + }, + { + "epoch": 1.4071675447132646, + "grad_norm": 3.4977669715881348, + "learning_rate": 2.13194471089542e-05, + "loss": 1.8019, + "step": 20968 + }, + { + "epoch": 1.4073017683970337, + "grad_norm": 3.886503219604492, + "learning_rate": 2.1310545254636412e-05, + "loss": 1.908, + "step": 20970 + }, + { + "epoch": 1.4074359920808026, + "grad_norm": 3.953111410140991, + "learning_rate": 2.130164475583896e-05, + "loss": 2.0065, + "step": 20972 + }, + { + "epoch": 1.4075702157645718, + "grad_norm": 4.009715557098389, + "learning_rate": 2.129274561298237e-05, + "loss": 2.0274, + "step": 20974 + }, + { + "epoch": 1.4077044394483407, + "grad_norm": 4.818535327911377, + "learning_rate": 2.1283847826487092e-05, + "loss": 2.1192, + "step": 20976 + }, + { + "epoch": 1.4078386631321096, + "grad_norm": 4.035956382751465, + "learning_rate": 2.1274951396773517e-05, + "loss": 1.9271, + "step": 20978 + }, + { + "epoch": 1.4079728868158787, + "grad_norm": 4.480569839477539, + "learning_rate": 2.1266056324262023e-05, + "loss": 2.0409, + "step": 20980 + }, + { + "epoch": 1.4081071104996477, + "grad_norm": 4.152615070343018, + "learning_rate": 2.125716260937285e-05, + "loss": 1.9025, + "step": 20982 + }, + { + "epoch": 1.4082413341834168, + "grad_norm": 4.3090596199035645, + "learning_rate": 2.1248270252526237e-05, + "loss": 1.8522, + "step": 20984 + }, + { + "epoch": 1.4083755578671857, + "grad_norm": 4.586495399475098, + "learning_rate": 2.1239379254142323e-05, + "loss": 2.0048, + "step": 20986 + }, + { + "epoch": 1.4085097815509546, + "grad_norm": 3.707279920578003, + "learning_rate": 2.1230489614641174e-05, + "loss": 1.9802, + "step": 20988 + }, + { + "epoch": 1.4086440052347236, + "grad_norm": 3.9638233184814453, + "learning_rate": 2.122160133444281e-05, + "loss": 1.9906, + "step": 20990 + }, + { + "epoch": 1.4087782289184927, + "grad_norm": 4.62456750869751, + "learning_rate": 2.1212714413967206e-05, + "loss": 2.2755, + "step": 20992 + }, + { + "epoch": 1.4089124526022616, + "grad_norm": 4.100644111633301, + "learning_rate": 2.120382885363424e-05, + "loss": 2.0095, + "step": 20994 + }, + { + "epoch": 1.4090466762860308, + "grad_norm": 3.902256488800049, + "learning_rate": 2.119494465386373e-05, + "loss": 1.9288, + "step": 20996 + }, + { + "epoch": 1.4091808999697997, + "grad_norm": 3.7341020107269287, + "learning_rate": 2.1186061815075424e-05, + "loss": 2.1066, + "step": 20998 + }, + { + "epoch": 1.4093151236535686, + "grad_norm": 3.946800470352173, + "learning_rate": 2.117718033768906e-05, + "loss": 2.0766, + "step": 21000 + }, + { + "epoch": 1.4094493473373377, + "grad_norm": 4.501173973083496, + "learning_rate": 2.1168300222124248e-05, + "loss": 2.1046, + "step": 21002 + }, + { + "epoch": 1.4095835710211067, + "grad_norm": 4.115889549255371, + "learning_rate": 2.1159421468800556e-05, + "loss": 1.9132, + "step": 21004 + }, + { + "epoch": 1.4097177947048758, + "grad_norm": 4.10782527923584, + "learning_rate": 2.115054407813747e-05, + "loss": 2.171, + "step": 21006 + }, + { + "epoch": 1.4098520183886447, + "grad_norm": 3.9864959716796875, + "learning_rate": 2.1141668050554457e-05, + "loss": 2.1698, + "step": 21008 + }, + { + "epoch": 1.4099862420724136, + "grad_norm": 3.7149953842163086, + "learning_rate": 2.1132793386470867e-05, + "loss": 1.918, + "step": 21010 + }, + { + "epoch": 1.4101204657561826, + "grad_norm": 3.8226404190063477, + "learning_rate": 2.112392008630604e-05, + "loss": 2.2723, + "step": 21012 + }, + { + "epoch": 1.4102546894399517, + "grad_norm": 4.280192852020264, + "learning_rate": 2.1115048150479233e-05, + "loss": 1.8331, + "step": 21014 + }, + { + "epoch": 1.4103889131237206, + "grad_norm": 4.078068733215332, + "learning_rate": 2.110617757940956e-05, + "loss": 2.2626, + "step": 21016 + }, + { + "epoch": 1.4105231368074898, + "grad_norm": 3.2953011989593506, + "learning_rate": 2.1097308373516194e-05, + "loss": 1.7567, + "step": 21018 + }, + { + "epoch": 1.4106573604912587, + "grad_norm": 3.481264352798462, + "learning_rate": 2.1088440533218156e-05, + "loss": 1.6656, + "step": 21020 + }, + { + "epoch": 1.4107915841750276, + "grad_norm": 4.032175540924072, + "learning_rate": 2.107957405893447e-05, + "loss": 1.8954, + "step": 21022 + }, + { + "epoch": 1.4109258078587967, + "grad_norm": 3.985055685043335, + "learning_rate": 2.107070895108404e-05, + "loss": 1.9567, + "step": 21024 + }, + { + "epoch": 1.4110600315425657, + "grad_norm": 3.5247552394866943, + "learning_rate": 2.1061845210085728e-05, + "loss": 1.8506, + "step": 21026 + }, + { + "epoch": 1.4111942552263348, + "grad_norm": 4.695797920227051, + "learning_rate": 2.1052982836358303e-05, + "loss": 1.9598, + "step": 21028 + }, + { + "epoch": 1.4113284789101037, + "grad_norm": 4.2317376136779785, + "learning_rate": 2.1044121830320535e-05, + "loss": 1.7523, + "step": 21030 + }, + { + "epoch": 1.4114627025938726, + "grad_norm": 4.124592304229736, + "learning_rate": 2.1035262192391077e-05, + "loss": 1.8739, + "step": 21032 + }, + { + "epoch": 1.4115969262776418, + "grad_norm": 3.9139978885650635, + "learning_rate": 2.102640392298852e-05, + "loss": 1.8096, + "step": 21034 + }, + { + "epoch": 1.4117311499614107, + "grad_norm": 4.714763641357422, + "learning_rate": 2.101754702253138e-05, + "loss": 2.0364, + "step": 21036 + }, + { + "epoch": 1.4118653736451796, + "grad_norm": 4.096407890319824, + "learning_rate": 2.1008691491438175e-05, + "loss": 2.0615, + "step": 21038 + }, + { + "epoch": 1.4119995973289488, + "grad_norm": 4.347524642944336, + "learning_rate": 2.099983733012727e-05, + "loss": 1.8784, + "step": 21040 + }, + { + "epoch": 1.4121338210127177, + "grad_norm": 4.046730041503906, + "learning_rate": 2.0990984539017062e-05, + "loss": 2.0826, + "step": 21042 + }, + { + "epoch": 1.4122680446964866, + "grad_norm": 3.8567919731140137, + "learning_rate": 2.0982133118525755e-05, + "loss": 1.8928, + "step": 21044 + }, + { + "epoch": 1.4124022683802557, + "grad_norm": 4.163361072540283, + "learning_rate": 2.0973283069071608e-05, + "loss": 2.0685, + "step": 21046 + }, + { + "epoch": 1.4125364920640247, + "grad_norm": 4.285580158233643, + "learning_rate": 2.0964434391072745e-05, + "loss": 1.9952, + "step": 21048 + }, + { + "epoch": 1.4126707157477938, + "grad_norm": 4.724217414855957, + "learning_rate": 2.0955587084947282e-05, + "loss": 1.8119, + "step": 21050 + }, + { + "epoch": 1.4128049394315627, + "grad_norm": 3.394929885864258, + "learning_rate": 2.0946741151113215e-05, + "loss": 1.7934, + "step": 21052 + }, + { + "epoch": 1.4129391631153316, + "grad_norm": 4.55405855178833, + "learning_rate": 2.0937896589988494e-05, + "loss": 1.9241, + "step": 21054 + }, + { + "epoch": 1.4130733867991008, + "grad_norm": 4.443233966827393, + "learning_rate": 2.0929053401990995e-05, + "loss": 2.0877, + "step": 21056 + }, + { + "epoch": 1.4132076104828697, + "grad_norm": 3.6867830753326416, + "learning_rate": 2.0920211587538573e-05, + "loss": 2.3441, + "step": 21058 + }, + { + "epoch": 1.4133418341666388, + "grad_norm": 4.14874792098999, + "learning_rate": 2.091137114704897e-05, + "loss": 1.8685, + "step": 21060 + }, + { + "epoch": 1.4134760578504078, + "grad_norm": 3.4687252044677734, + "learning_rate": 2.0902532080939886e-05, + "loss": 1.9257, + "step": 21062 + }, + { + "epoch": 1.4136102815341767, + "grad_norm": 3.868500232696533, + "learning_rate": 2.089369438962892e-05, + "loss": 1.9887, + "step": 21064 + }, + { + "epoch": 1.4137445052179456, + "grad_norm": 4.046691417694092, + "learning_rate": 2.0884858073533674e-05, + "loss": 2.1462, + "step": 21066 + }, + { + "epoch": 1.4138787289017147, + "grad_norm": 6.380543231964111, + "learning_rate": 2.087602313307162e-05, + "loss": 2.038, + "step": 21068 + }, + { + "epoch": 1.4140129525854837, + "grad_norm": 3.4252476692199707, + "learning_rate": 2.086718956866024e-05, + "loss": 2.042, + "step": 21070 + }, + { + "epoch": 1.4141471762692528, + "grad_norm": 3.6814892292022705, + "learning_rate": 2.0858357380716826e-05, + "loss": 1.8186, + "step": 21072 + }, + { + "epoch": 1.4142813999530217, + "grad_norm": 3.7946972846984863, + "learning_rate": 2.084952656965874e-05, + "loss": 2.1196, + "step": 21074 + }, + { + "epoch": 1.4144156236367906, + "grad_norm": 3.9557998180389404, + "learning_rate": 2.084069713590318e-05, + "loss": 1.8276, + "step": 21076 + }, + { + "epoch": 1.4145498473205598, + "grad_norm": 3.9124860763549805, + "learning_rate": 2.0831869079867368e-05, + "loss": 1.973, + "step": 21078 + }, + { + "epoch": 1.4146840710043287, + "grad_norm": 4.580628871917725, + "learning_rate": 2.0823042401968374e-05, + "loss": 2.0129, + "step": 21080 + }, + { + "epoch": 1.4148182946880978, + "grad_norm": 4.221983909606934, + "learning_rate": 2.0814217102623264e-05, + "loss": 1.984, + "step": 21082 + }, + { + "epoch": 1.4149525183718668, + "grad_norm": 4.092778205871582, + "learning_rate": 2.0805393182248995e-05, + "loss": 1.9272, + "step": 21084 + }, + { + "epoch": 1.4150867420556357, + "grad_norm": 4.296420097351074, + "learning_rate": 2.0796570641262476e-05, + "loss": 1.747, + "step": 21086 + }, + { + "epoch": 1.4152209657394046, + "grad_norm": 3.72943115234375, + "learning_rate": 2.0787749480080587e-05, + "loss": 1.9896, + "step": 21088 + }, + { + "epoch": 1.4153551894231737, + "grad_norm": 3.878122329711914, + "learning_rate": 2.0778929699120088e-05, + "loss": 2.1151, + "step": 21090 + }, + { + "epoch": 1.4154894131069427, + "grad_norm": 4.007120132446289, + "learning_rate": 2.0770111298797702e-05, + "loss": 2.145, + "step": 21092 + }, + { + "epoch": 1.4156236367907118, + "grad_norm": 4.0322113037109375, + "learning_rate": 2.076129427953006e-05, + "loss": 2.0763, + "step": 21094 + }, + { + "epoch": 1.4157578604744807, + "grad_norm": 4.06334924697876, + "learning_rate": 2.0752478641733785e-05, + "loss": 1.9374, + "step": 21096 + }, + { + "epoch": 1.4158920841582496, + "grad_norm": 5.9608259201049805, + "learning_rate": 2.0743664385825385e-05, + "loss": 2.7217, + "step": 21098 + }, + { + "epoch": 1.4160263078420188, + "grad_norm": 4.340204238891602, + "learning_rate": 2.073485151222131e-05, + "loss": 2.1203, + "step": 21100 + }, + { + "epoch": 1.4161605315257877, + "grad_norm": 4.3356475830078125, + "learning_rate": 2.072604002133794e-05, + "loss": 1.9022, + "step": 21102 + }, + { + "epoch": 1.4162947552095568, + "grad_norm": 4.017705917358398, + "learning_rate": 2.071722991359163e-05, + "loss": 1.8223, + "step": 21104 + }, + { + "epoch": 1.4164289788933258, + "grad_norm": 3.777143716812134, + "learning_rate": 2.0708421189398604e-05, + "loss": 1.9592, + "step": 21106 + }, + { + "epoch": 1.4165632025770947, + "grad_norm": 4.618510723114014, + "learning_rate": 2.0699613849175114e-05, + "loss": 1.8551, + "step": 21108 + }, + { + "epoch": 1.4166974262608638, + "grad_norm": 3.593658924102783, + "learning_rate": 2.0690807893337217e-05, + "loss": 1.802, + "step": 21110 + }, + { + "epoch": 1.4168316499446327, + "grad_norm": 3.7359135150909424, + "learning_rate": 2.068200332230103e-05, + "loss": 1.8599, + "step": 21112 + }, + { + "epoch": 1.4169658736284017, + "grad_norm": 4.218937397003174, + "learning_rate": 2.0673200136482518e-05, + "loss": 1.8835, + "step": 21114 + }, + { + "epoch": 1.4171000973121708, + "grad_norm": 4.877134323120117, + "learning_rate": 2.0664398336297646e-05, + "loss": 2.0474, + "step": 21116 + }, + { + "epoch": 1.4172343209959397, + "grad_norm": 3.8031375408172607, + "learning_rate": 2.0655597922162274e-05, + "loss": 1.9658, + "step": 21118 + }, + { + "epoch": 1.4173685446797086, + "grad_norm": 3.9691996574401855, + "learning_rate": 2.064679889449219e-05, + "loss": 1.8327, + "step": 21120 + }, + { + "epoch": 1.4175027683634778, + "grad_norm": 4.846220016479492, + "learning_rate": 2.063800125370312e-05, + "loss": 1.9069, + "step": 21122 + }, + { + "epoch": 1.4176369920472467, + "grad_norm": 3.7999582290649414, + "learning_rate": 2.062920500021078e-05, + "loss": 2.2007, + "step": 21124 + }, + { + "epoch": 1.4177712157310158, + "grad_norm": 5.471084117889404, + "learning_rate": 2.062041013443074e-05, + "loss": 2.0345, + "step": 21126 + }, + { + "epoch": 1.4179054394147848, + "grad_norm": 6.541484355926514, + "learning_rate": 2.0611616656778547e-05, + "loss": 1.6964, + "step": 21128 + }, + { + "epoch": 1.4180396630985537, + "grad_norm": 4.034443378448486, + "learning_rate": 2.0602824567669664e-05, + "loss": 2.0296, + "step": 21130 + }, + { + "epoch": 1.4181738867823228, + "grad_norm": 3.8478949069976807, + "learning_rate": 2.059403386751953e-05, + "loss": 1.8865, + "step": 21132 + }, + { + "epoch": 1.4183081104660917, + "grad_norm": 3.732210874557495, + "learning_rate": 2.0585244556743448e-05, + "loss": 1.8274, + "step": 21134 + }, + { + "epoch": 1.4184423341498609, + "grad_norm": 4.592129230499268, + "learning_rate": 2.0576456635756762e-05, + "loss": 2.0678, + "step": 21136 + }, + { + "epoch": 1.4185765578336298, + "grad_norm": 3.803603410720825, + "learning_rate": 2.0567670104974607e-05, + "loss": 2.025, + "step": 21138 + }, + { + "epoch": 1.4187107815173987, + "grad_norm": 3.877103328704834, + "learning_rate": 2.0558884964812174e-05, + "loss": 2.0252, + "step": 21140 + }, + { + "epoch": 1.4188450052011676, + "grad_norm": 3.8026070594787598, + "learning_rate": 2.055010121568452e-05, + "loss": 1.9504, + "step": 21142 + }, + { + "epoch": 1.4189792288849368, + "grad_norm": 4.23117208480835, + "learning_rate": 2.0541318858006685e-05, + "loss": 1.914, + "step": 21144 + }, + { + "epoch": 1.4191134525687057, + "grad_norm": 3.3163082599639893, + "learning_rate": 2.0532537892193615e-05, + "loss": 1.9864, + "step": 21146 + }, + { + "epoch": 1.4192476762524748, + "grad_norm": 4.316788673400879, + "learning_rate": 2.052375831866018e-05, + "loss": 2.1016, + "step": 21148 + }, + { + "epoch": 1.4193818999362438, + "grad_norm": 4.1432294845581055, + "learning_rate": 2.05149801378212e-05, + "loss": 1.9446, + "step": 21150 + }, + { + "epoch": 1.4195161236200127, + "grad_norm": 3.8129286766052246, + "learning_rate": 2.0506203350091414e-05, + "loss": 1.713, + "step": 21152 + }, + { + "epoch": 1.4196503473037818, + "grad_norm": 3.921907663345337, + "learning_rate": 2.0497427955885533e-05, + "loss": 2.047, + "step": 21154 + }, + { + "epoch": 1.4197845709875507, + "grad_norm": 4.485642910003662, + "learning_rate": 2.0488653955618174e-05, + "loss": 1.9077, + "step": 21156 + }, + { + "epoch": 1.4199187946713199, + "grad_norm": 3.821852684020996, + "learning_rate": 2.0479881349703883e-05, + "loss": 1.8776, + "step": 21158 + }, + { + "epoch": 1.4200530183550888, + "grad_norm": 3.92651104927063, + "learning_rate": 2.047111013855713e-05, + "loss": 1.9942, + "step": 21160 + }, + { + "epoch": 1.4201872420388577, + "grad_norm": 4.076271057128906, + "learning_rate": 2.0462340322592378e-05, + "loss": 1.8852, + "step": 21162 + }, + { + "epoch": 1.4203214657226266, + "grad_norm": 3.3439316749572754, + "learning_rate": 2.0453571902223938e-05, + "loss": 1.6946, + "step": 21164 + }, + { + "epoch": 1.4204556894063958, + "grad_norm": 3.314034938812256, + "learning_rate": 2.044480487786617e-05, + "loss": 1.5403, + "step": 21166 + }, + { + "epoch": 1.4205899130901647, + "grad_norm": 4.352222919464111, + "learning_rate": 2.0436039249933208e-05, + "loss": 1.972, + "step": 21168 + }, + { + "epoch": 1.4207241367739338, + "grad_norm": 4.152187347412109, + "learning_rate": 2.042727501883928e-05, + "loss": 2.0287, + "step": 21170 + }, + { + "epoch": 1.4208583604577028, + "grad_norm": 4.592668533325195, + "learning_rate": 2.041851218499844e-05, + "loss": 1.9906, + "step": 21172 + }, + { + "epoch": 1.4209925841414717, + "grad_norm": 4.273369789123535, + "learning_rate": 2.040975074882474e-05, + "loss": 2.0648, + "step": 21174 + }, + { + "epoch": 1.4211268078252408, + "grad_norm": 4.475753307342529, + "learning_rate": 2.040099071073214e-05, + "loss": 2.0572, + "step": 21176 + }, + { + "epoch": 1.4212610315090097, + "grad_norm": 4.193701267242432, + "learning_rate": 2.0392232071134517e-05, + "loss": 1.8735, + "step": 21178 + }, + { + "epoch": 1.4213952551927789, + "grad_norm": 3.934896469116211, + "learning_rate": 2.0383474830445692e-05, + "loss": 1.9149, + "step": 21180 + }, + { + "epoch": 1.4215294788765478, + "grad_norm": 4.341244220733643, + "learning_rate": 2.037471898907946e-05, + "loss": 1.8643, + "step": 21182 + }, + { + "epoch": 1.4216637025603167, + "grad_norm": 4.297524929046631, + "learning_rate": 2.0365964547449502e-05, + "loss": 1.9213, + "step": 21184 + }, + { + "epoch": 1.4217979262440856, + "grad_norm": 3.467881679534912, + "learning_rate": 2.035721150596944e-05, + "loss": 1.8162, + "step": 21186 + }, + { + "epoch": 1.4219321499278548, + "grad_norm": 3.904359817504883, + "learning_rate": 2.0348459865052832e-05, + "loss": 2.267, + "step": 21188 + }, + { + "epoch": 1.4220663736116237, + "grad_norm": 8.48239803314209, + "learning_rate": 2.0339709625113208e-05, + "loss": 1.9865, + "step": 21190 + }, + { + "epoch": 1.4222005972953928, + "grad_norm": 4.034958362579346, + "learning_rate": 2.0330960786563963e-05, + "loss": 1.758, + "step": 21192 + }, + { + "epoch": 1.4223348209791618, + "grad_norm": 4.566251277923584, + "learning_rate": 2.0322213349818507e-05, + "loss": 1.8761, + "step": 21194 + }, + { + "epoch": 1.4224690446629307, + "grad_norm": 3.9823150634765625, + "learning_rate": 2.031346731529008e-05, + "loss": 1.8491, + "step": 21196 + }, + { + "epoch": 1.4226032683466998, + "grad_norm": 4.066895961761475, + "learning_rate": 2.0304722683391964e-05, + "loss": 1.9366, + "step": 21198 + }, + { + "epoch": 1.4227374920304687, + "grad_norm": 4.248331069946289, + "learning_rate": 2.029597945453729e-05, + "loss": 2.0085, + "step": 21200 + }, + { + "epoch": 1.4228717157142379, + "grad_norm": 4.440950393676758, + "learning_rate": 2.0287237629139192e-05, + "loss": 2.0072, + "step": 21202 + }, + { + "epoch": 1.4230059393980068, + "grad_norm": 3.7693119049072266, + "learning_rate": 2.0278497207610697e-05, + "loss": 2.0164, + "step": 21204 + }, + { + "epoch": 1.4231401630817757, + "grad_norm": 4.0398640632629395, + "learning_rate": 2.0269758190364758e-05, + "loss": 2.0087, + "step": 21206 + }, + { + "epoch": 1.4232743867655449, + "grad_norm": 4.2961835861206055, + "learning_rate": 2.026102057781426e-05, + "loss": 1.8669, + "step": 21208 + }, + { + "epoch": 1.4234086104493138, + "grad_norm": 3.7027812004089355, + "learning_rate": 2.0252284370372087e-05, + "loss": 1.9335, + "step": 21210 + }, + { + "epoch": 1.423542834133083, + "grad_norm": 4.06053352355957, + "learning_rate": 2.0243549568450974e-05, + "loss": 2.0211, + "step": 21212 + }, + { + "epoch": 1.4236770578168518, + "grad_norm": 4.681143760681152, + "learning_rate": 2.023481617246363e-05, + "loss": 1.9233, + "step": 21214 + }, + { + "epoch": 1.4238112815006208, + "grad_norm": 3.8328309059143066, + "learning_rate": 2.022608418282268e-05, + "loss": 1.9199, + "step": 21216 + }, + { + "epoch": 1.4239455051843897, + "grad_norm": 3.81083083152771, + "learning_rate": 2.021735359994072e-05, + "loss": 1.8821, + "step": 21218 + }, + { + "epoch": 1.4240797288681588, + "grad_norm": 4.072551250457764, + "learning_rate": 2.020862442423024e-05, + "loss": 1.9378, + "step": 21220 + }, + { + "epoch": 1.4242139525519277, + "grad_norm": 3.718036651611328, + "learning_rate": 2.0199896656103666e-05, + "loss": 1.965, + "step": 21222 + }, + { + "epoch": 1.4243481762356969, + "grad_norm": 3.706512212753296, + "learning_rate": 2.0191170295973382e-05, + "loss": 1.7704, + "step": 21224 + }, + { + "epoch": 1.4244823999194658, + "grad_norm": 4.463945388793945, + "learning_rate": 2.0182445344251667e-05, + "loss": 2.11, + "step": 21226 + }, + { + "epoch": 1.4246166236032347, + "grad_norm": 3.9761223793029785, + "learning_rate": 2.0173721801350787e-05, + "loss": 1.9906, + "step": 21228 + }, + { + "epoch": 1.4247508472870039, + "grad_norm": 4.1817755699157715, + "learning_rate": 2.0164999667682887e-05, + "loss": 2.0606, + "step": 21230 + }, + { + "epoch": 1.4248850709707728, + "grad_norm": 3.5538909435272217, + "learning_rate": 2.0156278943660123e-05, + "loss": 1.8813, + "step": 21232 + }, + { + "epoch": 1.425019294654542, + "grad_norm": 3.9622385501861572, + "learning_rate": 2.0147559629694455e-05, + "loss": 1.9639, + "step": 21234 + }, + { + "epoch": 1.4251535183383108, + "grad_norm": 3.921381711959839, + "learning_rate": 2.013884172619791e-05, + "loss": 1.9775, + "step": 21236 + }, + { + "epoch": 1.4252877420220798, + "grad_norm": 4.666215419769287, + "learning_rate": 2.013012523358236e-05, + "loss": 2.0439, + "step": 21238 + }, + { + "epoch": 1.4254219657058487, + "grad_norm": 3.5754072666168213, + "learning_rate": 2.012141015225967e-05, + "loss": 1.8417, + "step": 21240 + }, + { + "epoch": 1.4255561893896178, + "grad_norm": 5.990778923034668, + "learning_rate": 2.0112696482641596e-05, + "loss": 2.1147, + "step": 21242 + }, + { + "epoch": 1.4256904130733867, + "grad_norm": 4.242297172546387, + "learning_rate": 2.0103984225139845e-05, + "loss": 1.851, + "step": 21244 + }, + { + "epoch": 1.4258246367571559, + "grad_norm": 3.623289108276367, + "learning_rate": 2.0095273380166036e-05, + "loss": 1.8146, + "step": 21246 + }, + { + "epoch": 1.4259588604409248, + "grad_norm": 3.8699302673339844, + "learning_rate": 2.0086563948131765e-05, + "loss": 1.9651, + "step": 21248 + }, + { + "epoch": 1.4260930841246937, + "grad_norm": 4.388435363769531, + "learning_rate": 2.007785592944853e-05, + "loss": 1.7721, + "step": 21250 + }, + { + "epoch": 1.4262273078084629, + "grad_norm": 4.669814586639404, + "learning_rate": 2.0069149324527763e-05, + "loss": 1.9414, + "step": 21252 + }, + { + "epoch": 1.4263615314922318, + "grad_norm": 4.194583415985107, + "learning_rate": 2.0060444133780816e-05, + "loss": 2.0054, + "step": 21254 + }, + { + "epoch": 1.426495755176001, + "grad_norm": 4.151879787445068, + "learning_rate": 2.0051740357619024e-05, + "loss": 1.8152, + "step": 21256 + }, + { + "epoch": 1.4266299788597698, + "grad_norm": 4.110262870788574, + "learning_rate": 2.0043037996453597e-05, + "loss": 2.0634, + "step": 21258 + }, + { + "epoch": 1.4267642025435388, + "grad_norm": 3.806865930557251, + "learning_rate": 2.0034337050695752e-05, + "loss": 1.9572, + "step": 21260 + }, + { + "epoch": 1.4268984262273077, + "grad_norm": 3.8205626010894775, + "learning_rate": 2.0025637520756518e-05, + "loss": 1.6942, + "step": 21262 + }, + { + "epoch": 1.4270326499110768, + "grad_norm": 4.551548957824707, + "learning_rate": 2.0016939407046987e-05, + "loss": 2.2276, + "step": 21264 + }, + { + "epoch": 1.4271668735948457, + "grad_norm": 4.387186527252197, + "learning_rate": 2.000824270997809e-05, + "loss": 1.7425, + "step": 21266 + }, + { + "epoch": 1.4273010972786149, + "grad_norm": 4.166409492492676, + "learning_rate": 1.9999547429960774e-05, + "loss": 1.8402, + "step": 21268 + }, + { + "epoch": 1.4274353209623838, + "grad_norm": 4.556366443634033, + "learning_rate": 1.9990853567405843e-05, + "loss": 2.0964, + "step": 21270 + }, + { + "epoch": 1.4275695446461527, + "grad_norm": 4.063499927520752, + "learning_rate": 1.998216112272407e-05, + "loss": 1.9273, + "step": 21272 + }, + { + "epoch": 1.4277037683299219, + "grad_norm": 3.9249470233917236, + "learning_rate": 1.997347009632614e-05, + "loss": 2.0704, + "step": 21274 + }, + { + "epoch": 1.4278379920136908, + "grad_norm": 4.12654447555542, + "learning_rate": 1.996478048862272e-05, + "loss": 1.7004, + "step": 21276 + }, + { + "epoch": 1.42797221569746, + "grad_norm": 3.9666991233825684, + "learning_rate": 1.9956092300024364e-05, + "loss": 1.8868, + "step": 21278 + }, + { + "epoch": 1.4281064393812288, + "grad_norm": 3.8565285205841064, + "learning_rate": 1.9947405530941565e-05, + "loss": 2.0264, + "step": 21280 + }, + { + "epoch": 1.4282406630649978, + "grad_norm": 3.5172789096832275, + "learning_rate": 1.9938720181784743e-05, + "loss": 1.8593, + "step": 21282 + }, + { + "epoch": 1.428374886748767, + "grad_norm": 3.74772572517395, + "learning_rate": 1.9930036252964295e-05, + "loss": 1.9567, + "step": 21284 + }, + { + "epoch": 1.4285091104325358, + "grad_norm": 4.041962623596191, + "learning_rate": 1.9921353744890488e-05, + "loss": 1.8652, + "step": 21286 + }, + { + "epoch": 1.428643334116305, + "grad_norm": 3.7638638019561768, + "learning_rate": 1.99126726579736e-05, + "loss": 2.0381, + "step": 21288 + }, + { + "epoch": 1.4287775578000739, + "grad_norm": 4.192185878753662, + "learning_rate": 1.9903992992623755e-05, + "loss": 1.9366, + "step": 21290 + }, + { + "epoch": 1.4289117814838428, + "grad_norm": 4.360293388366699, + "learning_rate": 1.9895314749251038e-05, + "loss": 1.8822, + "step": 21292 + }, + { + "epoch": 1.4290460051676117, + "grad_norm": 4.664591312408447, + "learning_rate": 1.9886637928265516e-05, + "loss": 2.0923, + "step": 21294 + }, + { + "epoch": 1.4291802288513809, + "grad_norm": 4.270801067352295, + "learning_rate": 1.9877962530077123e-05, + "loss": 1.9008, + "step": 21296 + }, + { + "epoch": 1.4293144525351498, + "grad_norm": 4.051314830780029, + "learning_rate": 1.986928855509578e-05, + "loss": 1.8726, + "step": 21298 + }, + { + "epoch": 1.429448676218919, + "grad_norm": 4.080447673797607, + "learning_rate": 1.9860616003731307e-05, + "loss": 1.793, + "step": 21300 + }, + { + "epoch": 1.4295828999026878, + "grad_norm": 4.237814903259277, + "learning_rate": 1.9851944876393463e-05, + "loss": 1.8077, + "step": 21302 + }, + { + "epoch": 1.4297171235864568, + "grad_norm": 4.503100872039795, + "learning_rate": 1.984327517349192e-05, + "loss": 1.8468, + "step": 21304 + }, + { + "epoch": 1.429851347270226, + "grad_norm": 4.4379048347473145, + "learning_rate": 1.983460689543634e-05, + "loss": 2.1427, + "step": 21306 + }, + { + "epoch": 1.4299855709539948, + "grad_norm": 3.7926716804504395, + "learning_rate": 1.982594004263627e-05, + "loss": 1.6973, + "step": 21308 + }, + { + "epoch": 1.430119794637764, + "grad_norm": 4.105006694793701, + "learning_rate": 1.9817274615501204e-05, + "loss": 2.0146, + "step": 21310 + }, + { + "epoch": 1.4302540183215329, + "grad_norm": 4.154086589813232, + "learning_rate": 1.9808610614440532e-05, + "loss": 1.9723, + "step": 21312 + }, + { + "epoch": 1.4303882420053018, + "grad_norm": 4.535845756530762, + "learning_rate": 1.9799948039863666e-05, + "loss": 1.9187, + "step": 21314 + }, + { + "epoch": 1.4305224656890707, + "grad_norm": 3.587555170059204, + "learning_rate": 1.979128689217986e-05, + "loss": 1.775, + "step": 21316 + }, + { + "epoch": 1.4306566893728399, + "grad_norm": 3.9822587966918945, + "learning_rate": 1.9782627171798353e-05, + "loss": 2.0, + "step": 21318 + }, + { + "epoch": 1.4307909130566088, + "grad_norm": 4.108846664428711, + "learning_rate": 1.9773968879128272e-05, + "loss": 1.7104, + "step": 21320 + }, + { + "epoch": 1.430925136740378, + "grad_norm": 3.6273374557495117, + "learning_rate": 1.976531201457874e-05, + "loss": 2.0287, + "step": 21322 + }, + { + "epoch": 1.4310593604241468, + "grad_norm": 4.215423107147217, + "learning_rate": 1.9756656578558746e-05, + "loss": 1.9174, + "step": 21324 + }, + { + "epoch": 1.4311935841079158, + "grad_norm": 3.745798110961914, + "learning_rate": 1.9748002571477277e-05, + "loss": 1.8209, + "step": 21326 + }, + { + "epoch": 1.431327807791685, + "grad_norm": 4.099099636077881, + "learning_rate": 1.9739349993743194e-05, + "loss": 1.9031, + "step": 21328 + }, + { + "epoch": 1.4314620314754538, + "grad_norm": 3.7648978233337402, + "learning_rate": 1.973069884576532e-05, + "loss": 1.95, + "step": 21330 + }, + { + "epoch": 1.431596255159223, + "grad_norm": 3.9908950328826904, + "learning_rate": 1.972204912795239e-05, + "loss": 1.8606, + "step": 21332 + }, + { + "epoch": 1.4317304788429919, + "grad_norm": 3.968017339706421, + "learning_rate": 1.9713400840713116e-05, + "loss": 1.9859, + "step": 21334 + }, + { + "epoch": 1.4318647025267608, + "grad_norm": 4.64775276184082, + "learning_rate": 1.9704753984456094e-05, + "loss": 2.0484, + "step": 21336 + }, + { + "epoch": 1.4319989262105297, + "grad_norm": 4.601584434509277, + "learning_rate": 1.9696108559589875e-05, + "loss": 1.813, + "step": 21338 + }, + { + "epoch": 1.4321331498942989, + "grad_norm": 3.767504930496216, + "learning_rate": 1.968746456652292e-05, + "loss": 1.793, + "step": 21340 + }, + { + "epoch": 1.4322673735780678, + "grad_norm": 4.630252838134766, + "learning_rate": 1.967882200566367e-05, + "loss": 2.0727, + "step": 21342 + }, + { + "epoch": 1.432401597261837, + "grad_norm": 3.8405814170837402, + "learning_rate": 1.967018087742044e-05, + "loss": 1.9129, + "step": 21344 + }, + { + "epoch": 1.4325358209456058, + "grad_norm": 3.5324528217315674, + "learning_rate": 1.966154118220156e-05, + "loss": 1.7542, + "step": 21346 + }, + { + "epoch": 1.4326700446293748, + "grad_norm": 4.050837516784668, + "learning_rate": 1.9652902920415167e-05, + "loss": 1.9727, + "step": 21348 + }, + { + "epoch": 1.432804268313144, + "grad_norm": 4.412144184112549, + "learning_rate": 1.964426609246945e-05, + "loss": 2.2209, + "step": 21350 + }, + { + "epoch": 1.4329384919969128, + "grad_norm": 4.533377647399902, + "learning_rate": 1.9635630698772455e-05, + "loss": 2.1586, + "step": 21352 + }, + { + "epoch": 1.433072715680682, + "grad_norm": 4.334949493408203, + "learning_rate": 1.9626996739732212e-05, + "loss": 1.9308, + "step": 21354 + }, + { + "epoch": 1.4332069393644509, + "grad_norm": 3.5113685131073, + "learning_rate": 1.961836421575667e-05, + "loss": 1.7564, + "step": 21356 + }, + { + "epoch": 1.4333411630482198, + "grad_norm": 3.9741270542144775, + "learning_rate": 1.960973312725364e-05, + "loss": 2.0045, + "step": 21358 + }, + { + "epoch": 1.433475386731989, + "grad_norm": 3.8370347023010254, + "learning_rate": 1.960110347463097e-05, + "loss": 1.9573, + "step": 21360 + }, + { + "epoch": 1.4336096104157579, + "grad_norm": 4.2515387535095215, + "learning_rate": 1.959247525829638e-05, + "loss": 2.0818, + "step": 21362 + }, + { + "epoch": 1.433743834099527, + "grad_norm": 3.703141927719116, + "learning_rate": 1.9583848478657544e-05, + "loss": 2.1027, + "step": 21364 + }, + { + "epoch": 1.433878057783296, + "grad_norm": 3.9296295642852783, + "learning_rate": 1.957522313612207e-05, + "loss": 2.0359, + "step": 21366 + }, + { + "epoch": 1.4340122814670648, + "grad_norm": 4.14156436920166, + "learning_rate": 1.9566599231097467e-05, + "loss": 1.6952, + "step": 21368 + }, + { + "epoch": 1.4341465051508338, + "grad_norm": 4.259618759155273, + "learning_rate": 1.9557976763991188e-05, + "loss": 1.8146, + "step": 21370 + }, + { + "epoch": 1.434280728834603, + "grad_norm": 4.295783042907715, + "learning_rate": 1.9549355735210663e-05, + "loss": 2.1414, + "step": 21372 + }, + { + "epoch": 1.4344149525183718, + "grad_norm": 4.269130229949951, + "learning_rate": 1.95407361451632e-05, + "loss": 2.0508, + "step": 21374 + }, + { + "epoch": 1.434549176202141, + "grad_norm": 5.3051066398620605, + "learning_rate": 1.953211799425606e-05, + "loss": 1.9415, + "step": 21376 + }, + { + "epoch": 1.4346833998859099, + "grad_norm": 4.279820442199707, + "learning_rate": 1.9523501282896417e-05, + "loss": 2.0297, + "step": 21378 + }, + { + "epoch": 1.4348176235696788, + "grad_norm": 3.4521820545196533, + "learning_rate": 1.9514886011491422e-05, + "loss": 1.747, + "step": 21380 + }, + { + "epoch": 1.434951847253448, + "grad_norm": 3.7071115970611572, + "learning_rate": 1.9506272180448104e-05, + "loss": 1.893, + "step": 21382 + }, + { + "epoch": 1.4350860709372169, + "grad_norm": 4.103786468505859, + "learning_rate": 1.9497659790173495e-05, + "loss": 1.9727, + "step": 21384 + }, + { + "epoch": 1.435220294620986, + "grad_norm": 3.7790284156799316, + "learning_rate": 1.9489048841074444e-05, + "loss": 1.7196, + "step": 21386 + }, + { + "epoch": 1.435354518304755, + "grad_norm": 4.574364185333252, + "learning_rate": 1.9480439333557855e-05, + "loss": 1.7465, + "step": 21388 + }, + { + "epoch": 1.4354887419885238, + "grad_norm": 4.7039618492126465, + "learning_rate": 1.9471831268030478e-05, + "loss": 2.0403, + "step": 21390 + }, + { + "epoch": 1.4356229656722928, + "grad_norm": 3.4240634441375732, + "learning_rate": 1.9463224644899057e-05, + "loss": 1.5258, + "step": 21392 + }, + { + "epoch": 1.435757189356062, + "grad_norm": 3.2760212421417236, + "learning_rate": 1.945461946457023e-05, + "loss": 1.4953, + "step": 21394 + }, + { + "epoch": 1.4358914130398308, + "grad_norm": 4.5536723136901855, + "learning_rate": 1.944601572745056e-05, + "loss": 2.0487, + "step": 21396 + }, + { + "epoch": 1.4360256367236, + "grad_norm": 4.3031792640686035, + "learning_rate": 1.9437413433946556e-05, + "loss": 2.1782, + "step": 21398 + }, + { + "epoch": 1.4361598604073689, + "grad_norm": 4.70009708404541, + "learning_rate": 1.942881258446468e-05, + "loss": 1.7983, + "step": 21400 + }, + { + "epoch": 1.4362940840911378, + "grad_norm": 4.152857303619385, + "learning_rate": 1.9420213179411295e-05, + "loss": 2.198, + "step": 21402 + }, + { + "epoch": 1.436428307774907, + "grad_norm": 4.225174427032471, + "learning_rate": 1.941161521919271e-05, + "loss": 2.1598, + "step": 21404 + }, + { + "epoch": 1.4365625314586759, + "grad_norm": 3.8395302295684814, + "learning_rate": 1.940301870421513e-05, + "loss": 1.9226, + "step": 21406 + }, + { + "epoch": 1.436696755142445, + "grad_norm": 4.704364776611328, + "learning_rate": 1.9394423634884772e-05, + "loss": 1.8605, + "step": 21408 + }, + { + "epoch": 1.436830978826214, + "grad_norm": 4.14469575881958, + "learning_rate": 1.9385830011607698e-05, + "loss": 1.7615, + "step": 21410 + }, + { + "epoch": 1.4369652025099828, + "grad_norm": 4.4001874923706055, + "learning_rate": 1.9377237834789987e-05, + "loss": 1.9931, + "step": 21412 + }, + { + "epoch": 1.4370994261937517, + "grad_norm": 4.200325012207031, + "learning_rate": 1.936864710483754e-05, + "loss": 2.0926, + "step": 21414 + }, + { + "epoch": 1.437233649877521, + "grad_norm": 4.087267875671387, + "learning_rate": 1.9360057822156298e-05, + "loss": 2.0065, + "step": 21416 + }, + { + "epoch": 1.4373678735612898, + "grad_norm": 4.209799766540527, + "learning_rate": 1.9351469987152056e-05, + "loss": 2.0354, + "step": 21418 + }, + { + "epoch": 1.437502097245059, + "grad_norm": 4.263820171356201, + "learning_rate": 1.934288360023061e-05, + "loss": 1.8743, + "step": 21420 + }, + { + "epoch": 1.4376363209288279, + "grad_norm": 4.35760498046875, + "learning_rate": 1.9334298661797623e-05, + "loss": 1.9034, + "step": 21422 + }, + { + "epoch": 1.4377705446125968, + "grad_norm": 5.002335548400879, + "learning_rate": 1.932571517225873e-05, + "loss": 2.0361, + "step": 21424 + }, + { + "epoch": 1.437904768296366, + "grad_norm": 3.984393835067749, + "learning_rate": 1.9317133132019472e-05, + "loss": 1.7228, + "step": 21426 + }, + { + "epoch": 1.4380389919801349, + "grad_norm": 4.063528060913086, + "learning_rate": 1.930855254148532e-05, + "loss": 1.7463, + "step": 21428 + }, + { + "epoch": 1.438173215663904, + "grad_norm": 3.798447847366333, + "learning_rate": 1.9299973401061726e-05, + "loss": 2.0139, + "step": 21430 + }, + { + "epoch": 1.438307439347673, + "grad_norm": 4.422934532165527, + "learning_rate": 1.9291395711154025e-05, + "loss": 2.0957, + "step": 21432 + }, + { + "epoch": 1.4384416630314418, + "grad_norm": 4.506597518920898, + "learning_rate": 1.9282819472167484e-05, + "loss": 2.1047, + "step": 21434 + }, + { + "epoch": 1.438575886715211, + "grad_norm": 3.8737592697143555, + "learning_rate": 1.9274244684507307e-05, + "loss": 1.8034, + "step": 21436 + }, + { + "epoch": 1.43871011039898, + "grad_norm": 4.237947463989258, + "learning_rate": 1.9265671348578667e-05, + "loss": 2.1137, + "step": 21438 + }, + { + "epoch": 1.438844334082749, + "grad_norm": 4.490161895751953, + "learning_rate": 1.9257099464786615e-05, + "loss": 1.9757, + "step": 21440 + }, + { + "epoch": 1.438978557766518, + "grad_norm": 9.733732223510742, + "learning_rate": 1.9248529033536162e-05, + "loss": 2.0863, + "step": 21442 + }, + { + "epoch": 1.4391127814502869, + "grad_norm": 4.004281044006348, + "learning_rate": 1.9239960055232226e-05, + "loss": 1.8231, + "step": 21444 + }, + { + "epoch": 1.4392470051340558, + "grad_norm": 4.61552095413208, + "learning_rate": 1.9231392530279706e-05, + "loss": 1.8998, + "step": 21446 + }, + { + "epoch": 1.439381228817825, + "grad_norm": 4.065914630889893, + "learning_rate": 1.922282645908337e-05, + "loss": 2.1713, + "step": 21448 + }, + { + "epoch": 1.4395154525015939, + "grad_norm": 4.514229774475098, + "learning_rate": 1.9214261842047982e-05, + "loss": 2.0082, + "step": 21450 + }, + { + "epoch": 1.439649676185363, + "grad_norm": 3.6311240196228027, + "learning_rate": 1.920569867957818e-05, + "loss": 1.9657, + "step": 21452 + }, + { + "epoch": 1.439783899869132, + "grad_norm": 3.8816823959350586, + "learning_rate": 1.9197136972078563e-05, + "loss": 1.8078, + "step": 21454 + }, + { + "epoch": 1.4399181235529008, + "grad_norm": 4.151867389678955, + "learning_rate": 1.9188576719953633e-05, + "loss": 2.0156, + "step": 21456 + }, + { + "epoch": 1.44005234723667, + "grad_norm": 3.706200361251831, + "learning_rate": 1.9180017923607886e-05, + "loss": 1.7172, + "step": 21458 + }, + { + "epoch": 1.440186570920439, + "grad_norm": 3.83084774017334, + "learning_rate": 1.9171460583445684e-05, + "loss": 1.59, + "step": 21460 + }, + { + "epoch": 1.440320794604208, + "grad_norm": 3.217998504638672, + "learning_rate": 1.9162904699871347e-05, + "loss": 1.6319, + "step": 21462 + }, + { + "epoch": 1.440455018287977, + "grad_norm": 4.179366111755371, + "learning_rate": 1.9154350273289113e-05, + "loss": 2.094, + "step": 21464 + }, + { + "epoch": 1.4405892419717459, + "grad_norm": 3.6815245151519775, + "learning_rate": 1.9145797304103186e-05, + "loss": 2.0755, + "step": 21466 + }, + { + "epoch": 1.4407234656555148, + "grad_norm": 3.9611103534698486, + "learning_rate": 1.9137245792717668e-05, + "loss": 1.887, + "step": 21468 + }, + { + "epoch": 1.440857689339284, + "grad_norm": 4.359685897827148, + "learning_rate": 1.912869573953659e-05, + "loss": 1.9488, + "step": 21470 + }, + { + "epoch": 1.4409919130230529, + "grad_norm": 4.581173896789551, + "learning_rate": 1.9120147144963918e-05, + "loss": 2.0579, + "step": 21472 + }, + { + "epoch": 1.441126136706822, + "grad_norm": 3.989250898361206, + "learning_rate": 1.9111600009403592e-05, + "loss": 1.8813, + "step": 21474 + }, + { + "epoch": 1.441260360390591, + "grad_norm": 4.031585693359375, + "learning_rate": 1.910305433325941e-05, + "loss": 2.2002, + "step": 21476 + }, + { + "epoch": 1.4413945840743598, + "grad_norm": 4.293539047241211, + "learning_rate": 1.9094510116935167e-05, + "loss": 1.941, + "step": 21478 + }, + { + "epoch": 1.441528807758129, + "grad_norm": 4.8430962562561035, + "learning_rate": 1.9085967360834544e-05, + "loss": 2.3492, + "step": 21480 + }, + { + "epoch": 1.441663031441898, + "grad_norm": 3.617171049118042, + "learning_rate": 1.907742606536118e-05, + "loss": 1.6698, + "step": 21482 + }, + { + "epoch": 1.441797255125667, + "grad_norm": 4.148891448974609, + "learning_rate": 1.9068886230918608e-05, + "loss": 1.9347, + "step": 21484 + }, + { + "epoch": 1.441931478809436, + "grad_norm": 4.122305393218994, + "learning_rate": 1.906034785791036e-05, + "loss": 2.2407, + "step": 21486 + }, + { + "epoch": 1.4420657024932049, + "grad_norm": 4.5080766677856445, + "learning_rate": 1.9051810946739828e-05, + "loss": 2.2983, + "step": 21488 + }, + { + "epoch": 1.4421999261769738, + "grad_norm": 3.7293999195098877, + "learning_rate": 1.9043275497810377e-05, + "loss": 1.9491, + "step": 21490 + }, + { + "epoch": 1.442334149860743, + "grad_norm": 3.7625644207000732, + "learning_rate": 1.903474151152527e-05, + "loss": 1.8648, + "step": 21492 + }, + { + "epoch": 1.4424683735445119, + "grad_norm": 3.9934537410736084, + "learning_rate": 1.9026208988287746e-05, + "loss": 1.811, + "step": 21494 + }, + { + "epoch": 1.442602597228281, + "grad_norm": 3.850728988647461, + "learning_rate": 1.901767792850094e-05, + "loss": 2.0442, + "step": 21496 + }, + { + "epoch": 1.44273682091205, + "grad_norm": 4.035011291503906, + "learning_rate": 1.9009148332567932e-05, + "loss": 1.7787, + "step": 21498 + }, + { + "epoch": 1.4428710445958188, + "grad_norm": 4.149894714355469, + "learning_rate": 1.900062020089173e-05, + "loss": 2.1028, + "step": 21500 + }, + { + "epoch": 1.443005268279588, + "grad_norm": 3.8063199520111084, + "learning_rate": 1.899209353387524e-05, + "loss": 1.715, + "step": 21502 + }, + { + "epoch": 1.443139491963357, + "grad_norm": 3.8597142696380615, + "learning_rate": 1.8983568331921375e-05, + "loss": 1.9675, + "step": 21504 + }, + { + "epoch": 1.443273715647126, + "grad_norm": 3.7659335136413574, + "learning_rate": 1.89750445954329e-05, + "loss": 1.8449, + "step": 21506 + }, + { + "epoch": 1.443407939330895, + "grad_norm": 4.377338886260986, + "learning_rate": 1.896652232481259e-05, + "loss": 2.1792, + "step": 21508 + }, + { + "epoch": 1.4435421630146639, + "grad_norm": 4.333934307098389, + "learning_rate": 1.8958001520463043e-05, + "loss": 2.0569, + "step": 21510 + }, + { + "epoch": 1.443676386698433, + "grad_norm": 3.6486763954162598, + "learning_rate": 1.8949482182786904e-05, + "loss": 1.6452, + "step": 21512 + }, + { + "epoch": 1.443810610382202, + "grad_norm": 4.191628456115723, + "learning_rate": 1.8940964312186653e-05, + "loss": 1.9092, + "step": 21514 + }, + { + "epoch": 1.443944834065971, + "grad_norm": 4.1645050048828125, + "learning_rate": 1.8932447909064775e-05, + "loss": 1.7895, + "step": 21516 + }, + { + "epoch": 1.44407905774974, + "grad_norm": 3.3693037033081055, + "learning_rate": 1.892393297382365e-05, + "loss": 1.8609, + "step": 21518 + }, + { + "epoch": 1.444213281433509, + "grad_norm": 4.252381801605225, + "learning_rate": 1.8915419506865583e-05, + "loss": 1.824, + "step": 21520 + }, + { + "epoch": 1.4443475051172778, + "grad_norm": 3.8821005821228027, + "learning_rate": 1.8906907508592798e-05, + "loss": 2.0671, + "step": 21522 + }, + { + "epoch": 1.444481728801047, + "grad_norm": 4.074163436889648, + "learning_rate": 1.889839697940751e-05, + "loss": 1.9979, + "step": 21524 + }, + { + "epoch": 1.444615952484816, + "grad_norm": 4.095183849334717, + "learning_rate": 1.8889887919711808e-05, + "loss": 2.0833, + "step": 21526 + }, + { + "epoch": 1.444750176168585, + "grad_norm": 3.7467825412750244, + "learning_rate": 1.8881380329907722e-05, + "loss": 1.9552, + "step": 21528 + }, + { + "epoch": 1.444884399852354, + "grad_norm": 4.112319469451904, + "learning_rate": 1.8872874210397213e-05, + "loss": 1.6941, + "step": 21530 + }, + { + "epoch": 1.4450186235361229, + "grad_norm": 4.244778156280518, + "learning_rate": 1.88643695615822e-05, + "loss": 2.2202, + "step": 21532 + }, + { + "epoch": 1.445152847219892, + "grad_norm": 3.9752495288848877, + "learning_rate": 1.8855866383864483e-05, + "loss": 1.9595, + "step": 21534 + }, + { + "epoch": 1.445287070903661, + "grad_norm": 4.0352044105529785, + "learning_rate": 1.8847364677645873e-05, + "loss": 2.3078, + "step": 21536 + }, + { + "epoch": 1.44542129458743, + "grad_norm": 4.505187511444092, + "learning_rate": 1.8838864443327986e-05, + "loss": 1.8284, + "step": 21538 + }, + { + "epoch": 1.445555518271199, + "grad_norm": 3.6852500438690186, + "learning_rate": 1.8830365681312502e-05, + "loss": 2.0722, + "step": 21540 + }, + { + "epoch": 1.445689741954968, + "grad_norm": 4.264190673828125, + "learning_rate": 1.8821868392000925e-05, + "loss": 1.9358, + "step": 21542 + }, + { + "epoch": 1.4458239656387368, + "grad_norm": 4.496337890625, + "learning_rate": 1.8813372575794768e-05, + "loss": 2.1586, + "step": 21544 + }, + { + "epoch": 1.445958189322506, + "grad_norm": 3.895392417907715, + "learning_rate": 1.880487823309544e-05, + "loss": 2.017, + "step": 21546 + }, + { + "epoch": 1.446092413006275, + "grad_norm": 3.9078710079193115, + "learning_rate": 1.879638536430427e-05, + "loss": 1.974, + "step": 21548 + }, + { + "epoch": 1.446226636690044, + "grad_norm": 3.853121280670166, + "learning_rate": 1.8787893969822517e-05, + "loss": 1.8368, + "step": 21550 + }, + { + "epoch": 1.446360860373813, + "grad_norm": 3.8521437644958496, + "learning_rate": 1.8779404050051413e-05, + "loss": 1.8585, + "step": 21552 + }, + { + "epoch": 1.4464950840575819, + "grad_norm": 3.075080394744873, + "learning_rate": 1.8770915605392077e-05, + "loss": 1.8405, + "step": 21554 + }, + { + "epoch": 1.446629307741351, + "grad_norm": 4.528481483459473, + "learning_rate": 1.8762428636245564e-05, + "loss": 2.0094, + "step": 21556 + }, + { + "epoch": 1.44676353142512, + "grad_norm": 3.5332870483398438, + "learning_rate": 1.8753943143012864e-05, + "loss": 1.8568, + "step": 21558 + }, + { + "epoch": 1.446897755108889, + "grad_norm": 3.9424400329589844, + "learning_rate": 1.8745459126094915e-05, + "loss": 2.0544, + "step": 21560 + }, + { + "epoch": 1.447031978792658, + "grad_norm": 3.972505807876587, + "learning_rate": 1.8736976585892572e-05, + "loss": 1.8462, + "step": 21562 + }, + { + "epoch": 1.447166202476427, + "grad_norm": 4.639650821685791, + "learning_rate": 1.8728495522806606e-05, + "loss": 1.9531, + "step": 21564 + }, + { + "epoch": 1.4473004261601958, + "grad_norm": 3.8285982608795166, + "learning_rate": 1.8720015937237733e-05, + "loss": 1.7002, + "step": 21566 + }, + { + "epoch": 1.447434649843965, + "grad_norm": 4.173365592956543, + "learning_rate": 1.871153782958658e-05, + "loss": 1.8963, + "step": 21568 + }, + { + "epoch": 1.447568873527734, + "grad_norm": 4.17411470413208, + "learning_rate": 1.870306120025375e-05, + "loss": 1.9521, + "step": 21570 + }, + { + "epoch": 1.447703097211503, + "grad_norm": 3.7534565925598145, + "learning_rate": 1.869458604963973e-05, + "loss": 1.7719, + "step": 21572 + }, + { + "epoch": 1.447837320895272, + "grad_norm": 5.3704142570495605, + "learning_rate": 1.8686112378144988e-05, + "loss": 1.8951, + "step": 21574 + }, + { + "epoch": 1.4479715445790409, + "grad_norm": 4.043721675872803, + "learning_rate": 1.867764018616982e-05, + "loss": 2.0313, + "step": 21576 + }, + { + "epoch": 1.44810576826281, + "grad_norm": 4.388193607330322, + "learning_rate": 1.8669169474114582e-05, + "loss": 2.1248, + "step": 21578 + }, + { + "epoch": 1.448239991946579, + "grad_norm": 3.982766628265381, + "learning_rate": 1.8660700242379463e-05, + "loss": 1.7336, + "step": 21580 + }, + { + "epoch": 1.448374215630348, + "grad_norm": 3.7694644927978516, + "learning_rate": 1.8652232491364648e-05, + "loss": 2.0478, + "step": 21582 + }, + { + "epoch": 1.448508439314117, + "grad_norm": 4.162435531616211, + "learning_rate": 1.8643766221470204e-05, + "loss": 2.1685, + "step": 21584 + }, + { + "epoch": 1.448642662997886, + "grad_norm": 5.5324387550354, + "learning_rate": 1.863530143309615e-05, + "loss": 1.972, + "step": 21586 + }, + { + "epoch": 1.448776886681655, + "grad_norm": 4.101387023925781, + "learning_rate": 1.8626838126642403e-05, + "loss": 1.8581, + "step": 21588 + }, + { + "epoch": 1.448911110365424, + "grad_norm": 4.2849040031433105, + "learning_rate": 1.861837630250888e-05, + "loss": 2.0123, + "step": 21590 + }, + { + "epoch": 1.4490453340491931, + "grad_norm": 3.923736095428467, + "learning_rate": 1.860991596109537e-05, + "loss": 2.1505, + "step": 21592 + }, + { + "epoch": 1.449179557732962, + "grad_norm": 3.874406576156616, + "learning_rate": 1.8601457102801608e-05, + "loss": 1.7419, + "step": 21594 + }, + { + "epoch": 1.449313781416731, + "grad_norm": 4.555093765258789, + "learning_rate": 1.8592999728027234e-05, + "loss": 2.0451, + "step": 21596 + }, + { + "epoch": 1.4494480051004999, + "grad_norm": 4.465390682220459, + "learning_rate": 1.858454383717188e-05, + "loss": 2.1615, + "step": 21598 + }, + { + "epoch": 1.449582228784269, + "grad_norm": 3.6120235919952393, + "learning_rate": 1.857608943063504e-05, + "loss": 1.9075, + "step": 21600 + }, + { + "epoch": 1.449716452468038, + "grad_norm": 4.88520622253418, + "learning_rate": 1.8567636508816212e-05, + "loss": 2.342, + "step": 21602 + }, + { + "epoch": 1.449850676151807, + "grad_norm": 4.513620376586914, + "learning_rate": 1.855918507211472e-05, + "loss": 1.9406, + "step": 21604 + }, + { + "epoch": 1.449984899835576, + "grad_norm": 4.072671413421631, + "learning_rate": 1.8550735120929926e-05, + "loss": 1.9721, + "step": 21606 + }, + { + "epoch": 1.450119123519345, + "grad_norm": 3.969731330871582, + "learning_rate": 1.8542286655661027e-05, + "loss": 2.0727, + "step": 21608 + }, + { + "epoch": 1.450253347203114, + "grad_norm": 3.8360016345977783, + "learning_rate": 1.8533839676707255e-05, + "loss": 2.1091, + "step": 21610 + }, + { + "epoch": 1.450387570886883, + "grad_norm": 4.048786640167236, + "learning_rate": 1.8525394184467676e-05, + "loss": 1.867, + "step": 21612 + }, + { + "epoch": 1.4505217945706521, + "grad_norm": 4.129173755645752, + "learning_rate": 1.851695017934133e-05, + "loss": 2.0367, + "step": 21614 + }, + { + "epoch": 1.450656018254421, + "grad_norm": 4.032386779785156, + "learning_rate": 1.8508507661727163e-05, + "loss": 2.2965, + "step": 21616 + }, + { + "epoch": 1.45079024193819, + "grad_norm": 4.337318420410156, + "learning_rate": 1.85000666320241e-05, + "loss": 2.0523, + "step": 21618 + }, + { + "epoch": 1.4509244656219589, + "grad_norm": 3.9934120178222656, + "learning_rate": 1.8491627090630948e-05, + "loss": 1.858, + "step": 21620 + }, + { + "epoch": 1.451058689305728, + "grad_norm": 3.801828622817993, + "learning_rate": 1.848318903794646e-05, + "loss": 2.0185, + "step": 21622 + }, + { + "epoch": 1.451192912989497, + "grad_norm": 4.234528064727783, + "learning_rate": 1.8474752474369296e-05, + "loss": 1.9287, + "step": 21624 + }, + { + "epoch": 1.451327136673266, + "grad_norm": 3.4606902599334717, + "learning_rate": 1.84663174002981e-05, + "loss": 1.9269, + "step": 21626 + }, + { + "epoch": 1.451461360357035, + "grad_norm": 3.7093429565429688, + "learning_rate": 1.845788381613138e-05, + "loss": 1.912, + "step": 21628 + }, + { + "epoch": 1.451595584040804, + "grad_norm": 3.995342254638672, + "learning_rate": 1.8449451722267646e-05, + "loss": 1.9451, + "step": 21630 + }, + { + "epoch": 1.451729807724573, + "grad_norm": 4.7754411697387695, + "learning_rate": 1.844102111910529e-05, + "loss": 2.1413, + "step": 21632 + }, + { + "epoch": 1.451864031408342, + "grad_norm": 4.233495235443115, + "learning_rate": 1.84325920070426e-05, + "loss": 1.9656, + "step": 21634 + }, + { + "epoch": 1.4519982550921111, + "grad_norm": 4.443921089172363, + "learning_rate": 1.8424164386477878e-05, + "loss": 1.7163, + "step": 21636 + }, + { + "epoch": 1.45213247877588, + "grad_norm": 4.979162216186523, + "learning_rate": 1.8415738257809275e-05, + "loss": 2.0571, + "step": 21638 + }, + { + "epoch": 1.452266702459649, + "grad_norm": 4.803863048553467, + "learning_rate": 1.8407313621434952e-05, + "loss": 1.8428, + "step": 21640 + }, + { + "epoch": 1.4524009261434179, + "grad_norm": 4.552792072296143, + "learning_rate": 1.839889047775294e-05, + "loss": 2.0892, + "step": 21642 + }, + { + "epoch": 1.452535149827187, + "grad_norm": 4.396320819854736, + "learning_rate": 1.8390468827161207e-05, + "loss": 2.0736, + "step": 21644 + }, + { + "epoch": 1.452669373510956, + "grad_norm": 4.509576797485352, + "learning_rate": 1.838204867005765e-05, + "loss": 2.1188, + "step": 21646 + }, + { + "epoch": 1.452803597194725, + "grad_norm": 4.257665634155273, + "learning_rate": 1.837363000684013e-05, + "loss": 1.9002, + "step": 21648 + }, + { + "epoch": 1.452937820878494, + "grad_norm": 4.068183898925781, + "learning_rate": 1.836521283790641e-05, + "loss": 1.7835, + "step": 21650 + }, + { + "epoch": 1.453072044562263, + "grad_norm": 4.030323028564453, + "learning_rate": 1.835679716365417e-05, + "loss": 1.929, + "step": 21652 + }, + { + "epoch": 1.453206268246032, + "grad_norm": 4.148030757904053, + "learning_rate": 1.8348382984481034e-05, + "loss": 1.9287, + "step": 21654 + }, + { + "epoch": 1.453340491929801, + "grad_norm": 3.95646333694458, + "learning_rate": 1.833997030078457e-05, + "loss": 2.0428, + "step": 21656 + }, + { + "epoch": 1.4534747156135701, + "grad_norm": 6.1572675704956055, + "learning_rate": 1.8331559112962232e-05, + "loss": 1.8469, + "step": 21658 + }, + { + "epoch": 1.453608939297339, + "grad_norm": 4.449273109436035, + "learning_rate": 1.8323149421411497e-05, + "loss": 2.0171, + "step": 21660 + }, + { + "epoch": 1.453743162981108, + "grad_norm": 4.101599216461182, + "learning_rate": 1.831474122652962e-05, + "loss": 1.9428, + "step": 21662 + }, + { + "epoch": 1.453877386664877, + "grad_norm": 3.616602659225464, + "learning_rate": 1.830633452871393e-05, + "loss": 1.7891, + "step": 21664 + }, + { + "epoch": 1.454011610348646, + "grad_norm": 3.7326011657714844, + "learning_rate": 1.8297929328361594e-05, + "loss": 1.8146, + "step": 21666 + }, + { + "epoch": 1.4541458340324152, + "grad_norm": 4.208972454071045, + "learning_rate": 1.828952562586978e-05, + "loss": 1.8569, + "step": 21668 + }, + { + "epoch": 1.454280057716184, + "grad_norm": 3.617358684539795, + "learning_rate": 1.828112342163552e-05, + "loss": 1.7251, + "step": 21670 + }, + { + "epoch": 1.454414281399953, + "grad_norm": 3.8970139026641846, + "learning_rate": 1.827272271605581e-05, + "loss": 1.9172, + "step": 21672 + }, + { + "epoch": 1.454548505083722, + "grad_norm": 3.956115961074829, + "learning_rate": 1.8264323509527543e-05, + "loss": 1.9139, + "step": 21674 + }, + { + "epoch": 1.454682728767491, + "grad_norm": 4.108226776123047, + "learning_rate": 1.8255925802447604e-05, + "loss": 1.9015, + "step": 21676 + }, + { + "epoch": 1.45481695245126, + "grad_norm": 3.438971996307373, + "learning_rate": 1.8247529595212743e-05, + "loss": 1.8614, + "step": 21678 + }, + { + "epoch": 1.4549511761350291, + "grad_norm": 4.173226833343506, + "learning_rate": 1.8239134888219676e-05, + "loss": 2.0715, + "step": 21680 + }, + { + "epoch": 1.455085399818798, + "grad_norm": 3.9934122562408447, + "learning_rate": 1.823074168186501e-05, + "loss": 2.0402, + "step": 21682 + }, + { + "epoch": 1.455219623502567, + "grad_norm": 5.146625995635986, + "learning_rate": 1.8222349976545343e-05, + "loss": 2.1619, + "step": 21684 + }, + { + "epoch": 1.455353847186336, + "grad_norm": 3.969407081604004, + "learning_rate": 1.8213959772657142e-05, + "loss": 1.9597, + "step": 21686 + }, + { + "epoch": 1.455488070870105, + "grad_norm": 3.6109840869903564, + "learning_rate": 1.8205571070596867e-05, + "loss": 1.9626, + "step": 21688 + }, + { + "epoch": 1.4556222945538742, + "grad_norm": 4.356677055358887, + "learning_rate": 1.8197183870760803e-05, + "loss": 1.8959, + "step": 21690 + }, + { + "epoch": 1.455756518237643, + "grad_norm": 4.091159343719482, + "learning_rate": 1.818879817354528e-05, + "loss": 1.9591, + "step": 21692 + }, + { + "epoch": 1.455890741921412, + "grad_norm": 4.312813758850098, + "learning_rate": 1.818041397934647e-05, + "loss": 1.6465, + "step": 21694 + }, + { + "epoch": 1.456024965605181, + "grad_norm": 4.034827709197998, + "learning_rate": 1.8172031288560554e-05, + "loss": 1.8174, + "step": 21696 + }, + { + "epoch": 1.45615918928895, + "grad_norm": 4.020444393157959, + "learning_rate": 1.8163650101583567e-05, + "loss": 2.2721, + "step": 21698 + }, + { + "epoch": 1.456293412972719, + "grad_norm": 4.4025678634643555, + "learning_rate": 1.815527041881151e-05, + "loss": 1.8615, + "step": 21700 + }, + { + "epoch": 1.4564276366564881, + "grad_norm": 5.465107440948486, + "learning_rate": 1.8146892240640307e-05, + "loss": 1.8897, + "step": 21702 + }, + { + "epoch": 1.456561860340257, + "grad_norm": 4.428912162780762, + "learning_rate": 1.8138515567465793e-05, + "loss": 2.0627, + "step": 21704 + }, + { + "epoch": 1.456696084024026, + "grad_norm": 4.37305212020874, + "learning_rate": 1.8130140399683776e-05, + "loss": 1.7312, + "step": 21706 + }, + { + "epoch": 1.456830307707795, + "grad_norm": 3.9720499515533447, + "learning_rate": 1.812176673768996e-05, + "loss": 2.0123, + "step": 21708 + }, + { + "epoch": 1.456964531391564, + "grad_norm": 4.109086513519287, + "learning_rate": 1.8113394581879985e-05, + "loss": 1.9695, + "step": 21710 + }, + { + "epoch": 1.4570987550753332, + "grad_norm": 3.359098196029663, + "learning_rate": 1.810502393264939e-05, + "loss": 1.7101, + "step": 21712 + }, + { + "epoch": 1.457232978759102, + "grad_norm": 4.013128280639648, + "learning_rate": 1.8096654790393714e-05, + "loss": 1.8118, + "step": 21714 + }, + { + "epoch": 1.457367202442871, + "grad_norm": 4.2526068687438965, + "learning_rate": 1.808828715550836e-05, + "loss": 2.0345, + "step": 21716 + }, + { + "epoch": 1.45750142612664, + "grad_norm": 3.6479480266571045, + "learning_rate": 1.8079921028388692e-05, + "loss": 1.8969, + "step": 21718 + }, + { + "epoch": 1.457635649810409, + "grad_norm": 3.996673583984375, + "learning_rate": 1.8071556409429968e-05, + "loss": 1.7612, + "step": 21720 + }, + { + "epoch": 1.457769873494178, + "grad_norm": 4.365468978881836, + "learning_rate": 1.8063193299027436e-05, + "loss": 2.3319, + "step": 21722 + }, + { + "epoch": 1.4579040971779471, + "grad_norm": 4.361235618591309, + "learning_rate": 1.8054831697576203e-05, + "loss": 1.9211, + "step": 21724 + }, + { + "epoch": 1.458038320861716, + "grad_norm": 4.065951824188232, + "learning_rate": 1.8046471605471393e-05, + "loss": 1.9166, + "step": 21726 + }, + { + "epoch": 1.458172544545485, + "grad_norm": 3.659872055053711, + "learning_rate": 1.8038113023107928e-05, + "loss": 2.0153, + "step": 21728 + }, + { + "epoch": 1.458306768229254, + "grad_norm": 3.93522572517395, + "learning_rate": 1.8029755950880787e-05, + "loss": 1.8421, + "step": 21730 + }, + { + "epoch": 1.458440991913023, + "grad_norm": 3.8999781608581543, + "learning_rate": 1.8021400389184795e-05, + "loss": 2.0245, + "step": 21732 + }, + { + "epoch": 1.4585752155967922, + "grad_norm": 4.857063293457031, + "learning_rate": 1.801304633841477e-05, + "loss": 2.3869, + "step": 21734 + }, + { + "epoch": 1.458709439280561, + "grad_norm": 4.18338680267334, + "learning_rate": 1.8004693798965404e-05, + "loss": 2.2084, + "step": 21736 + }, + { + "epoch": 1.45884366296433, + "grad_norm": 4.277611255645752, + "learning_rate": 1.799634277123134e-05, + "loss": 1.8903, + "step": 21738 + }, + { + "epoch": 1.4589778866480991, + "grad_norm": 4.2307209968566895, + "learning_rate": 1.7987993255607132e-05, + "loss": 2.1118, + "step": 21740 + }, + { + "epoch": 1.459112110331868, + "grad_norm": 3.7161953449249268, + "learning_rate": 1.797964525248731e-05, + "loss": 1.7379, + "step": 21742 + }, + { + "epoch": 1.4592463340156372, + "grad_norm": 4.290340423583984, + "learning_rate": 1.7971298762266287e-05, + "loss": 2.0428, + "step": 21744 + }, + { + "epoch": 1.4593805576994061, + "grad_norm": 4.236762523651123, + "learning_rate": 1.796295378533841e-05, + "loss": 1.7452, + "step": 21746 + }, + { + "epoch": 1.459514781383175, + "grad_norm": 3.8375580310821533, + "learning_rate": 1.7954610322097953e-05, + "loss": 1.8508, + "step": 21748 + }, + { + "epoch": 1.459649005066944, + "grad_norm": 4.351699352264404, + "learning_rate": 1.794626837293916e-05, + "loss": 2.0508, + "step": 21750 + }, + { + "epoch": 1.459783228750713, + "grad_norm": 4.351319789886475, + "learning_rate": 1.7937927938256143e-05, + "loss": 1.8459, + "step": 21752 + }, + { + "epoch": 1.459917452434482, + "grad_norm": 4.69903039932251, + "learning_rate": 1.7929589018443016e-05, + "loss": 1.8145, + "step": 21754 + }, + { + "epoch": 1.4600516761182512, + "grad_norm": 4.340230464935303, + "learning_rate": 1.7921251613893715e-05, + "loss": 2.1451, + "step": 21756 + }, + { + "epoch": 1.46018589980202, + "grad_norm": 4.072259426116943, + "learning_rate": 1.7912915725002205e-05, + "loss": 2.0094, + "step": 21758 + }, + { + "epoch": 1.460320123485789, + "grad_norm": 4.063436985015869, + "learning_rate": 1.7904581352162312e-05, + "loss": 2.0955, + "step": 21760 + }, + { + "epoch": 1.4604543471695581, + "grad_norm": 3.291503667831421, + "learning_rate": 1.789624849576786e-05, + "loss": 1.6355, + "step": 21762 + }, + { + "epoch": 1.460588570853327, + "grad_norm": 4.5841383934021, + "learning_rate": 1.7887917156212532e-05, + "loss": 2.0006, + "step": 21764 + }, + { + "epoch": 1.4607227945370962, + "grad_norm": 4.128590106964111, + "learning_rate": 1.7879587333889975e-05, + "loss": 1.8528, + "step": 21766 + }, + { + "epoch": 1.4608570182208651, + "grad_norm": 4.270394325256348, + "learning_rate": 1.7871259029193754e-05, + "loss": 1.9073, + "step": 21768 + }, + { + "epoch": 1.460991241904634, + "grad_norm": 4.088672161102295, + "learning_rate": 1.786293224251735e-05, + "loss": 1.7931, + "step": 21770 + }, + { + "epoch": 1.461125465588403, + "grad_norm": 3.322438955307007, + "learning_rate": 1.785460697425422e-05, + "loss": 1.578, + "step": 21772 + }, + { + "epoch": 1.461259689272172, + "grad_norm": 4.145533084869385, + "learning_rate": 1.7846283224797698e-05, + "loss": 1.8903, + "step": 21774 + }, + { + "epoch": 1.461393912955941, + "grad_norm": 4.164388179779053, + "learning_rate": 1.7837960994541063e-05, + "loss": 1.9553, + "step": 21776 + }, + { + "epoch": 1.4615281366397102, + "grad_norm": 4.045705795288086, + "learning_rate": 1.7829640283877514e-05, + "loss": 1.9972, + "step": 21778 + }, + { + "epoch": 1.461662360323479, + "grad_norm": 3.8717153072357178, + "learning_rate": 1.7821321093200217e-05, + "loss": 1.87, + "step": 21780 + }, + { + "epoch": 1.461796584007248, + "grad_norm": 3.9834444522857666, + "learning_rate": 1.7813003422902224e-05, + "loss": 1.7146, + "step": 21782 + }, + { + "epoch": 1.4619308076910171, + "grad_norm": 3.566554069519043, + "learning_rate": 1.7804687273376526e-05, + "loss": 1.6711, + "step": 21784 + }, + { + "epoch": 1.462065031374786, + "grad_norm": 4.23741340637207, + "learning_rate": 1.7796372645016024e-05, + "loss": 2.2492, + "step": 21786 + }, + { + "epoch": 1.4621992550585552, + "grad_norm": 4.26492977142334, + "learning_rate": 1.778805953821361e-05, + "loss": 1.8307, + "step": 21788 + }, + { + "epoch": 1.462333478742324, + "grad_norm": 3.698054790496826, + "learning_rate": 1.777974795336202e-05, + "loss": 2.0183, + "step": 21790 + }, + { + "epoch": 1.462467702426093, + "grad_norm": 4.496737480163574, + "learning_rate": 1.7771437890854e-05, + "loss": 1.786, + "step": 21792 + }, + { + "epoch": 1.462601926109862, + "grad_norm": 4.252647876739502, + "learning_rate": 1.7763129351082165e-05, + "loss": 1.8625, + "step": 21794 + }, + { + "epoch": 1.462736149793631, + "grad_norm": 4.302008152008057, + "learning_rate": 1.7754822334439075e-05, + "loss": 1.9085, + "step": 21796 + }, + { + "epoch": 1.4628703734774, + "grad_norm": 4.36489725112915, + "learning_rate": 1.7746516841317207e-05, + "loss": 1.8767, + "step": 21798 + }, + { + "epoch": 1.4630045971611692, + "grad_norm": 3.7617173194885254, + "learning_rate": 1.773821287210901e-05, + "loss": 2.0032, + "step": 21800 + }, + { + "epoch": 1.463138820844938, + "grad_norm": 4.616260051727295, + "learning_rate": 1.772991042720682e-05, + "loss": 2.0849, + "step": 21802 + }, + { + "epoch": 1.463273044528707, + "grad_norm": 3.9260990619659424, + "learning_rate": 1.77216095070029e-05, + "loss": 1.8606, + "step": 21804 + }, + { + "epoch": 1.4634072682124761, + "grad_norm": 4.27295446395874, + "learning_rate": 1.7713310111889443e-05, + "loss": 2.1264, + "step": 21806 + }, + { + "epoch": 1.463541491896245, + "grad_norm": 3.6134917736053467, + "learning_rate": 1.7705012242258613e-05, + "loss": 2.0018, + "step": 21808 + }, + { + "epoch": 1.4636757155800142, + "grad_norm": 4.112629413604736, + "learning_rate": 1.769671589850243e-05, + "loss": 1.9865, + "step": 21810 + }, + { + "epoch": 1.463809939263783, + "grad_norm": 3.665358781814575, + "learning_rate": 1.768842108101293e-05, + "loss": 1.6673, + "step": 21812 + }, + { + "epoch": 1.463944162947552, + "grad_norm": 3.7102129459381104, + "learning_rate": 1.7680127790181967e-05, + "loss": 1.7674, + "step": 21814 + }, + { + "epoch": 1.4640783866313212, + "grad_norm": 4.147026538848877, + "learning_rate": 1.767183602640143e-05, + "loss": 2.1789, + "step": 21816 + }, + { + "epoch": 1.46421261031509, + "grad_norm": 3.170729875564575, + "learning_rate": 1.7663545790063047e-05, + "loss": 1.9615, + "step": 21818 + }, + { + "epoch": 1.4643468339988592, + "grad_norm": 4.050707817077637, + "learning_rate": 1.765525708155856e-05, + "loss": 1.9784, + "step": 21820 + }, + { + "epoch": 1.4644810576826282, + "grad_norm": 3.6955010890960693, + "learning_rate": 1.764696990127957e-05, + "loss": 1.7308, + "step": 21822 + }, + { + "epoch": 1.464615281366397, + "grad_norm": 3.6782374382019043, + "learning_rate": 1.7638684249617632e-05, + "loss": 1.6894, + "step": 21824 + }, + { + "epoch": 1.464749505050166, + "grad_norm": 4.513892650604248, + "learning_rate": 1.7630400126964214e-05, + "loss": 1.8199, + "step": 21826 + }, + { + "epoch": 1.4648837287339351, + "grad_norm": 4.4129204750061035, + "learning_rate": 1.7622117533710752e-05, + "loss": 1.8711, + "step": 21828 + }, + { + "epoch": 1.465017952417704, + "grad_norm": 4.5180583000183105, + "learning_rate": 1.7613836470248575e-05, + "loss": 1.6742, + "step": 21830 + }, + { + "epoch": 1.4651521761014732, + "grad_norm": 8.419897079467773, + "learning_rate": 1.760555693696893e-05, + "loss": 1.9381, + "step": 21832 + }, + { + "epoch": 1.465286399785242, + "grad_norm": 3.856956958770752, + "learning_rate": 1.7597278934263007e-05, + "loss": 2.0679, + "step": 21834 + }, + { + "epoch": 1.465420623469011, + "grad_norm": 4.4424872398376465, + "learning_rate": 1.7589002462521954e-05, + "loss": 1.8933, + "step": 21836 + }, + { + "epoch": 1.4655548471527802, + "grad_norm": 3.9072043895721436, + "learning_rate": 1.7580727522136804e-05, + "loss": 1.7799, + "step": 21838 + }, + { + "epoch": 1.465689070836549, + "grad_norm": 3.9837162494659424, + "learning_rate": 1.757245411349852e-05, + "loss": 2.0833, + "step": 21840 + }, + { + "epoch": 1.4658232945203182, + "grad_norm": 4.794734954833984, + "learning_rate": 1.7564182236998024e-05, + "loss": 2.1243, + "step": 21842 + }, + { + "epoch": 1.4659575182040872, + "grad_norm": 4.225294589996338, + "learning_rate": 1.755591189302611e-05, + "loss": 2.1807, + "step": 21844 + }, + { + "epoch": 1.466091741887856, + "grad_norm": 4.156105041503906, + "learning_rate": 1.754764308197358e-05, + "loss": 2.0964, + "step": 21846 + }, + { + "epoch": 1.466225965571625, + "grad_norm": 3.6195297241210938, + "learning_rate": 1.7539375804231083e-05, + "loss": 1.9586, + "step": 21848 + }, + { + "epoch": 1.4663601892553941, + "grad_norm": 4.3612494468688965, + "learning_rate": 1.7531110060189283e-05, + "loss": 2.144, + "step": 21850 + }, + { + "epoch": 1.466494412939163, + "grad_norm": 4.100998878479004, + "learning_rate": 1.752284585023865e-05, + "loss": 1.8349, + "step": 21852 + }, + { + "epoch": 1.4666286366229322, + "grad_norm": 4.6018829345703125, + "learning_rate": 1.75145831747697e-05, + "loss": 1.9604, + "step": 21854 + }, + { + "epoch": 1.466762860306701, + "grad_norm": 4.065568923950195, + "learning_rate": 1.7506322034172808e-05, + "loss": 2.147, + "step": 21856 + }, + { + "epoch": 1.46689708399047, + "grad_norm": 3.542161464691162, + "learning_rate": 1.7498062428838314e-05, + "loss": 1.7443, + "step": 21858 + }, + { + "epoch": 1.4670313076742392, + "grad_norm": 3.914262533187866, + "learning_rate": 1.7489804359156458e-05, + "loss": 2.0129, + "step": 21860 + }, + { + "epoch": 1.467165531358008, + "grad_norm": 4.646210670471191, + "learning_rate": 1.748154782551742e-05, + "loss": 2.0097, + "step": 21862 + }, + { + "epoch": 1.4672997550417772, + "grad_norm": 3.910086154937744, + "learning_rate": 1.7473292828311282e-05, + "loss": 2.015, + "step": 21864 + }, + { + "epoch": 1.4674339787255462, + "grad_norm": 4.18550968170166, + "learning_rate": 1.7465039367928117e-05, + "loss": 1.8872, + "step": 21866 + }, + { + "epoch": 1.467568202409315, + "grad_norm": 4.028215408325195, + "learning_rate": 1.745678744475786e-05, + "loss": 1.8127, + "step": 21868 + }, + { + "epoch": 1.467702426093084, + "grad_norm": 4.491466999053955, + "learning_rate": 1.7448537059190407e-05, + "loss": 1.9423, + "step": 21870 + }, + { + "epoch": 1.4678366497768531, + "grad_norm": 4.247272968292236, + "learning_rate": 1.7440288211615553e-05, + "loss": 2.3624, + "step": 21872 + }, + { + "epoch": 1.467970873460622, + "grad_norm": 4.5888237953186035, + "learning_rate": 1.743204090242307e-05, + "loss": 1.9365, + "step": 21874 + }, + { + "epoch": 1.4681050971443912, + "grad_norm": 4.581789493560791, + "learning_rate": 1.742379513200259e-05, + "loss": 2.1421, + "step": 21876 + }, + { + "epoch": 1.46823932082816, + "grad_norm": 3.3861029148101807, + "learning_rate": 1.741555090074377e-05, + "loss": 1.8617, + "step": 21878 + }, + { + "epoch": 1.468373544511929, + "grad_norm": 3.7286155223846436, + "learning_rate": 1.7407308209036066e-05, + "loss": 1.8276, + "step": 21880 + }, + { + "epoch": 1.4685077681956982, + "grad_norm": 4.129473686218262, + "learning_rate": 1.739906705726897e-05, + "loss": 1.9602, + "step": 21882 + }, + { + "epoch": 1.468641991879467, + "grad_norm": 4.139833450317383, + "learning_rate": 1.739082744583183e-05, + "loss": 1.8883, + "step": 21884 + }, + { + "epoch": 1.4687762155632362, + "grad_norm": 3.519582748413086, + "learning_rate": 1.738258937511399e-05, + "loss": 1.7637, + "step": 21886 + }, + { + "epoch": 1.4689104392470052, + "grad_norm": 4.330551624298096, + "learning_rate": 1.737435284550466e-05, + "loss": 1.9623, + "step": 21888 + }, + { + "epoch": 1.469044662930774, + "grad_norm": 4.721566200256348, + "learning_rate": 1.7366117857393007e-05, + "loss": 1.8926, + "step": 21890 + }, + { + "epoch": 1.4691788866145432, + "grad_norm": 4.089926242828369, + "learning_rate": 1.7357884411168097e-05, + "loss": 1.7462, + "step": 21892 + }, + { + "epoch": 1.4693131102983121, + "grad_norm": 4.830456256866455, + "learning_rate": 1.734965250721897e-05, + "loss": 2.0671, + "step": 21894 + }, + { + "epoch": 1.4694473339820813, + "grad_norm": 4.409668922424316, + "learning_rate": 1.7341422145934568e-05, + "loss": 1.9584, + "step": 21896 + }, + { + "epoch": 1.4695815576658502, + "grad_norm": 3.978074073791504, + "learning_rate": 1.7333193327703738e-05, + "loss": 1.798, + "step": 21898 + }, + { + "epoch": 1.469715781349619, + "grad_norm": 3.7917563915252686, + "learning_rate": 1.7324966052915274e-05, + "loss": 1.7303, + "step": 21900 + }, + { + "epoch": 1.469850005033388, + "grad_norm": 4.260056495666504, + "learning_rate": 1.7316740321957937e-05, + "loss": 2.0292, + "step": 21902 + }, + { + "epoch": 1.4699842287171572, + "grad_norm": 4.312327861785889, + "learning_rate": 1.7308516135220325e-05, + "loss": 2.0924, + "step": 21904 + }, + { + "epoch": 1.470118452400926, + "grad_norm": 4.197505474090576, + "learning_rate": 1.7300293493091074e-05, + "loss": 1.8813, + "step": 21906 + }, + { + "epoch": 1.4702526760846952, + "grad_norm": 3.858762502670288, + "learning_rate": 1.7292072395958643e-05, + "loss": 1.7147, + "step": 21908 + }, + { + "epoch": 1.4703868997684642, + "grad_norm": 4.3566741943359375, + "learning_rate": 1.728385284421145e-05, + "loss": 2.038, + "step": 21910 + }, + { + "epoch": 1.470521123452233, + "grad_norm": 4.416569709777832, + "learning_rate": 1.7275634838237897e-05, + "loss": 2.1136, + "step": 21912 + }, + { + "epoch": 1.4706553471360022, + "grad_norm": 3.94740891456604, + "learning_rate": 1.7267418378426232e-05, + "loss": 1.9223, + "step": 21914 + }, + { + "epoch": 1.4707895708197711, + "grad_norm": 4.761109828948975, + "learning_rate": 1.72592034651647e-05, + "loss": 2.1608, + "step": 21916 + }, + { + "epoch": 1.4709237945035403, + "grad_norm": 3.932337999343872, + "learning_rate": 1.7250990098841417e-05, + "loss": 1.9565, + "step": 21918 + }, + { + "epoch": 1.4710580181873092, + "grad_norm": 3.931513547897339, + "learning_rate": 1.7242778279844463e-05, + "loss": 2.0329, + "step": 21920 + }, + { + "epoch": 1.471192241871078, + "grad_norm": 3.8109991550445557, + "learning_rate": 1.7234568008561797e-05, + "loss": 1.6507, + "step": 21922 + }, + { + "epoch": 1.471326465554847, + "grad_norm": 3.5518369674682617, + "learning_rate": 1.722635928538138e-05, + "loss": 2.002, + "step": 21924 + }, + { + "epoch": 1.4714606892386162, + "grad_norm": 4.663365840911865, + "learning_rate": 1.7218152110691044e-05, + "loss": 1.8593, + "step": 21926 + }, + { + "epoch": 1.471594912922385, + "grad_norm": 4.502135276794434, + "learning_rate": 1.7209946484878554e-05, + "loss": 1.8324, + "step": 21928 + }, + { + "epoch": 1.4717291366061542, + "grad_norm": 4.595855236053467, + "learning_rate": 1.72017424083316e-05, + "loss": 1.915, + "step": 21930 + }, + { + "epoch": 1.4718633602899232, + "grad_norm": 4.042469501495361, + "learning_rate": 1.7193539881437837e-05, + "loss": 1.7024, + "step": 21932 + }, + { + "epoch": 1.471997583973692, + "grad_norm": 4.308805465698242, + "learning_rate": 1.71853389045848e-05, + "loss": 1.9532, + "step": 21934 + }, + { + "epoch": 1.4721318076574612, + "grad_norm": 4.039844512939453, + "learning_rate": 1.7177139478159977e-05, + "loss": 1.92, + "step": 21936 + }, + { + "epoch": 1.4722660313412301, + "grad_norm": 3.7954490184783936, + "learning_rate": 1.7168941602550754e-05, + "loss": 1.9853, + "step": 21938 + }, + { + "epoch": 1.4724002550249993, + "grad_norm": 4.0802998542785645, + "learning_rate": 1.7160745278144498e-05, + "loss": 1.7984, + "step": 21940 + }, + { + "epoch": 1.4725344787087682, + "grad_norm": 3.952124834060669, + "learning_rate": 1.715255050532843e-05, + "loss": 1.8231, + "step": 21942 + }, + { + "epoch": 1.472668702392537, + "grad_norm": 4.0561089515686035, + "learning_rate": 1.7144357284489782e-05, + "loss": 1.8802, + "step": 21944 + }, + { + "epoch": 1.472802926076306, + "grad_norm": 4.025638103485107, + "learning_rate": 1.7136165616015636e-05, + "loss": 2.0385, + "step": 21946 + }, + { + "epoch": 1.4729371497600752, + "grad_norm": 4.930600166320801, + "learning_rate": 1.712797550029305e-05, + "loss": 1.9071, + "step": 21948 + }, + { + "epoch": 1.473071373443844, + "grad_norm": 4.735400676727295, + "learning_rate": 1.711978693770896e-05, + "loss": 2.0361, + "step": 21950 + }, + { + "epoch": 1.4732055971276132, + "grad_norm": 4.540361404418945, + "learning_rate": 1.7111599928650302e-05, + "loss": 1.8371, + "step": 21952 + }, + { + "epoch": 1.4733398208113822, + "grad_norm": 3.9385697841644287, + "learning_rate": 1.710341447350387e-05, + "loss": 2.1408, + "step": 21954 + }, + { + "epoch": 1.473474044495151, + "grad_norm": 8.01272201538086, + "learning_rate": 1.7095230572656418e-05, + "loss": 2.0405, + "step": 21956 + }, + { + "epoch": 1.4736082681789202, + "grad_norm": 4.048547267913818, + "learning_rate": 1.70870482264946e-05, + "loss": 1.8489, + "step": 21958 + }, + { + "epoch": 1.4737424918626891, + "grad_norm": 4.612934112548828, + "learning_rate": 1.707886743540505e-05, + "loss": 2.1464, + "step": 21960 + }, + { + "epoch": 1.4738767155464583, + "grad_norm": 4.4074788093566895, + "learning_rate": 1.7070688199774275e-05, + "loss": 1.9998, + "step": 21962 + }, + { + "epoch": 1.4740109392302272, + "grad_norm": 4.340243339538574, + "learning_rate": 1.7062510519988727e-05, + "loss": 1.9451, + "step": 21964 + }, + { + "epoch": 1.474145162913996, + "grad_norm": 4.04251766204834, + "learning_rate": 1.705433439643478e-05, + "loss": 1.8209, + "step": 21966 + }, + { + "epoch": 1.4742793865977653, + "grad_norm": 4.508136749267578, + "learning_rate": 1.704615982949876e-05, + "loss": 1.9613, + "step": 21968 + }, + { + "epoch": 1.4744136102815342, + "grad_norm": 4.516005039215088, + "learning_rate": 1.703798681956687e-05, + "loss": 1.9568, + "step": 21970 + }, + { + "epoch": 1.4745478339653033, + "grad_norm": 3.8290348052978516, + "learning_rate": 1.7029815367025304e-05, + "loss": 1.8921, + "step": 21972 + }, + { + "epoch": 1.4746820576490722, + "grad_norm": 4.378920078277588, + "learning_rate": 1.7021645472260146e-05, + "loss": 2.1937, + "step": 21974 + }, + { + "epoch": 1.4748162813328412, + "grad_norm": 3.7343907356262207, + "learning_rate": 1.701347713565735e-05, + "loss": 2.0566, + "step": 21976 + }, + { + "epoch": 1.47495050501661, + "grad_norm": 4.582740306854248, + "learning_rate": 1.7005310357602916e-05, + "loss": 1.9912, + "step": 21978 + }, + { + "epoch": 1.4750847287003792, + "grad_norm": 4.142771244049072, + "learning_rate": 1.6997145138482674e-05, + "loss": 1.9127, + "step": 21980 + }, + { + "epoch": 1.4752189523841481, + "grad_norm": 4.2628607749938965, + "learning_rate": 1.6988981478682436e-05, + "loss": 2.0057, + "step": 21982 + }, + { + "epoch": 1.4753531760679173, + "grad_norm": 3.838200807571411, + "learning_rate": 1.6980819378587915e-05, + "loss": 1.8674, + "step": 21984 + }, + { + "epoch": 1.4754873997516862, + "grad_norm": 4.358355522155762, + "learning_rate": 1.697265883858475e-05, + "loss": 2.0728, + "step": 21986 + }, + { + "epoch": 1.475621623435455, + "grad_norm": 4.224380016326904, + "learning_rate": 1.6964499859058485e-05, + "loss": 2.0814, + "step": 21988 + }, + { + "epoch": 1.4757558471192243, + "grad_norm": 4.615037441253662, + "learning_rate": 1.6956342440394663e-05, + "loss": 1.9629, + "step": 21990 + }, + { + "epoch": 1.4758900708029932, + "grad_norm": 4.348577976226807, + "learning_rate": 1.6948186582978683e-05, + "loss": 2.044, + "step": 21992 + }, + { + "epoch": 1.4760242944867623, + "grad_norm": 3.923492431640625, + "learning_rate": 1.694003228719589e-05, + "loss": 1.8777, + "step": 21994 + }, + { + "epoch": 1.4761585181705312, + "grad_norm": 4.5695719718933105, + "learning_rate": 1.693187955343155e-05, + "loss": 2.2444, + "step": 21996 + }, + { + "epoch": 1.4762927418543002, + "grad_norm": 3.538875102996826, + "learning_rate": 1.6923728382070886e-05, + "loss": 1.7221, + "step": 21998 + }, + { + "epoch": 1.476426965538069, + "grad_norm": 3.888209581375122, + "learning_rate": 1.6915578773499003e-05, + "loss": 2.105, + "step": 22000 + }, + { + "epoch": 1.4765611892218382, + "grad_norm": 4.112044334411621, + "learning_rate": 1.6907430728101e-05, + "loss": 1.8627, + "step": 22002 + }, + { + "epoch": 1.4766954129056071, + "grad_norm": 3.743859052658081, + "learning_rate": 1.689928424626178e-05, + "loss": 2.0159, + "step": 22004 + }, + { + "epoch": 1.4768296365893763, + "grad_norm": 4.212174415588379, + "learning_rate": 1.6891139328366313e-05, + "loss": 1.9951, + "step": 22006 + }, + { + "epoch": 1.4769638602731452, + "grad_norm": 4.128251552581787, + "learning_rate": 1.6882995974799387e-05, + "loss": 1.8935, + "step": 22008 + }, + { + "epoch": 1.477098083956914, + "grad_norm": 4.290196418762207, + "learning_rate": 1.68748541859458e-05, + "loss": 2.0137, + "step": 22010 + }, + { + "epoch": 1.4772323076406833, + "grad_norm": 3.8535380363464355, + "learning_rate": 1.686671396219021e-05, + "loss": 2.059, + "step": 22012 + }, + { + "epoch": 1.4773665313244522, + "grad_norm": 4.333270072937012, + "learning_rate": 1.6858575303917234e-05, + "loss": 1.9113, + "step": 22014 + }, + { + "epoch": 1.4775007550082213, + "grad_norm": 3.781531572341919, + "learning_rate": 1.685043821151139e-05, + "loss": 1.953, + "step": 22016 + }, + { + "epoch": 1.4776349786919902, + "grad_norm": 4.485109806060791, + "learning_rate": 1.6842302685357165e-05, + "loss": 1.8895, + "step": 22018 + }, + { + "epoch": 1.4777692023757591, + "grad_norm": 4.054008483886719, + "learning_rate": 1.683416872583894e-05, + "loss": 1.978, + "step": 22020 + }, + { + "epoch": 1.477903426059528, + "grad_norm": 4.205047130584717, + "learning_rate": 1.6826036333341027e-05, + "loss": 1.9959, + "step": 22022 + }, + { + "epoch": 1.4780376497432972, + "grad_norm": 3.4715986251831055, + "learning_rate": 1.6817905508247643e-05, + "loss": 2.0216, + "step": 22024 + }, + { + "epoch": 1.4781718734270661, + "grad_norm": 4.020529270172119, + "learning_rate": 1.6809776250942995e-05, + "loss": 1.8548, + "step": 22026 + }, + { + "epoch": 1.4783060971108353, + "grad_norm": 4.181562900543213, + "learning_rate": 1.6801648561811133e-05, + "loss": 1.9017, + "step": 22028 + }, + { + "epoch": 1.4784403207946042, + "grad_norm": 3.9224865436553955, + "learning_rate": 1.679352244123613e-05, + "loss": 2.1884, + "step": 22030 + }, + { + "epoch": 1.478574544478373, + "grad_norm": 4.267264366149902, + "learning_rate": 1.678539788960186e-05, + "loss": 1.6648, + "step": 22032 + }, + { + "epoch": 1.4787087681621423, + "grad_norm": 4.4839582443237305, + "learning_rate": 1.6777274907292245e-05, + "loss": 2.0319, + "step": 22034 + }, + { + "epoch": 1.4788429918459112, + "grad_norm": 4.342587947845459, + "learning_rate": 1.6769153494691043e-05, + "loss": 2.068, + "step": 22036 + }, + { + "epoch": 1.4789772155296803, + "grad_norm": 4.28007698059082, + "learning_rate": 1.6761033652182008e-05, + "loss": 2.0163, + "step": 22038 + }, + { + "epoch": 1.4791114392134492, + "grad_norm": 7.561879634857178, + "learning_rate": 1.675291538014877e-05, + "loss": 1.8482, + "step": 22040 + }, + { + "epoch": 1.4792456628972181, + "grad_norm": 4.294271945953369, + "learning_rate": 1.6744798678974906e-05, + "loss": 1.8672, + "step": 22042 + }, + { + "epoch": 1.4793798865809873, + "grad_norm": 4.372527599334717, + "learning_rate": 1.673668354904391e-05, + "loss": 1.934, + "step": 22044 + }, + { + "epoch": 1.4795141102647562, + "grad_norm": 4.191868782043457, + "learning_rate": 1.672856999073919e-05, + "loss": 1.83, + "step": 22046 + }, + { + "epoch": 1.4796483339485254, + "grad_norm": 4.452010154724121, + "learning_rate": 1.672045800444413e-05, + "loss": 1.9101, + "step": 22048 + }, + { + "epoch": 1.4797825576322943, + "grad_norm": 4.126922130584717, + "learning_rate": 1.6712347590541983e-05, + "loss": 1.9937, + "step": 22050 + }, + { + "epoch": 1.4799167813160632, + "grad_norm": 3.8967831134796143, + "learning_rate": 1.6704238749415957e-05, + "loss": 1.835, + "step": 22052 + }, + { + "epoch": 1.480051004999832, + "grad_norm": 3.7992687225341797, + "learning_rate": 1.6696131481449162e-05, + "loss": 2.1846, + "step": 22054 + }, + { + "epoch": 1.4801852286836013, + "grad_norm": 4.393941402435303, + "learning_rate": 1.668802578702468e-05, + "loss": 1.9375, + "step": 22056 + }, + { + "epoch": 1.4803194523673702, + "grad_norm": 4.199316501617432, + "learning_rate": 1.6679921666525476e-05, + "loss": 1.7858, + "step": 22058 + }, + { + "epoch": 1.4804536760511393, + "grad_norm": 3.879398822784424, + "learning_rate": 1.6671819120334454e-05, + "loss": 1.8148, + "step": 22060 + }, + { + "epoch": 1.4805878997349082, + "grad_norm": 4.8560099601745605, + "learning_rate": 1.6663718148834423e-05, + "loss": 2.0267, + "step": 22062 + }, + { + "epoch": 1.4807221234186771, + "grad_norm": 4.392063140869141, + "learning_rate": 1.6655618752408176e-05, + "loss": 1.6637, + "step": 22064 + }, + { + "epoch": 1.4808563471024463, + "grad_norm": 3.955936908721924, + "learning_rate": 1.6647520931438355e-05, + "loss": 1.9957, + "step": 22066 + }, + { + "epoch": 1.4809905707862152, + "grad_norm": 4.445084571838379, + "learning_rate": 1.6639424686307613e-05, + "loss": 2.151, + "step": 22068 + }, + { + "epoch": 1.4811247944699844, + "grad_norm": 3.4342544078826904, + "learning_rate": 1.663133001739843e-05, + "loss": 1.6516, + "step": 22070 + }, + { + "epoch": 1.4812590181537533, + "grad_norm": 3.900529384613037, + "learning_rate": 1.6623236925093293e-05, + "loss": 2.0069, + "step": 22072 + }, + { + "epoch": 1.4813932418375222, + "grad_norm": 3.7821099758148193, + "learning_rate": 1.661514540977457e-05, + "loss": 1.7453, + "step": 22074 + }, + { + "epoch": 1.481527465521291, + "grad_norm": 4.011965751647949, + "learning_rate": 1.6607055471824595e-05, + "loss": 1.9228, + "step": 22076 + }, + { + "epoch": 1.4816616892050603, + "grad_norm": 3.909048080444336, + "learning_rate": 1.659896711162558e-05, + "loss": 1.9543, + "step": 22078 + }, + { + "epoch": 1.4817959128888292, + "grad_norm": 3.8074450492858887, + "learning_rate": 1.6590880329559692e-05, + "loss": 2.1138, + "step": 22080 + }, + { + "epoch": 1.4819301365725983, + "grad_norm": 3.9071998596191406, + "learning_rate": 1.658279512600899e-05, + "loss": 1.8419, + "step": 22082 + }, + { + "epoch": 1.4820643602563672, + "grad_norm": 4.3846940994262695, + "learning_rate": 1.6574711501355534e-05, + "loss": 1.9693, + "step": 22084 + }, + { + "epoch": 1.4821985839401361, + "grad_norm": 4.317813873291016, + "learning_rate": 1.6566629455981218e-05, + "loss": 1.6876, + "step": 22086 + }, + { + "epoch": 1.4823328076239053, + "grad_norm": 3.8159542083740234, + "learning_rate": 1.6558548990267925e-05, + "loss": 1.7014, + "step": 22088 + }, + { + "epoch": 1.4824670313076742, + "grad_norm": 4.290137767791748, + "learning_rate": 1.6550470104597417e-05, + "loss": 1.8803, + "step": 22090 + }, + { + "epoch": 1.4826012549914434, + "grad_norm": 4.188060760498047, + "learning_rate": 1.654239279935143e-05, + "loss": 1.7132, + "step": 22092 + }, + { + "epoch": 1.4827354786752123, + "grad_norm": 3.7532272338867188, + "learning_rate": 1.6534317074911582e-05, + "loss": 1.9448, + "step": 22094 + }, + { + "epoch": 1.4828697023589812, + "grad_norm": 4.051297187805176, + "learning_rate": 1.652624293165946e-05, + "loss": 1.7039, + "step": 22096 + }, + { + "epoch": 1.48300392604275, + "grad_norm": 4.019271373748779, + "learning_rate": 1.6518170369976532e-05, + "loss": 1.8435, + "step": 22098 + }, + { + "epoch": 1.4831381497265193, + "grad_norm": 3.7992124557495117, + "learning_rate": 1.651009939024422e-05, + "loss": 1.7098, + "step": 22100 + }, + { + "epoch": 1.4832723734102882, + "grad_norm": 4.386690139770508, + "learning_rate": 1.6502029992843836e-05, + "loss": 1.766, + "step": 22102 + }, + { + "epoch": 1.4834065970940573, + "grad_norm": 4.785162925720215, + "learning_rate": 1.6493962178156685e-05, + "loss": 1.9141, + "step": 22104 + }, + { + "epoch": 1.4835408207778262, + "grad_norm": 4.63353967666626, + "learning_rate": 1.6485895946563927e-05, + "loss": 1.9832, + "step": 22106 + }, + { + "epoch": 1.4836750444615951, + "grad_norm": 3.856705665588379, + "learning_rate": 1.647783129844669e-05, + "loss": 2.0672, + "step": 22108 + }, + { + "epoch": 1.4838092681453643, + "grad_norm": 3.739302396774292, + "learning_rate": 1.646976823418599e-05, + "loss": 2.1118, + "step": 22110 + }, + { + "epoch": 1.4839434918291332, + "grad_norm": 4.038050651550293, + "learning_rate": 1.646170675416282e-05, + "loss": 1.8281, + "step": 22112 + }, + { + "epoch": 1.4840777155129024, + "grad_norm": 9.67948055267334, + "learning_rate": 1.6453646858758055e-05, + "loss": 2.1374, + "step": 22114 + }, + { + "epoch": 1.4842119391966713, + "grad_norm": 4.099985599517822, + "learning_rate": 1.644558854835251e-05, + "loss": 2.0751, + "step": 22116 + }, + { + "epoch": 1.4843461628804402, + "grad_norm": 3.7618587017059326, + "learning_rate": 1.6437531823326922e-05, + "loss": 1.9961, + "step": 22118 + }, + { + "epoch": 1.4844803865642093, + "grad_norm": 4.01106071472168, + "learning_rate": 1.6429476684061945e-05, + "loss": 1.9378, + "step": 22120 + }, + { + "epoch": 1.4846146102479783, + "grad_norm": 3.961961507797241, + "learning_rate": 1.6421423130938196e-05, + "loss": 1.9081, + "step": 22122 + }, + { + "epoch": 1.4847488339317474, + "grad_norm": 3.6298842430114746, + "learning_rate": 1.6413371164336154e-05, + "loss": 1.7336, + "step": 22124 + }, + { + "epoch": 1.4848830576155163, + "grad_norm": 3.856550455093384, + "learning_rate": 1.6405320784636318e-05, + "loss": 1.9585, + "step": 22126 + }, + { + "epoch": 1.4850172812992852, + "grad_norm": 3.4849648475646973, + "learning_rate": 1.6397271992218982e-05, + "loss": 1.6921, + "step": 22128 + }, + { + "epoch": 1.4851515049830541, + "grad_norm": 4.229583263397217, + "learning_rate": 1.638922478746448e-05, + "loss": 2.0476, + "step": 22130 + }, + { + "epoch": 1.4852857286668233, + "grad_norm": 4.025295734405518, + "learning_rate": 1.6381179170752997e-05, + "loss": 2.0023, + "step": 22132 + }, + { + "epoch": 1.4854199523505922, + "grad_norm": 3.9394731521606445, + "learning_rate": 1.6373135142464707e-05, + "loss": 1.8633, + "step": 22134 + }, + { + "epoch": 1.4855541760343614, + "grad_norm": 4.520318031311035, + "learning_rate": 1.636509270297966e-05, + "loss": 1.9541, + "step": 22136 + }, + { + "epoch": 1.4856883997181303, + "grad_norm": 5.560293197631836, + "learning_rate": 1.635705185267784e-05, + "loss": 1.9252, + "step": 22138 + }, + { + "epoch": 1.4858226234018992, + "grad_norm": 4.1524338722229, + "learning_rate": 1.634901259193915e-05, + "loss": 2.0994, + "step": 22140 + }, + { + "epoch": 1.4859568470856683, + "grad_norm": 3.8886759281158447, + "learning_rate": 1.6340974921143455e-05, + "loss": 1.6389, + "step": 22142 + }, + { + "epoch": 1.4860910707694373, + "grad_norm": 4.097020626068115, + "learning_rate": 1.6332938840670508e-05, + "loss": 1.8547, + "step": 22144 + }, + { + "epoch": 1.4862252944532064, + "grad_norm": 3.864133596420288, + "learning_rate": 1.63249043509e-05, + "loss": 2.028, + "step": 22146 + }, + { + "epoch": 1.4863595181369753, + "grad_norm": 4.282485008239746, + "learning_rate": 1.6316871452211524e-05, + "loss": 1.9695, + "step": 22148 + }, + { + "epoch": 1.4864937418207442, + "grad_norm": 4.6368408203125, + "learning_rate": 1.6308840144984656e-05, + "loss": 2.1387, + "step": 22150 + }, + { + "epoch": 1.4866279655045131, + "grad_norm": 4.516523838043213, + "learning_rate": 1.6300810429598823e-05, + "loss": 1.9515, + "step": 22152 + }, + { + "epoch": 1.4867621891882823, + "grad_norm": 3.8753156661987305, + "learning_rate": 1.6292782306433462e-05, + "loss": 1.8503, + "step": 22154 + }, + { + "epoch": 1.4868964128720512, + "grad_norm": 4.225417137145996, + "learning_rate": 1.628475577586782e-05, + "loss": 1.87, + "step": 22156 + }, + { + "epoch": 1.4870306365558204, + "grad_norm": 4.516408443450928, + "learning_rate": 1.627673083828119e-05, + "loss": 2.0121, + "step": 22158 + }, + { + "epoch": 1.4871648602395893, + "grad_norm": 3.5104589462280273, + "learning_rate": 1.62687074940527e-05, + "loss": 1.7964, + "step": 22160 + }, + { + "epoch": 1.4872990839233582, + "grad_norm": 3.6166818141937256, + "learning_rate": 1.626068574356146e-05, + "loss": 1.7593, + "step": 22162 + }, + { + "epoch": 1.4874333076071273, + "grad_norm": 4.598283290863037, + "learning_rate": 1.6252665587186477e-05, + "loss": 1.8292, + "step": 22164 + }, + { + "epoch": 1.4875675312908962, + "grad_norm": 3.761305332183838, + "learning_rate": 1.624464702530668e-05, + "loss": 1.765, + "step": 22166 + }, + { + "epoch": 1.4877017549746654, + "grad_norm": 3.9841415882110596, + "learning_rate": 1.6236630058300924e-05, + "loss": 1.8343, + "step": 22168 + }, + { + "epoch": 1.4878359786584343, + "grad_norm": 3.932185173034668, + "learning_rate": 1.6228614686548018e-05, + "loss": 1.9057, + "step": 22170 + }, + { + "epoch": 1.4879702023422032, + "grad_norm": 3.7360942363739014, + "learning_rate": 1.622060091042666e-05, + "loss": 1.9099, + "step": 22172 + }, + { + "epoch": 1.4881044260259721, + "grad_norm": 4.126918792724609, + "learning_rate": 1.621258873031548e-05, + "loss": 2.1493, + "step": 22174 + }, + { + "epoch": 1.4882386497097413, + "grad_norm": 4.4309186935424805, + "learning_rate": 1.620457814659303e-05, + "loss": 1.7422, + "step": 22176 + }, + { + "epoch": 1.4883728733935102, + "grad_norm": 3.8067617416381836, + "learning_rate": 1.6196569159637825e-05, + "loss": 2.1486, + "step": 22178 + }, + { + "epoch": 1.4885070970772794, + "grad_norm": 4.205476760864258, + "learning_rate": 1.6188561769828252e-05, + "loss": 1.6997, + "step": 22180 + }, + { + "epoch": 1.4886413207610483, + "grad_norm": 3.9959325790405273, + "learning_rate": 1.6180555977542655e-05, + "loss": 2.0081, + "step": 22182 + }, + { + "epoch": 1.4887755444448172, + "grad_norm": 3.551949977874756, + "learning_rate": 1.6172551783159278e-05, + "loss": 1.8569, + "step": 22184 + }, + { + "epoch": 1.4889097681285863, + "grad_norm": 4.134200096130371, + "learning_rate": 1.6164549187056294e-05, + "loss": 2.026, + "step": 22186 + }, + { + "epoch": 1.4890439918123552, + "grad_norm": 4.172410011291504, + "learning_rate": 1.6156548189611847e-05, + "loss": 1.7095, + "step": 22188 + }, + { + "epoch": 1.4891782154961244, + "grad_norm": 3.7881362438201904, + "learning_rate": 1.6148548791203926e-05, + "loss": 1.8797, + "step": 22190 + }, + { + "epoch": 1.4893124391798933, + "grad_norm": 4.265657901763916, + "learning_rate": 1.6140550992210545e-05, + "loss": 2.0438, + "step": 22192 + }, + { + "epoch": 1.4894466628636622, + "grad_norm": 3.6585726737976074, + "learning_rate": 1.6132554793009514e-05, + "loss": 1.748, + "step": 22194 + }, + { + "epoch": 1.4895808865474314, + "grad_norm": 4.572390556335449, + "learning_rate": 1.612456019397869e-05, + "loss": 2.0434, + "step": 22196 + }, + { + "epoch": 1.4897151102312003, + "grad_norm": 4.030580520629883, + "learning_rate": 1.6116567195495767e-05, + "loss": 2.0718, + "step": 22198 + }, + { + "epoch": 1.4898493339149694, + "grad_norm": 4.58164644241333, + "learning_rate": 1.610857579793843e-05, + "loss": 1.9745, + "step": 22200 + }, + { + "epoch": 1.4899835575987384, + "grad_norm": 4.08976936340332, + "learning_rate": 1.610058600168424e-05, + "loss": 1.8513, + "step": 22202 + }, + { + "epoch": 1.4901177812825073, + "grad_norm": 3.9999282360076904, + "learning_rate": 1.6092597807110703e-05, + "loss": 1.7651, + "step": 22204 + }, + { + "epoch": 1.4902520049662762, + "grad_norm": 4.052486896514893, + "learning_rate": 1.608461121459523e-05, + "loss": 1.9587, + "step": 22206 + }, + { + "epoch": 1.4903862286500453, + "grad_norm": 4.0680084228515625, + "learning_rate": 1.6076626224515196e-05, + "loss": 1.9123, + "step": 22208 + }, + { + "epoch": 1.4905204523338142, + "grad_norm": 4.084280967712402, + "learning_rate": 1.6068642837247872e-05, + "loss": 1.8475, + "step": 22210 + }, + { + "epoch": 1.4906546760175834, + "grad_norm": 3.7566370964050293, + "learning_rate": 1.6060661053170444e-05, + "loss": 1.7271, + "step": 22212 + }, + { + "epoch": 1.4907888997013523, + "grad_norm": 4.517791748046875, + "learning_rate": 1.6052680872660032e-05, + "loss": 1.6785, + "step": 22214 + }, + { + "epoch": 1.4909231233851212, + "grad_norm": 4.286051273345947, + "learning_rate": 1.6044702296093707e-05, + "loss": 2.3164, + "step": 22216 + }, + { + "epoch": 1.4910573470688904, + "grad_norm": 3.730619430541992, + "learning_rate": 1.603672532384841e-05, + "loss": 1.7404, + "step": 22218 + }, + { + "epoch": 1.4911915707526593, + "grad_norm": 4.261206150054932, + "learning_rate": 1.6028749956301094e-05, + "loss": 1.8019, + "step": 22220 + }, + { + "epoch": 1.4913257944364284, + "grad_norm": 3.876013994216919, + "learning_rate": 1.6020776193828506e-05, + "loss": 1.8751, + "step": 22222 + }, + { + "epoch": 1.4914600181201974, + "grad_norm": 4.372317790985107, + "learning_rate": 1.601280403680744e-05, + "loss": 2.0952, + "step": 22224 + }, + { + "epoch": 1.4915942418039663, + "grad_norm": 3.879643678665161, + "learning_rate": 1.6004833485614536e-05, + "loss": 1.8187, + "step": 22226 + }, + { + "epoch": 1.4917284654877352, + "grad_norm": 4.27821683883667, + "learning_rate": 1.5996864540626416e-05, + "loss": 2.1694, + "step": 22228 + }, + { + "epoch": 1.4918626891715043, + "grad_norm": 3.952427864074707, + "learning_rate": 1.5988897202219582e-05, + "loss": 1.6485, + "step": 22230 + }, + { + "epoch": 1.4919969128552732, + "grad_norm": 11.681438446044922, + "learning_rate": 1.5980931470770476e-05, + "loss": 2.0005, + "step": 22232 + }, + { + "epoch": 1.4921311365390424, + "grad_norm": 4.281419277191162, + "learning_rate": 1.5972967346655448e-05, + "loss": 1.9078, + "step": 22234 + }, + { + "epoch": 1.4922653602228113, + "grad_norm": 4.252011299133301, + "learning_rate": 1.5965004830250814e-05, + "loss": 2.2764, + "step": 22236 + }, + { + "epoch": 1.4923995839065802, + "grad_norm": 3.8641672134399414, + "learning_rate": 1.595704392193278e-05, + "loss": 2.0221, + "step": 22238 + }, + { + "epoch": 1.4925338075903494, + "grad_norm": 4.333716869354248, + "learning_rate": 1.5949084622077472e-05, + "loss": 2.1823, + "step": 22240 + }, + { + "epoch": 1.4926680312741183, + "grad_norm": 3.596454381942749, + "learning_rate": 1.5941126931060947e-05, + "loss": 1.9546, + "step": 22242 + }, + { + "epoch": 1.4928022549578874, + "grad_norm": 3.925058126449585, + "learning_rate": 1.593317084925921e-05, + "loss": 1.9065, + "step": 22244 + }, + { + "epoch": 1.4929364786416564, + "grad_norm": 4.363532066345215, + "learning_rate": 1.5925216377048153e-05, + "loss": 1.9511, + "step": 22246 + }, + { + "epoch": 1.4930707023254253, + "grad_norm": 3.7105515003204346, + "learning_rate": 1.5917263514803643e-05, + "loss": 1.9206, + "step": 22248 + }, + { + "epoch": 1.4932049260091942, + "grad_norm": 4.15047025680542, + "learning_rate": 1.5909312262901395e-05, + "loss": 1.8952, + "step": 22250 + }, + { + "epoch": 1.4933391496929633, + "grad_norm": 4.4344096183776855, + "learning_rate": 1.590136262171709e-05, + "loss": 2.076, + "step": 22252 + }, + { + "epoch": 1.4934733733767322, + "grad_norm": 4.179962158203125, + "learning_rate": 1.5893414591626372e-05, + "loss": 1.9279, + "step": 22254 + }, + { + "epoch": 1.4936075970605014, + "grad_norm": 4.025397300720215, + "learning_rate": 1.5885468173004726e-05, + "loss": 1.8936, + "step": 22256 + }, + { + "epoch": 1.4937418207442703, + "grad_norm": 4.403468132019043, + "learning_rate": 1.5877523366227635e-05, + "loss": 2.0784, + "step": 22258 + }, + { + "epoch": 1.4938760444280392, + "grad_norm": 3.9283719062805176, + "learning_rate": 1.586958017167047e-05, + "loss": 1.8946, + "step": 22260 + }, + { + "epoch": 1.4940102681118084, + "grad_norm": 4.326327800750732, + "learning_rate": 1.5861638589708534e-05, + "loss": 1.8026, + "step": 22262 + }, + { + "epoch": 1.4941444917955773, + "grad_norm": 4.461353302001953, + "learning_rate": 1.585369862071702e-05, + "loss": 2.0137, + "step": 22264 + }, + { + "epoch": 1.4942787154793464, + "grad_norm": 3.4351725578308105, + "learning_rate": 1.5845760265071125e-05, + "loss": 1.894, + "step": 22266 + }, + { + "epoch": 1.4944129391631154, + "grad_norm": 3.914440870285034, + "learning_rate": 1.583782352314589e-05, + "loss": 1.9458, + "step": 22268 + }, + { + "epoch": 1.4945471628468843, + "grad_norm": 4.043105125427246, + "learning_rate": 1.5829888395316317e-05, + "loss": 1.7776, + "step": 22270 + }, + { + "epoch": 1.4946813865306534, + "grad_norm": 4.014505386352539, + "learning_rate": 1.582195488195731e-05, + "loss": 2.0021, + "step": 22272 + }, + { + "epoch": 1.4948156102144223, + "grad_norm": 3.8699533939361572, + "learning_rate": 1.5814022983443743e-05, + "loss": 1.9207, + "step": 22274 + }, + { + "epoch": 1.4949498338981915, + "grad_norm": 4.136548042297363, + "learning_rate": 1.580609270015035e-05, + "loss": 2.2065, + "step": 22276 + }, + { + "epoch": 1.4950840575819604, + "grad_norm": 3.9566643238067627, + "learning_rate": 1.5798164032451867e-05, + "loss": 1.9974, + "step": 22278 + }, + { + "epoch": 1.4952182812657293, + "grad_norm": 4.2617316246032715, + "learning_rate": 1.579023698072285e-05, + "loss": 1.9215, + "step": 22280 + }, + { + "epoch": 1.4953525049494982, + "grad_norm": 4.621911525726318, + "learning_rate": 1.578231154533788e-05, + "loss": 1.9706, + "step": 22282 + }, + { + "epoch": 1.4954867286332674, + "grad_norm": 3.930868148803711, + "learning_rate": 1.5774387726671385e-05, + "loss": 1.9068, + "step": 22284 + }, + { + "epoch": 1.4956209523170363, + "grad_norm": 4.102669715881348, + "learning_rate": 1.5766465525097784e-05, + "loss": 1.7868, + "step": 22286 + }, + { + "epoch": 1.4957551760008054, + "grad_norm": 3.837277412414551, + "learning_rate": 1.575854494099137e-05, + "loss": 1.8372, + "step": 22288 + }, + { + "epoch": 1.4958893996845744, + "grad_norm": 4.143291473388672, + "learning_rate": 1.575062597472638e-05, + "loss": 2.3029, + "step": 22290 + }, + { + "epoch": 1.4960236233683433, + "grad_norm": 4.286818981170654, + "learning_rate": 1.5742708626676943e-05, + "loss": 1.7359, + "step": 22292 + }, + { + "epoch": 1.4961578470521124, + "grad_norm": 4.307257652282715, + "learning_rate": 1.5734792897217178e-05, + "loss": 2.0494, + "step": 22294 + }, + { + "epoch": 1.4962920707358813, + "grad_norm": 4.413841247558594, + "learning_rate": 1.5726878786721067e-05, + "loss": 1.8744, + "step": 22296 + }, + { + "epoch": 1.4964262944196505, + "grad_norm": 4.007072448730469, + "learning_rate": 1.5718966295562538e-05, + "loss": 1.9089, + "step": 22298 + }, + { + "epoch": 1.4965605181034194, + "grad_norm": 4.048382759094238, + "learning_rate": 1.5711055424115424e-05, + "loss": 1.854, + "step": 22300 + }, + { + "epoch": 1.4966947417871883, + "grad_norm": 4.29270601272583, + "learning_rate": 1.570314617275353e-05, + "loss": 1.8087, + "step": 22302 + }, + { + "epoch": 1.4968289654709572, + "grad_norm": 4.479507923126221, + "learning_rate": 1.5695238541850526e-05, + "loss": 1.967, + "step": 22304 + }, + { + "epoch": 1.4969631891547264, + "grad_norm": 3.7584922313690186, + "learning_rate": 1.568733253178007e-05, + "loss": 1.9997, + "step": 22306 + }, + { + "epoch": 1.4970974128384953, + "grad_norm": 4.026547431945801, + "learning_rate": 1.5679428142915652e-05, + "loss": 1.9484, + "step": 22308 + }, + { + "epoch": 1.4972316365222644, + "grad_norm": 4.039816379547119, + "learning_rate": 1.567152537563078e-05, + "loss": 2.1018, + "step": 22310 + }, + { + "epoch": 1.4973658602060334, + "grad_norm": 4.688394069671631, + "learning_rate": 1.566362423029881e-05, + "loss": 1.8876, + "step": 22312 + }, + { + "epoch": 1.4975000838898023, + "grad_norm": 4.067087650299072, + "learning_rate": 1.5655724707293096e-05, + "loss": 1.8099, + "step": 22314 + }, + { + "epoch": 1.4976343075735714, + "grad_norm": 3.770357847213745, + "learning_rate": 1.5647826806986853e-05, + "loss": 1.9459, + "step": 22316 + }, + { + "epoch": 1.4977685312573403, + "grad_norm": 4.319890975952148, + "learning_rate": 1.5639930529753244e-05, + "loss": 1.8243, + "step": 22318 + }, + { + "epoch": 1.4979027549411095, + "grad_norm": 4.133200168609619, + "learning_rate": 1.563203587596535e-05, + "loss": 1.7572, + "step": 22320 + }, + { + "epoch": 1.4980369786248784, + "grad_norm": 4.121951103210449, + "learning_rate": 1.5624142845996165e-05, + "loss": 1.9785, + "step": 22322 + }, + { + "epoch": 1.4981712023086473, + "grad_norm": 4.337647914886475, + "learning_rate": 1.561625144021865e-05, + "loss": 2.1116, + "step": 22324 + }, + { + "epoch": 1.4983054259924162, + "grad_norm": 3.827855348587036, + "learning_rate": 1.5608361659005637e-05, + "loss": 1.9988, + "step": 22326 + }, + { + "epoch": 1.4984396496761854, + "grad_norm": 4.066762447357178, + "learning_rate": 1.560047350272991e-05, + "loss": 2.081, + "step": 22328 + }, + { + "epoch": 1.4985738733599543, + "grad_norm": 3.8490641117095947, + "learning_rate": 1.559258697176415e-05, + "loss": 1.9779, + "step": 22330 + }, + { + "epoch": 1.4987080970437234, + "grad_norm": 3.1373965740203857, + "learning_rate": 1.5584702066481016e-05, + "loss": 1.5165, + "step": 22332 + }, + { + "epoch": 1.4988423207274923, + "grad_norm": 4.027316570281982, + "learning_rate": 1.5576818787253027e-05, + "loss": 2.0736, + "step": 22334 + }, + { + "epoch": 1.4989765444112613, + "grad_norm": 3.8551712036132812, + "learning_rate": 1.5568937134452664e-05, + "loss": 1.877, + "step": 22336 + }, + { + "epoch": 1.4991107680950304, + "grad_norm": 3.376539707183838, + "learning_rate": 1.5561057108452304e-05, + "loss": 1.9328, + "step": 22338 + }, + { + "epoch": 1.4992449917787993, + "grad_norm": 3.923219919204712, + "learning_rate": 1.5553178709624283e-05, + "loss": 1.8148, + "step": 22340 + }, + { + "epoch": 1.4993792154625685, + "grad_norm": 4.212167739868164, + "learning_rate": 1.5545301938340817e-05, + "loss": 2.0936, + "step": 22342 + }, + { + "epoch": 1.4995134391463374, + "grad_norm": 4.289358615875244, + "learning_rate": 1.553742679497412e-05, + "loss": 2.1443, + "step": 22344 + }, + { + "epoch": 1.4996476628301063, + "grad_norm": 4.332179069519043, + "learning_rate": 1.5529553279896202e-05, + "loss": 1.8418, + "step": 22346 + }, + { + "epoch": 1.4997818865138755, + "grad_norm": 4.3793439865112305, + "learning_rate": 1.5521681393479126e-05, + "loss": 2.0204, + "step": 22348 + }, + { + "epoch": 1.4999161101976444, + "grad_norm": 4.224343776702881, + "learning_rate": 1.5513811136094787e-05, + "loss": 1.9803, + "step": 22350 + }, + { + "epoch": 1.5000503338814135, + "grad_norm": 4.362243175506592, + "learning_rate": 1.5505942508115073e-05, + "loss": 1.8701, + "step": 22352 + }, + { + "epoch": 1.5001845575651824, + "grad_norm": 4.058670997619629, + "learning_rate": 1.5498075509911745e-05, + "loss": 1.8341, + "step": 22354 + }, + { + "epoch": 1.5003187812489513, + "grad_norm": 4.636506080627441, + "learning_rate": 1.5490210141856508e-05, + "loss": 2.0514, + "step": 22356 + }, + { + "epoch": 1.5004530049327203, + "grad_norm": 3.994213104248047, + "learning_rate": 1.5482346404320967e-05, + "loss": 2.0115, + "step": 22358 + }, + { + "epoch": 1.5005872286164894, + "grad_norm": 4.168828010559082, + "learning_rate": 1.5474484297676694e-05, + "loss": 1.914, + "step": 22360 + }, + { + "epoch": 1.5007214523002583, + "grad_norm": 4.373939514160156, + "learning_rate": 1.546662382229515e-05, + "loss": 1.9857, + "step": 22362 + }, + { + "epoch": 1.5008556759840275, + "grad_norm": 4.561529636383057, + "learning_rate": 1.5458764978547718e-05, + "loss": 2.1402, + "step": 22364 + }, + { + "epoch": 1.5009898996677964, + "grad_norm": 4.364515781402588, + "learning_rate": 1.545090776680571e-05, + "loss": 1.8097, + "step": 22366 + }, + { + "epoch": 1.5011241233515653, + "grad_norm": 3.9441113471984863, + "learning_rate": 1.544305218744038e-05, + "loss": 1.9675, + "step": 22368 + }, + { + "epoch": 1.5012583470353342, + "grad_norm": 8.821158409118652, + "learning_rate": 1.5435198240822873e-05, + "loss": 1.955, + "step": 22370 + }, + { + "epoch": 1.5013925707191034, + "grad_norm": 3.897141933441162, + "learning_rate": 1.5427345927324305e-05, + "loss": 1.8346, + "step": 22372 + }, + { + "epoch": 1.5015267944028725, + "grad_norm": 4.549829959869385, + "learning_rate": 1.541949524731563e-05, + "loss": 2.0898, + "step": 22374 + }, + { + "epoch": 1.5016610180866414, + "grad_norm": 4.617127895355225, + "learning_rate": 1.5411646201167817e-05, + "loss": 2.0158, + "step": 22376 + }, + { + "epoch": 1.5017952417704103, + "grad_norm": 4.444859504699707, + "learning_rate": 1.5403798789251695e-05, + "loss": 2.0221, + "step": 22378 + }, + { + "epoch": 1.5019294654541793, + "grad_norm": 3.713533878326416, + "learning_rate": 1.5395953011938063e-05, + "loss": 1.7751, + "step": 22380 + }, + { + "epoch": 1.5020636891379484, + "grad_norm": 3.917362928390503, + "learning_rate": 1.5388108869597605e-05, + "loss": 2.0139, + "step": 22382 + }, + { + "epoch": 1.5021979128217176, + "grad_norm": 3.9168341159820557, + "learning_rate": 1.5380266362600943e-05, + "loss": 1.7851, + "step": 22384 + }, + { + "epoch": 1.5023321365054865, + "grad_norm": 4.513367652893066, + "learning_rate": 1.5372425491318615e-05, + "loss": 2.2501, + "step": 22386 + }, + { + "epoch": 1.5024663601892554, + "grad_norm": 4.4053215980529785, + "learning_rate": 1.5364586256121076e-05, + "loss": 2.1015, + "step": 22388 + }, + { + "epoch": 1.5026005838730243, + "grad_norm": 3.9305665493011475, + "learning_rate": 1.535674865737875e-05, + "loss": 1.7749, + "step": 22390 + }, + { + "epoch": 1.5027348075567935, + "grad_norm": 4.071299076080322, + "learning_rate": 1.534891269546192e-05, + "loss": 2.1092, + "step": 22392 + }, + { + "epoch": 1.5028690312405624, + "grad_norm": 3.633821487426758, + "learning_rate": 1.534107837074083e-05, + "loss": 1.9411, + "step": 22394 + }, + { + "epoch": 1.5030032549243315, + "grad_norm": 3.8217408657073975, + "learning_rate": 1.5333245683585618e-05, + "loss": 1.8082, + "step": 22396 + }, + { + "epoch": 1.5031374786081004, + "grad_norm": 3.7850160598754883, + "learning_rate": 1.5325414634366392e-05, + "loss": 1.9501, + "step": 22398 + }, + { + "epoch": 1.5032717022918693, + "grad_norm": 4.361111164093018, + "learning_rate": 1.531758522345314e-05, + "loss": 1.9403, + "step": 22400 + }, + { + "epoch": 1.5034059259756383, + "grad_norm": 3.9278764724731445, + "learning_rate": 1.530975745121579e-05, + "loss": 2.0464, + "step": 22402 + }, + { + "epoch": 1.5035401496594074, + "grad_norm": 4.081361770629883, + "learning_rate": 1.5301931318024166e-05, + "loss": 2.1401, + "step": 22404 + }, + { + "epoch": 1.5036743733431766, + "grad_norm": 3.793201208114624, + "learning_rate": 1.5294106824248065e-05, + "loss": 1.8097, + "step": 22406 + }, + { + "epoch": 1.5038085970269455, + "grad_norm": 4.362032413482666, + "learning_rate": 1.5286283970257166e-05, + "loss": 1.8606, + "step": 22408 + }, + { + "epoch": 1.5039428207107144, + "grad_norm": 4.256608486175537, + "learning_rate": 1.52784627564211e-05, + "loss": 1.787, + "step": 22410 + }, + { + "epoch": 1.5040770443944833, + "grad_norm": 4.202077865600586, + "learning_rate": 1.527064318310939e-05, + "loss": 2.0996, + "step": 22412 + }, + { + "epoch": 1.5042112680782525, + "grad_norm": 4.914397239685059, + "learning_rate": 1.5262825250691497e-05, + "loss": 1.8251, + "step": 22414 + }, + { + "epoch": 1.5043454917620214, + "grad_norm": 4.424179553985596, + "learning_rate": 1.5255008959536787e-05, + "loss": 1.9616, + "step": 22416 + }, + { + "epoch": 1.5044797154457905, + "grad_norm": 3.8267056941986084, + "learning_rate": 1.5247194310014601e-05, + "loss": 1.8005, + "step": 22418 + }, + { + "epoch": 1.5046139391295594, + "grad_norm": 4.1784539222717285, + "learning_rate": 1.5239381302494144e-05, + "loss": 1.8141, + "step": 22420 + }, + { + "epoch": 1.5047481628133283, + "grad_norm": 3.9645345211029053, + "learning_rate": 1.5231569937344564e-05, + "loss": 2.3439, + "step": 22422 + }, + { + "epoch": 1.5048823864970973, + "grad_norm": 4.447378635406494, + "learning_rate": 1.5223760214934917e-05, + "loss": 2.1864, + "step": 22424 + }, + { + "epoch": 1.5050166101808664, + "grad_norm": 3.762317657470703, + "learning_rate": 1.5215952135634237e-05, + "loss": 1.7696, + "step": 22426 + }, + { + "epoch": 1.5051508338646356, + "grad_norm": 4.138880252838135, + "learning_rate": 1.5208145699811415e-05, + "loss": 1.974, + "step": 22428 + }, + { + "epoch": 1.5052850575484045, + "grad_norm": 3.625844717025757, + "learning_rate": 1.5200340907835298e-05, + "loss": 1.8821, + "step": 22430 + }, + { + "epoch": 1.5054192812321734, + "grad_norm": 4.575976371765137, + "learning_rate": 1.519253776007462e-05, + "loss": 1.8669, + "step": 22432 + }, + { + "epoch": 1.5055535049159423, + "grad_norm": 3.79170560836792, + "learning_rate": 1.5184736256898107e-05, + "loss": 1.8987, + "step": 22434 + }, + { + "epoch": 1.5056877285997115, + "grad_norm": 4.469919681549072, + "learning_rate": 1.517693639867433e-05, + "loss": 2.0104, + "step": 22436 + }, + { + "epoch": 1.5058219522834804, + "grad_norm": 4.347492218017578, + "learning_rate": 1.5169138185771841e-05, + "loss": 1.8568, + "step": 22438 + }, + { + "epoch": 1.5059561759672495, + "grad_norm": 3.7464959621429443, + "learning_rate": 1.5161341618559088e-05, + "loss": 1.8908, + "step": 22440 + }, + { + "epoch": 1.5060903996510184, + "grad_norm": 4.839260578155518, + "learning_rate": 1.515354669740443e-05, + "loss": 1.877, + "step": 22442 + }, + { + "epoch": 1.5062246233347873, + "grad_norm": 3.8614466190338135, + "learning_rate": 1.5145753422676157e-05, + "loss": 1.7252, + "step": 22444 + }, + { + "epoch": 1.5063588470185563, + "grad_norm": 4.240077018737793, + "learning_rate": 1.5137961794742511e-05, + "loss": 1.8192, + "step": 22446 + }, + { + "epoch": 1.5064930707023254, + "grad_norm": 4.573522567749023, + "learning_rate": 1.5130171813971617e-05, + "loss": 1.9951, + "step": 22448 + }, + { + "epoch": 1.5066272943860946, + "grad_norm": 3.8363585472106934, + "learning_rate": 1.5122383480731533e-05, + "loss": 1.8255, + "step": 22450 + }, + { + "epoch": 1.5067615180698635, + "grad_norm": 4.053619384765625, + "learning_rate": 1.5114596795390234e-05, + "loss": 1.8869, + "step": 22452 + }, + { + "epoch": 1.5068957417536324, + "grad_norm": 4.100177764892578, + "learning_rate": 1.5106811758315654e-05, + "loss": 2.0787, + "step": 22454 + }, + { + "epoch": 1.5070299654374013, + "grad_norm": 4.469886302947998, + "learning_rate": 1.50990283698756e-05, + "loss": 2.2557, + "step": 22456 + }, + { + "epoch": 1.5071641891211705, + "grad_norm": 4.776904106140137, + "learning_rate": 1.5091246630437827e-05, + "loss": 2.1913, + "step": 22458 + }, + { + "epoch": 1.5072984128049396, + "grad_norm": 4.225692272186279, + "learning_rate": 1.5083466540370006e-05, + "loss": 1.9577, + "step": 22460 + }, + { + "epoch": 1.5074326364887085, + "grad_norm": 4.318523406982422, + "learning_rate": 1.5075688100039715e-05, + "loss": 2.0147, + "step": 22462 + }, + { + "epoch": 1.5075668601724774, + "grad_norm": 4.240555286407471, + "learning_rate": 1.5067911309814503e-05, + "loss": 1.7622, + "step": 22464 + }, + { + "epoch": 1.5077010838562463, + "grad_norm": 7.62746000289917, + "learning_rate": 1.5060136170061773e-05, + "loss": 1.8207, + "step": 22466 + }, + { + "epoch": 1.5078353075400155, + "grad_norm": 3.3622565269470215, + "learning_rate": 1.5052362681148935e-05, + "loss": 1.893, + "step": 22468 + }, + { + "epoch": 1.5079695312237844, + "grad_norm": 4.257326602935791, + "learning_rate": 1.5044590843443207e-05, + "loss": 1.8211, + "step": 22470 + }, + { + "epoch": 1.5081037549075536, + "grad_norm": 3.783116102218628, + "learning_rate": 1.5036820657311839e-05, + "loss": 1.8551, + "step": 22472 + }, + { + "epoch": 1.5082379785913225, + "grad_norm": 3.94856858253479, + "learning_rate": 1.5029052123121928e-05, + "loss": 2.0929, + "step": 22474 + }, + { + "epoch": 1.5083722022750914, + "grad_norm": 3.9490604400634766, + "learning_rate": 1.5021285241240552e-05, + "loss": 1.7856, + "step": 22476 + }, + { + "epoch": 1.5085064259588603, + "grad_norm": 3.5352063179016113, + "learning_rate": 1.5013520012034665e-05, + "loss": 2.0277, + "step": 22478 + }, + { + "epoch": 1.5086406496426294, + "grad_norm": 4.034780502319336, + "learning_rate": 1.5005756435871165e-05, + "loss": 1.93, + "step": 22480 + }, + { + "epoch": 1.5087748733263986, + "grad_norm": 4.455792427062988, + "learning_rate": 1.4997994513116836e-05, + "loss": 2.4562, + "step": 22482 + }, + { + "epoch": 1.5089090970101675, + "grad_norm": 3.9812886714935303, + "learning_rate": 1.4990234244138457e-05, + "loss": 2.04, + "step": 22484 + }, + { + "epoch": 1.5090433206939364, + "grad_norm": 3.4683444499969482, + "learning_rate": 1.4982475629302667e-05, + "loss": 1.7803, + "step": 22486 + }, + { + "epoch": 1.5091775443777053, + "grad_norm": 3.63779616355896, + "learning_rate": 1.4974718668976046e-05, + "loss": 1.8639, + "step": 22488 + }, + { + "epoch": 1.5093117680614745, + "grad_norm": 3.9409401416778564, + "learning_rate": 1.4966963363525077e-05, + "loss": 1.6756, + "step": 22490 + }, + { + "epoch": 1.5094459917452434, + "grad_norm": 3.729785442352295, + "learning_rate": 1.4959209713316213e-05, + "loss": 1.7249, + "step": 22492 + }, + { + "epoch": 1.5095802154290126, + "grad_norm": 3.902287006378174, + "learning_rate": 1.4951457718715772e-05, + "loss": 1.8941, + "step": 22494 + }, + { + "epoch": 1.5097144391127815, + "grad_norm": 4.1375837326049805, + "learning_rate": 1.4943707380090067e-05, + "loss": 2.1637, + "step": 22496 + }, + { + "epoch": 1.5098486627965504, + "grad_norm": 4.1883111000061035, + "learning_rate": 1.493595869780522e-05, + "loss": 2.1995, + "step": 22498 + }, + { + "epoch": 1.5099828864803193, + "grad_norm": 3.805842161178589, + "learning_rate": 1.4928211672227387e-05, + "loss": 1.7559, + "step": 22500 + }, + { + "epoch": 1.5101171101640884, + "grad_norm": 5.107202529907227, + "learning_rate": 1.4920466303722568e-05, + "loss": 1.9037, + "step": 22502 + }, + { + "epoch": 1.5102513338478576, + "grad_norm": 4.203936576843262, + "learning_rate": 1.4912722592656758e-05, + "loss": 2.0202, + "step": 22504 + }, + { + "epoch": 1.5103855575316265, + "grad_norm": 4.508406162261963, + "learning_rate": 1.4904980539395808e-05, + "loss": 1.9749, + "step": 22506 + }, + { + "epoch": 1.5105197812153954, + "grad_norm": 4.394796848297119, + "learning_rate": 1.4897240144305513e-05, + "loss": 1.8981, + "step": 22508 + }, + { + "epoch": 1.5106540048991643, + "grad_norm": 4.307918071746826, + "learning_rate": 1.4889501407751583e-05, + "loss": 2.1532, + "step": 22510 + }, + { + "epoch": 1.5107882285829335, + "grad_norm": 3.9029955863952637, + "learning_rate": 1.4881764330099685e-05, + "loss": 2.0854, + "step": 22512 + }, + { + "epoch": 1.5109224522667024, + "grad_norm": 3.5215930938720703, + "learning_rate": 1.487402891171536e-05, + "loss": 1.6414, + "step": 22514 + }, + { + "epoch": 1.5110566759504716, + "grad_norm": 4.076118469238281, + "learning_rate": 1.4866295152964104e-05, + "loss": 1.9588, + "step": 22516 + }, + { + "epoch": 1.5111908996342405, + "grad_norm": 3.919400453567505, + "learning_rate": 1.4858563054211294e-05, + "loss": 1.9681, + "step": 22518 + }, + { + "epoch": 1.5113251233180094, + "grad_norm": 4.066512584686279, + "learning_rate": 1.4850832615822297e-05, + "loss": 1.8824, + "step": 22520 + }, + { + "epoch": 1.5114593470017783, + "grad_norm": 4.255802154541016, + "learning_rate": 1.484310383816232e-05, + "loss": 2.0073, + "step": 22522 + }, + { + "epoch": 1.5115935706855474, + "grad_norm": 4.158657073974609, + "learning_rate": 1.4835376721596588e-05, + "loss": 1.9788, + "step": 22524 + }, + { + "epoch": 1.5117277943693166, + "grad_norm": 4.508986949920654, + "learning_rate": 1.482765126649014e-05, + "loss": 1.9981, + "step": 22526 + }, + { + "epoch": 1.5118620180530855, + "grad_norm": 4.37610387802124, + "learning_rate": 1.481992747320799e-05, + "loss": 1.9843, + "step": 22528 + }, + { + "epoch": 1.5119962417368544, + "grad_norm": 3.6776769161224365, + "learning_rate": 1.4812205342115104e-05, + "loss": 1.9069, + "step": 22530 + }, + { + "epoch": 1.5121304654206233, + "grad_norm": 4.303517818450928, + "learning_rate": 1.4804484873576314e-05, + "loss": 2.0146, + "step": 22532 + }, + { + "epoch": 1.5122646891043925, + "grad_norm": 4.536104679107666, + "learning_rate": 1.4796766067956424e-05, + "loss": 2.1484, + "step": 22534 + }, + { + "epoch": 1.5123989127881616, + "grad_norm": 5.265168190002441, + "learning_rate": 1.4789048925620091e-05, + "loss": 2.0193, + "step": 22536 + }, + { + "epoch": 1.5125331364719306, + "grad_norm": 3.069338321685791, + "learning_rate": 1.4781333446931966e-05, + "loss": 1.5622, + "step": 22538 + }, + { + "epoch": 1.5126673601556995, + "grad_norm": 3.8567023277282715, + "learning_rate": 1.4773619632256568e-05, + "loss": 1.8096, + "step": 22540 + }, + { + "epoch": 1.5128015838394684, + "grad_norm": 3.435603141784668, + "learning_rate": 1.476590748195839e-05, + "loss": 1.6946, + "step": 22542 + }, + { + "epoch": 1.5129358075232375, + "grad_norm": 3.903139114379883, + "learning_rate": 1.4758196996401796e-05, + "loss": 1.6735, + "step": 22544 + }, + { + "epoch": 1.5130700312070064, + "grad_norm": 4.557210922241211, + "learning_rate": 1.4750488175951099e-05, + "loss": 1.9796, + "step": 22546 + }, + { + "epoch": 1.5132042548907756, + "grad_norm": 5.621368408203125, + "learning_rate": 1.4742781020970498e-05, + "loss": 1.9143, + "step": 22548 + }, + { + "epoch": 1.5133384785745445, + "grad_norm": 3.936818838119507, + "learning_rate": 1.4735075531824183e-05, + "loss": 1.955, + "step": 22550 + }, + { + "epoch": 1.5134727022583134, + "grad_norm": 4.337475776672363, + "learning_rate": 1.4727371708876203e-05, + "loss": 2.0544, + "step": 22552 + }, + { + "epoch": 1.5136069259420823, + "grad_norm": 4.040201187133789, + "learning_rate": 1.4719669552490551e-05, + "loss": 1.8406, + "step": 22554 + }, + { + "epoch": 1.5137411496258515, + "grad_norm": 4.7854509353637695, + "learning_rate": 1.4711969063031117e-05, + "loss": 2.2901, + "step": 22556 + }, + { + "epoch": 1.5138753733096206, + "grad_norm": 3.7311758995056152, + "learning_rate": 1.4704270240861773e-05, + "loss": 1.8557, + "step": 22558 + }, + { + "epoch": 1.5140095969933896, + "grad_norm": 3.9495370388031006, + "learning_rate": 1.4696573086346237e-05, + "loss": 1.8661, + "step": 22560 + }, + { + "epoch": 1.5141438206771585, + "grad_norm": 3.624539375305176, + "learning_rate": 1.4688877599848216e-05, + "loss": 1.7072, + "step": 22562 + }, + { + "epoch": 1.5142780443609274, + "grad_norm": 4.492711544036865, + "learning_rate": 1.4681183781731294e-05, + "loss": 1.7957, + "step": 22564 + }, + { + "epoch": 1.5144122680446965, + "grad_norm": 4.579678535461426, + "learning_rate": 1.4673491632358987e-05, + "loss": 1.9328, + "step": 22566 + }, + { + "epoch": 1.5145464917284654, + "grad_norm": 4.0043253898620605, + "learning_rate": 1.4665801152094715e-05, + "loss": 1.8492, + "step": 22568 + }, + { + "epoch": 1.5146807154122346, + "grad_norm": 4.0609130859375, + "learning_rate": 1.4658112341301878e-05, + "loss": 1.7621, + "step": 22570 + }, + { + "epoch": 1.5148149390960035, + "grad_norm": 4.56958532333374, + "learning_rate": 1.4650425200343732e-05, + "loss": 1.8887, + "step": 22572 + }, + { + "epoch": 1.5149491627797724, + "grad_norm": 4.421917915344238, + "learning_rate": 1.4642739729583482e-05, + "loss": 2.1176, + "step": 22574 + }, + { + "epoch": 1.5150833864635413, + "grad_norm": 3.9134929180145264, + "learning_rate": 1.463505592938424e-05, + "loss": 1.8162, + "step": 22576 + }, + { + "epoch": 1.5152176101473105, + "grad_norm": 3.882678508758545, + "learning_rate": 1.4627373800109078e-05, + "loss": 1.7515, + "step": 22578 + }, + { + "epoch": 1.5153518338310796, + "grad_norm": 3.6148805618286133, + "learning_rate": 1.4619693342120943e-05, + "loss": 1.9386, + "step": 22580 + }, + { + "epoch": 1.5154860575148486, + "grad_norm": 4.289422512054443, + "learning_rate": 1.4612014555782733e-05, + "loss": 1.9332, + "step": 22582 + }, + { + "epoch": 1.5156202811986175, + "grad_norm": 3.742199659347534, + "learning_rate": 1.4604337441457227e-05, + "loss": 1.8331, + "step": 22584 + }, + { + "epoch": 1.5157545048823864, + "grad_norm": 3.9132936000823975, + "learning_rate": 1.459666199950719e-05, + "loss": 1.9489, + "step": 22586 + }, + { + "epoch": 1.5158887285661555, + "grad_norm": 4.181424617767334, + "learning_rate": 1.4588988230295242e-05, + "loss": 1.9495, + "step": 22588 + }, + { + "epoch": 1.5160229522499244, + "grad_norm": 5.01571798324585, + "learning_rate": 1.458131613418398e-05, + "loss": 1.883, + "step": 22590 + }, + { + "epoch": 1.5161571759336936, + "grad_norm": 4.756555557250977, + "learning_rate": 1.4573645711535899e-05, + "loss": 1.9161, + "step": 22592 + }, + { + "epoch": 1.5162913996174625, + "grad_norm": 4.102141380310059, + "learning_rate": 1.4565976962713369e-05, + "loss": 2.0731, + "step": 22594 + }, + { + "epoch": 1.5164256233012314, + "grad_norm": 3.820876121520996, + "learning_rate": 1.4558309888078758e-05, + "loss": 1.6646, + "step": 22596 + }, + { + "epoch": 1.5165598469850003, + "grad_norm": 4.168964862823486, + "learning_rate": 1.4550644487994303e-05, + "loss": 1.8877, + "step": 22598 + }, + { + "epoch": 1.5166940706687695, + "grad_norm": 3.49475359916687, + "learning_rate": 1.45429807628222e-05, + "loss": 1.7252, + "step": 22600 + }, + { + "epoch": 1.5168282943525386, + "grad_norm": 4.399728775024414, + "learning_rate": 1.4535318712924529e-05, + "loss": 1.9125, + "step": 22602 + }, + { + "epoch": 1.5169625180363076, + "grad_norm": 3.769663095474243, + "learning_rate": 1.4527658338663313e-05, + "loss": 1.8147, + "step": 22604 + }, + { + "epoch": 1.5170967417200765, + "grad_norm": 4.464654445648193, + "learning_rate": 1.4519999640400478e-05, + "loss": 2.1059, + "step": 22606 + }, + { + "epoch": 1.5172309654038454, + "grad_norm": 4.236873626708984, + "learning_rate": 1.4512342618497898e-05, + "loss": 2.0759, + "step": 22608 + }, + { + "epoch": 1.5173651890876145, + "grad_norm": 4.454929351806641, + "learning_rate": 1.4504687273317352e-05, + "loss": 1.8238, + "step": 22610 + }, + { + "epoch": 1.5174994127713837, + "grad_norm": 4.279935359954834, + "learning_rate": 1.4497033605220529e-05, + "loss": 1.9518, + "step": 22612 + }, + { + "epoch": 1.5176336364551526, + "grad_norm": 3.7140767574310303, + "learning_rate": 1.4489381614569043e-05, + "loss": 1.7127, + "step": 22614 + }, + { + "epoch": 1.5177678601389215, + "grad_norm": 4.119643688201904, + "learning_rate": 1.4481731301724461e-05, + "loss": 1.9316, + "step": 22616 + }, + { + "epoch": 1.5179020838226904, + "grad_norm": 4.016823768615723, + "learning_rate": 1.4474082667048217e-05, + "loss": 1.8499, + "step": 22618 + }, + { + "epoch": 1.5180363075064596, + "grad_norm": 4.472353935241699, + "learning_rate": 1.4466435710901737e-05, + "loss": 2.1544, + "step": 22620 + }, + { + "epoch": 1.5181705311902285, + "grad_norm": 4.059164047241211, + "learning_rate": 1.4458790433646263e-05, + "loss": 2.003, + "step": 22622 + }, + { + "epoch": 1.5183047548739976, + "grad_norm": 4.804417133331299, + "learning_rate": 1.445114683564307e-05, + "loss": 2.2385, + "step": 22624 + }, + { + "epoch": 1.5184389785577665, + "grad_norm": 4.0181145668029785, + "learning_rate": 1.4443504917253275e-05, + "loss": 1.6752, + "step": 22626 + }, + { + "epoch": 1.5185732022415355, + "grad_norm": 4.224808216094971, + "learning_rate": 1.4435864678837962e-05, + "loss": 1.9834, + "step": 22628 + }, + { + "epoch": 1.5187074259253044, + "grad_norm": 4.786375999450684, + "learning_rate": 1.4428226120758114e-05, + "loss": 2.0251, + "step": 22630 + }, + { + "epoch": 1.5188416496090735, + "grad_norm": 3.9417104721069336, + "learning_rate": 1.4420589243374626e-05, + "loss": 1.9183, + "step": 22632 + }, + { + "epoch": 1.5189758732928427, + "grad_norm": 3.796201705932617, + "learning_rate": 1.4412954047048326e-05, + "loss": 1.7603, + "step": 22634 + }, + { + "epoch": 1.5191100969766116, + "grad_norm": 4.0426201820373535, + "learning_rate": 1.4405320532139976e-05, + "loss": 1.9761, + "step": 22636 + }, + { + "epoch": 1.5192443206603805, + "grad_norm": 3.7556238174438477, + "learning_rate": 1.4397688699010241e-05, + "loss": 1.8568, + "step": 22638 + }, + { + "epoch": 1.5193785443441494, + "grad_norm": 4.3798651695251465, + "learning_rate": 1.4390058548019708e-05, + "loss": 2.0562, + "step": 22640 + }, + { + "epoch": 1.5195127680279186, + "grad_norm": 3.9465527534484863, + "learning_rate": 1.4382430079528864e-05, + "loss": 1.6892, + "step": 22642 + }, + { + "epoch": 1.5196469917116875, + "grad_norm": 3.9186599254608154, + "learning_rate": 1.4374803293898181e-05, + "loss": 1.7797, + "step": 22644 + }, + { + "epoch": 1.5197812153954566, + "grad_norm": 4.623231410980225, + "learning_rate": 1.4367178191487968e-05, + "loss": 2.0366, + "step": 22646 + }, + { + "epoch": 1.5199154390792255, + "grad_norm": 3.9128165245056152, + "learning_rate": 1.4359554772658552e-05, + "loss": 1.9547, + "step": 22648 + }, + { + "epoch": 1.5200496627629945, + "grad_norm": 4.1431756019592285, + "learning_rate": 1.4351933037770049e-05, + "loss": 2.14, + "step": 22650 + }, + { + "epoch": 1.5201838864467634, + "grad_norm": 4.278277397155762, + "learning_rate": 1.4344312987182635e-05, + "loss": 2.0699, + "step": 22652 + }, + { + "epoch": 1.5203181101305325, + "grad_norm": 4.214488983154297, + "learning_rate": 1.4336694621256303e-05, + "loss": 2.0227, + "step": 22654 + }, + { + "epoch": 1.5204523338143017, + "grad_norm": 4.2972002029418945, + "learning_rate": 1.4329077940351031e-05, + "loss": 1.9729, + "step": 22656 + }, + { + "epoch": 1.5205865574980706, + "grad_norm": 4.065945148468018, + "learning_rate": 1.4321462944826685e-05, + "loss": 1.8274, + "step": 22658 + }, + { + "epoch": 1.5207207811818395, + "grad_norm": 3.7837913036346436, + "learning_rate": 1.4313849635043058e-05, + "loss": 1.8702, + "step": 22660 + }, + { + "epoch": 1.5208550048656084, + "grad_norm": 3.7116920948028564, + "learning_rate": 1.4306238011359863e-05, + "loss": 1.7732, + "step": 22662 + }, + { + "epoch": 1.5209892285493776, + "grad_norm": 3.8161063194274902, + "learning_rate": 1.4298628074136717e-05, + "loss": 1.7689, + "step": 22664 + }, + { + "epoch": 1.5211234522331465, + "grad_norm": 3.770059108734131, + "learning_rate": 1.4291019823733203e-05, + "loss": 1.8956, + "step": 22666 + }, + { + "epoch": 1.5212576759169156, + "grad_norm": 4.53851842880249, + "learning_rate": 1.4283413260508788e-05, + "loss": 2.1637, + "step": 22668 + }, + { + "epoch": 1.5213918996006845, + "grad_norm": 4.028188705444336, + "learning_rate": 1.4275808384822858e-05, + "loss": 1.8808, + "step": 22670 + }, + { + "epoch": 1.5215261232844535, + "grad_norm": 4.60374641418457, + "learning_rate": 1.4268205197034717e-05, + "loss": 1.9965, + "step": 22672 + }, + { + "epoch": 1.5216603469682224, + "grad_norm": 4.205395698547363, + "learning_rate": 1.4260603697503638e-05, + "loss": 1.9518, + "step": 22674 + }, + { + "epoch": 1.5217945706519915, + "grad_norm": 4.170927047729492, + "learning_rate": 1.4253003886588751e-05, + "loss": 1.9075, + "step": 22676 + }, + { + "epoch": 1.5219287943357607, + "grad_norm": 3.5510151386260986, + "learning_rate": 1.4245405764649138e-05, + "loss": 1.8791, + "step": 22678 + }, + { + "epoch": 1.5220630180195296, + "grad_norm": 3.913048505783081, + "learning_rate": 1.4237809332043784e-05, + "loss": 1.8611, + "step": 22680 + }, + { + "epoch": 1.5221972417032985, + "grad_norm": 3.77065372467041, + "learning_rate": 1.4230214589131625e-05, + "loss": 1.8798, + "step": 22682 + }, + { + "epoch": 1.5223314653870674, + "grad_norm": 4.000100135803223, + "learning_rate": 1.4222621536271474e-05, + "loss": 1.7886, + "step": 22684 + }, + { + "epoch": 1.5224656890708366, + "grad_norm": 3.7661051750183105, + "learning_rate": 1.4215030173822136e-05, + "loss": 1.7966, + "step": 22686 + }, + { + "epoch": 1.5225999127546057, + "grad_norm": 4.241380214691162, + "learning_rate": 1.4207440502142222e-05, + "loss": 1.8235, + "step": 22688 + }, + { + "epoch": 1.5227341364383746, + "grad_norm": 4.05436372756958, + "learning_rate": 1.4199852521590373e-05, + "loss": 1.978, + "step": 22690 + }, + { + "epoch": 1.5228683601221435, + "grad_norm": 3.8760311603546143, + "learning_rate": 1.419226623252508e-05, + "loss": 2.0297, + "step": 22692 + }, + { + "epoch": 1.5230025838059125, + "grad_norm": 3.9395837783813477, + "learning_rate": 1.4184681635304814e-05, + "loss": 1.924, + "step": 22694 + }, + { + "epoch": 1.5231368074896814, + "grad_norm": 3.8777976036071777, + "learning_rate": 1.4177098730287914e-05, + "loss": 1.7906, + "step": 22696 + }, + { + "epoch": 1.5232710311734505, + "grad_norm": 4.40635347366333, + "learning_rate": 1.4169517517832654e-05, + "loss": 1.8175, + "step": 22698 + }, + { + "epoch": 1.5234052548572197, + "grad_norm": 4.6007280349731445, + "learning_rate": 1.4161937998297214e-05, + "loss": 2.3146, + "step": 22700 + }, + { + "epoch": 1.5235394785409886, + "grad_norm": 3.9736688137054443, + "learning_rate": 1.4154360172039749e-05, + "loss": 2.0544, + "step": 22702 + }, + { + "epoch": 1.5236737022247575, + "grad_norm": 3.7181661128997803, + "learning_rate": 1.4146784039418282e-05, + "loss": 1.8922, + "step": 22704 + }, + { + "epoch": 1.5238079259085264, + "grad_norm": 4.274391174316406, + "learning_rate": 1.4139209600790766e-05, + "loss": 1.7398, + "step": 22706 + }, + { + "epoch": 1.5239421495922956, + "grad_norm": 4.33990478515625, + "learning_rate": 1.4131636856515056e-05, + "loss": 1.7881, + "step": 22708 + }, + { + "epoch": 1.5240763732760647, + "grad_norm": 4.128025054931641, + "learning_rate": 1.4124065806948994e-05, + "loss": 1.9823, + "step": 22710 + }, + { + "epoch": 1.5242105969598336, + "grad_norm": 4.705966472625732, + "learning_rate": 1.4116496452450262e-05, + "loss": 1.6238, + "step": 22712 + }, + { + "epoch": 1.5243448206436025, + "grad_norm": 3.981635332107544, + "learning_rate": 1.4108928793376542e-05, + "loss": 1.7551, + "step": 22714 + }, + { + "epoch": 1.5244790443273715, + "grad_norm": 4.193896770477295, + "learning_rate": 1.4101362830085324e-05, + "loss": 1.8736, + "step": 22716 + }, + { + "epoch": 1.5246132680111406, + "grad_norm": 3.910710334777832, + "learning_rate": 1.4093798562934141e-05, + "loss": 1.9365, + "step": 22718 + }, + { + "epoch": 1.5247474916949095, + "grad_norm": 5.005021572113037, + "learning_rate": 1.408623599228035e-05, + "loss": 1.8071, + "step": 22720 + }, + { + "epoch": 1.5248817153786787, + "grad_norm": 4.806228160858154, + "learning_rate": 1.4078675118481305e-05, + "loss": 2.0991, + "step": 22722 + }, + { + "epoch": 1.5250159390624476, + "grad_norm": 4.139250755310059, + "learning_rate": 1.407111594189422e-05, + "loss": 1.8318, + "step": 22724 + }, + { + "epoch": 1.5251501627462165, + "grad_norm": 4.305028438568115, + "learning_rate": 1.4063558462876259e-05, + "loss": 2.0011, + "step": 22726 + }, + { + "epoch": 1.5252843864299854, + "grad_norm": 3.8654534816741943, + "learning_rate": 1.4056002681784496e-05, + "loss": 1.8061, + "step": 22728 + }, + { + "epoch": 1.5254186101137546, + "grad_norm": 3.7353029251098633, + "learning_rate": 1.4048448598975905e-05, + "loss": 1.8458, + "step": 22730 + }, + { + "epoch": 1.5255528337975237, + "grad_norm": 4.590461254119873, + "learning_rate": 1.4040896214807442e-05, + "loss": 1.8814, + "step": 22732 + }, + { + "epoch": 1.5256870574812926, + "grad_norm": 4.057047367095947, + "learning_rate": 1.4033345529635922e-05, + "loss": 1.7403, + "step": 22734 + }, + { + "epoch": 1.5258212811650615, + "grad_norm": 4.187168598175049, + "learning_rate": 1.4025796543818098e-05, + "loss": 2.0404, + "step": 22736 + }, + { + "epoch": 1.5259555048488305, + "grad_norm": 4.151226043701172, + "learning_rate": 1.4018249257710636e-05, + "loss": 1.7062, + "step": 22738 + }, + { + "epoch": 1.5260897285325996, + "grad_norm": 4.534520626068115, + "learning_rate": 1.4010703671670155e-05, + "loss": 1.8676, + "step": 22740 + }, + { + "epoch": 1.5262239522163685, + "grad_norm": 3.7360997200012207, + "learning_rate": 1.4003159786053139e-05, + "loss": 1.6625, + "step": 22742 + }, + { + "epoch": 1.5263581759001377, + "grad_norm": 4.110637664794922, + "learning_rate": 1.3995617601216066e-05, + "loss": 1.945, + "step": 22744 + }, + { + "epoch": 1.5264923995839066, + "grad_norm": 3.953648567199707, + "learning_rate": 1.3988077117515236e-05, + "loss": 1.8004, + "step": 22746 + }, + { + "epoch": 1.5266266232676755, + "grad_norm": 3.444709539413452, + "learning_rate": 1.3980538335306964e-05, + "loss": 2.0257, + "step": 22748 + }, + { + "epoch": 1.5267608469514444, + "grad_norm": 4.443262577056885, + "learning_rate": 1.3973001254947404e-05, + "loss": 1.9723, + "step": 22750 + }, + { + "epoch": 1.5268950706352136, + "grad_norm": 4.173681735992432, + "learning_rate": 1.3965465876792716e-05, + "loss": 2.0267, + "step": 22752 + }, + { + "epoch": 1.5270292943189827, + "grad_norm": 3.885655164718628, + "learning_rate": 1.3957932201198897e-05, + "loss": 1.8777, + "step": 22754 + }, + { + "epoch": 1.5271635180027516, + "grad_norm": 3.9431562423706055, + "learning_rate": 1.3950400228521914e-05, + "loss": 1.7606, + "step": 22756 + }, + { + "epoch": 1.5272977416865205, + "grad_norm": 3.78403639793396, + "learning_rate": 1.3942869959117616e-05, + "loss": 1.9084, + "step": 22758 + }, + { + "epoch": 1.5274319653702895, + "grad_norm": 4.4929518699646, + "learning_rate": 1.3935341393341822e-05, + "loss": 2.2653, + "step": 22760 + }, + { + "epoch": 1.5275661890540586, + "grad_norm": 4.50465726852417, + "learning_rate": 1.3927814531550226e-05, + "loss": 1.929, + "step": 22762 + }, + { + "epoch": 1.5277004127378278, + "grad_norm": 4.371631622314453, + "learning_rate": 1.3920289374098467e-05, + "loss": 1.757, + "step": 22764 + }, + { + "epoch": 1.5278346364215967, + "grad_norm": 3.811516523361206, + "learning_rate": 1.391276592134207e-05, + "loss": 1.7709, + "step": 22766 + }, + { + "epoch": 1.5279688601053656, + "grad_norm": 3.8502869606018066, + "learning_rate": 1.3905244173636534e-05, + "loss": 1.9707, + "step": 22768 + }, + { + "epoch": 1.5281030837891345, + "grad_norm": 4.041707992553711, + "learning_rate": 1.389772413133722e-05, + "loss": 2.0041, + "step": 22770 + }, + { + "epoch": 1.5282373074729034, + "grad_norm": 4.704575538635254, + "learning_rate": 1.3890205794799476e-05, + "loss": 2.0865, + "step": 22772 + }, + { + "epoch": 1.5283715311566726, + "grad_norm": 4.1070146560668945, + "learning_rate": 1.3882689164378477e-05, + "loss": 1.9177, + "step": 22774 + }, + { + "epoch": 1.5285057548404417, + "grad_norm": 4.36108922958374, + "learning_rate": 1.3875174240429405e-05, + "loss": 1.9586, + "step": 22776 + }, + { + "epoch": 1.5286399785242106, + "grad_norm": 4.022629737854004, + "learning_rate": 1.3867661023307298e-05, + "loss": 1.8958, + "step": 22778 + }, + { + "epoch": 1.5287742022079795, + "grad_norm": 4.23771333694458, + "learning_rate": 1.3860149513367171e-05, + "loss": 2.054, + "step": 22780 + }, + { + "epoch": 1.5289084258917485, + "grad_norm": 3.958345413208008, + "learning_rate": 1.3852639710963916e-05, + "loss": 1.8604, + "step": 22782 + }, + { + "epoch": 1.5290426495755176, + "grad_norm": 4.582435607910156, + "learning_rate": 1.384513161645235e-05, + "loss": 2.2609, + "step": 22784 + }, + { + "epoch": 1.5291768732592868, + "grad_norm": 4.4339118003845215, + "learning_rate": 1.3837625230187212e-05, + "loss": 1.8881, + "step": 22786 + }, + { + "epoch": 1.5293110969430557, + "grad_norm": 4.075619697570801, + "learning_rate": 1.383012055252318e-05, + "loss": 1.8826, + "step": 22788 + }, + { + "epoch": 1.5294453206268246, + "grad_norm": 3.6286306381225586, + "learning_rate": 1.3822617583814828e-05, + "loss": 1.8354, + "step": 22790 + }, + { + "epoch": 1.5295795443105935, + "grad_norm": 3.522712230682373, + "learning_rate": 1.3815116324416654e-05, + "loss": 1.7469, + "step": 22792 + }, + { + "epoch": 1.5297137679943626, + "grad_norm": 6.073153018951416, + "learning_rate": 1.380761677468307e-05, + "loss": 1.847, + "step": 22794 + }, + { + "epoch": 1.5298479916781316, + "grad_norm": 3.3980872631073, + "learning_rate": 1.3800118934968436e-05, + "loss": 1.6912, + "step": 22796 + }, + { + "epoch": 1.5299822153619007, + "grad_norm": 4.492209434509277, + "learning_rate": 1.3792622805626998e-05, + "loss": 1.7895, + "step": 22798 + }, + { + "epoch": 1.5301164390456696, + "grad_norm": 4.6108717918396, + "learning_rate": 1.378512838701293e-05, + "loss": 2.0039, + "step": 22800 + }, + { + "epoch": 1.5302506627294385, + "grad_norm": 4.418165683746338, + "learning_rate": 1.377763567948034e-05, + "loss": 2.0052, + "step": 22802 + }, + { + "epoch": 1.5303848864132075, + "grad_norm": 4.180668830871582, + "learning_rate": 1.3770144683383213e-05, + "loss": 2.0268, + "step": 22804 + }, + { + "epoch": 1.5305191100969766, + "grad_norm": 3.7501707077026367, + "learning_rate": 1.3762655399075524e-05, + "loss": 1.7997, + "step": 22806 + }, + { + "epoch": 1.5306533337807458, + "grad_norm": 4.415273666381836, + "learning_rate": 1.3755167826911097e-05, + "loss": 2.0001, + "step": 22808 + }, + { + "epoch": 1.5307875574645147, + "grad_norm": 4.167758464813232, + "learning_rate": 1.3747681967243748e-05, + "loss": 1.954, + "step": 22810 + }, + { + "epoch": 1.5309217811482836, + "grad_norm": 4.791915416717529, + "learning_rate": 1.3740197820427108e-05, + "loss": 1.8857, + "step": 22812 + }, + { + "epoch": 1.5310560048320525, + "grad_norm": 4.100696563720703, + "learning_rate": 1.3732715386814832e-05, + "loss": 1.8411, + "step": 22814 + }, + { + "epoch": 1.5311902285158216, + "grad_norm": 4.849297046661377, + "learning_rate": 1.3725234666760428e-05, + "loss": 1.9783, + "step": 22816 + }, + { + "epoch": 1.5313244521995906, + "grad_norm": 4.125189781188965, + "learning_rate": 1.3717755660617365e-05, + "loss": 1.8995, + "step": 22818 + }, + { + "epoch": 1.5314586758833597, + "grad_norm": 3.439800500869751, + "learning_rate": 1.3710278368738998e-05, + "loss": 1.7638, + "step": 22820 + }, + { + "epoch": 1.5315928995671286, + "grad_norm": 3.6717662811279297, + "learning_rate": 1.3702802791478625e-05, + "loss": 1.8849, + "step": 22822 + }, + { + "epoch": 1.5317271232508975, + "grad_norm": 4.6779046058654785, + "learning_rate": 1.3695328929189426e-05, + "loss": 2.0494, + "step": 22824 + }, + { + "epoch": 1.5318613469346665, + "grad_norm": 4.484787940979004, + "learning_rate": 1.3687856782224562e-05, + "loss": 1.8538, + "step": 22826 + }, + { + "epoch": 1.5319955706184356, + "grad_norm": 3.9826228618621826, + "learning_rate": 1.3680386350937058e-05, + "loss": 2.01, + "step": 22828 + }, + { + "epoch": 1.5321297943022048, + "grad_norm": 3.8376636505126953, + "learning_rate": 1.3672917635679877e-05, + "loss": 1.8906, + "step": 22830 + }, + { + "epoch": 1.5322640179859737, + "grad_norm": 3.3750874996185303, + "learning_rate": 1.3665450636805893e-05, + "loss": 1.6525, + "step": 22832 + }, + { + "epoch": 1.5323982416697426, + "grad_norm": 4.226762294769287, + "learning_rate": 1.3657985354667929e-05, + "loss": 2.0254, + "step": 22834 + }, + { + "epoch": 1.5325324653535115, + "grad_norm": 4.599710941314697, + "learning_rate": 1.365052178961868e-05, + "loss": 2.0535, + "step": 22836 + }, + { + "epoch": 1.5326666890372806, + "grad_norm": 4.327156066894531, + "learning_rate": 1.3643059942010832e-05, + "loss": 1.9383, + "step": 22838 + }, + { + "epoch": 1.5328009127210498, + "grad_norm": 4.542041778564453, + "learning_rate": 1.3635599812196875e-05, + "loss": 1.9641, + "step": 22840 + }, + { + "epoch": 1.5329351364048187, + "grad_norm": 3.8770532608032227, + "learning_rate": 1.3628141400529337e-05, + "loss": 1.9808, + "step": 22842 + }, + { + "epoch": 1.5330693600885876, + "grad_norm": 4.407589435577393, + "learning_rate": 1.3620684707360576e-05, + "loss": 1.9515, + "step": 22844 + }, + { + "epoch": 1.5332035837723565, + "grad_norm": 4.299392223358154, + "learning_rate": 1.3613229733042943e-05, + "loss": 1.7232, + "step": 22846 + }, + { + "epoch": 1.5333378074561255, + "grad_norm": 4.127176284790039, + "learning_rate": 1.3605776477928656e-05, + "loss": 1.7419, + "step": 22848 + }, + { + "epoch": 1.5334720311398946, + "grad_norm": 4.622857570648193, + "learning_rate": 1.359832494236986e-05, + "loss": 1.9017, + "step": 22850 + }, + { + "epoch": 1.5336062548236638, + "grad_norm": 4.701850891113281, + "learning_rate": 1.3590875126718621e-05, + "loss": 1.7471, + "step": 22852 + }, + { + "epoch": 1.5337404785074327, + "grad_norm": 4.5074920654296875, + "learning_rate": 1.3583427031326946e-05, + "loss": 2.0178, + "step": 22854 + }, + { + "epoch": 1.5338747021912016, + "grad_norm": 3.637558937072754, + "learning_rate": 1.3575980656546732e-05, + "loss": 1.8216, + "step": 22856 + }, + { + "epoch": 1.5340089258749705, + "grad_norm": 3.8298635482788086, + "learning_rate": 1.3568536002729815e-05, + "loss": 1.9875, + "step": 22858 + }, + { + "epoch": 1.5341431495587396, + "grad_norm": 4.1123504638671875, + "learning_rate": 1.3561093070227909e-05, + "loss": 1.9837, + "step": 22860 + }, + { + "epoch": 1.5342773732425088, + "grad_norm": 3.8930699825286865, + "learning_rate": 1.3553651859392725e-05, + "loss": 1.7965, + "step": 22862 + }, + { + "epoch": 1.5344115969262777, + "grad_norm": 3.7589528560638428, + "learning_rate": 1.35462123705758e-05, + "loss": 1.8462, + "step": 22864 + }, + { + "epoch": 1.5345458206100466, + "grad_norm": 4.138334274291992, + "learning_rate": 1.353877460412869e-05, + "loss": 1.8622, + "step": 22866 + }, + { + "epoch": 1.5346800442938155, + "grad_norm": 4.17948055267334, + "learning_rate": 1.3531338560402767e-05, + "loss": 1.8809, + "step": 22868 + }, + { + "epoch": 1.5348142679775847, + "grad_norm": 3.560441017150879, + "learning_rate": 1.352390423974937e-05, + "loss": 1.6917, + "step": 22870 + }, + { + "epoch": 1.5349484916613536, + "grad_norm": 4.450195789337158, + "learning_rate": 1.3516471642519784e-05, + "loss": 1.8667, + "step": 22872 + }, + { + "epoch": 1.5350827153451228, + "grad_norm": 4.474478721618652, + "learning_rate": 1.3509040769065157e-05, + "loss": 2.0715, + "step": 22874 + }, + { + "epoch": 1.5352169390288917, + "grad_norm": 3.9955804347991943, + "learning_rate": 1.3501611619736609e-05, + "loss": 1.8177, + "step": 22876 + }, + { + "epoch": 1.5353511627126606, + "grad_norm": 3.899465322494507, + "learning_rate": 1.3494184194885146e-05, + "loss": 1.892, + "step": 22878 + }, + { + "epoch": 1.5354853863964295, + "grad_norm": 4.6738386154174805, + "learning_rate": 1.3486758494861696e-05, + "loss": 1.9962, + "step": 22880 + }, + { + "epoch": 1.5356196100801986, + "grad_norm": 3.580564498901367, + "learning_rate": 1.3479334520017084e-05, + "loss": 1.7125, + "step": 22882 + }, + { + "epoch": 1.5357538337639678, + "grad_norm": 4.0820393562316895, + "learning_rate": 1.347191227070212e-05, + "loss": 1.6154, + "step": 22884 + }, + { + "epoch": 1.5358880574477367, + "grad_norm": 6.205446243286133, + "learning_rate": 1.3464491747267472e-05, + "loss": 1.9704, + "step": 22886 + }, + { + "epoch": 1.5360222811315056, + "grad_norm": 3.52032732963562, + "learning_rate": 1.3457072950063748e-05, + "loss": 2.0795, + "step": 22888 + }, + { + "epoch": 1.5361565048152745, + "grad_norm": 4.113733291625977, + "learning_rate": 1.3449655879441447e-05, + "loss": 1.9839, + "step": 22890 + }, + { + "epoch": 1.5362907284990437, + "grad_norm": 4.115777015686035, + "learning_rate": 1.3442240535751049e-05, + "loss": 1.8266, + "step": 22892 + }, + { + "epoch": 1.5364249521828126, + "grad_norm": 3.5229761600494385, + "learning_rate": 1.3434826919342901e-05, + "loss": 1.7295, + "step": 22894 + }, + { + "epoch": 1.5365591758665818, + "grad_norm": 4.207487106323242, + "learning_rate": 1.342741503056728e-05, + "loss": 1.9259, + "step": 22896 + }, + { + "epoch": 1.5366933995503507, + "grad_norm": 4.125962257385254, + "learning_rate": 1.3420004869774367e-05, + "loss": 1.7156, + "step": 22898 + }, + { + "epoch": 1.5368276232341196, + "grad_norm": 3.917424440383911, + "learning_rate": 1.3412596437314302e-05, + "loss": 1.9919, + "step": 22900 + }, + { + "epoch": 1.5369618469178885, + "grad_norm": 4.307941436767578, + "learning_rate": 1.3405189733537098e-05, + "loss": 1.6942, + "step": 22902 + }, + { + "epoch": 1.5370960706016576, + "grad_norm": 4.0802764892578125, + "learning_rate": 1.3397784758792731e-05, + "loss": 1.9221, + "step": 22904 + }, + { + "epoch": 1.5372302942854268, + "grad_norm": 4.473561763763428, + "learning_rate": 1.339038151343106e-05, + "loss": 2.0395, + "step": 22906 + }, + { + "epoch": 1.5373645179691957, + "grad_norm": 4.655192852020264, + "learning_rate": 1.3382979997801876e-05, + "loss": 1.8647, + "step": 22908 + }, + { + "epoch": 1.5374987416529646, + "grad_norm": 4.384570121765137, + "learning_rate": 1.3375580212254862e-05, + "loss": 1.9056, + "step": 22910 + }, + { + "epoch": 1.5376329653367335, + "grad_norm": 4.540384292602539, + "learning_rate": 1.3368182157139686e-05, + "loss": 2.0077, + "step": 22912 + }, + { + "epoch": 1.5377671890205027, + "grad_norm": 4.09638786315918, + "learning_rate": 1.3360785832805867e-05, + "loss": 2.0653, + "step": 22914 + }, + { + "epoch": 1.5379014127042718, + "grad_norm": 13.87023639678955, + "learning_rate": 1.335339123960287e-05, + "loss": 1.7648, + "step": 22916 + }, + { + "epoch": 1.5380356363880407, + "grad_norm": 4.006661415100098, + "learning_rate": 1.3345998377880053e-05, + "loss": 2.0117, + "step": 22918 + }, + { + "epoch": 1.5381698600718097, + "grad_norm": 3.925283908843994, + "learning_rate": 1.3338607247986756e-05, + "loss": 2.1873, + "step": 22920 + }, + { + "epoch": 1.5383040837555786, + "grad_norm": 3.8127572536468506, + "learning_rate": 1.3331217850272176e-05, + "loss": 1.8964, + "step": 22922 + }, + { + "epoch": 1.5384383074393475, + "grad_norm": 3.2334089279174805, + "learning_rate": 1.3323830185085445e-05, + "loss": 1.6414, + "step": 22924 + }, + { + "epoch": 1.5385725311231166, + "grad_norm": 4.228536128997803, + "learning_rate": 1.3316444252775601e-05, + "loss": 1.9436, + "step": 22926 + }, + { + "epoch": 1.5387067548068858, + "grad_norm": 4.5066328048706055, + "learning_rate": 1.3309060053691646e-05, + "loss": 1.8607, + "step": 22928 + }, + { + "epoch": 1.5388409784906547, + "grad_norm": 4.352493762969971, + "learning_rate": 1.3301677588182442e-05, + "loss": 1.8123, + "step": 22930 + }, + { + "epoch": 1.5389752021744236, + "grad_norm": 4.683513164520264, + "learning_rate": 1.3294296856596821e-05, + "loss": 2.0517, + "step": 22932 + }, + { + "epoch": 1.5391094258581925, + "grad_norm": 4.122289657592773, + "learning_rate": 1.328691785928351e-05, + "loss": 2.0852, + "step": 22934 + }, + { + "epoch": 1.5392436495419617, + "grad_norm": 4.243522644042969, + "learning_rate": 1.3279540596591105e-05, + "loss": 2.005, + "step": 22936 + }, + { + "epoch": 1.5393778732257308, + "grad_norm": 4.655456066131592, + "learning_rate": 1.327216506886822e-05, + "loss": 1.8857, + "step": 22938 + }, + { + "epoch": 1.5395120969094997, + "grad_norm": 3.776491641998291, + "learning_rate": 1.326479127646329e-05, + "loss": 1.7748, + "step": 22940 + }, + { + "epoch": 1.5396463205932687, + "grad_norm": 4.068592071533203, + "learning_rate": 1.325741921972476e-05, + "loss": 2.089, + "step": 22942 + }, + { + "epoch": 1.5397805442770376, + "grad_norm": 3.768444299697876, + "learning_rate": 1.325004889900091e-05, + "loss": 1.9055, + "step": 22944 + }, + { + "epoch": 1.5399147679608067, + "grad_norm": 4.284362316131592, + "learning_rate": 1.3242680314639993e-05, + "loss": 2.2157, + "step": 22946 + }, + { + "epoch": 1.5400489916445756, + "grad_norm": 3.718061685562134, + "learning_rate": 1.3235313466990129e-05, + "loss": 2.1138, + "step": 22948 + }, + { + "epoch": 1.5401832153283448, + "grad_norm": 3.9741787910461426, + "learning_rate": 1.3227948356399428e-05, + "loss": 2.1037, + "step": 22950 + }, + { + "epoch": 1.5403174390121137, + "grad_norm": 4.288960933685303, + "learning_rate": 1.3220584983215855e-05, + "loss": 1.8456, + "step": 22952 + }, + { + "epoch": 1.5404516626958826, + "grad_norm": 4.0449395179748535, + "learning_rate": 1.3213223347787324e-05, + "loss": 1.7854, + "step": 22954 + }, + { + "epoch": 1.5405858863796515, + "grad_norm": 4.2197418212890625, + "learning_rate": 1.320586345046163e-05, + "loss": 1.9207, + "step": 22956 + }, + { + "epoch": 1.5407201100634207, + "grad_norm": 3.89298415184021, + "learning_rate": 1.3198505291586555e-05, + "loss": 1.9391, + "step": 22958 + }, + { + "epoch": 1.5408543337471898, + "grad_norm": 3.7604570388793945, + "learning_rate": 1.3191148871509724e-05, + "loss": 2.0287, + "step": 22960 + }, + { + "epoch": 1.5409885574309587, + "grad_norm": 4.4350714683532715, + "learning_rate": 1.3183794190578752e-05, + "loss": 1.8133, + "step": 22962 + }, + { + "epoch": 1.5411227811147277, + "grad_norm": 4.076048374176025, + "learning_rate": 1.317644124914108e-05, + "loss": 2.1372, + "step": 22964 + }, + { + "epoch": 1.5412570047984966, + "grad_norm": 4.162502765655518, + "learning_rate": 1.3169090047544164e-05, + "loss": 1.7974, + "step": 22966 + }, + { + "epoch": 1.5413912284822657, + "grad_norm": 4.184635639190674, + "learning_rate": 1.3161740586135308e-05, + "loss": 1.6165, + "step": 22968 + }, + { + "epoch": 1.5415254521660346, + "grad_norm": 4.276618003845215, + "learning_rate": 1.3154392865261778e-05, + "loss": 1.9571, + "step": 22970 + }, + { + "epoch": 1.5416596758498038, + "grad_norm": 5.349656105041504, + "learning_rate": 1.3147046885270736e-05, + "loss": 1.939, + "step": 22972 + }, + { + "epoch": 1.5417938995335727, + "grad_norm": 4.0526885986328125, + "learning_rate": 1.3139702646509255e-05, + "loss": 1.7739, + "step": 22974 + }, + { + "epoch": 1.5419281232173416, + "grad_norm": 3.848656415939331, + "learning_rate": 1.313236014932433e-05, + "loss": 1.833, + "step": 22976 + }, + { + "epoch": 1.5420623469011105, + "grad_norm": 3.730478286743164, + "learning_rate": 1.312501939406291e-05, + "loss": 2.0162, + "step": 22978 + }, + { + "epoch": 1.5421965705848797, + "grad_norm": 3.43464732170105, + "learning_rate": 1.3117680381071807e-05, + "loss": 1.7971, + "step": 22980 + }, + { + "epoch": 1.5423307942686488, + "grad_norm": 4.051702976226807, + "learning_rate": 1.3110343110697782e-05, + "loss": 1.8062, + "step": 22982 + }, + { + "epoch": 1.5424650179524177, + "grad_norm": 7.850664138793945, + "learning_rate": 1.3103007583287486e-05, + "loss": 1.8935, + "step": 22984 + }, + { + "epoch": 1.5425992416361867, + "grad_norm": 4.261627674102783, + "learning_rate": 1.3095673799187546e-05, + "loss": 1.8919, + "step": 22986 + }, + { + "epoch": 1.5427334653199556, + "grad_norm": 4.449037075042725, + "learning_rate": 1.3088341758744432e-05, + "loss": 1.9709, + "step": 22988 + }, + { + "epoch": 1.5428676890037247, + "grad_norm": 5.000998020172119, + "learning_rate": 1.3081011462304615e-05, + "loss": 1.9607, + "step": 22990 + }, + { + "epoch": 1.5430019126874939, + "grad_norm": 3.8069634437561035, + "learning_rate": 1.3073682910214386e-05, + "loss": 1.8216, + "step": 22992 + }, + { + "epoch": 1.5431361363712628, + "grad_norm": 4.1344404220581055, + "learning_rate": 1.3066356102820032e-05, + "loss": 2.1654, + "step": 22994 + }, + { + "epoch": 1.5432703600550317, + "grad_norm": 3.716858148574829, + "learning_rate": 1.3059031040467717e-05, + "loss": 1.7607, + "step": 22996 + }, + { + "epoch": 1.5434045837388006, + "grad_norm": 4.4152092933654785, + "learning_rate": 1.305170772350356e-05, + "loss": 1.8303, + "step": 22998 + }, + { + "epoch": 1.5435388074225695, + "grad_norm": 3.795029401779175, + "learning_rate": 1.3044386152273552e-05, + "loss": 1.834, + "step": 23000 + }, + { + "epoch": 1.5436730311063387, + "grad_norm": 4.018708229064941, + "learning_rate": 1.303706632712363e-05, + "loss": 2.0422, + "step": 23002 + }, + { + "epoch": 1.5438072547901078, + "grad_norm": 4.102641582489014, + "learning_rate": 1.3029748248399642e-05, + "loss": 1.8574, + "step": 23004 + }, + { + "epoch": 1.5439414784738767, + "grad_norm": 3.8628764152526855, + "learning_rate": 1.302243191644733e-05, + "loss": 1.8395, + "step": 23006 + }, + { + "epoch": 1.5440757021576457, + "grad_norm": 3.8958916664123535, + "learning_rate": 1.3015117331612415e-05, + "loss": 1.9035, + "step": 23008 + }, + { + "epoch": 1.5442099258414146, + "grad_norm": 4.328701019287109, + "learning_rate": 1.3007804494240478e-05, + "loss": 2.0552, + "step": 23010 + }, + { + "epoch": 1.5443441495251837, + "grad_norm": 3.9924535751342773, + "learning_rate": 1.3000493404677039e-05, + "loss": 1.7481, + "step": 23012 + }, + { + "epoch": 1.5444783732089529, + "grad_norm": 4.563597202301025, + "learning_rate": 1.2993184063267516e-05, + "loss": 1.8558, + "step": 23014 + }, + { + "epoch": 1.5446125968927218, + "grad_norm": 4.342920780181885, + "learning_rate": 1.2985876470357284e-05, + "loss": 2.1565, + "step": 23016 + }, + { + "epoch": 1.5447468205764907, + "grad_norm": 3.950878143310547, + "learning_rate": 1.2978570626291609e-05, + "loss": 1.9703, + "step": 23018 + }, + { + "epoch": 1.5448810442602596, + "grad_norm": 3.7809464931488037, + "learning_rate": 1.2971266531415672e-05, + "loss": 1.8722, + "step": 23020 + }, + { + "epoch": 1.5450152679440288, + "grad_norm": 4.232345104217529, + "learning_rate": 1.2963964186074563e-05, + "loss": 2.097, + "step": 23022 + }, + { + "epoch": 1.5451494916277977, + "grad_norm": 4.5549702644348145, + "learning_rate": 1.2956663590613327e-05, + "loss": 2.2775, + "step": 23024 + }, + { + "epoch": 1.5452837153115668, + "grad_norm": 4.0149922370910645, + "learning_rate": 1.2949364745376884e-05, + "loss": 2.0127, + "step": 23026 + }, + { + "epoch": 1.5454179389953357, + "grad_norm": 3.868597984313965, + "learning_rate": 1.2942067650710116e-05, + "loss": 1.7605, + "step": 23028 + }, + { + "epoch": 1.5455521626791047, + "grad_norm": 3.0620241165161133, + "learning_rate": 1.2934772306957776e-05, + "loss": 1.812, + "step": 23030 + }, + { + "epoch": 1.5456863863628736, + "grad_norm": 4.049479961395264, + "learning_rate": 1.2927478714464559e-05, + "loss": 1.9365, + "step": 23032 + }, + { + "epoch": 1.5458206100466427, + "grad_norm": 6.661264896392822, + "learning_rate": 1.2920186873575064e-05, + "loss": 1.9563, + "step": 23034 + }, + { + "epoch": 1.5459548337304119, + "grad_norm": 4.1661601066589355, + "learning_rate": 1.2912896784633833e-05, + "loss": 2.0761, + "step": 23036 + }, + { + "epoch": 1.5460890574141808, + "grad_norm": 4.069543838500977, + "learning_rate": 1.2905608447985301e-05, + "loss": 1.9318, + "step": 23038 + }, + { + "epoch": 1.5462232810979497, + "grad_norm": 4.501739025115967, + "learning_rate": 1.2898321863973829e-05, + "loss": 2.0284, + "step": 23040 + }, + { + "epoch": 1.5463575047817186, + "grad_norm": 3.8284754753112793, + "learning_rate": 1.2891037032943676e-05, + "loss": 2.0783, + "step": 23042 + }, + { + "epoch": 1.5464917284654878, + "grad_norm": 3.891153573989868, + "learning_rate": 1.288375395523907e-05, + "loss": 1.825, + "step": 23044 + }, + { + "epoch": 1.5466259521492567, + "grad_norm": 4.5172600746154785, + "learning_rate": 1.2876472631204095e-05, + "loss": 2.1348, + "step": 23046 + }, + { + "epoch": 1.5467601758330258, + "grad_norm": 4.466907501220703, + "learning_rate": 1.2869193061182794e-05, + "loss": 1.9987, + "step": 23048 + }, + { + "epoch": 1.5468943995167947, + "grad_norm": 4.4137654304504395, + "learning_rate": 1.286191524551909e-05, + "loss": 1.8474, + "step": 23050 + }, + { + "epoch": 1.5470286232005637, + "grad_norm": 4.111526966094971, + "learning_rate": 1.2854639184556877e-05, + "loss": 1.8972, + "step": 23052 + }, + { + "epoch": 1.5471628468843326, + "grad_norm": 4.255780220031738, + "learning_rate": 1.2847364878639905e-05, + "loss": 2.1569, + "step": 23054 + }, + { + "epoch": 1.5472970705681017, + "grad_norm": 3.8170714378356934, + "learning_rate": 1.2840092328111903e-05, + "loss": 2.111, + "step": 23056 + }, + { + "epoch": 1.5474312942518709, + "grad_norm": 4.419047832489014, + "learning_rate": 1.2832821533316464e-05, + "loss": 1.7642, + "step": 23058 + }, + { + "epoch": 1.5475655179356398, + "grad_norm": 3.820173978805542, + "learning_rate": 1.282555249459712e-05, + "loss": 2.0332, + "step": 23060 + }, + { + "epoch": 1.5476997416194087, + "grad_norm": 3.9159295558929443, + "learning_rate": 1.2818285212297303e-05, + "loss": 1.6598, + "step": 23062 + }, + { + "epoch": 1.5478339653031776, + "grad_norm": 6.844006061553955, + "learning_rate": 1.2811019686760412e-05, + "loss": 1.8508, + "step": 23064 + }, + { + "epoch": 1.5479681889869468, + "grad_norm": 4.276915073394775, + "learning_rate": 1.2803755918329712e-05, + "loss": 1.7883, + "step": 23066 + }, + { + "epoch": 1.548102412670716, + "grad_norm": 4.228566646575928, + "learning_rate": 1.2796493907348406e-05, + "loss": 2.1819, + "step": 23068 + }, + { + "epoch": 1.5482366363544848, + "grad_norm": 3.7150001525878906, + "learning_rate": 1.2789233654159582e-05, + "loss": 1.7973, + "step": 23070 + }, + { + "epoch": 1.5483708600382537, + "grad_norm": 5.32558012008667, + "learning_rate": 1.2781975159106319e-05, + "loss": 1.8807, + "step": 23072 + }, + { + "epoch": 1.5485050837220227, + "grad_norm": 3.755685567855835, + "learning_rate": 1.277471842253154e-05, + "loss": 1.9359, + "step": 23074 + }, + { + "epoch": 1.5486393074057916, + "grad_norm": 3.4314193725585938, + "learning_rate": 1.2767463444778117e-05, + "loss": 1.85, + "step": 23076 + }, + { + "epoch": 1.5487735310895607, + "grad_norm": 3.945371150970459, + "learning_rate": 1.276021022618883e-05, + "loss": 1.7856, + "step": 23078 + }, + { + "epoch": 1.5489077547733299, + "grad_norm": 4.405202388763428, + "learning_rate": 1.2752958767106366e-05, + "loss": 1.9213, + "step": 23080 + }, + { + "epoch": 1.5490419784570988, + "grad_norm": 3.729142189025879, + "learning_rate": 1.2745709067873369e-05, + "loss": 1.7707, + "step": 23082 + }, + { + "epoch": 1.5491762021408677, + "grad_norm": 3.970487117767334, + "learning_rate": 1.2738461128832353e-05, + "loss": 1.8735, + "step": 23084 + }, + { + "epoch": 1.5493104258246366, + "grad_norm": 4.169857978820801, + "learning_rate": 1.2731214950325799e-05, + "loss": 1.9341, + "step": 23086 + }, + { + "epoch": 1.5494446495084058, + "grad_norm": 4.354539394378662, + "learning_rate": 1.2723970532696027e-05, + "loss": 1.9963, + "step": 23088 + }, + { + "epoch": 1.549578873192175, + "grad_norm": 4.051002025604248, + "learning_rate": 1.271672787628536e-05, + "loss": 1.7763, + "step": 23090 + }, + { + "epoch": 1.5497130968759438, + "grad_norm": 3.8751213550567627, + "learning_rate": 1.2709486981435976e-05, + "loss": 1.8323, + "step": 23092 + }, + { + "epoch": 1.5498473205597127, + "grad_norm": 3.852573871612549, + "learning_rate": 1.2702247848490012e-05, + "loss": 1.7908, + "step": 23094 + }, + { + "epoch": 1.5499815442434817, + "grad_norm": 3.8993279933929443, + "learning_rate": 1.2695010477789498e-05, + "loss": 1.9994, + "step": 23096 + }, + { + "epoch": 1.5501157679272508, + "grad_norm": 3.5364692211151123, + "learning_rate": 1.2687774869676384e-05, + "loss": 1.9492, + "step": 23098 + }, + { + "epoch": 1.5502499916110197, + "grad_norm": 3.454930543899536, + "learning_rate": 1.268054102449252e-05, + "loss": 1.4994, + "step": 23100 + }, + { + "epoch": 1.5503842152947889, + "grad_norm": 3.6576883792877197, + "learning_rate": 1.267330894257972e-05, + "loss": 1.7868, + "step": 23102 + }, + { + "epoch": 1.5505184389785578, + "grad_norm": 4.222530364990234, + "learning_rate": 1.2666078624279676e-05, + "loss": 1.9286, + "step": 23104 + }, + { + "epoch": 1.5506526626623267, + "grad_norm": 4.016541481018066, + "learning_rate": 1.2658850069933998e-05, + "loss": 1.7761, + "step": 23106 + }, + { + "epoch": 1.5507868863460956, + "grad_norm": 4.397496700286865, + "learning_rate": 1.2651623279884211e-05, + "loss": 2.0539, + "step": 23108 + }, + { + "epoch": 1.5509211100298648, + "grad_norm": 3.7025105953216553, + "learning_rate": 1.2644398254471801e-05, + "loss": 1.8516, + "step": 23110 + }, + { + "epoch": 1.551055333713634, + "grad_norm": 4.130939960479736, + "learning_rate": 1.2637174994038104e-05, + "loss": 1.9786, + "step": 23112 + }, + { + "epoch": 1.5511895573974028, + "grad_norm": 3.7187087535858154, + "learning_rate": 1.262995349892444e-05, + "loss": 1.9022, + "step": 23114 + }, + { + "epoch": 1.5513237810811717, + "grad_norm": 4.028985500335693, + "learning_rate": 1.2622733769471962e-05, + "loss": 1.8575, + "step": 23116 + }, + { + "epoch": 1.5514580047649407, + "grad_norm": 4.128970623016357, + "learning_rate": 1.2615515806021827e-05, + "loss": 1.8383, + "step": 23118 + }, + { + "epoch": 1.5515922284487098, + "grad_norm": 3.785112142562866, + "learning_rate": 1.2608299608915047e-05, + "loss": 1.9763, + "step": 23120 + }, + { + "epoch": 1.5517264521324787, + "grad_norm": 3.729605197906494, + "learning_rate": 1.2601085178492594e-05, + "loss": 1.9381, + "step": 23122 + }, + { + "epoch": 1.5518606758162479, + "grad_norm": 4.552008152008057, + "learning_rate": 1.2593872515095323e-05, + "loss": 1.9478, + "step": 23124 + }, + { + "epoch": 1.5519948995000168, + "grad_norm": 4.015423774719238, + "learning_rate": 1.2586661619064022e-05, + "loss": 2.0263, + "step": 23126 + }, + { + "epoch": 1.5521291231837857, + "grad_norm": 4.598470687866211, + "learning_rate": 1.2579452490739374e-05, + "loss": 2.0845, + "step": 23128 + }, + { + "epoch": 1.5522633468675546, + "grad_norm": 3.922083616256714, + "learning_rate": 1.257224513046203e-05, + "loss": 1.713, + "step": 23130 + }, + { + "epoch": 1.5523975705513238, + "grad_norm": 4.121368885040283, + "learning_rate": 1.2565039538572499e-05, + "loss": 1.995, + "step": 23132 + }, + { + "epoch": 1.552531794235093, + "grad_norm": 3.6992433071136475, + "learning_rate": 1.2557835715411237e-05, + "loss": 2.1686, + "step": 23134 + }, + { + "epoch": 1.5526660179188618, + "grad_norm": 3.8843462467193604, + "learning_rate": 1.25506336613186e-05, + "loss": 2.0398, + "step": 23136 + }, + { + "epoch": 1.5528002416026307, + "grad_norm": 4.504191875457764, + "learning_rate": 1.2543433376634894e-05, + "loss": 2.0301, + "step": 23138 + }, + { + "epoch": 1.5529344652863997, + "grad_norm": 4.368800640106201, + "learning_rate": 1.2536234861700307e-05, + "loss": 2.1649, + "step": 23140 + }, + { + "epoch": 1.5530686889701688, + "grad_norm": 4.178317546844482, + "learning_rate": 1.2529038116854951e-05, + "loss": 1.7308, + "step": 23142 + }, + { + "epoch": 1.553202912653938, + "grad_norm": 3.9967947006225586, + "learning_rate": 1.2521843142438867e-05, + "loss": 1.7779, + "step": 23144 + }, + { + "epoch": 1.5533371363377069, + "grad_norm": 3.9352617263793945, + "learning_rate": 1.2514649938791978e-05, + "loss": 1.6674, + "step": 23146 + }, + { + "epoch": 1.5534713600214758, + "grad_norm": 3.871527910232544, + "learning_rate": 1.2507458506254178e-05, + "loss": 1.9383, + "step": 23148 + }, + { + "epoch": 1.5536055837052447, + "grad_norm": 3.633159875869751, + "learning_rate": 1.250026884516523e-05, + "loss": 1.7873, + "step": 23150 + }, + { + "epoch": 1.5537398073890136, + "grad_norm": 4.62493371963501, + "learning_rate": 1.249308095586486e-05, + "loss": 2.0902, + "step": 23152 + }, + { + "epoch": 1.5538740310727828, + "grad_norm": 3.6730988025665283, + "learning_rate": 1.2485894838692641e-05, + "loss": 1.8629, + "step": 23154 + }, + { + "epoch": 1.554008254756552, + "grad_norm": 4.094191551208496, + "learning_rate": 1.2478710493988133e-05, + "loss": 1.8048, + "step": 23156 + }, + { + "epoch": 1.5541424784403208, + "grad_norm": 4.470035552978516, + "learning_rate": 1.2471527922090754e-05, + "loss": 2.1235, + "step": 23158 + }, + { + "epoch": 1.5542767021240897, + "grad_norm": 3.8640453815460205, + "learning_rate": 1.2464347123339904e-05, + "loss": 2.19, + "step": 23160 + }, + { + "epoch": 1.5544109258078587, + "grad_norm": 3.9915947914123535, + "learning_rate": 1.2457168098074839e-05, + "loss": 2.092, + "step": 23162 + }, + { + "epoch": 1.5545451494916278, + "grad_norm": 4.127565383911133, + "learning_rate": 1.2449990846634757e-05, + "loss": 1.9474, + "step": 23164 + }, + { + "epoch": 1.554679373175397, + "grad_norm": 4.22900390625, + "learning_rate": 1.244281536935875e-05, + "loss": 2.0466, + "step": 23166 + }, + { + "epoch": 1.5548135968591659, + "grad_norm": 4.180196762084961, + "learning_rate": 1.2435641666585884e-05, + "loss": 1.8837, + "step": 23168 + }, + { + "epoch": 1.5549478205429348, + "grad_norm": 3.9553468227386475, + "learning_rate": 1.242846973865508e-05, + "loss": 1.6956, + "step": 23170 + }, + { + "epoch": 1.5550820442267037, + "grad_norm": 3.905562400817871, + "learning_rate": 1.24212995859052e-05, + "loss": 2.0431, + "step": 23172 + }, + { + "epoch": 1.5552162679104728, + "grad_norm": 4.2321858406066895, + "learning_rate": 1.2414131208675006e-05, + "loss": 2.0348, + "step": 23174 + }, + { + "epoch": 1.5553504915942418, + "grad_norm": 4.032111644744873, + "learning_rate": 1.2406964607303212e-05, + "loss": 1.92, + "step": 23176 + }, + { + "epoch": 1.555484715278011, + "grad_norm": 3.969851016998291, + "learning_rate": 1.2399799782128407e-05, + "loss": 1.7922, + "step": 23178 + }, + { + "epoch": 1.5556189389617798, + "grad_norm": 3.9321844577789307, + "learning_rate": 1.2392636733489155e-05, + "loss": 2.0465, + "step": 23180 + }, + { + "epoch": 1.5557531626455487, + "grad_norm": 4.0366997718811035, + "learning_rate": 1.2385475461723833e-05, + "loss": 2.0265, + "step": 23182 + }, + { + "epoch": 1.5558873863293177, + "grad_norm": 3.911590337753296, + "learning_rate": 1.2378315967170845e-05, + "loss": 1.8511, + "step": 23184 + }, + { + "epoch": 1.5560216100130868, + "grad_norm": 4.396181106567383, + "learning_rate": 1.2371158250168436e-05, + "loss": 2.0133, + "step": 23186 + }, + { + "epoch": 1.556155833696856, + "grad_norm": 3.8109326362609863, + "learning_rate": 1.2364002311054818e-05, + "loss": 1.9142, + "step": 23188 + }, + { + "epoch": 1.5562900573806249, + "grad_norm": 4.495974540710449, + "learning_rate": 1.2356848150168082e-05, + "loss": 1.9549, + "step": 23190 + }, + { + "epoch": 1.5564242810643938, + "grad_norm": 4.384909629821777, + "learning_rate": 1.2349695767846247e-05, + "loss": 1.969, + "step": 23192 + }, + { + "epoch": 1.5565585047481627, + "grad_norm": 4.50256872177124, + "learning_rate": 1.2342545164427244e-05, + "loss": 1.7707, + "step": 23194 + }, + { + "epoch": 1.5566927284319318, + "grad_norm": 4.003553867340088, + "learning_rate": 1.233539634024894e-05, + "loss": 1.8028, + "step": 23196 + }, + { + "epoch": 1.5568269521157008, + "grad_norm": 4.631532669067383, + "learning_rate": 1.2328249295649097e-05, + "loss": 1.9937, + "step": 23198 + }, + { + "epoch": 1.55696117579947, + "grad_norm": 4.025347709655762, + "learning_rate": 1.232110403096539e-05, + "loss": 1.7154, + "step": 23200 + }, + { + "epoch": 1.5570953994832388, + "grad_norm": 3.659562826156616, + "learning_rate": 1.2313960546535419e-05, + "loss": 1.7308, + "step": 23202 + }, + { + "epoch": 1.5572296231670077, + "grad_norm": 4.220057964324951, + "learning_rate": 1.2306818842696716e-05, + "loss": 2.0082, + "step": 23204 + }, + { + "epoch": 1.5573638468507767, + "grad_norm": 4.362984657287598, + "learning_rate": 1.2299678919786689e-05, + "loss": 1.996, + "step": 23206 + }, + { + "epoch": 1.5574980705345458, + "grad_norm": 4.236664772033691, + "learning_rate": 1.229254077814273e-05, + "loss": 1.9894, + "step": 23208 + }, + { + "epoch": 1.557632294218315, + "grad_norm": 3.6426761150360107, + "learning_rate": 1.2285404418102053e-05, + "loss": 1.7907, + "step": 23210 + }, + { + "epoch": 1.5577665179020839, + "grad_norm": 3.8428211212158203, + "learning_rate": 1.2278269840001838e-05, + "loss": 1.815, + "step": 23212 + }, + { + "epoch": 1.5579007415858528, + "grad_norm": 4.670727729797363, + "learning_rate": 1.2271137044179215e-05, + "loss": 1.9897, + "step": 23214 + }, + { + "epoch": 1.5580349652696217, + "grad_norm": 4.059131622314453, + "learning_rate": 1.226400603097116e-05, + "loss": 2.0096, + "step": 23216 + }, + { + "epoch": 1.5581691889533908, + "grad_norm": 4.1138129234313965, + "learning_rate": 1.2256876800714633e-05, + "loss": 1.9437, + "step": 23218 + }, + { + "epoch": 1.55830341263716, + "grad_norm": 3.687633991241455, + "learning_rate": 1.2249749353746454e-05, + "loss": 1.8802, + "step": 23220 + }, + { + "epoch": 1.558437636320929, + "grad_norm": 3.5630531311035156, + "learning_rate": 1.2242623690403387e-05, + "loss": 1.7211, + "step": 23222 + }, + { + "epoch": 1.5585718600046978, + "grad_norm": 3.7437312602996826, + "learning_rate": 1.223549981102209e-05, + "loss": 1.7313, + "step": 23224 + }, + { + "epoch": 1.5587060836884667, + "grad_norm": 4.54882287979126, + "learning_rate": 1.2228377715939183e-05, + "loss": 1.9919, + "step": 23226 + }, + { + "epoch": 1.5588403073722357, + "grad_norm": 4.136166572570801, + "learning_rate": 1.2221257405491148e-05, + "loss": 1.8909, + "step": 23228 + }, + { + "epoch": 1.5589745310560048, + "grad_norm": 4.067768573760986, + "learning_rate": 1.2214138880014414e-05, + "loss": 2.0557, + "step": 23230 + }, + { + "epoch": 1.559108754739774, + "grad_norm": 3.8356239795684814, + "learning_rate": 1.2207022139845303e-05, + "loss": 1.7976, + "step": 23232 + }, + { + "epoch": 1.5592429784235429, + "grad_norm": 4.399779796600342, + "learning_rate": 1.2199907185320091e-05, + "loss": 1.8387, + "step": 23234 + }, + { + "epoch": 1.5593772021073118, + "grad_norm": 4.23944616317749, + "learning_rate": 1.2192794016774922e-05, + "loss": 1.5864, + "step": 23236 + }, + { + "epoch": 1.5595114257910807, + "grad_norm": 3.797048330307007, + "learning_rate": 1.218568263454592e-05, + "loss": 1.6822, + "step": 23238 + }, + { + "epoch": 1.5596456494748498, + "grad_norm": 4.398746967315674, + "learning_rate": 1.2178573038969026e-05, + "loss": 1.8342, + "step": 23240 + }, + { + "epoch": 1.559779873158619, + "grad_norm": 3.8012681007385254, + "learning_rate": 1.2171465230380196e-05, + "loss": 1.9104, + "step": 23242 + }, + { + "epoch": 1.559914096842388, + "grad_norm": 3.6855227947235107, + "learning_rate": 1.2164359209115234e-05, + "loss": 1.8151, + "step": 23244 + }, + { + "epoch": 1.5600483205261568, + "grad_norm": 3.913891315460205, + "learning_rate": 1.2157254975509912e-05, + "loss": 1.9341, + "step": 23246 + }, + { + "epoch": 1.5601825442099257, + "grad_norm": 3.7453980445861816, + "learning_rate": 1.2150152529899878e-05, + "loss": 1.8755, + "step": 23248 + }, + { + "epoch": 1.560316767893695, + "grad_norm": 4.444465637207031, + "learning_rate": 1.2143051872620708e-05, + "loss": 1.8496, + "step": 23250 + }, + { + "epoch": 1.5604509915774638, + "grad_norm": 3.8847548961639404, + "learning_rate": 1.2135953004007882e-05, + "loss": 1.963, + "step": 23252 + }, + { + "epoch": 1.560585215261233, + "grad_norm": 4.711169719696045, + "learning_rate": 1.2128855924396837e-05, + "loss": 2.0082, + "step": 23254 + }, + { + "epoch": 1.5607194389450019, + "grad_norm": 4.187530517578125, + "learning_rate": 1.2121760634122875e-05, + "loss": 2.0042, + "step": 23256 + }, + { + "epoch": 1.5608536626287708, + "grad_norm": 3.809532880783081, + "learning_rate": 1.211466713352124e-05, + "loss": 1.7272, + "step": 23258 + }, + { + "epoch": 1.5609878863125397, + "grad_norm": 4.628067493438721, + "learning_rate": 1.210757542292707e-05, + "loss": 2.024, + "step": 23260 + }, + { + "epoch": 1.5611221099963088, + "grad_norm": 4.171805381774902, + "learning_rate": 1.2100485502675463e-05, + "loss": 2.0162, + "step": 23262 + }, + { + "epoch": 1.561256333680078, + "grad_norm": 4.767007827758789, + "learning_rate": 1.2093397373101373e-05, + "loss": 2.0522, + "step": 23264 + }, + { + "epoch": 1.561390557363847, + "grad_norm": 4.2744975090026855, + "learning_rate": 1.2086311034539749e-05, + "loss": 2.151, + "step": 23266 + }, + { + "epoch": 1.5615247810476158, + "grad_norm": 4.478797435760498, + "learning_rate": 1.2079226487325347e-05, + "loss": 1.8455, + "step": 23268 + }, + { + "epoch": 1.5616590047313847, + "grad_norm": 4.261718273162842, + "learning_rate": 1.2072143731792934e-05, + "loss": 1.8612, + "step": 23270 + }, + { + "epoch": 1.561793228415154, + "grad_norm": 4.119487285614014, + "learning_rate": 1.2065062768277135e-05, + "loss": 1.8961, + "step": 23272 + }, + { + "epoch": 1.5619274520989228, + "grad_norm": 4.238527774810791, + "learning_rate": 1.2057983597112543e-05, + "loss": 1.8033, + "step": 23274 + }, + { + "epoch": 1.562061675782692, + "grad_norm": 4.242852687835693, + "learning_rate": 1.2050906218633617e-05, + "loss": 2.0533, + "step": 23276 + }, + { + "epoch": 1.5621958994664609, + "grad_norm": 4.178158760070801, + "learning_rate": 1.2043830633174747e-05, + "loss": 1.9166, + "step": 23278 + }, + { + "epoch": 1.5623301231502298, + "grad_norm": 3.8235819339752197, + "learning_rate": 1.203675684107024e-05, + "loss": 1.9546, + "step": 23280 + }, + { + "epoch": 1.5624643468339987, + "grad_norm": 4.195991516113281, + "learning_rate": 1.2029684842654304e-05, + "loss": 1.9649, + "step": 23282 + }, + { + "epoch": 1.5625985705177678, + "grad_norm": 4.3736796379089355, + "learning_rate": 1.2022614638261109e-05, + "loss": 2.1076, + "step": 23284 + }, + { + "epoch": 1.562732794201537, + "grad_norm": 3.9171433448791504, + "learning_rate": 1.20155462282247e-05, + "loss": 1.8839, + "step": 23286 + }, + { + "epoch": 1.562867017885306, + "grad_norm": 4.02685022354126, + "learning_rate": 1.2008479612879032e-05, + "loss": 2.0953, + "step": 23288 + }, + { + "epoch": 1.5630012415690748, + "grad_norm": 4.102956771850586, + "learning_rate": 1.200141479255798e-05, + "loss": 2.1892, + "step": 23290 + }, + { + "epoch": 1.5631354652528437, + "grad_norm": 4.462805271148682, + "learning_rate": 1.1994351767595374e-05, + "loss": 2.1295, + "step": 23292 + }, + { + "epoch": 1.5632696889366129, + "grad_norm": 4.515209197998047, + "learning_rate": 1.1987290538324914e-05, + "loss": 1.8549, + "step": 23294 + }, + { + "epoch": 1.563403912620382, + "grad_norm": 4.01509428024292, + "learning_rate": 1.1980231105080225e-05, + "loss": 2.159, + "step": 23296 + }, + { + "epoch": 1.563538136304151, + "grad_norm": 4.078320503234863, + "learning_rate": 1.1973173468194843e-05, + "loss": 1.7554, + "step": 23298 + }, + { + "epoch": 1.5636723599879199, + "grad_norm": 4.302107334136963, + "learning_rate": 1.1966117628002249e-05, + "loss": 2.0513, + "step": 23300 + }, + { + "epoch": 1.5638065836716888, + "grad_norm": 3.5099565982818604, + "learning_rate": 1.1959063584835795e-05, + "loss": 1.7157, + "step": 23302 + }, + { + "epoch": 1.5639408073554577, + "grad_norm": 5.461663246154785, + "learning_rate": 1.1952011339028807e-05, + "loss": 1.9323, + "step": 23304 + }, + { + "epoch": 1.5640750310392268, + "grad_norm": 4.364641189575195, + "learning_rate": 1.1944960890914442e-05, + "loss": 1.9997, + "step": 23306 + }, + { + "epoch": 1.564209254722996, + "grad_norm": 3.9984614849090576, + "learning_rate": 1.1937912240825849e-05, + "loss": 1.9349, + "step": 23308 + }, + { + "epoch": 1.564343478406765, + "grad_norm": 3.9506547451019287, + "learning_rate": 1.1930865389096052e-05, + "loss": 2.0807, + "step": 23310 + }, + { + "epoch": 1.5644777020905338, + "grad_norm": 3.9210219383239746, + "learning_rate": 1.1923820336058012e-05, + "loss": 1.8063, + "step": 23312 + }, + { + "epoch": 1.5646119257743027, + "grad_norm": 4.311283111572266, + "learning_rate": 1.1916777082044588e-05, + "loss": 1.8969, + "step": 23314 + }, + { + "epoch": 1.5647461494580719, + "grad_norm": 4.241171836853027, + "learning_rate": 1.1909735627388562e-05, + "loss": 1.6978, + "step": 23316 + }, + { + "epoch": 1.564880373141841, + "grad_norm": 4.514918804168701, + "learning_rate": 1.1902695972422618e-05, + "loss": 1.9275, + "step": 23318 + }, + { + "epoch": 1.56501459682561, + "grad_norm": 4.150766849517822, + "learning_rate": 1.189565811747938e-05, + "loss": 2.1829, + "step": 23320 + }, + { + "epoch": 1.5651488205093789, + "grad_norm": 4.5488457679748535, + "learning_rate": 1.188862206289137e-05, + "loss": 1.9888, + "step": 23322 + }, + { + "epoch": 1.5652830441931478, + "grad_norm": 3.7803738117218018, + "learning_rate": 1.1881587808991023e-05, + "loss": 2.0236, + "step": 23324 + }, + { + "epoch": 1.565417267876917, + "grad_norm": 4.176850318908691, + "learning_rate": 1.1874555356110684e-05, + "loss": 2.1655, + "step": 23326 + }, + { + "epoch": 1.5655514915606858, + "grad_norm": 3.906270742416382, + "learning_rate": 1.1867524704582644e-05, + "loss": 1.7802, + "step": 23328 + }, + { + "epoch": 1.565685715244455, + "grad_norm": 4.348519325256348, + "learning_rate": 1.1860495854739067e-05, + "loss": 1.8008, + "step": 23330 + }, + { + "epoch": 1.565819938928224, + "grad_norm": 4.031594276428223, + "learning_rate": 1.1853468806912088e-05, + "loss": 1.6893, + "step": 23332 + }, + { + "epoch": 1.5659541626119928, + "grad_norm": 4.3443169593811035, + "learning_rate": 1.1846443561433673e-05, + "loss": 1.7586, + "step": 23334 + }, + { + "epoch": 1.5660883862957617, + "grad_norm": 3.663055419921875, + "learning_rate": 1.1839420118635785e-05, + "loss": 1.7544, + "step": 23336 + }, + { + "epoch": 1.5662226099795309, + "grad_norm": 3.5566518306732178, + "learning_rate": 1.1832398478850243e-05, + "loss": 1.7132, + "step": 23338 + }, + { + "epoch": 1.5663568336633, + "grad_norm": 3.436635732650757, + "learning_rate": 1.182537864240883e-05, + "loss": 1.5985, + "step": 23340 + }, + { + "epoch": 1.566491057347069, + "grad_norm": 4.151078701019287, + "learning_rate": 1.1818360609643215e-05, + "loss": 1.978, + "step": 23342 + }, + { + "epoch": 1.5666252810308379, + "grad_norm": 4.518624305725098, + "learning_rate": 1.1811344380884975e-05, + "loss": 2.1058, + "step": 23344 + }, + { + "epoch": 1.5667595047146068, + "grad_norm": 4.0468831062316895, + "learning_rate": 1.180432995646562e-05, + "loss": 1.7873, + "step": 23346 + }, + { + "epoch": 1.566893728398376, + "grad_norm": 10.45155143737793, + "learning_rate": 1.1797317336716552e-05, + "loss": 2.045, + "step": 23348 + }, + { + "epoch": 1.5670279520821448, + "grad_norm": 4.226406574249268, + "learning_rate": 1.179030652196913e-05, + "loss": 1.9563, + "step": 23350 + }, + { + "epoch": 1.567162175765914, + "grad_norm": 4.034966468811035, + "learning_rate": 1.1783297512554591e-05, + "loss": 2.0637, + "step": 23352 + }, + { + "epoch": 1.567296399449683, + "grad_norm": 3.7489116191864014, + "learning_rate": 1.1776290308804095e-05, + "loss": 1.8845, + "step": 23354 + }, + { + "epoch": 1.5674306231334518, + "grad_norm": 4.046446323394775, + "learning_rate": 1.1769284911048706e-05, + "loss": 1.8629, + "step": 23356 + }, + { + "epoch": 1.5675648468172207, + "grad_norm": 4.011538028717041, + "learning_rate": 1.176228131961944e-05, + "loss": 1.7649, + "step": 23358 + }, + { + "epoch": 1.5676990705009899, + "grad_norm": 4.02580451965332, + "learning_rate": 1.1755279534847186e-05, + "loss": 1.8286, + "step": 23360 + }, + { + "epoch": 1.567833294184759, + "grad_norm": 4.338679790496826, + "learning_rate": 1.1748279557062775e-05, + "loss": 1.8673, + "step": 23362 + }, + { + "epoch": 1.567967517868528, + "grad_norm": 4.571115493774414, + "learning_rate": 1.1741281386596913e-05, + "loss": 1.7742, + "step": 23364 + }, + { + "epoch": 1.5681017415522969, + "grad_norm": 3.8391013145446777, + "learning_rate": 1.1734285023780295e-05, + "loss": 1.7666, + "step": 23366 + }, + { + "epoch": 1.5682359652360658, + "grad_norm": 4.024032115936279, + "learning_rate": 1.1727290468943447e-05, + "loss": 1.8888, + "step": 23368 + }, + { + "epoch": 1.568370188919835, + "grad_norm": 4.320840835571289, + "learning_rate": 1.172029772241688e-05, + "loss": 2.0138, + "step": 23370 + }, + { + "epoch": 1.568504412603604, + "grad_norm": 3.8726394176483154, + "learning_rate": 1.171330678453097e-05, + "loss": 1.9739, + "step": 23372 + }, + { + "epoch": 1.568638636287373, + "grad_norm": 3.941127061843872, + "learning_rate": 1.1706317655616029e-05, + "loss": 1.7066, + "step": 23374 + }, + { + "epoch": 1.568772859971142, + "grad_norm": 3.629929780960083, + "learning_rate": 1.1699330336002261e-05, + "loss": 1.8891, + "step": 23376 + }, + { + "epoch": 1.5689070836549108, + "grad_norm": 4.329135417938232, + "learning_rate": 1.1692344826019835e-05, + "loss": 1.8759, + "step": 23378 + }, + { + "epoch": 1.5690413073386797, + "grad_norm": 4.237200736999512, + "learning_rate": 1.168536112599879e-05, + "loss": 2.0932, + "step": 23380 + }, + { + "epoch": 1.5691755310224489, + "grad_norm": 3.7816572189331055, + "learning_rate": 1.1678379236269082e-05, + "loss": 1.9413, + "step": 23382 + }, + { + "epoch": 1.569309754706218, + "grad_norm": 4.147528171539307, + "learning_rate": 1.1671399157160589e-05, + "loss": 1.6586, + "step": 23384 + }, + { + "epoch": 1.569443978389987, + "grad_norm": 4.661153793334961, + "learning_rate": 1.1664420889003131e-05, + "loss": 1.8891, + "step": 23386 + }, + { + "epoch": 1.5695782020737559, + "grad_norm": 4.057305812835693, + "learning_rate": 1.1657444432126403e-05, + "loss": 1.7466, + "step": 23388 + }, + { + "epoch": 1.5697124257575248, + "grad_norm": 4.733467102050781, + "learning_rate": 1.1650469786860025e-05, + "loss": 2.1632, + "step": 23390 + }, + { + "epoch": 1.569846649441294, + "grad_norm": 3.8147571086883545, + "learning_rate": 1.1643496953533522e-05, + "loss": 1.8075, + "step": 23392 + }, + { + "epoch": 1.569980873125063, + "grad_norm": 4.06930685043335, + "learning_rate": 1.1636525932476382e-05, + "loss": 1.7234, + "step": 23394 + }, + { + "epoch": 1.570115096808832, + "grad_norm": 4.135396480560303, + "learning_rate": 1.1629556724017936e-05, + "loss": 1.788, + "step": 23396 + }, + { + "epoch": 1.570249320492601, + "grad_norm": 3.941044330596924, + "learning_rate": 1.1622589328487504e-05, + "loss": 2.0207, + "step": 23398 + }, + { + "epoch": 1.5703835441763698, + "grad_norm": 4.668511390686035, + "learning_rate": 1.1615623746214255e-05, + "loss": 1.9714, + "step": 23400 + }, + { + "epoch": 1.570517767860139, + "grad_norm": 3.9432175159454346, + "learning_rate": 1.1608659977527303e-05, + "loss": 1.8558, + "step": 23402 + }, + { + "epoch": 1.5706519915439079, + "grad_norm": 3.9416191577911377, + "learning_rate": 1.1601698022755658e-05, + "loss": 1.9543, + "step": 23404 + }, + { + "epoch": 1.570786215227677, + "grad_norm": 5.465460300445557, + "learning_rate": 1.159473788222829e-05, + "loss": 1.874, + "step": 23406 + }, + { + "epoch": 1.570920438911446, + "grad_norm": 3.9924464225769043, + "learning_rate": 1.1587779556274042e-05, + "loss": 1.9487, + "step": 23408 + }, + { + "epoch": 1.5710546625952149, + "grad_norm": 3.971221685409546, + "learning_rate": 1.1580823045221673e-05, + "loss": 1.8417, + "step": 23410 + }, + { + "epoch": 1.5711888862789838, + "grad_norm": 3.802501916885376, + "learning_rate": 1.1573868349399848e-05, + "loss": 1.7976, + "step": 23412 + }, + { + "epoch": 1.571323109962753, + "grad_norm": 4.195688247680664, + "learning_rate": 1.1566915469137201e-05, + "loss": 1.9534, + "step": 23414 + }, + { + "epoch": 1.571457333646522, + "grad_norm": 4.7306623458862305, + "learning_rate": 1.155996440476222e-05, + "loss": 1.9587, + "step": 23416 + }, + { + "epoch": 1.571591557330291, + "grad_norm": 3.7951598167419434, + "learning_rate": 1.1553015156603331e-05, + "loss": 1.8433, + "step": 23418 + }, + { + "epoch": 1.57172578101406, + "grad_norm": 4.080079078674316, + "learning_rate": 1.1546067724988873e-05, + "loss": 1.7273, + "step": 23420 + }, + { + "epoch": 1.5718600046978288, + "grad_norm": 3.7187633514404297, + "learning_rate": 1.1539122110247086e-05, + "loss": 1.8303, + "step": 23422 + }, + { + "epoch": 1.571994228381598, + "grad_norm": 4.519037246704102, + "learning_rate": 1.1532178312706166e-05, + "loss": 1.979, + "step": 23424 + }, + { + "epoch": 1.5721284520653669, + "grad_norm": 4.146815776824951, + "learning_rate": 1.1525236332694162e-05, + "loss": 2.1418, + "step": 23426 + }, + { + "epoch": 1.572262675749136, + "grad_norm": 3.882922887802124, + "learning_rate": 1.1518296170539105e-05, + "loss": 1.6838, + "step": 23428 + }, + { + "epoch": 1.572396899432905, + "grad_norm": 3.825730085372925, + "learning_rate": 1.1511357826568864e-05, + "loss": 1.8852, + "step": 23430 + }, + { + "epoch": 1.5725311231166739, + "grad_norm": 3.972957134246826, + "learning_rate": 1.150442130111129e-05, + "loss": 2.1476, + "step": 23432 + }, + { + "epoch": 1.5726653468004428, + "grad_norm": 5.6631035804748535, + "learning_rate": 1.14974865944941e-05, + "loss": 1.7945, + "step": 23434 + }, + { + "epoch": 1.572799570484212, + "grad_norm": 4.306706428527832, + "learning_rate": 1.1490553707044965e-05, + "loss": 2.1308, + "step": 23436 + }, + { + "epoch": 1.572933794167981, + "grad_norm": 3.8370678424835205, + "learning_rate": 1.1483622639091446e-05, + "loss": 2.0095, + "step": 23438 + }, + { + "epoch": 1.57306801785175, + "grad_norm": 4.340967178344727, + "learning_rate": 1.1476693390961019e-05, + "loss": 1.7249, + "step": 23440 + }, + { + "epoch": 1.573202241535519, + "grad_norm": 3.936533212661743, + "learning_rate": 1.146976596298106e-05, + "loss": 1.8208, + "step": 23442 + }, + { + "epoch": 1.5733364652192878, + "grad_norm": 4.117605209350586, + "learning_rate": 1.146284035547891e-05, + "loss": 1.8023, + "step": 23444 + }, + { + "epoch": 1.573470688903057, + "grad_norm": 3.2135074138641357, + "learning_rate": 1.1455916568781772e-05, + "loss": 1.7147, + "step": 23446 + }, + { + "epoch": 1.573604912586826, + "grad_norm": 4.686895847320557, + "learning_rate": 1.1448994603216779e-05, + "loss": 1.923, + "step": 23448 + }, + { + "epoch": 1.573739136270595, + "grad_norm": 4.529566764831543, + "learning_rate": 1.1442074459110974e-05, + "loss": 1.9109, + "step": 23450 + }, + { + "epoch": 1.573873359954364, + "grad_norm": 6.023700714111328, + "learning_rate": 1.1435156136791342e-05, + "loss": 2.0885, + "step": 23452 + }, + { + "epoch": 1.5740075836381329, + "grad_norm": 4.9273762702941895, + "learning_rate": 1.1428239636584732e-05, + "loss": 2.0789, + "step": 23454 + }, + { + "epoch": 1.5741418073219018, + "grad_norm": 4.229285717010498, + "learning_rate": 1.142132495881798e-05, + "loss": 1.7565, + "step": 23456 + }, + { + "epoch": 1.574276031005671, + "grad_norm": 4.630785942077637, + "learning_rate": 1.141441210381774e-05, + "loss": 2.0817, + "step": 23458 + }, + { + "epoch": 1.57441025468944, + "grad_norm": 3.624598979949951, + "learning_rate": 1.140750107191066e-05, + "loss": 1.7317, + "step": 23460 + }, + { + "epoch": 1.574544478373209, + "grad_norm": 3.7568106651306152, + "learning_rate": 1.1400591863423254e-05, + "loss": 2.0434, + "step": 23462 + }, + { + "epoch": 1.574678702056978, + "grad_norm": 4.227254867553711, + "learning_rate": 1.1393684478681993e-05, + "loss": 1.9359, + "step": 23464 + }, + { + "epoch": 1.5748129257407468, + "grad_norm": 4.524445056915283, + "learning_rate": 1.1386778918013225e-05, + "loss": 2.0853, + "step": 23466 + }, + { + "epoch": 1.574947149424516, + "grad_norm": 4.352471828460693, + "learning_rate": 1.1379875181743222e-05, + "loss": 1.9169, + "step": 23468 + }, + { + "epoch": 1.575081373108285, + "grad_norm": 4.689810752868652, + "learning_rate": 1.1372973270198163e-05, + "loss": 2.0178, + "step": 23470 + }, + { + "epoch": 1.575215596792054, + "grad_norm": 4.173937797546387, + "learning_rate": 1.1366073183704167e-05, + "loss": 1.957, + "step": 23472 + }, + { + "epoch": 1.575349820475823, + "grad_norm": 4.616366386413574, + "learning_rate": 1.1359174922587251e-05, + "loss": 2.1359, + "step": 23474 + }, + { + "epoch": 1.5754840441595919, + "grad_norm": 4.4599609375, + "learning_rate": 1.1352278487173328e-05, + "loss": 1.7574, + "step": 23476 + }, + { + "epoch": 1.575618267843361, + "grad_norm": 6.084953784942627, + "learning_rate": 1.1345383877788241e-05, + "loss": 1.8223, + "step": 23478 + }, + { + "epoch": 1.57575249152713, + "grad_norm": 4.399728298187256, + "learning_rate": 1.133849109475777e-05, + "loss": 2.018, + "step": 23480 + }, + { + "epoch": 1.575886715210899, + "grad_norm": 4.529194355010986, + "learning_rate": 1.133160013840755e-05, + "loss": 1.8445, + "step": 23482 + }, + { + "epoch": 1.576020938894668, + "grad_norm": 3.621826410293579, + "learning_rate": 1.132471100906322e-05, + "loss": 1.7986, + "step": 23484 + }, + { + "epoch": 1.576155162578437, + "grad_norm": 4.666601181030273, + "learning_rate": 1.1317823707050224e-05, + "loss": 1.9953, + "step": 23486 + }, + { + "epoch": 1.5762893862622058, + "grad_norm": 4.127919673919678, + "learning_rate": 1.131093823269398e-05, + "loss": 1.8376, + "step": 23488 + }, + { + "epoch": 1.576423609945975, + "grad_norm": 5.228948593139648, + "learning_rate": 1.130405458631984e-05, + "loss": 1.7345, + "step": 23490 + }, + { + "epoch": 1.576557833629744, + "grad_norm": 4.259840965270996, + "learning_rate": 1.1297172768253018e-05, + "loss": 1.8812, + "step": 23492 + }, + { + "epoch": 1.576692057313513, + "grad_norm": 4.434686183929443, + "learning_rate": 1.1290292778818684e-05, + "loss": 1.8938, + "step": 23494 + }, + { + "epoch": 1.576826280997282, + "grad_norm": 4.42847204208374, + "learning_rate": 1.1283414618341903e-05, + "loss": 2.1476, + "step": 23496 + }, + { + "epoch": 1.5769605046810509, + "grad_norm": 4.545740127563477, + "learning_rate": 1.1276538287147642e-05, + "loss": 1.921, + "step": 23498 + }, + { + "epoch": 1.57709472836482, + "grad_norm": 4.6752519607543945, + "learning_rate": 1.1269663785560792e-05, + "loss": 2.0937, + "step": 23500 + }, + { + "epoch": 1.577228952048589, + "grad_norm": 4.301072120666504, + "learning_rate": 1.1262791113906179e-05, + "loss": 2.1332, + "step": 23502 + }, + { + "epoch": 1.577363175732358, + "grad_norm": 3.7012457847595215, + "learning_rate": 1.1255920272508519e-05, + "loss": 1.6637, + "step": 23504 + }, + { + "epoch": 1.577497399416127, + "grad_norm": 4.369075298309326, + "learning_rate": 1.1249051261692433e-05, + "loss": 2.0804, + "step": 23506 + }, + { + "epoch": 1.577631623099896, + "grad_norm": 4.473456382751465, + "learning_rate": 1.124218408178246e-05, + "loss": 1.8923, + "step": 23508 + }, + { + "epoch": 1.5777658467836648, + "grad_norm": 4.084301471710205, + "learning_rate": 1.1235318733103089e-05, + "loss": 2.1897, + "step": 23510 + }, + { + "epoch": 1.577900070467434, + "grad_norm": 4.312085151672363, + "learning_rate": 1.1228455215978683e-05, + "loss": 1.7621, + "step": 23512 + }, + { + "epoch": 1.578034294151203, + "grad_norm": 3.3202719688415527, + "learning_rate": 1.1221593530733527e-05, + "loss": 1.8583, + "step": 23514 + }, + { + "epoch": 1.578168517834972, + "grad_norm": 4.062694549560547, + "learning_rate": 1.1214733677691808e-05, + "loss": 2.0042, + "step": 23516 + }, + { + "epoch": 1.578302741518741, + "grad_norm": 3.994215726852417, + "learning_rate": 1.1207875657177663e-05, + "loss": 1.8149, + "step": 23518 + }, + { + "epoch": 1.5784369652025099, + "grad_norm": 5.491581439971924, + "learning_rate": 1.1201019469515106e-05, + "loss": 1.991, + "step": 23520 + }, + { + "epoch": 1.578571188886279, + "grad_norm": 3.4579226970672607, + "learning_rate": 1.1194165115028094e-05, + "loss": 1.8006, + "step": 23522 + }, + { + "epoch": 1.5787054125700481, + "grad_norm": 4.909080982208252, + "learning_rate": 1.1187312594040472e-05, + "loss": 2.0077, + "step": 23524 + }, + { + "epoch": 1.578839636253817, + "grad_norm": 3.541472911834717, + "learning_rate": 1.118046190687601e-05, + "loss": 1.8799, + "step": 23526 + }, + { + "epoch": 1.578973859937586, + "grad_norm": 4.600574970245361, + "learning_rate": 1.1173613053858373e-05, + "loss": 2.3323, + "step": 23528 + }, + { + "epoch": 1.579108083621355, + "grad_norm": 3.948371648788452, + "learning_rate": 1.1166766035311182e-05, + "loss": 1.8601, + "step": 23530 + }, + { + "epoch": 1.5792423073051238, + "grad_norm": 4.076717853546143, + "learning_rate": 1.1159920851557937e-05, + "loss": 1.8759, + "step": 23532 + }, + { + "epoch": 1.579376530988893, + "grad_norm": 4.337462902069092, + "learning_rate": 1.1153077502922055e-05, + "loss": 1.9786, + "step": 23534 + }, + { + "epoch": 1.579510754672662, + "grad_norm": 4.308121204376221, + "learning_rate": 1.1146235989726856e-05, + "loss": 2.0233, + "step": 23536 + }, + { + "epoch": 1.579644978356431, + "grad_norm": 4.223680019378662, + "learning_rate": 1.113939631229562e-05, + "loss": 1.9114, + "step": 23538 + }, + { + "epoch": 1.5797792020402, + "grad_norm": 4.111114025115967, + "learning_rate": 1.1132558470951487e-05, + "loss": 2.1732, + "step": 23540 + }, + { + "epoch": 1.5799134257239689, + "grad_norm": 4.326385498046875, + "learning_rate": 1.1125722466017547e-05, + "loss": 1.9654, + "step": 23542 + }, + { + "epoch": 1.580047649407738, + "grad_norm": 4.645786762237549, + "learning_rate": 1.1118888297816754e-05, + "loss": 1.9643, + "step": 23544 + }, + { + "epoch": 1.5801818730915071, + "grad_norm": 4.292751789093018, + "learning_rate": 1.1112055966672053e-05, + "loss": 1.8193, + "step": 23546 + }, + { + "epoch": 1.580316096775276, + "grad_norm": 3.8404905796051025, + "learning_rate": 1.1105225472906216e-05, + "loss": 1.7837, + "step": 23548 + }, + { + "epoch": 1.580450320459045, + "grad_norm": 3.422560930252075, + "learning_rate": 1.109839681684201e-05, + "loss": 1.9549, + "step": 23550 + }, + { + "epoch": 1.580584544142814, + "grad_norm": 3.8419318199157715, + "learning_rate": 1.1091569998802075e-05, + "loss": 1.7547, + "step": 23552 + }, + { + "epoch": 1.580718767826583, + "grad_norm": 3.785710334777832, + "learning_rate": 1.1084745019108916e-05, + "loss": 1.724, + "step": 23554 + }, + { + "epoch": 1.580852991510352, + "grad_norm": 4.294137001037598, + "learning_rate": 1.1077921878085047e-05, + "loss": 1.9195, + "step": 23556 + }, + { + "epoch": 1.580987215194121, + "grad_norm": 4.138665199279785, + "learning_rate": 1.1071100576052818e-05, + "loss": 2.0119, + "step": 23558 + }, + { + "epoch": 1.58112143887789, + "grad_norm": 4.164572238922119, + "learning_rate": 1.1064281113334546e-05, + "loss": 1.8614, + "step": 23560 + }, + { + "epoch": 1.581255662561659, + "grad_norm": 4.1823296546936035, + "learning_rate": 1.1057463490252434e-05, + "loss": 2.1698, + "step": 23562 + }, + { + "epoch": 1.5813898862454279, + "grad_norm": 4.457241058349609, + "learning_rate": 1.1050647707128592e-05, + "loss": 2.1885, + "step": 23564 + }, + { + "epoch": 1.581524109929197, + "grad_norm": 3.7043237686157227, + "learning_rate": 1.104383376428504e-05, + "loss": 1.614, + "step": 23566 + }, + { + "epoch": 1.5816583336129661, + "grad_norm": 4.029632091522217, + "learning_rate": 1.1037021662043756e-05, + "loss": 1.9829, + "step": 23568 + }, + { + "epoch": 1.581792557296735, + "grad_norm": 4.552183628082275, + "learning_rate": 1.1030211400726576e-05, + "loss": 1.9551, + "step": 23570 + }, + { + "epoch": 1.581926780980504, + "grad_norm": 3.8705976009368896, + "learning_rate": 1.1023402980655279e-05, + "loss": 1.8874, + "step": 23572 + }, + { + "epoch": 1.582061004664273, + "grad_norm": 4.213149547576904, + "learning_rate": 1.1016596402151536e-05, + "loss": 2.1478, + "step": 23574 + }, + { + "epoch": 1.582195228348042, + "grad_norm": 3.84010648727417, + "learning_rate": 1.1009791665536968e-05, + "loss": 2.1568, + "step": 23576 + }, + { + "epoch": 1.582329452031811, + "grad_norm": 11.719099998474121, + "learning_rate": 1.1002988771133054e-05, + "loss": 1.8535, + "step": 23578 + }, + { + "epoch": 1.58246367571558, + "grad_norm": 5.520688056945801, + "learning_rate": 1.0996187719261269e-05, + "loss": 2.0027, + "step": 23580 + }, + { + "epoch": 1.582597899399349, + "grad_norm": 4.167609691619873, + "learning_rate": 1.0989388510242887e-05, + "loss": 1.7251, + "step": 23582 + }, + { + "epoch": 1.582732123083118, + "grad_norm": 4.290101528167725, + "learning_rate": 1.0982591144399201e-05, + "loss": 1.7846, + "step": 23584 + }, + { + "epoch": 1.5828663467668869, + "grad_norm": 4.228955268859863, + "learning_rate": 1.097579562205135e-05, + "loss": 2.1702, + "step": 23586 + }, + { + "epoch": 1.583000570450656, + "grad_norm": 4.276394844055176, + "learning_rate": 1.0969001943520424e-05, + "loss": 1.7984, + "step": 23588 + }, + { + "epoch": 1.5831347941344251, + "grad_norm": 4.602935314178467, + "learning_rate": 1.0962210109127407e-05, + "loss": 1.8766, + "step": 23590 + }, + { + "epoch": 1.583269017818194, + "grad_norm": 4.1463623046875, + "learning_rate": 1.0955420119193199e-05, + "loss": 1.7806, + "step": 23592 + }, + { + "epoch": 1.583403241501963, + "grad_norm": 3.3726398944854736, + "learning_rate": 1.0948631974038604e-05, + "loss": 1.5564, + "step": 23594 + }, + { + "epoch": 1.583537465185732, + "grad_norm": 4.433759689331055, + "learning_rate": 1.0941845673984364e-05, + "loss": 1.9457, + "step": 23596 + }, + { + "epoch": 1.583671688869501, + "grad_norm": 3.963810443878174, + "learning_rate": 1.0935061219351111e-05, + "loss": 1.8187, + "step": 23598 + }, + { + "epoch": 1.5838059125532702, + "grad_norm": 3.525373697280884, + "learning_rate": 1.0928278610459398e-05, + "loss": 1.6521, + "step": 23600 + }, + { + "epoch": 1.583940136237039, + "grad_norm": 4.323057174682617, + "learning_rate": 1.092149784762968e-05, + "loss": 1.9559, + "step": 23602 + }, + { + "epoch": 1.584074359920808, + "grad_norm": 3.5661473274230957, + "learning_rate": 1.0914718931182355e-05, + "loss": 1.7319, + "step": 23604 + }, + { + "epoch": 1.584208583604577, + "grad_norm": 4.463588714599609, + "learning_rate": 1.0907941861437688e-05, + "loss": 1.9616, + "step": 23606 + }, + { + "epoch": 1.5843428072883459, + "grad_norm": 4.653008460998535, + "learning_rate": 1.0901166638715926e-05, + "loss": 2.1211, + "step": 23608 + }, + { + "epoch": 1.584477030972115, + "grad_norm": 4.107627868652344, + "learning_rate": 1.0894393263337127e-05, + "loss": 2.1237, + "step": 23610 + }, + { + "epoch": 1.5846112546558841, + "grad_norm": 3.961113214492798, + "learning_rate": 1.0887621735621362e-05, + "loss": 1.7863, + "step": 23612 + }, + { + "epoch": 1.584745478339653, + "grad_norm": 3.853546142578125, + "learning_rate": 1.0880852055888547e-05, + "loss": 1.7541, + "step": 23614 + }, + { + "epoch": 1.584879702023422, + "grad_norm": 3.912698745727539, + "learning_rate": 1.0874084224458558e-05, + "loss": 1.7908, + "step": 23616 + }, + { + "epoch": 1.585013925707191, + "grad_norm": 3.767315149307251, + "learning_rate": 1.0867318241651154e-05, + "loss": 1.9361, + "step": 23618 + }, + { + "epoch": 1.58514814939096, + "grad_norm": 4.0023298263549805, + "learning_rate": 1.0860554107786015e-05, + "loss": 2.088, + "step": 23620 + }, + { + "epoch": 1.5852823730747292, + "grad_norm": 4.4465484619140625, + "learning_rate": 1.0853791823182723e-05, + "loss": 1.7462, + "step": 23622 + }, + { + "epoch": 1.585416596758498, + "grad_norm": 4.335114479064941, + "learning_rate": 1.0847031388160777e-05, + "loss": 1.9312, + "step": 23624 + }, + { + "epoch": 1.585550820442267, + "grad_norm": 4.794244766235352, + "learning_rate": 1.0840272803039625e-05, + "loss": 1.9273, + "step": 23626 + }, + { + "epoch": 1.585685044126036, + "grad_norm": 3.5601611137390137, + "learning_rate": 1.0833516068138577e-05, + "loss": 1.4924, + "step": 23628 + }, + { + "epoch": 1.585819267809805, + "grad_norm": 3.829209566116333, + "learning_rate": 1.0826761183776879e-05, + "loss": 1.8227, + "step": 23630 + }, + { + "epoch": 1.585953491493574, + "grad_norm": 4.620016098022461, + "learning_rate": 1.0820008150273669e-05, + "loss": 1.9649, + "step": 23632 + }, + { + "epoch": 1.5860877151773431, + "grad_norm": 4.141509532928467, + "learning_rate": 1.0813256967948044e-05, + "loss": 1.9455, + "step": 23634 + }, + { + "epoch": 1.586221938861112, + "grad_norm": 4.107699394226074, + "learning_rate": 1.0806507637118967e-05, + "loss": 1.9496, + "step": 23636 + }, + { + "epoch": 1.586356162544881, + "grad_norm": 3.985492467880249, + "learning_rate": 1.0799760158105337e-05, + "loss": 2.0296, + "step": 23638 + }, + { + "epoch": 1.58649038622865, + "grad_norm": 4.053755760192871, + "learning_rate": 1.0793014531225937e-05, + "loss": 1.7948, + "step": 23640 + }, + { + "epoch": 1.586624609912419, + "grad_norm": 3.960195541381836, + "learning_rate": 1.0786270756799522e-05, + "loss": 1.9001, + "step": 23642 + }, + { + "epoch": 1.5867588335961882, + "grad_norm": 4.4425048828125, + "learning_rate": 1.0779528835144686e-05, + "loss": 1.8579, + "step": 23644 + }, + { + "epoch": 1.586893057279957, + "grad_norm": 3.6443419456481934, + "learning_rate": 1.0772788766580022e-05, + "loss": 1.6204, + "step": 23646 + }, + { + "epoch": 1.587027280963726, + "grad_norm": 3.7707409858703613, + "learning_rate": 1.0766050551423917e-05, + "loss": 2.0012, + "step": 23648 + }, + { + "epoch": 1.587161504647495, + "grad_norm": 4.0009541511535645, + "learning_rate": 1.075931418999479e-05, + "loss": 1.9436, + "step": 23650 + }, + { + "epoch": 1.587295728331264, + "grad_norm": 3.5363852977752686, + "learning_rate": 1.0752579682610891e-05, + "loss": 1.7389, + "step": 23652 + }, + { + "epoch": 1.587429952015033, + "grad_norm": 4.5753865242004395, + "learning_rate": 1.0745847029590439e-05, + "loss": 2.068, + "step": 23654 + }, + { + "epoch": 1.5875641756988021, + "grad_norm": 3.908095359802246, + "learning_rate": 1.0739116231251523e-05, + "loss": 1.9202, + "step": 23656 + }, + { + "epoch": 1.587698399382571, + "grad_norm": 4.053347110748291, + "learning_rate": 1.0732387287912166e-05, + "loss": 1.9209, + "step": 23658 + }, + { + "epoch": 1.58783262306634, + "grad_norm": 4.375077724456787, + "learning_rate": 1.0725660199890275e-05, + "loss": 1.9515, + "step": 23660 + }, + { + "epoch": 1.587966846750109, + "grad_norm": 4.007272720336914, + "learning_rate": 1.0718934967503724e-05, + "loss": 1.9921, + "step": 23662 + }, + { + "epoch": 1.588101070433878, + "grad_norm": 4.218266010284424, + "learning_rate": 1.0712211591070254e-05, + "loss": 1.8516, + "step": 23664 + }, + { + "epoch": 1.5882352941176472, + "grad_norm": 4.365920066833496, + "learning_rate": 1.070549007090753e-05, + "loss": 1.9471, + "step": 23666 + }, + { + "epoch": 1.588369517801416, + "grad_norm": 4.071874618530273, + "learning_rate": 1.069877040733312e-05, + "loss": 1.884, + "step": 23668 + }, + { + "epoch": 1.588503741485185, + "grad_norm": 4.333788871765137, + "learning_rate": 1.0692052600664537e-05, + "loss": 1.8674, + "step": 23670 + }, + { + "epoch": 1.588637965168954, + "grad_norm": 4.343796730041504, + "learning_rate": 1.0685336651219158e-05, + "loss": 1.8778, + "step": 23672 + }, + { + "epoch": 1.588772188852723, + "grad_norm": 4.258458137512207, + "learning_rate": 1.0678622559314344e-05, + "loss": 1.918, + "step": 23674 + }, + { + "epoch": 1.5889064125364922, + "grad_norm": 4.487898826599121, + "learning_rate": 1.067191032526726e-05, + "loss": 1.9066, + "step": 23676 + }, + { + "epoch": 1.5890406362202611, + "grad_norm": 3.9972565174102783, + "learning_rate": 1.0665199949395093e-05, + "loss": 1.8929, + "step": 23678 + }, + { + "epoch": 1.58917485990403, + "grad_norm": 4.24595832824707, + "learning_rate": 1.0658491432014867e-05, + "loss": 2.008, + "step": 23680 + }, + { + "epoch": 1.589309083587799, + "grad_norm": 3.629357099533081, + "learning_rate": 1.0651784773443573e-05, + "loss": 1.7863, + "step": 23682 + }, + { + "epoch": 1.589443307271568, + "grad_norm": 3.912019729614258, + "learning_rate": 1.0645079973998073e-05, + "loss": 1.9108, + "step": 23684 + }, + { + "epoch": 1.589577530955337, + "grad_norm": 4.218672275543213, + "learning_rate": 1.0638377033995156e-05, + "loss": 1.8492, + "step": 23686 + }, + { + "epoch": 1.5897117546391062, + "grad_norm": 5.219738960266113, + "learning_rate": 1.0631675953751508e-05, + "loss": 2.0282, + "step": 23688 + }, + { + "epoch": 1.589845978322875, + "grad_norm": 4.719354152679443, + "learning_rate": 1.0624976733583775e-05, + "loss": 2.023, + "step": 23690 + }, + { + "epoch": 1.589980202006644, + "grad_norm": 3.7436368465423584, + "learning_rate": 1.0618279373808459e-05, + "loss": 1.8486, + "step": 23692 + }, + { + "epoch": 1.590114425690413, + "grad_norm": 4.142632484436035, + "learning_rate": 1.0611583874742004e-05, + "loss": 2.0962, + "step": 23694 + }, + { + "epoch": 1.590248649374182, + "grad_norm": 4.583037853240967, + "learning_rate": 1.0604890236700753e-05, + "loss": 1.9181, + "step": 23696 + }, + { + "epoch": 1.5903828730579512, + "grad_norm": 4.35253381729126, + "learning_rate": 1.0598198460000963e-05, + "loss": 1.9572, + "step": 23698 + }, + { + "epoch": 1.5905170967417201, + "grad_norm": 4.086130142211914, + "learning_rate": 1.0591508544958823e-05, + "loss": 1.7327, + "step": 23700 + }, + { + "epoch": 1.590651320425489, + "grad_norm": 4.106801986694336, + "learning_rate": 1.0584820491890402e-05, + "loss": 1.8728, + "step": 23702 + }, + { + "epoch": 1.590785544109258, + "grad_norm": 3.566444158554077, + "learning_rate": 1.0578134301111731e-05, + "loss": 1.8511, + "step": 23704 + }, + { + "epoch": 1.5909197677930271, + "grad_norm": 4.528006076812744, + "learning_rate": 1.0571449972938668e-05, + "loss": 1.8379, + "step": 23706 + }, + { + "epoch": 1.591053991476796, + "grad_norm": 4.356319904327393, + "learning_rate": 1.0564767507687078e-05, + "loss": 2.0044, + "step": 23708 + }, + { + "epoch": 1.5911882151605652, + "grad_norm": 3.8227555751800537, + "learning_rate": 1.055808690567266e-05, + "loss": 1.9156, + "step": 23710 + }, + { + "epoch": 1.591322438844334, + "grad_norm": 3.894857406616211, + "learning_rate": 1.055140816721109e-05, + "loss": 1.7348, + "step": 23712 + }, + { + "epoch": 1.591456662528103, + "grad_norm": 4.4437079429626465, + "learning_rate": 1.054473129261791e-05, + "loss": 1.8229, + "step": 23714 + }, + { + "epoch": 1.591590886211872, + "grad_norm": 4.443861484527588, + "learning_rate": 1.0538056282208598e-05, + "loss": 1.8276, + "step": 23716 + }, + { + "epoch": 1.591725109895641, + "grad_norm": 4.165951728820801, + "learning_rate": 1.0531383136298507e-05, + "loss": 2.0211, + "step": 23718 + }, + { + "epoch": 1.5918593335794102, + "grad_norm": 3.9094364643096924, + "learning_rate": 1.0524711855202967e-05, + "loss": 2.1525, + "step": 23720 + }, + { + "epoch": 1.5919935572631791, + "grad_norm": 4.3602213859558105, + "learning_rate": 1.051804243923717e-05, + "loss": 1.8903, + "step": 23722 + }, + { + "epoch": 1.592127780946948, + "grad_norm": 4.040722846984863, + "learning_rate": 1.0511374888716224e-05, + "loss": 2.1183, + "step": 23724 + }, + { + "epoch": 1.592262004630717, + "grad_norm": 3.4310965538024902, + "learning_rate": 1.0504709203955155e-05, + "loss": 1.6993, + "step": 23726 + }, + { + "epoch": 1.5923962283144861, + "grad_norm": 4.152184009552002, + "learning_rate": 1.0498045385268923e-05, + "loss": 1.9313, + "step": 23728 + }, + { + "epoch": 1.592530451998255, + "grad_norm": 4.121078968048096, + "learning_rate": 1.0491383432972357e-05, + "loss": 1.7576, + "step": 23730 + }, + { + "epoch": 1.5926646756820242, + "grad_norm": 4.15501070022583, + "learning_rate": 1.048472334738026e-05, + "loss": 1.9668, + "step": 23732 + }, + { + "epoch": 1.592798899365793, + "grad_norm": 4.201764106750488, + "learning_rate": 1.0478065128807251e-05, + "loss": 2.2606, + "step": 23734 + }, + { + "epoch": 1.592933123049562, + "grad_norm": 3.5176827907562256, + "learning_rate": 1.0471408777567965e-05, + "loss": 1.7988, + "step": 23736 + }, + { + "epoch": 1.593067346733331, + "grad_norm": 4.112334251403809, + "learning_rate": 1.0464754293976875e-05, + "loss": 1.908, + "step": 23738 + }, + { + "epoch": 1.5932015704171, + "grad_norm": 4.216094017028809, + "learning_rate": 1.045810167834841e-05, + "loss": 2.1459, + "step": 23740 + }, + { + "epoch": 1.5933357941008692, + "grad_norm": 4.196349143981934, + "learning_rate": 1.0451450930996887e-05, + "loss": 1.8973, + "step": 23742 + }, + { + "epoch": 1.5934700177846381, + "grad_norm": 3.4886634349823, + "learning_rate": 1.0444802052236535e-05, + "loss": 1.7315, + "step": 23744 + }, + { + "epoch": 1.593604241468407, + "grad_norm": 4.654811382293701, + "learning_rate": 1.0438155042381491e-05, + "loss": 1.801, + "step": 23746 + }, + { + "epoch": 1.593738465152176, + "grad_norm": 3.517176628112793, + "learning_rate": 1.0431509901745846e-05, + "loss": 1.8183, + "step": 23748 + }, + { + "epoch": 1.5938726888359451, + "grad_norm": 4.367359161376953, + "learning_rate": 1.0424866630643542e-05, + "loss": 1.7139, + "step": 23750 + }, + { + "epoch": 1.5940069125197143, + "grad_norm": 4.536531925201416, + "learning_rate": 1.0418225229388479e-05, + "loss": 1.831, + "step": 23752 + }, + { + "epoch": 1.5941411362034832, + "grad_norm": 4.285284519195557, + "learning_rate": 1.0411585698294418e-05, + "loss": 1.7856, + "step": 23754 + }, + { + "epoch": 1.594275359887252, + "grad_norm": 4.429383277893066, + "learning_rate": 1.04049480376751e-05, + "loss": 1.9244, + "step": 23756 + }, + { + "epoch": 1.594409583571021, + "grad_norm": 4.535050868988037, + "learning_rate": 1.0398312247844127e-05, + "loss": 2.138, + "step": 23758 + }, + { + "epoch": 1.59454380725479, + "grad_norm": 4.367264270782471, + "learning_rate": 1.0391678329115028e-05, + "loss": 1.8425, + "step": 23760 + }, + { + "epoch": 1.594678030938559, + "grad_norm": 4.048778533935547, + "learning_rate": 1.0385046281801242e-05, + "loss": 1.969, + "step": 23762 + }, + { + "epoch": 1.5948122546223282, + "grad_norm": 4.0553717613220215, + "learning_rate": 1.0378416106216105e-05, + "loss": 1.9978, + "step": 23764 + }, + { + "epoch": 1.5949464783060971, + "grad_norm": 4.01049280166626, + "learning_rate": 1.0371787802672906e-05, + "loss": 1.8257, + "step": 23766 + }, + { + "epoch": 1.595080701989866, + "grad_norm": 3.986743450164795, + "learning_rate": 1.03651613714848e-05, + "loss": 2.0561, + "step": 23768 + }, + { + "epoch": 1.595214925673635, + "grad_norm": 4.3107123374938965, + "learning_rate": 1.0358536812964904e-05, + "loss": 1.7877, + "step": 23770 + }, + { + "epoch": 1.5953491493574041, + "grad_norm": 4.255932331085205, + "learning_rate": 1.0351914127426166e-05, + "loss": 1.6876, + "step": 23772 + }, + { + "epoch": 1.5954833730411733, + "grad_norm": 4.1478271484375, + "learning_rate": 1.034529331518153e-05, + "loss": 1.9445, + "step": 23774 + }, + { + "epoch": 1.5956175967249422, + "grad_norm": 4.037036418914795, + "learning_rate": 1.03386743765438e-05, + "loss": 1.9333, + "step": 23776 + }, + { + "epoch": 1.595751820408711, + "grad_norm": 4.396235942840576, + "learning_rate": 1.0332057311825726e-05, + "loss": 1.8403, + "step": 23778 + }, + { + "epoch": 1.59588604409248, + "grad_norm": 4.1749162673950195, + "learning_rate": 1.032544212133994e-05, + "loss": 1.8691, + "step": 23780 + }, + { + "epoch": 1.5960202677762492, + "grad_norm": 4.195956707000732, + "learning_rate": 1.0318828805398994e-05, + "loss": 2.2225, + "step": 23782 + }, + { + "epoch": 1.596154491460018, + "grad_norm": 4.044699192047119, + "learning_rate": 1.0312217364315351e-05, + "loss": 2.0744, + "step": 23784 + }, + { + "epoch": 1.5962887151437872, + "grad_norm": 4.356987476348877, + "learning_rate": 1.03056077984014e-05, + "loss": 2.2638, + "step": 23786 + }, + { + "epoch": 1.5964229388275561, + "grad_norm": 4.178799152374268, + "learning_rate": 1.029900010796943e-05, + "loss": 1.8194, + "step": 23788 + }, + { + "epoch": 1.596557162511325, + "grad_norm": 4.078964710235596, + "learning_rate": 1.0292394293331637e-05, + "loss": 1.8718, + "step": 23790 + }, + { + "epoch": 1.596691386195094, + "grad_norm": 3.5499815940856934, + "learning_rate": 1.0285790354800118e-05, + "loss": 1.7416, + "step": 23792 + }, + { + "epoch": 1.5968256098788631, + "grad_norm": 4.39594030380249, + "learning_rate": 1.027918829268692e-05, + "loss": 1.9405, + "step": 23794 + }, + { + "epoch": 1.5969598335626323, + "grad_norm": 3.6343472003936768, + "learning_rate": 1.027258810730396e-05, + "loss": 1.7332, + "step": 23796 + }, + { + "epoch": 1.5970940572464012, + "grad_norm": 4.514671325683594, + "learning_rate": 1.0265989798963116e-05, + "loss": 2.0378, + "step": 23798 + }, + { + "epoch": 1.59722828093017, + "grad_norm": 3.5611727237701416, + "learning_rate": 1.025939336797609e-05, + "loss": 1.8421, + "step": 23800 + }, + { + "epoch": 1.597362504613939, + "grad_norm": 4.130030632019043, + "learning_rate": 1.0252798814654597e-05, + "loss": 1.8869, + "step": 23802 + }, + { + "epoch": 1.5974967282977082, + "grad_norm": 3.816419839859009, + "learning_rate": 1.0246206139310187e-05, + "loss": 1.8896, + "step": 23804 + }, + { + "epoch": 1.597630951981477, + "grad_norm": 4.852032661437988, + "learning_rate": 1.0239615342254378e-05, + "loss": 1.9769, + "step": 23806 + }, + { + "epoch": 1.5977651756652462, + "grad_norm": 4.578916549682617, + "learning_rate": 1.0233026423798559e-05, + "loss": 2.0468, + "step": 23808 + }, + { + "epoch": 1.5978993993490151, + "grad_norm": 4.554512977600098, + "learning_rate": 1.0226439384254044e-05, + "loss": 2.1455, + "step": 23810 + }, + { + "epoch": 1.598033623032784, + "grad_norm": 4.600389003753662, + "learning_rate": 1.0219854223932041e-05, + "loss": 2.1647, + "step": 23812 + }, + { + "epoch": 1.598167846716553, + "grad_norm": 4.54832124710083, + "learning_rate": 1.0213270943143716e-05, + "loss": 1.7445, + "step": 23814 + }, + { + "epoch": 1.5983020704003221, + "grad_norm": 4.609463691711426, + "learning_rate": 1.02066895422001e-05, + "loss": 1.8435, + "step": 23816 + }, + { + "epoch": 1.5984362940840913, + "grad_norm": 4.270798683166504, + "learning_rate": 1.020011002141215e-05, + "loss": 2.0775, + "step": 23818 + }, + { + "epoch": 1.5985705177678602, + "grad_norm": 3.7038516998291016, + "learning_rate": 1.0193532381090732e-05, + "loss": 1.8139, + "step": 23820 + }, + { + "epoch": 1.598704741451629, + "grad_norm": 3.9762096405029297, + "learning_rate": 1.0186956621546645e-05, + "loss": 2.1906, + "step": 23822 + }, + { + "epoch": 1.598838965135398, + "grad_norm": 4.364926815032959, + "learning_rate": 1.0180382743090555e-05, + "loss": 1.862, + "step": 23824 + }, + { + "epoch": 1.5989731888191672, + "grad_norm": 3.7851641178131104, + "learning_rate": 1.0173810746033103e-05, + "loss": 1.8647, + "step": 23826 + }, + { + "epoch": 1.5991074125029363, + "grad_norm": 4.021846294403076, + "learning_rate": 1.0167240630684765e-05, + "loss": 2.0013, + "step": 23828 + }, + { + "epoch": 1.5992416361867052, + "grad_norm": 3.764777898788452, + "learning_rate": 1.0160672397355969e-05, + "loss": 1.9326, + "step": 23830 + }, + { + "epoch": 1.5993758598704741, + "grad_norm": 3.9732789993286133, + "learning_rate": 1.0154106046357071e-05, + "loss": 2.0089, + "step": 23832 + }, + { + "epoch": 1.599510083554243, + "grad_norm": 4.0858330726623535, + "learning_rate": 1.0147541577998298e-05, + "loss": 1.8704, + "step": 23834 + }, + { + "epoch": 1.599644307238012, + "grad_norm": 4.202672958374023, + "learning_rate": 1.0140978992589833e-05, + "loss": 1.6615, + "step": 23836 + }, + { + "epoch": 1.5997785309217811, + "grad_norm": 3.74397611618042, + "learning_rate": 1.0134418290441728e-05, + "loss": 1.9403, + "step": 23838 + }, + { + "epoch": 1.5999127546055503, + "grad_norm": 3.8985345363616943, + "learning_rate": 1.012785947186397e-05, + "loss": 1.909, + "step": 23840 + }, + { + "epoch": 1.6000469782893192, + "grad_norm": 4.250677108764648, + "learning_rate": 1.0121302537166433e-05, + "loss": 1.89, + "step": 23842 + }, + { + "epoch": 1.600181201973088, + "grad_norm": 4.507623672485352, + "learning_rate": 1.0114747486658943e-05, + "loss": 2.0643, + "step": 23844 + }, + { + "epoch": 1.600315425656857, + "grad_norm": 4.886037349700928, + "learning_rate": 1.0108194320651205e-05, + "loss": 2.0541, + "step": 23846 + }, + { + "epoch": 1.6004496493406262, + "grad_norm": 4.405424118041992, + "learning_rate": 1.010164303945284e-05, + "loss": 2.0206, + "step": 23848 + }, + { + "epoch": 1.6005838730243953, + "grad_norm": 4.119119644165039, + "learning_rate": 1.0095093643373377e-05, + "loss": 2.069, + "step": 23850 + }, + { + "epoch": 1.6007180967081642, + "grad_norm": 3.952784776687622, + "learning_rate": 1.0088546132722276e-05, + "loss": 2.2786, + "step": 23852 + }, + { + "epoch": 1.6008523203919331, + "grad_norm": 3.736757755279541, + "learning_rate": 1.0082000507808892e-05, + "loss": 2.001, + "step": 23854 + }, + { + "epoch": 1.600986544075702, + "grad_norm": 4.526878356933594, + "learning_rate": 1.0075456768942488e-05, + "loss": 2.147, + "step": 23856 + }, + { + "epoch": 1.6011207677594712, + "grad_norm": 4.685520172119141, + "learning_rate": 1.0068914916432231e-05, + "loss": 2.2398, + "step": 23858 + }, + { + "epoch": 1.6012549914432401, + "grad_norm": 3.8652637004852295, + "learning_rate": 1.0062374950587234e-05, + "loss": 1.9321, + "step": 23860 + }, + { + "epoch": 1.6013892151270093, + "grad_norm": 4.239536285400391, + "learning_rate": 1.0055836871716473e-05, + "loss": 1.9735, + "step": 23862 + }, + { + "epoch": 1.6015234388107782, + "grad_norm": 3.9678359031677246, + "learning_rate": 1.0049300680128887e-05, + "loss": 2.062, + "step": 23864 + }, + { + "epoch": 1.601657662494547, + "grad_norm": 3.868394136428833, + "learning_rate": 1.004276637613329e-05, + "loss": 1.8011, + "step": 23866 + }, + { + "epoch": 1.601791886178316, + "grad_norm": 3.6851367950439453, + "learning_rate": 1.0036233960038399e-05, + "loss": 2.0702, + "step": 23868 + }, + { + "epoch": 1.6019261098620852, + "grad_norm": 4.155148506164551, + "learning_rate": 1.0029703432152859e-05, + "loss": 1.8838, + "step": 23870 + }, + { + "epoch": 1.6020603335458543, + "grad_norm": 4.116538047790527, + "learning_rate": 1.002317479278525e-05, + "loss": 1.9148, + "step": 23872 + }, + { + "epoch": 1.6021945572296232, + "grad_norm": 4.239544868469238, + "learning_rate": 1.001664804224402e-05, + "loss": 1.9661, + "step": 23874 + }, + { + "epoch": 1.6023287809133921, + "grad_norm": 3.958850860595703, + "learning_rate": 1.0010123180837544e-05, + "loss": 1.8338, + "step": 23876 + }, + { + "epoch": 1.602463004597161, + "grad_norm": 4.102325916290283, + "learning_rate": 1.0003600208874098e-05, + "loss": 1.8835, + "step": 23878 + }, + { + "epoch": 1.6025972282809302, + "grad_norm": 4.1944451332092285, + "learning_rate": 9.997079126661907e-06, + "loss": 1.7759, + "step": 23880 + }, + { + "epoch": 1.6027314519646991, + "grad_norm": 4.1582417488098145, + "learning_rate": 9.990559934509053e-06, + "loss": 1.9595, + "step": 23882 + }, + { + "epoch": 1.6028656756484683, + "grad_norm": 4.407383918762207, + "learning_rate": 9.984042632723589e-06, + "loss": 2.1135, + "step": 23884 + }, + { + "epoch": 1.6029998993322372, + "grad_norm": 3.890472173690796, + "learning_rate": 9.977527221613397e-06, + "loss": 2.0512, + "step": 23886 + }, + { + "epoch": 1.603134123016006, + "grad_norm": 4.43627405166626, + "learning_rate": 9.97101370148636e-06, + "loss": 2.2, + "step": 23888 + }, + { + "epoch": 1.603268346699775, + "grad_norm": 4.952737808227539, + "learning_rate": 9.964502072650195e-06, + "loss": 1.9024, + "step": 23890 + }, + { + "epoch": 1.6034025703835442, + "grad_norm": 3.9892401695251465, + "learning_rate": 9.957992335412597e-06, + "loss": 1.8806, + "step": 23892 + }, + { + "epoch": 1.6035367940673133, + "grad_norm": 4.312586307525635, + "learning_rate": 9.95148449008112e-06, + "loss": 1.7682, + "step": 23894 + }, + { + "epoch": 1.6036710177510822, + "grad_norm": 4.761053562164307, + "learning_rate": 9.944978536963246e-06, + "loss": 1.8566, + "step": 23896 + }, + { + "epoch": 1.6038052414348511, + "grad_norm": 3.5780789852142334, + "learning_rate": 9.938474476366378e-06, + "loss": 1.7308, + "step": 23898 + }, + { + "epoch": 1.60393946511862, + "grad_norm": 3.662245750427246, + "learning_rate": 9.931972308597792e-06, + "loss": 1.797, + "step": 23900 + }, + { + "epoch": 1.6040736888023892, + "grad_norm": 4.09004020690918, + "learning_rate": 9.925472033964744e-06, + "loss": 1.8225, + "step": 23902 + }, + { + "epoch": 1.6042079124861581, + "grad_norm": 4.086696147918701, + "learning_rate": 9.918973652774339e-06, + "loss": 2.022, + "step": 23904 + }, + { + "epoch": 1.6043421361699273, + "grad_norm": 6.244596004486084, + "learning_rate": 9.912477165333612e-06, + "loss": 1.949, + "step": 23906 + }, + { + "epoch": 1.6044763598536962, + "grad_norm": 3.8716068267822266, + "learning_rate": 9.905982571949491e-06, + "loss": 1.9169, + "step": 23908 + }, + { + "epoch": 1.604610583537465, + "grad_norm": 3.7366650104522705, + "learning_rate": 9.899489872928874e-06, + "loss": 1.7345, + "step": 23910 + }, + { + "epoch": 1.604744807221234, + "grad_norm": 4.299894332885742, + "learning_rate": 9.892999068578502e-06, + "loss": 1.7067, + "step": 23912 + }, + { + "epoch": 1.6048790309050032, + "grad_norm": 3.5937094688415527, + "learning_rate": 9.886510159205053e-06, + "loss": 1.6011, + "step": 23914 + }, + { + "epoch": 1.6050132545887723, + "grad_norm": 3.9436299800872803, + "learning_rate": 9.880023145115114e-06, + "loss": 1.8732, + "step": 23916 + }, + { + "epoch": 1.6051474782725412, + "grad_norm": 4.4380879402160645, + "learning_rate": 9.873538026615198e-06, + "loss": 1.9194, + "step": 23918 + }, + { + "epoch": 1.6052817019563101, + "grad_norm": 4.3789849281311035, + "learning_rate": 9.867054804011693e-06, + "loss": 1.8431, + "step": 23920 + }, + { + "epoch": 1.605415925640079, + "grad_norm": 3.735490083694458, + "learning_rate": 9.860573477610952e-06, + "loss": 1.9416, + "step": 23922 + }, + { + "epoch": 1.6055501493238482, + "grad_norm": 4.770869731903076, + "learning_rate": 9.854094047719164e-06, + "loss": 1.8147, + "step": 23924 + }, + { + "epoch": 1.6056843730076173, + "grad_norm": 4.600051403045654, + "learning_rate": 9.8476165146425e-06, + "loss": 1.9854, + "step": 23926 + }, + { + "epoch": 1.6058185966913863, + "grad_norm": 4.414779186248779, + "learning_rate": 9.841140878686983e-06, + "loss": 1.8566, + "step": 23928 + }, + { + "epoch": 1.6059528203751552, + "grad_norm": 4.043504238128662, + "learning_rate": 9.834667140158605e-06, + "loss": 2.1251, + "step": 23930 + }, + { + "epoch": 1.606087044058924, + "grad_norm": 4.288966178894043, + "learning_rate": 9.828195299363224e-06, + "loss": 1.7174, + "step": 23932 + }, + { + "epoch": 1.6062212677426932, + "grad_norm": 3.960615396499634, + "learning_rate": 9.82172535660662e-06, + "loss": 1.7848, + "step": 23934 + }, + { + "epoch": 1.6063554914264622, + "grad_norm": 3.562453508377075, + "learning_rate": 9.81525731219447e-06, + "loss": 1.9322, + "step": 23936 + }, + { + "epoch": 1.6064897151102313, + "grad_norm": 4.354695796966553, + "learning_rate": 9.808791166432413e-06, + "loss": 1.9789, + "step": 23938 + }, + { + "epoch": 1.6066239387940002, + "grad_norm": 4.411083698272705, + "learning_rate": 9.802326919625931e-06, + "loss": 1.9223, + "step": 23940 + }, + { + "epoch": 1.6067581624777691, + "grad_norm": 4.066767692565918, + "learning_rate": 9.795864572080466e-06, + "loss": 1.962, + "step": 23942 + }, + { + "epoch": 1.606892386161538, + "grad_norm": 3.615788221359253, + "learning_rate": 9.78940412410132e-06, + "loss": 1.7083, + "step": 23944 + }, + { + "epoch": 1.6070266098453072, + "grad_norm": 4.054454326629639, + "learning_rate": 9.782945575993779e-06, + "loss": 1.696, + "step": 23946 + }, + { + "epoch": 1.6071608335290763, + "grad_norm": 3.858663558959961, + "learning_rate": 9.77648892806296e-06, + "loss": 1.8297, + "step": 23948 + }, + { + "epoch": 1.6072950572128453, + "grad_norm": 4.537731170654297, + "learning_rate": 9.770034180613968e-06, + "loss": 2.091, + "step": 23950 + }, + { + "epoch": 1.6074292808966142, + "grad_norm": 3.8868188858032227, + "learning_rate": 9.763581333951727e-06, + "loss": 1.803, + "step": 23952 + }, + { + "epoch": 1.607563504580383, + "grad_norm": 4.174071311950684, + "learning_rate": 9.757130388381158e-06, + "loss": 1.908, + "step": 23954 + }, + { + "epoch": 1.6076977282641522, + "grad_norm": 5.45490837097168, + "learning_rate": 9.750681344207035e-06, + "loss": 1.9528, + "step": 23956 + }, + { + "epoch": 1.6078319519479212, + "grad_norm": 3.8968498706817627, + "learning_rate": 9.744234201734082e-06, + "loss": 1.9068, + "step": 23958 + }, + { + "epoch": 1.6079661756316903, + "grad_norm": 3.9001848697662354, + "learning_rate": 9.737788961266903e-06, + "loss": 1.8535, + "step": 23960 + }, + { + "epoch": 1.6081003993154592, + "grad_norm": 4.439923286437988, + "learning_rate": 9.731345623110022e-06, + "loss": 1.9349, + "step": 23962 + }, + { + "epoch": 1.6082346229992281, + "grad_norm": 4.007152080535889, + "learning_rate": 9.724904187567879e-06, + "loss": 2.1099, + "step": 23964 + }, + { + "epoch": 1.608368846682997, + "grad_norm": 4.34168815612793, + "learning_rate": 9.718464654944798e-06, + "loss": 1.8865, + "step": 23966 + }, + { + "epoch": 1.6085030703667662, + "grad_norm": 3.5462779998779297, + "learning_rate": 9.712027025545067e-06, + "loss": 1.9471, + "step": 23968 + }, + { + "epoch": 1.6086372940505353, + "grad_norm": 4.333988189697266, + "learning_rate": 9.70559129967284e-06, + "loss": 1.8338, + "step": 23970 + }, + { + "epoch": 1.6087715177343043, + "grad_norm": 4.626573085784912, + "learning_rate": 9.699157477632181e-06, + "loss": 1.8022, + "step": 23972 + }, + { + "epoch": 1.6089057414180732, + "grad_norm": 4.007143974304199, + "learning_rate": 9.692725559727072e-06, + "loss": 2.1327, + "step": 23974 + }, + { + "epoch": 1.609039965101842, + "grad_norm": 4.0807013511657715, + "learning_rate": 9.686295546261436e-06, + "loss": 1.9332, + "step": 23976 + }, + { + "epoch": 1.6091741887856112, + "grad_norm": 4.001431941986084, + "learning_rate": 9.679867437539063e-06, + "loss": 1.8488, + "step": 23978 + }, + { + "epoch": 1.6093084124693802, + "grad_norm": 4.076642990112305, + "learning_rate": 9.673441233863662e-06, + "loss": 1.8498, + "step": 23980 + }, + { + "epoch": 1.6094426361531493, + "grad_norm": 4.654974460601807, + "learning_rate": 9.667016935538859e-06, + "loss": 1.8511, + "step": 23982 + }, + { + "epoch": 1.6095768598369182, + "grad_norm": 3.964921474456787, + "learning_rate": 9.6605945428682e-06, + "loss": 1.8571, + "step": 23984 + }, + { + "epoch": 1.6097110835206871, + "grad_norm": 4.128443241119385, + "learning_rate": 9.654174056155113e-06, + "loss": 1.8977, + "step": 23986 + }, + { + "epoch": 1.609845307204456, + "grad_norm": 3.9232726097106934, + "learning_rate": 9.64775547570298e-06, + "loss": 1.7003, + "step": 23988 + }, + { + "epoch": 1.6099795308882252, + "grad_norm": 3.8396658897399902, + "learning_rate": 9.641338801815048e-06, + "loss": 1.8621, + "step": 23990 + }, + { + "epoch": 1.6101137545719943, + "grad_norm": 4.570084095001221, + "learning_rate": 9.634924034794501e-06, + "loss": 1.8803, + "step": 23992 + }, + { + "epoch": 1.6102479782557633, + "grad_norm": 4.590132713317871, + "learning_rate": 9.628511174944404e-06, + "loss": 2.0504, + "step": 23994 + }, + { + "epoch": 1.6103822019395322, + "grad_norm": 4.15330171585083, + "learning_rate": 9.622100222567775e-06, + "loss": 1.8166, + "step": 23996 + }, + { + "epoch": 1.610516425623301, + "grad_norm": 4.202214241027832, + "learning_rate": 9.615691177967518e-06, + "loss": 1.9265, + "step": 23998 + }, + { + "epoch": 1.6106506493070702, + "grad_norm": 3.884662628173828, + "learning_rate": 9.609284041446438e-06, + "loss": 2.0627, + "step": 24000 + }, + { + "epoch": 1.6107848729908394, + "grad_norm": 4.354757785797119, + "learning_rate": 9.602878813307249e-06, + "loss": 1.7817, + "step": 24002 + }, + { + "epoch": 1.6109190966746083, + "grad_norm": 4.084568977355957, + "learning_rate": 9.596475493852608e-06, + "loss": 1.9445, + "step": 24004 + }, + { + "epoch": 1.6110533203583772, + "grad_norm": 4.486461162567139, + "learning_rate": 9.590074083385053e-06, + "loss": 1.7921, + "step": 24006 + }, + { + "epoch": 1.6111875440421461, + "grad_norm": 4.0023040771484375, + "learning_rate": 9.583674582207036e-06, + "loss": 1.8032, + "step": 24008 + }, + { + "epoch": 1.6113217677259153, + "grad_norm": 3.905430555343628, + "learning_rate": 9.577276990620903e-06, + "loss": 1.6712, + "step": 24010 + }, + { + "epoch": 1.6114559914096842, + "grad_norm": 4.2355055809021, + "learning_rate": 9.570881308928958e-06, + "loss": 1.9823, + "step": 24012 + }, + { + "epoch": 1.6115902150934533, + "grad_norm": 3.995983600616455, + "learning_rate": 9.564487537433365e-06, + "loss": 1.9392, + "step": 24014 + }, + { + "epoch": 1.6117244387772223, + "grad_norm": 3.4409806728363037, + "learning_rate": 9.55809567643623e-06, + "loss": 1.7922, + "step": 24016 + }, + { + "epoch": 1.6118586624609912, + "grad_norm": 4.362349510192871, + "learning_rate": 9.551705726239546e-06, + "loss": 1.7788, + "step": 24018 + }, + { + "epoch": 1.61199288614476, + "grad_norm": 4.107967853546143, + "learning_rate": 9.545317687145232e-06, + "loss": 1.7574, + "step": 24020 + }, + { + "epoch": 1.6121271098285292, + "grad_norm": 3.5170814990997314, + "learning_rate": 9.538931559455095e-06, + "loss": 1.84, + "step": 24022 + }, + { + "epoch": 1.6122613335122984, + "grad_norm": 3.7282345294952393, + "learning_rate": 9.532547343470889e-06, + "loss": 1.763, + "step": 24024 + }, + { + "epoch": 1.6123955571960673, + "grad_norm": 4.886617183685303, + "learning_rate": 9.526165039494244e-06, + "loss": 1.82, + "step": 24026 + }, + { + "epoch": 1.6125297808798362, + "grad_norm": 3.817354440689087, + "learning_rate": 9.519784647826713e-06, + "loss": 1.8814, + "step": 24028 + }, + { + "epoch": 1.6126640045636051, + "grad_norm": 4.742884159088135, + "learning_rate": 9.513406168769745e-06, + "loss": 1.8643, + "step": 24030 + }, + { + "epoch": 1.6127982282473743, + "grad_norm": 4.118808269500732, + "learning_rate": 9.507029602624734e-06, + "loss": 1.7689, + "step": 24032 + }, + { + "epoch": 1.6129324519311432, + "grad_norm": 4.107776165008545, + "learning_rate": 9.50065494969295e-06, + "loss": 1.9663, + "step": 24034 + }, + { + "epoch": 1.6130666756149123, + "grad_norm": 4.217438697814941, + "learning_rate": 9.494282210275579e-06, + "loss": 2.0702, + "step": 24036 + }, + { + "epoch": 1.6132008992986813, + "grad_norm": 4.404104709625244, + "learning_rate": 9.48791138467372e-06, + "loss": 2.0491, + "step": 24038 + }, + { + "epoch": 1.6133351229824502, + "grad_norm": 4.059761047363281, + "learning_rate": 9.481542473188377e-06, + "loss": 2.0456, + "step": 24040 + }, + { + "epoch": 1.613469346666219, + "grad_norm": 4.695176601409912, + "learning_rate": 9.475175476120484e-06, + "loss": 1.9059, + "step": 24042 + }, + { + "epoch": 1.6136035703499882, + "grad_norm": 3.887054920196533, + "learning_rate": 9.468810393770856e-06, + "loss": 1.8242, + "step": 24044 + }, + { + "epoch": 1.6137377940337574, + "grad_norm": 4.3682684898376465, + "learning_rate": 9.462447226440252e-06, + "loss": 1.8302, + "step": 24046 + }, + { + "epoch": 1.6138720177175263, + "grad_norm": 4.315453052520752, + "learning_rate": 9.456085974429286e-06, + "loss": 2.0667, + "step": 24048 + }, + { + "epoch": 1.6140062414012952, + "grad_norm": 4.098113059997559, + "learning_rate": 9.449726638038536e-06, + "loss": 1.8855, + "step": 24050 + }, + { + "epoch": 1.6141404650850641, + "grad_norm": 4.059973239898682, + "learning_rate": 9.443369217568455e-06, + "loss": 1.9676, + "step": 24052 + }, + { + "epoch": 1.6142746887688333, + "grad_norm": 4.070663928985596, + "learning_rate": 9.437013713319437e-06, + "loss": 1.7782, + "step": 24054 + }, + { + "epoch": 1.6144089124526022, + "grad_norm": 3.793797731399536, + "learning_rate": 9.430660125591762e-06, + "loss": 1.9136, + "step": 24056 + }, + { + "epoch": 1.6145431361363713, + "grad_norm": 4.034928321838379, + "learning_rate": 9.424308454685622e-06, + "loss": 1.7065, + "step": 24058 + }, + { + "epoch": 1.6146773598201403, + "grad_norm": 4.171812057495117, + "learning_rate": 9.417958700901097e-06, + "loss": 1.9806, + "step": 24060 + }, + { + "epoch": 1.6148115835039092, + "grad_norm": 4.094841957092285, + "learning_rate": 9.411610864538239e-06, + "loss": 1.9334, + "step": 24062 + }, + { + "epoch": 1.614945807187678, + "grad_norm": 4.217196941375732, + "learning_rate": 9.405264945896959e-06, + "loss": 1.9211, + "step": 24064 + }, + { + "epoch": 1.6150800308714472, + "grad_norm": 4.045564651489258, + "learning_rate": 9.398920945277079e-06, + "loss": 1.9738, + "step": 24066 + }, + { + "epoch": 1.6152142545552164, + "grad_norm": 4.467276573181152, + "learning_rate": 9.392578862978334e-06, + "loss": 1.8019, + "step": 24068 + }, + { + "epoch": 1.6153484782389853, + "grad_norm": 4.464200973510742, + "learning_rate": 9.386238699300398e-06, + "loss": 1.9016, + "step": 24070 + }, + { + "epoch": 1.6154827019227542, + "grad_norm": 4.1446990966796875, + "learning_rate": 9.37990045454281e-06, + "loss": 1.8932, + "step": 24072 + }, + { + "epoch": 1.6156169256065231, + "grad_norm": 4.041545391082764, + "learning_rate": 9.373564129005074e-06, + "loss": 1.9831, + "step": 24074 + }, + { + "epoch": 1.6157511492902923, + "grad_norm": 3.8342843055725098, + "learning_rate": 9.36722972298652e-06, + "loss": 2.1115, + "step": 24076 + }, + { + "epoch": 1.6158853729740614, + "grad_norm": 4.016773700714111, + "learning_rate": 9.36089723678647e-06, + "loss": 1.9262, + "step": 24078 + }, + { + "epoch": 1.6160195966578303, + "grad_norm": 4.6866583824157715, + "learning_rate": 9.354566670704102e-06, + "loss": 1.8436, + "step": 24080 + }, + { + "epoch": 1.6161538203415993, + "grad_norm": 4.70037317276001, + "learning_rate": 9.348238025038547e-06, + "loss": 1.7864, + "step": 24082 + }, + { + "epoch": 1.6162880440253682, + "grad_norm": 3.837646484375, + "learning_rate": 9.341911300088807e-06, + "loss": 1.5718, + "step": 24084 + }, + { + "epoch": 1.6164222677091373, + "grad_norm": 3.8724365234375, + "learning_rate": 9.335586496153804e-06, + "loss": 1.8388, + "step": 24086 + }, + { + "epoch": 1.6165564913929062, + "grad_norm": 4.410144329071045, + "learning_rate": 9.329263613532363e-06, + "loss": 1.9354, + "step": 24088 + }, + { + "epoch": 1.6166907150766754, + "grad_norm": 4.241220951080322, + "learning_rate": 9.322942652523259e-06, + "loss": 1.8094, + "step": 24090 + }, + { + "epoch": 1.6168249387604443, + "grad_norm": 3.6388471126556396, + "learning_rate": 9.316623613425119e-06, + "loss": 1.6207, + "step": 24092 + }, + { + "epoch": 1.6169591624442132, + "grad_norm": 3.927335500717163, + "learning_rate": 9.310306496536519e-06, + "loss": 1.9222, + "step": 24094 + }, + { + "epoch": 1.6170933861279821, + "grad_norm": 4.067849159240723, + "learning_rate": 9.303991302155907e-06, + "loss": 1.7229, + "step": 24096 + }, + { + "epoch": 1.6172276098117513, + "grad_norm": 3.8422372341156006, + "learning_rate": 9.297678030581697e-06, + "loss": 1.6444, + "step": 24098 + }, + { + "epoch": 1.6173618334955204, + "grad_norm": 4.035569667816162, + "learning_rate": 9.291366682112152e-06, + "loss": 1.7346, + "step": 24100 + }, + { + "epoch": 1.6174960571792893, + "grad_norm": 4.041821002960205, + "learning_rate": 9.285057257045498e-06, + "loss": 1.6918, + "step": 24102 + }, + { + "epoch": 1.6176302808630583, + "grad_norm": 4.0666985511779785, + "learning_rate": 9.278749755679823e-06, + "loss": 1.7265, + "step": 24104 + }, + { + "epoch": 1.6177645045468272, + "grad_norm": 4.19816780090332, + "learning_rate": 9.272444178313127e-06, + "loss": 1.8956, + "step": 24106 + }, + { + "epoch": 1.6178987282305963, + "grad_norm": 10.358370780944824, + "learning_rate": 9.266140525243373e-06, + "loss": 1.4745, + "step": 24108 + }, + { + "epoch": 1.6180329519143652, + "grad_norm": 4.622109413146973, + "learning_rate": 9.259838796768367e-06, + "loss": 1.8208, + "step": 24110 + }, + { + "epoch": 1.6181671755981344, + "grad_norm": 3.765242099761963, + "learning_rate": 9.25353899318589e-06, + "loss": 1.6492, + "step": 24112 + }, + { + "epoch": 1.6183013992819033, + "grad_norm": 4.690896034240723, + "learning_rate": 9.247241114793543e-06, + "loss": 2.1016, + "step": 24114 + }, + { + "epoch": 1.6184356229656722, + "grad_norm": 3.6467950344085693, + "learning_rate": 9.240945161888931e-06, + "loss": 1.9599, + "step": 24116 + }, + { + "epoch": 1.6185698466494411, + "grad_norm": 4.342892169952393, + "learning_rate": 9.2346511347695e-06, + "loss": 1.9455, + "step": 24118 + }, + { + "epoch": 1.6187040703332103, + "grad_norm": 4.306427478790283, + "learning_rate": 9.228359033732653e-06, + "loss": 1.77, + "step": 24120 + }, + { + "epoch": 1.6188382940169794, + "grad_norm": 5.55506706237793, + "learning_rate": 9.222068859075667e-06, + "loss": 1.7312, + "step": 24122 + }, + { + "epoch": 1.6189725177007483, + "grad_norm": 4.053586959838867, + "learning_rate": 9.215780611095743e-06, + "loss": 1.9628, + "step": 24124 + }, + { + "epoch": 1.6191067413845173, + "grad_norm": 4.249564170837402, + "learning_rate": 9.209494290089971e-06, + "loss": 2.1339, + "step": 24126 + }, + { + "epoch": 1.6192409650682862, + "grad_norm": 4.36043643951416, + "learning_rate": 9.203209896355398e-06, + "loss": 2.05, + "step": 24128 + }, + { + "epoch": 1.6193751887520553, + "grad_norm": 4.340399265289307, + "learning_rate": 9.196927430188929e-06, + "loss": 2.0851, + "step": 24130 + }, + { + "epoch": 1.6195094124358242, + "grad_norm": 4.586548328399658, + "learning_rate": 9.190646891887405e-06, + "loss": 1.8311, + "step": 24132 + }, + { + "epoch": 1.6196436361195934, + "grad_norm": 4.294103622436523, + "learning_rate": 9.184368281747557e-06, + "loss": 2.2426, + "step": 24134 + }, + { + "epoch": 1.6197778598033623, + "grad_norm": 4.276889324188232, + "learning_rate": 9.178091600066063e-06, + "loss": 2.0419, + "step": 24136 + }, + { + "epoch": 1.6199120834871312, + "grad_norm": 4.332924842834473, + "learning_rate": 9.171816847139448e-06, + "loss": 1.7875, + "step": 24138 + }, + { + "epoch": 1.6200463071709001, + "grad_norm": 4.18890380859375, + "learning_rate": 9.165544023264233e-06, + "loss": 1.7807, + "step": 24140 + }, + { + "epoch": 1.6201805308546693, + "grad_norm": 3.8246350288391113, + "learning_rate": 9.159273128736734e-06, + "loss": 1.9896, + "step": 24142 + }, + { + "epoch": 1.6203147545384384, + "grad_norm": 4.177459716796875, + "learning_rate": 9.15300416385329e-06, + "loss": 2.048, + "step": 24144 + }, + { + "epoch": 1.6204489782222073, + "grad_norm": 4.468746662139893, + "learning_rate": 9.14673712891006e-06, + "loss": 1.9574, + "step": 24146 + }, + { + "epoch": 1.6205832019059763, + "grad_norm": 4.288543701171875, + "learning_rate": 9.140472024203179e-06, + "loss": 1.9469, + "step": 24148 + }, + { + "epoch": 1.6207174255897452, + "grad_norm": 3.790585517883301, + "learning_rate": 9.134208850028647e-06, + "loss": 2.0144, + "step": 24150 + }, + { + "epoch": 1.6208516492735143, + "grad_norm": 4.657275199890137, + "learning_rate": 9.127947606682391e-06, + "loss": 1.97, + "step": 24152 + }, + { + "epoch": 1.6209858729572835, + "grad_norm": 4.756190776824951, + "learning_rate": 9.121688294460223e-06, + "loss": 1.8286, + "step": 24154 + }, + { + "epoch": 1.6211200966410524, + "grad_norm": 4.561924934387207, + "learning_rate": 9.115430913657912e-06, + "loss": 2.0461, + "step": 24156 + }, + { + "epoch": 1.6212543203248213, + "grad_norm": 4.443737506866455, + "learning_rate": 9.109175464571096e-06, + "loss": 1.8684, + "step": 24158 + }, + { + "epoch": 1.6213885440085902, + "grad_norm": 3.661604642868042, + "learning_rate": 9.10292194749533e-06, + "loss": 1.7431, + "step": 24160 + }, + { + "epoch": 1.6215227676923594, + "grad_norm": 4.067526817321777, + "learning_rate": 9.096670362726073e-06, + "loss": 2.0059, + "step": 24162 + }, + { + "epoch": 1.6216569913761283, + "grad_norm": 3.99375319480896, + "learning_rate": 9.090420710558718e-06, + "loss": 1.8562, + "step": 24164 + }, + { + "epoch": 1.6217912150598974, + "grad_norm": 4.397009372711182, + "learning_rate": 9.084172991288525e-06, + "loss": 2.0267, + "step": 24166 + }, + { + "epoch": 1.6219254387436663, + "grad_norm": 4.330295562744141, + "learning_rate": 9.077927205210712e-06, + "loss": 1.9098, + "step": 24168 + }, + { + "epoch": 1.6220596624274353, + "grad_norm": 4.193619251251221, + "learning_rate": 9.071683352620385e-06, + "loss": 1.9799, + "step": 24170 + }, + { + "epoch": 1.6221938861112042, + "grad_norm": 4.6983642578125, + "learning_rate": 9.06544143381251e-06, + "loss": 1.9238, + "step": 24172 + }, + { + "epoch": 1.6223281097949733, + "grad_norm": 3.55770206451416, + "learning_rate": 9.059201449082045e-06, + "loss": 1.8421, + "step": 24174 + }, + { + "epoch": 1.6224623334787425, + "grad_norm": 3.6645851135253906, + "learning_rate": 9.052963398723796e-06, + "loss": 1.7598, + "step": 24176 + }, + { + "epoch": 1.6225965571625114, + "grad_norm": 4.452086925506592, + "learning_rate": 9.046727283032519e-06, + "loss": 1.9239, + "step": 24178 + }, + { + "epoch": 1.6227307808462803, + "grad_norm": 4.004067897796631, + "learning_rate": 9.040493102302844e-06, + "loss": 1.7203, + "step": 24180 + }, + { + "epoch": 1.6228650045300492, + "grad_norm": 4.04992151260376, + "learning_rate": 9.03426085682933e-06, + "loss": 1.7294, + "step": 24182 + }, + { + "epoch": 1.6229992282138184, + "grad_norm": 3.494908571243286, + "learning_rate": 9.028030546906419e-06, + "loss": 1.5937, + "step": 24184 + }, + { + "epoch": 1.6231334518975873, + "grad_norm": 4.214963436126709, + "learning_rate": 9.021802172828509e-06, + "loss": 1.9855, + "step": 24186 + }, + { + "epoch": 1.6232676755813564, + "grad_norm": 3.832341194152832, + "learning_rate": 9.01557573488987e-06, + "loss": 1.8817, + "step": 24188 + }, + { + "epoch": 1.6234018992651253, + "grad_norm": 4.20316743850708, + "learning_rate": 9.009351233384684e-06, + "loss": 1.898, + "step": 24190 + }, + { + "epoch": 1.6235361229488943, + "grad_norm": 4.578279972076416, + "learning_rate": 9.003128668607031e-06, + "loss": 2.0817, + "step": 24192 + }, + { + "epoch": 1.6236703466326632, + "grad_norm": 4.148762226104736, + "learning_rate": 8.99690804085095e-06, + "loss": 1.8547, + "step": 24194 + }, + { + "epoch": 1.6238045703164323, + "grad_norm": 3.915149450302124, + "learning_rate": 8.990689350410314e-06, + "loss": 1.8409, + "step": 24196 + }, + { + "epoch": 1.6239387940002015, + "grad_norm": 4.058082580566406, + "learning_rate": 8.984472597578997e-06, + "loss": 1.8634, + "step": 24198 + }, + { + "epoch": 1.6240730176839704, + "grad_norm": 3.872549295425415, + "learning_rate": 8.978257782650668e-06, + "loss": 1.8554, + "step": 24200 + }, + { + "epoch": 1.6242072413677393, + "grad_norm": 3.7040462493896484, + "learning_rate": 8.972044905919008e-06, + "loss": 1.7296, + "step": 24202 + }, + { + "epoch": 1.6243414650515082, + "grad_norm": 3.9993183612823486, + "learning_rate": 8.965833967677534e-06, + "loss": 1.7007, + "step": 24204 + }, + { + "epoch": 1.6244756887352774, + "grad_norm": 3.7898237705230713, + "learning_rate": 8.959624968219732e-06, + "loss": 1.8812, + "step": 24206 + }, + { + "epoch": 1.6246099124190463, + "grad_norm": 4.059711456298828, + "learning_rate": 8.95341790783894e-06, + "loss": 1.8718, + "step": 24208 + }, + { + "epoch": 1.6247441361028154, + "grad_norm": 4.390162944793701, + "learning_rate": 8.94721278682844e-06, + "loss": 1.7915, + "step": 24210 + }, + { + "epoch": 1.6248783597865843, + "grad_norm": 3.873659133911133, + "learning_rate": 8.941009605481398e-06, + "loss": 1.7374, + "step": 24212 + }, + { + "epoch": 1.6250125834703533, + "grad_norm": 4.450182914733887, + "learning_rate": 8.934808364090924e-06, + "loss": 1.8636, + "step": 24214 + }, + { + "epoch": 1.6251468071541222, + "grad_norm": 3.859612226486206, + "learning_rate": 8.928609062950005e-06, + "loss": 2.0918, + "step": 24216 + }, + { + "epoch": 1.6252810308378913, + "grad_norm": 4.014133453369141, + "learning_rate": 8.922411702351546e-06, + "loss": 1.7568, + "step": 24218 + }, + { + "epoch": 1.6254152545216605, + "grad_norm": 3.763633966445923, + "learning_rate": 8.916216282588341e-06, + "loss": 1.7301, + "step": 24220 + }, + { + "epoch": 1.6255494782054294, + "grad_norm": 4.291180610656738, + "learning_rate": 8.910022803953144e-06, + "loss": 1.901, + "step": 24222 + }, + { + "epoch": 1.6256837018891983, + "grad_norm": 4.264101505279541, + "learning_rate": 8.90383126673856e-06, + "loss": 1.9014, + "step": 24224 + }, + { + "epoch": 1.6258179255729672, + "grad_norm": 4.427036285400391, + "learning_rate": 8.897641671237156e-06, + "loss": 1.8031, + "step": 24226 + }, + { + "epoch": 1.6259521492567364, + "grad_norm": 3.726849317550659, + "learning_rate": 8.891454017741341e-06, + "loss": 1.6145, + "step": 24228 + }, + { + "epoch": 1.6260863729405055, + "grad_norm": 4.246091842651367, + "learning_rate": 8.885268306543492e-06, + "loss": 2.1347, + "step": 24230 + }, + { + "epoch": 1.6262205966242744, + "grad_norm": 3.546433925628662, + "learning_rate": 8.879084537935855e-06, + "loss": 1.658, + "step": 24232 + }, + { + "epoch": 1.6263548203080433, + "grad_norm": 4.137421607971191, + "learning_rate": 8.872902712210628e-06, + "loss": 1.9815, + "step": 24234 + }, + { + "epoch": 1.6264890439918123, + "grad_norm": 4.542464256286621, + "learning_rate": 8.86672282965988e-06, + "loss": 1.78, + "step": 24236 + }, + { + "epoch": 1.6266232676755814, + "grad_norm": 4.764447212219238, + "learning_rate": 8.860544890575584e-06, + "loss": 1.9347, + "step": 24238 + }, + { + "epoch": 1.6267574913593503, + "grad_norm": 4.002866268157959, + "learning_rate": 8.85436889524965e-06, + "loss": 1.7686, + "step": 24240 + }, + { + "epoch": 1.6268917150431195, + "grad_norm": 4.3723225593566895, + "learning_rate": 8.848194843973862e-06, + "loss": 1.9969, + "step": 24242 + }, + { + "epoch": 1.6270259387268884, + "grad_norm": 3.570404291152954, + "learning_rate": 8.842022737039957e-06, + "loss": 1.7516, + "step": 24244 + }, + { + "epoch": 1.6271601624106573, + "grad_norm": 4.517883777618408, + "learning_rate": 8.835852574739544e-06, + "loss": 1.9958, + "step": 24246 + }, + { + "epoch": 1.6272943860944262, + "grad_norm": 4.140350818634033, + "learning_rate": 8.829684357364153e-06, + "loss": 1.8953, + "step": 24248 + }, + { + "epoch": 1.6274286097781954, + "grad_norm": 4.3224873542785645, + "learning_rate": 8.823518085205206e-06, + "loss": 2.1009, + "step": 24250 + }, + { + "epoch": 1.6275628334619645, + "grad_norm": 4.052905082702637, + "learning_rate": 8.817353758554075e-06, + "loss": 1.8967, + "step": 24252 + }, + { + "epoch": 1.6276970571457334, + "grad_norm": 3.9014127254486084, + "learning_rate": 8.811191377701995e-06, + "loss": 1.6426, + "step": 24254 + }, + { + "epoch": 1.6278312808295023, + "grad_norm": 4.298305511474609, + "learning_rate": 8.805030942940123e-06, + "loss": 1.7659, + "step": 24256 + }, + { + "epoch": 1.6279655045132713, + "grad_norm": 4.358601093292236, + "learning_rate": 8.79887245455952e-06, + "loss": 1.8989, + "step": 24258 + }, + { + "epoch": 1.6280997281970404, + "grad_norm": 3.818150043487549, + "learning_rate": 8.792715912851196e-06, + "loss": 1.7151, + "step": 24260 + }, + { + "epoch": 1.6282339518808093, + "grad_norm": 4.064329147338867, + "learning_rate": 8.786561318105996e-06, + "loss": 1.9291, + "step": 24262 + }, + { + "epoch": 1.6283681755645785, + "grad_norm": 4.376039505004883, + "learning_rate": 8.780408670614753e-06, + "loss": 2.091, + "step": 24264 + }, + { + "epoch": 1.6285023992483474, + "grad_norm": 3.579594612121582, + "learning_rate": 8.774257970668127e-06, + "loss": 1.847, + "step": 24266 + }, + { + "epoch": 1.6286366229321163, + "grad_norm": 4.085024356842041, + "learning_rate": 8.768109218556753e-06, + "loss": 1.9266, + "step": 24268 + }, + { + "epoch": 1.6287708466158852, + "grad_norm": 4.8176116943359375, + "learning_rate": 8.76196241457113e-06, + "loss": 1.9353, + "step": 24270 + }, + { + "epoch": 1.6289050702996544, + "grad_norm": 4.027370929718018, + "learning_rate": 8.7558175590017e-06, + "loss": 1.9747, + "step": 24272 + }, + { + "epoch": 1.6290392939834235, + "grad_norm": 4.3259358406066895, + "learning_rate": 8.74967465213879e-06, + "loss": 1.7733, + "step": 24274 + }, + { + "epoch": 1.6291735176671924, + "grad_norm": 4.1228413581848145, + "learning_rate": 8.743533694272638e-06, + "loss": 1.9434, + "step": 24276 + }, + { + "epoch": 1.6293077413509613, + "grad_norm": 7.006977558135986, + "learning_rate": 8.737394685693378e-06, + "loss": 1.9452, + "step": 24278 + }, + { + "epoch": 1.6294419650347303, + "grad_norm": 4.728484153747559, + "learning_rate": 8.731257626691092e-06, + "loss": 1.8372, + "step": 24280 + }, + { + "epoch": 1.6295761887184994, + "grad_norm": 3.925100326538086, + "learning_rate": 8.725122517555734e-06, + "loss": 1.7718, + "step": 24282 + }, + { + "epoch": 1.6297104124022683, + "grad_norm": 3.952695608139038, + "learning_rate": 8.718989358577167e-06, + "loss": 2.0139, + "step": 24284 + }, + { + "epoch": 1.6298446360860375, + "grad_norm": 6.131693363189697, + "learning_rate": 8.712858150045172e-06, + "loss": 1.803, + "step": 24286 + }, + { + "epoch": 1.6299788597698064, + "grad_norm": 4.110157489776611, + "learning_rate": 8.706728892249449e-06, + "loss": 1.8203, + "step": 24288 + }, + { + "epoch": 1.6301130834535753, + "grad_norm": 4.532219886779785, + "learning_rate": 8.700601585479579e-06, + "loss": 2.0003, + "step": 24290 + }, + { + "epoch": 1.6302473071373442, + "grad_norm": 3.8349905014038086, + "learning_rate": 8.694476230025094e-06, + "loss": 1.7169, + "step": 24292 + }, + { + "epoch": 1.6303815308211134, + "grad_norm": 3.6274516582489014, + "learning_rate": 8.68835282617536e-06, + "loss": 1.8557, + "step": 24294 + }, + { + "epoch": 1.6305157545048825, + "grad_norm": 4.499518871307373, + "learning_rate": 8.68223137421973e-06, + "loss": 1.9515, + "step": 24296 + }, + { + "epoch": 1.6306499781886514, + "grad_norm": 4.111870288848877, + "learning_rate": 8.676111874447407e-06, + "loss": 2.0174, + "step": 24298 + }, + { + "epoch": 1.6307842018724203, + "grad_norm": 3.93921160697937, + "learning_rate": 8.669994327147552e-06, + "loss": 2.1634, + "step": 24300 + }, + { + "epoch": 1.6309184255561893, + "grad_norm": 3.8223400115966797, + "learning_rate": 8.663878732609187e-06, + "loss": 1.8379, + "step": 24302 + }, + { + "epoch": 1.6310526492399584, + "grad_norm": 3.8900341987609863, + "learning_rate": 8.657765091121273e-06, + "loss": 1.7801, + "step": 24304 + }, + { + "epoch": 1.6311868729237275, + "grad_norm": 4.370410442352295, + "learning_rate": 8.651653402972654e-06, + "loss": 1.7989, + "step": 24306 + }, + { + "epoch": 1.6313210966074965, + "grad_norm": 3.78320574760437, + "learning_rate": 8.645543668452112e-06, + "loss": 1.951, + "step": 24308 + }, + { + "epoch": 1.6314553202912654, + "grad_norm": 4.653826713562012, + "learning_rate": 8.639435887848307e-06, + "loss": 1.8971, + "step": 24310 + }, + { + "epoch": 1.6315895439750343, + "grad_norm": 3.850735664367676, + "learning_rate": 8.63333006144983e-06, + "loss": 1.785, + "step": 24312 + }, + { + "epoch": 1.6317237676588034, + "grad_norm": 4.287217140197754, + "learning_rate": 8.627226189545162e-06, + "loss": 1.8914, + "step": 24314 + }, + { + "epoch": 1.6318579913425724, + "grad_norm": 4.707744121551514, + "learning_rate": 8.621124272422688e-06, + "loss": 2.0088, + "step": 24316 + }, + { + "epoch": 1.6319922150263415, + "grad_norm": 4.382009983062744, + "learning_rate": 8.61502431037074e-06, + "loss": 2.109, + "step": 24318 + }, + { + "epoch": 1.6321264387101104, + "grad_norm": 4.345433235168457, + "learning_rate": 8.608926303677506e-06, + "loss": 2.1393, + "step": 24320 + }, + { + "epoch": 1.6322606623938793, + "grad_norm": 4.465890407562256, + "learning_rate": 8.602830252631116e-06, + "loss": 1.7327, + "step": 24322 + }, + { + "epoch": 1.6323948860776483, + "grad_norm": 3.8064045906066895, + "learning_rate": 8.59673615751958e-06, + "loss": 1.7639, + "step": 24324 + }, + { + "epoch": 1.6325291097614174, + "grad_norm": 4.102819442749023, + "learning_rate": 8.590644018630861e-06, + "loss": 1.8485, + "step": 24326 + }, + { + "epoch": 1.6326633334451865, + "grad_norm": 4.4916205406188965, + "learning_rate": 8.584553836252768e-06, + "loss": 1.9917, + "step": 24328 + }, + { + "epoch": 1.6327975571289555, + "grad_norm": 3.751910924911499, + "learning_rate": 8.578465610673076e-06, + "loss": 1.8768, + "step": 24330 + }, + { + "epoch": 1.6329317808127244, + "grad_norm": 4.505708694458008, + "learning_rate": 8.572379342179437e-06, + "loss": 1.9181, + "step": 24332 + }, + { + "epoch": 1.6330660044964933, + "grad_norm": 3.9658427238464355, + "learning_rate": 8.566295031059407e-06, + "loss": 1.9913, + "step": 24334 + }, + { + "epoch": 1.6332002281802624, + "grad_norm": 4.027620315551758, + "learning_rate": 8.560212677600448e-06, + "loss": 1.7706, + "step": 24336 + }, + { + "epoch": 1.6333344518640314, + "grad_norm": 4.2562174797058105, + "learning_rate": 8.554132282089967e-06, + "loss": 1.7854, + "step": 24338 + }, + { + "epoch": 1.6334686755478005, + "grad_norm": 4.26728630065918, + "learning_rate": 8.548053844815236e-06, + "loss": 1.839, + "step": 24340 + }, + { + "epoch": 1.6336028992315694, + "grad_norm": 4.195291042327881, + "learning_rate": 8.541977366063448e-06, + "loss": 1.7796, + "step": 24342 + }, + { + "epoch": 1.6337371229153383, + "grad_norm": 3.7660272121429443, + "learning_rate": 8.53590284612169e-06, + "loss": 1.977, + "step": 24344 + }, + { + "epoch": 1.6338713465991073, + "grad_norm": 3.768477439880371, + "learning_rate": 8.529830285277001e-06, + "loss": 1.7671, + "step": 24346 + }, + { + "epoch": 1.6340055702828764, + "grad_norm": 4.196593761444092, + "learning_rate": 8.523759683816274e-06, + "loss": 1.9943, + "step": 24348 + }, + { + "epoch": 1.6341397939666455, + "grad_norm": 3.8747026920318604, + "learning_rate": 8.517691042026365e-06, + "loss": 2.2318, + "step": 24350 + }, + { + "epoch": 1.6342740176504145, + "grad_norm": 3.6823060512542725, + "learning_rate": 8.51162436019396e-06, + "loss": 1.9804, + "step": 24352 + }, + { + "epoch": 1.6344082413341834, + "grad_norm": 4.6158857345581055, + "learning_rate": 8.505559638605732e-06, + "loss": 1.784, + "step": 24354 + }, + { + "epoch": 1.6345424650179523, + "grad_norm": 3.848409652709961, + "learning_rate": 8.499496877548202e-06, + "loss": 1.8853, + "step": 24356 + }, + { + "epoch": 1.6346766887017214, + "grad_norm": 6.136194705963135, + "learning_rate": 8.493436077307848e-06, + "loss": 1.7919, + "step": 24358 + }, + { + "epoch": 1.6348109123854904, + "grad_norm": 3.7486140727996826, + "learning_rate": 8.487377238171024e-06, + "loss": 1.8117, + "step": 24360 + }, + { + "epoch": 1.6349451360692595, + "grad_norm": 3.8680996894836426, + "learning_rate": 8.481320360423994e-06, + "loss": 1.7882, + "step": 24362 + }, + { + "epoch": 1.6350793597530284, + "grad_norm": 4.529312610626221, + "learning_rate": 8.47526544435292e-06, + "loss": 1.8619, + "step": 24364 + }, + { + "epoch": 1.6352135834367973, + "grad_norm": 4.5842390060424805, + "learning_rate": 8.469212490243911e-06, + "loss": 1.967, + "step": 24366 + }, + { + "epoch": 1.6353478071205663, + "grad_norm": 3.753046751022339, + "learning_rate": 8.463161498382948e-06, + "loss": 2.0111, + "step": 24368 + }, + { + "epoch": 1.6354820308043354, + "grad_norm": 3.955496311187744, + "learning_rate": 8.457112469055923e-06, + "loss": 1.9958, + "step": 24370 + }, + { + "epoch": 1.6356162544881045, + "grad_norm": 13.679265022277832, + "learning_rate": 8.45106540254863e-06, + "loss": 1.6066, + "step": 24372 + }, + { + "epoch": 1.6357504781718735, + "grad_norm": 3.793517827987671, + "learning_rate": 8.445020299146812e-06, + "loss": 1.6671, + "step": 24374 + }, + { + "epoch": 1.6358847018556424, + "grad_norm": 3.8076303005218506, + "learning_rate": 8.438977159136063e-06, + "loss": 1.7673, + "step": 24376 + }, + { + "epoch": 1.6360189255394113, + "grad_norm": 4.211023330688477, + "learning_rate": 8.432935982801921e-06, + "loss": 1.853, + "step": 24378 + }, + { + "epoch": 1.6361531492231804, + "grad_norm": 4.405986309051514, + "learning_rate": 8.426896770429815e-06, + "loss": 1.9099, + "step": 24380 + }, + { + "epoch": 1.6362873729069496, + "grad_norm": 4.12642765045166, + "learning_rate": 8.420859522305075e-06, + "loss": 1.8844, + "step": 24382 + }, + { + "epoch": 1.6364215965907185, + "grad_norm": 4.017794609069824, + "learning_rate": 8.41482423871297e-06, + "loss": 1.7663, + "step": 24384 + }, + { + "epoch": 1.6365558202744874, + "grad_norm": 4.332740783691406, + "learning_rate": 8.408790919938636e-06, + "loss": 1.7959, + "step": 24386 + }, + { + "epoch": 1.6366900439582563, + "grad_norm": 4.319294452667236, + "learning_rate": 8.402759566267171e-06, + "loss": 1.6059, + "step": 24388 + }, + { + "epoch": 1.6368242676420255, + "grad_norm": 3.913337230682373, + "learning_rate": 8.396730177983497e-06, + "loss": 2.1387, + "step": 24390 + }, + { + "epoch": 1.6369584913257944, + "grad_norm": 3.940573215484619, + "learning_rate": 8.39070275537252e-06, + "loss": 1.7454, + "step": 24392 + }, + { + "epoch": 1.6370927150095635, + "grad_norm": 3.8749899864196777, + "learning_rate": 8.384677298719006e-06, + "loss": 2.0201, + "step": 24394 + }, + { + "epoch": 1.6372269386933325, + "grad_norm": 4.035922527313232, + "learning_rate": 8.378653808307668e-06, + "loss": 1.7322, + "step": 24396 + }, + { + "epoch": 1.6373611623771014, + "grad_norm": 4.535717964172363, + "learning_rate": 8.372632284423094e-06, + "loss": 1.8623, + "step": 24398 + }, + { + "epoch": 1.6374953860608703, + "grad_norm": 4.40459680557251, + "learning_rate": 8.366612727349787e-06, + "loss": 2.0046, + "step": 24400 + }, + { + "epoch": 1.6376296097446394, + "grad_norm": 4.488327980041504, + "learning_rate": 8.360595137372151e-06, + "loss": 2.0394, + "step": 24402 + }, + { + "epoch": 1.6377638334284086, + "grad_norm": 3.9146993160247803, + "learning_rate": 8.354579514774525e-06, + "loss": 1.8525, + "step": 24404 + }, + { + "epoch": 1.6378980571121775, + "grad_norm": 4.214776992797852, + "learning_rate": 8.348565859841123e-06, + "loss": 1.9979, + "step": 24406 + }, + { + "epoch": 1.6380322807959464, + "grad_norm": 4.521644592285156, + "learning_rate": 8.342554172856076e-06, + "loss": 2.029, + "step": 24408 + }, + { + "epoch": 1.6381665044797153, + "grad_norm": 3.9581832885742188, + "learning_rate": 8.336544454103424e-06, + "loss": 1.8122, + "step": 24410 + }, + { + "epoch": 1.6383007281634845, + "grad_norm": 4.024966239929199, + "learning_rate": 8.330536703867126e-06, + "loss": 1.6852, + "step": 24412 + }, + { + "epoch": 1.6384349518472534, + "grad_norm": 4.053783893585205, + "learning_rate": 8.324530922431018e-06, + "loss": 1.8652, + "step": 24414 + }, + { + "epoch": 1.6385691755310225, + "grad_norm": 3.6927762031555176, + "learning_rate": 8.3185271100789e-06, + "loss": 1.7882, + "step": 24416 + }, + { + "epoch": 1.6387033992147915, + "grad_norm": 4.3422136306762695, + "learning_rate": 8.312525267094385e-06, + "loss": 1.8388, + "step": 24418 + }, + { + "epoch": 1.6388376228985604, + "grad_norm": 3.7583119869232178, + "learning_rate": 8.306525393761095e-06, + "loss": 1.7132, + "step": 24420 + }, + { + "epoch": 1.6389718465823293, + "grad_norm": 4.224813938140869, + "learning_rate": 8.300527490362476e-06, + "loss": 1.9105, + "step": 24422 + }, + { + "epoch": 1.6391060702660984, + "grad_norm": 4.670845985412598, + "learning_rate": 8.294531557181945e-06, + "loss": 2.0686, + "step": 24424 + }, + { + "epoch": 1.6392402939498676, + "grad_norm": 4.190891265869141, + "learning_rate": 8.288537594502788e-06, + "loss": 1.7552, + "step": 24426 + }, + { + "epoch": 1.6393745176336365, + "grad_norm": 4.143064022064209, + "learning_rate": 8.282545602608211e-06, + "loss": 1.8373, + "step": 24428 + }, + { + "epoch": 1.6395087413174054, + "grad_norm": 4.00814962387085, + "learning_rate": 8.276555581781303e-06, + "loss": 1.9374, + "step": 24430 + }, + { + "epoch": 1.6396429650011743, + "grad_norm": 4.046620845794678, + "learning_rate": 8.270567532305118e-06, + "loss": 1.8232, + "step": 24432 + }, + { + "epoch": 1.6397771886849435, + "grad_norm": 3.849466323852539, + "learning_rate": 8.264581454462555e-06, + "loss": 1.8425, + "step": 24434 + }, + { + "epoch": 1.6399114123687124, + "grad_norm": 3.6880440711975098, + "learning_rate": 8.25859734853645e-06, + "loss": 1.6817, + "step": 24436 + }, + { + "epoch": 1.6400456360524815, + "grad_norm": 4.266762733459473, + "learning_rate": 8.252615214809528e-06, + "loss": 2.0479, + "step": 24438 + }, + { + "epoch": 1.6401798597362505, + "grad_norm": 3.920156955718994, + "learning_rate": 8.246635053564455e-06, + "loss": 1.9762, + "step": 24440 + }, + { + "epoch": 1.6403140834200194, + "grad_norm": 4.424656391143799, + "learning_rate": 8.240656865083757e-06, + "loss": 1.7948, + "step": 24442 + }, + { + "epoch": 1.6404483071037883, + "grad_norm": 3.9115686416625977, + "learning_rate": 8.234680649649935e-06, + "loss": 1.8597, + "step": 24444 + }, + { + "epoch": 1.6405825307875574, + "grad_norm": 3.7164101600646973, + "learning_rate": 8.228706407545306e-06, + "loss": 1.8807, + "step": 24446 + }, + { + "epoch": 1.6407167544713266, + "grad_norm": 3.9645774364471436, + "learning_rate": 8.222734139052152e-06, + "loss": 2.1478, + "step": 24448 + }, + { + "epoch": 1.6408509781550955, + "grad_norm": 3.8632192611694336, + "learning_rate": 8.21676384445267e-06, + "loss": 1.7596, + "step": 24450 + }, + { + "epoch": 1.6409852018388644, + "grad_norm": 3.8814756870269775, + "learning_rate": 8.210795524028924e-06, + "loss": 2.0354, + "step": 24452 + }, + { + "epoch": 1.6411194255226333, + "grad_norm": 4.078732490539551, + "learning_rate": 8.204829178062923e-06, + "loss": 1.9094, + "step": 24454 + }, + { + "epoch": 1.6412536492064025, + "grad_norm": 4.126091003417969, + "learning_rate": 8.198864806836553e-06, + "loss": 1.8753, + "step": 24456 + }, + { + "epoch": 1.6413878728901716, + "grad_norm": 4.139654159545898, + "learning_rate": 8.192902410631632e-06, + "loss": 1.6441, + "step": 24458 + }, + { + "epoch": 1.6415220965739405, + "grad_norm": 4.106915473937988, + "learning_rate": 8.18694198972984e-06, + "loss": 1.8082, + "step": 24460 + }, + { + "epoch": 1.6416563202577095, + "grad_norm": 4.288426876068115, + "learning_rate": 8.180983544412834e-06, + "loss": 1.947, + "step": 24462 + }, + { + "epoch": 1.6417905439414784, + "grad_norm": 3.562502861022949, + "learning_rate": 8.175027074962122e-06, + "loss": 1.685, + "step": 24464 + }, + { + "epoch": 1.6419247676252475, + "grad_norm": 4.417750835418701, + "learning_rate": 8.169072581659137e-06, + "loss": 1.8929, + "step": 24466 + }, + { + "epoch": 1.6420589913090164, + "grad_norm": 4.459373474121094, + "learning_rate": 8.163120064785201e-06, + "loss": 1.7758, + "step": 24468 + }, + { + "epoch": 1.6421932149927856, + "grad_norm": 3.647043466567993, + "learning_rate": 8.157169524621583e-06, + "loss": 1.9746, + "step": 24470 + }, + { + "epoch": 1.6423274386765545, + "grad_norm": 4.464458465576172, + "learning_rate": 8.15122096144943e-06, + "loss": 1.9581, + "step": 24472 + }, + { + "epoch": 1.6424616623603234, + "grad_norm": 3.5758888721466064, + "learning_rate": 8.145274375549794e-06, + "loss": 1.6449, + "step": 24474 + }, + { + "epoch": 1.6425958860440923, + "grad_norm": 3.661257266998291, + "learning_rate": 8.139329767203624e-06, + "loss": 1.7171, + "step": 24476 + }, + { + "epoch": 1.6427301097278615, + "grad_norm": 4.326199054718018, + "learning_rate": 8.133387136691822e-06, + "loss": 1.935, + "step": 24478 + }, + { + "epoch": 1.6428643334116306, + "grad_norm": 4.288081169128418, + "learning_rate": 8.127446484295137e-06, + "loss": 1.79, + "step": 24480 + }, + { + "epoch": 1.6429985570953995, + "grad_norm": 3.723392963409424, + "learning_rate": 8.121507810294276e-06, + "loss": 1.6388, + "step": 24482 + }, + { + "epoch": 1.6431327807791685, + "grad_norm": 3.922630786895752, + "learning_rate": 8.115571114969828e-06, + "loss": 1.8699, + "step": 24484 + }, + { + "epoch": 1.6432670044629374, + "grad_norm": 4.350780487060547, + "learning_rate": 8.109636398602276e-06, + "loss": 2.0387, + "step": 24486 + }, + { + "epoch": 1.6434012281467065, + "grad_norm": 4.061994552612305, + "learning_rate": 8.103703661472023e-06, + "loss": 1.8686, + "step": 24488 + }, + { + "epoch": 1.6435354518304754, + "grad_norm": 4.486459255218506, + "learning_rate": 8.09777290385939e-06, + "loss": 2.0712, + "step": 24490 + }, + { + "epoch": 1.6436696755142446, + "grad_norm": 4.026636600494385, + "learning_rate": 8.091844126044601e-06, + "loss": 2.0943, + "step": 24492 + }, + { + "epoch": 1.6438038991980135, + "grad_norm": 4.204600811004639, + "learning_rate": 8.085917328307763e-06, + "loss": 2.0702, + "step": 24494 + }, + { + "epoch": 1.6439381228817824, + "grad_norm": 4.539614677429199, + "learning_rate": 8.079992510928897e-06, + "loss": 1.8679, + "step": 24496 + }, + { + "epoch": 1.6440723465655513, + "grad_norm": 4.374459266662598, + "learning_rate": 8.074069674187968e-06, + "loss": 1.924, + "step": 24498 + }, + { + "epoch": 1.6442065702493205, + "grad_norm": 3.0263898372650146, + "learning_rate": 8.068148818364802e-06, + "loss": 1.6208, + "step": 24500 + }, + { + "epoch": 1.6443407939330896, + "grad_norm": 3.795910358428955, + "learning_rate": 8.062229943739148e-06, + "loss": 1.783, + "step": 24502 + }, + { + "epoch": 1.6444750176168585, + "grad_norm": 4.561002254486084, + "learning_rate": 8.056313050590652e-06, + "loss": 1.8772, + "step": 24504 + }, + { + "epoch": 1.6446092413006275, + "grad_norm": 3.837731122970581, + "learning_rate": 8.050398139198895e-06, + "loss": 1.8186, + "step": 24506 + }, + { + "epoch": 1.6447434649843964, + "grad_norm": 4.760197162628174, + "learning_rate": 8.044485209843327e-06, + "loss": 1.8245, + "step": 24508 + }, + { + "epoch": 1.6448776886681655, + "grad_norm": 4.288637638092041, + "learning_rate": 8.03857426280334e-06, + "loss": 2.0293, + "step": 24510 + }, + { + "epoch": 1.6450119123519344, + "grad_norm": 3.9137425422668457, + "learning_rate": 8.032665298358205e-06, + "loss": 1.8833, + "step": 24512 + }, + { + "epoch": 1.6451461360357036, + "grad_norm": 3.8320629596710205, + "learning_rate": 8.026758316787108e-06, + "loss": 1.8452, + "step": 24514 + }, + { + "epoch": 1.6452803597194725, + "grad_norm": 4.095310211181641, + "learning_rate": 8.020853318369149e-06, + "loss": 1.8578, + "step": 24516 + }, + { + "epoch": 1.6454145834032414, + "grad_norm": 5.461511135101318, + "learning_rate": 8.014950303383305e-06, + "loss": 1.7416, + "step": 24518 + }, + { + "epoch": 1.6455488070870103, + "grad_norm": 4.440298557281494, + "learning_rate": 8.009049272108516e-06, + "loss": 1.7987, + "step": 24520 + }, + { + "epoch": 1.6456830307707795, + "grad_norm": 4.309885025024414, + "learning_rate": 8.003150224823574e-06, + "loss": 2.0231, + "step": 24522 + }, + { + "epoch": 1.6458172544545486, + "grad_norm": 3.5319488048553467, + "learning_rate": 7.997253161807205e-06, + "loss": 1.6385, + "step": 24524 + }, + { + "epoch": 1.6459514781383175, + "grad_norm": 4.1701531410217285, + "learning_rate": 7.991358083338013e-06, + "loss": 1.8081, + "step": 24526 + }, + { + "epoch": 1.6460857018220865, + "grad_norm": 3.7007296085357666, + "learning_rate": 7.985464989694558e-06, + "loss": 1.9711, + "step": 24528 + }, + { + "epoch": 1.6462199255058554, + "grad_norm": 4.254522323608398, + "learning_rate": 7.979573881155261e-06, + "loss": 1.8807, + "step": 24530 + }, + { + "epoch": 1.6463541491896245, + "grad_norm": 4.379473686218262, + "learning_rate": 7.973684757998472e-06, + "loss": 2.1311, + "step": 24532 + }, + { + "epoch": 1.6464883728733937, + "grad_norm": 4.6200480461120605, + "learning_rate": 7.967797620502426e-06, + "loss": 1.9557, + "step": 24534 + }, + { + "epoch": 1.6466225965571626, + "grad_norm": 4.796349048614502, + "learning_rate": 7.961912468945304e-06, + "loss": 1.9399, + "step": 24536 + }, + { + "epoch": 1.6467568202409315, + "grad_norm": 4.275684833526611, + "learning_rate": 7.956029303605134e-06, + "loss": 1.8755, + "step": 24538 + }, + { + "epoch": 1.6468910439247004, + "grad_norm": 3.7303786277770996, + "learning_rate": 7.950148124759932e-06, + "loss": 1.7197, + "step": 24540 + }, + { + "epoch": 1.6470252676084696, + "grad_norm": 4.931495189666748, + "learning_rate": 7.944268932687521e-06, + "loss": 2.0376, + "step": 24542 + }, + { + "epoch": 1.6471594912922385, + "grad_norm": 3.9412033557891846, + "learning_rate": 7.938391727665712e-06, + "loss": 1.8933, + "step": 24544 + }, + { + "epoch": 1.6472937149760076, + "grad_norm": 4.165396213531494, + "learning_rate": 7.93251650997217e-06, + "loss": 1.8163, + "step": 24546 + }, + { + "epoch": 1.6474279386597765, + "grad_norm": 3.743204355239868, + "learning_rate": 7.926643279884521e-06, + "loss": 2.1269, + "step": 24548 + }, + { + "epoch": 1.6475621623435455, + "grad_norm": 3.5531158447265625, + "learning_rate": 7.920772037680236e-06, + "loss": 1.9859, + "step": 24550 + }, + { + "epoch": 1.6476963860273144, + "grad_norm": 4.023550510406494, + "learning_rate": 7.914902783636729e-06, + "loss": 1.9278, + "step": 24552 + }, + { + "epoch": 1.6478306097110835, + "grad_norm": 3.8435795307159424, + "learning_rate": 7.909035518031299e-06, + "loss": 1.7653, + "step": 24554 + }, + { + "epoch": 1.6479648333948527, + "grad_norm": 4.146785736083984, + "learning_rate": 7.903170241141178e-06, + "loss": 1.7958, + "step": 24556 + }, + { + "epoch": 1.6480990570786216, + "grad_norm": 3.6919362545013428, + "learning_rate": 7.897306953243489e-06, + "loss": 1.8341, + "step": 24558 + }, + { + "epoch": 1.6482332807623905, + "grad_norm": 3.5586771965026855, + "learning_rate": 7.891445654615254e-06, + "loss": 1.7886, + "step": 24560 + }, + { + "epoch": 1.6483675044461594, + "grad_norm": 4.004719257354736, + "learning_rate": 7.885586345533397e-06, + "loss": 1.858, + "step": 24562 + }, + { + "epoch": 1.6485017281299286, + "grad_norm": 4.031523704528809, + "learning_rate": 7.879729026274779e-06, + "loss": 1.8508, + "step": 24564 + }, + { + "epoch": 1.6486359518136975, + "grad_norm": 4.7135443687438965, + "learning_rate": 7.873873697116129e-06, + "loss": 1.7596, + "step": 24566 + }, + { + "epoch": 1.6487701754974666, + "grad_norm": 3.933234453201294, + "learning_rate": 7.868020358334139e-06, + "loss": 1.8418, + "step": 24568 + }, + { + "epoch": 1.6489043991812355, + "grad_norm": 4.323208332061768, + "learning_rate": 7.862169010205306e-06, + "loss": 1.844, + "step": 24570 + }, + { + "epoch": 1.6490386228650045, + "grad_norm": 4.020999431610107, + "learning_rate": 7.856319653006144e-06, + "loss": 1.7128, + "step": 24572 + }, + { + "epoch": 1.6491728465487734, + "grad_norm": 3.9219934940338135, + "learning_rate": 7.850472287012988e-06, + "loss": 1.7574, + "step": 24574 + }, + { + "epoch": 1.6493070702325425, + "grad_norm": 4.124209880828857, + "learning_rate": 7.84462691250215e-06, + "loss": 2.0244, + "step": 24576 + }, + { + "epoch": 1.6494412939163117, + "grad_norm": 4.396857738494873, + "learning_rate": 7.8387835297498e-06, + "loss": 1.9249, + "step": 24578 + }, + { + "epoch": 1.6495755176000806, + "grad_norm": 4.1994404792785645, + "learning_rate": 7.832942139032012e-06, + "loss": 1.9834, + "step": 24580 + }, + { + "epoch": 1.6497097412838495, + "grad_norm": 3.430199384689331, + "learning_rate": 7.827102740624798e-06, + "loss": 1.7229, + "step": 24582 + }, + { + "epoch": 1.6498439649676184, + "grad_norm": 4.154445648193359, + "learning_rate": 7.821265334804028e-06, + "loss": 1.8386, + "step": 24584 + }, + { + "epoch": 1.6499781886513876, + "grad_norm": 3.4206159114837646, + "learning_rate": 7.815429921845553e-06, + "loss": 1.5905, + "step": 24586 + }, + { + "epoch": 1.6501124123351565, + "grad_norm": 3.4942054748535156, + "learning_rate": 7.809596502025057e-06, + "loss": 1.9067, + "step": 24588 + }, + { + "epoch": 1.6502466360189256, + "grad_norm": 3.709040880203247, + "learning_rate": 7.803765075618164e-06, + "loss": 1.7896, + "step": 24590 + }, + { + "epoch": 1.6503808597026945, + "grad_norm": 5.849874019622803, + "learning_rate": 7.797935642900378e-06, + "loss": 1.882, + "step": 24592 + }, + { + "epoch": 1.6505150833864635, + "grad_norm": 3.7086386680603027, + "learning_rate": 7.792108204147164e-06, + "loss": 1.8368, + "step": 24594 + }, + { + "epoch": 1.6506493070702324, + "grad_norm": 4.740914821624756, + "learning_rate": 7.786282759633839e-06, + "loss": 1.7525, + "step": 24596 + }, + { + "epoch": 1.6507835307540015, + "grad_norm": 4.292285919189453, + "learning_rate": 7.78045930963564e-06, + "loss": 2.0717, + "step": 24598 + }, + { + "epoch": 1.6509177544377707, + "grad_norm": 4.125484943389893, + "learning_rate": 7.774637854427707e-06, + "loss": 1.7933, + "step": 24600 + }, + { + "epoch": 1.6510519781215396, + "grad_norm": 3.7984185218811035, + "learning_rate": 7.768818394285116e-06, + "loss": 1.7402, + "step": 24602 + }, + { + "epoch": 1.6511862018053085, + "grad_norm": 4.067319393157959, + "learning_rate": 7.7630009294828e-06, + "loss": 1.6598, + "step": 24604 + }, + { + "epoch": 1.6513204254890774, + "grad_norm": 3.0670177936553955, + "learning_rate": 7.757185460295657e-06, + "loss": 1.4728, + "step": 24606 + }, + { + "epoch": 1.6514546491728466, + "grad_norm": 4.632412433624268, + "learning_rate": 7.751371986998412e-06, + "loss": 1.8538, + "step": 24608 + }, + { + "epoch": 1.6515888728566157, + "grad_norm": 3.8157153129577637, + "learning_rate": 7.745560509865773e-06, + "loss": 1.8081, + "step": 24610 + }, + { + "epoch": 1.6517230965403846, + "grad_norm": 3.917832136154175, + "learning_rate": 7.739751029172298e-06, + "loss": 1.9208, + "step": 24612 + }, + { + "epoch": 1.6518573202241535, + "grad_norm": 4.223713397979736, + "learning_rate": 7.7339435451925e-06, + "loss": 1.8532, + "step": 24614 + }, + { + "epoch": 1.6519915439079225, + "grad_norm": 4.689062595367432, + "learning_rate": 7.728138058200757e-06, + "loss": 1.9138, + "step": 24616 + }, + { + "epoch": 1.6521257675916916, + "grad_norm": 3.9975767135620117, + "learning_rate": 7.722334568471362e-06, + "loss": 1.7423, + "step": 24618 + }, + { + "epoch": 1.6522599912754605, + "grad_norm": 4.262695789337158, + "learning_rate": 7.716533076278515e-06, + "loss": 1.7396, + "step": 24620 + }, + { + "epoch": 1.6523942149592297, + "grad_norm": 4.432191371917725, + "learning_rate": 7.710733581896345e-06, + "loss": 1.8207, + "step": 24622 + }, + { + "epoch": 1.6525284386429986, + "grad_norm": 3.815300226211548, + "learning_rate": 7.704936085598852e-06, + "loss": 1.9006, + "step": 24624 + }, + { + "epoch": 1.6526626623267675, + "grad_norm": 4.3718953132629395, + "learning_rate": 7.699140587659965e-06, + "loss": 2.0909, + "step": 24626 + }, + { + "epoch": 1.6527968860105364, + "grad_norm": 3.9519622325897217, + "learning_rate": 7.693347088353487e-06, + "loss": 1.7606, + "step": 24628 + }, + { + "epoch": 1.6529311096943056, + "grad_norm": 4.101098537445068, + "learning_rate": 7.687555587953176e-06, + "loss": 1.8181, + "step": 24630 + }, + { + "epoch": 1.6530653333780747, + "grad_norm": 4.584951400756836, + "learning_rate": 7.681766086732655e-06, + "loss": 1.7347, + "step": 24632 + }, + { + "epoch": 1.6531995570618436, + "grad_norm": 4.0728440284729, + "learning_rate": 7.675978584965482e-06, + "loss": 1.8259, + "step": 24634 + }, + { + "epoch": 1.6533337807456125, + "grad_norm": 4.248436450958252, + "learning_rate": 7.670193082925092e-06, + "loss": 1.8926, + "step": 24636 + }, + { + "epoch": 1.6534680044293815, + "grad_norm": 3.695108652114868, + "learning_rate": 7.66440958088484e-06, + "loss": 1.8687, + "step": 24638 + }, + { + "epoch": 1.6536022281131506, + "grad_norm": 4.256385803222656, + "learning_rate": 7.658628079117974e-06, + "loss": 1.8555, + "step": 24640 + }, + { + "epoch": 1.6537364517969195, + "grad_norm": 3.69364070892334, + "learning_rate": 7.652848577897681e-06, + "loss": 1.7615, + "step": 24642 + }, + { + "epoch": 1.6538706754806887, + "grad_norm": 4.284909248352051, + "learning_rate": 7.647071077497025e-06, + "loss": 1.9991, + "step": 24644 + }, + { + "epoch": 1.6540048991644576, + "grad_norm": 3.9505653381347656, + "learning_rate": 7.641295578188973e-06, + "loss": 2.0984, + "step": 24646 + }, + { + "epoch": 1.6541391228482265, + "grad_norm": 3.488375663757324, + "learning_rate": 7.635522080246404e-06, + "loss": 1.6255, + "step": 24648 + }, + { + "epoch": 1.6542733465319954, + "grad_norm": 3.5695672035217285, + "learning_rate": 7.62975058394212e-06, + "loss": 1.6253, + "step": 24650 + }, + { + "epoch": 1.6544075702157646, + "grad_norm": 4.163980007171631, + "learning_rate": 7.623981089548798e-06, + "loss": 1.9496, + "step": 24652 + }, + { + "epoch": 1.6545417938995337, + "grad_norm": 4.294525623321533, + "learning_rate": 7.618213597339046e-06, + "loss": 2.1091, + "step": 24654 + }, + { + "epoch": 1.6546760175833026, + "grad_norm": 3.8056933879852295, + "learning_rate": 7.612448107585363e-06, + "loss": 1.924, + "step": 24656 + }, + { + "epoch": 1.6548102412670715, + "grad_norm": 4.301234722137451, + "learning_rate": 7.6066846205601396e-06, + "loss": 1.9634, + "step": 24658 + }, + { + "epoch": 1.6549444649508405, + "grad_norm": 3.7825074195861816, + "learning_rate": 7.600923136535715e-06, + "loss": 2.1196, + "step": 24660 + }, + { + "epoch": 1.6550786886346096, + "grad_norm": 4.169075012207031, + "learning_rate": 7.595163655784293e-06, + "loss": 2.0305, + "step": 24662 + }, + { + "epoch": 1.6552129123183785, + "grad_norm": 4.019517421722412, + "learning_rate": 7.589406178578018e-06, + "loss": 1.8703, + "step": 24664 + }, + { + "epoch": 1.6553471360021477, + "grad_norm": 4.024957656860352, + "learning_rate": 7.5836507051888885e-06, + "loss": 1.7984, + "step": 24666 + }, + { + "epoch": 1.6554813596859166, + "grad_norm": 3.7470173835754395, + "learning_rate": 7.577897235888865e-06, + "loss": 1.9658, + "step": 24668 + }, + { + "epoch": 1.6556155833696855, + "grad_norm": 4.326137542724609, + "learning_rate": 7.5721457709497625e-06, + "loss": 2.4967, + "step": 24670 + }, + { + "epoch": 1.6557498070534544, + "grad_norm": 4.118827819824219, + "learning_rate": 7.566396310643353e-06, + "loss": 1.9428, + "step": 24672 + }, + { + "epoch": 1.6558840307372236, + "grad_norm": 4.531857013702393, + "learning_rate": 7.560648855241276e-06, + "loss": 1.9819, + "step": 24674 + }, + { + "epoch": 1.6560182544209927, + "grad_norm": 4.097122669219971, + "learning_rate": 7.5549034050150906e-06, + "loss": 1.9783, + "step": 24676 + }, + { + "epoch": 1.6561524781047616, + "grad_norm": 3.739086389541626, + "learning_rate": 7.549159960236241e-06, + "loss": 1.7554, + "step": 24678 + }, + { + "epoch": 1.6562867017885305, + "grad_norm": 4.396509647369385, + "learning_rate": 7.543418521176121e-06, + "loss": 1.6852, + "step": 24680 + }, + { + "epoch": 1.6564209254722995, + "grad_norm": 4.14329195022583, + "learning_rate": 7.537679088105987e-06, + "loss": 1.9261, + "step": 24682 + }, + { + "epoch": 1.6565551491560686, + "grad_norm": 4.383894443511963, + "learning_rate": 7.531941661297015e-06, + "loss": 2.1173, + "step": 24684 + }, + { + "epoch": 1.6566893728398377, + "grad_norm": 4.115745544433594, + "learning_rate": 7.526206241020278e-06, + "loss": 1.8871, + "step": 24686 + }, + { + "epoch": 1.6568235965236067, + "grad_norm": 3.9340457916259766, + "learning_rate": 7.520472827546793e-06, + "loss": 1.8661, + "step": 24688 + }, + { + "epoch": 1.6569578202073756, + "grad_norm": 4.250241279602051, + "learning_rate": 7.514741421147415e-06, + "loss": 1.6644, + "step": 24690 + }, + { + "epoch": 1.6570920438911445, + "grad_norm": 4.502065658569336, + "learning_rate": 7.509012022092993e-06, + "loss": 2.1101, + "step": 24692 + }, + { + "epoch": 1.6572262675749136, + "grad_norm": 4.249234199523926, + "learning_rate": 7.503284630654167e-06, + "loss": 2.0527, + "step": 24694 + }, + { + "epoch": 1.6573604912586826, + "grad_norm": 4.018184185028076, + "learning_rate": 7.497559247101593e-06, + "loss": 1.8272, + "step": 24696 + }, + { + "epoch": 1.6574947149424517, + "grad_norm": 3.4740095138549805, + "learning_rate": 7.491835871705749e-06, + "loss": 1.7424, + "step": 24698 + }, + { + "epoch": 1.6576289386262206, + "grad_norm": 4.125518798828125, + "learning_rate": 7.486114504737091e-06, + "loss": 2.1554, + "step": 24700 + }, + { + "epoch": 1.6577631623099895, + "grad_norm": 4.194101333618164, + "learning_rate": 7.4803951464659185e-06, + "loss": 1.955, + "step": 24702 + }, + { + "epoch": 1.6578973859937585, + "grad_norm": 4.6957292556762695, + "learning_rate": 7.4746777971624605e-06, + "loss": 1.9083, + "step": 24704 + }, + { + "epoch": 1.6580316096775276, + "grad_norm": 3.7058866024017334, + "learning_rate": 7.468962457096845e-06, + "loss": 1.7652, + "step": 24706 + }, + { + "epoch": 1.6581658333612967, + "grad_norm": 3.632178544998169, + "learning_rate": 7.4632491265391344e-06, + "loss": 1.8073, + "step": 24708 + }, + { + "epoch": 1.6583000570450657, + "grad_norm": 3.78330135345459, + "learning_rate": 7.4575378057592574e-06, + "loss": 1.675, + "step": 24710 + }, + { + "epoch": 1.6584342807288346, + "grad_norm": 4.044266223907471, + "learning_rate": 7.451828495027058e-06, + "loss": 1.8967, + "step": 24712 + }, + { + "epoch": 1.6585685044126035, + "grad_norm": 3.939124822616577, + "learning_rate": 7.446121194612288e-06, + "loss": 1.8065, + "step": 24714 + }, + { + "epoch": 1.6587027280963726, + "grad_norm": 3.6818833351135254, + "learning_rate": 7.440415904784625e-06, + "loss": 2.0056, + "step": 24716 + }, + { + "epoch": 1.6588369517801416, + "grad_norm": 4.247195720672607, + "learning_rate": 7.434712625813605e-06, + "loss": 2.0163, + "step": 24718 + }, + { + "epoch": 1.6589711754639107, + "grad_norm": 4.414757251739502, + "learning_rate": 7.4290113579687405e-06, + "loss": 1.8876, + "step": 24720 + }, + { + "epoch": 1.6591053991476796, + "grad_norm": 4.937116622924805, + "learning_rate": 7.4233121015193586e-06, + "loss": 2.0257, + "step": 24722 + }, + { + "epoch": 1.6592396228314485, + "grad_norm": 4.099129676818848, + "learning_rate": 7.417614856734751e-06, + "loss": 1.9417, + "step": 24724 + }, + { + "epoch": 1.6593738465152175, + "grad_norm": 4.333894729614258, + "learning_rate": 7.411919623884117e-06, + "loss": 1.6324, + "step": 24726 + }, + { + "epoch": 1.6595080701989866, + "grad_norm": 3.7578041553497314, + "learning_rate": 7.406226403236522e-06, + "loss": 1.7203, + "step": 24728 + }, + { + "epoch": 1.6596422938827557, + "grad_norm": 4.085031509399414, + "learning_rate": 7.400535195060998e-06, + "loss": 1.7907, + "step": 24730 + }, + { + "epoch": 1.6597765175665247, + "grad_norm": 3.6348209381103516, + "learning_rate": 7.394845999626393e-06, + "loss": 1.5928, + "step": 24732 + }, + { + "epoch": 1.6599107412502936, + "grad_norm": 4.637832164764404, + "learning_rate": 7.389158817201542e-06, + "loss": 2.08, + "step": 24734 + }, + { + "epoch": 1.6600449649340625, + "grad_norm": 4.752547740936279, + "learning_rate": 7.383473648055139e-06, + "loss": 1.7087, + "step": 24736 + }, + { + "epoch": 1.6601791886178316, + "grad_norm": 4.344281196594238, + "learning_rate": 7.377790492455816e-06, + "loss": 2.1875, + "step": 24738 + }, + { + "epoch": 1.6603134123016006, + "grad_norm": 4.126250743865967, + "learning_rate": 7.3721093506720795e-06, + "loss": 1.6654, + "step": 24740 + }, + { + "epoch": 1.6604476359853697, + "grad_norm": 4.280674457550049, + "learning_rate": 7.366430222972353e-06, + "loss": 1.692, + "step": 24742 + }, + { + "epoch": 1.6605818596691386, + "grad_norm": 4.037924766540527, + "learning_rate": 7.360753109624952e-06, + "loss": 1.7692, + "step": 24744 + }, + { + "epoch": 1.6607160833529075, + "grad_norm": 3.702617645263672, + "learning_rate": 7.355078010898131e-06, + "loss": 2.0941, + "step": 24746 + }, + { + "epoch": 1.6608503070366765, + "grad_norm": 4.1011247634887695, + "learning_rate": 7.349404927060011e-06, + "loss": 1.8353, + "step": 24748 + }, + { + "epoch": 1.6609845307204456, + "grad_norm": 5.468338489532471, + "learning_rate": 7.343733858378643e-06, + "loss": 1.9104, + "step": 24750 + }, + { + "epoch": 1.6611187544042147, + "grad_norm": 3.952716112136841, + "learning_rate": 7.338064805121964e-06, + "loss": 1.7464, + "step": 24752 + }, + { + "epoch": 1.6612529780879837, + "grad_norm": 4.30949068069458, + "learning_rate": 7.332397767557836e-06, + "loss": 1.9567, + "step": 24754 + }, + { + "epoch": 1.6613872017717526, + "grad_norm": 3.9154632091522217, + "learning_rate": 7.3267327459540015e-06, + "loss": 1.6513, + "step": 24756 + }, + { + "epoch": 1.6615214254555215, + "grad_norm": 4.075239658355713, + "learning_rate": 7.321069740578157e-06, + "loss": 1.6558, + "step": 24758 + }, + { + "epoch": 1.6616556491392906, + "grad_norm": 3.9193296432495117, + "learning_rate": 7.315408751697817e-06, + "loss": 2.0102, + "step": 24760 + }, + { + "epoch": 1.6617898728230598, + "grad_norm": 4.1652727127075195, + "learning_rate": 7.309749779580494e-06, + "loss": 2.1737, + "step": 24762 + }, + { + "epoch": 1.6619240965068287, + "grad_norm": 4.1979169845581055, + "learning_rate": 7.30409282449353e-06, + "loss": 1.8774, + "step": 24764 + }, + { + "epoch": 1.6620583201905976, + "grad_norm": 3.813102960586548, + "learning_rate": 7.298437886704235e-06, + "loss": 1.8244, + "step": 24766 + }, + { + "epoch": 1.6621925438743665, + "grad_norm": 3.740065336227417, + "learning_rate": 7.292784966479782e-06, + "loss": 1.6827, + "step": 24768 + }, + { + "epoch": 1.6623267675581357, + "grad_norm": 4.009589672088623, + "learning_rate": 7.287134064087259e-06, + "loss": 1.9001, + "step": 24770 + }, + { + "epoch": 1.6624609912419046, + "grad_norm": 4.077671527862549, + "learning_rate": 7.2814851797936455e-06, + "loss": 1.8861, + "step": 24772 + }, + { + "epoch": 1.6625952149256737, + "grad_norm": 3.836043119430542, + "learning_rate": 7.275838313865874e-06, + "loss": 1.9015, + "step": 24774 + }, + { + "epoch": 1.6627294386094427, + "grad_norm": 3.8912112712860107, + "learning_rate": 7.270193466570724e-06, + "loss": 1.7119, + "step": 24776 + }, + { + "epoch": 1.6628636622932116, + "grad_norm": 4.075517654418945, + "learning_rate": 7.264550638174905e-06, + "loss": 1.5998, + "step": 24778 + }, + { + "epoch": 1.6629978859769805, + "grad_norm": 3.862239122390747, + "learning_rate": 7.258909828945027e-06, + "loss": 1.8949, + "step": 24780 + }, + { + "epoch": 1.6631321096607496, + "grad_norm": 3.9482288360595703, + "learning_rate": 7.2532710391476185e-06, + "loss": 1.881, + "step": 24782 + }, + { + "epoch": 1.6632663333445188, + "grad_norm": 3.5666770935058594, + "learning_rate": 7.247634269049092e-06, + "loss": 1.8697, + "step": 24784 + }, + { + "epoch": 1.6634005570282877, + "grad_norm": 4.566192150115967, + "learning_rate": 7.241999518915793e-06, + "loss": 1.9131, + "step": 24786 + }, + { + "epoch": 1.6635347807120566, + "grad_norm": 4.620408535003662, + "learning_rate": 7.2363667890139265e-06, + "loss": 1.8299, + "step": 24788 + }, + { + "epoch": 1.6636690043958255, + "grad_norm": 5.086532115936279, + "learning_rate": 7.230736079609629e-06, + "loss": 1.9744, + "step": 24790 + }, + { + "epoch": 1.6638032280795947, + "grad_norm": 3.771314859390259, + "learning_rate": 7.22510739096896e-06, + "loss": 2.0339, + "step": 24792 + }, + { + "epoch": 1.6639374517633636, + "grad_norm": 3.949434518814087, + "learning_rate": 7.219480723357841e-06, + "loss": 1.788, + "step": 24794 + }, + { + "epoch": 1.6640716754471327, + "grad_norm": 4.213751792907715, + "learning_rate": 7.213856077042147e-06, + "loss": 1.7673, + "step": 24796 + }, + { + "epoch": 1.6642058991309017, + "grad_norm": 4.799483776092529, + "learning_rate": 7.208233452287616e-06, + "loss": 1.7762, + "step": 24798 + }, + { + "epoch": 1.6643401228146706, + "grad_norm": 4.887887001037598, + "learning_rate": 7.202612849359913e-06, + "loss": 1.6988, + "step": 24800 + }, + { + "epoch": 1.6644743464984395, + "grad_norm": 4.183732509613037, + "learning_rate": 7.196994268524576e-06, + "loss": 1.6277, + "step": 24802 + }, + { + "epoch": 1.6646085701822086, + "grad_norm": 3.802603244781494, + "learning_rate": 7.1913777100471096e-06, + "loss": 1.7537, + "step": 24804 + }, + { + "epoch": 1.6647427938659778, + "grad_norm": 4.1214423179626465, + "learning_rate": 7.185763174192861e-06, + "loss": 1.6861, + "step": 24806 + }, + { + "epoch": 1.6648770175497467, + "grad_norm": 4.244684219360352, + "learning_rate": 7.18015066122712e-06, + "loss": 2.0918, + "step": 24808 + }, + { + "epoch": 1.6650112412335156, + "grad_norm": 3.784911632537842, + "learning_rate": 7.174540171415039e-06, + "loss": 1.8691, + "step": 24810 + }, + { + "epoch": 1.6651454649172845, + "grad_norm": 3.923124313354492, + "learning_rate": 7.168931705021737e-06, + "loss": 1.9859, + "step": 24812 + }, + { + "epoch": 1.6652796886010537, + "grad_norm": 3.8016045093536377, + "learning_rate": 7.163325262312176e-06, + "loss": 1.9522, + "step": 24814 + }, + { + "epoch": 1.6654139122848226, + "grad_norm": 3.937640428543091, + "learning_rate": 7.157720843551286e-06, + "loss": 1.7335, + "step": 24816 + }, + { + "epoch": 1.6655481359685917, + "grad_norm": 4.307268142700195, + "learning_rate": 7.152118449003819e-06, + "loss": 1.9176, + "step": 24818 + }, + { + "epoch": 1.6656823596523607, + "grad_norm": 4.542794704437256, + "learning_rate": 7.146518078934505e-06, + "loss": 2.0977, + "step": 24820 + }, + { + "epoch": 1.6658165833361296, + "grad_norm": 3.6894218921661377, + "learning_rate": 7.140919733607937e-06, + "loss": 1.5647, + "step": 24822 + }, + { + "epoch": 1.6659508070198985, + "grad_norm": 4.185446739196777, + "learning_rate": 7.135323413288641e-06, + "loss": 1.9315, + "step": 24824 + }, + { + "epoch": 1.6660850307036676, + "grad_norm": 3.411898136138916, + "learning_rate": 7.1297291182410246e-06, + "loss": 1.6957, + "step": 24826 + }, + { + "epoch": 1.6662192543874368, + "grad_norm": 4.000819206237793, + "learning_rate": 7.124136848729407e-06, + "loss": 1.8832, + "step": 24828 + }, + { + "epoch": 1.6663534780712057, + "grad_norm": 4.333616256713867, + "learning_rate": 7.118546605018e-06, + "loss": 1.9448, + "step": 24830 + }, + { + "epoch": 1.6664877017549746, + "grad_norm": 4.54733419418335, + "learning_rate": 7.11295838737096e-06, + "loss": 1.7441, + "step": 24832 + }, + { + "epoch": 1.6666219254387435, + "grad_norm": 4.32701301574707, + "learning_rate": 7.107372196052298e-06, + "loss": 2.0435, + "step": 24834 + }, + { + "epoch": 1.6667561491225127, + "grad_norm": 4.23447322845459, + "learning_rate": 7.101788031325951e-06, + "loss": 1.775, + "step": 24836 + }, + { + "epoch": 1.6668903728062818, + "grad_norm": 3.8745789527893066, + "learning_rate": 7.09620589345576e-06, + "loss": 1.9724, + "step": 24838 + }, + { + "epoch": 1.6670245964900507, + "grad_norm": 3.802783966064453, + "learning_rate": 7.0906257827054786e-06, + "loss": 1.8784, + "step": 24840 + }, + { + "epoch": 1.6671588201738197, + "grad_norm": 4.365588188171387, + "learning_rate": 7.085047699338742e-06, + "loss": 2.0623, + "step": 24842 + }, + { + "epoch": 1.6672930438575886, + "grad_norm": 4.155763626098633, + "learning_rate": 7.079471643619135e-06, + "loss": 1.8445, + "step": 24844 + }, + { + "epoch": 1.6674272675413577, + "grad_norm": 3.97289776802063, + "learning_rate": 7.0738976158100715e-06, + "loss": 2.0449, + "step": 24846 + }, + { + "epoch": 1.6675614912251266, + "grad_norm": 4.230801582336426, + "learning_rate": 7.068325616174948e-06, + "loss": 1.9812, + "step": 24848 + }, + { + "epoch": 1.6676957149088958, + "grad_norm": 4.509598731994629, + "learning_rate": 7.062755644977004e-06, + "loss": 1.6184, + "step": 24850 + }, + { + "epoch": 1.6678299385926647, + "grad_norm": 4.472568988800049, + "learning_rate": 7.05718770247944e-06, + "loss": 1.8071, + "step": 24852 + }, + { + "epoch": 1.6679641622764336, + "grad_norm": 4.139092922210693, + "learning_rate": 7.051621788945306e-06, + "loss": 1.7035, + "step": 24854 + }, + { + "epoch": 1.6680983859602025, + "grad_norm": 4.275109767913818, + "learning_rate": 7.046057904637593e-06, + "loss": 1.8202, + "step": 24856 + }, + { + "epoch": 1.6682326096439717, + "grad_norm": 4.147186279296875, + "learning_rate": 7.040496049819178e-06, + "loss": 1.6417, + "step": 24858 + }, + { + "epoch": 1.6683668333277408, + "grad_norm": 4.1974945068359375, + "learning_rate": 7.034936224752836e-06, + "loss": 2.0217, + "step": 24860 + }, + { + "epoch": 1.6685010570115097, + "grad_norm": 5.3748602867126465, + "learning_rate": 7.029378429701278e-06, + "loss": 1.8588, + "step": 24862 + }, + { + "epoch": 1.6686352806952787, + "grad_norm": 3.7756175994873047, + "learning_rate": 7.023822664927099e-06, + "loss": 2.0525, + "step": 24864 + }, + { + "epoch": 1.6687695043790476, + "grad_norm": 4.098616600036621, + "learning_rate": 7.018268930692784e-06, + "loss": 1.7499, + "step": 24866 + }, + { + "epoch": 1.6689037280628167, + "grad_norm": 3.4933366775512695, + "learning_rate": 7.012717227260734e-06, + "loss": 2.0788, + "step": 24868 + }, + { + "epoch": 1.6690379517465856, + "grad_norm": 5.051976203918457, + "learning_rate": 7.007167554893274e-06, + "loss": 1.8945, + "step": 24870 + }, + { + "epoch": 1.6691721754303548, + "grad_norm": 3.99418568611145, + "learning_rate": 7.001619913852603e-06, + "loss": 1.7303, + "step": 24872 + }, + { + "epoch": 1.6693063991141237, + "grad_norm": 4.346214771270752, + "learning_rate": 6.99607430440084e-06, + "loss": 1.8173, + "step": 24874 + }, + { + "epoch": 1.6694406227978926, + "grad_norm": 4.482408046722412, + "learning_rate": 6.990530726799993e-06, + "loss": 1.9684, + "step": 24876 + }, + { + "epoch": 1.6695748464816615, + "grad_norm": 3.9493188858032227, + "learning_rate": 6.984989181312007e-06, + "loss": 2.1624, + "step": 24878 + }, + { + "epoch": 1.6697090701654307, + "grad_norm": 4.382143974304199, + "learning_rate": 6.979449668198679e-06, + "loss": 1.9838, + "step": 24880 + }, + { + "epoch": 1.6698432938491998, + "grad_norm": 4.13386869430542, + "learning_rate": 6.973912187721787e-06, + "loss": 1.7872, + "step": 24882 + }, + { + "epoch": 1.6699775175329687, + "grad_norm": 3.9835166931152344, + "learning_rate": 6.968376740142912e-06, + "loss": 1.8227, + "step": 24884 + }, + { + "epoch": 1.6701117412167377, + "grad_norm": 3.927419424057007, + "learning_rate": 6.962843325723628e-06, + "loss": 1.9272, + "step": 24886 + }, + { + "epoch": 1.6702459649005066, + "grad_norm": 4.4338860511779785, + "learning_rate": 6.957311944725359e-06, + "loss": 1.8851, + "step": 24888 + }, + { + "epoch": 1.6703801885842757, + "grad_norm": 3.5739586353302, + "learning_rate": 6.951782597409473e-06, + "loss": 1.5691, + "step": 24890 + }, + { + "epoch": 1.6705144122680446, + "grad_norm": 3.826922655105591, + "learning_rate": 6.946255284037207e-06, + "loss": 1.5983, + "step": 24892 + }, + { + "epoch": 1.6706486359518138, + "grad_norm": 4.337405204772949, + "learning_rate": 6.940730004869717e-06, + "loss": 2.03, + "step": 24894 + }, + { + "epoch": 1.6707828596355827, + "grad_norm": 3.768092393875122, + "learning_rate": 6.9352067601680506e-06, + "loss": 1.7222, + "step": 24896 + }, + { + "epoch": 1.6709170833193516, + "grad_norm": 3.6846930980682373, + "learning_rate": 6.929685550193193e-06, + "loss": 1.8332, + "step": 24898 + }, + { + "epoch": 1.6710513070031205, + "grad_norm": 4.418386459350586, + "learning_rate": 6.924166375205998e-06, + "loss": 1.8844, + "step": 24900 + }, + { + "epoch": 1.6711855306868897, + "grad_norm": 4.0619215965271, + "learning_rate": 6.918649235467234e-06, + "loss": 2.0433, + "step": 24902 + }, + { + "epoch": 1.6713197543706588, + "grad_norm": 3.7021045684814453, + "learning_rate": 6.913134131237575e-06, + "loss": 1.7542, + "step": 24904 + }, + { + "epoch": 1.6714539780544277, + "grad_norm": 4.353475093841553, + "learning_rate": 6.9076210627776085e-06, + "loss": 1.9608, + "step": 24906 + }, + { + "epoch": 1.6715882017381967, + "grad_norm": 4.338584899902344, + "learning_rate": 6.902110030347797e-06, + "loss": 2.0996, + "step": 24908 + }, + { + "epoch": 1.6717224254219656, + "grad_norm": 3.79937744140625, + "learning_rate": 6.896601034208561e-06, + "loss": 1.7985, + "step": 24910 + }, + { + "epoch": 1.6718566491057347, + "grad_norm": 4.403114318847656, + "learning_rate": 6.891094074620141e-06, + "loss": 1.6991, + "step": 24912 + }, + { + "epoch": 1.6719908727895039, + "grad_norm": 4.226890563964844, + "learning_rate": 6.88558915184277e-06, + "loss": 1.9848, + "step": 24914 + }, + { + "epoch": 1.6721250964732728, + "grad_norm": 3.5537607669830322, + "learning_rate": 6.880086266136521e-06, + "loss": 1.8734, + "step": 24916 + }, + { + "epoch": 1.6722593201570417, + "grad_norm": 4.064116954803467, + "learning_rate": 6.874585417761414e-06, + "loss": 1.7903, + "step": 24918 + }, + { + "epoch": 1.6723935438408106, + "grad_norm": 4.472997665405273, + "learning_rate": 6.869086606977349e-06, + "loss": 1.9368, + "step": 24920 + }, + { + "epoch": 1.6725277675245798, + "grad_norm": 3.652336835861206, + "learning_rate": 6.863589834044121e-06, + "loss": 1.7209, + "step": 24922 + }, + { + "epoch": 1.6726619912083487, + "grad_norm": 4.33463716506958, + "learning_rate": 6.8580950992214395e-06, + "loss": 1.7232, + "step": 24924 + }, + { + "epoch": 1.6727962148921178, + "grad_norm": 3.969252824783325, + "learning_rate": 6.852602402768943e-06, + "loss": 1.8077, + "step": 24926 + }, + { + "epoch": 1.6729304385758867, + "grad_norm": 4.995193004608154, + "learning_rate": 6.847111744946139e-06, + "loss": 2.1, + "step": 24928 + }, + { + "epoch": 1.6730646622596557, + "grad_norm": 3.8661599159240723, + "learning_rate": 6.841623126012442e-06, + "loss": 1.8289, + "step": 24930 + }, + { + "epoch": 1.6731988859434246, + "grad_norm": 4.0537190437316895, + "learning_rate": 6.83613654622719e-06, + "loss": 2.1416, + "step": 24932 + }, + { + "epoch": 1.6733331096271937, + "grad_norm": 4.726398468017578, + "learning_rate": 6.830652005849597e-06, + "loss": 2.0496, + "step": 24934 + }, + { + "epoch": 1.6734673333109629, + "grad_norm": 4.378672122955322, + "learning_rate": 6.825169505138818e-06, + "loss": 1.9414, + "step": 24936 + }, + { + "epoch": 1.6736015569947318, + "grad_norm": 3.8991410732269287, + "learning_rate": 6.819689044353877e-06, + "loss": 2.0606, + "step": 24938 + }, + { + "epoch": 1.6737357806785007, + "grad_norm": 4.338233947753906, + "learning_rate": 6.8142106237537255e-06, + "loss": 1.8039, + "step": 24940 + }, + { + "epoch": 1.6738700043622696, + "grad_norm": 4.363742351531982, + "learning_rate": 6.8087342435971815e-06, + "loss": 1.9131, + "step": 24942 + }, + { + "epoch": 1.6740042280460388, + "grad_norm": 3.5660359859466553, + "learning_rate": 6.803259904143027e-06, + "loss": 1.6854, + "step": 24944 + }, + { + "epoch": 1.6741384517298077, + "grad_norm": 3.632258176803589, + "learning_rate": 6.797787605649891e-06, + "loss": 2.0268, + "step": 24946 + }, + { + "epoch": 1.6742726754135768, + "grad_norm": 3.535168409347534, + "learning_rate": 6.792317348376348e-06, + "loss": 1.7159, + "step": 24948 + }, + { + "epoch": 1.6744068990973457, + "grad_norm": 3.8204495906829834, + "learning_rate": 6.786849132580842e-06, + "loss": 2.0926, + "step": 24950 + }, + { + "epoch": 1.6745411227811147, + "grad_norm": 3.599600315093994, + "learning_rate": 6.781382958521743e-06, + "loss": 1.8525, + "step": 24952 + }, + { + "epoch": 1.6746753464648836, + "grad_norm": 4.4158453941345215, + "learning_rate": 6.7759188264573005e-06, + "loss": 1.7704, + "step": 24954 + }, + { + "epoch": 1.6748095701486527, + "grad_norm": 3.7844691276550293, + "learning_rate": 6.770456736645708e-06, + "loss": 1.7829, + "step": 24956 + }, + { + "epoch": 1.6749437938324219, + "grad_norm": 4.078080177307129, + "learning_rate": 6.764996689345027e-06, + "loss": 1.8873, + "step": 24958 + }, + { + "epoch": 1.6750780175161908, + "grad_norm": 3.901064157485962, + "learning_rate": 6.7595386848132325e-06, + "loss": 1.8473, + "step": 24960 + }, + { + "epoch": 1.6752122411999597, + "grad_norm": 4.5233941078186035, + "learning_rate": 6.754082723308203e-06, + "loss": 2.15, + "step": 24962 + }, + { + "epoch": 1.6753464648837286, + "grad_norm": 4.5781989097595215, + "learning_rate": 6.748628805087731e-06, + "loss": 2.1758, + "step": 24964 + }, + { + "epoch": 1.6754806885674978, + "grad_norm": 4.350677967071533, + "learning_rate": 6.743176930409495e-06, + "loss": 2.0329, + "step": 24966 + }, + { + "epoch": 1.6756149122512667, + "grad_norm": 4.195610523223877, + "learning_rate": 6.737727099531094e-06, + "loss": 1.8782, + "step": 24968 + }, + { + "epoch": 1.6757491359350358, + "grad_norm": 4.617746353149414, + "learning_rate": 6.732279312709999e-06, + "loss": 1.6806, + "step": 24970 + }, + { + "epoch": 1.6758833596188047, + "grad_norm": 4.4416184425354, + "learning_rate": 6.726833570203639e-06, + "loss": 1.8445, + "step": 24972 + }, + { + "epoch": 1.6760175833025737, + "grad_norm": 4.212913513183594, + "learning_rate": 6.721389872269291e-06, + "loss": 1.9387, + "step": 24974 + }, + { + "epoch": 1.6761518069863426, + "grad_norm": 14.073760032653809, + "learning_rate": 6.715948219164175e-06, + "loss": 1.8689, + "step": 24976 + }, + { + "epoch": 1.6762860306701117, + "grad_norm": 3.9038584232330322, + "learning_rate": 6.7105086111453944e-06, + "loss": 1.79, + "step": 24978 + }, + { + "epoch": 1.6764202543538809, + "grad_norm": 3.7727749347686768, + "learning_rate": 6.705071048469952e-06, + "loss": 1.9365, + "step": 24980 + }, + { + "epoch": 1.6765544780376498, + "grad_norm": 4.462003707885742, + "learning_rate": 6.69963553139476e-06, + "loss": 1.886, + "step": 24982 + }, + { + "epoch": 1.6766887017214187, + "grad_norm": 4.191072463989258, + "learning_rate": 6.694202060176652e-06, + "loss": 1.7259, + "step": 24984 + }, + { + "epoch": 1.6768229254051876, + "grad_norm": 3.9232890605926514, + "learning_rate": 6.688770635072339e-06, + "loss": 1.7934, + "step": 24986 + }, + { + "epoch": 1.6769571490889568, + "grad_norm": 3.7366385459899902, + "learning_rate": 6.683341256338444e-06, + "loss": 1.6503, + "step": 24988 + }, + { + "epoch": 1.677091372772726, + "grad_norm": 3.8190953731536865, + "learning_rate": 6.677913924231488e-06, + "loss": 1.9285, + "step": 24990 + }, + { + "epoch": 1.6772255964564948, + "grad_norm": 4.329039096832275, + "learning_rate": 6.672488639007918e-06, + "loss": 1.9036, + "step": 24992 + }, + { + "epoch": 1.6773598201402637, + "grad_norm": 4.067795753479004, + "learning_rate": 6.667065400924066e-06, + "loss": 1.6757, + "step": 24994 + }, + { + "epoch": 1.6774940438240327, + "grad_norm": 4.337757110595703, + "learning_rate": 6.661644210236156e-06, + "loss": 1.8206, + "step": 24996 + }, + { + "epoch": 1.6776282675078018, + "grad_norm": 4.469303131103516, + "learning_rate": 6.656225067200339e-06, + "loss": 1.9872, + "step": 24998 + }, + { + "epoch": 1.6777624911915707, + "grad_norm": 4.1340460777282715, + "learning_rate": 6.650807972072648e-06, + "loss": 1.7142, + "step": 25000 + }, + { + "epoch": 1.6778967148753399, + "grad_norm": 4.339589595794678, + "learning_rate": 6.645392925109045e-06, + "loss": 1.908, + "step": 25002 + }, + { + "epoch": 1.6780309385591088, + "grad_norm": 4.3462419509887695, + "learning_rate": 6.639979926565359e-06, + "loss": 1.8606, + "step": 25004 + }, + { + "epoch": 1.6781651622428777, + "grad_norm": 4.310340881347656, + "learning_rate": 6.634568976697386e-06, + "loss": 1.8616, + "step": 25006 + }, + { + "epoch": 1.6782993859266466, + "grad_norm": 4.867284774780273, + "learning_rate": 6.629160075760726e-06, + "loss": 2.1574, + "step": 25008 + }, + { + "epoch": 1.6784336096104158, + "grad_norm": 4.504015922546387, + "learning_rate": 6.623753224010987e-06, + "loss": 2.1395, + "step": 25010 + }, + { + "epoch": 1.678567833294185, + "grad_norm": 4.262071132659912, + "learning_rate": 6.618348421703596e-06, + "loss": 1.74, + "step": 25012 + }, + { + "epoch": 1.6787020569779538, + "grad_norm": 3.917850971221924, + "learning_rate": 6.6129456690939455e-06, + "loss": 1.8433, + "step": 25014 + }, + { + "epoch": 1.6788362806617227, + "grad_norm": 4.468143939971924, + "learning_rate": 6.607544966437296e-06, + "loss": 1.8911, + "step": 25016 + }, + { + "epoch": 1.6789705043454917, + "grad_norm": 4.271687984466553, + "learning_rate": 6.602146313988822e-06, + "loss": 1.6928, + "step": 25018 + }, + { + "epoch": 1.6791047280292608, + "grad_norm": 3.5823307037353516, + "learning_rate": 6.59674971200358e-06, + "loss": 1.619, + "step": 25020 + }, + { + "epoch": 1.6792389517130297, + "grad_norm": 4.476516246795654, + "learning_rate": 6.591355160736578e-06, + "loss": 1.9078, + "step": 25022 + }, + { + "epoch": 1.6793731753967989, + "grad_norm": 3.8595833778381348, + "learning_rate": 6.5859626604426905e-06, + "loss": 1.807, + "step": 25024 + }, + { + "epoch": 1.6795073990805678, + "grad_norm": 4.629399299621582, + "learning_rate": 6.580572211376695e-06, + "loss": 2.0455, + "step": 25026 + }, + { + "epoch": 1.6796416227643367, + "grad_norm": 4.018665313720703, + "learning_rate": 6.575183813793267e-06, + "loss": 1.9543, + "step": 25028 + }, + { + "epoch": 1.6797758464481056, + "grad_norm": 3.896028995513916, + "learning_rate": 6.569797467947026e-06, + "loss": 1.8625, + "step": 25030 + }, + { + "epoch": 1.6799100701318748, + "grad_norm": 4.2413153648376465, + "learning_rate": 6.564413174092443e-06, + "loss": 2.1135, + "step": 25032 + }, + { + "epoch": 1.680044293815644, + "grad_norm": 4.137266635894775, + "learning_rate": 6.559030932483945e-06, + "loss": 2.1402, + "step": 25034 + }, + { + "epoch": 1.6801785174994128, + "grad_norm": 4.013967990875244, + "learning_rate": 6.553650743375794e-06, + "loss": 1.6337, + "step": 25036 + }, + { + "epoch": 1.6803127411831817, + "grad_norm": 3.7055206298828125, + "learning_rate": 6.548272607022221e-06, + "loss": 1.837, + "step": 25038 + }, + { + "epoch": 1.6804469648669507, + "grad_norm": 4.0318803787231445, + "learning_rate": 6.542896523677311e-06, + "loss": 2.214, + "step": 25040 + }, + { + "epoch": 1.6805811885507198, + "grad_norm": 4.034735679626465, + "learning_rate": 6.537522493595094e-06, + "loss": 2.0455, + "step": 25042 + }, + { + "epoch": 1.6807154122344887, + "grad_norm": 4.585685729980469, + "learning_rate": 6.532150517029478e-06, + "loss": 2.093, + "step": 25044 + }, + { + "epoch": 1.6808496359182579, + "grad_norm": 4.945662498474121, + "learning_rate": 6.526780594234272e-06, + "loss": 1.9507, + "step": 25046 + }, + { + "epoch": 1.6809838596020268, + "grad_norm": 4.006068229675293, + "learning_rate": 6.521412725463188e-06, + "loss": 1.7281, + "step": 25048 + }, + { + "epoch": 1.6811180832857957, + "grad_norm": 4.28605318069458, + "learning_rate": 6.516046910969864e-06, + "loss": 1.8735, + "step": 25050 + }, + { + "epoch": 1.6812523069695646, + "grad_norm": 4.008975505828857, + "learning_rate": 6.510683151007818e-06, + "loss": 1.78, + "step": 25052 + }, + { + "epoch": 1.6813865306533338, + "grad_norm": 4.311112403869629, + "learning_rate": 6.505321445830476e-06, + "loss": 1.6325, + "step": 25054 + }, + { + "epoch": 1.681520754337103, + "grad_norm": 3.9287221431732178, + "learning_rate": 6.499961795691151e-06, + "loss": 1.7633, + "step": 25056 + }, + { + "epoch": 1.6816549780208718, + "grad_norm": 4.010589122772217, + "learning_rate": 6.494604200843107e-06, + "loss": 1.9469, + "step": 25058 + }, + { + "epoch": 1.6817892017046407, + "grad_norm": 3.74045991897583, + "learning_rate": 6.489248661539454e-06, + "loss": 2.0384, + "step": 25060 + }, + { + "epoch": 1.6819234253884097, + "grad_norm": 4.53898811340332, + "learning_rate": 6.483895178033261e-06, + "loss": 2.162, + "step": 25062 + }, + { + "epoch": 1.6820576490721788, + "grad_norm": 4.294960975646973, + "learning_rate": 6.478543750577442e-06, + "loss": 1.9055, + "step": 25064 + }, + { + "epoch": 1.682191872755948, + "grad_norm": 3.982879877090454, + "learning_rate": 6.4731943794248355e-06, + "loss": 1.8384, + "step": 25066 + }, + { + "epoch": 1.6823260964397169, + "grad_norm": 4.150111675262451, + "learning_rate": 6.467847064828214e-06, + "loss": 1.8923, + "step": 25068 + }, + { + "epoch": 1.6824603201234858, + "grad_norm": 4.2800703048706055, + "learning_rate": 6.462501807040211e-06, + "loss": 1.924, + "step": 25070 + }, + { + "epoch": 1.6825945438072547, + "grad_norm": 4.238165378570557, + "learning_rate": 6.457158606313402e-06, + "loss": 1.7555, + "step": 25072 + }, + { + "epoch": 1.6827287674910236, + "grad_norm": 4.569631099700928, + "learning_rate": 6.451817462900206e-06, + "loss": 1.9793, + "step": 25074 + }, + { + "epoch": 1.6828629911747928, + "grad_norm": 3.79895281791687, + "learning_rate": 6.4464783770530055e-06, + "loss": 1.9125, + "step": 25076 + }, + { + "epoch": 1.682997214858562, + "grad_norm": 5.678162097930908, + "learning_rate": 6.441141349024055e-06, + "loss": 2.1642, + "step": 25078 + }, + { + "epoch": 1.6831314385423308, + "grad_norm": 4.44114875793457, + "learning_rate": 6.435806379065529e-06, + "loss": 2.0246, + "step": 25080 + }, + { + "epoch": 1.6832656622260997, + "grad_norm": 3.9433159828186035, + "learning_rate": 6.430473467429482e-06, + "loss": 1.6656, + "step": 25082 + }, + { + "epoch": 1.6833998859098687, + "grad_norm": 4.012415409088135, + "learning_rate": 6.4251426143678904e-06, + "loss": 1.8152, + "step": 25084 + }, + { + "epoch": 1.6835341095936378, + "grad_norm": 3.7611401081085205, + "learning_rate": 6.4198138201326145e-06, + "loss": 1.81, + "step": 25086 + }, + { + "epoch": 1.683668333277407, + "grad_norm": 4.023891925811768, + "learning_rate": 6.414487084975451e-06, + "loss": 1.6535, + "step": 25088 + }, + { + "epoch": 1.6838025569611759, + "grad_norm": 5.833304405212402, + "learning_rate": 6.409162409148062e-06, + "loss": 2.1903, + "step": 25090 + }, + { + "epoch": 1.6839367806449448, + "grad_norm": 3.905595302581787, + "learning_rate": 6.403839792902033e-06, + "loss": 1.761, + "step": 25092 + }, + { + "epoch": 1.6840710043287137, + "grad_norm": 4.104646682739258, + "learning_rate": 6.398519236488837e-06, + "loss": 1.9152, + "step": 25094 + }, + { + "epoch": 1.6842052280124828, + "grad_norm": 3.7065930366516113, + "learning_rate": 6.393200740159877e-06, + "loss": 1.5579, + "step": 25096 + }, + { + "epoch": 1.6843394516962518, + "grad_norm": 4.78115701675415, + "learning_rate": 6.387884304166419e-06, + "loss": 2.1356, + "step": 25098 + }, + { + "epoch": 1.684473675380021, + "grad_norm": 3.9216175079345703, + "learning_rate": 6.382569928759685e-06, + "loss": 1.7749, + "step": 25100 + }, + { + "epoch": 1.6846078990637898, + "grad_norm": 4.016197681427002, + "learning_rate": 6.3772576141907456e-06, + "loss": 2.0061, + "step": 25102 + }, + { + "epoch": 1.6847421227475587, + "grad_norm": 4.136026382446289, + "learning_rate": 6.371947360710606e-06, + "loss": 1.7001, + "step": 25104 + }, + { + "epoch": 1.6848763464313277, + "grad_norm": 4.247840404510498, + "learning_rate": 6.366639168570154e-06, + "loss": 1.857, + "step": 25106 + }, + { + "epoch": 1.6850105701150968, + "grad_norm": 4.085982322692871, + "learning_rate": 6.361333038020212e-06, + "loss": 1.7229, + "step": 25108 + }, + { + "epoch": 1.685144793798866, + "grad_norm": 4.063467502593994, + "learning_rate": 6.356028969311467e-06, + "loss": 2.0568, + "step": 25110 + }, + { + "epoch": 1.6852790174826349, + "grad_norm": 4.3834943771362305, + "learning_rate": 6.350726962694537e-06, + "loss": 1.9806, + "step": 25112 + }, + { + "epoch": 1.6854132411664038, + "grad_norm": 4.359941482543945, + "learning_rate": 6.345427018419908e-06, + "loss": 1.8284, + "step": 25114 + }, + { + "epoch": 1.6855474648501727, + "grad_norm": 3.8693113327026367, + "learning_rate": 6.340129136738027e-06, + "loss": 1.6193, + "step": 25116 + }, + { + "epoch": 1.6856816885339418, + "grad_norm": 2.91025710105896, + "learning_rate": 6.334833317899186e-06, + "loss": 1.6364, + "step": 25118 + }, + { + "epoch": 1.6858159122177108, + "grad_norm": 4.047305583953857, + "learning_rate": 6.329539562153608e-06, + "loss": 2.0304, + "step": 25120 + }, + { + "epoch": 1.68595013590148, + "grad_norm": 3.7546231746673584, + "learning_rate": 6.324247869751398e-06, + "loss": 1.9076, + "step": 25122 + }, + { + "epoch": 1.6860843595852488, + "grad_norm": 3.84684681892395, + "learning_rate": 6.318958240942607e-06, + "loss": 1.7642, + "step": 25124 + }, + { + "epoch": 1.6862185832690177, + "grad_norm": 4.123449802398682, + "learning_rate": 6.313670675977129e-06, + "loss": 1.8285, + "step": 25126 + }, + { + "epoch": 1.6863528069527867, + "grad_norm": 4.373131275177002, + "learning_rate": 6.308385175104819e-06, + "loss": 1.8075, + "step": 25128 + }, + { + "epoch": 1.6864870306365558, + "grad_norm": 4.151175498962402, + "learning_rate": 6.303101738575395e-06, + "loss": 1.7956, + "step": 25130 + }, + { + "epoch": 1.686621254320325, + "grad_norm": 4.281904220581055, + "learning_rate": 6.297820366638485e-06, + "loss": 1.8945, + "step": 25132 + }, + { + "epoch": 1.6867554780040939, + "grad_norm": 4.328705310821533, + "learning_rate": 6.292541059543628e-06, + "loss": 1.9946, + "step": 25134 + }, + { + "epoch": 1.6868897016878628, + "grad_norm": 3.9779410362243652, + "learning_rate": 6.2872638175402455e-06, + "loss": 1.7159, + "step": 25136 + }, + { + "epoch": 1.6870239253716317, + "grad_norm": 3.907533884048462, + "learning_rate": 6.281988640877706e-06, + "loss": 1.9822, + "step": 25138 + }, + { + "epoch": 1.6871581490554008, + "grad_norm": 4.540399551391602, + "learning_rate": 6.276715529805233e-06, + "loss": 1.8258, + "step": 25140 + }, + { + "epoch": 1.68729237273917, + "grad_norm": 3.7441790103912354, + "learning_rate": 6.27144448457197e-06, + "loss": 1.851, + "step": 25142 + }, + { + "epoch": 1.687426596422939, + "grad_norm": 3.938394784927368, + "learning_rate": 6.266175505426958e-06, + "loss": 2.0327, + "step": 25144 + }, + { + "epoch": 1.6875608201067078, + "grad_norm": 3.79235577583313, + "learning_rate": 6.260908592619169e-06, + "loss": 1.9653, + "step": 25146 + }, + { + "epoch": 1.6876950437904767, + "grad_norm": 3.6492919921875, + "learning_rate": 6.255643746397433e-06, + "loss": 1.8163, + "step": 25148 + }, + { + "epoch": 1.6878292674742457, + "grad_norm": 3.795396089553833, + "learning_rate": 6.250380967010516e-06, + "loss": 1.7618, + "step": 25150 + }, + { + "epoch": 1.6879634911580148, + "grad_norm": 4.32287073135376, + "learning_rate": 6.2451202547070545e-06, + "loss": 1.6285, + "step": 25152 + }, + { + "epoch": 1.688097714841784, + "grad_norm": 4.4170660972595215, + "learning_rate": 6.239861609735631e-06, + "loss": 1.6561, + "step": 25154 + }, + { + "epoch": 1.6882319385255529, + "grad_norm": 4.382012367248535, + "learning_rate": 6.2346050323446865e-06, + "loss": 2.1147, + "step": 25156 + }, + { + "epoch": 1.6883661622093218, + "grad_norm": 4.358572483062744, + "learning_rate": 6.229350522782612e-06, + "loss": 1.8799, + "step": 25158 + }, + { + "epoch": 1.6885003858930907, + "grad_norm": 4.698417663574219, + "learning_rate": 6.224098081297636e-06, + "loss": 1.9148, + "step": 25160 + }, + { + "epoch": 1.6886346095768598, + "grad_norm": 4.715865612030029, + "learning_rate": 6.218847708137954e-06, + "loss": 1.7715, + "step": 25162 + }, + { + "epoch": 1.688768833260629, + "grad_norm": 3.9571707248687744, + "learning_rate": 6.213599403551617e-06, + "loss": 1.7071, + "step": 25164 + }, + { + "epoch": 1.688903056944398, + "grad_norm": 4.161215782165527, + "learning_rate": 6.208353167786612e-06, + "loss": 1.7363, + "step": 25166 + }, + { + "epoch": 1.6890372806281668, + "grad_norm": 4.002527713775635, + "learning_rate": 6.203109001090812e-06, + "loss": 2.0492, + "step": 25168 + }, + { + "epoch": 1.6891715043119357, + "grad_norm": 3.7957756519317627, + "learning_rate": 6.197866903711985e-06, + "loss": 1.8574, + "step": 25170 + }, + { + "epoch": 1.6893057279957049, + "grad_norm": 4.812929153442383, + "learning_rate": 6.192626875897806e-06, + "loss": 1.7951, + "step": 25172 + }, + { + "epoch": 1.6894399516794738, + "grad_norm": 4.040680885314941, + "learning_rate": 6.18738891789587e-06, + "loss": 1.7589, + "step": 25174 + }, + { + "epoch": 1.689574175363243, + "grad_norm": 3.499607563018799, + "learning_rate": 6.182153029953658e-06, + "loss": 1.6104, + "step": 25176 + }, + { + "epoch": 1.6897083990470119, + "grad_norm": 3.7531332969665527, + "learning_rate": 6.1769192123185495e-06, + "loss": 1.569, + "step": 25178 + }, + { + "epoch": 1.6898426227307808, + "grad_norm": 4.693965435028076, + "learning_rate": 6.171687465237824e-06, + "loss": 1.772, + "step": 25180 + }, + { + "epoch": 1.6899768464145497, + "grad_norm": 3.7912027835845947, + "learning_rate": 6.1664577889586905e-06, + "loss": 1.9416, + "step": 25182 + }, + { + "epoch": 1.6901110700983188, + "grad_norm": 3.6834723949432373, + "learning_rate": 6.1612301837282225e-06, + "loss": 1.8971, + "step": 25184 + }, + { + "epoch": 1.690245293782088, + "grad_norm": 4.14044189453125, + "learning_rate": 6.156004649793446e-06, + "loss": 1.7609, + "step": 25186 + }, + { + "epoch": 1.690379517465857, + "grad_norm": 4.16223669052124, + "learning_rate": 6.150781187401211e-06, + "loss": 1.8365, + "step": 25188 + }, + { + "epoch": 1.6905137411496258, + "grad_norm": 3.5384089946746826, + "learning_rate": 6.145559796798345e-06, + "loss": 1.7696, + "step": 25190 + }, + { + "epoch": 1.6906479648333947, + "grad_norm": 4.739270210266113, + "learning_rate": 6.140340478231537e-06, + "loss": 2.1788, + "step": 25192 + }, + { + "epoch": 1.6907821885171639, + "grad_norm": 4.239810466766357, + "learning_rate": 6.135123231947404e-06, + "loss": 1.8309, + "step": 25194 + }, + { + "epoch": 1.6909164122009328, + "grad_norm": 4.149385929107666, + "learning_rate": 6.1299080581924465e-06, + "loss": 2.0209, + "step": 25196 + }, + { + "epoch": 1.691050635884702, + "grad_norm": 4.415130615234375, + "learning_rate": 6.124694957213062e-06, + "loss": 1.9228, + "step": 25198 + }, + { + "epoch": 1.6911848595684709, + "grad_norm": 4.189035415649414, + "learning_rate": 6.119483929255559e-06, + "loss": 1.98, + "step": 25200 + }, + { + "epoch": 1.6913190832522398, + "grad_norm": 3.7102622985839844, + "learning_rate": 6.1142749745661536e-06, + "loss": 1.8337, + "step": 25202 + }, + { + "epoch": 1.6914533069360087, + "grad_norm": 4.074142932891846, + "learning_rate": 6.109068093390957e-06, + "loss": 1.8561, + "step": 25204 + }, + { + "epoch": 1.6915875306197778, + "grad_norm": 4.328968524932861, + "learning_rate": 6.103863285975992e-06, + "loss": 1.7952, + "step": 25206 + }, + { + "epoch": 1.691721754303547, + "grad_norm": 4.315784931182861, + "learning_rate": 6.0986605525671705e-06, + "loss": 1.5705, + "step": 25208 + }, + { + "epoch": 1.691855977987316, + "grad_norm": 3.757676601409912, + "learning_rate": 6.093459893410297e-06, + "loss": 2.1234, + "step": 25210 + }, + { + "epoch": 1.6919902016710848, + "grad_norm": 4.256565570831299, + "learning_rate": 6.0882613087511146e-06, + "loss": 1.5748, + "step": 25212 + }, + { + "epoch": 1.6921244253548537, + "grad_norm": 3.9670250415802, + "learning_rate": 6.083064798835236e-06, + "loss": 1.8779, + "step": 25214 + }, + { + "epoch": 1.6922586490386229, + "grad_norm": 4.188969612121582, + "learning_rate": 6.077870363908184e-06, + "loss": 1.9748, + "step": 25216 + }, + { + "epoch": 1.692392872722392, + "grad_norm": 4.197779655456543, + "learning_rate": 6.0726780042153816e-06, + "loss": 2.137, + "step": 25218 + }, + { + "epoch": 1.692527096406161, + "grad_norm": 4.095139980316162, + "learning_rate": 6.067487720002174e-06, + "loss": 1.8171, + "step": 25220 + }, + { + "epoch": 1.6926613200899299, + "grad_norm": 3.812920331954956, + "learning_rate": 6.0622995115137695e-06, + "loss": 1.9073, + "step": 25222 + }, + { + "epoch": 1.6927955437736988, + "grad_norm": 4.073745250701904, + "learning_rate": 6.057113378995338e-06, + "loss": 1.7543, + "step": 25224 + }, + { + "epoch": 1.6929297674574677, + "grad_norm": 4.025613307952881, + "learning_rate": 6.051929322691868e-06, + "loss": 1.7531, + "step": 25226 + }, + { + "epoch": 1.6930639911412368, + "grad_norm": 3.758174419403076, + "learning_rate": 6.046747342848325e-06, + "loss": 1.8684, + "step": 25228 + }, + { + "epoch": 1.693198214825006, + "grad_norm": 3.992278575897217, + "learning_rate": 6.041567439709533e-06, + "loss": 1.8436, + "step": 25230 + }, + { + "epoch": 1.693332438508775, + "grad_norm": 3.6738569736480713, + "learning_rate": 6.036389613520243e-06, + "loss": 1.8887, + "step": 25232 + }, + { + "epoch": 1.6934666621925438, + "grad_norm": 4.309759140014648, + "learning_rate": 6.031213864525098e-06, + "loss": 1.9701, + "step": 25234 + }, + { + "epoch": 1.6936008858763127, + "grad_norm": 4.983991622924805, + "learning_rate": 6.02604019296864e-06, + "loss": 1.9191, + "step": 25236 + }, + { + "epoch": 1.6937351095600819, + "grad_norm": 4.247321605682373, + "learning_rate": 6.0208685990952975e-06, + "loss": 1.4779, + "step": 25238 + }, + { + "epoch": 1.693869333243851, + "grad_norm": 3.7478158473968506, + "learning_rate": 6.0156990831494395e-06, + "loss": 1.9506, + "step": 25240 + }, + { + "epoch": 1.69400355692762, + "grad_norm": 3.7992000579833984, + "learning_rate": 6.010531645375312e-06, + "loss": 1.788, + "step": 25242 + }, + { + "epoch": 1.6941377806113889, + "grad_norm": 4.184092998504639, + "learning_rate": 6.005366286017061e-06, + "loss": 1.9523, + "step": 25244 + }, + { + "epoch": 1.6942720042951578, + "grad_norm": 4.1061692237854, + "learning_rate": 6.000203005318733e-06, + "loss": 1.8264, + "step": 25246 + }, + { + "epoch": 1.694406227978927, + "grad_norm": 3.50988507270813, + "learning_rate": 5.9950418035243036e-06, + "loss": 1.8722, + "step": 25248 + }, + { + "epoch": 1.6945404516626958, + "grad_norm": 4.405223846435547, + "learning_rate": 5.9898826808776006e-06, + "loss": 1.9029, + "step": 25250 + }, + { + "epoch": 1.694674675346465, + "grad_norm": 4.4959893226623535, + "learning_rate": 5.984725637622424e-06, + "loss": 1.872, + "step": 25252 + }, + { + "epoch": 1.694808899030234, + "grad_norm": 3.85923433303833, + "learning_rate": 5.979570674002388e-06, + "loss": 1.8977, + "step": 25254 + }, + { + "epoch": 1.6949431227140028, + "grad_norm": 4.1557207107543945, + "learning_rate": 5.974417790261083e-06, + "loss": 1.701, + "step": 25256 + }, + { + "epoch": 1.6950773463977717, + "grad_norm": 5.959264278411865, + "learning_rate": 5.969266986641953e-06, + "loss": 1.8856, + "step": 25258 + }, + { + "epoch": 1.6952115700815409, + "grad_norm": 4.237531661987305, + "learning_rate": 5.964118263388391e-06, + "loss": 1.8504, + "step": 25260 + }, + { + "epoch": 1.69534579376531, + "grad_norm": 4.106791019439697, + "learning_rate": 5.9589716207436475e-06, + "loss": 1.9035, + "step": 25262 + }, + { + "epoch": 1.695480017449079, + "grad_norm": 4.0168256759643555, + "learning_rate": 5.9538270589508895e-06, + "loss": 2.058, + "step": 25264 + }, + { + "epoch": 1.6956142411328479, + "grad_norm": 4.215775966644287, + "learning_rate": 5.948684578253177e-06, + "loss": 1.8945, + "step": 25266 + }, + { + "epoch": 1.6957484648166168, + "grad_norm": 3.948884963989258, + "learning_rate": 5.943544178893506e-06, + "loss": 1.8286, + "step": 25268 + }, + { + "epoch": 1.695882688500386, + "grad_norm": 4.523481369018555, + "learning_rate": 5.938405861114743e-06, + "loss": 2.0313, + "step": 25270 + }, + { + "epoch": 1.6960169121841548, + "grad_norm": 4.069624900817871, + "learning_rate": 5.93326962515966e-06, + "loss": 1.7934, + "step": 25272 + }, + { + "epoch": 1.696151135867924, + "grad_norm": 3.4668543338775635, + "learning_rate": 5.928135471270929e-06, + "loss": 1.7518, + "step": 25274 + }, + { + "epoch": 1.696285359551693, + "grad_norm": 3.6851909160614014, + "learning_rate": 5.92300339969113e-06, + "loss": 1.7922, + "step": 25276 + }, + { + "epoch": 1.6964195832354618, + "grad_norm": 3.9113247394561768, + "learning_rate": 5.9178734106627495e-06, + "loss": 1.9241, + "step": 25278 + }, + { + "epoch": 1.6965538069192307, + "grad_norm": 3.8796563148498535, + "learning_rate": 5.9127455044281684e-06, + "loss": 1.8007, + "step": 25280 + }, + { + "epoch": 1.6966880306029999, + "grad_norm": 3.481396436691284, + "learning_rate": 5.90761968122967e-06, + "loss": 1.7146, + "step": 25282 + }, + { + "epoch": 1.696822254286769, + "grad_norm": 3.863114833831787, + "learning_rate": 5.902495941309427e-06, + "loss": 1.8453, + "step": 25284 + }, + { + "epoch": 1.696956477970538, + "grad_norm": 5.433472156524658, + "learning_rate": 5.897374284909546e-06, + "loss": 2.0871, + "step": 25286 + }, + { + "epoch": 1.6970907016543069, + "grad_norm": 4.398062705993652, + "learning_rate": 5.892254712272e-06, + "loss": 1.9345, + "step": 25288 + }, + { + "epoch": 1.6972249253380758, + "grad_norm": 3.2396602630615234, + "learning_rate": 5.887137223638689e-06, + "loss": 1.5738, + "step": 25290 + }, + { + "epoch": 1.697359149021845, + "grad_norm": 4.269008159637451, + "learning_rate": 5.882021819251405e-06, + "loss": 1.6801, + "step": 25292 + }, + { + "epoch": 1.697493372705614, + "grad_norm": 4.465020656585693, + "learning_rate": 5.8769084993518355e-06, + "loss": 1.872, + "step": 25294 + }, + { + "epoch": 1.697627596389383, + "grad_norm": 4.234340190887451, + "learning_rate": 5.871797264181561e-06, + "loss": 1.9516, + "step": 25296 + }, + { + "epoch": 1.697761820073152, + "grad_norm": 7.099523544311523, + "learning_rate": 5.866688113982105e-06, + "loss": 1.4997, + "step": 25298 + }, + { + "epoch": 1.6978960437569208, + "grad_norm": 4.2532572746276855, + "learning_rate": 5.861581048994847e-06, + "loss": 1.7662, + "step": 25300 + }, + { + "epoch": 1.6980302674406897, + "grad_norm": 4.424927711486816, + "learning_rate": 5.856476069461092e-06, + "loss": 2.0261, + "step": 25302 + }, + { + "epoch": 1.6981644911244589, + "grad_norm": 4.133995532989502, + "learning_rate": 5.851373175622032e-06, + "loss": 1.8946, + "step": 25304 + }, + { + "epoch": 1.698298714808228, + "grad_norm": 4.279111862182617, + "learning_rate": 5.84627236771878e-06, + "loss": 1.6709, + "step": 25306 + }, + { + "epoch": 1.698432938491997, + "grad_norm": 4.386593818664551, + "learning_rate": 5.841173645992326e-06, + "loss": 1.814, + "step": 25308 + }, + { + "epoch": 1.6985671621757659, + "grad_norm": 3.8723275661468506, + "learning_rate": 5.836077010683599e-06, + "loss": 1.6272, + "step": 25310 + }, + { + "epoch": 1.6987013858595348, + "grad_norm": 3.9855258464813232, + "learning_rate": 5.830982462033374e-06, + "loss": 1.9011, + "step": 25312 + }, + { + "epoch": 1.698835609543304, + "grad_norm": 3.767446994781494, + "learning_rate": 5.82589000028238e-06, + "loss": 1.6551, + "step": 25314 + }, + { + "epoch": 1.698969833227073, + "grad_norm": 4.4512810707092285, + "learning_rate": 5.8207996256712084e-06, + "loss": 2.076, + "step": 25316 + }, + { + "epoch": 1.699104056910842, + "grad_norm": 4.105252265930176, + "learning_rate": 5.815711338440394e-06, + "loss": 1.7786, + "step": 25318 + }, + { + "epoch": 1.699238280594611, + "grad_norm": 3.8244071006774902, + "learning_rate": 5.810625138830333e-06, + "loss": 1.6424, + "step": 25320 + }, + { + "epoch": 1.6993725042783798, + "grad_norm": 4.266604423522949, + "learning_rate": 5.805541027081335e-06, + "loss": 1.9626, + "step": 25322 + }, + { + "epoch": 1.699506727962149, + "grad_norm": 3.9008564949035645, + "learning_rate": 5.800459003433612e-06, + "loss": 1.9802, + "step": 25324 + }, + { + "epoch": 1.6996409516459179, + "grad_norm": 3.9850120544433594, + "learning_rate": 5.795379068127299e-06, + "loss": 2.0515, + "step": 25326 + }, + { + "epoch": 1.699775175329687, + "grad_norm": 4.244085788726807, + "learning_rate": 5.790301221402394e-06, + "loss": 1.8684, + "step": 25328 + }, + { + "epoch": 1.699909399013456, + "grad_norm": 4.173315525054932, + "learning_rate": 5.785225463498828e-06, + "loss": 1.9449, + "step": 25330 + }, + { + "epoch": 1.7000436226972249, + "grad_norm": 4.23328161239624, + "learning_rate": 5.780151794656397e-06, + "loss": 1.9356, + "step": 25332 + }, + { + "epoch": 1.7001778463809938, + "grad_norm": 4.640417575836182, + "learning_rate": 5.775080215114853e-06, + "loss": 1.9673, + "step": 25334 + }, + { + "epoch": 1.700312070064763, + "grad_norm": 3.88277530670166, + "learning_rate": 5.770010725113794e-06, + "loss": 1.7535, + "step": 25336 + }, + { + "epoch": 1.700446293748532, + "grad_norm": 3.556551694869995, + "learning_rate": 5.76494332489278e-06, + "loss": 1.8891, + "step": 25338 + }, + { + "epoch": 1.700580517432301, + "grad_norm": 4.273421764373779, + "learning_rate": 5.759878014691189e-06, + "loss": 2.2195, + "step": 25340 + }, + { + "epoch": 1.70071474111607, + "grad_norm": 4.189088344573975, + "learning_rate": 5.754814794748364e-06, + "loss": 1.7575, + "step": 25342 + }, + { + "epoch": 1.7008489647998388, + "grad_norm": 4.026843547821045, + "learning_rate": 5.749753665303542e-06, + "loss": 1.859, + "step": 25344 + }, + { + "epoch": 1.700983188483608, + "grad_norm": 4.126394748687744, + "learning_rate": 5.744694626595837e-06, + "loss": 1.8804, + "step": 25346 + }, + { + "epoch": 1.7011174121673769, + "grad_norm": 4.082699775695801, + "learning_rate": 5.739637678864307e-06, + "loss": 2.0433, + "step": 25348 + }, + { + "epoch": 1.701251635851146, + "grad_norm": 3.5967485904693604, + "learning_rate": 5.734582822347839e-06, + "loss": 1.561, + "step": 25350 + }, + { + "epoch": 1.701385859534915, + "grad_norm": 4.152017593383789, + "learning_rate": 5.729530057285304e-06, + "loss": 2.0163, + "step": 25352 + }, + { + "epoch": 1.7015200832186839, + "grad_norm": 4.090835094451904, + "learning_rate": 5.724479383915404e-06, + "loss": 1.8624, + "step": 25354 + }, + { + "epoch": 1.7016543069024528, + "grad_norm": 3.8798742294311523, + "learning_rate": 5.719430802476805e-06, + "loss": 1.8151, + "step": 25356 + }, + { + "epoch": 1.701788530586222, + "grad_norm": 3.8679542541503906, + "learning_rate": 5.714384313208021e-06, + "loss": 1.8533, + "step": 25358 + }, + { + "epoch": 1.701922754269991, + "grad_norm": 4.2275190353393555, + "learning_rate": 5.7093399163474945e-06, + "loss": 2.1564, + "step": 25360 + }, + { + "epoch": 1.70205697795376, + "grad_norm": 5.467623233795166, + "learning_rate": 5.704297612133558e-06, + "loss": 1.776, + "step": 25362 + }, + { + "epoch": 1.702191201637529, + "grad_norm": 3.9936487674713135, + "learning_rate": 5.699257400804464e-06, + "loss": 1.9494, + "step": 25364 + }, + { + "epoch": 1.7023254253212978, + "grad_norm": 4.473844528198242, + "learning_rate": 5.69421928259834e-06, + "loss": 2.2623, + "step": 25366 + }, + { + "epoch": 1.702459649005067, + "grad_norm": 4.2125959396362305, + "learning_rate": 5.689183257753234e-06, + "loss": 1.7856, + "step": 25368 + }, + { + "epoch": 1.702593872688836, + "grad_norm": 4.090993404388428, + "learning_rate": 5.684149326507077e-06, + "loss": 1.9693, + "step": 25370 + }, + { + "epoch": 1.702728096372605, + "grad_norm": 3.930698871612549, + "learning_rate": 5.67911748909773e-06, + "loss": 1.708, + "step": 25372 + }, + { + "epoch": 1.702862320056374, + "grad_norm": 3.9332923889160156, + "learning_rate": 5.674087745762918e-06, + "loss": 1.7928, + "step": 25374 + }, + { + "epoch": 1.7029965437401429, + "grad_norm": 4.187496185302734, + "learning_rate": 5.669060096740314e-06, + "loss": 1.8947, + "step": 25376 + }, + { + "epoch": 1.7031307674239118, + "grad_norm": 4.144204616546631, + "learning_rate": 5.6640345422674325e-06, + "loss": 1.9629, + "step": 25378 + }, + { + "epoch": 1.703264991107681, + "grad_norm": 4.161177158355713, + "learning_rate": 5.6590110825817445e-06, + "loss": 2.0217, + "step": 25380 + }, + { + "epoch": 1.70339921479145, + "grad_norm": 3.230931043624878, + "learning_rate": 5.653989717920577e-06, + "loss": 1.4599, + "step": 25382 + }, + { + "epoch": 1.703533438475219, + "grad_norm": 4.373675346374512, + "learning_rate": 5.648970448521207e-06, + "loss": 1.9737, + "step": 25384 + }, + { + "epoch": 1.703667662158988, + "grad_norm": 5.464304447174072, + "learning_rate": 5.643953274620767e-06, + "loss": 2.0795, + "step": 25386 + }, + { + "epoch": 1.7038018858427568, + "grad_norm": 4.387160778045654, + "learning_rate": 5.6389381964563115e-06, + "loss": 1.8803, + "step": 25388 + }, + { + "epoch": 1.703936109526526, + "grad_norm": 4.147479057312012, + "learning_rate": 5.63392521426479e-06, + "loss": 2.0758, + "step": 25390 + }, + { + "epoch": 1.704070333210295, + "grad_norm": 3.415802001953125, + "learning_rate": 5.628914328283064e-06, + "loss": 1.6988, + "step": 25392 + }, + { + "epoch": 1.704204556894064, + "grad_norm": 3.732837677001953, + "learning_rate": 5.623905538747887e-06, + "loss": 1.7682, + "step": 25394 + }, + { + "epoch": 1.704338780577833, + "grad_norm": 4.139795780181885, + "learning_rate": 5.618898845895909e-06, + "loss": 1.7544, + "step": 25396 + }, + { + "epoch": 1.7044730042616019, + "grad_norm": 4.767470836639404, + "learning_rate": 5.613894249963681e-06, + "loss": 1.9455, + "step": 25398 + }, + { + "epoch": 1.704607227945371, + "grad_norm": 4.53263521194458, + "learning_rate": 5.608891751187678e-06, + "loss": 1.6825, + "step": 25400 + }, + { + "epoch": 1.70474145162914, + "grad_norm": 4.328902244567871, + "learning_rate": 5.603891349804241e-06, + "loss": 1.8953, + "step": 25402 + }, + { + "epoch": 1.704875675312909, + "grad_norm": 3.793233633041382, + "learning_rate": 5.5988930460496525e-06, + "loss": 1.904, + "step": 25404 + }, + { + "epoch": 1.705009898996678, + "grad_norm": 4.416013717651367, + "learning_rate": 5.5938968401600455e-06, + "loss": 1.9426, + "step": 25406 + }, + { + "epoch": 1.705144122680447, + "grad_norm": 4.046044826507568, + "learning_rate": 5.588902732371487e-06, + "loss": 1.8975, + "step": 25408 + }, + { + "epoch": 1.7052783463642158, + "grad_norm": 4.497105598449707, + "learning_rate": 5.5839107229199495e-06, + "loss": 2.0298, + "step": 25410 + }, + { + "epoch": 1.705412570047985, + "grad_norm": 4.249667167663574, + "learning_rate": 5.5789208120412824e-06, + "loss": 1.5844, + "step": 25412 + }, + { + "epoch": 1.705546793731754, + "grad_norm": 4.6807098388671875, + "learning_rate": 5.573932999971266e-06, + "loss": 1.8068, + "step": 25414 + }, + { + "epoch": 1.705681017415523, + "grad_norm": 3.452146530151367, + "learning_rate": 5.568947286945553e-06, + "loss": 1.7061, + "step": 25416 + }, + { + "epoch": 1.705815241099292, + "grad_norm": 3.590320348739624, + "learning_rate": 5.563963673199713e-06, + "loss": 1.8021, + "step": 25418 + }, + { + "epoch": 1.7059494647830609, + "grad_norm": 3.7996435165405273, + "learning_rate": 5.558982158969195e-06, + "loss": 1.8131, + "step": 25420 + }, + { + "epoch": 1.70608368846683, + "grad_norm": 3.5064470767974854, + "learning_rate": 5.554002744489395e-06, + "loss": 1.6674, + "step": 25422 + }, + { + "epoch": 1.706217912150599, + "grad_norm": 4.052876949310303, + "learning_rate": 5.549025429995569e-06, + "loss": 1.7999, + "step": 25424 + }, + { + "epoch": 1.706352135834368, + "grad_norm": 4.036584854125977, + "learning_rate": 5.5440502157228794e-06, + "loss": 1.8835, + "step": 25426 + }, + { + "epoch": 1.706486359518137, + "grad_norm": 4.866822719573975, + "learning_rate": 5.539077101906387e-06, + "loss": 1.9221, + "step": 25428 + }, + { + "epoch": 1.706620583201906, + "grad_norm": 3.7189621925354004, + "learning_rate": 5.534106088781082e-06, + "loss": 2.0147, + "step": 25430 + }, + { + "epoch": 1.7067548068856748, + "grad_norm": 3.982146978378296, + "learning_rate": 5.529137176581828e-06, + "loss": 1.9975, + "step": 25432 + }, + { + "epoch": 1.706889030569444, + "grad_norm": 3.705695390701294, + "learning_rate": 5.5241703655433966e-06, + "loss": 1.9976, + "step": 25434 + }, + { + "epoch": 1.707023254253213, + "grad_norm": 4.061058044433594, + "learning_rate": 5.519205655900439e-06, + "loss": 1.725, + "step": 25436 + }, + { + "epoch": 1.707157477936982, + "grad_norm": 4.1618733406066895, + "learning_rate": 5.514243047887563e-06, + "loss": 2.0473, + "step": 25438 + }, + { + "epoch": 1.707291701620751, + "grad_norm": 4.272096157073975, + "learning_rate": 5.509282541739213e-06, + "loss": 1.9699, + "step": 25440 + }, + { + "epoch": 1.7074259253045199, + "grad_norm": 3.9879496097564697, + "learning_rate": 5.504324137689793e-06, + "loss": 1.9711, + "step": 25442 + }, + { + "epoch": 1.707560148988289, + "grad_norm": 4.273292064666748, + "learning_rate": 5.4993678359735514e-06, + "loss": 1.814, + "step": 25444 + }, + { + "epoch": 1.7076943726720581, + "grad_norm": 4.095747470855713, + "learning_rate": 5.494413636824675e-06, + "loss": 1.8266, + "step": 25446 + }, + { + "epoch": 1.707828596355827, + "grad_norm": 4.301661014556885, + "learning_rate": 5.489461540477231e-06, + "loss": 1.9897, + "step": 25448 + }, + { + "epoch": 1.707962820039596, + "grad_norm": 4.312985897064209, + "learning_rate": 5.484511547165211e-06, + "loss": 1.9426, + "step": 25450 + }, + { + "epoch": 1.708097043723365, + "grad_norm": 4.1703200340271, + "learning_rate": 5.479563657122483e-06, + "loss": 1.9525, + "step": 25452 + }, + { + "epoch": 1.7082312674071338, + "grad_norm": 4.367097854614258, + "learning_rate": 5.474617870582826e-06, + "loss": 1.7862, + "step": 25454 + }, + { + "epoch": 1.708365491090903, + "grad_norm": 4.580312252044678, + "learning_rate": 5.469674187779911e-06, + "loss": 2.0664, + "step": 25456 + }, + { + "epoch": 1.708499714774672, + "grad_norm": 4.1197686195373535, + "learning_rate": 5.464732608947337e-06, + "loss": 1.8076, + "step": 25458 + }, + { + "epoch": 1.708633938458441, + "grad_norm": 4.193716526031494, + "learning_rate": 5.45979313431857e-06, + "loss": 1.6723, + "step": 25460 + }, + { + "epoch": 1.70876816214221, + "grad_norm": 3.9746806621551514, + "learning_rate": 5.454855764126992e-06, + "loss": 1.9572, + "step": 25462 + }, + { + "epoch": 1.7089023858259789, + "grad_norm": 4.4447832107543945, + "learning_rate": 5.4499204986058815e-06, + "loss": 1.6662, + "step": 25464 + }, + { + "epoch": 1.709036609509748, + "grad_norm": 4.392482757568359, + "learning_rate": 5.444987337988428e-06, + "loss": 1.8302, + "step": 25466 + }, + { + "epoch": 1.7091708331935171, + "grad_norm": 4.257859706878662, + "learning_rate": 5.440056282507699e-06, + "loss": 1.9524, + "step": 25468 + }, + { + "epoch": 1.709305056877286, + "grad_norm": 3.9855616092681885, + "learning_rate": 5.435127332396695e-06, + "loss": 1.908, + "step": 25470 + }, + { + "epoch": 1.709439280561055, + "grad_norm": 4.00891637802124, + "learning_rate": 5.4302004878882986e-06, + "loss": 1.9147, + "step": 25472 + }, + { + "epoch": 1.709573504244824, + "grad_norm": 3.7663114070892334, + "learning_rate": 5.425275749215281e-06, + "loss": 1.9538, + "step": 25474 + }, + { + "epoch": 1.709707727928593, + "grad_norm": 3.9888086318969727, + "learning_rate": 5.4203531166103325e-06, + "loss": 1.8455, + "step": 25476 + }, + { + "epoch": 1.709841951612362, + "grad_norm": 4.165395736694336, + "learning_rate": 5.415432590306024e-06, + "loss": 1.693, + "step": 25478 + }, + { + "epoch": 1.709976175296131, + "grad_norm": 4.360676288604736, + "learning_rate": 5.41051417053487e-06, + "loss": 1.8823, + "step": 25480 + }, + { + "epoch": 1.7101103989799, + "grad_norm": 4.254600524902344, + "learning_rate": 5.405597857529238e-06, + "loss": 1.9384, + "step": 25482 + }, + { + "epoch": 1.710244622663669, + "grad_norm": 4.40833854675293, + "learning_rate": 5.400683651521416e-06, + "loss": 1.9752, + "step": 25484 + }, + { + "epoch": 1.7103788463474379, + "grad_norm": 3.5119080543518066, + "learning_rate": 5.395771552743578e-06, + "loss": 1.6308, + "step": 25486 + }, + { + "epoch": 1.710513070031207, + "grad_norm": 3.6131882667541504, + "learning_rate": 5.390861561427835e-06, + "loss": 1.598, + "step": 25488 + }, + { + "epoch": 1.7106472937149761, + "grad_norm": 3.7811427116394043, + "learning_rate": 5.3859536778061655e-06, + "loss": 2.0049, + "step": 25490 + }, + { + "epoch": 1.710781517398745, + "grad_norm": 4.070196151733398, + "learning_rate": 5.381047902110453e-06, + "loss": 2.0108, + "step": 25492 + }, + { + "epoch": 1.710915741082514, + "grad_norm": 4.002646446228027, + "learning_rate": 5.376144234572478e-06, + "loss": 1.5853, + "step": 25494 + }, + { + "epoch": 1.711049964766283, + "grad_norm": 3.771136999130249, + "learning_rate": 5.371242675423949e-06, + "loss": 1.595, + "step": 25496 + }, + { + "epoch": 1.711184188450052, + "grad_norm": 3.513777017593384, + "learning_rate": 5.366343224896436e-06, + "loss": 1.7251, + "step": 25498 + }, + { + "epoch": 1.711318412133821, + "grad_norm": 4.155920505523682, + "learning_rate": 5.361445883221456e-06, + "loss": 2.0192, + "step": 25500 + }, + { + "epoch": 1.71145263581759, + "grad_norm": 4.4447407722473145, + "learning_rate": 5.356550650630359e-06, + "loss": 1.9715, + "step": 25502 + }, + { + "epoch": 1.711586859501359, + "grad_norm": 3.7043399810791016, + "learning_rate": 5.35165752735447e-06, + "loss": 1.6579, + "step": 25504 + }, + { + "epoch": 1.711721083185128, + "grad_norm": 4.236926078796387, + "learning_rate": 5.346766513624951e-06, + "loss": 1.8031, + "step": 25506 + }, + { + "epoch": 1.7118553068688969, + "grad_norm": 3.7119503021240234, + "learning_rate": 5.341877609672919e-06, + "loss": 1.9333, + "step": 25508 + }, + { + "epoch": 1.711989530552666, + "grad_norm": 3.8656492233276367, + "learning_rate": 5.336990815729354e-06, + "loss": 1.6193, + "step": 25510 + }, + { + "epoch": 1.7121237542364351, + "grad_norm": 4.752727031707764, + "learning_rate": 5.33210613202515e-06, + "loss": 1.9819, + "step": 25512 + }, + { + "epoch": 1.712257977920204, + "grad_norm": 4.287251949310303, + "learning_rate": 5.327223558791084e-06, + "loss": 2.0655, + "step": 25514 + }, + { + "epoch": 1.712392201603973, + "grad_norm": 4.5138983726501465, + "learning_rate": 5.322343096257864e-06, + "loss": 2.0437, + "step": 25516 + }, + { + "epoch": 1.712526425287742, + "grad_norm": 3.247281551361084, + "learning_rate": 5.317464744656081e-06, + "loss": 1.9414, + "step": 25518 + }, + { + "epoch": 1.712660648971511, + "grad_norm": 4.641493797302246, + "learning_rate": 5.312588504216226e-06, + "loss": 2.249, + "step": 25520 + }, + { + "epoch": 1.7127948726552802, + "grad_norm": 3.6799018383026123, + "learning_rate": 5.307714375168682e-06, + "loss": 2.0562, + "step": 25522 + }, + { + "epoch": 1.712929096339049, + "grad_norm": 4.088497161865234, + "learning_rate": 5.302842357743754e-06, + "loss": 2.1665, + "step": 25524 + }, + { + "epoch": 1.713063320022818, + "grad_norm": 4.281962871551514, + "learning_rate": 5.297972452171629e-06, + "loss": 1.6182, + "step": 25526 + }, + { + "epoch": 1.713197543706587, + "grad_norm": 4.434690952301025, + "learning_rate": 5.293104658682419e-06, + "loss": 1.8883, + "step": 25528 + }, + { + "epoch": 1.7133317673903559, + "grad_norm": 4.210580348968506, + "learning_rate": 5.288238977506077e-06, + "loss": 2.2354, + "step": 25530 + }, + { + "epoch": 1.713465991074125, + "grad_norm": 4.05552339553833, + "learning_rate": 5.283375408872537e-06, + "loss": 1.8973, + "step": 25532 + }, + { + "epoch": 1.7136002147578941, + "grad_norm": 4.7525129318237305, + "learning_rate": 5.278513953011566e-06, + "loss": 2.0431, + "step": 25534 + }, + { + "epoch": 1.713734438441663, + "grad_norm": 3.6836235523223877, + "learning_rate": 5.273654610152884e-06, + "loss": 1.8922, + "step": 25536 + }, + { + "epoch": 1.713868662125432, + "grad_norm": 4.646451950073242, + "learning_rate": 5.268797380526064e-06, + "loss": 1.9617, + "step": 25538 + }, + { + "epoch": 1.714002885809201, + "grad_norm": 3.2290494441986084, + "learning_rate": 5.26394226436061e-06, + "loss": 1.7476, + "step": 25540 + }, + { + "epoch": 1.71413710949297, + "grad_norm": 4.434454917907715, + "learning_rate": 5.259089261885908e-06, + "loss": 1.9765, + "step": 25542 + }, + { + "epoch": 1.7142713331767392, + "grad_norm": 3.9573278427124023, + "learning_rate": 5.254238373331266e-06, + "loss": 1.7341, + "step": 25544 + }, + { + "epoch": 1.714405556860508, + "grad_norm": 3.7868690490722656, + "learning_rate": 5.249389598925869e-06, + "loss": 1.8896, + "step": 25546 + }, + { + "epoch": 1.714539780544277, + "grad_norm": 3.6896188259124756, + "learning_rate": 5.244542938898822e-06, + "loss": 1.7569, + "step": 25548 + }, + { + "epoch": 1.714674004228046, + "grad_norm": 3.9033169746398926, + "learning_rate": 5.2396983934791085e-06, + "loss": 1.8475, + "step": 25550 + }, + { + "epoch": 1.714808227911815, + "grad_norm": 4.030399322509766, + "learning_rate": 5.2348559628956185e-06, + "loss": 1.927, + "step": 25552 + }, + { + "epoch": 1.714942451595584, + "grad_norm": 4.451348304748535, + "learning_rate": 5.230015647377168e-06, + "loss": 1.9593, + "step": 25554 + }, + { + "epoch": 1.7150766752793531, + "grad_norm": 3.6220340728759766, + "learning_rate": 5.225177447152446e-06, + "loss": 2.065, + "step": 25556 + }, + { + "epoch": 1.715210898963122, + "grad_norm": 4.3985795974731445, + "learning_rate": 5.220341362450038e-06, + "loss": 1.9491, + "step": 25558 + }, + { + "epoch": 1.715345122646891, + "grad_norm": 4.12697696685791, + "learning_rate": 5.215507393498437e-06, + "loss": 1.78, + "step": 25560 + }, + { + "epoch": 1.71547934633066, + "grad_norm": 4.071809768676758, + "learning_rate": 5.2106755405260555e-06, + "loss": 1.9564, + "step": 25562 + }, + { + "epoch": 1.715613570014429, + "grad_norm": 3.9052014350891113, + "learning_rate": 5.205845803761172e-06, + "loss": 1.7033, + "step": 25564 + }, + { + "epoch": 1.7157477936981982, + "grad_norm": 4.227178573608398, + "learning_rate": 5.201018183432005e-06, + "loss": 1.7988, + "step": 25566 + }, + { + "epoch": 1.715882017381967, + "grad_norm": 3.7358410358428955, + "learning_rate": 5.196192679766626e-06, + "loss": 1.7791, + "step": 25568 + }, + { + "epoch": 1.716016241065736, + "grad_norm": 4.47592306137085, + "learning_rate": 5.191369292993048e-06, + "loss": 1.9192, + "step": 25570 + }, + { + "epoch": 1.716150464749505, + "grad_norm": 4.347191333770752, + "learning_rate": 5.186548023339144e-06, + "loss": 2.0378, + "step": 25572 + }, + { + "epoch": 1.716284688433274, + "grad_norm": 3.9377431869506836, + "learning_rate": 5.181728871032737e-06, + "loss": 1.8429, + "step": 25574 + }, + { + "epoch": 1.716418912117043, + "grad_norm": 4.22166633605957, + "learning_rate": 5.176911836301507e-06, + "loss": 1.8162, + "step": 25576 + }, + { + "epoch": 1.7165531358008121, + "grad_norm": 4.003525733947754, + "learning_rate": 5.172096919373048e-06, + "loss": 2.1273, + "step": 25578 + }, + { + "epoch": 1.716687359484581, + "grad_norm": 3.7466869354248047, + "learning_rate": 5.167284120474858e-06, + "loss": 1.7397, + "step": 25580 + }, + { + "epoch": 1.71682158316835, + "grad_norm": 4.090458393096924, + "learning_rate": 5.162473439834337e-06, + "loss": 1.8494, + "step": 25582 + }, + { + "epoch": 1.716955806852119, + "grad_norm": 4.140261650085449, + "learning_rate": 5.157664877678781e-06, + "loss": 1.7404, + "step": 25584 + }, + { + "epoch": 1.717090030535888, + "grad_norm": 3.1391828060150146, + "learning_rate": 5.15285843423538e-06, + "loss": 1.531, + "step": 25586 + }, + { + "epoch": 1.7172242542196572, + "grad_norm": 4.5247931480407715, + "learning_rate": 5.148054109731215e-06, + "loss": 1.9281, + "step": 25588 + }, + { + "epoch": 1.717358477903426, + "grad_norm": 4.179714202880859, + "learning_rate": 5.143251904393309e-06, + "loss": 2.1084, + "step": 25590 + }, + { + "epoch": 1.717492701587195, + "grad_norm": 4.291934967041016, + "learning_rate": 5.1384518184485365e-06, + "loss": 1.7868, + "step": 25592 + }, + { + "epoch": 1.717626925270964, + "grad_norm": 3.9194412231445312, + "learning_rate": 5.1336538521237045e-06, + "loss": 1.9388, + "step": 25594 + }, + { + "epoch": 1.717761148954733, + "grad_norm": 4.45451021194458, + "learning_rate": 5.128858005645504e-06, + "loss": 1.8143, + "step": 25596 + }, + { + "epoch": 1.7178953726385022, + "grad_norm": 3.9796342849731445, + "learning_rate": 5.1240642792405314e-06, + "loss": 1.9463, + "step": 25598 + }, + { + "epoch": 1.7180295963222711, + "grad_norm": 4.0908966064453125, + "learning_rate": 5.1192726731352616e-06, + "loss": 1.951, + "step": 25600 + }, + { + "epoch": 1.71816382000604, + "grad_norm": 4.177731513977051, + "learning_rate": 5.114483187556113e-06, + "loss": 1.8888, + "step": 25602 + }, + { + "epoch": 1.718298043689809, + "grad_norm": 4.234723091125488, + "learning_rate": 5.1096958227293765e-06, + "loss": 2.0338, + "step": 25604 + }, + { + "epoch": 1.718432267373578, + "grad_norm": 4.40239953994751, + "learning_rate": 5.104910578881234e-06, + "loss": 1.6849, + "step": 25606 + }, + { + "epoch": 1.718566491057347, + "grad_norm": 3.99098801612854, + "learning_rate": 5.100127456237774e-06, + "loss": 2.0096, + "step": 25608 + }, + { + "epoch": 1.7187007147411162, + "grad_norm": 3.6541831493377686, + "learning_rate": 5.095346455025013e-06, + "loss": 1.7429, + "step": 25610 + }, + { + "epoch": 1.718834938424885, + "grad_norm": 3.984395980834961, + "learning_rate": 5.09056757546883e-06, + "loss": 1.8973, + "step": 25612 + }, + { + "epoch": 1.718969162108654, + "grad_norm": 4.080272197723389, + "learning_rate": 5.085790817795016e-06, + "loss": 1.8145, + "step": 25614 + }, + { + "epoch": 1.719103385792423, + "grad_norm": 4.153635501861572, + "learning_rate": 5.08101618222927e-06, + "loss": 1.8375, + "step": 25616 + }, + { + "epoch": 1.719237609476192, + "grad_norm": 3.7388710975646973, + "learning_rate": 5.076243668997166e-06, + "loss": 1.7131, + "step": 25618 + }, + { + "epoch": 1.7193718331599612, + "grad_norm": 4.077261447906494, + "learning_rate": 5.071473278324223e-06, + "loss": 1.9473, + "step": 25620 + }, + { + "epoch": 1.7195060568437301, + "grad_norm": 4.152169227600098, + "learning_rate": 5.066705010435807e-06, + "loss": 1.9037, + "step": 25622 + }, + { + "epoch": 1.719640280527499, + "grad_norm": 4.293241500854492, + "learning_rate": 5.061938865557242e-06, + "loss": 1.6699, + "step": 25624 + }, + { + "epoch": 1.719774504211268, + "grad_norm": 4.241661071777344, + "learning_rate": 5.057174843913681e-06, + "loss": 1.9307, + "step": 25626 + }, + { + "epoch": 1.7199087278950371, + "grad_norm": 3.989133358001709, + "learning_rate": 5.05241294573024e-06, + "loss": 1.7161, + "step": 25628 + }, + { + "epoch": 1.720042951578806, + "grad_norm": 3.8347198963165283, + "learning_rate": 5.047653171231892e-06, + "loss": 1.6339, + "step": 25630 + }, + { + "epoch": 1.7201771752625752, + "grad_norm": 4.078265190124512, + "learning_rate": 5.042895520643548e-06, + "loss": 1.8103, + "step": 25632 + }, + { + "epoch": 1.720311398946344, + "grad_norm": 3.932617425918579, + "learning_rate": 5.038139994189983e-06, + "loss": 1.6714, + "step": 25634 + }, + { + "epoch": 1.720445622630113, + "grad_norm": 3.7992119789123535, + "learning_rate": 5.03338659209589e-06, + "loss": 1.8981, + "step": 25636 + }, + { + "epoch": 1.720579846313882, + "grad_norm": 4.513242721557617, + "learning_rate": 5.02863531458585e-06, + "loss": 1.866, + "step": 25638 + }, + { + "epoch": 1.720714069997651, + "grad_norm": 3.573974132537842, + "learning_rate": 5.023886161884372e-06, + "loss": 1.7908, + "step": 25640 + }, + { + "epoch": 1.7208482936814202, + "grad_norm": 4.423505783081055, + "learning_rate": 5.019139134215828e-06, + "loss": 1.9826, + "step": 25642 + }, + { + "epoch": 1.7209825173651891, + "grad_norm": 4.302817344665527, + "learning_rate": 5.014394231804503e-06, + "loss": 1.8303, + "step": 25644 + }, + { + "epoch": 1.721116741048958, + "grad_norm": 4.065127849578857, + "learning_rate": 5.009651454874587e-06, + "loss": 1.6863, + "step": 25646 + }, + { + "epoch": 1.721250964732727, + "grad_norm": 4.370792865753174, + "learning_rate": 5.004910803650181e-06, + "loss": 1.8686, + "step": 25648 + }, + { + "epoch": 1.7213851884164961, + "grad_norm": 3.9212534427642822, + "learning_rate": 5.000172278355248e-06, + "loss": 1.8792, + "step": 25650 + }, + { + "epoch": 1.721519412100265, + "grad_norm": 4.27854585647583, + "learning_rate": 4.9954358792137054e-06, + "loss": 1.9294, + "step": 25652 + }, + { + "epoch": 1.7216536357840342, + "grad_norm": 4.310137748718262, + "learning_rate": 4.9907016064493e-06, + "loss": 1.8389, + "step": 25654 + }, + { + "epoch": 1.721787859467803, + "grad_norm": 4.883481502532959, + "learning_rate": 4.98596946028575e-06, + "loss": 1.8589, + "step": 25656 + }, + { + "epoch": 1.721922083151572, + "grad_norm": 3.9852893352508545, + "learning_rate": 4.981239440946612e-06, + "loss": 1.7912, + "step": 25658 + }, + { + "epoch": 1.722056306835341, + "grad_norm": 4.288332939147949, + "learning_rate": 4.976511548655399e-06, + "loss": 1.7019, + "step": 25660 + }, + { + "epoch": 1.72219053051911, + "grad_norm": 4.48889684677124, + "learning_rate": 4.97178578363548e-06, + "loss": 2.0181, + "step": 25662 + }, + { + "epoch": 1.7223247542028792, + "grad_norm": 4.460997104644775, + "learning_rate": 4.967062146110135e-06, + "loss": 2.0069, + "step": 25664 + }, + { + "epoch": 1.7224589778866481, + "grad_norm": 3.983340263366699, + "learning_rate": 4.962340636302543e-06, + "loss": 1.9343, + "step": 25666 + }, + { + "epoch": 1.722593201570417, + "grad_norm": 4.057766914367676, + "learning_rate": 4.957621254435801e-06, + "loss": 2.0452, + "step": 25668 + }, + { + "epoch": 1.722727425254186, + "grad_norm": 3.6128575801849365, + "learning_rate": 4.952904000732883e-06, + "loss": 1.682, + "step": 25670 + }, + { + "epoch": 1.7228616489379551, + "grad_norm": 4.198038101196289, + "learning_rate": 4.948188875416671e-06, + "loss": 1.9696, + "step": 25672 + }, + { + "epoch": 1.7229958726217243, + "grad_norm": 4.526651382446289, + "learning_rate": 4.943475878709936e-06, + "loss": 1.9287, + "step": 25674 + }, + { + "epoch": 1.7231300963054932, + "grad_norm": 4.276797771453857, + "learning_rate": 4.938765010835373e-06, + "loss": 1.9983, + "step": 25676 + }, + { + "epoch": 1.723264319989262, + "grad_norm": 4.346768856048584, + "learning_rate": 4.934056272015541e-06, + "loss": 1.7894, + "step": 25678 + }, + { + "epoch": 1.723398543673031, + "grad_norm": 4.3075432777404785, + "learning_rate": 4.92934966247296e-06, + "loss": 1.8635, + "step": 25680 + }, + { + "epoch": 1.7235327673568, + "grad_norm": 4.57111930847168, + "learning_rate": 4.924645182429966e-06, + "loss": 1.6642, + "step": 25682 + }, + { + "epoch": 1.723666991040569, + "grad_norm": 3.982243299484253, + "learning_rate": 4.919942832108837e-06, + "loss": 2.1136, + "step": 25684 + }, + { + "epoch": 1.7238012147243382, + "grad_norm": 4.418928146362305, + "learning_rate": 4.915242611731774e-06, + "loss": 2.0305, + "step": 25686 + }, + { + "epoch": 1.7239354384081071, + "grad_norm": 4.015188217163086, + "learning_rate": 4.910544521520838e-06, + "loss": 1.6111, + "step": 25688 + }, + { + "epoch": 1.724069662091876, + "grad_norm": 3.8143906593322754, + "learning_rate": 4.905848561698023e-06, + "loss": 1.9039, + "step": 25690 + }, + { + "epoch": 1.724203885775645, + "grad_norm": 4.078932285308838, + "learning_rate": 4.901154732485169e-06, + "loss": 1.7635, + "step": 25692 + }, + { + "epoch": 1.7243381094594141, + "grad_norm": 4.041810512542725, + "learning_rate": 4.896463034104082e-06, + "loss": 1.8088, + "step": 25694 + }, + { + "epoch": 1.7244723331431833, + "grad_norm": 4.055402755737305, + "learning_rate": 4.891773466776417e-06, + "loss": 1.7546, + "step": 25696 + }, + { + "epoch": 1.7246065568269522, + "grad_norm": 4.099616527557373, + "learning_rate": 4.887086030723764e-06, + "loss": 1.8331, + "step": 25698 + }, + { + "epoch": 1.724740780510721, + "grad_norm": 4.285022735595703, + "learning_rate": 4.88240072616758e-06, + "loss": 1.9839, + "step": 25700 + }, + { + "epoch": 1.72487500419449, + "grad_norm": 4.10545015335083, + "learning_rate": 4.877717553329247e-06, + "loss": 1.9439, + "step": 25702 + }, + { + "epoch": 1.7250092278782592, + "grad_norm": 3.685899257659912, + "learning_rate": 4.873036512430018e-06, + "loss": 1.9802, + "step": 25704 + }, + { + "epoch": 1.725143451562028, + "grad_norm": 3.980315685272217, + "learning_rate": 4.868357603691087e-06, + "loss": 1.7135, + "step": 25706 + }, + { + "epoch": 1.7252776752457972, + "grad_norm": 4.291272163391113, + "learning_rate": 4.863680827333511e-06, + "loss": 1.8311, + "step": 25708 + }, + { + "epoch": 1.7254118989295661, + "grad_norm": 3.9932055473327637, + "learning_rate": 4.859006183578263e-06, + "loss": 1.8103, + "step": 25710 + }, + { + "epoch": 1.725546122613335, + "grad_norm": 4.1003241539001465, + "learning_rate": 4.854333672646188e-06, + "loss": 1.5873, + "step": 25712 + }, + { + "epoch": 1.725680346297104, + "grad_norm": 4.298391342163086, + "learning_rate": 4.849663294758089e-06, + "loss": 1.7136, + "step": 25714 + }, + { + "epoch": 1.7258145699808731, + "grad_norm": 3.9579379558563232, + "learning_rate": 4.844995050134604e-06, + "loss": 1.9396, + "step": 25716 + }, + { + "epoch": 1.7259487936646423, + "grad_norm": 4.364680290222168, + "learning_rate": 4.840328938996325e-06, + "loss": 1.8716, + "step": 25718 + }, + { + "epoch": 1.7260830173484112, + "grad_norm": 4.29738187789917, + "learning_rate": 4.835664961563685e-06, + "loss": 1.8501, + "step": 25720 + }, + { + "epoch": 1.72621724103218, + "grad_norm": 4.279882907867432, + "learning_rate": 4.831003118057076e-06, + "loss": 1.6892, + "step": 25722 + }, + { + "epoch": 1.726351464715949, + "grad_norm": 4.133034706115723, + "learning_rate": 4.826343408696732e-06, + "loss": 1.9089, + "step": 25724 + }, + { + "epoch": 1.7264856883997182, + "grad_norm": 4.247915744781494, + "learning_rate": 4.821685833702849e-06, + "loss": 1.9671, + "step": 25726 + }, + { + "epoch": 1.726619912083487, + "grad_norm": 4.12255859375, + "learning_rate": 4.817030393295463e-06, + "loss": 1.7585, + "step": 25728 + }, + { + "epoch": 1.7267541357672562, + "grad_norm": 3.7780039310455322, + "learning_rate": 4.812377087694547e-06, + "loss": 1.7501, + "step": 25730 + }, + { + "epoch": 1.7268883594510251, + "grad_norm": 4.247049331665039, + "learning_rate": 4.807725917119949e-06, + "loss": 1.8157, + "step": 25732 + }, + { + "epoch": 1.727022583134794, + "grad_norm": 4.175944805145264, + "learning_rate": 4.803076881791441e-06, + "loss": 1.747, + "step": 25734 + }, + { + "epoch": 1.727156806818563, + "grad_norm": 3.9780356884002686, + "learning_rate": 4.798429981928676e-06, + "loss": 1.8729, + "step": 25736 + }, + { + "epoch": 1.7272910305023321, + "grad_norm": 3.9035685062408447, + "learning_rate": 4.793785217751206e-06, + "loss": 1.7115, + "step": 25738 + }, + { + "epoch": 1.7274252541861013, + "grad_norm": 4.172904014587402, + "learning_rate": 4.7891425894784845e-06, + "loss": 1.8735, + "step": 25740 + }, + { + "epoch": 1.7275594778698702, + "grad_norm": 3.9434783458709717, + "learning_rate": 4.7845020973298795e-06, + "loss": 2.0488, + "step": 25742 + }, + { + "epoch": 1.727693701553639, + "grad_norm": 3.9278433322906494, + "learning_rate": 4.779863741524626e-06, + "loss": 2.039, + "step": 25744 + }, + { + "epoch": 1.727827925237408, + "grad_norm": 3.642120361328125, + "learning_rate": 4.775227522281911e-06, + "loss": 1.7078, + "step": 25746 + }, + { + "epoch": 1.7279621489211772, + "grad_norm": 4.337493896484375, + "learning_rate": 4.770593439820747e-06, + "loss": 1.7687, + "step": 25748 + }, + { + "epoch": 1.7280963726049463, + "grad_norm": 4.3520612716674805, + "learning_rate": 4.76596149436011e-06, + "loss": 2.0508, + "step": 25750 + }, + { + "epoch": 1.7282305962887152, + "grad_norm": 4.406829357147217, + "learning_rate": 4.761331686118848e-06, + "loss": 1.9573, + "step": 25752 + }, + { + "epoch": 1.7283648199724841, + "grad_norm": 4.337316989898682, + "learning_rate": 4.756704015315694e-06, + "loss": 1.775, + "step": 25754 + }, + { + "epoch": 1.728499043656253, + "grad_norm": 4.143853664398193, + "learning_rate": 4.7520784821693146e-06, + "loss": 1.8036, + "step": 25756 + }, + { + "epoch": 1.728633267340022, + "grad_norm": 4.128250598907471, + "learning_rate": 4.74745508689825e-06, + "loss": 1.5818, + "step": 25758 + }, + { + "epoch": 1.7287674910237911, + "grad_norm": 4.260581016540527, + "learning_rate": 4.742833829720955e-06, + "loss": 1.862, + "step": 25760 + }, + { + "epoch": 1.7289017147075603, + "grad_norm": 4.0141282081604, + "learning_rate": 4.738214710855748e-06, + "loss": 1.9118, + "step": 25762 + }, + { + "epoch": 1.7290359383913292, + "grad_norm": 4.164759635925293, + "learning_rate": 4.733597730520911e-06, + "loss": 1.793, + "step": 25764 + }, + { + "epoch": 1.729170162075098, + "grad_norm": 3.6694958209991455, + "learning_rate": 4.728982888934563e-06, + "loss": 2.0308, + "step": 25766 + }, + { + "epoch": 1.729304385758867, + "grad_norm": 4.430643558502197, + "learning_rate": 4.7243701863147525e-06, + "loss": 1.9818, + "step": 25768 + }, + { + "epoch": 1.7294386094426362, + "grad_norm": 4.266082763671875, + "learning_rate": 4.71975962287941e-06, + "loss": 2.0399, + "step": 25770 + }, + { + "epoch": 1.7295728331264053, + "grad_norm": 3.9155232906341553, + "learning_rate": 4.7151511988463955e-06, + "loss": 1.726, + "step": 25772 + }, + { + "epoch": 1.7297070568101742, + "grad_norm": 4.289565086364746, + "learning_rate": 4.7105449144334345e-06, + "loss": 2.2254, + "step": 25774 + }, + { + "epoch": 1.7298412804939431, + "grad_norm": 4.218628406524658, + "learning_rate": 4.70594076985818e-06, + "loss": 2.0123, + "step": 25776 + }, + { + "epoch": 1.729975504177712, + "grad_norm": 3.7502570152282715, + "learning_rate": 4.7013387653381424e-06, + "loss": 1.6972, + "step": 25778 + }, + { + "epoch": 1.7301097278614812, + "grad_norm": 4.590624809265137, + "learning_rate": 4.696738901090781e-06, + "loss": 1.8377, + "step": 25780 + }, + { + "epoch": 1.7302439515452501, + "grad_norm": 4.191529750823975, + "learning_rate": 4.692141177333409e-06, + "loss": 1.817, + "step": 25782 + }, + { + "epoch": 1.7303781752290193, + "grad_norm": 4.133313179016113, + "learning_rate": 4.687545594283282e-06, + "loss": 1.7712, + "step": 25784 + }, + { + "epoch": 1.7305123989127882, + "grad_norm": 4.392168045043945, + "learning_rate": 4.682952152157522e-06, + "loss": 1.7122, + "step": 25786 + }, + { + "epoch": 1.730646622596557, + "grad_norm": 4.357333660125732, + "learning_rate": 4.6783608511731635e-06, + "loss": 1.8409, + "step": 25788 + }, + { + "epoch": 1.730780846280326, + "grad_norm": 4.493144512176514, + "learning_rate": 4.673771691547124e-06, + "loss": 2.0378, + "step": 25790 + }, + { + "epoch": 1.7309150699640952, + "grad_norm": 4.046872138977051, + "learning_rate": 4.669184673496252e-06, + "loss": 1.5802, + "step": 25792 + }, + { + "epoch": 1.7310492936478643, + "grad_norm": 3.7962770462036133, + "learning_rate": 4.664599797237263e-06, + "loss": 1.6633, + "step": 25794 + }, + { + "epoch": 1.7311835173316332, + "grad_norm": 4.538661003112793, + "learning_rate": 4.660017062986782e-06, + "loss": 1.8212, + "step": 25796 + }, + { + "epoch": 1.7313177410154021, + "grad_norm": 4.4650068283081055, + "learning_rate": 4.655436470961333e-06, + "loss": 2.0392, + "step": 25798 + }, + { + "epoch": 1.731451964699171, + "grad_norm": 3.9020941257476807, + "learning_rate": 4.650858021377352e-06, + "loss": 1.6835, + "step": 25800 + }, + { + "epoch": 1.7315861883829402, + "grad_norm": 3.2646312713623047, + "learning_rate": 4.64628171445115e-06, + "loss": 1.8349, + "step": 25802 + }, + { + "epoch": 1.7317204120667091, + "grad_norm": 3.9168617725372314, + "learning_rate": 4.641707550398966e-06, + "loss": 1.6519, + "step": 25804 + }, + { + "epoch": 1.7318546357504783, + "grad_norm": 4.4771623611450195, + "learning_rate": 4.6371355294368865e-06, + "loss": 2.0235, + "step": 25806 + }, + { + "epoch": 1.7319888594342472, + "grad_norm": 4.329494476318359, + "learning_rate": 4.632565651780968e-06, + "loss": 1.9107, + "step": 25808 + }, + { + "epoch": 1.732123083118016, + "grad_norm": 4.142405986785889, + "learning_rate": 4.627997917647098e-06, + "loss": 1.7261, + "step": 25810 + }, + { + "epoch": 1.732257306801785, + "grad_norm": 5.125955104827881, + "learning_rate": 4.623432327251109e-06, + "loss": 2.0321, + "step": 25812 + }, + { + "epoch": 1.7323915304855542, + "grad_norm": 3.924088716506958, + "learning_rate": 4.618868880808725e-06, + "loss": 1.8584, + "step": 25814 + }, + { + "epoch": 1.7325257541693233, + "grad_norm": 4.340001583099365, + "learning_rate": 4.614307578535537e-06, + "loss": 1.5337, + "step": 25816 + }, + { + "epoch": 1.7326599778530922, + "grad_norm": 4.113027572631836, + "learning_rate": 4.6097484206470756e-06, + "loss": 1.9521, + "step": 25818 + }, + { + "epoch": 1.7327942015368611, + "grad_norm": 4.171198844909668, + "learning_rate": 4.605191407358733e-06, + "loss": 1.7496, + "step": 25820 + }, + { + "epoch": 1.73292842522063, + "grad_norm": 4.315971374511719, + "learning_rate": 4.600636538885844e-06, + "loss": 1.9211, + "step": 25822 + }, + { + "epoch": 1.7330626489043992, + "grad_norm": 4.475301742553711, + "learning_rate": 4.596083815443602e-06, + "loss": 2.1439, + "step": 25824 + }, + { + "epoch": 1.7331968725881683, + "grad_norm": 4.211636543273926, + "learning_rate": 4.5915332372471195e-06, + "loss": 1.9145, + "step": 25826 + }, + { + "epoch": 1.7333310962719373, + "grad_norm": 3.890170097351074, + "learning_rate": 4.586984804511385e-06, + "loss": 1.707, + "step": 25828 + }, + { + "epoch": 1.7334653199557062, + "grad_norm": 4.433126449584961, + "learning_rate": 4.5824385174513316e-06, + "loss": 2.0919, + "step": 25830 + }, + { + "epoch": 1.733599543639475, + "grad_norm": 3.943657159805298, + "learning_rate": 4.577894376281744e-06, + "loss": 1.6391, + "step": 25832 + }, + { + "epoch": 1.733733767323244, + "grad_norm": 4.563302040100098, + "learning_rate": 4.573352381217333e-06, + "loss": 1.8623, + "step": 25834 + }, + { + "epoch": 1.7338679910070132, + "grad_norm": 4.347084045410156, + "learning_rate": 4.568812532472683e-06, + "loss": 1.9469, + "step": 25836 + }, + { + "epoch": 1.7340022146907823, + "grad_norm": 4.389248371124268, + "learning_rate": 4.564274830262316e-06, + "loss": 1.9683, + "step": 25838 + }, + { + "epoch": 1.7341364383745512, + "grad_norm": 4.5127482414245605, + "learning_rate": 4.559739274800606e-06, + "loss": 1.898, + "step": 25840 + }, + { + "epoch": 1.7342706620583201, + "grad_norm": 4.110467433929443, + "learning_rate": 4.555205866301876e-06, + "loss": 1.6499, + "step": 25842 + }, + { + "epoch": 1.734404885742089, + "grad_norm": 4.244119167327881, + "learning_rate": 4.5506746049802925e-06, + "loss": 2.0614, + "step": 25844 + }, + { + "epoch": 1.7345391094258582, + "grad_norm": 4.121755599975586, + "learning_rate": 4.546145491049969e-06, + "loss": 1.8056, + "step": 25846 + }, + { + "epoch": 1.7346733331096273, + "grad_norm": 4.008800983428955, + "learning_rate": 4.541618524724878e-06, + "loss": 1.7683, + "step": 25848 + }, + { + "epoch": 1.7348075567933963, + "grad_norm": 4.589218616485596, + "learning_rate": 4.537093706218937e-06, + "loss": 1.7758, + "step": 25850 + }, + { + "epoch": 1.7349417804771652, + "grad_norm": 3.589454174041748, + "learning_rate": 4.532571035745914e-06, + "loss": 1.6857, + "step": 25852 + }, + { + "epoch": 1.735076004160934, + "grad_norm": 4.079255104064941, + "learning_rate": 4.528050513519505e-06, + "loss": 1.8458, + "step": 25854 + }, + { + "epoch": 1.7352102278447032, + "grad_norm": 3.620393991470337, + "learning_rate": 4.523532139753278e-06, + "loss": 1.8165, + "step": 25856 + }, + { + "epoch": 1.7353444515284722, + "grad_norm": 4.3750481605529785, + "learning_rate": 4.519015914660746e-06, + "loss": 1.8182, + "step": 25858 + }, + { + "epoch": 1.7354786752122413, + "grad_norm": 4.665088653564453, + "learning_rate": 4.514501838455276e-06, + "loss": 1.7632, + "step": 25860 + }, + { + "epoch": 1.7356128988960102, + "grad_norm": 4.333638668060303, + "learning_rate": 4.5099899113501545e-06, + "loss": 1.7523, + "step": 25862 + }, + { + "epoch": 1.7357471225797791, + "grad_norm": 3.863891839981079, + "learning_rate": 4.505480133558543e-06, + "loss": 1.8716, + "step": 25864 + }, + { + "epoch": 1.735881346263548, + "grad_norm": 4.52387809753418, + "learning_rate": 4.500972505293544e-06, + "loss": 1.7398, + "step": 25866 + }, + { + "epoch": 1.7360155699473172, + "grad_norm": 3.8139188289642334, + "learning_rate": 4.4964670267681146e-06, + "loss": 1.8072, + "step": 25868 + }, + { + "epoch": 1.7361497936310863, + "grad_norm": 3.8125035762786865, + "learning_rate": 4.491963698195162e-06, + "loss": 1.7072, + "step": 25870 + }, + { + "epoch": 1.7362840173148553, + "grad_norm": 4.1992363929748535, + "learning_rate": 4.487462519787416e-06, + "loss": 1.7327, + "step": 25872 + }, + { + "epoch": 1.7364182409986242, + "grad_norm": 4.459270477294922, + "learning_rate": 4.482963491757574e-06, + "loss": 2.1629, + "step": 25874 + }, + { + "epoch": 1.736552464682393, + "grad_norm": 4.001871585845947, + "learning_rate": 4.4784666143181996e-06, + "loss": 1.9346, + "step": 25876 + }, + { + "epoch": 1.7366866883661622, + "grad_norm": 4.423608303070068, + "learning_rate": 4.473971887681772e-06, + "loss": 1.9892, + "step": 25878 + }, + { + "epoch": 1.7368209120499312, + "grad_norm": 4.575361251831055, + "learning_rate": 4.469479312060643e-06, + "loss": 1.7223, + "step": 25880 + }, + { + "epoch": 1.7369551357337003, + "grad_norm": 4.064820766448975, + "learning_rate": 4.464988887667087e-06, + "loss": 1.6706, + "step": 25882 + }, + { + "epoch": 1.7370893594174692, + "grad_norm": 4.399496555328369, + "learning_rate": 4.460500614713259e-06, + "loss": 1.996, + "step": 25884 + }, + { + "epoch": 1.7372235831012381, + "grad_norm": 3.7568445205688477, + "learning_rate": 4.456014493411237e-06, + "loss": 1.9589, + "step": 25886 + }, + { + "epoch": 1.737357806785007, + "grad_norm": 3.8717925548553467, + "learning_rate": 4.4515305239729685e-06, + "loss": 1.9456, + "step": 25888 + }, + { + "epoch": 1.7374920304687762, + "grad_norm": 4.093155384063721, + "learning_rate": 4.447048706610318e-06, + "loss": 1.7149, + "step": 25890 + }, + { + "epoch": 1.7376262541525453, + "grad_norm": 4.071346759796143, + "learning_rate": 4.442569041535039e-06, + "loss": 1.8001, + "step": 25892 + }, + { + "epoch": 1.7377604778363143, + "grad_norm": 4.546091079711914, + "learning_rate": 4.4380915289587825e-06, + "loss": 2.0193, + "step": 25894 + }, + { + "epoch": 1.7378947015200832, + "grad_norm": 4.002737522125244, + "learning_rate": 4.433616169093113e-06, + "loss": 2.0014, + "step": 25896 + }, + { + "epoch": 1.738028925203852, + "grad_norm": 4.204023838043213, + "learning_rate": 4.429142962149474e-06, + "loss": 2.194, + "step": 25898 + }, + { + "epoch": 1.7381631488876212, + "grad_norm": 4.025143623352051, + "learning_rate": 4.424671908339223e-06, + "loss": 1.7005, + "step": 25900 + }, + { + "epoch": 1.7382973725713904, + "grad_norm": 3.710338592529297, + "learning_rate": 4.420203007873597e-06, + "loss": 2.0669, + "step": 25902 + }, + { + "epoch": 1.7384315962551593, + "grad_norm": 4.726452827453613, + "learning_rate": 4.415736260963755e-06, + "loss": 1.862, + "step": 25904 + }, + { + "epoch": 1.7385658199389282, + "grad_norm": 4.1309099197387695, + "learning_rate": 4.411271667820727e-06, + "loss": 1.6096, + "step": 25906 + }, + { + "epoch": 1.7387000436226971, + "grad_norm": 3.9148924350738525, + "learning_rate": 4.406809228655479e-06, + "loss": 1.7516, + "step": 25908 + }, + { + "epoch": 1.738834267306466, + "grad_norm": 6.089382171630859, + "learning_rate": 4.4023489436788355e-06, + "loss": 2.0491, + "step": 25910 + }, + { + "epoch": 1.7389684909902352, + "grad_norm": 3.9746193885803223, + "learning_rate": 4.397890813101546e-06, + "loss": 2.0028, + "step": 25912 + }, + { + "epoch": 1.7391027146740043, + "grad_norm": 4.053095817565918, + "learning_rate": 4.39343483713423e-06, + "loss": 1.8132, + "step": 25914 + }, + { + "epoch": 1.7392369383577733, + "grad_norm": 4.131493091583252, + "learning_rate": 4.388981015987448e-06, + "loss": 1.9097, + "step": 25916 + }, + { + "epoch": 1.7393711620415422, + "grad_norm": 4.0423126220703125, + "learning_rate": 4.384529349871625e-06, + "loss": 1.7915, + "step": 25918 + }, + { + "epoch": 1.739505385725311, + "grad_norm": 4.102564334869385, + "learning_rate": 4.380079838997086e-06, + "loss": 1.6927, + "step": 25920 + }, + { + "epoch": 1.7396396094090802, + "grad_norm": 4.54093074798584, + "learning_rate": 4.37563248357406e-06, + "loss": 2.0529, + "step": 25922 + }, + { + "epoch": 1.7397738330928494, + "grad_norm": 3.9899630546569824, + "learning_rate": 4.371187283812689e-06, + "loss": 1.6278, + "step": 25924 + }, + { + "epoch": 1.7399080567766183, + "grad_norm": 3.943861722946167, + "learning_rate": 4.366744239922998e-06, + "loss": 1.86, + "step": 25926 + }, + { + "epoch": 1.7400422804603872, + "grad_norm": 4.361581802368164, + "learning_rate": 4.362303352114905e-06, + "loss": 1.8654, + "step": 25928 + }, + { + "epoch": 1.7401765041441561, + "grad_norm": 4.550539016723633, + "learning_rate": 4.357864620598229e-06, + "loss": 1.838, + "step": 25930 + }, + { + "epoch": 1.7403107278279253, + "grad_norm": 3.333662986755371, + "learning_rate": 4.3534280455827024e-06, + "loss": 1.6218, + "step": 25932 + }, + { + "epoch": 1.7404449515116942, + "grad_norm": 3.781280994415283, + "learning_rate": 4.348993627277936e-06, + "loss": 1.832, + "step": 25934 + }, + { + "epoch": 1.7405791751954633, + "grad_norm": 3.733417272567749, + "learning_rate": 4.3445613658934624e-06, + "loss": 1.9231, + "step": 25936 + }, + { + "epoch": 1.7407133988792323, + "grad_norm": 4.144769668579102, + "learning_rate": 4.340131261638686e-06, + "loss": 1.9881, + "step": 25938 + }, + { + "epoch": 1.7408476225630012, + "grad_norm": 3.847315788269043, + "learning_rate": 4.335703314722916e-06, + "loss": 1.6159, + "step": 25940 + }, + { + "epoch": 1.74098184624677, + "grad_norm": 4.5029215812683105, + "learning_rate": 4.331277525355365e-06, + "loss": 1.7614, + "step": 25942 + }, + { + "epoch": 1.7411160699305392, + "grad_norm": 4.2237629890441895, + "learning_rate": 4.326853893745153e-06, + "loss": 1.6507, + "step": 25944 + }, + { + "epoch": 1.7412502936143084, + "grad_norm": 4.099926948547363, + "learning_rate": 4.3224324201012854e-06, + "loss": 1.7763, + "step": 25946 + }, + { + "epoch": 1.7413845172980773, + "grad_norm": 4.059402942657471, + "learning_rate": 4.3180131046326626e-06, + "loss": 1.754, + "step": 25948 + }, + { + "epoch": 1.7415187409818462, + "grad_norm": 4.263644218444824, + "learning_rate": 4.313595947548082e-06, + "loss": 1.996, + "step": 25950 + }, + { + "epoch": 1.7416529646656151, + "grad_norm": 4.12902307510376, + "learning_rate": 4.309180949056269e-06, + "loss": 1.8241, + "step": 25952 + }, + { + "epoch": 1.7417871883493843, + "grad_norm": 4.249416828155518, + "learning_rate": 4.304768109365792e-06, + "loss": 1.7259, + "step": 25954 + }, + { + "epoch": 1.7419214120331532, + "grad_norm": 4.521233081817627, + "learning_rate": 4.300357428685187e-06, + "loss": 2.0509, + "step": 25956 + }, + { + "epoch": 1.7420556357169223, + "grad_norm": 4.268134593963623, + "learning_rate": 4.295948907222824e-06, + "loss": 1.7017, + "step": 25958 + }, + { + "epoch": 1.7421898594006913, + "grad_norm": 4.314799785614014, + "learning_rate": 4.2915425451869884e-06, + "loss": 1.8532, + "step": 25960 + }, + { + "epoch": 1.7423240830844602, + "grad_norm": 4.681378364562988, + "learning_rate": 4.287138342785896e-06, + "loss": 1.767, + "step": 25962 + }, + { + "epoch": 1.742458306768229, + "grad_norm": 4.044734477996826, + "learning_rate": 4.2827363002276135e-06, + "loss": 1.9531, + "step": 25964 + }, + { + "epoch": 1.7425925304519982, + "grad_norm": 4.834097385406494, + "learning_rate": 4.278336417720163e-06, + "loss": 2.0666, + "step": 25966 + }, + { + "epoch": 1.7427267541357674, + "grad_norm": 3.926182508468628, + "learning_rate": 4.273938695471391e-06, + "loss": 2.1471, + "step": 25968 + }, + { + "epoch": 1.7428609778195363, + "grad_norm": 3.6063051223754883, + "learning_rate": 4.269543133689108e-06, + "loss": 1.7521, + "step": 25970 + }, + { + "epoch": 1.7429952015033052, + "grad_norm": 3.818572998046875, + "learning_rate": 4.265149732580981e-06, + "loss": 1.7523, + "step": 25972 + }, + { + "epoch": 1.7431294251870741, + "grad_norm": 3.931766986846924, + "learning_rate": 4.2607584923546e-06, + "loss": 1.765, + "step": 25974 + }, + { + "epoch": 1.7432636488708433, + "grad_norm": 4.45953893661499, + "learning_rate": 4.256369413217442e-06, + "loss": 2.0467, + "step": 25976 + }, + { + "epoch": 1.7433978725546124, + "grad_norm": 3.9176180362701416, + "learning_rate": 4.251982495376872e-06, + "loss": 1.6961, + "step": 25978 + }, + { + "epoch": 1.7435320962383813, + "grad_norm": 4.674042224884033, + "learning_rate": 4.247597739040166e-06, + "loss": 1.8021, + "step": 25980 + }, + { + "epoch": 1.7436663199221503, + "grad_norm": 4.521872520446777, + "learning_rate": 4.2432151444145085e-06, + "loss": 1.7814, + "step": 25982 + }, + { + "epoch": 1.7438005436059192, + "grad_norm": 4.184162616729736, + "learning_rate": 4.238834711706952e-06, + "loss": 1.8995, + "step": 25984 + }, + { + "epoch": 1.743934767289688, + "grad_norm": 3.845735788345337, + "learning_rate": 4.234456441124474e-06, + "loss": 1.8172, + "step": 25986 + }, + { + "epoch": 1.7440689909734572, + "grad_norm": 3.804110288619995, + "learning_rate": 4.230080332873926e-06, + "loss": 1.775, + "step": 25988 + }, + { + "epoch": 1.7442032146572264, + "grad_norm": 4.107080459594727, + "learning_rate": 4.225706387162087e-06, + "loss": 1.7747, + "step": 25990 + }, + { + "epoch": 1.7443374383409953, + "grad_norm": 4.452687740325928, + "learning_rate": 4.221334604195604e-06, + "loss": 1.9282, + "step": 25992 + }, + { + "epoch": 1.7444716620247642, + "grad_norm": 4.170470714569092, + "learning_rate": 4.21696498418106e-06, + "loss": 1.6204, + "step": 25994 + }, + { + "epoch": 1.7446058857085331, + "grad_norm": 4.121819972991943, + "learning_rate": 4.212597527324869e-06, + "loss": 1.8622, + "step": 25996 + }, + { + "epoch": 1.7447401093923023, + "grad_norm": 4.22740364074707, + "learning_rate": 4.2082322338334204e-06, + "loss": 1.9767, + "step": 25998 + }, + { + "epoch": 1.7448743330760714, + "grad_norm": 3.7462289333343506, + "learning_rate": 4.203869103912944e-06, + "loss": 1.6728, + "step": 26000 + }, + { + "epoch": 1.7450085567598403, + "grad_norm": 4.571611404418945, + "learning_rate": 4.1995081377696035e-06, + "loss": 1.9267, + "step": 26002 + }, + { + "epoch": 1.7451427804436093, + "grad_norm": 3.993023157119751, + "learning_rate": 4.195149335609444e-06, + "loss": 1.9362, + "step": 26004 + }, + { + "epoch": 1.7452770041273782, + "grad_norm": 3.8632395267486572, + "learning_rate": 4.190792697638407e-06, + "loss": 1.6966, + "step": 26006 + }, + { + "epoch": 1.7454112278111473, + "grad_norm": 4.213405132293701, + "learning_rate": 4.186438224062322e-06, + "loss": 1.8972, + "step": 26008 + }, + { + "epoch": 1.7455454514949162, + "grad_norm": 3.9561104774475098, + "learning_rate": 4.182085915086958e-06, + "loss": 1.8087, + "step": 26010 + }, + { + "epoch": 1.7456796751786854, + "grad_norm": 3.5247533321380615, + "learning_rate": 4.1777357709179345e-06, + "loss": 1.9529, + "step": 26012 + }, + { + "epoch": 1.7458138988624543, + "grad_norm": 4.3585286140441895, + "learning_rate": 4.1733877917607914e-06, + "loss": 2.3434, + "step": 26014 + }, + { + "epoch": 1.7459481225462232, + "grad_norm": 3.969076633453369, + "learning_rate": 4.169041977820948e-06, + "loss": 1.8978, + "step": 26016 + }, + { + "epoch": 1.7460823462299921, + "grad_norm": 3.9882876873016357, + "learning_rate": 4.164698329303762e-06, + "loss": 1.8903, + "step": 26018 + }, + { + "epoch": 1.7462165699137613, + "grad_norm": 4.006845951080322, + "learning_rate": 4.1603568464144375e-06, + "loss": 1.7834, + "step": 26020 + }, + { + "epoch": 1.7463507935975304, + "grad_norm": 3.5025079250335693, + "learning_rate": 4.156017529358131e-06, + "loss": 1.8224, + "step": 26022 + }, + { + "epoch": 1.7464850172812993, + "grad_norm": 3.8607940673828125, + "learning_rate": 4.151680378339834e-06, + "loss": 1.7731, + "step": 26024 + }, + { + "epoch": 1.7466192409650683, + "grad_norm": 3.98307466506958, + "learning_rate": 4.147345393564478e-06, + "loss": 1.8124, + "step": 26026 + }, + { + "epoch": 1.7467534646488372, + "grad_norm": 4.1842522621154785, + "learning_rate": 4.1430125752368986e-06, + "loss": 1.6547, + "step": 26028 + }, + { + "epoch": 1.7468876883326063, + "grad_norm": 3.768630027770996, + "learning_rate": 4.138681923561788e-06, + "loss": 1.919, + "step": 26030 + }, + { + "epoch": 1.7470219120163752, + "grad_norm": 4.458158016204834, + "learning_rate": 4.134353438743793e-06, + "loss": 1.9775, + "step": 26032 + }, + { + "epoch": 1.7471561357001444, + "grad_norm": 7.966186046600342, + "learning_rate": 4.13002712098739e-06, + "loss": 1.929, + "step": 26034 + }, + { + "epoch": 1.7472903593839133, + "grad_norm": 3.991698980331421, + "learning_rate": 4.125702970497014e-06, + "loss": 1.8886, + "step": 26036 + }, + { + "epoch": 1.7474245830676822, + "grad_norm": 3.862790107727051, + "learning_rate": 4.121380987476953e-06, + "loss": 1.8226, + "step": 26038 + }, + { + "epoch": 1.7475588067514511, + "grad_norm": 4.212493419647217, + "learning_rate": 4.117061172131437e-06, + "loss": 1.8027, + "step": 26040 + }, + { + "epoch": 1.7476930304352203, + "grad_norm": 3.9570353031158447, + "learning_rate": 4.112743524664553e-06, + "loss": 1.6215, + "step": 26042 + }, + { + "epoch": 1.7478272541189894, + "grad_norm": 3.7357730865478516, + "learning_rate": 4.108428045280305e-06, + "loss": 1.8117, + "step": 26044 + }, + { + "epoch": 1.7479614778027583, + "grad_norm": 3.731083631515503, + "learning_rate": 4.104114734182584e-06, + "loss": 1.9124, + "step": 26046 + }, + { + "epoch": 1.7480957014865273, + "grad_norm": 3.5945515632629395, + "learning_rate": 4.0998035915751965e-06, + "loss": 1.6959, + "step": 26048 + }, + { + "epoch": 1.7482299251702962, + "grad_norm": 4.270467758178711, + "learning_rate": 4.095494617661833e-06, + "loss": 1.76, + "step": 26050 + }, + { + "epoch": 1.7483641488540653, + "grad_norm": 5.131415843963623, + "learning_rate": 4.0911878126460805e-06, + "loss": 1.9247, + "step": 26052 + }, + { + "epoch": 1.7484983725378345, + "grad_norm": 4.146245002746582, + "learning_rate": 4.086883176731415e-06, + "loss": 1.7614, + "step": 26054 + }, + { + "epoch": 1.7486325962216034, + "grad_norm": 4.091569423675537, + "learning_rate": 4.082580710121248e-06, + "loss": 1.6214, + "step": 26056 + }, + { + "epoch": 1.7487668199053723, + "grad_norm": 4.190357208251953, + "learning_rate": 4.078280413018843e-06, + "loss": 1.7874, + "step": 26058 + }, + { + "epoch": 1.7489010435891412, + "grad_norm": 4.059874057769775, + "learning_rate": 4.0739822856273925e-06, + "loss": 1.7295, + "step": 26060 + }, + { + "epoch": 1.7490352672729101, + "grad_norm": 4.395050525665283, + "learning_rate": 4.069686328149969e-06, + "loss": 1.9259, + "step": 26062 + }, + { + "epoch": 1.7491694909566793, + "grad_norm": 4.220469951629639, + "learning_rate": 4.065392540789553e-06, + "loss": 1.6898, + "step": 26064 + }, + { + "epoch": 1.7493037146404484, + "grad_norm": 4.424079895019531, + "learning_rate": 4.061100923749001e-06, + "loss": 1.9068, + "step": 26066 + }, + { + "epoch": 1.7494379383242173, + "grad_norm": 5.521829605102539, + "learning_rate": 4.0568114772311035e-06, + "loss": 1.9848, + "step": 26068 + }, + { + "epoch": 1.7495721620079863, + "grad_norm": 4.025507926940918, + "learning_rate": 4.052524201438523e-06, + "loss": 1.856, + "step": 26070 + }, + { + "epoch": 1.7497063856917552, + "grad_norm": 4.216867446899414, + "learning_rate": 4.048239096573819e-06, + "loss": 2.107, + "step": 26072 + }, + { + "epoch": 1.7498406093755243, + "grad_norm": 3.6347103118896484, + "learning_rate": 4.043956162839452e-06, + "loss": 1.6981, + "step": 26074 + }, + { + "epoch": 1.7499748330592935, + "grad_norm": 3.94718599319458, + "learning_rate": 4.0396754004377925e-06, + "loss": 1.7736, + "step": 26076 + }, + { + "epoch": 1.7501090567430624, + "grad_norm": 4.325935363769531, + "learning_rate": 4.035396809571096e-06, + "loss": 1.803, + "step": 26078 + }, + { + "epoch": 1.7502432804268313, + "grad_norm": 4.17030668258667, + "learning_rate": 4.0311203904415116e-06, + "loss": 1.8712, + "step": 26080 + }, + { + "epoch": 1.7503775041106002, + "grad_norm": 4.101718902587891, + "learning_rate": 4.02684614325109e-06, + "loss": 1.7192, + "step": 26082 + }, + { + "epoch": 1.7505117277943694, + "grad_norm": 3.907712697982788, + "learning_rate": 4.02257406820179e-06, + "loss": 1.9424, + "step": 26084 + }, + { + "epoch": 1.7506459514781383, + "grad_norm": 4.130181312561035, + "learning_rate": 4.0183041654954515e-06, + "loss": 1.6828, + "step": 26086 + }, + { + "epoch": 1.7507801751619074, + "grad_norm": 4.87315034866333, + "learning_rate": 4.0140364353338286e-06, + "loss": 1.7597, + "step": 26088 + }, + { + "epoch": 1.7509143988456763, + "grad_norm": 3.39277982711792, + "learning_rate": 4.009770877918556e-06, + "loss": 1.62, + "step": 26090 + }, + { + "epoch": 1.7510486225294453, + "grad_norm": 4.212294101715088, + "learning_rate": 4.005507493451171e-06, + "loss": 2.0642, + "step": 26092 + }, + { + "epoch": 1.7511828462132142, + "grad_norm": 4.154381275177002, + "learning_rate": 4.0012462821331145e-06, + "loss": 1.8075, + "step": 26094 + }, + { + "epoch": 1.7513170698969833, + "grad_norm": 3.6769092082977295, + "learning_rate": 3.996987244165712e-06, + "loss": 1.8373, + "step": 26096 + }, + { + "epoch": 1.7514512935807525, + "grad_norm": 4.142845630645752, + "learning_rate": 3.9927303797502096e-06, + "loss": 1.6704, + "step": 26098 + }, + { + "epoch": 1.7515855172645214, + "grad_norm": 4.130332946777344, + "learning_rate": 3.988475689087723e-06, + "loss": 1.9098, + "step": 26100 + }, + { + "epoch": 1.7517197409482903, + "grad_norm": 4.278568744659424, + "learning_rate": 3.984223172379287e-06, + "loss": 1.9389, + "step": 26102 + }, + { + "epoch": 1.7518539646320592, + "grad_norm": 3.8688905239105225, + "learning_rate": 3.979972829825807e-06, + "loss": 1.8927, + "step": 26104 + }, + { + "epoch": 1.7519881883158284, + "grad_norm": 4.737236976623535, + "learning_rate": 3.975724661628128e-06, + "loss": 1.868, + "step": 26106 + }, + { + "epoch": 1.7521224119995973, + "grad_norm": 4.537309646606445, + "learning_rate": 3.971478667986955e-06, + "loss": 1.8904, + "step": 26108 + }, + { + "epoch": 1.7522566356833664, + "grad_norm": 4.775686740875244, + "learning_rate": 3.967234849102907e-06, + "loss": 2.1245, + "step": 26110 + }, + { + "epoch": 1.7523908593671353, + "grad_norm": 3.9711899757385254, + "learning_rate": 3.962993205176479e-06, + "loss": 1.8139, + "step": 26112 + }, + { + "epoch": 1.7525250830509043, + "grad_norm": 4.54011344909668, + "learning_rate": 3.958753736408105e-06, + "loss": 2.0604, + "step": 26114 + }, + { + "epoch": 1.7526593067346732, + "grad_norm": 4.279061317443848, + "learning_rate": 3.954516442998074e-06, + "loss": 1.7323, + "step": 26116 + }, + { + "epoch": 1.7527935304184423, + "grad_norm": 4.025203227996826, + "learning_rate": 3.9502813251466096e-06, + "loss": 1.7861, + "step": 26118 + }, + { + "epoch": 1.7529277541022115, + "grad_norm": 4.045186996459961, + "learning_rate": 3.946048383053786e-06, + "loss": 2.2523, + "step": 26120 + }, + { + "epoch": 1.7530619777859804, + "grad_norm": 3.284811019897461, + "learning_rate": 3.941817616919624e-06, + "loss": 1.7048, + "step": 26122 + }, + { + "epoch": 1.7531962014697493, + "grad_norm": 3.6162478923797607, + "learning_rate": 3.937589026944e-06, + "loss": 1.9864, + "step": 26124 + }, + { + "epoch": 1.7533304251535182, + "grad_norm": 4.2072649002075195, + "learning_rate": 3.9333626133267244e-06, + "loss": 2.0893, + "step": 26126 + }, + { + "epoch": 1.7534646488372874, + "grad_norm": 4.0069804191589355, + "learning_rate": 3.929138376267477e-06, + "loss": 2.0933, + "step": 26128 + }, + { + "epoch": 1.7535988725210565, + "grad_norm": 5.723423957824707, + "learning_rate": 3.924916315965854e-06, + "loss": 1.932, + "step": 26130 + }, + { + "epoch": 1.7537330962048254, + "grad_norm": 4.822319030761719, + "learning_rate": 3.920696432621318e-06, + "loss": 1.8895, + "step": 26132 + }, + { + "epoch": 1.7538673198885943, + "grad_norm": 4.173018455505371, + "learning_rate": 3.916478726433276e-06, + "loss": 2.102, + "step": 26134 + }, + { + "epoch": 1.7540015435723633, + "grad_norm": 4.118551731109619, + "learning_rate": 3.91226319760099e-06, + "loss": 1.8914, + "step": 26136 + }, + { + "epoch": 1.7541357672561322, + "grad_norm": 4.00869607925415, + "learning_rate": 3.908049846323647e-06, + "loss": 2.174, + "step": 26138 + }, + { + "epoch": 1.7542699909399013, + "grad_norm": 3.9800565242767334, + "learning_rate": 3.903838672800297e-06, + "loss": 1.9782, + "step": 26140 + }, + { + "epoch": 1.7544042146236705, + "grad_norm": 4.210039138793945, + "learning_rate": 3.899629677229943e-06, + "loss": 2.2886, + "step": 26142 + }, + { + "epoch": 1.7545384383074394, + "grad_norm": 4.777013778686523, + "learning_rate": 3.89542285981142e-06, + "loss": 1.6869, + "step": 26144 + }, + { + "epoch": 1.7546726619912083, + "grad_norm": 4.562379837036133, + "learning_rate": 3.89121822074352e-06, + "loss": 1.8426, + "step": 26146 + }, + { + "epoch": 1.7548068856749772, + "grad_norm": 3.8426730632781982, + "learning_rate": 3.887015760224877e-06, + "loss": 1.9006, + "step": 26148 + }, + { + "epoch": 1.7549411093587464, + "grad_norm": 4.134133338928223, + "learning_rate": 3.8828154784540714e-06, + "loss": 1.716, + "step": 26150 + }, + { + "epoch": 1.7550753330425155, + "grad_norm": 4.106589317321777, + "learning_rate": 3.87861737562954e-06, + "loss": 2.0273, + "step": 26152 + }, + { + "epoch": 1.7552095567262844, + "grad_norm": 4.338353157043457, + "learning_rate": 3.87442145194965e-06, + "loss": 2.1753, + "step": 26154 + }, + { + "epoch": 1.7553437804100533, + "grad_norm": 4.206090450286865, + "learning_rate": 3.87022770761265e-06, + "loss": 1.7437, + "step": 26156 + }, + { + "epoch": 1.7554780040938223, + "grad_norm": 4.373834609985352, + "learning_rate": 3.866036142816676e-06, + "loss": 1.8041, + "step": 26158 + }, + { + "epoch": 1.7556122277775914, + "grad_norm": 3.653142213821411, + "learning_rate": 3.861846757759768e-06, + "loss": 1.8297, + "step": 26160 + }, + { + "epoch": 1.7557464514613603, + "grad_norm": 6.739853858947754, + "learning_rate": 3.857659552639881e-06, + "loss": 1.9602, + "step": 26162 + }, + { + "epoch": 1.7558806751451295, + "grad_norm": 4.4350361824035645, + "learning_rate": 3.853474527654849e-06, + "loss": 1.9812, + "step": 26164 + }, + { + "epoch": 1.7560148988288984, + "grad_norm": 3.4897913932800293, + "learning_rate": 3.849291683002398e-06, + "loss": 1.7336, + "step": 26166 + }, + { + "epoch": 1.7561491225126673, + "grad_norm": 4.04252815246582, + "learning_rate": 3.845111018880165e-06, + "loss": 1.8748, + "step": 26168 + }, + { + "epoch": 1.7562833461964362, + "grad_norm": 3.948110580444336, + "learning_rate": 3.84093253548567e-06, + "loss": 1.6969, + "step": 26170 + }, + { + "epoch": 1.7564175698802054, + "grad_norm": 3.832934856414795, + "learning_rate": 3.836756233016348e-06, + "loss": 1.9365, + "step": 26172 + }, + { + "epoch": 1.7565517935639745, + "grad_norm": 4.149256706237793, + "learning_rate": 3.832582111669525e-06, + "loss": 1.7364, + "step": 26174 + }, + { + "epoch": 1.7566860172477434, + "grad_norm": 3.7409207820892334, + "learning_rate": 3.828410171642404e-06, + "loss": 1.8439, + "step": 26176 + }, + { + "epoch": 1.7568202409315123, + "grad_norm": 3.6273186206817627, + "learning_rate": 3.824240413132107e-06, + "loss": 1.8312, + "step": 26178 + }, + { + "epoch": 1.7569544646152813, + "grad_norm": 3.9888153076171875, + "learning_rate": 3.820072836335658e-06, + "loss": 1.6941, + "step": 26180 + }, + { + "epoch": 1.7570886882990504, + "grad_norm": 4.51741361618042, + "learning_rate": 3.815907441449951e-06, + "loss": 1.8836, + "step": 26182 + }, + { + "epoch": 1.7572229119828193, + "grad_norm": 4.52492618560791, + "learning_rate": 3.811744228671815e-06, + "loss": 1.9042, + "step": 26184 + }, + { + "epoch": 1.7573571356665885, + "grad_norm": 4.260329723358154, + "learning_rate": 3.807583198197923e-06, + "loss": 1.9174, + "step": 26186 + }, + { + "epoch": 1.7574913593503574, + "grad_norm": 4.048501491546631, + "learning_rate": 3.8034243502249045e-06, + "loss": 1.8281, + "step": 26188 + }, + { + "epoch": 1.7576255830341263, + "grad_norm": 4.182744979858398, + "learning_rate": 3.799267684949226e-06, + "loss": 2.0487, + "step": 26190 + }, + { + "epoch": 1.7577598067178952, + "grad_norm": 3.849097490310669, + "learning_rate": 3.795113202567313e-06, + "loss": 1.6232, + "step": 26192 + }, + { + "epoch": 1.7578940304016644, + "grad_norm": 2.8776729106903076, + "learning_rate": 3.7909609032754424e-06, + "loss": 1.6033, + "step": 26194 + }, + { + "epoch": 1.7580282540854335, + "grad_norm": 4.99044132232666, + "learning_rate": 3.786810787269807e-06, + "loss": 1.8262, + "step": 26196 + }, + { + "epoch": 1.7581624777692024, + "grad_norm": 3.8424618244171143, + "learning_rate": 3.782662854746477e-06, + "loss": 2.0767, + "step": 26198 + }, + { + "epoch": 1.7582967014529713, + "grad_norm": 4.160728931427002, + "learning_rate": 3.7785171059014468e-06, + "loss": 1.753, + "step": 26200 + }, + { + "epoch": 1.7584309251367403, + "grad_norm": 3.795523166656494, + "learning_rate": 3.7743735409305982e-06, + "loss": 1.7938, + "step": 26202 + }, + { + "epoch": 1.7585651488205094, + "grad_norm": 4.023842811584473, + "learning_rate": 3.7702321600297017e-06, + "loss": 1.86, + "step": 26204 + }, + { + "epoch": 1.7586993725042785, + "grad_norm": 3.653348922729492, + "learning_rate": 3.7660929633944186e-06, + "loss": 1.7021, + "step": 26206 + }, + { + "epoch": 1.7588335961880475, + "grad_norm": 4.261740684509277, + "learning_rate": 3.7619559512203363e-06, + "loss": 1.6188, + "step": 26208 + }, + { + "epoch": 1.7589678198718164, + "grad_norm": 4.639176368713379, + "learning_rate": 3.757821123702904e-06, + "loss": 2.0174, + "step": 26210 + }, + { + "epoch": 1.7591020435555853, + "grad_norm": 3.965599298477173, + "learning_rate": 3.7536884810375094e-06, + "loss": 1.883, + "step": 26212 + }, + { + "epoch": 1.7592362672393542, + "grad_norm": 4.197188377380371, + "learning_rate": 3.7495580234193804e-06, + "loss": 1.9081, + "step": 26214 + }, + { + "epoch": 1.7593704909231234, + "grad_norm": 4.4752607345581055, + "learning_rate": 3.7454297510436885e-06, + "loss": 1.9333, + "step": 26216 + }, + { + "epoch": 1.7595047146068925, + "grad_norm": 3.769127368927002, + "learning_rate": 3.7413036641054834e-06, + "loss": 1.674, + "step": 26218 + }, + { + "epoch": 1.7596389382906614, + "grad_norm": 4.181955337524414, + "learning_rate": 3.7371797627997194e-06, + "loss": 1.6593, + "step": 26220 + }, + { + "epoch": 1.7597731619744303, + "grad_norm": 3.6668972969055176, + "learning_rate": 3.7330580473212473e-06, + "loss": 1.7683, + "step": 26222 + }, + { + "epoch": 1.7599073856581993, + "grad_norm": 4.317763328552246, + "learning_rate": 3.728938517864794e-06, + "loss": 1.9906, + "step": 26224 + }, + { + "epoch": 1.7600416093419684, + "grad_norm": 6.886631965637207, + "learning_rate": 3.724821174625004e-06, + "loss": 1.7156, + "step": 26226 + }, + { + "epoch": 1.7601758330257375, + "grad_norm": 3.953565835952759, + "learning_rate": 3.720706017796427e-06, + "loss": 1.7606, + "step": 26228 + }, + { + "epoch": 1.7603100567095065, + "grad_norm": 4.917335033416748, + "learning_rate": 3.71659304757348e-06, + "loss": 1.854, + "step": 26230 + }, + { + "epoch": 1.7604442803932754, + "grad_norm": 4.041778564453125, + "learning_rate": 3.7124822641505017e-06, + "loss": 1.5746, + "step": 26232 + }, + { + "epoch": 1.7605785040770443, + "grad_norm": 4.452634334564209, + "learning_rate": 3.7083736677217206e-06, + "loss": 1.7529, + "step": 26234 + }, + { + "epoch": 1.7607127277608134, + "grad_norm": 5.341209411621094, + "learning_rate": 3.704267258481242e-06, + "loss": 1.8419, + "step": 26236 + }, + { + "epoch": 1.7608469514445824, + "grad_norm": 3.9873788356781006, + "learning_rate": 3.7001630366231112e-06, + "loss": 1.6867, + "step": 26238 + }, + { + "epoch": 1.7609811751283515, + "grad_norm": 4.508428573608398, + "learning_rate": 3.696061002341217e-06, + "loss": 1.9046, + "step": 26240 + }, + { + "epoch": 1.7611153988121204, + "grad_norm": 5.000663757324219, + "learning_rate": 3.6919611558294098e-06, + "loss": 1.9788, + "step": 26242 + }, + { + "epoch": 1.7612496224958893, + "grad_norm": 3.4604015350341797, + "learning_rate": 3.6878634972813576e-06, + "loss": 1.5941, + "step": 26244 + }, + { + "epoch": 1.7613838461796583, + "grad_norm": 4.279326915740967, + "learning_rate": 3.6837680268906992e-06, + "loss": 1.9248, + "step": 26246 + }, + { + "epoch": 1.7615180698634274, + "grad_norm": 3.6857244968414307, + "learning_rate": 3.679674744850914e-06, + "loss": 1.576, + "step": 26248 + }, + { + "epoch": 1.7616522935471965, + "grad_norm": 4.304646968841553, + "learning_rate": 3.6755836513554185e-06, + "loss": 1.8775, + "step": 26250 + }, + { + "epoch": 1.7617865172309655, + "grad_norm": 3.8737952709198, + "learning_rate": 3.6714947465975035e-06, + "loss": 1.798, + "step": 26252 + }, + { + "epoch": 1.7619207409147344, + "grad_norm": 4.222915172576904, + "learning_rate": 3.6674080307703586e-06, + "loss": 1.7058, + "step": 26254 + }, + { + "epoch": 1.7620549645985033, + "grad_norm": 4.390568733215332, + "learning_rate": 3.6633235040670675e-06, + "loss": 1.8603, + "step": 26256 + }, + { + "epoch": 1.7621891882822724, + "grad_norm": 3.8409423828125, + "learning_rate": 3.6592411666806326e-06, + "loss": 2.1349, + "step": 26258 + }, + { + "epoch": 1.7623234119660414, + "grad_norm": 4.21790075302124, + "learning_rate": 3.655161018803932e-06, + "loss": 1.7396, + "step": 26260 + }, + { + "epoch": 1.7624576356498105, + "grad_norm": 4.509720802307129, + "learning_rate": 3.6510830606297343e-06, + "loss": 2.0074, + "step": 26262 + }, + { + "epoch": 1.7625918593335794, + "grad_norm": 4.035961151123047, + "learning_rate": 3.6470072923507125e-06, + "loss": 1.8145, + "step": 26264 + }, + { + "epoch": 1.7627260830173483, + "grad_norm": 4.445707321166992, + "learning_rate": 3.6429337141594578e-06, + "loss": 2.007, + "step": 26266 + }, + { + "epoch": 1.7628603067011173, + "grad_norm": 4.629528999328613, + "learning_rate": 3.638862326248421e-06, + "loss": 1.744, + "step": 26268 + }, + { + "epoch": 1.7629945303848864, + "grad_norm": 3.9803550243377686, + "learning_rate": 3.634793128809988e-06, + "loss": 1.9358, + "step": 26270 + }, + { + "epoch": 1.7631287540686555, + "grad_norm": 4.2409844398498535, + "learning_rate": 3.6307261220363876e-06, + "loss": 1.9889, + "step": 26272 + }, + { + "epoch": 1.7632629777524245, + "grad_norm": 4.509955406188965, + "learning_rate": 3.626661306119805e-06, + "loss": 1.8514, + "step": 26274 + }, + { + "epoch": 1.7633972014361934, + "grad_norm": 3.9308676719665527, + "learning_rate": 3.6225986812522816e-06, + "loss": 1.8621, + "step": 26276 + }, + { + "epoch": 1.7635314251199623, + "grad_norm": 4.03289794921875, + "learning_rate": 3.61853824762578e-06, + "loss": 1.9648, + "step": 26278 + }, + { + "epoch": 1.7636656488037314, + "grad_norm": 5.27924108505249, + "learning_rate": 3.6144800054321415e-06, + "loss": 1.7172, + "step": 26280 + }, + { + "epoch": 1.7637998724875004, + "grad_norm": 4.488813400268555, + "learning_rate": 3.6104239548631127e-06, + "loss": 1.7854, + "step": 26282 + }, + { + "epoch": 1.7639340961712695, + "grad_norm": 3.939221143722534, + "learning_rate": 3.6063700961103174e-06, + "loss": 1.9926, + "step": 26284 + }, + { + "epoch": 1.7640683198550384, + "grad_norm": 4.252030372619629, + "learning_rate": 3.6023184293653143e-06, + "loss": 2.0113, + "step": 26286 + }, + { + "epoch": 1.7642025435388073, + "grad_norm": 3.9386391639709473, + "learning_rate": 3.5982689548195338e-06, + "loss": 1.7443, + "step": 26288 + }, + { + "epoch": 1.7643367672225763, + "grad_norm": 4.3121795654296875, + "learning_rate": 3.594221672664294e-06, + "loss": 1.8927, + "step": 26290 + }, + { + "epoch": 1.7644709909063454, + "grad_norm": 4.025094509124756, + "learning_rate": 3.590176583090821e-06, + "loss": 1.7157, + "step": 26292 + }, + { + "epoch": 1.7646052145901145, + "grad_norm": 4.51613712310791, + "learning_rate": 3.586133686290255e-06, + "loss": 1.8158, + "step": 26294 + }, + { + "epoch": 1.7647394382738835, + "grad_norm": 4.326226234436035, + "learning_rate": 3.582092982453589e-06, + "loss": 1.9079, + "step": 26296 + }, + { + "epoch": 1.7648736619576524, + "grad_norm": 4.201403617858887, + "learning_rate": 3.5780544717717747e-06, + "loss": 1.8783, + "step": 26298 + }, + { + "epoch": 1.7650078856414213, + "grad_norm": 4.4772562980651855, + "learning_rate": 3.5740181544355875e-06, + "loss": 1.8409, + "step": 26300 + }, + { + "epoch": 1.7651421093251904, + "grad_norm": 4.164394855499268, + "learning_rate": 3.5699840306357478e-06, + "loss": 2.0579, + "step": 26302 + }, + { + "epoch": 1.7652763330089596, + "grad_norm": 3.7594351768493652, + "learning_rate": 3.565952100562869e-06, + "loss": 1.6076, + "step": 26304 + }, + { + "epoch": 1.7654105566927285, + "grad_norm": 4.410008907318115, + "learning_rate": 3.561922364407433e-06, + "loss": 2.2384, + "step": 26306 + }, + { + "epoch": 1.7655447803764974, + "grad_norm": 4.396519184112549, + "learning_rate": 3.557894822359864e-06, + "loss": 1.6544, + "step": 26308 + }, + { + "epoch": 1.7656790040602663, + "grad_norm": 4.281963348388672, + "learning_rate": 3.5538694746104274e-06, + "loss": 1.8582, + "step": 26310 + }, + { + "epoch": 1.7658132277440355, + "grad_norm": 4.041405200958252, + "learning_rate": 3.5498463213493372e-06, + "loss": 1.8501, + "step": 26312 + }, + { + "epoch": 1.7659474514278044, + "grad_norm": 3.806290864944458, + "learning_rate": 3.545825362766653e-06, + "loss": 1.7285, + "step": 26314 + }, + { + "epoch": 1.7660816751115735, + "grad_norm": 4.355697154998779, + "learning_rate": 3.541806599052383e-06, + "loss": 1.7283, + "step": 26316 + }, + { + "epoch": 1.7662158987953425, + "grad_norm": 3.8573548793792725, + "learning_rate": 3.5377900303963986e-06, + "loss": 2.0489, + "step": 26318 + }, + { + "epoch": 1.7663501224791114, + "grad_norm": 4.102588176727295, + "learning_rate": 3.533775656988464e-06, + "loss": 1.73, + "step": 26320 + }, + { + "epoch": 1.7664843461628803, + "grad_norm": 4.348020553588867, + "learning_rate": 3.5297634790182555e-06, + "loss": 1.8582, + "step": 26322 + }, + { + "epoch": 1.7666185698466494, + "grad_norm": 4.770176887512207, + "learning_rate": 3.525753496675349e-06, + "loss": 2.0148, + "step": 26324 + }, + { + "epoch": 1.7667527935304186, + "grad_norm": 4.152260780334473, + "learning_rate": 3.5217457101492045e-06, + "loss": 1.6153, + "step": 26326 + }, + { + "epoch": 1.7668870172141875, + "grad_norm": 4.320188045501709, + "learning_rate": 3.517740119629176e-06, + "loss": 1.8237, + "step": 26328 + }, + { + "epoch": 1.7670212408979564, + "grad_norm": 4.564131259918213, + "learning_rate": 3.5137367253045173e-06, + "loss": 1.8909, + "step": 26330 + }, + { + "epoch": 1.7671554645817253, + "grad_norm": 3.59000563621521, + "learning_rate": 3.509735527364394e-06, + "loss": 1.7684, + "step": 26332 + }, + { + "epoch": 1.7672896882654945, + "grad_norm": 4.6519927978515625, + "learning_rate": 3.5057365259978383e-06, + "loss": 2.0356, + "step": 26334 + }, + { + "epoch": 1.7674239119492634, + "grad_norm": 4.226385593414307, + "learning_rate": 3.501739721393826e-06, + "loss": 1.8746, + "step": 26336 + }, + { + "epoch": 1.7675581356330325, + "grad_norm": 4.15546178817749, + "learning_rate": 3.4977451137411577e-06, + "loss": 1.9093, + "step": 26338 + }, + { + "epoch": 1.7676923593168015, + "grad_norm": 3.5221521854400635, + "learning_rate": 3.4937527032285976e-06, + "loss": 1.588, + "step": 26340 + }, + { + "epoch": 1.7678265830005704, + "grad_norm": 3.75907564163208, + "learning_rate": 3.4897624900447624e-06, + "loss": 1.6873, + "step": 26342 + }, + { + "epoch": 1.7679608066843393, + "grad_norm": 3.8247663974761963, + "learning_rate": 3.4857744743782007e-06, + "loss": 1.8223, + "step": 26344 + }, + { + "epoch": 1.7680950303681084, + "grad_norm": 3.724047899246216, + "learning_rate": 3.4817886564173287e-06, + "loss": 1.8754, + "step": 26346 + }, + { + "epoch": 1.7682292540518776, + "grad_norm": 4.711092472076416, + "learning_rate": 3.4778050363504675e-06, + "loss": 1.7068, + "step": 26348 + }, + { + "epoch": 1.7683634777356465, + "grad_norm": 4.2992448806762695, + "learning_rate": 3.473823614365829e-06, + "loss": 1.6879, + "step": 26350 + }, + { + "epoch": 1.7684977014194154, + "grad_norm": 4.334253787994385, + "learning_rate": 3.469844390651544e-06, + "loss": 1.7836, + "step": 26352 + }, + { + "epoch": 1.7686319251031843, + "grad_norm": 3.889470338821411, + "learning_rate": 3.465867365395614e-06, + "loss": 1.8543, + "step": 26354 + }, + { + "epoch": 1.7687661487869535, + "grad_norm": 3.9636335372924805, + "learning_rate": 3.4618925387859437e-06, + "loss": 1.9299, + "step": 26356 + }, + { + "epoch": 1.7689003724707224, + "grad_norm": 4.128573417663574, + "learning_rate": 3.457919911010332e-06, + "loss": 1.9202, + "step": 26358 + }, + { + "epoch": 1.7690345961544915, + "grad_norm": 4.607234954833984, + "learning_rate": 3.453949482256491e-06, + "loss": 1.7908, + "step": 26360 + }, + { + "epoch": 1.7691688198382605, + "grad_norm": 4.673115253448486, + "learning_rate": 3.449981252711998e-06, + "loss": 1.9231, + "step": 26362 + }, + { + "epoch": 1.7693030435220294, + "grad_norm": 3.752490282058716, + "learning_rate": 3.4460152225643692e-06, + "loss": 1.8501, + "step": 26364 + }, + { + "epoch": 1.7694372672057983, + "grad_norm": 3.977640390396118, + "learning_rate": 3.442051392000967e-06, + "loss": 1.8588, + "step": 26366 + }, + { + "epoch": 1.7695714908895674, + "grad_norm": 3.845933198928833, + "learning_rate": 3.4380897612090843e-06, + "loss": 1.8106, + "step": 26368 + }, + { + "epoch": 1.7697057145733366, + "grad_norm": 4.6189775466918945, + "learning_rate": 3.4341303303759064e-06, + "loss": 1.6686, + "step": 26370 + }, + { + "epoch": 1.7698399382571055, + "grad_norm": 4.016350746154785, + "learning_rate": 3.4301730996884884e-06, + "loss": 2.1538, + "step": 26372 + }, + { + "epoch": 1.7699741619408744, + "grad_norm": 4.247426986694336, + "learning_rate": 3.4262180693338306e-06, + "loss": 2.1033, + "step": 26374 + }, + { + "epoch": 1.7701083856246433, + "grad_norm": 3.5751070976257324, + "learning_rate": 3.4222652394987843e-06, + "loss": 1.5828, + "step": 26376 + }, + { + "epoch": 1.7702426093084125, + "grad_norm": 3.9756088256835938, + "learning_rate": 3.4183146103701104e-06, + "loss": 1.7121, + "step": 26378 + }, + { + "epoch": 1.7703768329921816, + "grad_norm": 4.159034729003906, + "learning_rate": 3.4143661821344654e-06, + "loss": 1.8078, + "step": 26380 + }, + { + "epoch": 1.7705110566759505, + "grad_norm": 4.283459663391113, + "learning_rate": 3.4104199549784223e-06, + "loss": 1.9624, + "step": 26382 + }, + { + "epoch": 1.7706452803597195, + "grad_norm": 4.125486373901367, + "learning_rate": 3.4064759290884207e-06, + "loss": 1.7843, + "step": 26384 + }, + { + "epoch": 1.7707795040434884, + "grad_norm": 4.173834323883057, + "learning_rate": 3.4025341046508064e-06, + "loss": 1.9922, + "step": 26386 + }, + { + "epoch": 1.7709137277272575, + "grad_norm": 4.3525848388671875, + "learning_rate": 3.398594481851819e-06, + "loss": 1.9771, + "step": 26388 + }, + { + "epoch": 1.7710479514110264, + "grad_norm": 3.7772016525268555, + "learning_rate": 3.394657060877615e-06, + "loss": 1.7452, + "step": 26390 + }, + { + "epoch": 1.7711821750947956, + "grad_norm": 4.3101887702941895, + "learning_rate": 3.3907218419142182e-06, + "loss": 1.8939, + "step": 26392 + }, + { + "epoch": 1.7713163987785645, + "grad_norm": 4.423670291900635, + "learning_rate": 3.3867888251475577e-06, + "loss": 1.7362, + "step": 26394 + }, + { + "epoch": 1.7714506224623334, + "grad_norm": 4.859621524810791, + "learning_rate": 3.3828580107634623e-06, + "loss": 1.6539, + "step": 26396 + }, + { + "epoch": 1.7715848461461023, + "grad_norm": 4.523566722869873, + "learning_rate": 3.378929398947661e-06, + "loss": 1.9961, + "step": 26398 + }, + { + "epoch": 1.7717190698298715, + "grad_norm": 4.528345108032227, + "learning_rate": 3.3750029898857614e-06, + "loss": 1.9241, + "step": 26400 + }, + { + "epoch": 1.7718532935136406, + "grad_norm": 3.5952491760253906, + "learning_rate": 3.3710787837632982e-06, + "loss": 1.6637, + "step": 26402 + }, + { + "epoch": 1.7719875171974095, + "grad_norm": 3.9955854415893555, + "learning_rate": 3.367156780765668e-06, + "loss": 1.8699, + "step": 26404 + }, + { + "epoch": 1.7721217408811785, + "grad_norm": 4.230663299560547, + "learning_rate": 3.3632369810781774e-06, + "loss": 1.9191, + "step": 26406 + }, + { + "epoch": 1.7722559645649474, + "grad_norm": 4.445614814758301, + "learning_rate": 3.3593193848860284e-06, + "loss": 1.9024, + "step": 26408 + }, + { + "epoch": 1.7723901882487165, + "grad_norm": 4.120759010314941, + "learning_rate": 3.3554039923743286e-06, + "loss": 1.814, + "step": 26410 + }, + { + "epoch": 1.7725244119324854, + "grad_norm": 3.9276888370513916, + "learning_rate": 3.351490803728069e-06, + "loss": 1.8823, + "step": 26412 + }, + { + "epoch": 1.7726586356162546, + "grad_norm": 4.5204925537109375, + "learning_rate": 3.3475798191321406e-06, + "loss": 2.2158, + "step": 26414 + }, + { + "epoch": 1.7727928593000235, + "grad_norm": 3.426327705383301, + "learning_rate": 3.3436710387713176e-06, + "loss": 1.5912, + "step": 26416 + }, + { + "epoch": 1.7729270829837924, + "grad_norm": 4.3012375831604, + "learning_rate": 3.3397644628303026e-06, + "loss": 2.0665, + "step": 26418 + }, + { + "epoch": 1.7730613066675613, + "grad_norm": 4.819196701049805, + "learning_rate": 3.335860091493653e-06, + "loss": 1.7323, + "step": 26420 + }, + { + "epoch": 1.7731955303513305, + "grad_norm": 4.09934139251709, + "learning_rate": 3.3319579249458667e-06, + "loss": 1.7257, + "step": 26422 + }, + { + "epoch": 1.7733297540350996, + "grad_norm": 4.095159530639648, + "learning_rate": 3.328057963371284e-06, + "loss": 1.922, + "step": 26424 + }, + { + "epoch": 1.7734639777188685, + "grad_norm": 4.131359577178955, + "learning_rate": 3.3241602069541967e-06, + "loss": 1.818, + "step": 26426 + }, + { + "epoch": 1.7735982014026375, + "grad_norm": 4.482840538024902, + "learning_rate": 3.3202646558787463e-06, + "loss": 2.1063, + "step": 26428 + }, + { + "epoch": 1.7737324250864064, + "grad_norm": 4.509688377380371, + "learning_rate": 3.3163713103290084e-06, + "loss": 1.8243, + "step": 26430 + }, + { + "epoch": 1.7738666487701755, + "grad_norm": 3.77290678024292, + "learning_rate": 3.3124801704889298e-06, + "loss": 2.0764, + "step": 26432 + }, + { + "epoch": 1.7740008724539444, + "grad_norm": 7.115779399871826, + "learning_rate": 3.308591236542352e-06, + "loss": 1.6405, + "step": 26434 + }, + { + "epoch": 1.7741350961377136, + "grad_norm": 3.7939741611480713, + "learning_rate": 3.3047045086730233e-06, + "loss": 1.918, + "step": 26436 + }, + { + "epoch": 1.7742693198214825, + "grad_norm": 4.4269843101501465, + "learning_rate": 3.300819987064574e-06, + "loss": 1.8846, + "step": 26438 + }, + { + "epoch": 1.7744035435052514, + "grad_norm": 4.486355304718018, + "learning_rate": 3.296937671900563e-06, + "loss": 2.012, + "step": 26440 + }, + { + "epoch": 1.7745377671890203, + "grad_norm": 4.022214412689209, + "learning_rate": 3.2930575633644103e-06, + "loss": 1.6877, + "step": 26442 + }, + { + "epoch": 1.7746719908727895, + "grad_norm": 3.627650260925293, + "learning_rate": 3.2891796616394353e-06, + "loss": 1.732, + "step": 26444 + }, + { + "epoch": 1.7748062145565586, + "grad_norm": 3.7687277793884277, + "learning_rate": 3.285303966908865e-06, + "loss": 1.7494, + "step": 26446 + }, + { + "epoch": 1.7749404382403275, + "grad_norm": 3.8505823612213135, + "learning_rate": 3.2814304793558294e-06, + "loss": 1.8053, + "step": 26448 + }, + { + "epoch": 1.7750746619240965, + "grad_norm": 4.107375621795654, + "learning_rate": 3.277559199163338e-06, + "loss": 1.7797, + "step": 26450 + }, + { + "epoch": 1.7752088856078654, + "grad_norm": 4.530089855194092, + "learning_rate": 3.2736901265142948e-06, + "loss": 1.741, + "step": 26452 + }, + { + "epoch": 1.7753431092916345, + "grad_norm": 4.068645000457764, + "learning_rate": 3.2698232615915034e-06, + "loss": 1.8966, + "step": 26454 + }, + { + "epoch": 1.7754773329754037, + "grad_norm": 4.128692626953125, + "learning_rate": 3.265958604577679e-06, + "loss": 1.8205, + "step": 26456 + }, + { + "epoch": 1.7756115566591726, + "grad_norm": 3.537374258041382, + "learning_rate": 3.262096155655403e-06, + "loss": 1.6635, + "step": 26458 + }, + { + "epoch": 1.7757457803429415, + "grad_norm": 4.594486713409424, + "learning_rate": 3.2582359150071963e-06, + "loss": 1.8649, + "step": 26460 + }, + { + "epoch": 1.7758800040267104, + "grad_norm": 4.615654468536377, + "learning_rate": 3.2543778828154125e-06, + "loss": 2.0114, + "step": 26462 + }, + { + "epoch": 1.7760142277104796, + "grad_norm": 3.7271065711975098, + "learning_rate": 3.250522059262362e-06, + "loss": 1.572, + "step": 26464 + }, + { + "epoch": 1.7761484513942485, + "grad_norm": 4.0616068840026855, + "learning_rate": 3.246668444530204e-06, + "loss": 1.9786, + "step": 26466 + }, + { + "epoch": 1.7762826750780176, + "grad_norm": 4.212959289550781, + "learning_rate": 3.2428170388010326e-06, + "loss": 2.0577, + "step": 26468 + }, + { + "epoch": 1.7764168987617865, + "grad_norm": 4.146392822265625, + "learning_rate": 3.238967842256818e-06, + "loss": 1.9053, + "step": 26470 + }, + { + "epoch": 1.7765511224455555, + "grad_norm": 4.269471645355225, + "learning_rate": 3.2351208550794154e-06, + "loss": 1.8194, + "step": 26472 + }, + { + "epoch": 1.7766853461293244, + "grad_norm": 3.740905523300171, + "learning_rate": 3.2312760774505845e-06, + "loss": 1.5738, + "step": 26474 + }, + { + "epoch": 1.7768195698130935, + "grad_norm": 4.329087257385254, + "learning_rate": 3.2274335095520026e-06, + "loss": 1.876, + "step": 26476 + }, + { + "epoch": 1.7769537934968627, + "grad_norm": 4.2261271476745605, + "learning_rate": 3.223593151565213e-06, + "loss": 1.7212, + "step": 26478 + }, + { + "epoch": 1.7770880171806316, + "grad_norm": 3.89770245552063, + "learning_rate": 3.219755003671665e-06, + "loss": 1.8247, + "step": 26480 + }, + { + "epoch": 1.7772222408644005, + "grad_norm": 4.18013858795166, + "learning_rate": 3.2159190660526972e-06, + "loss": 1.7931, + "step": 26482 + }, + { + "epoch": 1.7773564645481694, + "grad_norm": 4.244062900543213, + "learning_rate": 3.2120853388895643e-06, + "loss": 1.8363, + "step": 26484 + }, + { + "epoch": 1.7774906882319386, + "grad_norm": 4.18195104598999, + "learning_rate": 3.2082538223633884e-06, + "loss": 1.7396, + "step": 26486 + }, + { + "epoch": 1.7776249119157075, + "grad_norm": 4.310253620147705, + "learning_rate": 3.204424516655219e-06, + "loss": 1.8524, + "step": 26488 + }, + { + "epoch": 1.7777591355994766, + "grad_norm": 5.165875434875488, + "learning_rate": 3.2005974219459556e-06, + "loss": 1.9117, + "step": 26490 + }, + { + "epoch": 1.7778933592832455, + "grad_norm": 3.82444429397583, + "learning_rate": 3.196772538416448e-06, + "loss": 1.767, + "step": 26492 + }, + { + "epoch": 1.7780275829670145, + "grad_norm": 4.2946648597717285, + "learning_rate": 3.1929498662473965e-06, + "loss": 2.0482, + "step": 26494 + }, + { + "epoch": 1.7781618066507834, + "grad_norm": 4.349788665771484, + "learning_rate": 3.1891294056194233e-06, + "loss": 1.7944, + "step": 26496 + }, + { + "epoch": 1.7782960303345525, + "grad_norm": 4.714035987854004, + "learning_rate": 3.1853111567130387e-06, + "loss": 1.9012, + "step": 26498 + }, + { + "epoch": 1.7784302540183217, + "grad_norm": 3.7464778423309326, + "learning_rate": 3.1814951197086495e-06, + "loss": 1.967, + "step": 26500 + }, + { + "epoch": 1.7785644777020906, + "grad_norm": 3.7750368118286133, + "learning_rate": 3.1776812947865385e-06, + "loss": 1.7657, + "step": 26502 + }, + { + "epoch": 1.7786987013858595, + "grad_norm": 4.297464370727539, + "learning_rate": 3.173869682126923e-06, + "loss": 2.2022, + "step": 26504 + }, + { + "epoch": 1.7788329250696284, + "grad_norm": 3.8516781330108643, + "learning_rate": 3.1700602819098868e-06, + "loss": 1.832, + "step": 26506 + }, + { + "epoch": 1.7789671487533976, + "grad_norm": 4.256399631500244, + "learning_rate": 3.1662530943154134e-06, + "loss": 1.9858, + "step": 26508 + }, + { + "epoch": 1.7791013724371665, + "grad_norm": 4.112867832183838, + "learning_rate": 3.162448119523387e-06, + "loss": 1.9521, + "step": 26510 + }, + { + "epoch": 1.7792355961209356, + "grad_norm": 4.688844203948975, + "learning_rate": 3.1586453577135798e-06, + "loss": 2.2224, + "step": 26512 + }, + { + "epoch": 1.7793698198047045, + "grad_norm": 4.722403049468994, + "learning_rate": 3.154844809065677e-06, + "loss": 1.9436, + "step": 26514 + }, + { + "epoch": 1.7795040434884735, + "grad_norm": 3.707200050354004, + "learning_rate": 3.1510464737592336e-06, + "loss": 1.6213, + "step": 26516 + }, + { + "epoch": 1.7796382671722424, + "grad_norm": 4.394159317016602, + "learning_rate": 3.1472503519737295e-06, + "loss": 1.8759, + "step": 26518 + }, + { + "epoch": 1.7797724908560115, + "grad_norm": 4.12106990814209, + "learning_rate": 3.1434564438884983e-06, + "loss": 1.6159, + "step": 26520 + }, + { + "epoch": 1.7799067145397807, + "grad_norm": 4.2600908279418945, + "learning_rate": 3.1396647496828247e-06, + "loss": 1.6698, + "step": 26522 + }, + { + "epoch": 1.7800409382235496, + "grad_norm": 3.9335873126983643, + "learning_rate": 3.1358752695358316e-06, + "loss": 1.6791, + "step": 26524 + }, + { + "epoch": 1.7801751619073185, + "grad_norm": 3.8675343990325928, + "learning_rate": 3.132088003626588e-06, + "loss": 1.6774, + "step": 26526 + }, + { + "epoch": 1.7803093855910874, + "grad_norm": 4.535398483276367, + "learning_rate": 3.1283029521340212e-06, + "loss": 1.9665, + "step": 26528 + }, + { + "epoch": 1.7804436092748566, + "grad_norm": 3.8045084476470947, + "learning_rate": 3.1245201152369783e-06, + "loss": 2.037, + "step": 26530 + }, + { + "epoch": 1.7805778329586257, + "grad_norm": 3.9409215450286865, + "learning_rate": 3.1207394931141665e-06, + "loss": 1.7722, + "step": 26532 + }, + { + "epoch": 1.7807120566423946, + "grad_norm": 4.15791130065918, + "learning_rate": 3.116961085944242e-06, + "loss": 1.7157, + "step": 26534 + }, + { + "epoch": 1.7808462803261635, + "grad_norm": 4.2830705642700195, + "learning_rate": 3.1131848939057186e-06, + "loss": 1.994, + "step": 26536 + }, + { + "epoch": 1.7809805040099325, + "grad_norm": 4.70108699798584, + "learning_rate": 3.1094109171770027e-06, + "loss": 2.1391, + "step": 26538 + }, + { + "epoch": 1.7811147276937016, + "grad_norm": 3.8229737281799316, + "learning_rate": 3.105639155936413e-06, + "loss": 1.8389, + "step": 26540 + }, + { + "epoch": 1.7812489513774705, + "grad_norm": 3.992347002029419, + "learning_rate": 3.1018696103621625e-06, + "loss": 1.8403, + "step": 26542 + }, + { + "epoch": 1.7813831750612397, + "grad_norm": 3.7813844680786133, + "learning_rate": 3.0981022806323536e-06, + "loss": 1.7431, + "step": 26544 + }, + { + "epoch": 1.7815173987450086, + "grad_norm": 4.350650787353516, + "learning_rate": 3.094337166924982e-06, + "loss": 1.9088, + "step": 26546 + }, + { + "epoch": 1.7816516224287775, + "grad_norm": 3.9300591945648193, + "learning_rate": 3.090574269417934e-06, + "loss": 1.7046, + "step": 26548 + }, + { + "epoch": 1.7817858461125464, + "grad_norm": 4.239412307739258, + "learning_rate": 3.0868135882890224e-06, + "loss": 2.0265, + "step": 26550 + }, + { + "epoch": 1.7819200697963156, + "grad_norm": 3.937623977661133, + "learning_rate": 3.0830551237159056e-06, + "loss": 1.7001, + "step": 26552 + }, + { + "epoch": 1.7820542934800847, + "grad_norm": 4.355440139770508, + "learning_rate": 3.0792988758761854e-06, + "loss": 2.008, + "step": 26554 + }, + { + "epoch": 1.7821885171638536, + "grad_norm": 3.426715612411499, + "learning_rate": 3.0755448449473255e-06, + "loss": 1.8868, + "step": 26556 + }, + { + "epoch": 1.7823227408476225, + "grad_norm": 4.175937652587891, + "learning_rate": 3.071793031106701e-06, + "loss": 1.9174, + "step": 26558 + }, + { + "epoch": 1.7824569645313915, + "grad_norm": 3.6990339756011963, + "learning_rate": 3.0680434345315643e-06, + "loss": 1.8988, + "step": 26560 + }, + { + "epoch": 1.7825911882151606, + "grad_norm": 3.7603063583374023, + "learning_rate": 3.0642960553990963e-06, + "loss": 1.9432, + "step": 26562 + }, + { + "epoch": 1.7827254118989295, + "grad_norm": 4.360939025878906, + "learning_rate": 3.0605508938863493e-06, + "loss": 1.8038, + "step": 26564 + }, + { + "epoch": 1.7828596355826987, + "grad_norm": 4.38274621963501, + "learning_rate": 3.056807950170265e-06, + "loss": 1.951, + "step": 26566 + }, + { + "epoch": 1.7829938592664676, + "grad_norm": 4.775854110717773, + "learning_rate": 3.0530672244276912e-06, + "loss": 1.8961, + "step": 26568 + }, + { + "epoch": 1.7831280829502365, + "grad_norm": 4.341954708099365, + "learning_rate": 3.049328716835376e-06, + "loss": 1.9867, + "step": 26570 + }, + { + "epoch": 1.7832623066340054, + "grad_norm": 4.361432075500488, + "learning_rate": 3.04559242756996e-06, + "loss": 1.7424, + "step": 26572 + }, + { + "epoch": 1.7833965303177746, + "grad_norm": 4.446201324462891, + "learning_rate": 3.0418583568079697e-06, + "loss": 1.9314, + "step": 26574 + }, + { + "epoch": 1.7835307540015437, + "grad_norm": 4.13389253616333, + "learning_rate": 3.03812650472583e-06, + "loss": 2.0865, + "step": 26576 + }, + { + "epoch": 1.7836649776853126, + "grad_norm": 4.572142601013184, + "learning_rate": 3.034396871499856e-06, + "loss": 2.0518, + "step": 26578 + }, + { + "epoch": 1.7837992013690815, + "grad_norm": 4.691253185272217, + "learning_rate": 3.0306694573062898e-06, + "loss": 1.798, + "step": 26580 + }, + { + "epoch": 1.7839334250528505, + "grad_norm": 3.9475698471069336, + "learning_rate": 3.0269442623212186e-06, + "loss": 2.0844, + "step": 26582 + }, + { + "epoch": 1.7840676487366196, + "grad_norm": 4.17352819442749, + "learning_rate": 3.023221286720679e-06, + "loss": 1.8887, + "step": 26584 + }, + { + "epoch": 1.7842018724203885, + "grad_norm": 4.330801486968994, + "learning_rate": 3.019500530680547e-06, + "loss": 1.8507, + "step": 26586 + }, + { + "epoch": 1.7843360961041577, + "grad_norm": 4.179172992706299, + "learning_rate": 3.015781994376632e-06, + "loss": 1.7558, + "step": 26588 + }, + { + "epoch": 1.7844703197879266, + "grad_norm": 4.032217979431152, + "learning_rate": 3.0120656779846214e-06, + "loss": 2.1018, + "step": 26590 + }, + { + "epoch": 1.7846045434716955, + "grad_norm": 3.97247052192688, + "learning_rate": 3.008351581680119e-06, + "loss": 1.9396, + "step": 26592 + }, + { + "epoch": 1.7847387671554644, + "grad_norm": 4.102529048919678, + "learning_rate": 3.004639705638601e-06, + "loss": 1.8871, + "step": 26594 + }, + { + "epoch": 1.7848729908392336, + "grad_norm": 3.9619593620300293, + "learning_rate": 3.0009300500354444e-06, + "loss": 1.7091, + "step": 26596 + }, + { + "epoch": 1.7850072145230027, + "grad_norm": 3.7180769443511963, + "learning_rate": 2.9972226150459137e-06, + "loss": 1.6338, + "step": 26598 + }, + { + "epoch": 1.7851414382067716, + "grad_norm": 3.934730291366577, + "learning_rate": 2.9935174008451917e-06, + "loss": 1.6272, + "step": 26600 + }, + { + "epoch": 1.7852756618905405, + "grad_norm": 3.722046375274658, + "learning_rate": 2.9898144076083433e-06, + "loss": 1.9406, + "step": 26602 + }, + { + "epoch": 1.7854098855743095, + "grad_norm": 4.217231750488281, + "learning_rate": 2.9861136355103235e-06, + "loss": 1.845, + "step": 26604 + }, + { + "epoch": 1.7855441092580786, + "grad_norm": 4.001340866088867, + "learning_rate": 2.982415084725976e-06, + "loss": 1.9077, + "step": 26606 + }, + { + "epoch": 1.7856783329418477, + "grad_norm": 3.3653781414031982, + "learning_rate": 2.9787187554300656e-06, + "loss": 1.7692, + "step": 26608 + }, + { + "epoch": 1.7858125566256167, + "grad_norm": 4.310394763946533, + "learning_rate": 2.97502464779722e-06, + "loss": 1.9417, + "step": 26610 + }, + { + "epoch": 1.7859467803093856, + "grad_norm": 4.262114524841309, + "learning_rate": 2.9713327620020103e-06, + "loss": 2.0111, + "step": 26612 + }, + { + "epoch": 1.7860810039931545, + "grad_norm": 4.083218574523926, + "learning_rate": 2.967643098218831e-06, + "loss": 2.0052, + "step": 26614 + }, + { + "epoch": 1.7862152276769236, + "grad_norm": 4.118166446685791, + "learning_rate": 2.9639556566220415e-06, + "loss": 1.854, + "step": 26616 + }, + { + "epoch": 1.7863494513606926, + "grad_norm": 3.845299005508423, + "learning_rate": 2.9602704373858426e-06, + "loss": 1.8891, + "step": 26618 + }, + { + "epoch": 1.7864836750444617, + "grad_norm": 4.158504962921143, + "learning_rate": 2.9565874406843776e-06, + "loss": 2.0444, + "step": 26620 + }, + { + "epoch": 1.7866178987282306, + "grad_norm": 4.294016361236572, + "learning_rate": 2.952906666691646e-06, + "loss": 1.8253, + "step": 26622 + }, + { + "epoch": 1.7867521224119995, + "grad_norm": 3.8112165927886963, + "learning_rate": 2.949228115581565e-06, + "loss": 1.6557, + "step": 26624 + }, + { + "epoch": 1.7868863460957685, + "grad_norm": 4.078647136688232, + "learning_rate": 2.9455517875279225e-06, + "loss": 1.8991, + "step": 26626 + }, + { + "epoch": 1.7870205697795376, + "grad_norm": 4.263328552246094, + "learning_rate": 2.9418776827044357e-06, + "loss": 2.0339, + "step": 26628 + }, + { + "epoch": 1.7871547934633067, + "grad_norm": 4.3393964767456055, + "learning_rate": 2.9382058012846936e-06, + "loss": 1.9011, + "step": 26630 + }, + { + "epoch": 1.7872890171470757, + "grad_norm": 4.050359725952148, + "learning_rate": 2.93453614344219e-06, + "loss": 1.7797, + "step": 26632 + }, + { + "epoch": 1.7874232408308446, + "grad_norm": 4.13411283493042, + "learning_rate": 2.930868709350287e-06, + "loss": 1.7496, + "step": 26634 + }, + { + "epoch": 1.7875574645146135, + "grad_norm": 4.724362850189209, + "learning_rate": 2.927203499182296e-06, + "loss": 2.0449, + "step": 26636 + }, + { + "epoch": 1.7876916881983826, + "grad_norm": 3.9214093685150146, + "learning_rate": 2.9235405131113615e-06, + "loss": 1.6952, + "step": 26638 + }, + { + "epoch": 1.7878259118821516, + "grad_norm": 4.191667556762695, + "learning_rate": 2.9198797513105834e-06, + "loss": 1.6537, + "step": 26640 + }, + { + "epoch": 1.7879601355659207, + "grad_norm": 4.084116458892822, + "learning_rate": 2.9162212139528967e-06, + "loss": 1.8337, + "step": 26642 + }, + { + "epoch": 1.7880943592496896, + "grad_norm": 3.901854991912842, + "learning_rate": 2.9125649012111678e-06, + "loss": 1.6949, + "step": 26644 + }, + { + "epoch": 1.7882285829334585, + "grad_norm": 4.49906063079834, + "learning_rate": 2.9089108132581587e-06, + "loss": 1.8849, + "step": 26646 + }, + { + "epoch": 1.7883628066172275, + "grad_norm": 3.7636988162994385, + "learning_rate": 2.9052589502665095e-06, + "loss": 1.8029, + "step": 26648 + }, + { + "epoch": 1.7884970303009966, + "grad_norm": 3.6144464015960693, + "learning_rate": 2.9016093124087817e-06, + "loss": 1.7837, + "step": 26650 + }, + { + "epoch": 1.7886312539847657, + "grad_norm": 3.849839210510254, + "learning_rate": 2.897961899857382e-06, + "loss": 1.6475, + "step": 26652 + }, + { + "epoch": 1.7887654776685347, + "grad_norm": 4.128547191619873, + "learning_rate": 2.894316712784667e-06, + "loss": 1.8379, + "step": 26654 + }, + { + "epoch": 1.7888997013523036, + "grad_norm": 4.178230285644531, + "learning_rate": 2.890673751362849e-06, + "loss": 1.8606, + "step": 26656 + }, + { + "epoch": 1.7890339250360725, + "grad_norm": 4.271756172180176, + "learning_rate": 2.887033015764068e-06, + "loss": 1.9115, + "step": 26658 + }, + { + "epoch": 1.7891681487198416, + "grad_norm": 4.244858264923096, + "learning_rate": 2.883394506160336e-06, + "loss": 1.7747, + "step": 26660 + }, + { + "epoch": 1.7893023724036106, + "grad_norm": 4.95219612121582, + "learning_rate": 2.8797582227235608e-06, + "loss": 1.8821, + "step": 26662 + }, + { + "epoch": 1.7894365960873797, + "grad_norm": 4.226404190063477, + "learning_rate": 2.876124165625549e-06, + "loss": 1.6958, + "step": 26664 + }, + { + "epoch": 1.7895708197711486, + "grad_norm": 4.0543293952941895, + "learning_rate": 2.8724923350380075e-06, + "loss": 1.6348, + "step": 26666 + }, + { + "epoch": 1.7897050434549175, + "grad_norm": 4.136929035186768, + "learning_rate": 2.868862731132532e-06, + "loss": 1.9241, + "step": 26668 + }, + { + "epoch": 1.7898392671386865, + "grad_norm": 3.7819526195526123, + "learning_rate": 2.8652353540806086e-06, + "loss": 1.8252, + "step": 26670 + }, + { + "epoch": 1.7899734908224556, + "grad_norm": 4.2698869705200195, + "learning_rate": 2.8616102040536274e-06, + "loss": 1.7729, + "step": 26672 + }, + { + "epoch": 1.7901077145062247, + "grad_norm": 4.4871506690979, + "learning_rate": 2.8579872812228738e-06, + "loss": 1.8648, + "step": 26674 + }, + { + "epoch": 1.7902419381899937, + "grad_norm": 4.29387903213501, + "learning_rate": 2.8543665857595158e-06, + "loss": 2.0781, + "step": 26676 + }, + { + "epoch": 1.7903761618737626, + "grad_norm": 4.0955963134765625, + "learning_rate": 2.850748117834645e-06, + "loss": 2.047, + "step": 26678 + }, + { + "epoch": 1.7905103855575315, + "grad_norm": 4.170943737030029, + "learning_rate": 2.8471318776191914e-06, + "loss": 1.6965, + "step": 26680 + }, + { + "epoch": 1.7906446092413006, + "grad_norm": 4.545037269592285, + "learning_rate": 2.8435178652840456e-06, + "loss": 2.0086, + "step": 26682 + }, + { + "epoch": 1.7907788329250698, + "grad_norm": 3.7834315299987793, + "learning_rate": 2.839906080999938e-06, + "loss": 1.9686, + "step": 26684 + }, + { + "epoch": 1.7909130566088387, + "grad_norm": 4.165268898010254, + "learning_rate": 2.8362965249375485e-06, + "loss": 1.7725, + "step": 26686 + }, + { + "epoch": 1.7910472802926076, + "grad_norm": 4.434182167053223, + "learning_rate": 2.8326891972673965e-06, + "loss": 1.8386, + "step": 26688 + }, + { + "epoch": 1.7911815039763765, + "grad_norm": 4.401557445526123, + "learning_rate": 2.829084098159934e-06, + "loss": 1.9295, + "step": 26690 + }, + { + "epoch": 1.7913157276601457, + "grad_norm": 4.276312351226807, + "learning_rate": 2.8254812277854813e-06, + "loss": 1.707, + "step": 26692 + }, + { + "epoch": 1.7914499513439146, + "grad_norm": 4.469396114349365, + "learning_rate": 2.8218805863142794e-06, + "loss": 2.1644, + "step": 26694 + }, + { + "epoch": 1.7915841750276837, + "grad_norm": 3.7867488861083984, + "learning_rate": 2.8182821739164534e-06, + "loss": 1.7029, + "step": 26696 + }, + { + "epoch": 1.7917183987114527, + "grad_norm": 4.073383808135986, + "learning_rate": 2.8146859907620172e-06, + "loss": 1.5153, + "step": 26698 + }, + { + "epoch": 1.7918526223952216, + "grad_norm": 4.287137985229492, + "learning_rate": 2.8110920370208682e-06, + "loss": 1.6419, + "step": 26700 + }, + { + "epoch": 1.7919868460789905, + "grad_norm": 3.9521820545196533, + "learning_rate": 2.8075003128628374e-06, + "loss": 2.1349, + "step": 26702 + }, + { + "epoch": 1.7921210697627596, + "grad_norm": 4.168284893035889, + "learning_rate": 2.8039108184576113e-06, + "loss": 1.7741, + "step": 26704 + }, + { + "epoch": 1.7922552934465288, + "grad_norm": 4.504157066345215, + "learning_rate": 2.8003235539747984e-06, + "loss": 1.8008, + "step": 26706 + }, + { + "epoch": 1.7923895171302977, + "grad_norm": 3.61846923828125, + "learning_rate": 2.796738519583886e-06, + "loss": 1.8037, + "step": 26708 + }, + { + "epoch": 1.7925237408140666, + "grad_norm": 3.7669076919555664, + "learning_rate": 2.7931557154542487e-06, + "loss": 1.8757, + "step": 26710 + }, + { + "epoch": 1.7926579644978355, + "grad_norm": 4.126394271850586, + "learning_rate": 2.7895751417551852e-06, + "loss": 2.1036, + "step": 26712 + }, + { + "epoch": 1.7927921881816047, + "grad_norm": 3.653017282485962, + "learning_rate": 2.785996798655849e-06, + "loss": 1.7863, + "step": 26714 + }, + { + "epoch": 1.7929264118653736, + "grad_norm": 3.3485751152038574, + "learning_rate": 2.7824206863253264e-06, + "loss": 1.708, + "step": 26716 + }, + { + "epoch": 1.7930606355491427, + "grad_norm": 4.37808084487915, + "learning_rate": 2.778846804932583e-06, + "loss": 1.9557, + "step": 26718 + }, + { + "epoch": 1.7931948592329117, + "grad_norm": 4.166416168212891, + "learning_rate": 2.775275154646467e-06, + "loss": 1.8732, + "step": 26720 + }, + { + "epoch": 1.7933290829166806, + "grad_norm": 4.100563049316406, + "learning_rate": 2.7717057356357323e-06, + "loss": 2.0018, + "step": 26722 + }, + { + "epoch": 1.7934633066004495, + "grad_norm": 3.470092535018921, + "learning_rate": 2.768138548069038e-06, + "loss": 1.7831, + "step": 26724 + }, + { + "epoch": 1.7935975302842186, + "grad_norm": 4.693881988525391, + "learning_rate": 2.7645735921149217e-06, + "loss": 2.192, + "step": 26726 + }, + { + "epoch": 1.7937317539679878, + "grad_norm": 4.042911529541016, + "learning_rate": 2.7610108679418156e-06, + "loss": 1.8385, + "step": 26728 + }, + { + "epoch": 1.7938659776517567, + "grad_norm": 4.111162185668945, + "learning_rate": 2.757450375718046e-06, + "loss": 1.9549, + "step": 26730 + }, + { + "epoch": 1.7940002013355256, + "grad_norm": 4.359826564788818, + "learning_rate": 2.753892115611856e-06, + "loss": 1.9246, + "step": 26732 + }, + { + "epoch": 1.7941344250192945, + "grad_norm": 3.363966464996338, + "learning_rate": 2.7503360877913497e-06, + "loss": 1.7438, + "step": 26734 + }, + { + "epoch": 1.7942686487030637, + "grad_norm": 4.087148666381836, + "learning_rate": 2.7467822924245713e-06, + "loss": 1.7466, + "step": 26736 + }, + { + "epoch": 1.7944028723868326, + "grad_norm": 3.701404094696045, + "learning_rate": 2.7432307296793914e-06, + "loss": 1.7809, + "step": 26738 + }, + { + "epoch": 1.7945370960706017, + "grad_norm": 3.66926908493042, + "learning_rate": 2.739681399723637e-06, + "loss": 1.8816, + "step": 26740 + }, + { + "epoch": 1.7946713197543707, + "grad_norm": 3.8330557346343994, + "learning_rate": 2.736134302725002e-06, + "loss": 1.7703, + "step": 26742 + }, + { + "epoch": 1.7948055434381396, + "grad_norm": 4.824429988861084, + "learning_rate": 2.7325894388510855e-06, + "loss": 1.9806, + "step": 26744 + }, + { + "epoch": 1.7949397671219085, + "grad_norm": 4.078512668609619, + "learning_rate": 2.72904680826937e-06, + "loss": 1.9233, + "step": 26746 + }, + { + "epoch": 1.7950739908056776, + "grad_norm": 3.6742591857910156, + "learning_rate": 2.7255064111472385e-06, + "loss": 1.6792, + "step": 26748 + }, + { + "epoch": 1.7952082144894468, + "grad_norm": 3.8783552646636963, + "learning_rate": 2.721968247651957e-06, + "loss": 1.6082, + "step": 26750 + }, + { + "epoch": 1.7953424381732157, + "grad_norm": 3.9167308807373047, + "learning_rate": 2.71843231795072e-06, + "loss": 1.5465, + "step": 26752 + }, + { + "epoch": 1.7954766618569846, + "grad_norm": 4.59635066986084, + "learning_rate": 2.7148986222105765e-06, + "loss": 2.1535, + "step": 26754 + }, + { + "epoch": 1.7956108855407535, + "grad_norm": 3.776723623275757, + "learning_rate": 2.7113671605984934e-06, + "loss": 1.7299, + "step": 26756 + }, + { + "epoch": 1.7957451092245227, + "grad_norm": 4.389224052429199, + "learning_rate": 2.707837933281321e-06, + "loss": 2.0474, + "step": 26758 + }, + { + "epoch": 1.7958793329082918, + "grad_norm": 3.6616604328155518, + "learning_rate": 2.704310940425808e-06, + "loss": 1.8844, + "step": 26760 + }, + { + "epoch": 1.7960135565920607, + "grad_norm": 4.124492168426514, + "learning_rate": 2.7007861821986e-06, + "loss": 1.6783, + "step": 26762 + }, + { + "epoch": 1.7961477802758297, + "grad_norm": 4.093212127685547, + "learning_rate": 2.697263658766247e-06, + "loss": 2.0507, + "step": 26764 + }, + { + "epoch": 1.7962820039595986, + "grad_norm": 4.154343128204346, + "learning_rate": 2.6937433702951543e-06, + "loss": 1.9458, + "step": 26766 + }, + { + "epoch": 1.7964162276433677, + "grad_norm": 4.38416862487793, + "learning_rate": 2.690225316951672e-06, + "loss": 1.8887, + "step": 26768 + }, + { + "epoch": 1.7965504513271366, + "grad_norm": 3.6346590518951416, + "learning_rate": 2.6867094989020016e-06, + "loss": 1.8019, + "step": 26770 + }, + { + "epoch": 1.7966846750109058, + "grad_norm": 5.040079116821289, + "learning_rate": 2.683195916312281e-06, + "loss": 2.0771, + "step": 26772 + }, + { + "epoch": 1.7968188986946747, + "grad_norm": 4.298085689544678, + "learning_rate": 2.6796845693485064e-06, + "loss": 1.7127, + "step": 26774 + }, + { + "epoch": 1.7969531223784436, + "grad_norm": 3.9951298236846924, + "learning_rate": 2.6761754581765886e-06, + "loss": 1.9757, + "step": 26776 + }, + { + "epoch": 1.7970873460622125, + "grad_norm": 4.423707962036133, + "learning_rate": 2.672668582962312e-06, + "loss": 2.0924, + "step": 26778 + }, + { + "epoch": 1.7972215697459817, + "grad_norm": 5.162038326263428, + "learning_rate": 2.6691639438713834e-06, + "loss": 2.0158, + "step": 26780 + }, + { + "epoch": 1.7973557934297508, + "grad_norm": 4.794566631317139, + "learning_rate": 2.6656615410693918e-06, + "loss": 1.783, + "step": 26782 + }, + { + "epoch": 1.7974900171135197, + "grad_norm": 4.301304817199707, + "learning_rate": 2.662161374721811e-06, + "loss": 1.7337, + "step": 26784 + }, + { + "epoch": 1.7976242407972887, + "grad_norm": 4.011186122894287, + "learning_rate": 2.6586634449940194e-06, + "loss": 2.035, + "step": 26786 + }, + { + "epoch": 1.7977584644810576, + "grad_norm": 4.481747150421143, + "learning_rate": 2.6551677520512797e-06, + "loss": 2.0023, + "step": 26788 + }, + { + "epoch": 1.7978926881648267, + "grad_norm": 3.6369452476501465, + "learning_rate": 2.651674296058776e-06, + "loss": 1.8044, + "step": 26790 + }, + { + "epoch": 1.7980269118485956, + "grad_norm": 4.337130069732666, + "learning_rate": 2.6481830771815486e-06, + "loss": 1.6941, + "step": 26792 + }, + { + "epoch": 1.7981611355323648, + "grad_norm": 4.239383220672607, + "learning_rate": 2.64469409558456e-06, + "loss": 2.1209, + "step": 26794 + }, + { + "epoch": 1.7982953592161337, + "grad_norm": 4.176908493041992, + "learning_rate": 2.6412073514326508e-06, + "loss": 1.84, + "step": 26796 + }, + { + "epoch": 1.7984295828999026, + "grad_norm": 4.286230564117432, + "learning_rate": 2.637722844890572e-06, + "loss": 1.7378, + "step": 26798 + }, + { + "epoch": 1.7985638065836715, + "grad_norm": 4.506876468658447, + "learning_rate": 2.6342405761229485e-06, + "loss": 2.0075, + "step": 26800 + }, + { + "epoch": 1.7986980302674407, + "grad_norm": 4.528065204620361, + "learning_rate": 2.630760545294325e-06, + "loss": 1.8841, + "step": 26802 + }, + { + "epoch": 1.7988322539512098, + "grad_norm": 4.308983325958252, + "learning_rate": 2.6272827525691104e-06, + "loss": 1.7867, + "step": 26804 + }, + { + "epoch": 1.7989664776349787, + "grad_norm": 3.9762401580810547, + "learning_rate": 2.6238071981116383e-06, + "loss": 1.5922, + "step": 26806 + }, + { + "epoch": 1.7991007013187477, + "grad_norm": 4.089524745941162, + "learning_rate": 2.620333882086107e-06, + "loss": 2.0166, + "step": 26808 + }, + { + "epoch": 1.7992349250025166, + "grad_norm": 4.35562801361084, + "learning_rate": 2.616862804656639e-06, + "loss": 1.8822, + "step": 26810 + }, + { + "epoch": 1.7993691486862857, + "grad_norm": 3.474964141845703, + "learning_rate": 2.6133939659872265e-06, + "loss": 1.561, + "step": 26812 + }, + { + "epoch": 1.7995033723700546, + "grad_norm": 3.989548921585083, + "learning_rate": 2.6099273662417713e-06, + "loss": 1.7567, + "step": 26814 + }, + { + "epoch": 1.7996375960538238, + "grad_norm": 4.120115280151367, + "learning_rate": 2.6064630055840477e-06, + "loss": 1.7721, + "step": 26816 + }, + { + "epoch": 1.7997718197375927, + "grad_norm": 4.293820858001709, + "learning_rate": 2.6030008841777585e-06, + "loss": 1.5964, + "step": 26818 + }, + { + "epoch": 1.7999060434213616, + "grad_norm": 4.164609909057617, + "learning_rate": 2.5995410021864787e-06, + "loss": 2.0134, + "step": 26820 + }, + { + "epoch": 1.8000402671051305, + "grad_norm": 4.603229999542236, + "learning_rate": 2.596083359773677e-06, + "loss": 1.981, + "step": 26822 + }, + { + "epoch": 1.8001744907888997, + "grad_norm": 4.232473373413086, + "learning_rate": 2.5926279571027113e-06, + "loss": 2.1162, + "step": 26824 + }, + { + "epoch": 1.8003087144726688, + "grad_norm": 4.174623489379883, + "learning_rate": 2.589174794336863e-06, + "loss": 1.5689, + "step": 26826 + }, + { + "epoch": 1.8004429381564377, + "grad_norm": 10.796433448791504, + "learning_rate": 2.585723871639267e-06, + "loss": 1.9338, + "step": 26828 + }, + { + "epoch": 1.8005771618402067, + "grad_norm": 4.120389461517334, + "learning_rate": 2.5822751891729945e-06, + "loss": 1.7362, + "step": 26830 + }, + { + "epoch": 1.8007113855239756, + "grad_norm": 4.716836452484131, + "learning_rate": 2.578828747100964e-06, + "loss": 1.914, + "step": 26832 + }, + { + "epoch": 1.8008456092077447, + "grad_norm": 4.522303104400635, + "learning_rate": 2.5753845455860347e-06, + "loss": 2.1946, + "step": 26834 + }, + { + "epoch": 1.8009798328915139, + "grad_norm": 3.8956096172332764, + "learning_rate": 2.5719425847909206e-06, + "loss": 1.8568, + "step": 26836 + }, + { + "epoch": 1.8011140565752828, + "grad_norm": 4.310061931610107, + "learning_rate": 2.5685028648782638e-06, + "loss": 1.7543, + "step": 26838 + }, + { + "epoch": 1.8012482802590517, + "grad_norm": 4.354753494262695, + "learning_rate": 2.5650653860105733e-06, + "loss": 1.8323, + "step": 26840 + }, + { + "epoch": 1.8013825039428206, + "grad_norm": 4.283669471740723, + "learning_rate": 2.561630148350269e-06, + "loss": 1.7512, + "step": 26842 + }, + { + "epoch": 1.8015167276265898, + "grad_norm": 4.372633934020996, + "learning_rate": 2.558197152059649e-06, + "loss": 1.9757, + "step": 26844 + }, + { + "epoch": 1.8016509513103587, + "grad_norm": 4.138453006744385, + "learning_rate": 2.5547663973009284e-06, + "loss": 2.0376, + "step": 26846 + }, + { + "epoch": 1.8017851749941278, + "grad_norm": 3.914402961730957, + "learning_rate": 2.551337884236199e-06, + "loss": 1.7575, + "step": 26848 + }, + { + "epoch": 1.8019193986778967, + "grad_norm": 4.531423568725586, + "learning_rate": 2.547911613027454e-06, + "loss": 1.8916, + "step": 26850 + }, + { + "epoch": 1.8020536223616657, + "grad_norm": 4.206552982330322, + "learning_rate": 2.5444875838365745e-06, + "loss": 1.9589, + "step": 26852 + }, + { + "epoch": 1.8021878460454346, + "grad_norm": 4.339237689971924, + "learning_rate": 2.5410657968253317e-06, + "loss": 2.0342, + "step": 26854 + }, + { + "epoch": 1.8023220697292037, + "grad_norm": 4.24057674407959, + "learning_rate": 2.5376462521554125e-06, + "loss": 1.8693, + "step": 26856 + }, + { + "epoch": 1.8024562934129729, + "grad_norm": 4.027923107147217, + "learning_rate": 2.5342289499883766e-06, + "loss": 1.8039, + "step": 26858 + }, + { + "epoch": 1.8025905170967418, + "grad_norm": 3.835784673690796, + "learning_rate": 2.5308138904856838e-06, + "loss": 1.742, + "step": 26860 + }, + { + "epoch": 1.8027247407805107, + "grad_norm": 3.8147776126861572, + "learning_rate": 2.527401073808683e-06, + "loss": 2.0757, + "step": 26862 + }, + { + "epoch": 1.8028589644642796, + "grad_norm": 4.169976711273193, + "learning_rate": 2.5239905001186394e-06, + "loss": 1.886, + "step": 26864 + }, + { + "epoch": 1.8029931881480488, + "grad_norm": 4.44757080078125, + "learning_rate": 2.520582169576685e-06, + "loss": 2.0192, + "step": 26866 + }, + { + "epoch": 1.8031274118318177, + "grad_norm": 4.538173675537109, + "learning_rate": 2.517176082343858e-06, + "loss": 1.708, + "step": 26868 + }, + { + "epoch": 1.8032616355155868, + "grad_norm": 4.0926127433776855, + "learning_rate": 2.513772238581097e-06, + "loss": 1.6276, + "step": 26870 + }, + { + "epoch": 1.8033958591993557, + "grad_norm": 3.875983238220215, + "learning_rate": 2.5103706384492164e-06, + "loss": 1.868, + "step": 26872 + }, + { + "epoch": 1.8035300828831247, + "grad_norm": 4.058338642120361, + "learning_rate": 2.5069712821089277e-06, + "loss": 2.0465, + "step": 26874 + }, + { + "epoch": 1.8036643065668936, + "grad_norm": 3.517664909362793, + "learning_rate": 2.503574169720868e-06, + "loss": 1.7614, + "step": 26876 + }, + { + "epoch": 1.8037985302506627, + "grad_norm": 3.965304374694824, + "learning_rate": 2.5001793014455266e-06, + "loss": 1.8447, + "step": 26878 + }, + { + "epoch": 1.8039327539344319, + "grad_norm": 4.024295806884766, + "learning_rate": 2.4967866774433136e-06, + "loss": 1.5254, + "step": 26880 + }, + { + "epoch": 1.8040669776182008, + "grad_norm": 3.8057754039764404, + "learning_rate": 2.4933962978745117e-06, + "loss": 1.9133, + "step": 26882 + }, + { + "epoch": 1.8042012013019697, + "grad_norm": 3.5533010959625244, + "learning_rate": 2.490008162899321e-06, + "loss": 1.8674, + "step": 26884 + }, + { + "epoch": 1.8043354249857386, + "grad_norm": 4.065617561340332, + "learning_rate": 2.486622272677813e-06, + "loss": 1.7009, + "step": 26886 + }, + { + "epoch": 1.8044696486695078, + "grad_norm": 4.437594890594482, + "learning_rate": 2.483238627369988e-06, + "loss": 1.9642, + "step": 26888 + }, + { + "epoch": 1.8046038723532767, + "grad_norm": 4.553578853607178, + "learning_rate": 2.4798572271356846e-06, + "loss": 1.7517, + "step": 26890 + }, + { + "epoch": 1.8047380960370458, + "grad_norm": 3.779817819595337, + "learning_rate": 2.4764780721346914e-06, + "loss": 2.0606, + "step": 26892 + }, + { + "epoch": 1.8048723197208147, + "grad_norm": 4.777761459350586, + "learning_rate": 2.4731011625266477e-06, + "loss": 1.8, + "step": 26894 + }, + { + "epoch": 1.8050065434045837, + "grad_norm": 4.082887649536133, + "learning_rate": 2.4697264984711257e-06, + "loss": 1.9703, + "step": 26896 + }, + { + "epoch": 1.8051407670883526, + "grad_norm": 3.8027522563934326, + "learning_rate": 2.466354080127564e-06, + "loss": 1.8683, + "step": 26898 + }, + { + "epoch": 1.8052749907721217, + "grad_norm": 4.331504821777344, + "learning_rate": 2.4629839076552974e-06, + "loss": 1.8715, + "step": 26900 + }, + { + "epoch": 1.8054092144558909, + "grad_norm": 3.677621364593506, + "learning_rate": 2.459615981213559e-06, + "loss": 1.7315, + "step": 26902 + }, + { + "epoch": 1.8055434381396598, + "grad_norm": 3.9533791542053223, + "learning_rate": 2.456250300961488e-06, + "loss": 1.8498, + "step": 26904 + }, + { + "epoch": 1.8056776618234287, + "grad_norm": 4.5392608642578125, + "learning_rate": 2.452886867058102e-06, + "loss": 1.8271, + "step": 26906 + }, + { + "epoch": 1.8058118855071976, + "grad_norm": 4.569698810577393, + "learning_rate": 2.449525679662312e-06, + "loss": 2.0062, + "step": 26908 + }, + { + "epoch": 1.8059461091909668, + "grad_norm": 3.878943920135498, + "learning_rate": 2.44616673893292e-06, + "loss": 2.057, + "step": 26910 + }, + { + "epoch": 1.806080332874736, + "grad_norm": 4.126757621765137, + "learning_rate": 2.442810045028654e-06, + "loss": 2.0721, + "step": 26912 + }, + { + "epoch": 1.8062145565585048, + "grad_norm": 3.480307102203369, + "learning_rate": 2.439455598108081e-06, + "loss": 1.8696, + "step": 26914 + }, + { + "epoch": 1.8063487802422737, + "grad_norm": 3.593289375305176, + "learning_rate": 2.4361033983297255e-06, + "loss": 1.6487, + "step": 26916 + }, + { + "epoch": 1.8064830039260427, + "grad_norm": 4.263329982757568, + "learning_rate": 2.432753445851943e-06, + "loss": 1.6661, + "step": 26918 + }, + { + "epoch": 1.8066172276098118, + "grad_norm": 4.229862213134766, + "learning_rate": 2.4294057408330184e-06, + "loss": 1.789, + "step": 26920 + }, + { + "epoch": 1.8067514512935807, + "grad_norm": 4.460287570953369, + "learning_rate": 2.4260602834311364e-06, + "loss": 1.8148, + "step": 26922 + }, + { + "epoch": 1.8068856749773499, + "grad_norm": 3.801304340362549, + "learning_rate": 2.422717073804348e-06, + "loss": 1.8687, + "step": 26924 + }, + { + "epoch": 1.8070198986611188, + "grad_norm": 3.878286600112915, + "learning_rate": 2.4193761121106384e-06, + "loss": 1.8911, + "step": 26926 + }, + { + "epoch": 1.8071541223448877, + "grad_norm": 4.0024847984313965, + "learning_rate": 2.4160373985078256e-06, + "loss": 1.9031, + "step": 26928 + }, + { + "epoch": 1.8072883460286566, + "grad_norm": 4.36549186706543, + "learning_rate": 2.4127009331536832e-06, + "loss": 2.0842, + "step": 26930 + }, + { + "epoch": 1.8074225697124258, + "grad_norm": 4.552010536193848, + "learning_rate": 2.409366716205841e-06, + "loss": 1.9155, + "step": 26932 + }, + { + "epoch": 1.807556793396195, + "grad_norm": 4.20560359954834, + "learning_rate": 2.4060347478218446e-06, + "loss": 2.1078, + "step": 26934 + }, + { + "epoch": 1.8076910170799638, + "grad_norm": 4.224250793457031, + "learning_rate": 2.402705028159119e-06, + "loss": 1.8431, + "step": 26936 + }, + { + "epoch": 1.8078252407637327, + "grad_norm": 3.8197624683380127, + "learning_rate": 2.399377557374982e-06, + "loss": 1.7754, + "step": 26938 + }, + { + "epoch": 1.8079594644475017, + "grad_norm": 3.8954827785491943, + "learning_rate": 2.3960523356266475e-06, + "loss": 1.6436, + "step": 26940 + }, + { + "epoch": 1.8080936881312708, + "grad_norm": 3.6054744720458984, + "learning_rate": 2.3927293630712332e-06, + "loss": 2.0054, + "step": 26942 + }, + { + "epoch": 1.8082279118150397, + "grad_norm": 3.8391060829162598, + "learning_rate": 2.3894086398657478e-06, + "loss": 1.8772, + "step": 26944 + }, + { + "epoch": 1.8083621354988089, + "grad_norm": 3.471550226211548, + "learning_rate": 2.3860901661670764e-06, + "loss": 1.6032, + "step": 26946 + }, + { + "epoch": 1.8084963591825778, + "grad_norm": 3.880465269088745, + "learning_rate": 2.3827739421320105e-06, + "loss": 1.8703, + "step": 26948 + }, + { + "epoch": 1.8086305828663467, + "grad_norm": 4.096855640411377, + "learning_rate": 2.379459967917247e-06, + "loss": 1.8705, + "step": 26950 + }, + { + "epoch": 1.8087648065501156, + "grad_norm": 4.098852157592773, + "learning_rate": 2.376148243679355e-06, + "loss": 2.0021, + "step": 26952 + }, + { + "epoch": 1.8088990302338848, + "grad_norm": 4.619135856628418, + "learning_rate": 2.37283876957482e-06, + "loss": 2.0591, + "step": 26954 + }, + { + "epoch": 1.809033253917654, + "grad_norm": 4.208920001983643, + "learning_rate": 2.369531545759984e-06, + "loss": 1.9279, + "step": 26956 + }, + { + "epoch": 1.8091674776014228, + "grad_norm": 4.189339637756348, + "learning_rate": 2.3662265723911337e-06, + "loss": 1.7091, + "step": 26958 + }, + { + "epoch": 1.8093017012851917, + "grad_norm": 3.972482204437256, + "learning_rate": 2.3629238496243987e-06, + "loss": 1.9483, + "step": 26960 + }, + { + "epoch": 1.8094359249689607, + "grad_norm": 3.8914365768432617, + "learning_rate": 2.359623377615844e-06, + "loss": 1.6997, + "step": 26962 + }, + { + "epoch": 1.8095701486527298, + "grad_norm": 4.392874717712402, + "learning_rate": 2.356325156521405e-06, + "loss": 1.7249, + "step": 26964 + }, + { + "epoch": 1.8097043723364987, + "grad_norm": 3.879878520965576, + "learning_rate": 2.353029186496919e-06, + "loss": 1.9525, + "step": 26966 + }, + { + "epoch": 1.8098385960202679, + "grad_norm": 4.300960540771484, + "learning_rate": 2.349735467698094e-06, + "loss": 1.6511, + "step": 26968 + }, + { + "epoch": 1.8099728197040368, + "grad_norm": 4.136781215667725, + "learning_rate": 2.3464440002805844e-06, + "loss": 1.7676, + "step": 26970 + }, + { + "epoch": 1.8101070433878057, + "grad_norm": 4.0123372077941895, + "learning_rate": 2.3431547843998815e-06, + "loss": 1.8741, + "step": 26972 + }, + { + "epoch": 1.8102412670715746, + "grad_norm": 4.215743064880371, + "learning_rate": 2.3398678202114054e-06, + "loss": 1.9429, + "step": 26974 + }, + { + "epoch": 1.8103754907553438, + "grad_norm": 4.041572570800781, + "learning_rate": 2.336583107870449e-06, + "loss": 1.7275, + "step": 26976 + }, + { + "epoch": 1.810509714439113, + "grad_norm": 4.151956558227539, + "learning_rate": 2.333300647532222e-06, + "loss": 1.8188, + "step": 26978 + }, + { + "epoch": 1.8106439381228818, + "grad_norm": 3.5436580181121826, + "learning_rate": 2.330020439351799e-06, + "loss": 1.9162, + "step": 26980 + }, + { + "epoch": 1.8107781618066507, + "grad_norm": 4.213555335998535, + "learning_rate": 2.3267424834841845e-06, + "loss": 2.0373, + "step": 26982 + }, + { + "epoch": 1.8109123854904197, + "grad_norm": 3.3478758335113525, + "learning_rate": 2.323466780084227e-06, + "loss": 1.5442, + "step": 26984 + }, + { + "epoch": 1.8110466091741888, + "grad_norm": 4.148407936096191, + "learning_rate": 2.3201933293067247e-06, + "loss": 1.6524, + "step": 26986 + }, + { + "epoch": 1.811180832857958, + "grad_norm": 3.9721174240112305, + "learning_rate": 2.3169221313063207e-06, + "loss": 1.8831, + "step": 26988 + }, + { + "epoch": 1.8113150565417269, + "grad_norm": 3.7487175464630127, + "learning_rate": 2.31365318623758e-06, + "loss": 1.7817, + "step": 26990 + }, + { + "epoch": 1.8114492802254958, + "grad_norm": 4.50717830657959, + "learning_rate": 2.3103864942549623e-06, + "loss": 2.0161, + "step": 26992 + }, + { + "epoch": 1.8115835039092647, + "grad_norm": 4.036996364593506, + "learning_rate": 2.307122055512806e-06, + "loss": 1.6567, + "step": 26994 + }, + { + "epoch": 1.8117177275930338, + "grad_norm": 4.151657581329346, + "learning_rate": 2.3038598701653425e-06, + "loss": 1.9562, + "step": 26996 + }, + { + "epoch": 1.8118519512768028, + "grad_norm": 3.8905019760131836, + "learning_rate": 2.30059993836671e-06, + "loss": 1.6768, + "step": 26998 + }, + { + "epoch": 1.811986174960572, + "grad_norm": 4.218891620635986, + "learning_rate": 2.297342260270935e-06, + "loss": 1.7405, + "step": 27000 + }, + { + "epoch": 1.8121203986443408, + "grad_norm": 4.257456302642822, + "learning_rate": 2.294086836031939e-06, + "loss": 1.9184, + "step": 27002 + }, + { + "epoch": 1.8122546223281097, + "grad_norm": 3.907236099243164, + "learning_rate": 2.2908336658035266e-06, + "loss": 1.5798, + "step": 27004 + }, + { + "epoch": 1.8123888460118787, + "grad_norm": 4.227237701416016, + "learning_rate": 2.287582749739403e-06, + "loss": 1.8997, + "step": 27006 + }, + { + "epoch": 1.8125230696956478, + "grad_norm": 4.311452865600586, + "learning_rate": 2.2843340879931783e-06, + "loss": 1.8722, + "step": 27008 + }, + { + "epoch": 1.812657293379417, + "grad_norm": 4.411679267883301, + "learning_rate": 2.2810876807183414e-06, + "loss": 1.8422, + "step": 27010 + }, + { + "epoch": 1.8127915170631859, + "grad_norm": 4.122270584106445, + "learning_rate": 2.277843528068274e-06, + "loss": 1.7861, + "step": 27012 + }, + { + "epoch": 1.8129257407469548, + "grad_norm": 3.8618857860565186, + "learning_rate": 2.274601630196249e-06, + "loss": 1.8639, + "step": 27014 + }, + { + "epoch": 1.8130599644307237, + "grad_norm": 4.383569717407227, + "learning_rate": 2.2713619872554603e-06, + "loss": 1.9307, + "step": 27016 + }, + { + "epoch": 1.8131941881144928, + "grad_norm": 4.453375816345215, + "learning_rate": 2.268124599398952e-06, + "loss": 1.7664, + "step": 27018 + }, + { + "epoch": 1.8133284117982618, + "grad_norm": 4.628758907318115, + "learning_rate": 2.264889466779707e-06, + "loss": 1.8844, + "step": 27020 + }, + { + "epoch": 1.813462635482031, + "grad_norm": 3.951591730117798, + "learning_rate": 2.261656589550565e-06, + "loss": 1.8692, + "step": 27022 + }, + { + "epoch": 1.8135968591657998, + "grad_norm": 4.248632431030273, + "learning_rate": 2.258425967864275e-06, + "loss": 1.972, + "step": 27024 + }, + { + "epoch": 1.8137310828495687, + "grad_norm": 4.111885070800781, + "learning_rate": 2.2551976018734766e-06, + "loss": 1.9912, + "step": 27026 + }, + { + "epoch": 1.8138653065333377, + "grad_norm": 5.044241428375244, + "learning_rate": 2.251971491730709e-06, + "loss": 1.8548, + "step": 27028 + }, + { + "epoch": 1.8139995302171068, + "grad_norm": 4.24172306060791, + "learning_rate": 2.248747637588394e-06, + "loss": 1.9108, + "step": 27030 + }, + { + "epoch": 1.814133753900876, + "grad_norm": 4.288963794708252, + "learning_rate": 2.2455260395988553e-06, + "loss": 1.7795, + "step": 27032 + }, + { + "epoch": 1.8142679775846449, + "grad_norm": 4.40950870513916, + "learning_rate": 2.242306697914298e-06, + "loss": 1.8935, + "step": 27034 + }, + { + "epoch": 1.8144022012684138, + "grad_norm": 4.336639881134033, + "learning_rate": 2.2390896126868453e-06, + "loss": 2.0651, + "step": 27036 + }, + { + "epoch": 1.8145364249521827, + "grad_norm": 4.119831085205078, + "learning_rate": 2.2358747840684924e-06, + "loss": 1.7883, + "step": 27038 + }, + { + "epoch": 1.8146706486359518, + "grad_norm": 3.771799087524414, + "learning_rate": 2.232662212211134e-06, + "loss": 1.8584, + "step": 27040 + }, + { + "epoch": 1.8148048723197208, + "grad_norm": 3.0611085891723633, + "learning_rate": 2.2294518972665434e-06, + "loss": 1.781, + "step": 27042 + }, + { + "epoch": 1.81493909600349, + "grad_norm": 3.7915139198303223, + "learning_rate": 2.2262438393864214e-06, + "loss": 1.8097, + "step": 27044 + }, + { + "epoch": 1.8150733196872588, + "grad_norm": 4.632130146026611, + "learning_rate": 2.2230380387223305e-06, + "loss": 2.016, + "step": 27046 + }, + { + "epoch": 1.8152075433710277, + "grad_norm": 3.5255987644195557, + "learning_rate": 2.219834495425749e-06, + "loss": 1.5303, + "step": 27048 + }, + { + "epoch": 1.8153417670547967, + "grad_norm": 3.7652268409729004, + "learning_rate": 2.2166332096480337e-06, + "loss": 1.9011, + "step": 27050 + }, + { + "epoch": 1.8154759907385658, + "grad_norm": 4.358224868774414, + "learning_rate": 2.213434181540436e-06, + "loss": 1.6894, + "step": 27052 + }, + { + "epoch": 1.815610214422335, + "grad_norm": 4.337924003601074, + "learning_rate": 2.2102374112541013e-06, + "loss": 1.8343, + "step": 27054 + }, + { + "epoch": 1.8157444381061039, + "grad_norm": 3.8260793685913086, + "learning_rate": 2.207042898940076e-06, + "loss": 1.7642, + "step": 27056 + }, + { + "epoch": 1.8158786617898728, + "grad_norm": 4.267097473144531, + "learning_rate": 2.2038506447492945e-06, + "loss": 1.8897, + "step": 27058 + }, + { + "epoch": 1.8160128854736417, + "grad_norm": 4.335625171661377, + "learning_rate": 2.200660648832581e-06, + "loss": 1.9286, + "step": 27060 + }, + { + "epoch": 1.8161471091574108, + "grad_norm": 4.424828052520752, + "learning_rate": 2.197472911340659e-06, + "loss": 1.9486, + "step": 27062 + }, + { + "epoch": 1.81628133284118, + "grad_norm": 4.42557430267334, + "learning_rate": 2.1942874324241357e-06, + "loss": 2.2329, + "step": 27064 + }, + { + "epoch": 1.816415556524949, + "grad_norm": 4.045375823974609, + "learning_rate": 2.1911042122335356e-06, + "loss": 2.2779, + "step": 27066 + }, + { + "epoch": 1.8165497802087178, + "grad_norm": 4.083019256591797, + "learning_rate": 2.1879232509192494e-06, + "loss": 1.7283, + "step": 27068 + }, + { + "epoch": 1.8166840038924867, + "grad_norm": 3.823678970336914, + "learning_rate": 2.1847445486315675e-06, + "loss": 1.577, + "step": 27070 + }, + { + "epoch": 1.8168182275762559, + "grad_norm": 4.110856056213379, + "learning_rate": 2.181568105520676e-06, + "loss": 1.9851, + "step": 27072 + }, + { + "epoch": 1.8169524512600248, + "grad_norm": 4.461495876312256, + "learning_rate": 2.1783939217366655e-06, + "loss": 1.9042, + "step": 27074 + }, + { + "epoch": 1.817086674943794, + "grad_norm": 3.70774507522583, + "learning_rate": 2.1752219974294995e-06, + "loss": 1.8446, + "step": 27076 + }, + { + "epoch": 1.8172208986275629, + "grad_norm": 4.2536234855651855, + "learning_rate": 2.1720523327490636e-06, + "loss": 1.8985, + "step": 27078 + }, + { + "epoch": 1.8173551223113318, + "grad_norm": 3.6566121578216553, + "learning_rate": 2.168884927845094e-06, + "loss": 1.9308, + "step": 27080 + }, + { + "epoch": 1.8174893459951007, + "grad_norm": 3.54663348197937, + "learning_rate": 2.165719782867259e-06, + "loss": 1.9227, + "step": 27082 + }, + { + "epoch": 1.8176235696788698, + "grad_norm": 3.836632013320923, + "learning_rate": 2.1625568979651014e-06, + "loss": 1.6297, + "step": 27084 + }, + { + "epoch": 1.817757793362639, + "grad_norm": 3.4969868659973145, + "learning_rate": 2.1593962732880615e-06, + "loss": 1.6842, + "step": 27086 + }, + { + "epoch": 1.817892017046408, + "grad_norm": 4.595363616943359, + "learning_rate": 2.1562379089854755e-06, + "loss": 1.9423, + "step": 27088 + }, + { + "epoch": 1.8180262407301768, + "grad_norm": 4.039866924285889, + "learning_rate": 2.1530818052065695e-06, + "loss": 1.8573, + "step": 27090 + }, + { + "epoch": 1.8181604644139457, + "grad_norm": 4.005762577056885, + "learning_rate": 2.1499279621004564e-06, + "loss": 1.9074, + "step": 27092 + }, + { + "epoch": 1.8182946880977149, + "grad_norm": 4.248556137084961, + "learning_rate": 2.146776379816157e-06, + "loss": 2.0281, + "step": 27094 + }, + { + "epoch": 1.8184289117814838, + "grad_norm": 4.284428596496582, + "learning_rate": 2.1436270585025732e-06, + "loss": 1.9514, + "step": 27096 + }, + { + "epoch": 1.818563135465253, + "grad_norm": 4.228694915771484, + "learning_rate": 2.140479998308509e-06, + "loss": 2.1998, + "step": 27098 + }, + { + "epoch": 1.8186973591490219, + "grad_norm": 4.463009357452393, + "learning_rate": 2.137335199382645e-06, + "loss": 1.767, + "step": 27100 + }, + { + "epoch": 1.8188315828327908, + "grad_norm": 3.9033915996551514, + "learning_rate": 2.1341926618735796e-06, + "loss": 1.9188, + "step": 27102 + }, + { + "epoch": 1.8189658065165597, + "grad_norm": 3.9838550090789795, + "learning_rate": 2.1310523859297828e-06, + "loss": 1.9078, + "step": 27104 + }, + { + "epoch": 1.8191000302003288, + "grad_norm": 3.918630599975586, + "learning_rate": 2.1279143716996464e-06, + "loss": 1.8379, + "step": 27106 + }, + { + "epoch": 1.819234253884098, + "grad_norm": 3.6818552017211914, + "learning_rate": 2.1247786193314025e-06, + "loss": 1.9837, + "step": 27108 + }, + { + "epoch": 1.819368477567867, + "grad_norm": 4.010140895843506, + "learning_rate": 2.121645128973232e-06, + "loss": 1.8107, + "step": 27110 + }, + { + "epoch": 1.8195027012516358, + "grad_norm": 4.350226879119873, + "learning_rate": 2.118513900773178e-06, + "loss": 1.85, + "step": 27112 + }, + { + "epoch": 1.8196369249354047, + "grad_norm": 4.1691813468933105, + "learning_rate": 2.1153849348791887e-06, + "loss": 1.8645, + "step": 27114 + }, + { + "epoch": 1.8197711486191739, + "grad_norm": 3.8314177989959717, + "learning_rate": 2.1122582314391013e-06, + "loss": 1.902, + "step": 27116 + }, + { + "epoch": 1.8199053723029428, + "grad_norm": 3.5142226219177246, + "learning_rate": 2.1091337906006482e-06, + "loss": 1.7338, + "step": 27118 + }, + { + "epoch": 1.820039595986712, + "grad_norm": 4.053452968597412, + "learning_rate": 2.1060116125114436e-06, + "loss": 1.9606, + "step": 27120 + }, + { + "epoch": 1.8201738196704809, + "grad_norm": 4.247061729431152, + "learning_rate": 2.102891697319015e-06, + "loss": 1.8625, + "step": 27122 + }, + { + "epoch": 1.8203080433542498, + "grad_norm": 4.04962682723999, + "learning_rate": 2.099774045170766e-06, + "loss": 1.8763, + "step": 27124 + }, + { + "epoch": 1.8204422670380187, + "grad_norm": 3.7653894424438477, + "learning_rate": 2.096658656214007e-06, + "loss": 2.0535, + "step": 27126 + }, + { + "epoch": 1.8205764907217878, + "grad_norm": 3.3025741577148438, + "learning_rate": 2.093545530595925e-06, + "loss": 1.7304, + "step": 27128 + }, + { + "epoch": 1.820710714405557, + "grad_norm": 3.9061107635498047, + "learning_rate": 2.090434668463609e-06, + "loss": 1.7163, + "step": 27130 + }, + { + "epoch": 1.820844938089326, + "grad_norm": 3.1799771785736084, + "learning_rate": 2.0873260699640462e-06, + "loss": 1.6826, + "step": 27132 + }, + { + "epoch": 1.8209791617730948, + "grad_norm": 4.597368240356445, + "learning_rate": 2.084219735244114e-06, + "loss": 1.9235, + "step": 27134 + }, + { + "epoch": 1.8211133854568637, + "grad_norm": 4.037221431732178, + "learning_rate": 2.081115664450578e-06, + "loss": 1.92, + "step": 27136 + }, + { + "epoch": 1.8212476091406329, + "grad_norm": 4.9644083976745605, + "learning_rate": 2.078013857730088e-06, + "loss": 1.8674, + "step": 27138 + }, + { + "epoch": 1.821381832824402, + "grad_norm": 4.410196304321289, + "learning_rate": 2.0749143152292204e-06, + "loss": 1.7727, + "step": 27140 + }, + { + "epoch": 1.821516056508171, + "grad_norm": 3.944392204284668, + "learning_rate": 2.071817037094398e-06, + "loss": 1.8985, + "step": 27142 + }, + { + "epoch": 1.8216502801919399, + "grad_norm": 3.84732985496521, + "learning_rate": 2.068722023471992e-06, + "loss": 1.9584, + "step": 27144 + }, + { + "epoch": 1.8217845038757088, + "grad_norm": 4.202569484710693, + "learning_rate": 2.065629274508202e-06, + "loss": 2.2035, + "step": 27146 + }, + { + "epoch": 1.821918727559478, + "grad_norm": 4.235292434692383, + "learning_rate": 2.062538790349178e-06, + "loss": 1.6609, + "step": 27148 + }, + { + "epoch": 1.8220529512432468, + "grad_norm": 4.170900344848633, + "learning_rate": 2.059450571140925e-06, + "loss": 1.9939, + "step": 27150 + }, + { + "epoch": 1.822187174927016, + "grad_norm": 4.473316669464111, + "learning_rate": 2.056364617029366e-06, + "loss": 2.0358, + "step": 27152 + }, + { + "epoch": 1.822321398610785, + "grad_norm": 3.731180429458618, + "learning_rate": 2.0532809281603062e-06, + "loss": 1.8415, + "step": 27154 + }, + { + "epoch": 1.8224556222945538, + "grad_norm": 4.5942702293396, + "learning_rate": 2.050199504679434e-06, + "loss": 2.0048, + "step": 27156 + }, + { + "epoch": 1.8225898459783227, + "grad_norm": 3.9296562671661377, + "learning_rate": 2.0471203467323398e-06, + "loss": 1.7066, + "step": 27158 + }, + { + "epoch": 1.8227240696620919, + "grad_norm": 3.70497727394104, + "learning_rate": 2.0440434544645227e-06, + "loss": 1.7785, + "step": 27160 + }, + { + "epoch": 1.822858293345861, + "grad_norm": 4.256872177124023, + "learning_rate": 2.0409688280213503e-06, + "loss": 1.8102, + "step": 27162 + }, + { + "epoch": 1.82299251702963, + "grad_norm": 4.657016754150391, + "learning_rate": 2.037896467548095e-06, + "loss": 2.1487, + "step": 27164 + }, + { + "epoch": 1.8231267407133989, + "grad_norm": 3.807194471359253, + "learning_rate": 2.0348263731899076e-06, + "loss": 1.7755, + "step": 27166 + }, + { + "epoch": 1.8232609643971678, + "grad_norm": 4.971982955932617, + "learning_rate": 2.0317585450918663e-06, + "loss": 2.0281, + "step": 27168 + }, + { + "epoch": 1.823395188080937, + "grad_norm": 4.116217613220215, + "learning_rate": 2.0286929833988943e-06, + "loss": 1.9604, + "step": 27170 + }, + { + "epoch": 1.8235294117647058, + "grad_norm": 3.8150079250335693, + "learning_rate": 2.025629688255859e-06, + "loss": 1.7027, + "step": 27172 + }, + { + "epoch": 1.823663635448475, + "grad_norm": 4.747952461242676, + "learning_rate": 2.0225686598074835e-06, + "loss": 1.7951, + "step": 27174 + }, + { + "epoch": 1.823797859132244, + "grad_norm": 4.072630405426025, + "learning_rate": 2.0195098981983963e-06, + "loss": 1.9375, + "step": 27176 + }, + { + "epoch": 1.8239320828160128, + "grad_norm": 4.570821762084961, + "learning_rate": 2.01645340357311e-06, + "loss": 1.9424, + "step": 27178 + }, + { + "epoch": 1.8240663064997817, + "grad_norm": 4.002171993255615, + "learning_rate": 2.0133991760760475e-06, + "loss": 1.854, + "step": 27180 + }, + { + "epoch": 1.8242005301835509, + "grad_norm": 3.6534714698791504, + "learning_rate": 2.0103472158515158e-06, + "loss": 1.6342, + "step": 27182 + }, + { + "epoch": 1.82433475386732, + "grad_norm": 3.9633123874664307, + "learning_rate": 2.007297523043711e-06, + "loss": 1.7272, + "step": 27184 + }, + { + "epoch": 1.824468977551089, + "grad_norm": 3.501537322998047, + "learning_rate": 2.0042500977967172e-06, + "loss": 1.6726, + "step": 27186 + }, + { + "epoch": 1.8246032012348579, + "grad_norm": 4.586471080780029, + "learning_rate": 2.0012049402545306e-06, + "loss": 1.7342, + "step": 27188 + }, + { + "epoch": 1.8247374249186268, + "grad_norm": 4.6357879638671875, + "learning_rate": 1.9981620505610255e-06, + "loss": 1.6966, + "step": 27190 + }, + { + "epoch": 1.824871648602396, + "grad_norm": 4.774326801300049, + "learning_rate": 1.995121428859975e-06, + "loss": 2.026, + "step": 27192 + }, + { + "epoch": 1.8250058722861648, + "grad_norm": 4.038241863250732, + "learning_rate": 1.992083075295037e-06, + "loss": 1.827, + "step": 27194 + }, + { + "epoch": 1.825140095969934, + "grad_norm": 4.2046403884887695, + "learning_rate": 1.989046990009763e-06, + "loss": 1.9382, + "step": 27196 + }, + { + "epoch": 1.825274319653703, + "grad_norm": 4.156599521636963, + "learning_rate": 1.9860131731476217e-06, + "loss": 1.8277, + "step": 27198 + }, + { + "epoch": 1.8254085433374718, + "grad_norm": 4.695516109466553, + "learning_rate": 1.982981624851932e-06, + "loss": 1.8203, + "step": 27200 + }, + { + "epoch": 1.8255427670212407, + "grad_norm": 4.404665470123291, + "learning_rate": 1.979952345265951e-06, + "loss": 2.0241, + "step": 27202 + }, + { + "epoch": 1.8256769907050099, + "grad_norm": 4.144914150238037, + "learning_rate": 1.976925334532781e-06, + "loss": 1.8334, + "step": 27204 + }, + { + "epoch": 1.825811214388779, + "grad_norm": 4.236912250518799, + "learning_rate": 1.9739005927954633e-06, + "loss": 1.8343, + "step": 27206 + }, + { + "epoch": 1.825945438072548, + "grad_norm": 4.8627610206604, + "learning_rate": 1.9708781201968952e-06, + "loss": 2.0208, + "step": 27208 + }, + { + "epoch": 1.8260796617563169, + "grad_norm": 4.498232364654541, + "learning_rate": 1.9678579168799004e-06, + "loss": 1.7179, + "step": 27210 + }, + { + "epoch": 1.8262138854400858, + "grad_norm": 4.270575046539307, + "learning_rate": 1.9648399829871654e-06, + "loss": 1.8199, + "step": 27212 + }, + { + "epoch": 1.826348109123855, + "grad_norm": 4.352452754974365, + "learning_rate": 1.9618243186612815e-06, + "loss": 1.9061, + "step": 27214 + }, + { + "epoch": 1.826482332807624, + "grad_norm": 3.6290738582611084, + "learning_rate": 1.958810924044735e-06, + "loss": 1.9539, + "step": 27216 + }, + { + "epoch": 1.826616556491393, + "grad_norm": 4.012609004974365, + "learning_rate": 1.955799799279906e-06, + "loss": 1.6234, + "step": 27218 + }, + { + "epoch": 1.826750780175162, + "grad_norm": 3.8702077865600586, + "learning_rate": 1.952790944509064e-06, + "loss": 1.8182, + "step": 27220 + }, + { + "epoch": 1.8268850038589308, + "grad_norm": 3.8011398315429688, + "learning_rate": 1.949784359874368e-06, + "loss": 1.7552, + "step": 27222 + }, + { + "epoch": 1.8270192275427, + "grad_norm": 4.253540992736816, + "learning_rate": 1.9467800455178654e-06, + "loss": 1.9547, + "step": 27224 + }, + { + "epoch": 1.8271534512264689, + "grad_norm": 4.279043674468994, + "learning_rate": 1.9437780015815254e-06, + "loss": 2.1131, + "step": 27226 + }, + { + "epoch": 1.827287674910238, + "grad_norm": 4.631351470947266, + "learning_rate": 1.940778228207163e-06, + "loss": 1.6477, + "step": 27228 + }, + { + "epoch": 1.827421898594007, + "grad_norm": 4.543736457824707, + "learning_rate": 1.937780725536542e-06, + "loss": 1.9549, + "step": 27230 + }, + { + "epoch": 1.8275561222777759, + "grad_norm": 3.1199288368225098, + "learning_rate": 1.9347854937112606e-06, + "loss": 1.6542, + "step": 27232 + }, + { + "epoch": 1.8276903459615448, + "grad_norm": 3.736006021499634, + "learning_rate": 1.9317925328728504e-06, + "loss": 1.8302, + "step": 27234 + }, + { + "epoch": 1.827824569645314, + "grad_norm": 3.4618771076202393, + "learning_rate": 1.9288018431627143e-06, + "loss": 1.4613, + "step": 27236 + }, + { + "epoch": 1.827958793329083, + "grad_norm": 4.081121444702148, + "learning_rate": 1.9258134247221725e-06, + "loss": 2.1624, + "step": 27238 + }, + { + "epoch": 1.828093017012852, + "grad_norm": 4.130151271820068, + "learning_rate": 1.922827277692413e-06, + "loss": 1.8013, + "step": 27240 + }, + { + "epoch": 1.828227240696621, + "grad_norm": 4.028398513793945, + "learning_rate": 1.9198434022145216e-06, + "loss": 1.8777, + "step": 27242 + }, + { + "epoch": 1.8283614643803898, + "grad_norm": 4.173541069030762, + "learning_rate": 1.916861798429481e-06, + "loss": 2.0249, + "step": 27244 + }, + { + "epoch": 1.828495688064159, + "grad_norm": 4.179261684417725, + "learning_rate": 1.913882466478173e-06, + "loss": 1.7862, + "step": 27246 + }, + { + "epoch": 1.8286299117479279, + "grad_norm": 4.3275065422058105, + "learning_rate": 1.910905406501362e-06, + "loss": 2.0168, + "step": 27248 + }, + { + "epoch": 1.828764135431697, + "grad_norm": 4.1722636222839355, + "learning_rate": 1.907930618639703e-06, + "loss": 2.0714, + "step": 27250 + }, + { + "epoch": 1.828898359115466, + "grad_norm": 4.128744602203369, + "learning_rate": 1.9049581030337548e-06, + "loss": 1.8528, + "step": 27252 + }, + { + "epoch": 1.8290325827992349, + "grad_norm": 4.061409950256348, + "learning_rate": 1.9019878598239615e-06, + "loss": 1.836, + "step": 27254 + }, + { + "epoch": 1.8291668064830038, + "grad_norm": 3.637096405029297, + "learning_rate": 1.8990198891506605e-06, + "loss": 1.6292, + "step": 27256 + }, + { + "epoch": 1.829301030166773, + "grad_norm": 5.855587005615234, + "learning_rate": 1.8960541911540896e-06, + "loss": 1.8931, + "step": 27258 + }, + { + "epoch": 1.829435253850542, + "grad_norm": 4.142984390258789, + "learning_rate": 1.8930907659743646e-06, + "loss": 1.9144, + "step": 27260 + }, + { + "epoch": 1.829569477534311, + "grad_norm": 3.6684553623199463, + "learning_rate": 1.89012961375149e-06, + "loss": 2.072, + "step": 27262 + }, + { + "epoch": 1.82970370121808, + "grad_norm": 4.3269500732421875, + "learning_rate": 1.8871707346253986e-06, + "loss": 1.8004, + "step": 27264 + }, + { + "epoch": 1.8298379249018488, + "grad_norm": 4.239622116088867, + "learning_rate": 1.8842141287358727e-06, + "loss": 1.775, + "step": 27266 + }, + { + "epoch": 1.829972148585618, + "grad_norm": 4.450201034545898, + "learning_rate": 1.8812597962226286e-06, + "loss": 1.8773, + "step": 27268 + }, + { + "epoch": 1.8301063722693869, + "grad_norm": 4.044822692871094, + "learning_rate": 1.878307737225221e-06, + "loss": 2.0062, + "step": 27270 + }, + { + "epoch": 1.830240595953156, + "grad_norm": 4.639606475830078, + "learning_rate": 1.8753579518831554e-06, + "loss": 2.0616, + "step": 27272 + }, + { + "epoch": 1.830374819636925, + "grad_norm": 4.142370700836182, + "learning_rate": 1.8724104403357868e-06, + "loss": 1.9433, + "step": 27274 + }, + { + "epoch": 1.8305090433206939, + "grad_norm": 4.219827651977539, + "learning_rate": 1.8694652027223925e-06, + "loss": 1.6382, + "step": 27276 + }, + { + "epoch": 1.8306432670044628, + "grad_norm": 4.026155948638916, + "learning_rate": 1.8665222391821169e-06, + "loss": 2.0255, + "step": 27278 + }, + { + "epoch": 1.830777490688232, + "grad_norm": 3.601552724838257, + "learning_rate": 1.863581549854021e-06, + "loss": 1.7292, + "step": 27280 + }, + { + "epoch": 1.830911714372001, + "grad_norm": 4.164125442504883, + "learning_rate": 1.8606431348770325e-06, + "loss": 1.7204, + "step": 27282 + }, + { + "epoch": 1.83104593805577, + "grad_norm": 4.043823719024658, + "learning_rate": 1.8577069943900017e-06, + "loss": 1.8166, + "step": 27284 + }, + { + "epoch": 1.831180161739539, + "grad_norm": 4.2221198081970215, + "learning_rate": 1.854773128531645e-06, + "loss": 1.9255, + "step": 27286 + }, + { + "epoch": 1.8313143854233078, + "grad_norm": 3.9721717834472656, + "learning_rate": 1.851841537440585e-06, + "loss": 1.8617, + "step": 27288 + }, + { + "epoch": 1.831448609107077, + "grad_norm": 4.383234977722168, + "learning_rate": 1.8489122212553279e-06, + "loss": 2.001, + "step": 27290 + }, + { + "epoch": 1.831582832790846, + "grad_norm": 4.386233329772949, + "learning_rate": 1.8459851801142902e-06, + "loss": 1.9086, + "step": 27292 + }, + { + "epoch": 1.831717056474615, + "grad_norm": 5.016953945159912, + "learning_rate": 1.8430604141557562e-06, + "loss": 1.9482, + "step": 27294 + }, + { + "epoch": 1.831851280158384, + "grad_norm": 4.159138202667236, + "learning_rate": 1.8401379235179316e-06, + "loss": 1.7683, + "step": 27296 + }, + { + "epoch": 1.8319855038421529, + "grad_norm": 4.136733531951904, + "learning_rate": 1.8372177083388786e-06, + "loss": 2.0056, + "step": 27298 + }, + { + "epoch": 1.832119727525922, + "grad_norm": 4.354679584503174, + "learning_rate": 1.834299768756581e-06, + "loss": 2.0377, + "step": 27300 + }, + { + "epoch": 1.832253951209691, + "grad_norm": 4.224242210388184, + "learning_rate": 1.8313841049089065e-06, + "loss": 1.8743, + "step": 27302 + }, + { + "epoch": 1.83238817489346, + "grad_norm": 4.6432013511657715, + "learning_rate": 1.8284707169336169e-06, + "loss": 1.6511, + "step": 27304 + }, + { + "epoch": 1.832522398577229, + "grad_norm": 4.2709431648254395, + "learning_rate": 1.8255596049683576e-06, + "loss": 1.9855, + "step": 27306 + }, + { + "epoch": 1.832656622260998, + "grad_norm": 4.632303237915039, + "learning_rate": 1.82265076915068e-06, + "loss": 1.8649, + "step": 27308 + }, + { + "epoch": 1.8327908459447668, + "grad_norm": 4.385166168212891, + "learning_rate": 1.8197442096180072e-06, + "loss": 1.978, + "step": 27310 + }, + { + "epoch": 1.832925069628536, + "grad_norm": 4.207446098327637, + "learning_rate": 1.8168399265076852e-06, + "loss": 1.8297, + "step": 27312 + }, + { + "epoch": 1.833059293312305, + "grad_norm": 3.9463629722595215, + "learning_rate": 1.8139379199569318e-06, + "loss": 1.7066, + "step": 27314 + }, + { + "epoch": 1.833193516996074, + "grad_norm": 4.343576431274414, + "learning_rate": 1.8110381901028539e-06, + "loss": 1.7811, + "step": 27316 + }, + { + "epoch": 1.833327740679843, + "grad_norm": 4.161701202392578, + "learning_rate": 1.8081407370824588e-06, + "loss": 1.612, + "step": 27318 + }, + { + "epoch": 1.8334619643636119, + "grad_norm": 4.08317232131958, + "learning_rate": 1.8052455610326534e-06, + "loss": 1.8075, + "step": 27320 + }, + { + "epoch": 1.833596188047381, + "grad_norm": 4.336402893066406, + "learning_rate": 1.8023526620902176e-06, + "loss": 1.8286, + "step": 27322 + }, + { + "epoch": 1.83373041173115, + "grad_norm": 4.3985209465026855, + "learning_rate": 1.7994620403918527e-06, + "loss": 1.8104, + "step": 27324 + }, + { + "epoch": 1.833864635414919, + "grad_norm": 4.262119770050049, + "learning_rate": 1.7965736960741164e-06, + "loss": 1.8449, + "step": 27326 + }, + { + "epoch": 1.833998859098688, + "grad_norm": 6.580257892608643, + "learning_rate": 1.7936876292734883e-06, + "loss": 1.6695, + "step": 27328 + }, + { + "epoch": 1.834133082782457, + "grad_norm": 4.447585582733154, + "learning_rate": 1.790803840126326e-06, + "loss": 2.1706, + "step": 27330 + }, + { + "epoch": 1.8342673064662258, + "grad_norm": 3.7748825550079346, + "learning_rate": 1.7879223287688761e-06, + "loss": 1.9723, + "step": 27332 + }, + { + "epoch": 1.834401530149995, + "grad_norm": 3.7950668334960938, + "learning_rate": 1.7850430953372966e-06, + "loss": 1.7629, + "step": 27334 + }, + { + "epoch": 1.834535753833764, + "grad_norm": 3.9678001403808594, + "learning_rate": 1.7821661399676226e-06, + "loss": 1.8245, + "step": 27336 + }, + { + "epoch": 1.834669977517533, + "grad_norm": 4.306466579437256, + "learning_rate": 1.779291462795779e-06, + "loss": 1.7984, + "step": 27338 + }, + { + "epoch": 1.834804201201302, + "grad_norm": 4.047208786010742, + "learning_rate": 1.7764190639575907e-06, + "loss": 1.8116, + "step": 27340 + }, + { + "epoch": 1.8349384248850709, + "grad_norm": 5.905943393707275, + "learning_rate": 1.7735489435887764e-06, + "loss": 1.7489, + "step": 27342 + }, + { + "epoch": 1.83507264856884, + "grad_norm": 3.9530277252197266, + "learning_rate": 1.7706811018249447e-06, + "loss": 2.0589, + "step": 27344 + }, + { + "epoch": 1.835206872252609, + "grad_norm": 4.148167133331299, + "learning_rate": 1.7678155388015872e-06, + "loss": 1.9044, + "step": 27346 + }, + { + "epoch": 1.835341095936378, + "grad_norm": 4.267519474029541, + "learning_rate": 1.764952254654101e-06, + "loss": 1.9804, + "step": 27348 + }, + { + "epoch": 1.835475319620147, + "grad_norm": 4.450095176696777, + "learning_rate": 1.7620912495177721e-06, + "loss": 1.6593, + "step": 27350 + }, + { + "epoch": 1.835609543303916, + "grad_norm": 4.27948522567749, + "learning_rate": 1.7592325235277762e-06, + "loss": 1.7809, + "step": 27352 + }, + { + "epoch": 1.8357437669876848, + "grad_norm": 4.009988307952881, + "learning_rate": 1.7563760768191828e-06, + "loss": 2.0726, + "step": 27354 + }, + { + "epoch": 1.835877990671454, + "grad_norm": 3.8965046405792236, + "learning_rate": 1.7535219095269451e-06, + "loss": 1.7268, + "step": 27356 + }, + { + "epoch": 1.836012214355223, + "grad_norm": 4.388085842132568, + "learning_rate": 1.7506700217859329e-06, + "loss": 1.7218, + "step": 27358 + }, + { + "epoch": 1.836146438038992, + "grad_norm": 4.268976211547852, + "learning_rate": 1.7478204137308773e-06, + "loss": 2.0339, + "step": 27360 + }, + { + "epoch": 1.836280661722761, + "grad_norm": 4.856917381286621, + "learning_rate": 1.744973085496432e-06, + "loss": 1.9311, + "step": 27362 + }, + { + "epoch": 1.8364148854065299, + "grad_norm": 3.9702327251434326, + "learning_rate": 1.7421280372171167e-06, + "loss": 1.8992, + "step": 27364 + }, + { + "epoch": 1.836549109090299, + "grad_norm": 3.9008138179779053, + "learning_rate": 1.739285269027352e-06, + "loss": 1.8698, + "step": 27366 + }, + { + "epoch": 1.8366833327740681, + "grad_norm": 4.455026626586914, + "learning_rate": 1.736444781061458e-06, + "loss": 1.9441, + "step": 27368 + }, + { + "epoch": 1.836817556457837, + "grad_norm": 4.80894660949707, + "learning_rate": 1.7336065734536444e-06, + "loss": 1.7667, + "step": 27370 + }, + { + "epoch": 1.836951780141606, + "grad_norm": 4.436129570007324, + "learning_rate": 1.7307706463380092e-06, + "loss": 1.7999, + "step": 27372 + }, + { + "epoch": 1.837086003825375, + "grad_norm": 4.028081893920898, + "learning_rate": 1.727936999848545e-06, + "loss": 1.7357, + "step": 27374 + }, + { + "epoch": 1.837220227509144, + "grad_norm": 4.093870639801025, + "learning_rate": 1.7251056341191285e-06, + "loss": 1.9466, + "step": 27376 + }, + { + "epoch": 1.837354451192913, + "grad_norm": 4.6950812339782715, + "learning_rate": 1.7222765492835468e-06, + "loss": 2.0434, + "step": 27378 + }, + { + "epoch": 1.837488674876682, + "grad_norm": 4.213068962097168, + "learning_rate": 1.7194497454754599e-06, + "loss": 2.0106, + "step": 27380 + }, + { + "epoch": 1.837622898560451, + "grad_norm": 4.456737041473389, + "learning_rate": 1.716625222828444e-06, + "loss": 1.9435, + "step": 27382 + }, + { + "epoch": 1.83775712224422, + "grad_norm": 4.601782321929932, + "learning_rate": 1.7138029814759316e-06, + "loss": 1.8863, + "step": 27384 + }, + { + "epoch": 1.8378913459279889, + "grad_norm": 4.206491470336914, + "learning_rate": 1.7109830215512767e-06, + "loss": 1.827, + "step": 27386 + }, + { + "epoch": 1.838025569611758, + "grad_norm": 4.294702053070068, + "learning_rate": 1.7081653431877175e-06, + "loss": 1.9808, + "step": 27388 + }, + { + "epoch": 1.8381597932955271, + "grad_norm": 4.1304826736450195, + "learning_rate": 1.7053499465183864e-06, + "loss": 1.9543, + "step": 27390 + }, + { + "epoch": 1.838294016979296, + "grad_norm": 3.6232314109802246, + "learning_rate": 1.7025368316763047e-06, + "loss": 1.7589, + "step": 27392 + }, + { + "epoch": 1.838428240663065, + "grad_norm": 4.024653911590576, + "learning_rate": 1.699725998794388e-06, + "loss": 1.6912, + "step": 27394 + }, + { + "epoch": 1.838562464346834, + "grad_norm": 3.8322741985321045, + "learning_rate": 1.6969174480054305e-06, + "loss": 1.8936, + "step": 27396 + }, + { + "epoch": 1.838696688030603, + "grad_norm": 4.104907512664795, + "learning_rate": 1.6941111794421482e-06, + "loss": 1.5976, + "step": 27398 + }, + { + "epoch": 1.838830911714372, + "grad_norm": 4.345146179199219, + "learning_rate": 1.6913071932371184e-06, + "loss": 2.1383, + "step": 27400 + }, + { + "epoch": 1.838965135398141, + "grad_norm": 4.335310459136963, + "learning_rate": 1.688505489522829e-06, + "loss": 1.9736, + "step": 27402 + }, + { + "epoch": 1.83909935908191, + "grad_norm": 4.079138278961182, + "learning_rate": 1.6857060684316584e-06, + "loss": 1.7929, + "step": 27404 + }, + { + "epoch": 1.839233582765679, + "grad_norm": 4.28877067565918, + "learning_rate": 1.682908930095861e-06, + "loss": 1.9233, + "step": 27406 + }, + { + "epoch": 1.8393678064494479, + "grad_norm": 4.04513692855835, + "learning_rate": 1.6801140746476152e-06, + "loss": 1.8333, + "step": 27408 + }, + { + "epoch": 1.839502030133217, + "grad_norm": 4.694528102874756, + "learning_rate": 1.6773215022189592e-06, + "loss": 1.9049, + "step": 27410 + }, + { + "epoch": 1.8396362538169861, + "grad_norm": 4.258336067199707, + "learning_rate": 1.6745312129418434e-06, + "loss": 1.7741, + "step": 27412 + }, + { + "epoch": 1.839770477500755, + "grad_norm": 4.294193267822266, + "learning_rate": 1.6717432069480897e-06, + "loss": 1.8224, + "step": 27414 + }, + { + "epoch": 1.839904701184524, + "grad_norm": 3.8313558101654053, + "learning_rate": 1.6689574843694433e-06, + "loss": 1.8138, + "step": 27416 + }, + { + "epoch": 1.840038924868293, + "grad_norm": 4.449397087097168, + "learning_rate": 1.6661740453375096e-06, + "loss": 2.1089, + "step": 27418 + }, + { + "epoch": 1.840173148552062, + "grad_norm": 4.437559127807617, + "learning_rate": 1.6633928899838224e-06, + "loss": 2.0467, + "step": 27420 + }, + { + "epoch": 1.840307372235831, + "grad_norm": 4.136992454528809, + "learning_rate": 1.6606140184397546e-06, + "loss": 1.8495, + "step": 27422 + }, + { + "epoch": 1.8404415959196, + "grad_norm": 4.38981294631958, + "learning_rate": 1.6578374308366285e-06, + "loss": 1.8671, + "step": 27424 + }, + { + "epoch": 1.840575819603369, + "grad_norm": 3.4600632190704346, + "learning_rate": 1.6550631273056171e-06, + "loss": 1.7027, + "step": 27426 + }, + { + "epoch": 1.840710043287138, + "grad_norm": 4.712905406951904, + "learning_rate": 1.6522911079778158e-06, + "loss": 1.8496, + "step": 27428 + }, + { + "epoch": 1.8408442669709069, + "grad_norm": 3.7578537464141846, + "learning_rate": 1.6495213729841808e-06, + "loss": 1.7877, + "step": 27430 + }, + { + "epoch": 1.840978490654676, + "grad_norm": 3.526218891143799, + "learning_rate": 1.6467539224555906e-06, + "loss": 2.0105, + "step": 27432 + }, + { + "epoch": 1.8411127143384451, + "grad_norm": 4.094633102416992, + "learning_rate": 1.643988756522785e-06, + "loss": 1.6514, + "step": 27434 + }, + { + "epoch": 1.841246938022214, + "grad_norm": 4.331361770629883, + "learning_rate": 1.6412258753164322e-06, + "loss": 2.1434, + "step": 27436 + }, + { + "epoch": 1.841381161705983, + "grad_norm": 4.4700398445129395, + "learning_rate": 1.6384652789670608e-06, + "loss": 2.0178, + "step": 27438 + }, + { + "epoch": 1.841515385389752, + "grad_norm": 3.9303202629089355, + "learning_rate": 1.6357069676051051e-06, + "loss": 1.7933, + "step": 27440 + }, + { + "epoch": 1.841649609073521, + "grad_norm": 4.478289604187012, + "learning_rate": 1.6329509413608834e-06, + "loss": 1.8226, + "step": 27442 + }, + { + "epoch": 1.8417838327572902, + "grad_norm": 4.493978500366211, + "learning_rate": 1.6301972003646304e-06, + "loss": 1.8847, + "step": 27444 + }, + { + "epoch": 1.841918056441059, + "grad_norm": 3.8663711547851562, + "learning_rate": 1.6274457447464363e-06, + "loss": 2.1382, + "step": 27446 + }, + { + "epoch": 1.842052280124828, + "grad_norm": 3.8044934272766113, + "learning_rate": 1.6246965746363197e-06, + "loss": 1.8387, + "step": 27448 + }, + { + "epoch": 1.842186503808597, + "grad_norm": 3.6369504928588867, + "learning_rate": 1.6219496901641596e-06, + "loss": 1.7314, + "step": 27450 + }, + { + "epoch": 1.8423207274923659, + "grad_norm": 4.292850017547607, + "learning_rate": 1.6192050914597467e-06, + "loss": 1.8093, + "step": 27452 + }, + { + "epoch": 1.842454951176135, + "grad_norm": 3.9325952529907227, + "learning_rate": 1.6164627786527498e-06, + "loss": 2.091, + "step": 27454 + }, + { + "epoch": 1.8425891748599041, + "grad_norm": 5.199668884277344, + "learning_rate": 1.6137227518727482e-06, + "loss": 1.8157, + "step": 27456 + }, + { + "epoch": 1.842723398543673, + "grad_norm": 3.891570806503296, + "learning_rate": 1.6109850112492054e-06, + "loss": 2.087, + "step": 27458 + }, + { + "epoch": 1.842857622227442, + "grad_norm": 4.093017101287842, + "learning_rate": 1.608249556911462e-06, + "loss": 1.739, + "step": 27460 + }, + { + "epoch": 1.842991845911211, + "grad_norm": 3.7970826625823975, + "learning_rate": 1.6055163889887647e-06, + "loss": 2.0442, + "step": 27462 + }, + { + "epoch": 1.84312606959498, + "grad_norm": 4.435250282287598, + "learning_rate": 1.6027855076102605e-06, + "loss": 2.0342, + "step": 27464 + }, + { + "epoch": 1.8432602932787492, + "grad_norm": 3.5015270709991455, + "learning_rate": 1.6000569129049735e-06, + "loss": 1.731, + "step": 27466 + }, + { + "epoch": 1.843394516962518, + "grad_norm": 4.234226703643799, + "learning_rate": 1.5973306050018178e-06, + "loss": 1.981, + "step": 27468 + }, + { + "epoch": 1.843528740646287, + "grad_norm": 3.773401975631714, + "learning_rate": 1.5946065840296177e-06, + "loss": 1.7919, + "step": 27470 + }, + { + "epoch": 1.843662964330056, + "grad_norm": 4.074881553649902, + "learning_rate": 1.5918848501170647e-06, + "loss": 2.0005, + "step": 27472 + }, + { + "epoch": 1.843797188013825, + "grad_norm": 3.829176425933838, + "learning_rate": 1.5891654033927617e-06, + "loss": 1.6366, + "step": 27474 + }, + { + "epoch": 1.843931411697594, + "grad_norm": 4.096720218658447, + "learning_rate": 1.5864482439852058e-06, + "loss": 1.7845, + "step": 27476 + }, + { + "epoch": 1.8440656353813631, + "grad_norm": 4.116787910461426, + "learning_rate": 1.583733372022761e-06, + "loss": 1.6552, + "step": 27478 + }, + { + "epoch": 1.844199859065132, + "grad_norm": 4.172433853149414, + "learning_rate": 1.581020787633708e-06, + "loss": 1.8891, + "step": 27480 + }, + { + "epoch": 1.844334082748901, + "grad_norm": 4.098811626434326, + "learning_rate": 1.5783104909462166e-06, + "loss": 1.5905, + "step": 27482 + }, + { + "epoch": 1.84446830643267, + "grad_norm": 4.234775543212891, + "learning_rate": 1.5756024820883287e-06, + "loss": 1.9972, + "step": 27484 + }, + { + "epoch": 1.844602530116439, + "grad_norm": 4.43559455871582, + "learning_rate": 1.5728967611880085e-06, + "loss": 1.9255, + "step": 27486 + }, + { + "epoch": 1.8447367538002082, + "grad_norm": 3.6064462661743164, + "learning_rate": 1.5701933283730875e-06, + "loss": 1.8061, + "step": 27488 + }, + { + "epoch": 1.844870977483977, + "grad_norm": 3.769141674041748, + "learning_rate": 1.567492183771302e-06, + "loss": 2.169, + "step": 27490 + }, + { + "epoch": 1.845005201167746, + "grad_norm": 4.2511210441589355, + "learning_rate": 1.564793327510261e-06, + "loss": 1.8662, + "step": 27492 + }, + { + "epoch": 1.845139424851515, + "grad_norm": 3.5335021018981934, + "learning_rate": 1.5620967597175017e-06, + "loss": 1.7939, + "step": 27494 + }, + { + "epoch": 1.845273648535284, + "grad_norm": 4.514803409576416, + "learning_rate": 1.559402480520422e-06, + "loss": 2.0718, + "step": 27496 + }, + { + "epoch": 1.845407872219053, + "grad_norm": 4.428549289703369, + "learning_rate": 1.5567104900463147e-06, + "loss": 1.9833, + "step": 27498 + }, + { + "epoch": 1.8455420959028221, + "grad_norm": 3.9345014095306396, + "learning_rate": 1.554020788422378e-06, + "loss": 1.9581, + "step": 27500 + }, + { + "epoch": 1.845676319586591, + "grad_norm": 4.76250696182251, + "learning_rate": 1.5513333757756987e-06, + "loss": 1.8626, + "step": 27502 + }, + { + "epoch": 1.84581054327036, + "grad_norm": 4.174131393432617, + "learning_rate": 1.5486482522332424e-06, + "loss": 1.8573, + "step": 27504 + }, + { + "epoch": 1.845944766954129, + "grad_norm": 4.184377193450928, + "learning_rate": 1.5459654179218852e-06, + "loss": 1.6466, + "step": 27506 + }, + { + "epoch": 1.846078990637898, + "grad_norm": 3.945906639099121, + "learning_rate": 1.5432848729683758e-06, + "loss": 1.8415, + "step": 27508 + }, + { + "epoch": 1.8462132143216672, + "grad_norm": 3.5656538009643555, + "learning_rate": 1.5406066174993739e-06, + "loss": 1.8863, + "step": 27510 + }, + { + "epoch": 1.846347438005436, + "grad_norm": 5.098588943481445, + "learning_rate": 1.5379306516414115e-06, + "loss": 2.2101, + "step": 27512 + }, + { + "epoch": 1.846481661689205, + "grad_norm": 4.141976833343506, + "learning_rate": 1.5352569755209378e-06, + "loss": 1.8733, + "step": 27514 + }, + { + "epoch": 1.846615885372974, + "grad_norm": 3.5033984184265137, + "learning_rate": 1.5325855892642682e-06, + "loss": 1.6932, + "step": 27516 + }, + { + "epoch": 1.846750109056743, + "grad_norm": 4.348273277282715, + "learning_rate": 1.529916492997624e-06, + "loss": 2.1027, + "step": 27518 + }, + { + "epoch": 1.8468843327405122, + "grad_norm": 4.240142822265625, + "learning_rate": 1.5272496868471043e-06, + "loss": 2.089, + "step": 27520 + }, + { + "epoch": 1.8470185564242811, + "grad_norm": 3.9131417274475098, + "learning_rate": 1.5245851709387249e-06, + "loss": 1.8026, + "step": 27522 + }, + { + "epoch": 1.84715278010805, + "grad_norm": 5.791218280792236, + "learning_rate": 1.5219229453983796e-06, + "loss": 1.8799, + "step": 27524 + }, + { + "epoch": 1.847287003791819, + "grad_norm": 4.244935512542725, + "learning_rate": 1.5192630103518402e-06, + "loss": 1.926, + "step": 27526 + }, + { + "epoch": 1.847421227475588, + "grad_norm": 4.052842617034912, + "learning_rate": 1.5166053659247892e-06, + "loss": 1.9039, + "step": 27528 + }, + { + "epoch": 1.847555451159357, + "grad_norm": 4.118327617645264, + "learning_rate": 1.5139500122427986e-06, + "loss": 1.7537, + "step": 27530 + }, + { + "epoch": 1.8476896748431262, + "grad_norm": 3.993527889251709, + "learning_rate": 1.511296949431329e-06, + "loss": 1.6769, + "step": 27532 + }, + { + "epoch": 1.847823898526895, + "grad_norm": 4.265981674194336, + "learning_rate": 1.5086461776157245e-06, + "loss": 2.0482, + "step": 27534 + }, + { + "epoch": 1.847958122210664, + "grad_norm": 3.4064080715179443, + "learning_rate": 1.5059976969212408e-06, + "loss": 1.9034, + "step": 27536 + }, + { + "epoch": 1.848092345894433, + "grad_norm": 3.7990965843200684, + "learning_rate": 1.5033515074729942e-06, + "loss": 1.6932, + "step": 27538 + }, + { + "epoch": 1.848226569578202, + "grad_norm": 4.499819755554199, + "learning_rate": 1.5007076093960348e-06, + "loss": 1.7583, + "step": 27540 + }, + { + "epoch": 1.8483607932619712, + "grad_norm": 4.116093635559082, + "learning_rate": 1.4980660028152627e-06, + "loss": 1.9104, + "step": 27542 + }, + { + "epoch": 1.8484950169457401, + "grad_norm": 3.601280689239502, + "learning_rate": 1.4954266878555112e-06, + "loss": 1.714, + "step": 27544 + }, + { + "epoch": 1.848629240629509, + "grad_norm": 5.128321647644043, + "learning_rate": 1.4927896646414529e-06, + "loss": 1.8303, + "step": 27546 + }, + { + "epoch": 1.848763464313278, + "grad_norm": 3.8753821849823, + "learning_rate": 1.4901549332977105e-06, + "loss": 1.7274, + "step": 27548 + }, + { + "epoch": 1.8488976879970471, + "grad_norm": 3.9652884006500244, + "learning_rate": 1.4875224939487454e-06, + "loss": 1.5359, + "step": 27550 + }, + { + "epoch": 1.849031911680816, + "grad_norm": 4.219564437866211, + "learning_rate": 1.4848923467189524e-06, + "loss": 1.8215, + "step": 27552 + }, + { + "epoch": 1.8491661353645852, + "grad_norm": 4.467137813568115, + "learning_rate": 1.482264491732599e-06, + "loss": 1.9374, + "step": 27554 + }, + { + "epoch": 1.849300359048354, + "grad_norm": 4.378707408905029, + "learning_rate": 1.4796389291138358e-06, + "loss": 1.9256, + "step": 27556 + }, + { + "epoch": 1.849434582732123, + "grad_norm": 3.7498536109924316, + "learning_rate": 1.4770156589867246e-06, + "loss": 1.7578, + "step": 27558 + }, + { + "epoch": 1.849568806415892, + "grad_norm": 4.45932674407959, + "learning_rate": 1.4743946814752053e-06, + "loss": 2.1161, + "step": 27560 + }, + { + "epoch": 1.849703030099661, + "grad_norm": 3.3635404109954834, + "learning_rate": 1.471775996703123e-06, + "loss": 1.7837, + "step": 27562 + }, + { + "epoch": 1.8498372537834302, + "grad_norm": 4.112618923187256, + "learning_rate": 1.4691596047941902e-06, + "loss": 1.7772, + "step": 27564 + }, + { + "epoch": 1.8499714774671991, + "grad_norm": 4.558855056762695, + "learning_rate": 1.4665455058720357e-06, + "loss": 1.9363, + "step": 27566 + }, + { + "epoch": 1.850105701150968, + "grad_norm": 3.8046252727508545, + "learning_rate": 1.4639337000601717e-06, + "loss": 1.7546, + "step": 27568 + }, + { + "epoch": 1.850239924834737, + "grad_norm": 4.336655616760254, + "learning_rate": 1.4613241874819938e-06, + "loss": 1.9757, + "step": 27570 + }, + { + "epoch": 1.8503741485185061, + "grad_norm": 4.133337020874023, + "learning_rate": 1.4587169682608093e-06, + "loss": 2.1923, + "step": 27572 + }, + { + "epoch": 1.850508372202275, + "grad_norm": 4.013980865478516, + "learning_rate": 1.456112042519786e-06, + "loss": 1.6767, + "step": 27574 + }, + { + "epoch": 1.8506425958860442, + "grad_norm": 4.422879695892334, + "learning_rate": 1.45350941038202e-06, + "loss": 1.9596, + "step": 27576 + }, + { + "epoch": 1.850776819569813, + "grad_norm": 4.128218650817871, + "learning_rate": 1.4509090719704631e-06, + "loss": 1.8312, + "step": 27578 + }, + { + "epoch": 1.850911043253582, + "grad_norm": 3.530390739440918, + "learning_rate": 1.4483110274079891e-06, + "loss": 1.6614, + "step": 27580 + }, + { + "epoch": 1.851045266937351, + "grad_norm": 4.298157215118408, + "learning_rate": 1.4457152768173444e-06, + "loss": 1.9237, + "step": 27582 + }, + { + "epoch": 1.85117949062112, + "grad_norm": 4.272356033325195, + "learning_rate": 1.4431218203211804e-06, + "loss": 1.6281, + "step": 27584 + }, + { + "epoch": 1.8513137143048892, + "grad_norm": 4.195593357086182, + "learning_rate": 1.440530658042022e-06, + "loss": 1.7023, + "step": 27586 + }, + { + "epoch": 1.8514479379886581, + "grad_norm": 4.383427143096924, + "learning_rate": 1.437941790102304e-06, + "loss": 1.96, + "step": 27588 + }, + { + "epoch": 1.851582161672427, + "grad_norm": 4.1806640625, + "learning_rate": 1.4353552166243457e-06, + "loss": 1.8464, + "step": 27590 + }, + { + "epoch": 1.851716385356196, + "grad_norm": 4.014082431793213, + "learning_rate": 1.4327709377303544e-06, + "loss": 1.682, + "step": 27592 + }, + { + "epoch": 1.8518506090399651, + "grad_norm": 4.02851676940918, + "learning_rate": 1.4301889535424218e-06, + "loss": 1.8756, + "step": 27594 + }, + { + "epoch": 1.8519848327237343, + "grad_norm": 3.945621967315674, + "learning_rate": 1.4276092641825668e-06, + "loss": 1.6439, + "step": 27596 + }, + { + "epoch": 1.8521190564075032, + "grad_norm": 4.312952518463135, + "learning_rate": 1.4250318697726529e-06, + "loss": 1.8876, + "step": 27598 + }, + { + "epoch": 1.852253280091272, + "grad_norm": 4.257695198059082, + "learning_rate": 1.4224567704344716e-06, + "loss": 1.8404, + "step": 27600 + }, + { + "epoch": 1.852387503775041, + "grad_norm": 4.390069961547852, + "learning_rate": 1.4198839662896758e-06, + "loss": 2.1937, + "step": 27602 + }, + { + "epoch": 1.85252172745881, + "grad_norm": 4.488198757171631, + "learning_rate": 1.4173134574598402e-06, + "loss": 1.7865, + "step": 27604 + }, + { + "epoch": 1.852655951142579, + "grad_norm": 4.135413646697998, + "learning_rate": 1.4147452440664065e-06, + "loss": 1.7837, + "step": 27606 + }, + { + "epoch": 1.8527901748263482, + "grad_norm": 4.054834842681885, + "learning_rate": 1.4121793262307226e-06, + "loss": 2.0051, + "step": 27608 + }, + { + "epoch": 1.8529243985101171, + "grad_norm": 4.189916133880615, + "learning_rate": 1.4096157040740244e-06, + "loss": 1.7778, + "step": 27610 + }, + { + "epoch": 1.853058622193886, + "grad_norm": 4.48264217376709, + "learning_rate": 1.4070543777174317e-06, + "loss": 1.8048, + "step": 27612 + }, + { + "epoch": 1.853192845877655, + "grad_norm": 4.124946594238281, + "learning_rate": 1.4044953472819644e-06, + "loss": 1.9472, + "step": 27614 + }, + { + "epoch": 1.8533270695614241, + "grad_norm": 4.129456520080566, + "learning_rate": 1.4019386128885314e-06, + "loss": 1.9223, + "step": 27616 + }, + { + "epoch": 1.8534612932451933, + "grad_norm": 4.144992828369141, + "learning_rate": 1.3993841746579416e-06, + "loss": 1.9702, + "step": 27618 + }, + { + "epoch": 1.8535955169289622, + "grad_norm": 4.022944450378418, + "learning_rate": 1.396832032710882e-06, + "loss": 1.6903, + "step": 27620 + }, + { + "epoch": 1.853729740612731, + "grad_norm": 3.675442934036255, + "learning_rate": 1.3942821871679279e-06, + "loss": 2.1045, + "step": 27622 + }, + { + "epoch": 1.8538639642965, + "grad_norm": 4.7017822265625, + "learning_rate": 1.391734638149561e-06, + "loss": 1.87, + "step": 27624 + }, + { + "epoch": 1.8539981879802692, + "grad_norm": 4.590610027313232, + "learning_rate": 1.3891893857761519e-06, + "loss": 1.7537, + "step": 27626 + }, + { + "epoch": 1.854132411664038, + "grad_norm": 4.4616899490356445, + "learning_rate": 1.3866464301679593e-06, + "loss": 2.1624, + "step": 27628 + }, + { + "epoch": 1.8542666353478072, + "grad_norm": 4.381720542907715, + "learning_rate": 1.3841057714451212e-06, + "loss": 2.1005, + "step": 27630 + }, + { + "epoch": 1.8544008590315761, + "grad_norm": 4.099536895751953, + "learning_rate": 1.3815674097276854e-06, + "loss": 1.9712, + "step": 27632 + }, + { + "epoch": 1.854535082715345, + "grad_norm": 3.8337314128875732, + "learning_rate": 1.379031345135595e-06, + "loss": 2.1038, + "step": 27634 + }, + { + "epoch": 1.854669306399114, + "grad_norm": 4.219716548919678, + "learning_rate": 1.3764975777886547e-06, + "loss": 1.7527, + "step": 27636 + }, + { + "epoch": 1.8548035300828831, + "grad_norm": 2.883056402206421, + "learning_rate": 1.3739661078065957e-06, + "loss": 1.6491, + "step": 27638 + }, + { + "epoch": 1.8549377537666523, + "grad_norm": 4.1758809089660645, + "learning_rate": 1.3714369353090173e-06, + "loss": 1.6274, + "step": 27640 + }, + { + "epoch": 1.8550719774504212, + "grad_norm": 4.167160511016846, + "learning_rate": 1.3689100604154182e-06, + "loss": 1.5846, + "step": 27642 + }, + { + "epoch": 1.85520620113419, + "grad_norm": 4.296935081481934, + "learning_rate": 1.3663854832451916e-06, + "loss": 1.7923, + "step": 27644 + }, + { + "epoch": 1.855340424817959, + "grad_norm": 4.036269664764404, + "learning_rate": 1.3638632039176147e-06, + "loss": 2.0763, + "step": 27646 + }, + { + "epoch": 1.8554746485017282, + "grad_norm": 4.031567573547363, + "learning_rate": 1.3613432225518696e-06, + "loss": 1.9058, + "step": 27648 + }, + { + "epoch": 1.855608872185497, + "grad_norm": 3.9592390060424805, + "learning_rate": 1.3588255392670058e-06, + "loss": 2.1544, + "step": 27650 + }, + { + "epoch": 1.8557430958692662, + "grad_norm": 4.310605525970459, + "learning_rate": 1.3563101541819889e-06, + "loss": 1.9867, + "step": 27652 + }, + { + "epoch": 1.8558773195530351, + "grad_norm": 6.853755950927734, + "learning_rate": 1.3537970674156631e-06, + "loss": 2.0489, + "step": 27654 + }, + { + "epoch": 1.856011543236804, + "grad_norm": 4.171067237854004, + "learning_rate": 1.351286279086772e-06, + "loss": 2.0302, + "step": 27656 + }, + { + "epoch": 1.856145766920573, + "grad_norm": 4.288027763366699, + "learning_rate": 1.3487777893139374e-06, + "loss": 2.1348, + "step": 27658 + }, + { + "epoch": 1.8562799906043421, + "grad_norm": 3.928776502609253, + "learning_rate": 1.346271598215676e-06, + "loss": 2.0656, + "step": 27660 + }, + { + "epoch": 1.8564142142881113, + "grad_norm": 3.70324969291687, + "learning_rate": 1.3437677059104147e-06, + "loss": 1.7708, + "step": 27662 + }, + { + "epoch": 1.8565484379718802, + "grad_norm": 4.375190734863281, + "learning_rate": 1.3412661125164483e-06, + "loss": 1.9994, + "step": 27664 + }, + { + "epoch": 1.856682661655649, + "grad_norm": 4.15472412109375, + "learning_rate": 1.338766818151982e-06, + "loss": 1.6721, + "step": 27666 + }, + { + "epoch": 1.856816885339418, + "grad_norm": 4.362308979034424, + "learning_rate": 1.3362698229350935e-06, + "loss": 1.6874, + "step": 27668 + }, + { + "epoch": 1.8569511090231872, + "grad_norm": 4.39711332321167, + "learning_rate": 1.3337751269837606e-06, + "loss": 2.0393, + "step": 27670 + }, + { + "epoch": 1.8570853327069563, + "grad_norm": 4.50571346282959, + "learning_rate": 1.331282730415856e-06, + "loss": 1.8853, + "step": 27672 + }, + { + "epoch": 1.8572195563907252, + "grad_norm": 3.903775691986084, + "learning_rate": 1.3287926333491353e-06, + "loss": 1.6979, + "step": 27674 + }, + { + "epoch": 1.8573537800744941, + "grad_norm": 3.8814644813537598, + "learning_rate": 1.3263048359012543e-06, + "loss": 2.0149, + "step": 27676 + }, + { + "epoch": 1.857488003758263, + "grad_norm": 4.22409725189209, + "learning_rate": 1.3238193381897635e-06, + "loss": 1.8169, + "step": 27678 + }, + { + "epoch": 1.857622227442032, + "grad_norm": 3.726166009902954, + "learning_rate": 1.321336140332091e-06, + "loss": 1.8468, + "step": 27680 + }, + { + "epoch": 1.8577564511258011, + "grad_norm": 4.903127670288086, + "learning_rate": 1.3188552424455546e-06, + "loss": 2.1418, + "step": 27682 + }, + { + "epoch": 1.8578906748095703, + "grad_norm": 3.2475099563598633, + "learning_rate": 1.316376644647388e-06, + "loss": 1.6438, + "step": 27684 + }, + { + "epoch": 1.8580248984933392, + "grad_norm": 3.712686061859131, + "learning_rate": 1.3139003470546918e-06, + "loss": 1.5915, + "step": 27686 + }, + { + "epoch": 1.858159122177108, + "grad_norm": 4.675756454467773, + "learning_rate": 1.3114263497844669e-06, + "loss": 1.6029, + "step": 27688 + }, + { + "epoch": 1.858293345860877, + "grad_norm": 3.9656388759613037, + "learning_rate": 1.3089546529536034e-06, + "loss": 1.9036, + "step": 27690 + }, + { + "epoch": 1.8584275695446462, + "grad_norm": 4.047912120819092, + "learning_rate": 1.306485256678891e-06, + "loss": 1.9756, + "step": 27692 + }, + { + "epoch": 1.8585617932284153, + "grad_norm": 3.896547317504883, + "learning_rate": 1.3040181610769865e-06, + "loss": 1.6754, + "step": 27694 + }, + { + "epoch": 1.8586960169121842, + "grad_norm": 4.03818416595459, + "learning_rate": 1.3015533662644852e-06, + "loss": 1.6634, + "step": 27696 + }, + { + "epoch": 1.8588302405959531, + "grad_norm": 4.177455902099609, + "learning_rate": 1.299090872357811e-06, + "loss": 1.8333, + "step": 27698 + }, + { + "epoch": 1.858964464279722, + "grad_norm": 3.773087501525879, + "learning_rate": 1.2966306794733318e-06, + "loss": 1.9619, + "step": 27700 + }, + { + "epoch": 1.8590986879634912, + "grad_norm": 5.736821174621582, + "learning_rate": 1.2941727877272825e-06, + "loss": 1.6234, + "step": 27702 + }, + { + "epoch": 1.8592329116472601, + "grad_norm": 4.287180423736572, + "learning_rate": 1.2917171972357922e-06, + "loss": 1.995, + "step": 27704 + }, + { + "epoch": 1.8593671353310293, + "grad_norm": 4.636211395263672, + "learning_rate": 1.289263908114885e-06, + "loss": 1.8819, + "step": 27706 + }, + { + "epoch": 1.8595013590147982, + "grad_norm": 4.145171165466309, + "learning_rate": 1.2868129204804735e-06, + "loss": 1.9891, + "step": 27708 + }, + { + "epoch": 1.859635582698567, + "grad_norm": 4.249598979949951, + "learning_rate": 1.2843642344483542e-06, + "loss": 1.8121, + "step": 27710 + }, + { + "epoch": 1.859769806382336, + "grad_norm": 4.635797500610352, + "learning_rate": 1.2819178501342343e-06, + "loss": 1.7751, + "step": 27712 + }, + { + "epoch": 1.8599040300661052, + "grad_norm": 4.092945098876953, + "learning_rate": 1.2794737676536994e-06, + "loss": 2.0956, + "step": 27714 + }, + { + "epoch": 1.8600382537498743, + "grad_norm": 3.6464428901672363, + "learning_rate": 1.2770319871222236e-06, + "loss": 1.8995, + "step": 27716 + }, + { + "epoch": 1.8601724774336432, + "grad_norm": 3.9578564167022705, + "learning_rate": 1.2745925086551702e-06, + "loss": 1.7858, + "step": 27718 + }, + { + "epoch": 1.8603067011174121, + "grad_norm": 3.95796799659729, + "learning_rate": 1.2721553323678137e-06, + "loss": 1.8258, + "step": 27720 + }, + { + "epoch": 1.860440924801181, + "grad_norm": 3.7315573692321777, + "learning_rate": 1.2697204583752898e-06, + "loss": 1.9644, + "step": 27722 + }, + { + "epoch": 1.8605751484849502, + "grad_norm": 4.4037861824035645, + "learning_rate": 1.267287886792662e-06, + "loss": 1.9214, + "step": 27724 + }, + { + "epoch": 1.8607093721687191, + "grad_norm": 4.130522727966309, + "learning_rate": 1.2648576177348437e-06, + "loss": 1.9796, + "step": 27726 + }, + { + "epoch": 1.8608435958524883, + "grad_norm": 4.571295738220215, + "learning_rate": 1.2624296513166712e-06, + "loss": 1.8524, + "step": 27728 + }, + { + "epoch": 1.8609778195362572, + "grad_norm": 4.41890811920166, + "learning_rate": 1.260003987652858e-06, + "loss": 1.7508, + "step": 27730 + }, + { + "epoch": 1.861112043220026, + "grad_norm": 4.622462272644043, + "learning_rate": 1.2575806268580182e-06, + "loss": 1.8646, + "step": 27732 + }, + { + "epoch": 1.861246266903795, + "grad_norm": 3.9995787143707275, + "learning_rate": 1.2551595690466434e-06, + "loss": 1.7646, + "step": 27734 + }, + { + "epoch": 1.8613804905875642, + "grad_norm": 4.079648971557617, + "learning_rate": 1.252740814333131e-06, + "loss": 1.8847, + "step": 27736 + }, + { + "epoch": 1.8615147142713333, + "grad_norm": 4.754183769226074, + "learning_rate": 1.250324362831745e-06, + "loss": 2.1533, + "step": 27738 + }, + { + "epoch": 1.8616489379551022, + "grad_norm": 4.4315876960754395, + "learning_rate": 1.2479102146566834e-06, + "loss": 1.8802, + "step": 27740 + }, + { + "epoch": 1.8617831616388711, + "grad_norm": 4.447164535522461, + "learning_rate": 1.2454983699219936e-06, + "loss": 1.9664, + "step": 27742 + }, + { + "epoch": 1.86191738532264, + "grad_norm": 3.94346022605896, + "learning_rate": 1.2430888287416342e-06, + "loss": 1.8438, + "step": 27744 + }, + { + "epoch": 1.8620516090064092, + "grad_norm": 3.957122564315796, + "learning_rate": 1.2406815912294535e-06, + "loss": 1.6126, + "step": 27746 + }, + { + "epoch": 1.8621858326901783, + "grad_norm": 4.3485212326049805, + "learning_rate": 1.2382766574991766e-06, + "loss": 1.7798, + "step": 27748 + }, + { + "epoch": 1.8623200563739473, + "grad_norm": 4.11932373046875, + "learning_rate": 1.235874027664452e-06, + "loss": 1.9448, + "step": 27750 + }, + { + "epoch": 1.8624542800577162, + "grad_norm": 3.7460272312164307, + "learning_rate": 1.2334737018387887e-06, + "loss": 1.8669, + "step": 27752 + }, + { + "epoch": 1.862588503741485, + "grad_norm": 4.115728378295898, + "learning_rate": 1.231075680135596e-06, + "loss": 1.8309, + "step": 27754 + }, + { + "epoch": 1.862722727425254, + "grad_norm": 3.7168514728546143, + "learning_rate": 1.2286799626681721e-06, + "loss": 1.8513, + "step": 27756 + }, + { + "epoch": 1.8628569511090232, + "grad_norm": 4.237320899963379, + "learning_rate": 1.226286549549721e-06, + "loss": 1.8264, + "step": 27758 + }, + { + "epoch": 1.8629911747927923, + "grad_norm": 3.7806098461151123, + "learning_rate": 1.2238954408933134e-06, + "loss": 1.6112, + "step": 27760 + }, + { + "epoch": 1.8631253984765612, + "grad_norm": 4.008288383483887, + "learning_rate": 1.2215066368119476e-06, + "loss": 1.8187, + "step": 27762 + }, + { + "epoch": 1.8632596221603301, + "grad_norm": 3.887197971343994, + "learning_rate": 1.2191201374184614e-06, + "loss": 1.7177, + "step": 27764 + }, + { + "epoch": 1.863393845844099, + "grad_norm": 4.415825366973877, + "learning_rate": 1.2167359428256253e-06, + "loss": 1.8589, + "step": 27766 + }, + { + "epoch": 1.8635280695278682, + "grad_norm": 4.0977702140808105, + "learning_rate": 1.2143540531460885e-06, + "loss": 1.9449, + "step": 27768 + }, + { + "epoch": 1.8636622932116373, + "grad_norm": 4.215953350067139, + "learning_rate": 1.211974468492394e-06, + "loss": 1.6587, + "step": 27770 + }, + { + "epoch": 1.8637965168954063, + "grad_norm": 4.413201332092285, + "learning_rate": 1.2095971889769686e-06, + "loss": 1.8574, + "step": 27772 + }, + { + "epoch": 1.8639307405791752, + "grad_norm": 4.2217183113098145, + "learning_rate": 1.2072222147121338e-06, + "loss": 1.9522, + "step": 27774 + }, + { + "epoch": 1.864064964262944, + "grad_norm": 4.214277267456055, + "learning_rate": 1.2048495458100995e-06, + "loss": 1.854, + "step": 27776 + }, + { + "epoch": 1.8641991879467132, + "grad_norm": 3.856031656265259, + "learning_rate": 1.2024791823829762e-06, + "loss": 1.5271, + "step": 27778 + }, + { + "epoch": 1.8643334116304822, + "grad_norm": 4.224305629730225, + "learning_rate": 1.200111124542752e-06, + "loss": 2.0673, + "step": 27780 + }, + { + "epoch": 1.8644676353142513, + "grad_norm": 4.135661602020264, + "learning_rate": 1.1977453724013154e-06, + "loss": 1.9664, + "step": 27782 + }, + { + "epoch": 1.8646018589980202, + "grad_norm": 4.286875247955322, + "learning_rate": 1.1953819260704436e-06, + "loss": 2.1119, + "step": 27784 + }, + { + "epoch": 1.8647360826817891, + "grad_norm": 4.008508205413818, + "learning_rate": 1.1930207856618137e-06, + "loss": 1.8071, + "step": 27786 + }, + { + "epoch": 1.864870306365558, + "grad_norm": 4.472963809967041, + "learning_rate": 1.1906619512869644e-06, + "loss": 1.9139, + "step": 27788 + }, + { + "epoch": 1.8650045300493272, + "grad_norm": 4.176332950592041, + "learning_rate": 1.1883054230573731e-06, + "loss": 1.8594, + "step": 27790 + }, + { + "epoch": 1.8651387537330963, + "grad_norm": 3.729100227355957, + "learning_rate": 1.1859512010843565e-06, + "loss": 1.7507, + "step": 27792 + }, + { + "epoch": 1.8652729774168653, + "grad_norm": 4.716344356536865, + "learning_rate": 1.183599285479159e-06, + "loss": 1.9074, + "step": 27794 + }, + { + "epoch": 1.8654072011006342, + "grad_norm": 4.577476501464844, + "learning_rate": 1.1812496763528968e-06, + "loss": 2.2655, + "step": 27796 + }, + { + "epoch": 1.865541424784403, + "grad_norm": 4.10700798034668, + "learning_rate": 1.1789023738165983e-06, + "loss": 1.9714, + "step": 27798 + }, + { + "epoch": 1.8656756484681722, + "grad_norm": 3.3562655448913574, + "learning_rate": 1.1765573779811578e-06, + "loss": 1.6172, + "step": 27800 + }, + { + "epoch": 1.8658098721519412, + "grad_norm": 4.302330493927002, + "learning_rate": 1.1742146889573758e-06, + "loss": 1.8051, + "step": 27802 + }, + { + "epoch": 1.8659440958357103, + "grad_norm": 4.232178688049316, + "learning_rate": 1.17187430685593e-06, + "loss": 1.9908, + "step": 27804 + }, + { + "epoch": 1.8660783195194792, + "grad_norm": 4.419615268707275, + "learning_rate": 1.1695362317874158e-06, + "loss": 2.106, + "step": 27806 + }, + { + "epoch": 1.8662125432032481, + "grad_norm": 4.052347183227539, + "learning_rate": 1.1672004638622892e-06, + "loss": 1.8808, + "step": 27808 + }, + { + "epoch": 1.866346766887017, + "grad_norm": 4.272402763366699, + "learning_rate": 1.164867003190917e-06, + "loss": 1.9568, + "step": 27810 + }, + { + "epoch": 1.8664809905707862, + "grad_norm": 4.810849189758301, + "learning_rate": 1.1625358498835505e-06, + "loss": 2.0646, + "step": 27812 + }, + { + "epoch": 1.8666152142545553, + "grad_norm": 4.060275077819824, + "learning_rate": 1.1602070040503232e-06, + "loss": 1.6865, + "step": 27814 + }, + { + "epoch": 1.8667494379383243, + "grad_norm": 3.6719446182250977, + "learning_rate": 1.157880465801281e-06, + "loss": 1.9751, + "step": 27816 + }, + { + "epoch": 1.8668836616220932, + "grad_norm": 3.64314341545105, + "learning_rate": 1.155556235246341e-06, + "loss": 1.5666, + "step": 27818 + }, + { + "epoch": 1.867017885305862, + "grad_norm": 4.309627532958984, + "learning_rate": 1.1532343124953216e-06, + "loss": 1.8999, + "step": 27820 + }, + { + "epoch": 1.8671521089896312, + "grad_norm": 4.095072269439697, + "learning_rate": 1.1509146976579232e-06, + "loss": 1.6815, + "step": 27822 + }, + { + "epoch": 1.8672863326734004, + "grad_norm": 4.09210205078125, + "learning_rate": 1.148597390843753e-06, + "loss": 1.6387, + "step": 27824 + }, + { + "epoch": 1.8674205563571693, + "grad_norm": 5.0481390953063965, + "learning_rate": 1.14628239216229e-06, + "loss": 2.0941, + "step": 27826 + }, + { + "epoch": 1.8675547800409382, + "grad_norm": 4.085901260375977, + "learning_rate": 1.143969701722919e-06, + "loss": 1.7607, + "step": 27828 + }, + { + "epoch": 1.8676890037247071, + "grad_norm": 4.141376495361328, + "learning_rate": 1.1416593196349134e-06, + "loss": 1.7647, + "step": 27830 + }, + { + "epoch": 1.867823227408476, + "grad_norm": 4.492360591888428, + "learning_rate": 1.139351246007425e-06, + "loss": 1.7491, + "step": 27832 + }, + { + "epoch": 1.8679574510922452, + "grad_norm": 3.909946918487549, + "learning_rate": 1.1370454809495056e-06, + "loss": 1.6185, + "step": 27834 + }, + { + "epoch": 1.8680916747760143, + "grad_norm": 3.618016481399536, + "learning_rate": 1.1347420245701068e-06, + "loss": 1.7749, + "step": 27836 + }, + { + "epoch": 1.8682258984597833, + "grad_norm": 4.045231342315674, + "learning_rate": 1.1324408769780636e-06, + "loss": 1.852, + "step": 27838 + }, + { + "epoch": 1.8683601221435522, + "grad_norm": 4.295373916625977, + "learning_rate": 1.1301420382820893e-06, + "loss": 2.0479, + "step": 27840 + }, + { + "epoch": 1.868494345827321, + "grad_norm": 3.91694974899292, + "learning_rate": 1.1278455085908025e-06, + "loss": 1.9159, + "step": 27842 + }, + { + "epoch": 1.8686285695110902, + "grad_norm": 4.511457443237305, + "learning_rate": 1.125551288012716e-06, + "loss": 1.7625, + "step": 27844 + }, + { + "epoch": 1.8687627931948594, + "grad_norm": 4.420565605163574, + "learning_rate": 1.123259376656216e-06, + "loss": 1.9303, + "step": 27846 + }, + { + "epoch": 1.8688970168786283, + "grad_norm": 4.698948383331299, + "learning_rate": 1.12096977462961e-06, + "loss": 1.7401, + "step": 27848 + }, + { + "epoch": 1.8690312405623972, + "grad_norm": 4.227839946746826, + "learning_rate": 1.1186824820410614e-06, + "loss": 1.7189, + "step": 27850 + }, + { + "epoch": 1.8691654642461661, + "grad_norm": 3.6674466133117676, + "learning_rate": 1.1163974989986447e-06, + "loss": 1.8098, + "step": 27852 + }, + { + "epoch": 1.8692996879299353, + "grad_norm": 4.873587131500244, + "learning_rate": 1.1141148256103128e-06, + "loss": 1.9741, + "step": 27854 + }, + { + "epoch": 1.8694339116137042, + "grad_norm": 4.253871440887451, + "learning_rate": 1.111834461983935e-06, + "loss": 2.0478, + "step": 27856 + }, + { + "epoch": 1.8695681352974733, + "grad_norm": 3.778656482696533, + "learning_rate": 1.1095564082272469e-06, + "loss": 1.7998, + "step": 27858 + }, + { + "epoch": 1.8697023589812423, + "grad_norm": 4.744358062744141, + "learning_rate": 1.1072806644478739e-06, + "loss": 1.9333, + "step": 27860 + }, + { + "epoch": 1.8698365826650112, + "grad_norm": 4.218618869781494, + "learning_rate": 1.105007230753341e-06, + "loss": 1.6571, + "step": 27862 + }, + { + "epoch": 1.86997080634878, + "grad_norm": 4.275766849517822, + "learning_rate": 1.1027361072510788e-06, + "loss": 1.8246, + "step": 27864 + }, + { + "epoch": 1.8701050300325492, + "grad_norm": 4.684969425201416, + "learning_rate": 1.1004672940483796e-06, + "loss": 2.0189, + "step": 27866 + }, + { + "epoch": 1.8702392537163184, + "grad_norm": 3.7866921424865723, + "learning_rate": 1.0982007912524405e-06, + "loss": 1.5828, + "step": 27868 + }, + { + "epoch": 1.8703734774000873, + "grad_norm": 4.1014299392700195, + "learning_rate": 1.0959365989703541e-06, + "loss": 1.7087, + "step": 27870 + }, + { + "epoch": 1.8705077010838562, + "grad_norm": 4.125133037567139, + "learning_rate": 1.093674717309101e-06, + "loss": 1.7529, + "step": 27872 + }, + { + "epoch": 1.8706419247676251, + "grad_norm": 3.706160306930542, + "learning_rate": 1.0914151463755407e-06, + "loss": 1.7467, + "step": 27874 + }, + { + "epoch": 1.8707761484513943, + "grad_norm": 4.149466514587402, + "learning_rate": 1.089157886276454e-06, + "loss": 1.8344, + "step": 27876 + }, + { + "epoch": 1.8709103721351632, + "grad_norm": 3.808236837387085, + "learning_rate": 1.086902937118467e-06, + "loss": 1.6243, + "step": 27878 + }, + { + "epoch": 1.8710445958189323, + "grad_norm": 4.3116607666015625, + "learning_rate": 1.084650299008133e-06, + "loss": 1.8988, + "step": 27880 + }, + { + "epoch": 1.8711788195027013, + "grad_norm": 4.0037078857421875, + "learning_rate": 1.082399972051884e-06, + "loss": 2.0268, + "step": 27882 + }, + { + "epoch": 1.8713130431864702, + "grad_norm": 3.499826192855835, + "learning_rate": 1.080151956356046e-06, + "loss": 1.8461, + "step": 27884 + }, + { + "epoch": 1.871447266870239, + "grad_norm": 3.771484851837158, + "learning_rate": 1.0779062520268335e-06, + "loss": 1.8231, + "step": 27886 + }, + { + "epoch": 1.8715814905540082, + "grad_norm": 4.06089973449707, + "learning_rate": 1.0756628591703455e-06, + "loss": 1.872, + "step": 27888 + }, + { + "epoch": 1.8717157142377774, + "grad_norm": 4.550949573516846, + "learning_rate": 1.0734217778925858e-06, + "loss": 1.7101, + "step": 27890 + }, + { + "epoch": 1.8718499379215463, + "grad_norm": 4.140484809875488, + "learning_rate": 1.0711830082994312e-06, + "loss": 2.0072, + "step": 27892 + }, + { + "epoch": 1.8719841616053152, + "grad_norm": 4.711318492889404, + "learning_rate": 1.068946550496669e-06, + "loss": 1.5643, + "step": 27894 + }, + { + "epoch": 1.8721183852890841, + "grad_norm": 4.1185407638549805, + "learning_rate": 1.0667124045899646e-06, + "loss": 1.7381, + "step": 27896 + }, + { + "epoch": 1.8722526089728533, + "grad_norm": 6.217225074768066, + "learning_rate": 1.0644805706848782e-06, + "loss": 1.9301, + "step": 27898 + }, + { + "epoch": 1.8723868326566224, + "grad_norm": 4.015264987945557, + "learning_rate": 1.0622510488868476e-06, + "loss": 1.6971, + "step": 27900 + }, + { + "epoch": 1.8725210563403913, + "grad_norm": 3.986368417739868, + "learning_rate": 1.0600238393012329e-06, + "loss": 1.9108, + "step": 27902 + }, + { + "epoch": 1.8726552800241603, + "grad_norm": 4.256256580352783, + "learning_rate": 1.05779894203325e-06, + "loss": 2.0087, + "step": 27904 + }, + { + "epoch": 1.8727895037079292, + "grad_norm": 4.094473838806152, + "learning_rate": 1.0555763571880318e-06, + "loss": 1.9421, + "step": 27906 + }, + { + "epoch": 1.872923727391698, + "grad_norm": 4.82670259475708, + "learning_rate": 1.053356084870577e-06, + "loss": 1.9632, + "step": 27908 + }, + { + "epoch": 1.8730579510754672, + "grad_norm": 3.606086492538452, + "learning_rate": 1.051138125185802e-06, + "loss": 1.7671, + "step": 27910 + }, + { + "epoch": 1.8731921747592364, + "grad_norm": 4.620556354522705, + "learning_rate": 1.0489224782384955e-06, + "loss": 1.9783, + "step": 27912 + }, + { + "epoch": 1.8733263984430053, + "grad_norm": 3.459566354751587, + "learning_rate": 1.046709144133351e-06, + "loss": 1.9465, + "step": 27914 + }, + { + "epoch": 1.8734606221267742, + "grad_norm": 3.611170768737793, + "learning_rate": 1.0444981229749295e-06, + "loss": 1.6022, + "step": 27916 + }, + { + "epoch": 1.8735948458105431, + "grad_norm": 3.6338562965393066, + "learning_rate": 1.0422894148677087e-06, + "loss": 1.8711, + "step": 27918 + }, + { + "epoch": 1.8737290694943123, + "grad_norm": 4.121998310089111, + "learning_rate": 1.0400830199160328e-06, + "loss": 2.4902, + "step": 27920 + }, + { + "epoch": 1.8738632931780814, + "grad_norm": 4.40189790725708, + "learning_rate": 1.0378789382241682e-06, + "loss": 1.8649, + "step": 27922 + }, + { + "epoch": 1.8739975168618503, + "grad_norm": 4.321652412414551, + "learning_rate": 1.035677169896243e-06, + "loss": 1.7091, + "step": 27924 + }, + { + "epoch": 1.8741317405456193, + "grad_norm": 4.447340488433838, + "learning_rate": 1.033477715036285e-06, + "loss": 1.9366, + "step": 27926 + }, + { + "epoch": 1.8742659642293882, + "grad_norm": 4.029574871063232, + "learning_rate": 1.0312805737482167e-06, + "loss": 1.8497, + "step": 27928 + }, + { + "epoch": 1.8744001879131573, + "grad_norm": 4.057535648345947, + "learning_rate": 1.0290857461358495e-06, + "loss": 1.8017, + "step": 27930 + }, + { + "epoch": 1.8745344115969262, + "grad_norm": 4.456622123718262, + "learning_rate": 1.0268932323028834e-06, + "loss": 1.8998, + "step": 27932 + }, + { + "epoch": 1.8746686352806954, + "grad_norm": 3.742248773574829, + "learning_rate": 1.0247030323529138e-06, + "loss": 1.9109, + "step": 27934 + }, + { + "epoch": 1.8748028589644643, + "grad_norm": 4.369076728820801, + "learning_rate": 1.0225151463894133e-06, + "loss": 1.9043, + "step": 27936 + }, + { + "epoch": 1.8749370826482332, + "grad_norm": 15.849043846130371, + "learning_rate": 1.020329574515766e-06, + "loss": 2.0292, + "step": 27938 + }, + { + "epoch": 1.8750713063320021, + "grad_norm": 4.4804368019104, + "learning_rate": 1.018146316835228e-06, + "loss": 2.35, + "step": 27940 + }, + { + "epoch": 1.8752055300157713, + "grad_norm": 4.119014739990234, + "learning_rate": 1.0159653734509666e-06, + "loss": 1.8843, + "step": 27942 + }, + { + "epoch": 1.8753397536995404, + "grad_norm": 4.396493911743164, + "learning_rate": 1.0137867444660055e-06, + "loss": 2.1233, + "step": 27944 + }, + { + "epoch": 1.8754739773833093, + "grad_norm": 3.6766138076782227, + "learning_rate": 1.0116104299833062e-06, + "loss": 1.861, + "step": 27946 + }, + { + "epoch": 1.8756082010670783, + "grad_norm": 3.913895845413208, + "learning_rate": 1.0094364301056759e-06, + "loss": 1.8745, + "step": 27948 + }, + { + "epoch": 1.8757424247508472, + "grad_norm": 4.079484462738037, + "learning_rate": 1.0072647449358375e-06, + "loss": 1.8705, + "step": 27950 + }, + { + "epoch": 1.8758766484346163, + "grad_norm": 3.9409916400909424, + "learning_rate": 1.0050953745764037e-06, + "loss": 1.6155, + "step": 27952 + }, + { + "epoch": 1.8760108721183852, + "grad_norm": 4.315953731536865, + "learning_rate": 1.0029283191298644e-06, + "loss": 1.9615, + "step": 27954 + }, + { + "epoch": 1.8761450958021544, + "grad_norm": 3.9245681762695312, + "learning_rate": 1.0007635786986214e-06, + "loss": 1.7824, + "step": 27956 + }, + { + "epoch": 1.8762793194859233, + "grad_norm": 4.092309474945068, + "learning_rate": 9.98601153384937e-07, + "loss": 1.8785, + "step": 27958 + }, + { + "epoch": 1.8764135431696922, + "grad_norm": 3.4673213958740234, + "learning_rate": 9.964410432909965e-07, + "loss": 1.7834, + "step": 27960 + }, + { + "epoch": 1.8765477668534611, + "grad_norm": 4.600630283355713, + "learning_rate": 9.94283248518857e-07, + "loss": 2.0163, + "step": 27962 + }, + { + "epoch": 1.8766819905372303, + "grad_norm": 4.289376735687256, + "learning_rate": 9.921277691704644e-07, + "loss": 1.853, + "step": 27964 + }, + { + "epoch": 1.8768162142209994, + "grad_norm": 4.209737300872803, + "learning_rate": 9.899746053476655e-07, + "loss": 1.8543, + "step": 27966 + }, + { + "epoch": 1.8769504379047683, + "grad_norm": 4.5778703689575195, + "learning_rate": 9.878237571521898e-07, + "loss": 1.8272, + "step": 27968 + }, + { + "epoch": 1.8770846615885373, + "grad_norm": 3.94956111907959, + "learning_rate": 9.85675224685667e-07, + "loss": 2.1045, + "step": 27970 + }, + { + "epoch": 1.8772188852723062, + "grad_norm": 4.4332685470581055, + "learning_rate": 9.835290080496107e-07, + "loss": 1.9387, + "step": 27972 + }, + { + "epoch": 1.8773531089560753, + "grad_norm": 4.3221588134765625, + "learning_rate": 9.813851073454117e-07, + "loss": 1.9179, + "step": 27974 + }, + { + "epoch": 1.8774873326398445, + "grad_norm": 3.4491004943847656, + "learning_rate": 9.792435226743835e-07, + "loss": 1.8818, + "step": 27976 + }, + { + "epoch": 1.8776215563236134, + "grad_norm": 3.7980432510375977, + "learning_rate": 9.771042541377008e-07, + "loss": 1.9019, + "step": 27978 + }, + { + "epoch": 1.8777557800073823, + "grad_norm": 3.857921600341797, + "learning_rate": 9.749673018364436e-07, + "loss": 1.8882, + "step": 27980 + }, + { + "epoch": 1.8778900036911512, + "grad_norm": 4.029998779296875, + "learning_rate": 9.728326658715815e-07, + "loss": 1.9839, + "step": 27982 + }, + { + "epoch": 1.8780242273749201, + "grad_norm": 4.506307125091553, + "learning_rate": 9.707003463439668e-07, + "loss": 1.7917, + "step": 27984 + }, + { + "epoch": 1.8781584510586893, + "grad_norm": 4.375648021697998, + "learning_rate": 9.685703433543414e-07, + "loss": 1.8381, + "step": 27986 + }, + { + "epoch": 1.8782926747424584, + "grad_norm": 4.932759761810303, + "learning_rate": 9.66442657003358e-07, + "loss": 2.2448, + "step": 27988 + }, + { + "epoch": 1.8784268984262273, + "grad_norm": 5.249248504638672, + "learning_rate": 9.643172873915363e-07, + "loss": 1.7846, + "step": 27990 + }, + { + "epoch": 1.8785611221099963, + "grad_norm": 3.658806562423706, + "learning_rate": 9.621942346193013e-07, + "loss": 1.7565, + "step": 27992 + }, + { + "epoch": 1.8786953457937652, + "grad_norm": 4.564085006713867, + "learning_rate": 9.600734987869564e-07, + "loss": 2.1134, + "step": 27994 + }, + { + "epoch": 1.8788295694775343, + "grad_norm": 3.6224427223205566, + "learning_rate": 9.579550799947046e-07, + "loss": 1.6917, + "step": 27996 + }, + { + "epoch": 1.8789637931613035, + "grad_norm": 4.246708393096924, + "learning_rate": 9.558389783426436e-07, + "loss": 1.7387, + "step": 27998 + }, + { + "epoch": 1.8790980168450724, + "grad_norm": 4.2644219398498535, + "learning_rate": 9.53725193930749e-07, + "loss": 1.9195, + "step": 28000 + }, + { + "epoch": 1.8792322405288413, + "grad_norm": 4.451013565063477, + "learning_rate": 9.516137268588854e-07, + "loss": 1.8324, + "step": 28002 + }, + { + "epoch": 1.8793664642126102, + "grad_norm": 3.682281732559204, + "learning_rate": 9.495045772268341e-07, + "loss": 1.6804, + "step": 28004 + }, + { + "epoch": 1.8795006878963794, + "grad_norm": 4.5145134925842285, + "learning_rate": 9.473977451342265e-07, + "loss": 1.8089, + "step": 28006 + }, + { + "epoch": 1.8796349115801483, + "grad_norm": 4.617901802062988, + "learning_rate": 9.452932306806273e-07, + "loss": 1.8377, + "step": 28008 + }, + { + "epoch": 1.8797691352639174, + "grad_norm": 4.237493515014648, + "learning_rate": 9.43191033965457e-07, + "loss": 1.8604, + "step": 28010 + }, + { + "epoch": 1.8799033589476863, + "grad_norm": 4.351297855377197, + "learning_rate": 9.410911550880475e-07, + "loss": 1.819, + "step": 28012 + }, + { + "epoch": 1.8800375826314553, + "grad_norm": 4.052086353302002, + "learning_rate": 9.38993594147608e-07, + "loss": 1.6066, + "step": 28014 + }, + { + "epoch": 1.8801718063152242, + "grad_norm": 4.236962795257568, + "learning_rate": 9.368983512432483e-07, + "loss": 1.8689, + "step": 28016 + }, + { + "epoch": 1.8803060299989933, + "grad_norm": 4.53511905670166, + "learning_rate": 9.348054264739614e-07, + "loss": 1.8426, + "step": 28018 + }, + { + "epoch": 1.8804402536827625, + "grad_norm": 3.9796879291534424, + "learning_rate": 9.327148199386404e-07, + "loss": 1.8641, + "step": 28020 + }, + { + "epoch": 1.8805744773665314, + "grad_norm": 4.018612384796143, + "learning_rate": 9.306265317360507e-07, + "loss": 2.0215, + "step": 28022 + }, + { + "epoch": 1.8807087010503003, + "grad_norm": 3.7293694019317627, + "learning_rate": 9.28540561964869e-07, + "loss": 1.7235, + "step": 28024 + }, + { + "epoch": 1.8808429247340692, + "grad_norm": 4.318394184112549, + "learning_rate": 9.264569107236498e-07, + "loss": 1.9854, + "step": 28026 + }, + { + "epoch": 1.8809771484178384, + "grad_norm": 4.527831554412842, + "learning_rate": 9.243755781108476e-07, + "loss": 1.9659, + "step": 28028 + }, + { + "epoch": 1.8811113721016073, + "grad_norm": 3.9032227993011475, + "learning_rate": 9.222965642247949e-07, + "loss": 1.7972, + "step": 28030 + }, + { + "epoch": 1.8812455957853764, + "grad_norm": 3.8436903953552246, + "learning_rate": 9.202198691637131e-07, + "loss": 1.7835, + "step": 28032 + }, + { + "epoch": 1.8813798194691453, + "grad_norm": 5.531787872314453, + "learning_rate": 9.181454930257405e-07, + "loss": 1.8103, + "step": 28034 + }, + { + "epoch": 1.8815140431529143, + "grad_norm": 4.579492568969727, + "learning_rate": 9.160734359088763e-07, + "loss": 1.9689, + "step": 28036 + }, + { + "epoch": 1.8816482668366832, + "grad_norm": 3.731904983520508, + "learning_rate": 9.140036979110256e-07, + "loss": 1.8428, + "step": 28038 + }, + { + "epoch": 1.8817824905204523, + "grad_norm": 4.134167671203613, + "learning_rate": 9.119362791299713e-07, + "loss": 1.7154, + "step": 28040 + }, + { + "epoch": 1.8819167142042215, + "grad_norm": 4.4252400398254395, + "learning_rate": 9.098711796634018e-07, + "loss": 1.9813, + "step": 28042 + }, + { + "epoch": 1.8820509378879904, + "grad_norm": 3.897536039352417, + "learning_rate": 9.078083996088838e-07, + "loss": 1.8532, + "step": 28044 + }, + { + "epoch": 1.8821851615717593, + "grad_norm": 3.2539985179901123, + "learning_rate": 9.05747939063889e-07, + "loss": 1.4845, + "step": 28046 + }, + { + "epoch": 1.8823193852555282, + "grad_norm": 3.5590691566467285, + "learning_rate": 9.036897981257675e-07, + "loss": 1.7695, + "step": 28048 + }, + { + "epoch": 1.8824536089392974, + "grad_norm": 4.281125545501709, + "learning_rate": 9.016339768917526e-07, + "loss": 1.7908, + "step": 28050 + }, + { + "epoch": 1.8825878326230665, + "grad_norm": 4.336503982543945, + "learning_rate": 8.995804754589832e-07, + "loss": 1.9566, + "step": 28052 + }, + { + "epoch": 1.8827220563068354, + "grad_norm": 4.398139476776123, + "learning_rate": 8.975292939244928e-07, + "loss": 1.9968, + "step": 28054 + }, + { + "epoch": 1.8828562799906043, + "grad_norm": 4.285695552825928, + "learning_rate": 8.954804323851818e-07, + "loss": 1.8409, + "step": 28056 + }, + { + "epoch": 1.8829905036743733, + "grad_norm": 3.8295469284057617, + "learning_rate": 8.934338909378615e-07, + "loss": 1.6222, + "step": 28058 + }, + { + "epoch": 1.8831247273581422, + "grad_norm": 4.959747791290283, + "learning_rate": 8.913896696792212e-07, + "loss": 1.8379, + "step": 28060 + }, + { + "epoch": 1.8832589510419113, + "grad_norm": 3.451972246170044, + "learning_rate": 8.893477687058615e-07, + "loss": 1.8023, + "step": 28062 + }, + { + "epoch": 1.8833931747256805, + "grad_norm": 3.5668506622314453, + "learning_rate": 8.873081881142386e-07, + "loss": 1.6469, + "step": 28064 + }, + { + "epoch": 1.8835273984094494, + "grad_norm": 4.433609962463379, + "learning_rate": 8.85270928000731e-07, + "loss": 1.8462, + "step": 28066 + }, + { + "epoch": 1.8836616220932183, + "grad_norm": 4.614348411560059, + "learning_rate": 8.832359884615893e-07, + "loss": 1.7334, + "step": 28068 + }, + { + "epoch": 1.8837958457769872, + "grad_norm": 4.004487037658691, + "learning_rate": 8.8120336959297e-07, + "loss": 1.8311, + "step": 28070 + }, + { + "epoch": 1.8839300694607564, + "grad_norm": 4.048551082611084, + "learning_rate": 8.791730714908964e-07, + "loss": 1.8398, + "step": 28072 + }, + { + "epoch": 1.8840642931445255, + "grad_norm": 3.6129000186920166, + "learning_rate": 8.771450942513081e-07, + "loss": 1.8449, + "step": 28074 + }, + { + "epoch": 1.8841985168282944, + "grad_norm": 3.939016819000244, + "learning_rate": 8.751194379700179e-07, + "loss": 1.9327, + "step": 28076 + }, + { + "epoch": 1.8843327405120633, + "grad_norm": 3.9538025856018066, + "learning_rate": 8.730961027427321e-07, + "loss": 1.9239, + "step": 28078 + }, + { + "epoch": 1.8844669641958323, + "grad_norm": 4.50231409072876, + "learning_rate": 8.710750886650465e-07, + "loss": 2.113, + "step": 28080 + }, + { + "epoch": 1.8846011878796014, + "grad_norm": 8.158286094665527, + "learning_rate": 8.690563958324627e-07, + "loss": 1.7783, + "step": 28082 + }, + { + "epoch": 1.8847354115633703, + "grad_norm": 4.769750595092773, + "learning_rate": 8.670400243403543e-07, + "loss": 2.113, + "step": 28084 + }, + { + "epoch": 1.8848696352471395, + "grad_norm": 4.1224775314331055, + "learning_rate": 8.65025974283984e-07, + "loss": 2.0366, + "step": 28086 + }, + { + "epoch": 1.8850038589309084, + "grad_norm": 4.534761905670166, + "learning_rate": 8.6301424575852e-07, + "loss": 1.7739, + "step": 28088 + }, + { + "epoch": 1.8851380826146773, + "grad_norm": 4.718416213989258, + "learning_rate": 8.610048388590031e-07, + "loss": 1.9007, + "step": 28090 + }, + { + "epoch": 1.8852723062984462, + "grad_norm": 4.17460823059082, + "learning_rate": 8.58997753680385e-07, + "loss": 1.8125, + "step": 28092 + }, + { + "epoch": 1.8854065299822154, + "grad_norm": 3.9276859760284424, + "learning_rate": 8.569929903174901e-07, + "loss": 1.6895, + "step": 28094 + }, + { + "epoch": 1.8855407536659845, + "grad_norm": 4.833826065063477, + "learning_rate": 8.549905488650422e-07, + "loss": 1.7002, + "step": 28096 + }, + { + "epoch": 1.8856749773497534, + "grad_norm": 4.505548000335693, + "learning_rate": 8.529904294176494e-07, + "loss": 1.8247, + "step": 28098 + }, + { + "epoch": 1.8858092010335223, + "grad_norm": 3.864331007003784, + "learning_rate": 8.509926320698137e-07, + "loss": 1.9727, + "step": 28100 + }, + { + "epoch": 1.8859434247172913, + "grad_norm": 4.014715194702148, + "learning_rate": 8.489971569159261e-07, + "loss": 1.9079, + "step": 28102 + }, + { + "epoch": 1.8860776484010604, + "grad_norm": 4.1535749435424805, + "learning_rate": 8.470040040502836e-07, + "loss": 1.8891, + "step": 28104 + }, + { + "epoch": 1.8862118720848293, + "grad_norm": 4.4767374992370605, + "learning_rate": 8.450131735670386e-07, + "loss": 2.0921, + "step": 28106 + }, + { + "epoch": 1.8863460957685985, + "grad_norm": 3.925844430923462, + "learning_rate": 8.430246655602602e-07, + "loss": 1.7547, + "step": 28108 + }, + { + "epoch": 1.8864803194523674, + "grad_norm": 3.6998279094696045, + "learning_rate": 8.410384801239068e-07, + "loss": 1.6007, + "step": 28110 + }, + { + "epoch": 1.8866145431361363, + "grad_norm": 4.245959281921387, + "learning_rate": 8.390546173518143e-07, + "loss": 1.8147, + "step": 28112 + }, + { + "epoch": 1.8867487668199052, + "grad_norm": 4.012907028198242, + "learning_rate": 8.370730773377245e-07, + "loss": 2.0691, + "step": 28114 + }, + { + "epoch": 1.8868829905036744, + "grad_norm": 4.0799407958984375, + "learning_rate": 8.350938601752567e-07, + "loss": 1.8587, + "step": 28116 + }, + { + "epoch": 1.8870172141874435, + "grad_norm": 4.207050323486328, + "learning_rate": 8.331169659579252e-07, + "loss": 1.794, + "step": 28118 + }, + { + "epoch": 1.8871514378712124, + "grad_norm": 3.904081106185913, + "learning_rate": 8.31142394779133e-07, + "loss": 1.6569, + "step": 28120 + }, + { + "epoch": 1.8872856615549813, + "grad_norm": 4.811886310577393, + "learning_rate": 8.291701467321778e-07, + "loss": 1.9242, + "step": 28122 + }, + { + "epoch": 1.8874198852387503, + "grad_norm": 4.001278400421143, + "learning_rate": 8.272002219102459e-07, + "loss": 2.0714, + "step": 28124 + }, + { + "epoch": 1.8875541089225194, + "grad_norm": 4.041013717651367, + "learning_rate": 8.252326204064021e-07, + "loss": 1.7026, + "step": 28126 + }, + { + "epoch": 1.8876883326062885, + "grad_norm": 4.645742893218994, + "learning_rate": 8.232673423136217e-07, + "loss": 1.8513, + "step": 28128 + }, + { + "epoch": 1.8878225562900575, + "grad_norm": 4.4904632568359375, + "learning_rate": 8.213043877247528e-07, + "loss": 2.0006, + "step": 28130 + }, + { + "epoch": 1.8879567799738264, + "grad_norm": 4.437152862548828, + "learning_rate": 8.193437567325546e-07, + "loss": 1.8508, + "step": 28132 + }, + { + "epoch": 1.8880910036575953, + "grad_norm": 4.522233963012695, + "learning_rate": 8.173854494296529e-07, + "loss": 2.1171, + "step": 28134 + }, + { + "epoch": 1.8882252273413642, + "grad_norm": 4.507450580596924, + "learning_rate": 8.154294659085737e-07, + "loss": 2.0859, + "step": 28136 + }, + { + "epoch": 1.8883594510251334, + "grad_norm": 4.608407020568848, + "learning_rate": 8.13475806261732e-07, + "loss": 2.1188, + "step": 28138 + }, + { + "epoch": 1.8884936747089025, + "grad_norm": 4.1631364822387695, + "learning_rate": 8.115244705814429e-07, + "loss": 2.0022, + "step": 28140 + }, + { + "epoch": 1.8886278983926714, + "grad_norm": 4.100566864013672, + "learning_rate": 8.095754589598936e-07, + "loss": 2.0476, + "step": 28142 + }, + { + "epoch": 1.8887621220764403, + "grad_norm": 4.405656337738037, + "learning_rate": 8.076287714891773e-07, + "loss": 2.1088, + "step": 28144 + }, + { + "epoch": 1.8888963457602093, + "grad_norm": 3.6656243801116943, + "learning_rate": 8.056844082612647e-07, + "loss": 1.5934, + "step": 28146 + }, + { + "epoch": 1.8890305694439784, + "grad_norm": 4.246613502502441, + "learning_rate": 8.037423693680324e-07, + "loss": 1.9164, + "step": 28148 + }, + { + "epoch": 1.8891647931277475, + "grad_norm": 4.458495616912842, + "learning_rate": 8.018026549012292e-07, + "loss": 1.8641, + "step": 28150 + }, + { + "epoch": 1.8892990168115165, + "grad_norm": 4.696106433868408, + "learning_rate": 7.998652649525096e-07, + "loss": 1.8698, + "step": 28152 + }, + { + "epoch": 1.8894332404952854, + "grad_norm": 4.205369472503662, + "learning_rate": 7.979301996134059e-07, + "loss": 1.8031, + "step": 28154 + }, + { + "epoch": 1.8895674641790543, + "grad_norm": 4.6458821296691895, + "learning_rate": 7.95997458975345e-07, + "loss": 1.8187, + "step": 28156 + }, + { + "epoch": 1.8897016878628234, + "grad_norm": 4.829007625579834, + "learning_rate": 7.940670431296538e-07, + "loss": 1.8051, + "step": 28158 + }, + { + "epoch": 1.8898359115465924, + "grad_norm": 4.039152145385742, + "learning_rate": 7.921389521675315e-07, + "loss": 1.7835, + "step": 28160 + }, + { + "epoch": 1.8899701352303615, + "grad_norm": 4.147747039794922, + "learning_rate": 7.902131861800888e-07, + "loss": 1.8488, + "step": 28162 + }, + { + "epoch": 1.8901043589141304, + "grad_norm": 4.421010971069336, + "learning_rate": 7.882897452583027e-07, + "loss": 1.748, + "step": 28164 + }, + { + "epoch": 1.8902385825978993, + "grad_norm": 4.2043046951293945, + "learning_rate": 7.863686294930506e-07, + "loss": 1.8299, + "step": 28166 + }, + { + "epoch": 1.8903728062816683, + "grad_norm": 4.817014217376709, + "learning_rate": 7.844498389751098e-07, + "loss": 1.9342, + "step": 28168 + }, + { + "epoch": 1.8905070299654374, + "grad_norm": 4.155102252960205, + "learning_rate": 7.825333737951357e-07, + "loss": 2.1237, + "step": 28170 + }, + { + "epoch": 1.8906412536492065, + "grad_norm": 4.538684844970703, + "learning_rate": 7.80619234043678e-07, + "loss": 1.8065, + "step": 28172 + }, + { + "epoch": 1.8907754773329755, + "grad_norm": 3.784930467605591, + "learning_rate": 7.787074198111755e-07, + "loss": 1.8358, + "step": 28174 + }, + { + "epoch": 1.8909097010167444, + "grad_norm": 4.052253246307373, + "learning_rate": 7.767979311879559e-07, + "loss": 1.7694, + "step": 28176 + }, + { + "epoch": 1.8910439247005133, + "grad_norm": 3.9428555965423584, + "learning_rate": 7.748907682642414e-07, + "loss": 1.7732, + "step": 28178 + }, + { + "epoch": 1.8911781483842824, + "grad_norm": 3.9893481731414795, + "learning_rate": 7.729859311301435e-07, + "loss": 2.0182, + "step": 28180 + }, + { + "epoch": 1.8913123720680514, + "grad_norm": 3.5151455402374268, + "learning_rate": 7.710834198756622e-07, + "loss": 1.674, + "step": 28182 + }, + { + "epoch": 1.8914465957518205, + "grad_norm": 4.1504316329956055, + "learning_rate": 7.691832345906757e-07, + "loss": 1.8492, + "step": 28184 + }, + { + "epoch": 1.8915808194355894, + "grad_norm": 4.644562244415283, + "learning_rate": 7.672853753649789e-07, + "loss": 1.8545, + "step": 28186 + }, + { + "epoch": 1.8917150431193583, + "grad_norm": 4.632850170135498, + "learning_rate": 7.653898422882333e-07, + "loss": 2.0307, + "step": 28188 + }, + { + "epoch": 1.8918492668031273, + "grad_norm": 4.639255046844482, + "learning_rate": 7.634966354500117e-07, + "loss": 1.842, + "step": 28190 + }, + { + "epoch": 1.8919834904868964, + "grad_norm": 3.545111894607544, + "learning_rate": 7.616057549397427e-07, + "loss": 1.6161, + "step": 28192 + }, + { + "epoch": 1.8921177141706655, + "grad_norm": 3.6963589191436768, + "learning_rate": 7.597172008467824e-07, + "loss": 1.7653, + "step": 28194 + }, + { + "epoch": 1.8922519378544345, + "grad_norm": 4.117505073547363, + "learning_rate": 7.578309732603539e-07, + "loss": 1.9884, + "step": 28196 + }, + { + "epoch": 1.8923861615382034, + "grad_norm": 4.150859832763672, + "learning_rate": 7.559470722695916e-07, + "loss": 1.9587, + "step": 28198 + }, + { + "epoch": 1.8925203852219723, + "grad_norm": 4.238442897796631, + "learning_rate": 7.540654979634909e-07, + "loss": 1.7674, + "step": 28200 + }, + { + "epoch": 1.8926546089057414, + "grad_norm": 4.656989097595215, + "learning_rate": 7.52186250430953e-07, + "loss": 1.919, + "step": 28202 + }, + { + "epoch": 1.8927888325895106, + "grad_norm": 3.6535935401916504, + "learning_rate": 7.503093297607732e-07, + "loss": 1.9145, + "step": 28204 + }, + { + "epoch": 1.8929230562732795, + "grad_norm": 6.0994768142700195, + "learning_rate": 7.484347360416367e-07, + "loss": 1.7269, + "step": 28206 + }, + { + "epoch": 1.8930572799570484, + "grad_norm": 3.9868829250335693, + "learning_rate": 7.465624693621109e-07, + "loss": 1.7904, + "step": 28208 + }, + { + "epoch": 1.8931915036408173, + "grad_norm": 4.660818099975586, + "learning_rate": 7.446925298106532e-07, + "loss": 2.0287, + "step": 28210 + }, + { + "epoch": 1.8933257273245863, + "grad_norm": 4.229373931884766, + "learning_rate": 7.428249174756152e-07, + "loss": 2.1609, + "step": 28212 + }, + { + "epoch": 1.8934599510083554, + "grad_norm": 4.112990379333496, + "learning_rate": 7.409596324452428e-07, + "loss": 1.8659, + "step": 28214 + }, + { + "epoch": 1.8935941746921245, + "grad_norm": 4.492527484893799, + "learning_rate": 7.390966748076599e-07, + "loss": 1.95, + "step": 28216 + }, + { + "epoch": 1.8937283983758935, + "grad_norm": 4.184624671936035, + "learning_rate": 7.372360446509019e-07, + "loss": 1.8468, + "step": 28218 + }, + { + "epoch": 1.8938626220596624, + "grad_norm": 3.897782802581787, + "learning_rate": 7.353777420628594e-07, + "loss": 1.9388, + "step": 28220 + }, + { + "epoch": 1.8939968457434313, + "grad_norm": 4.354614734649658, + "learning_rate": 7.335217671313455e-07, + "loss": 1.9828, + "step": 28222 + }, + { + "epoch": 1.8941310694272004, + "grad_norm": 4.313567638397217, + "learning_rate": 7.316681199440568e-07, + "loss": 1.9581, + "step": 28224 + }, + { + "epoch": 1.8942652931109696, + "grad_norm": 4.253139972686768, + "learning_rate": 7.298168005885564e-07, + "loss": 1.8318, + "step": 28226 + }, + { + "epoch": 1.8943995167947385, + "grad_norm": 3.939821720123291, + "learning_rate": 7.279678091523357e-07, + "loss": 1.8784, + "step": 28228 + }, + { + "epoch": 1.8945337404785074, + "grad_norm": 4.122672080993652, + "learning_rate": 7.261211457227413e-07, + "loss": 1.6137, + "step": 28230 + }, + { + "epoch": 1.8946679641622763, + "grad_norm": 3.60455584526062, + "learning_rate": 7.242768103870312e-07, + "loss": 1.7251, + "step": 28232 + }, + { + "epoch": 1.8948021878460455, + "grad_norm": 4.7661519050598145, + "learning_rate": 7.22434803232347e-07, + "loss": 1.9445, + "step": 28234 + }, + { + "epoch": 1.8949364115298144, + "grad_norm": 4.3577117919921875, + "learning_rate": 7.205951243457132e-07, + "loss": 1.8433, + "step": 28236 + }, + { + "epoch": 1.8950706352135835, + "grad_norm": 4.934947490692139, + "learning_rate": 7.187577738140605e-07, + "loss": 2.1031, + "step": 28238 + }, + { + "epoch": 1.8952048588973525, + "grad_norm": 4.282474517822266, + "learning_rate": 7.169227517241972e-07, + "loss": 1.9266, + "step": 28240 + }, + { + "epoch": 1.8953390825811214, + "grad_norm": 4.766186714172363, + "learning_rate": 7.150900581628206e-07, + "loss": 1.8684, + "step": 28242 + }, + { + "epoch": 1.8954733062648903, + "grad_norm": 3.9148781299591064, + "learning_rate": 7.132596932165225e-07, + "loss": 1.7646, + "step": 28244 + }, + { + "epoch": 1.8956075299486594, + "grad_norm": 4.09848690032959, + "learning_rate": 7.114316569717894e-07, + "loss": 1.7535, + "step": 28246 + }, + { + "epoch": 1.8957417536324286, + "grad_norm": 4.532949924468994, + "learning_rate": 7.096059495149854e-07, + "loss": 1.9043, + "step": 28248 + }, + { + "epoch": 1.8958759773161975, + "grad_norm": 4.156283855438232, + "learning_rate": 7.077825709323749e-07, + "loss": 1.7915, + "step": 28250 + }, + { + "epoch": 1.8960102009999664, + "grad_norm": 4.032278537750244, + "learning_rate": 7.059615213101112e-07, + "loss": 1.7815, + "step": 28252 + }, + { + "epoch": 1.8961444246837353, + "grad_norm": 5.448999404907227, + "learning_rate": 7.041428007342254e-07, + "loss": 1.9142, + "step": 28254 + }, + { + "epoch": 1.8962786483675045, + "grad_norm": 4.297141075134277, + "learning_rate": 7.023264092906711e-07, + "loss": 2.0486, + "step": 28256 + }, + { + "epoch": 1.8964128720512734, + "grad_norm": 3.693990468978882, + "learning_rate": 7.005123470652408e-07, + "loss": 1.6599, + "step": 28258 + }, + { + "epoch": 1.8965470957350425, + "grad_norm": 4.551848411560059, + "learning_rate": 6.987006141436659e-07, + "loss": 2.028, + "step": 28260 + }, + { + "epoch": 1.8966813194188115, + "grad_norm": 4.5344319343566895, + "learning_rate": 6.968912106115333e-07, + "loss": 1.7522, + "step": 28262 + }, + { + "epoch": 1.8968155431025804, + "grad_norm": 3.8991706371307373, + "learning_rate": 6.95084136554347e-07, + "loss": 1.8247, + "step": 28264 + }, + { + "epoch": 1.8969497667863493, + "grad_norm": 5.04088020324707, + "learning_rate": 6.932793920574831e-07, + "loss": 1.8504, + "step": 28266 + }, + { + "epoch": 1.8970839904701184, + "grad_norm": 4.026176452636719, + "learning_rate": 6.914769772062069e-07, + "loss": 1.9298, + "step": 28268 + }, + { + "epoch": 1.8972182141538876, + "grad_norm": 4.27121114730835, + "learning_rate": 6.896768920856778e-07, + "loss": 1.9206, + "step": 28270 + }, + { + "epoch": 1.8973524378376565, + "grad_norm": 4.567217826843262, + "learning_rate": 6.878791367809556e-07, + "loss": 1.897, + "step": 28272 + }, + { + "epoch": 1.8974866615214254, + "grad_norm": 4.01743745803833, + "learning_rate": 6.86083711376978e-07, + "loss": 1.9072, + "step": 28274 + }, + { + "epoch": 1.8976208852051943, + "grad_norm": 4.382755279541016, + "learning_rate": 6.842906159585716e-07, + "loss": 2.0104, + "step": 28276 + }, + { + "epoch": 1.8977551088889635, + "grad_norm": 3.5249578952789307, + "learning_rate": 6.824998506104574e-07, + "loss": 1.7828, + "step": 28278 + }, + { + "epoch": 1.8978893325727326, + "grad_norm": 4.127964019775391, + "learning_rate": 6.807114154172456e-07, + "loss": 2.128, + "step": 28280 + }, + { + "epoch": 1.8980235562565015, + "grad_norm": 3.7751739025115967, + "learning_rate": 6.789253104634352e-07, + "loss": 1.7526, + "step": 28282 + }, + { + "epoch": 1.8981577799402705, + "grad_norm": 3.9620983600616455, + "learning_rate": 6.77141535833431e-07, + "loss": 1.9203, + "step": 28284 + }, + { + "epoch": 1.8982920036240394, + "grad_norm": 3.9752776622772217, + "learning_rate": 6.753600916114877e-07, + "loss": 1.8474, + "step": 28286 + }, + { + "epoch": 1.8984262273078083, + "grad_norm": 3.7728707790374756, + "learning_rate": 6.735809778817881e-07, + "loss": 1.7992, + "step": 28288 + }, + { + "epoch": 1.8985604509915774, + "grad_norm": 4.4159836769104, + "learning_rate": 6.718041947283926e-07, + "loss": 1.9934, + "step": 28290 + }, + { + "epoch": 1.8986946746753466, + "grad_norm": 4.553109169006348, + "learning_rate": 6.700297422352508e-07, + "loss": 1.8855, + "step": 28292 + }, + { + "epoch": 1.8988288983591155, + "grad_norm": 4.051055908203125, + "learning_rate": 6.682576204862012e-07, + "loss": 1.606, + "step": 28294 + }, + { + "epoch": 1.8989631220428844, + "grad_norm": 4.14284610748291, + "learning_rate": 6.664878295649713e-07, + "loss": 1.8257, + "step": 28296 + }, + { + "epoch": 1.8990973457266533, + "grad_norm": 4.061545372009277, + "learning_rate": 6.647203695551829e-07, + "loss": 2.0513, + "step": 28298 + }, + { + "epoch": 1.8992315694104225, + "grad_norm": 4.327839374542236, + "learning_rate": 6.629552405403361e-07, + "loss": 1.8676, + "step": 28300 + }, + { + "epoch": 1.8993657930941916, + "grad_norm": 4.020798683166504, + "learning_rate": 6.611924426038419e-07, + "loss": 1.6609, + "step": 28302 + }, + { + "epoch": 1.8995000167779605, + "grad_norm": 3.867178201675415, + "learning_rate": 6.594319758289836e-07, + "loss": 1.8954, + "step": 28304 + }, + { + "epoch": 1.8996342404617295, + "grad_norm": 3.6179282665252686, + "learning_rate": 6.576738402989447e-07, + "loss": 1.7739, + "step": 28306 + }, + { + "epoch": 1.8997684641454984, + "grad_norm": 3.98771071434021, + "learning_rate": 6.55918036096781e-07, + "loss": 1.803, + "step": 28308 + }, + { + "epoch": 1.8999026878292675, + "grad_norm": 4.7286882400512695, + "learning_rate": 6.54164563305465e-07, + "loss": 2.0219, + "step": 28310 + }, + { + "epoch": 1.9000369115130364, + "grad_norm": 4.285242080688477, + "learning_rate": 6.52413422007836e-07, + "loss": 1.8735, + "step": 28312 + }, + { + "epoch": 1.9001711351968056, + "grad_norm": 4.12814998626709, + "learning_rate": 6.506646122866445e-07, + "loss": 1.6028, + "step": 28314 + }, + { + "epoch": 1.9003053588805745, + "grad_norm": 4.274417400360107, + "learning_rate": 6.489181342244965e-07, + "loss": 2.0347, + "step": 28316 + }, + { + "epoch": 1.9004395825643434, + "grad_norm": 3.938891887664795, + "learning_rate": 6.471739879039262e-07, + "loss": 1.7324, + "step": 28318 + }, + { + "epoch": 1.9005738062481123, + "grad_norm": 4.572030067443848, + "learning_rate": 6.454321734073344e-07, + "loss": 2.1656, + "step": 28320 + }, + { + "epoch": 1.9007080299318815, + "grad_norm": 3.9899888038635254, + "learning_rate": 6.43692690817027e-07, + "loss": 2.0959, + "step": 28322 + }, + { + "epoch": 1.9008422536156506, + "grad_norm": 4.275861740112305, + "learning_rate": 6.419555402151777e-07, + "loss": 2.2068, + "step": 28324 + }, + { + "epoch": 1.9009764772994195, + "grad_norm": 4.268010139465332, + "learning_rate": 6.402207216838762e-07, + "loss": 1.8417, + "step": 28326 + }, + { + "epoch": 1.9011107009831885, + "grad_norm": 3.9826624393463135, + "learning_rate": 6.384882353050791e-07, + "loss": 2.0546, + "step": 28328 + }, + { + "epoch": 1.9012449246669574, + "grad_norm": 4.127320766448975, + "learning_rate": 6.367580811606544e-07, + "loss": 1.9141, + "step": 28330 + }, + { + "epoch": 1.9013791483507265, + "grad_norm": 4.634667873382568, + "learning_rate": 6.350302593323365e-07, + "loss": 1.704, + "step": 28332 + }, + { + "epoch": 1.9015133720344954, + "grad_norm": 4.293228626251221, + "learning_rate": 6.333047699017714e-07, + "loss": 1.7543, + "step": 28334 + }, + { + "epoch": 1.9016475957182646, + "grad_norm": 4.092335224151611, + "learning_rate": 6.315816129504715e-07, + "loss": 1.9755, + "step": 28336 + }, + { + "epoch": 1.9017818194020335, + "grad_norm": 4.128259658813477, + "learning_rate": 6.298607885598718e-07, + "loss": 1.8402, + "step": 28338 + }, + { + "epoch": 1.9019160430858024, + "grad_norm": 4.048526287078857, + "learning_rate": 6.281422968112571e-07, + "loss": 1.8815, + "step": 28340 + }, + { + "epoch": 1.9020502667695713, + "grad_norm": 3.953449249267578, + "learning_rate": 6.26426137785846e-07, + "loss": 1.7374, + "step": 28342 + }, + { + "epoch": 1.9021844904533405, + "grad_norm": 3.406179666519165, + "learning_rate": 6.247123115647013e-07, + "loss": 1.7436, + "step": 28344 + }, + { + "epoch": 1.9023187141371096, + "grad_norm": 3.9160525798797607, + "learning_rate": 6.230008182288083e-07, + "loss": 1.9416, + "step": 28346 + }, + { + "epoch": 1.9024529378208785, + "grad_norm": 4.630080699920654, + "learning_rate": 6.212916578590355e-07, + "loss": 1.7217, + "step": 28348 + }, + { + "epoch": 1.9025871615046475, + "grad_norm": 4.315492153167725, + "learning_rate": 6.195848305361296e-07, + "loss": 2.1542, + "step": 28350 + }, + { + "epoch": 1.9027213851884164, + "grad_norm": 4.289133071899414, + "learning_rate": 6.178803363407371e-07, + "loss": 1.8652, + "step": 28352 + }, + { + "epoch": 1.9028556088721855, + "grad_norm": 4.221247673034668, + "learning_rate": 6.16178175353399e-07, + "loss": 1.8558, + "step": 28354 + }, + { + "epoch": 1.9029898325559547, + "grad_norm": 4.014423847198486, + "learning_rate": 6.144783476545234e-07, + "loss": 1.913, + "step": 28356 + }, + { + "epoch": 1.9031240562397236, + "grad_norm": 4.015252590179443, + "learning_rate": 6.127808533244406e-07, + "loss": 1.9425, + "step": 28358 + }, + { + "epoch": 1.9032582799234925, + "grad_norm": 4.332964897155762, + "learning_rate": 6.110856924433473e-07, + "loss": 1.8352, + "step": 28360 + }, + { + "epoch": 1.9033925036072614, + "grad_norm": 4.2127156257629395, + "learning_rate": 6.093928650913294e-07, + "loss": 1.7748, + "step": 28362 + }, + { + "epoch": 1.9035267272910303, + "grad_norm": 3.9584972858428955, + "learning_rate": 6.077023713483843e-07, + "loss": 1.6999, + "step": 28364 + }, + { + "epoch": 1.9036609509747995, + "grad_norm": 3.5038068294525146, + "learning_rate": 6.060142112943701e-07, + "loss": 1.6837, + "step": 28366 + }, + { + "epoch": 1.9037951746585686, + "grad_norm": 4.075870990753174, + "learning_rate": 6.043283850090564e-07, + "loss": 1.9808, + "step": 28368 + }, + { + "epoch": 1.9039293983423375, + "grad_norm": 4.030488014221191, + "learning_rate": 6.026448925720962e-07, + "loss": 1.6511, + "step": 28370 + }, + { + "epoch": 1.9040636220261065, + "grad_norm": 4.7844977378845215, + "learning_rate": 6.009637340630258e-07, + "loss": 2.1023, + "step": 28372 + }, + { + "epoch": 1.9041978457098754, + "grad_norm": 4.22601842880249, + "learning_rate": 5.992849095612819e-07, + "loss": 1.7506, + "step": 28374 + }, + { + "epoch": 1.9043320693936445, + "grad_norm": 4.42030668258667, + "learning_rate": 5.97608419146184e-07, + "loss": 1.8517, + "step": 28376 + }, + { + "epoch": 1.9044662930774137, + "grad_norm": 4.5308966636657715, + "learning_rate": 5.95934262896941e-07, + "loss": 1.81, + "step": 28378 + }, + { + "epoch": 1.9046005167611826, + "grad_norm": 4.091604232788086, + "learning_rate": 5.942624408926623e-07, + "loss": 1.7501, + "step": 28380 + }, + { + "epoch": 1.9047347404449515, + "grad_norm": 4.087924480438232, + "learning_rate": 5.925929532123231e-07, + "loss": 1.7478, + "step": 28382 + }, + { + "epoch": 1.9048689641287204, + "grad_norm": 3.8197410106658936, + "learning_rate": 5.909257999348106e-07, + "loss": 1.5791, + "step": 28384 + }, + { + "epoch": 1.9050031878124896, + "grad_norm": 4.414361953735352, + "learning_rate": 5.892609811388949e-07, + "loss": 1.864, + "step": 28386 + }, + { + "epoch": 1.9051374114962585, + "grad_norm": 4.20404577255249, + "learning_rate": 5.87598496903241e-07, + "loss": 1.9273, + "step": 28388 + }, + { + "epoch": 1.9052716351800276, + "grad_norm": 4.249545097351074, + "learning_rate": 5.859383473063918e-07, + "loss": 1.8283, + "step": 28390 + }, + { + "epoch": 1.9054058588637965, + "grad_norm": 3.953094720840454, + "learning_rate": 5.842805324267897e-07, + "loss": 1.7643, + "step": 28392 + }, + { + "epoch": 1.9055400825475655, + "grad_norm": 3.693310022354126, + "learning_rate": 5.826250523427557e-07, + "loss": 1.6836, + "step": 28394 + }, + { + "epoch": 1.9056743062313344, + "grad_norm": 3.7979562282562256, + "learning_rate": 5.809719071325103e-07, + "loss": 1.621, + "step": 28396 + }, + { + "epoch": 1.9058085299151035, + "grad_norm": 4.679463863372803, + "learning_rate": 5.793210968741691e-07, + "loss": 1.9532, + "step": 28398 + }, + { + "epoch": 1.9059427535988727, + "grad_norm": 4.243464946746826, + "learning_rate": 5.776726216457251e-07, + "loss": 1.5626, + "step": 28400 + }, + { + "epoch": 1.9060769772826416, + "grad_norm": 3.581423759460449, + "learning_rate": 5.760264815250605e-07, + "loss": 1.7227, + "step": 28402 + }, + { + "epoch": 1.9062112009664105, + "grad_norm": 3.8290319442749023, + "learning_rate": 5.743826765899629e-07, + "loss": 1.9273, + "step": 28404 + }, + { + "epoch": 1.9063454246501794, + "grad_norm": 4.470434665679932, + "learning_rate": 5.727412069180871e-07, + "loss": 1.901, + "step": 28406 + }, + { + "epoch": 1.9064796483339486, + "grad_norm": 3.4563634395599365, + "learning_rate": 5.711020725869986e-07, + "loss": 1.8975, + "step": 28408 + }, + { + "epoch": 1.9066138720177175, + "grad_norm": 4.277403354644775, + "learning_rate": 5.694652736741357e-07, + "loss": 2.0187, + "step": 28410 + }, + { + "epoch": 1.9067480957014866, + "grad_norm": 4.433515548706055, + "learning_rate": 5.678308102568364e-07, + "loss": 1.8524, + "step": 28412 + }, + { + "epoch": 1.9068823193852555, + "grad_norm": 4.015473365783691, + "learning_rate": 5.661986824123278e-07, + "loss": 2.0271, + "step": 28414 + }, + { + "epoch": 1.9070165430690245, + "grad_norm": 4.116622447967529, + "learning_rate": 5.645688902177315e-07, + "loss": 1.9952, + "step": 28416 + }, + { + "epoch": 1.9071507667527934, + "grad_norm": 4.503079414367676, + "learning_rate": 5.62941433750036e-07, + "loss": 2.0511, + "step": 28418 + }, + { + "epoch": 1.9072849904365625, + "grad_norm": 4.151495933532715, + "learning_rate": 5.613163130861521e-07, + "loss": 1.8395, + "step": 28420 + }, + { + "epoch": 1.9074192141203317, + "grad_norm": 4.287341117858887, + "learning_rate": 5.59693528302846e-07, + "loss": 1.8213, + "step": 28422 + }, + { + "epoch": 1.9075534378041006, + "grad_norm": 3.7147834300994873, + "learning_rate": 5.580730794768064e-07, + "loss": 1.7764, + "step": 28424 + }, + { + "epoch": 1.9076876614878695, + "grad_norm": 3.590188980102539, + "learning_rate": 5.564549666845886e-07, + "loss": 2.0937, + "step": 28426 + }, + { + "epoch": 1.9078218851716384, + "grad_norm": 4.84113883972168, + "learning_rate": 5.548391900026484e-07, + "loss": 2.2494, + "step": 28428 + }, + { + "epoch": 1.9079561088554076, + "grad_norm": 3.546868324279785, + "learning_rate": 5.532257495073245e-07, + "loss": 1.8285, + "step": 28430 + }, + { + "epoch": 1.9080903325391767, + "grad_norm": 3.9116504192352295, + "learning_rate": 5.516146452748506e-07, + "loss": 1.9895, + "step": 28432 + }, + { + "epoch": 1.9082245562229456, + "grad_norm": 4.018218040466309, + "learning_rate": 5.500058773813543e-07, + "loss": 2.0311, + "step": 28434 + }, + { + "epoch": 1.9083587799067145, + "grad_norm": 4.118683815002441, + "learning_rate": 5.483994459028363e-07, + "loss": 1.908, + "step": 28436 + }, + { + "epoch": 1.9084930035904835, + "grad_norm": 4.005366802215576, + "learning_rate": 5.467953509152024e-07, + "loss": 2.0854, + "step": 28438 + }, + { + "epoch": 1.9086272272742524, + "grad_norm": 3.656139850616455, + "learning_rate": 5.45193592494242e-07, + "loss": 1.8702, + "step": 28440 + }, + { + "epoch": 1.9087614509580215, + "grad_norm": 4.024683475494385, + "learning_rate": 5.435941707156389e-07, + "loss": 1.6609, + "step": 28442 + }, + { + "epoch": 1.9088956746417907, + "grad_norm": 5.22144889831543, + "learning_rate": 5.41997085654955e-07, + "loss": 1.7851, + "step": 28444 + }, + { + "epoch": 1.9090298983255596, + "grad_norm": 4.09938907623291, + "learning_rate": 5.404023373876521e-07, + "loss": 1.7012, + "step": 28446 + }, + { + "epoch": 1.9091641220093285, + "grad_norm": 4.272224426269531, + "learning_rate": 5.388099259890867e-07, + "loss": 1.7761, + "step": 28448 + }, + { + "epoch": 1.9092983456930974, + "grad_norm": 3.9086947441101074, + "learning_rate": 5.372198515344929e-07, + "loss": 1.5924, + "step": 28450 + }, + { + "epoch": 1.9094325693768666, + "grad_norm": 12.15160846710205, + "learning_rate": 5.356321140989884e-07, + "loss": 1.8894, + "step": 28452 + }, + { + "epoch": 1.9095667930606357, + "grad_norm": 4.330392360687256, + "learning_rate": 5.340467137576022e-07, + "loss": 1.8307, + "step": 28454 + }, + { + "epoch": 1.9097010167444046, + "grad_norm": 4.295815944671631, + "learning_rate": 5.32463650585241e-07, + "loss": 1.7099, + "step": 28456 + }, + { + "epoch": 1.9098352404281735, + "grad_norm": 4.415718078613281, + "learning_rate": 5.308829246567004e-07, + "loss": 1.9624, + "step": 28458 + }, + { + "epoch": 1.9099694641119425, + "grad_norm": 3.9599483013153076, + "learning_rate": 5.293045360466541e-07, + "loss": 1.5637, + "step": 28460 + }, + { + "epoch": 1.9101036877957116, + "grad_norm": 4.216400146484375, + "learning_rate": 5.277284848296981e-07, + "loss": 1.7267, + "step": 28462 + }, + { + "epoch": 1.9102379114794805, + "grad_norm": 3.9203426837921143, + "learning_rate": 5.261547710802894e-07, + "loss": 1.8732, + "step": 28464 + }, + { + "epoch": 1.9103721351632497, + "grad_norm": 4.193074703216553, + "learning_rate": 5.245833948727741e-07, + "loss": 1.7572, + "step": 28466 + }, + { + "epoch": 1.9105063588470186, + "grad_norm": 4.494725227355957, + "learning_rate": 5.230143562814093e-07, + "loss": 2.1004, + "step": 28468 + }, + { + "epoch": 1.9106405825307875, + "grad_norm": 3.521847724914551, + "learning_rate": 5.214476553803193e-07, + "loss": 1.8523, + "step": 28470 + }, + { + "epoch": 1.9107748062145564, + "grad_norm": 4.302805423736572, + "learning_rate": 5.198832922435337e-07, + "loss": 1.8732, + "step": 28472 + }, + { + "epoch": 1.9109090298983256, + "grad_norm": 3.8608548641204834, + "learning_rate": 5.183212669449656e-07, + "loss": 1.5881, + "step": 28474 + }, + { + "epoch": 1.9110432535820947, + "grad_norm": 3.8373594284057617, + "learning_rate": 5.167615795584169e-07, + "loss": 1.9484, + "step": 28476 + }, + { + "epoch": 1.9111774772658636, + "grad_norm": 4.168583393096924, + "learning_rate": 5.152042301575788e-07, + "loss": 1.8105, + "step": 28478 + }, + { + "epoch": 1.9113117009496325, + "grad_norm": 4.192415237426758, + "learning_rate": 5.136492188160313e-07, + "loss": 1.874, + "step": 28480 + }, + { + "epoch": 1.9114459246334015, + "grad_norm": 3.8990373611450195, + "learning_rate": 5.12096545607249e-07, + "loss": 1.8726, + "step": 28482 + }, + { + "epoch": 1.9115801483171706, + "grad_norm": 3.611747980117798, + "learning_rate": 5.105462106045955e-07, + "loss": 1.762, + "step": 28484 + }, + { + "epoch": 1.9117143720009395, + "grad_norm": 3.893322467803955, + "learning_rate": 5.08998213881312e-07, + "loss": 1.8179, + "step": 28486 + }, + { + "epoch": 1.9118485956847087, + "grad_norm": 4.125214576721191, + "learning_rate": 5.074525555105403e-07, + "loss": 2.0861, + "step": 28488 + }, + { + "epoch": 1.9119828193684776, + "grad_norm": 4.4278740882873535, + "learning_rate": 5.059092355653161e-07, + "loss": 1.7922, + "step": 28490 + }, + { + "epoch": 1.9121170430522465, + "grad_norm": 4.498102188110352, + "learning_rate": 5.043682541185479e-07, + "loss": 1.9602, + "step": 28492 + }, + { + "epoch": 1.9122512667360154, + "grad_norm": 4.186977863311768, + "learning_rate": 5.028296112430608e-07, + "loss": 1.7603, + "step": 28494 + }, + { + "epoch": 1.9123854904197846, + "grad_norm": 4.077006816864014, + "learning_rate": 5.012933070115411e-07, + "loss": 1.9162, + "step": 28496 + }, + { + "epoch": 1.9125197141035537, + "grad_norm": 5.267678260803223, + "learning_rate": 4.997593414965751e-07, + "loss": 1.9834, + "step": 28498 + }, + { + "epoch": 1.9126539377873226, + "grad_norm": 3.4513776302337646, + "learning_rate": 4.982277147706382e-07, + "loss": 1.7202, + "step": 28500 + }, + { + "epoch": 1.9127881614710915, + "grad_norm": 4.207413673400879, + "learning_rate": 4.966984269061059e-07, + "loss": 2.0109, + "step": 28502 + }, + { + "epoch": 1.9129223851548605, + "grad_norm": 4.129273414611816, + "learning_rate": 4.951714779752314e-07, + "loss": 1.9751, + "step": 28504 + }, + { + "epoch": 1.9130566088386296, + "grad_norm": 4.077866554260254, + "learning_rate": 4.93646868050146e-07, + "loss": 1.8803, + "step": 28506 + }, + { + "epoch": 1.9131908325223987, + "grad_norm": 3.9162797927856445, + "learning_rate": 4.921245972029087e-07, + "loss": 1.8014, + "step": 28508 + }, + { + "epoch": 1.9133250562061677, + "grad_norm": 4.865694999694824, + "learning_rate": 4.906046655054231e-07, + "loss": 1.8686, + "step": 28510 + }, + { + "epoch": 1.9134592798899366, + "grad_norm": 4.717785835266113, + "learning_rate": 4.89087073029515e-07, + "loss": 1.8808, + "step": 28512 + }, + { + "epoch": 1.9135935035737055, + "grad_norm": 4.166231632232666, + "learning_rate": 4.875718198468827e-07, + "loss": 1.9108, + "step": 28514 + }, + { + "epoch": 1.9137277272574744, + "grad_norm": 3.9637086391448975, + "learning_rate": 4.860589060291188e-07, + "loss": 2.295, + "step": 28516 + }, + { + "epoch": 1.9138619509412436, + "grad_norm": 4.155350208282471, + "learning_rate": 4.845483316477051e-07, + "loss": 1.8334, + "step": 28518 + }, + { + "epoch": 1.9139961746250127, + "grad_norm": 3.975820541381836, + "learning_rate": 4.830400967740178e-07, + "loss": 1.6703, + "step": 28520 + }, + { + "epoch": 1.9141303983087816, + "grad_norm": 4.236272811889648, + "learning_rate": 4.815342014793167e-07, + "loss": 1.9997, + "step": 28522 + }, + { + "epoch": 1.9142646219925505, + "grad_norm": 4.107719421386719, + "learning_rate": 4.800306458347448e-07, + "loss": 1.851, + "step": 28524 + }, + { + "epoch": 1.9143988456763195, + "grad_norm": 4.097564220428467, + "learning_rate": 4.785294299113508e-07, + "loss": 1.8359, + "step": 28526 + }, + { + "epoch": 1.9145330693600886, + "grad_norm": 3.679889678955078, + "learning_rate": 4.770305537800613e-07, + "loss": 1.7779, + "step": 28528 + }, + { + "epoch": 1.9146672930438577, + "grad_norm": 4.25518274307251, + "learning_rate": 4.7553401751169735e-07, + "loss": 1.899, + "step": 28530 + }, + { + "epoch": 1.9148015167276267, + "grad_norm": 3.496350049972534, + "learning_rate": 4.7403982117696923e-07, + "loss": 1.8738, + "step": 28532 + }, + { + "epoch": 1.9149357404113956, + "grad_norm": 4.127264499664307, + "learning_rate": 4.7254796484645925e-07, + "loss": 1.7228, + "step": 28534 + }, + { + "epoch": 1.9150699640951645, + "grad_norm": 3.676823377609253, + "learning_rate": 4.710584485906777e-07, + "loss": 1.6942, + "step": 28536 + }, + { + "epoch": 1.9152041877789336, + "grad_norm": 3.4836158752441406, + "learning_rate": 4.695712724799795e-07, + "loss": 1.8297, + "step": 28538 + }, + { + "epoch": 1.9153384114627026, + "grad_norm": 4.307678699493408, + "learning_rate": 4.680864365846471e-07, + "loss": 2.0166, + "step": 28540 + }, + { + "epoch": 1.9154726351464717, + "grad_norm": 3.9838833808898926, + "learning_rate": 4.666039409748357e-07, + "loss": 1.7126, + "step": 28542 + }, + { + "epoch": 1.9156068588302406, + "grad_norm": 3.85554838180542, + "learning_rate": 4.651237857205781e-07, + "loss": 1.7143, + "step": 28544 + }, + { + "epoch": 1.9157410825140095, + "grad_norm": 4.460402965545654, + "learning_rate": 4.636459708918128e-07, + "loss": 1.9887, + "step": 28546 + }, + { + "epoch": 1.9158753061977785, + "grad_norm": 3.643688917160034, + "learning_rate": 4.6217049655837287e-07, + "loss": 1.7973, + "step": 28548 + }, + { + "epoch": 1.9160095298815476, + "grad_norm": 4.317516803741455, + "learning_rate": 4.606973627899636e-07, + "loss": 2.0923, + "step": 28550 + }, + { + "epoch": 1.9161437535653167, + "grad_norm": 4.265377521514893, + "learning_rate": 4.5922656965618484e-07, + "loss": 1.7246, + "step": 28552 + }, + { + "epoch": 1.9162779772490857, + "grad_norm": 4.3143815994262695, + "learning_rate": 4.577581172265366e-07, + "loss": 1.7861, + "step": 28554 + }, + { + "epoch": 1.9164122009328546, + "grad_norm": 4.300205707550049, + "learning_rate": 4.5629200557039674e-07, + "loss": 2.0305, + "step": 28556 + }, + { + "epoch": 1.9165464246166235, + "grad_norm": 3.9083919525146484, + "learning_rate": 4.5482823475703205e-07, + "loss": 2.0509, + "step": 28558 + }, + { + "epoch": 1.9166806483003926, + "grad_norm": 4.255168437957764, + "learning_rate": 4.53366804855615e-07, + "loss": 1.7995, + "step": 28560 + }, + { + "epoch": 1.9168148719841616, + "grad_norm": 3.9694955348968506, + "learning_rate": 4.519077159351792e-07, + "loss": 1.8098, + "step": 28562 + }, + { + "epoch": 1.9169490956679307, + "grad_norm": 4.231239318847656, + "learning_rate": 4.504509680646751e-07, + "loss": 1.9017, + "step": 28564 + }, + { + "epoch": 1.9170833193516996, + "grad_norm": 4.105679035186768, + "learning_rate": 4.48996561312931e-07, + "loss": 1.8301, + "step": 28566 + }, + { + "epoch": 1.9172175430354685, + "grad_norm": 3.8417165279388428, + "learning_rate": 4.47544495748653e-07, + "loss": 1.8289, + "step": 28568 + }, + { + "epoch": 1.9173517667192375, + "grad_norm": 3.979680299758911, + "learning_rate": 4.4609477144046395e-07, + "loss": 1.9086, + "step": 28570 + }, + { + "epoch": 1.9174859904030066, + "grad_norm": 4.572367191314697, + "learning_rate": 4.4464738845685363e-07, + "loss": 1.9112, + "step": 28572 + }, + { + "epoch": 1.9176202140867757, + "grad_norm": 3.98709774017334, + "learning_rate": 4.432023468662061e-07, + "loss": 1.9805, + "step": 28574 + }, + { + "epoch": 1.9177544377705447, + "grad_norm": 3.480583906173706, + "learning_rate": 4.417596467367946e-07, + "loss": 1.7085, + "step": 28576 + }, + { + "epoch": 1.9178886614543136, + "grad_norm": 4.172345161437988, + "learning_rate": 4.4031928813679235e-07, + "loss": 1.8642, + "step": 28578 + }, + { + "epoch": 1.9180228851380825, + "grad_norm": 4.122261047363281, + "learning_rate": 4.3888127113424496e-07, + "loss": 2.0164, + "step": 28580 + }, + { + "epoch": 1.9181571088218516, + "grad_norm": 4.003516674041748, + "learning_rate": 4.3744559579710353e-07, + "loss": 1.9722, + "step": 28582 + }, + { + "epoch": 1.9182913325056208, + "grad_norm": 4.100731372833252, + "learning_rate": 4.360122621931917e-07, + "loss": 1.9251, + "step": 28584 + }, + { + "epoch": 1.9184255561893897, + "grad_norm": 4.105721473693848, + "learning_rate": 4.345812703902441e-07, + "loss": 1.8342, + "step": 28586 + }, + { + "epoch": 1.9185597798731586, + "grad_norm": 4.005073070526123, + "learning_rate": 4.3315262045586224e-07, + "loss": 2.0023, + "step": 28588 + }, + { + "epoch": 1.9186940035569275, + "grad_norm": 3.5308642387390137, + "learning_rate": 4.317263124575477e-07, + "loss": 1.8694, + "step": 28590 + }, + { + "epoch": 1.9188282272406965, + "grad_norm": 3.8521738052368164, + "learning_rate": 4.3030234646269653e-07, + "loss": 1.8811, + "step": 28592 + }, + { + "epoch": 1.9189624509244656, + "grad_norm": 4.606770038604736, + "learning_rate": 4.288807225385827e-07, + "loss": 1.9596, + "step": 28594 + }, + { + "epoch": 1.9190966746082347, + "grad_norm": 4.053338050842285, + "learning_rate": 4.274614407523747e-07, + "loss": 1.8173, + "step": 28596 + }, + { + "epoch": 1.9192308982920037, + "grad_norm": 4.05116605758667, + "learning_rate": 4.2604450117114114e-07, + "loss": 1.8215, + "step": 28598 + }, + { + "epoch": 1.9193651219757726, + "grad_norm": 4.299737453460693, + "learning_rate": 4.2462990386181735e-07, + "loss": 1.8664, + "step": 28600 + }, + { + "epoch": 1.9194993456595415, + "grad_norm": 5.274219036102295, + "learning_rate": 4.2321764889124425e-07, + "loss": 1.8616, + "step": 28602 + }, + { + "epoch": 1.9196335693433106, + "grad_norm": 4.1227593421936035, + "learning_rate": 4.2180773632614637e-07, + "loss": 1.7773, + "step": 28604 + }, + { + "epoch": 1.9197677930270798, + "grad_norm": 4.044521808624268, + "learning_rate": 4.2040016623314804e-07, + "loss": 1.8467, + "step": 28606 + }, + { + "epoch": 1.9199020167108487, + "grad_norm": 3.8064181804656982, + "learning_rate": 4.189949386787462e-07, + "loss": 1.8297, + "step": 28608 + }, + { + "epoch": 1.9200362403946176, + "grad_norm": 3.526052474975586, + "learning_rate": 4.1759205372933206e-07, + "loss": 1.9229, + "step": 28610 + }, + { + "epoch": 1.9201704640783865, + "grad_norm": 3.9099459648132324, + "learning_rate": 4.161915114511972e-07, + "loss": 1.9371, + "step": 28612 + }, + { + "epoch": 1.9203046877621557, + "grad_norm": 4.6606950759887695, + "learning_rate": 4.147933119105107e-07, + "loss": 1.9461, + "step": 28614 + }, + { + "epoch": 1.9204389114459246, + "grad_norm": 3.6474897861480713, + "learning_rate": 4.133974551733366e-07, + "loss": 1.65, + "step": 28616 + }, + { + "epoch": 1.9205731351296937, + "grad_norm": 3.875153064727783, + "learning_rate": 4.12003941305622e-07, + "loss": 2.0332, + "step": 28618 + }, + { + "epoch": 1.9207073588134627, + "grad_norm": 3.7185888290405273, + "learning_rate": 4.106127703732088e-07, + "loss": 1.6672, + "step": 28620 + }, + { + "epoch": 1.9208415824972316, + "grad_norm": 3.7216992378234863, + "learning_rate": 4.0922394244183315e-07, + "loss": 1.9181, + "step": 28622 + }, + { + "epoch": 1.9209758061810005, + "grad_norm": 4.007662773132324, + "learning_rate": 4.0783745757710935e-07, + "loss": 2.0373, + "step": 28624 + }, + { + "epoch": 1.9211100298647696, + "grad_norm": 4.351637840270996, + "learning_rate": 4.0645331584454606e-07, + "loss": 2.0702, + "step": 28626 + }, + { + "epoch": 1.9212442535485388, + "grad_norm": 3.964808940887451, + "learning_rate": 4.0507151730954095e-07, + "loss": 1.9349, + "step": 28628 + }, + { + "epoch": 1.9213784772323077, + "grad_norm": 4.079667091369629, + "learning_rate": 4.036920620373863e-07, + "loss": 1.7846, + "step": 28630 + }, + { + "epoch": 1.9215127009160766, + "grad_norm": 3.6951637268066406, + "learning_rate": 4.0231495009325215e-07, + "loss": 1.7473, + "step": 28632 + }, + { + "epoch": 1.9216469245998455, + "grad_norm": 3.4966578483581543, + "learning_rate": 4.0094018154220316e-07, + "loss": 1.8685, + "step": 28634 + }, + { + "epoch": 1.9217811482836147, + "grad_norm": 4.04823637008667, + "learning_rate": 3.9956775644920395e-07, + "loss": 1.7229, + "step": 28636 + }, + { + "epoch": 1.9219153719673836, + "grad_norm": 3.5633349418640137, + "learning_rate": 3.9819767487909165e-07, + "loss": 1.7107, + "step": 28638 + }, + { + "epoch": 1.9220495956511527, + "grad_norm": 3.9478631019592285, + "learning_rate": 3.968299368966033e-07, + "loss": 1.8299, + "step": 28640 + }, + { + "epoch": 1.9221838193349217, + "grad_norm": 4.542208194732666, + "learning_rate": 3.954645425663539e-07, + "loss": 1.9254, + "step": 28642 + }, + { + "epoch": 1.9223180430186906, + "grad_norm": 4.066678047180176, + "learning_rate": 3.9410149195286963e-07, + "loss": 1.7988, + "step": 28644 + }, + { + "epoch": 1.9224522667024595, + "grad_norm": 5.296835899353027, + "learning_rate": 3.927407851205378e-07, + "loss": 1.8076, + "step": 28646 + }, + { + "epoch": 1.9225864903862286, + "grad_norm": 4.106463432312012, + "learning_rate": 3.9138242213365703e-07, + "loss": 1.7642, + "step": 28648 + }, + { + "epoch": 1.9227207140699978, + "grad_norm": 4.409948348999023, + "learning_rate": 3.900264030564038e-07, + "loss": 1.6493, + "step": 28650 + }, + { + "epoch": 1.9228549377537667, + "grad_norm": 4.760623931884766, + "learning_rate": 3.8867272795285456e-07, + "loss": 1.7642, + "step": 28652 + }, + { + "epoch": 1.9229891614375356, + "grad_norm": 4.662656784057617, + "learning_rate": 3.8732139688695825e-07, + "loss": 1.9104, + "step": 28654 + }, + { + "epoch": 1.9231233851213045, + "grad_norm": 4.014105319976807, + "learning_rate": 3.8597240992256924e-07, + "loss": 1.5928, + "step": 28656 + }, + { + "epoch": 1.9232576088050737, + "grad_norm": 4.320156097412109, + "learning_rate": 3.846257671234199e-07, + "loss": 2.015, + "step": 28658 + }, + { + "epoch": 1.9233918324888426, + "grad_norm": 4.037420272827148, + "learning_rate": 3.8328146855314275e-07, + "loss": 1.7884, + "step": 28660 + }, + { + "epoch": 1.9235260561726117, + "grad_norm": 3.7216758728027344, + "learning_rate": 3.8193951427524243e-07, + "loss": 1.76, + "step": 28662 + }, + { + "epoch": 1.9236602798563807, + "grad_norm": 7.015322208404541, + "learning_rate": 3.8059990435313495e-07, + "loss": 1.9299, + "step": 28664 + }, + { + "epoch": 1.9237945035401496, + "grad_norm": 4.815721035003662, + "learning_rate": 3.792626388501086e-07, + "loss": 2.0607, + "step": 28666 + }, + { + "epoch": 1.9239287272239185, + "grad_norm": 4.1414408683776855, + "learning_rate": 3.7792771782934613e-07, + "loss": 1.9704, + "step": 28668 + }, + { + "epoch": 1.9240629509076876, + "grad_norm": 3.6824324131011963, + "learning_rate": 3.7659514135391924e-07, + "loss": 1.7458, + "step": 28670 + }, + { + "epoch": 1.9241971745914568, + "grad_norm": 4.329347133636475, + "learning_rate": 3.7526490948679995e-07, + "loss": 1.8201, + "step": 28672 + }, + { + "epoch": 1.9243313982752257, + "grad_norm": 4.291193008422852, + "learning_rate": 3.739370222908267e-07, + "loss": 2.0028, + "step": 28674 + }, + { + "epoch": 1.9244656219589946, + "grad_norm": 4.367865085601807, + "learning_rate": 3.726114798287439e-07, + "loss": 1.8882, + "step": 28676 + }, + { + "epoch": 1.9245998456427635, + "grad_norm": 4.105124473571777, + "learning_rate": 3.712882821631736e-07, + "loss": 1.7945, + "step": 28678 + }, + { + "epoch": 1.9247340693265327, + "grad_norm": 4.380781650543213, + "learning_rate": 3.6996742935664906e-07, + "loss": 1.8406, + "step": 28680 + }, + { + "epoch": 1.9248682930103018, + "grad_norm": 4.080618381500244, + "learning_rate": 3.6864892147156496e-07, + "loss": 1.553, + "step": 28682 + }, + { + "epoch": 1.9250025166940707, + "grad_norm": 3.5231616497039795, + "learning_rate": 3.673327585702324e-07, + "loss": 1.7879, + "step": 28684 + }, + { + "epoch": 1.9251367403778397, + "grad_norm": 4.101806640625, + "learning_rate": 3.6601894071482403e-07, + "loss": 1.8612, + "step": 28686 + }, + { + "epoch": 1.9252709640616086, + "grad_norm": 4.085648536682129, + "learning_rate": 3.6470746796741783e-07, + "loss": 2.084, + "step": 28688 + }, + { + "epoch": 1.9254051877453777, + "grad_norm": 4.203180313110352, + "learning_rate": 3.6339834038997545e-07, + "loss": 2.1674, + "step": 28690 + }, + { + "epoch": 1.9255394114291466, + "grad_norm": 4.691618919372559, + "learning_rate": 3.62091558044364e-07, + "loss": 2.0281, + "step": 28692 + }, + { + "epoch": 1.9256736351129158, + "grad_norm": 4.2079877853393555, + "learning_rate": 3.607871209923175e-07, + "loss": 1.8081, + "step": 28694 + }, + { + "epoch": 1.9258078587966847, + "grad_norm": 3.431684732437134, + "learning_rate": 3.5948502929546433e-07, + "loss": 1.7505, + "step": 28696 + }, + { + "epoch": 1.9259420824804536, + "grad_norm": 4.079338073730469, + "learning_rate": 3.581852830153276e-07, + "loss": 1.7302, + "step": 28698 + }, + { + "epoch": 1.9260763061642225, + "grad_norm": 4.28552770614624, + "learning_rate": 3.5688788221332483e-07, + "loss": 1.9425, + "step": 28700 + }, + { + "epoch": 1.9262105298479917, + "grad_norm": 4.369853496551514, + "learning_rate": 3.555928269507458e-07, + "loss": 1.7378, + "step": 28702 + }, + { + "epoch": 1.9263447535317608, + "grad_norm": 4.416769027709961, + "learning_rate": 3.5430011728879164e-07, + "loss": 1.9548, + "step": 28704 + }, + { + "epoch": 1.9264789772155297, + "grad_norm": 3.8651912212371826, + "learning_rate": 3.5300975328853014e-07, + "loss": 1.8885, + "step": 28706 + }, + { + "epoch": 1.9266132008992987, + "grad_norm": 4.3296589851379395, + "learning_rate": 3.517217350109236e-07, + "loss": 2.0756, + "step": 28708 + }, + { + "epoch": 1.9267474245830676, + "grad_norm": 4.0863494873046875, + "learning_rate": 3.504360625168457e-07, + "loss": 2.2452, + "step": 28710 + }, + { + "epoch": 1.9268816482668367, + "grad_norm": 4.456963539123535, + "learning_rate": 3.4915273586702546e-07, + "loss": 2.0891, + "step": 28712 + }, + { + "epoch": 1.9270158719506056, + "grad_norm": 5.585681438446045, + "learning_rate": 3.478717551221089e-07, + "loss": 1.9421, + "step": 28714 + }, + { + "epoch": 1.9271500956343748, + "grad_norm": 4.229620456695557, + "learning_rate": 3.465931203426087e-07, + "loss": 1.759, + "step": 28716 + }, + { + "epoch": 1.9272843193181437, + "grad_norm": 3.94560170173645, + "learning_rate": 3.453168315889488e-07, + "loss": 1.8642, + "step": 28718 + }, + { + "epoch": 1.9274185430019126, + "grad_norm": 4.087550640106201, + "learning_rate": 3.440428889214253e-07, + "loss": 1.7665, + "step": 28720 + }, + { + "epoch": 1.9275527666856815, + "grad_norm": 4.614492893218994, + "learning_rate": 3.427712924002402e-07, + "loss": 2.0044, + "step": 28722 + }, + { + "epoch": 1.9276869903694507, + "grad_norm": 4.694524765014648, + "learning_rate": 3.415020420854509e-07, + "loss": 1.9725, + "step": 28724 + }, + { + "epoch": 1.9278212140532198, + "grad_norm": 4.342487335205078, + "learning_rate": 3.402351380370483e-07, + "loss": 1.7605, + "step": 28726 + }, + { + "epoch": 1.9279554377369887, + "grad_norm": 3.8980915546417236, + "learning_rate": 3.3897058031487906e-07, + "loss": 1.9023, + "step": 28728 + }, + { + "epoch": 1.9280896614207577, + "grad_norm": 3.7703263759613037, + "learning_rate": 3.3770836897870086e-07, + "loss": 1.827, + "step": 28730 + }, + { + "epoch": 1.9282238851045266, + "grad_norm": 3.5413150787353516, + "learning_rate": 3.364485040881438e-07, + "loss": 1.8291, + "step": 28732 + }, + { + "epoch": 1.9283581087882957, + "grad_norm": 3.5399413108825684, + "learning_rate": 3.3519098570273797e-07, + "loss": 1.6704, + "step": 28734 + }, + { + "epoch": 1.9284923324720646, + "grad_norm": 4.170644283294678, + "learning_rate": 3.3393581388189157e-07, + "loss": 1.792, + "step": 28736 + }, + { + "epoch": 1.9286265561558338, + "grad_norm": 4.479520797729492, + "learning_rate": 3.326829886849181e-07, + "loss": 2.1708, + "step": 28738 + }, + { + "epoch": 1.9287607798396027, + "grad_norm": 3.9723682403564453, + "learning_rate": 3.314325101710036e-07, + "loss": 1.7839, + "step": 28740 + }, + { + "epoch": 1.9288950035233716, + "grad_norm": 4.7066450119018555, + "learning_rate": 3.301843783992398e-07, + "loss": 1.846, + "step": 28742 + }, + { + "epoch": 1.9290292272071405, + "grad_norm": 3.975949287414551, + "learning_rate": 3.289385934285849e-07, + "loss": 1.8296, + "step": 28744 + }, + { + "epoch": 1.9291634508909097, + "grad_norm": 4.433737754821777, + "learning_rate": 3.276951553179086e-07, + "loss": 1.9671, + "step": 28746 + }, + { + "epoch": 1.9292976745746788, + "grad_norm": 3.610661029815674, + "learning_rate": 3.264540641259639e-07, + "loss": 1.7181, + "step": 28748 + }, + { + "epoch": 1.9294318982584477, + "grad_norm": 4.285410404205322, + "learning_rate": 3.252153199113872e-07, + "loss": 2.1495, + "step": 28750 + }, + { + "epoch": 1.9295661219422167, + "grad_norm": 3.7363221645355225, + "learning_rate": 3.2397892273269835e-07, + "loss": 1.5893, + "step": 28752 + }, + { + "epoch": 1.9297003456259856, + "grad_norm": 4.002396583557129, + "learning_rate": 3.227448726483284e-07, + "loss": 1.941, + "step": 28754 + }, + { + "epoch": 1.9298345693097547, + "grad_norm": 3.73169207572937, + "learning_rate": 3.2151316971656963e-07, + "loss": 1.8061, + "step": 28756 + }, + { + "epoch": 1.9299687929935239, + "grad_norm": 4.51427698135376, + "learning_rate": 3.2028381399563103e-07, + "loss": 1.7915, + "step": 28758 + }, + { + "epoch": 1.9301030166772928, + "grad_norm": 3.7884740829467773, + "learning_rate": 3.190568055435883e-07, + "loss": 1.7466, + "step": 28760 + }, + { + "epoch": 1.9302372403610617, + "grad_norm": 4.229030132293701, + "learning_rate": 3.178321444184229e-07, + "loss": 1.7025, + "step": 28762 + }, + { + "epoch": 1.9303714640448306, + "grad_norm": 3.6988465785980225, + "learning_rate": 3.1660983067798856e-07, + "loss": 1.7134, + "step": 28764 + }, + { + "epoch": 1.9305056877285998, + "grad_norm": 4.12883996963501, + "learning_rate": 3.15389864380039e-07, + "loss": 1.6987, + "step": 28766 + }, + { + "epoch": 1.9306399114123687, + "grad_norm": 4.254380702972412, + "learning_rate": 3.141722455822227e-07, + "loss": 1.832, + "step": 28768 + }, + { + "epoch": 1.9307741350961378, + "grad_norm": 4.151044845581055, + "learning_rate": 3.1295697434206573e-07, + "loss": 1.7225, + "step": 28770 + }, + { + "epoch": 1.9309083587799067, + "grad_norm": 4.170154571533203, + "learning_rate": 3.117440507169833e-07, + "loss": 1.9365, + "step": 28772 + }, + { + "epoch": 1.9310425824636757, + "grad_norm": 4.374159336090088, + "learning_rate": 3.105334747642852e-07, + "loss": 1.8502, + "step": 28774 + }, + { + "epoch": 1.9311768061474446, + "grad_norm": 4.199262619018555, + "learning_rate": 3.093252465411756e-07, + "loss": 1.7498, + "step": 28776 + }, + { + "epoch": 1.9313110298312137, + "grad_norm": 3.982353687286377, + "learning_rate": 3.0811936610473103e-07, + "loss": 1.7803, + "step": 28778 + }, + { + "epoch": 1.9314452535149829, + "grad_norm": 3.429239273071289, + "learning_rate": 3.0691583351193377e-07, + "loss": 1.6845, + "step": 28780 + }, + { + "epoch": 1.9315794771987518, + "grad_norm": 4.016417026519775, + "learning_rate": 3.0571464881964385e-07, + "loss": 1.6525, + "step": 28782 + }, + { + "epoch": 1.9317137008825207, + "grad_norm": 3.967665672302246, + "learning_rate": 3.0451581208462143e-07, + "loss": 1.6771, + "step": 28784 + }, + { + "epoch": 1.9318479245662896, + "grad_norm": 3.7940382957458496, + "learning_rate": 3.0331932336349897e-07, + "loss": 1.6488, + "step": 28786 + }, + { + "epoch": 1.9319821482500588, + "grad_norm": 4.410193920135498, + "learning_rate": 3.0212518271281466e-07, + "loss": 1.7809, + "step": 28788 + }, + { + "epoch": 1.9321163719338277, + "grad_norm": 4.281333923339844, + "learning_rate": 3.0093339018899544e-07, + "loss": 1.936, + "step": 28790 + }, + { + "epoch": 1.9322505956175968, + "grad_norm": 3.683426856994629, + "learning_rate": 2.9974394584834085e-07, + "loss": 1.5966, + "step": 28792 + }, + { + "epoch": 1.9323848193013657, + "grad_norm": 4.01843786239624, + "learning_rate": 2.9855684974705034e-07, + "loss": 1.842, + "step": 28794 + }, + { + "epoch": 1.9325190429851347, + "grad_norm": 3.9611973762512207, + "learning_rate": 2.973721019412179e-07, + "loss": 1.9204, + "step": 28796 + }, + { + "epoch": 1.9326532666689036, + "grad_norm": 4.250574111938477, + "learning_rate": 2.96189702486821e-07, + "loss": 1.8736, + "step": 28798 + }, + { + "epoch": 1.9327874903526727, + "grad_norm": 3.9800422191619873, + "learning_rate": 2.950096514397149e-07, + "loss": 1.7572, + "step": 28800 + }, + { + "epoch": 1.9329217140364419, + "grad_norm": 3.902822732925415, + "learning_rate": 2.9383194885566623e-07, + "loss": 1.89, + "step": 28802 + }, + { + "epoch": 1.9330559377202108, + "grad_norm": 3.9066901206970215, + "learning_rate": 2.926565947903137e-07, + "loss": 1.9893, + "step": 28804 + }, + { + "epoch": 1.9331901614039797, + "grad_norm": 4.316364765167236, + "learning_rate": 2.9148358929919073e-07, + "loss": 1.8338, + "step": 28806 + }, + { + "epoch": 1.9333243850877486, + "grad_norm": 4.408977031707764, + "learning_rate": 2.903129324377252e-07, + "loss": 1.9239, + "step": 28808 + }, + { + "epoch": 1.9334586087715178, + "grad_norm": 4.608521461486816, + "learning_rate": 2.891446242612228e-07, + "loss": 2.0754, + "step": 28810 + }, + { + "epoch": 1.9335928324552867, + "grad_norm": 3.752448320388794, + "learning_rate": 2.8797866482488387e-07, + "loss": 1.8247, + "step": 28812 + }, + { + "epoch": 1.9337270561390558, + "grad_norm": 3.8203675746917725, + "learning_rate": 2.868150541837922e-07, + "loss": 1.5024, + "step": 28814 + }, + { + "epoch": 1.9338612798228247, + "grad_norm": 3.2712368965148926, + "learning_rate": 2.8565379239294257e-07, + "loss": 1.8683, + "step": 28816 + }, + { + "epoch": 1.9339955035065937, + "grad_norm": 3.8176095485687256, + "learning_rate": 2.844948795071856e-07, + "loss": 2.1008, + "step": 28818 + }, + { + "epoch": 1.9341297271903626, + "grad_norm": 4.184636116027832, + "learning_rate": 2.8333831558128856e-07, + "loss": 1.5583, + "step": 28820 + }, + { + "epoch": 1.9342639508741317, + "grad_norm": 4.325780868530273, + "learning_rate": 2.8218410066988554e-07, + "loss": 1.8581, + "step": 28822 + }, + { + "epoch": 1.9343981745579009, + "grad_norm": 4.372868061065674, + "learning_rate": 2.8103223482752183e-07, + "loss": 1.9612, + "step": 28824 + }, + { + "epoch": 1.9345323982416698, + "grad_norm": 3.854546070098877, + "learning_rate": 2.7988271810862054e-07, + "loss": 1.7184, + "step": 28826 + }, + { + "epoch": 1.9346666219254387, + "grad_norm": 3.7839105129241943, + "learning_rate": 2.787355505674882e-07, + "loss": 1.7437, + "step": 28828 + }, + { + "epoch": 1.9348008456092076, + "grad_norm": 3.7672736644744873, + "learning_rate": 2.77590732258326e-07, + "loss": 1.9273, + "step": 28830 + }, + { + "epoch": 1.9349350692929768, + "grad_norm": 3.6332316398620605, + "learning_rate": 2.7644826323522943e-07, + "loss": 1.7375, + "step": 28832 + }, + { + "epoch": 1.935069292976746, + "grad_norm": 4.3118062019348145, + "learning_rate": 2.75308143552172e-07, + "loss": 1.9415, + "step": 28834 + }, + { + "epoch": 1.9352035166605148, + "grad_norm": 4.1131486892700195, + "learning_rate": 2.741703732630274e-07, + "loss": 1.7829, + "step": 28836 + }, + { + "epoch": 1.9353377403442837, + "grad_norm": 4.172784328460693, + "learning_rate": 2.7303495242155254e-07, + "loss": 1.6297, + "step": 28838 + }, + { + "epoch": 1.9354719640280527, + "grad_norm": 3.9813060760498047, + "learning_rate": 2.7190188108138784e-07, + "loss": 1.8464, + "step": 28840 + }, + { + "epoch": 1.9356061877118218, + "grad_norm": 3.3151416778564453, + "learning_rate": 2.707711592960793e-07, + "loss": 1.5498, + "step": 28842 + }, + { + "epoch": 1.9357404113955907, + "grad_norm": 4.311954975128174, + "learning_rate": 2.696427871190399e-07, + "loss": 1.8002, + "step": 28844 + }, + { + "epoch": 1.9358746350793599, + "grad_norm": 4.155744552612305, + "learning_rate": 2.6851676460359355e-07, + "loss": 2.0343, + "step": 28846 + }, + { + "epoch": 1.9360088587631288, + "grad_norm": 4.0194525718688965, + "learning_rate": 2.67393091802931e-07, + "loss": 1.8652, + "step": 28848 + }, + { + "epoch": 1.9361430824468977, + "grad_norm": 4.551218032836914, + "learning_rate": 2.6627176877015435e-07, + "loss": 2.2004, + "step": 28850 + }, + { + "epoch": 1.9362773061306666, + "grad_norm": 4.225657939910889, + "learning_rate": 2.651527955582378e-07, + "loss": 2.0285, + "step": 28852 + }, + { + "epoch": 1.9364115298144358, + "grad_norm": 3.4834790229797363, + "learning_rate": 2.6403617222005017e-07, + "loss": 2.031, + "step": 28854 + }, + { + "epoch": 1.936545753498205, + "grad_norm": 4.136857986450195, + "learning_rate": 2.629218988083548e-07, + "loss": 1.8352, + "step": 28856 + }, + { + "epoch": 1.9366799771819738, + "grad_norm": 4.632990837097168, + "learning_rate": 2.6180997537579856e-07, + "loss": 1.9922, + "step": 28858 + }, + { + "epoch": 1.9368142008657427, + "grad_norm": 4.827854156494141, + "learning_rate": 2.60700401974906e-07, + "loss": 2.0356, + "step": 28860 + }, + { + "epoch": 1.9369484245495117, + "grad_norm": 4.396442890167236, + "learning_rate": 2.595931786581185e-07, + "loss": 1.9086, + "step": 28862 + }, + { + "epoch": 1.9370826482332808, + "grad_norm": 3.886652946472168, + "learning_rate": 2.584883054777443e-07, + "loss": 1.7029, + "step": 28864 + }, + { + "epoch": 1.9372168719170497, + "grad_norm": 4.0837812423706055, + "learning_rate": 2.5738578248598044e-07, + "loss": 1.687, + "step": 28866 + }, + { + "epoch": 1.9373510956008189, + "grad_norm": 3.9687631130218506, + "learning_rate": 2.562856097349242e-07, + "loss": 1.8563, + "step": 28868 + }, + { + "epoch": 1.9374853192845878, + "grad_norm": 4.722868919372559, + "learning_rate": 2.551877872765562e-07, + "loss": 1.8285, + "step": 28870 + }, + { + "epoch": 1.9376195429683567, + "grad_norm": 3.5171103477478027, + "learning_rate": 2.540923151627461e-07, + "loss": 1.5315, + "step": 28872 + }, + { + "epoch": 1.9377537666521256, + "grad_norm": 3.8333218097686768, + "learning_rate": 2.52999193445258e-07, + "loss": 1.9813, + "step": 28874 + }, + { + "epoch": 1.9378879903358948, + "grad_norm": 4.148074150085449, + "learning_rate": 2.5190842217573396e-07, + "loss": 1.9138, + "step": 28876 + }, + { + "epoch": 1.938022214019664, + "grad_norm": 3.787607431411743, + "learning_rate": 2.5082000140570493e-07, + "loss": 1.8305, + "step": 28878 + }, + { + "epoch": 1.9381564377034328, + "grad_norm": 4.897492408752441, + "learning_rate": 2.4973393118660757e-07, + "loss": 1.7904, + "step": 28880 + }, + { + "epoch": 1.9382906613872017, + "grad_norm": 3.896101713180542, + "learning_rate": 2.4865021156975085e-07, + "loss": 1.6094, + "step": 28882 + }, + { + "epoch": 1.9384248850709707, + "grad_norm": 4.807461738586426, + "learning_rate": 2.4756884260634384e-07, + "loss": 1.9731, + "step": 28884 + }, + { + "epoch": 1.9385591087547398, + "grad_norm": 3.621094226837158, + "learning_rate": 2.464898243474734e-07, + "loss": 1.6916, + "step": 28886 + }, + { + "epoch": 1.9386933324385087, + "grad_norm": 4.103925704956055, + "learning_rate": 2.454131568441154e-07, + "loss": 1.7909, + "step": 28888 + }, + { + "epoch": 1.9388275561222779, + "grad_norm": 3.635885000228882, + "learning_rate": 2.443388401471569e-07, + "loss": 1.9676, + "step": 28890 + }, + { + "epoch": 1.9389617798060468, + "grad_norm": 3.6393356323242188, + "learning_rate": 2.432668743073463e-07, + "loss": 1.8259, + "step": 28892 + }, + { + "epoch": 1.9390960034898157, + "grad_norm": 3.9940340518951416, + "learning_rate": 2.421972593753319e-07, + "loss": 1.7806, + "step": 28894 + }, + { + "epoch": 1.9392302271735846, + "grad_norm": 4.028698921203613, + "learning_rate": 2.4112999540165103e-07, + "loss": 1.8679, + "step": 28896 + }, + { + "epoch": 1.9393644508573538, + "grad_norm": 4.038124084472656, + "learning_rate": 2.400650824367301e-07, + "loss": 1.7211, + "step": 28898 + }, + { + "epoch": 1.939498674541123, + "grad_norm": 5.102506637573242, + "learning_rate": 2.3900252053088435e-07, + "loss": 2.1301, + "step": 28900 + }, + { + "epoch": 1.9396328982248918, + "grad_norm": 4.340782165527344, + "learning_rate": 2.379423097343292e-07, + "loss": 1.8351, + "step": 28902 + }, + { + "epoch": 1.9397671219086607, + "grad_norm": 4.010103702545166, + "learning_rate": 2.3688445009713566e-07, + "loss": 1.6674, + "step": 28904 + }, + { + "epoch": 1.9399013455924297, + "grad_norm": 4.016787052154541, + "learning_rate": 2.3582894166930268e-07, + "loss": 1.8743, + "step": 28906 + }, + { + "epoch": 1.9400355692761988, + "grad_norm": 3.8705942630767822, + "learning_rate": 2.3477578450069038e-07, + "loss": 1.661, + "step": 28908 + }, + { + "epoch": 1.940169792959968, + "grad_norm": 4.465551376342773, + "learning_rate": 2.3372497864106445e-07, + "loss": 1.7753, + "step": 28910 + }, + { + "epoch": 1.9403040166437369, + "grad_norm": 4.709262847900391, + "learning_rate": 2.3267652414007414e-07, + "loss": 1.7601, + "step": 28912 + }, + { + "epoch": 1.9404382403275058, + "grad_norm": 4.184792995452881, + "learning_rate": 2.316304210472575e-07, + "loss": 1.8285, + "step": 28914 + }, + { + "epoch": 1.9405724640112747, + "grad_norm": 4.106136322021484, + "learning_rate": 2.3058666941203623e-07, + "loss": 1.9376, + "step": 28916 + }, + { + "epoch": 1.9407066876950438, + "grad_norm": 4.150669574737549, + "learning_rate": 2.2954526928372632e-07, + "loss": 2.0472, + "step": 28918 + }, + { + "epoch": 1.9408409113788128, + "grad_norm": 4.532611846923828, + "learning_rate": 2.2850622071153293e-07, + "loss": 1.8351, + "step": 28920 + }, + { + "epoch": 1.940975135062582, + "grad_norm": 3.669381618499756, + "learning_rate": 2.2746952374455011e-07, + "loss": 1.9281, + "step": 28922 + }, + { + "epoch": 1.9411093587463508, + "grad_norm": 4.214819431304932, + "learning_rate": 2.264351784317553e-07, + "loss": 2.0303, + "step": 28924 + }, + { + "epoch": 1.9412435824301197, + "grad_norm": 3.7186455726623535, + "learning_rate": 2.2540318482202615e-07, + "loss": 1.6382, + "step": 28926 + }, + { + "epoch": 1.9413778061138887, + "grad_norm": 4.112656593322754, + "learning_rate": 2.2437354296411805e-07, + "loss": 2.3177, + "step": 28928 + }, + { + "epoch": 1.9415120297976578, + "grad_norm": 4.783576965332031, + "learning_rate": 2.23346252906681e-07, + "loss": 1.7348, + "step": 28930 + }, + { + "epoch": 1.941646253481427, + "grad_norm": 4.557685852050781, + "learning_rate": 2.223213146982539e-07, + "loss": 1.956, + "step": 28932 + }, + { + "epoch": 1.9417804771651959, + "grad_norm": 4.344444274902344, + "learning_rate": 2.2129872838725364e-07, + "loss": 1.7573, + "step": 28934 + }, + { + "epoch": 1.9419147008489648, + "grad_norm": 3.9184987545013428, + "learning_rate": 2.2027849402201373e-07, + "loss": 2.207, + "step": 28936 + }, + { + "epoch": 1.9420489245327337, + "grad_norm": 4.304259300231934, + "learning_rate": 2.1926061165071788e-07, + "loss": 1.7759, + "step": 28938 + }, + { + "epoch": 1.9421831482165028, + "grad_norm": 3.7064120769500732, + "learning_rate": 2.1824508132147204e-07, + "loss": 1.9432, + "step": 28940 + }, + { + "epoch": 1.9423173719002718, + "grad_norm": 3.934603691101074, + "learning_rate": 2.1723190308225448e-07, + "loss": 1.8816, + "step": 28942 + }, + { + "epoch": 1.942451595584041, + "grad_norm": 4.942292213439941, + "learning_rate": 2.1622107698093808e-07, + "loss": 2.1253, + "step": 28944 + }, + { + "epoch": 1.9425858192678098, + "grad_norm": 3.818629741668701, + "learning_rate": 2.1521260306527903e-07, + "loss": 1.691, + "step": 28946 + }, + { + "epoch": 1.9427200429515787, + "grad_norm": 3.9868273735046387, + "learning_rate": 2.1420648138292253e-07, + "loss": 1.6975, + "step": 28948 + }, + { + "epoch": 1.9428542666353477, + "grad_norm": 4.201986312866211, + "learning_rate": 2.1320271198141395e-07, + "loss": 1.8455, + "step": 28950 + }, + { + "epoch": 1.9429884903191168, + "grad_norm": 4.356717586517334, + "learning_rate": 2.122012949081764e-07, + "loss": 1.7744, + "step": 28952 + }, + { + "epoch": 1.943122714002886, + "grad_norm": 3.836244583129883, + "learning_rate": 2.112022302105221e-07, + "loss": 1.9547, + "step": 28954 + }, + { + "epoch": 1.9432569376866549, + "grad_norm": 4.2509284019470215, + "learning_rate": 2.1020551793565768e-07, + "loss": 1.6791, + "step": 28956 + }, + { + "epoch": 1.9433911613704238, + "grad_norm": 4.011435508728027, + "learning_rate": 2.092111581306788e-07, + "loss": 1.7994, + "step": 28958 + }, + { + "epoch": 1.9435253850541927, + "grad_norm": 3.654696226119995, + "learning_rate": 2.0821915084255906e-07, + "loss": 1.9252, + "step": 28960 + }, + { + "epoch": 1.9436596087379618, + "grad_norm": 3.811150312423706, + "learning_rate": 2.0722949611817198e-07, + "loss": 1.9587, + "step": 28962 + }, + { + "epoch": 1.9437938324217308, + "grad_norm": 3.8950998783111572, + "learning_rate": 2.0624219400428023e-07, + "loss": 1.8965, + "step": 28964 + }, + { + "epoch": 1.9439280561055, + "grad_norm": 4.024796962738037, + "learning_rate": 2.052572445475298e-07, + "loss": 1.7579, + "step": 28966 + }, + { + "epoch": 1.9440622797892688, + "grad_norm": 3.573364496231079, + "learning_rate": 2.0427464779445572e-07, + "loss": 1.7794, + "step": 28968 + }, + { + "epoch": 1.9441965034730377, + "grad_norm": 3.569404363632202, + "learning_rate": 2.0329440379148746e-07, + "loss": 2.127, + "step": 28970 + }, + { + "epoch": 1.9443307271568067, + "grad_norm": 4.604458808898926, + "learning_rate": 2.02316512584938e-07, + "loss": 1.8836, + "step": 28972 + }, + { + "epoch": 1.9444649508405758, + "grad_norm": 4.122928142547607, + "learning_rate": 2.0134097422100928e-07, + "loss": 1.8058, + "step": 28974 + }, + { + "epoch": 1.944599174524345, + "grad_norm": 4.133227348327637, + "learning_rate": 2.0036778874579775e-07, + "loss": 1.9131, + "step": 28976 + }, + { + "epoch": 1.9447333982081139, + "grad_norm": 3.820744752883911, + "learning_rate": 1.9939695620527777e-07, + "loss": 1.9539, + "step": 28978 + }, + { + "epoch": 1.9448676218918828, + "grad_norm": 4.699275970458984, + "learning_rate": 1.984284766453237e-07, + "loss": 1.8893, + "step": 28980 + }, + { + "epoch": 1.9450018455756517, + "grad_norm": 4.122354984283447, + "learning_rate": 1.9746235011169344e-07, + "loss": 1.8746, + "step": 28982 + }, + { + "epoch": 1.9451360692594208, + "grad_norm": 4.111239433288574, + "learning_rate": 1.964985766500338e-07, + "loss": 1.8769, + "step": 28984 + }, + { + "epoch": 1.94527029294319, + "grad_norm": 4.494509220123291, + "learning_rate": 1.9553715630588053e-07, + "loss": 2.0656, + "step": 28986 + }, + { + "epoch": 1.945404516626959, + "grad_norm": 4.303322792053223, + "learning_rate": 1.9457808912466402e-07, + "loss": 1.8312, + "step": 28988 + }, + { + "epoch": 1.9455387403107278, + "grad_norm": 3.962733745574951, + "learning_rate": 1.9362137515169242e-07, + "loss": 1.6612, + "step": 28990 + }, + { + "epoch": 1.9456729639944967, + "grad_norm": 3.823240280151367, + "learning_rate": 1.9266701443217406e-07, + "loss": 1.8904, + "step": 28992 + }, + { + "epoch": 1.9458071876782659, + "grad_norm": 3.6574525833129883, + "learning_rate": 1.9171500701119504e-07, + "loss": 1.7548, + "step": 28994 + }, + { + "epoch": 1.9459414113620348, + "grad_norm": 4.024068832397461, + "learning_rate": 1.907653529337361e-07, + "loss": 1.7357, + "step": 28996 + }, + { + "epoch": 1.946075635045804, + "grad_norm": 4.303837776184082, + "learning_rate": 1.8981805224467242e-07, + "loss": 1.7923, + "step": 28998 + }, + { + "epoch": 1.9462098587295729, + "grad_norm": 4.450523853302002, + "learning_rate": 1.8887310498875155e-07, + "loss": 2.0083, + "step": 29000 + }, + { + "epoch": 1.9463440824133418, + "grad_norm": 4.232508182525635, + "learning_rate": 1.879305112106322e-07, + "loss": 1.9733, + "step": 29002 + }, + { + "epoch": 1.9464783060971107, + "grad_norm": 4.130880832672119, + "learning_rate": 1.8699027095484545e-07, + "loss": 1.9662, + "step": 29004 + }, + { + "epoch": 1.9466125297808798, + "grad_norm": 3.8544156551361084, + "learning_rate": 1.8605238426581683e-07, + "loss": 1.8463, + "step": 29006 + }, + { + "epoch": 1.946746753464649, + "grad_norm": 4.540312767028809, + "learning_rate": 1.8511685118785538e-07, + "loss": 2.0479, + "step": 29008 + }, + { + "epoch": 1.946880977148418, + "grad_norm": 3.8221380710601807, + "learning_rate": 1.841836717651646e-07, + "loss": 1.888, + "step": 29010 + }, + { + "epoch": 1.9470152008321868, + "grad_norm": 4.195241451263428, + "learning_rate": 1.83252846041837e-07, + "loss": 1.8743, + "step": 29012 + }, + { + "epoch": 1.9471494245159557, + "grad_norm": 4.129055500030518, + "learning_rate": 1.823243740618541e-07, + "loss": 1.876, + "step": 29014 + }, + { + "epoch": 1.9472836481997249, + "grad_norm": 4.651121139526367, + "learning_rate": 1.8139825586908076e-07, + "loss": 1.8992, + "step": 29016 + }, + { + "epoch": 1.9474178718834938, + "grad_norm": 4.144933223724365, + "learning_rate": 1.8047449150727648e-07, + "loss": 1.7995, + "step": 29018 + }, + { + "epoch": 1.947552095567263, + "grad_norm": 4.110438823699951, + "learning_rate": 1.795530810200896e-07, + "loss": 1.6365, + "step": 29020 + }, + { + "epoch": 1.9476863192510319, + "grad_norm": 4.171736717224121, + "learning_rate": 1.786340244510465e-07, + "loss": 1.9319, + "step": 29022 + }, + { + "epoch": 1.9478205429348008, + "grad_norm": 4.2382707595825195, + "learning_rate": 1.7771732184357904e-07, + "loss": 2.0053, + "step": 29024 + }, + { + "epoch": 1.9479547666185697, + "grad_norm": 3.976552724838257, + "learning_rate": 1.7680297324099703e-07, + "loss": 1.8325, + "step": 29026 + }, + { + "epoch": 1.9480889903023388, + "grad_norm": 4.297659397125244, + "learning_rate": 1.758909786864993e-07, + "loss": 1.8787, + "step": 29028 + }, + { + "epoch": 1.948223213986108, + "grad_norm": 3.8601925373077393, + "learning_rate": 1.7498133822317908e-07, + "loss": 1.8234, + "step": 29030 + }, + { + "epoch": 1.948357437669877, + "grad_norm": 3.9192965030670166, + "learning_rate": 1.7407405189401315e-07, + "loss": 1.6722, + "step": 29032 + }, + { + "epoch": 1.9484916613536458, + "grad_norm": 3.994136095046997, + "learning_rate": 1.7316911974187276e-07, + "loss": 1.8183, + "step": 29034 + }, + { + "epoch": 1.9486258850374147, + "grad_norm": 4.112056732177734, + "learning_rate": 1.7226654180950708e-07, + "loss": 1.8987, + "step": 29036 + }, + { + "epoch": 1.9487601087211839, + "grad_norm": 4.066359996795654, + "learning_rate": 1.7136631813957082e-07, + "loss": 1.7867, + "step": 29038 + }, + { + "epoch": 1.9488943324049528, + "grad_norm": 4.6653971672058105, + "learning_rate": 1.7046844877458556e-07, + "loss": 1.9441, + "step": 29040 + }, + { + "epoch": 1.949028556088722, + "grad_norm": 3.961143732070923, + "learning_rate": 1.6957293375698403e-07, + "loss": 1.8712, + "step": 29042 + }, + { + "epoch": 1.9491627797724909, + "grad_norm": 4.222787857055664, + "learning_rate": 1.6867977312907678e-07, + "loss": 2.122, + "step": 29044 + }, + { + "epoch": 1.9492970034562598, + "grad_norm": 4.285600662231445, + "learning_rate": 1.677889669330579e-07, + "loss": 1.9303, + "step": 29046 + }, + { + "epoch": 1.9494312271400287, + "grad_norm": 4.183567047119141, + "learning_rate": 1.6690051521102146e-07, + "loss": 2.1747, + "step": 29048 + }, + { + "epoch": 1.9495654508237978, + "grad_norm": 4.201929092407227, + "learning_rate": 1.6601441800493945e-07, + "loss": 2.0037, + "step": 29050 + }, + { + "epoch": 1.949699674507567, + "grad_norm": 4.068594455718994, + "learning_rate": 1.6513067535668392e-07, + "loss": 2.0566, + "step": 29052 + }, + { + "epoch": 1.949833898191336, + "grad_norm": 3.3694305419921875, + "learning_rate": 1.6424928730801036e-07, + "loss": 1.8221, + "step": 29054 + }, + { + "epoch": 1.9499681218751048, + "grad_norm": 4.543045997619629, + "learning_rate": 1.633702539005577e-07, + "loss": 1.9146, + "step": 29056 + }, + { + "epoch": 1.9501023455588737, + "grad_norm": 3.9835009574890137, + "learning_rate": 1.6249357517585938e-07, + "loss": 1.9061, + "step": 29058 + }, + { + "epoch": 1.9502365692426429, + "grad_norm": 4.592475414276123, + "learning_rate": 1.6161925117533783e-07, + "loss": 1.8573, + "step": 29060 + }, + { + "epoch": 1.950370792926412, + "grad_norm": 5.334604263305664, + "learning_rate": 1.6074728194030442e-07, + "loss": 1.73, + "step": 29062 + }, + { + "epoch": 1.950505016610181, + "grad_norm": 4.398961544036865, + "learning_rate": 1.5987766751195953e-07, + "loss": 1.6225, + "step": 29064 + }, + { + "epoch": 1.9506392402939499, + "grad_norm": 4.216375350952148, + "learning_rate": 1.5901040793138699e-07, + "loss": 1.8223, + "step": 29066 + }, + { + "epoch": 1.9507734639777188, + "grad_norm": 4.124303340911865, + "learning_rate": 1.5814550323957066e-07, + "loss": 2.0437, + "step": 29068 + }, + { + "epoch": 1.950907687661488, + "grad_norm": 4.112043380737305, + "learning_rate": 1.5728295347736123e-07, + "loss": 1.9978, + "step": 29070 + }, + { + "epoch": 1.9510419113452568, + "grad_norm": 4.184831142425537, + "learning_rate": 1.5642275868552602e-07, + "loss": 1.8532, + "step": 29072 + }, + { + "epoch": 1.951176135029026, + "grad_norm": 3.9894673824310303, + "learning_rate": 1.5556491890469927e-07, + "loss": 1.8325, + "step": 29074 + }, + { + "epoch": 1.951310358712795, + "grad_norm": 4.237041473388672, + "learning_rate": 1.5470943417541518e-07, + "loss": 1.6811, + "step": 29076 + }, + { + "epoch": 1.9514445823965638, + "grad_norm": 3.8236868381500244, + "learning_rate": 1.538563045380914e-07, + "loss": 2.0808, + "step": 29078 + }, + { + "epoch": 1.9515788060803327, + "grad_norm": 4.484443664550781, + "learning_rate": 1.5300553003304575e-07, + "loss": 1.8436, + "step": 29080 + }, + { + "epoch": 1.9517130297641019, + "grad_norm": 4.341437816619873, + "learning_rate": 1.521571107004627e-07, + "loss": 1.8862, + "step": 29082 + }, + { + "epoch": 1.951847253447871, + "grad_norm": 4.04935359954834, + "learning_rate": 1.5131104658043794e-07, + "loss": 1.7336, + "step": 29084 + }, + { + "epoch": 1.95198147713164, + "grad_norm": 4.178221225738525, + "learning_rate": 1.5046733771293953e-07, + "loss": 1.6369, + "step": 29086 + }, + { + "epoch": 1.9521157008154089, + "grad_norm": 4.303088665008545, + "learning_rate": 1.4962598413784113e-07, + "loss": 1.8075, + "step": 29088 + }, + { + "epoch": 1.9522499244991778, + "grad_norm": 4.461789131164551, + "learning_rate": 1.4878698589488315e-07, + "loss": 1.7992, + "step": 29090 + }, + { + "epoch": 1.952384148182947, + "grad_norm": 3.927995204925537, + "learning_rate": 1.4795034302371168e-07, + "loss": 2.0684, + "step": 29092 + }, + { + "epoch": 1.9525183718667158, + "grad_norm": 3.820427894592285, + "learning_rate": 1.4711605556385622e-07, + "loss": 1.816, + "step": 29094 + }, + { + "epoch": 1.952652595550485, + "grad_norm": 3.8056044578552246, + "learning_rate": 1.4628412355474076e-07, + "loss": 1.7626, + "step": 29096 + }, + { + "epoch": 1.952786819234254, + "grad_norm": 4.284554481506348, + "learning_rate": 1.4545454703566165e-07, + "loss": 1.8113, + "step": 29098 + }, + { + "epoch": 1.9529210429180228, + "grad_norm": 4.605967998504639, + "learning_rate": 1.4462732604582086e-07, + "loss": 1.96, + "step": 29100 + }, + { + "epoch": 1.9530552666017917, + "grad_norm": 4.965300559997559, + "learning_rate": 1.4380246062430935e-07, + "loss": 1.8811, + "step": 29102 + }, + { + "epoch": 1.9531894902855609, + "grad_norm": 3.858179807662964, + "learning_rate": 1.4297995081008487e-07, + "loss": 1.84, + "step": 29104 + }, + { + "epoch": 1.95332371396933, + "grad_norm": 3.4888997077941895, + "learning_rate": 1.4215979664202183e-07, + "loss": 1.8076, + "step": 29106 + }, + { + "epoch": 1.953457937653099, + "grad_norm": 4.528899669647217, + "learning_rate": 1.4134199815886705e-07, + "loss": 1.9795, + "step": 29108 + }, + { + "epoch": 1.9535921613368679, + "grad_norm": 3.791921377182007, + "learning_rate": 1.4052655539926184e-07, + "loss": 1.647, + "step": 29110 + }, + { + "epoch": 1.9537263850206368, + "grad_norm": 4.414677143096924, + "learning_rate": 1.3971346840173095e-07, + "loss": 1.6601, + "step": 29112 + }, + { + "epoch": 1.953860608704406, + "grad_norm": 3.7441766262054443, + "learning_rate": 1.389027372046936e-07, + "loss": 1.75, + "step": 29114 + }, + { + "epoch": 1.9539948323881748, + "grad_norm": 4.443990707397461, + "learning_rate": 1.3809436184645252e-07, + "loss": 1.9141, + "step": 29116 + }, + { + "epoch": 1.954129056071944, + "grad_norm": 3.96110463142395, + "learning_rate": 1.3728834236520493e-07, + "loss": 1.7623, + "step": 29118 + }, + { + "epoch": 1.954263279755713, + "grad_norm": 3.5471715927124023, + "learning_rate": 1.3648467879902594e-07, + "loss": 1.8937, + "step": 29120 + }, + { + "epoch": 1.9543975034394818, + "grad_norm": 4.0178751945495605, + "learning_rate": 1.356833711859018e-07, + "loss": 2.0781, + "step": 29122 + }, + { + "epoch": 1.9545317271232507, + "grad_norm": 3.71732234954834, + "learning_rate": 1.3488441956368003e-07, + "loss": 1.9837, + "step": 29124 + }, + { + "epoch": 1.9546659508070199, + "grad_norm": 3.9994964599609375, + "learning_rate": 1.3408782397011932e-07, + "loss": 2.0471, + "step": 29126 + }, + { + "epoch": 1.954800174490789, + "grad_norm": 3.2754993438720703, + "learning_rate": 1.3329358444284513e-07, + "loss": 1.6259, + "step": 29128 + }, + { + "epoch": 1.954934398174558, + "grad_norm": 4.243786334991455, + "learning_rate": 1.3250170101939407e-07, + "loss": 1.8222, + "step": 29130 + }, + { + "epoch": 1.9550686218583269, + "grad_norm": 4.139628887176514, + "learning_rate": 1.3171217373717516e-07, + "loss": 1.9362, + "step": 29132 + }, + { + "epoch": 1.9552028455420958, + "grad_norm": 4.3795576095581055, + "learning_rate": 1.309250026334974e-07, + "loss": 1.8128, + "step": 29134 + }, + { + "epoch": 1.955337069225865, + "grad_norm": 4.073946475982666, + "learning_rate": 1.301401877455477e-07, + "loss": 1.7429, + "step": 29136 + }, + { + "epoch": 1.955471292909634, + "grad_norm": 3.92104434967041, + "learning_rate": 1.293577291104131e-07, + "loss": 2.066, + "step": 29138 + }, + { + "epoch": 1.955605516593403, + "grad_norm": 3.9553945064544678, + "learning_rate": 1.285776267650529e-07, + "loss": 1.7181, + "step": 29140 + }, + { + "epoch": 1.955739740277172, + "grad_norm": 4.542988300323486, + "learning_rate": 1.2779988074633765e-07, + "loss": 1.7568, + "step": 29142 + }, + { + "epoch": 1.9558739639609408, + "grad_norm": 3.6680545806884766, + "learning_rate": 1.2702449109100455e-07, + "loss": 1.6721, + "step": 29144 + }, + { + "epoch": 1.95600818764471, + "grad_norm": 4.173490047454834, + "learning_rate": 1.2625145783569658e-07, + "loss": 1.8172, + "step": 29146 + }, + { + "epoch": 1.9561424113284789, + "grad_norm": 4.284706115722656, + "learning_rate": 1.2548078101692894e-07, + "loss": 1.824, + "step": 29148 + }, + { + "epoch": 1.956276635012248, + "grad_norm": 4.545224666595459, + "learning_rate": 1.2471246067112807e-07, + "loss": 1.9169, + "step": 29150 + }, + { + "epoch": 1.956410858696017, + "grad_norm": 4.005102634429932, + "learning_rate": 1.239464968345816e-07, + "loss": 1.9072, + "step": 29152 + }, + { + "epoch": 1.9565450823797859, + "grad_norm": 4.072662353515625, + "learning_rate": 1.2318288954348833e-07, + "loss": 1.6863, + "step": 29154 + }, + { + "epoch": 1.9566793060635548, + "grad_norm": 4.573484897613525, + "learning_rate": 1.22421638833925e-07, + "loss": 1.9564, + "step": 29156 + }, + { + "epoch": 1.956813529747324, + "grad_norm": 4.331247329711914, + "learning_rate": 1.216627447418628e-07, + "loss": 1.8578, + "step": 29158 + }, + { + "epoch": 1.956947753431093, + "grad_norm": 3.931947708129883, + "learning_rate": 1.2090620730315084e-07, + "loss": 1.9219, + "step": 29160 + }, + { + "epoch": 1.957081977114862, + "grad_norm": 3.8955957889556885, + "learning_rate": 1.201520265535383e-07, + "loss": 1.7858, + "step": 29162 + }, + { + "epoch": 1.957216200798631, + "grad_norm": 4.186088562011719, + "learning_rate": 1.1940020252865226e-07, + "loss": 2.0051, + "step": 29164 + }, + { + "epoch": 1.9573504244823998, + "grad_norm": 4.116036415100098, + "learning_rate": 1.1865073526402537e-07, + "loss": 1.8987, + "step": 29166 + }, + { + "epoch": 1.957484648166169, + "grad_norm": 4.261275291442871, + "learning_rate": 1.1790362479506822e-07, + "loss": 1.5642, + "step": 29168 + }, + { + "epoch": 1.9576188718499379, + "grad_norm": 4.050167083740234, + "learning_rate": 1.1715887115706926e-07, + "loss": 2.0733, + "step": 29170 + }, + { + "epoch": 1.957753095533707, + "grad_norm": 3.8869664669036865, + "learning_rate": 1.1641647438522807e-07, + "loss": 1.8061, + "step": 29172 + }, + { + "epoch": 1.957887319217476, + "grad_norm": 3.8428244590759277, + "learning_rate": 1.1567643451461108e-07, + "loss": 1.8411, + "step": 29174 + }, + { + "epoch": 1.9580215429012449, + "grad_norm": 4.075674057006836, + "learning_rate": 1.1493875158019584e-07, + "loss": 1.7563, + "step": 29176 + }, + { + "epoch": 1.9581557665850138, + "grad_norm": 4.479036331176758, + "learning_rate": 1.1420342561682673e-07, + "loss": 1.8669, + "step": 29178 + }, + { + "epoch": 1.958289990268783, + "grad_norm": 3.9039499759674072, + "learning_rate": 1.1347045665924816e-07, + "loss": 1.8793, + "step": 29180 + }, + { + "epoch": 1.958424213952552, + "grad_norm": 3.7028026580810547, + "learning_rate": 1.1273984474209354e-07, + "loss": 1.8327, + "step": 29182 + }, + { + "epoch": 1.958558437636321, + "grad_norm": 4.0977983474731445, + "learning_rate": 1.1201158989988525e-07, + "loss": 1.9055, + "step": 29184 + }, + { + "epoch": 1.95869266132009, + "grad_norm": 3.8750381469726562, + "learning_rate": 1.1128569216702356e-07, + "loss": 1.7748, + "step": 29186 + }, + { + "epoch": 1.9588268850038588, + "grad_norm": 3.9880709648132324, + "learning_rate": 1.1056215157781435e-07, + "loss": 1.8438, + "step": 29188 + }, + { + "epoch": 1.958961108687628, + "grad_norm": 3.7470169067382812, + "learning_rate": 1.098409681664414e-07, + "loss": 1.5854, + "step": 29190 + }, + { + "epoch": 1.9590953323713969, + "grad_norm": 4.415503978729248, + "learning_rate": 1.091221419669719e-07, + "loss": 1.7414, + "step": 29192 + }, + { + "epoch": 1.959229556055166, + "grad_norm": 3.597207546234131, + "learning_rate": 1.0840567301337868e-07, + "loss": 1.7807, + "step": 29194 + }, + { + "epoch": 1.959363779738935, + "grad_norm": 4.286258697509766, + "learning_rate": 1.0769156133951241e-07, + "loss": 1.7368, + "step": 29196 + }, + { + "epoch": 1.9594980034227039, + "grad_norm": 4.605869770050049, + "learning_rate": 1.0697980697910725e-07, + "loss": 1.7923, + "step": 29198 + }, + { + "epoch": 1.9596322271064728, + "grad_norm": 4.322166919708252, + "learning_rate": 1.0627040996579741e-07, + "loss": 1.6849, + "step": 29200 + }, + { + "epoch": 1.959766450790242, + "grad_norm": 4.087263107299805, + "learning_rate": 1.0556337033310048e-07, + "loss": 1.6722, + "step": 29202 + }, + { + "epoch": 1.959900674474011, + "grad_norm": 4.6926069259643555, + "learning_rate": 1.0485868811441757e-07, + "loss": 1.8517, + "step": 29204 + }, + { + "epoch": 1.96003489815778, + "grad_norm": 4.301548004150391, + "learning_rate": 1.0415636334304979e-07, + "loss": 1.9965, + "step": 29206 + }, + { + "epoch": 1.960169121841549, + "grad_norm": 4.30197286605835, + "learning_rate": 1.0345639605217616e-07, + "loss": 1.8995, + "step": 29208 + }, + { + "epoch": 1.9603033455253178, + "grad_norm": 3.8802034854888916, + "learning_rate": 1.0275878627487023e-07, + "loss": 1.8262, + "step": 29210 + }, + { + "epoch": 1.960437569209087, + "grad_norm": 3.7146859169006348, + "learning_rate": 1.0206353404409452e-07, + "loss": 1.7883, + "step": 29212 + }, + { + "epoch": 1.960571792892856, + "grad_norm": 4.468938827514648, + "learning_rate": 1.0137063939269497e-07, + "loss": 1.9816, + "step": 29214 + }, + { + "epoch": 1.960706016576625, + "grad_norm": 4.362532138824463, + "learning_rate": 1.0068010235341208e-07, + "loss": 1.7548, + "step": 29216 + }, + { + "epoch": 1.960840240260394, + "grad_norm": 3.7371408939361572, + "learning_rate": 9.999192295886972e-08, + "loss": 1.9027, + "step": 29218 + }, + { + "epoch": 1.9609744639441629, + "grad_norm": 3.8390161991119385, + "learning_rate": 9.930610124158634e-08, + "loss": 1.9322, + "step": 29220 + }, + { + "epoch": 1.961108687627932, + "grad_norm": 4.315835475921631, + "learning_rate": 9.862263723396382e-08, + "loss": 2.2053, + "step": 29222 + }, + { + "epoch": 1.961242911311701, + "grad_norm": 4.424888610839844, + "learning_rate": 9.79415309682985e-08, + "loss": 2.053, + "step": 29224 + }, + { + "epoch": 1.96137713499547, + "grad_norm": 3.729285478591919, + "learning_rate": 9.726278247676468e-08, + "loss": 1.6386, + "step": 29226 + }, + { + "epoch": 1.961511358679239, + "grad_norm": 4.087092876434326, + "learning_rate": 9.658639179143669e-08, + "loss": 1.8018, + "step": 29228 + }, + { + "epoch": 1.961645582363008, + "grad_norm": 4.878542900085449, + "learning_rate": 9.591235894426675e-08, + "loss": 1.67, + "step": 29230 + }, + { + "epoch": 1.9617798060467768, + "grad_norm": 4.005993843078613, + "learning_rate": 9.524068396710717e-08, + "loss": 2.0214, + "step": 29232 + }, + { + "epoch": 1.961914029730546, + "grad_norm": 3.5923447608947754, + "learning_rate": 9.457136689169366e-08, + "loss": 1.6137, + "step": 29234 + }, + { + "epoch": 1.962048253414315, + "grad_norm": 5.2312188148498535, + "learning_rate": 9.390440774965092e-08, + "loss": 2.0556, + "step": 29236 + }, + { + "epoch": 1.962182477098084, + "grad_norm": 3.995424509048462, + "learning_rate": 9.323980657248154e-08, + "loss": 1.8854, + "step": 29238 + }, + { + "epoch": 1.962316700781853, + "grad_norm": 3.8016164302825928, + "learning_rate": 9.257756339159929e-08, + "loss": 1.6943, + "step": 29240 + }, + { + "epoch": 1.9624509244656219, + "grad_norm": 4.778343677520752, + "learning_rate": 9.191767823828467e-08, + "loss": 1.8833, + "step": 29242 + }, + { + "epoch": 1.962585148149391, + "grad_norm": 4.175442695617676, + "learning_rate": 9.126015114372388e-08, + "loss": 1.9024, + "step": 29244 + }, + { + "epoch": 1.96271937183316, + "grad_norm": 3.6427643299102783, + "learning_rate": 9.060498213897539e-08, + "loss": 1.6995, + "step": 29246 + }, + { + "epoch": 1.962853595516929, + "grad_norm": 3.6806724071502686, + "learning_rate": 8.995217125500333e-08, + "loss": 1.7409, + "step": 29248 + }, + { + "epoch": 1.962987819200698, + "grad_norm": 3.933912992477417, + "learning_rate": 8.930171852264413e-08, + "loss": 1.6342, + "step": 29250 + }, + { + "epoch": 1.963122042884467, + "grad_norm": 3.942666530609131, + "learning_rate": 8.865362397263433e-08, + "loss": 1.7498, + "step": 29252 + }, + { + "epoch": 1.9632562665682358, + "grad_norm": 4.563726425170898, + "learning_rate": 8.800788763559386e-08, + "loss": 1.9466, + "step": 29254 + }, + { + "epoch": 1.963390490252005, + "grad_norm": 3.6302618980407715, + "learning_rate": 8.736450954203168e-08, + "loss": 1.6798, + "step": 29256 + }, + { + "epoch": 1.963524713935774, + "grad_norm": 3.6679115295410156, + "learning_rate": 8.672348972235122e-08, + "loss": 1.662, + "step": 29258 + }, + { + "epoch": 1.963658937619543, + "grad_norm": 4.717917442321777, + "learning_rate": 8.608482820682828e-08, + "loss": 1.9436, + "step": 29260 + }, + { + "epoch": 1.963793161303312, + "grad_norm": 4.520405292510986, + "learning_rate": 8.544852502565537e-08, + "loss": 1.7749, + "step": 29262 + }, + { + "epoch": 1.9639273849870809, + "grad_norm": 3.405221939086914, + "learning_rate": 8.481458020888066e-08, + "loss": 1.8012, + "step": 29264 + }, + { + "epoch": 1.96406160867085, + "grad_norm": 4.529057502746582, + "learning_rate": 8.418299378646355e-08, + "loss": 2.0769, + "step": 29266 + }, + { + "epoch": 1.964195832354619, + "grad_norm": 3.8615598678588867, + "learning_rate": 8.355376578824681e-08, + "loss": 1.6788, + "step": 29268 + }, + { + "epoch": 1.964330056038388, + "grad_norm": 4.274468898773193, + "learning_rate": 8.292689624395666e-08, + "loss": 1.7431, + "step": 29270 + }, + { + "epoch": 1.964464279722157, + "grad_norm": 3.878342866897583, + "learning_rate": 8.230238518321387e-08, + "loss": 1.8002, + "step": 29272 + }, + { + "epoch": 1.964598503405926, + "grad_norm": 4.122524261474609, + "learning_rate": 8.168023263552815e-08, + "loss": 1.7847, + "step": 29274 + }, + { + "epoch": 1.9647327270896948, + "grad_norm": 3.8958003520965576, + "learning_rate": 8.106043863028157e-08, + "loss": 1.8263, + "step": 29276 + }, + { + "epoch": 1.964866950773464, + "grad_norm": 4.117306232452393, + "learning_rate": 8.044300319677844e-08, + "loss": 1.8155, + "step": 29278 + }, + { + "epoch": 1.965001174457233, + "grad_norm": 4.295535087585449, + "learning_rate": 7.982792636417324e-08, + "loss": 1.6628, + "step": 29280 + }, + { + "epoch": 1.965135398141002, + "grad_norm": 4.4148406982421875, + "learning_rate": 7.921520816153716e-08, + "loss": 1.8507, + "step": 29282 + }, + { + "epoch": 1.965269621824771, + "grad_norm": 6.329296588897705, + "learning_rate": 7.860484861781925e-08, + "loss": 1.8012, + "step": 29284 + }, + { + "epoch": 1.9654038455085399, + "grad_norm": 4.084864139556885, + "learning_rate": 7.799684776185201e-08, + "loss": 1.8414, + "step": 29286 + }, + { + "epoch": 1.965538069192309, + "grad_norm": 4.650043964385986, + "learning_rate": 7.739120562236802e-08, + "loss": 2.0301, + "step": 29288 + }, + { + "epoch": 1.9656722928760781, + "grad_norm": 3.944263219833374, + "learning_rate": 7.678792222798325e-08, + "loss": 1.6905, + "step": 29290 + }, + { + "epoch": 1.965806516559847, + "grad_norm": 4.124500751495361, + "learning_rate": 7.618699760719716e-08, + "loss": 1.6935, + "step": 29292 + }, + { + "epoch": 1.965940740243616, + "grad_norm": 4.3155412673950195, + "learning_rate": 7.558843178840924e-08, + "loss": 1.7935, + "step": 29294 + }, + { + "epoch": 1.966074963927385, + "grad_norm": 3.714954376220703, + "learning_rate": 7.499222479989132e-08, + "loss": 1.5847, + "step": 29296 + }, + { + "epoch": 1.966209187611154, + "grad_norm": 4.3791584968566895, + "learning_rate": 7.43983766698153e-08, + "loss": 1.9045, + "step": 29298 + }, + { + "epoch": 1.966343411294923, + "grad_norm": 4.322763919830322, + "learning_rate": 7.380688742624209e-08, + "loss": 1.7843, + "step": 29300 + }, + { + "epoch": 1.966477634978692, + "grad_norm": 4.230788707733154, + "learning_rate": 7.321775709712153e-08, + "loss": 1.7724, + "step": 29302 + }, + { + "epoch": 1.966611858662461, + "grad_norm": 4.3509650230407715, + "learning_rate": 7.263098571028138e-08, + "loss": 2.1059, + "step": 29304 + }, + { + "epoch": 1.96674608234623, + "grad_norm": 4.111655235290527, + "learning_rate": 7.204657329345498e-08, + "loss": 1.9234, + "step": 29306 + }, + { + "epoch": 1.9668803060299989, + "grad_norm": 4.216988563537598, + "learning_rate": 7.146451987424252e-08, + "loss": 1.8949, + "step": 29308 + }, + { + "epoch": 1.967014529713768, + "grad_norm": 4.3796868324279785, + "learning_rate": 7.088482548015529e-08, + "loss": 2.0509, + "step": 29310 + }, + { + "epoch": 1.9671487533975371, + "grad_norm": 7.395297527313232, + "learning_rate": 7.030749013857696e-08, + "loss": 1.6242, + "step": 29312 + }, + { + "epoch": 1.967282977081306, + "grad_norm": 3.367982864379883, + "learning_rate": 6.97325138767857e-08, + "loss": 1.6174, + "step": 29314 + }, + { + "epoch": 1.967417200765075, + "grad_norm": 3.9487249851226807, + "learning_rate": 6.915989672195422e-08, + "loss": 1.7346, + "step": 29316 + }, + { + "epoch": 1.967551424448844, + "grad_norm": 3.947474241256714, + "learning_rate": 6.858963870112756e-08, + "loss": 1.8813, + "step": 29318 + }, + { + "epoch": 1.967685648132613, + "grad_norm": 4.761293888092041, + "learning_rate": 6.802173984125637e-08, + "loss": 2.0152, + "step": 29320 + }, + { + "epoch": 1.967819871816382, + "grad_norm": 3.6658337116241455, + "learning_rate": 6.745620016917476e-08, + "loss": 1.7409, + "step": 29322 + }, + { + "epoch": 1.967954095500151, + "grad_norm": 3.5355889797210693, + "learning_rate": 6.689301971159467e-08, + "loss": 1.7673, + "step": 29324 + }, + { + "epoch": 1.96808831918392, + "grad_norm": 4.36375617980957, + "learning_rate": 6.633219849513372e-08, + "loss": 1.9764, + "step": 29326 + }, + { + "epoch": 1.968222542867689, + "grad_norm": 4.336069583892822, + "learning_rate": 6.577373654628183e-08, + "loss": 2.0967, + "step": 29328 + }, + { + "epoch": 1.9683567665514579, + "grad_norm": 4.329853057861328, + "learning_rate": 6.521763389142899e-08, + "loss": 1.7967, + "step": 29330 + }, + { + "epoch": 1.968490990235227, + "grad_norm": 4.050182342529297, + "learning_rate": 6.466389055685418e-08, + "loss": 2.0022, + "step": 29332 + }, + { + "epoch": 1.9686252139189961, + "grad_norm": 4.023314476013184, + "learning_rate": 6.411250656871426e-08, + "loss": 1.9266, + "step": 29334 + }, + { + "epoch": 1.968759437602765, + "grad_norm": 4.099867820739746, + "learning_rate": 6.356348195306616e-08, + "loss": 1.8429, + "step": 29336 + }, + { + "epoch": 1.968893661286534, + "grad_norm": 4.821789264678955, + "learning_rate": 6.301681673585025e-08, + "loss": 2.1092, + "step": 29338 + }, + { + "epoch": 1.969027884970303, + "grad_norm": 5.0040059089660645, + "learning_rate": 6.24725109428903e-08, + "loss": 1.8724, + "step": 29340 + }, + { + "epoch": 1.969162108654072, + "grad_norm": 4.142021656036377, + "learning_rate": 6.193056459990465e-08, + "loss": 1.954, + "step": 29342 + }, + { + "epoch": 1.969296332337841, + "grad_norm": 3.9227705001831055, + "learning_rate": 6.139097773250057e-08, + "loss": 1.7808, + "step": 29344 + }, + { + "epoch": 1.96943055602161, + "grad_norm": 3.9302680492401123, + "learning_rate": 6.085375036617436e-08, + "loss": 1.7736, + "step": 29346 + }, + { + "epoch": 1.969564779705379, + "grad_norm": 4.064699172973633, + "learning_rate": 6.031888252630569e-08, + "loss": 1.7423, + "step": 29348 + }, + { + "epoch": 1.969699003389148, + "grad_norm": 4.062044143676758, + "learning_rate": 5.97863742381688e-08, + "loss": 1.8943, + "step": 29350 + }, + { + "epoch": 1.9698332270729169, + "grad_norm": 4.140050888061523, + "learning_rate": 5.9256225526921336e-08, + "loss": 1.9941, + "step": 29352 + }, + { + "epoch": 1.969967450756686, + "grad_norm": 4.384344577789307, + "learning_rate": 5.8728436417615494e-08, + "loss": 1.7251, + "step": 29354 + }, + { + "epoch": 1.9701016744404551, + "grad_norm": 4.116833209991455, + "learning_rate": 5.820300693518133e-08, + "loss": 1.8484, + "step": 29356 + }, + { + "epoch": 1.970235898124224, + "grad_norm": 5.185526371002197, + "learning_rate": 5.7679937104454516e-08, + "loss": 1.7822, + "step": 29358 + }, + { + "epoch": 1.970370121807993, + "grad_norm": 4.1709489822387695, + "learning_rate": 5.715922695013753e-08, + "loss": 1.8904, + "step": 29360 + }, + { + "epoch": 1.970504345491762, + "grad_norm": 3.8374788761138916, + "learning_rate": 5.664087649684402e-08, + "loss": 1.6934, + "step": 29362 + }, + { + "epoch": 1.970638569175531, + "grad_norm": 4.302660942077637, + "learning_rate": 5.6124885769054394e-08, + "loss": 1.8706, + "step": 29364 + }, + { + "epoch": 1.9707727928593002, + "grad_norm": 3.9799492359161377, + "learning_rate": 5.5611254791154696e-08, + "loss": 1.6773, + "step": 29366 + }, + { + "epoch": 1.970907016543069, + "grad_norm": 4.251739025115967, + "learning_rate": 5.509998358741442e-08, + "loss": 1.645, + "step": 29368 + }, + { + "epoch": 1.971041240226838, + "grad_norm": 5.106240749359131, + "learning_rate": 5.4591072181986444e-08, + "loss": 1.9109, + "step": 29370 + }, + { + "epoch": 1.971175463910607, + "grad_norm": 4.209507465362549, + "learning_rate": 5.408452059891822e-08, + "loss": 1.6424, + "step": 29372 + }, + { + "epoch": 1.971309687594376, + "grad_norm": 4.450859546661377, + "learning_rate": 5.358032886214059e-08, + "loss": 1.7524, + "step": 29374 + }, + { + "epoch": 1.971443911278145, + "grad_norm": 3.745793581008911, + "learning_rate": 5.307849699547895e-08, + "loss": 1.823, + "step": 29376 + }, + { + "epoch": 1.9715781349619141, + "grad_norm": 4.4223127365112305, + "learning_rate": 5.257902502263656e-08, + "loss": 1.8909, + "step": 29378 + }, + { + "epoch": 1.971712358645683, + "grad_norm": 4.195838928222656, + "learning_rate": 5.208191296722231e-08, + "loss": 1.8839, + "step": 29380 + }, + { + "epoch": 1.971846582329452, + "grad_norm": 4.462430000305176, + "learning_rate": 5.158716085271742e-08, + "loss": 1.7419, + "step": 29382 + }, + { + "epoch": 1.971980806013221, + "grad_norm": 4.002575397491455, + "learning_rate": 5.109476870250318e-08, + "loss": 1.7262, + "step": 29384 + }, + { + "epoch": 1.97211502969699, + "grad_norm": 3.6755526065826416, + "learning_rate": 5.060473653983877e-08, + "loss": 1.8925, + "step": 29386 + }, + { + "epoch": 1.9722492533807592, + "grad_norm": 4.210241794586182, + "learning_rate": 5.0117064387877885e-08, + "loss": 1.9265, + "step": 29388 + }, + { + "epoch": 1.972383477064528, + "grad_norm": 4.501154899597168, + "learning_rate": 4.9631752269663213e-08, + "loss": 1.902, + "step": 29390 + }, + { + "epoch": 1.972517700748297, + "grad_norm": 5.578855514526367, + "learning_rate": 4.91488002081264e-08, + "loss": 1.6841, + "step": 29392 + }, + { + "epoch": 1.972651924432066, + "grad_norm": 3.856826066970825, + "learning_rate": 4.8668208226088085e-08, + "loss": 1.466, + "step": 29394 + }, + { + "epoch": 1.972786148115835, + "grad_norm": 4.121853828430176, + "learning_rate": 4.818997634624678e-08, + "loss": 2.0004, + "step": 29396 + }, + { + "epoch": 1.972920371799604, + "grad_norm": 4.661076068878174, + "learning_rate": 4.771410459120662e-08, + "loss": 1.8289, + "step": 29398 + }, + { + "epoch": 1.9730545954833731, + "grad_norm": 3.9281017780303955, + "learning_rate": 4.724059298344408e-08, + "loss": 1.8187, + "step": 29400 + }, + { + "epoch": 1.973188819167142, + "grad_norm": 8.155288696289062, + "learning_rate": 4.676944154533569e-08, + "loss": 1.8476, + "step": 29402 + }, + { + "epoch": 1.973323042850911, + "grad_norm": 4.434152603149414, + "learning_rate": 4.630065029914698e-08, + "loss": 1.8623, + "step": 29404 + }, + { + "epoch": 1.97345726653468, + "grad_norm": 4.132818698883057, + "learning_rate": 4.583421926701581e-08, + "loss": 1.6732, + "step": 29406 + }, + { + "epoch": 1.973591490218449, + "grad_norm": 4.169618129730225, + "learning_rate": 4.537014847099119e-08, + "loss": 1.8914, + "step": 29408 + }, + { + "epoch": 1.9737257139022182, + "grad_norm": 4.225330829620361, + "learning_rate": 4.490843793300003e-08, + "loss": 1.9229, + "step": 29410 + }, + { + "epoch": 1.973859937585987, + "grad_norm": 3.933213472366333, + "learning_rate": 4.4449087674847125e-08, + "loss": 1.6659, + "step": 29412 + }, + { + "epoch": 1.973994161269756, + "grad_norm": 3.9753267765045166, + "learning_rate": 4.399209771824287e-08, + "loss": 1.8683, + "step": 29414 + }, + { + "epoch": 1.974128384953525, + "grad_norm": 3.8325247764587402, + "learning_rate": 4.353746808477554e-08, + "loss": 1.9031, + "step": 29416 + }, + { + "epoch": 1.974262608637294, + "grad_norm": 3.736541986465454, + "learning_rate": 4.3085198795933536e-08, + "loss": 1.866, + "step": 29418 + }, + { + "epoch": 1.974396832321063, + "grad_norm": 4.494192123413086, + "learning_rate": 4.263528987307197e-08, + "loss": 1.8003, + "step": 29420 + }, + { + "epoch": 1.9745310560048321, + "grad_norm": 4.189336776733398, + "learning_rate": 4.2187741337462724e-08, + "loss": 1.7168, + "step": 29422 + }, + { + "epoch": 1.974665279688601, + "grad_norm": 4.528428554534912, + "learning_rate": 4.1742553210238896e-08, + "loss": 2.0395, + "step": 29424 + }, + { + "epoch": 1.97479950337237, + "grad_norm": 4.295046329498291, + "learning_rate": 4.129972551244476e-08, + "loss": 1.6289, + "step": 29426 + }, + { + "epoch": 1.974933727056139, + "grad_norm": 3.852415084838867, + "learning_rate": 4.085925826499692e-08, + "loss": 2.0468, + "step": 29428 + }, + { + "epoch": 1.975067950739908, + "grad_norm": 4.388916492462158, + "learning_rate": 4.0421151488712064e-08, + "loss": 1.9881, + "step": 29430 + }, + { + "epoch": 1.9752021744236772, + "grad_norm": 4.272842884063721, + "learning_rate": 3.998540520428473e-08, + "loss": 2.0091, + "step": 29432 + }, + { + "epoch": 1.975336398107446, + "grad_norm": 4.262409210205078, + "learning_rate": 3.955201943230402e-08, + "loss": 1.8998, + "step": 29434 + }, + { + "epoch": 1.975470621791215, + "grad_norm": 3.7661893367767334, + "learning_rate": 3.9120994193247994e-08, + "loss": 1.8211, + "step": 29436 + }, + { + "epoch": 1.975604845474984, + "grad_norm": 4.589046001434326, + "learning_rate": 3.869232950747814e-08, + "loss": 1.8633, + "step": 29438 + }, + { + "epoch": 1.975739069158753, + "grad_norm": 3.996811866760254, + "learning_rate": 3.826602539525603e-08, + "loss": 1.544, + "step": 29440 + }, + { + "epoch": 1.9758732928425222, + "grad_norm": 3.684903860092163, + "learning_rate": 3.784208187671556e-08, + "loss": 1.7598, + "step": 29442 + }, + { + "epoch": 1.9760075165262911, + "grad_norm": 4.157538414001465, + "learning_rate": 3.7420498971890706e-08, + "loss": 1.6971, + "step": 29444 + }, + { + "epoch": 1.97614174021006, + "grad_norm": 4.284748554229736, + "learning_rate": 3.700127670070441e-08, + "loss": 1.7937, + "step": 29446 + }, + { + "epoch": 1.976275963893829, + "grad_norm": 3.8541464805603027, + "learning_rate": 3.658441508295196e-08, + "loss": 1.9233, + "step": 29448 + }, + { + "epoch": 1.9764101875775981, + "grad_norm": 3.884507656097412, + "learning_rate": 3.616991413834536e-08, + "loss": 2.0774, + "step": 29450 + }, + { + "epoch": 1.976544411261367, + "grad_norm": 4.018420219421387, + "learning_rate": 3.57577738864523e-08, + "loss": 2.0779, + "step": 29452 + }, + { + "epoch": 1.9766786349451362, + "grad_norm": 4.54725456237793, + "learning_rate": 3.534799434676273e-08, + "loss": 1.9882, + "step": 29454 + }, + { + "epoch": 1.976812858628905, + "grad_norm": 3.909823417663574, + "learning_rate": 3.494057553862229e-08, + "loss": 1.9374, + "step": 29456 + }, + { + "epoch": 1.976947082312674, + "grad_norm": 4.069825172424316, + "learning_rate": 3.453551748128781e-08, + "loss": 1.8533, + "step": 29458 + }, + { + "epoch": 1.977081305996443, + "grad_norm": 3.583383798599243, + "learning_rate": 3.4132820193899514e-08, + "loss": 1.6688, + "step": 29460 + }, + { + "epoch": 1.977215529680212, + "grad_norm": 4.061973571777344, + "learning_rate": 3.3732483695481097e-08, + "loss": 1.8227, + "step": 29462 + }, + { + "epoch": 1.9773497533639812, + "grad_norm": 5.814359664916992, + "learning_rate": 3.333450800495075e-08, + "loss": 1.7829, + "step": 29464 + }, + { + "epoch": 1.9774839770477501, + "grad_norm": 4.3631978034973145, + "learning_rate": 3.2938893141110094e-08, + "loss": 1.8033, + "step": 29466 + }, + { + "epoch": 1.977618200731519, + "grad_norm": 4.3401665687561035, + "learning_rate": 3.254563912264419e-08, + "loss": 2.0844, + "step": 29468 + }, + { + "epoch": 1.977752424415288, + "grad_norm": 3.849046230316162, + "learning_rate": 3.215474596814372e-08, + "loss": 1.9837, + "step": 29470 + }, + { + "epoch": 1.9778866480990571, + "grad_norm": 3.671442985534668, + "learning_rate": 3.176621369607724e-08, + "loss": 1.8017, + "step": 29472 + }, + { + "epoch": 1.978020871782826, + "grad_norm": 4.192652702331543, + "learning_rate": 3.138004232479674e-08, + "loss": 1.8448, + "step": 29474 + }, + { + "epoch": 1.9781550954665952, + "grad_norm": 4.265591621398926, + "learning_rate": 3.099623187254874e-08, + "loss": 1.8285, + "step": 29476 + }, + { + "epoch": 1.978289319150364, + "grad_norm": 4.504610061645508, + "learning_rate": 3.061478235746873e-08, + "loss": 1.826, + "step": 29478 + }, + { + "epoch": 1.978423542834133, + "grad_norm": 3.8458681106567383, + "learning_rate": 3.023569379758118e-08, + "loss": 1.9606, + "step": 29480 + }, + { + "epoch": 1.978557766517902, + "grad_norm": 3.928490161895752, + "learning_rate": 2.985896621079398e-08, + "loss": 1.7282, + "step": 29482 + }, + { + "epoch": 1.978691990201671, + "grad_norm": 4.065850257873535, + "learning_rate": 2.948459961490957e-08, + "loss": 1.8606, + "step": 29484 + }, + { + "epoch": 1.9788262138854402, + "grad_norm": 3.953895330429077, + "learning_rate": 2.9112594027619346e-08, + "loss": 1.8839, + "step": 29486 + }, + { + "epoch": 1.9789604375692091, + "grad_norm": 4.2289347648620605, + "learning_rate": 2.8742949466487036e-08, + "loss": 1.6469, + "step": 29488 + }, + { + "epoch": 1.979094661252978, + "grad_norm": 3.988835334777832, + "learning_rate": 2.8375665948993103e-08, + "loss": 1.8021, + "step": 29490 + }, + { + "epoch": 1.979228884936747, + "grad_norm": 4.20178747177124, + "learning_rate": 2.801074349247923e-08, + "loss": 1.8042, + "step": 29492 + }, + { + "epoch": 1.9793631086205161, + "grad_norm": 3.4687113761901855, + "learning_rate": 2.7648182114198285e-08, + "loss": 1.7997, + "step": 29494 + }, + { + "epoch": 1.979497332304285, + "grad_norm": 4.479478359222412, + "learning_rate": 2.7287981831269905e-08, + "loss": 1.7524, + "step": 29496 + }, + { + "epoch": 1.9796315559880542, + "grad_norm": 4.02789306640625, + "learning_rate": 2.693014266071381e-08, + "loss": 1.5581, + "step": 29498 + }, + { + "epoch": 1.979765779671823, + "grad_norm": 3.697138547897339, + "learning_rate": 2.6574664619444244e-08, + "loss": 1.9768, + "step": 29500 + }, + { + "epoch": 1.979900003355592, + "grad_norm": 4.3648905754089355, + "learning_rate": 2.6221547724253337e-08, + "loss": 1.9286, + "step": 29502 + }, + { + "epoch": 1.980034227039361, + "grad_norm": 4.258299350738525, + "learning_rate": 2.5870791991827737e-08, + "loss": 2.1362, + "step": 29504 + }, + { + "epoch": 1.98016845072313, + "grad_norm": 3.9726617336273193, + "learning_rate": 2.552239743873197e-08, + "loss": 1.7304, + "step": 29506 + }, + { + "epoch": 1.9803026744068992, + "grad_norm": 4.231447219848633, + "learning_rate": 2.51763640814362e-08, + "loss": 1.7713, + "step": 29508 + }, + { + "epoch": 1.9804368980906681, + "grad_norm": 4.064836025238037, + "learning_rate": 2.4832691936282902e-08, + "loss": 1.7288, + "step": 29510 + }, + { + "epoch": 1.980571121774437, + "grad_norm": 3.9222493171691895, + "learning_rate": 2.4491381019520198e-08, + "loss": 1.6214, + "step": 29512 + }, + { + "epoch": 1.980705345458206, + "grad_norm": 4.589057445526123, + "learning_rate": 2.415243134725742e-08, + "loss": 1.9895, + "step": 29514 + }, + { + "epoch": 1.9808395691419751, + "grad_norm": 3.726539373397827, + "learning_rate": 2.381584293552619e-08, + "loss": 1.978, + "step": 29516 + }, + { + "epoch": 1.9809737928257443, + "grad_norm": 4.046703815460205, + "learning_rate": 2.3481615800219347e-08, + "loss": 1.6991, + "step": 29518 + }, + { + "epoch": 1.9811080165095132, + "grad_norm": 4.275355815887451, + "learning_rate": 2.3149749957129818e-08, + "loss": 1.7271, + "step": 29520 + }, + { + "epoch": 1.981242240193282, + "grad_norm": 7.331688404083252, + "learning_rate": 2.28202454219395e-08, + "loss": 1.7117, + "step": 29522 + }, + { + "epoch": 1.981376463877051, + "grad_norm": 4.386005878448486, + "learning_rate": 2.2493102210219275e-08, + "loss": 1.7982, + "step": 29524 + }, + { + "epoch": 1.9815106875608202, + "grad_norm": 4.5647478103637695, + "learning_rate": 2.2168320337423442e-08, + "loss": 1.8571, + "step": 29526 + }, + { + "epoch": 1.981644911244589, + "grad_norm": 4.0511040687561035, + "learning_rate": 2.1845899818895287e-08, + "loss": 1.7409, + "step": 29528 + }, + { + "epoch": 1.9817791349283582, + "grad_norm": 3.9524447917938232, + "learning_rate": 2.152584066987262e-08, + "loss": 1.8627, + "step": 29530 + }, + { + "epoch": 1.9819133586121271, + "grad_norm": 3.7001280784606934, + "learning_rate": 2.120814290547668e-08, + "loss": 1.8956, + "step": 29532 + }, + { + "epoch": 1.982047582295896, + "grad_norm": 4.451706886291504, + "learning_rate": 2.089280654071213e-08, + "loss": 1.7439, + "step": 29534 + }, + { + "epoch": 1.982181805979665, + "grad_norm": 4.376474380493164, + "learning_rate": 2.057983159048926e-08, + "loss": 1.8919, + "step": 29536 + }, + { + "epoch": 1.9823160296634341, + "grad_norm": 4.32984733581543, + "learning_rate": 2.026921806958515e-08, + "loss": 1.8923, + "step": 29538 + }, + { + "epoch": 1.9824502533472033, + "grad_norm": 4.28172492980957, + "learning_rate": 1.996096599267694e-08, + "loss": 1.9309, + "step": 29540 + }, + { + "epoch": 1.9825844770309722, + "grad_norm": 4.172758102416992, + "learning_rate": 1.965507537433631e-08, + "loss": 1.9825, + "step": 29542 + }, + { + "epoch": 1.982718700714741, + "grad_norm": 3.9112813472747803, + "learning_rate": 1.9351546229007256e-08, + "loss": 1.7502, + "step": 29544 + }, + { + "epoch": 1.98285292439851, + "grad_norm": 4.365701198577881, + "learning_rate": 1.9050378571039418e-08, + "loss": 1.7561, + "step": 29546 + }, + { + "epoch": 1.9829871480822792, + "grad_norm": 4.418905258178711, + "learning_rate": 1.875157241465475e-08, + "loss": 1.9901, + "step": 29548 + }, + { + "epoch": 1.983121371766048, + "grad_norm": 3.778115749359131, + "learning_rate": 1.845512777397529e-08, + "loss": 1.9193, + "step": 29550 + }, + { + "epoch": 1.9832555954498172, + "grad_norm": 4.6405487060546875, + "learning_rate": 1.8161044663000948e-08, + "loss": 1.6777, + "step": 29552 + }, + { + "epoch": 1.9833898191335861, + "grad_norm": 4.70914888381958, + "learning_rate": 1.786932309564282e-08, + "loss": 1.8936, + "step": 29554 + }, + { + "epoch": 1.983524042817355, + "grad_norm": 3.7357473373413086, + "learning_rate": 1.7579963085667672e-08, + "loss": 1.8765, + "step": 29556 + }, + { + "epoch": 1.983658266501124, + "grad_norm": 3.735499858856201, + "learning_rate": 1.7292964646753453e-08, + "loss": 1.701, + "step": 29558 + }, + { + "epoch": 1.9837924901848931, + "grad_norm": 4.214782238006592, + "learning_rate": 1.700832779245598e-08, + "loss": 1.724, + "step": 29560 + }, + { + "epoch": 1.9839267138686623, + "grad_norm": 3.229109525680542, + "learning_rate": 1.672605253623116e-08, + "loss": 1.7597, + "step": 29562 + }, + { + "epoch": 1.9840609375524312, + "grad_norm": 3.945394277572632, + "learning_rate": 1.6446138891412777e-08, + "loss": 1.7145, + "step": 29564 + }, + { + "epoch": 1.9841951612362, + "grad_norm": 4.07525634765625, + "learning_rate": 1.616858687122913e-08, + "loss": 1.9018, + "step": 29566 + }, + { + "epoch": 1.984329384919969, + "grad_norm": 4.204347610473633, + "learning_rate": 1.5893396488786407e-08, + "loss": 1.8775, + "step": 29568 + }, + { + "epoch": 1.9844636086037382, + "grad_norm": 3.658442735671997, + "learning_rate": 1.5620567757090865e-08, + "loss": 1.8163, + "step": 29570 + }, + { + "epoch": 1.984597832287507, + "grad_norm": 4.772525787353516, + "learning_rate": 1.535010068903775e-08, + "loss": 1.905, + "step": 29572 + }, + { + "epoch": 1.9847320559712762, + "grad_norm": 4.0973358154296875, + "learning_rate": 1.5081995297400177e-08, + "loss": 1.9325, + "step": 29574 + }, + { + "epoch": 1.9848662796550451, + "grad_norm": 4.475614547729492, + "learning_rate": 1.4816251594845787e-08, + "loss": 1.899, + "step": 29576 + }, + { + "epoch": 1.985000503338814, + "grad_norm": 4.136242389678955, + "learning_rate": 1.4552869593931207e-08, + "loss": 1.755, + "step": 29578 + }, + { + "epoch": 1.985134727022583, + "grad_norm": 4.15523624420166, + "learning_rate": 1.4291849307102034e-08, + "loss": 1.7428, + "step": 29580 + }, + { + "epoch": 1.9852689507063521, + "grad_norm": 4.781779766082764, + "learning_rate": 1.4033190746687297e-08, + "loss": 2.0625, + "step": 29582 + }, + { + "epoch": 1.9854031743901213, + "grad_norm": 4.214940547943115, + "learning_rate": 1.37768939249161e-08, + "loss": 1.8311, + "step": 29584 + }, + { + "epoch": 1.9855373980738902, + "grad_norm": 3.885404348373413, + "learning_rate": 1.3522958853889877e-08, + "loss": 2.028, + "step": 29586 + }, + { + "epoch": 1.985671621757659, + "grad_norm": 3.6309070587158203, + "learning_rate": 1.3271385545610137e-08, + "loss": 1.8096, + "step": 29588 + }, + { + "epoch": 1.985805845441428, + "grad_norm": 4.310778617858887, + "learning_rate": 1.302217401196737e-08, + "loss": 2.0554, + "step": 29590 + }, + { + "epoch": 1.9859400691251972, + "grad_norm": 3.496863842010498, + "learning_rate": 1.2775324264724386e-08, + "loss": 1.8888, + "step": 29592 + }, + { + "epoch": 1.9860742928089663, + "grad_norm": 4.195777893066406, + "learning_rate": 1.2530836315555183e-08, + "loss": 2.0695, + "step": 29594 + }, + { + "epoch": 1.9862085164927352, + "grad_norm": 4.084662914276123, + "learning_rate": 1.228871017601163e-08, + "loss": 2.0589, + "step": 29596 + }, + { + "epoch": 1.9863427401765041, + "grad_norm": 4.175770282745361, + "learning_rate": 1.2048945857523475e-08, + "loss": 2.0087, + "step": 29598 + }, + { + "epoch": 1.986476963860273, + "grad_norm": 4.240734577178955, + "learning_rate": 1.1811543371431644e-08, + "loss": 1.7547, + "step": 29600 + }, + { + "epoch": 1.9866111875440422, + "grad_norm": 3.796980857849121, + "learning_rate": 1.1576502728938287e-08, + "loss": 1.8546, + "step": 29602 + }, + { + "epoch": 1.9867454112278111, + "grad_norm": 4.014173984527588, + "learning_rate": 1.134382394116229e-08, + "loss": 1.6852, + "step": 29604 + }, + { + "epoch": 1.9868796349115803, + "grad_norm": 3.863452196121216, + "learning_rate": 1.111350701909486e-08, + "loss": 1.8225, + "step": 29606 + }, + { + "epoch": 1.9870138585953492, + "grad_norm": 4.18723726272583, + "learning_rate": 1.088555197361063e-08, + "loss": 2.1812, + "step": 29608 + }, + { + "epoch": 1.987148082279118, + "grad_norm": 3.8532328605651855, + "learning_rate": 1.0659958815489868e-08, + "loss": 1.6649, + "step": 29610 + }, + { + "epoch": 1.987282305962887, + "grad_norm": 4.590604305267334, + "learning_rate": 1.043672755537961e-08, + "loss": 1.9794, + "step": 29612 + }, + { + "epoch": 1.9874165296466562, + "grad_norm": 3.5838327407836914, + "learning_rate": 1.021585820383808e-08, + "loss": 1.7222, + "step": 29614 + }, + { + "epoch": 1.9875507533304253, + "grad_norm": 3.3830924034118652, + "learning_rate": 9.997350771295821e-09, + "loss": 1.6946, + "step": 29616 + }, + { + "epoch": 1.9876849770141942, + "grad_norm": 4.274023532867432, + "learning_rate": 9.781205268077908e-09, + "loss": 1.9431, + "step": 29618 + }, + { + "epoch": 1.9878192006979631, + "grad_norm": 4.080018043518066, + "learning_rate": 9.567421704392843e-09, + "loss": 1.9373, + "step": 29620 + }, + { + "epoch": 1.987953424381732, + "grad_norm": 4.468358993530273, + "learning_rate": 9.356000090349204e-09, + "loss": 1.8434, + "step": 29622 + }, + { + "epoch": 1.9880876480655012, + "grad_norm": 3.2377572059631348, + "learning_rate": 9.146940435933449e-09, + "loss": 1.6007, + "step": 29624 + }, + { + "epoch": 1.9882218717492701, + "grad_norm": 4.015376091003418, + "learning_rate": 8.94024275102101e-09, + "loss": 1.7149, + "step": 29626 + }, + { + "epoch": 1.9883560954330393, + "grad_norm": 3.795046091079712, + "learning_rate": 8.735907045376301e-09, + "loss": 1.7624, + "step": 29628 + }, + { + "epoch": 1.9884903191168082, + "grad_norm": 4.246764183044434, + "learning_rate": 8.533933328658262e-09, + "loss": 2.1852, + "step": 29630 + }, + { + "epoch": 1.988624542800577, + "grad_norm": 4.360902309417725, + "learning_rate": 8.334321610403706e-09, + "loss": 1.7486, + "step": 29632 + }, + { + "epoch": 1.988758766484346, + "grad_norm": 4.3072686195373535, + "learning_rate": 8.137071900055082e-09, + "loss": 2.0296, + "step": 29634 + }, + { + "epoch": 1.9888929901681152, + "grad_norm": 4.869019031524658, + "learning_rate": 7.942184206921611e-09, + "loss": 2.1063, + "step": 29636 + }, + { + "epoch": 1.9890272138518843, + "grad_norm": 4.327263355255127, + "learning_rate": 7.74965854021259e-09, + "loss": 2.0679, + "step": 29638 + }, + { + "epoch": 1.9891614375356532, + "grad_norm": 4.421610355377197, + "learning_rate": 7.5594949090263e-09, + "loss": 2.0013, + "step": 29640 + }, + { + "epoch": 1.9892956612194221, + "grad_norm": 3.7997753620147705, + "learning_rate": 7.371693322349993e-09, + "loss": 1.9924, + "step": 29642 + }, + { + "epoch": 1.989429884903191, + "grad_norm": 6.521599292755127, + "learning_rate": 7.186253789059905e-09, + "loss": 1.9017, + "step": 29644 + }, + { + "epoch": 1.9895641085869602, + "grad_norm": 4.1349687576293945, + "learning_rate": 7.003176317904591e-09, + "loss": 1.9088, + "step": 29646 + }, + { + "epoch": 1.9896983322707291, + "grad_norm": 4.430261611938477, + "learning_rate": 6.822460917549345e-09, + "loss": 1.9795, + "step": 29648 + }, + { + "epoch": 1.9898325559544983, + "grad_norm": 4.177305698394775, + "learning_rate": 6.644107596520677e-09, + "loss": 1.882, + "step": 29650 + }, + { + "epoch": 1.9899667796382672, + "grad_norm": 4.499268054962158, + "learning_rate": 6.4681163632507314e-09, + "loss": 2.0083, + "step": 29652 + }, + { + "epoch": 1.990101003322036, + "grad_norm": 3.9800865650177, + "learning_rate": 6.294487226055079e-09, + "loss": 1.8998, + "step": 29654 + }, + { + "epoch": 1.990235227005805, + "grad_norm": 4.1693243980407715, + "learning_rate": 6.123220193132717e-09, + "loss": 1.9189, + "step": 29656 + }, + { + "epoch": 1.9903694506895742, + "grad_norm": 4.557847499847412, + "learning_rate": 5.9543152725827226e-09, + "loss": 1.9057, + "step": 29658 + }, + { + "epoch": 1.9905036743733433, + "grad_norm": 4.329014778137207, + "learning_rate": 5.787772472382047e-09, + "loss": 1.8429, + "step": 29660 + }, + { + "epoch": 1.9906378980571122, + "grad_norm": 3.7215728759765625, + "learning_rate": 5.623591800402173e-09, + "loss": 1.8368, + "step": 29662 + }, + { + "epoch": 1.9907721217408811, + "grad_norm": 3.8735806941986084, + "learning_rate": 5.461773264398007e-09, + "loss": 1.8346, + "step": 29664 + }, + { + "epoch": 1.99090634542465, + "grad_norm": 4.4330668449401855, + "learning_rate": 5.302316872013435e-09, + "loss": 2.0844, + "step": 29666 + }, + { + "epoch": 1.9910405691084192, + "grad_norm": 4.0359344482421875, + "learning_rate": 5.145222630781321e-09, + "loss": 2.0067, + "step": 29668 + }, + { + "epoch": 1.9911747927921883, + "grad_norm": 4.336332321166992, + "learning_rate": 4.990490548129057e-09, + "loss": 1.8897, + "step": 29670 + }, + { + "epoch": 1.9913090164759573, + "grad_norm": 3.9659690856933594, + "learning_rate": 4.838120631361909e-09, + "loss": 2.0836, + "step": 29672 + }, + { + "epoch": 1.9914432401597262, + "grad_norm": 3.9647419452667236, + "learning_rate": 4.688112887685225e-09, + "loss": 1.7944, + "step": 29674 + }, + { + "epoch": 1.991577463843495, + "grad_norm": 3.9611403942108154, + "learning_rate": 4.540467324187781e-09, + "loss": 1.8226, + "step": 29676 + }, + { + "epoch": 1.9917116875272642, + "grad_norm": 3.049403190612793, + "learning_rate": 4.3951839478362235e-09, + "loss": 1.7124, + "step": 29678 + }, + { + "epoch": 1.9918459112110332, + "grad_norm": 5.652111530303955, + "learning_rate": 4.2522627655028346e-09, + "loss": 1.623, + "step": 29680 + }, + { + "epoch": 1.9919801348948023, + "grad_norm": 4.579681873321533, + "learning_rate": 4.111703783932219e-09, + "loss": 1.82, + "step": 29682 + }, + { + "epoch": 1.9921143585785712, + "grad_norm": 3.89047908782959, + "learning_rate": 3.973507009774613e-09, + "loss": 1.9246, + "step": 29684 + }, + { + "epoch": 1.9922485822623401, + "grad_norm": 3.7775843143463135, + "learning_rate": 3.8376724495581276e-09, + "loss": 1.7324, + "step": 29686 + }, + { + "epoch": 1.992382805946109, + "grad_norm": 3.541384220123291, + "learning_rate": 3.7042001096943e-09, + "loss": 1.5001, + "step": 29688 + }, + { + "epoch": 1.9925170296298782, + "grad_norm": 3.9055957794189453, + "learning_rate": 3.5730899964947495e-09, + "loss": 1.8096, + "step": 29690 + }, + { + "epoch": 1.9926512533136473, + "grad_norm": 4.449036121368408, + "learning_rate": 3.4443421161545197e-09, + "loss": 1.9, + "step": 29692 + }, + { + "epoch": 1.9927854769974163, + "grad_norm": 4.117572784423828, + "learning_rate": 3.317956474757633e-09, + "loss": 1.8533, + "step": 29694 + }, + { + "epoch": 1.9929197006811852, + "grad_norm": 4.335999965667725, + "learning_rate": 3.193933078265987e-09, + "loss": 1.9063, + "step": 29696 + }, + { + "epoch": 1.993053924364954, + "grad_norm": 3.8152952194213867, + "learning_rate": 3.0722719325526615e-09, + "loss": 1.6877, + "step": 29698 + }, + { + "epoch": 1.9931881480487232, + "grad_norm": 4.035757064819336, + "learning_rate": 2.95297304335751e-09, + "loss": 1.6228, + "step": 29700 + }, + { + "epoch": 1.9933223717324922, + "grad_norm": 4.051229476928711, + "learning_rate": 2.8360364163149135e-09, + "loss": 1.7672, + "step": 29702 + }, + { + "epoch": 1.9934565954162613, + "grad_norm": 4.468512535095215, + "learning_rate": 2.721462056959334e-09, + "loss": 1.7679, + "step": 29704 + }, + { + "epoch": 1.9935908191000302, + "grad_norm": 3.872483015060425, + "learning_rate": 2.609249970697558e-09, + "loss": 1.9542, + "step": 29706 + }, + { + "epoch": 1.9937250427837991, + "grad_norm": 3.8068034648895264, + "learning_rate": 2.4994001628364517e-09, + "loss": 2.0738, + "step": 29708 + }, + { + "epoch": 1.993859266467568, + "grad_norm": 4.20297908782959, + "learning_rate": 2.391912638560756e-09, + "loss": 1.8451, + "step": 29710 + }, + { + "epoch": 1.9939934901513372, + "grad_norm": 3.563854455947876, + "learning_rate": 2.2867874029497415e-09, + "loss": 1.689, + "step": 29712 + }, + { + "epoch": 1.9941277138351063, + "grad_norm": 4.209000110626221, + "learning_rate": 2.1840244609716565e-09, + "loss": 2.0223, + "step": 29714 + }, + { + "epoch": 1.9942619375188753, + "grad_norm": 4.100368976593018, + "learning_rate": 2.0836238174837264e-09, + "loss": 1.8481, + "step": 29716 + }, + { + "epoch": 1.9943961612026442, + "grad_norm": 4.291011810302734, + "learning_rate": 1.9855854772266037e-09, + "loss": 1.8635, + "step": 29718 + }, + { + "epoch": 1.994530384886413, + "grad_norm": 4.25451135635376, + "learning_rate": 1.8899094448354693e-09, + "loss": 1.8231, + "step": 29720 + }, + { + "epoch": 1.9946646085701822, + "grad_norm": 3.976914644241333, + "learning_rate": 1.796595724828931e-09, + "loss": 1.8165, + "step": 29722 + }, + { + "epoch": 1.9947988322539512, + "grad_norm": 3.775137186050415, + "learning_rate": 1.7056443216145746e-09, + "loss": 1.8426, + "step": 29724 + }, + { + "epoch": 1.9949330559377203, + "grad_norm": 3.996706962585449, + "learning_rate": 1.6170552394889627e-09, + "loss": 1.6014, + "step": 29726 + }, + { + "epoch": 1.9950672796214892, + "grad_norm": 3.597660541534424, + "learning_rate": 1.5308284826431874e-09, + "loss": 1.623, + "step": 29728 + }, + { + "epoch": 1.9952015033052581, + "grad_norm": 4.06671667098999, + "learning_rate": 1.446964055146216e-09, + "loss": 1.9331, + "step": 29730 + }, + { + "epoch": 1.995335726989027, + "grad_norm": 4.63083028793335, + "learning_rate": 1.365461960961545e-09, + "loss": 1.8594, + "step": 29732 + }, + { + "epoch": 1.9954699506727962, + "grad_norm": 4.228280067443848, + "learning_rate": 1.2863222039416478e-09, + "loss": 1.8572, + "step": 29734 + }, + { + "epoch": 1.9956041743565653, + "grad_norm": 4.228320598602295, + "learning_rate": 1.2095447878279765e-09, + "loss": 1.8207, + "step": 29736 + }, + { + "epoch": 1.9957383980403343, + "grad_norm": 4.243010520935059, + "learning_rate": 1.1351297162398578e-09, + "loss": 1.8049, + "step": 29738 + }, + { + "epoch": 1.9958726217241032, + "grad_norm": 3.9978137016296387, + "learning_rate": 1.06307699270225e-09, + "loss": 1.9135, + "step": 29740 + }, + { + "epoch": 1.996006845407872, + "grad_norm": 3.6843583583831787, + "learning_rate": 9.933866206124353e-10, + "loss": 1.6993, + "step": 29742 + }, + { + "epoch": 1.9961410690916412, + "grad_norm": 3.9809529781341553, + "learning_rate": 9.260586032677765e-10, + "loss": 1.7344, + "step": 29744 + }, + { + "epoch": 1.9962752927754104, + "grad_norm": 4.305218696594238, + "learning_rate": 8.610929438490623e-10, + "loss": 1.7374, + "step": 29746 + }, + { + "epoch": 1.9964095164591793, + "grad_norm": 3.7976925373077393, + "learning_rate": 7.984896454260593e-10, + "loss": 1.7573, + "step": 29748 + }, + { + "epoch": 1.9965437401429482, + "grad_norm": 4.080177307128906, + "learning_rate": 7.382487109519609e-10, + "loss": 1.8125, + "step": 29750 + }, + { + "epoch": 1.9966779638267171, + "grad_norm": 4.095365047454834, + "learning_rate": 6.803701432744891e-10, + "loss": 1.7039, + "step": 29752 + }, + { + "epoch": 1.9968121875104863, + "grad_norm": 4.7731146812438965, + "learning_rate": 6.248539451358948e-10, + "loss": 2.0996, + "step": 29754 + }, + { + "epoch": 1.9969464111942552, + "grad_norm": 4.0240325927734375, + "learning_rate": 5.71700119145202e-10, + "loss": 1.6727, + "step": 29756 + }, + { + "epoch": 1.9970806348780243, + "grad_norm": 4.265284061431885, + "learning_rate": 5.209086678281683e-10, + "loss": 1.8315, + "step": 29758 + }, + { + "epoch": 1.9972148585617933, + "grad_norm": 5.515148639678955, + "learning_rate": 4.724795935773241e-10, + "loss": 2.0155, + "step": 29760 + }, + { + "epoch": 1.9973490822455622, + "grad_norm": 4.682764053344727, + "learning_rate": 4.2641289868528e-10, + "loss": 2.3061, + "step": 29762 + }, + { + "epoch": 1.997483305929331, + "grad_norm": 4.705201625823975, + "learning_rate": 3.82708585316971e-10, + "loss": 1.8393, + "step": 29764 + }, + { + "epoch": 1.9976175296131002, + "grad_norm": 4.205397605895996, + "learning_rate": 3.413666555540651e-10, + "loss": 2.0548, + "step": 29766 + }, + { + "epoch": 1.9977517532968694, + "grad_norm": 4.276141166687012, + "learning_rate": 3.023871113339016e-10, + "loss": 1.8682, + "step": 29768 + }, + { + "epoch": 1.9978859769806383, + "grad_norm": 4.105380058288574, + "learning_rate": 2.65769954510553e-10, + "loss": 1.818, + "step": 29770 + }, + { + "epoch": 1.9980202006644072, + "grad_norm": 4.040588855743408, + "learning_rate": 2.3151518681041595e-10, + "loss": 1.7452, + "step": 29772 + }, + { + "epoch": 1.9981544243481761, + "grad_norm": 3.906928539276123, + "learning_rate": 1.9962280984886507e-10, + "loss": 1.8881, + "step": 29774 + }, + { + "epoch": 1.9982886480319453, + "grad_norm": 4.087926387786865, + "learning_rate": 1.700928251358036e-10, + "loss": 1.7738, + "step": 29776 + }, + { + "epoch": 1.9984228717157142, + "grad_norm": 4.356101036071777, + "learning_rate": 1.429252340645615e-10, + "loss": 2.0355, + "step": 29778 + }, + { + "epoch": 1.9985570953994833, + "grad_norm": 4.308615684509277, + "learning_rate": 1.181200379174463e-10, + "loss": 1.8246, + "step": 29780 + }, + { + "epoch": 1.9986913190832523, + "grad_norm": 3.8434715270996094, + "learning_rate": 9.567723787129445e-11, + "loss": 1.7378, + "step": 29782 + }, + { + "epoch": 1.9988255427670212, + "grad_norm": 3.728109836578369, + "learning_rate": 7.559683498081782e-11, + "loss": 1.7412, + "step": 29784 + }, + { + "epoch": 1.99895976645079, + "grad_norm": 4.285849094390869, + "learning_rate": 5.7878830200808196e-11, + "loss": 1.7242, + "step": 29786 + }, + { + "epoch": 1.9990939901345592, + "grad_norm": 4.231853008270264, + "learning_rate": 4.2523224363932856e-11, + "loss": 1.97, + "step": 29788 + }, + { + "epoch": 1.9992282138183284, + "grad_norm": 4.406625747680664, + "learning_rate": 2.953001819738788e-11, + "loss": 1.9407, + "step": 29790 + }, + { + "epoch": 1.9993624375020973, + "grad_norm": 4.077744960784912, + "learning_rate": 1.8899212317347036e-11, + "loss": 1.8809, + "step": 29792 + }, + { + "epoch": 1.9994966611858662, + "grad_norm": 3.3885247707366943, + "learning_rate": 1.06308072234107e-11, + "loss": 1.8003, + "step": 29794 + }, + { + "epoch": 1.9996308848696351, + "grad_norm": 3.914541721343994, + "learning_rate": 4.7248033041569216e-12, + "loss": 2.1604, + "step": 29796 + }, + { + "epoch": 1.9997651085534043, + "grad_norm": 4.054013252258301, + "learning_rate": 1.1812008371414608e-12, + "loss": 1.7309, + "step": 29798 + }, + { + "epoch": 1.9998993322371732, + "grad_norm": 4.338361740112305, + "learning_rate": 0.0, + "loss": 1.6033, + "step": 29800 + } + ], + "logging_steps": 2, + "max_steps": 29800, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.9092202294188442e+18, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}