{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000000, "global_step": 47151, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006362537379907107, "grad_norm": 5.6875, "learning_rate": 2.8631418209581985e-08, "loss": 0.0906, "step": 10 }, { "epoch": 0.0012725074759814214, "grad_norm": 9.1875, "learning_rate": 6.044410510911752e-08, "loss": 0.0929, "step": 20 }, { "epoch": 0.001908761213972132, "grad_norm": 16.75, "learning_rate": 9.225679200865305e-08, "loss": 0.105, "step": 30 }, { "epoch": 0.0025450149519628427, "grad_norm": 10.0, "learning_rate": 1.240694789081886e-07, "loss": 0.084, "step": 40 }, { "epoch": 0.0031812686899535536, "grad_norm": 9.0, "learning_rate": 1.5588216580772414e-07, "loss": 0.099, "step": 50 }, { "epoch": 0.003817522427944264, "grad_norm": 20.75, "learning_rate": 1.8769485270725965e-07, "loss": 0.1481, "step": 60 }, { "epoch": 0.0044537761659349745, "grad_norm": 9.3125, "learning_rate": 2.195075396067952e-07, "loss": 0.0981, "step": 70 }, { "epoch": 0.005090029903925685, "grad_norm": 13.0625, "learning_rate": 2.5132022650633075e-07, "loss": 0.1564, "step": 80 }, { "epoch": 0.005726283641916396, "grad_norm": 4.28125, "learning_rate": 2.8313291340586627e-07, "loss": 0.0768, "step": 90 }, { "epoch": 0.006362537379907107, "grad_norm": 6.34375, "learning_rate": 3.1494560030540183e-07, "loss": 0.1256, "step": 100 }, { "epoch": 0.006998791117897817, "grad_norm": 20.375, "learning_rate": 3.4675828720493734e-07, "loss": 0.1107, "step": 110 }, { "epoch": 0.007635044855888528, "grad_norm": 22.125, "learning_rate": 3.7857097410447285e-07, "loss": 0.1103, "step": 120 }, { "epoch": 0.008271298593879239, "grad_norm": 13.125, "learning_rate": 4.103836610040084e-07, "loss": 0.1392, "step": 130 }, { "epoch": 0.008907552331869949, "grad_norm": 6.625, "learning_rate": 4.4219634790354393e-07, "loss": 0.1096, "step": 140 }, { "epoch": 0.00954380606986066, "grad_norm": 10.3125, "learning_rate": 4.740090348030795e-07, "loss": 0.1226, "step": 150 }, { "epoch": 0.01018005980785137, "grad_norm": 14.5625, "learning_rate": 5.05821721702615e-07, "loss": 0.1351, "step": 160 }, { "epoch": 0.010816313545842083, "grad_norm": 6.46875, "learning_rate": 5.376344086021506e-07, "loss": 0.0854, "step": 170 }, { "epoch": 0.011452567283832793, "grad_norm": 10.3125, "learning_rate": 5.694470955016861e-07, "loss": 0.1008, "step": 180 }, { "epoch": 0.012088821021823503, "grad_norm": 16.75, "learning_rate": 6.012597824012216e-07, "loss": 0.0952, "step": 190 }, { "epoch": 0.012725074759814214, "grad_norm": 8.0625, "learning_rate": 6.330724693007572e-07, "loss": 0.1231, "step": 200 }, { "epoch": 0.013361328497804924, "grad_norm": 8.5, "learning_rate": 6.648851562002927e-07, "loss": 0.0801, "step": 210 }, { "epoch": 0.013997582235795634, "grad_norm": 9.125, "learning_rate": 6.966978430998282e-07, "loss": 0.0883, "step": 220 }, { "epoch": 0.014633835973786346, "grad_norm": 7.71875, "learning_rate": 7.285105299993638e-07, "loss": 0.0723, "step": 230 }, { "epoch": 0.015270089711777056, "grad_norm": 21.75, "learning_rate": 7.603232168988993e-07, "loss": 0.1246, "step": 240 }, { "epoch": 0.015906343449767768, "grad_norm": 6.0, "learning_rate": 7.921359037984348e-07, "loss": 0.1029, "step": 250 }, { "epoch": 0.016542597187758478, "grad_norm": 14.0625, "learning_rate": 8.239485906979703e-07, "loss": 0.0597, "step": 260 }, { "epoch": 0.017178850925749188, "grad_norm": 19.125, "learning_rate": 8.55761277597506e-07, "loss": 0.095, "step": 270 }, { "epoch": 0.017815104663739898, "grad_norm": 13.9375, "learning_rate": 8.875739644970415e-07, "loss": 0.0602, "step": 280 }, { "epoch": 0.01845135840173061, "grad_norm": 13.1875, "learning_rate": 9.19386651396577e-07, "loss": 0.0572, "step": 290 }, { "epoch": 0.01908761213972132, "grad_norm": 16.625, "learning_rate": 9.511993382961125e-07, "loss": 0.0985, "step": 300 }, { "epoch": 0.01972386587771203, "grad_norm": 9.5625, "learning_rate": 9.83012025195648e-07, "loss": 0.081, "step": 310 }, { "epoch": 0.02036011961570274, "grad_norm": 11.875, "learning_rate": 1.0148247120951837e-06, "loss": 0.0935, "step": 320 }, { "epoch": 0.02099637335369345, "grad_norm": 7.6875, "learning_rate": 1.0466373989947193e-06, "loss": 0.084, "step": 330 }, { "epoch": 0.021632627091684165, "grad_norm": 10.0625, "learning_rate": 1.0784500858942546e-06, "loss": 0.0669, "step": 340 }, { "epoch": 0.022268880829674875, "grad_norm": 8.5625, "learning_rate": 1.1102627727937902e-06, "loss": 0.0914, "step": 350 }, { "epoch": 0.022905134567665585, "grad_norm": 21.75, "learning_rate": 1.1420754596933258e-06, "loss": 0.1082, "step": 360 }, { "epoch": 0.023541388305656295, "grad_norm": 5.0, "learning_rate": 1.1738881465928613e-06, "loss": 0.0505, "step": 370 }, { "epoch": 0.024177642043647005, "grad_norm": 7.40625, "learning_rate": 1.2057008334923969e-06, "loss": 0.056, "step": 380 }, { "epoch": 0.02481389578163772, "grad_norm": 11.8125, "learning_rate": 1.2375135203919324e-06, "loss": 0.0652, "step": 390 }, { "epoch": 0.02545014951962843, "grad_norm": 9.8125, "learning_rate": 1.2693262072914678e-06, "loss": 0.0624, "step": 400 }, { "epoch": 0.02608640325761914, "grad_norm": 6.78125, "learning_rate": 1.3011388941910034e-06, "loss": 0.0709, "step": 410 }, { "epoch": 0.02672265699560985, "grad_norm": 10.9375, "learning_rate": 1.3329515810905391e-06, "loss": 0.047, "step": 420 }, { "epoch": 0.02735891073360056, "grad_norm": 7.34375, "learning_rate": 1.3647642679900745e-06, "loss": 0.0496, "step": 430 }, { "epoch": 0.02799516447159127, "grad_norm": 6.09375, "learning_rate": 1.39657695488961e-06, "loss": 0.0523, "step": 440 }, { "epoch": 0.028631418209581982, "grad_norm": 4.15625, "learning_rate": 1.4283896417891456e-06, "loss": 0.069, "step": 450 }, { "epoch": 0.029267671947572692, "grad_norm": 8.9375, "learning_rate": 1.460202328688681e-06, "loss": 0.04, "step": 460 }, { "epoch": 0.029903925685563403, "grad_norm": 19.0, "learning_rate": 1.4920150155882167e-06, "loss": 0.0506, "step": 470 }, { "epoch": 0.030540179423554113, "grad_norm": 2.765625, "learning_rate": 1.5238277024877523e-06, "loss": 0.0382, "step": 480 }, { "epoch": 0.031176433161544823, "grad_norm": 9.9375, "learning_rate": 1.5556403893872877e-06, "loss": 0.0693, "step": 490 }, { "epoch": 0.031812686899535536, "grad_norm": 8.375, "learning_rate": 1.5874530762868232e-06, "loss": 0.0538, "step": 500 }, { "epoch": 0.03244894063752624, "grad_norm": 5.1875, "learning_rate": 1.619265763186359e-06, "loss": 0.0452, "step": 510 }, { "epoch": 0.033085194375516956, "grad_norm": 8.25, "learning_rate": 1.6510784500858941e-06, "loss": 0.0262, "step": 520 }, { "epoch": 0.03372144811350767, "grad_norm": 4.40625, "learning_rate": 1.68289113698543e-06, "loss": 0.0256, "step": 530 }, { "epoch": 0.034357701851498376, "grad_norm": 7.125, "learning_rate": 1.7147038238849653e-06, "loss": 0.0271, "step": 540 }, { "epoch": 0.03499395558948909, "grad_norm": 10.875, "learning_rate": 1.7465165107845008e-06, "loss": 0.0391, "step": 550 }, { "epoch": 0.035630209327479796, "grad_norm": 3.34375, "learning_rate": 1.7783291976840366e-06, "loss": 0.0338, "step": 560 }, { "epoch": 0.03626646306547051, "grad_norm": 4.21875, "learning_rate": 1.810141884583572e-06, "loss": 0.0174, "step": 570 }, { "epoch": 0.03690271680346122, "grad_norm": 6.40625, "learning_rate": 1.8419545714831075e-06, "loss": 0.0194, "step": 580 }, { "epoch": 0.03753897054145193, "grad_norm": 12.875, "learning_rate": 1.8737672583826429e-06, "loss": 0.0364, "step": 590 }, { "epoch": 0.03817522427944264, "grad_norm": 6.34375, "learning_rate": 1.9055799452821787e-06, "loss": 0.0273, "step": 600 }, { "epoch": 0.03881147801743335, "grad_norm": 6.125, "learning_rate": 1.9373926321817142e-06, "loss": 0.0247, "step": 610 }, { "epoch": 0.03944773175542406, "grad_norm": 3.75, "learning_rate": 1.9692053190812496e-06, "loss": 0.0351, "step": 620 }, { "epoch": 0.04008398549341478, "grad_norm": 7.09375, "learning_rate": 2.0010180059807853e-06, "loss": 0.0281, "step": 630 }, { "epoch": 0.04072023923140548, "grad_norm": 17.0, "learning_rate": 2.0328306928803207e-06, "loss": 0.0237, "step": 640 }, { "epoch": 0.0413564929693962, "grad_norm": 5.0, "learning_rate": 2.0646433797798565e-06, "loss": 0.024, "step": 650 }, { "epoch": 0.0419927467073869, "grad_norm": 5.8125, "learning_rate": 2.096456066679392e-06, "loss": 0.0186, "step": 660 }, { "epoch": 0.04262900044537762, "grad_norm": 0.734375, "learning_rate": 2.128268753578927e-06, "loss": 0.0174, "step": 670 }, { "epoch": 0.04326525418336833, "grad_norm": 3.96875, "learning_rate": 2.160081440478463e-06, "loss": 0.0175, "step": 680 }, { "epoch": 0.04390150792135904, "grad_norm": 8.5, "learning_rate": 2.1918941273779983e-06, "loss": 0.0176, "step": 690 }, { "epoch": 0.04453776165934975, "grad_norm": 4.125, "learning_rate": 2.223706814277534e-06, "loss": 0.0398, "step": 700 }, { "epoch": 0.04517401539734046, "grad_norm": 3.21875, "learning_rate": 2.25551950117707e-06, "loss": 0.0432, "step": 710 }, { "epoch": 0.04581026913533117, "grad_norm": 7.34375, "learning_rate": 2.287332188076605e-06, "loss": 0.0094, "step": 720 }, { "epoch": 0.046446522873321884, "grad_norm": 18.125, "learning_rate": 2.3191448749761406e-06, "loss": 0.0163, "step": 730 }, { "epoch": 0.04708277661131259, "grad_norm": 2.796875, "learning_rate": 2.350957561875676e-06, "loss": 0.0203, "step": 740 }, { "epoch": 0.047719030349303304, "grad_norm": 5.3125, "learning_rate": 2.3827702487752117e-06, "loss": 0.0189, "step": 750 }, { "epoch": 0.04835528408729401, "grad_norm": 5.15625, "learning_rate": 2.4145829356747475e-06, "loss": 0.0191, "step": 760 }, { "epoch": 0.048991537825284724, "grad_norm": 7.71875, "learning_rate": 2.446395622574283e-06, "loss": 0.0203, "step": 770 }, { "epoch": 0.04962779156327544, "grad_norm": 4.5625, "learning_rate": 2.478208309473818e-06, "loss": 0.009, "step": 780 }, { "epoch": 0.050264045301266144, "grad_norm": 0.53515625, "learning_rate": 2.5100209963733535e-06, "loss": 0.012, "step": 790 }, { "epoch": 0.05090029903925686, "grad_norm": 0.921875, "learning_rate": 2.5418336832728893e-06, "loss": 0.0149, "step": 800 }, { "epoch": 0.051536552777247564, "grad_norm": 8.3125, "learning_rate": 2.573646370172425e-06, "loss": 0.0255, "step": 810 }, { "epoch": 0.05217280651523828, "grad_norm": 1.796875, "learning_rate": 2.6054590570719604e-06, "loss": 0.009, "step": 820 }, { "epoch": 0.052809060253228984, "grad_norm": 8.8125, "learning_rate": 2.637271743971496e-06, "loss": 0.0384, "step": 830 }, { "epoch": 0.0534453139912197, "grad_norm": 2.453125, "learning_rate": 2.6690844308710316e-06, "loss": 0.0324, "step": 840 }, { "epoch": 0.05408156772921041, "grad_norm": 15.375, "learning_rate": 2.700897117770567e-06, "loss": 0.0196, "step": 850 }, { "epoch": 0.05471782146720112, "grad_norm": 14.4375, "learning_rate": 2.7327098046701027e-06, "loss": 0.0228, "step": 860 }, { "epoch": 0.05535407520519183, "grad_norm": 3.46875, "learning_rate": 2.764522491569638e-06, "loss": 0.0107, "step": 870 }, { "epoch": 0.05599032894318254, "grad_norm": 0.58984375, "learning_rate": 2.796335178469174e-06, "loss": 0.0131, "step": 880 }, { "epoch": 0.05662658268117325, "grad_norm": 3.59375, "learning_rate": 2.828147865368709e-06, "loss": 0.0098, "step": 890 }, { "epoch": 0.057262836419163965, "grad_norm": 2.140625, "learning_rate": 2.859960552268245e-06, "loss": 0.0382, "step": 900 }, { "epoch": 0.05789909015715467, "grad_norm": 12.625, "learning_rate": 2.89177323916778e-06, "loss": 0.0214, "step": 910 }, { "epoch": 0.058535343895145385, "grad_norm": 3.703125, "learning_rate": 2.9235859260673156e-06, "loss": 0.0112, "step": 920 }, { "epoch": 0.05917159763313609, "grad_norm": 5.6875, "learning_rate": 2.9553986129668514e-06, "loss": 0.0197, "step": 930 }, { "epoch": 0.059807851371126805, "grad_norm": 3.984375, "learning_rate": 2.9872112998663868e-06, "loss": 0.0083, "step": 940 }, { "epoch": 0.06044410510911752, "grad_norm": 1.296875, "learning_rate": 3.0190239867659226e-06, "loss": 0.0239, "step": 950 }, { "epoch": 0.061080358847108225, "grad_norm": 0.58203125, "learning_rate": 3.050836673665458e-06, "loss": 0.0113, "step": 960 }, { "epoch": 0.06171661258509894, "grad_norm": 1.2421875, "learning_rate": 3.0826493605649933e-06, "loss": 0.0069, "step": 970 }, { "epoch": 0.062352866323089645, "grad_norm": 0.48828125, "learning_rate": 3.114462047464529e-06, "loss": 0.0103, "step": 980 }, { "epoch": 0.06298912006108036, "grad_norm": 2.15625, "learning_rate": 3.1462747343640644e-06, "loss": 0.01, "step": 990 }, { "epoch": 0.06362537379907107, "grad_norm": 2.96875, "learning_rate": 3.1780874212636e-06, "loss": 0.0076, "step": 1000 }, { "epoch": 0.06426162753706179, "grad_norm": 6.96875, "learning_rate": 3.209900108163136e-06, "loss": 0.0155, "step": 1010 }, { "epoch": 0.06489788127505249, "grad_norm": 5.875, "learning_rate": 3.2417127950626713e-06, "loss": 0.014, "step": 1020 }, { "epoch": 0.0655341350130432, "grad_norm": 9.5, "learning_rate": 3.2735254819622066e-06, "loss": 0.0106, "step": 1030 }, { "epoch": 0.06617038875103391, "grad_norm": 0.76171875, "learning_rate": 3.305338168861742e-06, "loss": 0.016, "step": 1040 }, { "epoch": 0.06680664248902463, "grad_norm": 22.375, "learning_rate": 3.3371508557612778e-06, "loss": 0.0169, "step": 1050 }, { "epoch": 0.06744289622701534, "grad_norm": 0.2119140625, "learning_rate": 3.3689635426608135e-06, "loss": 0.0231, "step": 1060 }, { "epoch": 0.06807914996500604, "grad_norm": 2.140625, "learning_rate": 3.4007762295603493e-06, "loss": 0.0216, "step": 1070 }, { "epoch": 0.06871540370299675, "grad_norm": 2.53125, "learning_rate": 3.4325889164598842e-06, "loss": 0.0084, "step": 1080 }, { "epoch": 0.06935165744098747, "grad_norm": 0.79296875, "learning_rate": 3.46440160335942e-06, "loss": 0.0057, "step": 1090 }, { "epoch": 0.06998791117897818, "grad_norm": 4.21875, "learning_rate": 3.4962142902589554e-06, "loss": 0.0143, "step": 1100 }, { "epoch": 0.07062416491696889, "grad_norm": 5.65625, "learning_rate": 3.528026977158491e-06, "loss": 0.0142, "step": 1110 }, { "epoch": 0.07126041865495959, "grad_norm": 10.1875, "learning_rate": 3.559839664058027e-06, "loss": 0.0236, "step": 1120 }, { "epoch": 0.0718966723929503, "grad_norm": 5.21875, "learning_rate": 3.591652350957562e-06, "loss": 0.0209, "step": 1130 }, { "epoch": 0.07253292613094102, "grad_norm": 2.984375, "learning_rate": 3.6234650378570976e-06, "loss": 0.0108, "step": 1140 }, { "epoch": 0.07316917986893173, "grad_norm": 2.03125, "learning_rate": 3.655277724756633e-06, "loss": 0.0167, "step": 1150 }, { "epoch": 0.07380543360692245, "grad_norm": 4.1875, "learning_rate": 3.6870904116561688e-06, "loss": 0.0142, "step": 1160 }, { "epoch": 0.07444168734491315, "grad_norm": 0.146484375, "learning_rate": 3.7189030985557037e-06, "loss": 0.0126, "step": 1170 }, { "epoch": 0.07507794108290386, "grad_norm": 3.46875, "learning_rate": 3.7507157854552395e-06, "loss": 0.0304, "step": 1180 }, { "epoch": 0.07571419482089457, "grad_norm": 0.1259765625, "learning_rate": 3.7825284723547752e-06, "loss": 0.0042, "step": 1190 }, { "epoch": 0.07635044855888529, "grad_norm": 0.4296875, "learning_rate": 3.814341159254311e-06, "loss": 0.009, "step": 1200 }, { "epoch": 0.076986702296876, "grad_norm": 1.9453125, "learning_rate": 3.846153846153847e-06, "loss": 0.0304, "step": 1210 }, { "epoch": 0.0776229560348667, "grad_norm": 0.6953125, "learning_rate": 3.877966533053381e-06, "loss": 0.0117, "step": 1220 }, { "epoch": 0.07825920977285741, "grad_norm": 0.65234375, "learning_rate": 3.9097792199529175e-06, "loss": 0.0214, "step": 1230 }, { "epoch": 0.07889546351084813, "grad_norm": 2.546875, "learning_rate": 3.941591906852453e-06, "loss": 0.0336, "step": 1240 }, { "epoch": 0.07953171724883884, "grad_norm": 0.89453125, "learning_rate": 3.973404593751988e-06, "loss": 0.0198, "step": 1250 }, { "epoch": 0.08016797098682955, "grad_norm": 1.8203125, "learning_rate": 4.005217280651524e-06, "loss": 0.0101, "step": 1260 }, { "epoch": 0.08080422472482025, "grad_norm": 0.4375, "learning_rate": 4.037029967551059e-06, "loss": 0.0163, "step": 1270 }, { "epoch": 0.08144047846281097, "grad_norm": 1.484375, "learning_rate": 4.068842654450595e-06, "loss": 0.0052, "step": 1280 }, { "epoch": 0.08207673220080168, "grad_norm": 8.125, "learning_rate": 4.1006553413501305e-06, "loss": 0.0068, "step": 1290 }, { "epoch": 0.0827129859387924, "grad_norm": 2.65625, "learning_rate": 4.132468028249667e-06, "loss": 0.016, "step": 1300 }, { "epoch": 0.08334923967678311, "grad_norm": 1.6015625, "learning_rate": 4.164280715149202e-06, "loss": 0.0125, "step": 1310 }, { "epoch": 0.0839854934147738, "grad_norm": 2.328125, "learning_rate": 4.1960934020487365e-06, "loss": 0.0237, "step": 1320 }, { "epoch": 0.08462174715276452, "grad_norm": 0.25390625, "learning_rate": 4.227906088948273e-06, "loss": 0.0074, "step": 1330 }, { "epoch": 0.08525800089075523, "grad_norm": 7.90625, "learning_rate": 4.259718775847808e-06, "loss": 0.0168, "step": 1340 }, { "epoch": 0.08589425462874595, "grad_norm": 2.78125, "learning_rate": 4.291531462747344e-06, "loss": 0.0165, "step": 1350 }, { "epoch": 0.08653050836673666, "grad_norm": 2.046875, "learning_rate": 4.32334414964688e-06, "loss": 0.0112, "step": 1360 }, { "epoch": 0.08716676210472736, "grad_norm": 9.9375, "learning_rate": 4.355156836546415e-06, "loss": 0.0081, "step": 1370 }, { "epoch": 0.08780301584271807, "grad_norm": 0.734375, "learning_rate": 4.38696952344595e-06, "loss": 0.0078, "step": 1380 }, { "epoch": 0.08843926958070879, "grad_norm": 0.3046875, "learning_rate": 4.418782210345486e-06, "loss": 0.0034, "step": 1390 }, { "epoch": 0.0890755233186995, "grad_norm": 1.078125, "learning_rate": 4.450594897245022e-06, "loss": 0.0074, "step": 1400 }, { "epoch": 0.08971177705669021, "grad_norm": 0.2294921875, "learning_rate": 4.482407584144557e-06, "loss": 0.0231, "step": 1410 }, { "epoch": 0.09034803079468091, "grad_norm": 5.0625, "learning_rate": 4.514220271044093e-06, "loss": 0.0167, "step": 1420 }, { "epoch": 0.09098428453267163, "grad_norm": 0.1357421875, "learning_rate": 4.546032957943628e-06, "loss": 0.0106, "step": 1430 }, { "epoch": 0.09162053827066234, "grad_norm": 20.125, "learning_rate": 4.577845644843163e-06, "loss": 0.0118, "step": 1440 }, { "epoch": 0.09225679200865305, "grad_norm": 0.361328125, "learning_rate": 4.6096583317426995e-06, "loss": 0.0031, "step": 1450 }, { "epoch": 0.09289304574664377, "grad_norm": 0.10205078125, "learning_rate": 4.641471018642235e-06, "loss": 0.0085, "step": 1460 }, { "epoch": 0.09352929948463447, "grad_norm": 1.9375, "learning_rate": 4.67328370554177e-06, "loss": 0.0098, "step": 1470 }, { "epoch": 0.09416555322262518, "grad_norm": 1.5859375, "learning_rate": 4.7050963924413055e-06, "loss": 0.0143, "step": 1480 }, { "epoch": 0.0948018069606159, "grad_norm": 1.109375, "learning_rate": 4.736909079340842e-06, "loss": 0.0146, "step": 1490 }, { "epoch": 0.09543806069860661, "grad_norm": 0.9921875, "learning_rate": 4.768721766240377e-06, "loss": 0.0022, "step": 1500 }, { "epoch": 0.09607431443659732, "grad_norm": 0.2392578125, "learning_rate": 4.8005344531399124e-06, "loss": 0.0362, "step": 1510 }, { "epoch": 0.09671056817458802, "grad_norm": 4.09375, "learning_rate": 4.832347140039448e-06, "loss": 0.0082, "step": 1520 }, { "epoch": 0.09734682191257873, "grad_norm": 11.6875, "learning_rate": 4.864159826938983e-06, "loss": 0.0055, "step": 1530 }, { "epoch": 0.09798307565056945, "grad_norm": 0.70703125, "learning_rate": 4.895972513838519e-06, "loss": 0.0069, "step": 1540 }, { "epoch": 0.09861932938856016, "grad_norm": 10.3125, "learning_rate": 4.927785200738055e-06, "loss": 0.0109, "step": 1550 }, { "epoch": 0.09925558312655088, "grad_norm": 5.09375, "learning_rate": 4.95959788763759e-06, "loss": 0.009, "step": 1560 }, { "epoch": 0.09989183686454157, "grad_norm": 0.34375, "learning_rate": 4.991410574537125e-06, "loss": 0.003, "step": 1570 }, { "epoch": 0.10052809060253229, "grad_norm": 6.59375, "learning_rate": 5.023223261436661e-06, "loss": 0.0062, "step": 1580 }, { "epoch": 0.101164344340523, "grad_norm": 0.030517578125, "learning_rate": 5.055035948336197e-06, "loss": 0.0036, "step": 1590 }, { "epoch": 0.10180059807851372, "grad_norm": 0.61328125, "learning_rate": 5.086848635235732e-06, "loss": 0.0426, "step": 1600 }, { "epoch": 0.10243685181650442, "grad_norm": 0.25390625, "learning_rate": 5.118661322135268e-06, "loss": 0.0085, "step": 1610 }, { "epoch": 0.10307310555449513, "grad_norm": 3.0625, "learning_rate": 5.150474009034803e-06, "loss": 0.0028, "step": 1620 }, { "epoch": 0.10370935929248584, "grad_norm": 0.107421875, "learning_rate": 5.182286695934338e-06, "loss": 0.0258, "step": 1630 }, { "epoch": 0.10434561303047656, "grad_norm": 1.4765625, "learning_rate": 5.2140993828338746e-06, "loss": 0.0156, "step": 1640 }, { "epoch": 0.10498186676846727, "grad_norm": 0.5078125, "learning_rate": 5.24591206973341e-06, "loss": 0.0037, "step": 1650 }, { "epoch": 0.10561812050645797, "grad_norm": 6.34375, "learning_rate": 5.277724756632946e-06, "loss": 0.019, "step": 1660 }, { "epoch": 0.10625437424444868, "grad_norm": 0.88671875, "learning_rate": 5.309537443532481e-06, "loss": 0.0147, "step": 1670 }, { "epoch": 0.1068906279824394, "grad_norm": 0.3203125, "learning_rate": 5.341350130432017e-06, "loss": 0.0246, "step": 1680 }, { "epoch": 0.10752688172043011, "grad_norm": 0.0634765625, "learning_rate": 5.373162817331552e-06, "loss": 0.0128, "step": 1690 }, { "epoch": 0.10816313545842082, "grad_norm": 0.0751953125, "learning_rate": 5.4049755042310875e-06, "loss": 0.0126, "step": 1700 }, { "epoch": 0.10879938919641152, "grad_norm": 0.408203125, "learning_rate": 5.436788191130624e-06, "loss": 0.0156, "step": 1710 }, { "epoch": 0.10943564293440224, "grad_norm": 1.2109375, "learning_rate": 5.468600878030158e-06, "loss": 0.0045, "step": 1720 }, { "epoch": 0.11007189667239295, "grad_norm": 0.09423828125, "learning_rate": 5.5004135649296944e-06, "loss": 0.0034, "step": 1730 }, { "epoch": 0.11070815041038366, "grad_norm": 0.421875, "learning_rate": 5.53222625182923e-06, "loss": 0.0067, "step": 1740 }, { "epoch": 0.11134440414837438, "grad_norm": 0.455078125, "learning_rate": 5.564038938728765e-06, "loss": 0.0162, "step": 1750 }, { "epoch": 0.11198065788636508, "grad_norm": 0.5, "learning_rate": 5.595851625628301e-06, "loss": 0.01, "step": 1760 }, { "epoch": 0.11261691162435579, "grad_norm": 3.328125, "learning_rate": 5.627664312527836e-06, "loss": 0.0216, "step": 1770 }, { "epoch": 0.1132531653623465, "grad_norm": 0.091796875, "learning_rate": 5.659476999427372e-06, "loss": 0.0031, "step": 1780 }, { "epoch": 0.11388941910033722, "grad_norm": 8.8125, "learning_rate": 5.691289686326907e-06, "loss": 0.0091, "step": 1790 }, { "epoch": 0.11452567283832793, "grad_norm": 15.25, "learning_rate": 5.723102373226443e-06, "loss": 0.0065, "step": 1800 }, { "epoch": 0.11516192657631863, "grad_norm": 2.265625, "learning_rate": 5.754915060125979e-06, "loss": 0.0246, "step": 1810 }, { "epoch": 0.11579818031430934, "grad_norm": 4.375, "learning_rate": 5.7867277470255134e-06, "loss": 0.0094, "step": 1820 }, { "epoch": 0.11643443405230006, "grad_norm": 0.30078125, "learning_rate": 5.81854043392505e-06, "loss": 0.0076, "step": 1830 }, { "epoch": 0.11707068779029077, "grad_norm": 11.6875, "learning_rate": 5.850353120824585e-06, "loss": 0.0075, "step": 1840 }, { "epoch": 0.11770694152828148, "grad_norm": 2.140625, "learning_rate": 5.882165807724121e-06, "loss": 0.008, "step": 1850 }, { "epoch": 0.11834319526627218, "grad_norm": 0.2470703125, "learning_rate": 5.9139784946236566e-06, "loss": 0.0032, "step": 1860 }, { "epoch": 0.1189794490042629, "grad_norm": 16.5, "learning_rate": 5.945791181523191e-06, "loss": 0.007, "step": 1870 }, { "epoch": 0.11961570274225361, "grad_norm": 4.03125, "learning_rate": 5.977603868422727e-06, "loss": 0.0163, "step": 1880 }, { "epoch": 0.12025195648024432, "grad_norm": 12.5, "learning_rate": 6.009416555322263e-06, "loss": 0.0125, "step": 1890 }, { "epoch": 0.12088821021823504, "grad_norm": 1.859375, "learning_rate": 6.041229242221799e-06, "loss": 0.0046, "step": 1900 }, { "epoch": 0.12152446395622574, "grad_norm": 0.1640625, "learning_rate": 6.073041929121334e-06, "loss": 0.0082, "step": 1910 }, { "epoch": 0.12216071769421645, "grad_norm": 0.44140625, "learning_rate": 6.1048546160208695e-06, "loss": 0.019, "step": 1920 }, { "epoch": 0.12279697143220716, "grad_norm": 0.181640625, "learning_rate": 6.136667302920405e-06, "loss": 0.0108, "step": 1930 }, { "epoch": 0.12343322517019788, "grad_norm": 3.203125, "learning_rate": 6.16847998981994e-06, "loss": 0.0062, "step": 1940 }, { "epoch": 0.12406947890818859, "grad_norm": 0.0181884765625, "learning_rate": 6.200292676719476e-06, "loss": 0.005, "step": 1950 }, { "epoch": 0.12470573264617929, "grad_norm": 0.6171875, "learning_rate": 6.232105363619012e-06, "loss": 0.0197, "step": 1960 }, { "epoch": 0.12534198638417002, "grad_norm": 0.271484375, "learning_rate": 6.263918050518547e-06, "loss": 0.0047, "step": 1970 }, { "epoch": 0.12597824012216072, "grad_norm": 1.2265625, "learning_rate": 6.295730737418083e-06, "loss": 0.002, "step": 1980 }, { "epoch": 0.12661449386015142, "grad_norm": 5.625, "learning_rate": 6.327543424317618e-06, "loss": 0.007, "step": 1990 }, { "epoch": 0.12725074759814214, "grad_norm": 0.6015625, "learning_rate": 6.359356111217153e-06, "loss": 0.0066, "step": 2000 }, { "epoch": 0.12788700133613284, "grad_norm": 1.4296875, "learning_rate": 6.391168798116689e-06, "loss": 0.0267, "step": 2010 }, { "epoch": 0.12852325507412357, "grad_norm": 0.412109375, "learning_rate": 6.422981485016225e-06, "loss": 0.0099, "step": 2020 }, { "epoch": 0.12915950881211427, "grad_norm": 7.0625, "learning_rate": 6.454794171915761e-06, "loss": 0.0083, "step": 2030 }, { "epoch": 0.12979576255010497, "grad_norm": 2.15625, "learning_rate": 6.486606858815296e-06, "loss": 0.0087, "step": 2040 }, { "epoch": 0.1304320162880957, "grad_norm": 6.78125, "learning_rate": 6.518419545714831e-06, "loss": 0.0087, "step": 2050 }, { "epoch": 0.1310682700260864, "grad_norm": 0.08154296875, "learning_rate": 6.550232232614367e-06, "loss": 0.01, "step": 2060 }, { "epoch": 0.13170452376407712, "grad_norm": 0.034423828125, "learning_rate": 6.582044919513902e-06, "loss": 0.0148, "step": 2070 }, { "epoch": 0.13234077750206782, "grad_norm": 4.4375, "learning_rate": 6.6138576064134385e-06, "loss": 0.0161, "step": 2080 }, { "epoch": 0.13297703124005852, "grad_norm": 0.09765625, "learning_rate": 6.645670293312974e-06, "loss": 0.0089, "step": 2090 }, { "epoch": 0.13361328497804925, "grad_norm": 0.93359375, "learning_rate": 6.677482980212508e-06, "loss": 0.0028, "step": 2100 }, { "epoch": 0.13424953871603995, "grad_norm": 25.5, "learning_rate": 6.709295667112045e-06, "loss": 0.0252, "step": 2110 }, { "epoch": 0.13488579245403068, "grad_norm": 0.453125, "learning_rate": 6.74110835401158e-06, "loss": 0.0216, "step": 2120 }, { "epoch": 0.13552204619202138, "grad_norm": 0.1875, "learning_rate": 6.772921040911116e-06, "loss": 0.0043, "step": 2130 }, { "epoch": 0.13615829993001208, "grad_norm": 0.294921875, "learning_rate": 6.8047337278106515e-06, "loss": 0.0012, "step": 2140 }, { "epoch": 0.1367945536680028, "grad_norm": 0.54296875, "learning_rate": 6.836546414710186e-06, "loss": 0.0078, "step": 2150 }, { "epoch": 0.1374308074059935, "grad_norm": 8.3125, "learning_rate": 6.868359101609722e-06, "loss": 0.0051, "step": 2160 }, { "epoch": 0.13806706114398423, "grad_norm": 0.0908203125, "learning_rate": 6.9001717885092576e-06, "loss": 0.006, "step": 2170 }, { "epoch": 0.13870331488197493, "grad_norm": 0.083984375, "learning_rate": 6.931984475408794e-06, "loss": 0.0048, "step": 2180 }, { "epoch": 0.13933956861996563, "grad_norm": 0.1533203125, "learning_rate": 6.963797162308329e-06, "loss": 0.0025, "step": 2190 }, { "epoch": 0.13997582235795636, "grad_norm": 0.0029754638671875, "learning_rate": 6.995609849207864e-06, "loss": 0.0119, "step": 2200 }, { "epoch": 0.14061207609594706, "grad_norm": 5.40625, "learning_rate": 7.027422536107401e-06, "loss": 0.0045, "step": 2210 }, { "epoch": 0.14124832983393779, "grad_norm": 3.078125, "learning_rate": 7.059235223006935e-06, "loss": 0.0062, "step": 2220 }, { "epoch": 0.14188458357192849, "grad_norm": 17.125, "learning_rate": 7.091047909906471e-06, "loss": 0.0237, "step": 2230 }, { "epoch": 0.14252083730991918, "grad_norm": 15.75, "learning_rate": 7.122860596806007e-06, "loss": 0.0089, "step": 2240 }, { "epoch": 0.1431570910479099, "grad_norm": 0.06494140625, "learning_rate": 7.154673283705541e-06, "loss": 0.0016, "step": 2250 }, { "epoch": 0.1437933447859006, "grad_norm": 7.75, "learning_rate": 7.186485970605078e-06, "loss": 0.0085, "step": 2260 }, { "epoch": 0.14442959852389134, "grad_norm": 3.953125, "learning_rate": 7.218298657504613e-06, "loss": 0.0032, "step": 2270 }, { "epoch": 0.14506585226188204, "grad_norm": 0.5078125, "learning_rate": 7.250111344404149e-06, "loss": 0.0138, "step": 2280 }, { "epoch": 0.14570210599987274, "grad_norm": 1.2265625, "learning_rate": 7.281924031303684e-06, "loss": 0.0046, "step": 2290 }, { "epoch": 0.14633835973786347, "grad_norm": 0.09326171875, "learning_rate": 7.31373671820322e-06, "loss": 0.0026, "step": 2300 }, { "epoch": 0.14697461347585417, "grad_norm": 0.423828125, "learning_rate": 7.345549405102756e-06, "loss": 0.0014, "step": 2310 }, { "epoch": 0.1476108672138449, "grad_norm": 0.171875, "learning_rate": 7.37736209200229e-06, "loss": 0.0019, "step": 2320 }, { "epoch": 0.1482471209518356, "grad_norm": 6.84375, "learning_rate": 7.4091747789018274e-06, "loss": 0.0041, "step": 2330 }, { "epoch": 0.1488833746898263, "grad_norm": 0.546875, "learning_rate": 7.440987465801362e-06, "loss": 0.018, "step": 2340 }, { "epoch": 0.14951962842781702, "grad_norm": 0.07861328125, "learning_rate": 7.472800152700897e-06, "loss": 0.0121, "step": 2350 }, { "epoch": 0.15015588216580772, "grad_norm": 2.734375, "learning_rate": 7.5046128396004335e-06, "loss": 0.0055, "step": 2360 }, { "epoch": 0.15079213590379845, "grad_norm": 0.3984375, "learning_rate": 7.536425526499968e-06, "loss": 0.003, "step": 2370 }, { "epoch": 0.15142838964178915, "grad_norm": 10.875, "learning_rate": 7.568238213399505e-06, "loss": 0.0104, "step": 2380 }, { "epoch": 0.15206464337977985, "grad_norm": 3.34375, "learning_rate": 7.6000509002990395e-06, "loss": 0.0017, "step": 2390 }, { "epoch": 0.15270089711777057, "grad_norm": 1.0390625, "learning_rate": 7.631863587198575e-06, "loss": 0.0159, "step": 2400 }, { "epoch": 0.15333715085576127, "grad_norm": 0.007080078125, "learning_rate": 7.66367627409811e-06, "loss": 0.0043, "step": 2410 }, { "epoch": 0.153973404593752, "grad_norm": 14.625, "learning_rate": 7.695488960997646e-06, "loss": 0.0137, "step": 2420 }, { "epoch": 0.1546096583317427, "grad_norm": 0.361328125, "learning_rate": 7.727301647897183e-06, "loss": 0.0065, "step": 2430 }, { "epoch": 0.1552459120697334, "grad_norm": 0.8828125, "learning_rate": 7.759114334796718e-06, "loss": 0.0038, "step": 2440 }, { "epoch": 0.15588216580772413, "grad_norm": 1.28125, "learning_rate": 7.790927021696252e-06, "loss": 0.0119, "step": 2450 }, { "epoch": 0.15651841954571483, "grad_norm": 0.10498046875, "learning_rate": 7.822739708595789e-06, "loss": 0.0045, "step": 2460 }, { "epoch": 0.15715467328370555, "grad_norm": 1.8671875, "learning_rate": 7.854552395495324e-06, "loss": 0.0078, "step": 2470 }, { "epoch": 0.15779092702169625, "grad_norm": 2.046875, "learning_rate": 7.88636508239486e-06, "loss": 0.001, "step": 2480 }, { "epoch": 0.15842718075968695, "grad_norm": 0.765625, "learning_rate": 7.918177769294395e-06, "loss": 0.0085, "step": 2490 }, { "epoch": 0.15906343449767768, "grad_norm": 0.94140625, "learning_rate": 7.94999045619393e-06, "loss": 0.0116, "step": 2500 }, { "epoch": 0.15969968823566838, "grad_norm": 0.6015625, "learning_rate": 7.981803143093467e-06, "loss": 0.0098, "step": 2510 }, { "epoch": 0.1603359419736591, "grad_norm": 0.16796875, "learning_rate": 8.013615829993e-06, "loss": 0.0027, "step": 2520 }, { "epoch": 0.1609721957116498, "grad_norm": 0.1337890625, "learning_rate": 8.045428516892538e-06, "loss": 0.0009, "step": 2530 }, { "epoch": 0.1616084494496405, "grad_norm": 1.21875, "learning_rate": 8.077241203792073e-06, "loss": 0.0026, "step": 2540 }, { "epoch": 0.16224470318763123, "grad_norm": 0.4140625, "learning_rate": 8.109053890691609e-06, "loss": 0.0031, "step": 2550 }, { "epoch": 0.16288095692562193, "grad_norm": 13.0, "learning_rate": 8.140866577591144e-06, "loss": 0.0126, "step": 2560 }, { "epoch": 0.16351721066361266, "grad_norm": 0.026123046875, "learning_rate": 8.17267926449068e-06, "loss": 0.0011, "step": 2570 }, { "epoch": 0.16415346440160336, "grad_norm": 0.2001953125, "learning_rate": 8.204491951390216e-06, "loss": 0.0029, "step": 2580 }, { "epoch": 0.16478971813959406, "grad_norm": 1.140625, "learning_rate": 8.23630463828975e-06, "loss": 0.0019, "step": 2590 }, { "epoch": 0.1654259718775848, "grad_norm": 4.34375, "learning_rate": 8.268117325189285e-06, "loss": 0.0075, "step": 2600 }, { "epoch": 0.1660622256155755, "grad_norm": 0.322265625, "learning_rate": 8.299930012088822e-06, "loss": 0.0027, "step": 2610 }, { "epoch": 0.16669847935356621, "grad_norm": 0.62890625, "learning_rate": 8.331742698988356e-06, "loss": 0.0168, "step": 2620 }, { "epoch": 0.16733473309155691, "grad_norm": 2.515625, "learning_rate": 8.363555385887893e-06, "loss": 0.0079, "step": 2630 }, { "epoch": 0.1679709868295476, "grad_norm": 0.29296875, "learning_rate": 8.395368072787428e-06, "loss": 0.0062, "step": 2640 }, { "epoch": 0.16860724056753834, "grad_norm": 0.59375, "learning_rate": 8.427180759686964e-06, "loss": 0.0087, "step": 2650 }, { "epoch": 0.16924349430552904, "grad_norm": 0.1474609375, "learning_rate": 8.458993446586499e-06, "loss": 0.0032, "step": 2660 }, { "epoch": 0.16987974804351977, "grad_norm": 1.6796875, "learning_rate": 8.490806133486034e-06, "loss": 0.0038, "step": 2670 }, { "epoch": 0.17051600178151047, "grad_norm": 0.8203125, "learning_rate": 8.522618820385572e-06, "loss": 0.0062, "step": 2680 }, { "epoch": 0.17115225551950117, "grad_norm": 2.71875, "learning_rate": 8.554431507285105e-06, "loss": 0.006, "step": 2690 }, { "epoch": 0.1717885092574919, "grad_norm": 9.5, "learning_rate": 8.58624419418464e-06, "loss": 0.0045, "step": 2700 }, { "epoch": 0.1724247629954826, "grad_norm": 0.048095703125, "learning_rate": 8.618056881084178e-06, "loss": 0.0063, "step": 2710 }, { "epoch": 0.17306101673347332, "grad_norm": 0.1025390625, "learning_rate": 8.649869567983713e-06, "loss": 0.0011, "step": 2720 }, { "epoch": 0.17369727047146402, "grad_norm": 0.10205078125, "learning_rate": 8.681682254883248e-06, "loss": 0.003, "step": 2730 }, { "epoch": 0.17433352420945472, "grad_norm": 0.1484375, "learning_rate": 8.713494941782784e-06, "loss": 0.0086, "step": 2740 }, { "epoch": 0.17496977794744545, "grad_norm": 51.5, "learning_rate": 8.745307628682319e-06, "loss": 0.0145, "step": 2750 }, { "epoch": 0.17560603168543615, "grad_norm": 0.0419921875, "learning_rate": 8.777120315581854e-06, "loss": 0.0208, "step": 2760 }, { "epoch": 0.17624228542342688, "grad_norm": 0.07275390625, "learning_rate": 8.80893300248139e-06, "loss": 0.0121, "step": 2770 }, { "epoch": 0.17687853916141758, "grad_norm": 0.072265625, "learning_rate": 8.840745689380925e-06, "loss": 0.0005, "step": 2780 }, { "epoch": 0.17751479289940827, "grad_norm": 0.7578125, "learning_rate": 8.87255837628046e-06, "loss": 0.0093, "step": 2790 }, { "epoch": 0.178151046637399, "grad_norm": 2.484375, "learning_rate": 8.904371063179996e-06, "loss": 0.01, "step": 2800 }, { "epoch": 0.1787873003753897, "grad_norm": 9.0625, "learning_rate": 8.936183750079533e-06, "loss": 0.0051, "step": 2810 }, { "epoch": 0.17942355411338043, "grad_norm": 0.3984375, "learning_rate": 8.967996436979068e-06, "loss": 0.0017, "step": 2820 }, { "epoch": 0.18005980785137113, "grad_norm": 0.04443359375, "learning_rate": 8.999809123878602e-06, "loss": 0.0045, "step": 2830 }, { "epoch": 0.18069606158936183, "grad_norm": 1.84375, "learning_rate": 9.031621810778139e-06, "loss": 0.0043, "step": 2840 }, { "epoch": 0.18133231532735256, "grad_norm": 0.12890625, "learning_rate": 9.063434497677674e-06, "loss": 0.0015, "step": 2850 }, { "epoch": 0.18196856906534326, "grad_norm": 0.859375, "learning_rate": 9.09524718457721e-06, "loss": 0.001, "step": 2860 }, { "epoch": 0.18260482280333398, "grad_norm": 0.546875, "learning_rate": 9.127059871476745e-06, "loss": 0.0057, "step": 2870 }, { "epoch": 0.18324107654132468, "grad_norm": 2.546875, "learning_rate": 9.15887255837628e-06, "loss": 0.0119, "step": 2880 }, { "epoch": 0.18387733027931538, "grad_norm": 0.421875, "learning_rate": 9.190685245275817e-06, "loss": 0.0027, "step": 2890 }, { "epoch": 0.1845135840173061, "grad_norm": 1.0234375, "learning_rate": 9.222497932175351e-06, "loss": 0.0014, "step": 2900 }, { "epoch": 0.1851498377552968, "grad_norm": 0.322265625, "learning_rate": 9.254310619074888e-06, "loss": 0.001, "step": 2910 }, { "epoch": 0.18578609149328754, "grad_norm": 0.0242919921875, "learning_rate": 9.286123305974423e-06, "loss": 0.012, "step": 2920 }, { "epoch": 0.18642234523127824, "grad_norm": 0.21875, "learning_rate": 9.317935992873959e-06, "loss": 0.0191, "step": 2930 }, { "epoch": 0.18705859896926894, "grad_norm": 0.38671875, "learning_rate": 9.349748679773494e-06, "loss": 0.008, "step": 2940 }, { "epoch": 0.18769485270725966, "grad_norm": 3.34375, "learning_rate": 9.38156136667303e-06, "loss": 0.0024, "step": 2950 }, { "epoch": 0.18833110644525036, "grad_norm": 0.30859375, "learning_rate": 9.413374053572565e-06, "loss": 0.0035, "step": 2960 }, { "epoch": 0.1889673601832411, "grad_norm": 0.130859375, "learning_rate": 9.4451867404721e-06, "loss": 0.009, "step": 2970 }, { "epoch": 0.1896036139212318, "grad_norm": 0.494140625, "learning_rate": 9.476999427371635e-06, "loss": 0.0176, "step": 2980 }, { "epoch": 0.1902398676592225, "grad_norm": 4.5625, "learning_rate": 9.508812114271173e-06, "loss": 0.0045, "step": 2990 }, { "epoch": 0.19087612139721322, "grad_norm": 0.16796875, "learning_rate": 9.540624801170706e-06, "loss": 0.0017, "step": 3000 }, { "epoch": 0.19151237513520392, "grad_norm": 2.15625, "learning_rate": 9.572437488070243e-06, "loss": 0.0014, "step": 3010 }, { "epoch": 0.19214862887319464, "grad_norm": 0.369140625, "learning_rate": 9.604250174969779e-06, "loss": 0.0115, "step": 3020 }, { "epoch": 0.19278488261118534, "grad_norm": 3.078125, "learning_rate": 9.636062861869314e-06, "loss": 0.0033, "step": 3030 }, { "epoch": 0.19342113634917604, "grad_norm": 2.140625, "learning_rate": 9.66787554876885e-06, "loss": 0.0069, "step": 3040 }, { "epoch": 0.19405739008716677, "grad_norm": 0.029541015625, "learning_rate": 9.699688235668385e-06, "loss": 0.0034, "step": 3050 }, { "epoch": 0.19469364382515747, "grad_norm": 0.0167236328125, "learning_rate": 9.731500922567922e-06, "loss": 0.0008, "step": 3060 }, { "epoch": 0.1953298975631482, "grad_norm": 0.19140625, "learning_rate": 9.763313609467455e-06, "loss": 0.0028, "step": 3070 }, { "epoch": 0.1959661513011389, "grad_norm": 0.150390625, "learning_rate": 9.79512629636699e-06, "loss": 0.0008, "step": 3080 }, { "epoch": 0.1966024050391296, "grad_norm": 0.169921875, "learning_rate": 9.826938983266528e-06, "loss": 0.002, "step": 3090 }, { "epoch": 0.19723865877712032, "grad_norm": 0.244140625, "learning_rate": 9.858751670166063e-06, "loss": 0.001, "step": 3100 }, { "epoch": 0.19787491251511102, "grad_norm": 0.1240234375, "learning_rate": 9.890564357065598e-06, "loss": 0.0121, "step": 3110 }, { "epoch": 0.19851116625310175, "grad_norm": 0.2001953125, "learning_rate": 9.922377043965134e-06, "loss": 0.0016, "step": 3120 }, { "epoch": 0.19914741999109245, "grad_norm": 1.484375, "learning_rate": 9.95418973086467e-06, "loss": 0.0016, "step": 3130 }, { "epoch": 0.19978367372908315, "grad_norm": 0.1044921875, "learning_rate": 9.986002417764205e-06, "loss": 0.0038, "step": 3140 }, { "epoch": 0.20041992746707388, "grad_norm": 0.169921875, "learning_rate": 1.001781510466374e-05, "loss": 0.0012, "step": 3150 }, { "epoch": 0.20105618120506458, "grad_norm": 0.875, "learning_rate": 1.0049627791563277e-05, "loss": 0.0077, "step": 3160 }, { "epoch": 0.2016924349430553, "grad_norm": 0.0250244140625, "learning_rate": 1.008144047846281e-05, "loss": 0.021, "step": 3170 }, { "epoch": 0.202328688681046, "grad_norm": 1.34375, "learning_rate": 1.0113253165362346e-05, "loss": 0.0048, "step": 3180 }, { "epoch": 0.2029649424190367, "grad_norm": 16.125, "learning_rate": 1.0145065852261883e-05, "loss": 0.0107, "step": 3190 }, { "epoch": 0.20360119615702743, "grad_norm": 0.8046875, "learning_rate": 1.0176878539161418e-05, "loss": 0.0069, "step": 3200 }, { "epoch": 0.20423744989501813, "grad_norm": 13.8125, "learning_rate": 1.0208691226060954e-05, "loss": 0.0103, "step": 3210 }, { "epoch": 0.20487370363300883, "grad_norm": 0.0281982421875, "learning_rate": 1.0240503912960489e-05, "loss": 0.0013, "step": 3220 }, { "epoch": 0.20550995737099956, "grad_norm": 0.0216064453125, "learning_rate": 1.0272316599860024e-05, "loss": 0.0042, "step": 3230 }, { "epoch": 0.20614621110899026, "grad_norm": 0.028076171875, "learning_rate": 1.030412928675956e-05, "loss": 0.0013, "step": 3240 }, { "epoch": 0.20678246484698098, "grad_norm": 0.036865234375, "learning_rate": 1.0335941973659095e-05, "loss": 0.0103, "step": 3250 }, { "epoch": 0.20741871858497168, "grad_norm": 2.828125, "learning_rate": 1.0367754660558632e-05, "loss": 0.003, "step": 3260 }, { "epoch": 0.20805497232296238, "grad_norm": 1.28125, "learning_rate": 1.0399567347458167e-05, "loss": 0.0069, "step": 3270 }, { "epoch": 0.2086912260609531, "grad_norm": 0.06591796875, "learning_rate": 1.0431380034357701e-05, "loss": 0.0032, "step": 3280 }, { "epoch": 0.2093274797989438, "grad_norm": 0.055908203125, "learning_rate": 1.0463192721257238e-05, "loss": 0.0097, "step": 3290 }, { "epoch": 0.20996373353693454, "grad_norm": 0.099609375, "learning_rate": 1.0495005408156774e-05, "loss": 0.0196, "step": 3300 }, { "epoch": 0.21059998727492524, "grad_norm": 0.03515625, "learning_rate": 1.0526818095056309e-05, "loss": 0.0064, "step": 3310 }, { "epoch": 0.21123624101291594, "grad_norm": 0.0078125, "learning_rate": 1.0558630781955844e-05, "loss": 0.0017, "step": 3320 }, { "epoch": 0.21187249475090666, "grad_norm": 0.318359375, "learning_rate": 1.059044346885538e-05, "loss": 0.0211, "step": 3330 }, { "epoch": 0.21250874848889736, "grad_norm": 0.171875, "learning_rate": 1.0622256155754915e-05, "loss": 0.0079, "step": 3340 }, { "epoch": 0.2131450022268881, "grad_norm": 0.05078125, "learning_rate": 1.065406884265445e-05, "loss": 0.0043, "step": 3350 }, { "epoch": 0.2137812559648788, "grad_norm": 0.65234375, "learning_rate": 1.0685881529553987e-05, "loss": 0.0081, "step": 3360 }, { "epoch": 0.2144175097028695, "grad_norm": 0.02490234375, "learning_rate": 1.0717694216453523e-05, "loss": 0.0026, "step": 3370 }, { "epoch": 0.21505376344086022, "grad_norm": 2.390625, "learning_rate": 1.0749506903353056e-05, "loss": 0.004, "step": 3380 }, { "epoch": 0.21569001717885092, "grad_norm": 3.921875, "learning_rate": 1.0781319590252593e-05, "loss": 0.0034, "step": 3390 }, { "epoch": 0.21632627091684165, "grad_norm": 0.1376953125, "learning_rate": 1.0813132277152129e-05, "loss": 0.001, "step": 3400 }, { "epoch": 0.21696252465483234, "grad_norm": 15.1875, "learning_rate": 1.0844944964051664e-05, "loss": 0.008, "step": 3410 }, { "epoch": 0.21759877839282304, "grad_norm": 0.034423828125, "learning_rate": 1.08767576509512e-05, "loss": 0.0017, "step": 3420 }, { "epoch": 0.21823503213081377, "grad_norm": 1.46875, "learning_rate": 1.0908570337850735e-05, "loss": 0.0087, "step": 3430 }, { "epoch": 0.21887128586880447, "grad_norm": 0.060302734375, "learning_rate": 1.0940383024750272e-05, "loss": 0.0059, "step": 3440 }, { "epoch": 0.2195075396067952, "grad_norm": 0.007293701171875, "learning_rate": 1.0972195711649806e-05, "loss": 0.0006, "step": 3450 }, { "epoch": 0.2201437933447859, "grad_norm": 3.140625, "learning_rate": 1.1004008398549343e-05, "loss": 0.0048, "step": 3460 }, { "epoch": 0.2207800470827766, "grad_norm": 0.06689453125, "learning_rate": 1.1035821085448878e-05, "loss": 0.0013, "step": 3470 }, { "epoch": 0.22141630082076733, "grad_norm": 0.0218505859375, "learning_rate": 1.1067633772348413e-05, "loss": 0.0014, "step": 3480 }, { "epoch": 0.22205255455875803, "grad_norm": 0.03759765625, "learning_rate": 1.1099446459247949e-05, "loss": 0.0015, "step": 3490 }, { "epoch": 0.22268880829674875, "grad_norm": 0.33984375, "learning_rate": 1.1131259146147484e-05, "loss": 0.0041, "step": 3500 }, { "epoch": 0.22332506203473945, "grad_norm": 0.037109375, "learning_rate": 1.1163071833047021e-05, "loss": 0.0067, "step": 3510 }, { "epoch": 0.22396131577273015, "grad_norm": 19.0, "learning_rate": 1.1194884519946555e-05, "loss": 0.009, "step": 3520 }, { "epoch": 0.22459756951072088, "grad_norm": 0.6015625, "learning_rate": 1.122669720684609e-05, "loss": 0.0074, "step": 3530 }, { "epoch": 0.22523382324871158, "grad_norm": 0.08984375, "learning_rate": 1.1258509893745627e-05, "loss": 0.0008, "step": 3540 }, { "epoch": 0.2258700769867023, "grad_norm": 0.041259765625, "learning_rate": 1.129032258064516e-05, "loss": 0.002, "step": 3550 }, { "epoch": 0.226506330724693, "grad_norm": 0.38671875, "learning_rate": 1.1322135267544698e-05, "loss": 0.0066, "step": 3560 }, { "epoch": 0.2271425844626837, "grad_norm": 0.06689453125, "learning_rate": 1.1353947954444233e-05, "loss": 0.0064, "step": 3570 }, { "epoch": 0.22777883820067443, "grad_norm": 0.0118408203125, "learning_rate": 1.1385760641343768e-05, "loss": 0.0119, "step": 3580 }, { "epoch": 0.22841509193866513, "grad_norm": 1.6640625, "learning_rate": 1.1417573328243304e-05, "loss": 0.0105, "step": 3590 }, { "epoch": 0.22905134567665586, "grad_norm": 0.016845703125, "learning_rate": 1.144938601514284e-05, "loss": 0.0024, "step": 3600 }, { "epoch": 0.22968759941464656, "grad_norm": 13.75, "learning_rate": 1.1481198702042376e-05, "loss": 0.0124, "step": 3610 }, { "epoch": 0.23032385315263726, "grad_norm": 0.02001953125, "learning_rate": 1.151301138894191e-05, "loss": 0.0072, "step": 3620 }, { "epoch": 0.230960106890628, "grad_norm": 0.62109375, "learning_rate": 1.1544824075841445e-05, "loss": 0.001, "step": 3630 }, { "epoch": 0.23159636062861869, "grad_norm": 0.06884765625, "learning_rate": 1.1576636762740982e-05, "loss": 0.0047, "step": 3640 }, { "epoch": 0.2322326143666094, "grad_norm": 0.1513671875, "learning_rate": 1.1608449449640518e-05, "loss": 0.0051, "step": 3650 }, { "epoch": 0.2328688681046001, "grad_norm": 0.5859375, "learning_rate": 1.1640262136540053e-05, "loss": 0.0036, "step": 3660 }, { "epoch": 0.2335051218425908, "grad_norm": 0.02392578125, "learning_rate": 1.1672074823439588e-05, "loss": 0.0079, "step": 3670 }, { "epoch": 0.23414137558058154, "grad_norm": 0.072265625, "learning_rate": 1.1703887510339124e-05, "loss": 0.0021, "step": 3680 }, { "epoch": 0.23477762931857224, "grad_norm": 3.390625, "learning_rate": 1.1735700197238659e-05, "loss": 0.0019, "step": 3690 }, { "epoch": 0.23541388305656297, "grad_norm": 0.1748046875, "learning_rate": 1.1767512884138194e-05, "loss": 0.0006, "step": 3700 }, { "epoch": 0.23605013679455367, "grad_norm": 2.96875, "learning_rate": 1.1799325571037731e-05, "loss": 0.0091, "step": 3710 }, { "epoch": 0.23668639053254437, "grad_norm": 0.12255859375, "learning_rate": 1.1831138257937265e-05, "loss": 0.0035, "step": 3720 }, { "epoch": 0.2373226442705351, "grad_norm": 0.361328125, "learning_rate": 1.18629509448368e-05, "loss": 0.0019, "step": 3730 }, { "epoch": 0.2379588980085258, "grad_norm": 1.0546875, "learning_rate": 1.1894763631736338e-05, "loss": 0.0024, "step": 3740 }, { "epoch": 0.23859515174651652, "grad_norm": 0.46484375, "learning_rate": 1.1926576318635873e-05, "loss": 0.0046, "step": 3750 }, { "epoch": 0.23923140548450722, "grad_norm": 0.140625, "learning_rate": 1.1958389005535408e-05, "loss": 0.0004, "step": 3760 }, { "epoch": 0.23986765922249792, "grad_norm": 5.375, "learning_rate": 1.1990201692434944e-05, "loss": 0.0109, "step": 3770 }, { "epoch": 0.24050391296048865, "grad_norm": 0.054443359375, "learning_rate": 1.2022014379334479e-05, "loss": 0.0088, "step": 3780 }, { "epoch": 0.24114016669847935, "grad_norm": 0.0137939453125, "learning_rate": 1.2053827066234014e-05, "loss": 0.014, "step": 3790 }, { "epoch": 0.24177642043647007, "grad_norm": 0.023193359375, "learning_rate": 1.208563975313355e-05, "loss": 0.008, "step": 3800 }, { "epoch": 0.24241267417446077, "grad_norm": 0.39453125, "learning_rate": 1.2117452440033087e-05, "loss": 0.0053, "step": 3810 }, { "epoch": 0.24304892791245147, "grad_norm": 0.3203125, "learning_rate": 1.2149265126932622e-05, "loss": 0.0009, "step": 3820 }, { "epoch": 0.2436851816504422, "grad_norm": 0.01513671875, "learning_rate": 1.2181077813832156e-05, "loss": 0.0018, "step": 3830 }, { "epoch": 0.2443214353884329, "grad_norm": 0.38671875, "learning_rate": 1.2212890500731693e-05, "loss": 0.0017, "step": 3840 }, { "epoch": 0.24495768912642363, "grad_norm": 0.1337890625, "learning_rate": 1.2244703187631228e-05, "loss": 0.0038, "step": 3850 }, { "epoch": 0.24559394286441433, "grad_norm": 0.005523681640625, "learning_rate": 1.2276515874530763e-05, "loss": 0.0062, "step": 3860 }, { "epoch": 0.24623019660240503, "grad_norm": 0.047119140625, "learning_rate": 1.2308328561430299e-05, "loss": 0.0057, "step": 3870 }, { "epoch": 0.24686645034039575, "grad_norm": 0.205078125, "learning_rate": 1.2340141248329834e-05, "loss": 0.0117, "step": 3880 }, { "epoch": 0.24750270407838645, "grad_norm": 15.3125, "learning_rate": 1.2371953935229371e-05, "loss": 0.0096, "step": 3890 }, { "epoch": 0.24813895781637718, "grad_norm": 0.1826171875, "learning_rate": 1.2403766622128905e-05, "loss": 0.0005, "step": 3900 }, { "epoch": 0.24877521155436788, "grad_norm": 0.19921875, "learning_rate": 1.2435579309028442e-05, "loss": 0.0007, "step": 3910 }, { "epoch": 0.24941146529235858, "grad_norm": 0.400390625, "learning_rate": 1.2467391995927977e-05, "loss": 0.0017, "step": 3920 }, { "epoch": 0.2500477190303493, "grad_norm": 0.00677490234375, "learning_rate": 1.2499204682827511e-05, "loss": 0.0008, "step": 3930 }, { "epoch": 0.25068397276834004, "grad_norm": 0.058837890625, "learning_rate": 1.2531017369727046e-05, "loss": 0.0032, "step": 3940 }, { "epoch": 0.25132022650633074, "grad_norm": 0.00885009765625, "learning_rate": 1.2562830056626585e-05, "loss": 0.0024, "step": 3950 }, { "epoch": 0.25195648024432143, "grad_norm": 0.07470703125, "learning_rate": 1.2594642743526119e-05, "loss": 0.0014, "step": 3960 }, { "epoch": 0.25259273398231213, "grad_norm": 7.65625, "learning_rate": 1.2626455430425654e-05, "loss": 0.0032, "step": 3970 }, { "epoch": 0.25322898772030283, "grad_norm": 0.13671875, "learning_rate": 1.265826811732519e-05, "loss": 0.0018, "step": 3980 }, { "epoch": 0.2538652414582936, "grad_norm": 0.0179443359375, "learning_rate": 1.2690080804224725e-05, "loss": 0.0073, "step": 3990 }, { "epoch": 0.2545014951962843, "grad_norm": 0.21875, "learning_rate": 1.2721893491124262e-05, "loss": 0.0027, "step": 4000 }, { "epoch": 0.255137748934275, "grad_norm": 0.359375, "learning_rate": 1.2753706178023797e-05, "loss": 0.0031, "step": 4010 }, { "epoch": 0.2557740026722657, "grad_norm": 0.392578125, "learning_rate": 1.2785518864923332e-05, "loss": 0.0085, "step": 4020 }, { "epoch": 0.2564102564102564, "grad_norm": 0.486328125, "learning_rate": 1.2817331551822868e-05, "loss": 0.001, "step": 4030 }, { "epoch": 0.25704651014824714, "grad_norm": 0.1337890625, "learning_rate": 1.2849144238722401e-05, "loss": 0.0014, "step": 4040 }, { "epoch": 0.25768276388623784, "grad_norm": 0.09423828125, "learning_rate": 1.288095692562194e-05, "loss": 0.0005, "step": 4050 }, { "epoch": 0.25831901762422854, "grad_norm": 0.12353515625, "learning_rate": 1.2912769612521476e-05, "loss": 0.0004, "step": 4060 }, { "epoch": 0.25895527136221924, "grad_norm": 0.0032806396484375, "learning_rate": 1.294458229942101e-05, "loss": 0.0049, "step": 4070 }, { "epoch": 0.25959152510020994, "grad_norm": 0.0120849609375, "learning_rate": 1.2976394986320545e-05, "loss": 0.0008, "step": 4080 }, { "epoch": 0.2602277788382007, "grad_norm": 0.2021484375, "learning_rate": 1.300820767322008e-05, "loss": 0.0005, "step": 4090 }, { "epoch": 0.2608640325761914, "grad_norm": 0.0693359375, "learning_rate": 1.3040020360119617e-05, "loss": 0.0028, "step": 4100 }, { "epoch": 0.2615002863141821, "grad_norm": 0.16796875, "learning_rate": 1.3071833047019152e-05, "loss": 0.003, "step": 4110 }, { "epoch": 0.2621365400521728, "grad_norm": 4.09375, "learning_rate": 1.3103645733918688e-05, "loss": 0.003, "step": 4120 }, { "epoch": 0.2627727937901635, "grad_norm": 0.0267333984375, "learning_rate": 1.3135458420818223e-05, "loss": 0.0008, "step": 4130 }, { "epoch": 0.26340904752815425, "grad_norm": 0.00860595703125, "learning_rate": 1.3167271107717757e-05, "loss": 0.0005, "step": 4140 }, { "epoch": 0.26404530126614495, "grad_norm": 0.01092529296875, "learning_rate": 1.3199083794617295e-05, "loss": 0.0037, "step": 4150 }, { "epoch": 0.26468155500413565, "grad_norm": 0.04931640625, "learning_rate": 1.323089648151683e-05, "loss": 0.0126, "step": 4160 }, { "epoch": 0.26531780874212635, "grad_norm": 0.375, "learning_rate": 1.3262709168416364e-05, "loss": 0.0072, "step": 4170 }, { "epoch": 0.26595406248011705, "grad_norm": 0.35546875, "learning_rate": 1.32945218553159e-05, "loss": 0.0085, "step": 4180 }, { "epoch": 0.2665903162181078, "grad_norm": 0.0247802734375, "learning_rate": 1.3326334542215435e-05, "loss": 0.0007, "step": 4190 }, { "epoch": 0.2672265699560985, "grad_norm": 0.08349609375, "learning_rate": 1.3358147229114972e-05, "loss": 0.0049, "step": 4200 }, { "epoch": 0.2678628236940892, "grad_norm": 0.1591796875, "learning_rate": 1.3389959916014508e-05, "loss": 0.007, "step": 4210 }, { "epoch": 0.2684990774320799, "grad_norm": 1.0546875, "learning_rate": 1.3421772602914043e-05, "loss": 0.0037, "step": 4220 }, { "epoch": 0.2691353311700706, "grad_norm": 4.53125, "learning_rate": 1.3453585289813578e-05, "loss": 0.013, "step": 4230 }, { "epoch": 0.26977158490806136, "grad_norm": 0.048828125, "learning_rate": 1.3485397976713114e-05, "loss": 0.005, "step": 4240 }, { "epoch": 0.27040783864605206, "grad_norm": 13.5625, "learning_rate": 1.351721066361265e-05, "loss": 0.0092, "step": 4250 }, { "epoch": 0.27104409238404276, "grad_norm": 0.053955078125, "learning_rate": 1.3549023350512186e-05, "loss": 0.0006, "step": 4260 }, { "epoch": 0.27168034612203346, "grad_norm": 0.173828125, "learning_rate": 1.358083603741172e-05, "loss": 0.0021, "step": 4270 }, { "epoch": 0.27231659986002416, "grad_norm": 8.25, "learning_rate": 1.3612648724311255e-05, "loss": 0.009, "step": 4280 }, { "epoch": 0.2729528535980149, "grad_norm": 8.6875, "learning_rate": 1.364446141121079e-05, "loss": 0.0101, "step": 4290 }, { "epoch": 0.2735891073360056, "grad_norm": 0.0849609375, "learning_rate": 1.3676274098110327e-05, "loss": 0.0014, "step": 4300 }, { "epoch": 0.2742253610739963, "grad_norm": 0.63671875, "learning_rate": 1.3708086785009863e-05, "loss": 0.0011, "step": 4310 }, { "epoch": 0.274861614811987, "grad_norm": 0.5, "learning_rate": 1.3739899471909398e-05, "loss": 0.0041, "step": 4320 }, { "epoch": 0.2754978685499777, "grad_norm": 2.265625, "learning_rate": 1.3771712158808933e-05, "loss": 0.0044, "step": 4330 }, { "epoch": 0.27613412228796846, "grad_norm": 1.2734375, "learning_rate": 1.3803524845708469e-05, "loss": 0.0018, "step": 4340 }, { "epoch": 0.27677037602595916, "grad_norm": 1.46875, "learning_rate": 1.3835337532608006e-05, "loss": 0.0147, "step": 4350 }, { "epoch": 0.27740662976394986, "grad_norm": 0.02685546875, "learning_rate": 1.3867150219507541e-05, "loss": 0.001, "step": 4360 }, { "epoch": 0.27804288350194056, "grad_norm": 0.052978515625, "learning_rate": 1.3898962906407077e-05, "loss": 0.0007, "step": 4370 }, { "epoch": 0.27867913723993126, "grad_norm": 3.9375, "learning_rate": 1.393077559330661e-05, "loss": 0.0055, "step": 4380 }, { "epoch": 0.279315390977922, "grad_norm": 0.0260009765625, "learning_rate": 1.3962588280206146e-05, "loss": 0.0015, "step": 4390 }, { "epoch": 0.2799516447159127, "grad_norm": 0.01806640625, "learning_rate": 1.3994400967105684e-05, "loss": 0.0051, "step": 4400 }, { "epoch": 0.2805878984539034, "grad_norm": 0.04736328125, "learning_rate": 1.4026213654005218e-05, "loss": 0.005, "step": 4410 }, { "epoch": 0.2812241521918941, "grad_norm": 2.78125, "learning_rate": 1.4058026340904753e-05, "loss": 0.0078, "step": 4420 }, { "epoch": 0.2818604059298848, "grad_norm": 2.828125, "learning_rate": 1.4089839027804289e-05, "loss": 0.0036, "step": 4430 }, { "epoch": 0.28249665966787557, "grad_norm": 0.09619140625, "learning_rate": 1.4121651714703824e-05, "loss": 0.0024, "step": 4440 }, { "epoch": 0.28313291340586627, "grad_norm": 0.1484375, "learning_rate": 1.4153464401603361e-05, "loss": 0.0012, "step": 4450 }, { "epoch": 0.28376916714385697, "grad_norm": 0.318359375, "learning_rate": 1.4185277088502896e-05, "loss": 0.0074, "step": 4460 }, { "epoch": 0.28440542088184767, "grad_norm": 0.00634765625, "learning_rate": 1.4217089775402432e-05, "loss": 0.0083, "step": 4470 }, { "epoch": 0.28504167461983837, "grad_norm": 4.4375, "learning_rate": 1.4248902462301965e-05, "loss": 0.0023, "step": 4480 }, { "epoch": 0.2856779283578291, "grad_norm": 0.1494140625, "learning_rate": 1.42807151492015e-05, "loss": 0.0095, "step": 4490 }, { "epoch": 0.2863141820958198, "grad_norm": 0.1826171875, "learning_rate": 1.431252783610104e-05, "loss": 0.0076, "step": 4500 }, { "epoch": 0.2869504358338105, "grad_norm": 0.5859375, "learning_rate": 1.4344340523000573e-05, "loss": 0.0012, "step": 4510 }, { "epoch": 0.2875866895718012, "grad_norm": 0.25390625, "learning_rate": 1.4376153209900109e-05, "loss": 0.004, "step": 4520 }, { "epoch": 0.2882229433097919, "grad_norm": 0.1494140625, "learning_rate": 1.4407965896799644e-05, "loss": 0.0014, "step": 4530 }, { "epoch": 0.2888591970477827, "grad_norm": 0.53515625, "learning_rate": 1.443977858369918e-05, "loss": 0.0012, "step": 4540 }, { "epoch": 0.2894954507857734, "grad_norm": 0.01483154296875, "learning_rate": 1.4471591270598716e-05, "loss": 0.0026, "step": 4550 }, { "epoch": 0.2901317045237641, "grad_norm": 0.035400390625, "learning_rate": 1.4503403957498252e-05, "loss": 0.001, "step": 4560 }, { "epoch": 0.2907679582617548, "grad_norm": 0.10205078125, "learning_rate": 1.4535216644397787e-05, "loss": 0.0089, "step": 4570 }, { "epoch": 0.2914042119997455, "grad_norm": 7.3125, "learning_rate": 1.4567029331297322e-05, "loss": 0.0063, "step": 4580 }, { "epoch": 0.29204046573773623, "grad_norm": 0.06689453125, "learning_rate": 1.4598842018196856e-05, "loss": 0.0044, "step": 4590 }, { "epoch": 0.29267671947572693, "grad_norm": 1.2890625, "learning_rate": 1.4630654705096395e-05, "loss": 0.003, "step": 4600 }, { "epoch": 0.29331297321371763, "grad_norm": 0.044921875, "learning_rate": 1.466246739199593e-05, "loss": 0.0061, "step": 4610 }, { "epoch": 0.29394922695170833, "grad_norm": 0.2138671875, "learning_rate": 1.4694280078895464e-05, "loss": 0.0065, "step": 4620 }, { "epoch": 0.29458548068969903, "grad_norm": 0.007781982421875, "learning_rate": 1.4726092765794999e-05, "loss": 0.0032, "step": 4630 }, { "epoch": 0.2952217344276898, "grad_norm": 10.375, "learning_rate": 1.4757905452694534e-05, "loss": 0.0099, "step": 4640 }, { "epoch": 0.2958579881656805, "grad_norm": 0.2236328125, "learning_rate": 1.4789718139594072e-05, "loss": 0.004, "step": 4650 }, { "epoch": 0.2964942419036712, "grad_norm": 5.84375, "learning_rate": 1.4821530826493607e-05, "loss": 0.0042, "step": 4660 }, { "epoch": 0.2971304956416619, "grad_norm": 0.056396484375, "learning_rate": 1.4853343513393142e-05, "loss": 0.0009, "step": 4670 }, { "epoch": 0.2977667493796526, "grad_norm": 10.25, "learning_rate": 1.4885156200292678e-05, "loss": 0.008, "step": 4680 }, { "epoch": 0.29840300311764334, "grad_norm": 0.0380859375, "learning_rate": 1.4916968887192211e-05, "loss": 0.0003, "step": 4690 }, { "epoch": 0.29903925685563404, "grad_norm": 1.265625, "learning_rate": 1.494878157409175e-05, "loss": 0.0018, "step": 4700 }, { "epoch": 0.29967551059362474, "grad_norm": 0.0206298828125, "learning_rate": 1.4980594260991285e-05, "loss": 0.0117, "step": 4710 }, { "epoch": 0.30031176433161544, "grad_norm": 0.01239013671875, "learning_rate": 1.5012406947890819e-05, "loss": 0.0009, "step": 4720 }, { "epoch": 0.30094801806960614, "grad_norm": 0.00848388671875, "learning_rate": 1.5044219634790354e-05, "loss": 0.0003, "step": 4730 }, { "epoch": 0.3015842718075969, "grad_norm": 0.02880859375, "learning_rate": 1.507603232168989e-05, "loss": 0.0005, "step": 4740 }, { "epoch": 0.3022205255455876, "grad_norm": 0.06884765625, "learning_rate": 1.5107845008589427e-05, "loss": 0.0011, "step": 4750 }, { "epoch": 0.3028567792835783, "grad_norm": 0.0303955078125, "learning_rate": 1.5139657695488962e-05, "loss": 0.0009, "step": 4760 }, { "epoch": 0.303493033021569, "grad_norm": 1.1484375, "learning_rate": 1.5171470382388497e-05, "loss": 0.0024, "step": 4770 }, { "epoch": 0.3041292867595597, "grad_norm": 0.01165771484375, "learning_rate": 1.5203283069288033e-05, "loss": 0.0009, "step": 4780 }, { "epoch": 0.30476554049755045, "grad_norm": 0.0439453125, "learning_rate": 1.5235095756187568e-05, "loss": 0.0023, "step": 4790 }, { "epoch": 0.30540179423554115, "grad_norm": 1.71875, "learning_rate": 1.5266908443087105e-05, "loss": 0.0018, "step": 4800 }, { "epoch": 0.30603804797353185, "grad_norm": 0.037109375, "learning_rate": 1.529872112998664e-05, "loss": 0.0059, "step": 4810 }, { "epoch": 0.30667430171152255, "grad_norm": 0.0595703125, "learning_rate": 1.5330533816886176e-05, "loss": 0.0038, "step": 4820 }, { "epoch": 0.30731055544951325, "grad_norm": 0.1123046875, "learning_rate": 1.536234650378571e-05, "loss": 0.0081, "step": 4830 }, { "epoch": 0.307946809187504, "grad_norm": 0.08642578125, "learning_rate": 1.5394159190685247e-05, "loss": 0.004, "step": 4840 }, { "epoch": 0.3085830629254947, "grad_norm": 0.158203125, "learning_rate": 1.5425971877584784e-05, "loss": 0.0087, "step": 4850 }, { "epoch": 0.3092193166634854, "grad_norm": 0.005126953125, "learning_rate": 1.5457784564484317e-05, "loss": 0.0014, "step": 4860 }, { "epoch": 0.3098555704014761, "grad_norm": 6.96875, "learning_rate": 1.5489597251383854e-05, "loss": 0.003, "step": 4870 }, { "epoch": 0.3104918241394668, "grad_norm": 0.0966796875, "learning_rate": 1.5521409938283388e-05, "loss": 0.0018, "step": 4880 }, { "epoch": 0.31112807787745755, "grad_norm": 0.216796875, "learning_rate": 1.555322262518292e-05, "loss": 0.001, "step": 4890 }, { "epoch": 0.31176433161544825, "grad_norm": 3.328125, "learning_rate": 1.5585035312082462e-05, "loss": 0.0048, "step": 4900 }, { "epoch": 0.31240058535343895, "grad_norm": 0.06298828125, "learning_rate": 1.5616847998981996e-05, "loss": 0.0082, "step": 4910 }, { "epoch": 0.31303683909142965, "grad_norm": 0.003173828125, "learning_rate": 1.564866068588153e-05, "loss": 0.0011, "step": 4920 }, { "epoch": 0.31367309282942035, "grad_norm": 0.0257568359375, "learning_rate": 1.5680473372781066e-05, "loss": 0.0062, "step": 4930 }, { "epoch": 0.3143093465674111, "grad_norm": 0.205078125, "learning_rate": 1.57122860596806e-05, "loss": 0.0008, "step": 4940 }, { "epoch": 0.3149456003054018, "grad_norm": 0.294921875, "learning_rate": 1.5744098746580137e-05, "loss": 0.0007, "step": 4950 }, { "epoch": 0.3155818540433925, "grad_norm": 0.47265625, "learning_rate": 1.5775911433479674e-05, "loss": 0.0006, "step": 4960 }, { "epoch": 0.3162181077813832, "grad_norm": 0.0177001953125, "learning_rate": 1.5807724120379208e-05, "loss": 0.0007, "step": 4970 }, { "epoch": 0.3168543615193739, "grad_norm": 0.0283203125, "learning_rate": 1.583953680727874e-05, "loss": 0.0098, "step": 4980 }, { "epoch": 0.31749061525736466, "grad_norm": 2.15625, "learning_rate": 1.587134949417828e-05, "loss": 0.0065, "step": 4990 }, { "epoch": 0.31812686899535536, "grad_norm": 0.05615234375, "learning_rate": 1.5903162181077816e-05, "loss": 0.0007, "step": 5000 }, { "epoch": 0.31876312273334606, "grad_norm": 0.10986328125, "learning_rate": 1.593497486797735e-05, "loss": 0.0007, "step": 5010 }, { "epoch": 0.31939937647133676, "grad_norm": 0.0986328125, "learning_rate": 1.5966787554876886e-05, "loss": 0.001, "step": 5020 }, { "epoch": 0.32003563020932746, "grad_norm": 0.359375, "learning_rate": 1.599860024177642e-05, "loss": 0.0015, "step": 5030 }, { "epoch": 0.3206718839473182, "grad_norm": 0.0150146484375, "learning_rate": 1.6030412928675957e-05, "loss": 0.0036, "step": 5040 }, { "epoch": 0.3213081376853089, "grad_norm": 0.054443359375, "learning_rate": 1.6062225615575494e-05, "loss": 0.0015, "step": 5050 }, { "epoch": 0.3219443914232996, "grad_norm": 0.236328125, "learning_rate": 1.6094038302475028e-05, "loss": 0.0007, "step": 5060 }, { "epoch": 0.3225806451612903, "grad_norm": 0.01348876953125, "learning_rate": 1.6125850989374565e-05, "loss": 0.0007, "step": 5070 }, { "epoch": 0.323216898899281, "grad_norm": 0.01263427734375, "learning_rate": 1.61576636762741e-05, "loss": 0.0024, "step": 5080 }, { "epoch": 0.32385315263727177, "grad_norm": 1.5546875, "learning_rate": 1.6189476363173632e-05, "loss": 0.0015, "step": 5090 }, { "epoch": 0.32448940637526247, "grad_norm": 0.0031890869140625, "learning_rate": 1.6221289050073173e-05, "loss": 0.0047, "step": 5100 }, { "epoch": 0.32512566011325317, "grad_norm": 0.057861328125, "learning_rate": 1.6253101736972706e-05, "loss": 0.0086, "step": 5110 }, { "epoch": 0.32576191385124387, "grad_norm": 0.1083984375, "learning_rate": 1.628491442387224e-05, "loss": 0.0008, "step": 5120 }, { "epoch": 0.32639816758923457, "grad_norm": 0.0595703125, "learning_rate": 1.6316727110771777e-05, "loss": 0.0007, "step": 5130 }, { "epoch": 0.3270344213272253, "grad_norm": 0.119140625, "learning_rate": 1.634853979767131e-05, "loss": 0.0014, "step": 5140 }, { "epoch": 0.327670675065216, "grad_norm": 0.01165771484375, "learning_rate": 1.6380352484570848e-05, "loss": 0.0005, "step": 5150 }, { "epoch": 0.3283069288032067, "grad_norm": 0.08349609375, "learning_rate": 1.6412165171470385e-05, "loss": 0.0022, "step": 5160 }, { "epoch": 0.3289431825411974, "grad_norm": 0.083984375, "learning_rate": 1.6443977858369918e-05, "loss": 0.0132, "step": 5170 }, { "epoch": 0.3295794362791881, "grad_norm": 0.00909423828125, "learning_rate": 1.6475790545269455e-05, "loss": 0.0043, "step": 5180 }, { "epoch": 0.3302156900171789, "grad_norm": 0.1669921875, "learning_rate": 1.650760323216899e-05, "loss": 0.0013, "step": 5190 }, { "epoch": 0.3308519437551696, "grad_norm": 3.75, "learning_rate": 1.6539415919068526e-05, "loss": 0.003, "step": 5200 }, { "epoch": 0.3314881974931603, "grad_norm": 0.0130615234375, "learning_rate": 1.6571228605968063e-05, "loss": 0.0009, "step": 5210 }, { "epoch": 0.332124451231151, "grad_norm": 0.08056640625, "learning_rate": 1.6603041292867597e-05, "loss": 0.011, "step": 5220 }, { "epoch": 0.3327607049691417, "grad_norm": 12.6875, "learning_rate": 1.663485397976713e-05, "loss": 0.0089, "step": 5230 }, { "epoch": 0.33339695870713243, "grad_norm": 1.4609375, "learning_rate": 1.6666666666666667e-05, "loss": 0.0017, "step": 5240 }, { "epoch": 0.33403321244512313, "grad_norm": 0.007049560546875, "learning_rate": 1.6698479353566205e-05, "loss": 0.0058, "step": 5250 }, { "epoch": 0.33466946618311383, "grad_norm": 0.6328125, "learning_rate": 1.6730292040465738e-05, "loss": 0.0033, "step": 5260 }, { "epoch": 0.33530571992110453, "grad_norm": 0.0419921875, "learning_rate": 1.6762104727365275e-05, "loss": 0.0024, "step": 5270 }, { "epoch": 0.3359419736590952, "grad_norm": 0.27734375, "learning_rate": 1.679391741426481e-05, "loss": 0.0009, "step": 5280 }, { "epoch": 0.336578227397086, "grad_norm": 0.275390625, "learning_rate": 1.6825730101164346e-05, "loss": 0.0016, "step": 5290 }, { "epoch": 0.3372144811350767, "grad_norm": 0.1123046875, "learning_rate": 1.6857542788063883e-05, "loss": 0.0022, "step": 5300 }, { "epoch": 0.3378507348730674, "grad_norm": 0.035400390625, "learning_rate": 1.6889355474963417e-05, "loss": 0.0016, "step": 5310 }, { "epoch": 0.3384869886110581, "grad_norm": 0.0022430419921875, "learning_rate": 1.692116816186295e-05, "loss": 0.002, "step": 5320 }, { "epoch": 0.3391232423490488, "grad_norm": 0.4140625, "learning_rate": 1.6952980848762487e-05, "loss": 0.0049, "step": 5330 }, { "epoch": 0.33975949608703954, "grad_norm": 7.5625, "learning_rate": 1.698479353566202e-05, "loss": 0.0086, "step": 5340 }, { "epoch": 0.34039574982503024, "grad_norm": 0.038818359375, "learning_rate": 1.7016606222561558e-05, "loss": 0.0013, "step": 5350 }, { "epoch": 0.34103200356302094, "grad_norm": 0.1376953125, "learning_rate": 1.7048418909461095e-05, "loss": 0.0043, "step": 5360 }, { "epoch": 0.34166825730101164, "grad_norm": 10.5, "learning_rate": 1.708023159636063e-05, "loss": 0.0092, "step": 5370 }, { "epoch": 0.34230451103900233, "grad_norm": 0.0478515625, "learning_rate": 1.7112044283260166e-05, "loss": 0.0032, "step": 5380 }, { "epoch": 0.3429407647769931, "grad_norm": 0.005645751953125, "learning_rate": 1.71438569701597e-05, "loss": 0.005, "step": 5390 }, { "epoch": 0.3435770185149838, "grad_norm": 3.65625, "learning_rate": 1.7175669657059236e-05, "loss": 0.0025, "step": 5400 }, { "epoch": 0.3442132722529745, "grad_norm": 0.31640625, "learning_rate": 1.7207482343958774e-05, "loss": 0.0009, "step": 5410 }, { "epoch": 0.3448495259909652, "grad_norm": 8.6875, "learning_rate": 1.7239295030858307e-05, "loss": 0.0064, "step": 5420 }, { "epoch": 0.3454857797289559, "grad_norm": 0.01019287109375, "learning_rate": 1.727110771775784e-05, "loss": 0.0007, "step": 5430 }, { "epoch": 0.34612203346694664, "grad_norm": 1.359375, "learning_rate": 1.7302920404657378e-05, "loss": 0.0174, "step": 5440 }, { "epoch": 0.34675828720493734, "grad_norm": 0.061279296875, "learning_rate": 1.7334733091556915e-05, "loss": 0.0015, "step": 5450 }, { "epoch": 0.34739454094292804, "grad_norm": 0.0206298828125, "learning_rate": 1.736654577845645e-05, "loss": 0.0072, "step": 5460 }, { "epoch": 0.34803079468091874, "grad_norm": 0.1357421875, "learning_rate": 1.7398358465355986e-05, "loss": 0.0071, "step": 5470 }, { "epoch": 0.34866704841890944, "grad_norm": 0.72265625, "learning_rate": 1.743017115225552e-05, "loss": 0.0016, "step": 5480 }, { "epoch": 0.3493033021569002, "grad_norm": 0.21484375, "learning_rate": 1.7461983839155056e-05, "loss": 0.0029, "step": 5490 }, { "epoch": 0.3499395558948909, "grad_norm": 0.150390625, "learning_rate": 1.7493796526054593e-05, "loss": 0.0166, "step": 5500 }, { "epoch": 0.3505758096328816, "grad_norm": 1.34375, "learning_rate": 1.7525609212954127e-05, "loss": 0.0031, "step": 5510 }, { "epoch": 0.3512120633708723, "grad_norm": 0.037841796875, "learning_rate": 1.7557421899853664e-05, "loss": 0.0026, "step": 5520 }, { "epoch": 0.351848317108863, "grad_norm": 0.01708984375, "learning_rate": 1.7589234586753198e-05, "loss": 0.0006, "step": 5530 }, { "epoch": 0.35248457084685375, "grad_norm": 0.0037994384765625, "learning_rate": 1.762104727365273e-05, "loss": 0.0009, "step": 5540 }, { "epoch": 0.35312082458484445, "grad_norm": 0.052734375, "learning_rate": 1.7652859960552272e-05, "loss": 0.0102, "step": 5550 }, { "epoch": 0.35375707832283515, "grad_norm": 0.058349609375, "learning_rate": 1.7684672647451806e-05, "loss": 0.0044, "step": 5560 }, { "epoch": 0.35439333206082585, "grad_norm": 0.265625, "learning_rate": 1.771648533435134e-05, "loss": 0.0051, "step": 5570 }, { "epoch": 0.35502958579881655, "grad_norm": 0.63671875, "learning_rate": 1.7748298021250876e-05, "loss": 0.0017, "step": 5580 }, { "epoch": 0.3556658395368073, "grad_norm": 1.3359375, "learning_rate": 1.778011070815041e-05, "loss": 0.0056, "step": 5590 }, { "epoch": 0.356302093274798, "grad_norm": 0.0634765625, "learning_rate": 1.7811923395049947e-05, "loss": 0.0009, "step": 5600 }, { "epoch": 0.3569383470127887, "grad_norm": 0.055419921875, "learning_rate": 1.7843736081949484e-05, "loss": 0.0043, "step": 5610 }, { "epoch": 0.3575746007507794, "grad_norm": 0.11376953125, "learning_rate": 1.7875548768849018e-05, "loss": 0.0034, "step": 5620 }, { "epoch": 0.3582108544887701, "grad_norm": 0.023193359375, "learning_rate": 1.7907361455748555e-05, "loss": 0.0033, "step": 5630 }, { "epoch": 0.35884710822676086, "grad_norm": 0.70703125, "learning_rate": 1.793917414264809e-05, "loss": 0.0016, "step": 5640 }, { "epoch": 0.35948336196475156, "grad_norm": 0.083984375, "learning_rate": 1.7970986829547625e-05, "loss": 0.0041, "step": 5650 }, { "epoch": 0.36011961570274226, "grad_norm": 0.07958984375, "learning_rate": 1.8002799516447162e-05, "loss": 0.0017, "step": 5660 }, { "epoch": 0.36075586944073296, "grad_norm": 0.09765625, "learning_rate": 1.8034612203346696e-05, "loss": 0.0007, "step": 5670 }, { "epoch": 0.36139212317872366, "grad_norm": 1.59375, "learning_rate": 1.806642489024623e-05, "loss": 0.0137, "step": 5680 }, { "epoch": 0.3620283769167144, "grad_norm": 3.546875, "learning_rate": 1.8098237577145767e-05, "loss": 0.0057, "step": 5690 }, { "epoch": 0.3626646306547051, "grad_norm": 0.0179443359375, "learning_rate": 1.8130050264045304e-05, "loss": 0.0003, "step": 5700 }, { "epoch": 0.3633008843926958, "grad_norm": 0.03466796875, "learning_rate": 1.8161862950944837e-05, "loss": 0.003, "step": 5710 }, { "epoch": 0.3639371381306865, "grad_norm": 0.546875, "learning_rate": 1.8193675637844375e-05, "loss": 0.0038, "step": 5720 }, { "epoch": 0.3645733918686772, "grad_norm": 8.5, "learning_rate": 1.8225488324743908e-05, "loss": 0.0102, "step": 5730 }, { "epoch": 0.36520964560666797, "grad_norm": 0.703125, "learning_rate": 1.8257301011643442e-05, "loss": 0.0079, "step": 5740 }, { "epoch": 0.36584589934465866, "grad_norm": 0.212890625, "learning_rate": 1.8289113698542982e-05, "loss": 0.0003, "step": 5750 }, { "epoch": 0.36648215308264936, "grad_norm": 0.35546875, "learning_rate": 1.8320926385442516e-05, "loss": 0.0063, "step": 5760 }, { "epoch": 0.36711840682064006, "grad_norm": 12.5625, "learning_rate": 1.835273907234205e-05, "loss": 0.0081, "step": 5770 }, { "epoch": 0.36775466055863076, "grad_norm": 0.02294921875, "learning_rate": 1.8384551759241587e-05, "loss": 0.0002, "step": 5780 }, { "epoch": 0.3683909142966215, "grad_norm": 0.33203125, "learning_rate": 1.841636444614112e-05, "loss": 0.0007, "step": 5790 }, { "epoch": 0.3690271680346122, "grad_norm": 0.06787109375, "learning_rate": 1.8448177133040657e-05, "loss": 0.0092, "step": 5800 }, { "epoch": 0.3696634217726029, "grad_norm": 0.0206298828125, "learning_rate": 1.8479989819940194e-05, "loss": 0.0007, "step": 5810 }, { "epoch": 0.3702996755105936, "grad_norm": 3.484375, "learning_rate": 1.8511802506839728e-05, "loss": 0.0069, "step": 5820 }, { "epoch": 0.3709359292485843, "grad_norm": 0.01611328125, "learning_rate": 1.8543615193739265e-05, "loss": 0.0005, "step": 5830 }, { "epoch": 0.3715721829865751, "grad_norm": 0.0260009765625, "learning_rate": 1.85754278806388e-05, "loss": 0.0059, "step": 5840 }, { "epoch": 0.37220843672456577, "grad_norm": 0.22265625, "learning_rate": 1.8607240567538336e-05, "loss": 0.0009, "step": 5850 }, { "epoch": 0.37284469046255647, "grad_norm": 3.875, "learning_rate": 1.8639053254437873e-05, "loss": 0.0039, "step": 5860 }, { "epoch": 0.37348094420054717, "grad_norm": 0.267578125, "learning_rate": 1.8670865941337407e-05, "loss": 0.0016, "step": 5870 }, { "epoch": 0.37411719793853787, "grad_norm": 0.140625, "learning_rate": 1.870267862823694e-05, "loss": 0.002, "step": 5880 }, { "epoch": 0.3747534516765286, "grad_norm": 0.059326171875, "learning_rate": 1.8734491315136477e-05, "loss": 0.0071, "step": 5890 }, { "epoch": 0.3753897054145193, "grad_norm": 0.66015625, "learning_rate": 1.8766304002036014e-05, "loss": 0.0198, "step": 5900 }, { "epoch": 0.37602595915251, "grad_norm": 0.76953125, "learning_rate": 1.8798116688935548e-05, "loss": 0.0025, "step": 5910 }, { "epoch": 0.3766622128905007, "grad_norm": 0.034912109375, "learning_rate": 1.8829929375835085e-05, "loss": 0.0093, "step": 5920 }, { "epoch": 0.3772984666284914, "grad_norm": 0.00634765625, "learning_rate": 1.886174206273462e-05, "loss": 0.0019, "step": 5930 }, { "epoch": 0.3779347203664822, "grad_norm": 0.1728515625, "learning_rate": 1.8893554749634156e-05, "loss": 0.0006, "step": 5940 }, { "epoch": 0.3785709741044729, "grad_norm": 0.30078125, "learning_rate": 1.892536743653369e-05, "loss": 0.0013, "step": 5950 }, { "epoch": 0.3792072278424636, "grad_norm": 0.1103515625, "learning_rate": 1.8957180123433226e-05, "loss": 0.0088, "step": 5960 }, { "epoch": 0.3798434815804543, "grad_norm": 0.012451171875, "learning_rate": 1.8988992810332763e-05, "loss": 0.0015, "step": 5970 }, { "epoch": 0.380479735318445, "grad_norm": 0.0087890625, "learning_rate": 1.9020805497232297e-05, "loss": 0.0027, "step": 5980 }, { "epoch": 0.38111598905643573, "grad_norm": 0.490234375, "learning_rate": 1.905261818413183e-05, "loss": 0.0006, "step": 5990 }, { "epoch": 0.38175224279442643, "grad_norm": 0.036376953125, "learning_rate": 1.9084430871031368e-05, "loss": 0.0142, "step": 6000 }, { "epoch": 0.38238849653241713, "grad_norm": 0.020263671875, "learning_rate": 1.9116243557930905e-05, "loss": 0.0192, "step": 6010 }, { "epoch": 0.38302475027040783, "grad_norm": 0.0419921875, "learning_rate": 1.914805624483044e-05, "loss": 0.0118, "step": 6020 }, { "epoch": 0.38366100400839853, "grad_norm": 0.026123046875, "learning_rate": 1.9179868931729976e-05, "loss": 0.0068, "step": 6030 }, { "epoch": 0.3842972577463893, "grad_norm": 0.00518798828125, "learning_rate": 1.921168161862951e-05, "loss": 0.0039, "step": 6040 }, { "epoch": 0.38493351148438, "grad_norm": 0.515625, "learning_rate": 1.9243494305529043e-05, "loss": 0.0009, "step": 6050 }, { "epoch": 0.3855697652223707, "grad_norm": 0.05078125, "learning_rate": 1.9275306992428583e-05, "loss": 0.0007, "step": 6060 }, { "epoch": 0.3862060189603614, "grad_norm": 0.05908203125, "learning_rate": 1.9307119679328117e-05, "loss": 0.0026, "step": 6070 }, { "epoch": 0.3868422726983521, "grad_norm": 0.011962890625, "learning_rate": 1.933893236622765e-05, "loss": 0.0016, "step": 6080 }, { "epoch": 0.38747852643634284, "grad_norm": 5.28125, "learning_rate": 1.9370745053127188e-05, "loss": 0.0032, "step": 6090 }, { "epoch": 0.38811478017433354, "grad_norm": 0.00604248046875, "learning_rate": 1.940255774002672e-05, "loss": 0.0004, "step": 6100 }, { "epoch": 0.38875103391232424, "grad_norm": 0.06396484375, "learning_rate": 1.943437042692626e-05, "loss": 0.0003, "step": 6110 }, { "epoch": 0.38938728765031494, "grad_norm": 2.234375, "learning_rate": 1.9466183113825795e-05, "loss": 0.0012, "step": 6120 }, { "epoch": 0.39002354138830564, "grad_norm": 1.2890625, "learning_rate": 1.949799580072533e-05, "loss": 0.0047, "step": 6130 }, { "epoch": 0.3906597951262964, "grad_norm": 0.02783203125, "learning_rate": 1.9529808487624866e-05, "loss": 0.0054, "step": 6140 }, { "epoch": 0.3912960488642871, "grad_norm": 0.158203125, "learning_rate": 1.95616211745244e-05, "loss": 0.0009, "step": 6150 }, { "epoch": 0.3919323026022778, "grad_norm": 0.16015625, "learning_rate": 1.9593433861423937e-05, "loss": 0.0063, "step": 6160 }, { "epoch": 0.3925685563402685, "grad_norm": 0.44921875, "learning_rate": 1.9625246548323474e-05, "loss": 0.0005, "step": 6170 }, { "epoch": 0.3932048100782592, "grad_norm": 0.00701904296875, "learning_rate": 1.9657059235223008e-05, "loss": 0.0015, "step": 6180 }, { "epoch": 0.39384106381624995, "grad_norm": 0.007049560546875, "learning_rate": 1.968887192212254e-05, "loss": 0.0059, "step": 6190 }, { "epoch": 0.39447731755424065, "grad_norm": 0.043212890625, "learning_rate": 1.9720684609022078e-05, "loss": 0.003, "step": 6200 }, { "epoch": 0.39511357129223135, "grad_norm": 0.58203125, "learning_rate": 1.9752497295921615e-05, "loss": 0.0179, "step": 6210 }, { "epoch": 0.39574982503022205, "grad_norm": 0.0235595703125, "learning_rate": 1.978430998282115e-05, "loss": 0.0004, "step": 6220 }, { "epoch": 0.39638607876821275, "grad_norm": 0.01275634765625, "learning_rate": 1.9816122669720686e-05, "loss": 0.0007, "step": 6230 }, { "epoch": 0.3970223325062035, "grad_norm": 0.1572265625, "learning_rate": 1.984793535662022e-05, "loss": 0.0003, "step": 6240 }, { "epoch": 0.3976585862441942, "grad_norm": 0.1494140625, "learning_rate": 1.9879748043519757e-05, "loss": 0.0032, "step": 6250 }, { "epoch": 0.3982948399821849, "grad_norm": 0.1552734375, "learning_rate": 1.9911560730419294e-05, "loss": 0.0066, "step": 6260 }, { "epoch": 0.3989310937201756, "grad_norm": 0.01226806640625, "learning_rate": 1.9943373417318827e-05, "loss": 0.002, "step": 6270 }, { "epoch": 0.3995673474581663, "grad_norm": 1.0703125, "learning_rate": 1.9975186104218364e-05, "loss": 0.0024, "step": 6280 }, { "epoch": 0.40020360119615705, "grad_norm": 0.1904296875, "learning_rate": 2.0006998791117898e-05, "loss": 0.0009, "step": 6290 }, { "epoch": 0.40083985493414775, "grad_norm": 0.0244140625, "learning_rate": 2.0038811478017432e-05, "loss": 0.0055, "step": 6300 }, { "epoch": 0.40147610867213845, "grad_norm": 0.061767578125, "learning_rate": 2.0070624164916972e-05, "loss": 0.0023, "step": 6310 }, { "epoch": 0.40211236241012915, "grad_norm": 0.0625, "learning_rate": 2.0102436851816506e-05, "loss": 0.0002, "step": 6320 }, { "epoch": 0.40274861614811985, "grad_norm": 0.0062255859375, "learning_rate": 2.013424953871604e-05, "loss": 0.0035, "step": 6330 }, { "epoch": 0.4033848698861106, "grad_norm": 1.3203125, "learning_rate": 2.0166062225615577e-05, "loss": 0.0009, "step": 6340 }, { "epoch": 0.4040211236241013, "grad_norm": 0.0299072265625, "learning_rate": 2.019787491251511e-05, "loss": 0.0003, "step": 6350 }, { "epoch": 0.404657377362092, "grad_norm": 0.015625, "learning_rate": 2.0229687599414647e-05, "loss": 0.0064, "step": 6360 }, { "epoch": 0.4052936311000827, "grad_norm": 0.126953125, "learning_rate": 2.0261500286314184e-05, "loss": 0.0021, "step": 6370 }, { "epoch": 0.4059298848380734, "grad_norm": 1.4921875, "learning_rate": 2.0293312973213718e-05, "loss": 0.0075, "step": 6380 }, { "epoch": 0.4065661385760641, "grad_norm": 0.486328125, "learning_rate": 2.0325125660113255e-05, "loss": 0.0006, "step": 6390 }, { "epoch": 0.40720239231405486, "grad_norm": 0.0771484375, "learning_rate": 2.035693834701279e-05, "loss": 0.0053, "step": 6400 }, { "epoch": 0.40783864605204556, "grad_norm": 0.00286865234375, "learning_rate": 2.0388751033912326e-05, "loss": 0.0018, "step": 6410 }, { "epoch": 0.40847489979003626, "grad_norm": 0.1533203125, "learning_rate": 2.0420563720811863e-05, "loss": 0.0017, "step": 6420 }, { "epoch": 0.40911115352802696, "grad_norm": 1.5234375, "learning_rate": 2.0452376407711396e-05, "loss": 0.0025, "step": 6430 }, { "epoch": 0.40974740726601766, "grad_norm": 0.0111083984375, "learning_rate": 2.048418909461093e-05, "loss": 0.0006, "step": 6440 }, { "epoch": 0.4103836610040084, "grad_norm": 0.01141357421875, "learning_rate": 2.0516001781510467e-05, "loss": 0.0005, "step": 6450 }, { "epoch": 0.4110199147419991, "grad_norm": 0.01336669921875, "learning_rate": 2.0547814468410004e-05, "loss": 0.0003, "step": 6460 }, { "epoch": 0.4116561684799898, "grad_norm": 0.125, "learning_rate": 2.0579627155309538e-05, "loss": 0.0003, "step": 6470 }, { "epoch": 0.4122924222179805, "grad_norm": 0.055419921875, "learning_rate": 2.0611439842209075e-05, "loss": 0.0008, "step": 6480 }, { "epoch": 0.4129286759559712, "grad_norm": 0.875, "learning_rate": 2.064325252910861e-05, "loss": 0.011, "step": 6490 }, { "epoch": 0.41356492969396197, "grad_norm": 0.8671875, "learning_rate": 2.0675065216008142e-05, "loss": 0.0013, "step": 6500 }, { "epoch": 0.41420118343195267, "grad_norm": 0.0037994384765625, "learning_rate": 2.0706877902907683e-05, "loss": 0.0019, "step": 6510 }, { "epoch": 0.41483743716994337, "grad_norm": 2.640625, "learning_rate": 2.0738690589807216e-05, "loss": 0.0065, "step": 6520 }, { "epoch": 0.41547369090793407, "grad_norm": 0.2041015625, "learning_rate": 2.077050327670675e-05, "loss": 0.0001, "step": 6530 }, { "epoch": 0.41610994464592477, "grad_norm": 0.1640625, "learning_rate": 2.0802315963606287e-05, "loss": 0.0014, "step": 6540 }, { "epoch": 0.4167461983839155, "grad_norm": 0.73046875, "learning_rate": 2.083412865050582e-05, "loss": 0.0073, "step": 6550 }, { "epoch": 0.4173824521219062, "grad_norm": 5.5625, "learning_rate": 2.0865941337405358e-05, "loss": 0.0093, "step": 6560 }, { "epoch": 0.4180187058598969, "grad_norm": 0.0035400390625, "learning_rate": 2.0897754024304895e-05, "loss": 0.0009, "step": 6570 }, { "epoch": 0.4186549595978876, "grad_norm": 0.2177734375, "learning_rate": 2.092956671120443e-05, "loss": 0.0005, "step": 6580 }, { "epoch": 0.4192912133358783, "grad_norm": 0.078125, "learning_rate": 2.0961379398103965e-05, "loss": 0.005, "step": 6590 }, { "epoch": 0.4199274670738691, "grad_norm": 0.001068115234375, "learning_rate": 2.09931920850035e-05, "loss": 0.0007, "step": 6600 }, { "epoch": 0.4205637208118598, "grad_norm": 0.1708984375, "learning_rate": 2.1025004771903036e-05, "loss": 0.0054, "step": 6610 }, { "epoch": 0.4211999745498505, "grad_norm": 3.015625, "learning_rate": 2.1056817458802573e-05, "loss": 0.0175, "step": 6620 }, { "epoch": 0.4218362282878412, "grad_norm": 0.1767578125, "learning_rate": 2.1088630145702107e-05, "loss": 0.0049, "step": 6630 }, { "epoch": 0.4224724820258319, "grad_norm": 0.09912109375, "learning_rate": 2.112044283260164e-05, "loss": 0.006, "step": 6640 }, { "epoch": 0.42310873576382263, "grad_norm": 0.0703125, "learning_rate": 2.1152255519501178e-05, "loss": 0.0018, "step": 6650 }, { "epoch": 0.42374498950181333, "grad_norm": 0.02099609375, "learning_rate": 2.1184068206400715e-05, "loss": 0.0007, "step": 6660 }, { "epoch": 0.42438124323980403, "grad_norm": 0.099609375, "learning_rate": 2.1215880893300248e-05, "loss": 0.0077, "step": 6670 }, { "epoch": 0.42501749697779473, "grad_norm": 0.50390625, "learning_rate": 2.1247693580199785e-05, "loss": 0.0007, "step": 6680 }, { "epoch": 0.42565375071578543, "grad_norm": 0.150390625, "learning_rate": 2.127950626709932e-05, "loss": 0.001, "step": 6690 }, { "epoch": 0.4262900044537762, "grad_norm": 0.08154296875, "learning_rate": 2.1311318953998856e-05, "loss": 0.0115, "step": 6700 }, { "epoch": 0.4269262581917669, "grad_norm": 3.46875, "learning_rate": 2.1343131640898393e-05, "loss": 0.0014, "step": 6710 }, { "epoch": 0.4275625119297576, "grad_norm": 0.89453125, "learning_rate": 2.1374944327797927e-05, "loss": 0.002, "step": 6720 }, { "epoch": 0.4281987656677483, "grad_norm": 0.0224609375, "learning_rate": 2.1406757014697464e-05, "loss": 0.0051, "step": 6730 }, { "epoch": 0.428835019405739, "grad_norm": 0.11669921875, "learning_rate": 2.1438569701596997e-05, "loss": 0.0018, "step": 6740 }, { "epoch": 0.42947127314372974, "grad_norm": 1.1484375, "learning_rate": 2.147038238849653e-05, "loss": 0.004, "step": 6750 }, { "epoch": 0.43010752688172044, "grad_norm": 0.00970458984375, "learning_rate": 2.150219507539607e-05, "loss": 0.0004, "step": 6760 }, { "epoch": 0.43074378061971114, "grad_norm": 0.3203125, "learning_rate": 2.1534007762295605e-05, "loss": 0.001, "step": 6770 }, { "epoch": 0.43138003435770184, "grad_norm": 0.01141357421875, "learning_rate": 2.156582044919514e-05, "loss": 0.0004, "step": 6780 }, { "epoch": 0.43201628809569254, "grad_norm": 0.0830078125, "learning_rate": 2.1597633136094676e-05, "loss": 0.0004, "step": 6790 }, { "epoch": 0.4326525418336833, "grad_norm": 0.1572265625, "learning_rate": 2.162944582299421e-05, "loss": 0.0062, "step": 6800 }, { "epoch": 0.433288795571674, "grad_norm": 0.0390625, "learning_rate": 2.1661258509893747e-05, "loss": 0.0012, "step": 6810 }, { "epoch": 0.4339250493096647, "grad_norm": 3.375, "learning_rate": 2.1693071196793284e-05, "loss": 0.0015, "step": 6820 }, { "epoch": 0.4345613030476554, "grad_norm": 0.001220703125, "learning_rate": 2.1724883883692817e-05, "loss": 0.0049, "step": 6830 }, { "epoch": 0.4351975567856461, "grad_norm": 7.59375, "learning_rate": 2.175669657059235e-05, "loss": 0.0054, "step": 6840 }, { "epoch": 0.43583381052363684, "grad_norm": 0.1650390625, "learning_rate": 2.1788509257491888e-05, "loss": 0.0034, "step": 6850 }, { "epoch": 0.43647006426162754, "grad_norm": 0.10107421875, "learning_rate": 2.1820321944391425e-05, "loss": 0.0004, "step": 6860 }, { "epoch": 0.43710631799961824, "grad_norm": 0.002349853515625, "learning_rate": 2.185213463129096e-05, "loss": 0.0004, "step": 6870 }, { "epoch": 0.43774257173760894, "grad_norm": 0.1474609375, "learning_rate": 2.1883947318190496e-05, "loss": 0.0018, "step": 6880 }, { "epoch": 0.43837882547559964, "grad_norm": 0.0103759765625, "learning_rate": 2.191576000509003e-05, "loss": 0.0079, "step": 6890 }, { "epoch": 0.4390150792135904, "grad_norm": 0.02392578125, "learning_rate": 2.1947572691989566e-05, "loss": 0.0008, "step": 6900 }, { "epoch": 0.4396513329515811, "grad_norm": 0.05322265625, "learning_rate": 2.1979385378889103e-05, "loss": 0.0005, "step": 6910 }, { "epoch": 0.4402875866895718, "grad_norm": 9.3125, "learning_rate": 2.2011198065788637e-05, "loss": 0.0071, "step": 6920 }, { "epoch": 0.4409238404275625, "grad_norm": 0.1513671875, "learning_rate": 2.2043010752688174e-05, "loss": 0.0011, "step": 6930 }, { "epoch": 0.4415600941655532, "grad_norm": 0.0103759765625, "learning_rate": 2.2074823439587708e-05, "loss": 0.0022, "step": 6940 }, { "epoch": 0.44219634790354395, "grad_norm": 0.154296875, "learning_rate": 2.210663612648724e-05, "loss": 0.0005, "step": 6950 }, { "epoch": 0.44283260164153465, "grad_norm": 0.1962890625, "learning_rate": 2.2138448813386782e-05, "loss": 0.0004, "step": 6960 }, { "epoch": 0.44346885537952535, "grad_norm": 0.158203125, "learning_rate": 2.2170261500286316e-05, "loss": 0.0006, "step": 6970 }, { "epoch": 0.44410510911751605, "grad_norm": 0.01220703125, "learning_rate": 2.220207418718585e-05, "loss": 0.0035, "step": 6980 }, { "epoch": 0.44474136285550675, "grad_norm": 8.1875, "learning_rate": 2.2233886874085386e-05, "loss": 0.0029, "step": 6990 }, { "epoch": 0.4453776165934975, "grad_norm": 0.035888671875, "learning_rate": 2.226569956098492e-05, "loss": 0.0028, "step": 7000 }, { "epoch": 0.4460138703314882, "grad_norm": 0.0167236328125, "learning_rate": 2.2297512247884457e-05, "loss": 0.001, "step": 7010 }, { "epoch": 0.4466501240694789, "grad_norm": 0.48046875, "learning_rate": 2.2329324934783994e-05, "loss": 0.0022, "step": 7020 }, { "epoch": 0.4472863778074696, "grad_norm": 1.125, "learning_rate": 2.2361137621683528e-05, "loss": 0.0006, "step": 7030 }, { "epoch": 0.4479226315454603, "grad_norm": 0.026123046875, "learning_rate": 2.2392950308583065e-05, "loss": 0.0047, "step": 7040 }, { "epoch": 0.44855888528345106, "grad_norm": 1.75, "learning_rate": 2.24247629954826e-05, "loss": 0.0012, "step": 7050 }, { "epoch": 0.44919513902144176, "grad_norm": 3.078125, "learning_rate": 2.2456575682382135e-05, "loss": 0.0018, "step": 7060 }, { "epoch": 0.44983139275943246, "grad_norm": 0.016357421875, "learning_rate": 2.2488388369281673e-05, "loss": 0.0003, "step": 7070 }, { "epoch": 0.45046764649742316, "grad_norm": 0.056640625, "learning_rate": 2.2520201056181206e-05, "loss": 0.0019, "step": 7080 }, { "epoch": 0.45110390023541386, "grad_norm": 0.0169677734375, "learning_rate": 2.255201374308074e-05, "loss": 0.0008, "step": 7090 }, { "epoch": 0.4517401539734046, "grad_norm": 0.004302978515625, "learning_rate": 2.2583826429980277e-05, "loss": 0.0002, "step": 7100 }, { "epoch": 0.4523764077113953, "grad_norm": 0.01300048828125, "learning_rate": 2.2615639116879814e-05, "loss": 0.0166, "step": 7110 }, { "epoch": 0.453012661449386, "grad_norm": 10.625, "learning_rate": 2.2647451803779348e-05, "loss": 0.0078, "step": 7120 }, { "epoch": 0.4536489151873767, "grad_norm": 0.08544921875, "learning_rate": 2.2679264490678885e-05, "loss": 0.0011, "step": 7130 }, { "epoch": 0.4542851689253674, "grad_norm": 0.02978515625, "learning_rate": 2.2711077177578418e-05, "loss": 0.0006, "step": 7140 }, { "epoch": 0.45492142266335817, "grad_norm": 1.625, "learning_rate": 2.2742889864477955e-05, "loss": 0.0032, "step": 7150 }, { "epoch": 0.45555767640134887, "grad_norm": 0.0020904541015625, "learning_rate": 2.2774702551377492e-05, "loss": 0.0017, "step": 7160 }, { "epoch": 0.45619393013933957, "grad_norm": 0.07275390625, "learning_rate": 2.2806515238277026e-05, "loss": 0.0003, "step": 7170 }, { "epoch": 0.45683018387733026, "grad_norm": 0.453125, "learning_rate": 2.2838327925176563e-05, "loss": 0.0007, "step": 7180 }, { "epoch": 0.45746643761532096, "grad_norm": 0.1513671875, "learning_rate": 2.2870140612076097e-05, "loss": 0.0064, "step": 7190 }, { "epoch": 0.4581026913533117, "grad_norm": 0.0147705078125, "learning_rate": 2.290195329897563e-05, "loss": 0.0138, "step": 7200 }, { "epoch": 0.4587389450913024, "grad_norm": 0.0023193359375, "learning_rate": 2.2933765985875167e-05, "loss": 0.0014, "step": 7210 }, { "epoch": 0.4593751988292931, "grad_norm": 0.16796875, "learning_rate": 2.2965578672774704e-05, "loss": 0.0096, "step": 7220 }, { "epoch": 0.4600114525672838, "grad_norm": 0.005645751953125, "learning_rate": 2.2997391359674238e-05, "loss": 0.0005, "step": 7230 }, { "epoch": 0.4606477063052745, "grad_norm": 0.0126953125, "learning_rate": 2.3029204046573775e-05, "loss": 0.0011, "step": 7240 }, { "epoch": 0.4612839600432653, "grad_norm": 0.025146484375, "learning_rate": 2.306101673347331e-05, "loss": 0.0023, "step": 7250 }, { "epoch": 0.461920213781256, "grad_norm": 0.0732421875, "learning_rate": 2.3092829420372846e-05, "loss": 0.0035, "step": 7260 }, { "epoch": 0.4625564675192467, "grad_norm": 0.01177978515625, "learning_rate": 2.3124642107272383e-05, "loss": 0.0015, "step": 7270 }, { "epoch": 0.46319272125723737, "grad_norm": 0.01226806640625, "learning_rate": 2.3156454794171917e-05, "loss": 0.0001, "step": 7280 }, { "epoch": 0.46382897499522807, "grad_norm": 5.25, "learning_rate": 2.318826748107145e-05, "loss": 0.0049, "step": 7290 }, { "epoch": 0.4644652287332188, "grad_norm": 0.1376953125, "learning_rate": 2.3220080167970987e-05, "loss": 0.0005, "step": 7300 }, { "epoch": 0.4651014824712095, "grad_norm": 0.0673828125, "learning_rate": 2.3251892854870524e-05, "loss": 0.0006, "step": 7310 }, { "epoch": 0.4657377362092002, "grad_norm": 0.177734375, "learning_rate": 2.3283705541770058e-05, "loss": 0.0019, "step": 7320 }, { "epoch": 0.4663739899471909, "grad_norm": 0.0947265625, "learning_rate": 2.3315518228669595e-05, "loss": 0.0056, "step": 7330 }, { "epoch": 0.4670102436851816, "grad_norm": 0.015869140625, "learning_rate": 2.334733091556913e-05, "loss": 0.0077, "step": 7340 }, { "epoch": 0.4676464974231724, "grad_norm": 0.07421875, "learning_rate": 2.3379143602468666e-05, "loss": 0.0006, "step": 7350 }, { "epoch": 0.4682827511611631, "grad_norm": 0.01336669921875, "learning_rate": 2.3410956289368203e-05, "loss": 0.0018, "step": 7360 }, { "epoch": 0.4689190048991538, "grad_norm": 0.09912109375, "learning_rate": 2.3442768976267736e-05, "loss": 0.0072, "step": 7370 }, { "epoch": 0.4695552586371445, "grad_norm": 1.2421875, "learning_rate": 2.3474581663167274e-05, "loss": 0.001, "step": 7380 }, { "epoch": 0.4701915123751352, "grad_norm": 9.9375, "learning_rate": 2.3506394350066807e-05, "loss": 0.0052, "step": 7390 }, { "epoch": 0.47082776611312593, "grad_norm": 0.0047607421875, "learning_rate": 2.353820703696634e-05, "loss": 0.0086, "step": 7400 }, { "epoch": 0.47146401985111663, "grad_norm": 1.0390625, "learning_rate": 2.357001972386588e-05, "loss": 0.0178, "step": 7410 }, { "epoch": 0.47210027358910733, "grad_norm": 0.92578125, "learning_rate": 2.3601832410765415e-05, "loss": 0.0005, "step": 7420 }, { "epoch": 0.47273652732709803, "grad_norm": 1.515625, "learning_rate": 2.363364509766495e-05, "loss": 0.0009, "step": 7430 }, { "epoch": 0.47337278106508873, "grad_norm": 7.65625, "learning_rate": 2.3665457784564486e-05, "loss": 0.0136, "step": 7440 }, { "epoch": 0.4740090348030795, "grad_norm": 0.00537109375, "learning_rate": 2.369727047146402e-05, "loss": 0.0049, "step": 7450 }, { "epoch": 0.4746452885410702, "grad_norm": 1.7265625, "learning_rate": 2.3729083158363556e-05, "loss": 0.0077, "step": 7460 }, { "epoch": 0.4752815422790609, "grad_norm": 0.032470703125, "learning_rate": 2.3760895845263093e-05, "loss": 0.001, "step": 7470 }, { "epoch": 0.4759177960170516, "grad_norm": 0.00046539306640625, "learning_rate": 2.3792708532162627e-05, "loss": 0.0034, "step": 7480 }, { "epoch": 0.4765540497550423, "grad_norm": 0.12255859375, "learning_rate": 2.3824521219062164e-05, "loss": 0.0006, "step": 7490 }, { "epoch": 0.47719030349303304, "grad_norm": 0.0556640625, "learning_rate": 2.3856333905961698e-05, "loss": 0.0034, "step": 7500 }, { "epoch": 0.47782655723102374, "grad_norm": 0.01458740234375, "learning_rate": 2.3888146592861235e-05, "loss": 0.0002, "step": 7510 }, { "epoch": 0.47846281096901444, "grad_norm": 0.51953125, "learning_rate": 2.3919959279760772e-05, "loss": 0.0005, "step": 7520 }, { "epoch": 0.47909906470700514, "grad_norm": 0.0037384033203125, "learning_rate": 2.3951771966660305e-05, "loss": 0.003, "step": 7530 }, { "epoch": 0.47973531844499584, "grad_norm": 0.326171875, "learning_rate": 2.398358465355984e-05, "loss": 0.0005, "step": 7540 }, { "epoch": 0.4803715721829866, "grad_norm": 0.015869140625, "learning_rate": 2.4015397340459376e-05, "loss": 0.0051, "step": 7550 }, { "epoch": 0.4810078259209773, "grad_norm": 0.05224609375, "learning_rate": 2.4047210027358913e-05, "loss": 0.0066, "step": 7560 }, { "epoch": 0.481644079658968, "grad_norm": 0.061279296875, "learning_rate": 2.4079022714258447e-05, "loss": 0.001, "step": 7570 }, { "epoch": 0.4822803333969587, "grad_norm": 0.002532958984375, "learning_rate": 2.4110835401157984e-05, "loss": 0.0006, "step": 7580 }, { "epoch": 0.4829165871349494, "grad_norm": 0.37890625, "learning_rate": 2.4142648088057518e-05, "loss": 0.0066, "step": 7590 }, { "epoch": 0.48355284087294015, "grad_norm": 0.042236328125, "learning_rate": 2.417446077495705e-05, "loss": 0.004, "step": 7600 }, { "epoch": 0.48418909461093085, "grad_norm": 0.056396484375, "learning_rate": 2.4206273461856592e-05, "loss": 0.0004, "step": 7610 }, { "epoch": 0.48482534834892155, "grad_norm": 0.0849609375, "learning_rate": 2.4238086148756125e-05, "loss": 0.0011, "step": 7620 }, { "epoch": 0.48546160208691225, "grad_norm": 0.0400390625, "learning_rate": 2.426989883565566e-05, "loss": 0.0059, "step": 7630 }, { "epoch": 0.48609785582490295, "grad_norm": 0.2734375, "learning_rate": 2.4301711522555196e-05, "loss": 0.0046, "step": 7640 }, { "epoch": 0.4867341095628937, "grad_norm": 0.298828125, "learning_rate": 2.433352420945473e-05, "loss": 0.0003, "step": 7650 }, { "epoch": 0.4873703633008844, "grad_norm": 0.0250244140625, "learning_rate": 2.4365336896354267e-05, "loss": 0.0004, "step": 7660 }, { "epoch": 0.4880066170388751, "grad_norm": 0.1728515625, "learning_rate": 2.4397149583253804e-05, "loss": 0.0021, "step": 7670 }, { "epoch": 0.4886428707768658, "grad_norm": 0.025146484375, "learning_rate": 2.4428962270153337e-05, "loss": 0.001, "step": 7680 }, { "epoch": 0.4892791245148565, "grad_norm": 0.04541015625, "learning_rate": 2.4460774957052875e-05, "loss": 0.0002, "step": 7690 }, { "epoch": 0.48991537825284726, "grad_norm": 10.125, "learning_rate": 2.4492587643952408e-05, "loss": 0.008, "step": 7700 }, { "epoch": 0.49055163199083796, "grad_norm": 0.01104736328125, "learning_rate": 2.4524400330851945e-05, "loss": 0.0011, "step": 7710 }, { "epoch": 0.49118788572882865, "grad_norm": 26.125, "learning_rate": 2.4556213017751482e-05, "loss": 0.0068, "step": 7720 }, { "epoch": 0.49182413946681935, "grad_norm": 1.0, "learning_rate": 2.4588025704651016e-05, "loss": 0.0022, "step": 7730 }, { "epoch": 0.49246039320481005, "grad_norm": 0.326171875, "learning_rate": 2.461983839155055e-05, "loss": 0.0004, "step": 7740 }, { "epoch": 0.4930966469428008, "grad_norm": 0.1953125, "learning_rate": 2.4651651078450087e-05, "loss": 0.0093, "step": 7750 }, { "epoch": 0.4937329006807915, "grad_norm": 0.25390625, "learning_rate": 2.4683463765349624e-05, "loss": 0.0006, "step": 7760 }, { "epoch": 0.4943691544187822, "grad_norm": 0.8984375, "learning_rate": 2.4715276452249157e-05, "loss": 0.0007, "step": 7770 }, { "epoch": 0.4950054081567729, "grad_norm": 0.431640625, "learning_rate": 2.4747089139148694e-05, "loss": 0.0068, "step": 7780 }, { "epoch": 0.4956416618947636, "grad_norm": 0.83203125, "learning_rate": 2.4778901826048228e-05, "loss": 0.0016, "step": 7790 }, { "epoch": 0.49627791563275436, "grad_norm": 0.025146484375, "learning_rate": 2.4810714512947765e-05, "loss": 0.0008, "step": 7800 }, { "epoch": 0.49691416937074506, "grad_norm": 0.06396484375, "learning_rate": 2.4842527199847302e-05, "loss": 0.0013, "step": 7810 }, { "epoch": 0.49755042310873576, "grad_norm": 0.84765625, "learning_rate": 2.4874339886746836e-05, "loss": 0.002, "step": 7820 }, { "epoch": 0.49818667684672646, "grad_norm": 0.0086669921875, "learning_rate": 2.4906152573646373e-05, "loss": 0.0006, "step": 7830 }, { "epoch": 0.49882293058471716, "grad_norm": 0.011474609375, "learning_rate": 2.4937965260545906e-05, "loss": 0.0004, "step": 7840 }, { "epoch": 0.4994591843227079, "grad_norm": 0.298828125, "learning_rate": 2.496977794744544e-05, "loss": 0.0009, "step": 7850 }, { "epoch": 0.5000954380606986, "grad_norm": 0.034423828125, "learning_rate": 2.500159063434498e-05, "loss": 0.0048, "step": 7860 }, { "epoch": 0.5007316917986894, "grad_norm": 0.06396484375, "learning_rate": 2.5033403321244514e-05, "loss": 0.0019, "step": 7870 }, { "epoch": 0.5013679455366801, "grad_norm": 4.625, "learning_rate": 2.5065216008144048e-05, "loss": 0.0051, "step": 7880 }, { "epoch": 0.5020041992746708, "grad_norm": 3.59375, "learning_rate": 2.5097028695043585e-05, "loss": 0.0021, "step": 7890 }, { "epoch": 0.5026404530126615, "grad_norm": 0.0030670166015625, "learning_rate": 2.512884138194312e-05, "loss": 0.0005, "step": 7900 }, { "epoch": 0.5032767067506522, "grad_norm": 0.240234375, "learning_rate": 2.5160654068842656e-05, "loss": 0.0016, "step": 7910 }, { "epoch": 0.5039129604886429, "grad_norm": 7.5625, "learning_rate": 2.519246675574219e-05, "loss": 0.0104, "step": 7920 }, { "epoch": 0.5045492142266336, "grad_norm": 0.05517578125, "learning_rate": 2.522427944264173e-05, "loss": 0.0006, "step": 7930 }, { "epoch": 0.5051854679646243, "grad_norm": 0.07861328125, "learning_rate": 2.5256092129541263e-05, "loss": 0.0043, "step": 7940 }, { "epoch": 0.505821721702615, "grad_norm": 0.02197265625, "learning_rate": 2.52879048164408e-05, "loss": 0.0004, "step": 7950 }, { "epoch": 0.5064579754406057, "grad_norm": 0.1240234375, "learning_rate": 2.5319717503340334e-05, "loss": 0.0011, "step": 7960 }, { "epoch": 0.5070942291785965, "grad_norm": 0.498046875, "learning_rate": 2.5351530190239868e-05, "loss": 0.0003, "step": 7970 }, { "epoch": 0.5077304829165872, "grad_norm": 0.0274658203125, "learning_rate": 2.5383342877139405e-05, "loss": 0.0011, "step": 7980 }, { "epoch": 0.5083667366545779, "grad_norm": 0.0166015625, "learning_rate": 2.541515556403894e-05, "loss": 0.0027, "step": 7990 }, { "epoch": 0.5090029903925686, "grad_norm": 1.4296875, "learning_rate": 2.5446968250938476e-05, "loss": 0.0007, "step": 8000 }, { "epoch": 0.5096392441305593, "grad_norm": 0.7109375, "learning_rate": 2.547878093783801e-05, "loss": 0.0008, "step": 8010 }, { "epoch": 0.51027549786855, "grad_norm": 0.0098876953125, "learning_rate": 2.5510593624737543e-05, "loss": 0.0131, "step": 8020 }, { "epoch": 0.5109117516065407, "grad_norm": 0.014892578125, "learning_rate": 2.5542406311637083e-05, "loss": 0.0009, "step": 8030 }, { "epoch": 0.5115480053445314, "grad_norm": 0.04296875, "learning_rate": 2.557421899853662e-05, "loss": 0.0006, "step": 8040 }, { "epoch": 0.5121842590825221, "grad_norm": 0.04541015625, "learning_rate": 2.5606031685436154e-05, "loss": 0.0038, "step": 8050 }, { "epoch": 0.5128205128205128, "grad_norm": 0.1455078125, "learning_rate": 2.563784437233569e-05, "loss": 0.0006, "step": 8060 }, { "epoch": 0.5134567665585036, "grad_norm": 0.0191650390625, "learning_rate": 2.5669657059235225e-05, "loss": 0.006, "step": 8070 }, { "epoch": 0.5140930202964943, "grad_norm": 1.140625, "learning_rate": 2.570146974613476e-05, "loss": 0.0013, "step": 8080 }, { "epoch": 0.514729274034485, "grad_norm": 0.0291748046875, "learning_rate": 2.5733282433034295e-05, "loss": 0.0073, "step": 8090 }, { "epoch": 0.5153655277724757, "grad_norm": 0.0810546875, "learning_rate": 2.576509511993383e-05, "loss": 0.0012, "step": 8100 }, { "epoch": 0.5160017815104664, "grad_norm": 0.061279296875, "learning_rate": 2.5796907806833366e-05, "loss": 0.0004, "step": 8110 }, { "epoch": 0.5166380352484571, "grad_norm": 0.1435546875, "learning_rate": 2.58287204937329e-05, "loss": 0.0045, "step": 8120 }, { "epoch": 0.5172742889864478, "grad_norm": 0.193359375, "learning_rate": 2.5860533180632433e-05, "loss": 0.0021, "step": 8130 }, { "epoch": 0.5179105427244385, "grad_norm": 0.01068115234375, "learning_rate": 2.5892345867531974e-05, "loss": 0.0117, "step": 8140 }, { "epoch": 0.5185467964624292, "grad_norm": 0.018310546875, "learning_rate": 2.592415855443151e-05, "loss": 0.0009, "step": 8150 }, { "epoch": 0.5191830502004199, "grad_norm": 0.6875, "learning_rate": 2.5955971241331045e-05, "loss": 0.0006, "step": 8160 }, { "epoch": 0.5198193039384107, "grad_norm": 0.1357421875, "learning_rate": 2.598778392823058e-05, "loss": 0.0041, "step": 8170 }, { "epoch": 0.5204555576764014, "grad_norm": 0.047607421875, "learning_rate": 2.6019596615130115e-05, "loss": 0.0004, "step": 8180 }, { "epoch": 0.5210918114143921, "grad_norm": 0.0096435546875, "learning_rate": 2.605140930202965e-05, "loss": 0.0006, "step": 8190 }, { "epoch": 0.5217280651523828, "grad_norm": 0.00482177734375, "learning_rate": 2.6083221988929186e-05, "loss": 0.0041, "step": 8200 }, { "epoch": 0.5223643188903735, "grad_norm": 0.01361083984375, "learning_rate": 2.611503467582872e-05, "loss": 0.0005, "step": 8210 }, { "epoch": 0.5230005726283642, "grad_norm": 0.058349609375, "learning_rate": 2.6146847362728257e-05, "loss": 0.0003, "step": 8220 }, { "epoch": 0.5236368263663549, "grad_norm": 0.00927734375, "learning_rate": 2.617866004962779e-05, "loss": 0.0002, "step": 8230 }, { "epoch": 0.5242730801043456, "grad_norm": 0.00897216796875, "learning_rate": 2.621047273652733e-05, "loss": 0.0008, "step": 8240 }, { "epoch": 0.5249093338423363, "grad_norm": 3.390625, "learning_rate": 2.6242285423426864e-05, "loss": 0.0069, "step": 8250 }, { "epoch": 0.525545587580327, "grad_norm": 0.04833984375, "learning_rate": 2.62740981103264e-05, "loss": 0.0047, "step": 8260 }, { "epoch": 0.5261818413183178, "grad_norm": 16.625, "learning_rate": 2.6305910797225935e-05, "loss": 0.0086, "step": 8270 }, { "epoch": 0.5268180950563085, "grad_norm": 0.026123046875, "learning_rate": 2.6337723484125472e-05, "loss": 0.0033, "step": 8280 }, { "epoch": 0.5274543487942992, "grad_norm": 0.5234375, "learning_rate": 2.6369536171025006e-05, "loss": 0.0069, "step": 8290 }, { "epoch": 0.5280906025322899, "grad_norm": 0.005828857421875, "learning_rate": 2.640134885792454e-05, "loss": 0.0006, "step": 8300 }, { "epoch": 0.5287268562702806, "grad_norm": 0.013916015625, "learning_rate": 2.6433161544824077e-05, "loss": 0.0041, "step": 8310 }, { "epoch": 0.5293631100082713, "grad_norm": 0.1357421875, "learning_rate": 2.646497423172361e-05, "loss": 0.0115, "step": 8320 }, { "epoch": 0.529999363746262, "grad_norm": 0.00179290771484375, "learning_rate": 2.6496786918623144e-05, "loss": 0.0045, "step": 8330 }, { "epoch": 0.5306356174842527, "grad_norm": 0.00921630859375, "learning_rate": 2.6528599605522688e-05, "loss": 0.0013, "step": 8340 }, { "epoch": 0.5312718712222434, "grad_norm": 0.0341796875, "learning_rate": 2.656041229242222e-05, "loss": 0.0082, "step": 8350 }, { "epoch": 0.5319081249602341, "grad_norm": 0.047607421875, "learning_rate": 2.6592224979321755e-05, "loss": 0.0004, "step": 8360 }, { "epoch": 0.5325443786982249, "grad_norm": 0.11083984375, "learning_rate": 2.6624037666221292e-05, "loss": 0.0015, "step": 8370 }, { "epoch": 0.5331806324362156, "grad_norm": 0.1845703125, "learning_rate": 2.6655850353120826e-05, "loss": 0.0008, "step": 8380 }, { "epoch": 0.5338168861742063, "grad_norm": 0.004150390625, "learning_rate": 2.668766304002036e-05, "loss": 0.0124, "step": 8390 }, { "epoch": 0.534453139912197, "grad_norm": 0.0220947265625, "learning_rate": 2.6719475726919896e-05, "loss": 0.0043, "step": 8400 }, { "epoch": 0.5350893936501877, "grad_norm": 0.007354736328125, "learning_rate": 2.675128841381943e-05, "loss": 0.0032, "step": 8410 }, { "epoch": 0.5357256473881784, "grad_norm": 0.021240234375, "learning_rate": 2.6783101100718967e-05, "loss": 0.0117, "step": 8420 }, { "epoch": 0.5363619011261691, "grad_norm": 0.0439453125, "learning_rate": 2.68149137876185e-05, "loss": 0.0048, "step": 8430 }, { "epoch": 0.5369981548641598, "grad_norm": 1.71875, "learning_rate": 2.684672647451804e-05, "loss": 0.0028, "step": 8440 }, { "epoch": 0.5376344086021505, "grad_norm": 0.00958251953125, "learning_rate": 2.6878539161417575e-05, "loss": 0.0034, "step": 8450 }, { "epoch": 0.5382706623401412, "grad_norm": 0.054931640625, "learning_rate": 2.6910351848317112e-05, "loss": 0.0004, "step": 8460 }, { "epoch": 0.538906916078132, "grad_norm": 0.07763671875, "learning_rate": 2.6942164535216646e-05, "loss": 0.002, "step": 8470 }, { "epoch": 0.5395431698161227, "grad_norm": 0.068359375, "learning_rate": 2.6973977222116183e-05, "loss": 0.0008, "step": 8480 }, { "epoch": 0.5401794235541134, "grad_norm": 0.251953125, "learning_rate": 2.7005789909015716e-05, "loss": 0.003, "step": 8490 }, { "epoch": 0.5408156772921041, "grad_norm": 0.038818359375, "learning_rate": 2.703760259591525e-05, "loss": 0.0032, "step": 8500 }, { "epoch": 0.5414519310300948, "grad_norm": 0.13671875, "learning_rate": 2.7069415282814787e-05, "loss": 0.0005, "step": 8510 }, { "epoch": 0.5420881847680855, "grad_norm": 0.0615234375, "learning_rate": 2.710122796971432e-05, "loss": 0.0053, "step": 8520 }, { "epoch": 0.5427244385060762, "grad_norm": 0.421875, "learning_rate": 2.7133040656613858e-05, "loss": 0.0022, "step": 8530 }, { "epoch": 0.5433606922440669, "grad_norm": 5.34375, "learning_rate": 2.7164853343513398e-05, "loss": 0.0099, "step": 8540 }, { "epoch": 0.5439969459820576, "grad_norm": 0.01214599609375, "learning_rate": 2.7196666030412932e-05, "loss": 0.0018, "step": 8550 }, { "epoch": 0.5446331997200483, "grad_norm": 0.00811767578125, "learning_rate": 2.7228478717312465e-05, "loss": 0.0083, "step": 8560 }, { "epoch": 0.545269453458039, "grad_norm": 0.74609375, "learning_rate": 2.7260291404212002e-05, "loss": 0.0018, "step": 8570 }, { "epoch": 0.5459057071960298, "grad_norm": 0.04248046875, "learning_rate": 2.7292104091111536e-05, "loss": 0.0014, "step": 8580 }, { "epoch": 0.5465419609340205, "grad_norm": 0.04248046875, "learning_rate": 2.7323916778011073e-05, "loss": 0.0024, "step": 8590 }, { "epoch": 0.5471782146720112, "grad_norm": 0.0159912109375, "learning_rate": 2.7355729464910607e-05, "loss": 0.0018, "step": 8600 }, { "epoch": 0.5478144684100019, "grad_norm": 0.01116943359375, "learning_rate": 2.738754215181014e-05, "loss": 0.0026, "step": 8610 }, { "epoch": 0.5484507221479926, "grad_norm": 0.028564453125, "learning_rate": 2.7419354838709678e-05, "loss": 0.0005, "step": 8620 }, { "epoch": 0.5490869758859833, "grad_norm": 0.37890625, "learning_rate": 2.745116752560921e-05, "loss": 0.001, "step": 8630 }, { "epoch": 0.549723229623974, "grad_norm": 0.46875, "learning_rate": 2.748298021250875e-05, "loss": 0.0003, "step": 8640 }, { "epoch": 0.5503594833619647, "grad_norm": 0.068359375, "learning_rate": 2.751479289940829e-05, "loss": 0.0038, "step": 8650 }, { "epoch": 0.5509957370999554, "grad_norm": 0.14453125, "learning_rate": 2.7546605586307822e-05, "loss": 0.0011, "step": 8660 }, { "epoch": 0.5516319908379461, "grad_norm": 0.01275634765625, "learning_rate": 2.7578418273207356e-05, "loss": 0.0002, "step": 8670 }, { "epoch": 0.5522682445759369, "grad_norm": 0.01470947265625, "learning_rate": 2.7610230960106893e-05, "loss": 0.0035, "step": 8680 }, { "epoch": 0.5529044983139276, "grad_norm": 0.00567626953125, "learning_rate": 2.7642043647006427e-05, "loss": 0.0029, "step": 8690 }, { "epoch": 0.5535407520519183, "grad_norm": 0.3515625, "learning_rate": 2.767385633390596e-05, "loss": 0.0022, "step": 8700 }, { "epoch": 0.554177005789909, "grad_norm": 0.05322265625, "learning_rate": 2.7705669020805497e-05, "loss": 0.0011, "step": 8710 }, { "epoch": 0.5548132595278997, "grad_norm": 0.0537109375, "learning_rate": 2.773748170770503e-05, "loss": 0.0013, "step": 8720 }, { "epoch": 0.5554495132658904, "grad_norm": 0.058837890625, "learning_rate": 2.7769294394604568e-05, "loss": 0.0053, "step": 8730 }, { "epoch": 0.5560857670038811, "grad_norm": 0.0125732421875, "learning_rate": 2.780110708150411e-05, "loss": 0.0042, "step": 8740 }, { "epoch": 0.5567220207418718, "grad_norm": 0.1259765625, "learning_rate": 2.7832919768403642e-05, "loss": 0.0036, "step": 8750 }, { "epoch": 0.5573582744798625, "grad_norm": 0.020263671875, "learning_rate": 2.7864732455303176e-05, "loss": 0.002, "step": 8760 }, { "epoch": 0.5579945282178532, "grad_norm": 0.002899169921875, "learning_rate": 2.7896545142202713e-05, "loss": 0.0009, "step": 8770 }, { "epoch": 0.558630781955844, "grad_norm": 0.016845703125, "learning_rate": 2.7928357829102247e-05, "loss": 0.0007, "step": 8780 }, { "epoch": 0.5592670356938347, "grad_norm": 0.1923828125, "learning_rate": 2.7960170516001784e-05, "loss": 0.0132, "step": 8790 }, { "epoch": 0.5599032894318254, "grad_norm": 0.033203125, "learning_rate": 2.7991983202901317e-05, "loss": 0.0032, "step": 8800 }, { "epoch": 0.5605395431698161, "grad_norm": 0.6953125, "learning_rate": 2.802379588980085e-05, "loss": 0.0005, "step": 8810 }, { "epoch": 0.5611757969078068, "grad_norm": 0.208984375, "learning_rate": 2.8055608576700388e-05, "loss": 0.0008, "step": 8820 }, { "epoch": 0.5618120506457975, "grad_norm": 10.375, "learning_rate": 2.808742126359992e-05, "loss": 0.0041, "step": 8830 }, { "epoch": 0.5624483043837882, "grad_norm": 0.03125, "learning_rate": 2.8119233950499462e-05, "loss": 0.0028, "step": 8840 }, { "epoch": 0.5630845581217789, "grad_norm": 0.349609375, "learning_rate": 2.8151046637399e-05, "loss": 0.0011, "step": 8850 }, { "epoch": 0.5637208118597696, "grad_norm": 1.578125, "learning_rate": 2.8182859324298533e-05, "loss": 0.0011, "step": 8860 }, { "epoch": 0.5643570655977603, "grad_norm": 0.11865234375, "learning_rate": 2.8214672011198066e-05, "loss": 0.0005, "step": 8870 }, { "epoch": 0.5649933193357511, "grad_norm": 0.037353515625, "learning_rate": 2.8246484698097603e-05, "loss": 0.0067, "step": 8880 }, { "epoch": 0.5656295730737418, "grad_norm": 0.013671875, "learning_rate": 2.8278297384997137e-05, "loss": 0.0006, "step": 8890 }, { "epoch": 0.5662658268117325, "grad_norm": 0.0115966796875, "learning_rate": 2.8310110071896674e-05, "loss": 0.0016, "step": 8900 }, { "epoch": 0.5669020805497232, "grad_norm": 0.040771484375, "learning_rate": 2.8341922758796208e-05, "loss": 0.0055, "step": 8910 }, { "epoch": 0.5675383342877139, "grad_norm": 0.09326171875, "learning_rate": 2.837373544569574e-05, "loss": 0.0114, "step": 8920 }, { "epoch": 0.5681745880257046, "grad_norm": 0.021484375, "learning_rate": 2.840554813259528e-05, "loss": 0.0004, "step": 8930 }, { "epoch": 0.5688108417636953, "grad_norm": 0.005706787109375, "learning_rate": 2.843736081949482e-05, "loss": 0.0055, "step": 8940 }, { "epoch": 0.569447095501686, "grad_norm": 0.1474609375, "learning_rate": 2.8469173506394353e-05, "loss": 0.0005, "step": 8950 }, { "epoch": 0.5700833492396767, "grad_norm": 0.0020751953125, "learning_rate": 2.850098619329389e-05, "loss": 0.0026, "step": 8960 }, { "epoch": 0.5707196029776674, "grad_norm": 0.0107421875, "learning_rate": 2.8532798880193423e-05, "loss": 0.0063, "step": 8970 }, { "epoch": 0.5713558567156583, "grad_norm": 2.0, "learning_rate": 2.8564611567092957e-05, "loss": 0.0016, "step": 8980 }, { "epoch": 0.571992110453649, "grad_norm": 0.1279296875, "learning_rate": 2.8596424253992494e-05, "loss": 0.0065, "step": 8990 }, { "epoch": 0.5726283641916396, "grad_norm": 0.0264892578125, "learning_rate": 2.8628236940892028e-05, "loss": 0.0178, "step": 9000 }, { "epoch": 0.5732646179296303, "grad_norm": 0.13671875, "learning_rate": 2.8660049627791565e-05, "loss": 0.006, "step": 9010 }, { "epoch": 0.573900871667621, "grad_norm": 0.1455078125, "learning_rate": 2.86918623146911e-05, "loss": 0.0003, "step": 9020 }, { "epoch": 0.5745371254056117, "grad_norm": 12.1875, "learning_rate": 2.8723675001590632e-05, "loss": 0.0098, "step": 9030 }, { "epoch": 0.5751733791436024, "grad_norm": 0.01068115234375, "learning_rate": 2.8755487688490172e-05, "loss": 0.0037, "step": 9040 }, { "epoch": 0.5758096328815931, "grad_norm": 0.4921875, "learning_rate": 2.878730037538971e-05, "loss": 0.0019, "step": 9050 }, { "epoch": 0.5764458866195838, "grad_norm": 0.01708984375, "learning_rate": 2.8819113062289243e-05, "loss": 0.0121, "step": 9060 }, { "epoch": 0.5770821403575745, "grad_norm": 7.59375, "learning_rate": 2.885092574918878e-05, "loss": 0.0078, "step": 9070 }, { "epoch": 0.5777183940955654, "grad_norm": 3.734375, "learning_rate": 2.8882738436088314e-05, "loss": 0.0022, "step": 9080 }, { "epoch": 0.5783546478335561, "grad_norm": 0.10986328125, "learning_rate": 2.8914551122987848e-05, "loss": 0.0122, "step": 9090 }, { "epoch": 0.5789909015715468, "grad_norm": 0.01177978515625, "learning_rate": 2.8946363809887385e-05, "loss": 0.0012, "step": 9100 }, { "epoch": 0.5796271553095375, "grad_norm": 0.027587890625, "learning_rate": 2.8978176496786918e-05, "loss": 0.0016, "step": 9110 }, { "epoch": 0.5802634090475282, "grad_norm": 0.0211181640625, "learning_rate": 2.9009989183686452e-05, "loss": 0.0041, "step": 9120 }, { "epoch": 0.5808996627855189, "grad_norm": 0.016845703125, "learning_rate": 2.904180187058599e-05, "loss": 0.0003, "step": 9130 }, { "epoch": 0.5815359165235096, "grad_norm": 0.015869140625, "learning_rate": 2.907361455748553e-05, "loss": 0.001, "step": 9140 }, { "epoch": 0.5821721702615003, "grad_norm": 1.1796875, "learning_rate": 2.9105427244385063e-05, "loss": 0.0011, "step": 9150 }, { "epoch": 0.582808423999491, "grad_norm": 0.1044921875, "learning_rate": 2.91372399312846e-05, "loss": 0.0128, "step": 9160 }, { "epoch": 0.5834446777374817, "grad_norm": 0.361328125, "learning_rate": 2.9169052618184134e-05, "loss": 0.0004, "step": 9170 }, { "epoch": 0.5840809314754725, "grad_norm": 0.0849609375, "learning_rate": 2.9200865305083667e-05, "loss": 0.0004, "step": 9180 }, { "epoch": 0.5847171852134632, "grad_norm": 0.027099609375, "learning_rate": 2.9232677991983204e-05, "loss": 0.0003, "step": 9190 }, { "epoch": 0.5853534389514539, "grad_norm": 0.006927490234375, "learning_rate": 2.9264490678882738e-05, "loss": 0.0001, "step": 9200 }, { "epoch": 0.5859896926894446, "grad_norm": 0.134765625, "learning_rate": 2.9296303365782275e-05, "loss": 0.0006, "step": 9210 }, { "epoch": 0.5866259464274353, "grad_norm": 0.0179443359375, "learning_rate": 2.932811605268181e-05, "loss": 0.0028, "step": 9220 }, { "epoch": 0.587262200165426, "grad_norm": 0.13671875, "learning_rate": 2.9359928739581342e-05, "loss": 0.0024, "step": 9230 }, { "epoch": 0.5878984539034167, "grad_norm": 0.00543212890625, "learning_rate": 2.9391741426480883e-05, "loss": 0.0014, "step": 9240 }, { "epoch": 0.5885347076414074, "grad_norm": 0.062255859375, "learning_rate": 2.942355411338042e-05, "loss": 0.0014, "step": 9250 }, { "epoch": 0.5891709613793981, "grad_norm": 1.546875, "learning_rate": 2.9455366800279954e-05, "loss": 0.001, "step": 9260 }, { "epoch": 0.5898072151173888, "grad_norm": 0.01422119140625, "learning_rate": 2.948717948717949e-05, "loss": 0.01, "step": 9270 }, { "epoch": 0.5904434688553796, "grad_norm": 0.0020294189453125, "learning_rate": 2.9518992174079024e-05, "loss": 0.0006, "step": 9280 }, { "epoch": 0.5910797225933703, "grad_norm": 0.314453125, "learning_rate": 2.9550804860978558e-05, "loss": 0.0064, "step": 9290 }, { "epoch": 0.591715976331361, "grad_norm": 9.5, "learning_rate": 2.9582617547878095e-05, "loss": 0.0054, "step": 9300 }, { "epoch": 0.5923522300693517, "grad_norm": 0.012451171875, "learning_rate": 2.961443023477763e-05, "loss": 0.0009, "step": 9310 }, { "epoch": 0.5929884838073424, "grad_norm": 0.111328125, "learning_rate": 2.9646242921677166e-05, "loss": 0.0023, "step": 9320 }, { "epoch": 0.5936247375453331, "grad_norm": 0.189453125, "learning_rate": 2.96780556085767e-05, "loss": 0.0011, "step": 9330 }, { "epoch": 0.5942609912833238, "grad_norm": 0.000591278076171875, "learning_rate": 2.970986829547624e-05, "loss": 0.0058, "step": 9340 }, { "epoch": 0.5948972450213145, "grad_norm": 0.3671875, "learning_rate": 2.9741680982375773e-05, "loss": 0.0003, "step": 9350 }, { "epoch": 0.5955334987593052, "grad_norm": 0.01123046875, "learning_rate": 2.977349366927531e-05, "loss": 0.0059, "step": 9360 }, { "epoch": 0.5961697524972959, "grad_norm": 0.005279541015625, "learning_rate": 2.9805306356174844e-05, "loss": 0.0073, "step": 9370 }, { "epoch": 0.5968060062352867, "grad_norm": 0.3359375, "learning_rate": 2.983711904307438e-05, "loss": 0.0014, "step": 9380 }, { "epoch": 0.5974422599732774, "grad_norm": 0.04638671875, "learning_rate": 2.9868931729973915e-05, "loss": 0.006, "step": 9390 }, { "epoch": 0.5980785137112681, "grad_norm": 0.006256103515625, "learning_rate": 2.990074441687345e-05, "loss": 0.0002, "step": 9400 }, { "epoch": 0.5987147674492588, "grad_norm": 0.076171875, "learning_rate": 2.9932557103772986e-05, "loss": 0.0008, "step": 9410 }, { "epoch": 0.5993510211872495, "grad_norm": 0.0556640625, "learning_rate": 2.996436979067252e-05, "loss": 0.0022, "step": 9420 }, { "epoch": 0.5999872749252402, "grad_norm": 0.016845703125, "learning_rate": 2.9996182477572053e-05, "loss": 0.0003, "step": 9430 }, { "epoch": 0.6006235286632309, "grad_norm": 0.061279296875, "learning_rate": 3.0027995164471597e-05, "loss": 0.001, "step": 9440 }, { "epoch": 0.6012597824012216, "grad_norm": 0.042236328125, "learning_rate": 3.005980785137113e-05, "loss": 0.0048, "step": 9450 }, { "epoch": 0.6018960361392123, "grad_norm": 4.875, "learning_rate": 3.0091620538270664e-05, "loss": 0.0031, "step": 9460 }, { "epoch": 0.602532289877203, "grad_norm": 0.376953125, "learning_rate": 3.01234332251702e-05, "loss": 0.0063, "step": 9470 }, { "epoch": 0.6031685436151938, "grad_norm": 0.007293701171875, "learning_rate": 3.0155245912069735e-05, "loss": 0.005, "step": 9480 }, { "epoch": 0.6038047973531845, "grad_norm": 0.326171875, "learning_rate": 3.018705859896927e-05, "loss": 0.0003, "step": 9490 }, { "epoch": 0.6044410510911752, "grad_norm": 0.75390625, "learning_rate": 3.0218871285868805e-05, "loss": 0.001, "step": 9500 }, { "epoch": 0.6050773048291659, "grad_norm": 0.68359375, "learning_rate": 3.025068397276834e-05, "loss": 0.0007, "step": 9510 }, { "epoch": 0.6057135585671566, "grad_norm": 0.1513671875, "learning_rate": 3.0282496659667876e-05, "loss": 0.0061, "step": 9520 }, { "epoch": 0.6063498123051473, "grad_norm": 0.032470703125, "learning_rate": 3.031430934656741e-05, "loss": 0.0035, "step": 9530 }, { "epoch": 0.606986066043138, "grad_norm": 0.06689453125, "learning_rate": 3.034612203346695e-05, "loss": 0.0003, "step": 9540 }, { "epoch": 0.6076223197811287, "grad_norm": 0.6328125, "learning_rate": 3.0377934720366484e-05, "loss": 0.0037, "step": 9550 }, { "epoch": 0.6082585735191194, "grad_norm": 0.053955078125, "learning_rate": 3.040974740726602e-05, "loss": 0.0052, "step": 9560 }, { "epoch": 0.6088948272571101, "grad_norm": 0.01177978515625, "learning_rate": 3.0441560094165555e-05, "loss": 0.0012, "step": 9570 }, { "epoch": 0.6095310809951009, "grad_norm": 0.0078125, "learning_rate": 3.047337278106509e-05, "loss": 0.0052, "step": 9580 }, { "epoch": 0.6101673347330916, "grad_norm": 2.015625, "learning_rate": 3.0505185467964625e-05, "loss": 0.0153, "step": 9590 }, { "epoch": 0.6108035884710823, "grad_norm": 0.01544189453125, "learning_rate": 3.053699815486416e-05, "loss": 0.0019, "step": 9600 }, { "epoch": 0.611439842209073, "grad_norm": 0.07421875, "learning_rate": 3.0568810841763696e-05, "loss": 0.001, "step": 9610 }, { "epoch": 0.6120760959470637, "grad_norm": 0.232421875, "learning_rate": 3.060062352866323e-05, "loss": 0.0013, "step": 9620 }, { "epoch": 0.6127123496850544, "grad_norm": 0.006134033203125, "learning_rate": 3.063243621556276e-05, "loss": 0.0067, "step": 9630 }, { "epoch": 0.6133486034230451, "grad_norm": 1.5390625, "learning_rate": 3.0664248902462304e-05, "loss": 0.0086, "step": 9640 }, { "epoch": 0.6139848571610358, "grad_norm": 0.07470703125, "learning_rate": 3.0696061589361844e-05, "loss": 0.0077, "step": 9650 }, { "epoch": 0.6146211108990265, "grad_norm": 0.04443359375, "learning_rate": 3.072787427626138e-05, "loss": 0.0013, "step": 9660 }, { "epoch": 0.6152573646370172, "grad_norm": 0.03955078125, "learning_rate": 3.075968696316091e-05, "loss": 0.0003, "step": 9670 }, { "epoch": 0.615893618375008, "grad_norm": 0.31640625, "learning_rate": 3.0791499650060445e-05, "loss": 0.0048, "step": 9680 }, { "epoch": 0.6165298721129987, "grad_norm": 0.00101470947265625, "learning_rate": 3.082331233695998e-05, "loss": 0.001, "step": 9690 }, { "epoch": 0.6171661258509894, "grad_norm": 3.25, "learning_rate": 3.085512502385951e-05, "loss": 0.0047, "step": 9700 }, { "epoch": 0.6178023795889801, "grad_norm": 0.228515625, "learning_rate": 3.088693771075905e-05, "loss": 0.0052, "step": 9710 }, { "epoch": 0.6184386333269708, "grad_norm": 0.478515625, "learning_rate": 3.091875039765859e-05, "loss": 0.0008, "step": 9720 }, { "epoch": 0.6190748870649615, "grad_norm": 0.004638671875, "learning_rate": 3.095056308455812e-05, "loss": 0.0007, "step": 9730 }, { "epoch": 0.6197111408029522, "grad_norm": 0.03955078125, "learning_rate": 3.098237577145766e-05, "loss": 0.0026, "step": 9740 }, { "epoch": 0.6203473945409429, "grad_norm": 0.0191650390625, "learning_rate": 3.1014188458357194e-05, "loss": 0.0014, "step": 9750 }, { "epoch": 0.6209836482789336, "grad_norm": 0.0115966796875, "learning_rate": 3.104600114525673e-05, "loss": 0.0002, "step": 9760 }, { "epoch": 0.6216199020169243, "grad_norm": 0.01220703125, "learning_rate": 3.107781383215627e-05, "loss": 0.0159, "step": 9770 }, { "epoch": 0.6222561557549151, "grad_norm": 0.08154296875, "learning_rate": 3.11096265190558e-05, "loss": 0.002, "step": 9780 }, { "epoch": 0.6228924094929058, "grad_norm": 0.0031585693359375, "learning_rate": 3.1141439205955336e-05, "loss": 0.0014, "step": 9790 }, { "epoch": 0.6235286632308965, "grad_norm": 1.5234375, "learning_rate": 3.117325189285487e-05, "loss": 0.0013, "step": 9800 }, { "epoch": 0.6241649169688872, "grad_norm": 7.125, "learning_rate": 3.12050645797544e-05, "loss": 0.0055, "step": 9810 }, { "epoch": 0.6248011707068779, "grad_norm": 5.15625, "learning_rate": 3.1236877266653944e-05, "loss": 0.0053, "step": 9820 }, { "epoch": 0.6254374244448686, "grad_norm": 0.01519775390625, "learning_rate": 3.126868995355348e-05, "loss": 0.0033, "step": 9830 }, { "epoch": 0.6260736781828593, "grad_norm": 0.053466796875, "learning_rate": 3.130050264045302e-05, "loss": 0.0229, "step": 9840 }, { "epoch": 0.62670993192085, "grad_norm": 0.0028076171875, "learning_rate": 3.133231532735255e-05, "loss": 0.0014, "step": 9850 }, { "epoch": 0.6273461856588407, "grad_norm": 0.00811767578125, "learning_rate": 3.1364128014252085e-05, "loss": 0.0003, "step": 9860 }, { "epoch": 0.6279824393968314, "grad_norm": 0.015380859375, "learning_rate": 3.139594070115162e-05, "loss": 0.0007, "step": 9870 }, { "epoch": 0.6286186931348222, "grad_norm": 0.08935546875, "learning_rate": 3.142775338805116e-05, "loss": 0.0051, "step": 9880 }, { "epoch": 0.6292549468728129, "grad_norm": 0.2890625, "learning_rate": 3.145956607495069e-05, "loss": 0.0008, "step": 9890 }, { "epoch": 0.6298912006108036, "grad_norm": 0.0267333984375, "learning_rate": 3.1491378761850226e-05, "loss": 0.0057, "step": 9900 }, { "epoch": 0.6305274543487943, "grad_norm": 0.03466796875, "learning_rate": 3.152319144874976e-05, "loss": 0.0053, "step": 9910 }, { "epoch": 0.631163708086785, "grad_norm": 0.0306396484375, "learning_rate": 3.1555004135649294e-05, "loss": 0.0023, "step": 9920 }, { "epoch": 0.6317999618247757, "grad_norm": 0.01025390625, "learning_rate": 3.1586816822548834e-05, "loss": 0.0018, "step": 9930 }, { "epoch": 0.6324362155627664, "grad_norm": 0.026611328125, "learning_rate": 3.1618629509448375e-05, "loss": 0.0009, "step": 9940 }, { "epoch": 0.6330724693007571, "grad_norm": 0.018310546875, "learning_rate": 3.165044219634791e-05, "loss": 0.0005, "step": 9950 }, { "epoch": 0.6337087230387478, "grad_norm": 0.146484375, "learning_rate": 3.168225488324744e-05, "loss": 0.0122, "step": 9960 }, { "epoch": 0.6343449767767385, "grad_norm": 0.06640625, "learning_rate": 3.1714067570146976e-05, "loss": 0.0034, "step": 9970 }, { "epoch": 0.6349812305147293, "grad_norm": 0.03076171875, "learning_rate": 3.174588025704651e-05, "loss": 0.0002, "step": 9980 }, { "epoch": 0.63561748425272, "grad_norm": 0.2890625, "learning_rate": 3.177769294394605e-05, "loss": 0.0012, "step": 9990 }, { "epoch": 0.6362537379907107, "grad_norm": 0.6328125, "learning_rate": 3.180950563084558e-05, "loss": 0.0012, "step": 10000 }, { "epoch": 0.6368899917287014, "grad_norm": 0.050537109375, "learning_rate": 3.184131831774512e-05, "loss": 0.0005, "step": 10010 }, { "epoch": 0.6375262454666921, "grad_norm": 0.004180908203125, "learning_rate": 3.187313100464465e-05, "loss": 0.0049, "step": 10020 }, { "epoch": 0.6381624992046828, "grad_norm": 0.1591796875, "learning_rate": 3.1904943691544184e-05, "loss": 0.0045, "step": 10030 }, { "epoch": 0.6387987529426735, "grad_norm": 3.78125, "learning_rate": 3.1936756378443725e-05, "loss": 0.0141, "step": 10040 }, { "epoch": 0.6394350066806642, "grad_norm": 0.00799560546875, "learning_rate": 3.1968569065343265e-05, "loss": 0.0023, "step": 10050 }, { "epoch": 0.6400712604186549, "grad_norm": 0.002166748046875, "learning_rate": 3.20003817522428e-05, "loss": 0.0002, "step": 10060 }, { "epoch": 0.6407075141566456, "grad_norm": 0.6875, "learning_rate": 3.203219443914233e-05, "loss": 0.0008, "step": 10070 }, { "epoch": 0.6413437678946364, "grad_norm": 0.03515625, "learning_rate": 3.2064007126041866e-05, "loss": 0.0014, "step": 10080 }, { "epoch": 0.6419800216326271, "grad_norm": 0.0311279296875, "learning_rate": 3.20958198129414e-05, "loss": 0.0228, "step": 10090 }, { "epoch": 0.6426162753706178, "grad_norm": 0.044677734375, "learning_rate": 3.212763249984094e-05, "loss": 0.0025, "step": 10100 }, { "epoch": 0.6432525291086085, "grad_norm": 0.6484375, "learning_rate": 3.2159445186740474e-05, "loss": 0.0012, "step": 10110 }, { "epoch": 0.6438887828465992, "grad_norm": 0.0224609375, "learning_rate": 3.219125787364001e-05, "loss": 0.0027, "step": 10120 }, { "epoch": 0.6445250365845899, "grad_norm": 0.20703125, "learning_rate": 3.222307056053954e-05, "loss": 0.0013, "step": 10130 }, { "epoch": 0.6451612903225806, "grad_norm": 0.021240234375, "learning_rate": 3.225488324743908e-05, "loss": 0.0099, "step": 10140 }, { "epoch": 0.6457975440605713, "grad_norm": 0.26953125, "learning_rate": 3.2286695934338615e-05, "loss": 0.0017, "step": 10150 }, { "epoch": 0.646433797798562, "grad_norm": 0.33203125, "learning_rate": 3.2318508621238156e-05, "loss": 0.0004, "step": 10160 }, { "epoch": 0.6470700515365527, "grad_norm": 0.3203125, "learning_rate": 3.235032130813769e-05, "loss": 0.0007, "step": 10170 }, { "epoch": 0.6477063052745435, "grad_norm": 0.064453125, "learning_rate": 3.238213399503722e-05, "loss": 0.0009, "step": 10180 }, { "epoch": 0.6483425590125342, "grad_norm": 0.034423828125, "learning_rate": 3.241394668193676e-05, "loss": 0.0006, "step": 10190 }, { "epoch": 0.6489788127505249, "grad_norm": 0.47265625, "learning_rate": 3.244575936883629e-05, "loss": 0.0049, "step": 10200 }, { "epoch": 0.6496150664885156, "grad_norm": 0.5625, "learning_rate": 3.247757205573583e-05, "loss": 0.015, "step": 10210 }, { "epoch": 0.6502513202265063, "grad_norm": 0.0031280517578125, "learning_rate": 3.2509384742635364e-05, "loss": 0.0004, "step": 10220 }, { "epoch": 0.650887573964497, "grad_norm": 0.087890625, "learning_rate": 3.25411974295349e-05, "loss": 0.0183, "step": 10230 }, { "epoch": 0.6515238277024877, "grad_norm": 10.625, "learning_rate": 3.257301011643444e-05, "loss": 0.0076, "step": 10240 }, { "epoch": 0.6521600814404784, "grad_norm": 0.78125, "learning_rate": 3.260482280333397e-05, "loss": 0.0054, "step": 10250 }, { "epoch": 0.6527963351784691, "grad_norm": 5.9375, "learning_rate": 3.2636635490233506e-05, "loss": 0.0051, "step": 10260 }, { "epoch": 0.6534325889164598, "grad_norm": 0.007568359375, "learning_rate": 3.2668448177133046e-05, "loss": 0.0017, "step": 10270 }, { "epoch": 0.6540688426544506, "grad_norm": 8.5, "learning_rate": 3.270026086403258e-05, "loss": 0.0096, "step": 10280 }, { "epoch": 0.6547050963924413, "grad_norm": 0.1298828125, "learning_rate": 3.2732073550932114e-05, "loss": 0.0022, "step": 10290 }, { "epoch": 0.655341350130432, "grad_norm": 0.09814453125, "learning_rate": 3.276388623783165e-05, "loss": 0.0244, "step": 10300 }, { "epoch": 0.6559776038684227, "grad_norm": 0.2119140625, "learning_rate": 3.279569892473118e-05, "loss": 0.0027, "step": 10310 }, { "epoch": 0.6566138576064134, "grad_norm": 0.609375, "learning_rate": 3.282751161163072e-05, "loss": 0.0005, "step": 10320 }, { "epoch": 0.6572501113444041, "grad_norm": 0.039794921875, "learning_rate": 3.2859324298530255e-05, "loss": 0.0022, "step": 10330 }, { "epoch": 0.6578863650823948, "grad_norm": 0.1337890625, "learning_rate": 3.2891136985429795e-05, "loss": 0.0006, "step": 10340 }, { "epoch": 0.6585226188203855, "grad_norm": 0.004638671875, "learning_rate": 3.292294967232933e-05, "loss": 0.0014, "step": 10350 }, { "epoch": 0.6591588725583762, "grad_norm": 0.0263671875, "learning_rate": 3.295476235922886e-05, "loss": 0.004, "step": 10360 }, { "epoch": 0.6597951262963669, "grad_norm": 0.01458740234375, "learning_rate": 3.2986575046128396e-05, "loss": 0.0006, "step": 10370 }, { "epoch": 0.6604313800343578, "grad_norm": 2.609375, "learning_rate": 3.301838773302794e-05, "loss": 0.0045, "step": 10380 }, { "epoch": 0.6610676337723485, "grad_norm": 0.003814697265625, "learning_rate": 3.305020041992747e-05, "loss": 0.0018, "step": 10390 }, { "epoch": 0.6617038875103392, "grad_norm": 0.0089111328125, "learning_rate": 3.3082013106827004e-05, "loss": 0.0002, "step": 10400 }, { "epoch": 0.6623401412483299, "grad_norm": 0.008544921875, "learning_rate": 3.311382579372654e-05, "loss": 0.0007, "step": 10410 }, { "epoch": 0.6629763949863205, "grad_norm": 0.01953125, "learning_rate": 3.314563848062607e-05, "loss": 0.0003, "step": 10420 }, { "epoch": 0.6636126487243112, "grad_norm": 0.01348876953125, "learning_rate": 3.3177451167525605e-05, "loss": 0.0007, "step": 10430 }, { "epoch": 0.664248902462302, "grad_norm": 0.5234375, "learning_rate": 3.3209263854425146e-05, "loss": 0.0005, "step": 10440 }, { "epoch": 0.6648851562002926, "grad_norm": 1.15625, "learning_rate": 3.3241076541324686e-05, "loss": 0.003, "step": 10450 }, { "epoch": 0.6655214099382833, "grad_norm": 0.255859375, "learning_rate": 3.327288922822422e-05, "loss": 0.0059, "step": 10460 }, { "epoch": 0.666157663676274, "grad_norm": 5.0, "learning_rate": 3.330470191512375e-05, "loss": 0.0044, "step": 10470 }, { "epoch": 0.6667939174142649, "grad_norm": 2.390625, "learning_rate": 3.333651460202329e-05, "loss": 0.0011, "step": 10480 }, { "epoch": 0.6674301711522556, "grad_norm": 0.007476806640625, "learning_rate": 3.336832728892282e-05, "loss": 0.0018, "step": 10490 }, { "epoch": 0.6680664248902463, "grad_norm": 0.001220703125, "learning_rate": 3.340013997582236e-05, "loss": 0.0007, "step": 10500 }, { "epoch": 0.668702678628237, "grad_norm": 0.047119140625, "learning_rate": 3.3431952662721895e-05, "loss": 0.0037, "step": 10510 }, { "epoch": 0.6693389323662277, "grad_norm": 0.12109375, "learning_rate": 3.346376534962143e-05, "loss": 0.0004, "step": 10520 }, { "epoch": 0.6699751861042184, "grad_norm": 0.01226806640625, "learning_rate": 3.349557803652096e-05, "loss": 0.0004, "step": 10530 }, { "epoch": 0.6706114398422091, "grad_norm": 0.01055908203125, "learning_rate": 3.35273907234205e-05, "loss": 0.0014, "step": 10540 }, { "epoch": 0.6712476935801998, "grad_norm": 0.064453125, "learning_rate": 3.3559203410320036e-05, "loss": 0.0055, "step": 10550 }, { "epoch": 0.6718839473181905, "grad_norm": 0.072265625, "learning_rate": 3.3591016097219577e-05, "loss": 0.0024, "step": 10560 }, { "epoch": 0.6725202010561812, "grad_norm": 0.166015625, "learning_rate": 3.362282878411911e-05, "loss": 0.0007, "step": 10570 }, { "epoch": 0.673156454794172, "grad_norm": 0.1416015625, "learning_rate": 3.3654641471018644e-05, "loss": 0.0013, "step": 10580 }, { "epoch": 0.6737927085321627, "grad_norm": 0.01611328125, "learning_rate": 3.368645415791818e-05, "loss": 0.0043, "step": 10590 }, { "epoch": 0.6744289622701534, "grad_norm": 0.0169677734375, "learning_rate": 3.371826684481771e-05, "loss": 0.001, "step": 10600 }, { "epoch": 0.6750652160081441, "grad_norm": 0.73828125, "learning_rate": 3.375007953171725e-05, "loss": 0.0082, "step": 10610 }, { "epoch": 0.6757014697461348, "grad_norm": 0.03173828125, "learning_rate": 3.3781892218616785e-05, "loss": 0.0026, "step": 10620 }, { "epoch": 0.6763377234841255, "grad_norm": 0.0031585693359375, "learning_rate": 3.381370490551632e-05, "loss": 0.001, "step": 10630 }, { "epoch": 0.6769739772221162, "grad_norm": 0.95703125, "learning_rate": 3.384551759241586e-05, "loss": 0.0015, "step": 10640 }, { "epoch": 0.6776102309601069, "grad_norm": 0.1708984375, "learning_rate": 3.387733027931539e-05, "loss": 0.0007, "step": 10650 }, { "epoch": 0.6782464846980976, "grad_norm": 0.40625, "learning_rate": 3.390914296621493e-05, "loss": 0.0012, "step": 10660 }, { "epoch": 0.6788827384360883, "grad_norm": 0.33984375, "learning_rate": 3.394095565311447e-05, "loss": 0.0032, "step": 10670 }, { "epoch": 0.6795189921740791, "grad_norm": 0.0078125, "learning_rate": 3.3972768340014e-05, "loss": 0.0076, "step": 10680 }, { "epoch": 0.6801552459120698, "grad_norm": 0.037353515625, "learning_rate": 3.4004581026913534e-05, "loss": 0.0005, "step": 10690 }, { "epoch": 0.6807914996500605, "grad_norm": 4.71875, "learning_rate": 3.403639371381307e-05, "loss": 0.0238, "step": 10700 }, { "epoch": 0.6814277533880512, "grad_norm": 0.0245361328125, "learning_rate": 3.40682064007126e-05, "loss": 0.0013, "step": 10710 }, { "epoch": 0.6820640071260419, "grad_norm": 6.875, "learning_rate": 3.410001908761214e-05, "loss": 0.0176, "step": 10720 }, { "epoch": 0.6827002608640326, "grad_norm": 0.20703125, "learning_rate": 3.4131831774511676e-05, "loss": 0.0006, "step": 10730 }, { "epoch": 0.6833365146020233, "grad_norm": 0.109375, "learning_rate": 3.4163644461411216e-05, "loss": 0.001, "step": 10740 }, { "epoch": 0.683972768340014, "grad_norm": 0.4140625, "learning_rate": 3.419545714831075e-05, "loss": 0.0014, "step": 10750 }, { "epoch": 0.6846090220780047, "grad_norm": 0.42578125, "learning_rate": 3.4227269835210284e-05, "loss": 0.0254, "step": 10760 }, { "epoch": 0.6852452758159954, "grad_norm": 0.228515625, "learning_rate": 3.425908252210982e-05, "loss": 0.0021, "step": 10770 }, { "epoch": 0.6858815295539862, "grad_norm": 0.0216064453125, "learning_rate": 3.429089520900936e-05, "loss": 0.0007, "step": 10780 }, { "epoch": 0.6865177832919769, "grad_norm": 2.515625, "learning_rate": 3.432270789590889e-05, "loss": 0.0035, "step": 10790 }, { "epoch": 0.6871540370299676, "grad_norm": 0.006744384765625, "learning_rate": 3.4354520582808425e-05, "loss": 0.0173, "step": 10800 }, { "epoch": 0.6877902907679583, "grad_norm": 0.083984375, "learning_rate": 3.438633326970796e-05, "loss": 0.0015, "step": 10810 }, { "epoch": 0.688426544505949, "grad_norm": 0.0012359619140625, "learning_rate": 3.441814595660749e-05, "loss": 0.0007, "step": 10820 }, { "epoch": 0.6890627982439397, "grad_norm": 0.1484375, "learning_rate": 3.444995864350703e-05, "loss": 0.0009, "step": 10830 }, { "epoch": 0.6896990519819304, "grad_norm": 14.125, "learning_rate": 3.448177133040657e-05, "loss": 0.0154, "step": 10840 }, { "epoch": 0.6903353057199211, "grad_norm": 0.0084228515625, "learning_rate": 3.451358401730611e-05, "loss": 0.0022, "step": 10850 }, { "epoch": 0.6909715594579118, "grad_norm": 0.7265625, "learning_rate": 3.454539670420564e-05, "loss": 0.0087, "step": 10860 }, { "epoch": 0.6916078131959025, "grad_norm": 2.53125, "learning_rate": 3.4577209391105174e-05, "loss": 0.0022, "step": 10870 }, { "epoch": 0.6922440669338933, "grad_norm": 1.1015625, "learning_rate": 3.460902207800471e-05, "loss": 0.0104, "step": 10880 }, { "epoch": 0.692880320671884, "grad_norm": 0.00555419921875, "learning_rate": 3.464083476490425e-05, "loss": 0.0002, "step": 10890 }, { "epoch": 0.6935165744098747, "grad_norm": 2.9375, "learning_rate": 3.467264745180378e-05, "loss": 0.0022, "step": 10900 }, { "epoch": 0.6941528281478654, "grad_norm": 0.0361328125, "learning_rate": 3.4704460138703316e-05, "loss": 0.0039, "step": 10910 }, { "epoch": 0.6947890818858561, "grad_norm": 0.2373046875, "learning_rate": 3.473627282560285e-05, "loss": 0.002, "step": 10920 }, { "epoch": 0.6954253356238468, "grad_norm": 0.00238037109375, "learning_rate": 3.476808551250238e-05, "loss": 0.0007, "step": 10930 }, { "epoch": 0.6960615893618375, "grad_norm": 0.0191650390625, "learning_rate": 3.479989819940192e-05, "loss": 0.0005, "step": 10940 }, { "epoch": 0.6966978430998282, "grad_norm": 0.0179443359375, "learning_rate": 3.4831710886301464e-05, "loss": 0.0022, "step": 10950 }, { "epoch": 0.6973340968378189, "grad_norm": 0.0673828125, "learning_rate": 3.4863523573201e-05, "loss": 0.0015, "step": 10960 }, { "epoch": 0.6979703505758096, "grad_norm": 1.09375, "learning_rate": 3.489533626010053e-05, "loss": 0.0017, "step": 10970 }, { "epoch": 0.6986066043138004, "grad_norm": 0.021728515625, "learning_rate": 3.4927148947000065e-05, "loss": 0.0003, "step": 10980 }, { "epoch": 0.6992428580517911, "grad_norm": 0.357421875, "learning_rate": 3.49589616338996e-05, "loss": 0.0014, "step": 10990 }, { "epoch": 0.6998791117897818, "grad_norm": 0.23828125, "learning_rate": 3.499077432079914e-05, "loss": 0.0007, "step": 11000 }, { "epoch": 0.7005153655277725, "grad_norm": 1.5, "learning_rate": 3.502258700769867e-05, "loss": 0.0014, "step": 11010 }, { "epoch": 0.7011516192657632, "grad_norm": 0.0093994140625, "learning_rate": 3.5054399694598206e-05, "loss": 0.0005, "step": 11020 }, { "epoch": 0.7017878730037539, "grad_norm": 0.006317138671875, "learning_rate": 3.508621238149774e-05, "loss": 0.0188, "step": 11030 }, { "epoch": 0.7024241267417446, "grad_norm": 0.45703125, "learning_rate": 3.511802506839728e-05, "loss": 0.0045, "step": 11040 }, { "epoch": 0.7030603804797353, "grad_norm": 0.0205078125, "learning_rate": 3.5149837755296814e-05, "loss": 0.01, "step": 11050 }, { "epoch": 0.703696634217726, "grad_norm": 0.404296875, "learning_rate": 3.5181650442196354e-05, "loss": 0.0005, "step": 11060 }, { "epoch": 0.7043328879557167, "grad_norm": 0.01300048828125, "learning_rate": 3.521346312909589e-05, "loss": 0.0106, "step": 11070 }, { "epoch": 0.7049691416937075, "grad_norm": 1.4765625, "learning_rate": 3.524527581599542e-05, "loss": 0.0008, "step": 11080 }, { "epoch": 0.7056053954316982, "grad_norm": 0.0211181640625, "learning_rate": 3.5277088502894955e-05, "loss": 0.0007, "step": 11090 }, { "epoch": 0.7062416491696889, "grad_norm": 0.000896453857421875, "learning_rate": 3.530890118979449e-05, "loss": 0.002, "step": 11100 }, { "epoch": 0.7068779029076796, "grad_norm": 0.01055908203125, "learning_rate": 3.534071387669403e-05, "loss": 0.0005, "step": 11110 }, { "epoch": 0.7075141566456703, "grad_norm": 0.06494140625, "learning_rate": 3.537252656359356e-05, "loss": 0.0003, "step": 11120 }, { "epoch": 0.708150410383661, "grad_norm": 0.0888671875, "learning_rate": 3.54043392504931e-05, "loss": 0.0011, "step": 11130 }, { "epoch": 0.7087866641216517, "grad_norm": 0.00177764892578125, "learning_rate": 3.543615193739264e-05, "loss": 0.0003, "step": 11140 }, { "epoch": 0.7094229178596424, "grad_norm": 0.00482177734375, "learning_rate": 3.546796462429217e-05, "loss": 0.0013, "step": 11150 }, { "epoch": 0.7100591715976331, "grad_norm": 0.0123291015625, "learning_rate": 3.5499777311191704e-05, "loss": 0.0004, "step": 11160 }, { "epoch": 0.7106954253356238, "grad_norm": 0.0245361328125, "learning_rate": 3.553158999809124e-05, "loss": 0.0076, "step": 11170 }, { "epoch": 0.7113316790736146, "grad_norm": 0.00115203857421875, "learning_rate": 3.556340268499078e-05, "loss": 0.0012, "step": 11180 }, { "epoch": 0.7119679328116053, "grad_norm": 0.0654296875, "learning_rate": 3.559521537189031e-05, "loss": 0.0002, "step": 11190 }, { "epoch": 0.712604186549596, "grad_norm": 0.011474609375, "learning_rate": 3.5627028058789846e-05, "loss": 0.0006, "step": 11200 }, { "epoch": 0.7132404402875867, "grad_norm": 0.46484375, "learning_rate": 3.565884074568938e-05, "loss": 0.0045, "step": 11210 }, { "epoch": 0.7138766940255774, "grad_norm": 0.00093841552734375, "learning_rate": 3.569065343258891e-05, "loss": 0.0013, "step": 11220 }, { "epoch": 0.7145129477635681, "grad_norm": 0.006011962890625, "learning_rate": 3.5722466119488454e-05, "loss": 0.0003, "step": 11230 }, { "epoch": 0.7151492015015588, "grad_norm": 0.02734375, "learning_rate": 3.5754278806387994e-05, "loss": 0.0009, "step": 11240 }, { "epoch": 0.7157854552395495, "grad_norm": 1.546875, "learning_rate": 3.578609149328753e-05, "loss": 0.0025, "step": 11250 }, { "epoch": 0.7164217089775402, "grad_norm": 0.0810546875, "learning_rate": 3.581790418018706e-05, "loss": 0.0015, "step": 11260 }, { "epoch": 0.7170579627155309, "grad_norm": 0.7578125, "learning_rate": 3.5849716867086595e-05, "loss": 0.0006, "step": 11270 }, { "epoch": 0.7176942164535217, "grad_norm": 0.0196533203125, "learning_rate": 3.588152955398613e-05, "loss": 0.001, "step": 11280 }, { "epoch": 0.7183304701915124, "grad_norm": 6.15625, "learning_rate": 3.591334224088567e-05, "loss": 0.004, "step": 11290 }, { "epoch": 0.7189667239295031, "grad_norm": 0.0213623046875, "learning_rate": 3.59451549277852e-05, "loss": 0.0007, "step": 11300 }, { "epoch": 0.7196029776674938, "grad_norm": 0.193359375, "learning_rate": 3.5976967614684736e-05, "loss": 0.0012, "step": 11310 }, { "epoch": 0.7202392314054845, "grad_norm": 0.10595703125, "learning_rate": 3.600878030158427e-05, "loss": 0.0006, "step": 11320 }, { "epoch": 0.7208754851434752, "grad_norm": 0.470703125, "learning_rate": 3.6040592988483804e-05, "loss": 0.0011, "step": 11330 }, { "epoch": 0.7215117388814659, "grad_norm": 0.01275634765625, "learning_rate": 3.6072405675383344e-05, "loss": 0.0155, "step": 11340 }, { "epoch": 0.7221479926194566, "grad_norm": 0.00118255615234375, "learning_rate": 3.6104218362282885e-05, "loss": 0.0008, "step": 11350 }, { "epoch": 0.7227842463574473, "grad_norm": 0.04443359375, "learning_rate": 3.613603104918242e-05, "loss": 0.0124, "step": 11360 }, { "epoch": 0.723420500095438, "grad_norm": 0.0216064453125, "learning_rate": 3.616784373608195e-05, "loss": 0.0002, "step": 11370 }, { "epoch": 0.7240567538334288, "grad_norm": 1.15625, "learning_rate": 3.6199656422981486e-05, "loss": 0.0009, "step": 11380 }, { "epoch": 0.7246930075714195, "grad_norm": 0.07568359375, "learning_rate": 3.623146910988102e-05, "loss": 0.0014, "step": 11390 }, { "epoch": 0.7253292613094102, "grad_norm": 0.0233154296875, "learning_rate": 3.626328179678056e-05, "loss": 0.0021, "step": 11400 }, { "epoch": 0.7259655150474009, "grad_norm": 0.004241943359375, "learning_rate": 3.629509448368009e-05, "loss": 0.0312, "step": 11410 }, { "epoch": 0.7266017687853916, "grad_norm": 0.0228271484375, "learning_rate": 3.632690717057963e-05, "loss": 0.0005, "step": 11420 }, { "epoch": 0.7272380225233823, "grad_norm": 0.1123046875, "learning_rate": 3.635871985747916e-05, "loss": 0.0006, "step": 11430 }, { "epoch": 0.727874276261373, "grad_norm": 0.043212890625, "learning_rate": 3.63905325443787e-05, "loss": 0.0022, "step": 11440 }, { "epoch": 0.7285105299993637, "grad_norm": 0.00469970703125, "learning_rate": 3.6422345231278235e-05, "loss": 0.001, "step": 11450 }, { "epoch": 0.7291467837373544, "grad_norm": 0.05126953125, "learning_rate": 3.6454157918177775e-05, "loss": 0.0002, "step": 11460 }, { "epoch": 0.7297830374753451, "grad_norm": 0.00433349609375, "learning_rate": 3.648597060507731e-05, "loss": 0.0006, "step": 11470 }, { "epoch": 0.7304192912133359, "grad_norm": 0.006378173828125, "learning_rate": 3.651778329197684e-05, "loss": 0.0014, "step": 11480 }, { "epoch": 0.7310555449513266, "grad_norm": 0.0235595703125, "learning_rate": 3.6549595978876376e-05, "loss": 0.0032, "step": 11490 }, { "epoch": 0.7316917986893173, "grad_norm": 0.130859375, "learning_rate": 3.658140866577591e-05, "loss": 0.0005, "step": 11500 }, { "epoch": 0.732328052427308, "grad_norm": 0.013916015625, "learning_rate": 3.661322135267545e-05, "loss": 0.0001, "step": 11510 }, { "epoch": 0.7329643061652987, "grad_norm": 4.15625, "learning_rate": 3.6645034039574984e-05, "loss": 0.0038, "step": 11520 }, { "epoch": 0.7336005599032894, "grad_norm": 0.058349609375, "learning_rate": 3.667684672647452e-05, "loss": 0.0001, "step": 11530 }, { "epoch": 0.7342368136412801, "grad_norm": 0.447265625, "learning_rate": 3.670865941337406e-05, "loss": 0.0015, "step": 11540 }, { "epoch": 0.7348730673792708, "grad_norm": 0.007354736328125, "learning_rate": 3.674047210027359e-05, "loss": 0.0148, "step": 11550 }, { "epoch": 0.7355093211172615, "grad_norm": 1.4921875, "learning_rate": 3.6772284787173125e-05, "loss": 0.0048, "step": 11560 }, { "epoch": 0.7361455748552522, "grad_norm": 0.033935546875, "learning_rate": 3.6804097474072666e-05, "loss": 0.0115, "step": 11570 }, { "epoch": 0.736781828593243, "grad_norm": 0.025146484375, "learning_rate": 3.68359101609722e-05, "loss": 0.0145, "step": 11580 }, { "epoch": 0.7374180823312337, "grad_norm": 0.2265625, "learning_rate": 3.686772284787173e-05, "loss": 0.0079, "step": 11590 }, { "epoch": 0.7380543360692244, "grad_norm": 0.06494140625, "learning_rate": 3.689953553477127e-05, "loss": 0.0027, "step": 11600 }, { "epoch": 0.7386905898072151, "grad_norm": 0.0103759765625, "learning_rate": 3.69313482216708e-05, "loss": 0.0068, "step": 11610 }, { "epoch": 0.7393268435452058, "grad_norm": 0.0157470703125, "learning_rate": 3.696316090857034e-05, "loss": 0.0055, "step": 11620 }, { "epoch": 0.7399630972831965, "grad_norm": 8.375, "learning_rate": 3.6994973595469874e-05, "loss": 0.027, "step": 11630 }, { "epoch": 0.7405993510211872, "grad_norm": 0.035888671875, "learning_rate": 3.7026786282369415e-05, "loss": 0.0014, "step": 11640 }, { "epoch": 0.7412356047591779, "grad_norm": 0.080078125, "learning_rate": 3.705859896926895e-05, "loss": 0.0003, "step": 11650 }, { "epoch": 0.7418718584971686, "grad_norm": 1.7421875, "learning_rate": 3.709041165616848e-05, "loss": 0.0025, "step": 11660 }, { "epoch": 0.7425081122351593, "grad_norm": 0.05029296875, "learning_rate": 3.7122224343068016e-05, "loss": 0.0001, "step": 11670 }, { "epoch": 0.7431443659731501, "grad_norm": 1.4453125, "learning_rate": 3.7154037029967556e-05, "loss": 0.0008, "step": 11680 }, { "epoch": 0.7437806197111408, "grad_norm": 0.0546875, "learning_rate": 3.718584971686709e-05, "loss": 0.0018, "step": 11690 }, { "epoch": 0.7444168734491315, "grad_norm": 0.0087890625, "learning_rate": 3.7217662403766624e-05, "loss": 0.0036, "step": 11700 }, { "epoch": 0.7450531271871222, "grad_norm": 0.0380859375, "learning_rate": 3.724947509066616e-05, "loss": 0.0016, "step": 11710 }, { "epoch": 0.7456893809251129, "grad_norm": 0.003997802734375, "learning_rate": 3.728128777756569e-05, "loss": 0.0003, "step": 11720 }, { "epoch": 0.7463256346631036, "grad_norm": 4.5625, "learning_rate": 3.731310046446523e-05, "loss": 0.0046, "step": 11730 }, { "epoch": 0.7469618884010943, "grad_norm": 0.09619140625, "learning_rate": 3.734491315136477e-05, "loss": 0.0006, "step": 11740 }, { "epoch": 0.747598142139085, "grad_norm": 0.039794921875, "learning_rate": 3.7376725838264305e-05, "loss": 0.0002, "step": 11750 }, { "epoch": 0.7482343958770757, "grad_norm": 0.04052734375, "learning_rate": 3.740853852516384e-05, "loss": 0.0026, "step": 11760 }, { "epoch": 0.7488706496150664, "grad_norm": 0.15625, "learning_rate": 3.744035121206337e-05, "loss": 0.0053, "step": 11770 }, { "epoch": 0.7495069033530573, "grad_norm": 0.01385498046875, "learning_rate": 3.7472163898962906e-05, "loss": 0.001, "step": 11780 }, { "epoch": 0.750143157091048, "grad_norm": 0.388671875, "learning_rate": 3.750397658586245e-05, "loss": 0.006, "step": 11790 }, { "epoch": 0.7507794108290387, "grad_norm": 0.028564453125, "learning_rate": 3.753578927276198e-05, "loss": 0.0056, "step": 11800 }, { "epoch": 0.7514156645670294, "grad_norm": 0.0291748046875, "learning_rate": 3.7567601959661514e-05, "loss": 0.0014, "step": 11810 }, { "epoch": 0.75205191830502, "grad_norm": 0.0859375, "learning_rate": 3.759941464656105e-05, "loss": 0.0019, "step": 11820 }, { "epoch": 0.7526881720430108, "grad_norm": 0.021484375, "learning_rate": 3.763122733346058e-05, "loss": 0.0005, "step": 11830 }, { "epoch": 0.7533244257810014, "grad_norm": 0.81640625, "learning_rate": 3.766304002036012e-05, "loss": 0.001, "step": 11840 }, { "epoch": 0.7539606795189921, "grad_norm": 0.0089111328125, "learning_rate": 3.769485270725966e-05, "loss": 0.0043, "step": 11850 }, { "epoch": 0.7545969332569828, "grad_norm": 0.076171875, "learning_rate": 3.7726665394159196e-05, "loss": 0.0005, "step": 11860 }, { "epoch": 0.7552331869949735, "grad_norm": 0.00118255615234375, "learning_rate": 3.775847808105873e-05, "loss": 0.0019, "step": 11870 }, { "epoch": 0.7558694407329644, "grad_norm": 7.5625, "learning_rate": 3.779029076795826e-05, "loss": 0.0085, "step": 11880 }, { "epoch": 0.7565056944709551, "grad_norm": 0.0120849609375, "learning_rate": 3.78221034548578e-05, "loss": 0.0029, "step": 11890 }, { "epoch": 0.7571419482089458, "grad_norm": 0.012451171875, "learning_rate": 3.785391614175733e-05, "loss": 0.0006, "step": 11900 }, { "epoch": 0.7577782019469365, "grad_norm": 0.000644683837890625, "learning_rate": 3.788572882865687e-05, "loss": 0.0009, "step": 11910 }, { "epoch": 0.7584144556849272, "grad_norm": 0.03662109375, "learning_rate": 3.7917541515556405e-05, "loss": 0.0061, "step": 11920 }, { "epoch": 0.7590507094229179, "grad_norm": 0.61328125, "learning_rate": 3.794935420245594e-05, "loss": 0.0005, "step": 11930 }, { "epoch": 0.7596869631609086, "grad_norm": 0.12158203125, "learning_rate": 3.798116688935548e-05, "loss": 0.0016, "step": 11940 }, { "epoch": 0.7603232168988993, "grad_norm": 0.2041015625, "learning_rate": 3.801297957625501e-05, "loss": 0.0009, "step": 11950 }, { "epoch": 0.76095947063689, "grad_norm": 0.01116943359375, "learning_rate": 3.8044792263154546e-05, "loss": 0.0003, "step": 11960 }, { "epoch": 0.7615957243748807, "grad_norm": 0.0025177001953125, "learning_rate": 3.807660495005409e-05, "loss": 0.0005, "step": 11970 }, { "epoch": 0.7622319781128715, "grad_norm": 0.008544921875, "learning_rate": 3.810841763695362e-05, "loss": 0.0006, "step": 11980 }, { "epoch": 0.7628682318508622, "grad_norm": 0.1220703125, "learning_rate": 3.8140230323853154e-05, "loss": 0.0031, "step": 11990 }, { "epoch": 0.7635044855888529, "grad_norm": 0.004791259765625, "learning_rate": 3.817204301075269e-05, "loss": 0.0011, "step": 12000 }, { "epoch": 0.7641407393268436, "grad_norm": 0.59375, "learning_rate": 3.820385569765222e-05, "loss": 0.0014, "step": 12010 }, { "epoch": 0.7647769930648343, "grad_norm": 1.171875, "learning_rate": 3.823566838455176e-05, "loss": 0.0009, "step": 12020 }, { "epoch": 0.765413246802825, "grad_norm": 0.87109375, "learning_rate": 3.8267481071451295e-05, "loss": 0.0124, "step": 12030 }, { "epoch": 0.7660495005408157, "grad_norm": 0.1787109375, "learning_rate": 3.8299293758350836e-05, "loss": 0.0004, "step": 12040 }, { "epoch": 0.7666857542788064, "grad_norm": 5.09375, "learning_rate": 3.833110644525037e-05, "loss": 0.0173, "step": 12050 }, { "epoch": 0.7673220080167971, "grad_norm": 0.051513671875, "learning_rate": 3.83629191321499e-05, "loss": 0.0017, "step": 12060 }, { "epoch": 0.7679582617547878, "grad_norm": 1.8984375, "learning_rate": 3.839473181904944e-05, "loss": 0.001, "step": 12070 }, { "epoch": 0.7685945154927786, "grad_norm": 0.06787109375, "learning_rate": 3.842654450594898e-05, "loss": 0.0012, "step": 12080 }, { "epoch": 0.7692307692307693, "grad_norm": 0.018310546875, "learning_rate": 3.845835719284851e-05, "loss": 0.0093, "step": 12090 }, { "epoch": 0.76986702296876, "grad_norm": 0.0038604736328125, "learning_rate": 3.8490169879748045e-05, "loss": 0.0002, "step": 12100 }, { "epoch": 0.7705032767067507, "grad_norm": 0.09130859375, "learning_rate": 3.852198256664758e-05, "loss": 0.0007, "step": 12110 }, { "epoch": 0.7711395304447414, "grad_norm": 0.01123046875, "learning_rate": 3.855379525354711e-05, "loss": 0.0072, "step": 12120 }, { "epoch": 0.7717757841827321, "grad_norm": 0.1943359375, "learning_rate": 3.858560794044665e-05, "loss": 0.0032, "step": 12130 }, { "epoch": 0.7724120379207228, "grad_norm": 0.2158203125, "learning_rate": 3.861742062734619e-05, "loss": 0.0013, "step": 12140 }, { "epoch": 0.7730482916587135, "grad_norm": 0.01544189453125, "learning_rate": 3.8649233314245726e-05, "loss": 0.0002, "step": 12150 }, { "epoch": 0.7736845453967042, "grad_norm": 0.021240234375, "learning_rate": 3.868104600114526e-05, "loss": 0.0012, "step": 12160 }, { "epoch": 0.7743207991346949, "grad_norm": 0.0262451171875, "learning_rate": 3.8712858688044794e-05, "loss": 0.0026, "step": 12170 }, { "epoch": 0.7749570528726857, "grad_norm": 0.2734375, "learning_rate": 3.874467137494433e-05, "loss": 0.0013, "step": 12180 }, { "epoch": 0.7755933066106764, "grad_norm": 0.036376953125, "learning_rate": 3.877648406184387e-05, "loss": 0.0002, "step": 12190 }, { "epoch": 0.7762295603486671, "grad_norm": 0.041748046875, "learning_rate": 3.88082967487434e-05, "loss": 0.0006, "step": 12200 }, { "epoch": 0.7768658140866578, "grad_norm": 0.2734375, "learning_rate": 3.8840109435642935e-05, "loss": 0.0005, "step": 12210 }, { "epoch": 0.7775020678246485, "grad_norm": 1.2421875, "learning_rate": 3.887192212254247e-05, "loss": 0.0014, "step": 12220 }, { "epoch": 0.7781383215626392, "grad_norm": 0.0242919921875, "learning_rate": 3.8903734809442e-05, "loss": 0.0018, "step": 12230 }, { "epoch": 0.7787745753006299, "grad_norm": 0.01214599609375, "learning_rate": 3.893554749634154e-05, "loss": 0.0075, "step": 12240 }, { "epoch": 0.7794108290386206, "grad_norm": 0.004150390625, "learning_rate": 3.896736018324108e-05, "loss": 0.0002, "step": 12250 }, { "epoch": 0.7800470827766113, "grad_norm": 0.0751953125, "learning_rate": 3.899917287014062e-05, "loss": 0.0017, "step": 12260 }, { "epoch": 0.780683336514602, "grad_norm": 1.9375, "learning_rate": 3.903098555704015e-05, "loss": 0.0023, "step": 12270 }, { "epoch": 0.7813195902525928, "grad_norm": 0.0299072265625, "learning_rate": 3.9062798243939684e-05, "loss": 0.0007, "step": 12280 }, { "epoch": 0.7819558439905835, "grad_norm": 0.2041015625, "learning_rate": 3.909461093083922e-05, "loss": 0.0006, "step": 12290 }, { "epoch": 0.7825920977285742, "grad_norm": 0.00433349609375, "learning_rate": 3.912642361773876e-05, "loss": 0.0003, "step": 12300 }, { "epoch": 0.7832283514665649, "grad_norm": 0.03271484375, "learning_rate": 3.915823630463829e-05, "loss": 0.0041, "step": 12310 }, { "epoch": 0.7838646052045556, "grad_norm": 0.05908203125, "learning_rate": 3.9190048991537826e-05, "loss": 0.0033, "step": 12320 }, { "epoch": 0.7845008589425463, "grad_norm": 0.283203125, "learning_rate": 3.922186167843736e-05, "loss": 0.0024, "step": 12330 }, { "epoch": 0.785137112680537, "grad_norm": 0.0120849609375, "learning_rate": 3.925367436533689e-05, "loss": 0.0002, "step": 12340 }, { "epoch": 0.7857733664185277, "grad_norm": 0.86328125, "learning_rate": 3.9285487052236433e-05, "loss": 0.0214, "step": 12350 }, { "epoch": 0.7864096201565184, "grad_norm": 0.055908203125, "learning_rate": 3.9317299739135974e-05, "loss": 0.0007, "step": 12360 }, { "epoch": 0.7870458738945091, "grad_norm": 0.01055908203125, "learning_rate": 3.934911242603551e-05, "loss": 0.0034, "step": 12370 }, { "epoch": 0.7876821276324999, "grad_norm": 0.033447265625, "learning_rate": 3.938092511293504e-05, "loss": 0.0004, "step": 12380 }, { "epoch": 0.7883183813704906, "grad_norm": 0.0162353515625, "learning_rate": 3.9412737799834575e-05, "loss": 0.0002, "step": 12390 }, { "epoch": 0.7889546351084813, "grad_norm": 0.007537841796875, "learning_rate": 3.944455048673411e-05, "loss": 0.0017, "step": 12400 }, { "epoch": 0.789590888846472, "grad_norm": 0.03173828125, "learning_rate": 3.947636317363365e-05, "loss": 0.0009, "step": 12410 }, { "epoch": 0.7902271425844627, "grad_norm": 0.005096435546875, "learning_rate": 3.950817586053318e-05, "loss": 0.009, "step": 12420 }, { "epoch": 0.7908633963224534, "grad_norm": 0.1181640625, "learning_rate": 3.9539988547432716e-05, "loss": 0.0009, "step": 12430 }, { "epoch": 0.7914996500604441, "grad_norm": 0.0223388671875, "learning_rate": 3.957180123433225e-05, "loss": 0.0012, "step": 12440 }, { "epoch": 0.7921359037984348, "grad_norm": 0.006927490234375, "learning_rate": 3.960361392123179e-05, "loss": 0.0003, "step": 12450 }, { "epoch": 0.7927721575364255, "grad_norm": 0.0556640625, "learning_rate": 3.9635426608131324e-05, "loss": 0.0007, "step": 12460 }, { "epoch": 0.7934084112744162, "grad_norm": 0.1865234375, "learning_rate": 3.9667239295030864e-05, "loss": 0.0078, "step": 12470 }, { "epoch": 0.794044665012407, "grad_norm": 0.298828125, "learning_rate": 3.96990519819304e-05, "loss": 0.001, "step": 12480 }, { "epoch": 0.7946809187503977, "grad_norm": 0.00445556640625, "learning_rate": 3.973086466882993e-05, "loss": 0.0012, "step": 12490 }, { "epoch": 0.7953171724883884, "grad_norm": 0.50390625, "learning_rate": 3.9762677355729465e-05, "loss": 0.0009, "step": 12500 }, { "epoch": 0.7959534262263791, "grad_norm": 0.0615234375, "learning_rate": 3.9794490042629e-05, "loss": 0.0002, "step": 12510 }, { "epoch": 0.7965896799643698, "grad_norm": 0.00799560546875, "learning_rate": 3.982630272952854e-05, "loss": 0.0024, "step": 12520 }, { "epoch": 0.7972259337023605, "grad_norm": 0.01446533203125, "learning_rate": 3.985811541642807e-05, "loss": 0.0026, "step": 12530 }, { "epoch": 0.7978621874403512, "grad_norm": 0.00823974609375, "learning_rate": 3.988992810332761e-05, "loss": 0.0016, "step": 12540 }, { "epoch": 0.7984984411783419, "grad_norm": 0.07177734375, "learning_rate": 3.992174079022715e-05, "loss": 0.002, "step": 12550 }, { "epoch": 0.7991346949163326, "grad_norm": 0.043701171875, "learning_rate": 3.995355347712668e-05, "loss": 0.0016, "step": 12560 }, { "epoch": 0.7997709486543233, "grad_norm": 1.09375, "learning_rate": 3.9985366164026215e-05, "loss": 0.0014, "step": 12570 }, { "epoch": 0.8004072023923141, "grad_norm": 0.79296875, "learning_rate": 4.0017178850925755e-05, "loss": 0.0122, "step": 12580 }, { "epoch": 0.8010434561303048, "grad_norm": 0.00823974609375, "learning_rate": 4.004899153782529e-05, "loss": 0.0029, "step": 12590 }, { "epoch": 0.8016797098682955, "grad_norm": 0.671875, "learning_rate": 4.008080422472482e-05, "loss": 0.0005, "step": 12600 }, { "epoch": 0.8023159636062862, "grad_norm": 0.27734375, "learning_rate": 4.0112616911624356e-05, "loss": 0.0024, "step": 12610 }, { "epoch": 0.8029522173442769, "grad_norm": 0.0283203125, "learning_rate": 4.014442959852389e-05, "loss": 0.0002, "step": 12620 }, { "epoch": 0.8035884710822676, "grad_norm": 0.01129150390625, "learning_rate": 4.017624228542342e-05, "loss": 0.0007, "step": 12630 }, { "epoch": 0.8042247248202583, "grad_norm": 0.0001888275146484375, "learning_rate": 4.0208054972322964e-05, "loss": 0.0197, "step": 12640 }, { "epoch": 0.804860978558249, "grad_norm": 0.046142578125, "learning_rate": 4.0239867659222504e-05, "loss": 0.0142, "step": 12650 }, { "epoch": 0.8054972322962397, "grad_norm": 0.00057220458984375, "learning_rate": 4.027168034612204e-05, "loss": 0.0006, "step": 12660 }, { "epoch": 0.8061334860342304, "grad_norm": 0.057373046875, "learning_rate": 4.030349303302157e-05, "loss": 0.0003, "step": 12670 }, { "epoch": 0.8067697397722212, "grad_norm": 0.07666015625, "learning_rate": 4.0335305719921105e-05, "loss": 0.0019, "step": 12680 }, { "epoch": 0.8074059935102119, "grad_norm": 0.00133514404296875, "learning_rate": 4.036711840682064e-05, "loss": 0.0052, "step": 12690 }, { "epoch": 0.8080422472482026, "grad_norm": 0.0294189453125, "learning_rate": 4.039893109372018e-05, "loss": 0.0078, "step": 12700 }, { "epoch": 0.8086785009861933, "grad_norm": 0.0546875, "learning_rate": 4.043074378061971e-05, "loss": 0.0002, "step": 12710 }, { "epoch": 0.809314754724184, "grad_norm": 0.0299072265625, "learning_rate": 4.0462556467519247e-05, "loss": 0.0005, "step": 12720 }, { "epoch": 0.8099510084621747, "grad_norm": 0.0634765625, "learning_rate": 4.049436915441878e-05, "loss": 0.0139, "step": 12730 }, { "epoch": 0.8105872622001654, "grad_norm": 0.08984375, "learning_rate": 4.0526181841318314e-05, "loss": 0.002, "step": 12740 }, { "epoch": 0.8112235159381561, "grad_norm": 1.828125, "learning_rate": 4.0557994528217854e-05, "loss": 0.0042, "step": 12750 }, { "epoch": 0.8118597696761468, "grad_norm": 0.01031494140625, "learning_rate": 4.0589807215117395e-05, "loss": 0.0093, "step": 12760 }, { "epoch": 0.8124960234141375, "grad_norm": 0.0306396484375, "learning_rate": 4.062161990201693e-05, "loss": 0.0259, "step": 12770 }, { "epoch": 0.8131322771521282, "grad_norm": 0.0018157958984375, "learning_rate": 4.065343258891646e-05, "loss": 0.0059, "step": 12780 }, { "epoch": 0.813768530890119, "grad_norm": 0.2197265625, "learning_rate": 4.0685245275815996e-05, "loss": 0.0008, "step": 12790 }, { "epoch": 0.8144047846281097, "grad_norm": 0.048828125, "learning_rate": 4.071705796271553e-05, "loss": 0.0152, "step": 12800 }, { "epoch": 0.8150410383661004, "grad_norm": 0.0233154296875, "learning_rate": 4.074887064961507e-05, "loss": 0.0006, "step": 12810 }, { "epoch": 0.8156772921040911, "grad_norm": 1.1171875, "learning_rate": 4.0780683336514603e-05, "loss": 0.0013, "step": 12820 }, { "epoch": 0.8163135458420818, "grad_norm": 0.06298828125, "learning_rate": 4.081249602341414e-05, "loss": 0.0003, "step": 12830 }, { "epoch": 0.8169497995800725, "grad_norm": 0.005615234375, "learning_rate": 4.084430871031367e-05, "loss": 0.001, "step": 12840 }, { "epoch": 0.8175860533180632, "grad_norm": 0.03564453125, "learning_rate": 4.087612139721321e-05, "loss": 0.0003, "step": 12850 }, { "epoch": 0.8182223070560539, "grad_norm": 0.003814697265625, "learning_rate": 4.0907934084112745e-05, "loss": 0.0002, "step": 12860 }, { "epoch": 0.8188585607940446, "grad_norm": 0.09521484375, "learning_rate": 4.0939746771012285e-05, "loss": 0.0015, "step": 12870 }, { "epoch": 0.8194948145320353, "grad_norm": 0.2119140625, "learning_rate": 4.097155945791182e-05, "loss": 0.0011, "step": 12880 }, { "epoch": 0.8201310682700261, "grad_norm": 0.498046875, "learning_rate": 4.100337214481135e-05, "loss": 0.0006, "step": 12890 }, { "epoch": 0.8207673220080168, "grad_norm": 0.40234375, "learning_rate": 4.1035184831710886e-05, "loss": 0.0024, "step": 12900 }, { "epoch": 0.8214035757460075, "grad_norm": 0.007476806640625, "learning_rate": 4.106699751861042e-05, "loss": 0.0012, "step": 12910 }, { "epoch": 0.8220398294839982, "grad_norm": 0.005859375, "learning_rate": 4.109881020550996e-05, "loss": 0.0125, "step": 12920 }, { "epoch": 0.8226760832219889, "grad_norm": 0.1044921875, "learning_rate": 4.1130622892409494e-05, "loss": 0.0149, "step": 12930 }, { "epoch": 0.8233123369599796, "grad_norm": 0.0019989013671875, "learning_rate": 4.116243557930903e-05, "loss": 0.0005, "step": 12940 }, { "epoch": 0.8239485906979703, "grad_norm": 0.2236328125, "learning_rate": 4.119424826620857e-05, "loss": 0.0011, "step": 12950 }, { "epoch": 0.824584844435961, "grad_norm": 0.00087738037109375, "learning_rate": 4.12260609531081e-05, "loss": 0.0042, "step": 12960 }, { "epoch": 0.8252210981739517, "grad_norm": 0.07373046875, "learning_rate": 4.1257873640007635e-05, "loss": 0.0191, "step": 12970 }, { "epoch": 0.8258573519119424, "grad_norm": 0.00653076171875, "learning_rate": 4.1289686326907176e-05, "loss": 0.0008, "step": 12980 }, { "epoch": 0.8264936056499332, "grad_norm": 0.047119140625, "learning_rate": 4.132149901380671e-05, "loss": 0.0009, "step": 12990 }, { "epoch": 0.8271298593879239, "grad_norm": 0.02734375, "learning_rate": 4.135331170070624e-05, "loss": 0.0017, "step": 13000 }, { "epoch": 0.8277661131259146, "grad_norm": 0.026123046875, "learning_rate": 4.138512438760578e-05, "loss": 0.0003, "step": 13010 }, { "epoch": 0.8284023668639053, "grad_norm": 0.00653076171875, "learning_rate": 4.141693707450531e-05, "loss": 0.0017, "step": 13020 }, { "epoch": 0.829038620601896, "grad_norm": 0.005828857421875, "learning_rate": 4.144874976140485e-05, "loss": 0.0021, "step": 13030 }, { "epoch": 0.8296748743398867, "grad_norm": 0.3828125, "learning_rate": 4.1480562448304385e-05, "loss": 0.0007, "step": 13040 }, { "epoch": 0.8303111280778774, "grad_norm": 0.01239013671875, "learning_rate": 4.1512375135203925e-05, "loss": 0.0021, "step": 13050 }, { "epoch": 0.8309473818158681, "grad_norm": 0.0120849609375, "learning_rate": 4.154418782210346e-05, "loss": 0.0041, "step": 13060 }, { "epoch": 0.8315836355538588, "grad_norm": 3.234375, "learning_rate": 4.157600050900299e-05, "loss": 0.0016, "step": 13070 }, { "epoch": 0.8322198892918495, "grad_norm": 0.0279541015625, "learning_rate": 4.1607813195902526e-05, "loss": 0.0028, "step": 13080 }, { "epoch": 0.8328561430298403, "grad_norm": 0.0152587890625, "learning_rate": 4.1639625882802066e-05, "loss": 0.0009, "step": 13090 }, { "epoch": 0.833492396767831, "grad_norm": 0.01080322265625, "learning_rate": 4.16714385697016e-05, "loss": 0.0016, "step": 13100 }, { "epoch": 0.8341286505058217, "grad_norm": 0.484375, "learning_rate": 4.1703251256601134e-05, "loss": 0.001, "step": 13110 }, { "epoch": 0.8347649042438124, "grad_norm": 0.0203857421875, "learning_rate": 4.173506394350067e-05, "loss": 0.0031, "step": 13120 }, { "epoch": 0.8354011579818031, "grad_norm": 0.01251220703125, "learning_rate": 4.17668766304002e-05, "loss": 0.0006, "step": 13130 }, { "epoch": 0.8360374117197938, "grad_norm": 0.0159912109375, "learning_rate": 4.179868931729974e-05, "loss": 0.0053, "step": 13140 }, { "epoch": 0.8366736654577845, "grad_norm": 0.0291748046875, "learning_rate": 4.183050200419928e-05, "loss": 0.0073, "step": 13150 }, { "epoch": 0.8373099191957752, "grad_norm": 1.1015625, "learning_rate": 4.1862314691098816e-05, "loss": 0.0015, "step": 13160 }, { "epoch": 0.8379461729337659, "grad_norm": 0.10107421875, "learning_rate": 4.189412737799835e-05, "loss": 0.0011, "step": 13170 }, { "epoch": 0.8385824266717566, "grad_norm": 0.060302734375, "learning_rate": 4.192594006489788e-05, "loss": 0.0045, "step": 13180 }, { "epoch": 0.8392186804097475, "grad_norm": 0.2412109375, "learning_rate": 4.1957752751797417e-05, "loss": 0.001, "step": 13190 }, { "epoch": 0.8398549341477382, "grad_norm": 0.041259765625, "learning_rate": 4.198956543869696e-05, "loss": 0.0042, "step": 13200 }, { "epoch": 0.8404911878857289, "grad_norm": 0.0264892578125, "learning_rate": 4.202137812559649e-05, "loss": 0.0004, "step": 13210 }, { "epoch": 0.8411274416237196, "grad_norm": 0.0023040771484375, "learning_rate": 4.2053190812496024e-05, "loss": 0.0006, "step": 13220 }, { "epoch": 0.8417636953617103, "grad_norm": 0.02392578125, "learning_rate": 4.208500349939556e-05, "loss": 0.0015, "step": 13230 }, { "epoch": 0.842399949099701, "grad_norm": 0.6796875, "learning_rate": 4.211681618629509e-05, "loss": 0.0005, "step": 13240 }, { "epoch": 0.8430362028376917, "grad_norm": 0.443359375, "learning_rate": 4.214862887319463e-05, "loss": 0.0021, "step": 13250 }, { "epoch": 0.8436724565756824, "grad_norm": 0.072265625, "learning_rate": 4.218044156009417e-05, "loss": 0.0015, "step": 13260 }, { "epoch": 0.844308710313673, "grad_norm": 0.00494384765625, "learning_rate": 4.2212254246993706e-05, "loss": 0.0002, "step": 13270 }, { "epoch": 0.8449449640516637, "grad_norm": 0.107421875, "learning_rate": 4.224406693389324e-05, "loss": 0.0007, "step": 13280 }, { "epoch": 0.8455812177896546, "grad_norm": 0.01190185546875, "learning_rate": 4.2275879620792773e-05, "loss": 0.001, "step": 13290 }, { "epoch": 0.8462174715276453, "grad_norm": 0.030517578125, "learning_rate": 4.230769230769231e-05, "loss": 0.0006, "step": 13300 }, { "epoch": 0.846853725265636, "grad_norm": 0.52734375, "learning_rate": 4.233950499459185e-05, "loss": 0.0012, "step": 13310 }, { "epoch": 0.8474899790036267, "grad_norm": 0.1142578125, "learning_rate": 4.237131768149138e-05, "loss": 0.0016, "step": 13320 }, { "epoch": 0.8481262327416174, "grad_norm": 0.01104736328125, "learning_rate": 4.2403130368390915e-05, "loss": 0.0002, "step": 13330 }, { "epoch": 0.8487624864796081, "grad_norm": 0.306640625, "learning_rate": 4.243494305529045e-05, "loss": 0.0058, "step": 13340 }, { "epoch": 0.8493987402175988, "grad_norm": 0.028564453125, "learning_rate": 4.246675574218999e-05, "loss": 0.0006, "step": 13350 }, { "epoch": 0.8500349939555895, "grad_norm": 0.041015625, "learning_rate": 4.249856842908952e-05, "loss": 0.0037, "step": 13360 }, { "epoch": 0.8506712476935802, "grad_norm": 0.02490234375, "learning_rate": 4.253038111598906e-05, "loss": 0.0004, "step": 13370 }, { "epoch": 0.8513075014315709, "grad_norm": 0.03857421875, "learning_rate": 4.25621938028886e-05, "loss": 0.0018, "step": 13380 }, { "epoch": 0.8519437551695617, "grad_norm": 0.08251953125, "learning_rate": 4.259400648978813e-05, "loss": 0.0013, "step": 13390 }, { "epoch": 0.8525800089075524, "grad_norm": 0.0145263671875, "learning_rate": 4.2625819176687664e-05, "loss": 0.0019, "step": 13400 }, { "epoch": 0.8532162626455431, "grad_norm": 0.026611328125, "learning_rate": 4.26576318635872e-05, "loss": 0.002, "step": 13410 }, { "epoch": 0.8538525163835338, "grad_norm": 0.01806640625, "learning_rate": 4.268944455048673e-05, "loss": 0.0059, "step": 13420 }, { "epoch": 0.8544887701215245, "grad_norm": 0.115234375, "learning_rate": 4.272125723738627e-05, "loss": 0.0002, "step": 13430 }, { "epoch": 0.8551250238595152, "grad_norm": 0.10400390625, "learning_rate": 4.2753069924285805e-05, "loss": 0.0097, "step": 13440 }, { "epoch": 0.8557612775975059, "grad_norm": 0.029541015625, "learning_rate": 4.2784882611185346e-05, "loss": 0.0003, "step": 13450 }, { "epoch": 0.8563975313354966, "grad_norm": 0.053466796875, "learning_rate": 4.281669529808488e-05, "loss": 0.0065, "step": 13460 }, { "epoch": 0.8570337850734873, "grad_norm": 0.57421875, "learning_rate": 4.284850798498441e-05, "loss": 0.0022, "step": 13470 }, { "epoch": 0.857670038811478, "grad_norm": 0.171875, "learning_rate": 4.288032067188395e-05, "loss": 0.0026, "step": 13480 }, { "epoch": 0.8583062925494688, "grad_norm": 0.349609375, "learning_rate": 4.291213335878349e-05, "loss": 0.0005, "step": 13490 }, { "epoch": 0.8589425462874595, "grad_norm": 0.0208740234375, "learning_rate": 4.294394604568302e-05, "loss": 0.0054, "step": 13500 }, { "epoch": 0.8595788000254502, "grad_norm": 0.0027008056640625, "learning_rate": 4.2975758732582555e-05, "loss": 0.0007, "step": 13510 }, { "epoch": 0.8602150537634409, "grad_norm": 0.16015625, "learning_rate": 4.300757141948209e-05, "loss": 0.0004, "step": 13520 }, { "epoch": 0.8608513075014316, "grad_norm": 0.059814453125, "learning_rate": 4.303938410638162e-05, "loss": 0.0064, "step": 13530 }, { "epoch": 0.8614875612394223, "grad_norm": 0.8046875, "learning_rate": 4.307119679328116e-05, "loss": 0.0016, "step": 13540 }, { "epoch": 0.862123814977413, "grad_norm": 0.00946044921875, "learning_rate": 4.31030094801807e-05, "loss": 0.0003, "step": 13550 }, { "epoch": 0.8627600687154037, "grad_norm": 0.0361328125, "learning_rate": 4.3134822167080236e-05, "loss": 0.0028, "step": 13560 }, { "epoch": 0.8633963224533944, "grad_norm": 0.306640625, "learning_rate": 4.316663485397977e-05, "loss": 0.0003, "step": 13570 }, { "epoch": 0.8640325761913851, "grad_norm": 0.00531005859375, "learning_rate": 4.3198447540879304e-05, "loss": 0.0057, "step": 13580 }, { "epoch": 0.8646688299293759, "grad_norm": 0.0059814453125, "learning_rate": 4.323026022777884e-05, "loss": 0.0004, "step": 13590 }, { "epoch": 0.8653050836673666, "grad_norm": 0.00592041015625, "learning_rate": 4.326207291467838e-05, "loss": 0.0077, "step": 13600 }, { "epoch": 0.8659413374053573, "grad_norm": 0.003021240234375, "learning_rate": 4.329388560157791e-05, "loss": 0.0173, "step": 13610 }, { "epoch": 0.866577591143348, "grad_norm": 0.310546875, "learning_rate": 4.3325698288477445e-05, "loss": 0.0012, "step": 13620 }, { "epoch": 0.8672138448813387, "grad_norm": 0.025146484375, "learning_rate": 4.335751097537698e-05, "loss": 0.0019, "step": 13630 }, { "epoch": 0.8678500986193294, "grad_norm": 1.4375, "learning_rate": 4.338932366227651e-05, "loss": 0.0019, "step": 13640 }, { "epoch": 0.8684863523573201, "grad_norm": 0.04443359375, "learning_rate": 4.342113634917605e-05, "loss": 0.0035, "step": 13650 }, { "epoch": 0.8691226060953108, "grad_norm": 0.37890625, "learning_rate": 4.345294903607559e-05, "loss": 0.0005, "step": 13660 }, { "epoch": 0.8697588598333015, "grad_norm": 0.546875, "learning_rate": 4.348476172297513e-05, "loss": 0.0071, "step": 13670 }, { "epoch": 0.8703951135712922, "grad_norm": 0.09716796875, "learning_rate": 4.351657440987466e-05, "loss": 0.004, "step": 13680 }, { "epoch": 0.871031367309283, "grad_norm": 0.02294921875, "learning_rate": 4.3548387096774194e-05, "loss": 0.0056, "step": 13690 }, { "epoch": 0.8716676210472737, "grad_norm": 0.10009765625, "learning_rate": 4.358019978367373e-05, "loss": 0.0011, "step": 13700 }, { "epoch": 0.8723038747852644, "grad_norm": 0.046630859375, "learning_rate": 4.361201247057327e-05, "loss": 0.0009, "step": 13710 }, { "epoch": 0.8729401285232551, "grad_norm": 10.3125, "learning_rate": 4.36438251574728e-05, "loss": 0.0123, "step": 13720 }, { "epoch": 0.8735763822612458, "grad_norm": 0.2890625, "learning_rate": 4.3675637844372336e-05, "loss": 0.0024, "step": 13730 }, { "epoch": 0.8742126359992365, "grad_norm": 0.337890625, "learning_rate": 4.370745053127187e-05, "loss": 0.0051, "step": 13740 }, { "epoch": 0.8748488897372272, "grad_norm": 0.33984375, "learning_rate": 4.373926321817141e-05, "loss": 0.0005, "step": 13750 }, { "epoch": 0.8754851434752179, "grad_norm": 0.02587890625, "learning_rate": 4.3771075905070943e-05, "loss": 0.0004, "step": 13760 }, { "epoch": 0.8761213972132086, "grad_norm": 0.078125, "learning_rate": 4.3802888591970484e-05, "loss": 0.0017, "step": 13770 }, { "epoch": 0.8767576509511993, "grad_norm": 0.0216064453125, "learning_rate": 4.383470127887002e-05, "loss": 0.0036, "step": 13780 }, { "epoch": 0.8773939046891901, "grad_norm": 0.059814453125, "learning_rate": 4.386651396576955e-05, "loss": 0.0042, "step": 13790 }, { "epoch": 0.8780301584271808, "grad_norm": 0.1435546875, "learning_rate": 4.3898326652669085e-05, "loss": 0.0028, "step": 13800 }, { "epoch": 0.8786664121651715, "grad_norm": 0.005462646484375, "learning_rate": 4.393013933956862e-05, "loss": 0.0005, "step": 13810 }, { "epoch": 0.8793026659031622, "grad_norm": 0.0133056640625, "learning_rate": 4.396195202646816e-05, "loss": 0.0019, "step": 13820 }, { "epoch": 0.8799389196411529, "grad_norm": 0.032470703125, "learning_rate": 4.399376471336769e-05, "loss": 0.0049, "step": 13830 }, { "epoch": 0.8805751733791436, "grad_norm": 0.000965118408203125, "learning_rate": 4.4025577400267226e-05, "loss": 0.0067, "step": 13840 }, { "epoch": 0.8812114271171343, "grad_norm": 0.06494140625, "learning_rate": 4.405739008716677e-05, "loss": 0.0006, "step": 13850 }, { "epoch": 0.881847680855125, "grad_norm": 0.1513671875, "learning_rate": 4.40892027740663e-05, "loss": 0.0005, "step": 13860 }, { "epoch": 0.8824839345931157, "grad_norm": 0.232421875, "learning_rate": 4.4121015460965834e-05, "loss": 0.0008, "step": 13870 }, { "epoch": 0.8831201883311064, "grad_norm": 0.01544189453125, "learning_rate": 4.4152828147865374e-05, "loss": 0.0008, "step": 13880 }, { "epoch": 0.8837564420690972, "grad_norm": 0.392578125, "learning_rate": 4.418464083476491e-05, "loss": 0.0016, "step": 13890 }, { "epoch": 0.8843926958070879, "grad_norm": 0.005462646484375, "learning_rate": 4.421645352166444e-05, "loss": 0.0002, "step": 13900 }, { "epoch": 0.8850289495450786, "grad_norm": 0.13671875, "learning_rate": 4.4248266208563975e-05, "loss": 0.0027, "step": 13910 }, { "epoch": 0.8856652032830693, "grad_norm": 0.041748046875, "learning_rate": 4.428007889546351e-05, "loss": 0.001, "step": 13920 }, { "epoch": 0.88630145702106, "grad_norm": 0.11474609375, "learning_rate": 4.431189158236305e-05, "loss": 0.0171, "step": 13930 }, { "epoch": 0.8869377107590507, "grad_norm": 0.005584716796875, "learning_rate": 4.434370426926258e-05, "loss": 0.0148, "step": 13940 }, { "epoch": 0.8875739644970414, "grad_norm": 0.0712890625, "learning_rate": 4.4375516956162124e-05, "loss": 0.0011, "step": 13950 }, { "epoch": 0.8882102182350321, "grad_norm": 0.023681640625, "learning_rate": 4.440732964306166e-05, "loss": 0.0106, "step": 13960 }, { "epoch": 0.8888464719730228, "grad_norm": 0.00433349609375, "learning_rate": 4.443914232996119e-05, "loss": 0.0123, "step": 13970 }, { "epoch": 0.8894827257110135, "grad_norm": 8.0625, "learning_rate": 4.4470955016860725e-05, "loss": 0.0167, "step": 13980 }, { "epoch": 0.8901189794490043, "grad_norm": 0.0054931640625, "learning_rate": 4.4502767703760265e-05, "loss": 0.0163, "step": 13990 }, { "epoch": 0.890755233186995, "grad_norm": 0.0751953125, "learning_rate": 4.45345803906598e-05, "loss": 0.0051, "step": 14000 }, { "epoch": 0.8913914869249857, "grad_norm": 0.0257568359375, "learning_rate": 4.456639307755933e-05, "loss": 0.0005, "step": 14010 }, { "epoch": 0.8920277406629764, "grad_norm": 0.043701171875, "learning_rate": 4.4598205764458866e-05, "loss": 0.0008, "step": 14020 }, { "epoch": 0.8926639944009671, "grad_norm": 0.00909423828125, "learning_rate": 4.46300184513584e-05, "loss": 0.0004, "step": 14030 }, { "epoch": 0.8933002481389578, "grad_norm": 0.2236328125, "learning_rate": 4.466183113825794e-05, "loss": 0.004, "step": 14040 }, { "epoch": 0.8939365018769485, "grad_norm": 0.1943359375, "learning_rate": 4.469364382515748e-05, "loss": 0.0011, "step": 14050 }, { "epoch": 0.8945727556149392, "grad_norm": 0.036376953125, "learning_rate": 4.4725456512057014e-05, "loss": 0.0007, "step": 14060 }, { "epoch": 0.8952090093529299, "grad_norm": 0.007537841796875, "learning_rate": 4.475726919895655e-05, "loss": 0.0031, "step": 14070 }, { "epoch": 0.8958452630909206, "grad_norm": 3.0, "learning_rate": 4.478908188585608e-05, "loss": 0.0007, "step": 14080 }, { "epoch": 0.8964815168289114, "grad_norm": 0.0081787109375, "learning_rate": 4.4820894572755615e-05, "loss": 0.0004, "step": 14090 }, { "epoch": 0.8971177705669021, "grad_norm": 0.0211181640625, "learning_rate": 4.4852707259655156e-05, "loss": 0.0003, "step": 14100 }, { "epoch": 0.8977540243048928, "grad_norm": 0.03662109375, "learning_rate": 4.488451994655469e-05, "loss": 0.0026, "step": 14110 }, { "epoch": 0.8983902780428835, "grad_norm": 0.006317138671875, "learning_rate": 4.491633263345422e-05, "loss": 0.0003, "step": 14120 }, { "epoch": 0.8990265317808742, "grad_norm": 0.026123046875, "learning_rate": 4.4948145320353757e-05, "loss": 0.0022, "step": 14130 }, { "epoch": 0.8996627855188649, "grad_norm": 2.9375, "learning_rate": 4.497995800725329e-05, "loss": 0.0016, "step": 14140 }, { "epoch": 0.9002990392568556, "grad_norm": 0.009033203125, "learning_rate": 4.501177069415283e-05, "loss": 0.0002, "step": 14150 }, { "epoch": 0.9009352929948463, "grad_norm": 0.11767578125, "learning_rate": 4.504358338105237e-05, "loss": 0.0075, "step": 14160 }, { "epoch": 0.901571546732837, "grad_norm": 0.052734375, "learning_rate": 4.5075396067951905e-05, "loss": 0.0033, "step": 14170 }, { "epoch": 0.9022078004708277, "grad_norm": 0.0017547607421875, "learning_rate": 4.510720875485144e-05, "loss": 0.0043, "step": 14180 }, { "epoch": 0.9028440542088185, "grad_norm": 0.00555419921875, "learning_rate": 4.513902144175097e-05, "loss": 0.0049, "step": 14190 }, { "epoch": 0.9034803079468092, "grad_norm": 0.0189208984375, "learning_rate": 4.5170834128650506e-05, "loss": 0.0013, "step": 14200 }, { "epoch": 0.9041165616847999, "grad_norm": 0.006256103515625, "learning_rate": 4.520264681555004e-05, "loss": 0.0002, "step": 14210 }, { "epoch": 0.9047528154227906, "grad_norm": 3.625, "learning_rate": 4.523445950244958e-05, "loss": 0.0044, "step": 14220 }, { "epoch": 0.9053890691607813, "grad_norm": 0.0224609375, "learning_rate": 4.5266272189349114e-05, "loss": 0.0056, "step": 14230 }, { "epoch": 0.906025322898772, "grad_norm": 0.314453125, "learning_rate": 4.529808487624865e-05, "loss": 0.0006, "step": 14240 }, { "epoch": 0.9066615766367627, "grad_norm": 0.000942230224609375, "learning_rate": 4.532989756314819e-05, "loss": 0.0034, "step": 14250 }, { "epoch": 0.9072978303747534, "grad_norm": 0.013427734375, "learning_rate": 4.536171025004772e-05, "loss": 0.0015, "step": 14260 }, { "epoch": 0.9079340841127441, "grad_norm": 0.00799560546875, "learning_rate": 4.5393522936947255e-05, "loss": 0.0008, "step": 14270 }, { "epoch": 0.9085703378507348, "grad_norm": 0.01116943359375, "learning_rate": 4.5425335623846795e-05, "loss": 0.0049, "step": 14280 }, { "epoch": 0.9092065915887256, "grad_norm": 2.34375, "learning_rate": 4.545714831074633e-05, "loss": 0.0019, "step": 14290 }, { "epoch": 0.9098428453267163, "grad_norm": 0.01214599609375, "learning_rate": 4.548896099764586e-05, "loss": 0.0017, "step": 14300 }, { "epoch": 0.910479099064707, "grad_norm": 0.0040283203125, "learning_rate": 4.5520773684545396e-05, "loss": 0.0009, "step": 14310 }, { "epoch": 0.9111153528026977, "grad_norm": 0.002716064453125, "learning_rate": 4.555258637144493e-05, "loss": 0.0002, "step": 14320 }, { "epoch": 0.9117516065406884, "grad_norm": 7.4375, "learning_rate": 4.558439905834447e-05, "loss": 0.0058, "step": 14330 }, { "epoch": 0.9123878602786791, "grad_norm": 0.1318359375, "learning_rate": 4.5616211745244004e-05, "loss": 0.0104, "step": 14340 }, { "epoch": 0.9130241140166698, "grad_norm": 0.177734375, "learning_rate": 4.5648024432143545e-05, "loss": 0.0008, "step": 14350 }, { "epoch": 0.9136603677546605, "grad_norm": 0.0390625, "learning_rate": 4.567983711904308e-05, "loss": 0.0004, "step": 14360 }, { "epoch": 0.9142966214926512, "grad_norm": 0.06689453125, "learning_rate": 4.571164980594261e-05, "loss": 0.0006, "step": 14370 }, { "epoch": 0.9149328752306419, "grad_norm": 0.25390625, "learning_rate": 4.5743462492842145e-05, "loss": 0.0007, "step": 14380 }, { "epoch": 0.9155691289686327, "grad_norm": 0.01019287109375, "learning_rate": 4.5775275179741686e-05, "loss": 0.0063, "step": 14390 }, { "epoch": 0.9162053827066234, "grad_norm": 0.00167083740234375, "learning_rate": 4.580708786664122e-05, "loss": 0.0006, "step": 14400 }, { "epoch": 0.9168416364446141, "grad_norm": 0.0218505859375, "learning_rate": 4.583890055354075e-05, "loss": 0.0001, "step": 14410 }, { "epoch": 0.9174778901826048, "grad_norm": 0.02392578125, "learning_rate": 4.587071324044029e-05, "loss": 0.0002, "step": 14420 }, { "epoch": 0.9181141439205955, "grad_norm": 0.03955078125, "learning_rate": 4.590252592733982e-05, "loss": 0.0003, "step": 14430 }, { "epoch": 0.9187503976585862, "grad_norm": 0.1328125, "learning_rate": 4.593433861423936e-05, "loss": 0.003, "step": 14440 }, { "epoch": 0.9193866513965769, "grad_norm": 3.8125, "learning_rate": 4.59661513011389e-05, "loss": 0.0038, "step": 14450 }, { "epoch": 0.9200229051345676, "grad_norm": 0.1845703125, "learning_rate": 4.5997963988038435e-05, "loss": 0.0006, "step": 14460 }, { "epoch": 0.9206591588725583, "grad_norm": 0.004058837890625, "learning_rate": 4.602977667493797e-05, "loss": 0.0008, "step": 14470 }, { "epoch": 0.921295412610549, "grad_norm": 0.10986328125, "learning_rate": 4.60615893618375e-05, "loss": 0.0003, "step": 14480 }, { "epoch": 0.9219316663485398, "grad_norm": 0.09326171875, "learning_rate": 4.6093402048737036e-05, "loss": 0.0003, "step": 14490 }, { "epoch": 0.9225679200865305, "grad_norm": 0.006011962890625, "learning_rate": 4.6125214735636576e-05, "loss": 0.0081, "step": 14500 }, { "epoch": 0.9232041738245212, "grad_norm": 0.01611328125, "learning_rate": 4.615702742253611e-05, "loss": 0.008, "step": 14510 }, { "epoch": 0.923840427562512, "grad_norm": 0.05419921875, "learning_rate": 4.6188840109435644e-05, "loss": 0.0053, "step": 14520 }, { "epoch": 0.9244766813005026, "grad_norm": 0.0257568359375, "learning_rate": 4.622065279633518e-05, "loss": 0.0053, "step": 14530 }, { "epoch": 0.9251129350384933, "grad_norm": 0.51953125, "learning_rate": 4.625246548323471e-05, "loss": 0.0078, "step": 14540 }, { "epoch": 0.925749188776484, "grad_norm": 1.609375, "learning_rate": 4.628427817013425e-05, "loss": 0.003, "step": 14550 }, { "epoch": 0.9263854425144747, "grad_norm": 0.0120849609375, "learning_rate": 4.631609085703379e-05, "loss": 0.0011, "step": 14560 }, { "epoch": 0.9270216962524654, "grad_norm": 0.640625, "learning_rate": 4.6347903543933326e-05, "loss": 0.0015, "step": 14570 }, { "epoch": 0.9276579499904561, "grad_norm": 0.010009765625, "learning_rate": 4.637971623083286e-05, "loss": 0.0018, "step": 14580 }, { "epoch": 0.928294203728447, "grad_norm": 0.00244140625, "learning_rate": 4.641152891773239e-05, "loss": 0.0078, "step": 14590 }, { "epoch": 0.9289304574664377, "grad_norm": 0.00274658203125, "learning_rate": 4.644334160463193e-05, "loss": 0.0014, "step": 14600 }, { "epoch": 0.9295667112044284, "grad_norm": 0.005157470703125, "learning_rate": 4.647515429153147e-05, "loss": 0.0005, "step": 14610 }, { "epoch": 0.930202964942419, "grad_norm": 2.65625, "learning_rate": 4.6506966978431e-05, "loss": 0.0015, "step": 14620 }, { "epoch": 0.9308392186804098, "grad_norm": 0.0201416015625, "learning_rate": 4.6538779665330534e-05, "loss": 0.0013, "step": 14630 }, { "epoch": 0.9314754724184005, "grad_norm": 0.0040283203125, "learning_rate": 4.657059235223007e-05, "loss": 0.0021, "step": 14640 }, { "epoch": 0.9321117261563912, "grad_norm": 0.58203125, "learning_rate": 4.660240503912961e-05, "loss": 0.0018, "step": 14650 }, { "epoch": 0.9327479798943819, "grad_norm": 0.2021484375, "learning_rate": 4.663421772602914e-05, "loss": 0.0003, "step": 14660 }, { "epoch": 0.9333842336323726, "grad_norm": 0.76953125, "learning_rate": 4.666603041292868e-05, "loss": 0.0006, "step": 14670 }, { "epoch": 0.9340204873703633, "grad_norm": 0.0947265625, "learning_rate": 4.6697843099828216e-05, "loss": 0.0038, "step": 14680 }, { "epoch": 0.9346567411083541, "grad_norm": 0.0595703125, "learning_rate": 4.672965578672775e-05, "loss": 0.005, "step": 14690 }, { "epoch": 0.9352929948463448, "grad_norm": 0.05712890625, "learning_rate": 4.6761468473627284e-05, "loss": 0.0013, "step": 14700 }, { "epoch": 0.9359292485843355, "grad_norm": 0.05517578125, "learning_rate": 4.679328116052682e-05, "loss": 0.0014, "step": 14710 }, { "epoch": 0.9365655023223262, "grad_norm": 0.01202392578125, "learning_rate": 4.682509384742636e-05, "loss": 0.0049, "step": 14720 }, { "epoch": 0.9372017560603169, "grad_norm": 0.1376953125, "learning_rate": 4.685690653432589e-05, "loss": 0.0007, "step": 14730 }, { "epoch": 0.9378380097983076, "grad_norm": 1.6328125, "learning_rate": 4.6888719221225425e-05, "loss": 0.0072, "step": 14740 }, { "epoch": 0.9384742635362983, "grad_norm": 2.390625, "learning_rate": 4.6920531908124965e-05, "loss": 0.0014, "step": 14750 }, { "epoch": 0.939110517274289, "grad_norm": 3.859375, "learning_rate": 4.69523445950245e-05, "loss": 0.0016, "step": 14760 }, { "epoch": 0.9397467710122797, "grad_norm": 0.1953125, "learning_rate": 4.698415728192403e-05, "loss": 0.0021, "step": 14770 }, { "epoch": 0.9403830247502704, "grad_norm": 0.045654296875, "learning_rate": 4.701596996882357e-05, "loss": 0.0003, "step": 14780 }, { "epoch": 0.9410192784882612, "grad_norm": 0.25, "learning_rate": 4.704778265572311e-05, "loss": 0.0014, "step": 14790 }, { "epoch": 0.9416555322262519, "grad_norm": 0.06396484375, "learning_rate": 4.707959534262264e-05, "loss": 0.0029, "step": 14800 }, { "epoch": 0.9422917859642426, "grad_norm": 0.028076171875, "learning_rate": 4.7111408029522174e-05, "loss": 0.0004, "step": 14810 }, { "epoch": 0.9429280397022333, "grad_norm": 0.02685546875, "learning_rate": 4.714322071642171e-05, "loss": 0.0081, "step": 14820 }, { "epoch": 0.943564293440224, "grad_norm": 0.423828125, "learning_rate": 4.717503340332125e-05, "loss": 0.0007, "step": 14830 }, { "epoch": 0.9442005471782147, "grad_norm": 0.01251220703125, "learning_rate": 4.720684609022078e-05, "loss": 0.0001, "step": 14840 }, { "epoch": 0.9448368009162054, "grad_norm": 0.06396484375, "learning_rate": 4.723865877712032e-05, "loss": 0.0024, "step": 14850 }, { "epoch": 0.9454730546541961, "grad_norm": 0.0038299560546875, "learning_rate": 4.7270471464019856e-05, "loss": 0.0002, "step": 14860 }, { "epoch": 0.9461093083921868, "grad_norm": 0.02587890625, "learning_rate": 4.730228415091939e-05, "loss": 0.0032, "step": 14870 }, { "epoch": 0.9467455621301775, "grad_norm": 0.057373046875, "learning_rate": 4.733409683781892e-05, "loss": 0.0004, "step": 14880 }, { "epoch": 0.9473818158681683, "grad_norm": 0.01171875, "learning_rate": 4.7365909524718464e-05, "loss": 0.0141, "step": 14890 }, { "epoch": 0.948018069606159, "grad_norm": 2.5625, "learning_rate": 4.7397722211618e-05, "loss": 0.0032, "step": 14900 }, { "epoch": 0.9486543233441497, "grad_norm": 0.006072998046875, "learning_rate": 4.742953489851753e-05, "loss": 0.0006, "step": 14910 }, { "epoch": 0.9492905770821404, "grad_norm": 0.03271484375, "learning_rate": 4.7461347585417065e-05, "loss": 0.0004, "step": 14920 }, { "epoch": 0.9499268308201311, "grad_norm": 0.087890625, "learning_rate": 4.74931602723166e-05, "loss": 0.0011, "step": 14930 }, { "epoch": 0.9505630845581218, "grad_norm": 1.203125, "learning_rate": 4.752497295921613e-05, "loss": 0.0007, "step": 14940 }, { "epoch": 0.9511993382961125, "grad_norm": 0.01153564453125, "learning_rate": 4.755678564611567e-05, "loss": 0.0139, "step": 14950 }, { "epoch": 0.9518355920341032, "grad_norm": 0.0419921875, "learning_rate": 4.758859833301521e-05, "loss": 0.0017, "step": 14960 }, { "epoch": 0.9524718457720939, "grad_norm": 0.111328125, "learning_rate": 4.7620411019914747e-05, "loss": 0.0003, "step": 14970 }, { "epoch": 0.9531080995100846, "grad_norm": 0.0040283203125, "learning_rate": 4.765222370681428e-05, "loss": 0.0033, "step": 14980 }, { "epoch": 0.9537443532480754, "grad_norm": 0.02880859375, "learning_rate": 4.7684036393713814e-05, "loss": 0.0014, "step": 14990 }, { "epoch": 0.9543806069860661, "grad_norm": 0.109375, "learning_rate": 4.771584908061335e-05, "loss": 0.0004, "step": 15000 }, { "epoch": 0.9550168607240568, "grad_norm": 0.004974365234375, "learning_rate": 4.774766176751289e-05, "loss": 0.0043, "step": 15010 }, { "epoch": 0.9556531144620475, "grad_norm": 0.030029296875, "learning_rate": 4.777947445441242e-05, "loss": 0.0166, "step": 15020 }, { "epoch": 0.9562893682000382, "grad_norm": 4.59375, "learning_rate": 4.7811287141311955e-05, "loss": 0.0031, "step": 15030 }, { "epoch": 0.9569256219380289, "grad_norm": 0.02587890625, "learning_rate": 4.784309982821149e-05, "loss": 0.0044, "step": 15040 }, { "epoch": 0.9575618756760196, "grad_norm": 0.005096435546875, "learning_rate": 4.787491251511103e-05, "loss": 0.0042, "step": 15050 }, { "epoch": 0.9581981294140103, "grad_norm": 0.0030975341796875, "learning_rate": 4.790672520201056e-05, "loss": 0.0033, "step": 15060 }, { "epoch": 0.958834383152001, "grad_norm": 0.006256103515625, "learning_rate": 4.7938537888910103e-05, "loss": 0.0028, "step": 15070 }, { "epoch": 0.9594706368899917, "grad_norm": 0.07958984375, "learning_rate": 4.797035057580964e-05, "loss": 0.0014, "step": 15080 }, { "epoch": 0.9601068906279825, "grad_norm": 0.0030364990234375, "learning_rate": 4.800216326270917e-05, "loss": 0.0002, "step": 15090 }, { "epoch": 0.9607431443659732, "grad_norm": 0.0040283203125, "learning_rate": 4.8033975949608704e-05, "loss": 0.0089, "step": 15100 }, { "epoch": 0.9613793981039639, "grad_norm": 0.1748046875, "learning_rate": 4.806578863650824e-05, "loss": 0.0068, "step": 15110 }, { "epoch": 0.9620156518419546, "grad_norm": 3.203125, "learning_rate": 4.809760132340778e-05, "loss": 0.0115, "step": 15120 }, { "epoch": 0.9626519055799453, "grad_norm": 0.0213623046875, "learning_rate": 4.812941401030731e-05, "loss": 0.0036, "step": 15130 }, { "epoch": 0.963288159317936, "grad_norm": 0.6875, "learning_rate": 4.8161226697206846e-05, "loss": 0.0013, "step": 15140 }, { "epoch": 0.9639244130559267, "grad_norm": 0.2041015625, "learning_rate": 4.8193039384106386e-05, "loss": 0.0043, "step": 15150 }, { "epoch": 0.9645606667939174, "grad_norm": 0.0106201171875, "learning_rate": 4.822485207100592e-05, "loss": 0.0037, "step": 15160 }, { "epoch": 0.9651969205319081, "grad_norm": 0.0167236328125, "learning_rate": 4.8256664757905454e-05, "loss": 0.0126, "step": 15170 }, { "epoch": 0.9658331742698988, "grad_norm": 0.796875, "learning_rate": 4.8288477444804994e-05, "loss": 0.0016, "step": 15180 }, { "epoch": 0.9664694280078896, "grad_norm": 0.08984375, "learning_rate": 4.832029013170453e-05, "loss": 0.0005, "step": 15190 }, { "epoch": 0.9671056817458803, "grad_norm": 0.0247802734375, "learning_rate": 4.835210281860406e-05, "loss": 0.0005, "step": 15200 }, { "epoch": 0.967741935483871, "grad_norm": 0.016357421875, "learning_rate": 4.8383915505503595e-05, "loss": 0.0008, "step": 15210 }, { "epoch": 0.9683781892218617, "grad_norm": 0.029052734375, "learning_rate": 4.841572819240313e-05, "loss": 0.0005, "step": 15220 }, { "epoch": 0.9690144429598524, "grad_norm": 2.734375, "learning_rate": 4.844754087930267e-05, "loss": 0.0017, "step": 15230 }, { "epoch": 0.9696506966978431, "grad_norm": 3.78125, "learning_rate": 4.84793535662022e-05, "loss": 0.0077, "step": 15240 }, { "epoch": 0.9702869504358338, "grad_norm": 0.0380859375, "learning_rate": 4.851116625310174e-05, "loss": 0.001, "step": 15250 }, { "epoch": 0.9709232041738245, "grad_norm": 0.02392578125, "learning_rate": 4.854297894000128e-05, "loss": 0.0063, "step": 15260 }, { "epoch": 0.9715594579118152, "grad_norm": 0.01318359375, "learning_rate": 4.857479162690081e-05, "loss": 0.0049, "step": 15270 }, { "epoch": 0.9721957116498059, "grad_norm": 0.00058746337890625, "learning_rate": 4.8606604313800344e-05, "loss": 0.0015, "step": 15280 }, { "epoch": 0.9728319653877967, "grad_norm": 8.5, "learning_rate": 4.8638417000699885e-05, "loss": 0.0121, "step": 15290 }, { "epoch": 0.9734682191257874, "grad_norm": 0.003997802734375, "learning_rate": 4.867022968759942e-05, "loss": 0.0037, "step": 15300 }, { "epoch": 0.9741044728637781, "grad_norm": 7.59375, "learning_rate": 4.870204237449895e-05, "loss": 0.0114, "step": 15310 }, { "epoch": 0.9747407266017688, "grad_norm": 0.011962890625, "learning_rate": 4.8733855061398486e-05, "loss": 0.0164, "step": 15320 }, { "epoch": 0.9753769803397595, "grad_norm": 0.197265625, "learning_rate": 4.876566774829802e-05, "loss": 0.0068, "step": 15330 }, { "epoch": 0.9760132340777502, "grad_norm": 0.01171875, "learning_rate": 4.879748043519756e-05, "loss": 0.0013, "step": 15340 }, { "epoch": 0.9766494878157409, "grad_norm": 0.0028839111328125, "learning_rate": 4.88292931220971e-05, "loss": 0.0021, "step": 15350 }, { "epoch": 0.9772857415537316, "grad_norm": 0.045654296875, "learning_rate": 4.8861105808996634e-05, "loss": 0.0002, "step": 15360 }, { "epoch": 0.9779219952917223, "grad_norm": 0.0439453125, "learning_rate": 4.889291849589617e-05, "loss": 0.0028, "step": 15370 }, { "epoch": 0.978558249029713, "grad_norm": 0.06103515625, "learning_rate": 4.89247311827957e-05, "loss": 0.0054, "step": 15380 }, { "epoch": 0.9791945027677038, "grad_norm": 0.2490234375, "learning_rate": 4.8956543869695235e-05, "loss": 0.0074, "step": 15390 }, { "epoch": 0.9798307565056945, "grad_norm": 0.016845703125, "learning_rate": 4.8988356556594775e-05, "loss": 0.0004, "step": 15400 }, { "epoch": 0.9804670102436852, "grad_norm": 0.30859375, "learning_rate": 4.902016924349431e-05, "loss": 0.0003, "step": 15410 }, { "epoch": 0.9811032639816759, "grad_norm": 0.002593994140625, "learning_rate": 4.905198193039384e-05, "loss": 0.0023, "step": 15420 }, { "epoch": 0.9817395177196666, "grad_norm": 2.265625, "learning_rate": 4.9083794617293376e-05, "loss": 0.0033, "step": 15430 }, { "epoch": 0.9823757714576573, "grad_norm": 0.014404296875, "learning_rate": 4.911560730419291e-05, "loss": 0.0006, "step": 15440 }, { "epoch": 0.983012025195648, "grad_norm": 0.01470947265625, "learning_rate": 4.914741999109245e-05, "loss": 0.0013, "step": 15450 }, { "epoch": 0.9836482789336387, "grad_norm": 0.07421875, "learning_rate": 4.917923267799199e-05, "loss": 0.0009, "step": 15460 }, { "epoch": 0.9842845326716294, "grad_norm": 0.0400390625, "learning_rate": 4.9211045364891524e-05, "loss": 0.0003, "step": 15470 }, { "epoch": 0.9849207864096201, "grad_norm": 0.0086669921875, "learning_rate": 4.924285805179106e-05, "loss": 0.0011, "step": 15480 }, { "epoch": 0.9855570401476109, "grad_norm": 0.10302734375, "learning_rate": 4.927467073869059e-05, "loss": 0.016, "step": 15490 }, { "epoch": 0.9861932938856016, "grad_norm": 1.875, "learning_rate": 4.9306483425590125e-05, "loss": 0.0009, "step": 15500 }, { "epoch": 0.9868295476235923, "grad_norm": 0.058837890625, "learning_rate": 4.9338296112489666e-05, "loss": 0.0024, "step": 15510 }, { "epoch": 0.987465801361583, "grad_norm": 0.0147705078125, "learning_rate": 4.93701087993892e-05, "loss": 0.0033, "step": 15520 }, { "epoch": 0.9881020550995737, "grad_norm": 0.0196533203125, "learning_rate": 4.940192148628873e-05, "loss": 0.0036, "step": 15530 }, { "epoch": 0.9887383088375644, "grad_norm": 0.04052734375, "learning_rate": 4.943373417318827e-05, "loss": 0.0012, "step": 15540 }, { "epoch": 0.9893745625755551, "grad_norm": 0.019287109375, "learning_rate": 4.946554686008781e-05, "loss": 0.0003, "step": 15550 }, { "epoch": 0.9900108163135458, "grad_norm": 0.034912109375, "learning_rate": 4.949735954698734e-05, "loss": 0.0049, "step": 15560 }, { "epoch": 0.9906470700515365, "grad_norm": 0.05078125, "learning_rate": 4.952917223388688e-05, "loss": 0.0041, "step": 15570 }, { "epoch": 0.9912833237895272, "grad_norm": 0.0228271484375, "learning_rate": 4.9560984920786415e-05, "loss": 0.0004, "step": 15580 }, { "epoch": 0.991919577527518, "grad_norm": 0.0174560546875, "learning_rate": 4.959279760768595e-05, "loss": 0.0087, "step": 15590 }, { "epoch": 0.9925558312655087, "grad_norm": 0.1845703125, "learning_rate": 4.962461029458548e-05, "loss": 0.0014, "step": 15600 }, { "epoch": 0.9931920850034994, "grad_norm": 0.026611328125, "learning_rate": 4.9656422981485016e-05, "loss": 0.0029, "step": 15610 }, { "epoch": 0.9938283387414901, "grad_norm": 0.0223388671875, "learning_rate": 4.9688235668384556e-05, "loss": 0.0006, "step": 15620 }, { "epoch": 0.9944645924794808, "grad_norm": 0.0040283203125, "learning_rate": 4.972004835528409e-05, "loss": 0.0078, "step": 15630 }, { "epoch": 0.9951008462174715, "grad_norm": 2.109375, "learning_rate": 4.9751861042183624e-05, "loss": 0.0047, "step": 15640 }, { "epoch": 0.9957370999554622, "grad_norm": 0.0037994384765625, "learning_rate": 4.9783673729083164e-05, "loss": 0.001, "step": 15650 }, { "epoch": 0.9963733536934529, "grad_norm": 0.017578125, "learning_rate": 4.98154864159827e-05, "loss": 0.0002, "step": 15660 }, { "epoch": 0.9970096074314436, "grad_norm": 0.16796875, "learning_rate": 4.984729910288223e-05, "loss": 0.0013, "step": 15670 }, { "epoch": 0.9976458611694343, "grad_norm": 0.388671875, "learning_rate": 4.9879111789781765e-05, "loss": 0.0007, "step": 15680 }, { "epoch": 0.9982821149074251, "grad_norm": 0.453125, "learning_rate": 4.9910924476681305e-05, "loss": 0.0033, "step": 15690 }, { "epoch": 0.9989183686454158, "grad_norm": 0.126953125, "learning_rate": 4.994273716358084e-05, "loss": 0.0005, "step": 15700 }, { "epoch": 0.9995546223834065, "grad_norm": 0.013916015625, "learning_rate": 4.997454985048037e-05, "loss": 0.0005, "step": 15710 }, { "epoch": 1.0001908761213971, "grad_norm": 0.0546875, "learning_rate": 4.999929305140224e-05, "loss": 0.004, "step": 15720 }, { "epoch": 1.0008271298593878, "grad_norm": 0.007598876953125, "learning_rate": 4.9995758308413395e-05, "loss": 0.0004, "step": 15730 }, { "epoch": 1.0014633835973787, "grad_norm": 0.031494140625, "learning_rate": 4.999222356542456e-05, "loss": 0.0004, "step": 15740 }, { "epoch": 1.0020996373353694, "grad_norm": 0.007537841796875, "learning_rate": 4.9988688822435726e-05, "loss": 0.0003, "step": 15750 }, { "epoch": 1.0027358910733601, "grad_norm": 0.07470703125, "learning_rate": 4.9985154079446884e-05, "loss": 0.0013, "step": 15760 }, { "epoch": 1.0033721448113508, "grad_norm": 0.006683349609375, "learning_rate": 4.998161933645805e-05, "loss": 0.0004, "step": 15770 }, { "epoch": 1.0040083985493415, "grad_norm": 0.02490234375, "learning_rate": 4.997808459346921e-05, "loss": 0.0001, "step": 15780 }, { "epoch": 1.0046446522873322, "grad_norm": 0.08984375, "learning_rate": 4.997454985048037e-05, "loss": 0.0031, "step": 15790 }, { "epoch": 1.005280906025323, "grad_norm": 0.0517578125, "learning_rate": 4.997101510749154e-05, "loss": 0.0004, "step": 15800 }, { "epoch": 1.0059171597633136, "grad_norm": 0.035400390625, "learning_rate": 4.99674803645027e-05, "loss": 0.0002, "step": 15810 }, { "epoch": 1.0065534135013043, "grad_norm": 0.036376953125, "learning_rate": 4.996394562151386e-05, "loss": 0.0003, "step": 15820 }, { "epoch": 1.007189667239295, "grad_norm": 0.021240234375, "learning_rate": 4.996041087852503e-05, "loss": 0.0028, "step": 15830 }, { "epoch": 1.0078259209772857, "grad_norm": 0.004425048828125, "learning_rate": 4.995687613553619e-05, "loss": 0.0004, "step": 15840 }, { "epoch": 1.0084621747152764, "grad_norm": 0.01190185546875, "learning_rate": 4.995334139254735e-05, "loss": 0.0007, "step": 15850 }, { "epoch": 1.0090984284532671, "grad_norm": 0.004608154296875, "learning_rate": 4.994980664955851e-05, "loss": 0.0006, "step": 15860 }, { "epoch": 1.0097346821912578, "grad_norm": 0.059814453125, "learning_rate": 4.9946271906569674e-05, "loss": 0.0003, "step": 15870 }, { "epoch": 1.0103709359292485, "grad_norm": 0.0017547607421875, "learning_rate": 4.994273716358084e-05, "loss": 0.0062, "step": 15880 }, { "epoch": 1.0110071896672392, "grad_norm": 0.068359375, "learning_rate": 4.9939202420592004e-05, "loss": 0.0007, "step": 15890 }, { "epoch": 1.01164344340523, "grad_norm": 0.0098876953125, "learning_rate": 4.993566767760316e-05, "loss": 0.0007, "step": 15900 }, { "epoch": 1.0122796971432206, "grad_norm": 0.8203125, "learning_rate": 4.993213293461433e-05, "loss": 0.0005, "step": 15910 }, { "epoch": 1.0129159508812113, "grad_norm": 0.000453948974609375, "learning_rate": 4.992859819162549e-05, "loss": 0.0012, "step": 15920 }, { "epoch": 1.013552204619202, "grad_norm": 0.015869140625, "learning_rate": 4.992506344863665e-05, "loss": 0.0001, "step": 15930 }, { "epoch": 1.014188458357193, "grad_norm": 0.007598876953125, "learning_rate": 4.992152870564781e-05, "loss": 0.0025, "step": 15940 }, { "epoch": 1.0148247120951837, "grad_norm": 0.00567626953125, "learning_rate": 4.9917993962658975e-05, "loss": 0.0001, "step": 15950 }, { "epoch": 1.0154609658331744, "grad_norm": 0.00555419921875, "learning_rate": 4.991445921967014e-05, "loss": 0.0012, "step": 15960 }, { "epoch": 1.016097219571165, "grad_norm": 0.1455078125, "learning_rate": 4.9910924476681305e-05, "loss": 0.002, "step": 15970 }, { "epoch": 1.0167334733091558, "grad_norm": 8.5625, "learning_rate": 4.9907389733692464e-05, "loss": 0.0083, "step": 15980 }, { "epoch": 1.0173697270471465, "grad_norm": 0.005096435546875, "learning_rate": 4.990385499070363e-05, "loss": 0.0007, "step": 15990 }, { "epoch": 1.0180059807851372, "grad_norm": 0.044921875, "learning_rate": 4.9900320247714794e-05, "loss": 0.0002, "step": 16000 }, { "epoch": 1.0186422345231279, "grad_norm": 0.00909423828125, "learning_rate": 4.989678550472595e-05, "loss": 0.0019, "step": 16010 }, { "epoch": 1.0192784882611186, "grad_norm": 0.318359375, "learning_rate": 4.989325076173712e-05, "loss": 0.0019, "step": 16020 }, { "epoch": 1.0199147419991093, "grad_norm": 0.009765625, "learning_rate": 4.9889716018748276e-05, "loss": 0.0022, "step": 16030 }, { "epoch": 1.0205509957371, "grad_norm": 0.0203857421875, "learning_rate": 4.988618127575944e-05, "loss": 0.0018, "step": 16040 }, { "epoch": 1.0211872494750907, "grad_norm": 0.017578125, "learning_rate": 4.9882646532770607e-05, "loss": 0.0008, "step": 16050 }, { "epoch": 1.0218235032130814, "grad_norm": 0.0233154296875, "learning_rate": 4.9879111789781765e-05, "loss": 0.0003, "step": 16060 }, { "epoch": 1.022459756951072, "grad_norm": 0.007720947265625, "learning_rate": 4.987557704679293e-05, "loss": 0.0004, "step": 16070 }, { "epoch": 1.0230960106890628, "grad_norm": 0.037353515625, "learning_rate": 4.9872042303804095e-05, "loss": 0.0002, "step": 16080 }, { "epoch": 1.0237322644270535, "grad_norm": 0.083984375, "learning_rate": 4.9868507560815254e-05, "loss": 0.0005, "step": 16090 }, { "epoch": 1.0243685181650442, "grad_norm": 0.05859375, "learning_rate": 4.986497281782642e-05, "loss": 0.0039, "step": 16100 }, { "epoch": 1.0250047719030349, "grad_norm": 0.0057373046875, "learning_rate": 4.986143807483758e-05, "loss": 0.0007, "step": 16110 }, { "epoch": 1.0256410256410255, "grad_norm": 0.314453125, "learning_rate": 4.985790333184874e-05, "loss": 0.0133, "step": 16120 }, { "epoch": 1.0262772793790162, "grad_norm": 0.027587890625, "learning_rate": 4.985436858885991e-05, "loss": 0.0005, "step": 16130 }, { "epoch": 1.0269135331170072, "grad_norm": 0.7109375, "learning_rate": 4.985083384587107e-05, "loss": 0.0146, "step": 16140 }, { "epoch": 1.0275497868549979, "grad_norm": 0.083984375, "learning_rate": 4.984729910288223e-05, "loss": 0.0004, "step": 16150 }, { "epoch": 1.0281860405929886, "grad_norm": 0.0025787353515625, "learning_rate": 4.9843764359893397e-05, "loss": 0.0004, "step": 16160 }, { "epoch": 1.0288222943309793, "grad_norm": 0.07568359375, "learning_rate": 4.9840229616904555e-05, "loss": 0.0004, "step": 16170 }, { "epoch": 1.02945854806897, "grad_norm": 0.045654296875, "learning_rate": 4.983669487391572e-05, "loss": 0.0005, "step": 16180 }, { "epoch": 1.0300948018069607, "grad_norm": 0.017333984375, "learning_rate": 4.983316013092688e-05, "loss": 0.0005, "step": 16190 }, { "epoch": 1.0307310555449514, "grad_norm": 0.0084228515625, "learning_rate": 4.9829625387938044e-05, "loss": 0.0001, "step": 16200 }, { "epoch": 1.031367309282942, "grad_norm": 0.01495361328125, "learning_rate": 4.982609064494921e-05, "loss": 0.0004, "step": 16210 }, { "epoch": 1.0320035630209328, "grad_norm": 5.03125, "learning_rate": 4.9822555901960374e-05, "loss": 0.0034, "step": 16220 }, { "epoch": 1.0326398167589235, "grad_norm": 0.037109375, "learning_rate": 4.981902115897153e-05, "loss": 0.0009, "step": 16230 }, { "epoch": 1.0332760704969142, "grad_norm": 0.08544921875, "learning_rate": 4.98154864159827e-05, "loss": 0.0002, "step": 16240 }, { "epoch": 1.0339123242349049, "grad_norm": 0.05322265625, "learning_rate": 4.9811951672993856e-05, "loss": 0.0029, "step": 16250 }, { "epoch": 1.0345485779728956, "grad_norm": 0.1484375, "learning_rate": 4.980841693000502e-05, "loss": 0.0007, "step": 16260 }, { "epoch": 1.0351848317108863, "grad_norm": 0.005218505859375, "learning_rate": 4.980488218701618e-05, "loss": 0.0029, "step": 16270 }, { "epoch": 1.035821085448877, "grad_norm": 0.00897216796875, "learning_rate": 4.9801347444027345e-05, "loss": 0.0048, "step": 16280 }, { "epoch": 1.0364573391868677, "grad_norm": 4.96875, "learning_rate": 4.979781270103851e-05, "loss": 0.0056, "step": 16290 }, { "epoch": 1.0370935929248584, "grad_norm": 0.64453125, "learning_rate": 4.9794277958049675e-05, "loss": 0.0004, "step": 16300 }, { "epoch": 1.037729846662849, "grad_norm": 0.04150390625, "learning_rate": 4.9790743215060834e-05, "loss": 0.0039, "step": 16310 }, { "epoch": 1.0383661004008398, "grad_norm": 0.15234375, "learning_rate": 4.9787208472072e-05, "loss": 0.0005, "step": 16320 }, { "epoch": 1.0390023541388305, "grad_norm": 0.00653076171875, "learning_rate": 4.9783673729083164e-05, "loss": 0.0005, "step": 16330 }, { "epoch": 1.0396386078768214, "grad_norm": 2.09375, "learning_rate": 4.978013898609432e-05, "loss": 0.0009, "step": 16340 }, { "epoch": 1.040274861614812, "grad_norm": 0.07373046875, "learning_rate": 4.977660424310549e-05, "loss": 0.0005, "step": 16350 }, { "epoch": 1.0409111153528028, "grad_norm": 0.083984375, "learning_rate": 4.9773069500116646e-05, "loss": 0.0002, "step": 16360 }, { "epoch": 1.0415473690907935, "grad_norm": 0.00848388671875, "learning_rate": 4.976953475712781e-05, "loss": 0.0008, "step": 16370 }, { "epoch": 1.0421836228287842, "grad_norm": 0.01092529296875, "learning_rate": 4.9766000014138976e-05, "loss": 0.0002, "step": 16380 }, { "epoch": 1.0428198765667749, "grad_norm": 3.25, "learning_rate": 4.976246527115014e-05, "loss": 0.0034, "step": 16390 }, { "epoch": 1.0434561303047656, "grad_norm": 0.024169921875, "learning_rate": 4.97589305281613e-05, "loss": 0.0007, "step": 16400 }, { "epoch": 1.0440923840427563, "grad_norm": 0.058837890625, "learning_rate": 4.9755395785172465e-05, "loss": 0.0038, "step": 16410 }, { "epoch": 1.044728637780747, "grad_norm": 0.11669921875, "learning_rate": 4.9751861042183624e-05, "loss": 0.0014, "step": 16420 }, { "epoch": 1.0453648915187377, "grad_norm": 0.138671875, "learning_rate": 4.974832629919479e-05, "loss": 0.0007, "step": 16430 }, { "epoch": 1.0460011452567284, "grad_norm": 0.0172119140625, "learning_rate": 4.974479155620595e-05, "loss": 0.0025, "step": 16440 }, { "epoch": 1.046637398994719, "grad_norm": 0.0223388671875, "learning_rate": 4.974125681321711e-05, "loss": 0.0064, "step": 16450 }, { "epoch": 1.0472736527327098, "grad_norm": 0.0030059814453125, "learning_rate": 4.973772207022828e-05, "loss": 0.0003, "step": 16460 }, { "epoch": 1.0479099064707005, "grad_norm": 0.035888671875, "learning_rate": 4.973418732723944e-05, "loss": 0.0006, "step": 16470 }, { "epoch": 1.0485461602086912, "grad_norm": 0.00909423828125, "learning_rate": 4.97306525842506e-05, "loss": 0.0004, "step": 16480 }, { "epoch": 1.0491824139466819, "grad_norm": 0.006256103515625, "learning_rate": 4.9727117841261766e-05, "loss": 0.0071, "step": 16490 }, { "epoch": 1.0498186676846726, "grad_norm": 0.002593994140625, "learning_rate": 4.9723583098272925e-05, "loss": 0.0016, "step": 16500 }, { "epoch": 1.0504549214226633, "grad_norm": 0.01300048828125, "learning_rate": 4.972004835528409e-05, "loss": 0.0004, "step": 16510 }, { "epoch": 1.051091175160654, "grad_norm": 0.275390625, "learning_rate": 4.971651361229525e-05, "loss": 0.0035, "step": 16520 }, { "epoch": 1.0517274288986447, "grad_norm": 0.0194091796875, "learning_rate": 4.9712978869306414e-05, "loss": 0.0071, "step": 16530 }, { "epoch": 1.0523636826366354, "grad_norm": 0.0908203125, "learning_rate": 4.970944412631758e-05, "loss": 0.0004, "step": 16540 }, { "epoch": 1.0529999363746263, "grad_norm": 0.0264892578125, "learning_rate": 4.9705909383328744e-05, "loss": 0.0002, "step": 16550 }, { "epoch": 1.053636190112617, "grad_norm": 0.2275390625, "learning_rate": 4.97023746403399e-05, "loss": 0.0035, "step": 16560 }, { "epoch": 1.0542724438506077, "grad_norm": 0.150390625, "learning_rate": 4.969883989735107e-05, "loss": 0.0013, "step": 16570 }, { "epoch": 1.0549086975885984, "grad_norm": 0.0311279296875, "learning_rate": 4.9695305154362226e-05, "loss": 0.0009, "step": 16580 }, { "epoch": 1.055544951326589, "grad_norm": 0.00250244140625, "learning_rate": 4.969177041137339e-05, "loss": 0.0055, "step": 16590 }, { "epoch": 1.0561812050645798, "grad_norm": 0.0263671875, "learning_rate": 4.9688235668384556e-05, "loss": 0.0003, "step": 16600 }, { "epoch": 1.0568174588025705, "grad_norm": 0.0142822265625, "learning_rate": 4.9684700925395715e-05, "loss": 0.0001, "step": 16610 }, { "epoch": 1.0574537125405612, "grad_norm": 0.0084228515625, "learning_rate": 4.968116618240688e-05, "loss": 0.0003, "step": 16620 }, { "epoch": 1.058089966278552, "grad_norm": 0.021240234375, "learning_rate": 4.9677631439418045e-05, "loss": 0.0039, "step": 16630 }, { "epoch": 1.0587262200165426, "grad_norm": 0.059326171875, "learning_rate": 4.9674096696429203e-05, "loss": 0.0007, "step": 16640 }, { "epoch": 1.0593624737545333, "grad_norm": 0.005645751953125, "learning_rate": 4.967056195344037e-05, "loss": 0.0189, "step": 16650 }, { "epoch": 1.059998727492524, "grad_norm": 0.036376953125, "learning_rate": 4.966702721045153e-05, "loss": 0.0009, "step": 16660 }, { "epoch": 1.0606349812305147, "grad_norm": 0.01007080078125, "learning_rate": 4.966349246746269e-05, "loss": 0.0006, "step": 16670 }, { "epoch": 1.0612712349685054, "grad_norm": 0.10009765625, "learning_rate": 4.965995772447386e-05, "loss": 0.0006, "step": 16680 }, { "epoch": 1.061907488706496, "grad_norm": 0.036376953125, "learning_rate": 4.9656422981485016e-05, "loss": 0.0004, "step": 16690 }, { "epoch": 1.0625437424444868, "grad_norm": 0.50390625, "learning_rate": 4.965288823849618e-05, "loss": 0.0105, "step": 16700 }, { "epoch": 1.0631799961824775, "grad_norm": 0.953125, "learning_rate": 4.9649353495507346e-05, "loss": 0.0012, "step": 16710 }, { "epoch": 1.0638162499204682, "grad_norm": 0.004730224609375, "learning_rate": 4.964581875251851e-05, "loss": 0.0021, "step": 16720 }, { "epoch": 1.064452503658459, "grad_norm": 0.095703125, "learning_rate": 4.964228400952967e-05, "loss": 0.0063, "step": 16730 }, { "epoch": 1.0650887573964498, "grad_norm": 0.08203125, "learning_rate": 4.9638749266540835e-05, "loss": 0.0027, "step": 16740 }, { "epoch": 1.0657250111344405, "grad_norm": 0.006500244140625, "learning_rate": 4.9635214523551993e-05, "loss": 0.0006, "step": 16750 }, { "epoch": 1.0663612648724312, "grad_norm": 0.0201416015625, "learning_rate": 4.963167978056316e-05, "loss": 0.0003, "step": 16760 }, { "epoch": 1.066997518610422, "grad_norm": 0.03662109375, "learning_rate": 4.962814503757432e-05, "loss": 0.0009, "step": 16770 }, { "epoch": 1.0676337723484126, "grad_norm": 0.0013275146484375, "learning_rate": 4.962461029458548e-05, "loss": 0.0007, "step": 16780 }, { "epoch": 1.0682700260864033, "grad_norm": 0.040771484375, "learning_rate": 4.962107555159665e-05, "loss": 0.0007, "step": 16790 }, { "epoch": 1.068906279824394, "grad_norm": 0.015625, "learning_rate": 4.961754080860781e-05, "loss": 0.0007, "step": 16800 }, { "epoch": 1.0695425335623847, "grad_norm": 0.04150390625, "learning_rate": 4.961400606561897e-05, "loss": 0.0003, "step": 16810 }, { "epoch": 1.0701787873003754, "grad_norm": 0.1259765625, "learning_rate": 4.9610471322630136e-05, "loss": 0.0004, "step": 16820 }, { "epoch": 1.0708150410383661, "grad_norm": 0.014404296875, "learning_rate": 4.9606936579641295e-05, "loss": 0.0002, "step": 16830 }, { "epoch": 1.0714512947763568, "grad_norm": 0.0274658203125, "learning_rate": 4.960340183665246e-05, "loss": 0.0019, "step": 16840 }, { "epoch": 1.0720875485143475, "grad_norm": 0.00701904296875, "learning_rate": 4.959986709366362e-05, "loss": 0.0002, "step": 16850 }, { "epoch": 1.0727238022523382, "grad_norm": 0.0380859375, "learning_rate": 4.959633235067478e-05, "loss": 0.0072, "step": 16860 }, { "epoch": 1.073360055990329, "grad_norm": 0.365234375, "learning_rate": 4.959279760768595e-05, "loss": 0.0004, "step": 16870 }, { "epoch": 1.0739963097283196, "grad_norm": 0.2216796875, "learning_rate": 4.9589262864697114e-05, "loss": 0.0007, "step": 16880 }, { "epoch": 1.0746325634663103, "grad_norm": 0.0252685546875, "learning_rate": 4.958572812170827e-05, "loss": 0.0062, "step": 16890 }, { "epoch": 1.075268817204301, "grad_norm": 0.1796875, "learning_rate": 4.958219337871944e-05, "loss": 0.0007, "step": 16900 }, { "epoch": 1.0759050709422917, "grad_norm": 0.000843048095703125, "learning_rate": 4.9578658635730596e-05, "loss": 0.0018, "step": 16910 }, { "epoch": 1.0765413246802824, "grad_norm": 0.2216796875, "learning_rate": 4.957512389274176e-05, "loss": 0.0003, "step": 16920 }, { "epoch": 1.077177578418273, "grad_norm": 0.058349609375, "learning_rate": 4.9571589149752926e-05, "loss": 0.0002, "step": 16930 }, { "epoch": 1.0778138321562638, "grad_norm": 0.0089111328125, "learning_rate": 4.9568054406764085e-05, "loss": 0.0001, "step": 16940 }, { "epoch": 1.0784500858942547, "grad_norm": 0.0634765625, "learning_rate": 4.956451966377525e-05, "loss": 0.0006, "step": 16950 }, { "epoch": 1.0790863396322454, "grad_norm": 0.056396484375, "learning_rate": 4.9560984920786415e-05, "loss": 0.0003, "step": 16960 }, { "epoch": 1.0797225933702361, "grad_norm": 0.4609375, "learning_rate": 4.955745017779757e-05, "loss": 0.0028, "step": 16970 }, { "epoch": 1.0803588471082268, "grad_norm": 7.34375, "learning_rate": 4.955391543480874e-05, "loss": 0.0069, "step": 16980 }, { "epoch": 1.0809951008462175, "grad_norm": 0.10009765625, "learning_rate": 4.95503806918199e-05, "loss": 0.0069, "step": 16990 }, { "epoch": 1.0816313545842082, "grad_norm": 0.0027313232421875, "learning_rate": 4.954684594883106e-05, "loss": 0.0044, "step": 17000 }, { "epoch": 1.082267608322199, "grad_norm": 0.006683349609375, "learning_rate": 4.954331120584223e-05, "loss": 0.0121, "step": 17010 }, { "epoch": 1.0829038620601896, "grad_norm": 0.0030517578125, "learning_rate": 4.9539776462853386e-05, "loss": 0.0002, "step": 17020 }, { "epoch": 1.0835401157981803, "grad_norm": 0.2119140625, "learning_rate": 4.953624171986455e-05, "loss": 0.0005, "step": 17030 }, { "epoch": 1.084176369536171, "grad_norm": 0.396484375, "learning_rate": 4.9532706976875716e-05, "loss": 0.003, "step": 17040 }, { "epoch": 1.0848126232741617, "grad_norm": 0.138671875, "learning_rate": 4.952917223388688e-05, "loss": 0.0195, "step": 17050 }, { "epoch": 1.0854488770121524, "grad_norm": 0.0302734375, "learning_rate": 4.952563749089804e-05, "loss": 0.0002, "step": 17060 }, { "epoch": 1.0860851307501431, "grad_norm": 0.0224609375, "learning_rate": 4.95221027479092e-05, "loss": 0.0003, "step": 17070 }, { "epoch": 1.0867213844881338, "grad_norm": 0.0206298828125, "learning_rate": 4.951856800492036e-05, "loss": 0.0012, "step": 17080 }, { "epoch": 1.0873576382261245, "grad_norm": 0.0021209716796875, "learning_rate": 4.951503326193153e-05, "loss": 0.0003, "step": 17090 }, { "epoch": 1.0879938919641152, "grad_norm": 0.044921875, "learning_rate": 4.951149851894269e-05, "loss": 0.0056, "step": 17100 }, { "epoch": 1.088630145702106, "grad_norm": 0.07666015625, "learning_rate": 4.950796377595385e-05, "loss": 0.0003, "step": 17110 }, { "epoch": 1.0892663994400966, "grad_norm": 0.0021209716796875, "learning_rate": 4.950442903296502e-05, "loss": 0.0004, "step": 17120 }, { "epoch": 1.0899026531780873, "grad_norm": 0.031982421875, "learning_rate": 4.950089428997618e-05, "loss": 0.0002, "step": 17130 }, { "epoch": 1.0905389069160782, "grad_norm": 0.0037078857421875, "learning_rate": 4.949735954698734e-05, "loss": 0.0034, "step": 17140 }, { "epoch": 1.091175160654069, "grad_norm": 0.0068359375, "learning_rate": 4.9493824803998506e-05, "loss": 0.0003, "step": 17150 }, { "epoch": 1.0918114143920596, "grad_norm": 1.25, "learning_rate": 4.9490290061009664e-05, "loss": 0.0017, "step": 17160 }, { "epoch": 1.0924476681300503, "grad_norm": 0.1259765625, "learning_rate": 4.948675531802083e-05, "loss": 0.0012, "step": 17170 }, { "epoch": 1.093083921868041, "grad_norm": 0.01953125, "learning_rate": 4.948322057503199e-05, "loss": 0.0001, "step": 17180 }, { "epoch": 1.0937201756060317, "grad_norm": 0.00156402587890625, "learning_rate": 4.947968583204315e-05, "loss": 0.0003, "step": 17190 }, { "epoch": 1.0943564293440224, "grad_norm": 0.01123046875, "learning_rate": 4.947615108905432e-05, "loss": 0.0003, "step": 17200 }, { "epoch": 1.0949926830820131, "grad_norm": 0.00127410888671875, "learning_rate": 4.9472616346065484e-05, "loss": 0.0076, "step": 17210 }, { "epoch": 1.0956289368200038, "grad_norm": 0.00396728515625, "learning_rate": 4.946908160307664e-05, "loss": 0.0003, "step": 17220 }, { "epoch": 1.0962651905579945, "grad_norm": 0.030029296875, "learning_rate": 4.946554686008781e-05, "loss": 0.0009, "step": 17230 }, { "epoch": 1.0969014442959852, "grad_norm": 0.076171875, "learning_rate": 4.9462012117098966e-05, "loss": 0.0006, "step": 17240 }, { "epoch": 1.097537698033976, "grad_norm": 0.0079345703125, "learning_rate": 4.945847737411013e-05, "loss": 0.0006, "step": 17250 }, { "epoch": 1.0981739517719666, "grad_norm": 0.06689453125, "learning_rate": 4.9454942631121296e-05, "loss": 0.0003, "step": 17260 }, { "epoch": 1.0988102055099573, "grad_norm": 0.002960205078125, "learning_rate": 4.9451407888132454e-05, "loss": 0.0039, "step": 17270 }, { "epoch": 1.099446459247948, "grad_norm": 0.302734375, "learning_rate": 4.944787314514362e-05, "loss": 0.005, "step": 17280 }, { "epoch": 1.1000827129859387, "grad_norm": 0.02587890625, "learning_rate": 4.9444338402154785e-05, "loss": 0.0001, "step": 17290 }, { "epoch": 1.1007189667239294, "grad_norm": 0.041259765625, "learning_rate": 4.944080365916594e-05, "loss": 0.0003, "step": 17300 }, { "epoch": 1.1013552204619201, "grad_norm": 0.15234375, "learning_rate": 4.943726891617711e-05, "loss": 0.0005, "step": 17310 }, { "epoch": 1.1019914741999108, "grad_norm": 5.125, "learning_rate": 4.943373417318827e-05, "loss": 0.0082, "step": 17320 }, { "epoch": 1.1026277279379015, "grad_norm": 0.01611328125, "learning_rate": 4.943019943019943e-05, "loss": 0.0005, "step": 17330 }, { "epoch": 1.1032639816758922, "grad_norm": 0.035888671875, "learning_rate": 4.94266646872106e-05, "loss": 0.0116, "step": 17340 }, { "epoch": 1.1039002354138832, "grad_norm": 4.46875, "learning_rate": 4.9423129944221755e-05, "loss": 0.004, "step": 17350 }, { "epoch": 1.1045364891518739, "grad_norm": 0.1513671875, "learning_rate": 4.941959520123292e-05, "loss": 0.0033, "step": 17360 }, { "epoch": 1.1051727428898646, "grad_norm": 0.01409912109375, "learning_rate": 4.9416060458244086e-05, "loss": 0.0008, "step": 17370 }, { "epoch": 1.1058089966278553, "grad_norm": 0.00592041015625, "learning_rate": 4.941252571525525e-05, "loss": 0.0049, "step": 17380 }, { "epoch": 1.106445250365846, "grad_norm": 0.26171875, "learning_rate": 4.940899097226641e-05, "loss": 0.0002, "step": 17390 }, { "epoch": 1.1070815041038367, "grad_norm": 0.18359375, "learning_rate": 4.940545622927757e-05, "loss": 0.0081, "step": 17400 }, { "epoch": 1.1077177578418274, "grad_norm": 0.003814697265625, "learning_rate": 4.940192148628873e-05, "loss": 0.0002, "step": 17410 }, { "epoch": 1.108354011579818, "grad_norm": 0.00439453125, "learning_rate": 4.93983867432999e-05, "loss": 0.0002, "step": 17420 }, { "epoch": 1.1089902653178088, "grad_norm": 0.064453125, "learning_rate": 4.939485200031106e-05, "loss": 0.0086, "step": 17430 }, { "epoch": 1.1096265190557995, "grad_norm": 0.052734375, "learning_rate": 4.939131725732222e-05, "loss": 0.0009, "step": 17440 }, { "epoch": 1.1102627727937902, "grad_norm": 0.01068115234375, "learning_rate": 4.938778251433339e-05, "loss": 0.0007, "step": 17450 }, { "epoch": 1.1108990265317809, "grad_norm": 0.042236328125, "learning_rate": 4.938424777134455e-05, "loss": 0.0024, "step": 17460 }, { "epoch": 1.1115352802697716, "grad_norm": 0.03564453125, "learning_rate": 4.938071302835571e-05, "loss": 0.0078, "step": 17470 }, { "epoch": 1.1121715340077623, "grad_norm": 0.0072021484375, "learning_rate": 4.937717828536687e-05, "loss": 0.0054, "step": 17480 }, { "epoch": 1.112807787745753, "grad_norm": 0.55859375, "learning_rate": 4.9373643542378034e-05, "loss": 0.0026, "step": 17490 }, { "epoch": 1.1134440414837437, "grad_norm": 0.09619140625, "learning_rate": 4.93701087993892e-05, "loss": 0.0019, "step": 17500 }, { "epoch": 1.1140802952217344, "grad_norm": 0.00860595703125, "learning_rate": 4.936657405640036e-05, "loss": 0.0002, "step": 17510 }, { "epoch": 1.114716548959725, "grad_norm": 0.0137939453125, "learning_rate": 4.936303931341152e-05, "loss": 0.0007, "step": 17520 }, { "epoch": 1.1153528026977158, "grad_norm": 0.023193359375, "learning_rate": 4.935950457042269e-05, "loss": 0.0004, "step": 17530 }, { "epoch": 1.1159890564357067, "grad_norm": 0.039794921875, "learning_rate": 4.935596982743385e-05, "loss": 0.0001, "step": 17540 }, { "epoch": 1.1166253101736974, "grad_norm": 0.259765625, "learning_rate": 4.935243508444501e-05, "loss": 0.0005, "step": 17550 }, { "epoch": 1.117261563911688, "grad_norm": 0.322265625, "learning_rate": 4.934890034145617e-05, "loss": 0.0004, "step": 17560 }, { "epoch": 1.1178978176496788, "grad_norm": 0.0081787109375, "learning_rate": 4.9345365598467335e-05, "loss": 0.0009, "step": 17570 }, { "epoch": 1.1185340713876695, "grad_norm": 0.205078125, "learning_rate": 4.93418308554785e-05, "loss": 0.0003, "step": 17580 }, { "epoch": 1.1191703251256602, "grad_norm": 0.010986328125, "learning_rate": 4.9338296112489666e-05, "loss": 0.0027, "step": 17590 }, { "epoch": 1.1198065788636509, "grad_norm": 0.0179443359375, "learning_rate": 4.9334761369500824e-05, "loss": 0.0003, "step": 17600 }, { "epoch": 1.1204428326016416, "grad_norm": 0.005889892578125, "learning_rate": 4.933122662651199e-05, "loss": 0.0061, "step": 17610 }, { "epoch": 1.1210790863396323, "grad_norm": 0.01519775390625, "learning_rate": 4.9327691883523154e-05, "loss": 0.0019, "step": 17620 }, { "epoch": 1.121715340077623, "grad_norm": 0.482421875, "learning_rate": 4.932415714053431e-05, "loss": 0.0013, "step": 17630 }, { "epoch": 1.1223515938156137, "grad_norm": 0.00830078125, "learning_rate": 4.932062239754548e-05, "loss": 0.0007, "step": 17640 }, { "epoch": 1.1229878475536044, "grad_norm": 0.035888671875, "learning_rate": 4.9317087654556636e-05, "loss": 0.0005, "step": 17650 }, { "epoch": 1.123624101291595, "grad_norm": 0.3125, "learning_rate": 4.93135529115678e-05, "loss": 0.0004, "step": 17660 }, { "epoch": 1.1242603550295858, "grad_norm": 0.005218505859375, "learning_rate": 4.931001816857897e-05, "loss": 0.0007, "step": 17670 }, { "epoch": 1.1248966087675765, "grad_norm": 0.0849609375, "learning_rate": 4.9306483425590125e-05, "loss": 0.0011, "step": 17680 }, { "epoch": 1.1255328625055672, "grad_norm": 1.1171875, "learning_rate": 4.930294868260129e-05, "loss": 0.001, "step": 17690 }, { "epoch": 1.1261691162435579, "grad_norm": 0.037841796875, "learning_rate": 4.9299413939612456e-05, "loss": 0.0039, "step": 17700 }, { "epoch": 1.1268053699815486, "grad_norm": 0.044189453125, "learning_rate": 4.929587919662362e-05, "loss": 0.0013, "step": 17710 }, { "epoch": 1.1274416237195393, "grad_norm": 0.0012359619140625, "learning_rate": 4.929234445363478e-05, "loss": 0.001, "step": 17720 }, { "epoch": 1.12807787745753, "grad_norm": 0.2236328125, "learning_rate": 4.928880971064594e-05, "loss": 0.0006, "step": 17730 }, { "epoch": 1.1287141311955207, "grad_norm": 0.0478515625, "learning_rate": 4.92852749676571e-05, "loss": 0.0002, "step": 17740 }, { "epoch": 1.1293503849335114, "grad_norm": 0.1025390625, "learning_rate": 4.928174022466827e-05, "loss": 0.0004, "step": 17750 }, { "epoch": 1.1299866386715023, "grad_norm": 0.66796875, "learning_rate": 4.9278205481679426e-05, "loss": 0.0008, "step": 17760 }, { "epoch": 1.130622892409493, "grad_norm": 0.004486083984375, "learning_rate": 4.927467073869059e-05, "loss": 0.0001, "step": 17770 }, { "epoch": 1.1312591461474837, "grad_norm": 0.030517578125, "learning_rate": 4.927113599570176e-05, "loss": 0.0002, "step": 17780 }, { "epoch": 1.1318953998854744, "grad_norm": 1.296875, "learning_rate": 4.926760125271292e-05, "loss": 0.0007, "step": 17790 }, { "epoch": 1.132531653623465, "grad_norm": 0.00543212890625, "learning_rate": 4.926406650972408e-05, "loss": 0.0013, "step": 17800 }, { "epoch": 1.1331679073614558, "grad_norm": 0.0091552734375, "learning_rate": 4.926053176673524e-05, "loss": 0.0001, "step": 17810 }, { "epoch": 1.1338041610994465, "grad_norm": 0.024658203125, "learning_rate": 4.9256997023746404e-05, "loss": 0.0108, "step": 17820 }, { "epoch": 1.1344404148374372, "grad_norm": 0.004180908203125, "learning_rate": 4.925346228075757e-05, "loss": 0.0087, "step": 17830 }, { "epoch": 1.1350766685754279, "grad_norm": 0.0067138671875, "learning_rate": 4.924992753776873e-05, "loss": 0.0005, "step": 17840 }, { "epoch": 1.1357129223134186, "grad_norm": 0.09033203125, "learning_rate": 4.924639279477989e-05, "loss": 0.0004, "step": 17850 }, { "epoch": 1.1363491760514093, "grad_norm": 0.01220703125, "learning_rate": 4.924285805179106e-05, "loss": 0.0018, "step": 17860 }, { "epoch": 1.1369854297894, "grad_norm": 0.025390625, "learning_rate": 4.923932330880222e-05, "loss": 0.0068, "step": 17870 }, { "epoch": 1.1376216835273907, "grad_norm": 0.031982421875, "learning_rate": 4.923578856581338e-05, "loss": 0.0013, "step": 17880 }, { "epoch": 1.1382579372653814, "grad_norm": 0.06884765625, "learning_rate": 4.923225382282454e-05, "loss": 0.0045, "step": 17890 }, { "epoch": 1.138894191003372, "grad_norm": 0.0152587890625, "learning_rate": 4.9228719079835705e-05, "loss": 0.0003, "step": 17900 }, { "epoch": 1.1395304447413628, "grad_norm": 0.060791015625, "learning_rate": 4.922518433684687e-05, "loss": 0.0002, "step": 17910 }, { "epoch": 1.1401666984793535, "grad_norm": 0.0146484375, "learning_rate": 4.9221649593858036e-05, "loss": 0.001, "step": 17920 }, { "epoch": 1.1408029522173444, "grad_norm": 0.97265625, "learning_rate": 4.9218114850869194e-05, "loss": 0.002, "step": 17930 }, { "epoch": 1.141439205955335, "grad_norm": 0.00113677978515625, "learning_rate": 4.921458010788036e-05, "loss": 0.0002, "step": 17940 }, { "epoch": 1.1420754596933258, "grad_norm": 0.0028076171875, "learning_rate": 4.9211045364891524e-05, "loss": 0.0001, "step": 17950 }, { "epoch": 1.1427117134313165, "grad_norm": 0.035400390625, "learning_rate": 4.920751062190268e-05, "loss": 0.0007, "step": 17960 }, { "epoch": 1.1433479671693072, "grad_norm": 0.01226806640625, "learning_rate": 4.920397587891384e-05, "loss": 0.0003, "step": 17970 }, { "epoch": 1.143984220907298, "grad_norm": 0.00933837890625, "learning_rate": 4.9200441135925006e-05, "loss": 0.0003, "step": 17980 }, { "epoch": 1.1446204746452886, "grad_norm": 0.01202392578125, "learning_rate": 4.919690639293617e-05, "loss": 0.0061, "step": 17990 }, { "epoch": 1.1452567283832793, "grad_norm": 0.0634765625, "learning_rate": 4.919337164994734e-05, "loss": 0.0007, "step": 18000 }, { "epoch": 1.14589298212127, "grad_norm": 0.05419921875, "learning_rate": 4.9189836906958495e-05, "loss": 0.0041, "step": 18010 }, { "epoch": 1.1465292358592607, "grad_norm": 0.0947265625, "learning_rate": 4.918630216396966e-05, "loss": 0.0015, "step": 18020 }, { "epoch": 1.1471654895972514, "grad_norm": 0.044677734375, "learning_rate": 4.9182767420980825e-05, "loss": 0.0087, "step": 18030 }, { "epoch": 1.147801743335242, "grad_norm": 0.037841796875, "learning_rate": 4.917923267799199e-05, "loss": 0.0029, "step": 18040 }, { "epoch": 1.1484379970732328, "grad_norm": 0.028564453125, "learning_rate": 4.917569793500315e-05, "loss": 0.0015, "step": 18050 }, { "epoch": 1.1490742508112235, "grad_norm": 0.0771484375, "learning_rate": 4.917216319201431e-05, "loss": 0.003, "step": 18060 }, { "epoch": 1.1497105045492142, "grad_norm": 0.07666015625, "learning_rate": 4.916862844902547e-05, "loss": 0.0004, "step": 18070 }, { "epoch": 1.150346758287205, "grad_norm": 5.875, "learning_rate": 4.916509370603664e-05, "loss": 0.0082, "step": 18080 }, { "epoch": 1.1509830120251956, "grad_norm": 1.2734375, "learning_rate": 4.9161558963047796e-05, "loss": 0.0026, "step": 18090 }, { "epoch": 1.1516192657631863, "grad_norm": 0.1474609375, "learning_rate": 4.915802422005896e-05, "loss": 0.0005, "step": 18100 }, { "epoch": 1.152255519501177, "grad_norm": 0.0223388671875, "learning_rate": 4.915448947707013e-05, "loss": 0.0007, "step": 18110 }, { "epoch": 1.1528917732391677, "grad_norm": 0.00274658203125, "learning_rate": 4.915095473408129e-05, "loss": 0.0002, "step": 18120 }, { "epoch": 1.1535280269771584, "grad_norm": 0.0242919921875, "learning_rate": 4.914741999109245e-05, "loss": 0.0001, "step": 18130 }, { "epoch": 1.154164280715149, "grad_norm": 0.00087738037109375, "learning_rate": 4.914388524810361e-05, "loss": 0.0029, "step": 18140 }, { "epoch": 1.1548005344531398, "grad_norm": 0.010009765625, "learning_rate": 4.9140350505114774e-05, "loss": 0.0032, "step": 18150 }, { "epoch": 1.1554367881911307, "grad_norm": 0.019775390625, "learning_rate": 4.913681576212594e-05, "loss": 0.0025, "step": 18160 }, { "epoch": 1.1560730419291214, "grad_norm": 0.028564453125, "learning_rate": 4.91332810191371e-05, "loss": 0.003, "step": 18170 }, { "epoch": 1.1567092956671121, "grad_norm": 0.47265625, "learning_rate": 4.912974627614826e-05, "loss": 0.0006, "step": 18180 }, { "epoch": 1.1573455494051028, "grad_norm": 0.4140625, "learning_rate": 4.912621153315943e-05, "loss": 0.002, "step": 18190 }, { "epoch": 1.1579818031430935, "grad_norm": 0.216796875, "learning_rate": 4.912267679017059e-05, "loss": 0.0017, "step": 18200 }, { "epoch": 1.1586180568810842, "grad_norm": 0.076171875, "learning_rate": 4.911914204718175e-05, "loss": 0.0014, "step": 18210 }, { "epoch": 1.159254310619075, "grad_norm": 0.0289306640625, "learning_rate": 4.911560730419291e-05, "loss": 0.0005, "step": 18220 }, { "epoch": 1.1598905643570656, "grad_norm": 0.0025787353515625, "learning_rate": 4.9112072561204075e-05, "loss": 0.0003, "step": 18230 }, { "epoch": 1.1605268180950563, "grad_norm": 0.0037689208984375, "learning_rate": 4.910853781821524e-05, "loss": 0.0006, "step": 18240 }, { "epoch": 1.161163071833047, "grad_norm": 0.275390625, "learning_rate": 4.9105003075226405e-05, "loss": 0.0005, "step": 18250 }, { "epoch": 1.1617993255710377, "grad_norm": 0.0625, "learning_rate": 4.9101468332237564e-05, "loss": 0.0009, "step": 18260 }, { "epoch": 1.1624355793090284, "grad_norm": 0.010986328125, "learning_rate": 4.909793358924873e-05, "loss": 0.0004, "step": 18270 }, { "epoch": 1.163071833047019, "grad_norm": 0.00518798828125, "learning_rate": 4.9094398846259894e-05, "loss": 0.0168, "step": 18280 }, { "epoch": 1.1637080867850098, "grad_norm": 0.06982421875, "learning_rate": 4.909086410327105e-05, "loss": 0.0054, "step": 18290 }, { "epoch": 1.1643443405230005, "grad_norm": 0.017822265625, "learning_rate": 4.908732936028221e-05, "loss": 0.0003, "step": 18300 }, { "epoch": 1.1649805942609912, "grad_norm": 0.006378173828125, "learning_rate": 4.9083794617293376e-05, "loss": 0.0011, "step": 18310 }, { "epoch": 1.165616847998982, "grad_norm": 3.375, "learning_rate": 4.908025987430454e-05, "loss": 0.0048, "step": 18320 }, { "epoch": 1.1662531017369726, "grad_norm": 0.0113525390625, "learning_rate": 4.9076725131315706e-05, "loss": 0.0122, "step": 18330 }, { "epoch": 1.1668893554749635, "grad_norm": 0.482421875, "learning_rate": 4.9073190388326865e-05, "loss": 0.0012, "step": 18340 }, { "epoch": 1.1675256092129542, "grad_norm": 0.003570556640625, "learning_rate": 4.906965564533803e-05, "loss": 0.0008, "step": 18350 }, { "epoch": 1.168161862950945, "grad_norm": 0.703125, "learning_rate": 4.9066120902349195e-05, "loss": 0.003, "step": 18360 }, { "epoch": 1.1687981166889356, "grad_norm": 0.396484375, "learning_rate": 4.906258615936036e-05, "loss": 0.0004, "step": 18370 }, { "epoch": 1.1694343704269263, "grad_norm": 0.2431640625, "learning_rate": 4.905905141637151e-05, "loss": 0.0077, "step": 18380 }, { "epoch": 1.170070624164917, "grad_norm": 0.0130615234375, "learning_rate": 4.905551667338268e-05, "loss": 0.0001, "step": 18390 }, { "epoch": 1.1707068779029077, "grad_norm": 0.00148773193359375, "learning_rate": 4.905198193039384e-05, "loss": 0.0004, "step": 18400 }, { "epoch": 1.1713431316408984, "grad_norm": 0.0019989013671875, "learning_rate": 4.904844718740501e-05, "loss": 0.0008, "step": 18410 }, { "epoch": 1.1719793853788891, "grad_norm": 0.000946044921875, "learning_rate": 4.9044912444416166e-05, "loss": 0.0009, "step": 18420 }, { "epoch": 1.1726156391168798, "grad_norm": 4.875, "learning_rate": 4.904137770142733e-05, "loss": 0.0079, "step": 18430 }, { "epoch": 1.1732518928548705, "grad_norm": 0.00390625, "learning_rate": 4.9037842958438496e-05, "loss": 0.0044, "step": 18440 }, { "epoch": 1.1738881465928612, "grad_norm": 0.00897216796875, "learning_rate": 4.903430821544966e-05, "loss": 0.0039, "step": 18450 }, { "epoch": 1.174524400330852, "grad_norm": 0.018310546875, "learning_rate": 4.903077347246082e-05, "loss": 0.0011, "step": 18460 }, { "epoch": 1.1751606540688426, "grad_norm": 0.0498046875, "learning_rate": 4.902723872947198e-05, "loss": 0.0006, "step": 18470 }, { "epoch": 1.1757969078068333, "grad_norm": 0.169921875, "learning_rate": 4.9023703986483144e-05, "loss": 0.0006, "step": 18480 }, { "epoch": 1.176433161544824, "grad_norm": 0.263671875, "learning_rate": 4.902016924349431e-05, "loss": 0.0007, "step": 18490 }, { "epoch": 1.1770694152828147, "grad_norm": 0.134765625, "learning_rate": 4.901663450050547e-05, "loss": 0.0034, "step": 18500 }, { "epoch": 1.1777056690208054, "grad_norm": 0.010009765625, "learning_rate": 4.901309975751663e-05, "loss": 0.0065, "step": 18510 }, { "epoch": 1.1783419227587961, "grad_norm": 0.056884765625, "learning_rate": 4.90095650145278e-05, "loss": 0.0003, "step": 18520 }, { "epoch": 1.1789781764967868, "grad_norm": 0.005615234375, "learning_rate": 4.900603027153896e-05, "loss": 0.0003, "step": 18530 }, { "epoch": 1.1796144302347775, "grad_norm": 0.00750732421875, "learning_rate": 4.900249552855012e-05, "loss": 0.0002, "step": 18540 }, { "epoch": 1.1802506839727682, "grad_norm": 0.006866455078125, "learning_rate": 4.899896078556128e-05, "loss": 0.0005, "step": 18550 }, { "epoch": 1.1808869377107591, "grad_norm": 0.03564453125, "learning_rate": 4.8995426042572445e-05, "loss": 0.0031, "step": 18560 }, { "epoch": 1.1815231914487498, "grad_norm": 0.006103515625, "learning_rate": 4.899189129958361e-05, "loss": 0.0004, "step": 18570 }, { "epoch": 1.1821594451867405, "grad_norm": 0.004302978515625, "learning_rate": 4.8988356556594775e-05, "loss": 0.0048, "step": 18580 }, { "epoch": 1.1827956989247312, "grad_norm": 0.0196533203125, "learning_rate": 4.8984821813605934e-05, "loss": 0.0013, "step": 18590 }, { "epoch": 1.183431952662722, "grad_norm": 8.5625, "learning_rate": 4.89812870706171e-05, "loss": 0.0052, "step": 18600 }, { "epoch": 1.1840682064007126, "grad_norm": 0.66796875, "learning_rate": 4.8977752327628264e-05, "loss": 0.0009, "step": 18610 }, { "epoch": 1.1847044601387033, "grad_norm": 2.671875, "learning_rate": 4.897421758463943e-05, "loss": 0.0041, "step": 18620 }, { "epoch": 1.185340713876694, "grad_norm": 0.28515625, "learning_rate": 4.897068284165058e-05, "loss": 0.0021, "step": 18630 }, { "epoch": 1.1859769676146847, "grad_norm": 0.044677734375, "learning_rate": 4.8967148098661746e-05, "loss": 0.0004, "step": 18640 }, { "epoch": 1.1866132213526754, "grad_norm": 0.01123046875, "learning_rate": 4.896361335567291e-05, "loss": 0.0003, "step": 18650 }, { "epoch": 1.1872494750906661, "grad_norm": 0.03173828125, "learning_rate": 4.8960078612684076e-05, "loss": 0.0004, "step": 18660 }, { "epoch": 1.1878857288286568, "grad_norm": 0.00738525390625, "learning_rate": 4.8956543869695235e-05, "loss": 0.0004, "step": 18670 }, { "epoch": 1.1885219825666475, "grad_norm": 0.0140380859375, "learning_rate": 4.89530091267064e-05, "loss": 0.0017, "step": 18680 }, { "epoch": 1.1891582363046382, "grad_norm": 0.1923828125, "learning_rate": 4.8949474383717565e-05, "loss": 0.0051, "step": 18690 }, { "epoch": 1.189794490042629, "grad_norm": 0.01422119140625, "learning_rate": 4.894593964072873e-05, "loss": 0.0004, "step": 18700 }, { "epoch": 1.1904307437806196, "grad_norm": 3.125, "learning_rate": 4.894240489773988e-05, "loss": 0.0018, "step": 18710 }, { "epoch": 1.1910669975186103, "grad_norm": 0.033203125, "learning_rate": 4.893887015475105e-05, "loss": 0.0006, "step": 18720 }, { "epoch": 1.191703251256601, "grad_norm": 0.051513671875, "learning_rate": 4.893533541176221e-05, "loss": 0.0013, "step": 18730 }, { "epoch": 1.192339504994592, "grad_norm": 0.00543212890625, "learning_rate": 4.893180066877338e-05, "loss": 0.0006, "step": 18740 }, { "epoch": 1.1929757587325827, "grad_norm": 0.0026702880859375, "learning_rate": 4.8928265925784536e-05, "loss": 0.0055, "step": 18750 }, { "epoch": 1.1936120124705734, "grad_norm": 0.0390625, "learning_rate": 4.89247311827957e-05, "loss": 0.0036, "step": 18760 }, { "epoch": 1.194248266208564, "grad_norm": 0.0035247802734375, "learning_rate": 4.8921196439806866e-05, "loss": 0.0201, "step": 18770 }, { "epoch": 1.1948845199465548, "grad_norm": 0.006134033203125, "learning_rate": 4.891766169681803e-05, "loss": 0.0135, "step": 18780 }, { "epoch": 1.1955207736845455, "grad_norm": 0.00872802734375, "learning_rate": 4.891412695382919e-05, "loss": 0.0009, "step": 18790 }, { "epoch": 1.1961570274225362, "grad_norm": 0.203125, "learning_rate": 4.891059221084035e-05, "loss": 0.0109, "step": 18800 }, { "epoch": 1.1967932811605269, "grad_norm": 0.01531982421875, "learning_rate": 4.8907057467851513e-05, "loss": 0.0014, "step": 18810 }, { "epoch": 1.1974295348985176, "grad_norm": 0.029296875, "learning_rate": 4.890352272486268e-05, "loss": 0.001, "step": 18820 }, { "epoch": 1.1980657886365083, "grad_norm": 0.01116943359375, "learning_rate": 4.8899987981873844e-05, "loss": 0.0049, "step": 18830 }, { "epoch": 1.198702042374499, "grad_norm": 0.0888671875, "learning_rate": 4.8896453238885e-05, "loss": 0.0015, "step": 18840 }, { "epoch": 1.1993382961124897, "grad_norm": 0.189453125, "learning_rate": 4.889291849589617e-05, "loss": 0.0002, "step": 18850 }, { "epoch": 1.1999745498504804, "grad_norm": 0.000789642333984375, "learning_rate": 4.888938375290733e-05, "loss": 0.0002, "step": 18860 }, { "epoch": 1.200610803588471, "grad_norm": 0.60546875, "learning_rate": 4.888584900991849e-05, "loss": 0.0052, "step": 18870 }, { "epoch": 1.2012470573264618, "grad_norm": 0.02587890625, "learning_rate": 4.888231426692965e-05, "loss": 0.0003, "step": 18880 }, { "epoch": 1.2018833110644525, "grad_norm": 0.007415771484375, "learning_rate": 4.8878779523940815e-05, "loss": 0.0079, "step": 18890 }, { "epoch": 1.2025195648024432, "grad_norm": 0.0299072265625, "learning_rate": 4.887524478095198e-05, "loss": 0.0004, "step": 18900 }, { "epoch": 1.2031558185404339, "grad_norm": 0.00628662109375, "learning_rate": 4.8871710037963145e-05, "loss": 0.0009, "step": 18910 }, { "epoch": 1.2037920722784246, "grad_norm": 0.00183868408203125, "learning_rate": 4.88681752949743e-05, "loss": 0.0155, "step": 18920 }, { "epoch": 1.2044283260164153, "grad_norm": 4.375, "learning_rate": 4.886464055198547e-05, "loss": 0.0084, "step": 18930 }, { "epoch": 1.205064579754406, "grad_norm": 0.2734375, "learning_rate": 4.8861105808996634e-05, "loss": 0.02, "step": 18940 }, { "epoch": 1.2057008334923967, "grad_norm": 0.205078125, "learning_rate": 4.88575710660078e-05, "loss": 0.0003, "step": 18950 }, { "epoch": 1.2063370872303876, "grad_norm": 0.5078125, "learning_rate": 4.885403632301895e-05, "loss": 0.0007, "step": 18960 }, { "epoch": 1.2069733409683783, "grad_norm": 0.09521484375, "learning_rate": 4.8850501580030116e-05, "loss": 0.0005, "step": 18970 }, { "epoch": 1.207609594706369, "grad_norm": 2.78125, "learning_rate": 4.884696683704128e-05, "loss": 0.0154, "step": 18980 }, { "epoch": 1.2082458484443597, "grad_norm": 0.04248046875, "learning_rate": 4.8843432094052446e-05, "loss": 0.0021, "step": 18990 }, { "epoch": 1.2088821021823504, "grad_norm": 0.30078125, "learning_rate": 4.8839897351063605e-05, "loss": 0.0016, "step": 19000 }, { "epoch": 1.209518355920341, "grad_norm": 0.00811767578125, "learning_rate": 4.883636260807477e-05, "loss": 0.0001, "step": 19010 }, { "epoch": 1.2101546096583318, "grad_norm": 0.00958251953125, "learning_rate": 4.8832827865085935e-05, "loss": 0.0003, "step": 19020 }, { "epoch": 1.2107908633963225, "grad_norm": 0.251953125, "learning_rate": 4.88292931220971e-05, "loss": 0.0006, "step": 19030 }, { "epoch": 1.2114271171343132, "grad_norm": 0.003997802734375, "learning_rate": 4.882575837910826e-05, "loss": 0.0014, "step": 19040 }, { "epoch": 1.2120633708723039, "grad_norm": 0.00848388671875, "learning_rate": 4.882222363611942e-05, "loss": 0.0009, "step": 19050 }, { "epoch": 1.2126996246102946, "grad_norm": 0.00176239013671875, "learning_rate": 4.881868889313058e-05, "loss": 0.0004, "step": 19060 }, { "epoch": 1.2133358783482853, "grad_norm": 0.0017852783203125, "learning_rate": 4.881515415014175e-05, "loss": 0.0111, "step": 19070 }, { "epoch": 1.213972132086276, "grad_norm": 0.03173828125, "learning_rate": 4.8811619407152906e-05, "loss": 0.006, "step": 19080 }, { "epoch": 1.2146083858242667, "grad_norm": 0.007720947265625, "learning_rate": 4.880808466416407e-05, "loss": 0.0009, "step": 19090 }, { "epoch": 1.2152446395622574, "grad_norm": 0.421875, "learning_rate": 4.8804549921175236e-05, "loss": 0.0006, "step": 19100 }, { "epoch": 1.215880893300248, "grad_norm": 0.04052734375, "learning_rate": 4.88010151781864e-05, "loss": 0.0005, "step": 19110 }, { "epoch": 1.2165171470382388, "grad_norm": 0.018310546875, "learning_rate": 4.879748043519756e-05, "loss": 0.0006, "step": 19120 }, { "epoch": 1.2171534007762295, "grad_norm": 0.45703125, "learning_rate": 4.879394569220872e-05, "loss": 0.0017, "step": 19130 }, { "epoch": 1.2177896545142204, "grad_norm": 0.0030059814453125, "learning_rate": 4.879041094921988e-05, "loss": 0.0077, "step": 19140 }, { "epoch": 1.218425908252211, "grad_norm": 0.004730224609375, "learning_rate": 4.878687620623105e-05, "loss": 0.0003, "step": 19150 }, { "epoch": 1.2190621619902018, "grad_norm": 0.0037841796875, "learning_rate": 4.8783341463242214e-05, "loss": 0.0008, "step": 19160 }, { "epoch": 1.2196984157281925, "grad_norm": 0.0634765625, "learning_rate": 4.877980672025337e-05, "loss": 0.0008, "step": 19170 }, { "epoch": 1.2203346694661832, "grad_norm": 0.0216064453125, "learning_rate": 4.877627197726454e-05, "loss": 0.0002, "step": 19180 }, { "epoch": 1.2209709232041739, "grad_norm": 0.0027008056640625, "learning_rate": 4.87727372342757e-05, "loss": 0.0008, "step": 19190 }, { "epoch": 1.2216071769421646, "grad_norm": 0.008056640625, "learning_rate": 4.876920249128686e-05, "loss": 0.0001, "step": 19200 }, { "epoch": 1.2222434306801553, "grad_norm": 0.044921875, "learning_rate": 4.876566774829802e-05, "loss": 0.0003, "step": 19210 }, { "epoch": 1.222879684418146, "grad_norm": 0.01361083984375, "learning_rate": 4.8762133005309184e-05, "loss": 0.0002, "step": 19220 }, { "epoch": 1.2235159381561367, "grad_norm": 0.00131988525390625, "learning_rate": 4.875859826232035e-05, "loss": 0.0003, "step": 19230 }, { "epoch": 1.2241521918941274, "grad_norm": 0.39453125, "learning_rate": 4.8755063519331515e-05, "loss": 0.0035, "step": 19240 }, { "epoch": 1.224788445632118, "grad_norm": 0.00286865234375, "learning_rate": 4.875152877634267e-05, "loss": 0.0, "step": 19250 }, { "epoch": 1.2254246993701088, "grad_norm": 0.0576171875, "learning_rate": 4.874799403335384e-05, "loss": 0.0097, "step": 19260 }, { "epoch": 1.2260609531080995, "grad_norm": 0.041748046875, "learning_rate": 4.8744459290365004e-05, "loss": 0.0002, "step": 19270 }, { "epoch": 1.2266972068460902, "grad_norm": 0.00799560546875, "learning_rate": 4.874092454737616e-05, "loss": 0.0021, "step": 19280 }, { "epoch": 1.2273334605840809, "grad_norm": 0.15625, "learning_rate": 4.873738980438732e-05, "loss": 0.0024, "step": 19290 }, { "epoch": 1.2279697143220716, "grad_norm": 0.00138092041015625, "learning_rate": 4.8733855061398486e-05, "loss": 0.0005, "step": 19300 }, { "epoch": 1.2286059680600623, "grad_norm": 0.01348876953125, "learning_rate": 4.873032031840965e-05, "loss": 0.0003, "step": 19310 }, { "epoch": 1.229242221798053, "grad_norm": 0.0089111328125, "learning_rate": 4.8726785575420816e-05, "loss": 0.0002, "step": 19320 }, { "epoch": 1.2298784755360437, "grad_norm": 0.00341796875, "learning_rate": 4.8723250832431974e-05, "loss": 0.0025, "step": 19330 }, { "epoch": 1.2305147292740344, "grad_norm": 0.703125, "learning_rate": 4.871971608944314e-05, "loss": 0.0013, "step": 19340 }, { "epoch": 1.231150983012025, "grad_norm": 0.0042724609375, "learning_rate": 4.8716181346454305e-05, "loss": 0.0002, "step": 19350 }, { "epoch": 1.231787236750016, "grad_norm": 0.002532958984375, "learning_rate": 4.871264660346546e-05, "loss": 0.0041, "step": 19360 }, { "epoch": 1.2324234904880067, "grad_norm": 0.030029296875, "learning_rate": 4.870911186047663e-05, "loss": 0.001, "step": 19370 }, { "epoch": 1.2330597442259974, "grad_norm": 0.012939453125, "learning_rate": 4.870557711748779e-05, "loss": 0.0008, "step": 19380 }, { "epoch": 1.233695997963988, "grad_norm": 1.046875, "learning_rate": 4.870204237449895e-05, "loss": 0.0021, "step": 19390 }, { "epoch": 1.2343322517019788, "grad_norm": 0.005035400390625, "learning_rate": 4.869850763151012e-05, "loss": 0.0112, "step": 19400 }, { "epoch": 1.2349685054399695, "grad_norm": 0.01190185546875, "learning_rate": 4.8694972888521275e-05, "loss": 0.0005, "step": 19410 }, { "epoch": 1.2356047591779602, "grad_norm": 2.09375, "learning_rate": 4.869143814553244e-05, "loss": 0.0035, "step": 19420 }, { "epoch": 1.236241012915951, "grad_norm": 0.0859375, "learning_rate": 4.8687903402543606e-05, "loss": 0.0167, "step": 19430 }, { "epoch": 1.2368772666539416, "grad_norm": 0.0130615234375, "learning_rate": 4.868436865955477e-05, "loss": 0.0001, "step": 19440 }, { "epoch": 1.2375135203919323, "grad_norm": 3.421875, "learning_rate": 4.868083391656593e-05, "loss": 0.0056, "step": 19450 }, { "epoch": 1.238149774129923, "grad_norm": 1.78125, "learning_rate": 4.867729917357709e-05, "loss": 0.0015, "step": 19460 }, { "epoch": 1.2387860278679137, "grad_norm": 0.040771484375, "learning_rate": 4.867376443058825e-05, "loss": 0.0004, "step": 19470 }, { "epoch": 1.2394222816059044, "grad_norm": 1.234375, "learning_rate": 4.867022968759942e-05, "loss": 0.0008, "step": 19480 }, { "epoch": 1.240058535343895, "grad_norm": 0.006439208984375, "learning_rate": 4.8666694944610583e-05, "loss": 0.0021, "step": 19490 }, { "epoch": 1.2406947890818858, "grad_norm": 0.19140625, "learning_rate": 4.866316020162174e-05, "loss": 0.0045, "step": 19500 }, { "epoch": 1.2413310428198765, "grad_norm": 0.40625, "learning_rate": 4.865962545863291e-05, "loss": 0.0013, "step": 19510 }, { "epoch": 1.2419672965578672, "grad_norm": 0.0027923583984375, "learning_rate": 4.865609071564407e-05, "loss": 0.0001, "step": 19520 }, { "epoch": 1.242603550295858, "grad_norm": 3.265625, "learning_rate": 4.865255597265523e-05, "loss": 0.0044, "step": 19530 }, { "epoch": 1.2432398040338488, "grad_norm": 4.65625, "learning_rate": 4.864902122966639e-05, "loss": 0.0035, "step": 19540 }, { "epoch": 1.2438760577718395, "grad_norm": 0.1923828125, "learning_rate": 4.8645486486677554e-05, "loss": 0.0019, "step": 19550 }, { "epoch": 1.2445123115098302, "grad_norm": 0.01202392578125, "learning_rate": 4.864195174368872e-05, "loss": 0.0018, "step": 19560 }, { "epoch": 1.245148565247821, "grad_norm": 0.030517578125, "learning_rate": 4.8638417000699885e-05, "loss": 0.0014, "step": 19570 }, { "epoch": 1.2457848189858116, "grad_norm": 0.0078125, "learning_rate": 4.863488225771104e-05, "loss": 0.0007, "step": 19580 }, { "epoch": 1.2464210727238023, "grad_norm": 0.01904296875, "learning_rate": 4.863134751472221e-05, "loss": 0.0062, "step": 19590 }, { "epoch": 1.247057326461793, "grad_norm": 0.00933837890625, "learning_rate": 4.862781277173337e-05, "loss": 0.0002, "step": 19600 }, { "epoch": 1.2476935801997837, "grad_norm": 1.359375, "learning_rate": 4.862427802874453e-05, "loss": 0.0059, "step": 19610 }, { "epoch": 1.2483298339377744, "grad_norm": 0.1484375, "learning_rate": 4.862074328575569e-05, "loss": 0.0002, "step": 19620 }, { "epoch": 1.2489660876757651, "grad_norm": 0.00897216796875, "learning_rate": 4.8617208542766855e-05, "loss": 0.0003, "step": 19630 }, { "epoch": 1.2496023414137558, "grad_norm": 3.328125, "learning_rate": 4.861367379977802e-05, "loss": 0.0019, "step": 19640 }, { "epoch": 1.2502385951517465, "grad_norm": 0.016845703125, "learning_rate": 4.8610139056789186e-05, "loss": 0.0003, "step": 19650 }, { "epoch": 1.2508748488897372, "grad_norm": 0.046630859375, "learning_rate": 4.8606604313800344e-05, "loss": 0.0011, "step": 19660 }, { "epoch": 1.251511102627728, "grad_norm": 0.004425048828125, "learning_rate": 4.860306957081151e-05, "loss": 0.0001, "step": 19670 }, { "epoch": 1.2521473563657186, "grad_norm": 0.02392578125, "learning_rate": 4.8599534827822675e-05, "loss": 0.0006, "step": 19680 }, { "epoch": 1.2527836101037093, "grad_norm": 0.08642578125, "learning_rate": 4.859600008483383e-05, "loss": 0.0006, "step": 19690 }, { "epoch": 1.2534198638417, "grad_norm": 0.017333984375, "learning_rate": 4.8592465341845e-05, "loss": 0.0017, "step": 19700 }, { "epoch": 1.2540561175796907, "grad_norm": 5.125, "learning_rate": 4.8588930598856157e-05, "loss": 0.0048, "step": 19710 }, { "epoch": 1.2546923713176814, "grad_norm": 0.00262451171875, "learning_rate": 4.858539585586732e-05, "loss": 0.0005, "step": 19720 }, { "epoch": 1.255328625055672, "grad_norm": 0.01251220703125, "learning_rate": 4.858186111287849e-05, "loss": 0.0001, "step": 19730 }, { "epoch": 1.2559648787936628, "grad_norm": 0.0498046875, "learning_rate": 4.8578326369889645e-05, "loss": 0.0013, "step": 19740 }, { "epoch": 1.2566011325316535, "grad_norm": 0.0439453125, "learning_rate": 4.857479162690081e-05, "loss": 0.0003, "step": 19750 }, { "epoch": 1.2572373862696442, "grad_norm": 0.0038909912109375, "learning_rate": 4.8571256883911976e-05, "loss": 0.0001, "step": 19760 }, { "epoch": 1.257873640007635, "grad_norm": 0.04638671875, "learning_rate": 4.8567722140923134e-05, "loss": 0.0028, "step": 19770 }, { "epoch": 1.2585098937456258, "grad_norm": 0.0255126953125, "learning_rate": 4.85641873979343e-05, "loss": 0.0006, "step": 19780 }, { "epoch": 1.2591461474836165, "grad_norm": 0.02392578125, "learning_rate": 4.856065265494546e-05, "loss": 0.0051, "step": 19790 }, { "epoch": 1.2597824012216072, "grad_norm": 0.2021484375, "learning_rate": 4.855711791195662e-05, "loss": 0.0008, "step": 19800 }, { "epoch": 1.260418654959598, "grad_norm": 0.181640625, "learning_rate": 4.855358316896779e-05, "loss": 0.0005, "step": 19810 }, { "epoch": 1.2610549086975886, "grad_norm": 0.0036773681640625, "learning_rate": 4.855004842597895e-05, "loss": 0.0003, "step": 19820 }, { "epoch": 1.2616911624355793, "grad_norm": 0.0164794921875, "learning_rate": 4.854651368299011e-05, "loss": 0.0005, "step": 19830 }, { "epoch": 1.26232741617357, "grad_norm": 0.50390625, "learning_rate": 4.854297894000128e-05, "loss": 0.002, "step": 19840 }, { "epoch": 1.2629636699115607, "grad_norm": 0.044189453125, "learning_rate": 4.853944419701244e-05, "loss": 0.0003, "step": 19850 }, { "epoch": 1.2635999236495514, "grad_norm": 0.0067138671875, "learning_rate": 4.85359094540236e-05, "loss": 0.0004, "step": 19860 }, { "epoch": 1.2642361773875421, "grad_norm": 0.06494140625, "learning_rate": 4.853237471103476e-05, "loss": 0.0009, "step": 19870 }, { "epoch": 1.2648724311255328, "grad_norm": 0.008056640625, "learning_rate": 4.8528839968045924e-05, "loss": 0.0014, "step": 19880 }, { "epoch": 1.2655086848635235, "grad_norm": 0.009521484375, "learning_rate": 4.852530522505709e-05, "loss": 0.0002, "step": 19890 }, { "epoch": 1.2661449386015142, "grad_norm": 0.017333984375, "learning_rate": 4.8521770482068254e-05, "loss": 0.001, "step": 19900 }, { "epoch": 1.266781192339505, "grad_norm": 0.0234375, "learning_rate": 4.851823573907941e-05, "loss": 0.0008, "step": 19910 }, { "epoch": 1.2674174460774956, "grad_norm": 0.01544189453125, "learning_rate": 4.851470099609058e-05, "loss": 0.0002, "step": 19920 }, { "epoch": 1.2680536998154865, "grad_norm": 0.00128936767578125, "learning_rate": 4.851116625310174e-05, "loss": 0.0006, "step": 19930 }, { "epoch": 1.2686899535534772, "grad_norm": 1.3984375, "learning_rate": 4.85076315101129e-05, "loss": 0.001, "step": 19940 }, { "epoch": 1.269326207291468, "grad_norm": 0.0009918212890625, "learning_rate": 4.850409676712406e-05, "loss": 0.0006, "step": 19950 }, { "epoch": 1.2699624610294586, "grad_norm": 0.0888671875, "learning_rate": 4.8500562024135225e-05, "loss": 0.0002, "step": 19960 }, { "epoch": 1.2705987147674493, "grad_norm": 0.0194091796875, "learning_rate": 4.849702728114639e-05, "loss": 0.0013, "step": 19970 }, { "epoch": 1.27123496850544, "grad_norm": 0.05029296875, "learning_rate": 4.8493492538157556e-05, "loss": 0.0018, "step": 19980 }, { "epoch": 1.2718712222434307, "grad_norm": 0.0181884765625, "learning_rate": 4.8489957795168714e-05, "loss": 0.002, "step": 19990 }, { "epoch": 1.2725074759814214, "grad_norm": 0.0498046875, "learning_rate": 4.848642305217988e-05, "loss": 0.0051, "step": 20000 }, { "epoch": 1.2731437297194121, "grad_norm": 0.0196533203125, "learning_rate": 4.8482888309191044e-05, "loss": 0.0021, "step": 20010 }, { "epoch": 1.2737799834574028, "grad_norm": 0.0147705078125, "learning_rate": 4.84793535662022e-05, "loss": 0.0002, "step": 20020 }, { "epoch": 1.2744162371953935, "grad_norm": 0.62109375, "learning_rate": 4.847581882321337e-05, "loss": 0.0006, "step": 20030 }, { "epoch": 1.2750524909333842, "grad_norm": 0.01190185546875, "learning_rate": 4.8472284080224526e-05, "loss": 0.0002, "step": 20040 }, { "epoch": 1.275688744671375, "grad_norm": 0.0130615234375, "learning_rate": 4.846874933723569e-05, "loss": 0.0005, "step": 20050 }, { "epoch": 1.2763249984093656, "grad_norm": 0.19140625, "learning_rate": 4.846521459424686e-05, "loss": 0.0012, "step": 20060 }, { "epoch": 1.2769612521473563, "grad_norm": 0.00567626953125, "learning_rate": 4.8461679851258015e-05, "loss": 0.0004, "step": 20070 }, { "epoch": 1.277597505885347, "grad_norm": 0.00262451171875, "learning_rate": 4.845814510826918e-05, "loss": 0.0021, "step": 20080 }, { "epoch": 1.2782337596233377, "grad_norm": 0.048828125, "learning_rate": 4.8454610365280345e-05, "loss": 0.0004, "step": 20090 }, { "epoch": 1.2788700133613284, "grad_norm": 0.00982666015625, "learning_rate": 4.8451075622291504e-05, "loss": 0.009, "step": 20100 }, { "epoch": 1.2795062670993191, "grad_norm": 0.01153564453125, "learning_rate": 4.844754087930267e-05, "loss": 0.0007, "step": 20110 }, { "epoch": 1.2801425208373098, "grad_norm": 0.021484375, "learning_rate": 4.844400613631383e-05, "loss": 0.0014, "step": 20120 }, { "epoch": 1.2807787745753005, "grad_norm": 0.00015735626220703125, "learning_rate": 4.844047139332499e-05, "loss": 0.0051, "step": 20130 }, { "epoch": 1.2814150283132912, "grad_norm": 1.6796875, "learning_rate": 4.843693665033616e-05, "loss": 0.0008, "step": 20140 }, { "epoch": 1.282051282051282, "grad_norm": 0.0120849609375, "learning_rate": 4.843340190734732e-05, "loss": 0.0015, "step": 20150 }, { "epoch": 1.2826875357892726, "grad_norm": 0.0098876953125, "learning_rate": 4.842986716435848e-05, "loss": 0.0012, "step": 20160 }, { "epoch": 1.2833237895272633, "grad_norm": 0.26171875, "learning_rate": 4.842633242136965e-05, "loss": 0.0008, "step": 20170 }, { "epoch": 1.2839600432652543, "grad_norm": 0.0181884765625, "learning_rate": 4.8422797678380805e-05, "loss": 0.0005, "step": 20180 }, { "epoch": 1.284596297003245, "grad_norm": 0.01239013671875, "learning_rate": 4.841926293539197e-05, "loss": 0.0002, "step": 20190 }, { "epoch": 1.2852325507412357, "grad_norm": 0.040283203125, "learning_rate": 4.841572819240313e-05, "loss": 0.001, "step": 20200 }, { "epoch": 1.2858688044792264, "grad_norm": 0.0025482177734375, "learning_rate": 4.8412193449414294e-05, "loss": 0.0032, "step": 20210 }, { "epoch": 1.286505058217217, "grad_norm": 0.115234375, "learning_rate": 4.840865870642546e-05, "loss": 0.0003, "step": 20220 }, { "epoch": 1.2871413119552078, "grad_norm": 0.037109375, "learning_rate": 4.8405123963436624e-05, "loss": 0.0005, "step": 20230 }, { "epoch": 1.2877775656931985, "grad_norm": 0.330078125, "learning_rate": 4.840158922044778e-05, "loss": 0.0003, "step": 20240 }, { "epoch": 1.2884138194311892, "grad_norm": 0.00836181640625, "learning_rate": 4.839805447745895e-05, "loss": 0.0013, "step": 20250 }, { "epoch": 1.2890500731691799, "grad_norm": 0.37109375, "learning_rate": 4.8394519734470106e-05, "loss": 0.0013, "step": 20260 }, { "epoch": 1.2896863269071706, "grad_norm": 0.005462646484375, "learning_rate": 4.839098499148127e-05, "loss": 0.0001, "step": 20270 }, { "epoch": 1.2903225806451613, "grad_norm": 0.008544921875, "learning_rate": 4.838745024849243e-05, "loss": 0.0004, "step": 20280 }, { "epoch": 1.290958834383152, "grad_norm": 0.0206298828125, "learning_rate": 4.8383915505503595e-05, "loss": 0.0001, "step": 20290 }, { "epoch": 1.2915950881211427, "grad_norm": 0.006317138671875, "learning_rate": 4.838038076251476e-05, "loss": 0.0009, "step": 20300 }, { "epoch": 1.2922313418591334, "grad_norm": 0.003997802734375, "learning_rate": 4.8376846019525925e-05, "loss": 0.0019, "step": 20310 }, { "epoch": 1.292867595597124, "grad_norm": 0.015625, "learning_rate": 4.8373311276537084e-05, "loss": 0.0008, "step": 20320 }, { "epoch": 1.293503849335115, "grad_norm": 0.01422119140625, "learning_rate": 4.836977653354825e-05, "loss": 0.0012, "step": 20330 }, { "epoch": 1.2941401030731057, "grad_norm": 0.0279541015625, "learning_rate": 4.8366241790559414e-05, "loss": 0.0003, "step": 20340 }, { "epoch": 1.2947763568110964, "grad_norm": 0.0225830078125, "learning_rate": 4.836270704757057e-05, "loss": 0.0022, "step": 20350 }, { "epoch": 1.295412610549087, "grad_norm": 0.00811767578125, "learning_rate": 4.835917230458174e-05, "loss": 0.004, "step": 20360 }, { "epoch": 1.2960488642870778, "grad_norm": 0.001251220703125, "learning_rate": 4.8355637561592896e-05, "loss": 0.0003, "step": 20370 }, { "epoch": 1.2966851180250685, "grad_norm": 0.004638671875, "learning_rate": 4.835210281860406e-05, "loss": 0.0003, "step": 20380 }, { "epoch": 1.2973213717630592, "grad_norm": 0.01300048828125, "learning_rate": 4.8348568075615227e-05, "loss": 0.0043, "step": 20390 }, { "epoch": 1.2979576255010499, "grad_norm": 4.0, "learning_rate": 4.8345033332626385e-05, "loss": 0.0075, "step": 20400 }, { "epoch": 1.2985938792390406, "grad_norm": 0.004730224609375, "learning_rate": 4.834149858963755e-05, "loss": 0.0001, "step": 20410 }, { "epoch": 1.2992301329770313, "grad_norm": 0.005279541015625, "learning_rate": 4.8337963846648715e-05, "loss": 0.0026, "step": 20420 }, { "epoch": 1.299866386715022, "grad_norm": 0.0022430419921875, "learning_rate": 4.8334429103659874e-05, "loss": 0.0004, "step": 20430 }, { "epoch": 1.3005026404530127, "grad_norm": 0.00110626220703125, "learning_rate": 4.833089436067104e-05, "loss": 0.0067, "step": 20440 }, { "epoch": 1.3011388941910034, "grad_norm": 0.07421875, "learning_rate": 4.83273596176822e-05, "loss": 0.0016, "step": 20450 }, { "epoch": 1.301775147928994, "grad_norm": 0.015869140625, "learning_rate": 4.832382487469336e-05, "loss": 0.0013, "step": 20460 }, { "epoch": 1.3024114016669848, "grad_norm": 0.018310546875, "learning_rate": 4.832029013170453e-05, "loss": 0.0001, "step": 20470 }, { "epoch": 1.3030476554049755, "grad_norm": 0.021728515625, "learning_rate": 4.831675538871569e-05, "loss": 0.0107, "step": 20480 }, { "epoch": 1.3036839091429662, "grad_norm": 0.006927490234375, "learning_rate": 4.831322064572685e-05, "loss": 0.0003, "step": 20490 }, { "epoch": 1.3043201628809569, "grad_norm": 0.0213623046875, "learning_rate": 4.8309685902738016e-05, "loss": 0.0048, "step": 20500 }, { "epoch": 1.3049564166189476, "grad_norm": 0.08251953125, "learning_rate": 4.8306151159749175e-05, "loss": 0.0123, "step": 20510 }, { "epoch": 1.3055926703569383, "grad_norm": 0.59765625, "learning_rate": 4.830261641676034e-05, "loss": 0.0007, "step": 20520 }, { "epoch": 1.306228924094929, "grad_norm": 0.032958984375, "learning_rate": 4.82990816737715e-05, "loss": 0.0063, "step": 20530 }, { "epoch": 1.3068651778329197, "grad_norm": 0.0172119140625, "learning_rate": 4.8295546930782664e-05, "loss": 0.0005, "step": 20540 }, { "epoch": 1.3075014315709104, "grad_norm": 0.040771484375, "learning_rate": 4.829201218779383e-05, "loss": 0.001, "step": 20550 }, { "epoch": 1.308137685308901, "grad_norm": 0.007659912109375, "learning_rate": 4.8288477444804994e-05, "loss": 0.0023, "step": 20560 }, { "epoch": 1.3087739390468918, "grad_norm": 0.388671875, "learning_rate": 4.828494270181615e-05, "loss": 0.0003, "step": 20570 }, { "epoch": 1.3094101927848827, "grad_norm": 0.1484375, "learning_rate": 4.828140795882732e-05, "loss": 0.0014, "step": 20580 }, { "epoch": 1.3100464465228734, "grad_norm": 0.00933837890625, "learning_rate": 4.8277873215838476e-05, "loss": 0.0002, "step": 20590 }, { "epoch": 1.310682700260864, "grad_norm": 4.78125, "learning_rate": 4.827433847284964e-05, "loss": 0.0019, "step": 20600 }, { "epoch": 1.3113189539988548, "grad_norm": 0.03662109375, "learning_rate": 4.82708037298608e-05, "loss": 0.0002, "step": 20610 }, { "epoch": 1.3119552077368455, "grad_norm": 0.07421875, "learning_rate": 4.8267268986871965e-05, "loss": 0.0002, "step": 20620 }, { "epoch": 1.3125914614748362, "grad_norm": 0.3671875, "learning_rate": 4.826373424388313e-05, "loss": 0.0013, "step": 20630 }, { "epoch": 1.3132277152128269, "grad_norm": 0.00982666015625, "learning_rate": 4.8260199500894295e-05, "loss": 0.0064, "step": 20640 }, { "epoch": 1.3138639689508176, "grad_norm": 0.0034942626953125, "learning_rate": 4.8256664757905454e-05, "loss": 0.0001, "step": 20650 }, { "epoch": 1.3145002226888083, "grad_norm": 7.59375, "learning_rate": 4.825313001491662e-05, "loss": 0.007, "step": 20660 }, { "epoch": 1.315136476426799, "grad_norm": 0.2099609375, "learning_rate": 4.824959527192778e-05, "loss": 0.0003, "step": 20670 }, { "epoch": 1.3157727301647897, "grad_norm": 0.9375, "learning_rate": 4.824606052893894e-05, "loss": 0.0005, "step": 20680 }, { "epoch": 1.3164089839027804, "grad_norm": 2.140625, "learning_rate": 4.824252578595011e-05, "loss": 0.0011, "step": 20690 }, { "epoch": 1.317045237640771, "grad_norm": 1.2578125, "learning_rate": 4.8238991042961266e-05, "loss": 0.0027, "step": 20700 }, { "epoch": 1.3176814913787618, "grad_norm": 0.00860595703125, "learning_rate": 4.823545629997243e-05, "loss": 0.0004, "step": 20710 }, { "epoch": 1.3183177451167525, "grad_norm": 0.00927734375, "learning_rate": 4.8231921556983596e-05, "loss": 0.0001, "step": 20720 }, { "epoch": 1.3189539988547434, "grad_norm": 0.01068115234375, "learning_rate": 4.8228386813994755e-05, "loss": 0.0002, "step": 20730 }, { "epoch": 1.319590252592734, "grad_norm": 0.00396728515625, "learning_rate": 4.822485207100592e-05, "loss": 0.0001, "step": 20740 }, { "epoch": 1.3202265063307248, "grad_norm": 0.416015625, "learning_rate": 4.8221317328017085e-05, "loss": 0.0003, "step": 20750 }, { "epoch": 1.3208627600687155, "grad_norm": 0.083984375, "learning_rate": 4.8217782585028244e-05, "loss": 0.0007, "step": 20760 }, { "epoch": 1.3214990138067062, "grad_norm": 0.0296630859375, "learning_rate": 4.821424784203941e-05, "loss": 0.0124, "step": 20770 }, { "epoch": 1.322135267544697, "grad_norm": 0.1201171875, "learning_rate": 4.821071309905057e-05, "loss": 0.0013, "step": 20780 }, { "epoch": 1.3227715212826876, "grad_norm": 0.0107421875, "learning_rate": 4.820717835606173e-05, "loss": 0.0002, "step": 20790 }, { "epoch": 1.3234077750206783, "grad_norm": 0.0751953125, "learning_rate": 4.82036436130729e-05, "loss": 0.0012, "step": 20800 }, { "epoch": 1.324044028758669, "grad_norm": 0.0167236328125, "learning_rate": 4.820010887008406e-05, "loss": 0.0003, "step": 20810 }, { "epoch": 1.3246802824966597, "grad_norm": 0.0025634765625, "learning_rate": 4.819657412709522e-05, "loss": 0.0002, "step": 20820 }, { "epoch": 1.3253165362346504, "grad_norm": 0.005615234375, "learning_rate": 4.8193039384106386e-05, "loss": 0.0004, "step": 20830 }, { "epoch": 1.325952789972641, "grad_norm": 0.00054168701171875, "learning_rate": 4.8189504641117545e-05, "loss": 0.0013, "step": 20840 }, { "epoch": 1.3265890437106318, "grad_norm": 0.0003986358642578125, "learning_rate": 4.818596989812871e-05, "loss": 0.0003, "step": 20850 }, { "epoch": 1.3272252974486225, "grad_norm": 0.255859375, "learning_rate": 4.818243515513987e-05, "loss": 0.003, "step": 20860 }, { "epoch": 1.3278615511866132, "grad_norm": 0.0013275146484375, "learning_rate": 4.8178900412151033e-05, "loss": 0.0002, "step": 20870 }, { "epoch": 1.328497804924604, "grad_norm": 0.00457763671875, "learning_rate": 4.81753656691622e-05, "loss": 0.0005, "step": 20880 }, { "epoch": 1.3291340586625946, "grad_norm": 1.3671875, "learning_rate": 4.8171830926173364e-05, "loss": 0.0013, "step": 20890 }, { "epoch": 1.3297703124005853, "grad_norm": 2.40625, "learning_rate": 4.816829618318452e-05, "loss": 0.0015, "step": 20900 }, { "epoch": 1.330406566138576, "grad_norm": 9.0625, "learning_rate": 4.816476144019569e-05, "loss": 0.009, "step": 20910 }, { "epoch": 1.3310428198765667, "grad_norm": 0.00150299072265625, "learning_rate": 4.8161226697206846e-05, "loss": 0.0011, "step": 20920 }, { "epoch": 1.3316790736145574, "grad_norm": 0.059814453125, "learning_rate": 4.815769195421801e-05, "loss": 0.0005, "step": 20930 }, { "epoch": 1.332315327352548, "grad_norm": 0.022216796875, "learning_rate": 4.815415721122917e-05, "loss": 0.003, "step": 20940 }, { "epoch": 1.3329515810905388, "grad_norm": 0.0341796875, "learning_rate": 4.8150622468240335e-05, "loss": 0.0002, "step": 20950 }, { "epoch": 1.3335878348285295, "grad_norm": 0.0036773681640625, "learning_rate": 4.81470877252515e-05, "loss": 0.0016, "step": 20960 }, { "epoch": 1.3342240885665202, "grad_norm": 3.546875, "learning_rate": 4.8143552982262665e-05, "loss": 0.0028, "step": 20970 }, { "epoch": 1.3348603423045111, "grad_norm": 0.0419921875, "learning_rate": 4.814001823927382e-05, "loss": 0.0015, "step": 20980 }, { "epoch": 1.3354965960425018, "grad_norm": 0.00262451171875, "learning_rate": 4.813648349628499e-05, "loss": 0.0005, "step": 20990 }, { "epoch": 1.3361328497804925, "grad_norm": 0.01165771484375, "learning_rate": 4.813294875329615e-05, "loss": 0.0005, "step": 21000 }, { "epoch": 1.3367691035184832, "grad_norm": 0.0947265625, "learning_rate": 4.812941401030731e-05, "loss": 0.0002, "step": 21010 }, { "epoch": 1.337405357256474, "grad_norm": 0.0556640625, "learning_rate": 4.812587926731848e-05, "loss": 0.0023, "step": 21020 }, { "epoch": 1.3380416109944646, "grad_norm": 0.076171875, "learning_rate": 4.8122344524329636e-05, "loss": 0.0009, "step": 21030 }, { "epoch": 1.3386778647324553, "grad_norm": 0.0079345703125, "learning_rate": 4.81188097813408e-05, "loss": 0.0025, "step": 21040 }, { "epoch": 1.339314118470446, "grad_norm": 0.0084228515625, "learning_rate": 4.8115275038351966e-05, "loss": 0.0001, "step": 21050 }, { "epoch": 1.3399503722084367, "grad_norm": 0.1435546875, "learning_rate": 4.811174029536313e-05, "loss": 0.0004, "step": 21060 }, { "epoch": 1.3405866259464274, "grad_norm": 0.015869140625, "learning_rate": 4.810820555237429e-05, "loss": 0.0051, "step": 21070 }, { "epoch": 1.3412228796844181, "grad_norm": 0.01025390625, "learning_rate": 4.810467080938545e-05, "loss": 0.0006, "step": 21080 }, { "epoch": 1.3418591334224088, "grad_norm": 1.296875, "learning_rate": 4.810113606639661e-05, "loss": 0.0013, "step": 21090 }, { "epoch": 1.3424953871603995, "grad_norm": 0.0089111328125, "learning_rate": 4.809760132340778e-05, "loss": 0.0009, "step": 21100 }, { "epoch": 1.3431316408983902, "grad_norm": 0.005645751953125, "learning_rate": 4.809406658041894e-05, "loss": 0.0043, "step": 21110 }, { "epoch": 1.343767894636381, "grad_norm": 2.203125, "learning_rate": 4.80905318374301e-05, "loss": 0.0013, "step": 21120 }, { "epoch": 1.3444041483743718, "grad_norm": 6.46875, "learning_rate": 4.808699709444127e-05, "loss": 0.0143, "step": 21130 }, { "epoch": 1.3450404021123625, "grad_norm": 0.009521484375, "learning_rate": 4.808346235145243e-05, "loss": 0.0015, "step": 21140 }, { "epoch": 1.3456766558503532, "grad_norm": 0.1416015625, "learning_rate": 4.807992760846359e-05, "loss": 0.0007, "step": 21150 }, { "epoch": 1.346312909588344, "grad_norm": 0.2119140625, "learning_rate": 4.8076392865474756e-05, "loss": 0.0097, "step": 21160 }, { "epoch": 1.3469491633263346, "grad_norm": 0.0021209716796875, "learning_rate": 4.8072858122485914e-05, "loss": 0.0009, "step": 21170 }, { "epoch": 1.3475854170643253, "grad_norm": 0.033935546875, "learning_rate": 4.806932337949708e-05, "loss": 0.0011, "step": 21180 }, { "epoch": 1.348221670802316, "grad_norm": 0.1318359375, "learning_rate": 4.806578863650824e-05, "loss": 0.0012, "step": 21190 }, { "epoch": 1.3488579245403067, "grad_norm": 0.03173828125, "learning_rate": 4.80622538935194e-05, "loss": 0.0003, "step": 21200 }, { "epoch": 1.3494941782782974, "grad_norm": 0.01129150390625, "learning_rate": 4.805871915053057e-05, "loss": 0.0006, "step": 21210 }, { "epoch": 1.3501304320162881, "grad_norm": 0.02099609375, "learning_rate": 4.8055184407541734e-05, "loss": 0.0002, "step": 21220 }, { "epoch": 1.3507666857542788, "grad_norm": 0.04248046875, "learning_rate": 4.805164966455289e-05, "loss": 0.0012, "step": 21230 }, { "epoch": 1.3514029394922695, "grad_norm": 0.0283203125, "learning_rate": 4.804811492156406e-05, "loss": 0.0006, "step": 21240 }, { "epoch": 1.3520391932302602, "grad_norm": 0.0113525390625, "learning_rate": 4.8044580178575216e-05, "loss": 0.0007, "step": 21250 }, { "epoch": 1.352675446968251, "grad_norm": 0.01519775390625, "learning_rate": 4.804104543558638e-05, "loss": 0.0009, "step": 21260 }, { "epoch": 1.3533117007062416, "grad_norm": 0.05322265625, "learning_rate": 4.8037510692597546e-05, "loss": 0.0004, "step": 21270 }, { "epoch": 1.3539479544442323, "grad_norm": 0.0035400390625, "learning_rate": 4.8033975949608704e-05, "loss": 0.0006, "step": 21280 }, { "epoch": 1.354584208182223, "grad_norm": 0.005706787109375, "learning_rate": 4.803044120661987e-05, "loss": 0.0004, "step": 21290 }, { "epoch": 1.3552204619202137, "grad_norm": 0.0220947265625, "learning_rate": 4.8026906463631035e-05, "loss": 0.0036, "step": 21300 }, { "epoch": 1.3558567156582044, "grad_norm": 3.109375, "learning_rate": 4.802337172064219e-05, "loss": 0.0021, "step": 21310 }, { "epoch": 1.3564929693961951, "grad_norm": 0.103515625, "learning_rate": 4.801983697765336e-05, "loss": 0.0008, "step": 21320 }, { "epoch": 1.3571292231341858, "grad_norm": 2.546875, "learning_rate": 4.801630223466452e-05, "loss": 0.0024, "step": 21330 }, { "epoch": 1.3577654768721765, "grad_norm": 0.2138671875, "learning_rate": 4.801276749167568e-05, "loss": 0.0013, "step": 21340 }, { "epoch": 1.3584017306101672, "grad_norm": 0.035888671875, "learning_rate": 4.800923274868685e-05, "loss": 0.0004, "step": 21350 }, { "epoch": 1.359037984348158, "grad_norm": 0.0869140625, "learning_rate": 4.8005698005698006e-05, "loss": 0.0021, "step": 21360 }, { "epoch": 1.3596742380861486, "grad_norm": 0.00469970703125, "learning_rate": 4.800216326270917e-05, "loss": 0.0027, "step": 21370 }, { "epoch": 1.3603104918241395, "grad_norm": 0.010986328125, "learning_rate": 4.7998628519720336e-05, "loss": 0.0016, "step": 21380 }, { "epoch": 1.3609467455621302, "grad_norm": 0.1083984375, "learning_rate": 4.79950937767315e-05, "loss": 0.0003, "step": 21390 }, { "epoch": 1.361582999300121, "grad_norm": 0.0047607421875, "learning_rate": 4.799155903374266e-05, "loss": 0.0026, "step": 21400 }, { "epoch": 1.3622192530381116, "grad_norm": 0.1025390625, "learning_rate": 4.798802429075382e-05, "loss": 0.0003, "step": 21410 }, { "epoch": 1.3628555067761023, "grad_norm": 0.341796875, "learning_rate": 4.798448954776498e-05, "loss": 0.0006, "step": 21420 }, { "epoch": 1.363491760514093, "grad_norm": 0.00045013427734375, "learning_rate": 4.798095480477615e-05, "loss": 0.0015, "step": 21430 }, { "epoch": 1.3641280142520837, "grad_norm": 0.006134033203125, "learning_rate": 4.797742006178731e-05, "loss": 0.0003, "step": 21440 }, { "epoch": 1.3647642679900744, "grad_norm": 0.00286865234375, "learning_rate": 4.797388531879847e-05, "loss": 0.0011, "step": 21450 }, { "epoch": 1.3654005217280651, "grad_norm": 0.04638671875, "learning_rate": 4.797035057580964e-05, "loss": 0.0003, "step": 21460 }, { "epoch": 1.3660367754660558, "grad_norm": 0.0128173828125, "learning_rate": 4.79668158328208e-05, "loss": 0.0012, "step": 21470 }, { "epoch": 1.3666730292040465, "grad_norm": 0.00148773193359375, "learning_rate": 4.796328108983196e-05, "loss": 0.0019, "step": 21480 }, { "epoch": 1.3673092829420372, "grad_norm": 0.00982666015625, "learning_rate": 4.795974634684312e-05, "loss": 0.0006, "step": 21490 }, { "epoch": 1.367945536680028, "grad_norm": 0.0908203125, "learning_rate": 4.7956211603854284e-05, "loss": 0.0086, "step": 21500 }, { "epoch": 1.3685817904180186, "grad_norm": 0.001800537109375, "learning_rate": 4.795267686086545e-05, "loss": 0.0024, "step": 21510 }, { "epoch": 1.3692180441560093, "grad_norm": 0.035400390625, "learning_rate": 4.794914211787661e-05, "loss": 0.0003, "step": 21520 }, { "epoch": 1.3698542978940003, "grad_norm": 0.1953125, "learning_rate": 4.794560737488777e-05, "loss": 0.0003, "step": 21530 }, { "epoch": 1.370490551631991, "grad_norm": 0.00152587890625, "learning_rate": 4.794207263189894e-05, "loss": 0.0008, "step": 21540 }, { "epoch": 1.3711268053699817, "grad_norm": 0.051513671875, "learning_rate": 4.7938537888910103e-05, "loss": 0.0014, "step": 21550 }, { "epoch": 1.3717630591079724, "grad_norm": 0.2109375, "learning_rate": 4.793500314592126e-05, "loss": 0.0049, "step": 21560 }, { "epoch": 1.372399312845963, "grad_norm": 0.0220947265625, "learning_rate": 4.793146840293242e-05, "loss": 0.0002, "step": 21570 }, { "epoch": 1.3730355665839538, "grad_norm": 0.43359375, "learning_rate": 4.7927933659943585e-05, "loss": 0.0005, "step": 21580 }, { "epoch": 1.3736718203219445, "grad_norm": 0.01104736328125, "learning_rate": 4.792439891695475e-05, "loss": 0.0002, "step": 21590 }, { "epoch": 1.3743080740599352, "grad_norm": 0.000812530517578125, "learning_rate": 4.7920864173965916e-05, "loss": 0.0009, "step": 21600 }, { "epoch": 1.3749443277979259, "grad_norm": 0.1123046875, "learning_rate": 4.7917329430977074e-05, "loss": 0.0001, "step": 21610 }, { "epoch": 1.3755805815359166, "grad_norm": 0.87890625, "learning_rate": 4.791379468798824e-05, "loss": 0.0006, "step": 21620 }, { "epoch": 1.3762168352739073, "grad_norm": 0.05126953125, "learning_rate": 4.7910259944999405e-05, "loss": 0.0007, "step": 21630 }, { "epoch": 1.376853089011898, "grad_norm": 0.1025390625, "learning_rate": 4.790672520201056e-05, "loss": 0.0002, "step": 21640 }, { "epoch": 1.3774893427498887, "grad_norm": 0.004547119140625, "learning_rate": 4.790319045902173e-05, "loss": 0.0006, "step": 21650 }, { "epoch": 1.3781255964878794, "grad_norm": 0.00799560546875, "learning_rate": 4.7899655716032887e-05, "loss": 0.0109, "step": 21660 }, { "epoch": 1.37876185022587, "grad_norm": 0.002105712890625, "learning_rate": 4.789612097304405e-05, "loss": 0.0032, "step": 21670 }, { "epoch": 1.3793981039638608, "grad_norm": 4.96875, "learning_rate": 4.789258623005522e-05, "loss": 0.0041, "step": 21680 }, { "epoch": 1.3800343577018515, "grad_norm": 4.78125, "learning_rate": 4.7889051487066375e-05, "loss": 0.0089, "step": 21690 }, { "epoch": 1.3806706114398422, "grad_norm": 0.0245361328125, "learning_rate": 4.788551674407754e-05, "loss": 0.0006, "step": 21700 }, { "epoch": 1.3813068651778329, "grad_norm": 0.45703125, "learning_rate": 4.7881982001088706e-05, "loss": 0.0004, "step": 21710 }, { "epoch": 1.3819431189158236, "grad_norm": 7.3125, "learning_rate": 4.787844725809987e-05, "loss": 0.0048, "step": 21720 }, { "epoch": 1.3825793726538143, "grad_norm": 0.008544921875, "learning_rate": 4.787491251511103e-05, "loss": 0.0011, "step": 21730 }, { "epoch": 1.383215626391805, "grad_norm": 0.006378173828125, "learning_rate": 4.787137777212219e-05, "loss": 0.0003, "step": 21740 }, { "epoch": 1.3838518801297957, "grad_norm": 0.87109375, "learning_rate": 4.786784302913335e-05, "loss": 0.0013, "step": 21750 }, { "epoch": 1.3844881338677864, "grad_norm": 0.03857421875, "learning_rate": 4.786430828614452e-05, "loss": 0.0017, "step": 21760 }, { "epoch": 1.385124387605777, "grad_norm": 0.043701171875, "learning_rate": 4.7860773543155677e-05, "loss": 0.0011, "step": 21770 }, { "epoch": 1.385760641343768, "grad_norm": 0.0277099609375, "learning_rate": 4.785723880016684e-05, "loss": 0.0005, "step": 21780 }, { "epoch": 1.3863968950817587, "grad_norm": 0.0322265625, "learning_rate": 4.785370405717801e-05, "loss": 0.0005, "step": 21790 }, { "epoch": 1.3870331488197494, "grad_norm": 16.75, "learning_rate": 4.785016931418917e-05, "loss": 0.0028, "step": 21800 }, { "epoch": 1.38766940255774, "grad_norm": 0.00665283203125, "learning_rate": 4.784663457120033e-05, "loss": 0.0004, "step": 21810 }, { "epoch": 1.3883056562957308, "grad_norm": 0.36328125, "learning_rate": 4.784309982821149e-05, "loss": 0.0004, "step": 21820 }, { "epoch": 1.3889419100337215, "grad_norm": 0.14453125, "learning_rate": 4.7839565085222654e-05, "loss": 0.0007, "step": 21830 }, { "epoch": 1.3895781637717122, "grad_norm": 0.00168609619140625, "learning_rate": 4.783603034223382e-05, "loss": 0.0008, "step": 21840 }, { "epoch": 1.3902144175097029, "grad_norm": 0.208984375, "learning_rate": 4.783249559924498e-05, "loss": 0.0005, "step": 21850 }, { "epoch": 1.3908506712476936, "grad_norm": 0.0024871826171875, "learning_rate": 4.782896085625614e-05, "loss": 0.0003, "step": 21860 }, { "epoch": 1.3914869249856843, "grad_norm": 0.07568359375, "learning_rate": 4.782542611326731e-05, "loss": 0.0003, "step": 21870 }, { "epoch": 1.392123178723675, "grad_norm": 1.453125, "learning_rate": 4.782189137027847e-05, "loss": 0.001, "step": 21880 }, { "epoch": 1.3927594324616657, "grad_norm": 0.00482177734375, "learning_rate": 4.781835662728963e-05, "loss": 0.0008, "step": 21890 }, { "epoch": 1.3933956861996564, "grad_norm": 0.004425048828125, "learning_rate": 4.781482188430079e-05, "loss": 0.0006, "step": 21900 }, { "epoch": 1.394031939937647, "grad_norm": 0.002593994140625, "learning_rate": 4.7811287141311955e-05, "loss": 0.0002, "step": 21910 }, { "epoch": 1.3946681936756378, "grad_norm": 0.00148773193359375, "learning_rate": 4.780775239832312e-05, "loss": 0.0016, "step": 21920 }, { "epoch": 1.3953044474136287, "grad_norm": 0.008544921875, "learning_rate": 4.7804217655334286e-05, "loss": 0.0004, "step": 21930 }, { "epoch": 1.3959407011516194, "grad_norm": 0.1015625, "learning_rate": 4.7800682912345444e-05, "loss": 0.0043, "step": 21940 }, { "epoch": 1.39657695488961, "grad_norm": 0.0625, "learning_rate": 4.779714816935661e-05, "loss": 0.0121, "step": 21950 }, { "epoch": 1.3972132086276008, "grad_norm": 0.1533203125, "learning_rate": 4.7793613426367774e-05, "loss": 0.0006, "step": 21960 }, { "epoch": 1.3978494623655915, "grad_norm": 0.314453125, "learning_rate": 4.779007868337893e-05, "loss": 0.0014, "step": 21970 }, { "epoch": 1.3984857161035822, "grad_norm": 6.65625, "learning_rate": 4.778654394039009e-05, "loss": 0.0058, "step": 21980 }, { "epoch": 1.3991219698415729, "grad_norm": 0.0196533203125, "learning_rate": 4.7783009197401256e-05, "loss": 0.0019, "step": 21990 }, { "epoch": 1.3997582235795636, "grad_norm": 8.916854858398438e-05, "learning_rate": 4.777947445441242e-05, "loss": 0.0022, "step": 22000 }, { "epoch": 1.4003944773175543, "grad_norm": 0.09375, "learning_rate": 4.777593971142359e-05, "loss": 0.0057, "step": 22010 }, { "epoch": 1.401030731055545, "grad_norm": 0.296875, "learning_rate": 4.7772404968434745e-05, "loss": 0.0098, "step": 22020 }, { "epoch": 1.4016669847935357, "grad_norm": 0.00142669677734375, "learning_rate": 4.776887022544591e-05, "loss": 0.0003, "step": 22030 }, { "epoch": 1.4023032385315264, "grad_norm": 0.005157470703125, "learning_rate": 4.7765335482457076e-05, "loss": 0.0003, "step": 22040 }, { "epoch": 1.402939492269517, "grad_norm": 0.004852294921875, "learning_rate": 4.776180073946824e-05, "loss": 0.0077, "step": 22050 }, { "epoch": 1.4035757460075078, "grad_norm": 0.0159912109375, "learning_rate": 4.77582659964794e-05, "loss": 0.0012, "step": 22060 }, { "epoch": 1.4042119997454985, "grad_norm": 0.00701904296875, "learning_rate": 4.775473125349056e-05, "loss": 0.001, "step": 22070 }, { "epoch": 1.4048482534834892, "grad_norm": 0.59765625, "learning_rate": 4.775119651050172e-05, "loss": 0.0008, "step": 22080 }, { "epoch": 1.4054845072214799, "grad_norm": 0.00384521484375, "learning_rate": 4.774766176751289e-05, "loss": 0.0006, "step": 22090 }, { "epoch": 1.4061207609594706, "grad_norm": 0.04541015625, "learning_rate": 4.7744127024524046e-05, "loss": 0.0002, "step": 22100 }, { "epoch": 1.4067570146974613, "grad_norm": 0.080078125, "learning_rate": 4.774059228153521e-05, "loss": 0.0002, "step": 22110 }, { "epoch": 1.407393268435452, "grad_norm": 0.042724609375, "learning_rate": 4.773705753854638e-05, "loss": 0.0006, "step": 22120 }, { "epoch": 1.4080295221734427, "grad_norm": 0.004241943359375, "learning_rate": 4.773352279555754e-05, "loss": 0.0061, "step": 22130 }, { "epoch": 1.4086657759114334, "grad_norm": 0.0027923583984375, "learning_rate": 4.77299880525687e-05, "loss": 0.0003, "step": 22140 }, { "epoch": 1.409302029649424, "grad_norm": 0.4296875, "learning_rate": 4.772645330957986e-05, "loss": 0.0015, "step": 22150 }, { "epoch": 1.4099382833874148, "grad_norm": 0.052978515625, "learning_rate": 4.7722918566591024e-05, "loss": 0.0005, "step": 22160 }, { "epoch": 1.4105745371254055, "grad_norm": 0.1318359375, "learning_rate": 4.771938382360219e-05, "loss": 0.0077, "step": 22170 }, { "epoch": 1.4112107908633964, "grad_norm": 0.006561279296875, "learning_rate": 4.771584908061335e-05, "loss": 0.0002, "step": 22180 }, { "epoch": 1.411847044601387, "grad_norm": 0.007232666015625, "learning_rate": 4.771231433762451e-05, "loss": 0.0002, "step": 22190 }, { "epoch": 1.4124832983393778, "grad_norm": 0.01544189453125, "learning_rate": 4.770877959463568e-05, "loss": 0.0004, "step": 22200 }, { "epoch": 1.4131195520773685, "grad_norm": 0.0050048828125, "learning_rate": 4.770524485164684e-05, "loss": 0.0062, "step": 22210 }, { "epoch": 1.4137558058153592, "grad_norm": 0.0034332275390625, "learning_rate": 4.7701710108658e-05, "loss": 0.0003, "step": 22220 }, { "epoch": 1.41439205955335, "grad_norm": 0.00604248046875, "learning_rate": 4.769817536566916e-05, "loss": 0.0005, "step": 22230 }, { "epoch": 1.4150283132913406, "grad_norm": 0.020751953125, "learning_rate": 4.7694640622680325e-05, "loss": 0.0002, "step": 22240 }, { "epoch": 1.4156645670293313, "grad_norm": 0.050537109375, "learning_rate": 4.769110587969149e-05, "loss": 0.0037, "step": 22250 }, { "epoch": 1.416300820767322, "grad_norm": 0.0118408203125, "learning_rate": 4.7687571136702655e-05, "loss": 0.0002, "step": 22260 }, { "epoch": 1.4169370745053127, "grad_norm": 0.12255859375, "learning_rate": 4.7684036393713814e-05, "loss": 0.0002, "step": 22270 }, { "epoch": 1.4175733282433034, "grad_norm": 0.0054931640625, "learning_rate": 4.768050165072498e-05, "loss": 0.0001, "step": 22280 }, { "epoch": 1.418209581981294, "grad_norm": 0.011962890625, "learning_rate": 4.7676966907736144e-05, "loss": 0.0161, "step": 22290 }, { "epoch": 1.4188458357192848, "grad_norm": 0.006927490234375, "learning_rate": 4.76734321647473e-05, "loss": 0.0007, "step": 22300 }, { "epoch": 1.4194820894572755, "grad_norm": 0.0299072265625, "learning_rate": 4.766989742175846e-05, "loss": 0.0005, "step": 22310 }, { "epoch": 1.4201183431952662, "grad_norm": 0.0380859375, "learning_rate": 4.7666362678769626e-05, "loss": 0.0059, "step": 22320 }, { "epoch": 1.4207545969332571, "grad_norm": 0.00799560546875, "learning_rate": 4.766282793578079e-05, "loss": 0.0005, "step": 22330 }, { "epoch": 1.4213908506712478, "grad_norm": 0.00823974609375, "learning_rate": 4.7659293192791957e-05, "loss": 0.001, "step": 22340 }, { "epoch": 1.4220271044092385, "grad_norm": 0.01904296875, "learning_rate": 4.7655758449803115e-05, "loss": 0.0003, "step": 22350 }, { "epoch": 1.4226633581472292, "grad_norm": 0.18359375, "learning_rate": 4.765222370681428e-05, "loss": 0.0013, "step": 22360 }, { "epoch": 1.42329961188522, "grad_norm": 0.051025390625, "learning_rate": 4.7648688963825445e-05, "loss": 0.013, "step": 22370 }, { "epoch": 1.4239358656232106, "grad_norm": 0.0034332275390625, "learning_rate": 4.764515422083661e-05, "loss": 0.0001, "step": 22380 }, { "epoch": 1.4245721193612013, "grad_norm": 0.08837890625, "learning_rate": 4.764161947784776e-05, "loss": 0.0004, "step": 22390 }, { "epoch": 1.425208373099192, "grad_norm": 0.0068359375, "learning_rate": 4.763808473485893e-05, "loss": 0.0001, "step": 22400 }, { "epoch": 1.4258446268371827, "grad_norm": 0.0118408203125, "learning_rate": 4.763454999187009e-05, "loss": 0.0005, "step": 22410 }, { "epoch": 1.4264808805751734, "grad_norm": 0.01904296875, "learning_rate": 4.763101524888126e-05, "loss": 0.0004, "step": 22420 }, { "epoch": 1.4271171343131641, "grad_norm": 0.07373046875, "learning_rate": 4.7627480505892416e-05, "loss": 0.0026, "step": 22430 }, { "epoch": 1.4277533880511548, "grad_norm": 0.00433349609375, "learning_rate": 4.762394576290358e-05, "loss": 0.0012, "step": 22440 }, { "epoch": 1.4283896417891455, "grad_norm": 0.083984375, "learning_rate": 4.7620411019914747e-05, "loss": 0.003, "step": 22450 }, { "epoch": 1.4290258955271362, "grad_norm": 0.7109375, "learning_rate": 4.761687627692591e-05, "loss": 0.0009, "step": 22460 }, { "epoch": 1.429662149265127, "grad_norm": 0.002532958984375, "learning_rate": 4.761334153393707e-05, "loss": 0.0003, "step": 22470 }, { "epoch": 1.4302984030031176, "grad_norm": 0.0166015625, "learning_rate": 4.760980679094823e-05, "loss": 0.0003, "step": 22480 }, { "epoch": 1.4309346567411083, "grad_norm": 0.0035247802734375, "learning_rate": 4.7606272047959394e-05, "loss": 0.0005, "step": 22490 }, { "epoch": 1.431570910479099, "grad_norm": 4.875, "learning_rate": 4.760273730497056e-05, "loss": 0.0049, "step": 22500 }, { "epoch": 1.4322071642170897, "grad_norm": 0.040771484375, "learning_rate": 4.759920256198172e-05, "loss": 0.0002, "step": 22510 }, { "epoch": 1.4328434179550804, "grad_norm": 0.00714111328125, "learning_rate": 4.759566781899288e-05, "loss": 0.0027, "step": 22520 }, { "epoch": 1.4334796716930711, "grad_norm": 0.0091552734375, "learning_rate": 4.759213307600405e-05, "loss": 0.0003, "step": 22530 }, { "epoch": 1.4341159254310618, "grad_norm": 0.02685546875, "learning_rate": 4.758859833301521e-05, "loss": 0.0005, "step": 22540 }, { "epoch": 1.4347521791690525, "grad_norm": 0.0042724609375, "learning_rate": 4.758506359002637e-05, "loss": 0.0001, "step": 22550 }, { "epoch": 1.4353884329070432, "grad_norm": 0.022216796875, "learning_rate": 4.758152884703753e-05, "loss": 0.0004, "step": 22560 }, { "epoch": 1.436024686645034, "grad_norm": 0.0125732421875, "learning_rate": 4.7577994104048695e-05, "loss": 0.0003, "step": 22570 }, { "epoch": 1.4366609403830248, "grad_norm": 0.02734375, "learning_rate": 4.757445936105986e-05, "loss": 0.0007, "step": 22580 }, { "epoch": 1.4372971941210155, "grad_norm": 0.01348876953125, "learning_rate": 4.7570924618071025e-05, "loss": 0.0002, "step": 22590 }, { "epoch": 1.4379334478590062, "grad_norm": 0.07177734375, "learning_rate": 4.7567389875082184e-05, "loss": 0.0002, "step": 22600 }, { "epoch": 1.438569701596997, "grad_norm": 0.578125, "learning_rate": 4.756385513209335e-05, "loss": 0.0005, "step": 22610 }, { "epoch": 1.4392059553349876, "grad_norm": 0.0079345703125, "learning_rate": 4.7560320389104514e-05, "loss": 0.0002, "step": 22620 }, { "epoch": 1.4398422090729783, "grad_norm": 0.10693359375, "learning_rate": 4.755678564611567e-05, "loss": 0.0003, "step": 22630 }, { "epoch": 1.440478462810969, "grad_norm": 0.031982421875, "learning_rate": 4.755325090312683e-05, "loss": 0.0005, "step": 22640 }, { "epoch": 1.4411147165489597, "grad_norm": 0.17578125, "learning_rate": 4.7549716160137996e-05, "loss": 0.0003, "step": 22650 }, { "epoch": 1.4417509702869504, "grad_norm": 0.004638671875, "learning_rate": 4.754618141714916e-05, "loss": 0.0002, "step": 22660 }, { "epoch": 1.4423872240249411, "grad_norm": 0.06201171875, "learning_rate": 4.7542646674160326e-05, "loss": 0.0002, "step": 22670 }, { "epoch": 1.4430234777629318, "grad_norm": 0.001922607421875, "learning_rate": 4.7539111931171485e-05, "loss": 0.0004, "step": 22680 }, { "epoch": 1.4436597315009225, "grad_norm": 0.1435546875, "learning_rate": 4.753557718818265e-05, "loss": 0.0022, "step": 22690 }, { "epoch": 1.4442959852389132, "grad_norm": 0.012451171875, "learning_rate": 4.7532042445193815e-05, "loss": 0.0006, "step": 22700 }, { "epoch": 1.444932238976904, "grad_norm": 0.51953125, "learning_rate": 4.752850770220498e-05, "loss": 0.0005, "step": 22710 }, { "epoch": 1.4455684927148946, "grad_norm": 2.40625, "learning_rate": 4.752497295921613e-05, "loss": 0.0108, "step": 22720 }, { "epoch": 1.4462047464528855, "grad_norm": 0.01007080078125, "learning_rate": 4.75214382162273e-05, "loss": 0.0062, "step": 22730 }, { "epoch": 1.4468410001908762, "grad_norm": 0.03662109375, "learning_rate": 4.751790347323846e-05, "loss": 0.0021, "step": 22740 }, { "epoch": 1.447477253928867, "grad_norm": 0.490234375, "learning_rate": 4.751436873024963e-05, "loss": 0.0011, "step": 22750 }, { "epoch": 1.4481135076668576, "grad_norm": 0.00518798828125, "learning_rate": 4.7510833987260786e-05, "loss": 0.0062, "step": 22760 }, { "epoch": 1.4487497614048483, "grad_norm": 0.006103515625, "learning_rate": 4.750729924427195e-05, "loss": 0.0003, "step": 22770 }, { "epoch": 1.449386015142839, "grad_norm": 0.059814453125, "learning_rate": 4.7503764501283116e-05, "loss": 0.0003, "step": 22780 }, { "epoch": 1.4500222688808297, "grad_norm": 0.00286865234375, "learning_rate": 4.750022975829428e-05, "loss": 0.0003, "step": 22790 }, { "epoch": 1.4506585226188204, "grad_norm": 0.5, "learning_rate": 4.749669501530544e-05, "loss": 0.001, "step": 22800 }, { "epoch": 1.4512947763568111, "grad_norm": 0.373046875, "learning_rate": 4.74931602723166e-05, "loss": 0.0004, "step": 22810 }, { "epoch": 1.4519310300948018, "grad_norm": 0.0091552734375, "learning_rate": 4.7489625529327764e-05, "loss": 0.0006, "step": 22820 }, { "epoch": 1.4525672838327925, "grad_norm": 0.009033203125, "learning_rate": 4.748609078633893e-05, "loss": 0.0008, "step": 22830 }, { "epoch": 1.4532035375707832, "grad_norm": 0.041015625, "learning_rate": 4.748255604335009e-05, "loss": 0.0007, "step": 22840 }, { "epoch": 1.453839791308774, "grad_norm": 0.00286865234375, "learning_rate": 4.747902130036125e-05, "loss": 0.0001, "step": 22850 }, { "epoch": 1.4544760450467646, "grad_norm": 0.310546875, "learning_rate": 4.747548655737242e-05, "loss": 0.0018, "step": 22860 }, { "epoch": 1.4551122987847553, "grad_norm": 0.018798828125, "learning_rate": 4.747195181438358e-05, "loss": 0.0046, "step": 22870 }, { "epoch": 1.455748552522746, "grad_norm": 0.0091552734375, "learning_rate": 4.746841707139474e-05, "loss": 0.0003, "step": 22880 }, { "epoch": 1.4563848062607367, "grad_norm": 0.0093994140625, "learning_rate": 4.74648823284059e-05, "loss": 0.0004, "step": 22890 }, { "epoch": 1.4570210599987274, "grad_norm": 0.0224609375, "learning_rate": 4.7461347585417065e-05, "loss": 0.0001, "step": 22900 }, { "epoch": 1.4576573137367181, "grad_norm": 0.0308837890625, "learning_rate": 4.745781284242823e-05, "loss": 0.0066, "step": 22910 }, { "epoch": 1.4582935674747088, "grad_norm": 0.19921875, "learning_rate": 4.7454278099439395e-05, "loss": 0.0003, "step": 22920 }, { "epoch": 1.4589298212126995, "grad_norm": 0.017578125, "learning_rate": 4.7450743356450553e-05, "loss": 0.0032, "step": 22930 }, { "epoch": 1.4595660749506902, "grad_norm": 0.029052734375, "learning_rate": 4.744720861346172e-05, "loss": 0.0009, "step": 22940 }, { "epoch": 1.460202328688681, "grad_norm": 0.000705718994140625, "learning_rate": 4.7443673870472884e-05, "loss": 0.0035, "step": 22950 }, { "epoch": 1.4608385824266716, "grad_norm": 0.00335693359375, "learning_rate": 4.744013912748405e-05, "loss": 0.0052, "step": 22960 }, { "epoch": 1.4614748361646623, "grad_norm": 0.0003185272216796875, "learning_rate": 4.74366043844952e-05, "loss": 0.0003, "step": 22970 }, { "epoch": 1.4621110899026533, "grad_norm": 0.006866455078125, "learning_rate": 4.7433069641506366e-05, "loss": 0.004, "step": 22980 }, { "epoch": 1.462747343640644, "grad_norm": 0.0244140625, "learning_rate": 4.742953489851753e-05, "loss": 0.0006, "step": 22990 }, { "epoch": 1.4633835973786347, "grad_norm": 1.1015625, "learning_rate": 4.7426000155528696e-05, "loss": 0.0145, "step": 23000 }, { "epoch": 1.4640198511166254, "grad_norm": 0.005950927734375, "learning_rate": 4.7422465412539855e-05, "loss": 0.0037, "step": 23010 }, { "epoch": 1.464656104854616, "grad_norm": 0.07861328125, "learning_rate": 4.741893066955102e-05, "loss": 0.0038, "step": 23020 }, { "epoch": 1.4652923585926068, "grad_norm": 0.008056640625, "learning_rate": 4.7415395926562185e-05, "loss": 0.0006, "step": 23030 }, { "epoch": 1.4659286123305975, "grad_norm": 0.033447265625, "learning_rate": 4.741186118357335e-05, "loss": 0.0045, "step": 23040 }, { "epoch": 1.4665648660685882, "grad_norm": 0.373046875, "learning_rate": 4.74083264405845e-05, "loss": 0.0055, "step": 23050 }, { "epoch": 1.4672011198065789, "grad_norm": 0.244140625, "learning_rate": 4.740479169759567e-05, "loss": 0.0004, "step": 23060 }, { "epoch": 1.4678373735445696, "grad_norm": 0.008544921875, "learning_rate": 4.740125695460683e-05, "loss": 0.0008, "step": 23070 }, { "epoch": 1.4684736272825603, "grad_norm": 0.71875, "learning_rate": 4.7397722211618e-05, "loss": 0.0071, "step": 23080 }, { "epoch": 1.469109881020551, "grad_norm": 0.0213623046875, "learning_rate": 4.7394187468629156e-05, "loss": 0.0007, "step": 23090 }, { "epoch": 1.4697461347585417, "grad_norm": 0.00079345703125, "learning_rate": 4.739065272564032e-05, "loss": 0.0005, "step": 23100 }, { "epoch": 1.4703823884965324, "grad_norm": 0.0693359375, "learning_rate": 4.7387117982651486e-05, "loss": 0.0005, "step": 23110 }, { "epoch": 1.471018642234523, "grad_norm": 0.001678466796875, "learning_rate": 4.738358323966265e-05, "loss": 0.0062, "step": 23120 }, { "epoch": 1.471654895972514, "grad_norm": 0.005859375, "learning_rate": 4.738004849667381e-05, "loss": 0.0002, "step": 23130 }, { "epoch": 1.4722911497105047, "grad_norm": 0.056396484375, "learning_rate": 4.737651375368497e-05, "loss": 0.0016, "step": 23140 }, { "epoch": 1.4729274034484954, "grad_norm": 0.07861328125, "learning_rate": 4.737297901069613e-05, "loss": 0.0021, "step": 23150 }, { "epoch": 1.473563657186486, "grad_norm": 0.026123046875, "learning_rate": 4.73694442677073e-05, "loss": 0.0005, "step": 23160 }, { "epoch": 1.4741999109244768, "grad_norm": 0.00921630859375, "learning_rate": 4.7365909524718464e-05, "loss": 0.0003, "step": 23170 }, { "epoch": 1.4748361646624675, "grad_norm": 0.07568359375, "learning_rate": 4.736237478172962e-05, "loss": 0.0012, "step": 23180 }, { "epoch": 1.4754724184004582, "grad_norm": 0.0103759765625, "learning_rate": 4.735884003874079e-05, "loss": 0.0002, "step": 23190 }, { "epoch": 1.4761086721384489, "grad_norm": 0.0303955078125, "learning_rate": 4.735530529575195e-05, "loss": 0.0016, "step": 23200 }, { "epoch": 1.4767449258764396, "grad_norm": 0.033935546875, "learning_rate": 4.735177055276311e-05, "loss": 0.0002, "step": 23210 }, { "epoch": 1.4773811796144303, "grad_norm": 0.0140380859375, "learning_rate": 4.734823580977427e-05, "loss": 0.0004, "step": 23220 }, { "epoch": 1.478017433352421, "grad_norm": 0.032470703125, "learning_rate": 4.7344701066785434e-05, "loss": 0.0001, "step": 23230 }, { "epoch": 1.4786536870904117, "grad_norm": 0.0849609375, "learning_rate": 4.73411663237966e-05, "loss": 0.0018, "step": 23240 }, { "epoch": 1.4792899408284024, "grad_norm": 0.0888671875, "learning_rate": 4.7337631580807765e-05, "loss": 0.0002, "step": 23250 }, { "epoch": 1.479926194566393, "grad_norm": 1.4921875, "learning_rate": 4.733409683781892e-05, "loss": 0.0013, "step": 23260 }, { "epoch": 1.4805624483043838, "grad_norm": 0.66796875, "learning_rate": 4.733056209483009e-05, "loss": 0.001, "step": 23270 }, { "epoch": 1.4811987020423745, "grad_norm": 0.1259765625, "learning_rate": 4.7327027351841254e-05, "loss": 0.0007, "step": 23280 }, { "epoch": 1.4818349557803652, "grad_norm": 0.009765625, "learning_rate": 4.732349260885241e-05, "loss": 0.0009, "step": 23290 }, { "epoch": 1.4824712095183559, "grad_norm": 0.02587890625, "learning_rate": 4.731995786586357e-05, "loss": 0.0024, "step": 23300 }, { "epoch": 1.4831074632563466, "grad_norm": 0.00098419189453125, "learning_rate": 4.7316423122874736e-05, "loss": 0.0006, "step": 23310 }, { "epoch": 1.4837437169943373, "grad_norm": 0.333984375, "learning_rate": 4.73128883798859e-05, "loss": 0.0004, "step": 23320 }, { "epoch": 1.484379970732328, "grad_norm": 0.384765625, "learning_rate": 4.7309353636897066e-05, "loss": 0.0004, "step": 23330 }, { "epoch": 1.4850162244703187, "grad_norm": 0.012939453125, "learning_rate": 4.7305818893908224e-05, "loss": 0.0017, "step": 23340 }, { "epoch": 1.4856524782083094, "grad_norm": 0.0218505859375, "learning_rate": 4.730228415091939e-05, "loss": 0.001, "step": 23350 }, { "epoch": 1.4862887319463, "grad_norm": 2.21875, "learning_rate": 4.7298749407930555e-05, "loss": 0.0154, "step": 23360 }, { "epoch": 1.4869249856842908, "grad_norm": 0.00061798095703125, "learning_rate": 4.729521466494171e-05, "loss": 0.0034, "step": 23370 }, { "epoch": 1.4875612394222815, "grad_norm": 0.00162506103515625, "learning_rate": 4.729167992195287e-05, "loss": 0.0004, "step": 23380 }, { "epoch": 1.4881974931602724, "grad_norm": 0.0030059814453125, "learning_rate": 4.728814517896404e-05, "loss": 0.0001, "step": 23390 }, { "epoch": 1.488833746898263, "grad_norm": 1.0625, "learning_rate": 4.72846104359752e-05, "loss": 0.0038, "step": 23400 }, { "epoch": 1.4894700006362538, "grad_norm": 0.01031494140625, "learning_rate": 4.728107569298637e-05, "loss": 0.0021, "step": 23410 }, { "epoch": 1.4901062543742445, "grad_norm": 0.01190185546875, "learning_rate": 4.7277540949997526e-05, "loss": 0.0001, "step": 23420 }, { "epoch": 1.4907425081122352, "grad_norm": 0.006988525390625, "learning_rate": 4.727400620700869e-05, "loss": 0.0015, "step": 23430 }, { "epoch": 1.4913787618502259, "grad_norm": 0.287109375, "learning_rate": 4.7270471464019856e-05, "loss": 0.0007, "step": 23440 }, { "epoch": 1.4920150155882166, "grad_norm": 0.08740234375, "learning_rate": 4.726693672103102e-05, "loss": 0.0003, "step": 23450 }, { "epoch": 1.4926512693262073, "grad_norm": 0.0216064453125, "learning_rate": 4.726340197804218e-05, "loss": 0.0007, "step": 23460 }, { "epoch": 1.493287523064198, "grad_norm": 0.56640625, "learning_rate": 4.725986723505334e-05, "loss": 0.001, "step": 23470 }, { "epoch": 1.4939237768021887, "grad_norm": 0.482421875, "learning_rate": 4.72563324920645e-05, "loss": 0.0012, "step": 23480 }, { "epoch": 1.4945600305401794, "grad_norm": 7.65625, "learning_rate": 4.725279774907567e-05, "loss": 0.0101, "step": 23490 }, { "epoch": 1.49519628427817, "grad_norm": 0.08447265625, "learning_rate": 4.7249263006086834e-05, "loss": 0.0013, "step": 23500 }, { "epoch": 1.4958325380161608, "grad_norm": 0.05810546875, "learning_rate": 4.724572826309799e-05, "loss": 0.0015, "step": 23510 }, { "epoch": 1.4964687917541515, "grad_norm": 1.0390625, "learning_rate": 4.724219352010916e-05, "loss": 0.0034, "step": 23520 }, { "epoch": 1.4971050454921422, "grad_norm": 0.00689697265625, "learning_rate": 4.723865877712032e-05, "loss": 0.0046, "step": 23530 }, { "epoch": 1.497741299230133, "grad_norm": 0.013916015625, "learning_rate": 4.723512403413148e-05, "loss": 0.0004, "step": 23540 }, { "epoch": 1.4983775529681238, "grad_norm": 0.004547119140625, "learning_rate": 4.723158929114264e-05, "loss": 0.0002, "step": 23550 }, { "epoch": 1.4990138067061145, "grad_norm": 0.056396484375, "learning_rate": 4.7228054548153804e-05, "loss": 0.0003, "step": 23560 }, { "epoch": 1.4996500604441052, "grad_norm": 0.00106048583984375, "learning_rate": 4.722451980516497e-05, "loss": 0.0007, "step": 23570 }, { "epoch": 1.500286314182096, "grad_norm": 0.00982666015625, "learning_rate": 4.7220985062176135e-05, "loss": 0.0003, "step": 23580 }, { "epoch": 1.5009225679200866, "grad_norm": 0.061279296875, "learning_rate": 4.721745031918729e-05, "loss": 0.0006, "step": 23590 }, { "epoch": 1.5015588216580773, "grad_norm": 0.006683349609375, "learning_rate": 4.721391557619846e-05, "loss": 0.0004, "step": 23600 }, { "epoch": 1.502195075396068, "grad_norm": 0.0007781982421875, "learning_rate": 4.7210380833209623e-05, "loss": 0.001, "step": 23610 }, { "epoch": 1.5028313291340587, "grad_norm": 7.75, "learning_rate": 4.720684609022078e-05, "loss": 0.013, "step": 23620 }, { "epoch": 1.5034675828720494, "grad_norm": 0.00103759765625, "learning_rate": 4.720331134723194e-05, "loss": 0.0016, "step": 23630 }, { "epoch": 1.50410383661004, "grad_norm": 0.0361328125, "learning_rate": 4.7199776604243105e-05, "loss": 0.0024, "step": 23640 }, { "epoch": 1.5047400903480308, "grad_norm": 0.462890625, "learning_rate": 4.719624186125427e-05, "loss": 0.0048, "step": 23650 }, { "epoch": 1.5053763440860215, "grad_norm": 0.004241943359375, "learning_rate": 4.7192707118265436e-05, "loss": 0.0002, "step": 23660 }, { "epoch": 1.5060125978240122, "grad_norm": 0.11767578125, "learning_rate": 4.7189172375276594e-05, "loss": 0.0002, "step": 23670 }, { "epoch": 1.506648851562003, "grad_norm": 0.002105712890625, "learning_rate": 4.718563763228776e-05, "loss": 0.005, "step": 23680 }, { "epoch": 1.5072851052999936, "grad_norm": 0.015869140625, "learning_rate": 4.7182102889298925e-05, "loss": 0.0002, "step": 23690 }, { "epoch": 1.5079213590379843, "grad_norm": 0.0184326171875, "learning_rate": 4.717856814631008e-05, "loss": 0.0048, "step": 23700 }, { "epoch": 1.508557612775975, "grad_norm": 0.003753662109375, "learning_rate": 4.717503340332125e-05, "loss": 0.0005, "step": 23710 }, { "epoch": 1.5091938665139657, "grad_norm": 4.34375, "learning_rate": 4.717149866033241e-05, "loss": 0.0104, "step": 23720 }, { "epoch": 1.5098301202519564, "grad_norm": 0.0034027099609375, "learning_rate": 4.716796391734357e-05, "loss": 0.0016, "step": 23730 }, { "epoch": 1.510466373989947, "grad_norm": 0.1142578125, "learning_rate": 4.716442917435474e-05, "loss": 0.0007, "step": 23740 }, { "epoch": 1.5111026277279378, "grad_norm": 0.000972747802734375, "learning_rate": 4.7160894431365895e-05, "loss": 0.0001, "step": 23750 }, { "epoch": 1.5117388814659285, "grad_norm": 0.004241943359375, "learning_rate": 4.715735968837706e-05, "loss": 0.0001, "step": 23760 }, { "epoch": 1.5123751352039192, "grad_norm": 0.02783203125, "learning_rate": 4.7153824945388226e-05, "loss": 0.0015, "step": 23770 }, { "epoch": 1.51301138894191, "grad_norm": 0.00162506103515625, "learning_rate": 4.7150290202399384e-05, "loss": 0.0006, "step": 23780 }, { "epoch": 1.5136476426799006, "grad_norm": 0.0140380859375, "learning_rate": 4.714675545941055e-05, "loss": 0.0005, "step": 23790 }, { "epoch": 1.5142838964178913, "grad_norm": 0.0062255859375, "learning_rate": 4.714322071642171e-05, "loss": 0.0009, "step": 23800 }, { "epoch": 1.5149201501558822, "grad_norm": 0.032958984375, "learning_rate": 4.713968597343287e-05, "loss": 0.0018, "step": 23810 }, { "epoch": 1.515556403893873, "grad_norm": 0.0028839111328125, "learning_rate": 4.713615123044404e-05, "loss": 0.0002, "step": 23820 }, { "epoch": 1.5161926576318636, "grad_norm": 0.046875, "learning_rate": 4.71326164874552e-05, "loss": 0.0008, "step": 23830 }, { "epoch": 1.5168289113698543, "grad_norm": 0.0084228515625, "learning_rate": 4.712908174446636e-05, "loss": 0.0015, "step": 23840 }, { "epoch": 1.517465165107845, "grad_norm": 0.0196533203125, "learning_rate": 4.712554700147753e-05, "loss": 0.0015, "step": 23850 }, { "epoch": 1.5181014188458357, "grad_norm": 0.0050048828125, "learning_rate": 4.712201225848869e-05, "loss": 0.0006, "step": 23860 }, { "epoch": 1.5187376725838264, "grad_norm": 0.007720947265625, "learning_rate": 4.711847751549985e-05, "loss": 0.0008, "step": 23870 }, { "epoch": 1.5193739263218171, "grad_norm": 0.0030059814453125, "learning_rate": 4.711494277251101e-05, "loss": 0.0007, "step": 23880 }, { "epoch": 1.5200101800598078, "grad_norm": 0.00531005859375, "learning_rate": 4.7111408029522174e-05, "loss": 0.0002, "step": 23890 }, { "epoch": 1.5206464337977985, "grad_norm": 0.64453125, "learning_rate": 4.710787328653334e-05, "loss": 0.001, "step": 23900 }, { "epoch": 1.5212826875357894, "grad_norm": 0.04296875, "learning_rate": 4.7104338543544504e-05, "loss": 0.005, "step": 23910 }, { "epoch": 1.5219189412737801, "grad_norm": 0.02490234375, "learning_rate": 4.710080380055566e-05, "loss": 0.0002, "step": 23920 }, { "epoch": 1.5225551950117708, "grad_norm": 0.0166015625, "learning_rate": 4.709726905756683e-05, "loss": 0.0098, "step": 23930 }, { "epoch": 1.5231914487497615, "grad_norm": 0.00848388671875, "learning_rate": 4.709373431457799e-05, "loss": 0.0002, "step": 23940 }, { "epoch": 1.5238277024877522, "grad_norm": 0.005218505859375, "learning_rate": 4.709019957158915e-05, "loss": 0.0002, "step": 23950 }, { "epoch": 1.524463956225743, "grad_norm": 0.0125732421875, "learning_rate": 4.708666482860031e-05, "loss": 0.0001, "step": 23960 }, { "epoch": 1.5251002099637336, "grad_norm": 0.0032501220703125, "learning_rate": 4.7083130085611475e-05, "loss": 0.0003, "step": 23970 }, { "epoch": 1.5257364637017243, "grad_norm": 0.007293701171875, "learning_rate": 4.707959534262264e-05, "loss": 0.0002, "step": 23980 }, { "epoch": 1.526372717439715, "grad_norm": 0.0322265625, "learning_rate": 4.7076060599633806e-05, "loss": 0.0129, "step": 23990 }, { "epoch": 1.5270089711777057, "grad_norm": 0.00164031982421875, "learning_rate": 4.7072525856644964e-05, "loss": 0.0029, "step": 24000 }, { "epoch": 1.5276452249156964, "grad_norm": 0.053466796875, "learning_rate": 4.706899111365613e-05, "loss": 0.0001, "step": 24010 }, { "epoch": 1.5282814786536871, "grad_norm": 0.01458740234375, "learning_rate": 4.7065456370667294e-05, "loss": 0.0002, "step": 24020 }, { "epoch": 1.5289177323916778, "grad_norm": 0.021240234375, "learning_rate": 4.706192162767845e-05, "loss": 0.0004, "step": 24030 }, { "epoch": 1.5295539861296685, "grad_norm": 0.0067138671875, "learning_rate": 4.705838688468962e-05, "loss": 0.0061, "step": 24040 }, { "epoch": 1.5301902398676592, "grad_norm": 2.15625, "learning_rate": 4.7054852141700776e-05, "loss": 0.0013, "step": 24050 }, { "epoch": 1.53082649360565, "grad_norm": 0.006072998046875, "learning_rate": 4.705131739871194e-05, "loss": 0.0027, "step": 24060 }, { "epoch": 1.5314627473436406, "grad_norm": 0.002655029296875, "learning_rate": 4.704778265572311e-05, "loss": 0.0001, "step": 24070 }, { "epoch": 1.5320990010816313, "grad_norm": 0.028076171875, "learning_rate": 4.7044247912734265e-05, "loss": 0.0003, "step": 24080 }, { "epoch": 1.532735254819622, "grad_norm": 0.028076171875, "learning_rate": 4.704071316974543e-05, "loss": 0.0002, "step": 24090 }, { "epoch": 1.5333715085576127, "grad_norm": 0.00396728515625, "learning_rate": 4.7037178426756596e-05, "loss": 0.0002, "step": 24100 }, { "epoch": 1.5340077622956034, "grad_norm": 4.4375, "learning_rate": 4.7033643683767754e-05, "loss": 0.0026, "step": 24110 }, { "epoch": 1.5346440160335941, "grad_norm": 0.0079345703125, "learning_rate": 4.703010894077892e-05, "loss": 0.0041, "step": 24120 }, { "epoch": 1.5352802697715848, "grad_norm": 0.01806640625, "learning_rate": 4.702657419779008e-05, "loss": 0.0002, "step": 24130 }, { "epoch": 1.5359165235095755, "grad_norm": 3.921875, "learning_rate": 4.702303945480124e-05, "loss": 0.0089, "step": 24140 }, { "epoch": 1.5365527772475662, "grad_norm": 0.11572265625, "learning_rate": 4.701950471181241e-05, "loss": 0.0096, "step": 24150 }, { "epoch": 1.537189030985557, "grad_norm": 0.005157470703125, "learning_rate": 4.701596996882357e-05, "loss": 0.0008, "step": 24160 }, { "epoch": 1.5378252847235476, "grad_norm": 0.357421875, "learning_rate": 4.701243522583473e-05, "loss": 0.0046, "step": 24170 }, { "epoch": 1.5384615384615383, "grad_norm": 0.007598876953125, "learning_rate": 4.70089004828459e-05, "loss": 0.0037, "step": 24180 }, { "epoch": 1.539097792199529, "grad_norm": 0.003814697265625, "learning_rate": 4.7005365739857055e-05, "loss": 0.0006, "step": 24190 }, { "epoch": 1.5397340459375197, "grad_norm": 0.00909423828125, "learning_rate": 4.700183099686822e-05, "loss": 0.0022, "step": 24200 }, { "epoch": 1.5403702996755106, "grad_norm": 0.0035247802734375, "learning_rate": 4.699829625387938e-05, "loss": 0.005, "step": 24210 }, { "epoch": 1.5410065534135013, "grad_norm": 0.09326171875, "learning_rate": 4.6994761510890544e-05, "loss": 0.0019, "step": 24220 }, { "epoch": 1.541642807151492, "grad_norm": 0.0014190673828125, "learning_rate": 4.699122676790171e-05, "loss": 0.0012, "step": 24230 }, { "epoch": 1.5422790608894827, "grad_norm": 0.03955078125, "learning_rate": 4.6987692024912874e-05, "loss": 0.0003, "step": 24240 }, { "epoch": 1.5429153146274734, "grad_norm": 0.0068359375, "learning_rate": 4.698415728192403e-05, "loss": 0.0007, "step": 24250 }, { "epoch": 1.5435515683654641, "grad_norm": 1.0546875, "learning_rate": 4.69806225389352e-05, "loss": 0.0012, "step": 24260 }, { "epoch": 1.5441878221034548, "grad_norm": 0.007598876953125, "learning_rate": 4.6977087795946356e-05, "loss": 0.0006, "step": 24270 }, { "epoch": 1.5448240758414455, "grad_norm": 1.3828125, "learning_rate": 4.697355305295752e-05, "loss": 0.0134, "step": 24280 }, { "epoch": 1.5454603295794362, "grad_norm": 0.00433349609375, "learning_rate": 4.697001830996868e-05, "loss": 0.0004, "step": 24290 }, { "epoch": 1.546096583317427, "grad_norm": 0.08251953125, "learning_rate": 4.6966483566979845e-05, "loss": 0.0002, "step": 24300 }, { "epoch": 1.5467328370554179, "grad_norm": 0.00958251953125, "learning_rate": 4.696294882399101e-05, "loss": 0.0032, "step": 24310 }, { "epoch": 1.5473690907934086, "grad_norm": 0.000530242919921875, "learning_rate": 4.6959414081002175e-05, "loss": 0.0001, "step": 24320 }, { "epoch": 1.5480053445313993, "grad_norm": 6.46875, "learning_rate": 4.6955879338013334e-05, "loss": 0.0034, "step": 24330 }, { "epoch": 1.54864159826939, "grad_norm": 0.006561279296875, "learning_rate": 4.69523445950245e-05, "loss": 0.0003, "step": 24340 }, { "epoch": 1.5492778520073807, "grad_norm": 0.004852294921875, "learning_rate": 4.6948809852035664e-05, "loss": 0.0012, "step": 24350 }, { "epoch": 1.5499141057453714, "grad_norm": 0.01483154296875, "learning_rate": 4.694527510904682e-05, "loss": 0.0002, "step": 24360 }, { "epoch": 1.550550359483362, "grad_norm": 0.0018463134765625, "learning_rate": 4.694174036605799e-05, "loss": 0.0006, "step": 24370 }, { "epoch": 1.5511866132213528, "grad_norm": 0.52734375, "learning_rate": 4.6938205623069146e-05, "loss": 0.0004, "step": 24380 }, { "epoch": 1.5518228669593435, "grad_norm": 0.01025390625, "learning_rate": 4.693467088008031e-05, "loss": 0.0001, "step": 24390 }, { "epoch": 1.5524591206973342, "grad_norm": 0.07763671875, "learning_rate": 4.6931136137091477e-05, "loss": 0.0007, "step": 24400 }, { "epoch": 1.5530953744353249, "grad_norm": 0.00077056884765625, "learning_rate": 4.6927601394102635e-05, "loss": 0.0017, "step": 24410 }, { "epoch": 1.5537316281733156, "grad_norm": 0.0380859375, "learning_rate": 4.69240666511138e-05, "loss": 0.0004, "step": 24420 }, { "epoch": 1.5543678819113063, "grad_norm": 0.12890625, "learning_rate": 4.6920531908124965e-05, "loss": 0.0003, "step": 24430 }, { "epoch": 1.555004135649297, "grad_norm": 0.095703125, "learning_rate": 4.6916997165136124e-05, "loss": 0.0041, "step": 24440 }, { "epoch": 1.5556403893872877, "grad_norm": 0.00274658203125, "learning_rate": 4.691346242214729e-05, "loss": 0.0011, "step": 24450 }, { "epoch": 1.5562766431252784, "grad_norm": 4.4375, "learning_rate": 4.690992767915845e-05, "loss": 0.0032, "step": 24460 }, { "epoch": 1.556912896863269, "grad_norm": 0.00193023681640625, "learning_rate": 4.690639293616961e-05, "loss": 0.0003, "step": 24470 }, { "epoch": 1.5575491506012598, "grad_norm": 0.019287109375, "learning_rate": 4.690285819318078e-05, "loss": 0.0001, "step": 24480 }, { "epoch": 1.5581854043392505, "grad_norm": 0.05322265625, "learning_rate": 4.689932345019194e-05, "loss": 0.0003, "step": 24490 }, { "epoch": 1.5588216580772412, "grad_norm": 0.265625, "learning_rate": 4.68957887072031e-05, "loss": 0.0055, "step": 24500 }, { "epoch": 1.5594579118152319, "grad_norm": 10.3125, "learning_rate": 4.6892253964214267e-05, "loss": 0.021, "step": 24510 }, { "epoch": 1.5600941655532226, "grad_norm": 0.10400390625, "learning_rate": 4.6888719221225425e-05, "loss": 0.0007, "step": 24520 }, { "epoch": 1.5607304192912133, "grad_norm": 0.035888671875, "learning_rate": 4.688518447823659e-05, "loss": 0.0004, "step": 24530 }, { "epoch": 1.561366673029204, "grad_norm": 1.0625, "learning_rate": 4.688164973524775e-05, "loss": 0.0016, "step": 24540 }, { "epoch": 1.5620029267671947, "grad_norm": 0.00567626953125, "learning_rate": 4.6878114992258914e-05, "loss": 0.0009, "step": 24550 }, { "epoch": 1.5626391805051854, "grad_norm": 0.007720947265625, "learning_rate": 4.687458024927008e-05, "loss": 0.0017, "step": 24560 }, { "epoch": 1.563275434243176, "grad_norm": 0.05615234375, "learning_rate": 4.6871045506281244e-05, "loss": 0.0001, "step": 24570 }, { "epoch": 1.5639116879811668, "grad_norm": 0.007720947265625, "learning_rate": 4.68675107632924e-05, "loss": 0.0065, "step": 24580 }, { "epoch": 1.5645479417191575, "grad_norm": 0.6640625, "learning_rate": 4.686397602030357e-05, "loss": 0.0032, "step": 24590 }, { "epoch": 1.5651841954571482, "grad_norm": 0.01171875, "learning_rate": 4.6860441277314726e-05, "loss": 0.0044, "step": 24600 }, { "epoch": 1.565820449195139, "grad_norm": 0.40234375, "learning_rate": 4.685690653432589e-05, "loss": 0.0005, "step": 24610 }, { "epoch": 1.5664567029331298, "grad_norm": 0.025634765625, "learning_rate": 4.685337179133705e-05, "loss": 0.0001, "step": 24620 }, { "epoch": 1.5670929566711205, "grad_norm": 0.0023651123046875, "learning_rate": 4.6849837048348215e-05, "loss": 0.0007, "step": 24630 }, { "epoch": 1.5677292104091112, "grad_norm": 0.4453125, "learning_rate": 4.684630230535938e-05, "loss": 0.0013, "step": 24640 }, { "epoch": 1.5683654641471019, "grad_norm": 2.140625, "learning_rate": 4.6842767562370545e-05, "loss": 0.0023, "step": 24650 }, { "epoch": 1.5690017178850926, "grad_norm": 0.006927490234375, "learning_rate": 4.6839232819381704e-05, "loss": 0.0003, "step": 24660 }, { "epoch": 1.5696379716230833, "grad_norm": 0.203125, "learning_rate": 4.683569807639287e-05, "loss": 0.0007, "step": 24670 }, { "epoch": 1.570274225361074, "grad_norm": 0.004425048828125, "learning_rate": 4.683216333340403e-05, "loss": 0.0001, "step": 24680 }, { "epoch": 1.5709104790990647, "grad_norm": 0.00616455078125, "learning_rate": 4.682862859041519e-05, "loss": 0.0006, "step": 24690 }, { "epoch": 1.5715467328370554, "grad_norm": 3.59375, "learning_rate": 4.682509384742636e-05, "loss": 0.0033, "step": 24700 }, { "epoch": 1.5721829865750463, "grad_norm": 0.005462646484375, "learning_rate": 4.6821559104437516e-05, "loss": 0.0001, "step": 24710 }, { "epoch": 1.572819240313037, "grad_norm": 0.022705078125, "learning_rate": 4.681802436144868e-05, "loss": 0.0006, "step": 24720 }, { "epoch": 1.5734554940510277, "grad_norm": 0.041259765625, "learning_rate": 4.6814489618459846e-05, "loss": 0.0012, "step": 24730 }, { "epoch": 1.5740917477890184, "grad_norm": 0.1884765625, "learning_rate": 4.6810954875471005e-05, "loss": 0.0012, "step": 24740 }, { "epoch": 1.574728001527009, "grad_norm": 0.00147247314453125, "learning_rate": 4.680742013248217e-05, "loss": 0.0002, "step": 24750 }, { "epoch": 1.5753642552649998, "grad_norm": 0.1005859375, "learning_rate": 4.6803885389493335e-05, "loss": 0.0012, "step": 24760 }, { "epoch": 1.5760005090029905, "grad_norm": 0.040771484375, "learning_rate": 4.6800350646504494e-05, "loss": 0.0007, "step": 24770 }, { "epoch": 1.5766367627409812, "grad_norm": 0.00958251953125, "learning_rate": 4.679681590351566e-05, "loss": 0.0003, "step": 24780 }, { "epoch": 1.577273016478972, "grad_norm": 5.1875, "learning_rate": 4.679328116052682e-05, "loss": 0.013, "step": 24790 }, { "epoch": 1.5779092702169626, "grad_norm": 0.06103515625, "learning_rate": 4.678974641753798e-05, "loss": 0.0075, "step": 24800 }, { "epoch": 1.5785455239549533, "grad_norm": 0.00150299072265625, "learning_rate": 4.678621167454915e-05, "loss": 0.0032, "step": 24810 }, { "epoch": 1.579181777692944, "grad_norm": 0.0439453125, "learning_rate": 4.678267693156031e-05, "loss": 0.0015, "step": 24820 }, { "epoch": 1.5798180314309347, "grad_norm": 4.53125, "learning_rate": 4.677914218857147e-05, "loss": 0.0031, "step": 24830 }, { "epoch": 1.5804542851689254, "grad_norm": 0.038330078125, "learning_rate": 4.6775607445582636e-05, "loss": 0.0001, "step": 24840 }, { "epoch": 1.581090538906916, "grad_norm": 0.0025634765625, "learning_rate": 4.6772072702593795e-05, "loss": 0.0005, "step": 24850 }, { "epoch": 1.5817267926449068, "grad_norm": 0.023193359375, "learning_rate": 4.676853795960496e-05, "loss": 0.0002, "step": 24860 }, { "epoch": 1.5823630463828975, "grad_norm": 0.01141357421875, "learning_rate": 4.676500321661612e-05, "loss": 0.0003, "step": 24870 }, { "epoch": 1.5829993001208882, "grad_norm": 0.00787353515625, "learning_rate": 4.6761468473627284e-05, "loss": 0.0091, "step": 24880 }, { "epoch": 1.5836355538588789, "grad_norm": 0.00543212890625, "learning_rate": 4.675793373063845e-05, "loss": 0.0001, "step": 24890 }, { "epoch": 1.5842718075968696, "grad_norm": 0.001983642578125, "learning_rate": 4.6754398987649614e-05, "loss": 0.0012, "step": 24900 }, { "epoch": 1.5849080613348603, "grad_norm": 0.001434326171875, "learning_rate": 4.675086424466077e-05, "loss": 0.0053, "step": 24910 }, { "epoch": 1.585544315072851, "grad_norm": 0.004302978515625, "learning_rate": 4.674732950167194e-05, "loss": 0.0008, "step": 24920 }, { "epoch": 1.5861805688108417, "grad_norm": 0.11181640625, "learning_rate": 4.6743794758683096e-05, "loss": 0.0002, "step": 24930 }, { "epoch": 1.5868168225488324, "grad_norm": 0.01287841796875, "learning_rate": 4.674026001569426e-05, "loss": 0.0002, "step": 24940 }, { "epoch": 1.587453076286823, "grad_norm": 0.005584716796875, "learning_rate": 4.673672527270542e-05, "loss": 0.0017, "step": 24950 }, { "epoch": 1.5880893300248138, "grad_norm": 0.1494140625, "learning_rate": 4.6733190529716585e-05, "loss": 0.0004, "step": 24960 }, { "epoch": 1.5887255837628045, "grad_norm": 0.1708984375, "learning_rate": 4.672965578672775e-05, "loss": 0.001, "step": 24970 }, { "epoch": 1.5893618375007952, "grad_norm": 0.029296875, "learning_rate": 4.6726121043738915e-05, "loss": 0.0019, "step": 24980 }, { "epoch": 1.5899980912387859, "grad_norm": 0.018798828125, "learning_rate": 4.6722586300750073e-05, "loss": 0.0044, "step": 24990 }, { "epoch": 1.5906343449767766, "grad_norm": 1.90625, "learning_rate": 4.671905155776124e-05, "loss": 0.0011, "step": 25000 }, { "epoch": 1.5912705987147675, "grad_norm": 0.0174560546875, "learning_rate": 4.67155168147724e-05, "loss": 0.0031, "step": 25010 }, { "epoch": 1.5919068524527582, "grad_norm": 0.2392578125, "learning_rate": 4.671198207178356e-05, "loss": 0.0001, "step": 25020 }, { "epoch": 1.592543106190749, "grad_norm": 0.484375, "learning_rate": 4.670844732879473e-05, "loss": 0.0004, "step": 25030 }, { "epoch": 1.5931793599287396, "grad_norm": 0.05322265625, "learning_rate": 4.6704912585805886e-05, "loss": 0.0028, "step": 25040 }, { "epoch": 1.5938156136667303, "grad_norm": 0.000896453857421875, "learning_rate": 4.670137784281705e-05, "loss": 0.0006, "step": 25050 }, { "epoch": 1.594451867404721, "grad_norm": 0.00543212890625, "learning_rate": 4.6697843099828216e-05, "loss": 0.0004, "step": 25060 }, { "epoch": 1.5950881211427117, "grad_norm": 0.2431640625, "learning_rate": 4.6694308356839375e-05, "loss": 0.0003, "step": 25070 }, { "epoch": 1.5957243748807024, "grad_norm": 0.00616455078125, "learning_rate": 4.669077361385054e-05, "loss": 0.0009, "step": 25080 }, { "epoch": 1.596360628618693, "grad_norm": 0.078125, "learning_rate": 4.66872388708617e-05, "loss": 0.0004, "step": 25090 }, { "epoch": 1.5969968823566838, "grad_norm": 0.1689453125, "learning_rate": 4.6683704127872863e-05, "loss": 0.0066, "step": 25100 }, { "epoch": 1.5976331360946747, "grad_norm": 0.427734375, "learning_rate": 4.668016938488403e-05, "loss": 0.0003, "step": 25110 }, { "epoch": 1.5982693898326654, "grad_norm": 0.002105712890625, "learning_rate": 4.667663464189519e-05, "loss": 0.0002, "step": 25120 }, { "epoch": 1.5989056435706561, "grad_norm": 0.006256103515625, "learning_rate": 4.667309989890635e-05, "loss": 0.0001, "step": 25130 }, { "epoch": 1.5995418973086468, "grad_norm": 0.08935546875, "learning_rate": 4.666956515591752e-05, "loss": 0.0016, "step": 25140 }, { "epoch": 1.6001781510466375, "grad_norm": 0.07958984375, "learning_rate": 4.666603041292868e-05, "loss": 0.0007, "step": 25150 }, { "epoch": 1.6008144047846282, "grad_norm": 0.404296875, "learning_rate": 4.666249566993984e-05, "loss": 0.0014, "step": 25160 }, { "epoch": 1.601450658522619, "grad_norm": 0.0250244140625, "learning_rate": 4.6658960926951006e-05, "loss": 0.0003, "step": 25170 }, { "epoch": 1.6020869122606096, "grad_norm": 0.0035400390625, "learning_rate": 4.6655426183962165e-05, "loss": 0.0009, "step": 25180 }, { "epoch": 1.6027231659986003, "grad_norm": 0.005126953125, "learning_rate": 4.665189144097333e-05, "loss": 0.0012, "step": 25190 }, { "epoch": 1.603359419736591, "grad_norm": 0.6875, "learning_rate": 4.664835669798449e-05, "loss": 0.0007, "step": 25200 }, { "epoch": 1.6039956734745817, "grad_norm": 0.01507568359375, "learning_rate": 4.664482195499565e-05, "loss": 0.0003, "step": 25210 }, { "epoch": 1.6046319272125724, "grad_norm": 0.0150146484375, "learning_rate": 4.664128721200682e-05, "loss": 0.0006, "step": 25220 }, { "epoch": 1.6052681809505631, "grad_norm": 0.00787353515625, "learning_rate": 4.6637752469017984e-05, "loss": 0.0055, "step": 25230 }, { "epoch": 1.6059044346885538, "grad_norm": 0.0208740234375, "learning_rate": 4.663421772602914e-05, "loss": 0.0018, "step": 25240 }, { "epoch": 1.6065406884265445, "grad_norm": 0.006256103515625, "learning_rate": 4.663068298304031e-05, "loss": 0.0003, "step": 25250 }, { "epoch": 1.6071769421645352, "grad_norm": 0.00286865234375, "learning_rate": 4.6627148240051466e-05, "loss": 0.0002, "step": 25260 }, { "epoch": 1.607813195902526, "grad_norm": 0.1845703125, "learning_rate": 4.662361349706263e-05, "loss": 0.0074, "step": 25270 }, { "epoch": 1.6084494496405166, "grad_norm": 0.00274658203125, "learning_rate": 4.662007875407379e-05, "loss": 0.0001, "step": 25280 }, { "epoch": 1.6090857033785073, "grad_norm": 0.006378173828125, "learning_rate": 4.6616544011084955e-05, "loss": 0.0002, "step": 25290 }, { "epoch": 1.609721957116498, "grad_norm": 0.025634765625, "learning_rate": 4.661300926809612e-05, "loss": 0.0049, "step": 25300 }, { "epoch": 1.6103582108544887, "grad_norm": 0.033935546875, "learning_rate": 4.6609474525107285e-05, "loss": 0.0064, "step": 25310 }, { "epoch": 1.6109944645924794, "grad_norm": 0.000888824462890625, "learning_rate": 4.660593978211844e-05, "loss": 0.001, "step": 25320 }, { "epoch": 1.6116307183304701, "grad_norm": 1.1484375, "learning_rate": 4.660240503912961e-05, "loss": 0.0019, "step": 25330 }, { "epoch": 1.6122669720684608, "grad_norm": 0.00518798828125, "learning_rate": 4.659887029614077e-05, "loss": 0.0018, "step": 25340 }, { "epoch": 1.6129032258064515, "grad_norm": 0.0191650390625, "learning_rate": 4.659533555315193e-05, "loss": 0.0003, "step": 25350 }, { "epoch": 1.6135394795444422, "grad_norm": 0.0079345703125, "learning_rate": 4.65918008101631e-05, "loss": 0.0004, "step": 25360 }, { "epoch": 1.614175733282433, "grad_norm": 0.034912109375, "learning_rate": 4.6588266067174256e-05, "loss": 0.0002, "step": 25370 }, { "epoch": 1.6148119870204236, "grad_norm": 0.0238037109375, "learning_rate": 4.658473132418542e-05, "loss": 0.0011, "step": 25380 }, { "epoch": 1.6154482407584143, "grad_norm": 0.0001583099365234375, "learning_rate": 4.6581196581196586e-05, "loss": 0.0014, "step": 25390 }, { "epoch": 1.616084494496405, "grad_norm": 0.0037078857421875, "learning_rate": 4.657766183820775e-05, "loss": 0.0033, "step": 25400 }, { "epoch": 1.616720748234396, "grad_norm": 0.09375, "learning_rate": 4.657412709521891e-05, "loss": 0.0026, "step": 25410 }, { "epoch": 1.6173570019723866, "grad_norm": 0.034912109375, "learning_rate": 4.657059235223007e-05, "loss": 0.0008, "step": 25420 }, { "epoch": 1.6179932557103773, "grad_norm": 0.024169921875, "learning_rate": 4.656705760924123e-05, "loss": 0.002, "step": 25430 }, { "epoch": 1.618629509448368, "grad_norm": 0.333984375, "learning_rate": 4.65635228662524e-05, "loss": 0.0008, "step": 25440 }, { "epoch": 1.6192657631863587, "grad_norm": 1.765625, "learning_rate": 4.655998812326356e-05, "loss": 0.0018, "step": 25450 }, { "epoch": 1.6199020169243494, "grad_norm": 0.6796875, "learning_rate": 4.655645338027472e-05, "loss": 0.0045, "step": 25460 }, { "epoch": 1.6205382706623401, "grad_norm": 0.0260009765625, "learning_rate": 4.655291863728589e-05, "loss": 0.0001, "step": 25470 }, { "epoch": 1.6211745244003308, "grad_norm": 0.0252685546875, "learning_rate": 4.654938389429705e-05, "loss": 0.0005, "step": 25480 }, { "epoch": 1.6218107781383215, "grad_norm": 0.10009765625, "learning_rate": 4.654584915130821e-05, "loss": 0.0006, "step": 25490 }, { "epoch": 1.6224470318763122, "grad_norm": 0.0029449462890625, "learning_rate": 4.654231440831937e-05, "loss": 0.0003, "step": 25500 }, { "epoch": 1.623083285614303, "grad_norm": 0.055419921875, "learning_rate": 4.6538779665330534e-05, "loss": 0.0003, "step": 25510 }, { "epoch": 1.6237195393522939, "grad_norm": 0.01483154296875, "learning_rate": 4.65352449223417e-05, "loss": 0.0017, "step": 25520 }, { "epoch": 1.6243557930902845, "grad_norm": 0.03955078125, "learning_rate": 4.653171017935286e-05, "loss": 0.0015, "step": 25530 }, { "epoch": 1.6249920468282752, "grad_norm": 0.009521484375, "learning_rate": 4.652817543636402e-05, "loss": 0.0002, "step": 25540 }, { "epoch": 1.625628300566266, "grad_norm": 0.00958251953125, "learning_rate": 4.652464069337519e-05, "loss": 0.0015, "step": 25550 }, { "epoch": 1.6262645543042566, "grad_norm": 0.08056640625, "learning_rate": 4.6521105950386354e-05, "loss": 0.0008, "step": 25560 }, { "epoch": 1.6269008080422473, "grad_norm": 0.609375, "learning_rate": 4.651757120739751e-05, "loss": 0.0007, "step": 25570 }, { "epoch": 1.627537061780238, "grad_norm": 0.001678466796875, "learning_rate": 4.651403646440867e-05, "loss": 0.0003, "step": 25580 }, { "epoch": 1.6281733155182287, "grad_norm": 0.00531005859375, "learning_rate": 4.6510501721419836e-05, "loss": 0.0005, "step": 25590 }, { "epoch": 1.6288095692562194, "grad_norm": 0.04150390625, "learning_rate": 4.6506966978431e-05, "loss": 0.0029, "step": 25600 }, { "epoch": 1.6294458229942101, "grad_norm": 0.0751953125, "learning_rate": 4.6503432235442166e-05, "loss": 0.0036, "step": 25610 }, { "epoch": 1.6300820767322008, "grad_norm": 0.267578125, "learning_rate": 4.6499897492453324e-05, "loss": 0.0008, "step": 25620 }, { "epoch": 1.6307183304701915, "grad_norm": 0.0184326171875, "learning_rate": 4.649636274946449e-05, "loss": 0.002, "step": 25630 }, { "epoch": 1.6313545842081822, "grad_norm": 4.59375, "learning_rate": 4.6492828006475655e-05, "loss": 0.0064, "step": 25640 }, { "epoch": 1.631990837946173, "grad_norm": 0.193359375, "learning_rate": 4.648929326348681e-05, "loss": 0.0012, "step": 25650 }, { "epoch": 1.6326270916841636, "grad_norm": 0.025634765625, "learning_rate": 4.648575852049798e-05, "loss": 0.0004, "step": 25660 }, { "epoch": 1.6332633454221543, "grad_norm": 0.03076171875, "learning_rate": 4.648222377750914e-05, "loss": 0.0001, "step": 25670 }, { "epoch": 1.633899599160145, "grad_norm": 0.0262451171875, "learning_rate": 4.64786890345203e-05, "loss": 0.0016, "step": 25680 }, { "epoch": 1.6345358528981357, "grad_norm": 0.00787353515625, "learning_rate": 4.647515429153147e-05, "loss": 0.0004, "step": 25690 }, { "epoch": 1.6351721066361264, "grad_norm": 0.01458740234375, "learning_rate": 4.6471619548542625e-05, "loss": 0.0001, "step": 25700 }, { "epoch": 1.6358083603741171, "grad_norm": 0.0166015625, "learning_rate": 4.646808480555379e-05, "loss": 0.0002, "step": 25710 }, { "epoch": 1.6364446141121078, "grad_norm": 0.000865936279296875, "learning_rate": 4.6464550062564956e-05, "loss": 0.0088, "step": 25720 }, { "epoch": 1.6370808678500985, "grad_norm": 0.390625, "learning_rate": 4.646101531957612e-05, "loss": 0.0004, "step": 25730 }, { "epoch": 1.6377171215880892, "grad_norm": 0.0224609375, "learning_rate": 4.645748057658728e-05, "loss": 0.0153, "step": 25740 }, { "epoch": 1.63835337532608, "grad_norm": 0.0225830078125, "learning_rate": 4.645394583359844e-05, "loss": 0.0009, "step": 25750 }, { "epoch": 1.6389896290640706, "grad_norm": 4.0625, "learning_rate": 4.64504110906096e-05, "loss": 0.006, "step": 25760 }, { "epoch": 1.6396258828020613, "grad_norm": 0.0079345703125, "learning_rate": 4.644687634762077e-05, "loss": 0.0019, "step": 25770 }, { "epoch": 1.640262136540052, "grad_norm": 0.11181640625, "learning_rate": 4.644334160463193e-05, "loss": 0.0004, "step": 25780 }, { "epoch": 1.6408983902780427, "grad_norm": 0.267578125, "learning_rate": 4.643980686164309e-05, "loss": 0.0007, "step": 25790 }, { "epoch": 1.6415346440160334, "grad_norm": 0.00677490234375, "learning_rate": 4.643627211865426e-05, "loss": 0.0026, "step": 25800 }, { "epoch": 1.6421708977540241, "grad_norm": 0.11083984375, "learning_rate": 4.643273737566542e-05, "loss": 0.0016, "step": 25810 }, { "epoch": 1.642807151492015, "grad_norm": 0.0712890625, "learning_rate": 4.642920263267658e-05, "loss": 0.0014, "step": 25820 }, { "epoch": 1.6434434052300058, "grad_norm": 0.043212890625, "learning_rate": 4.642566788968774e-05, "loss": 0.0129, "step": 25830 }, { "epoch": 1.6440796589679965, "grad_norm": 0.296875, "learning_rate": 4.6422133146698904e-05, "loss": 0.0051, "step": 25840 }, { "epoch": 1.6447159127059872, "grad_norm": 0.314453125, "learning_rate": 4.641859840371007e-05, "loss": 0.0021, "step": 25850 }, { "epoch": 1.6453521664439779, "grad_norm": 0.010986328125, "learning_rate": 4.641506366072123e-05, "loss": 0.0008, "step": 25860 }, { "epoch": 1.6459884201819686, "grad_norm": 0.03173828125, "learning_rate": 4.641152891773239e-05, "loss": 0.003, "step": 25870 }, { "epoch": 1.6466246739199593, "grad_norm": 0.0130615234375, "learning_rate": 4.640799417474356e-05, "loss": 0.0013, "step": 25880 }, { "epoch": 1.64726092765795, "grad_norm": 0.004638671875, "learning_rate": 4.640445943175472e-05, "loss": 0.0004, "step": 25890 }, { "epoch": 1.6478971813959407, "grad_norm": 0.09033203125, "learning_rate": 4.640092468876588e-05, "loss": 0.0012, "step": 25900 }, { "epoch": 1.6485334351339314, "grad_norm": 0.07275390625, "learning_rate": 4.639738994577704e-05, "loss": 0.0007, "step": 25910 }, { "epoch": 1.6491696888719223, "grad_norm": 0.1484375, "learning_rate": 4.6393855202788205e-05, "loss": 0.0003, "step": 25920 }, { "epoch": 1.649805942609913, "grad_norm": 0.0296630859375, "learning_rate": 4.639032045979937e-05, "loss": 0.0075, "step": 25930 }, { "epoch": 1.6504421963479037, "grad_norm": 0.000904083251953125, "learning_rate": 4.6386785716810536e-05, "loss": 0.0002, "step": 25940 }, { "epoch": 1.6510784500858944, "grad_norm": 0.00799560546875, "learning_rate": 4.6383250973821694e-05, "loss": 0.0003, "step": 25950 }, { "epoch": 1.651714703823885, "grad_norm": 0.2119140625, "learning_rate": 4.637971623083286e-05, "loss": 0.0005, "step": 25960 }, { "epoch": 1.6523509575618758, "grad_norm": 0.005462646484375, "learning_rate": 4.6376181487844024e-05, "loss": 0.0002, "step": 25970 }, { "epoch": 1.6529872112998665, "grad_norm": 0.11865234375, "learning_rate": 4.637264674485518e-05, "loss": 0.0006, "step": 25980 }, { "epoch": 1.6536234650378572, "grad_norm": 0.035888671875, "learning_rate": 4.636911200186634e-05, "loss": 0.0002, "step": 25990 }, { "epoch": 1.6542597187758479, "grad_norm": 0.0152587890625, "learning_rate": 4.6365577258877507e-05, "loss": 0.0004, "step": 26000 }, { "epoch": 1.6548959725138386, "grad_norm": 0.0172119140625, "learning_rate": 4.636204251588867e-05, "loss": 0.0004, "step": 26010 }, { "epoch": 1.6555322262518293, "grad_norm": 0.0020294189453125, "learning_rate": 4.635850777289984e-05, "loss": 0.0005, "step": 26020 }, { "epoch": 1.65616847998982, "grad_norm": 0.0013275146484375, "learning_rate": 4.6354973029910995e-05, "loss": 0.0001, "step": 26030 }, { "epoch": 1.6568047337278107, "grad_norm": 0.0021820068359375, "learning_rate": 4.635143828692216e-05, "loss": 0.0007, "step": 26040 }, { "epoch": 1.6574409874658014, "grad_norm": 3.09375, "learning_rate": 4.6347903543933326e-05, "loss": 0.0015, "step": 26050 }, { "epoch": 1.658077241203792, "grad_norm": 0.01806640625, "learning_rate": 4.634436880094449e-05, "loss": 0.0004, "step": 26060 }, { "epoch": 1.6587134949417828, "grad_norm": 0.00457763671875, "learning_rate": 4.634083405795565e-05, "loss": 0.0002, "step": 26070 }, { "epoch": 1.6593497486797735, "grad_norm": 0.02490234375, "learning_rate": 4.633729931496681e-05, "loss": 0.0063, "step": 26080 }, { "epoch": 1.6599860024177642, "grad_norm": 0.1337890625, "learning_rate": 4.633376457197797e-05, "loss": 0.0004, "step": 26090 }, { "epoch": 1.6606222561557549, "grad_norm": 0.859375, "learning_rate": 4.633022982898914e-05, "loss": 0.0008, "step": 26100 }, { "epoch": 1.6612585098937456, "grad_norm": 0.0260009765625, "learning_rate": 4.6326695086000296e-05, "loss": 0.0005, "step": 26110 }, { "epoch": 1.6618947636317363, "grad_norm": 2.234375, "learning_rate": 4.632316034301146e-05, "loss": 0.002, "step": 26120 }, { "epoch": 1.662531017369727, "grad_norm": 6.53125, "learning_rate": 4.631962560002263e-05, "loss": 0.0049, "step": 26130 }, { "epoch": 1.6631672711077177, "grad_norm": 0.001556396484375, "learning_rate": 4.631609085703379e-05, "loss": 0.0009, "step": 26140 }, { "epoch": 1.6638035248457084, "grad_norm": 0.00360107421875, "learning_rate": 4.631255611404495e-05, "loss": 0.0, "step": 26150 }, { "epoch": 1.664439778583699, "grad_norm": 0.00616455078125, "learning_rate": 4.630902137105611e-05, "loss": 0.0002, "step": 26160 }, { "epoch": 1.6650760323216898, "grad_norm": 0.0021820068359375, "learning_rate": 4.6305486628067274e-05, "loss": 0.0001, "step": 26170 }, { "epoch": 1.6657122860596805, "grad_norm": 0.035400390625, "learning_rate": 4.630195188507844e-05, "loss": 0.0047, "step": 26180 }, { "epoch": 1.6663485397976712, "grad_norm": 0.01531982421875, "learning_rate": 4.62984171420896e-05, "loss": 0.0002, "step": 26190 }, { "epoch": 1.6669847935356619, "grad_norm": 0.002838134765625, "learning_rate": 4.629488239910076e-05, "loss": 0.0003, "step": 26200 }, { "epoch": 1.6676210472736526, "grad_norm": 0.006439208984375, "learning_rate": 4.629134765611193e-05, "loss": 0.0015, "step": 26210 }, { "epoch": 1.6682573010116435, "grad_norm": 0.00299072265625, "learning_rate": 4.628781291312309e-05, "loss": 0.0003, "step": 26220 }, { "epoch": 1.6688935547496342, "grad_norm": 0.0115966796875, "learning_rate": 4.628427817013425e-05, "loss": 0.0002, "step": 26230 }, { "epoch": 1.6695298084876249, "grad_norm": 0.0079345703125, "learning_rate": 4.628074342714541e-05, "loss": 0.0007, "step": 26240 }, { "epoch": 1.6701660622256156, "grad_norm": 0.03466796875, "learning_rate": 4.6277208684156575e-05, "loss": 0.0006, "step": 26250 }, { "epoch": 1.6708023159636063, "grad_norm": 1.1640625, "learning_rate": 4.627367394116774e-05, "loss": 0.0009, "step": 26260 }, { "epoch": 1.671438569701597, "grad_norm": 0.024658203125, "learning_rate": 4.6270139198178906e-05, "loss": 0.0004, "step": 26270 }, { "epoch": 1.6720748234395877, "grad_norm": 0.0380859375, "learning_rate": 4.6266604455190064e-05, "loss": 0.0002, "step": 26280 }, { "epoch": 1.6727110771775784, "grad_norm": 0.001373291015625, "learning_rate": 4.626306971220123e-05, "loss": 0.0006, "step": 26290 }, { "epoch": 1.673347330915569, "grad_norm": 0.004241943359375, "learning_rate": 4.6259534969212394e-05, "loss": 0.0001, "step": 26300 }, { "epoch": 1.6739835846535598, "grad_norm": 0.369140625, "learning_rate": 4.625600022622355e-05, "loss": 0.0003, "step": 26310 }, { "epoch": 1.6746198383915507, "grad_norm": 0.0007476806640625, "learning_rate": 4.625246548323471e-05, "loss": 0.0004, "step": 26320 }, { "epoch": 1.6752560921295414, "grad_norm": 0.006134033203125, "learning_rate": 4.6248930740245876e-05, "loss": 0.0063, "step": 26330 }, { "epoch": 1.675892345867532, "grad_norm": 0.021240234375, "learning_rate": 4.624539599725704e-05, "loss": 0.0011, "step": 26340 }, { "epoch": 1.6765285996055228, "grad_norm": 0.0771484375, "learning_rate": 4.624186125426821e-05, "loss": 0.0002, "step": 26350 }, { "epoch": 1.6771648533435135, "grad_norm": 0.07470703125, "learning_rate": 4.6238326511279365e-05, "loss": 0.0002, "step": 26360 }, { "epoch": 1.6778011070815042, "grad_norm": 0.00958251953125, "learning_rate": 4.623479176829053e-05, "loss": 0.0006, "step": 26370 }, { "epoch": 1.678437360819495, "grad_norm": 0.08056640625, "learning_rate": 4.6231257025301695e-05, "loss": 0.0003, "step": 26380 }, { "epoch": 1.6790736145574856, "grad_norm": 0.337890625, "learning_rate": 4.622772228231286e-05, "loss": 0.0006, "step": 26390 }, { "epoch": 1.6797098682954763, "grad_norm": 0.140625, "learning_rate": 4.622418753932401e-05, "loss": 0.0002, "step": 26400 }, { "epoch": 1.680346122033467, "grad_norm": 0.007659912109375, "learning_rate": 4.622065279633518e-05, "loss": 0.0004, "step": 26410 }, { "epoch": 1.6809823757714577, "grad_norm": 0.00147247314453125, "learning_rate": 4.621711805334634e-05, "loss": 0.0003, "step": 26420 }, { "epoch": 1.6816186295094484, "grad_norm": 0.038818359375, "learning_rate": 4.621358331035751e-05, "loss": 0.0002, "step": 26430 }, { "epoch": 1.682254883247439, "grad_norm": 0.042724609375, "learning_rate": 4.6210048567368666e-05, "loss": 0.005, "step": 26440 }, { "epoch": 1.6828911369854298, "grad_norm": 0.022216796875, "learning_rate": 4.620651382437983e-05, "loss": 0.0001, "step": 26450 }, { "epoch": 1.6835273907234205, "grad_norm": 0.0015869140625, "learning_rate": 4.6202979081391e-05, "loss": 0.0004, "step": 26460 }, { "epoch": 1.6841636444614112, "grad_norm": 0.00640869140625, "learning_rate": 4.619944433840216e-05, "loss": 0.0035, "step": 26470 }, { "epoch": 1.684799898199402, "grad_norm": 0.011962890625, "learning_rate": 4.619590959541332e-05, "loss": 0.0003, "step": 26480 }, { "epoch": 1.6854361519373926, "grad_norm": 0.25390625, "learning_rate": 4.619237485242448e-05, "loss": 0.0006, "step": 26490 }, { "epoch": 1.6860724056753833, "grad_norm": 0.0038299560546875, "learning_rate": 4.6188840109435644e-05, "loss": 0.0003, "step": 26500 }, { "epoch": 1.686708659413374, "grad_norm": 0.0031585693359375, "learning_rate": 4.618530536644681e-05, "loss": 0.0006, "step": 26510 }, { "epoch": 1.6873449131513647, "grad_norm": 0.0072021484375, "learning_rate": 4.618177062345797e-05, "loss": 0.0001, "step": 26520 }, { "epoch": 1.6879811668893554, "grad_norm": 0.023681640625, "learning_rate": 4.617823588046913e-05, "loss": 0.003, "step": 26530 }, { "epoch": 1.688617420627346, "grad_norm": 0.01116943359375, "learning_rate": 4.61747011374803e-05, "loss": 0.0002, "step": 26540 }, { "epoch": 1.6892536743653368, "grad_norm": 0.0093994140625, "learning_rate": 4.617116639449146e-05, "loss": 0.0001, "step": 26550 }, { "epoch": 1.6898899281033275, "grad_norm": 1.546875, "learning_rate": 4.616763165150262e-05, "loss": 0.001, "step": 26560 }, { "epoch": 1.6905261818413182, "grad_norm": 0.000823974609375, "learning_rate": 4.616409690851378e-05, "loss": 0.0013, "step": 26570 }, { "epoch": 1.691162435579309, "grad_norm": 0.1513671875, "learning_rate": 4.6160562165524945e-05, "loss": 0.0095, "step": 26580 }, { "epoch": 1.6917986893172996, "grad_norm": 1.53125, "learning_rate": 4.615702742253611e-05, "loss": 0.0043, "step": 26590 }, { "epoch": 1.6924349430552903, "grad_norm": 6.8125, "learning_rate": 4.6153492679547275e-05, "loss": 0.0021, "step": 26600 }, { "epoch": 1.693071196793281, "grad_norm": 0.0047607421875, "learning_rate": 4.6149957936558434e-05, "loss": 0.0001, "step": 26610 }, { "epoch": 1.693707450531272, "grad_norm": 0.0732421875, "learning_rate": 4.61464231935696e-05, "loss": 0.0003, "step": 26620 }, { "epoch": 1.6943437042692626, "grad_norm": 0.0216064453125, "learning_rate": 4.6142888450580764e-05, "loss": 0.0002, "step": 26630 }, { "epoch": 1.6949799580072533, "grad_norm": 0.1259765625, "learning_rate": 4.613935370759192e-05, "loss": 0.0002, "step": 26640 }, { "epoch": 1.695616211745244, "grad_norm": 0.69921875, "learning_rate": 4.613581896460308e-05, "loss": 0.0011, "step": 26650 }, { "epoch": 1.6962524654832347, "grad_norm": 0.0024871826171875, "learning_rate": 4.6132284221614246e-05, "loss": 0.0085, "step": 26660 }, { "epoch": 1.6968887192212254, "grad_norm": 0.00159454345703125, "learning_rate": 4.612874947862541e-05, "loss": 0.0004, "step": 26670 }, { "epoch": 1.6975249729592161, "grad_norm": 0.0181884765625, "learning_rate": 4.6125214735636576e-05, "loss": 0.0006, "step": 26680 }, { "epoch": 1.6981612266972068, "grad_norm": 0.005584716796875, "learning_rate": 4.6121679992647735e-05, "loss": 0.0004, "step": 26690 }, { "epoch": 1.6987974804351975, "grad_norm": 0.006988525390625, "learning_rate": 4.61181452496589e-05, "loss": 0.0002, "step": 26700 }, { "epoch": 1.6994337341731882, "grad_norm": 0.09375, "learning_rate": 4.6114610506670065e-05, "loss": 0.0036, "step": 26710 }, { "epoch": 1.7000699879111791, "grad_norm": 0.06201171875, "learning_rate": 4.611107576368123e-05, "loss": 0.015, "step": 26720 }, { "epoch": 1.7007062416491698, "grad_norm": 0.0211181640625, "learning_rate": 4.610754102069238e-05, "loss": 0.0002, "step": 26730 }, { "epoch": 1.7013424953871605, "grad_norm": 0.12109375, "learning_rate": 4.610400627770355e-05, "loss": 0.0008, "step": 26740 }, { "epoch": 1.7019787491251512, "grad_norm": 0.1474609375, "learning_rate": 4.610047153471471e-05, "loss": 0.0003, "step": 26750 }, { "epoch": 1.702615002863142, "grad_norm": 0.007476806640625, "learning_rate": 4.609693679172588e-05, "loss": 0.0007, "step": 26760 }, { "epoch": 1.7032512566011326, "grad_norm": 0.003021240234375, "learning_rate": 4.6093402048737036e-05, "loss": 0.0002, "step": 26770 }, { "epoch": 1.7038875103391233, "grad_norm": 0.0888671875, "learning_rate": 4.60898673057482e-05, "loss": 0.0003, "step": 26780 }, { "epoch": 1.704523764077114, "grad_norm": 0.0089111328125, "learning_rate": 4.6086332562759366e-05, "loss": 0.0003, "step": 26790 }, { "epoch": 1.7051600178151047, "grad_norm": 0.0001926422119140625, "learning_rate": 4.608279781977053e-05, "loss": 0.0001, "step": 26800 }, { "epoch": 1.7057962715530954, "grad_norm": 0.609375, "learning_rate": 4.607926307678169e-05, "loss": 0.0005, "step": 26810 }, { "epoch": 1.7064325252910861, "grad_norm": 0.1416015625, "learning_rate": 4.607572833379285e-05, "loss": 0.0032, "step": 26820 }, { "epoch": 1.7070687790290768, "grad_norm": 0.6796875, "learning_rate": 4.6072193590804014e-05, "loss": 0.0005, "step": 26830 }, { "epoch": 1.7077050327670675, "grad_norm": 0.083984375, "learning_rate": 4.606865884781518e-05, "loss": 0.0005, "step": 26840 }, { "epoch": 1.7083412865050582, "grad_norm": 0.19140625, "learning_rate": 4.606512410482634e-05, "loss": 0.0019, "step": 26850 }, { "epoch": 1.708977540243049, "grad_norm": 0.0108642578125, "learning_rate": 4.60615893618375e-05, "loss": 0.0005, "step": 26860 }, { "epoch": 1.7096137939810396, "grad_norm": 0.0751953125, "learning_rate": 4.605805461884867e-05, "loss": 0.0005, "step": 26870 }, { "epoch": 1.7102500477190303, "grad_norm": 0.6953125, "learning_rate": 4.605451987585983e-05, "loss": 0.0004, "step": 26880 }, { "epoch": 1.710886301457021, "grad_norm": 0.0048828125, "learning_rate": 4.605098513287099e-05, "loss": 0.0006, "step": 26890 }, { "epoch": 1.7115225551950117, "grad_norm": 0.01397705078125, "learning_rate": 4.604745038988215e-05, "loss": 0.0008, "step": 26900 }, { "epoch": 1.7121588089330024, "grad_norm": 2.671875, "learning_rate": 4.6043915646893315e-05, "loss": 0.0038, "step": 26910 }, { "epoch": 1.7127950626709931, "grad_norm": 0.00946044921875, "learning_rate": 4.604038090390448e-05, "loss": 0.0032, "step": 26920 }, { "epoch": 1.7134313164089838, "grad_norm": 3.03125, "learning_rate": 4.6036846160915645e-05, "loss": 0.0056, "step": 26930 }, { "epoch": 1.7140675701469745, "grad_norm": 0.00093841552734375, "learning_rate": 4.6033311417926804e-05, "loss": 0.0012, "step": 26940 }, { "epoch": 1.7147038238849652, "grad_norm": 0.01324462890625, "learning_rate": 4.602977667493797e-05, "loss": 0.0033, "step": 26950 }, { "epoch": 1.715340077622956, "grad_norm": 0.0159912109375, "learning_rate": 4.6026241931949134e-05, "loss": 0.0092, "step": 26960 }, { "epoch": 1.7159763313609466, "grad_norm": 0.0128173828125, "learning_rate": 4.602270718896029e-05, "loss": 0.0003, "step": 26970 }, { "epoch": 1.7166125850989373, "grad_norm": 0.00396728515625, "learning_rate": 4.601917244597145e-05, "loss": 0.0081, "step": 26980 }, { "epoch": 1.717248838836928, "grad_norm": 0.004730224609375, "learning_rate": 4.6015637702982616e-05, "loss": 0.0007, "step": 26990 }, { "epoch": 1.7178850925749187, "grad_norm": 1.34375, "learning_rate": 4.601210295999378e-05, "loss": 0.002, "step": 27000 }, { "epoch": 1.7185213463129094, "grad_norm": 0.0103759765625, "learning_rate": 4.6008568217004946e-05, "loss": 0.0007, "step": 27010 }, { "epoch": 1.7191576000509003, "grad_norm": 0.0057373046875, "learning_rate": 4.6005033474016105e-05, "loss": 0.0005, "step": 27020 }, { "epoch": 1.719793853788891, "grad_norm": 0.0035552978515625, "learning_rate": 4.600149873102727e-05, "loss": 0.0006, "step": 27030 }, { "epoch": 1.7204301075268817, "grad_norm": 0.2890625, "learning_rate": 4.5997963988038435e-05, "loss": 0.0008, "step": 27040 }, { "epoch": 1.7210663612648724, "grad_norm": 0.005126953125, "learning_rate": 4.59944292450496e-05, "loss": 0.0003, "step": 27050 }, { "epoch": 1.7217026150028631, "grad_norm": 0.458984375, "learning_rate": 4.599089450206075e-05, "loss": 0.0012, "step": 27060 }, { "epoch": 1.7223388687408538, "grad_norm": 0.00104522705078125, "learning_rate": 4.598735975907192e-05, "loss": 0.0012, "step": 27070 }, { "epoch": 1.7229751224788445, "grad_norm": 0.023193359375, "learning_rate": 4.598382501608308e-05, "loss": 0.0025, "step": 27080 }, { "epoch": 1.7236113762168352, "grad_norm": 0.044677734375, "learning_rate": 4.598029027309425e-05, "loss": 0.0034, "step": 27090 }, { "epoch": 1.724247629954826, "grad_norm": 0.0022430419921875, "learning_rate": 4.5976755530105406e-05, "loss": 0.0004, "step": 27100 }, { "epoch": 1.7248838836928166, "grad_norm": 0.04150390625, "learning_rate": 4.597322078711657e-05, "loss": 0.0041, "step": 27110 }, { "epoch": 1.7255201374308076, "grad_norm": 0.00164794921875, "learning_rate": 4.5969686044127736e-05, "loss": 0.0003, "step": 27120 }, { "epoch": 1.7261563911687983, "grad_norm": 0.1298828125, "learning_rate": 4.59661513011389e-05, "loss": 0.0005, "step": 27130 }, { "epoch": 1.726792644906789, "grad_norm": 0.046630859375, "learning_rate": 4.596261655815006e-05, "loss": 0.0003, "step": 27140 }, { "epoch": 1.7274288986447797, "grad_norm": 0.083984375, "learning_rate": 4.595908181516122e-05, "loss": 0.0015, "step": 27150 }, { "epoch": 1.7280651523827704, "grad_norm": 0.02197265625, "learning_rate": 4.5955547072172383e-05, "loss": 0.0002, "step": 27160 }, { "epoch": 1.728701406120761, "grad_norm": 0.60546875, "learning_rate": 4.595201232918355e-05, "loss": 0.0226, "step": 27170 }, { "epoch": 1.7293376598587518, "grad_norm": 1.0625, "learning_rate": 4.594847758619471e-05, "loss": 0.0015, "step": 27180 }, { "epoch": 1.7299739135967425, "grad_norm": 0.033935546875, "learning_rate": 4.594494284320587e-05, "loss": 0.001, "step": 27190 }, { "epoch": 1.7306101673347332, "grad_norm": 0.0458984375, "learning_rate": 4.594140810021704e-05, "loss": 0.0008, "step": 27200 }, { "epoch": 1.7312464210727239, "grad_norm": 0.12890625, "learning_rate": 4.59378733572282e-05, "loss": 0.0004, "step": 27210 }, { "epoch": 1.7318826748107146, "grad_norm": 0.0303955078125, "learning_rate": 4.593433861423936e-05, "loss": 0.0069, "step": 27220 }, { "epoch": 1.7325189285487053, "grad_norm": 0.0625, "learning_rate": 4.593080387125052e-05, "loss": 0.0014, "step": 27230 }, { "epoch": 1.733155182286696, "grad_norm": 0.03125, "learning_rate": 4.5927269128261685e-05, "loss": 0.0002, "step": 27240 }, { "epoch": 1.7337914360246867, "grad_norm": 0.470703125, "learning_rate": 4.592373438527285e-05, "loss": 0.0004, "step": 27250 }, { "epoch": 1.7344276897626774, "grad_norm": 0.00616455078125, "learning_rate": 4.5920199642284015e-05, "loss": 0.0007, "step": 27260 }, { "epoch": 1.735063943500668, "grad_norm": 0.0186767578125, "learning_rate": 4.591666489929517e-05, "loss": 0.0045, "step": 27270 }, { "epoch": 1.7357001972386588, "grad_norm": 0.1328125, "learning_rate": 4.591313015630634e-05, "loss": 0.0061, "step": 27280 }, { "epoch": 1.7363364509766495, "grad_norm": 0.021484375, "learning_rate": 4.5909595413317504e-05, "loss": 0.0033, "step": 27290 }, { "epoch": 1.7369727047146402, "grad_norm": 0.00848388671875, "learning_rate": 4.590606067032866e-05, "loss": 0.0005, "step": 27300 }, { "epoch": 1.7376089584526309, "grad_norm": 0.00168609619140625, "learning_rate": 4.590252592733982e-05, "loss": 0.0002, "step": 27310 }, { "epoch": 1.7382452121906216, "grad_norm": 0.0026092529296875, "learning_rate": 4.5898991184350986e-05, "loss": 0.0096, "step": 27320 }, { "epoch": 1.7388814659286123, "grad_norm": 0.00311279296875, "learning_rate": 4.589545644136215e-05, "loss": 0.0005, "step": 27330 }, { "epoch": 1.739517719666603, "grad_norm": 0.011962890625, "learning_rate": 4.5891921698373316e-05, "loss": 0.0075, "step": 27340 }, { "epoch": 1.7401539734045937, "grad_norm": 0.0032958984375, "learning_rate": 4.5888386955384475e-05, "loss": 0.0001, "step": 27350 }, { "epoch": 1.7407902271425844, "grad_norm": 0.126953125, "learning_rate": 4.588485221239564e-05, "loss": 0.0003, "step": 27360 }, { "epoch": 1.741426480880575, "grad_norm": 0.01177978515625, "learning_rate": 4.5881317469406805e-05, "loss": 0.0069, "step": 27370 }, { "epoch": 1.7420627346185658, "grad_norm": 0.1083984375, "learning_rate": 4.587778272641796e-05, "loss": 0.0003, "step": 27380 }, { "epoch": 1.7426989883565565, "grad_norm": 0.0013275146484375, "learning_rate": 4.587424798342912e-05, "loss": 0.0042, "step": 27390 }, { "epoch": 1.7433352420945472, "grad_norm": 0.1337890625, "learning_rate": 4.587071324044029e-05, "loss": 0.0003, "step": 27400 }, { "epoch": 1.7439714958325379, "grad_norm": 0.01129150390625, "learning_rate": 4.586717849745145e-05, "loss": 0.0002, "step": 27410 }, { "epoch": 1.7446077495705288, "grad_norm": 0.006805419921875, "learning_rate": 4.586364375446262e-05, "loss": 0.0001, "step": 27420 }, { "epoch": 1.7452440033085195, "grad_norm": 0.01214599609375, "learning_rate": 4.5860109011473776e-05, "loss": 0.0019, "step": 27430 }, { "epoch": 1.7458802570465102, "grad_norm": 0.130859375, "learning_rate": 4.585657426848494e-05, "loss": 0.0029, "step": 27440 }, { "epoch": 1.7465165107845009, "grad_norm": 0.0458984375, "learning_rate": 4.5853039525496106e-05, "loss": 0.0005, "step": 27450 }, { "epoch": 1.7471527645224916, "grad_norm": 0.0250244140625, "learning_rate": 4.584950478250727e-05, "loss": 0.0021, "step": 27460 }, { "epoch": 1.7477890182604823, "grad_norm": 0.0250244140625, "learning_rate": 4.584597003951843e-05, "loss": 0.0002, "step": 27470 }, { "epoch": 1.748425271998473, "grad_norm": 0.049072265625, "learning_rate": 4.584243529652959e-05, "loss": 0.0002, "step": 27480 }, { "epoch": 1.7490615257364637, "grad_norm": 2.609375, "learning_rate": 4.583890055354075e-05, "loss": 0.0034, "step": 27490 }, { "epoch": 1.7496977794744544, "grad_norm": 7.71875, "learning_rate": 4.583536581055192e-05, "loss": 0.0137, "step": 27500 }, { "epoch": 1.750334033212445, "grad_norm": 0.00433349609375, "learning_rate": 4.583183106756308e-05, "loss": 0.0003, "step": 27510 }, { "epoch": 1.750970286950436, "grad_norm": 0.0146484375, "learning_rate": 4.582829632457424e-05, "loss": 0.0004, "step": 27520 }, { "epoch": 1.7516065406884267, "grad_norm": 0.03857421875, "learning_rate": 4.582476158158541e-05, "loss": 0.0004, "step": 27530 }, { "epoch": 1.7522427944264174, "grad_norm": 0.004119873046875, "learning_rate": 4.582122683859657e-05, "loss": 0.0009, "step": 27540 }, { "epoch": 1.752879048164408, "grad_norm": 0.0250244140625, "learning_rate": 4.581769209560773e-05, "loss": 0.0028, "step": 27550 }, { "epoch": 1.7535153019023988, "grad_norm": 0.1513671875, "learning_rate": 4.581415735261889e-05, "loss": 0.01, "step": 27560 }, { "epoch": 1.7541515556403895, "grad_norm": 0.0103759765625, "learning_rate": 4.5810622609630054e-05, "loss": 0.0015, "step": 27570 }, { "epoch": 1.7547878093783802, "grad_norm": 0.0103759765625, "learning_rate": 4.580708786664122e-05, "loss": 0.0006, "step": 27580 }, { "epoch": 1.755424063116371, "grad_norm": 0.1884765625, "learning_rate": 4.5803553123652385e-05, "loss": 0.0124, "step": 27590 }, { "epoch": 1.7560603168543616, "grad_norm": 0.0111083984375, "learning_rate": 4.580001838066354e-05, "loss": 0.0004, "step": 27600 }, { "epoch": 1.7566965705923523, "grad_norm": 0.026611328125, "learning_rate": 4.579648363767471e-05, "loss": 0.0019, "step": 27610 }, { "epoch": 1.757332824330343, "grad_norm": 0.004150390625, "learning_rate": 4.5792948894685874e-05, "loss": 0.0029, "step": 27620 }, { "epoch": 1.7579690780683337, "grad_norm": 0.0216064453125, "learning_rate": 4.578941415169703e-05, "loss": 0.0003, "step": 27630 }, { "epoch": 1.7586053318063244, "grad_norm": 0.1015625, "learning_rate": 4.578587940870819e-05, "loss": 0.0014, "step": 27640 }, { "epoch": 1.759241585544315, "grad_norm": 0.005157470703125, "learning_rate": 4.5782344665719356e-05, "loss": 0.0006, "step": 27650 }, { "epoch": 1.7598778392823058, "grad_norm": 0.052734375, "learning_rate": 4.577880992273052e-05, "loss": 0.0002, "step": 27660 }, { "epoch": 1.7605140930202965, "grad_norm": 0.00634765625, "learning_rate": 4.5775275179741686e-05, "loss": 0.0004, "step": 27670 }, { "epoch": 1.7611503467582872, "grad_norm": 0.007598876953125, "learning_rate": 4.5771740436752844e-05, "loss": 0.0002, "step": 27680 }, { "epoch": 1.7617866004962779, "grad_norm": 4.0, "learning_rate": 4.576820569376401e-05, "loss": 0.0095, "step": 27690 }, { "epoch": 1.7624228542342686, "grad_norm": 0.032470703125, "learning_rate": 4.5764670950775175e-05, "loss": 0.0008, "step": 27700 }, { "epoch": 1.7630591079722593, "grad_norm": 0.0172119140625, "learning_rate": 4.576113620778633e-05, "loss": 0.0003, "step": 27710 }, { "epoch": 1.76369536171025, "grad_norm": 0.1669921875, "learning_rate": 4.575760146479749e-05, "loss": 0.0013, "step": 27720 }, { "epoch": 1.7643316154482407, "grad_norm": 0.039306640625, "learning_rate": 4.575406672180866e-05, "loss": 0.0001, "step": 27730 }, { "epoch": 1.7649678691862314, "grad_norm": 0.005706787109375, "learning_rate": 4.575053197881982e-05, "loss": 0.0005, "step": 27740 }, { "epoch": 1.765604122924222, "grad_norm": 0.0245361328125, "learning_rate": 4.574699723583099e-05, "loss": 0.0074, "step": 27750 }, { "epoch": 1.7662403766622128, "grad_norm": 0.000797271728515625, "learning_rate": 4.5743462492842145e-05, "loss": 0.0001, "step": 27760 }, { "epoch": 1.7668766304002035, "grad_norm": 0.1923828125, "learning_rate": 4.573992774985331e-05, "loss": 0.0013, "step": 27770 }, { "epoch": 1.7675128841381942, "grad_norm": 0.0020599365234375, "learning_rate": 4.5736393006864476e-05, "loss": 0.0006, "step": 27780 }, { "epoch": 1.7681491378761849, "grad_norm": 0.053466796875, "learning_rate": 4.5732858263875634e-05, "loss": 0.0008, "step": 27790 }, { "epoch": 1.7687853916141756, "grad_norm": 1.359375, "learning_rate": 4.57293235208868e-05, "loss": 0.0016, "step": 27800 }, { "epoch": 1.7694216453521663, "grad_norm": 0.005523681640625, "learning_rate": 4.572578877789796e-05, "loss": 0.0039, "step": 27810 }, { "epoch": 1.7700578990901572, "grad_norm": 0.0108642578125, "learning_rate": 4.572225403490912e-05, "loss": 0.0005, "step": 27820 }, { "epoch": 1.770694152828148, "grad_norm": 0.046875, "learning_rate": 4.571871929192029e-05, "loss": 0.0022, "step": 27830 }, { "epoch": 1.7713304065661386, "grad_norm": 0.0081787109375, "learning_rate": 4.5715184548931453e-05, "loss": 0.0027, "step": 27840 }, { "epoch": 1.7719666603041293, "grad_norm": 0.00152587890625, "learning_rate": 4.571164980594261e-05, "loss": 0.0004, "step": 27850 }, { "epoch": 1.77260291404212, "grad_norm": 0.1044921875, "learning_rate": 4.570811506295378e-05, "loss": 0.0056, "step": 27860 }, { "epoch": 1.7732391677801107, "grad_norm": 0.00182342529296875, "learning_rate": 4.570458031996494e-05, "loss": 0.0041, "step": 27870 }, { "epoch": 1.7738754215181014, "grad_norm": 0.02392578125, "learning_rate": 4.57010455769761e-05, "loss": 0.0003, "step": 27880 }, { "epoch": 1.774511675256092, "grad_norm": 0.07470703125, "learning_rate": 4.569751083398726e-05, "loss": 0.0005, "step": 27890 }, { "epoch": 1.7751479289940828, "grad_norm": 0.0009918212890625, "learning_rate": 4.5693976090998424e-05, "loss": 0.0004, "step": 27900 }, { "epoch": 1.7757841827320735, "grad_norm": 0.0223388671875, "learning_rate": 4.569044134800959e-05, "loss": 0.0003, "step": 27910 }, { "epoch": 1.7764204364700644, "grad_norm": 0.375, "learning_rate": 4.5686906605020755e-05, "loss": 0.0003, "step": 27920 }, { "epoch": 1.7770566902080551, "grad_norm": 0.216796875, "learning_rate": 4.568337186203191e-05, "loss": 0.0005, "step": 27930 }, { "epoch": 1.7776929439460458, "grad_norm": 0.021484375, "learning_rate": 4.567983711904308e-05, "loss": 0.0002, "step": 27940 }, { "epoch": 1.7783291976840365, "grad_norm": 0.05029296875, "learning_rate": 4.567630237605424e-05, "loss": 0.0004, "step": 27950 }, { "epoch": 1.7789654514220272, "grad_norm": 0.00830078125, "learning_rate": 4.56727676330654e-05, "loss": 0.0001, "step": 27960 }, { "epoch": 1.779601705160018, "grad_norm": 0.004791259765625, "learning_rate": 4.566923289007656e-05, "loss": 0.0002, "step": 27970 }, { "epoch": 1.7802379588980086, "grad_norm": 0.36328125, "learning_rate": 4.5665698147087725e-05, "loss": 0.0006, "step": 27980 }, { "epoch": 1.7808742126359993, "grad_norm": 0.19921875, "learning_rate": 4.566216340409889e-05, "loss": 0.0012, "step": 27990 }, { "epoch": 1.78151046637399, "grad_norm": 0.0033111572265625, "learning_rate": 4.5658628661110056e-05, "loss": 0.0011, "step": 28000 }, { "epoch": 1.7821467201119807, "grad_norm": 5.5625, "learning_rate": 4.5655093918121214e-05, "loss": 0.006, "step": 28010 }, { "epoch": 1.7827829738499714, "grad_norm": 0.365234375, "learning_rate": 4.565155917513238e-05, "loss": 0.0018, "step": 28020 }, { "epoch": 1.7834192275879621, "grad_norm": 0.466796875, "learning_rate": 4.5648024432143545e-05, "loss": 0.0003, "step": 28030 }, { "epoch": 1.7840554813259528, "grad_norm": 0.004547119140625, "learning_rate": 4.56444896891547e-05, "loss": 0.0004, "step": 28040 }, { "epoch": 1.7846917350639435, "grad_norm": 0.025146484375, "learning_rate": 4.564095494616587e-05, "loss": 0.0002, "step": 28050 }, { "epoch": 1.7853279888019342, "grad_norm": 0.041748046875, "learning_rate": 4.5637420203177027e-05, "loss": 0.0002, "step": 28060 }, { "epoch": 1.785964242539925, "grad_norm": 0.0185546875, "learning_rate": 4.563388546018819e-05, "loss": 0.0181, "step": 28070 }, { "epoch": 1.7866004962779156, "grad_norm": 0.263671875, "learning_rate": 4.563035071719936e-05, "loss": 0.0023, "step": 28080 }, { "epoch": 1.7872367500159063, "grad_norm": 0.0048828125, "learning_rate": 4.5626815974210515e-05, "loss": 0.0005, "step": 28090 }, { "epoch": 1.787873003753897, "grad_norm": 0.005828857421875, "learning_rate": 4.562328123122168e-05, "loss": 0.0007, "step": 28100 }, { "epoch": 1.7885092574918877, "grad_norm": 0.0020599365234375, "learning_rate": 4.5619746488232846e-05, "loss": 0.0002, "step": 28110 }, { "epoch": 1.7891455112298784, "grad_norm": 0.0081787109375, "learning_rate": 4.5616211745244004e-05, "loss": 0.0004, "step": 28120 }, { "epoch": 1.7897817649678691, "grad_norm": 0.039794921875, "learning_rate": 4.561267700225517e-05, "loss": 0.0014, "step": 28130 }, { "epoch": 1.7904180187058598, "grad_norm": 0.01409912109375, "learning_rate": 4.560914225926633e-05, "loss": 0.0005, "step": 28140 }, { "epoch": 1.7910542724438505, "grad_norm": 0.01397705078125, "learning_rate": 4.560560751627749e-05, "loss": 0.0007, "step": 28150 }, { "epoch": 1.7916905261818412, "grad_norm": 0.037841796875, "learning_rate": 4.560207277328866e-05, "loss": 0.0003, "step": 28160 }, { "epoch": 1.792326779919832, "grad_norm": 0.59375, "learning_rate": 4.559853803029982e-05, "loss": 0.012, "step": 28170 }, { "epoch": 1.7929630336578226, "grad_norm": 0.98046875, "learning_rate": 4.559500328731098e-05, "loss": 0.0044, "step": 28180 }, { "epoch": 1.7935992873958133, "grad_norm": 2.609375, "learning_rate": 4.559146854432215e-05, "loss": 0.001, "step": 28190 }, { "epoch": 1.794235541133804, "grad_norm": 0.006988525390625, "learning_rate": 4.5587933801333305e-05, "loss": 0.0039, "step": 28200 }, { "epoch": 1.7948717948717947, "grad_norm": 2.6875, "learning_rate": 4.558439905834447e-05, "loss": 0.0025, "step": 28210 }, { "epoch": 1.7955080486097856, "grad_norm": 0.0172119140625, "learning_rate": 4.558086431535563e-05, "loss": 0.0001, "step": 28220 }, { "epoch": 1.7961443023477763, "grad_norm": 0.00153350830078125, "learning_rate": 4.5577329572366794e-05, "loss": 0.0041, "step": 28230 }, { "epoch": 1.796780556085767, "grad_norm": 0.068359375, "learning_rate": 4.557379482937796e-05, "loss": 0.0022, "step": 28240 }, { "epoch": 1.7974168098237577, "grad_norm": 0.0546875, "learning_rate": 4.5570260086389124e-05, "loss": 0.0003, "step": 28250 }, { "epoch": 1.7980530635617484, "grad_norm": 0.00799560546875, "learning_rate": 4.556672534340028e-05, "loss": 0.0004, "step": 28260 }, { "epoch": 1.7986893172997391, "grad_norm": 0.1416015625, "learning_rate": 4.556319060041145e-05, "loss": 0.0177, "step": 28270 }, { "epoch": 1.7993255710377298, "grad_norm": 0.048583984375, "learning_rate": 4.5559655857422606e-05, "loss": 0.0008, "step": 28280 }, { "epoch": 1.7999618247757205, "grad_norm": 0.09521484375, "learning_rate": 4.555612111443377e-05, "loss": 0.0007, "step": 28290 }, { "epoch": 1.8005980785137112, "grad_norm": 0.001495361328125, "learning_rate": 4.555258637144493e-05, "loss": 0.0018, "step": 28300 }, { "epoch": 1.801234332251702, "grad_norm": 0.0189208984375, "learning_rate": 4.5549051628456095e-05, "loss": 0.007, "step": 28310 }, { "epoch": 1.8018705859896929, "grad_norm": 0.0011138916015625, "learning_rate": 4.554551688546726e-05, "loss": 0.0001, "step": 28320 }, { "epoch": 1.8025068397276836, "grad_norm": 0.00372314453125, "learning_rate": 4.5541982142478426e-05, "loss": 0.0059, "step": 28330 }, { "epoch": 1.8031430934656743, "grad_norm": 0.0240478515625, "learning_rate": 4.5538447399489584e-05, "loss": 0.0008, "step": 28340 }, { "epoch": 1.803779347203665, "grad_norm": 0.0703125, "learning_rate": 4.553491265650075e-05, "loss": 0.0001, "step": 28350 }, { "epoch": 1.8044156009416557, "grad_norm": 0.010009765625, "learning_rate": 4.5531377913511914e-05, "loss": 0.001, "step": 28360 }, { "epoch": 1.8050518546796464, "grad_norm": 1.2265625, "learning_rate": 4.552784317052307e-05, "loss": 0.0014, "step": 28370 }, { "epoch": 1.805688108417637, "grad_norm": 0.00445556640625, "learning_rate": 4.552430842753424e-05, "loss": 0.0006, "step": 28380 }, { "epoch": 1.8063243621556277, "grad_norm": 0.0042724609375, "learning_rate": 4.5520773684545396e-05, "loss": 0.0001, "step": 28390 }, { "epoch": 1.8069606158936184, "grad_norm": 0.0771484375, "learning_rate": 4.551723894155656e-05, "loss": 0.0024, "step": 28400 }, { "epoch": 1.8075968696316091, "grad_norm": 0.0201416015625, "learning_rate": 4.551370419856773e-05, "loss": 0.0002, "step": 28410 }, { "epoch": 1.8082331233695998, "grad_norm": 0.0177001953125, "learning_rate": 4.5510169455578885e-05, "loss": 0.0012, "step": 28420 }, { "epoch": 1.8088693771075905, "grad_norm": 0.004638671875, "learning_rate": 4.550663471259005e-05, "loss": 0.0002, "step": 28430 }, { "epoch": 1.8095056308455812, "grad_norm": 0.006805419921875, "learning_rate": 4.5503099969601215e-05, "loss": 0.0001, "step": 28440 }, { "epoch": 1.810141884583572, "grad_norm": 0.0177001953125, "learning_rate": 4.5499565226612374e-05, "loss": 0.0009, "step": 28450 }, { "epoch": 1.8107781383215626, "grad_norm": 0.0029449462890625, "learning_rate": 4.549603048362354e-05, "loss": 0.0039, "step": 28460 }, { "epoch": 1.8114143920595533, "grad_norm": 0.0016021728515625, "learning_rate": 4.54924957406347e-05, "loss": 0.0016, "step": 28470 }, { "epoch": 1.812050645797544, "grad_norm": 0.0133056640625, "learning_rate": 4.548896099764586e-05, "loss": 0.0002, "step": 28480 }, { "epoch": 1.8126868995355347, "grad_norm": 0.04296875, "learning_rate": 4.548542625465703e-05, "loss": 0.0003, "step": 28490 }, { "epoch": 1.8133231532735254, "grad_norm": 0.043212890625, "learning_rate": 4.548189151166819e-05, "loss": 0.0002, "step": 28500 }, { "epoch": 1.8139594070115161, "grad_norm": 0.001129150390625, "learning_rate": 4.547835676867935e-05, "loss": 0.0188, "step": 28510 }, { "epoch": 1.8145956607495068, "grad_norm": 0.006378173828125, "learning_rate": 4.547482202569052e-05, "loss": 0.0045, "step": 28520 }, { "epoch": 1.8152319144874975, "grad_norm": 0.55078125, "learning_rate": 4.5471287282701675e-05, "loss": 0.0003, "step": 28530 }, { "epoch": 1.8158681682254882, "grad_norm": 0.00180816650390625, "learning_rate": 4.546775253971284e-05, "loss": 0.0006, "step": 28540 }, { "epoch": 1.816504421963479, "grad_norm": 0.003387451171875, "learning_rate": 4.5464217796724e-05, "loss": 0.0004, "step": 28550 }, { "epoch": 1.8171406757014696, "grad_norm": 0.0024261474609375, "learning_rate": 4.5460683053735164e-05, "loss": 0.0003, "step": 28560 }, { "epoch": 1.8177769294394603, "grad_norm": 0.515625, "learning_rate": 4.545714831074633e-05, "loss": 0.0007, "step": 28570 }, { "epoch": 1.818413183177451, "grad_norm": 0.875, "learning_rate": 4.5453613567757494e-05, "loss": 0.0008, "step": 28580 }, { "epoch": 1.8190494369154417, "grad_norm": 0.0035858154296875, "learning_rate": 4.545007882476865e-05, "loss": 0.0022, "step": 28590 }, { "epoch": 1.8196856906534324, "grad_norm": 0.004302978515625, "learning_rate": 4.544654408177982e-05, "loss": 0.0008, "step": 28600 }, { "epoch": 1.8203219443914231, "grad_norm": 0.004547119140625, "learning_rate": 4.5443009338790976e-05, "loss": 0.0001, "step": 28610 }, { "epoch": 1.820958198129414, "grad_norm": 0.0021820068359375, "learning_rate": 4.543947459580214e-05, "loss": 0.001, "step": 28620 }, { "epoch": 1.8215944518674048, "grad_norm": 0.06494140625, "learning_rate": 4.54359398528133e-05, "loss": 0.0003, "step": 28630 }, { "epoch": 1.8222307056053955, "grad_norm": 0.00408935546875, "learning_rate": 4.5432405109824465e-05, "loss": 0.0007, "step": 28640 }, { "epoch": 1.8228669593433862, "grad_norm": 0.034423828125, "learning_rate": 4.542887036683563e-05, "loss": 0.0011, "step": 28650 }, { "epoch": 1.8235032130813769, "grad_norm": 0.041259765625, "learning_rate": 4.5425335623846795e-05, "loss": 0.0008, "step": 28660 }, { "epoch": 1.8241394668193676, "grad_norm": 0.07763671875, "learning_rate": 4.5421800880857954e-05, "loss": 0.0002, "step": 28670 }, { "epoch": 1.8247757205573583, "grad_norm": 0.01019287109375, "learning_rate": 4.541826613786912e-05, "loss": 0.0006, "step": 28680 }, { "epoch": 1.825411974295349, "grad_norm": 0.005279541015625, "learning_rate": 4.541473139488028e-05, "loss": 0.0006, "step": 28690 }, { "epoch": 1.8260482280333397, "grad_norm": 0.0007781982421875, "learning_rate": 4.541119665189144e-05, "loss": 0.0079, "step": 28700 }, { "epoch": 1.8266844817713304, "grad_norm": 0.099609375, "learning_rate": 4.540766190890261e-05, "loss": 0.0002, "step": 28710 }, { "epoch": 1.8273207355093213, "grad_norm": 0.01806640625, "learning_rate": 4.5404127165913766e-05, "loss": 0.0004, "step": 28720 }, { "epoch": 1.827956989247312, "grad_norm": 0.01544189453125, "learning_rate": 4.540059242292493e-05, "loss": 0.0055, "step": 28730 }, { "epoch": 1.8285932429853027, "grad_norm": 0.0166015625, "learning_rate": 4.5397057679936097e-05, "loss": 0.0025, "step": 28740 }, { "epoch": 1.8292294967232934, "grad_norm": 0.173828125, "learning_rate": 4.5393522936947255e-05, "loss": 0.0026, "step": 28750 }, { "epoch": 1.829865750461284, "grad_norm": 0.28515625, "learning_rate": 4.538998819395842e-05, "loss": 0.0009, "step": 28760 }, { "epoch": 1.8305020041992748, "grad_norm": 0.43359375, "learning_rate": 4.5386453450969585e-05, "loss": 0.0027, "step": 28770 }, { "epoch": 1.8311382579372655, "grad_norm": 0.032470703125, "learning_rate": 4.5382918707980744e-05, "loss": 0.0012, "step": 28780 }, { "epoch": 1.8317745116752562, "grad_norm": 0.0291748046875, "learning_rate": 4.537938396499191e-05, "loss": 0.0015, "step": 28790 }, { "epoch": 1.8324107654132469, "grad_norm": 0.00982666015625, "learning_rate": 4.537584922200307e-05, "loss": 0.0009, "step": 28800 }, { "epoch": 1.8330470191512376, "grad_norm": 0.0089111328125, "learning_rate": 4.537231447901423e-05, "loss": 0.0004, "step": 28810 }, { "epoch": 1.8336832728892283, "grad_norm": 2.078125, "learning_rate": 4.53687797360254e-05, "loss": 0.0017, "step": 28820 }, { "epoch": 1.834319526627219, "grad_norm": 0.0014801025390625, "learning_rate": 4.536524499303656e-05, "loss": 0.0012, "step": 28830 }, { "epoch": 1.8349557803652097, "grad_norm": 0.004974365234375, "learning_rate": 4.536171025004772e-05, "loss": 0.0002, "step": 28840 }, { "epoch": 1.8355920341032004, "grad_norm": 0.0059814453125, "learning_rate": 4.5358175507058886e-05, "loss": 0.0007, "step": 28850 }, { "epoch": 1.836228287841191, "grad_norm": 0.01312255859375, "learning_rate": 4.5354640764070045e-05, "loss": 0.0018, "step": 28860 }, { "epoch": 1.8368645415791818, "grad_norm": 0.021484375, "learning_rate": 4.535110602108121e-05, "loss": 0.0036, "step": 28870 }, { "epoch": 1.8375007953171725, "grad_norm": 0.0033721923828125, "learning_rate": 4.534757127809237e-05, "loss": 0.0005, "step": 28880 }, { "epoch": 1.8381370490551632, "grad_norm": 0.0289306640625, "learning_rate": 4.5344036535103534e-05, "loss": 0.0027, "step": 28890 }, { "epoch": 1.8387733027931539, "grad_norm": 0.1376953125, "learning_rate": 4.53405017921147e-05, "loss": 0.0004, "step": 28900 }, { "epoch": 1.8394095565311446, "grad_norm": 0.02099609375, "learning_rate": 4.5336967049125864e-05, "loss": 0.0172, "step": 28910 }, { "epoch": 1.8400458102691353, "grad_norm": 0.041748046875, "learning_rate": 4.533343230613702e-05, "loss": 0.0006, "step": 28920 }, { "epoch": 1.840682064007126, "grad_norm": 0.00299072265625, "learning_rate": 4.532989756314819e-05, "loss": 0.0032, "step": 28930 }, { "epoch": 1.8413183177451167, "grad_norm": 0.0615234375, "learning_rate": 4.5326362820159346e-05, "loss": 0.0028, "step": 28940 }, { "epoch": 1.8419545714831074, "grad_norm": 0.0089111328125, "learning_rate": 4.532282807717051e-05, "loss": 0.0059, "step": 28950 }, { "epoch": 1.842590825221098, "grad_norm": 0.00665283203125, "learning_rate": 4.531929333418167e-05, "loss": 0.0005, "step": 28960 }, { "epoch": 1.8432270789590888, "grad_norm": 0.000591278076171875, "learning_rate": 4.5315758591192835e-05, "loss": 0.0004, "step": 28970 }, { "epoch": 1.8438633326970795, "grad_norm": 0.0040283203125, "learning_rate": 4.5312223848204e-05, "loss": 0.0002, "step": 28980 }, { "epoch": 1.8444995864350702, "grad_norm": 1.96875, "learning_rate": 4.5308689105215165e-05, "loss": 0.0012, "step": 28990 }, { "epoch": 1.8451358401730609, "grad_norm": 0.14453125, "learning_rate": 4.5305154362226324e-05, "loss": 0.0001, "step": 29000 }, { "epoch": 1.8457720939110516, "grad_norm": 0.06298828125, "learning_rate": 4.530161961923749e-05, "loss": 0.0057, "step": 29010 }, { "epoch": 1.8464083476490425, "grad_norm": 0.00494384765625, "learning_rate": 4.529808487624865e-05, "loss": 0.0001, "step": 29020 }, { "epoch": 1.8470446013870332, "grad_norm": 0.0028076171875, "learning_rate": 4.529455013325981e-05, "loss": 0.004, "step": 29030 }, { "epoch": 1.847680855125024, "grad_norm": 0.004730224609375, "learning_rate": 4.529101539027098e-05, "loss": 0.0004, "step": 29040 }, { "epoch": 1.8483171088630146, "grad_norm": 0.04248046875, "learning_rate": 4.5287480647282136e-05, "loss": 0.0044, "step": 29050 }, { "epoch": 1.8489533626010053, "grad_norm": 1.109375, "learning_rate": 4.52839459042933e-05, "loss": 0.0012, "step": 29060 }, { "epoch": 1.849589616338996, "grad_norm": 0.01092529296875, "learning_rate": 4.5280411161304466e-05, "loss": 0.0007, "step": 29070 }, { "epoch": 1.8502258700769867, "grad_norm": 0.482421875, "learning_rate": 4.5276876418315625e-05, "loss": 0.0231, "step": 29080 }, { "epoch": 1.8508621238149774, "grad_norm": 0.040283203125, "learning_rate": 4.527334167532679e-05, "loss": 0.0032, "step": 29090 }, { "epoch": 1.851498377552968, "grad_norm": 0.005340576171875, "learning_rate": 4.526980693233795e-05, "loss": 0.0013, "step": 29100 }, { "epoch": 1.8521346312909588, "grad_norm": 0.09326171875, "learning_rate": 4.5266272189349114e-05, "loss": 0.001, "step": 29110 }, { "epoch": 1.8527708850289497, "grad_norm": 0.003875732421875, "learning_rate": 4.526273744636028e-05, "loss": 0.0005, "step": 29120 }, { "epoch": 1.8534071387669404, "grad_norm": 0.005157470703125, "learning_rate": 4.525920270337144e-05, "loss": 0.0012, "step": 29130 }, { "epoch": 1.854043392504931, "grad_norm": 0.057861328125, "learning_rate": 4.52556679603826e-05, "loss": 0.0001, "step": 29140 }, { "epoch": 1.8546796462429218, "grad_norm": 1.2734375, "learning_rate": 4.525213321739377e-05, "loss": 0.0012, "step": 29150 }, { "epoch": 1.8553158999809125, "grad_norm": 0.006805419921875, "learning_rate": 4.524859847440493e-05, "loss": 0.0011, "step": 29160 }, { "epoch": 1.8559521537189032, "grad_norm": 0.006591796875, "learning_rate": 4.524506373141609e-05, "loss": 0.0008, "step": 29170 }, { "epoch": 1.856588407456894, "grad_norm": 0.365234375, "learning_rate": 4.524152898842725e-05, "loss": 0.0004, "step": 29180 }, { "epoch": 1.8572246611948846, "grad_norm": 0.00162506103515625, "learning_rate": 4.5237994245438415e-05, "loss": 0.001, "step": 29190 }, { "epoch": 1.8578609149328753, "grad_norm": 0.30078125, "learning_rate": 4.523445950244958e-05, "loss": 0.0014, "step": 29200 }, { "epoch": 1.858497168670866, "grad_norm": 0.515625, "learning_rate": 4.523092475946074e-05, "loss": 0.0045, "step": 29210 }, { "epoch": 1.8591334224088567, "grad_norm": 0.2216796875, "learning_rate": 4.5227390016471903e-05, "loss": 0.0002, "step": 29220 }, { "epoch": 1.8597696761468474, "grad_norm": 2.1875, "learning_rate": 4.522385527348307e-05, "loss": 0.0028, "step": 29230 }, { "epoch": 1.860405929884838, "grad_norm": 1.09375, "learning_rate": 4.5220320530494234e-05, "loss": 0.0068, "step": 29240 }, { "epoch": 1.8610421836228288, "grad_norm": 0.0341796875, "learning_rate": 4.521678578750539e-05, "loss": 0.0036, "step": 29250 }, { "epoch": 1.8616784373608195, "grad_norm": 0.005706787109375, "learning_rate": 4.521325104451656e-05, "loss": 0.0003, "step": 29260 }, { "epoch": 1.8623146910988102, "grad_norm": 0.73046875, "learning_rate": 4.5209716301527716e-05, "loss": 0.0007, "step": 29270 }, { "epoch": 1.862950944836801, "grad_norm": 0.189453125, "learning_rate": 4.520618155853888e-05, "loss": 0.0046, "step": 29280 }, { "epoch": 1.8635871985747916, "grad_norm": 0.0303955078125, "learning_rate": 4.520264681555004e-05, "loss": 0.001, "step": 29290 }, { "epoch": 1.8642234523127823, "grad_norm": 0.1416015625, "learning_rate": 4.5199112072561205e-05, "loss": 0.0002, "step": 29300 }, { "epoch": 1.864859706050773, "grad_norm": 0.0021514892578125, "learning_rate": 4.519557732957237e-05, "loss": 0.0023, "step": 29310 }, { "epoch": 1.8654959597887637, "grad_norm": 0.01165771484375, "learning_rate": 4.5192042586583535e-05, "loss": 0.0008, "step": 29320 }, { "epoch": 1.8661322135267544, "grad_norm": 0.004241943359375, "learning_rate": 4.518850784359469e-05, "loss": 0.0003, "step": 29330 }, { "epoch": 1.866768467264745, "grad_norm": 0.0035552978515625, "learning_rate": 4.518497310060586e-05, "loss": 0.0002, "step": 29340 }, { "epoch": 1.8674047210027358, "grad_norm": 0.0057373046875, "learning_rate": 4.518143835761702e-05, "loss": 0.0065, "step": 29350 }, { "epoch": 1.8680409747407265, "grad_norm": 0.00408935546875, "learning_rate": 4.517790361462818e-05, "loss": 0.0004, "step": 29360 }, { "epoch": 1.8686772284787172, "grad_norm": 0.056396484375, "learning_rate": 4.517436887163935e-05, "loss": 0.0006, "step": 29370 }, { "epoch": 1.869313482216708, "grad_norm": 0.197265625, "learning_rate": 4.5170834128650506e-05, "loss": 0.0041, "step": 29380 }, { "epoch": 1.8699497359546986, "grad_norm": 0.06982421875, "learning_rate": 4.516729938566167e-05, "loss": 0.0017, "step": 29390 }, { "epoch": 1.8705859896926893, "grad_norm": 0.0673828125, "learning_rate": 4.5163764642672836e-05, "loss": 0.0003, "step": 29400 }, { "epoch": 1.87122224343068, "grad_norm": 0.1552734375, "learning_rate": 4.5160229899683995e-05, "loss": 0.0031, "step": 29410 }, { "epoch": 1.871858497168671, "grad_norm": 0.0245361328125, "learning_rate": 4.515669515669516e-05, "loss": 0.0007, "step": 29420 }, { "epoch": 1.8724947509066616, "grad_norm": 0.000835418701171875, "learning_rate": 4.515316041370632e-05, "loss": 0.0002, "step": 29430 }, { "epoch": 1.8731310046446523, "grad_norm": 0.0184326171875, "learning_rate": 4.514962567071748e-05, "loss": 0.0006, "step": 29440 }, { "epoch": 1.873767258382643, "grad_norm": 0.003021240234375, "learning_rate": 4.514609092772865e-05, "loss": 0.0001, "step": 29450 }, { "epoch": 1.8744035121206337, "grad_norm": 0.0107421875, "learning_rate": 4.514255618473981e-05, "loss": 0.0003, "step": 29460 }, { "epoch": 1.8750397658586244, "grad_norm": 0.0478515625, "learning_rate": 4.513902144175097e-05, "loss": 0.0011, "step": 29470 }, { "epoch": 1.8756760195966151, "grad_norm": 0.00151824951171875, "learning_rate": 4.513548669876214e-05, "loss": 0.0003, "step": 29480 }, { "epoch": 1.8763122733346058, "grad_norm": 0.00189208984375, "learning_rate": 4.51319519557733e-05, "loss": 0.0006, "step": 29490 }, { "epoch": 1.8769485270725965, "grad_norm": 0.018798828125, "learning_rate": 4.512841721278446e-05, "loss": 0.0001, "step": 29500 }, { "epoch": 1.8775847808105872, "grad_norm": 0.07666015625, "learning_rate": 4.512488246979562e-05, "loss": 0.0005, "step": 29510 }, { "epoch": 1.8782210345485781, "grad_norm": 0.004852294921875, "learning_rate": 4.5121347726806784e-05, "loss": 0.0119, "step": 29520 }, { "epoch": 1.8788572882865688, "grad_norm": 0.09326171875, "learning_rate": 4.511781298381795e-05, "loss": 0.0005, "step": 29530 }, { "epoch": 1.8794935420245595, "grad_norm": 0.1806640625, "learning_rate": 4.511427824082911e-05, "loss": 0.0004, "step": 29540 }, { "epoch": 1.8801297957625502, "grad_norm": 0.002471923828125, "learning_rate": 4.511074349784027e-05, "loss": 0.0005, "step": 29550 }, { "epoch": 1.880766049500541, "grad_norm": 0.66796875, "learning_rate": 4.510720875485144e-05, "loss": 0.0018, "step": 29560 }, { "epoch": 1.8814023032385316, "grad_norm": 0.01336669921875, "learning_rate": 4.5103674011862604e-05, "loss": 0.0004, "step": 29570 }, { "epoch": 1.8820385569765223, "grad_norm": 0.07177734375, "learning_rate": 4.510013926887376e-05, "loss": 0.001, "step": 29580 }, { "epoch": 1.882674810714513, "grad_norm": 0.004638671875, "learning_rate": 4.509660452588492e-05, "loss": 0.0003, "step": 29590 }, { "epoch": 1.8833110644525037, "grad_norm": 0.00958251953125, "learning_rate": 4.5093069782896086e-05, "loss": 0.0035, "step": 29600 }, { "epoch": 1.8839473181904944, "grad_norm": 0.05810546875, "learning_rate": 4.508953503990725e-05, "loss": 0.0036, "step": 29610 }, { "epoch": 1.8845835719284851, "grad_norm": 0.013671875, "learning_rate": 4.508600029691841e-05, "loss": 0.0006, "step": 29620 }, { "epoch": 1.8852198256664758, "grad_norm": 0.030517578125, "learning_rate": 4.5082465553929574e-05, "loss": 0.0006, "step": 29630 }, { "epoch": 1.8858560794044665, "grad_norm": 0.08349609375, "learning_rate": 4.507893081094074e-05, "loss": 0.0004, "step": 29640 }, { "epoch": 1.8864923331424572, "grad_norm": 0.0311279296875, "learning_rate": 4.5075396067951905e-05, "loss": 0.0037, "step": 29650 }, { "epoch": 1.887128586880448, "grad_norm": 0.00107574462890625, "learning_rate": 4.507186132496306e-05, "loss": 0.0007, "step": 29660 }, { "epoch": 1.8877648406184386, "grad_norm": 0.263671875, "learning_rate": 4.506832658197423e-05, "loss": 0.0008, "step": 29670 }, { "epoch": 1.8884010943564293, "grad_norm": 0.013427734375, "learning_rate": 4.506479183898539e-05, "loss": 0.0002, "step": 29680 }, { "epoch": 1.88903734809442, "grad_norm": 0.039794921875, "learning_rate": 4.506125709599655e-05, "loss": 0.005, "step": 29690 }, { "epoch": 1.8896736018324107, "grad_norm": 0.0240478515625, "learning_rate": 4.505772235300772e-05, "loss": 0.0006, "step": 29700 }, { "epoch": 1.8903098555704014, "grad_norm": 0.01031494140625, "learning_rate": 4.5054187610018876e-05, "loss": 0.0001, "step": 29710 }, { "epoch": 1.8909461093083921, "grad_norm": 0.0030517578125, "learning_rate": 4.505065286703004e-05, "loss": 0.0001, "step": 29720 }, { "epoch": 1.8915823630463828, "grad_norm": 0.0419921875, "learning_rate": 4.5047118124041206e-05, "loss": 0.0024, "step": 29730 }, { "epoch": 1.8922186167843735, "grad_norm": 0.0108642578125, "learning_rate": 4.504358338105237e-05, "loss": 0.0005, "step": 29740 }, { "epoch": 1.8928548705223642, "grad_norm": 0.044189453125, "learning_rate": 4.504004863806353e-05, "loss": 0.002, "step": 29750 }, { "epoch": 1.893491124260355, "grad_norm": 0.01904296875, "learning_rate": 4.503651389507469e-05, "loss": 0.0002, "step": 29760 }, { "epoch": 1.8941273779983456, "grad_norm": 0.0177001953125, "learning_rate": 4.503297915208585e-05, "loss": 0.0002, "step": 29770 }, { "epoch": 1.8947636317363363, "grad_norm": 0.05322265625, "learning_rate": 4.502944440909702e-05, "loss": 0.0004, "step": 29780 }, { "epoch": 1.895399885474327, "grad_norm": 0.025390625, "learning_rate": 4.502590966610818e-05, "loss": 0.0109, "step": 29790 }, { "epoch": 1.8960361392123177, "grad_norm": 0.031005859375, "learning_rate": 4.502237492311934e-05, "loss": 0.0002, "step": 29800 }, { "epoch": 1.8966723929503084, "grad_norm": 0.03515625, "learning_rate": 4.501884018013051e-05, "loss": 0.0009, "step": 29810 }, { "epoch": 1.8973086466882991, "grad_norm": 1.34375, "learning_rate": 4.501530543714167e-05, "loss": 0.0142, "step": 29820 }, { "epoch": 1.89794490042629, "grad_norm": 0.0203857421875, "learning_rate": 4.501177069415283e-05, "loss": 0.0008, "step": 29830 }, { "epoch": 1.8985811541642807, "grad_norm": 0.003631591796875, "learning_rate": 4.500823595116399e-05, "loss": 0.0001, "step": 29840 }, { "epoch": 1.8992174079022714, "grad_norm": 0.013916015625, "learning_rate": 4.5004701208175154e-05, "loss": 0.0012, "step": 29850 }, { "epoch": 1.8998536616402621, "grad_norm": 0.1826171875, "learning_rate": 4.500116646518632e-05, "loss": 0.0091, "step": 29860 }, { "epoch": 1.9004899153782528, "grad_norm": 0.369140625, "learning_rate": 4.499763172219748e-05, "loss": 0.0003, "step": 29870 }, { "epoch": 1.9011261691162435, "grad_norm": 0.0054931640625, "learning_rate": 4.499409697920864e-05, "loss": 0.0005, "step": 29880 }, { "epoch": 1.9017624228542342, "grad_norm": 0.01220703125, "learning_rate": 4.499056223621981e-05, "loss": 0.0001, "step": 29890 }, { "epoch": 1.902398676592225, "grad_norm": 0.306640625, "learning_rate": 4.4987027493230973e-05, "loss": 0.0003, "step": 29900 }, { "epoch": 1.9030349303302156, "grad_norm": 0.1181640625, "learning_rate": 4.498349275024213e-05, "loss": 0.0057, "step": 29910 }, { "epoch": 1.9036711840682063, "grad_norm": 0.029541015625, "learning_rate": 4.497995800725329e-05, "loss": 0.0013, "step": 29920 }, { "epoch": 1.9043074378061973, "grad_norm": 0.002899169921875, "learning_rate": 4.4976423264264455e-05, "loss": 0.0002, "step": 29930 }, { "epoch": 1.904943691544188, "grad_norm": 0.0238037109375, "learning_rate": 4.497288852127562e-05, "loss": 0.0011, "step": 29940 }, { "epoch": 1.9055799452821787, "grad_norm": 0.2216796875, "learning_rate": 4.496935377828678e-05, "loss": 0.0004, "step": 29950 }, { "epoch": 1.9062161990201694, "grad_norm": 0.032470703125, "learning_rate": 4.4965819035297944e-05, "loss": 0.0038, "step": 29960 }, { "epoch": 1.90685245275816, "grad_norm": 3.390625, "learning_rate": 4.496228429230911e-05, "loss": 0.0193, "step": 29970 }, { "epoch": 1.9074887064961508, "grad_norm": 0.006072998046875, "learning_rate": 4.4958749549320275e-05, "loss": 0.0007, "step": 29980 }, { "epoch": 1.9081249602341415, "grad_norm": 0.08251953125, "learning_rate": 4.495521480633143e-05, "loss": 0.0021, "step": 29990 }, { "epoch": 1.9087612139721322, "grad_norm": 0.06396484375, "learning_rate": 4.495168006334259e-05, "loss": 0.0018, "step": 30000 }, { "epoch": 1.9093974677101229, "grad_norm": 0.1923828125, "learning_rate": 4.4948145320353757e-05, "loss": 0.0022, "step": 30010 }, { "epoch": 1.9100337214481136, "grad_norm": 0.006561279296875, "learning_rate": 4.494461057736492e-05, "loss": 0.0003, "step": 30020 }, { "epoch": 1.9106699751861043, "grad_norm": 0.00543212890625, "learning_rate": 4.494107583437609e-05, "loss": 0.0112, "step": 30030 }, { "epoch": 1.911306228924095, "grad_norm": 0.004425048828125, "learning_rate": 4.4937541091387245e-05, "loss": 0.0002, "step": 30040 }, { "epoch": 1.9119424826620857, "grad_norm": 0.00830078125, "learning_rate": 4.493400634839841e-05, "loss": 0.0003, "step": 30050 }, { "epoch": 1.9125787364000764, "grad_norm": 0.0027313232421875, "learning_rate": 4.4930471605409576e-05, "loss": 0.0024, "step": 30060 }, { "epoch": 1.913214990138067, "grad_norm": 0.00159454345703125, "learning_rate": 4.492693686242074e-05, "loss": 0.0005, "step": 30070 }, { "epoch": 1.9138512438760578, "grad_norm": 0.10986328125, "learning_rate": 4.49234021194319e-05, "loss": 0.0032, "step": 30080 }, { "epoch": 1.9144874976140485, "grad_norm": 0.014404296875, "learning_rate": 4.491986737644306e-05, "loss": 0.0007, "step": 30090 }, { "epoch": 1.9151237513520392, "grad_norm": 0.039306640625, "learning_rate": 4.491633263345422e-05, "loss": 0.0012, "step": 30100 }, { "epoch": 1.9157600050900299, "grad_norm": 0.005828857421875, "learning_rate": 4.491279789046539e-05, "loss": 0.0006, "step": 30110 }, { "epoch": 1.9163962588280206, "grad_norm": 0.07373046875, "learning_rate": 4.4909263147476547e-05, "loss": 0.0023, "step": 30120 }, { "epoch": 1.9170325125660113, "grad_norm": 0.0218505859375, "learning_rate": 4.490572840448771e-05, "loss": 0.0005, "step": 30130 }, { "epoch": 1.917668766304002, "grad_norm": 0.0027923583984375, "learning_rate": 4.490219366149888e-05, "loss": 0.0005, "step": 30140 }, { "epoch": 1.9183050200419927, "grad_norm": 0.010986328125, "learning_rate": 4.489865891851004e-05, "loss": 0.0001, "step": 30150 }, { "epoch": 1.9189412737799834, "grad_norm": 0.0093994140625, "learning_rate": 4.48951241755212e-05, "loss": 0.001, "step": 30160 }, { "epoch": 1.919577527517974, "grad_norm": 0.0098876953125, "learning_rate": 4.489158943253236e-05, "loss": 0.001, "step": 30170 }, { "epoch": 1.9202137812559648, "grad_norm": 0.0029296875, "learning_rate": 4.4888054689543524e-05, "loss": 0.0002, "step": 30180 }, { "epoch": 1.9208500349939555, "grad_norm": 0.00830078125, "learning_rate": 4.488451994655469e-05, "loss": 0.0004, "step": 30190 }, { "epoch": 1.9214862887319462, "grad_norm": 0.00119781494140625, "learning_rate": 4.488098520356585e-05, "loss": 0.0049, "step": 30200 }, { "epoch": 1.9221225424699369, "grad_norm": 1.34375, "learning_rate": 4.487745046057701e-05, "loss": 0.0012, "step": 30210 }, { "epoch": 1.9227587962079276, "grad_norm": 0.150390625, "learning_rate": 4.487391571758818e-05, "loss": 0.0013, "step": 30220 }, { "epoch": 1.9233950499459185, "grad_norm": 0.0019683837890625, "learning_rate": 4.487038097459934e-05, "loss": 0.0001, "step": 30230 }, { "epoch": 1.9240313036839092, "grad_norm": 0.002197265625, "learning_rate": 4.48668462316105e-05, "loss": 0.0007, "step": 30240 }, { "epoch": 1.9246675574218999, "grad_norm": 0.0038604736328125, "learning_rate": 4.486331148862166e-05, "loss": 0.0019, "step": 30250 }, { "epoch": 1.9253038111598906, "grad_norm": 0.0269775390625, "learning_rate": 4.4859776745632825e-05, "loss": 0.0002, "step": 30260 }, { "epoch": 1.9259400648978813, "grad_norm": 0.017822265625, "learning_rate": 4.485624200264399e-05, "loss": 0.0004, "step": 30270 }, { "epoch": 1.926576318635872, "grad_norm": 0.1025390625, "learning_rate": 4.4852707259655156e-05, "loss": 0.0001, "step": 30280 }, { "epoch": 1.9272125723738627, "grad_norm": 0.00286865234375, "learning_rate": 4.4849172516666314e-05, "loss": 0.0007, "step": 30290 }, { "epoch": 1.9278488261118534, "grad_norm": 1.6875, "learning_rate": 4.484563777367748e-05, "loss": 0.0052, "step": 30300 }, { "epoch": 1.928485079849844, "grad_norm": 0.0137939453125, "learning_rate": 4.4842103030688644e-05, "loss": 0.0001, "step": 30310 }, { "epoch": 1.9291213335878348, "grad_norm": 0.75390625, "learning_rate": 4.48385682876998e-05, "loss": 0.0005, "step": 30320 }, { "epoch": 1.9297575873258257, "grad_norm": 0.01611328125, "learning_rate": 4.483503354471096e-05, "loss": 0.0026, "step": 30330 }, { "epoch": 1.9303938410638164, "grad_norm": 0.00098419189453125, "learning_rate": 4.4831498801722126e-05, "loss": 0.0084, "step": 30340 }, { "epoch": 1.931030094801807, "grad_norm": 0.01324462890625, "learning_rate": 4.482796405873329e-05, "loss": 0.0004, "step": 30350 }, { "epoch": 1.9316663485397978, "grad_norm": 0.0194091796875, "learning_rate": 4.482442931574446e-05, "loss": 0.0028, "step": 30360 }, { "epoch": 1.9323026022777885, "grad_norm": 0.00182342529296875, "learning_rate": 4.4820894572755615e-05, "loss": 0.0018, "step": 30370 }, { "epoch": 1.9329388560157792, "grad_norm": 0.00238037109375, "learning_rate": 4.481735982976678e-05, "loss": 0.005, "step": 30380 }, { "epoch": 1.93357510975377, "grad_norm": 0.009033203125, "learning_rate": 4.4813825086777946e-05, "loss": 0.0017, "step": 30390 }, { "epoch": 1.9342113634917606, "grad_norm": 0.00787353515625, "learning_rate": 4.481029034378911e-05, "loss": 0.0008, "step": 30400 }, { "epoch": 1.9348476172297513, "grad_norm": 0.072265625, "learning_rate": 4.480675560080026e-05, "loss": 0.0007, "step": 30410 }, { "epoch": 1.935483870967742, "grad_norm": 0.047607421875, "learning_rate": 4.480322085781143e-05, "loss": 0.0032, "step": 30420 }, { "epoch": 1.9361201247057327, "grad_norm": 0.0034942626953125, "learning_rate": 4.479968611482259e-05, "loss": 0.0007, "step": 30430 }, { "epoch": 1.9367563784437234, "grad_norm": 0.0751953125, "learning_rate": 4.479615137183376e-05, "loss": 0.0002, "step": 30440 }, { "epoch": 1.937392632181714, "grad_norm": 0.1484375, "learning_rate": 4.4792616628844916e-05, "loss": 0.0004, "step": 30450 }, { "epoch": 1.9380288859197048, "grad_norm": 0.0185546875, "learning_rate": 4.478908188585608e-05, "loss": 0.0003, "step": 30460 }, { "epoch": 1.9386651396576955, "grad_norm": 0.0057373046875, "learning_rate": 4.478554714286725e-05, "loss": 0.0043, "step": 30470 }, { "epoch": 1.9393013933956862, "grad_norm": 0.021484375, "learning_rate": 4.478201239987841e-05, "loss": 0.0131, "step": 30480 }, { "epoch": 1.939937647133677, "grad_norm": 0.00188446044921875, "learning_rate": 4.477847765688957e-05, "loss": 0.0005, "step": 30490 }, { "epoch": 1.9405739008716676, "grad_norm": 0.003326416015625, "learning_rate": 4.477494291390073e-05, "loss": 0.0009, "step": 30500 }, { "epoch": 1.9412101546096583, "grad_norm": 0.0177001953125, "learning_rate": 4.4771408170911894e-05, "loss": 0.0002, "step": 30510 }, { "epoch": 1.941846408347649, "grad_norm": 0.0517578125, "learning_rate": 4.476787342792306e-05, "loss": 0.0004, "step": 30520 }, { "epoch": 1.9424826620856397, "grad_norm": 0.005706787109375, "learning_rate": 4.476433868493422e-05, "loss": 0.0012, "step": 30530 }, { "epoch": 1.9431189158236304, "grad_norm": 0.005950927734375, "learning_rate": 4.476080394194538e-05, "loss": 0.0004, "step": 30540 }, { "epoch": 1.943755169561621, "grad_norm": 0.01361083984375, "learning_rate": 4.475726919895655e-05, "loss": 0.0003, "step": 30550 }, { "epoch": 1.9443914232996118, "grad_norm": 0.0024871826171875, "learning_rate": 4.475373445596771e-05, "loss": 0.0024, "step": 30560 }, { "epoch": 1.9450276770376025, "grad_norm": 0.17578125, "learning_rate": 4.475019971297887e-05, "loss": 0.0003, "step": 30570 }, { "epoch": 1.9456639307755932, "grad_norm": 0.0206298828125, "learning_rate": 4.474666496999003e-05, "loss": 0.0004, "step": 30580 }, { "epoch": 1.9463001845135839, "grad_norm": 0.025634765625, "learning_rate": 4.4743130227001195e-05, "loss": 0.0005, "step": 30590 }, { "epoch": 1.9469364382515746, "grad_norm": 0.004241943359375, "learning_rate": 4.473959548401236e-05, "loss": 0.0007, "step": 30600 }, { "epoch": 1.9475726919895653, "grad_norm": 0.01129150390625, "learning_rate": 4.4736060741023525e-05, "loss": 0.0017, "step": 30610 }, { "epoch": 1.948208945727556, "grad_norm": 0.00830078125, "learning_rate": 4.4732525998034684e-05, "loss": 0.0004, "step": 30620 }, { "epoch": 1.948845199465547, "grad_norm": 0.0067138671875, "learning_rate": 4.472899125504585e-05, "loss": 0.0001, "step": 30630 }, { "epoch": 1.9494814532035376, "grad_norm": 0.3046875, "learning_rate": 4.4725456512057014e-05, "loss": 0.0005, "step": 30640 }, { "epoch": 1.9501177069415283, "grad_norm": 0.31640625, "learning_rate": 4.472192176906817e-05, "loss": 0.0027, "step": 30650 }, { "epoch": 1.950753960679519, "grad_norm": 0.0419921875, "learning_rate": 4.471838702607933e-05, "loss": 0.0004, "step": 30660 }, { "epoch": 1.9513902144175097, "grad_norm": 0.0159912109375, "learning_rate": 4.4714852283090496e-05, "loss": 0.0003, "step": 30670 }, { "epoch": 1.9520264681555004, "grad_norm": 0.388671875, "learning_rate": 4.471131754010166e-05, "loss": 0.0013, "step": 30680 }, { "epoch": 1.952662721893491, "grad_norm": 0.0048828125, "learning_rate": 4.4707782797112827e-05, "loss": 0.0004, "step": 30690 }, { "epoch": 1.9532989756314818, "grad_norm": 0.03125, "learning_rate": 4.4704248054123985e-05, "loss": 0.0018, "step": 30700 }, { "epoch": 1.9539352293694725, "grad_norm": 0.01458740234375, "learning_rate": 4.470071331113515e-05, "loss": 0.0005, "step": 30710 }, { "epoch": 1.9545714831074632, "grad_norm": 1.4140625, "learning_rate": 4.4697178568146315e-05, "loss": 0.0008, "step": 30720 }, { "epoch": 1.9552077368454541, "grad_norm": 0.0045166015625, "learning_rate": 4.469364382515748e-05, "loss": 0.0002, "step": 30730 }, { "epoch": 1.9558439905834448, "grad_norm": 0.001678466796875, "learning_rate": 4.469010908216863e-05, "loss": 0.0004, "step": 30740 }, { "epoch": 1.9564802443214355, "grad_norm": 0.040283203125, "learning_rate": 4.46865743391798e-05, "loss": 0.0005, "step": 30750 }, { "epoch": 1.9571164980594262, "grad_norm": 0.0028228759765625, "learning_rate": 4.468303959619096e-05, "loss": 0.001, "step": 30760 }, { "epoch": 1.957752751797417, "grad_norm": 0.00148773193359375, "learning_rate": 4.467950485320213e-05, "loss": 0.0007, "step": 30770 }, { "epoch": 1.9583890055354076, "grad_norm": 0.08203125, "learning_rate": 4.4675970110213286e-05, "loss": 0.0004, "step": 30780 }, { "epoch": 1.9590252592733983, "grad_norm": 0.0016021728515625, "learning_rate": 4.467243536722445e-05, "loss": 0.0004, "step": 30790 }, { "epoch": 1.959661513011389, "grad_norm": 0.0029144287109375, "learning_rate": 4.4668900624235617e-05, "loss": 0.0052, "step": 30800 }, { "epoch": 1.9602977667493797, "grad_norm": 0.00482177734375, "learning_rate": 4.466536588124678e-05, "loss": 0.0016, "step": 30810 }, { "epoch": 1.9609340204873704, "grad_norm": 0.09326171875, "learning_rate": 4.466183113825794e-05, "loss": 0.0004, "step": 30820 }, { "epoch": 1.9615702742253611, "grad_norm": 0.0927734375, "learning_rate": 4.46582963952691e-05, "loss": 0.0007, "step": 30830 }, { "epoch": 1.9622065279633518, "grad_norm": 0.158203125, "learning_rate": 4.4654761652280264e-05, "loss": 0.0005, "step": 30840 }, { "epoch": 1.9628427817013425, "grad_norm": 0.0023651123046875, "learning_rate": 4.465122690929143e-05, "loss": 0.0034, "step": 30850 }, { "epoch": 1.9634790354393332, "grad_norm": 0.255859375, "learning_rate": 4.464769216630259e-05, "loss": 0.0003, "step": 30860 }, { "epoch": 1.964115289177324, "grad_norm": 0.004150390625, "learning_rate": 4.464415742331375e-05, "loss": 0.0007, "step": 30870 }, { "epoch": 1.9647515429153146, "grad_norm": 0.000591278076171875, "learning_rate": 4.464062268032492e-05, "loss": 0.0001, "step": 30880 }, { "epoch": 1.9653877966533053, "grad_norm": 2.96875, "learning_rate": 4.463708793733608e-05, "loss": 0.0077, "step": 30890 }, { "epoch": 1.966024050391296, "grad_norm": 0.0007171630859375, "learning_rate": 4.463355319434724e-05, "loss": 0.0044, "step": 30900 }, { "epoch": 1.9666603041292867, "grad_norm": 0.0054931640625, "learning_rate": 4.46300184513584e-05, "loss": 0.0016, "step": 30910 }, { "epoch": 1.9672965578672774, "grad_norm": 0.05810546875, "learning_rate": 4.4626483708369565e-05, "loss": 0.0003, "step": 30920 }, { "epoch": 1.9679328116052681, "grad_norm": 0.00058746337890625, "learning_rate": 4.462294896538073e-05, "loss": 0.0004, "step": 30930 }, { "epoch": 1.9685690653432588, "grad_norm": 0.000560760498046875, "learning_rate": 4.4619414222391895e-05, "loss": 0.0002, "step": 30940 }, { "epoch": 1.9692053190812495, "grad_norm": 0.056884765625, "learning_rate": 4.4615879479403054e-05, "loss": 0.0002, "step": 30950 }, { "epoch": 1.9698415728192402, "grad_norm": 0.006805419921875, "learning_rate": 4.461234473641422e-05, "loss": 0.0002, "step": 30960 }, { "epoch": 1.970477826557231, "grad_norm": 0.00201416015625, "learning_rate": 4.4608809993425384e-05, "loss": 0.0075, "step": 30970 }, { "epoch": 1.9711140802952216, "grad_norm": 3.78125, "learning_rate": 4.460527525043654e-05, "loss": 0.0064, "step": 30980 }, { "epoch": 1.9717503340332123, "grad_norm": 0.01104736328125, "learning_rate": 4.46017405074477e-05, "loss": 0.0068, "step": 30990 }, { "epoch": 1.972386587771203, "grad_norm": 0.1513671875, "learning_rate": 4.4598205764458866e-05, "loss": 0.0004, "step": 31000 }, { "epoch": 1.9730228415091937, "grad_norm": 0.0048828125, "learning_rate": 4.459467102147003e-05, "loss": 0.0007, "step": 31010 }, { "epoch": 1.9736590952471844, "grad_norm": 0.04443359375, "learning_rate": 4.4591136278481196e-05, "loss": 0.0023, "step": 31020 }, { "epoch": 1.9742953489851753, "grad_norm": 0.36328125, "learning_rate": 4.4587601535492355e-05, "loss": 0.0004, "step": 31030 }, { "epoch": 1.974931602723166, "grad_norm": 3.640625, "learning_rate": 4.458406679250352e-05, "loss": 0.0076, "step": 31040 }, { "epoch": 1.9755678564611567, "grad_norm": 0.0133056640625, "learning_rate": 4.4580532049514685e-05, "loss": 0.001, "step": 31050 }, { "epoch": 1.9762041101991474, "grad_norm": 1.1640625, "learning_rate": 4.457699730652585e-05, "loss": 0.0041, "step": 31060 }, { "epoch": 1.9768403639371381, "grad_norm": 0.00933837890625, "learning_rate": 4.4573462563537e-05, "loss": 0.0014, "step": 31070 }, { "epoch": 1.9774766176751288, "grad_norm": 0.06396484375, "learning_rate": 4.456992782054817e-05, "loss": 0.0155, "step": 31080 }, { "epoch": 1.9781128714131195, "grad_norm": 0.0032196044921875, "learning_rate": 4.456639307755933e-05, "loss": 0.0001, "step": 31090 }, { "epoch": 1.9787491251511102, "grad_norm": 0.01422119140625, "learning_rate": 4.45628583345705e-05, "loss": 0.0009, "step": 31100 }, { "epoch": 1.979385378889101, "grad_norm": 0.0030059814453125, "learning_rate": 4.4559323591581656e-05, "loss": 0.0004, "step": 31110 }, { "epoch": 1.9800216326270916, "grad_norm": 0.15234375, "learning_rate": 4.455578884859282e-05, "loss": 0.0004, "step": 31120 }, { "epoch": 1.9806578863650826, "grad_norm": 0.00051116943359375, "learning_rate": 4.4552254105603986e-05, "loss": 0.003, "step": 31130 }, { "epoch": 1.9812941401030733, "grad_norm": 0.0390625, "learning_rate": 4.454871936261515e-05, "loss": 0.0009, "step": 31140 }, { "epoch": 1.981930393841064, "grad_norm": 0.0018463134765625, "learning_rate": 4.454518461962631e-05, "loss": 0.0005, "step": 31150 }, { "epoch": 1.9825666475790547, "grad_norm": 0.87890625, "learning_rate": 4.454164987663747e-05, "loss": 0.0055, "step": 31160 }, { "epoch": 1.9832029013170454, "grad_norm": 0.004791259765625, "learning_rate": 4.4538115133648634e-05, "loss": 0.0003, "step": 31170 }, { "epoch": 1.983839155055036, "grad_norm": 0.42578125, "learning_rate": 4.45345803906598e-05, "loss": 0.0082, "step": 31180 }, { "epoch": 1.9844754087930268, "grad_norm": 0.212890625, "learning_rate": 4.453104564767096e-05, "loss": 0.0007, "step": 31190 }, { "epoch": 1.9851116625310175, "grad_norm": 0.00193023681640625, "learning_rate": 4.452751090468212e-05, "loss": 0.0043, "step": 31200 }, { "epoch": 1.9857479162690082, "grad_norm": 0.00098419189453125, "learning_rate": 4.452397616169329e-05, "loss": 0.0001, "step": 31210 }, { "epoch": 1.9863841700069989, "grad_norm": 0.2275390625, "learning_rate": 4.452044141870445e-05, "loss": 0.0005, "step": 31220 }, { "epoch": 1.9870204237449896, "grad_norm": 0.0012054443359375, "learning_rate": 4.451690667571561e-05, "loss": 0.0006, "step": 31230 }, { "epoch": 1.9876566774829802, "grad_norm": 1.453125, "learning_rate": 4.451337193272677e-05, "loss": 0.0018, "step": 31240 }, { "epoch": 1.988292931220971, "grad_norm": 1.5078125, "learning_rate": 4.4509837189737935e-05, "loss": 0.0055, "step": 31250 }, { "epoch": 1.9889291849589616, "grad_norm": 0.031494140625, "learning_rate": 4.45063024467491e-05, "loss": 0.0007, "step": 31260 }, { "epoch": 1.9895654386969523, "grad_norm": 0.011474609375, "learning_rate": 4.4502767703760265e-05, "loss": 0.0001, "step": 31270 }, { "epoch": 1.990201692434943, "grad_norm": 0.09912109375, "learning_rate": 4.4499232960771423e-05, "loss": 0.0006, "step": 31280 }, { "epoch": 1.9908379461729337, "grad_norm": 0.00531005859375, "learning_rate": 4.449569821778259e-05, "loss": 0.0009, "step": 31290 }, { "epoch": 1.9914741999109244, "grad_norm": 0.019775390625, "learning_rate": 4.4492163474793754e-05, "loss": 0.0005, "step": 31300 }, { "epoch": 1.9921104536489151, "grad_norm": 0.0185546875, "learning_rate": 4.448862873180491e-05, "loss": 0.0006, "step": 31310 }, { "epoch": 1.9927467073869058, "grad_norm": 0.0096435546875, "learning_rate": 4.448509398881607e-05, "loss": 0.0004, "step": 31320 }, { "epoch": 1.9933829611248965, "grad_norm": 0.306640625, "learning_rate": 4.4481559245827236e-05, "loss": 0.0002, "step": 31330 }, { "epoch": 1.9940192148628872, "grad_norm": 0.56640625, "learning_rate": 4.44780245028384e-05, "loss": 0.0004, "step": 31340 }, { "epoch": 1.994655468600878, "grad_norm": 0.035400390625, "learning_rate": 4.4474489759849566e-05, "loss": 0.0006, "step": 31350 }, { "epoch": 1.9952917223388686, "grad_norm": 0.0400390625, "learning_rate": 4.4470955016860725e-05, "loss": 0.0003, "step": 31360 }, { "epoch": 1.9959279760768593, "grad_norm": 0.392578125, "learning_rate": 4.446742027387189e-05, "loss": 0.0007, "step": 31370 }, { "epoch": 1.99656422981485, "grad_norm": 0.003204345703125, "learning_rate": 4.4463885530883055e-05, "loss": 0.0014, "step": 31380 }, { "epoch": 1.9972004835528407, "grad_norm": 0.00982666015625, "learning_rate": 4.4460350787894213e-05, "loss": 0.0012, "step": 31390 }, { "epoch": 1.9978367372908314, "grad_norm": 0.044189453125, "learning_rate": 4.445681604490537e-05, "loss": 0.0001, "step": 31400 }, { "epoch": 1.9984729910288221, "grad_norm": 0.034423828125, "learning_rate": 4.445328130191654e-05, "loss": 0.0034, "step": 31410 }, { "epoch": 1.9991092447668128, "grad_norm": 0.001983642578125, "learning_rate": 4.44497465589277e-05, "loss": 0.0009, "step": 31420 }, { "epoch": 1.9997454985048038, "grad_norm": 0.061279296875, "learning_rate": 4.444621181593887e-05, "loss": 0.0033, "step": 31430 }, { "epoch": 2.0003817522427942, "grad_norm": 0.08544921875, "learning_rate": 4.4442677072950026e-05, "loss": 0.0001, "step": 31440 }, { "epoch": 2.001018005980785, "grad_norm": 0.0234375, "learning_rate": 4.443914232996119e-05, "loss": 0.0014, "step": 31450 }, { "epoch": 2.0016542597187756, "grad_norm": 0.017578125, "learning_rate": 4.4435607586972356e-05, "loss": 0.0002, "step": 31460 }, { "epoch": 2.0022905134567663, "grad_norm": 0.005401611328125, "learning_rate": 4.443207284398352e-05, "loss": 0.0004, "step": 31470 }, { "epoch": 2.0029267671947575, "grad_norm": 0.01385498046875, "learning_rate": 4.442853810099468e-05, "loss": 0.0002, "step": 31480 }, { "epoch": 2.003563020932748, "grad_norm": 0.026611328125, "learning_rate": 4.442500335800584e-05, "loss": 0.0034, "step": 31490 }, { "epoch": 2.004199274670739, "grad_norm": 0.51171875, "learning_rate": 4.4421468615017e-05, "loss": 0.0007, "step": 31500 }, { "epoch": 2.0048355284087296, "grad_norm": 0.2421875, "learning_rate": 4.441793387202817e-05, "loss": 0.0065, "step": 31510 }, { "epoch": 2.0054717821467203, "grad_norm": 0.09619140625, "learning_rate": 4.441439912903933e-05, "loss": 0.0013, "step": 31520 }, { "epoch": 2.006108035884711, "grad_norm": 0.0022735595703125, "learning_rate": 4.441086438605049e-05, "loss": 0.0007, "step": 31530 }, { "epoch": 2.0067442896227017, "grad_norm": 0.03955078125, "learning_rate": 4.440732964306166e-05, "loss": 0.0003, "step": 31540 }, { "epoch": 2.0073805433606924, "grad_norm": 0.045166015625, "learning_rate": 4.440379490007282e-05, "loss": 0.0004, "step": 31550 }, { "epoch": 2.008016797098683, "grad_norm": 0.10986328125, "learning_rate": 4.440026015708398e-05, "loss": 0.0003, "step": 31560 }, { "epoch": 2.008653050836674, "grad_norm": 0.01336669921875, "learning_rate": 4.439672541409514e-05, "loss": 0.002, "step": 31570 }, { "epoch": 2.0092893045746645, "grad_norm": 0.0859375, "learning_rate": 4.4393190671106304e-05, "loss": 0.0006, "step": 31580 }, { "epoch": 2.009925558312655, "grad_norm": 0.0135498046875, "learning_rate": 4.438965592811747e-05, "loss": 0.0003, "step": 31590 }, { "epoch": 2.010561812050646, "grad_norm": 0.09228515625, "learning_rate": 4.4386121185128635e-05, "loss": 0.0016, "step": 31600 }, { "epoch": 2.0111980657886366, "grad_norm": 0.0380859375, "learning_rate": 4.438258644213979e-05, "loss": 0.0002, "step": 31610 }, { "epoch": 2.0118343195266273, "grad_norm": 0.01251220703125, "learning_rate": 4.437905169915096e-05, "loss": 0.0003, "step": 31620 }, { "epoch": 2.012470573264618, "grad_norm": 0.0244140625, "learning_rate": 4.4375516956162124e-05, "loss": 0.007, "step": 31630 }, { "epoch": 2.0131068270026087, "grad_norm": 0.3125, "learning_rate": 4.437198221317328e-05, "loss": 0.0003, "step": 31640 }, { "epoch": 2.0137430807405994, "grad_norm": 0.000179290771484375, "learning_rate": 4.436844747018444e-05, "loss": 0.0097, "step": 31650 }, { "epoch": 2.01437933447859, "grad_norm": 0.019775390625, "learning_rate": 4.4364912727195606e-05, "loss": 0.0014, "step": 31660 }, { "epoch": 2.0150155882165808, "grad_norm": 0.006988525390625, "learning_rate": 4.436137798420677e-05, "loss": 0.0004, "step": 31670 }, { "epoch": 2.0156518419545715, "grad_norm": 0.04248046875, "learning_rate": 4.4357843241217936e-05, "loss": 0.0007, "step": 31680 }, { "epoch": 2.016288095692562, "grad_norm": 0.13671875, "learning_rate": 4.4354308498229094e-05, "loss": 0.0046, "step": 31690 }, { "epoch": 2.016924349430553, "grad_norm": 0.0184326171875, "learning_rate": 4.435077375524026e-05, "loss": 0.0003, "step": 31700 }, { "epoch": 2.0175606031685436, "grad_norm": 0.000850677490234375, "learning_rate": 4.4347239012251425e-05, "loss": 0.0001, "step": 31710 }, { "epoch": 2.0181968569065343, "grad_norm": 0.01019287109375, "learning_rate": 4.434370426926258e-05, "loss": 0.0001, "step": 31720 }, { "epoch": 2.018833110644525, "grad_norm": 0.185546875, "learning_rate": 4.434016952627374e-05, "loss": 0.0004, "step": 31730 }, { "epoch": 2.0194693643825157, "grad_norm": 0.0206298828125, "learning_rate": 4.433663478328491e-05, "loss": 0.0002, "step": 31740 }, { "epoch": 2.0201056181205064, "grad_norm": 0.0361328125, "learning_rate": 4.433310004029607e-05, "loss": 0.0001, "step": 31750 }, { "epoch": 2.020741871858497, "grad_norm": 0.004119873046875, "learning_rate": 4.432956529730724e-05, "loss": 0.0008, "step": 31760 }, { "epoch": 2.0213781255964878, "grad_norm": 0.0262451171875, "learning_rate": 4.4326030554318396e-05, "loss": 0.0023, "step": 31770 }, { "epoch": 2.0220143793344785, "grad_norm": 0.0106201171875, "learning_rate": 4.432249581132956e-05, "loss": 0.0001, "step": 31780 }, { "epoch": 2.022650633072469, "grad_norm": 0.00628662109375, "learning_rate": 4.4318961068340726e-05, "loss": 0.0008, "step": 31790 }, { "epoch": 2.02328688681046, "grad_norm": 0.0004253387451171875, "learning_rate": 4.4315426325351884e-05, "loss": 0.0001, "step": 31800 }, { "epoch": 2.0239231405484506, "grad_norm": 0.02294921875, "learning_rate": 4.431189158236305e-05, "loss": 0.0004, "step": 31810 }, { "epoch": 2.0245593942864413, "grad_norm": 0.00445556640625, "learning_rate": 4.430835683937421e-05, "loss": 0.0035, "step": 31820 }, { "epoch": 2.025195648024432, "grad_norm": 0.00909423828125, "learning_rate": 4.430482209638537e-05, "loss": 0.0003, "step": 31830 }, { "epoch": 2.0258319017624227, "grad_norm": 0.00164031982421875, "learning_rate": 4.430128735339654e-05, "loss": 0.0004, "step": 31840 }, { "epoch": 2.0264681555004134, "grad_norm": 0.0014495849609375, "learning_rate": 4.42977526104077e-05, "loss": 0.0002, "step": 31850 }, { "epoch": 2.027104409238404, "grad_norm": 2.296875, "learning_rate": 4.429421786741886e-05, "loss": 0.0014, "step": 31860 }, { "epoch": 2.0277406629763948, "grad_norm": 0.005218505859375, "learning_rate": 4.429068312443003e-05, "loss": 0.0004, "step": 31870 }, { "epoch": 2.028376916714386, "grad_norm": 0.0026702880859375, "learning_rate": 4.428714838144119e-05, "loss": 0.0009, "step": 31880 }, { "epoch": 2.0290131704523766, "grad_norm": 0.01251220703125, "learning_rate": 4.428361363845235e-05, "loss": 0.0014, "step": 31890 }, { "epoch": 2.0296494241903673, "grad_norm": 0.0771484375, "learning_rate": 4.428007889546351e-05, "loss": 0.0003, "step": 31900 }, { "epoch": 2.030285677928358, "grad_norm": 0.00421142578125, "learning_rate": 4.4276544152474674e-05, "loss": 0.0001, "step": 31910 }, { "epoch": 2.0309219316663487, "grad_norm": 0.00148773193359375, "learning_rate": 4.427300940948584e-05, "loss": 0.0007, "step": 31920 }, { "epoch": 2.0315581854043394, "grad_norm": 0.06201171875, "learning_rate": 4.4269474666497005e-05, "loss": 0.0003, "step": 31930 }, { "epoch": 2.03219443914233, "grad_norm": 0.0079345703125, "learning_rate": 4.426593992350816e-05, "loss": 0.0006, "step": 31940 }, { "epoch": 2.032830692880321, "grad_norm": 0.55078125, "learning_rate": 4.426240518051933e-05, "loss": 0.0003, "step": 31950 }, { "epoch": 2.0334669466183115, "grad_norm": 0.002532958984375, "learning_rate": 4.4258870437530493e-05, "loss": 0.0002, "step": 31960 }, { "epoch": 2.034103200356302, "grad_norm": 0.01092529296875, "learning_rate": 4.425533569454165e-05, "loss": 0.0002, "step": 31970 }, { "epoch": 2.034739454094293, "grad_norm": 0.004852294921875, "learning_rate": 4.425180095155281e-05, "loss": 0.0003, "step": 31980 }, { "epoch": 2.0353757078322836, "grad_norm": 0.000713348388671875, "learning_rate": 4.4248266208563975e-05, "loss": 0.0002, "step": 31990 }, { "epoch": 2.0360119615702743, "grad_norm": 0.007568359375, "learning_rate": 4.424473146557514e-05, "loss": 0.0006, "step": 32000 }, { "epoch": 2.036648215308265, "grad_norm": 0.0020751953125, "learning_rate": 4.4241196722586306e-05, "loss": 0.0013, "step": 32010 }, { "epoch": 2.0372844690462557, "grad_norm": 0.0164794921875, "learning_rate": 4.4237661979597464e-05, "loss": 0.001, "step": 32020 }, { "epoch": 2.0379207227842464, "grad_norm": 0.004638671875, "learning_rate": 4.423412723660863e-05, "loss": 0.0001, "step": 32030 }, { "epoch": 2.038556976522237, "grad_norm": 0.008056640625, "learning_rate": 4.4230592493619795e-05, "loss": 0.0002, "step": 32040 }, { "epoch": 2.039193230260228, "grad_norm": 0.126953125, "learning_rate": 4.422705775063095e-05, "loss": 0.0005, "step": 32050 }, { "epoch": 2.0398294839982185, "grad_norm": 0.009765625, "learning_rate": 4.422352300764211e-05, "loss": 0.0022, "step": 32060 }, { "epoch": 2.040465737736209, "grad_norm": 0.006195068359375, "learning_rate": 4.421998826465328e-05, "loss": 0.0001, "step": 32070 }, { "epoch": 2.0411019914742, "grad_norm": 0.255859375, "learning_rate": 4.421645352166444e-05, "loss": 0.0044, "step": 32080 }, { "epoch": 2.0417382452121906, "grad_norm": 0.0002765655517578125, "learning_rate": 4.421291877867561e-05, "loss": 0.0002, "step": 32090 }, { "epoch": 2.0423744989501813, "grad_norm": 0.00179290771484375, "learning_rate": 4.4209384035686765e-05, "loss": 0.0002, "step": 32100 }, { "epoch": 2.043010752688172, "grad_norm": 0.006744384765625, "learning_rate": 4.420584929269793e-05, "loss": 0.0002, "step": 32110 }, { "epoch": 2.0436470064261627, "grad_norm": 0.00543212890625, "learning_rate": 4.4202314549709096e-05, "loss": 0.0004, "step": 32120 }, { "epoch": 2.0442832601641534, "grad_norm": 0.0279541015625, "learning_rate": 4.4198779806720254e-05, "loss": 0.0001, "step": 32130 }, { "epoch": 2.044919513902144, "grad_norm": 0.05029296875, "learning_rate": 4.419524506373142e-05, "loss": 0.0001, "step": 32140 }, { "epoch": 2.045555767640135, "grad_norm": 0.953125, "learning_rate": 4.419171032074258e-05, "loss": 0.0046, "step": 32150 }, { "epoch": 2.0461920213781255, "grad_norm": 0.0014495849609375, "learning_rate": 4.418817557775374e-05, "loss": 0.0002, "step": 32160 }, { "epoch": 2.046828275116116, "grad_norm": 0.0703125, "learning_rate": 4.418464083476491e-05, "loss": 0.0013, "step": 32170 }, { "epoch": 2.047464528854107, "grad_norm": 0.036865234375, "learning_rate": 4.418110609177607e-05, "loss": 0.0013, "step": 32180 }, { "epoch": 2.0481007825920976, "grad_norm": 0.02294921875, "learning_rate": 4.417757134878723e-05, "loss": 0.0006, "step": 32190 }, { "epoch": 2.0487370363300883, "grad_norm": 0.07666015625, "learning_rate": 4.41740366057984e-05, "loss": 0.0002, "step": 32200 }, { "epoch": 2.049373290068079, "grad_norm": 0.004913330078125, "learning_rate": 4.4170501862809555e-05, "loss": 0.0009, "step": 32210 }, { "epoch": 2.0500095438060697, "grad_norm": 0.039306640625, "learning_rate": 4.416696711982072e-05, "loss": 0.0003, "step": 32220 }, { "epoch": 2.0506457975440604, "grad_norm": 0.00946044921875, "learning_rate": 4.416343237683188e-05, "loss": 0.0004, "step": 32230 }, { "epoch": 2.051282051282051, "grad_norm": 0.0390625, "learning_rate": 4.4159897633843044e-05, "loss": 0.0032, "step": 32240 }, { "epoch": 2.051918305020042, "grad_norm": 0.040771484375, "learning_rate": 4.415636289085421e-05, "loss": 0.0004, "step": 32250 }, { "epoch": 2.0525545587580325, "grad_norm": 0.00457763671875, "learning_rate": 4.4152828147865374e-05, "loss": 0.0001, "step": 32260 }, { "epoch": 2.053190812496023, "grad_norm": 0.26953125, "learning_rate": 4.414929340487653e-05, "loss": 0.0003, "step": 32270 }, { "epoch": 2.0538270662340143, "grad_norm": 0.0026092529296875, "learning_rate": 4.41457586618877e-05, "loss": 0.0083, "step": 32280 }, { "epoch": 2.054463319972005, "grad_norm": 0.000823974609375, "learning_rate": 4.4142223918898856e-05, "loss": 0.0001, "step": 32290 }, { "epoch": 2.0550995737099957, "grad_norm": 0.005157470703125, "learning_rate": 4.413868917591002e-05, "loss": 0.0027, "step": 32300 }, { "epoch": 2.0557358274479864, "grad_norm": 0.0201416015625, "learning_rate": 4.413515443292118e-05, "loss": 0.0005, "step": 32310 }, { "epoch": 2.056372081185977, "grad_norm": 0.0272216796875, "learning_rate": 4.4131619689932345e-05, "loss": 0.0005, "step": 32320 }, { "epoch": 2.057008334923968, "grad_norm": 0.22265625, "learning_rate": 4.412808494694351e-05, "loss": 0.0008, "step": 32330 }, { "epoch": 2.0576445886619585, "grad_norm": 0.002716064453125, "learning_rate": 4.4124550203954676e-05, "loss": 0.0004, "step": 32340 }, { "epoch": 2.0582808423999492, "grad_norm": 0.00408935546875, "learning_rate": 4.4121015460965834e-05, "loss": 0.0001, "step": 32350 }, { "epoch": 2.05891709613794, "grad_norm": 0.06201171875, "learning_rate": 4.4117480717977e-05, "loss": 0.0008, "step": 32360 }, { "epoch": 2.0595533498759306, "grad_norm": 0.158203125, "learning_rate": 4.4113945974988164e-05, "loss": 0.0003, "step": 32370 }, { "epoch": 2.0601896036139213, "grad_norm": 0.00186920166015625, "learning_rate": 4.411041123199932e-05, "loss": 0.004, "step": 32380 }, { "epoch": 2.060825857351912, "grad_norm": 0.0093994140625, "learning_rate": 4.410687648901049e-05, "loss": 0.0001, "step": 32390 }, { "epoch": 2.0614621110899027, "grad_norm": 0.0074462890625, "learning_rate": 4.4103341746021646e-05, "loss": 0.0014, "step": 32400 }, { "epoch": 2.0620983648278934, "grad_norm": 0.01373291015625, "learning_rate": 4.409980700303281e-05, "loss": 0.0001, "step": 32410 }, { "epoch": 2.062734618565884, "grad_norm": 0.011474609375, "learning_rate": 4.409627226004398e-05, "loss": 0.0005, "step": 32420 }, { "epoch": 2.063370872303875, "grad_norm": 0.0027618408203125, "learning_rate": 4.4092737517055135e-05, "loss": 0.0001, "step": 32430 }, { "epoch": 2.0640071260418655, "grad_norm": 0.01519775390625, "learning_rate": 4.40892027740663e-05, "loss": 0.0007, "step": 32440 }, { "epoch": 2.0646433797798562, "grad_norm": 0.1708984375, "learning_rate": 4.4085668031077466e-05, "loss": 0.0006, "step": 32450 }, { "epoch": 2.065279633517847, "grad_norm": 0.1025390625, "learning_rate": 4.4082133288088624e-05, "loss": 0.0034, "step": 32460 }, { "epoch": 2.0659158872558376, "grad_norm": 0.011474609375, "learning_rate": 4.407859854509979e-05, "loss": 0.0001, "step": 32470 }, { "epoch": 2.0665521409938283, "grad_norm": 0.1337890625, "learning_rate": 4.407506380211095e-05, "loss": 0.0008, "step": 32480 }, { "epoch": 2.067188394731819, "grad_norm": 0.0118408203125, "learning_rate": 4.407152905912211e-05, "loss": 0.0003, "step": 32490 }, { "epoch": 2.0678246484698097, "grad_norm": 0.53515625, "learning_rate": 4.406799431613328e-05, "loss": 0.0012, "step": 32500 }, { "epoch": 2.0684609022078004, "grad_norm": 0.00518798828125, "learning_rate": 4.406445957314444e-05, "loss": 0.0002, "step": 32510 }, { "epoch": 2.069097155945791, "grad_norm": 0.0240478515625, "learning_rate": 4.40609248301556e-05, "loss": 0.0003, "step": 32520 }, { "epoch": 2.069733409683782, "grad_norm": 0.080078125, "learning_rate": 4.405739008716677e-05, "loss": 0.0001, "step": 32530 }, { "epoch": 2.0703696634217725, "grad_norm": 0.0179443359375, "learning_rate": 4.4053855344177925e-05, "loss": 0.0002, "step": 32540 }, { "epoch": 2.0710059171597632, "grad_norm": 0.00518798828125, "learning_rate": 4.405032060118909e-05, "loss": 0.0009, "step": 32550 }, { "epoch": 2.071642170897754, "grad_norm": 0.06640625, "learning_rate": 4.404678585820025e-05, "loss": 0.0004, "step": 32560 }, { "epoch": 2.0722784246357446, "grad_norm": 0.0322265625, "learning_rate": 4.4043251115211414e-05, "loss": 0.0005, "step": 32570 }, { "epoch": 2.0729146783737353, "grad_norm": 0.0240478515625, "learning_rate": 4.403971637222258e-05, "loss": 0.0004, "step": 32580 }, { "epoch": 2.073550932111726, "grad_norm": 0.019775390625, "learning_rate": 4.4036181629233744e-05, "loss": 0.0001, "step": 32590 }, { "epoch": 2.0741871858497167, "grad_norm": 0.005035400390625, "learning_rate": 4.40326468862449e-05, "loss": 0.0012, "step": 32600 }, { "epoch": 2.0748234395877074, "grad_norm": 0.0034637451171875, "learning_rate": 4.402911214325607e-05, "loss": 0.0005, "step": 32610 }, { "epoch": 2.075459693325698, "grad_norm": 0.0244140625, "learning_rate": 4.4025577400267226e-05, "loss": 0.0003, "step": 32620 }, { "epoch": 2.076095947063689, "grad_norm": 0.56640625, "learning_rate": 4.402204265727839e-05, "loss": 0.0005, "step": 32630 }, { "epoch": 2.0767322008016795, "grad_norm": 0.01446533203125, "learning_rate": 4.401850791428955e-05, "loss": 0.0013, "step": 32640 }, { "epoch": 2.0773684545396702, "grad_norm": 0.003662109375, "learning_rate": 4.4014973171300715e-05, "loss": 0.001, "step": 32650 }, { "epoch": 2.078004708277661, "grad_norm": 0.0201416015625, "learning_rate": 4.401143842831188e-05, "loss": 0.0001, "step": 32660 }, { "epoch": 2.0786409620156516, "grad_norm": 0.0107421875, "learning_rate": 4.4007903685323045e-05, "loss": 0.0001, "step": 32670 }, { "epoch": 2.0792772157536428, "grad_norm": 0.00518798828125, "learning_rate": 4.4004368942334204e-05, "loss": 0.0003, "step": 32680 }, { "epoch": 2.0799134694916335, "grad_norm": 0.0030975341796875, "learning_rate": 4.400083419934537e-05, "loss": 0.0011, "step": 32690 }, { "epoch": 2.080549723229624, "grad_norm": 0.0033416748046875, "learning_rate": 4.399729945635653e-05, "loss": 0.0022, "step": 32700 }, { "epoch": 2.081185976967615, "grad_norm": 0.018798828125, "learning_rate": 4.399376471336769e-05, "loss": 0.0001, "step": 32710 }, { "epoch": 2.0818222307056056, "grad_norm": 0.00070953369140625, "learning_rate": 4.399022997037886e-05, "loss": 0.0001, "step": 32720 }, { "epoch": 2.0824584844435963, "grad_norm": 0.0228271484375, "learning_rate": 4.3986695227390016e-05, "loss": 0.0001, "step": 32730 }, { "epoch": 2.083094738181587, "grad_norm": 0.10498046875, "learning_rate": 4.398316048440118e-05, "loss": 0.0003, "step": 32740 }, { "epoch": 2.0837309919195777, "grad_norm": 0.005340576171875, "learning_rate": 4.3979625741412347e-05, "loss": 0.002, "step": 32750 }, { "epoch": 2.0843672456575684, "grad_norm": 0.0002956390380859375, "learning_rate": 4.3976090998423505e-05, "loss": 0.0002, "step": 32760 }, { "epoch": 2.085003499395559, "grad_norm": 0.002166748046875, "learning_rate": 4.397255625543467e-05, "loss": 0.0003, "step": 32770 }, { "epoch": 2.0856397531335498, "grad_norm": 0.003021240234375, "learning_rate": 4.3969021512445835e-05, "loss": 0.0001, "step": 32780 }, { "epoch": 2.0862760068715405, "grad_norm": 0.0035247802734375, "learning_rate": 4.3965486769456994e-05, "loss": 0.004, "step": 32790 }, { "epoch": 2.086912260609531, "grad_norm": 0.1728515625, "learning_rate": 4.396195202646816e-05, "loss": 0.0002, "step": 32800 }, { "epoch": 2.087548514347522, "grad_norm": 0.000949859619140625, "learning_rate": 4.395841728347932e-05, "loss": 0.0001, "step": 32810 }, { "epoch": 2.0881847680855126, "grad_norm": 0.09033203125, "learning_rate": 4.395488254049048e-05, "loss": 0.0002, "step": 32820 }, { "epoch": 2.0888210218235033, "grad_norm": 0.0634765625, "learning_rate": 4.395134779750165e-05, "loss": 0.0006, "step": 32830 }, { "epoch": 2.089457275561494, "grad_norm": 0.00177764892578125, "learning_rate": 4.394781305451281e-05, "loss": 0.0005, "step": 32840 }, { "epoch": 2.0900935292994847, "grad_norm": 0.005126953125, "learning_rate": 4.394427831152397e-05, "loss": 0.0002, "step": 32850 }, { "epoch": 2.0907297830374754, "grad_norm": 0.00014209747314453125, "learning_rate": 4.3940743568535137e-05, "loss": 0.0029, "step": 32860 }, { "epoch": 2.091366036775466, "grad_norm": 0.0050048828125, "learning_rate": 4.3937208825546295e-05, "loss": 0.0001, "step": 32870 }, { "epoch": 2.0920022905134568, "grad_norm": 0.054443359375, "learning_rate": 4.393367408255746e-05, "loss": 0.0004, "step": 32880 }, { "epoch": 2.0926385442514475, "grad_norm": 0.00335693359375, "learning_rate": 4.393013933956862e-05, "loss": 0.0, "step": 32890 }, { "epoch": 2.093274797989438, "grad_norm": 0.1767578125, "learning_rate": 4.3926604596579784e-05, "loss": 0.0088, "step": 32900 }, { "epoch": 2.093911051727429, "grad_norm": 0.0020904541015625, "learning_rate": 4.392306985359095e-05, "loss": 0.0002, "step": 32910 }, { "epoch": 2.0945473054654196, "grad_norm": 0.02783203125, "learning_rate": 4.3919535110602114e-05, "loss": 0.0071, "step": 32920 }, { "epoch": 2.0951835592034103, "grad_norm": 0.0027008056640625, "learning_rate": 4.391600036761327e-05, "loss": 0.0001, "step": 32930 }, { "epoch": 2.095819812941401, "grad_norm": 0.0223388671875, "learning_rate": 4.391246562462444e-05, "loss": 0.0007, "step": 32940 }, { "epoch": 2.0964560666793917, "grad_norm": 0.005584716796875, "learning_rate": 4.3908930881635596e-05, "loss": 0.0002, "step": 32950 }, { "epoch": 2.0970923204173824, "grad_norm": 6.4375, "learning_rate": 4.390539613864676e-05, "loss": 0.0037, "step": 32960 }, { "epoch": 2.097728574155373, "grad_norm": 0.0115966796875, "learning_rate": 4.390186139565792e-05, "loss": 0.0002, "step": 32970 }, { "epoch": 2.0983648278933638, "grad_norm": 7.4375, "learning_rate": 4.3898326652669085e-05, "loss": 0.009, "step": 32980 }, { "epoch": 2.0990010816313545, "grad_norm": 0.2119140625, "learning_rate": 4.389479190968025e-05, "loss": 0.0007, "step": 32990 }, { "epoch": 2.099637335369345, "grad_norm": 0.06298828125, "learning_rate": 4.3891257166691415e-05, "loss": 0.0016, "step": 33000 }, { "epoch": 2.100273589107336, "grad_norm": 0.01446533203125, "learning_rate": 4.3887722423702574e-05, "loss": 0.0002, "step": 33010 }, { "epoch": 2.1009098428453266, "grad_norm": 0.07470703125, "learning_rate": 4.388418768071374e-05, "loss": 0.0007, "step": 33020 }, { "epoch": 2.1015460965833173, "grad_norm": 0.018310546875, "learning_rate": 4.38806529377249e-05, "loss": 0.0001, "step": 33030 }, { "epoch": 2.102182350321308, "grad_norm": 0.0283203125, "learning_rate": 4.387711819473606e-05, "loss": 0.0003, "step": 33040 }, { "epoch": 2.1028186040592987, "grad_norm": 0.006683349609375, "learning_rate": 4.387358345174723e-05, "loss": 0.0005, "step": 33050 }, { "epoch": 2.1034548577972894, "grad_norm": 0.0038299560546875, "learning_rate": 4.3870048708758386e-05, "loss": 0.0001, "step": 33060 }, { "epoch": 2.10409111153528, "grad_norm": 0.016845703125, "learning_rate": 4.386651396576955e-05, "loss": 0.0001, "step": 33070 }, { "epoch": 2.1047273652732708, "grad_norm": 0.41015625, "learning_rate": 4.3862979222780716e-05, "loss": 0.0041, "step": 33080 }, { "epoch": 2.105363619011262, "grad_norm": 0.007568359375, "learning_rate": 4.3859444479791875e-05, "loss": 0.0001, "step": 33090 }, { "epoch": 2.1059998727492526, "grad_norm": 0.0025787353515625, "learning_rate": 4.385590973680304e-05, "loss": 0.0025, "step": 33100 }, { "epoch": 2.1066361264872433, "grad_norm": 0.007537841796875, "learning_rate": 4.38523749938142e-05, "loss": 0.0006, "step": 33110 }, { "epoch": 2.107272380225234, "grad_norm": 0.04736328125, "learning_rate": 4.3848840250825364e-05, "loss": 0.0034, "step": 33120 }, { "epoch": 2.1079086339632247, "grad_norm": 0.0400390625, "learning_rate": 4.384530550783653e-05, "loss": 0.0001, "step": 33130 }, { "epoch": 2.1085448877012154, "grad_norm": 0.0260009765625, "learning_rate": 4.384177076484769e-05, "loss": 0.0023, "step": 33140 }, { "epoch": 2.109181141439206, "grad_norm": 0.003936767578125, "learning_rate": 4.383823602185885e-05, "loss": 0.0051, "step": 33150 }, { "epoch": 2.109817395177197, "grad_norm": 4.15625, "learning_rate": 4.383470127887002e-05, "loss": 0.0037, "step": 33160 }, { "epoch": 2.1104536489151875, "grad_norm": 0.005767822265625, "learning_rate": 4.383116653588118e-05, "loss": 0.0004, "step": 33170 }, { "epoch": 2.111089902653178, "grad_norm": 0.80859375, "learning_rate": 4.382763179289234e-05, "loss": 0.0072, "step": 33180 }, { "epoch": 2.111726156391169, "grad_norm": 0.003997802734375, "learning_rate": 4.38240970499035e-05, "loss": 0.0034, "step": 33190 }, { "epoch": 2.1123624101291596, "grad_norm": 0.54296875, "learning_rate": 4.3820562306914665e-05, "loss": 0.0007, "step": 33200 }, { "epoch": 2.1129986638671503, "grad_norm": 0.015869140625, "learning_rate": 4.381702756392583e-05, "loss": 0.0001, "step": 33210 }, { "epoch": 2.113634917605141, "grad_norm": 0.006866455078125, "learning_rate": 4.381349282093699e-05, "loss": 0.0001, "step": 33220 }, { "epoch": 2.1142711713431317, "grad_norm": 0.005340576171875, "learning_rate": 4.3809958077948154e-05, "loss": 0.0002, "step": 33230 }, { "epoch": 2.1149074250811224, "grad_norm": 0.06396484375, "learning_rate": 4.380642333495932e-05, "loss": 0.0006, "step": 33240 }, { "epoch": 2.115543678819113, "grad_norm": 0.01123046875, "learning_rate": 4.3802888591970484e-05, "loss": 0.0002, "step": 33250 }, { "epoch": 2.116179932557104, "grad_norm": 0.00494384765625, "learning_rate": 4.379935384898164e-05, "loss": 0.0001, "step": 33260 }, { "epoch": 2.1168161862950945, "grad_norm": 3.75, "learning_rate": 4.379581910599281e-05, "loss": 0.0059, "step": 33270 }, { "epoch": 2.117452440033085, "grad_norm": 0.009033203125, "learning_rate": 4.3792284363003966e-05, "loss": 0.0024, "step": 33280 }, { "epoch": 2.118088693771076, "grad_norm": 0.005615234375, "learning_rate": 4.378874962001513e-05, "loss": 0.0015, "step": 33290 }, { "epoch": 2.1187249475090666, "grad_norm": 0.0026092529296875, "learning_rate": 4.378521487702629e-05, "loss": 0.0004, "step": 33300 }, { "epoch": 2.1193612012470573, "grad_norm": 0.051513671875, "learning_rate": 4.3781680134037455e-05, "loss": 0.001, "step": 33310 }, { "epoch": 2.119997454985048, "grad_norm": 0.005401611328125, "learning_rate": 4.377814539104862e-05, "loss": 0.0003, "step": 33320 }, { "epoch": 2.1206337087230387, "grad_norm": 0.7890625, "learning_rate": 4.3774610648059785e-05, "loss": 0.0014, "step": 33330 }, { "epoch": 2.1212699624610294, "grad_norm": 0.08544921875, "learning_rate": 4.3771075905070943e-05, "loss": 0.0003, "step": 33340 }, { "epoch": 2.12190621619902, "grad_norm": 0.0703125, "learning_rate": 4.376754116208211e-05, "loss": 0.0003, "step": 33350 }, { "epoch": 2.122542469937011, "grad_norm": 0.01556396484375, "learning_rate": 4.376400641909327e-05, "loss": 0.0004, "step": 33360 }, { "epoch": 2.1231787236750015, "grad_norm": 0.007171630859375, "learning_rate": 4.376047167610443e-05, "loss": 0.0001, "step": 33370 }, { "epoch": 2.123814977412992, "grad_norm": 0.0087890625, "learning_rate": 4.37569369331156e-05, "loss": 0.0007, "step": 33380 }, { "epoch": 2.124451231150983, "grad_norm": 0.03369140625, "learning_rate": 4.3753402190126756e-05, "loss": 0.0002, "step": 33390 }, { "epoch": 2.1250874848889736, "grad_norm": 0.003692626953125, "learning_rate": 4.374986744713792e-05, "loss": 0.0004, "step": 33400 }, { "epoch": 2.1257237386269643, "grad_norm": 0.0025634765625, "learning_rate": 4.3746332704149086e-05, "loss": 0.0, "step": 33410 }, { "epoch": 2.126359992364955, "grad_norm": 1.515625, "learning_rate": 4.3742797961160245e-05, "loss": 0.0203, "step": 33420 }, { "epoch": 2.1269962461029457, "grad_norm": 0.001312255859375, "learning_rate": 4.373926321817141e-05, "loss": 0.0006, "step": 33430 }, { "epoch": 2.1276324998409364, "grad_norm": 0.07861328125, "learning_rate": 4.373572847518257e-05, "loss": 0.0004, "step": 33440 }, { "epoch": 2.128268753578927, "grad_norm": 0.00799560546875, "learning_rate": 4.3732193732193733e-05, "loss": 0.0026, "step": 33450 }, { "epoch": 2.128905007316918, "grad_norm": 0.00384521484375, "learning_rate": 4.37286589892049e-05, "loss": 0.0007, "step": 33460 }, { "epoch": 2.129541261054909, "grad_norm": 0.03369140625, "learning_rate": 4.372512424621606e-05, "loss": 0.0016, "step": 33470 }, { "epoch": 2.1301775147928996, "grad_norm": 0.006134033203125, "learning_rate": 4.372158950322722e-05, "loss": 0.0001, "step": 33480 }, { "epoch": 2.1308137685308903, "grad_norm": 0.00469970703125, "learning_rate": 4.371805476023839e-05, "loss": 0.0004, "step": 33490 }, { "epoch": 2.131450022268881, "grad_norm": 0.003021240234375, "learning_rate": 4.371452001724955e-05, "loss": 0.0013, "step": 33500 }, { "epoch": 2.1320862760068717, "grad_norm": 0.9453125, "learning_rate": 4.371098527426071e-05, "loss": 0.0004, "step": 33510 }, { "epoch": 2.1327225297448624, "grad_norm": 0.01177978515625, "learning_rate": 4.370745053127187e-05, "loss": 0.0001, "step": 33520 }, { "epoch": 2.133358783482853, "grad_norm": 0.001953125, "learning_rate": 4.3703915788283035e-05, "loss": 0.0003, "step": 33530 }, { "epoch": 2.133995037220844, "grad_norm": 0.474609375, "learning_rate": 4.37003810452942e-05, "loss": 0.0028, "step": 33540 }, { "epoch": 2.1346312909588345, "grad_norm": 0.016845703125, "learning_rate": 4.369684630230536e-05, "loss": 0.0001, "step": 33550 }, { "epoch": 2.1352675446968252, "grad_norm": 0.0419921875, "learning_rate": 4.369331155931652e-05, "loss": 0.0003, "step": 33560 }, { "epoch": 2.135903798434816, "grad_norm": 0.01708984375, "learning_rate": 4.368977681632769e-05, "loss": 0.0002, "step": 33570 }, { "epoch": 2.1365400521728066, "grad_norm": 0.014404296875, "learning_rate": 4.3686242073338854e-05, "loss": 0.0004, "step": 33580 }, { "epoch": 2.1371763059107973, "grad_norm": 0.0306396484375, "learning_rate": 4.368270733035001e-05, "loss": 0.0002, "step": 33590 }, { "epoch": 2.137812559648788, "grad_norm": 0.0007781982421875, "learning_rate": 4.367917258736117e-05, "loss": 0.0017, "step": 33600 }, { "epoch": 2.1384488133867787, "grad_norm": 0.01373291015625, "learning_rate": 4.3675637844372336e-05, "loss": 0.0005, "step": 33610 }, { "epoch": 2.1390850671247694, "grad_norm": 0.0263671875, "learning_rate": 4.36721031013835e-05, "loss": 0.0009, "step": 33620 }, { "epoch": 2.13972132086276, "grad_norm": 0.00141143798828125, "learning_rate": 4.366856835839466e-05, "loss": 0.0001, "step": 33630 }, { "epoch": 2.140357574600751, "grad_norm": 0.0576171875, "learning_rate": 4.3665033615405825e-05, "loss": 0.0005, "step": 33640 }, { "epoch": 2.1409938283387415, "grad_norm": 0.0537109375, "learning_rate": 4.366149887241699e-05, "loss": 0.0001, "step": 33650 }, { "epoch": 2.1416300820767322, "grad_norm": 0.032470703125, "learning_rate": 4.3657964129428155e-05, "loss": 0.0003, "step": 33660 }, { "epoch": 2.142266335814723, "grad_norm": 0.333984375, "learning_rate": 4.365442938643931e-05, "loss": 0.0004, "step": 33670 }, { "epoch": 2.1429025895527136, "grad_norm": 2.015625, "learning_rate": 4.365089464345048e-05, "loss": 0.0034, "step": 33680 }, { "epoch": 2.1435388432907043, "grad_norm": 1.34375, "learning_rate": 4.364735990046164e-05, "loss": 0.0009, "step": 33690 }, { "epoch": 2.144175097028695, "grad_norm": 0.004547119140625, "learning_rate": 4.36438251574728e-05, "loss": 0.0002, "step": 33700 }, { "epoch": 2.1448113507666857, "grad_norm": 0.08349609375, "learning_rate": 4.364029041448397e-05, "loss": 0.0002, "step": 33710 }, { "epoch": 2.1454476045046764, "grad_norm": 0.00225830078125, "learning_rate": 4.3636755671495126e-05, "loss": 0.0004, "step": 33720 }, { "epoch": 2.146083858242667, "grad_norm": 0.2373046875, "learning_rate": 4.363322092850629e-05, "loss": 0.0081, "step": 33730 }, { "epoch": 2.146720111980658, "grad_norm": 0.004730224609375, "learning_rate": 4.3629686185517456e-05, "loss": 0.0005, "step": 33740 }, { "epoch": 2.1473563657186485, "grad_norm": 0.052734375, "learning_rate": 4.3626151442528614e-05, "loss": 0.0002, "step": 33750 }, { "epoch": 2.147992619456639, "grad_norm": 0.169921875, "learning_rate": 4.362261669953978e-05, "loss": 0.0019, "step": 33760 }, { "epoch": 2.14862887319463, "grad_norm": 0.0150146484375, "learning_rate": 4.361908195655094e-05, "loss": 0.0015, "step": 33770 }, { "epoch": 2.1492651269326206, "grad_norm": 0.0216064453125, "learning_rate": 4.36155472135621e-05, "loss": 0.0001, "step": 33780 }, { "epoch": 2.1499013806706113, "grad_norm": 0.041015625, "learning_rate": 4.361201247057327e-05, "loss": 0.0002, "step": 33790 }, { "epoch": 2.150537634408602, "grad_norm": 0.00127410888671875, "learning_rate": 4.360847772758443e-05, "loss": 0.0001, "step": 33800 }, { "epoch": 2.1511738881465927, "grad_norm": 0.037353515625, "learning_rate": 4.360494298459559e-05, "loss": 0.001, "step": 33810 }, { "epoch": 2.1518101418845834, "grad_norm": 0.0009002685546875, "learning_rate": 4.360140824160676e-05, "loss": 0.0005, "step": 33820 }, { "epoch": 2.152446395622574, "grad_norm": 1.453125, "learning_rate": 4.359787349861792e-05, "loss": 0.0005, "step": 33830 }, { "epoch": 2.153082649360565, "grad_norm": 0.00174713134765625, "learning_rate": 4.359433875562908e-05, "loss": 0.0005, "step": 33840 }, { "epoch": 2.1537189030985555, "grad_norm": 0.022216796875, "learning_rate": 4.359080401264024e-05, "loss": 0.0035, "step": 33850 }, { "epoch": 2.154355156836546, "grad_norm": 0.01336669921875, "learning_rate": 4.3587269269651404e-05, "loss": 0.001, "step": 33860 }, { "epoch": 2.154991410574537, "grad_norm": 0.024169921875, "learning_rate": 4.358373452666257e-05, "loss": 0.0003, "step": 33870 }, { "epoch": 2.1556276643125276, "grad_norm": 0.06005859375, "learning_rate": 4.358019978367373e-05, "loss": 0.0002, "step": 33880 }, { "epoch": 2.1562639180505183, "grad_norm": 0.00092315673828125, "learning_rate": 4.357666504068489e-05, "loss": 0.0003, "step": 33890 }, { "epoch": 2.1569001717885095, "grad_norm": 0.02099609375, "learning_rate": 4.357313029769606e-05, "loss": 0.0002, "step": 33900 }, { "epoch": 2.1575364255265, "grad_norm": 0.0072021484375, "learning_rate": 4.3569595554707224e-05, "loss": 0.0003, "step": 33910 }, { "epoch": 2.158172679264491, "grad_norm": 0.017822265625, "learning_rate": 4.356606081171838e-05, "loss": 0.0001, "step": 33920 }, { "epoch": 2.1588089330024816, "grad_norm": 0.007293701171875, "learning_rate": 4.356252606872954e-05, "loss": 0.0023, "step": 33930 }, { "epoch": 2.1594451867404723, "grad_norm": 0.019775390625, "learning_rate": 4.3558991325740706e-05, "loss": 0.0001, "step": 33940 }, { "epoch": 2.160081440478463, "grad_norm": 0.0023040771484375, "learning_rate": 4.355545658275187e-05, "loss": 0.0003, "step": 33950 }, { "epoch": 2.1607176942164537, "grad_norm": 0.0028076171875, "learning_rate": 4.355192183976303e-05, "loss": 0.0003, "step": 33960 }, { "epoch": 2.1613539479544444, "grad_norm": 0.10009765625, "learning_rate": 4.3548387096774194e-05, "loss": 0.0002, "step": 33970 }, { "epoch": 2.161990201692435, "grad_norm": 0.259765625, "learning_rate": 4.354485235378536e-05, "loss": 0.0004, "step": 33980 }, { "epoch": 2.1626264554304258, "grad_norm": 0.0142822265625, "learning_rate": 4.3541317610796525e-05, "loss": 0.0015, "step": 33990 }, { "epoch": 2.1632627091684165, "grad_norm": 0.007293701171875, "learning_rate": 4.353778286780768e-05, "loss": 0.0001, "step": 34000 }, { "epoch": 2.163898962906407, "grad_norm": 0.0014495849609375, "learning_rate": 4.353424812481884e-05, "loss": 0.0029, "step": 34010 }, { "epoch": 2.164535216644398, "grad_norm": 0.0203857421875, "learning_rate": 4.353071338183001e-05, "loss": 0.0025, "step": 34020 }, { "epoch": 2.1651714703823886, "grad_norm": 0.0201416015625, "learning_rate": 4.352717863884117e-05, "loss": 0.0002, "step": 34030 }, { "epoch": 2.1658077241203793, "grad_norm": 0.01434326171875, "learning_rate": 4.352364389585234e-05, "loss": 0.0047, "step": 34040 }, { "epoch": 2.16644397785837, "grad_norm": 0.000835418701171875, "learning_rate": 4.3520109152863495e-05, "loss": 0.0002, "step": 34050 }, { "epoch": 2.1670802315963607, "grad_norm": 0.023681640625, "learning_rate": 4.351657440987466e-05, "loss": 0.0149, "step": 34060 }, { "epoch": 2.1677164853343514, "grad_norm": 0.0203857421875, "learning_rate": 4.3513039666885826e-05, "loss": 0.0006, "step": 34070 }, { "epoch": 2.168352739072342, "grad_norm": 0.01275634765625, "learning_rate": 4.3509504923896984e-05, "loss": 0.0005, "step": 34080 }, { "epoch": 2.1689889928103327, "grad_norm": 0.00799560546875, "learning_rate": 4.350597018090815e-05, "loss": 0.0002, "step": 34090 }, { "epoch": 2.1696252465483234, "grad_norm": 0.0037994384765625, "learning_rate": 4.350243543791931e-05, "loss": 0.0003, "step": 34100 }, { "epoch": 2.170261500286314, "grad_norm": 0.00116729736328125, "learning_rate": 4.349890069493047e-05, "loss": 0.0007, "step": 34110 }, { "epoch": 2.170897754024305, "grad_norm": 0.04638671875, "learning_rate": 4.349536595194164e-05, "loss": 0.0003, "step": 34120 }, { "epoch": 2.1715340077622955, "grad_norm": 0.00121307373046875, "learning_rate": 4.34918312089528e-05, "loss": 0.0002, "step": 34130 }, { "epoch": 2.1721702615002862, "grad_norm": 0.01611328125, "learning_rate": 4.348829646596396e-05, "loss": 0.0017, "step": 34140 }, { "epoch": 2.172806515238277, "grad_norm": 0.003570556640625, "learning_rate": 4.348476172297513e-05, "loss": 0.0, "step": 34150 }, { "epoch": 2.1734427689762676, "grad_norm": 0.003570556640625, "learning_rate": 4.348122697998629e-05, "loss": 0.0006, "step": 34160 }, { "epoch": 2.1740790227142583, "grad_norm": 0.0040283203125, "learning_rate": 4.347769223699745e-05, "loss": 0.0001, "step": 34170 }, { "epoch": 2.174715276452249, "grad_norm": 0.0693359375, "learning_rate": 4.347415749400861e-05, "loss": 0.0013, "step": 34180 }, { "epoch": 2.1753515301902397, "grad_norm": 0.000736236572265625, "learning_rate": 4.3470622751019774e-05, "loss": 0.0057, "step": 34190 }, { "epoch": 2.1759877839282304, "grad_norm": 0.0732421875, "learning_rate": 4.346708800803094e-05, "loss": 0.0002, "step": 34200 }, { "epoch": 2.176624037666221, "grad_norm": 0.0004253387451171875, "learning_rate": 4.34635532650421e-05, "loss": 0.0004, "step": 34210 }, { "epoch": 2.177260291404212, "grad_norm": 0.828125, "learning_rate": 4.346001852205326e-05, "loss": 0.0004, "step": 34220 }, { "epoch": 2.1778965451422025, "grad_norm": 0.00061798095703125, "learning_rate": 4.345648377906443e-05, "loss": 0.0008, "step": 34230 }, { "epoch": 2.1785327988801932, "grad_norm": 0.004119873046875, "learning_rate": 4.345294903607559e-05, "loss": 0.0009, "step": 34240 }, { "epoch": 2.179169052618184, "grad_norm": 0.0020751953125, "learning_rate": 4.344941429308675e-05, "loss": 0.0001, "step": 34250 }, { "epoch": 2.1798053063561746, "grad_norm": 0.00848388671875, "learning_rate": 4.344587955009791e-05, "loss": 0.0003, "step": 34260 }, { "epoch": 2.180441560094166, "grad_norm": 0.00787353515625, "learning_rate": 4.3442344807109075e-05, "loss": 0.0006, "step": 34270 }, { "epoch": 2.1810778138321565, "grad_norm": 0.0272216796875, "learning_rate": 4.343881006412024e-05, "loss": 0.0008, "step": 34280 }, { "epoch": 2.181714067570147, "grad_norm": 0.00390625, "learning_rate": 4.34352753211314e-05, "loss": 0.0006, "step": 34290 }, { "epoch": 2.182350321308138, "grad_norm": 0.3203125, "learning_rate": 4.3431740578142564e-05, "loss": 0.0005, "step": 34300 }, { "epoch": 2.1829865750461286, "grad_norm": 0.00130462646484375, "learning_rate": 4.342820583515373e-05, "loss": 0.01, "step": 34310 }, { "epoch": 2.1836228287841193, "grad_norm": 0.00014591217041015625, "learning_rate": 4.3424671092164895e-05, "loss": 0.0123, "step": 34320 }, { "epoch": 2.18425908252211, "grad_norm": 0.001708984375, "learning_rate": 4.342113634917605e-05, "loss": 0.0001, "step": 34330 }, { "epoch": 2.1848953362601007, "grad_norm": 0.00147247314453125, "learning_rate": 4.341760160618721e-05, "loss": 0.0021, "step": 34340 }, { "epoch": 2.1855315899980914, "grad_norm": 0.0023345947265625, "learning_rate": 4.3414066863198377e-05, "loss": 0.0001, "step": 34350 }, { "epoch": 2.186167843736082, "grad_norm": 0.1806640625, "learning_rate": 4.341053212020954e-05, "loss": 0.0004, "step": 34360 }, { "epoch": 2.186804097474073, "grad_norm": 0.00384521484375, "learning_rate": 4.340699737722071e-05, "loss": 0.0038, "step": 34370 }, { "epoch": 2.1874403512120635, "grad_norm": 0.0033416748046875, "learning_rate": 4.3403462634231865e-05, "loss": 0.0008, "step": 34380 }, { "epoch": 2.188076604950054, "grad_norm": 0.01806640625, "learning_rate": 4.339992789124303e-05, "loss": 0.001, "step": 34390 }, { "epoch": 2.188712858688045, "grad_norm": 0.051513671875, "learning_rate": 4.3396393148254196e-05, "loss": 0.0029, "step": 34400 }, { "epoch": 2.1893491124260356, "grad_norm": 0.00469970703125, "learning_rate": 4.339285840526536e-05, "loss": 0.0004, "step": 34410 }, { "epoch": 2.1899853661640263, "grad_norm": 0.0179443359375, "learning_rate": 4.338932366227651e-05, "loss": 0.0009, "step": 34420 }, { "epoch": 2.190621619902017, "grad_norm": 0.0036468505859375, "learning_rate": 4.338578891928768e-05, "loss": 0.0006, "step": 34430 }, { "epoch": 2.1912578736400077, "grad_norm": 0.004119873046875, "learning_rate": 4.338225417629884e-05, "loss": 0.0084, "step": 34440 }, { "epoch": 2.1918941273779984, "grad_norm": 0.0201416015625, "learning_rate": 4.337871943331001e-05, "loss": 0.0004, "step": 34450 }, { "epoch": 2.192530381115989, "grad_norm": 0.224609375, "learning_rate": 4.3375184690321166e-05, "loss": 0.0002, "step": 34460 }, { "epoch": 2.19316663485398, "grad_norm": 0.050048828125, "learning_rate": 4.337164994733233e-05, "loss": 0.0006, "step": 34470 }, { "epoch": 2.1938028885919705, "grad_norm": 0.01214599609375, "learning_rate": 4.33681152043435e-05, "loss": 0.0002, "step": 34480 }, { "epoch": 2.194439142329961, "grad_norm": 0.0096435546875, "learning_rate": 4.336458046135466e-05, "loss": 0.0001, "step": 34490 }, { "epoch": 2.195075396067952, "grad_norm": 0.00958251953125, "learning_rate": 4.3361045718365814e-05, "loss": 0.0003, "step": 34500 }, { "epoch": 2.1957116498059426, "grad_norm": 0.0245361328125, "learning_rate": 4.335751097537698e-05, "loss": 0.0005, "step": 34510 }, { "epoch": 2.1963479035439333, "grad_norm": 0.002655029296875, "learning_rate": 4.3353976232388144e-05, "loss": 0.0006, "step": 34520 }, { "epoch": 2.196984157281924, "grad_norm": 0.003662109375, "learning_rate": 4.335044148939931e-05, "loss": 0.0034, "step": 34530 }, { "epoch": 2.1976204110199147, "grad_norm": 0.01416015625, "learning_rate": 4.334690674641047e-05, "loss": 0.0028, "step": 34540 }, { "epoch": 2.1982566647579054, "grad_norm": 0.000820159912109375, "learning_rate": 4.334337200342163e-05, "loss": 0.0001, "step": 34550 }, { "epoch": 2.198892918495896, "grad_norm": 0.0023193359375, "learning_rate": 4.33398372604328e-05, "loss": 0.0004, "step": 34560 }, { "epoch": 2.1995291722338868, "grad_norm": 0.0098876953125, "learning_rate": 4.333630251744396e-05, "loss": 0.0012, "step": 34570 }, { "epoch": 2.2001654259718775, "grad_norm": 0.0014495849609375, "learning_rate": 4.333276777445512e-05, "loss": 0.001, "step": 34580 }, { "epoch": 2.200801679709868, "grad_norm": 2.625, "learning_rate": 4.332923303146628e-05, "loss": 0.0025, "step": 34590 }, { "epoch": 2.201437933447859, "grad_norm": 0.0162353515625, "learning_rate": 4.3325698288477445e-05, "loss": 0.0004, "step": 34600 }, { "epoch": 2.2020741871858496, "grad_norm": 0.005523681640625, "learning_rate": 4.332216354548861e-05, "loss": 0.0011, "step": 34610 }, { "epoch": 2.2027104409238403, "grad_norm": 0.04736328125, "learning_rate": 4.3318628802499776e-05, "loss": 0.0001, "step": 34620 }, { "epoch": 2.203346694661831, "grad_norm": 0.000732421875, "learning_rate": 4.3315094059510934e-05, "loss": 0.0047, "step": 34630 }, { "epoch": 2.2039829483998217, "grad_norm": 0.138671875, "learning_rate": 4.33115593165221e-05, "loss": 0.0003, "step": 34640 }, { "epoch": 2.2046192021378124, "grad_norm": 0.004638671875, "learning_rate": 4.3308024573533264e-05, "loss": 0.0004, "step": 34650 }, { "epoch": 2.205255455875803, "grad_norm": 0.0030670166015625, "learning_rate": 4.330448983054442e-05, "loss": 0.0012, "step": 34660 }, { "epoch": 2.2058917096137938, "grad_norm": 0.016357421875, "learning_rate": 4.330095508755558e-05, "loss": 0.002, "step": 34670 }, { "epoch": 2.2065279633517845, "grad_norm": 0.2236328125, "learning_rate": 4.3297420344566746e-05, "loss": 0.0002, "step": 34680 }, { "epoch": 2.207164217089775, "grad_norm": 0.030517578125, "learning_rate": 4.329388560157791e-05, "loss": 0.0001, "step": 34690 }, { "epoch": 2.2078004708277663, "grad_norm": 0.005950927734375, "learning_rate": 4.329035085858908e-05, "loss": 0.001, "step": 34700 }, { "epoch": 2.208436724565757, "grad_norm": 0.005767822265625, "learning_rate": 4.3286816115600235e-05, "loss": 0.0049, "step": 34710 }, { "epoch": 2.2090729783037477, "grad_norm": 0.0303955078125, "learning_rate": 4.32832813726114e-05, "loss": 0.0004, "step": 34720 }, { "epoch": 2.2097092320417384, "grad_norm": 0.00408935546875, "learning_rate": 4.3279746629622565e-05, "loss": 0.0002, "step": 34730 }, { "epoch": 2.210345485779729, "grad_norm": 0.00848388671875, "learning_rate": 4.327621188663373e-05, "loss": 0.0001, "step": 34740 }, { "epoch": 2.21098173951772, "grad_norm": 0.0302734375, "learning_rate": 4.327267714364488e-05, "loss": 0.0002, "step": 34750 }, { "epoch": 2.2116179932557105, "grad_norm": 0.015625, "learning_rate": 4.326914240065605e-05, "loss": 0.0002, "step": 34760 }, { "epoch": 2.212254246993701, "grad_norm": 0.00128173828125, "learning_rate": 4.326560765766721e-05, "loss": 0.0001, "step": 34770 }, { "epoch": 2.212890500731692, "grad_norm": 0.0052490234375, "learning_rate": 4.326207291467838e-05, "loss": 0.0001, "step": 34780 }, { "epoch": 2.2135267544696826, "grad_norm": 0.00148773193359375, "learning_rate": 4.3258538171689536e-05, "loss": 0.0111, "step": 34790 }, { "epoch": 2.2141630082076733, "grad_norm": 0.00151824951171875, "learning_rate": 4.32550034287007e-05, "loss": 0.0002, "step": 34800 }, { "epoch": 2.214799261945664, "grad_norm": 0.0033111572265625, "learning_rate": 4.325146868571187e-05, "loss": 0.0001, "step": 34810 }, { "epoch": 2.2154355156836547, "grad_norm": 0.006988525390625, "learning_rate": 4.324793394272303e-05, "loss": 0.0009, "step": 34820 }, { "epoch": 2.2160717694216454, "grad_norm": 0.0045166015625, "learning_rate": 4.324439919973419e-05, "loss": 0.0041, "step": 34830 }, { "epoch": 2.216708023159636, "grad_norm": 0.01165771484375, "learning_rate": 4.324086445674535e-05, "loss": 0.0005, "step": 34840 }, { "epoch": 2.217344276897627, "grad_norm": 1.2421875, "learning_rate": 4.3237329713756514e-05, "loss": 0.0006, "step": 34850 }, { "epoch": 2.2179805306356175, "grad_norm": 0.03271484375, "learning_rate": 4.323379497076768e-05, "loss": 0.0025, "step": 34860 }, { "epoch": 2.218616784373608, "grad_norm": 0.017578125, "learning_rate": 4.323026022777884e-05, "loss": 0.0002, "step": 34870 }, { "epoch": 2.219253038111599, "grad_norm": 0.00701904296875, "learning_rate": 4.322672548479e-05, "loss": 0.0001, "step": 34880 }, { "epoch": 2.2198892918495896, "grad_norm": 0.0033721923828125, "learning_rate": 4.322319074180117e-05, "loss": 0.0042, "step": 34890 }, { "epoch": 2.2205255455875803, "grad_norm": 0.00616455078125, "learning_rate": 4.321965599881233e-05, "loss": 0.0001, "step": 34900 }, { "epoch": 2.221161799325571, "grad_norm": 0.1328125, "learning_rate": 4.321612125582349e-05, "loss": 0.0002, "step": 34910 }, { "epoch": 2.2217980530635617, "grad_norm": 0.00098419189453125, "learning_rate": 4.321258651283465e-05, "loss": 0.0001, "step": 34920 }, { "epoch": 2.2224343068015524, "grad_norm": 0.0135498046875, "learning_rate": 4.3209051769845815e-05, "loss": 0.0003, "step": 34930 }, { "epoch": 2.223070560539543, "grad_norm": 0.00555419921875, "learning_rate": 4.320551702685698e-05, "loss": 0.0005, "step": 34940 }, { "epoch": 2.223706814277534, "grad_norm": 0.002960205078125, "learning_rate": 4.3201982283868145e-05, "loss": 0.0006, "step": 34950 }, { "epoch": 2.2243430680155245, "grad_norm": 0.0130615234375, "learning_rate": 4.3198447540879304e-05, "loss": 0.0005, "step": 34960 }, { "epoch": 2.224979321753515, "grad_norm": 0.1484375, "learning_rate": 4.319491279789047e-05, "loss": 0.0005, "step": 34970 }, { "epoch": 2.225615575491506, "grad_norm": 0.0771484375, "learning_rate": 4.3191378054901634e-05, "loss": 0.0096, "step": 34980 }, { "epoch": 2.2262518292294966, "grad_norm": 0.000865936279296875, "learning_rate": 4.318784331191279e-05, "loss": 0.0003, "step": 34990 }, { "epoch": 2.2268880829674873, "grad_norm": 0.00067901611328125, "learning_rate": 4.318430856892395e-05, "loss": 0.0001, "step": 35000 }, { "epoch": 2.227524336705478, "grad_norm": 0.005767822265625, "learning_rate": 4.3180773825935116e-05, "loss": 0.0003, "step": 35010 }, { "epoch": 2.2281605904434687, "grad_norm": 0.00015926361083984375, "learning_rate": 4.317723908294628e-05, "loss": 0.0069, "step": 35020 }, { "epoch": 2.2287968441814594, "grad_norm": 0.01544189453125, "learning_rate": 4.3173704339957447e-05, "loss": 0.0002, "step": 35030 }, { "epoch": 2.22943309791945, "grad_norm": 0.00286865234375, "learning_rate": 4.3170169596968605e-05, "loss": 0.0, "step": 35040 }, { "epoch": 2.230069351657441, "grad_norm": 0.00604248046875, "learning_rate": 4.316663485397977e-05, "loss": 0.0001, "step": 35050 }, { "epoch": 2.2307056053954315, "grad_norm": 0.0013275146484375, "learning_rate": 4.3163100110990935e-05, "loss": 0.0, "step": 35060 }, { "epoch": 2.231341859133422, "grad_norm": 0.330078125, "learning_rate": 4.31595653680021e-05, "loss": 0.0002, "step": 35070 }, { "epoch": 2.2319781128714133, "grad_norm": 0.0093994140625, "learning_rate": 4.315603062501325e-05, "loss": 0.0022, "step": 35080 }, { "epoch": 2.232614366609404, "grad_norm": 0.0126953125, "learning_rate": 4.315249588202442e-05, "loss": 0.0001, "step": 35090 }, { "epoch": 2.2332506203473947, "grad_norm": 0.00494384765625, "learning_rate": 4.314896113903558e-05, "loss": 0.0006, "step": 35100 }, { "epoch": 2.2338868740853854, "grad_norm": 0.12451171875, "learning_rate": 4.314542639604675e-05, "loss": 0.0003, "step": 35110 }, { "epoch": 2.234523127823376, "grad_norm": 0.01080322265625, "learning_rate": 4.3141891653057906e-05, "loss": 0.0005, "step": 35120 }, { "epoch": 2.235159381561367, "grad_norm": 0.032470703125, "learning_rate": 4.313835691006907e-05, "loss": 0.0005, "step": 35130 }, { "epoch": 2.2357956352993575, "grad_norm": 0.0004329681396484375, "learning_rate": 4.3134822167080236e-05, "loss": 0.0002, "step": 35140 }, { "epoch": 2.2364318890373482, "grad_norm": 0.0203857421875, "learning_rate": 4.31312874240914e-05, "loss": 0.0002, "step": 35150 }, { "epoch": 2.237068142775339, "grad_norm": 0.0006561279296875, "learning_rate": 4.312775268110256e-05, "loss": 0.0007, "step": 35160 }, { "epoch": 2.2377043965133296, "grad_norm": 0.004241943359375, "learning_rate": 4.312421793811372e-05, "loss": 0.0072, "step": 35170 }, { "epoch": 2.2383406502513203, "grad_norm": 0.02392578125, "learning_rate": 4.3120683195124884e-05, "loss": 0.0007, "step": 35180 }, { "epoch": 2.238976903989311, "grad_norm": 0.0023345947265625, "learning_rate": 4.311714845213605e-05, "loss": 0.0002, "step": 35190 }, { "epoch": 2.2396131577273017, "grad_norm": 0.004486083984375, "learning_rate": 4.311361370914721e-05, "loss": 0.0001, "step": 35200 }, { "epoch": 2.2402494114652924, "grad_norm": 0.018310546875, "learning_rate": 4.311007896615837e-05, "loss": 0.0002, "step": 35210 }, { "epoch": 2.240885665203283, "grad_norm": 0.00116729736328125, "learning_rate": 4.310654422316954e-05, "loss": 0.0028, "step": 35220 }, { "epoch": 2.241521918941274, "grad_norm": 1.453125, "learning_rate": 4.31030094801807e-05, "loss": 0.001, "step": 35230 }, { "epoch": 2.2421581726792645, "grad_norm": 0.006744384765625, "learning_rate": 4.309947473719186e-05, "loss": 0.0007, "step": 35240 }, { "epoch": 2.2427944264172552, "grad_norm": 0.00335693359375, "learning_rate": 4.309593999420302e-05, "loss": 0.0002, "step": 35250 }, { "epoch": 2.243430680155246, "grad_norm": 0.0019989013671875, "learning_rate": 4.3092405251214185e-05, "loss": 0.0005, "step": 35260 }, { "epoch": 2.2440669338932366, "grad_norm": 0.049560546875, "learning_rate": 4.308887050822535e-05, "loss": 0.0002, "step": 35270 }, { "epoch": 2.2447031876312273, "grad_norm": 0.00189971923828125, "learning_rate": 4.3085335765236515e-05, "loss": 0.0027, "step": 35280 }, { "epoch": 2.245339441369218, "grad_norm": 0.00396728515625, "learning_rate": 4.3081801022247674e-05, "loss": 0.001, "step": 35290 }, { "epoch": 2.2459756951072087, "grad_norm": 0.00677490234375, "learning_rate": 4.307826627925884e-05, "loss": 0.0002, "step": 35300 }, { "epoch": 2.2466119488451994, "grad_norm": 0.00110626220703125, "learning_rate": 4.3074731536270004e-05, "loss": 0.0001, "step": 35310 }, { "epoch": 2.24724820258319, "grad_norm": 0.0022125244140625, "learning_rate": 4.307119679328116e-05, "loss": 0.0003, "step": 35320 }, { "epoch": 2.247884456321181, "grad_norm": 1.8046875, "learning_rate": 4.306766205029232e-05, "loss": 0.0007, "step": 35330 }, { "epoch": 2.2485207100591715, "grad_norm": 0.002716064453125, "learning_rate": 4.3064127307303486e-05, "loss": 0.0011, "step": 35340 }, { "epoch": 2.2491569637971622, "grad_norm": 0.1376953125, "learning_rate": 4.306059256431465e-05, "loss": 0.0016, "step": 35350 }, { "epoch": 2.249793217535153, "grad_norm": 0.0031890869140625, "learning_rate": 4.3057057821325816e-05, "loss": 0.0001, "step": 35360 }, { "epoch": 2.2504294712731436, "grad_norm": 0.027587890625, "learning_rate": 4.3053523078336975e-05, "loss": 0.0004, "step": 35370 }, { "epoch": 2.2510657250111343, "grad_norm": 0.0162353515625, "learning_rate": 4.304998833534814e-05, "loss": 0.0001, "step": 35380 }, { "epoch": 2.251701978749125, "grad_norm": 0.028564453125, "learning_rate": 4.3046453592359305e-05, "loss": 0.0015, "step": 35390 }, { "epoch": 2.2523382324871157, "grad_norm": 0.000904083251953125, "learning_rate": 4.3042918849370464e-05, "loss": 0.0001, "step": 35400 }, { "epoch": 2.2529744862251064, "grad_norm": 0.173828125, "learning_rate": 4.303938410638162e-05, "loss": 0.0032, "step": 35410 }, { "epoch": 2.253610739963097, "grad_norm": 0.0034332275390625, "learning_rate": 4.303584936339279e-05, "loss": 0.0108, "step": 35420 }, { "epoch": 2.254246993701088, "grad_norm": 2.125, "learning_rate": 4.303231462040395e-05, "loss": 0.0022, "step": 35430 }, { "epoch": 2.2548832474390785, "grad_norm": 0.0022430419921875, "learning_rate": 4.302877987741512e-05, "loss": 0.0003, "step": 35440 }, { "epoch": 2.2555195011770692, "grad_norm": 0.013427734375, "learning_rate": 4.3025245134426276e-05, "loss": 0.0002, "step": 35450 }, { "epoch": 2.25615575491506, "grad_norm": 0.000682830810546875, "learning_rate": 4.302171039143744e-05, "loss": 0.0003, "step": 35460 }, { "epoch": 2.2567920086530506, "grad_norm": 0.00836181640625, "learning_rate": 4.3018175648448606e-05, "loss": 0.006, "step": 35470 }, { "epoch": 2.2574282623910413, "grad_norm": 0.012939453125, "learning_rate": 4.301464090545977e-05, "loss": 0.0002, "step": 35480 }, { "epoch": 2.258064516129032, "grad_norm": 0.0126953125, "learning_rate": 4.301110616247093e-05, "loss": 0.0003, "step": 35490 }, { "epoch": 2.2587007698670227, "grad_norm": 0.038818359375, "learning_rate": 4.300757141948209e-05, "loss": 0.0002, "step": 35500 }, { "epoch": 2.259337023605014, "grad_norm": 0.0052490234375, "learning_rate": 4.3004036676493253e-05, "loss": 0.0023, "step": 35510 }, { "epoch": 2.2599732773430046, "grad_norm": 0.0113525390625, "learning_rate": 4.300050193350442e-05, "loss": 0.0004, "step": 35520 }, { "epoch": 2.2606095310809953, "grad_norm": 0.004791259765625, "learning_rate": 4.299696719051558e-05, "loss": 0.0014, "step": 35530 }, { "epoch": 2.261245784818986, "grad_norm": 0.05419921875, "learning_rate": 4.299343244752674e-05, "loss": 0.0011, "step": 35540 }, { "epoch": 2.2618820385569767, "grad_norm": 0.2314453125, "learning_rate": 4.298989770453791e-05, "loss": 0.0003, "step": 35550 }, { "epoch": 2.2625182922949674, "grad_norm": 0.0025177001953125, "learning_rate": 4.298636296154907e-05, "loss": 0.0002, "step": 35560 }, { "epoch": 2.263154546032958, "grad_norm": 0.69140625, "learning_rate": 4.298282821856023e-05, "loss": 0.0009, "step": 35570 }, { "epoch": 2.2637907997709488, "grad_norm": 0.000408172607421875, "learning_rate": 4.297929347557139e-05, "loss": 0.0001, "step": 35580 }, { "epoch": 2.2644270535089395, "grad_norm": 0.0380859375, "learning_rate": 4.2975758732582555e-05, "loss": 0.0018, "step": 35590 }, { "epoch": 2.26506330724693, "grad_norm": 0.064453125, "learning_rate": 4.297222398959372e-05, "loss": 0.0004, "step": 35600 }, { "epoch": 2.265699560984921, "grad_norm": 0.002288818359375, "learning_rate": 4.2968689246604885e-05, "loss": 0.0003, "step": 35610 }, { "epoch": 2.2663358147229116, "grad_norm": 0.007110595703125, "learning_rate": 4.296515450361604e-05, "loss": 0.0003, "step": 35620 }, { "epoch": 2.2669720684609023, "grad_norm": 0.026123046875, "learning_rate": 4.296161976062721e-05, "loss": 0.0, "step": 35630 }, { "epoch": 2.267608322198893, "grad_norm": 0.003662109375, "learning_rate": 4.2958085017638374e-05, "loss": 0.0054, "step": 35640 }, { "epoch": 2.2682445759368837, "grad_norm": 0.0030975341796875, "learning_rate": 4.295455027464953e-05, "loss": 0.0005, "step": 35650 }, { "epoch": 2.2688808296748744, "grad_norm": 0.037109375, "learning_rate": 4.295101553166069e-05, "loss": 0.0006, "step": 35660 }, { "epoch": 2.269517083412865, "grad_norm": 0.0026702880859375, "learning_rate": 4.2947480788671856e-05, "loss": 0.0001, "step": 35670 }, { "epoch": 2.2701533371508558, "grad_norm": 0.0390625, "learning_rate": 4.294394604568302e-05, "loss": 0.0001, "step": 35680 }, { "epoch": 2.2707895908888465, "grad_norm": 0.00164794921875, "learning_rate": 4.2940411302694186e-05, "loss": 0.0024, "step": 35690 }, { "epoch": 2.271425844626837, "grad_norm": 0.00677490234375, "learning_rate": 4.2936876559705345e-05, "loss": 0.0001, "step": 35700 }, { "epoch": 2.272062098364828, "grad_norm": 0.005218505859375, "learning_rate": 4.293334181671651e-05, "loss": 0.0004, "step": 35710 }, { "epoch": 2.2726983521028186, "grad_norm": 0.017333984375, "learning_rate": 4.2929807073727675e-05, "loss": 0.0004, "step": 35720 }, { "epoch": 2.2733346058408093, "grad_norm": 0.64453125, "learning_rate": 4.292627233073883e-05, "loss": 0.0004, "step": 35730 }, { "epoch": 2.2739708595788, "grad_norm": 0.00689697265625, "learning_rate": 4.292273758774999e-05, "loss": 0.0002, "step": 35740 }, { "epoch": 2.2746071133167907, "grad_norm": 0.044189453125, "learning_rate": 4.291920284476116e-05, "loss": 0.0003, "step": 35750 }, { "epoch": 2.2752433670547814, "grad_norm": 0.0155029296875, "learning_rate": 4.291566810177232e-05, "loss": 0.0003, "step": 35760 }, { "epoch": 2.275879620792772, "grad_norm": 0.006744384765625, "learning_rate": 4.291213335878349e-05, "loss": 0.0019, "step": 35770 }, { "epoch": 2.2765158745307628, "grad_norm": 0.00811767578125, "learning_rate": 4.2908598615794646e-05, "loss": 0.0005, "step": 35780 }, { "epoch": 2.2771521282687535, "grad_norm": 0.00135040283203125, "learning_rate": 4.290506387280581e-05, "loss": 0.0004, "step": 35790 }, { "epoch": 2.277788382006744, "grad_norm": 0.0028228759765625, "learning_rate": 4.2901529129816976e-05, "loss": 0.0001, "step": 35800 }, { "epoch": 2.278424635744735, "grad_norm": 0.00469970703125, "learning_rate": 4.2897994386828134e-05, "loss": 0.0014, "step": 35810 }, { "epoch": 2.2790608894827256, "grad_norm": 3.65625, "learning_rate": 4.28944596438393e-05, "loss": 0.0008, "step": 35820 }, { "epoch": 2.2796971432207163, "grad_norm": 0.010009765625, "learning_rate": 4.289092490085046e-05, "loss": 0.0004, "step": 35830 }, { "epoch": 2.280333396958707, "grad_norm": 7.03125, "learning_rate": 4.288739015786162e-05, "loss": 0.0096, "step": 35840 }, { "epoch": 2.2809696506966977, "grad_norm": 0.004730224609375, "learning_rate": 4.288385541487279e-05, "loss": 0.0001, "step": 35850 }, { "epoch": 2.281605904434689, "grad_norm": 0.015380859375, "learning_rate": 4.288032067188395e-05, "loss": 0.0001, "step": 35860 }, { "epoch": 2.2822421581726795, "grad_norm": 0.10009765625, "learning_rate": 4.287678592889511e-05, "loss": 0.0003, "step": 35870 }, { "epoch": 2.28287841191067, "grad_norm": 0.63671875, "learning_rate": 4.287325118590628e-05, "loss": 0.0004, "step": 35880 }, { "epoch": 2.283514665648661, "grad_norm": 0.0103759765625, "learning_rate": 4.286971644291744e-05, "loss": 0.0001, "step": 35890 }, { "epoch": 2.2841509193866516, "grad_norm": 0.00188446044921875, "learning_rate": 4.28661816999286e-05, "loss": 0.0003, "step": 35900 }, { "epoch": 2.2847871731246423, "grad_norm": 0.314453125, "learning_rate": 4.286264695693976e-05, "loss": 0.0011, "step": 35910 }, { "epoch": 2.285423426862633, "grad_norm": 0.50390625, "learning_rate": 4.2859112213950924e-05, "loss": 0.0015, "step": 35920 }, { "epoch": 2.2860596806006237, "grad_norm": 0.0732421875, "learning_rate": 4.285557747096209e-05, "loss": 0.0077, "step": 35930 }, { "epoch": 2.2866959343386144, "grad_norm": 0.027099609375, "learning_rate": 4.2852042727973255e-05, "loss": 0.0001, "step": 35940 }, { "epoch": 2.287332188076605, "grad_norm": 0.01336669921875, "learning_rate": 4.284850798498441e-05, "loss": 0.0025, "step": 35950 }, { "epoch": 2.287968441814596, "grad_norm": 0.00159454345703125, "learning_rate": 4.284497324199558e-05, "loss": 0.0002, "step": 35960 }, { "epoch": 2.2886046955525865, "grad_norm": 0.000701904296875, "learning_rate": 4.2841438499006744e-05, "loss": 0.0014, "step": 35970 }, { "epoch": 2.289240949290577, "grad_norm": 0.048828125, "learning_rate": 4.28379037560179e-05, "loss": 0.0071, "step": 35980 }, { "epoch": 2.289877203028568, "grad_norm": 0.006927490234375, "learning_rate": 4.283436901302906e-05, "loss": 0.0002, "step": 35990 }, { "epoch": 2.2905134567665586, "grad_norm": 0.0079345703125, "learning_rate": 4.2830834270040226e-05, "loss": 0.0001, "step": 36000 }, { "epoch": 2.2911497105045493, "grad_norm": 0.004852294921875, "learning_rate": 4.282729952705139e-05, "loss": 0.0009, "step": 36010 }, { "epoch": 2.29178596424254, "grad_norm": 0.0023345947265625, "learning_rate": 4.2823764784062556e-05, "loss": 0.0003, "step": 36020 }, { "epoch": 2.2924222179805307, "grad_norm": 0.0022735595703125, "learning_rate": 4.2820230041073714e-05, "loss": 0.0001, "step": 36030 }, { "epoch": 2.2930584717185214, "grad_norm": 0.006500244140625, "learning_rate": 4.281669529808488e-05, "loss": 0.0005, "step": 36040 }, { "epoch": 2.293694725456512, "grad_norm": 0.003326416015625, "learning_rate": 4.2813160555096045e-05, "loss": 0.0002, "step": 36050 }, { "epoch": 2.294330979194503, "grad_norm": 0.02734375, "learning_rate": 4.28096258121072e-05, "loss": 0.0004, "step": 36060 }, { "epoch": 2.2949672329324935, "grad_norm": 0.043701171875, "learning_rate": 4.280609106911836e-05, "loss": 0.0091, "step": 36070 }, { "epoch": 2.295603486670484, "grad_norm": 0.0250244140625, "learning_rate": 4.280255632612953e-05, "loss": 0.0032, "step": 36080 }, { "epoch": 2.296239740408475, "grad_norm": 0.240234375, "learning_rate": 4.279902158314069e-05, "loss": 0.0003, "step": 36090 }, { "epoch": 2.2968759941464656, "grad_norm": 0.1787109375, "learning_rate": 4.279548684015186e-05, "loss": 0.0002, "step": 36100 }, { "epoch": 2.2975122478844563, "grad_norm": 0.00732421875, "learning_rate": 4.2791952097163015e-05, "loss": 0.0003, "step": 36110 }, { "epoch": 2.298148501622447, "grad_norm": 0.056396484375, "learning_rate": 4.278841735417418e-05, "loss": 0.0012, "step": 36120 }, { "epoch": 2.2987847553604377, "grad_norm": 0.001312255859375, "learning_rate": 4.2784882611185346e-05, "loss": 0.0007, "step": 36130 }, { "epoch": 2.2994210090984284, "grad_norm": 0.0009613037109375, "learning_rate": 4.2781347868196504e-05, "loss": 0.0011, "step": 36140 }, { "epoch": 2.300057262836419, "grad_norm": 0.0036468505859375, "learning_rate": 4.277781312520767e-05, "loss": 0.0003, "step": 36150 }, { "epoch": 2.30069351657441, "grad_norm": 0.017578125, "learning_rate": 4.277427838221883e-05, "loss": 0.0009, "step": 36160 }, { "epoch": 2.3013297703124005, "grad_norm": 0.283203125, "learning_rate": 4.277074363922999e-05, "loss": 0.0003, "step": 36170 }, { "epoch": 2.301966024050391, "grad_norm": 0.004913330078125, "learning_rate": 4.276720889624116e-05, "loss": 0.0006, "step": 36180 }, { "epoch": 2.302602277788382, "grad_norm": 0.012939453125, "learning_rate": 4.276367415325232e-05, "loss": 0.0003, "step": 36190 }, { "epoch": 2.3032385315263726, "grad_norm": 0.15625, "learning_rate": 4.276013941026348e-05, "loss": 0.0002, "step": 36200 }, { "epoch": 2.3038747852643633, "grad_norm": 0.017822265625, "learning_rate": 4.275660466727465e-05, "loss": 0.0005, "step": 36210 }, { "epoch": 2.304511039002354, "grad_norm": 0.07421875, "learning_rate": 4.2753069924285805e-05, "loss": 0.0001, "step": 36220 }, { "epoch": 2.3051472927403447, "grad_norm": 0.0162353515625, "learning_rate": 4.274953518129697e-05, "loss": 0.0002, "step": 36230 }, { "epoch": 2.3057835464783354, "grad_norm": 0.0091552734375, "learning_rate": 4.274600043830813e-05, "loss": 0.0006, "step": 36240 }, { "epoch": 2.306419800216326, "grad_norm": 0.06103515625, "learning_rate": 4.2742465695319294e-05, "loss": 0.0002, "step": 36250 }, { "epoch": 2.307056053954317, "grad_norm": 0.017822265625, "learning_rate": 4.273893095233046e-05, "loss": 0.0001, "step": 36260 }, { "epoch": 2.3076923076923075, "grad_norm": 0.006744384765625, "learning_rate": 4.2735396209341625e-05, "loss": 0.0001, "step": 36270 }, { "epoch": 2.308328561430298, "grad_norm": 0.3046875, "learning_rate": 4.273186146635278e-05, "loss": 0.0003, "step": 36280 }, { "epoch": 2.308964815168289, "grad_norm": 0.0032196044921875, "learning_rate": 4.272832672336395e-05, "loss": 0.0009, "step": 36290 }, { "epoch": 2.3096010689062796, "grad_norm": 0.0029296875, "learning_rate": 4.2724791980375107e-05, "loss": 0.0035, "step": 36300 }, { "epoch": 2.3102373226442703, "grad_norm": 0.0179443359375, "learning_rate": 4.272125723738627e-05, "loss": 0.001, "step": 36310 }, { "epoch": 2.3108735763822614, "grad_norm": 0.00128173828125, "learning_rate": 4.271772249439743e-05, "loss": 0.0001, "step": 36320 }, { "epoch": 2.311509830120252, "grad_norm": 0.6796875, "learning_rate": 4.2714187751408595e-05, "loss": 0.0008, "step": 36330 }, { "epoch": 2.312146083858243, "grad_norm": 0.04150390625, "learning_rate": 4.271065300841976e-05, "loss": 0.0003, "step": 36340 }, { "epoch": 2.3127823375962335, "grad_norm": 0.00144195556640625, "learning_rate": 4.2707118265430926e-05, "loss": 0.0009, "step": 36350 }, { "epoch": 2.3134185913342242, "grad_norm": 5.34375, "learning_rate": 4.2703583522442084e-05, "loss": 0.0065, "step": 36360 }, { "epoch": 2.314054845072215, "grad_norm": 0.00384521484375, "learning_rate": 4.270004877945325e-05, "loss": 0.0001, "step": 36370 }, { "epoch": 2.3146910988102056, "grad_norm": 0.004425048828125, "learning_rate": 4.2696514036464415e-05, "loss": 0.0004, "step": 36380 }, { "epoch": 2.3153273525481963, "grad_norm": 0.0001983642578125, "learning_rate": 4.269297929347557e-05, "loss": 0.0003, "step": 36390 }, { "epoch": 2.315963606286187, "grad_norm": 0.00390625, "learning_rate": 4.268944455048673e-05, "loss": 0.0001, "step": 36400 }, { "epoch": 2.3165998600241777, "grad_norm": 0.00762939453125, "learning_rate": 4.2685909807497897e-05, "loss": 0.0001, "step": 36410 }, { "epoch": 2.3172361137621684, "grad_norm": 0.000270843505859375, "learning_rate": 4.268237506450906e-05, "loss": 0.0125, "step": 36420 }, { "epoch": 2.317872367500159, "grad_norm": 0.0517578125, "learning_rate": 4.267884032152023e-05, "loss": 0.0006, "step": 36430 }, { "epoch": 2.31850862123815, "grad_norm": 0.0120849609375, "learning_rate": 4.2675305578531385e-05, "loss": 0.0001, "step": 36440 }, { "epoch": 2.3191448749761405, "grad_norm": 0.0029754638671875, "learning_rate": 4.267177083554255e-05, "loss": 0.0001, "step": 36450 }, { "epoch": 2.3197811287141312, "grad_norm": 0.002593994140625, "learning_rate": 4.2668236092553716e-05, "loss": 0.0014, "step": 36460 }, { "epoch": 2.320417382452122, "grad_norm": 0.0086669921875, "learning_rate": 4.2664701349564874e-05, "loss": 0.0004, "step": 36470 }, { "epoch": 2.3210536361901126, "grad_norm": 0.01416015625, "learning_rate": 4.266116660657604e-05, "loss": 0.0001, "step": 36480 }, { "epoch": 2.3216898899281033, "grad_norm": 0.1552734375, "learning_rate": 4.26576318635872e-05, "loss": 0.0002, "step": 36490 }, { "epoch": 2.322326143666094, "grad_norm": 0.003936767578125, "learning_rate": 4.265409712059836e-05, "loss": 0.0003, "step": 36500 }, { "epoch": 2.3229623974040847, "grad_norm": 0.12451171875, "learning_rate": 4.265056237760953e-05, "loss": 0.0003, "step": 36510 }, { "epoch": 2.3235986511420754, "grad_norm": 0.002227783203125, "learning_rate": 4.2647027634620686e-05, "loss": 0.0024, "step": 36520 }, { "epoch": 2.324234904880066, "grad_norm": 0.005126953125, "learning_rate": 4.264349289163185e-05, "loss": 0.0007, "step": 36530 }, { "epoch": 2.324871158618057, "grad_norm": 0.0218505859375, "learning_rate": 4.263995814864302e-05, "loss": 0.0001, "step": 36540 }, { "epoch": 2.3255074123560475, "grad_norm": 0.005859375, "learning_rate": 4.2636423405654175e-05, "loss": 0.0017, "step": 36550 }, { "epoch": 2.326143666094038, "grad_norm": 0.23046875, "learning_rate": 4.263288866266534e-05, "loss": 0.0008, "step": 36560 }, { "epoch": 2.326779919832029, "grad_norm": 0.050537109375, "learning_rate": 4.26293539196765e-05, "loss": 0.0025, "step": 36570 }, { "epoch": 2.3274161735700196, "grad_norm": 0.00165557861328125, "learning_rate": 4.2625819176687664e-05, "loss": 0.0007, "step": 36580 }, { "epoch": 2.3280524273080103, "grad_norm": 0.00144195556640625, "learning_rate": 4.262228443369883e-05, "loss": 0.0001, "step": 36590 }, { "epoch": 2.328688681046001, "grad_norm": 0.0068359375, "learning_rate": 4.2618749690709994e-05, "loss": 0.0006, "step": 36600 }, { "epoch": 2.3293249347839917, "grad_norm": 0.03466796875, "learning_rate": 4.261521494772115e-05, "loss": 0.0003, "step": 36610 }, { "epoch": 2.3299611885219824, "grad_norm": 0.341796875, "learning_rate": 4.261168020473232e-05, "loss": 0.0003, "step": 36620 }, { "epoch": 2.330597442259973, "grad_norm": 0.01141357421875, "learning_rate": 4.2608145461743476e-05, "loss": 0.0003, "step": 36630 }, { "epoch": 2.331233695997964, "grad_norm": 0.12109375, "learning_rate": 4.260461071875464e-05, "loss": 0.001, "step": 36640 }, { "epoch": 2.3318699497359545, "grad_norm": 0.0269775390625, "learning_rate": 4.26010759757658e-05, "loss": 0.0003, "step": 36650 }, { "epoch": 2.332506203473945, "grad_norm": 0.002532958984375, "learning_rate": 4.2597541232776965e-05, "loss": 0.0005, "step": 36660 }, { "epoch": 2.3331424572119364, "grad_norm": 0.220703125, "learning_rate": 4.259400648978813e-05, "loss": 0.0003, "step": 36670 }, { "epoch": 2.333778710949927, "grad_norm": 0.0169677734375, "learning_rate": 4.2590471746799296e-05, "loss": 0.0009, "step": 36680 }, { "epoch": 2.3344149646879178, "grad_norm": 0.0091552734375, "learning_rate": 4.2586937003810454e-05, "loss": 0.0004, "step": 36690 }, { "epoch": 2.3350512184259085, "grad_norm": 0.498046875, "learning_rate": 4.258340226082162e-05, "loss": 0.0005, "step": 36700 }, { "epoch": 2.335687472163899, "grad_norm": 0.0299072265625, "learning_rate": 4.257986751783278e-05, "loss": 0.0003, "step": 36710 }, { "epoch": 2.33632372590189, "grad_norm": 0.109375, "learning_rate": 4.257633277484394e-05, "loss": 0.0049, "step": 36720 }, { "epoch": 2.3369599796398806, "grad_norm": 0.00274658203125, "learning_rate": 4.25727980318551e-05, "loss": 0.0001, "step": 36730 }, { "epoch": 2.3375962333778713, "grad_norm": 0.006744384765625, "learning_rate": 4.2569263288866266e-05, "loss": 0.0003, "step": 36740 }, { "epoch": 2.338232487115862, "grad_norm": 0.013916015625, "learning_rate": 4.256572854587743e-05, "loss": 0.0056, "step": 36750 }, { "epoch": 2.3388687408538527, "grad_norm": 0.0018157958984375, "learning_rate": 4.25621938028886e-05, "loss": 0.0004, "step": 36760 }, { "epoch": 2.3395049945918434, "grad_norm": 0.01312255859375, "learning_rate": 4.2558659059899755e-05, "loss": 0.0001, "step": 36770 }, { "epoch": 2.340141248329834, "grad_norm": 0.0111083984375, "learning_rate": 4.255512431691092e-05, "loss": 0.0004, "step": 36780 }, { "epoch": 2.3407775020678248, "grad_norm": 0.001983642578125, "learning_rate": 4.2551589573922085e-05, "loss": 0.0001, "step": 36790 }, { "epoch": 2.3414137558058155, "grad_norm": 0.000263214111328125, "learning_rate": 4.2548054830933244e-05, "loss": 0.0003, "step": 36800 }, { "epoch": 2.342050009543806, "grad_norm": 0.0006103515625, "learning_rate": 4.254452008794441e-05, "loss": 0.0013, "step": 36810 }, { "epoch": 2.342686263281797, "grad_norm": 0.0084228515625, "learning_rate": 4.254098534495557e-05, "loss": 0.0003, "step": 36820 }, { "epoch": 2.3433225170197876, "grad_norm": 0.1884765625, "learning_rate": 4.253745060196673e-05, "loss": 0.0003, "step": 36830 }, { "epoch": 2.3439587707577783, "grad_norm": 3.3125, "learning_rate": 4.25339158589779e-05, "loss": 0.0049, "step": 36840 }, { "epoch": 2.344595024495769, "grad_norm": 0.006134033203125, "learning_rate": 4.253038111598906e-05, "loss": 0.0002, "step": 36850 }, { "epoch": 2.3452312782337597, "grad_norm": 0.004241943359375, "learning_rate": 4.252684637300022e-05, "loss": 0.0026, "step": 36860 }, { "epoch": 2.3458675319717504, "grad_norm": 0.00946044921875, "learning_rate": 4.252331163001139e-05, "loss": 0.0002, "step": 36870 }, { "epoch": 2.346503785709741, "grad_norm": 0.08349609375, "learning_rate": 4.2519776887022545e-05, "loss": 0.0004, "step": 36880 }, { "epoch": 2.3471400394477318, "grad_norm": 0.0062255859375, "learning_rate": 4.251624214403371e-05, "loss": 0.001, "step": 36890 }, { "epoch": 2.3477762931857225, "grad_norm": 0.046630859375, "learning_rate": 4.251270740104487e-05, "loss": 0.0001, "step": 36900 }, { "epoch": 2.348412546923713, "grad_norm": 0.005401611328125, "learning_rate": 4.2509172658056034e-05, "loss": 0.0006, "step": 36910 }, { "epoch": 2.349048800661704, "grad_norm": 0.01080322265625, "learning_rate": 4.25056379150672e-05, "loss": 0.0016, "step": 36920 }, { "epoch": 2.3496850543996946, "grad_norm": 0.005096435546875, "learning_rate": 4.2502103172078364e-05, "loss": 0.0005, "step": 36930 }, { "epoch": 2.3503213081376853, "grad_norm": 0.007232666015625, "learning_rate": 4.249856842908952e-05, "loss": 0.0012, "step": 36940 }, { "epoch": 2.350957561875676, "grad_norm": 0.055419921875, "learning_rate": 4.249503368610069e-05, "loss": 0.0002, "step": 36950 }, { "epoch": 2.3515938156136666, "grad_norm": 0.002288818359375, "learning_rate": 4.2491498943111846e-05, "loss": 0.0001, "step": 36960 }, { "epoch": 2.3522300693516573, "grad_norm": 0.90234375, "learning_rate": 4.248796420012301e-05, "loss": 0.0003, "step": 36970 }, { "epoch": 2.352866323089648, "grad_norm": 0.01275634765625, "learning_rate": 4.248442945713417e-05, "loss": 0.0004, "step": 36980 }, { "epoch": 2.3535025768276387, "grad_norm": 0.014404296875, "learning_rate": 4.2480894714145335e-05, "loss": 0.0002, "step": 36990 }, { "epoch": 2.3541388305656294, "grad_norm": 0.033935546875, "learning_rate": 4.24773599711565e-05, "loss": 0.0109, "step": 37000 }, { "epoch": 2.35477508430362, "grad_norm": 0.00469970703125, "learning_rate": 4.2473825228167665e-05, "loss": 0.005, "step": 37010 }, { "epoch": 2.355411338041611, "grad_norm": 0.07568359375, "learning_rate": 4.2470290485178824e-05, "loss": 0.0003, "step": 37020 }, { "epoch": 2.3560475917796015, "grad_norm": 0.0054931640625, "learning_rate": 4.246675574218999e-05, "loss": 0.0001, "step": 37030 }, { "epoch": 2.3566838455175922, "grad_norm": 0.01336669921875, "learning_rate": 4.246322099920115e-05, "loss": 0.0003, "step": 37040 }, { "epoch": 2.357320099255583, "grad_norm": 0.0198974609375, "learning_rate": 4.245968625621231e-05, "loss": 0.0007, "step": 37050 }, { "epoch": 2.3579563529935736, "grad_norm": 0.00173187255859375, "learning_rate": 4.245615151322348e-05, "loss": 0.0003, "step": 37060 }, { "epoch": 2.3585926067315643, "grad_norm": 0.103515625, "learning_rate": 4.2452616770234636e-05, "loss": 0.0008, "step": 37070 }, { "epoch": 2.359228860469555, "grad_norm": 0.96875, "learning_rate": 4.24490820272458e-05, "loss": 0.0007, "step": 37080 }, { "epoch": 2.3598651142075457, "grad_norm": 0.0078125, "learning_rate": 4.2445547284256967e-05, "loss": 0.0018, "step": 37090 }, { "epoch": 2.3605013679455364, "grad_norm": 0.002471923828125, "learning_rate": 4.2442012541268125e-05, "loss": 0.0003, "step": 37100 }, { "epoch": 2.361137621683527, "grad_norm": 0.0140380859375, "learning_rate": 4.243847779827929e-05, "loss": 0.0024, "step": 37110 }, { "epoch": 2.3617738754215183, "grad_norm": 0.0206298828125, "learning_rate": 4.243494305529045e-05, "loss": 0.0002, "step": 37120 }, { "epoch": 2.362410129159509, "grad_norm": 0.0296630859375, "learning_rate": 4.2431408312301614e-05, "loss": 0.0002, "step": 37130 }, { "epoch": 2.3630463828974997, "grad_norm": 0.15234375, "learning_rate": 4.242787356931278e-05, "loss": 0.0039, "step": 37140 }, { "epoch": 2.3636826366354904, "grad_norm": 0.0037689208984375, "learning_rate": 4.242433882632394e-05, "loss": 0.0004, "step": 37150 }, { "epoch": 2.364318890373481, "grad_norm": 2.140625, "learning_rate": 4.24208040833351e-05, "loss": 0.004, "step": 37160 }, { "epoch": 2.364955144111472, "grad_norm": 0.00872802734375, "learning_rate": 4.241726934034627e-05, "loss": 0.0001, "step": 37170 }, { "epoch": 2.3655913978494625, "grad_norm": 0.0059814453125, "learning_rate": 4.241373459735743e-05, "loss": 0.0001, "step": 37180 }, { "epoch": 2.366227651587453, "grad_norm": 0.002349853515625, "learning_rate": 4.241019985436859e-05, "loss": 0.0001, "step": 37190 }, { "epoch": 2.366863905325444, "grad_norm": 0.06787109375, "learning_rate": 4.240666511137975e-05, "loss": 0.0002, "step": 37200 }, { "epoch": 2.3675001590634346, "grad_norm": 0.07421875, "learning_rate": 4.2403130368390915e-05, "loss": 0.0002, "step": 37210 }, { "epoch": 2.3681364128014253, "grad_norm": 0.0152587890625, "learning_rate": 4.239959562540208e-05, "loss": 0.0001, "step": 37220 }, { "epoch": 2.368772666539416, "grad_norm": 0.0008392333984375, "learning_rate": 4.239606088241324e-05, "loss": 0.0032, "step": 37230 }, { "epoch": 2.3694089202774067, "grad_norm": 0.0078125, "learning_rate": 4.2392526139424404e-05, "loss": 0.0016, "step": 37240 }, { "epoch": 2.3700451740153974, "grad_norm": 0.275390625, "learning_rate": 4.238899139643557e-05, "loss": 0.0058, "step": 37250 }, { "epoch": 2.370681427753388, "grad_norm": 0.00555419921875, "learning_rate": 4.2385456653446734e-05, "loss": 0.0039, "step": 37260 }, { "epoch": 2.371317681491379, "grad_norm": 0.030517578125, "learning_rate": 4.238192191045789e-05, "loss": 0.0005, "step": 37270 }, { "epoch": 2.3719539352293695, "grad_norm": 0.050537109375, "learning_rate": 4.237838716746906e-05, "loss": 0.0003, "step": 37280 }, { "epoch": 2.37259018896736, "grad_norm": 0.04052734375, "learning_rate": 4.2374852424480216e-05, "loss": 0.0003, "step": 37290 }, { "epoch": 2.373226442705351, "grad_norm": 0.00148773193359375, "learning_rate": 4.237131768149138e-05, "loss": 0.0003, "step": 37300 }, { "epoch": 2.3738626964433416, "grad_norm": 0.1298828125, "learning_rate": 4.236778293850254e-05, "loss": 0.0002, "step": 37310 }, { "epoch": 2.3744989501813323, "grad_norm": 0.010009765625, "learning_rate": 4.2364248195513705e-05, "loss": 0.0003, "step": 37320 }, { "epoch": 2.375135203919323, "grad_norm": 0.0167236328125, "learning_rate": 4.236071345252487e-05, "loss": 0.0038, "step": 37330 }, { "epoch": 2.3757714576573137, "grad_norm": 0.00142669677734375, "learning_rate": 4.2357178709536035e-05, "loss": 0.0116, "step": 37340 }, { "epoch": 2.3764077113953044, "grad_norm": 0.013427734375, "learning_rate": 4.2353643966547194e-05, "loss": 0.0011, "step": 37350 }, { "epoch": 2.377043965133295, "grad_norm": 0.1201171875, "learning_rate": 4.235010922355836e-05, "loss": 0.0026, "step": 37360 }, { "epoch": 2.3776802188712858, "grad_norm": 0.00116729736328125, "learning_rate": 4.234657448056952e-05, "loss": 0.0021, "step": 37370 }, { "epoch": 2.3783164726092765, "grad_norm": 0.00347900390625, "learning_rate": 4.234303973758068e-05, "loss": 0.0004, "step": 37380 }, { "epoch": 2.378952726347267, "grad_norm": 0.0108642578125, "learning_rate": 4.233950499459185e-05, "loss": 0.0002, "step": 37390 }, { "epoch": 2.379588980085258, "grad_norm": 0.057373046875, "learning_rate": 4.2335970251603006e-05, "loss": 0.0002, "step": 37400 }, { "epoch": 2.3802252338232486, "grad_norm": 0.043212890625, "learning_rate": 4.233243550861417e-05, "loss": 0.0002, "step": 37410 }, { "epoch": 2.3808614875612393, "grad_norm": 0.00046539306640625, "learning_rate": 4.2328900765625336e-05, "loss": 0.0001, "step": 37420 }, { "epoch": 2.38149774129923, "grad_norm": 0.03125, "learning_rate": 4.2325366022636495e-05, "loss": 0.0002, "step": 37430 }, { "epoch": 2.3821339950372207, "grad_norm": 0.08935546875, "learning_rate": 4.232183127964766e-05, "loss": 0.0001, "step": 37440 }, { "epoch": 2.3827702487752114, "grad_norm": 0.023193359375, "learning_rate": 4.231829653665882e-05, "loss": 0.0013, "step": 37450 }, { "epoch": 2.383406502513202, "grad_norm": 0.005767822265625, "learning_rate": 4.2314761793669984e-05, "loss": 0.0054, "step": 37460 }, { "epoch": 2.384042756251193, "grad_norm": 0.15234375, "learning_rate": 4.231122705068115e-05, "loss": 0.0002, "step": 37470 }, { "epoch": 2.384679009989184, "grad_norm": 0.037841796875, "learning_rate": 4.230769230769231e-05, "loss": 0.0004, "step": 37480 }, { "epoch": 2.3853152637271746, "grad_norm": 0.259765625, "learning_rate": 4.230415756470347e-05, "loss": 0.0002, "step": 37490 }, { "epoch": 2.3859515174651653, "grad_norm": 0.008544921875, "learning_rate": 4.230062282171464e-05, "loss": 0.0002, "step": 37500 }, { "epoch": 2.386587771203156, "grad_norm": 0.042236328125, "learning_rate": 4.22970880787258e-05, "loss": 0.0005, "step": 37510 }, { "epoch": 2.3872240249411467, "grad_norm": 1.0625, "learning_rate": 4.229355333573696e-05, "loss": 0.0008, "step": 37520 }, { "epoch": 2.3878602786791374, "grad_norm": 0.00360107421875, "learning_rate": 4.229001859274812e-05, "loss": 0.0003, "step": 37530 }, { "epoch": 2.388496532417128, "grad_norm": 0.0026702880859375, "learning_rate": 4.2286483849759285e-05, "loss": 0.0016, "step": 37540 }, { "epoch": 2.389132786155119, "grad_norm": 0.05712890625, "learning_rate": 4.228294910677045e-05, "loss": 0.0001, "step": 37550 }, { "epoch": 2.3897690398931095, "grad_norm": 0.451171875, "learning_rate": 4.227941436378161e-05, "loss": 0.0019, "step": 37560 }, { "epoch": 2.3904052936311, "grad_norm": 0.042724609375, "learning_rate": 4.2275879620792773e-05, "loss": 0.0002, "step": 37570 }, { "epoch": 2.391041547369091, "grad_norm": 0.0263671875, "learning_rate": 4.227234487780394e-05, "loss": 0.0004, "step": 37580 }, { "epoch": 2.3916778011070816, "grad_norm": 0.0167236328125, "learning_rate": 4.2268810134815104e-05, "loss": 0.001, "step": 37590 }, { "epoch": 2.3923140548450723, "grad_norm": 0.0242919921875, "learning_rate": 4.226527539182626e-05, "loss": 0.0003, "step": 37600 }, { "epoch": 2.392950308583063, "grad_norm": 0.0002880096435546875, "learning_rate": 4.226174064883742e-05, "loss": 0.0006, "step": 37610 }, { "epoch": 2.3935865623210537, "grad_norm": 0.005859375, "learning_rate": 4.2258205905848586e-05, "loss": 0.0001, "step": 37620 }, { "epoch": 2.3942228160590444, "grad_norm": 0.00750732421875, "learning_rate": 4.225467116285975e-05, "loss": 0.0073, "step": 37630 }, { "epoch": 2.394859069797035, "grad_norm": 1.1171875, "learning_rate": 4.225113641987091e-05, "loss": 0.0058, "step": 37640 }, { "epoch": 2.395495323535026, "grad_norm": 0.010986328125, "learning_rate": 4.2247601676882075e-05, "loss": 0.0001, "step": 37650 }, { "epoch": 2.3961315772730165, "grad_norm": 0.5859375, "learning_rate": 4.224406693389324e-05, "loss": 0.0027, "step": 37660 }, { "epoch": 2.396767831011007, "grad_norm": 0.07958984375, "learning_rate": 4.2240532190904405e-05, "loss": 0.0003, "step": 37670 }, { "epoch": 2.397404084748998, "grad_norm": 0.00250244140625, "learning_rate": 4.2236997447915563e-05, "loss": 0.0003, "step": 37680 }, { "epoch": 2.3980403384869886, "grad_norm": 0.03564453125, "learning_rate": 4.223346270492673e-05, "loss": 0.0001, "step": 37690 }, { "epoch": 2.3986765922249793, "grad_norm": 0.0036163330078125, "learning_rate": 4.222992796193789e-05, "loss": 0.0002, "step": 37700 }, { "epoch": 2.39931284596297, "grad_norm": 0.017578125, "learning_rate": 4.222639321894905e-05, "loss": 0.0001, "step": 37710 }, { "epoch": 2.3999490997009607, "grad_norm": 0.0201416015625, "learning_rate": 4.222285847596022e-05, "loss": 0.0012, "step": 37720 }, { "epoch": 2.4005853534389514, "grad_norm": 0.1943359375, "learning_rate": 4.2219323732971376e-05, "loss": 0.0007, "step": 37730 }, { "epoch": 2.401221607176942, "grad_norm": 0.59375, "learning_rate": 4.221578898998254e-05, "loss": 0.0007, "step": 37740 }, { "epoch": 2.401857860914933, "grad_norm": 0.08154296875, "learning_rate": 4.2212254246993706e-05, "loss": 0.0002, "step": 37750 }, { "epoch": 2.4024941146529235, "grad_norm": 0.0011749267578125, "learning_rate": 4.2208719504004865e-05, "loss": 0.0018, "step": 37760 }, { "epoch": 2.403130368390914, "grad_norm": 0.007049560546875, "learning_rate": 4.220518476101603e-05, "loss": 0.0001, "step": 37770 }, { "epoch": 2.403766622128905, "grad_norm": 1.0, "learning_rate": 4.220165001802719e-05, "loss": 0.0055, "step": 37780 }, { "epoch": 2.4044028758668956, "grad_norm": 0.134765625, "learning_rate": 4.219811527503835e-05, "loss": 0.0002, "step": 37790 }, { "epoch": 2.4050391296048863, "grad_norm": 0.01123046875, "learning_rate": 4.219458053204952e-05, "loss": 0.0002, "step": 37800 }, { "epoch": 2.405675383342877, "grad_norm": 0.46875, "learning_rate": 4.219104578906068e-05, "loss": 0.0007, "step": 37810 }, { "epoch": 2.4063116370808677, "grad_norm": 0.06005859375, "learning_rate": 4.218751104607184e-05, "loss": 0.0007, "step": 37820 }, { "epoch": 2.4069478908188584, "grad_norm": 0.044921875, "learning_rate": 4.218397630308301e-05, "loss": 0.0002, "step": 37830 }, { "epoch": 2.407584144556849, "grad_norm": 0.007293701171875, "learning_rate": 4.218044156009417e-05, "loss": 0.0013, "step": 37840 }, { "epoch": 2.40822039829484, "grad_norm": 0.000652313232421875, "learning_rate": 4.217690681710533e-05, "loss": 0.0001, "step": 37850 }, { "epoch": 2.4088566520328305, "grad_norm": 0.007415771484375, "learning_rate": 4.217337207411649e-05, "loss": 0.0001, "step": 37860 }, { "epoch": 2.409492905770821, "grad_norm": 0.00799560546875, "learning_rate": 4.2169837331127654e-05, "loss": 0.0005, "step": 37870 }, { "epoch": 2.410129159508812, "grad_norm": 0.005706787109375, "learning_rate": 4.216630258813882e-05, "loss": 0.0043, "step": 37880 }, { "epoch": 2.4107654132468026, "grad_norm": 0.0030975341796875, "learning_rate": 4.216276784514998e-05, "loss": 0.0069, "step": 37890 }, { "epoch": 2.4114016669847933, "grad_norm": 0.00726318359375, "learning_rate": 4.215923310216114e-05, "loss": 0.0002, "step": 37900 }, { "epoch": 2.412037920722784, "grad_norm": 0.00099945068359375, "learning_rate": 4.215569835917231e-05, "loss": 0.0004, "step": 37910 }, { "epoch": 2.412674174460775, "grad_norm": 0.138671875, "learning_rate": 4.2152163616183474e-05, "loss": 0.0015, "step": 37920 }, { "epoch": 2.413310428198766, "grad_norm": 0.03271484375, "learning_rate": 4.214862887319463e-05, "loss": 0.0001, "step": 37930 }, { "epoch": 2.4139466819367565, "grad_norm": 0.045166015625, "learning_rate": 4.214509413020579e-05, "loss": 0.0001, "step": 37940 }, { "epoch": 2.4145829356747472, "grad_norm": 0.005096435546875, "learning_rate": 4.2141559387216956e-05, "loss": 0.0001, "step": 37950 }, { "epoch": 2.415219189412738, "grad_norm": 1.375, "learning_rate": 4.213802464422812e-05, "loss": 0.0018, "step": 37960 }, { "epoch": 2.4158554431507286, "grad_norm": 0.001556396484375, "learning_rate": 4.213448990123928e-05, "loss": 0.0001, "step": 37970 }, { "epoch": 2.4164916968887193, "grad_norm": 0.01458740234375, "learning_rate": 4.2130955158250444e-05, "loss": 0.0004, "step": 37980 }, { "epoch": 2.41712795062671, "grad_norm": 0.007080078125, "learning_rate": 4.212742041526161e-05, "loss": 0.0032, "step": 37990 }, { "epoch": 2.4177642043647007, "grad_norm": 0.01055908203125, "learning_rate": 4.2123885672272775e-05, "loss": 0.0001, "step": 38000 }, { "epoch": 2.4184004581026914, "grad_norm": 0.07470703125, "learning_rate": 4.212035092928393e-05, "loss": 0.0002, "step": 38010 }, { "epoch": 2.419036711840682, "grad_norm": 0.07421875, "learning_rate": 4.211681618629509e-05, "loss": 0.0005, "step": 38020 }, { "epoch": 2.419672965578673, "grad_norm": 0.00537109375, "learning_rate": 4.211328144330626e-05, "loss": 0.0007, "step": 38030 }, { "epoch": 2.4203092193166635, "grad_norm": 3.953125, "learning_rate": 4.210974670031742e-05, "loss": 0.0021, "step": 38040 }, { "epoch": 2.4209454730546542, "grad_norm": 0.005126953125, "learning_rate": 4.210621195732859e-05, "loss": 0.0024, "step": 38050 }, { "epoch": 2.421581726792645, "grad_norm": 0.0030975341796875, "learning_rate": 4.2102677214339746e-05, "loss": 0.0002, "step": 38060 }, { "epoch": 2.4222179805306356, "grad_norm": 0.038818359375, "learning_rate": 4.209914247135091e-05, "loss": 0.0003, "step": 38070 }, { "epoch": 2.4228542342686263, "grad_norm": 0.08251953125, "learning_rate": 4.2095607728362076e-05, "loss": 0.0005, "step": 38080 }, { "epoch": 2.423490488006617, "grad_norm": 0.01190185546875, "learning_rate": 4.2092072985373234e-05, "loss": 0.0002, "step": 38090 }, { "epoch": 2.4241267417446077, "grad_norm": 0.03173828125, "learning_rate": 4.20885382423844e-05, "loss": 0.0002, "step": 38100 }, { "epoch": 2.4247629954825984, "grad_norm": 0.00921630859375, "learning_rate": 4.208500349939556e-05, "loss": 0.0004, "step": 38110 }, { "epoch": 2.425399249220589, "grad_norm": 0.002716064453125, "learning_rate": 4.208146875640672e-05, "loss": 0.0004, "step": 38120 }, { "epoch": 2.42603550295858, "grad_norm": 0.01953125, "learning_rate": 4.207793401341789e-05, "loss": 0.0001, "step": 38130 }, { "epoch": 2.4266717566965705, "grad_norm": 0.0035400390625, "learning_rate": 4.207439927042905e-05, "loss": 0.0145, "step": 38140 }, { "epoch": 2.4273080104345612, "grad_norm": 0.024169921875, "learning_rate": 4.207086452744021e-05, "loss": 0.0003, "step": 38150 }, { "epoch": 2.427944264172552, "grad_norm": 0.004180908203125, "learning_rate": 4.206732978445138e-05, "loss": 0.0003, "step": 38160 }, { "epoch": 2.4285805179105426, "grad_norm": 0.01116943359375, "learning_rate": 4.206379504146254e-05, "loss": 0.0001, "step": 38170 }, { "epoch": 2.4292167716485333, "grad_norm": 0.2119140625, "learning_rate": 4.20602602984737e-05, "loss": 0.0021, "step": 38180 }, { "epoch": 2.429853025386524, "grad_norm": 0.006500244140625, "learning_rate": 4.205672555548486e-05, "loss": 0.0004, "step": 38190 }, { "epoch": 2.4304892791245147, "grad_norm": 0.0732421875, "learning_rate": 4.2053190812496024e-05, "loss": 0.0005, "step": 38200 }, { "epoch": 2.4311255328625054, "grad_norm": 0.0037994384765625, "learning_rate": 4.204965606950719e-05, "loss": 0.0002, "step": 38210 }, { "epoch": 2.431761786600496, "grad_norm": 0.0162353515625, "learning_rate": 4.204612132651835e-05, "loss": 0.0002, "step": 38220 }, { "epoch": 2.432398040338487, "grad_norm": 0.00201416015625, "learning_rate": 4.204258658352951e-05, "loss": 0.0008, "step": 38230 }, { "epoch": 2.4330342940764775, "grad_norm": 0.0654296875, "learning_rate": 4.203905184054068e-05, "loss": 0.0002, "step": 38240 }, { "epoch": 2.4336705478144682, "grad_norm": 0.042724609375, "learning_rate": 4.2035517097551843e-05, "loss": 0.0001, "step": 38250 }, { "epoch": 2.434306801552459, "grad_norm": 0.00128173828125, "learning_rate": 4.2031982354563e-05, "loss": 0.0025, "step": 38260 }, { "epoch": 2.43494305529045, "grad_norm": 0.0021209716796875, "learning_rate": 4.202844761157416e-05, "loss": 0.0002, "step": 38270 }, { "epoch": 2.4355793090284408, "grad_norm": 0.7734375, "learning_rate": 4.2024912868585325e-05, "loss": 0.0028, "step": 38280 }, { "epoch": 2.4362155627664315, "grad_norm": 2.15625, "learning_rate": 4.202137812559649e-05, "loss": 0.0024, "step": 38290 }, { "epoch": 2.436851816504422, "grad_norm": 0.00799560546875, "learning_rate": 4.201784338260765e-05, "loss": 0.0005, "step": 38300 }, { "epoch": 2.437488070242413, "grad_norm": 0.0011138916015625, "learning_rate": 4.2014308639618814e-05, "loss": 0.0005, "step": 38310 }, { "epoch": 2.4381243239804036, "grad_norm": 0.0037078857421875, "learning_rate": 4.201077389662998e-05, "loss": 0.0003, "step": 38320 }, { "epoch": 2.4387605777183943, "grad_norm": 0.00069427490234375, "learning_rate": 4.2007239153641145e-05, "loss": 0.0002, "step": 38330 }, { "epoch": 2.439396831456385, "grad_norm": 0.0291748046875, "learning_rate": 4.20037044106523e-05, "loss": 0.0008, "step": 38340 }, { "epoch": 2.4400330851943757, "grad_norm": 0.002716064453125, "learning_rate": 4.200016966766346e-05, "loss": 0.0057, "step": 38350 }, { "epoch": 2.4406693389323664, "grad_norm": 0.06884765625, "learning_rate": 4.1996634924674627e-05, "loss": 0.0026, "step": 38360 }, { "epoch": 2.441305592670357, "grad_norm": 0.8203125, "learning_rate": 4.199310018168579e-05, "loss": 0.0008, "step": 38370 }, { "epoch": 2.4419418464083478, "grad_norm": 0.01458740234375, "learning_rate": 4.198956543869696e-05, "loss": 0.0003, "step": 38380 }, { "epoch": 2.4425781001463385, "grad_norm": 0.013427734375, "learning_rate": 4.1986030695708115e-05, "loss": 0.0018, "step": 38390 }, { "epoch": 2.443214353884329, "grad_norm": 0.0009307861328125, "learning_rate": 4.198249595271928e-05, "loss": 0.0002, "step": 38400 }, { "epoch": 2.44385060762232, "grad_norm": 0.00109100341796875, "learning_rate": 4.1978961209730446e-05, "loss": 0.0002, "step": 38410 }, { "epoch": 2.4444868613603106, "grad_norm": 0.1923828125, "learning_rate": 4.1975426466741604e-05, "loss": 0.0003, "step": 38420 }, { "epoch": 2.4451231150983013, "grad_norm": 1.9140625, "learning_rate": 4.197189172375276e-05, "loss": 0.0024, "step": 38430 }, { "epoch": 2.445759368836292, "grad_norm": 0.0130615234375, "learning_rate": 4.196835698076393e-05, "loss": 0.0007, "step": 38440 }, { "epoch": 2.4463956225742827, "grad_norm": 0.000339508056640625, "learning_rate": 4.196482223777509e-05, "loss": 0.0001, "step": 38450 }, { "epoch": 2.4470318763122734, "grad_norm": 0.003082275390625, "learning_rate": 4.196128749478626e-05, "loss": 0.0003, "step": 38460 }, { "epoch": 2.447668130050264, "grad_norm": 0.00531005859375, "learning_rate": 4.1957752751797417e-05, "loss": 0.0023, "step": 38470 }, { "epoch": 2.4483043837882548, "grad_norm": 0.07666015625, "learning_rate": 4.195421800880858e-05, "loss": 0.0089, "step": 38480 }, { "epoch": 2.4489406375262455, "grad_norm": 2.9375, "learning_rate": 4.195068326581975e-05, "loss": 0.003, "step": 38490 }, { "epoch": 2.449576891264236, "grad_norm": 0.011962890625, "learning_rate": 4.194714852283091e-05, "loss": 0.0005, "step": 38500 }, { "epoch": 2.450213145002227, "grad_norm": 0.0240478515625, "learning_rate": 4.1943613779842064e-05, "loss": 0.0001, "step": 38510 }, { "epoch": 2.4508493987402176, "grad_norm": 0.009033203125, "learning_rate": 4.194007903685323e-05, "loss": 0.0008, "step": 38520 }, { "epoch": 2.4514856524782083, "grad_norm": 0.00909423828125, "learning_rate": 4.1936544293864394e-05, "loss": 0.0036, "step": 38530 }, { "epoch": 2.452121906216199, "grad_norm": 0.0140380859375, "learning_rate": 4.193300955087556e-05, "loss": 0.0002, "step": 38540 }, { "epoch": 2.4527581599541897, "grad_norm": 0.0264892578125, "learning_rate": 4.192947480788672e-05, "loss": 0.0002, "step": 38550 }, { "epoch": 2.4533944136921804, "grad_norm": 1.90625, "learning_rate": 4.192594006489788e-05, "loss": 0.0012, "step": 38560 }, { "epoch": 2.454030667430171, "grad_norm": 0.01348876953125, "learning_rate": 4.192240532190905e-05, "loss": 0.0002, "step": 38570 }, { "epoch": 2.4546669211681618, "grad_norm": 0.03955078125, "learning_rate": 4.191887057892021e-05, "loss": 0.0016, "step": 38580 }, { "epoch": 2.4553031749061525, "grad_norm": 0.0189208984375, "learning_rate": 4.191533583593137e-05, "loss": 0.005, "step": 38590 }, { "epoch": 2.455939428644143, "grad_norm": 0.02783203125, "learning_rate": 4.191180109294253e-05, "loss": 0.0001, "step": 38600 }, { "epoch": 2.456575682382134, "grad_norm": 0.00135040283203125, "learning_rate": 4.1908266349953695e-05, "loss": 0.0003, "step": 38610 }, { "epoch": 2.4572119361201246, "grad_norm": 0.020263671875, "learning_rate": 4.190473160696486e-05, "loss": 0.0002, "step": 38620 }, { "epoch": 2.4578481898581153, "grad_norm": 0.01226806640625, "learning_rate": 4.190119686397602e-05, "loss": 0.0002, "step": 38630 }, { "epoch": 2.458484443596106, "grad_norm": 0.00150299072265625, "learning_rate": 4.1897662120987184e-05, "loss": 0.0002, "step": 38640 }, { "epoch": 2.4591206973340967, "grad_norm": 0.034912109375, "learning_rate": 4.189412737799835e-05, "loss": 0.0007, "step": 38650 }, { "epoch": 2.4597569510720874, "grad_norm": 0.061279296875, "learning_rate": 4.1890592635009514e-05, "loss": 0.0006, "step": 38660 }, { "epoch": 2.460393204810078, "grad_norm": 0.017822265625, "learning_rate": 4.188705789202067e-05, "loss": 0.0001, "step": 38670 }, { "epoch": 2.4610294585480688, "grad_norm": 0.005035400390625, "learning_rate": 4.188352314903183e-05, "loss": 0.0018, "step": 38680 }, { "epoch": 2.4616657122860595, "grad_norm": 0.004730224609375, "learning_rate": 4.1879988406042996e-05, "loss": 0.0012, "step": 38690 }, { "epoch": 2.46230196602405, "grad_norm": 0.02685546875, "learning_rate": 4.187645366305416e-05, "loss": 0.0004, "step": 38700 }, { "epoch": 2.462938219762041, "grad_norm": 0.0130615234375, "learning_rate": 4.187291892006533e-05, "loss": 0.0001, "step": 38710 }, { "epoch": 2.463574473500032, "grad_norm": 0.0712890625, "learning_rate": 4.1869384177076485e-05, "loss": 0.0012, "step": 38720 }, { "epoch": 2.4642107272380227, "grad_norm": 0.0233154296875, "learning_rate": 4.186584943408765e-05, "loss": 0.0003, "step": 38730 }, { "epoch": 2.4648469809760134, "grad_norm": 0.003082275390625, "learning_rate": 4.1862314691098816e-05, "loss": 0.0001, "step": 38740 }, { "epoch": 2.465483234714004, "grad_norm": 0.0036163330078125, "learning_rate": 4.185877994810998e-05, "loss": 0.0001, "step": 38750 }, { "epoch": 2.466119488451995, "grad_norm": 0.0147705078125, "learning_rate": 4.185524520512113e-05, "loss": 0.0009, "step": 38760 }, { "epoch": 2.4667557421899855, "grad_norm": 1.5, "learning_rate": 4.18517104621323e-05, "loss": 0.0013, "step": 38770 }, { "epoch": 2.467391995927976, "grad_norm": 0.028564453125, "learning_rate": 4.184817571914346e-05, "loss": 0.0003, "step": 38780 }, { "epoch": 2.468028249665967, "grad_norm": 0.000942230224609375, "learning_rate": 4.184464097615463e-05, "loss": 0.0003, "step": 38790 }, { "epoch": 2.4686645034039576, "grad_norm": 0.0849609375, "learning_rate": 4.1841106233165786e-05, "loss": 0.0003, "step": 38800 }, { "epoch": 2.4693007571419483, "grad_norm": 0.027587890625, "learning_rate": 4.183757149017695e-05, "loss": 0.0002, "step": 38810 }, { "epoch": 2.469937010879939, "grad_norm": 0.00921630859375, "learning_rate": 4.183403674718812e-05, "loss": 0.0009, "step": 38820 }, { "epoch": 2.4705732646179297, "grad_norm": 0.00225830078125, "learning_rate": 4.183050200419928e-05, "loss": 0.0002, "step": 38830 }, { "epoch": 2.4712095183559204, "grad_norm": 0.00518798828125, "learning_rate": 4.1826967261210434e-05, "loss": 0.0003, "step": 38840 }, { "epoch": 2.471845772093911, "grad_norm": 0.01123046875, "learning_rate": 4.18234325182216e-05, "loss": 0.0004, "step": 38850 }, { "epoch": 2.472482025831902, "grad_norm": 0.0281982421875, "learning_rate": 4.1819897775232764e-05, "loss": 0.0001, "step": 38860 }, { "epoch": 2.4731182795698925, "grad_norm": 0.029052734375, "learning_rate": 4.181636303224393e-05, "loss": 0.0004, "step": 38870 }, { "epoch": 2.473754533307883, "grad_norm": 0.015625, "learning_rate": 4.181282828925509e-05, "loss": 0.0002, "step": 38880 }, { "epoch": 2.474390787045874, "grad_norm": 0.07421875, "learning_rate": 4.180929354626625e-05, "loss": 0.0006, "step": 38890 }, { "epoch": 2.4750270407838646, "grad_norm": 0.0014190673828125, "learning_rate": 4.180575880327742e-05, "loss": 0.0001, "step": 38900 }, { "epoch": 2.4756632945218553, "grad_norm": 0.09228515625, "learning_rate": 4.180222406028858e-05, "loss": 0.0006, "step": 38910 }, { "epoch": 2.476299548259846, "grad_norm": 0.006805419921875, "learning_rate": 4.179868931729974e-05, "loss": 0.0002, "step": 38920 }, { "epoch": 2.4769358019978367, "grad_norm": 0.038330078125, "learning_rate": 4.17951545743109e-05, "loss": 0.0005, "step": 38930 }, { "epoch": 2.4775720557358274, "grad_norm": 0.0030975341796875, "learning_rate": 4.1791619831322065e-05, "loss": 0.0001, "step": 38940 }, { "epoch": 2.478208309473818, "grad_norm": 0.0211181640625, "learning_rate": 4.178808508833323e-05, "loss": 0.0007, "step": 38950 }, { "epoch": 2.478844563211809, "grad_norm": 0.0096435546875, "learning_rate": 4.1784550345344395e-05, "loss": 0.0001, "step": 38960 }, { "epoch": 2.4794808169497995, "grad_norm": 2.5625, "learning_rate": 4.1781015602355554e-05, "loss": 0.0028, "step": 38970 }, { "epoch": 2.48011707068779, "grad_norm": 0.03271484375, "learning_rate": 4.177748085936672e-05, "loss": 0.0001, "step": 38980 }, { "epoch": 2.480753324425781, "grad_norm": 0.0140380859375, "learning_rate": 4.1773946116377884e-05, "loss": 0.0001, "step": 38990 }, { "epoch": 2.4813895781637716, "grad_norm": 0.00994873046875, "learning_rate": 4.177041137338904e-05, "loss": 0.0014, "step": 39000 }, { "epoch": 2.4820258319017623, "grad_norm": 0.0174560546875, "learning_rate": 4.17668766304002e-05, "loss": 0.0001, "step": 39010 }, { "epoch": 2.482662085639753, "grad_norm": 0.00897216796875, "learning_rate": 4.1763341887411366e-05, "loss": 0.0004, "step": 39020 }, { "epoch": 2.4832983393777437, "grad_norm": 0.01190185546875, "learning_rate": 4.175980714442253e-05, "loss": 0.0002, "step": 39030 }, { "epoch": 2.4839345931157344, "grad_norm": 0.0023040771484375, "learning_rate": 4.1756272401433697e-05, "loss": 0.0003, "step": 39040 }, { "epoch": 2.484570846853725, "grad_norm": 0.008544921875, "learning_rate": 4.1752737658444855e-05, "loss": 0.0035, "step": 39050 }, { "epoch": 2.485207100591716, "grad_norm": 0.002227783203125, "learning_rate": 4.174920291545602e-05, "loss": 0.0001, "step": 39060 }, { "epoch": 2.485843354329707, "grad_norm": 0.00823974609375, "learning_rate": 4.1745668172467185e-05, "loss": 0.0001, "step": 39070 }, { "epoch": 2.4864796080676976, "grad_norm": 0.9921875, "learning_rate": 4.174213342947835e-05, "loss": 0.0007, "step": 39080 }, { "epoch": 2.4871158618056883, "grad_norm": 0.0027313232421875, "learning_rate": 4.17385986864895e-05, "loss": 0.0067, "step": 39090 }, { "epoch": 2.487752115543679, "grad_norm": 0.138671875, "learning_rate": 4.173506394350067e-05, "loss": 0.0001, "step": 39100 }, { "epoch": 2.4883883692816697, "grad_norm": 2.015625, "learning_rate": 4.173152920051183e-05, "loss": 0.0013, "step": 39110 }, { "epoch": 2.4890246230196604, "grad_norm": 0.240234375, "learning_rate": 4.1727994457523e-05, "loss": 0.0012, "step": 39120 }, { "epoch": 2.489660876757651, "grad_norm": 0.2109375, "learning_rate": 4.1724459714534156e-05, "loss": 0.0002, "step": 39130 }, { "epoch": 2.490297130495642, "grad_norm": 0.1025390625, "learning_rate": 4.172092497154532e-05, "loss": 0.0003, "step": 39140 }, { "epoch": 2.4909333842336325, "grad_norm": 0.015625, "learning_rate": 4.1717390228556487e-05, "loss": 0.0005, "step": 39150 }, { "epoch": 2.4915696379716232, "grad_norm": 0.006072998046875, "learning_rate": 4.171385548556765e-05, "loss": 0.0001, "step": 39160 }, { "epoch": 2.492205891709614, "grad_norm": 0.0026702880859375, "learning_rate": 4.17103207425788e-05, "loss": 0.0051, "step": 39170 }, { "epoch": 2.4928421454476046, "grad_norm": 0.009765625, "learning_rate": 4.170678599958997e-05, "loss": 0.0006, "step": 39180 }, { "epoch": 2.4934783991855953, "grad_norm": 0.01324462890625, "learning_rate": 4.1703251256601134e-05, "loss": 0.0001, "step": 39190 }, { "epoch": 2.494114652923586, "grad_norm": 0.002227783203125, "learning_rate": 4.16997165136123e-05, "loss": 0.0034, "step": 39200 }, { "epoch": 2.4947509066615767, "grad_norm": 0.01080322265625, "learning_rate": 4.169618177062346e-05, "loss": 0.0002, "step": 39210 }, { "epoch": 2.4953871603995674, "grad_norm": 0.000560760498046875, "learning_rate": 4.169264702763462e-05, "loss": 0.0016, "step": 39220 }, { "epoch": 2.496023414137558, "grad_norm": 0.59375, "learning_rate": 4.168911228464579e-05, "loss": 0.0008, "step": 39230 }, { "epoch": 2.496659667875549, "grad_norm": 0.01483154296875, "learning_rate": 4.168557754165695e-05, "loss": 0.0057, "step": 39240 }, { "epoch": 2.4972959216135395, "grad_norm": 0.004058837890625, "learning_rate": 4.168204279866811e-05, "loss": 0.0005, "step": 39250 }, { "epoch": 2.4979321753515302, "grad_norm": 0.000125885009765625, "learning_rate": 4.167850805567927e-05, "loss": 0.0002, "step": 39260 }, { "epoch": 2.498568429089521, "grad_norm": 0.57421875, "learning_rate": 4.1674973312690435e-05, "loss": 0.0005, "step": 39270 }, { "epoch": 2.4992046828275116, "grad_norm": 0.76171875, "learning_rate": 4.16714385697016e-05, "loss": 0.0032, "step": 39280 }, { "epoch": 2.4998409365655023, "grad_norm": 0.09130859375, "learning_rate": 4.1667903826712765e-05, "loss": 0.0005, "step": 39290 }, { "epoch": 2.500477190303493, "grad_norm": 0.004241943359375, "learning_rate": 4.1664369083723924e-05, "loss": 0.0005, "step": 39300 }, { "epoch": 2.5011134440414837, "grad_norm": 0.06640625, "learning_rate": 4.166083434073509e-05, "loss": 0.0002, "step": 39310 }, { "epoch": 2.5017496977794744, "grad_norm": 0.0032958984375, "learning_rate": 4.1657299597746254e-05, "loss": 0.0002, "step": 39320 }, { "epoch": 2.502385951517465, "grad_norm": 0.00445556640625, "learning_rate": 4.165376485475741e-05, "loss": 0.0002, "step": 39330 }, { "epoch": 2.503022205255456, "grad_norm": 0.296875, "learning_rate": 4.165023011176857e-05, "loss": 0.0008, "step": 39340 }, { "epoch": 2.5036584589934465, "grad_norm": 0.0020294189453125, "learning_rate": 4.1646695368779736e-05, "loss": 0.0003, "step": 39350 }, { "epoch": 2.5042947127314372, "grad_norm": 0.0751953125, "learning_rate": 4.16431606257909e-05, "loss": 0.0005, "step": 39360 }, { "epoch": 2.504930966469428, "grad_norm": 0.007354736328125, "learning_rate": 4.1639625882802066e-05, "loss": 0.0001, "step": 39370 }, { "epoch": 2.5055672202074186, "grad_norm": 0.01031494140625, "learning_rate": 4.1636091139813225e-05, "loss": 0.0001, "step": 39380 }, { "epoch": 2.5062034739454093, "grad_norm": 0.00665283203125, "learning_rate": 4.163255639682439e-05, "loss": 0.0021, "step": 39390 }, { "epoch": 2.5068397276834, "grad_norm": 0.00750732421875, "learning_rate": 4.1629021653835555e-05, "loss": 0.0002, "step": 39400 }, { "epoch": 2.5074759814213907, "grad_norm": 0.55859375, "learning_rate": 4.1625486910846714e-05, "loss": 0.0012, "step": 39410 }, { "epoch": 2.5081122351593814, "grad_norm": 0.007720947265625, "learning_rate": 4.162195216785787e-05, "loss": 0.0007, "step": 39420 }, { "epoch": 2.508748488897372, "grad_norm": 0.028076171875, "learning_rate": 4.161841742486904e-05, "loss": 0.0103, "step": 39430 }, { "epoch": 2.509384742635363, "grad_norm": 0.001312255859375, "learning_rate": 4.16148826818802e-05, "loss": 0.0005, "step": 39440 }, { "epoch": 2.5100209963733535, "grad_norm": 0.16796875, "learning_rate": 4.161134793889137e-05, "loss": 0.0001, "step": 39450 }, { "epoch": 2.510657250111344, "grad_norm": 0.01385498046875, "learning_rate": 4.1607813195902526e-05, "loss": 0.0002, "step": 39460 }, { "epoch": 2.511293503849335, "grad_norm": 6.09375, "learning_rate": 4.160427845291369e-05, "loss": 0.0114, "step": 39470 }, { "epoch": 2.5119297575873256, "grad_norm": 0.002197265625, "learning_rate": 4.1600743709924856e-05, "loss": 0.0001, "step": 39480 }, { "epoch": 2.5125660113253163, "grad_norm": 0.0078125, "learning_rate": 4.159720896693602e-05, "loss": 0.0042, "step": 39490 }, { "epoch": 2.513202265063307, "grad_norm": 0.02099609375, "learning_rate": 4.159367422394718e-05, "loss": 0.0003, "step": 39500 }, { "epoch": 2.5138385188012977, "grad_norm": 0.56640625, "learning_rate": 4.159013948095834e-05, "loss": 0.0007, "step": 39510 }, { "epoch": 2.5144747725392884, "grad_norm": 0.00579833984375, "learning_rate": 4.1586604737969504e-05, "loss": 0.0016, "step": 39520 }, { "epoch": 2.515111026277279, "grad_norm": 0.0135498046875, "learning_rate": 4.158306999498067e-05, "loss": 0.0002, "step": 39530 }, { "epoch": 2.51574728001527, "grad_norm": 0.0308837890625, "learning_rate": 4.157953525199183e-05, "loss": 0.0003, "step": 39540 }, { "epoch": 2.516383533753261, "grad_norm": 0.11328125, "learning_rate": 4.157600050900299e-05, "loss": 0.0013, "step": 39550 }, { "epoch": 2.5170197874912517, "grad_norm": 0.30859375, "learning_rate": 4.157246576601416e-05, "loss": 0.0002, "step": 39560 }, { "epoch": 2.5176560412292424, "grad_norm": 0.00885009765625, "learning_rate": 4.156893102302532e-05, "loss": 0.0001, "step": 39570 }, { "epoch": 2.518292294967233, "grad_norm": 0.0986328125, "learning_rate": 4.156539628003648e-05, "loss": 0.0002, "step": 39580 }, { "epoch": 2.5189285487052238, "grad_norm": 0.0003376007080078125, "learning_rate": 4.156186153704764e-05, "loss": 0.0001, "step": 39590 }, { "epoch": 2.5195648024432145, "grad_norm": 0.0022125244140625, "learning_rate": 4.1558326794058805e-05, "loss": 0.0007, "step": 39600 }, { "epoch": 2.520201056181205, "grad_norm": 0.006256103515625, "learning_rate": 4.155479205106997e-05, "loss": 0.0004, "step": 39610 }, { "epoch": 2.520837309919196, "grad_norm": 0.00836181640625, "learning_rate": 4.1551257308081135e-05, "loss": 0.0001, "step": 39620 }, { "epoch": 2.5214735636571866, "grad_norm": 0.031005859375, "learning_rate": 4.1547722565092293e-05, "loss": 0.0003, "step": 39630 }, { "epoch": 2.5221098173951773, "grad_norm": 0.006622314453125, "learning_rate": 4.154418782210346e-05, "loss": 0.0001, "step": 39640 }, { "epoch": 2.522746071133168, "grad_norm": 0.11279296875, "learning_rate": 4.1540653079114624e-05, "loss": 0.0004, "step": 39650 }, { "epoch": 2.5233823248711587, "grad_norm": 0.0263671875, "learning_rate": 4.153711833612578e-05, "loss": 0.0002, "step": 39660 }, { "epoch": 2.5240185786091494, "grad_norm": 0.00396728515625, "learning_rate": 4.153358359313694e-05, "loss": 0.0001, "step": 39670 }, { "epoch": 2.52465483234714, "grad_norm": 1.9609375, "learning_rate": 4.1530048850148106e-05, "loss": 0.0009, "step": 39680 }, { "epoch": 2.5252910860851308, "grad_norm": 0.026611328125, "learning_rate": 4.152651410715927e-05, "loss": 0.0001, "step": 39690 }, { "epoch": 2.5259273398231215, "grad_norm": 0.01544189453125, "learning_rate": 4.1522979364170436e-05, "loss": 0.0007, "step": 39700 }, { "epoch": 2.526563593561112, "grad_norm": 0.00872802734375, "learning_rate": 4.1519444621181595e-05, "loss": 0.001, "step": 39710 }, { "epoch": 2.527199847299103, "grad_norm": 0.0008697509765625, "learning_rate": 4.151590987819276e-05, "loss": 0.0021, "step": 39720 }, { "epoch": 2.5278361010370936, "grad_norm": 0.0830078125, "learning_rate": 4.1512375135203925e-05, "loss": 0.0001, "step": 39730 }, { "epoch": 2.5284723547750843, "grad_norm": 0.00933837890625, "learning_rate": 4.1508840392215083e-05, "loss": 0.0004, "step": 39740 }, { "epoch": 2.529108608513075, "grad_norm": 0.003936767578125, "learning_rate": 4.150530564922624e-05, "loss": 0.0002, "step": 39750 }, { "epoch": 2.5297448622510657, "grad_norm": 2.71875, "learning_rate": 4.150177090623741e-05, "loss": 0.0085, "step": 39760 }, { "epoch": 2.5303811159890564, "grad_norm": 0.0023956298828125, "learning_rate": 4.149823616324857e-05, "loss": 0.0001, "step": 39770 }, { "epoch": 2.531017369727047, "grad_norm": 0.28125, "learning_rate": 4.149470142025974e-05, "loss": 0.0002, "step": 39780 }, { "epoch": 2.5316536234650378, "grad_norm": 0.026123046875, "learning_rate": 4.1491166677270896e-05, "loss": 0.0002, "step": 39790 }, { "epoch": 2.5322898772030284, "grad_norm": 0.018310546875, "learning_rate": 4.148763193428206e-05, "loss": 0.0017, "step": 39800 }, { "epoch": 2.532926130941019, "grad_norm": 0.027587890625, "learning_rate": 4.1484097191293226e-05, "loss": 0.0013, "step": 39810 }, { "epoch": 2.53356238467901, "grad_norm": 0.00958251953125, "learning_rate": 4.1480562448304385e-05, "loss": 0.0006, "step": 39820 }, { "epoch": 2.5341986384170005, "grad_norm": 0.0098876953125, "learning_rate": 4.147702770531555e-05, "loss": 0.0001, "step": 39830 }, { "epoch": 2.5348348921549912, "grad_norm": 0.055419921875, "learning_rate": 4.147349296232671e-05, "loss": 0.0044, "step": 39840 }, { "epoch": 2.5354711458929824, "grad_norm": 0.00469970703125, "learning_rate": 4.146995821933787e-05, "loss": 0.0001, "step": 39850 }, { "epoch": 2.536107399630973, "grad_norm": 0.00274658203125, "learning_rate": 4.146642347634904e-05, "loss": 0.0069, "step": 39860 }, { "epoch": 2.536743653368964, "grad_norm": 0.08154296875, "learning_rate": 4.14628887333602e-05, "loss": 0.0114, "step": 39870 }, { "epoch": 2.5373799071069545, "grad_norm": 0.07470703125, "learning_rate": 4.145935399037136e-05, "loss": 0.0018, "step": 39880 }, { "epoch": 2.538016160844945, "grad_norm": 0.703125, "learning_rate": 4.145581924738253e-05, "loss": 0.0017, "step": 39890 }, { "epoch": 2.538652414582936, "grad_norm": 0.01104736328125, "learning_rate": 4.145228450439369e-05, "loss": 0.0008, "step": 39900 }, { "epoch": 2.5392886683209266, "grad_norm": 0.000701904296875, "learning_rate": 4.144874976140485e-05, "loss": 0.0003, "step": 39910 }, { "epoch": 2.5399249220589173, "grad_norm": 0.0023651123046875, "learning_rate": 4.144521501841601e-05, "loss": 0.0037, "step": 39920 }, { "epoch": 2.540561175796908, "grad_norm": 0.06591796875, "learning_rate": 4.1441680275427175e-05, "loss": 0.0021, "step": 39930 }, { "epoch": 2.5411974295348987, "grad_norm": 0.011962890625, "learning_rate": 4.143814553243834e-05, "loss": 0.0004, "step": 39940 }, { "epoch": 2.5418336832728894, "grad_norm": 0.00958251953125, "learning_rate": 4.1434610789449505e-05, "loss": 0.0005, "step": 39950 }, { "epoch": 2.54246993701088, "grad_norm": 0.0439453125, "learning_rate": 4.143107604646066e-05, "loss": 0.0002, "step": 39960 }, { "epoch": 2.543106190748871, "grad_norm": 0.58203125, "learning_rate": 4.142754130347183e-05, "loss": 0.0007, "step": 39970 }, { "epoch": 2.5437424444868615, "grad_norm": 0.1474609375, "learning_rate": 4.1424006560482994e-05, "loss": 0.0002, "step": 39980 }, { "epoch": 2.544378698224852, "grad_norm": 0.09423828125, "learning_rate": 4.142047181749415e-05, "loss": 0.0002, "step": 39990 }, { "epoch": 2.545014951962843, "grad_norm": 0.00286865234375, "learning_rate": 4.141693707450531e-05, "loss": 0.0003, "step": 40000 }, { "epoch": 2.5456512057008336, "grad_norm": 0.01055908203125, "learning_rate": 4.1413402331516476e-05, "loss": 0.0004, "step": 40010 }, { "epoch": 2.5462874594388243, "grad_norm": 0.001220703125, "learning_rate": 4.140986758852764e-05, "loss": 0.0002, "step": 40020 }, { "epoch": 2.546923713176815, "grad_norm": 0.01519775390625, "learning_rate": 4.1406332845538806e-05, "loss": 0.005, "step": 40030 }, { "epoch": 2.5475599669148057, "grad_norm": 0.00186920166015625, "learning_rate": 4.1402798102549964e-05, "loss": 0.0022, "step": 40040 }, { "epoch": 2.5481962206527964, "grad_norm": 0.029296875, "learning_rate": 4.139926335956113e-05, "loss": 0.0001, "step": 40050 }, { "epoch": 2.548832474390787, "grad_norm": 0.0118408203125, "learning_rate": 4.1395728616572295e-05, "loss": 0.0001, "step": 40060 }, { "epoch": 2.549468728128778, "grad_norm": 0.0537109375, "learning_rate": 4.139219387358345e-05, "loss": 0.0038, "step": 40070 }, { "epoch": 2.5501049818667685, "grad_norm": 0.004730224609375, "learning_rate": 4.138865913059461e-05, "loss": 0.0008, "step": 40080 }, { "epoch": 2.550741235604759, "grad_norm": 0.03271484375, "learning_rate": 4.138512438760578e-05, "loss": 0.0001, "step": 40090 }, { "epoch": 2.55137748934275, "grad_norm": 0.130859375, "learning_rate": 4.138158964461694e-05, "loss": 0.0001, "step": 40100 }, { "epoch": 2.5520137430807406, "grad_norm": 0.0439453125, "learning_rate": 4.137805490162811e-05, "loss": 0.0032, "step": 40110 }, { "epoch": 2.5526499968187313, "grad_norm": 1.90625, "learning_rate": 4.1374520158639266e-05, "loss": 0.0007, "step": 40120 }, { "epoch": 2.553286250556722, "grad_norm": 0.0030059814453125, "learning_rate": 4.137098541565043e-05, "loss": 0.0002, "step": 40130 }, { "epoch": 2.5539225042947127, "grad_norm": 0.00946044921875, "learning_rate": 4.1367450672661596e-05, "loss": 0.0002, "step": 40140 }, { "epoch": 2.5545587580327034, "grad_norm": 0.031982421875, "learning_rate": 4.1363915929672754e-05, "loss": 0.0001, "step": 40150 }, { "epoch": 2.555195011770694, "grad_norm": 0.53125, "learning_rate": 4.136038118668392e-05, "loss": 0.0004, "step": 40160 }, { "epoch": 2.555831265508685, "grad_norm": 0.07080078125, "learning_rate": 4.135684644369508e-05, "loss": 0.0015, "step": 40170 }, { "epoch": 2.5564675192466755, "grad_norm": 0.016357421875, "learning_rate": 4.135331170070624e-05, "loss": 0.0005, "step": 40180 }, { "epoch": 2.557103772984666, "grad_norm": 0.036376953125, "learning_rate": 4.134977695771741e-05, "loss": 0.0003, "step": 40190 }, { "epoch": 2.557740026722657, "grad_norm": 0.0003299713134765625, "learning_rate": 4.134624221472857e-05, "loss": 0.0037, "step": 40200 }, { "epoch": 2.5583762804606476, "grad_norm": 1.1875, "learning_rate": 4.134270747173973e-05, "loss": 0.0021, "step": 40210 }, { "epoch": 2.5590125341986383, "grad_norm": 0.04248046875, "learning_rate": 4.13391727287509e-05, "loss": 0.0002, "step": 40220 }, { "epoch": 2.559648787936629, "grad_norm": 0.0081787109375, "learning_rate": 4.1335637985762056e-05, "loss": 0.0001, "step": 40230 }, { "epoch": 2.5602850416746197, "grad_norm": 0.02978515625, "learning_rate": 4.133210324277322e-05, "loss": 0.0021, "step": 40240 }, { "epoch": 2.5609212954126104, "grad_norm": 0.005950927734375, "learning_rate": 4.132856849978438e-05, "loss": 0.0002, "step": 40250 }, { "epoch": 2.561557549150601, "grad_norm": 0.193359375, "learning_rate": 4.1325033756795544e-05, "loss": 0.0132, "step": 40260 }, { "epoch": 2.5621938028885918, "grad_norm": 0.392578125, "learning_rate": 4.132149901380671e-05, "loss": 0.0109, "step": 40270 }, { "epoch": 2.5628300566265825, "grad_norm": 0.002777099609375, "learning_rate": 4.1317964270817875e-05, "loss": 0.0012, "step": 40280 }, { "epoch": 2.563466310364573, "grad_norm": 0.0166015625, "learning_rate": 4.131442952782903e-05, "loss": 0.0005, "step": 40290 }, { "epoch": 2.564102564102564, "grad_norm": 0.0012359619140625, "learning_rate": 4.13108947848402e-05, "loss": 0.0001, "step": 40300 }, { "epoch": 2.5647388178405546, "grad_norm": 0.0205078125, "learning_rate": 4.130736004185136e-05, "loss": 0.0004, "step": 40310 }, { "epoch": 2.5653750715785453, "grad_norm": 0.0045166015625, "learning_rate": 4.130382529886252e-05, "loss": 0.0001, "step": 40320 }, { "epoch": 2.566011325316536, "grad_norm": 0.05322265625, "learning_rate": 4.130029055587368e-05, "loss": 0.0006, "step": 40330 }, { "epoch": 2.5666475790545267, "grad_norm": 0.034912109375, "learning_rate": 4.1296755812884845e-05, "loss": 0.0006, "step": 40340 }, { "epoch": 2.567283832792518, "grad_norm": 0.03857421875, "learning_rate": 4.129322106989601e-05, "loss": 0.0004, "step": 40350 }, { "epoch": 2.5679200865305085, "grad_norm": 0.024169921875, "learning_rate": 4.1289686326907176e-05, "loss": 0.0002, "step": 40360 }, { "epoch": 2.568556340268499, "grad_norm": 0.279296875, "learning_rate": 4.1286151583918334e-05, "loss": 0.0004, "step": 40370 }, { "epoch": 2.56919259400649, "grad_norm": 0.0272216796875, "learning_rate": 4.12826168409295e-05, "loss": 0.0005, "step": 40380 }, { "epoch": 2.5698288477444806, "grad_norm": 0.040283203125, "learning_rate": 4.1279082097940665e-05, "loss": 0.0003, "step": 40390 }, { "epoch": 2.5704651014824713, "grad_norm": 0.06005859375, "learning_rate": 4.127554735495182e-05, "loss": 0.0001, "step": 40400 }, { "epoch": 2.571101355220462, "grad_norm": 0.220703125, "learning_rate": 4.127201261196298e-05, "loss": 0.0017, "step": 40410 }, { "epoch": 2.5717376089584527, "grad_norm": 0.01025390625, "learning_rate": 4.126847786897415e-05, "loss": 0.0005, "step": 40420 }, { "epoch": 2.5723738626964434, "grad_norm": 0.003570556640625, "learning_rate": 4.126494312598531e-05, "loss": 0.0041, "step": 40430 }, { "epoch": 2.573010116434434, "grad_norm": 0.002105712890625, "learning_rate": 4.126140838299648e-05, "loss": 0.0003, "step": 40440 }, { "epoch": 2.573646370172425, "grad_norm": 0.00927734375, "learning_rate": 4.1257873640007635e-05, "loss": 0.0003, "step": 40450 }, { "epoch": 2.5742826239104155, "grad_norm": 0.005126953125, "learning_rate": 4.12543388970188e-05, "loss": 0.0002, "step": 40460 }, { "epoch": 2.574918877648406, "grad_norm": 0.00726318359375, "learning_rate": 4.1250804154029966e-05, "loss": 0.0016, "step": 40470 }, { "epoch": 2.575555131386397, "grad_norm": 0.0185546875, "learning_rate": 4.1247269411041124e-05, "loss": 0.0001, "step": 40480 }, { "epoch": 2.5761913851243876, "grad_norm": 1.7734375, "learning_rate": 4.124373466805229e-05, "loss": 0.0031, "step": 40490 }, { "epoch": 2.5768276388623783, "grad_norm": 0.00075531005859375, "learning_rate": 4.124019992506345e-05, "loss": 0.0001, "step": 40500 }, { "epoch": 2.577463892600369, "grad_norm": 0.00543212890625, "learning_rate": 4.123666518207461e-05, "loss": 0.0001, "step": 40510 }, { "epoch": 2.5781001463383597, "grad_norm": 0.052734375, "learning_rate": 4.123313043908578e-05, "loss": 0.0007, "step": 40520 }, { "epoch": 2.5787364000763504, "grad_norm": 0.09765625, "learning_rate": 4.1229595696096937e-05, "loss": 0.0004, "step": 40530 }, { "epoch": 2.579372653814341, "grad_norm": 0.02294921875, "learning_rate": 4.12260609531081e-05, "loss": 0.0001, "step": 40540 }, { "epoch": 2.580008907552332, "grad_norm": 0.000446319580078125, "learning_rate": 4.122252621011927e-05, "loss": 0.0001, "step": 40550 }, { "epoch": 2.5806451612903225, "grad_norm": 0.007293701171875, "learning_rate": 4.1218991467130425e-05, "loss": 0.0001, "step": 40560 }, { "epoch": 2.581281415028313, "grad_norm": 0.004364013671875, "learning_rate": 4.121545672414159e-05, "loss": 0.0002, "step": 40570 }, { "epoch": 2.581917668766304, "grad_norm": 0.01153564453125, "learning_rate": 4.121192198115275e-05, "loss": 0.0001, "step": 40580 }, { "epoch": 2.5825539225042946, "grad_norm": 0.00064849853515625, "learning_rate": 4.1208387238163914e-05, "loss": 0.0001, "step": 40590 }, { "epoch": 2.5831901762422853, "grad_norm": 0.0228271484375, "learning_rate": 4.120485249517508e-05, "loss": 0.0003, "step": 40600 }, { "epoch": 2.583826429980276, "grad_norm": 0.1591796875, "learning_rate": 4.1201317752186244e-05, "loss": 0.0003, "step": 40610 }, { "epoch": 2.5844626837182667, "grad_norm": 0.0242919921875, "learning_rate": 4.11977830091974e-05, "loss": 0.0008, "step": 40620 }, { "epoch": 2.5850989374562574, "grad_norm": 0.00299072265625, "learning_rate": 4.119424826620857e-05, "loss": 0.0004, "step": 40630 }, { "epoch": 2.585735191194248, "grad_norm": 0.0162353515625, "learning_rate": 4.1190713523219726e-05, "loss": 0.0001, "step": 40640 }, { "epoch": 2.5863714449322392, "grad_norm": 0.015625, "learning_rate": 4.118717878023089e-05, "loss": 0.0004, "step": 40650 }, { "epoch": 2.58700769867023, "grad_norm": 0.00154876708984375, "learning_rate": 4.118364403724205e-05, "loss": 0.0002, "step": 40660 }, { "epoch": 2.5876439524082206, "grad_norm": 0.00146484375, "learning_rate": 4.1180109294253215e-05, "loss": 0.0004, "step": 40670 }, { "epoch": 2.5882802061462113, "grad_norm": 0.2001953125, "learning_rate": 4.117657455126438e-05, "loss": 0.0033, "step": 40680 }, { "epoch": 2.588916459884202, "grad_norm": 0.003326416015625, "learning_rate": 4.1173039808275546e-05, "loss": 0.0002, "step": 40690 }, { "epoch": 2.5895527136221927, "grad_norm": 0.00836181640625, "learning_rate": 4.1169505065286704e-05, "loss": 0.0004, "step": 40700 }, { "epoch": 2.5901889673601834, "grad_norm": 0.00494384765625, "learning_rate": 4.116597032229787e-05, "loss": 0.0001, "step": 40710 }, { "epoch": 2.590825221098174, "grad_norm": 0.09619140625, "learning_rate": 4.116243557930903e-05, "loss": 0.0002, "step": 40720 }, { "epoch": 2.591461474836165, "grad_norm": 0.0007781982421875, "learning_rate": 4.115890083632019e-05, "loss": 0.0012, "step": 40730 }, { "epoch": 2.5920977285741555, "grad_norm": 0.03759765625, "learning_rate": 4.115536609333135e-05, "loss": 0.0008, "step": 40740 }, { "epoch": 2.5927339823121462, "grad_norm": 0.79296875, "learning_rate": 4.1151831350342516e-05, "loss": 0.0006, "step": 40750 }, { "epoch": 2.593370236050137, "grad_norm": 0.01373291015625, "learning_rate": 4.114829660735368e-05, "loss": 0.0015, "step": 40760 }, { "epoch": 2.5940064897881276, "grad_norm": 0.0030975341796875, "learning_rate": 4.114476186436485e-05, "loss": 0.002, "step": 40770 }, { "epoch": 2.5946427435261183, "grad_norm": 0.0130615234375, "learning_rate": 4.1141227121376005e-05, "loss": 0.0007, "step": 40780 }, { "epoch": 2.595278997264109, "grad_norm": 8.1875, "learning_rate": 4.113769237838717e-05, "loss": 0.0063, "step": 40790 }, { "epoch": 2.5959152510020997, "grad_norm": 0.004364013671875, "learning_rate": 4.1134157635398336e-05, "loss": 0.0019, "step": 40800 }, { "epoch": 2.5965515047400904, "grad_norm": 0.0089111328125, "learning_rate": 4.1130622892409494e-05, "loss": 0.0001, "step": 40810 }, { "epoch": 2.597187758478081, "grad_norm": 0.06689453125, "learning_rate": 4.112708814942066e-05, "loss": 0.0006, "step": 40820 }, { "epoch": 2.597824012216072, "grad_norm": 0.005523681640625, "learning_rate": 4.112355340643182e-05, "loss": 0.0003, "step": 40830 }, { "epoch": 2.5984602659540625, "grad_norm": 0.002655029296875, "learning_rate": 4.112001866344298e-05, "loss": 0.0002, "step": 40840 }, { "epoch": 2.5990965196920532, "grad_norm": 0.0167236328125, "learning_rate": 4.111648392045415e-05, "loss": 0.0005, "step": 40850 }, { "epoch": 2.599732773430044, "grad_norm": 0.00347900390625, "learning_rate": 4.1112949177465306e-05, "loss": 0.0002, "step": 40860 }, { "epoch": 2.6003690271680346, "grad_norm": 0.0020904541015625, "learning_rate": 4.110941443447647e-05, "loss": 0.0033, "step": 40870 }, { "epoch": 2.6010052809060253, "grad_norm": 0.5859375, "learning_rate": 4.110587969148764e-05, "loss": 0.0007, "step": 40880 }, { "epoch": 2.601641534644016, "grad_norm": 0.01275634765625, "learning_rate": 4.1102344948498795e-05, "loss": 0.0029, "step": 40890 }, { "epoch": 2.6022777883820067, "grad_norm": 0.005401611328125, "learning_rate": 4.109881020550996e-05, "loss": 0.0011, "step": 40900 }, { "epoch": 2.6029140421199974, "grad_norm": 4.28125, "learning_rate": 4.109527546252112e-05, "loss": 0.0027, "step": 40910 }, { "epoch": 2.603550295857988, "grad_norm": 0.00131988525390625, "learning_rate": 4.1091740719532284e-05, "loss": 0.0069, "step": 40920 }, { "epoch": 2.604186549595979, "grad_norm": 0.10302734375, "learning_rate": 4.108820597654345e-05, "loss": 0.0019, "step": 40930 }, { "epoch": 2.6048228033339695, "grad_norm": 0.035400390625, "learning_rate": 4.1084671233554614e-05, "loss": 0.0002, "step": 40940 }, { "epoch": 2.6054590570719602, "grad_norm": 0.007568359375, "learning_rate": 4.108113649056577e-05, "loss": 0.0011, "step": 40950 }, { "epoch": 2.606095310809951, "grad_norm": 0.0169677734375, "learning_rate": 4.107760174757694e-05, "loss": 0.0005, "step": 40960 }, { "epoch": 2.6067315645479416, "grad_norm": 0.0267333984375, "learning_rate": 4.1074067004588096e-05, "loss": 0.0001, "step": 40970 }, { "epoch": 2.6073678182859323, "grad_norm": 1.3046875, "learning_rate": 4.107053226159926e-05, "loss": 0.0006, "step": 40980 }, { "epoch": 2.608004072023923, "grad_norm": 3.171875, "learning_rate": 4.106699751861042e-05, "loss": 0.0056, "step": 40990 }, { "epoch": 2.6086403257619137, "grad_norm": 0.6328125, "learning_rate": 4.1063462775621585e-05, "loss": 0.0017, "step": 41000 }, { "epoch": 2.6092765794999044, "grad_norm": 0.00113677978515625, "learning_rate": 4.105992803263275e-05, "loss": 0.0005, "step": 41010 }, { "epoch": 2.609912833237895, "grad_norm": 0.0040283203125, "learning_rate": 4.1056393289643915e-05, "loss": 0.0013, "step": 41020 }, { "epoch": 2.610549086975886, "grad_norm": 0.001678466796875, "learning_rate": 4.1052858546655074e-05, "loss": 0.0003, "step": 41030 }, { "epoch": 2.6111853407138765, "grad_norm": 0.000858306884765625, "learning_rate": 4.104932380366624e-05, "loss": 0.0007, "step": 41040 }, { "epoch": 2.6118215944518672, "grad_norm": 0.0257568359375, "learning_rate": 4.10457890606774e-05, "loss": 0.0013, "step": 41050 }, { "epoch": 2.612457848189858, "grad_norm": 0.3125, "learning_rate": 4.104225431768856e-05, "loss": 0.0003, "step": 41060 }, { "epoch": 2.6130941019278486, "grad_norm": 0.005340576171875, "learning_rate": 4.103871957469972e-05, "loss": 0.0002, "step": 41070 }, { "epoch": 2.6137303556658393, "grad_norm": 0.01171875, "learning_rate": 4.1035184831710886e-05, "loss": 0.0004, "step": 41080 }, { "epoch": 2.61436660940383, "grad_norm": 0.0303955078125, "learning_rate": 4.103165008872205e-05, "loss": 0.0007, "step": 41090 }, { "epoch": 2.6150028631418207, "grad_norm": 0.189453125, "learning_rate": 4.102811534573322e-05, "loss": 0.0004, "step": 41100 }, { "epoch": 2.6156391168798114, "grad_norm": 0.01007080078125, "learning_rate": 4.1024580602744375e-05, "loss": 0.0128, "step": 41110 }, { "epoch": 2.616275370617802, "grad_norm": 0.0186767578125, "learning_rate": 4.102104585975554e-05, "loss": 0.0004, "step": 41120 }, { "epoch": 2.616911624355793, "grad_norm": 4.59375, "learning_rate": 4.10175111167667e-05, "loss": 0.0037, "step": 41130 }, { "epoch": 2.6175478780937835, "grad_norm": 0.041259765625, "learning_rate": 4.1013976373777864e-05, "loss": 0.0003, "step": 41140 }, { "epoch": 2.6181841318317747, "grad_norm": 0.00157928466796875, "learning_rate": 4.101044163078903e-05, "loss": 0.0031, "step": 41150 }, { "epoch": 2.6188203855697654, "grad_norm": 0.6171875, "learning_rate": 4.100690688780019e-05, "loss": 0.0004, "step": 41160 }, { "epoch": 2.619456639307756, "grad_norm": 0.03759765625, "learning_rate": 4.100337214481135e-05, "loss": 0.0002, "step": 41170 }, { "epoch": 2.6200928930457468, "grad_norm": 0.01513671875, "learning_rate": 4.099983740182252e-05, "loss": 0.0002, "step": 41180 }, { "epoch": 2.6207291467837375, "grad_norm": 0.005218505859375, "learning_rate": 4.099630265883368e-05, "loss": 0.0001, "step": 41190 }, { "epoch": 2.621365400521728, "grad_norm": 0.01007080078125, "learning_rate": 4.099276791584484e-05, "loss": 0.0017, "step": 41200 }, { "epoch": 2.622001654259719, "grad_norm": 0.00732421875, "learning_rate": 4.0989233172856e-05, "loss": 0.0001, "step": 41210 }, { "epoch": 2.6226379079977096, "grad_norm": 0.484375, "learning_rate": 4.0985698429867165e-05, "loss": 0.0006, "step": 41220 }, { "epoch": 2.6232741617357003, "grad_norm": 0.00933837890625, "learning_rate": 4.098216368687833e-05, "loss": 0.0007, "step": 41230 }, { "epoch": 2.623910415473691, "grad_norm": 0.015380859375, "learning_rate": 4.097862894388949e-05, "loss": 0.0163, "step": 41240 }, { "epoch": 2.6245466692116817, "grad_norm": 0.0022430419921875, "learning_rate": 4.0975094200900654e-05, "loss": 0.0002, "step": 41250 }, { "epoch": 2.6251829229496724, "grad_norm": 0.05859375, "learning_rate": 4.097155945791182e-05, "loss": 0.0011, "step": 41260 }, { "epoch": 2.625819176687663, "grad_norm": 0.064453125, "learning_rate": 4.0968024714922984e-05, "loss": 0.0016, "step": 41270 }, { "epoch": 2.6264554304256538, "grad_norm": 0.000820159912109375, "learning_rate": 4.096448997193414e-05, "loss": 0.0002, "step": 41280 }, { "epoch": 2.6270916841636445, "grad_norm": 0.265625, "learning_rate": 4.096095522894531e-05, "loss": 0.0002, "step": 41290 }, { "epoch": 2.627727937901635, "grad_norm": 0.000896453857421875, "learning_rate": 4.0957420485956466e-05, "loss": 0.0014, "step": 41300 }, { "epoch": 2.628364191639626, "grad_norm": 0.00750732421875, "learning_rate": 4.095388574296763e-05, "loss": 0.0002, "step": 41310 }, { "epoch": 2.6290004453776166, "grad_norm": 0.12060546875, "learning_rate": 4.095035099997879e-05, "loss": 0.0023, "step": 41320 }, { "epoch": 2.6296366991156073, "grad_norm": 0.0027313232421875, "learning_rate": 4.0946816256989955e-05, "loss": 0.0001, "step": 41330 }, { "epoch": 2.630272952853598, "grad_norm": 0.765625, "learning_rate": 4.094328151400112e-05, "loss": 0.0029, "step": 41340 }, { "epoch": 2.6309092065915887, "grad_norm": 0.0074462890625, "learning_rate": 4.0939746771012285e-05, "loss": 0.0037, "step": 41350 }, { "epoch": 2.6315454603295794, "grad_norm": 0.01129150390625, "learning_rate": 4.0936212028023444e-05, "loss": 0.0004, "step": 41360 }, { "epoch": 2.63218171406757, "grad_norm": 0.00130462646484375, "learning_rate": 4.093267728503461e-05, "loss": 0.0005, "step": 41370 }, { "epoch": 2.6328179678055608, "grad_norm": 0.064453125, "learning_rate": 4.092914254204577e-05, "loss": 0.0001, "step": 41380 }, { "epoch": 2.6334542215435515, "grad_norm": 0.002044677734375, "learning_rate": 4.092560779905693e-05, "loss": 0.0001, "step": 41390 }, { "epoch": 2.634090475281542, "grad_norm": 0.021484375, "learning_rate": 4.09220730560681e-05, "loss": 0.0165, "step": 41400 }, { "epoch": 2.634726729019533, "grad_norm": 0.0322265625, "learning_rate": 4.0918538313079256e-05, "loss": 0.002, "step": 41410 }, { "epoch": 2.6353629827575236, "grad_norm": 0.04296875, "learning_rate": 4.091500357009042e-05, "loss": 0.0003, "step": 41420 }, { "epoch": 2.6359992364955143, "grad_norm": 0.01611328125, "learning_rate": 4.0911468827101586e-05, "loss": 0.0006, "step": 41430 }, { "epoch": 2.636635490233505, "grad_norm": 0.0308837890625, "learning_rate": 4.0907934084112745e-05, "loss": 0.0002, "step": 41440 }, { "epoch": 2.637271743971496, "grad_norm": 0.042724609375, "learning_rate": 4.090439934112391e-05, "loss": 0.0003, "step": 41450 }, { "epoch": 2.637907997709487, "grad_norm": 0.002105712890625, "learning_rate": 4.090086459813507e-05, "loss": 0.0006, "step": 41460 }, { "epoch": 2.6385442514474775, "grad_norm": 0.00738525390625, "learning_rate": 4.0897329855146234e-05, "loss": 0.0001, "step": 41470 }, { "epoch": 2.639180505185468, "grad_norm": 0.16015625, "learning_rate": 4.08937951121574e-05, "loss": 0.0003, "step": 41480 }, { "epoch": 2.639816758923459, "grad_norm": 0.0380859375, "learning_rate": 4.089026036916856e-05, "loss": 0.0003, "step": 41490 }, { "epoch": 2.6404530126614496, "grad_norm": 0.0157470703125, "learning_rate": 4.088672562617972e-05, "loss": 0.0033, "step": 41500 }, { "epoch": 2.6410892663994403, "grad_norm": 0.0026092529296875, "learning_rate": 4.088319088319089e-05, "loss": 0.0002, "step": 41510 }, { "epoch": 2.641725520137431, "grad_norm": 0.0830078125, "learning_rate": 4.087965614020205e-05, "loss": 0.0029, "step": 41520 }, { "epoch": 2.6423617738754217, "grad_norm": 0.0177001953125, "learning_rate": 4.087612139721321e-05, "loss": 0.0022, "step": 41530 }, { "epoch": 2.6429980276134124, "grad_norm": 0.0986328125, "learning_rate": 4.087258665422437e-05, "loss": 0.0006, "step": 41540 }, { "epoch": 2.643634281351403, "grad_norm": 0.08935546875, "learning_rate": 4.0869051911235535e-05, "loss": 0.0003, "step": 41550 }, { "epoch": 2.644270535089394, "grad_norm": 0.0030517578125, "learning_rate": 4.08655171682467e-05, "loss": 0.0011, "step": 41560 }, { "epoch": 2.6449067888273845, "grad_norm": 0.01190185546875, "learning_rate": 4.086198242525786e-05, "loss": 0.0012, "step": 41570 }, { "epoch": 2.645543042565375, "grad_norm": 0.012939453125, "learning_rate": 4.0858447682269024e-05, "loss": 0.0003, "step": 41580 }, { "epoch": 2.646179296303366, "grad_norm": 0.08251953125, "learning_rate": 4.085491293928019e-05, "loss": 0.0002, "step": 41590 }, { "epoch": 2.6468155500413566, "grad_norm": 0.0011749267578125, "learning_rate": 4.0851378196291354e-05, "loss": 0.0002, "step": 41600 }, { "epoch": 2.6474518037793473, "grad_norm": 0.0029449462890625, "learning_rate": 4.084784345330251e-05, "loss": 0.0003, "step": 41610 }, { "epoch": 2.648088057517338, "grad_norm": 0.0245361328125, "learning_rate": 4.084430871031367e-05, "loss": 0.0007, "step": 41620 }, { "epoch": 2.6487243112553287, "grad_norm": 0.007720947265625, "learning_rate": 4.0840773967324836e-05, "loss": 0.0007, "step": 41630 }, { "epoch": 2.6493605649933194, "grad_norm": 0.05859375, "learning_rate": 4.0837239224336e-05, "loss": 0.0023, "step": 41640 }, { "epoch": 2.64999681873131, "grad_norm": 0.0106201171875, "learning_rate": 4.083370448134716e-05, "loss": 0.0002, "step": 41650 }, { "epoch": 2.650633072469301, "grad_norm": 0.00099945068359375, "learning_rate": 4.0830169738358325e-05, "loss": 0.0037, "step": 41660 }, { "epoch": 2.6512693262072915, "grad_norm": 0.00921630859375, "learning_rate": 4.082663499536949e-05, "loss": 0.0002, "step": 41670 }, { "epoch": 2.651905579945282, "grad_norm": 0.0498046875, "learning_rate": 4.0823100252380655e-05, "loss": 0.0002, "step": 41680 }, { "epoch": 2.652541833683273, "grad_norm": 0.00165557861328125, "learning_rate": 4.0819565509391813e-05, "loss": 0.0002, "step": 41690 }, { "epoch": 2.6531780874212636, "grad_norm": 0.0062255859375, "learning_rate": 4.081603076640298e-05, "loss": 0.0005, "step": 41700 }, { "epoch": 2.6538143411592543, "grad_norm": 0.00262451171875, "learning_rate": 4.081249602341414e-05, "loss": 0.0154, "step": 41710 }, { "epoch": 2.654450594897245, "grad_norm": 0.0205078125, "learning_rate": 4.08089612804253e-05, "loss": 0.0002, "step": 41720 }, { "epoch": 2.6550868486352357, "grad_norm": 0.0224609375, "learning_rate": 4.080542653743647e-05, "loss": 0.0002, "step": 41730 }, { "epoch": 2.6557231023732264, "grad_norm": 0.012451171875, "learning_rate": 4.0801891794447626e-05, "loss": 0.0004, "step": 41740 }, { "epoch": 2.656359356111217, "grad_norm": 0.039306640625, "learning_rate": 4.079835705145879e-05, "loss": 0.0001, "step": 41750 }, { "epoch": 2.656995609849208, "grad_norm": 0.0037841796875, "learning_rate": 4.0794822308469956e-05, "loss": 0.0007, "step": 41760 }, { "epoch": 2.6576318635871985, "grad_norm": 0.064453125, "learning_rate": 4.0791287565481115e-05, "loss": 0.0004, "step": 41770 }, { "epoch": 2.658268117325189, "grad_norm": 0.039306640625, "learning_rate": 4.078775282249228e-05, "loss": 0.0001, "step": 41780 }, { "epoch": 2.65890437106318, "grad_norm": 0.004608154296875, "learning_rate": 4.078421807950344e-05, "loss": 0.0008, "step": 41790 }, { "epoch": 2.6595406248011706, "grad_norm": 0.005645751953125, "learning_rate": 4.0780683336514603e-05, "loss": 0.0001, "step": 41800 }, { "epoch": 2.6601768785391613, "grad_norm": 0.016357421875, "learning_rate": 4.077714859352577e-05, "loss": 0.0007, "step": 41810 }, { "epoch": 2.660813132277152, "grad_norm": 0.01422119140625, "learning_rate": 4.077361385053693e-05, "loss": 0.0019, "step": 41820 }, { "epoch": 2.6614493860151427, "grad_norm": 0.005523681640625, "learning_rate": 4.077007910754809e-05, "loss": 0.0014, "step": 41830 }, { "epoch": 2.6620856397531334, "grad_norm": 1.53125, "learning_rate": 4.076654436455926e-05, "loss": 0.0011, "step": 41840 }, { "epoch": 2.662721893491124, "grad_norm": 0.0042724609375, "learning_rate": 4.076300962157042e-05, "loss": 0.0002, "step": 41850 }, { "epoch": 2.663358147229115, "grad_norm": 0.000522613525390625, "learning_rate": 4.075947487858158e-05, "loss": 0.0006, "step": 41860 }, { "epoch": 2.6639944009671055, "grad_norm": 0.004119873046875, "learning_rate": 4.075594013559274e-05, "loss": 0.0002, "step": 41870 }, { "epoch": 2.664630654705096, "grad_norm": 0.00107574462890625, "learning_rate": 4.0752405392603905e-05, "loss": 0.0001, "step": 41880 }, { "epoch": 2.665266908443087, "grad_norm": 0.06982421875, "learning_rate": 4.074887064961507e-05, "loss": 0.0106, "step": 41890 }, { "epoch": 2.6659031621810776, "grad_norm": 0.271484375, "learning_rate": 4.074533590662623e-05, "loss": 0.0019, "step": 41900 }, { "epoch": 2.6665394159190683, "grad_norm": 0.00543212890625, "learning_rate": 4.074180116363739e-05, "loss": 0.0002, "step": 41910 }, { "epoch": 2.667175669657059, "grad_norm": 0.00872802734375, "learning_rate": 4.073826642064856e-05, "loss": 0.0001, "step": 41920 }, { "epoch": 2.6678119233950497, "grad_norm": 0.1474609375, "learning_rate": 4.0734731677659724e-05, "loss": 0.0028, "step": 41930 }, { "epoch": 2.6684481771330404, "grad_norm": 0.0076904296875, "learning_rate": 4.073119693467088e-05, "loss": 0.0053, "step": 41940 }, { "epoch": 2.6690844308710315, "grad_norm": 0.162109375, "learning_rate": 4.072766219168204e-05, "loss": 0.0025, "step": 41950 }, { "epoch": 2.6697206846090222, "grad_norm": 0.003814697265625, "learning_rate": 4.0724127448693206e-05, "loss": 0.0007, "step": 41960 }, { "epoch": 2.670356938347013, "grad_norm": 0.00787353515625, "learning_rate": 4.072059270570437e-05, "loss": 0.0001, "step": 41970 }, { "epoch": 2.6709931920850036, "grad_norm": 0.73046875, "learning_rate": 4.071705796271553e-05, "loss": 0.0011, "step": 41980 }, { "epoch": 2.6716294458229943, "grad_norm": 0.04931640625, "learning_rate": 4.0713523219726695e-05, "loss": 0.0136, "step": 41990 }, { "epoch": 2.672265699560985, "grad_norm": 0.042724609375, "learning_rate": 4.070998847673786e-05, "loss": 0.0004, "step": 42000 }, { "epoch": 2.6729019532989757, "grad_norm": 0.0133056640625, "learning_rate": 4.0706453733749025e-05, "loss": 0.0002, "step": 42010 }, { "epoch": 2.6735382070369664, "grad_norm": 0.0269775390625, "learning_rate": 4.070291899076018e-05, "loss": 0.0043, "step": 42020 }, { "epoch": 2.674174460774957, "grad_norm": 0.10205078125, "learning_rate": 4.069938424777134e-05, "loss": 0.0003, "step": 42030 }, { "epoch": 2.674810714512948, "grad_norm": 0.0019378662109375, "learning_rate": 4.069584950478251e-05, "loss": 0.0019, "step": 42040 }, { "epoch": 2.6754469682509385, "grad_norm": 0.004180908203125, "learning_rate": 4.069231476179367e-05, "loss": 0.0002, "step": 42050 }, { "epoch": 2.6760832219889292, "grad_norm": 0.0203857421875, "learning_rate": 4.068878001880484e-05, "loss": 0.0001, "step": 42060 }, { "epoch": 2.67671947572692, "grad_norm": 0.23828125, "learning_rate": 4.0685245275815996e-05, "loss": 0.0002, "step": 42070 }, { "epoch": 2.6773557294649106, "grad_norm": 0.27734375, "learning_rate": 4.068171053282716e-05, "loss": 0.0034, "step": 42080 }, { "epoch": 2.6779919832029013, "grad_norm": 0.1416015625, "learning_rate": 4.0678175789838326e-05, "loss": 0.0002, "step": 42090 }, { "epoch": 2.678628236940892, "grad_norm": 0.02392578125, "learning_rate": 4.0674641046849484e-05, "loss": 0.0002, "step": 42100 }, { "epoch": 2.6792644906788827, "grad_norm": 0.263671875, "learning_rate": 4.067110630386065e-05, "loss": 0.0007, "step": 42110 }, { "epoch": 2.6799007444168734, "grad_norm": 0.08544921875, "learning_rate": 4.066757156087181e-05, "loss": 0.0002, "step": 42120 }, { "epoch": 2.680536998154864, "grad_norm": 0.0162353515625, "learning_rate": 4.066403681788297e-05, "loss": 0.0001, "step": 42130 }, { "epoch": 2.681173251892855, "grad_norm": 0.0103759765625, "learning_rate": 4.066050207489414e-05, "loss": 0.0003, "step": 42140 }, { "epoch": 2.6818095056308455, "grad_norm": 0.00732421875, "learning_rate": 4.06569673319053e-05, "loss": 0.0003, "step": 42150 }, { "epoch": 2.6824457593688362, "grad_norm": 0.0211181640625, "learning_rate": 4.065343258891646e-05, "loss": 0.0004, "step": 42160 }, { "epoch": 2.683082013106827, "grad_norm": 0.0250244140625, "learning_rate": 4.064989784592763e-05, "loss": 0.0002, "step": 42170 }, { "epoch": 2.6837182668448176, "grad_norm": 0.035400390625, "learning_rate": 4.064636310293879e-05, "loss": 0.001, "step": 42180 }, { "epoch": 2.6843545205828083, "grad_norm": 0.007598876953125, "learning_rate": 4.064282835994995e-05, "loss": 0.0001, "step": 42190 }, { "epoch": 2.684990774320799, "grad_norm": 0.044677734375, "learning_rate": 4.063929361696111e-05, "loss": 0.0002, "step": 42200 }, { "epoch": 2.6856270280587897, "grad_norm": 0.00201416015625, "learning_rate": 4.0635758873972274e-05, "loss": 0.0001, "step": 42210 }, { "epoch": 2.6862632817967804, "grad_norm": 0.00958251953125, "learning_rate": 4.063222413098344e-05, "loss": 0.0001, "step": 42220 }, { "epoch": 2.686899535534771, "grad_norm": 0.0018310546875, "learning_rate": 4.06286893879946e-05, "loss": 0.0002, "step": 42230 }, { "epoch": 2.687535789272762, "grad_norm": 1.625, "learning_rate": 4.062515464500576e-05, "loss": 0.0011, "step": 42240 }, { "epoch": 2.688172043010753, "grad_norm": 0.09912109375, "learning_rate": 4.062161990201693e-05, "loss": 0.0006, "step": 42250 }, { "epoch": 2.6888082967487437, "grad_norm": 0.00160980224609375, "learning_rate": 4.0618085159028094e-05, "loss": 0.0003, "step": 42260 }, { "epoch": 2.6894445504867344, "grad_norm": 0.001129150390625, "learning_rate": 4.061455041603925e-05, "loss": 0.0028, "step": 42270 }, { "epoch": 2.690080804224725, "grad_norm": 0.0025634765625, "learning_rate": 4.061101567305041e-05, "loss": 0.0027, "step": 42280 }, { "epoch": 2.6907170579627158, "grad_norm": 0.020751953125, "learning_rate": 4.0607480930061576e-05, "loss": 0.0001, "step": 42290 }, { "epoch": 2.6913533117007065, "grad_norm": 0.009033203125, "learning_rate": 4.060394618707274e-05, "loss": 0.0006, "step": 42300 }, { "epoch": 2.691989565438697, "grad_norm": 0.0625, "learning_rate": 4.06004114440839e-05, "loss": 0.007, "step": 42310 }, { "epoch": 2.692625819176688, "grad_norm": 0.00494384765625, "learning_rate": 4.0596876701095064e-05, "loss": 0.0001, "step": 42320 }, { "epoch": 2.6932620729146786, "grad_norm": 0.0145263671875, "learning_rate": 4.059334195810623e-05, "loss": 0.0001, "step": 42330 }, { "epoch": 2.6938983266526693, "grad_norm": 0.000934600830078125, "learning_rate": 4.0589807215117395e-05, "loss": 0.0005, "step": 42340 }, { "epoch": 2.69453458039066, "grad_norm": 0.03466796875, "learning_rate": 4.058627247212855e-05, "loss": 0.0009, "step": 42350 }, { "epoch": 2.6951708341286507, "grad_norm": 0.036865234375, "learning_rate": 4.058273772913971e-05, "loss": 0.0008, "step": 42360 }, { "epoch": 2.6958070878666414, "grad_norm": 5.09375, "learning_rate": 4.057920298615088e-05, "loss": 0.003, "step": 42370 }, { "epoch": 2.696443341604632, "grad_norm": 0.00299072265625, "learning_rate": 4.057566824316204e-05, "loss": 0.0001, "step": 42380 }, { "epoch": 2.6970795953426228, "grad_norm": 0.0216064453125, "learning_rate": 4.057213350017321e-05, "loss": 0.0007, "step": 42390 }, { "epoch": 2.6977158490806135, "grad_norm": 0.00157928466796875, "learning_rate": 4.0568598757184365e-05, "loss": 0.0002, "step": 42400 }, { "epoch": 2.698352102818604, "grad_norm": 0.015380859375, "learning_rate": 4.056506401419553e-05, "loss": 0.0001, "step": 42410 }, { "epoch": 2.698988356556595, "grad_norm": 0.01202392578125, "learning_rate": 4.0561529271206696e-05, "loss": 0.0002, "step": 42420 }, { "epoch": 2.6996246102945856, "grad_norm": 0.000919342041015625, "learning_rate": 4.0557994528217854e-05, "loss": 0.0003, "step": 42430 }, { "epoch": 2.7002608640325763, "grad_norm": 5.1875, "learning_rate": 4.055445978522901e-05, "loss": 0.0092, "step": 42440 }, { "epoch": 2.700897117770567, "grad_norm": 0.125, "learning_rate": 4.055092504224018e-05, "loss": 0.0004, "step": 42450 }, { "epoch": 2.7015333715085577, "grad_norm": 0.006134033203125, "learning_rate": 4.054739029925134e-05, "loss": 0.0001, "step": 42460 }, { "epoch": 2.7021696252465484, "grad_norm": 0.2138671875, "learning_rate": 4.054385555626251e-05, "loss": 0.0003, "step": 42470 }, { "epoch": 2.702805878984539, "grad_norm": 0.00750732421875, "learning_rate": 4.054032081327367e-05, "loss": 0.0001, "step": 42480 }, { "epoch": 2.7034421327225298, "grad_norm": 0.01239013671875, "learning_rate": 4.053678607028483e-05, "loss": 0.0001, "step": 42490 }, { "epoch": 2.7040783864605205, "grad_norm": 0.0087890625, "learning_rate": 4.0533251327296e-05, "loss": 0.0225, "step": 42500 }, { "epoch": 2.704714640198511, "grad_norm": 0.0091552734375, "learning_rate": 4.052971658430716e-05, "loss": 0.0003, "step": 42510 }, { "epoch": 2.705350893936502, "grad_norm": 0.021728515625, "learning_rate": 4.0526181841318314e-05, "loss": 0.0027, "step": 42520 }, { "epoch": 2.7059871476744926, "grad_norm": 0.022216796875, "learning_rate": 4.052264709832948e-05, "loss": 0.0048, "step": 42530 }, { "epoch": 2.7066234014124833, "grad_norm": 0.000408172607421875, "learning_rate": 4.0519112355340644e-05, "loss": 0.0003, "step": 42540 }, { "epoch": 2.707259655150474, "grad_norm": 0.004150390625, "learning_rate": 4.051557761235181e-05, "loss": 0.0015, "step": 42550 }, { "epoch": 2.7078959088884647, "grad_norm": 0.03466796875, "learning_rate": 4.051204286936297e-05, "loss": 0.0061, "step": 42560 }, { "epoch": 2.7085321626264554, "grad_norm": 0.0031890869140625, "learning_rate": 4.050850812637413e-05, "loss": 0.0009, "step": 42570 }, { "epoch": 2.709168416364446, "grad_norm": 0.0693359375, "learning_rate": 4.05049733833853e-05, "loss": 0.004, "step": 42580 }, { "epoch": 2.7098046701024368, "grad_norm": 0.0028076171875, "learning_rate": 4.050143864039646e-05, "loss": 0.0001, "step": 42590 }, { "epoch": 2.7104409238404275, "grad_norm": 0.006683349609375, "learning_rate": 4.049790389740762e-05, "loss": 0.0083, "step": 42600 }, { "epoch": 2.711077177578418, "grad_norm": 0.0020904541015625, "learning_rate": 4.049436915441878e-05, "loss": 0.0002, "step": 42610 }, { "epoch": 2.711713431316409, "grad_norm": 0.0024566650390625, "learning_rate": 4.0490834411429945e-05, "loss": 0.0006, "step": 42620 }, { "epoch": 2.7123496850543996, "grad_norm": 0.005523681640625, "learning_rate": 4.048729966844111e-05, "loss": 0.0001, "step": 42630 }, { "epoch": 2.7129859387923903, "grad_norm": 0.005767822265625, "learning_rate": 4.048376492545227e-05, "loss": 0.0002, "step": 42640 }, { "epoch": 2.713622192530381, "grad_norm": 0.2353515625, "learning_rate": 4.0480230182463434e-05, "loss": 0.0003, "step": 42650 }, { "epoch": 2.7142584462683716, "grad_norm": 0.1494140625, "learning_rate": 4.04766954394746e-05, "loss": 0.0009, "step": 42660 }, { "epoch": 2.7148947000063623, "grad_norm": 0.0042724609375, "learning_rate": 4.0473160696485765e-05, "loss": 0.0011, "step": 42670 }, { "epoch": 2.715530953744353, "grad_norm": 0.06103515625, "learning_rate": 4.046962595349692e-05, "loss": 0.0003, "step": 42680 }, { "epoch": 2.7161672074823437, "grad_norm": 0.21875, "learning_rate": 4.046609121050808e-05, "loss": 0.0122, "step": 42690 }, { "epoch": 2.7168034612203344, "grad_norm": 0.4140625, "learning_rate": 4.0462556467519247e-05, "loss": 0.0005, "step": 42700 }, { "epoch": 2.717439714958325, "grad_norm": 0.0859375, "learning_rate": 4.045902172453041e-05, "loss": 0.0014, "step": 42710 }, { "epoch": 2.718075968696316, "grad_norm": 0.0107421875, "learning_rate": 4.045548698154158e-05, "loss": 0.0002, "step": 42720 }, { "epoch": 2.7187122224343065, "grad_norm": 0.01165771484375, "learning_rate": 4.0451952238552735e-05, "loss": 0.0001, "step": 42730 }, { "epoch": 2.7193484761722972, "grad_norm": 0.0048828125, "learning_rate": 4.04484174955639e-05, "loss": 0.0002, "step": 42740 }, { "epoch": 2.719984729910288, "grad_norm": 0.03076171875, "learning_rate": 4.0444882752575066e-05, "loss": 0.0015, "step": 42750 }, { "epoch": 2.720620983648279, "grad_norm": 0.06787109375, "learning_rate": 4.0441348009586224e-05, "loss": 0.0005, "step": 42760 }, { "epoch": 2.72125723738627, "grad_norm": 3.515625, "learning_rate": 4.043781326659738e-05, "loss": 0.0014, "step": 42770 }, { "epoch": 2.7218934911242605, "grad_norm": 0.006622314453125, "learning_rate": 4.043427852360855e-05, "loss": 0.0005, "step": 42780 }, { "epoch": 2.722529744862251, "grad_norm": 0.0211181640625, "learning_rate": 4.043074378061971e-05, "loss": 0.0003, "step": 42790 }, { "epoch": 2.723165998600242, "grad_norm": 0.004180908203125, "learning_rate": 4.042720903763088e-05, "loss": 0.0001, "step": 42800 }, { "epoch": 2.7238022523382326, "grad_norm": 0.0079345703125, "learning_rate": 4.0423674294642036e-05, "loss": 0.0016, "step": 42810 }, { "epoch": 2.7244385060762233, "grad_norm": 0.0103759765625, "learning_rate": 4.04201395516532e-05, "loss": 0.0009, "step": 42820 }, { "epoch": 2.725074759814214, "grad_norm": 0.09375, "learning_rate": 4.041660480866437e-05, "loss": 0.0004, "step": 42830 }, { "epoch": 2.7257110135522047, "grad_norm": 0.0101318359375, "learning_rate": 4.041307006567553e-05, "loss": 0.0001, "step": 42840 }, { "epoch": 2.7263472672901954, "grad_norm": 0.0166015625, "learning_rate": 4.0409535322686684e-05, "loss": 0.0019, "step": 42850 }, { "epoch": 2.726983521028186, "grad_norm": 0.41796875, "learning_rate": 4.040600057969785e-05, "loss": 0.0008, "step": 42860 }, { "epoch": 2.727619774766177, "grad_norm": 0.006439208984375, "learning_rate": 4.0402465836709014e-05, "loss": 0.0002, "step": 42870 }, { "epoch": 2.7282560285041675, "grad_norm": 0.1533203125, "learning_rate": 4.039893109372018e-05, "loss": 0.0002, "step": 42880 }, { "epoch": 2.728892282242158, "grad_norm": 0.044921875, "learning_rate": 4.039539635073134e-05, "loss": 0.0007, "step": 42890 }, { "epoch": 2.729528535980149, "grad_norm": 0.0390625, "learning_rate": 4.03918616077425e-05, "loss": 0.0001, "step": 42900 }, { "epoch": 2.7301647897181396, "grad_norm": 0.0155029296875, "learning_rate": 4.038832686475367e-05, "loss": 0.0003, "step": 42910 }, { "epoch": 2.7308010434561303, "grad_norm": 0.06689453125, "learning_rate": 4.038479212176483e-05, "loss": 0.0002, "step": 42920 }, { "epoch": 2.731437297194121, "grad_norm": 0.00167083740234375, "learning_rate": 4.038125737877599e-05, "loss": 0.0005, "step": 42930 }, { "epoch": 2.7320735509321117, "grad_norm": 0.004791259765625, "learning_rate": 4.037772263578715e-05, "loss": 0.0001, "step": 42940 }, { "epoch": 2.7327098046701024, "grad_norm": 0.0296630859375, "learning_rate": 4.0374187892798315e-05, "loss": 0.004, "step": 42950 }, { "epoch": 2.733346058408093, "grad_norm": 0.00151824951171875, "learning_rate": 4.037065314980948e-05, "loss": 0.0001, "step": 42960 }, { "epoch": 2.733982312146084, "grad_norm": 0.0021820068359375, "learning_rate": 4.036711840682064e-05, "loss": 0.0001, "step": 42970 }, { "epoch": 2.7346185658840745, "grad_norm": 3.96875, "learning_rate": 4.0363583663831804e-05, "loss": 0.0039, "step": 42980 }, { "epoch": 2.735254819622065, "grad_norm": 0.01116943359375, "learning_rate": 4.036004892084297e-05, "loss": 0.0009, "step": 42990 }, { "epoch": 2.735891073360056, "grad_norm": 0.12890625, "learning_rate": 4.0356514177854134e-05, "loss": 0.0009, "step": 43000 }, { "epoch": 2.7365273270980466, "grad_norm": 0.0003299713134765625, "learning_rate": 4.035297943486529e-05, "loss": 0.0004, "step": 43010 }, { "epoch": 2.7371635808360373, "grad_norm": 0.0019073486328125, "learning_rate": 4.034944469187645e-05, "loss": 0.0044, "step": 43020 }, { "epoch": 2.737799834574028, "grad_norm": 0.00054168701171875, "learning_rate": 4.0345909948887616e-05, "loss": 0.0105, "step": 43030 }, { "epoch": 2.7384360883120187, "grad_norm": 0.0089111328125, "learning_rate": 4.034237520589878e-05, "loss": 0.0001, "step": 43040 }, { "epoch": 2.7390723420500094, "grad_norm": 0.1220703125, "learning_rate": 4.033884046290995e-05, "loss": 0.0002, "step": 43050 }, { "epoch": 2.7397085957880005, "grad_norm": 0.00135040283203125, "learning_rate": 4.0335305719921105e-05, "loss": 0.0006, "step": 43060 }, { "epoch": 2.7403448495259912, "grad_norm": 0.006195068359375, "learning_rate": 4.033177097693227e-05, "loss": 0.0002, "step": 43070 }, { "epoch": 2.740981103263982, "grad_norm": 0.0419921875, "learning_rate": 4.0328236233943435e-05, "loss": 0.0001, "step": 43080 }, { "epoch": 2.7416173570019726, "grad_norm": 0.119140625, "learning_rate": 4.0324701490954594e-05, "loss": 0.0002, "step": 43090 }, { "epoch": 2.7422536107399633, "grad_norm": 0.049560546875, "learning_rate": 4.032116674796575e-05, "loss": 0.0042, "step": 43100 }, { "epoch": 2.742889864477954, "grad_norm": 0.001251220703125, "learning_rate": 4.031763200497692e-05, "loss": 0.0035, "step": 43110 }, { "epoch": 2.7435261182159447, "grad_norm": 0.0050048828125, "learning_rate": 4.031409726198808e-05, "loss": 0.0066, "step": 43120 }, { "epoch": 2.7441623719539354, "grad_norm": 0.08056640625, "learning_rate": 4.031056251899925e-05, "loss": 0.0001, "step": 43130 }, { "epoch": 2.744798625691926, "grad_norm": 0.0023956298828125, "learning_rate": 4.0307027776010406e-05, "loss": 0.0005, "step": 43140 }, { "epoch": 2.745434879429917, "grad_norm": 0.0311279296875, "learning_rate": 4.030349303302157e-05, "loss": 0.0009, "step": 43150 }, { "epoch": 2.7460711331679075, "grad_norm": 0.036865234375, "learning_rate": 4.029995829003274e-05, "loss": 0.0006, "step": 43160 }, { "epoch": 2.746707386905898, "grad_norm": 1.203125, "learning_rate": 4.02964235470439e-05, "loss": 0.0005, "step": 43170 }, { "epoch": 2.747343640643889, "grad_norm": 0.00286865234375, "learning_rate": 4.0292888804055053e-05, "loss": 0.0064, "step": 43180 }, { "epoch": 2.7479798943818796, "grad_norm": 0.0038604736328125, "learning_rate": 4.028935406106622e-05, "loss": 0.0002, "step": 43190 }, { "epoch": 2.7486161481198703, "grad_norm": 0.10693359375, "learning_rate": 4.0285819318077384e-05, "loss": 0.0002, "step": 43200 }, { "epoch": 2.749252401857861, "grad_norm": 0.036376953125, "learning_rate": 4.028228457508855e-05, "loss": 0.0003, "step": 43210 }, { "epoch": 2.7498886555958517, "grad_norm": 0.014892578125, "learning_rate": 4.027874983209971e-05, "loss": 0.0008, "step": 43220 }, { "epoch": 2.7505249093338424, "grad_norm": 0.000881195068359375, "learning_rate": 4.027521508911087e-05, "loss": 0.0011, "step": 43230 }, { "epoch": 2.751161163071833, "grad_norm": 0.01904296875, "learning_rate": 4.027168034612204e-05, "loss": 0.0001, "step": 43240 }, { "epoch": 2.751797416809824, "grad_norm": 0.09375, "learning_rate": 4.02681456031332e-05, "loss": 0.0009, "step": 43250 }, { "epoch": 2.7524336705478145, "grad_norm": 0.09130859375, "learning_rate": 4.026461086014436e-05, "loss": 0.0002, "step": 43260 }, { "epoch": 2.753069924285805, "grad_norm": 0.0037689208984375, "learning_rate": 4.026107611715552e-05, "loss": 0.0005, "step": 43270 }, { "epoch": 2.753706178023796, "grad_norm": 0.00823974609375, "learning_rate": 4.0257541374166685e-05, "loss": 0.0017, "step": 43280 }, { "epoch": 2.7543424317617866, "grad_norm": 0.025634765625, "learning_rate": 4.025400663117785e-05, "loss": 0.0001, "step": 43290 }, { "epoch": 2.7549786854997773, "grad_norm": 0.66015625, "learning_rate": 4.025047188818901e-05, "loss": 0.0017, "step": 43300 }, { "epoch": 2.755614939237768, "grad_norm": 0.00311279296875, "learning_rate": 4.0246937145200174e-05, "loss": 0.0007, "step": 43310 }, { "epoch": 2.7562511929757587, "grad_norm": 0.00171661376953125, "learning_rate": 4.024340240221134e-05, "loss": 0.0005, "step": 43320 }, { "epoch": 2.7568874467137494, "grad_norm": 0.0299072265625, "learning_rate": 4.0239867659222504e-05, "loss": 0.0029, "step": 43330 }, { "epoch": 2.75752370045174, "grad_norm": 0.0255126953125, "learning_rate": 4.023633291623366e-05, "loss": 0.0001, "step": 43340 }, { "epoch": 2.758159954189731, "grad_norm": 0.0162353515625, "learning_rate": 4.023279817324482e-05, "loss": 0.0002, "step": 43350 }, { "epoch": 2.7587962079277215, "grad_norm": 0.00144195556640625, "learning_rate": 4.0229263430255986e-05, "loss": 0.0006, "step": 43360 }, { "epoch": 2.759432461665712, "grad_norm": 0.11669921875, "learning_rate": 4.022572868726715e-05, "loss": 0.0006, "step": 43370 }, { "epoch": 2.760068715403703, "grad_norm": 0.0162353515625, "learning_rate": 4.0222193944278317e-05, "loss": 0.0003, "step": 43380 }, { "epoch": 2.7607049691416936, "grad_norm": 0.0096435546875, "learning_rate": 4.0218659201289475e-05, "loss": 0.0, "step": 43390 }, { "epoch": 2.7613412228796843, "grad_norm": 0.01373291015625, "learning_rate": 4.021512445830064e-05, "loss": 0.0001, "step": 43400 }, { "epoch": 2.761977476617675, "grad_norm": 0.006988525390625, "learning_rate": 4.0211589715311805e-05, "loss": 0.0027, "step": 43410 }, { "epoch": 2.7626137303556657, "grad_norm": 0.0047607421875, "learning_rate": 4.0208054972322964e-05, "loss": 0.0031, "step": 43420 }, { "epoch": 2.7632499840936564, "grad_norm": 0.0006103515625, "learning_rate": 4.020452022933412e-05, "loss": 0.0001, "step": 43430 }, { "epoch": 2.763886237831647, "grad_norm": 0.0263671875, "learning_rate": 4.020098548634529e-05, "loss": 0.0039, "step": 43440 }, { "epoch": 2.764522491569638, "grad_norm": 0.0035552978515625, "learning_rate": 4.019745074335645e-05, "loss": 0.0003, "step": 43450 }, { "epoch": 2.7651587453076285, "grad_norm": 0.291015625, "learning_rate": 4.019391600036762e-05, "loss": 0.0014, "step": 43460 }, { "epoch": 2.765794999045619, "grad_norm": 0.00439453125, "learning_rate": 4.0190381257378776e-05, "loss": 0.0002, "step": 43470 }, { "epoch": 2.76643125278361, "grad_norm": 0.054443359375, "learning_rate": 4.018684651438994e-05, "loss": 0.0006, "step": 43480 }, { "epoch": 2.7670675065216006, "grad_norm": 0.004608154296875, "learning_rate": 4.0183311771401106e-05, "loss": 0.0003, "step": 43490 }, { "epoch": 2.7677037602595913, "grad_norm": 0.0016326904296875, "learning_rate": 4.017977702841227e-05, "loss": 0.0005, "step": 43500 }, { "epoch": 2.768340013997582, "grad_norm": 0.01458740234375, "learning_rate": 4.017624228542342e-05, "loss": 0.0001, "step": 43510 }, { "epoch": 2.7689762677355727, "grad_norm": 0.11767578125, "learning_rate": 4.017270754243459e-05, "loss": 0.0002, "step": 43520 }, { "epoch": 2.7696125214735634, "grad_norm": 0.003387451171875, "learning_rate": 4.0169172799445754e-05, "loss": 0.0017, "step": 43530 }, { "epoch": 2.770248775211554, "grad_norm": 0.003448486328125, "learning_rate": 4.016563805645692e-05, "loss": 0.0002, "step": 43540 }, { "epoch": 2.770885028949545, "grad_norm": 0.000579833984375, "learning_rate": 4.016210331346808e-05, "loss": 0.0004, "step": 43550 }, { "epoch": 2.771521282687536, "grad_norm": 0.006561279296875, "learning_rate": 4.015856857047924e-05, "loss": 0.0016, "step": 43560 }, { "epoch": 2.7721575364255266, "grad_norm": 0.0021514892578125, "learning_rate": 4.015503382749041e-05, "loss": 0.0003, "step": 43570 }, { "epoch": 2.7727937901635173, "grad_norm": 0.016845703125, "learning_rate": 4.015149908450157e-05, "loss": 0.0001, "step": 43580 }, { "epoch": 2.773430043901508, "grad_norm": 0.34375, "learning_rate": 4.014796434151273e-05, "loss": 0.0005, "step": 43590 }, { "epoch": 2.7740662976394987, "grad_norm": 0.006439208984375, "learning_rate": 4.014442959852389e-05, "loss": 0.0002, "step": 43600 }, { "epoch": 2.7747025513774894, "grad_norm": 0.0004425048828125, "learning_rate": 4.0140894855535055e-05, "loss": 0.0001, "step": 43610 }, { "epoch": 2.77533880511548, "grad_norm": 0.0027618408203125, "learning_rate": 4.013736011254622e-05, "loss": 0.003, "step": 43620 }, { "epoch": 2.775975058853471, "grad_norm": 0.03271484375, "learning_rate": 4.0133825369557385e-05, "loss": 0.0029, "step": 43630 }, { "epoch": 2.7766113125914615, "grad_norm": 0.01513671875, "learning_rate": 4.0130290626568544e-05, "loss": 0.0002, "step": 43640 }, { "epoch": 2.7772475663294522, "grad_norm": 0.0189208984375, "learning_rate": 4.012675588357971e-05, "loss": 0.0005, "step": 43650 }, { "epoch": 2.777883820067443, "grad_norm": 0.0264892578125, "learning_rate": 4.0123221140590874e-05, "loss": 0.0014, "step": 43660 }, { "epoch": 2.7785200738054336, "grad_norm": 0.06689453125, "learning_rate": 4.011968639760203e-05, "loss": 0.0002, "step": 43670 }, { "epoch": 2.7791563275434243, "grad_norm": 0.076171875, "learning_rate": 4.011615165461319e-05, "loss": 0.0002, "step": 43680 }, { "epoch": 2.779792581281415, "grad_norm": 0.0908203125, "learning_rate": 4.0112616911624356e-05, "loss": 0.0005, "step": 43690 }, { "epoch": 2.7804288350194057, "grad_norm": 0.01239013671875, "learning_rate": 4.010908216863552e-05, "loss": 0.0006, "step": 43700 }, { "epoch": 2.7810650887573964, "grad_norm": 0.0130615234375, "learning_rate": 4.0105547425646686e-05, "loss": 0.0007, "step": 43710 }, { "epoch": 2.781701342495387, "grad_norm": 0.00616455078125, "learning_rate": 4.0102012682657845e-05, "loss": 0.0004, "step": 43720 }, { "epoch": 2.782337596233378, "grad_norm": 0.03564453125, "learning_rate": 4.009847793966901e-05, "loss": 0.0009, "step": 43730 }, { "epoch": 2.7829738499713685, "grad_norm": 0.00069427490234375, "learning_rate": 4.0094943196680175e-05, "loss": 0.0006, "step": 43740 }, { "epoch": 2.7836101037093592, "grad_norm": 0.00885009765625, "learning_rate": 4.0091408453691334e-05, "loss": 0.0001, "step": 43750 }, { "epoch": 2.78424635744735, "grad_norm": 0.01251220703125, "learning_rate": 4.008787371070249e-05, "loss": 0.0003, "step": 43760 }, { "epoch": 2.7848826111853406, "grad_norm": 0.0205078125, "learning_rate": 4.008433896771366e-05, "loss": 0.0, "step": 43770 }, { "epoch": 2.7855188649233313, "grad_norm": 0.015869140625, "learning_rate": 4.008080422472482e-05, "loss": 0.0006, "step": 43780 }, { "epoch": 2.786155118661322, "grad_norm": 0.00109100341796875, "learning_rate": 4.007726948173599e-05, "loss": 0.0001, "step": 43790 }, { "epoch": 2.7867913723993127, "grad_norm": 0.047119140625, "learning_rate": 4.0073734738747146e-05, "loss": 0.0001, "step": 43800 }, { "epoch": 2.7874276261373034, "grad_norm": 0.031005859375, "learning_rate": 4.007019999575831e-05, "loss": 0.0001, "step": 43810 }, { "epoch": 2.788063879875294, "grad_norm": 0.001068115234375, "learning_rate": 4.0066665252769476e-05, "loss": 0.0001, "step": 43820 }, { "epoch": 2.788700133613285, "grad_norm": 0.004730224609375, "learning_rate": 4.0063130509780635e-05, "loss": 0.0037, "step": 43830 }, { "epoch": 2.7893363873512755, "grad_norm": 0.005645751953125, "learning_rate": 4.00595957667918e-05, "loss": 0.0004, "step": 43840 }, { "epoch": 2.7899726410892662, "grad_norm": 0.000431060791015625, "learning_rate": 4.005606102380296e-05, "loss": 0.0001, "step": 43850 }, { "epoch": 2.7906088948272574, "grad_norm": 0.0015869140625, "learning_rate": 4.0052526280814123e-05, "loss": 0.0001, "step": 43860 }, { "epoch": 2.791245148565248, "grad_norm": 0.0189208984375, "learning_rate": 4.004899153782529e-05, "loss": 0.0002, "step": 43870 }, { "epoch": 2.791881402303239, "grad_norm": 0.00970458984375, "learning_rate": 4.004545679483645e-05, "loss": 0.0001, "step": 43880 }, { "epoch": 2.7925176560412295, "grad_norm": 0.0016021728515625, "learning_rate": 4.004192205184761e-05, "loss": 0.0063, "step": 43890 }, { "epoch": 2.79315390977922, "grad_norm": 0.00128936767578125, "learning_rate": 4.003838730885878e-05, "loss": 0.0001, "step": 43900 }, { "epoch": 2.793790163517211, "grad_norm": 0.007598876953125, "learning_rate": 4.003485256586994e-05, "loss": 0.0005, "step": 43910 }, { "epoch": 2.7944264172552016, "grad_norm": 3.875, "learning_rate": 4.00313178228811e-05, "loss": 0.0049, "step": 43920 }, { "epoch": 2.7950626709931923, "grad_norm": 0.009765625, "learning_rate": 4.002778307989226e-05, "loss": 0.0001, "step": 43930 }, { "epoch": 2.795698924731183, "grad_norm": 0.0123291015625, "learning_rate": 4.0024248336903425e-05, "loss": 0.0005, "step": 43940 }, { "epoch": 2.7963351784691737, "grad_norm": 0.00787353515625, "learning_rate": 4.002071359391459e-05, "loss": 0.0001, "step": 43950 }, { "epoch": 2.7969714322071644, "grad_norm": 0.1357421875, "learning_rate": 4.0017178850925755e-05, "loss": 0.0051, "step": 43960 }, { "epoch": 2.797607685945155, "grad_norm": 0.01300048828125, "learning_rate": 4.001364410793691e-05, "loss": 0.0001, "step": 43970 }, { "epoch": 2.7982439396831458, "grad_norm": 0.0023651123046875, "learning_rate": 4.001010936494808e-05, "loss": 0.0008, "step": 43980 }, { "epoch": 2.7988801934211365, "grad_norm": 0.0390625, "learning_rate": 4.0006574621959244e-05, "loss": 0.0001, "step": 43990 }, { "epoch": 2.799516447159127, "grad_norm": 0.018798828125, "learning_rate": 4.00030398789704e-05, "loss": 0.0001, "step": 44000 }, { "epoch": 2.800152700897118, "grad_norm": 0.042724609375, "learning_rate": 3.999950513598156e-05, "loss": 0.0002, "step": 44010 }, { "epoch": 2.8007889546351086, "grad_norm": 1.6015625, "learning_rate": 3.9995970392992726e-05, "loss": 0.0008, "step": 44020 }, { "epoch": 2.8014252083730993, "grad_norm": 0.0166015625, "learning_rate": 3.999243565000389e-05, "loss": 0.0003, "step": 44030 }, { "epoch": 2.80206146211109, "grad_norm": 0.004150390625, "learning_rate": 3.9988900907015056e-05, "loss": 0.0002, "step": 44040 }, { "epoch": 2.8026977158490807, "grad_norm": 0.004058837890625, "learning_rate": 3.9985366164026215e-05, "loss": 0.0005, "step": 44050 }, { "epoch": 2.8033339695870714, "grad_norm": 0.005523681640625, "learning_rate": 3.998183142103738e-05, "loss": 0.0002, "step": 44060 }, { "epoch": 2.803970223325062, "grad_norm": 0.054443359375, "learning_rate": 3.9978296678048545e-05, "loss": 0.0001, "step": 44070 }, { "epoch": 2.8046064770630528, "grad_norm": 0.0245361328125, "learning_rate": 3.99747619350597e-05, "loss": 0.0005, "step": 44080 }, { "epoch": 2.8052427308010435, "grad_norm": 0.037353515625, "learning_rate": 3.997122719207086e-05, "loss": 0.0003, "step": 44090 }, { "epoch": 2.805878984539034, "grad_norm": 0.01068115234375, "learning_rate": 3.996769244908203e-05, "loss": 0.0003, "step": 44100 }, { "epoch": 2.806515238277025, "grad_norm": 0.000774383544921875, "learning_rate": 3.996415770609319e-05, "loss": 0.0002, "step": 44110 }, { "epoch": 2.8071514920150156, "grad_norm": 0.0164794921875, "learning_rate": 3.996062296310436e-05, "loss": 0.0043, "step": 44120 }, { "epoch": 2.8077877457530063, "grad_norm": 0.01068115234375, "learning_rate": 3.9957088220115516e-05, "loss": 0.0047, "step": 44130 }, { "epoch": 2.808423999490997, "grad_norm": 0.0162353515625, "learning_rate": 3.995355347712668e-05, "loss": 0.0006, "step": 44140 }, { "epoch": 2.8090602532289877, "grad_norm": 3.015625, "learning_rate": 3.9950018734137846e-05, "loss": 0.0037, "step": 44150 }, { "epoch": 2.8096965069669784, "grad_norm": 1.3515625, "learning_rate": 3.9946483991149004e-05, "loss": 0.0018, "step": 44160 }, { "epoch": 2.810332760704969, "grad_norm": 0.1640625, "learning_rate": 3.994294924816017e-05, "loss": 0.0011, "step": 44170 }, { "epoch": 2.8109690144429598, "grad_norm": 0.045166015625, "learning_rate": 3.993941450517133e-05, "loss": 0.0002, "step": 44180 }, { "epoch": 2.8116052681809505, "grad_norm": 0.03857421875, "learning_rate": 3.993587976218249e-05, "loss": 0.0002, "step": 44190 }, { "epoch": 2.812241521918941, "grad_norm": 0.53125, "learning_rate": 3.993234501919366e-05, "loss": 0.0004, "step": 44200 }, { "epoch": 2.812877775656932, "grad_norm": 0.044677734375, "learning_rate": 3.992881027620482e-05, "loss": 0.0002, "step": 44210 }, { "epoch": 2.8135140293949226, "grad_norm": 0.0068359375, "learning_rate": 3.992527553321598e-05, "loss": 0.0001, "step": 44220 }, { "epoch": 2.8141502831329133, "grad_norm": 0.0198974609375, "learning_rate": 3.992174079022715e-05, "loss": 0.0022, "step": 44230 }, { "epoch": 2.814786536870904, "grad_norm": 0.01806640625, "learning_rate": 3.9918206047238306e-05, "loss": 0.0054, "step": 44240 }, { "epoch": 2.8154227906088947, "grad_norm": 0.007598876953125, "learning_rate": 3.991467130424947e-05, "loss": 0.0003, "step": 44250 }, { "epoch": 2.8160590443468854, "grad_norm": 0.0098876953125, "learning_rate": 3.991113656126063e-05, "loss": 0.0002, "step": 44260 }, { "epoch": 2.816695298084876, "grad_norm": 0.01202392578125, "learning_rate": 3.9907601818271794e-05, "loss": 0.0066, "step": 44270 }, { "epoch": 2.8173315518228668, "grad_norm": 0.00726318359375, "learning_rate": 3.990406707528296e-05, "loss": 0.0048, "step": 44280 }, { "epoch": 2.8179678055608575, "grad_norm": 0.0218505859375, "learning_rate": 3.9900532332294125e-05, "loss": 0.0002, "step": 44290 }, { "epoch": 2.818604059298848, "grad_norm": 0.12060546875, "learning_rate": 3.989699758930528e-05, "loss": 0.005, "step": 44300 }, { "epoch": 2.819240313036839, "grad_norm": 0.263671875, "learning_rate": 3.989346284631645e-05, "loss": 0.0003, "step": 44310 }, { "epoch": 2.8198765667748296, "grad_norm": 0.004058837890625, "learning_rate": 3.988992810332761e-05, "loss": 0.0002, "step": 44320 }, { "epoch": 2.8205128205128203, "grad_norm": 0.01080322265625, "learning_rate": 3.988639336033877e-05, "loss": 0.0001, "step": 44330 }, { "epoch": 2.821149074250811, "grad_norm": 0.0380859375, "learning_rate": 3.988285861734993e-05, "loss": 0.0002, "step": 44340 }, { "epoch": 2.8217853279888017, "grad_norm": 0.003265380859375, "learning_rate": 3.9879323874361096e-05, "loss": 0.0003, "step": 44350 }, { "epoch": 2.822421581726793, "grad_norm": 0.0213623046875, "learning_rate": 3.987578913137226e-05, "loss": 0.0004, "step": 44360 }, { "epoch": 2.8230578354647835, "grad_norm": 0.0164794921875, "learning_rate": 3.9872254388383426e-05, "loss": 0.0002, "step": 44370 }, { "epoch": 2.823694089202774, "grad_norm": 0.005157470703125, "learning_rate": 3.9868719645394584e-05, "loss": 0.0007, "step": 44380 }, { "epoch": 2.824330342940765, "grad_norm": 0.00604248046875, "learning_rate": 3.986518490240575e-05, "loss": 0.0004, "step": 44390 }, { "epoch": 2.8249665966787556, "grad_norm": 0.0654296875, "learning_rate": 3.9861650159416915e-05, "loss": 0.0001, "step": 44400 }, { "epoch": 2.8256028504167463, "grad_norm": 0.0673828125, "learning_rate": 3.985811541642807e-05, "loss": 0.0056, "step": 44410 }, { "epoch": 2.826239104154737, "grad_norm": 0.005523681640625, "learning_rate": 3.985458067343923e-05, "loss": 0.0001, "step": 44420 }, { "epoch": 2.8268753578927277, "grad_norm": 0.03076171875, "learning_rate": 3.98510459304504e-05, "loss": 0.0005, "step": 44430 }, { "epoch": 2.8275116116307184, "grad_norm": 0.0400390625, "learning_rate": 3.984751118746156e-05, "loss": 0.0128, "step": 44440 }, { "epoch": 2.828147865368709, "grad_norm": 0.0103759765625, "learning_rate": 3.984397644447273e-05, "loss": 0.0002, "step": 44450 }, { "epoch": 2.8287841191067, "grad_norm": 0.0079345703125, "learning_rate": 3.9840441701483886e-05, "loss": 0.0001, "step": 44460 }, { "epoch": 2.8294203728446905, "grad_norm": 0.0017242431640625, "learning_rate": 3.983690695849505e-05, "loss": 0.0003, "step": 44470 }, { "epoch": 2.830056626582681, "grad_norm": 0.00113677978515625, "learning_rate": 3.9833372215506216e-05, "loss": 0.0005, "step": 44480 }, { "epoch": 2.830692880320672, "grad_norm": 0.1201171875, "learning_rate": 3.9829837472517374e-05, "loss": 0.0005, "step": 44490 }, { "epoch": 2.8313291340586626, "grad_norm": 0.53125, "learning_rate": 3.982630272952854e-05, "loss": 0.0004, "step": 44500 }, { "epoch": 2.8319653877966533, "grad_norm": 0.003204345703125, "learning_rate": 3.98227679865397e-05, "loss": 0.0001, "step": 44510 }, { "epoch": 2.832601641534644, "grad_norm": 0.036865234375, "learning_rate": 3.981923324355086e-05, "loss": 0.0009, "step": 44520 }, { "epoch": 2.8332378952726347, "grad_norm": 0.000865936279296875, "learning_rate": 3.981569850056203e-05, "loss": 0.0005, "step": 44530 }, { "epoch": 2.8338741490106254, "grad_norm": 0.005126953125, "learning_rate": 3.981216375757319e-05, "loss": 0.0001, "step": 44540 }, { "epoch": 2.834510402748616, "grad_norm": 0.0030670166015625, "learning_rate": 3.980862901458435e-05, "loss": 0.0005, "step": 44550 }, { "epoch": 2.835146656486607, "grad_norm": 0.048095703125, "learning_rate": 3.980509427159552e-05, "loss": 0.0005, "step": 44560 }, { "epoch": 2.8357829102245975, "grad_norm": 0.40625, "learning_rate": 3.9801559528606675e-05, "loss": 0.0003, "step": 44570 }, { "epoch": 2.836419163962588, "grad_norm": 0.3515625, "learning_rate": 3.979802478561784e-05, "loss": 0.0004, "step": 44580 }, { "epoch": 2.837055417700579, "grad_norm": 0.0015716552734375, "learning_rate": 3.9794490042629e-05, "loss": 0.0003, "step": 44590 }, { "epoch": 2.8376916714385696, "grad_norm": 0.0927734375, "learning_rate": 3.9790955299640164e-05, "loss": 0.0026, "step": 44600 }, { "epoch": 2.8383279251765603, "grad_norm": 0.134765625, "learning_rate": 3.978742055665133e-05, "loss": 0.0005, "step": 44610 }, { "epoch": 2.838964178914551, "grad_norm": 0.0216064453125, "learning_rate": 3.9783885813662495e-05, "loss": 0.0002, "step": 44620 }, { "epoch": 2.8396004326525417, "grad_norm": 0.01116943359375, "learning_rate": 3.978035107067365e-05, "loss": 0.0002, "step": 44630 }, { "epoch": 2.8402366863905324, "grad_norm": 0.00848388671875, "learning_rate": 3.977681632768482e-05, "loss": 0.0004, "step": 44640 }, { "epoch": 2.840872940128523, "grad_norm": 0.00433349609375, "learning_rate": 3.9773281584695977e-05, "loss": 0.0001, "step": 44650 }, { "epoch": 2.8415091938665142, "grad_norm": 0.1181640625, "learning_rate": 3.976974684170714e-05, "loss": 0.0001, "step": 44660 }, { "epoch": 2.842145447604505, "grad_norm": 1.421875, "learning_rate": 3.97662120987183e-05, "loss": 0.0055, "step": 44670 }, { "epoch": 2.8427817013424956, "grad_norm": 0.0014495849609375, "learning_rate": 3.9762677355729465e-05, "loss": 0.003, "step": 44680 }, { "epoch": 2.8434179550804863, "grad_norm": 0.06884765625, "learning_rate": 3.975914261274063e-05, "loss": 0.0005, "step": 44690 }, { "epoch": 2.844054208818477, "grad_norm": 0.05126953125, "learning_rate": 3.9755607869751796e-05, "loss": 0.001, "step": 44700 }, { "epoch": 2.8446904625564677, "grad_norm": 0.018798828125, "learning_rate": 3.9752073126762954e-05, "loss": 0.0006, "step": 44710 }, { "epoch": 2.8453267162944584, "grad_norm": 0.00168609619140625, "learning_rate": 3.974853838377412e-05, "loss": 0.0004, "step": 44720 }, { "epoch": 2.845962970032449, "grad_norm": 0.056396484375, "learning_rate": 3.974500364078528e-05, "loss": 0.0055, "step": 44730 }, { "epoch": 2.84659922377044, "grad_norm": 0.0029296875, "learning_rate": 3.974146889779644e-05, "loss": 0.0073, "step": 44740 }, { "epoch": 2.8472354775084305, "grad_norm": 0.0791015625, "learning_rate": 3.97379341548076e-05, "loss": 0.0003, "step": 44750 }, { "epoch": 2.8478717312464212, "grad_norm": 0.00439453125, "learning_rate": 3.9734399411818767e-05, "loss": 0.0001, "step": 44760 }, { "epoch": 2.848507984984412, "grad_norm": 0.0002155303955078125, "learning_rate": 3.973086466882993e-05, "loss": 0.0001, "step": 44770 }, { "epoch": 2.8491442387224026, "grad_norm": 0.004364013671875, "learning_rate": 3.97273299258411e-05, "loss": 0.001, "step": 44780 }, { "epoch": 2.8497804924603933, "grad_norm": 0.04150390625, "learning_rate": 3.9723795182852255e-05, "loss": 0.0002, "step": 44790 }, { "epoch": 2.850416746198384, "grad_norm": 0.0059814453125, "learning_rate": 3.972026043986342e-05, "loss": 0.0001, "step": 44800 }, { "epoch": 2.8510529999363747, "grad_norm": 0.82421875, "learning_rate": 3.9716725696874586e-05, "loss": 0.0005, "step": 44810 }, { "epoch": 2.8516892536743654, "grad_norm": 0.001312255859375, "learning_rate": 3.9713190953885744e-05, "loss": 0.0002, "step": 44820 }, { "epoch": 2.852325507412356, "grad_norm": 0.1875, "learning_rate": 3.970965621089691e-05, "loss": 0.0005, "step": 44830 }, { "epoch": 2.852961761150347, "grad_norm": 0.0247802734375, "learning_rate": 3.970612146790807e-05, "loss": 0.0082, "step": 44840 }, { "epoch": 2.8535980148883375, "grad_norm": 0.279296875, "learning_rate": 3.970258672491923e-05, "loss": 0.0003, "step": 44850 }, { "epoch": 2.8542342686263282, "grad_norm": 0.1240234375, "learning_rate": 3.96990519819304e-05, "loss": 0.0002, "step": 44860 }, { "epoch": 2.854870522364319, "grad_norm": 0.001983642578125, "learning_rate": 3.9695517238941556e-05, "loss": 0.0001, "step": 44870 }, { "epoch": 2.8555067761023096, "grad_norm": 0.036865234375, "learning_rate": 3.969198249595272e-05, "loss": 0.0003, "step": 44880 }, { "epoch": 2.8561430298403003, "grad_norm": 0.0274658203125, "learning_rate": 3.968844775296389e-05, "loss": 0.0002, "step": 44890 }, { "epoch": 2.856779283578291, "grad_norm": 0.0022430419921875, "learning_rate": 3.9684913009975045e-05, "loss": 0.0004, "step": 44900 }, { "epoch": 2.8574155373162817, "grad_norm": 0.017822265625, "learning_rate": 3.968137826698621e-05, "loss": 0.0003, "step": 44910 }, { "epoch": 2.8580517910542724, "grad_norm": 0.06298828125, "learning_rate": 3.967784352399737e-05, "loss": 0.0001, "step": 44920 }, { "epoch": 2.858688044792263, "grad_norm": 0.0037841796875, "learning_rate": 3.9674308781008534e-05, "loss": 0.0001, "step": 44930 }, { "epoch": 2.859324298530254, "grad_norm": 0.015869140625, "learning_rate": 3.96707740380197e-05, "loss": 0.0084, "step": 44940 }, { "epoch": 2.8599605522682445, "grad_norm": 0.130859375, "learning_rate": 3.9667239295030864e-05, "loss": 0.0002, "step": 44950 }, { "epoch": 2.8605968060062352, "grad_norm": 0.001312255859375, "learning_rate": 3.966370455204202e-05, "loss": 0.0002, "step": 44960 }, { "epoch": 2.861233059744226, "grad_norm": 0.0194091796875, "learning_rate": 3.966016980905319e-05, "loss": 0.0006, "step": 44970 }, { "epoch": 2.8618693134822166, "grad_norm": 0.0126953125, "learning_rate": 3.9656635066064346e-05, "loss": 0.0003, "step": 44980 }, { "epoch": 2.8625055672202073, "grad_norm": 0.004119873046875, "learning_rate": 3.965310032307551e-05, "loss": 0.0001, "step": 44990 }, { "epoch": 2.863141820958198, "grad_norm": 0.006683349609375, "learning_rate": 3.964956558008667e-05, "loss": 0.0001, "step": 45000 }, { "epoch": 2.8637780746961887, "grad_norm": 0.00341796875, "learning_rate": 3.9646030837097835e-05, "loss": 0.0001, "step": 45010 }, { "epoch": 2.8644143284341794, "grad_norm": 0.01300048828125, "learning_rate": 3.9642496094109e-05, "loss": 0.001, "step": 45020 }, { "epoch": 2.86505058217217, "grad_norm": 0.02978515625, "learning_rate": 3.9638961351120166e-05, "loss": 0.0001, "step": 45030 }, { "epoch": 2.865686835910161, "grad_norm": 0.0050048828125, "learning_rate": 3.9635426608131324e-05, "loss": 0.0005, "step": 45040 }, { "epoch": 2.8663230896481515, "grad_norm": 0.03125, "learning_rate": 3.963189186514249e-05, "loss": 0.0001, "step": 45050 }, { "epoch": 2.8669593433861422, "grad_norm": 0.02392578125, "learning_rate": 3.962835712215365e-05, "loss": 0.0002, "step": 45060 }, { "epoch": 2.867595597124133, "grad_norm": 7.84375, "learning_rate": 3.962482237916481e-05, "loss": 0.0129, "step": 45070 }, { "epoch": 2.8682318508621236, "grad_norm": 0.01446533203125, "learning_rate": 3.962128763617597e-05, "loss": 0.0045, "step": 45080 }, { "epoch": 2.8688681046001143, "grad_norm": 0.1142578125, "learning_rate": 3.9617752893187136e-05, "loss": 0.0001, "step": 45090 }, { "epoch": 2.869504358338105, "grad_norm": 0.0028533935546875, "learning_rate": 3.96142181501983e-05, "loss": 0.0002, "step": 45100 }, { "epoch": 2.8701406120760957, "grad_norm": 0.0869140625, "learning_rate": 3.961068340720947e-05, "loss": 0.0002, "step": 45110 }, { "epoch": 2.8707768658140864, "grad_norm": 0.478515625, "learning_rate": 3.9607148664220625e-05, "loss": 0.0004, "step": 45120 }, { "epoch": 2.871413119552077, "grad_norm": 0.94921875, "learning_rate": 3.960361392123179e-05, "loss": 0.0009, "step": 45130 }, { "epoch": 2.872049373290068, "grad_norm": 0.51171875, "learning_rate": 3.960007917824295e-05, "loss": 0.0007, "step": 45140 }, { "epoch": 2.8726856270280585, "grad_norm": 0.6953125, "learning_rate": 3.9596544435254114e-05, "loss": 0.0014, "step": 45150 }, { "epoch": 2.8733218807660497, "grad_norm": 0.0050048828125, "learning_rate": 3.959300969226528e-05, "loss": 0.0003, "step": 45160 }, { "epoch": 2.8739581345040404, "grad_norm": 0.80078125, "learning_rate": 3.958947494927644e-05, "loss": 0.0005, "step": 45170 }, { "epoch": 2.874594388242031, "grad_norm": 0.0036163330078125, "learning_rate": 3.95859402062876e-05, "loss": 0.0009, "step": 45180 }, { "epoch": 2.8752306419800218, "grad_norm": 0.1572265625, "learning_rate": 3.958240546329877e-05, "loss": 0.0002, "step": 45190 }, { "epoch": 2.8758668957180125, "grad_norm": 0.0025482177734375, "learning_rate": 3.9578870720309926e-05, "loss": 0.0162, "step": 45200 }, { "epoch": 2.876503149456003, "grad_norm": 0.06005859375, "learning_rate": 3.957533597732109e-05, "loss": 0.0002, "step": 45210 }, { "epoch": 2.877139403193994, "grad_norm": 0.169921875, "learning_rate": 3.957180123433225e-05, "loss": 0.0008, "step": 45220 }, { "epoch": 2.8777756569319846, "grad_norm": 0.00186920166015625, "learning_rate": 3.9568266491343415e-05, "loss": 0.0001, "step": 45230 }, { "epoch": 2.8784119106699753, "grad_norm": 0.002410888671875, "learning_rate": 3.956473174835458e-05, "loss": 0.0001, "step": 45240 }, { "epoch": 2.879048164407966, "grad_norm": 0.01708984375, "learning_rate": 3.956119700536574e-05, "loss": 0.0013, "step": 45250 }, { "epoch": 2.8796844181459567, "grad_norm": 0.00177001953125, "learning_rate": 3.9557662262376904e-05, "loss": 0.0004, "step": 45260 }, { "epoch": 2.8803206718839474, "grad_norm": 0.00057220458984375, "learning_rate": 3.955412751938807e-05, "loss": 0.0001, "step": 45270 }, { "epoch": 2.880956925621938, "grad_norm": 0.002532958984375, "learning_rate": 3.9550592776399234e-05, "loss": 0.0002, "step": 45280 }, { "epoch": 2.8815931793599288, "grad_norm": 3.8125, "learning_rate": 3.954705803341039e-05, "loss": 0.0046, "step": 45290 }, { "epoch": 2.8822294330979195, "grad_norm": 0.181640625, "learning_rate": 3.954352329042156e-05, "loss": 0.0003, "step": 45300 }, { "epoch": 2.88286568683591, "grad_norm": 0.00112152099609375, "learning_rate": 3.9539988547432716e-05, "loss": 0.0024, "step": 45310 }, { "epoch": 2.883501940573901, "grad_norm": 0.041015625, "learning_rate": 3.953645380444388e-05, "loss": 0.0001, "step": 45320 }, { "epoch": 2.8841381943118916, "grad_norm": 0.1650390625, "learning_rate": 3.953291906145504e-05, "loss": 0.0005, "step": 45330 }, { "epoch": 2.8847744480498823, "grad_norm": 0.000873565673828125, "learning_rate": 3.9529384318466205e-05, "loss": 0.0006, "step": 45340 }, { "epoch": 2.885410701787873, "grad_norm": 0.034912109375, "learning_rate": 3.952584957547737e-05, "loss": 0.0001, "step": 45350 }, { "epoch": 2.8860469555258637, "grad_norm": 0.11181640625, "learning_rate": 3.9522314832488535e-05, "loss": 0.0003, "step": 45360 }, { "epoch": 2.8866832092638544, "grad_norm": 0.0040283203125, "learning_rate": 3.9518780089499694e-05, "loss": 0.0005, "step": 45370 }, { "epoch": 2.887319463001845, "grad_norm": 0.0791015625, "learning_rate": 3.951524534651086e-05, "loss": 0.0001, "step": 45380 }, { "epoch": 2.8879557167398358, "grad_norm": 0.0003795623779296875, "learning_rate": 3.951171060352202e-05, "loss": 0.0002, "step": 45390 }, { "epoch": 2.8885919704778265, "grad_norm": 0.000514984130859375, "learning_rate": 3.950817586053318e-05, "loss": 0.0022, "step": 45400 }, { "epoch": 2.889228224215817, "grad_norm": 0.00012302398681640625, "learning_rate": 3.950464111754434e-05, "loss": 0.0002, "step": 45410 }, { "epoch": 2.889864477953808, "grad_norm": 0.00127410888671875, "learning_rate": 3.9501106374555506e-05, "loss": 0.001, "step": 45420 }, { "epoch": 2.8905007316917986, "grad_norm": 0.00885009765625, "learning_rate": 3.949757163156667e-05, "loss": 0.0003, "step": 45430 }, { "epoch": 2.8911369854297893, "grad_norm": 0.003509521484375, "learning_rate": 3.9494036888577837e-05, "loss": 0.0021, "step": 45440 }, { "epoch": 2.89177323916778, "grad_norm": 0.0036468505859375, "learning_rate": 3.9490502145588995e-05, "loss": 0.0001, "step": 45450 }, { "epoch": 2.892409492905771, "grad_norm": 0.0062255859375, "learning_rate": 3.948696740260016e-05, "loss": 0.0001, "step": 45460 }, { "epoch": 2.893045746643762, "grad_norm": 0.00311279296875, "learning_rate": 3.948343265961132e-05, "loss": 0.0001, "step": 45470 }, { "epoch": 2.8936820003817525, "grad_norm": 0.1220703125, "learning_rate": 3.9479897916622484e-05, "loss": 0.0002, "step": 45480 }, { "epoch": 2.894318254119743, "grad_norm": 0.4140625, "learning_rate": 3.947636317363365e-05, "loss": 0.0005, "step": 45490 }, { "epoch": 2.894954507857734, "grad_norm": 0.03564453125, "learning_rate": 3.947282843064481e-05, "loss": 0.0013, "step": 45500 }, { "epoch": 2.8955907615957246, "grad_norm": 0.0306396484375, "learning_rate": 3.946929368765597e-05, "loss": 0.0003, "step": 45510 }, { "epoch": 2.8962270153337153, "grad_norm": 0.00238037109375, "learning_rate": 3.946575894466714e-05, "loss": 0.0003, "step": 45520 }, { "epoch": 2.896863269071706, "grad_norm": 0.003204345703125, "learning_rate": 3.9462224201678296e-05, "loss": 0.0001, "step": 45530 }, { "epoch": 2.8974995228096967, "grad_norm": 0.00634765625, "learning_rate": 3.945868945868946e-05, "loss": 0.0012, "step": 45540 }, { "epoch": 2.8981357765476874, "grad_norm": 0.10986328125, "learning_rate": 3.945515471570062e-05, "loss": 0.0065, "step": 45550 }, { "epoch": 2.898772030285678, "grad_norm": 2.359375, "learning_rate": 3.9451619972711785e-05, "loss": 0.0026, "step": 45560 }, { "epoch": 2.899408284023669, "grad_norm": 0.0810546875, "learning_rate": 3.944808522972295e-05, "loss": 0.0055, "step": 45570 }, { "epoch": 2.9000445377616595, "grad_norm": 0.1650390625, "learning_rate": 3.944455048673411e-05, "loss": 0.0015, "step": 45580 }, { "epoch": 2.90068079149965, "grad_norm": 0.002349853515625, "learning_rate": 3.9441015743745274e-05, "loss": 0.0002, "step": 45590 }, { "epoch": 2.901317045237641, "grad_norm": 0.01068115234375, "learning_rate": 3.943748100075644e-05, "loss": 0.0007, "step": 45600 }, { "epoch": 2.9019532989756316, "grad_norm": 0.003448486328125, "learning_rate": 3.9433946257767604e-05, "loss": 0.0003, "step": 45610 }, { "epoch": 2.9025895527136223, "grad_norm": 0.0011138916015625, "learning_rate": 3.943041151477876e-05, "loss": 0.0002, "step": 45620 }, { "epoch": 2.903225806451613, "grad_norm": 0.0086669921875, "learning_rate": 3.942687677178992e-05, "loss": 0.0002, "step": 45630 }, { "epoch": 2.9038620601896037, "grad_norm": 0.00836181640625, "learning_rate": 3.9423342028801086e-05, "loss": 0.0002, "step": 45640 }, { "epoch": 2.9044983139275944, "grad_norm": 0.061767578125, "learning_rate": 3.941980728581225e-05, "loss": 0.0002, "step": 45650 }, { "epoch": 2.905134567665585, "grad_norm": 0.00128936767578125, "learning_rate": 3.941627254282341e-05, "loss": 0.0054, "step": 45660 }, { "epoch": 2.905770821403576, "grad_norm": 0.002777099609375, "learning_rate": 3.9412737799834575e-05, "loss": 0.0003, "step": 45670 }, { "epoch": 2.9064070751415665, "grad_norm": 0.005645751953125, "learning_rate": 3.940920305684574e-05, "loss": 0.0001, "step": 45680 }, { "epoch": 2.907043328879557, "grad_norm": 0.00045013427734375, "learning_rate": 3.9405668313856905e-05, "loss": 0.0014, "step": 45690 }, { "epoch": 2.907679582617548, "grad_norm": 0.09765625, "learning_rate": 3.9402133570868064e-05, "loss": 0.0055, "step": 45700 }, { "epoch": 2.9083158363555386, "grad_norm": 0.001373291015625, "learning_rate": 3.939859882787923e-05, "loss": 0.0001, "step": 45710 }, { "epoch": 2.9089520900935293, "grad_norm": 0.00150299072265625, "learning_rate": 3.939506408489039e-05, "loss": 0.0005, "step": 45720 }, { "epoch": 2.90958834383152, "grad_norm": 0.00518798828125, "learning_rate": 3.939152934190155e-05, "loss": 0.0003, "step": 45730 }, { "epoch": 2.9102245975695107, "grad_norm": 0.0556640625, "learning_rate": 3.938799459891271e-05, "loss": 0.0009, "step": 45740 }, { "epoch": 2.9108608513075014, "grad_norm": 0.006927490234375, "learning_rate": 3.9384459855923876e-05, "loss": 0.0003, "step": 45750 }, { "epoch": 2.911497105045492, "grad_norm": 0.0107421875, "learning_rate": 3.938092511293504e-05, "loss": 0.0003, "step": 45760 }, { "epoch": 2.912133358783483, "grad_norm": 0.042724609375, "learning_rate": 3.9377390369946206e-05, "loss": 0.0002, "step": 45770 }, { "epoch": 2.9127696125214735, "grad_norm": 0.0002994537353515625, "learning_rate": 3.9373855626957365e-05, "loss": 0.0001, "step": 45780 }, { "epoch": 2.913405866259464, "grad_norm": 0.0146484375, "learning_rate": 3.937032088396853e-05, "loss": 0.0001, "step": 45790 }, { "epoch": 2.914042119997455, "grad_norm": 0.00335693359375, "learning_rate": 3.936678614097969e-05, "loss": 0.0018, "step": 45800 }, { "epoch": 2.9146783737354456, "grad_norm": 0.07177734375, "learning_rate": 3.9363251397990854e-05, "loss": 0.0004, "step": 45810 }, { "epoch": 2.9153146274734363, "grad_norm": 0.00171661376953125, "learning_rate": 3.935971665500202e-05, "loss": 0.0022, "step": 45820 }, { "epoch": 2.915950881211427, "grad_norm": 0.1845703125, "learning_rate": 3.935618191201318e-05, "loss": 0.0004, "step": 45830 }, { "epoch": 2.9165871349494177, "grad_norm": 0.058349609375, "learning_rate": 3.935264716902434e-05, "loss": 0.0035, "step": 45840 }, { "epoch": 2.9172233886874084, "grad_norm": 0.005706787109375, "learning_rate": 3.934911242603551e-05, "loss": 0.0075, "step": 45850 }, { "epoch": 2.917859642425399, "grad_norm": 0.002716064453125, "learning_rate": 3.934557768304667e-05, "loss": 0.0003, "step": 45860 }, { "epoch": 2.91849589616339, "grad_norm": 0.07275390625, "learning_rate": 3.934204294005783e-05, "loss": 0.0002, "step": 45870 }, { "epoch": 2.9191321499013805, "grad_norm": 0.043701171875, "learning_rate": 3.933850819706899e-05, "loss": 0.0002, "step": 45880 }, { "epoch": 2.919768403639371, "grad_norm": 0.0947265625, "learning_rate": 3.9334973454080155e-05, "loss": 0.0002, "step": 45890 }, { "epoch": 2.920404657377362, "grad_norm": 0.00335693359375, "learning_rate": 3.933143871109132e-05, "loss": 0.0007, "step": 45900 }, { "epoch": 2.9210409111153526, "grad_norm": 0.003509521484375, "learning_rate": 3.932790396810248e-05, "loss": 0.0002, "step": 45910 }, { "epoch": 2.9216771648533433, "grad_norm": 0.044921875, "learning_rate": 3.9324369225113643e-05, "loss": 0.0004, "step": 45920 }, { "epoch": 2.922313418591334, "grad_norm": 0.0072021484375, "learning_rate": 3.932083448212481e-05, "loss": 0.0001, "step": 45930 }, { "epoch": 2.9229496723293247, "grad_norm": 0.0146484375, "learning_rate": 3.9317299739135974e-05, "loss": 0.0001, "step": 45940 }, { "epoch": 2.9235859260673154, "grad_norm": 0.00897216796875, "learning_rate": 3.931376499614713e-05, "loss": 0.0005, "step": 45950 }, { "epoch": 2.9242221798053065, "grad_norm": 0.23046875, "learning_rate": 3.931023025315829e-05, "loss": 0.0002, "step": 45960 }, { "epoch": 2.924858433543297, "grad_norm": 0.00811767578125, "learning_rate": 3.9306695510169456e-05, "loss": 0.0001, "step": 45970 }, { "epoch": 2.925494687281288, "grad_norm": 0.05810546875, "learning_rate": 3.930316076718062e-05, "loss": 0.0003, "step": 45980 }, { "epoch": 2.9261309410192786, "grad_norm": 0.0322265625, "learning_rate": 3.929962602419178e-05, "loss": 0.0002, "step": 45990 }, { "epoch": 2.9267671947572693, "grad_norm": 0.0673828125, "learning_rate": 3.9296091281202945e-05, "loss": 0.0012, "step": 46000 }, { "epoch": 2.92740344849526, "grad_norm": 0.003204345703125, "learning_rate": 3.929255653821411e-05, "loss": 0.0001, "step": 46010 }, { "epoch": 2.9280397022332507, "grad_norm": 0.0022735595703125, "learning_rate": 3.9289021795225275e-05, "loss": 0.0014, "step": 46020 }, { "epoch": 2.9286759559712414, "grad_norm": 0.7578125, "learning_rate": 3.9285487052236433e-05, "loss": 0.0006, "step": 46030 }, { "epoch": 2.929312209709232, "grad_norm": 0.0159912109375, "learning_rate": 3.928195230924759e-05, "loss": 0.0006, "step": 46040 }, { "epoch": 2.929948463447223, "grad_norm": 0.02392578125, "learning_rate": 3.927841756625876e-05, "loss": 0.0014, "step": 46050 }, { "epoch": 2.9305847171852135, "grad_norm": 0.0732421875, "learning_rate": 3.927488282326992e-05, "loss": 0.0009, "step": 46060 }, { "epoch": 2.931220970923204, "grad_norm": 0.0228271484375, "learning_rate": 3.927134808028109e-05, "loss": 0.0037, "step": 46070 }, { "epoch": 2.931857224661195, "grad_norm": 0.06201171875, "learning_rate": 3.9267813337292246e-05, "loss": 0.0008, "step": 46080 }, { "epoch": 2.9324934783991856, "grad_norm": 0.01544189453125, "learning_rate": 3.926427859430341e-05, "loss": 0.0003, "step": 46090 }, { "epoch": 2.9331297321371763, "grad_norm": 0.00122833251953125, "learning_rate": 3.9260743851314576e-05, "loss": 0.0006, "step": 46100 }, { "epoch": 2.933765985875167, "grad_norm": 0.01416015625, "learning_rate": 3.9257209108325735e-05, "loss": 0.0001, "step": 46110 }, { "epoch": 2.9344022396131577, "grad_norm": 0.06640625, "learning_rate": 3.925367436533689e-05, "loss": 0.0002, "step": 46120 }, { "epoch": 2.9350384933511484, "grad_norm": 0.244140625, "learning_rate": 3.925013962234806e-05, "loss": 0.0002, "step": 46130 }, { "epoch": 2.935674747089139, "grad_norm": 0.0023345947265625, "learning_rate": 3.924660487935922e-05, "loss": 0.0007, "step": 46140 }, { "epoch": 2.93631100082713, "grad_norm": 0.0164794921875, "learning_rate": 3.924307013637039e-05, "loss": 0.0001, "step": 46150 }, { "epoch": 2.9369472545651205, "grad_norm": 0.001007080078125, "learning_rate": 3.923953539338155e-05, "loss": 0.0002, "step": 46160 }, { "epoch": 2.937583508303111, "grad_norm": 0.02734375, "learning_rate": 3.923600065039271e-05, "loss": 0.0003, "step": 46170 }, { "epoch": 2.938219762041102, "grad_norm": 0.0125732421875, "learning_rate": 3.923246590740388e-05, "loss": 0.0001, "step": 46180 }, { "epoch": 2.9388560157790926, "grad_norm": 0.1328125, "learning_rate": 3.922893116441504e-05, "loss": 0.0004, "step": 46190 }, { "epoch": 2.9394922695170833, "grad_norm": 0.0012969970703125, "learning_rate": 3.92253964214262e-05, "loss": 0.0007, "step": 46200 }, { "epoch": 2.940128523255074, "grad_norm": 0.0013885498046875, "learning_rate": 3.922186167843736e-05, "loss": 0.0013, "step": 46210 }, { "epoch": 2.9407647769930647, "grad_norm": 0.04931640625, "learning_rate": 3.9218326935448524e-05, "loss": 0.001, "step": 46220 }, { "epoch": 2.9414010307310554, "grad_norm": 0.0556640625, "learning_rate": 3.921479219245969e-05, "loss": 0.0011, "step": 46230 }, { "epoch": 2.942037284469046, "grad_norm": 0.0135498046875, "learning_rate": 3.921125744947085e-05, "loss": 0.0002, "step": 46240 }, { "epoch": 2.942673538207037, "grad_norm": 0.0067138671875, "learning_rate": 3.920772270648201e-05, "loss": 0.0011, "step": 46250 }, { "epoch": 2.943309791945028, "grad_norm": 0.00994873046875, "learning_rate": 3.920418796349318e-05, "loss": 0.0001, "step": 46260 }, { "epoch": 2.9439460456830187, "grad_norm": 0.00946044921875, "learning_rate": 3.9200653220504344e-05, "loss": 0.0036, "step": 46270 }, { "epoch": 2.9445822994210094, "grad_norm": 0.00445556640625, "learning_rate": 3.91971184775155e-05, "loss": 0.0009, "step": 46280 }, { "epoch": 2.945218553159, "grad_norm": 0.015625, "learning_rate": 3.919358373452666e-05, "loss": 0.0002, "step": 46290 }, { "epoch": 2.9458548068969908, "grad_norm": 0.002471923828125, "learning_rate": 3.9190048991537826e-05, "loss": 0.0005, "step": 46300 }, { "epoch": 2.9464910606349815, "grad_norm": 0.0064697265625, "learning_rate": 3.918651424854899e-05, "loss": 0.0004, "step": 46310 }, { "epoch": 2.947127314372972, "grad_norm": 0.01239013671875, "learning_rate": 3.918297950556015e-05, "loss": 0.0002, "step": 46320 }, { "epoch": 2.947763568110963, "grad_norm": 0.01336669921875, "learning_rate": 3.9179444762571314e-05, "loss": 0.0001, "step": 46330 }, { "epoch": 2.9483998218489536, "grad_norm": 0.000743865966796875, "learning_rate": 3.917591001958248e-05, "loss": 0.0002, "step": 46340 }, { "epoch": 2.9490360755869442, "grad_norm": 2.21875, "learning_rate": 3.9172375276593645e-05, "loss": 0.0014, "step": 46350 }, { "epoch": 2.949672329324935, "grad_norm": 0.0068359375, "learning_rate": 3.91688405336048e-05, "loss": 0.0013, "step": 46360 }, { "epoch": 2.9503085830629256, "grad_norm": 0.173828125, "learning_rate": 3.916530579061596e-05, "loss": 0.0001, "step": 46370 }, { "epoch": 2.9509448368009163, "grad_norm": 0.00897216796875, "learning_rate": 3.916177104762713e-05, "loss": 0.0008, "step": 46380 }, { "epoch": 2.951581090538907, "grad_norm": 0.033203125, "learning_rate": 3.915823630463829e-05, "loss": 0.0002, "step": 46390 }, { "epoch": 2.9522173442768977, "grad_norm": 0.01708984375, "learning_rate": 3.915470156164946e-05, "loss": 0.0008, "step": 46400 }, { "epoch": 2.9528535980148884, "grad_norm": 0.004150390625, "learning_rate": 3.9151166818660616e-05, "loss": 0.0005, "step": 46410 }, { "epoch": 2.953489851752879, "grad_norm": 0.01068115234375, "learning_rate": 3.914763207567178e-05, "loss": 0.0001, "step": 46420 }, { "epoch": 2.95412610549087, "grad_norm": 0.006195068359375, "learning_rate": 3.9144097332682946e-05, "loss": 0.0019, "step": 46430 }, { "epoch": 2.9547623592288605, "grad_norm": 0.050537109375, "learning_rate": 3.9140562589694104e-05, "loss": 0.0002, "step": 46440 }, { "epoch": 2.9553986129668512, "grad_norm": 0.00567626953125, "learning_rate": 3.913702784670526e-05, "loss": 0.0002, "step": 46450 }, { "epoch": 2.956034866704842, "grad_norm": 0.0012969970703125, "learning_rate": 3.913349310371643e-05, "loss": 0.0001, "step": 46460 }, { "epoch": 2.9566711204428326, "grad_norm": 5.875, "learning_rate": 3.912995836072759e-05, "loss": 0.0037, "step": 46470 }, { "epoch": 2.9573073741808233, "grad_norm": 0.037353515625, "learning_rate": 3.912642361773876e-05, "loss": 0.0003, "step": 46480 }, { "epoch": 2.957943627918814, "grad_norm": 0.034912109375, "learning_rate": 3.912288887474992e-05, "loss": 0.004, "step": 46490 }, { "epoch": 2.9585798816568047, "grad_norm": 0.02783203125, "learning_rate": 3.911935413176108e-05, "loss": 0.0002, "step": 46500 }, { "epoch": 2.9592161353947954, "grad_norm": 0.016357421875, "learning_rate": 3.911581938877225e-05, "loss": 0.0001, "step": 46510 }, { "epoch": 2.959852389132786, "grad_norm": 0.0186767578125, "learning_rate": 3.911228464578341e-05, "loss": 0.0001, "step": 46520 }, { "epoch": 2.960488642870777, "grad_norm": 0.005767822265625, "learning_rate": 3.9108749902794564e-05, "loss": 0.0001, "step": 46530 }, { "epoch": 2.9611248966087675, "grad_norm": 0.1337890625, "learning_rate": 3.910521515980573e-05, "loss": 0.0006, "step": 46540 }, { "epoch": 2.9617611503467582, "grad_norm": 0.005706787109375, "learning_rate": 3.9101680416816894e-05, "loss": 0.0001, "step": 46550 }, { "epoch": 2.962397404084749, "grad_norm": 0.07958984375, "learning_rate": 3.909814567382806e-05, "loss": 0.0004, "step": 46560 }, { "epoch": 2.9630336578227396, "grad_norm": 0.0030364990234375, "learning_rate": 3.909461093083922e-05, "loss": 0.0001, "step": 46570 }, { "epoch": 2.9636699115607303, "grad_norm": 1.6953125, "learning_rate": 3.909107618785038e-05, "loss": 0.0006, "step": 46580 }, { "epoch": 2.964306165298721, "grad_norm": 0.00167083740234375, "learning_rate": 3.908754144486155e-05, "loss": 0.0005, "step": 46590 }, { "epoch": 2.9649424190367117, "grad_norm": 0.00714111328125, "learning_rate": 3.9084006701872713e-05, "loss": 0.001, "step": 46600 }, { "epoch": 2.9655786727747024, "grad_norm": 0.0103759765625, "learning_rate": 3.908047195888387e-05, "loss": 0.0002, "step": 46610 }, { "epoch": 2.966214926512693, "grad_norm": 0.053466796875, "learning_rate": 3.907693721589503e-05, "loss": 0.0012, "step": 46620 }, { "epoch": 2.966851180250684, "grad_norm": 9.0, "learning_rate": 3.9073402472906195e-05, "loss": 0.0091, "step": 46630 }, { "epoch": 2.9674874339886745, "grad_norm": 0.1181640625, "learning_rate": 3.906986772991736e-05, "loss": 0.0008, "step": 46640 }, { "epoch": 2.9681236877266652, "grad_norm": 0.007080078125, "learning_rate": 3.906633298692852e-05, "loss": 0.0004, "step": 46650 }, { "epoch": 2.968759941464656, "grad_norm": 0.01385498046875, "learning_rate": 3.9062798243939684e-05, "loss": 0.0004, "step": 46660 }, { "epoch": 2.9693961952026466, "grad_norm": 0.1279296875, "learning_rate": 3.905926350095085e-05, "loss": 0.0066, "step": 46670 }, { "epoch": 2.9700324489406373, "grad_norm": 0.003021240234375, "learning_rate": 3.9055728757962015e-05, "loss": 0.0108, "step": 46680 }, { "epoch": 2.970668702678628, "grad_norm": 0.00341796875, "learning_rate": 3.905219401497317e-05, "loss": 0.0007, "step": 46690 }, { "epoch": 2.9713049564166187, "grad_norm": 0.0030670166015625, "learning_rate": 3.904865927198433e-05, "loss": 0.0041, "step": 46700 }, { "epoch": 2.9719412101546094, "grad_norm": 0.020263671875, "learning_rate": 3.90451245289955e-05, "loss": 0.0005, "step": 46710 }, { "epoch": 2.9725774638926, "grad_norm": 1.96875, "learning_rate": 3.904158978600666e-05, "loss": 0.0014, "step": 46720 }, { "epoch": 2.973213717630591, "grad_norm": 0.03515625, "learning_rate": 3.903805504301783e-05, "loss": 0.0005, "step": 46730 }, { "epoch": 2.9738499713685815, "grad_norm": 0.01373291015625, "learning_rate": 3.9034520300028985e-05, "loss": 0.0001, "step": 46740 }, { "epoch": 2.9744862251065722, "grad_norm": 0.01904296875, "learning_rate": 3.903098555704015e-05, "loss": 0.0001, "step": 46750 }, { "epoch": 2.975122478844563, "grad_norm": 0.000926971435546875, "learning_rate": 3.9027450814051316e-05, "loss": 0.0001, "step": 46760 }, { "epoch": 2.975758732582554, "grad_norm": 0.0673828125, "learning_rate": 3.9023916071062474e-05, "loss": 0.0002, "step": 46770 }, { "epoch": 2.9763949863205448, "grad_norm": 0.00860595703125, "learning_rate": 3.902038132807363e-05, "loss": 0.0001, "step": 46780 }, { "epoch": 2.9770312400585355, "grad_norm": 0.01092529296875, "learning_rate": 3.90168465850848e-05, "loss": 0.0002, "step": 46790 }, { "epoch": 2.977667493796526, "grad_norm": 0.004180908203125, "learning_rate": 3.901331184209596e-05, "loss": 0.0001, "step": 46800 }, { "epoch": 2.978303747534517, "grad_norm": 0.03271484375, "learning_rate": 3.900977709910713e-05, "loss": 0.0001, "step": 46810 }, { "epoch": 2.9789400012725076, "grad_norm": 0.01611328125, "learning_rate": 3.9006242356118287e-05, "loss": 0.0058, "step": 46820 }, { "epoch": 2.9795762550104983, "grad_norm": 0.02978515625, "learning_rate": 3.900270761312945e-05, "loss": 0.0011, "step": 46830 }, { "epoch": 2.980212508748489, "grad_norm": 0.000644683837890625, "learning_rate": 3.899917287014062e-05, "loss": 0.0002, "step": 46840 }, { "epoch": 2.9808487624864797, "grad_norm": 0.0908203125, "learning_rate": 3.899563812715178e-05, "loss": 0.0002, "step": 46850 }, { "epoch": 2.9814850162244704, "grad_norm": 0.07275390625, "learning_rate": 3.8992103384162934e-05, "loss": 0.0007, "step": 46860 }, { "epoch": 2.982121269962461, "grad_norm": 0.025146484375, "learning_rate": 3.89885686411741e-05, "loss": 0.0, "step": 46870 }, { "epoch": 2.9827575237004518, "grad_norm": 0.11083984375, "learning_rate": 3.8985033898185264e-05, "loss": 0.0013, "step": 46880 }, { "epoch": 2.9833937774384425, "grad_norm": 0.04248046875, "learning_rate": 3.898149915519643e-05, "loss": 0.0016, "step": 46890 }, { "epoch": 2.984030031176433, "grad_norm": 0.00109100341796875, "learning_rate": 3.897796441220759e-05, "loss": 0.0004, "step": 46900 }, { "epoch": 2.984666284914424, "grad_norm": 0.00848388671875, "learning_rate": 3.897442966921875e-05, "loss": 0.0002, "step": 46910 }, { "epoch": 2.9853025386524146, "grad_norm": 0.1748046875, "learning_rate": 3.897089492622992e-05, "loss": 0.008, "step": 46920 }, { "epoch": 2.9859387923904053, "grad_norm": 0.0040283203125, "learning_rate": 3.896736018324108e-05, "loss": 0.0002, "step": 46930 }, { "epoch": 2.986575046128396, "grad_norm": 0.00640869140625, "learning_rate": 3.896382544025224e-05, "loss": 0.0012, "step": 46940 }, { "epoch": 2.9872112998663867, "grad_norm": 0.01373291015625, "learning_rate": 3.89602906972634e-05, "loss": 0.0003, "step": 46950 }, { "epoch": 2.9878475536043774, "grad_norm": 0.010986328125, "learning_rate": 3.8956755954274565e-05, "loss": 0.0009, "step": 46960 }, { "epoch": 2.988483807342368, "grad_norm": 0.1142578125, "learning_rate": 3.895322121128573e-05, "loss": 0.0015, "step": 46970 }, { "epoch": 2.9891200610803588, "grad_norm": 0.059814453125, "learning_rate": 3.894968646829689e-05, "loss": 0.0038, "step": 46980 }, { "epoch": 2.9897563148183495, "grad_norm": 0.00799560546875, "learning_rate": 3.8946151725308054e-05, "loss": 0.0001, "step": 46990 }, { "epoch": 2.99039256855634, "grad_norm": 0.1396484375, "learning_rate": 3.894261698231922e-05, "loss": 0.0003, "step": 47000 }, { "epoch": 2.991028822294331, "grad_norm": 0.0022125244140625, "learning_rate": 3.8939082239330384e-05, "loss": 0.0002, "step": 47010 }, { "epoch": 2.9916650760323216, "grad_norm": 0.0034027099609375, "learning_rate": 3.893554749634154e-05, "loss": 0.0003, "step": 47020 }, { "epoch": 2.9923013297703123, "grad_norm": 0.00982666015625, "learning_rate": 3.89320127533527e-05, "loss": 0.0006, "step": 47030 }, { "epoch": 2.992937583508303, "grad_norm": 0.0022735595703125, "learning_rate": 3.8928478010363866e-05, "loss": 0.0004, "step": 47040 }, { "epoch": 2.9935738372462937, "grad_norm": 0.07421875, "learning_rate": 3.892494326737503e-05, "loss": 0.0134, "step": 47050 }, { "epoch": 2.9942100909842844, "grad_norm": 0.005279541015625, "learning_rate": 3.89214085243862e-05, "loss": 0.0005, "step": 47060 }, { "epoch": 2.9948463447222755, "grad_norm": 0.01336669921875, "learning_rate": 3.8917873781397355e-05, "loss": 0.0001, "step": 47070 }, { "epoch": 2.995482598460266, "grad_norm": 0.28125, "learning_rate": 3.891433903840852e-05, "loss": 0.0005, "step": 47080 }, { "epoch": 2.996118852198257, "grad_norm": 0.01202392578125, "learning_rate": 3.8910804295419686e-05, "loss": 0.0028, "step": 47090 }, { "epoch": 2.9967551059362476, "grad_norm": 0.006988525390625, "learning_rate": 3.8907269552430844e-05, "loss": 0.0007, "step": 47100 }, { "epoch": 2.9973913596742383, "grad_norm": 0.01019287109375, "learning_rate": 3.8903734809442e-05, "loss": 0.0001, "step": 47110 }, { "epoch": 2.998027613412229, "grad_norm": 0.2138671875, "learning_rate": 3.890020006645317e-05, "loss": 0.0003, "step": 47120 }, { "epoch": 2.9986638671502197, "grad_norm": 0.005126953125, "learning_rate": 3.889666532346433e-05, "loss": 0.0004, "step": 47130 }, { "epoch": 2.9993001208882104, "grad_norm": 0.01055908203125, "learning_rate": 3.88931305804755e-05, "loss": 0.0002, "step": 47140 }, { "epoch": 2.999936374626201, "grad_norm": 0.85546875, "learning_rate": 3.8889595837486656e-05, "loss": 0.0005, "step": 47150 } ], "logging_steps": 10, "max_steps": 157170, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.121429067698012e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }