{ "best_metric": 0.9950745058918885, "best_model_checkpoint": "/content/drive/My Drive/results/checkpoint-32080", "epoch": 5.0, "eval_steps": 500, "global_step": 40100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012468827930174563, "grad_norm": 1.4657020568847656, "learning_rate": 1.9995012468827932e-05, "loss": 1.0656, "step": 10 }, { "epoch": 0.0024937655860349127, "grad_norm": 1.603716492652893, "learning_rate": 1.9990024937655863e-05, "loss": 1.0429, "step": 20 }, { "epoch": 0.003740648379052369, "grad_norm": 1.6681715250015259, "learning_rate": 1.998503740648379e-05, "loss": 1.0375, "step": 30 }, { "epoch": 0.004987531172069825, "grad_norm": 2.473482608795166, "learning_rate": 1.998004987531172e-05, "loss": 1.0195, "step": 40 }, { "epoch": 0.006234413965087282, "grad_norm": 3.1133217811584473, "learning_rate": 1.997506234413965e-05, "loss": 0.9203, "step": 50 }, { "epoch": 0.007481296758104738, "grad_norm": 4.7094550132751465, "learning_rate": 1.9970074812967582e-05, "loss": 0.8761, "step": 60 }, { "epoch": 0.008728179551122194, "grad_norm": 5.289208889007568, "learning_rate": 1.9965087281795513e-05, "loss": 0.7031, "step": 70 }, { "epoch": 0.00997506234413965, "grad_norm": 8.144381523132324, "learning_rate": 1.9960099750623443e-05, "loss": 0.5534, "step": 80 }, { "epoch": 0.011221945137157107, "grad_norm": 10.670867919921875, "learning_rate": 1.9955112219451374e-05, "loss": 0.5365, "step": 90 }, { "epoch": 0.012468827930174564, "grad_norm": 9.720280647277832, "learning_rate": 1.995062344139651e-05, "loss": 0.3846, "step": 100 }, { "epoch": 0.01371571072319202, "grad_norm": 8.397249221801758, "learning_rate": 1.994563591022444e-05, "loss": 0.4167, "step": 110 }, { "epoch": 0.014962593516209476, "grad_norm": 5.264407157897949, "learning_rate": 1.9940648379052368e-05, "loss": 0.3376, "step": 120 }, { "epoch": 0.016209476309226933, "grad_norm": 12.360579490661621, "learning_rate": 1.99356608478803e-05, "loss": 0.3407, "step": 130 }, { "epoch": 0.017456359102244388, "grad_norm": 6.203169345855713, "learning_rate": 1.993067331670823e-05, "loss": 0.3516, "step": 140 }, { "epoch": 0.018703241895261846, "grad_norm": 10.887129783630371, "learning_rate": 1.9925685785536163e-05, "loss": 0.3273, "step": 150 }, { "epoch": 0.0199501246882793, "grad_norm": 4.2213287353515625, "learning_rate": 1.992069825436409e-05, "loss": 0.3442, "step": 160 }, { "epoch": 0.02119700748129676, "grad_norm": 12.920991897583008, "learning_rate": 1.991620947630923e-05, "loss": 0.2905, "step": 170 }, { "epoch": 0.022443890274314215, "grad_norm": 10.486888885498047, "learning_rate": 1.9911221945137158e-05, "loss": 0.3545, "step": 180 }, { "epoch": 0.02369077306733167, "grad_norm": 6.999630928039551, "learning_rate": 1.990623441396509e-05, "loss": 0.1938, "step": 190 }, { "epoch": 0.02493765586034913, "grad_norm": 21.676868438720703, "learning_rate": 1.990124688279302e-05, "loss": 0.263, "step": 200 }, { "epoch": 0.026184538653366583, "grad_norm": 24.033039093017578, "learning_rate": 1.9896758104738156e-05, "loss": 0.2844, "step": 210 }, { "epoch": 0.02743142144638404, "grad_norm": 5.284111976623535, "learning_rate": 1.9891770573566086e-05, "loss": 0.2864, "step": 220 }, { "epoch": 0.028678304239401497, "grad_norm": 21.66871452331543, "learning_rate": 1.9886783042394017e-05, "loss": 0.3661, "step": 230 }, { "epoch": 0.029925187032418952, "grad_norm": 2.484323740005493, "learning_rate": 1.9881795511221947e-05, "loss": 0.1745, "step": 240 }, { "epoch": 0.03117206982543641, "grad_norm": 12.316446304321289, "learning_rate": 1.9876807980049878e-05, "loss": 0.2384, "step": 250 }, { "epoch": 0.032418952618453865, "grad_norm": 15.095812797546387, "learning_rate": 1.987182044887781e-05, "loss": 0.2639, "step": 260 }, { "epoch": 0.03366583541147132, "grad_norm": 4.764219760894775, "learning_rate": 1.9866832917705736e-05, "loss": 0.1828, "step": 270 }, { "epoch": 0.034912718204488775, "grad_norm": 6.600683689117432, "learning_rate": 1.9861845386533667e-05, "loss": 0.2929, "step": 280 }, { "epoch": 0.03615960099750624, "grad_norm": 10.377800941467285, "learning_rate": 1.9856857855361597e-05, "loss": 0.2242, "step": 290 }, { "epoch": 0.03740648379052369, "grad_norm": 14.586189270019531, "learning_rate": 1.9851870324189528e-05, "loss": 0.1666, "step": 300 }, { "epoch": 0.03865336658354115, "grad_norm": 10.822226524353027, "learning_rate": 1.984688279301746e-05, "loss": 0.3677, "step": 310 }, { "epoch": 0.0399002493765586, "grad_norm": 18.322172164916992, "learning_rate": 1.984189526184539e-05, "loss": 0.3073, "step": 320 }, { "epoch": 0.04114713216957606, "grad_norm": 12.16933822631836, "learning_rate": 1.983690773067332e-05, "loss": 0.2328, "step": 330 }, { "epoch": 0.04239401496259352, "grad_norm": 18.187196731567383, "learning_rate": 1.983192019950125e-05, "loss": 0.237, "step": 340 }, { "epoch": 0.043640897755610975, "grad_norm": 19.095233917236328, "learning_rate": 1.982693266832918e-05, "loss": 0.2531, "step": 350 }, { "epoch": 0.04488778054862843, "grad_norm": 19.445892333984375, "learning_rate": 1.9821945137157108e-05, "loss": 0.1914, "step": 360 }, { "epoch": 0.046134663341645885, "grad_norm": 7.831539630889893, "learning_rate": 1.981695760598504e-05, "loss": 0.2373, "step": 370 }, { "epoch": 0.04738154613466334, "grad_norm": 19.73242950439453, "learning_rate": 1.981197007481297e-05, "loss": 0.2471, "step": 380 }, { "epoch": 0.048628428927680795, "grad_norm": 18.188596725463867, "learning_rate": 1.98069825436409e-05, "loss": 0.2306, "step": 390 }, { "epoch": 0.04987531172069826, "grad_norm": 15.301477432250977, "learning_rate": 1.9801995012468827e-05, "loss": 0.2787, "step": 400 }, { "epoch": 0.05112219451371571, "grad_norm": 0.6682356595993042, "learning_rate": 1.9797007481296758e-05, "loss": 0.1546, "step": 410 }, { "epoch": 0.05236907730673317, "grad_norm": 12.32183837890625, "learning_rate": 1.979201995012469e-05, "loss": 0.2832, "step": 420 }, { "epoch": 0.05361596009975062, "grad_norm": 13.420507431030273, "learning_rate": 1.9787032418952622e-05, "loss": 0.2139, "step": 430 }, { "epoch": 0.05486284289276808, "grad_norm": 21.027982711791992, "learning_rate": 1.978204488778055e-05, "loss": 0.2048, "step": 440 }, { "epoch": 0.05610972568578554, "grad_norm": 1.9423059225082397, "learning_rate": 1.977705735660848e-05, "loss": 0.1517, "step": 450 }, { "epoch": 0.057356608478802994, "grad_norm": 4.442639350891113, "learning_rate": 1.977206982543641e-05, "loss": 0.168, "step": 460 }, { "epoch": 0.05860349127182045, "grad_norm": 25.09494972229004, "learning_rate": 1.976708229426434e-05, "loss": 0.2924, "step": 470 }, { "epoch": 0.059850374064837904, "grad_norm": 8.644868850708008, "learning_rate": 1.976209476309227e-05, "loss": 0.335, "step": 480 }, { "epoch": 0.06109725685785536, "grad_norm": 15.42319393157959, "learning_rate": 1.97571072319202e-05, "loss": 0.2315, "step": 490 }, { "epoch": 0.06234413965087282, "grad_norm": 11.491703987121582, "learning_rate": 1.975211970074813e-05, "loss": 0.1148, "step": 500 }, { "epoch": 0.06359102244389027, "grad_norm": 9.482577323913574, "learning_rate": 1.974713216957606e-05, "loss": 0.1658, "step": 510 }, { "epoch": 0.06483790523690773, "grad_norm": 0.5576394200325012, "learning_rate": 1.974214463840399e-05, "loss": 0.232, "step": 520 }, { "epoch": 0.06608478802992519, "grad_norm": 0.857613742351532, "learning_rate": 1.9737157107231922e-05, "loss": 0.0998, "step": 530 }, { "epoch": 0.06733167082294264, "grad_norm": 10.808552742004395, "learning_rate": 1.9732169576059853e-05, "loss": 0.1623, "step": 540 }, { "epoch": 0.0685785536159601, "grad_norm": 6.505871772766113, "learning_rate": 1.9727182044887783e-05, "loss": 0.1348, "step": 550 }, { "epoch": 0.06982543640897755, "grad_norm": 11.519365310668945, "learning_rate": 1.9722194513715714e-05, "loss": 0.1733, "step": 560 }, { "epoch": 0.07107231920199501, "grad_norm": 5.1333537101745605, "learning_rate": 1.971720698254364e-05, "loss": 0.1829, "step": 570 }, { "epoch": 0.07231920199501247, "grad_norm": 24.81167221069336, "learning_rate": 1.971221945137157e-05, "loss": 0.2117, "step": 580 }, { "epoch": 0.07356608478802992, "grad_norm": 21.29195213317871, "learning_rate": 1.9707231920199502e-05, "loss": 0.173, "step": 590 }, { "epoch": 0.07481296758104738, "grad_norm": 1.9257557392120361, "learning_rate": 1.9702244389027433e-05, "loss": 0.1143, "step": 600 }, { "epoch": 0.07605985037406483, "grad_norm": 9.582220077514648, "learning_rate": 1.9697256857855363e-05, "loss": 0.1527, "step": 610 }, { "epoch": 0.0773067331670823, "grad_norm": 10.984932899475098, "learning_rate": 1.9692269326683294e-05, "loss": 0.3024, "step": 620 }, { "epoch": 0.07855361596009976, "grad_norm": 10.277746200561523, "learning_rate": 1.9687281795511225e-05, "loss": 0.1069, "step": 630 }, { "epoch": 0.0798004987531172, "grad_norm": 8.234357833862305, "learning_rate": 1.9682294264339155e-05, "loss": 0.0953, "step": 640 }, { "epoch": 0.08104738154613467, "grad_norm": 5.079126834869385, "learning_rate": 1.9677306733167083e-05, "loss": 0.0765, "step": 650 }, { "epoch": 0.08229426433915212, "grad_norm": 32.218772888183594, "learning_rate": 1.9672319201995013e-05, "loss": 0.1209, "step": 660 }, { "epoch": 0.08354114713216958, "grad_norm": 7.151175022125244, "learning_rate": 1.9667331670822944e-05, "loss": 0.1674, "step": 670 }, { "epoch": 0.08478802992518704, "grad_norm": 18.98018455505371, "learning_rate": 1.9662344139650874e-05, "loss": 0.2542, "step": 680 }, { "epoch": 0.08603491271820449, "grad_norm": 8.492568969726562, "learning_rate": 1.9657356608478805e-05, "loss": 0.177, "step": 690 }, { "epoch": 0.08728179551122195, "grad_norm": 15.768592834472656, "learning_rate": 1.9652369077306736e-05, "loss": 0.1671, "step": 700 }, { "epoch": 0.0885286783042394, "grad_norm": 8.789374351501465, "learning_rate": 1.9647381546134666e-05, "loss": 0.1056, "step": 710 }, { "epoch": 0.08977556109725686, "grad_norm": 18.865314483642578, "learning_rate": 1.9642394014962597e-05, "loss": 0.1371, "step": 720 }, { "epoch": 0.09102244389027432, "grad_norm": 5.666704177856445, "learning_rate": 1.9637406483790524e-05, "loss": 0.2404, "step": 730 }, { "epoch": 0.09226932668329177, "grad_norm": 3.111618995666504, "learning_rate": 1.9632418952618455e-05, "loss": 0.2291, "step": 740 }, { "epoch": 0.09351620947630923, "grad_norm": 23.726316452026367, "learning_rate": 1.9627431421446385e-05, "loss": 0.1431, "step": 750 }, { "epoch": 0.09476309226932668, "grad_norm": 0.19223853945732117, "learning_rate": 1.9622443890274316e-05, "loss": 0.0933, "step": 760 }, { "epoch": 0.09600997506234414, "grad_norm": 3.894308090209961, "learning_rate": 1.9617456359102243e-05, "loss": 0.1174, "step": 770 }, { "epoch": 0.09725685785536159, "grad_norm": 28.369335174560547, "learning_rate": 1.9612468827930177e-05, "loss": 0.1417, "step": 780 }, { "epoch": 0.09850374064837905, "grad_norm": 2.8270206451416016, "learning_rate": 1.9607481296758108e-05, "loss": 0.2054, "step": 790 }, { "epoch": 0.09975062344139651, "grad_norm": 27.438745498657227, "learning_rate": 1.960249376558604e-05, "loss": 0.1436, "step": 800 }, { "epoch": 0.10099750623441396, "grad_norm": 5.45671272277832, "learning_rate": 1.959750623441397e-05, "loss": 0.1969, "step": 810 }, { "epoch": 0.10224438902743142, "grad_norm": 9.771449089050293, "learning_rate": 1.9592518703241896e-05, "loss": 0.1026, "step": 820 }, { "epoch": 0.10349127182044887, "grad_norm": 13.02491569519043, "learning_rate": 1.9587531172069827e-05, "loss": 0.125, "step": 830 }, { "epoch": 0.10473815461346633, "grad_norm": 4.949501991271973, "learning_rate": 1.9582543640897758e-05, "loss": 0.0813, "step": 840 }, { "epoch": 0.1059850374064838, "grad_norm": 25.449613571166992, "learning_rate": 1.9577556109725688e-05, "loss": 0.1862, "step": 850 }, { "epoch": 0.10723192019950124, "grad_norm": 5.5177412033081055, "learning_rate": 1.9572568578553615e-05, "loss": 0.0964, "step": 860 }, { "epoch": 0.1084788029925187, "grad_norm": 8.13312816619873, "learning_rate": 1.9567581047381546e-05, "loss": 0.1136, "step": 870 }, { "epoch": 0.10972568578553615, "grad_norm": 19.513381958007812, "learning_rate": 1.9562593516209477e-05, "loss": 0.1429, "step": 880 }, { "epoch": 0.11097256857855362, "grad_norm": 16.46674346923828, "learning_rate": 1.9557605985037407e-05, "loss": 0.0936, "step": 890 }, { "epoch": 0.11221945137157108, "grad_norm": 36.8519401550293, "learning_rate": 1.9552618453865338e-05, "loss": 0.1374, "step": 900 }, { "epoch": 0.11346633416458853, "grad_norm": 4.3099188804626465, "learning_rate": 1.954763092269327e-05, "loss": 0.1874, "step": 910 }, { "epoch": 0.11471321695760599, "grad_norm": 15.681669235229492, "learning_rate": 1.95426433915212e-05, "loss": 0.1291, "step": 920 }, { "epoch": 0.11596009975062344, "grad_norm": 46.113014221191406, "learning_rate": 1.953765586034913e-05, "loss": 0.1844, "step": 930 }, { "epoch": 0.1172069825436409, "grad_norm": 0.08473481982946396, "learning_rate": 1.9532668329177057e-05, "loss": 0.1307, "step": 940 }, { "epoch": 0.11845386533665836, "grad_norm": 0.33060723543167114, "learning_rate": 1.9527680798004988e-05, "loss": 0.1662, "step": 950 }, { "epoch": 0.11970074812967581, "grad_norm": 24.14071273803711, "learning_rate": 1.9522693266832918e-05, "loss": 0.2013, "step": 960 }, { "epoch": 0.12094763092269327, "grad_norm": 1.8103233575820923, "learning_rate": 1.951770573566085e-05, "loss": 0.2362, "step": 970 }, { "epoch": 0.12219451371571072, "grad_norm": 1.4162346124649048, "learning_rate": 1.951271820448878e-05, "loss": 0.0292, "step": 980 }, { "epoch": 0.12344139650872818, "grad_norm": 0.10805538296699524, "learning_rate": 1.950773067331671e-05, "loss": 0.1698, "step": 990 }, { "epoch": 0.12468827930174564, "grad_norm": 25.68360137939453, "learning_rate": 1.950274314214464e-05, "loss": 0.2032, "step": 1000 }, { "epoch": 0.1259351620947631, "grad_norm": 26.8183536529541, "learning_rate": 1.949775561097257e-05, "loss": 0.0992, "step": 1010 }, { "epoch": 0.12718204488778054, "grad_norm": 21.154848098754883, "learning_rate": 1.9492768079800502e-05, "loss": 0.2059, "step": 1020 }, { "epoch": 0.128428927680798, "grad_norm": 1.1379319429397583, "learning_rate": 1.948778054862843e-05, "loss": 0.1347, "step": 1030 }, { "epoch": 0.12967581047381546, "grad_norm": 19.544143676757812, "learning_rate": 1.948279301745636e-05, "loss": 0.1401, "step": 1040 }, { "epoch": 0.13092269326683292, "grad_norm": 0.07502997666597366, "learning_rate": 1.947780548628429e-05, "loss": 0.2728, "step": 1050 }, { "epoch": 0.13216957605985039, "grad_norm": 19.109968185424805, "learning_rate": 1.947281795511222e-05, "loss": 0.3007, "step": 1060 }, { "epoch": 0.13341645885286782, "grad_norm": 16.674196243286133, "learning_rate": 1.946783042394015e-05, "loss": 0.1649, "step": 1070 }, { "epoch": 0.13466334164588528, "grad_norm": 20.961565017700195, "learning_rate": 1.9462842892768082e-05, "loss": 0.1938, "step": 1080 }, { "epoch": 0.13591022443890274, "grad_norm": 14.865165710449219, "learning_rate": 1.9457855361596013e-05, "loss": 0.1775, "step": 1090 }, { "epoch": 0.1371571072319202, "grad_norm": 33.68144226074219, "learning_rate": 1.9452867830423943e-05, "loss": 0.243, "step": 1100 }, { "epoch": 0.13840399002493767, "grad_norm": 0.23615525662899017, "learning_rate": 1.944788029925187e-05, "loss": 0.1302, "step": 1110 }, { "epoch": 0.1396508728179551, "grad_norm": 2.4923934936523438, "learning_rate": 1.94428927680798e-05, "loss": 0.1697, "step": 1120 }, { "epoch": 0.14089775561097256, "grad_norm": 0.05718168243765831, "learning_rate": 1.9437905236907732e-05, "loss": 0.1866, "step": 1130 }, { "epoch": 0.14214463840399003, "grad_norm": 0.2824549674987793, "learning_rate": 1.9432917705735663e-05, "loss": 0.0844, "step": 1140 }, { "epoch": 0.1433915211970075, "grad_norm": 28.53221893310547, "learning_rate": 1.9427930174563593e-05, "loss": 0.2335, "step": 1150 }, { "epoch": 0.14463840399002495, "grad_norm": 3.9056732654571533, "learning_rate": 1.9422942643391524e-05, "loss": 0.2637, "step": 1160 }, { "epoch": 0.14588528678304238, "grad_norm": 0.9593342542648315, "learning_rate": 1.9417955112219454e-05, "loss": 0.1542, "step": 1170 }, { "epoch": 0.14713216957605985, "grad_norm": 3.5426342487335205, "learning_rate": 1.9412967581047385e-05, "loss": 0.0539, "step": 1180 }, { "epoch": 0.1483790523690773, "grad_norm": 24.645750045776367, "learning_rate": 1.9407980049875312e-05, "loss": 0.0829, "step": 1190 }, { "epoch": 0.14962593516209477, "grad_norm": 7.14509916305542, "learning_rate": 1.9402992518703243e-05, "loss": 0.1406, "step": 1200 }, { "epoch": 0.15087281795511223, "grad_norm": 0.049528706818819046, "learning_rate": 1.9398004987531174e-05, "loss": 0.1438, "step": 1210 }, { "epoch": 0.15211970074812967, "grad_norm": 13.300673484802246, "learning_rate": 1.9393017456359104e-05, "loss": 0.1346, "step": 1220 }, { "epoch": 0.15336658354114713, "grad_norm": 1.943932056427002, "learning_rate": 1.938802992518703e-05, "loss": 0.0558, "step": 1230 }, { "epoch": 0.1546134663341646, "grad_norm": 5.144111156463623, "learning_rate": 1.9383042394014962e-05, "loss": 0.1436, "step": 1240 }, { "epoch": 0.15586034912718205, "grad_norm": 16.302080154418945, "learning_rate": 1.9378054862842896e-05, "loss": 0.1697, "step": 1250 }, { "epoch": 0.1571072319201995, "grad_norm": 22.913436889648438, "learning_rate": 1.9373067331670827e-05, "loss": 0.1536, "step": 1260 }, { "epoch": 0.15835411471321695, "grad_norm": 5.865168571472168, "learning_rate": 1.9368079800498757e-05, "loss": 0.107, "step": 1270 }, { "epoch": 0.1596009975062344, "grad_norm": 0.23293504118919373, "learning_rate": 1.9363092269326684e-05, "loss": 0.0561, "step": 1280 }, { "epoch": 0.16084788029925187, "grad_norm": 8.98757553100586, "learning_rate": 1.9358104738154615e-05, "loss": 0.0798, "step": 1290 }, { "epoch": 0.16209476309226933, "grad_norm": 39.03519821166992, "learning_rate": 1.9353117206982546e-05, "loss": 0.3176, "step": 1300 }, { "epoch": 0.1633416458852868, "grad_norm": 1.581081509590149, "learning_rate": 1.9348129675810476e-05, "loss": 0.1908, "step": 1310 }, { "epoch": 0.16458852867830423, "grad_norm": 8.705527305603027, "learning_rate": 1.9343142144638404e-05, "loss": 0.2558, "step": 1320 }, { "epoch": 0.1658354114713217, "grad_norm": 2.133392333984375, "learning_rate": 1.9338154613466334e-05, "loss": 0.1147, "step": 1330 }, { "epoch": 0.16708229426433915, "grad_norm": 8.2285737991333, "learning_rate": 1.9333167082294265e-05, "loss": 0.1026, "step": 1340 }, { "epoch": 0.16832917705735662, "grad_norm": 12.087708473205566, "learning_rate": 1.9328179551122195e-05, "loss": 0.2513, "step": 1350 }, { "epoch": 0.16957605985037408, "grad_norm": 4.111790180206299, "learning_rate": 1.9323192019950126e-05, "loss": 0.1766, "step": 1360 }, { "epoch": 0.1708229426433915, "grad_norm": 11.881768226623535, "learning_rate": 1.9318204488778057e-05, "loss": 0.14, "step": 1370 }, { "epoch": 0.17206982543640897, "grad_norm": 11.08498764038086, "learning_rate": 1.9313216957605987e-05, "loss": 0.1501, "step": 1380 }, { "epoch": 0.17331670822942644, "grad_norm": 0.2787410020828247, "learning_rate": 1.9308229426433918e-05, "loss": 0.1337, "step": 1390 }, { "epoch": 0.1745635910224439, "grad_norm": 35.38869094848633, "learning_rate": 1.9303241895261845e-05, "loss": 0.1991, "step": 1400 }, { "epoch": 0.17581047381546136, "grad_norm": 0.29990383982658386, "learning_rate": 1.9298254364089776e-05, "loss": 0.0835, "step": 1410 }, { "epoch": 0.1770573566084788, "grad_norm": 0.33510634303092957, "learning_rate": 1.9293266832917706e-05, "loss": 0.1892, "step": 1420 }, { "epoch": 0.17830423940149626, "grad_norm": 0.3308704197406769, "learning_rate": 1.9288279301745637e-05, "loss": 0.0766, "step": 1430 }, { "epoch": 0.17955112219451372, "grad_norm": 29.095256805419922, "learning_rate": 1.9283291770573568e-05, "loss": 0.1195, "step": 1440 }, { "epoch": 0.18079800498753118, "grad_norm": 0.04486589506268501, "learning_rate": 1.9278304239401498e-05, "loss": 0.0551, "step": 1450 }, { "epoch": 0.18204488778054864, "grad_norm": 30.3422794342041, "learning_rate": 1.927331670822943e-05, "loss": 0.1743, "step": 1460 }, { "epoch": 0.18329177057356608, "grad_norm": 10.932034492492676, "learning_rate": 1.926832917705736e-05, "loss": 0.0966, "step": 1470 }, { "epoch": 0.18453865336658354, "grad_norm": 16.97438621520996, "learning_rate": 1.9263341645885287e-05, "loss": 0.1924, "step": 1480 }, { "epoch": 0.185785536159601, "grad_norm": 0.11332409828901291, "learning_rate": 1.9258354114713217e-05, "loss": 0.1246, "step": 1490 }, { "epoch": 0.18703241895261846, "grad_norm": 32.340240478515625, "learning_rate": 1.9253366583541148e-05, "loss": 0.1624, "step": 1500 }, { "epoch": 0.1882793017456359, "grad_norm": 22.712034225463867, "learning_rate": 1.924837905236908e-05, "loss": 0.2361, "step": 1510 }, { "epoch": 0.18952618453865336, "grad_norm": 6.54374885559082, "learning_rate": 1.924339152119701e-05, "loss": 0.2299, "step": 1520 }, { "epoch": 0.19077306733167082, "grad_norm": 13.710122108459473, "learning_rate": 1.923840399002494e-05, "loss": 0.0964, "step": 1530 }, { "epoch": 0.19201995012468828, "grad_norm": 2.097973585128784, "learning_rate": 1.923341645885287e-05, "loss": 0.0844, "step": 1540 }, { "epoch": 0.19326683291770574, "grad_norm": 0.21922890841960907, "learning_rate": 1.92284289276808e-05, "loss": 0.2207, "step": 1550 }, { "epoch": 0.19451371571072318, "grad_norm": 21.300539016723633, "learning_rate": 1.922344139650873e-05, "loss": 0.1941, "step": 1560 }, { "epoch": 0.19576059850374064, "grad_norm": 27.9648494720459, "learning_rate": 1.921845386533666e-05, "loss": 0.1927, "step": 1570 }, { "epoch": 0.1970074812967581, "grad_norm": 0.3763630986213684, "learning_rate": 1.921346633416459e-05, "loss": 0.058, "step": 1580 }, { "epoch": 0.19825436408977556, "grad_norm": 7.284142971038818, "learning_rate": 1.920847880299252e-05, "loss": 0.0967, "step": 1590 }, { "epoch": 0.19950124688279303, "grad_norm": 0.2558720111846924, "learning_rate": 1.920349127182045e-05, "loss": 0.0766, "step": 1600 }, { "epoch": 0.20074812967581046, "grad_norm": 0.2345203459262848, "learning_rate": 1.919850374064838e-05, "loss": 0.1259, "step": 1610 }, { "epoch": 0.20199501246882792, "grad_norm": 0.19105514883995056, "learning_rate": 1.9193516209476312e-05, "loss": 0.0979, "step": 1620 }, { "epoch": 0.20324189526184538, "grad_norm": 16.620332717895508, "learning_rate": 1.9188528678304243e-05, "loss": 0.1425, "step": 1630 }, { "epoch": 0.20448877805486285, "grad_norm": 34.320369720458984, "learning_rate": 1.9183541147132173e-05, "loss": 0.1714, "step": 1640 }, { "epoch": 0.2057356608478803, "grad_norm": 3.3170464038848877, "learning_rate": 1.91785536159601e-05, "loss": 0.0134, "step": 1650 }, { "epoch": 0.20698254364089774, "grad_norm": 9.176933288574219, "learning_rate": 1.917356608478803e-05, "loss": 0.1046, "step": 1660 }, { "epoch": 0.2082294264339152, "grad_norm": 0.061658814549446106, "learning_rate": 1.9168578553615962e-05, "loss": 0.0464, "step": 1670 }, { "epoch": 0.20947630922693267, "grad_norm": 1.5302953720092773, "learning_rate": 1.9163591022443892e-05, "loss": 0.1487, "step": 1680 }, { "epoch": 0.21072319201995013, "grad_norm": 0.038375142961740494, "learning_rate": 1.915860349127182e-05, "loss": 0.079, "step": 1690 }, { "epoch": 0.2119700748129676, "grad_norm": 0.4806802570819855, "learning_rate": 1.915361596009975e-05, "loss": 0.0285, "step": 1700 }, { "epoch": 0.21321695760598502, "grad_norm": 0.042496830224990845, "learning_rate": 1.914862842892768e-05, "loss": 0.1011, "step": 1710 }, { "epoch": 0.2144638403990025, "grad_norm": 2.657435894012451, "learning_rate": 1.9143640897755615e-05, "loss": 0.1359, "step": 1720 }, { "epoch": 0.21571072319201995, "grad_norm": 22.948381423950195, "learning_rate": 1.9138653366583542e-05, "loss": 0.1223, "step": 1730 }, { "epoch": 0.2169576059850374, "grad_norm": 2.4010567665100098, "learning_rate": 1.9133665835411473e-05, "loss": 0.0725, "step": 1740 }, { "epoch": 0.21820448877805487, "grad_norm": 20.701522827148438, "learning_rate": 1.9128678304239403e-05, "loss": 0.1088, "step": 1750 }, { "epoch": 0.2194513715710723, "grad_norm": 0.9114007949829102, "learning_rate": 1.9123690773067334e-05, "loss": 0.1289, "step": 1760 }, { "epoch": 0.22069825436408977, "grad_norm": 0.5298766493797302, "learning_rate": 1.9118703241895265e-05, "loss": 0.1754, "step": 1770 }, { "epoch": 0.22194513715710723, "grad_norm": 20.310009002685547, "learning_rate": 1.9113715710723192e-05, "loss": 0.0428, "step": 1780 }, { "epoch": 0.2231920199501247, "grad_norm": 16.548736572265625, "learning_rate": 1.9108728179551122e-05, "loss": 0.1378, "step": 1790 }, { "epoch": 0.22443890274314215, "grad_norm": 0.10499307513237, "learning_rate": 1.9103740648379053e-05, "loss": 0.1144, "step": 1800 }, { "epoch": 0.2256857855361596, "grad_norm": 32.94569778442383, "learning_rate": 1.9098753117206984e-05, "loss": 0.1308, "step": 1810 }, { "epoch": 0.22693266832917705, "grad_norm": 12.977508544921875, "learning_rate": 1.9093765586034914e-05, "loss": 0.0695, "step": 1820 }, { "epoch": 0.2281795511221945, "grad_norm": 22.582475662231445, "learning_rate": 1.9088778054862845e-05, "loss": 0.1223, "step": 1830 }, { "epoch": 0.22942643391521197, "grad_norm": 0.45582443475723267, "learning_rate": 1.9083790523690775e-05, "loss": 0.1618, "step": 1840 }, { "epoch": 0.23067331670822944, "grad_norm": 14.70036792755127, "learning_rate": 1.9078802992518706e-05, "loss": 0.1359, "step": 1850 }, { "epoch": 0.23192019950124687, "grad_norm": 5.940314292907715, "learning_rate": 1.9073815461346633e-05, "loss": 0.1857, "step": 1860 }, { "epoch": 0.23316708229426433, "grad_norm": 2.6022796630859375, "learning_rate": 1.9068827930174564e-05, "loss": 0.1109, "step": 1870 }, { "epoch": 0.2344139650872818, "grad_norm": 7.17425012588501, "learning_rate": 1.9063840399002495e-05, "loss": 0.0246, "step": 1880 }, { "epoch": 0.23566084788029926, "grad_norm": 0.07173322141170502, "learning_rate": 1.9058852867830425e-05, "loss": 0.1983, "step": 1890 }, { "epoch": 0.23690773067331672, "grad_norm": 7.803640842437744, "learning_rate": 1.9053865336658356e-05, "loss": 0.2229, "step": 1900 }, { "epoch": 0.23815461346633415, "grad_norm": 6.28038215637207, "learning_rate": 1.9048877805486286e-05, "loss": 0.0816, "step": 1910 }, { "epoch": 0.23940149625935161, "grad_norm": 17.327741622924805, "learning_rate": 1.9043890274314217e-05, "loss": 0.0725, "step": 1920 }, { "epoch": 0.24064837905236908, "grad_norm": 0.18219555914402008, "learning_rate": 1.9038902743142148e-05, "loss": 0.0918, "step": 1930 }, { "epoch": 0.24189526184538654, "grad_norm": 7.153271675109863, "learning_rate": 1.9033915211970075e-05, "loss": 0.1641, "step": 1940 }, { "epoch": 0.243142144638404, "grad_norm": 0.31039583683013916, "learning_rate": 1.9028927680798005e-05, "loss": 0.0713, "step": 1950 }, { "epoch": 0.24438902743142144, "grad_norm": 21.150863647460938, "learning_rate": 1.9023940149625936e-05, "loss": 0.1273, "step": 1960 }, { "epoch": 0.2456359102244389, "grad_norm": 2.9307498931884766, "learning_rate": 1.9018952618453867e-05, "loss": 0.1131, "step": 1970 }, { "epoch": 0.24688279301745636, "grad_norm": 32.69029998779297, "learning_rate": 1.9013965087281797e-05, "loss": 0.154, "step": 1980 }, { "epoch": 0.24812967581047382, "grad_norm": 2.6511595249176025, "learning_rate": 1.9008977556109728e-05, "loss": 0.0656, "step": 1990 }, { "epoch": 0.24937655860349128, "grad_norm": 15.334160804748535, "learning_rate": 1.900399002493766e-05, "loss": 0.0894, "step": 2000 }, { "epoch": 0.2506234413965087, "grad_norm": 29.95343780517578, "learning_rate": 1.899900249376559e-05, "loss": 0.1466, "step": 2010 }, { "epoch": 0.2518703241895262, "grad_norm": 1.1614725589752197, "learning_rate": 1.899401496259352e-05, "loss": 0.156, "step": 2020 }, { "epoch": 0.25311720698254364, "grad_norm": 0.3376743495464325, "learning_rate": 1.8989027431421447e-05, "loss": 0.0605, "step": 2030 }, { "epoch": 0.2543640897755611, "grad_norm": 0.05432422459125519, "learning_rate": 1.8984039900249378e-05, "loss": 0.1247, "step": 2040 }, { "epoch": 0.25561097256857856, "grad_norm": 0.2011519819498062, "learning_rate": 1.8979052369077308e-05, "loss": 0.0973, "step": 2050 }, { "epoch": 0.256857855361596, "grad_norm": 23.83899688720703, "learning_rate": 1.897406483790524e-05, "loss": 0.1737, "step": 2060 }, { "epoch": 0.2581047381546135, "grad_norm": 16.04006004333496, "learning_rate": 1.896907730673317e-05, "loss": 0.1346, "step": 2070 }, { "epoch": 0.2593516209476309, "grad_norm": 6.27106237411499, "learning_rate": 1.89640897755611e-05, "loss": 0.118, "step": 2080 }, { "epoch": 0.26059850374064836, "grad_norm": 3.775754451751709, "learning_rate": 1.895910224438903e-05, "loss": 0.2552, "step": 2090 }, { "epoch": 0.26184538653366585, "grad_norm": 0.19130301475524902, "learning_rate": 1.895411471321696e-05, "loss": 0.1049, "step": 2100 }, { "epoch": 0.2630922693266833, "grad_norm": 12.918098449707031, "learning_rate": 1.894912718204489e-05, "loss": 0.1938, "step": 2110 }, { "epoch": 0.26433915211970077, "grad_norm": 33.832454681396484, "learning_rate": 1.894413965087282e-05, "loss": 0.0929, "step": 2120 }, { "epoch": 0.2655860349127182, "grad_norm": 3.3025131225585938, "learning_rate": 1.893915211970075e-05, "loss": 0.0542, "step": 2130 }, { "epoch": 0.26683291770573564, "grad_norm": 0.16082660853862762, "learning_rate": 1.893416458852868e-05, "loss": 0.0435, "step": 2140 }, { "epoch": 0.26807980049875313, "grad_norm": 22.90357780456543, "learning_rate": 1.8929177057356608e-05, "loss": 0.064, "step": 2150 }, { "epoch": 0.26932668329177056, "grad_norm": 0.03351182863116264, "learning_rate": 1.892418952618454e-05, "loss": 0.0409, "step": 2160 }, { "epoch": 0.27057356608478805, "grad_norm": 0.09229867160320282, "learning_rate": 1.891920199501247e-05, "loss": 0.0076, "step": 2170 }, { "epoch": 0.2718204488778055, "grad_norm": 0.03719809651374817, "learning_rate": 1.8914214463840403e-05, "loss": 0.1627, "step": 2180 }, { "epoch": 0.2730673316708229, "grad_norm": 8.800865173339844, "learning_rate": 1.890922693266833e-05, "loss": 0.1737, "step": 2190 }, { "epoch": 0.2743142144638404, "grad_norm": 0.03148549422621727, "learning_rate": 1.890423940149626e-05, "loss": 0.2566, "step": 2200 }, { "epoch": 0.27556109725685785, "grad_norm": 0.2769756019115448, "learning_rate": 1.889925187032419e-05, "loss": 0.0604, "step": 2210 }, { "epoch": 0.27680798004987534, "grad_norm": 37.58738708496094, "learning_rate": 1.8894264339152122e-05, "loss": 0.0611, "step": 2220 }, { "epoch": 0.27805486284289277, "grad_norm": 0.1621832549571991, "learning_rate": 1.888927680798005e-05, "loss": 0.0724, "step": 2230 }, { "epoch": 0.2793017456359102, "grad_norm": 0.07667220383882523, "learning_rate": 1.888428927680798e-05, "loss": 0.1044, "step": 2240 }, { "epoch": 0.2805486284289277, "grad_norm": 0.16220073401927948, "learning_rate": 1.887930174563591e-05, "loss": 0.1377, "step": 2250 }, { "epoch": 0.2817955112219451, "grad_norm": 0.0622367337346077, "learning_rate": 1.887431421446384e-05, "loss": 0.1296, "step": 2260 }, { "epoch": 0.2830423940149626, "grad_norm": 0.31281012296676636, "learning_rate": 1.8869326683291772e-05, "loss": 0.1597, "step": 2270 }, { "epoch": 0.28428927680798005, "grad_norm": 17.161144256591797, "learning_rate": 1.8864339152119702e-05, "loss": 0.0763, "step": 2280 }, { "epoch": 0.2855361596009975, "grad_norm": 21.191686630249023, "learning_rate": 1.8859351620947633e-05, "loss": 0.0821, "step": 2290 }, { "epoch": 0.286783042394015, "grad_norm": 0.03687586635351181, "learning_rate": 1.8854364089775564e-05, "loss": 0.0243, "step": 2300 }, { "epoch": 0.2880299251870324, "grad_norm": 0.7001633048057556, "learning_rate": 1.8849376558603494e-05, "loss": 0.1851, "step": 2310 }, { "epoch": 0.2892768079800499, "grad_norm": 46.858665466308594, "learning_rate": 1.884438902743142e-05, "loss": 0.1349, "step": 2320 }, { "epoch": 0.29052369077306733, "grad_norm": 0.0679154023528099, "learning_rate": 1.8839401496259352e-05, "loss": 0.0741, "step": 2330 }, { "epoch": 0.29177057356608477, "grad_norm": 21.36118507385254, "learning_rate": 1.8834413965087283e-05, "loss": 0.0935, "step": 2340 }, { "epoch": 0.29301745635910226, "grad_norm": 4.055055141448975, "learning_rate": 1.8829426433915213e-05, "loss": 0.1776, "step": 2350 }, { "epoch": 0.2942643391521197, "grad_norm": 0.02688492275774479, "learning_rate": 1.8824438902743144e-05, "loss": 0.0141, "step": 2360 }, { "epoch": 0.2955112219451372, "grad_norm": 43.924827575683594, "learning_rate": 1.8819451371571075e-05, "loss": 0.1315, "step": 2370 }, { "epoch": 0.2967581047381546, "grad_norm": 22.235816955566406, "learning_rate": 1.8814463840399005e-05, "loss": 0.0706, "step": 2380 }, { "epoch": 0.29800498753117205, "grad_norm": 0.8161360621452332, "learning_rate": 1.8809476309226936e-05, "loss": 0.1809, "step": 2390 }, { "epoch": 0.29925187032418954, "grad_norm": 6.17124080657959, "learning_rate": 1.8804488778054863e-05, "loss": 0.1138, "step": 2400 }, { "epoch": 0.300498753117207, "grad_norm": 6.6597771644592285, "learning_rate": 1.8799501246882794e-05, "loss": 0.0869, "step": 2410 }, { "epoch": 0.30174563591022446, "grad_norm": 0.28154247999191284, "learning_rate": 1.8794513715710724e-05, "loss": 0.0437, "step": 2420 }, { "epoch": 0.3029925187032419, "grad_norm": 0.0688408687710762, "learning_rate": 1.8789526184538655e-05, "loss": 0.043, "step": 2430 }, { "epoch": 0.30423940149625933, "grad_norm": 25.441835403442383, "learning_rate": 1.8784538653366586e-05, "loss": 0.0242, "step": 2440 }, { "epoch": 0.3054862842892768, "grad_norm": 0.04695994779467583, "learning_rate": 1.8779551122194516e-05, "loss": 0.0904, "step": 2450 }, { "epoch": 0.30673316708229426, "grad_norm": 0.020629514008760452, "learning_rate": 1.8774563591022447e-05, "loss": 0.0748, "step": 2460 }, { "epoch": 0.30798004987531175, "grad_norm": 3.5206122398376465, "learning_rate": 1.8770074812967583e-05, "loss": 0.091, "step": 2470 }, { "epoch": 0.3092269326683292, "grad_norm": 29.27650260925293, "learning_rate": 1.8765087281795514e-05, "loss": 0.1235, "step": 2480 }, { "epoch": 0.3104738154613466, "grad_norm": 4.11005973815918, "learning_rate": 1.8760099750623445e-05, "loss": 0.1624, "step": 2490 }, { "epoch": 0.3117206982543641, "grad_norm": 0.38684889674186707, "learning_rate": 1.8755112219451372e-05, "loss": 0.0606, "step": 2500 }, { "epoch": 0.31296758104738154, "grad_norm": 0.18241973221302032, "learning_rate": 1.8750124688279302e-05, "loss": 0.1873, "step": 2510 }, { "epoch": 0.314214463840399, "grad_norm": 0.10438118875026703, "learning_rate": 1.8745137157107233e-05, "loss": 0.0626, "step": 2520 }, { "epoch": 0.31546134663341646, "grad_norm": 0.040058329701423645, "learning_rate": 1.8740149625935164e-05, "loss": 0.1456, "step": 2530 }, { "epoch": 0.3167082294264339, "grad_norm": 9.054058074951172, "learning_rate": 1.8735162094763094e-05, "loss": 0.0662, "step": 2540 }, { "epoch": 0.3179551122194514, "grad_norm": 9.296021461486816, "learning_rate": 1.8730174563591025e-05, "loss": 0.1729, "step": 2550 }, { "epoch": 0.3192019950124688, "grad_norm": 0.03742093965411186, "learning_rate": 1.8725187032418955e-05, "loss": 0.1425, "step": 2560 }, { "epoch": 0.3204488778054863, "grad_norm": 30.912044525146484, "learning_rate": 1.8720199501246886e-05, "loss": 0.105, "step": 2570 }, { "epoch": 0.32169576059850374, "grad_norm": 0.7337010502815247, "learning_rate": 1.8715211970074813e-05, "loss": 0.1298, "step": 2580 }, { "epoch": 0.3229426433915212, "grad_norm": 21.679298400878906, "learning_rate": 1.8710224438902744e-05, "loss": 0.0749, "step": 2590 }, { "epoch": 0.32418952618453867, "grad_norm": 2.8091278076171875, "learning_rate": 1.8705236907730675e-05, "loss": 0.0869, "step": 2600 }, { "epoch": 0.3254364089775561, "grad_norm": 0.2369414120912552, "learning_rate": 1.8700249376558605e-05, "loss": 0.0693, "step": 2610 }, { "epoch": 0.3266832917705736, "grad_norm": 5.367069721221924, "learning_rate": 1.8695261845386536e-05, "loss": 0.1397, "step": 2620 }, { "epoch": 0.327930174563591, "grad_norm": 12.290024757385254, "learning_rate": 1.8690274314214466e-05, "loss": 0.2084, "step": 2630 }, { "epoch": 0.32917705735660846, "grad_norm": 9.538579940795898, "learning_rate": 1.8685286783042397e-05, "loss": 0.1456, "step": 2640 }, { "epoch": 0.33042394014962595, "grad_norm": 28.887372970581055, "learning_rate": 1.8680299251870328e-05, "loss": 0.1234, "step": 2650 }, { "epoch": 0.3316708229426434, "grad_norm": 0.04460681602358818, "learning_rate": 1.8675311720698255e-05, "loss": 0.0909, "step": 2660 }, { "epoch": 0.3329177057356609, "grad_norm": 14.794498443603516, "learning_rate": 1.8670324189526186e-05, "loss": 0.1336, "step": 2670 }, { "epoch": 0.3341645885286783, "grad_norm": 12.941160202026367, "learning_rate": 1.8665336658354116e-05, "loss": 0.1972, "step": 2680 }, { "epoch": 0.33541147132169574, "grad_norm": 24.143878936767578, "learning_rate": 1.8660349127182047e-05, "loss": 0.1048, "step": 2690 }, { "epoch": 0.33665835411471323, "grad_norm": 26.018558502197266, "learning_rate": 1.8655361596009977e-05, "loss": 0.1691, "step": 2700 }, { "epoch": 0.33790523690773067, "grad_norm": 2.2344202995300293, "learning_rate": 1.8650374064837905e-05, "loss": 0.084, "step": 2710 }, { "epoch": 0.33915211970074816, "grad_norm": 2.037916660308838, "learning_rate": 1.8645386533665835e-05, "loss": 0.0796, "step": 2720 }, { "epoch": 0.3403990024937656, "grad_norm": 0.05853952094912529, "learning_rate": 1.864039900249377e-05, "loss": 0.0909, "step": 2730 }, { "epoch": 0.341645885286783, "grad_norm": 20.358482360839844, "learning_rate": 1.86354114713217e-05, "loss": 0.148, "step": 2740 }, { "epoch": 0.3428927680798005, "grad_norm": 0.28583335876464844, "learning_rate": 1.8630423940149627e-05, "loss": 0.1022, "step": 2750 }, { "epoch": 0.34413965087281795, "grad_norm": 15.605045318603516, "learning_rate": 1.8625436408977558e-05, "loss": 0.0773, "step": 2760 }, { "epoch": 0.34538653366583544, "grad_norm": 0.038003236055374146, "learning_rate": 1.862044887780549e-05, "loss": 0.1025, "step": 2770 }, { "epoch": 0.34663341645885287, "grad_norm": 0.06002043932676315, "learning_rate": 1.861546134663342e-05, "loss": 0.1234, "step": 2780 }, { "epoch": 0.3478802992518703, "grad_norm": 40.75321960449219, "learning_rate": 1.8610473815461346e-05, "loss": 0.0429, "step": 2790 }, { "epoch": 0.3491271820448878, "grad_norm": 4.2325029373168945, "learning_rate": 1.8605486284289277e-05, "loss": 0.1183, "step": 2800 }, { "epoch": 0.35037406483790523, "grad_norm": 0.026608416810631752, "learning_rate": 1.8600498753117207e-05, "loss": 0.0885, "step": 2810 }, { "epoch": 0.3516209476309227, "grad_norm": 7.794644832611084, "learning_rate": 1.8595511221945138e-05, "loss": 0.083, "step": 2820 }, { "epoch": 0.35286783042394015, "grad_norm": 12.012767791748047, "learning_rate": 1.859052369077307e-05, "loss": 0.0543, "step": 2830 }, { "epoch": 0.3541147132169576, "grad_norm": 1.2812813520431519, "learning_rate": 1.8585536159601e-05, "loss": 0.1791, "step": 2840 }, { "epoch": 0.3553615960099751, "grad_norm": 0.8094510436058044, "learning_rate": 1.858054862842893e-05, "loss": 0.0956, "step": 2850 }, { "epoch": 0.3566084788029925, "grad_norm": 21.321758270263672, "learning_rate": 1.857556109725686e-05, "loss": 0.099, "step": 2860 }, { "epoch": 0.35785536159601, "grad_norm": 0.2500157058238983, "learning_rate": 1.8570573566084788e-05, "loss": 0.0875, "step": 2870 }, { "epoch": 0.35910224438902744, "grad_norm": 1.042913556098938, "learning_rate": 1.856558603491272e-05, "loss": 0.0707, "step": 2880 }, { "epoch": 0.36034912718204487, "grad_norm": 7.620708465576172, "learning_rate": 1.856059850374065e-05, "loss": 0.2129, "step": 2890 }, { "epoch": 0.36159600997506236, "grad_norm": 54.09572219848633, "learning_rate": 1.855561097256858e-05, "loss": 0.0507, "step": 2900 }, { "epoch": 0.3628428927680798, "grad_norm": 8.779691696166992, "learning_rate": 1.855062344139651e-05, "loss": 0.0428, "step": 2910 }, { "epoch": 0.3640897755610973, "grad_norm": 16.20435333251953, "learning_rate": 1.854563591022444e-05, "loss": 0.1638, "step": 2920 }, { "epoch": 0.3653366583541147, "grad_norm": 24.456098556518555, "learning_rate": 1.854064837905237e-05, "loss": 0.1414, "step": 2930 }, { "epoch": 0.36658354114713215, "grad_norm": 16.864614486694336, "learning_rate": 1.8535660847880302e-05, "loss": 0.1618, "step": 2940 }, { "epoch": 0.36783042394014964, "grad_norm": 8.25745677947998, "learning_rate": 1.8530673316708233e-05, "loss": 0.1248, "step": 2950 }, { "epoch": 0.3690773067331671, "grad_norm": 1.0005909204483032, "learning_rate": 1.852568578553616e-05, "loss": 0.1164, "step": 2960 }, { "epoch": 0.37032418952618457, "grad_norm": 13.815901756286621, "learning_rate": 1.852069825436409e-05, "loss": 0.1161, "step": 2970 }, { "epoch": 0.371571072319202, "grad_norm": 0.46969351172447205, "learning_rate": 1.851571072319202e-05, "loss": 0.02, "step": 2980 }, { "epoch": 0.37281795511221943, "grad_norm": 0.06556516140699387, "learning_rate": 1.8510723192019952e-05, "loss": 0.184, "step": 2990 }, { "epoch": 0.3740648379052369, "grad_norm": 0.3786510229110718, "learning_rate": 1.8505735660847882e-05, "loss": 0.0972, "step": 3000 }, { "epoch": 0.37531172069825436, "grad_norm": 0.15150892734527588, "learning_rate": 1.8500748129675813e-05, "loss": 0.0366, "step": 3010 }, { "epoch": 0.3765586034912718, "grad_norm": 0.38234204053878784, "learning_rate": 1.8495760598503744e-05, "loss": 0.0643, "step": 3020 }, { "epoch": 0.3778054862842893, "grad_norm": 0.059004172682762146, "learning_rate": 1.8490773067331674e-05, "loss": 0.029, "step": 3030 }, { "epoch": 0.3790523690773067, "grad_norm": 0.02871227078139782, "learning_rate": 1.84857855361596e-05, "loss": 0.0238, "step": 3040 }, { "epoch": 0.3802992518703242, "grad_norm": 0.2601139545440674, "learning_rate": 1.8480798004987532e-05, "loss": 0.1566, "step": 3050 }, { "epoch": 0.38154613466334164, "grad_norm": 0.06762529909610748, "learning_rate": 1.8475810473815463e-05, "loss": 0.0609, "step": 3060 }, { "epoch": 0.3827930174563591, "grad_norm": 26.341711044311523, "learning_rate": 1.8470822942643393e-05, "loss": 0.0876, "step": 3070 }, { "epoch": 0.38403990024937656, "grad_norm": 9.868642807006836, "learning_rate": 1.8465835411471324e-05, "loss": 0.1248, "step": 3080 }, { "epoch": 0.385286783042394, "grad_norm": 0.05572517588734627, "learning_rate": 1.8460847880299255e-05, "loss": 0.1603, "step": 3090 }, { "epoch": 0.3865336658354115, "grad_norm": 8.873476028442383, "learning_rate": 1.8455860349127185e-05, "loss": 0.1626, "step": 3100 }, { "epoch": 0.3877805486284289, "grad_norm": 26.02020835876465, "learning_rate": 1.8450872817955116e-05, "loss": 0.1455, "step": 3110 }, { "epoch": 0.38902743142144636, "grad_norm": 0.302876353263855, "learning_rate": 1.8445885286783043e-05, "loss": 0.0821, "step": 3120 }, { "epoch": 0.39027431421446385, "grad_norm": 25.011768341064453, "learning_rate": 1.8440897755610974e-05, "loss": 0.0415, "step": 3130 }, { "epoch": 0.3915211970074813, "grad_norm": 0.10454079508781433, "learning_rate": 1.8435910224438904e-05, "loss": 0.0653, "step": 3140 }, { "epoch": 0.39276807980049877, "grad_norm": 6.67221212387085, "learning_rate": 1.8430922693266835e-05, "loss": 0.0632, "step": 3150 }, { "epoch": 0.3940149625935162, "grad_norm": 34.12371826171875, "learning_rate": 1.8425935162094762e-05, "loss": 0.0664, "step": 3160 }, { "epoch": 0.39526184538653364, "grad_norm": 0.04642521217465401, "learning_rate": 1.8420947630922693e-05, "loss": 0.0891, "step": 3170 }, { "epoch": 0.39650872817955113, "grad_norm": 0.056110795587301254, "learning_rate": 1.8415960099750623e-05, "loss": 0.1448, "step": 3180 }, { "epoch": 0.39775561097256856, "grad_norm": 0.01612841710448265, "learning_rate": 1.8410972568578554e-05, "loss": 0.1943, "step": 3190 }, { "epoch": 0.39900249376558605, "grad_norm": 0.07397930324077606, "learning_rate": 1.8405985037406488e-05, "loss": 0.1414, "step": 3200 }, { "epoch": 0.4002493765586035, "grad_norm": 21.90389633178711, "learning_rate": 1.8400997506234415e-05, "loss": 0.0627, "step": 3210 }, { "epoch": 0.4014962593516209, "grad_norm": 0.032755568623542786, "learning_rate": 1.8396009975062346e-05, "loss": 0.0087, "step": 3220 }, { "epoch": 0.4027431421446384, "grad_norm": 26.460538864135742, "learning_rate": 1.8391022443890276e-05, "loss": 0.1188, "step": 3230 }, { "epoch": 0.40399002493765584, "grad_norm": 0.40684857964515686, "learning_rate": 1.8386034912718207e-05, "loss": 0.0714, "step": 3240 }, { "epoch": 0.40523690773067333, "grad_norm": 0.03340421989560127, "learning_rate": 1.8381047381546134e-05, "loss": 0.0362, "step": 3250 }, { "epoch": 0.40648379052369077, "grad_norm": 0.2820630371570587, "learning_rate": 1.8376059850374065e-05, "loss": 0.1361, "step": 3260 }, { "epoch": 0.4077306733167082, "grad_norm": 0.018967803567647934, "learning_rate": 1.8371072319201996e-05, "loss": 0.1086, "step": 3270 }, { "epoch": 0.4089775561097257, "grad_norm": 0.6365144848823547, "learning_rate": 1.8366084788029926e-05, "loss": 0.0465, "step": 3280 }, { "epoch": 0.4102244389027431, "grad_norm": 0.10352272540330887, "learning_rate": 1.8361097256857857e-05, "loss": 0.0639, "step": 3290 }, { "epoch": 0.4114713216957606, "grad_norm": 20.832256317138672, "learning_rate": 1.8356109725685787e-05, "loss": 0.1182, "step": 3300 }, { "epoch": 0.41271820448877805, "grad_norm": 0.034411683678627014, "learning_rate": 1.8351122194513718e-05, "loss": 0.0996, "step": 3310 }, { "epoch": 0.4139650872817955, "grad_norm": 15.220785140991211, "learning_rate": 1.834613466334165e-05, "loss": 0.2009, "step": 3320 }, { "epoch": 0.415211970074813, "grad_norm": 0.23576247692108154, "learning_rate": 1.8341147132169576e-05, "loss": 0.0933, "step": 3330 }, { "epoch": 0.4164588528678304, "grad_norm": 0.03238127753138542, "learning_rate": 1.8336159600997507e-05, "loss": 0.1636, "step": 3340 }, { "epoch": 0.4177057356608479, "grad_norm": 10.25538444519043, "learning_rate": 1.8331172069825437e-05, "loss": 0.118, "step": 3350 }, { "epoch": 0.41895261845386533, "grad_norm": 0.4214819073677063, "learning_rate": 1.8326184538653368e-05, "loss": 0.0542, "step": 3360 }, { "epoch": 0.42019950124688277, "grad_norm": 2.8215088844299316, "learning_rate": 1.83211970074813e-05, "loss": 0.152, "step": 3370 }, { "epoch": 0.42144638403990026, "grad_norm": 29.304697036743164, "learning_rate": 1.831620947630923e-05, "loss": 0.128, "step": 3380 }, { "epoch": 0.4226932668329177, "grad_norm": 0.18514405190944672, "learning_rate": 1.831122194513716e-05, "loss": 0.1495, "step": 3390 }, { "epoch": 0.4239401496259352, "grad_norm": 0.17302575707435608, "learning_rate": 1.830623441396509e-05, "loss": 0.0485, "step": 3400 }, { "epoch": 0.4251870324189526, "grad_norm": 4.4047532081604, "learning_rate": 1.8301246882793017e-05, "loss": 0.0401, "step": 3410 }, { "epoch": 0.42643391521197005, "grad_norm": 2.6274476051330566, "learning_rate": 1.8296259351620948e-05, "loss": 0.1061, "step": 3420 }, { "epoch": 0.42768079800498754, "grad_norm": 0.21970237791538239, "learning_rate": 1.829127182044888e-05, "loss": 0.0306, "step": 3430 }, { "epoch": 0.428927680798005, "grad_norm": 1.312694787979126, "learning_rate": 1.828628428927681e-05, "loss": 0.1095, "step": 3440 }, { "epoch": 0.43017456359102246, "grad_norm": 0.2038409262895584, "learning_rate": 1.828129675810474e-05, "loss": 0.1052, "step": 3450 }, { "epoch": 0.4314214463840399, "grad_norm": 0.17797330021858215, "learning_rate": 1.827630922693267e-05, "loss": 0.0035, "step": 3460 }, { "epoch": 0.43266832917705733, "grad_norm": 1.5515074729919434, "learning_rate": 1.82713216957606e-05, "loss": 0.0753, "step": 3470 }, { "epoch": 0.4339152119700748, "grad_norm": 10.284847259521484, "learning_rate": 1.8266334164588532e-05, "loss": 0.1561, "step": 3480 }, { "epoch": 0.43516209476309226, "grad_norm": 5.16575813293457, "learning_rate": 1.8261346633416462e-05, "loss": 0.1061, "step": 3490 }, { "epoch": 0.43640897755610975, "grad_norm": 0.04115693271160126, "learning_rate": 1.825635910224439e-05, "loss": 0.1139, "step": 3500 }, { "epoch": 0.4376558603491272, "grad_norm": 0.7454102635383606, "learning_rate": 1.825137157107232e-05, "loss": 0.0631, "step": 3510 }, { "epoch": 0.4389027431421446, "grad_norm": 26.38698387145996, "learning_rate": 1.824638403990025e-05, "loss": 0.0937, "step": 3520 }, { "epoch": 0.4401496259351621, "grad_norm": 29.491291046142578, "learning_rate": 1.824139650872818e-05, "loss": 0.1123, "step": 3530 }, { "epoch": 0.44139650872817954, "grad_norm": 23.8365421295166, "learning_rate": 1.823640897755611e-05, "loss": 0.0995, "step": 3540 }, { "epoch": 0.442643391521197, "grad_norm": 41.31846237182617, "learning_rate": 1.8231421446384043e-05, "loss": 0.0294, "step": 3550 }, { "epoch": 0.44389027431421446, "grad_norm": 34.70884704589844, "learning_rate": 1.8226433915211973e-05, "loss": 0.1029, "step": 3560 }, { "epoch": 0.4451371571072319, "grad_norm": 9.22887134552002, "learning_rate": 1.8221446384039904e-05, "loss": 0.1245, "step": 3570 }, { "epoch": 0.4463840399002494, "grad_norm": 0.04959265887737274, "learning_rate": 1.821645885286783e-05, "loss": 0.0174, "step": 3580 }, { "epoch": 0.4476309226932668, "grad_norm": 0.8959947824478149, "learning_rate": 1.8211471321695762e-05, "loss": 0.0333, "step": 3590 }, { "epoch": 0.4488778054862843, "grad_norm": 0.14847318828105927, "learning_rate": 1.8206483790523692e-05, "loss": 0.1173, "step": 3600 }, { "epoch": 0.45012468827930174, "grad_norm": 0.10429783910512924, "learning_rate": 1.8201496259351623e-05, "loss": 0.0177, "step": 3610 }, { "epoch": 0.4513715710723192, "grad_norm": 0.1884181648492813, "learning_rate": 1.819650872817955e-05, "loss": 0.0785, "step": 3620 }, { "epoch": 0.45261845386533667, "grad_norm": 0.03715066611766815, "learning_rate": 1.819152119700748e-05, "loss": 0.012, "step": 3630 }, { "epoch": 0.4538653366583541, "grad_norm": 27.58835220336914, "learning_rate": 1.818653366583541e-05, "loss": 0.1235, "step": 3640 }, { "epoch": 0.4551122194513716, "grad_norm": 16.778362274169922, "learning_rate": 1.8181546134663342e-05, "loss": 0.2646, "step": 3650 }, { "epoch": 0.456359102244389, "grad_norm": 22.702903747558594, "learning_rate": 1.8176558603491273e-05, "loss": 0.0658, "step": 3660 }, { "epoch": 0.45760598503740646, "grad_norm": 0.09582442045211792, "learning_rate": 1.8171571072319203e-05, "loss": 0.104, "step": 3670 }, { "epoch": 0.45885286783042395, "grad_norm": 0.026133723556995392, "learning_rate": 1.8166583541147134e-05, "loss": 0.0753, "step": 3680 }, { "epoch": 0.4600997506234414, "grad_norm": 2.048128128051758, "learning_rate": 1.8161596009975065e-05, "loss": 0.0564, "step": 3690 }, { "epoch": 0.4613466334164589, "grad_norm": 1.270678997039795, "learning_rate": 1.8156608478802995e-05, "loss": 0.146, "step": 3700 }, { "epoch": 0.4625935162094763, "grad_norm": 17.18134307861328, "learning_rate": 1.8151620947630923e-05, "loss": 0.0391, "step": 3710 }, { "epoch": 0.46384039900249374, "grad_norm": 8.475462913513184, "learning_rate": 1.8146633416458853e-05, "loss": 0.0857, "step": 3720 }, { "epoch": 0.46508728179551123, "grad_norm": 0.06224285066127777, "learning_rate": 1.8141645885286784e-05, "loss": 0.069, "step": 3730 }, { "epoch": 0.46633416458852867, "grad_norm": 5.85067081451416, "learning_rate": 1.8136658354114714e-05, "loss": 0.1032, "step": 3740 }, { "epoch": 0.46758104738154616, "grad_norm": 0.02149471826851368, "learning_rate": 1.8131670822942645e-05, "loss": 0.0613, "step": 3750 }, { "epoch": 0.4688279301745636, "grad_norm": 21.774948120117188, "learning_rate": 1.8126683291770576e-05, "loss": 0.0499, "step": 3760 }, { "epoch": 0.470074812967581, "grad_norm": 0.01490458007901907, "learning_rate": 1.8121695760598506e-05, "loss": 0.034, "step": 3770 }, { "epoch": 0.4713216957605985, "grad_norm": 0.31544768810272217, "learning_rate": 1.8116708229426437e-05, "loss": 0.0978, "step": 3780 }, { "epoch": 0.47256857855361595, "grad_norm": 1.336230754852295, "learning_rate": 1.8111720698254364e-05, "loss": 0.0361, "step": 3790 }, { "epoch": 0.47381546134663344, "grad_norm": 3.5526254177093506, "learning_rate": 1.8106733167082295e-05, "loss": 0.0204, "step": 3800 }, { "epoch": 0.47506234413965087, "grad_norm": 0.0623275563120842, "learning_rate": 1.8101745635910225e-05, "loss": 0.102, "step": 3810 }, { "epoch": 0.4763092269326683, "grad_norm": 3.013119697570801, "learning_rate": 1.8096758104738156e-05, "loss": 0.0479, "step": 3820 }, { "epoch": 0.4775561097256858, "grad_norm": 0.018023619428277016, "learning_rate": 1.8091770573566087e-05, "loss": 0.058, "step": 3830 }, { "epoch": 0.47880299251870323, "grad_norm": 0.05830911546945572, "learning_rate": 1.8086783042394017e-05, "loss": 0.014, "step": 3840 }, { "epoch": 0.4800498753117207, "grad_norm": 0.3777295649051666, "learning_rate": 1.8081795511221948e-05, "loss": 0.08, "step": 3850 }, { "epoch": 0.48129675810473815, "grad_norm": 0.010745448060333729, "learning_rate": 1.807680798004988e-05, "loss": 0.1088, "step": 3860 }, { "epoch": 0.4825436408977556, "grad_norm": 0.2874574065208435, "learning_rate": 1.8071820448877806e-05, "loss": 0.062, "step": 3870 }, { "epoch": 0.4837905236907731, "grad_norm": 0.011869000270962715, "learning_rate": 1.8066832917705736e-05, "loss": 0.1795, "step": 3880 }, { "epoch": 0.4850374064837905, "grad_norm": 5.51703405380249, "learning_rate": 1.8061845386533667e-05, "loss": 0.0493, "step": 3890 }, { "epoch": 0.486284289276808, "grad_norm": 0.057681404054164886, "learning_rate": 1.8056857855361597e-05, "loss": 0.0127, "step": 3900 }, { "epoch": 0.48753117206982544, "grad_norm": 0.4616217613220215, "learning_rate": 1.8051870324189528e-05, "loss": 0.0861, "step": 3910 }, { "epoch": 0.48877805486284287, "grad_norm": 38.90608596801758, "learning_rate": 1.804688279301746e-05, "loss": 0.1033, "step": 3920 }, { "epoch": 0.49002493765586036, "grad_norm": 0.015819426625967026, "learning_rate": 1.804189526184539e-05, "loss": 0.077, "step": 3930 }, { "epoch": 0.4912718204488778, "grad_norm": 55.003570556640625, "learning_rate": 1.803690773067332e-05, "loss": 0.1412, "step": 3940 }, { "epoch": 0.4925187032418953, "grad_norm": 0.5927708148956299, "learning_rate": 1.803192019950125e-05, "loss": 0.0562, "step": 3950 }, { "epoch": 0.4937655860349127, "grad_norm": 30.10116195678711, "learning_rate": 1.8026932668329178e-05, "loss": 0.1947, "step": 3960 }, { "epoch": 0.49501246882793015, "grad_norm": 28.36803436279297, "learning_rate": 1.802194513715711e-05, "loss": 0.0687, "step": 3970 }, { "epoch": 0.49625935162094764, "grad_norm": 4.661812782287598, "learning_rate": 1.801695760598504e-05, "loss": 0.1681, "step": 3980 }, { "epoch": 0.4975062344139651, "grad_norm": 6.973540306091309, "learning_rate": 1.801197007481297e-05, "loss": 0.1108, "step": 3990 }, { "epoch": 0.49875311720698257, "grad_norm": 0.3708229660987854, "learning_rate": 1.8006982543640897e-05, "loss": 0.081, "step": 4000 }, { "epoch": 0.5, "grad_norm": 0.08219872415065765, "learning_rate": 1.8001995012468828e-05, "loss": 0.067, "step": 4010 }, { "epoch": 0.5012468827930174, "grad_norm": 0.05088973417878151, "learning_rate": 1.799700748129676e-05, "loss": 0.1103, "step": 4020 }, { "epoch": 0.5024937655860349, "grad_norm": 9.498299598693848, "learning_rate": 1.7992019950124692e-05, "loss": 0.0871, "step": 4030 }, { "epoch": 0.5037406483790524, "grad_norm": 10.723158836364746, "learning_rate": 1.798703241895262e-05, "loss": 0.0854, "step": 4040 }, { "epoch": 0.5049875311720698, "grad_norm": 1.3653846979141235, "learning_rate": 1.798204488778055e-05, "loss": 0.0603, "step": 4050 }, { "epoch": 0.5062344139650873, "grad_norm": 0.060665953904390335, "learning_rate": 1.797705735660848e-05, "loss": 0.0416, "step": 4060 }, { "epoch": 0.5074812967581047, "grad_norm": 0.05798611417412758, "learning_rate": 1.797206982543641e-05, "loss": 0.1228, "step": 4070 }, { "epoch": 0.5087281795511222, "grad_norm": 0.26482218503952026, "learning_rate": 1.796708229426434e-05, "loss": 0.0735, "step": 4080 }, { "epoch": 0.5099750623441397, "grad_norm": 0.2417094111442566, "learning_rate": 1.796209476309227e-05, "loss": 0.0061, "step": 4090 }, { "epoch": 0.5112219451371571, "grad_norm": 0.032584238797426224, "learning_rate": 1.79571072319202e-05, "loss": 0.0511, "step": 4100 }, { "epoch": 0.5124688279301746, "grad_norm": 57.22048568725586, "learning_rate": 1.795211970074813e-05, "loss": 0.1156, "step": 4110 }, { "epoch": 0.513715710723192, "grad_norm": 40.46946334838867, "learning_rate": 1.794713216957606e-05, "loss": 0.1286, "step": 4120 }, { "epoch": 0.5149625935162094, "grad_norm": 0.05879024416208267, "learning_rate": 1.794214463840399e-05, "loss": 0.1155, "step": 4130 }, { "epoch": 0.516209476309227, "grad_norm": 0.017375554889440536, "learning_rate": 1.7937157107231922e-05, "loss": 0.0445, "step": 4140 }, { "epoch": 0.5174563591022444, "grad_norm": 0.07895904779434204, "learning_rate": 1.7932169576059853e-05, "loss": 0.0931, "step": 4150 }, { "epoch": 0.5187032418952618, "grad_norm": 17.329853057861328, "learning_rate": 1.792718204488778e-05, "loss": 0.1106, "step": 4160 }, { "epoch": 0.5199501246882793, "grad_norm": 6.252292156219482, "learning_rate": 1.792219451371571e-05, "loss": 0.1499, "step": 4170 }, { "epoch": 0.5211970074812967, "grad_norm": 4.099524974822998, "learning_rate": 1.791720698254364e-05, "loss": 0.0033, "step": 4180 }, { "epoch": 0.5224438902743143, "grad_norm": 26.322385787963867, "learning_rate": 1.7912219451371572e-05, "loss": 0.0409, "step": 4190 }, { "epoch": 0.5236907730673317, "grad_norm": 44.511905670166016, "learning_rate": 1.7907231920199503e-05, "loss": 0.1004, "step": 4200 }, { "epoch": 0.5249376558603491, "grad_norm": 0.012727702967822552, "learning_rate": 1.7902244389027433e-05, "loss": 0.0249, "step": 4210 }, { "epoch": 0.5261845386533666, "grad_norm": 5.822264194488525, "learning_rate": 1.7897256857855364e-05, "loss": 0.0436, "step": 4220 }, { "epoch": 0.527431421446384, "grad_norm": 0.01628972217440605, "learning_rate": 1.7892269326683294e-05, "loss": 0.1117, "step": 4230 }, { "epoch": 0.5286783042394015, "grad_norm": 0.15717430412769318, "learning_rate": 1.7887281795511225e-05, "loss": 0.0526, "step": 4240 }, { "epoch": 0.529925187032419, "grad_norm": 0.03874291479587555, "learning_rate": 1.7882294264339152e-05, "loss": 0.0353, "step": 4250 }, { "epoch": 0.5311720698254364, "grad_norm": 0.02741248533129692, "learning_rate": 1.7877306733167083e-05, "loss": 0.2038, "step": 4260 }, { "epoch": 0.5324189526184538, "grad_norm": 0.10414528846740723, "learning_rate": 1.7872319201995013e-05, "loss": 0.0228, "step": 4270 }, { "epoch": 0.5336658354114713, "grad_norm": 11.053817749023438, "learning_rate": 1.7867331670822944e-05, "loss": 0.1856, "step": 4280 }, { "epoch": 0.5349127182044888, "grad_norm": 0.044340793043375015, "learning_rate": 1.7862344139650875e-05, "loss": 0.1366, "step": 4290 }, { "epoch": 0.5361596009975063, "grad_norm": 0.7331774830818176, "learning_rate": 1.7857356608478805e-05, "loss": 0.075, "step": 4300 }, { "epoch": 0.5374064837905237, "grad_norm": 0.03085249662399292, "learning_rate": 1.7852369077306736e-05, "loss": 0.0312, "step": 4310 }, { "epoch": 0.5386533665835411, "grad_norm": 0.10700695961713791, "learning_rate": 1.7847381546134667e-05, "loss": 0.0418, "step": 4320 }, { "epoch": 0.5399002493765586, "grad_norm": 14.032788276672363, "learning_rate": 1.7842394014962594e-05, "loss": 0.1199, "step": 4330 }, { "epoch": 0.5411471321695761, "grad_norm": 1.1185815334320068, "learning_rate": 1.7837406483790524e-05, "loss": 0.1206, "step": 4340 }, { "epoch": 0.5423940149625935, "grad_norm": 0.02402268722653389, "learning_rate": 1.7832418952618455e-05, "loss": 0.0587, "step": 4350 }, { "epoch": 0.543640897755611, "grad_norm": 10.920904159545898, "learning_rate": 1.7827431421446386e-05, "loss": 0.096, "step": 4360 }, { "epoch": 0.5448877805486284, "grad_norm": 0.791688859462738, "learning_rate": 1.7822443890274316e-05, "loss": 0.0948, "step": 4370 }, { "epoch": 0.5461346633416458, "grad_norm": 0.16529826819896698, "learning_rate": 1.7817456359102247e-05, "loss": 0.0734, "step": 4380 }, { "epoch": 0.5473815461346634, "grad_norm": 0.02176223322749138, "learning_rate": 1.7812468827930178e-05, "loss": 0.0816, "step": 4390 }, { "epoch": 0.5486284289276808, "grad_norm": 0.0856514424085617, "learning_rate": 1.7807481296758108e-05, "loss": 0.0703, "step": 4400 }, { "epoch": 0.5498753117206983, "grad_norm": 0.014872372150421143, "learning_rate": 1.7802493765586035e-05, "loss": 0.0823, "step": 4410 }, { "epoch": 0.5511221945137157, "grad_norm": 5.188063621520996, "learning_rate": 1.7797506234413966e-05, "loss": 0.0336, "step": 4420 }, { "epoch": 0.5523690773067331, "grad_norm": 0.14992894232273102, "learning_rate": 1.7792518703241897e-05, "loss": 0.0741, "step": 4430 }, { "epoch": 0.5536159600997507, "grad_norm": 1.0823159217834473, "learning_rate": 1.7787531172069827e-05, "loss": 0.0574, "step": 4440 }, { "epoch": 0.5548628428927681, "grad_norm": 0.0768449455499649, "learning_rate": 1.7782543640897758e-05, "loss": 0.0845, "step": 4450 }, { "epoch": 0.5561097256857855, "grad_norm": 0.015782279893755913, "learning_rate": 1.7777556109725685e-05, "loss": 0.0828, "step": 4460 }, { "epoch": 0.557356608478803, "grad_norm": 0.022945543751120567, "learning_rate": 1.7772568578553616e-05, "loss": 0.1476, "step": 4470 }, { "epoch": 0.5586034912718204, "grad_norm": 0.04019872471690178, "learning_rate": 1.7767581047381546e-05, "loss": 0.0036, "step": 4480 }, { "epoch": 0.559850374064838, "grad_norm": 30.77685546875, "learning_rate": 1.776259351620948e-05, "loss": 0.0562, "step": 4490 }, { "epoch": 0.5610972568578554, "grad_norm": 0.035899385809898376, "learning_rate": 1.7757605985037408e-05, "loss": 0.0271, "step": 4500 }, { "epoch": 0.5623441396508728, "grad_norm": Infinity, "learning_rate": 1.7753117206982544e-05, "loss": 0.1555, "step": 4510 }, { "epoch": 0.5635910224438903, "grad_norm": 47.183685302734375, "learning_rate": 1.7748129675810475e-05, "loss": 0.1514, "step": 4520 }, { "epoch": 0.5648379052369077, "grad_norm": 0.1414586901664734, "learning_rate": 1.7743142144638405e-05, "loss": 0.0446, "step": 4530 }, { "epoch": 0.5660847880299252, "grad_norm": 6.158328056335449, "learning_rate": 1.7738154613466336e-05, "loss": 0.092, "step": 4540 }, { "epoch": 0.5673316708229427, "grad_norm": 0.13686908781528473, "learning_rate": 1.7733167082294263e-05, "loss": 0.0638, "step": 4550 }, { "epoch": 0.5685785536159601, "grad_norm": 0.09912706911563873, "learning_rate": 1.7728179551122194e-05, "loss": 0.0048, "step": 4560 }, { "epoch": 0.5698254364089775, "grad_norm": 31.120927810668945, "learning_rate": 1.7723192019950128e-05, "loss": 0.0572, "step": 4570 }, { "epoch": 0.571072319201995, "grad_norm": 0.1557435244321823, "learning_rate": 1.771820448877806e-05, "loss": 0.0498, "step": 4580 }, { "epoch": 0.5723192019950125, "grad_norm": 0.07351752370595932, "learning_rate": 1.771321695760599e-05, "loss": 0.0842, "step": 4590 }, { "epoch": 0.57356608478803, "grad_norm": 0.2594575583934784, "learning_rate": 1.7708229426433916e-05, "loss": 0.0907, "step": 4600 }, { "epoch": 0.5748129675810474, "grad_norm": 0.011486229486763477, "learning_rate": 1.7703241895261847e-05, "loss": 0.0171, "step": 4610 }, { "epoch": 0.5760598503740648, "grad_norm": 0.23140795528888702, "learning_rate": 1.7698254364089778e-05, "loss": 0.0596, "step": 4620 }, { "epoch": 0.5773067331670823, "grad_norm": 0.12582992017269135, "learning_rate": 1.7693266832917708e-05, "loss": 0.0593, "step": 4630 }, { "epoch": 0.5785536159600998, "grad_norm": 1.3327033519744873, "learning_rate": 1.7688279301745635e-05, "loss": 0.1281, "step": 4640 }, { "epoch": 0.5798004987531172, "grad_norm": 0.009501133114099503, "learning_rate": 1.7683291770573566e-05, "loss": 0.0942, "step": 4650 }, { "epoch": 0.5810473815461347, "grad_norm": 0.15668757259845734, "learning_rate": 1.7678304239401497e-05, "loss": 0.0793, "step": 4660 }, { "epoch": 0.5822942643391521, "grad_norm": 0.04126293212175369, "learning_rate": 1.7673316708229427e-05, "loss": 0.0937, "step": 4670 }, { "epoch": 0.5835411471321695, "grad_norm": 0.015120592899620533, "learning_rate": 1.7668329177057358e-05, "loss": 0.0046, "step": 4680 }, { "epoch": 0.5847880299251871, "grad_norm": 0.05965186282992363, "learning_rate": 1.766334164588529e-05, "loss": 0.1212, "step": 4690 }, { "epoch": 0.5860349127182045, "grad_norm": 15.11082649230957, "learning_rate": 1.765835411471322e-05, "loss": 0.1122, "step": 4700 }, { "epoch": 0.587281795511222, "grad_norm": 10.484663009643555, "learning_rate": 1.765336658354115e-05, "loss": 0.0907, "step": 4710 }, { "epoch": 0.5885286783042394, "grad_norm": 24.538185119628906, "learning_rate": 1.7648379052369077e-05, "loss": 0.1617, "step": 4720 }, { "epoch": 0.5897755610972568, "grad_norm": 4.25324821472168, "learning_rate": 1.7643391521197008e-05, "loss": 0.0357, "step": 4730 }, { "epoch": 0.5910224438902744, "grad_norm": 4.515763282775879, "learning_rate": 1.7638403990024938e-05, "loss": 0.0999, "step": 4740 }, { "epoch": 0.5922693266832918, "grad_norm": 0.033570531755685806, "learning_rate": 1.763341645885287e-05, "loss": 0.0471, "step": 4750 }, { "epoch": 0.5935162094763092, "grad_norm": 0.023912442848086357, "learning_rate": 1.76284289276808e-05, "loss": 0.0029, "step": 4760 }, { "epoch": 0.5947630922693267, "grad_norm": 0.4626505374908447, "learning_rate": 1.762344139650873e-05, "loss": 0.1512, "step": 4770 }, { "epoch": 0.5960099750623441, "grad_norm": 0.5773617029190063, "learning_rate": 1.761845386533666e-05, "loss": 0.0808, "step": 4780 }, { "epoch": 0.5972568578553616, "grad_norm": 0.0236464012414217, "learning_rate": 1.761346633416459e-05, "loss": 0.0687, "step": 4790 }, { "epoch": 0.5985037406483791, "grad_norm": 5.944710731506348, "learning_rate": 1.760847880299252e-05, "loss": 0.0253, "step": 4800 }, { "epoch": 0.5997506234413965, "grad_norm": 19.9612979888916, "learning_rate": 1.760349127182045e-05, "loss": 0.1472, "step": 4810 }, { "epoch": 0.600997506234414, "grad_norm": 31.07427406311035, "learning_rate": 1.759850374064838e-05, "loss": 0.0273, "step": 4820 }, { "epoch": 0.6022443890274314, "grad_norm": 0.054695580154657364, "learning_rate": 1.759351620947631e-05, "loss": 0.0612, "step": 4830 }, { "epoch": 0.6034912718204489, "grad_norm": 0.5819992423057556, "learning_rate": 1.758852867830424e-05, "loss": 0.0695, "step": 4840 }, { "epoch": 0.6047381546134664, "grad_norm": 0.04540835693478584, "learning_rate": 1.758354114713217e-05, "loss": 0.0961, "step": 4850 }, { "epoch": 0.6059850374064838, "grad_norm": 0.22553811967372894, "learning_rate": 1.7578553615960102e-05, "loss": 0.0476, "step": 4860 }, { "epoch": 0.6072319201995012, "grad_norm": 0.016304977238178253, "learning_rate": 1.7573566084788033e-05, "loss": 0.1701, "step": 4870 }, { "epoch": 0.6084788029925187, "grad_norm": 3.0572760105133057, "learning_rate": 1.7568578553615963e-05, "loss": 0.0097, "step": 4880 }, { "epoch": 0.6097256857855362, "grad_norm": 11.772346496582031, "learning_rate": 1.756359102244389e-05, "loss": 0.0492, "step": 4890 }, { "epoch": 0.6109725685785536, "grad_norm": 6.350620746612549, "learning_rate": 1.755860349127182e-05, "loss": 0.0661, "step": 4900 }, { "epoch": 0.6122194513715711, "grad_norm": 0.015547297894954681, "learning_rate": 1.7553615960099752e-05, "loss": 0.0835, "step": 4910 }, { "epoch": 0.6134663341645885, "grad_norm": 37.15751266479492, "learning_rate": 1.7548628428927683e-05, "loss": 0.0571, "step": 4920 }, { "epoch": 0.614713216957606, "grad_norm": 0.03281101956963539, "learning_rate": 1.7543640897755613e-05, "loss": 0.1247, "step": 4930 }, { "epoch": 0.6159600997506235, "grad_norm": 6.25891637802124, "learning_rate": 1.7538653366583544e-05, "loss": 0.0423, "step": 4940 }, { "epoch": 0.6172069825436409, "grad_norm": 30.621646881103516, "learning_rate": 1.7533665835411474e-05, "loss": 0.0622, "step": 4950 }, { "epoch": 0.6184538653366584, "grad_norm": 37.61522674560547, "learning_rate": 1.7528678304239405e-05, "loss": 0.1214, "step": 4960 }, { "epoch": 0.6197007481296758, "grad_norm": 0.08783353865146637, "learning_rate": 1.7523690773067332e-05, "loss": 0.0865, "step": 4970 }, { "epoch": 0.6209476309226932, "grad_norm": 5.21088171005249, "learning_rate": 1.7518703241895263e-05, "loss": 0.1417, "step": 4980 }, { "epoch": 0.6221945137157108, "grad_norm": 1.4768476486206055, "learning_rate": 1.7513715710723193e-05, "loss": 0.0691, "step": 4990 }, { "epoch": 0.6234413965087282, "grad_norm": 0.037035878747701645, "learning_rate": 1.7508728179551124e-05, "loss": 0.0474, "step": 5000 }, { "epoch": 0.6246882793017456, "grad_norm": 7.823322296142578, "learning_rate": 1.750374064837905e-05, "loss": 0.1349, "step": 5010 }, { "epoch": 0.6259351620947631, "grad_norm": 0.01354975439608097, "learning_rate": 1.7498753117206982e-05, "loss": 0.086, "step": 5020 }, { "epoch": 0.6271820448877805, "grad_norm": 0.1404193490743637, "learning_rate": 1.7493765586034913e-05, "loss": 0.0822, "step": 5030 }, { "epoch": 0.628428927680798, "grad_norm": 27.83915138244629, "learning_rate": 1.7488778054862847e-05, "loss": 0.0993, "step": 5040 }, { "epoch": 0.6296758104738155, "grad_norm": 0.03925345838069916, "learning_rate": 1.7483790523690774e-05, "loss": 0.0913, "step": 5050 }, { "epoch": 0.6309226932668329, "grad_norm": 21.24605369567871, "learning_rate": 1.7478802992518704e-05, "loss": 0.0682, "step": 5060 }, { "epoch": 0.6321695760598504, "grad_norm": 7.983184337615967, "learning_rate": 1.7473815461346635e-05, "loss": 0.1279, "step": 5070 }, { "epoch": 0.6334164588528678, "grad_norm": 0.03581385314464569, "learning_rate": 1.7468827930174566e-05, "loss": 0.0188, "step": 5080 }, { "epoch": 0.6346633416458853, "grad_norm": 0.03261757269501686, "learning_rate": 1.7463840399002496e-05, "loss": 0.0554, "step": 5090 }, { "epoch": 0.6359102244389028, "grad_norm": 0.060816071927547455, "learning_rate": 1.7458852867830424e-05, "loss": 0.0434, "step": 5100 }, { "epoch": 0.6371571072319202, "grad_norm": 20.245588302612305, "learning_rate": 1.7453865336658354e-05, "loss": 0.1272, "step": 5110 }, { "epoch": 0.6384039900249376, "grad_norm": 18.135963439941406, "learning_rate": 1.7448877805486285e-05, "loss": 0.1471, "step": 5120 }, { "epoch": 0.6396508728179551, "grad_norm": 25.722951889038086, "learning_rate": 1.7443890274314215e-05, "loss": 0.0608, "step": 5130 }, { "epoch": 0.6408977556109726, "grad_norm": 8.285656929016113, "learning_rate": 1.7438902743142146e-05, "loss": 0.111, "step": 5140 }, { "epoch": 0.64214463840399, "grad_norm": 0.15213392674922943, "learning_rate": 1.7433915211970077e-05, "loss": 0.1607, "step": 5150 }, { "epoch": 0.6433915211970075, "grad_norm": 9.305551528930664, "learning_rate": 1.7428927680798007e-05, "loss": 0.0749, "step": 5160 }, { "epoch": 0.6446384039900249, "grad_norm": 27.343847274780273, "learning_rate": 1.7423940149625938e-05, "loss": 0.0931, "step": 5170 }, { "epoch": 0.6458852867830424, "grad_norm": 0.03424638509750366, "learning_rate": 1.7418952618453865e-05, "loss": 0.0532, "step": 5180 }, { "epoch": 0.6471321695760599, "grad_norm": 3.35919451713562, "learning_rate": 1.7413965087281796e-05, "loss": 0.022, "step": 5190 }, { "epoch": 0.6483790523690773, "grad_norm": 5.452242374420166, "learning_rate": 1.7408977556109726e-05, "loss": 0.0829, "step": 5200 }, { "epoch": 0.6496259351620948, "grad_norm": 0.03790761157870293, "learning_rate": 1.7403990024937657e-05, "loss": 0.0938, "step": 5210 }, { "epoch": 0.6508728179551122, "grad_norm": 0.05794584006071091, "learning_rate": 1.7399002493765588e-05, "loss": 0.0363, "step": 5220 }, { "epoch": 0.6521197007481296, "grad_norm": 33.54241180419922, "learning_rate": 1.7394014962593518e-05, "loss": 0.0982, "step": 5230 }, { "epoch": 0.6533665835411472, "grad_norm": 31.3079891204834, "learning_rate": 1.738902743142145e-05, "loss": 0.1463, "step": 5240 }, { "epoch": 0.6546134663341646, "grad_norm": 0.4372890591621399, "learning_rate": 1.738403990024938e-05, "loss": 0.0741, "step": 5250 }, { "epoch": 0.655860349127182, "grad_norm": 25.03449821472168, "learning_rate": 1.7379052369077307e-05, "loss": 0.1668, "step": 5260 }, { "epoch": 0.6571072319201995, "grad_norm": 8.57763957977295, "learning_rate": 1.7374064837905237e-05, "loss": 0.0704, "step": 5270 }, { "epoch": 0.6583541147132169, "grad_norm": 0.027418160811066628, "learning_rate": 1.7369077306733168e-05, "loss": 0.0578, "step": 5280 }, { "epoch": 0.6596009975062345, "grad_norm": 0.06828586012125015, "learning_rate": 1.73640897755611e-05, "loss": 0.1153, "step": 5290 }, { "epoch": 0.6608478802992519, "grad_norm": 16.321455001831055, "learning_rate": 1.735910224438903e-05, "loss": 0.0171, "step": 5300 }, { "epoch": 0.6620947630922693, "grad_norm": 0.0578501932322979, "learning_rate": 1.735411471321696e-05, "loss": 0.0581, "step": 5310 }, { "epoch": 0.6633416458852868, "grad_norm": 0.021183906123042107, "learning_rate": 1.734912718204489e-05, "loss": 0.025, "step": 5320 }, { "epoch": 0.6645885286783042, "grad_norm": 0.0674271285533905, "learning_rate": 1.734413965087282e-05, "loss": 0.0311, "step": 5330 }, { "epoch": 0.6658354114713217, "grad_norm": 0.015874937176704407, "learning_rate": 1.733915211970075e-05, "loss": 0.0561, "step": 5340 }, { "epoch": 0.6670822942643392, "grad_norm": 19.91120719909668, "learning_rate": 1.733416458852868e-05, "loss": 0.0661, "step": 5350 }, { "epoch": 0.6683291770573566, "grad_norm": 0.144506573677063, "learning_rate": 1.732917705735661e-05, "loss": 0.0181, "step": 5360 }, { "epoch": 0.669576059850374, "grad_norm": 0.03757374733686447, "learning_rate": 1.732418952618454e-05, "loss": 0.0721, "step": 5370 }, { "epoch": 0.6708229426433915, "grad_norm": 5.466207504272461, "learning_rate": 1.731920199501247e-05, "loss": 0.035, "step": 5380 }, { "epoch": 0.672069825436409, "grad_norm": 0.16552552580833435, "learning_rate": 1.73142144638404e-05, "loss": 0.0149, "step": 5390 }, { "epoch": 0.6733167082294265, "grad_norm": 17.71544647216797, "learning_rate": 1.7309226932668332e-05, "loss": 0.0357, "step": 5400 }, { "epoch": 0.6745635910224439, "grad_norm": 21.00069236755371, "learning_rate": 1.7304239401496263e-05, "loss": 0.102, "step": 5410 }, { "epoch": 0.6758104738154613, "grad_norm": 0.01819881983101368, "learning_rate": 1.7299251870324193e-05, "loss": 0.0978, "step": 5420 }, { "epoch": 0.6770573566084788, "grad_norm": 0.01410367339849472, "learning_rate": 1.729426433915212e-05, "loss": 0.0038, "step": 5430 }, { "epoch": 0.6783042394014963, "grad_norm": 34.391624450683594, "learning_rate": 1.728927680798005e-05, "loss": 0.0995, "step": 5440 }, { "epoch": 0.6795511221945137, "grad_norm": 0.0064294287003576756, "learning_rate": 1.728428927680798e-05, "loss": 0.0772, "step": 5450 }, { "epoch": 0.6807980049875312, "grad_norm": 11.470839500427246, "learning_rate": 1.7279301745635912e-05, "loss": 0.0381, "step": 5460 }, { "epoch": 0.6820448877805486, "grad_norm": 0.05597744882106781, "learning_rate": 1.727431421446384e-05, "loss": 0.0378, "step": 5470 }, { "epoch": 0.683291770573566, "grad_norm": 22.418514251708984, "learning_rate": 1.726932668329177e-05, "loss": 0.1113, "step": 5480 }, { "epoch": 0.6845386533665836, "grad_norm": 23.031198501586914, "learning_rate": 1.72643391521197e-05, "loss": 0.0742, "step": 5490 }, { "epoch": 0.685785536159601, "grad_norm": 0.4775936007499695, "learning_rate": 1.725935162094763e-05, "loss": 0.1278, "step": 5500 }, { "epoch": 0.6870324189526185, "grad_norm": 0.008405367843806744, "learning_rate": 1.7254364089775562e-05, "loss": 0.0549, "step": 5510 }, { "epoch": 0.6882793017456359, "grad_norm": 0.021200506016612053, "learning_rate": 1.7249376558603493e-05, "loss": 0.0112, "step": 5520 }, { "epoch": 0.6895261845386533, "grad_norm": 0.009464691393077374, "learning_rate": 1.7244389027431423e-05, "loss": 0.0338, "step": 5530 }, { "epoch": 0.6907730673316709, "grad_norm": 0.1951027363538742, "learning_rate": 1.7239401496259354e-05, "loss": 0.0759, "step": 5540 }, { "epoch": 0.6920199501246883, "grad_norm": 6.1231231689453125, "learning_rate": 1.723441396508728e-05, "loss": 0.1388, "step": 5550 }, { "epoch": 0.6932668329177057, "grad_norm": 19.70039939880371, "learning_rate": 1.7229426433915212e-05, "loss": 0.0925, "step": 5560 }, { "epoch": 0.6945137157107232, "grad_norm": 4.138513088226318, "learning_rate": 1.7224438902743142e-05, "loss": 0.0567, "step": 5570 }, { "epoch": 0.6957605985037406, "grad_norm": 0.1160452663898468, "learning_rate": 1.7219451371571073e-05, "loss": 0.1006, "step": 5580 }, { "epoch": 0.6970074812967582, "grad_norm": 15.692948341369629, "learning_rate": 1.7214463840399004e-05, "loss": 0.1361, "step": 5590 }, { "epoch": 0.6982543640897756, "grad_norm": 0.011452429927885532, "learning_rate": 1.7209476309226934e-05, "loss": 0.005, "step": 5600 }, { "epoch": 0.699501246882793, "grad_norm": 0.11285385489463806, "learning_rate": 1.7204488778054865e-05, "loss": 0.0102, "step": 5610 }, { "epoch": 0.7007481296758105, "grad_norm": 23.67371940612793, "learning_rate": 1.7199501246882795e-05, "loss": 0.0358, "step": 5620 }, { "epoch": 0.7019950124688279, "grad_norm": 5.408916473388672, "learning_rate": 1.7194513715710726e-05, "loss": 0.0636, "step": 5630 }, { "epoch": 0.7032418952618454, "grad_norm": 0.05330492928624153, "learning_rate": 1.7189526184538653e-05, "loss": 0.1374, "step": 5640 }, { "epoch": 0.7044887780548629, "grad_norm": 19.458311080932617, "learning_rate": 1.7184538653366584e-05, "loss": 0.0404, "step": 5650 }, { "epoch": 0.7057356608478803, "grad_norm": 0.012190482579171658, "learning_rate": 1.7179551122194515e-05, "loss": 0.1912, "step": 5660 }, { "epoch": 0.7069825436408977, "grad_norm": 1.1746265888214111, "learning_rate": 1.7174563591022445e-05, "loss": 0.0023, "step": 5670 }, { "epoch": 0.7082294264339152, "grad_norm": 0.18074272572994232, "learning_rate": 1.7169576059850376e-05, "loss": 0.1154, "step": 5680 }, { "epoch": 0.7094763092269327, "grad_norm": 0.013185709714889526, "learning_rate": 1.7164588528678306e-05, "loss": 0.0299, "step": 5690 }, { "epoch": 0.7107231920199502, "grad_norm": 1.1641533374786377, "learning_rate": 1.7159600997506237e-05, "loss": 0.049, "step": 5700 }, { "epoch": 0.7119700748129676, "grad_norm": 0.016002580523490906, "learning_rate": 1.7154613466334168e-05, "loss": 0.0609, "step": 5710 }, { "epoch": 0.713216957605985, "grad_norm": 0.015217593871057034, "learning_rate": 1.7149625935162095e-05, "loss": 0.1336, "step": 5720 }, { "epoch": 0.7144638403990025, "grad_norm": 0.008298971690237522, "learning_rate": 1.7144638403990025e-05, "loss": 0.0302, "step": 5730 }, { "epoch": 0.71571072319202, "grad_norm": 6.263095855712891, "learning_rate": 1.7139650872817956e-05, "loss": 0.0637, "step": 5740 }, { "epoch": 0.7169576059850374, "grad_norm": 0.22118152678012848, "learning_rate": 1.7134663341645887e-05, "loss": 0.0101, "step": 5750 }, { "epoch": 0.7182044887780549, "grad_norm": 5.958626747131348, "learning_rate": 1.7129675810473817e-05, "loss": 0.0364, "step": 5760 }, { "epoch": 0.7194513715710723, "grad_norm": 1.9810842275619507, "learning_rate": 1.7124688279301748e-05, "loss": 0.156, "step": 5770 }, { "epoch": 0.7206982543640897, "grad_norm": 0.03880726546049118, "learning_rate": 1.711970074812968e-05, "loss": 0.0693, "step": 5780 }, { "epoch": 0.7219451371571073, "grad_norm": 30.136775970458984, "learning_rate": 1.711471321695761e-05, "loss": 0.0967, "step": 5790 }, { "epoch": 0.7231920199501247, "grad_norm": 0.06249859556555748, "learning_rate": 1.7109725685785536e-05, "loss": 0.0188, "step": 5800 }, { "epoch": 0.7244389027431422, "grad_norm": 0.1799100935459137, "learning_rate": 1.7104738154613467e-05, "loss": 0.1193, "step": 5810 }, { "epoch": 0.7256857855361596, "grad_norm": 13.862339973449707, "learning_rate": 1.7099750623441398e-05, "loss": 0.0895, "step": 5820 }, { "epoch": 0.726932668329177, "grad_norm": 0.24228820204734802, "learning_rate": 1.7094763092269328e-05, "loss": 0.0109, "step": 5830 }, { "epoch": 0.7281795511221946, "grad_norm": 0.2275868058204651, "learning_rate": 1.708977556109726e-05, "loss": 0.0442, "step": 5840 }, { "epoch": 0.729426433915212, "grad_norm": 0.032161250710487366, "learning_rate": 1.708478802992519e-05, "loss": 0.1272, "step": 5850 }, { "epoch": 0.7306733167082294, "grad_norm": 38.00654220581055, "learning_rate": 1.707980049875312e-05, "loss": 0.0593, "step": 5860 }, { "epoch": 0.7319201995012469, "grad_norm": 12.633605003356934, "learning_rate": 1.707481296758105e-05, "loss": 0.1155, "step": 5870 }, { "epoch": 0.7331670822942643, "grad_norm": 0.16538533568382263, "learning_rate": 1.706982543640898e-05, "loss": 0.0581, "step": 5880 }, { "epoch": 0.7344139650872819, "grad_norm": 0.48946434259414673, "learning_rate": 1.706483790523691e-05, "loss": 0.0584, "step": 5890 }, { "epoch": 0.7356608478802993, "grad_norm": 0.027832500636577606, "learning_rate": 1.705985037406484e-05, "loss": 0.049, "step": 5900 }, { "epoch": 0.7369077306733167, "grad_norm": 0.07207493484020233, "learning_rate": 1.705486284289277e-05, "loss": 0.0297, "step": 5910 }, { "epoch": 0.7381546134663342, "grad_norm": 6.354367256164551, "learning_rate": 1.70498753117207e-05, "loss": 0.0807, "step": 5920 }, { "epoch": 0.7394014962593516, "grad_norm": 6.380330562591553, "learning_rate": 1.7044887780548628e-05, "loss": 0.1014, "step": 5930 }, { "epoch": 0.7406483790523691, "grad_norm": 43.741695404052734, "learning_rate": 1.7039900249376558e-05, "loss": 0.0899, "step": 5940 }, { "epoch": 0.7418952618453866, "grad_norm": 0.0922226682305336, "learning_rate": 1.703491271820449e-05, "loss": 0.1055, "step": 5950 }, { "epoch": 0.743142144638404, "grad_norm": 2.604271173477173, "learning_rate": 1.702992518703242e-05, "loss": 0.0821, "step": 5960 }, { "epoch": 0.7443890274314214, "grad_norm": 21.0607967376709, "learning_rate": 1.702493765586035e-05, "loss": 0.1462, "step": 5970 }, { "epoch": 0.7456359102244389, "grad_norm": 0.054813966155052185, "learning_rate": 1.701995012468828e-05, "loss": 0.1546, "step": 5980 }, { "epoch": 0.7468827930174564, "grad_norm": 41.906089782714844, "learning_rate": 1.701496259351621e-05, "loss": 0.0519, "step": 5990 }, { "epoch": 0.7481296758104738, "grad_norm": 0.035245224833488464, "learning_rate": 1.7009975062344142e-05, "loss": 0.0809, "step": 6000 }, { "epoch": 0.7493765586034913, "grad_norm": 0.029683228582143784, "learning_rate": 1.700498753117207e-05, "loss": 0.0532, "step": 6010 }, { "epoch": 0.7506234413965087, "grad_norm": 0.02576983906328678, "learning_rate": 1.7e-05, "loss": 0.0713, "step": 6020 }, { "epoch": 0.7518703241895262, "grad_norm": 0.014055408537387848, "learning_rate": 1.699501246882793e-05, "loss": 0.0065, "step": 6030 }, { "epoch": 0.7531172069825436, "grad_norm": 0.011719699949026108, "learning_rate": 1.699002493765586e-05, "loss": 0.0325, "step": 6040 }, { "epoch": 0.7543640897755611, "grad_norm": 0.021255536004900932, "learning_rate": 1.6985037406483792e-05, "loss": 0.1086, "step": 6050 }, { "epoch": 0.7556109725685786, "grad_norm": 0.6811969876289368, "learning_rate": 1.6980049875311722e-05, "loss": 0.0901, "step": 6060 }, { "epoch": 0.756857855361596, "grad_norm": 0.38240328431129456, "learning_rate": 1.6975062344139653e-05, "loss": 0.0455, "step": 6070 }, { "epoch": 0.7581047381546134, "grad_norm": 0.0566604882478714, "learning_rate": 1.6970074812967584e-05, "loss": 0.0675, "step": 6080 }, { "epoch": 0.7593516209476309, "grad_norm": 0.12242597341537476, "learning_rate": 1.6965087281795514e-05, "loss": 0.0114, "step": 6090 }, { "epoch": 0.7605985037406484, "grad_norm": 0.2005094587802887, "learning_rate": 1.696009975062344e-05, "loss": 0.0902, "step": 6100 }, { "epoch": 0.7618453865336658, "grad_norm": 0.3194235861301422, "learning_rate": 1.6955112219451372e-05, "loss": 0.0054, "step": 6110 }, { "epoch": 0.7630922693266833, "grad_norm": 4.714141845703125, "learning_rate": 1.6950124688279303e-05, "loss": 0.1559, "step": 6120 }, { "epoch": 0.7643391521197007, "grad_norm": 2.5545177459716797, "learning_rate": 1.6945137157107233e-05, "loss": 0.0985, "step": 6130 }, { "epoch": 0.7655860349127181, "grad_norm": 0.014316755346953869, "learning_rate": 1.6940149625935164e-05, "loss": 0.0605, "step": 6140 }, { "epoch": 0.7668329177057357, "grad_norm": 0.5875915288925171, "learning_rate": 1.6935162094763095e-05, "loss": 0.009, "step": 6150 }, { "epoch": 0.7680798004987531, "grad_norm": 0.05648162588477135, "learning_rate": 1.6930174563591025e-05, "loss": 0.0426, "step": 6160 }, { "epoch": 0.7693266832917706, "grad_norm": 5.96530294418335, "learning_rate": 1.6925187032418956e-05, "loss": 0.0853, "step": 6170 }, { "epoch": 0.770573566084788, "grad_norm": 0.21136033535003662, "learning_rate": 1.6920199501246883e-05, "loss": 0.0251, "step": 6180 }, { "epoch": 0.7718204488778054, "grad_norm": 1.1626219749450684, "learning_rate": 1.6915211970074814e-05, "loss": 0.0397, "step": 6190 }, { "epoch": 0.773067331670823, "grad_norm": 0.010134602896869183, "learning_rate": 1.6910224438902744e-05, "loss": 0.0024, "step": 6200 }, { "epoch": 0.7743142144638404, "grad_norm": 0.15311940014362335, "learning_rate": 1.6905236907730675e-05, "loss": 0.0406, "step": 6210 }, { "epoch": 0.7755610972568578, "grad_norm": 0.00632174639031291, "learning_rate": 1.6900249376558605e-05, "loss": 0.0354, "step": 6220 }, { "epoch": 0.7768079800498753, "grad_norm": 2.030371904373169, "learning_rate": 1.6895261845386536e-05, "loss": 0.0799, "step": 6230 }, { "epoch": 0.7780548628428927, "grad_norm": 3.4602127075195312, "learning_rate": 1.6890274314214467e-05, "loss": 0.0697, "step": 6240 }, { "epoch": 0.7793017456359103, "grad_norm": 2.2719037532806396, "learning_rate": 1.6885286783042397e-05, "loss": 0.1192, "step": 6250 }, { "epoch": 0.7805486284289277, "grad_norm": 0.015602881088852882, "learning_rate": 1.6880299251870325e-05, "loss": 0.0238, "step": 6260 }, { "epoch": 0.7817955112219451, "grad_norm": 2.9303834438323975, "learning_rate": 1.6875311720698255e-05, "loss": 0.0726, "step": 6270 }, { "epoch": 0.7830423940149626, "grad_norm": 0.4048525393009186, "learning_rate": 1.6870324189526186e-05, "loss": 0.0489, "step": 6280 }, { "epoch": 0.78428927680798, "grad_norm": 0.10114028304815292, "learning_rate": 1.6865336658354116e-05, "loss": 0.0263, "step": 6290 }, { "epoch": 0.7855361596009975, "grad_norm": 10.986588478088379, "learning_rate": 1.6860349127182044e-05, "loss": 0.0775, "step": 6300 }, { "epoch": 0.786783042394015, "grad_norm": 16.34410285949707, "learning_rate": 1.6855361596009974e-05, "loss": 0.1214, "step": 6310 }, { "epoch": 0.7880299251870324, "grad_norm": 0.12011321634054184, "learning_rate": 1.6850374064837908e-05, "loss": 0.0858, "step": 6320 }, { "epoch": 0.7892768079800498, "grad_norm": 0.021219773218035698, "learning_rate": 1.684538653366584e-05, "loss": 0.1699, "step": 6330 }, { "epoch": 0.7905236907730673, "grad_norm": 0.5230464935302734, "learning_rate": 1.684039900249377e-05, "loss": 0.0289, "step": 6340 }, { "epoch": 0.7917705735660848, "grad_norm": 1.6605863571166992, "learning_rate": 1.6835411471321697e-05, "loss": 0.0072, "step": 6350 }, { "epoch": 0.7930174563591023, "grad_norm": 0.14761345088481903, "learning_rate": 1.6830423940149627e-05, "loss": 0.0995, "step": 6360 }, { "epoch": 0.7942643391521197, "grad_norm": 0.01933436468243599, "learning_rate": 1.6825436408977558e-05, "loss": 0.0627, "step": 6370 }, { "epoch": 0.7955112219451371, "grad_norm": 3.975325107574463, "learning_rate": 1.682044887780549e-05, "loss": 0.1487, "step": 6380 }, { "epoch": 0.7967581047381546, "grad_norm": 14.431473731994629, "learning_rate": 1.6815461346633416e-05, "loss": 0.1239, "step": 6390 }, { "epoch": 0.7980049875311721, "grad_norm": 0.07805877178907394, "learning_rate": 1.6810473815461346e-05, "loss": 0.0531, "step": 6400 }, { "epoch": 0.7992518703241895, "grad_norm": 0.10893228650093079, "learning_rate": 1.6805486284289277e-05, "loss": 0.0493, "step": 6410 }, { "epoch": 0.800498753117207, "grad_norm": 0.12542615830898285, "learning_rate": 1.6800498753117208e-05, "loss": 0.0231, "step": 6420 }, { "epoch": 0.8017456359102244, "grad_norm": 0.028069086372852325, "learning_rate": 1.679551122194514e-05, "loss": 0.2296, "step": 6430 }, { "epoch": 0.8029925187032418, "grad_norm": 25.13450813293457, "learning_rate": 1.679052369077307e-05, "loss": 0.0988, "step": 6440 }, { "epoch": 0.8042394014962594, "grad_norm": 0.1958978772163391, "learning_rate": 1.6785536159601e-05, "loss": 0.015, "step": 6450 }, { "epoch": 0.8054862842892768, "grad_norm": 4.110210418701172, "learning_rate": 1.678054862842893e-05, "loss": 0.0694, "step": 6460 }, { "epoch": 0.8067331670822943, "grad_norm": 0.1173863485455513, "learning_rate": 1.6775561097256857e-05, "loss": 0.0204, "step": 6470 }, { "epoch": 0.8079800498753117, "grad_norm": 0.0829818993806839, "learning_rate": 1.6770573566084788e-05, "loss": 0.0636, "step": 6480 }, { "epoch": 0.8092269326683291, "grad_norm": 0.2234659045934677, "learning_rate": 1.676558603491272e-05, "loss": 0.1161, "step": 6490 }, { "epoch": 0.8104738154613467, "grad_norm": 16.363990783691406, "learning_rate": 1.676059850374065e-05, "loss": 0.0608, "step": 6500 }, { "epoch": 0.8117206982543641, "grad_norm": 1.963186264038086, "learning_rate": 1.675561097256858e-05, "loss": 0.0062, "step": 6510 }, { "epoch": 0.8129675810473815, "grad_norm": 0.018916714936494827, "learning_rate": 1.675062344139651e-05, "loss": 0.1132, "step": 6520 }, { "epoch": 0.814214463840399, "grad_norm": 10.125832557678223, "learning_rate": 1.674563591022444e-05, "loss": 0.0553, "step": 6530 }, { "epoch": 0.8154613466334164, "grad_norm": 0.034013696014881134, "learning_rate": 1.6740648379052372e-05, "loss": 0.0283, "step": 6540 }, { "epoch": 0.816708229426434, "grad_norm": Infinity, "learning_rate": 1.6736159600997508e-05, "loss": 0.0506, "step": 6550 }, { "epoch": 0.8179551122194514, "grad_norm": 18.428625106811523, "learning_rate": 1.673117206982544e-05, "loss": 0.0326, "step": 6560 }, { "epoch": 0.8192019950124688, "grad_norm": 18.967618942260742, "learning_rate": 1.6726184538653366e-05, "loss": 0.0608, "step": 6570 }, { "epoch": 0.8204488778054863, "grad_norm": 9.87349796295166, "learning_rate": 1.6721197007481297e-05, "loss": 0.066, "step": 6580 }, { "epoch": 0.8216957605985037, "grad_norm": 0.5438946485519409, "learning_rate": 1.6716209476309227e-05, "loss": 0.0908, "step": 6590 }, { "epoch": 0.8229426433915212, "grad_norm": 15.214516639709473, "learning_rate": 1.6711221945137158e-05, "loss": 0.0403, "step": 6600 }, { "epoch": 0.8241895261845387, "grad_norm": 3.780027151107788, "learning_rate": 1.670623441396509e-05, "loss": 0.0534, "step": 6610 }, { "epoch": 0.8254364089775561, "grad_norm": 0.5979506373405457, "learning_rate": 1.670124688279302e-05, "loss": 0.0648, "step": 6620 }, { "epoch": 0.8266832917705735, "grad_norm": 0.009059612639248371, "learning_rate": 1.669625935162095e-05, "loss": 0.0309, "step": 6630 }, { "epoch": 0.827930174563591, "grad_norm": 0.012184210121631622, "learning_rate": 1.669127182044888e-05, "loss": 0.059, "step": 6640 }, { "epoch": 0.8291770573566085, "grad_norm": 0.005635616835206747, "learning_rate": 1.6686284289276808e-05, "loss": 0.003, "step": 6650 }, { "epoch": 0.830423940149626, "grad_norm": 0.09480497986078262, "learning_rate": 1.668129675810474e-05, "loss": 0.075, "step": 6660 }, { "epoch": 0.8316708229426434, "grad_norm": 0.024640752002596855, "learning_rate": 1.667630922693267e-05, "loss": 0.0301, "step": 6670 }, { "epoch": 0.8329177057356608, "grad_norm": 0.7184823155403137, "learning_rate": 1.66713216957606e-05, "loss": 0.0377, "step": 6680 }, { "epoch": 0.8341645885286783, "grad_norm": 38.88700485229492, "learning_rate": 1.666633416458853e-05, "loss": 0.0133, "step": 6690 }, { "epoch": 0.8354114713216958, "grad_norm": 14.80359172821045, "learning_rate": 1.666134663341646e-05, "loss": 0.0799, "step": 6700 }, { "epoch": 0.8366583541147132, "grad_norm": 0.06874788552522659, "learning_rate": 1.665635910224439e-05, "loss": 0.0161, "step": 6710 }, { "epoch": 0.8379052369077307, "grad_norm": 0.01392221450805664, "learning_rate": 1.6651371571072322e-05, "loss": 0.0068, "step": 6720 }, { "epoch": 0.8391521197007481, "grad_norm": 0.03665358945727348, "learning_rate": 1.664638403990025e-05, "loss": 0.0389, "step": 6730 }, { "epoch": 0.8403990024937655, "grad_norm": 0.10448456555604935, "learning_rate": 1.664139650872818e-05, "loss": 0.0791, "step": 6740 }, { "epoch": 0.8416458852867831, "grad_norm": 0.1475229412317276, "learning_rate": 1.663640897755611e-05, "loss": 0.1261, "step": 6750 }, { "epoch": 0.8428927680798005, "grad_norm": 5.749692916870117, "learning_rate": 1.663142144638404e-05, "loss": 0.0223, "step": 6760 }, { "epoch": 0.844139650872818, "grad_norm": 20.07712173461914, "learning_rate": 1.6626433915211972e-05, "loss": 0.0936, "step": 6770 }, { "epoch": 0.8453865336658354, "grad_norm": 6.963824272155762, "learning_rate": 1.6621446384039902e-05, "loss": 0.074, "step": 6780 }, { "epoch": 0.8466334164588528, "grad_norm": 0.057135455310344696, "learning_rate": 1.6616458852867833e-05, "loss": 0.1048, "step": 6790 }, { "epoch": 0.8478802992518704, "grad_norm": 0.057420868426561356, "learning_rate": 1.6611471321695764e-05, "loss": 0.0706, "step": 6800 }, { "epoch": 0.8491271820448878, "grad_norm": 14.05396842956543, "learning_rate": 1.6606483790523694e-05, "loss": 0.098, "step": 6810 }, { "epoch": 0.8503740648379052, "grad_norm": 0.3075912594795227, "learning_rate": 1.660149625935162e-05, "loss": 0.0735, "step": 6820 }, { "epoch": 0.8516209476309227, "grad_norm": 0.05939553305506706, "learning_rate": 1.6596508728179552e-05, "loss": 0.0436, "step": 6830 }, { "epoch": 0.8528678304239401, "grad_norm": 0.012576499953866005, "learning_rate": 1.6591521197007483e-05, "loss": 0.0904, "step": 6840 }, { "epoch": 0.8541147132169576, "grad_norm": 35.2950325012207, "learning_rate": 1.6586533665835413e-05, "loss": 0.0344, "step": 6850 }, { "epoch": 0.8553615960099751, "grad_norm": 0.36233705282211304, "learning_rate": 1.658154613466334e-05, "loss": 0.0047, "step": 6860 }, { "epoch": 0.8566084788029925, "grad_norm": 4.004518508911133, "learning_rate": 1.6576558603491275e-05, "loss": 0.0688, "step": 6870 }, { "epoch": 0.85785536159601, "grad_norm": 48.95320129394531, "learning_rate": 1.6571571072319205e-05, "loss": 0.0667, "step": 6880 }, { "epoch": 0.8591022443890274, "grad_norm": 30.85384178161621, "learning_rate": 1.6566583541147136e-05, "loss": 0.1294, "step": 6890 }, { "epoch": 0.8603491271820449, "grad_norm": 0.15950468182563782, "learning_rate": 1.6561596009975063e-05, "loss": 0.1183, "step": 6900 }, { "epoch": 0.8615960099750624, "grad_norm": 34.85268020629883, "learning_rate": 1.6556608478802994e-05, "loss": 0.0239, "step": 6910 }, { "epoch": 0.8628428927680798, "grad_norm": 4.5437164306640625, "learning_rate": 1.6551620947630924e-05, "loss": 0.0897, "step": 6920 }, { "epoch": 0.8640897755610972, "grad_norm": 0.06113681197166443, "learning_rate": 1.6546633416458855e-05, "loss": 0.076, "step": 6930 }, { "epoch": 0.8653366583541147, "grad_norm": 0.1872865855693817, "learning_rate": 1.6541645885286782e-05, "loss": 0.1372, "step": 6940 }, { "epoch": 0.8665835411471322, "grad_norm": 0.03576788678765297, "learning_rate": 1.6536658354114713e-05, "loss": 0.0023, "step": 6950 }, { "epoch": 0.8678304239401496, "grad_norm": 0.040885813534259796, "learning_rate": 1.6531670822942643e-05, "loss": 0.0762, "step": 6960 }, { "epoch": 0.8690773067331671, "grad_norm": 13.959723472595215, "learning_rate": 1.6526683291770574e-05, "loss": 0.0301, "step": 6970 }, { "epoch": 0.8703241895261845, "grad_norm": 0.5481641292572021, "learning_rate": 1.6521695760598505e-05, "loss": 0.0881, "step": 6980 }, { "epoch": 0.871571072319202, "grad_norm": 0.04001607373356819, "learning_rate": 1.6516708229426435e-05, "loss": 0.0146, "step": 6990 }, { "epoch": 0.8728179551122195, "grad_norm": 0.044817376881837845, "learning_rate": 1.6511720698254366e-05, "loss": 0.0412, "step": 7000 }, { "epoch": 0.8740648379052369, "grad_norm": 0.027472732588648796, "learning_rate": 1.6506733167082296e-05, "loss": 0.0923, "step": 7010 }, { "epoch": 0.8753117206982544, "grad_norm": 0.013095473870635033, "learning_rate": 1.6501745635910227e-05, "loss": 0.024, "step": 7020 }, { "epoch": 0.8765586034912718, "grad_norm": 0.2899605333805084, "learning_rate": 1.6496758104738154e-05, "loss": 0.0865, "step": 7030 }, { "epoch": 0.8778054862842892, "grad_norm": 0.07437755167484283, "learning_rate": 1.6491770573566085e-05, "loss": 0.0572, "step": 7040 }, { "epoch": 0.8790523690773068, "grad_norm": 0.01817794144153595, "learning_rate": 1.6486783042394016e-05, "loss": 0.0611, "step": 7050 }, { "epoch": 0.8802992518703242, "grad_norm": 2.1446876525878906, "learning_rate": 1.6481795511221946e-05, "loss": 0.1099, "step": 7060 }, { "epoch": 0.8815461346633416, "grad_norm": 0.42781832814216614, "learning_rate": 1.6476807980049877e-05, "loss": 0.1443, "step": 7070 }, { "epoch": 0.8827930174563591, "grad_norm": 0.02748112753033638, "learning_rate": 1.6471820448877807e-05, "loss": 0.0817, "step": 7080 }, { "epoch": 0.8840399002493765, "grad_norm": 0.025286074727773666, "learning_rate": 1.6466832917705738e-05, "loss": 0.0076, "step": 7090 }, { "epoch": 0.885286783042394, "grad_norm": 0.04752347618341446, "learning_rate": 1.646184538653367e-05, "loss": 0.0026, "step": 7100 }, { "epoch": 0.8865336658354115, "grad_norm": 0.01973377726972103, "learning_rate": 1.6456857855361596e-05, "loss": 0.058, "step": 7110 }, { "epoch": 0.8877805486284289, "grad_norm": 0.05321444198489189, "learning_rate": 1.6451870324189526e-05, "loss": 0.0864, "step": 7120 }, { "epoch": 0.8890274314214464, "grad_norm": 0.23317782580852509, "learning_rate": 1.6446882793017457e-05, "loss": 0.0535, "step": 7130 }, { "epoch": 0.8902743142144638, "grad_norm": 0.06026599556207657, "learning_rate": 1.6441895261845388e-05, "loss": 0.0212, "step": 7140 }, { "epoch": 0.8915211970074813, "grad_norm": 33.116111755371094, "learning_rate": 1.643690773067332e-05, "loss": 0.0691, "step": 7150 }, { "epoch": 0.8927680798004988, "grad_norm": 0.029996905475854874, "learning_rate": 1.643192019950125e-05, "loss": 0.0297, "step": 7160 }, { "epoch": 0.8940149625935162, "grad_norm": 34.09917068481445, "learning_rate": 1.642693266832918e-05, "loss": 0.1996, "step": 7170 }, { "epoch": 0.8952618453865336, "grad_norm": 14.287615776062012, "learning_rate": 1.642194513715711e-05, "loss": 0.0957, "step": 7180 }, { "epoch": 0.8965087281795511, "grad_norm": 0.03745246306061745, "learning_rate": 1.6416957605985037e-05, "loss": 0.1025, "step": 7190 }, { "epoch": 0.8977556109725686, "grad_norm": 1.472991943359375, "learning_rate": 1.6411970074812968e-05, "loss": 0.0165, "step": 7200 }, { "epoch": 0.899002493765586, "grad_norm": 10.71480655670166, "learning_rate": 1.64069825436409e-05, "loss": 0.0921, "step": 7210 }, { "epoch": 0.9002493765586035, "grad_norm": 10.575648307800293, "learning_rate": 1.640199501246883e-05, "loss": 0.032, "step": 7220 }, { "epoch": 0.9014962593516209, "grad_norm": 0.08652441203594208, "learning_rate": 1.639700748129676e-05, "loss": 0.0408, "step": 7230 }, { "epoch": 0.9027431421446384, "grad_norm": 0.15267132222652435, "learning_rate": 1.639201995012469e-05, "loss": 0.0088, "step": 7240 }, { "epoch": 0.9039900249376559, "grad_norm": 0.027022046968340874, "learning_rate": 1.638703241895262e-05, "loss": 0.0372, "step": 7250 }, { "epoch": 0.9052369077306733, "grad_norm": 0.02087612822651863, "learning_rate": 1.6382044887780552e-05, "loss": 0.0057, "step": 7260 }, { "epoch": 0.9064837905236908, "grad_norm": 0.007287871092557907, "learning_rate": 1.6377057356608482e-05, "loss": 0.0169, "step": 7270 }, { "epoch": 0.9077306733167082, "grad_norm": 1.5269182920455933, "learning_rate": 1.637206982543641e-05, "loss": 0.0477, "step": 7280 }, { "epoch": 0.9089775561097256, "grad_norm": 0.03564201295375824, "learning_rate": 1.636708229426434e-05, "loss": 0.0856, "step": 7290 }, { "epoch": 0.9102244389027432, "grad_norm": 0.033967025578022, "learning_rate": 1.636209476309227e-05, "loss": 0.0049, "step": 7300 }, { "epoch": 0.9114713216957606, "grad_norm": 0.0063847266137599945, "learning_rate": 1.63571072319202e-05, "loss": 0.0798, "step": 7310 }, { "epoch": 0.912718204488778, "grad_norm": 0.018450727686285973, "learning_rate": 1.635211970074813e-05, "loss": 0.1612, "step": 7320 }, { "epoch": 0.9139650872817955, "grad_norm": 0.16694334149360657, "learning_rate": 1.634713216957606e-05, "loss": 0.0057, "step": 7330 }, { "epoch": 0.9152119700748129, "grad_norm": 0.015072043053805828, "learning_rate": 1.6342144638403993e-05, "loss": 0.1719, "step": 7340 }, { "epoch": 0.9164588528678305, "grad_norm": 0.013904220424592495, "learning_rate": 1.6337157107231924e-05, "loss": 0.0412, "step": 7350 }, { "epoch": 0.9177057356608479, "grad_norm": 0.04418414086103439, "learning_rate": 1.633216957605985e-05, "loss": 0.0191, "step": 7360 }, { "epoch": 0.9189526184538653, "grad_norm": 8.246712684631348, "learning_rate": 1.6327182044887782e-05, "loss": 0.0405, "step": 7370 }, { "epoch": 0.9201995012468828, "grad_norm": 0.02130391076207161, "learning_rate": 1.6322194513715712e-05, "loss": 0.0013, "step": 7380 }, { "epoch": 0.9214463840399002, "grad_norm": 0.07500559091567993, "learning_rate": 1.6317206982543643e-05, "loss": 0.0669, "step": 7390 }, { "epoch": 0.9226932668329177, "grad_norm": 0.14592240750789642, "learning_rate": 1.631221945137157e-05, "loss": 0.0455, "step": 7400 }, { "epoch": 0.9239401496259352, "grad_norm": 0.12824711203575134, "learning_rate": 1.63072319201995e-05, "loss": 0.1218, "step": 7410 }, { "epoch": 0.9251870324189526, "grad_norm": 29.46427345275879, "learning_rate": 1.630224438902743e-05, "loss": 0.0587, "step": 7420 }, { "epoch": 0.92643391521197, "grad_norm": 0.04521302878856659, "learning_rate": 1.6297256857855362e-05, "loss": 0.0363, "step": 7430 }, { "epoch": 0.9276807980049875, "grad_norm": 0.17441897094249725, "learning_rate": 1.6292269326683293e-05, "loss": 0.0856, "step": 7440 }, { "epoch": 0.928927680798005, "grad_norm": 0.02611939236521721, "learning_rate": 1.6287281795511223e-05, "loss": 0.0012, "step": 7450 }, { "epoch": 0.9301745635910225, "grad_norm": 0.2440149188041687, "learning_rate": 1.6282294264339154e-05, "loss": 0.0674, "step": 7460 }, { "epoch": 0.9314214463840399, "grad_norm": 0.029897717759013176, "learning_rate": 1.6277306733167085e-05, "loss": 0.0296, "step": 7470 }, { "epoch": 0.9326683291770573, "grad_norm": 0.022027527913451195, "learning_rate": 1.6272319201995012e-05, "loss": 0.0202, "step": 7480 }, { "epoch": 0.9339152119700748, "grad_norm": 6.706576347351074, "learning_rate": 1.6267331670822942e-05, "loss": 0.0872, "step": 7490 }, { "epoch": 0.9351620947630923, "grad_norm": 0.02055455558001995, "learning_rate": 1.6262344139650873e-05, "loss": 0.0233, "step": 7500 }, { "epoch": 0.9364089775561097, "grad_norm": 0.25276321172714233, "learning_rate": 1.6257356608478804e-05, "loss": 0.0446, "step": 7510 }, { "epoch": 0.9376558603491272, "grad_norm": 0.08888930082321167, "learning_rate": 1.6252369077306734e-05, "loss": 0.0068, "step": 7520 }, { "epoch": 0.9389027431421446, "grad_norm": 1.5026854276657104, "learning_rate": 1.6247381546134665e-05, "loss": 0.0756, "step": 7530 }, { "epoch": 0.940149625935162, "grad_norm": 0.0982433333992958, "learning_rate": 1.6242394014962596e-05, "loss": 0.2113, "step": 7540 }, { "epoch": 0.9413965087281796, "grad_norm": 0.02279825694859028, "learning_rate": 1.6237406483790526e-05, "loss": 0.0353, "step": 7550 }, { "epoch": 0.942643391521197, "grad_norm": 0.6083737015724182, "learning_rate": 1.6232418952618457e-05, "loss": 0.0942, "step": 7560 }, { "epoch": 0.9438902743142145, "grad_norm": 0.08139675855636597, "learning_rate": 1.6227431421446384e-05, "loss": 0.0854, "step": 7570 }, { "epoch": 0.9451371571072319, "grad_norm": 10.0869140625, "learning_rate": 1.6222443890274315e-05, "loss": 0.0515, "step": 7580 }, { "epoch": 0.9463840399002493, "grad_norm": 1.962074875831604, "learning_rate": 1.6217456359102245e-05, "loss": 0.1042, "step": 7590 }, { "epoch": 0.9476309226932669, "grad_norm": 4.804053783416748, "learning_rate": 1.6212468827930176e-05, "loss": 0.0433, "step": 7600 }, { "epoch": 0.9488778054862843, "grad_norm": 0.2560361921787262, "learning_rate": 1.6207481296758107e-05, "loss": 0.0221, "step": 7610 }, { "epoch": 0.9501246882793017, "grad_norm": 0.042769655585289, "learning_rate": 1.6202493765586037e-05, "loss": 0.0464, "step": 7620 }, { "epoch": 0.9513715710723192, "grad_norm": 0.06933890283107758, "learning_rate": 1.6197506234413968e-05, "loss": 0.0249, "step": 7630 }, { "epoch": 0.9526184538653366, "grad_norm": 9.516181945800781, "learning_rate": 1.61925187032419e-05, "loss": 0.0285, "step": 7640 }, { "epoch": 0.9538653366583542, "grad_norm": 6.549726963043213, "learning_rate": 1.6187531172069826e-05, "loss": 0.0574, "step": 7650 }, { "epoch": 0.9551122194513716, "grad_norm": 0.3095720410346985, "learning_rate": 1.6182543640897756e-05, "loss": 0.001, "step": 7660 }, { "epoch": 0.956359102244389, "grad_norm": 38.27455139160156, "learning_rate": 1.6177556109725687e-05, "loss": 0.1111, "step": 7670 }, { "epoch": 0.9576059850374065, "grad_norm": 21.775671005249023, "learning_rate": 1.6172568578553617e-05, "loss": 0.1198, "step": 7680 }, { "epoch": 0.9588528678304239, "grad_norm": 3.9372639656066895, "learning_rate": 1.6167581047381548e-05, "loss": 0.0646, "step": 7690 }, { "epoch": 0.9600997506234414, "grad_norm": 0.037020809948444366, "learning_rate": 1.616259351620948e-05, "loss": 0.094, "step": 7700 }, { "epoch": 0.9613466334164589, "grad_norm": 26.675029754638672, "learning_rate": 1.615760598503741e-05, "loss": 0.0267, "step": 7710 }, { "epoch": 0.9625935162094763, "grad_norm": 0.5148128271102905, "learning_rate": 1.615261845386534e-05, "loss": 0.0383, "step": 7720 }, { "epoch": 0.9638403990024937, "grad_norm": 0.024793069809675217, "learning_rate": 1.614763092269327e-05, "loss": 0.0373, "step": 7730 }, { "epoch": 0.9650872817955112, "grad_norm": 0.036278147250413895, "learning_rate": 1.6142643391521198e-05, "loss": 0.0275, "step": 7740 }, { "epoch": 0.9663341645885287, "grad_norm": 37.560386657714844, "learning_rate": 1.613765586034913e-05, "loss": 0.1835, "step": 7750 }, { "epoch": 0.9675810473815462, "grad_norm": 0.013977359049022198, "learning_rate": 1.613266832917706e-05, "loss": 0.0191, "step": 7760 }, { "epoch": 0.9688279301745636, "grad_norm": 5.472465991973877, "learning_rate": 1.612768079800499e-05, "loss": 0.0751, "step": 7770 }, { "epoch": 0.970074812967581, "grad_norm": 0.008948855102062225, "learning_rate": 1.6122693266832917e-05, "loss": 0.0461, "step": 7780 }, { "epoch": 0.9713216957605985, "grad_norm": 0.03792479261755943, "learning_rate": 1.6117705735660847e-05, "loss": 0.1022, "step": 7790 }, { "epoch": 0.972568578553616, "grad_norm": 13.02475357055664, "learning_rate": 1.6112718204488778e-05, "loss": 0.0728, "step": 7800 }, { "epoch": 0.9738154613466334, "grad_norm": 1.2862738370895386, "learning_rate": 1.6107730673316712e-05, "loss": 0.0599, "step": 7810 }, { "epoch": 0.9750623441396509, "grad_norm": 0.07089701294898987, "learning_rate": 1.610274314214464e-05, "loss": 0.1124, "step": 7820 }, { "epoch": 0.9763092269326683, "grad_norm": 0.04689112678170204, "learning_rate": 1.609775561097257e-05, "loss": 0.0802, "step": 7830 }, { "epoch": 0.9775561097256857, "grad_norm": 0.01914226822555065, "learning_rate": 1.60927680798005e-05, "loss": 0.096, "step": 7840 }, { "epoch": 0.9788029925187033, "grad_norm": 0.05238153040409088, "learning_rate": 1.608778054862843e-05, "loss": 0.0627, "step": 7850 }, { "epoch": 0.9800498753117207, "grad_norm": 0.011097576469182968, "learning_rate": 1.608279301745636e-05, "loss": 0.1045, "step": 7860 }, { "epoch": 0.9812967581047382, "grad_norm": 0.01885097473859787, "learning_rate": 1.607780548628429e-05, "loss": 0.0469, "step": 7870 }, { "epoch": 0.9825436408977556, "grad_norm": 15.283947944641113, "learning_rate": 1.607281795511222e-05, "loss": 0.114, "step": 7880 }, { "epoch": 0.983790523690773, "grad_norm": 16.590517044067383, "learning_rate": 1.606783042394015e-05, "loss": 0.1792, "step": 7890 }, { "epoch": 0.9850374064837906, "grad_norm": 0.038308024406433105, "learning_rate": 1.606284289276808e-05, "loss": 0.155, "step": 7900 }, { "epoch": 0.986284289276808, "grad_norm": 0.04093620926141739, "learning_rate": 1.605785536159601e-05, "loss": 0.0326, "step": 7910 }, { "epoch": 0.9875311720698254, "grad_norm": 0.23573100566864014, "learning_rate": 1.6052867830423942e-05, "loss": 0.041, "step": 7920 }, { "epoch": 0.9887780548628429, "grad_norm": 0.023155642673373222, "learning_rate": 1.6047880299251873e-05, "loss": 0.0076, "step": 7930 }, { "epoch": 0.9900249376558603, "grad_norm": 0.01830834336578846, "learning_rate": 1.60428927680798e-05, "loss": 0.0559, "step": 7940 }, { "epoch": 0.9912718204488778, "grad_norm": 0.010360955260694027, "learning_rate": 1.603790523690773e-05, "loss": 0.0761, "step": 7950 }, { "epoch": 0.9925187032418953, "grad_norm": 32.94133377075195, "learning_rate": 1.603291770573566e-05, "loss": 0.1009, "step": 7960 }, { "epoch": 0.9937655860349127, "grad_norm": 5.8751630783081055, "learning_rate": 1.6027930174563592e-05, "loss": 0.0327, "step": 7970 }, { "epoch": 0.9950124688279302, "grad_norm": 0.03165939822793007, "learning_rate": 1.6022942643391522e-05, "loss": 0.0085, "step": 7980 }, { "epoch": 0.9962593516209476, "grad_norm": 33.61581039428711, "learning_rate": 1.6017955112219453e-05, "loss": 0.0331, "step": 7990 }, { "epoch": 0.9975062344139651, "grad_norm": 0.19740180671215057, "learning_rate": 1.6012967581047384e-05, "loss": 0.0377, "step": 8000 }, { "epoch": 0.9987531172069826, "grad_norm": 4.508144378662109, "learning_rate": 1.6007980049875314e-05, "loss": 0.0937, "step": 8010 }, { "epoch": 1.0, "grad_norm": 0.10585662722587585, "learning_rate": 1.6002992518703245e-05, "loss": 0.0008, "step": 8020 }, { "epoch": 1.0, "eval_accuracy": 0.990273707837147, "eval_loss": 0.045430995523929596, "eval_runtime": 17.5172, "eval_samples_per_second": 915.616, "eval_steps_per_second": 57.258, "step": 8020 }, { "epoch": 1.0012468827930174, "grad_norm": 0.32553237676620483, "learning_rate": 1.5998004987531172e-05, "loss": 0.0008, "step": 8030 }, { "epoch": 1.0024937655860349, "grad_norm": 42.72439193725586, "learning_rate": 1.5993017456359103e-05, "loss": 0.0373, "step": 8040 }, { "epoch": 1.0037406483790523, "grad_norm": 0.026368023827672005, "learning_rate": 1.5988029925187033e-05, "loss": 0.001, "step": 8050 }, { "epoch": 1.0049875311720697, "grad_norm": 0.006805025972425938, "learning_rate": 1.5983042394014964e-05, "loss": 0.0352, "step": 8060 }, { "epoch": 1.0062344139650872, "grad_norm": 0.007014899980276823, "learning_rate": 1.5978054862842895e-05, "loss": 0.0006, "step": 8070 }, { "epoch": 1.0074812967581048, "grad_norm": 0.007944006472826004, "learning_rate": 1.5973067331670825e-05, "loss": 0.0684, "step": 8080 }, { "epoch": 1.0087281795511223, "grad_norm": 0.13892197608947754, "learning_rate": 1.5968079800498756e-05, "loss": 0.0397, "step": 8090 }, { "epoch": 1.0099750623441397, "grad_norm": 0.007291398011147976, "learning_rate": 1.5963092269326687e-05, "loss": 0.028, "step": 8100 }, { "epoch": 1.0112219451371571, "grad_norm": 0.007756800390779972, "learning_rate": 1.5958104738154614e-05, "loss": 0.0021, "step": 8110 }, { "epoch": 1.0124688279301746, "grad_norm": 0.15053221583366394, "learning_rate": 1.5953117206982544e-05, "loss": 0.005, "step": 8120 }, { "epoch": 1.013715710723192, "grad_norm": 0.029048511758446693, "learning_rate": 1.5948129675810475e-05, "loss": 0.0087, "step": 8130 }, { "epoch": 1.0149625935162094, "grad_norm": 0.8408064842224121, "learning_rate": 1.5943142144638406e-05, "loss": 0.058, "step": 8140 }, { "epoch": 1.0162094763092269, "grad_norm": 0.011089330539107323, "learning_rate": 1.5938154613466333e-05, "loss": 0.0189, "step": 8150 }, { "epoch": 1.0174563591022443, "grad_norm": 0.1405433863401413, "learning_rate": 1.5933167082294267e-05, "loss": 0.0594, "step": 8160 }, { "epoch": 1.018703241895262, "grad_norm": 0.020905321463942528, "learning_rate": 1.5928179551122197e-05, "loss": 0.0215, "step": 8170 }, { "epoch": 1.0199501246882794, "grad_norm": 0.018605615943670273, "learning_rate": 1.5923192019950128e-05, "loss": 0.0087, "step": 8180 }, { "epoch": 1.0211970074812968, "grad_norm": 0.017665930092334747, "learning_rate": 1.5918204488778055e-05, "loss": 0.0563, "step": 8190 }, { "epoch": 1.0224438902743143, "grad_norm": 0.005314531270414591, "learning_rate": 1.5913216957605986e-05, "loss": 0.0773, "step": 8200 }, { "epoch": 1.0236907730673317, "grad_norm": 74.58911895751953, "learning_rate": 1.5908229426433917e-05, "loss": 0.063, "step": 8210 }, { "epoch": 1.0249376558603491, "grad_norm": 18.045536041259766, "learning_rate": 1.5903241895261847e-05, "loss": 0.06, "step": 8220 }, { "epoch": 1.0261845386533666, "grad_norm": 26.885278701782227, "learning_rate": 1.5898254364089778e-05, "loss": 0.0587, "step": 8230 }, { "epoch": 1.027431421446384, "grad_norm": 0.007998127490282059, "learning_rate": 1.5893266832917705e-05, "loss": 0.0427, "step": 8240 }, { "epoch": 1.0286783042394014, "grad_norm": 0.010522495023906231, "learning_rate": 1.5888279301745636e-05, "loss": 0.0416, "step": 8250 }, { "epoch": 1.0299251870324189, "grad_norm": 0.05275670811533928, "learning_rate": 1.5883291770573566e-05, "loss": 0.0016, "step": 8260 }, { "epoch": 1.0311720698254363, "grad_norm": 0.1919606626033783, "learning_rate": 1.5878304239401497e-05, "loss": 0.0005, "step": 8270 }, { "epoch": 1.032418952618454, "grad_norm": 0.13268867135047913, "learning_rate": 1.5873316708229428e-05, "loss": 0.0356, "step": 8280 }, { "epoch": 1.0336658354114714, "grad_norm": 0.007612032815814018, "learning_rate": 1.5868329177057358e-05, "loss": 0.0397, "step": 8290 }, { "epoch": 1.0349127182044888, "grad_norm": 2.647951364517212, "learning_rate": 1.586334164588529e-05, "loss": 0.0069, "step": 8300 }, { "epoch": 1.0361596009975063, "grad_norm": 0.402752548456192, "learning_rate": 1.585835411471322e-05, "loss": 0.0495, "step": 8310 }, { "epoch": 1.0374064837905237, "grad_norm": 0.4408397972583771, "learning_rate": 1.5853366583541147e-05, "loss": 0.0061, "step": 8320 }, { "epoch": 1.0386533665835411, "grad_norm": 0.057391341775655746, "learning_rate": 1.5848379052369077e-05, "loss": 0.0003, "step": 8330 }, { "epoch": 1.0399002493765586, "grad_norm": 0.18651100993156433, "learning_rate": 1.5843391521197008e-05, "loss": 0.0004, "step": 8340 }, { "epoch": 1.041147132169576, "grad_norm": 0.04775964096188545, "learning_rate": 1.583840399002494e-05, "loss": 0.0992, "step": 8350 }, { "epoch": 1.0423940149625934, "grad_norm": 49.80867385864258, "learning_rate": 1.583341645885287e-05, "loss": 0.0121, "step": 8360 }, { "epoch": 1.043640897755611, "grad_norm": 0.006509189493954182, "learning_rate": 1.58284289276808e-05, "loss": 0.0172, "step": 8370 }, { "epoch": 1.0448877805486285, "grad_norm": 0.08325016498565674, "learning_rate": 1.582344139650873e-05, "loss": 0.0464, "step": 8380 }, { "epoch": 1.046134663341646, "grad_norm": 0.007545843254774809, "learning_rate": 1.581845386533666e-05, "loss": 0.0351, "step": 8390 }, { "epoch": 1.0473815461346634, "grad_norm": 0.013103622011840343, "learning_rate": 1.5813466334164588e-05, "loss": 0.0497, "step": 8400 }, { "epoch": 1.0486284289276808, "grad_norm": 83.83370208740234, "learning_rate": 1.580847880299252e-05, "loss": 0.0322, "step": 8410 }, { "epoch": 1.0498753117206983, "grad_norm": 0.11098748445510864, "learning_rate": 1.580349127182045e-05, "loss": 0.0285, "step": 8420 }, { "epoch": 1.0511221945137157, "grad_norm": 0.003446460235863924, "learning_rate": 1.579850374064838e-05, "loss": 0.0917, "step": 8430 }, { "epoch": 1.0523690773067331, "grad_norm": 0.4329495131969452, "learning_rate": 1.579351620947631e-05, "loss": 0.004, "step": 8440 }, { "epoch": 1.0536159600997506, "grad_norm": 0.017273055389523506, "learning_rate": 1.578852867830424e-05, "loss": 0.0577, "step": 8450 }, { "epoch": 1.054862842892768, "grad_norm": 0.05111582577228546, "learning_rate": 1.5783541147132172e-05, "loss": 0.0187, "step": 8460 }, { "epoch": 1.0561097256857854, "grad_norm": 19.794246673583984, "learning_rate": 1.5778553615960103e-05, "loss": 0.0926, "step": 8470 }, { "epoch": 1.057356608478803, "grad_norm": 0.003474411554634571, "learning_rate": 1.5773566084788033e-05, "loss": 0.0509, "step": 8480 }, { "epoch": 1.0586034912718205, "grad_norm": 0.0037080910988152027, "learning_rate": 1.576857855361596e-05, "loss": 0.0011, "step": 8490 }, { "epoch": 1.059850374064838, "grad_norm": 0.19036197662353516, "learning_rate": 1.576359102244389e-05, "loss": 0.0427, "step": 8500 }, { "epoch": 1.0610972568578554, "grad_norm": 0.0166669599711895, "learning_rate": 1.575860349127182e-05, "loss": 0.0569, "step": 8510 }, { "epoch": 1.0623441396508728, "grad_norm": 0.02685883827507496, "learning_rate": 1.5753615960099752e-05, "loss": 0.0405, "step": 8520 }, { "epoch": 1.0635910224438903, "grad_norm": 0.003904626239091158, "learning_rate": 1.5748628428927683e-05, "loss": 0.045, "step": 8530 }, { "epoch": 1.0648379052369077, "grad_norm": 0.045057203620672226, "learning_rate": 1.5743640897755613e-05, "loss": 0.0318, "step": 8540 }, { "epoch": 1.0660847880299251, "grad_norm": 37.062721252441406, "learning_rate": 1.5738653366583544e-05, "loss": 0.0082, "step": 8550 }, { "epoch": 1.0673316708229426, "grad_norm": 2.563587188720703, "learning_rate": 1.5733665835411475e-05, "loss": 0.0009, "step": 8560 }, { "epoch": 1.0685785536159602, "grad_norm": 16.887672424316406, "learning_rate": 1.5728678304239402e-05, "loss": 0.1206, "step": 8570 }, { "epoch": 1.0698254364089776, "grad_norm": 0.2500365376472473, "learning_rate": 1.5723690773067333e-05, "loss": 0.044, "step": 8580 }, { "epoch": 1.071072319201995, "grad_norm": 0.046570807695388794, "learning_rate": 1.5718703241895263e-05, "loss": 0.0458, "step": 8590 }, { "epoch": 1.0723192019950125, "grad_norm": 6.890371322631836, "learning_rate": 1.5713715710723194e-05, "loss": 0.0034, "step": 8600 }, { "epoch": 1.07356608478803, "grad_norm": 0.011060179211199284, "learning_rate": 1.570872817955112e-05, "loss": 0.1253, "step": 8610 }, { "epoch": 1.0748129675810474, "grad_norm": 0.009608889929950237, "learning_rate": 1.570374064837905e-05, "loss": 0.0238, "step": 8620 }, { "epoch": 1.0760598503740648, "grad_norm": 5.49344539642334, "learning_rate": 1.5698753117206986e-05, "loss": 0.1225, "step": 8630 }, { "epoch": 1.0773067331670823, "grad_norm": 0.20928065478801727, "learning_rate": 1.5693765586034916e-05, "loss": 0.0018, "step": 8640 }, { "epoch": 1.0785536159600997, "grad_norm": 0.00485580787062645, "learning_rate": 1.5688778054862844e-05, "loss": 0.0448, "step": 8650 }, { "epoch": 1.0798004987531171, "grad_norm": 0.007073690649122, "learning_rate": 1.5684289276807983e-05, "loss": 0.0339, "step": 8660 }, { "epoch": 1.0810473815461346, "grad_norm": 24.870899200439453, "learning_rate": 1.567930174563591e-05, "loss": 0.0324, "step": 8670 }, { "epoch": 1.0822942643391522, "grad_norm": 0.05342073366045952, "learning_rate": 1.567431421446384e-05, "loss": 0.0957, "step": 8680 }, { "epoch": 1.0835411471321696, "grad_norm": 0.01296048704534769, "learning_rate": 1.5669326683291772e-05, "loss": 0.0448, "step": 8690 }, { "epoch": 1.084788029925187, "grad_norm": 0.008593573234975338, "learning_rate": 1.5664339152119703e-05, "loss": 0.0046, "step": 8700 }, { "epoch": 1.0860349127182045, "grad_norm": 0.13003209233283997, "learning_rate": 1.5659351620947633e-05, "loss": 0.0026, "step": 8710 }, { "epoch": 1.087281795511222, "grad_norm": 0.06288526207208633, "learning_rate": 1.5654364089775564e-05, "loss": 0.0531, "step": 8720 }, { "epoch": 1.0885286783042394, "grad_norm": 0.01975036971271038, "learning_rate": 1.5649376558603494e-05, "loss": 0.0335, "step": 8730 }, { "epoch": 1.0897755610972568, "grad_norm": 0.0037824225146323442, "learning_rate": 1.5644389027431425e-05, "loss": 0.0229, "step": 8740 }, { "epoch": 1.0910224438902743, "grad_norm": 0.01450043823570013, "learning_rate": 1.5639401496259352e-05, "loss": 0.0006, "step": 8750 }, { "epoch": 1.0922693266832917, "grad_norm": 0.015385148115456104, "learning_rate": 1.5634413965087283e-05, "loss": 0.0302, "step": 8760 }, { "epoch": 1.0935162094763093, "grad_norm": 0.04634137824177742, "learning_rate": 1.5629426433915213e-05, "loss": 0.0027, "step": 8770 }, { "epoch": 1.0947630922693268, "grad_norm": 0.01995709352195263, "learning_rate": 1.5624438902743144e-05, "loss": 0.0533, "step": 8780 }, { "epoch": 1.0960099750623442, "grad_norm": 0.9856222867965698, "learning_rate": 1.561945137157107e-05, "loss": 0.109, "step": 8790 }, { "epoch": 1.0972568578553616, "grad_norm": 0.011172446422278881, "learning_rate": 1.5614463840399002e-05, "loss": 0.0347, "step": 8800 }, { "epoch": 1.098503740648379, "grad_norm": 63.454627990722656, "learning_rate": 1.5609476309226933e-05, "loss": 0.0124, "step": 8810 }, { "epoch": 1.0997506234413965, "grad_norm": 0.06844215095043182, "learning_rate": 1.5604488778054863e-05, "loss": 0.0149, "step": 8820 }, { "epoch": 1.100997506234414, "grad_norm": 13.142332077026367, "learning_rate": 1.5599501246882794e-05, "loss": 0.0348, "step": 8830 }, { "epoch": 1.1022443890274314, "grad_norm": 0.0036785067059099674, "learning_rate": 1.5594513715710724e-05, "loss": 0.0002, "step": 8840 }, { "epoch": 1.1034912718204488, "grad_norm": 0.002891476731747389, "learning_rate": 1.5589526184538655e-05, "loss": 0.0345, "step": 8850 }, { "epoch": 1.1047381546134662, "grad_norm": 0.021872134879231453, "learning_rate": 1.5584538653366586e-05, "loss": 0.0163, "step": 8860 }, { "epoch": 1.1059850374064837, "grad_norm": 0.09742742776870728, "learning_rate": 1.5579551122194513e-05, "loss": 0.1558, "step": 8870 }, { "epoch": 1.1072319201995013, "grad_norm": 0.005483236163854599, "learning_rate": 1.5574563591022444e-05, "loss": 0.0449, "step": 8880 }, { "epoch": 1.1084788029925188, "grad_norm": 14.885406494140625, "learning_rate": 1.5569576059850374e-05, "loss": 0.0061, "step": 8890 }, { "epoch": 1.1097256857855362, "grad_norm": 5.764937400817871, "learning_rate": 1.5564588528678305e-05, "loss": 0.1297, "step": 8900 }, { "epoch": 1.1109725685785536, "grad_norm": 0.0030513282399624586, "learning_rate": 1.5559600997506235e-05, "loss": 0.1447, "step": 8910 }, { "epoch": 1.112219451371571, "grad_norm": 0.19194713234901428, "learning_rate": 1.5554613466334166e-05, "loss": 0.0968, "step": 8920 }, { "epoch": 1.1134663341645885, "grad_norm": 0.012002095580101013, "learning_rate": 1.5549625935162097e-05, "loss": 0.0003, "step": 8930 }, { "epoch": 1.114713216957606, "grad_norm": 0.005370179656893015, "learning_rate": 1.5544638403990027e-05, "loss": 0.0005, "step": 8940 }, { "epoch": 1.1159600997506234, "grad_norm": 0.054277919232845306, "learning_rate": 1.5539650872817958e-05, "loss": 0.0459, "step": 8950 }, { "epoch": 1.1172069825436408, "grad_norm": 0.006045327056199312, "learning_rate": 1.5534663341645885e-05, "loss": 0.064, "step": 8960 }, { "epoch": 1.1184538653366585, "grad_norm": 0.020867647603154182, "learning_rate": 1.5529675810473816e-05, "loss": 0.0516, "step": 8970 }, { "epoch": 1.119700748129676, "grad_norm": 0.011230267584323883, "learning_rate": 1.5524688279301746e-05, "loss": 0.001, "step": 8980 }, { "epoch": 1.1209476309226933, "grad_norm": 0.05982297286391258, "learning_rate": 1.5519700748129677e-05, "loss": 0.0004, "step": 8990 }, { "epoch": 1.1221945137157108, "grad_norm": 9.108420372009277, "learning_rate": 1.5514713216957608e-05, "loss": 0.0456, "step": 9000 }, { "epoch": 1.1234413965087282, "grad_norm": 0.022048041224479675, "learning_rate": 1.5509725685785538e-05, "loss": 0.0168, "step": 9010 }, { "epoch": 1.1246882793017456, "grad_norm": 0.010482951998710632, "learning_rate": 1.550473815461347e-05, "loss": 0.022, "step": 9020 }, { "epoch": 1.125935162094763, "grad_norm": 57.64718246459961, "learning_rate": 1.54997506234414e-05, "loss": 0.0169, "step": 9030 }, { "epoch": 1.1271820448877805, "grad_norm": 0.008993241004645824, "learning_rate": 1.5494763092269327e-05, "loss": 0.0206, "step": 9040 }, { "epoch": 1.128428927680798, "grad_norm": 0.4927586019039154, "learning_rate": 1.5489775561097257e-05, "loss": 0.0098, "step": 9050 }, { "epoch": 1.1296758104738154, "grad_norm": 0.0061515928246080875, "learning_rate": 1.5484788029925188e-05, "loss": 0.134, "step": 9060 }, { "epoch": 1.1309226932668328, "grad_norm": 0.20207062363624573, "learning_rate": 1.547980049875312e-05, "loss": 0.1086, "step": 9070 }, { "epoch": 1.1321695760598505, "grad_norm": 0.006458123680204153, "learning_rate": 1.547481296758105e-05, "loss": 0.0006, "step": 9080 }, { "epoch": 1.133416458852868, "grad_norm": 0.19042325019836426, "learning_rate": 1.546982543640898e-05, "loss": 0.0299, "step": 9090 }, { "epoch": 1.1346633416458853, "grad_norm": 0.015476653352379799, "learning_rate": 1.546483790523691e-05, "loss": 0.0454, "step": 9100 }, { "epoch": 1.1359102244389028, "grad_norm": 0.015153790824115276, "learning_rate": 1.545985037406484e-05, "loss": 0.0818, "step": 9110 }, { "epoch": 1.1371571072319202, "grad_norm": 19.77805519104004, "learning_rate": 1.5454862842892768e-05, "loss": 0.1194, "step": 9120 }, { "epoch": 1.1384039900249376, "grad_norm": 0.10593175888061523, "learning_rate": 1.54498753117207e-05, "loss": 0.0019, "step": 9130 }, { "epoch": 1.139650872817955, "grad_norm": 0.013980482704937458, "learning_rate": 1.544488778054863e-05, "loss": 0.0429, "step": 9140 }, { "epoch": 1.1408977556109725, "grad_norm": 0.025008754804730415, "learning_rate": 1.543990024937656e-05, "loss": 0.0121, "step": 9150 }, { "epoch": 1.14214463840399, "grad_norm": 0.19609186053276062, "learning_rate": 1.543491271820449e-05, "loss": 0.0212, "step": 9160 }, { "epoch": 1.1433915211970076, "grad_norm": 0.013282079249620438, "learning_rate": 1.5429925187032418e-05, "loss": 0.0507, "step": 9170 }, { "epoch": 1.144638403990025, "grad_norm": 0.003723228583112359, "learning_rate": 1.5424937655860352e-05, "loss": 0.0539, "step": 9180 }, { "epoch": 1.1458852867830425, "grad_norm": 5.059774398803711, "learning_rate": 1.5419950124688283e-05, "loss": 0.0509, "step": 9190 }, { "epoch": 1.14713216957606, "grad_norm": 0.01049418281763792, "learning_rate": 1.5414962593516213e-05, "loss": 0.1348, "step": 9200 }, { "epoch": 1.1483790523690773, "grad_norm": 0.020952526479959488, "learning_rate": 1.540997506234414e-05, "loss": 0.0618, "step": 9210 }, { "epoch": 1.1496259351620948, "grad_norm": 0.03677148371934891, "learning_rate": 1.540498753117207e-05, "loss": 0.0568, "step": 9220 }, { "epoch": 1.1508728179551122, "grad_norm": 0.006018058396875858, "learning_rate": 1.54e-05, "loss": 0.0224, "step": 9230 }, { "epoch": 1.1521197007481296, "grad_norm": 16.032939910888672, "learning_rate": 1.5395012468827932e-05, "loss": 0.0931, "step": 9240 }, { "epoch": 1.153366583541147, "grad_norm": 0.0084098344668746, "learning_rate": 1.539002493765586e-05, "loss": 0.0289, "step": 9250 }, { "epoch": 1.1546134663341645, "grad_norm": 0.015302467159926891, "learning_rate": 1.538503740648379e-05, "loss": 0.0055, "step": 9260 }, { "epoch": 1.155860349127182, "grad_norm": 0.08080907166004181, "learning_rate": 1.538004987531172e-05, "loss": 0.0461, "step": 9270 }, { "epoch": 1.1571072319201996, "grad_norm": 0.13290953636169434, "learning_rate": 1.537506234413965e-05, "loss": 0.0356, "step": 9280 }, { "epoch": 1.158354114713217, "grad_norm": 0.007370785344392061, "learning_rate": 1.5370074812967582e-05, "loss": 0.0503, "step": 9290 }, { "epoch": 1.1596009975062345, "grad_norm": 0.4330526292324066, "learning_rate": 1.5365087281795513e-05, "loss": 0.0256, "step": 9300 }, { "epoch": 1.160847880299252, "grad_norm": 0.03412933647632599, "learning_rate": 1.5360099750623443e-05, "loss": 0.0217, "step": 9310 }, { "epoch": 1.1620947630922693, "grad_norm": 0.03334099426865578, "learning_rate": 1.5355112219451374e-05, "loss": 0.0815, "step": 9320 }, { "epoch": 1.1633416458852868, "grad_norm": 0.007466933690011501, "learning_rate": 1.53501246882793e-05, "loss": 0.0378, "step": 9330 }, { "epoch": 1.1645885286783042, "grad_norm": 11.656465530395508, "learning_rate": 1.534513715710723e-05, "loss": 0.0019, "step": 9340 }, { "epoch": 1.1658354114713216, "grad_norm": 0.007075449451804161, "learning_rate": 1.5340149625935162e-05, "loss": 0.0461, "step": 9350 }, { "epoch": 1.167082294264339, "grad_norm": 0.012630700133740902, "learning_rate": 1.5335162094763093e-05, "loss": 0.0148, "step": 9360 }, { "epoch": 1.1683291770573567, "grad_norm": 0.0029928339645266533, "learning_rate": 1.5330174563591024e-05, "loss": 0.0267, "step": 9370 }, { "epoch": 1.1695760598503742, "grad_norm": 0.007222963962703943, "learning_rate": 1.5325187032418954e-05, "loss": 0.0554, "step": 9380 }, { "epoch": 1.1708229426433916, "grad_norm": 26.852447509765625, "learning_rate": 1.5320199501246885e-05, "loss": 0.0709, "step": 9390 }, { "epoch": 1.172069825436409, "grad_norm": 4.241142272949219, "learning_rate": 1.5315211970074815e-05, "loss": 0.082, "step": 9400 }, { "epoch": 1.1733167082294265, "grad_norm": 0.007325719576328993, "learning_rate": 1.5310224438902746e-05, "loss": 0.0004, "step": 9410 }, { "epoch": 1.174563591022444, "grad_norm": 3.9386239051818848, "learning_rate": 1.5305236907730673e-05, "loss": 0.0012, "step": 9420 }, { "epoch": 1.1758104738154613, "grad_norm": 0.01459722314029932, "learning_rate": 1.5300249376558604e-05, "loss": 0.0516, "step": 9430 }, { "epoch": 1.1770573566084788, "grad_norm": 2.8290350437164307, "learning_rate": 1.5295261845386534e-05, "loss": 0.0472, "step": 9440 }, { "epoch": 1.1783042394014962, "grad_norm": 0.013476746156811714, "learning_rate": 1.5290274314214465e-05, "loss": 0.0376, "step": 9450 }, { "epoch": 1.1795511221945136, "grad_norm": 0.18037371337413788, "learning_rate": 1.5285286783042396e-05, "loss": 0.0848, "step": 9460 }, { "epoch": 1.180798004987531, "grad_norm": 1.987277865409851, "learning_rate": 1.5280299251870326e-05, "loss": 0.0579, "step": 9470 }, { "epoch": 1.1820448877805487, "grad_norm": 0.14287962019443512, "learning_rate": 1.5275311720698257e-05, "loss": 0.0394, "step": 9480 }, { "epoch": 1.1832917705735662, "grad_norm": 0.09158501774072647, "learning_rate": 1.5270324189526188e-05, "loss": 0.0104, "step": 9490 }, { "epoch": 1.1845386533665836, "grad_norm": 0.1351417750120163, "learning_rate": 1.5265336658354115e-05, "loss": 0.0261, "step": 9500 }, { "epoch": 1.185785536159601, "grad_norm": 0.029170798137784004, "learning_rate": 1.5260349127182045e-05, "loss": 0.1452, "step": 9510 }, { "epoch": 1.1870324189526185, "grad_norm": 0.015503687784075737, "learning_rate": 1.5255361596009976e-05, "loss": 0.0232, "step": 9520 }, { "epoch": 1.188279301745636, "grad_norm": 0.00761051382869482, "learning_rate": 1.5250374064837907e-05, "loss": 0.0007, "step": 9530 }, { "epoch": 1.1895261845386533, "grad_norm": 0.03034345805644989, "learning_rate": 1.5245386533665836e-05, "loss": 0.0061, "step": 9540 }, { "epoch": 1.1907730673316708, "grad_norm": 0.20128989219665527, "learning_rate": 1.5240399002493766e-05, "loss": 0.1083, "step": 9550 }, { "epoch": 1.1920199501246882, "grad_norm": 1.2821415662765503, "learning_rate": 1.5235411471321697e-05, "loss": 0.0026, "step": 9560 }, { "epoch": 1.1932668329177059, "grad_norm": 0.18182295560836792, "learning_rate": 1.5230423940149627e-05, "loss": 0.0444, "step": 9570 }, { "epoch": 1.1945137157107233, "grad_norm": 0.008188813924789429, "learning_rate": 1.5225436408977556e-05, "loss": 0.0649, "step": 9580 }, { "epoch": 1.1957605985037407, "grad_norm": 0.0054941922426223755, "learning_rate": 1.5220448877805487e-05, "loss": 0.0004, "step": 9590 }, { "epoch": 1.1970074812967582, "grad_norm": 0.005160823930054903, "learning_rate": 1.5215461346633418e-05, "loss": 0.0357, "step": 9600 }, { "epoch": 1.1982543640897756, "grad_norm": 0.010838705115020275, "learning_rate": 1.5210473815461348e-05, "loss": 0.049, "step": 9610 }, { "epoch": 1.199501246882793, "grad_norm": 0.008219705894589424, "learning_rate": 1.5205486284289277e-05, "loss": 0.0287, "step": 9620 }, { "epoch": 1.2007481296758105, "grad_norm": 0.12174854427576065, "learning_rate": 1.5200498753117208e-05, "loss": 0.0009, "step": 9630 }, { "epoch": 1.201995012468828, "grad_norm": 0.006584263406693935, "learning_rate": 1.5195511221945138e-05, "loss": 0.0149, "step": 9640 }, { "epoch": 1.2032418952618453, "grad_norm": 0.0048215193673968315, "learning_rate": 1.5190523690773069e-05, "loss": 0.002, "step": 9650 }, { "epoch": 1.2044887780548628, "grad_norm": 0.08884479105472565, "learning_rate": 1.5185536159601e-05, "loss": 0.0005, "step": 9660 }, { "epoch": 1.2057356608478802, "grad_norm": 27.005584716796875, "learning_rate": 1.5180548628428929e-05, "loss": 0.0372, "step": 9670 }, { "epoch": 1.2069825436408976, "grad_norm": 0.00920418743044138, "learning_rate": 1.517556109725686e-05, "loss": 0.0308, "step": 9680 }, { "epoch": 1.2082294264339153, "grad_norm": 0.009550421498715878, "learning_rate": 1.517057356608479e-05, "loss": 0.0009, "step": 9690 }, { "epoch": 1.2094763092269327, "grad_norm": 0.8975104689598083, "learning_rate": 1.516558603491272e-05, "loss": 0.0436, "step": 9700 }, { "epoch": 1.2107231920199502, "grad_norm": 0.02298056147992611, "learning_rate": 1.516059850374065e-05, "loss": 0.0955, "step": 9710 }, { "epoch": 1.2119700748129676, "grad_norm": 0.002887186361476779, "learning_rate": 1.515561097256858e-05, "loss": 0.0004, "step": 9720 }, { "epoch": 1.213216957605985, "grad_norm": 0.015636254101991653, "learning_rate": 1.515062344139651e-05, "loss": 0.0628, "step": 9730 }, { "epoch": 1.2144638403990025, "grad_norm": 24.223073959350586, "learning_rate": 1.5145635910224441e-05, "loss": 0.0733, "step": 9740 }, { "epoch": 1.21571072319202, "grad_norm": 0.024109138175845146, "learning_rate": 1.514064837905237e-05, "loss": 0.0172, "step": 9750 }, { "epoch": 1.2169576059850373, "grad_norm": 0.006183837074786425, "learning_rate": 1.51356608478803e-05, "loss": 0.1089, "step": 9760 }, { "epoch": 1.218204488778055, "grad_norm": 0.013143481686711311, "learning_rate": 1.5130673316708231e-05, "loss": 0.0008, "step": 9770 }, { "epoch": 1.2194513715710724, "grad_norm": 8.031755447387695, "learning_rate": 1.5125685785536162e-05, "loss": 0.0874, "step": 9780 }, { "epoch": 1.2206982543640899, "grad_norm": 0.026733698323369026, "learning_rate": 1.5120698254364091e-05, "loss": 0.0888, "step": 9790 }, { "epoch": 1.2219451371571073, "grad_norm": 0.8957703113555908, "learning_rate": 1.5115710723192022e-05, "loss": 0.0762, "step": 9800 }, { "epoch": 1.2231920199501247, "grad_norm": 5.752279758453369, "learning_rate": 1.5110723192019952e-05, "loss": 0.0462, "step": 9810 }, { "epoch": 1.2244389027431422, "grad_norm": 0.04652789980173111, "learning_rate": 1.5105735660847883e-05, "loss": 0.0072, "step": 9820 }, { "epoch": 1.2256857855361596, "grad_norm": 0.035636574029922485, "learning_rate": 1.510074812967581e-05, "loss": 0.0919, "step": 9830 }, { "epoch": 1.226932668329177, "grad_norm": 0.010265025310218334, "learning_rate": 1.509576059850374e-05, "loss": 0.0213, "step": 9840 }, { "epoch": 1.2281795511221945, "grad_norm": 31.54733657836914, "learning_rate": 1.5090773067331673e-05, "loss": 0.0269, "step": 9850 }, { "epoch": 1.229426433915212, "grad_norm": 0.1989414095878601, "learning_rate": 1.5085785536159604e-05, "loss": 0.0064, "step": 9860 }, { "epoch": 1.2306733167082293, "grad_norm": 0.182249516248703, "learning_rate": 1.508079800498753e-05, "loss": 0.0546, "step": 9870 }, { "epoch": 1.2319201995012468, "grad_norm": 0.022076517343521118, "learning_rate": 1.5075810473815461e-05, "loss": 0.0712, "step": 9880 }, { "epoch": 1.2331670822942644, "grad_norm": 0.8623358011245728, "learning_rate": 1.5070822942643392e-05, "loss": 0.0008, "step": 9890 }, { "epoch": 1.2344139650872819, "grad_norm": 0.004727656487375498, "learning_rate": 1.5065835411471323e-05, "loss": 0.017, "step": 9900 }, { "epoch": 1.2356608478802993, "grad_norm": 0.03601466864347458, "learning_rate": 1.5060847880299255e-05, "loss": 0.0348, "step": 9910 }, { "epoch": 1.2369077306733167, "grad_norm": 0.004477897193282843, "learning_rate": 1.5055860349127182e-05, "loss": 0.0005, "step": 9920 }, { "epoch": 1.2381546134663342, "grad_norm": 25.25444221496582, "learning_rate": 1.5050872817955113e-05, "loss": 0.14, "step": 9930 }, { "epoch": 1.2394014962593516, "grad_norm": 0.05620008334517479, "learning_rate": 1.5045885286783043e-05, "loss": 0.0469, "step": 9940 }, { "epoch": 1.240648379052369, "grad_norm": 0.049697790294885635, "learning_rate": 1.5040897755610974e-05, "loss": 0.0004, "step": 9950 }, { "epoch": 1.2418952618453865, "grad_norm": 0.041144270449876785, "learning_rate": 1.5035910224438903e-05, "loss": 0.0318, "step": 9960 }, { "epoch": 1.2431421446384041, "grad_norm": 0.009246827103197575, "learning_rate": 1.5030922693266834e-05, "loss": 0.0779, "step": 9970 }, { "epoch": 1.2443890274314215, "grad_norm": 0.18010537326335907, "learning_rate": 1.5025935162094764e-05, "loss": 0.0013, "step": 9980 }, { "epoch": 1.245635910224439, "grad_norm": 0.0025472913403064013, "learning_rate": 1.5020947630922695e-05, "loss": 0.0244, "step": 9990 }, { "epoch": 1.2468827930174564, "grad_norm": 35.89253616333008, "learning_rate": 1.5015960099750624e-05, "loss": 0.0626, "step": 10000 }, { "epoch": 1.2481296758104738, "grad_norm": 0.004349586088210344, "learning_rate": 1.5010972568578554e-05, "loss": 0.0659, "step": 10010 }, { "epoch": 1.2493765586034913, "grad_norm": 0.0075108809396624565, "learning_rate": 1.5005985037406485e-05, "loss": 0.0926, "step": 10020 }, { "epoch": 1.2506234413965087, "grad_norm": 21.287639617919922, "learning_rate": 1.5000997506234416e-05, "loss": 0.0954, "step": 10030 }, { "epoch": 1.2518703241895262, "grad_norm": 0.01195969246327877, "learning_rate": 1.4996009975062345e-05, "loss": 0.0552, "step": 10040 }, { "epoch": 1.2531172069825436, "grad_norm": 0.0076157813891768456, "learning_rate": 1.4991022443890275e-05, "loss": 0.0252, "step": 10050 }, { "epoch": 1.254364089775561, "grad_norm": 0.005119314882904291, "learning_rate": 1.4986034912718206e-05, "loss": 0.0726, "step": 10060 }, { "epoch": 1.2556109725685785, "grad_norm": 0.005796648096293211, "learning_rate": 1.4981047381546136e-05, "loss": 0.0626, "step": 10070 }, { "epoch": 1.2568578553615959, "grad_norm": 0.007139543071389198, "learning_rate": 1.4976059850374065e-05, "loss": 0.0278, "step": 10080 }, { "epoch": 1.2581047381546135, "grad_norm": 0.003264715662226081, "learning_rate": 1.4971072319201996e-05, "loss": 0.0508, "step": 10090 }, { "epoch": 1.259351620947631, "grad_norm": 34.05919647216797, "learning_rate": 1.4966084788029927e-05, "loss": 0.0393, "step": 10100 }, { "epoch": 1.2605985037406484, "grad_norm": 0.0849919244647026, "learning_rate": 1.4961097256857857e-05, "loss": 0.0342, "step": 10110 }, { "epoch": 1.2618453865336658, "grad_norm": 0.14518681168556213, "learning_rate": 1.4956109725685786e-05, "loss": 0.0008, "step": 10120 }, { "epoch": 1.2630922693266833, "grad_norm": 0.1475505381822586, "learning_rate": 1.4951122194513717e-05, "loss": 0.018, "step": 10130 }, { "epoch": 1.2643391521197007, "grad_norm": 0.006064166314899921, "learning_rate": 1.4946134663341647e-05, "loss": 0.0796, "step": 10140 }, { "epoch": 1.2655860349127181, "grad_norm": 0.004532515071332455, "learning_rate": 1.4941147132169578e-05, "loss": 0.0015, "step": 10150 }, { "epoch": 1.2668329177057356, "grad_norm": 0.04352598637342453, "learning_rate": 1.4936159600997509e-05, "loss": 0.0003, "step": 10160 }, { "epoch": 1.2680798004987532, "grad_norm": 0.49034398794174194, "learning_rate": 1.4931172069825438e-05, "loss": 0.0086, "step": 10170 }, { "epoch": 1.2693266832917707, "grad_norm": 0.003610881045460701, "learning_rate": 1.4926184538653368e-05, "loss": 0.0022, "step": 10180 }, { "epoch": 1.270573566084788, "grad_norm": 0.003048134967684746, "learning_rate": 1.4921197007481299e-05, "loss": 0.1527, "step": 10190 }, { "epoch": 1.2718204488778055, "grad_norm": 0.06748661398887634, "learning_rate": 1.491620947630923e-05, "loss": 0.0343, "step": 10200 }, { "epoch": 1.273067331670823, "grad_norm": 0.014995983801782131, "learning_rate": 1.4911221945137158e-05, "loss": 0.017, "step": 10210 }, { "epoch": 1.2743142144638404, "grad_norm": 0.004312419332563877, "learning_rate": 1.4906234413965089e-05, "loss": 0.0005, "step": 10220 }, { "epoch": 1.2755610972568578, "grad_norm": 3.3320200443267822, "learning_rate": 1.490124688279302e-05, "loss": 0.0468, "step": 10230 }, { "epoch": 1.2768079800498753, "grad_norm": 0.144317626953125, "learning_rate": 1.489625935162095e-05, "loss": 0.0271, "step": 10240 }, { "epoch": 1.2780548628428927, "grad_norm": 1.370514154434204, "learning_rate": 1.4891271820448877e-05, "loss": 0.0018, "step": 10250 }, { "epoch": 1.2793017456359101, "grad_norm": 0.012159171514213085, "learning_rate": 1.488628428927681e-05, "loss": 0.0184, "step": 10260 }, { "epoch": 1.2805486284289276, "grad_norm": 4.12328577041626, "learning_rate": 1.488129675810474e-05, "loss": 0.0019, "step": 10270 }, { "epoch": 1.281795511221945, "grad_norm": 0.1401759535074234, "learning_rate": 1.4876309226932671e-05, "loss": 0.0284, "step": 10280 }, { "epoch": 1.2830423940149627, "grad_norm": 0.011864711530506611, "learning_rate": 1.4871321695760598e-05, "loss": 0.0447, "step": 10290 }, { "epoch": 1.28428927680798, "grad_norm": 0.021031511947512627, "learning_rate": 1.4866334164588529e-05, "loss": 0.0722, "step": 10300 }, { "epoch": 1.2855361596009975, "grad_norm": 0.021378498524427414, "learning_rate": 1.486134663341646e-05, "loss": 0.028, "step": 10310 }, { "epoch": 1.286783042394015, "grad_norm": 0.014980390667915344, "learning_rate": 1.4856359102244392e-05, "loss": 0.031, "step": 10320 }, { "epoch": 1.2880299251870324, "grad_norm": 0.0073977308347821236, "learning_rate": 1.4851371571072319e-05, "loss": 0.0417, "step": 10330 }, { "epoch": 1.2892768079800498, "grad_norm": 17.954195022583008, "learning_rate": 1.484638403990025e-05, "loss": 0.0598, "step": 10340 }, { "epoch": 1.2905236907730673, "grad_norm": 0.3461831510066986, "learning_rate": 1.484139650872818e-05, "loss": 0.0336, "step": 10350 }, { "epoch": 1.2917705735660847, "grad_norm": 28.57352638244629, "learning_rate": 1.483640897755611e-05, "loss": 0.0314, "step": 10360 }, { "epoch": 1.2930174563591024, "grad_norm": 0.00324003747664392, "learning_rate": 1.483142144638404e-05, "loss": 0.0225, "step": 10370 }, { "epoch": 1.2942643391521198, "grad_norm": 31.616849899291992, "learning_rate": 1.482643391521197e-05, "loss": 0.0034, "step": 10380 }, { "epoch": 1.2955112219451372, "grad_norm": 0.2275863140821457, "learning_rate": 1.4821446384039901e-05, "loss": 0.0004, "step": 10390 }, { "epoch": 1.2967581047381547, "grad_norm": 11.458198547363281, "learning_rate": 1.4816458852867832e-05, "loss": 0.0021, "step": 10400 }, { "epoch": 1.298004987531172, "grad_norm": 0.006564176641404629, "learning_rate": 1.4811471321695762e-05, "loss": 0.0001, "step": 10410 }, { "epoch": 1.2992518703241895, "grad_norm": 0.0027762737590819597, "learning_rate": 1.4806483790523691e-05, "loss": 0.1087, "step": 10420 }, { "epoch": 1.300498753117207, "grad_norm": 19.512054443359375, "learning_rate": 1.4801496259351622e-05, "loss": 0.042, "step": 10430 }, { "epoch": 1.3017456359102244, "grad_norm": 35.38471984863281, "learning_rate": 1.4796508728179552e-05, "loss": 0.0666, "step": 10440 }, { "epoch": 1.3029925187032418, "grad_norm": 0.01785680651664734, "learning_rate": 1.4791521197007483e-05, "loss": 0.0299, "step": 10450 }, { "epoch": 1.3042394014962593, "grad_norm": 0.0048726629465818405, "learning_rate": 1.4786533665835412e-05, "loss": 0.0096, "step": 10460 }, { "epoch": 1.3054862842892767, "grad_norm": 32.89852523803711, "learning_rate": 1.4781546134663343e-05, "loss": 0.0799, "step": 10470 }, { "epoch": 1.3067331670822941, "grad_norm": 0.004758730530738831, "learning_rate": 1.4776558603491273e-05, "loss": 0.0005, "step": 10480 }, { "epoch": 1.3079800498753118, "grad_norm": 0.002490266226232052, "learning_rate": 1.4771571072319204e-05, "loss": 0.0568, "step": 10490 }, { "epoch": 1.3092269326683292, "grad_norm": 0.006255331449210644, "learning_rate": 1.4766583541147133e-05, "loss": 0.0442, "step": 10500 }, { "epoch": 1.3104738154613467, "grad_norm": 12.40581226348877, "learning_rate": 1.4761596009975063e-05, "loss": 0.0257, "step": 10510 }, { "epoch": 1.311720698254364, "grad_norm": 0.01563987322151661, "learning_rate": 1.4756608478802994e-05, "loss": 0.1362, "step": 10520 }, { "epoch": 1.3129675810473815, "grad_norm": 0.005479677580296993, "learning_rate": 1.4751620947630925e-05, "loss": 0.0003, "step": 10530 }, { "epoch": 1.314214463840399, "grad_norm": 12.577947616577148, "learning_rate": 1.4746633416458853e-05, "loss": 0.0335, "step": 10540 }, { "epoch": 1.3154613466334164, "grad_norm": 0.015153160318732262, "learning_rate": 1.4741645885286784e-05, "loss": 0.0022, "step": 10550 }, { "epoch": 1.3167082294264338, "grad_norm": 0.1170441284775734, "learning_rate": 1.4736658354114715e-05, "loss": 0.0674, "step": 10560 }, { "epoch": 1.3179551122194515, "grad_norm": 0.003648881334811449, "learning_rate": 1.4731670822942645e-05, "loss": 0.0116, "step": 10570 }, { "epoch": 1.319201995012469, "grad_norm": 0.002758126938715577, "learning_rate": 1.4726683291770574e-05, "loss": 0.0733, "step": 10580 }, { "epoch": 1.3204488778054864, "grad_norm": 0.028896719217300415, "learning_rate": 1.4721695760598505e-05, "loss": 0.0008, "step": 10590 }, { "epoch": 1.3216957605985038, "grad_norm": 3.604020595550537, "learning_rate": 1.4716708229426436e-05, "loss": 0.0425, "step": 10600 }, { "epoch": 1.3229426433915212, "grad_norm": 0.5918417572975159, "learning_rate": 1.4711720698254366e-05, "loss": 0.0613, "step": 10610 }, { "epoch": 1.3241895261845387, "grad_norm": 0.012655685655772686, "learning_rate": 1.4706733167082295e-05, "loss": 0.1018, "step": 10620 }, { "epoch": 1.325436408977556, "grad_norm": 0.0313897430896759, "learning_rate": 1.4701745635910226e-05, "loss": 0.0305, "step": 10630 }, { "epoch": 1.3266832917705735, "grad_norm": 2.2387187480926514, "learning_rate": 1.4696758104738156e-05, "loss": 0.0364, "step": 10640 }, { "epoch": 1.327930174563591, "grad_norm": 37.755985260009766, "learning_rate": 1.4691770573566087e-05, "loss": 0.0127, "step": 10650 }, { "epoch": 1.3291770573566084, "grad_norm": 0.16674213111400604, "learning_rate": 1.4686783042394018e-05, "loss": 0.0011, "step": 10660 }, { "epoch": 1.3304239401496258, "grad_norm": 0.018290970474481583, "learning_rate": 1.4681795511221946e-05, "loss": 0.0714, "step": 10670 }, { "epoch": 1.3316708229426433, "grad_norm": 0.700610876083374, "learning_rate": 1.4676807980049877e-05, "loss": 0.0035, "step": 10680 }, { "epoch": 1.332917705735661, "grad_norm": 0.004379532765597105, "learning_rate": 1.4671820448877808e-05, "loss": 0.0004, "step": 10690 }, { "epoch": 1.3341645885286784, "grad_norm": 0.020778026431798935, "learning_rate": 1.4666832917705738e-05, "loss": 0.0046, "step": 10700 }, { "epoch": 1.3354114713216958, "grad_norm": 19.630970001220703, "learning_rate": 1.4661845386533666e-05, "loss": 0.0661, "step": 10710 }, { "epoch": 1.3366583541147132, "grad_norm": 44.413639068603516, "learning_rate": 1.4656857855361596e-05, "loss": 0.0341, "step": 10720 }, { "epoch": 1.3379052369077307, "grad_norm": 0.032850079238414764, "learning_rate": 1.4651870324189528e-05, "loss": 0.0003, "step": 10730 }, { "epoch": 1.339152119700748, "grad_norm": 0.002132246969267726, "learning_rate": 1.4646882793017459e-05, "loss": 0.0006, "step": 10740 }, { "epoch": 1.3403990024937655, "grad_norm": 22.134477615356445, "learning_rate": 1.4641895261845386e-05, "loss": 0.0573, "step": 10750 }, { "epoch": 1.341645885286783, "grad_norm": 1.7164827585220337, "learning_rate": 1.4636907730673317e-05, "loss": 0.0053, "step": 10760 }, { "epoch": 1.3428927680798006, "grad_norm": 0.007867738604545593, "learning_rate": 1.4631920199501248e-05, "loss": 0.0013, "step": 10770 }, { "epoch": 1.344139650872818, "grad_norm": 0.04650270566344261, "learning_rate": 1.4626932668329178e-05, "loss": 0.0003, "step": 10780 }, { "epoch": 1.3453865336658355, "grad_norm": 0.009355682879686356, "learning_rate": 1.4621945137157107e-05, "loss": 0.1536, "step": 10790 }, { "epoch": 1.346633416458853, "grad_norm": 0.01851280778646469, "learning_rate": 1.4616957605985038e-05, "loss": 0.0736, "step": 10800 }, { "epoch": 1.3478802992518704, "grad_norm": 0.838029682636261, "learning_rate": 1.4611970074812968e-05, "loss": 0.1233, "step": 10810 }, { "epoch": 1.3491271820448878, "grad_norm": 0.28855791687965393, "learning_rate": 1.4606982543640899e-05, "loss": 0.004, "step": 10820 }, { "epoch": 1.3503740648379052, "grad_norm": 0.021240094676613808, "learning_rate": 1.4601995012468828e-05, "loss": 0.0722, "step": 10830 }, { "epoch": 1.3516209476309227, "grad_norm": 0.1711479127407074, "learning_rate": 1.4597007481296759e-05, "loss": 0.0011, "step": 10840 }, { "epoch": 1.35286783042394, "grad_norm": 0.00700349360704422, "learning_rate": 1.4592019950124689e-05, "loss": 0.0229, "step": 10850 }, { "epoch": 1.3541147132169575, "grad_norm": 22.966121673583984, "learning_rate": 1.458703241895262e-05, "loss": 0.0711, "step": 10860 }, { "epoch": 1.355361596009975, "grad_norm": 14.840107917785645, "learning_rate": 1.4582044887780549e-05, "loss": 0.1087, "step": 10870 }, { "epoch": 1.3566084788029924, "grad_norm": 0.008188108913600445, "learning_rate": 1.457705735660848e-05, "loss": 0.0099, "step": 10880 }, { "epoch": 1.35785536159601, "grad_norm": 0.03862173855304718, "learning_rate": 1.457206982543641e-05, "loss": 0.0301, "step": 10890 }, { "epoch": 1.3591022443890275, "grad_norm": 0.029760289937257767, "learning_rate": 1.456708229426434e-05, "loss": 0.0249, "step": 10900 }, { "epoch": 1.360349127182045, "grad_norm": 0.00440254807472229, "learning_rate": 1.4562094763092271e-05, "loss": 0.0615, "step": 10910 }, { "epoch": 1.3615960099750624, "grad_norm": 0.009855284355580807, "learning_rate": 1.45571072319202e-05, "loss": 0.0212, "step": 10920 }, { "epoch": 1.3628428927680798, "grad_norm": 2.0530319213867188, "learning_rate": 1.455211970074813e-05, "loss": 0.0594, "step": 10930 }, { "epoch": 1.3640897755610972, "grad_norm": 0.09458717703819275, "learning_rate": 1.4547132169576061e-05, "loss": 0.1251, "step": 10940 }, { "epoch": 1.3653366583541147, "grad_norm": 0.03143605589866638, "learning_rate": 1.4542144638403992e-05, "loss": 0.032, "step": 10950 }, { "epoch": 1.366583541147132, "grad_norm": 0.02471662499010563, "learning_rate": 1.4537157107231921e-05, "loss": 0.0062, "step": 10960 }, { "epoch": 1.3678304239401498, "grad_norm": 0.014755766838788986, "learning_rate": 1.4532169576059851e-05, "loss": 0.001, "step": 10970 }, { "epoch": 1.3690773067331672, "grad_norm": 0.0027742686215788126, "learning_rate": 1.4527182044887782e-05, "loss": 0.0038, "step": 10980 }, { "epoch": 1.3703241895261846, "grad_norm": 0.08576522767543793, "learning_rate": 1.4522194513715713e-05, "loss": 0.0157, "step": 10990 }, { "epoch": 1.371571072319202, "grad_norm": 0.0026629208587110043, "learning_rate": 1.4517206982543642e-05, "loss": 0.0017, "step": 11000 }, { "epoch": 1.3728179551122195, "grad_norm": 0.023445982486009598, "learning_rate": 1.4512219451371572e-05, "loss": 0.0692, "step": 11010 }, { "epoch": 1.374064837905237, "grad_norm": 0.005688042845577002, "learning_rate": 1.4507231920199503e-05, "loss": 0.0359, "step": 11020 }, { "epoch": 1.3753117206982544, "grad_norm": 11.719991683959961, "learning_rate": 1.4502244389027434e-05, "loss": 0.0445, "step": 11030 }, { "epoch": 1.3765586034912718, "grad_norm": 0.037872131913900375, "learning_rate": 1.4497256857855362e-05, "loss": 0.0844, "step": 11040 }, { "epoch": 1.3778054862842892, "grad_norm": 0.07881703227758408, "learning_rate": 1.4492269326683293e-05, "loss": 0.0597, "step": 11050 }, { "epoch": 1.3790523690773067, "grad_norm": 0.012651286087930202, "learning_rate": 1.4487281795511224e-05, "loss": 0.0433, "step": 11060 }, { "epoch": 1.380299251870324, "grad_norm": 55.89772033691406, "learning_rate": 1.4482294264339154e-05, "loss": 0.0772, "step": 11070 }, { "epoch": 1.3815461346633415, "grad_norm": 0.003019166411831975, "learning_rate": 1.4477306733167083e-05, "loss": 0.0618, "step": 11080 }, { "epoch": 1.382793017456359, "grad_norm": 0.6327642798423767, "learning_rate": 1.4472319201995014e-05, "loss": 0.0481, "step": 11090 }, { "epoch": 1.3840399002493766, "grad_norm": 0.029262885451316833, "learning_rate": 1.4467331670822944e-05, "loss": 0.0041, "step": 11100 }, { "epoch": 1.385286783042394, "grad_norm": 0.0019791433587670326, "learning_rate": 1.4462344139650875e-05, "loss": 0.0026, "step": 11110 }, { "epoch": 1.3865336658354115, "grad_norm": 0.2362566888332367, "learning_rate": 1.4457356608478802e-05, "loss": 0.0016, "step": 11120 }, { "epoch": 1.387780548628429, "grad_norm": 0.05378960818052292, "learning_rate": 1.4452369077306735e-05, "loss": 0.0029, "step": 11130 }, { "epoch": 1.3890274314214464, "grad_norm": 0.015104220248758793, "learning_rate": 1.4447381546134665e-05, "loss": 0.0276, "step": 11140 }, { "epoch": 1.3902743142144638, "grad_norm": 23.29336929321289, "learning_rate": 1.4442394014962596e-05, "loss": 0.0701, "step": 11150 }, { "epoch": 1.3915211970074812, "grad_norm": 1.6370915174484253, "learning_rate": 1.4437406483790526e-05, "loss": 0.0766, "step": 11160 }, { "epoch": 1.3927680798004989, "grad_norm": 0.08965132385492325, "learning_rate": 1.4432418952618454e-05, "loss": 0.0046, "step": 11170 }, { "epoch": 1.3940149625935163, "grad_norm": 0.14832371473312378, "learning_rate": 1.4427431421446384e-05, "loss": 0.0101, "step": 11180 }, { "epoch": 1.3952618453865338, "grad_norm": 0.012930690310895443, "learning_rate": 1.4422443890274315e-05, "loss": 0.1335, "step": 11190 }, { "epoch": 1.3965087281795512, "grad_norm": 0.018670091405510902, "learning_rate": 1.4417456359102247e-05, "loss": 0.0581, "step": 11200 }, { "epoch": 1.3977556109725686, "grad_norm": 0.009946133941411972, "learning_rate": 1.4412468827930174e-05, "loss": 0.0004, "step": 11210 }, { "epoch": 1.399002493765586, "grad_norm": 0.007296042516827583, "learning_rate": 1.4407481296758105e-05, "loss": 0.064, "step": 11220 }, { "epoch": 1.4002493765586035, "grad_norm": 0.031165320426225662, "learning_rate": 1.4402493765586036e-05, "loss": 0.0197, "step": 11230 }, { "epoch": 1.401496259351621, "grad_norm": 0.020699728280305862, "learning_rate": 1.4397506234413966e-05, "loss": 0.001, "step": 11240 }, { "epoch": 1.4027431421446384, "grad_norm": 0.05278054252266884, "learning_rate": 1.4392518703241895e-05, "loss": 0.0429, "step": 11250 }, { "epoch": 1.4039900249376558, "grad_norm": 0.10893259197473526, "learning_rate": 1.4387531172069826e-05, "loss": 0.0664, "step": 11260 }, { "epoch": 1.4052369077306732, "grad_norm": 0.23036803305149078, "learning_rate": 1.4382543640897757e-05, "loss": 0.0108, "step": 11270 }, { "epoch": 1.4064837905236907, "grad_norm": 0.04595217853784561, "learning_rate": 1.4377556109725687e-05, "loss": 0.0779, "step": 11280 }, { "epoch": 1.407730673316708, "grad_norm": 0.0674215629696846, "learning_rate": 1.4372568578553616e-05, "loss": 0.0235, "step": 11290 }, { "epoch": 1.4089775561097257, "grad_norm": 0.0016587678110226989, "learning_rate": 1.4368079800498756e-05, "loss": 0.0588, "step": 11300 }, { "epoch": 1.4102244389027432, "grad_norm": 0.007998665794730186, "learning_rate": 1.4363092269326683e-05, "loss": 0.0003, "step": 11310 }, { "epoch": 1.4114713216957606, "grad_norm": 0.008180646225810051, "learning_rate": 1.4358104738154614e-05, "loss": 0.006, "step": 11320 }, { "epoch": 1.412718204488778, "grad_norm": 0.04490172863006592, "learning_rate": 1.4353117206982544e-05, "loss": 0.032, "step": 11330 }, { "epoch": 1.4139650872817955, "grad_norm": 29.317243576049805, "learning_rate": 1.4348129675810477e-05, "loss": 0.1154, "step": 11340 }, { "epoch": 1.415211970074813, "grad_norm": 0.005312860477715731, "learning_rate": 1.4343142144638404e-05, "loss": 0.0432, "step": 11350 }, { "epoch": 1.4164588528678304, "grad_norm": 4.058748245239258, "learning_rate": 1.4338154613466335e-05, "loss": 0.004, "step": 11360 }, { "epoch": 1.417705735660848, "grad_norm": 14.437801361083984, "learning_rate": 1.4333167082294265e-05, "loss": 0.0123, "step": 11370 }, { "epoch": 1.4189526184538654, "grad_norm": 0.009661520831286907, "learning_rate": 1.4328179551122196e-05, "loss": 0.1112, "step": 11380 }, { "epoch": 1.4201995012468829, "grad_norm": 0.010127179324626923, "learning_rate": 1.4323192019950125e-05, "loss": 0.0103, "step": 11390 }, { "epoch": 1.4214463840399003, "grad_norm": 0.005153048317879438, "learning_rate": 1.4318204488778055e-05, "loss": 0.0009, "step": 11400 }, { "epoch": 1.4226932668329177, "grad_norm": 0.008940218947827816, "learning_rate": 1.4313216957605986e-05, "loss": 0.0538, "step": 11410 }, { "epoch": 1.4239401496259352, "grad_norm": 0.01359684206545353, "learning_rate": 1.4308229426433917e-05, "loss": 0.0435, "step": 11420 }, { "epoch": 1.4251870324189526, "grad_norm": 0.008474809117615223, "learning_rate": 1.4303241895261846e-05, "loss": 0.0064, "step": 11430 }, { "epoch": 1.42643391521197, "grad_norm": 0.004166824277490377, "learning_rate": 1.4298254364089776e-05, "loss": 0.0953, "step": 11440 }, { "epoch": 1.4276807980049875, "grad_norm": 0.021383510902523994, "learning_rate": 1.4293266832917707e-05, "loss": 0.0212, "step": 11450 }, { "epoch": 1.428927680798005, "grad_norm": 1.1953846216201782, "learning_rate": 1.4288279301745637e-05, "loss": 0.0579, "step": 11460 }, { "epoch": 1.4301745635910224, "grad_norm": 0.0063395812176167965, "learning_rate": 1.4283291770573566e-05, "loss": 0.0174, "step": 11470 }, { "epoch": 1.4314214463840398, "grad_norm": 0.003997775260359049, "learning_rate": 1.4278304239401497e-05, "loss": 0.0678, "step": 11480 }, { "epoch": 1.4326683291770572, "grad_norm": 7.376005172729492, "learning_rate": 1.4273316708229428e-05, "loss": 0.051, "step": 11490 }, { "epoch": 1.4339152119700749, "grad_norm": 38.96922302246094, "learning_rate": 1.4268329177057358e-05, "loss": 0.048, "step": 11500 }, { "epoch": 1.4351620947630923, "grad_norm": 0.06101633608341217, "learning_rate": 1.4263341645885287e-05, "loss": 0.0408, "step": 11510 }, { "epoch": 1.4364089775561097, "grad_norm": 0.11863374710083008, "learning_rate": 1.4258354114713218e-05, "loss": 0.0763, "step": 11520 }, { "epoch": 1.4376558603491272, "grad_norm": 0.006517260335385799, "learning_rate": 1.4253366583541148e-05, "loss": 0.0835, "step": 11530 }, { "epoch": 1.4389027431421446, "grad_norm": 0.005422229878604412, "learning_rate": 1.4248379052369079e-05, "loss": 0.0375, "step": 11540 }, { "epoch": 1.440149625935162, "grad_norm": 1.8197985887527466, "learning_rate": 1.424339152119701e-05, "loss": 0.0052, "step": 11550 }, { "epoch": 1.4413965087281795, "grad_norm": 0.018879203125834465, "learning_rate": 1.4238403990024939e-05, "loss": 0.0119, "step": 11560 }, { "epoch": 1.4426433915211971, "grad_norm": 7.549375534057617, "learning_rate": 1.423341645885287e-05, "loss": 0.036, "step": 11570 }, { "epoch": 1.4438902743142146, "grad_norm": 0.015450420789420605, "learning_rate": 1.42284289276808e-05, "loss": 0.0054, "step": 11580 }, { "epoch": 1.445137157107232, "grad_norm": 30.17746925354004, "learning_rate": 1.422344139650873e-05, "loss": 0.0805, "step": 11590 }, { "epoch": 1.4463840399002494, "grad_norm": 0.004945869091898203, "learning_rate": 1.421845386533666e-05, "loss": 0.0387, "step": 11600 }, { "epoch": 1.4476309226932669, "grad_norm": 32.93588638305664, "learning_rate": 1.421346633416459e-05, "loss": 0.0309, "step": 11610 }, { "epoch": 1.4488778054862843, "grad_norm": 0.02093518152832985, "learning_rate": 1.420847880299252e-05, "loss": 0.0953, "step": 11620 }, { "epoch": 1.4501246882793017, "grad_norm": 0.004532126244157553, "learning_rate": 1.4203491271820451e-05, "loss": 0.0002, "step": 11630 }, { "epoch": 1.4513715710723192, "grad_norm": 0.005582903511822224, "learning_rate": 1.419850374064838e-05, "loss": 0.056, "step": 11640 }, { "epoch": 1.4526184538653366, "grad_norm": 33.18028259277344, "learning_rate": 1.419351620947631e-05, "loss": 0.0984, "step": 11650 }, { "epoch": 1.453865336658354, "grad_norm": 0.005102763418108225, "learning_rate": 1.4188528678304241e-05, "loss": 0.0693, "step": 11660 }, { "epoch": 1.4551122194513715, "grad_norm": 0.011876431293785572, "learning_rate": 1.4183541147132172e-05, "loss": 0.0454, "step": 11670 }, { "epoch": 1.456359102244389, "grad_norm": 0.047975603491067886, "learning_rate": 1.4178553615960101e-05, "loss": 0.0018, "step": 11680 }, { "epoch": 1.4576059850374063, "grad_norm": 0.016569582745432854, "learning_rate": 1.4173566084788032e-05, "loss": 0.0607, "step": 11690 }, { "epoch": 1.458852867830424, "grad_norm": 1.9552041292190552, "learning_rate": 1.4168578553615962e-05, "loss": 0.0601, "step": 11700 }, { "epoch": 1.4600997506234414, "grad_norm": 0.05047905817627907, "learning_rate": 1.4163591022443893e-05, "loss": 0.1526, "step": 11710 }, { "epoch": 1.4613466334164589, "grad_norm": 0.08394069969654083, "learning_rate": 1.415860349127182e-05, "loss": 0.0009, "step": 11720 }, { "epoch": 1.4625935162094763, "grad_norm": 0.04425472021102905, "learning_rate": 1.415361596009975e-05, "loss": 0.0538, "step": 11730 }, { "epoch": 1.4638403990024937, "grad_norm": 0.0035213951487094164, "learning_rate": 1.4148628428927681e-05, "loss": 0.0362, "step": 11740 }, { "epoch": 1.4650872817955112, "grad_norm": 6.682374000549316, "learning_rate": 1.4143640897755614e-05, "loss": 0.0823, "step": 11750 }, { "epoch": 1.4663341645885286, "grad_norm": 0.15339747071266174, "learning_rate": 1.413865336658354e-05, "loss": 0.0878, "step": 11760 }, { "epoch": 1.4675810473815463, "grad_norm": 0.010146928951144218, "learning_rate": 1.4133665835411471e-05, "loss": 0.0315, "step": 11770 }, { "epoch": 1.4688279301745637, "grad_norm": 0.591601550579071, "learning_rate": 1.4128678304239402e-05, "loss": 0.0008, "step": 11780 }, { "epoch": 1.4700748129675811, "grad_norm": 0.13875390589237213, "learning_rate": 1.4123690773067333e-05, "loss": 0.0831, "step": 11790 }, { "epoch": 1.4713216957605986, "grad_norm": 0.046373747289180756, "learning_rate": 1.4118703241895263e-05, "loss": 0.0239, "step": 11800 }, { "epoch": 1.472568578553616, "grad_norm": 0.11153218150138855, "learning_rate": 1.4113715710723192e-05, "loss": 0.0091, "step": 11810 }, { "epoch": 1.4738154613466334, "grad_norm": 0.008640721440315247, "learning_rate": 1.4108728179551123e-05, "loss": 0.0008, "step": 11820 }, { "epoch": 1.4750623441396509, "grad_norm": 0.022386932745575905, "learning_rate": 1.4103740648379053e-05, "loss": 0.0166, "step": 11830 }, { "epoch": 1.4763092269326683, "grad_norm": 2.148536443710327, "learning_rate": 1.4098753117206984e-05, "loss": 0.0432, "step": 11840 }, { "epoch": 1.4775561097256857, "grad_norm": 0.008330950513482094, "learning_rate": 1.4093765586034913e-05, "loss": 0.0562, "step": 11850 }, { "epoch": 1.4788029925187032, "grad_norm": 5.3832597732543945, "learning_rate": 1.4088778054862844e-05, "loss": 0.0942, "step": 11860 }, { "epoch": 1.4800498753117206, "grad_norm": 0.01949029415845871, "learning_rate": 1.4083790523690774e-05, "loss": 0.0111, "step": 11870 }, { "epoch": 1.481296758104738, "grad_norm": 0.006169204134494066, "learning_rate": 1.4078802992518705e-05, "loss": 0.0007, "step": 11880 }, { "epoch": 1.4825436408977555, "grad_norm": 0.0029075045604258776, "learning_rate": 1.4073815461346634e-05, "loss": 0.0033, "step": 11890 }, { "epoch": 1.4837905236907731, "grad_norm": 0.09177600592374802, "learning_rate": 1.4068827930174564e-05, "loss": 0.0989, "step": 11900 }, { "epoch": 1.4850374064837906, "grad_norm": 0.0033431975170969963, "learning_rate": 1.4063840399002495e-05, "loss": 0.027, "step": 11910 }, { "epoch": 1.486284289276808, "grad_norm": 0.006316428538411856, "learning_rate": 1.4058852867830426e-05, "loss": 0.0007, "step": 11920 }, { "epoch": 1.4875311720698254, "grad_norm": 0.3648694157600403, "learning_rate": 1.4053865336658355e-05, "loss": 0.0083, "step": 11930 }, { "epoch": 1.4887780548628429, "grad_norm": 0.10771086066961288, "learning_rate": 1.4048877805486285e-05, "loss": 0.0285, "step": 11940 }, { "epoch": 1.4900249376558603, "grad_norm": 20.01485824584961, "learning_rate": 1.4043890274314216e-05, "loss": 0.0287, "step": 11950 }, { "epoch": 1.4912718204488777, "grad_norm": 0.0025918507017195225, "learning_rate": 1.4038902743142146e-05, "loss": 0.059, "step": 11960 }, { "epoch": 1.4925187032418954, "grad_norm": 0.0017820337088778615, "learning_rate": 1.4033915211970075e-05, "loss": 0.0087, "step": 11970 }, { "epoch": 1.4937655860349128, "grad_norm": 40.38088607788086, "learning_rate": 1.4028927680798006e-05, "loss": 0.1003, "step": 11980 }, { "epoch": 1.4950124688279303, "grad_norm": 0.1557849496603012, "learning_rate": 1.4023940149625937e-05, "loss": 0.0726, "step": 11990 }, { "epoch": 1.4962593516209477, "grad_norm": 0.012030398473143578, "learning_rate": 1.4018952618453867e-05, "loss": 0.0073, "step": 12000 }, { "epoch": 1.4975062344139651, "grad_norm": 0.0018893389496952295, "learning_rate": 1.4013965087281796e-05, "loss": 0.0332, "step": 12010 }, { "epoch": 1.4987531172069826, "grad_norm": 17.814115524291992, "learning_rate": 1.4008977556109727e-05, "loss": 0.0321, "step": 12020 }, { "epoch": 1.5, "grad_norm": 0.0038544260896742344, "learning_rate": 1.4003990024937657e-05, "loss": 0.0243, "step": 12030 }, { "epoch": 1.5012468827930174, "grad_norm": 48.38309097290039, "learning_rate": 1.3999002493765588e-05, "loss": 0.0636, "step": 12040 }, { "epoch": 1.5024937655860349, "grad_norm": 0.007168745622038841, "learning_rate": 1.3994014962593519e-05, "loss": 0.0393, "step": 12050 }, { "epoch": 1.5037406483790523, "grad_norm": 0.006634199526160955, "learning_rate": 1.3989027431421447e-05, "loss": 0.0642, "step": 12060 }, { "epoch": 1.5049875311720697, "grad_norm": 0.005328933708369732, "learning_rate": 1.3984039900249378e-05, "loss": 0.037, "step": 12070 }, { "epoch": 1.5062344139650872, "grad_norm": 0.6847673058509827, "learning_rate": 1.3979052369077309e-05, "loss": 0.0006, "step": 12080 }, { "epoch": 1.5074812967581046, "grad_norm": 12.247014999389648, "learning_rate": 1.397406483790524e-05, "loss": 0.0148, "step": 12090 }, { "epoch": 1.508728179551122, "grad_norm": 0.012827611528337002, "learning_rate": 1.3969077306733168e-05, "loss": 0.0064, "step": 12100 }, { "epoch": 1.5099750623441397, "grad_norm": 0.002687269588932395, "learning_rate": 1.3964089775561099e-05, "loss": 0.0304, "step": 12110 }, { "epoch": 1.5112219451371571, "grad_norm": 0.009229120798408985, "learning_rate": 1.395910224438903e-05, "loss": 0.0832, "step": 12120 }, { "epoch": 1.5124688279301746, "grad_norm": 0.010797587223351002, "learning_rate": 1.395411471321696e-05, "loss": 0.057, "step": 12130 }, { "epoch": 1.513715710723192, "grad_norm": 0.017812317237257957, "learning_rate": 1.3949127182044887e-05, "loss": 0.0554, "step": 12140 }, { "epoch": 1.5149625935162094, "grad_norm": 0.09482496231794357, "learning_rate": 1.394413965087282e-05, "loss": 0.0457, "step": 12150 }, { "epoch": 1.516209476309227, "grad_norm": 1.7302439212799072, "learning_rate": 1.393915211970075e-05, "loss": 0.0728, "step": 12160 }, { "epoch": 1.5174563591022445, "grad_norm": 0.19030161201953888, "learning_rate": 1.3934164588528681e-05, "loss": 0.1303, "step": 12170 }, { "epoch": 1.518703241895262, "grad_norm": 7.949008941650391, "learning_rate": 1.3929177057356608e-05, "loss": 0.0347, "step": 12180 }, { "epoch": 1.5199501246882794, "grad_norm": 0.012247119098901749, "learning_rate": 1.3924189526184539e-05, "loss": 0.0021, "step": 12190 }, { "epoch": 1.5211970074812968, "grad_norm": 0.005694092251360416, "learning_rate": 1.391920199501247e-05, "loss": 0.0012, "step": 12200 }, { "epoch": 1.5224438902743143, "grad_norm": 0.00615575909614563, "learning_rate": 1.3914214463840402e-05, "loss": 0.0228, "step": 12210 }, { "epoch": 1.5236907730673317, "grad_norm": 0.013162520714104176, "learning_rate": 1.3909226932668329e-05, "loss": 0.0133, "step": 12220 }, { "epoch": 1.5249376558603491, "grad_norm": 23.256078720092773, "learning_rate": 1.390423940149626e-05, "loss": 0.0441, "step": 12230 }, { "epoch": 1.5261845386533666, "grad_norm": 0.007707939483225346, "learning_rate": 1.389925187032419e-05, "loss": 0.0024, "step": 12240 }, { "epoch": 1.527431421446384, "grad_norm": 0.014242901466786861, "learning_rate": 1.389426433915212e-05, "loss": 0.001, "step": 12250 }, { "epoch": 1.5286783042394014, "grad_norm": 0.003064389806240797, "learning_rate": 1.388927680798005e-05, "loss": 0.0003, "step": 12260 }, { "epoch": 1.5299251870324189, "grad_norm": 0.0030908039771020412, "learning_rate": 1.388428927680798e-05, "loss": 0.043, "step": 12270 }, { "epoch": 1.5311720698254363, "grad_norm": 0.010099162347614765, "learning_rate": 1.3879301745635911e-05, "loss": 0.0003, "step": 12280 }, { "epoch": 1.5324189526184537, "grad_norm": 2.3775289058685303, "learning_rate": 1.3874314214463842e-05, "loss": 0.0066, "step": 12290 }, { "epoch": 1.5336658354114712, "grad_norm": 0.00410651508718729, "learning_rate": 1.3869326683291772e-05, "loss": 0.0009, "step": 12300 }, { "epoch": 1.5349127182044888, "grad_norm": 0.007365523837506771, "learning_rate": 1.3864339152119701e-05, "loss": 0.0376, "step": 12310 }, { "epoch": 1.5361596009975063, "grad_norm": 0.004369727335870266, "learning_rate": 1.3859351620947632e-05, "loss": 0.0124, "step": 12320 }, { "epoch": 1.5374064837905237, "grad_norm": 2.023164987564087, "learning_rate": 1.3854364089775562e-05, "loss": 0.0051, "step": 12330 }, { "epoch": 1.5386533665835411, "grad_norm": 0.008770514279603958, "learning_rate": 1.3849376558603493e-05, "loss": 0.0002, "step": 12340 }, { "epoch": 1.5399002493765586, "grad_norm": 0.001449818373657763, "learning_rate": 1.3844389027431422e-05, "loss": 0.0028, "step": 12350 }, { "epoch": 1.5411471321695762, "grad_norm": 0.11426911503076553, "learning_rate": 1.3839401496259353e-05, "loss": 0.027, "step": 12360 }, { "epoch": 1.5423940149625937, "grad_norm": 0.009318443946540356, "learning_rate": 1.3834413965087283e-05, "loss": 0.0007, "step": 12370 }, { "epoch": 1.543640897755611, "grad_norm": 0.0020017463248223066, "learning_rate": 1.3829426433915214e-05, "loss": 0.0002, "step": 12380 }, { "epoch": 1.5448877805486285, "grad_norm": 0.002024906687438488, "learning_rate": 1.3824438902743143e-05, "loss": 0.0023, "step": 12390 }, { "epoch": 1.546134663341646, "grad_norm": 0.01683424413204193, "learning_rate": 1.3819451371571073e-05, "loss": 0.0022, "step": 12400 }, { "epoch": 1.5473815461346634, "grad_norm": 0.19490936398506165, "learning_rate": 1.3814463840399004e-05, "loss": 0.0003, "step": 12410 }, { "epoch": 1.5486284289276808, "grad_norm": 0.007523283362388611, "learning_rate": 1.3809476309226935e-05, "loss": 0.0524, "step": 12420 }, { "epoch": 1.5498753117206983, "grad_norm": 0.009266285225749016, "learning_rate": 1.3804488778054863e-05, "loss": 0.0669, "step": 12430 }, { "epoch": 1.5511221945137157, "grad_norm": 0.006378598511219025, "learning_rate": 1.3799501246882794e-05, "loss": 0.0714, "step": 12440 }, { "epoch": 1.5523690773067331, "grad_norm": 0.26278775930404663, "learning_rate": 1.3794513715710725e-05, "loss": 0.0538, "step": 12450 }, { "epoch": 1.5536159600997506, "grad_norm": 0.7849764823913574, "learning_rate": 1.3789526184538655e-05, "loss": 0.0009, "step": 12460 }, { "epoch": 1.554862842892768, "grad_norm": 0.04737240821123123, "learning_rate": 1.3784538653366584e-05, "loss": 0.0342, "step": 12470 }, { "epoch": 1.5561097256857854, "grad_norm": 32.681884765625, "learning_rate": 1.3779551122194515e-05, "loss": 0.0475, "step": 12480 }, { "epoch": 1.5573566084788029, "grad_norm": 24.724851608276367, "learning_rate": 1.3774563591022445e-05, "loss": 0.0479, "step": 12490 }, { "epoch": 1.5586034912718203, "grad_norm": 0.0013840860920026898, "learning_rate": 1.3769576059850376e-05, "loss": 0.0367, "step": 12500 }, { "epoch": 1.559850374064838, "grad_norm": 0.008518923074007034, "learning_rate": 1.3764588528678305e-05, "loss": 0.1191, "step": 12510 }, { "epoch": 1.5610972568578554, "grad_norm": 0.00492417486384511, "learning_rate": 1.3759600997506236e-05, "loss": 0.0017, "step": 12520 }, { "epoch": 1.5623441396508728, "grad_norm": 2.3020267486572266, "learning_rate": 1.3754613466334166e-05, "loss": 0.0278, "step": 12530 }, { "epoch": 1.5635910224438903, "grad_norm": 4.384720325469971, "learning_rate": 1.3749625935162097e-05, "loss": 0.0561, "step": 12540 }, { "epoch": 1.5648379052369077, "grad_norm": 0.007074257824569941, "learning_rate": 1.3744638403990028e-05, "loss": 0.0002, "step": 12550 }, { "epoch": 1.5660847880299253, "grad_norm": 0.016326969489455223, "learning_rate": 1.3739650872817956e-05, "loss": 0.0003, "step": 12560 }, { "epoch": 1.5673316708229428, "grad_norm": 0.010336228646337986, "learning_rate": 1.3734663341645887e-05, "loss": 0.0397, "step": 12570 }, { "epoch": 1.5685785536159602, "grad_norm": 0.15242627263069153, "learning_rate": 1.3729675810473818e-05, "loss": 0.0902, "step": 12580 }, { "epoch": 1.5698254364089776, "grad_norm": 0.1268034428358078, "learning_rate": 1.3724688279301748e-05, "loss": 0.0435, "step": 12590 }, { "epoch": 1.571072319201995, "grad_norm": 9.81682014465332, "learning_rate": 1.3719700748129676e-05, "loss": 0.04, "step": 12600 }, { "epoch": 1.5723192019950125, "grad_norm": 0.034833166748285294, "learning_rate": 1.3714713216957606e-05, "loss": 0.0007, "step": 12610 }, { "epoch": 1.57356608478803, "grad_norm": 0.01569850742816925, "learning_rate": 1.3709725685785538e-05, "loss": 0.041, "step": 12620 }, { "epoch": 1.5748129675810474, "grad_norm": 0.0046609025448560715, "learning_rate": 1.3704738154613469e-05, "loss": 0.0856, "step": 12630 }, { "epoch": 1.5760598503740648, "grad_norm": 0.024893207475543022, "learning_rate": 1.3699750623441396e-05, "loss": 0.0223, "step": 12640 }, { "epoch": 1.5773067331670823, "grad_norm": 0.09220171719789505, "learning_rate": 1.3694763092269327e-05, "loss": 0.0718, "step": 12650 }, { "epoch": 1.5785536159600997, "grad_norm": 0.04910089075565338, "learning_rate": 1.3689775561097258e-05, "loss": 0.0545, "step": 12660 }, { "epoch": 1.5798004987531171, "grad_norm": 0.01659906841814518, "learning_rate": 1.3684788029925188e-05, "loss": 0.03, "step": 12670 }, { "epoch": 1.5810473815461346, "grad_norm": 0.02599448524415493, "learning_rate": 1.3679800498753117e-05, "loss": 0.0515, "step": 12680 }, { "epoch": 1.582294264339152, "grad_norm": 0.030495228245854378, "learning_rate": 1.3674812967581048e-05, "loss": 0.0084, "step": 12690 }, { "epoch": 1.5835411471321694, "grad_norm": 0.06779367476701736, "learning_rate": 1.3669825436408978e-05, "loss": 0.0029, "step": 12700 }, { "epoch": 1.584788029925187, "grad_norm": 0.02692810632288456, "learning_rate": 1.3664837905236909e-05, "loss": 0.1027, "step": 12710 }, { "epoch": 1.5860349127182045, "grad_norm": 42.45810317993164, "learning_rate": 1.3659850374064838e-05, "loss": 0.0173, "step": 12720 }, { "epoch": 1.587281795511222, "grad_norm": 0.17942014336585999, "learning_rate": 1.3654862842892769e-05, "loss": 0.0006, "step": 12730 }, { "epoch": 1.5885286783042394, "grad_norm": 0.03577357530593872, "learning_rate": 1.3649875311720699e-05, "loss": 0.001, "step": 12740 }, { "epoch": 1.5897755610972568, "grad_norm": 0.2650788724422455, "learning_rate": 1.364488778054863e-05, "loss": 0.0389, "step": 12750 }, { "epoch": 1.5910224438902745, "grad_norm": 0.018843036144971848, "learning_rate": 1.3639900249376559e-05, "loss": 0.0141, "step": 12760 }, { "epoch": 1.592269326683292, "grad_norm": 0.693320631980896, "learning_rate": 1.363491271820449e-05, "loss": 0.0018, "step": 12770 }, { "epoch": 1.5935162094763093, "grad_norm": 0.0032859332859516144, "learning_rate": 1.362992518703242e-05, "loss": 0.0008, "step": 12780 }, { "epoch": 1.5947630922693268, "grad_norm": 0.018069753423333168, "learning_rate": 1.362493765586035e-05, "loss": 0.0003, "step": 12790 }, { "epoch": 1.5960099750623442, "grad_norm": 0.003422915004193783, "learning_rate": 1.3619950124688281e-05, "loss": 0.0004, "step": 12800 }, { "epoch": 1.5972568578553616, "grad_norm": 71.4630355834961, "learning_rate": 1.361496259351621e-05, "loss": 0.0492, "step": 12810 }, { "epoch": 1.598503740648379, "grad_norm": 0.07465198636054993, "learning_rate": 1.360997506234414e-05, "loss": 0.0666, "step": 12820 }, { "epoch": 1.5997506234413965, "grad_norm": 0.005183480214327574, "learning_rate": 1.3604987531172071e-05, "loss": 0.0027, "step": 12830 }, { "epoch": 1.600997506234414, "grad_norm": 20.43458366394043, "learning_rate": 1.3600000000000002e-05, "loss": 0.0305, "step": 12840 }, { "epoch": 1.6022443890274314, "grad_norm": 0.0220947228372097, "learning_rate": 1.3595012468827931e-05, "loss": 0.0226, "step": 12850 }, { "epoch": 1.6034912718204488, "grad_norm": 0.0029411078430712223, "learning_rate": 1.3590024937655861e-05, "loss": 0.0654, "step": 12860 }, { "epoch": 1.6047381546134662, "grad_norm": 0.0013770603109151125, "learning_rate": 1.3585037406483792e-05, "loss": 0.0466, "step": 12870 }, { "epoch": 1.6059850374064837, "grad_norm": 0.35909491777420044, "learning_rate": 1.3580049875311723e-05, "loss": 0.0055, "step": 12880 }, { "epoch": 1.6072319201995011, "grad_norm": 0.06752067804336548, "learning_rate": 1.3575062344139652e-05, "loss": 0.0771, "step": 12890 }, { "epoch": 1.6084788029925186, "grad_norm": 0.002513469662517309, "learning_rate": 1.3570074812967582e-05, "loss": 0.0009, "step": 12900 }, { "epoch": 1.6097256857855362, "grad_norm": 0.00214858609251678, "learning_rate": 1.3565087281795513e-05, "loss": 0.0031, "step": 12910 }, { "epoch": 1.6109725685785536, "grad_norm": 0.0014683667104691267, "learning_rate": 1.3560099750623443e-05, "loss": 0.0013, "step": 12920 }, { "epoch": 1.612219451371571, "grad_norm": 0.3527969419956207, "learning_rate": 1.3555112219451372e-05, "loss": 0.076, "step": 12930 }, { "epoch": 1.6134663341645885, "grad_norm": 5.842223167419434, "learning_rate": 1.3550124688279303e-05, "loss": 0.0978, "step": 12940 }, { "epoch": 1.614713216957606, "grad_norm": 7.368631362915039, "learning_rate": 1.3545137157107234e-05, "loss": 0.0027, "step": 12950 }, { "epoch": 1.6159600997506236, "grad_norm": 1.2361329793930054, "learning_rate": 1.3540149625935164e-05, "loss": 0.018, "step": 12960 }, { "epoch": 1.617206982543641, "grad_norm": 4.243699550628662, "learning_rate": 1.3535162094763093e-05, "loss": 0.0365, "step": 12970 }, { "epoch": 1.6184538653366585, "grad_norm": 0.009675173088908195, "learning_rate": 1.3530174563591024e-05, "loss": 0.0002, "step": 12980 }, { "epoch": 1.619700748129676, "grad_norm": 6.819517612457275, "learning_rate": 1.3525187032418954e-05, "loss": 0.106, "step": 12990 }, { "epoch": 1.6209476309226933, "grad_norm": 0.08170343935489655, "learning_rate": 1.3520199501246885e-05, "loss": 0.0457, "step": 13000 }, { "epoch": 1.6221945137157108, "grad_norm": 0.0533466599881649, "learning_rate": 1.3515211970074812e-05, "loss": 0.0109, "step": 13010 }, { "epoch": 1.6234413965087282, "grad_norm": 0.22970125079154968, "learning_rate": 1.3510224438902743e-05, "loss": 0.0215, "step": 13020 }, { "epoch": 1.6246882793017456, "grad_norm": 0.006124209146946669, "learning_rate": 1.3505236907730675e-05, "loss": 0.0003, "step": 13030 }, { "epoch": 1.625935162094763, "grad_norm": 0.005711245816200972, "learning_rate": 1.3500249376558606e-05, "loss": 0.0916, "step": 13040 }, { "epoch": 1.6271820448877805, "grad_norm": 0.06058807671070099, "learning_rate": 1.3495261845386536e-05, "loss": 0.0927, "step": 13050 }, { "epoch": 1.628428927680798, "grad_norm": 0.021231012418866158, "learning_rate": 1.3490274314214464e-05, "loss": 0.0043, "step": 13060 }, { "epoch": 1.6296758104738154, "grad_norm": 0.044012606143951416, "learning_rate": 1.3485286783042394e-05, "loss": 0.0078, "step": 13070 }, { "epoch": 1.6309226932668328, "grad_norm": 0.0040282756090164185, "learning_rate": 1.3480299251870325e-05, "loss": 0.0294, "step": 13080 }, { "epoch": 1.6321695760598502, "grad_norm": 13.578664779663086, "learning_rate": 1.3475311720698257e-05, "loss": 0.0791, "step": 13090 }, { "epoch": 1.6334164588528677, "grad_norm": 0.018825553357601166, "learning_rate": 1.3470324189526184e-05, "loss": 0.0313, "step": 13100 }, { "epoch": 1.6346633416458853, "grad_norm": 0.18112289905548096, "learning_rate": 1.3465336658354115e-05, "loss": 0.0004, "step": 13110 }, { "epoch": 1.6359102244389028, "grad_norm": 0.03131983429193497, "learning_rate": 1.3460349127182046e-05, "loss": 0.0008, "step": 13120 }, { "epoch": 1.6371571072319202, "grad_norm": 6.653219699859619, "learning_rate": 1.3455361596009976e-05, "loss": 0.0314, "step": 13130 }, { "epoch": 1.6384039900249376, "grad_norm": 1.7269296646118164, "learning_rate": 1.3450374064837905e-05, "loss": 0.0793, "step": 13140 }, { "epoch": 1.639650872817955, "grad_norm": 0.011295067146420479, "learning_rate": 1.3445386533665836e-05, "loss": 0.0018, "step": 13150 }, { "epoch": 1.6408977556109727, "grad_norm": 0.5535463690757751, "learning_rate": 1.3440399002493767e-05, "loss": 0.0448, "step": 13160 }, { "epoch": 1.6421446384039902, "grad_norm": 0.045436304062604904, "learning_rate": 1.3435411471321697e-05, "loss": 0.0146, "step": 13170 }, { "epoch": 1.6433915211970076, "grad_norm": 0.12149259448051453, "learning_rate": 1.3430423940149626e-05, "loss": 0.0252, "step": 13180 }, { "epoch": 1.644638403990025, "grad_norm": 0.10686697065830231, "learning_rate": 1.3425436408977557e-05, "loss": 0.0019, "step": 13190 }, { "epoch": 1.6458852867830425, "grad_norm": 0.021616408601403236, "learning_rate": 1.3420448877805487e-05, "loss": 0.0019, "step": 13200 }, { "epoch": 1.64713216957606, "grad_norm": 41.76093673706055, "learning_rate": 1.3415461346633418e-05, "loss": 0.018, "step": 13210 }, { "epoch": 1.6483790523690773, "grad_norm": 0.0063141207210719585, "learning_rate": 1.3410473815461347e-05, "loss": 0.0422, "step": 13220 }, { "epoch": 1.6496259351620948, "grad_norm": 19.624000549316406, "learning_rate": 1.3405486284289277e-05, "loss": 0.0317, "step": 13230 }, { "epoch": 1.6508728179551122, "grad_norm": 0.09289072453975677, "learning_rate": 1.3400498753117208e-05, "loss": 0.053, "step": 13240 }, { "epoch": 1.6521197007481296, "grad_norm": 0.008266476914286613, "learning_rate": 1.3395511221945139e-05, "loss": 0.0634, "step": 13250 }, { "epoch": 1.653366583541147, "grad_norm": 0.05262904614210129, "learning_rate": 1.3390523690773068e-05, "loss": 0.0002, "step": 13260 }, { "epoch": 1.6546134663341645, "grad_norm": 0.014744916930794716, "learning_rate": 1.3385536159600998e-05, "loss": 0.0099, "step": 13270 }, { "epoch": 1.655860349127182, "grad_norm": 0.012456094846129417, "learning_rate": 1.3380548628428929e-05, "loss": 0.0959, "step": 13280 }, { "epoch": 1.6571072319201994, "grad_norm": 0.3287002742290497, "learning_rate": 1.337556109725686e-05, "loss": 0.065, "step": 13290 }, { "epoch": 1.6583541147132168, "grad_norm": 15.193024635314941, "learning_rate": 1.337057356608479e-05, "loss": 0.0479, "step": 13300 }, { "epoch": 1.6596009975062345, "grad_norm": 26.678428649902344, "learning_rate": 1.3365586034912719e-05, "loss": 0.0896, "step": 13310 }, { "epoch": 1.660847880299252, "grad_norm": 0.01345492247492075, "learning_rate": 1.336059850374065e-05, "loss": 0.0075, "step": 13320 }, { "epoch": 1.6620947630922693, "grad_norm": 0.003237128956243396, "learning_rate": 1.335561097256858e-05, "loss": 0.0008, "step": 13330 }, { "epoch": 1.6633416458852868, "grad_norm": 0.012729836627840996, "learning_rate": 1.3350623441396511e-05, "loss": 0.105, "step": 13340 }, { "epoch": 1.6645885286783042, "grad_norm": 0.001644920208491385, "learning_rate": 1.334563591022444e-05, "loss": 0.0701, "step": 13350 }, { "epoch": 1.6658354114713219, "grad_norm": 20.716304779052734, "learning_rate": 1.334064837905237e-05, "loss": 0.0816, "step": 13360 }, { "epoch": 1.6670822942643393, "grad_norm": 0.09023711085319519, "learning_rate": 1.3335660847880301e-05, "loss": 0.1163, "step": 13370 }, { "epoch": 1.6683291770573567, "grad_norm": 8.607697486877441, "learning_rate": 1.3330673316708232e-05, "loss": 0.067, "step": 13380 }, { "epoch": 1.6695760598503742, "grad_norm": 0.024321427568793297, "learning_rate": 1.332568578553616e-05, "loss": 0.1202, "step": 13390 }, { "epoch": 1.6708229426433916, "grad_norm": 0.2328629195690155, "learning_rate": 1.3320698254364091e-05, "loss": 0.0063, "step": 13400 }, { "epoch": 1.672069825436409, "grad_norm": 0.053208332508802414, "learning_rate": 1.3315710723192022e-05, "loss": 0.0353, "step": 13410 }, { "epoch": 1.6733167082294265, "grad_norm": 0.006619223393499851, "learning_rate": 1.3310723192019952e-05, "loss": 0.0316, "step": 13420 }, { "epoch": 1.674563591022444, "grad_norm": 0.006179279647767544, "learning_rate": 1.330573566084788e-05, "loss": 0.0117, "step": 13430 }, { "epoch": 1.6758104738154613, "grad_norm": 4.701325416564941, "learning_rate": 1.3300748129675812e-05, "loss": 0.0027, "step": 13440 }, { "epoch": 1.6770573566084788, "grad_norm": 2.834035634994507, "learning_rate": 1.3296259351620949e-05, "loss": 0.0534, "step": 13450 }, { "epoch": 1.6783042394014962, "grad_norm": 0.005688248202204704, "learning_rate": 1.3291271820448879e-05, "loss": 0.0429, "step": 13460 }, { "epoch": 1.6795511221945136, "grad_norm": 0.11691449582576752, "learning_rate": 1.328628428927681e-05, "loss": 0.0756, "step": 13470 }, { "epoch": 1.680798004987531, "grad_norm": 0.005437224637717009, "learning_rate": 1.328129675810474e-05, "loss": 0.0323, "step": 13480 }, { "epoch": 1.6820448877805485, "grad_norm": 0.0028406723868101835, "learning_rate": 1.327630922693267e-05, "loss": 0.0679, "step": 13490 }, { "epoch": 1.683291770573566, "grad_norm": 1.3356423377990723, "learning_rate": 1.32713216957606e-05, "loss": 0.0044, "step": 13500 }, { "epoch": 1.6845386533665836, "grad_norm": 9.16796875, "learning_rate": 1.326633416458853e-05, "loss": 0.044, "step": 13510 }, { "epoch": 1.685785536159601, "grad_norm": 0.0038144674617797136, "learning_rate": 1.3261346633416461e-05, "loss": 0.0342, "step": 13520 }, { "epoch": 1.6870324189526185, "grad_norm": 0.0027151876129209995, "learning_rate": 1.325635910224439e-05, "loss": 0.077, "step": 13530 }, { "epoch": 1.688279301745636, "grad_norm": 47.315982818603516, "learning_rate": 1.325137157107232e-05, "loss": 0.0227, "step": 13540 }, { "epoch": 1.6895261845386533, "grad_norm": 0.04255508631467819, "learning_rate": 1.3246384039900251e-05, "loss": 0.0003, "step": 13550 }, { "epoch": 1.690773067331671, "grad_norm": 3.864309310913086, "learning_rate": 1.3241396508728182e-05, "loss": 0.0153, "step": 13560 }, { "epoch": 1.6920199501246884, "grad_norm": 0.44293212890625, "learning_rate": 1.323640897755611e-05, "loss": 0.0138, "step": 13570 }, { "epoch": 1.6932668329177059, "grad_norm": 9.212722778320312, "learning_rate": 1.3231421446384041e-05, "loss": 0.084, "step": 13580 }, { "epoch": 1.6945137157107233, "grad_norm": 0.07577641308307648, "learning_rate": 1.3226433915211972e-05, "loss": 0.0004, "step": 13590 }, { "epoch": 1.6957605985037407, "grad_norm": 14.254758834838867, "learning_rate": 1.3221446384039903e-05, "loss": 0.0978, "step": 13600 }, { "epoch": 1.6970074812967582, "grad_norm": 0.5562378168106079, "learning_rate": 1.321645885286783e-05, "loss": 0.0175, "step": 13610 }, { "epoch": 1.6982543640897756, "grad_norm": 0.31031671166419983, "learning_rate": 1.321147132169576e-05, "loss": 0.0909, "step": 13620 }, { "epoch": 1.699501246882793, "grad_norm": 0.018233343958854675, "learning_rate": 1.3206483790523691e-05, "loss": 0.0012, "step": 13630 }, { "epoch": 1.7007481296758105, "grad_norm": 0.01966504380106926, "learning_rate": 1.3201496259351624e-05, "loss": 0.0036, "step": 13640 }, { "epoch": 1.701995012468828, "grad_norm": 0.004362176638096571, "learning_rate": 1.319650872817955e-05, "loss": 0.045, "step": 13650 }, { "epoch": 1.7032418952618453, "grad_norm": 0.02185465581715107, "learning_rate": 1.3191521197007481e-05, "loss": 0.0368, "step": 13660 }, { "epoch": 1.7044887780548628, "grad_norm": 0.01592567190527916, "learning_rate": 1.3186533665835412e-05, "loss": 0.0017, "step": 13670 }, { "epoch": 1.7057356608478802, "grad_norm": 0.002704000100493431, "learning_rate": 1.3181546134663343e-05, "loss": 0.0002, "step": 13680 }, { "epoch": 1.7069825436408976, "grad_norm": 0.01973789371550083, "learning_rate": 1.3176558603491272e-05, "loss": 0.2209, "step": 13690 }, { "epoch": 1.708229426433915, "grad_norm": 0.7285020351409912, "learning_rate": 1.3171571072319202e-05, "loss": 0.0378, "step": 13700 }, { "epoch": 1.7094763092269327, "grad_norm": 0.01300446130335331, "learning_rate": 1.3166583541147133e-05, "loss": 0.0008, "step": 13710 }, { "epoch": 1.7107231920199502, "grad_norm": 0.048056092113256454, "learning_rate": 1.3161596009975063e-05, "loss": 0.0412, "step": 13720 }, { "epoch": 1.7119700748129676, "grad_norm": 0.01191894430667162, "learning_rate": 1.3156608478802994e-05, "loss": 0.0009, "step": 13730 }, { "epoch": 1.713216957605985, "grad_norm": 0.01139442902058363, "learning_rate": 1.3151620947630923e-05, "loss": 0.0161, "step": 13740 }, { "epoch": 1.7144638403990025, "grad_norm": 0.028019532561302185, "learning_rate": 1.3146633416458854e-05, "loss": 0.0103, "step": 13750 }, { "epoch": 1.7157107231920201, "grad_norm": 0.05028171464800835, "learning_rate": 1.3141645885286784e-05, "loss": 0.0328, "step": 13760 }, { "epoch": 1.7169576059850375, "grad_norm": 0.002151243155822158, "learning_rate": 1.3136658354114715e-05, "loss": 0.0004, "step": 13770 }, { "epoch": 1.718204488778055, "grad_norm": 0.06736582517623901, "learning_rate": 1.3131670822942644e-05, "loss": 0.0813, "step": 13780 }, { "epoch": 1.7194513715710724, "grad_norm": 0.1269422173500061, "learning_rate": 1.3126683291770574e-05, "loss": 0.0327, "step": 13790 }, { "epoch": 1.7206982543640899, "grad_norm": 0.00913538970053196, "learning_rate": 1.3121695760598505e-05, "loss": 0.0058, "step": 13800 }, { "epoch": 1.7219451371571073, "grad_norm": 0.0841362401843071, "learning_rate": 1.3116708229426436e-05, "loss": 0.0288, "step": 13810 }, { "epoch": 1.7231920199501247, "grad_norm": 0.007267114706337452, "learning_rate": 1.3111720698254365e-05, "loss": 0.036, "step": 13820 }, { "epoch": 1.7244389027431422, "grad_norm": 0.013743101619184017, "learning_rate": 1.3106733167082295e-05, "loss": 0.1493, "step": 13830 }, { "epoch": 1.7256857855361596, "grad_norm": 0.0051359133794903755, "learning_rate": 1.3101745635910226e-05, "loss": 0.0403, "step": 13840 }, { "epoch": 1.726932668329177, "grad_norm": 0.01619003526866436, "learning_rate": 1.3096758104738156e-05, "loss": 0.0269, "step": 13850 }, { "epoch": 1.7281795511221945, "grad_norm": 0.017638299614191055, "learning_rate": 1.3091770573566085e-05, "loss": 0.0004, "step": 13860 }, { "epoch": 1.729426433915212, "grad_norm": 25.916101455688477, "learning_rate": 1.3086783042394016e-05, "loss": 0.0045, "step": 13870 }, { "epoch": 1.7306733167082293, "grad_norm": 0.033160511404275894, "learning_rate": 1.3081795511221947e-05, "loss": 0.0809, "step": 13880 }, { "epoch": 1.7319201995012468, "grad_norm": 0.00405894685536623, "learning_rate": 1.3076807980049877e-05, "loss": 0.0233, "step": 13890 }, { "epoch": 1.7331670822942642, "grad_norm": 0.003699847497045994, "learning_rate": 1.3071820448877806e-05, "loss": 0.0149, "step": 13900 }, { "epoch": 1.7344139650872819, "grad_norm": 0.003829776542261243, "learning_rate": 1.3066832917705737e-05, "loss": 0.0105, "step": 13910 }, { "epoch": 1.7356608478802993, "grad_norm": 0.0024333177134394646, "learning_rate": 1.3061845386533667e-05, "loss": 0.0835, "step": 13920 }, { "epoch": 1.7369077306733167, "grad_norm": 0.0065351747907698154, "learning_rate": 1.3056857855361598e-05, "loss": 0.0005, "step": 13930 }, { "epoch": 1.7381546134663342, "grad_norm": 0.6707267165184021, "learning_rate": 1.3051870324189527e-05, "loss": 0.002, "step": 13940 }, { "epoch": 1.7394014962593516, "grad_norm": 0.0017735377186909318, "learning_rate": 1.3046882793017457e-05, "loss": 0.0132, "step": 13950 }, { "epoch": 1.7406483790523692, "grad_norm": 0.004138738848268986, "learning_rate": 1.3041895261845388e-05, "loss": 0.0674, "step": 13960 }, { "epoch": 1.7418952618453867, "grad_norm": 0.003494670381769538, "learning_rate": 1.3036907730673319e-05, "loss": 0.0539, "step": 13970 }, { "epoch": 1.7431421446384041, "grad_norm": 0.015017693862318993, "learning_rate": 1.303192019950125e-05, "loss": 0.0721, "step": 13980 }, { "epoch": 1.7443890274314215, "grad_norm": 0.008463400416076183, "learning_rate": 1.3026932668329178e-05, "loss": 0.0173, "step": 13990 }, { "epoch": 1.745635910224439, "grad_norm": 0.008293639868497849, "learning_rate": 1.3021945137157109e-05, "loss": 0.0527, "step": 14000 }, { "epoch": 1.7468827930174564, "grad_norm": 0.012015490792691708, "learning_rate": 1.301695760598504e-05, "loss": 0.0784, "step": 14010 }, { "epoch": 1.7481296758104738, "grad_norm": 0.009153264574706554, "learning_rate": 1.301197007481297e-05, "loss": 0.0005, "step": 14020 }, { "epoch": 1.7493765586034913, "grad_norm": 0.014061570167541504, "learning_rate": 1.3006982543640897e-05, "loss": 0.0163, "step": 14030 }, { "epoch": 1.7506234413965087, "grad_norm": 0.09979971498250961, "learning_rate": 1.3001995012468828e-05, "loss": 0.0026, "step": 14040 }, { "epoch": 1.7518703241895262, "grad_norm": 0.013468071818351746, "learning_rate": 1.299700748129676e-05, "loss": 0.0016, "step": 14050 }, { "epoch": 1.7531172069825436, "grad_norm": 0.037439607083797455, "learning_rate": 1.2992019950124691e-05, "loss": 0.0009, "step": 14060 }, { "epoch": 1.754364089775561, "grad_norm": 0.004306161310523748, "learning_rate": 1.2987032418952618e-05, "loss": 0.0004, "step": 14070 }, { "epoch": 1.7556109725685785, "grad_norm": 0.009254061616957188, "learning_rate": 1.2982044887780549e-05, "loss": 0.0003, "step": 14080 }, { "epoch": 1.7568578553615959, "grad_norm": 0.002783331088721752, "learning_rate": 1.297705735660848e-05, "loss": 0.0178, "step": 14090 }, { "epoch": 1.7581047381546133, "grad_norm": 2.467909097671509, "learning_rate": 1.297206982543641e-05, "loss": 0.099, "step": 14100 }, { "epoch": 1.7593516209476308, "grad_norm": 0.0018527753418311477, "learning_rate": 1.2967082294264339e-05, "loss": 0.0005, "step": 14110 }, { "epoch": 1.7605985037406484, "grad_norm": 0.011839174665510654, "learning_rate": 1.296209476309227e-05, "loss": 0.0006, "step": 14120 }, { "epoch": 1.7618453865336658, "grad_norm": 0.2751910388469696, "learning_rate": 1.29571072319202e-05, "loss": 0.0003, "step": 14130 }, { "epoch": 1.7630922693266833, "grad_norm": 0.005288688465952873, "learning_rate": 1.295211970074813e-05, "loss": 0.0527, "step": 14140 }, { "epoch": 1.7643391521197007, "grad_norm": 0.0022090657148510218, "learning_rate": 1.294713216957606e-05, "loss": 0.043, "step": 14150 }, { "epoch": 1.7655860349127181, "grad_norm": 0.07446596771478653, "learning_rate": 1.294214463840399e-05, "loss": 0.0411, "step": 14160 }, { "epoch": 1.7668329177057358, "grad_norm": 15.950480461120605, "learning_rate": 1.2937157107231921e-05, "loss": 0.0487, "step": 14170 }, { "epoch": 1.7680798004987532, "grad_norm": 0.001634975546039641, "learning_rate": 1.2932169576059852e-05, "loss": 0.0003, "step": 14180 }, { "epoch": 1.7693266832917707, "grad_norm": 0.007447084411978722, "learning_rate": 1.292718204488778e-05, "loss": 0.0424, "step": 14190 }, { "epoch": 1.770573566084788, "grad_norm": 27.95421600341797, "learning_rate": 1.2922194513715711e-05, "loss": 0.0173, "step": 14200 }, { "epoch": 1.7718204488778055, "grad_norm": 0.0031554321758449078, "learning_rate": 1.2917206982543642e-05, "loss": 0.0199, "step": 14210 }, { "epoch": 1.773067331670823, "grad_norm": 0.0069986735470592976, "learning_rate": 1.2912219451371572e-05, "loss": 0.0115, "step": 14220 }, { "epoch": 1.7743142144638404, "grad_norm": 0.01298275962471962, "learning_rate": 1.2907231920199503e-05, "loss": 0.0625, "step": 14230 }, { "epoch": 1.7755610972568578, "grad_norm": 0.01622236892580986, "learning_rate": 1.2902244389027432e-05, "loss": 0.008, "step": 14240 }, { "epoch": 1.7768079800498753, "grad_norm": 0.012705344706773758, "learning_rate": 1.2897256857855363e-05, "loss": 0.0186, "step": 14250 }, { "epoch": 1.7780548628428927, "grad_norm": 21.812990188598633, "learning_rate": 1.2892269326683293e-05, "loss": 0.0209, "step": 14260 }, { "epoch": 1.7793017456359101, "grad_norm": 0.06607460975646973, "learning_rate": 1.2887281795511224e-05, "loss": 0.0421, "step": 14270 }, { "epoch": 1.7805486284289276, "grad_norm": 0.559027910232544, "learning_rate": 1.2882294264339153e-05, "loss": 0.0309, "step": 14280 }, { "epoch": 1.781795511221945, "grad_norm": 0.007746709045022726, "learning_rate": 1.2877306733167083e-05, "loss": 0.0147, "step": 14290 }, { "epoch": 1.7830423940149625, "grad_norm": 0.002600023988634348, "learning_rate": 1.2872319201995014e-05, "loss": 0.0383, "step": 14300 }, { "epoch": 1.7842892768079799, "grad_norm": 0.004319756757467985, "learning_rate": 1.2867331670822945e-05, "loss": 0.0005, "step": 14310 }, { "epoch": 1.7855361596009975, "grad_norm": 0.004818111192435026, "learning_rate": 1.2862344139650873e-05, "loss": 0.0221, "step": 14320 }, { "epoch": 1.786783042394015, "grad_norm": 0.007039282936602831, "learning_rate": 1.2857356608478804e-05, "loss": 0.0003, "step": 14330 }, { "epoch": 1.7880299251870324, "grad_norm": 0.022477056831121445, "learning_rate": 1.2852369077306735e-05, "loss": 0.0019, "step": 14340 }, { "epoch": 1.7892768079800498, "grad_norm": 0.12508761882781982, "learning_rate": 1.2847381546134665e-05, "loss": 0.0665, "step": 14350 }, { "epoch": 1.7905236907730673, "grad_norm": 1.3086934089660645, "learning_rate": 1.2842394014962594e-05, "loss": 0.0309, "step": 14360 }, { "epoch": 1.791770573566085, "grad_norm": 0.00528644397854805, "learning_rate": 1.2837406483790525e-05, "loss": 0.0002, "step": 14370 }, { "epoch": 1.7930174563591024, "grad_norm": 3.16678524017334, "learning_rate": 1.2832418952618455e-05, "loss": 0.052, "step": 14380 }, { "epoch": 1.7942643391521198, "grad_norm": 0.11913002282381058, "learning_rate": 1.2827431421446386e-05, "loss": 0.0003, "step": 14390 }, { "epoch": 1.7955112219451372, "grad_norm": 16.50526237487793, "learning_rate": 1.2822443890274315e-05, "loss": 0.0356, "step": 14400 }, { "epoch": 1.7967581047381547, "grad_norm": 0.0016488219844177365, "learning_rate": 1.2817456359102246e-05, "loss": 0.0841, "step": 14410 }, { "epoch": 1.798004987531172, "grad_norm": 36.24075698852539, "learning_rate": 1.2812468827930176e-05, "loss": 0.0096, "step": 14420 }, { "epoch": 1.7992518703241895, "grad_norm": 7.698852062225342, "learning_rate": 1.2807481296758107e-05, "loss": 0.0015, "step": 14430 }, { "epoch": 1.800498753117207, "grad_norm": 0.0017419640207663178, "learning_rate": 1.2802493765586034e-05, "loss": 0.0724, "step": 14440 }, { "epoch": 1.8017456359102244, "grad_norm": 12.817113876342773, "learning_rate": 1.2797506234413965e-05, "loss": 0.0966, "step": 14450 }, { "epoch": 1.8029925187032418, "grad_norm": 0.006786948535591364, "learning_rate": 1.2792518703241897e-05, "loss": 0.0309, "step": 14460 }, { "epoch": 1.8042394014962593, "grad_norm": 0.046667564660310745, "learning_rate": 1.2787531172069828e-05, "loss": 0.0011, "step": 14470 }, { "epoch": 1.8054862842892767, "grad_norm": 18.406494140625, "learning_rate": 1.2782543640897758e-05, "loss": 0.0441, "step": 14480 }, { "epoch": 1.8067331670822941, "grad_norm": 0.14043211936950684, "learning_rate": 1.2777556109725686e-05, "loss": 0.0286, "step": 14490 }, { "epoch": 1.8079800498753116, "grad_norm": 0.0052274251356720924, "learning_rate": 1.2772568578553616e-05, "loss": 0.0556, "step": 14500 }, { "epoch": 1.809226932668329, "grad_norm": 0.038864616304636, "learning_rate": 1.2767581047381547e-05, "loss": 0.0005, "step": 14510 }, { "epoch": 1.8104738154613467, "grad_norm": 0.011043121106922626, "learning_rate": 1.2762593516209479e-05, "loss": 0.0702, "step": 14520 }, { "epoch": 1.811720698254364, "grad_norm": 0.34502744674682617, "learning_rate": 1.2757605985037406e-05, "loss": 0.0249, "step": 14530 }, { "epoch": 1.8129675810473815, "grad_norm": 0.7066327929496765, "learning_rate": 1.2752618453865337e-05, "loss": 0.0004, "step": 14540 }, { "epoch": 1.814214463840399, "grad_norm": 0.008470536209642887, "learning_rate": 1.2747630922693268e-05, "loss": 0.0475, "step": 14550 }, { "epoch": 1.8154613466334164, "grad_norm": 0.02327272854745388, "learning_rate": 1.2742643391521198e-05, "loss": 0.0482, "step": 14560 }, { "epoch": 1.816708229426434, "grad_norm": 0.025802046060562134, "learning_rate": 1.2737655860349127e-05, "loss": 0.0017, "step": 14570 }, { "epoch": 1.8179551122194515, "grad_norm": 0.004240632988512516, "learning_rate": 1.2732668329177058e-05, "loss": 0.0158, "step": 14580 }, { "epoch": 1.819201995012469, "grad_norm": 0.03840193897485733, "learning_rate": 1.2727680798004988e-05, "loss": 0.0602, "step": 14590 }, { "epoch": 1.8204488778054864, "grad_norm": 71.08118438720703, "learning_rate": 1.2722693266832919e-05, "loss": 0.0875, "step": 14600 }, { "epoch": 1.8216957605985038, "grad_norm": 0.010022374801337719, "learning_rate": 1.2717705735660848e-05, "loss": 0.0301, "step": 14610 }, { "epoch": 1.8229426433915212, "grad_norm": 41.78736877441406, "learning_rate": 1.2712718204488778e-05, "loss": 0.0334, "step": 14620 }, { "epoch": 1.8241895261845387, "grad_norm": 0.00742789451032877, "learning_rate": 1.2707730673316709e-05, "loss": 0.0003, "step": 14630 }, { "epoch": 1.825436408977556, "grad_norm": 0.013341421261429787, "learning_rate": 1.270274314214464e-05, "loss": 0.0258, "step": 14640 }, { "epoch": 1.8266832917705735, "grad_norm": 0.02488613687455654, "learning_rate": 1.2697755610972569e-05, "loss": 0.0536, "step": 14650 }, { "epoch": 1.827930174563591, "grad_norm": 0.0047700293362140656, "learning_rate": 1.26927680798005e-05, "loss": 0.0003, "step": 14660 }, { "epoch": 1.8291770573566084, "grad_norm": 0.18093907833099365, "learning_rate": 1.268778054862843e-05, "loss": 0.0003, "step": 14670 }, { "epoch": 1.8304239401496258, "grad_norm": 0.004082898609340191, "learning_rate": 1.268279301745636e-05, "loss": 0.0738, "step": 14680 }, { "epoch": 1.8316708229426433, "grad_norm": 0.01219853200018406, "learning_rate": 1.2677805486284291e-05, "loss": 0.01, "step": 14690 }, { "epoch": 1.8329177057356607, "grad_norm": 0.012780722230672836, "learning_rate": 1.267281795511222e-05, "loss": 0.1225, "step": 14700 }, { "epoch": 1.8341645885286781, "grad_norm": 0.011909456923604012, "learning_rate": 1.266783042394015e-05, "loss": 0.1042, "step": 14710 }, { "epoch": 1.8354114713216958, "grad_norm": 0.010442191734910011, "learning_rate": 1.2662842892768081e-05, "loss": 0.0065, "step": 14720 }, { "epoch": 1.8366583541147132, "grad_norm": 2.012455940246582, "learning_rate": 1.2657855361596012e-05, "loss": 0.0284, "step": 14730 }, { "epoch": 1.8379052369077307, "grad_norm": 0.023272445425391197, "learning_rate": 1.265286783042394e-05, "loss": 0.0042, "step": 14740 }, { "epoch": 1.839152119700748, "grad_norm": 0.013164395466446877, "learning_rate": 1.2647880299251871e-05, "loss": 0.0005, "step": 14750 }, { "epoch": 1.8403990024937655, "grad_norm": 0.006373642943799496, "learning_rate": 1.2642892768079802e-05, "loss": 0.0036, "step": 14760 }, { "epoch": 1.8416458852867832, "grad_norm": 0.021037481725215912, "learning_rate": 1.2637905236907733e-05, "loss": 0.0721, "step": 14770 }, { "epoch": 1.8428927680798006, "grad_norm": 23.66687774658203, "learning_rate": 1.2632917705735662e-05, "loss": 0.0069, "step": 14780 }, { "epoch": 1.844139650872818, "grad_norm": 0.4130190908908844, "learning_rate": 1.2627930174563592e-05, "loss": 0.0017, "step": 14790 }, { "epoch": 1.8453865336658355, "grad_norm": 0.0021154251880943775, "learning_rate": 1.2622942643391523e-05, "loss": 0.0153, "step": 14800 }, { "epoch": 1.846633416458853, "grad_norm": 0.003588662948459387, "learning_rate": 1.2617955112219453e-05, "loss": 0.0021, "step": 14810 }, { "epoch": 1.8478802992518704, "grad_norm": 0.004530202131718397, "learning_rate": 1.2612967581047382e-05, "loss": 0.0238, "step": 14820 }, { "epoch": 1.8491271820448878, "grad_norm": 0.02361251227557659, "learning_rate": 1.2607980049875313e-05, "loss": 0.0414, "step": 14830 }, { "epoch": 1.8503740648379052, "grad_norm": 0.012375129386782646, "learning_rate": 1.2602992518703244e-05, "loss": 0.0006, "step": 14840 }, { "epoch": 1.8516209476309227, "grad_norm": 0.00249878759495914, "learning_rate": 1.2598004987531174e-05, "loss": 0.0198, "step": 14850 }, { "epoch": 1.85286783042394, "grad_norm": 0.008120937272906303, "learning_rate": 1.2593017456359101e-05, "loss": 0.1278, "step": 14860 }, { "epoch": 1.8541147132169575, "grad_norm": 1.1773933172225952, "learning_rate": 1.2588029925187034e-05, "loss": 0.0037, "step": 14870 }, { "epoch": 1.855361596009975, "grad_norm": 0.10049566626548767, "learning_rate": 1.2583042394014964e-05, "loss": 0.0004, "step": 14880 }, { "epoch": 1.8566084788029924, "grad_norm": 0.029792172834277153, "learning_rate": 1.2578054862842895e-05, "loss": 0.0048, "step": 14890 }, { "epoch": 1.8578553615960098, "grad_norm": 12.35927963256836, "learning_rate": 1.2573067331670822e-05, "loss": 0.0016, "step": 14900 }, { "epoch": 1.8591022443890273, "grad_norm": 0.00204622489400208, "learning_rate": 1.2568079800498753e-05, "loss": 0.0002, "step": 14910 }, { "epoch": 1.860349127182045, "grad_norm": 0.014671762473881245, "learning_rate": 1.2563092269326684e-05, "loss": 0.0143, "step": 14920 }, { "epoch": 1.8615960099750624, "grad_norm": 0.0035495534539222717, "learning_rate": 1.2558104738154616e-05, "loss": 0.0276, "step": 14930 }, { "epoch": 1.8628428927680798, "grad_norm": 0.0024204726796597242, "learning_rate": 1.2553117206982546e-05, "loss": 0.0546, "step": 14940 }, { "epoch": 1.8640897755610972, "grad_norm": 0.002721786266192794, "learning_rate": 1.2548129675810474e-05, "loss": 0.0434, "step": 14950 }, { "epoch": 1.8653366583541147, "grad_norm": 14.899277687072754, "learning_rate": 1.2543142144638404e-05, "loss": 0.0351, "step": 14960 }, { "epoch": 1.8665835411471323, "grad_norm": 0.0012579448521137238, "learning_rate": 1.2538154613466335e-05, "loss": 0.0452, "step": 14970 }, { "epoch": 1.8678304239401498, "grad_norm": 0.006773448083549738, "learning_rate": 1.2533167082294266e-05, "loss": 0.0017, "step": 14980 }, { "epoch": 1.8690773067331672, "grad_norm": 0.0034950890112668276, "learning_rate": 1.2528179551122194e-05, "loss": 0.058, "step": 14990 }, { "epoch": 1.8703241895261846, "grad_norm": 0.007557340431958437, "learning_rate": 1.2523192019950125e-05, "loss": 0.0452, "step": 15000 }, { "epoch": 1.871571072319202, "grad_norm": 0.025978926569223404, "learning_rate": 1.2518204488778056e-05, "loss": 0.0006, "step": 15010 }, { "epoch": 1.8728179551122195, "grad_norm": 0.003996451385319233, "learning_rate": 1.2513216957605986e-05, "loss": 0.0024, "step": 15020 }, { "epoch": 1.874064837905237, "grad_norm": 0.0036994144320487976, "learning_rate": 1.2508229426433915e-05, "loss": 0.0314, "step": 15030 }, { "epoch": 1.8753117206982544, "grad_norm": 0.004972072783857584, "learning_rate": 1.2503241895261846e-05, "loss": 0.0292, "step": 15040 }, { "epoch": 1.8765586034912718, "grad_norm": 0.0011615961557254195, "learning_rate": 1.2498254364089776e-05, "loss": 0.0122, "step": 15050 }, { "epoch": 1.8778054862842892, "grad_norm": 0.009926113300025463, "learning_rate": 1.2493266832917707e-05, "loss": 0.0533, "step": 15060 }, { "epoch": 1.8790523690773067, "grad_norm": 0.0035635512322187424, "learning_rate": 1.2488279301745636e-05, "loss": 0.0677, "step": 15070 }, { "epoch": 1.880299251870324, "grad_norm": 0.0073900381103158, "learning_rate": 1.2483291770573567e-05, "loss": 0.0003, "step": 15080 }, { "epoch": 1.8815461346633415, "grad_norm": 0.015382417477667332, "learning_rate": 1.2478304239401497e-05, "loss": 0.0003, "step": 15090 }, { "epoch": 1.882793017456359, "grad_norm": 0.005023096688091755, "learning_rate": 1.2473316708229428e-05, "loss": 0.0011, "step": 15100 }, { "epoch": 1.8840399002493764, "grad_norm": 0.0024068865459412336, "learning_rate": 1.2468329177057357e-05, "loss": 0.0302, "step": 15110 }, { "epoch": 1.885286783042394, "grad_norm": 0.01790803112089634, "learning_rate": 1.2463341645885287e-05, "loss": 0.0005, "step": 15120 }, { "epoch": 1.8865336658354115, "grad_norm": 0.002026566304266453, "learning_rate": 1.2458354114713218e-05, "loss": 0.0379, "step": 15130 }, { "epoch": 1.887780548628429, "grad_norm": 0.05049702525138855, "learning_rate": 1.2453366583541149e-05, "loss": 0.0142, "step": 15140 }, { "epoch": 1.8890274314214464, "grad_norm": 27.768470764160156, "learning_rate": 1.2448379052369078e-05, "loss": 0.0653, "step": 15150 }, { "epoch": 1.8902743142144638, "grad_norm": 0.016509858891367912, "learning_rate": 1.2443391521197008e-05, "loss": 0.0102, "step": 15160 }, { "epoch": 1.8915211970074814, "grad_norm": 0.9496610760688782, "learning_rate": 1.2438403990024939e-05, "loss": 0.0006, "step": 15170 }, { "epoch": 1.8927680798004989, "grad_norm": 54.05585479736328, "learning_rate": 1.243341645885287e-05, "loss": 0.1083, "step": 15180 }, { "epoch": 1.8940149625935163, "grad_norm": 0.021726036444306374, "learning_rate": 1.24284289276808e-05, "loss": 0.0922, "step": 15190 }, { "epoch": 1.8952618453865338, "grad_norm": 0.008410331793129444, "learning_rate": 1.2423441396508729e-05, "loss": 0.0066, "step": 15200 }, { "epoch": 1.8965087281795512, "grad_norm": 10.102691650390625, "learning_rate": 1.241845386533666e-05, "loss": 0.0943, "step": 15210 }, { "epoch": 1.8977556109725686, "grad_norm": 16.93928337097168, "learning_rate": 1.241346633416459e-05, "loss": 0.0458, "step": 15220 }, { "epoch": 1.899002493765586, "grad_norm": 0.0049775936640799046, "learning_rate": 1.2408478802992521e-05, "loss": 0.0003, "step": 15230 }, { "epoch": 1.9002493765586035, "grad_norm": 40.914817810058594, "learning_rate": 1.240349127182045e-05, "loss": 0.0349, "step": 15240 }, { "epoch": 1.901496259351621, "grad_norm": 0.0028333088848739862, "learning_rate": 1.239850374064838e-05, "loss": 0.0035, "step": 15250 }, { "epoch": 1.9027431421446384, "grad_norm": 0.3201017379760742, "learning_rate": 1.2393516209476311e-05, "loss": 0.0065, "step": 15260 }, { "epoch": 1.9039900249376558, "grad_norm": 0.007169653195887804, "learning_rate": 1.2388528678304242e-05, "loss": 0.0231, "step": 15270 }, { "epoch": 1.9052369077306732, "grad_norm": 0.002448942745104432, "learning_rate": 1.238354114713217e-05, "loss": 0.0394, "step": 15280 }, { "epoch": 1.9064837905236907, "grad_norm": 2.1332430839538574, "learning_rate": 1.2378553615960101e-05, "loss": 0.0195, "step": 15290 }, { "epoch": 1.907730673316708, "grad_norm": 0.1876264065504074, "learning_rate": 1.2373566084788032e-05, "loss": 0.0685, "step": 15300 }, { "epoch": 1.9089775561097255, "grad_norm": 0.012347985059022903, "learning_rate": 1.2368578553615962e-05, "loss": 0.0733, "step": 15310 }, { "epoch": 1.9102244389027432, "grad_norm": 0.007458592299371958, "learning_rate": 1.236359102244389e-05, "loss": 0.049, "step": 15320 }, { "epoch": 1.9114713216957606, "grad_norm": 0.0171657707542181, "learning_rate": 1.2358603491271822e-05, "loss": 0.0267, "step": 15330 }, { "epoch": 1.912718204488778, "grad_norm": 0.037479467689991, "learning_rate": 1.2353615960099753e-05, "loss": 0.0227, "step": 15340 }, { "epoch": 1.9139650872817955, "grad_norm": 0.23706205189228058, "learning_rate": 1.2348628428927683e-05, "loss": 0.0251, "step": 15350 }, { "epoch": 1.915211970074813, "grad_norm": 0.02107255533337593, "learning_rate": 1.234364089775561e-05, "loss": 0.0005, "step": 15360 }, { "epoch": 1.9164588528678306, "grad_norm": 0.0038479601498693228, "learning_rate": 1.2338653366583541e-05, "loss": 0.091, "step": 15370 }, { "epoch": 1.917705735660848, "grad_norm": 0.0024802994448691607, "learning_rate": 1.2333665835411472e-05, "loss": 0.0436, "step": 15380 }, { "epoch": 1.9189526184538654, "grad_norm": 0.00154173094779253, "learning_rate": 1.2328678304239404e-05, "loss": 0.0002, "step": 15390 }, { "epoch": 1.9201995012468829, "grad_norm": 0.021371448412537575, "learning_rate": 1.2323690773067331e-05, "loss": 0.1347, "step": 15400 }, { "epoch": 1.9214463840399003, "grad_norm": 0.809636652469635, "learning_rate": 1.2318703241895262e-05, "loss": 0.0564, "step": 15410 }, { "epoch": 1.9226932668329177, "grad_norm": 0.01050503272563219, "learning_rate": 1.2313715710723192e-05, "loss": 0.0089, "step": 15420 }, { "epoch": 1.9239401496259352, "grad_norm": 0.02801765315234661, "learning_rate": 1.2308728179551123e-05, "loss": 0.0365, "step": 15430 }, { "epoch": 1.9251870324189526, "grad_norm": 0.005082667805254459, "learning_rate": 1.2303740648379054e-05, "loss": 0.0059, "step": 15440 }, { "epoch": 1.92643391521197, "grad_norm": 0.005822836421430111, "learning_rate": 1.2298753117206983e-05, "loss": 0.0035, "step": 15450 }, { "epoch": 1.9276807980049875, "grad_norm": 0.0021450971253216267, "learning_rate": 1.2293765586034913e-05, "loss": 0.0004, "step": 15460 }, { "epoch": 1.928927680798005, "grad_norm": 0.003937265835702419, "learning_rate": 1.2288778054862844e-05, "loss": 0.0006, "step": 15470 }, { "epoch": 1.9301745635910224, "grad_norm": 0.008908499032258987, "learning_rate": 1.2283790523690774e-05, "loss": 0.0385, "step": 15480 }, { "epoch": 1.9314214463840398, "grad_norm": 0.15880395472049713, "learning_rate": 1.2278802992518703e-05, "loss": 0.0386, "step": 15490 }, { "epoch": 1.9326683291770572, "grad_norm": 0.00402474170550704, "learning_rate": 1.2273815461346634e-05, "loss": 0.0295, "step": 15500 }, { "epoch": 1.9339152119700747, "grad_norm": 0.038588814437389374, "learning_rate": 1.2268827930174565e-05, "loss": 0.0339, "step": 15510 }, { "epoch": 1.9351620947630923, "grad_norm": 0.004865721333771944, "learning_rate": 1.2263840399002495e-05, "loss": 0.0102, "step": 15520 }, { "epoch": 1.9364089775561097, "grad_norm": 0.415129154920578, "learning_rate": 1.2258852867830424e-05, "loss": 0.1301, "step": 15530 }, { "epoch": 1.9376558603491272, "grad_norm": 0.01758623868227005, "learning_rate": 1.2253865336658355e-05, "loss": 0.0191, "step": 15540 }, { "epoch": 1.9389027431421446, "grad_norm": 19.56462860107422, "learning_rate": 1.2248877805486285e-05, "loss": 0.0375, "step": 15550 }, { "epoch": 1.940149625935162, "grad_norm": 0.0069420519284904, "learning_rate": 1.2243890274314216e-05, "loss": 0.0007, "step": 15560 }, { "epoch": 1.9413965087281797, "grad_norm": 0.005758563056588173, "learning_rate": 1.2238902743142145e-05, "loss": 0.0156, "step": 15570 }, { "epoch": 1.9426433915211971, "grad_norm": 0.014544529840350151, "learning_rate": 1.2233915211970076e-05, "loss": 0.0002, "step": 15580 }, { "epoch": 1.9438902743142146, "grad_norm": 0.051376599818468094, "learning_rate": 1.2228927680798006e-05, "loss": 0.116, "step": 15590 }, { "epoch": 1.945137157107232, "grad_norm": 0.005440854001790285, "learning_rate": 1.2223940149625937e-05, "loss": 0.0246, "step": 15600 }, { "epoch": 1.9463840399002494, "grad_norm": 0.012521665543317795, "learning_rate": 1.2218952618453866e-05, "loss": 0.0378, "step": 15610 }, { "epoch": 1.9476309226932669, "grad_norm": 9.66970443725586, "learning_rate": 1.2213965087281796e-05, "loss": 0.0433, "step": 15620 }, { "epoch": 1.9488778054862843, "grad_norm": 0.010639001615345478, "learning_rate": 1.2208977556109727e-05, "loss": 0.0421, "step": 15630 }, { "epoch": 1.9501246882793017, "grad_norm": 12.684861183166504, "learning_rate": 1.2203990024937658e-05, "loss": 0.1112, "step": 15640 }, { "epoch": 1.9513715710723192, "grad_norm": 0.013296867720782757, "learning_rate": 1.2199002493765587e-05, "loss": 0.0455, "step": 15650 }, { "epoch": 1.9526184538653366, "grad_norm": 0.018770115450024605, "learning_rate": 1.2194014962593517e-05, "loss": 0.0144, "step": 15660 }, { "epoch": 1.953865336658354, "grad_norm": 0.017290420830249786, "learning_rate": 1.2189027431421448e-05, "loss": 0.0195, "step": 15670 }, { "epoch": 1.9551122194513715, "grad_norm": 0.9697034955024719, "learning_rate": 1.2184039900249378e-05, "loss": 0.0517, "step": 15680 }, { "epoch": 1.956359102244389, "grad_norm": 0.011194895952939987, "learning_rate": 1.2179052369077309e-05, "loss": 0.0004, "step": 15690 }, { "epoch": 1.9576059850374063, "grad_norm": 23.957162857055664, "learning_rate": 1.2174563591022446e-05, "loss": 0.1551, "step": 15700 }, { "epoch": 1.9588528678304238, "grad_norm": 0.026693115010857582, "learning_rate": 1.2169576059850374e-05, "loss": 0.0011, "step": 15710 }, { "epoch": 1.9600997506234414, "grad_norm": 0.3291858434677124, "learning_rate": 1.2164588528678305e-05, "loss": 0.0524, "step": 15720 }, { "epoch": 1.9613466334164589, "grad_norm": 0.006115868221968412, "learning_rate": 1.2159600997506236e-05, "loss": 0.0008, "step": 15730 }, { "epoch": 1.9625935162094763, "grad_norm": 0.002324812114238739, "learning_rate": 1.2154613466334166e-05, "loss": 0.0008, "step": 15740 }, { "epoch": 1.9638403990024937, "grad_norm": 0.013870062306523323, "learning_rate": 1.2149625935162095e-05, "loss": 0.0449, "step": 15750 }, { "epoch": 1.9650872817955112, "grad_norm": 0.02065792866051197, "learning_rate": 1.2144638403990026e-05, "loss": 0.101, "step": 15760 }, { "epoch": 1.9663341645885288, "grad_norm": 0.06540193408727646, "learning_rate": 1.2139650872817957e-05, "loss": 0.0092, "step": 15770 }, { "epoch": 1.9675810473815463, "grad_norm": 0.766767144203186, "learning_rate": 1.2134663341645887e-05, "loss": 0.0006, "step": 15780 }, { "epoch": 1.9688279301745637, "grad_norm": 0.024145884439349174, "learning_rate": 1.2129675810473816e-05, "loss": 0.0008, "step": 15790 }, { "epoch": 1.9700748129675811, "grad_norm": 0.009867561981081963, "learning_rate": 1.2124688279301747e-05, "loss": 0.0454, "step": 15800 }, { "epoch": 1.9713216957605986, "grad_norm": 0.01759646274149418, "learning_rate": 1.2119700748129677e-05, "loss": 0.0003, "step": 15810 }, { "epoch": 1.972568578553616, "grad_norm": 0.018786191940307617, "learning_rate": 1.2114713216957608e-05, "loss": 0.0423, "step": 15820 }, { "epoch": 1.9738154613466334, "grad_norm": 0.02426152490079403, "learning_rate": 1.2109725685785537e-05, "loss": 0.0601, "step": 15830 }, { "epoch": 1.9750623441396509, "grad_norm": 0.004218484740704298, "learning_rate": 1.2104738154613467e-05, "loss": 0.0002, "step": 15840 }, { "epoch": 1.9763092269326683, "grad_norm": 0.012382776476442814, "learning_rate": 1.2099750623441398e-05, "loss": 0.0194, "step": 15850 }, { "epoch": 1.9775561097256857, "grad_norm": 0.010796369053423405, "learning_rate": 1.2094763092269329e-05, "loss": 0.022, "step": 15860 }, { "epoch": 1.9788029925187032, "grad_norm": 0.008351016789674759, "learning_rate": 1.208977556109726e-05, "loss": 0.0394, "step": 15870 }, { "epoch": 1.9800498753117206, "grad_norm": 0.0044389450922608376, "learning_rate": 1.2084788029925188e-05, "loss": 0.045, "step": 15880 }, { "epoch": 1.981296758104738, "grad_norm": 0.03672898933291435, "learning_rate": 1.2079800498753119e-05, "loss": 0.0536, "step": 15890 }, { "epoch": 1.9825436408977555, "grad_norm": 0.05319588631391525, "learning_rate": 1.207481296758105e-05, "loss": 0.0828, "step": 15900 }, { "epoch": 1.983790523690773, "grad_norm": 5.178584098815918, "learning_rate": 1.206982543640898e-05, "loss": 0.1052, "step": 15910 }, { "epoch": 1.9850374064837906, "grad_norm": 0.005634423345327377, "learning_rate": 1.2064837905236907e-05, "loss": 0.0469, "step": 15920 }, { "epoch": 1.986284289276808, "grad_norm": 0.03831524774432182, "learning_rate": 1.2059850374064838e-05, "loss": 0.0012, "step": 15930 }, { "epoch": 1.9875311720698254, "grad_norm": 0.006244161166250706, "learning_rate": 1.205486284289277e-05, "loss": 0.0126, "step": 15940 }, { "epoch": 1.9887780548628429, "grad_norm": 7.760480880737305, "learning_rate": 1.2049875311720701e-05, "loss": 0.0419, "step": 15950 }, { "epoch": 1.9900249376558603, "grad_norm": 5.048219203948975, "learning_rate": 1.2044887780548628e-05, "loss": 0.0377, "step": 15960 }, { "epoch": 1.991271820448878, "grad_norm": 0.0169609934091568, "learning_rate": 1.2039900249376559e-05, "loss": 0.0487, "step": 15970 }, { "epoch": 1.9925187032418954, "grad_norm": 0.020963355898857117, "learning_rate": 1.203491271820449e-05, "loss": 0.0217, "step": 15980 }, { "epoch": 1.9937655860349128, "grad_norm": 0.008009551092982292, "learning_rate": 1.202992518703242e-05, "loss": 0.0015, "step": 15990 }, { "epoch": 1.9950124688279303, "grad_norm": 0.01416187733411789, "learning_rate": 1.2024937655860349e-05, "loss": 0.0373, "step": 16000 }, { "epoch": 1.9962593516209477, "grad_norm": 0.038335274904966354, "learning_rate": 1.201995012468828e-05, "loss": 0.0381, "step": 16010 }, { "epoch": 1.9975062344139651, "grad_norm": 0.04764119163155556, "learning_rate": 1.201496259351621e-05, "loss": 0.0308, "step": 16020 }, { "epoch": 1.9987531172069826, "grad_norm": 9.592329978942871, "learning_rate": 1.200997506234414e-05, "loss": 0.0543, "step": 16030 }, { "epoch": 2.0, "grad_norm": 0.008706348016858101, "learning_rate": 1.200498753117207e-05, "loss": 0.0003, "step": 16040 }, { "epoch": 2.0, "eval_accuracy": 0.9936405012781345, "eval_loss": 0.03504941612482071, "eval_runtime": 17.7412, "eval_samples_per_second": 904.053, "eval_steps_per_second": 56.535, "step": 16040 }, { "epoch": 2.0012468827930174, "grad_norm": 0.006675767712295055, "learning_rate": 1.2e-05, "loss": 0.0004, "step": 16050 }, { "epoch": 2.002493765586035, "grad_norm": 0.0029956961516290903, "learning_rate": 1.1995012468827931e-05, "loss": 0.0002, "step": 16060 }, { "epoch": 2.0037406483790523, "grad_norm": 0.024461830034852028, "learning_rate": 1.1990024937655862e-05, "loss": 0.0002, "step": 16070 }, { "epoch": 2.0049875311720697, "grad_norm": 0.024466920644044876, "learning_rate": 1.198503740648379e-05, "loss": 0.0002, "step": 16080 }, { "epoch": 2.006234413965087, "grad_norm": 0.004781090654432774, "learning_rate": 1.1980049875311721e-05, "loss": 0.0254, "step": 16090 }, { "epoch": 2.0074812967581046, "grad_norm": 55.558677673339844, "learning_rate": 1.1975062344139652e-05, "loss": 0.0569, "step": 16100 }, { "epoch": 2.008728179551122, "grad_norm": 0.007196883670985699, "learning_rate": 1.1970074812967582e-05, "loss": 0.0002, "step": 16110 }, { "epoch": 2.0099750623441395, "grad_norm": 0.002488025464117527, "learning_rate": 1.1965087281795513e-05, "loss": 0.0018, "step": 16120 }, { "epoch": 2.011221945137157, "grad_norm": 0.0017350780544802547, "learning_rate": 1.1960099750623442e-05, "loss": 0.0004, "step": 16130 }, { "epoch": 2.0124688279301743, "grad_norm": 15.045056343078613, "learning_rate": 1.1955112219451372e-05, "loss": 0.0436, "step": 16140 }, { "epoch": 2.013715710723192, "grad_norm": 28.597244262695312, "learning_rate": 1.1950124688279303e-05, "loss": 0.0061, "step": 16150 }, { "epoch": 2.0149625935162097, "grad_norm": 4.736278057098389, "learning_rate": 1.1945137157107234e-05, "loss": 0.0434, "step": 16160 }, { "epoch": 2.016209476309227, "grad_norm": 0.0017900534439831972, "learning_rate": 1.1940149625935163e-05, "loss": 0.0428, "step": 16170 }, { "epoch": 2.0174563591022445, "grad_norm": 0.004499204456806183, "learning_rate": 1.1935162094763093e-05, "loss": 0.0003, "step": 16180 }, { "epoch": 2.018703241895262, "grad_norm": 0.002972670830786228, "learning_rate": 1.1930174563591024e-05, "loss": 0.0634, "step": 16190 }, { "epoch": 2.0199501246882794, "grad_norm": 0.011449567042291164, "learning_rate": 1.1925187032418955e-05, "loss": 0.0002, "step": 16200 }, { "epoch": 2.021197007481297, "grad_norm": 0.020797425881028175, "learning_rate": 1.1920199501246883e-05, "loss": 0.0002, "step": 16210 }, { "epoch": 2.0224438902743143, "grad_norm": 0.4915730953216553, "learning_rate": 1.1915211970074814e-05, "loss": 0.0731, "step": 16220 }, { "epoch": 2.0236907730673317, "grad_norm": 0.9496850371360779, "learning_rate": 1.1910224438902745e-05, "loss": 0.0157, "step": 16230 }, { "epoch": 2.024937655860349, "grad_norm": 0.004698851145803928, "learning_rate": 1.1905236907730675e-05, "loss": 0.0004, "step": 16240 }, { "epoch": 2.0261845386533666, "grad_norm": 0.19270732998847961, "learning_rate": 1.1900249376558604e-05, "loss": 0.0355, "step": 16250 }, { "epoch": 2.027431421446384, "grad_norm": 0.017890894785523415, "learning_rate": 1.1895261845386535e-05, "loss": 0.0002, "step": 16260 }, { "epoch": 2.0286783042394014, "grad_norm": 0.018559958785772324, "learning_rate": 1.1890274314214465e-05, "loss": 0.0003, "step": 16270 }, { "epoch": 2.029925187032419, "grad_norm": 0.019640089944005013, "learning_rate": 1.1885286783042396e-05, "loss": 0.0003, "step": 16280 }, { "epoch": 2.0311720698254363, "grad_norm": 0.04718825966119766, "learning_rate": 1.1880299251870325e-05, "loss": 0.0003, "step": 16290 }, { "epoch": 2.0324189526184537, "grad_norm": 0.001198633573949337, "learning_rate": 1.1875311720698256e-05, "loss": 0.0006, "step": 16300 }, { "epoch": 2.033665835411471, "grad_norm": 0.1309710144996643, "learning_rate": 1.1870324189526186e-05, "loss": 0.1054, "step": 16310 }, { "epoch": 2.0349127182044886, "grad_norm": 0.008803281933069229, "learning_rate": 1.1865336658354117e-05, "loss": 0.0249, "step": 16320 }, { "epoch": 2.036159600997506, "grad_norm": 0.45416969060897827, "learning_rate": 1.1860349127182044e-05, "loss": 0.0191, "step": 16330 }, { "epoch": 2.037406483790524, "grad_norm": 0.022696293890476227, "learning_rate": 1.1855361596009975e-05, "loss": 0.0002, "step": 16340 }, { "epoch": 2.0386533665835413, "grad_norm": 0.018615229055285454, "learning_rate": 1.1850374064837907e-05, "loss": 0.0252, "step": 16350 }, { "epoch": 2.039900249376559, "grad_norm": 0.004977565258741379, "learning_rate": 1.1845386533665838e-05, "loss": 0.0043, "step": 16360 }, { "epoch": 2.041147132169576, "grad_norm": 0.024628182873129845, "learning_rate": 1.1840399002493768e-05, "loss": 0.0001, "step": 16370 }, { "epoch": 2.0423940149625937, "grad_norm": 0.0027379884850233793, "learning_rate": 1.1835411471321695e-05, "loss": 0.0002, "step": 16380 }, { "epoch": 2.043640897755611, "grad_norm": 0.0037043734919279814, "learning_rate": 1.1830423940149626e-05, "loss": 0.0092, "step": 16390 }, { "epoch": 2.0448877805486285, "grad_norm": 0.009499566629529, "learning_rate": 1.1825436408977557e-05, "loss": 0.0001, "step": 16400 }, { "epoch": 2.046134663341646, "grad_norm": 0.0012607629178091884, "learning_rate": 1.1820448877805489e-05, "loss": 0.0664, "step": 16410 }, { "epoch": 2.0473815461346634, "grad_norm": 0.0217350572347641, "learning_rate": 1.1815461346633416e-05, "loss": 0.0002, "step": 16420 }, { "epoch": 2.048628428927681, "grad_norm": 0.14604200422763824, "learning_rate": 1.1810473815461347e-05, "loss": 0.0083, "step": 16430 }, { "epoch": 2.0498753117206983, "grad_norm": 9.621197700500488, "learning_rate": 1.1805486284289278e-05, "loss": 0.0328, "step": 16440 }, { "epoch": 2.0511221945137157, "grad_norm": 3.1825685501098633, "learning_rate": 1.1800498753117208e-05, "loss": 0.0006, "step": 16450 }, { "epoch": 2.052369077306733, "grad_norm": 0.002401788020506501, "learning_rate": 1.1795511221945137e-05, "loss": 0.0001, "step": 16460 }, { "epoch": 2.0536159600997506, "grad_norm": 0.039967939257621765, "learning_rate": 1.1790523690773068e-05, "loss": 0.0001, "step": 16470 }, { "epoch": 2.054862842892768, "grad_norm": 0.014854871667921543, "learning_rate": 1.1785536159600998e-05, "loss": 0.0103, "step": 16480 }, { "epoch": 2.0561097256857854, "grad_norm": 0.005126704927533865, "learning_rate": 1.1780548628428929e-05, "loss": 0.0003, "step": 16490 }, { "epoch": 2.057356608478803, "grad_norm": 0.0005838332581333816, "learning_rate": 1.1775561097256858e-05, "loss": 0.0002, "step": 16500 }, { "epoch": 2.0586034912718203, "grad_norm": 0.004360176622867584, "learning_rate": 1.1770573566084788e-05, "loss": 0.0024, "step": 16510 }, { "epoch": 2.0598503740648377, "grad_norm": 0.0009690506267361343, "learning_rate": 1.1765586034912719e-05, "loss": 0.0628, "step": 16520 }, { "epoch": 2.061097256857855, "grad_norm": 0.005594516638666391, "learning_rate": 1.176059850374065e-05, "loss": 0.0001, "step": 16530 }, { "epoch": 2.0623441396508726, "grad_norm": 0.0011889605084434152, "learning_rate": 1.1755610972568579e-05, "loss": 0.0207, "step": 16540 }, { "epoch": 2.0635910224438905, "grad_norm": 0.0011221276363357902, "learning_rate": 1.175062344139651e-05, "loss": 0.018, "step": 16550 }, { "epoch": 2.064837905236908, "grad_norm": 7.532267093658447, "learning_rate": 1.174563591022444e-05, "loss": 0.0719, "step": 16560 }, { "epoch": 2.0660847880299253, "grad_norm": 0.0037209379952400923, "learning_rate": 1.174064837905237e-05, "loss": 0.0422, "step": 16570 }, { "epoch": 2.067331670822943, "grad_norm": 3.0293376445770264, "learning_rate": 1.17356608478803e-05, "loss": 0.0082, "step": 16580 }, { "epoch": 2.06857855361596, "grad_norm": 8.686629295349121, "learning_rate": 1.173067331670823e-05, "loss": 0.0012, "step": 16590 }, { "epoch": 2.0698254364089776, "grad_norm": 0.02134956605732441, "learning_rate": 1.172568578553616e-05, "loss": 0.0003, "step": 16600 }, { "epoch": 2.071072319201995, "grad_norm": 0.004345687106251717, "learning_rate": 1.1720698254364091e-05, "loss": 0.0003, "step": 16610 }, { "epoch": 2.0723192019950125, "grad_norm": 0.004607527982443571, "learning_rate": 1.1715710723192022e-05, "loss": 0.0553, "step": 16620 }, { "epoch": 2.07356608478803, "grad_norm": 0.0037352608051151037, "learning_rate": 1.171072319201995e-05, "loss": 0.0442, "step": 16630 }, { "epoch": 2.0748129675810474, "grad_norm": 2.7062952518463135, "learning_rate": 1.1705735660847881e-05, "loss": 0.0388, "step": 16640 }, { "epoch": 2.076059850374065, "grad_norm": 0.2463214248418808, "learning_rate": 1.1700748129675812e-05, "loss": 0.0012, "step": 16650 }, { "epoch": 2.0773067331670823, "grad_norm": 0.0029989476315677166, "learning_rate": 1.1695760598503743e-05, "loss": 0.0003, "step": 16660 }, { "epoch": 2.0785536159600997, "grad_norm": 0.01061512902379036, "learning_rate": 1.1690773067331672e-05, "loss": 0.0006, "step": 16670 }, { "epoch": 2.079800498753117, "grad_norm": 0.0029966856818646193, "learning_rate": 1.1685785536159602e-05, "loss": 0.0022, "step": 16680 }, { "epoch": 2.0810473815461346, "grad_norm": 0.0038471422158181667, "learning_rate": 1.1680798004987533e-05, "loss": 0.0263, "step": 16690 }, { "epoch": 2.082294264339152, "grad_norm": 0.013609787449240685, "learning_rate": 1.1675810473815463e-05, "loss": 0.0007, "step": 16700 }, { "epoch": 2.0835411471321694, "grad_norm": 0.0019859191961586475, "learning_rate": 1.1670822942643392e-05, "loss": 0.0372, "step": 16710 }, { "epoch": 2.084788029925187, "grad_norm": 0.0010857946472242475, "learning_rate": 1.1665835411471323e-05, "loss": 0.028, "step": 16720 }, { "epoch": 2.0860349127182043, "grad_norm": 3.9083871841430664, "learning_rate": 1.1660847880299254e-05, "loss": 0.0014, "step": 16730 }, { "epoch": 2.087281795511222, "grad_norm": 51.425758361816406, "learning_rate": 1.1655860349127184e-05, "loss": 0.0603, "step": 16740 }, { "epoch": 2.0885286783042396, "grad_norm": 0.0008689459064044058, "learning_rate": 1.1650872817955111e-05, "loss": 0.0034, "step": 16750 }, { "epoch": 2.089775561097257, "grad_norm": 0.0023854838218539953, "learning_rate": 1.1645885286783044e-05, "loss": 0.0001, "step": 16760 }, { "epoch": 2.0910224438902745, "grad_norm": 0.010867654345929623, "learning_rate": 1.1640897755610974e-05, "loss": 0.0562, "step": 16770 }, { "epoch": 2.092269326683292, "grad_norm": 0.002745213219895959, "learning_rate": 1.1635910224438905e-05, "loss": 0.0013, "step": 16780 }, { "epoch": 2.0935162094763093, "grad_norm": 0.004098067991435528, "learning_rate": 1.1630922693266832e-05, "loss": 0.0698, "step": 16790 }, { "epoch": 2.0947630922693268, "grad_norm": 0.0039171320386230946, "learning_rate": 1.1625935162094763e-05, "loss": 0.0032, "step": 16800 }, { "epoch": 2.096009975062344, "grad_norm": 1.781934380531311, "learning_rate": 1.1620947630922693e-05, "loss": 0.0178, "step": 16810 }, { "epoch": 2.0972568578553616, "grad_norm": 0.0031692027114331722, "learning_rate": 1.1615960099750626e-05, "loss": 0.0001, "step": 16820 }, { "epoch": 2.098503740648379, "grad_norm": 0.07670797407627106, "learning_rate": 1.1610972568578553e-05, "loss": 0.0285, "step": 16830 }, { "epoch": 2.0997506234413965, "grad_norm": 0.03737671673297882, "learning_rate": 1.1605985037406484e-05, "loss": 0.0001, "step": 16840 }, { "epoch": 2.100997506234414, "grad_norm": 0.008449270389974117, "learning_rate": 1.1600997506234414e-05, "loss": 0.0002, "step": 16850 }, { "epoch": 2.1022443890274314, "grad_norm": 0.004309145733714104, "learning_rate": 1.1596009975062345e-05, "loss": 0.0519, "step": 16860 }, { "epoch": 2.103491271820449, "grad_norm": 0.053323499858379364, "learning_rate": 1.1591022443890276e-05, "loss": 0.0136, "step": 16870 }, { "epoch": 2.1047381546134662, "grad_norm": 0.003335277084261179, "learning_rate": 1.1586034912718204e-05, "loss": 0.0001, "step": 16880 }, { "epoch": 2.1059850374064837, "grad_norm": 0.008609478361904621, "learning_rate": 1.1581047381546135e-05, "loss": 0.0623, "step": 16890 }, { "epoch": 2.107231920199501, "grad_norm": 0.003915522713214159, "learning_rate": 1.1576059850374066e-05, "loss": 0.0002, "step": 16900 }, { "epoch": 2.1084788029925186, "grad_norm": 0.005430086050182581, "learning_rate": 1.1571072319201996e-05, "loss": 0.0002, "step": 16910 }, { "epoch": 2.109725685785536, "grad_norm": 0.01652948372066021, "learning_rate": 1.1566084788029925e-05, "loss": 0.0227, "step": 16920 }, { "epoch": 2.1109725685785534, "grad_norm": 0.015955012291669846, "learning_rate": 1.1561097256857856e-05, "loss": 0.0037, "step": 16930 }, { "epoch": 2.112219451371571, "grad_norm": 0.04435069486498833, "learning_rate": 1.1556109725685786e-05, "loss": 0.0716, "step": 16940 }, { "epoch": 2.1134663341645887, "grad_norm": 0.01482419017702341, "learning_rate": 1.1551122194513717e-05, "loss": 0.04, "step": 16950 }, { "epoch": 2.114713216957606, "grad_norm": 0.06014908477663994, "learning_rate": 1.1546134663341646e-05, "loss": 0.0003, "step": 16960 }, { "epoch": 2.1159600997506236, "grad_norm": 0.07580575346946716, "learning_rate": 1.1541147132169577e-05, "loss": 0.0386, "step": 16970 }, { "epoch": 2.117206982543641, "grad_norm": 7.51279354095459, "learning_rate": 1.1536159600997507e-05, "loss": 0.0024, "step": 16980 }, { "epoch": 2.1184538653366585, "grad_norm": 2.0569701194763184, "learning_rate": 1.1531172069825438e-05, "loss": 0.0009, "step": 16990 }, { "epoch": 2.119700748129676, "grad_norm": 27.27562141418457, "learning_rate": 1.1526184538653367e-05, "loss": 0.0072, "step": 17000 }, { "epoch": 2.1209476309226933, "grad_norm": 0.011304572224617004, "learning_rate": 1.1521197007481297e-05, "loss": 0.0005, "step": 17010 }, { "epoch": 2.1221945137157108, "grad_norm": 15.838469505310059, "learning_rate": 1.1516209476309228e-05, "loss": 0.0097, "step": 17020 }, { "epoch": 2.123441396508728, "grad_norm": 0.011107388883829117, "learning_rate": 1.1511221945137159e-05, "loss": 0.0005, "step": 17030 }, { "epoch": 2.1246882793017456, "grad_norm": 0.06625682860612869, "learning_rate": 1.1506234413965088e-05, "loss": 0.0605, "step": 17040 }, { "epoch": 2.125935162094763, "grad_norm": 0.0023085640277713537, "learning_rate": 1.1501246882793018e-05, "loss": 0.0268, "step": 17050 }, { "epoch": 2.1271820448877805, "grad_norm": 5.297941207885742, "learning_rate": 1.1496259351620949e-05, "loss": 0.0378, "step": 17060 }, { "epoch": 2.128428927680798, "grad_norm": 0.020840341225266457, "learning_rate": 1.149127182044888e-05, "loss": 0.0342, "step": 17070 }, { "epoch": 2.1296758104738154, "grad_norm": 0.0028252899646759033, "learning_rate": 1.1486284289276808e-05, "loss": 0.0006, "step": 17080 }, { "epoch": 2.130922693266833, "grad_norm": 0.0014341454952955246, "learning_rate": 1.1481296758104739e-05, "loss": 0.011, "step": 17090 }, { "epoch": 2.1321695760598502, "grad_norm": 0.0027584710624068975, "learning_rate": 1.147630922693267e-05, "loss": 0.0848, "step": 17100 }, { "epoch": 2.1334164588528677, "grad_norm": 81.5111083984375, "learning_rate": 1.14713216957606e-05, "loss": 0.0485, "step": 17110 }, { "epoch": 2.134663341645885, "grad_norm": 0.558646023273468, "learning_rate": 1.146633416458853e-05, "loss": 0.0664, "step": 17120 }, { "epoch": 2.1359102244389025, "grad_norm": 0.04680400714278221, "learning_rate": 1.146134663341646e-05, "loss": 0.0025, "step": 17130 }, { "epoch": 2.1371571072319204, "grad_norm": 0.2647761404514313, "learning_rate": 1.145635910224439e-05, "loss": 0.008, "step": 17140 }, { "epoch": 2.138403990024938, "grad_norm": 0.0029155698139220476, "learning_rate": 1.1451371571072321e-05, "loss": 0.0001, "step": 17150 }, { "epoch": 2.1396508728179553, "grad_norm": 13.286086082458496, "learning_rate": 1.1446384039900252e-05, "loss": 0.0207, "step": 17160 }, { "epoch": 2.1408977556109727, "grad_norm": 0.7246861457824707, "learning_rate": 1.144139650872818e-05, "loss": 0.002, "step": 17170 }, { "epoch": 2.14214463840399, "grad_norm": 0.1160615086555481, "learning_rate": 1.1436408977556111e-05, "loss": 0.0561, "step": 17180 }, { "epoch": 2.1433915211970076, "grad_norm": 0.0032474566251039505, "learning_rate": 1.1431421446384042e-05, "loss": 0.0001, "step": 17190 }, { "epoch": 2.144638403990025, "grad_norm": 0.009272860363125801, "learning_rate": 1.1426433915211972e-05, "loss": 0.0001, "step": 17200 }, { "epoch": 2.1458852867830425, "grad_norm": 0.012639547698199749, "learning_rate": 1.14214463840399e-05, "loss": 0.0001, "step": 17210 }, { "epoch": 2.14713216957606, "grad_norm": 0.0023768385872244835, "learning_rate": 1.141645885286783e-05, "loss": 0.0006, "step": 17220 }, { "epoch": 2.1483790523690773, "grad_norm": 0.0028579439967870712, "learning_rate": 1.1411471321695763e-05, "loss": 0.0001, "step": 17230 }, { "epoch": 2.1496259351620948, "grad_norm": 0.015758490189909935, "learning_rate": 1.1406483790523693e-05, "loss": 0.0247, "step": 17240 }, { "epoch": 2.150872817955112, "grad_norm": 26.11306381225586, "learning_rate": 1.140149625935162e-05, "loss": 0.0048, "step": 17250 }, { "epoch": 2.1521197007481296, "grad_norm": 0.002438984578475356, "learning_rate": 1.1396508728179551e-05, "loss": 0.0003, "step": 17260 }, { "epoch": 2.153366583541147, "grad_norm": 0.00045664477511309087, "learning_rate": 1.1391521197007482e-05, "loss": 0.0002, "step": 17270 }, { "epoch": 2.1546134663341645, "grad_norm": 8.155107498168945, "learning_rate": 1.1386533665835412e-05, "loss": 0.0429, "step": 17280 }, { "epoch": 2.155860349127182, "grad_norm": 0.0036463397555053234, "learning_rate": 1.1381546134663341e-05, "loss": 0.0088, "step": 17290 }, { "epoch": 2.1571072319201994, "grad_norm": 0.0040887449868023396, "learning_rate": 1.1376558603491272e-05, "loss": 0.0276, "step": 17300 }, { "epoch": 2.158354114713217, "grad_norm": 0.0022863755002617836, "learning_rate": 1.1371571072319202e-05, "loss": 0.0001, "step": 17310 }, { "epoch": 2.1596009975062342, "grad_norm": 0.0571252815425396, "learning_rate": 1.1366583541147133e-05, "loss": 0.0597, "step": 17320 }, { "epoch": 2.1608478802992517, "grad_norm": 0.0020809960551559925, "learning_rate": 1.1361596009975062e-05, "loss": 0.0001, "step": 17330 }, { "epoch": 2.162094763092269, "grad_norm": 0.003306854283437133, "learning_rate": 1.1356608478802993e-05, "loss": 0.0356, "step": 17340 }, { "epoch": 2.163341645885287, "grad_norm": 0.0009579791803844273, "learning_rate": 1.1351620947630923e-05, "loss": 0.0359, "step": 17350 }, { "epoch": 2.1645885286783044, "grad_norm": 0.012262257747352123, "learning_rate": 1.1346633416458854e-05, "loss": 0.0001, "step": 17360 }, { "epoch": 2.165835411471322, "grad_norm": 0.00868857093155384, "learning_rate": 1.1341645885286784e-05, "loss": 0.0001, "step": 17370 }, { "epoch": 2.1670822942643393, "grad_norm": 0.0008276562439277768, "learning_rate": 1.1336658354114713e-05, "loss": 0.0081, "step": 17380 }, { "epoch": 2.1683291770573567, "grad_norm": 0.0012495616683736444, "learning_rate": 1.1331670822942644e-05, "loss": 0.0001, "step": 17390 }, { "epoch": 2.169576059850374, "grad_norm": 0.007158514112234116, "learning_rate": 1.1326683291770575e-05, "loss": 0.0001, "step": 17400 }, { "epoch": 2.1708229426433916, "grad_norm": 0.015533825382590294, "learning_rate": 1.1321695760598505e-05, "loss": 0.0979, "step": 17410 }, { "epoch": 2.172069825436409, "grad_norm": 0.0036152431275695562, "learning_rate": 1.1316708229426434e-05, "loss": 0.0502, "step": 17420 }, { "epoch": 2.1733167082294265, "grad_norm": 0.05553935095667839, "learning_rate": 1.1311720698254365e-05, "loss": 0.0002, "step": 17430 }, { "epoch": 2.174563591022444, "grad_norm": 0.0016775907715782523, "learning_rate": 1.1306733167082295e-05, "loss": 0.0262, "step": 17440 }, { "epoch": 2.1758104738154613, "grad_norm": 0.03827022761106491, "learning_rate": 1.1301745635910226e-05, "loss": 0.0414, "step": 17450 }, { "epoch": 2.1770573566084788, "grad_norm": 0.0065053971484303474, "learning_rate": 1.1296758104738155e-05, "loss": 0.1002, "step": 17460 }, { "epoch": 2.178304239401496, "grad_norm": 0.001964627066627145, "learning_rate": 1.1291770573566086e-05, "loss": 0.0297, "step": 17470 }, { "epoch": 2.1795511221945136, "grad_norm": 0.0028284601867198944, "learning_rate": 1.1286783042394016e-05, "loss": 0.0001, "step": 17480 }, { "epoch": 2.180798004987531, "grad_norm": 0.009003949351608753, "learning_rate": 1.1281795511221947e-05, "loss": 0.0197, "step": 17490 }, { "epoch": 2.1820448877805485, "grad_norm": 0.007605138700455427, "learning_rate": 1.1276807980049876e-05, "loss": 0.0002, "step": 17500 }, { "epoch": 2.183291770573566, "grad_norm": 0.008635989390313625, "learning_rate": 1.1271820448877806e-05, "loss": 0.0888, "step": 17510 }, { "epoch": 2.1845386533665834, "grad_norm": 0.05233432725071907, "learning_rate": 1.1266832917705737e-05, "loss": 0.0114, "step": 17520 }, { "epoch": 2.185785536159601, "grad_norm": 0.0250040665268898, "learning_rate": 1.1261845386533668e-05, "loss": 0.0003, "step": 17530 }, { "epoch": 2.1870324189526187, "grad_norm": 0.0034498500172048807, "learning_rate": 1.1256857855361597e-05, "loss": 0.0484, "step": 17540 }, { "epoch": 2.188279301745636, "grad_norm": 0.0024719752836972475, "learning_rate": 1.1251870324189527e-05, "loss": 0.0359, "step": 17550 }, { "epoch": 2.1895261845386536, "grad_norm": 0.0028829684015363455, "learning_rate": 1.1246882793017458e-05, "loss": 0.0002, "step": 17560 }, { "epoch": 2.190773067331671, "grad_norm": 0.0012650667922571301, "learning_rate": 1.1241895261845388e-05, "loss": 0.0322, "step": 17570 }, { "epoch": 2.1920199501246884, "grad_norm": 0.01969640515744686, "learning_rate": 1.1236907730673317e-05, "loss": 0.0165, "step": 17580 }, { "epoch": 2.193266832917706, "grad_norm": 49.82617950439453, "learning_rate": 1.1231920199501248e-05, "loss": 0.0073, "step": 17590 }, { "epoch": 2.1945137157107233, "grad_norm": 39.034271240234375, "learning_rate": 1.1226932668329179e-05, "loss": 0.026, "step": 17600 }, { "epoch": 2.1957605985037407, "grad_norm": 26.204391479492188, "learning_rate": 1.122194513715711e-05, "loss": 0.0747, "step": 17610 }, { "epoch": 2.197007481296758, "grad_norm": 0.0015059361467137933, "learning_rate": 1.121695760598504e-05, "loss": 0.0002, "step": 17620 }, { "epoch": 2.1982543640897756, "grad_norm": 0.016991477459669113, "learning_rate": 1.1211970074812967e-05, "loss": 0.0005, "step": 17630 }, { "epoch": 2.199501246882793, "grad_norm": 0.006192653905600309, "learning_rate": 1.12069825436409e-05, "loss": 0.0422, "step": 17640 }, { "epoch": 2.2007481296758105, "grad_norm": 0.014278242364525795, "learning_rate": 1.120199501246883e-05, "loss": 0.0144, "step": 17650 }, { "epoch": 2.201995012468828, "grad_norm": 1.6394509077072144, "learning_rate": 1.119700748129676e-05, "loss": 0.0006, "step": 17660 }, { "epoch": 2.2032418952618453, "grad_norm": 0.09551705420017242, "learning_rate": 1.1192019950124688e-05, "loss": 0.0002, "step": 17670 }, { "epoch": 2.2044887780548628, "grad_norm": 0.0012173291761428118, "learning_rate": 1.1187032418952618e-05, "loss": 0.001, "step": 17680 }, { "epoch": 2.20573566084788, "grad_norm": 0.004221228417009115, "learning_rate": 1.1182044887780549e-05, "loss": 0.0003, "step": 17690 }, { "epoch": 2.2069825436408976, "grad_norm": 0.004237989895045757, "learning_rate": 1.1177057356608481e-05, "loss": 0.0001, "step": 17700 }, { "epoch": 2.208229426433915, "grad_norm": 0.0009970074752345681, "learning_rate": 1.1172069825436409e-05, "loss": 0.0438, "step": 17710 }, { "epoch": 2.2094763092269325, "grad_norm": 0.037249162793159485, "learning_rate": 1.116708229426434e-05, "loss": 0.0002, "step": 17720 }, { "epoch": 2.21072319201995, "grad_norm": 0.0009120830800384283, "learning_rate": 1.116209476309227e-05, "loss": 0.0542, "step": 17730 }, { "epoch": 2.2119700748129674, "grad_norm": 0.011899283155798912, "learning_rate": 1.11571072319202e-05, "loss": 0.0282, "step": 17740 }, { "epoch": 2.213216957605985, "grad_norm": 0.0017361732898280025, "learning_rate": 1.115211970074813e-05, "loss": 0.0087, "step": 17750 }, { "epoch": 2.2144638403990027, "grad_norm": 0.0160725899040699, "learning_rate": 1.114713216957606e-05, "loss": 0.0008, "step": 17760 }, { "epoch": 2.21571072319202, "grad_norm": 0.029333338141441345, "learning_rate": 1.114214463840399e-05, "loss": 0.0059, "step": 17770 }, { "epoch": 2.2169576059850375, "grad_norm": 21.30148696899414, "learning_rate": 1.1137157107231921e-05, "loss": 0.0894, "step": 17780 }, { "epoch": 2.218204488778055, "grad_norm": 0.014875189401209354, "learning_rate": 1.113216957605985e-05, "loss": 0.0252, "step": 17790 }, { "epoch": 2.2194513715710724, "grad_norm": 0.0027588570956140757, "learning_rate": 1.112718204488778e-05, "loss": 0.0001, "step": 17800 }, { "epoch": 2.22069825436409, "grad_norm": 0.0018637663451954722, "learning_rate": 1.1122194513715711e-05, "loss": 0.0002, "step": 17810 }, { "epoch": 2.2219451371571073, "grad_norm": 0.0022036132868379354, "learning_rate": 1.1117206982543642e-05, "loss": 0.0685, "step": 17820 }, { "epoch": 2.2231920199501247, "grad_norm": 0.43087127804756165, "learning_rate": 1.1112219451371571e-05, "loss": 0.001, "step": 17830 }, { "epoch": 2.224438902743142, "grad_norm": 0.0076804617419838905, "learning_rate": 1.1107231920199502e-05, "loss": 0.0005, "step": 17840 }, { "epoch": 2.2256857855361596, "grad_norm": 0.04199754819273949, "learning_rate": 1.1102743142144638e-05, "loss": 0.0408, "step": 17850 }, { "epoch": 2.226932668329177, "grad_norm": 0.004518990404903889, "learning_rate": 1.1097755610972569e-05, "loss": 0.0005, "step": 17860 }, { "epoch": 2.2281795511221945, "grad_norm": 0.006416243966668844, "learning_rate": 1.10927680798005e-05, "loss": 0.0002, "step": 17870 }, { "epoch": 2.229426433915212, "grad_norm": 25.688091278076172, "learning_rate": 1.108778054862843e-05, "loss": 0.0553, "step": 17880 }, { "epoch": 2.2306733167082293, "grad_norm": 0.002934435848146677, "learning_rate": 1.1082793017456359e-05, "loss": 0.0019, "step": 17890 }, { "epoch": 2.2319201995012468, "grad_norm": 0.0010024199727922678, "learning_rate": 1.107780548628429e-05, "loss": 0.0001, "step": 17900 }, { "epoch": 2.233167082294264, "grad_norm": 0.0025311210192739964, "learning_rate": 1.107281795511222e-05, "loss": 0.0002, "step": 17910 }, { "epoch": 2.2344139650872816, "grad_norm": 0.0057137515395879745, "learning_rate": 1.106783042394015e-05, "loss": 0.0377, "step": 17920 }, { "epoch": 2.235660847880299, "grad_norm": 0.0014953430509194732, "learning_rate": 1.106284289276808e-05, "loss": 0.0317, "step": 17930 }, { "epoch": 2.236907730673317, "grad_norm": 0.0010911138961091638, "learning_rate": 1.105785536159601e-05, "loss": 0.0792, "step": 17940 }, { "epoch": 2.2381546134663344, "grad_norm": 0.0012966920621693134, "learning_rate": 1.1052867830423941e-05, "loss": 0.0123, "step": 17950 }, { "epoch": 2.239401496259352, "grad_norm": 0.05284218490123749, "learning_rate": 1.1047880299251872e-05, "loss": 0.0001, "step": 17960 }, { "epoch": 2.2406483790523692, "grad_norm": 0.0015723456162959337, "learning_rate": 1.10428927680798e-05, "loss": 0.0001, "step": 17970 }, { "epoch": 2.2418952618453867, "grad_norm": 0.075335793197155, "learning_rate": 1.1037905236907731e-05, "loss": 0.0676, "step": 17980 }, { "epoch": 2.243142144638404, "grad_norm": 0.0010825825156643987, "learning_rate": 1.1032917705735662e-05, "loss": 0.0001, "step": 17990 }, { "epoch": 2.2443890274314215, "grad_norm": 0.0036625145003199577, "learning_rate": 1.1027930174563592e-05, "loss": 0.0016, "step": 18000 }, { "epoch": 2.245635910224439, "grad_norm": 0.001976029947400093, "learning_rate": 1.1022942643391523e-05, "loss": 0.0451, "step": 18010 }, { "epoch": 2.2468827930174564, "grad_norm": 0.005771087482571602, "learning_rate": 1.1017955112219452e-05, "loss": 0.0546, "step": 18020 }, { "epoch": 2.248129675810474, "grad_norm": 0.0013193414779379964, "learning_rate": 1.1012967581047382e-05, "loss": 0.0255, "step": 18030 }, { "epoch": 2.2493765586034913, "grad_norm": 0.0013921220088377595, "learning_rate": 1.1007980049875313e-05, "loss": 0.0002, "step": 18040 }, { "epoch": 2.2506234413965087, "grad_norm": 0.16599981486797333, "learning_rate": 1.1002992518703244e-05, "loss": 0.0003, "step": 18050 }, { "epoch": 2.251870324189526, "grad_norm": 0.003775638760998845, "learning_rate": 1.0998004987531173e-05, "loss": 0.0001, "step": 18060 }, { "epoch": 2.2531172069825436, "grad_norm": 0.0013987654820084572, "learning_rate": 1.0993017456359103e-05, "loss": 0.0002, "step": 18070 }, { "epoch": 2.254364089775561, "grad_norm": 0.0029886842239648104, "learning_rate": 1.0988029925187034e-05, "loss": 0.0535, "step": 18080 }, { "epoch": 2.2556109725685785, "grad_norm": 0.001835786853916943, "learning_rate": 1.0983042394014964e-05, "loss": 0.0001, "step": 18090 }, { "epoch": 2.256857855361596, "grad_norm": 0.02437090501189232, "learning_rate": 1.0978054862842893e-05, "loss": 0.0012, "step": 18100 }, { "epoch": 2.2581047381546133, "grad_norm": 0.958751380443573, "learning_rate": 1.0973067331670824e-05, "loss": 0.052, "step": 18110 }, { "epoch": 2.2593516209476308, "grad_norm": 0.3797219395637512, "learning_rate": 1.0968079800498755e-05, "loss": 0.0012, "step": 18120 }, { "epoch": 2.260598503740648, "grad_norm": 0.0013616789365187287, "learning_rate": 1.0963092269326685e-05, "loss": 0.0001, "step": 18130 }, { "epoch": 2.2618453865336656, "grad_norm": 0.0048110876232385635, "learning_rate": 1.0958104738154614e-05, "loss": 0.0148, "step": 18140 }, { "epoch": 2.263092269326683, "grad_norm": 0.0008074498036876321, "learning_rate": 1.0953117206982545e-05, "loss": 0.0002, "step": 18150 }, { "epoch": 2.264339152119701, "grad_norm": 0.0034766916651278734, "learning_rate": 1.0948129675810475e-05, "loss": 0.0346, "step": 18160 }, { "epoch": 2.2655860349127184, "grad_norm": 0.0013960660435259342, "learning_rate": 1.0943142144638406e-05, "loss": 0.0001, "step": 18170 }, { "epoch": 2.266832917705736, "grad_norm": 0.0030948331113904715, "learning_rate": 1.0938154613466333e-05, "loss": 0.0376, "step": 18180 }, { "epoch": 2.2680798004987532, "grad_norm": 0.00950552523136139, "learning_rate": 1.0933167082294266e-05, "loss": 0.0444, "step": 18190 }, { "epoch": 2.2693266832917707, "grad_norm": 0.30063608288764954, "learning_rate": 1.0928179551122196e-05, "loss": 0.0001, "step": 18200 }, { "epoch": 2.270573566084788, "grad_norm": 0.0005822066450491548, "learning_rate": 1.0923192019950127e-05, "loss": 0.1135, "step": 18210 }, { "epoch": 2.2718204488778055, "grad_norm": 0.0010623615235090256, "learning_rate": 1.0918204488778054e-05, "loss": 0.0456, "step": 18220 }, { "epoch": 2.273067331670823, "grad_norm": 0.00394933158531785, "learning_rate": 1.0913216957605985e-05, "loss": 0.0294, "step": 18230 }, { "epoch": 2.2743142144638404, "grad_norm": 0.08037488162517548, "learning_rate": 1.0908229426433915e-05, "loss": 0.0003, "step": 18240 }, { "epoch": 2.275561097256858, "grad_norm": 8.893105506896973, "learning_rate": 1.0903241895261848e-05, "loss": 0.04, "step": 18250 }, { "epoch": 2.2768079800498753, "grad_norm": 0.12478364259004593, "learning_rate": 1.0898254364089778e-05, "loss": 0.0167, "step": 18260 }, { "epoch": 2.2780548628428927, "grad_norm": 0.013608187437057495, "learning_rate": 1.0893266832917705e-05, "loss": 0.0263, "step": 18270 }, { "epoch": 2.27930174563591, "grad_norm": 0.015265119262039661, "learning_rate": 1.0888279301745636e-05, "loss": 0.0161, "step": 18280 }, { "epoch": 2.2805486284289276, "grad_norm": 0.004604879766702652, "learning_rate": 1.0883291770573567e-05, "loss": 0.0419, "step": 18290 }, { "epoch": 2.281795511221945, "grad_norm": 0.0062502785585820675, "learning_rate": 1.0878304239401497e-05, "loss": 0.0004, "step": 18300 }, { "epoch": 2.2830423940149625, "grad_norm": 0.3815927505493164, "learning_rate": 1.0873316708229426e-05, "loss": 0.0002, "step": 18310 }, { "epoch": 2.28428927680798, "grad_norm": 0.0019634002819657326, "learning_rate": 1.0868329177057357e-05, "loss": 0.0146, "step": 18320 }, { "epoch": 2.2855361596009973, "grad_norm": 0.00816398486495018, "learning_rate": 1.0863341645885288e-05, "loss": 0.0426, "step": 18330 }, { "epoch": 2.286783042394015, "grad_norm": 0.0013969374122098088, "learning_rate": 1.0858354114713218e-05, "loss": 0.0677, "step": 18340 }, { "epoch": 2.2880299251870326, "grad_norm": 0.002813225844874978, "learning_rate": 1.0853366583541147e-05, "loss": 0.0005, "step": 18350 }, { "epoch": 2.28927680798005, "grad_norm": 21.107683181762695, "learning_rate": 1.0848379052369078e-05, "loss": 0.1074, "step": 18360 }, { "epoch": 2.2905236907730675, "grad_norm": 0.007905518636107445, "learning_rate": 1.0843391521197008e-05, "loss": 0.0124, "step": 18370 }, { "epoch": 2.291770573566085, "grad_norm": 0.05258958786725998, "learning_rate": 1.0838403990024939e-05, "loss": 0.019, "step": 18380 }, { "epoch": 2.2930174563591024, "grad_norm": 0.009827799163758755, "learning_rate": 1.0833416458852868e-05, "loss": 0.0448, "step": 18390 }, { "epoch": 2.29426433915212, "grad_norm": 0.009128672070801258, "learning_rate": 1.0828428927680798e-05, "loss": 0.0133, "step": 18400 }, { "epoch": 2.2955112219451372, "grad_norm": 28.526371002197266, "learning_rate": 1.0823441396508729e-05, "loss": 0.0464, "step": 18410 }, { "epoch": 2.2967581047381547, "grad_norm": 0.014480763114988804, "learning_rate": 1.081845386533666e-05, "loss": 0.0222, "step": 18420 }, { "epoch": 2.298004987531172, "grad_norm": 1.6726791858673096, "learning_rate": 1.0813466334164589e-05, "loss": 0.0547, "step": 18430 }, { "epoch": 2.2992518703241895, "grad_norm": 14.110183715820312, "learning_rate": 1.080847880299252e-05, "loss": 0.0386, "step": 18440 }, { "epoch": 2.300498753117207, "grad_norm": 18.027050018310547, "learning_rate": 1.080349127182045e-05, "loss": 0.0216, "step": 18450 }, { "epoch": 2.3017456359102244, "grad_norm": 0.0013066193787381053, "learning_rate": 1.079850374064838e-05, "loss": 0.0446, "step": 18460 }, { "epoch": 2.302992518703242, "grad_norm": 0.21965332329273224, "learning_rate": 1.079351620947631e-05, "loss": 0.0004, "step": 18470 }, { "epoch": 2.3042394014962593, "grad_norm": 0.0025097858160734177, "learning_rate": 1.078852867830424e-05, "loss": 0.0262, "step": 18480 }, { "epoch": 2.3054862842892767, "grad_norm": 0.0015427289763465524, "learning_rate": 1.078354114713217e-05, "loss": 0.0283, "step": 18490 }, { "epoch": 2.306733167082294, "grad_norm": 0.00836893543601036, "learning_rate": 1.0778553615960101e-05, "loss": 0.0423, "step": 18500 }, { "epoch": 2.3079800498753116, "grad_norm": 0.010444067418575287, "learning_rate": 1.0773566084788032e-05, "loss": 0.0002, "step": 18510 }, { "epoch": 2.309226932668329, "grad_norm": 0.00873679667711258, "learning_rate": 1.076857855361596e-05, "loss": 0.0003, "step": 18520 }, { "epoch": 2.3104738154613464, "grad_norm": 0.0064266156405210495, "learning_rate": 1.0763591022443891e-05, "loss": 0.0003, "step": 18530 }, { "epoch": 2.311720698254364, "grad_norm": 0.020968349650502205, "learning_rate": 1.0758603491271822e-05, "loss": 0.0052, "step": 18540 }, { "epoch": 2.3129675810473813, "grad_norm": 0.03227841854095459, "learning_rate": 1.0753615960099753e-05, "loss": 0.0062, "step": 18550 }, { "epoch": 2.314214463840399, "grad_norm": 0.001537321018986404, "learning_rate": 1.0748628428927682e-05, "loss": 0.0034, "step": 18560 }, { "epoch": 2.3154613466334166, "grad_norm": 0.019995620474219322, "learning_rate": 1.0743640897755612e-05, "loss": 0.0001, "step": 18570 }, { "epoch": 2.316708229426434, "grad_norm": 0.002400660654529929, "learning_rate": 1.0738653366583543e-05, "loss": 0.0111, "step": 18580 }, { "epoch": 2.3179551122194515, "grad_norm": 0.0035773960407823324, "learning_rate": 1.0733665835411473e-05, "loss": 0.0077, "step": 18590 }, { "epoch": 2.319201995012469, "grad_norm": 0.006284439004957676, "learning_rate": 1.0728678304239402e-05, "loss": 0.0373, "step": 18600 }, { "epoch": 2.3204488778054864, "grad_norm": 0.015236135572195053, "learning_rate": 1.0723690773067333e-05, "loss": 0.0001, "step": 18610 }, { "epoch": 2.321695760598504, "grad_norm": 0.004189270548522472, "learning_rate": 1.0718703241895264e-05, "loss": 0.0002, "step": 18620 }, { "epoch": 2.3229426433915212, "grad_norm": 0.0024999042507261038, "learning_rate": 1.0713715710723194e-05, "loss": 0.0001, "step": 18630 }, { "epoch": 2.3241895261845387, "grad_norm": 0.0008350514690391719, "learning_rate": 1.0708728179551121e-05, "loss": 0.0188, "step": 18640 }, { "epoch": 2.325436408977556, "grad_norm": 0.005228615365922451, "learning_rate": 1.0703740648379052e-05, "loss": 0.0433, "step": 18650 }, { "epoch": 2.3266832917705735, "grad_norm": 0.02997826412320137, "learning_rate": 1.0698753117206984e-05, "loss": 0.0227, "step": 18660 }, { "epoch": 2.327930174563591, "grad_norm": 0.001615180866792798, "learning_rate": 1.0693765586034915e-05, "loss": 0.0002, "step": 18670 }, { "epoch": 2.3291770573566084, "grad_norm": 0.0009207409457303584, "learning_rate": 1.0688778054862842e-05, "loss": 0.0001, "step": 18680 }, { "epoch": 2.330423940149626, "grad_norm": 11.185653686523438, "learning_rate": 1.0683790523690773e-05, "loss": 0.001, "step": 18690 }, { "epoch": 2.3316708229426433, "grad_norm": 0.0005180281004868448, "learning_rate": 1.0678802992518703e-05, "loss": 0.0031, "step": 18700 }, { "epoch": 2.3329177057356607, "grad_norm": 0.0009638606570661068, "learning_rate": 1.0673815461346634e-05, "loss": 0.0016, "step": 18710 }, { "epoch": 2.334164588528678, "grad_norm": 18.893842697143555, "learning_rate": 1.0668827930174563e-05, "loss": 0.0544, "step": 18720 }, { "epoch": 2.3354114713216956, "grad_norm": 0.010233055800199509, "learning_rate": 1.0663840399002494e-05, "loss": 0.045, "step": 18730 }, { "epoch": 2.3366583541147135, "grad_norm": 0.0015732423635199666, "learning_rate": 1.0658852867830424e-05, "loss": 0.0543, "step": 18740 }, { "epoch": 2.337905236907731, "grad_norm": 0.003005718346685171, "learning_rate": 1.0653865336658355e-05, "loss": 0.022, "step": 18750 }, { "epoch": 2.3391521197007483, "grad_norm": 0.003704602364450693, "learning_rate": 1.0648877805486286e-05, "loss": 0.0001, "step": 18760 }, { "epoch": 2.3403990024937658, "grad_norm": 0.07532232254743576, "learning_rate": 1.0643890274314214e-05, "loss": 0.0028, "step": 18770 }, { "epoch": 2.341645885286783, "grad_norm": 0.0018420940032228827, "learning_rate": 1.0638902743142145e-05, "loss": 0.0467, "step": 18780 }, { "epoch": 2.3428927680798006, "grad_norm": 0.005858957301825285, "learning_rate": 1.0633915211970076e-05, "loss": 0.0387, "step": 18790 }, { "epoch": 2.344139650872818, "grad_norm": 0.020084038376808167, "learning_rate": 1.0628927680798006e-05, "loss": 0.0513, "step": 18800 }, { "epoch": 2.3453865336658355, "grad_norm": 0.008996500633656979, "learning_rate": 1.0623940149625935e-05, "loss": 0.0003, "step": 18810 }, { "epoch": 2.346633416458853, "grad_norm": 0.003694218583405018, "learning_rate": 1.0618952618453866e-05, "loss": 0.0001, "step": 18820 }, { "epoch": 2.3478802992518704, "grad_norm": 0.018681691959500313, "learning_rate": 1.0613965087281796e-05, "loss": 0.0012, "step": 18830 }, { "epoch": 2.349127182044888, "grad_norm": 0.00553112244233489, "learning_rate": 1.0608977556109727e-05, "loss": 0.0102, "step": 18840 }, { "epoch": 2.3503740648379052, "grad_norm": 0.10294589400291443, "learning_rate": 1.0603990024937656e-05, "loss": 0.0603, "step": 18850 }, { "epoch": 2.3516209476309227, "grad_norm": 0.0040004318580031395, "learning_rate": 1.0599002493765587e-05, "loss": 0.0424, "step": 18860 }, { "epoch": 2.35286783042394, "grad_norm": 0.0010672288481146097, "learning_rate": 1.0594014962593517e-05, "loss": 0.0001, "step": 18870 }, { "epoch": 2.3541147132169575, "grad_norm": 0.003163151443004608, "learning_rate": 1.0589027431421448e-05, "loss": 0.0031, "step": 18880 }, { "epoch": 2.355361596009975, "grad_norm": 0.24321340024471283, "learning_rate": 1.0584039900249377e-05, "loss": 0.0003, "step": 18890 }, { "epoch": 2.3566084788029924, "grad_norm": 0.7143842577934265, "learning_rate": 1.0579052369077307e-05, "loss": 0.0274, "step": 18900 }, { "epoch": 2.35785536159601, "grad_norm": 0.0029214757960289717, "learning_rate": 1.0574064837905238e-05, "loss": 0.0373, "step": 18910 }, { "epoch": 2.3591022443890273, "grad_norm": 0.0009077019640244544, "learning_rate": 1.0569077306733169e-05, "loss": 0.0375, "step": 18920 }, { "epoch": 2.3603491271820447, "grad_norm": 0.007901903241872787, "learning_rate": 1.0564089775561098e-05, "loss": 0.0001, "step": 18930 }, { "epoch": 2.361596009975062, "grad_norm": 23.092758178710938, "learning_rate": 1.0559102244389028e-05, "loss": 0.028, "step": 18940 }, { "epoch": 2.3628428927680796, "grad_norm": 0.006665545050054789, "learning_rate": 1.0554114713216959e-05, "loss": 0.029, "step": 18950 }, { "epoch": 2.3640897755610975, "grad_norm": 0.1113353744149208, "learning_rate": 1.054912718204489e-05, "loss": 0.0002, "step": 18960 }, { "epoch": 2.365336658354115, "grad_norm": 0.0011409720173105597, "learning_rate": 1.0544139650872818e-05, "loss": 0.0443, "step": 18970 }, { "epoch": 2.3665835411471323, "grad_norm": 0.006274055223912001, "learning_rate": 1.0539152119700749e-05, "loss": 0.0001, "step": 18980 }, { "epoch": 2.3678304239401498, "grad_norm": 0.03493626415729523, "learning_rate": 1.053416458852868e-05, "loss": 0.0148, "step": 18990 }, { "epoch": 2.369077306733167, "grad_norm": 0.0024186724331229925, "learning_rate": 1.052917705735661e-05, "loss": 0.0008, "step": 19000 }, { "epoch": 2.3703241895261846, "grad_norm": 0.0004868563555646688, "learning_rate": 1.052418952618454e-05, "loss": 0.0154, "step": 19010 }, { "epoch": 2.371571072319202, "grad_norm": 0.01056770421564579, "learning_rate": 1.051920199501247e-05, "loss": 0.0014, "step": 19020 }, { "epoch": 2.3728179551122195, "grad_norm": 0.0035313288681209087, "learning_rate": 1.05142144638404e-05, "loss": 0.0001, "step": 19030 }, { "epoch": 2.374064837905237, "grad_norm": 62.090789794921875, "learning_rate": 1.0509226932668331e-05, "loss": 0.0211, "step": 19040 }, { "epoch": 2.3753117206982544, "grad_norm": 0.0009852793300524354, "learning_rate": 1.0504239401496262e-05, "loss": 0.0001, "step": 19050 }, { "epoch": 2.376558603491272, "grad_norm": 0.017761345952749252, "learning_rate": 1.0499251870324189e-05, "loss": 0.0371, "step": 19060 }, { "epoch": 2.3778054862842892, "grad_norm": 0.003993969410657883, "learning_rate": 1.0494264339152121e-05, "loss": 0.011, "step": 19070 }, { "epoch": 2.3790523690773067, "grad_norm": 0.14724621176719666, "learning_rate": 1.0489276807980052e-05, "loss": 0.0002, "step": 19080 }, { "epoch": 2.380299251870324, "grad_norm": 102.79788208007812, "learning_rate": 1.0484289276807982e-05, "loss": 0.0175, "step": 19090 }, { "epoch": 2.3815461346633415, "grad_norm": 49.15989303588867, "learning_rate": 1.047930174563591e-05, "loss": 0.0631, "step": 19100 }, { "epoch": 2.382793017456359, "grad_norm": 0.16368083655834198, "learning_rate": 1.047431421446384e-05, "loss": 0.004, "step": 19110 }, { "epoch": 2.3840399002493764, "grad_norm": 0.0005702089983969927, "learning_rate": 1.0469326683291771e-05, "loss": 0.0001, "step": 19120 }, { "epoch": 2.385286783042394, "grad_norm": 0.0047392770648002625, "learning_rate": 1.0464339152119703e-05, "loss": 0.0477, "step": 19130 }, { "epoch": 2.3865336658354117, "grad_norm": 0.0008517673704773188, "learning_rate": 1.045935162094763e-05, "loss": 0.0668, "step": 19140 }, { "epoch": 2.387780548628429, "grad_norm": 0.0008810373838059604, "learning_rate": 1.0454364089775561e-05, "loss": 0.0002, "step": 19150 }, { "epoch": 2.3890274314214466, "grad_norm": 0.0014551517087966204, "learning_rate": 1.0449376558603492e-05, "loss": 0.0002, "step": 19160 }, { "epoch": 2.390274314214464, "grad_norm": 0.00041260942816734314, "learning_rate": 1.0444389027431422e-05, "loss": 0.0601, "step": 19170 }, { "epoch": 2.3915211970074814, "grad_norm": 0.012174108065664768, "learning_rate": 1.0439401496259351e-05, "loss": 0.0321, "step": 19180 }, { "epoch": 2.392768079800499, "grad_norm": 0.055804114788770676, "learning_rate": 1.0434413965087282e-05, "loss": 0.0318, "step": 19190 }, { "epoch": 2.3940149625935163, "grad_norm": 0.005796500016003847, "learning_rate": 1.0429426433915212e-05, "loss": 0.0001, "step": 19200 }, { "epoch": 2.3952618453865338, "grad_norm": 0.0011061924742534757, "learning_rate": 1.0424438902743143e-05, "loss": 0.0374, "step": 19210 }, { "epoch": 2.396508728179551, "grad_norm": 0.0011768280528485775, "learning_rate": 1.0419451371571072e-05, "loss": 0.0058, "step": 19220 }, { "epoch": 2.3977556109725686, "grad_norm": 0.004094341304153204, "learning_rate": 1.0414463840399003e-05, "loss": 0.0001, "step": 19230 }, { "epoch": 2.399002493765586, "grad_norm": 0.005375190172344446, "learning_rate": 1.0409476309226933e-05, "loss": 0.0001, "step": 19240 }, { "epoch": 2.4002493765586035, "grad_norm": 0.0006093172705732286, "learning_rate": 1.0404488778054864e-05, "loss": 0.0007, "step": 19250 }, { "epoch": 2.401496259351621, "grad_norm": 3.3103253841400146, "learning_rate": 1.0399501246882794e-05, "loss": 0.0166, "step": 19260 }, { "epoch": 2.4027431421446384, "grad_norm": 0.0015977061120793223, "learning_rate": 1.0394513715710723e-05, "loss": 0.0, "step": 19270 }, { "epoch": 2.403990024937656, "grad_norm": 23.932458877563477, "learning_rate": 1.0389526184538654e-05, "loss": 0.0425, "step": 19280 }, { "epoch": 2.4052369077306732, "grad_norm": 0.0015871457289904356, "learning_rate": 1.0384538653366585e-05, "loss": 0.042, "step": 19290 }, { "epoch": 2.4064837905236907, "grad_norm": 0.002823758404701948, "learning_rate": 1.0379551122194515e-05, "loss": 0.0002, "step": 19300 }, { "epoch": 2.407730673316708, "grad_norm": 0.004873698577284813, "learning_rate": 1.0374563591022444e-05, "loss": 0.0028, "step": 19310 }, { "epoch": 2.4089775561097255, "grad_norm": 70.15239715576172, "learning_rate": 1.0369576059850375e-05, "loss": 0.0039, "step": 19320 }, { "epoch": 2.410224438902743, "grad_norm": 8.596805572509766, "learning_rate": 1.0364588528678305e-05, "loss": 0.048, "step": 19330 }, { "epoch": 2.4114713216957604, "grad_norm": 0.0022795642726123333, "learning_rate": 1.0359600997506236e-05, "loss": 0.0956, "step": 19340 }, { "epoch": 2.412718204488778, "grad_norm": 0.0011459199013188481, "learning_rate": 1.0354613466334165e-05, "loss": 0.0231, "step": 19350 }, { "epoch": 2.4139650872817953, "grad_norm": 0.014403182081878185, "learning_rate": 1.0349625935162096e-05, "loss": 0.0007, "step": 19360 }, { "epoch": 2.415211970074813, "grad_norm": 0.0014125104062259197, "learning_rate": 1.0344638403990026e-05, "loss": 0.0001, "step": 19370 }, { "epoch": 2.4164588528678306, "grad_norm": 20.214357376098633, "learning_rate": 1.0339650872817957e-05, "loss": 0.0022, "step": 19380 }, { "epoch": 2.417705735660848, "grad_norm": 0.015539759770035744, "learning_rate": 1.0334663341645886e-05, "loss": 0.0359, "step": 19390 }, { "epoch": 2.4189526184538654, "grad_norm": 0.002788018202409148, "learning_rate": 1.0329675810473816e-05, "loss": 0.0505, "step": 19400 }, { "epoch": 2.420199501246883, "grad_norm": 0.008958714082837105, "learning_rate": 1.0324688279301747e-05, "loss": 0.0033, "step": 19410 }, { "epoch": 2.4214463840399003, "grad_norm": 0.03293564170598984, "learning_rate": 1.0319700748129678e-05, "loss": 0.0016, "step": 19420 }, { "epoch": 2.4226932668329177, "grad_norm": 0.002108257031068206, "learning_rate": 1.0314713216957607e-05, "loss": 0.0005, "step": 19430 }, { "epoch": 2.423940149625935, "grad_norm": 0.0034332843497395515, "learning_rate": 1.0309725685785537e-05, "loss": 0.0068, "step": 19440 }, { "epoch": 2.4251870324189526, "grad_norm": 0.01002184022217989, "learning_rate": 1.0304738154613468e-05, "loss": 0.0025, "step": 19450 }, { "epoch": 2.42643391521197, "grad_norm": 0.0007235811208374798, "learning_rate": 1.0299750623441398e-05, "loss": 0.0333, "step": 19460 }, { "epoch": 2.4276807980049875, "grad_norm": 87.51153564453125, "learning_rate": 1.0294763092269327e-05, "loss": 0.021, "step": 19470 }, { "epoch": 2.428927680798005, "grad_norm": 0.01124797947704792, "learning_rate": 1.0289775561097258e-05, "loss": 0.0001, "step": 19480 }, { "epoch": 2.4301745635910224, "grad_norm": 0.06198972091078758, "learning_rate": 1.0284788029925189e-05, "loss": 0.0001, "step": 19490 }, { "epoch": 2.43142144638404, "grad_norm": 0.00533120846375823, "learning_rate": 1.027980049875312e-05, "loss": 0.0658, "step": 19500 }, { "epoch": 2.432668329177057, "grad_norm": 9.353033065795898, "learning_rate": 1.027481296758105e-05, "loss": 0.0092, "step": 19510 }, { "epoch": 2.4339152119700747, "grad_norm": 0.00038408240652643144, "learning_rate": 1.0269825436408977e-05, "loss": 0.0513, "step": 19520 }, { "epoch": 2.435162094763092, "grad_norm": 0.010251792147755623, "learning_rate": 1.026483790523691e-05, "loss": 0.0001, "step": 19530 }, { "epoch": 2.43640897755611, "grad_norm": 0.0018415587255731225, "learning_rate": 1.025985037406484e-05, "loss": 0.0001, "step": 19540 }, { "epoch": 2.4376558603491274, "grad_norm": 0.0017100636614486575, "learning_rate": 1.025486284289277e-05, "loss": 0.0023, "step": 19550 }, { "epoch": 2.438902743142145, "grad_norm": 0.0007173551712185144, "learning_rate": 1.0249875311720698e-05, "loss": 0.0741, "step": 19560 }, { "epoch": 2.4401496259351623, "grad_norm": 0.001977028325200081, "learning_rate": 1.0244887780548628e-05, "loss": 0.0235, "step": 19570 }, { "epoch": 2.4413965087281797, "grad_norm": 23.780227661132812, "learning_rate": 1.0239900249376559e-05, "loss": 0.044, "step": 19580 }, { "epoch": 2.442643391521197, "grad_norm": 0.012191710062325, "learning_rate": 1.0234912718204491e-05, "loss": 0.0306, "step": 19590 }, { "epoch": 2.4438902743142146, "grad_norm": 0.007011815905570984, "learning_rate": 1.0229925187032419e-05, "loss": 0.0174, "step": 19600 }, { "epoch": 2.445137157107232, "grad_norm": 6.738776683807373, "learning_rate": 1.022493765586035e-05, "loss": 0.0007, "step": 19610 }, { "epoch": 2.4463840399002494, "grad_norm": 0.0014683338813483715, "learning_rate": 1.021995012468828e-05, "loss": 0.0327, "step": 19620 }, { "epoch": 2.447630922693267, "grad_norm": 0.003384089795872569, "learning_rate": 1.021496259351621e-05, "loss": 0.0271, "step": 19630 }, { "epoch": 2.4488778054862843, "grad_norm": 36.67137908935547, "learning_rate": 1.020997506234414e-05, "loss": 0.0254, "step": 19640 }, { "epoch": 2.4501246882793017, "grad_norm": 0.01410730741918087, "learning_rate": 1.020498753117207e-05, "loss": 0.0142, "step": 19650 }, { "epoch": 2.451371571072319, "grad_norm": 0.060422949492931366, "learning_rate": 1.02e-05, "loss": 0.034, "step": 19660 }, { "epoch": 2.4526184538653366, "grad_norm": 0.0010058670304715633, "learning_rate": 1.0195012468827931e-05, "loss": 0.0001, "step": 19670 }, { "epoch": 2.453865336658354, "grad_norm": 0.4131668210029602, "learning_rate": 1.019002493765586e-05, "loss": 0.0342, "step": 19680 }, { "epoch": 2.4551122194513715, "grad_norm": 0.0024416018277406693, "learning_rate": 1.018503740648379e-05, "loss": 0.0509, "step": 19690 }, { "epoch": 2.456359102244389, "grad_norm": 0.012069313786923885, "learning_rate": 1.0180049875311721e-05, "loss": 0.0849, "step": 19700 }, { "epoch": 2.4576059850374063, "grad_norm": 0.01398361474275589, "learning_rate": 1.0175062344139652e-05, "loss": 0.0031, "step": 19710 }, { "epoch": 2.458852867830424, "grad_norm": 0.004862997680902481, "learning_rate": 1.0170074812967581e-05, "loss": 0.0204, "step": 19720 }, { "epoch": 2.460099750623441, "grad_norm": 20.287076950073242, "learning_rate": 1.0165087281795512e-05, "loss": 0.0088, "step": 19730 }, { "epoch": 2.4613466334164587, "grad_norm": 0.06296839565038681, "learning_rate": 1.0160099750623442e-05, "loss": 0.0001, "step": 19740 }, { "epoch": 2.462593516209476, "grad_norm": 0.0026271676179021597, "learning_rate": 1.0155112219451373e-05, "loss": 0.0116, "step": 19750 }, { "epoch": 2.4638403990024935, "grad_norm": 0.03639456257224083, "learning_rate": 1.0150124688279303e-05, "loss": 0.0002, "step": 19760 }, { "epoch": 2.4650872817955114, "grad_norm": 1.8751825094223022, "learning_rate": 1.0145137157107232e-05, "loss": 0.0206, "step": 19770 }, { "epoch": 2.466334164588529, "grad_norm": 0.011576034128665924, "learning_rate": 1.0140149625935163e-05, "loss": 0.048, "step": 19780 }, { "epoch": 2.4675810473815463, "grad_norm": 25.818098068237305, "learning_rate": 1.0135162094763094e-05, "loss": 0.108, "step": 19790 }, { "epoch": 2.4688279301745637, "grad_norm": 0.0033021382987499237, "learning_rate": 1.0130174563591024e-05, "loss": 0.0261, "step": 19800 }, { "epoch": 2.470074812967581, "grad_norm": 0.0013052476570010185, "learning_rate": 1.0125187032418953e-05, "loss": 0.0002, "step": 19810 }, { "epoch": 2.4713216957605986, "grad_norm": 0.018260836601257324, "learning_rate": 1.0120199501246884e-05, "loss": 0.0285, "step": 19820 }, { "epoch": 2.472568578553616, "grad_norm": 0.005579350516200066, "learning_rate": 1.0115211970074814e-05, "loss": 0.0009, "step": 19830 }, { "epoch": 2.4738154613466334, "grad_norm": 0.007938280701637268, "learning_rate": 1.0110224438902745e-05, "loss": 0.0001, "step": 19840 }, { "epoch": 2.475062344139651, "grad_norm": 0.0019651507027447224, "learning_rate": 1.0105236907730674e-05, "loss": 0.0751, "step": 19850 }, { "epoch": 2.4763092269326683, "grad_norm": 0.005481358617544174, "learning_rate": 1.0100249376558605e-05, "loss": 0.0428, "step": 19860 }, { "epoch": 2.4775561097256857, "grad_norm": 0.041737351566553116, "learning_rate": 1.0095261845386535e-05, "loss": 0.0002, "step": 19870 }, { "epoch": 2.478802992518703, "grad_norm": 0.0019266968593001366, "learning_rate": 1.0090274314214466e-05, "loss": 0.0009, "step": 19880 }, { "epoch": 2.4800498753117206, "grad_norm": 0.014034237712621689, "learning_rate": 1.0085286783042395e-05, "loss": 0.0298, "step": 19890 }, { "epoch": 2.481296758104738, "grad_norm": 0.286851167678833, "learning_rate": 1.0080299251870325e-05, "loss": 0.0002, "step": 19900 }, { "epoch": 2.4825436408977555, "grad_norm": 0.002897009951993823, "learning_rate": 1.0075311720698256e-05, "loss": 0.0002, "step": 19910 }, { "epoch": 2.483790523690773, "grad_norm": 0.4794400930404663, "learning_rate": 1.0070324189526187e-05, "loss": 0.0003, "step": 19920 }, { "epoch": 2.4850374064837903, "grad_norm": 0.02620755136013031, "learning_rate": 1.0065336658354114e-05, "loss": 0.0392, "step": 19930 }, { "epoch": 2.4862842892768082, "grad_norm": Infinity, "learning_rate": 1.0060847880299254e-05, "loss": 0.0221, "step": 19940 }, { "epoch": 2.4875311720698257, "grad_norm": 1.459934115409851, "learning_rate": 1.0055860349127183e-05, "loss": 0.0003, "step": 19950 }, { "epoch": 2.488778054862843, "grad_norm": 0.012412721291184425, "learning_rate": 1.0050872817955113e-05, "loss": 0.0384, "step": 19960 }, { "epoch": 2.4900249376558605, "grad_norm": 0.005780264735221863, "learning_rate": 1.0045885286783044e-05, "loss": 0.0007, "step": 19970 }, { "epoch": 2.491271820448878, "grad_norm": 0.00027117590070702136, "learning_rate": 1.0040897755610974e-05, "loss": 0.0019, "step": 19980 }, { "epoch": 2.4925187032418954, "grad_norm": 0.3980230391025543, "learning_rate": 1.0035910224438903e-05, "loss": 0.0002, "step": 19990 }, { "epoch": 2.493765586034913, "grad_norm": 0.0018493568059056997, "learning_rate": 1.0030922693266834e-05, "loss": 0.0006, "step": 20000 }, { "epoch": 2.4950124688279303, "grad_norm": 0.000776842818595469, "learning_rate": 1.0025935162094765e-05, "loss": 0.0001, "step": 20010 }, { "epoch": 2.4962593516209477, "grad_norm": 0.003493920899927616, "learning_rate": 1.0020947630922695e-05, "loss": 0.0608, "step": 20020 }, { "epoch": 2.497506234413965, "grad_norm": 0.012550266459584236, "learning_rate": 1.0015960099750624e-05, "loss": 0.0002, "step": 20030 }, { "epoch": 2.4987531172069826, "grad_norm": 23.514270782470703, "learning_rate": 1.0010972568578555e-05, "loss": 0.0664, "step": 20040 }, { "epoch": 2.5, "grad_norm": 0.0014747095992788672, "learning_rate": 1.0005985037406485e-05, "loss": 0.0001, "step": 20050 }, { "epoch": 2.5012468827930174, "grad_norm": 0.0006397130782715976, "learning_rate": 1.0000997506234416e-05, "loss": 0.0527, "step": 20060 }, { "epoch": 2.502493765586035, "grad_norm": 0.004135058261454105, "learning_rate": 9.996009975062345e-06, "loss": 0.0001, "step": 20070 }, { "epoch": 2.5037406483790523, "grad_norm": 0.00247123371809721, "learning_rate": 9.991022443890276e-06, "loss": 0.0001, "step": 20080 }, { "epoch": 2.5049875311720697, "grad_norm": 0.2668535113334656, "learning_rate": 9.986034912718206e-06, "loss": 0.0003, "step": 20090 }, { "epoch": 2.506234413965087, "grad_norm": 0.001860244432464242, "learning_rate": 9.981047381546135e-06, "loss": 0.0459, "step": 20100 }, { "epoch": 2.5074812967581046, "grad_norm": 0.0009490547818131745, "learning_rate": 9.976059850374066e-06, "loss": 0.0171, "step": 20110 }, { "epoch": 2.508728179551122, "grad_norm": 0.002527831355109811, "learning_rate": 9.971072319201995e-06, "loss": 0.0003, "step": 20120 }, { "epoch": 2.5099750623441395, "grad_norm": 0.0007606217986904085, "learning_rate": 9.966084788029925e-06, "loss": 0.0406, "step": 20130 }, { "epoch": 2.511221945137157, "grad_norm": 0.0025609172880649567, "learning_rate": 9.961097256857856e-06, "loss": 0.0, "step": 20140 }, { "epoch": 2.5124688279301743, "grad_norm": 0.008907945826649666, "learning_rate": 9.956109725685787e-06, "loss": 0.0002, "step": 20150 }, { "epoch": 2.5137157107231918, "grad_norm": 0.001625910634174943, "learning_rate": 9.951122194513715e-06, "loss": 0.0094, "step": 20160 }, { "epoch": 2.514962593516209, "grad_norm": 0.0023298319429159164, "learning_rate": 9.946134663341646e-06, "loss": 0.0001, "step": 20170 }, { "epoch": 2.516209476309227, "grad_norm": 0.0006635018507950008, "learning_rate": 9.941147132169577e-06, "loss": 0.0, "step": 20180 }, { "epoch": 2.5174563591022445, "grad_norm": 0.008249226957559586, "learning_rate": 9.936159600997507e-06, "loss": 0.0, "step": 20190 }, { "epoch": 2.518703241895262, "grad_norm": 15.991927146911621, "learning_rate": 9.931172069825438e-06, "loss": 0.0927, "step": 20200 }, { "epoch": 2.5199501246882794, "grad_norm": 0.01762673631310463, "learning_rate": 9.926184538653367e-06, "loss": 0.0001, "step": 20210 }, { "epoch": 2.521197007481297, "grad_norm": 0.24744418263435364, "learning_rate": 9.921197007481297e-06, "loss": 0.0003, "step": 20220 }, { "epoch": 2.5224438902743143, "grad_norm": 0.003450005082413554, "learning_rate": 9.916209476309228e-06, "loss": 0.0606, "step": 20230 }, { "epoch": 2.5236907730673317, "grad_norm": 0.42235738039016724, "learning_rate": 9.911221945137159e-06, "loss": 0.0005, "step": 20240 }, { "epoch": 2.524937655860349, "grad_norm": 0.01129044871777296, "learning_rate": 9.906234413965088e-06, "loss": 0.0001, "step": 20250 }, { "epoch": 2.5261845386533666, "grad_norm": 0.0008669699891470373, "learning_rate": 9.901246882793018e-06, "loss": 0.0352, "step": 20260 }, { "epoch": 2.527431421446384, "grad_norm": 0.001846897415816784, "learning_rate": 9.896259351620949e-06, "loss": 0.0014, "step": 20270 }, { "epoch": 2.5286783042394014, "grad_norm": 0.0006753307534381747, "learning_rate": 9.89127182044888e-06, "loss": 0.0023, "step": 20280 }, { "epoch": 2.529925187032419, "grad_norm": 0.0007630666368640959, "learning_rate": 9.886284289276808e-06, "loss": 0.0, "step": 20290 }, { "epoch": 2.5311720698254363, "grad_norm": 0.0007382095791399479, "learning_rate": 9.881296758104739e-06, "loss": 0.0001, "step": 20300 }, { "epoch": 2.5324189526184537, "grad_norm": 0.0006792868953198195, "learning_rate": 9.876309226932668e-06, "loss": 0.0114, "step": 20310 }, { "epoch": 2.533665835411471, "grad_norm": 0.001210409332998097, "learning_rate": 9.8713216957606e-06, "loss": 0.0006, "step": 20320 }, { "epoch": 2.534912718204489, "grad_norm": 0.5116429924964905, "learning_rate": 9.86633416458853e-06, "loss": 0.0005, "step": 20330 }, { "epoch": 2.5361596009975065, "grad_norm": 0.031346336007118225, "learning_rate": 9.86134663341646e-06, "loss": 0.0445, "step": 20340 }, { "epoch": 2.537406483790524, "grad_norm": 0.0006276296335272491, "learning_rate": 9.856359102244389e-06, "loss": 0.0015, "step": 20350 }, { "epoch": 2.5386533665835413, "grad_norm": 46.322044372558594, "learning_rate": 9.85137157107232e-06, "loss": 0.0762, "step": 20360 }, { "epoch": 2.539900249376559, "grad_norm": 0.0012356149964034557, "learning_rate": 9.84638403990025e-06, "loss": 0.0908, "step": 20370 }, { "epoch": 2.541147132169576, "grad_norm": 0.0005176261183805764, "learning_rate": 9.84139650872818e-06, "loss": 0.0001, "step": 20380 }, { "epoch": 2.5423940149625937, "grad_norm": 0.009803502820432186, "learning_rate": 9.83640897755611e-06, "loss": 0.0001, "step": 20390 }, { "epoch": 2.543640897755611, "grad_norm": 0.006273037288337946, "learning_rate": 9.83142144638404e-06, "loss": 0.0001, "step": 20400 }, { "epoch": 2.5448877805486285, "grad_norm": 0.0008425627020187676, "learning_rate": 9.82643391521197e-06, "loss": 0.003, "step": 20410 }, { "epoch": 2.546134663341646, "grad_norm": 0.0024014932569116354, "learning_rate": 9.821446384039901e-06, "loss": 0.0062, "step": 20420 }, { "epoch": 2.5473815461346634, "grad_norm": 0.0010444270446896553, "learning_rate": 9.816458852867832e-06, "loss": 0.03, "step": 20430 }, { "epoch": 2.548628428927681, "grad_norm": 0.002096327021718025, "learning_rate": 9.811471321695761e-06, "loss": 0.0051, "step": 20440 }, { "epoch": 2.5498753117206983, "grad_norm": 0.18898934125900269, "learning_rate": 9.806483790523692e-06, "loss": 0.0004, "step": 20450 }, { "epoch": 2.5511221945137157, "grad_norm": 0.006348905619233847, "learning_rate": 9.801496259351622e-06, "loss": 0.0003, "step": 20460 }, { "epoch": 2.552369077306733, "grad_norm": 0.0026317578740417957, "learning_rate": 9.796508728179553e-06, "loss": 0.0468, "step": 20470 }, { "epoch": 2.5536159600997506, "grad_norm": 0.0028476768638938665, "learning_rate": 9.791521197007482e-06, "loss": 0.0614, "step": 20480 }, { "epoch": 2.554862842892768, "grad_norm": 0.0013454663567245007, "learning_rate": 9.786533665835412e-06, "loss": 0.0615, "step": 20490 }, { "epoch": 2.5561097256857854, "grad_norm": 0.0007891925633884966, "learning_rate": 9.781546134663343e-06, "loss": 0.0, "step": 20500 }, { "epoch": 2.557356608478803, "grad_norm": 0.001992990029975772, "learning_rate": 9.776558603491274e-06, "loss": 0.0553, "step": 20510 }, { "epoch": 2.5586034912718203, "grad_norm": 0.0006816174718551338, "learning_rate": 9.771571072319203e-06, "loss": 0.0001, "step": 20520 }, { "epoch": 2.5598503740648377, "grad_norm": 0.004548321943730116, "learning_rate": 9.766583541147133e-06, "loss": 0.0001, "step": 20530 }, { "epoch": 2.561097256857855, "grad_norm": 0.002307687886059284, "learning_rate": 9.761596009975062e-06, "loss": 0.0056, "step": 20540 }, { "epoch": 2.5623441396508726, "grad_norm": 0.001487944507971406, "learning_rate": 9.756608478802994e-06, "loss": 0.0143, "step": 20550 }, { "epoch": 2.56359102244389, "grad_norm": 2.5814740657806396, "learning_rate": 9.751620947630923e-06, "loss": 0.0006, "step": 20560 }, { "epoch": 2.5648379052369075, "grad_norm": 0.0018412600038573146, "learning_rate": 9.746633416458854e-06, "loss": 0.0002, "step": 20570 }, { "epoch": 2.5660847880299253, "grad_norm": 0.007456095889210701, "learning_rate": 9.741645885286783e-06, "loss": 0.0001, "step": 20580 }, { "epoch": 2.567331670822943, "grad_norm": 0.003610810497775674, "learning_rate": 9.736658354114713e-06, "loss": 0.0019, "step": 20590 }, { "epoch": 2.56857855361596, "grad_norm": 0.008344599045813084, "learning_rate": 9.731670822942644e-06, "loss": 0.0067, "step": 20600 }, { "epoch": 2.5698254364089776, "grad_norm": 75.59593200683594, "learning_rate": 9.726683291770575e-06, "loss": 0.037, "step": 20610 }, { "epoch": 2.571072319201995, "grad_norm": 0.18108241260051727, "learning_rate": 9.721695760598504e-06, "loss": 0.0001, "step": 20620 }, { "epoch": 2.5723192019950125, "grad_norm": 0.7683297395706177, "learning_rate": 9.716708229426434e-06, "loss": 0.0002, "step": 20630 }, { "epoch": 2.57356608478803, "grad_norm": 9.048687934875488, "learning_rate": 9.711720698254365e-06, "loss": 0.0009, "step": 20640 }, { "epoch": 2.5748129675810474, "grad_norm": 0.001917938468977809, "learning_rate": 9.706733167082295e-06, "loss": 0.0022, "step": 20650 }, { "epoch": 2.576059850374065, "grad_norm": 0.0009557070443406701, "learning_rate": 9.701745635910226e-06, "loss": 0.0024, "step": 20660 }, { "epoch": 2.5773067331670823, "grad_norm": 0.000653248920571059, "learning_rate": 9.696758104738155e-06, "loss": 0.0277, "step": 20670 }, { "epoch": 2.5785536159600997, "grad_norm": 0.0034873723052442074, "learning_rate": 9.691770573566086e-06, "loss": 0.0077, "step": 20680 }, { "epoch": 2.579800498753117, "grad_norm": 0.0020467734429985285, "learning_rate": 9.686783042394016e-06, "loss": 0.0054, "step": 20690 }, { "epoch": 2.5810473815461346, "grad_norm": 0.003967711236327887, "learning_rate": 9.681795511221947e-06, "loss": 0.0, "step": 20700 }, { "epoch": 2.582294264339152, "grad_norm": 0.0005544735467992723, "learning_rate": 9.676807980049876e-06, "loss": 0.043, "step": 20710 }, { "epoch": 2.5835411471321694, "grad_norm": 19.80573272705078, "learning_rate": 9.671820448877806e-06, "loss": 0.073, "step": 20720 }, { "epoch": 2.5847880299251873, "grad_norm": 0.06793244928121567, "learning_rate": 9.666832917705737e-06, "loss": 0.0036, "step": 20730 }, { "epoch": 2.5860349127182047, "grad_norm": 38.223018646240234, "learning_rate": 9.661845386533668e-06, "loss": 0.005, "step": 20740 }, { "epoch": 2.587281795511222, "grad_norm": 0.0006797302630729973, "learning_rate": 9.656857855361597e-06, "loss": 0.0001, "step": 20750 }, { "epoch": 2.5885286783042396, "grad_norm": 0.013156076893210411, "learning_rate": 9.651870324189527e-06, "loss": 0.0262, "step": 20760 }, { "epoch": 2.589775561097257, "grad_norm": 0.0013134418986737728, "learning_rate": 9.646882793017456e-06, "loss": 0.0, "step": 20770 }, { "epoch": 2.5910224438902745, "grad_norm": 0.0012859961716458201, "learning_rate": 9.641895261845387e-06, "loss": 0.0, "step": 20780 }, { "epoch": 2.592269326683292, "grad_norm": 0.0007756856502965093, "learning_rate": 9.636907730673317e-06, "loss": 0.0273, "step": 20790 }, { "epoch": 2.5935162094763093, "grad_norm": 0.0018667828990146518, "learning_rate": 9.631920199501248e-06, "loss": 0.0001, "step": 20800 }, { "epoch": 2.5947630922693268, "grad_norm": 0.0006253106403164566, "learning_rate": 9.626932668329177e-06, "loss": 0.0324, "step": 20810 }, { "epoch": 2.596009975062344, "grad_norm": 0.004920145496726036, "learning_rate": 9.621945137157108e-06, "loss": 0.044, "step": 20820 }, { "epoch": 2.5972568578553616, "grad_norm": 0.00701934564858675, "learning_rate": 9.616957605985038e-06, "loss": 0.0001, "step": 20830 }, { "epoch": 2.598503740648379, "grad_norm": 0.0005125249153934419, "learning_rate": 9.611970074812969e-06, "loss": 0.0004, "step": 20840 }, { "epoch": 2.5997506234413965, "grad_norm": 0.000786464661359787, "learning_rate": 9.606982543640898e-06, "loss": 0.0, "step": 20850 }, { "epoch": 2.600997506234414, "grad_norm": 0.0011195202823728323, "learning_rate": 9.601995012468828e-06, "loss": 0.0, "step": 20860 }, { "epoch": 2.6022443890274314, "grad_norm": 0.0008278349414467812, "learning_rate": 9.597007481296759e-06, "loss": 0.0644, "step": 20870 }, { "epoch": 2.603491271820449, "grad_norm": 0.004355038516223431, "learning_rate": 9.59201995012469e-06, "loss": 0.0001, "step": 20880 }, { "epoch": 2.6047381546134662, "grad_norm": 0.005529096815735102, "learning_rate": 9.587032418952618e-06, "loss": 0.0002, "step": 20890 }, { "epoch": 2.6059850374064837, "grad_norm": 0.013243569061160088, "learning_rate": 9.582044887780549e-06, "loss": 0.0489, "step": 20900 }, { "epoch": 2.607231920199501, "grad_norm": 2.0397703647613525, "learning_rate": 9.57705735660848e-06, "loss": 0.0005, "step": 20910 }, { "epoch": 2.6084788029925186, "grad_norm": 0.0008958008256740868, "learning_rate": 9.57206982543641e-06, "loss": 0.0571, "step": 20920 }, { "epoch": 2.609725685785536, "grad_norm": 0.022373545914888382, "learning_rate": 9.567082294264341e-06, "loss": 0.0812, "step": 20930 }, { "epoch": 2.6109725685785534, "grad_norm": 11.973950386047363, "learning_rate": 9.56209476309227e-06, "loss": 0.0421, "step": 20940 }, { "epoch": 2.612219451371571, "grad_norm": 0.0004938875208608806, "learning_rate": 9.5571072319202e-06, "loss": 0.001, "step": 20950 }, { "epoch": 2.6134663341645883, "grad_norm": 0.004167089704424143, "learning_rate": 9.552119700748131e-06, "loss": 0.0583, "step": 20960 }, { "epoch": 2.6147132169576057, "grad_norm": 0.11523910611867905, "learning_rate": 9.547132169576062e-06, "loss": 0.0001, "step": 20970 }, { "epoch": 2.6159600997506236, "grad_norm": 0.0013754901010543108, "learning_rate": 9.54214463840399e-06, "loss": 0.0001, "step": 20980 }, { "epoch": 2.617206982543641, "grad_norm": 28.791500091552734, "learning_rate": 9.537157107231921e-06, "loss": 0.0372, "step": 20990 }, { "epoch": 2.6184538653366585, "grad_norm": 0.0019191583851352334, "learning_rate": 9.53216957605985e-06, "loss": 0.0003, "step": 21000 }, { "epoch": 2.619700748129676, "grad_norm": 0.001109414966776967, "learning_rate": 9.527182044887781e-06, "loss": 0.0002, "step": 21010 }, { "epoch": 2.6209476309226933, "grad_norm": 0.0011104693403467536, "learning_rate": 9.522194513715711e-06, "loss": 0.0176, "step": 21020 }, { "epoch": 2.6221945137157108, "grad_norm": 6.231688022613525, "learning_rate": 9.517206982543642e-06, "loss": 0.001, "step": 21030 }, { "epoch": 2.623441396508728, "grad_norm": 0.0015027527697384357, "learning_rate": 9.512219451371571e-06, "loss": 0.0525, "step": 21040 }, { "epoch": 2.6246882793017456, "grad_norm": 0.15147195756435394, "learning_rate": 9.507231920199502e-06, "loss": 0.0001, "step": 21050 }, { "epoch": 2.625935162094763, "grad_norm": 0.00728128245100379, "learning_rate": 9.502244389027432e-06, "loss": 0.0008, "step": 21060 }, { "epoch": 2.6271820448877805, "grad_norm": 0.029355598613619804, "learning_rate": 9.497256857855363e-06, "loss": 0.0006, "step": 21070 }, { "epoch": 2.628428927680798, "grad_norm": 0.005401694681495428, "learning_rate": 9.492269326683292e-06, "loss": 0.0001, "step": 21080 }, { "epoch": 2.6296758104738154, "grad_norm": 0.09019399434328079, "learning_rate": 9.487281795511222e-06, "loss": 0.0001, "step": 21090 }, { "epoch": 2.630922693266833, "grad_norm": 0.013616259209811687, "learning_rate": 9.482294264339153e-06, "loss": 0.0075, "step": 21100 }, { "epoch": 2.6321695760598502, "grad_norm": 0.16121892631053925, "learning_rate": 9.477306733167084e-06, "loss": 0.039, "step": 21110 }, { "epoch": 2.6334164588528677, "grad_norm": 0.028935719281435013, "learning_rate": 9.472319201995013e-06, "loss": 0.0344, "step": 21120 }, { "epoch": 2.6346633416458856, "grad_norm": 0.003628489328548312, "learning_rate": 9.467331670822943e-06, "loss": 0.0001, "step": 21130 }, { "epoch": 2.635910224438903, "grad_norm": 0.08887571841478348, "learning_rate": 9.462344139650874e-06, "loss": 0.0367, "step": 21140 }, { "epoch": 2.6371571072319204, "grad_norm": 43.324520111083984, "learning_rate": 9.457356608478804e-06, "loss": 0.077, "step": 21150 }, { "epoch": 2.638403990024938, "grad_norm": 0.0015528800431638956, "learning_rate": 9.452369077306735e-06, "loss": 0.0102, "step": 21160 }, { "epoch": 2.6396508728179553, "grad_norm": 0.001877202419564128, "learning_rate": 9.447381546134664e-06, "loss": 0.0363, "step": 21170 }, { "epoch": 2.6408977556109727, "grad_norm": 0.0009081983589567244, "learning_rate": 9.442394014962595e-06, "loss": 0.0257, "step": 21180 }, { "epoch": 2.64214463840399, "grad_norm": 0.007661967538297176, "learning_rate": 9.437406483790524e-06, "loss": 0.0042, "step": 21190 }, { "epoch": 2.6433915211970076, "grad_norm": 0.0014459396479651332, "learning_rate": 9.432418952618456e-06, "loss": 0.1062, "step": 21200 }, { "epoch": 2.644638403990025, "grad_norm": 0.005056493915617466, "learning_rate": 9.427431421446385e-06, "loss": 0.0654, "step": 21210 }, { "epoch": 2.6458852867830425, "grad_norm": 0.002910393523052335, "learning_rate": 9.422443890274315e-06, "loss": 0.0004, "step": 21220 }, { "epoch": 2.64713216957606, "grad_norm": 0.013651818037033081, "learning_rate": 9.417456359102244e-06, "loss": 0.03, "step": 21230 }, { "epoch": 2.6483790523690773, "grad_norm": 0.000517072097864002, "learning_rate": 9.412468827930175e-06, "loss": 0.031, "step": 21240 }, { "epoch": 2.6496259351620948, "grad_norm": 0.0005630677915178239, "learning_rate": 9.407481296758106e-06, "loss": 0.0001, "step": 21250 }, { "epoch": 2.650872817955112, "grad_norm": 0.01756085641682148, "learning_rate": 9.402493765586036e-06, "loss": 0.0407, "step": 21260 }, { "epoch": 2.6521197007481296, "grad_norm": 0.003300454467535019, "learning_rate": 9.397506234413965e-06, "loss": 0.0068, "step": 21270 }, { "epoch": 2.653366583541147, "grad_norm": 0.01421553548425436, "learning_rate": 9.392518703241896e-06, "loss": 0.0445, "step": 21280 }, { "epoch": 2.6546134663341645, "grad_norm": 0.010401526466012001, "learning_rate": 9.387531172069826e-06, "loss": 0.0026, "step": 21290 }, { "epoch": 2.655860349127182, "grad_norm": 0.0018934487598016858, "learning_rate": 9.382543640897757e-06, "loss": 0.0295, "step": 21300 }, { "epoch": 2.6571072319201994, "grad_norm": 0.0010117872152477503, "learning_rate": 9.377556109725686e-06, "loss": 0.0109, "step": 21310 }, { "epoch": 2.658354114713217, "grad_norm": 0.1258365958929062, "learning_rate": 9.372568578553616e-06, "loss": 0.1095, "step": 21320 }, { "epoch": 2.6596009975062342, "grad_norm": 0.002599473111331463, "learning_rate": 9.367581047381547e-06, "loss": 0.0659, "step": 21330 }, { "epoch": 2.6608478802992517, "grad_norm": 0.01711544394493103, "learning_rate": 9.362593516209478e-06, "loss": 0.0001, "step": 21340 }, { "epoch": 2.662094763092269, "grad_norm": 0.0035147909075021744, "learning_rate": 9.357605985037407e-06, "loss": 0.0365, "step": 21350 }, { "epoch": 2.6633416458852865, "grad_norm": 0.3379318118095398, "learning_rate": 9.352618453865337e-06, "loss": 0.0303, "step": 21360 }, { "epoch": 2.664588528678304, "grad_norm": 4.695526123046875, "learning_rate": 9.347630922693268e-06, "loss": 0.001, "step": 21370 }, { "epoch": 2.665835411471322, "grad_norm": 0.007908876985311508, "learning_rate": 9.342643391521199e-06, "loss": 0.0356, "step": 21380 }, { "epoch": 2.6670822942643393, "grad_norm": 0.006391298491507769, "learning_rate": 9.337655860349127e-06, "loss": 0.0015, "step": 21390 }, { "epoch": 2.6683291770573567, "grad_norm": 0.000589667062740773, "learning_rate": 9.332668329177058e-06, "loss": 0.0001, "step": 21400 }, { "epoch": 2.669576059850374, "grad_norm": 0.0071742599830031395, "learning_rate": 9.327680798004989e-06, "loss": 0.0726, "step": 21410 }, { "epoch": 2.6708229426433916, "grad_norm": 26.3955078125, "learning_rate": 9.322693266832918e-06, "loss": 0.0816, "step": 21420 }, { "epoch": 2.672069825436409, "grad_norm": 0.006245684809982777, "learning_rate": 9.31770573566085e-06, "loss": 0.0301, "step": 21430 }, { "epoch": 2.6733167082294265, "grad_norm": 0.0037288444582372904, "learning_rate": 9.312718204488779e-06, "loss": 0.0007, "step": 21440 }, { "epoch": 2.674563591022444, "grad_norm": 0.023546254262328148, "learning_rate": 9.30773067331671e-06, "loss": 0.0205, "step": 21450 }, { "epoch": 2.6758104738154613, "grad_norm": 0.05411310866475105, "learning_rate": 9.302743142144638e-06, "loss": 0.0353, "step": 21460 }, { "epoch": 2.6770573566084788, "grad_norm": 0.0028213809709995985, "learning_rate": 9.297755610972569e-06, "loss": 0.0014, "step": 21470 }, { "epoch": 2.678304239401496, "grad_norm": 0.1361941546201706, "learning_rate": 9.2927680798005e-06, "loss": 0.0416, "step": 21480 }, { "epoch": 2.6795511221945136, "grad_norm": 0.00903173815459013, "learning_rate": 9.28778054862843e-06, "loss": 0.0002, "step": 21490 }, { "epoch": 2.680798004987531, "grad_norm": 0.17789961397647858, "learning_rate": 9.28279301745636e-06, "loss": 0.0472, "step": 21500 }, { "epoch": 2.6820448877805485, "grad_norm": 0.026107098907232285, "learning_rate": 9.27780548628429e-06, "loss": 0.0002, "step": 21510 }, { "epoch": 2.683291770573566, "grad_norm": 0.0030536989215761423, "learning_rate": 9.27281795511222e-06, "loss": 0.0064, "step": 21520 }, { "epoch": 2.684538653366584, "grad_norm": 0.004354922566562891, "learning_rate": 9.267830423940151e-06, "loss": 0.0357, "step": 21530 }, { "epoch": 2.6857855361596013, "grad_norm": 0.026045413687825203, "learning_rate": 9.26284289276808e-06, "loss": 0.0243, "step": 21540 }, { "epoch": 2.6870324189526187, "grad_norm": 0.060480982065200806, "learning_rate": 9.25785536159601e-06, "loss": 0.0013, "step": 21550 }, { "epoch": 2.688279301745636, "grad_norm": 0.03292575851082802, "learning_rate": 9.252867830423941e-06, "loss": 0.0135, "step": 21560 }, { "epoch": 2.6895261845386536, "grad_norm": 4.122298717498779, "learning_rate": 9.247880299251872e-06, "loss": 0.0265, "step": 21570 }, { "epoch": 2.690773067331671, "grad_norm": 0.0019274103688076138, "learning_rate": 9.2428927680798e-06, "loss": 0.036, "step": 21580 }, { "epoch": 2.6920199501246884, "grad_norm": 0.008120754733681679, "learning_rate": 9.237905236907731e-06, "loss": 0.0259, "step": 21590 }, { "epoch": 2.693266832917706, "grad_norm": 0.23278528451919556, "learning_rate": 9.232917705735662e-06, "loss": 0.0005, "step": 21600 }, { "epoch": 2.6945137157107233, "grad_norm": 0.0032407217659056187, "learning_rate": 9.227930174563593e-06, "loss": 0.0034, "step": 21610 }, { "epoch": 2.6957605985037407, "grad_norm": 0.0015782383270561695, "learning_rate": 9.222942643391522e-06, "loss": 0.0001, "step": 21620 }, { "epoch": 2.697007481296758, "grad_norm": 0.002903692191466689, "learning_rate": 9.217955112219452e-06, "loss": 0.004, "step": 21630 }, { "epoch": 2.6982543640897756, "grad_norm": 0.09239204227924347, "learning_rate": 9.212967581047381e-06, "loss": 0.0275, "step": 21640 }, { "epoch": 2.699501246882793, "grad_norm": 0.36612099409103394, "learning_rate": 9.207980049875312e-06, "loss": 0.0287, "step": 21650 }, { "epoch": 2.7007481296758105, "grad_norm": 0.0029661785811185837, "learning_rate": 9.202992518703244e-06, "loss": 0.0003, "step": 21660 }, { "epoch": 2.701995012468828, "grad_norm": 0.0013707918114960194, "learning_rate": 9.198004987531173e-06, "loss": 0.0363, "step": 21670 }, { "epoch": 2.7032418952618453, "grad_norm": 0.008741063997149467, "learning_rate": 9.193017456359104e-06, "loss": 0.0053, "step": 21680 }, { "epoch": 2.7044887780548628, "grad_norm": 0.015000330284237862, "learning_rate": 9.188029925187032e-06, "loss": 0.0001, "step": 21690 }, { "epoch": 2.70573566084788, "grad_norm": 0.021080205217003822, "learning_rate": 9.183042394014963e-06, "loss": 0.0001, "step": 21700 }, { "epoch": 2.7069825436408976, "grad_norm": 19.31171226501465, "learning_rate": 9.178054862842894e-06, "loss": 0.0308, "step": 21710 }, { "epoch": 2.708229426433915, "grad_norm": 0.0010757783893495798, "learning_rate": 9.173067331670824e-06, "loss": 0.0178, "step": 21720 }, { "epoch": 2.7094763092269325, "grad_norm": 0.0015340207610279322, "learning_rate": 9.168079800498753e-06, "loss": 0.0034, "step": 21730 }, { "epoch": 2.71072319201995, "grad_norm": 0.0007147280848585069, "learning_rate": 9.163092269326684e-06, "loss": 0.0, "step": 21740 }, { "epoch": 2.7119700748129674, "grad_norm": 0.0007995369378477335, "learning_rate": 9.158104738154614e-06, "loss": 0.0003, "step": 21750 }, { "epoch": 2.713216957605985, "grad_norm": 0.03697559982538223, "learning_rate": 9.153117206982545e-06, "loss": 0.0007, "step": 21760 }, { "epoch": 2.7144638403990022, "grad_norm": 0.0008297286694869399, "learning_rate": 9.148129675810474e-06, "loss": 0.0001, "step": 21770 }, { "epoch": 2.71571072319202, "grad_norm": 0.000999205862171948, "learning_rate": 9.143142144638405e-06, "loss": 0.0006, "step": 21780 }, { "epoch": 2.7169576059850375, "grad_norm": 0.00030942095327191055, "learning_rate": 9.138154613466335e-06, "loss": 0.0001, "step": 21790 }, { "epoch": 2.718204488778055, "grad_norm": 0.011203780770301819, "learning_rate": 9.133167082294266e-06, "loss": 0.0004, "step": 21800 }, { "epoch": 2.7194513715710724, "grad_norm": 0.001825497834943235, "learning_rate": 9.128179551122195e-06, "loss": 0.0001, "step": 21810 }, { "epoch": 2.72069825436409, "grad_norm": 0.0016962387599050999, "learning_rate": 9.123192019950125e-06, "loss": 0.019, "step": 21820 }, { "epoch": 2.7219451371571073, "grad_norm": 27.44611358642578, "learning_rate": 9.118204488778054e-06, "loss": 0.0996, "step": 21830 }, { "epoch": 2.7231920199501247, "grad_norm": 0.0013832871336489916, "learning_rate": 9.113216957605987e-06, "loss": 0.0408, "step": 21840 }, { "epoch": 2.724438902743142, "grad_norm": 0.017434384673833847, "learning_rate": 9.108229426433916e-06, "loss": 0.0005, "step": 21850 }, { "epoch": 2.7256857855361596, "grad_norm": 0.0008209617808461189, "learning_rate": 9.103241895261846e-06, "loss": 0.0001, "step": 21860 }, { "epoch": 2.726932668329177, "grad_norm": 0.057141683995723724, "learning_rate": 9.098254364089775e-06, "loss": 0.0026, "step": 21870 }, { "epoch": 2.7281795511221945, "grad_norm": 0.0015385545557364821, "learning_rate": 9.093266832917706e-06, "loss": 0.0108, "step": 21880 }, { "epoch": 2.729426433915212, "grad_norm": 0.0029364656656980515, "learning_rate": 9.088279301745636e-06, "loss": 0.0513, "step": 21890 }, { "epoch": 2.7306733167082293, "grad_norm": 0.005240010563284159, "learning_rate": 9.083291770573567e-06, "loss": 0.0025, "step": 21900 }, { "epoch": 2.7319201995012468, "grad_norm": 0.0024220088962465525, "learning_rate": 9.078304239401498e-06, "loss": 0.0074, "step": 21910 }, { "epoch": 2.733167082294264, "grad_norm": 0.0010143570834770799, "learning_rate": 9.073316708229427e-06, "loss": 0.0833, "step": 21920 }, { "epoch": 2.734413965087282, "grad_norm": 0.002402370562776923, "learning_rate": 9.068329177057357e-06, "loss": 0.0035, "step": 21930 }, { "epoch": 2.7356608478802995, "grad_norm": 0.0024646620731800795, "learning_rate": 9.063341645885288e-06, "loss": 0.0393, "step": 21940 }, { "epoch": 2.736907730673317, "grad_norm": 0.00634728604927659, "learning_rate": 9.058354114713218e-06, "loss": 0.0001, "step": 21950 }, { "epoch": 2.7381546134663344, "grad_norm": 0.0004894645535387099, "learning_rate": 9.053366583541147e-06, "loss": 0.0424, "step": 21960 }, { "epoch": 2.739401496259352, "grad_norm": 0.008502400480210781, "learning_rate": 9.048379052369078e-06, "loss": 0.0002, "step": 21970 }, { "epoch": 2.7406483790523692, "grad_norm": 0.0029366957023739815, "learning_rate": 9.043391521197009e-06, "loss": 0.0819, "step": 21980 }, { "epoch": 2.7418952618453867, "grad_norm": 0.003992959391325712, "learning_rate": 9.03840399002494e-06, "loss": 0.0001, "step": 21990 }, { "epoch": 2.743142144638404, "grad_norm": 0.004475648049265146, "learning_rate": 9.033416458852868e-06, "loss": 0.065, "step": 22000 }, { "epoch": 2.7443890274314215, "grad_norm": 0.004949849098920822, "learning_rate": 9.028428927680799e-06, "loss": 0.0007, "step": 22010 }, { "epoch": 2.745635910224439, "grad_norm": 0.0036813318729400635, "learning_rate": 9.02344139650873e-06, "loss": 0.103, "step": 22020 }, { "epoch": 2.7468827930174564, "grad_norm": 0.006201781798154116, "learning_rate": 9.01845386533666e-06, "loss": 0.0162, "step": 22030 }, { "epoch": 2.748129675810474, "grad_norm": 0.003080939408391714, "learning_rate": 9.013466334164589e-06, "loss": 0.0005, "step": 22040 }, { "epoch": 2.7493765586034913, "grad_norm": 0.07051017880439758, "learning_rate": 9.00847880299252e-06, "loss": 0.0008, "step": 22050 }, { "epoch": 2.7506234413965087, "grad_norm": 0.11431020498275757, "learning_rate": 9.003491271820448e-06, "loss": 0.0003, "step": 22060 }, { "epoch": 2.751870324189526, "grad_norm": 57.69249725341797, "learning_rate": 8.99850374064838e-06, "loss": 0.0093, "step": 22070 }, { "epoch": 2.7531172069825436, "grad_norm": 0.06492702662944794, "learning_rate": 8.99351620947631e-06, "loss": 0.0018, "step": 22080 }, { "epoch": 2.754364089775561, "grad_norm": 0.004266063217073679, "learning_rate": 8.98852867830424e-06, "loss": 0.0132, "step": 22090 }, { "epoch": 2.7556109725685785, "grad_norm": 0.01503852941095829, "learning_rate": 8.98354114713217e-06, "loss": 0.0246, "step": 22100 }, { "epoch": 2.756857855361596, "grad_norm": 0.00257158768363297, "learning_rate": 8.9785536159601e-06, "loss": 0.0002, "step": 22110 }, { "epoch": 2.7581047381546133, "grad_norm": 61.2979736328125, "learning_rate": 8.97356608478803e-06, "loss": 0.0168, "step": 22120 }, { "epoch": 2.7593516209476308, "grad_norm": 0.01783956028521061, "learning_rate": 8.968578553615961e-06, "loss": 0.0084, "step": 22130 }, { "epoch": 2.760598503740648, "grad_norm": 0.004634057637304068, "learning_rate": 8.96359102244389e-06, "loss": 0.0007, "step": 22140 }, { "epoch": 2.7618453865336656, "grad_norm": 0.004496326670050621, "learning_rate": 8.95860349127182e-06, "loss": 0.0002, "step": 22150 }, { "epoch": 2.763092269326683, "grad_norm": 0.0019250494660809636, "learning_rate": 8.953615960099751e-06, "loss": 0.0097, "step": 22160 }, { "epoch": 2.7643391521197005, "grad_norm": 0.0009380208211950958, "learning_rate": 8.948628428927682e-06, "loss": 0.0089, "step": 22170 }, { "epoch": 2.765586034912718, "grad_norm": 0.07003989815711975, "learning_rate": 8.943640897755613e-06, "loss": 0.0016, "step": 22180 }, { "epoch": 2.766832917705736, "grad_norm": 0.0017002633539959788, "learning_rate": 8.938653366583541e-06, "loss": 0.0001, "step": 22190 }, { "epoch": 2.7680798004987532, "grad_norm": 0.0055083842016756535, "learning_rate": 8.933665835411472e-06, "loss": 0.0108, "step": 22200 }, { "epoch": 2.7693266832917707, "grad_norm": 22.551620483398438, "learning_rate": 8.928678304239403e-06, "loss": 0.0031, "step": 22210 }, { "epoch": 2.770573566084788, "grad_norm": 0.017204271629452705, "learning_rate": 8.923690773067333e-06, "loss": 0.0211, "step": 22220 }, { "epoch": 2.7718204488778055, "grad_norm": 0.01693701557815075, "learning_rate": 8.918703241895262e-06, "loss": 0.0008, "step": 22230 }, { "epoch": 2.773067331670823, "grad_norm": 0.0010547166457399726, "learning_rate": 8.913715710723193e-06, "loss": 0.0356, "step": 22240 }, { "epoch": 2.7743142144638404, "grad_norm": 0.0005315632442943752, "learning_rate": 8.908728179551123e-06, "loss": 0.0001, "step": 22250 }, { "epoch": 2.775561097256858, "grad_norm": 0.04161824285984039, "learning_rate": 8.903740648379054e-06, "loss": 0.0002, "step": 22260 }, { "epoch": 2.7768079800498753, "grad_norm": 0.29313403367996216, "learning_rate": 8.898753117206983e-06, "loss": 0.0003, "step": 22270 }, { "epoch": 2.7780548628428927, "grad_norm": 0.002154151676222682, "learning_rate": 8.893765586034914e-06, "loss": 0.0001, "step": 22280 }, { "epoch": 2.77930174563591, "grad_norm": 0.001438229694031179, "learning_rate": 8.888778054862843e-06, "loss": 0.0002, "step": 22290 }, { "epoch": 2.7805486284289276, "grad_norm": 0.0007435880834236741, "learning_rate": 8.883790523690773e-06, "loss": 0.0002, "step": 22300 }, { "epoch": 2.781795511221945, "grad_norm": 0.0008660277235321701, "learning_rate": 8.878802992518704e-06, "loss": 0.0001, "step": 22310 }, { "epoch": 2.7830423940149625, "grad_norm": 42.99732971191406, "learning_rate": 8.873815461346634e-06, "loss": 0.0339, "step": 22320 }, { "epoch": 2.78428927680798, "grad_norm": 0.0006988770910538733, "learning_rate": 8.868827930174563e-06, "loss": 0.0, "step": 22330 }, { "epoch": 2.7855361596009978, "grad_norm": 0.0033045527525246143, "learning_rate": 8.863840399002494e-06, "loss": 0.0001, "step": 22340 }, { "epoch": 2.786783042394015, "grad_norm": 0.3833758533000946, "learning_rate": 8.858852867830425e-06, "loss": 0.0001, "step": 22350 }, { "epoch": 2.7880299251870326, "grad_norm": 0.00027476897230371833, "learning_rate": 8.853865336658355e-06, "loss": 0.0488, "step": 22360 }, { "epoch": 2.78927680798005, "grad_norm": 0.20554883778095245, "learning_rate": 8.848877805486284e-06, "loss": 0.0352, "step": 22370 }, { "epoch": 2.7905236907730675, "grad_norm": 0.0017478206427767873, "learning_rate": 8.843890274314215e-06, "loss": 0.0, "step": 22380 }, { "epoch": 2.791770573566085, "grad_norm": 0.0004887666436843574, "learning_rate": 8.838902743142145e-06, "loss": 0.0001, "step": 22390 }, { "epoch": 2.7930174563591024, "grad_norm": 0.0020243488252162933, "learning_rate": 8.833915211970076e-06, "loss": 0.0153, "step": 22400 }, { "epoch": 2.79426433915212, "grad_norm": 0.013695642352104187, "learning_rate": 8.828927680798007e-06, "loss": 0.0293, "step": 22410 }, { "epoch": 2.7955112219451372, "grad_norm": 0.0038832302670925856, "learning_rate": 8.823940149625936e-06, "loss": 0.0003, "step": 22420 }, { "epoch": 2.7967581047381547, "grad_norm": 0.000641527003608644, "learning_rate": 8.818952618453866e-06, "loss": 0.0, "step": 22430 }, { "epoch": 2.798004987531172, "grad_norm": 0.0015550283715128899, "learning_rate": 8.813965087281797e-06, "loss": 0.0002, "step": 22440 }, { "epoch": 2.7992518703241895, "grad_norm": 0.005692547652870417, "learning_rate": 8.808977556109727e-06, "loss": 0.0004, "step": 22450 }, { "epoch": 2.800498753117207, "grad_norm": 0.0040212650783360004, "learning_rate": 8.803990024937656e-06, "loss": 0.0436, "step": 22460 }, { "epoch": 2.8017456359102244, "grad_norm": 29.452655792236328, "learning_rate": 8.799002493765587e-06, "loss": 0.0255, "step": 22470 }, { "epoch": 2.802992518703242, "grad_norm": 0.04328829422593117, "learning_rate": 8.794014962593518e-06, "loss": 0.0001, "step": 22480 }, { "epoch": 2.8042394014962593, "grad_norm": 0.001273810165002942, "learning_rate": 8.789027431421448e-06, "loss": 0.066, "step": 22490 }, { "epoch": 2.8054862842892767, "grad_norm": 0.0005523357540369034, "learning_rate": 8.784039900249377e-06, "loss": 0.0, "step": 22500 }, { "epoch": 2.806733167082294, "grad_norm": 0.004913564305752516, "learning_rate": 8.779052369077308e-06, "loss": 0.0017, "step": 22510 }, { "epoch": 2.8079800498753116, "grad_norm": 0.0007415362633764744, "learning_rate": 8.774064837905237e-06, "loss": 0.0137, "step": 22520 }, { "epoch": 2.809226932668329, "grad_norm": 0.0008837560890242457, "learning_rate": 8.769077306733167e-06, "loss": 0.0089, "step": 22530 }, { "epoch": 2.8104738154613464, "grad_norm": 0.00038785228389315307, "learning_rate": 8.764089775561098e-06, "loss": 0.0, "step": 22540 }, { "epoch": 2.811720698254364, "grad_norm": 0.0018591403495520353, "learning_rate": 8.759102244389028e-06, "loss": 0.0005, "step": 22550 }, { "epoch": 2.8129675810473813, "grad_norm": 0.0027543804608285427, "learning_rate": 8.754114713216957e-06, "loss": 0.0017, "step": 22560 }, { "epoch": 2.8142144638403987, "grad_norm": 0.000695584574714303, "learning_rate": 8.749127182044888e-06, "loss": 0.0001, "step": 22570 }, { "epoch": 2.815461346633416, "grad_norm": 0.0011798815103247762, "learning_rate": 8.744139650872819e-06, "loss": 0.0117, "step": 22580 }, { "epoch": 2.816708229426434, "grad_norm": 0.001240040990523994, "learning_rate": 8.73915211970075e-06, "loss": 0.0001, "step": 22590 }, { "epoch": 2.8179551122194515, "grad_norm": 0.0005738515174016356, "learning_rate": 8.734164588528678e-06, "loss": 0.0325, "step": 22600 }, { "epoch": 2.819201995012469, "grad_norm": 0.0006897413404658437, "learning_rate": 8.729177057356609e-06, "loss": 0.0274, "step": 22610 }, { "epoch": 2.8204488778054864, "grad_norm": 0.0004890425479970872, "learning_rate": 8.72418952618454e-06, "loss": 0.0001, "step": 22620 }, { "epoch": 2.821695760598504, "grad_norm": 0.0007049014675430954, "learning_rate": 8.71920199501247e-06, "loss": 0.0398, "step": 22630 }, { "epoch": 2.8229426433915212, "grad_norm": 0.0005111999926157296, "learning_rate": 8.7142144638404e-06, "loss": 0.0023, "step": 22640 }, { "epoch": 2.8241895261845387, "grad_norm": 0.05086338147521019, "learning_rate": 8.70922693266833e-06, "loss": 0.0001, "step": 22650 }, { "epoch": 2.825436408977556, "grad_norm": 0.00038924190448597074, "learning_rate": 8.70423940149626e-06, "loss": 0.0002, "step": 22660 }, { "epoch": 2.8266832917705735, "grad_norm": 0.0014980288688093424, "learning_rate": 8.69925187032419e-06, "loss": 0.0, "step": 22670 }, { "epoch": 2.827930174563591, "grad_norm": 0.0011230731615796685, "learning_rate": 8.694264339152121e-06, "loss": 0.0157, "step": 22680 }, { "epoch": 2.8291770573566084, "grad_norm": 0.0012151073897257447, "learning_rate": 8.68927680798005e-06, "loss": 0.0001, "step": 22690 }, { "epoch": 2.830423940149626, "grad_norm": 0.0005313754081726074, "learning_rate": 8.684289276807981e-06, "loss": 0.0001, "step": 22700 }, { "epoch": 2.8316708229426433, "grad_norm": 0.0022479502949863672, "learning_rate": 8.679301745635912e-06, "loss": 0.0, "step": 22710 }, { "epoch": 2.8329177057356607, "grad_norm": 0.0011304900981485844, "learning_rate": 8.674314214463842e-06, "loss": 0.0101, "step": 22720 }, { "epoch": 2.834164588528678, "grad_norm": 11.917057991027832, "learning_rate": 8.669326683291771e-06, "loss": 0.0283, "step": 22730 }, { "epoch": 2.835411471321696, "grad_norm": 17.62745475769043, "learning_rate": 8.664339152119702e-06, "loss": 0.0024, "step": 22740 }, { "epoch": 2.8366583541147135, "grad_norm": 48.56819152832031, "learning_rate": 8.65935162094763e-06, "loss": 0.0499, "step": 22750 }, { "epoch": 2.837905236907731, "grad_norm": 0.04258447512984276, "learning_rate": 8.654364089775561e-06, "loss": 0.0001, "step": 22760 }, { "epoch": 2.8391521197007483, "grad_norm": 0.0031998485792428255, "learning_rate": 8.649376558603492e-06, "loss": 0.0159, "step": 22770 }, { "epoch": 2.8403990024937658, "grad_norm": 0.00028625703998841345, "learning_rate": 8.644389027431423e-06, "loss": 0.0, "step": 22780 }, { "epoch": 2.841645885286783, "grad_norm": 0.06957925111055374, "learning_rate": 8.639401496259351e-06, "loss": 0.0473, "step": 22790 }, { "epoch": 2.8428927680798006, "grad_norm": 0.019344905391335487, "learning_rate": 8.634413965087282e-06, "loss": 0.0443, "step": 22800 }, { "epoch": 2.844139650872818, "grad_norm": 4.3518805503845215, "learning_rate": 8.629426433915213e-06, "loss": 0.0028, "step": 22810 }, { "epoch": 2.8453865336658355, "grad_norm": 0.001046427059918642, "learning_rate": 8.624937655860351e-06, "loss": 0.028, "step": 22820 }, { "epoch": 2.846633416458853, "grad_norm": 0.0013077203184366226, "learning_rate": 8.61995012468828e-06, "loss": 0.0308, "step": 22830 }, { "epoch": 2.8478802992518704, "grad_norm": 0.003886470338329673, "learning_rate": 8.61496259351621e-06, "loss": 0.0003, "step": 22840 }, { "epoch": 2.849127182044888, "grad_norm": 0.00042426149593666196, "learning_rate": 8.60997506234414e-06, "loss": 0.0001, "step": 22850 }, { "epoch": 2.8503740648379052, "grad_norm": 0.03965604305267334, "learning_rate": 8.604987531172072e-06, "loss": 0.039, "step": 22860 }, { "epoch": 2.8516209476309227, "grad_norm": 0.00838763639330864, "learning_rate": 8.6e-06, "loss": 0.0503, "step": 22870 }, { "epoch": 2.85286783042394, "grad_norm": 0.026502016931772232, "learning_rate": 8.595012468827931e-06, "loss": 0.0001, "step": 22880 }, { "epoch": 2.8541147132169575, "grad_norm": 0.0036835346836596727, "learning_rate": 8.59002493765586e-06, "loss": 0.0, "step": 22890 }, { "epoch": 2.855361596009975, "grad_norm": 0.06453216820955276, "learning_rate": 8.58503740648379e-06, "loss": 0.0002, "step": 22900 }, { "epoch": 2.8566084788029924, "grad_norm": 0.032697517424821854, "learning_rate": 8.580049875311721e-06, "loss": 0.0003, "step": 22910 }, { "epoch": 2.85785536159601, "grad_norm": 0.0004190378822386265, "learning_rate": 8.575062344139652e-06, "loss": 0.0018, "step": 22920 }, { "epoch": 2.8591022443890273, "grad_norm": 0.030965005978941917, "learning_rate": 8.570074812967581e-06, "loss": 0.0004, "step": 22930 }, { "epoch": 2.8603491271820447, "grad_norm": 0.00719516770914197, "learning_rate": 8.565087281795512e-06, "loss": 0.0032, "step": 22940 }, { "epoch": 2.861596009975062, "grad_norm": 0.0006492987158708274, "learning_rate": 8.560099750623442e-06, "loss": 0.0452, "step": 22950 }, { "epoch": 2.8628428927680796, "grad_norm": 23.008272171020508, "learning_rate": 8.555112219451373e-06, "loss": 0.003, "step": 22960 }, { "epoch": 2.864089775561097, "grad_norm": 0.0006293976912274957, "learning_rate": 8.550124688279302e-06, "loss": 0.0003, "step": 22970 }, { "epoch": 2.8653366583541144, "grad_norm": 0.0036729283165186644, "learning_rate": 8.545137157107232e-06, "loss": 0.0, "step": 22980 }, { "epoch": 2.8665835411471323, "grad_norm": 0.01885777898132801, "learning_rate": 8.540149625935163e-06, "loss": 0.0327, "step": 22990 }, { "epoch": 2.8678304239401498, "grad_norm": 0.0021062095183879137, "learning_rate": 8.535162094763094e-06, "loss": 0.0315, "step": 23000 }, { "epoch": 2.869077306733167, "grad_norm": 0.0004612091288436204, "learning_rate": 8.530174563591023e-06, "loss": 0.0001, "step": 23010 }, { "epoch": 2.8703241895261846, "grad_norm": 0.00048473486094735563, "learning_rate": 8.525187032418953e-06, "loss": 0.0, "step": 23020 }, { "epoch": 2.871571072319202, "grad_norm": 0.39935484528541565, "learning_rate": 8.520199501246884e-06, "loss": 0.0002, "step": 23030 }, { "epoch": 2.8728179551122195, "grad_norm": 0.002812149003148079, "learning_rate": 8.515211970074814e-06, "loss": 0.0045, "step": 23040 }, { "epoch": 2.874064837905237, "grad_norm": 0.00135338946711272, "learning_rate": 8.510224438902743e-06, "loss": 0.0, "step": 23050 }, { "epoch": 2.8753117206982544, "grad_norm": 0.002936942270025611, "learning_rate": 8.505236907730674e-06, "loss": 0.001, "step": 23060 }, { "epoch": 2.876558603491272, "grad_norm": 0.0027010170742869377, "learning_rate": 8.500249376558605e-06, "loss": 0.0078, "step": 23070 }, { "epoch": 2.8778054862842892, "grad_norm": 0.001148201641626656, "learning_rate": 8.495261845386534e-06, "loss": 0.0001, "step": 23080 }, { "epoch": 2.8790523690773067, "grad_norm": 0.1928788721561432, "learning_rate": 8.490274314214466e-06, "loss": 0.0481, "step": 23090 }, { "epoch": 2.880299251870324, "grad_norm": 0.0031981042120605707, "learning_rate": 8.485286783042395e-06, "loss": 0.0001, "step": 23100 }, { "epoch": 2.8815461346633415, "grad_norm": 0.0007050613639876246, "learning_rate": 8.480299251870325e-06, "loss": 0.0, "step": 23110 }, { "epoch": 2.882793017456359, "grad_norm": 0.0008105888264253736, "learning_rate": 8.475311720698254e-06, "loss": 0.0974, "step": 23120 }, { "epoch": 2.8840399002493764, "grad_norm": 4.346730709075928, "learning_rate": 8.470324189526185e-06, "loss": 0.0094, "step": 23130 }, { "epoch": 2.8852867830423943, "grad_norm": 10.774336814880371, "learning_rate": 8.465336658354116e-06, "loss": 0.0009, "step": 23140 }, { "epoch": 2.8865336658354117, "grad_norm": 0.001677769236266613, "learning_rate": 8.460349127182046e-06, "loss": 0.0, "step": 23150 }, { "epoch": 2.887780548628429, "grad_norm": 0.0012615503510460258, "learning_rate": 8.455361596009975e-06, "loss": 0.0, "step": 23160 }, { "epoch": 2.8890274314214466, "grad_norm": 0.0020845013204962015, "learning_rate": 8.450374064837906e-06, "loss": 0.0137, "step": 23170 }, { "epoch": 2.890274314214464, "grad_norm": 0.015877505764365196, "learning_rate": 8.445386533665836e-06, "loss": 0.0001, "step": 23180 }, { "epoch": 2.8915211970074814, "grad_norm": 2.8694908618927, "learning_rate": 8.440399002493767e-06, "loss": 0.0003, "step": 23190 }, { "epoch": 2.892768079800499, "grad_norm": 0.00029893871396780014, "learning_rate": 8.435411471321696e-06, "loss": 0.0001, "step": 23200 }, { "epoch": 2.8940149625935163, "grad_norm": 0.0019202682888135314, "learning_rate": 8.430423940149626e-06, "loss": 0.0197, "step": 23210 }, { "epoch": 2.8952618453865338, "grad_norm": 78.83489227294922, "learning_rate": 8.425436408977557e-06, "loss": 0.0308, "step": 23220 }, { "epoch": 2.896508728179551, "grad_norm": 0.0004875132581219077, "learning_rate": 8.420448877805488e-06, "loss": 0.0, "step": 23230 }, { "epoch": 2.8977556109725686, "grad_norm": 0.0007964399410411716, "learning_rate": 8.415461346633417e-06, "loss": 0.0, "step": 23240 }, { "epoch": 2.899002493765586, "grad_norm": 0.007691626902669668, "learning_rate": 8.410473815461347e-06, "loss": 0.0275, "step": 23250 }, { "epoch": 2.9002493765586035, "grad_norm": 0.012193195521831512, "learning_rate": 8.405486284289278e-06, "loss": 0.0571, "step": 23260 }, { "epoch": 2.901496259351621, "grad_norm": 0.0132495928555727, "learning_rate": 8.400498753117209e-06, "loss": 0.0, "step": 23270 }, { "epoch": 2.9027431421446384, "grad_norm": 0.0009290704620070755, "learning_rate": 8.395511221945137e-06, "loss": 0.0, "step": 23280 }, { "epoch": 2.903990024937656, "grad_norm": 22.11893081665039, "learning_rate": 8.390523690773068e-06, "loss": 0.022, "step": 23290 }, { "epoch": 2.9052369077306732, "grad_norm": 0.003784743370488286, "learning_rate": 8.385536159600997e-06, "loss": 0.0, "step": 23300 }, { "epoch": 2.9064837905236907, "grad_norm": 0.0003365647862665355, "learning_rate": 8.380548628428928e-06, "loss": 0.0001, "step": 23310 }, { "epoch": 2.907730673316708, "grad_norm": 0.018379248678684235, "learning_rate": 8.37556109725686e-06, "loss": 0.0001, "step": 23320 }, { "epoch": 2.9089775561097255, "grad_norm": 0.00030551591771654785, "learning_rate": 8.370573566084789e-06, "loss": 0.0439, "step": 23330 }, { "epoch": 2.910224438902743, "grad_norm": 0.11214791983366013, "learning_rate": 8.36558603491272e-06, "loss": 0.0001, "step": 23340 }, { "epoch": 2.9114713216957604, "grad_norm": 0.0002639228187035769, "learning_rate": 8.360598503740648e-06, "loss": 0.0199, "step": 23350 }, { "epoch": 2.912718204488778, "grad_norm": 0.003870609914883971, "learning_rate": 8.355610972568579e-06, "loss": 0.0242, "step": 23360 }, { "epoch": 2.9139650872817953, "grad_norm": 0.3105866611003876, "learning_rate": 8.35062344139651e-06, "loss": 0.0004, "step": 23370 }, { "epoch": 2.9152119700748127, "grad_norm": 0.0002682608610484749, "learning_rate": 8.34563591022444e-06, "loss": 0.0048, "step": 23380 }, { "epoch": 2.9164588528678306, "grad_norm": 0.00046468869550153613, "learning_rate": 8.34064837905237e-06, "loss": 0.0001, "step": 23390 }, { "epoch": 2.917705735660848, "grad_norm": 0.0008718192693777382, "learning_rate": 8.3356608478803e-06, "loss": 0.0862, "step": 23400 }, { "epoch": 2.9189526184538654, "grad_norm": 0.003581983968615532, "learning_rate": 8.33067331670823e-06, "loss": 0.0001, "step": 23410 }, { "epoch": 2.920199501246883, "grad_norm": 0.0020457198843359947, "learning_rate": 8.325685785536161e-06, "loss": 0.0001, "step": 23420 }, { "epoch": 2.9214463840399003, "grad_norm": 14.641356468200684, "learning_rate": 8.32069825436409e-06, "loss": 0.0014, "step": 23430 }, { "epoch": 2.9226932668329177, "grad_norm": 0.19841350615024567, "learning_rate": 8.31571072319202e-06, "loss": 0.0001, "step": 23440 }, { "epoch": 2.923940149625935, "grad_norm": 22.013063430786133, "learning_rate": 8.310723192019951e-06, "loss": 0.0313, "step": 23450 }, { "epoch": 2.9251870324189526, "grad_norm": 0.0051285261288285255, "learning_rate": 8.305735660847882e-06, "loss": 0.0002, "step": 23460 }, { "epoch": 2.92643391521197, "grad_norm": 0.0011192033998668194, "learning_rate": 8.30074812967581e-06, "loss": 0.0528, "step": 23470 }, { "epoch": 2.9276807980049875, "grad_norm": 0.017419705167412758, "learning_rate": 8.295760598503741e-06, "loss": 0.0013, "step": 23480 }, { "epoch": 2.928927680798005, "grad_norm": 0.006357110105454922, "learning_rate": 8.29077306733167e-06, "loss": 0.0414, "step": 23490 }, { "epoch": 2.9301745635910224, "grad_norm": 0.007139955647289753, "learning_rate": 8.285785536159603e-06, "loss": 0.111, "step": 23500 }, { "epoch": 2.93142144638404, "grad_norm": 0.014205764047801495, "learning_rate": 8.280798004987532e-06, "loss": 0.0093, "step": 23510 }, { "epoch": 2.932668329177057, "grad_norm": 0.009623657912015915, "learning_rate": 8.275810473815462e-06, "loss": 0.0053, "step": 23520 }, { "epoch": 2.9339152119700747, "grad_norm": 0.0029291717801243067, "learning_rate": 8.270822942643391e-06, "loss": 0.0002, "step": 23530 }, { "epoch": 2.9351620947630925, "grad_norm": 0.0059037720784544945, "learning_rate": 8.265835411471322e-06, "loss": 0.0044, "step": 23540 }, { "epoch": 2.93640897755611, "grad_norm": 24.23624038696289, "learning_rate": 8.260847880299252e-06, "loss": 0.0675, "step": 23550 }, { "epoch": 2.9376558603491274, "grad_norm": 0.009634561836719513, "learning_rate": 8.255860349127183e-06, "loss": 0.0352, "step": 23560 }, { "epoch": 2.938902743142145, "grad_norm": 0.001296225585974753, "learning_rate": 8.250872817955114e-06, "loss": 0.0014, "step": 23570 }, { "epoch": 2.9401496259351623, "grad_norm": 0.02580132707953453, "learning_rate": 8.245885286783042e-06, "loss": 0.0, "step": 23580 }, { "epoch": 2.9413965087281797, "grad_norm": 0.003978101536631584, "learning_rate": 8.240897755610973e-06, "loss": 0.0271, "step": 23590 }, { "epoch": 2.942643391521197, "grad_norm": 0.009165945462882519, "learning_rate": 8.235910224438904e-06, "loss": 0.0424, "step": 23600 }, { "epoch": 2.9438902743142146, "grad_norm": 1.0541449785232544, "learning_rate": 8.230922693266834e-06, "loss": 0.0003, "step": 23610 }, { "epoch": 2.945137157107232, "grad_norm": 0.00030957505805417895, "learning_rate": 8.225935162094763e-06, "loss": 0.0397, "step": 23620 }, { "epoch": 2.9463840399002494, "grad_norm": 0.0014495253562927246, "learning_rate": 8.220947630922694e-06, "loss": 0.0941, "step": 23630 }, { "epoch": 2.947630922693267, "grad_norm": 0.002056701574474573, "learning_rate": 8.215960099750624e-06, "loss": 0.0001, "step": 23640 }, { "epoch": 2.9488778054862843, "grad_norm": 0.002144606551155448, "learning_rate": 8.210972568578555e-06, "loss": 0.0005, "step": 23650 }, { "epoch": 2.9501246882793017, "grad_norm": 0.0010960723739117384, "learning_rate": 8.205985037406484e-06, "loss": 0.0039, "step": 23660 }, { "epoch": 2.951371571072319, "grad_norm": 0.0012292738538235426, "learning_rate": 8.200997506234415e-06, "loss": 0.0001, "step": 23670 }, { "epoch": 2.9526184538653366, "grad_norm": 0.0013217430096119642, "learning_rate": 8.196009975062345e-06, "loss": 0.0064, "step": 23680 }, { "epoch": 2.953865336658354, "grad_norm": 0.0012824861332774162, "learning_rate": 8.191022443890276e-06, "loss": 0.0004, "step": 23690 }, { "epoch": 2.9551122194513715, "grad_norm": 0.0026409977581351995, "learning_rate": 8.186034912718205e-06, "loss": 0.0032, "step": 23700 }, { "epoch": 2.956359102244389, "grad_norm": 0.0022691749036312103, "learning_rate": 8.181047381546135e-06, "loss": 0.0408, "step": 23710 }, { "epoch": 2.9576059850374063, "grad_norm": 0.0014671648386865854, "learning_rate": 8.176059850374064e-06, "loss": 0.0003, "step": 23720 }, { "epoch": 2.958852867830424, "grad_norm": 0.0031290799379348755, "learning_rate": 8.171072319201997e-06, "loss": 0.0203, "step": 23730 }, { "epoch": 2.960099750623441, "grad_norm": 0.0008347875555045903, "learning_rate": 8.166084788029926e-06, "loss": 0.0919, "step": 23740 }, { "epoch": 2.9613466334164587, "grad_norm": 0.021511143073439598, "learning_rate": 8.161097256857856e-06, "loss": 0.0019, "step": 23750 }, { "epoch": 2.962593516209476, "grad_norm": 0.0033409115858376026, "learning_rate": 8.156109725685785e-06, "loss": 0.0001, "step": 23760 }, { "epoch": 2.9638403990024935, "grad_norm": 0.8422829508781433, "learning_rate": 8.151122194513716e-06, "loss": 0.0598, "step": 23770 }, { "epoch": 2.965087281795511, "grad_norm": 0.020311573520302773, "learning_rate": 8.146134663341646e-06, "loss": 0.0001, "step": 23780 }, { "epoch": 2.966334164588529, "grad_norm": 0.00174625008367002, "learning_rate": 8.141147132169577e-06, "loss": 0.0236, "step": 23790 }, { "epoch": 2.9675810473815463, "grad_norm": 0.0019468831596896052, "learning_rate": 8.136159600997506e-06, "loss": 0.0306, "step": 23800 }, { "epoch": 2.9688279301745637, "grad_norm": 0.014754627831280231, "learning_rate": 8.131172069825437e-06, "loss": 0.0641, "step": 23810 }, { "epoch": 2.970074812967581, "grad_norm": 0.08833127468824387, "learning_rate": 8.126184538653367e-06, "loss": 0.0026, "step": 23820 }, { "epoch": 2.9713216957605986, "grad_norm": 0.09355904906988144, "learning_rate": 8.121197007481298e-06, "loss": 0.0004, "step": 23830 }, { "epoch": 2.972568578553616, "grad_norm": 0.15409357845783234, "learning_rate": 8.116209476309228e-06, "loss": 0.0003, "step": 23840 }, { "epoch": 2.9738154613466334, "grad_norm": 0.0047310395166277885, "learning_rate": 8.111221945137157e-06, "loss": 0.0001, "step": 23850 }, { "epoch": 2.975062344139651, "grad_norm": 0.00219229725189507, "learning_rate": 8.106234413965088e-06, "loss": 0.0001, "step": 23860 }, { "epoch": 2.9763092269326683, "grad_norm": 0.001115076127462089, "learning_rate": 8.101246882793019e-06, "loss": 0.0001, "step": 23870 }, { "epoch": 2.9775561097256857, "grad_norm": 0.003730418160557747, "learning_rate": 8.09625935162095e-06, "loss": 0.0584, "step": 23880 }, { "epoch": 2.978802992518703, "grad_norm": 0.0017962405690923333, "learning_rate": 8.091271820448878e-06, "loss": 0.0002, "step": 23890 }, { "epoch": 2.9800498753117206, "grad_norm": 58.13080596923828, "learning_rate": 8.086284289276809e-06, "loss": 0.032, "step": 23900 }, { "epoch": 2.981296758104738, "grad_norm": 0.04415411129593849, "learning_rate": 8.08129675810474e-06, "loss": 0.0055, "step": 23910 }, { "epoch": 2.9825436408977555, "grad_norm": 0.12564721703529358, "learning_rate": 8.07630922693267e-06, "loss": 0.0619, "step": 23920 }, { "epoch": 2.983790523690773, "grad_norm": 0.0032362595666199923, "learning_rate": 8.071321695760599e-06, "loss": 0.0002, "step": 23930 }, { "epoch": 2.985037406483791, "grad_norm": 29.849327087402344, "learning_rate": 8.06633416458853e-06, "loss": 0.1159, "step": 23940 }, { "epoch": 2.9862842892768082, "grad_norm": 0.010884604416787624, "learning_rate": 8.061346633416458e-06, "loss": 0.0005, "step": 23950 }, { "epoch": 2.9875311720698257, "grad_norm": 0.0014786362880840898, "learning_rate": 8.056359102244389e-06, "loss": 0.0053, "step": 23960 }, { "epoch": 2.988778054862843, "grad_norm": 0.002504403702914715, "learning_rate": 8.05137157107232e-06, "loss": 0.0001, "step": 23970 }, { "epoch": 2.9900249376558605, "grad_norm": 0.0029743423219770193, "learning_rate": 8.04638403990025e-06, "loss": 0.0307, "step": 23980 }, { "epoch": 2.991271820448878, "grad_norm": 0.002073820913210511, "learning_rate": 8.04139650872818e-06, "loss": 0.0285, "step": 23990 }, { "epoch": 2.9925187032418954, "grad_norm": 0.0032272618263959885, "learning_rate": 8.03640897755611e-06, "loss": 0.0024, "step": 24000 }, { "epoch": 2.993765586034913, "grad_norm": 0.003961468581110239, "learning_rate": 8.03142144638404e-06, "loss": 0.0135, "step": 24010 }, { "epoch": 2.9950124688279303, "grad_norm": 4.218677520751953, "learning_rate": 8.026433915211971e-06, "loss": 0.002, "step": 24020 }, { "epoch": 2.9962593516209477, "grad_norm": 1.7304545640945435, "learning_rate": 8.0214463840399e-06, "loss": 0.0088, "step": 24030 }, { "epoch": 2.997506234413965, "grad_norm": 0.0033019916154444218, "learning_rate": 8.01645885286783e-06, "loss": 0.0006, "step": 24040 }, { "epoch": 2.9987531172069826, "grad_norm": 17.089313507080078, "learning_rate": 8.011471321695761e-06, "loss": 0.0184, "step": 24050 }, { "epoch": 3.0, "grad_norm": 0.001758578815497458, "learning_rate": 8.006483790523692e-06, "loss": 0.0008, "step": 24060 }, { "epoch": 3.0, "eval_accuracy": 0.9931417170646549, "eval_loss": 0.03674669936299324, "eval_runtime": 17.4363, "eval_samples_per_second": 919.864, "eval_steps_per_second": 57.524, "step": 24060 }, { "epoch": 3.0012468827930174, "grad_norm": 0.004041421227157116, "learning_rate": 8.001496259351622e-06, "loss": 0.0425, "step": 24070 }, { "epoch": 3.002493765586035, "grad_norm": 0.007590838707983494, "learning_rate": 7.996508728179551e-06, "loss": 0.0422, "step": 24080 }, { "epoch": 3.0037406483790523, "grad_norm": 0.0012900944566354156, "learning_rate": 7.991521197007482e-06, "loss": 0.0002, "step": 24090 }, { "epoch": 3.0049875311720697, "grad_norm": 0.02778124250471592, "learning_rate": 7.986533665835413e-06, "loss": 0.0001, "step": 24100 }, { "epoch": 3.006234413965087, "grad_norm": 0.001751819159835577, "learning_rate": 7.981546134663343e-06, "loss": 0.0009, "step": 24110 }, { "epoch": 3.0074812967581046, "grad_norm": 0.005507446825504303, "learning_rate": 7.976558603491272e-06, "loss": 0.0065, "step": 24120 }, { "epoch": 3.008728179551122, "grad_norm": 0.02041839249432087, "learning_rate": 7.971571072319203e-06, "loss": 0.0001, "step": 24130 }, { "epoch": 3.0099750623441395, "grad_norm": 0.009164830669760704, "learning_rate": 7.966583541147133e-06, "loss": 0.0001, "step": 24140 }, { "epoch": 3.011221945137157, "grad_norm": 0.002013957826420665, "learning_rate": 7.961596009975064e-06, "loss": 0.0001, "step": 24150 }, { "epoch": 3.0124688279301743, "grad_norm": 0.0525008849799633, "learning_rate": 7.956608478802993e-06, "loss": 0.0001, "step": 24160 }, { "epoch": 3.013715710723192, "grad_norm": 0.0016975915059447289, "learning_rate": 7.951620947630924e-06, "loss": 0.0001, "step": 24170 }, { "epoch": 3.0149625935162097, "grad_norm": 0.0022598865907639265, "learning_rate": 7.946633416458853e-06, "loss": 0.0002, "step": 24180 }, { "epoch": 3.016209476309227, "grad_norm": 0.0035758563317358494, "learning_rate": 7.941645885286783e-06, "loss": 0.0001, "step": 24190 }, { "epoch": 3.0174563591022445, "grad_norm": 0.007171344943344593, "learning_rate": 7.936658354114714e-06, "loss": 0.0001, "step": 24200 }, { "epoch": 3.018703241895262, "grad_norm": 0.18562623858451843, "learning_rate": 7.931670822942644e-06, "loss": 0.0002, "step": 24210 }, { "epoch": 3.0199501246882794, "grad_norm": 0.001060682232491672, "learning_rate": 7.926683291770573e-06, "loss": 0.0012, "step": 24220 }, { "epoch": 3.021197007481297, "grad_norm": 20.250171661376953, "learning_rate": 7.921695760598504e-06, "loss": 0.0029, "step": 24230 }, { "epoch": 3.0224438902743143, "grad_norm": 0.0019098323537036777, "learning_rate": 7.916708229426435e-06, "loss": 0.0405, "step": 24240 }, { "epoch": 3.0236907730673317, "grad_norm": 0.019634464755654335, "learning_rate": 7.911720698254365e-06, "loss": 0.0001, "step": 24250 }, { "epoch": 3.024937655860349, "grad_norm": 56.8670768737793, "learning_rate": 7.906733167082294e-06, "loss": 0.023, "step": 24260 }, { "epoch": 3.0261845386533666, "grad_norm": 42.67612838745117, "learning_rate": 7.901745635910225e-06, "loss": 0.0519, "step": 24270 }, { "epoch": 3.027431421446384, "grad_norm": 0.000763443938922137, "learning_rate": 7.896758104738155e-06, "loss": 0.0, "step": 24280 }, { "epoch": 3.0286783042394014, "grad_norm": 0.000828297808766365, "learning_rate": 7.891770573566086e-06, "loss": 0.0, "step": 24290 }, { "epoch": 3.029925187032419, "grad_norm": 0.0021556930150836706, "learning_rate": 7.886783042394017e-06, "loss": 0.0001, "step": 24300 }, { "epoch": 3.0311720698254363, "grad_norm": 0.003809295129030943, "learning_rate": 7.881795511221945e-06, "loss": 0.0, "step": 24310 }, { "epoch": 3.0324189526184537, "grad_norm": 0.011999325826764107, "learning_rate": 7.876807980049876e-06, "loss": 0.0001, "step": 24320 }, { "epoch": 3.033665835411471, "grad_norm": 0.001197949517518282, "learning_rate": 7.871820448877807e-06, "loss": 0.0, "step": 24330 }, { "epoch": 3.0349127182044886, "grad_norm": 0.01645563915371895, "learning_rate": 7.866832917705737e-06, "loss": 0.0001, "step": 24340 }, { "epoch": 3.036159600997506, "grad_norm": 0.0022310710046440363, "learning_rate": 7.861845386533666e-06, "loss": 0.001, "step": 24350 }, { "epoch": 3.037406483790524, "grad_norm": 0.06693558394908905, "learning_rate": 7.856857855361597e-06, "loss": 0.0706, "step": 24360 }, { "epoch": 3.0386533665835413, "grad_norm": 0.000548367970623076, "learning_rate": 7.851870324189526e-06, "loss": 0.0001, "step": 24370 }, { "epoch": 3.039900249376559, "grad_norm": 0.0035804430954158306, "learning_rate": 7.846882793017458e-06, "loss": 0.0013, "step": 24380 }, { "epoch": 3.041147132169576, "grad_norm": 0.000565096503123641, "learning_rate": 7.841895261845387e-06, "loss": 0.0, "step": 24390 }, { "epoch": 3.0423940149625937, "grad_norm": 0.000567717244848609, "learning_rate": 7.836907730673318e-06, "loss": 0.0001, "step": 24400 }, { "epoch": 3.043640897755611, "grad_norm": 0.10077092796564102, "learning_rate": 7.831920199501247e-06, "loss": 0.0004, "step": 24410 }, { "epoch": 3.0448877805486285, "grad_norm": 0.005670513026416302, "learning_rate": 7.826932668329177e-06, "loss": 0.0175, "step": 24420 }, { "epoch": 3.046134663341646, "grad_norm": 0.01621229201555252, "learning_rate": 7.821945137157108e-06, "loss": 0.0002, "step": 24430 }, { "epoch": 3.0473815461346634, "grad_norm": 0.05025783181190491, "learning_rate": 7.816957605985038e-06, "loss": 0.0711, "step": 24440 }, { "epoch": 3.048628428927681, "grad_norm": 18.436004638671875, "learning_rate": 7.811970074812967e-06, "loss": 0.0026, "step": 24450 }, { "epoch": 3.0498753117206983, "grad_norm": 2.656740665435791, "learning_rate": 7.806982543640898e-06, "loss": 0.0015, "step": 24460 }, { "epoch": 3.0511221945137157, "grad_norm": 0.004006446339190006, "learning_rate": 7.801995012468829e-06, "loss": 0.0001, "step": 24470 }, { "epoch": 3.052369077306733, "grad_norm": 0.0018630550475791097, "learning_rate": 7.79700748129676e-06, "loss": 0.0002, "step": 24480 }, { "epoch": 3.0536159600997506, "grad_norm": 0.0004907494876533747, "learning_rate": 7.792019950124688e-06, "loss": 0.0211, "step": 24490 }, { "epoch": 3.054862842892768, "grad_norm": 0.0018739727092906833, "learning_rate": 7.787032418952619e-06, "loss": 0.0116, "step": 24500 }, { "epoch": 3.0561097256857854, "grad_norm": 0.0038170109037309885, "learning_rate": 7.78204488778055e-06, "loss": 0.0002, "step": 24510 }, { "epoch": 3.057356608478803, "grad_norm": 0.0017289503011852503, "learning_rate": 7.77705735660848e-06, "loss": 0.0001, "step": 24520 }, { "epoch": 3.0586034912718203, "grad_norm": 0.0016951598227024078, "learning_rate": 7.772069825436409e-06, "loss": 0.0003, "step": 24530 }, { "epoch": 3.0598503740648377, "grad_norm": 0.003196330275386572, "learning_rate": 7.76708229426434e-06, "loss": 0.0001, "step": 24540 }, { "epoch": 3.061097256857855, "grad_norm": 86.77098846435547, "learning_rate": 7.76209476309227e-06, "loss": 0.0368, "step": 24550 }, { "epoch": 3.0623441396508726, "grad_norm": 0.0013502277433872223, "learning_rate": 7.7571072319202e-06, "loss": 0.0, "step": 24560 }, { "epoch": 3.0635910224438905, "grad_norm": 0.0012974691344425082, "learning_rate": 7.752119700748131e-06, "loss": 0.0001, "step": 24570 }, { "epoch": 3.064837905236908, "grad_norm": 0.4907504916191101, "learning_rate": 7.74713216957606e-06, "loss": 0.0094, "step": 24580 }, { "epoch": 3.0660847880299253, "grad_norm": 0.0008519966504536569, "learning_rate": 7.742144638403991e-06, "loss": 0.0, "step": 24590 }, { "epoch": 3.067331670822943, "grad_norm": 0.004892734810709953, "learning_rate": 7.73715710723192e-06, "loss": 0.0014, "step": 24600 }, { "epoch": 3.06857855361596, "grad_norm": 0.01416561659425497, "learning_rate": 7.732169576059852e-06, "loss": 0.0006, "step": 24610 }, { "epoch": 3.0698254364089776, "grad_norm": 0.0011699148453772068, "learning_rate": 7.727182044887781e-06, "loss": 0.0002, "step": 24620 }, { "epoch": 3.071072319201995, "grad_norm": 0.40756234526634216, "learning_rate": 7.722194513715712e-06, "loss": 0.0212, "step": 24630 }, { "epoch": 3.0723192019950125, "grad_norm": 0.0006810550694353878, "learning_rate": 7.71720698254364e-06, "loss": 0.0001, "step": 24640 }, { "epoch": 3.07356608478803, "grad_norm": 0.0026810094714164734, "learning_rate": 7.712219451371571e-06, "loss": 0.0001, "step": 24650 }, { "epoch": 3.0748129675810474, "grad_norm": 0.001643951516598463, "learning_rate": 7.707231920199502e-06, "loss": 0.0334, "step": 24660 }, { "epoch": 3.076059850374065, "grad_norm": 0.0028612790629267693, "learning_rate": 7.702244389027433e-06, "loss": 0.0, "step": 24670 }, { "epoch": 3.0773067331670823, "grad_norm": 0.019000135362148285, "learning_rate": 7.697256857855361e-06, "loss": 0.0, "step": 24680 }, { "epoch": 3.0785536159600997, "grad_norm": 0.0008096080855466425, "learning_rate": 7.692269326683292e-06, "loss": 0.0032, "step": 24690 }, { "epoch": 3.079800498753117, "grad_norm": 0.00035487712011672556, "learning_rate": 7.687281795511223e-06, "loss": 0.0275, "step": 24700 }, { "epoch": 3.0810473815461346, "grad_norm": 0.003728532698005438, "learning_rate": 7.682294264339153e-06, "loss": 0.0001, "step": 24710 }, { "epoch": 3.082294264339152, "grad_norm": 0.018092088401317596, "learning_rate": 7.677306733167082e-06, "loss": 0.0002, "step": 24720 }, { "epoch": 3.0835411471321694, "grad_norm": 0.0015284158289432526, "learning_rate": 7.672319201995013e-06, "loss": 0.0333, "step": 24730 }, { "epoch": 3.084788029925187, "grad_norm": 0.06528772413730621, "learning_rate": 7.667331670822943e-06, "loss": 0.0001, "step": 24740 }, { "epoch": 3.0860349127182043, "grad_norm": 0.0012805687729269266, "learning_rate": 7.662344139650874e-06, "loss": 0.0, "step": 24750 }, { "epoch": 3.087281795511222, "grad_norm": 24.267126083374023, "learning_rate": 7.657356608478803e-06, "loss": 0.0254, "step": 24760 }, { "epoch": 3.0885286783042396, "grad_norm": 48.1351203918457, "learning_rate": 7.652369077306734e-06, "loss": 0.0196, "step": 24770 }, { "epoch": 3.089775561097257, "grad_norm": 0.001478642807342112, "learning_rate": 7.647381546134664e-06, "loss": 0.0001, "step": 24780 }, { "epoch": 3.0910224438902745, "grad_norm": 0.010895784012973309, "learning_rate": 7.642394014962595e-06, "loss": 0.0004, "step": 24790 }, { "epoch": 3.092269326683292, "grad_norm": 0.0012013005325570703, "learning_rate": 7.637406483790526e-06, "loss": 0.0055, "step": 24800 }, { "epoch": 3.0935162094763093, "grad_norm": 0.0007267069304361939, "learning_rate": 7.632418952618454e-06, "loss": 0.0001, "step": 24810 }, { "epoch": 3.0947630922693268, "grad_norm": 0.0029194443486630917, "learning_rate": 7.627431421446385e-06, "loss": 0.0, "step": 24820 }, { "epoch": 3.096009975062344, "grad_norm": 0.09928423166275024, "learning_rate": 7.622443890274315e-06, "loss": 0.0001, "step": 24830 }, { "epoch": 3.0972568578553616, "grad_norm": 0.0014014774933457375, "learning_rate": 7.6174563591022455e-06, "loss": 0.0, "step": 24840 }, { "epoch": 3.098503740648379, "grad_norm": 11.482453346252441, "learning_rate": 7.612468827930175e-06, "loss": 0.0066, "step": 24850 }, { "epoch": 3.0997506234413965, "grad_norm": 0.0034667307045310736, "learning_rate": 7.607481296758106e-06, "loss": 0.0315, "step": 24860 }, { "epoch": 3.100997506234414, "grad_norm": 0.0012264890829101205, "learning_rate": 7.602493765586036e-06, "loss": 0.0001, "step": 24870 }, { "epoch": 3.1022443890274314, "grad_norm": 0.03299201279878616, "learning_rate": 7.597506234413966e-06, "loss": 0.0113, "step": 24880 }, { "epoch": 3.103491271820449, "grad_norm": 2.718482255935669, "learning_rate": 7.592518703241896e-06, "loss": 0.0006, "step": 24890 }, { "epoch": 3.1047381546134662, "grad_norm": 0.0009285429841838777, "learning_rate": 7.587531172069827e-06, "loss": 0.0423, "step": 24900 }, { "epoch": 3.1059850374064837, "grad_norm": 0.0012682249071076512, "learning_rate": 7.5825436408977556e-06, "loss": 0.0, "step": 24910 }, { "epoch": 3.107231920199501, "grad_norm": 0.0003040742303710431, "learning_rate": 7.577556109725687e-06, "loss": 0.0529, "step": 24920 }, { "epoch": 3.1084788029925186, "grad_norm": 0.022092144936323166, "learning_rate": 7.572568578553616e-06, "loss": 0.0, "step": 24930 }, { "epoch": 3.109725685785536, "grad_norm": 0.0005716979503631592, "learning_rate": 7.5675810473815466e-06, "loss": 0.0012, "step": 24940 }, { "epoch": 3.1109725685785534, "grad_norm": 0.013232385739684105, "learning_rate": 7.562593516209476e-06, "loss": 0.0254, "step": 24950 }, { "epoch": 3.112219451371571, "grad_norm": 0.0007010533008724451, "learning_rate": 7.557605985037407e-06, "loss": 0.0001, "step": 24960 }, { "epoch": 3.1134663341645887, "grad_norm": 0.0009610903216525912, "learning_rate": 7.552618453865337e-06, "loss": 0.0313, "step": 24970 }, { "epoch": 3.114713216957606, "grad_norm": 0.0008341351058334112, "learning_rate": 7.547630922693267e-06, "loss": 0.0213, "step": 24980 }, { "epoch": 3.1159600997506236, "grad_norm": 0.0012973628472536802, "learning_rate": 7.542643391521197e-06, "loss": 0.0001, "step": 24990 }, { "epoch": 3.117206982543641, "grad_norm": 0.0037772981449961662, "learning_rate": 7.537655860349128e-06, "loss": 0.0042, "step": 25000 }, { "epoch": 3.1184538653366585, "grad_norm": 3.0461456775665283, "learning_rate": 7.5326683291770575e-06, "loss": 0.0283, "step": 25010 }, { "epoch": 3.119700748129676, "grad_norm": 0.001213893759995699, "learning_rate": 7.527680798004988e-06, "loss": 0.0, "step": 25020 }, { "epoch": 3.1209476309226933, "grad_norm": 0.0026515673380345106, "learning_rate": 7.522693266832918e-06, "loss": 0.0488, "step": 25030 }, { "epoch": 3.1221945137157108, "grad_norm": 0.00047578109661117196, "learning_rate": 7.5177057356608485e-06, "loss": 0.0, "step": 25040 }, { "epoch": 3.123441396508728, "grad_norm": 0.01547097135335207, "learning_rate": 7.512718204488779e-06, "loss": 0.0007, "step": 25050 }, { "epoch": 3.1246882793017456, "grad_norm": 0.010162184946238995, "learning_rate": 7.507730673316709e-06, "loss": 0.0001, "step": 25060 }, { "epoch": 3.125935162094763, "grad_norm": 0.0032746554352343082, "learning_rate": 7.5027431421446395e-06, "loss": 0.0001, "step": 25070 }, { "epoch": 3.1271820448877805, "grad_norm": 0.0003806925960816443, "learning_rate": 7.497755610972569e-06, "loss": 0.0003, "step": 25080 }, { "epoch": 3.128428927680798, "grad_norm": 0.0004894788726232946, "learning_rate": 7.4927680798005e-06, "loss": 0.0, "step": 25090 }, { "epoch": 3.1296758104738154, "grad_norm": 0.00042781481170095503, "learning_rate": 7.48778054862843e-06, "loss": 0.0, "step": 25100 }, { "epoch": 3.130922693266833, "grad_norm": 0.006498263217508793, "learning_rate": 7.48279301745636e-06, "loss": 0.0437, "step": 25110 }, { "epoch": 3.1321695760598502, "grad_norm": 0.0033179358579218388, "learning_rate": 7.47780548628429e-06, "loss": 0.0, "step": 25120 }, { "epoch": 3.1334164588528677, "grad_norm": 0.0005243680789135396, "learning_rate": 7.472817955112221e-06, "loss": 0.0239, "step": 25130 }, { "epoch": 3.134663341645885, "grad_norm": 0.0005233477568253875, "learning_rate": 7.46783042394015e-06, "loss": 0.0, "step": 25140 }, { "epoch": 3.1359102244389025, "grad_norm": 0.0014765688683837652, "learning_rate": 7.462842892768081e-06, "loss": 0.0, "step": 25150 }, { "epoch": 3.1371571072319204, "grad_norm": Infinity, "learning_rate": 7.458354114713218e-06, "loss": 0.0085, "step": 25160 }, { "epoch": 3.138403990024938, "grad_norm": 0.0004119553486816585, "learning_rate": 7.453366583541147e-06, "loss": 0.0001, "step": 25170 }, { "epoch": 3.1396508728179553, "grad_norm": 0.00120373978279531, "learning_rate": 7.448379052369078e-06, "loss": 0.0001, "step": 25180 }, { "epoch": 3.1408977556109727, "grad_norm": 0.0002969623601529747, "learning_rate": 7.443391521197008e-06, "loss": 0.0001, "step": 25190 }, { "epoch": 3.14214463840399, "grad_norm": 0.0029246362391859293, "learning_rate": 7.4384039900249384e-06, "loss": 0.0, "step": 25200 }, { "epoch": 3.1433915211970076, "grad_norm": 0.0017841997323557734, "learning_rate": 7.433416458852868e-06, "loss": 0.0004, "step": 25210 }, { "epoch": 3.144638403990025, "grad_norm": 0.0006051593809388578, "learning_rate": 7.428428927680799e-06, "loss": 0.0, "step": 25220 }, { "epoch": 3.1458852867830425, "grad_norm": 0.0006382103892974555, "learning_rate": 7.4234413965087294e-06, "loss": 0.0, "step": 25230 }, { "epoch": 3.14713216957606, "grad_norm": 0.0008329673437401652, "learning_rate": 7.418453865336659e-06, "loss": 0.0, "step": 25240 }, { "epoch": 3.1483790523690773, "grad_norm": 0.0003178466286044568, "learning_rate": 7.41346633416459e-06, "loss": 0.0001, "step": 25250 }, { "epoch": 3.1496259351620948, "grad_norm": 4.301796913146973, "learning_rate": 7.40847880299252e-06, "loss": 0.0007, "step": 25260 }, { "epoch": 3.150872817955112, "grad_norm": 0.0017022284446284175, "learning_rate": 7.40349127182045e-06, "loss": 0.0001, "step": 25270 }, { "epoch": 3.1521197007481296, "grad_norm": 0.002003247383981943, "learning_rate": 7.398503740648379e-06, "loss": 0.0424, "step": 25280 }, { "epoch": 3.153366583541147, "grad_norm": 0.000499337911605835, "learning_rate": 7.393516209476311e-06, "loss": 0.0005, "step": 25290 }, { "epoch": 3.1546134663341645, "grad_norm": 0.003963457886129618, "learning_rate": 7.3885286783042395e-06, "loss": 0.0, "step": 25300 }, { "epoch": 3.155860349127182, "grad_norm": 0.0008439902449026704, "learning_rate": 7.38354114713217e-06, "loss": 0.0009, "step": 25310 }, { "epoch": 3.1571072319201994, "grad_norm": 0.04566599801182747, "learning_rate": 7.3785536159601e-06, "loss": 0.0003, "step": 25320 }, { "epoch": 3.158354114713217, "grad_norm": 0.0065649570897221565, "learning_rate": 7.3735660847880306e-06, "loss": 0.0, "step": 25330 }, { "epoch": 3.1596009975062342, "grad_norm": 0.062061987817287445, "learning_rate": 7.36857855361596e-06, "loss": 0.0352, "step": 25340 }, { "epoch": 3.1608478802992517, "grad_norm": 0.0006778668030165136, "learning_rate": 7.363591022443891e-06, "loss": 0.0, "step": 25350 }, { "epoch": 3.162094763092269, "grad_norm": 0.001736179576255381, "learning_rate": 7.358603491271821e-06, "loss": 0.0, "step": 25360 }, { "epoch": 3.163341645885287, "grad_norm": 0.0019771254155784845, "learning_rate": 7.353615960099751e-06, "loss": 0.0, "step": 25370 }, { "epoch": 3.1645885286783044, "grad_norm": 0.0007458809996023774, "learning_rate": 7.348628428927681e-06, "loss": 0.0246, "step": 25380 }, { "epoch": 3.165835411471322, "grad_norm": 0.001378356129862368, "learning_rate": 7.343640897755612e-06, "loss": 0.0001, "step": 25390 }, { "epoch": 3.1670822942643393, "grad_norm": 0.0007513531600125134, "learning_rate": 7.3386533665835415e-06, "loss": 0.0001, "step": 25400 }, { "epoch": 3.1683291770573567, "grad_norm": 0.00032524106791242957, "learning_rate": 7.333665835411472e-06, "loss": 0.0, "step": 25410 }, { "epoch": 3.169576059850374, "grad_norm": 0.0005543709266930819, "learning_rate": 7.328678304239402e-06, "loss": 0.0001, "step": 25420 }, { "epoch": 3.1708229426433916, "grad_norm": 0.0007538437494076788, "learning_rate": 7.3236907730673325e-06, "loss": 0.0001, "step": 25430 }, { "epoch": 3.172069825436409, "grad_norm": 0.0003479034348856658, "learning_rate": 7.318703241895262e-06, "loss": 0.0, "step": 25440 }, { "epoch": 3.1733167082294265, "grad_norm": 0.00032112517510540783, "learning_rate": 7.313715710723193e-06, "loss": 0.0514, "step": 25450 }, { "epoch": 3.174563591022444, "grad_norm": 68.04743194580078, "learning_rate": 7.308728179551122e-06, "loss": 0.0246, "step": 25460 }, { "epoch": 3.1758104738154613, "grad_norm": 0.003141113556921482, "learning_rate": 7.303740648379053e-06, "loss": 0.0, "step": 25470 }, { "epoch": 3.1770573566084788, "grad_norm": 127.886474609375, "learning_rate": 7.298753117206984e-06, "loss": 0.0086, "step": 25480 }, { "epoch": 3.178304239401496, "grad_norm": 0.0004362338804639876, "learning_rate": 7.293765586034913e-06, "loss": 0.0138, "step": 25490 }, { "epoch": 3.1795511221945136, "grad_norm": 0.43064260482788086, "learning_rate": 7.288778054862844e-06, "loss": 0.0002, "step": 25500 }, { "epoch": 3.180798004987531, "grad_norm": 0.0011624015169218183, "learning_rate": 7.283790523690773e-06, "loss": 0.0, "step": 25510 }, { "epoch": 3.1820448877805485, "grad_norm": 0.0005321303615346551, "learning_rate": 7.278802992518704e-06, "loss": 0.004, "step": 25520 }, { "epoch": 3.183291770573566, "grad_norm": 0.0004226687888149172, "learning_rate": 7.273815461346634e-06, "loss": 0.0, "step": 25530 }, { "epoch": 3.1845386533665834, "grad_norm": 0.00025975590688176453, "learning_rate": 7.268827930174564e-06, "loss": 0.0082, "step": 25540 }, { "epoch": 3.185785536159601, "grad_norm": 0.00041354497079737484, "learning_rate": 7.263840399002494e-06, "loss": 0.0548, "step": 25550 }, { "epoch": 3.1870324189526187, "grad_norm": 0.0013360625598579645, "learning_rate": 7.258852867830425e-06, "loss": 0.0001, "step": 25560 }, { "epoch": 3.188279301745636, "grad_norm": 0.00040121175698004663, "learning_rate": 7.253865336658354e-06, "loss": 0.0464, "step": 25570 }, { "epoch": 3.1895261845386536, "grad_norm": 0.001519997720606625, "learning_rate": 7.248877805486285e-06, "loss": 0.0, "step": 25580 }, { "epoch": 3.190773067331671, "grad_norm": 0.0005633990513160825, "learning_rate": 7.243890274314215e-06, "loss": 0.0, "step": 25590 }, { "epoch": 3.1920199501246884, "grad_norm": 0.0007187969167716801, "learning_rate": 7.238902743142145e-06, "loss": 0.0, "step": 25600 }, { "epoch": 3.193266832917706, "grad_norm": 0.026836052536964417, "learning_rate": 7.233915211970075e-06, "loss": 0.0002, "step": 25610 }, { "epoch": 3.1945137157107233, "grad_norm": 0.001607574988156557, "learning_rate": 7.228927680798006e-06, "loss": 0.0034, "step": 25620 }, { "epoch": 3.1957605985037407, "grad_norm": 0.0025988956913352013, "learning_rate": 7.223940149625936e-06, "loss": 0.0, "step": 25630 }, { "epoch": 3.197007481296758, "grad_norm": 86.0445556640625, "learning_rate": 7.218952618453866e-06, "loss": 0.0128, "step": 25640 }, { "epoch": 3.1982543640897756, "grad_norm": 0.008842726238071918, "learning_rate": 7.213965087281796e-06, "loss": 0.0, "step": 25650 }, { "epoch": 3.199501246882793, "grad_norm": 0.0012176205636933446, "learning_rate": 7.208977556109727e-06, "loss": 0.0242, "step": 25660 }, { "epoch": 3.2007481296758105, "grad_norm": 0.00039986352203413844, "learning_rate": 7.203990024937656e-06, "loss": 0.0, "step": 25670 }, { "epoch": 3.201995012468828, "grad_norm": 0.0003596430760808289, "learning_rate": 7.199002493765587e-06, "loss": 0.0, "step": 25680 }, { "epoch": 3.2032418952618453, "grad_norm": 0.0017086360603570938, "learning_rate": 7.194014962593516e-06, "loss": 0.0317, "step": 25690 }, { "epoch": 3.2044887780548628, "grad_norm": 0.005245603155344725, "learning_rate": 7.189027431421447e-06, "loss": 0.0358, "step": 25700 }, { "epoch": 3.20573566084788, "grad_norm": 0.0008312583668157458, "learning_rate": 7.184039900249378e-06, "loss": 0.0, "step": 25710 }, { "epoch": 3.2069825436408976, "grad_norm": 22.51250648498535, "learning_rate": 7.179052369077307e-06, "loss": 0.0022, "step": 25720 }, { "epoch": 3.208229426433915, "grad_norm": 0.001020867028273642, "learning_rate": 7.174064837905238e-06, "loss": 0.0004, "step": 25730 }, { "epoch": 3.2094763092269325, "grad_norm": 0.00029622670263051987, "learning_rate": 7.169077306733167e-06, "loss": 0.0034, "step": 25740 }, { "epoch": 3.21072319201995, "grad_norm": 0.001556044677272439, "learning_rate": 7.164089775561098e-06, "loss": 0.033, "step": 25750 }, { "epoch": 3.2119700748129674, "grad_norm": 0.0003500843304209411, "learning_rate": 7.159102244389028e-06, "loss": 0.049, "step": 25760 }, { "epoch": 3.213216957605985, "grad_norm": 0.0003617958864197135, "learning_rate": 7.154114713216958e-06, "loss": 0.0001, "step": 25770 }, { "epoch": 3.2144638403990027, "grad_norm": 0.0004431404813658446, "learning_rate": 7.149127182044888e-06, "loss": 0.0, "step": 25780 }, { "epoch": 3.21571072319202, "grad_norm": 0.0016715474193915725, "learning_rate": 7.144139650872819e-06, "loss": 0.0, "step": 25790 }, { "epoch": 3.2169576059850375, "grad_norm": 0.001232789596542716, "learning_rate": 7.1391521197007485e-06, "loss": 0.0, "step": 25800 }, { "epoch": 3.218204488778055, "grad_norm": 0.0007746867486275733, "learning_rate": 7.134164588528679e-06, "loss": 0.0, "step": 25810 }, { "epoch": 3.2194513715710724, "grad_norm": 0.0025138193741440773, "learning_rate": 7.129177057356609e-06, "loss": 0.0002, "step": 25820 }, { "epoch": 3.22069825436409, "grad_norm": 0.00021336287318263203, "learning_rate": 7.1241895261845395e-06, "loss": 0.048, "step": 25830 }, { "epoch": 3.2219451371571073, "grad_norm": 0.008427153341472149, "learning_rate": 7.119201995012469e-06, "loss": 0.0344, "step": 25840 }, { "epoch": 3.2231920199501247, "grad_norm": 15.330975532531738, "learning_rate": 7.1142144638404e-06, "loss": 0.1138, "step": 25850 }, { "epoch": 3.224438902743142, "grad_norm": 0.01085032057017088, "learning_rate": 7.10922693266833e-06, "loss": 0.0, "step": 25860 }, { "epoch": 3.2256857855361596, "grad_norm": 0.0007842654013074934, "learning_rate": 7.10423940149626e-06, "loss": 0.0265, "step": 25870 }, { "epoch": 3.226932668329177, "grad_norm": 0.0009141101036220789, "learning_rate": 7.09925187032419e-06, "loss": 0.0001, "step": 25880 }, { "epoch": 3.2281795511221945, "grad_norm": 0.004496920388191938, "learning_rate": 7.094264339152121e-06, "loss": 0.0116, "step": 25890 }, { "epoch": 3.229426433915212, "grad_norm": 0.0016870100516825914, "learning_rate": 7.0892768079800504e-06, "loss": 0.004, "step": 25900 }, { "epoch": 3.2306733167082293, "grad_norm": 0.0009073169785551727, "learning_rate": 7.084289276807981e-06, "loss": 0.0001, "step": 25910 }, { "epoch": 3.2319201995012468, "grad_norm": 0.0010009161196649075, "learning_rate": 7.07930174563591e-06, "loss": 0.0469, "step": 25920 }, { "epoch": 3.233167082294264, "grad_norm": 0.002670197281986475, "learning_rate": 7.074314214463841e-06, "loss": 0.0142, "step": 25930 }, { "epoch": 3.2344139650872816, "grad_norm": 0.0038882438093423843, "learning_rate": 7.06932668329177e-06, "loss": 0.0001, "step": 25940 }, { "epoch": 3.235660847880299, "grad_norm": 6.603517532348633, "learning_rate": 7.064339152119701e-06, "loss": 0.0008, "step": 25950 }, { "epoch": 3.236907730673317, "grad_norm": 0.3903234601020813, "learning_rate": 7.059351620947632e-06, "loss": 0.0002, "step": 25960 }, { "epoch": 3.2381546134663344, "grad_norm": 0.0041964612901210785, "learning_rate": 7.054364089775561e-06, "loss": 0.0066, "step": 25970 }, { "epoch": 3.239401496259352, "grad_norm": 0.0007558322977274656, "learning_rate": 7.049376558603492e-06, "loss": 0.0002, "step": 25980 }, { "epoch": 3.2406483790523692, "grad_norm": 0.00039787104469724, "learning_rate": 7.044389027431422e-06, "loss": 0.0, "step": 25990 }, { "epoch": 3.2418952618453867, "grad_norm": 0.0004548293072730303, "learning_rate": 7.039401496259352e-06, "loss": 0.0003, "step": 26000 }, { "epoch": 3.243142144638404, "grad_norm": 0.0005862874095328152, "learning_rate": 7.034413965087282e-06, "loss": 0.0, "step": 26010 }, { "epoch": 3.2443890274314215, "grad_norm": 0.021068502217531204, "learning_rate": 7.029426433915213e-06, "loss": 0.0, "step": 26020 }, { "epoch": 3.245635910224439, "grad_norm": 0.01115128118544817, "learning_rate": 7.0244389027431426e-06, "loss": 0.0, "step": 26030 }, { "epoch": 3.2468827930174564, "grad_norm": 0.012454736977815628, "learning_rate": 7.019451371571073e-06, "loss": 0.0034, "step": 26040 }, { "epoch": 3.248129675810474, "grad_norm": 0.003131842939183116, "learning_rate": 7.014463840399003e-06, "loss": 0.0001, "step": 26050 }, { "epoch": 3.2493765586034913, "grad_norm": 0.0019079704070463777, "learning_rate": 7.009476309226934e-06, "loss": 0.0, "step": 26060 }, { "epoch": 3.2506234413965087, "grad_norm": 0.0006699951482005417, "learning_rate": 7.004488778054863e-06, "loss": 0.0001, "step": 26070 }, { "epoch": 3.251870324189526, "grad_norm": 21.092519760131836, "learning_rate": 6.999501246882794e-06, "loss": 0.043, "step": 26080 }, { "epoch": 3.2531172069825436, "grad_norm": 15.397884368896484, "learning_rate": 6.994513715710724e-06, "loss": 0.0011, "step": 26090 }, { "epoch": 3.254364089775561, "grad_norm": 0.0016310204518958926, "learning_rate": 6.989526184538654e-06, "loss": 0.0001, "step": 26100 }, { "epoch": 3.2556109725685785, "grad_norm": 0.0010182487312704325, "learning_rate": 6.984538653366584e-06, "loss": 0.0289, "step": 26110 }, { "epoch": 3.256857855361596, "grad_norm": 0.0008488223538734019, "learning_rate": 6.979551122194515e-06, "loss": 0.0003, "step": 26120 }, { "epoch": 3.2581047381546133, "grad_norm": 0.040211670100688934, "learning_rate": 6.974563591022444e-06, "loss": 0.0001, "step": 26130 }, { "epoch": 3.2593516209476308, "grad_norm": 0.0003044347686227411, "learning_rate": 6.969576059850375e-06, "loss": 0.0, "step": 26140 }, { "epoch": 3.260598503740648, "grad_norm": 0.0003406737232580781, "learning_rate": 6.964588528678304e-06, "loss": 0.0283, "step": 26150 }, { "epoch": 3.2618453865336656, "grad_norm": 0.0006763077690266073, "learning_rate": 6.959600997506235e-06, "loss": 0.0006, "step": 26160 }, { "epoch": 3.263092269326683, "grad_norm": 20.708433151245117, "learning_rate": 6.9546134663341645e-06, "loss": 0.041, "step": 26170 }, { "epoch": 3.264339152119701, "grad_norm": 0.000766753451898694, "learning_rate": 6.949625935162095e-06, "loss": 0.0, "step": 26180 }, { "epoch": 3.2655860349127184, "grad_norm": 0.00931212492287159, "learning_rate": 6.944638403990025e-06, "loss": 0.0, "step": 26190 }, { "epoch": 3.266832917705736, "grad_norm": 0.0005553375813178718, "learning_rate": 6.9396508728179555e-06, "loss": 0.0, "step": 26200 }, { "epoch": 3.2680798004987532, "grad_norm": 0.0011248165974393487, "learning_rate": 6.934663341645886e-06, "loss": 0.0187, "step": 26210 }, { "epoch": 3.2693266832917707, "grad_norm": 0.02715372107923031, "learning_rate": 6.929675810473816e-06, "loss": 0.0, "step": 26220 }, { "epoch": 3.270573566084788, "grad_norm": 0.0005466627771966159, "learning_rate": 6.9246882793017465e-06, "loss": 0.0, "step": 26230 }, { "epoch": 3.2718204488778055, "grad_norm": 0.0001991336466744542, "learning_rate": 6.919700748129676e-06, "loss": 0.0, "step": 26240 }, { "epoch": 3.273067331670823, "grad_norm": 0.0004780891176778823, "learning_rate": 6.914713216957607e-06, "loss": 0.0, "step": 26250 }, { "epoch": 3.2743142144638404, "grad_norm": 0.0012583925854414701, "learning_rate": 6.909725685785537e-06, "loss": 0.0, "step": 26260 }, { "epoch": 3.275561097256858, "grad_norm": 0.0002583898603916168, "learning_rate": 6.904738154613467e-06, "loss": 0.0001, "step": 26270 }, { "epoch": 3.2768079800498753, "grad_norm": 0.008905318565666676, "learning_rate": 6.899750623441397e-06, "loss": 0.0256, "step": 26280 }, { "epoch": 3.2780548628428927, "grad_norm": 0.00047971465392038226, "learning_rate": 6.894763092269328e-06, "loss": 0.0, "step": 26290 }, { "epoch": 3.27930174563591, "grad_norm": 0.01915428787469864, "learning_rate": 6.8897755610972574e-06, "loss": 0.0001, "step": 26300 }, { "epoch": 3.2805486284289276, "grad_norm": 0.00032235312392003834, "learning_rate": 6.884788029925188e-06, "loss": 0.0, "step": 26310 }, { "epoch": 3.281795511221945, "grad_norm": 0.0005441360990516841, "learning_rate": 6.879800498753118e-06, "loss": 0.0, "step": 26320 }, { "epoch": 3.2830423940149625, "grad_norm": 42.98784637451172, "learning_rate": 6.8748129675810484e-06, "loss": 0.06, "step": 26330 }, { "epoch": 3.28428927680798, "grad_norm": 0.000571958429645747, "learning_rate": 6.869825436408978e-06, "loss": 0.0167, "step": 26340 }, { "epoch": 3.2855361596009973, "grad_norm": 0.00067878607660532, "learning_rate": 6.864837905236909e-06, "loss": 0.0001, "step": 26350 }, { "epoch": 3.286783042394015, "grad_norm": 0.0006823897711001337, "learning_rate": 6.859850374064838e-06, "loss": 0.0, "step": 26360 }, { "epoch": 3.2880299251870326, "grad_norm": 0.0009005233878269792, "learning_rate": 6.854862842892769e-06, "loss": 0.0408, "step": 26370 }, { "epoch": 3.28927680798005, "grad_norm": 0.0013887096429243684, "learning_rate": 6.849875311720698e-06, "loss": 0.0, "step": 26380 }, { "epoch": 3.2905236907730675, "grad_norm": 0.6036227345466614, "learning_rate": 6.844887780548629e-06, "loss": 0.0001, "step": 26390 }, { "epoch": 3.291770573566085, "grad_norm": 0.0027249318081885576, "learning_rate": 6.8399002493765585e-06, "loss": 0.0, "step": 26400 }, { "epoch": 3.2930174563591024, "grad_norm": 0.0006757620139978826, "learning_rate": 6.834912718204489e-06, "loss": 0.0, "step": 26410 }, { "epoch": 3.29426433915212, "grad_norm": 0.0004033853765577078, "learning_rate": 6.829925187032419e-06, "loss": 0.0, "step": 26420 }, { "epoch": 3.2955112219451372, "grad_norm": 0.0029158780816942453, "learning_rate": 6.8249376558603496e-06, "loss": 0.0004, "step": 26430 }, { "epoch": 3.2967581047381547, "grad_norm": 0.0003885283076670021, "learning_rate": 6.819950124688279e-06, "loss": 0.0, "step": 26440 }, { "epoch": 3.298004987531172, "grad_norm": 0.0016651960322633386, "learning_rate": 6.81496259351621e-06, "loss": 0.0, "step": 26450 }, { "epoch": 3.2992518703241895, "grad_norm": 27.317020416259766, "learning_rate": 6.8099750623441406e-06, "loss": 0.0071, "step": 26460 }, { "epoch": 3.300498753117207, "grad_norm": 0.002943522296845913, "learning_rate": 6.80498753117207e-06, "loss": 0.0001, "step": 26470 }, { "epoch": 3.3017456359102244, "grad_norm": 0.0005533623043447733, "learning_rate": 6.800000000000001e-06, "loss": 0.0461, "step": 26480 }, { "epoch": 3.302992518703242, "grad_norm": 0.0010401762556284666, "learning_rate": 6.795012468827931e-06, "loss": 0.0, "step": 26490 }, { "epoch": 3.3042394014962593, "grad_norm": 0.0009254756732843816, "learning_rate": 6.790024937655861e-06, "loss": 0.0021, "step": 26500 }, { "epoch": 3.3054862842892767, "grad_norm": 0.0019067944958806038, "learning_rate": 6.785037406483791e-06, "loss": 0.0001, "step": 26510 }, { "epoch": 3.306733167082294, "grad_norm": 0.0010872195707634091, "learning_rate": 6.780049875311722e-06, "loss": 0.0, "step": 26520 }, { "epoch": 3.3079800498753116, "grad_norm": 0.0006007709307596087, "learning_rate": 6.7750623441396515e-06, "loss": 0.0001, "step": 26530 }, { "epoch": 3.309226932668329, "grad_norm": 39.467369079589844, "learning_rate": 6.770074812967582e-06, "loss": 0.0233, "step": 26540 }, { "epoch": 3.3104738154613464, "grad_norm": 20.561660766601562, "learning_rate": 6.765087281795512e-06, "loss": 0.0417, "step": 26550 }, { "epoch": 3.311720698254364, "grad_norm": 0.000338366546202451, "learning_rate": 6.7600997506234425e-06, "loss": 0.0001, "step": 26560 }, { "epoch": 3.3129675810473813, "grad_norm": 0.0014671689132228494, "learning_rate": 6.7551122194513715e-06, "loss": 0.0097, "step": 26570 }, { "epoch": 3.314214463840399, "grad_norm": 0.0009753374033607543, "learning_rate": 6.750124688279303e-06, "loss": 0.0286, "step": 26580 }, { "epoch": 3.3154613466334166, "grad_norm": 0.0007874347502365708, "learning_rate": 6.745137157107232e-06, "loss": 0.0, "step": 26590 }, { "epoch": 3.316708229426434, "grad_norm": 0.003849888453260064, "learning_rate": 6.7401496259351625e-06, "loss": 0.0001, "step": 26600 }, { "epoch": 3.3179551122194515, "grad_norm": 0.0004690260102506727, "learning_rate": 6.735162094763092e-06, "loss": 0.0, "step": 26610 }, { "epoch": 3.319201995012469, "grad_norm": 0.00019917161262128502, "learning_rate": 6.730174563591023e-06, "loss": 0.029, "step": 26620 }, { "epoch": 3.3204488778054864, "grad_norm": 0.008112260140478611, "learning_rate": 6.725187032418953e-06, "loss": 0.0001, "step": 26630 }, { "epoch": 3.321695760598504, "grad_norm": 0.000990871456451714, "learning_rate": 6.720199501246883e-06, "loss": 0.0001, "step": 26640 }, { "epoch": 3.3229426433915212, "grad_norm": 0.0007548572611995041, "learning_rate": 6.715211970074813e-06, "loss": 0.0, "step": 26650 }, { "epoch": 3.3241895261845387, "grad_norm": 0.00047548863221891224, "learning_rate": 6.710224438902744e-06, "loss": 0.0001, "step": 26660 }, { "epoch": 3.325436408977556, "grad_norm": 0.004564858041703701, "learning_rate": 6.705236907730673e-06, "loss": 0.0115, "step": 26670 }, { "epoch": 3.3266832917705735, "grad_norm": 0.0006807594327256083, "learning_rate": 6.700249376558604e-06, "loss": 0.0003, "step": 26680 }, { "epoch": 3.327930174563591, "grad_norm": 0.0005415382911451161, "learning_rate": 6.695261845386534e-06, "loss": 0.0018, "step": 26690 }, { "epoch": 3.3291770573566084, "grad_norm": 0.0002564255555626005, "learning_rate": 6.6902743142144644e-06, "loss": 0.0, "step": 26700 }, { "epoch": 3.330423940149626, "grad_norm": 0.000789106881711632, "learning_rate": 6.685286783042395e-06, "loss": 0.0001, "step": 26710 }, { "epoch": 3.3316708229426433, "grad_norm": 0.0043001859448850155, "learning_rate": 6.680299251870325e-06, "loss": 0.0, "step": 26720 }, { "epoch": 3.3329177057356607, "grad_norm": 0.00039159294101409614, "learning_rate": 6.6753117206982554e-06, "loss": 0.0, "step": 26730 }, { "epoch": 3.334164588528678, "grad_norm": 0.00038974048220552504, "learning_rate": 6.670324189526185e-06, "loss": 0.0, "step": 26740 }, { "epoch": 3.3354114713216956, "grad_norm": 0.00043641115189529955, "learning_rate": 6.665336658354116e-06, "loss": 0.0, "step": 26750 }, { "epoch": 3.3366583541147135, "grad_norm": 0.0005020300159230828, "learning_rate": 6.660349127182046e-06, "loss": 0.0018, "step": 26760 }, { "epoch": 3.337905236907731, "grad_norm": 0.0005201539606787264, "learning_rate": 6.655361596009976e-06, "loss": 0.0553, "step": 26770 }, { "epoch": 3.3391521197007483, "grad_norm": 0.0006099382881075144, "learning_rate": 6.650374064837906e-06, "loss": 0.0, "step": 26780 }, { "epoch": 3.3403990024937658, "grad_norm": 0.0018274527974426746, "learning_rate": 6.645386533665837e-06, "loss": 0.0, "step": 26790 }, { "epoch": 3.341645885286783, "grad_norm": 0.0006980765028856695, "learning_rate": 6.6403990024937655e-06, "loss": 0.0045, "step": 26800 }, { "epoch": 3.3428927680798006, "grad_norm": 0.00030875447555445135, "learning_rate": 6.635411471321697e-06, "loss": 0.0091, "step": 26810 }, { "epoch": 3.344139650872818, "grad_norm": 1.3320343494415283, "learning_rate": 6.630423940149626e-06, "loss": 0.0002, "step": 26820 }, { "epoch": 3.3453865336658355, "grad_norm": 0.019607802852988243, "learning_rate": 6.6254364089775565e-06, "loss": 0.0, "step": 26830 }, { "epoch": 3.346633416458853, "grad_norm": 0.00037203048123046756, "learning_rate": 6.620448877805486e-06, "loss": 0.0, "step": 26840 }, { "epoch": 3.3478802992518704, "grad_norm": 0.0021146514918655157, "learning_rate": 6.615461346633417e-06, "loss": 0.0001, "step": 26850 }, { "epoch": 3.349127182044888, "grad_norm": 0.002595689380541444, "learning_rate": 6.610473815461347e-06, "loss": 0.0751, "step": 26860 }, { "epoch": 3.3503740648379052, "grad_norm": 0.00038239543209783733, "learning_rate": 6.605486284289277e-06, "loss": 0.0, "step": 26870 }, { "epoch": 3.3516209476309227, "grad_norm": 0.0037817058619111776, "learning_rate": 6.600498753117207e-06, "loss": 0.0565, "step": 26880 }, { "epoch": 3.35286783042394, "grad_norm": 0.0012505949707701802, "learning_rate": 6.595511221945138e-06, "loss": 0.0, "step": 26890 }, { "epoch": 3.3541147132169575, "grad_norm": 0.0010186401195824146, "learning_rate": 6.5905236907730675e-06, "loss": 0.0002, "step": 26900 }, { "epoch": 3.355361596009975, "grad_norm": 0.0003353485371917486, "learning_rate": 6.585536159600998e-06, "loss": 0.0006, "step": 26910 }, { "epoch": 3.3566084788029924, "grad_norm": 0.0005550871719606221, "learning_rate": 6.580548628428928e-06, "loss": 0.0004, "step": 26920 }, { "epoch": 3.35785536159601, "grad_norm": 0.0023827122058719397, "learning_rate": 6.5755610972568585e-06, "loss": 0.0031, "step": 26930 }, { "epoch": 3.3591022443890273, "grad_norm": 0.007178848143666983, "learning_rate": 6.570573566084788e-06, "loss": 0.049, "step": 26940 }, { "epoch": 3.3603491271820447, "grad_norm": 0.0006301432149484754, "learning_rate": 6.565586034912719e-06, "loss": 0.0, "step": 26950 }, { "epoch": 3.361596009975062, "grad_norm": 0.0008799605420790613, "learning_rate": 6.5605985037406495e-06, "loss": 0.0, "step": 26960 }, { "epoch": 3.3628428927680796, "grad_norm": 0.000438713381299749, "learning_rate": 6.555610972568579e-06, "loss": 0.0382, "step": 26970 }, { "epoch": 3.3640897755610975, "grad_norm": 0.13686221837997437, "learning_rate": 6.55062344139651e-06, "loss": 0.0001, "step": 26980 }, { "epoch": 3.365336658354115, "grad_norm": 0.4949225187301636, "learning_rate": 6.54563591022444e-06, "loss": 0.0001, "step": 26990 }, { "epoch": 3.3665835411471323, "grad_norm": 0.03666635975241661, "learning_rate": 6.54064837905237e-06, "loss": 0.053, "step": 27000 }, { "epoch": 3.3678304239401498, "grad_norm": 0.0034022931940853596, "learning_rate": 6.535660847880299e-06, "loss": 0.0001, "step": 27010 }, { "epoch": 3.369077306733167, "grad_norm": 0.006523266434669495, "learning_rate": 6.530673316708231e-06, "loss": 0.0147, "step": 27020 }, { "epoch": 3.3703241895261846, "grad_norm": 0.03280205652117729, "learning_rate": 6.52568578553616e-06, "loss": 0.0372, "step": 27030 }, { "epoch": 3.371571072319202, "grad_norm": 0.002701780293136835, "learning_rate": 6.52069825436409e-06, "loss": 0.0098, "step": 27040 }, { "epoch": 3.3728179551122195, "grad_norm": 0.0005019395030103624, "learning_rate": 6.51571072319202e-06, "loss": 0.0017, "step": 27050 }, { "epoch": 3.374064837905237, "grad_norm": 0.000222506292629987, "learning_rate": 6.510723192019951e-06, "loss": 0.0001, "step": 27060 }, { "epoch": 3.3753117206982544, "grad_norm": 0.05252045765519142, "learning_rate": 6.50573566084788e-06, "loss": 0.0001, "step": 27070 }, { "epoch": 3.376558603491272, "grad_norm": 0.0026390505954623222, "learning_rate": 6.500748129675811e-06, "loss": 0.0, "step": 27080 }, { "epoch": 3.3778054862842892, "grad_norm": 0.0017233664402738214, "learning_rate": 6.495760598503741e-06, "loss": 0.0001, "step": 27090 }, { "epoch": 3.3790523690773067, "grad_norm": 0.000762695271987468, "learning_rate": 6.490773067331671e-06, "loss": 0.0001, "step": 27100 }, { "epoch": 3.380299251870324, "grad_norm": 0.03908080980181694, "learning_rate": 6.485785536159601e-06, "loss": 0.0, "step": 27110 }, { "epoch": 3.3815461346633415, "grad_norm": 0.009048929437994957, "learning_rate": 6.480798004987532e-06, "loss": 0.0783, "step": 27120 }, { "epoch": 3.382793017456359, "grad_norm": 0.0003922952164430171, "learning_rate": 6.475810473815462e-06, "loss": 0.0, "step": 27130 }, { "epoch": 3.3840399002493764, "grad_norm": 0.0037547594401985407, "learning_rate": 6.470822942643392e-06, "loss": 0.0003, "step": 27140 }, { "epoch": 3.385286783042394, "grad_norm": 0.002254948951303959, "learning_rate": 6.465835411471322e-06, "loss": 0.0, "step": 27150 }, { "epoch": 3.3865336658354117, "grad_norm": 0.0015663010999560356, "learning_rate": 6.460847880299253e-06, "loss": 0.0001, "step": 27160 }, { "epoch": 3.387780548628429, "grad_norm": 0.009484532289206982, "learning_rate": 6.455860349127182e-06, "loss": 0.0005, "step": 27170 }, { "epoch": 3.3890274314214466, "grad_norm": 1.5235114097595215, "learning_rate": 6.450872817955113e-06, "loss": 0.0036, "step": 27180 }, { "epoch": 3.390274314214464, "grad_norm": 0.000923574494663626, "learning_rate": 6.445885286783043e-06, "loss": 0.0002, "step": 27190 }, { "epoch": 3.3915211970074814, "grad_norm": 0.00219345442019403, "learning_rate": 6.440897755610973e-06, "loss": 0.0, "step": 27200 }, { "epoch": 3.392768079800499, "grad_norm": 0.00026506150607019663, "learning_rate": 6.435910224438904e-06, "loss": 0.0404, "step": 27210 }, { "epoch": 3.3940149625935163, "grad_norm": 0.00028258541715331376, "learning_rate": 6.430922693266834e-06, "loss": 0.0002, "step": 27220 }, { "epoch": 3.3952618453865338, "grad_norm": 0.0068185459822416306, "learning_rate": 6.425935162094764e-06, "loss": 0.0003, "step": 27230 }, { "epoch": 3.396508728179551, "grad_norm": 0.0003236836346331984, "learning_rate": 6.421446384039901e-06, "loss": 0.0054, "step": 27240 }, { "epoch": 3.3977556109725686, "grad_norm": 0.0008242797921411693, "learning_rate": 6.416458852867831e-06, "loss": 0.0, "step": 27250 }, { "epoch": 3.399002493765586, "grad_norm": 0.00034484322532080114, "learning_rate": 6.411471321695761e-06, "loss": 0.0397, "step": 27260 }, { "epoch": 3.4002493765586035, "grad_norm": 0.0011520058615133166, "learning_rate": 6.406483790523691e-06, "loss": 0.0, "step": 27270 }, { "epoch": 3.401496259351621, "grad_norm": 0.00017111722263507545, "learning_rate": 6.401496259351622e-06, "loss": 0.0, "step": 27280 }, { "epoch": 3.4027431421446384, "grad_norm": 0.0003394628001842648, "learning_rate": 6.3965087281795515e-06, "loss": 0.0, "step": 27290 }, { "epoch": 3.403990024937656, "grad_norm": 0.00040050517418421805, "learning_rate": 6.391521197007482e-06, "loss": 0.0, "step": 27300 }, { "epoch": 3.4052369077306732, "grad_norm": 0.0007083789096213877, "learning_rate": 6.386533665835412e-06, "loss": 0.0, "step": 27310 }, { "epoch": 3.4064837905236907, "grad_norm": 0.0008792931330390275, "learning_rate": 6.3815461346633425e-06, "loss": 0.0, "step": 27320 }, { "epoch": 3.407730673316708, "grad_norm": 0.00042291832505725324, "learning_rate": 6.376558603491272e-06, "loss": 0.0002, "step": 27330 }, { "epoch": 3.4089775561097255, "grad_norm": 0.015671808272600174, "learning_rate": 6.371571072319203e-06, "loss": 0.0001, "step": 27340 }, { "epoch": 3.410224438902743, "grad_norm": 0.00033854617504402995, "learning_rate": 6.366583541147132e-06, "loss": 0.0002, "step": 27350 }, { "epoch": 3.4114713216957604, "grad_norm": 0.015324210748076439, "learning_rate": 6.361596009975063e-06, "loss": 0.0, "step": 27360 }, { "epoch": 3.412718204488778, "grad_norm": 0.00865886453539133, "learning_rate": 6.356608478802994e-06, "loss": 0.0, "step": 27370 }, { "epoch": 3.4139650872817953, "grad_norm": 0.00048793648602440953, "learning_rate": 6.351620947630923e-06, "loss": 0.0015, "step": 27380 }, { "epoch": 3.415211970074813, "grad_norm": 0.0030084741301834583, "learning_rate": 6.346633416458854e-06, "loss": 0.0, "step": 27390 }, { "epoch": 3.4164588528678306, "grad_norm": 0.0003942911571357399, "learning_rate": 6.341645885286783e-06, "loss": 0.0, "step": 27400 }, { "epoch": 3.417705735660848, "grad_norm": 0.002323322230949998, "learning_rate": 6.336658354114714e-06, "loss": 0.0, "step": 27410 }, { "epoch": 3.4189526184538654, "grad_norm": 0.0004297247505746782, "learning_rate": 6.331670822942644e-06, "loss": 0.0, "step": 27420 }, { "epoch": 3.420199501246883, "grad_norm": 0.005649714730679989, "learning_rate": 6.326683291770574e-06, "loss": 0.0001, "step": 27430 }, { "epoch": 3.4214463840399003, "grad_norm": 0.013090058229863644, "learning_rate": 6.321695760598504e-06, "loss": 0.0142, "step": 27440 }, { "epoch": 3.4226932668329177, "grad_norm": 0.018311014398932457, "learning_rate": 6.316708229426435e-06, "loss": 0.0008, "step": 27450 }, { "epoch": 3.423940149625935, "grad_norm": 0.0004497721092775464, "learning_rate": 6.311720698254364e-06, "loss": 0.0001, "step": 27460 }, { "epoch": 3.4251870324189526, "grad_norm": 59.970619201660156, "learning_rate": 6.306733167082295e-06, "loss": 0.0225, "step": 27470 }, { "epoch": 3.42643391521197, "grad_norm": 0.00020840381330344826, "learning_rate": 6.301745635910225e-06, "loss": 0.0004, "step": 27480 }, { "epoch": 3.4276807980049875, "grad_norm": 0.002133867936208844, "learning_rate": 6.296758104738155e-06, "loss": 0.0, "step": 27490 }, { "epoch": 3.428927680798005, "grad_norm": 0.00448261946439743, "learning_rate": 6.291770573566085e-06, "loss": 0.0, "step": 27500 }, { "epoch": 3.4301745635910224, "grad_norm": 7.075658321380615, "learning_rate": 6.286783042394016e-06, "loss": 0.0531, "step": 27510 }, { "epoch": 3.43142144638404, "grad_norm": 0.00027553230756893754, "learning_rate": 6.2817955112219456e-06, "loss": 0.0, "step": 27520 }, { "epoch": 3.432668329177057, "grad_norm": 0.0008446983993053436, "learning_rate": 6.276807980049876e-06, "loss": 0.0001, "step": 27530 }, { "epoch": 3.4339152119700747, "grad_norm": 0.00023807137040421367, "learning_rate": 6.271820448877806e-06, "loss": 0.0, "step": 27540 }, { "epoch": 3.435162094763092, "grad_norm": 0.00029441763763315976, "learning_rate": 6.266832917705737e-06, "loss": 0.1117, "step": 27550 }, { "epoch": 3.43640897755611, "grad_norm": 0.005209818482398987, "learning_rate": 6.2618453865336655e-06, "loss": 0.0056, "step": 27560 }, { "epoch": 3.4376558603491274, "grad_norm": 0.035485655069351196, "learning_rate": 6.256857855361597e-06, "loss": 0.0, "step": 27570 }, { "epoch": 3.438902743142145, "grad_norm": 0.0030119556467980146, "learning_rate": 6.251870324189526e-06, "loss": 0.0, "step": 27580 }, { "epoch": 3.4401496259351623, "grad_norm": 0.0007518244674429297, "learning_rate": 6.2468827930174565e-06, "loss": 0.0, "step": 27590 }, { "epoch": 3.4413965087281797, "grad_norm": 0.000781845476012677, "learning_rate": 6.241895261845386e-06, "loss": 0.0, "step": 27600 }, { "epoch": 3.442643391521197, "grad_norm": 0.046625442802906036, "learning_rate": 6.236907730673317e-06, "loss": 0.0, "step": 27610 }, { "epoch": 3.4438902743142146, "grad_norm": 0.0009465779294259846, "learning_rate": 6.2319201995012475e-06, "loss": 0.0124, "step": 27620 }, { "epoch": 3.445137157107232, "grad_norm": 6.235825538635254, "learning_rate": 6.226932668329177e-06, "loss": 0.0005, "step": 27630 }, { "epoch": 3.4463840399002494, "grad_norm": 0.00022234499920159578, "learning_rate": 6.221945137157108e-06, "loss": 0.0, "step": 27640 }, { "epoch": 3.447630922693267, "grad_norm": 0.00037174602039158344, "learning_rate": 6.216957605985038e-06, "loss": 0.0032, "step": 27650 }, { "epoch": 3.4488778054862843, "grad_norm": 0.0011786530958488584, "learning_rate": 6.211970074812968e-06, "loss": 0.0, "step": 27660 }, { "epoch": 3.4501246882793017, "grad_norm": 0.0008283031056635082, "learning_rate": 6.206982543640898e-06, "loss": 0.0002, "step": 27670 }, { "epoch": 3.451371571072319, "grad_norm": 0.00013709701306652278, "learning_rate": 6.201995012468829e-06, "loss": 0.0264, "step": 27680 }, { "epoch": 3.4526184538653366, "grad_norm": 0.0002293088473379612, "learning_rate": 6.1970074812967585e-06, "loss": 0.0, "step": 27690 }, { "epoch": 3.453865336658354, "grad_norm": 0.0011270396644249558, "learning_rate": 6.192019950124689e-06, "loss": 0.0186, "step": 27700 }, { "epoch": 3.4551122194513715, "grad_norm": 0.002886722795665264, "learning_rate": 6.187032418952619e-06, "loss": 0.0, "step": 27710 }, { "epoch": 3.456359102244389, "grad_norm": 0.002366556553170085, "learning_rate": 6.1820448877805495e-06, "loss": 0.0, "step": 27720 }, { "epoch": 3.4576059850374063, "grad_norm": 0.00040443125180900097, "learning_rate": 6.177057356608479e-06, "loss": 0.0, "step": 27730 }, { "epoch": 3.458852867830424, "grad_norm": 8.750553131103516, "learning_rate": 6.17206982543641e-06, "loss": 0.0339, "step": 27740 }, { "epoch": 3.460099750623441, "grad_norm": 0.00026490926393307745, "learning_rate": 6.16708229426434e-06, "loss": 0.0, "step": 27750 }, { "epoch": 3.4613466334164587, "grad_norm": 0.0013308553025126457, "learning_rate": 6.16209476309227e-06, "loss": 0.0, "step": 27760 }, { "epoch": 3.462593516209476, "grad_norm": 0.000745519355405122, "learning_rate": 6.1571072319202e-06, "loss": 0.0037, "step": 27770 }, { "epoch": 3.4638403990024935, "grad_norm": 0.00035905808908864856, "learning_rate": 6.152119700748131e-06, "loss": 0.0097, "step": 27780 }, { "epoch": 3.4650872817955114, "grad_norm": 0.04515613242983818, "learning_rate": 6.14713216957606e-06, "loss": 0.0001, "step": 27790 }, { "epoch": 3.466334164588529, "grad_norm": 0.0003502692561596632, "learning_rate": 6.142144638403991e-06, "loss": 0.0001, "step": 27800 }, { "epoch": 3.4675810473815463, "grad_norm": 0.0015125253703445196, "learning_rate": 6.13715710723192e-06, "loss": 0.0, "step": 27810 }, { "epoch": 3.4688279301745637, "grad_norm": 0.00017602364823687822, "learning_rate": 6.132169576059851e-06, "loss": 0.0, "step": 27820 }, { "epoch": 3.470074812967581, "grad_norm": 0.0025234976783394814, "learning_rate": 6.12718204488778e-06, "loss": 0.0, "step": 27830 }, { "epoch": 3.4713216957605986, "grad_norm": 0.0006344380090013146, "learning_rate": 6.122194513715711e-06, "loss": 0.0, "step": 27840 }, { "epoch": 3.472568578553616, "grad_norm": 0.0031119780614972115, "learning_rate": 6.117206982543641e-06, "loss": 0.0, "step": 27850 }, { "epoch": 3.4738154613466334, "grad_norm": 0.0002057456149486825, "learning_rate": 6.112219451371571e-06, "loss": 0.0011, "step": 27860 }, { "epoch": 3.475062344139651, "grad_norm": 0.0012670621508732438, "learning_rate": 6.107231920199502e-06, "loss": 0.0003, "step": 27870 }, { "epoch": 3.4763092269326683, "grad_norm": 0.0004024482041131705, "learning_rate": 6.102244389027432e-06, "loss": 0.0059, "step": 27880 }, { "epoch": 3.4775561097256857, "grad_norm": 53.51555252075195, "learning_rate": 6.097256857855362e-06, "loss": 0.0405, "step": 27890 }, { "epoch": 3.478802992518703, "grad_norm": 0.00030399305978789926, "learning_rate": 6.092269326683292e-06, "loss": 0.0, "step": 27900 }, { "epoch": 3.4800498753117206, "grad_norm": 0.058239106088876724, "learning_rate": 6.087281795511223e-06, "loss": 0.0238, "step": 27910 }, { "epoch": 3.481296758104738, "grad_norm": 0.0014490766916424036, "learning_rate": 6.0822942643391526e-06, "loss": 0.0, "step": 27920 }, { "epoch": 3.4825436408977555, "grad_norm": 0.0002840702945832163, "learning_rate": 6.077306733167083e-06, "loss": 0.0029, "step": 27930 }, { "epoch": 3.483790523690773, "grad_norm": 0.005863568279892206, "learning_rate": 6.072319201995013e-06, "loss": 0.0, "step": 27940 }, { "epoch": 3.4850374064837903, "grad_norm": 0.00029472613823600113, "learning_rate": 6.0673316708229436e-06, "loss": 0.0, "step": 27950 }, { "epoch": 3.4862842892768082, "grad_norm": 0.0002991229121107608, "learning_rate": 6.062344139650873e-06, "loss": 0.0, "step": 27960 }, { "epoch": 3.4875311720698257, "grad_norm": 0.0006273513427004218, "learning_rate": 6.057356608478804e-06, "loss": 0.0002, "step": 27970 }, { "epoch": 3.488778054862843, "grad_norm": 0.0022930046543478966, "learning_rate": 6.052369077306734e-06, "loss": 0.0002, "step": 27980 }, { "epoch": 3.4900249376558605, "grad_norm": 0.00040200044168159366, "learning_rate": 6.047381546134664e-06, "loss": 0.0, "step": 27990 }, { "epoch": 3.491271820448878, "grad_norm": 0.0002788232814054936, "learning_rate": 6.042394014962594e-06, "loss": 0.0032, "step": 28000 }, { "epoch": 3.4925187032418954, "grad_norm": 0.0012403487926349044, "learning_rate": 6.037406483790525e-06, "loss": 0.0, "step": 28010 }, { "epoch": 3.493765586034913, "grad_norm": 0.0010406679939478636, "learning_rate": 6.032418952618454e-06, "loss": 0.0553, "step": 28020 }, { "epoch": 3.4950124688279303, "grad_norm": 0.0004845515941269696, "learning_rate": 6.027431421446385e-06, "loss": 0.0039, "step": 28030 }, { "epoch": 3.4962593516209477, "grad_norm": 0.11747261136770248, "learning_rate": 6.022443890274314e-06, "loss": 0.001, "step": 28040 }, { "epoch": 3.497506234413965, "grad_norm": 0.0910341814160347, "learning_rate": 6.017456359102245e-06, "loss": 0.0, "step": 28050 }, { "epoch": 3.4987531172069826, "grad_norm": 0.0005173732060939074, "learning_rate": 6.0124688279301745e-06, "loss": 0.0159, "step": 28060 }, { "epoch": 3.5, "grad_norm": 0.002428569132462144, "learning_rate": 6.007481296758105e-06, "loss": 0.0007, "step": 28070 }, { "epoch": 3.5012468827930174, "grad_norm": 0.00020148402836639434, "learning_rate": 6.002493765586035e-06, "loss": 0.0, "step": 28080 }, { "epoch": 3.502493765586035, "grad_norm": 26.454116821289062, "learning_rate": 5.9975062344139655e-06, "loss": 0.0262, "step": 28090 }, { "epoch": 3.5037406483790523, "grad_norm": 0.00031432858668267727, "learning_rate": 5.992518703241895e-06, "loss": 0.0001, "step": 28100 }, { "epoch": 3.5049875311720697, "grad_norm": 0.00043840528815053403, "learning_rate": 5.987531172069826e-06, "loss": 0.0011, "step": 28110 }, { "epoch": 3.506234413965087, "grad_norm": 0.007529322523623705, "learning_rate": 5.9825436408977565e-06, "loss": 0.0, "step": 28120 }, { "epoch": 3.5074812967581046, "grad_norm": 0.00011467208241811022, "learning_rate": 5.977556109725686e-06, "loss": 0.0, "step": 28130 }, { "epoch": 3.508728179551122, "grad_norm": 0.002503705210983753, "learning_rate": 5.972568578553617e-06, "loss": 0.0522, "step": 28140 }, { "epoch": 3.5099750623441395, "grad_norm": 0.01109258271753788, "learning_rate": 5.967581047381547e-06, "loss": 0.0007, "step": 28150 }, { "epoch": 3.511221945137157, "grad_norm": 0.002344650449231267, "learning_rate": 5.962593516209477e-06, "loss": 0.0001, "step": 28160 }, { "epoch": 3.5124688279301743, "grad_norm": 0.0004216936940792948, "learning_rate": 5.957605985037407e-06, "loss": 0.0543, "step": 28170 }, { "epoch": 3.5137157107231918, "grad_norm": 0.0004381619510240853, "learning_rate": 5.952618453865338e-06, "loss": 0.0217, "step": 28180 }, { "epoch": 3.514962593516209, "grad_norm": 0.00039310386637225747, "learning_rate": 5.947630922693267e-06, "loss": 0.0498, "step": 28190 }, { "epoch": 3.516209476309227, "grad_norm": 0.0005476093501783907, "learning_rate": 5.942643391521198e-06, "loss": 0.0009, "step": 28200 }, { "epoch": 3.5174563591022445, "grad_norm": 0.00018081016605719924, "learning_rate": 5.937655860349128e-06, "loss": 0.0, "step": 28210 }, { "epoch": 3.518703241895262, "grad_norm": 23.967710494995117, "learning_rate": 5.9326683291770584e-06, "loss": 0.0058, "step": 28220 }, { "epoch": 3.5199501246882794, "grad_norm": 0.0003085663774982095, "learning_rate": 5.927680798004987e-06, "loss": 0.0055, "step": 28230 }, { "epoch": 3.521197007481297, "grad_norm": 0.0006709218141622841, "learning_rate": 5.922693266832919e-06, "loss": 0.0149, "step": 28240 }, { "epoch": 3.5224438902743143, "grad_norm": 0.00043505855137482285, "learning_rate": 5.917705735660848e-06, "loss": 0.0757, "step": 28250 }, { "epoch": 3.5236907730673317, "grad_norm": 0.00016124600369948894, "learning_rate": 5.912718204488778e-06, "loss": 0.0001, "step": 28260 }, { "epoch": 3.524937655860349, "grad_norm": 0.008092806674540043, "learning_rate": 5.907730673316708e-06, "loss": 0.0004, "step": 28270 }, { "epoch": 3.5261845386533666, "grad_norm": 0.0011323315557092428, "learning_rate": 5.902743142144639e-06, "loss": 0.0, "step": 28280 }, { "epoch": 3.527431421446384, "grad_norm": 0.08847978711128235, "learning_rate": 5.8977556109725685e-06, "loss": 0.0344, "step": 28290 }, { "epoch": 3.5286783042394014, "grad_norm": 0.0006359029794111848, "learning_rate": 5.892768079800499e-06, "loss": 0.0, "step": 28300 }, { "epoch": 3.529925187032419, "grad_norm": 0.0006071751704439521, "learning_rate": 5.887780548628429e-06, "loss": 0.0331, "step": 28310 }, { "epoch": 3.5311720698254363, "grad_norm": 0.0004466983664315194, "learning_rate": 5.8827930174563595e-06, "loss": 0.0, "step": 28320 }, { "epoch": 3.5324189526184537, "grad_norm": 0.0018139990279451013, "learning_rate": 5.877805486284289e-06, "loss": 0.0, "step": 28330 }, { "epoch": 3.533665835411471, "grad_norm": 0.0002320937201147899, "learning_rate": 5.87281795511222e-06, "loss": 0.0003, "step": 28340 }, { "epoch": 3.534912718204489, "grad_norm": 0.0004413308924995363, "learning_rate": 5.86783042394015e-06, "loss": 0.0, "step": 28350 }, { "epoch": 3.5361596009975065, "grad_norm": 0.27286967635154724, "learning_rate": 5.86284289276808e-06, "loss": 0.0516, "step": 28360 }, { "epoch": 3.537406483790524, "grad_norm": 0.00041090993909165263, "learning_rate": 5.857855361596011e-06, "loss": 0.0285, "step": 28370 }, { "epoch": 3.5386533665835413, "grad_norm": 0.00043929347884841263, "learning_rate": 5.852867830423941e-06, "loss": 0.0, "step": 28380 }, { "epoch": 3.539900249376559, "grad_norm": 1.708524227142334, "learning_rate": 5.847880299251871e-06, "loss": 0.0391, "step": 28390 }, { "epoch": 3.541147132169576, "grad_norm": 0.0006791963241994381, "learning_rate": 5.842892768079801e-06, "loss": 0.0002, "step": 28400 }, { "epoch": 3.5423940149625937, "grad_norm": 0.0003175217134412378, "learning_rate": 5.837905236907732e-06, "loss": 0.0, "step": 28410 }, { "epoch": 3.543640897755611, "grad_norm": 0.00026248840731568635, "learning_rate": 5.8329177057356615e-06, "loss": 0.0011, "step": 28420 }, { "epoch": 3.5448877805486285, "grad_norm": 28.28713607788086, "learning_rate": 5.827930174563592e-06, "loss": 0.0493, "step": 28430 }, { "epoch": 3.546134663341646, "grad_norm": 0.0004381363105494529, "learning_rate": 5.822942643391522e-06, "loss": 0.0476, "step": 28440 }, { "epoch": 3.5473815461346634, "grad_norm": 0.00020880838565062732, "learning_rate": 5.8179551122194525e-06, "loss": 0.0, "step": 28450 }, { "epoch": 3.548628428927681, "grad_norm": 0.0002543655864428729, "learning_rate": 5.8129675810473814e-06, "loss": 0.0513, "step": 28460 }, { "epoch": 3.5498753117206983, "grad_norm": 0.0017961261328309774, "learning_rate": 5.807980049875313e-06, "loss": 0.0253, "step": 28470 }, { "epoch": 3.5511221945137157, "grad_norm": 0.0062349289655685425, "learning_rate": 5.802992518703242e-06, "loss": 0.0, "step": 28480 }, { "epoch": 3.552369077306733, "grad_norm": 45.63996124267578, "learning_rate": 5.7980049875311725e-06, "loss": 0.0339, "step": 28490 }, { "epoch": 3.5536159600997506, "grad_norm": 0.021966515108942986, "learning_rate": 5.793017456359102e-06, "loss": 0.0, "step": 28500 }, { "epoch": 3.554862842892768, "grad_norm": 0.007263632025569677, "learning_rate": 5.788029925187033e-06, "loss": 0.0001, "step": 28510 }, { "epoch": 3.5561097256857854, "grad_norm": 0.001259212614968419, "learning_rate": 5.783042394014963e-06, "loss": 0.0, "step": 28520 }, { "epoch": 3.557356608478803, "grad_norm": 0.0005705059738829732, "learning_rate": 5.778054862842893e-06, "loss": 0.0, "step": 28530 }, { "epoch": 3.5586034912718203, "grad_norm": 0.012875759974122047, "learning_rate": 5.773067331670823e-06, "loss": 0.0002, "step": 28540 }, { "epoch": 3.5598503740648377, "grad_norm": 0.002192272339016199, "learning_rate": 5.768079800498754e-06, "loss": 0.0009, "step": 28550 }, { "epoch": 3.561097256857855, "grad_norm": 0.00031267275335267186, "learning_rate": 5.763092269326683e-06, "loss": 0.0152, "step": 28560 }, { "epoch": 3.5623441396508726, "grad_norm": 0.00037283176789060235, "learning_rate": 5.758104738154614e-06, "loss": 0.0001, "step": 28570 }, { "epoch": 3.56359102244389, "grad_norm": 0.0001530810259282589, "learning_rate": 5.753117206982544e-06, "loss": 0.0, "step": 28580 }, { "epoch": 3.5648379052369075, "grad_norm": 0.0004657857643906027, "learning_rate": 5.748129675810474e-06, "loss": 0.0034, "step": 28590 }, { "epoch": 3.5660847880299253, "grad_norm": 0.000526457850355655, "learning_rate": 5.743142144638404e-06, "loss": 0.0, "step": 28600 }, { "epoch": 3.567331670822943, "grad_norm": 0.0016893032006919384, "learning_rate": 5.738154613466335e-06, "loss": 0.0001, "step": 28610 }, { "epoch": 3.56857855361596, "grad_norm": 0.0004379678575787693, "learning_rate": 5.733167082294265e-06, "loss": 0.0001, "step": 28620 }, { "epoch": 3.5698254364089776, "grad_norm": 0.00044744761544279754, "learning_rate": 5.728179551122195e-06, "loss": 0.0, "step": 28630 }, { "epoch": 3.571072319201995, "grad_norm": 0.0023382350336760283, "learning_rate": 5.723192019950126e-06, "loss": 0.0, "step": 28640 }, { "epoch": 3.5723192019950125, "grad_norm": 0.0007506693946197629, "learning_rate": 5.718204488778056e-06, "loss": 0.0, "step": 28650 }, { "epoch": 3.57356608478803, "grad_norm": 0.0003731877659447491, "learning_rate": 5.713216957605986e-06, "loss": 0.0001, "step": 28660 }, { "epoch": 3.5748129675810474, "grad_norm": 0.0005579161224886775, "learning_rate": 5.708229426433915e-06, "loss": 0.0001, "step": 28670 }, { "epoch": 3.576059850374065, "grad_norm": 0.000176784407813102, "learning_rate": 5.703241895261847e-06, "loss": 0.0002, "step": 28680 }, { "epoch": 3.5773067331670823, "grad_norm": 0.0005845030536875129, "learning_rate": 5.6982543640897755e-06, "loss": 0.0216, "step": 28690 }, { "epoch": 3.5785536159600997, "grad_norm": 4.049169540405273, "learning_rate": 5.693266832917706e-06, "loss": 0.0007, "step": 28700 }, { "epoch": 3.579800498753117, "grad_norm": 0.0001853818102972582, "learning_rate": 5.688279301745636e-06, "loss": 0.0041, "step": 28710 }, { "epoch": 3.5810473815461346, "grad_norm": 0.003477283287793398, "learning_rate": 5.6832917705735665e-06, "loss": 0.0, "step": 28720 }, { "epoch": 3.582294264339152, "grad_norm": 0.002856282517313957, "learning_rate": 5.678304239401496e-06, "loss": 0.0, "step": 28730 }, { "epoch": 3.5835411471321694, "grad_norm": 0.00022709465702064335, "learning_rate": 5.673316708229427e-06, "loss": 0.0001, "step": 28740 }, { "epoch": 3.5847880299251873, "grad_norm": 0.0019931760616600513, "learning_rate": 5.668329177057357e-06, "loss": 0.0, "step": 28750 }, { "epoch": 3.5860349127182047, "grad_norm": 0.0034395332913845778, "learning_rate": 5.663341645885287e-06, "loss": 0.0, "step": 28760 }, { "epoch": 3.587281795511222, "grad_norm": 0.0003281449025962502, "learning_rate": 5.658354114713217e-06, "loss": 0.0, "step": 28770 }, { "epoch": 3.5885286783042396, "grad_norm": 0.00019891714327968657, "learning_rate": 5.653366583541148e-06, "loss": 0.0, "step": 28780 }, { "epoch": 3.589775561097257, "grad_norm": 0.0002504756848793477, "learning_rate": 5.6483790523690775e-06, "loss": 0.0, "step": 28790 }, { "epoch": 3.5910224438902745, "grad_norm": 0.0025514832232147455, "learning_rate": 5.643391521197008e-06, "loss": 0.0, "step": 28800 }, { "epoch": 3.592269326683292, "grad_norm": 0.00017150930943898857, "learning_rate": 5.638403990024938e-06, "loss": 0.0, "step": 28810 }, { "epoch": 3.5935162094763093, "grad_norm": 0.153523251414299, "learning_rate": 5.6334164588528685e-06, "loss": 0.0001, "step": 28820 }, { "epoch": 3.5947630922693268, "grad_norm": 0.00042072118958458304, "learning_rate": 5.628428927680798e-06, "loss": 0.0, "step": 28830 }, { "epoch": 3.596009975062344, "grad_norm": 0.0004099764919374138, "learning_rate": 5.623441396508729e-06, "loss": 0.0404, "step": 28840 }, { "epoch": 3.5972568578553616, "grad_norm": 0.00011178813292644918, "learning_rate": 5.618453865336659e-06, "loss": 0.0001, "step": 28850 }, { "epoch": 3.598503740648379, "grad_norm": 0.005052752792835236, "learning_rate": 5.613466334164589e-06, "loss": 0.0, "step": 28860 }, { "epoch": 3.5997506234413965, "grad_norm": 0.00014898031076882035, "learning_rate": 5.60847880299252e-06, "loss": 0.0001, "step": 28870 }, { "epoch": 3.600997506234414, "grad_norm": 0.001589166116900742, "learning_rate": 5.60349127182045e-06, "loss": 0.0727, "step": 28880 }, { "epoch": 3.6022443890274314, "grad_norm": 0.0010678882244974375, "learning_rate": 5.59850374064838e-06, "loss": 0.0553, "step": 28890 }, { "epoch": 3.603491271820449, "grad_norm": 0.0002836325438693166, "learning_rate": 5.593516209476309e-06, "loss": 0.0, "step": 28900 }, { "epoch": 3.6047381546134662, "grad_norm": 0.0003612026630435139, "learning_rate": 5.588528678304241e-06, "loss": 0.0001, "step": 28910 }, { "epoch": 3.6059850374064837, "grad_norm": 0.0014816455077379942, "learning_rate": 5.58354114713217e-06, "loss": 0.0009, "step": 28920 }, { "epoch": 3.607231920199501, "grad_norm": 0.00015595743025187403, "learning_rate": 5.5785536159601e-06, "loss": 0.0001, "step": 28930 }, { "epoch": 3.6084788029925186, "grad_norm": 0.0009463661117479205, "learning_rate": 5.57356608478803e-06, "loss": 0.0, "step": 28940 }, { "epoch": 3.609725685785536, "grad_norm": 0.11780844628810883, "learning_rate": 5.568578553615961e-06, "loss": 0.0004, "step": 28950 }, { "epoch": 3.6109725685785534, "grad_norm": 0.0011268043890595436, "learning_rate": 5.56359102244389e-06, "loss": 0.0, "step": 28960 }, { "epoch": 3.612219451371571, "grad_norm": 0.0016340049915015697, "learning_rate": 5.558603491271821e-06, "loss": 0.0, "step": 28970 }, { "epoch": 3.6134663341645883, "grad_norm": 0.004008717834949493, "learning_rate": 5.553615960099751e-06, "loss": 0.0, "step": 28980 }, { "epoch": 3.6147132169576057, "grad_norm": 0.0002216768334619701, "learning_rate": 5.548628428927681e-06, "loss": 0.0, "step": 28990 }, { "epoch": 3.6159600997506236, "grad_norm": 0.00013463239884003997, "learning_rate": 5.543640897755611e-06, "loss": 0.0003, "step": 29000 }, { "epoch": 3.617206982543641, "grad_norm": 0.00029779941542074084, "learning_rate": 5.538653366583542e-06, "loss": 0.0, "step": 29010 }, { "epoch": 3.6184538653366585, "grad_norm": 0.00019660868565551937, "learning_rate": 5.5336658354114716e-06, "loss": 0.0042, "step": 29020 }, { "epoch": 3.619700748129676, "grad_norm": 0.0009630105341784656, "learning_rate": 5.528678304239402e-06, "loss": 0.0, "step": 29030 }, { "epoch": 3.6209476309226933, "grad_norm": 0.03896058723330498, "learning_rate": 5.523690773067332e-06, "loss": 0.0, "step": 29040 }, { "epoch": 3.6221945137157108, "grad_norm": 0.0003263501566834748, "learning_rate": 5.5187032418952626e-06, "loss": 0.0077, "step": 29050 }, { "epoch": 3.623441396508728, "grad_norm": 0.003972560167312622, "learning_rate": 5.513715710723192e-06, "loss": 0.0, "step": 29060 }, { "epoch": 3.6246882793017456, "grad_norm": 0.001412399928085506, "learning_rate": 5.508728179551123e-06, "loss": 0.0, "step": 29070 }, { "epoch": 3.625935162094763, "grad_norm": 0.00019415126007515937, "learning_rate": 5.503740648379052e-06, "loss": 0.0039, "step": 29080 }, { "epoch": 3.6271820448877805, "grad_norm": 0.00024378967646043748, "learning_rate": 5.498753117206983e-06, "loss": 0.0444, "step": 29090 }, { "epoch": 3.628428927680798, "grad_norm": 0.0002585103502497077, "learning_rate": 5.493765586034914e-06, "loss": 0.0, "step": 29100 }, { "epoch": 3.6296758104738154, "grad_norm": 0.0010230403859168291, "learning_rate": 5.488778054862843e-06, "loss": 0.0, "step": 29110 }, { "epoch": 3.630922693266833, "grad_norm": 0.00033736188197508454, "learning_rate": 5.483790523690774e-06, "loss": 0.0, "step": 29120 }, { "epoch": 3.6321695760598502, "grad_norm": 0.00012601564230863005, "learning_rate": 5.478802992518703e-06, "loss": 0.0, "step": 29130 }, { "epoch": 3.6334164588528677, "grad_norm": 0.0002682583872228861, "learning_rate": 5.473815461346634e-06, "loss": 0.0, "step": 29140 }, { "epoch": 3.6346633416458856, "grad_norm": 0.00011863713734783232, "learning_rate": 5.468827930174564e-06, "loss": 0.0, "step": 29150 }, { "epoch": 3.635910224438903, "grad_norm": 0.0008033208432607353, "learning_rate": 5.463840399002494e-06, "loss": 0.0, "step": 29160 }, { "epoch": 3.6371571072319204, "grad_norm": 0.00037883687764406204, "learning_rate": 5.458852867830424e-06, "loss": 0.003, "step": 29170 }, { "epoch": 3.638403990024938, "grad_norm": 3.7584238052368164, "learning_rate": 5.453865336658355e-06, "loss": 0.0423, "step": 29180 }, { "epoch": 3.6396508728179553, "grad_norm": 0.0013585427077487111, "learning_rate": 5.4488778054862845e-06, "loss": 0.0391, "step": 29190 }, { "epoch": 3.6408977556109727, "grad_norm": 0.0007598469965159893, "learning_rate": 5.443890274314215e-06, "loss": 0.0022, "step": 29200 }, { "epoch": 3.64214463840399, "grad_norm": 0.0015173618448898196, "learning_rate": 5.438902743142145e-06, "loss": 0.0, "step": 29210 }, { "epoch": 3.6433915211970076, "grad_norm": 0.011424501426517963, "learning_rate": 5.4339152119700755e-06, "loss": 0.0, "step": 29220 }, { "epoch": 3.644638403990025, "grad_norm": 0.01082046888768673, "learning_rate": 5.428927680798005e-06, "loss": 0.0017, "step": 29230 }, { "epoch": 3.6458852867830425, "grad_norm": 0.0007360957097262144, "learning_rate": 5.423940149625936e-06, "loss": 0.0002, "step": 29240 }, { "epoch": 3.64713216957606, "grad_norm": 0.0010136303026229143, "learning_rate": 5.418952618453866e-06, "loss": 0.0, "step": 29250 }, { "epoch": 3.6483790523690773, "grad_norm": 9.919052124023438, "learning_rate": 5.413965087281796e-06, "loss": 0.0114, "step": 29260 }, { "epoch": 3.6496259351620948, "grad_norm": 0.17939278483390808, "learning_rate": 5.408977556109726e-06, "loss": 0.0007, "step": 29270 }, { "epoch": 3.650872817955112, "grad_norm": 0.00015363430429715663, "learning_rate": 5.403990024937657e-06, "loss": 0.0003, "step": 29280 }, { "epoch": 3.6521197007481296, "grad_norm": 0.0038780230097472668, "learning_rate": 5.3990024937655864e-06, "loss": 0.0, "step": 29290 }, { "epoch": 3.653366583541147, "grad_norm": 0.00012658373452723026, "learning_rate": 5.394014962593517e-06, "loss": 0.0, "step": 29300 }, { "epoch": 3.6546134663341645, "grad_norm": 0.02487073466181755, "learning_rate": 5.389027431421446e-06, "loss": 0.0001, "step": 29310 }, { "epoch": 3.655860349127182, "grad_norm": 0.00010112265590578318, "learning_rate": 5.3840399002493774e-06, "loss": 0.0108, "step": 29320 }, { "epoch": 3.6571072319201994, "grad_norm": 0.0008333465084433556, "learning_rate": 5.379052369077306e-06, "loss": 0.0, "step": 29330 }, { "epoch": 3.658354114713217, "grad_norm": 0.007760242559015751, "learning_rate": 5.374064837905237e-06, "loss": 0.01, "step": 29340 }, { "epoch": 3.6596009975062342, "grad_norm": 0.00010809869854710996, "learning_rate": 5.3690773067331685e-06, "loss": 0.0001, "step": 29350 }, { "epoch": 3.6608478802992517, "grad_norm": 0.001450029551051557, "learning_rate": 5.364089775561097e-06, "loss": 0.0, "step": 29360 }, { "epoch": 3.662094763092269, "grad_norm": 13.586308479309082, "learning_rate": 5.359102244389028e-06, "loss": 0.002, "step": 29370 }, { "epoch": 3.6633416458852865, "grad_norm": 0.001481684041209519, "learning_rate": 5.354114713216958e-06, "loss": 0.0, "step": 29380 }, { "epoch": 3.664588528678304, "grad_norm": 0.0003070503589697182, "learning_rate": 5.349127182044888e-06, "loss": 0.0, "step": 29390 }, { "epoch": 3.665835411471322, "grad_norm": 0.0039036672096699476, "learning_rate": 5.344139650872818e-06, "loss": 0.0, "step": 29400 }, { "epoch": 3.6670822942643393, "grad_norm": 0.0003815802629105747, "learning_rate": 5.339152119700749e-06, "loss": 0.0128, "step": 29410 }, { "epoch": 3.6683291770573567, "grad_norm": 0.0005427736323326826, "learning_rate": 5.3341645885286786e-06, "loss": 0.024, "step": 29420 }, { "epoch": 3.669576059850374, "grad_norm": 0.00020941866387147456, "learning_rate": 5.329177057356609e-06, "loss": 0.0, "step": 29430 }, { "epoch": 3.6708229426433916, "grad_norm": 0.0005622466560453176, "learning_rate": 5.324189526184539e-06, "loss": 0.0002, "step": 29440 }, { "epoch": 3.672069825436409, "grad_norm": 0.00019208036246709526, "learning_rate": 5.3192019950124696e-06, "loss": 0.003, "step": 29450 }, { "epoch": 3.6733167082294265, "grad_norm": 0.00985199399292469, "learning_rate": 5.314214463840399e-06, "loss": 0.0054, "step": 29460 }, { "epoch": 3.674563591022444, "grad_norm": 0.00011043824633816257, "learning_rate": 5.30922693266833e-06, "loss": 0.0216, "step": 29470 }, { "epoch": 3.6758104738154613, "grad_norm": 0.0001325913763139397, "learning_rate": 5.30423940149626e-06, "loss": 0.0, "step": 29480 }, { "epoch": 3.6770573566084788, "grad_norm": 0.005065642762929201, "learning_rate": 5.29925187032419e-06, "loss": 0.0281, "step": 29490 }, { "epoch": 3.678304239401496, "grad_norm": 0.0005468591116368771, "learning_rate": 5.29426433915212e-06, "loss": 0.0, "step": 29500 }, { "epoch": 3.6795511221945136, "grad_norm": 0.0002817381464410573, "learning_rate": 5.289276807980051e-06, "loss": 0.0001, "step": 29510 }, { "epoch": 3.680798004987531, "grad_norm": 0.0014619999565184116, "learning_rate": 5.2842892768079805e-06, "loss": 0.0, "step": 29520 }, { "epoch": 3.6820448877805485, "grad_norm": 0.00014814763562753797, "learning_rate": 5.279301745635911e-06, "loss": 0.0015, "step": 29530 }, { "epoch": 3.683291770573566, "grad_norm": 0.001574231660924852, "learning_rate": 5.27431421446384e-06, "loss": 0.0001, "step": 29540 }, { "epoch": 3.684538653366584, "grad_norm": 0.00010609447053866461, "learning_rate": 5.2693266832917715e-06, "loss": 0.0, "step": 29550 }, { "epoch": 3.6857855361596013, "grad_norm": 0.00022953233565203846, "learning_rate": 5.2643391521197004e-06, "loss": 0.0, "step": 29560 }, { "epoch": 3.6870324189526187, "grad_norm": 9.888015483738855e-05, "learning_rate": 5.259351620947631e-06, "loss": 0.0002, "step": 29570 }, { "epoch": 3.688279301745636, "grad_norm": 0.00013748396304436028, "learning_rate": 5.254364089775561e-06, "loss": 0.0, "step": 29580 }, { "epoch": 3.6895261845386536, "grad_norm": 0.00013026520900893956, "learning_rate": 5.2493765586034915e-06, "loss": 0.0219, "step": 29590 }, { "epoch": 3.690773067331671, "grad_norm": 0.0008388891001231968, "learning_rate": 5.244389027431422e-06, "loss": 0.0617, "step": 29600 }, { "epoch": 3.6920199501246884, "grad_norm": 0.0006873853853903711, "learning_rate": 5.239401496259352e-06, "loss": 0.0381, "step": 29610 }, { "epoch": 3.693266832917706, "grad_norm": 7.518016338348389, "learning_rate": 5.2344139650872825e-06, "loss": 0.0623, "step": 29620 }, { "epoch": 3.6945137157107233, "grad_norm": 0.0008727543754503131, "learning_rate": 5.229426433915212e-06, "loss": 0.0, "step": 29630 }, { "epoch": 3.6957605985037407, "grad_norm": 0.005849814973771572, "learning_rate": 5.224438902743143e-06, "loss": 0.055, "step": 29640 }, { "epoch": 3.697007481296758, "grad_norm": 0.0005927455495111644, "learning_rate": 5.219451371571073e-06, "loss": 0.0, "step": 29650 }, { "epoch": 3.6982543640897756, "grad_norm": 0.0024080451112240553, "learning_rate": 5.214463840399003e-06, "loss": 0.0135, "step": 29660 }, { "epoch": 3.699501246882793, "grad_norm": 0.0016026614466682076, "learning_rate": 5.209476309226933e-06, "loss": 0.0504, "step": 29670 }, { "epoch": 3.7007481296758105, "grad_norm": 0.0008264243369922042, "learning_rate": 5.204488778054864e-06, "loss": 0.0024, "step": 29680 }, { "epoch": 3.701995012468828, "grad_norm": 0.0018659487832337618, "learning_rate": 5.199501246882793e-06, "loss": 0.0001, "step": 29690 }, { "epoch": 3.7032418952618453, "grad_norm": 0.018036233261227608, "learning_rate": 5.194513715710724e-06, "loss": 0.0001, "step": 29700 }, { "epoch": 3.7044887780548628, "grad_norm": 0.017439868301153183, "learning_rate": 5.189526184538654e-06, "loss": 0.0348, "step": 29710 }, { "epoch": 3.70573566084788, "grad_norm": 0.0008811484440229833, "learning_rate": 5.1845386533665844e-06, "loss": 0.0, "step": 29720 }, { "epoch": 3.7069825436408976, "grad_norm": 0.00037909552338533103, "learning_rate": 5.179551122194514e-06, "loss": 0.0, "step": 29730 }, { "epoch": 3.708229426433915, "grad_norm": 0.00022248552704695612, "learning_rate": 5.174563591022445e-06, "loss": 0.0, "step": 29740 }, { "epoch": 3.7094763092269325, "grad_norm": 0.014188253320753574, "learning_rate": 5.169576059850374e-06, "loss": 0.0001, "step": 29750 }, { "epoch": 3.71072319201995, "grad_norm": 0.013957753777503967, "learning_rate": 5.164588528678305e-06, "loss": 0.0, "step": 29760 }, { "epoch": 3.7119700748129674, "grad_norm": 0.0008046287694014609, "learning_rate": 5.159600997506234e-06, "loss": 0.0, "step": 29770 }, { "epoch": 3.713216957605985, "grad_norm": 0.0015382746933028102, "learning_rate": 5.154613466334165e-06, "loss": 0.0004, "step": 29780 }, { "epoch": 3.7144638403990022, "grad_norm": 0.0011267090449109674, "learning_rate": 5.1496259351620945e-06, "loss": 0.0, "step": 29790 }, { "epoch": 3.71571072319202, "grad_norm": 0.0019451412372291088, "learning_rate": 5.144638403990025e-06, "loss": 0.0, "step": 29800 }, { "epoch": 3.7169576059850375, "grad_norm": 0.0013380858581513166, "learning_rate": 5.139650872817955e-06, "loss": 0.0003, "step": 29810 }, { "epoch": 3.718204488778055, "grad_norm": 0.0008705657673999667, "learning_rate": 5.1346633416458855e-06, "loss": 0.0001, "step": 29820 }, { "epoch": 3.7194513715710724, "grad_norm": 0.0007306481129489839, "learning_rate": 5.129675810473815e-06, "loss": 0.0, "step": 29830 }, { "epoch": 3.72069825436409, "grad_norm": 0.0004138894146308303, "learning_rate": 5.124688279301746e-06, "loss": 0.0, "step": 29840 }, { "epoch": 3.7219451371571073, "grad_norm": 0.0006331994663923979, "learning_rate": 5.1197007481296766e-06, "loss": 0.0255, "step": 29850 }, { "epoch": 3.7231920199501247, "grad_norm": 0.0023739382158964872, "learning_rate": 5.114713216957606e-06, "loss": 0.0, "step": 29860 }, { "epoch": 3.724438902743142, "grad_norm": 0.0019165941048413515, "learning_rate": 5.109725685785537e-06, "loss": 0.0, "step": 29870 }, { "epoch": 3.7256857855361596, "grad_norm": 0.00022552658629138023, "learning_rate": 5.104738154613467e-06, "loss": 0.0, "step": 29880 }, { "epoch": 3.726932668329177, "grad_norm": 0.0007717168191447854, "learning_rate": 5.099750623441397e-06, "loss": 0.0, "step": 29890 }, { "epoch": 3.7281795511221945, "grad_norm": 0.027365686371922493, "learning_rate": 5.094763092269327e-06, "loss": 0.0, "step": 29900 }, { "epoch": 3.729426433915212, "grad_norm": 0.00012416514800861478, "learning_rate": 5.089775561097258e-06, "loss": 0.0, "step": 29910 }, { "epoch": 3.7306733167082293, "grad_norm": 0.0002640146412886679, "learning_rate": 5.0847880299251875e-06, "loss": 0.0, "step": 29920 }, { "epoch": 3.7319201995012468, "grad_norm": 0.00022865763457957655, "learning_rate": 5.079800498753118e-06, "loss": 0.0511, "step": 29930 }, { "epoch": 3.733167082294264, "grad_norm": 0.007766488939523697, "learning_rate": 5.074812967581048e-06, "loss": 0.0, "step": 29940 }, { "epoch": 3.734413965087282, "grad_norm": 0.003563706064596772, "learning_rate": 5.0698254364089785e-06, "loss": 0.0365, "step": 29950 }, { "epoch": 3.7356608478802995, "grad_norm": 0.0021255253814160824, "learning_rate": 5.064837905236908e-06, "loss": 0.0001, "step": 29960 }, { "epoch": 3.736907730673317, "grad_norm": 0.00031094119185581803, "learning_rate": 5.059850374064839e-06, "loss": 0.0146, "step": 29970 }, { "epoch": 3.7381546134663344, "grad_norm": 0.0007570157176814973, "learning_rate": 5.054862842892768e-06, "loss": 0.0001, "step": 29980 }, { "epoch": 3.739401496259352, "grad_norm": 0.0021029124036431313, "learning_rate": 5.049875311720699e-06, "loss": 0.0011, "step": 29990 }, { "epoch": 3.7406483790523692, "grad_norm": 0.0010909464908763766, "learning_rate": 5.044887780548628e-06, "loss": 0.0037, "step": 30000 }, { "epoch": 3.7418952618453867, "grad_norm": 0.0006604224909096956, "learning_rate": 5.039900249376559e-06, "loss": 0.0007, "step": 30010 }, { "epoch": 3.743142144638404, "grad_norm": 0.00019923903164453804, "learning_rate": 5.034912718204489e-06, "loss": 0.0551, "step": 30020 }, { "epoch": 3.7443890274314215, "grad_norm": 0.00044206419261172414, "learning_rate": 5.029925187032419e-06, "loss": 0.0, "step": 30030 }, { "epoch": 3.745635910224439, "grad_norm": 0.0007878596661612391, "learning_rate": 5.024937655860349e-06, "loss": 0.0001, "step": 30040 }, { "epoch": 3.7468827930174564, "grad_norm": 0.000546826864592731, "learning_rate": 5.01995012468828e-06, "loss": 0.0, "step": 30050 }, { "epoch": 3.748129675810474, "grad_norm": 0.0003211061120964587, "learning_rate": 5.014962593516209e-06, "loss": 0.0002, "step": 30060 }, { "epoch": 3.7493765586034913, "grad_norm": 9.711940219858661e-05, "learning_rate": 5.00997506234414e-06, "loss": 0.0362, "step": 30070 }, { "epoch": 3.7506234413965087, "grad_norm": 0.0026123167481273413, "learning_rate": 5.00498753117207e-06, "loss": 0.0, "step": 30080 }, { "epoch": 3.751870324189526, "grad_norm": 0.005424060393124819, "learning_rate": 5e-06, "loss": 0.0, "step": 30090 }, { "epoch": 3.7531172069825436, "grad_norm": 0.00042586520430631936, "learning_rate": 4.99501246882793e-06, "loss": 0.0006, "step": 30100 }, { "epoch": 3.754364089775561, "grad_norm": 0.0006117548909969628, "learning_rate": 4.990024937655861e-06, "loss": 0.0, "step": 30110 }, { "epoch": 3.7556109725685785, "grad_norm": 0.00031122061773203313, "learning_rate": 4.9850374064837906e-06, "loss": 0.0001, "step": 30120 }, { "epoch": 3.756857855361596, "grad_norm": 0.0004732254019472748, "learning_rate": 4.980049875311721e-06, "loss": 0.0, "step": 30130 }, { "epoch": 3.7581047381546133, "grad_norm": 0.00018621271010488272, "learning_rate": 4.975561097256858e-06, "loss": 0.0076, "step": 30140 }, { "epoch": 3.7593516209476308, "grad_norm": 0.0011101874988526106, "learning_rate": 4.970573566084788e-06, "loss": 0.0023, "step": 30150 }, { "epoch": 3.760598503740648, "grad_norm": 0.00034718040842562914, "learning_rate": 4.965586034912719e-06, "loss": 0.0009, "step": 30160 }, { "epoch": 3.7618453865336656, "grad_norm": 0.0011685335775837302, "learning_rate": 4.960598503740649e-06, "loss": 0.0001, "step": 30170 }, { "epoch": 3.763092269326683, "grad_norm": 0.0011077482486143708, "learning_rate": 4.955610972568579e-06, "loss": 0.0001, "step": 30180 }, { "epoch": 3.7643391521197005, "grad_norm": 0.00016491774294991046, "learning_rate": 4.950623441396509e-06, "loss": 0.0, "step": 30190 }, { "epoch": 3.765586034912718, "grad_norm": 0.0005304404185153544, "learning_rate": 4.94563591022444e-06, "loss": 0.0007, "step": 30200 }, { "epoch": 3.766832917705736, "grad_norm": 0.0007268518093042076, "learning_rate": 4.9406483790523695e-06, "loss": 0.0, "step": 30210 }, { "epoch": 3.7680798004987532, "grad_norm": 0.0003884352627210319, "learning_rate": 4.9356608478803e-06, "loss": 0.0, "step": 30220 }, { "epoch": 3.7693266832917707, "grad_norm": 0.03600273281335831, "learning_rate": 4.93067331670823e-06, "loss": 0.0, "step": 30230 }, { "epoch": 3.770573566084788, "grad_norm": 0.00010087342525366694, "learning_rate": 4.92568578553616e-06, "loss": 0.0, "step": 30240 }, { "epoch": 3.7718204488778055, "grad_norm": 0.0014833903405815363, "learning_rate": 4.92069825436409e-06, "loss": 0.0, "step": 30250 }, { "epoch": 3.773067331670823, "grad_norm": 0.0001319284929195419, "learning_rate": 4.91571072319202e-06, "loss": 0.0, "step": 30260 }, { "epoch": 3.7743142144638404, "grad_norm": 0.00032203830778598785, "learning_rate": 4.910723192019951e-06, "loss": 0.0001, "step": 30270 }, { "epoch": 3.775561097256858, "grad_norm": 0.0009229084243997931, "learning_rate": 4.9057356608478805e-06, "loss": 0.0392, "step": 30280 }, { "epoch": 3.7768079800498753, "grad_norm": 0.0002168490318581462, "learning_rate": 4.900748129675811e-06, "loss": 0.0, "step": 30290 }, { "epoch": 3.7780548628428927, "grad_norm": 0.00036489724880084395, "learning_rate": 4.895760598503741e-06, "loss": 0.0, "step": 30300 }, { "epoch": 3.77930174563591, "grad_norm": 0.00013854789722245187, "learning_rate": 4.8907730673316715e-06, "loss": 0.0, "step": 30310 }, { "epoch": 3.7805486284289276, "grad_norm": 0.00039082334842532873, "learning_rate": 4.885785536159601e-06, "loss": 0.0, "step": 30320 }, { "epoch": 3.781795511221945, "grad_norm": 0.0018215180607512593, "learning_rate": 4.880798004987531e-06, "loss": 0.021, "step": 30330 }, { "epoch": 3.7830423940149625, "grad_norm": 0.00018214010924566537, "learning_rate": 4.875810473815462e-06, "loss": 0.0717, "step": 30340 }, { "epoch": 3.78428927680798, "grad_norm": 0.0006838123081251979, "learning_rate": 4.8708229426433914e-06, "loss": 0.0, "step": 30350 }, { "epoch": 3.7855361596009978, "grad_norm": 0.004681428894400597, "learning_rate": 4.865835411471322e-06, "loss": 0.0463, "step": 30360 }, { "epoch": 3.786783042394015, "grad_norm": 0.0029412105213850737, "learning_rate": 4.860847880299252e-06, "loss": 0.0, "step": 30370 }, { "epoch": 3.7880299251870326, "grad_norm": 0.00215451349504292, "learning_rate": 4.8558603491271824e-06, "loss": 0.0, "step": 30380 }, { "epoch": 3.78927680798005, "grad_norm": 0.0009547757799737155, "learning_rate": 4.850872817955113e-06, "loss": 0.0, "step": 30390 }, { "epoch": 3.7905236907730675, "grad_norm": 0.0007622092380188406, "learning_rate": 4.845885286783043e-06, "loss": 0.0, "step": 30400 }, { "epoch": 3.791770573566085, "grad_norm": 0.00015706811973359436, "learning_rate": 4.8408977556109734e-06, "loss": 0.0001, "step": 30410 }, { "epoch": 3.7930174563591024, "grad_norm": 0.0003935690619982779, "learning_rate": 4.835910224438903e-06, "loss": 0.0402, "step": 30420 }, { "epoch": 3.79426433915212, "grad_norm": 0.0001764619373716414, "learning_rate": 4.830922693266834e-06, "loss": 0.0, "step": 30430 }, { "epoch": 3.7955112219451372, "grad_norm": 0.00022006318613421172, "learning_rate": 4.825935162094764e-06, "loss": 0.0, "step": 30440 }, { "epoch": 3.7967581047381547, "grad_norm": 0.004599731881171465, "learning_rate": 4.820947630922693e-06, "loss": 0.0, "step": 30450 }, { "epoch": 3.798004987531172, "grad_norm": 0.00025898893363773823, "learning_rate": 4.815960099750624e-06, "loss": 0.0, "step": 30460 }, { "epoch": 3.7992518703241895, "grad_norm": 0.1059185266494751, "learning_rate": 4.810972568578554e-06, "loss": 0.0496, "step": 30470 }, { "epoch": 3.800498753117207, "grad_norm": 0.00024309437139891088, "learning_rate": 4.805985037406484e-06, "loss": 0.0, "step": 30480 }, { "epoch": 3.8017456359102244, "grad_norm": 0.000419674877775833, "learning_rate": 4.800997506234414e-06, "loss": 0.0436, "step": 30490 }, { "epoch": 3.802992518703242, "grad_norm": 0.000803522125352174, "learning_rate": 4.796009975062345e-06, "loss": 0.0732, "step": 30500 }, { "epoch": 3.8042394014962593, "grad_norm": 0.0010885476367548108, "learning_rate": 4.7910224438902746e-06, "loss": 0.0, "step": 30510 }, { "epoch": 3.8054862842892767, "grad_norm": 0.012714964337646961, "learning_rate": 4.786034912718205e-06, "loss": 0.0049, "step": 30520 }, { "epoch": 3.806733167082294, "grad_norm": 0.00023770575353410095, "learning_rate": 4.781047381546135e-06, "loss": 0.0, "step": 30530 }, { "epoch": 3.8079800498753116, "grad_norm": 0.0004311330849304795, "learning_rate": 4.7760598503740656e-06, "loss": 0.0, "step": 30540 }, { "epoch": 3.809226932668329, "grad_norm": 0.00026620420976541936, "learning_rate": 4.771072319201995e-06, "loss": 0.0068, "step": 30550 }, { "epoch": 3.8104738154613464, "grad_norm": 0.0027953782118856907, "learning_rate": 4.766084788029925e-06, "loss": 0.0, "step": 30560 }, { "epoch": 3.811720698254364, "grad_norm": 0.0024098586291074753, "learning_rate": 4.761097256857856e-06, "loss": 0.0, "step": 30570 }, { "epoch": 3.8129675810473813, "grad_norm": 58.85100173950195, "learning_rate": 4.7561097256857855e-06, "loss": 0.0309, "step": 30580 }, { "epoch": 3.8142144638403987, "grad_norm": 0.0023890752345323563, "learning_rate": 4.751122194513716e-06, "loss": 0.0554, "step": 30590 }, { "epoch": 3.815461346633416, "grad_norm": 0.0031661030370742083, "learning_rate": 4.746134663341646e-06, "loss": 0.0, "step": 30600 }, { "epoch": 3.816708229426434, "grad_norm": 0.0001850063999881968, "learning_rate": 4.7411471321695765e-06, "loss": 0.001, "step": 30610 }, { "epoch": 3.8179551122194515, "grad_norm": 0.020202938467264175, "learning_rate": 4.736159600997506e-06, "loss": 0.0289, "step": 30620 }, { "epoch": 3.819201995012469, "grad_norm": 0.0003594272129703313, "learning_rate": 4.731172069825437e-06, "loss": 0.0438, "step": 30630 }, { "epoch": 3.8204488778054864, "grad_norm": 0.01444461289793253, "learning_rate": 4.7261845386533675e-06, "loss": 0.0, "step": 30640 }, { "epoch": 3.821695760598504, "grad_norm": 0.0005201894673518836, "learning_rate": 4.721197007481297e-06, "loss": 0.0, "step": 30650 }, { "epoch": 3.8229426433915212, "grad_norm": 0.0026227838825434446, "learning_rate": 4.716209476309228e-06, "loss": 0.0001, "step": 30660 }, { "epoch": 3.8241895261845387, "grad_norm": 0.0017443817341700196, "learning_rate": 4.711221945137158e-06, "loss": 0.0006, "step": 30670 }, { "epoch": 3.825436408977556, "grad_norm": 0.0010206064907833934, "learning_rate": 4.7062344139650875e-06, "loss": 0.0002, "step": 30680 }, { "epoch": 3.8266832917705735, "grad_norm": 0.009462667629122734, "learning_rate": 4.701246882793018e-06, "loss": 0.0001, "step": 30690 }, { "epoch": 3.827930174563591, "grad_norm": 0.0020114071667194366, "learning_rate": 4.696259351620948e-06, "loss": 0.0, "step": 30700 }, { "epoch": 3.8291770573566084, "grad_norm": 0.00016912985302042216, "learning_rate": 4.6912718204488785e-06, "loss": 0.0, "step": 30710 }, { "epoch": 3.830423940149626, "grad_norm": 0.0017660657176747918, "learning_rate": 4.686284289276808e-06, "loss": 0.0001, "step": 30720 }, { "epoch": 3.8316708229426433, "grad_norm": 0.000822771864477545, "learning_rate": 4.681296758104739e-06, "loss": 0.0, "step": 30730 }, { "epoch": 3.8329177057356607, "grad_norm": 0.0009312349720858037, "learning_rate": 4.676309226932669e-06, "loss": 0.0, "step": 30740 }, { "epoch": 3.834164588528678, "grad_norm": 0.0004992979229427874, "learning_rate": 4.671321695760599e-06, "loss": 0.0114, "step": 30750 }, { "epoch": 3.835411471321696, "grad_norm": 0.009063317440450191, "learning_rate": 4.666334164588529e-06, "loss": 0.0001, "step": 30760 }, { "epoch": 3.8366583541147135, "grad_norm": 0.0017595086246728897, "learning_rate": 4.661346633416459e-06, "loss": 0.0112, "step": 30770 }, { "epoch": 3.837905236907731, "grad_norm": 0.016222162172198296, "learning_rate": 4.6563591022443894e-06, "loss": 0.0222, "step": 30780 }, { "epoch": 3.8391521197007483, "grad_norm": 0.009317958727478981, "learning_rate": 4.651371571072319e-06, "loss": 0.0, "step": 30790 }, { "epoch": 3.8403990024937658, "grad_norm": 0.0005668486119247973, "learning_rate": 4.64638403990025e-06, "loss": 0.0362, "step": 30800 }, { "epoch": 3.841645885286783, "grad_norm": 0.00031317881075665355, "learning_rate": 4.64139650872818e-06, "loss": 0.0007, "step": 30810 }, { "epoch": 3.8428927680798006, "grad_norm": 0.00042529948404990137, "learning_rate": 4.63640897755611e-06, "loss": 0.0022, "step": 30820 }, { "epoch": 3.844139650872818, "grad_norm": 0.000505742384120822, "learning_rate": 4.63142144638404e-06, "loss": 0.0, "step": 30830 }, { "epoch": 3.8453865336658355, "grad_norm": 0.0003615685855038464, "learning_rate": 4.626433915211971e-06, "loss": 0.0001, "step": 30840 }, { "epoch": 3.846633416458853, "grad_norm": 0.0002692066482268274, "learning_rate": 4.6214463840399e-06, "loss": 0.0001, "step": 30850 }, { "epoch": 3.8478802992518704, "grad_norm": 0.00105293991509825, "learning_rate": 4.616458852867831e-06, "loss": 0.0, "step": 30860 }, { "epoch": 3.849127182044888, "grad_norm": 0.0077606611885130405, "learning_rate": 4.611471321695761e-06, "loss": 0.0025, "step": 30870 }, { "epoch": 3.8503740648379052, "grad_norm": 0.0007624849677085876, "learning_rate": 4.6064837905236905e-06, "loss": 0.0, "step": 30880 }, { "epoch": 3.8516209476309227, "grad_norm": 0.00021824889699928463, "learning_rate": 4.601496259351622e-06, "loss": 0.0, "step": 30890 }, { "epoch": 3.85286783042394, "grad_norm": 0.0004284057067707181, "learning_rate": 4.596508728179552e-06, "loss": 0.0003, "step": 30900 }, { "epoch": 3.8541147132169575, "grad_norm": 0.0011223534820601344, "learning_rate": 4.5915211970074815e-06, "loss": 0.0001, "step": 30910 }, { "epoch": 3.855361596009975, "grad_norm": 0.0003385642194189131, "learning_rate": 4.586533665835412e-06, "loss": 0.0002, "step": 30920 }, { "epoch": 3.8566084788029924, "grad_norm": 7.129866571631283e-05, "learning_rate": 4.581546134663342e-06, "loss": 0.0, "step": 30930 }, { "epoch": 3.85785536159601, "grad_norm": 0.0835379809141159, "learning_rate": 4.5765586034912726e-06, "loss": 0.0001, "step": 30940 }, { "epoch": 3.8591022443890273, "grad_norm": 0.0009691972518339753, "learning_rate": 4.571571072319202e-06, "loss": 0.0281, "step": 30950 }, { "epoch": 3.8603491271820447, "grad_norm": 0.004851430654525757, "learning_rate": 4.566583541147133e-06, "loss": 0.0, "step": 30960 }, { "epoch": 3.861596009975062, "grad_norm": 0.00014346891839522868, "learning_rate": 4.561596009975063e-06, "loss": 0.0, "step": 30970 }, { "epoch": 3.8628428927680796, "grad_norm": 0.0014384101377800107, "learning_rate": 4.556608478802993e-06, "loss": 0.0, "step": 30980 }, { "epoch": 3.864089775561097, "grad_norm": 0.0002044325665337965, "learning_rate": 4.551620947630923e-06, "loss": 0.0, "step": 30990 }, { "epoch": 3.8653366583541144, "grad_norm": 0.0003074342093896121, "learning_rate": 4.546633416458853e-06, "loss": 0.0364, "step": 31000 }, { "epoch": 3.8665835411471323, "grad_norm": 0.0023151414934545755, "learning_rate": 4.5416458852867835e-06, "loss": 0.0, "step": 31010 }, { "epoch": 3.8678304239401498, "grad_norm": 0.005174377933144569, "learning_rate": 4.536658354114713e-06, "loss": 0.0, "step": 31020 }, { "epoch": 3.869077306733167, "grad_norm": 0.002776858163997531, "learning_rate": 4.531670822942644e-06, "loss": 0.0, "step": 31030 }, { "epoch": 3.8703241895261846, "grad_norm": 0.008138804696500301, "learning_rate": 4.526683291770574e-06, "loss": 0.0, "step": 31040 }, { "epoch": 3.871571072319202, "grad_norm": 0.00016253614739980549, "learning_rate": 4.521695760598504e-06, "loss": 0.0001, "step": 31050 }, { "epoch": 3.8728179551122195, "grad_norm": 0.0030649008695036173, "learning_rate": 4.516708229426434e-06, "loss": 0.0431, "step": 31060 }, { "epoch": 3.874064837905237, "grad_norm": 0.00010678763646865264, "learning_rate": 4.511720698254365e-06, "loss": 0.0127, "step": 31070 }, { "epoch": 3.8753117206982544, "grad_norm": 0.00013451423728838563, "learning_rate": 4.5067331670822945e-06, "loss": 0.0003, "step": 31080 }, { "epoch": 3.876558603491272, "grad_norm": 0.003577812807634473, "learning_rate": 4.501745635910224e-06, "loss": 0.0004, "step": 31090 }, { "epoch": 3.8778054862842892, "grad_norm": 0.00018554333655629307, "learning_rate": 4.496758104738155e-06, "loss": 0.0, "step": 31100 }, { "epoch": 3.8790523690773067, "grad_norm": 0.0001478716148994863, "learning_rate": 4.491770573566085e-06, "loss": 0.0353, "step": 31110 }, { "epoch": 3.880299251870324, "grad_norm": 0.0002082760474877432, "learning_rate": 4.486783042394015e-06, "loss": 0.0, "step": 31120 }, { "epoch": 3.8815461346633415, "grad_norm": 0.0001366027572657913, "learning_rate": 4.481795511221945e-06, "loss": 0.0, "step": 31130 }, { "epoch": 3.882793017456359, "grad_norm": 0.004299758467823267, "learning_rate": 4.476807980049876e-06, "loss": 0.0, "step": 31140 }, { "epoch": 3.8840399002493764, "grad_norm": 0.00022500457998830825, "learning_rate": 4.471820448877806e-06, "loss": 0.0001, "step": 31150 }, { "epoch": 3.8852867830423943, "grad_norm": 0.0007693602237850428, "learning_rate": 4.466832917705736e-06, "loss": 0.0003, "step": 31160 }, { "epoch": 3.8865336658354117, "grad_norm": 0.002516511594876647, "learning_rate": 4.461845386533667e-06, "loss": 0.0001, "step": 31170 }, { "epoch": 3.887780548628429, "grad_norm": 0.0010788318468257785, "learning_rate": 4.456857855361596e-06, "loss": 0.0, "step": 31180 }, { "epoch": 3.8890274314214466, "grad_norm": 0.00012673945457208902, "learning_rate": 4.451870324189527e-06, "loss": 0.0, "step": 31190 }, { "epoch": 3.890274314214464, "grad_norm": 105.11888122558594, "learning_rate": 4.446882793017457e-06, "loss": 0.0167, "step": 31200 }, { "epoch": 3.8915211970074814, "grad_norm": 0.0005449632881209254, "learning_rate": 4.441895261845387e-06, "loss": 0.0417, "step": 31210 }, { "epoch": 3.892768079800499, "grad_norm": 0.004787981975823641, "learning_rate": 4.436907730673317e-06, "loss": 0.0012, "step": 31220 }, { "epoch": 3.8940149625935163, "grad_norm": 0.000260724569670856, "learning_rate": 4.431920199501247e-06, "loss": 0.0, "step": 31230 }, { "epoch": 3.8952618453865338, "grad_norm": 0.00011065945727750659, "learning_rate": 4.426932668329178e-06, "loss": 0.0, "step": 31240 }, { "epoch": 3.896508728179551, "grad_norm": 0.007168873678892851, "learning_rate": 4.421945137157107e-06, "loss": 0.0, "step": 31250 }, { "epoch": 3.8977556109725686, "grad_norm": 0.05084488168358803, "learning_rate": 4.416957605985038e-06, "loss": 0.0, "step": 31260 }, { "epoch": 3.899002493765586, "grad_norm": 0.0007356269052252173, "learning_rate": 4.411970074812968e-06, "loss": 0.0004, "step": 31270 }, { "epoch": 3.9002493765586035, "grad_norm": 0.0008322421344928443, "learning_rate": 4.406982543640898e-06, "loss": 0.0, "step": 31280 }, { "epoch": 3.901496259351621, "grad_norm": 1.249976396560669, "learning_rate": 4.401995012468828e-06, "loss": 0.0002, "step": 31290 }, { "epoch": 3.9027431421446384, "grad_norm": 0.00026053888723254204, "learning_rate": 4.397007481296759e-06, "loss": 0.0, "step": 31300 }, { "epoch": 3.903990024937656, "grad_norm": 0.00010390794341219589, "learning_rate": 4.3920199501246885e-06, "loss": 0.0, "step": 31310 }, { "epoch": 3.9052369077306732, "grad_norm": 0.0004543437680695206, "learning_rate": 4.387032418952618e-06, "loss": 0.0, "step": 31320 }, { "epoch": 3.9064837905236907, "grad_norm": 0.0006542736664414406, "learning_rate": 4.382044887780549e-06, "loss": 0.0, "step": 31330 }, { "epoch": 3.907730673316708, "grad_norm": 0.011358565650880337, "learning_rate": 4.377057356608479e-06, "loss": 0.0001, "step": 31340 }, { "epoch": 3.9089775561097255, "grad_norm": 0.00022623350378125906, "learning_rate": 4.372069825436409e-06, "loss": 0.0, "step": 31350 }, { "epoch": 3.910224438902743, "grad_norm": 0.0005444650305435061, "learning_rate": 4.367082294264339e-06, "loss": 0.0, "step": 31360 }, { "epoch": 3.9114713216957604, "grad_norm": 0.0046605272218585014, "learning_rate": 4.36209476309227e-06, "loss": 0.0148, "step": 31370 }, { "epoch": 3.912718204488778, "grad_norm": 0.0033968230709433556, "learning_rate": 4.3571072319202e-06, "loss": 0.0011, "step": 31380 }, { "epoch": 3.9139650872817953, "grad_norm": 0.0001498850469943136, "learning_rate": 4.35211970074813e-06, "loss": 0.0006, "step": 31390 }, { "epoch": 3.9152119700748127, "grad_norm": 0.0003460958832874894, "learning_rate": 4.347132169576061e-06, "loss": 0.0, "step": 31400 }, { "epoch": 3.9164588528678306, "grad_norm": 0.0003900064912158996, "learning_rate": 4.3421446384039905e-06, "loss": 0.0155, "step": 31410 }, { "epoch": 3.917705735660848, "grad_norm": 0.00030247578979469836, "learning_rate": 4.337157107231921e-06, "loss": 0.003, "step": 31420 }, { "epoch": 3.9189526184538654, "grad_norm": 0.0009005562751553953, "learning_rate": 4.332169576059851e-06, "loss": 0.0, "step": 31430 }, { "epoch": 3.920199501246883, "grad_norm": 0.0002896441437769681, "learning_rate": 4.327182044887781e-06, "loss": 0.0234, "step": 31440 }, { "epoch": 3.9214463840399003, "grad_norm": 0.0017723769415169954, "learning_rate": 4.322194513715711e-06, "loss": 0.0, "step": 31450 }, { "epoch": 3.9226932668329177, "grad_norm": 0.0003186598187312484, "learning_rate": 4.317206982543641e-06, "loss": 0.0254, "step": 31460 }, { "epoch": 3.923940149625935, "grad_norm": 0.4577343761920929, "learning_rate": 4.312219451371572e-06, "loss": 0.0052, "step": 31470 }, { "epoch": 3.9251870324189526, "grad_norm": 0.00017377693438902497, "learning_rate": 4.3072319201995014e-06, "loss": 0.0, "step": 31480 }, { "epoch": 3.92643391521197, "grad_norm": 0.00017991484492085874, "learning_rate": 4.302244389027432e-06, "loss": 0.0613, "step": 31490 }, { "epoch": 3.9276807980049875, "grad_norm": 0.009794537909328938, "learning_rate": 4.297256857855362e-06, "loss": 0.0, "step": 31500 }, { "epoch": 3.928927680798005, "grad_norm": 0.00017601059516891837, "learning_rate": 4.2922693266832925e-06, "loss": 0.0, "step": 31510 }, { "epoch": 3.9301745635910224, "grad_norm": 0.0005012759938836098, "learning_rate": 4.287281795511222e-06, "loss": 0.0002, "step": 31520 }, { "epoch": 3.93142144638404, "grad_norm": 0.0002343037340324372, "learning_rate": 4.282294264339152e-06, "loss": 0.0016, "step": 31530 }, { "epoch": 3.932668329177057, "grad_norm": 0.002151391003280878, "learning_rate": 4.277306733167083e-06, "loss": 0.0, "step": 31540 }, { "epoch": 3.9339152119700747, "grad_norm": 0.00020666795899160206, "learning_rate": 4.272319201995012e-06, "loss": 0.0001, "step": 31550 }, { "epoch": 3.9351620947630925, "grad_norm": 0.0015923150349408388, "learning_rate": 4.267331670822943e-06, "loss": 0.0109, "step": 31560 }, { "epoch": 3.93640897755611, "grad_norm": 0.00036555586848407984, "learning_rate": 4.262344139650873e-06, "loss": 0.0002, "step": 31570 }, { "epoch": 3.9376558603491274, "grad_norm": 0.000919479236472398, "learning_rate": 4.257356608478803e-06, "loss": 0.0, "step": 31580 }, { "epoch": 3.938902743142145, "grad_norm": 0.00040298415115103126, "learning_rate": 4.252369077306733e-06, "loss": 0.0001, "step": 31590 }, { "epoch": 3.9401496259351623, "grad_norm": 0.0005349696730263531, "learning_rate": 4.247381546134664e-06, "loss": 0.0, "step": 31600 }, { "epoch": 3.9413965087281797, "grad_norm": 0.0037811186630278826, "learning_rate": 4.2423940149625936e-06, "loss": 0.0, "step": 31610 }, { "epoch": 3.942643391521197, "grad_norm": 0.00016215858340729028, "learning_rate": 4.237406483790524e-06, "loss": 0.0, "step": 31620 }, { "epoch": 3.9438902743142146, "grad_norm": 0.0003500250750221312, "learning_rate": 4.232418952618455e-06, "loss": 0.0, "step": 31630 }, { "epoch": 3.945137157107232, "grad_norm": 0.00022250486654229462, "learning_rate": 4.227930174563591e-06, "loss": 0.024, "step": 31640 }, { "epoch": 3.9463840399002494, "grad_norm": 0.00041795289143919945, "learning_rate": 4.222942643391522e-06, "loss": 0.0, "step": 31650 }, { "epoch": 3.947630922693267, "grad_norm": 0.0001636535016587004, "learning_rate": 4.217955112219452e-06, "loss": 0.0115, "step": 31660 }, { "epoch": 3.9488778054862843, "grad_norm": 0.00020100557594560087, "learning_rate": 4.2129675810473815e-06, "loss": 0.0211, "step": 31670 }, { "epoch": 3.9501246882793017, "grad_norm": 0.00010202820703852922, "learning_rate": 4.207980049875312e-06, "loss": 0.0, "step": 31680 }, { "epoch": 3.951371571072319, "grad_norm": 0.0005976628744974732, "learning_rate": 4.202992518703242e-06, "loss": 0.0308, "step": 31690 }, { "epoch": 3.9526184538653366, "grad_norm": 0.00015598548634443432, "learning_rate": 4.1980049875311725e-06, "loss": 0.0001, "step": 31700 }, { "epoch": 3.953865336658354, "grad_norm": 0.0006848460761830211, "learning_rate": 4.193017456359102e-06, "loss": 0.0001, "step": 31710 }, { "epoch": 3.9551122194513715, "grad_norm": 0.00013792548270430416, "learning_rate": 4.188029925187033e-06, "loss": 0.0, "step": 31720 }, { "epoch": 3.956359102244389, "grad_norm": 7.759319123579189e-05, "learning_rate": 4.183042394014963e-06, "loss": 0.0, "step": 31730 }, { "epoch": 3.9576059850374063, "grad_norm": 0.0001390322286169976, "learning_rate": 4.178054862842893e-06, "loss": 0.0, "step": 31740 }, { "epoch": 3.958852867830424, "grad_norm": 1.8627551794052124, "learning_rate": 4.173067331670823e-06, "loss": 0.0003, "step": 31750 }, { "epoch": 3.960099750623441, "grad_norm": 72.14037322998047, "learning_rate": 4.168079800498753e-06, "loss": 0.045, "step": 31760 }, { "epoch": 3.9613466334164587, "grad_norm": 0.00022149724827613682, "learning_rate": 4.1630922693266835e-06, "loss": 0.0, "step": 31770 }, { "epoch": 3.962593516209476, "grad_norm": 0.0002591174270492047, "learning_rate": 4.158104738154613e-06, "loss": 0.0002, "step": 31780 }, { "epoch": 3.9638403990024935, "grad_norm": 0.0002038097009062767, "learning_rate": 4.153117206982544e-06, "loss": 0.0, "step": 31790 }, { "epoch": 3.965087281795511, "grad_norm": 0.0007396162254735827, "learning_rate": 4.1481296758104745e-06, "loss": 0.0, "step": 31800 }, { "epoch": 3.966334164588529, "grad_norm": 0.0005356416804715991, "learning_rate": 4.143142144638404e-06, "loss": 0.0, "step": 31810 }, { "epoch": 3.9675810473815463, "grad_norm": 0.0005426599527709186, "learning_rate": 4.138154613466335e-06, "loss": 0.0, "step": 31820 }, { "epoch": 3.9688279301745637, "grad_norm": 0.004455675836652517, "learning_rate": 4.133167082294265e-06, "loss": 0.0, "step": 31830 }, { "epoch": 3.970074812967581, "grad_norm": 0.0010512182489037514, "learning_rate": 4.128179551122195e-06, "loss": 0.0, "step": 31840 }, { "epoch": 3.9713216957605986, "grad_norm": 0.00018547069339547306, "learning_rate": 4.123192019950125e-06, "loss": 0.0, "step": 31850 }, { "epoch": 3.972568578553616, "grad_norm": 0.003366265445947647, "learning_rate": 4.118204488778056e-06, "loss": 0.0452, "step": 31860 }, { "epoch": 3.9738154613466334, "grad_norm": 0.10161428898572922, "learning_rate": 4.1132169576059854e-06, "loss": 0.0001, "step": 31870 }, { "epoch": 3.975062344139651, "grad_norm": 0.0009560501202940941, "learning_rate": 4.108229426433916e-06, "loss": 0.0, "step": 31880 }, { "epoch": 3.9763092269326683, "grad_norm": 0.0022827445063740015, "learning_rate": 4.103241895261846e-06, "loss": 0.0159, "step": 31890 }, { "epoch": 3.9775561097256857, "grad_norm": 0.00025452362024225295, "learning_rate": 4.098254364089776e-06, "loss": 0.0, "step": 31900 }, { "epoch": 3.978802992518703, "grad_norm": 0.002267055446282029, "learning_rate": 4.093266832917706e-06, "loss": 0.0, "step": 31910 }, { "epoch": 3.9800498753117206, "grad_norm": 0.007032580208033323, "learning_rate": 4.088279301745636e-06, "loss": 0.0001, "step": 31920 }, { "epoch": 3.981296758104738, "grad_norm": 0.0001074229076039046, "learning_rate": 4.083291770573567e-06, "loss": 0.0, "step": 31930 }, { "epoch": 3.9825436408977555, "grad_norm": 14.748251914978027, "learning_rate": 4.078304239401496e-06, "loss": 0.001, "step": 31940 }, { "epoch": 3.983790523690773, "grad_norm": 7.268755143741146e-05, "learning_rate": 4.073316708229427e-06, "loss": 0.0, "step": 31950 }, { "epoch": 3.985037406483791, "grad_norm": 0.0001998672669287771, "learning_rate": 4.068329177057357e-06, "loss": 0.037, "step": 31960 }, { "epoch": 3.9862842892768082, "grad_norm": 9.201698412653059e-05, "learning_rate": 4.063341645885287e-06, "loss": 0.0314, "step": 31970 }, { "epoch": 3.9875311720698257, "grad_norm": 0.0019448976963758469, "learning_rate": 4.058354114713217e-06, "loss": 0.0, "step": 31980 }, { "epoch": 3.988778054862843, "grad_norm": 0.0184951052069664, "learning_rate": 4.053366583541147e-06, "loss": 0.0688, "step": 31990 }, { "epoch": 3.9900249376558605, "grad_norm": 0.534318208694458, "learning_rate": 4.0483790523690776e-06, "loss": 0.0193, "step": 32000 }, { "epoch": 3.991271820448878, "grad_norm": 0.0036638781893998384, "learning_rate": 4.043391521197007e-06, "loss": 0.0, "step": 32010 }, { "epoch": 3.9925187032418954, "grad_norm": 0.00014929140161257237, "learning_rate": 4.038403990024938e-06, "loss": 0.0, "step": 32020 }, { "epoch": 3.993765586034913, "grad_norm": 0.00022338645067065954, "learning_rate": 4.033416458852868e-06, "loss": 0.0, "step": 32030 }, { "epoch": 3.9950124688279303, "grad_norm": 0.004427746869623661, "learning_rate": 4.028428927680798e-06, "loss": 0.0, "step": 32040 }, { "epoch": 3.9962593516209477, "grad_norm": 0.0004692091897595674, "learning_rate": 4.023441396508729e-06, "loss": 0.0, "step": 32050 }, { "epoch": 3.997506234413965, "grad_norm": 0.0003176057361997664, "learning_rate": 4.018453865336659e-06, "loss": 0.0, "step": 32060 }, { "epoch": 3.9987531172069826, "grad_norm": 0.0016481290804222226, "learning_rate": 4.013466334164589e-06, "loss": 0.0245, "step": 32070 }, { "epoch": 4.0, "grad_norm": 0.0001445809903088957, "learning_rate": 4.008478802992519e-06, "loss": 0.0, "step": 32080 }, { "epoch": 4.0, "eval_accuracy": 0.9950745058918885, "eval_loss": 0.03809603676199913, "eval_runtime": 17.8987, "eval_samples_per_second": 896.097, "eval_steps_per_second": 56.038, "step": 32080 }, { "epoch": 4.001246882793017, "grad_norm": 0.0001878750917967409, "learning_rate": 4.00349127182045e-06, "loss": 0.0001, "step": 32090 }, { "epoch": 4.002493765586035, "grad_norm": 0.00025198431103490293, "learning_rate": 3.9985037406483795e-06, "loss": 0.0, "step": 32100 }, { "epoch": 4.003740648379052, "grad_norm": 0.0003719111846294254, "learning_rate": 3.993516209476309e-06, "loss": 0.0197, "step": 32110 }, { "epoch": 4.00498753117207, "grad_norm": 8.772647561272606e-05, "learning_rate": 3.98852867830424e-06, "loss": 0.0403, "step": 32120 }, { "epoch": 4.006234413965087, "grad_norm": 0.0019759670831263065, "learning_rate": 3.98354114713217e-06, "loss": 0.0, "step": 32130 }, { "epoch": 4.007481296758105, "grad_norm": 0.004112126771360636, "learning_rate": 3.9785536159601e-06, "loss": 0.0, "step": 32140 }, { "epoch": 4.008728179551122, "grad_norm": 0.0010385045316070318, "learning_rate": 3.97356608478803e-06, "loss": 0.0, "step": 32150 }, { "epoch": 4.0099750623441395, "grad_norm": 0.0010680771665647626, "learning_rate": 3.968578553615961e-06, "loss": 0.0, "step": 32160 }, { "epoch": 4.011221945137157, "grad_norm": 0.00011815309699159116, "learning_rate": 3.9635910224438905e-06, "loss": 0.0027, "step": 32170 }, { "epoch": 4.012468827930174, "grad_norm": 0.005580600816756487, "learning_rate": 3.958603491271821e-06, "loss": 0.0, "step": 32180 }, { "epoch": 4.013715710723192, "grad_norm": 0.0022200201638042927, "learning_rate": 3.953615960099751e-06, "loss": 0.0, "step": 32190 }, { "epoch": 4.014962593516209, "grad_norm": 0.0008524726727046072, "learning_rate": 3.9486284289276815e-06, "loss": 0.0, "step": 32200 }, { "epoch": 4.016209476309227, "grad_norm": 1.3903931379318237, "learning_rate": 3.943640897755611e-06, "loss": 0.0001, "step": 32210 }, { "epoch": 4.017456359102244, "grad_norm": 0.001720804488286376, "learning_rate": 3.938653366583541e-06, "loss": 0.0, "step": 32220 }, { "epoch": 4.0187032418952615, "grad_norm": 0.0005697371670976281, "learning_rate": 3.933665835411472e-06, "loss": 0.0, "step": 32230 }, { "epoch": 4.019950124688279, "grad_norm": 7.924262899905443e-05, "learning_rate": 3.928678304239401e-06, "loss": 0.0, "step": 32240 }, { "epoch": 4.021197007481296, "grad_norm": 0.0015229410491883755, "learning_rate": 3.923690773067332e-06, "loss": 0.0001, "step": 32250 }, { "epoch": 4.022443890274314, "grad_norm": 0.0004081030492670834, "learning_rate": 3.918703241895262e-06, "loss": 0.0, "step": 32260 }, { "epoch": 4.023690773067331, "grad_norm": 0.002482213545590639, "learning_rate": 3.913715710723192e-06, "loss": 0.0, "step": 32270 }, { "epoch": 4.024937655860349, "grad_norm": 0.05672341212630272, "learning_rate": 3.908728179551122e-06, "loss": 0.0001, "step": 32280 }, { "epoch": 4.026184538653367, "grad_norm": 0.002141641452908516, "learning_rate": 3.903740648379053e-06, "loss": 0.0, "step": 32290 }, { "epoch": 4.027431421446384, "grad_norm": 9.009883797261864e-05, "learning_rate": 3.8987531172069834e-06, "loss": 0.0, "step": 32300 }, { "epoch": 4.028678304239402, "grad_norm": 0.0011927575105801225, "learning_rate": 3.893765586034913e-06, "loss": 0.0, "step": 32310 }, { "epoch": 4.029925187032419, "grad_norm": 0.0026731493417173624, "learning_rate": 3.888778054862844e-06, "loss": 0.0, "step": 32320 }, { "epoch": 4.031172069825437, "grad_norm": 0.009158428758382797, "learning_rate": 3.883790523690774e-06, "loss": 0.0023, "step": 32330 }, { "epoch": 4.032418952618454, "grad_norm": 7.04668927937746e-05, "learning_rate": 3.878802992518703e-06, "loss": 0.0257, "step": 32340 }, { "epoch": 4.033665835411472, "grad_norm": 0.0011140556307509542, "learning_rate": 3.873815461346634e-06, "loss": 0.0, "step": 32350 }, { "epoch": 4.034912718204489, "grad_norm": 0.0011954933870583773, "learning_rate": 3.868827930174564e-06, "loss": 0.0, "step": 32360 }, { "epoch": 4.0361596009975065, "grad_norm": 0.00039964847383089364, "learning_rate": 3.863840399002494e-06, "loss": 0.0215, "step": 32370 }, { "epoch": 4.037406483790524, "grad_norm": 0.09617782384157181, "learning_rate": 3.858852867830424e-06, "loss": 0.0125, "step": 32380 }, { "epoch": 4.038653366583541, "grad_norm": 0.00016554733156226575, "learning_rate": 3.853865336658355e-06, "loss": 0.006, "step": 32390 }, { "epoch": 4.039900249376559, "grad_norm": 0.00015381313278339803, "learning_rate": 3.8488778054862845e-06, "loss": 0.0, "step": 32400 }, { "epoch": 4.041147132169576, "grad_norm": 0.00016141528612934053, "learning_rate": 3.843890274314215e-06, "loss": 0.0, "step": 32410 }, { "epoch": 4.042394014962594, "grad_norm": 0.002103125909343362, "learning_rate": 3.838902743142145e-06, "loss": 0.0, "step": 32420 }, { "epoch": 4.043640897755611, "grad_norm": 0.00045178926666267216, "learning_rate": 3.833915211970075e-06, "loss": 0.0011, "step": 32430 }, { "epoch": 4.0448877805486285, "grad_norm": 0.00012241245713084936, "learning_rate": 3.828927680798005e-06, "loss": 0.0001, "step": 32440 }, { "epoch": 4.046134663341646, "grad_norm": 0.0002882802509702742, "learning_rate": 3.823940149625935e-06, "loss": 0.0, "step": 32450 }, { "epoch": 4.047381546134663, "grad_norm": 0.00012750252790283412, "learning_rate": 3.818952618453866e-06, "loss": 0.0001, "step": 32460 }, { "epoch": 4.048628428927681, "grad_norm": 9.009744098875672e-05, "learning_rate": 3.813965087281796e-06, "loss": 0.0, "step": 32470 }, { "epoch": 4.049875311720698, "grad_norm": 0.017066551372408867, "learning_rate": 3.8089775561097257e-06, "loss": 0.0, "step": 32480 }, { "epoch": 4.051122194513716, "grad_norm": 0.0004483947705011815, "learning_rate": 3.803990024937656e-06, "loss": 0.0, "step": 32490 }, { "epoch": 4.052369077306733, "grad_norm": 0.009565845131874084, "learning_rate": 3.799002493765586e-06, "loss": 0.0, "step": 32500 }, { "epoch": 4.053615960099751, "grad_norm": 0.0004378945450298488, "learning_rate": 3.7940149625935163e-06, "loss": 0.0, "step": 32510 }, { "epoch": 4.054862842892768, "grad_norm": 0.00025369730428792536, "learning_rate": 3.7890274314214465e-06, "loss": 0.0, "step": 32520 }, { "epoch": 4.056109725685785, "grad_norm": 0.00017954749637283385, "learning_rate": 3.7840399002493767e-06, "loss": 0.0, "step": 32530 }, { "epoch": 4.057356608478803, "grad_norm": 0.00015994634304661304, "learning_rate": 3.779052369077307e-06, "loss": 0.0, "step": 32540 }, { "epoch": 4.05860349127182, "grad_norm": 0.0004566339775919914, "learning_rate": 3.7740648379052375e-06, "loss": 0.0, "step": 32550 }, { "epoch": 4.059850374064838, "grad_norm": 0.015646522864699364, "learning_rate": 3.7690773067331677e-06, "loss": 0.0, "step": 32560 }, { "epoch": 4.061097256857855, "grad_norm": 0.0808909609913826, "learning_rate": 3.764089775561098e-06, "loss": 0.0, "step": 32570 }, { "epoch": 4.062344139650873, "grad_norm": 0.0001575273199705407, "learning_rate": 3.759102244389028e-06, "loss": 0.0, "step": 32580 }, { "epoch": 4.06359102244389, "grad_norm": 0.0014995918609201908, "learning_rate": 3.7541147132169583e-06, "loss": 0.0001, "step": 32590 }, { "epoch": 4.0648379052369075, "grad_norm": 0.00016508818953298032, "learning_rate": 3.749127182044888e-06, "loss": 0.006, "step": 32600 }, { "epoch": 4.066084788029925, "grad_norm": 8.671959221828729e-05, "learning_rate": 3.7441396508728182e-06, "loss": 0.0, "step": 32610 }, { "epoch": 4.067331670822942, "grad_norm": 0.0003152689023409039, "learning_rate": 3.7391521197007484e-06, "loss": 0.0, "step": 32620 }, { "epoch": 4.06857855361596, "grad_norm": 8.40279899421148e-05, "learning_rate": 3.7341645885286786e-06, "loss": 0.0575, "step": 32630 }, { "epoch": 4.069825436408977, "grad_norm": 0.00015720934607088566, "learning_rate": 3.729177057356609e-06, "loss": 0.0, "step": 32640 }, { "epoch": 4.071072319201995, "grad_norm": 0.00012393532961141318, "learning_rate": 3.724189526184539e-06, "loss": 0.0, "step": 32650 }, { "epoch": 4.072319201995012, "grad_norm": 0.001939267385751009, "learning_rate": 3.7192019950124692e-06, "loss": 0.0, "step": 32660 }, { "epoch": 4.0735660847880295, "grad_norm": 0.00018286392150912434, "learning_rate": 3.7142144638403994e-06, "loss": 0.0, "step": 32670 }, { "epoch": 4.074812967581048, "grad_norm": 7.55258442950435e-05, "learning_rate": 3.7092269326683296e-06, "loss": 0.0, "step": 32680 }, { "epoch": 4.076059850374065, "grad_norm": 0.00010201766417594627, "learning_rate": 3.70423940149626e-06, "loss": 0.0, "step": 32690 }, { "epoch": 4.077306733167083, "grad_norm": 0.0008396367775276303, "learning_rate": 3.6992518703241896e-06, "loss": 0.0, "step": 32700 }, { "epoch": 4.0785536159601, "grad_norm": 0.0001478978811064735, "learning_rate": 3.6942643391521198e-06, "loss": 0.0105, "step": 32710 }, { "epoch": 4.079800498753118, "grad_norm": 0.01776493340730667, "learning_rate": 3.68927680798005e-06, "loss": 0.0, "step": 32720 }, { "epoch": 4.081047381546135, "grad_norm": 0.004389037843793631, "learning_rate": 3.68428927680798e-06, "loss": 0.0, "step": 32730 }, { "epoch": 4.082294264339152, "grad_norm": 0.002210435224696994, "learning_rate": 3.6793017456359104e-06, "loss": 0.0, "step": 32740 }, { "epoch": 4.08354114713217, "grad_norm": 0.00010050527635030448, "learning_rate": 3.6743142144638406e-06, "loss": 0.0349, "step": 32750 }, { "epoch": 4.084788029925187, "grad_norm": 0.00010919102351181209, "learning_rate": 3.6693266832917707e-06, "loss": 0.0, "step": 32760 }, { "epoch": 4.086034912718205, "grad_norm": 0.0007898484473116696, "learning_rate": 3.664339152119701e-06, "loss": 0.0287, "step": 32770 }, { "epoch": 4.087281795511222, "grad_norm": 0.0010982404928654432, "learning_rate": 3.659351620947631e-06, "loss": 0.0128, "step": 32780 }, { "epoch": 4.08852867830424, "grad_norm": 0.000133848880068399, "learning_rate": 3.654364089775561e-06, "loss": 0.0, "step": 32790 }, { "epoch": 4.089775561097257, "grad_norm": 0.0003971258702222258, "learning_rate": 3.649376558603492e-06, "loss": 0.0, "step": 32800 }, { "epoch": 4.0910224438902745, "grad_norm": 0.00021624041255563498, "learning_rate": 3.644389027431422e-06, "loss": 0.0, "step": 32810 }, { "epoch": 4.092269326683292, "grad_norm": 0.00039008044404909015, "learning_rate": 3.639401496259352e-06, "loss": 0.0002, "step": 32820 }, { "epoch": 4.093516209476309, "grad_norm": 0.0006581239867955446, "learning_rate": 3.634413965087282e-06, "loss": 0.0002, "step": 32830 }, { "epoch": 4.094763092269327, "grad_norm": 0.0005299003678373992, "learning_rate": 3.6294264339152123e-06, "loss": 0.0, "step": 32840 }, { "epoch": 4.096009975062344, "grad_norm": 0.00010087557166116312, "learning_rate": 3.6244389027431425e-06, "loss": 0.0001, "step": 32850 }, { "epoch": 4.097256857855362, "grad_norm": 0.0001816202566260472, "learning_rate": 3.6194513715710727e-06, "loss": 0.0001, "step": 32860 }, { "epoch": 4.098503740648379, "grad_norm": 0.0035556235816329718, "learning_rate": 3.614463840399003e-06, "loss": 0.0001, "step": 32870 }, { "epoch": 4.0997506234413965, "grad_norm": 0.0010074899764731526, "learning_rate": 3.609476309226933e-06, "loss": 0.0, "step": 32880 }, { "epoch": 4.100997506234414, "grad_norm": 0.000434140587458387, "learning_rate": 3.6044887780548633e-06, "loss": 0.0, "step": 32890 }, { "epoch": 4.102244389027431, "grad_norm": 0.00044290581718087196, "learning_rate": 3.5995012468827935e-06, "loss": 0.0, "step": 32900 }, { "epoch": 4.103491271820449, "grad_norm": 0.00020887772552669048, "learning_rate": 3.5945137157107237e-06, "loss": 0.0, "step": 32910 }, { "epoch": 4.104738154613466, "grad_norm": 0.0007029934786260128, "learning_rate": 3.5895261845386535e-06, "loss": 0.0, "step": 32920 }, { "epoch": 4.105985037406484, "grad_norm": 0.0001252719812327996, "learning_rate": 3.5845386533665837e-06, "loss": 0.0, "step": 32930 }, { "epoch": 4.107231920199501, "grad_norm": 0.0001325898483628407, "learning_rate": 3.579551122194514e-06, "loss": 0.0, "step": 32940 }, { "epoch": 4.1084788029925186, "grad_norm": 0.00044591727782972157, "learning_rate": 3.574563591022444e-06, "loss": 0.0, "step": 32950 }, { "epoch": 4.109725685785536, "grad_norm": 0.0007702121511101723, "learning_rate": 3.5695760598503742e-06, "loss": 0.0, "step": 32960 }, { "epoch": 4.110972568578553, "grad_norm": 0.00013088526611682028, "learning_rate": 3.5645885286783044e-06, "loss": 0.0625, "step": 32970 }, { "epoch": 4.112219451371571, "grad_norm": 0.10816075652837753, "learning_rate": 3.5596009975062346e-06, "loss": 0.0, "step": 32980 }, { "epoch": 4.113466334164588, "grad_norm": 0.00041107553988695145, "learning_rate": 3.554613466334165e-06, "loss": 0.0, "step": 32990 }, { "epoch": 4.114713216957606, "grad_norm": 0.00045265851076692343, "learning_rate": 3.549625935162095e-06, "loss": 0.0, "step": 33000 }, { "epoch": 4.115960099750623, "grad_norm": 0.00031145347747951746, "learning_rate": 3.5446384039900252e-06, "loss": 0.0, "step": 33010 }, { "epoch": 4.117206982543641, "grad_norm": 0.0004202498821541667, "learning_rate": 3.539650872817955e-06, "loss": 0.0011, "step": 33020 }, { "epoch": 4.118453865336658, "grad_norm": 0.00154070311691612, "learning_rate": 3.534663341645885e-06, "loss": 0.0, "step": 33030 }, { "epoch": 4.1197007481296755, "grad_norm": 0.03240791708230972, "learning_rate": 3.529675810473816e-06, "loss": 0.0, "step": 33040 }, { "epoch": 4.120947630922693, "grad_norm": 0.00011043441918445751, "learning_rate": 3.524688279301746e-06, "loss": 0.0, "step": 33050 }, { "epoch": 4.12219451371571, "grad_norm": 0.00030088701169006526, "learning_rate": 3.519700748129676e-06, "loss": 0.0, "step": 33060 }, { "epoch": 4.123441396508728, "grad_norm": 0.0001598861563252285, "learning_rate": 3.5147132169576064e-06, "loss": 0.0432, "step": 33070 }, { "epoch": 4.124688279301745, "grad_norm": 0.0008438412332907319, "learning_rate": 3.5097256857855366e-06, "loss": 0.0, "step": 33080 }, { "epoch": 4.1259351620947635, "grad_norm": 0.0008505359292030334, "learning_rate": 3.504738154613467e-06, "loss": 0.0002, "step": 33090 }, { "epoch": 4.127182044887781, "grad_norm": 0.0002946627791970968, "learning_rate": 3.499750623441397e-06, "loss": 0.0, "step": 33100 }, { "epoch": 4.128428927680798, "grad_norm": 0.0001894429442472756, "learning_rate": 3.494763092269327e-06, "loss": 0.0, "step": 33110 }, { "epoch": 4.129675810473816, "grad_norm": 0.00012741118553094566, "learning_rate": 3.4897755610972574e-06, "loss": 0.0, "step": 33120 }, { "epoch": 4.130922693266833, "grad_norm": 0.002783777192234993, "learning_rate": 3.4847880299251876e-06, "loss": 0.0, "step": 33130 }, { "epoch": 4.132169576059851, "grad_norm": 8.558244735468179e-05, "learning_rate": 3.4798004987531173e-06, "loss": 0.0, "step": 33140 }, { "epoch": 4.133416458852868, "grad_norm": 0.005597286857664585, "learning_rate": 3.4748129675810475e-06, "loss": 0.0, "step": 33150 }, { "epoch": 4.134663341645886, "grad_norm": 0.0004539691435638815, "learning_rate": 3.4698254364089777e-06, "loss": 0.0, "step": 33160 }, { "epoch": 4.135910224438903, "grad_norm": 0.0011278591118752956, "learning_rate": 3.464837905236908e-06, "loss": 0.0001, "step": 33170 }, { "epoch": 4.13715710723192, "grad_norm": 41.194732666015625, "learning_rate": 3.459850374064838e-06, "loss": 0.0266, "step": 33180 }, { "epoch": 4.138403990024938, "grad_norm": 0.0007619396201334894, "learning_rate": 3.4548628428927683e-06, "loss": 0.0, "step": 33190 }, { "epoch": 4.139650872817955, "grad_norm": 0.0006730849854648113, "learning_rate": 3.4498753117206985e-06, "loss": 0.0, "step": 33200 }, { "epoch": 4.140897755610973, "grad_norm": 0.004926951136440039, "learning_rate": 3.4448877805486287e-06, "loss": 0.0009, "step": 33210 }, { "epoch": 4.14214463840399, "grad_norm": 0.0021408579777926207, "learning_rate": 3.439900249376559e-06, "loss": 0.0, "step": 33220 }, { "epoch": 4.143391521197008, "grad_norm": 0.0005824111867696047, "learning_rate": 3.434912718204489e-06, "loss": 0.0, "step": 33230 }, { "epoch": 4.144638403990025, "grad_norm": 0.00010687837493605912, "learning_rate": 3.429925187032419e-06, "loss": 0.0, "step": 33240 }, { "epoch": 4.1458852867830425, "grad_norm": 9.500970190856606e-05, "learning_rate": 3.424937655860349e-06, "loss": 0.0, "step": 33250 }, { "epoch": 4.14713216957606, "grad_norm": 0.00021169520914554596, "learning_rate": 3.4199501246882793e-06, "loss": 0.0, "step": 33260 }, { "epoch": 4.148379052369077, "grad_norm": 0.0015832402277737856, "learning_rate": 3.4149625935162095e-06, "loss": 0.0027, "step": 33270 }, { "epoch": 4.149625935162095, "grad_norm": 0.0007897487957961857, "learning_rate": 3.4099750623441397e-06, "loss": 0.0, "step": 33280 }, { "epoch": 4.150872817955112, "grad_norm": 0.00020493895863182843, "learning_rate": 3.4049875311720703e-06, "loss": 0.0, "step": 33290 }, { "epoch": 4.15211970074813, "grad_norm": 0.0025839328300207853, "learning_rate": 3.4000000000000005e-06, "loss": 0.0003, "step": 33300 }, { "epoch": 4.153366583541147, "grad_norm": 0.00012474734103307128, "learning_rate": 3.3950124688279307e-06, "loss": 0.0, "step": 33310 }, { "epoch": 4.1546134663341645, "grad_norm": 0.00013193800987210125, "learning_rate": 3.390024937655861e-06, "loss": 0.0, "step": 33320 }, { "epoch": 4.155860349127182, "grad_norm": 0.000410831329645589, "learning_rate": 3.385037406483791e-06, "loss": 0.001, "step": 33330 }, { "epoch": 4.157107231920199, "grad_norm": 0.00024243925872724503, "learning_rate": 3.3800498753117213e-06, "loss": 0.0029, "step": 33340 }, { "epoch": 4.158354114713217, "grad_norm": 0.0016697397222742438, "learning_rate": 3.3750623441396515e-06, "loss": 0.0, "step": 33350 }, { "epoch": 4.159600997506234, "grad_norm": 0.0011250133393332362, "learning_rate": 3.3700748129675812e-06, "loss": 0.0, "step": 33360 }, { "epoch": 4.160847880299252, "grad_norm": 0.0002701326156966388, "learning_rate": 3.3650872817955114e-06, "loss": 0.0001, "step": 33370 }, { "epoch": 4.162094763092269, "grad_norm": 0.013247229158878326, "learning_rate": 3.3600997506234416e-06, "loss": 0.0278, "step": 33380 }, { "epoch": 4.1633416458852865, "grad_norm": 0.00168042560108006, "learning_rate": 3.355112219451372e-06, "loss": 0.0001, "step": 33390 }, { "epoch": 4.164588528678304, "grad_norm": 0.00028758039115928113, "learning_rate": 3.350124688279302e-06, "loss": 0.0, "step": 33400 }, { "epoch": 4.165835411471321, "grad_norm": 0.00012562373012769967, "learning_rate": 3.3451371571072322e-06, "loss": 0.0, "step": 33410 }, { "epoch": 4.167082294264339, "grad_norm": 0.00406464422121644, "learning_rate": 3.3401496259351624e-06, "loss": 0.0, "step": 33420 }, { "epoch": 4.168329177057356, "grad_norm": 0.00012497564603108913, "learning_rate": 3.3351620947630926e-06, "loss": 0.0, "step": 33430 }, { "epoch": 4.169576059850374, "grad_norm": 0.00038507606950588524, "learning_rate": 3.330174563591023e-06, "loss": 0.0, "step": 33440 }, { "epoch": 4.170822942643391, "grad_norm": 6.687084533041343e-05, "learning_rate": 3.325187032418953e-06, "loss": 0.0, "step": 33450 }, { "epoch": 4.172069825436409, "grad_norm": 4.686644388129935e-05, "learning_rate": 3.3201995012468828e-06, "loss": 0.0, "step": 33460 }, { "epoch": 4.173316708229426, "grad_norm": 0.0007018337491899729, "learning_rate": 3.315211970074813e-06, "loss": 0.0, "step": 33470 }, { "epoch": 4.174563591022444, "grad_norm": 0.004306942690163851, "learning_rate": 3.310224438902743e-06, "loss": 0.0, "step": 33480 }, { "epoch": 4.175810473815462, "grad_norm": 0.03564343601465225, "learning_rate": 3.3052369077306734e-06, "loss": 0.0, "step": 33490 }, { "epoch": 4.177057356608479, "grad_norm": 0.00011154711683047935, "learning_rate": 3.3002493765586036e-06, "loss": 0.0, "step": 33500 }, { "epoch": 4.178304239401497, "grad_norm": 0.001141234184615314, "learning_rate": 3.2952618453865337e-06, "loss": 0.0, "step": 33510 }, { "epoch": 4.179551122194514, "grad_norm": 0.00015788464224897325, "learning_rate": 3.290274314214464e-06, "loss": 0.0, "step": 33520 }, { "epoch": 4.1807980049875315, "grad_norm": 9.262099047191441e-05, "learning_rate": 3.285286783042394e-06, "loss": 0.0, "step": 33530 }, { "epoch": 4.182044887780549, "grad_norm": 9.043302270583808e-05, "learning_rate": 3.2802992518703248e-06, "loss": 0.0, "step": 33540 }, { "epoch": 4.183291770573566, "grad_norm": 0.00017372763250023127, "learning_rate": 3.275311720698255e-06, "loss": 0.0, "step": 33550 }, { "epoch": 4.184538653366584, "grad_norm": 0.00010978298087138683, "learning_rate": 3.270324189526185e-06, "loss": 0.0, "step": 33560 }, { "epoch": 4.185785536159601, "grad_norm": 0.0002875495993066579, "learning_rate": 3.2653366583541153e-06, "loss": 0.0003, "step": 33570 }, { "epoch": 4.187032418952619, "grad_norm": 0.33438995480537415, "learning_rate": 3.260349127182045e-06, "loss": 0.0001, "step": 33580 }, { "epoch": 4.188279301745636, "grad_norm": 0.0013525570975616574, "learning_rate": 3.2553615960099753e-06, "loss": 0.0, "step": 33590 }, { "epoch": 4.1895261845386536, "grad_norm": 0.00015009859635028988, "learning_rate": 3.2503740648379055e-06, "loss": 0.0, "step": 33600 }, { "epoch": 4.190773067331671, "grad_norm": 0.2856062352657318, "learning_rate": 3.2453865336658357e-06, "loss": 0.0001, "step": 33610 }, { "epoch": 4.192019950124688, "grad_norm": 0.00011341932986397296, "learning_rate": 3.240399002493766e-06, "loss": 0.0, "step": 33620 }, { "epoch": 4.193266832917706, "grad_norm": 0.000367656844900921, "learning_rate": 3.235411471321696e-06, "loss": 0.0035, "step": 33630 }, { "epoch": 4.194513715710723, "grad_norm": 0.0016489146510139108, "learning_rate": 3.2304239401496263e-06, "loss": 0.0, "step": 33640 }, { "epoch": 4.195760598503741, "grad_norm": 0.039787132292985916, "learning_rate": 3.2254364089775565e-06, "loss": 0.0, "step": 33650 }, { "epoch": 4.197007481296758, "grad_norm": 0.00011767564137699082, "learning_rate": 3.2204488778054867e-06, "loss": 0.0, "step": 33660 }, { "epoch": 4.198254364089776, "grad_norm": 0.007115756161510944, "learning_rate": 3.215461346633417e-06, "loss": 0.0, "step": 33670 }, { "epoch": 4.199501246882793, "grad_norm": 0.001513071358203888, "learning_rate": 3.2104738154613467e-06, "loss": 0.0486, "step": 33680 }, { "epoch": 4.2007481296758105, "grad_norm": 0.00044022343354299664, "learning_rate": 3.205486284289277e-06, "loss": 0.0, "step": 33690 }, { "epoch": 4.201995012468828, "grad_norm": 0.00032083020778372884, "learning_rate": 3.200498753117207e-06, "loss": 0.0, "step": 33700 }, { "epoch": 4.203241895261845, "grad_norm": 0.0002220886672148481, "learning_rate": 3.1955112219451372e-06, "loss": 0.0, "step": 33710 }, { "epoch": 4.204488778054863, "grad_norm": 0.0044029937125742435, "learning_rate": 3.1905236907730674e-06, "loss": 0.0154, "step": 33720 }, { "epoch": 4.20573566084788, "grad_norm": 0.004143120255321264, "learning_rate": 3.1855361596009976e-06, "loss": 0.0483, "step": 33730 }, { "epoch": 4.206982543640898, "grad_norm": 0.00012014318781439215, "learning_rate": 3.180548628428928e-06, "loss": 0.0, "step": 33740 }, { "epoch": 4.208229426433915, "grad_norm": 0.0004365080676507205, "learning_rate": 3.175561097256858e-06, "loss": 0.0, "step": 33750 }, { "epoch": 4.2094763092269325, "grad_norm": 0.0001294610119657591, "learning_rate": 3.1705735660847882e-06, "loss": 0.0, "step": 33760 }, { "epoch": 4.21072319201995, "grad_norm": 0.0007133973995223641, "learning_rate": 3.1655860349127184e-06, "loss": 0.0001, "step": 33770 }, { "epoch": 4.211970074812967, "grad_norm": 0.00016860711912158877, "learning_rate": 3.160598503740648e-06, "loss": 0.0412, "step": 33780 }, { "epoch": 4.213216957605985, "grad_norm": 0.00022174940386321396, "learning_rate": 3.1556109725685792e-06, "loss": 0.0001, "step": 33790 }, { "epoch": 4.214463840399002, "grad_norm": 0.0011991856154054403, "learning_rate": 3.1506234413965094e-06, "loss": 0.0, "step": 33800 }, { "epoch": 4.21571072319202, "grad_norm": 0.0002086485328618437, "learning_rate": 3.145635910224439e-06, "loss": 0.0, "step": 33810 }, { "epoch": 4.216957605985037, "grad_norm": 0.00021221544011496007, "learning_rate": 3.1406483790523694e-06, "loss": 0.0007, "step": 33820 }, { "epoch": 4.2182044887780545, "grad_norm": 0.0001043640440911986, "learning_rate": 3.1356608478802996e-06, "loss": 0.0062, "step": 33830 }, { "epoch": 4.219451371571072, "grad_norm": 0.0013198963133618236, "learning_rate": 3.13067331670823e-06, "loss": 0.0023, "step": 33840 }, { "epoch": 4.220698254364089, "grad_norm": 0.0001692106743576005, "learning_rate": 3.12568578553616e-06, "loss": 0.0616, "step": 33850 }, { "epoch": 4.221945137157107, "grad_norm": 0.0012058233842253685, "learning_rate": 3.12069825436409e-06, "loss": 0.0, "step": 33860 }, { "epoch": 4.223192019950124, "grad_norm": 0.0029543384443968534, "learning_rate": 3.1157107231920204e-06, "loss": 0.0, "step": 33870 }, { "epoch": 4.224438902743142, "grad_norm": 0.00011094296496594325, "learning_rate": 3.1107231920199506e-06, "loss": 0.0298, "step": 33880 }, { "epoch": 4.225685785536159, "grad_norm": 0.00020665784541051835, "learning_rate": 3.1057356608478808e-06, "loss": 0.0001, "step": 33890 }, { "epoch": 4.2269326683291775, "grad_norm": 0.0001479942729929462, "learning_rate": 3.1007481296758105e-06, "loss": 0.0, "step": 33900 }, { "epoch": 4.228179551122195, "grad_norm": 42.932716369628906, "learning_rate": 3.0957605985037407e-06, "loss": 0.0041, "step": 33910 }, { "epoch": 4.229426433915212, "grad_norm": 0.00013323694292921573, "learning_rate": 3.090773067331671e-06, "loss": 0.0, "step": 33920 }, { "epoch": 4.23067331670823, "grad_norm": 0.008350051939487457, "learning_rate": 3.085785536159601e-06, "loss": 0.0006, "step": 33930 }, { "epoch": 4.231920199501247, "grad_norm": 0.003039455274119973, "learning_rate": 3.0807980049875313e-06, "loss": 0.0001, "step": 33940 }, { "epoch": 4.233167082294265, "grad_norm": 0.00027307405252940953, "learning_rate": 3.0758104738154615e-06, "loss": 0.0, "step": 33950 }, { "epoch": 4.234413965087282, "grad_norm": 0.0001598122325958684, "learning_rate": 3.0708229426433917e-06, "loss": 0.0, "step": 33960 }, { "epoch": 4.2356608478802995, "grad_norm": 0.00011701675248332322, "learning_rate": 3.065835411471322e-06, "loss": 0.0184, "step": 33970 }, { "epoch": 4.236907730673317, "grad_norm": 0.0001561442913953215, "learning_rate": 3.060847880299252e-06, "loss": 0.0, "step": 33980 }, { "epoch": 4.238154613466334, "grad_norm": 0.0002617475111037493, "learning_rate": 3.0558603491271823e-06, "loss": 0.0016, "step": 33990 }, { "epoch": 4.239401496259352, "grad_norm": 0.0004843343631364405, "learning_rate": 3.050872817955112e-06, "loss": 0.0103, "step": 34000 }, { "epoch": 4.240648379052369, "grad_norm": 0.00012468890054151416, "learning_rate": 3.0458852867830423e-06, "loss": 0.0002, "step": 34010 }, { "epoch": 4.241895261845387, "grad_norm": 0.0029277161229401827, "learning_rate": 3.0408977556109725e-06, "loss": 0.0, "step": 34020 }, { "epoch": 4.243142144638404, "grad_norm": 9.913599933497608e-05, "learning_rate": 3.035910224438903e-06, "loss": 0.0, "step": 34030 }, { "epoch": 4.2443890274314215, "grad_norm": 8.990171772893518e-05, "learning_rate": 3.0309226932668333e-06, "loss": 0.0285, "step": 34040 }, { "epoch": 4.245635910224439, "grad_norm": 0.00037068844540044665, "learning_rate": 3.0259351620947635e-06, "loss": 0.0455, "step": 34050 }, { "epoch": 4.246882793017456, "grad_norm": 0.0005540825659409165, "learning_rate": 3.0209476309226937e-06, "loss": 0.0752, "step": 34060 }, { "epoch": 4.248129675810474, "grad_norm": 0.20835602283477783, "learning_rate": 3.015960099750624e-06, "loss": 0.0, "step": 34070 }, { "epoch": 4.249376558603491, "grad_norm": 0.006817484740167856, "learning_rate": 3.010972568578554e-06, "loss": 0.0, "step": 34080 }, { "epoch": 4.250623441396509, "grad_norm": 0.00453070318326354, "learning_rate": 3.0059850374064843e-06, "loss": 0.0, "step": 34090 }, { "epoch": 4.251870324189526, "grad_norm": 0.00016027323727030307, "learning_rate": 3.0009975062344145e-06, "loss": 0.0, "step": 34100 }, { "epoch": 4.253117206982544, "grad_norm": 0.0012369597097858787, "learning_rate": 2.9960099750623447e-06, "loss": 0.0001, "step": 34110 }, { "epoch": 4.254364089775561, "grad_norm": 0.00018037218251265585, "learning_rate": 2.9910224438902744e-06, "loss": 0.0006, "step": 34120 }, { "epoch": 4.2556109725685785, "grad_norm": 0.001004421734251082, "learning_rate": 2.9860349127182046e-06, "loss": 0.0, "step": 34130 }, { "epoch": 4.256857855361596, "grad_norm": 0.00010146049316972494, "learning_rate": 2.981047381546135e-06, "loss": 0.0, "step": 34140 }, { "epoch": 4.258104738154613, "grad_norm": 0.00021762358665000647, "learning_rate": 2.976059850374065e-06, "loss": 0.0022, "step": 34150 }, { "epoch": 4.259351620947631, "grad_norm": 0.0008794396417215466, "learning_rate": 2.9710723192019952e-06, "loss": 0.0, "step": 34160 }, { "epoch": 4.260598503740648, "grad_norm": 0.0002460590039845556, "learning_rate": 2.9660847880299254e-06, "loss": 0.0, "step": 34170 }, { "epoch": 4.261845386533666, "grad_norm": 0.0002297249884577468, "learning_rate": 2.9610972568578556e-06, "loss": 0.0013, "step": 34180 }, { "epoch": 4.263092269326683, "grad_norm": 0.2807936668395996, "learning_rate": 2.956109725685786e-06, "loss": 0.0001, "step": 34190 }, { "epoch": 4.2643391521197005, "grad_norm": 0.0002086867461912334, "learning_rate": 2.951122194513716e-06, "loss": 0.0, "step": 34200 }, { "epoch": 4.265586034912718, "grad_norm": 0.00010574016778264195, "learning_rate": 2.946134663341646e-06, "loss": 0.0, "step": 34210 }, { "epoch": 4.266832917705735, "grad_norm": 0.0005821581580676138, "learning_rate": 2.941147132169576e-06, "loss": 0.0055, "step": 34220 }, { "epoch": 4.268079800498753, "grad_norm": 0.08548212796449661, "learning_rate": 2.936159600997506e-06, "loss": 0.0001, "step": 34230 }, { "epoch": 4.26932668329177, "grad_norm": 0.0003080704773310572, "learning_rate": 2.9311720698254364e-06, "loss": 0.0, "step": 34240 }, { "epoch": 4.270573566084788, "grad_norm": 0.024417024105787277, "learning_rate": 2.9261845386533666e-06, "loss": 0.0, "step": 34250 }, { "epoch": 4.271820448877805, "grad_norm": 0.00023198517737910151, "learning_rate": 2.9211970074812967e-06, "loss": 0.0352, "step": 34260 }, { "epoch": 4.2730673316708225, "grad_norm": 0.0014956368831917644, "learning_rate": 2.916209476309227e-06, "loss": 0.0, "step": 34270 }, { "epoch": 4.274314214463841, "grad_norm": 0.000349792797351256, "learning_rate": 2.9112219451371576e-06, "loss": 0.0, "step": 34280 }, { "epoch": 4.275561097256858, "grad_norm": 0.0010751100489869714, "learning_rate": 2.9062344139650878e-06, "loss": 0.0, "step": 34290 }, { "epoch": 4.276807980049876, "grad_norm": 0.00015223190712276846, "learning_rate": 2.901246882793018e-06, "loss": 0.0, "step": 34300 }, { "epoch": 4.278054862842893, "grad_norm": 0.00022834049013908952, "learning_rate": 2.896259351620948e-06, "loss": 0.0, "step": 34310 }, { "epoch": 4.279301745635911, "grad_norm": 0.0009195749298669398, "learning_rate": 2.8912718204488783e-06, "loss": 0.0, "step": 34320 }, { "epoch": 4.280548628428928, "grad_norm": 0.00018121050379704684, "learning_rate": 2.8862842892768085e-06, "loss": 0.0, "step": 34330 }, { "epoch": 4.2817955112219455, "grad_norm": 0.003172141034156084, "learning_rate": 2.8812967581047383e-06, "loss": 0.0, "step": 34340 }, { "epoch": 4.283042394014963, "grad_norm": 0.00037091257399879396, "learning_rate": 2.8763092269326685e-06, "loss": 0.0, "step": 34350 }, { "epoch": 4.28428927680798, "grad_norm": 0.0002679352182894945, "learning_rate": 2.8713216957605987e-06, "loss": 0.0, "step": 34360 }, { "epoch": 4.285536159600998, "grad_norm": 0.00014105028822086751, "learning_rate": 2.866334164588529e-06, "loss": 0.0, "step": 34370 }, { "epoch": 4.286783042394015, "grad_norm": 0.0001462107029510662, "learning_rate": 2.861346633416459e-06, "loss": 0.0, "step": 34380 }, { "epoch": 4.288029925187033, "grad_norm": 0.007252872921526432, "learning_rate": 2.8563591022443893e-06, "loss": 0.0032, "step": 34390 }, { "epoch": 4.28927680798005, "grad_norm": 8.567462646169588e-05, "learning_rate": 2.8513715710723195e-06, "loss": 0.0085, "step": 34400 }, { "epoch": 4.2905236907730675, "grad_norm": 6.732952897436917e-05, "learning_rate": 2.8463840399002497e-06, "loss": 0.0, "step": 34410 }, { "epoch": 4.291770573566085, "grad_norm": 22.605783462524414, "learning_rate": 2.84139650872818e-06, "loss": 0.0234, "step": 34420 }, { "epoch": 4.293017456359102, "grad_norm": 0.00013546543777920306, "learning_rate": 2.83640897755611e-06, "loss": 0.0, "step": 34430 }, { "epoch": 4.29426433915212, "grad_norm": 0.0021784852724522352, "learning_rate": 2.83142144638404e-06, "loss": 0.0, "step": 34440 }, { "epoch": 4.295511221945137, "grad_norm": 0.0005654957494698465, "learning_rate": 2.82643391521197e-06, "loss": 0.026, "step": 34450 }, { "epoch": 4.296758104738155, "grad_norm": 0.00012377439998090267, "learning_rate": 2.8214463840399002e-06, "loss": 0.024, "step": 34460 }, { "epoch": 4.298004987531172, "grad_norm": 0.00014131332864053547, "learning_rate": 2.8164588528678304e-06, "loss": 0.0, "step": 34470 }, { "epoch": 4.2992518703241895, "grad_norm": 0.0001872741268016398, "learning_rate": 2.8114713216957606e-06, "loss": 0.0, "step": 34480 }, { "epoch": 4.300498753117207, "grad_norm": 0.000619906117208302, "learning_rate": 2.806483790523691e-06, "loss": 0.0158, "step": 34490 }, { "epoch": 4.301745635910224, "grad_norm": 0.008459209464490414, "learning_rate": 2.801496259351621e-06, "loss": 0.0001, "step": 34500 }, { "epoch": 4.302992518703242, "grad_norm": 0.0034343558363616467, "learning_rate": 2.7965087281795512e-06, "loss": 0.0, "step": 34510 }, { "epoch": 4.304239401496259, "grad_norm": 0.0003617657348513603, "learning_rate": 2.7915211970074814e-06, "loss": 0.0335, "step": 34520 }, { "epoch": 4.305486284289277, "grad_norm": 0.00014825259859208018, "learning_rate": 2.786533665835412e-06, "loss": 0.0001, "step": 34530 }, { "epoch": 4.306733167082294, "grad_norm": 0.00040610635187476873, "learning_rate": 2.7815461346633422e-06, "loss": 0.0, "step": 34540 }, { "epoch": 4.307980049875312, "grad_norm": 0.0001523315440863371, "learning_rate": 2.7765586034912724e-06, "loss": 0.0, "step": 34550 }, { "epoch": 4.309226932668329, "grad_norm": 0.00016806507483124733, "learning_rate": 2.7715710723192026e-06, "loss": 0.0, "step": 34560 }, { "epoch": 4.3104738154613464, "grad_norm": 0.000623812957201153, "learning_rate": 2.7665835411471324e-06, "loss": 0.0, "step": 34570 }, { "epoch": 4.311720698254364, "grad_norm": 0.0008510535699315369, "learning_rate": 2.7615960099750626e-06, "loss": 0.0286, "step": 34580 }, { "epoch": 4.312967581047381, "grad_norm": 0.0007971362792886794, "learning_rate": 2.756608478802993e-06, "loss": 0.0, "step": 34590 }, { "epoch": 4.314214463840399, "grad_norm": 0.0081477090716362, "learning_rate": 2.751620947630923e-06, "loss": 0.0001, "step": 34600 }, { "epoch": 4.315461346633416, "grad_norm": 0.00035156356170773506, "learning_rate": 2.746633416458853e-06, "loss": 0.0, "step": 34610 }, { "epoch": 4.316708229426434, "grad_norm": 0.0002370004658587277, "learning_rate": 2.7416458852867834e-06, "loss": 0.0, "step": 34620 }, { "epoch": 4.317955112219451, "grad_norm": 0.0005394717445597053, "learning_rate": 2.7366583541147136e-06, "loss": 0.0, "step": 34630 }, { "epoch": 4.3192019950124685, "grad_norm": 4.46143421868328e-05, "learning_rate": 2.7316708229426438e-06, "loss": 0.0, "step": 34640 }, { "epoch": 4.320448877805486, "grad_norm": 5.566642357734963e-05, "learning_rate": 2.726683291770574e-06, "loss": 0.0163, "step": 34650 }, { "epoch": 4.321695760598503, "grad_norm": 0.00012989880633540452, "learning_rate": 2.7216957605985037e-06, "loss": 0.0, "step": 34660 }, { "epoch": 4.322942643391521, "grad_norm": 0.00028789890347979963, "learning_rate": 2.716708229426434e-06, "loss": 0.0103, "step": 34670 }, { "epoch": 4.324189526184538, "grad_norm": 0.00021803946583531797, "learning_rate": 2.711720698254364e-06, "loss": 0.0, "step": 34680 }, { "epoch": 4.325436408977556, "grad_norm": 0.0003036449197679758, "learning_rate": 2.7067331670822943e-06, "loss": 0.0, "step": 34690 }, { "epoch": 4.326683291770574, "grad_norm": 0.0001597816008143127, "learning_rate": 2.7017456359102245e-06, "loss": 0.0, "step": 34700 }, { "epoch": 4.327930174563591, "grad_norm": 51.117496490478516, "learning_rate": 2.6967581047381547e-06, "loss": 0.0202, "step": 34710 }, { "epoch": 4.329177057356609, "grad_norm": 0.00017285061767324805, "learning_rate": 2.691770573566085e-06, "loss": 0.0, "step": 34720 }, { "epoch": 4.330423940149626, "grad_norm": 0.006054338067770004, "learning_rate": 2.686783042394015e-06, "loss": 0.0, "step": 34730 }, { "epoch": 4.331670822942644, "grad_norm": 0.0001271928776986897, "learning_rate": 2.6817955112219453e-06, "loss": 0.0, "step": 34740 }, { "epoch": 4.332917705735661, "grad_norm": 0.0003265069390181452, "learning_rate": 2.6768079800498755e-06, "loss": 0.0, "step": 34750 }, { "epoch": 4.334164588528679, "grad_norm": 0.00024125789059326053, "learning_rate": 2.6718204488778053e-06, "loss": 0.0135, "step": 34760 }, { "epoch": 4.335411471321696, "grad_norm": 0.00010157257929677144, "learning_rate": 2.6668329177057355e-06, "loss": 0.0, "step": 34770 }, { "epoch": 4.3366583541147135, "grad_norm": 0.0003305143618490547, "learning_rate": 2.6618453865336665e-06, "loss": 0.0, "step": 34780 }, { "epoch": 4.337905236907731, "grad_norm": 0.00010430354450363666, "learning_rate": 2.6568578553615963e-06, "loss": 0.0, "step": 34790 }, { "epoch": 4.339152119700748, "grad_norm": 0.0006443153251893818, "learning_rate": 2.6518703241895265e-06, "loss": 0.0001, "step": 34800 }, { "epoch": 4.340399002493766, "grad_norm": 0.00023026694543659687, "learning_rate": 2.6468827930174567e-06, "loss": 0.0001, "step": 34810 }, { "epoch": 4.341645885286783, "grad_norm": 0.0033188401721417904, "learning_rate": 2.641895261845387e-06, "loss": 0.0001, "step": 34820 }, { "epoch": 4.342892768079801, "grad_norm": 0.00031385180773213506, "learning_rate": 2.636907730673317e-06, "loss": 0.0, "step": 34830 }, { "epoch": 4.344139650872818, "grad_norm": 0.00039108918281272054, "learning_rate": 2.6319201995012473e-06, "loss": 0.0, "step": 34840 }, { "epoch": 4.3453865336658355, "grad_norm": 0.00011719971371348947, "learning_rate": 2.6269326683291775e-06, "loss": 0.0, "step": 34850 }, { "epoch": 4.346633416458853, "grad_norm": 5.473527198773809e-05, "learning_rate": 2.6219451371571077e-06, "loss": 0.0001, "step": 34860 }, { "epoch": 4.34788029925187, "grad_norm": 0.00020321720512583852, "learning_rate": 2.616957605985038e-06, "loss": 0.0, "step": 34870 }, { "epoch": 4.349127182044888, "grad_norm": 0.0006901020533405244, "learning_rate": 2.6119700748129676e-06, "loss": 0.0, "step": 34880 }, { "epoch": 4.350374064837905, "grad_norm": 0.0014119262341409922, "learning_rate": 2.606982543640898e-06, "loss": 0.0, "step": 34890 }, { "epoch": 4.351620947630923, "grad_norm": 0.0002134747483069077, "learning_rate": 2.601995012468828e-06, "loss": 0.0, "step": 34900 }, { "epoch": 4.35286783042394, "grad_norm": 0.00012892240192741156, "learning_rate": 2.597007481296758e-06, "loss": 0.0, "step": 34910 }, { "epoch": 4.3541147132169575, "grad_norm": 0.00012061430606991053, "learning_rate": 2.5920199501246884e-06, "loss": 0.0023, "step": 34920 }, { "epoch": 4.355361596009975, "grad_norm": 0.00020210719958413392, "learning_rate": 2.5870324189526186e-06, "loss": 0.0, "step": 34930 }, { "epoch": 4.356608478802992, "grad_norm": 0.001012521330267191, "learning_rate": 2.582044887780549e-06, "loss": 0.0039, "step": 34940 }, { "epoch": 4.35785536159601, "grad_norm": 0.00021262998052407056, "learning_rate": 2.577057356608479e-06, "loss": 0.023, "step": 34950 }, { "epoch": 4.359102244389027, "grad_norm": 0.000502545852214098, "learning_rate": 2.572069825436409e-06, "loss": 0.0, "step": 34960 }, { "epoch": 4.360349127182045, "grad_norm": 6.385482993209735e-05, "learning_rate": 2.5670822942643394e-06, "loss": 0.0, "step": 34970 }, { "epoch": 4.361596009975062, "grad_norm": 7.826248474884778e-05, "learning_rate": 2.562094763092269e-06, "loss": 0.0, "step": 34980 }, { "epoch": 4.36284289276808, "grad_norm": 5.5803859140723944e-05, "learning_rate": 2.5571072319201994e-06, "loss": 0.0525, "step": 34990 }, { "epoch": 4.364089775561097, "grad_norm": 4.6033244871068746e-05, "learning_rate": 2.5521197007481296e-06, "loss": 0.0, "step": 35000 }, { "epoch": 4.365336658354114, "grad_norm": 0.00029482910758815706, "learning_rate": 2.5471321695760597e-06, "loss": 0.0, "step": 35010 }, { "epoch": 4.366583541147132, "grad_norm": 0.00016616334323771298, "learning_rate": 2.5421446384039904e-06, "loss": 0.0, "step": 35020 }, { "epoch": 4.367830423940149, "grad_norm": 0.00011380357318557799, "learning_rate": 2.5371571072319206e-06, "loss": 0.0, "step": 35030 }, { "epoch": 4.369077306733167, "grad_norm": 7.7957367466297e-05, "learning_rate": 2.5321695760598508e-06, "loss": 0.0, "step": 35040 }, { "epoch": 4.370324189526184, "grad_norm": 0.0003632204607129097, "learning_rate": 2.527182044887781e-06, "loss": 0.0045, "step": 35050 }, { "epoch": 4.371571072319202, "grad_norm": 0.1488853245973587, "learning_rate": 2.522194513715711e-06, "loss": 0.0175, "step": 35060 }, { "epoch": 4.372817955112219, "grad_norm": 9.009828499983996e-05, "learning_rate": 2.5172069825436413e-06, "loss": 0.0, "step": 35070 }, { "epoch": 4.374064837905237, "grad_norm": 0.06685556471347809, "learning_rate": 2.5122194513715715e-06, "loss": 0.0, "step": 35080 }, { "epoch": 4.375311720698255, "grad_norm": 0.0001171770563814789, "learning_rate": 2.5072319201995017e-06, "loss": 0.0, "step": 35090 }, { "epoch": 4.376558603491272, "grad_norm": 0.00015780334069859236, "learning_rate": 2.5022443890274315e-06, "loss": 0.0, "step": 35100 }, { "epoch": 4.37780548628429, "grad_norm": 0.0005297464085742831, "learning_rate": 2.4972568578553617e-06, "loss": 0.0001, "step": 35110 }, { "epoch": 4.379052369077307, "grad_norm": 0.0001470494898967445, "learning_rate": 2.492269326683292e-06, "loss": 0.0, "step": 35120 }, { "epoch": 4.3802992518703245, "grad_norm": 0.008314243517816067, "learning_rate": 2.487281795511222e-06, "loss": 0.0, "step": 35130 }, { "epoch": 4.381546134663342, "grad_norm": 9.446181502426043e-05, "learning_rate": 2.4822942643391523e-06, "loss": 0.0, "step": 35140 }, { "epoch": 4.382793017456359, "grad_norm": 7.607245788676664e-05, "learning_rate": 2.4773067331670825e-06, "loss": 0.0, "step": 35150 }, { "epoch": 4.384039900249377, "grad_norm": 0.0001715715043246746, "learning_rate": 2.4723192019950127e-06, "loss": 0.0, "step": 35160 }, { "epoch": 4.385286783042394, "grad_norm": 0.0006570697296410799, "learning_rate": 2.467331670822943e-06, "loss": 0.0, "step": 35170 }, { "epoch": 4.386533665835412, "grad_norm": 0.0072289216332137585, "learning_rate": 2.462344139650873e-06, "loss": 0.0, "step": 35180 }, { "epoch": 4.387780548628429, "grad_norm": 0.00012217392213642597, "learning_rate": 2.4573566084788033e-06, "loss": 0.0, "step": 35190 }, { "epoch": 4.389027431421447, "grad_norm": 0.00018129154341295362, "learning_rate": 2.452369077306733e-06, "loss": 0.0, "step": 35200 }, { "epoch": 4.390274314214464, "grad_norm": 0.0001524377439636737, "learning_rate": 2.4473815461346637e-06, "loss": 0.0, "step": 35210 }, { "epoch": 4.3915211970074814, "grad_norm": 0.00016739932470954955, "learning_rate": 2.442394014962594e-06, "loss": 0.0, "step": 35220 }, { "epoch": 4.392768079800499, "grad_norm": 9.303903061663732e-05, "learning_rate": 2.437406483790524e-06, "loss": 0.0001, "step": 35230 }, { "epoch": 4.394014962593516, "grad_norm": 0.06675028055906296, "learning_rate": 2.4324189526184543e-06, "loss": 0.0, "step": 35240 }, { "epoch": 4.395261845386534, "grad_norm": 0.00039317607297562063, "learning_rate": 2.4274314214463844e-06, "loss": 0.0, "step": 35250 }, { "epoch": 4.396508728179551, "grad_norm": 6.668036803603172e-05, "learning_rate": 2.4224438902743142e-06, "loss": 0.0, "step": 35260 }, { "epoch": 4.397755610972569, "grad_norm": 0.002230043290182948, "learning_rate": 2.4174563591022444e-06, "loss": 0.0005, "step": 35270 }, { "epoch": 4.399002493765586, "grad_norm": 0.00021028569608461112, "learning_rate": 2.4124688279301746e-06, "loss": 0.0, "step": 35280 }, { "epoch": 4.4002493765586035, "grad_norm": 6.0643418692052364e-05, "learning_rate": 2.407481296758105e-06, "loss": 0.001, "step": 35290 }, { "epoch": 4.401496259351621, "grad_norm": 0.00025467347586527467, "learning_rate": 2.402493765586035e-06, "loss": 0.0, "step": 35300 }, { "epoch": 4.402743142144638, "grad_norm": 0.0018303323304280639, "learning_rate": 2.397506234413965e-06, "loss": 0.0, "step": 35310 }, { "epoch": 4.403990024937656, "grad_norm": 0.0001063878953573294, "learning_rate": 2.3925187032418954e-06, "loss": 0.0, "step": 35320 }, { "epoch": 4.405236907730673, "grad_norm": 6.145804218249395e-05, "learning_rate": 2.3875311720698256e-06, "loss": 0.0624, "step": 35330 }, { "epoch": 4.406483790523691, "grad_norm": 0.0006972160190343857, "learning_rate": 2.382543640897756e-06, "loss": 0.0, "step": 35340 }, { "epoch": 4.407730673316708, "grad_norm": 0.00030007187160663307, "learning_rate": 2.377556109725686e-06, "loss": 0.012, "step": 35350 }, { "epoch": 4.4089775561097255, "grad_norm": 0.0004714836541097611, "learning_rate": 2.372568578553616e-06, "loss": 0.0, "step": 35360 }, { "epoch": 4.410224438902743, "grad_norm": 8.451470785075799e-05, "learning_rate": 2.3675810473815464e-06, "loss": 0.0, "step": 35370 }, { "epoch": 4.41147132169576, "grad_norm": 0.00015619279292877764, "learning_rate": 2.3625935162094766e-06, "loss": 0.0, "step": 35380 }, { "epoch": 4.412718204488778, "grad_norm": 6.922364264028147e-05, "learning_rate": 2.3576059850374068e-06, "loss": 0.0, "step": 35390 }, { "epoch": 4.413965087281795, "grad_norm": 0.0002487407182343304, "learning_rate": 2.352618453865337e-06, "loss": 0.0, "step": 35400 }, { "epoch": 4.415211970074813, "grad_norm": 0.0003662613744381815, "learning_rate": 2.347630922693267e-06, "loss": 0.0, "step": 35410 }, { "epoch": 4.41645885286783, "grad_norm": 5.250598042039201e-05, "learning_rate": 2.342643391521197e-06, "loss": 0.0, "step": 35420 }, { "epoch": 4.417705735660848, "grad_norm": 0.00037984587834216654, "learning_rate": 2.337655860349127e-06, "loss": 0.0, "step": 35430 }, { "epoch": 4.418952618453865, "grad_norm": 0.0001668424520175904, "learning_rate": 2.3326683291770573e-06, "loss": 0.0, "step": 35440 }, { "epoch": 4.420199501246882, "grad_norm": 0.00022639970120508224, "learning_rate": 2.3276807980049875e-06, "loss": 0.0006, "step": 35450 }, { "epoch": 4.4214463840399, "grad_norm": 0.00010051777644548565, "learning_rate": 2.322693266832918e-06, "loss": 0.0, "step": 35460 }, { "epoch": 4.422693266832917, "grad_norm": 7.006821397226304e-05, "learning_rate": 2.3177057356608483e-06, "loss": 0.0, "step": 35470 }, { "epoch": 4.423940149625935, "grad_norm": 0.0008471541223116219, "learning_rate": 2.312718204488778e-06, "loss": 0.0, "step": 35480 }, { "epoch": 4.425187032418952, "grad_norm": 5.219184822635725e-05, "learning_rate": 2.3077306733167083e-06, "loss": 0.0, "step": 35490 }, { "epoch": 4.42643391521197, "grad_norm": 0.00016879831673577428, "learning_rate": 2.3027431421446385e-06, "loss": 0.0444, "step": 35500 }, { "epoch": 4.427680798004988, "grad_norm": 0.011657807044684887, "learning_rate": 2.2977556109725687e-06, "loss": 0.0, "step": 35510 }, { "epoch": 4.428927680798005, "grad_norm": 0.002720105927437544, "learning_rate": 2.292768079800499e-06, "loss": 0.0, "step": 35520 }, { "epoch": 4.430174563591023, "grad_norm": 3.922603355022147e-05, "learning_rate": 2.287780548628429e-06, "loss": 0.0, "step": 35530 }, { "epoch": 4.43142144638404, "grad_norm": 0.0001942600356414914, "learning_rate": 2.2827930174563593e-06, "loss": 0.0, "step": 35540 }, { "epoch": 4.432668329177058, "grad_norm": 0.0001732075761537999, "learning_rate": 2.2778054862842895e-06, "loss": 0.0083, "step": 35550 }, { "epoch": 4.433915211970075, "grad_norm": 4.679305857280269e-05, "learning_rate": 2.2728179551122197e-06, "loss": 0.0, "step": 35560 }, { "epoch": 4.4351620947630925, "grad_norm": 0.00015754872583784163, "learning_rate": 2.26783042394015e-06, "loss": 0.0, "step": 35570 }, { "epoch": 4.43640897755611, "grad_norm": 5.866608626092784e-05, "learning_rate": 2.26284289276808e-06, "loss": 0.0008, "step": 35580 }, { "epoch": 4.437655860349127, "grad_norm": 6.611185381188989e-05, "learning_rate": 2.2578553615960103e-06, "loss": 0.0, "step": 35590 }, { "epoch": 4.438902743142145, "grad_norm": 5.2472081733867526e-05, "learning_rate": 2.2528678304239405e-06, "loss": 0.0144, "step": 35600 }, { "epoch": 4.440149625935162, "grad_norm": 0.00015604296640958637, "learning_rate": 2.2478802992518707e-06, "loss": 0.0, "step": 35610 }, { "epoch": 4.44139650872818, "grad_norm": 0.0003281154204159975, "learning_rate": 2.242892768079801e-06, "loss": 0.0, "step": 35620 }, { "epoch": 4.442643391521197, "grad_norm": 0.00010192135232500732, "learning_rate": 2.237905236907731e-06, "loss": 0.0269, "step": 35630 }, { "epoch": 4.443890274314215, "grad_norm": 9.66684747254476e-05, "learning_rate": 2.232917705735661e-06, "loss": 0.0, "step": 35640 }, { "epoch": 4.445137157107232, "grad_norm": 0.0003734457422979176, "learning_rate": 2.227930174563591e-06, "loss": 0.0, "step": 35650 }, { "epoch": 4.446384039900249, "grad_norm": 7.611950422869995e-05, "learning_rate": 2.222942643391521e-06, "loss": 0.0, "step": 35660 }, { "epoch": 4.447630922693267, "grad_norm": 0.0004907181719318032, "learning_rate": 2.2179551122194514e-06, "loss": 0.0, "step": 35670 }, { "epoch": 4.448877805486284, "grad_norm": 0.0001631807826925069, "learning_rate": 2.2129675810473816e-06, "loss": 0.0, "step": 35680 }, { "epoch": 4.450124688279302, "grad_norm": 9.863793820841238e-05, "learning_rate": 2.207980049875312e-06, "loss": 0.0, "step": 35690 }, { "epoch": 4.451371571072319, "grad_norm": 0.0002982678124681115, "learning_rate": 2.2029925187032424e-06, "loss": 0.0008, "step": 35700 }, { "epoch": 4.452618453865337, "grad_norm": 0.002366115804761648, "learning_rate": 2.198004987531172e-06, "loss": 0.0, "step": 35710 }, { "epoch": 4.453865336658354, "grad_norm": 0.09239999949932098, "learning_rate": 2.1930174563591024e-06, "loss": 0.0487, "step": 35720 }, { "epoch": 4.4551122194513715, "grad_norm": 9.49343666434288e-05, "learning_rate": 2.1880299251870326e-06, "loss": 0.0, "step": 35730 }, { "epoch": 4.456359102244389, "grad_norm": 0.0003900736046489328, "learning_rate": 2.1830423940149628e-06, "loss": 0.0, "step": 35740 }, { "epoch": 4.457605985037406, "grad_norm": 9.119904279941693e-05, "learning_rate": 2.178054862842893e-06, "loss": 0.0, "step": 35750 }, { "epoch": 4.458852867830424, "grad_norm": 4.107060158275999e-05, "learning_rate": 2.173067331670823e-06, "loss": 0.0015, "step": 35760 }, { "epoch": 4.460099750623441, "grad_norm": 8.932932541938499e-05, "learning_rate": 2.1680798004987534e-06, "loss": 0.0, "step": 35770 }, { "epoch": 4.461346633416459, "grad_norm": 9.386024612467736e-05, "learning_rate": 2.1630922693266836e-06, "loss": 0.0, "step": 35780 }, { "epoch": 4.462593516209476, "grad_norm": 0.0001702239242149517, "learning_rate": 2.1581047381546138e-06, "loss": 0.0326, "step": 35790 }, { "epoch": 4.4638403990024935, "grad_norm": 0.00030721109942533076, "learning_rate": 2.1531172069825435e-06, "loss": 0.0, "step": 35800 }, { "epoch": 4.465087281795511, "grad_norm": 0.00013089546700939536, "learning_rate": 2.1481296758104737e-06, "loss": 0.0, "step": 35810 }, { "epoch": 4.466334164588528, "grad_norm": 4.946505214320496e-05, "learning_rate": 2.143142144638404e-06, "loss": 0.004, "step": 35820 }, { "epoch": 4.467581047381546, "grad_norm": 0.0005132012884132564, "learning_rate": 2.1381546134663345e-06, "loss": 0.0003, "step": 35830 }, { "epoch": 4.468827930174563, "grad_norm": 0.00019803202303592116, "learning_rate": 2.1331670822942647e-06, "loss": 0.0, "step": 35840 }, { "epoch": 4.470074812967581, "grad_norm": 0.0005909769097343087, "learning_rate": 2.128179551122195e-06, "loss": 0.0, "step": 35850 }, { "epoch": 4.471321695760598, "grad_norm": 0.0003146138333249837, "learning_rate": 2.1231920199501247e-06, "loss": 0.0, "step": 35860 }, { "epoch": 4.472568578553616, "grad_norm": 0.00033271979191340506, "learning_rate": 2.118204488778055e-06, "loss": 0.0, "step": 35870 }, { "epoch": 4.473815461346634, "grad_norm": 23.879375457763672, "learning_rate": 2.113216957605985e-06, "loss": 0.0013, "step": 35880 }, { "epoch": 4.475062344139651, "grad_norm": 0.0009702108800411224, "learning_rate": 2.1082294264339153e-06, "loss": 0.0, "step": 35890 }, { "epoch": 4.476309226932669, "grad_norm": 0.00046513532288372517, "learning_rate": 2.1032418952618455e-06, "loss": 0.0, "step": 35900 }, { "epoch": 4.477556109725686, "grad_norm": 0.00018407838069833815, "learning_rate": 2.0982543640897757e-06, "loss": 0.0, "step": 35910 }, { "epoch": 4.478802992518704, "grad_norm": 0.00024080314324237406, "learning_rate": 2.093266832917706e-06, "loss": 0.0051, "step": 35920 }, { "epoch": 4.480049875311721, "grad_norm": 0.00024162052432075143, "learning_rate": 2.088279301745636e-06, "loss": 0.0, "step": 35930 }, { "epoch": 4.4812967581047385, "grad_norm": 5.4415671911556274e-05, "learning_rate": 2.0832917705735663e-06, "loss": 0.0, "step": 35940 }, { "epoch": 4.482543640897756, "grad_norm": 9.320250683231279e-05, "learning_rate": 2.0783042394014965e-06, "loss": 0.0, "step": 35950 }, { "epoch": 4.483790523690773, "grad_norm": 7.990971789695323e-05, "learning_rate": 2.0733167082294267e-06, "loss": 0.0, "step": 35960 }, { "epoch": 4.485037406483791, "grad_norm": 9.405690798303112e-05, "learning_rate": 2.068329177057357e-06, "loss": 0.0, "step": 35970 }, { "epoch": 4.486284289276808, "grad_norm": 0.03336402401328087, "learning_rate": 2.063341645885287e-06, "loss": 0.0, "step": 35980 }, { "epoch": 4.487531172069826, "grad_norm": 0.0044538709335029125, "learning_rate": 2.0583541147132173e-06, "loss": 0.0045, "step": 35990 }, { "epoch": 4.488778054862843, "grad_norm": 5.0544094847282395e-05, "learning_rate": 2.0533665835411474e-06, "loss": 0.0, "step": 36000 }, { "epoch": 4.4900249376558605, "grad_norm": 9.530440001981333e-05, "learning_rate": 2.0483790523690776e-06, "loss": 0.025, "step": 36010 }, { "epoch": 4.491271820448878, "grad_norm": 0.0016637337394058704, "learning_rate": 2.0433915211970074e-06, "loss": 0.0, "step": 36020 }, { "epoch": 4.492518703241895, "grad_norm": 0.0019435989670455456, "learning_rate": 2.0384039900249376e-06, "loss": 0.0001, "step": 36030 }, { "epoch": 4.493765586034913, "grad_norm": 0.00041165429865941405, "learning_rate": 2.033416458852868e-06, "loss": 0.0409, "step": 36040 }, { "epoch": 4.49501246882793, "grad_norm": 0.00011268148227827623, "learning_rate": 2.028428927680798e-06, "loss": 0.0106, "step": 36050 }, { "epoch": 4.496259351620948, "grad_norm": 0.0026401542127132416, "learning_rate": 2.023441396508728e-06, "loss": 0.0004, "step": 36060 }, { "epoch": 4.497506234413965, "grad_norm": 0.00018546386854723096, "learning_rate": 2.018453865336659e-06, "loss": 0.0, "step": 36070 }, { "epoch": 4.498753117206983, "grad_norm": 0.0018113835249096155, "learning_rate": 2.013466334164589e-06, "loss": 0.0, "step": 36080 }, { "epoch": 4.5, "grad_norm": 0.0028145157266408205, "learning_rate": 2.0084788029925188e-06, "loss": 0.0, "step": 36090 }, { "epoch": 4.501246882793017, "grad_norm": 0.0008215096895582974, "learning_rate": 2.003491271820449e-06, "loss": 0.0, "step": 36100 }, { "epoch": 4.502493765586035, "grad_norm": 9.587730892235413e-05, "learning_rate": 1.998503740648379e-06, "loss": 0.0, "step": 36110 }, { "epoch": 4.503740648379052, "grad_norm": 7.04293925082311e-05, "learning_rate": 1.9935162094763094e-06, "loss": 0.0, "step": 36120 }, { "epoch": 4.50498753117207, "grad_norm": 5.8869190979748964e-05, "learning_rate": 1.9885286783042396e-06, "loss": 0.0, "step": 36130 }, { "epoch": 4.506234413965087, "grad_norm": 0.0003809872141573578, "learning_rate": 1.9835411471321698e-06, "loss": 0.0, "step": 36140 }, { "epoch": 4.507481296758105, "grad_norm": 0.016797462478280067, "learning_rate": 1.9785536159601e-06, "loss": 0.0016, "step": 36150 }, { "epoch": 4.508728179551122, "grad_norm": 0.0005834007752127945, "learning_rate": 1.97356608478803e-06, "loss": 0.0, "step": 36160 }, { "epoch": 4.5099750623441395, "grad_norm": 0.000800897425506264, "learning_rate": 1.9685785536159604e-06, "loss": 0.0, "step": 36170 }, { "epoch": 4.511221945137157, "grad_norm": 0.00021014439698774368, "learning_rate": 1.96359102244389e-06, "loss": 0.0, "step": 36180 }, { "epoch": 4.512468827930174, "grad_norm": 3.088951052632183e-05, "learning_rate": 1.9586034912718203e-06, "loss": 0.0, "step": 36190 }, { "epoch": 4.513715710723192, "grad_norm": 0.00011189231736352667, "learning_rate": 1.953615960099751e-06, "loss": 0.0, "step": 36200 }, { "epoch": 4.514962593516209, "grad_norm": 0.00038580026011914015, "learning_rate": 1.948628428927681e-06, "loss": 0.0, "step": 36210 }, { "epoch": 4.516209476309227, "grad_norm": 0.0007074935710988939, "learning_rate": 1.9436408977556113e-06, "loss": 0.0, "step": 36220 }, { "epoch": 4.517456359102244, "grad_norm": 0.00019117812917102128, "learning_rate": 1.9386533665835415e-06, "loss": 0.0007, "step": 36230 }, { "epoch": 4.5187032418952615, "grad_norm": 5.8276415074942634e-05, "learning_rate": 1.9336658354114713e-06, "loss": 0.0, "step": 36240 }, { "epoch": 4.519950124688279, "grad_norm": 5.8334331697551534e-05, "learning_rate": 1.9286783042394015e-06, "loss": 0.0, "step": 36250 }, { "epoch": 4.521197007481296, "grad_norm": 7.67667661421001e-05, "learning_rate": 1.9236907730673317e-06, "loss": 0.0001, "step": 36260 }, { "epoch": 4.522443890274314, "grad_norm": 0.0006992538692429662, "learning_rate": 1.918703241895262e-06, "loss": 0.0, "step": 36270 }, { "epoch": 4.523690773067331, "grad_norm": 6.20846840320155e-05, "learning_rate": 1.913715710723192e-06, "loss": 0.0, "step": 36280 }, { "epoch": 4.524937655860349, "grad_norm": 0.00012284106924198568, "learning_rate": 1.9092269326683295e-06, "loss": 0.0059, "step": 36290 }, { "epoch": 4.526184538653366, "grad_norm": 4.695883035310544e-05, "learning_rate": 1.9042394014962595e-06, "loss": 0.0, "step": 36300 }, { "epoch": 4.5274314214463836, "grad_norm": 0.012509011663496494, "learning_rate": 1.8992518703241897e-06, "loss": 0.0, "step": 36310 }, { "epoch": 4.528678304239402, "grad_norm": 0.000993928057141602, "learning_rate": 1.8942643391521199e-06, "loss": 0.0, "step": 36320 }, { "epoch": 4.529925187032419, "grad_norm": 0.00010343554458813742, "learning_rate": 1.88927680798005e-06, "loss": 0.0, "step": 36330 }, { "epoch": 4.531172069825437, "grad_norm": 6.238223431864753e-05, "learning_rate": 1.88428927680798e-06, "loss": 0.0, "step": 36340 }, { "epoch": 4.532418952618454, "grad_norm": 7.380228635156527e-05, "learning_rate": 1.8793017456359102e-06, "loss": 0.0, "step": 36350 }, { "epoch": 4.533665835411472, "grad_norm": 0.000599766499362886, "learning_rate": 1.8743142144638404e-06, "loss": 0.0264, "step": 36360 }, { "epoch": 4.534912718204489, "grad_norm": 0.00018769617599900812, "learning_rate": 1.8693266832917708e-06, "loss": 0.0, "step": 36370 }, { "epoch": 4.5361596009975065, "grad_norm": 0.0028365144971758127, "learning_rate": 1.864339152119701e-06, "loss": 0.0, "step": 36380 }, { "epoch": 4.537406483790524, "grad_norm": 5.832207898492925e-05, "learning_rate": 1.8593516209476312e-06, "loss": 0.0, "step": 36390 }, { "epoch": 4.538653366583541, "grad_norm": 0.0001258625416085124, "learning_rate": 1.8543640897755614e-06, "loss": 0.0, "step": 36400 }, { "epoch": 4.539900249376559, "grad_norm": 0.0002180346637032926, "learning_rate": 1.8493765586034914e-06, "loss": 0.0, "step": 36410 }, { "epoch": 4.541147132169576, "grad_norm": 0.00015187938697636127, "learning_rate": 1.8443890274314216e-06, "loss": 0.0, "step": 36420 }, { "epoch": 4.542394014962594, "grad_norm": 0.0003344623255543411, "learning_rate": 1.8394014962593518e-06, "loss": 0.0007, "step": 36430 }, { "epoch": 4.543640897755611, "grad_norm": 0.0008474570931866765, "learning_rate": 1.834413965087282e-06, "loss": 0.0517, "step": 36440 }, { "epoch": 4.5448877805486285, "grad_norm": 7.383363117696717e-05, "learning_rate": 1.829426433915212e-06, "loss": 0.0, "step": 36450 }, { "epoch": 4.546134663341646, "grad_norm": 0.00031828609644435346, "learning_rate": 1.8244389027431422e-06, "loss": 0.0, "step": 36460 }, { "epoch": 4.547381546134663, "grad_norm": 0.0006407342152670026, "learning_rate": 1.8194513715710724e-06, "loss": 0.0, "step": 36470 }, { "epoch": 4.548628428927681, "grad_norm": 0.0001172191696241498, "learning_rate": 1.8144638403990026e-06, "loss": 0.0001, "step": 36480 }, { "epoch": 4.549875311720698, "grad_norm": 0.0008851775201037526, "learning_rate": 1.8094763092269328e-06, "loss": 0.0, "step": 36490 }, { "epoch": 4.551122194513716, "grad_norm": 0.00010299222049070522, "learning_rate": 1.8044887780548632e-06, "loss": 0.0, "step": 36500 }, { "epoch": 4.552369077306733, "grad_norm": 0.00030066914041526616, "learning_rate": 1.7995012468827934e-06, "loss": 0.0, "step": 36510 }, { "epoch": 4.553615960099751, "grad_norm": 0.0028322464786469936, "learning_rate": 1.7945137157107233e-06, "loss": 0.0, "step": 36520 }, { "epoch": 4.554862842892768, "grad_norm": 0.00024018692784011364, "learning_rate": 1.7895261845386535e-06, "loss": 0.006, "step": 36530 }, { "epoch": 4.556109725685785, "grad_norm": 7.3418123065494e-05, "learning_rate": 1.7845386533665837e-06, "loss": 0.0, "step": 36540 }, { "epoch": 4.557356608478803, "grad_norm": 63.39867401123047, "learning_rate": 1.779551122194514e-06, "loss": 0.0361, "step": 36550 }, { "epoch": 4.55860349127182, "grad_norm": 0.00010892859427258372, "learning_rate": 1.774563591022444e-06, "loss": 0.0, "step": 36560 }, { "epoch": 4.559850374064838, "grad_norm": 0.00015360671386588365, "learning_rate": 1.7695760598503741e-06, "loss": 0.0057, "step": 36570 }, { "epoch": 4.561097256857855, "grad_norm": 5.321098797139712e-05, "learning_rate": 1.7645885286783043e-06, "loss": 0.0157, "step": 36580 }, { "epoch": 4.562344139650873, "grad_norm": 8.898463420337066e-05, "learning_rate": 1.7596009975062345e-06, "loss": 0.0, "step": 36590 }, { "epoch": 4.56359102244389, "grad_norm": 0.0003251841408200562, "learning_rate": 1.7546134663341647e-06, "loss": 0.0329, "step": 36600 }, { "epoch": 4.5648379052369075, "grad_norm": 0.00042449383181519806, "learning_rate": 1.7496259351620947e-06, "loss": 0.0, "step": 36610 }, { "epoch": 4.566084788029925, "grad_norm": 0.005706509575247765, "learning_rate": 1.7446384039900253e-06, "loss": 0.0, "step": 36620 }, { "epoch": 4.567331670822942, "grad_norm": 7.963182724779472e-05, "learning_rate": 1.7396508728179553e-06, "loss": 0.0001, "step": 36630 }, { "epoch": 4.56857855361596, "grad_norm": 0.0028878054581582546, "learning_rate": 1.7346633416458855e-06, "loss": 0.0, "step": 36640 }, { "epoch": 4.569825436408977, "grad_norm": 0.0009514418779872358, "learning_rate": 1.7296758104738157e-06, "loss": 0.0, "step": 36650 }, { "epoch": 4.571072319201995, "grad_norm": 0.0022119847126305103, "learning_rate": 1.7246882793017459e-06, "loss": 0.0, "step": 36660 }, { "epoch": 4.572319201995013, "grad_norm": 0.0009768361924216151, "learning_rate": 1.719700748129676e-06, "loss": 0.0, "step": 36670 }, { "epoch": 4.57356608478803, "grad_norm": 6.927720096427947e-05, "learning_rate": 1.714713216957606e-06, "loss": 0.0, "step": 36680 }, { "epoch": 4.574812967581048, "grad_norm": 0.001001956406980753, "learning_rate": 1.7097256857855363e-06, "loss": 0.0, "step": 36690 }, { "epoch": 4.576059850374065, "grad_norm": 0.00015158270252868533, "learning_rate": 1.7047381546134664e-06, "loss": 0.0, "step": 36700 }, { "epoch": 4.577306733167083, "grad_norm": 0.0009215485770255327, "learning_rate": 1.6997506234413966e-06, "loss": 0.0, "step": 36710 }, { "epoch": 4.5785536159601, "grad_norm": 0.0002993463131133467, "learning_rate": 1.6947630922693266e-06, "loss": 0.0, "step": 36720 }, { "epoch": 4.579800498753118, "grad_norm": 0.000504608207847923, "learning_rate": 1.6897755610972568e-06, "loss": 0.0001, "step": 36730 }, { "epoch": 4.581047381546135, "grad_norm": 6.463566387537867e-05, "learning_rate": 1.6847880299251872e-06, "loss": 0.0001, "step": 36740 }, { "epoch": 4.582294264339152, "grad_norm": 5.9693622461054474e-05, "learning_rate": 1.6798004987531174e-06, "loss": 0.0, "step": 36750 }, { "epoch": 4.58354114713217, "grad_norm": 0.00012298337242100388, "learning_rate": 1.6748129675810476e-06, "loss": 0.0, "step": 36760 }, { "epoch": 4.584788029925187, "grad_norm": 0.0008290084660984576, "learning_rate": 1.6698254364089778e-06, "loss": 0.0741, "step": 36770 }, { "epoch": 4.586034912718205, "grad_norm": 0.0001284991012653336, "learning_rate": 1.664837905236908e-06, "loss": 0.0, "step": 36780 }, { "epoch": 4.587281795511222, "grad_norm": 0.00019413598056416959, "learning_rate": 1.659850374064838e-06, "loss": 0.0, "step": 36790 }, { "epoch": 4.58852867830424, "grad_norm": 0.00030527933267876506, "learning_rate": 1.6548628428927682e-06, "loss": 0.0, "step": 36800 }, { "epoch": 4.589775561097257, "grad_norm": 0.0001714515092317015, "learning_rate": 1.6498753117206984e-06, "loss": 0.0, "step": 36810 }, { "epoch": 4.5910224438902745, "grad_norm": 6.510254024760798e-05, "learning_rate": 1.6448877805486286e-06, "loss": 0.0, "step": 36820 }, { "epoch": 4.592269326683292, "grad_norm": 8.50441720103845e-05, "learning_rate": 1.6399002493765586e-06, "loss": 0.0, "step": 36830 }, { "epoch": 4.593516209476309, "grad_norm": 0.011633110232651234, "learning_rate": 1.6349127182044888e-06, "loss": 0.0, "step": 36840 }, { "epoch": 4.594763092269327, "grad_norm": 0.008615621365606785, "learning_rate": 1.629925187032419e-06, "loss": 0.0, "step": 36850 }, { "epoch": 4.596009975062344, "grad_norm": 0.00014177417324390262, "learning_rate": 1.6249376558603492e-06, "loss": 0.0286, "step": 36860 }, { "epoch": 4.597256857855362, "grad_norm": 8.73705284902826e-05, "learning_rate": 1.6199501246882796e-06, "loss": 0.0, "step": 36870 }, { "epoch": 4.598503740648379, "grad_norm": 0.000408838881412521, "learning_rate": 1.6149625935162098e-06, "loss": 0.0001, "step": 36880 }, { "epoch": 4.5997506234413965, "grad_norm": 5.491747651831247e-05, "learning_rate": 1.60997506234414e-06, "loss": 0.0397, "step": 36890 }, { "epoch": 4.600997506234414, "grad_norm": 5.760222120443359e-05, "learning_rate": 1.60498753117207e-06, "loss": 0.0, "step": 36900 }, { "epoch": 4.602244389027431, "grad_norm": 0.00023852268350310624, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "step": 36910 }, { "epoch": 4.603491271820449, "grad_norm": 0.00017992674838751554, "learning_rate": 1.5950124688279303e-06, "loss": 0.0, "step": 36920 }, { "epoch": 4.604738154613466, "grad_norm": 0.00035725522320717573, "learning_rate": 1.5900249376558605e-06, "loss": 0.0, "step": 36930 }, { "epoch": 4.605985037406484, "grad_norm": 0.0001332864776486531, "learning_rate": 1.5850374064837905e-06, "loss": 0.0253, "step": 36940 }, { "epoch": 4.607231920199501, "grad_norm": 0.00014862054376862943, "learning_rate": 1.5800498753117207e-06, "loss": 0.0, "step": 36950 }, { "epoch": 4.6084788029925186, "grad_norm": 8.171637455234304e-05, "learning_rate": 1.575062344139651e-06, "loss": 0.0, "step": 36960 }, { "epoch": 4.609725685785536, "grad_norm": 7.45083307265304e-05, "learning_rate": 1.570074812967581e-06, "loss": 0.0, "step": 36970 }, { "epoch": 4.610972568578553, "grad_norm": 7.226823072414845e-05, "learning_rate": 1.5650872817955113e-06, "loss": 0.027, "step": 36980 }, { "epoch": 4.612219451371571, "grad_norm": 8.009558951016515e-05, "learning_rate": 1.5600997506234417e-06, "loss": 0.0, "step": 36990 }, { "epoch": 4.613466334164588, "grad_norm": 0.00047921130317263305, "learning_rate": 1.555112219451372e-06, "loss": 0.0, "step": 37000 }, { "epoch": 4.614713216957606, "grad_norm": 0.00016305805183947086, "learning_rate": 1.5501246882793019e-06, "loss": 0.0004, "step": 37010 }, { "epoch": 4.615960099750623, "grad_norm": 0.0001415027945768088, "learning_rate": 1.545137157107232e-06, "loss": 0.0, "step": 37020 }, { "epoch": 4.617206982543641, "grad_norm": 0.00013072905130684376, "learning_rate": 1.5401496259351623e-06, "loss": 0.0, "step": 37030 }, { "epoch": 4.618453865336658, "grad_norm": 0.00042544634197838604, "learning_rate": 1.5351620947630925e-06, "loss": 0.0078, "step": 37040 }, { "epoch": 4.6197007481296755, "grad_norm": 8.365141547983512e-05, "learning_rate": 1.5301745635910227e-06, "loss": 0.0, "step": 37050 }, { "epoch": 4.620947630922693, "grad_norm": 0.0003506205976009369, "learning_rate": 1.5251870324189527e-06, "loss": 0.0, "step": 37060 }, { "epoch": 4.62219451371571, "grad_norm": 9.824489097809419e-05, "learning_rate": 1.5201995012468829e-06, "loss": 0.0, "step": 37070 }, { "epoch": 4.623441396508728, "grad_norm": 0.0017115233931690454, "learning_rate": 1.515211970074813e-06, "loss": 0.0, "step": 37080 }, { "epoch": 4.624688279301745, "grad_norm": 0.0024657100439071655, "learning_rate": 1.5102244389027432e-06, "loss": 0.0, "step": 37090 }, { "epoch": 4.625935162094763, "grad_norm": 0.0031729680486023426, "learning_rate": 1.5052369077306732e-06, "loss": 0.0, "step": 37100 }, { "epoch": 4.62718204488778, "grad_norm": 0.0001139767118729651, "learning_rate": 1.5002493765586034e-06, "loss": 0.0183, "step": 37110 }, { "epoch": 4.628428927680798, "grad_norm": 0.00011955766240134835, "learning_rate": 1.4952618453865338e-06, "loss": 0.0, "step": 37120 }, { "epoch": 4.629675810473816, "grad_norm": 9.51491019804962e-05, "learning_rate": 1.490274314214464e-06, "loss": 0.0025, "step": 37130 }, { "epoch": 4.630922693266833, "grad_norm": 6.892336386954412e-05, "learning_rate": 1.4852867830423942e-06, "loss": 0.0, "step": 37140 }, { "epoch": 4.632169576059851, "grad_norm": 0.000913655967451632, "learning_rate": 1.4802992518703244e-06, "loss": 0.0, "step": 37150 }, { "epoch": 4.633416458852868, "grad_norm": 0.00010036984895123169, "learning_rate": 1.4753117206982546e-06, "loss": 0.0, "step": 37160 }, { "epoch": 4.634663341645886, "grad_norm": 0.0002912537893280387, "learning_rate": 1.4703241895261846e-06, "loss": 0.0, "step": 37170 }, { "epoch": 4.635910224438903, "grad_norm": 0.00021481663861777633, "learning_rate": 1.4653366583541148e-06, "loss": 0.0288, "step": 37180 }, { "epoch": 4.63715710723192, "grad_norm": 0.00040280946996062994, "learning_rate": 1.460349127182045e-06, "loss": 0.0, "step": 37190 }, { "epoch": 4.638403990024938, "grad_norm": 9.865140600595623e-05, "learning_rate": 1.4553615960099752e-06, "loss": 0.0, "step": 37200 }, { "epoch": 4.639650872817955, "grad_norm": 0.00029368087416514754, "learning_rate": 1.4503740648379052e-06, "loss": 0.0, "step": 37210 }, { "epoch": 4.640897755610973, "grad_norm": 0.009541047737002373, "learning_rate": 1.4453865336658354e-06, "loss": 0.0, "step": 37220 }, { "epoch": 4.64214463840399, "grad_norm": 0.00014524144353345037, "learning_rate": 1.4403990024937656e-06, "loss": 0.0, "step": 37230 }, { "epoch": 4.643391521197008, "grad_norm": 8.261830225819722e-05, "learning_rate": 1.435411471321696e-06, "loss": 0.0, "step": 37240 }, { "epoch": 4.644638403990025, "grad_norm": 0.010629824362695217, "learning_rate": 1.4304239401496262e-06, "loss": 0.0, "step": 37250 }, { "epoch": 4.6458852867830425, "grad_norm": 0.0003399306442588568, "learning_rate": 1.4254364089775564e-06, "loss": 0.001, "step": 37260 }, { "epoch": 4.64713216957606, "grad_norm": 0.0002243259659735486, "learning_rate": 1.4204488778054866e-06, "loss": 0.0, "step": 37270 }, { "epoch": 4.648379052369077, "grad_norm": 8.796357724349946e-05, "learning_rate": 1.4154613466334165e-06, "loss": 0.0102, "step": 37280 }, { "epoch": 4.649625935162095, "grad_norm": 0.006956281140446663, "learning_rate": 1.4104738154613467e-06, "loss": 0.0, "step": 37290 }, { "epoch": 4.650872817955112, "grad_norm": 0.0003823429869953543, "learning_rate": 1.405486284289277e-06, "loss": 0.0, "step": 37300 }, { "epoch": 4.65211970074813, "grad_norm": 4.093679672223516e-05, "learning_rate": 1.4004987531172071e-06, "loss": 0.0, "step": 37310 }, { "epoch": 4.653366583541147, "grad_norm": 0.028871528804302216, "learning_rate": 1.3955112219451371e-06, "loss": 0.0, "step": 37320 }, { "epoch": 4.6546134663341645, "grad_norm": 0.0019481899216771126, "learning_rate": 1.3905236907730673e-06, "loss": 0.0, "step": 37330 }, { "epoch": 4.655860349127182, "grad_norm": 0.00046627031406387687, "learning_rate": 1.3855361596009975e-06, "loss": 0.0, "step": 37340 }, { "epoch": 4.657107231920199, "grad_norm": 0.00011581995931919664, "learning_rate": 1.3805486284289277e-06, "loss": 0.0001, "step": 37350 }, { "epoch": 4.658354114713217, "grad_norm": 0.00014464250125456601, "learning_rate": 1.3755610972568581e-06, "loss": 0.0, "step": 37360 }, { "epoch": 4.659600997506234, "grad_norm": 0.0001020960189634934, "learning_rate": 1.3705735660847883e-06, "loss": 0.0001, "step": 37370 }, { "epoch": 4.660847880299252, "grad_norm": 0.0036403797566890717, "learning_rate": 1.3655860349127185e-06, "loss": 0.0, "step": 37380 }, { "epoch": 4.662094763092269, "grad_norm": 7.654747605556622e-05, "learning_rate": 1.3605985037406485e-06, "loss": 0.0, "step": 37390 }, { "epoch": 4.6633416458852865, "grad_norm": 0.000111608904262539, "learning_rate": 1.3556109725685787e-06, "loss": 0.0105, "step": 37400 }, { "epoch": 4.664588528678304, "grad_norm": 0.000193897751159966, "learning_rate": 1.3506234413965089e-06, "loss": 0.0, "step": 37410 }, { "epoch": 4.665835411471321, "grad_norm": 0.001126578776165843, "learning_rate": 1.345635910224439e-06, "loss": 0.0, "step": 37420 }, { "epoch": 4.667082294264339, "grad_norm": 0.04877956211566925, "learning_rate": 1.3406483790523693e-06, "loss": 0.0, "step": 37430 }, { "epoch": 4.668329177057356, "grad_norm": 0.0032060265075415373, "learning_rate": 1.3356608478802993e-06, "loss": 0.0, "step": 37440 }, { "epoch": 4.669576059850374, "grad_norm": 0.00014094925427343696, "learning_rate": 1.3306733167082294e-06, "loss": 0.0, "step": 37450 }, { "epoch": 4.670822942643391, "grad_norm": 0.1798996776342392, "learning_rate": 1.3256857855361596e-06, "loss": 0.0, "step": 37460 }, { "epoch": 4.6720698254364095, "grad_norm": 0.002865089802071452, "learning_rate": 1.3206982543640898e-06, "loss": 0.0, "step": 37470 }, { "epoch": 4.673316708229427, "grad_norm": 0.00010365377966081724, "learning_rate": 1.3157107231920198e-06, "loss": 0.0, "step": 37480 }, { "epoch": 4.674563591022444, "grad_norm": 0.0003474355034995824, "learning_rate": 1.3107231920199504e-06, "loss": 0.0, "step": 37490 }, { "epoch": 4.675810473815462, "grad_norm": 0.016976043581962585, "learning_rate": 1.3057356608478804e-06, "loss": 0.0, "step": 37500 }, { "epoch": 4.677057356608479, "grad_norm": 0.00019507709657773376, "learning_rate": 1.3007481296758106e-06, "loss": 0.0, "step": 37510 }, { "epoch": 4.678304239401497, "grad_norm": 7.908452244009823e-05, "learning_rate": 1.2957605985037408e-06, "loss": 0.0164, "step": 37520 }, { "epoch": 4.679551122194514, "grad_norm": 0.00025233966880477965, "learning_rate": 1.290773067331671e-06, "loss": 0.0219, "step": 37530 }, { "epoch": 4.6807980049875315, "grad_norm": 5.758203042205423e-05, "learning_rate": 1.2857855361596012e-06, "loss": 0.0, "step": 37540 }, { "epoch": 4.682044887780549, "grad_norm": 0.000734044355340302, "learning_rate": 1.2807980049875312e-06, "loss": 0.0, "step": 37550 }, { "epoch": 4.683291770573566, "grad_norm": 4.313397585065104e-05, "learning_rate": 1.2758104738154614e-06, "loss": 0.0465, "step": 37560 }, { "epoch": 4.684538653366584, "grad_norm": 32.49720764160156, "learning_rate": 1.2708229426433916e-06, "loss": 0.0364, "step": 37570 }, { "epoch": 4.685785536159601, "grad_norm": 0.00016528677952010185, "learning_rate": 1.2658354114713218e-06, "loss": 0.0002, "step": 37580 }, { "epoch": 4.687032418952619, "grad_norm": 0.004134850576519966, "learning_rate": 1.2608478802992518e-06, "loss": 0.0, "step": 37590 }, { "epoch": 4.688279301745636, "grad_norm": 0.0005740217748098075, "learning_rate": 1.255860349127182e-06, "loss": 0.0, "step": 37600 }, { "epoch": 4.6895261845386536, "grad_norm": 5.220138336881064e-05, "learning_rate": 1.2508728179551124e-06, "loss": 0.0, "step": 37610 }, { "epoch": 4.690773067331671, "grad_norm": 9.989990940084681e-05, "learning_rate": 1.2458852867830426e-06, "loss": 0.0, "step": 37620 }, { "epoch": 4.692019950124688, "grad_norm": 0.001709539326839149, "learning_rate": 1.2408977556109726e-06, "loss": 0.0, "step": 37630 }, { "epoch": 4.693266832917706, "grad_norm": 0.02466406300663948, "learning_rate": 1.235910224438903e-06, "loss": 0.0, "step": 37640 }, { "epoch": 4.694513715710723, "grad_norm": 7.842500053811818e-05, "learning_rate": 1.2309226932668332e-06, "loss": 0.0, "step": 37650 }, { "epoch": 4.695760598503741, "grad_norm": 0.00015703195822425187, "learning_rate": 1.2259351620947631e-06, "loss": 0.0001, "step": 37660 }, { "epoch": 4.697007481296758, "grad_norm": 0.0005103639559820294, "learning_rate": 1.2209476309226933e-06, "loss": 0.0, "step": 37670 }, { "epoch": 4.698254364089776, "grad_norm": 0.0009048631764017045, "learning_rate": 1.2159600997506235e-06, "loss": 0.0, "step": 37680 }, { "epoch": 4.699501246882793, "grad_norm": 0.008464858867228031, "learning_rate": 1.2109725685785537e-06, "loss": 0.0001, "step": 37690 }, { "epoch": 4.7007481296758105, "grad_norm": 7.08570732967928e-05, "learning_rate": 1.205985037406484e-06, "loss": 0.0, "step": 37700 }, { "epoch": 4.701995012468828, "grad_norm": 4.024140434921719e-05, "learning_rate": 1.2009975062344141e-06, "loss": 0.0, "step": 37710 }, { "epoch": 4.703241895261845, "grad_norm": 0.0004707244224846363, "learning_rate": 1.1960099750623443e-06, "loss": 0.0092, "step": 37720 }, { "epoch": 4.704488778054863, "grad_norm": 0.010790945962071419, "learning_rate": 1.1910224438902745e-06, "loss": 0.0, "step": 37730 }, { "epoch": 4.70573566084788, "grad_norm": 5.667881850968115e-05, "learning_rate": 1.1860349127182045e-06, "loss": 0.0, "step": 37740 }, { "epoch": 4.706982543640898, "grad_norm": 8.554881787858903e-05, "learning_rate": 1.1810473815461347e-06, "loss": 0.0011, "step": 37750 }, { "epoch": 4.708229426433915, "grad_norm": 6.647824920946732e-05, "learning_rate": 1.1760598503740649e-06, "loss": 0.0, "step": 37760 }, { "epoch": 4.7094763092269325, "grad_norm": 0.0001823468046495691, "learning_rate": 1.171072319201995e-06, "loss": 0.0718, "step": 37770 }, { "epoch": 4.71072319201995, "grad_norm": 3.875693801091984e-05, "learning_rate": 1.1660847880299253e-06, "loss": 0.0112, "step": 37780 }, { "epoch": 4.711970074812967, "grad_norm": 0.0006465655169449747, "learning_rate": 1.1610972568578555e-06, "loss": 0.0, "step": 37790 }, { "epoch": 4.713216957605985, "grad_norm": 0.0006873203092254698, "learning_rate": 1.1561097256857857e-06, "loss": 0.0, "step": 37800 }, { "epoch": 4.714463840399002, "grad_norm": 0.00017582096916157752, "learning_rate": 1.1511221945137159e-06, "loss": 0.0, "step": 37810 }, { "epoch": 4.71571072319202, "grad_norm": 0.002465422498062253, "learning_rate": 1.1461346633416458e-06, "loss": 0.0049, "step": 37820 }, { "epoch": 4.716957605985037, "grad_norm": 0.00022965417883824557, "learning_rate": 1.1411471321695763e-06, "loss": 0.0, "step": 37830 }, { "epoch": 4.7182044887780545, "grad_norm": 0.00011124199227197096, "learning_rate": 1.1361596009975065e-06, "loss": 0.0, "step": 37840 }, { "epoch": 4.719451371571072, "grad_norm": 0.00040846472256816924, "learning_rate": 1.1311720698254364e-06, "loss": 0.0741, "step": 37850 }, { "epoch": 4.720698254364089, "grad_norm": 0.00880060438066721, "learning_rate": 1.1261845386533666e-06, "loss": 0.0, "step": 37860 }, { "epoch": 4.721945137157107, "grad_norm": 5.207054709899239e-05, "learning_rate": 1.1211970074812968e-06, "loss": 0.091, "step": 37870 }, { "epoch": 4.723192019950124, "grad_norm": 0.00037851626984775066, "learning_rate": 1.116209476309227e-06, "loss": 0.0, "step": 37880 }, { "epoch": 4.724438902743142, "grad_norm": 0.0003735979844350368, "learning_rate": 1.1112219451371572e-06, "loss": 0.0, "step": 37890 }, { "epoch": 4.725685785536159, "grad_norm": 0.00029443929088301957, "learning_rate": 1.1062344139650874e-06, "loss": 0.0, "step": 37900 }, { "epoch": 4.726932668329177, "grad_norm": 9.832724026637152e-05, "learning_rate": 1.1012468827930176e-06, "loss": 0.0, "step": 37910 }, { "epoch": 4.728179551122195, "grad_norm": 0.00041520065860822797, "learning_rate": 1.0962593516209478e-06, "loss": 0.0, "step": 37920 }, { "epoch": 4.729426433915212, "grad_norm": 7.619358802912757e-05, "learning_rate": 1.0912718204488778e-06, "loss": 0.0, "step": 37930 }, { "epoch": 4.73067331670823, "grad_norm": 0.00065153295872733, "learning_rate": 1.086284289276808e-06, "loss": 0.0, "step": 37940 }, { "epoch": 4.731920199501247, "grad_norm": 0.0009183948859572411, "learning_rate": 1.0812967581047384e-06, "loss": 0.0, "step": 37950 }, { "epoch": 4.733167082294265, "grad_norm": 0.0008579808054491878, "learning_rate": 1.0763092269326684e-06, "loss": 0.0001, "step": 37960 }, { "epoch": 4.734413965087282, "grad_norm": 0.00019635986245702952, "learning_rate": 1.0713216957605986e-06, "loss": 0.0003, "step": 37970 }, { "epoch": 4.7356608478802995, "grad_norm": 0.0011371946893632412, "learning_rate": 1.0663341645885288e-06, "loss": 0.0, "step": 37980 }, { "epoch": 4.736907730673317, "grad_norm": 5.076320667285472e-05, "learning_rate": 1.061346633416459e-06, "loss": 0.0011, "step": 37990 }, { "epoch": 4.738154613466334, "grad_norm": 4.8724108637543395e-05, "learning_rate": 1.0563591022443892e-06, "loss": 0.0, "step": 38000 }, { "epoch": 4.739401496259352, "grad_norm": 0.00011148832709295675, "learning_rate": 1.0513715710723194e-06, "loss": 0.0, "step": 38010 }, { "epoch": 4.740648379052369, "grad_norm": 7.638114766450599e-05, "learning_rate": 1.0463840399002496e-06, "loss": 0.0, "step": 38020 }, { "epoch": 4.741895261845387, "grad_norm": 0.00010628465679474175, "learning_rate": 1.0413965087281798e-06, "loss": 0.0129, "step": 38030 }, { "epoch": 4.743142144638404, "grad_norm": 6.248387217056006e-05, "learning_rate": 1.0364089775561097e-06, "loss": 0.0, "step": 38040 }, { "epoch": 4.7443890274314215, "grad_norm": 5.823820174555294e-05, "learning_rate": 1.03142144638404e-06, "loss": 0.0, "step": 38050 }, { "epoch": 4.745635910224439, "grad_norm": 0.005552787333726883, "learning_rate": 1.0264339152119701e-06, "loss": 0.0, "step": 38060 }, { "epoch": 4.746882793017456, "grad_norm": 0.004743535071611404, "learning_rate": 1.0214463840399003e-06, "loss": 0.0, "step": 38070 }, { "epoch": 4.748129675810474, "grad_norm": 5.940657501923852e-05, "learning_rate": 1.0164588528678305e-06, "loss": 0.0, "step": 38080 }, { "epoch": 4.749376558603491, "grad_norm": 7.897276373114437e-05, "learning_rate": 1.0114713216957607e-06, "loss": 0.0, "step": 38090 }, { "epoch": 4.750623441396509, "grad_norm": 8.272264676634222e-05, "learning_rate": 1.006483790523691e-06, "loss": 0.0, "step": 38100 }, { "epoch": 4.751870324189526, "grad_norm": 0.0009390998166054487, "learning_rate": 1.0014962593516211e-06, "loss": 0.0, "step": 38110 }, { "epoch": 4.753117206982544, "grad_norm": 6.277462671278045e-05, "learning_rate": 9.96508728179551e-07, "loss": 0.0, "step": 38120 }, { "epoch": 4.754364089775561, "grad_norm": 3.5075980122201145e-05, "learning_rate": 9.915211970074813e-07, "loss": 0.0075, "step": 38130 }, { "epoch": 4.7556109725685785, "grad_norm": 0.00023939134553074837, "learning_rate": 9.865336658354117e-07, "loss": 0.0, "step": 38140 }, { "epoch": 4.756857855361596, "grad_norm": 0.00011813923629233614, "learning_rate": 9.815461346633417e-07, "loss": 0.0001, "step": 38150 }, { "epoch": 4.758104738154613, "grad_norm": 0.00019262704881839454, "learning_rate": 9.765586034912719e-07, "loss": 0.0, "step": 38160 }, { "epoch": 4.759351620947631, "grad_norm": 4.599255771609023e-05, "learning_rate": 9.71571072319202e-07, "loss": 0.0, "step": 38170 }, { "epoch": 4.760598503740648, "grad_norm": 6.70359586365521e-05, "learning_rate": 9.665835411471323e-07, "loss": 0.0, "step": 38180 }, { "epoch": 4.761845386533666, "grad_norm": 0.0001310739608015865, "learning_rate": 9.615960099750625e-07, "loss": 0.0, "step": 38190 }, { "epoch": 4.763092269326683, "grad_norm": 0.0007167681469582021, "learning_rate": 9.566084788029927e-07, "loss": 0.0, "step": 38200 }, { "epoch": 4.7643391521197005, "grad_norm": 0.0002816536871250719, "learning_rate": 9.516209476309229e-07, "loss": 0.0, "step": 38210 }, { "epoch": 4.765586034912718, "grad_norm": 0.0004467430990189314, "learning_rate": 9.466334164588529e-07, "loss": 0.0, "step": 38220 }, { "epoch": 4.766832917705735, "grad_norm": 0.00010718198609538376, "learning_rate": 9.416458852867831e-07, "loss": 0.0, "step": 38230 }, { "epoch": 4.768079800498753, "grad_norm": 0.03801560774445534, "learning_rate": 9.366583541147132e-07, "loss": 0.0, "step": 38240 }, { "epoch": 4.76932668329177, "grad_norm": 4.085026739630848e-05, "learning_rate": 9.316708229426434e-07, "loss": 0.0, "step": 38250 }, { "epoch": 4.770573566084788, "grad_norm": 46.325836181640625, "learning_rate": 9.266832917705737e-07, "loss": 0.0069, "step": 38260 }, { "epoch": 4.771820448877805, "grad_norm": 0.0001395172148477286, "learning_rate": 9.216957605985038e-07, "loss": 0.0, "step": 38270 }, { "epoch": 4.773067331670823, "grad_norm": 0.0033392098266631365, "learning_rate": 9.16708229426434e-07, "loss": 0.0, "step": 38280 }, { "epoch": 4.774314214463841, "grad_norm": 7.12929613655433e-05, "learning_rate": 9.117206982543641e-07, "loss": 0.0, "step": 38290 }, { "epoch": 4.775561097256858, "grad_norm": 0.0020014506299048662, "learning_rate": 9.067331670822943e-07, "loss": 0.0, "step": 38300 }, { "epoch": 4.776807980049876, "grad_norm": 0.00013769141514785588, "learning_rate": 9.017456359102245e-07, "loss": 0.0, "step": 38310 }, { "epoch": 4.778054862842893, "grad_norm": 8.867657015798613e-05, "learning_rate": 8.967581047381548e-07, "loss": 0.0006, "step": 38320 }, { "epoch": 4.779301745635911, "grad_norm": 0.0008881071116775274, "learning_rate": 8.917705735660849e-07, "loss": 0.0002, "step": 38330 }, { "epoch": 4.780548628428928, "grad_norm": 0.0002598560240585357, "learning_rate": 8.867830423940151e-07, "loss": 0.0, "step": 38340 }, { "epoch": 4.7817955112219455, "grad_norm": 0.004675508942455053, "learning_rate": 8.817955112219452e-07, "loss": 0.0, "step": 38350 }, { "epoch": 4.783042394014963, "grad_norm": 0.009007451124489307, "learning_rate": 8.768079800498754e-07, "loss": 0.0, "step": 38360 }, { "epoch": 4.78428927680798, "grad_norm": 4.994167829863727e-05, "learning_rate": 8.718204488778055e-07, "loss": 0.0, "step": 38370 }, { "epoch": 4.785536159600998, "grad_norm": 0.000342125742463395, "learning_rate": 8.668329177057358e-07, "loss": 0.0, "step": 38380 }, { "epoch": 4.786783042394015, "grad_norm": 0.0014879869995638728, "learning_rate": 8.61845386533666e-07, "loss": 0.0, "step": 38390 }, { "epoch": 4.788029925187033, "grad_norm": 0.0005899175885133445, "learning_rate": 8.568578553615962e-07, "loss": 0.0, "step": 38400 }, { "epoch": 4.78927680798005, "grad_norm": 0.0007447946700267494, "learning_rate": 8.518703241895262e-07, "loss": 0.0, "step": 38410 }, { "epoch": 4.7905236907730675, "grad_norm": 0.00011481838737381622, "learning_rate": 8.468827930174564e-07, "loss": 0.0, "step": 38420 }, { "epoch": 4.791770573566085, "grad_norm": 0.00012223895464558154, "learning_rate": 8.418952618453865e-07, "loss": 0.0, "step": 38430 }, { "epoch": 4.793017456359102, "grad_norm": 5.043927740189247e-05, "learning_rate": 8.369077306733167e-07, "loss": 0.0, "step": 38440 }, { "epoch": 4.79426433915212, "grad_norm": 0.0004362256149761379, "learning_rate": 8.31920199501247e-07, "loss": 0.0002, "step": 38450 }, { "epoch": 4.795511221945137, "grad_norm": 0.0016937785549089313, "learning_rate": 8.269326683291771e-07, "loss": 0.035, "step": 38460 }, { "epoch": 4.796758104738155, "grad_norm": 0.00018390719196759164, "learning_rate": 8.219451371571073e-07, "loss": 0.0, "step": 38470 }, { "epoch": 4.798004987531172, "grad_norm": 5.6730925280135125e-05, "learning_rate": 8.174563591022444e-07, "loss": 0.03, "step": 38480 }, { "epoch": 4.7992518703241895, "grad_norm": 0.00022890319814905524, "learning_rate": 8.124688279301746e-07, "loss": 0.0368, "step": 38490 }, { "epoch": 4.800498753117207, "grad_norm": 0.00020031262829434127, "learning_rate": 8.074812967581049e-07, "loss": 0.0, "step": 38500 }, { "epoch": 4.801745635910224, "grad_norm": 45.181060791015625, "learning_rate": 8.02493765586035e-07, "loss": 0.0328, "step": 38510 }, { "epoch": 4.802992518703242, "grad_norm": 0.0005216131103225052, "learning_rate": 7.975062344139652e-07, "loss": 0.0091, "step": 38520 }, { "epoch": 4.804239401496259, "grad_norm": 6.901921005919576e-05, "learning_rate": 7.925187032418953e-07, "loss": 0.0, "step": 38530 }, { "epoch": 4.805486284289277, "grad_norm": 0.024445077404379845, "learning_rate": 7.875311720698255e-07, "loss": 0.0, "step": 38540 }, { "epoch": 4.806733167082294, "grad_norm": 0.003273892914876342, "learning_rate": 7.825436408977556e-07, "loss": 0.0486, "step": 38550 }, { "epoch": 4.807980049875312, "grad_norm": 0.00015414993686135858, "learning_rate": 7.77556109725686e-07, "loss": 0.0, "step": 38560 }, { "epoch": 4.809226932668329, "grad_norm": 6.870167999295518e-05, "learning_rate": 7.72568578553616e-07, "loss": 0.0, "step": 38570 }, { "epoch": 4.8104738154613464, "grad_norm": 0.001188624300993979, "learning_rate": 7.675810473815462e-07, "loss": 0.0, "step": 38580 }, { "epoch": 4.811720698254364, "grad_norm": 6.985004438320175e-05, "learning_rate": 7.625935162094763e-07, "loss": 0.0, "step": 38590 }, { "epoch": 4.812967581047381, "grad_norm": 7.557850040029734e-05, "learning_rate": 7.576059850374065e-07, "loss": 0.0, "step": 38600 }, { "epoch": 4.814214463840399, "grad_norm": 0.00018456621910445392, "learning_rate": 7.526184538653366e-07, "loss": 0.0, "step": 38610 }, { "epoch": 4.815461346633416, "grad_norm": 0.00035883314558304846, "learning_rate": 7.476309226932669e-07, "loss": 0.0, "step": 38620 }, { "epoch": 4.816708229426434, "grad_norm": 0.0002625574416015297, "learning_rate": 7.426433915211971e-07, "loss": 0.0061, "step": 38630 }, { "epoch": 4.817955112219451, "grad_norm": 0.00019378215074539185, "learning_rate": 7.376558603491273e-07, "loss": 0.0002, "step": 38640 }, { "epoch": 4.8192019950124685, "grad_norm": 7.995535270310938e-05, "learning_rate": 7.326683291770574e-07, "loss": 0.0, "step": 38650 }, { "epoch": 4.820448877805486, "grad_norm": 0.0010213602799922228, "learning_rate": 7.276807980049876e-07, "loss": 0.0, "step": 38660 }, { "epoch": 4.821695760598503, "grad_norm": 9.212228178512305e-05, "learning_rate": 7.226932668329177e-07, "loss": 0.0056, "step": 38670 }, { "epoch": 4.822942643391521, "grad_norm": 6.223101081559435e-05, "learning_rate": 7.17705735660848e-07, "loss": 0.0, "step": 38680 }, { "epoch": 4.824189526184538, "grad_norm": 0.0006266386481001973, "learning_rate": 7.127182044887782e-07, "loss": 0.0, "step": 38690 }, { "epoch": 4.825436408977556, "grad_norm": 6.241526716621593e-05, "learning_rate": 7.077306733167083e-07, "loss": 0.0, "step": 38700 }, { "epoch": 4.826683291770573, "grad_norm": 0.00010659959662007168, "learning_rate": 7.027431421446385e-07, "loss": 0.0, "step": 38710 }, { "epoch": 4.8279301745635905, "grad_norm": 5.2077397413086146e-05, "learning_rate": 6.977556109725686e-07, "loss": 0.0, "step": 38720 }, { "epoch": 4.829177057356609, "grad_norm": 0.0012044019531458616, "learning_rate": 6.927680798004988e-07, "loss": 0.0, "step": 38730 }, { "epoch": 4.830423940149626, "grad_norm": 0.0006541931070387363, "learning_rate": 6.877805486284291e-07, "loss": 0.0, "step": 38740 }, { "epoch": 4.831670822942644, "grad_norm": 7.856899901526049e-05, "learning_rate": 6.827930174563593e-07, "loss": 0.0, "step": 38750 }, { "epoch": 4.832917705735661, "grad_norm": 0.01197484228760004, "learning_rate": 6.778054862842893e-07, "loss": 0.0532, "step": 38760 }, { "epoch": 4.834164588528679, "grad_norm": 0.00016221591795329005, "learning_rate": 6.728179551122195e-07, "loss": 0.0, "step": 38770 }, { "epoch": 4.835411471321696, "grad_norm": 0.006540379952639341, "learning_rate": 6.678304239401496e-07, "loss": 0.0, "step": 38780 }, { "epoch": 4.8366583541147135, "grad_norm": 4.641401028493419e-05, "learning_rate": 6.628428927680798e-07, "loss": 0.0, "step": 38790 }, { "epoch": 4.837905236907731, "grad_norm": 0.00019889388931915164, "learning_rate": 6.578553615960099e-07, "loss": 0.0, "step": 38800 }, { "epoch": 4.839152119700748, "grad_norm": 0.003469712333753705, "learning_rate": 6.528678304239402e-07, "loss": 0.0, "step": 38810 }, { "epoch": 4.840399002493766, "grad_norm": 0.00012375341611914337, "learning_rate": 6.478802992518704e-07, "loss": 0.0, "step": 38820 }, { "epoch": 4.841645885286783, "grad_norm": 0.0009892029920592904, "learning_rate": 6.428927680798006e-07, "loss": 0.0001, "step": 38830 }, { "epoch": 4.842892768079801, "grad_norm": 0.014581103809177876, "learning_rate": 6.379052369077307e-07, "loss": 0.0002, "step": 38840 }, { "epoch": 4.844139650872818, "grad_norm": 6.556464359164238e-05, "learning_rate": 6.329177057356609e-07, "loss": 0.0, "step": 38850 }, { "epoch": 4.8453865336658355, "grad_norm": 7.368988735834137e-05, "learning_rate": 6.27930174563591e-07, "loss": 0.0, "step": 38860 }, { "epoch": 4.846633416458853, "grad_norm": 0.0001805610372684896, "learning_rate": 6.229426433915213e-07, "loss": 0.0, "step": 38870 }, { "epoch": 4.84788029925187, "grad_norm": 0.00016412808327004313, "learning_rate": 6.179551122194515e-07, "loss": 0.0, "step": 38880 }, { "epoch": 4.849127182044888, "grad_norm": 0.00047609827015548944, "learning_rate": 6.129675810473816e-07, "loss": 0.0163, "step": 38890 }, { "epoch": 4.850374064837905, "grad_norm": 0.004086254630237818, "learning_rate": 6.079800498753118e-07, "loss": 0.0018, "step": 38900 }, { "epoch": 4.851620947630923, "grad_norm": 0.0005081315175630152, "learning_rate": 6.02992518703242e-07, "loss": 0.0001, "step": 38910 }, { "epoch": 4.85286783042394, "grad_norm": 0.0003619830240495503, "learning_rate": 5.980049875311722e-07, "loss": 0.0003, "step": 38920 }, { "epoch": 4.8541147132169575, "grad_norm": 8.470124157611281e-05, "learning_rate": 5.930174563591022e-07, "loss": 0.001, "step": 38930 }, { "epoch": 4.855361596009975, "grad_norm": 0.00033788950531743467, "learning_rate": 5.880299251870324e-07, "loss": 0.0, "step": 38940 }, { "epoch": 4.856608478802992, "grad_norm": 0.009631326422095299, "learning_rate": 5.830423940149626e-07, "loss": 0.0001, "step": 38950 }, { "epoch": 4.85785536159601, "grad_norm": 6.478117575170472e-05, "learning_rate": 5.780548628428928e-07, "loss": 0.0, "step": 38960 }, { "epoch": 4.859102244389027, "grad_norm": 0.00014513339556287974, "learning_rate": 5.730673316708229e-07, "loss": 0.0371, "step": 38970 }, { "epoch": 4.860349127182045, "grad_norm": 0.002222485141828656, "learning_rate": 5.680798004987532e-07, "loss": 0.0, "step": 38980 }, { "epoch": 4.861596009975062, "grad_norm": 6.835142994532362e-05, "learning_rate": 5.630922693266833e-07, "loss": 0.0, "step": 38990 }, { "epoch": 4.86284289276808, "grad_norm": 8.790072752162814e-05, "learning_rate": 5.581047381546135e-07, "loss": 0.0001, "step": 39000 }, { "epoch": 4.864089775561097, "grad_norm": 6.300320819718763e-05, "learning_rate": 5.531172069825437e-07, "loss": 0.0, "step": 39010 }, { "epoch": 4.865336658354114, "grad_norm": 0.00011180248111486435, "learning_rate": 5.481296758104739e-07, "loss": 0.0, "step": 39020 }, { "epoch": 4.866583541147132, "grad_norm": 7.89304613135755e-05, "learning_rate": 5.43142144638404e-07, "loss": 0.0, "step": 39030 }, { "epoch": 4.867830423940149, "grad_norm": 0.00046899239532649517, "learning_rate": 5.381546134663342e-07, "loss": 0.0, "step": 39040 }, { "epoch": 4.869077306733167, "grad_norm": 0.00407341867685318, "learning_rate": 5.331670822942644e-07, "loss": 0.0, "step": 39050 }, { "epoch": 4.870324189526184, "grad_norm": 0.000564980844501406, "learning_rate": 5.281795511221946e-07, "loss": 0.0, "step": 39060 }, { "epoch": 4.871571072319202, "grad_norm": 9.515364217804745e-05, "learning_rate": 5.231920199501248e-07, "loss": 0.0, "step": 39070 }, { "epoch": 4.87281795511222, "grad_norm": 0.00015221108333207667, "learning_rate": 5.182044887780549e-07, "loss": 0.002, "step": 39080 }, { "epoch": 4.874064837905237, "grad_norm": 0.007447673939168453, "learning_rate": 5.132169576059851e-07, "loss": 0.0, "step": 39090 }, { "epoch": 4.875311720698255, "grad_norm": 0.00013376775314100087, "learning_rate": 5.082294264339153e-07, "loss": 0.0329, "step": 39100 }, { "epoch": 4.876558603491272, "grad_norm": 0.00014891306636855006, "learning_rate": 5.032418952618455e-07, "loss": 0.0, "step": 39110 }, { "epoch": 4.87780548628429, "grad_norm": 0.3053286671638489, "learning_rate": 4.982543640897755e-07, "loss": 0.0, "step": 39120 }, { "epoch": 4.879052369077307, "grad_norm": 6.43767198198475e-05, "learning_rate": 4.932668329177058e-07, "loss": 0.0, "step": 39130 }, { "epoch": 4.8802992518703245, "grad_norm": 0.00010566661512712017, "learning_rate": 4.882793017456359e-07, "loss": 0.0, "step": 39140 }, { "epoch": 4.881546134663342, "grad_norm": 0.00010986766574205831, "learning_rate": 4.832917705735661e-07, "loss": 0.0, "step": 39150 }, { "epoch": 4.882793017456359, "grad_norm": 9.44228595471941e-05, "learning_rate": 4.783042394014963e-07, "loss": 0.0, "step": 39160 }, { "epoch": 4.884039900249377, "grad_norm": 0.00012891492224298418, "learning_rate": 4.7331670822942647e-07, "loss": 0.0, "step": 39170 }, { "epoch": 4.885286783042394, "grad_norm": 0.00010865272633964196, "learning_rate": 4.683291770573566e-07, "loss": 0.0, "step": 39180 }, { "epoch": 4.886533665835412, "grad_norm": 0.0020185792818665504, "learning_rate": 4.6334164588528686e-07, "loss": 0.0042, "step": 39190 }, { "epoch": 4.887780548628429, "grad_norm": 0.00019098054326605052, "learning_rate": 4.58354114713217e-07, "loss": 0.0, "step": 39200 }, { "epoch": 4.889027431421447, "grad_norm": 3.40030892402865e-05, "learning_rate": 4.5336658354114715e-07, "loss": 0.0, "step": 39210 }, { "epoch": 4.890274314214464, "grad_norm": 0.00010806202044477686, "learning_rate": 4.483790523690774e-07, "loss": 0.0, "step": 39220 }, { "epoch": 4.8915211970074814, "grad_norm": 7.696980173932388e-05, "learning_rate": 4.4339152119700754e-07, "loss": 0.0, "step": 39230 }, { "epoch": 4.892768079800499, "grad_norm": 3.1001432944322005e-05, "learning_rate": 4.384039900249377e-07, "loss": 0.0, "step": 39240 }, { "epoch": 4.894014962593516, "grad_norm": 0.000295392848784104, "learning_rate": 4.334164588528679e-07, "loss": 0.0, "step": 39250 }, { "epoch": 4.895261845386534, "grad_norm": 0.0001579842937644571, "learning_rate": 4.284289276807981e-07, "loss": 0.0494, "step": 39260 }, { "epoch": 4.896508728179551, "grad_norm": 0.000137755909236148, "learning_rate": 4.234413965087282e-07, "loss": 0.0, "step": 39270 }, { "epoch": 4.897755610972569, "grad_norm": 0.00016627574223093688, "learning_rate": 4.1845386533665836e-07, "loss": 0.0006, "step": 39280 }, { "epoch": 4.899002493765586, "grad_norm": 0.00019424950005486608, "learning_rate": 4.1346633416458856e-07, "loss": 0.0, "step": 39290 }, { "epoch": 4.9002493765586035, "grad_norm": 0.00012249869178049266, "learning_rate": 4.084788029925187e-07, "loss": 0.0016, "step": 39300 }, { "epoch": 4.901496259351621, "grad_norm": 3.534234201651998e-05, "learning_rate": 4.034912718204489e-07, "loss": 0.0, "step": 39310 }, { "epoch": 4.902743142144638, "grad_norm": 5.2465853514149785e-05, "learning_rate": 3.985037406483791e-07, "loss": 0.0, "step": 39320 }, { "epoch": 4.903990024937656, "grad_norm": 0.00019393919501453638, "learning_rate": 3.9351620947630924e-07, "loss": 0.0, "step": 39330 }, { "epoch": 4.905236907730673, "grad_norm": 0.0001156614744104445, "learning_rate": 3.885286783042394e-07, "loss": 0.0, "step": 39340 }, { "epoch": 4.906483790523691, "grad_norm": 0.00013547898561228067, "learning_rate": 3.8354114713216963e-07, "loss": 0.0, "step": 39350 }, { "epoch": 4.907730673316708, "grad_norm": 0.001548072206787765, "learning_rate": 3.7855361596009977e-07, "loss": 0.0002, "step": 39360 }, { "epoch": 4.9089775561097255, "grad_norm": 0.00010577407374512404, "learning_rate": 3.735660847880299e-07, "loss": 0.0001, "step": 39370 }, { "epoch": 4.910224438902743, "grad_norm": 0.0005236375727690756, "learning_rate": 3.6857855361596016e-07, "loss": 0.0, "step": 39380 }, { "epoch": 4.91147132169576, "grad_norm": 0.00018029804050456733, "learning_rate": 3.635910224438903e-07, "loss": 0.0, "step": 39390 }, { "epoch": 4.912718204488778, "grad_norm": 7.623321289429441e-05, "learning_rate": 3.5860349127182045e-07, "loss": 0.0, "step": 39400 }, { "epoch": 4.913965087281795, "grad_norm": 6.454013055190444e-05, "learning_rate": 3.536159600997507e-07, "loss": 0.0, "step": 39410 }, { "epoch": 4.915211970074813, "grad_norm": 0.00047880125930532813, "learning_rate": 3.4862842892768084e-07, "loss": 0.0, "step": 39420 }, { "epoch": 4.91645885286783, "grad_norm": 0.0002257343294331804, "learning_rate": 3.43640897755611e-07, "loss": 0.0, "step": 39430 }, { "epoch": 4.917705735660848, "grad_norm": 5.091218918096274e-05, "learning_rate": 3.386533665835412e-07, "loss": 0.0001, "step": 39440 }, { "epoch": 4.918952618453865, "grad_norm": 9.202091314364225e-05, "learning_rate": 3.336658354114714e-07, "loss": 0.0, "step": 39450 }, { "epoch": 4.920199501246882, "grad_norm": 0.00017863092944025993, "learning_rate": 3.286783042394015e-07, "loss": 0.0, "step": 39460 }, { "epoch": 4.9214463840399, "grad_norm": 0.0003498582518659532, "learning_rate": 3.236907730673317e-07, "loss": 0.0, "step": 39470 }, { "epoch": 4.922693266832917, "grad_norm": 0.021878132596611977, "learning_rate": 3.1870324189526186e-07, "loss": 0.0, "step": 39480 }, { "epoch": 4.923940149625935, "grad_norm": 0.00010034618026111275, "learning_rate": 3.13715710723192e-07, "loss": 0.0, "step": 39490 }, { "epoch": 4.925187032418952, "grad_norm": 0.0004102317616343498, "learning_rate": 3.087281795511222e-07, "loss": 0.0, "step": 39500 }, { "epoch": 4.92643391521197, "grad_norm": 0.0001442254288122058, "learning_rate": 3.037406483790524e-07, "loss": 0.0, "step": 39510 }, { "epoch": 4.927680798004987, "grad_norm": 0.00011706881923601031, "learning_rate": 2.9875311720698254e-07, "loss": 0.0, "step": 39520 }, { "epoch": 4.928927680798005, "grad_norm": 0.0002453117340337485, "learning_rate": 2.9376558603491273e-07, "loss": 0.0, "step": 39530 }, { "epoch": 4.930174563591023, "grad_norm": 0.0002756392350420356, "learning_rate": 2.8877805486284293e-07, "loss": 0.0, "step": 39540 }, { "epoch": 4.93142144638404, "grad_norm": 7.422151247737929e-05, "learning_rate": 2.8379052369077307e-07, "loss": 0.0, "step": 39550 }, { "epoch": 4.932668329177058, "grad_norm": 7.200430991360918e-05, "learning_rate": 2.7880299251870327e-07, "loss": 0.0, "step": 39560 }, { "epoch": 4.933915211970075, "grad_norm": 0.00013249997573439032, "learning_rate": 2.7381546134663346e-07, "loss": 0.0, "step": 39570 }, { "epoch": 4.9351620947630925, "grad_norm": 0.008125074207782745, "learning_rate": 2.688279301745636e-07, "loss": 0.0, "step": 39580 }, { "epoch": 4.93640897755611, "grad_norm": 7.183442357927561e-05, "learning_rate": 2.638403990024938e-07, "loss": 0.0, "step": 39590 }, { "epoch": 4.937655860349127, "grad_norm": 0.00014083849964663386, "learning_rate": 2.58852867830424e-07, "loss": 0.0, "step": 39600 }, { "epoch": 4.938902743142145, "grad_norm": 0.00042223025229759514, "learning_rate": 2.5386533665835414e-07, "loss": 0.0, "step": 39610 }, { "epoch": 4.940149625935162, "grad_norm": 8.984528540167958e-05, "learning_rate": 2.4887780548628434e-07, "loss": 0.0, "step": 39620 }, { "epoch": 4.94139650872818, "grad_norm": 0.000292430748231709, "learning_rate": 2.438902743142145e-07, "loss": 0.0, "step": 39630 }, { "epoch": 4.942643391521197, "grad_norm": 5.6125212722690776e-05, "learning_rate": 2.389027431421447e-07, "loss": 0.0, "step": 39640 }, { "epoch": 4.943890274314215, "grad_norm": 0.00033320783404633403, "learning_rate": 2.3391521197007484e-07, "loss": 0.0, "step": 39650 }, { "epoch": 4.945137157107232, "grad_norm": 0.00011107645696029067, "learning_rate": 2.2892768079800501e-07, "loss": 0.0, "step": 39660 }, { "epoch": 4.946384039900249, "grad_norm": 0.00048359643551521003, "learning_rate": 2.2394014962593518e-07, "loss": 0.0428, "step": 39670 }, { "epoch": 4.947630922693267, "grad_norm": 0.0021955021657049656, "learning_rate": 2.1895261845386535e-07, "loss": 0.0, "step": 39680 }, { "epoch": 4.948877805486284, "grad_norm": 0.0004385665524750948, "learning_rate": 2.1396508728179552e-07, "loss": 0.0, "step": 39690 }, { "epoch": 4.950124688279302, "grad_norm": 4.744509351439774e-05, "learning_rate": 2.089775561097257e-07, "loss": 0.0005, "step": 39700 }, { "epoch": 4.951371571072319, "grad_norm": 4.5862499973736703e-05, "learning_rate": 2.039900249376559e-07, "loss": 0.0, "step": 39710 }, { "epoch": 4.952618453865337, "grad_norm": 0.0032922455575317144, "learning_rate": 1.9900249376558603e-07, "loss": 0.0001, "step": 39720 }, { "epoch": 4.953865336658354, "grad_norm": 0.00016673465142957866, "learning_rate": 1.9401496259351623e-07, "loss": 0.0, "step": 39730 }, { "epoch": 4.9551122194513715, "grad_norm": 4.499312854022719e-05, "learning_rate": 1.8902743142144642e-07, "loss": 0.0, "step": 39740 }, { "epoch": 4.956359102244389, "grad_norm": 0.00012304374831728637, "learning_rate": 1.8403990024937656e-07, "loss": 0.0, "step": 39750 }, { "epoch": 4.957605985037406, "grad_norm": 0.00027020517154596746, "learning_rate": 1.7905236907730676e-07, "loss": 0.0063, "step": 39760 }, { "epoch": 4.958852867830424, "grad_norm": 0.1422053575515747, "learning_rate": 1.7406483790523693e-07, "loss": 0.0, "step": 39770 }, { "epoch": 4.960099750623441, "grad_norm": 7.926650869194418e-05, "learning_rate": 1.6907730673316707e-07, "loss": 0.0, "step": 39780 }, { "epoch": 4.961346633416459, "grad_norm": 5.627061182167381e-05, "learning_rate": 1.6408977556109727e-07, "loss": 0.0, "step": 39790 }, { "epoch": 4.962593516209476, "grad_norm": 0.00012039417924825102, "learning_rate": 1.5910224438902747e-07, "loss": 0.0, "step": 39800 }, { "epoch": 4.9638403990024935, "grad_norm": 0.0011466448195278645, "learning_rate": 1.541147132169576e-07, "loss": 0.0, "step": 39810 }, { "epoch": 4.965087281795511, "grad_norm": 0.00031051429687067866, "learning_rate": 1.491271820448878e-07, "loss": 0.0, "step": 39820 }, { "epoch": 4.966334164588528, "grad_norm": 9.515901183476672e-05, "learning_rate": 1.4413965087281797e-07, "loss": 0.0, "step": 39830 }, { "epoch": 4.967581047381546, "grad_norm": 1.0523102283477783, "learning_rate": 1.3915211970074814e-07, "loss": 0.0001, "step": 39840 }, { "epoch": 4.968827930174563, "grad_norm": 0.0002691926492843777, "learning_rate": 1.341645885286783e-07, "loss": 0.0, "step": 39850 }, { "epoch": 4.970074812967581, "grad_norm": 0.002537587657570839, "learning_rate": 1.2917705735660848e-07, "loss": 0.0, "step": 39860 }, { "epoch": 4.971321695760598, "grad_norm": 4.5559128921013325e-05, "learning_rate": 1.2418952618453865e-07, "loss": 0.0, "step": 39870 }, { "epoch": 4.9725685785536164, "grad_norm": 3.524202838889323e-05, "learning_rate": 1.1920199501246885e-07, "loss": 0.0, "step": 39880 }, { "epoch": 4.973815461346634, "grad_norm": 0.00018697594350669533, "learning_rate": 1.1421446384039902e-07, "loss": 0.0, "step": 39890 }, { "epoch": 4.975062344139651, "grad_norm": 0.0023984755389392376, "learning_rate": 1.0922693266832919e-07, "loss": 0.0, "step": 39900 }, { "epoch": 4.976309226932669, "grad_norm": 0.0003739015955943614, "learning_rate": 1.0423940149625936e-07, "loss": 0.0, "step": 39910 }, { "epoch": 4.977556109725686, "grad_norm": 5.126370524521917e-05, "learning_rate": 9.925187032418954e-08, "loss": 0.0, "step": 39920 }, { "epoch": 4.978802992518704, "grad_norm": 5.107426113681868e-05, "learning_rate": 9.426433915211971e-08, "loss": 0.0, "step": 39930 }, { "epoch": 4.980049875311721, "grad_norm": 9.020322613650933e-05, "learning_rate": 8.927680798004988e-08, "loss": 0.0, "step": 39940 }, { "epoch": 4.9812967581047385, "grad_norm": 9.316183422924951e-05, "learning_rate": 8.428927680798006e-08, "loss": 0.0, "step": 39950 }, { "epoch": 4.982543640897756, "grad_norm": 0.01621038280427456, "learning_rate": 7.930174563591023e-08, "loss": 0.0, "step": 39960 }, { "epoch": 4.983790523690773, "grad_norm": 0.0002640775346662849, "learning_rate": 7.43142144638404e-08, "loss": 0.0, "step": 39970 }, { "epoch": 4.985037406483791, "grad_norm": 9.524546476313844e-05, "learning_rate": 6.932668329177058e-08, "loss": 0.0, "step": 39980 }, { "epoch": 4.986284289276808, "grad_norm": 4.487358091864735e-05, "learning_rate": 6.433915211970075e-08, "loss": 0.0, "step": 39990 }, { "epoch": 4.987531172069826, "grad_norm": 0.0012119744205847383, "learning_rate": 5.935162094763093e-08, "loss": 0.0255, "step": 40000 }, { "epoch": 4.988778054862843, "grad_norm": 59.783939361572266, "learning_rate": 5.43640897755611e-08, "loss": 0.033, "step": 40010 }, { "epoch": 4.9900249376558605, "grad_norm": 0.00014162635488901287, "learning_rate": 4.937655860349127e-08, "loss": 0.0, "step": 40020 }, { "epoch": 4.991271820448878, "grad_norm": 5.852362301084213e-05, "learning_rate": 4.4389027431421455e-08, "loss": 0.0, "step": 40030 }, { "epoch": 4.992518703241895, "grad_norm": 0.00034787526237778366, "learning_rate": 3.940149625935162e-08, "loss": 0.0, "step": 40040 }, { "epoch": 4.993765586034913, "grad_norm": 0.000130303087644279, "learning_rate": 3.44139650872818e-08, "loss": 0.0, "step": 40050 }, { "epoch": 4.99501246882793, "grad_norm": 0.0005980631103739142, "learning_rate": 2.9426433915211973e-08, "loss": 0.0, "step": 40060 }, { "epoch": 4.996259351620948, "grad_norm": 0.00011551733041414991, "learning_rate": 2.4438902743142146e-08, "loss": 0.0, "step": 40070 }, { "epoch": 4.997506234413965, "grad_norm": 8.48295385367237e-05, "learning_rate": 1.9451371571072322e-08, "loss": 0.0, "step": 40080 }, { "epoch": 4.998753117206983, "grad_norm": 0.00042257612221874297, "learning_rate": 1.4463840399002495e-08, "loss": 0.0, "step": 40090 }, { "epoch": 5.0, "grad_norm": 0.0019580533262342215, "learning_rate": 9.476309226932669e-09, "loss": 0.0, "step": 40100 } ], "logging_steps": 10, "max_steps": 40100, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.12473217322432e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }