{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 500, "global_step": 40000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 4.756218910217285, "learning_rate": 3.0000000000000004e-07, "loss": 1.4157, "step": 10 }, { "grad_norm": 4.533816814422607, "learning_rate": 6.333333333333333e-07, "loss": 1.4125, "step": 20 }, { "grad_norm": 4.002695560455322, "learning_rate": 9.666666666666668e-07, "loss": 1.3849, "step": 30 }, { "grad_norm": 2.823151111602783, "learning_rate": 1.3e-06, "loss": 1.3157, "step": 40 }, { "grad_norm": 1.5573179721832275, "learning_rate": 1.6333333333333333e-06, "loss": 1.2429, "step": 50 }, { "grad_norm": 0.9189413189888, "learning_rate": 1.9666666666666668e-06, "loss": 1.1899, "step": 60 }, { "grad_norm": 0.5993644595146179, "learning_rate": 2.3e-06, "loss": 1.1445, "step": 70 }, { "grad_norm": 0.4388406276702881, "learning_rate": 2.6333333333333337e-06, "loss": 1.1259, "step": 80 }, { "grad_norm": 0.5618627071380615, "learning_rate": 2.966666666666667e-06, "loss": 1.1022, "step": 90 }, { "grad_norm": 0.5426422953605652, "learning_rate": 3.3e-06, "loss": 1.1008, "step": 100 }, { "grad_norm": 0.6314268708229065, "learning_rate": 3.633333333333334e-06, "loss": 1.0895, "step": 110 }, { "grad_norm": 0.5122863054275513, "learning_rate": 3.966666666666667e-06, "loss": 1.0792, "step": 120 }, { "grad_norm": 0.5248027443885803, "learning_rate": 4.2999999999999995e-06, "loss": 1.0694, "step": 130 }, { "grad_norm": 0.6071310639381409, "learning_rate": 4.633333333333334e-06, "loss": 1.0675, "step": 140 }, { "grad_norm": 0.566581666469574, "learning_rate": 4.966666666666667e-06, "loss": 1.0633, "step": 150 }, { "grad_norm": 0.6244418025016785, "learning_rate": 5.3e-06, "loss": 1.0559, "step": 160 }, { "grad_norm": 0.4709051549434662, "learning_rate": 5.633333333333333e-06, "loss": 1.0554, "step": 170 }, { "grad_norm": 0.6256153583526611, "learning_rate": 5.9666666666666666e-06, "loss": 1.0538, "step": 180 }, { "grad_norm": 0.6180979609489441, "learning_rate": 6.300000000000001e-06, "loss": 1.0465, "step": 190 }, { "grad_norm": 0.5741148591041565, "learning_rate": 6.633333333333333e-06, "loss": 1.0468, "step": 200 }, { "grad_norm": 0.5811980366706848, "learning_rate": 6.966666666666667e-06, "loss": 1.0448, "step": 210 }, { "grad_norm": 0.6171549558639526, "learning_rate": 7.2999999999999996e-06, "loss": 1.04, "step": 220 }, { "grad_norm": 0.6265953183174133, "learning_rate": 7.633333333333334e-06, "loss": 1.0324, "step": 230 }, { "grad_norm": 0.6889417767524719, "learning_rate": 7.966666666666666e-06, "loss": 1.0267, "step": 240 }, { "grad_norm": 0.8068269491195679, "learning_rate": 8.3e-06, "loss": 1.0159, "step": 250 }, { "grad_norm": 0.8558771014213562, "learning_rate": 8.633333333333334e-06, "loss": 0.9939, "step": 260 }, { "grad_norm": 0.8399918079376221, "learning_rate": 8.966666666666668e-06, "loss": 0.9742, "step": 270 }, { "grad_norm": 1.2498595714569092, "learning_rate": 9.3e-06, "loss": 0.9264, "step": 280 }, { "grad_norm": 1.109431266784668, "learning_rate": 9.633333333333335e-06, "loss": 0.8852, "step": 290 }, { "grad_norm": 1.3449949026107788, "learning_rate": 9.966666666666667e-06, "loss": 0.827, "step": 300 }, { "grad_norm": 1.495901346206665, "learning_rate": 1.03e-05, "loss": 0.7714, "step": 310 }, { "grad_norm": 1.4885666370391846, "learning_rate": 1.0633333333333334e-05, "loss": 0.7105, "step": 320 }, { "grad_norm": 1.8869973421096802, "learning_rate": 1.0966666666666666e-05, "loss": 0.6658, "step": 330 }, { "grad_norm": 1.8417017459869385, "learning_rate": 1.13e-05, "loss": 0.6087, "step": 340 }, { "grad_norm": 2.179997682571411, "learning_rate": 1.1633333333333334e-05, "loss": 0.5631, "step": 350 }, { "grad_norm": 1.715309500694275, "learning_rate": 1.1966666666666668e-05, "loss": 0.5301, "step": 360 }, { "grad_norm": 2.6223623752593994, "learning_rate": 1.23e-05, "loss": 0.5061, "step": 370 }, { "grad_norm": 2.5044667720794678, "learning_rate": 1.2633333333333333e-05, "loss": 0.4606, "step": 380 }, { "grad_norm": 2.0453903675079346, "learning_rate": 1.2966666666666669e-05, "loss": 0.4261, "step": 390 }, { "grad_norm": 1.932612419128418, "learning_rate": 1.3300000000000001e-05, "loss": 0.3947, "step": 400 }, { "grad_norm": 1.958109974861145, "learning_rate": 1.3633333333333334e-05, "loss": 0.3719, "step": 410 }, { "grad_norm": 3.0342814922332764, "learning_rate": 1.3966666666666666e-05, "loss": 0.3453, "step": 420 }, { "grad_norm": 2.7047736644744873, "learning_rate": 1.43e-05, "loss": 0.3351, "step": 430 }, { "grad_norm": 2.491868734359741, "learning_rate": 1.4633333333333334e-05, "loss": 0.3125, "step": 440 }, { "grad_norm": 2.332961320877075, "learning_rate": 1.4966666666666668e-05, "loss": 0.2964, "step": 450 }, { "grad_norm": 2.4805514812469482, "learning_rate": 1.53e-05, "loss": 0.28, "step": 460 }, { "grad_norm": 2.3569538593292236, "learning_rate": 1.563333333333333e-05, "loss": 0.2752, "step": 470 }, { "grad_norm": 2.5456480979919434, "learning_rate": 1.5966666666666667e-05, "loss": 0.2511, "step": 480 }, { "grad_norm": 2.4560647010803223, "learning_rate": 1.63e-05, "loss": 0.2392, "step": 490 }, { "grad_norm": 2.0467610359191895, "learning_rate": 1.6633333333333336e-05, "loss": 0.223, "step": 500 }, { "grad_norm": 2.2710206508636475, "learning_rate": 1.6966666666666668e-05, "loss": 0.2181, "step": 510 }, { "grad_norm": 2.6807422637939453, "learning_rate": 1.73e-05, "loss": 0.197, "step": 520 }, { "grad_norm": 2.534992218017578, "learning_rate": 1.7633333333333336e-05, "loss": 0.2032, "step": 530 }, { "grad_norm": 1.784435749053955, "learning_rate": 1.796666666666667e-05, "loss": 0.1766, "step": 540 }, { "grad_norm": 2.40442156791687, "learning_rate": 1.83e-05, "loss": 0.1762, "step": 550 }, { "grad_norm": 2.732968330383301, "learning_rate": 1.8633333333333333e-05, "loss": 0.1682, "step": 560 }, { "grad_norm": 2.360205888748169, "learning_rate": 1.896666666666667e-05, "loss": 0.1527, "step": 570 }, { "grad_norm": 2.251589775085449, "learning_rate": 1.93e-05, "loss": 0.1554, "step": 580 }, { "grad_norm": 2.385878562927246, "learning_rate": 1.9633333333333334e-05, "loss": 0.1616, "step": 590 }, { "grad_norm": 2.435128927230835, "learning_rate": 1.9966666666666666e-05, "loss": 0.155, "step": 600 }, { "grad_norm": 2.9539012908935547, "learning_rate": 2.0300000000000002e-05, "loss": 0.1641, "step": 610 }, { "grad_norm": 2.2021026611328125, "learning_rate": 2.0633333333333335e-05, "loss": 0.1413, "step": 620 }, { "grad_norm": 2.5571975708007812, "learning_rate": 2.0966666666666667e-05, "loss": 0.1299, "step": 630 }, { "grad_norm": 2.4322869777679443, "learning_rate": 2.13e-05, "loss": 0.1402, "step": 640 }, { "grad_norm": 2.5171878337860107, "learning_rate": 2.1633333333333332e-05, "loss": 0.1384, "step": 650 }, { "grad_norm": 2.2761356830596924, "learning_rate": 2.1966666666666668e-05, "loss": 0.1293, "step": 660 }, { "grad_norm": 2.6294305324554443, "learning_rate": 2.23e-05, "loss": 0.1545, "step": 670 }, { "grad_norm": 2.5020318031311035, "learning_rate": 2.2633333333333336e-05, "loss": 0.1362, "step": 680 }, { "grad_norm": 2.714468002319336, "learning_rate": 2.2966666666666668e-05, "loss": 0.1303, "step": 690 }, { "grad_norm": 2.5271618366241455, "learning_rate": 2.3300000000000004e-05, "loss": 0.1314, "step": 700 }, { "grad_norm": 2.491684913635254, "learning_rate": 2.3633333333333336e-05, "loss": 0.1422, "step": 710 }, { "grad_norm": 2.2820470333099365, "learning_rate": 2.396666666666667e-05, "loss": 0.1239, "step": 720 }, { "grad_norm": 2.5786640644073486, "learning_rate": 2.43e-05, "loss": 0.1261, "step": 730 }, { "grad_norm": 2.4451346397399902, "learning_rate": 2.4633333333333334e-05, "loss": 0.1094, "step": 740 }, { "grad_norm": 2.223487377166748, "learning_rate": 2.496666666666667e-05, "loss": 0.1253, "step": 750 }, { "grad_norm": 2.8065717220306396, "learning_rate": 2.5300000000000002e-05, "loss": 0.1143, "step": 760 }, { "grad_norm": 2.274339437484741, "learning_rate": 2.5633333333333338e-05, "loss": 0.1171, "step": 770 }, { "grad_norm": 1.814670205116272, "learning_rate": 2.5966666666666667e-05, "loss": 0.1179, "step": 780 }, { "grad_norm": 2.0471904277801514, "learning_rate": 2.6300000000000002e-05, "loss": 0.1096, "step": 790 }, { "grad_norm": 2.3558876514434814, "learning_rate": 2.663333333333333e-05, "loss": 0.1171, "step": 800 }, { "grad_norm": 2.887620449066162, "learning_rate": 2.6966666666666667e-05, "loss": 0.1218, "step": 810 }, { "grad_norm": 1.7995444536209106, "learning_rate": 2.7300000000000003e-05, "loss": 0.1136, "step": 820 }, { "grad_norm": 1.9670045375823975, "learning_rate": 2.7633333333333332e-05, "loss": 0.1135, "step": 830 }, { "grad_norm": 1.573832631111145, "learning_rate": 2.7966666666666668e-05, "loss": 0.1058, "step": 840 }, { "grad_norm": 2.520409345626831, "learning_rate": 2.83e-05, "loss": 0.1265, "step": 850 }, { "grad_norm": 2.2822840213775635, "learning_rate": 2.8633333333333336e-05, "loss": 0.1077, "step": 860 }, { "grad_norm": 2.5108864307403564, "learning_rate": 2.8966666666666668e-05, "loss": 0.1099, "step": 870 }, { "grad_norm": 2.2583436965942383, "learning_rate": 2.93e-05, "loss": 0.102, "step": 880 }, { "grad_norm": 1.973009705543518, "learning_rate": 2.9633333333333336e-05, "loss": 0.1006, "step": 890 }, { "grad_norm": 2.5470638275146484, "learning_rate": 2.9966666666666672e-05, "loss": 0.1162, "step": 900 }, { "grad_norm": 2.4625256061553955, "learning_rate": 3.03e-05, "loss": 0.0983, "step": 910 }, { "grad_norm": 2.042452096939087, "learning_rate": 3.063333333333334e-05, "loss": 0.1113, "step": 920 }, { "grad_norm": 2.270254373550415, "learning_rate": 3.096666666666666e-05, "loss": 0.1007, "step": 930 }, { "grad_norm": 1.8730031251907349, "learning_rate": 3.13e-05, "loss": 0.0979, "step": 940 }, { "grad_norm": 1.923866868019104, "learning_rate": 3.1633333333333334e-05, "loss": 0.1135, "step": 950 }, { "grad_norm": 2.1694090366363525, "learning_rate": 3.196666666666667e-05, "loss": 0.0968, "step": 960 }, { "grad_norm": 2.5053062438964844, "learning_rate": 3.2300000000000006e-05, "loss": 0.0962, "step": 970 }, { "grad_norm": 1.9679805040359497, "learning_rate": 3.263333333333333e-05, "loss": 0.1052, "step": 980 }, { "grad_norm": 2.5273709297180176, "learning_rate": 3.296666666666667e-05, "loss": 0.1073, "step": 990 }, { "grad_norm": 2.220262050628662, "learning_rate": 3.33e-05, "loss": 0.1015, "step": 1000 }, { "grad_norm": 2.0471818447113037, "learning_rate": 3.3633333333333335e-05, "loss": 0.0954, "step": 1010 }, { "grad_norm": 1.8970118761062622, "learning_rate": 3.396666666666667e-05, "loss": 0.0973, "step": 1020 }, { "grad_norm": 1.8871910572052002, "learning_rate": 3.430000000000001e-05, "loss": 0.0991, "step": 1030 }, { "grad_norm": 1.8973971605300903, "learning_rate": 3.463333333333333e-05, "loss": 0.0956, "step": 1040 }, { "grad_norm": 2.0561819076538086, "learning_rate": 3.496666666666667e-05, "loss": 0.0913, "step": 1050 }, { "grad_norm": 2.165154457092285, "learning_rate": 3.53e-05, "loss": 0.1109, "step": 1060 }, { "grad_norm": 2.1164817810058594, "learning_rate": 3.563333333333334e-05, "loss": 0.0997, "step": 1070 }, { "grad_norm": 2.1834139823913574, "learning_rate": 3.596666666666667e-05, "loss": 0.1002, "step": 1080 }, { "grad_norm": 1.7735356092453003, "learning_rate": 3.63e-05, "loss": 0.081, "step": 1090 }, { "grad_norm": 2.0510787963867188, "learning_rate": 3.6633333333333334e-05, "loss": 0.0949, "step": 1100 }, { "grad_norm": 2.2429168224334717, "learning_rate": 3.6966666666666666e-05, "loss": 0.0972, "step": 1110 }, { "grad_norm": 1.9839978218078613, "learning_rate": 3.73e-05, "loss": 0.0939, "step": 1120 }, { "grad_norm": 1.8827766180038452, "learning_rate": 3.763333333333334e-05, "loss": 0.1018, "step": 1130 }, { "grad_norm": 1.7244727611541748, "learning_rate": 3.796666666666667e-05, "loss": 0.096, "step": 1140 }, { "grad_norm": 1.6272896528244019, "learning_rate": 3.83e-05, "loss": 0.0962, "step": 1150 }, { "grad_norm": 1.6950806379318237, "learning_rate": 3.8633333333333335e-05, "loss": 0.0985, "step": 1160 }, { "grad_norm": 1.6659928560256958, "learning_rate": 3.896666666666667e-05, "loss": 0.0897, "step": 1170 }, { "grad_norm": 1.8704904317855835, "learning_rate": 3.9300000000000007e-05, "loss": 0.0916, "step": 1180 }, { "grad_norm": 2.13916277885437, "learning_rate": 3.963333333333333e-05, "loss": 0.093, "step": 1190 }, { "grad_norm": 1.256888508796692, "learning_rate": 3.996666666666667e-05, "loss": 0.0991, "step": 1200 }, { "grad_norm": 1.7913360595703125, "learning_rate": 4.0300000000000004e-05, "loss": 0.096, "step": 1210 }, { "grad_norm": 1.8980103731155396, "learning_rate": 4.0633333333333336e-05, "loss": 0.0886, "step": 1220 }, { "grad_norm": 1.5002257823944092, "learning_rate": 4.096666666666667e-05, "loss": 0.0946, "step": 1230 }, { "grad_norm": 1.6084682941436768, "learning_rate": 4.13e-05, "loss": 0.0892, "step": 1240 }, { "grad_norm": 1.6400270462036133, "learning_rate": 4.1633333333333333e-05, "loss": 0.0955, "step": 1250 }, { "grad_norm": 1.6958472728729248, "learning_rate": 4.196666666666667e-05, "loss": 0.0845, "step": 1260 }, { "grad_norm": 1.517759919166565, "learning_rate": 4.23e-05, "loss": 0.0965, "step": 1270 }, { "grad_norm": 1.7336673736572266, "learning_rate": 4.263333333333334e-05, "loss": 0.0946, "step": 1280 }, { "grad_norm": 1.8478679656982422, "learning_rate": 4.296666666666666e-05, "loss": 0.0963, "step": 1290 }, { "grad_norm": 1.453667402267456, "learning_rate": 4.33e-05, "loss": 0.0882, "step": 1300 }, { "grad_norm": 1.4964280128479004, "learning_rate": 4.3633333333333335e-05, "loss": 0.0831, "step": 1310 }, { "grad_norm": 1.585028052330017, "learning_rate": 4.396666666666667e-05, "loss": 0.0965, "step": 1320 }, { "grad_norm": 1.6701347827911377, "learning_rate": 4.43e-05, "loss": 0.0824, "step": 1330 }, { "grad_norm": 1.593643307685852, "learning_rate": 4.463333333333334e-05, "loss": 0.0888, "step": 1340 }, { "grad_norm": 1.9641443490982056, "learning_rate": 4.496666666666667e-05, "loss": 0.0832, "step": 1350 }, { "grad_norm": 2.0581891536712646, "learning_rate": 4.53e-05, "loss": 0.0925, "step": 1360 }, { "grad_norm": 1.3554548025131226, "learning_rate": 4.5633333333333336e-05, "loss": 0.0772, "step": 1370 }, { "grad_norm": 1.6955831050872803, "learning_rate": 4.596666666666667e-05, "loss": 0.0835, "step": 1380 }, { "grad_norm": 1.43825364112854, "learning_rate": 4.630000000000001e-05, "loss": 0.0824, "step": 1390 }, { "grad_norm": 1.4359674453735352, "learning_rate": 4.663333333333333e-05, "loss": 0.0919, "step": 1400 }, { "grad_norm": 1.3146463632583618, "learning_rate": 4.696666666666667e-05, "loss": 0.0747, "step": 1410 }, { "grad_norm": 1.3383302688598633, "learning_rate": 4.73e-05, "loss": 0.0852, "step": 1420 }, { "grad_norm": 1.4164702892303467, "learning_rate": 4.763333333333334e-05, "loss": 0.088, "step": 1430 }, { "grad_norm": 1.6717121601104736, "learning_rate": 4.796666666666667e-05, "loss": 0.0838, "step": 1440 }, { "grad_norm": 1.6992640495300293, "learning_rate": 4.83e-05, "loss": 0.0885, "step": 1450 }, { "grad_norm": 1.4425464868545532, "learning_rate": 4.8633333333333334e-05, "loss": 0.08, "step": 1460 }, { "grad_norm": 1.7380220890045166, "learning_rate": 4.8966666666666667e-05, "loss": 0.0857, "step": 1470 }, { "grad_norm": 1.3327770233154297, "learning_rate": 4.93e-05, "loss": 0.0857, "step": 1480 }, { "grad_norm": 1.3948441743850708, "learning_rate": 4.963333333333334e-05, "loss": 0.0768, "step": 1490 }, { "grad_norm": 1.4107425212860107, "learning_rate": 4.996666666666667e-05, "loss": 0.0956, "step": 1500 }, { "grad_norm": 1.5134875774383545, "learning_rate": 5.03e-05, "loss": 0.081, "step": 1510 }, { "grad_norm": 1.4226447343826294, "learning_rate": 5.0633333333333335e-05, "loss": 0.0835, "step": 1520 }, { "grad_norm": 1.7193541526794434, "learning_rate": 5.0966666666666674e-05, "loss": 0.0847, "step": 1530 }, { "grad_norm": 1.5719715356826782, "learning_rate": 5.130000000000001e-05, "loss": 0.0889, "step": 1540 }, { "grad_norm": 1.4408549070358276, "learning_rate": 5.163333333333333e-05, "loss": 0.084, "step": 1550 }, { "grad_norm": 1.4915046691894531, "learning_rate": 5.196666666666667e-05, "loss": 0.0844, "step": 1560 }, { "grad_norm": 1.3872088193893433, "learning_rate": 5.2300000000000004e-05, "loss": 0.0797, "step": 1570 }, { "grad_norm": 1.6842246055603027, "learning_rate": 5.2633333333333336e-05, "loss": 0.0743, "step": 1580 }, { "grad_norm": 1.87129807472229, "learning_rate": 5.296666666666666e-05, "loss": 0.0831, "step": 1590 }, { "grad_norm": 1.3865938186645508, "learning_rate": 5.330000000000001e-05, "loss": 0.0813, "step": 1600 }, { "grad_norm": 1.0861713886260986, "learning_rate": 5.3633333333333334e-05, "loss": 0.0744, "step": 1610 }, { "grad_norm": 1.3400224447250366, "learning_rate": 5.3966666666666666e-05, "loss": 0.0702, "step": 1620 }, { "grad_norm": 1.258410096168518, "learning_rate": 5.4300000000000005e-05, "loss": 0.0774, "step": 1630 }, { "grad_norm": 1.5131471157073975, "learning_rate": 5.463333333333334e-05, "loss": 0.076, "step": 1640 }, { "grad_norm": 1.4185198545455933, "learning_rate": 5.496666666666666e-05, "loss": 0.0849, "step": 1650 }, { "grad_norm": 1.3197180032730103, "learning_rate": 5.530000000000001e-05, "loss": 0.0678, "step": 1660 }, { "grad_norm": 1.8167043924331665, "learning_rate": 5.5633333333333335e-05, "loss": 0.0781, "step": 1670 }, { "grad_norm": 1.2916417121887207, "learning_rate": 5.596666666666667e-05, "loss": 0.0833, "step": 1680 }, { "grad_norm": 1.3026211261749268, "learning_rate": 5.63e-05, "loss": 0.0829, "step": 1690 }, { "grad_norm": 1.4835796356201172, "learning_rate": 5.663333333333334e-05, "loss": 0.08, "step": 1700 }, { "grad_norm": 1.2610836029052734, "learning_rate": 5.696666666666667e-05, "loss": 0.0745, "step": 1710 }, { "grad_norm": 1.566968321800232, "learning_rate": 5.73e-05, "loss": 0.0828, "step": 1720 }, { "grad_norm": 1.255800485610962, "learning_rate": 5.7633333333333336e-05, "loss": 0.0824, "step": 1730 }, { "grad_norm": 1.2247788906097412, "learning_rate": 5.796666666666667e-05, "loss": 0.0723, "step": 1740 }, { "grad_norm": 1.3425395488739014, "learning_rate": 5.83e-05, "loss": 0.0729, "step": 1750 }, { "grad_norm": 1.2652937173843384, "learning_rate": 5.863333333333334e-05, "loss": 0.0823, "step": 1760 }, { "grad_norm": 1.3104197978973389, "learning_rate": 5.896666666666667e-05, "loss": 0.0763, "step": 1770 }, { "grad_norm": 1.1734591722488403, "learning_rate": 5.93e-05, "loss": 0.0765, "step": 1780 }, { "grad_norm": 1.1746596097946167, "learning_rate": 5.9633333333333344e-05, "loss": 0.0852, "step": 1790 }, { "grad_norm": 1.4064711332321167, "learning_rate": 5.996666666666667e-05, "loss": 0.0745, "step": 1800 }, { "grad_norm": 0.9678300619125366, "learning_rate": 6.03e-05, "loss": 0.0756, "step": 1810 }, { "grad_norm": 1.1954820156097412, "learning_rate": 6.063333333333333e-05, "loss": 0.071, "step": 1820 }, { "grad_norm": 1.303094744682312, "learning_rate": 6.0966666666666674e-05, "loss": 0.0712, "step": 1830 }, { "grad_norm": 1.378692626953125, "learning_rate": 6.13e-05, "loss": 0.0734, "step": 1840 }, { "grad_norm": 1.252637505531311, "learning_rate": 6.163333333333333e-05, "loss": 0.0719, "step": 1850 }, { "grad_norm": 1.215796947479248, "learning_rate": 6.196666666666668e-05, "loss": 0.0832, "step": 1860 }, { "grad_norm": 1.1180384159088135, "learning_rate": 6.23e-05, "loss": 0.0809, "step": 1870 }, { "grad_norm": 1.3103326559066772, "learning_rate": 6.263333333333333e-05, "loss": 0.07, "step": 1880 }, { "grad_norm": 1.1420583724975586, "learning_rate": 6.296666666666667e-05, "loss": 0.0655, "step": 1890 }, { "grad_norm": 1.397101879119873, "learning_rate": 6.330000000000001e-05, "loss": 0.0888, "step": 1900 }, { "grad_norm": 1.2339445352554321, "learning_rate": 6.363333333333334e-05, "loss": 0.0784, "step": 1910 }, { "grad_norm": 1.3281197547912598, "learning_rate": 6.396666666666667e-05, "loss": 0.0742, "step": 1920 }, { "grad_norm": 1.4674954414367676, "learning_rate": 6.43e-05, "loss": 0.0753, "step": 1930 }, { "grad_norm": 1.1643928289413452, "learning_rate": 6.463333333333334e-05, "loss": 0.0774, "step": 1940 }, { "grad_norm": 1.4948179721832275, "learning_rate": 6.496666666666667e-05, "loss": 0.0777, "step": 1950 }, { "grad_norm": 1.1078697443008423, "learning_rate": 6.53e-05, "loss": 0.0628, "step": 1960 }, { "grad_norm": 1.1832149028778076, "learning_rate": 6.563333333333333e-05, "loss": 0.0764, "step": 1970 }, { "grad_norm": 1.3219122886657715, "learning_rate": 6.596666666666667e-05, "loss": 0.083, "step": 1980 }, { "grad_norm": 1.2816904783248901, "learning_rate": 6.630000000000001e-05, "loss": 0.073, "step": 1990 }, { "grad_norm": 0.922963559627533, "learning_rate": 6.663333333333333e-05, "loss": 0.0667, "step": 2000 }, { "grad_norm": 1.0142713785171509, "learning_rate": 6.696666666666666e-05, "loss": 0.0682, "step": 2010 }, { "grad_norm": 1.1142243146896362, "learning_rate": 6.730000000000001e-05, "loss": 0.0702, "step": 2020 }, { "grad_norm": 1.3863885402679443, "learning_rate": 6.763333333333334e-05, "loss": 0.0748, "step": 2030 }, { "grad_norm": 0.8603031635284424, "learning_rate": 6.796666666666666e-05, "loss": 0.0776, "step": 2040 }, { "grad_norm": 1.0744670629501343, "learning_rate": 6.83e-05, "loss": 0.0697, "step": 2050 }, { "grad_norm": 1.0202767848968506, "learning_rate": 6.863333333333334e-05, "loss": 0.0658, "step": 2060 }, { "grad_norm": 1.1308379173278809, "learning_rate": 6.896666666666667e-05, "loss": 0.0656, "step": 2070 }, { "grad_norm": 1.1652814149856567, "learning_rate": 6.93e-05, "loss": 0.0663, "step": 2080 }, { "grad_norm": 1.0037552118301392, "learning_rate": 6.963333333333334e-05, "loss": 0.0651, "step": 2090 }, { "grad_norm": 1.3986576795578003, "learning_rate": 6.996666666666667e-05, "loss": 0.0671, "step": 2100 }, { "grad_norm": 1.111999273300171, "learning_rate": 7.03e-05, "loss": 0.0658, "step": 2110 }, { "grad_norm": 0.9913539886474609, "learning_rate": 7.063333333333333e-05, "loss": 0.0624, "step": 2120 }, { "grad_norm": 1.0654922723770142, "learning_rate": 7.096666666666667e-05, "loss": 0.0738, "step": 2130 }, { "grad_norm": 1.2569122314453125, "learning_rate": 7.13e-05, "loss": 0.0717, "step": 2140 }, { "grad_norm": 1.0777548551559448, "learning_rate": 7.163333333333334e-05, "loss": 0.0788, "step": 2150 }, { "grad_norm": 1.2611500024795532, "learning_rate": 7.196666666666668e-05, "loss": 0.0681, "step": 2160 }, { "grad_norm": 1.1128934621810913, "learning_rate": 7.23e-05, "loss": 0.0647, "step": 2170 }, { "grad_norm": 1.0642884969711304, "learning_rate": 7.263333333333334e-05, "loss": 0.0755, "step": 2180 }, { "grad_norm": 1.2553468942642212, "learning_rate": 7.296666666666667e-05, "loss": 0.0685, "step": 2190 }, { "grad_norm": 0.9592381715774536, "learning_rate": 7.33e-05, "loss": 0.0639, "step": 2200 }, { "grad_norm": 1.0337885618209839, "learning_rate": 7.363333333333334e-05, "loss": 0.0634, "step": 2210 }, { "grad_norm": 1.0090278387069702, "learning_rate": 7.396666666666667e-05, "loss": 0.0692, "step": 2220 }, { "grad_norm": 1.358959436416626, "learning_rate": 7.43e-05, "loss": 0.0749, "step": 2230 }, { "grad_norm": 1.103554368019104, "learning_rate": 7.463333333333334e-05, "loss": 0.0619, "step": 2240 }, { "grad_norm": 0.8154667019844055, "learning_rate": 7.496666666666667e-05, "loss": 0.0714, "step": 2250 }, { "grad_norm": 0.9753686785697937, "learning_rate": 7.53e-05, "loss": 0.0685, "step": 2260 }, { "grad_norm": 1.071243405342102, "learning_rate": 7.563333333333333e-05, "loss": 0.0604, "step": 2270 }, { "grad_norm": 1.1003142595291138, "learning_rate": 7.596666666666668e-05, "loss": 0.0709, "step": 2280 }, { "grad_norm": 1.0595791339874268, "learning_rate": 7.630000000000001e-05, "loss": 0.0636, "step": 2290 }, { "grad_norm": 0.9559823870658875, "learning_rate": 7.663333333333333e-05, "loss": 0.0612, "step": 2300 }, { "grad_norm": 1.1679898500442505, "learning_rate": 7.696666666666668e-05, "loss": 0.0739, "step": 2310 }, { "grad_norm": 1.0987716913223267, "learning_rate": 7.730000000000001e-05, "loss": 0.0629, "step": 2320 }, { "grad_norm": 1.1461509466171265, "learning_rate": 7.763333333333334e-05, "loss": 0.0713, "step": 2330 }, { "grad_norm": 1.0480573177337646, "learning_rate": 7.796666666666666e-05, "loss": 0.0646, "step": 2340 }, { "grad_norm": 1.0487347841262817, "learning_rate": 7.83e-05, "loss": 0.0708, "step": 2350 }, { "grad_norm": 0.8615747094154358, "learning_rate": 7.863333333333334e-05, "loss": 0.0633, "step": 2360 }, { "grad_norm": 0.961333692073822, "learning_rate": 7.896666666666667e-05, "loss": 0.063, "step": 2370 }, { "grad_norm": 0.8947559595108032, "learning_rate": 7.93e-05, "loss": 0.0596, "step": 2380 }, { "grad_norm": 1.1710392236709595, "learning_rate": 7.963333333333334e-05, "loss": 0.0697, "step": 2390 }, { "grad_norm": 1.1182241439819336, "learning_rate": 7.996666666666667e-05, "loss": 0.0677, "step": 2400 }, { "grad_norm": 1.1741247177124023, "learning_rate": 8.030000000000001e-05, "loss": 0.0614, "step": 2410 }, { "grad_norm": 1.0437695980072021, "learning_rate": 8.063333333333333e-05, "loss": 0.0615, "step": 2420 }, { "grad_norm": 0.9634862542152405, "learning_rate": 8.096666666666667e-05, "loss": 0.0616, "step": 2430 }, { "grad_norm": 0.8846582174301147, "learning_rate": 8.13e-05, "loss": 0.0633, "step": 2440 }, { "grad_norm": 1.008697509765625, "learning_rate": 8.163333333333334e-05, "loss": 0.0595, "step": 2450 }, { "grad_norm": 0.9665005803108215, "learning_rate": 8.196666666666668e-05, "loss": 0.0682, "step": 2460 }, { "grad_norm": 0.8760867714881897, "learning_rate": 8.23e-05, "loss": 0.0598, "step": 2470 }, { "grad_norm": 0.8840994238853455, "learning_rate": 8.263333333333334e-05, "loss": 0.0603, "step": 2480 }, { "grad_norm": 0.6845605373382568, "learning_rate": 8.296666666666667e-05, "loss": 0.0544, "step": 2490 }, { "grad_norm": 1.1670717000961304, "learning_rate": 8.33e-05, "loss": 0.0689, "step": 2500 }, { "grad_norm": 0.9465165734291077, "learning_rate": 8.363333333333334e-05, "loss": 0.068, "step": 2510 }, { "grad_norm": 0.7773513197898865, "learning_rate": 8.396666666666667e-05, "loss": 0.0636, "step": 2520 }, { "grad_norm": 0.9731705784797668, "learning_rate": 8.43e-05, "loss": 0.059, "step": 2530 }, { "grad_norm": 1.029721736907959, "learning_rate": 8.463333333333335e-05, "loss": 0.0691, "step": 2540 }, { "grad_norm": 0.88335120677948, "learning_rate": 8.496666666666667e-05, "loss": 0.0751, "step": 2550 }, { "grad_norm": 0.790886640548706, "learning_rate": 8.53e-05, "loss": 0.0679, "step": 2560 }, { "grad_norm": 0.7625932693481445, "learning_rate": 8.563333333333333e-05, "loss": 0.0597, "step": 2570 }, { "grad_norm": 0.8958044648170471, "learning_rate": 8.596666666666668e-05, "loss": 0.0641, "step": 2580 }, { "grad_norm": 0.877277672290802, "learning_rate": 8.63e-05, "loss": 0.0641, "step": 2590 }, { "grad_norm": 0.9416974782943726, "learning_rate": 8.663333333333333e-05, "loss": 0.0612, "step": 2600 }, { "grad_norm": 0.8281979560852051, "learning_rate": 8.696666666666668e-05, "loss": 0.059, "step": 2610 }, { "grad_norm": 0.8689745664596558, "learning_rate": 8.730000000000001e-05, "loss": 0.0615, "step": 2620 }, { "grad_norm": 0.8341608643531799, "learning_rate": 8.763333333333334e-05, "loss": 0.0561, "step": 2630 }, { "grad_norm": 0.698942244052887, "learning_rate": 8.796666666666667e-05, "loss": 0.0522, "step": 2640 }, { "grad_norm": 0.8354687094688416, "learning_rate": 8.83e-05, "loss": 0.0604, "step": 2650 }, { "grad_norm": 1.0889543294906616, "learning_rate": 8.863333333333334e-05, "loss": 0.0719, "step": 2660 }, { "grad_norm": 0.9338927268981934, "learning_rate": 8.896666666666667e-05, "loss": 0.0679, "step": 2670 }, { "grad_norm": 0.8350193500518799, "learning_rate": 8.93e-05, "loss": 0.0661, "step": 2680 }, { "grad_norm": 0.7167150378227234, "learning_rate": 8.963333333333333e-05, "loss": 0.0568, "step": 2690 }, { "grad_norm": 0.8887212872505188, "learning_rate": 8.996666666666667e-05, "loss": 0.0646, "step": 2700 }, { "grad_norm": 0.8746552467346191, "learning_rate": 9.030000000000001e-05, "loss": 0.0602, "step": 2710 }, { "grad_norm": 0.8915983438491821, "learning_rate": 9.063333333333333e-05, "loss": 0.061, "step": 2720 }, { "grad_norm": 0.7062487006187439, "learning_rate": 9.096666666666666e-05, "loss": 0.0578, "step": 2730 }, { "grad_norm": 0.9249823093414307, "learning_rate": 9.130000000000001e-05, "loss": 0.0597, "step": 2740 }, { "grad_norm": 0.8496858477592468, "learning_rate": 9.163333333333334e-05, "loss": 0.0543, "step": 2750 }, { "grad_norm": 0.8931301236152649, "learning_rate": 9.196666666666666e-05, "loss": 0.0565, "step": 2760 }, { "grad_norm": 0.8664289712905884, "learning_rate": 9.230000000000001e-05, "loss": 0.0569, "step": 2770 }, { "grad_norm": 0.7348942756652832, "learning_rate": 9.263333333333334e-05, "loss": 0.0564, "step": 2780 }, { "grad_norm": 0.8783203363418579, "learning_rate": 9.296666666666667e-05, "loss": 0.0525, "step": 2790 }, { "grad_norm": 0.8279241323471069, "learning_rate": 9.33e-05, "loss": 0.0551, "step": 2800 }, { "grad_norm": 0.8857786655426025, "learning_rate": 9.363333333333334e-05, "loss": 0.0566, "step": 2810 }, { "grad_norm": 0.8813221454620361, "learning_rate": 9.396666666666667e-05, "loss": 0.0562, "step": 2820 }, { "grad_norm": 0.8645294904708862, "learning_rate": 9.43e-05, "loss": 0.0549, "step": 2830 }, { "grad_norm": 0.8404380083084106, "learning_rate": 9.463333333333333e-05, "loss": 0.0546, "step": 2840 }, { "grad_norm": 0.914763867855072, "learning_rate": 9.496666666666667e-05, "loss": 0.0559, "step": 2850 }, { "grad_norm": 0.6793450117111206, "learning_rate": 9.53e-05, "loss": 0.0576, "step": 2860 }, { "grad_norm": 0.8766696453094482, "learning_rate": 9.563333333333334e-05, "loss": 0.0544, "step": 2870 }, { "grad_norm": 0.9002400040626526, "learning_rate": 9.596666666666668e-05, "loss": 0.0521, "step": 2880 }, { "grad_norm": 0.8387170433998108, "learning_rate": 9.63e-05, "loss": 0.0529, "step": 2890 }, { "grad_norm": 0.8228157162666321, "learning_rate": 9.663333333333334e-05, "loss": 0.0547, "step": 2900 }, { "grad_norm": 0.8644803166389465, "learning_rate": 9.696666666666667e-05, "loss": 0.0694, "step": 2910 }, { "grad_norm": 0.8468896150588989, "learning_rate": 9.730000000000001e-05, "loss": 0.0547, "step": 2920 }, { "grad_norm": 0.6621829271316528, "learning_rate": 9.763333333333334e-05, "loss": 0.0548, "step": 2930 }, { "grad_norm": 0.7150681614875793, "learning_rate": 9.796666666666667e-05, "loss": 0.0515, "step": 2940 }, { "grad_norm": 0.6284530758857727, "learning_rate": 9.83e-05, "loss": 0.0521, "step": 2950 }, { "grad_norm": 0.7855443358421326, "learning_rate": 9.863333333333334e-05, "loss": 0.0636, "step": 2960 }, { "grad_norm": 0.9308194518089294, "learning_rate": 9.896666666666667e-05, "loss": 0.0584, "step": 2970 }, { "grad_norm": 0.8599221110343933, "learning_rate": 9.93e-05, "loss": 0.0582, "step": 2980 }, { "grad_norm": 0.7483208179473877, "learning_rate": 9.963333333333333e-05, "loss": 0.0515, "step": 2990 }, { "grad_norm": 0.8008257746696472, "learning_rate": 9.996666666666668e-05, "loss": 0.0606, "step": 3000 }, { "grad_norm": 0.8120972514152527, "learning_rate": 9.999999384858465e-05, "loss": 0.0551, "step": 3010 }, { "grad_norm": 0.9966566562652588, "learning_rate": 9.999997258443473e-05, "loss": 0.0652, "step": 3020 }, { "grad_norm": 0.7767784595489502, "learning_rate": 9.999993613161331e-05, "loss": 0.0526, "step": 3030 }, { "grad_norm": 0.7611342072486877, "learning_rate": 9.999988449013146e-05, "loss": 0.0501, "step": 3040 }, { "grad_norm": 0.899966835975647, "learning_rate": 9.99998176600049e-05, "loss": 0.0582, "step": 3050 }, { "grad_norm": 0.7014909982681274, "learning_rate": 9.999973564125389e-05, "loss": 0.0603, "step": 3060 }, { "grad_norm": 0.8234820365905762, "learning_rate": 9.999963843390335e-05, "loss": 0.0625, "step": 3070 }, { "grad_norm": 0.8866924047470093, "learning_rate": 9.999952603798282e-05, "loss": 0.0541, "step": 3080 }, { "grad_norm": 0.6899054050445557, "learning_rate": 9.999939845352646e-05, "loss": 0.055, "step": 3090 }, { "grad_norm": 0.7839197516441345, "learning_rate": 9.999925568057298e-05, "loss": 0.0547, "step": 3100 }, { "grad_norm": 0.8694648146629333, "learning_rate": 9.999909771916578e-05, "loss": 0.0606, "step": 3110 }, { "grad_norm": 0.7287220358848572, "learning_rate": 9.999892456935285e-05, "loss": 0.052, "step": 3120 }, { "grad_norm": 0.7638460397720337, "learning_rate": 9.999873623118679e-05, "loss": 0.0569, "step": 3130 }, { "grad_norm": 0.8368794322013855, "learning_rate": 9.999853270472479e-05, "loss": 0.0579, "step": 3140 }, { "grad_norm": 0.8717802166938782, "learning_rate": 9.999831399002871e-05, "loss": 0.0553, "step": 3150 }, { "grad_norm": 0.7637776136398315, "learning_rate": 9.999808008716494e-05, "loss": 0.0507, "step": 3160 }, { "grad_norm": 0.733996570110321, "learning_rate": 9.999783099620459e-05, "loss": 0.0551, "step": 3170 }, { "grad_norm": 0.5987215042114258, "learning_rate": 9.999756671722328e-05, "loss": 0.0517, "step": 3180 }, { "grad_norm": 0.7850061058998108, "learning_rate": 9.99972872503013e-05, "loss": 0.0553, "step": 3190 }, { "grad_norm": 0.7434727549552917, "learning_rate": 9.999699259552359e-05, "loss": 0.0593, "step": 3200 }, { "grad_norm": 0.8338168263435364, "learning_rate": 9.99966827529796e-05, "loss": 0.0569, "step": 3210 }, { "grad_norm": 0.953469455242157, "learning_rate": 9.999635772276348e-05, "loss": 0.0502, "step": 3220 }, { "grad_norm": 0.7861298322677612, "learning_rate": 9.999601750497396e-05, "loss": 0.0529, "step": 3230 }, { "grad_norm": 0.7575981616973877, "learning_rate": 9.99956620997144e-05, "loss": 0.0459, "step": 3240 }, { "grad_norm": 0.7318950891494751, "learning_rate": 9.999529150709275e-05, "loss": 0.0484, "step": 3250 }, { "grad_norm": 0.858807384967804, "learning_rate": 9.999490572722158e-05, "loss": 0.0622, "step": 3260 }, { "grad_norm": 0.7145293354988098, "learning_rate": 9.99945047602181e-05, "loss": 0.0544, "step": 3270 }, { "grad_norm": 0.7078118920326233, "learning_rate": 9.99940886062041e-05, "loss": 0.0475, "step": 3280 }, { "grad_norm": 0.7223478555679321, "learning_rate": 9.999365726530599e-05, "loss": 0.0547, "step": 3290 }, { "grad_norm": 0.7214303612709045, "learning_rate": 9.999321073765481e-05, "loss": 0.0518, "step": 3300 }, { "grad_norm": 0.7803531885147095, "learning_rate": 9.99927490233862e-05, "loss": 0.0485, "step": 3310 }, { "grad_norm": 0.6373307704925537, "learning_rate": 9.999227212264043e-05, "loss": 0.0463, "step": 3320 }, { "grad_norm": 0.7561566829681396, "learning_rate": 9.999178003556236e-05, "loss": 0.049, "step": 3330 }, { "grad_norm": 0.5519773960113525, "learning_rate": 9.999127276230146e-05, "loss": 0.0475, "step": 3340 }, { "grad_norm": 0.6129007935523987, "learning_rate": 9.999075030301184e-05, "loss": 0.0539, "step": 3350 }, { "grad_norm": 0.9630881547927856, "learning_rate": 9.999021265785221e-05, "loss": 0.049, "step": 3360 }, { "grad_norm": 0.654624342918396, "learning_rate": 9.998965982698589e-05, "loss": 0.0608, "step": 3370 }, { "grad_norm": 0.6920844316482544, "learning_rate": 9.998909181058082e-05, "loss": 0.0532, "step": 3380 }, { "grad_norm": 0.7399725317955017, "learning_rate": 9.998850860880953e-05, "loss": 0.055, "step": 3390 }, { "grad_norm": 0.9067150354385376, "learning_rate": 9.998791022184922e-05, "loss": 0.0604, "step": 3400 }, { "grad_norm": 0.7762711048126221, "learning_rate": 9.99872966498816e-05, "loss": 0.0516, "step": 3410 }, { "grad_norm": 0.7466596961021423, "learning_rate": 9.998666789309313e-05, "loss": 0.053, "step": 3420 }, { "grad_norm": 0.6821275949478149, "learning_rate": 9.998602395167475e-05, "loss": 0.0472, "step": 3430 }, { "grad_norm": 0.7221843600273132, "learning_rate": 9.998536482582213e-05, "loss": 0.0492, "step": 3440 }, { "grad_norm": 0.7387469410896301, "learning_rate": 9.998469051573544e-05, "loss": 0.0452, "step": 3450 }, { "grad_norm": 0.6583935022354126, "learning_rate": 9.998400102161954e-05, "loss": 0.0491, "step": 3460 }, { "grad_norm": 0.7039799690246582, "learning_rate": 9.998329634368388e-05, "loss": 0.0441, "step": 3470 }, { "grad_norm": 0.677544891834259, "learning_rate": 9.998257648214253e-05, "loss": 0.0504, "step": 3480 }, { "grad_norm": 0.6488243937492371, "learning_rate": 9.998184143721417e-05, "loss": 0.0462, "step": 3490 }, { "grad_norm": 0.7597591876983643, "learning_rate": 9.998109120912206e-05, "loss": 0.0563, "step": 3500 }, { "grad_norm": 0.7398865818977356, "learning_rate": 9.998032579809411e-05, "loss": 0.0491, "step": 3510 }, { "grad_norm": 0.68448406457901, "learning_rate": 9.997954520436286e-05, "loss": 0.0471, "step": 3520 }, { "grad_norm": 0.6528464555740356, "learning_rate": 9.997874942816538e-05, "loss": 0.0508, "step": 3530 }, { "grad_norm": 0.7047237157821655, "learning_rate": 9.997793846974345e-05, "loss": 0.057, "step": 3540 }, { "grad_norm": 0.7688554525375366, "learning_rate": 9.997711232934341e-05, "loss": 0.0502, "step": 3550 }, { "grad_norm": 0.6895895004272461, "learning_rate": 9.99762710072162e-05, "loss": 0.0501, "step": 3560 }, { "grad_norm": 0.7453030943870544, "learning_rate": 9.997541450361743e-05, "loss": 0.0543, "step": 3570 }, { "grad_norm": 0.5960200428962708, "learning_rate": 9.997454281880723e-05, "loss": 0.0538, "step": 3580 }, { "grad_norm": 0.707518994808197, "learning_rate": 9.997365595305044e-05, "loss": 0.0483, "step": 3590 }, { "grad_norm": 0.5813127756118774, "learning_rate": 9.997275390661644e-05, "loss": 0.0527, "step": 3600 }, { "grad_norm": 0.5622773766517639, "learning_rate": 9.997183667977926e-05, "loss": 0.0472, "step": 3610 }, { "grad_norm": 0.758303701877594, "learning_rate": 9.997090427281752e-05, "loss": 0.0485, "step": 3620 }, { "grad_norm": 0.6680123209953308, "learning_rate": 9.996995668601448e-05, "loss": 0.0497, "step": 3630 }, { "grad_norm": 0.6016977429389954, "learning_rate": 9.996899391965798e-05, "loss": 0.0493, "step": 3640 }, { "grad_norm": 0.5767600536346436, "learning_rate": 9.996801597404048e-05, "loss": 0.0486, "step": 3650 }, { "grad_norm": 0.7810037732124329, "learning_rate": 9.996702284945905e-05, "loss": 0.0417, "step": 3660 }, { "grad_norm": 0.6699069738388062, "learning_rate": 9.996601454621539e-05, "loss": 0.0454, "step": 3670 }, { "grad_norm": 0.6514731049537659, "learning_rate": 9.996499106461577e-05, "loss": 0.0518, "step": 3680 }, { "grad_norm": 0.6613548994064331, "learning_rate": 9.996395240497112e-05, "loss": 0.0504, "step": 3690 }, { "grad_norm": 0.6204959750175476, "learning_rate": 9.996289856759696e-05, "loss": 0.0459, "step": 3700 }, { "grad_norm": 0.7176038026809692, "learning_rate": 9.996182955281342e-05, "loss": 0.0506, "step": 3710 }, { "grad_norm": 0.5343673825263977, "learning_rate": 9.996074536094519e-05, "loss": 0.049, "step": 3720 }, { "grad_norm": 0.5605956315994263, "learning_rate": 9.995964599232168e-05, "loss": 0.0454, "step": 3730 }, { "grad_norm": 0.6410989761352539, "learning_rate": 9.995853144727683e-05, "loss": 0.0503, "step": 3740 }, { "grad_norm": 0.6830344200134277, "learning_rate": 9.99574017261492e-05, "loss": 0.0472, "step": 3750 }, { "grad_norm": 0.8887002468109131, "learning_rate": 9.995625682928198e-05, "loss": 0.0451, "step": 3760 }, { "grad_norm": 0.7443501949310303, "learning_rate": 9.995509675702295e-05, "loss": 0.0534, "step": 3770 }, { "grad_norm": 0.6347053050994873, "learning_rate": 9.995392150972451e-05, "loss": 0.0471, "step": 3780 }, { "grad_norm": 0.8508003354072571, "learning_rate": 9.995273108774366e-05, "loss": 0.0458, "step": 3790 }, { "grad_norm": 0.5818729996681213, "learning_rate": 9.995152549144205e-05, "loss": 0.0478, "step": 3800 }, { "grad_norm": 0.6865646243095398, "learning_rate": 9.995030472118587e-05, "loss": 0.0393, "step": 3810 }, { "grad_norm": 0.5445404052734375, "learning_rate": 9.9949068777346e-05, "loss": 0.0478, "step": 3820 }, { "grad_norm": 0.7957289218902588, "learning_rate": 9.994781766029786e-05, "loss": 0.0496, "step": 3830 }, { "grad_norm": 0.6646504402160645, "learning_rate": 9.994655137042151e-05, "loss": 0.0451, "step": 3840 }, { "grad_norm": 0.6870298981666565, "learning_rate": 9.99452699081016e-05, "loss": 0.0425, "step": 3850 }, { "grad_norm": 0.6771845817565918, "learning_rate": 9.994397327372743e-05, "loss": 0.0477, "step": 3860 }, { "grad_norm": 0.6277785301208496, "learning_rate": 9.994266146769286e-05, "loss": 0.0446, "step": 3870 }, { "grad_norm": 0.776437520980835, "learning_rate": 9.994133449039642e-05, "loss": 0.0442, "step": 3880 }, { "grad_norm": 0.6355234384536743, "learning_rate": 9.993999234224118e-05, "loss": 0.0446, "step": 3890 }, { "grad_norm": 0.6138414144515991, "learning_rate": 9.993863502363485e-05, "loss": 0.0459, "step": 3900 }, { "grad_norm": 0.5557674169540405, "learning_rate": 9.993726253498976e-05, "loss": 0.0454, "step": 3910 }, { "grad_norm": 0.7285213470458984, "learning_rate": 9.993587487672282e-05, "loss": 0.0426, "step": 3920 }, { "grad_norm": 0.7623494863510132, "learning_rate": 9.993447204925558e-05, "loss": 0.0439, "step": 3930 }, { "grad_norm": 0.6527449488639832, "learning_rate": 9.993305405301416e-05, "loss": 0.0452, "step": 3940 }, { "grad_norm": 0.8385573029518127, "learning_rate": 9.993162088842935e-05, "loss": 0.0468, "step": 3950 }, { "grad_norm": 0.6902685165405273, "learning_rate": 9.993017255593646e-05, "loss": 0.04, "step": 3960 }, { "grad_norm": 0.580554187297821, "learning_rate": 9.992870905597548e-05, "loss": 0.0456, "step": 3970 }, { "grad_norm": 0.6301660537719727, "learning_rate": 9.9927230388991e-05, "loss": 0.0398, "step": 3980 }, { "grad_norm": 0.7712828516960144, "learning_rate": 9.992573655543215e-05, "loss": 0.042, "step": 3990 }, { "grad_norm": 0.6661479473114014, "learning_rate": 9.992422755575277e-05, "loss": 0.0493, "step": 4000 }, { "grad_norm": 0.6758480668067932, "learning_rate": 9.992270339041123e-05, "loss": 0.0446, "step": 4010 }, { "grad_norm": 0.6842628717422485, "learning_rate": 9.992116405987053e-05, "loss": 0.0399, "step": 4020 }, { "grad_norm": 0.6388065814971924, "learning_rate": 9.991960956459828e-05, "loss": 0.047, "step": 4030 }, { "grad_norm": 0.8170465230941772, "learning_rate": 9.991803990506669e-05, "loss": 0.0456, "step": 4040 }, { "grad_norm": 0.7072161436080933, "learning_rate": 9.991645508175258e-05, "loss": 0.0423, "step": 4050 }, { "grad_norm": 0.5126902461051941, "learning_rate": 9.99148550951374e-05, "loss": 0.0497, "step": 4060 }, { "grad_norm": 0.5235459208488464, "learning_rate": 9.991323994570716e-05, "loss": 0.042, "step": 4070 }, { "grad_norm": 0.54132080078125, "learning_rate": 9.99116096339525e-05, "loss": 0.04, "step": 4080 }, { "grad_norm": 0.7617301344871521, "learning_rate": 9.990996416036869e-05, "loss": 0.0405, "step": 4090 }, { "grad_norm": 0.6913269758224487, "learning_rate": 9.990830352545555e-05, "loss": 0.0437, "step": 4100 }, { "grad_norm": 0.6376801133155823, "learning_rate": 9.990662772971756e-05, "loss": 0.043, "step": 4110 }, { "grad_norm": 0.6430414319038391, "learning_rate": 9.990493677366376e-05, "loss": 0.0422, "step": 4120 }, { "grad_norm": 0.497507780790329, "learning_rate": 9.990323065780786e-05, "loss": 0.041, "step": 4130 }, { "grad_norm": 0.6109529733657837, "learning_rate": 9.990150938266808e-05, "loss": 0.0455, "step": 4140 }, { "grad_norm": 0.6755995154380798, "learning_rate": 9.989977294876733e-05, "loss": 0.0416, "step": 4150 }, { "grad_norm": 0.6640646457672119, "learning_rate": 9.989802135663308e-05, "loss": 0.0402, "step": 4160 }, { "grad_norm": 0.6534554958343506, "learning_rate": 9.989625460679743e-05, "loss": 0.0503, "step": 4170 }, { "grad_norm": 0.769862711429596, "learning_rate": 9.989447269979706e-05, "loss": 0.0467, "step": 4180 }, { "grad_norm": 0.7438839673995972, "learning_rate": 9.989267563617328e-05, "loss": 0.0445, "step": 4190 }, { "grad_norm": 0.6223868727684021, "learning_rate": 9.989086341647198e-05, "loss": 0.0409, "step": 4200 }, { "grad_norm": 0.759129524230957, "learning_rate": 9.988903604124366e-05, "loss": 0.0391, "step": 4210 }, { "grad_norm": 0.7649771571159363, "learning_rate": 9.988719351104343e-05, "loss": 0.0426, "step": 4220 }, { "grad_norm": 0.5601566433906555, "learning_rate": 9.9885335826431e-05, "loss": 0.0389, "step": 4230 }, { "grad_norm": 0.657940149307251, "learning_rate": 9.988346298797071e-05, "loss": 0.0441, "step": 4240 }, { "grad_norm": 0.6650236248970032, "learning_rate": 9.988157499623146e-05, "loss": 0.045, "step": 4250 }, { "grad_norm": 0.5703973770141602, "learning_rate": 9.987967185178677e-05, "loss": 0.043, "step": 4260 }, { "grad_norm": 0.5973702073097229, "learning_rate": 9.987775355521476e-05, "loss": 0.0404, "step": 4270 }, { "grad_norm": 0.6352563500404358, "learning_rate": 9.987582010709817e-05, "loss": 0.0379, "step": 4280 }, { "grad_norm": 0.667305052280426, "learning_rate": 9.987387150802431e-05, "loss": 0.0416, "step": 4290 }, { "grad_norm": 0.6370511054992676, "learning_rate": 9.987190775858517e-05, "loss": 0.0424, "step": 4300 }, { "grad_norm": 0.7280203104019165, "learning_rate": 9.98699288593772e-05, "loss": 0.0376, "step": 4310 }, { "grad_norm": 0.6414651870727539, "learning_rate": 9.986793481100161e-05, "loss": 0.0405, "step": 4320 }, { "grad_norm": 0.5290331244468689, "learning_rate": 9.986592561406412e-05, "loss": 0.0371, "step": 4330 }, { "grad_norm": 0.5593175888061523, "learning_rate": 9.986390126917503e-05, "loss": 0.0397, "step": 4340 }, { "grad_norm": 0.6993313431739807, "learning_rate": 9.986186177694933e-05, "loss": 0.0411, "step": 4350 }, { "grad_norm": 0.7027745246887207, "learning_rate": 9.985980713800656e-05, "loss": 0.0395, "step": 4360 }, { "grad_norm": 0.5979444980621338, "learning_rate": 9.985773735297084e-05, "loss": 0.0385, "step": 4370 }, { "grad_norm": 0.5139358639717102, "learning_rate": 9.985565242247092e-05, "loss": 0.0369, "step": 4380 }, { "grad_norm": 0.7809072732925415, "learning_rate": 9.985355234714016e-05, "loss": 0.0413, "step": 4390 }, { "grad_norm": 0.696377158164978, "learning_rate": 9.985143712761652e-05, "loss": 0.0468, "step": 4400 }, { "grad_norm": 0.455167293548584, "learning_rate": 9.984930676454252e-05, "loss": 0.0397, "step": 4410 }, { "grad_norm": 0.7265823483467102, "learning_rate": 9.984716125856532e-05, "loss": 0.0439, "step": 4420 }, { "grad_norm": 0.584205687046051, "learning_rate": 9.984500061033667e-05, "loss": 0.0409, "step": 4430 }, { "grad_norm": 0.6065359711647034, "learning_rate": 9.984282482051293e-05, "loss": 0.0458, "step": 4440 }, { "grad_norm": 0.6577481627464294, "learning_rate": 9.9840633889755e-05, "loss": 0.0383, "step": 4450 }, { "grad_norm": 0.6028100848197937, "learning_rate": 9.983842781872848e-05, "loss": 0.0388, "step": 4460 }, { "grad_norm": 0.6861836314201355, "learning_rate": 9.98362066081035e-05, "loss": 0.0385, "step": 4470 }, { "grad_norm": 0.5267810821533203, "learning_rate": 9.983397025855479e-05, "loss": 0.0384, "step": 4480 }, { "grad_norm": 0.5755560994148254, "learning_rate": 9.983171877076171e-05, "loss": 0.0393, "step": 4490 }, { "grad_norm": 0.6919408440589905, "learning_rate": 9.98294521454082e-05, "loss": 0.0402, "step": 4500 }, { "grad_norm": 0.5324994325637817, "learning_rate": 9.98271703831828e-05, "loss": 0.0446, "step": 4510 }, { "grad_norm": 0.7347158789634705, "learning_rate": 9.982487348477865e-05, "loss": 0.0393, "step": 4520 }, { "grad_norm": 0.5538548231124878, "learning_rate": 9.982256145089347e-05, "loss": 0.0438, "step": 4530 }, { "grad_norm": 0.774620532989502, "learning_rate": 9.982023428222962e-05, "loss": 0.0384, "step": 4540 }, { "grad_norm": 0.6034471392631531, "learning_rate": 9.981789197949403e-05, "loss": 0.0385, "step": 4550 }, { "grad_norm": 0.6088311076164246, "learning_rate": 9.98155345433982e-05, "loss": 0.036, "step": 4560 }, { "grad_norm": 0.5806707739830017, "learning_rate": 9.981316197465831e-05, "loss": 0.0373, "step": 4570 }, { "grad_norm": 0.7010080814361572, "learning_rate": 9.981077427399504e-05, "loss": 0.0377, "step": 4580 }, { "grad_norm": 0.5499981641769409, "learning_rate": 9.980837144213371e-05, "loss": 0.0362, "step": 4590 }, { "grad_norm": 0.5494488477706909, "learning_rate": 9.980595347980426e-05, "loss": 0.037, "step": 4600 }, { "grad_norm": 0.671785831451416, "learning_rate": 9.980352038774119e-05, "loss": 0.0362, "step": 4610 }, { "grad_norm": 0.6250209808349609, "learning_rate": 9.98010721666836e-05, "loss": 0.0407, "step": 4620 }, { "grad_norm": 0.6483927965164185, "learning_rate": 9.979860881737523e-05, "loss": 0.0387, "step": 4630 }, { "grad_norm": 0.7463237643241882, "learning_rate": 9.979613034056434e-05, "loss": 0.0413, "step": 4640 }, { "grad_norm": 0.7241187691688538, "learning_rate": 9.979363673700386e-05, "loss": 0.0391, "step": 4650 }, { "grad_norm": 0.6088847517967224, "learning_rate": 9.979112800745124e-05, "loss": 0.045, "step": 4660 }, { "grad_norm": 0.5330309271812439, "learning_rate": 9.978860415266861e-05, "loss": 0.0386, "step": 4670 }, { "grad_norm": 0.45378994941711426, "learning_rate": 9.978606517342262e-05, "loss": 0.0319, "step": 4680 }, { "grad_norm": 0.6310836672782898, "learning_rate": 9.978351107048456e-05, "loss": 0.0405, "step": 4690 }, { "grad_norm": 0.7219730615615845, "learning_rate": 9.978094184463029e-05, "loss": 0.0425, "step": 4700 }, { "grad_norm": 0.6553520560264587, "learning_rate": 9.977835749664029e-05, "loss": 0.0366, "step": 4710 }, { "grad_norm": 0.6998794078826904, "learning_rate": 9.97757580272996e-05, "loss": 0.0369, "step": 4720 }, { "grad_norm": 0.5620837211608887, "learning_rate": 9.977314343739786e-05, "loss": 0.0372, "step": 4730 }, { "grad_norm": 0.4794282615184784, "learning_rate": 9.977051372772934e-05, "loss": 0.0332, "step": 4740 }, { "grad_norm": 0.6408950686454773, "learning_rate": 9.976786889909286e-05, "loss": 0.0348, "step": 4750 }, { "grad_norm": 0.6270435452461243, "learning_rate": 9.976520895229185e-05, "loss": 0.0347, "step": 4760 }, { "grad_norm": 0.6506294012069702, "learning_rate": 9.976253388813433e-05, "loss": 0.0375, "step": 4770 }, { "grad_norm": 0.5918760299682617, "learning_rate": 9.975984370743293e-05, "loss": 0.04, "step": 4780 }, { "grad_norm": 0.742189347743988, "learning_rate": 9.975713841100485e-05, "loss": 0.0346, "step": 4790 }, { "grad_norm": 0.648874044418335, "learning_rate": 9.975441799967187e-05, "loss": 0.0346, "step": 4800 }, { "grad_norm": 0.5405378937721252, "learning_rate": 9.975168247426039e-05, "loss": 0.0334, "step": 4810 }, { "grad_norm": 0.7033109664916992, "learning_rate": 9.974893183560139e-05, "loss": 0.0308, "step": 4820 }, { "grad_norm": 0.5814917087554932, "learning_rate": 9.974616608453045e-05, "loss": 0.0388, "step": 4830 }, { "grad_norm": 0.5261088013648987, "learning_rate": 9.974338522188772e-05, "loss": 0.0424, "step": 4840 }, { "grad_norm": 0.5280689001083374, "learning_rate": 9.974058924851797e-05, "loss": 0.0367, "step": 4850 }, { "grad_norm": 0.6398464441299438, "learning_rate": 9.973777816527051e-05, "loss": 0.0346, "step": 4860 }, { "grad_norm": 0.5820260643959045, "learning_rate": 9.973495197299931e-05, "loss": 0.0322, "step": 4870 }, { "grad_norm": 0.5457234978675842, "learning_rate": 9.973211067256287e-05, "loss": 0.0388, "step": 4880 }, { "grad_norm": 0.646477997303009, "learning_rate": 9.97292542648243e-05, "loss": 0.035, "step": 4890 }, { "grad_norm": 0.6922654509544373, "learning_rate": 9.972638275065131e-05, "loss": 0.0351, "step": 4900 }, { "grad_norm": 0.6583635807037354, "learning_rate": 9.972349613091621e-05, "loss": 0.032, "step": 4910 }, { "grad_norm": 0.69245445728302, "learning_rate": 9.972059440649584e-05, "loss": 0.0309, "step": 4920 }, { "grad_norm": 0.6526948809623718, "learning_rate": 9.971767757827168e-05, "loss": 0.0357, "step": 4930 }, { "grad_norm": 0.5548874139785767, "learning_rate": 9.971474564712982e-05, "loss": 0.031, "step": 4940 }, { "grad_norm": 0.7070045471191406, "learning_rate": 9.971179861396084e-05, "loss": 0.0348, "step": 4950 }, { "grad_norm": 0.6669615507125854, "learning_rate": 9.970883647966003e-05, "loss": 0.0316, "step": 4960 }, { "grad_norm": 0.5660342574119568, "learning_rate": 9.970585924512717e-05, "loss": 0.0392, "step": 4970 }, { "grad_norm": 0.5953735709190369, "learning_rate": 9.970286691126669e-05, "loss": 0.0398, "step": 4980 }, { "grad_norm": 0.45345062017440796, "learning_rate": 9.969985947898756e-05, "loss": 0.0326, "step": 4990 }, { "grad_norm": 0.5851004123687744, "learning_rate": 9.969683694920337e-05, "loss": 0.0353, "step": 5000 }, { "grad_norm": 0.6838672757148743, "learning_rate": 9.969379932283228e-05, "loss": 0.0334, "step": 5010 }, { "grad_norm": 0.4194786250591278, "learning_rate": 9.969074660079704e-05, "loss": 0.0321, "step": 5020 }, { "grad_norm": 0.5828918814659119, "learning_rate": 9.968767878402501e-05, "loss": 0.0351, "step": 5030 }, { "grad_norm": 0.562113344669342, "learning_rate": 9.968459587344808e-05, "loss": 0.0333, "step": 5040 }, { "grad_norm": 0.629982590675354, "learning_rate": 9.968149787000278e-05, "loss": 0.0366, "step": 5050 }, { "grad_norm": 0.48110151290893555, "learning_rate": 9.967838477463018e-05, "loss": 0.0329, "step": 5060 }, { "grad_norm": 0.6148000359535217, "learning_rate": 9.967525658827597e-05, "loss": 0.029, "step": 5070 }, { "grad_norm": 0.5258086919784546, "learning_rate": 9.967211331189042e-05, "loss": 0.0331, "step": 5080 }, { "grad_norm": 0.46001553535461426, "learning_rate": 9.966895494642834e-05, "loss": 0.0325, "step": 5090 }, { "grad_norm": 0.6187411546707153, "learning_rate": 9.96657814928492e-05, "loss": 0.0346, "step": 5100 }, { "grad_norm": 0.5961857438087463, "learning_rate": 9.966259295211697e-05, "loss": 0.0307, "step": 5110 }, { "grad_norm": 0.5988697409629822, "learning_rate": 9.965938932520028e-05, "loss": 0.041, "step": 5120 }, { "grad_norm": 0.5525689125061035, "learning_rate": 9.965617061307229e-05, "loss": 0.0299, "step": 5130 }, { "grad_norm": 0.5115570425987244, "learning_rate": 9.965293681671077e-05, "loss": 0.0313, "step": 5140 }, { "grad_norm": 0.5240322947502136, "learning_rate": 9.964968793709804e-05, "loss": 0.0285, "step": 5150 }, { "grad_norm": 0.5910024046897888, "learning_rate": 9.964642397522106e-05, "loss": 0.0313, "step": 5160 }, { "grad_norm": 0.7692542672157288, "learning_rate": 9.96431449320713e-05, "loss": 0.025, "step": 5170 }, { "grad_norm": 0.6028837561607361, "learning_rate": 9.963985080864486e-05, "loss": 0.033, "step": 5180 }, { "grad_norm": 0.5340349674224854, "learning_rate": 9.96365416059424e-05, "loss": 0.0339, "step": 5190 }, { "grad_norm": 0.5638813376426697, "learning_rate": 9.963321732496919e-05, "loss": 0.0389, "step": 5200 }, { "grad_norm": 0.6781941652297974, "learning_rate": 9.962987796673506e-05, "loss": 0.0361, "step": 5210 }, { "grad_norm": 0.47857561707496643, "learning_rate": 9.962652353225438e-05, "loss": 0.0344, "step": 5220 }, { "grad_norm": 0.587444543838501, "learning_rate": 9.962315402254619e-05, "loss": 0.0334, "step": 5230 }, { "grad_norm": 0.5667542815208435, "learning_rate": 9.9619769438634e-05, "loss": 0.0287, "step": 5240 }, { "grad_norm": 0.6488659977912903, "learning_rate": 9.9616369781546e-05, "loss": 0.0276, "step": 5250 }, { "grad_norm": 0.5722289681434631, "learning_rate": 9.961295505231491e-05, "loss": 0.0348, "step": 5260 }, { "grad_norm": 0.501595675945282, "learning_rate": 9.960952525197804e-05, "loss": 0.0326, "step": 5270 }, { "grad_norm": 0.626385509967804, "learning_rate": 9.960608038157724e-05, "loss": 0.0325, "step": 5280 }, { "grad_norm": 0.581871509552002, "learning_rate": 9.960262044215901e-05, "loss": 0.0339, "step": 5290 }, { "grad_norm": 0.6409176588058472, "learning_rate": 9.959914543477435e-05, "loss": 0.0315, "step": 5300 }, { "grad_norm": 0.5938498377799988, "learning_rate": 9.959565536047892e-05, "loss": 0.0301, "step": 5310 }, { "grad_norm": 0.5349408984184265, "learning_rate": 9.959215022033288e-05, "loss": 0.0329, "step": 5320 }, { "grad_norm": 0.47248077392578125, "learning_rate": 9.9588630015401e-05, "loss": 0.0294, "step": 5330 }, { "grad_norm": 0.4678635001182556, "learning_rate": 9.958509474675264e-05, "loss": 0.0325, "step": 5340 }, { "grad_norm": 0.5386501550674438, "learning_rate": 9.958154441546171e-05, "loss": 0.0277, "step": 5350 }, { "grad_norm": 0.5271458029747009, "learning_rate": 9.957797902260673e-05, "loss": 0.0331, "step": 5360 }, { "grad_norm": 0.6122616529464722, "learning_rate": 9.957439856927073e-05, "loss": 0.0348, "step": 5370 }, { "grad_norm": 0.6210896968841553, "learning_rate": 9.957080305654139e-05, "loss": 0.0313, "step": 5380 }, { "grad_norm": 0.5200783610343933, "learning_rate": 9.956719248551092e-05, "loss": 0.0286, "step": 5390 }, { "grad_norm": 0.4495662748813629, "learning_rate": 9.956356685727612e-05, "loss": 0.0301, "step": 5400 }, { "grad_norm": 0.6084323525428772, "learning_rate": 9.955992617293836e-05, "loss": 0.0312, "step": 5410 }, { "grad_norm": 0.47591859102249146, "learning_rate": 9.955627043360358e-05, "loss": 0.0277, "step": 5420 }, { "grad_norm": 0.5773627758026123, "learning_rate": 9.955259964038231e-05, "loss": 0.0306, "step": 5430 }, { "grad_norm": 0.4637615382671356, "learning_rate": 9.954891379438962e-05, "loss": 0.0309, "step": 5440 }, { "grad_norm": 0.6408292055130005, "learning_rate": 9.954521289674519e-05, "loss": 0.0356, "step": 5450 }, { "grad_norm": 0.6400755643844604, "learning_rate": 9.954149694857325e-05, "loss": 0.0301, "step": 5460 }, { "grad_norm": 0.5934323072433472, "learning_rate": 9.953776595100258e-05, "loss": 0.032, "step": 5470 }, { "grad_norm": 0.5204159617424011, "learning_rate": 9.95340199051666e-05, "loss": 0.03, "step": 5480 }, { "grad_norm": 0.7988465428352356, "learning_rate": 9.953025881220325e-05, "loss": 0.0298, "step": 5490 }, { "grad_norm": 0.9485117197036743, "learning_rate": 9.952648267325504e-05, "loss": 0.0305, "step": 5500 }, { "grad_norm": 0.46983402967453003, "learning_rate": 9.952269148946905e-05, "loss": 0.0368, "step": 5510 }, { "grad_norm": 0.44692879915237427, "learning_rate": 9.951888526199697e-05, "loss": 0.0306, "step": 5520 }, { "grad_norm": 0.4629494547843933, "learning_rate": 9.951506399199501e-05, "loss": 0.0389, "step": 5530 }, { "grad_norm": 0.5767475962638855, "learning_rate": 9.951122768062399e-05, "loss": 0.0296, "step": 5540 }, { "grad_norm": 0.5135264992713928, "learning_rate": 9.950737632904927e-05, "loss": 0.0266, "step": 5550 }, { "grad_norm": 0.5246177911758423, "learning_rate": 9.950350993844077e-05, "loss": 0.0305, "step": 5560 }, { "grad_norm": 0.4465120732784271, "learning_rate": 9.949962850997303e-05, "loss": 0.029, "step": 5570 }, { "grad_norm": 0.5132999420166016, "learning_rate": 9.949573204482512e-05, "loss": 0.0306, "step": 5580 }, { "grad_norm": 0.514188289642334, "learning_rate": 9.949182054418064e-05, "loss": 0.0302, "step": 5590 }, { "grad_norm": 0.431026816368103, "learning_rate": 9.948789400922787e-05, "loss": 0.0343, "step": 5600 }, { "grad_norm": 0.6673880219459534, "learning_rate": 9.948395244115953e-05, "loss": 0.034, "step": 5610 }, { "grad_norm": 0.6397469639778137, "learning_rate": 9.9479995841173e-05, "loss": 0.0339, "step": 5620 }, { "grad_norm": 0.5425045490264893, "learning_rate": 9.947602421047017e-05, "loss": 0.0288, "step": 5630 }, { "grad_norm": 0.5681215524673462, "learning_rate": 9.947203755025753e-05, "loss": 0.0359, "step": 5640 }, { "grad_norm": 0.6781080961227417, "learning_rate": 9.946803586174611e-05, "loss": 0.0258, "step": 5650 }, { "grad_norm": 0.5687102675437927, "learning_rate": 9.946401914615151e-05, "loss": 0.0319, "step": 5660 }, { "grad_norm": 0.5427479147911072, "learning_rate": 9.945998740469394e-05, "loss": 0.029, "step": 5670 }, { "grad_norm": 0.521224856376648, "learning_rate": 9.945594063859809e-05, "loss": 0.0284, "step": 5680 }, { "grad_norm": 0.628860592842102, "learning_rate": 9.94518788490933e-05, "loss": 0.0343, "step": 5690 }, { "grad_norm": 0.5305071473121643, "learning_rate": 9.944780203741341e-05, "loss": 0.0263, "step": 5700 }, { "grad_norm": 0.656499981880188, "learning_rate": 9.944371020479686e-05, "loss": 0.0344, "step": 5710 }, { "grad_norm": 0.46921977400779724, "learning_rate": 9.943960335248662e-05, "loss": 0.0334, "step": 5720 }, { "grad_norm": 0.5762396454811096, "learning_rate": 9.943548148173027e-05, "loss": 0.0332, "step": 5730 }, { "grad_norm": 0.4832884669303894, "learning_rate": 9.943134459377992e-05, "loss": 0.0268, "step": 5740 }, { "grad_norm": 0.48663529753685, "learning_rate": 9.942719268989222e-05, "loss": 0.0268, "step": 5750 }, { "grad_norm": 0.4288635551929474, "learning_rate": 9.942302577132844e-05, "loss": 0.0278, "step": 5760 }, { "grad_norm": 0.5483534336090088, "learning_rate": 9.941884383935438e-05, "loss": 0.0303, "step": 5770 }, { "grad_norm": 0.590912401676178, "learning_rate": 9.941464689524039e-05, "loss": 0.0332, "step": 5780 }, { "grad_norm": 0.5242199301719666, "learning_rate": 9.941043494026139e-05, "loss": 0.0317, "step": 5790 }, { "grad_norm": 0.5846026539802551, "learning_rate": 9.940620797569685e-05, "loss": 0.0268, "step": 5800 }, { "grad_norm": 0.5359113812446594, "learning_rate": 9.940196600283082e-05, "loss": 0.026, "step": 5810 }, { "grad_norm": 0.5197345614433289, "learning_rate": 9.939770902295192e-05, "loss": 0.0265, "step": 5820 }, { "grad_norm": 0.4654234051704407, "learning_rate": 9.939343703735329e-05, "loss": 0.0322, "step": 5830 }, { "grad_norm": 0.5041200518608093, "learning_rate": 9.938915004733264e-05, "loss": 0.0278, "step": 5840 }, { "grad_norm": 0.5169036388397217, "learning_rate": 9.938484805419224e-05, "loss": 0.0354, "step": 5850 }, { "grad_norm": 0.35643133521080017, "learning_rate": 9.938053105923894e-05, "loss": 0.0268, "step": 5860 }, { "grad_norm": 0.508072555065155, "learning_rate": 9.937619906378413e-05, "loss": 0.0258, "step": 5870 }, { "grad_norm": 0.44759124517440796, "learning_rate": 9.937185206914374e-05, "loss": 0.0387, "step": 5880 }, { "grad_norm": 0.4900940954685211, "learning_rate": 9.936749007663829e-05, "loss": 0.0296, "step": 5890 }, { "grad_norm": 0.48753684759140015, "learning_rate": 9.93631130875928e-05, "loss": 0.0299, "step": 5900 }, { "grad_norm": 0.47119656205177307, "learning_rate": 9.935872110333692e-05, "loss": 0.03, "step": 5910 }, { "grad_norm": 0.4481375813484192, "learning_rate": 9.935431412520484e-05, "loss": 0.03, "step": 5920 }, { "grad_norm": 0.41306567192077637, "learning_rate": 9.934989215453523e-05, "loss": 0.0252, "step": 5930 }, { "grad_norm": 0.5313475728034973, "learning_rate": 9.934545519267139e-05, "loss": 0.0261, "step": 5940 }, { "grad_norm": 0.586621105670929, "learning_rate": 9.934100324096117e-05, "loss": 0.0251, "step": 5950 }, { "grad_norm": 0.5261890888214111, "learning_rate": 9.933653630075692e-05, "loss": 0.0247, "step": 5960 }, { "grad_norm": 0.5286491513252258, "learning_rate": 9.93320543734156e-05, "loss": 0.0223, "step": 5970 }, { "grad_norm": 0.42709657549858093, "learning_rate": 9.932755746029871e-05, "loss": 0.0281, "step": 5980 }, { "grad_norm": 0.5061715841293335, "learning_rate": 9.932304556277228e-05, "loss": 0.0255, "step": 5990 }, { "grad_norm": 0.6657582521438599, "learning_rate": 9.93185186822069e-05, "loss": 0.0284, "step": 6000 }, { "grad_norm": 0.4694940745830536, "learning_rate": 9.931397681997773e-05, "loss": 0.0252, "step": 6010 }, { "grad_norm": 0.7472635507583618, "learning_rate": 9.930941997746446e-05, "loss": 0.0294, "step": 6020 }, { "grad_norm": 0.584044337272644, "learning_rate": 9.930484815605134e-05, "loss": 0.0255, "step": 6030 }, { "grad_norm": 0.5345356464385986, "learning_rate": 9.930026135712717e-05, "loss": 0.0304, "step": 6040 }, { "grad_norm": 0.41178804636001587, "learning_rate": 9.92956595820853e-05, "loss": 0.0221, "step": 6050 }, { "grad_norm": 0.6209378242492676, "learning_rate": 9.929104283232362e-05, "loss": 0.0252, "step": 6060 }, { "grad_norm": 0.537927508354187, "learning_rate": 9.92864111092446e-05, "loss": 0.0247, "step": 6070 }, { "grad_norm": 0.4989626109600067, "learning_rate": 9.92817644142552e-05, "loss": 0.0301, "step": 6080 }, { "grad_norm": 0.48206672072410583, "learning_rate": 9.927710274876698e-05, "loss": 0.0268, "step": 6090 }, { "grad_norm": 0.4491099417209625, "learning_rate": 9.927242611419603e-05, "loss": 0.0228, "step": 6100 }, { "grad_norm": 0.5502435564994812, "learning_rate": 9.926773451196301e-05, "loss": 0.0231, "step": 6110 }, { "grad_norm": 0.602737307548523, "learning_rate": 9.926302794349306e-05, "loss": 0.0266, "step": 6120 }, { "grad_norm": 0.5966441035270691, "learning_rate": 9.925830641021594e-05, "loss": 0.0268, "step": 6130 }, { "grad_norm": 0.3484886884689331, "learning_rate": 9.925356991356593e-05, "loss": 0.0223, "step": 6140 }, { "grad_norm": 0.5503848791122437, "learning_rate": 9.924881845498184e-05, "loss": 0.0235, "step": 6150 }, { "grad_norm": 0.42554327845573425, "learning_rate": 9.924405203590705e-05, "loss": 0.0296, "step": 6160 }, { "grad_norm": 0.4679335355758667, "learning_rate": 9.923927065778946e-05, "loss": 0.0275, "step": 6170 }, { "grad_norm": 0.46980857849121094, "learning_rate": 9.923447432208154e-05, "loss": 0.0262, "step": 6180 }, { "grad_norm": 0.6113616824150085, "learning_rate": 9.922966303024027e-05, "loss": 0.0306, "step": 6190 }, { "grad_norm": 0.5694376826286316, "learning_rate": 9.922483678372721e-05, "loss": 0.0271, "step": 6200 }, { "grad_norm": 0.5350271463394165, "learning_rate": 9.921999558400845e-05, "loss": 0.0262, "step": 6210 }, { "grad_norm": 0.5522851943969727, "learning_rate": 9.92151394325546e-05, "loss": 0.0257, "step": 6220 }, { "grad_norm": 0.5042399764060974, "learning_rate": 9.921026833084084e-05, "loss": 0.025, "step": 6230 }, { "grad_norm": 0.522422730922699, "learning_rate": 9.920538228034689e-05, "loss": 0.027, "step": 6240 }, { "grad_norm": 0.5598258972167969, "learning_rate": 9.920048128255699e-05, "loss": 0.0281, "step": 6250 }, { "grad_norm": 0.45274388790130615, "learning_rate": 9.919556533895995e-05, "loss": 0.0294, "step": 6260 }, { "grad_norm": 0.5932326912879944, "learning_rate": 9.919063445104907e-05, "loss": 0.0268, "step": 6270 }, { "grad_norm": 0.5251244306564331, "learning_rate": 9.918568862032227e-05, "loss": 0.0279, "step": 6280 }, { "grad_norm": 0.5877266526222229, "learning_rate": 9.918072784828194e-05, "loss": 0.0313, "step": 6290 }, { "grad_norm": 0.48378267884254456, "learning_rate": 9.917575213643501e-05, "loss": 0.0245, "step": 6300 }, { "grad_norm": 0.5477020740509033, "learning_rate": 9.917076148629302e-05, "loss": 0.0271, "step": 6310 }, { "grad_norm": 0.6756936311721802, "learning_rate": 9.916575589937196e-05, "loss": 0.0237, "step": 6320 }, { "grad_norm": 0.5312181115150452, "learning_rate": 9.916073537719239e-05, "loss": 0.0254, "step": 6330 }, { "grad_norm": 0.7678133845329285, "learning_rate": 9.915569992127944e-05, "loss": 0.0285, "step": 6340 }, { "grad_norm": 0.4704437553882599, "learning_rate": 9.915064953316273e-05, "loss": 0.0217, "step": 6350 }, { "grad_norm": 0.5444080233573914, "learning_rate": 9.914558421437645e-05, "loss": 0.025, "step": 6360 }, { "grad_norm": 0.6273277997970581, "learning_rate": 9.914050396645929e-05, "loss": 0.0274, "step": 6370 }, { "grad_norm": 0.460055410861969, "learning_rate": 9.913540879095452e-05, "loss": 0.024, "step": 6380 }, { "grad_norm": 0.4514501392841339, "learning_rate": 9.913029868940987e-05, "loss": 0.0254, "step": 6390 }, { "grad_norm": 0.5933006405830383, "learning_rate": 9.912517366337772e-05, "loss": 0.0255, "step": 6400 }, { "grad_norm": 0.5432915687561035, "learning_rate": 9.912003371441487e-05, "loss": 0.0257, "step": 6410 }, { "grad_norm": 0.5635409355163574, "learning_rate": 9.911487884408271e-05, "loss": 0.0268, "step": 6420 }, { "grad_norm": 0.45503464341163635, "learning_rate": 9.910970905394719e-05, "loss": 0.0218, "step": 6430 }, { "grad_norm": 0.5414345860481262, "learning_rate": 9.91045243455787e-05, "loss": 0.0247, "step": 6440 }, { "grad_norm": 0.46933358907699585, "learning_rate": 9.909932472055225e-05, "loss": 0.0212, "step": 6450 }, { "grad_norm": 0.49070093035697937, "learning_rate": 9.909411018044734e-05, "loss": 0.0221, "step": 6460 }, { "grad_norm": 0.472852885723114, "learning_rate": 9.908888072684802e-05, "loss": 0.0269, "step": 6470 }, { "grad_norm": 0.5244875550270081, "learning_rate": 9.908363636134285e-05, "loss": 0.0219, "step": 6480 }, { "grad_norm": 0.49625301361083984, "learning_rate": 9.907837708552493e-05, "loss": 0.0246, "step": 6490 }, { "grad_norm": 0.529707133769989, "learning_rate": 9.90731029009919e-05, "loss": 0.0233, "step": 6500 }, { "grad_norm": 0.5402315855026245, "learning_rate": 9.906781380934589e-05, "loss": 0.0244, "step": 6510 }, { "grad_norm": 0.46394890546798706, "learning_rate": 9.906250981219362e-05, "loss": 0.0285, "step": 6520 }, { "grad_norm": 0.41746652126312256, "learning_rate": 9.905719091114628e-05, "loss": 0.0252, "step": 6530 }, { "grad_norm": 0.44857144355773926, "learning_rate": 9.905185710781964e-05, "loss": 0.0235, "step": 6540 }, { "grad_norm": 0.6205761432647705, "learning_rate": 9.904650840383392e-05, "loss": 0.0235, "step": 6550 }, { "grad_norm": 0.378711074590683, "learning_rate": 9.904114480081397e-05, "loss": 0.0259, "step": 6560 }, { "grad_norm": 0.5038026571273804, "learning_rate": 9.903576630038906e-05, "loss": 0.0288, "step": 6570 }, { "grad_norm": 0.5139516592025757, "learning_rate": 9.903037290419309e-05, "loss": 0.0343, "step": 6580 }, { "grad_norm": 0.4362604022026062, "learning_rate": 9.902496461386439e-05, "loss": 0.0256, "step": 6590 }, { "grad_norm": 0.5129692554473877, "learning_rate": 9.901954143104588e-05, "loss": 0.0238, "step": 6600 }, { "grad_norm": 0.5847436785697937, "learning_rate": 9.901410335738496e-05, "loss": 0.0242, "step": 6610 }, { "grad_norm": 0.5122170448303223, "learning_rate": 9.900865039453358e-05, "loss": 0.0233, "step": 6620 }, { "grad_norm": 0.5081768035888672, "learning_rate": 9.900318254414821e-05, "loss": 0.0279, "step": 6630 }, { "grad_norm": 0.480114609003067, "learning_rate": 9.899769980788985e-05, "loss": 0.0253, "step": 6640 }, { "grad_norm": 0.51370769739151, "learning_rate": 9.899220218742398e-05, "loss": 0.0222, "step": 6650 }, { "grad_norm": 0.446836918592453, "learning_rate": 9.898668968442066e-05, "loss": 0.0225, "step": 6660 }, { "grad_norm": 0.4553784728050232, "learning_rate": 9.898116230055443e-05, "loss": 0.0244, "step": 6670 }, { "grad_norm": 0.621615469455719, "learning_rate": 9.897562003750437e-05, "loss": 0.0276, "step": 6680 }, { "grad_norm": 0.5878480076789856, "learning_rate": 9.897006289695407e-05, "loss": 0.0243, "step": 6690 }, { "grad_norm": 0.4821608066558838, "learning_rate": 9.896449088059164e-05, "loss": 0.0237, "step": 6700 }, { "grad_norm": 0.4933816194534302, "learning_rate": 9.89589039901097e-05, "loss": 0.0287, "step": 6710 }, { "grad_norm": 0.49964186549186707, "learning_rate": 9.895330222720542e-05, "loss": 0.0279, "step": 6720 }, { "grad_norm": 0.49987050890922546, "learning_rate": 9.894768559358047e-05, "loss": 0.0276, "step": 6730 }, { "grad_norm": 0.40598466992378235, "learning_rate": 9.894205409094101e-05, "loss": 0.0289, "step": 6740 }, { "grad_norm": 0.4163476526737213, "learning_rate": 9.893640772099777e-05, "loss": 0.0219, "step": 6750 }, { "grad_norm": 0.5128793120384216, "learning_rate": 9.893074648546595e-05, "loss": 0.0231, "step": 6760 }, { "grad_norm": 0.5630953311920166, "learning_rate": 9.892507038606528e-05, "loss": 0.0246, "step": 6770 }, { "grad_norm": 0.4373803436756134, "learning_rate": 9.891937942452003e-05, "loss": 0.0245, "step": 6780 }, { "grad_norm": 0.4257532060146332, "learning_rate": 9.891367360255895e-05, "loss": 0.0252, "step": 6790 }, { "grad_norm": 0.5192376375198364, "learning_rate": 9.890795292191532e-05, "loss": 0.0323, "step": 6800 }, { "grad_norm": 0.5340325236320496, "learning_rate": 9.890221738432694e-05, "loss": 0.0241, "step": 6810 }, { "grad_norm": 0.6282180547714233, "learning_rate": 9.88964669915361e-05, "loss": 0.0256, "step": 6820 }, { "grad_norm": 0.513201892375946, "learning_rate": 9.889070174528963e-05, "loss": 0.0212, "step": 6830 }, { "grad_norm": 0.38406088948249817, "learning_rate": 9.888492164733883e-05, "loss": 0.0229, "step": 6840 }, { "grad_norm": 0.40800413489341736, "learning_rate": 9.88791266994396e-05, "loss": 0.0221, "step": 6850 }, { "grad_norm": 0.5369046330451965, "learning_rate": 9.887331690335223e-05, "loss": 0.0278, "step": 6860 }, { "grad_norm": 0.32878604531288147, "learning_rate": 9.886749226084163e-05, "loss": 0.028, "step": 6870 }, { "grad_norm": 0.4625433683395386, "learning_rate": 9.886165277367714e-05, "loss": 0.0301, "step": 6880 }, { "grad_norm": 0.4818950891494751, "learning_rate": 9.885579844363265e-05, "loss": 0.026, "step": 6890 }, { "grad_norm": 0.5412994623184204, "learning_rate": 9.884992927248656e-05, "loss": 0.0247, "step": 6900 }, { "grad_norm": 0.9018186330795288, "learning_rate": 9.884404526202178e-05, "loss": 0.026, "step": 6910 }, { "grad_norm": 1.1377595663070679, "learning_rate": 9.883814641402568e-05, "loss": 0.0423, "step": 6920 }, { "grad_norm": 0.5635457038879395, "learning_rate": 9.88322327302902e-05, "loss": 0.0298, "step": 6930 }, { "grad_norm": 0.4904860258102417, "learning_rate": 9.882630421261176e-05, "loss": 0.0273, "step": 6940 }, { "grad_norm": 0.49857255816459656, "learning_rate": 9.88203608627913e-05, "loss": 0.0339, "step": 6950 }, { "grad_norm": 0.5071297883987427, "learning_rate": 9.881440268263422e-05, "loss": 0.0315, "step": 6960 }, { "grad_norm": 0.5229262709617615, "learning_rate": 9.880842967395048e-05, "loss": 0.0295, "step": 6970 }, { "grad_norm": 0.8999877572059631, "learning_rate": 9.880244183855452e-05, "loss": 0.0267, "step": 6980 }, { "grad_norm": 0.5732887983322144, "learning_rate": 9.879643917826527e-05, "loss": 0.033, "step": 6990 }, { "grad_norm": 0.5922005772590637, "learning_rate": 9.87904216949062e-05, "loss": 0.0369, "step": 7000 }, { "grad_norm": 0.5878987908363342, "learning_rate": 9.878438939030526e-05, "loss": 0.0301, "step": 7010 }, { "grad_norm": 0.4599475860595703, "learning_rate": 9.877834226629489e-05, "loss": 0.026, "step": 7020 }, { "grad_norm": 0.4800131618976593, "learning_rate": 9.877228032471206e-05, "loss": 0.0274, "step": 7030 }, { "grad_norm": 0.45589321851730347, "learning_rate": 9.876620356739823e-05, "loss": 0.0284, "step": 7040 }, { "grad_norm": 0.622838020324707, "learning_rate": 9.876011199619935e-05, "loss": 0.0362, "step": 7050 }, { "grad_norm": 0.4409644603729248, "learning_rate": 9.875400561296589e-05, "loss": 0.0264, "step": 7060 }, { "grad_norm": 0.5039188265800476, "learning_rate": 9.874788441955278e-05, "loss": 0.0289, "step": 7070 }, { "grad_norm": 0.4262818396091461, "learning_rate": 9.874174841781951e-05, "loss": 0.0231, "step": 7080 }, { "grad_norm": 0.5230967998504639, "learning_rate": 9.873559760963003e-05, "loss": 0.024, "step": 7090 }, { "grad_norm": 0.3851624131202698, "learning_rate": 9.872943199685278e-05, "loss": 0.0267, "step": 7100 }, { "grad_norm": 0.8489354252815247, "learning_rate": 9.872325158136071e-05, "loss": 0.0268, "step": 7110 }, { "grad_norm": 0.4670383632183075, "learning_rate": 9.871705636503128e-05, "loss": 0.026, "step": 7120 }, { "grad_norm": 0.5082715749740601, "learning_rate": 9.871084634974641e-05, "loss": 0.0247, "step": 7130 }, { "grad_norm": 0.44554927945137024, "learning_rate": 9.870462153739257e-05, "loss": 0.0267, "step": 7140 }, { "grad_norm": 0.4707428514957428, "learning_rate": 9.869838192986067e-05, "loss": 0.0273, "step": 7150 }, { "grad_norm": 0.5337039232254028, "learning_rate": 9.869212752904616e-05, "loss": 0.0294, "step": 7160 }, { "grad_norm": 0.517871081829071, "learning_rate": 9.868585833684894e-05, "loss": 0.0251, "step": 7170 }, { "grad_norm": 0.4735605716705322, "learning_rate": 9.867957435517342e-05, "loss": 0.026, "step": 7180 }, { "grad_norm": 0.4602915048599243, "learning_rate": 9.867327558592854e-05, "loss": 0.0292, "step": 7190 }, { "grad_norm": 0.43385154008865356, "learning_rate": 9.866696203102766e-05, "loss": 0.0241, "step": 7200 }, { "grad_norm": 0.47179746627807617, "learning_rate": 9.86606336923887e-05, "loss": 0.0214, "step": 7210 }, { "grad_norm": 0.5017564296722412, "learning_rate": 9.865429057193403e-05, "loss": 0.0278, "step": 7220 }, { "grad_norm": 0.39125856757164, "learning_rate": 9.864793267159053e-05, "loss": 0.0221, "step": 7230 }, { "grad_norm": 0.4022477865219116, "learning_rate": 9.864155999328957e-05, "loss": 0.0238, "step": 7240 }, { "grad_norm": 0.4445357918739319, "learning_rate": 9.8635172538967e-05, "loss": 0.0213, "step": 7250 }, { "grad_norm": 0.5076184272766113, "learning_rate": 9.862877031056312e-05, "loss": 0.0221, "step": 7260 }, { "grad_norm": 0.4562908709049225, "learning_rate": 9.862235331002279e-05, "loss": 0.0241, "step": 7270 }, { "grad_norm": 0.4024653434753418, "learning_rate": 9.861592153929533e-05, "loss": 0.0205, "step": 7280 }, { "grad_norm": 0.5546900033950806, "learning_rate": 9.860947500033455e-05, "loss": 0.0218, "step": 7290 }, { "grad_norm": 0.37578102946281433, "learning_rate": 9.86030136950987e-05, "loss": 0.0241, "step": 7300 }, { "grad_norm": 0.44185230135917664, "learning_rate": 9.85965376255506e-05, "loss": 0.0272, "step": 7310 }, { "grad_norm": 0.581455647945404, "learning_rate": 9.859004679365747e-05, "loss": 0.023, "step": 7320 }, { "grad_norm": 0.5566856265068054, "learning_rate": 9.858354120139108e-05, "loss": 0.0274, "step": 7330 }, { "grad_norm": 0.45162466168403625, "learning_rate": 9.857702085072764e-05, "loss": 0.022, "step": 7340 }, { "grad_norm": 0.48306140303611755, "learning_rate": 9.857048574364787e-05, "loss": 0.021, "step": 7350 }, { "grad_norm": 0.42747581005096436, "learning_rate": 9.856393588213698e-05, "loss": 0.0215, "step": 7360 }, { "grad_norm": 0.46995165944099426, "learning_rate": 9.855737126818458e-05, "loss": 0.0232, "step": 7370 }, { "grad_norm": 0.4869796931743622, "learning_rate": 9.855079190378491e-05, "loss": 0.0255, "step": 7380 }, { "grad_norm": 0.6607803702354431, "learning_rate": 9.854419779093655e-05, "loss": 0.0285, "step": 7390 }, { "grad_norm": 0.506090521812439, "learning_rate": 9.853758893164264e-05, "loss": 0.0252, "step": 7400 }, { "grad_norm": 0.42590656876564026, "learning_rate": 9.853096532791078e-05, "loss": 0.0215, "step": 7410 }, { "grad_norm": 0.5458431243896484, "learning_rate": 9.852432698175304e-05, "loss": 0.026, "step": 7420 }, { "grad_norm": 0.500715434551239, "learning_rate": 9.851767389518597e-05, "loss": 0.0228, "step": 7430 }, { "grad_norm": 0.3433137834072113, "learning_rate": 9.85110060702306e-05, "loss": 0.0256, "step": 7440 }, { "grad_norm": 0.44231247901916504, "learning_rate": 9.850432350891245e-05, "loss": 0.0205, "step": 7450 }, { "grad_norm": 0.4089946746826172, "learning_rate": 9.84976262132615e-05, "loss": 0.0233, "step": 7460 }, { "grad_norm": 0.4727373421192169, "learning_rate": 9.849091418531222e-05, "loss": 0.0223, "step": 7470 }, { "grad_norm": 0.46963366866111755, "learning_rate": 9.848418742710353e-05, "loss": 0.0234, "step": 7480 }, { "grad_norm": 0.47123104333877563, "learning_rate": 9.847744594067885e-05, "loss": 0.0239, "step": 7490 }, { "grad_norm": 0.39739325642585754, "learning_rate": 9.847068972808607e-05, "loss": 0.0204, "step": 7500 }, { "grad_norm": 0.46723365783691406, "learning_rate": 9.846391879137756e-05, "loss": 0.0192, "step": 7510 }, { "grad_norm": 0.3652726709842682, "learning_rate": 9.845713313261012e-05, "loss": 0.0245, "step": 7520 }, { "grad_norm": 0.5234922170639038, "learning_rate": 9.845033275384505e-05, "loss": 0.0247, "step": 7530 }, { "grad_norm": 0.40047717094421387, "learning_rate": 9.844351765714818e-05, "loss": 0.0217, "step": 7540 }, { "grad_norm": 0.4735856354236603, "learning_rate": 9.843668784458971e-05, "loss": 0.0221, "step": 7550 }, { "grad_norm": 0.6301396489143372, "learning_rate": 9.842984331824437e-05, "loss": 0.0221, "step": 7560 }, { "grad_norm": 0.40476080775260925, "learning_rate": 9.842298408019133e-05, "loss": 0.0235, "step": 7570 }, { "grad_norm": 0.4858342707157135, "learning_rate": 9.841611013251429e-05, "loss": 0.0241, "step": 7580 }, { "grad_norm": 0.41062504053115845, "learning_rate": 9.840922147730133e-05, "loss": 0.022, "step": 7590 }, { "grad_norm": 0.39325666427612305, "learning_rate": 9.840231811664506e-05, "loss": 0.0225, "step": 7600 }, { "grad_norm": 0.43286484479904175, "learning_rate": 9.839540005264252e-05, "loss": 0.0183, "step": 7610 }, { "grad_norm": 0.4203701913356781, "learning_rate": 9.838846728739527e-05, "loss": 0.0194, "step": 7620 }, { "grad_norm": 0.3241867423057556, "learning_rate": 9.838151982300927e-05, "loss": 0.0186, "step": 7630 }, { "grad_norm": 0.472136527299881, "learning_rate": 9.8374557661595e-05, "loss": 0.0208, "step": 7640 }, { "grad_norm": 0.3961484134197235, "learning_rate": 9.836758080526735e-05, "loss": 0.0247, "step": 7650 }, { "grad_norm": 0.43421080708503723, "learning_rate": 9.836058925614575e-05, "loss": 0.0208, "step": 7660 }, { "grad_norm": 0.5333141684532166, "learning_rate": 9.8353583016354e-05, "loss": 0.0213, "step": 7670 }, { "grad_norm": 0.36163684725761414, "learning_rate": 9.834656208802044e-05, "loss": 0.0206, "step": 7680 }, { "grad_norm": 0.48670899868011475, "learning_rate": 9.833952647327784e-05, "loss": 0.021, "step": 7690 }, { "grad_norm": 0.46940380334854126, "learning_rate": 9.833247617426342e-05, "loss": 0.0194, "step": 7700 }, { "grad_norm": 0.4104388356208801, "learning_rate": 9.832541119311889e-05, "loss": 0.0213, "step": 7710 }, { "grad_norm": 0.40959322452545166, "learning_rate": 9.83183315319904e-05, "loss": 0.0193, "step": 7720 }, { "grad_norm": 0.4774872064590454, "learning_rate": 9.831123719302855e-05, "loss": 0.0252, "step": 7730 }, { "grad_norm": 0.5066968202590942, "learning_rate": 9.830412817838842e-05, "loss": 0.0224, "step": 7740 }, { "grad_norm": 0.4927401840686798, "learning_rate": 9.829700449022956e-05, "loss": 0.0204, "step": 7750 }, { "grad_norm": 0.4348791837692261, "learning_rate": 9.828986613071593e-05, "loss": 0.0199, "step": 7760 }, { "grad_norm": 0.44254836440086365, "learning_rate": 9.828271310201601e-05, "loss": 0.0169, "step": 7770 }, { "grad_norm": 0.4523863196372986, "learning_rate": 9.827554540630268e-05, "loss": 0.0232, "step": 7780 }, { "grad_norm": 0.3269347548484802, "learning_rate": 9.826836304575329e-05, "loss": 0.0195, "step": 7790 }, { "grad_norm": 0.38588622212409973, "learning_rate": 9.826116602254966e-05, "loss": 0.023, "step": 7800 }, { "grad_norm": 0.38418668508529663, "learning_rate": 9.825395433887805e-05, "loss": 0.0237, "step": 7810 }, { "grad_norm": 0.5125848054885864, "learning_rate": 9.824672799692917e-05, "loss": 0.0278, "step": 7820 }, { "grad_norm": 0.5025792717933655, "learning_rate": 9.823948699889823e-05, "loss": 0.0326, "step": 7830 }, { "grad_norm": 0.42523086071014404, "learning_rate": 9.823223134698483e-05, "loss": 0.0233, "step": 7840 }, { "grad_norm": 0.5269473791122437, "learning_rate": 9.822496104339303e-05, "loss": 0.0209, "step": 7850 }, { "grad_norm": 0.4181648790836334, "learning_rate": 9.821767609033138e-05, "loss": 0.0205, "step": 7860 }, { "grad_norm": 0.4053167402744293, "learning_rate": 9.821037649001284e-05, "loss": 0.0213, "step": 7870 }, { "grad_norm": 0.5356869101524353, "learning_rate": 9.820306224465486e-05, "loss": 0.023, "step": 7880 }, { "grad_norm": 0.47925180196762085, "learning_rate": 9.819573335647928e-05, "loss": 0.0214, "step": 7890 }, { "grad_norm": 0.5209689736366272, "learning_rate": 9.818838982771246e-05, "loss": 0.0195, "step": 7900 }, { "grad_norm": 0.41950997710227966, "learning_rate": 9.818103166058514e-05, "loss": 0.027, "step": 7910 }, { "grad_norm": 0.398406982421875, "learning_rate": 9.817365885733254e-05, "loss": 0.0205, "step": 7920 }, { "grad_norm": 0.451774924993515, "learning_rate": 9.816627142019434e-05, "loss": 0.0229, "step": 7930 }, { "grad_norm": 0.590817391872406, "learning_rate": 9.815886935141463e-05, "loss": 0.0213, "step": 7940 }, { "grad_norm": 0.43801072239875793, "learning_rate": 9.8151452653242e-05, "loss": 0.0271, "step": 7950 }, { "grad_norm": 0.3935137689113617, "learning_rate": 9.814402132792939e-05, "loss": 0.029, "step": 7960 }, { "grad_norm": 0.5044687986373901, "learning_rate": 9.813657537773428e-05, "loss": 0.0223, "step": 7970 }, { "grad_norm": 0.47347891330718994, "learning_rate": 9.812911480491854e-05, "loss": 0.0292, "step": 7980 }, { "grad_norm": 0.46765559911727905, "learning_rate": 9.81216396117485e-05, "loss": 0.0222, "step": 7990 }, { "grad_norm": 0.4351886212825775, "learning_rate": 9.811414980049491e-05, "loss": 0.0211, "step": 8000 }, { "grad_norm": 0.4591696262359619, "learning_rate": 9.810664537343301e-05, "loss": 0.0248, "step": 8010 }, { "grad_norm": 0.5217757821083069, "learning_rate": 9.809912633284243e-05, "loss": 0.0212, "step": 8020 }, { "grad_norm": 0.47767433524131775, "learning_rate": 9.809159268100725e-05, "loss": 0.0228, "step": 8030 }, { "grad_norm": 0.468953937292099, "learning_rate": 9.808404442021599e-05, "loss": 0.0185, "step": 8040 }, { "grad_norm": 0.406931072473526, "learning_rate": 9.807648155276163e-05, "loss": 0.017, "step": 8050 }, { "grad_norm": 0.5090564489364624, "learning_rate": 9.806890408094156e-05, "loss": 0.0203, "step": 8060 }, { "grad_norm": 0.4319576621055603, "learning_rate": 9.806131200705761e-05, "loss": 0.0194, "step": 8070 }, { "grad_norm": 0.34996670484542847, "learning_rate": 9.805370533341605e-05, "loss": 0.0191, "step": 8080 }, { "grad_norm": 0.3921457529067993, "learning_rate": 9.804608406232762e-05, "loss": 0.018, "step": 8090 }, { "grad_norm": 0.35749679803848267, "learning_rate": 9.803844819610741e-05, "loss": 0.0173, "step": 8100 }, { "grad_norm": 0.35104918479919434, "learning_rate": 9.803079773707504e-05, "loss": 0.0205, "step": 8110 }, { "grad_norm": 0.3688827455043793, "learning_rate": 9.802313268755447e-05, "loss": 0.0206, "step": 8120 }, { "grad_norm": 0.37073761224746704, "learning_rate": 9.801545304987419e-05, "loss": 0.0174, "step": 8130 }, { "grad_norm": 0.4200715720653534, "learning_rate": 9.800775882636704e-05, "loss": 0.0218, "step": 8140 }, { "grad_norm": 0.45175638794898987, "learning_rate": 9.800005001937034e-05, "loss": 0.021, "step": 8150 }, { "grad_norm": 0.3802504539489746, "learning_rate": 9.79923266312258e-05, "loss": 0.0216, "step": 8160 }, { "grad_norm": 0.43102043867111206, "learning_rate": 9.79845886642796e-05, "loss": 0.0208, "step": 8170 }, { "grad_norm": 0.3668661117553711, "learning_rate": 9.797683612088233e-05, "loss": 0.0221, "step": 8180 }, { "grad_norm": 0.4550149440765381, "learning_rate": 9.796906900338898e-05, "loss": 0.0203, "step": 8190 }, { "grad_norm": 0.45624905824661255, "learning_rate": 9.796128731415903e-05, "loss": 0.0212, "step": 8200 }, { "grad_norm": 0.4912322461605072, "learning_rate": 9.795349105555634e-05, "loss": 0.0209, "step": 8210 }, { "grad_norm": 0.5102741718292236, "learning_rate": 9.794568022994922e-05, "loss": 0.023, "step": 8220 }, { "grad_norm": 0.3720044195652008, "learning_rate": 9.793785483971034e-05, "loss": 0.0222, "step": 8230 }, { "grad_norm": 0.388742059469223, "learning_rate": 9.793001488721691e-05, "loss": 0.0176, "step": 8240 }, { "grad_norm": 0.41112661361694336, "learning_rate": 9.792216037485047e-05, "loss": 0.0173, "step": 8250 }, { "grad_norm": 0.540833592414856, "learning_rate": 9.791429130499704e-05, "loss": 0.0182, "step": 8260 }, { "grad_norm": 0.40391018986701965, "learning_rate": 9.790640768004698e-05, "loss": 0.027, "step": 8270 }, { "grad_norm": 0.44928425550460815, "learning_rate": 9.789850950239518e-05, "loss": 0.0216, "step": 8280 }, { "grad_norm": 0.4349338412284851, "learning_rate": 9.789059677444089e-05, "loss": 0.0212, "step": 8290 }, { "grad_norm": 0.3825474977493286, "learning_rate": 9.788266949858776e-05, "loss": 0.0196, "step": 8300 }, { "grad_norm": 0.4604344666004181, "learning_rate": 9.787472767724392e-05, "loss": 0.0257, "step": 8310 }, { "grad_norm": 0.488352507352829, "learning_rate": 9.786677131282185e-05, "loss": 0.0232, "step": 8320 }, { "grad_norm": 0.42431139945983887, "learning_rate": 9.785880040773853e-05, "loss": 0.0225, "step": 8330 }, { "grad_norm": 0.41543862223625183, "learning_rate": 9.785081496441527e-05, "loss": 0.0213, "step": 8340 }, { "grad_norm": 0.45707616209983826, "learning_rate": 9.784281498527785e-05, "loss": 0.0202, "step": 8350 }, { "grad_norm": 0.3349325656890869, "learning_rate": 9.783480047275646e-05, "loss": 0.0198, "step": 8360 }, { "grad_norm": 0.5466597080230713, "learning_rate": 9.78267714292857e-05, "loss": 0.0165, "step": 8370 }, { "grad_norm": 0.42251455783843994, "learning_rate": 9.781872785730454e-05, "loss": 0.0208, "step": 8380 }, { "grad_norm": 0.47086822986602783, "learning_rate": 9.781066975925646e-05, "loss": 0.0201, "step": 8390 }, { "grad_norm": 0.39013954997062683, "learning_rate": 9.780259713758928e-05, "loss": 0.017, "step": 8400 }, { "grad_norm": 0.43151065707206726, "learning_rate": 9.779450999475524e-05, "loss": 0.0192, "step": 8410 }, { "grad_norm": 0.6166801452636719, "learning_rate": 9.7786408333211e-05, "loss": 0.0205, "step": 8420 }, { "grad_norm": 0.6463521122932434, "learning_rate": 9.777829215541764e-05, "loss": 0.025, "step": 8430 }, { "grad_norm": 0.5083146691322327, "learning_rate": 9.777016146384064e-05, "loss": 0.0232, "step": 8440 }, { "grad_norm": 0.4979715943336487, "learning_rate": 9.776201626094988e-05, "loss": 0.0214, "step": 8450 }, { "grad_norm": 0.3771785497665405, "learning_rate": 9.775385654921965e-05, "loss": 0.0211, "step": 8460 }, { "grad_norm": 0.4617980420589447, "learning_rate": 9.774568233112868e-05, "loss": 0.0202, "step": 8470 }, { "grad_norm": 0.5535889863967896, "learning_rate": 9.773749360916007e-05, "loss": 0.0287, "step": 8480 }, { "grad_norm": 0.430027574300766, "learning_rate": 9.772929038580134e-05, "loss": 0.0272, "step": 8490 }, { "grad_norm": 0.5492871999740601, "learning_rate": 9.772107266354439e-05, "loss": 0.0222, "step": 8500 }, { "grad_norm": 0.45631569623947144, "learning_rate": 9.77128404448856e-05, "loss": 0.0238, "step": 8510 }, { "grad_norm": 0.4331152141094208, "learning_rate": 9.770459373232565e-05, "loss": 0.0225, "step": 8520 }, { "grad_norm": 0.4748595356941223, "learning_rate": 9.769633252836969e-05, "loss": 0.0187, "step": 8530 }, { "grad_norm": 0.4157826900482178, "learning_rate": 9.768805683552724e-05, "loss": 0.0175, "step": 8540 }, { "grad_norm": 0.3154793381690979, "learning_rate": 9.767976665631228e-05, "loss": 0.0231, "step": 8550 }, { "grad_norm": 0.4567323625087738, "learning_rate": 9.767146199324311e-05, "loss": 0.0197, "step": 8560 }, { "grad_norm": 0.4639695882797241, "learning_rate": 9.766314284884249e-05, "loss": 0.0242, "step": 8570 }, { "grad_norm": 0.45093733072280884, "learning_rate": 9.765480922563752e-05, "loss": 0.0207, "step": 8580 }, { "grad_norm": 0.41865789890289307, "learning_rate": 9.764646112615978e-05, "loss": 0.0201, "step": 8590 }, { "grad_norm": 0.37710443139076233, "learning_rate": 9.763809855294517e-05, "loss": 0.0231, "step": 8600 }, { "grad_norm": 0.4054875671863556, "learning_rate": 9.762972150853404e-05, "loss": 0.0232, "step": 8610 }, { "grad_norm": 0.46502891182899475, "learning_rate": 9.762132999547111e-05, "loss": 0.023, "step": 8620 }, { "grad_norm": 0.43600308895111084, "learning_rate": 9.761292401630549e-05, "loss": 0.0206, "step": 8630 }, { "grad_norm": 0.41320857405662537, "learning_rate": 9.76045035735907e-05, "loss": 0.0192, "step": 8640 }, { "grad_norm": 0.38332414627075195, "learning_rate": 9.759606866988464e-05, "loss": 0.0254, "step": 8650 }, { "grad_norm": 0.41022416949272156, "learning_rate": 9.758761930774963e-05, "loss": 0.0202, "step": 8660 }, { "grad_norm": 0.47089439630508423, "learning_rate": 9.757915548975235e-05, "loss": 0.0218, "step": 8670 }, { "grad_norm": 0.42582616209983826, "learning_rate": 9.757067721846389e-05, "loss": 0.0233, "step": 8680 }, { "grad_norm": 0.5691205859184265, "learning_rate": 9.756218449645971e-05, "loss": 0.0208, "step": 8690 }, { "grad_norm": 0.44527509808540344, "learning_rate": 9.75536773263197e-05, "loss": 0.0214, "step": 8700 }, { "grad_norm": 0.47421303391456604, "learning_rate": 9.75451557106281e-05, "loss": 0.0179, "step": 8710 }, { "grad_norm": 0.37851452827453613, "learning_rate": 9.753661965197354e-05, "loss": 0.0193, "step": 8720 }, { "grad_norm": 0.4480518698692322, "learning_rate": 9.752806915294908e-05, "loss": 0.0214, "step": 8730 }, { "grad_norm": 0.33988139033317566, "learning_rate": 9.75195042161521e-05, "loss": 0.0201, "step": 8740 }, { "grad_norm": 0.5393968820571899, "learning_rate": 9.751092484418442e-05, "loss": 0.0221, "step": 8750 }, { "grad_norm": 0.3613015413284302, "learning_rate": 9.750233103965224e-05, "loss": 0.0228, "step": 8760 }, { "grad_norm": 0.43525227904319763, "learning_rate": 9.749372280516611e-05, "loss": 0.0194, "step": 8770 }, { "grad_norm": 0.5981136560440063, "learning_rate": 9.748510014334097e-05, "loss": 0.0226, "step": 8780 }, { "grad_norm": 0.4568023979663849, "learning_rate": 9.747646305679621e-05, "loss": 0.0242, "step": 8790 }, { "grad_norm": 0.35805001854896545, "learning_rate": 9.74678115481555e-05, "loss": 0.0197, "step": 8800 }, { "grad_norm": 0.4945303499698639, "learning_rate": 9.745914562004696e-05, "loss": 0.0225, "step": 8810 }, { "grad_norm": 0.5129665732383728, "learning_rate": 9.745046527510307e-05, "loss": 0.0236, "step": 8820 }, { "grad_norm": 0.44416165351867676, "learning_rate": 9.744177051596068e-05, "loss": 0.021, "step": 8830 }, { "grad_norm": 0.41258132457733154, "learning_rate": 9.743306134526105e-05, "loss": 0.0176, "step": 8840 }, { "grad_norm": 0.719923198223114, "learning_rate": 9.742433776564977e-05, "loss": 0.0216, "step": 8850 }, { "grad_norm": 0.27094727754592896, "learning_rate": 9.741559977977683e-05, "loss": 0.0232, "step": 8860 }, { "grad_norm": 0.4060170650482178, "learning_rate": 9.740684739029661e-05, "loss": 0.0186, "step": 8870 }, { "grad_norm": 0.34258833527565, "learning_rate": 9.739808059986789e-05, "loss": 0.0214, "step": 8880 }, { "grad_norm": 0.4309740662574768, "learning_rate": 9.738929941115373e-05, "loss": 0.0167, "step": 8890 }, { "grad_norm": 0.4455328583717346, "learning_rate": 9.738050382682167e-05, "loss": 0.023, "step": 8900 }, { "grad_norm": 0.3460875451564789, "learning_rate": 9.737169384954355e-05, "loss": 0.0175, "step": 8910 }, { "grad_norm": 0.5134332180023193, "learning_rate": 9.736286948199562e-05, "loss": 0.0179, "step": 8920 }, { "grad_norm": 0.38894134759902954, "learning_rate": 9.735403072685848e-05, "loss": 0.017, "step": 8930 }, { "grad_norm": 0.35251039266586304, "learning_rate": 9.734517758681712e-05, "loss": 0.0176, "step": 8940 }, { "grad_norm": 0.41770267486572266, "learning_rate": 9.733631006456088e-05, "loss": 0.0201, "step": 8950 }, { "grad_norm": 0.4722731113433838, "learning_rate": 9.732742816278348e-05, "loss": 0.0216, "step": 8960 }, { "grad_norm": 0.3995599150657654, "learning_rate": 9.731853188418302e-05, "loss": 0.0153, "step": 8970 }, { "grad_norm": 0.4286731779575348, "learning_rate": 9.730962123146194e-05, "loss": 0.021, "step": 8980 }, { "grad_norm": 0.49526873230934143, "learning_rate": 9.730069620732709e-05, "loss": 0.0211, "step": 8990 }, { "grad_norm": 0.4411509335041046, "learning_rate": 9.72917568144896e-05, "loss": 0.0243, "step": 9000 }, { "grad_norm": 0.33901944756507874, "learning_rate": 9.728280305566509e-05, "loss": 0.0187, "step": 9010 }, { "grad_norm": 0.31252339482307434, "learning_rate": 9.727383493357343e-05, "loss": 0.0174, "step": 9020 }, { "grad_norm": 0.43856510519981384, "learning_rate": 9.726485245093891e-05, "loss": 0.0179, "step": 9030 }, { "grad_norm": 0.3634667694568634, "learning_rate": 9.725585561049018e-05, "loss": 0.0175, "step": 9040 }, { "grad_norm": 0.46502628922462463, "learning_rate": 9.724684441496022e-05, "loss": 0.0175, "step": 9050 }, { "grad_norm": 0.40700143575668335, "learning_rate": 9.72378188670864e-05, "loss": 0.0203, "step": 9060 }, { "grad_norm": 0.40098509192466736, "learning_rate": 9.722877896961047e-05, "loss": 0.0181, "step": 9070 }, { "grad_norm": 0.33963191509246826, "learning_rate": 9.721972472527848e-05, "loss": 0.0179, "step": 9080 }, { "grad_norm": 0.4348234534263611, "learning_rate": 9.721065613684089e-05, "loss": 0.0177, "step": 9090 }, { "grad_norm": 0.40149277448654175, "learning_rate": 9.72015732070525e-05, "loss": 0.0218, "step": 9100 }, { "grad_norm": 0.471910297870636, "learning_rate": 9.719247593867244e-05, "loss": 0.0205, "step": 9110 }, { "grad_norm": 0.4839973449707031, "learning_rate": 9.718336433446423e-05, "loss": 0.0207, "step": 9120 }, { "grad_norm": 0.43992653489112854, "learning_rate": 9.717423839719574e-05, "loss": 0.0153, "step": 9130 }, { "grad_norm": 0.32272520661354065, "learning_rate": 9.71650981296392e-05, "loss": 0.0201, "step": 9140 }, { "grad_norm": 0.46651262044906616, "learning_rate": 9.715594353457118e-05, "loss": 0.0228, "step": 9150 }, { "grad_norm": 0.3393571376800537, "learning_rate": 9.714677461477257e-05, "loss": 0.0185, "step": 9160 }, { "grad_norm": 0.5020555853843689, "learning_rate": 9.713759137302869e-05, "loss": 0.022, "step": 9170 }, { "grad_norm": 0.4701262414455414, "learning_rate": 9.712839381212914e-05, "loss": 0.0209, "step": 9180 }, { "grad_norm": 0.4095565676689148, "learning_rate": 9.71191819348679e-05, "loss": 0.0161, "step": 9190 }, { "grad_norm": 0.4011421203613281, "learning_rate": 9.710995574404331e-05, "loss": 0.0203, "step": 9200 }, { "grad_norm": 0.4591890573501587, "learning_rate": 9.710071524245802e-05, "loss": 0.0184, "step": 9210 }, { "grad_norm": 0.4001629054546356, "learning_rate": 9.709146043291906e-05, "loss": 0.0224, "step": 9220 }, { "grad_norm": 0.4999217689037323, "learning_rate": 9.70821913182378e-05, "loss": 0.0199, "step": 9230 }, { "grad_norm": 0.377604603767395, "learning_rate": 9.707290790122995e-05, "loss": 0.0188, "step": 9240 }, { "grad_norm": 0.3935345411300659, "learning_rate": 9.706361018471557e-05, "loss": 0.0229, "step": 9250 }, { "grad_norm": 0.3941493034362793, "learning_rate": 9.705429817151906e-05, "loss": 0.0196, "step": 9260 }, { "grad_norm": 0.38897705078125, "learning_rate": 9.704497186446917e-05, "loss": 0.0203, "step": 9270 }, { "grad_norm": 0.4263511598110199, "learning_rate": 9.703563126639896e-05, "loss": 0.0154, "step": 9280 }, { "grad_norm": 0.43895986676216125, "learning_rate": 9.70262763801459e-05, "loss": 0.0184, "step": 9290 }, { "grad_norm": 0.47976306080818176, "learning_rate": 9.701690720855171e-05, "loss": 0.0208, "step": 9300 }, { "grad_norm": 0.5062249898910522, "learning_rate": 9.700752375446253e-05, "loss": 0.0206, "step": 9310 }, { "grad_norm": 0.4274824261665344, "learning_rate": 9.69981260207288e-05, "loss": 0.0204, "step": 9320 }, { "grad_norm": 0.48724469542503357, "learning_rate": 9.698871401020529e-05, "loss": 0.0232, "step": 9330 }, { "grad_norm": 0.44530192017555237, "learning_rate": 9.697928772575112e-05, "loss": 0.0215, "step": 9340 }, { "grad_norm": 0.34917524456977844, "learning_rate": 9.696984717022976e-05, "loss": 0.0178, "step": 9350 }, { "grad_norm": 0.33441391587257385, "learning_rate": 9.6960392346509e-05, "loss": 0.021, "step": 9360 }, { "grad_norm": 0.48504024744033813, "learning_rate": 9.695092325746097e-05, "loss": 0.0245, "step": 9370 }, { "grad_norm": 0.407268226146698, "learning_rate": 9.694143990596211e-05, "loss": 0.022, "step": 9380 }, { "grad_norm": 0.3820374608039856, "learning_rate": 9.693194229489325e-05, "loss": 0.02, "step": 9390 }, { "grad_norm": 0.5052766799926758, "learning_rate": 9.692243042713944e-05, "loss": 0.0175, "step": 9400 }, { "grad_norm": 0.4108522832393646, "learning_rate": 9.691290430559022e-05, "loss": 0.0177, "step": 9410 }, { "grad_norm": 0.49897515773773193, "learning_rate": 9.690336393313932e-05, "loss": 0.0207, "step": 9420 }, { "grad_norm": 0.5770998001098633, "learning_rate": 9.689380931268487e-05, "loss": 0.0183, "step": 9430 }, { "grad_norm": 0.3491075336933136, "learning_rate": 9.688424044712932e-05, "loss": 0.0253, "step": 9440 }, { "grad_norm": 0.3772425055503845, "learning_rate": 9.687465733937942e-05, "loss": 0.0176, "step": 9450 }, { "grad_norm": 0.4027300477027893, "learning_rate": 9.686505999234627e-05, "loss": 0.0217, "step": 9460 }, { "grad_norm": 0.48696863651275635, "learning_rate": 9.685544840894529e-05, "loss": 0.0212, "step": 9470 }, { "grad_norm": 0.41473135352134705, "learning_rate": 9.684582259209624e-05, "loss": 0.0195, "step": 9480 }, { "grad_norm": 0.4275462329387665, "learning_rate": 9.683618254472317e-05, "loss": 0.0233, "step": 9490 }, { "grad_norm": 0.4512259364128113, "learning_rate": 9.682652826975449e-05, "loss": 0.0231, "step": 9500 }, { "grad_norm": 0.3336307108402252, "learning_rate": 9.681685977012291e-05, "loss": 0.019, "step": 9510 }, { "grad_norm": 0.33084091544151306, "learning_rate": 9.680717704876546e-05, "loss": 0.0223, "step": 9520 }, { "grad_norm": 0.39084622263908386, "learning_rate": 9.679748010862349e-05, "loss": 0.0196, "step": 9530 }, { "grad_norm": 0.4734356105327606, "learning_rate": 9.678776895264267e-05, "loss": 0.0214, "step": 9540 }, { "grad_norm": 0.4087027907371521, "learning_rate": 9.6778043583773e-05, "loss": 0.0221, "step": 9550 }, { "grad_norm": 0.4503321945667267, "learning_rate": 9.67683040049688e-05, "loss": 0.0229, "step": 9560 }, { "grad_norm": 0.516207218170166, "learning_rate": 9.675855021918869e-05, "loss": 0.0213, "step": 9570 }, { "grad_norm": 0.4708077311515808, "learning_rate": 9.674878222939561e-05, "loss": 0.0227, "step": 9580 }, { "grad_norm": 0.39283517003059387, "learning_rate": 9.673900003855681e-05, "loss": 0.0174, "step": 9590 }, { "grad_norm": 0.4651467204093933, "learning_rate": 9.672920364964389e-05, "loss": 0.0202, "step": 9600 }, { "grad_norm": 0.44438183307647705, "learning_rate": 9.671939306563269e-05, "loss": 0.0221, "step": 9610 }, { "grad_norm": 0.4396190047264099, "learning_rate": 9.670956828950345e-05, "loss": 0.0208, "step": 9620 }, { "grad_norm": 0.5852305889129639, "learning_rate": 9.669972932424065e-05, "loss": 0.0228, "step": 9630 }, { "grad_norm": 0.3958914577960968, "learning_rate": 9.668987617283312e-05, "loss": 0.0171, "step": 9640 }, { "grad_norm": 0.37439844012260437, "learning_rate": 9.668000883827397e-05, "loss": 0.0219, "step": 9650 }, { "grad_norm": 0.44193920493125916, "learning_rate": 9.667012732356067e-05, "loss": 0.0183, "step": 9660 }, { "grad_norm": 0.41145873069763184, "learning_rate": 9.666023163169493e-05, "loss": 0.0195, "step": 9670 }, { "grad_norm": 0.5343393087387085, "learning_rate": 9.665032176568281e-05, "loss": 0.019, "step": 9680 }, { "grad_norm": 0.496954083442688, "learning_rate": 9.664039772853469e-05, "loss": 0.0168, "step": 9690 }, { "grad_norm": 0.3513059616088867, "learning_rate": 9.663045952326518e-05, "loss": 0.0178, "step": 9700 }, { "grad_norm": 0.47588497400283813, "learning_rate": 9.662050715289328e-05, "loss": 0.0204, "step": 9710 }, { "grad_norm": 0.4222734868526459, "learning_rate": 9.661054062044226e-05, "loss": 0.0199, "step": 9720 }, { "grad_norm": 0.4997919201850891, "learning_rate": 9.660055992893968e-05, "loss": 0.0197, "step": 9730 }, { "grad_norm": 0.31947237253189087, "learning_rate": 9.659056508141739e-05, "loss": 0.0177, "step": 9740 }, { "grad_norm": 0.3846675157546997, "learning_rate": 9.658055608091161e-05, "loss": 0.018, "step": 9750 }, { "grad_norm": 0.36692550778388977, "learning_rate": 9.657053293046276e-05, "loss": 0.0183, "step": 9760 }, { "grad_norm": 0.5022429823875427, "learning_rate": 9.656049563311564e-05, "loss": 0.018, "step": 9770 }, { "grad_norm": 0.4174310564994812, "learning_rate": 9.655044419191929e-05, "loss": 0.019, "step": 9780 }, { "grad_norm": 0.4601549804210663, "learning_rate": 9.654037860992711e-05, "loss": 0.0212, "step": 9790 }, { "grad_norm": 0.38675203919410706, "learning_rate": 9.653029889019672e-05, "loss": 0.0154, "step": 9800 }, { "grad_norm": 0.5508227944374084, "learning_rate": 9.65202050357901e-05, "loss": 0.0199, "step": 9810 }, { "grad_norm": 0.3828381597995758, "learning_rate": 9.651009704977347e-05, "loss": 0.0159, "step": 9820 }, { "grad_norm": 0.426347941160202, "learning_rate": 9.649997493521738e-05, "loss": 0.0229, "step": 9830 }, { "grad_norm": 0.3594949543476105, "learning_rate": 9.64898386951967e-05, "loss": 0.0204, "step": 9840 }, { "grad_norm": 0.4823181927204132, "learning_rate": 9.647968833279049e-05, "loss": 0.0239, "step": 9850 }, { "grad_norm": 0.4254204034805298, "learning_rate": 9.646952385108218e-05, "loss": 0.0207, "step": 9860 }, { "grad_norm": 0.41789454221725464, "learning_rate": 9.645934525315951e-05, "loss": 0.0175, "step": 9870 }, { "grad_norm": 0.3172294795513153, "learning_rate": 9.644915254211442e-05, "loss": 0.0192, "step": 9880 }, { "grad_norm": 0.4198409616947174, "learning_rate": 9.643894572104321e-05, "loss": 0.0243, "step": 9890 }, { "grad_norm": 0.3813696503639221, "learning_rate": 9.642872479304644e-05, "loss": 0.0192, "step": 9900 }, { "grad_norm": 0.3871660530567169, "learning_rate": 9.641848976122895e-05, "loss": 0.0196, "step": 9910 }, { "grad_norm": 0.3427238464355469, "learning_rate": 9.64082406286999e-05, "loss": 0.0226, "step": 9920 }, { "grad_norm": 0.28634995222091675, "learning_rate": 9.639797739857269e-05, "loss": 0.015, "step": 9930 }, { "grad_norm": 0.399654746055603, "learning_rate": 9.638770007396498e-05, "loss": 0.0229, "step": 9940 }, { "grad_norm": 0.3774471580982208, "learning_rate": 9.63774086579988e-05, "loss": 0.0211, "step": 9950 }, { "grad_norm": 0.5255162119865417, "learning_rate": 9.63671031538004e-05, "loss": 0.0197, "step": 9960 }, { "grad_norm": 0.41620728373527527, "learning_rate": 9.635678356450031e-05, "loss": 0.0201, "step": 9970 }, { "grad_norm": 0.3821359872817993, "learning_rate": 9.634644989323336e-05, "loss": 0.018, "step": 9980 }, { "grad_norm": 0.476431280374527, "learning_rate": 9.633610214313861e-05, "loss": 0.023, "step": 9990 }, { "grad_norm": 0.3535378873348236, "learning_rate": 9.632574031735951e-05, "loss": 0.0249, "step": 10000 }, { "grad_norm": 0.4726323187351227, "learning_rate": 9.631536441904364e-05, "loss": 0.0225, "step": 10010 }, { "grad_norm": 0.4587574899196625, "learning_rate": 9.630497445134293e-05, "loss": 0.0211, "step": 10020 }, { "grad_norm": 0.36823904514312744, "learning_rate": 9.62945704174136e-05, "loss": 0.022, "step": 10030 }, { "grad_norm": 0.5425640940666199, "learning_rate": 9.628415232041612e-05, "loss": 0.0255, "step": 10040 }, { "grad_norm": 0.40507611632347107, "learning_rate": 9.627372016351524e-05, "loss": 0.0171, "step": 10050 }, { "grad_norm": 0.4909762144088745, "learning_rate": 9.626327394987995e-05, "loss": 0.0212, "step": 10060 }, { "grad_norm": 0.36966660618782043, "learning_rate": 9.625281368268355e-05, "loss": 0.022, "step": 10070 }, { "grad_norm": 0.5324448943138123, "learning_rate": 9.624233936510357e-05, "loss": 0.0208, "step": 10080 }, { "grad_norm": 0.44957393407821655, "learning_rate": 9.623185100032187e-05, "loss": 0.0207, "step": 10090 }, { "grad_norm": 0.3709962069988251, "learning_rate": 9.62213485915245e-05, "loss": 0.0213, "step": 10100 }, { "grad_norm": 0.6879405975341797, "learning_rate": 9.621083214190186e-05, "loss": 0.019, "step": 10110 }, { "grad_norm": 0.4251745045185089, "learning_rate": 9.62003016546485e-05, "loss": 0.0205, "step": 10120 }, { "grad_norm": 0.42113032937049866, "learning_rate": 9.618975713296339e-05, "loss": 0.0207, "step": 10130 }, { "grad_norm": 0.4057314097881317, "learning_rate": 9.61791985800496e-05, "loss": 0.0194, "step": 10140 }, { "grad_norm": 0.2854096591472626, "learning_rate": 9.616862599911458e-05, "loss": 0.0178, "step": 10150 }, { "grad_norm": 0.4426201283931732, "learning_rate": 9.615803939337e-05, "loss": 0.0177, "step": 10160 }, { "grad_norm": 0.4161039888858795, "learning_rate": 9.614743876603178e-05, "loss": 0.0208, "step": 10170 }, { "grad_norm": 0.44474175572395325, "learning_rate": 9.613682412032013e-05, "loss": 0.0186, "step": 10180 }, { "grad_norm": 0.3343549072742462, "learning_rate": 9.612619545945947e-05, "loss": 0.0198, "step": 10190 }, { "grad_norm": 0.4139963984489441, "learning_rate": 9.611555278667852e-05, "loss": 0.0198, "step": 10200 }, { "grad_norm": 0.47918879985809326, "learning_rate": 9.610489610521024e-05, "loss": 0.0222, "step": 10210 }, { "grad_norm": 0.39651069045066833, "learning_rate": 9.609422541829187e-05, "loss": 0.0201, "step": 10220 }, { "grad_norm": 0.393915593624115, "learning_rate": 9.608354072916486e-05, "loss": 0.0193, "step": 10230 }, { "grad_norm": 0.4271937608718872, "learning_rate": 9.607284204107493e-05, "loss": 0.0166, "step": 10240 }, { "grad_norm": 0.4442124366760254, "learning_rate": 9.606212935727208e-05, "loss": 0.0193, "step": 10250 }, { "grad_norm": 0.33683204650878906, "learning_rate": 9.605140268101052e-05, "loss": 0.0202, "step": 10260 }, { "grad_norm": 0.30396541953086853, "learning_rate": 9.604066201554875e-05, "loss": 0.0187, "step": 10270 }, { "grad_norm": 0.4092296063899994, "learning_rate": 9.60299073641495e-05, "loss": 0.0254, "step": 10280 }, { "grad_norm": 0.3376172184944153, "learning_rate": 9.601913873007974e-05, "loss": 0.0214, "step": 10290 }, { "grad_norm": 0.3814334571361542, "learning_rate": 9.60083561166107e-05, "loss": 0.0193, "step": 10300 }, { "grad_norm": 0.3993943929672241, "learning_rate": 9.599755952701783e-05, "loss": 0.0194, "step": 10310 }, { "grad_norm": 0.4468556344509125, "learning_rate": 9.598674896458089e-05, "loss": 0.0225, "step": 10320 }, { "grad_norm": 0.39103955030441284, "learning_rate": 9.597592443258383e-05, "loss": 0.0181, "step": 10330 }, { "grad_norm": 0.29275935888290405, "learning_rate": 9.596508593431483e-05, "loss": 0.0162, "step": 10340 }, { "grad_norm": 0.3820743262767792, "learning_rate": 9.59542334730664e-05, "loss": 0.0159, "step": 10350 }, { "grad_norm": 0.2735508978366852, "learning_rate": 9.594336705213516e-05, "loss": 0.0134, "step": 10360 }, { "grad_norm": 0.3563217222690582, "learning_rate": 9.593248667482208e-05, "loss": 0.0188, "step": 10370 }, { "grad_norm": 0.4323899447917938, "learning_rate": 9.592159234443233e-05, "loss": 0.0166, "step": 10380 }, { "grad_norm": 0.4390353858470917, "learning_rate": 9.59106840642753e-05, "loss": 0.0164, "step": 10390 }, { "grad_norm": 0.49410876631736755, "learning_rate": 9.589976183766467e-05, "loss": 0.0205, "step": 10400 }, { "grad_norm": 0.40017032623291016, "learning_rate": 9.58888256679183e-05, "loss": 0.0188, "step": 10410 }, { "grad_norm": 0.4119909703731537, "learning_rate": 9.587787555835832e-05, "loss": 0.0185, "step": 10420 }, { "grad_norm": 0.4083598554134369, "learning_rate": 9.586691151231107e-05, "loss": 0.0165, "step": 10430 }, { "grad_norm": 0.38529402017593384, "learning_rate": 9.585593353310715e-05, "loss": 0.0187, "step": 10440 }, { "grad_norm": 0.3211827576160431, "learning_rate": 9.58449416240814e-05, "loss": 0.0169, "step": 10450 }, { "grad_norm": 0.4595158100128174, "learning_rate": 9.583393578857283e-05, "loss": 0.0164, "step": 10460 }, { "grad_norm": 0.4236113131046295, "learning_rate": 9.582291602992474e-05, "loss": 0.0194, "step": 10470 }, { "grad_norm": 0.3500874936580658, "learning_rate": 9.581188235148466e-05, "loss": 0.0175, "step": 10480 }, { "grad_norm": 0.4048246145248413, "learning_rate": 9.58008347566043e-05, "loss": 0.0182, "step": 10490 }, { "grad_norm": 0.46567973494529724, "learning_rate": 9.578977324863965e-05, "loss": 0.0185, "step": 10500 }, { "grad_norm": 0.35758182406425476, "learning_rate": 9.577869783095089e-05, "loss": 0.0219, "step": 10510 }, { "grad_norm": 0.42754676938056946, "learning_rate": 9.576760850690245e-05, "loss": 0.0229, "step": 10520 }, { "grad_norm": 0.43167948722839355, "learning_rate": 9.575650527986298e-05, "loss": 0.0214, "step": 10530 }, { "grad_norm": 0.29197803139686584, "learning_rate": 9.574538815320531e-05, "loss": 0.0162, "step": 10540 }, { "grad_norm": 0.3847380578517914, "learning_rate": 9.573425713030656e-05, "loss": 0.0146, "step": 10550 }, { "grad_norm": 0.37215369939804077, "learning_rate": 9.572311221454806e-05, "loss": 0.0197, "step": 10560 }, { "grad_norm": 0.3433307707309723, "learning_rate": 9.57119534093153e-05, "loss": 0.0186, "step": 10570 }, { "grad_norm": 0.33722245693206787, "learning_rate": 9.570078071799806e-05, "loss": 0.0156, "step": 10580 }, { "grad_norm": 0.3338375687599182, "learning_rate": 9.568959414399028e-05, "loss": 0.018, "step": 10590 }, { "grad_norm": 0.3536750376224518, "learning_rate": 9.567839369069018e-05, "loss": 0.0166, "step": 10600 }, { "grad_norm": 0.3826412260532379, "learning_rate": 9.566717936150013e-05, "loss": 0.0189, "step": 10610 }, { "grad_norm": 0.3103746175765991, "learning_rate": 9.565595115982678e-05, "loss": 0.0168, "step": 10620 }, { "grad_norm": 0.3635064661502838, "learning_rate": 9.564470908908094e-05, "loss": 0.0183, "step": 10630 }, { "grad_norm": 0.28928908705711365, "learning_rate": 9.563345315267764e-05, "loss": 0.0162, "step": 10640 }, { "grad_norm": 0.4335481524467468, "learning_rate": 9.562218335403616e-05, "loss": 0.0146, "step": 10650 }, { "grad_norm": 0.3834221363067627, "learning_rate": 9.561089969657999e-05, "loss": 0.0184, "step": 10660 }, { "grad_norm": 0.35768556594848633, "learning_rate": 9.559960218373673e-05, "loss": 0.0167, "step": 10670 }, { "grad_norm": 0.43383631110191345, "learning_rate": 9.558829081893836e-05, "loss": 0.0214, "step": 10680 }, { "grad_norm": 0.4365203082561493, "learning_rate": 9.55769656056209e-05, "loss": 0.0154, "step": 10690 }, { "grad_norm": 0.3742186725139618, "learning_rate": 9.556562654722469e-05, "loss": 0.0157, "step": 10700 }, { "grad_norm": 0.4212026596069336, "learning_rate": 9.555427364719422e-05, "loss": 0.019, "step": 10710 }, { "grad_norm": 0.3102274537086487, "learning_rate": 9.55429069089782e-05, "loss": 0.0157, "step": 10720 }, { "grad_norm": 0.30959296226501465, "learning_rate": 9.553152633602956e-05, "loss": 0.0163, "step": 10730 }, { "grad_norm": 0.39335960149765015, "learning_rate": 9.552013193180543e-05, "loss": 0.0187, "step": 10740 }, { "grad_norm": 0.46019238233566284, "learning_rate": 9.550872369976707e-05, "loss": 0.0186, "step": 10750 }, { "grad_norm": 0.5118435025215149, "learning_rate": 9.549730164338007e-05, "loss": 0.0181, "step": 10760 }, { "grad_norm": 0.38193657994270325, "learning_rate": 9.548586576611408e-05, "loss": 0.0176, "step": 10770 }, { "grad_norm": 0.3409776985645294, "learning_rate": 9.54744160714431e-05, "loss": 0.0145, "step": 10780 }, { "grad_norm": 0.4351353347301483, "learning_rate": 9.546295256284516e-05, "loss": 0.0197, "step": 10790 }, { "grad_norm": 0.3557078242301941, "learning_rate": 9.545147524380265e-05, "loss": 0.0178, "step": 10800 }, { "grad_norm": 0.4559164047241211, "learning_rate": 9.543998411780201e-05, "loss": 0.0158, "step": 10810 }, { "grad_norm": 0.28311851620674133, "learning_rate": 9.542847918833397e-05, "loss": 0.0198, "step": 10820 }, { "grad_norm": 0.4107087254524231, "learning_rate": 9.541696045889343e-05, "loss": 0.0179, "step": 10830 }, { "grad_norm": 0.366896390914917, "learning_rate": 9.540542793297947e-05, "loss": 0.0184, "step": 10840 }, { "grad_norm": 0.4244709610939026, "learning_rate": 9.539388161409537e-05, "loss": 0.0179, "step": 10850 }, { "grad_norm": 0.31330156326293945, "learning_rate": 9.538232150574857e-05, "loss": 0.0192, "step": 10860 }, { "grad_norm": 0.24634262919425964, "learning_rate": 9.537074761145076e-05, "loss": 0.0161, "step": 10870 }, { "grad_norm": 0.2985272705554962, "learning_rate": 9.535915993471778e-05, "loss": 0.0154, "step": 10880 }, { "grad_norm": 0.41853320598602295, "learning_rate": 9.534755847906964e-05, "loss": 0.0162, "step": 10890 }, { "grad_norm": 0.34125640988349915, "learning_rate": 9.533594324803057e-05, "loss": 0.0152, "step": 10900 }, { "grad_norm": 0.4875860810279846, "learning_rate": 9.532431424512895e-05, "loss": 0.0153, "step": 10910 }, { "grad_norm": 0.40720805525779724, "learning_rate": 9.531267147389741e-05, "loss": 0.016, "step": 10920 }, { "grad_norm": 0.39791691303253174, "learning_rate": 9.530101493787266e-05, "loss": 0.0198, "step": 10930 }, { "grad_norm": 0.32826876640319824, "learning_rate": 9.528934464059571e-05, "loss": 0.0151, "step": 10940 }, { "grad_norm": 0.37639832496643066, "learning_rate": 9.527766058561163e-05, "loss": 0.0166, "step": 10950 }, { "grad_norm": 0.3262758255004883, "learning_rate": 9.526596277646976e-05, "loss": 0.0154, "step": 10960 }, { "grad_norm": 0.4586452841758728, "learning_rate": 9.525425121672358e-05, "loss": 0.0169, "step": 10970 }, { "grad_norm": 0.34545138478279114, "learning_rate": 9.524252590993074e-05, "loss": 0.017, "step": 10980 }, { "grad_norm": 0.3826800584793091, "learning_rate": 9.523078685965309e-05, "loss": 0.0186, "step": 10990 }, { "grad_norm": 0.3638732433319092, "learning_rate": 9.521903406945664e-05, "loss": 0.0208, "step": 11000 }, { "grad_norm": 0.29514259099960327, "learning_rate": 9.520726754291158e-05, "loss": 0.0161, "step": 11010 }, { "grad_norm": 0.4679412543773651, "learning_rate": 9.519548728359227e-05, "loss": 0.0183, "step": 11020 }, { "grad_norm": 0.3475341796875, "learning_rate": 9.518369329507726e-05, "loss": 0.0196, "step": 11030 }, { "grad_norm": 0.35793352127075195, "learning_rate": 9.51718855809492e-05, "loss": 0.0161, "step": 11040 }, { "grad_norm": 0.32648763060569763, "learning_rate": 9.516006414479502e-05, "loss": 0.0161, "step": 11050 }, { "grad_norm": 0.369131863117218, "learning_rate": 9.514822899020572e-05, "loss": 0.0148, "step": 11060 }, { "grad_norm": 0.5173619389533997, "learning_rate": 9.513638012077654e-05, "loss": 0.0221, "step": 11070 }, { "grad_norm": 0.29296427965164185, "learning_rate": 9.512451754010683e-05, "loss": 0.0175, "step": 11080 }, { "grad_norm": 0.342509925365448, "learning_rate": 9.511264125180013e-05, "loss": 0.0174, "step": 11090 }, { "grad_norm": 0.3526631295681, "learning_rate": 9.510075125946414e-05, "loss": 0.0181, "step": 11100 }, { "grad_norm": 0.3492644727230072, "learning_rate": 9.508884756671075e-05, "loss": 0.0213, "step": 11110 }, { "grad_norm": 0.4704457223415375, "learning_rate": 9.507693017715596e-05, "loss": 0.0194, "step": 11120 }, { "grad_norm": 0.38454246520996094, "learning_rate": 9.506499909441997e-05, "loss": 0.0168, "step": 11130 }, { "grad_norm": 0.42641520500183105, "learning_rate": 9.505305432212713e-05, "loss": 0.022, "step": 11140 }, { "grad_norm": 0.513241708278656, "learning_rate": 9.504109586390595e-05, "loss": 0.0248, "step": 11150 }, { "grad_norm": 0.4879985749721527, "learning_rate": 9.502912372338908e-05, "loss": 0.0206, "step": 11160 }, { "grad_norm": 0.4405342638492584, "learning_rate": 9.501713790421335e-05, "loss": 0.0194, "step": 11170 }, { "grad_norm": 0.44434618949890137, "learning_rate": 9.500513841001974e-05, "loss": 0.0165, "step": 11180 }, { "grad_norm": 0.2799209654331207, "learning_rate": 9.499312524445336e-05, "loss": 0.0221, "step": 11190 }, { "grad_norm": 0.42130419611930847, "learning_rate": 9.498109841116351e-05, "loss": 0.0217, "step": 11200 }, { "grad_norm": 0.3674059212207794, "learning_rate": 9.496905791380363e-05, "loss": 0.0222, "step": 11210 }, { "grad_norm": 0.3240606486797333, "learning_rate": 9.495700375603129e-05, "loss": 0.0163, "step": 11220 }, { "grad_norm": 0.3400369882583618, "learning_rate": 9.494493594150822e-05, "loss": 0.02, "step": 11230 }, { "grad_norm": 0.39605817198753357, "learning_rate": 9.493285447390032e-05, "loss": 0.0164, "step": 11240 }, { "grad_norm": 0.481381356716156, "learning_rate": 9.492075935687761e-05, "loss": 0.0205, "step": 11250 }, { "grad_norm": 0.4486084282398224, "learning_rate": 9.490865059411427e-05, "loss": 0.0164, "step": 11260 }, { "grad_norm": 0.3878118097782135, "learning_rate": 9.489652818928863e-05, "loss": 0.0161, "step": 11270 }, { "grad_norm": 0.4139678180217743, "learning_rate": 9.488439214608315e-05, "loss": 0.0169, "step": 11280 }, { "grad_norm": 0.5058616399765015, "learning_rate": 9.487224246818444e-05, "loss": 0.0182, "step": 11290 }, { "grad_norm": 0.31589049100875854, "learning_rate": 9.486007915928325e-05, "loss": 0.0187, "step": 11300 }, { "grad_norm": 0.38299834728240967, "learning_rate": 9.484790222307448e-05, "loss": 0.0188, "step": 11310 }, { "grad_norm": 0.3907677233219147, "learning_rate": 9.483571166325716e-05, "loss": 0.0164, "step": 11320 }, { "grad_norm": 0.27185311913490295, "learning_rate": 9.482350748353444e-05, "loss": 0.0183, "step": 11330 }, { "grad_norm": 0.40438348054885864, "learning_rate": 9.481128968761363e-05, "loss": 0.0204, "step": 11340 }, { "grad_norm": 0.3392781913280487, "learning_rate": 9.479905827920621e-05, "loss": 0.0194, "step": 11350 }, { "grad_norm": 0.46958860754966736, "learning_rate": 9.478681326202773e-05, "loss": 0.0173, "step": 11360 }, { "grad_norm": 0.4522368311882019, "learning_rate": 9.477455463979791e-05, "loss": 0.0201, "step": 11370 }, { "grad_norm": 0.313892126083374, "learning_rate": 9.476228241624059e-05, "loss": 0.0163, "step": 11380 }, { "grad_norm": 0.3670874834060669, "learning_rate": 9.474999659508374e-05, "loss": 0.014, "step": 11390 }, { "grad_norm": 0.3470529317855835, "learning_rate": 9.47376971800595e-05, "loss": 0.0139, "step": 11400 }, { "grad_norm": 0.33186304569244385, "learning_rate": 9.472538417490409e-05, "loss": 0.0181, "step": 11410 }, { "grad_norm": 0.369559109210968, "learning_rate": 9.471305758335784e-05, "loss": 0.0171, "step": 11420 }, { "grad_norm": 0.2526124119758606, "learning_rate": 9.47007174091653e-05, "loss": 0.0173, "step": 11430 }, { "grad_norm": 0.508969247341156, "learning_rate": 9.468836365607507e-05, "loss": 0.0147, "step": 11440 }, { "grad_norm": 0.414359450340271, "learning_rate": 9.467599632783988e-05, "loss": 0.0169, "step": 11450 }, { "grad_norm": 0.37647607922554016, "learning_rate": 9.466361542821662e-05, "loss": 0.023, "step": 11460 }, { "grad_norm": 0.33450043201446533, "learning_rate": 9.465122096096625e-05, "loss": 0.0198, "step": 11470 }, { "grad_norm": 0.33466044068336487, "learning_rate": 9.463881292985391e-05, "loss": 0.0189, "step": 11480 }, { "grad_norm": 0.32771435379981995, "learning_rate": 9.462639133864881e-05, "loss": 0.0154, "step": 11490 }, { "grad_norm": 0.2925075888633728, "learning_rate": 9.461395619112432e-05, "loss": 0.0185, "step": 11500 }, { "grad_norm": 0.3099575638771057, "learning_rate": 9.460150749105791e-05, "loss": 0.0156, "step": 11510 }, { "grad_norm": 0.3670234978199005, "learning_rate": 9.458904524223116e-05, "loss": 0.0179, "step": 11520 }, { "grad_norm": 0.5236250758171082, "learning_rate": 9.457656944842976e-05, "loss": 0.0198, "step": 11530 }, { "grad_norm": 0.48519566655158997, "learning_rate": 9.456408011344353e-05, "loss": 0.0166, "step": 11540 }, { "grad_norm": 0.4143960773944855, "learning_rate": 9.455157724106643e-05, "loss": 0.0169, "step": 11550 }, { "grad_norm": 0.3964592516422272, "learning_rate": 9.453906083509647e-05, "loss": 0.0165, "step": 11560 }, { "grad_norm": 0.42435070872306824, "learning_rate": 9.45265308993358e-05, "loss": 0.0164, "step": 11570 }, { "grad_norm": 0.4208449125289917, "learning_rate": 9.451398743759071e-05, "loss": 0.0147, "step": 11580 }, { "grad_norm": 0.3677162826061249, "learning_rate": 9.450143045367156e-05, "loss": 0.0184, "step": 11590 }, { "grad_norm": 0.35301387310028076, "learning_rate": 9.448885995139283e-05, "loss": 0.0159, "step": 11600 }, { "grad_norm": 0.3524612784385681, "learning_rate": 9.44762759345731e-05, "loss": 0.0172, "step": 11610 }, { "grad_norm": 0.36330240964889526, "learning_rate": 9.446367840703509e-05, "loss": 0.0191, "step": 11620 }, { "grad_norm": 0.3642682731151581, "learning_rate": 9.445106737260556e-05, "loss": 0.0157, "step": 11630 }, { "grad_norm": 0.37559136748313904, "learning_rate": 9.443844283511543e-05, "loss": 0.0185, "step": 11640 }, { "grad_norm": 0.36658918857574463, "learning_rate": 9.442580479839968e-05, "loss": 0.0141, "step": 11650 }, { "grad_norm": 0.38368797302246094, "learning_rate": 9.441315326629745e-05, "loss": 0.015, "step": 11660 }, { "grad_norm": 0.38235482573509216, "learning_rate": 9.44004882426519e-05, "loss": 0.0206, "step": 11670 }, { "grad_norm": 0.3051309585571289, "learning_rate": 9.438780973131037e-05, "loss": 0.0137, "step": 11680 }, { "grad_norm": 0.35470375418663025, "learning_rate": 9.437511773612423e-05, "loss": 0.017, "step": 11690 }, { "grad_norm": 0.34460386633872986, "learning_rate": 9.436241226094896e-05, "loss": 0.0188, "step": 11700 }, { "grad_norm": 0.40933752059936523, "learning_rate": 9.434969330964418e-05, "loss": 0.0216, "step": 11710 }, { "grad_norm": 0.40221714973449707, "learning_rate": 9.433696088607356e-05, "loss": 0.0157, "step": 11720 }, { "grad_norm": 0.30475372076034546, "learning_rate": 9.432421499410486e-05, "loss": 0.0165, "step": 11730 }, { "grad_norm": 0.30148208141326904, "learning_rate": 9.431145563760998e-05, "loss": 0.0152, "step": 11740 }, { "grad_norm": 0.3930191397666931, "learning_rate": 9.429868282046484e-05, "loss": 0.015, "step": 11750 }, { "grad_norm": 0.3486611843109131, "learning_rate": 9.428589654654951e-05, "loss": 0.0172, "step": 11760 }, { "grad_norm": 0.36533284187316895, "learning_rate": 9.42730968197481e-05, "loss": 0.0136, "step": 11770 }, { "grad_norm": 0.33773350715637207, "learning_rate": 9.426028364394883e-05, "loss": 0.0181, "step": 11780 }, { "grad_norm": 0.47433459758758545, "learning_rate": 9.424745702304402e-05, "loss": 0.0176, "step": 11790 }, { "grad_norm": 0.2880942225456238, "learning_rate": 9.423461696093006e-05, "loss": 0.017, "step": 11800 }, { "grad_norm": 0.29186585545539856, "learning_rate": 9.422176346150741e-05, "loss": 0.0156, "step": 11810 }, { "grad_norm": 0.36226364970207214, "learning_rate": 9.420889652868063e-05, "loss": 0.0208, "step": 11820 }, { "grad_norm": 0.44331395626068115, "learning_rate": 9.419601616635836e-05, "loss": 0.0173, "step": 11830 }, { "grad_norm": 0.32699716091156006, "learning_rate": 9.418312237845331e-05, "loss": 0.0148, "step": 11840 }, { "grad_norm": 0.333482027053833, "learning_rate": 9.417021516888225e-05, "loss": 0.0181, "step": 11850 }, { "grad_norm": 0.3288630545139313, "learning_rate": 9.415729454156608e-05, "loss": 0.0188, "step": 11860 }, { "grad_norm": 0.2743556797504425, "learning_rate": 9.414436050042973e-05, "loss": 0.0212, "step": 11870 }, { "grad_norm": 0.3627742528915405, "learning_rate": 9.413141304940223e-05, "loss": 0.0154, "step": 11880 }, { "grad_norm": 0.4373108446598053, "learning_rate": 9.411845219241666e-05, "loss": 0.0182, "step": 11890 }, { "grad_norm": 0.34265047311782837, "learning_rate": 9.410547793341021e-05, "loss": 0.0186, "step": 11900 }, { "grad_norm": 0.3257286548614502, "learning_rate": 9.409249027632408e-05, "loss": 0.017, "step": 11910 }, { "grad_norm": 0.3931034803390503, "learning_rate": 9.407948922510362e-05, "loss": 0.0168, "step": 11920 }, { "grad_norm": 0.3891640603542328, "learning_rate": 9.406647478369817e-05, "loss": 0.015, "step": 11930 }, { "grad_norm": 0.3381834030151367, "learning_rate": 9.405344695606118e-05, "loss": 0.0181, "step": 11940 }, { "grad_norm": 0.35451003909111023, "learning_rate": 9.404040574615018e-05, "loss": 0.0173, "step": 11950 }, { "grad_norm": 0.384470134973526, "learning_rate": 9.402735115792674e-05, "loss": 0.0177, "step": 11960 }, { "grad_norm": 0.39360836148262024, "learning_rate": 9.401428319535649e-05, "loss": 0.0191, "step": 11970 }, { "grad_norm": 0.4161492586135864, "learning_rate": 9.400120186240912e-05, "loss": 0.0158, "step": 11980 }, { "grad_norm": 0.4901255965232849, "learning_rate": 9.398810716305844e-05, "loss": 0.016, "step": 11990 }, { "grad_norm": 0.34617289900779724, "learning_rate": 9.397499910128222e-05, "loss": 0.0145, "step": 12000 }, { "grad_norm": 0.352186381816864, "learning_rate": 9.396187768106237e-05, "loss": 0.019, "step": 12010 }, { "grad_norm": 0.35461387038230896, "learning_rate": 9.394874290638482e-05, "loss": 0.0205, "step": 12020 }, { "grad_norm": 0.3936839699745178, "learning_rate": 9.393559478123959e-05, "loss": 0.016, "step": 12030 }, { "grad_norm": 0.4476394057273865, "learning_rate": 9.39224333096207e-05, "loss": 0.0142, "step": 12040 }, { "grad_norm": 0.37746569514274597, "learning_rate": 9.390925849552629e-05, "loss": 0.0173, "step": 12050 }, { "grad_norm": 0.47711020708084106, "learning_rate": 9.389607034295849e-05, "loss": 0.0195, "step": 12060 }, { "grad_norm": 0.3538389205932617, "learning_rate": 9.388286885592355e-05, "loss": 0.0156, "step": 12070 }, { "grad_norm": 0.35910990834236145, "learning_rate": 9.386965403843168e-05, "loss": 0.0151, "step": 12080 }, { "grad_norm": 0.30144861340522766, "learning_rate": 9.385642589449726e-05, "loss": 0.014, "step": 12090 }, { "grad_norm": 0.31444376707077026, "learning_rate": 9.38431844281386e-05, "loss": 0.0167, "step": 12100 }, { "grad_norm": 0.3443724811077118, "learning_rate": 9.38299296433781e-05, "loss": 0.0153, "step": 12110 }, { "grad_norm": 0.3032764494419098, "learning_rate": 9.381666154424226e-05, "loss": 0.0159, "step": 12120 }, { "grad_norm": 0.372444748878479, "learning_rate": 9.380338013476157e-05, "loss": 0.0144, "step": 12130 }, { "grad_norm": 0.35097065567970276, "learning_rate": 9.379008541897054e-05, "loss": 0.0159, "step": 12140 }, { "grad_norm": 0.39746779203414917, "learning_rate": 9.377677740090777e-05, "loss": 0.0163, "step": 12150 }, { "grad_norm": 0.33482182025909424, "learning_rate": 9.376345608461588e-05, "loss": 0.0238, "step": 12160 }, { "grad_norm": 0.2922261357307434, "learning_rate": 9.375012147414155e-05, "loss": 0.0145, "step": 12170 }, { "grad_norm": 0.3712657690048218, "learning_rate": 9.373677357353545e-05, "loss": 0.0189, "step": 12180 }, { "grad_norm": 0.36195287108421326, "learning_rate": 9.372341238685237e-05, "loss": 0.0196, "step": 12190 }, { "grad_norm": 0.3878498673439026, "learning_rate": 9.371003791815102e-05, "loss": 0.0146, "step": 12200 }, { "grad_norm": 0.4278899133205414, "learning_rate": 9.369665017149429e-05, "loss": 0.02, "step": 12210 }, { "grad_norm": 0.3481537997722626, "learning_rate": 9.368324915094895e-05, "loss": 0.0194, "step": 12220 }, { "grad_norm": 0.3668903410434723, "learning_rate": 9.366983486058591e-05, "loss": 0.0195, "step": 12230 }, { "grad_norm": 0.37120190262794495, "learning_rate": 9.365640730448009e-05, "loss": 0.015, "step": 12240 }, { "grad_norm": 0.33916807174682617, "learning_rate": 9.36429664867104e-05, "loss": 0.0179, "step": 12250 }, { "grad_norm": 0.37433159351348877, "learning_rate": 9.362951241135982e-05, "loss": 0.0165, "step": 12260 }, { "grad_norm": 0.3238275647163391, "learning_rate": 9.361604508251534e-05, "loss": 0.0193, "step": 12270 }, { "grad_norm": 0.4189596474170685, "learning_rate": 9.360256450426799e-05, "loss": 0.0182, "step": 12280 }, { "grad_norm": 0.3561323881149292, "learning_rate": 9.358907068071279e-05, "loss": 0.0149, "step": 12290 }, { "grad_norm": 0.32975974678993225, "learning_rate": 9.357556361594882e-05, "loss": 0.0144, "step": 12300 }, { "grad_norm": 0.4341329038143158, "learning_rate": 9.356204331407917e-05, "loss": 0.0164, "step": 12310 }, { "grad_norm": 0.35126253962516785, "learning_rate": 9.354850977921094e-05, "loss": 0.0143, "step": 12320 }, { "grad_norm": 0.3386474549770355, "learning_rate": 9.353496301545529e-05, "loss": 0.0147, "step": 12330 }, { "grad_norm": 0.43438291549682617, "learning_rate": 9.352140302692733e-05, "loss": 0.0147, "step": 12340 }, { "grad_norm": 0.3747648298740387, "learning_rate": 9.350782981774627e-05, "loss": 0.0161, "step": 12350 }, { "grad_norm": 0.3662484586238861, "learning_rate": 9.349424339203526e-05, "loss": 0.0146, "step": 12360 }, { "grad_norm": 0.32128167152404785, "learning_rate": 9.34806437539215e-05, "loss": 0.012, "step": 12370 }, { "grad_norm": 0.42437613010406494, "learning_rate": 9.346703090753622e-05, "loss": 0.0194, "step": 12380 }, { "grad_norm": 0.3569956123828888, "learning_rate": 9.345340485701461e-05, "loss": 0.0154, "step": 12390 }, { "grad_norm": 0.5556051135063171, "learning_rate": 9.343976560649595e-05, "loss": 0.0192, "step": 12400 }, { "grad_norm": 0.4483718276023865, "learning_rate": 9.342611316012344e-05, "loss": 0.0188, "step": 12410 }, { "grad_norm": 0.317126989364624, "learning_rate": 9.341244752204437e-05, "loss": 0.0189, "step": 12420 }, { "grad_norm": 0.306243896484375, "learning_rate": 9.339876869640995e-05, "loss": 0.0174, "step": 12430 }, { "grad_norm": 0.35216355323791504, "learning_rate": 9.33850766873755e-05, "loss": 0.0145, "step": 12440 }, { "grad_norm": 0.35221606492996216, "learning_rate": 9.337137149910028e-05, "loss": 0.0129, "step": 12450 }, { "grad_norm": 0.4230436682701111, "learning_rate": 9.335765313574753e-05, "loss": 0.0167, "step": 12460 }, { "grad_norm": 0.4327002763748169, "learning_rate": 9.334392160148457e-05, "loss": 0.0146, "step": 12470 }, { "grad_norm": 0.4073230028152466, "learning_rate": 9.333017690048264e-05, "loss": 0.0186, "step": 12480 }, { "grad_norm": 0.44986507296562195, "learning_rate": 9.331641903691706e-05, "loss": 0.0171, "step": 12490 }, { "grad_norm": 0.41319334506988525, "learning_rate": 9.330264801496707e-05, "loss": 0.0174, "step": 12500 }, { "grad_norm": 0.3380400836467743, "learning_rate": 9.328886383881594e-05, "loss": 0.0149, "step": 12510 }, { "grad_norm": 0.461952269077301, "learning_rate": 9.327506651265095e-05, "loss": 0.0148, "step": 12520 }, { "grad_norm": 0.285542368888855, "learning_rate": 9.326125604066338e-05, "loss": 0.0152, "step": 12530 }, { "grad_norm": 0.3334369659423828, "learning_rate": 9.324743242704847e-05, "loss": 0.014, "step": 12540 }, { "grad_norm": 0.3627273738384247, "learning_rate": 9.323359567600546e-05, "loss": 0.0165, "step": 12550 }, { "grad_norm": 0.4116191864013672, "learning_rate": 9.321974579173761e-05, "loss": 0.0129, "step": 12560 }, { "grad_norm": 0.3877717852592468, "learning_rate": 9.320588277845213e-05, "loss": 0.0172, "step": 12570 }, { "grad_norm": 0.3369559943675995, "learning_rate": 9.319200664036026e-05, "loss": 0.0153, "step": 12580 }, { "grad_norm": 0.446371853351593, "learning_rate": 9.31781173816772e-05, "loss": 0.0142, "step": 12590 }, { "grad_norm": 0.381043016910553, "learning_rate": 9.316421500662212e-05, "loss": 0.0156, "step": 12600 }, { "grad_norm": 0.2549687623977661, "learning_rate": 9.31502995194182e-05, "loss": 0.0145, "step": 12610 }, { "grad_norm": 0.29747188091278076, "learning_rate": 9.31363709242926e-05, "loss": 0.0135, "step": 12620 }, { "grad_norm": 0.38646069169044495, "learning_rate": 9.312242922547647e-05, "loss": 0.0167, "step": 12630 }, { "grad_norm": 0.33346793055534363, "learning_rate": 9.310847442720492e-05, "loss": 0.015, "step": 12640 }, { "grad_norm": 0.32923072576522827, "learning_rate": 9.309450653371706e-05, "loss": 0.0164, "step": 12650 }, { "grad_norm": 0.3573673665523529, "learning_rate": 9.308052554925595e-05, "loss": 0.018, "step": 12660 }, { "grad_norm": 0.4661363661289215, "learning_rate": 9.306653147806867e-05, "loss": 0.0181, "step": 12670 }, { "grad_norm": 0.41974154114723206, "learning_rate": 9.305252432440622e-05, "loss": 0.0146, "step": 12680 }, { "grad_norm": 0.41825205087661743, "learning_rate": 9.303850409252361e-05, "loss": 0.0213, "step": 12690 }, { "grad_norm": 0.38984861969947815, "learning_rate": 9.302447078667985e-05, "loss": 0.0222, "step": 12700 }, { "grad_norm": 0.33325374126434326, "learning_rate": 9.301042441113783e-05, "loss": 0.0139, "step": 12710 }, { "grad_norm": 0.40057238936424255, "learning_rate": 9.299636497016451e-05, "loss": 0.0165, "step": 12720 }, { "grad_norm": 0.3229687213897705, "learning_rate": 9.298229246803076e-05, "loss": 0.0155, "step": 12730 }, { "grad_norm": 0.37557703256607056, "learning_rate": 9.296820690901144e-05, "loss": 0.0171, "step": 12740 }, { "grad_norm": 0.36332058906555176, "learning_rate": 9.295410829738539e-05, "loss": 0.0153, "step": 12750 }, { "grad_norm": 0.3513367474079132, "learning_rate": 9.293999663743535e-05, "loss": 0.0164, "step": 12760 }, { "grad_norm": 0.29821816086769104, "learning_rate": 9.292587193344813e-05, "loss": 0.0155, "step": 12770 }, { "grad_norm": 0.4072168469429016, "learning_rate": 9.291173418971437e-05, "loss": 0.021, "step": 12780 }, { "grad_norm": 0.34375905990600586, "learning_rate": 9.28975834105288e-05, "loss": 0.0194, "step": 12790 }, { "grad_norm": 0.35275042057037354, "learning_rate": 9.288341960019004e-05, "loss": 0.0184, "step": 12800 }, { "grad_norm": 0.4296000897884369, "learning_rate": 9.286924276300067e-05, "loss": 0.0201, "step": 12810 }, { "grad_norm": 0.32279685139656067, "learning_rate": 9.285505290326726e-05, "loss": 0.017, "step": 12820 }, { "grad_norm": 0.3500145971775055, "learning_rate": 9.284085002530027e-05, "loss": 0.0162, "step": 12830 }, { "grad_norm": 0.34305018186569214, "learning_rate": 9.282663413341422e-05, "loss": 0.0173, "step": 12840 }, { "grad_norm": 0.3377833664417267, "learning_rate": 9.281240523192747e-05, "loss": 0.0154, "step": 12850 }, { "grad_norm": 0.3174279034137726, "learning_rate": 9.279816332516242e-05, "loss": 0.0143, "step": 12860 }, { "grad_norm": 0.3884032964706421, "learning_rate": 9.278390841744536e-05, "loss": 0.0153, "step": 12870 }, { "grad_norm": 0.39290687441825867, "learning_rate": 9.276964051310658e-05, "loss": 0.0166, "step": 12880 }, { "grad_norm": 0.33610856533050537, "learning_rate": 9.275535961648027e-05, "loss": 0.0162, "step": 12890 }, { "grad_norm": 0.39055031538009644, "learning_rate": 9.274106573190459e-05, "loss": 0.0158, "step": 12900 }, { "grad_norm": 0.40723916888237, "learning_rate": 9.272675886372168e-05, "loss": 0.0152, "step": 12910 }, { "grad_norm": 0.4116220772266388, "learning_rate": 9.271243901627754e-05, "loss": 0.0175, "step": 12920 }, { "grad_norm": 0.3669080138206482, "learning_rate": 9.269810619392219e-05, "loss": 0.0144, "step": 12930 }, { "grad_norm": 0.29769766330718994, "learning_rate": 9.268376040100955e-05, "loss": 0.0133, "step": 12940 }, { "grad_norm": 0.6282018423080444, "learning_rate": 9.266940164189752e-05, "loss": 0.02, "step": 12950 }, { "grad_norm": 0.39701253175735474, "learning_rate": 9.265502992094787e-05, "loss": 0.0208, "step": 12960 }, { "grad_norm": 0.3430003821849823, "learning_rate": 9.264064524252638e-05, "loss": 0.019, "step": 12970 }, { "grad_norm": 0.31682834029197693, "learning_rate": 9.262624761100271e-05, "loss": 0.0188, "step": 12980 }, { "grad_norm": 0.5807638168334961, "learning_rate": 9.261183703075051e-05, "loss": 0.0205, "step": 12990 }, { "grad_norm": 0.5272794961929321, "learning_rate": 9.259741350614733e-05, "loss": 0.0189, "step": 13000 }, { "grad_norm": 0.37830850481987, "learning_rate": 9.258297704157464e-05, "loss": 0.0221, "step": 13010 }, { "grad_norm": 0.37266066670417786, "learning_rate": 9.256852764141786e-05, "loss": 0.0182, "step": 13020 }, { "grad_norm": 0.3502638339996338, "learning_rate": 9.255406531006634e-05, "loss": 0.0176, "step": 13030 }, { "grad_norm": 0.3538574278354645, "learning_rate": 9.253959005191335e-05, "loss": 0.0197, "step": 13040 }, { "grad_norm": 0.5364757776260376, "learning_rate": 9.25251018713561e-05, "loss": 0.0167, "step": 13050 }, { "grad_norm": 0.3922223448753357, "learning_rate": 9.251060077279571e-05, "loss": 0.0186, "step": 13060 }, { "grad_norm": 0.3120754659175873, "learning_rate": 9.249608676063724e-05, "loss": 0.0133, "step": 13070 }, { "grad_norm": 0.3706241846084595, "learning_rate": 9.248155983928964e-05, "loss": 0.014, "step": 13080 }, { "grad_norm": 0.36041760444641113, "learning_rate": 9.246702001316583e-05, "loss": 0.0176, "step": 13090 }, { "grad_norm": 0.4849241077899933, "learning_rate": 9.245246728668262e-05, "loss": 0.0174, "step": 13100 }, { "grad_norm": 0.3944106996059418, "learning_rate": 9.243790166426073e-05, "loss": 0.0154, "step": 13110 }, { "grad_norm": 0.4160780608654022, "learning_rate": 9.242332315032484e-05, "loss": 0.0174, "step": 13120 }, { "grad_norm": 0.45664340257644653, "learning_rate": 9.240873174930349e-05, "loss": 0.0185, "step": 13130 }, { "grad_norm": 0.35323366522789, "learning_rate": 9.239412746562917e-05, "loss": 0.0168, "step": 13140 }, { "grad_norm": 0.3198740780353546, "learning_rate": 9.237951030373828e-05, "loss": 0.0146, "step": 13150 }, { "grad_norm": 0.36298424005508423, "learning_rate": 9.236488026807113e-05, "loss": 0.0116, "step": 13160 }, { "grad_norm": 0.3357863128185272, "learning_rate": 9.235023736307193e-05, "loss": 0.0156, "step": 13170 }, { "grad_norm": 0.27079761028289795, "learning_rate": 9.233558159318881e-05, "loss": 0.0136, "step": 13180 }, { "grad_norm": 0.31432172656059265, "learning_rate": 9.232091296287382e-05, "loss": 0.0123, "step": 13190 }, { "grad_norm": 0.3665439784526825, "learning_rate": 9.230623147658288e-05, "loss": 0.0205, "step": 13200 }, { "grad_norm": 0.36489400267601013, "learning_rate": 9.229153713877586e-05, "loss": 0.0148, "step": 13210 }, { "grad_norm": 0.4255466163158417, "learning_rate": 9.227682995391649e-05, "loss": 0.0151, "step": 13220 }, { "grad_norm": 0.4850270748138428, "learning_rate": 9.226210992647243e-05, "loss": 0.0201, "step": 13230 }, { "grad_norm": 0.4035317599773407, "learning_rate": 9.224737706091525e-05, "loss": 0.0224, "step": 13240 }, { "grad_norm": 0.31821107864379883, "learning_rate": 9.223263136172039e-05, "loss": 0.0204, "step": 13250 }, { "grad_norm": 0.31971925497055054, "learning_rate": 9.22178728333672e-05, "loss": 0.0213, "step": 13260 }, { "grad_norm": 0.4425103962421417, "learning_rate": 9.220310148033897e-05, "loss": 0.0194, "step": 13270 }, { "grad_norm": 0.3672041594982147, "learning_rate": 9.21883173071228e-05, "loss": 0.0168, "step": 13280 }, { "grad_norm": 0.27310770750045776, "learning_rate": 9.217352031820976e-05, "loss": 0.0193, "step": 13290 }, { "grad_norm": 0.3518291711807251, "learning_rate": 9.215871051809477e-05, "loss": 0.0192, "step": 13300 }, { "grad_norm": 0.27240341901779175, "learning_rate": 9.214388791127666e-05, "loss": 0.0161, "step": 13310 }, { "grad_norm": 0.35625964403152466, "learning_rate": 9.212905250225814e-05, "loss": 0.0158, "step": 13320 }, { "grad_norm": 0.3670119047164917, "learning_rate": 9.211420429554583e-05, "loss": 0.0175, "step": 13330 }, { "grad_norm": 0.2852018475532532, "learning_rate": 9.209934329565022e-05, "loss": 0.0147, "step": 13340 }, { "grad_norm": 0.3819805383682251, "learning_rate": 9.208446950708568e-05, "loss": 0.0136, "step": 13350 }, { "grad_norm": 0.37003952264785767, "learning_rate": 9.20695829343705e-05, "loss": 0.0157, "step": 13360 }, { "grad_norm": 0.30146515369415283, "learning_rate": 9.205468358202678e-05, "loss": 0.0142, "step": 13370 }, { "grad_norm": 0.312034547328949, "learning_rate": 9.203977145458059e-05, "loss": 0.0146, "step": 13380 }, { "grad_norm": 0.3290935456752777, "learning_rate": 9.202484655656182e-05, "loss": 0.0174, "step": 13390 }, { "grad_norm": 0.3005056381225586, "learning_rate": 9.200990889250427e-05, "loss": 0.0126, "step": 13400 }, { "grad_norm": 0.613081157207489, "learning_rate": 9.19949584669456e-05, "loss": 0.0163, "step": 13410 }, { "grad_norm": 0.31315794587135315, "learning_rate": 9.197999528442738e-05, "loss": 0.0168, "step": 13420 }, { "grad_norm": 0.5277458429336548, "learning_rate": 9.196501934949499e-05, "loss": 0.0182, "step": 13430 }, { "grad_norm": 0.4309273958206177, "learning_rate": 9.195003066669776e-05, "loss": 0.0209, "step": 13440 }, { "grad_norm": 0.4055766761302948, "learning_rate": 9.193502924058884e-05, "loss": 0.0164, "step": 13450 }, { "grad_norm": 0.3456658720970154, "learning_rate": 9.192001507572526e-05, "loss": 0.018, "step": 13460 }, { "grad_norm": 0.40551888942718506, "learning_rate": 9.190498817666793e-05, "loss": 0.0181, "step": 13470 }, { "grad_norm": 0.3050583600997925, "learning_rate": 9.188994854798163e-05, "loss": 0.0143, "step": 13480 }, { "grad_norm": 0.3459891378879547, "learning_rate": 9.187489619423499e-05, "loss": 0.0225, "step": 13490 }, { "grad_norm": 0.4638763666152954, "learning_rate": 9.185983112000056e-05, "loss": 0.02, "step": 13500 }, { "grad_norm": 0.3626478612422943, "learning_rate": 9.184475332985464e-05, "loss": 0.0207, "step": 13510 }, { "grad_norm": 0.34336400032043457, "learning_rate": 9.182966282837754e-05, "loss": 0.0127, "step": 13520 }, { "grad_norm": 0.4573110044002533, "learning_rate": 9.18145596201533e-05, "loss": 0.0151, "step": 13530 }, { "grad_norm": 0.3770318925380707, "learning_rate": 9.179944370976991e-05, "loss": 0.0199, "step": 13540 }, { "grad_norm": 0.39217254519462585, "learning_rate": 9.178431510181918e-05, "loss": 0.0172, "step": 13550 }, { "grad_norm": 0.39840951561927795, "learning_rate": 9.176917380089675e-05, "loss": 0.0181, "step": 13560 }, { "grad_norm": 0.3409793972969055, "learning_rate": 9.175401981160219e-05, "loss": 0.0131, "step": 13570 }, { "grad_norm": 0.4742024838924408, "learning_rate": 9.173885313853885e-05, "loss": 0.0187, "step": 13580 }, { "grad_norm": 0.2477242350578308, "learning_rate": 9.172367378631398e-05, "loss": 0.0137, "step": 13590 }, { "grad_norm": 0.3063543736934662, "learning_rate": 9.170848175953866e-05, "loss": 0.0167, "step": 13600 }, { "grad_norm": 0.32829663157463074, "learning_rate": 9.169327706282784e-05, "loss": 0.016, "step": 13610 }, { "grad_norm": 0.44849711656570435, "learning_rate": 9.167805970080029e-05, "loss": 0.0135, "step": 13620 }, { "grad_norm": 0.42181119322776794, "learning_rate": 9.166282967807864e-05, "loss": 0.0164, "step": 13630 }, { "grad_norm": 0.4155069589614868, "learning_rate": 9.16475869992894e-05, "loss": 0.0185, "step": 13640 }, { "grad_norm": 0.3853234052658081, "learning_rate": 9.163233166906284e-05, "loss": 0.0147, "step": 13650 }, { "grad_norm": 0.30451634526252747, "learning_rate": 9.161706369203317e-05, "loss": 0.0164, "step": 13660 }, { "grad_norm": 0.31337645649909973, "learning_rate": 9.16017830728384e-05, "loss": 0.0121, "step": 13670 }, { "grad_norm": 0.3939133882522583, "learning_rate": 9.158648981612035e-05, "loss": 0.0177, "step": 13680 }, { "grad_norm": 0.4016934335231781, "learning_rate": 9.157118392652472e-05, "loss": 0.0149, "step": 13690 }, { "grad_norm": 0.3945441246032715, "learning_rate": 9.155586540870104e-05, "loss": 0.0171, "step": 13700 }, { "grad_norm": 0.34516316652297974, "learning_rate": 9.154053426730267e-05, "loss": 0.0188, "step": 13710 }, { "grad_norm": 0.33230695128440857, "learning_rate": 9.15251905069868e-05, "loss": 0.0134, "step": 13720 }, { "grad_norm": 0.391711562871933, "learning_rate": 9.150983413241446e-05, "loss": 0.0162, "step": 13730 }, { "grad_norm": 0.34310588240623474, "learning_rate": 9.149446514825051e-05, "loss": 0.014, "step": 13740 }, { "grad_norm": 0.4484381079673767, "learning_rate": 9.147908355916365e-05, "loss": 0.0174, "step": 13750 }, { "grad_norm": 0.3011036217212677, "learning_rate": 9.146368936982642e-05, "loss": 0.0139, "step": 13760 }, { "grad_norm": 0.3673052489757538, "learning_rate": 9.144828258491511e-05, "loss": 0.0132, "step": 13770 }, { "grad_norm": 0.44669434428215027, "learning_rate": 9.143286320910996e-05, "loss": 0.0139, "step": 13780 }, { "grad_norm": 0.32562264800071716, "learning_rate": 9.141743124709491e-05, "loss": 0.0165, "step": 13790 }, { "grad_norm": 0.3485332131385803, "learning_rate": 9.140198670355784e-05, "loss": 0.0169, "step": 13800 }, { "grad_norm": 0.4706399738788605, "learning_rate": 9.138652958319034e-05, "loss": 0.0176, "step": 13810 }, { "grad_norm": 0.3889845609664917, "learning_rate": 9.137105989068791e-05, "loss": 0.0155, "step": 13820 }, { "grad_norm": 0.34890374541282654, "learning_rate": 9.135557763074983e-05, "loss": 0.0167, "step": 13830 }, { "grad_norm": 0.29354432225227356, "learning_rate": 9.13400828080792e-05, "loss": 0.0139, "step": 13840 }, { "grad_norm": 0.3396795392036438, "learning_rate": 9.132457542738292e-05, "loss": 0.0131, "step": 13850 }, { "grad_norm": 0.35870692133903503, "learning_rate": 9.130905549337174e-05, "loss": 0.0167, "step": 13860 }, { "grad_norm": 0.27290400862693787, "learning_rate": 9.129352301076021e-05, "loss": 0.0124, "step": 13870 }, { "grad_norm": 0.32596784830093384, "learning_rate": 9.127797798426668e-05, "loss": 0.014, "step": 13880 }, { "grad_norm": 0.37084895372390747, "learning_rate": 9.126242041861333e-05, "loss": 0.0159, "step": 13890 }, { "grad_norm": 0.34786126017570496, "learning_rate": 9.124685031852611e-05, "loss": 0.0167, "step": 13900 }, { "grad_norm": 0.3394896984100342, "learning_rate": 9.123126768873482e-05, "loss": 0.0142, "step": 13910 }, { "grad_norm": 0.3149508237838745, "learning_rate": 9.121567253397308e-05, "loss": 0.0159, "step": 13920 }, { "grad_norm": 0.2656719386577606, "learning_rate": 9.120006485897824e-05, "loss": 0.0136, "step": 13930 }, { "grad_norm": 0.3305380344390869, "learning_rate": 9.118444466849152e-05, "loss": 0.0155, "step": 13940 }, { "grad_norm": 0.2971353828907013, "learning_rate": 9.116881196725793e-05, "loss": 0.0166, "step": 13950 }, { "grad_norm": 0.36611509323120117, "learning_rate": 9.115316676002627e-05, "loss": 0.015, "step": 13960 }, { "grad_norm": 0.26781439781188965, "learning_rate": 9.113750905154911e-05, "loss": 0.0126, "step": 13970 }, { "grad_norm": 0.40599337220191956, "learning_rate": 9.112183884658289e-05, "loss": 0.0149, "step": 13980 }, { "grad_norm": 0.3325079679489136, "learning_rate": 9.11061561498878e-05, "loss": 0.0128, "step": 13990 }, { "grad_norm": 0.36591359972953796, "learning_rate": 9.109046096622779e-05, "loss": 0.0129, "step": 14000 }, { "grad_norm": 0.3110436499118805, "learning_rate": 9.107475330037069e-05, "loss": 0.0144, "step": 14010 }, { "grad_norm": 0.42516830563545227, "learning_rate": 9.105903315708806e-05, "loss": 0.0169, "step": 14020 }, { "grad_norm": 0.3548910617828369, "learning_rate": 9.104330054115524e-05, "loss": 0.0154, "step": 14030 }, { "grad_norm": 0.2738393247127533, "learning_rate": 9.102755545735141e-05, "loss": 0.0141, "step": 14040 }, { "grad_norm": 0.31736063957214355, "learning_rate": 9.10117979104595e-05, "loss": 0.0117, "step": 14050 }, { "grad_norm": 0.2881460189819336, "learning_rate": 9.099602790526624e-05, "loss": 0.0132, "step": 14060 }, { "grad_norm": 0.3362765610218048, "learning_rate": 9.098024544656212e-05, "loss": 0.0144, "step": 14070 }, { "grad_norm": 0.24119043350219727, "learning_rate": 9.096445053914148e-05, "loss": 0.015, "step": 14080 }, { "grad_norm": 0.41081079840660095, "learning_rate": 9.094864318780236e-05, "loss": 0.0139, "step": 14090 }, { "grad_norm": 0.35552293062210083, "learning_rate": 9.093282339734663e-05, "loss": 0.0167, "step": 14100 }, { "grad_norm": 0.29461267590522766, "learning_rate": 9.091699117257992e-05, "loss": 0.0122, "step": 14110 }, { "grad_norm": 0.312937468290329, "learning_rate": 9.090114651831163e-05, "loss": 0.0134, "step": 14120 }, { "grad_norm": 0.39879798889160156, "learning_rate": 9.088528943935497e-05, "loss": 0.0158, "step": 14130 }, { "grad_norm": 0.36404165625572205, "learning_rate": 9.086941994052689e-05, "loss": 0.0171, "step": 14140 }, { "grad_norm": 0.3130766451358795, "learning_rate": 9.085353802664813e-05, "loss": 0.015, "step": 14150 }, { "grad_norm": 0.3249577581882477, "learning_rate": 9.08376437025432e-05, "loss": 0.0155, "step": 14160 }, { "grad_norm": 0.32672080397605896, "learning_rate": 9.082173697304035e-05, "loss": 0.0164, "step": 14170 }, { "grad_norm": 0.3939981460571289, "learning_rate": 9.080581784297166e-05, "loss": 0.014, "step": 14180 }, { "grad_norm": 0.3205048441886902, "learning_rate": 9.078988631717291e-05, "loss": 0.0136, "step": 14190 }, { "grad_norm": 0.4194487929344177, "learning_rate": 9.077394240048369e-05, "loss": 0.0152, "step": 14200 }, { "grad_norm": 0.3741471469402313, "learning_rate": 9.075798609774736e-05, "loss": 0.0152, "step": 14210 }, { "grad_norm": 0.3669159412384033, "learning_rate": 9.0742017413811e-05, "loss": 0.0154, "step": 14220 }, { "grad_norm": 0.35949963331222534, "learning_rate": 9.072603635352548e-05, "loss": 0.0167, "step": 14230 }, { "grad_norm": 0.3275270164012909, "learning_rate": 9.071004292174541e-05, "loss": 0.0129, "step": 14240 }, { "grad_norm": 0.35712912678718567, "learning_rate": 9.06940371233292e-05, "loss": 0.0159, "step": 14250 }, { "grad_norm": 0.29049134254455566, "learning_rate": 9.067801896313898e-05, "loss": 0.0161, "step": 14260 }, { "grad_norm": 0.3525155484676361, "learning_rate": 9.066198844604064e-05, "loss": 0.0133, "step": 14270 }, { "grad_norm": 0.330254465341568, "learning_rate": 9.06459455769038e-05, "loss": 0.0141, "step": 14280 }, { "grad_norm": 0.2980960011482239, "learning_rate": 9.062989036060193e-05, "loss": 0.0105, "step": 14290 }, { "grad_norm": 0.305086612701416, "learning_rate": 9.061382280201212e-05, "loss": 0.0135, "step": 14300 }, { "grad_norm": 0.3218506872653961, "learning_rate": 9.059774290601528e-05, "loss": 0.0131, "step": 14310 }, { "grad_norm": 0.36173155903816223, "learning_rate": 9.058165067749606e-05, "loss": 0.0179, "step": 14320 }, { "grad_norm": 0.3954292833805084, "learning_rate": 9.056554612134288e-05, "loss": 0.0166, "step": 14330 }, { "grad_norm": 0.3722546398639679, "learning_rate": 9.054942924244785e-05, "loss": 0.0152, "step": 14340 }, { "grad_norm": 0.3163178861141205, "learning_rate": 9.053330004570686e-05, "loss": 0.0138, "step": 14350 }, { "grad_norm": 0.3141244649887085, "learning_rate": 9.051715853601955e-05, "loss": 0.0176, "step": 14360 }, { "grad_norm": 0.3372696340084076, "learning_rate": 9.050100471828926e-05, "loss": 0.0154, "step": 14370 }, { "grad_norm": 0.32954317331314087, "learning_rate": 9.048483859742311e-05, "loss": 0.0164, "step": 14380 }, { "grad_norm": 0.3441936671733856, "learning_rate": 9.046866017833193e-05, "loss": 0.0151, "step": 14390 }, { "grad_norm": 0.3688376843929291, "learning_rate": 9.045246946593029e-05, "loss": 0.014, "step": 14400 }, { "grad_norm": 0.3306489586830139, "learning_rate": 9.043626646513652e-05, "loss": 0.0157, "step": 14410 }, { "grad_norm": 0.3034699857234955, "learning_rate": 9.042005118087267e-05, "loss": 0.0147, "step": 14420 }, { "grad_norm": 0.2809334993362427, "learning_rate": 9.040382361806448e-05, "loss": 0.0123, "step": 14430 }, { "grad_norm": 0.3783026337623596, "learning_rate": 9.038758378164148e-05, "loss": 0.0137, "step": 14440 }, { "grad_norm": 0.4293455481529236, "learning_rate": 9.037133167653691e-05, "loss": 0.0229, "step": 14450 }, { "grad_norm": 0.3802051842212677, "learning_rate": 9.035506730768771e-05, "loss": 0.0191, "step": 14460 }, { "grad_norm": 0.42028242349624634, "learning_rate": 9.033879068003458e-05, "loss": 0.0196, "step": 14470 }, { "grad_norm": 0.4385514259338379, "learning_rate": 9.032250179852193e-05, "loss": 0.0195, "step": 14480 }, { "grad_norm": 0.3730545938014984, "learning_rate": 9.030620066809787e-05, "loss": 0.0165, "step": 14490 }, { "grad_norm": 0.393687903881073, "learning_rate": 9.028988729371428e-05, "loss": 0.0125, "step": 14500 }, { "grad_norm": 0.3687753975391388, "learning_rate": 9.027356168032673e-05, "loss": 0.0147, "step": 14510 }, { "grad_norm": 0.3404821753501892, "learning_rate": 9.02572238328945e-05, "loss": 0.0126, "step": 14520 }, { "grad_norm": 0.363484650850296, "learning_rate": 9.02408737563806e-05, "loss": 0.016, "step": 14530 }, { "grad_norm": 0.5019651651382446, "learning_rate": 9.022451145575174e-05, "loss": 0.0161, "step": 14540 }, { "grad_norm": 0.3429701626300812, "learning_rate": 9.02081369359784e-05, "loss": 0.0168, "step": 14550 }, { "grad_norm": 0.38273975253105164, "learning_rate": 9.019175020203465e-05, "loss": 0.0147, "step": 14560 }, { "grad_norm": 0.3277057409286499, "learning_rate": 9.017535125889842e-05, "loss": 0.0159, "step": 14570 }, { "grad_norm": 0.36684003472328186, "learning_rate": 9.015894011155124e-05, "loss": 0.0156, "step": 14580 }, { "grad_norm": 0.35012102127075195, "learning_rate": 9.014251676497838e-05, "loss": 0.0139, "step": 14590 }, { "grad_norm": 0.24774228036403656, "learning_rate": 9.012608122416884e-05, "loss": 0.0149, "step": 14600 }, { "grad_norm": 0.31932333111763, "learning_rate": 9.010963349411529e-05, "loss": 0.0164, "step": 14610 }, { "grad_norm": 0.28318291902542114, "learning_rate": 9.00931735798141e-05, "loss": 0.0134, "step": 14620 }, { "grad_norm": 0.3513835370540619, "learning_rate": 9.00767014862654e-05, "loss": 0.0188, "step": 14630 }, { "grad_norm": 0.43248969316482544, "learning_rate": 9.006021721847295e-05, "loss": 0.016, "step": 14640 }, { "grad_norm": 0.3420730233192444, "learning_rate": 9.004372078144423e-05, "loss": 0.0146, "step": 14650 }, { "grad_norm": 0.38896235823631287, "learning_rate": 9.002721218019043e-05, "loss": 0.0143, "step": 14660 }, { "grad_norm": 0.384267657995224, "learning_rate": 9.001069141972642e-05, "loss": 0.0131, "step": 14670 }, { "grad_norm": 0.3028285503387451, "learning_rate": 8.99941585050708e-05, "loss": 0.0142, "step": 14680 }, { "grad_norm": 0.2744828462600708, "learning_rate": 8.997761344124578e-05, "loss": 0.0117, "step": 14690 }, { "grad_norm": 0.27059218287467957, "learning_rate": 8.996105623327737e-05, "loss": 0.0166, "step": 14700 }, { "grad_norm": 0.34469544887542725, "learning_rate": 8.994448688619517e-05, "loss": 0.0242, "step": 14710 }, { "grad_norm": 0.4304150938987732, "learning_rate": 8.992790540503253e-05, "loss": 0.018, "step": 14720 }, { "grad_norm": 0.36087173223495483, "learning_rate": 8.991131179482648e-05, "loss": 0.0165, "step": 14730 }, { "grad_norm": 0.3183897137641907, "learning_rate": 8.989470606061768e-05, "loss": 0.0196, "step": 14740 }, { "grad_norm": 0.4256609082221985, "learning_rate": 8.987808820745056e-05, "loss": 0.0143, "step": 14750 }, { "grad_norm": 0.2877207398414612, "learning_rate": 8.986145824037315e-05, "loss": 0.0151, "step": 14760 }, { "grad_norm": 0.3889293968677521, "learning_rate": 8.984481616443721e-05, "loss": 0.0185, "step": 14770 }, { "grad_norm": 0.28330937027931213, "learning_rate": 8.982816198469815e-05, "loss": 0.0144, "step": 14780 }, { "grad_norm": 0.2906559407711029, "learning_rate": 8.98114957062151e-05, "loss": 0.0177, "step": 14790 }, { "grad_norm": 0.3713333308696747, "learning_rate": 8.97948173340508e-05, "loss": 0.0125, "step": 14800 }, { "grad_norm": 0.36049315333366394, "learning_rate": 8.977812687327172e-05, "loss": 0.0145, "step": 14810 }, { "grad_norm": 0.3280653655529022, "learning_rate": 8.976142432894798e-05, "loss": 0.0137, "step": 14820 }, { "grad_norm": 0.3982951045036316, "learning_rate": 8.974470970615336e-05, "loss": 0.0139, "step": 14830 }, { "grad_norm": 0.3464769721031189, "learning_rate": 8.972798300996534e-05, "loss": 0.012, "step": 14840 }, { "grad_norm": 0.39642462134361267, "learning_rate": 8.971124424546504e-05, "loss": 0.0126, "step": 14850 }, { "grad_norm": 0.29961955547332764, "learning_rate": 8.969449341773724e-05, "loss": 0.0138, "step": 14860 }, { "grad_norm": 0.3007674813270569, "learning_rate": 8.967773053187042e-05, "loss": 0.0132, "step": 14870 }, { "grad_norm": 0.41528192162513733, "learning_rate": 8.966095559295668e-05, "loss": 0.0136, "step": 14880 }, { "grad_norm": 0.29742881655693054, "learning_rate": 8.964416860609184e-05, "loss": 0.0144, "step": 14890 }, { "grad_norm": 0.3091881573200226, "learning_rate": 8.962736957637532e-05, "loss": 0.0144, "step": 14900 }, { "grad_norm": 0.30855152010917664, "learning_rate": 8.96105585089102e-05, "loss": 0.0114, "step": 14910 }, { "grad_norm": 0.4418489336967468, "learning_rate": 8.959373540880329e-05, "loss": 0.0197, "step": 14920 }, { "grad_norm": 0.3194306492805481, "learning_rate": 8.957690028116495e-05, "loss": 0.0149, "step": 14930 }, { "grad_norm": 0.34865105152130127, "learning_rate": 8.956005313110928e-05, "loss": 0.0147, "step": 14940 }, { "grad_norm": 0.3382810652256012, "learning_rate": 8.9543193963754e-05, "loss": 0.0142, "step": 14950 }, { "grad_norm": 0.3505389094352722, "learning_rate": 8.952632278422048e-05, "loss": 0.0125, "step": 14960 }, { "grad_norm": 0.3530474603176117, "learning_rate": 8.95094395976337e-05, "loss": 0.0117, "step": 14970 }, { "grad_norm": 0.4002842903137207, "learning_rate": 8.949254440912239e-05, "loss": 0.0143, "step": 14980 }, { "grad_norm": 0.3761323094367981, "learning_rate": 8.94756372238188e-05, "loss": 0.0186, "step": 14990 }, { "grad_norm": 0.4517865478992462, "learning_rate": 8.945871804685892e-05, "loss": 0.0137, "step": 15000 }, { "grad_norm": 0.3776147663593292, "learning_rate": 8.944178688338236e-05, "loss": 0.0126, "step": 15010 }, { "grad_norm": 0.2930288016796112, "learning_rate": 8.942484373853233e-05, "loss": 0.0116, "step": 15020 }, { "grad_norm": 0.2978624105453491, "learning_rate": 8.940788861745572e-05, "loss": 0.015, "step": 15030 }, { "grad_norm": 0.3503165543079376, "learning_rate": 8.939092152530308e-05, "loss": 0.0171, "step": 15040 }, { "grad_norm": 0.25890377163887024, "learning_rate": 8.937394246722853e-05, "loss": 0.0122, "step": 15050 }, { "grad_norm": 0.29408955574035645, "learning_rate": 8.935695144838984e-05, "loss": 0.0137, "step": 15060 }, { "grad_norm": 0.3139441907405853, "learning_rate": 8.933994847394849e-05, "loss": 0.0136, "step": 15070 }, { "grad_norm": 0.25799211859703064, "learning_rate": 8.932293354906949e-05, "loss": 0.0111, "step": 15080 }, { "grad_norm": 0.2083788514137268, "learning_rate": 8.930590667892153e-05, "loss": 0.0121, "step": 15090 }, { "grad_norm": 0.3402978777885437, "learning_rate": 8.928886786867696e-05, "loss": 0.0127, "step": 15100 }, { "grad_norm": 0.29362717270851135, "learning_rate": 8.927181712351168e-05, "loss": 0.0186, "step": 15110 }, { "grad_norm": 0.3671301603317261, "learning_rate": 8.925475444860527e-05, "loss": 0.0168, "step": 15120 }, { "grad_norm": 0.36894676089286804, "learning_rate": 8.923767984914092e-05, "loss": 0.0149, "step": 15130 }, { "grad_norm": 0.28404465317726135, "learning_rate": 8.922059333030545e-05, "loss": 0.012, "step": 15140 }, { "grad_norm": 0.24064715206623077, "learning_rate": 8.920349489728928e-05, "loss": 0.0145, "step": 15150 }, { "grad_norm": 0.3710125982761383, "learning_rate": 8.918638455528646e-05, "loss": 0.0124, "step": 15160 }, { "grad_norm": 0.3326571583747864, "learning_rate": 8.916926230949468e-05, "loss": 0.0141, "step": 15170 }, { "grad_norm": 0.3539113700389862, "learning_rate": 8.915212816511522e-05, "loss": 0.0117, "step": 15180 }, { "grad_norm": 0.3796995282173157, "learning_rate": 8.913498212735296e-05, "loss": 0.0173, "step": 15190 }, { "grad_norm": 0.37052762508392334, "learning_rate": 8.911782420141643e-05, "loss": 0.0148, "step": 15200 }, { "grad_norm": 0.2812616229057312, "learning_rate": 8.910065439251775e-05, "loss": 0.0126, "step": 15210 }, { "grad_norm": 0.34295588731765747, "learning_rate": 8.908347270587268e-05, "loss": 0.0127, "step": 15220 }, { "grad_norm": 0.35617485642433167, "learning_rate": 8.906627914670054e-05, "loss": 0.0127, "step": 15230 }, { "grad_norm": 0.3306311368942261, "learning_rate": 8.904907372022427e-05, "loss": 0.0162, "step": 15240 }, { "grad_norm": 0.33233681321144104, "learning_rate": 8.903185643167042e-05, "loss": 0.0182, "step": 15250 }, { "grad_norm": 0.32985642552375793, "learning_rate": 8.901462728626919e-05, "loss": 0.0141, "step": 15260 }, { "grad_norm": 0.4287532567977905, "learning_rate": 8.899738628925429e-05, "loss": 0.0176, "step": 15270 }, { "grad_norm": 0.3267117142677307, "learning_rate": 8.898013344586312e-05, "loss": 0.0141, "step": 15280 }, { "grad_norm": 0.3373110890388489, "learning_rate": 8.896286876133661e-05, "loss": 0.0173, "step": 15290 }, { "grad_norm": 0.3415488004684448, "learning_rate": 8.894559224091933e-05, "loss": 0.013, "step": 15300 }, { "grad_norm": 0.4470892548561096, "learning_rate": 8.892830388985942e-05, "loss": 0.0149, "step": 15310 }, { "grad_norm": 0.42608416080474854, "learning_rate": 8.891100371340864e-05, "loss": 0.0179, "step": 15320 }, { "grad_norm": 0.3032132387161255, "learning_rate": 8.889369171682231e-05, "loss": 0.0168, "step": 15330 }, { "grad_norm": 0.2819351851940155, "learning_rate": 8.887636790535936e-05, "loss": 0.015, "step": 15340 }, { "grad_norm": 0.3128340244293213, "learning_rate": 8.885903228428231e-05, "loss": 0.0138, "step": 15350 }, { "grad_norm": 0.3368318974971771, "learning_rate": 8.884168485885727e-05, "loss": 0.0141, "step": 15360 }, { "grad_norm": 0.3355594277381897, "learning_rate": 8.882432563435393e-05, "loss": 0.013, "step": 15370 }, { "grad_norm": 0.2632829546928406, "learning_rate": 8.880695461604556e-05, "loss": 0.0132, "step": 15380 }, { "grad_norm": 0.2651746869087219, "learning_rate": 8.878957180920901e-05, "loss": 0.0185, "step": 15390 }, { "grad_norm": 0.39096003770828247, "learning_rate": 8.877217721912473e-05, "loss": 0.0155, "step": 15400 }, { "grad_norm": 0.33127567172050476, "learning_rate": 8.875477085107673e-05, "loss": 0.0131, "step": 15410 }, { "grad_norm": 0.2860681116580963, "learning_rate": 8.87373527103526e-05, "loss": 0.0176, "step": 15420 }, { "grad_norm": 0.3429126441478729, "learning_rate": 8.871992280224353e-05, "loss": 0.0113, "step": 15430 }, { "grad_norm": 0.4041215777397156, "learning_rate": 8.870248113204422e-05, "loss": 0.0127, "step": 15440 }, { "grad_norm": 0.3912096619606018, "learning_rate": 8.868502770505306e-05, "loss": 0.0152, "step": 15450 }, { "grad_norm": 0.3423306345939636, "learning_rate": 8.86675625265719e-05, "loss": 0.0147, "step": 15460 }, { "grad_norm": 0.4177573323249817, "learning_rate": 8.865008560190618e-05, "loss": 0.0153, "step": 15470 }, { "grad_norm": 0.31812793016433716, "learning_rate": 8.863259693636496e-05, "loss": 0.0126, "step": 15480 }, { "grad_norm": 0.26776182651519775, "learning_rate": 8.861509653526083e-05, "loss": 0.0159, "step": 15490 }, { "grad_norm": 0.31965771317481995, "learning_rate": 8.859758440390993e-05, "loss": 0.0117, "step": 15500 }, { "grad_norm": 0.3053452670574188, "learning_rate": 8.858006054763202e-05, "loss": 0.0118, "step": 15510 }, { "grad_norm": 0.3347854018211365, "learning_rate": 8.856252497175035e-05, "loss": 0.0126, "step": 15520 }, { "grad_norm": 0.2923136055469513, "learning_rate": 8.854497768159178e-05, "loss": 0.0204, "step": 15530 }, { "grad_norm": 0.32943835854530334, "learning_rate": 8.852741868248671e-05, "loss": 0.0164, "step": 15540 }, { "grad_norm": 0.28264474868774414, "learning_rate": 8.85098479797691e-05, "loss": 0.0145, "step": 15550 }, { "grad_norm": 0.3556481897830963, "learning_rate": 8.849226557877646e-05, "loss": 0.0126, "step": 15560 }, { "grad_norm": 0.2995128035545349, "learning_rate": 8.84746714848499e-05, "loss": 0.012, "step": 15570 }, { "grad_norm": 0.40936970710754395, "learning_rate": 8.845706570333397e-05, "loss": 0.0126, "step": 15580 }, { "grad_norm": 0.38154274225234985, "learning_rate": 8.84394482395769e-05, "loss": 0.0141, "step": 15590 }, { "grad_norm": 0.32766827940940857, "learning_rate": 8.842181909893038e-05, "loss": 0.013, "step": 15600 }, { "grad_norm": 0.3190121650695801, "learning_rate": 8.840417828674969e-05, "loss": 0.0125, "step": 15610 }, { "grad_norm": 0.3312472701072693, "learning_rate": 8.838652580839364e-05, "loss": 0.0127, "step": 15620 }, { "grad_norm": 0.35937488079071045, "learning_rate": 8.836886166922458e-05, "loss": 0.0111, "step": 15630 }, { "grad_norm": 0.38958367705345154, "learning_rate": 8.835118587460844e-05, "loss": 0.0116, "step": 15640 }, { "grad_norm": 0.3366395831108093, "learning_rate": 8.83334984299146e-05, "loss": 0.0166, "step": 15650 }, { "grad_norm": 0.2910676896572113, "learning_rate": 8.83157993405161e-05, "loss": 0.0144, "step": 15660 }, { "grad_norm": 0.2860357463359833, "learning_rate": 8.829808861178943e-05, "loss": 0.0128, "step": 15670 }, { "grad_norm": 0.405865877866745, "learning_rate": 8.828036624911464e-05, "loss": 0.011, "step": 15680 }, { "grad_norm": 0.3807377219200134, "learning_rate": 8.826263225787532e-05, "loss": 0.0148, "step": 15690 }, { "grad_norm": 0.3049972355365753, "learning_rate": 8.824488664345858e-05, "loss": 0.0125, "step": 15700 }, { "grad_norm": 0.24119381606578827, "learning_rate": 8.822712941125508e-05, "loss": 0.0147, "step": 15710 }, { "grad_norm": 0.3006763756275177, "learning_rate": 8.820936056665898e-05, "loss": 0.0147, "step": 15720 }, { "grad_norm": 0.3721540868282318, "learning_rate": 8.819158011506801e-05, "loss": 0.0132, "step": 15730 }, { "grad_norm": 0.329062283039093, "learning_rate": 8.81737880618834e-05, "loss": 0.0148, "step": 15740 }, { "grad_norm": 0.330485999584198, "learning_rate": 8.815598441250987e-05, "loss": 0.0125, "step": 15750 }, { "grad_norm": 0.32903993129730225, "learning_rate": 8.813816917235576e-05, "loss": 0.0143, "step": 15760 }, { "grad_norm": 0.4610109329223633, "learning_rate": 8.812034234683282e-05, "loss": 0.0144, "step": 15770 }, { "grad_norm": 0.2858344316482544, "learning_rate": 8.810250394135637e-05, "loss": 0.0154, "step": 15780 }, { "grad_norm": 0.3090382218360901, "learning_rate": 8.808465396134529e-05, "loss": 0.0138, "step": 15790 }, { "grad_norm": 0.4043864607810974, "learning_rate": 8.806679241222189e-05, "loss": 0.0149, "step": 15800 }, { "grad_norm": 0.41810423135757446, "learning_rate": 8.804891929941203e-05, "loss": 0.0131, "step": 15810 }, { "grad_norm": 0.30472487211227417, "learning_rate": 8.803103462834514e-05, "loss": 0.015, "step": 15820 }, { "grad_norm": 0.4549807608127594, "learning_rate": 8.801313840445408e-05, "loss": 0.0154, "step": 15830 }, { "grad_norm": 0.29225510358810425, "learning_rate": 8.799523063317524e-05, "loss": 0.0131, "step": 15840 }, { "grad_norm": 0.3015229105949402, "learning_rate": 8.797731131994854e-05, "loss": 0.0162, "step": 15850 }, { "grad_norm": 0.3149995505809784, "learning_rate": 8.795938047021739e-05, "loss": 0.0117, "step": 15860 }, { "grad_norm": 0.33604562282562256, "learning_rate": 8.794143808942872e-05, "loss": 0.0126, "step": 15870 }, { "grad_norm": 0.3729907274246216, "learning_rate": 8.792348418303296e-05, "loss": 0.0177, "step": 15880 }, { "grad_norm": 0.49357935786247253, "learning_rate": 8.790551875648398e-05, "loss": 0.0133, "step": 15890 }, { "grad_norm": 0.4085029065608978, "learning_rate": 8.788754181523926e-05, "loss": 0.0166, "step": 15900 }, { "grad_norm": 0.33292534947395325, "learning_rate": 8.78695533647597e-05, "loss": 0.019, "step": 15910 }, { "grad_norm": 0.37383317947387695, "learning_rate": 8.785155341050972e-05, "loss": 0.0159, "step": 15920 }, { "grad_norm": 0.43480196595191956, "learning_rate": 8.783354195795721e-05, "loss": 0.0227, "step": 15930 }, { "grad_norm": 0.3275454342365265, "learning_rate": 8.78155190125736e-05, "loss": 0.014, "step": 15940 }, { "grad_norm": 0.2989807724952698, "learning_rate": 8.779748457983378e-05, "loss": 0.0159, "step": 15950 }, { "grad_norm": 0.40545862913131714, "learning_rate": 8.777943866521612e-05, "loss": 0.0182, "step": 15960 }, { "grad_norm": 0.43560394644737244, "learning_rate": 8.77613812742025e-05, "loss": 0.0139, "step": 15970 }, { "grad_norm": 0.3351658582687378, "learning_rate": 8.774331241227829e-05, "loss": 0.0143, "step": 15980 }, { "grad_norm": 0.27033624053001404, "learning_rate": 8.772523208493232e-05, "loss": 0.0123, "step": 15990 }, { "grad_norm": 0.2905561029911041, "learning_rate": 8.770714029765692e-05, "loss": 0.0114, "step": 16000 }, { "grad_norm": 0.300875186920166, "learning_rate": 8.768903705594789e-05, "loss": 0.0114, "step": 16010 }, { "grad_norm": 0.5580713152885437, "learning_rate": 8.767092236530453e-05, "loss": 0.0101, "step": 16020 }, { "grad_norm": 0.2916533648967743, "learning_rate": 8.76527962312296e-05, "loss": 0.0117, "step": 16030 }, { "grad_norm": 0.2396416813135147, "learning_rate": 8.763465865922934e-05, "loss": 0.0136, "step": 16040 }, { "grad_norm": 0.36523109674453735, "learning_rate": 8.761650965481347e-05, "loss": 0.014, "step": 16050 }, { "grad_norm": 0.2911982536315918, "learning_rate": 8.759834922349516e-05, "loss": 0.0123, "step": 16060 }, { "grad_norm": 0.32615402340888977, "learning_rate": 8.758017737079108e-05, "loss": 0.0136, "step": 16070 }, { "grad_norm": 0.32375550270080566, "learning_rate": 8.756199410222137e-05, "loss": 0.0153, "step": 16080 }, { "grad_norm": 0.3893955945968628, "learning_rate": 8.754379942330963e-05, "loss": 0.0143, "step": 16090 }, { "grad_norm": 0.2964588403701782, "learning_rate": 8.75255933395829e-05, "loss": 0.015, "step": 16100 }, { "grad_norm": 0.23883485794067383, "learning_rate": 8.750737585657171e-05, "loss": 0.0139, "step": 16110 }, { "grad_norm": 0.28332650661468506, "learning_rate": 8.748914697981008e-05, "loss": 0.0133, "step": 16120 }, { "grad_norm": 0.3785288333892822, "learning_rate": 8.747090671483542e-05, "loss": 0.0111, "step": 16130 }, { "grad_norm": 0.31500139832496643, "learning_rate": 8.745265506718869e-05, "loss": 0.0151, "step": 16140 }, { "grad_norm": 0.3582603633403778, "learning_rate": 8.74343920424142e-05, "loss": 0.0115, "step": 16150 }, { "grad_norm": 0.3371714949607849, "learning_rate": 8.741611764605982e-05, "loss": 0.0184, "step": 16160 }, { "grad_norm": 0.4119216799736023, "learning_rate": 8.739783188367682e-05, "loss": 0.0127, "step": 16170 }, { "grad_norm": 0.2599102854728699, "learning_rate": 8.737953476081991e-05, "loss": 0.0166, "step": 16180 }, { "grad_norm": 0.44500142335891724, "learning_rate": 8.73612262830473e-05, "loss": 0.0134, "step": 16190 }, { "grad_norm": 0.4116831123828888, "learning_rate": 8.734290645592061e-05, "loss": 0.0188, "step": 16200 }, { "grad_norm": 0.27751389145851135, "learning_rate": 8.732457528500493e-05, "loss": 0.011, "step": 16210 }, { "grad_norm": 0.3647110164165497, "learning_rate": 8.730623277586875e-05, "loss": 0.0135, "step": 16220 }, { "grad_norm": 0.3718293309211731, "learning_rate": 8.72878789340841e-05, "loss": 0.0134, "step": 16230 }, { "grad_norm": 0.2759992778301239, "learning_rate": 8.726951376522635e-05, "loss": 0.013, "step": 16240 }, { "grad_norm": 0.3869374394416809, "learning_rate": 8.725113727487435e-05, "loss": 0.0119, "step": 16250 }, { "grad_norm": 0.2692963778972626, "learning_rate": 8.723274946861042e-05, "loss": 0.0133, "step": 16260 }, { "grad_norm": 0.4123724400997162, "learning_rate": 8.721435035202026e-05, "loss": 0.0132, "step": 16270 }, { "grad_norm": 0.31288018822669983, "learning_rate": 8.719593993069306e-05, "loss": 0.0121, "step": 16280 }, { "grad_norm": 0.24861818552017212, "learning_rate": 8.717751821022139e-05, "loss": 0.0173, "step": 16290 }, { "grad_norm": 0.3155044615268707, "learning_rate": 8.715908519620134e-05, "loss": 0.0123, "step": 16300 }, { "grad_norm": 0.4249679148197174, "learning_rate": 8.71406408942323e-05, "loss": 0.0144, "step": 16310 }, { "grad_norm": 0.30136585235595703, "learning_rate": 8.712218530991723e-05, "loss": 0.0127, "step": 16320 }, { "grad_norm": 0.3271821141242981, "learning_rate": 8.710371844886241e-05, "loss": 0.0152, "step": 16330 }, { "grad_norm": 0.27764326333999634, "learning_rate": 8.708524031667758e-05, "loss": 0.012, "step": 16340 }, { "grad_norm": 0.4021338224411011, "learning_rate": 8.706675091897592e-05, "loss": 0.0129, "step": 16350 }, { "grad_norm": 0.30744513869285583, "learning_rate": 8.704825026137404e-05, "loss": 0.0139, "step": 16360 }, { "grad_norm": 0.322570264339447, "learning_rate": 8.702973834949192e-05, "loss": 0.0134, "step": 16370 }, { "grad_norm": 0.32617056369781494, "learning_rate": 8.701121518895301e-05, "loss": 0.0149, "step": 16380 }, { "grad_norm": 0.27813100814819336, "learning_rate": 8.699268078538414e-05, "loss": 0.0124, "step": 16390 }, { "grad_norm": 0.3387972414493561, "learning_rate": 8.69741351444156e-05, "loss": 0.0133, "step": 16400 }, { "grad_norm": 0.3083968460559845, "learning_rate": 8.695557827168101e-05, "loss": 0.0132, "step": 16410 }, { "grad_norm": 0.3003661036491394, "learning_rate": 8.693701017281753e-05, "loss": 0.0116, "step": 16420 }, { "grad_norm": 0.2909878194332123, "learning_rate": 8.691843085346563e-05, "loss": 0.0112, "step": 16430 }, { "grad_norm": 0.38834142684936523, "learning_rate": 8.689984031926919e-05, "loss": 0.0131, "step": 16440 }, { "grad_norm": 0.325787752866745, "learning_rate": 8.688123857587555e-05, "loss": 0.0116, "step": 16450 }, { "grad_norm": 0.2978907823562622, "learning_rate": 8.686262562893544e-05, "loss": 0.0138, "step": 16460 }, { "grad_norm": 0.2757437527179718, "learning_rate": 8.684400148410294e-05, "loss": 0.0139, "step": 16470 }, { "grad_norm": 0.27132338285446167, "learning_rate": 8.682536614703562e-05, "loss": 0.0155, "step": 16480 }, { "grad_norm": 0.3353572189807892, "learning_rate": 8.680671962339437e-05, "loss": 0.0158, "step": 16490 }, { "grad_norm": 0.347239226102829, "learning_rate": 8.678806191884352e-05, "loss": 0.0136, "step": 16500 }, { "grad_norm": 0.3027389943599701, "learning_rate": 8.67693930390508e-05, "loss": 0.0126, "step": 16510 }, { "grad_norm": 0.4040740132331848, "learning_rate": 8.67507129896873e-05, "loss": 0.0125, "step": 16520 }, { "grad_norm": 0.34633320569992065, "learning_rate": 8.673202177642757e-05, "loss": 0.0127, "step": 16530 }, { "grad_norm": 0.28745752573013306, "learning_rate": 8.671331940494945e-05, "loss": 0.0128, "step": 16540 }, { "grad_norm": 0.28032582998275757, "learning_rate": 8.669460588093427e-05, "loss": 0.0144, "step": 16550 }, { "grad_norm": 0.41900312900543213, "learning_rate": 8.667588121006667e-05, "loss": 0.0136, "step": 16560 }, { "grad_norm": 0.30153435468673706, "learning_rate": 8.665714539803475e-05, "loss": 0.013, "step": 16570 }, { "grad_norm": 0.30236735939979553, "learning_rate": 8.663839845052993e-05, "loss": 0.0155, "step": 16580 }, { "grad_norm": 0.2843511402606964, "learning_rate": 8.661964037324703e-05, "loss": 0.0125, "step": 16590 }, { "grad_norm": 0.3118680417537689, "learning_rate": 8.660087117188427e-05, "loss": 0.0114, "step": 16600 }, { "grad_norm": 0.32381507754325867, "learning_rate": 8.658209085214325e-05, "loss": 0.0147, "step": 16610 }, { "grad_norm": 0.3829399645328522, "learning_rate": 8.656329941972891e-05, "loss": 0.0139, "step": 16620 }, { "grad_norm": 0.27941471338272095, "learning_rate": 8.654449688034963e-05, "loss": 0.0128, "step": 16630 }, { "grad_norm": 0.3418424129486084, "learning_rate": 8.652568323971706e-05, "loss": 0.0121, "step": 16640 }, { "grad_norm": 0.3883926570415497, "learning_rate": 8.650685850354636e-05, "loss": 0.0139, "step": 16650 }, { "grad_norm": 0.32417985796928406, "learning_rate": 8.648802267755593e-05, "loss": 0.0125, "step": 16660 }, { "grad_norm": 0.42773956060409546, "learning_rate": 8.646917576746764e-05, "loss": 0.0133, "step": 16670 }, { "grad_norm": 0.31513354182243347, "learning_rate": 8.645031777900666e-05, "loss": 0.0117, "step": 16680 }, { "grad_norm": 0.2895990014076233, "learning_rate": 8.643144871790154e-05, "loss": 0.0125, "step": 16690 }, { "grad_norm": 0.42464783787727356, "learning_rate": 8.641256858988424e-05, "loss": 0.018, "step": 16700 }, { "grad_norm": 0.33051595091819763, "learning_rate": 8.639367740069e-05, "loss": 0.0122, "step": 16710 }, { "grad_norm": 0.3542407155036926, "learning_rate": 8.63747751560575e-05, "loss": 0.0172, "step": 16720 }, { "grad_norm": 0.2745692729949951, "learning_rate": 8.635586186172871e-05, "loss": 0.0124, "step": 16730 }, { "grad_norm": 0.3380177319049835, "learning_rate": 8.633693752344902e-05, "loss": 0.0107, "step": 16740 }, { "grad_norm": 0.36971718072891235, "learning_rate": 8.631800214696713e-05, "loss": 0.011, "step": 16750 }, { "grad_norm": 0.45011842250823975, "learning_rate": 8.629905573803511e-05, "loss": 0.0135, "step": 16760 }, { "grad_norm": 0.29282692074775696, "learning_rate": 8.628009830240839e-05, "loss": 0.0152, "step": 16770 }, { "grad_norm": 0.32043442130088806, "learning_rate": 8.626112984584571e-05, "loss": 0.0119, "step": 16780 }, { "grad_norm": 0.3498082756996155, "learning_rate": 8.62421503741092e-05, "loss": 0.0138, "step": 16790 }, { "grad_norm": 0.26590287685394287, "learning_rate": 8.622315989296432e-05, "loss": 0.0142, "step": 16800 }, { "grad_norm": 0.4875786602497101, "learning_rate": 8.62041584081799e-05, "loss": 0.0177, "step": 16810 }, { "grad_norm": 0.33064335584640503, "learning_rate": 8.618514592552807e-05, "loss": 0.0149, "step": 16820 }, { "grad_norm": 0.3084089457988739, "learning_rate": 8.616612245078431e-05, "loss": 0.0122, "step": 16830 }, { "grad_norm": 0.3607918620109558, "learning_rate": 8.614708798972746e-05, "loss": 0.0127, "step": 16840 }, { "grad_norm": 0.24627645313739777, "learning_rate": 8.61280425481397e-05, "loss": 0.0121, "step": 16850 }, { "grad_norm": 0.39730602502822876, "learning_rate": 8.61089861318065e-05, "loss": 0.0113, "step": 16860 }, { "grad_norm": 0.3092910945415497, "learning_rate": 8.608991874651673e-05, "loss": 0.0105, "step": 16870 }, { "grad_norm": 0.419026255607605, "learning_rate": 8.607084039806255e-05, "loss": 0.0161, "step": 16880 }, { "grad_norm": 0.3118002116680145, "learning_rate": 8.605175109223944e-05, "loss": 0.0183, "step": 16890 }, { "grad_norm": 0.4125400185585022, "learning_rate": 8.603265083484624e-05, "loss": 0.0213, "step": 16900 }, { "grad_norm": 0.3133799433708191, "learning_rate": 8.60135396316851e-05, "loss": 0.0149, "step": 16910 }, { "grad_norm": 0.32613906264305115, "learning_rate": 8.599441748856152e-05, "loss": 0.0152, "step": 16920 }, { "grad_norm": 0.3974539339542389, "learning_rate": 8.597528441128427e-05, "loss": 0.0135, "step": 16930 }, { "grad_norm": 0.22940893471240997, "learning_rate": 8.595614040566549e-05, "loss": 0.0152, "step": 16940 }, { "grad_norm": 0.21529826521873474, "learning_rate": 8.593698547752063e-05, "loss": 0.0105, "step": 16950 }, { "grad_norm": 0.359500914812088, "learning_rate": 8.591781963266843e-05, "loss": 0.0146, "step": 16960 }, { "grad_norm": 0.36617258191108704, "learning_rate": 8.5898642876931e-05, "loss": 0.0159, "step": 16970 }, { "grad_norm": 0.22257599234580994, "learning_rate": 8.587945521613369e-05, "loss": 0.0126, "step": 16980 }, { "grad_norm": 0.3628152906894684, "learning_rate": 8.586025665610524e-05, "loss": 0.0156, "step": 16990 }, { "grad_norm": 0.32624298334121704, "learning_rate": 8.584104720267765e-05, "loss": 0.0173, "step": 17000 }, { "grad_norm": 0.2702179551124573, "learning_rate": 8.582182686168625e-05, "loss": 0.0131, "step": 17010 }, { "grad_norm": 0.30716437101364136, "learning_rate": 8.580259563896967e-05, "loss": 0.0136, "step": 17020 }, { "grad_norm": 0.34903740882873535, "learning_rate": 8.578335354036983e-05, "loss": 0.0137, "step": 17030 }, { "grad_norm": 0.38253846764564514, "learning_rate": 8.576410057173201e-05, "loss": 0.0128, "step": 17040 }, { "grad_norm": 0.2828770577907562, "learning_rate": 8.574483673890474e-05, "loss": 0.0123, "step": 17050 }, { "grad_norm": 0.37984320521354675, "learning_rate": 8.572556204773983e-05, "loss": 0.0121, "step": 17060 }, { "grad_norm": 0.38941362500190735, "learning_rate": 8.570627650409246e-05, "loss": 0.0169, "step": 17070 }, { "grad_norm": 0.3261432647705078, "learning_rate": 8.568698011382107e-05, "loss": 0.0144, "step": 17080 }, { "grad_norm": 0.22355127334594727, "learning_rate": 8.566767288278738e-05, "loss": 0.0102, "step": 17090 }, { "grad_norm": 0.20584484934806824, "learning_rate": 8.56483548168564e-05, "loss": 0.0136, "step": 17100 }, { "grad_norm": 0.2694486975669861, "learning_rate": 8.562902592189648e-05, "loss": 0.0141, "step": 17110 }, { "grad_norm": 0.2758514881134033, "learning_rate": 8.560968620377921e-05, "loss": 0.0135, "step": 17120 }, { "grad_norm": 0.3015854060649872, "learning_rate": 8.559033566837951e-05, "loss": 0.0171, "step": 17130 }, { "grad_norm": 0.2922641932964325, "learning_rate": 8.557097432157551e-05, "loss": 0.0176, "step": 17140 }, { "grad_norm": 0.27565670013427734, "learning_rate": 8.555160216924872e-05, "loss": 0.014, "step": 17150 }, { "grad_norm": 0.31398844718933105, "learning_rate": 8.55322192172839e-05, "loss": 0.0137, "step": 17160 }, { "grad_norm": 0.3369823694229126, "learning_rate": 8.551282547156902e-05, "loss": 0.013, "step": 17170 }, { "grad_norm": 0.34948980808258057, "learning_rate": 8.549342093799544e-05, "loss": 0.0134, "step": 17180 }, { "grad_norm": 0.3204096555709839, "learning_rate": 8.547400562245773e-05, "loss": 0.0133, "step": 17190 }, { "grad_norm": 0.3625577390193939, "learning_rate": 8.545457953085374e-05, "loss": 0.0129, "step": 17200 }, { "grad_norm": 0.2795736491680145, "learning_rate": 8.543514266908463e-05, "loss": 0.0158, "step": 17210 }, { "grad_norm": 0.3084838390350342, "learning_rate": 8.541569504305478e-05, "loss": 0.0123, "step": 17220 }, { "grad_norm": 0.2944566607475281, "learning_rate": 8.539623665867187e-05, "loss": 0.012, "step": 17230 }, { "grad_norm": 0.27562278509140015, "learning_rate": 8.537676752184685e-05, "loss": 0.0133, "step": 17240 }, { "grad_norm": 0.31810757517814636, "learning_rate": 8.53572876384939e-05, "loss": 0.0168, "step": 17250 }, { "grad_norm": 0.23705923557281494, "learning_rate": 8.533779701453056e-05, "loss": 0.0117, "step": 17260 }, { "grad_norm": 0.37942931056022644, "learning_rate": 8.53182956558775e-05, "loss": 0.0118, "step": 17270 }, { "grad_norm": 0.35399091243743896, "learning_rate": 8.529878356845877e-05, "loss": 0.015, "step": 17280 }, { "grad_norm": 0.4677645266056061, "learning_rate": 8.527926075820158e-05, "loss": 0.0137, "step": 17290 }, { "grad_norm": 0.3171139657497406, "learning_rate": 8.525972723103648e-05, "loss": 0.0157, "step": 17300 }, { "grad_norm": 0.28595083951950073, "learning_rate": 8.524018299289722e-05, "loss": 0.0149, "step": 17310 }, { "grad_norm": 0.2826521396636963, "learning_rate": 8.522062804972083e-05, "loss": 0.0108, "step": 17320 }, { "grad_norm": 0.38181421160697937, "learning_rate": 8.520106240744759e-05, "loss": 0.0143, "step": 17330 }, { "grad_norm": 0.2895301878452301, "learning_rate": 8.518148607202102e-05, "loss": 0.0142, "step": 17340 }, { "grad_norm": 0.3157401978969574, "learning_rate": 8.51618990493879e-05, "loss": 0.0141, "step": 17350 }, { "grad_norm": 0.31055977940559387, "learning_rate": 8.514230134549823e-05, "loss": 0.0143, "step": 17360 }, { "grad_norm": 0.24353331327438354, "learning_rate": 8.51226929663053e-05, "loss": 0.011, "step": 17370 }, { "grad_norm": 0.23701763153076172, "learning_rate": 8.51030739177656e-05, "loss": 0.0112, "step": 17380 }, { "grad_norm": 0.2186122089624405, "learning_rate": 8.508344420583889e-05, "loss": 0.01, "step": 17390 }, { "grad_norm": 0.3303378224372864, "learning_rate": 8.506380383648816e-05, "loss": 0.0118, "step": 17400 }, { "grad_norm": 0.28739821910858154, "learning_rate": 8.504415281567963e-05, "loss": 0.0125, "step": 17410 }, { "grad_norm": 0.410834401845932, "learning_rate": 8.502449114938275e-05, "loss": 0.0131, "step": 17420 }, { "grad_norm": 0.40596508979797363, "learning_rate": 8.500481884357025e-05, "loss": 0.0167, "step": 17430 }, { "grad_norm": 0.26776716113090515, "learning_rate": 8.498513590421801e-05, "loss": 0.01, "step": 17440 }, { "grad_norm": 0.29014843702316284, "learning_rate": 8.496544233730522e-05, "loss": 0.0122, "step": 17450 }, { "grad_norm": 0.24734145402908325, "learning_rate": 8.494573814881426e-05, "loss": 0.0128, "step": 17460 }, { "grad_norm": 0.32968658208847046, "learning_rate": 8.492602334473074e-05, "loss": 0.0122, "step": 17470 }, { "grad_norm": 0.36287587881088257, "learning_rate": 8.49062979310435e-05, "loss": 0.0128, "step": 17480 }, { "grad_norm": 0.3090816140174866, "learning_rate": 8.488656191374458e-05, "loss": 0.0116, "step": 17490 }, { "grad_norm": 0.24727264046669006, "learning_rate": 8.48668152988293e-05, "loss": 0.0107, "step": 17500 }, { "grad_norm": 0.264192134141922, "learning_rate": 8.484705809229612e-05, "loss": 0.0117, "step": 17510 }, { "grad_norm": 0.2668319046497345, "learning_rate": 8.482729030014677e-05, "loss": 0.0119, "step": 17520 }, { "grad_norm": 0.3539884388446808, "learning_rate": 8.48075119283862e-05, "loss": 0.0139, "step": 17530 }, { "grad_norm": 0.3166970908641815, "learning_rate": 8.478772298302254e-05, "loss": 0.0154, "step": 17540 }, { "grad_norm": 0.4734901487827301, "learning_rate": 8.476792347006716e-05, "loss": 0.0159, "step": 17550 }, { "grad_norm": 0.36261293292045593, "learning_rate": 8.474811339553462e-05, "loss": 0.0138, "step": 17560 }, { "grad_norm": 0.2863885164260864, "learning_rate": 8.47282927654427e-05, "loss": 0.0126, "step": 17570 }, { "grad_norm": 0.30727434158325195, "learning_rate": 8.470846158581238e-05, "loss": 0.0149, "step": 17580 }, { "grad_norm": 0.3462671637535095, "learning_rate": 8.468861986266787e-05, "loss": 0.0113, "step": 17590 }, { "grad_norm": 0.3012579679489136, "learning_rate": 8.466876760203654e-05, "loss": 0.0109, "step": 17600 }, { "grad_norm": 0.2637174725532532, "learning_rate": 8.464890480994898e-05, "loss": 0.0113, "step": 17610 }, { "grad_norm": 0.33702734112739563, "learning_rate": 8.462903149243899e-05, "loss": 0.0112, "step": 17620 }, { "grad_norm": 0.34073346853256226, "learning_rate": 8.460914765554357e-05, "loss": 0.0118, "step": 17630 }, { "grad_norm": 0.3656296133995056, "learning_rate": 8.458925330530288e-05, "loss": 0.0125, "step": 17640 }, { "grad_norm": 0.31052330136299133, "learning_rate": 8.456934844776032e-05, "loss": 0.0117, "step": 17650 }, { "grad_norm": 0.32945623993873596, "learning_rate": 8.454943308896246e-05, "loss": 0.012, "step": 17660 }, { "grad_norm": 0.37586280703544617, "learning_rate": 8.452950723495905e-05, "loss": 0.0167, "step": 17670 }, { "grad_norm": 0.33211398124694824, "learning_rate": 8.450957089180303e-05, "loss": 0.0106, "step": 17680 }, { "grad_norm": 0.38755369186401367, "learning_rate": 8.448962406555055e-05, "loss": 0.011, "step": 17690 }, { "grad_norm": 0.3846781551837921, "learning_rate": 8.446966676226093e-05, "loss": 0.0189, "step": 17700 }, { "grad_norm": 0.37892523407936096, "learning_rate": 8.444969898799667e-05, "loss": 0.0124, "step": 17710 }, { "grad_norm": 0.34158629179000854, "learning_rate": 8.442972074882343e-05, "loss": 0.0132, "step": 17720 }, { "grad_norm": 0.33211708068847656, "learning_rate": 8.44097320508101e-05, "loss": 0.0131, "step": 17730 }, { "grad_norm": 0.37847191095352173, "learning_rate": 8.43897329000287e-05, "loss": 0.0134, "step": 17740 }, { "grad_norm": 0.35738256573677063, "learning_rate": 8.436972330255448e-05, "loss": 0.0108, "step": 17750 }, { "grad_norm": 0.40498390793800354, "learning_rate": 8.434970326446579e-05, "loss": 0.0129, "step": 17760 }, { "grad_norm": 0.23706281185150146, "learning_rate": 8.432967279184418e-05, "loss": 0.0167, "step": 17770 }, { "grad_norm": 0.2722736895084381, "learning_rate": 8.430963189077441e-05, "loss": 0.0123, "step": 17780 }, { "grad_norm": 0.28679096698760986, "learning_rate": 8.428958056734437e-05, "loss": 0.0111, "step": 17790 }, { "grad_norm": 0.2708021104335785, "learning_rate": 8.426951882764513e-05, "loss": 0.0152, "step": 17800 }, { "grad_norm": 0.25948208570480347, "learning_rate": 8.424944667777089e-05, "loss": 0.013, "step": 17810 }, { "grad_norm": 0.32522982358932495, "learning_rate": 8.422936412381905e-05, "loss": 0.0115, "step": 17820 }, { "grad_norm": 0.353273868560791, "learning_rate": 8.420927117189017e-05, "loss": 0.0124, "step": 17830 }, { "grad_norm": 0.31373462080955505, "learning_rate": 8.418916782808795e-05, "loss": 0.015, "step": 17840 }, { "grad_norm": 0.35310378670692444, "learning_rate": 8.416905409851926e-05, "loss": 0.0215, "step": 17850 }, { "grad_norm": 0.33689039945602417, "learning_rate": 8.41489299892941e-05, "loss": 0.0116, "step": 17860 }, { "grad_norm": 0.2950003147125244, "learning_rate": 8.412879550652566e-05, "loss": 0.0106, "step": 17870 }, { "grad_norm": 0.33894798159599304, "learning_rate": 8.410865065633029e-05, "loss": 0.0136, "step": 17880 }, { "grad_norm": 0.22040528059005737, "learning_rate": 8.408849544482742e-05, "loss": 0.0104, "step": 17890 }, { "grad_norm": 0.2856816053390503, "learning_rate": 8.406832987813968e-05, "loss": 0.0133, "step": 17900 }, { "grad_norm": 0.29003340005874634, "learning_rate": 8.404815396239286e-05, "loss": 0.0106, "step": 17910 }, { "grad_norm": 0.36010029911994934, "learning_rate": 8.402796770371587e-05, "loss": 0.0159, "step": 17920 }, { "grad_norm": 0.2954373061656952, "learning_rate": 8.400777110824071e-05, "loss": 0.0145, "step": 17930 }, { "grad_norm": 0.29758456349372864, "learning_rate": 8.398756418210263e-05, "loss": 0.0168, "step": 17940 }, { "grad_norm": 0.32659077644348145, "learning_rate": 8.396734693143993e-05, "loss": 0.0144, "step": 17950 }, { "grad_norm": 0.31495726108551025, "learning_rate": 8.39471193623941e-05, "loss": 0.0159, "step": 17960 }, { "grad_norm": 0.40374577045440674, "learning_rate": 8.392688148110974e-05, "loss": 0.0192, "step": 17970 }, { "grad_norm": 0.3350340723991394, "learning_rate": 8.390663329373456e-05, "loss": 0.014, "step": 17980 }, { "grad_norm": 0.3928029537200928, "learning_rate": 8.388637480641944e-05, "loss": 0.0142, "step": 17990 }, { "grad_norm": 0.3154008090496063, "learning_rate": 8.386610602531837e-05, "loss": 0.0141, "step": 18000 }, { "grad_norm": 0.30427834391593933, "learning_rate": 8.384582695658847e-05, "loss": 0.0144, "step": 18010 }, { "grad_norm": 0.30862873792648315, "learning_rate": 8.382553760638999e-05, "loss": 0.0169, "step": 18020 }, { "grad_norm": 0.22508589923381805, "learning_rate": 8.380523798088631e-05, "loss": 0.0172, "step": 18030 }, { "grad_norm": 0.301836222410202, "learning_rate": 8.378492808624389e-05, "loss": 0.0145, "step": 18040 }, { "grad_norm": 0.2518996596336365, "learning_rate": 8.376460792863237e-05, "loss": 0.0154, "step": 18050 }, { "grad_norm": 0.2455400824546814, "learning_rate": 8.374427751422444e-05, "loss": 0.0162, "step": 18060 }, { "grad_norm": 0.2845357358455658, "learning_rate": 8.3723936849196e-05, "loss": 0.0126, "step": 18070 }, { "grad_norm": 0.34579500555992126, "learning_rate": 8.370358593972595e-05, "loss": 0.012, "step": 18080 }, { "grad_norm": 0.2587417960166931, "learning_rate": 8.36832247919964e-05, "loss": 0.0118, "step": 18090 }, { "grad_norm": 0.25982800126075745, "learning_rate": 8.36628534121925e-05, "loss": 0.013, "step": 18100 }, { "grad_norm": 0.308040976524353, "learning_rate": 8.364247180650254e-05, "loss": 0.0122, "step": 18110 }, { "grad_norm": 0.3716839849948883, "learning_rate": 8.362207998111794e-05, "loss": 0.012, "step": 18120 }, { "grad_norm": 0.287418007850647, "learning_rate": 8.360167794223318e-05, "loss": 0.0097, "step": 18130 }, { "grad_norm": 0.274263471364975, "learning_rate": 8.358126569604586e-05, "loss": 0.015, "step": 18140 }, { "grad_norm": 0.24698516726493835, "learning_rate": 8.356084324875668e-05, "loss": 0.0101, "step": 18150 }, { "grad_norm": 0.3069823682308197, "learning_rate": 8.354041060656945e-05, "loss": 0.0154, "step": 18160 }, { "grad_norm": 0.18795974552631378, "learning_rate": 8.351996777569106e-05, "loss": 0.0098, "step": 18170 }, { "grad_norm": 0.23682774603366852, "learning_rate": 8.349951476233148e-05, "loss": 0.015, "step": 18180 }, { "grad_norm": 0.27422621846199036, "learning_rate": 8.347905157270386e-05, "loss": 0.0135, "step": 18190 }, { "grad_norm": 0.2154785692691803, "learning_rate": 8.345857821302432e-05, "loss": 0.0093, "step": 18200 }, { "grad_norm": 0.31889963150024414, "learning_rate": 8.343809468951213e-05, "loss": 0.0093, "step": 18210 }, { "grad_norm": 0.4687674343585968, "learning_rate": 8.341760100838965e-05, "loss": 0.0119, "step": 18220 }, { "grad_norm": 0.4000532031059265, "learning_rate": 8.339709717588233e-05, "loss": 0.0146, "step": 18230 }, { "grad_norm": 0.24362359941005707, "learning_rate": 8.33765831982187e-05, "loss": 0.0137, "step": 18240 }, { "grad_norm": 0.29831790924072266, "learning_rate": 8.335605908163035e-05, "loss": 0.0152, "step": 18250 }, { "grad_norm": 0.3350246250629425, "learning_rate": 8.333552483235196e-05, "loss": 0.0138, "step": 18260 }, { "grad_norm": 0.3507806956768036, "learning_rate": 8.33149804566213e-05, "loss": 0.014, "step": 18270 }, { "grad_norm": 0.2848481237888336, "learning_rate": 8.329442596067921e-05, "loss": 0.0121, "step": 18280 }, { "grad_norm": 0.3517798185348511, "learning_rate": 8.32738613507696e-05, "loss": 0.0141, "step": 18290 }, { "grad_norm": 0.37726861238479614, "learning_rate": 8.325328663313946e-05, "loss": 0.0152, "step": 18300 }, { "grad_norm": 0.3971586227416992, "learning_rate": 8.323270181403884e-05, "loss": 0.0137, "step": 18310 }, { "grad_norm": 0.22382038831710815, "learning_rate": 8.321210689972086e-05, "loss": 0.0102, "step": 18320 }, { "grad_norm": 0.3550434112548828, "learning_rate": 8.319150189644174e-05, "loss": 0.0147, "step": 18330 }, { "grad_norm": 0.36026695370674133, "learning_rate": 8.31708868104607e-05, "loss": 0.0114, "step": 18340 }, { "grad_norm": 0.26386353373527527, "learning_rate": 8.315026164804007e-05, "loss": 0.0106, "step": 18350 }, { "grad_norm": 0.2927246689796448, "learning_rate": 8.312962641544524e-05, "loss": 0.0129, "step": 18360 }, { "grad_norm": 0.29330456256866455, "learning_rate": 8.310898111894465e-05, "loss": 0.0146, "step": 18370 }, { "grad_norm": 0.4507358968257904, "learning_rate": 8.308832576480977e-05, "loss": 0.0113, "step": 18380 }, { "grad_norm": 0.37675946950912476, "learning_rate": 8.306766035931519e-05, "loss": 0.0118, "step": 18390 }, { "grad_norm": 0.2877444624900818, "learning_rate": 8.304698490873847e-05, "loss": 0.0153, "step": 18400 }, { "grad_norm": 0.33232858777046204, "learning_rate": 8.30262994193603e-05, "loss": 0.0158, "step": 18410 }, { "grad_norm": 0.3234443664550781, "learning_rate": 8.300560389746438e-05, "loss": 0.0137, "step": 18420 }, { "grad_norm": 0.22922036051750183, "learning_rate": 8.298489834933745e-05, "loss": 0.0126, "step": 18430 }, { "grad_norm": 0.38775700330734253, "learning_rate": 8.296418278126934e-05, "loss": 0.0114, "step": 18440 }, { "grad_norm": 0.30138495564460754, "learning_rate": 8.294345719955284e-05, "loss": 0.0114, "step": 18450 }, { "grad_norm": 0.348544180393219, "learning_rate": 8.29227216104839e-05, "loss": 0.014, "step": 18460 }, { "grad_norm": 0.40388041734695435, "learning_rate": 8.290197602036137e-05, "loss": 0.0108, "step": 18470 }, { "grad_norm": 0.2986717224121094, "learning_rate": 8.288122043548725e-05, "loss": 0.0101, "step": 18480 }, { "grad_norm": 0.19775034487247467, "learning_rate": 8.286045486216657e-05, "loss": 0.0128, "step": 18490 }, { "grad_norm": 0.3551136255264282, "learning_rate": 8.283967930670733e-05, "loss": 0.0122, "step": 18500 }, { "grad_norm": 0.37588241696357727, "learning_rate": 8.281889377542058e-05, "loss": 0.0169, "step": 18510 }, { "grad_norm": 0.34498050808906555, "learning_rate": 8.279809827462045e-05, "loss": 0.013, "step": 18520 }, { "grad_norm": 0.3164331614971161, "learning_rate": 8.277729281062402e-05, "loss": 0.0141, "step": 18530 }, { "grad_norm": 0.2591860294342041, "learning_rate": 8.27564773897515e-05, "loss": 0.0183, "step": 18540 }, { "grad_norm": 0.3670145869255066, "learning_rate": 8.273565201832602e-05, "loss": 0.0115, "step": 18550 }, { "grad_norm": 0.3703955411911011, "learning_rate": 8.27148167026738e-05, "loss": 0.0154, "step": 18560 }, { "grad_norm": 0.3050447106361389, "learning_rate": 8.269397144912405e-05, "loss": 0.0142, "step": 18570 }, { "grad_norm": 0.2702122628688812, "learning_rate": 8.267311626400899e-05, "loss": 0.0121, "step": 18580 }, { "grad_norm": 0.4275292456150055, "learning_rate": 8.26522511536639e-05, "loss": 0.0153, "step": 18590 }, { "grad_norm": 0.3297639787197113, "learning_rate": 8.263137612442706e-05, "loss": 0.0142, "step": 18600 }, { "grad_norm": 0.36414727568626404, "learning_rate": 8.261049118263971e-05, "loss": 0.0136, "step": 18610 }, { "grad_norm": 0.23326407372951508, "learning_rate": 8.258959633464619e-05, "loss": 0.0109, "step": 18620 }, { "grad_norm": 0.28079086542129517, "learning_rate": 8.256869158679377e-05, "loss": 0.013, "step": 18630 }, { "grad_norm": 0.33998361229896545, "learning_rate": 8.254777694543278e-05, "loss": 0.0098, "step": 18640 }, { "grad_norm": 0.5050308108329773, "learning_rate": 8.252685241691651e-05, "loss": 0.0116, "step": 18650 }, { "grad_norm": 0.4916621148586273, "learning_rate": 8.250591800760133e-05, "loss": 0.0148, "step": 18660 }, { "grad_norm": 0.46857693791389465, "learning_rate": 8.248497372384649e-05, "loss": 0.0233, "step": 18670 }, { "grad_norm": 0.32564258575439453, "learning_rate": 8.246401957201437e-05, "loss": 0.0197, "step": 18680 }, { "grad_norm": 0.3152516484260559, "learning_rate": 8.244305555847027e-05, "loss": 0.0178, "step": 18690 }, { "grad_norm": 0.3127182424068451, "learning_rate": 8.24220816895825e-05, "loss": 0.0168, "step": 18700 }, { "grad_norm": 0.3927346467971802, "learning_rate": 8.240109797172237e-05, "loss": 0.0166, "step": 18710 }, { "grad_norm": 0.37310805916786194, "learning_rate": 8.238010441126416e-05, "loss": 0.0137, "step": 18720 }, { "grad_norm": 0.31839457154273987, "learning_rate": 8.23591010145852e-05, "loss": 0.0128, "step": 18730 }, { "grad_norm": 0.31938979029655457, "learning_rate": 8.233808778806571e-05, "loss": 0.0141, "step": 18740 }, { "grad_norm": 0.2878010869026184, "learning_rate": 8.231706473808903e-05, "loss": 0.012, "step": 18750 }, { "grad_norm": 0.3095709979534149, "learning_rate": 8.229603187104133e-05, "loss": 0.0107, "step": 18760 }, { "grad_norm": 0.28657129406929016, "learning_rate": 8.22749891933119e-05, "loss": 0.0126, "step": 18770 }, { "grad_norm": 0.23694774508476257, "learning_rate": 8.225393671129291e-05, "loss": 0.0127, "step": 18780 }, { "grad_norm": 0.2705461084842682, "learning_rate": 8.223287443137957e-05, "loss": 0.0132, "step": 18790 }, { "grad_norm": 0.3463711142539978, "learning_rate": 8.221180235997004e-05, "loss": 0.0116, "step": 18800 }, { "grad_norm": 0.3288988173007965, "learning_rate": 8.219072050346544e-05, "loss": 0.0133, "step": 18810 }, { "grad_norm": 0.32352596521377563, "learning_rate": 8.216962886826992e-05, "loss": 0.0127, "step": 18820 }, { "grad_norm": 0.31411558389663696, "learning_rate": 8.214852746079054e-05, "loss": 0.0104, "step": 18830 }, { "grad_norm": 0.28551432490348816, "learning_rate": 8.212741628743732e-05, "loss": 0.01, "step": 18840 }, { "grad_norm": 0.3180924952030182, "learning_rate": 8.210629535462333e-05, "loss": 0.013, "step": 18850 }, { "grad_norm": 0.30611202120780945, "learning_rate": 8.208516466876453e-05, "loss": 0.0123, "step": 18860 }, { "grad_norm": 0.3751974403858185, "learning_rate": 8.206402423627986e-05, "loss": 0.0117, "step": 18870 }, { "grad_norm": 0.2996487021446228, "learning_rate": 8.204287406359124e-05, "loss": 0.0132, "step": 18880 }, { "grad_norm": 0.25353679060935974, "learning_rate": 8.20217141571235e-05, "loss": 0.0106, "step": 18890 }, { "grad_norm": 0.28663378953933716, "learning_rate": 8.200054452330449e-05, "loss": 0.0133, "step": 18900 }, { "grad_norm": 0.31912288069725037, "learning_rate": 8.197936516856499e-05, "loss": 0.0105, "step": 18910 }, { "grad_norm": 0.3114042282104492, "learning_rate": 8.195817609933871e-05, "loss": 0.0147, "step": 18920 }, { "grad_norm": 0.3462892770767212, "learning_rate": 8.193697732206233e-05, "loss": 0.0129, "step": 18930 }, { "grad_norm": 0.2560267448425293, "learning_rate": 8.19157688431755e-05, "loss": 0.0113, "step": 18940 }, { "grad_norm": 0.309586763381958, "learning_rate": 8.189455066912077e-05, "loss": 0.0104, "step": 18950 }, { "grad_norm": 0.2984698414802551, "learning_rate": 8.187332280634369e-05, "loss": 0.0116, "step": 18960 }, { "grad_norm": 0.3184657692909241, "learning_rate": 8.18520852612927e-05, "loss": 0.011, "step": 18970 }, { "grad_norm": 0.30604785680770874, "learning_rate": 8.183083804041921e-05, "loss": 0.0111, "step": 18980 }, { "grad_norm": 0.2852771580219269, "learning_rate": 8.180958115017757e-05, "loss": 0.0096, "step": 18990 }, { "grad_norm": 0.3685297966003418, "learning_rate": 8.178831459702505e-05, "loss": 0.0102, "step": 19000 }, { "grad_norm": 0.3843708038330078, "learning_rate": 8.17670383874219e-05, "loss": 0.0109, "step": 19010 }, { "grad_norm": 0.43317270278930664, "learning_rate": 8.174575252783124e-05, "loss": 0.0138, "step": 19020 }, { "grad_norm": 0.31003043055534363, "learning_rate": 8.172445702471914e-05, "loss": 0.0141, "step": 19030 }, { "grad_norm": 0.28035664558410645, "learning_rate": 8.170315188455466e-05, "loss": 0.011, "step": 19040 }, { "grad_norm": 0.31346672773361206, "learning_rate": 8.168183711380969e-05, "loss": 0.0101, "step": 19050 }, { "grad_norm": 0.2836418151855469, "learning_rate": 8.166051271895913e-05, "loss": 0.0102, "step": 19060 }, { "grad_norm": 0.3067103624343872, "learning_rate": 8.163917870648075e-05, "loss": 0.0116, "step": 19070 }, { "grad_norm": 0.32565218210220337, "learning_rate": 8.161783508285526e-05, "loss": 0.0104, "step": 19080 }, { "grad_norm": 0.23752239346504211, "learning_rate": 8.159648185456628e-05, "loss": 0.0092, "step": 19090 }, { "grad_norm": 0.3650795519351959, "learning_rate": 8.157511902810038e-05, "loss": 0.0108, "step": 19100 }, { "grad_norm": 0.23786623775959015, "learning_rate": 8.155374660994701e-05, "loss": 0.0098, "step": 19110 }, { "grad_norm": 0.22118094563484192, "learning_rate": 8.153236460659857e-05, "loss": 0.0108, "step": 19120 }, { "grad_norm": 0.32005828619003296, "learning_rate": 8.151097302455031e-05, "loss": 0.0135, "step": 19130 }, { "grad_norm": 0.3606661856174469, "learning_rate": 8.148957187030044e-05, "loss": 0.0122, "step": 19140 }, { "grad_norm": 0.28908205032348633, "learning_rate": 8.146816115035006e-05, "loss": 0.0137, "step": 19150 }, { "grad_norm": 0.3197175860404968, "learning_rate": 8.14467408712032e-05, "loss": 0.0137, "step": 19160 }, { "grad_norm": 0.3105429708957672, "learning_rate": 8.142531103936678e-05, "loss": 0.015, "step": 19170 }, { "grad_norm": 0.36727669835090637, "learning_rate": 8.14038716613506e-05, "loss": 0.0114, "step": 19180 }, { "grad_norm": 0.3169114589691162, "learning_rate": 8.138242274366736e-05, "loss": 0.0118, "step": 19190 }, { "grad_norm": 0.3648217022418976, "learning_rate": 8.136096429283271e-05, "loss": 0.0123, "step": 19200 }, { "grad_norm": 0.2427861988544464, "learning_rate": 8.133949631536515e-05, "loss": 0.0103, "step": 19210 }, { "grad_norm": 0.34194543957710266, "learning_rate": 8.131801881778607e-05, "loss": 0.0102, "step": 19220 }, { "grad_norm": 0.2932226061820984, "learning_rate": 8.129653180661978e-05, "loss": 0.0127, "step": 19230 }, { "grad_norm": 0.42991992831230164, "learning_rate": 8.127503528839346e-05, "loss": 0.0116, "step": 19240 }, { "grad_norm": 0.3318198025226593, "learning_rate": 8.125352926963721e-05, "loss": 0.0129, "step": 19250 }, { "grad_norm": 0.31246650218963623, "learning_rate": 8.123201375688395e-05, "loss": 0.0117, "step": 19260 }, { "grad_norm": 0.3080601394176483, "learning_rate": 8.121048875666954e-05, "loss": 0.0113, "step": 19270 }, { "grad_norm": 0.2473263144493103, "learning_rate": 8.118895427553274e-05, "loss": 0.0133, "step": 19280 }, { "grad_norm": 0.2663351893424988, "learning_rate": 8.116741032001511e-05, "loss": 0.0129, "step": 19290 }, { "grad_norm": 0.2721877694129944, "learning_rate": 8.114585689666114e-05, "loss": 0.0104, "step": 19300 }, { "grad_norm": 0.3005412220954895, "learning_rate": 8.112429401201821e-05, "loss": 0.0104, "step": 19310 }, { "grad_norm": 0.2333231419324875, "learning_rate": 8.110272167263656e-05, "loss": 0.0099, "step": 19320 }, { "grad_norm": 0.3086380362510681, "learning_rate": 8.108113988506929e-05, "loss": 0.0111, "step": 19330 }, { "grad_norm": 0.2621247470378876, "learning_rate": 8.105954865587235e-05, "loss": 0.0113, "step": 19340 }, { "grad_norm": 0.3123207092285156, "learning_rate": 8.103794799160463e-05, "loss": 0.0121, "step": 19350 }, { "grad_norm": 0.3243536353111267, "learning_rate": 8.101633789882781e-05, "loss": 0.0113, "step": 19360 }, { "grad_norm": 0.28597328066825867, "learning_rate": 8.099471838410648e-05, "loss": 0.01, "step": 19370 }, { "grad_norm": 0.3140491545200348, "learning_rate": 8.097308945400806e-05, "loss": 0.0105, "step": 19380 }, { "grad_norm": 0.29438039660453796, "learning_rate": 8.095145111510288e-05, "loss": 0.0116, "step": 19390 }, { "grad_norm": 0.27930429577827454, "learning_rate": 8.092980337396406e-05, "loss": 0.011, "step": 19400 }, { "grad_norm": 0.3430565595626831, "learning_rate": 8.090814623716763e-05, "loss": 0.0148, "step": 19410 }, { "grad_norm": 0.29080599546432495, "learning_rate": 8.088647971129246e-05, "loss": 0.0106, "step": 19420 }, { "grad_norm": 0.25844845175743103, "learning_rate": 8.086480380292026e-05, "loss": 0.0113, "step": 19430 }, { "grad_norm": 0.3026571571826935, "learning_rate": 8.084311851863562e-05, "loss": 0.0128, "step": 19440 }, { "grad_norm": 0.28641223907470703, "learning_rate": 8.082142386502591e-05, "loss": 0.0105, "step": 19450 }, { "grad_norm": 0.2502995729446411, "learning_rate": 8.079971984868145e-05, "loss": 0.011, "step": 19460 }, { "grad_norm": 0.36017531156539917, "learning_rate": 8.077800647619532e-05, "loss": 0.0143, "step": 19470 }, { "grad_norm": 0.2801513373851776, "learning_rate": 8.075628375416345e-05, "loss": 0.016, "step": 19480 }, { "grad_norm": 0.3113939166069031, "learning_rate": 8.073455168918464e-05, "loss": 0.0144, "step": 19490 }, { "grad_norm": 0.3204207420349121, "learning_rate": 8.071281028786055e-05, "loss": 0.0114, "step": 19500 }, { "grad_norm": 0.3569873869419098, "learning_rate": 8.069105955679562e-05, "loss": 0.0143, "step": 19510 }, { "grad_norm": 0.30875831842422485, "learning_rate": 8.066929950259713e-05, "loss": 0.0097, "step": 19520 }, { "grad_norm": 0.3048214614391327, "learning_rate": 8.064753013187522e-05, "loss": 0.013, "step": 19530 }, { "grad_norm": 0.3600217401981354, "learning_rate": 8.062575145124289e-05, "loss": 0.0119, "step": 19540 }, { "grad_norm": 0.30416548252105713, "learning_rate": 8.060396346731587e-05, "loss": 0.0156, "step": 19550 }, { "grad_norm": 0.30622172355651855, "learning_rate": 8.058216618671281e-05, "loss": 0.0152, "step": 19560 }, { "grad_norm": 0.2818950414657593, "learning_rate": 8.056035961605514e-05, "loss": 0.0141, "step": 19570 }, { "grad_norm": 0.41939905285835266, "learning_rate": 8.05385437619671e-05, "loss": 0.0114, "step": 19580 }, { "grad_norm": 0.2622312307357788, "learning_rate": 8.05167186310758e-05, "loss": 0.0105, "step": 19590 }, { "grad_norm": 0.2238624542951584, "learning_rate": 8.049488423001113e-05, "loss": 0.0101, "step": 19600 }, { "grad_norm": 0.2899109125137329, "learning_rate": 8.047304056540581e-05, "loss": 0.0094, "step": 19610 }, { "grad_norm": 0.39610743522644043, "learning_rate": 8.045118764389534e-05, "loss": 0.0118, "step": 19620 }, { "grad_norm": 0.2885169982910156, "learning_rate": 8.042932547211809e-05, "loss": 0.0115, "step": 19630 }, { "grad_norm": 0.2602410614490509, "learning_rate": 8.04074540567152e-05, "loss": 0.0095, "step": 19640 }, { "grad_norm": 0.24544866383075714, "learning_rate": 8.038557340433063e-05, "loss": 0.0091, "step": 19650 }, { "grad_norm": 0.35027745366096497, "learning_rate": 8.036368352161115e-05, "loss": 0.013, "step": 19660 }, { "grad_norm": 0.34638291597366333, "learning_rate": 8.034178441520633e-05, "loss": 0.013, "step": 19670 }, { "grad_norm": 0.3000337779521942, "learning_rate": 8.031987609176852e-05, "loss": 0.0094, "step": 19680 }, { "grad_norm": 0.3059388995170593, "learning_rate": 8.02979585579529e-05, "loss": 0.01, "step": 19690 }, { "grad_norm": 0.384428471326828, "learning_rate": 8.027603182041745e-05, "loss": 0.0177, "step": 19700 }, { "grad_norm": 0.28550684452056885, "learning_rate": 8.025409588582292e-05, "loss": 0.0092, "step": 19710 }, { "grad_norm": 0.24289412796497345, "learning_rate": 8.023215076083288e-05, "loss": 0.0103, "step": 19720 }, { "grad_norm": 0.31261584162712097, "learning_rate": 8.021019645211367e-05, "loss": 0.0126, "step": 19730 }, { "grad_norm": 0.3674880564212799, "learning_rate": 8.018823296633441e-05, "loss": 0.0138, "step": 19740 }, { "grad_norm": 0.31053563952445984, "learning_rate": 8.016626031016708e-05, "loss": 0.0128, "step": 19750 }, { "grad_norm": 0.48972266912460327, "learning_rate": 8.014427849028636e-05, "loss": 0.0131, "step": 19760 }, { "grad_norm": 0.24872824549674988, "learning_rate": 8.012228751336974e-05, "loss": 0.0122, "step": 19770 }, { "grad_norm": 0.2696179151535034, "learning_rate": 8.01002873860975e-05, "loss": 0.0148, "step": 19780 }, { "grad_norm": 0.29384976625442505, "learning_rate": 8.00782781151527e-05, "loss": 0.0154, "step": 19790 }, { "grad_norm": 0.29732707142829895, "learning_rate": 8.005625970722119e-05, "loss": 0.0112, "step": 19800 }, { "grad_norm": 0.3391878306865692, "learning_rate": 8.003423216899158e-05, "loss": 0.0134, "step": 19810 }, { "grad_norm": 0.36865752935409546, "learning_rate": 8.001219550715522e-05, "loss": 0.0103, "step": 19820 }, { "grad_norm": 0.23980113863945007, "learning_rate": 7.999014972840632e-05, "loss": 0.0141, "step": 19830 }, { "grad_norm": 0.35647082328796387, "learning_rate": 7.996809483944174e-05, "loss": 0.0153, "step": 19840 }, { "grad_norm": 0.36074432730674744, "learning_rate": 7.994603084696124e-05, "loss": 0.0148, "step": 19850 }, { "grad_norm": 0.3432263731956482, "learning_rate": 7.992395775766724e-05, "loss": 0.0158, "step": 19860 }, { "grad_norm": 0.3934478461742401, "learning_rate": 7.990187557826497e-05, "loss": 0.0124, "step": 19870 }, { "grad_norm": 0.28861114382743835, "learning_rate": 7.987978431546242e-05, "loss": 0.0109, "step": 19880 }, { "grad_norm": 0.2743509113788605, "learning_rate": 7.985768397597031e-05, "loss": 0.013, "step": 19890 }, { "grad_norm": 0.30137819051742554, "learning_rate": 7.983557456650216e-05, "loss": 0.0116, "step": 19900 }, { "grad_norm": 0.28131797909736633, "learning_rate": 7.981345609377422e-05, "loss": 0.0101, "step": 19910 }, { "grad_norm": 0.24351835250854492, "learning_rate": 7.97913285645055e-05, "loss": 0.0108, "step": 19920 }, { "grad_norm": 0.20535027980804443, "learning_rate": 7.976919198541776e-05, "loss": 0.0089, "step": 19930 }, { "grad_norm": 0.32849133014678955, "learning_rate": 7.974704636323548e-05, "loss": 0.0096, "step": 19940 }, { "grad_norm": 0.3754799962043762, "learning_rate": 7.972489170468597e-05, "loss": 0.0111, "step": 19950 }, { "grad_norm": 0.27436015009880066, "learning_rate": 7.970272801649918e-05, "loss": 0.0112, "step": 19960 }, { "grad_norm": 0.31236574053764343, "learning_rate": 7.96805553054079e-05, "loss": 0.0125, "step": 19970 }, { "grad_norm": 0.2469499409198761, "learning_rate": 7.965837357814756e-05, "loss": 0.0105, "step": 19980 }, { "grad_norm": 0.2964403033256531, "learning_rate": 7.963618284145643e-05, "loss": 0.0115, "step": 19990 }, { "grad_norm": 0.25933167338371277, "learning_rate": 7.961398310207544e-05, "loss": 0.0097, "step": 20000 }, { "grad_norm": 0.31781768798828125, "learning_rate": 7.95917743667483e-05, "loss": 0.0106, "step": 20010 }, { "grad_norm": 0.2968432605266571, "learning_rate": 7.956955664222144e-05, "loss": 0.0096, "step": 20020 }, { "grad_norm": 0.3397810459136963, "learning_rate": 7.954732993524399e-05, "loss": 0.0109, "step": 20030 }, { "grad_norm": 0.2422524094581604, "learning_rate": 7.952509425256786e-05, "loss": 0.011, "step": 20040 }, { "grad_norm": 0.20908181369304657, "learning_rate": 7.950284960094767e-05, "loss": 0.0109, "step": 20050 }, { "grad_norm": 0.28764376044273376, "learning_rate": 7.948059598714076e-05, "loss": 0.0116, "step": 20060 }, { "grad_norm": 0.3075757324695587, "learning_rate": 7.945833341790717e-05, "loss": 0.0103, "step": 20070 }, { "grad_norm": 0.2256336510181427, "learning_rate": 7.94360619000097e-05, "loss": 0.0097, "step": 20080 }, { "grad_norm": 0.4072895646095276, "learning_rate": 7.941378144021381e-05, "loss": 0.0103, "step": 20090 }, { "grad_norm": 0.31318414211273193, "learning_rate": 7.939149204528777e-05, "loss": 0.0113, "step": 20100 }, { "grad_norm": 0.22689473628997803, "learning_rate": 7.936919372200246e-05, "loss": 0.0112, "step": 20110 }, { "grad_norm": 0.2850944995880127, "learning_rate": 7.934688647713158e-05, "loss": 0.0093, "step": 20120 }, { "grad_norm": 0.32716110348701477, "learning_rate": 7.932457031745143e-05, "loss": 0.0093, "step": 20130 }, { "grad_norm": 0.2928125560283661, "learning_rate": 7.930224524974108e-05, "loss": 0.0119, "step": 20140 }, { "grad_norm": 0.3440922200679779, "learning_rate": 7.927991128078232e-05, "loss": 0.0121, "step": 20150 }, { "grad_norm": 0.31154802441596985, "learning_rate": 7.925756841735958e-05, "loss": 0.0131, "step": 20160 }, { "grad_norm": 0.43167048692703247, "learning_rate": 7.923521666626008e-05, "loss": 0.0113, "step": 20170 }, { "grad_norm": 0.4501725733280182, "learning_rate": 7.921285603427366e-05, "loss": 0.0141, "step": 20180 }, { "grad_norm": 0.3480492830276489, "learning_rate": 7.91904865281929e-05, "loss": 0.0121, "step": 20190 }, { "grad_norm": 0.24132727086544037, "learning_rate": 7.916810815481307e-05, "loss": 0.0162, "step": 20200 }, { "grad_norm": 0.3155970275402069, "learning_rate": 7.914572092093211e-05, "loss": 0.0104, "step": 20210 }, { "grad_norm": 0.26399046182632446, "learning_rate": 7.912332483335068e-05, "loss": 0.0124, "step": 20220 }, { "grad_norm": 0.2848663330078125, "learning_rate": 7.910091989887213e-05, "loss": 0.0102, "step": 20230 }, { "grad_norm": 0.22715935111045837, "learning_rate": 7.907850612430248e-05, "loss": 0.0121, "step": 20240 }, { "grad_norm": 0.2672522962093353, "learning_rate": 7.905608351645044e-05, "loss": 0.009, "step": 20250 }, { "grad_norm": 0.19731563329696655, "learning_rate": 7.90336520821274e-05, "loss": 0.0113, "step": 20260 }, { "grad_norm": 0.23456822335720062, "learning_rate": 7.901121182814746e-05, "loss": 0.011, "step": 20270 }, { "grad_norm": 0.2440684735774994, "learning_rate": 7.898876276132736e-05, "loss": 0.0097, "step": 20280 }, { "grad_norm": 0.24227331578731537, "learning_rate": 7.896630488848654e-05, "loss": 0.0088, "step": 20290 }, { "grad_norm": 0.3493794798851013, "learning_rate": 7.89438382164471e-05, "loss": 0.0109, "step": 20300 }, { "grad_norm": 0.2959541380405426, "learning_rate": 7.892136275203383e-05, "loss": 0.0101, "step": 20310 }, { "grad_norm": 0.2868903577327728, "learning_rate": 7.889887850207418e-05, "loss": 0.0111, "step": 20320 }, { "grad_norm": 0.23743665218353271, "learning_rate": 7.887638547339827e-05, "loss": 0.0117, "step": 20330 }, { "grad_norm": 0.3137747347354889, "learning_rate": 7.885388367283891e-05, "loss": 0.0176, "step": 20340 }, { "grad_norm": 0.320621520280838, "learning_rate": 7.88313731072315e-05, "loss": 0.0119, "step": 20350 }, { "grad_norm": 0.2784513831138611, "learning_rate": 7.88088537834142e-05, "loss": 0.0108, "step": 20360 }, { "grad_norm": 0.21401779353618622, "learning_rate": 7.878632570822778e-05, "loss": 0.01, "step": 20370 }, { "grad_norm": 0.3468092679977417, "learning_rate": 7.876378888851567e-05, "loss": 0.009, "step": 20380 }, { "grad_norm": 0.32845574617385864, "learning_rate": 7.874124333112396e-05, "loss": 0.0128, "step": 20390 }, { "grad_norm": 0.26932287216186523, "learning_rate": 7.871868904290138e-05, "loss": 0.0088, "step": 20400 }, { "grad_norm": 0.30503544211387634, "learning_rate": 7.869612603069935e-05, "loss": 0.0118, "step": 20410 }, { "grad_norm": 0.2474668323993683, "learning_rate": 7.867355430137192e-05, "loss": 0.0095, "step": 20420 }, { "grad_norm": 0.2821892499923706, "learning_rate": 7.865097386177577e-05, "loss": 0.0105, "step": 20430 }, { "grad_norm": 0.30608218908309937, "learning_rate": 7.862838471877023e-05, "loss": 0.0123, "step": 20440 }, { "grad_norm": 0.33611175417900085, "learning_rate": 7.860578687921731e-05, "loss": 0.0103, "step": 20450 }, { "grad_norm": 0.2612968683242798, "learning_rate": 7.858318034998164e-05, "loss": 0.0107, "step": 20460 }, { "grad_norm": 0.32673442363739014, "learning_rate": 7.856056513793046e-05, "loss": 0.0128, "step": 20470 }, { "grad_norm": 0.2968776822090149, "learning_rate": 7.85379412499337e-05, "loss": 0.0103, "step": 20480 }, { "grad_norm": 0.26962485909461975, "learning_rate": 7.851530869286389e-05, "loss": 0.0094, "step": 20490 }, { "grad_norm": 0.2564465403556824, "learning_rate": 7.849266747359619e-05, "loss": 0.009, "step": 20500 }, { "grad_norm": 0.2763496935367584, "learning_rate": 7.847001759900843e-05, "loss": 0.0136, "step": 20510 }, { "grad_norm": 0.34941181540489197, "learning_rate": 7.844735907598102e-05, "loss": 0.0123, "step": 20520 }, { "grad_norm": 0.3446839153766632, "learning_rate": 7.842469191139703e-05, "loss": 0.0097, "step": 20530 }, { "grad_norm": 0.24879102408885956, "learning_rate": 7.840201611214215e-05, "loss": 0.0118, "step": 20540 }, { "grad_norm": 0.2297147810459137, "learning_rate": 7.837933168510469e-05, "loss": 0.0116, "step": 20550 }, { "grad_norm": 0.2655164301395416, "learning_rate": 7.835663863717559e-05, "loss": 0.0131, "step": 20560 }, { "grad_norm": 0.29914048314094543, "learning_rate": 7.833393697524838e-05, "loss": 0.0107, "step": 20570 }, { "grad_norm": 0.30555757880210876, "learning_rate": 7.831122670621922e-05, "loss": 0.0144, "step": 20580 }, { "grad_norm": 0.26726293563842773, "learning_rate": 7.82885078369869e-05, "loss": 0.01, "step": 20590 }, { "grad_norm": 0.326022744178772, "learning_rate": 7.826578037445283e-05, "loss": 0.0132, "step": 20600 }, { "grad_norm": 0.3944125175476074, "learning_rate": 7.824304432552097e-05, "loss": 0.0126, "step": 20610 }, { "grad_norm": 0.24064595997333527, "learning_rate": 7.822029969709798e-05, "loss": 0.0101, "step": 20620 }, { "grad_norm": 0.23692484200000763, "learning_rate": 7.819754649609306e-05, "loss": 0.0095, "step": 20630 }, { "grad_norm": 0.32726019620895386, "learning_rate": 7.817478472941802e-05, "loss": 0.0096, "step": 20640 }, { "grad_norm": 0.32033249735832214, "learning_rate": 7.815201440398727e-05, "loss": 0.0097, "step": 20650 }, { "grad_norm": 0.28522205352783203, "learning_rate": 7.812923552671789e-05, "loss": 0.0141, "step": 20660 }, { "grad_norm": 0.39275965094566345, "learning_rate": 7.810644810452945e-05, "loss": 0.0118, "step": 20670 }, { "grad_norm": 0.24330002069473267, "learning_rate": 7.808365214434417e-05, "loss": 0.0104, "step": 20680 }, { "grad_norm": 0.30953559279441833, "learning_rate": 7.80608476530869e-05, "loss": 0.0105, "step": 20690 }, { "grad_norm": 0.24254587292671204, "learning_rate": 7.8038034637685e-05, "loss": 0.0105, "step": 20700 }, { "grad_norm": 0.34047412872314453, "learning_rate": 7.801521310506848e-05, "loss": 0.01, "step": 20710 }, { "grad_norm": 0.2513106167316437, "learning_rate": 7.799238306216994e-05, "loss": 0.0106, "step": 20720 }, { "grad_norm": 0.22358845174312592, "learning_rate": 7.796954451592448e-05, "loss": 0.0114, "step": 20730 }, { "grad_norm": 0.27987927198410034, "learning_rate": 7.794669747326992e-05, "loss": 0.0095, "step": 20740 }, { "grad_norm": 0.29166197776794434, "learning_rate": 7.792384194114654e-05, "loss": 0.0091, "step": 20750 }, { "grad_norm": 0.37438836693763733, "learning_rate": 7.790097792649729e-05, "loss": 0.0102, "step": 20760 }, { "grad_norm": 0.22963950037956238, "learning_rate": 7.787810543626762e-05, "loss": 0.0108, "step": 20770 }, { "grad_norm": 0.3060239553451538, "learning_rate": 7.785522447740558e-05, "loss": 0.0135, "step": 20780 }, { "grad_norm": 0.3001355230808258, "learning_rate": 7.783233505686182e-05, "loss": 0.013, "step": 20790 }, { "grad_norm": 0.24417084455490112, "learning_rate": 7.780943718158955e-05, "loss": 0.0129, "step": 20800 }, { "grad_norm": 0.2219369262456894, "learning_rate": 7.778653085854453e-05, "loss": 0.011, "step": 20810 }, { "grad_norm": 0.2586994469165802, "learning_rate": 7.77636160946851e-05, "loss": 0.0099, "step": 20820 }, { "grad_norm": 0.33123070001602173, "learning_rate": 7.774069289697215e-05, "loss": 0.0093, "step": 20830 }, { "grad_norm": 0.24347035586833954, "learning_rate": 7.771776127236913e-05, "loss": 0.0109, "step": 20840 }, { "grad_norm": 0.23947462439537048, "learning_rate": 7.769482122784212e-05, "loss": 0.0107, "step": 20850 }, { "grad_norm": 0.2801704406738281, "learning_rate": 7.767187277035963e-05, "loss": 0.0083, "step": 20860 }, { "grad_norm": 0.3863893449306488, "learning_rate": 7.764891590689285e-05, "loss": 0.0099, "step": 20870 }, { "grad_norm": 0.31459543108940125, "learning_rate": 7.762595064441542e-05, "loss": 0.0091, "step": 20880 }, { "grad_norm": 0.21968580782413483, "learning_rate": 7.760297698990362e-05, "loss": 0.0086, "step": 20890 }, { "grad_norm": 0.27186062932014465, "learning_rate": 7.757999495033623e-05, "loss": 0.0105, "step": 20900 }, { "grad_norm": 0.23455645143985748, "learning_rate": 7.755700453269456e-05, "loss": 0.0112, "step": 20910 }, { "grad_norm": 0.3652326762676239, "learning_rate": 7.753400574396254e-05, "loss": 0.0106, "step": 20920 }, { "grad_norm": 0.22082045674324036, "learning_rate": 7.751099859112655e-05, "loss": 0.0113, "step": 20930 }, { "grad_norm": 0.22481656074523926, "learning_rate": 7.748798308117557e-05, "loss": 0.0102, "step": 20940 }, { "grad_norm": 0.23719453811645508, "learning_rate": 7.746495922110112e-05, "loss": 0.0098, "step": 20950 }, { "grad_norm": 0.32906338572502136, "learning_rate": 7.744192701789723e-05, "loss": 0.0096, "step": 20960 }, { "grad_norm": 0.30039024353027344, "learning_rate": 7.741888647856046e-05, "loss": 0.0123, "step": 20970 }, { "grad_norm": 0.22510989010334015, "learning_rate": 7.739583761008994e-05, "loss": 0.0104, "step": 20980 }, { "grad_norm": 0.28414151072502136, "learning_rate": 7.73727804194873e-05, "loss": 0.0136, "step": 20990 }, { "grad_norm": 0.42576834559440613, "learning_rate": 7.734971491375671e-05, "loss": 0.0163, "step": 21000 }, { "grad_norm": 0.2854042649269104, "learning_rate": 7.732664109990485e-05, "loss": 0.0093, "step": 21010 }, { "grad_norm": 0.2741122841835022, "learning_rate": 7.730355898494095e-05, "loss": 0.0089, "step": 21020 }, { "grad_norm": 0.2783111333847046, "learning_rate": 7.728046857587673e-05, "loss": 0.011, "step": 21030 }, { "grad_norm": 0.2575124502182007, "learning_rate": 7.725736987972647e-05, "loss": 0.0087, "step": 21040 }, { "grad_norm": 0.4278194308280945, "learning_rate": 7.723426290350691e-05, "loss": 0.0096, "step": 21050 }, { "grad_norm": 0.3166457414627075, "learning_rate": 7.721114765423736e-05, "loss": 0.012, "step": 21060 }, { "grad_norm": 0.2779615819454193, "learning_rate": 7.718802413893963e-05, "loss": 0.0097, "step": 21070 }, { "grad_norm": 0.28359705209732056, "learning_rate": 7.716489236463802e-05, "loss": 0.0147, "step": 21080 }, { "grad_norm": 0.25712549686431885, "learning_rate": 7.714175233835936e-05, "loss": 0.0098, "step": 21090 }, { "grad_norm": 0.2537403106689453, "learning_rate": 7.711860406713299e-05, "loss": 0.0113, "step": 21100 }, { "grad_norm": 0.3189406394958496, "learning_rate": 7.70954475579907e-05, "loss": 0.0108, "step": 21110 }, { "grad_norm": 0.3373509645462036, "learning_rate": 7.707228281796688e-05, "loss": 0.0116, "step": 21120 }, { "grad_norm": 0.21083569526672363, "learning_rate": 7.704910985409833e-05, "loss": 0.0129, "step": 21130 }, { "grad_norm": 0.34385946393013, "learning_rate": 7.702592867342439e-05, "loss": 0.011, "step": 21140 }, { "grad_norm": 0.32396259903907776, "learning_rate": 7.700273928298691e-05, "loss": 0.0113, "step": 21150 }, { "grad_norm": 0.23247189819812775, "learning_rate": 7.697954168983021e-05, "loss": 0.0086, "step": 21160 }, { "grad_norm": 0.34575891494750977, "learning_rate": 7.695633590100109e-05, "loss": 0.0122, "step": 21170 }, { "grad_norm": 0.2856544852256775, "learning_rate": 7.693312192354886e-05, "loss": 0.0101, "step": 21180 }, { "grad_norm": 0.36304816603660583, "learning_rate": 7.690989976452532e-05, "loss": 0.0104, "step": 21190 }, { "grad_norm": 0.48910048604011536, "learning_rate": 7.688666943098475e-05, "loss": 0.0114, "step": 21200 }, { "grad_norm": 0.2329770177602768, "learning_rate": 7.686343092998389e-05, "loss": 0.0093, "step": 21210 }, { "grad_norm": 0.3122267723083496, "learning_rate": 7.684018426858202e-05, "loss": 0.0094, "step": 21220 }, { "grad_norm": 0.38143762946128845, "learning_rate": 7.681692945384084e-05, "loss": 0.0126, "step": 21230 }, { "grad_norm": 0.2841351330280304, "learning_rate": 7.679366649282456e-05, "loss": 0.0112, "step": 21240 }, { "grad_norm": 0.20977483689785004, "learning_rate": 7.677039539259983e-05, "loss": 0.01, "step": 21250 }, { "grad_norm": 0.3063945472240448, "learning_rate": 7.674711616023581e-05, "loss": 0.012, "step": 21260 }, { "grad_norm": 0.33445024490356445, "learning_rate": 7.672382880280413e-05, "loss": 0.0156, "step": 21270 }, { "grad_norm": 0.2996259927749634, "learning_rate": 7.670053332737885e-05, "loss": 0.0115, "step": 21280 }, { "grad_norm": 0.299744188785553, "learning_rate": 7.667722974103654e-05, "loss": 0.0115, "step": 21290 }, { "grad_norm": 0.2751113176345825, "learning_rate": 7.66539180508562e-05, "loss": 0.0152, "step": 21300 }, { "grad_norm": 0.32271459698677063, "learning_rate": 7.663059826391932e-05, "loss": 0.0103, "step": 21310 }, { "grad_norm": 0.36063945293426514, "learning_rate": 7.660727038730981e-05, "loss": 0.0124, "step": 21320 }, { "grad_norm": 0.45738816261291504, "learning_rate": 7.65839344281141e-05, "loss": 0.0126, "step": 21330 }, { "grad_norm": 0.36818403005599976, "learning_rate": 7.656059039342101e-05, "loss": 0.0151, "step": 21340 }, { "grad_norm": 0.25361382961273193, "learning_rate": 7.653723829032187e-05, "loss": 0.0123, "step": 21350 }, { "grad_norm": 0.34046679735183716, "learning_rate": 7.65138781259104e-05, "loss": 0.0149, "step": 21360 }, { "grad_norm": 0.2896510362625122, "learning_rate": 7.649050990728279e-05, "loss": 0.0144, "step": 21370 }, { "grad_norm": 0.3445018231868744, "learning_rate": 7.646713364153774e-05, "loss": 0.0121, "step": 21380 }, { "grad_norm": 0.3236828148365021, "learning_rate": 7.64437493357763e-05, "loss": 0.0114, "step": 21390 }, { "grad_norm": 0.3111863434314728, "learning_rate": 7.642035699710202e-05, "loss": 0.0154, "step": 21400 }, { "grad_norm": 0.3892368972301483, "learning_rate": 7.639695663262089e-05, "loss": 0.0141, "step": 21410 }, { "grad_norm": 0.34180542826652527, "learning_rate": 7.637354824944128e-05, "loss": 0.0133, "step": 21420 }, { "grad_norm": 0.4371061623096466, "learning_rate": 7.635013185467408e-05, "loss": 0.0131, "step": 21430 }, { "grad_norm": 0.3792824149131775, "learning_rate": 7.632670745543256e-05, "loss": 0.0157, "step": 21440 }, { "grad_norm": 0.2838890850543976, "learning_rate": 7.630327505883242e-05, "loss": 0.0177, "step": 21450 }, { "grad_norm": 0.2905552387237549, "learning_rate": 7.627983467199182e-05, "loss": 0.0133, "step": 21460 }, { "grad_norm": 0.31462568044662476, "learning_rate": 7.625638630203132e-05, "loss": 0.0173, "step": 21470 }, { "grad_norm": 0.28866228461265564, "learning_rate": 7.623292995607394e-05, "loss": 0.0151, "step": 21480 }, { "grad_norm": 0.33535662293434143, "learning_rate": 7.620946564124507e-05, "loss": 0.0145, "step": 21490 }, { "grad_norm": 0.30741414427757263, "learning_rate": 7.618599336467256e-05, "loss": 0.0134, "step": 21500 }, { "grad_norm": 0.34034204483032227, "learning_rate": 7.616251313348666e-05, "loss": 0.0135, "step": 21510 }, { "grad_norm": 0.2424885630607605, "learning_rate": 7.613902495482005e-05, "loss": 0.014, "step": 21520 }, { "grad_norm": 0.23943130671977997, "learning_rate": 7.611552883580784e-05, "loss": 0.0098, "step": 21530 }, { "grad_norm": 0.29430538415908813, "learning_rate": 7.609202478358748e-05, "loss": 0.0134, "step": 21540 }, { "grad_norm": 0.2681763768196106, "learning_rate": 7.606851280529895e-05, "loss": 0.0092, "step": 21550 }, { "grad_norm": 0.28645193576812744, "learning_rate": 7.604499290808449e-05, "loss": 0.0096, "step": 21560 }, { "grad_norm": 0.2646980583667755, "learning_rate": 7.602146509908888e-05, "loss": 0.0122, "step": 21570 }, { "grad_norm": 0.2294079065322876, "learning_rate": 7.599792938545921e-05, "loss": 0.0119, "step": 21580 }, { "grad_norm": 0.3118074834346771, "learning_rate": 7.597438577434506e-05, "loss": 0.0136, "step": 21590 }, { "grad_norm": 0.28577011823654175, "learning_rate": 7.595083427289831e-05, "loss": 0.0099, "step": 21600 }, { "grad_norm": 0.3082162141799927, "learning_rate": 7.59272748882733e-05, "loss": 0.0124, "step": 21610 }, { "grad_norm": 0.4013248682022095, "learning_rate": 7.590370762762675e-05, "loss": 0.0118, "step": 21620 }, { "grad_norm": 0.3080922067165375, "learning_rate": 7.588013249811777e-05, "loss": 0.0161, "step": 21630 }, { "grad_norm": 0.39616358280181885, "learning_rate": 7.585654950690786e-05, "loss": 0.0124, "step": 21640 }, { "grad_norm": 0.24922987818717957, "learning_rate": 7.583295866116091e-05, "loss": 0.0116, "step": 21650 }, { "grad_norm": 0.27102354168891907, "learning_rate": 7.580935996804321e-05, "loss": 0.0128, "step": 21660 }, { "grad_norm": 0.37369731068611145, "learning_rate": 7.57857534347234e-05, "loss": 0.0116, "step": 21670 }, { "grad_norm": 0.274298757314682, "learning_rate": 7.576213906837254e-05, "loss": 0.0106, "step": 21680 }, { "grad_norm": 0.27538877725601196, "learning_rate": 7.573851687616403e-05, "loss": 0.0123, "step": 21690 }, { "grad_norm": 0.31631436944007874, "learning_rate": 7.571488686527368e-05, "loss": 0.0135, "step": 21700 }, { "grad_norm": 0.30979371070861816, "learning_rate": 7.569124904287968e-05, "loss": 0.0109, "step": 21710 }, { "grad_norm": 0.24036063253879547, "learning_rate": 7.566760341616254e-05, "loss": 0.0103, "step": 21720 }, { "grad_norm": 0.24478644132614136, "learning_rate": 7.564394999230519e-05, "loss": 0.0124, "step": 21730 }, { "grad_norm": 0.33754533529281616, "learning_rate": 7.562028877849294e-05, "loss": 0.0106, "step": 21740 }, { "grad_norm": 0.3073609471321106, "learning_rate": 7.559661978191341e-05, "loss": 0.0121, "step": 21750 }, { "grad_norm": 0.2695678472518921, "learning_rate": 7.557294300975664e-05, "loss": 0.0113, "step": 21760 }, { "grad_norm": 0.2535015046596527, "learning_rate": 7.554925846921499e-05, "loss": 0.0108, "step": 21770 }, { "grad_norm": 0.23987755179405212, "learning_rate": 7.552556616748321e-05, "loss": 0.0114, "step": 21780 }, { "grad_norm": 0.2868041396141052, "learning_rate": 7.550186611175838e-05, "loss": 0.0111, "step": 21790 }, { "grad_norm": 0.2640947997570038, "learning_rate": 7.547815830923998e-05, "loss": 0.0114, "step": 21800 }, { "grad_norm": 0.36012017726898193, "learning_rate": 7.54544427671298e-05, "loss": 0.0126, "step": 21810 }, { "grad_norm": 0.2856258451938629, "learning_rate": 7.543071949263198e-05, "loss": 0.0111, "step": 21820 }, { "grad_norm": 0.27696359157562256, "learning_rate": 7.540698849295305e-05, "loss": 0.0112, "step": 21830 }, { "grad_norm": 0.26459866762161255, "learning_rate": 7.538324977530183e-05, "loss": 0.0097, "step": 21840 }, { "grad_norm": 0.3199159801006317, "learning_rate": 7.535950334688955e-05, "loss": 0.0122, "step": 21850 }, { "grad_norm": 0.29098185896873474, "learning_rate": 7.533574921492972e-05, "loss": 0.0103, "step": 21860 }, { "grad_norm": 0.3367266356945038, "learning_rate": 7.531198738663824e-05, "loss": 0.0106, "step": 21870 }, { "grad_norm": 0.30199435353279114, "learning_rate": 7.528821786923333e-05, "loss": 0.0109, "step": 21880 }, { "grad_norm": 0.32572558522224426, "learning_rate": 7.52644406699355e-05, "loss": 0.0107, "step": 21890 }, { "grad_norm": 0.23641718924045563, "learning_rate": 7.524065579596766e-05, "loss": 0.0117, "step": 21900 }, { "grad_norm": 0.29985103011131287, "learning_rate": 7.521686325455506e-05, "loss": 0.0107, "step": 21910 }, { "grad_norm": 0.32768669724464417, "learning_rate": 7.51930630529252e-05, "loss": 0.0085, "step": 21920 }, { "grad_norm": 0.3467303216457367, "learning_rate": 7.516925519830797e-05, "loss": 0.0149, "step": 21930 }, { "grad_norm": 0.3408523499965668, "learning_rate": 7.514543969793557e-05, "loss": 0.0114, "step": 21940 }, { "grad_norm": 0.3072498142719269, "learning_rate": 7.512161655904251e-05, "loss": 0.0122, "step": 21950 }, { "grad_norm": 0.3192087709903717, "learning_rate": 7.509778578886563e-05, "loss": 0.0108, "step": 21960 }, { "grad_norm": 0.3271007537841797, "learning_rate": 7.507394739464412e-05, "loss": 0.008, "step": 21970 }, { "grad_norm": 0.3040149211883545, "learning_rate": 7.50501013836194e-05, "loss": 0.0164, "step": 21980 }, { "grad_norm": 0.23984147608280182, "learning_rate": 7.50262477630353e-05, "loss": 0.0102, "step": 21990 }, { "grad_norm": 0.25129538774490356, "learning_rate": 7.500238654013794e-05, "loss": 0.0097, "step": 22000 }, { "grad_norm": 0.2837453782558441, "learning_rate": 7.497851772217566e-05, "loss": 0.0099, "step": 22010 }, { "grad_norm": 0.2572399377822876, "learning_rate": 7.495464131639924e-05, "loss": 0.0101, "step": 22020 }, { "grad_norm": 0.31302526593208313, "learning_rate": 7.493075733006166e-05, "loss": 0.0115, "step": 22030 }, { "grad_norm": 0.27019184827804565, "learning_rate": 7.490686577041828e-05, "loss": 0.0099, "step": 22040 }, { "grad_norm": 0.20474080741405487, "learning_rate": 7.488296664472668e-05, "loss": 0.011, "step": 22050 }, { "grad_norm": 0.2632288932800293, "learning_rate": 7.485905996024682e-05, "loss": 0.0082, "step": 22060 }, { "grad_norm": 0.31164175271987915, "learning_rate": 7.483514572424093e-05, "loss": 0.0089, "step": 22070 }, { "grad_norm": 0.2798174023628235, "learning_rate": 7.481122394397349e-05, "loss": 0.0094, "step": 22080 }, { "grad_norm": 0.2775808870792389, "learning_rate": 7.478729462671131e-05, "loss": 0.0101, "step": 22090 }, { "grad_norm": 0.2715277373790741, "learning_rate": 7.47633577797235e-05, "loss": 0.0092, "step": 22100 }, { "grad_norm": 0.29180750250816345, "learning_rate": 7.473941341028144e-05, "loss": 0.0136, "step": 22110 }, { "grad_norm": 0.28126060962677, "learning_rate": 7.471546152565879e-05, "loss": 0.0103, "step": 22120 }, { "grad_norm": 0.22522731125354767, "learning_rate": 7.46915021331315e-05, "loss": 0.0093, "step": 22130 }, { "grad_norm": 0.28509920835494995, "learning_rate": 7.466753523997778e-05, "loss": 0.0099, "step": 22140 }, { "grad_norm": 0.20971417427062988, "learning_rate": 7.464356085347819e-05, "loss": 0.0112, "step": 22150 }, { "grad_norm": 0.24829228222370148, "learning_rate": 7.461957898091548e-05, "loss": 0.0108, "step": 22160 }, { "grad_norm": 0.28717026114463806, "learning_rate": 7.459558962957473e-05, "loss": 0.0084, "step": 22170 }, { "grad_norm": 0.19303816556930542, "learning_rate": 7.457159280674326e-05, "loss": 0.0104, "step": 22180 }, { "grad_norm": 0.19239214062690735, "learning_rate": 7.454758851971066e-05, "loss": 0.0086, "step": 22190 }, { "grad_norm": 0.24407100677490234, "learning_rate": 7.45235767757688e-05, "loss": 0.0113, "step": 22200 }, { "grad_norm": 0.28287962079048157, "learning_rate": 7.449955758221183e-05, "loss": 0.0088, "step": 22210 }, { "grad_norm": 0.2901305854320526, "learning_rate": 7.447553094633615e-05, "loss": 0.0114, "step": 22220 }, { "grad_norm": 0.32799839973449707, "learning_rate": 7.445149687544039e-05, "loss": 0.0106, "step": 22230 }, { "grad_norm": 0.281703382730484, "learning_rate": 7.44274553768255e-05, "loss": 0.0107, "step": 22240 }, { "grad_norm": 0.35231515765190125, "learning_rate": 7.440340645779464e-05, "loss": 0.0109, "step": 22250 }, { "grad_norm": 0.21387960016727448, "learning_rate": 7.437935012565322e-05, "loss": 0.0118, "step": 22260 }, { "grad_norm": 0.2847549021244049, "learning_rate": 7.435528638770893e-05, "loss": 0.0091, "step": 22270 }, { "grad_norm": 0.26194649934768677, "learning_rate": 7.433121525127171e-05, "loss": 0.0098, "step": 22280 }, { "grad_norm": 0.24325090646743774, "learning_rate": 7.430713672365371e-05, "loss": 0.0089, "step": 22290 }, { "grad_norm": 0.24431732296943665, "learning_rate": 7.428305081216938e-05, "loss": 0.0077, "step": 22300 }, { "grad_norm": 0.24896951019763947, "learning_rate": 7.425895752413536e-05, "loss": 0.0082, "step": 22310 }, { "grad_norm": 0.18142609298229218, "learning_rate": 7.423485686687057e-05, "loss": 0.0113, "step": 22320 }, { "grad_norm": 0.2753553092479706, "learning_rate": 7.421074884769616e-05, "loss": 0.0117, "step": 22330 }, { "grad_norm": 0.2827988266944885, "learning_rate": 7.418663347393548e-05, "loss": 0.0109, "step": 22340 }, { "grad_norm": 0.29138875007629395, "learning_rate": 7.416251075291418e-05, "loss": 0.0102, "step": 22350 }, { "grad_norm": 0.287031352519989, "learning_rate": 7.413838069196007e-05, "loss": 0.008, "step": 22360 }, { "grad_norm": 0.2766510248184204, "learning_rate": 7.411424329840324e-05, "loss": 0.0109, "step": 22370 }, { "grad_norm": 0.3620835244655609, "learning_rate": 7.409009857957601e-05, "loss": 0.0132, "step": 22380 }, { "grad_norm": 0.20657455921173096, "learning_rate": 7.40659465428129e-05, "loss": 0.0105, "step": 22390 }, { "grad_norm": 0.31867966055870056, "learning_rate": 7.404178719545063e-05, "loss": 0.0116, "step": 22400 }, { "grad_norm": 0.42490971088409424, "learning_rate": 7.401762054482822e-05, "loss": 0.0125, "step": 22410 }, { "grad_norm": 0.2265114039182663, "learning_rate": 7.39934465982868e-05, "loss": 0.0098, "step": 22420 }, { "grad_norm": 0.2840220034122467, "learning_rate": 7.396926536316984e-05, "loss": 0.0102, "step": 22430 }, { "grad_norm": 0.28045937418937683, "learning_rate": 7.394507684682293e-05, "loss": 0.0086, "step": 22440 }, { "grad_norm": 0.294400155544281, "learning_rate": 7.392088105659393e-05, "loss": 0.0153, "step": 22450 }, { "grad_norm": 0.2399234175682068, "learning_rate": 7.389667799983284e-05, "loss": 0.0127, "step": 22460 }, { "grad_norm": 0.2573299705982208, "learning_rate": 7.387246768389193e-05, "loss": 0.0099, "step": 22470 }, { "grad_norm": 0.362834632396698, "learning_rate": 7.384825011612563e-05, "loss": 0.0102, "step": 22480 }, { "grad_norm": 0.37360212206840515, "learning_rate": 7.382402530389066e-05, "loss": 0.0108, "step": 22490 }, { "grad_norm": 0.28495290875434875, "learning_rate": 7.379979325454582e-05, "loss": 0.0091, "step": 22500 }, { "grad_norm": 0.3052418529987335, "learning_rate": 7.37755539754522e-05, "loss": 0.0097, "step": 22510 }, { "grad_norm": 0.27458980679512024, "learning_rate": 7.375130747397302e-05, "loss": 0.0096, "step": 22520 }, { "grad_norm": 0.3065735399723053, "learning_rate": 7.372705375747377e-05, "loss": 0.0108, "step": 22530 }, { "grad_norm": 0.2778896689414978, "learning_rate": 7.370279283332205e-05, "loss": 0.0081, "step": 22540 }, { "grad_norm": 0.29692456126213074, "learning_rate": 7.36785247088877e-05, "loss": 0.0088, "step": 22550 }, { "grad_norm": 0.24722807109355927, "learning_rate": 7.365424939154275e-05, "loss": 0.0081, "step": 22560 }, { "grad_norm": 0.2320321798324585, "learning_rate": 7.362996688866138e-05, "loss": 0.0109, "step": 22570 }, { "grad_norm": 0.18126583099365234, "learning_rate": 7.360567720761999e-05, "loss": 0.0071, "step": 22580 }, { "grad_norm": 0.22640566527843475, "learning_rate": 7.358138035579711e-05, "loss": 0.0084, "step": 22590 }, { "grad_norm": 0.2487303614616394, "learning_rate": 7.355707634057354e-05, "loss": 0.0091, "step": 22600 }, { "grad_norm": 0.34879443049430847, "learning_rate": 7.353276516933215e-05, "loss": 0.0103, "step": 22610 }, { "grad_norm": 0.2628817856311798, "learning_rate": 7.350844684945806e-05, "loss": 0.0088, "step": 22620 }, { "grad_norm": 0.21212516725063324, "learning_rate": 7.348412138833851e-05, "loss": 0.0074, "step": 22630 }, { "grad_norm": 0.3087599575519562, "learning_rate": 7.345978879336295e-05, "loss": 0.0113, "step": 22640 }, { "grad_norm": 0.27432340383529663, "learning_rate": 7.343544907192296e-05, "loss": 0.0101, "step": 22650 }, { "grad_norm": 0.25573548674583435, "learning_rate": 7.341110223141235e-05, "loss": 0.0096, "step": 22660 }, { "grad_norm": 0.2608964145183563, "learning_rate": 7.3386748279227e-05, "loss": 0.0097, "step": 22670 }, { "grad_norm": 0.2299220860004425, "learning_rate": 7.336238722276501e-05, "loss": 0.0102, "step": 22680 }, { "grad_norm": 0.26084446907043457, "learning_rate": 7.333801906942663e-05, "loss": 0.01, "step": 22690 }, { "grad_norm": 0.35514795780181885, "learning_rate": 7.331364382661428e-05, "loss": 0.0135, "step": 22700 }, { "grad_norm": 0.26130834221839905, "learning_rate": 7.328926150173248e-05, "loss": 0.0123, "step": 22710 }, { "grad_norm": 0.30046066641807556, "learning_rate": 7.326487210218795e-05, "loss": 0.0106, "step": 22720 }, { "grad_norm": 0.2628723084926605, "learning_rate": 7.324047563538955e-05, "loss": 0.0124, "step": 22730 }, { "grad_norm": 0.3095598816871643, "learning_rate": 7.321607210874828e-05, "loss": 0.012, "step": 22740 }, { "grad_norm": 0.23997288942337036, "learning_rate": 7.31916615296773e-05, "loss": 0.0107, "step": 22750 }, { "grad_norm": 0.27912813425064087, "learning_rate": 7.316724390559188e-05, "loss": 0.0109, "step": 22760 }, { "grad_norm": 0.20605476200580597, "learning_rate": 7.314281924390946e-05, "loss": 0.01, "step": 22770 }, { "grad_norm": 0.3583897054195404, "learning_rate": 7.311838755204959e-05, "loss": 0.0096, "step": 22780 }, { "grad_norm": 0.24017024040222168, "learning_rate": 7.3093948837434e-05, "loss": 0.013, "step": 22790 }, { "grad_norm": 0.3490748107433319, "learning_rate": 7.306950310748651e-05, "loss": 0.0109, "step": 22800 }, { "grad_norm": 0.3087576925754547, "learning_rate": 7.304505036963311e-05, "loss": 0.0126, "step": 22810 }, { "grad_norm": 0.2967706024646759, "learning_rate": 7.302059063130186e-05, "loss": 0.0112, "step": 22820 }, { "grad_norm": 0.2868776321411133, "learning_rate": 7.2996123899923e-05, "loss": 0.009, "step": 22830 }, { "grad_norm": 0.22777420282363892, "learning_rate": 7.297165018292886e-05, "loss": 0.01, "step": 22840 }, { "grad_norm": 0.19153814017772675, "learning_rate": 7.294716948775396e-05, "loss": 0.0076, "step": 22850 }, { "grad_norm": 0.2611812949180603, "learning_rate": 7.292268182183484e-05, "loss": 0.01, "step": 22860 }, { "grad_norm": 0.21843092143535614, "learning_rate": 7.28981871926102e-05, "loss": 0.008, "step": 22870 }, { "grad_norm": 0.2903699576854706, "learning_rate": 7.28736856075209e-05, "loss": 0.0097, "step": 22880 }, { "grad_norm": 0.23961956799030304, "learning_rate": 7.284917707400985e-05, "loss": 0.0122, "step": 22890 }, { "grad_norm": 0.2326366901397705, "learning_rate": 7.282466159952212e-05, "loss": 0.0102, "step": 22900 }, { "grad_norm": 0.3050483763217926, "learning_rate": 7.280013919150483e-05, "loss": 0.0113, "step": 22910 }, { "grad_norm": 0.2711528539657593, "learning_rate": 7.277560985740728e-05, "loss": 0.011, "step": 22920 }, { "grad_norm": 0.2670758366584778, "learning_rate": 7.275107360468079e-05, "loss": 0.0097, "step": 22930 }, { "grad_norm": 0.3290386199951172, "learning_rate": 7.272653044077885e-05, "loss": 0.0079, "step": 22940 }, { "grad_norm": 0.3536282777786255, "learning_rate": 7.270198037315703e-05, "loss": 0.0102, "step": 22950 }, { "grad_norm": 0.24289318919181824, "learning_rate": 7.267742340927297e-05, "loss": 0.0092, "step": 22960 }, { "grad_norm": 0.27409830689430237, "learning_rate": 7.265285955658645e-05, "loss": 0.0075, "step": 22970 }, { "grad_norm": 0.26687631011009216, "learning_rate": 7.26282888225593e-05, "loss": 0.0115, "step": 22980 }, { "grad_norm": 0.1967734843492508, "learning_rate": 7.260371121465548e-05, "loss": 0.0087, "step": 22990 }, { "grad_norm": 0.32088401913642883, "learning_rate": 7.2579126740341e-05, "loss": 0.0104, "step": 23000 }, { "grad_norm": 0.3110400438308716, "learning_rate": 7.2554535407084e-05, "loss": 0.0109, "step": 23010 }, { "grad_norm": 0.2756114900112152, "learning_rate": 7.252993722235464e-05, "loss": 0.0096, "step": 23020 }, { "grad_norm": 0.24227185547351837, "learning_rate": 7.250533219362523e-05, "loss": 0.0098, "step": 23030 }, { "grad_norm": 0.21470095217227936, "learning_rate": 7.248072032837012e-05, "loss": 0.0108, "step": 23040 }, { "grad_norm": 0.6737625598907471, "learning_rate": 7.245610163406575e-05, "loss": 0.0153, "step": 23050 }, { "grad_norm": 0.31704360246658325, "learning_rate": 7.243147611819061e-05, "loss": 0.0119, "step": 23060 }, { "grad_norm": 0.2883269488811493, "learning_rate": 7.240684378822531e-05, "loss": 0.0147, "step": 23070 }, { "grad_norm": 0.26238542795181274, "learning_rate": 7.238220465165248e-05, "loss": 0.0117, "step": 23080 }, { "grad_norm": 0.42589902877807617, "learning_rate": 7.235755871595684e-05, "loss": 0.0102, "step": 23090 }, { "grad_norm": 0.34058448672294617, "learning_rate": 7.233290598862517e-05, "loss": 0.0118, "step": 23100 }, { "grad_norm": 0.29225847125053406, "learning_rate": 7.230824647714635e-05, "loss": 0.0114, "step": 23110 }, { "grad_norm": 0.2492419332265854, "learning_rate": 7.228358018901124e-05, "loss": 0.0117, "step": 23120 }, { "grad_norm": 0.27833303809165955, "learning_rate": 7.225890713171286e-05, "loss": 0.0087, "step": 23130 }, { "grad_norm": 0.2806122601032257, "learning_rate": 7.223422731274618e-05, "loss": 0.01, "step": 23140 }, { "grad_norm": 0.2733665406703949, "learning_rate": 7.220954073960832e-05, "loss": 0.0126, "step": 23150 }, { "grad_norm": 0.295369952917099, "learning_rate": 7.218484741979838e-05, "loss": 0.0119, "step": 23160 }, { "grad_norm": 0.24726513028144836, "learning_rate": 7.216014736081756e-05, "loss": 0.0101, "step": 23170 }, { "grad_norm": 0.27534812688827515, "learning_rate": 7.213544057016906e-05, "loss": 0.012, "step": 23180 }, { "grad_norm": 0.29914525151252747, "learning_rate": 7.211072705535819e-05, "loss": 0.0091, "step": 23190 }, { "grad_norm": 0.3329485058784485, "learning_rate": 7.208600682389224e-05, "loss": 0.0116, "step": 23200 }, { "grad_norm": 0.26701098680496216, "learning_rate": 7.206127988328055e-05, "loss": 0.0147, "step": 23210 }, { "grad_norm": 0.20883502066135406, "learning_rate": 7.203654624103453e-05, "loss": 0.0107, "step": 23220 }, { "grad_norm": 0.3203555941581726, "learning_rate": 7.201180590466761e-05, "loss": 0.0164, "step": 23230 }, { "grad_norm": 0.288200706243515, "learning_rate": 7.198705888169523e-05, "loss": 0.0128, "step": 23240 }, { "grad_norm": 0.23264338076114655, "learning_rate": 7.196230517963491e-05, "loss": 0.0137, "step": 23250 }, { "grad_norm": 0.24267594516277313, "learning_rate": 7.193754480600615e-05, "loss": 0.0132, "step": 23260 }, { "grad_norm": 0.28565508127212524, "learning_rate": 7.19127777683305e-05, "loss": 0.0089, "step": 23270 }, { "grad_norm": 0.24211053550243378, "learning_rate": 7.188800407413156e-05, "loss": 0.0111, "step": 23280 }, { "grad_norm": 0.272942453622818, "learning_rate": 7.186322373093489e-05, "loss": 0.0125, "step": 23290 }, { "grad_norm": 0.26006919145584106, "learning_rate": 7.18384367462681e-05, "loss": 0.0084, "step": 23300 }, { "grad_norm": 0.2471270114183426, "learning_rate": 7.181364312766085e-05, "loss": 0.0088, "step": 23310 }, { "grad_norm": 0.3497978746891022, "learning_rate": 7.178884288264477e-05, "loss": 0.0099, "step": 23320 }, { "grad_norm": 0.31600794196128845, "learning_rate": 7.176403601875353e-05, "loss": 0.0108, "step": 23330 }, { "grad_norm": 0.23590435087680817, "learning_rate": 7.173922254352279e-05, "loss": 0.0096, "step": 23340 }, { "grad_norm": 0.3787817656993866, "learning_rate": 7.171440246449024e-05, "loss": 0.0108, "step": 23350 }, { "grad_norm": 0.26142868399620056, "learning_rate": 7.168957578919555e-05, "loss": 0.0081, "step": 23360 }, { "grad_norm": 0.2901480793952942, "learning_rate": 7.16647425251804e-05, "loss": 0.0096, "step": 23370 }, { "grad_norm": 0.28418490290641785, "learning_rate": 7.163990267998852e-05, "loss": 0.0103, "step": 23380 }, { "grad_norm": 0.31199508905410767, "learning_rate": 7.161505626116556e-05, "loss": 0.0111, "step": 23390 }, { "grad_norm": 0.3080494701862335, "learning_rate": 7.159020327625923e-05, "loss": 0.0126, "step": 23400 }, { "grad_norm": 0.21442535519599915, "learning_rate": 7.15653437328192e-05, "loss": 0.0099, "step": 23410 }, { "grad_norm": 0.25426360964775085, "learning_rate": 7.154047763839713e-05, "loss": 0.0093, "step": 23420 }, { "grad_norm": 0.21504269540309906, "learning_rate": 7.15156050005467e-05, "loss": 0.0098, "step": 23430 }, { "grad_norm": 0.29118266701698303, "learning_rate": 7.149072582682357e-05, "loss": 0.0095, "step": 23440 }, { "grad_norm": 0.1889866590499878, "learning_rate": 7.146584012478535e-05, "loss": 0.0093, "step": 23450 }, { "grad_norm": 0.24235953390598297, "learning_rate": 7.144094790199169e-05, "loss": 0.0119, "step": 23460 }, { "grad_norm": 0.23944562673568726, "learning_rate": 7.141604916600415e-05, "loss": 0.0098, "step": 23470 }, { "grad_norm": 0.24665893614292145, "learning_rate": 7.139114392438635e-05, "loss": 0.0114, "step": 23480 }, { "grad_norm": 0.2754136621952057, "learning_rate": 7.136623218470382e-05, "loss": 0.01, "step": 23490 }, { "grad_norm": 0.3582766652107239, "learning_rate": 7.13413139545241e-05, "loss": 0.0111, "step": 23500 }, { "grad_norm": 0.3257825970649719, "learning_rate": 7.131638924141668e-05, "loss": 0.0087, "step": 23510 }, { "grad_norm": 0.39841777086257935, "learning_rate": 7.129145805295304e-05, "loss": 0.0117, "step": 23520 }, { "grad_norm": 0.2691058814525604, "learning_rate": 7.126652039670661e-05, "loss": 0.0103, "step": 23530 }, { "grad_norm": 0.30880314111709595, "learning_rate": 7.124157628025278e-05, "loss": 0.0102, "step": 23540 }, { "grad_norm": 0.3092741072177887, "learning_rate": 7.121662571116894e-05, "loss": 0.009, "step": 23550 }, { "grad_norm": 0.28892335295677185, "learning_rate": 7.119166869703441e-05, "loss": 0.0098, "step": 23560 }, { "grad_norm": 0.21413643658161163, "learning_rate": 7.116670524543044e-05, "loss": 0.0108, "step": 23570 }, { "grad_norm": 0.30003151297569275, "learning_rate": 7.114173536394032e-05, "loss": 0.0107, "step": 23580 }, { "grad_norm": 0.23662851750850677, "learning_rate": 7.111675906014917e-05, "loss": 0.0111, "step": 23590 }, { "grad_norm": 0.30537134408950806, "learning_rate": 7.109177634164421e-05, "loss": 0.0122, "step": 23600 }, { "grad_norm": 0.3178028166294098, "learning_rate": 7.106678721601449e-05, "loss": 0.0122, "step": 23610 }, { "grad_norm": 0.2335910052061081, "learning_rate": 7.104179169085103e-05, "loss": 0.0115, "step": 23620 }, { "grad_norm": 0.26295721530914307, "learning_rate": 7.101678977374683e-05, "loss": 0.01, "step": 23630 }, { "grad_norm": 0.32382676005363464, "learning_rate": 7.099178147229685e-05, "loss": 0.0149, "step": 23640 }, { "grad_norm": 0.27652525901794434, "learning_rate": 7.096676679409789e-05, "loss": 0.0115, "step": 23650 }, { "grad_norm": 0.3778683543205261, "learning_rate": 7.094174574674877e-05, "loss": 0.0117, "step": 23660 }, { "grad_norm": 0.4018316864967346, "learning_rate": 7.091671833785025e-05, "loss": 0.0118, "step": 23670 }, { "grad_norm": 0.37560781836509705, "learning_rate": 7.089168457500493e-05, "loss": 0.0097, "step": 23680 }, { "grad_norm": 0.33410680294036865, "learning_rate": 7.086664446581747e-05, "loss": 0.0109, "step": 23690 }, { "grad_norm": 0.28020018339157104, "learning_rate": 7.084159801789438e-05, "loss": 0.0105, "step": 23700 }, { "grad_norm": 0.21257483959197998, "learning_rate": 7.081654523884411e-05, "loss": 0.0076, "step": 23710 }, { "grad_norm": 0.2456098049879074, "learning_rate": 7.0791486136277e-05, "loss": 0.0105, "step": 23720 }, { "grad_norm": 0.24972784519195557, "learning_rate": 7.07664207178054e-05, "loss": 0.0148, "step": 23730 }, { "grad_norm": 0.3468533158302307, "learning_rate": 7.074134899104345e-05, "loss": 0.0085, "step": 23740 }, { "grad_norm": 0.2879554331302643, "learning_rate": 7.071627096360735e-05, "loss": 0.0121, "step": 23750 }, { "grad_norm": 0.2748449444770813, "learning_rate": 7.069118664311511e-05, "loss": 0.0105, "step": 23760 }, { "grad_norm": 0.32097867131233215, "learning_rate": 7.06660960371867e-05, "loss": 0.0106, "step": 23770 }, { "grad_norm": 0.25515103340148926, "learning_rate": 7.064099915344396e-05, "loss": 0.01, "step": 23780 }, { "grad_norm": 0.2812367379665375, "learning_rate": 7.061589599951066e-05, "loss": 0.0112, "step": 23790 }, { "grad_norm": 0.2585762143135071, "learning_rate": 7.05907865830125e-05, "loss": 0.0094, "step": 23800 }, { "grad_norm": 0.3192617893218994, "learning_rate": 7.056567091157703e-05, "loss": 0.0106, "step": 23810 }, { "grad_norm": 0.2826465368270874, "learning_rate": 7.054054899283375e-05, "loss": 0.0097, "step": 23820 }, { "grad_norm": 0.21344923973083496, "learning_rate": 7.051542083441403e-05, "loss": 0.0107, "step": 23830 }, { "grad_norm": 0.2828599810600281, "learning_rate": 7.049028644395113e-05, "loss": 0.0094, "step": 23840 }, { "grad_norm": 0.3610101044178009, "learning_rate": 7.046514582908024e-05, "loss": 0.0143, "step": 23850 }, { "grad_norm": 0.3311883211135864, "learning_rate": 7.043999899743838e-05, "loss": 0.011, "step": 23860 }, { "grad_norm": 0.28398993611335754, "learning_rate": 7.041484595666451e-05, "loss": 0.0119, "step": 23870 }, { "grad_norm": 0.3013603389263153, "learning_rate": 7.038968671439948e-05, "loss": 0.0148, "step": 23880 }, { "grad_norm": 0.2967699468135834, "learning_rate": 7.036452127828596e-05, "loss": 0.0115, "step": 23890 }, { "grad_norm": 0.29695960879325867, "learning_rate": 7.033934965596859e-05, "loss": 0.0097, "step": 23900 }, { "grad_norm": 0.3152150511741638, "learning_rate": 7.031417185509381e-05, "loss": 0.0124, "step": 23910 }, { "grad_norm": 0.23919104039669037, "learning_rate": 7.028898788331e-05, "loss": 0.0131, "step": 23920 }, { "grad_norm": 0.27760595083236694, "learning_rate": 7.026379774826736e-05, "loss": 0.0113, "step": 23930 }, { "grad_norm": 0.2188689410686493, "learning_rate": 7.0238601457618e-05, "loss": 0.0101, "step": 23940 }, { "grad_norm": 0.39000511169433594, "learning_rate": 7.02133990190159e-05, "loss": 0.0108, "step": 23950 }, { "grad_norm": 0.2574511766433716, "learning_rate": 7.018819044011687e-05, "loss": 0.009, "step": 23960 }, { "grad_norm": 0.3448181748390198, "learning_rate": 7.016297572857863e-05, "loss": 0.0089, "step": 23970 }, { "grad_norm": 0.2615724205970764, "learning_rate": 7.013775489206072e-05, "loss": 0.0094, "step": 23980 }, { "grad_norm": 0.2373533844947815, "learning_rate": 7.01125279382246e-05, "loss": 0.0126, "step": 23990 }, { "grad_norm": 0.22340206801891327, "learning_rate": 7.008729487473351e-05, "loss": 0.0114, "step": 24000 }, { "grad_norm": 0.22240720689296722, "learning_rate": 7.006205570925263e-05, "loss": 0.0107, "step": 24010 }, { "grad_norm": 0.24717000126838684, "learning_rate": 7.003681044944892e-05, "loss": 0.0125, "step": 24020 }, { "grad_norm": 0.21538789570331573, "learning_rate": 7.001155910299126e-05, "loss": 0.0115, "step": 24030 }, { "grad_norm": 0.22406692802906036, "learning_rate": 6.99863016775503e-05, "loss": 0.0104, "step": 24040 }, { "grad_norm": 0.2345639169216156, "learning_rate": 6.996103818079859e-05, "loss": 0.0106, "step": 24050 }, { "grad_norm": 0.26302775740623474, "learning_rate": 6.993576862041054e-05, "loss": 0.0095, "step": 24060 }, { "grad_norm": 0.31514737010002136, "learning_rate": 6.991049300406235e-05, "loss": 0.0101, "step": 24070 }, { "grad_norm": 0.268582820892334, "learning_rate": 6.988521133943209e-05, "loss": 0.0086, "step": 24080 }, { "grad_norm": 0.38309189677238464, "learning_rate": 6.985992363419966e-05, "loss": 0.0132, "step": 24090 }, { "grad_norm": 0.2702401280403137, "learning_rate": 6.983462989604682e-05, "loss": 0.008, "step": 24100 }, { "grad_norm": 0.3436410129070282, "learning_rate": 6.980933013265709e-05, "loss": 0.0103, "step": 24110 }, { "grad_norm": 0.27887746691703796, "learning_rate": 6.978402435171592e-05, "loss": 0.0112, "step": 24120 }, { "grad_norm": 0.3148270845413208, "learning_rate": 6.975871256091052e-05, "loss": 0.009, "step": 24130 }, { "grad_norm": 0.3671909272670746, "learning_rate": 6.973339476792995e-05, "loss": 0.0083, "step": 24140 }, { "grad_norm": 0.29265913367271423, "learning_rate": 6.970807098046505e-05, "loss": 0.0095, "step": 24150 }, { "grad_norm": 0.35181355476379395, "learning_rate": 6.968274120620858e-05, "loss": 0.0095, "step": 24160 }, { "grad_norm": 0.27665621042251587, "learning_rate": 6.965740545285499e-05, "loss": 0.0095, "step": 24170 }, { "grad_norm": 0.3085362911224365, "learning_rate": 6.963206372810068e-05, "loss": 0.0094, "step": 24180 }, { "grad_norm": 0.3252646327018738, "learning_rate": 6.960671603964375e-05, "loss": 0.0104, "step": 24190 }, { "grad_norm": 0.2893024682998657, "learning_rate": 6.958136239518418e-05, "loss": 0.0078, "step": 24200 }, { "grad_norm": 0.21432098746299744, "learning_rate": 6.955600280242371e-05, "loss": 0.0104, "step": 24210 }, { "grad_norm": 0.2944180965423584, "learning_rate": 6.953063726906596e-05, "loss": 0.0095, "step": 24220 }, { "grad_norm": 0.2731104791164398, "learning_rate": 6.950526580281626e-05, "loss": 0.009, "step": 24230 }, { "grad_norm": 0.34196025133132935, "learning_rate": 6.947988841138184e-05, "loss": 0.0098, "step": 24240 }, { "grad_norm": 0.24402837455272675, "learning_rate": 6.945450510247165e-05, "loss": 0.0083, "step": 24250 }, { "grad_norm": 0.31558629870414734, "learning_rate": 6.942911588379647e-05, "loss": 0.0113, "step": 24260 }, { "grad_norm": 0.34831908345222473, "learning_rate": 6.940372076306888e-05, "loss": 0.0083, "step": 24270 }, { "grad_norm": 0.24995142221450806, "learning_rate": 6.937831974800326e-05, "loss": 0.0093, "step": 24280 }, { "grad_norm": 0.24983029067516327, "learning_rate": 6.935291284631574e-05, "loss": 0.0089, "step": 24290 }, { "grad_norm": 0.2778373658657074, "learning_rate": 6.932750006572428e-05, "loss": 0.0101, "step": 24300 }, { "grad_norm": 0.21077312529087067, "learning_rate": 6.930208141394863e-05, "loss": 0.0087, "step": 24310 }, { "grad_norm": 0.25653186440467834, "learning_rate": 6.927665689871026e-05, "loss": 0.0122, "step": 24320 }, { "grad_norm": 0.26916074752807617, "learning_rate": 6.925122652773253e-05, "loss": 0.0088, "step": 24330 }, { "grad_norm": 0.28328290581703186, "learning_rate": 6.922579030874046e-05, "loss": 0.0089, "step": 24340 }, { "grad_norm": 0.2448911815881729, "learning_rate": 6.920034824946093e-05, "loss": 0.0105, "step": 24350 }, { "grad_norm": 0.36229923367500305, "learning_rate": 6.917490035762255e-05, "loss": 0.0102, "step": 24360 }, { "grad_norm": 0.328237920999527, "learning_rate": 6.914944664095573e-05, "loss": 0.0087, "step": 24370 }, { "grad_norm": 0.33668920397758484, "learning_rate": 6.912398710719264e-05, "loss": 0.0086, "step": 24380 }, { "grad_norm": 0.23626187443733215, "learning_rate": 6.90985217640672e-05, "loss": 0.0137, "step": 24390 }, { "grad_norm": 0.270663857460022, "learning_rate": 6.90730506193151e-05, "loss": 0.009, "step": 24400 }, { "grad_norm": 0.25239109992980957, "learning_rate": 6.904757368067384e-05, "loss": 0.0087, "step": 24410 }, { "grad_norm": 0.28416988253593445, "learning_rate": 6.90220909558826e-05, "loss": 0.0125, "step": 24420 }, { "grad_norm": 0.2679899334907532, "learning_rate": 6.899660245268237e-05, "loss": 0.0079, "step": 24430 }, { "grad_norm": 0.3217419981956482, "learning_rate": 6.897110817881592e-05, "loss": 0.008, "step": 24440 }, { "grad_norm": 0.29920387268066406, "learning_rate": 6.894560814202769e-05, "loss": 0.0096, "step": 24450 }, { "grad_norm": 0.24745391309261322, "learning_rate": 6.892010235006394e-05, "loss": 0.0099, "step": 24460 }, { "grad_norm": 0.31302884221076965, "learning_rate": 6.889459081067264e-05, "loss": 0.0103, "step": 24470 }, { "grad_norm": 0.1890210211277008, "learning_rate": 6.886907353160356e-05, "loss": 0.0092, "step": 24480 }, { "grad_norm": 0.22642023861408234, "learning_rate": 6.884355052060814e-05, "loss": 0.0079, "step": 24490 }, { "grad_norm": 0.23838257789611816, "learning_rate": 6.88180217854396e-05, "loss": 0.0079, "step": 24500 }, { "grad_norm": 0.2048512101173401, "learning_rate": 6.87924873338529e-05, "loss": 0.0082, "step": 24510 }, { "grad_norm": 0.22101201117038727, "learning_rate": 6.876694717360475e-05, "loss": 0.0095, "step": 24520 }, { "grad_norm": 0.27038589119911194, "learning_rate": 6.874140131245355e-05, "loss": 0.0123, "step": 24530 }, { "grad_norm": 0.2969529628753662, "learning_rate": 6.871584975815948e-05, "loss": 0.0108, "step": 24540 }, { "grad_norm": 0.3149702548980713, "learning_rate": 6.86902925184844e-05, "loss": 0.0083, "step": 24550 }, { "grad_norm": 0.18586546182632446, "learning_rate": 6.866472960119195e-05, "loss": 0.0074, "step": 24560 }, { "grad_norm": 0.2549111545085907, "learning_rate": 6.863916101404748e-05, "loss": 0.0075, "step": 24570 }, { "grad_norm": 0.20822013914585114, "learning_rate": 6.8613586764818e-05, "loss": 0.0071, "step": 24580 }, { "grad_norm": 0.37086981534957886, "learning_rate": 6.858800686127233e-05, "loss": 0.0123, "step": 24590 }, { "grad_norm": 0.34554389119148254, "learning_rate": 6.856242131118097e-05, "loss": 0.0122, "step": 24600 }, { "grad_norm": 0.28629931807518005, "learning_rate": 6.853683012231614e-05, "loss": 0.012, "step": 24610 }, { "grad_norm": 0.2933083772659302, "learning_rate": 6.851123330245173e-05, "loss": 0.0118, "step": 24620 }, { "grad_norm": 0.23303160071372986, "learning_rate": 6.848563085936343e-05, "loss": 0.0101, "step": 24630 }, { "grad_norm": 0.31656932830810547, "learning_rate": 6.846002280082853e-05, "loss": 0.0072, "step": 24640 }, { "grad_norm": 0.2558818757534027, "learning_rate": 6.843440913462614e-05, "loss": 0.0084, "step": 24650 }, { "grad_norm": 0.2798789441585541, "learning_rate": 6.840878986853698e-05, "loss": 0.0095, "step": 24660 }, { "grad_norm": 0.23279936611652374, "learning_rate": 6.838316501034352e-05, "loss": 0.0101, "step": 24670 }, { "grad_norm": 0.20617307722568512, "learning_rate": 6.83575345678299e-05, "loss": 0.0093, "step": 24680 }, { "grad_norm": 0.2819904088973999, "learning_rate": 6.833189854878196e-05, "loss": 0.0113, "step": 24690 }, { "grad_norm": 0.27071455121040344, "learning_rate": 6.83062569609873e-05, "loss": 0.011, "step": 24700 }, { "grad_norm": 0.30463576316833496, "learning_rate": 6.828060981223512e-05, "loss": 0.0072, "step": 24710 }, { "grad_norm": 0.2996367812156677, "learning_rate": 6.825495711031634e-05, "loss": 0.0118, "step": 24720 }, { "grad_norm": 0.21536804735660553, "learning_rate": 6.822929886302359e-05, "loss": 0.0103, "step": 24730 }, { "grad_norm": 0.2069932371377945, "learning_rate": 6.820363507815116e-05, "loss": 0.0076, "step": 24740 }, { "grad_norm": 0.2908797562122345, "learning_rate": 6.817796576349501e-05, "loss": 0.0098, "step": 24750 }, { "grad_norm": 0.2410261332988739, "learning_rate": 6.815229092685285e-05, "loss": 0.0067, "step": 24760 }, { "grad_norm": 0.29949018359184265, "learning_rate": 6.812661057602399e-05, "loss": 0.0076, "step": 24770 }, { "grad_norm": 0.3082233667373657, "learning_rate": 6.810092471880943e-05, "loss": 0.0117, "step": 24780 }, { "grad_norm": 0.24515505135059357, "learning_rate": 6.807523336301187e-05, "loss": 0.0105, "step": 24790 }, { "grad_norm": 0.1797519028186798, "learning_rate": 6.804953651643566e-05, "loss": 0.0088, "step": 24800 }, { "grad_norm": 0.16697648167610168, "learning_rate": 6.802383418688685e-05, "loss": 0.0089, "step": 24810 }, { "grad_norm": 0.20068761706352234, "learning_rate": 6.799812638217309e-05, "loss": 0.0069, "step": 24820 }, { "grad_norm": 0.21536289155483246, "learning_rate": 6.797241311010373e-05, "loss": 0.0093, "step": 24830 }, { "grad_norm": 0.25599241256713867, "learning_rate": 6.794669437848982e-05, "loss": 0.009, "step": 24840 }, { "grad_norm": 0.2785956859588623, "learning_rate": 6.792097019514402e-05, "loss": 0.0106, "step": 24850 }, { "grad_norm": 0.2358226478099823, "learning_rate": 6.789524056788064e-05, "loss": 0.0107, "step": 24860 }, { "grad_norm": 0.24556805193424225, "learning_rate": 6.786950550451567e-05, "loss": 0.0065, "step": 24870 }, { "grad_norm": 0.25816965103149414, "learning_rate": 6.784376501286676e-05, "loss": 0.0097, "step": 24880 }, { "grad_norm": 0.32570332288742065, "learning_rate": 6.781801910075316e-05, "loss": 0.0085, "step": 24890 }, { "grad_norm": 0.30192995071411133, "learning_rate": 6.779226777599581e-05, "loss": 0.0084, "step": 24900 }, { "grad_norm": 0.24069426953792572, "learning_rate": 6.776651104641729e-05, "loss": 0.0087, "step": 24910 }, { "grad_norm": 0.2186458706855774, "learning_rate": 6.774074891984183e-05, "loss": 0.0081, "step": 24920 }, { "grad_norm": 0.35617315769195557, "learning_rate": 6.771498140409526e-05, "loss": 0.009, "step": 24930 }, { "grad_norm": 0.3431120216846466, "learning_rate": 6.768920850700506e-05, "loss": 0.0117, "step": 24940 }, { "grad_norm": 0.30532458424568176, "learning_rate": 6.766343023640039e-05, "loss": 0.0105, "step": 24950 }, { "grad_norm": 0.24238221347332, "learning_rate": 6.763764660011198e-05, "loss": 0.008, "step": 24960 }, { "grad_norm": 0.32051047682762146, "learning_rate": 6.761185760597223e-05, "loss": 0.0158, "step": 24970 }, { "grad_norm": 0.3396850824356079, "learning_rate": 6.758606326181515e-05, "loss": 0.0097, "step": 24980 }, { "grad_norm": 0.1894432008266449, "learning_rate": 6.75602635754764e-05, "loss": 0.0093, "step": 24990 }, { "grad_norm": 0.27720701694488525, "learning_rate": 6.75344585547932e-05, "loss": 0.0078, "step": 25000 }, { "grad_norm": 0.25347939133644104, "learning_rate": 6.750864820760449e-05, "loss": 0.0121, "step": 25010 }, { "grad_norm": 0.1976553350687027, "learning_rate": 6.748283254175072e-05, "loss": 0.0104, "step": 25020 }, { "grad_norm": 0.24601860344409943, "learning_rate": 6.745701156507404e-05, "loss": 0.0084, "step": 25030 }, { "grad_norm": 0.2013106346130371, "learning_rate": 6.743118528541818e-05, "loss": 0.0082, "step": 25040 }, { "grad_norm": 0.28903135657310486, "learning_rate": 6.740535371062846e-05, "loss": 0.0101, "step": 25050 }, { "grad_norm": 0.31321632862091064, "learning_rate": 6.737951684855185e-05, "loss": 0.0107, "step": 25060 }, { "grad_norm": 0.25952136516571045, "learning_rate": 6.735367470703691e-05, "loss": 0.0079, "step": 25070 }, { "grad_norm": 0.26057928800582886, "learning_rate": 6.732782729393379e-05, "loss": 0.0073, "step": 25080 }, { "grad_norm": 0.22828325629234314, "learning_rate": 6.730197461709425e-05, "loss": 0.0087, "step": 25090 }, { "grad_norm": 0.21687515079975128, "learning_rate": 6.727611668437164e-05, "loss": 0.0087, "step": 25100 }, { "grad_norm": 0.28764843940734863, "learning_rate": 6.725025350362094e-05, "loss": 0.0124, "step": 25110 }, { "grad_norm": 0.2035594880580902, "learning_rate": 6.72243850826987e-05, "loss": 0.0098, "step": 25120 }, { "grad_norm": 0.33995723724365234, "learning_rate": 6.719851142946305e-05, "loss": 0.0076, "step": 25130 }, { "grad_norm": 0.32248228788375854, "learning_rate": 6.717263255177372e-05, "loss": 0.0108, "step": 25140 }, { "grad_norm": 0.3214641213417053, "learning_rate": 6.714674845749205e-05, "loss": 0.0122, "step": 25150 }, { "grad_norm": 0.2932858467102051, "learning_rate": 6.712085915448092e-05, "loss": 0.011, "step": 25160 }, { "grad_norm": 0.23090533912181854, "learning_rate": 6.709496465060486e-05, "loss": 0.0101, "step": 25170 }, { "grad_norm": 0.2181621938943863, "learning_rate": 6.706906495372987e-05, "loss": 0.0075, "step": 25180 }, { "grad_norm": 0.23707233369350433, "learning_rate": 6.704316007172365e-05, "loss": 0.0086, "step": 25190 }, { "grad_norm": 0.2770235538482666, "learning_rate": 6.701725001245539e-05, "loss": 0.0105, "step": 25200 }, { "grad_norm": 0.3039032220840454, "learning_rate": 6.699133478379588e-05, "loss": 0.0068, "step": 25210 }, { "grad_norm": 0.25725725293159485, "learning_rate": 6.69654143936175e-05, "loss": 0.0075, "step": 25220 }, { "grad_norm": 0.23771362006664276, "learning_rate": 6.693948884979419e-05, "loss": 0.0095, "step": 25230 }, { "grad_norm": 0.20604613423347473, "learning_rate": 6.691355816020142e-05, "loss": 0.0072, "step": 25240 }, { "grad_norm": 0.18242569267749786, "learning_rate": 6.688762233271624e-05, "loss": 0.0109, "step": 25250 }, { "grad_norm": 0.24743270874023438, "learning_rate": 6.68616813752173e-05, "loss": 0.009, "step": 25260 }, { "grad_norm": 0.2938723862171173, "learning_rate": 6.683573529558477e-05, "loss": 0.012, "step": 25270 }, { "grad_norm": 0.3096805810928345, "learning_rate": 6.680978410170037e-05, "loss": 0.0084, "step": 25280 }, { "grad_norm": 0.25070205330848694, "learning_rate": 6.678382780144741e-05, "loss": 0.0085, "step": 25290 }, { "grad_norm": 0.22265048325061798, "learning_rate": 6.675786640271071e-05, "loss": 0.0084, "step": 25300 }, { "grad_norm": 0.2640604078769684, "learning_rate": 6.673189991337665e-05, "loss": 0.0098, "step": 25310 }, { "grad_norm": 0.2184966802597046, "learning_rate": 6.670592834133317e-05, "loss": 0.0088, "step": 25320 }, { "grad_norm": 0.26096197962760925, "learning_rate": 6.667995169446979e-05, "loss": 0.0107, "step": 25330 }, { "grad_norm": 0.30984076857566833, "learning_rate": 6.665396998067747e-05, "loss": 0.008, "step": 25340 }, { "grad_norm": 0.29806143045425415, "learning_rate": 6.66279832078488e-05, "loss": 0.0092, "step": 25350 }, { "grad_norm": 0.3218159079551697, "learning_rate": 6.660199138387786e-05, "loss": 0.0074, "step": 25360 }, { "grad_norm": 0.41157570481300354, "learning_rate": 6.65759945166603e-05, "loss": 0.0129, "step": 25370 }, { "grad_norm": 0.2378745824098587, "learning_rate": 6.654999261409326e-05, "loss": 0.0094, "step": 25380 }, { "grad_norm": 0.23097263276576996, "learning_rate": 6.652398568407544e-05, "loss": 0.0093, "step": 25390 }, { "grad_norm": 0.20936210453510284, "learning_rate": 6.649797373450707e-05, "loss": 0.0081, "step": 25400 }, { "grad_norm": 0.24232088029384613, "learning_rate": 6.647195677328988e-05, "loss": 0.0077, "step": 25410 }, { "grad_norm": 0.2789475619792938, "learning_rate": 6.644593480832712e-05, "loss": 0.0101, "step": 25420 }, { "grad_norm": 0.3859996497631073, "learning_rate": 6.641990784752363e-05, "loss": 0.0109, "step": 25430 }, { "grad_norm": 0.2903768718242645, "learning_rate": 6.639387589878566e-05, "loss": 0.0134, "step": 25440 }, { "grad_norm": 0.30548423528671265, "learning_rate": 6.636783897002103e-05, "loss": 0.012, "step": 25450 }, { "grad_norm": 0.2943495512008667, "learning_rate": 6.63417970691391e-05, "loss": 0.0127, "step": 25460 }, { "grad_norm": 0.21564921736717224, "learning_rate": 6.63157502040507e-05, "loss": 0.0075, "step": 25470 }, { "grad_norm": 0.32418978214263916, "learning_rate": 6.628969838266819e-05, "loss": 0.0077, "step": 25480 }, { "grad_norm": 0.25460556149482727, "learning_rate": 6.626364161290541e-05, "loss": 0.0097, "step": 25490 }, { "grad_norm": 0.3148524761199951, "learning_rate": 6.623757990267774e-05, "loss": 0.0103, "step": 25500 }, { "grad_norm": 0.289489209651947, "learning_rate": 6.621151325990201e-05, "loss": 0.0138, "step": 25510 }, { "grad_norm": 0.2804276943206787, "learning_rate": 6.618544169249657e-05, "loss": 0.0107, "step": 25520 }, { "grad_norm": 0.2988085448741913, "learning_rate": 6.615936520838133e-05, "loss": 0.0132, "step": 25530 }, { "grad_norm": 0.26156941056251526, "learning_rate": 6.613328381547759e-05, "loss": 0.0092, "step": 25540 }, { "grad_norm": 0.2910844385623932, "learning_rate": 6.610719752170821e-05, "loss": 0.0087, "step": 25550 }, { "grad_norm": 0.19885356724262238, "learning_rate": 6.60811063349975e-05, "loss": 0.0117, "step": 25560 }, { "grad_norm": 0.2849405109882355, "learning_rate": 6.605501026327127e-05, "loss": 0.0114, "step": 25570 }, { "grad_norm": 0.31312671303749084, "learning_rate": 6.602890931445685e-05, "loss": 0.0081, "step": 25580 }, { "grad_norm": 0.29845884442329407, "learning_rate": 6.6002803496483e-05, "loss": 0.0107, "step": 25590 }, { "grad_norm": 0.24458935856819153, "learning_rate": 6.597669281727997e-05, "loss": 0.0099, "step": 25600 }, { "grad_norm": 0.2881734371185303, "learning_rate": 6.595057728477949e-05, "loss": 0.0087, "step": 25610 }, { "grad_norm": 0.27456358075141907, "learning_rate": 6.59244569069148e-05, "loss": 0.0093, "step": 25620 }, { "grad_norm": 0.3113868832588196, "learning_rate": 6.589833169162054e-05, "loss": 0.0099, "step": 25630 }, { "grad_norm": 0.21675272285938263, "learning_rate": 6.587220164683291e-05, "loss": 0.0093, "step": 25640 }, { "grad_norm": 0.2244863361120224, "learning_rate": 6.58460667804895e-05, "loss": 0.0094, "step": 25650 }, { "grad_norm": 0.23073671758174896, "learning_rate": 6.581992710052938e-05, "loss": 0.0089, "step": 25660 }, { "grad_norm": 0.2773357629776001, "learning_rate": 6.579378261489311e-05, "loss": 0.0082, "step": 25670 }, { "grad_norm": 0.34319114685058594, "learning_rate": 6.576763333152268e-05, "loss": 0.0095, "step": 25680 }, { "grad_norm": 0.27868327498435974, "learning_rate": 6.574147925836159e-05, "loss": 0.0099, "step": 25690 }, { "grad_norm": 0.34104734659194946, "learning_rate": 6.571532040335472e-05, "loss": 0.0118, "step": 25700 }, { "grad_norm": 0.24662648141384125, "learning_rate": 6.568915677444845e-05, "loss": 0.0082, "step": 25710 }, { "grad_norm": 0.28442811965942383, "learning_rate": 6.56629883795906e-05, "loss": 0.0096, "step": 25720 }, { "grad_norm": 0.33433881402015686, "learning_rate": 6.563681522673043e-05, "loss": 0.0112, "step": 25730 }, { "grad_norm": 0.27193188667297363, "learning_rate": 6.561063732381867e-05, "loss": 0.0093, "step": 25740 }, { "grad_norm": 0.305787056684494, "learning_rate": 6.558445467880745e-05, "loss": 0.0086, "step": 25750 }, { "grad_norm": 0.28165528178215027, "learning_rate": 6.55582672996504e-05, "loss": 0.0094, "step": 25760 }, { "grad_norm": 0.29080915451049805, "learning_rate": 6.553207519430253e-05, "loss": 0.0078, "step": 25770 }, { "grad_norm": 0.3609759211540222, "learning_rate": 6.550587837072032e-05, "loss": 0.013, "step": 25780 }, { "grad_norm": 0.2628922760486603, "learning_rate": 6.547967683686166e-05, "loss": 0.0091, "step": 25790 }, { "grad_norm": 0.38569116592407227, "learning_rate": 6.545347060068591e-05, "loss": 0.0088, "step": 25800 }, { "grad_norm": 0.3310096859931946, "learning_rate": 6.542725967015382e-05, "loss": 0.0075, "step": 25810 }, { "grad_norm": 0.27954334020614624, "learning_rate": 6.540104405322757e-05, "loss": 0.0075, "step": 25820 }, { "grad_norm": 0.23628857731819153, "learning_rate": 6.537482375787077e-05, "loss": 0.0083, "step": 25830 }, { "grad_norm": 0.27185583114624023, "learning_rate": 6.534859879204845e-05, "loss": 0.01, "step": 25840 }, { "grad_norm": 0.35953018069267273, "learning_rate": 6.532236916372709e-05, "loss": 0.0104, "step": 25850 }, { "grad_norm": 0.2615014314651489, "learning_rate": 6.529613488087454e-05, "loss": 0.0107, "step": 25860 }, { "grad_norm": 0.256808340549469, "learning_rate": 6.526989595146009e-05, "loss": 0.0088, "step": 25870 }, { "grad_norm": 0.29117870330810547, "learning_rate": 6.524365238345441e-05, "loss": 0.0088, "step": 25880 }, { "grad_norm": 0.3596153259277344, "learning_rate": 6.521740418482964e-05, "loss": 0.0108, "step": 25890 }, { "grad_norm": 0.22378768026828766, "learning_rate": 6.519115136355925e-05, "loss": 0.0105, "step": 25900 }, { "grad_norm": 0.23168601095676422, "learning_rate": 6.51648939276182e-05, "loss": 0.0092, "step": 25910 }, { "grad_norm": 0.19651584327220917, "learning_rate": 6.513863188498277e-05, "loss": 0.0074, "step": 25920 }, { "grad_norm": 0.18426480889320374, "learning_rate": 6.511236524363068e-05, "loss": 0.0072, "step": 25930 }, { "grad_norm": 0.2757687270641327, "learning_rate": 6.508609401154104e-05, "loss": 0.0082, "step": 25940 }, { "grad_norm": 0.3203938901424408, "learning_rate": 6.505981819669439e-05, "loss": 0.0095, "step": 25950 }, { "grad_norm": 0.26939141750335693, "learning_rate": 6.503353780707258e-05, "loss": 0.0092, "step": 25960 }, { "grad_norm": 0.38569483160972595, "learning_rate": 6.500725285065895e-05, "loss": 0.0105, "step": 25970 }, { "grad_norm": 0.18550324440002441, "learning_rate": 6.498096333543813e-05, "loss": 0.0082, "step": 25980 }, { "grad_norm": 0.2904600203037262, "learning_rate": 6.49546692693962e-05, "loss": 0.0127, "step": 25990 }, { "grad_norm": 0.36599794030189514, "learning_rate": 6.492837066052059e-05, "loss": 0.0104, "step": 26000 }, { "grad_norm": 0.2258162796497345, "learning_rate": 6.490206751680014e-05, "loss": 0.0086, "step": 26010 }, { "grad_norm": 0.32688409090042114, "learning_rate": 6.487575984622505e-05, "loss": 0.0091, "step": 26020 }, { "grad_norm": 0.25327086448669434, "learning_rate": 6.484944765678689e-05, "loss": 0.0118, "step": 26030 }, { "grad_norm": 0.19642053544521332, "learning_rate": 6.482313095647861e-05, "loss": 0.008, "step": 26040 }, { "grad_norm": 0.18546327948570251, "learning_rate": 6.479680975329451e-05, "loss": 0.0075, "step": 26050 }, { "grad_norm": 0.23450107872486115, "learning_rate": 6.477048405523031e-05, "loss": 0.011, "step": 26060 }, { "grad_norm": 0.2481645941734314, "learning_rate": 6.474415387028304e-05, "loss": 0.0064, "step": 26070 }, { "grad_norm": 0.26536768674850464, "learning_rate": 6.471781920645114e-05, "loss": 0.0094, "step": 26080 }, { "grad_norm": 0.45446938276290894, "learning_rate": 6.469148007173434e-05, "loss": 0.0085, "step": 26090 }, { "grad_norm": 0.24124768376350403, "learning_rate": 6.466513647413381e-05, "loss": 0.0121, "step": 26100 }, { "grad_norm": 0.29521143436431885, "learning_rate": 6.463878842165203e-05, "loss": 0.0094, "step": 26110 }, { "grad_norm": 0.2847363352775574, "learning_rate": 6.461243592229286e-05, "loss": 0.0085, "step": 26120 }, { "grad_norm": 0.21861809492111206, "learning_rate": 6.458607898406146e-05, "loss": 0.0075, "step": 26130 }, { "grad_norm": 0.19682231545448303, "learning_rate": 6.455971761496439e-05, "loss": 0.0065, "step": 26140 }, { "grad_norm": 0.25846272706985474, "learning_rate": 6.453335182300953e-05, "loss": 0.0105, "step": 26150 }, { "grad_norm": 0.2956348657608032, "learning_rate": 6.450698161620612e-05, "loss": 0.009, "step": 26160 }, { "grad_norm": 0.28191423416137695, "learning_rate": 6.448060700256473e-05, "loss": 0.0097, "step": 26170 }, { "grad_norm": 0.23963071405887604, "learning_rate": 6.445422799009726e-05, "loss": 0.0123, "step": 26180 }, { "grad_norm": 0.28072118759155273, "learning_rate": 6.442784458681699e-05, "loss": 0.0073, "step": 26190 }, { "grad_norm": 0.2605874240398407, "learning_rate": 6.440145680073847e-05, "loss": 0.0104, "step": 26200 }, { "grad_norm": 0.24629166722297668, "learning_rate": 6.437506463987762e-05, "loss": 0.0096, "step": 26210 }, { "grad_norm": 0.25993961095809937, "learning_rate": 6.434866811225168e-05, "loss": 0.0102, "step": 26220 }, { "grad_norm": 0.3774401843547821, "learning_rate": 6.432226722587923e-05, "loss": 0.0101, "step": 26230 }, { "grad_norm": 0.2263116091489792, "learning_rate": 6.429586198878015e-05, "loss": 0.0078, "step": 26240 }, { "grad_norm": 0.24977029860019684, "learning_rate": 6.426945240897566e-05, "loss": 0.009, "step": 26250 }, { "grad_norm": 0.21061046421527863, "learning_rate": 6.424303849448829e-05, "loss": 0.0086, "step": 26260 }, { "grad_norm": 0.2580801546573639, "learning_rate": 6.42166202533419e-05, "loss": 0.0088, "step": 26270 }, { "grad_norm": 0.2564726769924164, "learning_rate": 6.419019769356164e-05, "loss": 0.0079, "step": 26280 }, { "grad_norm": 0.23917706310749054, "learning_rate": 6.416377082317398e-05, "loss": 0.0107, "step": 26290 }, { "grad_norm": 0.21972177922725677, "learning_rate": 6.413733965020674e-05, "loss": 0.0074, "step": 26300 }, { "grad_norm": 0.25634920597076416, "learning_rate": 6.411090418268896e-05, "loss": 0.0087, "step": 26310 }, { "grad_norm": 0.27690601348876953, "learning_rate": 6.408446442865109e-05, "loss": 0.0073, "step": 26320 }, { "grad_norm": 0.3255150318145752, "learning_rate": 6.405802039612479e-05, "loss": 0.0107, "step": 26330 }, { "grad_norm": 0.3836117684841156, "learning_rate": 6.403157209314308e-05, "loss": 0.0085, "step": 26340 }, { "grad_norm": 0.34810012578964233, "learning_rate": 6.400511952774024e-05, "loss": 0.009, "step": 26350 }, { "grad_norm": 0.23736333847045898, "learning_rate": 6.397866270795187e-05, "loss": 0.0083, "step": 26360 }, { "grad_norm": 0.2519403100013733, "learning_rate": 6.395220164181489e-05, "loss": 0.0091, "step": 26370 }, { "grad_norm": 0.292632520198822, "learning_rate": 6.39257363373674e-05, "loss": 0.0114, "step": 26380 }, { "grad_norm": 0.23322105407714844, "learning_rate": 6.389926680264892e-05, "loss": 0.0062, "step": 26390 }, { "grad_norm": 0.20453332364559174, "learning_rate": 6.387279304570017e-05, "loss": 0.0075, "step": 26400 }, { "grad_norm": 0.3414888381958008, "learning_rate": 6.384631507456319e-05, "loss": 0.0086, "step": 26410 }, { "grad_norm": 0.3276098072528839, "learning_rate": 6.381983289728126e-05, "loss": 0.0077, "step": 26420 }, { "grad_norm": 0.22395369410514832, "learning_rate": 6.3793346521899e-05, "loss": 0.009, "step": 26430 }, { "grad_norm": 0.17065633833408356, "learning_rate": 6.376685595646226e-05, "loss": 0.0085, "step": 26440 }, { "grad_norm": 0.31599608063697815, "learning_rate": 6.374036120901816e-05, "loss": 0.0101, "step": 26450 }, { "grad_norm": 0.34175431728363037, "learning_rate": 6.371386228761514e-05, "loss": 0.0087, "step": 26460 }, { "grad_norm": 0.27586376667022705, "learning_rate": 6.368735920030283e-05, "loss": 0.0072, "step": 26470 }, { "grad_norm": 0.3355049788951874, "learning_rate": 6.366085195513218e-05, "loss": 0.0076, "step": 26480 }, { "grad_norm": 0.22536924481391907, "learning_rate": 6.363434056015543e-05, "loss": 0.0081, "step": 26490 }, { "grad_norm": 0.2461698353290558, "learning_rate": 6.360782502342599e-05, "loss": 0.009, "step": 26500 }, { "grad_norm": 0.3334399163722992, "learning_rate": 6.358130535299862e-05, "loss": 0.0096, "step": 26510 }, { "grad_norm": 0.2501167953014374, "learning_rate": 6.355478155692926e-05, "loss": 0.012, "step": 26520 }, { "grad_norm": 0.16247116029262543, "learning_rate": 6.352825364327517e-05, "loss": 0.0073, "step": 26530 }, { "grad_norm": 0.27164220809936523, "learning_rate": 6.350172162009482e-05, "loss": 0.0105, "step": 26540 }, { "grad_norm": 0.33109942078590393, "learning_rate": 6.347518549544793e-05, "loss": 0.0097, "step": 26550 }, { "grad_norm": 0.3005988895893097, "learning_rate": 6.344864527739547e-05, "loss": 0.0082, "step": 26560 }, { "grad_norm": 0.28174588084220886, "learning_rate": 6.342210097399966e-05, "loss": 0.0101, "step": 26570 }, { "grad_norm": 0.3186158835887909, "learning_rate": 6.339555259332398e-05, "loss": 0.0105, "step": 26580 }, { "grad_norm": 0.22998090088367462, "learning_rate": 6.33690001434331e-05, "loss": 0.0082, "step": 26590 }, { "grad_norm": 0.2678934633731842, "learning_rate": 6.334244363239296e-05, "loss": 0.0115, "step": 26600 }, { "grad_norm": 0.2527500092983246, "learning_rate": 6.331588306827073e-05, "loss": 0.0097, "step": 26610 }, { "grad_norm": 0.2242431938648224, "learning_rate": 6.328931845913483e-05, "loss": 0.0083, "step": 26620 }, { "grad_norm": 0.23258468508720398, "learning_rate": 6.326274981305484e-05, "loss": 0.0091, "step": 26630 }, { "grad_norm": 0.2950723469257355, "learning_rate": 6.323617713810166e-05, "loss": 0.0136, "step": 26640 }, { "grad_norm": 0.2832319438457489, "learning_rate": 6.320960044234734e-05, "loss": 0.0102, "step": 26650 }, { "grad_norm": 0.22223219275474548, "learning_rate": 6.318301973386518e-05, "loss": 0.009, "step": 26660 }, { "grad_norm": 0.27043962478637695, "learning_rate": 6.315643502072971e-05, "loss": 0.0081, "step": 26670 }, { "grad_norm": 0.23619243502616882, "learning_rate": 6.312984631101667e-05, "loss": 0.0085, "step": 26680 }, { "grad_norm": 0.17916107177734375, "learning_rate": 6.310325361280297e-05, "loss": 0.0075, "step": 26690 }, { "grad_norm": 0.2921355664730072, "learning_rate": 6.30766569341668e-05, "loss": 0.0081, "step": 26700 }, { "grad_norm": 0.2554694712162018, "learning_rate": 6.305005628318753e-05, "loss": 0.0108, "step": 26710 }, { "grad_norm": 0.28823018074035645, "learning_rate": 6.302345166794572e-05, "loss": 0.01, "step": 26720 }, { "grad_norm": 0.3092280626296997, "learning_rate": 6.299684309652316e-05, "loss": 0.0105, "step": 26730 }, { "grad_norm": 0.21332131326198578, "learning_rate": 6.297023057700283e-05, "loss": 0.0078, "step": 26740 }, { "grad_norm": 0.2608703076839447, "learning_rate": 6.294361411746891e-05, "loss": 0.009, "step": 26750 }, { "grad_norm": 0.251953125, "learning_rate": 6.291699372600677e-05, "loss": 0.0097, "step": 26760 }, { "grad_norm": 0.18997237086296082, "learning_rate": 6.2890369410703e-05, "loss": 0.0088, "step": 26770 }, { "grad_norm": 0.22552882134914398, "learning_rate": 6.286374117964534e-05, "loss": 0.0088, "step": 26780 }, { "grad_norm": 0.347776859998703, "learning_rate": 6.283710904092277e-05, "loss": 0.0081, "step": 26790 }, { "grad_norm": 0.29550039768218994, "learning_rate": 6.281047300262542e-05, "loss": 0.0102, "step": 26800 }, { "grad_norm": 0.4152955412864685, "learning_rate": 6.278383307284461e-05, "loss": 0.01, "step": 26810 }, { "grad_norm": 0.29928719997406006, "learning_rate": 6.275718925967284e-05, "loss": 0.0083, "step": 26820 }, { "grad_norm": 0.3078267574310303, "learning_rate": 6.273054157120382e-05, "loss": 0.0096, "step": 26830 }, { "grad_norm": 0.28713661432266235, "learning_rate": 6.270389001553238e-05, "loss": 0.0108, "step": 26840 }, { "grad_norm": 0.3074244558811188, "learning_rate": 6.26772346007546e-05, "loss": 0.0098, "step": 26850 }, { "grad_norm": 0.32200419902801514, "learning_rate": 6.265057533496767e-05, "loss": 0.0117, "step": 26860 }, { "grad_norm": 0.3505547046661377, "learning_rate": 6.262391222626997e-05, "loss": 0.0098, "step": 26870 }, { "grad_norm": 0.3319917321205139, "learning_rate": 6.259724528276106e-05, "loss": 0.012, "step": 26880 }, { "grad_norm": 0.2519925534725189, "learning_rate": 6.257057451254162e-05, "loss": 0.0071, "step": 26890 }, { "grad_norm": 0.24205760657787323, "learning_rate": 6.254389992371357e-05, "loss": 0.0102, "step": 26900 }, { "grad_norm": 0.29665759205818176, "learning_rate": 6.25172215243799e-05, "loss": 0.0099, "step": 26910 }, { "grad_norm": 0.21631327271461487, "learning_rate": 6.249053932264486e-05, "loss": 0.0075, "step": 26920 }, { "grad_norm": 0.2641735374927521, "learning_rate": 6.246385332661376e-05, "loss": 0.0079, "step": 26930 }, { "grad_norm": 0.25667640566825867, "learning_rate": 6.24371635443931e-05, "loss": 0.0133, "step": 26940 }, { "grad_norm": 0.23632048070430756, "learning_rate": 6.241046998409054e-05, "loss": 0.0092, "step": 26950 }, { "grad_norm": 0.2546229660511017, "learning_rate": 6.238377265381489e-05, "loss": 0.0099, "step": 26960 }, { "grad_norm": 0.2508358657360077, "learning_rate": 6.235707156167607e-05, "loss": 0.0089, "step": 26970 }, { "grad_norm": 0.26633042097091675, "learning_rate": 6.233036671578519e-05, "loss": 0.0086, "step": 26980 }, { "grad_norm": 0.1944900006055832, "learning_rate": 6.230365812425445e-05, "loss": 0.0111, "step": 26990 }, { "grad_norm": 0.3162895441055298, "learning_rate": 6.227694579519724e-05, "loss": 0.0086, "step": 27000 }, { "grad_norm": 0.2586805820465088, "learning_rate": 6.225022973672805e-05, "loss": 0.0104, "step": 27010 }, { "grad_norm": 0.27403485774993896, "learning_rate": 6.222350995696253e-05, "loss": 0.0083, "step": 27020 }, { "grad_norm": 0.2791021466255188, "learning_rate": 6.21967864640174e-05, "loss": 0.0083, "step": 27030 }, { "grad_norm": 0.22853180766105652, "learning_rate": 6.217005926601059e-05, "loss": 0.0081, "step": 27040 }, { "grad_norm": 0.25925320386886597, "learning_rate": 6.214332837106111e-05, "loss": 0.008, "step": 27050 }, { "grad_norm": 0.24230362474918365, "learning_rate": 6.21165937872891e-05, "loss": 0.0072, "step": 27060 }, { "grad_norm": 0.2683258652687073, "learning_rate": 6.208985552281582e-05, "loss": 0.0083, "step": 27070 }, { "grad_norm": 0.299369752407074, "learning_rate": 6.206311358576364e-05, "loss": 0.0098, "step": 27080 }, { "grad_norm": 0.2882424592971802, "learning_rate": 6.203636798425608e-05, "loss": 0.0114, "step": 27090 }, { "grad_norm": 0.2645210325717926, "learning_rate": 6.20096187264177e-05, "loss": 0.0116, "step": 27100 }, { "grad_norm": 0.2970835566520691, "learning_rate": 6.198286582037425e-05, "loss": 0.011, "step": 27110 }, { "grad_norm": 0.19154250621795654, "learning_rate": 6.195610927425256e-05, "loss": 0.0095, "step": 27120 }, { "grad_norm": 0.26174408197402954, "learning_rate": 6.192934909618056e-05, "loss": 0.0109, "step": 27130 }, { "grad_norm": 0.2742542326450348, "learning_rate": 6.190258529428728e-05, "loss": 0.0092, "step": 27140 }, { "grad_norm": 0.2624475061893463, "learning_rate": 6.187581787670285e-05, "loss": 0.0097, "step": 27150 }, { "grad_norm": 0.19591739773750305, "learning_rate": 6.184904685155852e-05, "loss": 0.0092, "step": 27160 }, { "grad_norm": 0.270652174949646, "learning_rate": 6.18222722269866e-05, "loss": 0.0082, "step": 27170 }, { "grad_norm": 0.28079813718795776, "learning_rate": 6.179549401112053e-05, "loss": 0.0158, "step": 27180 }, { "grad_norm": 0.26096683740615845, "learning_rate": 6.176871221209482e-05, "loss": 0.0104, "step": 27190 }, { "grad_norm": 0.25848788022994995, "learning_rate": 6.174192683804508e-05, "loss": 0.0087, "step": 27200 }, { "grad_norm": 0.27694791555404663, "learning_rate": 6.1715137897108e-05, "loss": 0.0076, "step": 27210 }, { "grad_norm": 0.2889971137046814, "learning_rate": 6.168834539742134e-05, "loss": 0.0077, "step": 27220 }, { "grad_norm": 0.19497044384479523, "learning_rate": 6.166154934712397e-05, "loss": 0.0082, "step": 27230 }, { "grad_norm": 0.2787424325942993, "learning_rate": 6.163474975435581e-05, "loss": 0.0075, "step": 27240 }, { "grad_norm": 0.25081756711006165, "learning_rate": 6.160794662725787e-05, "loss": 0.0088, "step": 27250 }, { "grad_norm": 0.20029161870479584, "learning_rate": 6.158113997397222e-05, "loss": 0.0093, "step": 27260 }, { "grad_norm": 0.25263383984565735, "learning_rate": 6.155432980264205e-05, "loss": 0.0089, "step": 27270 }, { "grad_norm": 0.23779967427253723, "learning_rate": 6.152751612141156e-05, "loss": 0.0085, "step": 27280 }, { "grad_norm": 0.24605008959770203, "learning_rate": 6.150069893842602e-05, "loss": 0.0096, "step": 27290 }, { "grad_norm": 0.3267499804496765, "learning_rate": 6.147387826183182e-05, "loss": 0.0099, "step": 27300 }, { "grad_norm": 0.26676827669143677, "learning_rate": 6.144705409977635e-05, "loss": 0.0081, "step": 27310 }, { "grad_norm": 0.22183412313461304, "learning_rate": 6.142022646040808e-05, "loss": 0.01, "step": 27320 }, { "grad_norm": 0.30547916889190674, "learning_rate": 6.139339535187653e-05, "loss": 0.0076, "step": 27330 }, { "grad_norm": 0.23658980429172516, "learning_rate": 6.136656078233232e-05, "loss": 0.0086, "step": 27340 }, { "grad_norm": 0.20363584160804749, "learning_rate": 6.133972275992707e-05, "loss": 0.0117, "step": 27350 }, { "grad_norm": 0.22415928542613983, "learning_rate": 6.131288129281342e-05, "loss": 0.0099, "step": 27360 }, { "grad_norm": 0.283632755279541, "learning_rate": 6.128603638914516e-05, "loss": 0.0106, "step": 27370 }, { "grad_norm": 0.2705747187137604, "learning_rate": 6.125918805707704e-05, "loss": 0.0078, "step": 27380 }, { "grad_norm": 0.2821328639984131, "learning_rate": 6.123233630476485e-05, "loss": 0.0109, "step": 27390 }, { "grad_norm": 0.26058027148246765, "learning_rate": 6.120548114036547e-05, "loss": 0.0101, "step": 27400 }, { "grad_norm": 0.21678268909454346, "learning_rate": 6.117862257203679e-05, "loss": 0.009, "step": 27410 }, { "grad_norm": 0.24432334303855896, "learning_rate": 6.115176060793771e-05, "loss": 0.0107, "step": 27420 }, { "grad_norm": 0.21130980551242828, "learning_rate": 6.112489525622822e-05, "loss": 0.0072, "step": 27430 }, { "grad_norm": 0.18233679234981537, "learning_rate": 6.109802652506928e-05, "loss": 0.0069, "step": 27440 }, { "grad_norm": 0.1726444810628891, "learning_rate": 6.107115442262291e-05, "loss": 0.0083, "step": 27450 }, { "grad_norm": 0.26311635971069336, "learning_rate": 6.104427895705214e-05, "loss": 0.0141, "step": 27460 }, { "grad_norm": 0.2378009408712387, "learning_rate": 6.101740013652103e-05, "loss": 0.0073, "step": 27470 }, { "grad_norm": 0.238592267036438, "learning_rate": 6.099051796919465e-05, "loss": 0.0088, "step": 27480 }, { "grad_norm": 0.2625325620174408, "learning_rate": 6.096363246323911e-05, "loss": 0.0113, "step": 27490 }, { "grad_norm": 0.2785114645957947, "learning_rate": 6.0936743626821504e-05, "loss": 0.0101, "step": 27500 }, { "grad_norm": 0.28394466638565063, "learning_rate": 6.090985146810996e-05, "loss": 0.0087, "step": 27510 }, { "grad_norm": 0.2965015769004822, "learning_rate": 6.088295599527357e-05, "loss": 0.0101, "step": 27520 }, { "grad_norm": 0.2593602240085602, "learning_rate": 6.085605721648252e-05, "loss": 0.0059, "step": 27530 }, { "grad_norm": 0.34770137071609497, "learning_rate": 6.082915513990792e-05, "loss": 0.0088, "step": 27540 }, { "grad_norm": 0.2729474902153015, "learning_rate": 6.080224977372192e-05, "loss": 0.0065, "step": 27550 }, { "grad_norm": 0.3115181624889374, "learning_rate": 6.0775341126097666e-05, "loss": 0.0071, "step": 27560 }, { "grad_norm": 0.21085843443870544, "learning_rate": 6.074842920520926e-05, "loss": 0.009, "step": 27570 }, { "grad_norm": 0.21746951341629028, "learning_rate": 6.072151401923186e-05, "loss": 0.0066, "step": 27580 }, { "grad_norm": 0.21931703388690948, "learning_rate": 6.069459557634159e-05, "loss": 0.0064, "step": 27590 }, { "grad_norm": 0.22384639084339142, "learning_rate": 6.066767388471557e-05, "loss": 0.0082, "step": 27600 }, { "grad_norm": 0.3302254378795624, "learning_rate": 6.064074895253188e-05, "loss": 0.0091, "step": 27610 }, { "grad_norm": 0.18144726753234863, "learning_rate": 6.061382078796961e-05, "loss": 0.0067, "step": 27620 }, { "grad_norm": 0.20308656990528107, "learning_rate": 6.0586889399208814e-05, "loss": 0.0077, "step": 27630 }, { "grad_norm": 0.3091198205947876, "learning_rate": 6.0559954794430565e-05, "loss": 0.0084, "step": 27640 }, { "grad_norm": 0.3475000560283661, "learning_rate": 6.053301698181687e-05, "loss": 0.0117, "step": 27650 }, { "grad_norm": 0.259026437997818, "learning_rate": 6.0506075969550725e-05, "loss": 0.0087, "step": 27660 }, { "grad_norm": 0.2733376920223236, "learning_rate": 6.047913176581609e-05, "loss": 0.0084, "step": 27670 }, { "grad_norm": 0.24698317050933838, "learning_rate": 6.0452184378797904e-05, "loss": 0.0082, "step": 27680 }, { "grad_norm": 0.41196587681770325, "learning_rate": 6.042523381668209e-05, "loss": 0.0078, "step": 27690 }, { "grad_norm": 0.2769913077354431, "learning_rate": 6.03982800876555e-05, "loss": 0.009, "step": 27700 }, { "grad_norm": 0.18807680904865265, "learning_rate": 6.0371323199905975e-05, "loss": 0.0077, "step": 27710 }, { "grad_norm": 0.3096144199371338, "learning_rate": 6.03443631616223e-05, "loss": 0.008, "step": 27720 }, { "grad_norm": 0.22993217408657074, "learning_rate": 6.031739998099421e-05, "loss": 0.0124, "step": 27730 }, { "grad_norm": 0.2718513011932373, "learning_rate": 6.029043366621243e-05, "loss": 0.0093, "step": 27740 }, { "grad_norm": 0.21488773822784424, "learning_rate": 6.0263464225468615e-05, "loss": 0.0094, "step": 27750 }, { "grad_norm": 0.21636754274368286, "learning_rate": 6.023649166695534e-05, "loss": 0.0085, "step": 27760 }, { "grad_norm": 0.2927066683769226, "learning_rate": 6.0209515998866186e-05, "loss": 0.0082, "step": 27770 }, { "grad_norm": 0.26456037163734436, "learning_rate": 6.018253722939563e-05, "loss": 0.0085, "step": 27780 }, { "grad_norm": 0.233922079205513, "learning_rate": 6.015555536673914e-05, "loss": 0.0094, "step": 27790 }, { "grad_norm": 0.25748538970947266, "learning_rate": 6.0128570419093054e-05, "loss": 0.0073, "step": 27800 }, { "grad_norm": 0.2991805076599121, "learning_rate": 6.010158239465471e-05, "loss": 0.0114, "step": 27810 }, { "grad_norm": 0.27685466408729553, "learning_rate": 6.007459130162235e-05, "loss": 0.0084, "step": 27820 }, { "grad_norm": 0.21518674492835999, "learning_rate": 6.004759714819516e-05, "loss": 0.0088, "step": 27830 }, { "grad_norm": 0.1939920336008072, "learning_rate": 6.002059994257323e-05, "loss": 0.0084, "step": 27840 }, { "grad_norm": 0.21805702149868011, "learning_rate": 5.999359969295764e-05, "loss": 0.0073, "step": 27850 }, { "grad_norm": 0.24761803448200226, "learning_rate": 5.9966596407550314e-05, "loss": 0.0064, "step": 27860 }, { "grad_norm": 0.23991671204566956, "learning_rate": 5.993959009455416e-05, "loss": 0.008, "step": 27870 }, { "grad_norm": 0.259254515171051, "learning_rate": 5.991258076217298e-05, "loss": 0.0082, "step": 27880 }, { "grad_norm": 0.2831530272960663, "learning_rate": 5.988556841861147e-05, "loss": 0.0098, "step": 27890 }, { "grad_norm": 0.20290721952915192, "learning_rate": 5.985855307207531e-05, "loss": 0.0082, "step": 27900 }, { "grad_norm": 0.23735889792442322, "learning_rate": 5.9831534730771e-05, "loss": 0.0098, "step": 27910 }, { "grad_norm": 0.2403845489025116, "learning_rate": 5.980451340290605e-05, "loss": 0.0082, "step": 27920 }, { "grad_norm": 0.3064778745174408, "learning_rate": 5.97774890966888e-05, "loss": 0.011, "step": 27930 }, { "grad_norm": 0.2955588102340698, "learning_rate": 5.975046182032851e-05, "loss": 0.0087, "step": 27940 }, { "grad_norm": 0.29320523142814636, "learning_rate": 5.972343158203537e-05, "loss": 0.0094, "step": 27950 }, { "grad_norm": 0.1543625444173813, "learning_rate": 5.969639839002045e-05, "loss": 0.0076, "step": 27960 }, { "grad_norm": 0.25194329023361206, "learning_rate": 5.966936225249572e-05, "loss": 0.008, "step": 27970 }, { "grad_norm": 0.2639315724372864, "learning_rate": 5.9642323177674044e-05, "loss": 0.0086, "step": 27980 }, { "grad_norm": 0.19331061840057373, "learning_rate": 5.9615281173769154e-05, "loss": 0.0067, "step": 27990 }, { "grad_norm": 0.230178564786911, "learning_rate": 5.958823624899574e-05, "loss": 0.0079, "step": 28000 }, { "grad_norm": 0.2751917541027069, "learning_rate": 5.956118841156933e-05, "loss": 0.0062, "step": 28010 }, { "grad_norm": 0.2951553463935852, "learning_rate": 5.953413766970631e-05, "loss": 0.0118, "step": 28020 }, { "grad_norm": 0.2595328986644745, "learning_rate": 5.9507084031624e-05, "loss": 0.0091, "step": 28030 }, { "grad_norm": 0.28204429149627686, "learning_rate": 5.948002750554058e-05, "loss": 0.0096, "step": 28040 }, { "grad_norm": 0.2264937162399292, "learning_rate": 5.9452968099675124e-05, "loss": 0.0076, "step": 28050 }, { "grad_norm": 0.21322259306907654, "learning_rate": 5.9425905822247527e-05, "loss": 0.0085, "step": 28060 }, { "grad_norm": 0.20056018233299255, "learning_rate": 5.939884068147864e-05, "loss": 0.0062, "step": 28070 }, { "grad_norm": 0.19608286023139954, "learning_rate": 5.937177268559011e-05, "loss": 0.0085, "step": 28080 }, { "grad_norm": 0.20257672667503357, "learning_rate": 5.934470184280448e-05, "loss": 0.0073, "step": 28090 }, { "grad_norm": 0.24828822910785675, "learning_rate": 5.931762816134516e-05, "loss": 0.0106, "step": 28100 }, { "grad_norm": 0.26942571997642517, "learning_rate": 5.9290551649436434e-05, "loss": 0.0069, "step": 28110 }, { "grad_norm": 0.23649120330810547, "learning_rate": 5.9263472315303416e-05, "loss": 0.0079, "step": 28120 }, { "grad_norm": 0.2716841697692871, "learning_rate": 5.9236390167172096e-05, "loss": 0.0069, "step": 28130 }, { "grad_norm": 0.28548988699913025, "learning_rate": 5.920930521326932e-05, "loss": 0.0062, "step": 28140 }, { "grad_norm": 0.18524621427059174, "learning_rate": 5.918221746182276e-05, "loss": 0.0076, "step": 28150 }, { "grad_norm": 0.25055769085884094, "learning_rate": 5.9155126921061e-05, "loss": 0.0066, "step": 28160 }, { "grad_norm": 0.19401398301124573, "learning_rate": 5.91280335992134e-05, "loss": 0.0069, "step": 28170 }, { "grad_norm": 0.23617658019065857, "learning_rate": 5.91009375045102e-05, "loss": 0.0066, "step": 28180 }, { "grad_norm": 0.18584458529949188, "learning_rate": 5.9073838645182476e-05, "loss": 0.0066, "step": 28190 }, { "grad_norm": 0.2191765010356903, "learning_rate": 5.904673702946217e-05, "loss": 0.0092, "step": 28200 }, { "grad_norm": 0.2633270025253296, "learning_rate": 5.9019632665582004e-05, "loss": 0.0082, "step": 28210 }, { "grad_norm": 0.32821163535118103, "learning_rate": 5.899252556177559e-05, "loss": 0.0091, "step": 28220 }, { "grad_norm": 0.28726497292518616, "learning_rate": 5.896541572627735e-05, "loss": 0.0105, "step": 28230 }, { "grad_norm": 0.26289427280426025, "learning_rate": 5.893830316732253e-05, "loss": 0.0085, "step": 28240 }, { "grad_norm": 0.18791896104812622, "learning_rate": 5.8911187893147214e-05, "loss": 0.0086, "step": 28250 }, { "grad_norm": 0.31127044558525085, "learning_rate": 5.888406991198828e-05, "loss": 0.0077, "step": 28260 }, { "grad_norm": 0.21324124932289124, "learning_rate": 5.885694923208349e-05, "loss": 0.0077, "step": 28270 }, { "grad_norm": 0.2801651060581207, "learning_rate": 5.882982586167138e-05, "loss": 0.0079, "step": 28280 }, { "grad_norm": 0.26293638348579407, "learning_rate": 5.880269980899131e-05, "loss": 0.0065, "step": 28290 }, { "grad_norm": 0.24821090698242188, "learning_rate": 5.8775571082283465e-05, "loss": 0.0067, "step": 28300 }, { "grad_norm": 0.3018585443496704, "learning_rate": 5.8748439689788824e-05, "loss": 0.008, "step": 28310 }, { "grad_norm": 0.2888025939464569, "learning_rate": 5.87213056397492e-05, "loss": 0.0102, "step": 28320 }, { "grad_norm": 0.26619553565979004, "learning_rate": 5.869416894040719e-05, "loss": 0.011, "step": 28330 }, { "grad_norm": 0.3204382359981537, "learning_rate": 5.866702960000621e-05, "loss": 0.0067, "step": 28340 }, { "grad_norm": 0.23167967796325684, "learning_rate": 5.863988762679048e-05, "loss": 0.0082, "step": 28350 }, { "grad_norm": 0.3500116765499115, "learning_rate": 5.8612743029005e-05, "loss": 0.0078, "step": 28360 }, { "grad_norm": 0.2260109782218933, "learning_rate": 5.858559581489561e-05, "loss": 0.0083, "step": 28370 }, { "grad_norm": 0.21143600344657898, "learning_rate": 5.85584459927089e-05, "loss": 0.0098, "step": 28380 }, { "grad_norm": 0.28960177302360535, "learning_rate": 5.853129357069227e-05, "loss": 0.0074, "step": 28390 }, { "grad_norm": 0.28553298115730286, "learning_rate": 5.8504138557093913e-05, "loss": 0.0095, "step": 28400 }, { "grad_norm": 0.24120651185512543, "learning_rate": 5.8476980960162784e-05, "loss": 0.0087, "step": 28410 }, { "grad_norm": 0.22933678328990936, "learning_rate": 5.844982078814868e-05, "loss": 0.0095, "step": 28420 }, { "grad_norm": 0.2297295480966568, "learning_rate": 5.842265804930211e-05, "loss": 0.0067, "step": 28430 }, { "grad_norm": 0.25334498286247253, "learning_rate": 5.839549275187444e-05, "loss": 0.011, "step": 28440 }, { "grad_norm": 0.22509747743606567, "learning_rate": 5.836832490411771e-05, "loss": 0.0073, "step": 28450 }, { "grad_norm": 0.34018099308013916, "learning_rate": 5.834115451428485e-05, "loss": 0.0105, "step": 28460 }, { "grad_norm": 0.30208104848861694, "learning_rate": 5.831398159062946e-05, "loss": 0.0078, "step": 28470 }, { "grad_norm": 0.2879299819469452, "learning_rate": 5.828680614140599e-05, "loss": 0.0096, "step": 28480 }, { "grad_norm": 0.2353498935699463, "learning_rate": 5.825962817486962e-05, "loss": 0.0092, "step": 28490 }, { "grad_norm": 0.38227665424346924, "learning_rate": 5.823244769927629e-05, "loss": 0.008, "step": 28500 }, { "grad_norm": 0.2462458312511444, "learning_rate": 5.8205264722882716e-05, "loss": 0.0081, "step": 28510 }, { "grad_norm": 0.2623266875743866, "learning_rate": 5.817807925394636e-05, "loss": 0.0081, "step": 28520 }, { "grad_norm": 0.21388186514377594, "learning_rate": 5.815089130072546e-05, "loss": 0.0076, "step": 28530 }, { "grad_norm": 0.2926730513572693, "learning_rate": 5.8123700871479e-05, "loss": 0.0097, "step": 28540 }, { "grad_norm": 0.23351100087165833, "learning_rate": 5.809650797446671e-05, "loss": 0.0086, "step": 28550 }, { "grad_norm": 0.21775932610034943, "learning_rate": 5.806931261794907e-05, "loss": 0.0061, "step": 28560 }, { "grad_norm": 0.1772022247314453, "learning_rate": 5.804211481018731e-05, "loss": 0.0062, "step": 28570 }, { "grad_norm": 0.2332395613193512, "learning_rate": 5.801491455944341e-05, "loss": 0.0086, "step": 28580 }, { "grad_norm": 0.2631393074989319, "learning_rate": 5.79877118739801e-05, "loss": 0.0071, "step": 28590 }, { "grad_norm": 0.3406801223754883, "learning_rate": 5.7960506762060816e-05, "loss": 0.009, "step": 28600 }, { "grad_norm": 0.2846415042877197, "learning_rate": 5.793329923194977e-05, "loss": 0.0081, "step": 28610 }, { "grad_norm": 0.305441677570343, "learning_rate": 5.790608929191187e-05, "loss": 0.0134, "step": 28620 }, { "grad_norm": 0.28540533781051636, "learning_rate": 5.78788769502128e-05, "loss": 0.0097, "step": 28630 }, { "grad_norm": 0.18442299962043762, "learning_rate": 5.785166221511894e-05, "loss": 0.0079, "step": 28640 }, { "grad_norm": 0.21536025404930115, "learning_rate": 5.7824445094897415e-05, "loss": 0.0061, "step": 28650 }, { "grad_norm": 0.24934975802898407, "learning_rate": 5.7797225597816065e-05, "loss": 0.0062, "step": 28660 }, { "grad_norm": 0.27574193477630615, "learning_rate": 5.777000373214345e-05, "loss": 0.0099, "step": 28670 }, { "grad_norm": 0.28934425115585327, "learning_rate": 5.774277950614885e-05, "loss": 0.0083, "step": 28680 }, { "grad_norm": 0.22388799488544464, "learning_rate": 5.771555292810227e-05, "loss": 0.0073, "step": 28690 }, { "grad_norm": 0.24774283170700073, "learning_rate": 5.768832400627444e-05, "loss": 0.0082, "step": 28700 }, { "grad_norm": 0.2151235193014145, "learning_rate": 5.7661092748936775e-05, "loss": 0.0093, "step": 28710 }, { "grad_norm": 0.16786257922649384, "learning_rate": 5.76338591643614e-05, "loss": 0.0074, "step": 28720 }, { "grad_norm": 0.2493477314710617, "learning_rate": 5.760662326082118e-05, "loss": 0.0086, "step": 28730 }, { "grad_norm": 0.20675677061080933, "learning_rate": 5.757938504658965e-05, "loss": 0.007, "step": 28740 }, { "grad_norm": 0.27235719561576843, "learning_rate": 5.755214452994107e-05, "loss": 0.0071, "step": 28750 }, { "grad_norm": 0.2107953429222107, "learning_rate": 5.752490171915039e-05, "loss": 0.0103, "step": 28760 }, { "grad_norm": 0.23601463437080383, "learning_rate": 5.749765662249324e-05, "loss": 0.0101, "step": 28770 }, { "grad_norm": 0.3236654996871948, "learning_rate": 5.747040924824596e-05, "loss": 0.0091, "step": 28780 }, { "grad_norm": 0.26263535022735596, "learning_rate": 5.7443159604685613e-05, "loss": 0.0081, "step": 28790 }, { "grad_norm": 0.277301549911499, "learning_rate": 5.74159077000899e-05, "loss": 0.007, "step": 28800 }, { "grad_norm": 0.23350243270397186, "learning_rate": 5.7388653542737235e-05, "loss": 0.0067, "step": 28810 }, { "grad_norm": 0.22064347565174103, "learning_rate": 5.736139714090672e-05, "loss": 0.0056, "step": 28820 }, { "grad_norm": 0.17626850306987762, "learning_rate": 5.73341385028781e-05, "loss": 0.0049, "step": 28830 }, { "grad_norm": 0.15675033628940582, "learning_rate": 5.7306877636931855e-05, "loss": 0.0065, "step": 28840 }, { "grad_norm": 0.22760313749313354, "learning_rate": 5.7279614551349125e-05, "loss": 0.0076, "step": 28850 }, { "grad_norm": 0.21552512049674988, "learning_rate": 5.725234925441169e-05, "loss": 0.0068, "step": 28860 }, { "grad_norm": 0.2988857924938202, "learning_rate": 5.7225081754402044e-05, "loss": 0.0103, "step": 28870 }, { "grad_norm": 0.19499582052230835, "learning_rate": 5.7197812059603326e-05, "loss": 0.0074, "step": 28880 }, { "grad_norm": 0.24426379799842834, "learning_rate": 5.717054017829934e-05, "loss": 0.0082, "step": 28890 }, { "grad_norm": 0.2361854761838913, "learning_rate": 5.7143266118774584e-05, "loss": 0.0077, "step": 28900 }, { "grad_norm": 0.25014418363571167, "learning_rate": 5.711598988931418e-05, "loss": 0.008, "step": 28910 }, { "grad_norm": 0.25537556409835815, "learning_rate": 5.7088711498203954e-05, "loss": 0.0089, "step": 28920 }, { "grad_norm": 0.25293126702308655, "learning_rate": 5.706143095373033e-05, "loss": 0.0095, "step": 28930 }, { "grad_norm": 0.21512947976589203, "learning_rate": 5.703414826418042e-05, "loss": 0.0049, "step": 28940 }, { "grad_norm": 0.20542387664318085, "learning_rate": 5.7006863437842007e-05, "loss": 0.0065, "step": 28950 }, { "grad_norm": 0.224168062210083, "learning_rate": 5.697957648300348e-05, "loss": 0.0057, "step": 28960 }, { "grad_norm": 0.2938244938850403, "learning_rate": 5.695228740795391e-05, "loss": 0.0072, "step": 28970 }, { "grad_norm": 0.28032246232032776, "learning_rate": 5.6924996220982985e-05, "loss": 0.0091, "step": 28980 }, { "grad_norm": 0.37351134419441223, "learning_rate": 5.6897702930381045e-05, "loss": 0.0123, "step": 28990 }, { "grad_norm": 0.2779284417629242, "learning_rate": 5.687040754443908e-05, "loss": 0.0076, "step": 29000 }, { "grad_norm": 0.23601925373077393, "learning_rate": 5.6843110071448725e-05, "loss": 0.01, "step": 29010 }, { "grad_norm": 0.23414798080921173, "learning_rate": 5.6815810519702194e-05, "loss": 0.0082, "step": 29020 }, { "grad_norm": 0.19407857954502106, "learning_rate": 5.6788508897492396e-05, "loss": 0.0057, "step": 29030 }, { "grad_norm": 0.19704720377922058, "learning_rate": 5.676120521311282e-05, "loss": 0.0076, "step": 29040 }, { "grad_norm": 0.22677984833717346, "learning_rate": 5.6733899474857634e-05, "loss": 0.0061, "step": 29050 }, { "grad_norm": 0.2066618800163269, "learning_rate": 5.670659169102157e-05, "loss": 0.008, "step": 29060 }, { "grad_norm": 0.1862179934978485, "learning_rate": 5.6679281869900044e-05, "loss": 0.0076, "step": 29070 }, { "grad_norm": 0.20768149197101593, "learning_rate": 5.6651970019789045e-05, "loss": 0.0064, "step": 29080 }, { "grad_norm": 0.2686161696910858, "learning_rate": 5.662465614898519e-05, "loss": 0.011, "step": 29090 }, { "grad_norm": 0.25573837757110596, "learning_rate": 5.6597340265785695e-05, "loss": 0.0081, "step": 29100 }, { "grad_norm": 0.31042012572288513, "learning_rate": 5.657002237848843e-05, "loss": 0.012, "step": 29110 }, { "grad_norm": 0.251038521528244, "learning_rate": 5.654270249539183e-05, "loss": 0.0082, "step": 29120 }, { "grad_norm": 0.21690836548805237, "learning_rate": 5.651538062479498e-05, "loss": 0.0063, "step": 29130 }, { "grad_norm": 0.22868235409259796, "learning_rate": 5.648805677499751e-05, "loss": 0.0078, "step": 29140 }, { "grad_norm": 0.17891651391983032, "learning_rate": 5.646073095429969e-05, "loss": 0.0076, "step": 29150 }, { "grad_norm": 0.2311461716890335, "learning_rate": 5.643340317100241e-05, "loss": 0.0072, "step": 29160 }, { "grad_norm": 0.25308936834335327, "learning_rate": 5.64060734334071e-05, "loss": 0.0076, "step": 29170 }, { "grad_norm": 0.2328048199415207, "learning_rate": 5.637874174981583e-05, "loss": 0.0077, "step": 29180 }, { "grad_norm": 0.22059611976146698, "learning_rate": 5.635140812853124e-05, "loss": 0.0067, "step": 29190 }, { "grad_norm": 0.31788837909698486, "learning_rate": 5.6324072577856544e-05, "loss": 0.0105, "step": 29200 }, { "grad_norm": 0.2928001284599304, "learning_rate": 5.629673510609559e-05, "loss": 0.0091, "step": 29210 }, { "grad_norm": 0.29898592829704285, "learning_rate": 5.626939572155276e-05, "loss": 0.0104, "step": 29220 }, { "grad_norm": 0.3294891119003296, "learning_rate": 5.6242054432533054e-05, "loss": 0.0076, "step": 29230 }, { "grad_norm": 0.27903491258621216, "learning_rate": 5.621471124734201e-05, "loss": 0.0072, "step": 29240 }, { "grad_norm": 0.24635784327983856, "learning_rate": 5.6187366174285794e-05, "loss": 0.0073, "step": 29250 }, { "grad_norm": 0.2724165916442871, "learning_rate": 5.616001922167109e-05, "loss": 0.0102, "step": 29260 }, { "grad_norm": 0.3436252176761627, "learning_rate": 5.61326703978052e-05, "loss": 0.0091, "step": 29270 }, { "grad_norm": 0.3081994652748108, "learning_rate": 5.6105319710995964e-05, "loss": 0.0066, "step": 29280 }, { "grad_norm": 0.27519500255584717, "learning_rate": 5.60779671695518e-05, "loss": 0.0103, "step": 29290 }, { "grad_norm": 0.20871011912822723, "learning_rate": 5.6050612781781684e-05, "loss": 0.0067, "step": 29300 }, { "grad_norm": 0.21957652270793915, "learning_rate": 5.602325655599516e-05, "loss": 0.008, "step": 29310 }, { "grad_norm": 0.20027408003807068, "learning_rate": 5.599589850050234e-05, "loss": 0.0075, "step": 29320 }, { "grad_norm": 0.22094036638736725, "learning_rate": 5.5968538623613874e-05, "loss": 0.0095, "step": 29330 }, { "grad_norm": 0.2588355541229248, "learning_rate": 5.594117693364095e-05, "loss": 0.0068, "step": 29340 }, { "grad_norm": 0.30398425459861755, "learning_rate": 5.591381343889535e-05, "loss": 0.0098, "step": 29350 }, { "grad_norm": 0.18580931425094604, "learning_rate": 5.5886448147689355e-05, "loss": 0.0069, "step": 29360 }, { "grad_norm": 0.2529332935810089, "learning_rate": 5.585908106833585e-05, "loss": 0.0057, "step": 29370 }, { "grad_norm": 0.21863658726215363, "learning_rate": 5.5831712209148226e-05, "loss": 0.0095, "step": 29380 }, { "grad_norm": 0.2676388919353485, "learning_rate": 5.58043415784404e-05, "loss": 0.0073, "step": 29390 }, { "grad_norm": 0.30261924862861633, "learning_rate": 5.577696918452686e-05, "loss": 0.0088, "step": 29400 }, { "grad_norm": 0.2661668062210083, "learning_rate": 5.5749595035722604e-05, "loss": 0.0074, "step": 29410 }, { "grad_norm": 0.2667039632797241, "learning_rate": 5.5722219140343193e-05, "loss": 0.0066, "step": 29420 }, { "grad_norm": 0.2321736365556717, "learning_rate": 5.56948415067047e-05, "loss": 0.008, "step": 29430 }, { "grad_norm": 0.1692020744085312, "learning_rate": 5.5667462143123704e-05, "loss": 0.0055, "step": 29440 }, { "grad_norm": 0.23038333654403687, "learning_rate": 5.564008105791737e-05, "loss": 0.0082, "step": 29450 }, { "grad_norm": 0.2729283273220062, "learning_rate": 5.5612698259403316e-05, "loss": 0.0093, "step": 29460 }, { "grad_norm": 0.24506737291812897, "learning_rate": 5.5585313755899724e-05, "loss": 0.0074, "step": 29470 }, { "grad_norm": 0.2097046822309494, "learning_rate": 5.5557927555725285e-05, "loss": 0.0071, "step": 29480 }, { "grad_norm": 0.20105338096618652, "learning_rate": 5.55305396671992e-05, "loss": 0.0071, "step": 29490 }, { "grad_norm": 0.21321679651737213, "learning_rate": 5.55031500986412e-05, "loss": 0.0059, "step": 29500 }, { "grad_norm": 0.24555663764476776, "learning_rate": 5.547575885837149e-05, "loss": 0.011, "step": 29510 }, { "grad_norm": 0.18333902955055237, "learning_rate": 5.5448365954710825e-05, "loss": 0.0092, "step": 29520 }, { "grad_norm": 0.21150030195713043, "learning_rate": 5.5420971395980446e-05, "loss": 0.0078, "step": 29530 }, { "grad_norm": 0.25246310234069824, "learning_rate": 5.539357519050209e-05, "loss": 0.0068, "step": 29540 }, { "grad_norm": 0.1876419484615326, "learning_rate": 5.536617734659799e-05, "loss": 0.0063, "step": 29550 }, { "grad_norm": 0.262921005487442, "learning_rate": 5.533877787259091e-05, "loss": 0.0086, "step": 29560 }, { "grad_norm": 0.17875295877456665, "learning_rate": 5.5311376776804044e-05, "loss": 0.0076, "step": 29570 }, { "grad_norm": 0.35606858134269714, "learning_rate": 5.528397406756118e-05, "loss": 0.0088, "step": 29580 }, { "grad_norm": 0.31185194849967957, "learning_rate": 5.525656975318652e-05, "loss": 0.0086, "step": 29590 }, { "grad_norm": 0.2646799385547638, "learning_rate": 5.522916384200474e-05, "loss": 0.01, "step": 29600 }, { "grad_norm": 0.21735616028308868, "learning_rate": 5.520175634234106e-05, "loss": 0.0069, "step": 29610 }, { "grad_norm": 0.2125009149312973, "learning_rate": 5.517434726252113e-05, "loss": 0.0065, "step": 29620 }, { "grad_norm": 0.20805130898952484, "learning_rate": 5.514693661087113e-05, "loss": 0.0074, "step": 29630 }, { "grad_norm": 0.2838054597377777, "learning_rate": 5.511952439571769e-05, "loss": 0.006, "step": 29640 }, { "grad_norm": 0.24208292365074158, "learning_rate": 5.509211062538791e-05, "loss": 0.0073, "step": 29650 }, { "grad_norm": 0.23704229295253754, "learning_rate": 5.506469530820939e-05, "loss": 0.0086, "step": 29660 }, { "grad_norm": 0.2304278314113617, "learning_rate": 5.503727845251014e-05, "loss": 0.0082, "step": 29670 }, { "grad_norm": 0.2341359406709671, "learning_rate": 5.50098600666187e-05, "loss": 0.0085, "step": 29680 }, { "grad_norm": 0.23659272491931915, "learning_rate": 5.498244015886406e-05, "loss": 0.0064, "step": 29690 }, { "grad_norm": 0.2730538547039032, "learning_rate": 5.495501873757565e-05, "loss": 0.006, "step": 29700 }, { "grad_norm": 0.2138662487268448, "learning_rate": 5.492759581108336e-05, "loss": 0.0078, "step": 29710 }, { "grad_norm": 0.2787604331970215, "learning_rate": 5.490017138771759e-05, "loss": 0.0087, "step": 29720 }, { "grad_norm": 0.23710469901561737, "learning_rate": 5.487274547580912e-05, "loss": 0.0073, "step": 29730 }, { "grad_norm": 0.24497291445732117, "learning_rate": 5.484531808368923e-05, "loss": 0.0079, "step": 29740 }, { "grad_norm": 0.20636817812919617, "learning_rate": 5.4817889219689656e-05, "loss": 0.0071, "step": 29750 }, { "grad_norm": 0.2868874967098236, "learning_rate": 5.4790458892142536e-05, "loss": 0.009, "step": 29760 }, { "grad_norm": 0.24427495896816254, "learning_rate": 5.476302710938048e-05, "loss": 0.007, "step": 29770 }, { "grad_norm": 0.23827575147151947, "learning_rate": 5.473559387973657e-05, "loss": 0.0069, "step": 29780 }, { "grad_norm": 0.2940636873245239, "learning_rate": 5.470815921154425e-05, "loss": 0.0085, "step": 29790 }, { "grad_norm": 0.2819958031177521, "learning_rate": 5.468072311313749e-05, "loss": 0.0094, "step": 29800 }, { "grad_norm": 0.2547339200973511, "learning_rate": 5.465328559285063e-05, "loss": 0.0094, "step": 29810 }, { "grad_norm": 0.23722390830516815, "learning_rate": 5.462584665901849e-05, "loss": 0.01, "step": 29820 }, { "grad_norm": 0.30876991152763367, "learning_rate": 5.4598406319976235e-05, "loss": 0.0066, "step": 29830 }, { "grad_norm": 0.2791697382926941, "learning_rate": 5.457096458405958e-05, "loss": 0.0104, "step": 29840 }, { "grad_norm": 0.27456432580947876, "learning_rate": 5.454352145960457e-05, "loss": 0.0087, "step": 29850 }, { "grad_norm": 0.2564623951911926, "learning_rate": 5.4516076954947715e-05, "loss": 0.0094, "step": 29860 }, { "grad_norm": 0.23036301136016846, "learning_rate": 5.448863107842591e-05, "loss": 0.0097, "step": 29870 }, { "grad_norm": 0.2549186646938324, "learning_rate": 5.446118383837651e-05, "loss": 0.0074, "step": 29880 }, { "grad_norm": 0.17569203674793243, "learning_rate": 5.443373524313722e-05, "loss": 0.0057, "step": 29890 }, { "grad_norm": 0.2941034734249115, "learning_rate": 5.440628530104626e-05, "loss": 0.0099, "step": 29900 }, { "grad_norm": 0.1800224632024765, "learning_rate": 5.4378834020442146e-05, "loss": 0.0081, "step": 29910 }, { "grad_norm": 0.2607594132423401, "learning_rate": 5.4351381409663884e-05, "loss": 0.0056, "step": 29920 }, { "grad_norm": 0.2771557867527008, "learning_rate": 5.432392747705084e-05, "loss": 0.0088, "step": 29930 }, { "grad_norm": 0.28794682025909424, "learning_rate": 5.429647223094278e-05, "loss": 0.0071, "step": 29940 }, { "grad_norm": 0.2534019351005554, "learning_rate": 5.4269015679679924e-05, "loss": 0.0071, "step": 29950 }, { "grad_norm": 0.217961847782135, "learning_rate": 5.424155783160281e-05, "loss": 0.0087, "step": 29960 }, { "grad_norm": 0.28829023241996765, "learning_rate": 5.4214098695052415e-05, "loss": 0.0071, "step": 29970 }, { "grad_norm": 0.26356667280197144, "learning_rate": 5.418663827837012e-05, "loss": 0.0072, "step": 29980 }, { "grad_norm": 0.24365897476673126, "learning_rate": 5.415917658989763e-05, "loss": 0.007, "step": 29990 }, { "grad_norm": 0.22417080402374268, "learning_rate": 5.413171363797713e-05, "loss": 0.0076, "step": 30000 }, { "grad_norm": 0.20520949363708496, "learning_rate": 5.4104249430951116e-05, "loss": 0.0118, "step": 30010 }, { "grad_norm": 0.18385519087314606, "learning_rate": 5.4076783977162494e-05, "loss": 0.0082, "step": 30020 }, { "grad_norm": 0.307303786277771, "learning_rate": 5.4049317284954525e-05, "loss": 0.0087, "step": 30030 }, { "grad_norm": 0.2726621925830841, "learning_rate": 5.4021849362670884e-05, "loss": 0.0092, "step": 30040 }, { "grad_norm": 0.21024097502231598, "learning_rate": 5.3994380218655604e-05, "loss": 0.0069, "step": 30050 }, { "grad_norm": 0.24375587701797485, "learning_rate": 5.396690986125309e-05, "loss": 0.0083, "step": 30060 }, { "grad_norm": 0.24686092138290405, "learning_rate": 5.3939438298808075e-05, "loss": 0.0087, "step": 30070 }, { "grad_norm": 0.2115754634141922, "learning_rate": 5.3911965539665744e-05, "loss": 0.006, "step": 30080 }, { "grad_norm": 0.2142253965139389, "learning_rate": 5.388449159217156e-05, "loss": 0.0096, "step": 30090 }, { "grad_norm": 0.20955142378807068, "learning_rate": 5.3857016464671385e-05, "loss": 0.0088, "step": 30100 }, { "grad_norm": 0.24378670752048492, "learning_rate": 5.382954016551146e-05, "loss": 0.0087, "step": 30110 }, { "grad_norm": 0.19760951399803162, "learning_rate": 5.380206270303835e-05, "loss": 0.0088, "step": 30120 }, { "grad_norm": 0.20087166130542755, "learning_rate": 5.377458408559897e-05, "loss": 0.0095, "step": 30130 }, { "grad_norm": 0.21888969838619232, "learning_rate": 5.374710432154061e-05, "loss": 0.0095, "step": 30140 }, { "grad_norm": 0.3185484707355499, "learning_rate": 5.3719623419210886e-05, "loss": 0.0119, "step": 30150 }, { "grad_norm": 0.24816665053367615, "learning_rate": 5.3692141386957786e-05, "loss": 0.01, "step": 30160 }, { "grad_norm": 0.2567881941795349, "learning_rate": 5.3664658233129616e-05, "loss": 0.0087, "step": 30170 }, { "grad_norm": 0.3140625059604645, "learning_rate": 5.363717396607504e-05, "loss": 0.0109, "step": 30180 }, { "grad_norm": 0.25329121947288513, "learning_rate": 5.360968859414305e-05, "loss": 0.0097, "step": 30190 }, { "grad_norm": 0.19744175672531128, "learning_rate": 5.358220212568295e-05, "loss": 0.008, "step": 30200 }, { "grad_norm": 0.3458896577358246, "learning_rate": 5.355471456904444e-05, "loss": 0.0071, "step": 30210 }, { "grad_norm": 0.25304165482521057, "learning_rate": 5.3527225932577495e-05, "loss": 0.0116, "step": 30220 }, { "grad_norm": 0.2320389300584793, "learning_rate": 5.349973622463246e-05, "loss": 0.0086, "step": 30230 }, { "grad_norm": 0.24490326642990112, "learning_rate": 5.3472245453559956e-05, "loss": 0.0076, "step": 30240 }, { "grad_norm": 0.1762550175189972, "learning_rate": 5.3444753627710955e-05, "loss": 0.0061, "step": 30250 }, { "grad_norm": 0.3221755921840668, "learning_rate": 5.341726075543676e-05, "loss": 0.0077, "step": 30260 }, { "grad_norm": 0.2090737521648407, "learning_rate": 5.338976684508898e-05, "loss": 0.0073, "step": 30270 }, { "grad_norm": 0.23273541033267975, "learning_rate": 5.336227190501953e-05, "loss": 0.008, "step": 30280 }, { "grad_norm": 0.18089531362056732, "learning_rate": 5.3334775943580664e-05, "loss": 0.0064, "step": 30290 }, { "grad_norm": 0.22441692650318146, "learning_rate": 5.330727896912491e-05, "loss": 0.0066, "step": 30300 }, { "grad_norm": 0.2760082185268402, "learning_rate": 5.327978099000511e-05, "loss": 0.0077, "step": 30310 }, { "grad_norm": 0.2562609612941742, "learning_rate": 5.3252282014574465e-05, "loss": 0.0063, "step": 30320 }, { "grad_norm": 0.29413658380508423, "learning_rate": 5.322478205118641e-05, "loss": 0.009, "step": 30330 }, { "grad_norm": 0.29796522855758667, "learning_rate": 5.3197281108194704e-05, "loss": 0.0063, "step": 30340 }, { "grad_norm": 0.167960062623024, "learning_rate": 5.316977919395342e-05, "loss": 0.0054, "step": 30350 }, { "grad_norm": 0.264981210231781, "learning_rate": 5.314227631681691e-05, "loss": 0.0072, "step": 30360 }, { "grad_norm": 0.2510298192501068, "learning_rate": 5.311477248513982e-05, "loss": 0.0081, "step": 30370 }, { "grad_norm": 0.26735007762908936, "learning_rate": 5.30872677072771e-05, "loss": 0.0109, "step": 30380 }, { "grad_norm": 0.22081099450588226, "learning_rate": 5.3059761991583954e-05, "loss": 0.0078, "step": 30390 }, { "grad_norm": 0.22445517778396606, "learning_rate": 5.303225534641592e-05, "loss": 0.0078, "step": 30400 }, { "grad_norm": 0.2581021785736084, "learning_rate": 5.300474778012875e-05, "loss": 0.0065, "step": 30410 }, { "grad_norm": 0.2959626019001007, "learning_rate": 5.297723930107855e-05, "loss": 0.0081, "step": 30420 }, { "grad_norm": 0.28532201051712036, "learning_rate": 5.294972991762167e-05, "loss": 0.0068, "step": 30430 }, { "grad_norm": 0.22832205891609192, "learning_rate": 5.292221963811472e-05, "loss": 0.0077, "step": 30440 }, { "grad_norm": 0.2599642872810364, "learning_rate": 5.28947084709146e-05, "loss": 0.0064, "step": 30450 }, { "grad_norm": 0.22445081174373627, "learning_rate": 5.2867196424378465e-05, "loss": 0.01, "step": 30460 }, { "grad_norm": 0.31649258732795715, "learning_rate": 5.2839683506863765e-05, "loss": 0.009, "step": 30470 }, { "grad_norm": 0.23157016932964325, "learning_rate": 5.281216972672821e-05, "loss": 0.0077, "step": 30480 }, { "grad_norm": 0.2759285569190979, "learning_rate": 5.278465509232973e-05, "loss": 0.0086, "step": 30490 }, { "grad_norm": 0.3200608193874359, "learning_rate": 5.275713961202655e-05, "loss": 0.007, "step": 30500 }, { "grad_norm": 0.1513795554637909, "learning_rate": 5.2729623294177165e-05, "loss": 0.0083, "step": 30510 }, { "grad_norm": 0.19994240999221802, "learning_rate": 5.270210614714028e-05, "loss": 0.0075, "step": 30520 }, { "grad_norm": 0.28533345460891724, "learning_rate": 5.267458817927491e-05, "loss": 0.0102, "step": 30530 }, { "grad_norm": 0.3048255443572998, "learning_rate": 5.264706939894026e-05, "loss": 0.0072, "step": 30540 }, { "grad_norm": 0.22084756195545197, "learning_rate": 5.261954981449584e-05, "loss": 0.0091, "step": 30550 }, { "grad_norm": 0.22976526618003845, "learning_rate": 5.2592029434301324e-05, "loss": 0.0072, "step": 30560 }, { "grad_norm": 0.29769137501716614, "learning_rate": 5.256450826671672e-05, "loss": 0.0087, "step": 30570 }, { "grad_norm": 0.28434839844703674, "learning_rate": 5.253698632010221e-05, "loss": 0.0085, "step": 30580 }, { "grad_norm": 0.2321949154138565, "learning_rate": 5.2509463602818246e-05, "loss": 0.0097, "step": 30590 }, { "grad_norm": 0.26280343532562256, "learning_rate": 5.248194012322549e-05, "loss": 0.0102, "step": 30600 }, { "grad_norm": 0.19255560636520386, "learning_rate": 5.245441588968486e-05, "loss": 0.0085, "step": 30610 }, { "grad_norm": 0.18992942571640015, "learning_rate": 5.242689091055748e-05, "loss": 0.0063, "step": 30620 }, { "grad_norm": 0.19230274856090546, "learning_rate": 5.239936519420473e-05, "loss": 0.0056, "step": 30630 }, { "grad_norm": 0.24540460109710693, "learning_rate": 5.2371838748988175e-05, "loss": 0.005, "step": 30640 }, { "grad_norm": 0.19172360002994537, "learning_rate": 5.234431158326965e-05, "loss": 0.0058, "step": 30650 }, { "grad_norm": 0.23868797719478607, "learning_rate": 5.231678370541115e-05, "loss": 0.0078, "step": 30660 }, { "grad_norm": 0.2989415228366852, "learning_rate": 5.228925512377495e-05, "loss": 0.007, "step": 30670 }, { "grad_norm": 0.3222934901714325, "learning_rate": 5.2261725846723465e-05, "loss": 0.0088, "step": 30680 }, { "grad_norm": 0.23949749767780304, "learning_rate": 5.22341958826194e-05, "loss": 0.0088, "step": 30690 }, { "grad_norm": 0.23432166874408722, "learning_rate": 5.22066652398256e-05, "loss": 0.0071, "step": 30700 }, { "grad_norm": 0.22309230268001556, "learning_rate": 5.2179133926705185e-05, "loss": 0.0083, "step": 30710 }, { "grad_norm": 0.2281610369682312, "learning_rate": 5.215160195162141e-05, "loss": 0.0089, "step": 30720 }, { "grad_norm": 0.19826920330524445, "learning_rate": 5.212406932293776e-05, "loss": 0.0101, "step": 30730 }, { "grad_norm": 0.3359815180301666, "learning_rate": 5.209653604901795e-05, "loss": 0.0078, "step": 30740 }, { "grad_norm": 0.24799604713916779, "learning_rate": 5.206900213822584e-05, "loss": 0.0085, "step": 30750 }, { "grad_norm": 0.2714398503303528, "learning_rate": 5.204146759892551e-05, "loss": 0.0064, "step": 30760 }, { "grad_norm": 0.29669567942619324, "learning_rate": 5.2013932439481216e-05, "loss": 0.012, "step": 30770 }, { "grad_norm": 0.22248001396656036, "learning_rate": 5.198639666825743e-05, "loss": 0.0074, "step": 30780 }, { "grad_norm": 0.2481614351272583, "learning_rate": 5.195886029361877e-05, "loss": 0.0113, "step": 30790 }, { "grad_norm": 0.2154862880706787, "learning_rate": 5.193132332393009e-05, "loss": 0.0058, "step": 30800 }, { "grad_norm": 0.19185087084770203, "learning_rate": 5.1903785767556376e-05, "loss": 0.006, "step": 30810 }, { "grad_norm": 0.22475433349609375, "learning_rate": 5.187624763286282e-05, "loss": 0.0055, "step": 30820 }, { "grad_norm": 0.46180668473243713, "learning_rate": 5.184870892821475e-05, "loss": 0.0101, "step": 30830 }, { "grad_norm": 0.1878470629453659, "learning_rate": 5.182116966197773e-05, "loss": 0.0096, "step": 30840 }, { "grad_norm": 0.27362531423568726, "learning_rate": 5.1793629842517466e-05, "loss": 0.0079, "step": 30850 }, { "grad_norm": 0.22345761954784393, "learning_rate": 5.17660894781998e-05, "loss": 0.0064, "step": 30860 }, { "grad_norm": 0.3262491524219513, "learning_rate": 5.173854857739079e-05, "loss": 0.0117, "step": 30870 }, { "grad_norm": 0.2509889304637909, "learning_rate": 5.171100714845661e-05, "loss": 0.0074, "step": 30880 }, { "grad_norm": 0.2029818296432495, "learning_rate": 5.1683465199763646e-05, "loss": 0.0071, "step": 30890 }, { "grad_norm": 0.1668272316455841, "learning_rate": 5.16559227396784e-05, "loss": 0.0062, "step": 30900 }, { "grad_norm": 0.22027690708637238, "learning_rate": 5.1628379776567556e-05, "loss": 0.007, "step": 30910 }, { "grad_norm": 0.14922089874744415, "learning_rate": 5.160083631879792e-05, "loss": 0.0054, "step": 30920 }, { "grad_norm": 0.21996065974235535, "learning_rate": 5.1573292374736484e-05, "loss": 0.0053, "step": 30930 }, { "grad_norm": 0.21836510300636292, "learning_rate": 5.1545747952750356e-05, "loss": 0.0049, "step": 30940 }, { "grad_norm": 0.2488766610622406, "learning_rate": 5.151820306120682e-05, "loss": 0.0104, "step": 30950 }, { "grad_norm": 0.32899609208106995, "learning_rate": 5.149065770847328e-05, "loss": 0.0066, "step": 30960 }, { "grad_norm": 0.30933311581611633, "learning_rate": 5.1463111902917297e-05, "loss": 0.0085, "step": 30970 }, { "grad_norm": 0.22308926284313202, "learning_rate": 5.143556565290654e-05, "loss": 0.0085, "step": 30980 }, { "grad_norm": 0.3138863444328308, "learning_rate": 5.140801896680882e-05, "loss": 0.0129, "step": 30990 }, { "grad_norm": 0.20931702852249146, "learning_rate": 5.1380471852992144e-05, "loss": 0.0087, "step": 31000 }, { "grad_norm": 0.25267651677131653, "learning_rate": 5.135292431982457e-05, "loss": 0.0072, "step": 31010 }, { "grad_norm": 0.20820403099060059, "learning_rate": 5.1325376375674294e-05, "loss": 0.0086, "step": 31020 }, { "grad_norm": 0.3354862630367279, "learning_rate": 5.129782802890968e-05, "loss": 0.0079, "step": 31030 }, { "grad_norm": 0.2758081257343292, "learning_rate": 5.127027928789916e-05, "loss": 0.0098, "step": 31040 }, { "grad_norm": 0.21352413296699524, "learning_rate": 5.124273016101135e-05, "loss": 0.0064, "step": 31050 }, { "grad_norm": 0.3500865697860718, "learning_rate": 5.121518065661492e-05, "loss": 0.0089, "step": 31060 }, { "grad_norm": 0.24476422369480133, "learning_rate": 5.11876307830787e-05, "loss": 0.0078, "step": 31070 }, { "grad_norm": 0.23534061014652252, "learning_rate": 5.1160080548771596e-05, "loss": 0.0072, "step": 31080 }, { "grad_norm": 0.2848494350910187, "learning_rate": 5.1132529962062656e-05, "loss": 0.0083, "step": 31090 }, { "grad_norm": 0.23742736876010895, "learning_rate": 5.110497903132101e-05, "loss": 0.0074, "step": 31100 }, { "grad_norm": 0.28013914823532104, "learning_rate": 5.107742776491592e-05, "loss": 0.0125, "step": 31110 }, { "grad_norm": 0.20948348939418793, "learning_rate": 5.104987617121673e-05, "loss": 0.0064, "step": 31120 }, { "grad_norm": 0.2949061095714569, "learning_rate": 5.102232425859287e-05, "loss": 0.0089, "step": 31130 }, { "grad_norm": 0.4035622179508209, "learning_rate": 5.09947720354139e-05, "loss": 0.0081, "step": 31140 }, { "grad_norm": 0.2383720576763153, "learning_rate": 5.096721951004942e-05, "loss": 0.0105, "step": 31150 }, { "grad_norm": 0.22843797504901886, "learning_rate": 5.0939666690869227e-05, "loss": 0.0079, "step": 31160 }, { "grad_norm": 0.1961676925420761, "learning_rate": 5.0912113586243096e-05, "loss": 0.0063, "step": 31170 }, { "grad_norm": 0.2668704688549042, "learning_rate": 5.0884560204540935e-05, "loss": 0.0065, "step": 31180 }, { "grad_norm": 0.2266463041305542, "learning_rate": 5.0857006554132736e-05, "loss": 0.0069, "step": 31190 }, { "grad_norm": 0.23129190504550934, "learning_rate": 5.0829452643388575e-05, "loss": 0.0073, "step": 31200 }, { "grad_norm": 0.1816384643316269, "learning_rate": 5.08018984806786e-05, "loss": 0.0058, "step": 31210 }, { "grad_norm": 0.2779273986816406, "learning_rate": 5.0774344074373036e-05, "loss": 0.0076, "step": 31220 }, { "grad_norm": 0.20822004973888397, "learning_rate": 5.07467894328422e-05, "loss": 0.0049, "step": 31230 }, { "grad_norm": 0.15333092212677002, "learning_rate": 5.0719234564456454e-05, "loss": 0.009, "step": 31240 }, { "grad_norm": 0.21931354701519012, "learning_rate": 5.0691679477586216e-05, "loss": 0.0067, "step": 31250 }, { "grad_norm": 0.228150874376297, "learning_rate": 5.0664124180602035e-05, "loss": 0.0059, "step": 31260 }, { "grad_norm": 0.18881909549236298, "learning_rate": 5.063656868187447e-05, "loss": 0.0065, "step": 31270 }, { "grad_norm": 0.242721289396286, "learning_rate": 5.060901298977413e-05, "loss": 0.0067, "step": 31280 }, { "grad_norm": 0.24006229639053345, "learning_rate": 5.0581457112671725e-05, "loss": 0.0075, "step": 31290 }, { "grad_norm": 0.2504872679710388, "learning_rate": 5.0553901058938016e-05, "loss": 0.0065, "step": 31300 }, { "grad_norm": 0.20765410363674164, "learning_rate": 5.052634483694377e-05, "loss": 0.0056, "step": 31310 }, { "grad_norm": 0.26836851239204407, "learning_rate": 5.049878845505988e-05, "loss": 0.0098, "step": 31320 }, { "grad_norm": 0.290301650762558, "learning_rate": 5.047123192165721e-05, "loss": 0.0067, "step": 31330 }, { "grad_norm": 0.36293449997901917, "learning_rate": 5.0443675245106735e-05, "loss": 0.0065, "step": 31340 }, { "grad_norm": 0.246487557888031, "learning_rate": 5.0416118433779426e-05, "loss": 0.0059, "step": 31350 }, { "grad_norm": 0.2621065378189087, "learning_rate": 5.038856149604633e-05, "loss": 0.0094, "step": 31360 }, { "grad_norm": 0.22675533592700958, "learning_rate": 5.03610044402785e-05, "loss": 0.0069, "step": 31370 }, { "grad_norm": 0.18436338007450104, "learning_rate": 5.033344727484707e-05, "loss": 0.006, "step": 31380 }, { "grad_norm": 0.22949163615703583, "learning_rate": 5.030589000812315e-05, "loss": 0.0066, "step": 31390 }, { "grad_norm": 0.21328596770763397, "learning_rate": 5.027833264847793e-05, "loss": 0.0071, "step": 31400 }, { "grad_norm": 0.3004545271396637, "learning_rate": 5.025077520428258e-05, "loss": 0.0109, "step": 31410 }, { "grad_norm": 0.24109072983264923, "learning_rate": 5.022321768390837e-05, "loss": 0.0067, "step": 31420 }, { "grad_norm": 0.16669245064258575, "learning_rate": 5.0195660095726516e-05, "loss": 0.0068, "step": 31430 }, { "grad_norm": 0.2005222588777542, "learning_rate": 5.016810244810829e-05, "loss": 0.0078, "step": 31440 }, { "grad_norm": 0.16585180163383484, "learning_rate": 5.0140544749424976e-05, "loss": 0.009, "step": 31450 }, { "grad_norm": 0.1720247119665146, "learning_rate": 5.0112987008047874e-05, "loss": 0.007, "step": 31460 }, { "grad_norm": 0.22980570793151855, "learning_rate": 5.008542923234831e-05, "loss": 0.0074, "step": 31470 }, { "grad_norm": 0.21856798231601715, "learning_rate": 5.00578714306976e-05, "loss": 0.0065, "step": 31480 }, { "grad_norm": 0.2693788409233093, "learning_rate": 5.0030313611467084e-05, "loss": 0.0096, "step": 31490 }, { "grad_norm": 0.18391215801239014, "learning_rate": 5.0002755783028074e-05, "loss": 0.0088, "step": 31500 }, { "grad_norm": 0.2168341875076294, "learning_rate": 4.997519795375194e-05, "loss": 0.008, "step": 31510 }, { "grad_norm": 0.23137378692626953, "learning_rate": 4.9947640132010016e-05, "loss": 0.0057, "step": 31520 }, { "grad_norm": 0.18477937579154968, "learning_rate": 4.9920082326173625e-05, "loss": 0.0094, "step": 31530 }, { "grad_norm": 0.19932635128498077, "learning_rate": 4.9892524544614114e-05, "loss": 0.0073, "step": 31540 }, { "grad_norm": 0.217888742685318, "learning_rate": 4.986496679570283e-05, "loss": 0.0054, "step": 31550 }, { "grad_norm": 0.3126939535140991, "learning_rate": 4.983740908781105e-05, "loss": 0.0077, "step": 31560 }, { "grad_norm": 0.30557388067245483, "learning_rate": 4.9809851429310116e-05, "loss": 0.0096, "step": 31570 }, { "grad_norm": 0.2228098064661026, "learning_rate": 4.9782293828571275e-05, "loss": 0.0078, "step": 31580 }, { "grad_norm": 0.2402837574481964, "learning_rate": 4.9754736293965846e-05, "loss": 0.006, "step": 31590 }, { "grad_norm": 0.3416842520236969, "learning_rate": 4.972717883386502e-05, "loss": 0.0127, "step": 31600 }, { "grad_norm": 0.2635408341884613, "learning_rate": 4.9699621456640075e-05, "loss": 0.0083, "step": 31610 }, { "grad_norm": 0.2247954159975052, "learning_rate": 4.9672064170662214e-05, "loss": 0.0069, "step": 31620 }, { "grad_norm": 0.2249128520488739, "learning_rate": 4.9644506984302583e-05, "loss": 0.0069, "step": 31630 }, { "grad_norm": 0.2463538944721222, "learning_rate": 4.9616949905932356e-05, "loss": 0.0088, "step": 31640 }, { "grad_norm": 0.2651323676109314, "learning_rate": 4.9589392943922615e-05, "loss": 0.0077, "step": 31650 }, { "grad_norm": 0.22297944128513336, "learning_rate": 4.956183610664447e-05, "loss": 0.0083, "step": 31660 }, { "grad_norm": 0.22701004147529602, "learning_rate": 4.9534279402468945e-05, "loss": 0.0083, "step": 31670 }, { "grad_norm": 0.15203142166137695, "learning_rate": 4.9506722839767036e-05, "loss": 0.0061, "step": 31680 }, { "grad_norm": 0.32742583751678467, "learning_rate": 4.947916642690972e-05, "loss": 0.0089, "step": 31690 }, { "grad_norm": 0.2184576690196991, "learning_rate": 4.9451610172267874e-05, "loss": 0.0068, "step": 31700 }, { "grad_norm": 0.17164179682731628, "learning_rate": 4.9424054084212376e-05, "loss": 0.0084, "step": 31710 }, { "grad_norm": 0.24547478556632996, "learning_rate": 4.939649817111407e-05, "loss": 0.009, "step": 31720 }, { "grad_norm": 0.2324201464653015, "learning_rate": 4.936894244134365e-05, "loss": 0.0075, "step": 31730 }, { "grad_norm": 0.17113351821899414, "learning_rate": 4.9341386903271886e-05, "loss": 0.007, "step": 31740 }, { "grad_norm": 0.2053913027048111, "learning_rate": 4.931383156526936e-05, "loss": 0.0071, "step": 31750 }, { "grad_norm": 0.24972592294216156, "learning_rate": 4.92862764357067e-05, "loss": 0.0081, "step": 31760 }, { "grad_norm": 0.20848095417022705, "learning_rate": 4.925872152295443e-05, "loss": 0.007, "step": 31770 }, { "grad_norm": 0.2362770438194275, "learning_rate": 4.923116683538296e-05, "loss": 0.0112, "step": 31780 }, { "grad_norm": 0.18292094767093658, "learning_rate": 4.920361238136273e-05, "loss": 0.0067, "step": 31790 }, { "grad_norm": 0.2739775776863098, "learning_rate": 4.9176058169264014e-05, "loss": 0.0095, "step": 31800 }, { "grad_norm": 0.23343752324581146, "learning_rate": 4.9148504207457074e-05, "loss": 0.0067, "step": 31810 }, { "grad_norm": 0.2778758108615875, "learning_rate": 4.912095050431208e-05, "loss": 0.0075, "step": 31820 }, { "grad_norm": 0.1821853071451187, "learning_rate": 4.909339706819911e-05, "loss": 0.0065, "step": 31830 }, { "grad_norm": 0.23555830121040344, "learning_rate": 4.906584390748819e-05, "loss": 0.0073, "step": 31840 }, { "grad_norm": 0.24512019753456116, "learning_rate": 4.9038291030549195e-05, "loss": 0.0063, "step": 31850 }, { "grad_norm": 0.20886830985546112, "learning_rate": 4.9010738445751995e-05, "loss": 0.0085, "step": 31860 }, { "grad_norm": 0.2214621752500534, "learning_rate": 4.8983186161466364e-05, "loss": 0.0069, "step": 31870 }, { "grad_norm": 0.1699652373790741, "learning_rate": 4.89556341860619e-05, "loss": 0.0056, "step": 31880 }, { "grad_norm": 0.1831630915403366, "learning_rate": 4.892808252790822e-05, "loss": 0.007, "step": 31890 }, { "grad_norm": 0.3026648759841919, "learning_rate": 4.890053119537475e-05, "loss": 0.0093, "step": 31900 }, { "grad_norm": 0.2638152539730072, "learning_rate": 4.887298019683087e-05, "loss": 0.0066, "step": 31910 }, { "grad_norm": 0.2961420714855194, "learning_rate": 4.884542954064587e-05, "loss": 0.0066, "step": 31920 }, { "grad_norm": 0.22466962039470673, "learning_rate": 4.881787923518887e-05, "loss": 0.0073, "step": 31930 }, { "grad_norm": 0.2759089767932892, "learning_rate": 4.879032928882896e-05, "loss": 0.0088, "step": 31940 }, { "grad_norm": 0.2509841322898865, "learning_rate": 4.876277970993505e-05, "loss": 0.0067, "step": 31950 }, { "grad_norm": 0.30766138434410095, "learning_rate": 4.873523050687602e-05, "loss": 0.0103, "step": 31960 }, { "grad_norm": 0.21319961547851562, "learning_rate": 4.870768168802056e-05, "loss": 0.0061, "step": 31970 }, { "grad_norm": 0.22675667703151703, "learning_rate": 4.868013326173728e-05, "loss": 0.0089, "step": 31980 }, { "grad_norm": 0.17030957341194153, "learning_rate": 4.865258523639468e-05, "loss": 0.0055, "step": 31990 }, { "grad_norm": 0.23386850953102112, "learning_rate": 4.862503762036109e-05, "loss": 0.0082, "step": 32000 }, { "grad_norm": 0.2531217336654663, "learning_rate": 4.859749042200478e-05, "loss": 0.0063, "step": 32010 }, { "grad_norm": 0.2289196252822876, "learning_rate": 4.856994364969384e-05, "loss": 0.0064, "step": 32020 }, { "grad_norm": 0.22871866822242737, "learning_rate": 4.854239731179625e-05, "loss": 0.0078, "step": 32030 }, { "grad_norm": 0.26219645142555237, "learning_rate": 4.85148514166799e-05, "loss": 0.0073, "step": 32040 }, { "grad_norm": 0.22148561477661133, "learning_rate": 4.8487305972712456e-05, "loss": 0.006, "step": 32050 }, { "grad_norm": 0.24628743529319763, "learning_rate": 4.8459760988261526e-05, "loss": 0.0079, "step": 32060 }, { "grad_norm": 0.20057158172130585, "learning_rate": 4.843221647169453e-05, "loss": 0.0057, "step": 32070 }, { "grad_norm": 0.18639099597930908, "learning_rate": 4.840467243137878e-05, "loss": 0.008, "step": 32080 }, { "grad_norm": 0.18927448987960815, "learning_rate": 4.837712887568143e-05, "loss": 0.0068, "step": 32090 }, { "grad_norm": 0.2605811059474945, "learning_rate": 4.8349585812969464e-05, "loss": 0.0059, "step": 32100 }, { "grad_norm": 0.2559966444969177, "learning_rate": 4.8322043251609775e-05, "loss": 0.0074, "step": 32110 }, { "grad_norm": 0.38517898321151733, "learning_rate": 4.8294501199969015e-05, "loss": 0.0098, "step": 32120 }, { "grad_norm": 0.16990350186824799, "learning_rate": 4.826695966641376e-05, "loss": 0.0053, "step": 32130 }, { "grad_norm": 0.28396254777908325, "learning_rate": 4.823941865931043e-05, "loss": 0.0067, "step": 32140 }, { "grad_norm": 0.2862111032009125, "learning_rate": 4.82118781870252e-05, "loss": 0.0065, "step": 32150 }, { "grad_norm": 0.24131996929645538, "learning_rate": 4.8184338257924185e-05, "loss": 0.0088, "step": 32160 }, { "grad_norm": 0.26497340202331543, "learning_rate": 4.815679888037324e-05, "loss": 0.0105, "step": 32170 }, { "grad_norm": 0.2897150218486786, "learning_rate": 4.8129260062738135e-05, "loss": 0.0089, "step": 32180 }, { "grad_norm": 0.24623708426952362, "learning_rate": 4.810172181338445e-05, "loss": 0.0062, "step": 32190 }, { "grad_norm": 0.20998826622962952, "learning_rate": 4.807418414067753e-05, "loss": 0.0054, "step": 32200 }, { "grad_norm": 0.23517116904258728, "learning_rate": 4.804664705298264e-05, "loss": 0.0058, "step": 32210 }, { "grad_norm": 0.13893675804138184, "learning_rate": 4.80191105586648e-05, "loss": 0.0064, "step": 32220 }, { "grad_norm": 0.22650250792503357, "learning_rate": 4.799157466608886e-05, "loss": 0.0074, "step": 32230 }, { "grad_norm": 0.22311876714229584, "learning_rate": 4.796403938361951e-05, "loss": 0.0068, "step": 32240 }, { "grad_norm": 0.19218571484088898, "learning_rate": 4.793650471962123e-05, "loss": 0.0092, "step": 32250 }, { "grad_norm": 0.2698209285736084, "learning_rate": 4.790897068245835e-05, "loss": 0.0055, "step": 32260 }, { "grad_norm": 0.1621396243572235, "learning_rate": 4.7881437280494954e-05, "loss": 0.006, "step": 32270 }, { "grad_norm": 0.21912983059883118, "learning_rate": 4.7853904522094965e-05, "loss": 0.0073, "step": 32280 }, { "grad_norm": 0.2590363025665283, "learning_rate": 4.782637241562215e-05, "loss": 0.0087, "step": 32290 }, { "grad_norm": 0.1961211860179901, "learning_rate": 4.779884096943997e-05, "loss": 0.0056, "step": 32300 }, { "grad_norm": 0.2392387092113495, "learning_rate": 4.777131019191182e-05, "loss": 0.0072, "step": 32310 }, { "grad_norm": 0.23692595958709717, "learning_rate": 4.774378009140076e-05, "loss": 0.0061, "step": 32320 }, { "grad_norm": 0.13764940202236176, "learning_rate": 4.7716250676269735e-05, "loss": 0.0063, "step": 32330 }, { "grad_norm": 0.24975548684597015, "learning_rate": 4.7688721954881485e-05, "loss": 0.007, "step": 32340 }, { "grad_norm": 0.23488099873065948, "learning_rate": 4.7661193935598446e-05, "loss": 0.0061, "step": 32350 }, { "grad_norm": 0.2668916583061218, "learning_rate": 4.763366662678296e-05, "loss": 0.0077, "step": 32360 }, { "grad_norm": 0.2959752082824707, "learning_rate": 4.7606140036797064e-05, "loss": 0.0087, "step": 32370 }, { "grad_norm": 0.268684983253479, "learning_rate": 4.7578614174002614e-05, "loss": 0.0061, "step": 32380 }, { "grad_norm": 0.3234284222126007, "learning_rate": 4.755108904676125e-05, "loss": 0.0093, "step": 32390 }, { "grad_norm": 0.24526089429855347, "learning_rate": 4.752356466343436e-05, "loss": 0.0087, "step": 32400 }, { "grad_norm": 0.21310359239578247, "learning_rate": 4.7496041032383174e-05, "loss": 0.0068, "step": 32410 }, { "grad_norm": 0.2801964282989502, "learning_rate": 4.746851816196858e-05, "loss": 0.0075, "step": 32420 }, { "grad_norm": 0.17972113192081451, "learning_rate": 4.744099606055135e-05, "loss": 0.0079, "step": 32430 }, { "grad_norm": 0.30641815066337585, "learning_rate": 4.741347473649193e-05, "loss": 0.0086, "step": 32440 }, { "grad_norm": 0.2734004557132721, "learning_rate": 4.738595419815058e-05, "loss": 0.0067, "step": 32450 }, { "grad_norm": 0.2160828411579132, "learning_rate": 4.7358434453887365e-05, "loss": 0.0081, "step": 32460 }, { "grad_norm": 0.18313400447368622, "learning_rate": 4.7330915512061976e-05, "loss": 0.006, "step": 32470 }, { "grad_norm": 0.22924011945724487, "learning_rate": 4.730339738103402e-05, "loss": 0.0048, "step": 32480 }, { "grad_norm": 0.3271869719028473, "learning_rate": 4.727588006916271e-05, "loss": 0.0099, "step": 32490 }, { "grad_norm": 0.24933359026908875, "learning_rate": 4.724836358480711e-05, "loss": 0.0067, "step": 32500 }, { "grad_norm": 0.3346271514892578, "learning_rate": 4.722084793632601e-05, "loss": 0.0084, "step": 32510 }, { "grad_norm": 0.22423043847084045, "learning_rate": 4.719333313207792e-05, "loss": 0.0078, "step": 32520 }, { "grad_norm": 0.2899785041809082, "learning_rate": 4.716581918042114e-05, "loss": 0.0081, "step": 32530 }, { "grad_norm": 0.26931262016296387, "learning_rate": 4.7138306089713636e-05, "loss": 0.0087, "step": 32540 }, { "grad_norm": 0.22534888982772827, "learning_rate": 4.7110793868313183e-05, "loss": 0.0072, "step": 32550 }, { "grad_norm": 0.21676774322986603, "learning_rate": 4.708328252457729e-05, "loss": 0.0079, "step": 32560 }, { "grad_norm": 0.30667024850845337, "learning_rate": 4.7055772066863135e-05, "loss": 0.009, "step": 32570 }, { "grad_norm": 0.2854798436164856, "learning_rate": 4.702826250352771e-05, "loss": 0.0122, "step": 32580 }, { "grad_norm": 0.288504958152771, "learning_rate": 4.7000753842927653e-05, "loss": 0.0068, "step": 32590 }, { "grad_norm": 0.25443020462989807, "learning_rate": 4.6973246093419384e-05, "loss": 0.0087, "step": 32600 }, { "grad_norm": 0.18471738696098328, "learning_rate": 4.694573926335906e-05, "loss": 0.0063, "step": 32610 }, { "grad_norm": 0.20200151205062866, "learning_rate": 4.6918233361102476e-05, "loss": 0.0074, "step": 32620 }, { "grad_norm": 0.20897750556468964, "learning_rate": 4.689072839500525e-05, "loss": 0.0062, "step": 32630 }, { "grad_norm": 0.2191777229309082, "learning_rate": 4.6863224373422635e-05, "loss": 0.0079, "step": 32640 }, { "grad_norm": 0.24972093105316162, "learning_rate": 4.683572130470962e-05, "loss": 0.009, "step": 32650 }, { "grad_norm": 0.20447924733161926, "learning_rate": 4.680821919722094e-05, "loss": 0.0062, "step": 32660 }, { "grad_norm": 0.2813320457935333, "learning_rate": 4.6780718059310975e-05, "loss": 0.0072, "step": 32670 }, { "grad_norm": 0.27030473947525024, "learning_rate": 4.675321789933389e-05, "loss": 0.0064, "step": 32680 }, { "grad_norm": 0.11901049315929413, "learning_rate": 4.6725718725643464e-05, "loss": 0.0056, "step": 32690 }, { "grad_norm": 0.18552948534488678, "learning_rate": 4.669822054659323e-05, "loss": 0.0055, "step": 32700 }, { "grad_norm": 0.3296489119529724, "learning_rate": 4.667072337053644e-05, "loss": 0.009, "step": 32710 }, { "grad_norm": 0.2645881474018097, "learning_rate": 4.6643227205825965e-05, "loss": 0.0064, "step": 32720 }, { "grad_norm": 0.2363099753856659, "learning_rate": 4.6615732060814454e-05, "loss": 0.0059, "step": 32730 }, { "grad_norm": 0.2088632732629776, "learning_rate": 4.658823794385417e-05, "loss": 0.0076, "step": 32740 }, { "grad_norm": 0.1993638426065445, "learning_rate": 4.6560744863297115e-05, "loss": 0.0064, "step": 32750 }, { "grad_norm": 0.19181689620018005, "learning_rate": 4.653325282749498e-05, "loss": 0.0061, "step": 32760 }, { "grad_norm": 0.2350449115037918, "learning_rate": 4.6505761844799075e-05, "loss": 0.009, "step": 32770 }, { "grad_norm": 0.23427946865558624, "learning_rate": 4.647827192356048e-05, "loss": 0.008, "step": 32780 }, { "grad_norm": 0.20481781661510468, "learning_rate": 4.645078307212989e-05, "loss": 0.0054, "step": 32790 }, { "grad_norm": 0.24765610694885254, "learning_rate": 4.642329529885768e-05, "loss": 0.0068, "step": 32800 }, { "grad_norm": 0.2681053876876831, "learning_rate": 4.639580861209393e-05, "loss": 0.0072, "step": 32810 }, { "grad_norm": 0.2844184339046478, "learning_rate": 4.636832302018835e-05, "loss": 0.0053, "step": 32820 }, { "grad_norm": 0.24991938471794128, "learning_rate": 4.6340838531490365e-05, "loss": 0.0058, "step": 32830 }, { "grad_norm": 0.20992514491081238, "learning_rate": 4.6313355154349e-05, "loss": 0.0074, "step": 32840 }, { "grad_norm": 0.18317939341068268, "learning_rate": 4.6285872897113025e-05, "loss": 0.0096, "step": 32850 }, { "grad_norm": 0.17887622117996216, "learning_rate": 4.625839176813077e-05, "loss": 0.0059, "step": 32860 }, { "grad_norm": 0.2228119671344757, "learning_rate": 4.623091177575031e-05, "loss": 0.0059, "step": 32870 }, { "grad_norm": 0.19908063113689423, "learning_rate": 4.620343292831936e-05, "loss": 0.0066, "step": 32880 }, { "grad_norm": 0.28317978978157043, "learning_rate": 4.6175955234185206e-05, "loss": 0.0074, "step": 32890 }, { "grad_norm": 0.29716062545776367, "learning_rate": 4.614847870169492e-05, "loss": 0.007, "step": 32900 }, { "grad_norm": 0.24557605385780334, "learning_rate": 4.612100333919509e-05, "loss": 0.0062, "step": 32910 }, { "grad_norm": 0.2536318004131317, "learning_rate": 4.609352915503202e-05, "loss": 0.0054, "step": 32920 }, { "grad_norm": 0.27791568636894226, "learning_rate": 4.606605615755166e-05, "loss": 0.0058, "step": 32930 }, { "grad_norm": 0.16657011210918427, "learning_rate": 4.6038584355099576e-05, "loss": 0.0065, "step": 32940 }, { "grad_norm": 0.19968189299106598, "learning_rate": 4.6011113756020964e-05, "loss": 0.0086, "step": 32950 }, { "grad_norm": 0.2506800889968872, "learning_rate": 4.598364436866066e-05, "loss": 0.0071, "step": 32960 }, { "grad_norm": 0.33925187587738037, "learning_rate": 4.595617620136316e-05, "loss": 0.0091, "step": 32970 }, { "grad_norm": 0.24584905803203583, "learning_rate": 4.592870926247257e-05, "loss": 0.0085, "step": 32980 }, { "grad_norm": 0.22604088485240936, "learning_rate": 4.5901243560332594e-05, "loss": 0.0053, "step": 32990 }, { "grad_norm": 0.26294711232185364, "learning_rate": 4.587377910328662e-05, "loss": 0.0061, "step": 33000 }, { "grad_norm": 0.14909221231937408, "learning_rate": 4.5846315899677586e-05, "loss": 0.0078, "step": 33010 }, { "grad_norm": 0.21891048550605774, "learning_rate": 4.5818853957848114e-05, "loss": 0.0054, "step": 33020 }, { "grad_norm": 0.1842089593410492, "learning_rate": 4.579139328614043e-05, "loss": 0.0059, "step": 33030 }, { "grad_norm": 0.227177232503891, "learning_rate": 4.576393389289633e-05, "loss": 0.0064, "step": 33040 }, { "grad_norm": 0.22151440382003784, "learning_rate": 4.573647578645728e-05, "loss": 0.0067, "step": 33050 }, { "grad_norm": 0.25708913803100586, "learning_rate": 4.57090189751643e-05, "loss": 0.0064, "step": 33060 }, { "grad_norm": 0.22885015606880188, "learning_rate": 4.568156346735806e-05, "loss": 0.0061, "step": 33070 }, { "grad_norm": 0.1635526418685913, "learning_rate": 4.565410927137882e-05, "loss": 0.0051, "step": 33080 }, { "grad_norm": 0.16752150654792786, "learning_rate": 4.562665639556644e-05, "loss": 0.0056, "step": 33090 }, { "grad_norm": 0.2851332426071167, "learning_rate": 4.559920484826037e-05, "loss": 0.0066, "step": 33100 }, { "grad_norm": 0.23949535191059113, "learning_rate": 4.5571754637799665e-05, "loss": 0.0068, "step": 33110 }, { "grad_norm": 0.1821199208498001, "learning_rate": 4.554430577252298e-05, "loss": 0.0061, "step": 33120 }, { "grad_norm": 0.24349737167358398, "learning_rate": 4.551685826076858e-05, "loss": 0.0056, "step": 33130 }, { "grad_norm": 0.2185320407152176, "learning_rate": 4.5489412110874246e-05, "loss": 0.0069, "step": 33140 }, { "grad_norm": 0.28381046652793884, "learning_rate": 4.5461967331177444e-05, "loss": 0.0095, "step": 33150 }, { "grad_norm": 0.2769639194011688, "learning_rate": 4.5434523930015115e-05, "loss": 0.0059, "step": 33160 }, { "grad_norm": 0.1941869705915451, "learning_rate": 4.540708191572388e-05, "loss": 0.0076, "step": 33170 }, { "grad_norm": 0.21477438509464264, "learning_rate": 4.537964129663991e-05, "loss": 0.007, "step": 33180 }, { "grad_norm": 0.24743351340293884, "learning_rate": 4.535220208109889e-05, "loss": 0.0089, "step": 33190 }, { "grad_norm": 0.16789889335632324, "learning_rate": 4.5324764277436194e-05, "loss": 0.0085, "step": 33200 }, { "grad_norm": 0.21021337807178497, "learning_rate": 4.529732789398664e-05, "loss": 0.0069, "step": 33210 }, { "grad_norm": 0.18843376636505127, "learning_rate": 4.526989293908472e-05, "loss": 0.0067, "step": 33220 }, { "grad_norm": 0.1962655931711197, "learning_rate": 4.524245942106442e-05, "loss": 0.0066, "step": 33230 }, { "grad_norm": 0.20544129610061646, "learning_rate": 4.5215027348259345e-05, "loss": 0.0064, "step": 33240 }, { "grad_norm": 0.1872073858976364, "learning_rate": 4.5187596729002616e-05, "loss": 0.0059, "step": 33250 }, { "grad_norm": 0.18753452599048615, "learning_rate": 4.516016757162693e-05, "loss": 0.0068, "step": 33260 }, { "grad_norm": 0.19614441692829132, "learning_rate": 4.513273988446457e-05, "loss": 0.0089, "step": 33270 }, { "grad_norm": 0.2291834056377411, "learning_rate": 4.5105313675847296e-05, "loss": 0.0082, "step": 33280 }, { "grad_norm": 0.26047879457473755, "learning_rate": 4.5077888954106495e-05, "loss": 0.0055, "step": 33290 }, { "grad_norm": 0.2506044805049896, "learning_rate": 4.505046572757309e-05, "loss": 0.0075, "step": 33300 }, { "grad_norm": 0.17611005902290344, "learning_rate": 4.502304400457749e-05, "loss": 0.0054, "step": 33310 }, { "grad_norm": 0.31874096393585205, "learning_rate": 4.499562379344973e-05, "loss": 0.0073, "step": 33320 }, { "grad_norm": 0.2621985971927643, "learning_rate": 4.4968205102519306e-05, "loss": 0.0084, "step": 33330 }, { "grad_norm": 0.2626568675041199, "learning_rate": 4.494078794011532e-05, "loss": 0.0063, "step": 33340 }, { "grad_norm": 0.21751157939434052, "learning_rate": 4.491337231456639e-05, "loss": 0.008, "step": 33350 }, { "grad_norm": 0.1921292543411255, "learning_rate": 4.4885958234200634e-05, "loss": 0.0089, "step": 33360 }, { "grad_norm": 0.32345283031463623, "learning_rate": 4.485854570734575e-05, "loss": 0.0134, "step": 33370 }, { "grad_norm": 0.29324713349342346, "learning_rate": 4.483113474232891e-05, "loss": 0.0089, "step": 33380 }, { "grad_norm": 0.2626071572303772, "learning_rate": 4.480372534747688e-05, "loss": 0.0091, "step": 33390 }, { "grad_norm": 0.15506887435913086, "learning_rate": 4.477631753111588e-05, "loss": 0.0076, "step": 33400 }, { "grad_norm": 0.18630781769752502, "learning_rate": 4.4748911301571686e-05, "loss": 0.0058, "step": 33410 }, { "grad_norm": 0.2392105907201767, "learning_rate": 4.472150666716961e-05, "loss": 0.0074, "step": 33420 }, { "grad_norm": 0.20797984302043915, "learning_rate": 4.469410363623442e-05, "loss": 0.0074, "step": 33430 }, { "grad_norm": 0.2930055856704712, "learning_rate": 4.466670221709044e-05, "loss": 0.0064, "step": 33440 }, { "grad_norm": 0.22188888490200043, "learning_rate": 4.463930241806154e-05, "loss": 0.0055, "step": 33450 }, { "grad_norm": 0.279517263174057, "learning_rate": 4.4611904247471006e-05, "loss": 0.0075, "step": 33460 }, { "grad_norm": 0.21537578105926514, "learning_rate": 4.458450771364171e-05, "loss": 0.0052, "step": 33470 }, { "grad_norm": 0.1981678456068039, "learning_rate": 4.4557112824895965e-05, "loss": 0.0067, "step": 33480 }, { "grad_norm": 0.25552329421043396, "learning_rate": 4.452971958955563e-05, "loss": 0.0076, "step": 33490 }, { "grad_norm": 0.23445437848567963, "learning_rate": 4.450232801594208e-05, "loss": 0.0084, "step": 33500 }, { "grad_norm": 0.23738905787467957, "learning_rate": 4.447493811237609e-05, "loss": 0.0074, "step": 33510 }, { "grad_norm": 0.2786206901073456, "learning_rate": 4.444754988717804e-05, "loss": 0.006, "step": 33520 }, { "grad_norm": 0.2710059881210327, "learning_rate": 4.442016334866771e-05, "loss": 0.0075, "step": 33530 }, { "grad_norm": 0.20788449048995972, "learning_rate": 4.4392778505164445e-05, "loss": 0.0068, "step": 33540 }, { "grad_norm": 0.24663987755775452, "learning_rate": 4.436539536498702e-05, "loss": 0.0072, "step": 33550 }, { "grad_norm": 0.38234665989875793, "learning_rate": 4.433801393645369e-05, "loss": 0.0064, "step": 33560 }, { "grad_norm": 0.18163244426250458, "learning_rate": 4.431063422788226e-05, "loss": 0.0054, "step": 33570 }, { "grad_norm": 0.20825371146202087, "learning_rate": 4.428325624758991e-05, "loss": 0.0078, "step": 33580 }, { "grad_norm": 0.2456311136484146, "learning_rate": 4.4255880003893366e-05, "loss": 0.0075, "step": 33590 }, { "grad_norm": 0.2841171324253082, "learning_rate": 4.422850550510884e-05, "loss": 0.0089, "step": 33600 }, { "grad_norm": 0.18170909583568573, "learning_rate": 4.4201132759551934e-05, "loss": 0.0078, "step": 33610 }, { "grad_norm": 0.20780561864376068, "learning_rate": 4.4173761775537804e-05, "loss": 0.0108, "step": 33620 }, { "grad_norm": 0.15301473438739777, "learning_rate": 4.414639256138099e-05, "loss": 0.0063, "step": 33630 }, { "grad_norm": 0.16479627788066864, "learning_rate": 4.411902512539557e-05, "loss": 0.0092, "step": 33640 }, { "grad_norm": 0.24109962582588196, "learning_rate": 4.4091659475895044e-05, "loss": 0.0081, "step": 33650 }, { "grad_norm": 0.26153942942619324, "learning_rate": 4.406429562119235e-05, "loss": 0.0066, "step": 33660 }, { "grad_norm": 0.14684417843818665, "learning_rate": 4.4036933569599945e-05, "loss": 0.0064, "step": 33670 }, { "grad_norm": 0.1940925121307373, "learning_rate": 4.400957332942965e-05, "loss": 0.0054, "step": 33680 }, { "grad_norm": 0.1748809516429901, "learning_rate": 4.3982214908992844e-05, "loss": 0.006, "step": 33690 }, { "grad_norm": 0.23238727450370789, "learning_rate": 4.3954858316600235e-05, "loss": 0.0062, "step": 33700 }, { "grad_norm": 0.27522721886634827, "learning_rate": 4.392750356056205e-05, "loss": 0.006, "step": 33710 }, { "grad_norm": 0.17680269479751587, "learning_rate": 4.390015064918798e-05, "loss": 0.0055, "step": 33720 }, { "grad_norm": 0.21424515545368195, "learning_rate": 4.387279959078705e-05, "loss": 0.0049, "step": 33730 }, { "grad_norm": 0.1855633407831192, "learning_rate": 4.384545039366786e-05, "loss": 0.0091, "step": 33740 }, { "grad_norm": 0.20762841403484344, "learning_rate": 4.381810306613831e-05, "loss": 0.005, "step": 33750 }, { "grad_norm": 0.16944566369056702, "learning_rate": 4.3790757616505826e-05, "loss": 0.0067, "step": 33760 }, { "grad_norm": 0.20522117614746094, "learning_rate": 4.376341405307725e-05, "loss": 0.0057, "step": 33770 }, { "grad_norm": 0.19686530530452728, "learning_rate": 4.37360723841588e-05, "loss": 0.0069, "step": 33780 }, { "grad_norm": 0.19774030148983002, "learning_rate": 4.370873261805619e-05, "loss": 0.0055, "step": 33790 }, { "grad_norm": 0.30278995633125305, "learning_rate": 4.368139476307449e-05, "loss": 0.011, "step": 33800 }, { "grad_norm": 0.24399347603321075, "learning_rate": 4.365405882751822e-05, "loss": 0.0057, "step": 33810 }, { "grad_norm": 0.2786239981651306, "learning_rate": 4.3626724819691326e-05, "loss": 0.0072, "step": 33820 }, { "grad_norm": 0.22116117179393768, "learning_rate": 4.359939274789715e-05, "loss": 0.0054, "step": 33830 }, { "grad_norm": 0.24048860371112823, "learning_rate": 4.357206262043848e-05, "loss": 0.0071, "step": 33840 }, { "grad_norm": 0.1859717071056366, "learning_rate": 4.354473444561745e-05, "loss": 0.0062, "step": 33850 }, { "grad_norm": 0.22413165867328644, "learning_rate": 4.3517408231735644e-05, "loss": 0.0065, "step": 33860 }, { "grad_norm": 0.20817112922668457, "learning_rate": 4.3490083987094086e-05, "loss": 0.0053, "step": 33870 }, { "grad_norm": 0.24079784750938416, "learning_rate": 4.34627617199931e-05, "loss": 0.0051, "step": 33880 }, { "grad_norm": 0.2651442587375641, "learning_rate": 4.3435441438732526e-05, "loss": 0.0048, "step": 33890 }, { "grad_norm": 0.2585642337799072, "learning_rate": 4.340812315161149e-05, "loss": 0.0072, "step": 33900 }, { "grad_norm": 0.23831868171691895, "learning_rate": 4.338080686692859e-05, "loss": 0.0068, "step": 33910 }, { "grad_norm": 0.23950357735157013, "learning_rate": 4.3353492592981816e-05, "loss": 0.0056, "step": 33920 }, { "grad_norm": 0.1677539348602295, "learning_rate": 4.3326180338068485e-05, "loss": 0.0048, "step": 33930 }, { "grad_norm": 0.31859931349754333, "learning_rate": 4.3298870110485356e-05, "loss": 0.0058, "step": 33940 }, { "grad_norm": 0.2327817678451538, "learning_rate": 4.3271561918528567e-05, "loss": 0.0086, "step": 33950 }, { "grad_norm": 0.17561651766300201, "learning_rate": 4.324425577049359e-05, "loss": 0.007, "step": 33960 }, { "grad_norm": 0.2269870489835739, "learning_rate": 4.321695167467535e-05, "loss": 0.0107, "step": 33970 }, { "grad_norm": 0.2643167972564697, "learning_rate": 4.3189649639368093e-05, "loss": 0.0073, "step": 33980 }, { "grad_norm": 0.15537384152412415, "learning_rate": 4.316234967286547e-05, "loss": 0.0055, "step": 33990 }, { "grad_norm": 0.24303454160690308, "learning_rate": 4.313505178346046e-05, "loss": 0.0099, "step": 34000 }, { "grad_norm": 0.2226843237876892, "learning_rate": 4.3107755979445465e-05, "loss": 0.0069, "step": 34010 }, { "grad_norm": 0.22701045870780945, "learning_rate": 4.308046226911224e-05, "loss": 0.0077, "step": 34020 }, { "grad_norm": 0.2765866816043854, "learning_rate": 4.305317066075185e-05, "loss": 0.0087, "step": 34030 }, { "grad_norm": 0.21416576206684113, "learning_rate": 4.302588116265482e-05, "loss": 0.0079, "step": 34040 }, { "grad_norm": 0.2295892834663391, "learning_rate": 4.299859378311094e-05, "loss": 0.0061, "step": 34050 }, { "grad_norm": 0.22910450398921967, "learning_rate": 4.2971308530409424e-05, "loss": 0.0065, "step": 34060 }, { "grad_norm": 0.27474451065063477, "learning_rate": 4.2944025412838765e-05, "loss": 0.0094, "step": 34070 }, { "grad_norm": 0.20846879482269287, "learning_rate": 4.291674443868689e-05, "loss": 0.0064, "step": 34080 }, { "grad_norm": 0.2005152404308319, "learning_rate": 4.288946561624104e-05, "loss": 0.0054, "step": 34090 }, { "grad_norm": 0.15358655154705048, "learning_rate": 4.2862188953787794e-05, "loss": 0.0053, "step": 34100 }, { "grad_norm": 0.2720765173435211, "learning_rate": 4.283491445961308e-05, "loss": 0.008, "step": 34110 }, { "grad_norm": 0.19924184679985046, "learning_rate": 4.2807642142002155e-05, "loss": 0.0074, "step": 34120 }, { "grad_norm": 0.1904723048210144, "learning_rate": 4.278037200923966e-05, "loss": 0.0046, "step": 34130 }, { "grad_norm": 0.15538300573825836, "learning_rate": 4.275310406960953e-05, "loss": 0.006, "step": 34140 }, { "grad_norm": 0.19680018723011017, "learning_rate": 4.272583833139502e-05, "loss": 0.008, "step": 34150 }, { "grad_norm": 0.2198832929134369, "learning_rate": 4.2698574802878794e-05, "loss": 0.0055, "step": 34160 }, { "grad_norm": 0.21210193634033203, "learning_rate": 4.2671313492342734e-05, "loss": 0.0073, "step": 34170 }, { "grad_norm": 0.22931769490242004, "learning_rate": 4.264405440806813e-05, "loss": 0.0056, "step": 34180 }, { "grad_norm": 0.23736603558063507, "learning_rate": 4.26167975583356e-05, "loss": 0.0068, "step": 34190 }, { "grad_norm": 0.23222510516643524, "learning_rate": 4.2589542951425e-05, "loss": 0.007, "step": 34200 }, { "grad_norm": 0.27665337920188904, "learning_rate": 4.2562290595615615e-05, "loss": 0.0081, "step": 34210 }, { "grad_norm": 0.2881568372249603, "learning_rate": 4.2535040499185946e-05, "loss": 0.0069, "step": 34220 }, { "grad_norm": 0.18981459736824036, "learning_rate": 4.250779267041387e-05, "loss": 0.0051, "step": 34230 }, { "grad_norm": 0.2082643359899521, "learning_rate": 4.248054711757657e-05, "loss": 0.0059, "step": 34240 }, { "grad_norm": 0.24757909774780273, "learning_rate": 4.245330384895052e-05, "loss": 0.0071, "step": 34250 }, { "grad_norm": 0.1938125640153885, "learning_rate": 4.242606287281151e-05, "loss": 0.0056, "step": 34260 }, { "grad_norm": 0.24106638133525848, "learning_rate": 4.2398824197434595e-05, "loss": 0.0071, "step": 34270 }, { "grad_norm": 0.20034322142601013, "learning_rate": 4.23715878310942e-05, "loss": 0.006, "step": 34280 }, { "grad_norm": 0.23234619200229645, "learning_rate": 4.234435378206402e-05, "loss": 0.0065, "step": 34290 }, { "grad_norm": 0.18687517940998077, "learning_rate": 4.2317122058617006e-05, "loss": 0.0056, "step": 34300 }, { "grad_norm": 0.19216594099998474, "learning_rate": 4.2289892669025485e-05, "loss": 0.0073, "step": 34310 }, { "grad_norm": 0.194553405046463, "learning_rate": 4.226266562156097e-05, "loss": 0.0067, "step": 34320 }, { "grad_norm": 0.21808727085590363, "learning_rate": 4.223544092449435e-05, "loss": 0.0067, "step": 34330 }, { "grad_norm": 0.2061053067445755, "learning_rate": 4.2208218586095784e-05, "loss": 0.0058, "step": 34340 }, { "grad_norm": 0.21847741305828094, "learning_rate": 4.218099861463466e-05, "loss": 0.0071, "step": 34350 }, { "grad_norm": 0.26483115553855896, "learning_rate": 4.215378101837972e-05, "loss": 0.0059, "step": 34360 }, { "grad_norm": 0.23244349658489227, "learning_rate": 4.2126565805598937e-05, "loss": 0.0055, "step": 34370 }, { "grad_norm": 0.21753989160060883, "learning_rate": 4.209935298455957e-05, "loss": 0.0077, "step": 34380 }, { "grad_norm": 0.24541738629341125, "learning_rate": 4.207214256352817e-05, "loss": 0.0052, "step": 34390 }, { "grad_norm": 0.3085399866104126, "learning_rate": 4.2044934550770524e-05, "loss": 0.0064, "step": 34400 }, { "grad_norm": 0.2537444829940796, "learning_rate": 4.201772895455174e-05, "loss": 0.0065, "step": 34410 }, { "grad_norm": 0.2249620109796524, "learning_rate": 4.199052578313613e-05, "loss": 0.0063, "step": 34420 }, { "grad_norm": 0.26493632793426514, "learning_rate": 4.1963325044787294e-05, "loss": 0.0085, "step": 34430 }, { "grad_norm": 0.18385231494903564, "learning_rate": 4.193612674776814e-05, "loss": 0.0068, "step": 34440 }, { "grad_norm": 0.23978319764137268, "learning_rate": 4.1908930900340745e-05, "loss": 0.0073, "step": 34450 }, { "grad_norm": 0.19395625591278076, "learning_rate": 4.1881737510766536e-05, "loss": 0.0072, "step": 34460 }, { "grad_norm": 0.20736093819141388, "learning_rate": 4.185454658730609e-05, "loss": 0.0064, "step": 34470 }, { "grad_norm": 0.22516420483589172, "learning_rate": 4.1827358138219355e-05, "loss": 0.0056, "step": 34480 }, { "grad_norm": 0.274665504693985, "learning_rate": 4.1800172171765404e-05, "loss": 0.0083, "step": 34490 }, { "grad_norm": 0.1988043487071991, "learning_rate": 4.177298869620264e-05, "loss": 0.0071, "step": 34500 }, { "grad_norm": 0.37068867683410645, "learning_rate": 4.1745807719788705e-05, "loss": 0.0076, "step": 34510 }, { "grad_norm": 0.18292322754859924, "learning_rate": 4.1718629250780445e-05, "loss": 0.0072, "step": 34520 }, { "grad_norm": 0.2500819265842438, "learning_rate": 4.1691453297433956e-05, "loss": 0.0064, "step": 34530 }, { "grad_norm": 0.26304394006729126, "learning_rate": 4.166427986800457e-05, "loss": 0.0052, "step": 34540 }, { "grad_norm": 0.26817741990089417, "learning_rate": 4.163710897074688e-05, "loss": 0.0067, "step": 34550 }, { "grad_norm": 0.2203875333070755, "learning_rate": 4.1609940613914686e-05, "loss": 0.0064, "step": 34560 }, { "grad_norm": 0.2155834138393402, "learning_rate": 4.1582774805760996e-05, "loss": 0.0057, "step": 34570 }, { "grad_norm": 0.25557976961135864, "learning_rate": 4.155561155453809e-05, "loss": 0.0057, "step": 34580 }, { "grad_norm": 0.19896359741687775, "learning_rate": 4.15284508684974e-05, "loss": 0.0078, "step": 34590 }, { "grad_norm": 0.22413352131843567, "learning_rate": 4.1501292755889675e-05, "loss": 0.0072, "step": 34600 }, { "grad_norm": 0.18792198598384857, "learning_rate": 4.1474137224964833e-05, "loss": 0.0056, "step": 34610 }, { "grad_norm": 0.24979586899280548, "learning_rate": 4.144698428397197e-05, "loss": 0.0094, "step": 34620 }, { "grad_norm": 0.20292992889881134, "learning_rate": 4.1419833941159466e-05, "loss": 0.0063, "step": 34630 }, { "grad_norm": 0.21745379269123077, "learning_rate": 4.1392686204774846e-05, "loss": 0.0056, "step": 34640 }, { "grad_norm": 0.20036591589450836, "learning_rate": 4.13655410830649e-05, "loss": 0.0066, "step": 34650 }, { "grad_norm": 0.21364815533161163, "learning_rate": 4.1338398584275594e-05, "loss": 0.0059, "step": 34660 }, { "grad_norm": 0.20373791456222534, "learning_rate": 4.1311258716652104e-05, "loss": 0.0049, "step": 34670 }, { "grad_norm": 0.22007043659687042, "learning_rate": 4.128412148843881e-05, "loss": 0.0067, "step": 34680 }, { "grad_norm": 0.19879628717899323, "learning_rate": 4.125698690787926e-05, "loss": 0.0068, "step": 34690 }, { "grad_norm": 0.23853397369384766, "learning_rate": 4.1229854983216245e-05, "loss": 0.0063, "step": 34700 }, { "grad_norm": 0.20324555039405823, "learning_rate": 4.120272572269175e-05, "loss": 0.0048, "step": 34710 }, { "grad_norm": 0.23553842306137085, "learning_rate": 4.117559913454687e-05, "loss": 0.0065, "step": 34720 }, { "grad_norm": 0.24592702090740204, "learning_rate": 4.114847522702201e-05, "loss": 0.0063, "step": 34730 }, { "grad_norm": 0.2628866732120514, "learning_rate": 4.112135400835664e-05, "loss": 0.0061, "step": 34740 }, { "grad_norm": 0.17729780077934265, "learning_rate": 4.109423548678949e-05, "loss": 0.0068, "step": 34750 }, { "grad_norm": 0.26210513710975647, "learning_rate": 4.106711967055848e-05, "loss": 0.0061, "step": 34760 }, { "grad_norm": 0.23312030732631683, "learning_rate": 4.1040006567900636e-05, "loss": 0.005, "step": 34770 }, { "grad_norm": 0.2101515531539917, "learning_rate": 4.101289618705224e-05, "loss": 0.0081, "step": 34780 }, { "grad_norm": 0.19878444075584412, "learning_rate": 4.0985788536248675e-05, "loss": 0.0071, "step": 34790 }, { "grad_norm": 0.23772254586219788, "learning_rate": 4.095868362372454e-05, "loss": 0.0123, "step": 34800 }, { "grad_norm": 0.21559622883796692, "learning_rate": 4.0931581457713614e-05, "loss": 0.0052, "step": 34810 }, { "grad_norm": 0.2253088355064392, "learning_rate": 4.09044820464488e-05, "loss": 0.0063, "step": 34820 }, { "grad_norm": 0.24349753558635712, "learning_rate": 4.087738539816219e-05, "loss": 0.0046, "step": 34830 }, { "grad_norm": 0.2786179780960083, "learning_rate": 4.085029152108501e-05, "loss": 0.0068, "step": 34840 }, { "grad_norm": 0.1682848185300827, "learning_rate": 4.0823200423447714e-05, "loss": 0.0062, "step": 34850 }, { "grad_norm": 0.22498951852321625, "learning_rate": 4.079611211347981e-05, "loss": 0.0065, "step": 34860 }, { "grad_norm": 0.16856351494789124, "learning_rate": 4.076902659941002e-05, "loss": 0.0048, "step": 34870 }, { "grad_norm": 0.17777703702449799, "learning_rate": 4.074194388946624e-05, "loss": 0.005, "step": 34880 }, { "grad_norm": 0.2599576711654663, "learning_rate": 4.071486399187545e-05, "loss": 0.0056, "step": 34890 }, { "grad_norm": 0.2105918526649475, "learning_rate": 4.0687786914863836e-05, "loss": 0.0049, "step": 34900 }, { "grad_norm": 0.2822265923023224, "learning_rate": 4.0660712666656666e-05, "loss": 0.0064, "step": 34910 }, { "grad_norm": 0.2873954772949219, "learning_rate": 4.0633641255478394e-05, "loss": 0.008, "step": 34920 }, { "grad_norm": 0.22428452968597412, "learning_rate": 4.0606572689552624e-05, "loss": 0.0064, "step": 34930 }, { "grad_norm": 0.2152257263660431, "learning_rate": 4.0579506977102036e-05, "loss": 0.0064, "step": 34940 }, { "grad_norm": 0.22747565805912018, "learning_rate": 4.055244412634849e-05, "loss": 0.0053, "step": 34950 }, { "grad_norm": 0.23033103346824646, "learning_rate": 4.052538414551298e-05, "loss": 0.006, "step": 34960 }, { "grad_norm": 0.23219303786754608, "learning_rate": 4.0498327042815596e-05, "loss": 0.0065, "step": 34970 }, { "grad_norm": 0.21151939034461975, "learning_rate": 4.047127282647559e-05, "loss": 0.0087, "step": 34980 }, { "grad_norm": 0.13151350617408752, "learning_rate": 4.04442215047113e-05, "loss": 0.0044, "step": 34990 }, { "grad_norm": 0.15589280426502228, "learning_rate": 4.041717308574023e-05, "loss": 0.0063, "step": 35000 }, { "grad_norm": 0.17671816051006317, "learning_rate": 4.039012757777893e-05, "loss": 0.0048, "step": 35010 }, { "grad_norm": 0.23570609092712402, "learning_rate": 4.036308498904314e-05, "loss": 0.0053, "step": 35020 }, { "grad_norm": 0.229853555560112, "learning_rate": 4.033604532774771e-05, "loss": 0.0067, "step": 35030 }, { "grad_norm": 0.27073344588279724, "learning_rate": 4.030900860210652e-05, "loss": 0.0062, "step": 35040 }, { "grad_norm": 0.26767176389694214, "learning_rate": 4.028197482033266e-05, "loss": 0.0075, "step": 35050 }, { "grad_norm": 0.2794646620750427, "learning_rate": 4.0254943990638246e-05, "loss": 0.0078, "step": 35060 }, { "grad_norm": 0.20178599655628204, "learning_rate": 4.022791612123454e-05, "loss": 0.007, "step": 35070 }, { "grad_norm": 0.23101627826690674, "learning_rate": 4.020089122033192e-05, "loss": 0.0062, "step": 35080 }, { "grad_norm": 0.21919691562652588, "learning_rate": 4.01738692961398e-05, "loss": 0.0055, "step": 35090 }, { "grad_norm": 0.24761426448822021, "learning_rate": 4.014685035686675e-05, "loss": 0.0062, "step": 35100 }, { "grad_norm": 0.2112138569355011, "learning_rate": 4.011983441072039e-05, "loss": 0.0061, "step": 35110 }, { "grad_norm": 0.19377121329307556, "learning_rate": 4.0092821465907485e-05, "loss": 0.007, "step": 35120 }, { "grad_norm": 0.18435238301753998, "learning_rate": 4.006581153063383e-05, "loss": 0.0076, "step": 35130 }, { "grad_norm": 0.2108333855867386, "learning_rate": 4.003880461310432e-05, "loss": 0.0075, "step": 35140 }, { "grad_norm": 0.20133459568023682, "learning_rate": 4.001180072152298e-05, "loss": 0.0083, "step": 35150 }, { "grad_norm": 0.2420373260974884, "learning_rate": 3.998479986409285e-05, "loss": 0.0054, "step": 35160 }, { "grad_norm": 0.4009375274181366, "learning_rate": 3.995780204901607e-05, "loss": 0.008, "step": 35170 }, { "grad_norm": 0.2990924119949341, "learning_rate": 3.993080728449391e-05, "loss": 0.0055, "step": 35180 }, { "grad_norm": 0.23826587200164795, "learning_rate": 3.990381557872661e-05, "loss": 0.0067, "step": 35190 }, { "grad_norm": 0.16642212867736816, "learning_rate": 3.987682693991359e-05, "loss": 0.0061, "step": 35200 }, { "grad_norm": 0.24827103316783905, "learning_rate": 3.9849841376253226e-05, "loss": 0.0061, "step": 35210 }, { "grad_norm": 0.20943792164325714, "learning_rate": 3.982285889594306e-05, "loss": 0.0056, "step": 35220 }, { "grad_norm": 0.15458567440509796, "learning_rate": 3.9795879507179665e-05, "loss": 0.0044, "step": 35230 }, { "grad_norm": 0.15484702587127686, "learning_rate": 3.9768903218158634e-05, "loss": 0.0044, "step": 35240 }, { "grad_norm": 0.29244929552078247, "learning_rate": 3.974193003707468e-05, "loss": 0.0076, "step": 35250 }, { "grad_norm": 0.28837576508522034, "learning_rate": 3.971495997212152e-05, "loss": 0.0086, "step": 35260 }, { "grad_norm": 0.2749343514442444, "learning_rate": 3.9687993031491985e-05, "loss": 0.0075, "step": 35270 }, { "grad_norm": 0.27931755781173706, "learning_rate": 3.966102922337787e-05, "loss": 0.0071, "step": 35280 }, { "grad_norm": 0.23164190351963043, "learning_rate": 3.963406855597009e-05, "loss": 0.0071, "step": 35290 }, { "grad_norm": 0.2774469256401062, "learning_rate": 3.960711103745861e-05, "loss": 0.0081, "step": 35300 }, { "grad_norm": 0.28086864948272705, "learning_rate": 3.958015667603237e-05, "loss": 0.0088, "step": 35310 }, { "grad_norm": 0.21511541306972504, "learning_rate": 3.955320547987943e-05, "loss": 0.0067, "step": 35320 }, { "grad_norm": 0.16067419946193695, "learning_rate": 3.952625745718681e-05, "loss": 0.0045, "step": 35330 }, { "grad_norm": 0.22300614416599274, "learning_rate": 3.949931261614064e-05, "loss": 0.0051, "step": 35340 }, { "grad_norm": 0.18996204435825348, "learning_rate": 3.947237096492605e-05, "loss": 0.0056, "step": 35350 }, { "grad_norm": 0.1919393390417099, "learning_rate": 3.944543251172719e-05, "loss": 0.0061, "step": 35360 }, { "grad_norm": 0.1869780570268631, "learning_rate": 3.941849726472725e-05, "loss": 0.0095, "step": 35370 }, { "grad_norm": 0.2237592339515686, "learning_rate": 3.939156523210846e-05, "loss": 0.007, "step": 35380 }, { "grad_norm": 0.23426155745983124, "learning_rate": 3.9364636422052046e-05, "loss": 0.0069, "step": 35390 }, { "grad_norm": 0.2058716118335724, "learning_rate": 3.933771084273828e-05, "loss": 0.0045, "step": 35400 }, { "grad_norm": 0.20779933035373688, "learning_rate": 3.931078850234643e-05, "loss": 0.0054, "step": 35410 }, { "grad_norm": 0.20989027619361877, "learning_rate": 3.928386940905483e-05, "loss": 0.0043, "step": 35420 }, { "grad_norm": 0.17320884764194489, "learning_rate": 3.925695357104073e-05, "loss": 0.0053, "step": 35430 }, { "grad_norm": 0.23786145448684692, "learning_rate": 3.923004099648049e-05, "loss": 0.0056, "step": 35440 }, { "grad_norm": 0.2616221010684967, "learning_rate": 3.920313169354944e-05, "loss": 0.0067, "step": 35450 }, { "grad_norm": 0.31618523597717285, "learning_rate": 3.9176225670421897e-05, "loss": 0.0054, "step": 35460 }, { "grad_norm": 0.17155753076076508, "learning_rate": 3.9149322935271224e-05, "loss": 0.0052, "step": 35470 }, { "grad_norm": 0.2382754385471344, "learning_rate": 3.9122423496269725e-05, "loss": 0.0059, "step": 35480 }, { "grad_norm": 0.17716126143932343, "learning_rate": 3.909552736158877e-05, "loss": 0.005, "step": 35490 }, { "grad_norm": 0.2661457359790802, "learning_rate": 3.90686345393987e-05, "loss": 0.0065, "step": 35500 }, { "grad_norm": 0.20925626158714294, "learning_rate": 3.9041745037868816e-05, "loss": 0.0063, "step": 35510 }, { "grad_norm": 0.20722226798534393, "learning_rate": 3.9014858865167465e-05, "loss": 0.005, "step": 35520 }, { "grad_norm": 0.2820894420146942, "learning_rate": 3.8987976029461935e-05, "loss": 0.0057, "step": 35530 }, { "grad_norm": 0.2097587138414383, "learning_rate": 3.896109653891853e-05, "loss": 0.0052, "step": 35540 }, { "grad_norm": 0.1808178573846817, "learning_rate": 3.893422040170254e-05, "loss": 0.0055, "step": 35550 }, { "grad_norm": 0.18658258020877838, "learning_rate": 3.8907347625978207e-05, "loss": 0.006, "step": 35560 }, { "grad_norm": 0.2943296730518341, "learning_rate": 3.88804782199088e-05, "loss": 0.0058, "step": 35570 }, { "grad_norm": 0.21354100108146667, "learning_rate": 3.8853612191656495e-05, "loss": 0.007, "step": 35580 }, { "grad_norm": 0.15112528204917908, "learning_rate": 3.88267495493825e-05, "loss": 0.0044, "step": 35590 }, { "grad_norm": 0.22299808263778687, "learning_rate": 3.8799890301247004e-05, "loss": 0.0065, "step": 35600 }, { "grad_norm": 0.13837043941020966, "learning_rate": 3.8773034455409096e-05, "loss": 0.0086, "step": 35610 }, { "grad_norm": 0.16553260385990143, "learning_rate": 3.8746182020026904e-05, "loss": 0.0055, "step": 35620 }, { "grad_norm": 0.162267804145813, "learning_rate": 3.871933300325745e-05, "loss": 0.0057, "step": 35630 }, { "grad_norm": 0.23652200400829315, "learning_rate": 3.869248741325679e-05, "loss": 0.0073, "step": 35640 }, { "grad_norm": 0.22564180195331573, "learning_rate": 3.866564525817992e-05, "loss": 0.0056, "step": 35650 }, { "grad_norm": 0.20974014699459076, "learning_rate": 3.8638806546180725e-05, "loss": 0.0052, "step": 35660 }, { "grad_norm": 0.26222461462020874, "learning_rate": 3.861197128541213e-05, "loss": 0.0066, "step": 35670 }, { "grad_norm": 0.2708519995212555, "learning_rate": 3.858513948402599e-05, "loss": 0.0058, "step": 35680 }, { "grad_norm": 0.19602173566818237, "learning_rate": 3.8558311150173077e-05, "loss": 0.0055, "step": 35690 }, { "grad_norm": 0.1802608072757721, "learning_rate": 3.853148629200312e-05, "loss": 0.0067, "step": 35700 }, { "grad_norm": 0.21257595717906952, "learning_rate": 3.850466491766482e-05, "loss": 0.0063, "step": 35710 }, { "grad_norm": 0.18815435469150543, "learning_rate": 3.847784703530583e-05, "loss": 0.0057, "step": 35720 }, { "grad_norm": 0.21281659603118896, "learning_rate": 3.845103265307266e-05, "loss": 0.008, "step": 35730 }, { "grad_norm": 0.21290278434753418, "learning_rate": 3.842422177911086e-05, "loss": 0.0051, "step": 35740 }, { "grad_norm": 0.147047758102417, "learning_rate": 3.8397414421564826e-05, "loss": 0.0048, "step": 35750 }, { "grad_norm": 0.19908230006694794, "learning_rate": 3.8370610588577935e-05, "loss": 0.0074, "step": 35760 }, { "grad_norm": 0.22344379127025604, "learning_rate": 3.834381028829251e-05, "loss": 0.0065, "step": 35770 }, { "grad_norm": 0.23005081713199615, "learning_rate": 3.8317013528849745e-05, "loss": 0.0058, "step": 35780 }, { "grad_norm": 0.20861759781837463, "learning_rate": 3.8290220318389815e-05, "loss": 0.0067, "step": 35790 }, { "grad_norm": 0.2356967180967331, "learning_rate": 3.8263430665051746e-05, "loss": 0.0061, "step": 35800 }, { "grad_norm": 0.22956997156143188, "learning_rate": 3.8236644576973554e-05, "loss": 0.0058, "step": 35810 }, { "grad_norm": 0.198786661028862, "learning_rate": 3.820986206229217e-05, "loss": 0.0049, "step": 35820 }, { "grad_norm": 0.21523654460906982, "learning_rate": 3.8183083129143384e-05, "loss": 0.0071, "step": 35830 }, { "grad_norm": 0.37201812863349915, "learning_rate": 3.815630778566193e-05, "loss": 0.0065, "step": 35840 }, { "grad_norm": 0.19066141545772552, "learning_rate": 3.812953603998145e-05, "loss": 0.0068, "step": 35850 }, { "grad_norm": 0.22858557105064392, "learning_rate": 3.8102767900234504e-05, "loss": 0.0048, "step": 35860 }, { "grad_norm": 0.2726020812988281, "learning_rate": 3.807600337455256e-05, "loss": 0.0055, "step": 35870 }, { "grad_norm": 0.2349100410938263, "learning_rate": 3.804924247106593e-05, "loss": 0.006, "step": 35880 }, { "grad_norm": 0.15375220775604248, "learning_rate": 3.8022485197903925e-05, "loss": 0.0053, "step": 35890 }, { "grad_norm": 0.19056303799152374, "learning_rate": 3.799573156319464e-05, "loss": 0.0059, "step": 35900 }, { "grad_norm": 0.22171743214130402, "learning_rate": 3.796898157506515e-05, "loss": 0.0055, "step": 35910 }, { "grad_norm": 0.24871499836444855, "learning_rate": 3.794223524164143e-05, "loss": 0.0059, "step": 35920 }, { "grad_norm": 0.2852282226085663, "learning_rate": 3.7915492571048245e-05, "loss": 0.0072, "step": 35930 }, { "grad_norm": 0.20029722154140472, "learning_rate": 3.788875357140937e-05, "loss": 0.0065, "step": 35940 }, { "grad_norm": 0.16307583451271057, "learning_rate": 3.786201825084736e-05, "loss": 0.0053, "step": 35950 }, { "grad_norm": 0.15474049746990204, "learning_rate": 3.783528661748372e-05, "loss": 0.0044, "step": 35960 }, { "grad_norm": 0.19798745214939117, "learning_rate": 3.780855867943882e-05, "loss": 0.004, "step": 35970 }, { "grad_norm": 0.15268008410930634, "learning_rate": 3.778183444483189e-05, "loss": 0.0053, "step": 35980 }, { "grad_norm": 0.19586017727851868, "learning_rate": 3.775511392178108e-05, "loss": 0.0047, "step": 35990 }, { "grad_norm": 0.21341058611869812, "learning_rate": 3.772839711840332e-05, "loss": 0.0058, "step": 36000 }, { "grad_norm": 0.19834588468074799, "learning_rate": 3.7701684042814515e-05, "loss": 0.0058, "step": 36010 }, { "grad_norm": 0.20601443946361542, "learning_rate": 3.76749747031294e-05, "loss": 0.0072, "step": 36020 }, { "grad_norm": 0.19616585969924927, "learning_rate": 3.764826910746152e-05, "loss": 0.006, "step": 36030 }, { "grad_norm": 0.22022952139377594, "learning_rate": 3.762156726392338e-05, "loss": 0.0055, "step": 36040 }, { "grad_norm": 0.17854173481464386, "learning_rate": 3.759486918062625e-05, "loss": 0.0052, "step": 36050 }, { "grad_norm": 0.20402492582798004, "learning_rate": 3.756817486568033e-05, "loss": 0.0064, "step": 36060 }, { "grad_norm": 0.23468294739723206, "learning_rate": 3.7541484327194654e-05, "loss": 0.0064, "step": 36070 }, { "grad_norm": 0.27246394753456116, "learning_rate": 3.751479757327707e-05, "loss": 0.0069, "step": 36080 }, { "grad_norm": 0.23805776238441467, "learning_rate": 3.7488114612034345e-05, "loss": 0.0083, "step": 36090 }, { "grad_norm": 0.18809965252876282, "learning_rate": 3.7461435451572044e-05, "loss": 0.0056, "step": 36100 }, { "grad_norm": 0.21221695840358734, "learning_rate": 3.743476009999459e-05, "loss": 0.0062, "step": 36110 }, { "grad_norm": 0.23103338479995728, "learning_rate": 3.7408088565405245e-05, "loss": 0.0043, "step": 36120 }, { "grad_norm": 0.22648471593856812, "learning_rate": 3.738142085590612e-05, "loss": 0.0066, "step": 36130 }, { "grad_norm": 0.1975145936012268, "learning_rate": 3.7354756979598194e-05, "loss": 0.0059, "step": 36140 }, { "grad_norm": 0.19435888528823853, "learning_rate": 3.7328096944581187e-05, "loss": 0.0065, "step": 36150 }, { "grad_norm": 0.2177690714597702, "learning_rate": 3.730144075895377e-05, "loss": 0.0051, "step": 36160 }, { "grad_norm": 0.2232530415058136, "learning_rate": 3.727478843081335e-05, "loss": 0.0054, "step": 36170 }, { "grad_norm": 0.23869039118289948, "learning_rate": 3.72481399682562e-05, "loss": 0.0074, "step": 36180 }, { "grad_norm": 0.23044690489768982, "learning_rate": 3.722149537937747e-05, "loss": 0.006, "step": 36190 }, { "grad_norm": 0.251235693693161, "learning_rate": 3.7194854672271015e-05, "loss": 0.0051, "step": 36200 }, { "grad_norm": 0.20466117560863495, "learning_rate": 3.7168217855029644e-05, "loss": 0.006, "step": 36210 }, { "grad_norm": 0.24486492574214935, "learning_rate": 3.7141584935744856e-05, "loss": 0.006, "step": 36220 }, { "grad_norm": 0.2942899465560913, "learning_rate": 3.7114955922507055e-05, "loss": 0.0056, "step": 36230 }, { "grad_norm": 0.20960935950279236, "learning_rate": 3.708833082340545e-05, "loss": 0.0059, "step": 36240 }, { "grad_norm": 0.3838084638118744, "learning_rate": 3.7061709646528034e-05, "loss": 0.0044, "step": 36250 }, { "grad_norm": 0.27701202034950256, "learning_rate": 3.7035092399961604e-05, "loss": 0.0066, "step": 36260 }, { "grad_norm": 0.2270684391260147, "learning_rate": 3.700847909179177e-05, "loss": 0.0081, "step": 36270 }, { "grad_norm": 0.2737200856208801, "learning_rate": 3.698186973010297e-05, "loss": 0.0092, "step": 36280 }, { "grad_norm": 0.24267403781414032, "learning_rate": 3.695526432297844e-05, "loss": 0.0054, "step": 36290 }, { "grad_norm": 0.1955595165491104, "learning_rate": 3.692866287850017e-05, "loss": 0.0047, "step": 36300 }, { "grad_norm": 0.1758509874343872, "learning_rate": 3.6902065404749006e-05, "loss": 0.0065, "step": 36310 }, { "grad_norm": 0.21280477941036224, "learning_rate": 3.6875471909804516e-05, "loss": 0.0057, "step": 36320 }, { "grad_norm": 0.23110578954219818, "learning_rate": 3.6848882401745135e-05, "loss": 0.01, "step": 36330 }, { "grad_norm": 0.19947132468223572, "learning_rate": 3.682229688864806e-05, "loss": 0.0086, "step": 36340 }, { "grad_norm": 0.22780759632587433, "learning_rate": 3.6795715378589235e-05, "loss": 0.0063, "step": 36350 }, { "grad_norm": 0.15910424292087555, "learning_rate": 3.676913787964345e-05, "loss": 0.0058, "step": 36360 }, { "grad_norm": 0.21257714927196503, "learning_rate": 3.674256439988423e-05, "loss": 0.0062, "step": 36370 }, { "grad_norm": 0.2670106589794159, "learning_rate": 3.6715994947383904e-05, "loss": 0.0065, "step": 36380 }, { "grad_norm": 0.1675681173801422, "learning_rate": 3.668942953021357e-05, "loss": 0.0053, "step": 36390 }, { "grad_norm": 0.20900532603263855, "learning_rate": 3.66628681564431e-05, "loss": 0.005, "step": 36400 }, { "grad_norm": 0.261904239654541, "learning_rate": 3.663631083414114e-05, "loss": 0.0062, "step": 36410 }, { "grad_norm": 0.13752001523971558, "learning_rate": 3.660975757137509e-05, "loss": 0.0057, "step": 36420 }, { "grad_norm": 0.23134367167949677, "learning_rate": 3.658320837621114e-05, "loss": 0.0074, "step": 36430 }, { "grad_norm": 0.23288044333457947, "learning_rate": 3.655666325671426e-05, "loss": 0.0059, "step": 36440 }, { "grad_norm": 0.21831224858760834, "learning_rate": 3.65301222209481e-05, "loss": 0.0056, "step": 36450 }, { "grad_norm": 0.2047238051891327, "learning_rate": 3.650358527697519e-05, "loss": 0.0066, "step": 36460 }, { "grad_norm": 0.19113247096538544, "learning_rate": 3.64770524328567e-05, "loss": 0.0055, "step": 36470 }, { "grad_norm": 0.16633984446525574, "learning_rate": 3.645052369665265e-05, "loss": 0.0067, "step": 36480 }, { "grad_norm": 0.2797467112541199, "learning_rate": 3.6423999076421724e-05, "loss": 0.0058, "step": 36490 }, { "grad_norm": 0.24397028982639313, "learning_rate": 3.639747858022142e-05, "loss": 0.0056, "step": 36500 }, { "grad_norm": 0.2170751988887787, "learning_rate": 3.637096221610799e-05, "loss": 0.0059, "step": 36510 }, { "grad_norm": 0.27747640013694763, "learning_rate": 3.634444999213638e-05, "loss": 0.0055, "step": 36520 }, { "grad_norm": 0.1372748166322708, "learning_rate": 3.6317941916360296e-05, "loss": 0.0055, "step": 36530 }, { "grad_norm": 0.20448923110961914, "learning_rate": 3.629143799683221e-05, "loss": 0.007, "step": 36540 }, { "grad_norm": 0.21278725564479828, "learning_rate": 3.626493824160331e-05, "loss": 0.0085, "step": 36550 }, { "grad_norm": 0.23292264342308044, "learning_rate": 3.623844265872352e-05, "loss": 0.0047, "step": 36560 }, { "grad_norm": 0.245171919465065, "learning_rate": 3.621195125624149e-05, "loss": 0.0057, "step": 36570 }, { "grad_norm": 0.20896212756633759, "learning_rate": 3.618546404220463e-05, "loss": 0.0067, "step": 36580 }, { "grad_norm": 0.19054430723190308, "learning_rate": 3.615898102465903e-05, "loss": 0.0079, "step": 36590 }, { "grad_norm": 0.1752590835094452, "learning_rate": 3.6132502211649544e-05, "loss": 0.0055, "step": 36600 }, { "grad_norm": 0.1642097383737564, "learning_rate": 3.610602761121975e-05, "loss": 0.0059, "step": 36610 }, { "grad_norm": 0.33313947916030884, "learning_rate": 3.6079557231411897e-05, "loss": 0.0075, "step": 36620 }, { "grad_norm": 0.1732037365436554, "learning_rate": 3.6053091080267035e-05, "loss": 0.0041, "step": 36630 }, { "grad_norm": 0.18742002546787262, "learning_rate": 3.602662916582483e-05, "loss": 0.0069, "step": 36640 }, { "grad_norm": 0.21860133111476898, "learning_rate": 3.600017149612375e-05, "loss": 0.0078, "step": 36650 }, { "grad_norm": 0.24872207641601562, "learning_rate": 3.5973718079200935e-05, "loss": 0.0075, "step": 36660 }, { "grad_norm": 0.21307551860809326, "learning_rate": 3.5947268923092216e-05, "loss": 0.0053, "step": 36670 }, { "grad_norm": 0.24681150913238525, "learning_rate": 3.592082403583216e-05, "loss": 0.0064, "step": 36680 }, { "grad_norm": 0.17814095318317413, "learning_rate": 3.5894383425454004e-05, "loss": 0.0045, "step": 36690 }, { "grad_norm": 0.18989083170890808, "learning_rate": 3.586794709998975e-05, "loss": 0.0051, "step": 36700 }, { "grad_norm": 0.1905387043952942, "learning_rate": 3.584151506747002e-05, "loss": 0.007, "step": 36710 }, { "grad_norm": 0.1567084938287735, "learning_rate": 3.581508733592418e-05, "loss": 0.0039, "step": 36720 }, { "grad_norm": 0.22371873259544373, "learning_rate": 3.5788663913380297e-05, "loss": 0.0071, "step": 36730 }, { "grad_norm": 0.2086075097322464, "learning_rate": 3.576224480786506e-05, "loss": 0.0054, "step": 36740 }, { "grad_norm": 0.2463143765926361, "learning_rate": 3.573583002740393e-05, "loss": 0.0085, "step": 36750 }, { "grad_norm": 0.21759995818138123, "learning_rate": 3.570941958002103e-05, "loss": 0.0055, "step": 36760 }, { "grad_norm": 0.1807888150215149, "learning_rate": 3.568301347373912e-05, "loss": 0.0047, "step": 36770 }, { "grad_norm": 0.2299349159002304, "learning_rate": 3.5656611716579726e-05, "loss": 0.0075, "step": 36780 }, { "grad_norm": 0.20724748075008392, "learning_rate": 3.5630214316562946e-05, "loss": 0.0054, "step": 36790 }, { "grad_norm": 0.20229975879192352, "learning_rate": 3.560382128170766e-05, "loss": 0.0061, "step": 36800 }, { "grad_norm": 0.20782332122325897, "learning_rate": 3.5577432620031374e-05, "loss": 0.0095, "step": 36810 }, { "grad_norm": 0.1981372833251953, "learning_rate": 3.5551048339550216e-05, "loss": 0.0078, "step": 36820 }, { "grad_norm": 0.20795145630836487, "learning_rate": 3.55246684482791e-05, "loss": 0.005, "step": 36830 }, { "grad_norm": 0.2088252753019333, "learning_rate": 3.5498292954231496e-05, "loss": 0.006, "step": 36840 }, { "grad_norm": 0.2797907590866089, "learning_rate": 3.54719218654196e-05, "loss": 0.0086, "step": 36850 }, { "grad_norm": 0.20427638292312622, "learning_rate": 3.544555518985425e-05, "loss": 0.007, "step": 36860 }, { "grad_norm": 0.2264554351568222, "learning_rate": 3.541919293554494e-05, "loss": 0.0043, "step": 36870 }, { "grad_norm": 0.21132589876651764, "learning_rate": 3.539283511049985e-05, "loss": 0.0048, "step": 36880 }, { "grad_norm": 0.22764794528484344, "learning_rate": 3.5366481722725755e-05, "loss": 0.0076, "step": 36890 }, { "grad_norm": 0.26951301097869873, "learning_rate": 3.534013278022816e-05, "loss": 0.0058, "step": 36900 }, { "grad_norm": 0.2956538796424866, "learning_rate": 3.531378829101113e-05, "loss": 0.0051, "step": 36910 }, { "grad_norm": 0.21075810492038727, "learning_rate": 3.528744826307746e-05, "loss": 0.0068, "step": 36920 }, { "grad_norm": 0.20373030006885529, "learning_rate": 3.5261112704428554e-05, "loss": 0.0072, "step": 36930 }, { "grad_norm": 0.17024897038936615, "learning_rate": 3.523478162306443e-05, "loss": 0.0047, "step": 36940 }, { "grad_norm": 0.18920721113681793, "learning_rate": 3.520845502698381e-05, "loss": 0.0051, "step": 36950 }, { "grad_norm": 0.2483489066362381, "learning_rate": 3.5182132924184005e-05, "loss": 0.0055, "step": 36960 }, { "grad_norm": 0.18171939253807068, "learning_rate": 3.5155815322660966e-05, "loss": 0.0053, "step": 36970 }, { "grad_norm": 0.1687224805355072, "learning_rate": 3.512950223040931e-05, "loss": 0.0044, "step": 36980 }, { "grad_norm": 0.1768805831670761, "learning_rate": 3.5103193655422216e-05, "loss": 0.0057, "step": 36990 }, { "grad_norm": 0.13669216632843018, "learning_rate": 3.5076889605691596e-05, "loss": 0.005, "step": 37000 }, { "grad_norm": 0.20554442703723907, "learning_rate": 3.505059008920787e-05, "loss": 0.0066, "step": 37010 }, { "grad_norm": 0.21139760315418243, "learning_rate": 3.502429511396016e-05, "loss": 0.0061, "step": 37020 }, { "grad_norm": 0.19338373839855194, "learning_rate": 3.4998004687936196e-05, "loss": 0.0073, "step": 37030 }, { "grad_norm": 0.1779462993144989, "learning_rate": 3.497171881912229e-05, "loss": 0.0073, "step": 37040 }, { "grad_norm": 0.26644426584243774, "learning_rate": 3.494543751550342e-05, "loss": 0.0053, "step": 37050 }, { "grad_norm": 0.2014368176460266, "learning_rate": 3.491916078506313e-05, "loss": 0.0062, "step": 37060 }, { "grad_norm": 0.31424960494041443, "learning_rate": 3.489288863578361e-05, "loss": 0.0075, "step": 37070 }, { "grad_norm": 0.28273335099220276, "learning_rate": 3.4866621075645646e-05, "loss": 0.007, "step": 37080 }, { "grad_norm": 0.2067117989063263, "learning_rate": 3.4840358112628614e-05, "loss": 0.0051, "step": 37090 }, { "grad_norm": 0.24792727828025818, "learning_rate": 3.481409975471053e-05, "loss": 0.0053, "step": 37100 }, { "grad_norm": 0.2381804883480072, "learning_rate": 3.4787846009867986e-05, "loss": 0.0054, "step": 37110 }, { "grad_norm": 0.21490605175495148, "learning_rate": 3.476159688607615e-05, "loss": 0.0052, "step": 37120 }, { "grad_norm": 0.21271218359470367, "learning_rate": 3.4735352391308854e-05, "loss": 0.0055, "step": 37130 }, { "grad_norm": 0.24120795726776123, "learning_rate": 3.4709112533538446e-05, "loss": 0.0092, "step": 37140 }, { "grad_norm": 0.2060900628566742, "learning_rate": 3.4682877320735934e-05, "loss": 0.0055, "step": 37150 }, { "grad_norm": 0.15543334186077118, "learning_rate": 3.465664676087085e-05, "loss": 0.004, "step": 37160 }, { "grad_norm": 0.20778962969779968, "learning_rate": 3.463042086191136e-05, "loss": 0.0057, "step": 37170 }, { "grad_norm": 0.22885197401046753, "learning_rate": 3.460419963182423e-05, "loss": 0.0057, "step": 37180 }, { "grad_norm": 0.22553709149360657, "learning_rate": 3.457798307857473e-05, "loss": 0.0058, "step": 37190 }, { "grad_norm": 0.2610466182231903, "learning_rate": 3.455177121012678e-05, "loss": 0.0053, "step": 37200 }, { "grad_norm": 0.1642984300851822, "learning_rate": 3.452556403444285e-05, "loss": 0.0079, "step": 37210 }, { "grad_norm": 0.16797909140586853, "learning_rate": 3.4499361559483975e-05, "loss": 0.0055, "step": 37220 }, { "grad_norm": 0.2554571330547333, "learning_rate": 3.44731637932098e-05, "loss": 0.0051, "step": 37230 }, { "grad_norm": 0.13070277869701385, "learning_rate": 3.44469707435785e-05, "loss": 0.0042, "step": 37240 }, { "grad_norm": 0.16924156248569489, "learning_rate": 3.4420782418546835e-05, "loss": 0.0054, "step": 37250 }, { "grad_norm": 0.17158930003643036, "learning_rate": 3.439459882607012e-05, "loss": 0.0054, "step": 37260 }, { "grad_norm": 0.10545698553323746, "learning_rate": 3.436841997410225e-05, "loss": 0.0038, "step": 37270 }, { "grad_norm": 0.21364626288414001, "learning_rate": 3.434224587059567e-05, "loss": 0.0059, "step": 37280 }, { "grad_norm": 0.17869031429290771, "learning_rate": 3.431607652350136e-05, "loss": 0.0045, "step": 37290 }, { "grad_norm": 0.15119893848896027, "learning_rate": 3.428991194076891e-05, "loss": 0.0056, "step": 37300 }, { "grad_norm": 0.2110467553138733, "learning_rate": 3.4263752130346394e-05, "loss": 0.0058, "step": 37310 }, { "grad_norm": 0.12615124881267548, "learning_rate": 3.4237597100180515e-05, "loss": 0.0048, "step": 37320 }, { "grad_norm": 0.18072180449962616, "learning_rate": 3.4211446858216427e-05, "loss": 0.0035, "step": 37330 }, { "grad_norm": 0.1312078833580017, "learning_rate": 3.4185301412397915e-05, "loss": 0.0039, "step": 37340 }, { "grad_norm": 0.25086510181427, "learning_rate": 3.415916077066729e-05, "loss": 0.0078, "step": 37350 }, { "grad_norm": 0.1735914945602417, "learning_rate": 3.413302494096535e-05, "loss": 0.0055, "step": 37360 }, { "grad_norm": 0.21667224168777466, "learning_rate": 3.410689393123151e-05, "loss": 0.0045, "step": 37370 }, { "grad_norm": 0.18501809239387512, "learning_rate": 3.408076774940364e-05, "loss": 0.0077, "step": 37380 }, { "grad_norm": 0.23403215408325195, "learning_rate": 3.40546464034182e-05, "loss": 0.0041, "step": 37390 }, { "grad_norm": 0.16335240006446838, "learning_rate": 3.4028529901210185e-05, "loss": 0.0064, "step": 37400 }, { "grad_norm": 0.2166048139333725, "learning_rate": 3.4002418250713086e-05, "loss": 0.0049, "step": 37410 }, { "grad_norm": 0.31649965047836304, "learning_rate": 3.3976311459858936e-05, "loss": 0.0063, "step": 37420 }, { "grad_norm": 0.20769289135932922, "learning_rate": 3.395020953657826e-05, "loss": 0.0058, "step": 37430 }, { "grad_norm": 0.2356630265712738, "learning_rate": 3.3924112488800165e-05, "loss": 0.0057, "step": 37440 }, { "grad_norm": 0.255302369594574, "learning_rate": 3.389802032445225e-05, "loss": 0.0051, "step": 37450 }, { "grad_norm": 0.1806274950504303, "learning_rate": 3.38719330514606e-05, "loss": 0.0044, "step": 37460 }, { "grad_norm": 0.22856494784355164, "learning_rate": 3.3845850677749866e-05, "loss": 0.0084, "step": 37470 }, { "grad_norm": 0.15293876826763153, "learning_rate": 3.3819773211243157e-05, "loss": 0.0058, "step": 37480 }, { "grad_norm": 0.19884006679058075, "learning_rate": 3.379370065986213e-05, "loss": 0.007, "step": 37490 }, { "grad_norm": 0.19559234380722046, "learning_rate": 3.3767633031526955e-05, "loss": 0.0062, "step": 37500 }, { "grad_norm": 0.20624510943889618, "learning_rate": 3.374157033415626e-05, "loss": 0.0047, "step": 37510 }, { "grad_norm": 0.2057885378599167, "learning_rate": 3.371551257566723e-05, "loss": 0.004, "step": 37520 }, { "grad_norm": 0.20738784968852997, "learning_rate": 3.36894597639755e-05, "loss": 0.0062, "step": 37530 }, { "grad_norm": 0.22211676836013794, "learning_rate": 3.366341190699523e-05, "loss": 0.0052, "step": 37540 }, { "grad_norm": 0.2413415163755417, "learning_rate": 3.36373690126391e-05, "loss": 0.0057, "step": 37550 }, { "grad_norm": 0.21675710380077362, "learning_rate": 3.3611331088818234e-05, "loss": 0.0054, "step": 37560 }, { "grad_norm": 0.30930137634277344, "learning_rate": 3.3585298143442265e-05, "loss": 0.0078, "step": 37570 }, { "grad_norm": 0.27456626296043396, "learning_rate": 3.35592701844193e-05, "loss": 0.0048, "step": 37580 }, { "grad_norm": 0.20125313103199005, "learning_rate": 3.353324721965596e-05, "loss": 0.0057, "step": 37590 }, { "grad_norm": 0.2426241636276245, "learning_rate": 3.350722925705736e-05, "loss": 0.0072, "step": 37600 }, { "grad_norm": 0.11537458002567291, "learning_rate": 3.348121630452703e-05, "loss": 0.0077, "step": 37610 }, { "grad_norm": 0.17236436903476715, "learning_rate": 3.3455208369967044e-05, "loss": 0.0057, "step": 37620 }, { "grad_norm": 0.17111149430274963, "learning_rate": 3.34292054612779e-05, "loss": 0.0043, "step": 37630 }, { "grad_norm": 0.16594715416431427, "learning_rate": 3.340320758635861e-05, "loss": 0.0063, "step": 37640 }, { "grad_norm": 0.16533873975276947, "learning_rate": 3.337721475310666e-05, "loss": 0.0048, "step": 37650 }, { "grad_norm": 0.25538715720176697, "learning_rate": 3.335122696941795e-05, "loss": 0.0057, "step": 37660 }, { "grad_norm": 0.1986265778541565, "learning_rate": 3.332524424318692e-05, "loss": 0.0058, "step": 37670 }, { "grad_norm": 0.22327803075313568, "learning_rate": 3.32992665823064e-05, "loss": 0.0056, "step": 37680 }, { "grad_norm": 0.1464325487613678, "learning_rate": 3.327329399466774e-05, "loss": 0.0049, "step": 37690 }, { "grad_norm": 0.1957155019044876, "learning_rate": 3.324732648816072e-05, "loss": 0.0069, "step": 37700 }, { "grad_norm": 0.26993077993392944, "learning_rate": 3.322136407067358e-05, "loss": 0.0048, "step": 37710 }, { "grad_norm": 0.16770979762077332, "learning_rate": 3.3195406750093036e-05, "loss": 0.006, "step": 37720 }, { "grad_norm": 0.2029825896024704, "learning_rate": 3.3169454534304205e-05, "loss": 0.0052, "step": 37730 }, { "grad_norm": 0.1719246506690979, "learning_rate": 3.3143507431190725e-05, "loss": 0.0048, "step": 37740 }, { "grad_norm": 0.29215943813323975, "learning_rate": 3.311756544863459e-05, "loss": 0.0052, "step": 37750 }, { "grad_norm": 0.20363165438175201, "learning_rate": 3.309162859451633e-05, "loss": 0.0057, "step": 37760 }, { "grad_norm": 0.19112004339694977, "learning_rate": 3.306569687671487e-05, "loss": 0.0064, "step": 37770 }, { "grad_norm": 0.25219428539276123, "learning_rate": 3.303977030310756e-05, "loss": 0.0044, "step": 37780 }, { "grad_norm": 0.18868236243724823, "learning_rate": 3.3013848881570245e-05, "loss": 0.0067, "step": 37790 }, { "grad_norm": 0.151606485247612, "learning_rate": 3.298793261997712e-05, "loss": 0.0047, "step": 37800 }, { "grad_norm": 0.2017272263765335, "learning_rate": 3.2962021526200893e-05, "loss": 0.0041, "step": 37810 }, { "grad_norm": 0.1984231173992157, "learning_rate": 3.293611560811268e-05, "loss": 0.0054, "step": 37820 }, { "grad_norm": 0.19376057386398315, "learning_rate": 3.291021487358199e-05, "loss": 0.0066, "step": 37830 }, { "grad_norm": 0.21602368354797363, "learning_rate": 3.28843193304768e-05, "loss": 0.0067, "step": 37840 }, { "grad_norm": 0.1858169436454773, "learning_rate": 3.2858428986663456e-05, "loss": 0.0052, "step": 37850 }, { "grad_norm": 0.7198758125305176, "learning_rate": 3.283254385000681e-05, "loss": 0.006, "step": 37860 }, { "grad_norm": 0.1725969761610031, "learning_rate": 3.2806663928370076e-05, "loss": 0.0052, "step": 37870 }, { "grad_norm": 0.11645271629095078, "learning_rate": 3.278078922961485e-05, "loss": 0.004, "step": 37880 }, { "grad_norm": 0.18152737617492676, "learning_rate": 3.275491976160123e-05, "loss": 0.0057, "step": 37890 }, { "grad_norm": 0.23510292172431946, "learning_rate": 3.2729055532187645e-05, "loss": 0.0079, "step": 37900 }, { "grad_norm": 0.2650381922721863, "learning_rate": 3.270319654923097e-05, "loss": 0.0045, "step": 37910 }, { "grad_norm": 0.16366882622241974, "learning_rate": 3.2677342820586506e-05, "loss": 0.0053, "step": 37920 }, { "grad_norm": 0.19202515482902527, "learning_rate": 3.2651494354107905e-05, "loss": 0.0077, "step": 37930 }, { "grad_norm": 0.25940415263175964, "learning_rate": 3.2625651157647266e-05, "loss": 0.0071, "step": 37940 }, { "grad_norm": 0.19381456077098846, "learning_rate": 3.259981323905505e-05, "loss": 0.0048, "step": 37950 }, { "grad_norm": 0.1702100932598114, "learning_rate": 3.257398060618014e-05, "loss": 0.0075, "step": 37960 }, { "grad_norm": 0.1628592610359192, "learning_rate": 3.254815326686983e-05, "loss": 0.0057, "step": 37970 }, { "grad_norm": 0.2136130928993225, "learning_rate": 3.2522331228969774e-05, "loss": 0.0072, "step": 37980 }, { "grad_norm": 0.17628826200962067, "learning_rate": 3.2496514500324006e-05, "loss": 0.0058, "step": 37990 }, { "grad_norm": 0.16468124091625214, "learning_rate": 3.247070308877498e-05, "loss": 0.0042, "step": 38000 }, { "grad_norm": 0.1680259108543396, "learning_rate": 3.2444897002163515e-05, "loss": 0.0048, "step": 38010 }, { "grad_norm": 0.15244245529174805, "learning_rate": 3.241909624832885e-05, "loss": 0.0041, "step": 38020 }, { "grad_norm": 0.1866675168275833, "learning_rate": 3.239330083510852e-05, "loss": 0.0042, "step": 38030 }, { "grad_norm": 0.2662525177001953, "learning_rate": 3.236751077033855e-05, "loss": 0.0047, "step": 38040 }, { "grad_norm": 0.24083131551742554, "learning_rate": 3.234172606185322e-05, "loss": 0.0053, "step": 38050 }, { "grad_norm": 0.24699541926383972, "learning_rate": 3.231594671748528e-05, "loss": 0.0075, "step": 38060 }, { "grad_norm": 0.18466341495513916, "learning_rate": 3.2290172745065815e-05, "loss": 0.0058, "step": 38070 }, { "grad_norm": 0.19507554173469543, "learning_rate": 3.226440415242426e-05, "loss": 0.0078, "step": 38080 }, { "grad_norm": 0.19676101207733154, "learning_rate": 3.223864094738846e-05, "loss": 0.006, "step": 38090 }, { "grad_norm": 0.2458454966545105, "learning_rate": 3.221288313778456e-05, "loss": 0.0052, "step": 38100 }, { "grad_norm": 0.24716630578041077, "learning_rate": 3.2187130731437125e-05, "loss": 0.0089, "step": 38110 }, { "grad_norm": 0.22803299129009247, "learning_rate": 3.216138373616905e-05, "loss": 0.0075, "step": 38120 }, { "grad_norm": 0.21906951069831848, "learning_rate": 3.21356421598016e-05, "loss": 0.008, "step": 38130 }, { "grad_norm": 0.2550855576992035, "learning_rate": 3.210990601015438e-05, "loss": 0.0057, "step": 38140 }, { "grad_norm": 0.17309622466564178, "learning_rate": 3.208417529504535e-05, "loss": 0.0071, "step": 38150 }, { "grad_norm": 0.1554785519838333, "learning_rate": 3.205845002229084e-05, "loss": 0.0046, "step": 38160 }, { "grad_norm": 0.15522806346416473, "learning_rate": 3.203273019970547e-05, "loss": 0.0047, "step": 38170 }, { "grad_norm": 0.2033078521490097, "learning_rate": 3.200701583510227e-05, "loss": 0.0069, "step": 38180 }, { "grad_norm": 0.27065756916999817, "learning_rate": 3.198130693629261e-05, "loss": 0.0055, "step": 38190 }, { "grad_norm": 0.29369938373565674, "learning_rate": 3.195560351108612e-05, "loss": 0.0076, "step": 38200 }, { "grad_norm": 0.23765428364276886, "learning_rate": 3.1929905567290865e-05, "loss": 0.0103, "step": 38210 }, { "grad_norm": 0.20930853486061096, "learning_rate": 3.1904213112713164e-05, "loss": 0.0065, "step": 38220 }, { "grad_norm": 0.1556563526391983, "learning_rate": 3.187852615515774e-05, "loss": 0.0075, "step": 38230 }, { "grad_norm": 0.19510279595851898, "learning_rate": 3.1852844702427606e-05, "loss": 0.0049, "step": 38240 }, { "grad_norm": 0.20544369518756866, "learning_rate": 3.18271687623241e-05, "loss": 0.0051, "step": 38250 }, { "grad_norm": 0.24184343218803406, "learning_rate": 3.1801498342646896e-05, "loss": 0.0075, "step": 38260 }, { "grad_norm": 0.24722795188426971, "learning_rate": 3.177583345119398e-05, "loss": 0.0052, "step": 38270 }, { "grad_norm": 0.26140618324279785, "learning_rate": 3.17501740957617e-05, "loss": 0.0076, "step": 38280 }, { "grad_norm": 0.27534765005111694, "learning_rate": 3.172452028414467e-05, "loss": 0.007, "step": 38290 }, { "grad_norm": 0.19065791368484497, "learning_rate": 3.169887202413583e-05, "loss": 0.0068, "step": 38300 }, { "grad_norm": 0.2361488938331604, "learning_rate": 3.167322932352646e-05, "loss": 0.0067, "step": 38310 }, { "grad_norm": 0.159803107380867, "learning_rate": 3.164759219010613e-05, "loss": 0.0039, "step": 38320 }, { "grad_norm": 0.208531454205513, "learning_rate": 3.1621960631662725e-05, "loss": 0.0046, "step": 38330 }, { "grad_norm": 0.12291572988033295, "learning_rate": 3.159633465598245e-05, "loss": 0.004, "step": 38340 }, { "grad_norm": 0.2683856189250946, "learning_rate": 3.1570714270849767e-05, "loss": 0.0057, "step": 38350 }, { "grad_norm": 0.17563460767269135, "learning_rate": 3.1545099484047516e-05, "loss": 0.0041, "step": 38360 }, { "grad_norm": 0.21362178027629852, "learning_rate": 3.151949030335674e-05, "loss": 0.0064, "step": 38370 }, { "grad_norm": 0.20487098395824432, "learning_rate": 3.149388673655687e-05, "loss": 0.0063, "step": 38380 }, { "grad_norm": 0.2187328040599823, "learning_rate": 3.146828879142559e-05, "loss": 0.0073, "step": 38390 }, { "grad_norm": 0.15002216398715973, "learning_rate": 3.1442696475738866e-05, "loss": 0.0063, "step": 38400 }, { "grad_norm": 0.16359585523605347, "learning_rate": 3.141710979727098e-05, "loss": 0.0037, "step": 38410 }, { "grad_norm": 0.1778039187192917, "learning_rate": 3.139152876379447e-05, "loss": 0.0066, "step": 38420 }, { "grad_norm": 0.19601871073246002, "learning_rate": 3.1365953383080214e-05, "loss": 0.0056, "step": 38430 }, { "grad_norm": 0.26296091079711914, "learning_rate": 3.134038366289731e-05, "loss": 0.0056, "step": 38440 }, { "grad_norm": 0.241267129778862, "learning_rate": 3.131481961101317e-05, "loss": 0.005, "step": 38450 }, { "grad_norm": 0.210506871342659, "learning_rate": 3.128926123519349e-05, "loss": 0.005, "step": 38460 }, { "grad_norm": 0.15222042798995972, "learning_rate": 3.1263708543202194e-05, "loss": 0.0046, "step": 38470 }, { "grad_norm": 0.25686559081077576, "learning_rate": 3.123816154280155e-05, "loss": 0.0063, "step": 38480 }, { "grad_norm": 0.18360815942287445, "learning_rate": 3.121262024175207e-05, "loss": 0.0057, "step": 38490 }, { "grad_norm": 0.23407138884067535, "learning_rate": 3.118708464781248e-05, "loss": 0.0091, "step": 38500 }, { "grad_norm": 0.13351522386074066, "learning_rate": 3.116155476873987e-05, "loss": 0.005, "step": 38510 }, { "grad_norm": 0.21726030111312866, "learning_rate": 3.11360306122895e-05, "loss": 0.0081, "step": 38520 }, { "grad_norm": 0.22795534133911133, "learning_rate": 3.1110512186214975e-05, "loss": 0.0043, "step": 38530 }, { "grad_norm": 0.18992166221141815, "learning_rate": 3.1084999498268095e-05, "loss": 0.0042, "step": 38540 }, { "grad_norm": 0.19822633266448975, "learning_rate": 3.1059492556198934e-05, "loss": 0.0064, "step": 38550 }, { "grad_norm": 0.29902753233909607, "learning_rate": 3.103399136775586e-05, "loss": 0.0077, "step": 38560 }, { "grad_norm": 0.21984446048736572, "learning_rate": 3.100849594068541e-05, "loss": 0.005, "step": 38570 }, { "grad_norm": 0.22554202377796173, "learning_rate": 3.0983006282732484e-05, "loss": 0.0047, "step": 38580 }, { "grad_norm": 0.2012355923652649, "learning_rate": 3.0957522401640116e-05, "loss": 0.0072, "step": 38590 }, { "grad_norm": 0.24658608436584473, "learning_rate": 3.0932044305149645e-05, "loss": 0.0051, "step": 38600 }, { "grad_norm": 0.2997727394104004, "learning_rate": 3.090657200100068e-05, "loss": 0.0068, "step": 38610 }, { "grad_norm": 0.16216319799423218, "learning_rate": 3.088110549693099e-05, "loss": 0.0049, "step": 38620 }, { "grad_norm": 0.21875208616256714, "learning_rate": 3.085564480067667e-05, "loss": 0.0069, "step": 38630 }, { "grad_norm": 0.19128629565238953, "learning_rate": 3.0830189919971955e-05, "loss": 0.0069, "step": 38640 }, { "grad_norm": 0.2854331135749817, "learning_rate": 3.080474086254939e-05, "loss": 0.0056, "step": 38650 }, { "grad_norm": 0.23712821304798126, "learning_rate": 3.077929763613975e-05, "loss": 0.0051, "step": 38660 }, { "grad_norm": 0.1727815717458725, "learning_rate": 3.075386024847198e-05, "loss": 0.0055, "step": 38670 }, { "grad_norm": 0.14114877581596375, "learning_rate": 3.072842870727331e-05, "loss": 0.0042, "step": 38680 }, { "grad_norm": 0.18886399269104004, "learning_rate": 3.070300302026916e-05, "loss": 0.0048, "step": 38690 }, { "grad_norm": 0.1381268948316574, "learning_rate": 3.067758319518318e-05, "loss": 0.0044, "step": 38700 }, { "grad_norm": 0.164319708943367, "learning_rate": 3.065216923973725e-05, "loss": 0.0046, "step": 38710 }, { "grad_norm": 0.18960459530353546, "learning_rate": 3.062676116165145e-05, "loss": 0.004, "step": 38720 }, { "grad_norm": 0.2099323272705078, "learning_rate": 3.06013589686441e-05, "loss": 0.0071, "step": 38730 }, { "grad_norm": 0.26826804876327515, "learning_rate": 3.05759626684317e-05, "loss": 0.01, "step": 38740 }, { "grad_norm": 0.2196415662765503, "learning_rate": 3.055057226872896e-05, "loss": 0.005, "step": 38750 }, { "grad_norm": 0.15844856202602386, "learning_rate": 3.052518777724887e-05, "loss": 0.0055, "step": 38760 }, { "grad_norm": 0.2702769339084625, "learning_rate": 3.04998092017025e-05, "loss": 0.0076, "step": 38770 }, { "grad_norm": 0.2672969400882721, "learning_rate": 3.0474436549799246e-05, "loss": 0.0071, "step": 38780 }, { "grad_norm": 0.16617973148822784, "learning_rate": 3.044906982924661e-05, "loss": 0.0069, "step": 38790 }, { "grad_norm": 0.2865569293498993, "learning_rate": 3.0423709047750337e-05, "loss": 0.0048, "step": 38800 }, { "grad_norm": 0.24740882217884064, "learning_rate": 3.03983542130144e-05, "loss": 0.0079, "step": 38810 }, { "grad_norm": 0.16724632680416107, "learning_rate": 3.0373005332740877e-05, "loss": 0.0056, "step": 38820 }, { "grad_norm": 0.15566882491111755, "learning_rate": 3.034766241463013e-05, "loss": 0.0042, "step": 38830 }, { "grad_norm": 0.23041506111621857, "learning_rate": 3.032232546638064e-05, "loss": 0.0065, "step": 38840 }, { "grad_norm": 0.17141301929950714, "learning_rate": 3.0296994495689114e-05, "loss": 0.0049, "step": 38850 }, { "grad_norm": 0.26385313272476196, "learning_rate": 3.0271669510250444e-05, "loss": 0.0053, "step": 38860 }, { "grad_norm": 0.21630458533763885, "learning_rate": 3.024635051775766e-05, "loss": 0.0073, "step": 38870 }, { "grad_norm": 0.20403070747852325, "learning_rate": 3.022103752590205e-05, "loss": 0.0055, "step": 38880 }, { "grad_norm": 0.17463800311088562, "learning_rate": 3.0195730542372992e-05, "loss": 0.0042, "step": 38890 }, { "grad_norm": 0.15132248401641846, "learning_rate": 3.0170429574858084e-05, "loss": 0.0036, "step": 38900 }, { "grad_norm": 0.19021949172019958, "learning_rate": 3.0145134631043127e-05, "loss": 0.0053, "step": 38910 }, { "grad_norm": 0.20701666176319122, "learning_rate": 3.0119845718612018e-05, "loss": 0.0045, "step": 38920 }, { "grad_norm": 0.18600764870643616, "learning_rate": 3.009456284524688e-05, "loss": 0.0043, "step": 38930 }, { "grad_norm": 0.20948046445846558, "learning_rate": 3.0069286018627967e-05, "loss": 0.0046, "step": 38940 }, { "grad_norm": 0.18533526360988617, "learning_rate": 3.0044015246433743e-05, "loss": 0.0038, "step": 38950 }, { "grad_norm": 0.19902680814266205, "learning_rate": 3.0018750536340755e-05, "loss": 0.005, "step": 38960 }, { "grad_norm": 0.20158135890960693, "learning_rate": 2.999349189602378e-05, "loss": 0.0039, "step": 38970 }, { "grad_norm": 0.27020877599716187, "learning_rate": 2.9968239333155733e-05, "loss": 0.0044, "step": 38980 }, { "grad_norm": 0.20163144171237946, "learning_rate": 2.994299285540767e-05, "loss": 0.0041, "step": 38990 }, { "grad_norm": 0.185372456908226, "learning_rate": 2.9917752470448813e-05, "loss": 0.0041, "step": 39000 }, { "grad_norm": 0.26789581775665283, "learning_rate": 2.9892518185946495e-05, "loss": 0.0077, "step": 39010 }, { "grad_norm": 0.28187650442123413, "learning_rate": 2.986729000956624e-05, "loss": 0.0062, "step": 39020 }, { "grad_norm": 0.14906315505504608, "learning_rate": 2.9842067948971736e-05, "loss": 0.0039, "step": 39030 }, { "grad_norm": 0.22005410492420197, "learning_rate": 2.9816852011824727e-05, "loss": 0.0059, "step": 39040 }, { "grad_norm": 0.1969199776649475, "learning_rate": 2.979164220578519e-05, "loss": 0.0063, "step": 39050 }, { "grad_norm": 0.19907855987548828, "learning_rate": 2.9766438538511165e-05, "loss": 0.0049, "step": 39060 }, { "grad_norm": 0.22292138636112213, "learning_rate": 2.9741241017658873e-05, "loss": 0.0046, "step": 39070 }, { "grad_norm": 0.15400312840938568, "learning_rate": 2.971604965088267e-05, "loss": 0.0031, "step": 39080 }, { "grad_norm": 0.1860182136297226, "learning_rate": 2.9690864445835008e-05, "loss": 0.0057, "step": 39090 }, { "grad_norm": 0.26553893089294434, "learning_rate": 2.966568541016651e-05, "loss": 0.0044, "step": 39100 }, { "grad_norm": 0.1889704167842865, "learning_rate": 2.9640512551525867e-05, "loss": 0.0079, "step": 39110 }, { "grad_norm": 0.18434156477451324, "learning_rate": 2.961534587755995e-05, "loss": 0.0051, "step": 39120 }, { "grad_norm": 0.23027871549129486, "learning_rate": 2.959018539591375e-05, "loss": 0.005, "step": 39130 }, { "grad_norm": 0.2066088318824768, "learning_rate": 2.9565031114230325e-05, "loss": 0.008, "step": 39140 }, { "grad_norm": 0.22787030041217804, "learning_rate": 2.9539883040150895e-05, "loss": 0.005, "step": 39150 }, { "grad_norm": 0.21578925848007202, "learning_rate": 2.9514741181314774e-05, "loss": 0.0049, "step": 39160 }, { "grad_norm": 0.1817273646593094, "learning_rate": 2.94896055453594e-05, "loss": 0.0041, "step": 39170 }, { "grad_norm": 0.19556480646133423, "learning_rate": 2.9464476139920332e-05, "loss": 0.0038, "step": 39180 }, { "grad_norm": 0.2627725303173065, "learning_rate": 2.9439352972631186e-05, "loss": 0.0056, "step": 39190 }, { "grad_norm": 0.24497874081134796, "learning_rate": 2.9414236051123757e-05, "loss": 0.0059, "step": 39200 }, { "grad_norm": 0.2422783523797989, "learning_rate": 2.938912538302785e-05, "loss": 0.0048, "step": 39210 }, { "grad_norm": 0.15815244615077972, "learning_rate": 2.9364020975971464e-05, "loss": 0.0064, "step": 39220 }, { "grad_norm": 0.194553941488266, "learning_rate": 2.9338922837580657e-05, "loss": 0.0041, "step": 39230 }, { "grad_norm": 0.21852512657642365, "learning_rate": 2.931383097547955e-05, "loss": 0.0047, "step": 39240 }, { "grad_norm": 0.1858682930469513, "learning_rate": 2.928874539729043e-05, "loss": 0.0077, "step": 39250 }, { "grad_norm": 0.1445969045162201, "learning_rate": 2.926366611063358e-05, "loss": 0.0048, "step": 39260 }, { "grad_norm": 0.3509559631347656, "learning_rate": 2.9238593123127463e-05, "loss": 0.0046, "step": 39270 }, { "grad_norm": 0.20819500088691711, "learning_rate": 2.9213526442388583e-05, "loss": 0.0039, "step": 39280 }, { "grad_norm": 0.2361106276512146, "learning_rate": 2.9188466076031545e-05, "loss": 0.0093, "step": 39290 }, { "grad_norm": 0.18084196746349335, "learning_rate": 2.9163412031669012e-05, "loss": 0.0046, "step": 39300 }, { "grad_norm": 0.1923476755619049, "learning_rate": 2.913836431691175e-05, "loss": 0.007, "step": 39310 }, { "grad_norm": 0.26897430419921875, "learning_rate": 2.9113322939368583e-05, "loss": 0.0078, "step": 39320 }, { "grad_norm": 0.20294654369354248, "learning_rate": 2.9088287906646427e-05, "loss": 0.0045, "step": 39330 }, { "grad_norm": 0.2674720585346222, "learning_rate": 2.906325922635024e-05, "loss": 0.0092, "step": 39340 }, { "grad_norm": 0.3135108947753906, "learning_rate": 2.903823690608313e-05, "loss": 0.0064, "step": 39350 }, { "grad_norm": 0.3090534508228302, "learning_rate": 2.9013220953446174e-05, "loss": 0.0063, "step": 39360 }, { "grad_norm": 0.34263965487480164, "learning_rate": 2.8988211376038564e-05, "loss": 0.0049, "step": 39370 }, { "grad_norm": 0.2082008421421051, "learning_rate": 2.8963208181457564e-05, "loss": 0.0067, "step": 39380 }, { "grad_norm": 0.20318923890590668, "learning_rate": 2.8938211377298453e-05, "loss": 0.0048, "step": 39390 }, { "grad_norm": 0.19519469141960144, "learning_rate": 2.8913220971154652e-05, "loss": 0.0048, "step": 39400 }, { "grad_norm": 0.29822787642478943, "learning_rate": 2.888823697061753e-05, "loss": 0.0053, "step": 39410 }, { "grad_norm": 0.1345885992050171, "learning_rate": 2.8863259383276618e-05, "loss": 0.0035, "step": 39420 }, { "grad_norm": 0.16829681396484375, "learning_rate": 2.8838288216719395e-05, "loss": 0.0047, "step": 39430 }, { "grad_norm": 0.20874419808387756, "learning_rate": 2.8813323478531484e-05, "loss": 0.0047, "step": 39440 }, { "grad_norm": 0.1568189263343811, "learning_rate": 2.8788365176296496e-05, "loss": 0.0064, "step": 39450 }, { "grad_norm": 0.1999816745519638, "learning_rate": 2.876341331759611e-05, "loss": 0.0049, "step": 39460 }, { "grad_norm": 0.2438332736492157, "learning_rate": 2.8738467910010036e-05, "loss": 0.005, "step": 39470 }, { "grad_norm": 0.13956566154956818, "learning_rate": 2.8713528961116032e-05, "loss": 0.0056, "step": 39480 }, { "grad_norm": 0.19313320517539978, "learning_rate": 2.8688596478489875e-05, "loss": 0.0052, "step": 39490 }, { "grad_norm": 0.16613535583019257, "learning_rate": 2.8663670469705434e-05, "loss": 0.0052, "step": 39500 }, { "grad_norm": 0.174787238240242, "learning_rate": 2.8638750942334546e-05, "loss": 0.0077, "step": 39510 }, { "grad_norm": 0.13661451637744904, "learning_rate": 2.8613837903947115e-05, "loss": 0.009, "step": 39520 }, { "grad_norm": 0.23490449786186218, "learning_rate": 2.858893136211106e-05, "loss": 0.0048, "step": 39530 }, { "grad_norm": 0.2397747039794922, "learning_rate": 2.8564031324392315e-05, "loss": 0.0052, "step": 39540 }, { "grad_norm": 0.16338609158992767, "learning_rate": 2.85391377983549e-05, "loss": 0.0041, "step": 39550 }, { "grad_norm": 0.19758690893650055, "learning_rate": 2.851425079156075e-05, "loss": 0.0034, "step": 39560 }, { "grad_norm": 0.18963731825351715, "learning_rate": 2.848937031156994e-05, "loss": 0.0057, "step": 39570 }, { "grad_norm": 0.17410822212696075, "learning_rate": 2.846449636594044e-05, "loss": 0.0051, "step": 39580 }, { "grad_norm": 0.14894436299800873, "learning_rate": 2.843962896222836e-05, "loss": 0.0074, "step": 39590 }, { "grad_norm": 0.16451454162597656, "learning_rate": 2.8414768107987722e-05, "loss": 0.0052, "step": 39600 }, { "grad_norm": 0.24079783260822296, "learning_rate": 2.838991381077061e-05, "loss": 0.0063, "step": 39610 }, { "grad_norm": 0.1851942390203476, "learning_rate": 2.83650660781271e-05, "loss": 0.0052, "step": 39620 }, { "grad_norm": 0.22354325652122498, "learning_rate": 2.8340224917605285e-05, "loss": 0.0048, "step": 39630 }, { "grad_norm": 0.2125934362411499, "learning_rate": 2.831539033675122e-05, "loss": 0.0048, "step": 39640 }, { "grad_norm": 0.17922115325927734, "learning_rate": 2.8290562343109038e-05, "loss": 0.0053, "step": 39650 }, { "grad_norm": 0.23346026241779327, "learning_rate": 2.826574094422082e-05, "loss": 0.0063, "step": 39660 }, { "grad_norm": 0.18768727779388428, "learning_rate": 2.8240926147626645e-05, "loss": 0.0047, "step": 39670 }, { "grad_norm": 0.11735614389181137, "learning_rate": 2.8216117960864586e-05, "loss": 0.0049, "step": 39680 }, { "grad_norm": 0.1411198079586029, "learning_rate": 2.8191316391470703e-05, "loss": 0.0042, "step": 39690 }, { "grad_norm": 0.14979593455791473, "learning_rate": 2.816652144697911e-05, "loss": 0.0046, "step": 39700 }, { "grad_norm": 0.20470823347568512, "learning_rate": 2.8141733134921783e-05, "loss": 0.0056, "step": 39710 }, { "grad_norm": 0.11361949145793915, "learning_rate": 2.811695146282884e-05, "loss": 0.0047, "step": 39720 }, { "grad_norm": 0.14694096148014069, "learning_rate": 2.8092176438228212e-05, "loss": 0.0057, "step": 39730 }, { "grad_norm": 0.2079312652349472, "learning_rate": 2.806740806864598e-05, "loss": 0.0046, "step": 39740 }, { "grad_norm": 0.21088510751724243, "learning_rate": 2.804264636160604e-05, "loss": 0.0049, "step": 39750 }, { "grad_norm": 0.17210254073143005, "learning_rate": 2.8017891324630402e-05, "loss": 0.0052, "step": 39760 }, { "grad_norm": 0.1771102398633957, "learning_rate": 2.7993142965238976e-05, "loss": 0.0043, "step": 39770 }, { "grad_norm": 0.2409965991973877, "learning_rate": 2.7968401290949665e-05, "loss": 0.0044, "step": 39780 }, { "grad_norm": 0.31642183661460876, "learning_rate": 2.7943666309278328e-05, "loss": 0.0058, "step": 39790 }, { "grad_norm": 0.23338210582733154, "learning_rate": 2.7918938027738783e-05, "loss": 0.004, "step": 39800 }, { "grad_norm": 0.19371086359024048, "learning_rate": 2.789421645384287e-05, "loss": 0.0072, "step": 39810 }, { "grad_norm": 0.1881861537694931, "learning_rate": 2.786950159510032e-05, "loss": 0.005, "step": 39820 }, { "grad_norm": 0.1700982004404068, "learning_rate": 2.7844793459018876e-05, "loss": 0.0043, "step": 39830 }, { "grad_norm": 0.1657911092042923, "learning_rate": 2.7820092053104195e-05, "loss": 0.0057, "step": 39840 }, { "grad_norm": 0.2305365353822708, "learning_rate": 2.7795397384859933e-05, "loss": 0.0082, "step": 39850 }, { "grad_norm": 0.2516365051269531, "learning_rate": 2.7770709461787638e-05, "loss": 0.008, "step": 39860 }, { "grad_norm": 0.18513330817222595, "learning_rate": 2.7746028291386915e-05, "loss": 0.0046, "step": 39870 }, { "grad_norm": 0.15349088609218597, "learning_rate": 2.772135388115519e-05, "loss": 0.0043, "step": 39880 }, { "grad_norm": 0.18356260657310486, "learning_rate": 2.7696686238587945e-05, "loss": 0.0047, "step": 39890 }, { "grad_norm": 0.20053298771381378, "learning_rate": 2.7672025371178505e-05, "loss": 0.0054, "step": 39900 }, { "grad_norm": 0.13689036667346954, "learning_rate": 2.7647371286418238e-05, "loss": 0.008, "step": 39910 }, { "grad_norm": 0.2753556966781616, "learning_rate": 2.762272399179639e-05, "loss": 0.0072, "step": 39920 }, { "grad_norm": 0.14381039142608643, "learning_rate": 2.7598083494800154e-05, "loss": 0.0041, "step": 39930 }, { "grad_norm": 0.2688097655773163, "learning_rate": 2.7573449802914664e-05, "loss": 0.0042, "step": 39940 }, { "grad_norm": 0.10915149748325348, "learning_rate": 2.7548822923622964e-05, "loss": 0.006, "step": 39950 }, { "grad_norm": 0.2022131383419037, "learning_rate": 2.752420286440609e-05, "loss": 0.0046, "step": 39960 }, { "grad_norm": 0.1734929084777832, "learning_rate": 2.749958963274295e-05, "loss": 0.0042, "step": 39970 }, { "grad_norm": 0.19679979979991913, "learning_rate": 2.747498323611039e-05, "loss": 0.0053, "step": 39980 }, { "grad_norm": 0.15817561745643616, "learning_rate": 2.7450383681983184e-05, "loss": 0.0061, "step": 39990 }, { "grad_norm": 0.20190629363059998, "learning_rate": 2.742579097783403e-05, "loss": 0.0052, "step": 40000 } ], "logging_steps": 10, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }