{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 2.181729793548584, "learning_rate": 9e-07, "loss": 1.6349, "step": 10 }, { "grad_norm": 1.84587824344635, "learning_rate": 1.9e-06, "loss": 1.6123, "step": 20 }, { "grad_norm": 0.9250903129577637, "learning_rate": 2.9e-06, "loss": 1.5608, "step": 30 }, { "grad_norm": 1.2065900564193726, "learning_rate": 3.9e-06, "loss": 1.4881, "step": 40 }, { "grad_norm": 0.9651978611946106, "learning_rate": 4.9000000000000005e-06, "loss": 1.4098, "step": 50 }, { "grad_norm": 1.1396987438201904, "learning_rate": 5.9e-06, "loss": 1.298, "step": 60 }, { "grad_norm": 0.9417771100997925, "learning_rate": 6.900000000000001e-06, "loss": 1.2038, "step": 70 }, { "grad_norm": 0.8259028196334839, "learning_rate": 7.9e-06, "loss": 1.1359, "step": 80 }, { "grad_norm": 0.7381906509399414, "learning_rate": 8.9e-06, "loss": 1.0851, "step": 90 }, { "grad_norm": 0.475220650434494, "learning_rate": 9.900000000000002e-06, "loss": 1.0614, "step": 100 }, { "grad_norm": 0.5050864219665527, "learning_rate": 1.09e-05, "loss": 1.0536, "step": 110 }, { "grad_norm": 0.551364541053772, "learning_rate": 1.19e-05, "loss": 1.0455, "step": 120 }, { "grad_norm": 0.7080048322677612, "learning_rate": 1.29e-05, "loss": 1.04, "step": 130 }, { "grad_norm": 0.5572801828384399, "learning_rate": 1.3900000000000002e-05, "loss": 1.0267, "step": 140 }, { "grad_norm": 0.9155464768409729, "learning_rate": 1.49e-05, "loss": 1.0378, "step": 150 }, { "grad_norm": 0.5611173510551453, "learning_rate": 1.59e-05, "loss": 1.0367, "step": 160 }, { "grad_norm": 0.851026177406311, "learning_rate": 1.69e-05, "loss": 1.0427, "step": 170 }, { "grad_norm": 0.43606680631637573, "learning_rate": 1.79e-05, "loss": 1.0385, "step": 180 }, { "grad_norm": 1.6352815628051758, "learning_rate": 1.8900000000000002e-05, "loss": 1.0395, "step": 190 }, { "grad_norm": 1.5075136423110962, "learning_rate": 1.9900000000000003e-05, "loss": 1.0306, "step": 200 }, { "grad_norm": 0.7621788382530212, "learning_rate": 2.09e-05, "loss": 1.0633, "step": 210 }, { "grad_norm": 0.49086982011795044, "learning_rate": 2.19e-05, "loss": 1.0198, "step": 220 }, { "grad_norm": 0.6513771414756775, "learning_rate": 2.29e-05, "loss": 1.0139, "step": 230 }, { "grad_norm": 0.6476977467536926, "learning_rate": 2.39e-05, "loss": 1.002, "step": 240 }, { "grad_norm": 0.8046707510948181, "learning_rate": 2.4900000000000002e-05, "loss": 1.0131, "step": 250 }, { "grad_norm": 0.9255441427230835, "learning_rate": 2.5900000000000003e-05, "loss": 0.9891, "step": 260 }, { "grad_norm": 0.6620059609413147, "learning_rate": 2.6900000000000003e-05, "loss": 0.9676, "step": 270 }, { "grad_norm": 0.9781240820884705, "learning_rate": 2.7900000000000004e-05, "loss": 0.967, "step": 280 }, { "grad_norm": 0.8584622144699097, "learning_rate": 2.8899999999999998e-05, "loss": 0.9516, "step": 290 }, { "grad_norm": 1.368175983428955, "learning_rate": 2.9900000000000002e-05, "loss": 0.9173, "step": 300 }, { "grad_norm": 0.989246666431427, "learning_rate": 3.09e-05, "loss": 0.8963, "step": 310 }, { "grad_norm": 1.0129282474517822, "learning_rate": 3.19e-05, "loss": 0.8672, "step": 320 }, { "grad_norm": 1.066400408744812, "learning_rate": 3.29e-05, "loss": 0.8219, "step": 330 }, { "grad_norm": 1.1342850923538208, "learning_rate": 3.3900000000000004e-05, "loss": 0.7875, "step": 340 }, { "grad_norm": 1.1470438241958618, "learning_rate": 3.49e-05, "loss": 0.7531, "step": 350 }, { "grad_norm": 0.939744234085083, "learning_rate": 3.59e-05, "loss": 0.7325, "step": 360 }, { "grad_norm": 0.936755359172821, "learning_rate": 3.69e-05, "loss": 0.7027, "step": 370 }, { "grad_norm": 1.2180145978927612, "learning_rate": 3.79e-05, "loss": 0.664, "step": 380 }, { "grad_norm": 1.1033592224121094, "learning_rate": 3.8900000000000004e-05, "loss": 0.652, "step": 390 }, { "grad_norm": 1.1081236600875854, "learning_rate": 3.99e-05, "loss": 0.6298, "step": 400 }, { "grad_norm": 1.2397894859313965, "learning_rate": 4.09e-05, "loss": 0.6222, "step": 410 }, { "grad_norm": 1.2302908897399902, "learning_rate": 4.19e-05, "loss": 0.5975, "step": 420 }, { "grad_norm": 1.2718716859817505, "learning_rate": 4.29e-05, "loss": 0.5722, "step": 430 }, { "grad_norm": 1.109932541847229, "learning_rate": 4.39e-05, "loss": 0.5438, "step": 440 }, { "grad_norm": 1.1174564361572266, "learning_rate": 4.49e-05, "loss": 0.5266, "step": 450 }, { "grad_norm": 1.1747809648513794, "learning_rate": 4.5900000000000004e-05, "loss": 0.5076, "step": 460 }, { "grad_norm": 1.224544644355774, "learning_rate": 4.69e-05, "loss": 0.4953, "step": 470 }, { "grad_norm": 1.29561448097229, "learning_rate": 4.79e-05, "loss": 0.4903, "step": 480 }, { "grad_norm": 1.049411416053772, "learning_rate": 4.89e-05, "loss": 0.4534, "step": 490 }, { "grad_norm": 1.028067946434021, "learning_rate": 4.99e-05, "loss": 0.4506, "step": 500 }, { "grad_norm": 1.0810296535491943, "learning_rate": 5.0900000000000004e-05, "loss": 0.4321, "step": 510 }, { "grad_norm": 0.8169877529144287, "learning_rate": 5.19e-05, "loss": 0.4115, "step": 520 }, { "grad_norm": 1.1582118272781372, "learning_rate": 5.2900000000000005e-05, "loss": 0.4173, "step": 530 }, { "grad_norm": 1.1482149362564087, "learning_rate": 5.390000000000001e-05, "loss": 0.4001, "step": 540 }, { "grad_norm": 1.2496120929718018, "learning_rate": 5.4900000000000006e-05, "loss": 0.3905, "step": 550 }, { "grad_norm": 1.0071156024932861, "learning_rate": 5.590000000000001e-05, "loss": 0.3825, "step": 560 }, { "grad_norm": 1.0406757593154907, "learning_rate": 5.69e-05, "loss": 0.381, "step": 570 }, { "grad_norm": 1.1631706953048706, "learning_rate": 5.79e-05, "loss": 0.3671, "step": 580 }, { "grad_norm": 1.138960361480713, "learning_rate": 5.89e-05, "loss": 0.3589, "step": 590 }, { "grad_norm": 0.9639902114868164, "learning_rate": 5.99e-05, "loss": 0.3475, "step": 600 }, { "grad_norm": 1.5503042936325073, "learning_rate": 6.09e-05, "loss": 0.3382, "step": 610 }, { "grad_norm": 1.1809571981430054, "learning_rate": 6.19e-05, "loss": 0.3198, "step": 620 }, { "grad_norm": 1.2552809715270996, "learning_rate": 6.29e-05, "loss": 0.3039, "step": 630 }, { "grad_norm": 1.0211730003356934, "learning_rate": 6.390000000000001e-05, "loss": 0.3032, "step": 640 }, { "grad_norm": 0.9970591068267822, "learning_rate": 6.49e-05, "loss": 0.296, "step": 650 }, { "grad_norm": 1.248809814453125, "learning_rate": 6.59e-05, "loss": 0.2756, "step": 660 }, { "grad_norm": 1.2625703811645508, "learning_rate": 6.690000000000001e-05, "loss": 0.2911, "step": 670 }, { "grad_norm": 1.0326027870178223, "learning_rate": 6.790000000000001e-05, "loss": 0.26, "step": 680 }, { "grad_norm": 1.1761547327041626, "learning_rate": 6.89e-05, "loss": 0.2514, "step": 690 }, { "grad_norm": 1.127117395401001, "learning_rate": 6.99e-05, "loss": 0.2493, "step": 700 }, { "grad_norm": 1.1771677732467651, "learning_rate": 7.09e-05, "loss": 0.2424, "step": 710 }, { "grad_norm": 0.8880641460418701, "learning_rate": 7.19e-05, "loss": 0.2185, "step": 720 }, { "grad_norm": 1.165806770324707, "learning_rate": 7.29e-05, "loss": 0.2099, "step": 730 }, { "grad_norm": 1.373753547668457, "learning_rate": 7.390000000000001e-05, "loss": 0.1856, "step": 740 }, { "grad_norm": 1.4519215822219849, "learning_rate": 7.49e-05, "loss": 0.1838, "step": 750 }, { "grad_norm": 0.944869875907898, "learning_rate": 7.59e-05, "loss": 0.1622, "step": 760 }, { "grad_norm": 1.1860719919204712, "learning_rate": 7.69e-05, "loss": 0.1634, "step": 770 }, { "grad_norm": 0.9756244421005249, "learning_rate": 7.790000000000001e-05, "loss": 0.1588, "step": 780 }, { "grad_norm": 1.1594505310058594, "learning_rate": 7.890000000000001e-05, "loss": 0.1492, "step": 790 }, { "grad_norm": 0.9914552569389343, "learning_rate": 7.99e-05, "loss": 0.1458, "step": 800 }, { "grad_norm": 1.269118070602417, "learning_rate": 8.090000000000001e-05, "loss": 0.1542, "step": 810 }, { "grad_norm": 1.2252837419509888, "learning_rate": 8.19e-05, "loss": 0.1397, "step": 820 }, { "grad_norm": 2.476262331008911, "learning_rate": 8.29e-05, "loss": 0.1663, "step": 830 }, { "grad_norm": 0.8104544878005981, "learning_rate": 8.39e-05, "loss": 0.1541, "step": 840 }, { "grad_norm": 1.2556098699569702, "learning_rate": 8.49e-05, "loss": 0.1478, "step": 850 }, { "grad_norm": 0.9899939894676208, "learning_rate": 8.59e-05, "loss": 0.1445, "step": 860 }, { "grad_norm": 1.0164157152175903, "learning_rate": 8.69e-05, "loss": 0.1357, "step": 870 }, { "grad_norm": 0.7789561748504639, "learning_rate": 8.790000000000001e-05, "loss": 0.1311, "step": 880 }, { "grad_norm": 1.1991299390792847, "learning_rate": 8.89e-05, "loss": 0.1282, "step": 890 }, { "grad_norm": 1.0712475776672363, "learning_rate": 8.99e-05, "loss": 0.1502, "step": 900 }, { "grad_norm": 0.9742149710655212, "learning_rate": 9.090000000000001e-05, "loss": 0.1248, "step": 910 }, { "grad_norm": 0.9561946988105774, "learning_rate": 9.190000000000001e-05, "loss": 0.1414, "step": 920 }, { "grad_norm": 0.7228536605834961, "learning_rate": 9.290000000000001e-05, "loss": 0.1265, "step": 930 }, { "grad_norm": 0.8962449431419373, "learning_rate": 9.39e-05, "loss": 0.1335, "step": 940 }, { "grad_norm": 1.252480387687683, "learning_rate": 9.49e-05, "loss": 0.1521, "step": 950 }, { "grad_norm": 1.0166624784469604, "learning_rate": 9.59e-05, "loss": 0.1332, "step": 960 }, { "grad_norm": 0.9230273365974426, "learning_rate": 9.69e-05, "loss": 0.1282, "step": 970 }, { "grad_norm": 1.0204179286956787, "learning_rate": 9.790000000000001e-05, "loss": 0.1381, "step": 980 }, { "grad_norm": 1.0774950981140137, "learning_rate": 9.89e-05, "loss": 0.1495, "step": 990 }, { "grad_norm": 0.8270016312599182, "learning_rate": 9.99e-05, "loss": 0.1382, "step": 1000 }, { "grad_norm": 0.8741334676742554, "learning_rate": 9.999994463727085e-05, "loss": 0.1318, "step": 1010 }, { "grad_norm": 0.8817099928855896, "learning_rate": 9.999975326009292e-05, "loss": 0.1384, "step": 1020 }, { "grad_norm": 0.7642080187797546, "learning_rate": 9.999942518549879e-05, "loss": 0.1292, "step": 1030 }, { "grad_norm": 1.2142404317855835, "learning_rate": 9.999896041438544e-05, "loss": 0.1192, "step": 1040 }, { "grad_norm": 0.9134361147880554, "learning_rate": 9.999835894802353e-05, "loss": 0.1175, "step": 1050 }, { "grad_norm": 1.1910396814346313, "learning_rate": 9.999762078805743e-05, "loss": 0.1177, "step": 1060 }, { "grad_norm": 1.1027071475982666, "learning_rate": 9.999674593650526e-05, "loss": 0.1123, "step": 1070 }, { "grad_norm": 0.9540095925331116, "learning_rate": 9.99957343957588e-05, "loss": 0.098, "step": 1080 }, { "grad_norm": 0.8476457595825195, "learning_rate": 9.99945861685836e-05, "loss": 0.0965, "step": 1090 }, { "grad_norm": 0.7956319451332092, "learning_rate": 9.999330125811884e-05, "loss": 0.1045, "step": 1100 }, { "grad_norm": 1.280799150466919, "learning_rate": 9.999187966787744e-05, "loss": 0.1121, "step": 1110 }, { "grad_norm": 0.8434506058692932, "learning_rate": 9.999032140174595e-05, "loss": 0.1045, "step": 1120 }, { "grad_norm": 0.846332848072052, "learning_rate": 9.998862646398464e-05, "loss": 0.1047, "step": 1130 }, { "grad_norm": 0.883997917175293, "learning_rate": 9.998679485922739e-05, "loss": 0.1012, "step": 1140 }, { "grad_norm": 0.6565617322921753, "learning_rate": 9.998482659248174e-05, "loss": 0.0962, "step": 1150 }, { "grad_norm": 0.6210595965385437, "learning_rate": 9.998272166912883e-05, "loss": 0.0912, "step": 1160 }, { "grad_norm": 0.7833346724510193, "learning_rate": 9.998048009492347e-05, "loss": 0.0944, "step": 1170 }, { "grad_norm": 1.000130295753479, "learning_rate": 9.997810187599403e-05, "loss": 0.0943, "step": 1180 }, { "grad_norm": 0.8826977610588074, "learning_rate": 9.997558701884249e-05, "loss": 0.0984, "step": 1190 }, { "grad_norm": 0.6985938549041748, "learning_rate": 9.997293553034433e-05, "loss": 0.0959, "step": 1200 }, { "grad_norm": 0.7897707223892212, "learning_rate": 9.997014741774866e-05, "loss": 0.0948, "step": 1210 }, { "grad_norm": 0.792839765548706, "learning_rate": 9.996722268867803e-05, "loss": 0.0813, "step": 1220 }, { "grad_norm": 0.8250198364257812, "learning_rate": 9.996416135112858e-05, "loss": 0.0844, "step": 1230 }, { "grad_norm": 0.7786329388618469, "learning_rate": 9.996096341346988e-05, "loss": 0.0773, "step": 1240 }, { "grad_norm": 0.686416506767273, "learning_rate": 9.995762888444495e-05, "loss": 0.086, "step": 1250 }, { "grad_norm": 0.9180893301963806, "learning_rate": 9.995415777317027e-05, "loss": 0.0789, "step": 1260 }, { "grad_norm": 0.8380926847457886, "learning_rate": 9.995055008913574e-05, "loss": 0.0911, "step": 1270 }, { "grad_norm": 1.0363383293151855, "learning_rate": 9.994680584220463e-05, "loss": 0.0936, "step": 1280 }, { "grad_norm": 0.7882492542266846, "learning_rate": 9.994292504261355e-05, "loss": 0.091, "step": 1290 }, { "grad_norm": 0.6642778515815735, "learning_rate": 9.993890770097247e-05, "loss": 0.0822, "step": 1300 }, { "grad_norm": 0.7613900303840637, "learning_rate": 9.993475382826467e-05, "loss": 0.0733, "step": 1310 }, { "grad_norm": 0.6594424247741699, "learning_rate": 9.993046343584664e-05, "loss": 0.0889, "step": 1320 }, { "grad_norm": 0.8057368993759155, "learning_rate": 9.992603653544816e-05, "loss": 0.0685, "step": 1330 }, { "grad_norm": 0.6207922101020813, "learning_rate": 9.992147313917222e-05, "loss": 0.0711, "step": 1340 }, { "grad_norm": 0.5594179630279541, "learning_rate": 9.991677325949497e-05, "loss": 0.0637, "step": 1350 }, { "grad_norm": 0.6301924586296082, "learning_rate": 9.991193690926568e-05, "loss": 0.0736, "step": 1360 }, { "grad_norm": 0.6433706879615784, "learning_rate": 9.990696410170678e-05, "loss": 0.0644, "step": 1370 }, { "grad_norm": 0.924923837184906, "learning_rate": 9.990185485041371e-05, "loss": 0.0689, "step": 1380 }, { "grad_norm": 0.6202785968780518, "learning_rate": 9.989660916935498e-05, "loss": 0.0671, "step": 1390 }, { "grad_norm": 0.8818733096122742, "learning_rate": 9.989122707287208e-05, "loss": 0.0743, "step": 1400 }, { "grad_norm": 0.9659419059753418, "learning_rate": 9.988570857567945e-05, "loss": 0.0769, "step": 1410 }, { "grad_norm": 0.708203911781311, "learning_rate": 9.988005369286446e-05, "loss": 0.0725, "step": 1420 }, { "grad_norm": 0.48283180594444275, "learning_rate": 9.987426243988734e-05, "loss": 0.0741, "step": 1430 }, { "grad_norm": 0.5805427432060242, "learning_rate": 9.986833483258114e-05, "loss": 0.0729, "step": 1440 }, { "grad_norm": 0.9235712885856628, "learning_rate": 9.986227088715173e-05, "loss": 0.0717, "step": 1450 }, { "grad_norm": 0.8887157440185547, "learning_rate": 9.98560706201777e-05, "loss": 0.071, "step": 1460 }, { "grad_norm": 0.6535228490829468, "learning_rate": 9.984973404861036e-05, "loss": 0.0698, "step": 1470 }, { "grad_norm": 0.6180679202079773, "learning_rate": 9.984326118977361e-05, "loss": 0.0727, "step": 1480 }, { "grad_norm": 0.5691157579421997, "learning_rate": 9.983665206136406e-05, "loss": 0.0649, "step": 1490 }, { "grad_norm": 0.7082071304321289, "learning_rate": 9.982990668145075e-05, "loss": 0.0854, "step": 1500 }, { "grad_norm": 0.6680605411529541, "learning_rate": 9.982302506847534e-05, "loss": 0.071, "step": 1510 }, { "grad_norm": 0.6881573796272278, "learning_rate": 9.981600724125189e-05, "loss": 0.0665, "step": 1520 }, { "grad_norm": 2.4010303020477295, "learning_rate": 9.980885321896685e-05, "loss": 0.0696, "step": 1530 }, { "grad_norm": 0.8133783936500549, "learning_rate": 9.980156302117905e-05, "loss": 0.0935, "step": 1540 }, { "grad_norm": 0.69209223985672, "learning_rate": 9.979413666781963e-05, "loss": 0.0818, "step": 1550 }, { "grad_norm": 0.5600221157073975, "learning_rate": 9.978657417919193e-05, "loss": 0.082, "step": 1560 }, { "grad_norm": 0.8262001276016235, "learning_rate": 9.977887557597153e-05, "loss": 0.0762, "step": 1570 }, { "grad_norm": 0.5649595856666565, "learning_rate": 9.97710408792061e-05, "loss": 0.0663, "step": 1580 }, { "grad_norm": 0.5783659219741821, "learning_rate": 9.976307011031542e-05, "loss": 0.0655, "step": 1590 }, { "grad_norm": 0.6117439270019531, "learning_rate": 9.975496329109126e-05, "loss": 0.067, "step": 1600 }, { "grad_norm": 0.6517682075500488, "learning_rate": 9.974672044369732e-05, "loss": 0.0672, "step": 1610 }, { "grad_norm": 0.6064258217811584, "learning_rate": 9.97383415906693e-05, "loss": 0.0596, "step": 1620 }, { "grad_norm": 0.5580758452415466, "learning_rate": 9.97298267549146e-05, "loss": 0.0611, "step": 1630 }, { "grad_norm": 0.5929970741271973, "learning_rate": 9.972117595971249e-05, "loss": 0.0571, "step": 1640 }, { "grad_norm": 0.6233961582183838, "learning_rate": 9.971238922871391e-05, "loss": 0.0662, "step": 1650 }, { "grad_norm": 0.6190022230148315, "learning_rate": 9.970346658594142e-05, "loss": 0.0609, "step": 1660 }, { "grad_norm": 0.70819091796875, "learning_rate": 9.969440805578923e-05, "loss": 0.0641, "step": 1670 }, { "grad_norm": 0.673477292060852, "learning_rate": 9.968521366302298e-05, "loss": 0.072, "step": 1680 }, { "grad_norm": 0.798906147480011, "learning_rate": 9.967588343277981e-05, "loss": 0.0769, "step": 1690 }, { "grad_norm": 0.681957483291626, "learning_rate": 9.966641739056818e-05, "loss": 0.076, "step": 1700 }, { "grad_norm": 0.939273476600647, "learning_rate": 9.965681556226793e-05, "loss": 0.0733, "step": 1710 }, { "grad_norm": 0.6763797998428345, "learning_rate": 9.964707797413006e-05, "loss": 0.0755, "step": 1720 }, { "grad_norm": 0.6557914018630981, "learning_rate": 9.963720465277679e-05, "loss": 0.0774, "step": 1730 }, { "grad_norm": 0.5904055833816528, "learning_rate": 9.96271956252014e-05, "loss": 0.0724, "step": 1740 }, { "grad_norm": 0.532781183719635, "learning_rate": 9.961705091876816e-05, "loss": 0.0702, "step": 1750 }, { "grad_norm": 0.6403186917304993, "learning_rate": 9.960677056121235e-05, "loss": 0.072, "step": 1760 }, { "grad_norm": 0.6167631149291992, "learning_rate": 9.959635458064005e-05, "loss": 0.0696, "step": 1770 }, { "grad_norm": 0.6313881278038025, "learning_rate": 9.958580300552815e-05, "loss": 0.0684, "step": 1780 }, { "grad_norm": 0.5699571967124939, "learning_rate": 9.957511586472426e-05, "loss": 0.0719, "step": 1790 }, { "grad_norm": 1.3282815217971802, "learning_rate": 9.956429318744662e-05, "loss": 0.0625, "step": 1800 }, { "grad_norm": 0.7587619423866272, "learning_rate": 9.955333500328404e-05, "loss": 0.0689, "step": 1810 }, { "grad_norm": 0.47306573390960693, "learning_rate": 9.95422413421957e-05, "loss": 0.0665, "step": 1820 }, { "grad_norm": 0.7589190006256104, "learning_rate": 9.953101223451133e-05, "loss": 0.062, "step": 1830 }, { "grad_norm": 0.7980528473854065, "learning_rate": 9.951964771093085e-05, "loss": 0.0611, "step": 1840 }, { "grad_norm": 0.6117101907730103, "learning_rate": 9.950814780252442e-05, "loss": 0.0607, "step": 1850 }, { "grad_norm": 0.5013240575790405, "learning_rate": 9.949651254073236e-05, "loss": 0.0749, "step": 1860 }, { "grad_norm": 0.6423071026802063, "learning_rate": 9.948474195736504e-05, "loss": 0.0767, "step": 1870 }, { "grad_norm": 0.573163628578186, "learning_rate": 9.947283608460277e-05, "loss": 0.0668, "step": 1880 }, { "grad_norm": 0.5614108443260193, "learning_rate": 9.946079495499577e-05, "loss": 0.0592, "step": 1890 }, { "grad_norm": 0.5699909329414368, "learning_rate": 9.944861860146401e-05, "loss": 0.0718, "step": 1900 }, { "grad_norm": 0.7450922131538391, "learning_rate": 9.943630705729719e-05, "loss": 0.0691, "step": 1910 }, { "grad_norm": 0.5995872020721436, "learning_rate": 9.942386035615459e-05, "loss": 0.0683, "step": 1920 }, { "grad_norm": 0.5596151351928711, "learning_rate": 9.941127853206503e-05, "loss": 0.0679, "step": 1930 }, { "grad_norm": 0.46712827682495117, "learning_rate": 9.939856161942673e-05, "loss": 0.0607, "step": 1940 }, { "grad_norm": 0.5417409539222717, "learning_rate": 9.938570965300724e-05, "loss": 0.0636, "step": 1950 }, { "grad_norm": 0.6594402194023132, "learning_rate": 9.937272266794335e-05, "loss": 0.0645, "step": 1960 }, { "grad_norm": 0.7908797264099121, "learning_rate": 9.935960069974096e-05, "loss": 0.0717, "step": 1970 }, { "grad_norm": 0.5489366054534912, "learning_rate": 9.934634378427506e-05, "loss": 0.0774, "step": 1980 }, { "grad_norm": 0.5664447546005249, "learning_rate": 9.933295195778954e-05, "loss": 0.062, "step": 1990 }, { "grad_norm": 0.45476943254470825, "learning_rate": 9.931942525689715e-05, "loss": 0.0558, "step": 2000 }, { "grad_norm": 0.5109198689460754, "learning_rate": 9.930576371857936e-05, "loss": 0.0511, "step": 2010 }, { "grad_norm": 0.47588807344436646, "learning_rate": 9.929196738018629e-05, "loss": 0.0573, "step": 2020 }, { "grad_norm": 0.6191189885139465, "learning_rate": 9.927803627943662e-05, "loss": 0.064, "step": 2030 }, { "grad_norm": 0.5179525017738342, "learning_rate": 9.926397045441744e-05, "loss": 0.066, "step": 2040 }, { "grad_norm": 0.5242493748664856, "learning_rate": 9.924976994358417e-05, "loss": 0.0586, "step": 2050 }, { "grad_norm": 0.5597029328346252, "learning_rate": 9.923543478576048e-05, "loss": 0.0548, "step": 2060 }, { "grad_norm": 0.5291748046875, "learning_rate": 9.922096502013813e-05, "loss": 0.0532, "step": 2070 }, { "grad_norm": 0.5598992109298706, "learning_rate": 9.92063606862769e-05, "loss": 0.0524, "step": 2080 }, { "grad_norm": 0.616943359375, "learning_rate": 9.919162182410453e-05, "loss": 0.0523, "step": 2090 }, { "grad_norm": 0.36449649930000305, "learning_rate": 9.917674847391645e-05, "loss": 0.0494, "step": 2100 }, { "grad_norm": 0.5966471433639526, "learning_rate": 9.916174067637584e-05, "loss": 0.0521, "step": 2110 }, { "grad_norm": 0.6452257037162781, "learning_rate": 9.914659847251348e-05, "loss": 0.0545, "step": 2120 }, { "grad_norm": 0.4392760097980499, "learning_rate": 9.913132190372753e-05, "loss": 0.0545, "step": 2130 }, { "grad_norm": 0.5927709937095642, "learning_rate": 9.911591101178359e-05, "loss": 0.057, "step": 2140 }, { "grad_norm": 0.5518070459365845, "learning_rate": 9.910036583881443e-05, "loss": 0.0602, "step": 2150 }, { "grad_norm": 0.584982693195343, "learning_rate": 9.908468642731995e-05, "loss": 0.0521, "step": 2160 }, { "grad_norm": 0.4870074689388275, "learning_rate": 9.906887282016707e-05, "loss": 0.0531, "step": 2170 }, { "grad_norm": 0.7150896191596985, "learning_rate": 9.90529250605896e-05, "loss": 0.0605, "step": 2180 }, { "grad_norm": 0.4752698838710785, "learning_rate": 9.903684319218809e-05, "loss": 0.0582, "step": 2190 }, { "grad_norm": 0.5058364272117615, "learning_rate": 9.902062725892976e-05, "loss": 0.0548, "step": 2200 }, { "grad_norm": 0.40742620825767517, "learning_rate": 9.900427730514834e-05, "loss": 0.0477, "step": 2210 }, { "grad_norm": 0.4977620542049408, "learning_rate": 9.8987793375544e-05, "loss": 0.0511, "step": 2220 }, { "grad_norm": 0.6374340653419495, "learning_rate": 9.897117551518318e-05, "loss": 0.0536, "step": 2230 }, { "grad_norm": 0.5005456805229187, "learning_rate": 9.895442376949844e-05, "loss": 0.0507, "step": 2240 }, { "grad_norm": 0.5211104154586792, "learning_rate": 9.893753818428845e-05, "loss": 0.0572, "step": 2250 }, { "grad_norm": 0.5128145813941956, "learning_rate": 9.892051880571773e-05, "loss": 0.0544, "step": 2260 }, { "grad_norm": 0.5326552391052246, "learning_rate": 9.890336568031663e-05, "loss": 0.0496, "step": 2270 }, { "grad_norm": 0.5715444684028625, "learning_rate": 9.888607885498113e-05, "loss": 0.0546, "step": 2280 }, { "grad_norm": 0.5557758808135986, "learning_rate": 9.886865837697275e-05, "loss": 0.0526, "step": 2290 }, { "grad_norm": 0.5583028793334961, "learning_rate": 9.88511042939184e-05, "loss": 0.0482, "step": 2300 }, { "grad_norm": 0.5574371218681335, "learning_rate": 9.883341665381028e-05, "loss": 0.0582, "step": 2310 }, { "grad_norm": 0.5746517777442932, "learning_rate": 9.881559550500575e-05, "loss": 0.0519, "step": 2320 }, { "grad_norm": 0.509598433971405, "learning_rate": 9.879764089622712e-05, "loss": 0.0626, "step": 2330 }, { "grad_norm": 0.5789699554443359, "learning_rate": 9.87795528765616e-05, "loss": 0.0565, "step": 2340 }, { "grad_norm": 0.5941703915596008, "learning_rate": 9.876133149546118e-05, "loss": 0.0597, "step": 2350 }, { "grad_norm": 0.5226916670799255, "learning_rate": 9.874297680274238e-05, "loss": 0.0642, "step": 2360 }, { "grad_norm": 0.5128180980682373, "learning_rate": 9.872448884858624e-05, "loss": 0.0581, "step": 2370 }, { "grad_norm": 0.557876706123352, "learning_rate": 9.870586768353815e-05, "loss": 0.0551, "step": 2380 }, { "grad_norm": 0.6180871725082397, "learning_rate": 9.868711335850764e-05, "loss": 0.0573, "step": 2390 }, { "grad_norm": 0.4662003815174103, "learning_rate": 9.866822592476833e-05, "loss": 0.0542, "step": 2400 }, { "grad_norm": 0.5943747758865356, "learning_rate": 9.86492054339577e-05, "loss": 0.0511, "step": 2410 }, { "grad_norm": 0.5587828755378723, "learning_rate": 9.863005193807711e-05, "loss": 0.0532, "step": 2420 }, { "grad_norm": 0.7192747592926025, "learning_rate": 9.861076548949143e-05, "loss": 0.0487, "step": 2430 }, { "grad_norm": 0.5292271375656128, "learning_rate": 9.859134614092912e-05, "loss": 0.0511, "step": 2440 }, { "grad_norm": 0.6912357807159424, "learning_rate": 9.857179394548191e-05, "loss": 0.0523, "step": 2450 }, { "grad_norm": 0.5890341401100159, "learning_rate": 9.855210895660477e-05, "loss": 0.057, "step": 2460 }, { "grad_norm": 0.5555132031440735, "learning_rate": 9.853229122811568e-05, "loss": 0.0606, "step": 2470 }, { "grad_norm": 0.6136529445648193, "learning_rate": 9.851234081419559e-05, "loss": 0.0555, "step": 2480 }, { "grad_norm": 0.5654100179672241, "learning_rate": 9.849225776938814e-05, "loss": 0.0495, "step": 2490 }, { "grad_norm": 0.4823625087738037, "learning_rate": 9.847204214859964e-05, "loss": 0.0557, "step": 2500 }, { "grad_norm": 0.5169696807861328, "learning_rate": 9.845169400709879e-05, "loss": 0.0518, "step": 2510 }, { "grad_norm": 0.5754597783088684, "learning_rate": 9.843121340051664e-05, "loss": 0.0503, "step": 2520 }, { "grad_norm": 0.45808571577072144, "learning_rate": 9.841060038484641e-05, "loss": 0.047, "step": 2530 }, { "grad_norm": 0.5391010046005249, "learning_rate": 9.838985501644328e-05, "loss": 0.0521, "step": 2540 }, { "grad_norm": 0.7032832503318787, "learning_rate": 9.83689773520243e-05, "loss": 0.0617, "step": 2550 }, { "grad_norm": 0.41807830333709717, "learning_rate": 9.834796744866819e-05, "loss": 0.0534, "step": 2560 }, { "grad_norm": 0.4748309850692749, "learning_rate": 9.832682536381525e-05, "loss": 0.0508, "step": 2570 }, { "grad_norm": 0.40600940585136414, "learning_rate": 9.830555115526711e-05, "loss": 0.0537, "step": 2580 }, { "grad_norm": 0.606266438961029, "learning_rate": 9.828414488118667e-05, "loss": 0.0549, "step": 2590 }, { "grad_norm": 0.5436404943466187, "learning_rate": 9.826260660009785e-05, "loss": 0.0528, "step": 2600 }, { "grad_norm": 0.44797390699386597, "learning_rate": 9.824093637088547e-05, "loss": 0.0528, "step": 2610 }, { "grad_norm": 0.5665544867515564, "learning_rate": 9.821913425279514e-05, "loss": 0.0513, "step": 2620 }, { "grad_norm": 0.504040002822876, "learning_rate": 9.8197200305433e-05, "loss": 0.054, "step": 2630 }, { "grad_norm": 0.4526163935661316, "learning_rate": 9.817513458876564e-05, "loss": 0.0442, "step": 2640 }, { "grad_norm": 0.6039993166923523, "learning_rate": 9.815293716311987e-05, "loss": 0.053, "step": 2650 }, { "grad_norm": 0.5020528435707092, "learning_rate": 9.813060808918262e-05, "loss": 0.061, "step": 2660 }, { "grad_norm": 0.5190415978431702, "learning_rate": 9.810814742800069e-05, "loss": 0.0547, "step": 2670 }, { "grad_norm": 0.6284607648849487, "learning_rate": 9.808555524098074e-05, "loss": 0.0641, "step": 2680 }, { "grad_norm": 0.4829862713813782, "learning_rate": 9.806283158988887e-05, "loss": 0.0522, "step": 2690 }, { "grad_norm": 0.4887554943561554, "learning_rate": 9.803997653685072e-05, "loss": 0.0543, "step": 2700 }, { "grad_norm": 0.4156060218811035, "learning_rate": 9.801699014435112e-05, "loss": 0.0518, "step": 2710 }, { "grad_norm": 0.5919528603553772, "learning_rate": 9.799387247523398e-05, "loss": 0.0508, "step": 2720 }, { "grad_norm": 0.5660161375999451, "learning_rate": 9.797062359270215e-05, "loss": 0.0505, "step": 2730 }, { "grad_norm": 0.4677155911922455, "learning_rate": 9.794724356031715e-05, "loss": 0.0521, "step": 2740 }, { "grad_norm": 0.4448791444301605, "learning_rate": 9.792373244199913e-05, "loss": 0.0523, "step": 2750 }, { "grad_norm": 0.6237162947654724, "learning_rate": 9.790009030202658e-05, "loss": 0.0554, "step": 2760 }, { "grad_norm": 0.7144814729690552, "learning_rate": 9.78763172050362e-05, "loss": 0.0589, "step": 2770 }, { "grad_norm": 0.5253874659538269, "learning_rate": 9.785241321602274e-05, "loss": 0.06, "step": 2780 }, { "grad_norm": 0.5301258563995361, "learning_rate": 9.782837840033879e-05, "loss": 0.0557, "step": 2790 }, { "grad_norm": 0.6269859671592712, "learning_rate": 9.780421282369461e-05, "loss": 0.0651, "step": 2800 }, { "grad_norm": 0.5636250376701355, "learning_rate": 9.777991655215797e-05, "loss": 0.0689, "step": 2810 }, { "grad_norm": 0.5588189959526062, "learning_rate": 9.775548965215394e-05, "loss": 0.0603, "step": 2820 }, { "grad_norm": 0.6025320887565613, "learning_rate": 9.773093219046474e-05, "loss": 0.0553, "step": 2830 }, { "grad_norm": 0.4741453528404236, "learning_rate": 9.770624423422954e-05, "loss": 0.0511, "step": 2840 }, { "grad_norm": 0.6581608653068542, "learning_rate": 9.768142585094426e-05, "loss": 0.0527, "step": 2850 }, { "grad_norm": 0.497335284948349, "learning_rate": 9.765647710846142e-05, "loss": 0.0509, "step": 2860 }, { "grad_norm": 0.41942527890205383, "learning_rate": 9.763139807498991e-05, "loss": 0.0494, "step": 2870 }, { "grad_norm": 0.573166012763977, "learning_rate": 9.760618881909487e-05, "loss": 0.0484, "step": 2880 }, { "grad_norm": 0.507034182548523, "learning_rate": 9.758084940969744e-05, "loss": 0.0458, "step": 2890 }, { "grad_norm": 0.4614987373352051, "learning_rate": 9.755537991607459e-05, "loss": 0.045, "step": 2900 }, { "grad_norm": 0.45924949645996094, "learning_rate": 9.752978040785895e-05, "loss": 0.0597, "step": 2910 }, { "grad_norm": 0.6361121535301208, "learning_rate": 9.750405095503859e-05, "loss": 0.0461, "step": 2920 }, { "grad_norm": 0.44877341389656067, "learning_rate": 9.747819162795686e-05, "loss": 0.0468, "step": 2930 }, { "grad_norm": 0.4664103388786316, "learning_rate": 9.745220249731217e-05, "loss": 0.0465, "step": 2940 }, { "grad_norm": 0.5465919971466064, "learning_rate": 9.742608363415781e-05, "loss": 0.0448, "step": 2950 }, { "grad_norm": 0.6341918706893921, "learning_rate": 9.739983510990176e-05, "loss": 0.0494, "step": 2960 }, { "grad_norm": 0.44346052408218384, "learning_rate": 9.737345699630647e-05, "loss": 0.0525, "step": 2970 }, { "grad_norm": 0.426369309425354, "learning_rate": 9.734694936548869e-05, "loss": 0.0442, "step": 2980 }, { "grad_norm": 0.5477653741836548, "learning_rate": 9.732031228991932e-05, "loss": 0.0466, "step": 2990 }, { "grad_norm": 0.672471821308136, "learning_rate": 9.729354584242302e-05, "loss": 0.0508, "step": 3000 }, { "grad_norm": 0.43908244371414185, "learning_rate": 9.726665009617832e-05, "loss": 0.05, "step": 3010 }, { "grad_norm": 0.44118788838386536, "learning_rate": 9.723962512471714e-05, "loss": 0.0517, "step": 3020 }, { "grad_norm": 0.4536677300930023, "learning_rate": 9.72124710019247e-05, "loss": 0.0456, "step": 3030 }, { "grad_norm": 0.48591578006744385, "learning_rate": 9.718518780203934e-05, "loss": 0.0429, "step": 3040 }, { "grad_norm": 0.6257529854774475, "learning_rate": 9.715777559965228e-05, "loss": 0.046, "step": 3050 }, { "grad_norm": 0.49266698956489563, "learning_rate": 9.713023446970746e-05, "loss": 0.048, "step": 3060 }, { "grad_norm": 0.44442063570022583, "learning_rate": 9.710256448750126e-05, "loss": 0.0488, "step": 3070 }, { "grad_norm": 0.5594804883003235, "learning_rate": 9.707476572868235e-05, "loss": 0.0489, "step": 3080 }, { "grad_norm": 0.41758275032043457, "learning_rate": 9.704683826925149e-05, "loss": 0.0478, "step": 3090 }, { "grad_norm": 0.6071073412895203, "learning_rate": 9.701878218556129e-05, "loss": 0.0474, "step": 3100 }, { "grad_norm": 0.4647288918495178, "learning_rate": 9.699059755431598e-05, "loss": 0.0486, "step": 3110 }, { "grad_norm": 0.5710163116455078, "learning_rate": 9.696228445257132e-05, "loss": 0.0498, "step": 3120 }, { "grad_norm": 0.4418345093727112, "learning_rate": 9.693384295773419e-05, "loss": 0.0513, "step": 3130 }, { "grad_norm": 0.4265054166316986, "learning_rate": 9.690527314756259e-05, "loss": 0.0501, "step": 3140 }, { "grad_norm": 0.47920045256614685, "learning_rate": 9.687657510016527e-05, "loss": 0.0458, "step": 3150 }, { "grad_norm": 0.4441242814064026, "learning_rate": 9.684774889400161e-05, "loss": 0.0442, "step": 3160 }, { "grad_norm": 0.5169118046760559, "learning_rate": 9.681879460788135e-05, "loss": 0.0552, "step": 3170 }, { "grad_norm": 0.44021064043045044, "learning_rate": 9.67897123209644e-05, "loss": 0.045, "step": 3180 }, { "grad_norm": 0.4735221564769745, "learning_rate": 9.676050211276062e-05, "loss": 0.0418, "step": 3190 }, { "grad_norm": 0.39998671412467957, "learning_rate": 9.673116406312962e-05, "loss": 0.0484, "step": 3200 }, { "grad_norm": 1.133259654045105, "learning_rate": 9.67016982522805e-05, "loss": 0.0479, "step": 3210 }, { "grad_norm": 0.45326775312423706, "learning_rate": 9.667210476077164e-05, "loss": 0.0424, "step": 3220 }, { "grad_norm": 0.3644816279411316, "learning_rate": 9.664238366951055e-05, "loss": 0.0453, "step": 3230 }, { "grad_norm": 0.45920780301094055, "learning_rate": 9.661253505975355e-05, "loss": 0.0408, "step": 3240 }, { "grad_norm": 0.4443128705024719, "learning_rate": 9.658255901310557e-05, "loss": 0.0398, "step": 3250 }, { "grad_norm": 0.4563174247741699, "learning_rate": 9.655245561152e-05, "loss": 0.0537, "step": 3260 }, { "grad_norm": 0.38739824295043945, "learning_rate": 9.65222249372984e-05, "loss": 0.0437, "step": 3270 }, { "grad_norm": 0.546059787273407, "learning_rate": 9.649186707309026e-05, "loss": 0.0439, "step": 3280 }, { "grad_norm": 0.4397445321083069, "learning_rate": 9.646138210189283e-05, "loss": 0.0455, "step": 3290 }, { "grad_norm": 0.48162055015563965, "learning_rate": 9.643077010705087e-05, "loss": 0.0449, "step": 3300 }, { "grad_norm": 0.47936204075813293, "learning_rate": 9.640003117225637e-05, "loss": 0.0418, "step": 3310 }, { "grad_norm": 0.45568278431892395, "learning_rate": 9.636916538154846e-05, "loss": 0.0392, "step": 3320 }, { "grad_norm": 0.48796817660331726, "learning_rate": 9.633817281931296e-05, "loss": 0.053, "step": 3330 }, { "grad_norm": 0.5864344239234924, "learning_rate": 9.630705357028242e-05, "loss": 0.049, "step": 3340 }, { "grad_norm": 0.43657219409942627, "learning_rate": 9.627580771953563e-05, "loss": 0.0576, "step": 3350 }, { "grad_norm": 0.4749419391155243, "learning_rate": 9.624443535249759e-05, "loss": 0.0495, "step": 3360 }, { "grad_norm": 0.5182581543922424, "learning_rate": 9.621293655493913e-05, "loss": 0.057, "step": 3370 }, { "grad_norm": 0.4281359910964966, "learning_rate": 9.618131141297675e-05, "loss": 0.0512, "step": 3380 }, { "grad_norm": 0.5009461045265198, "learning_rate": 9.614956001307242e-05, "loss": 0.0501, "step": 3390 }, { "grad_norm": 0.4162812829017639, "learning_rate": 9.611768244203321e-05, "loss": 0.0499, "step": 3400 }, { "grad_norm": 0.3371119499206543, "learning_rate": 9.60856787870112e-05, "loss": 0.043, "step": 3410 }, { "grad_norm": 0.49691930413246155, "learning_rate": 9.605354913550318e-05, "loss": 0.0452, "step": 3420 }, { "grad_norm": 0.4182418882846832, "learning_rate": 9.602129357535037e-05, "loss": 0.0445, "step": 3430 }, { "grad_norm": 0.44390904903411865, "learning_rate": 9.598891219473825e-05, "loss": 0.0449, "step": 3440 }, { "grad_norm": 0.40087294578552246, "learning_rate": 9.595640508219625e-05, "loss": 0.0423, "step": 3450 }, { "grad_norm": 0.46116670966148376, "learning_rate": 9.592377232659761e-05, "loss": 0.0473, "step": 3460 }, { "grad_norm": 0.4395975172519684, "learning_rate": 9.589101401715904e-05, "loss": 0.0413, "step": 3470 }, { "grad_norm": 0.4863179326057434, "learning_rate": 9.585813024344045e-05, "loss": 0.0457, "step": 3480 }, { "grad_norm": 0.4670794904232025, "learning_rate": 9.58251210953449e-05, "loss": 0.0441, "step": 3490 }, { "grad_norm": 0.47603800892829895, "learning_rate": 9.579198666311809e-05, "loss": 0.0519, "step": 3500 }, { "grad_norm": 1.9217393398284912, "learning_rate": 9.575872703734832e-05, "loss": 0.0474, "step": 3510 }, { "grad_norm": 0.34600338339805603, "learning_rate": 9.572534230896611e-05, "loss": 0.0431, "step": 3520 }, { "grad_norm": 0.6398482918739319, "learning_rate": 9.569183256924403e-05, "loss": 0.0447, "step": 3530 }, { "grad_norm": 0.4254852831363678, "learning_rate": 9.565819790979646e-05, "loss": 0.0506, "step": 3540 }, { "grad_norm": 0.4393067955970764, "learning_rate": 9.562443842257925e-05, "loss": 0.0469, "step": 3550 }, { "grad_norm": 1.2385284900665283, "learning_rate": 9.559055419988956e-05, "loss": 0.0509, "step": 3560 }, { "grad_norm": 0.5407046675682068, "learning_rate": 9.555654533436557e-05, "loss": 0.0547, "step": 3570 }, { "grad_norm": 0.5785049200057983, "learning_rate": 9.552241191898621e-05, "loss": 0.0534, "step": 3580 }, { "grad_norm": 0.38366270065307617, "learning_rate": 9.548815404707092e-05, "loss": 0.0464, "step": 3590 }, { "grad_norm": 0.3854336142539978, "learning_rate": 9.545377181227942e-05, "loss": 0.0488, "step": 3600 }, { "grad_norm": 0.4224337041378021, "learning_rate": 9.541926530861145e-05, "loss": 0.0412, "step": 3610 }, { "grad_norm": 0.5154219269752502, "learning_rate": 9.538463463040645e-05, "loss": 0.042, "step": 3620 }, { "grad_norm": 0.37926003336906433, "learning_rate": 9.534987987234337e-05, "loss": 0.0445, "step": 3630 }, { "grad_norm": 0.4577227234840393, "learning_rate": 9.53150011294404e-05, "loss": 0.0453, "step": 3640 }, { "grad_norm": 0.43031319975852966, "learning_rate": 9.527999849705471e-05, "loss": 0.0441, "step": 3650 }, { "grad_norm": 0.580282986164093, "learning_rate": 9.524487207088213e-05, "loss": 0.0403, "step": 3660 }, { "grad_norm": 0.5163167119026184, "learning_rate": 9.520962194695698e-05, "loss": 0.0414, "step": 3670 }, { "grad_norm": 0.5447613596916199, "learning_rate": 9.517424822165175e-05, "loss": 0.0493, "step": 3680 }, { "grad_norm": 0.43740320205688477, "learning_rate": 9.513875099167685e-05, "loss": 0.0472, "step": 3690 }, { "grad_norm": 0.4720941185951233, "learning_rate": 9.510313035408035e-05, "loss": 0.0439, "step": 3700 }, { "grad_norm": 0.4445101022720337, "learning_rate": 9.506738640624775e-05, "loss": 0.0467, "step": 3710 }, { "grad_norm": 0.4809243679046631, "learning_rate": 9.50315192459016e-05, "loss": 0.0479, "step": 3720 }, { "grad_norm": 0.4610294699668884, "learning_rate": 9.499552897110136e-05, "loss": 0.0401, "step": 3730 }, { "grad_norm": 0.37978461384773254, "learning_rate": 9.495941568024304e-05, "loss": 0.0514, "step": 3740 }, { "grad_norm": 0.5151900053024292, "learning_rate": 9.492317947205904e-05, "loss": 0.0443, "step": 3750 }, { "grad_norm": 0.6103503704071045, "learning_rate": 9.488682044561775e-05, "loss": 0.0452, "step": 3760 }, { "grad_norm": 0.4817036986351013, "learning_rate": 9.485033870032335e-05, "loss": 0.0491, "step": 3770 }, { "grad_norm": 0.4768231511116028, "learning_rate": 9.481373433591556e-05, "loss": 0.0405, "step": 3780 }, { "grad_norm": 0.44551756978034973, "learning_rate": 9.47770074524693e-05, "loss": 0.0435, "step": 3790 }, { "grad_norm": 0.49751347303390503, "learning_rate": 9.474015815039446e-05, "loss": 0.0507, "step": 3800 }, { "grad_norm": 0.5176432132720947, "learning_rate": 9.470318653043565e-05, "loss": 0.0453, "step": 3810 }, { "grad_norm": 0.4701424837112427, "learning_rate": 9.466609269367185e-05, "loss": 0.0488, "step": 3820 }, { "grad_norm": 0.38317883014678955, "learning_rate": 9.46288767415162e-05, "loss": 0.045, "step": 3830 }, { "grad_norm": 0.42292967438697815, "learning_rate": 9.459153877571567e-05, "loss": 0.0454, "step": 3840 }, { "grad_norm": 0.5141590237617493, "learning_rate": 9.455407889835087e-05, "loss": 0.0444, "step": 3850 }, { "grad_norm": 0.4151747226715088, "learning_rate": 9.451649721183564e-05, "loss": 0.046, "step": 3860 }, { "grad_norm": 0.446642130613327, "learning_rate": 9.447879381891692e-05, "loss": 0.0424, "step": 3870 }, { "grad_norm": 0.40734538435935974, "learning_rate": 9.444096882267428e-05, "loss": 0.0426, "step": 3880 }, { "grad_norm": 0.48704418540000916, "learning_rate": 9.440302232651988e-05, "loss": 0.0382, "step": 3890 }, { "grad_norm": 0.4663752317428589, "learning_rate": 9.436495443419795e-05, "loss": 0.0443, "step": 3900 }, { "grad_norm": 0.4256438612937927, "learning_rate": 9.432676524978466e-05, "loss": 0.041, "step": 3910 }, { "grad_norm": 0.37890625, "learning_rate": 9.42884548776878e-05, "loss": 0.0436, "step": 3920 }, { "grad_norm": 0.7094347476959229, "learning_rate": 9.425002342264646e-05, "loss": 0.0382, "step": 3930 }, { "grad_norm": 0.4241328835487366, "learning_rate": 9.421147098973077e-05, "loss": 0.0454, "step": 3940 }, { "grad_norm": 0.40152591466903687, "learning_rate": 9.41727976843416e-05, "loss": 0.0456, "step": 3950 }, { "grad_norm": 0.8632655143737793, "learning_rate": 9.413400361221029e-05, "loss": 0.0381, "step": 3960 }, { "grad_norm": 0.34784767031669617, "learning_rate": 9.409508887939835e-05, "loss": 0.0452, "step": 3970 }, { "grad_norm": 0.444382905960083, "learning_rate": 9.40560535922972e-05, "loss": 0.0413, "step": 3980 }, { "grad_norm": 0.395707368850708, "learning_rate": 9.40168978576278e-05, "loss": 0.0395, "step": 3990 }, { "grad_norm": 0.6966690421104431, "learning_rate": 9.397762178244043e-05, "loss": 0.042, "step": 4000 }, { "grad_norm": 0.39495572447776794, "learning_rate": 9.393822547411439e-05, "loss": 0.0409, "step": 4010 }, { "grad_norm": 0.5429093837738037, "learning_rate": 9.389870904035769e-05, "loss": 0.0402, "step": 4020 }, { "grad_norm": 0.4963032305240631, "learning_rate": 9.385907258920672e-05, "loss": 0.0447, "step": 4030 }, { "grad_norm": 0.5194328427314758, "learning_rate": 9.381931622902607e-05, "loss": 0.0426, "step": 4040 }, { "grad_norm": 0.46482375264167786, "learning_rate": 9.377944006850807e-05, "loss": 0.0444, "step": 4050 }, { "grad_norm": 0.42143937945365906, "learning_rate": 9.373944421667265e-05, "loss": 0.0496, "step": 4060 }, { "grad_norm": 0.4322550296783447, "learning_rate": 9.369932878286691e-05, "loss": 0.0461, "step": 4070 }, { "grad_norm": 0.8713535666465759, "learning_rate": 9.365909387676494e-05, "loss": 0.0529, "step": 4080 }, { "grad_norm": 0.4798184633255005, "learning_rate": 9.361873960836744e-05, "loss": 0.0627, "step": 4090 }, { "grad_norm": 0.40550848841667175, "learning_rate": 9.357826608800142e-05, "loss": 0.0565, "step": 4100 }, { "grad_norm": 0.3636028468608856, "learning_rate": 9.353767342631994e-05, "loss": 0.0476, "step": 4110 }, { "grad_norm": 0.6560397744178772, "learning_rate": 9.34969617343018e-05, "loss": 0.0499, "step": 4120 }, { "grad_norm": 0.5257948040962219, "learning_rate": 9.345613112325122e-05, "loss": 0.046, "step": 4130 }, { "grad_norm": 0.5162472128868103, "learning_rate": 9.34151817047975e-05, "loss": 0.0485, "step": 4140 }, { "grad_norm": 0.3595101535320282, "learning_rate": 9.33741135908948e-05, "loss": 0.0426, "step": 4150 }, { "grad_norm": 0.382739394903183, "learning_rate": 9.33329268938218e-05, "loss": 0.0425, "step": 4160 }, { "grad_norm": 0.542331337928772, "learning_rate": 9.329162172618132e-05, "loss": 0.046, "step": 4170 }, { "grad_norm": 0.4396301209926605, "learning_rate": 9.325019820090013e-05, "loss": 0.045, "step": 4180 }, { "grad_norm": 0.3951185941696167, "learning_rate": 9.320865643122855e-05, "loss": 0.0461, "step": 4190 }, { "grad_norm": 0.5389053821563721, "learning_rate": 9.316699653074023e-05, "loss": 0.0467, "step": 4200 }, { "grad_norm": 0.459313303232193, "learning_rate": 9.312521861333172e-05, "loss": 0.0443, "step": 4210 }, { "grad_norm": 0.38323265314102173, "learning_rate": 9.308332279322224e-05, "loss": 0.0399, "step": 4220 }, { "grad_norm": 0.45294708013534546, "learning_rate": 9.304130918495338e-05, "loss": 0.0449, "step": 4230 }, { "grad_norm": 0.9934446215629578, "learning_rate": 9.299917790338874e-05, "loss": 0.0639, "step": 4240 }, { "grad_norm": 0.5188084244728088, "learning_rate": 9.295692906371363e-05, "loss": 0.0669, "step": 4250 }, { "grad_norm": 0.6298515200614929, "learning_rate": 9.291456278143476e-05, "loss": 0.0541, "step": 4260 }, { "grad_norm": 0.5022754073143005, "learning_rate": 9.287207917237994e-05, "loss": 0.0488, "step": 4270 }, { "grad_norm": 0.34121814370155334, "learning_rate": 9.282947835269773e-05, "loss": 0.0458, "step": 4280 }, { "grad_norm": 0.45362961292266846, "learning_rate": 9.278676043885715e-05, "loss": 0.0416, "step": 4290 }, { "grad_norm": 0.9196850657463074, "learning_rate": 9.274392554764733e-05, "loss": 0.052, "step": 4300 }, { "grad_norm": 0.45195627212524414, "learning_rate": 9.270097379617723e-05, "loss": 0.0459, "step": 4310 }, { "grad_norm": 0.4067656397819519, "learning_rate": 9.26579053018753e-05, "loss": 0.0433, "step": 4320 }, { "grad_norm": 0.47101980447769165, "learning_rate": 9.261472018248918e-05, "loss": 0.0442, "step": 4330 }, { "grad_norm": 0.3946528434753418, "learning_rate": 9.25714185560853e-05, "loss": 0.0438, "step": 4340 }, { "grad_norm": 0.5415571331977844, "learning_rate": 9.252800054104868e-05, "loss": 0.0422, "step": 4350 }, { "grad_norm": 0.49813973903656006, "learning_rate": 9.248446625608252e-05, "loss": 0.0426, "step": 4360 }, { "grad_norm": 1.1277873516082764, "learning_rate": 9.244081582020789e-05, "loss": 0.0432, "step": 4370 }, { "grad_norm": 0.425519734621048, "learning_rate": 9.239704935276339e-05, "loss": 0.04, "step": 4380 }, { "grad_norm": 0.39227941632270813, "learning_rate": 9.235316697340489e-05, "loss": 0.0422, "step": 4390 }, { "grad_norm": 0.6046552658081055, "learning_rate": 9.230916880210512e-05, "loss": 0.0518, "step": 4400 }, { "grad_norm": 0.6059166789054871, "learning_rate": 9.226505495915342e-05, "loss": 0.0475, "step": 4410 }, { "grad_norm": 0.5048810839653015, "learning_rate": 9.222082556515536e-05, "loss": 0.0573, "step": 4420 }, { "grad_norm": 0.3837684094905853, "learning_rate": 9.217648074103242e-05, "loss": 0.0511, "step": 4430 }, { "grad_norm": 0.46948739886283875, "learning_rate": 9.213202060802161e-05, "loss": 0.051, "step": 4440 }, { "grad_norm": 0.40239736437797546, "learning_rate": 9.208744528767528e-05, "loss": 0.0417, "step": 4450 }, { "grad_norm": 0.3479786217212677, "learning_rate": 9.204275490186064e-05, "loss": 0.0426, "step": 4460 }, { "grad_norm": 0.3717691898345947, "learning_rate": 9.199794957275949e-05, "loss": 0.0386, "step": 4470 }, { "grad_norm": 0.40065625309944153, "learning_rate": 9.19530294228679e-05, "loss": 0.0411, "step": 4480 }, { "grad_norm": 0.41345635056495667, "learning_rate": 9.190799457499583e-05, "loss": 0.0404, "step": 4490 }, { "grad_norm": 0.4847007393836975, "learning_rate": 9.186284515226686e-05, "loss": 0.0426, "step": 4500 }, { "grad_norm": 0.48832204937934875, "learning_rate": 9.181758127811777e-05, "loss": 0.047, "step": 4510 }, { "grad_norm": 0.3796400725841522, "learning_rate": 9.177220307629825e-05, "loss": 0.0399, "step": 4520 }, { "grad_norm": 0.5174858570098877, "learning_rate": 9.172671067087059e-05, "loss": 0.0404, "step": 4530 }, { "grad_norm": 0.4230014383792877, "learning_rate": 9.16811041862093e-05, "loss": 0.0381, "step": 4540 }, { "grad_norm": 0.43190139532089233, "learning_rate": 9.163538374700076e-05, "loss": 0.0401, "step": 4550 }, { "grad_norm": 0.3337329626083374, "learning_rate": 9.158954947824287e-05, "loss": 0.0368, "step": 4560 }, { "grad_norm": 0.5180363059043884, "learning_rate": 9.154360150524482e-05, "loss": 0.041, "step": 4570 }, { "grad_norm": 0.49386608600616455, "learning_rate": 9.14975399536266e-05, "loss": 0.0394, "step": 4580 }, { "grad_norm": 0.48891502618789673, "learning_rate": 9.14513649493187e-05, "loss": 0.0373, "step": 4590 }, { "grad_norm": 0.5005327463150024, "learning_rate": 9.140507661856187e-05, "loss": 0.0393, "step": 4600 }, { "grad_norm": 0.4711502492427826, "learning_rate": 9.135867508790661e-05, "loss": 0.0424, "step": 4610 }, { "grad_norm": 0.5110882520675659, "learning_rate": 9.131216048421291e-05, "loss": 0.0435, "step": 4620 }, { "grad_norm": 0.4167346656322479, "learning_rate": 9.126553293464998e-05, "loss": 0.0427, "step": 4630 }, { "grad_norm": 0.39822709560394287, "learning_rate": 9.121879256669572e-05, "loss": 0.0423, "step": 4640 }, { "grad_norm": 0.3998677730560303, "learning_rate": 9.117193950813652e-05, "loss": 0.0439, "step": 4650 }, { "grad_norm": 0.46630972623825073, "learning_rate": 9.112497388706685e-05, "loss": 0.0558, "step": 4660 }, { "grad_norm": 0.5087593793869019, "learning_rate": 9.10778958318889e-05, "loss": 0.0441, "step": 4670 }, { "grad_norm": 0.3119197487831116, "learning_rate": 9.103070547131232e-05, "loss": 0.0387, "step": 4680 }, { "grad_norm": 0.46563538908958435, "learning_rate": 9.098340293435375e-05, "loss": 0.05, "step": 4690 }, { "grad_norm": 0.33818677067756653, "learning_rate": 9.093598835033649e-05, "loss": 0.0484, "step": 4700 }, { "grad_norm": 0.4015658497810364, "learning_rate": 9.088846184889021e-05, "loss": 0.041, "step": 4710 }, { "grad_norm": 0.4177631437778473, "learning_rate": 9.084082355995057e-05, "loss": 0.0406, "step": 4720 }, { "grad_norm": 0.427847295999527, "learning_rate": 9.079307361375882e-05, "loss": 0.0403, "step": 4730 }, { "grad_norm": 0.46063587069511414, "learning_rate": 9.074521214086149e-05, "loss": 0.0397, "step": 4740 }, { "grad_norm": 0.474584698677063, "learning_rate": 9.069723927211001e-05, "loss": 0.041, "step": 4750 }, { "grad_norm": 0.3593958020210266, "learning_rate": 9.064915513866037e-05, "loss": 0.0392, "step": 4760 }, { "grad_norm": 0.39750853180885315, "learning_rate": 9.060095987197279e-05, "loss": 0.042, "step": 4770 }, { "grad_norm": 0.39721569418907166, "learning_rate": 9.055265360381126e-05, "loss": 0.0449, "step": 4780 }, { "grad_norm": 0.6142474412918091, "learning_rate": 9.050423646624326e-05, "loss": 0.0403, "step": 4790 }, { "grad_norm": 0.49515292048454285, "learning_rate": 9.045570859163943e-05, "loss": 0.0394, "step": 4800 }, { "grad_norm": 0.4164809286594391, "learning_rate": 9.04070701126731e-05, "loss": 0.0387, "step": 4810 }, { "grad_norm": 0.3969566226005554, "learning_rate": 9.035832116232001e-05, "loss": 0.0354, "step": 4820 }, { "grad_norm": 0.483597069978714, "learning_rate": 9.030946187385796e-05, "loss": 0.0495, "step": 4830 }, { "grad_norm": 0.4349074363708496, "learning_rate": 9.026049238086635e-05, "loss": 0.0509, "step": 4840 }, { "grad_norm": 0.4559333920478821, "learning_rate": 9.021141281722591e-05, "loss": 0.049, "step": 4850 }, { "grad_norm": 0.4789091944694519, "learning_rate": 9.01622233171183e-05, "loss": 0.0444, "step": 4860 }, { "grad_norm": 0.3821337819099426, "learning_rate": 9.011292401502574e-05, "loss": 0.0386, "step": 4870 }, { "grad_norm": 0.3944632112979889, "learning_rate": 9.006351504573063e-05, "loss": 0.0433, "step": 4880 }, { "grad_norm": 0.3611961007118225, "learning_rate": 9.001399654431519e-05, "loss": 0.0424, "step": 4890 }, { "grad_norm": 0.5353875756263733, "learning_rate": 8.996436864616116e-05, "loss": 0.0391, "step": 4900 }, { "grad_norm": 0.3244038224220276, "learning_rate": 8.991463148694925e-05, "loss": 0.0374, "step": 4910 }, { "grad_norm": 0.3895438313484192, "learning_rate": 8.986478520265902e-05, "loss": 0.0351, "step": 4920 }, { "grad_norm": 0.3814556300640106, "learning_rate": 8.981482992956827e-05, "loss": 0.0402, "step": 4930 }, { "grad_norm": 0.5892817974090576, "learning_rate": 8.976476580425282e-05, "loss": 0.0398, "step": 4940 }, { "grad_norm": 0.3014744222164154, "learning_rate": 8.971459296358606e-05, "loss": 0.0463, "step": 4950 }, { "grad_norm": 0.27854928374290466, "learning_rate": 8.966431154473864e-05, "loss": 0.0342, "step": 4960 }, { "grad_norm": 0.4710148274898529, "learning_rate": 8.961392168517803e-05, "loss": 0.0435, "step": 4970 }, { "grad_norm": 0.4919661283493042, "learning_rate": 8.956342352266821e-05, "loss": 0.0432, "step": 4980 }, { "grad_norm": 0.40502259135246277, "learning_rate": 8.95128171952692e-05, "loss": 0.037, "step": 4990 }, { "grad_norm": 0.4463194012641907, "learning_rate": 8.946210284133676e-05, "loss": 0.039, "step": 5000 }, { "grad_norm": 0.34715715050697327, "learning_rate": 8.941128059952201e-05, "loss": 0.0369, "step": 5010 }, { "grad_norm": 0.45670318603515625, "learning_rate": 8.936035060877102e-05, "loss": 0.0391, "step": 5020 }, { "grad_norm": 0.3947446644306183, "learning_rate": 8.930931300832443e-05, "loss": 0.0382, "step": 5030 }, { "grad_norm": 0.38144880533218384, "learning_rate": 8.925816793771711e-05, "loss": 0.0347, "step": 5040 }, { "grad_norm": 0.4539853036403656, "learning_rate": 8.92069155367777e-05, "loss": 0.0388, "step": 5050 }, { "grad_norm": 0.4114759862422943, "learning_rate": 8.915555594562834e-05, "loss": 0.0369, "step": 5060 }, { "grad_norm": 0.392749547958374, "learning_rate": 8.910408930468416e-05, "loss": 0.0323, "step": 5070 }, { "grad_norm": 0.3783397674560547, "learning_rate": 8.905251575465303e-05, "loss": 0.0362, "step": 5080 }, { "grad_norm": 0.3663565516471863, "learning_rate": 8.900083543653502e-05, "loss": 0.0383, "step": 5090 }, { "grad_norm": 0.43638893961906433, "learning_rate": 8.894904849162218e-05, "loss": 0.0408, "step": 5100 }, { "grad_norm": 0.37828871607780457, "learning_rate": 8.889715506149802e-05, "loss": 0.035, "step": 5110 }, { "grad_norm": 0.5742018818855286, "learning_rate": 8.884515528803722e-05, "loss": 0.045, "step": 5120 }, { "grad_norm": 0.39653220772743225, "learning_rate": 8.879304931340517e-05, "loss": 0.0365, "step": 5130 }, { "grad_norm": 0.3888823091983795, "learning_rate": 8.874083728005759e-05, "loss": 0.0365, "step": 5140 }, { "grad_norm": 0.39621666073799133, "learning_rate": 8.868851933074021e-05, "loss": 0.0361, "step": 5150 }, { "grad_norm": 0.4101957082748413, "learning_rate": 8.863609560848829e-05, "loss": 0.0377, "step": 5160 }, { "grad_norm": 0.36506780982017517, "learning_rate": 8.85835662566263e-05, "loss": 0.0327, "step": 5170 }, { "grad_norm": 0.46655046939849854, "learning_rate": 8.853093141876747e-05, "loss": 0.0372, "step": 5180 }, { "grad_norm": 0.3714132606983185, "learning_rate": 8.847819123881343e-05, "loss": 0.0379, "step": 5190 }, { "grad_norm": 0.553723156452179, "learning_rate": 8.842534586095383e-05, "loss": 0.0465, "step": 5200 }, { "grad_norm": 0.4846991002559662, "learning_rate": 8.837239542966593e-05, "loss": 0.0398, "step": 5210 }, { "grad_norm": 0.4052688479423523, "learning_rate": 8.831934008971417e-05, "loss": 0.0408, "step": 5220 }, { "grad_norm": 0.5494813323020935, "learning_rate": 8.826617998614982e-05, "loss": 0.0445, "step": 5230 }, { "grad_norm": 0.4172828495502472, "learning_rate": 8.821291526431056e-05, "loss": 0.038, "step": 5240 }, { "grad_norm": 0.46248674392700195, "learning_rate": 8.815954606982015e-05, "loss": 0.0338, "step": 5250 }, { "grad_norm": 0.426893413066864, "learning_rate": 8.810607254858789e-05, "loss": 0.04, "step": 5260 }, { "grad_norm": 0.39890918135643005, "learning_rate": 8.805249484680838e-05, "loss": 0.0432, "step": 5270 }, { "grad_norm": 0.3608153462409973, "learning_rate": 8.799881311096096e-05, "loss": 0.0378, "step": 5280 }, { "grad_norm": 0.2826215326786041, "learning_rate": 8.794502748780949e-05, "loss": 0.0386, "step": 5290 }, { "grad_norm": 0.3257977068424225, "learning_rate": 8.78911381244018e-05, "loss": 0.0395, "step": 5300 }, { "grad_norm": 0.4238383173942566, "learning_rate": 8.783714516806933e-05, "loss": 0.0378, "step": 5310 }, { "grad_norm": 0.3874889612197876, "learning_rate": 8.77830487664268e-05, "loss": 0.0405, "step": 5320 }, { "grad_norm": 0.3335861265659332, "learning_rate": 8.772884906737167e-05, "loss": 0.0387, "step": 5330 }, { "grad_norm": 0.4150705337524414, "learning_rate": 8.767454621908387e-05, "loss": 0.0451, "step": 5340 }, { "grad_norm": 0.37715503573417664, "learning_rate": 8.76201403700253e-05, "loss": 0.0436, "step": 5350 }, { "grad_norm": 0.4905657172203064, "learning_rate": 8.756563166893949e-05, "loss": 0.0427, "step": 5360 }, { "grad_norm": 0.4148010015487671, "learning_rate": 8.751102026485113e-05, "loss": 0.0409, "step": 5370 }, { "grad_norm": 0.3371499180793762, "learning_rate": 8.745630630706571e-05, "loss": 0.0402, "step": 5380 }, { "grad_norm": 0.3362332880496979, "learning_rate": 8.740148994516912e-05, "loss": 0.0362, "step": 5390 }, { "grad_norm": 0.30496567487716675, "learning_rate": 8.73465713290272e-05, "loss": 0.0385, "step": 5400 }, { "grad_norm": 0.37439537048339844, "learning_rate": 8.729155060878533e-05, "loss": 0.0445, "step": 5410 }, { "grad_norm": 0.4871309995651245, "learning_rate": 8.723642793486809e-05, "loss": 0.0375, "step": 5420 }, { "grad_norm": 0.405536025762558, "learning_rate": 8.718120345797873e-05, "loss": 0.0477, "step": 5430 }, { "grad_norm": 0.49406737089157104, "learning_rate": 8.712587732909889e-05, "loss": 0.0526, "step": 5440 }, { "grad_norm": 0.503643274307251, "learning_rate": 8.707044969948806e-05, "loss": 0.0551, "step": 5450 }, { "grad_norm": 0.4378870725631714, "learning_rate": 8.701492072068329e-05, "loss": 0.0437, "step": 5460 }, { "grad_norm": 0.41509583592414856, "learning_rate": 8.695929054449869e-05, "loss": 0.0419, "step": 5470 }, { "grad_norm": 0.4161760210990906, "learning_rate": 8.690355932302501e-05, "loss": 0.0435, "step": 5480 }, { "grad_norm": 0.4520265460014343, "learning_rate": 8.684772720862931e-05, "loss": 0.0442, "step": 5490 }, { "grad_norm": 0.41613858938217163, "learning_rate": 8.679179435395446e-05, "loss": 0.0413, "step": 5500 }, { "grad_norm": 0.3705853521823883, "learning_rate": 8.673576091191874e-05, "loss": 0.0431, "step": 5510 }, { "grad_norm": 0.3045673668384552, "learning_rate": 8.667962703571541e-05, "loss": 0.0387, "step": 5520 }, { "grad_norm": 0.4056410491466522, "learning_rate": 8.662339287881238e-05, "loss": 0.0463, "step": 5530 }, { "grad_norm": 0.34872907400131226, "learning_rate": 8.656705859495169e-05, "loss": 0.0414, "step": 5540 }, { "grad_norm": 0.34907448291778564, "learning_rate": 8.651062433814912e-05, "loss": 0.0391, "step": 5550 }, { "grad_norm": 0.3630087077617645, "learning_rate": 8.645409026269375e-05, "loss": 0.0373, "step": 5560 }, { "grad_norm": 0.41194403171539307, "learning_rate": 8.639745652314759e-05, "loss": 0.0328, "step": 5570 }, { "grad_norm": 0.4016582667827606, "learning_rate": 8.634072327434515e-05, "loss": 0.04, "step": 5580 }, { "grad_norm": 0.2818276286125183, "learning_rate": 8.628389067139294e-05, "loss": 0.0355, "step": 5590 }, { "grad_norm": 0.3806859850883484, "learning_rate": 8.622695886966911e-05, "loss": 0.0359, "step": 5600 }, { "grad_norm": 0.42152050137519836, "learning_rate": 8.616992802482308e-05, "loss": 0.0366, "step": 5610 }, { "grad_norm": 0.4103473424911499, "learning_rate": 8.611279829277496e-05, "loss": 0.0354, "step": 5620 }, { "grad_norm": 0.4292067289352417, "learning_rate": 8.605556982971528e-05, "loss": 0.0327, "step": 5630 }, { "grad_norm": 0.3313475549221039, "learning_rate": 8.599824279210447e-05, "loss": 0.0393, "step": 5640 }, { "grad_norm": 0.5613791942596436, "learning_rate": 8.594081733667243e-05, "loss": 0.0309, "step": 5650 }, { "grad_norm": 0.3207419216632843, "learning_rate": 8.58832936204182e-05, "loss": 0.0355, "step": 5660 }, { "grad_norm": 0.3897148072719574, "learning_rate": 8.582567180060942e-05, "loss": 0.0348, "step": 5670 }, { "grad_norm": 0.380885511636734, "learning_rate": 8.576795203478194e-05, "loss": 0.0391, "step": 5680 }, { "grad_norm": 0.5935137867927551, "learning_rate": 8.571013448073939e-05, "loss": 0.0398, "step": 5690 }, { "grad_norm": 0.3901899456977844, "learning_rate": 8.565221929655275e-05, "loss": 0.0352, "step": 5700 }, { "grad_norm": 0.5892617106437683, "learning_rate": 8.559420664055992e-05, "loss": 0.0423, "step": 5710 }, { "grad_norm": 0.38492685556411743, "learning_rate": 8.553609667136532e-05, "loss": 0.0449, "step": 5720 }, { "grad_norm": 1.2360210418701172, "learning_rate": 8.547788954783936e-05, "loss": 0.0467, "step": 5730 }, { "grad_norm": 0.5050014853477478, "learning_rate": 8.541958542911808e-05, "loss": 0.0383, "step": 5740 }, { "grad_norm": 0.34547159075737, "learning_rate": 8.536118447460275e-05, "loss": 0.0366, "step": 5750 }, { "grad_norm": 0.3743572235107422, "learning_rate": 8.530268684395932e-05, "loss": 0.0345, "step": 5760 }, { "grad_norm": 5.568979740142822, "learning_rate": 8.524409269711807e-05, "loss": 0.0527, "step": 5770 }, { "grad_norm": 0.2714713215827942, "learning_rate": 8.51854021942732e-05, "loss": 0.0443, "step": 5780 }, { "grad_norm": 0.39137327671051025, "learning_rate": 8.512661549588227e-05, "loss": 0.0368, "step": 5790 }, { "grad_norm": 0.37156957387924194, "learning_rate": 8.506773276266588e-05, "loss": 0.0325, "step": 5800 }, { "grad_norm": 0.23236903548240662, "learning_rate": 8.500875415560721e-05, "loss": 0.0316, "step": 5810 }, { "grad_norm": 0.38974717259407043, "learning_rate": 8.494967983595144e-05, "loss": 0.03, "step": 5820 }, { "grad_norm": 0.4023657739162445, "learning_rate": 8.489050996520558e-05, "loss": 0.0371, "step": 5830 }, { "grad_norm": 0.436508446931839, "learning_rate": 8.483124470513775e-05, "loss": 0.0336, "step": 5840 }, { "grad_norm": 0.37176352739334106, "learning_rate": 8.477188421777692e-05, "loss": 0.0413, "step": 5850 }, { "grad_norm": 0.4424162805080414, "learning_rate": 8.47124286654124e-05, "loss": 0.0335, "step": 5860 }, { "grad_norm": 0.3536135256290436, "learning_rate": 8.465287821059341e-05, "loss": 0.0353, "step": 5870 }, { "grad_norm": 0.3289702534675598, "learning_rate": 8.45932330161286e-05, "loss": 0.0441, "step": 5880 }, { "grad_norm": 0.7082564234733582, "learning_rate": 8.453349324508567e-05, "loss": 0.0339, "step": 5890 }, { "grad_norm": 0.3945935368537903, "learning_rate": 8.447365906079088e-05, "loss": 0.0368, "step": 5900 }, { "grad_norm": 0.32752901315689087, "learning_rate": 8.441373062682856e-05, "loss": 0.0379, "step": 5910 }, { "grad_norm": 0.30462390184402466, "learning_rate": 8.43537081070408e-05, "loss": 0.0349, "step": 5920 }, { "grad_norm": 0.36157065629959106, "learning_rate": 8.429359166552689e-05, "loss": 0.0304, "step": 5930 }, { "grad_norm": 0.486625999212265, "learning_rate": 8.423338146664284e-05, "loss": 0.0336, "step": 5940 }, { "grad_norm": 0.3731265664100647, "learning_rate": 8.417307767500107e-05, "loss": 0.0317, "step": 5950 }, { "grad_norm": 0.4834536015987396, "learning_rate": 8.411268045546983e-05, "loss": 0.0352, "step": 5960 }, { "grad_norm": 0.36277201771736145, "learning_rate": 8.405218997317281e-05, "loss": 0.0323, "step": 5970 }, { "grad_norm": 0.4757532477378845, "learning_rate": 8.399160639348869e-05, "loss": 0.0352, "step": 5980 }, { "grad_norm": 0.4011819362640381, "learning_rate": 8.393092988205065e-05, "loss": 0.0337, "step": 5990 }, { "grad_norm": 0.3650408983230591, "learning_rate": 8.387016060474597e-05, "loss": 0.0353, "step": 6000 }, { "grad_norm": 0.3456519544124603, "learning_rate": 8.380929872771551e-05, "loss": 0.0322, "step": 6010 }, { "grad_norm": 0.4524834156036377, "learning_rate": 8.374834441735335e-05, "loss": 0.0344, "step": 6020 }, { "grad_norm": 0.3826424479484558, "learning_rate": 8.368729784030622e-05, "loss": 0.0344, "step": 6030 }, { "grad_norm": 0.3802699148654938, "learning_rate": 8.362615916347315e-05, "loss": 0.0375, "step": 6040 }, { "grad_norm": 0.35971370339393616, "learning_rate": 8.356492855400493e-05, "loss": 0.0311, "step": 6050 }, { "grad_norm": 0.2886114716529846, "learning_rate": 8.350360617930371e-05, "loss": 0.0345, "step": 6060 }, { "grad_norm": 0.41874226927757263, "learning_rate": 8.344219220702255e-05, "loss": 0.0342, "step": 6070 }, { "grad_norm": 0.5171942710876465, "learning_rate": 8.338068680506485e-05, "loss": 0.0385, "step": 6080 }, { "grad_norm": 0.32878658175468445, "learning_rate": 8.33190901415841e-05, "loss": 0.0354, "step": 6090 }, { "grad_norm": 0.502534031867981, "learning_rate": 8.325740238498317e-05, "loss": 0.0346, "step": 6100 }, { "grad_norm": 0.38916486501693726, "learning_rate": 8.319562370391406e-05, "loss": 0.0339, "step": 6110 }, { "grad_norm": 0.4702387750148773, "learning_rate": 8.31337542672773e-05, "loss": 0.0368, "step": 6120 }, { "grad_norm": 0.3787069618701935, "learning_rate": 8.307179424422158e-05, "loss": 0.0344, "step": 6130 }, { "grad_norm": 0.2693973183631897, "learning_rate": 8.300974380414327e-05, "loss": 0.0349, "step": 6140 }, { "grad_norm": 0.4137895107269287, "learning_rate": 8.294760311668586e-05, "loss": 0.0307, "step": 6150 }, { "grad_norm": 0.35249635577201843, "learning_rate": 8.288537235173961e-05, "loss": 0.0372, "step": 6160 }, { "grad_norm": 0.33685654401779175, "learning_rate": 8.282305167944108e-05, "loss": 0.034, "step": 6170 }, { "grad_norm": 0.3189336061477661, "learning_rate": 8.276064127017262e-05, "loss": 0.0371, "step": 6180 }, { "grad_norm": 0.43651726841926575, "learning_rate": 8.269814129456189e-05, "loss": 0.0372, "step": 6190 }, { "grad_norm": 0.3284255862236023, "learning_rate": 8.263555192348143e-05, "loss": 0.0339, "step": 6200 }, { "grad_norm": 0.3762272298336029, "learning_rate": 8.257287332804819e-05, "loss": 0.0355, "step": 6210 }, { "grad_norm": 0.34051746129989624, "learning_rate": 8.251010567962307e-05, "loss": 0.0335, "step": 6220 }, { "grad_norm": 0.377837210893631, "learning_rate": 8.244724914981041e-05, "loss": 0.0368, "step": 6230 }, { "grad_norm": 0.4306730329990387, "learning_rate": 8.238430391045757e-05, "loss": 0.0341, "step": 6240 }, { "grad_norm": 0.3810703456401825, "learning_rate": 8.232127013365445e-05, "loss": 0.0341, "step": 6250 }, { "grad_norm": 0.35435038805007935, "learning_rate": 8.225814799173295e-05, "loss": 0.0345, "step": 6260 }, { "grad_norm": 1.161548137664795, "learning_rate": 8.219493765726663e-05, "loss": 0.0354, "step": 6270 }, { "grad_norm": 0.35907721519470215, "learning_rate": 8.21316393030701e-05, "loss": 0.0319, "step": 6280 }, { "grad_norm": 0.30492573976516724, "learning_rate": 8.206825310219865e-05, "loss": 0.0349, "step": 6290 }, { "grad_norm": 0.33738401532173157, "learning_rate": 8.200477922794776e-05, "loss": 0.0376, "step": 6300 }, { "grad_norm": 0.4277554452419281, "learning_rate": 8.194121785385256e-05, "loss": 0.0368, "step": 6310 }, { "grad_norm": 0.43897178769111633, "learning_rate": 8.187756915368741e-05, "loss": 0.0333, "step": 6320 }, { "grad_norm": 0.3560790717601776, "learning_rate": 8.181383330146544e-05, "loss": 0.0319, "step": 6330 }, { "grad_norm": 0.5994824767112732, "learning_rate": 8.175001047143804e-05, "loss": 0.0361, "step": 6340 }, { "grad_norm": 0.40710535645484924, "learning_rate": 8.168610083809438e-05, "loss": 0.0369, "step": 6350 }, { "grad_norm": 0.5599947571754456, "learning_rate": 8.162210457616095e-05, "loss": 0.0383, "step": 6360 }, { "grad_norm": 0.4856725335121155, "learning_rate": 8.155802186060109e-05, "loss": 0.0395, "step": 6370 }, { "grad_norm": 0.6037003397941589, "learning_rate": 8.149385286661453e-05, "loss": 0.0386, "step": 6380 }, { "grad_norm": 0.408742219209671, "learning_rate": 8.14295977696368e-05, "loss": 0.0403, "step": 6390 }, { "grad_norm": 0.5326646566390991, "learning_rate": 8.13652567453389e-05, "loss": 0.0386, "step": 6400 }, { "grad_norm": 0.40859511494636536, "learning_rate": 8.130082996962676e-05, "loss": 0.0383, "step": 6410 }, { "grad_norm": 0.49652180075645447, "learning_rate": 8.123631761864068e-05, "loss": 0.0413, "step": 6420 }, { "grad_norm": 0.3600600063800812, "learning_rate": 8.1171719868755e-05, "loss": 0.0381, "step": 6430 }, { "grad_norm": 0.5106244087219238, "learning_rate": 8.110703689657748e-05, "loss": 0.0348, "step": 6440 }, { "grad_norm": 0.36146947741508484, "learning_rate": 8.104226887894892e-05, "loss": 0.0357, "step": 6450 }, { "grad_norm": 0.484184592962265, "learning_rate": 8.097741599294257e-05, "loss": 0.0389, "step": 6460 }, { "grad_norm": 0.4100930094718933, "learning_rate": 8.091247841586378e-05, "loss": 0.0422, "step": 6470 }, { "grad_norm": 0.6038318872451782, "learning_rate": 8.084745632524939e-05, "loss": 0.0379, "step": 6480 }, { "grad_norm": 0.3916587233543396, "learning_rate": 8.07823498988673e-05, "loss": 0.0382, "step": 6490 }, { "grad_norm": 0.48386287689208984, "learning_rate": 8.071715931471602e-05, "loss": 0.0367, "step": 6500 }, { "grad_norm": 0.30906492471694946, "learning_rate": 8.06518847510241e-05, "loss": 0.0381, "step": 6510 }, { "grad_norm": 0.38752448558807373, "learning_rate": 8.058652638624971e-05, "loss": 0.0373, "step": 6520 }, { "grad_norm": 0.2790892422199249, "learning_rate": 8.052108439908013e-05, "loss": 0.0329, "step": 6530 }, { "grad_norm": 0.3137839734554291, "learning_rate": 8.045555896843125e-05, "loss": 0.0324, "step": 6540 }, { "grad_norm": 0.31749773025512695, "learning_rate": 8.03899502734471e-05, "loss": 0.0346, "step": 6550 }, { "grad_norm": 0.4200252592563629, "learning_rate": 8.032425849349931e-05, "loss": 0.0342, "step": 6560 }, { "grad_norm": 0.35133570432662964, "learning_rate": 8.025848380818674e-05, "loss": 0.0367, "step": 6570 }, { "grad_norm": 0.3877725601196289, "learning_rate": 8.019262639733487e-05, "loss": 0.0405, "step": 6580 }, { "grad_norm": 0.34701603651046753, "learning_rate": 8.012668644099531e-05, "loss": 0.0315, "step": 6590 }, { "grad_norm": 0.3548680245876312, "learning_rate": 8.006066411944542e-05, "loss": 0.0324, "step": 6600 }, { "grad_norm": 0.3623726963996887, "learning_rate": 7.999455961318769e-05, "loss": 0.03, "step": 6610 }, { "grad_norm": 0.3317916989326477, "learning_rate": 7.992837310294932e-05, "loss": 0.0316, "step": 6620 }, { "grad_norm": 0.43674328923225403, "learning_rate": 7.986210476968167e-05, "loss": 0.0364, "step": 6630 }, { "grad_norm": 0.35376888513565063, "learning_rate": 7.97957547945599e-05, "loss": 0.033, "step": 6640 }, { "grad_norm": 0.3694382607936859, "learning_rate": 7.972932335898226e-05, "loss": 0.0291, "step": 6650 }, { "grad_norm": 0.277803897857666, "learning_rate": 7.966281064456975e-05, "loss": 0.0321, "step": 6660 }, { "grad_norm": 0.406296968460083, "learning_rate": 7.959621683316563e-05, "loss": 0.0294, "step": 6670 }, { "grad_norm": 0.3531656861305237, "learning_rate": 7.952954210683481e-05, "loss": 0.0363, "step": 6680 }, { "grad_norm": 0.407413512468338, "learning_rate": 7.946278664786345e-05, "loss": 0.0343, "step": 6690 }, { "grad_norm": 0.28382694721221924, "learning_rate": 7.939595063875842e-05, "loss": 0.0319, "step": 6700 }, { "grad_norm": 0.3440980613231659, "learning_rate": 7.932903426224683e-05, "loss": 0.0291, "step": 6710 }, { "grad_norm": 0.32926392555236816, "learning_rate": 7.926203770127552e-05, "loss": 0.0332, "step": 6720 }, { "grad_norm": 0.4076130986213684, "learning_rate": 7.919496113901046e-05, "loss": 0.035, "step": 6730 }, { "grad_norm": 0.3117741346359253, "learning_rate": 7.912780475883649e-05, "loss": 0.0345, "step": 6740 }, { "grad_norm": 0.34045708179473877, "learning_rate": 7.906056874435652e-05, "loss": 0.0328, "step": 6750 }, { "grad_norm": 0.3797502815723419, "learning_rate": 7.899325327939131e-05, "loss": 0.0301, "step": 6760 }, { "grad_norm": 0.34619173407554626, "learning_rate": 7.892585854797872e-05, "loss": 0.031, "step": 6770 }, { "grad_norm": 0.3594895005226135, "learning_rate": 7.88583847343734e-05, "loss": 0.0291, "step": 6780 }, { "grad_norm": 0.35388457775115967, "learning_rate": 7.879083202304616e-05, "loss": 0.0303, "step": 6790 }, { "grad_norm": 0.3556769788265228, "learning_rate": 7.872320059868355e-05, "loss": 0.0346, "step": 6800 }, { "grad_norm": 0.404356449842453, "learning_rate": 7.865549064618729e-05, "loss": 0.0305, "step": 6810 }, { "grad_norm": 0.38205862045288086, "learning_rate": 7.858770235067381e-05, "loss": 0.0311, "step": 6820 }, { "grad_norm": 0.32936254143714905, "learning_rate": 7.851983589747374e-05, "loss": 0.0285, "step": 6830 }, { "grad_norm": 0.36234426498413086, "learning_rate": 7.845189147213133e-05, "loss": 0.0309, "step": 6840 }, { "grad_norm": 0.28015509247779846, "learning_rate": 7.838386926040407e-05, "loss": 0.0301, "step": 6850 }, { "grad_norm": 0.5102009773254395, "learning_rate": 7.83157694482621e-05, "loss": 0.0359, "step": 6860 }, { "grad_norm": 0.447303831577301, "learning_rate": 7.824759222188768e-05, "loss": 0.0358, "step": 6870 }, { "grad_norm": 0.6260831356048584, "learning_rate": 7.817933776767478e-05, "loss": 0.0432, "step": 6880 }, { "grad_norm": 0.44483914971351624, "learning_rate": 7.811100627222842e-05, "loss": 0.0497, "step": 6890 }, { "grad_norm": 0.6558358669281006, "learning_rate": 7.804259792236435e-05, "loss": 0.0474, "step": 6900 }, { "grad_norm": 0.4399555027484894, "learning_rate": 7.797411290510835e-05, "loss": 0.0434, "step": 6910 }, { "grad_norm": 0.4496051073074341, "learning_rate": 7.790555140769586e-05, "loss": 0.0443, "step": 6920 }, { "grad_norm": 0.6381503939628601, "learning_rate": 7.78369136175714e-05, "loss": 0.0364, "step": 6930 }, { "grad_norm": 0.493427574634552, "learning_rate": 7.776819972238806e-05, "loss": 0.032, "step": 6940 }, { "grad_norm": 0.4286026656627655, "learning_rate": 7.7699409910007e-05, "loss": 0.0433, "step": 6950 }, { "grad_norm": 0.4236827492713928, "learning_rate": 7.763054436849694e-05, "loss": 0.0394, "step": 6960 }, { "grad_norm": 0.48710957169532776, "learning_rate": 7.756160328613364e-05, "loss": 0.0398, "step": 6970 }, { "grad_norm": 0.42337432503700256, "learning_rate": 7.749258685139942e-05, "loss": 0.0358, "step": 6980 }, { "grad_norm": 0.3803822696208954, "learning_rate": 7.742349525298253e-05, "loss": 0.0404, "step": 6990 }, { "grad_norm": 0.4692521095275879, "learning_rate": 7.735432867977679e-05, "loss": 0.0366, "step": 7000 }, { "grad_norm": 0.4516281485557556, "learning_rate": 7.728508732088096e-05, "loss": 0.0389, "step": 7010 }, { "grad_norm": 0.5765016078948975, "learning_rate": 7.721577136559825e-05, "loss": 0.0329, "step": 7020 }, { "grad_norm": 0.2870192229747772, "learning_rate": 7.714638100343588e-05, "loss": 0.0343, "step": 7030 }, { "grad_norm": 0.340056449174881, "learning_rate": 7.707691642410444e-05, "loss": 0.0327, "step": 7040 }, { "grad_norm": 0.371625691652298, "learning_rate": 7.70073778175174e-05, "loss": 0.0398, "step": 7050 }, { "grad_norm": 0.3383755683898926, "learning_rate": 7.69377653737907e-05, "loss": 0.0332, "step": 7060 }, { "grad_norm": 0.5258527994155884, "learning_rate": 7.686807928324209e-05, "loss": 0.0343, "step": 7070 }, { "grad_norm": 0.3286188840866089, "learning_rate": 7.679831973639065e-05, "loss": 0.0281, "step": 7080 }, { "grad_norm": 0.37425264716148376, "learning_rate": 7.672848692395637e-05, "loss": 0.0374, "step": 7090 }, { "grad_norm": 0.32992023229599, "learning_rate": 7.665858103685944e-05, "loss": 0.0378, "step": 7100 }, { "grad_norm": 0.49898242950439453, "learning_rate": 7.658860226621991e-05, "loss": 0.0384, "step": 7110 }, { "grad_norm": 0.3310796022415161, "learning_rate": 7.651855080335708e-05, "loss": 0.0329, "step": 7120 }, { "grad_norm": 0.39829936623573303, "learning_rate": 7.644842683978896e-05, "loss": 0.0307, "step": 7130 }, { "grad_norm": 0.3717612326145172, "learning_rate": 7.63782305672318e-05, "loss": 0.0358, "step": 7140 }, { "grad_norm": 0.34906908869743347, "learning_rate": 7.63079621775995e-05, "loss": 0.0389, "step": 7150 }, { "grad_norm": 0.2967912554740906, "learning_rate": 7.623762186300319e-05, "loss": 0.0355, "step": 7160 }, { "grad_norm": 0.2750263512134552, "learning_rate": 7.616720981575057e-05, "loss": 0.0316, "step": 7170 }, { "grad_norm": 0.24674014747142792, "learning_rate": 7.609672622834552e-05, "loss": 0.0309, "step": 7180 }, { "grad_norm": 0.3141728937625885, "learning_rate": 7.602617129348747e-05, "loss": 0.0377, "step": 7190 }, { "grad_norm": 0.31950342655181885, "learning_rate": 7.595554520407088e-05, "loss": 0.0322, "step": 7200 }, { "grad_norm": 0.3073623776435852, "learning_rate": 7.588484815318484e-05, "loss": 0.0282, "step": 7210 }, { "grad_norm": 0.8250489830970764, "learning_rate": 7.581408033411234e-05, "loss": 0.0359, "step": 7220 }, { "grad_norm": 0.2683827877044678, "learning_rate": 7.574324194032995e-05, "loss": 0.0306, "step": 7230 }, { "grad_norm": 0.32709038257598877, "learning_rate": 7.567233316550705e-05, "loss": 0.041, "step": 7240 }, { "grad_norm": 0.3046388328075409, "learning_rate": 7.560135420350562e-05, "loss": 0.0317, "step": 7250 }, { "grad_norm": 0.3633132874965668, "learning_rate": 7.553030524837935e-05, "loss": 0.0329, "step": 7260 }, { "grad_norm": 0.36311909556388855, "learning_rate": 7.545918649437341e-05, "loss": 0.0319, "step": 7270 }, { "grad_norm": 0.43964293599128723, "learning_rate": 7.538799813592377e-05, "loss": 0.0342, "step": 7280 }, { "grad_norm": 0.324747771024704, "learning_rate": 7.531674036765662e-05, "loss": 0.0346, "step": 7290 }, { "grad_norm": 0.3536224067211151, "learning_rate": 7.524541338438807e-05, "loss": 0.0343, "step": 7300 }, { "grad_norm": 0.3991186022758484, "learning_rate": 7.517401738112328e-05, "loss": 0.0362, "step": 7310 }, { "grad_norm": 0.401061087846756, "learning_rate": 7.510255255305628e-05, "loss": 0.0321, "step": 7320 }, { "grad_norm": 0.6150668263435364, "learning_rate": 7.503101909556911e-05, "loss": 0.0363, "step": 7330 }, { "grad_norm": 0.4203510880470276, "learning_rate": 7.495941720423154e-05, "loss": 0.0333, "step": 7340 }, { "grad_norm": 0.36976158618927, "learning_rate": 7.488774707480042e-05, "loss": 0.0307, "step": 7350 }, { "grad_norm": 0.3458176553249359, "learning_rate": 7.481600890321911e-05, "loss": 0.034, "step": 7360 }, { "grad_norm": 0.396194726228714, "learning_rate": 7.474420288561708e-05, "loss": 0.031, "step": 7370 }, { "grad_norm": 0.305926650762558, "learning_rate": 7.467232921830921e-05, "loss": 0.0297, "step": 7380 }, { "grad_norm": 0.2550731599330902, "learning_rate": 7.460038809779537e-05, "loss": 0.0322, "step": 7390 }, { "grad_norm": 0.27800625562667847, "learning_rate": 7.452837972075983e-05, "loss": 0.0309, "step": 7400 }, { "grad_norm": 0.5258330702781677, "learning_rate": 7.445630428407074e-05, "loss": 0.03, "step": 7410 }, { "grad_norm": 0.2773568034172058, "learning_rate": 7.43841619847796e-05, "loss": 0.0338, "step": 7420 }, { "grad_norm": 0.40147870779037476, "learning_rate": 7.431195302012072e-05, "loss": 0.0289, "step": 7430 }, { "grad_norm": 0.26147976517677307, "learning_rate": 7.423967758751061e-05, "loss": 0.0298, "step": 7440 }, { "grad_norm": 0.44134649634361267, "learning_rate": 7.416733588454758e-05, "loss": 0.0268, "step": 7450 }, { "grad_norm": 0.3664838373661041, "learning_rate": 7.409492810901106e-05, "loss": 0.031, "step": 7460 }, { "grad_norm": 0.3707990050315857, "learning_rate": 7.402245445886116e-05, "loss": 0.0282, "step": 7470 }, { "grad_norm": 0.2946252226829529, "learning_rate": 7.394991513223806e-05, "loss": 0.034, "step": 7480 }, { "grad_norm": 0.384781152009964, "learning_rate": 7.38773103274615e-05, "loss": 0.0378, "step": 7490 }, { "grad_norm": 0.3428117334842682, "learning_rate": 7.380464024303028e-05, "loss": 0.0301, "step": 7500 }, { "grad_norm": 0.3943226933479309, "learning_rate": 7.373190507762162e-05, "loss": 0.0289, "step": 7510 }, { "grad_norm": 0.26884499192237854, "learning_rate": 7.365910503009066e-05, "loss": 0.0321, "step": 7520 }, { "grad_norm": 0.3503260612487793, "learning_rate": 7.358624029946996e-05, "loss": 0.0331, "step": 7530 }, { "grad_norm": 0.26951298117637634, "learning_rate": 7.351331108496893e-05, "loss": 0.0263, "step": 7540 }, { "grad_norm": 0.2702380120754242, "learning_rate": 7.344031758597325e-05, "loss": 0.0263, "step": 7550 }, { "grad_norm": 0.33218735456466675, "learning_rate": 7.336726000204435e-05, "loss": 0.0251, "step": 7560 }, { "grad_norm": 0.23734387755393982, "learning_rate": 7.32941385329189e-05, "loss": 0.0276, "step": 7570 }, { "grad_norm": 0.24142706394195557, "learning_rate": 7.322095337850816e-05, "loss": 0.0288, "step": 7580 }, { "grad_norm": 0.36921820044517517, "learning_rate": 7.314770473889758e-05, "loss": 0.0301, "step": 7590 }, { "grad_norm": 0.33638978004455566, "learning_rate": 7.307439281434615e-05, "loss": 0.0291, "step": 7600 }, { "grad_norm": 0.2780473232269287, "learning_rate": 7.300101780528585e-05, "loss": 0.0246, "step": 7610 }, { "grad_norm": 0.3212972581386566, "learning_rate": 7.292757991232117e-05, "loss": 0.0292, "step": 7620 }, { "grad_norm": 0.27114197611808777, "learning_rate": 7.285407933622848e-05, "loss": 0.0312, "step": 7630 }, { "grad_norm": 0.2963033616542816, "learning_rate": 7.278051627795557e-05, "loss": 0.0313, "step": 7640 }, { "grad_norm": 0.3000950217247009, "learning_rate": 7.270689093862105e-05, "loss": 0.0342, "step": 7650 }, { "grad_norm": 0.29192501306533813, "learning_rate": 7.263320351951374e-05, "loss": 0.0314, "step": 7660 }, { "grad_norm": 0.4112901985645294, "learning_rate": 7.255945422209227e-05, "loss": 0.0318, "step": 7670 }, { "grad_norm": 0.31356680393218994, "learning_rate": 7.248564324798437e-05, "loss": 0.0289, "step": 7680 }, { "grad_norm": 0.32500433921813965, "learning_rate": 7.241177079898644e-05, "loss": 0.0289, "step": 7690 }, { "grad_norm": 0.3732810616493225, "learning_rate": 7.233783707706295e-05, "loss": 0.0262, "step": 7700 }, { "grad_norm": 0.3768092095851898, "learning_rate": 7.226384228434586e-05, "loss": 0.0311, "step": 7710 }, { "grad_norm": 0.2774048447608948, "learning_rate": 7.21897866231341e-05, "loss": 0.0302, "step": 7720 }, { "grad_norm": 0.31899410486221313, "learning_rate": 7.211567029589303e-05, "loss": 0.0304, "step": 7730 }, { "grad_norm": 0.38266536593437195, "learning_rate": 7.204149350525387e-05, "loss": 0.0307, "step": 7740 }, { "grad_norm": 0.5800005793571472, "learning_rate": 7.196725645401309e-05, "loss": 0.0304, "step": 7750 }, { "grad_norm": 0.36372581124305725, "learning_rate": 7.1892959345132e-05, "loss": 0.0281, "step": 7760 }, { "grad_norm": 0.23788747191429138, "learning_rate": 7.181860238173605e-05, "loss": 0.0251, "step": 7770 }, { "grad_norm": 0.3640170395374298, "learning_rate": 7.174418576711432e-05, "loss": 0.0295, "step": 7780 }, { "grad_norm": 0.3240286111831665, "learning_rate": 7.1669709704719e-05, "loss": 0.0288, "step": 7790 }, { "grad_norm": 0.2818297743797302, "learning_rate": 7.159517439816481e-05, "loss": 0.0295, "step": 7800 }, { "grad_norm": 0.308823823928833, "learning_rate": 7.152058005122842e-05, "loss": 0.0305, "step": 7810 }, { "grad_norm": 0.3558848202228546, "learning_rate": 7.144592686784793e-05, "loss": 0.0354, "step": 7820 }, { "grad_norm": 0.3704715669155121, "learning_rate": 7.137121505212229e-05, "loss": 0.0383, "step": 7830 }, { "grad_norm": 0.3538174331188202, "learning_rate": 7.129644480831077e-05, "loss": 0.0293, "step": 7840 }, { "grad_norm": 0.31464624404907227, "learning_rate": 7.122161634083234e-05, "loss": 0.0305, "step": 7850 }, { "grad_norm": 0.2582421600818634, "learning_rate": 7.114672985426516e-05, "loss": 0.029, "step": 7860 }, { "grad_norm": 0.3125132918357849, "learning_rate": 7.107178555334606e-05, "loss": 0.0315, "step": 7870 }, { "grad_norm": 0.33996033668518066, "learning_rate": 7.099678364296989e-05, "loss": 0.0294, "step": 7880 }, { "grad_norm": 0.26999738812446594, "learning_rate": 7.0921724328189e-05, "loss": 0.0289, "step": 7890 }, { "grad_norm": 0.30083945393562317, "learning_rate": 7.084660781421268e-05, "loss": 0.0276, "step": 7900 }, { "grad_norm": 0.4122389853000641, "learning_rate": 7.077143430640662e-05, "loss": 0.0341, "step": 7910 }, { "grad_norm": 0.2957439124584198, "learning_rate": 7.069620401029232e-05, "loss": 0.0286, "step": 7920 }, { "grad_norm": 0.3750254213809967, "learning_rate": 7.062091713154655e-05, "loss": 0.0323, "step": 7930 }, { "grad_norm": 0.27661439776420593, "learning_rate": 7.054557387600075e-05, "loss": 0.028, "step": 7940 }, { "grad_norm": 0.2915104627609253, "learning_rate": 7.04701744496405e-05, "loss": 0.0347, "step": 7950 }, { "grad_norm": 0.39326295256614685, "learning_rate": 7.039471905860495e-05, "loss": 0.0353, "step": 7960 }, { "grad_norm": 1.0759928226470947, "learning_rate": 7.031920790918628e-05, "loss": 0.0317, "step": 7970 }, { "grad_norm": 0.3974040150642395, "learning_rate": 7.024364120782906e-05, "loss": 0.0349, "step": 7980 }, { "grad_norm": 0.3099539279937744, "learning_rate": 7.016801916112978e-05, "loss": 0.0281, "step": 7990 }, { "grad_norm": 0.3036164343357086, "learning_rate": 7.009234197583623e-05, "loss": 0.0279, "step": 8000 }, { "grad_norm": 0.4793526232242584, "learning_rate": 7.001660985884692e-05, "loss": 0.033, "step": 8010 }, { "grad_norm": 0.40968289971351624, "learning_rate": 6.994082301721063e-05, "loss": 0.0308, "step": 8020 }, { "grad_norm": 0.5269930958747864, "learning_rate": 6.986498165812563e-05, "loss": 0.0333, "step": 8030 }, { "grad_norm": 0.3973531126976013, "learning_rate": 6.978908598893932e-05, "loss": 0.0298, "step": 8040 }, { "grad_norm": 0.38797101378440857, "learning_rate": 6.971313621714756e-05, "loss": 0.028, "step": 8050 }, { "grad_norm": 0.27901944518089294, "learning_rate": 6.96371325503941e-05, "loss": 0.0335, "step": 8060 }, { "grad_norm": 0.3663145899772644, "learning_rate": 6.956107519647014e-05, "loss": 0.0314, "step": 8070 }, { "grad_norm": 0.34878087043762207, "learning_rate": 6.94849643633135e-05, "loss": 0.03, "step": 8080 }, { "grad_norm": 0.2709192931652069, "learning_rate": 6.940880025900834e-05, "loss": 0.0268, "step": 8090 }, { "grad_norm": 0.2911069095134735, "learning_rate": 6.933258309178438e-05, "loss": 0.0238, "step": 8100 }, { "grad_norm": 0.24862156808376312, "learning_rate": 6.925631307001646e-05, "loss": 0.0261, "step": 8110 }, { "grad_norm": 0.24233442544937134, "learning_rate": 6.91799904022239e-05, "loss": 0.0276, "step": 8120 }, { "grad_norm": 0.30246666073799133, "learning_rate": 6.910361529706997e-05, "loss": 0.0242, "step": 8130 }, { "grad_norm": 0.29776740074157715, "learning_rate": 6.902718796336131e-05, "loss": 0.0292, "step": 8140 }, { "grad_norm": 0.2753586173057556, "learning_rate": 6.895070861004729e-05, "loss": 0.0287, "step": 8150 }, { "grad_norm": 0.6025245189666748, "learning_rate": 6.887417744621956e-05, "loss": 0.0286, "step": 8160 }, { "grad_norm": 0.3204508423805237, "learning_rate": 6.87975946811114e-05, "loss": 0.0255, "step": 8170 }, { "grad_norm": 0.34531474113464355, "learning_rate": 6.872096052409718e-05, "loss": 0.0307, "step": 8180 }, { "grad_norm": 0.2596975266933441, "learning_rate": 6.864427518469174e-05, "loss": 0.025, "step": 8190 }, { "grad_norm": 0.3892361521720886, "learning_rate": 6.856753887254986e-05, "loss": 0.0326, "step": 8200 }, { "grad_norm": 0.25757497549057007, "learning_rate": 6.849075179746572e-05, "loss": 0.0266, "step": 8210 }, { "grad_norm": 0.280251681804657, "learning_rate": 6.841391416937221e-05, "loss": 0.0265, "step": 8220 }, { "grad_norm": 0.31257060170173645, "learning_rate": 6.833702619834053e-05, "loss": 0.0316, "step": 8230 }, { "grad_norm": 0.25400030612945557, "learning_rate": 6.82600880945794e-05, "loss": 0.0288, "step": 8240 }, { "grad_norm": 0.2663815915584564, "learning_rate": 6.818310006843468e-05, "loss": 0.0244, "step": 8250 }, { "grad_norm": 0.3205268681049347, "learning_rate": 6.810606233038868e-05, "loss": 0.0228, "step": 8260 }, { "grad_norm": 0.4853333532810211, "learning_rate": 6.802897509105966e-05, "loss": 0.0339, "step": 8270 }, { "grad_norm": 0.31560322642326355, "learning_rate": 6.79518385612012e-05, "loss": 0.0269, "step": 8280 }, { "grad_norm": 0.31405022740364075, "learning_rate": 6.787465295170157e-05, "loss": 0.0293, "step": 8290 }, { "grad_norm": 0.34504273533821106, "learning_rate": 6.779741847358332e-05, "loss": 0.0267, "step": 8300 }, { "grad_norm": 0.8551024794578552, "learning_rate": 6.772013533800256e-05, "loss": 0.0303, "step": 8310 }, { "grad_norm": 0.32898518443107605, "learning_rate": 6.764280375624843e-05, "loss": 0.0277, "step": 8320 }, { "grad_norm": 0.2907131314277649, "learning_rate": 6.756542393974252e-05, "loss": 0.0284, "step": 8330 }, { "grad_norm": 0.37868279218673706, "learning_rate": 6.748799610003828e-05, "loss": 0.0277, "step": 8340 }, { "grad_norm": 0.2927071452140808, "learning_rate": 6.741052044882048e-05, "loss": 0.0273, "step": 8350 }, { "grad_norm": 0.33212557435035706, "learning_rate": 6.73329971979046e-05, "loss": 0.0254, "step": 8360 }, { "grad_norm": 0.3083970248699188, "learning_rate": 6.725542655923625e-05, "loss": 0.0205, "step": 8370 }, { "grad_norm": 0.3236294984817505, "learning_rate": 6.717780874489057e-05, "loss": 0.0266, "step": 8380 }, { "grad_norm": 0.39815038442611694, "learning_rate": 6.710014396707172e-05, "loss": 0.0312, "step": 8390 }, { "grad_norm": 0.26989176869392395, "learning_rate": 6.702243243811221e-05, "loss": 0.0246, "step": 8400 }, { "grad_norm": 0.4487095773220062, "learning_rate": 6.694467437047244e-05, "loss": 0.0293, "step": 8410 }, { "grad_norm": 0.2348049283027649, "learning_rate": 6.686686997673997e-05, "loss": 0.0281, "step": 8420 }, { "grad_norm": 0.30659857392311096, "learning_rate": 6.678901946962903e-05, "loss": 0.0304, "step": 8430 }, { "grad_norm": 0.36141330003738403, "learning_rate": 6.671112306197996e-05, "loss": 0.0283, "step": 8440 }, { "grad_norm": 0.2801227271556854, "learning_rate": 6.663318096675854e-05, "loss": 0.0238, "step": 8450 }, { "grad_norm": 0.35898301005363464, "learning_rate": 6.655519339705552e-05, "loss": 0.0244, "step": 8460 }, { "grad_norm": 0.35860902070999146, "learning_rate": 6.647716056608588e-05, "loss": 0.0226, "step": 8470 }, { "grad_norm": 0.3369330167770386, "learning_rate": 6.639908268718843e-05, "loss": 0.0341, "step": 8480 }, { "grad_norm": 0.3432939946651459, "learning_rate": 6.632095997382514e-05, "loss": 0.0293, "step": 8490 }, { "grad_norm": 0.3167463541030884, "learning_rate": 6.624279263958047e-05, "loss": 0.0253, "step": 8500 }, { "grad_norm": 0.3745759427547455, "learning_rate": 6.616458089816097e-05, "loss": 0.0295, "step": 8510 }, { "grad_norm": 0.24976347386837006, "learning_rate": 6.608632496339454e-05, "loss": 0.0252, "step": 8520 }, { "grad_norm": 0.2674374580383301, "learning_rate": 6.600802504922988e-05, "loss": 0.0211, "step": 8530 }, { "grad_norm": 0.3006483316421509, "learning_rate": 6.592968136973604e-05, "loss": 0.0214, "step": 8540 }, { "grad_norm": 0.30164214968681335, "learning_rate": 6.585129413910159e-05, "loss": 0.0241, "step": 8550 }, { "grad_norm": 0.19951000809669495, "learning_rate": 6.577286357163424e-05, "loss": 0.0223, "step": 8560 }, { "grad_norm": 0.3646887242794037, "learning_rate": 6.569438988176018e-05, "loss": 0.0252, "step": 8570 }, { "grad_norm": 0.3791358172893524, "learning_rate": 6.561587328402347e-05, "loss": 0.0242, "step": 8580 }, { "grad_norm": 0.37820732593536377, "learning_rate": 6.553731399308549e-05, "loss": 0.0234, "step": 8590 }, { "grad_norm": 0.3919040560722351, "learning_rate": 6.545871222372436e-05, "loss": 0.0281, "step": 8600 }, { "grad_norm": 0.40077751874923706, "learning_rate": 6.538006819083426e-05, "loss": 0.0264, "step": 8610 }, { "grad_norm": 0.4265846014022827, "learning_rate": 6.530138210942505e-05, "loss": 0.0252, "step": 8620 }, { "grad_norm": 0.26133838295936584, "learning_rate": 6.522265419462141e-05, "loss": 0.023, "step": 8630 }, { "grad_norm": 0.25006380677223206, "learning_rate": 6.514388466166248e-05, "loss": 0.023, "step": 8640 }, { "grad_norm": 0.2797548174858093, "learning_rate": 6.506507372590119e-05, "loss": 0.0262, "step": 8650 }, { "grad_norm": 0.45401492714881897, "learning_rate": 6.498622160280355e-05, "loss": 0.023, "step": 8660 }, { "grad_norm": 0.3007832467556, "learning_rate": 6.490732850794832e-05, "loss": 0.0253, "step": 8670 }, { "grad_norm": 0.24562381207942963, "learning_rate": 6.482839465702616e-05, "loss": 0.0262, "step": 8680 }, { "grad_norm": 0.3402327597141266, "learning_rate": 6.474942026583923e-05, "loss": 0.0233, "step": 8690 }, { "grad_norm": 0.27771180868148804, "learning_rate": 6.467040555030052e-05, "loss": 0.024, "step": 8700 }, { "grad_norm": 0.2920501232147217, "learning_rate": 6.459135072643321e-05, "loss": 0.0201, "step": 8710 }, { "grad_norm": 0.3034299612045288, "learning_rate": 6.451225601037019e-05, "loss": 0.0221, "step": 8720 }, { "grad_norm": 0.3276534378528595, "learning_rate": 6.443312161835338e-05, "loss": 0.0264, "step": 8730 }, { "grad_norm": 0.29074618220329285, "learning_rate": 6.43539477667332e-05, "loss": 0.0253, "step": 8740 }, { "grad_norm": 0.36144375801086426, "learning_rate": 6.427473467196793e-05, "loss": 0.0255, "step": 8750 }, { "grad_norm": 0.45484527945518494, "learning_rate": 6.419548255062315e-05, "loss": 0.0241, "step": 8760 }, { "grad_norm": 0.3256067633628845, "learning_rate": 6.411619161937112e-05, "loss": 0.0237, "step": 8770 }, { "grad_norm": 0.31753429770469666, "learning_rate": 6.403686209499022e-05, "loss": 0.0246, "step": 8780 }, { "grad_norm": 0.3572523295879364, "learning_rate": 6.395749419436437e-05, "loss": 0.0267, "step": 8790 }, { "grad_norm": 0.3611835837364197, "learning_rate": 6.387808813448234e-05, "loss": 0.0233, "step": 8800 }, { "grad_norm": 0.35199543833732605, "learning_rate": 6.37986441324373e-05, "loss": 0.0267, "step": 8810 }, { "grad_norm": 0.3609142601490021, "learning_rate": 6.37191624054261e-05, "loss": 0.0272, "step": 8820 }, { "grad_norm": 0.30150124430656433, "learning_rate": 6.363964317074872e-05, "loss": 0.0279, "step": 8830 }, { "grad_norm": 0.3064950108528137, "learning_rate": 6.356008664580776e-05, "loss": 0.0248, "step": 8840 }, { "grad_norm": 0.39343371987342834, "learning_rate": 6.348049304810771e-05, "loss": 0.0277, "step": 8850 }, { "grad_norm": 0.41734203696250916, "learning_rate": 6.340086259525442e-05, "loss": 0.0309, "step": 8860 }, { "grad_norm": 0.36820170283317566, "learning_rate": 6.332119550495448e-05, "loss": 0.026, "step": 8870 }, { "grad_norm": 0.538908064365387, "learning_rate": 6.324149199501473e-05, "loss": 0.0302, "step": 8880 }, { "grad_norm": 0.40561696887016296, "learning_rate": 6.316175228334146e-05, "loss": 0.0275, "step": 8890 }, { "grad_norm": 0.4635489284992218, "learning_rate": 6.308197658794003e-05, "loss": 0.0312, "step": 8900 }, { "grad_norm": 0.3204265236854553, "learning_rate": 6.300216512691417e-05, "loss": 0.0253, "step": 8910 }, { "grad_norm": 0.35730063915252686, "learning_rate": 6.292231811846532e-05, "loss": 0.0262, "step": 8920 }, { "grad_norm": 0.2570178508758545, "learning_rate": 6.284243578089217e-05, "loss": 0.0243, "step": 8930 }, { "grad_norm": 0.4480089545249939, "learning_rate": 6.276251833258999e-05, "loss": 0.0289, "step": 8940 }, { "grad_norm": 0.3789927661418915, "learning_rate": 6.268256599205003e-05, "loss": 0.0266, "step": 8950 }, { "grad_norm": 0.3662071228027344, "learning_rate": 6.260257897785892e-05, "loss": 0.0279, "step": 8960 }, { "grad_norm": 0.31288594007492065, "learning_rate": 6.252255750869811e-05, "loss": 0.0248, "step": 8970 }, { "grad_norm": 0.48229771852493286, "learning_rate": 6.244250180334325e-05, "loss": 0.0276, "step": 8980 }, { "grad_norm": 0.30134981870651245, "learning_rate": 6.236241208066356e-05, "loss": 0.0287, "step": 8990 }, { "grad_norm": 0.3784787952899933, "learning_rate": 6.228228855962133e-05, "loss": 0.0304, "step": 9000 }, { "grad_norm": 0.26448193192481995, "learning_rate": 6.220213145927115e-05, "loss": 0.0234, "step": 9010 }, { "grad_norm": 0.29899072647094727, "learning_rate": 6.212194099875951e-05, "loss": 0.0231, "step": 9020 }, { "grad_norm": 0.3013300895690918, "learning_rate": 6.204171739732405e-05, "loss": 0.0237, "step": 9030 }, { "grad_norm": 0.22254028916358948, "learning_rate": 6.196146087429303e-05, "loss": 0.0222, "step": 9040 }, { "grad_norm": 0.2910575866699219, "learning_rate": 6.188117164908474e-05, "loss": 0.0248, "step": 9050 }, { "grad_norm": 0.3407171070575714, "learning_rate": 6.180084994120684e-05, "loss": 0.027, "step": 9060 }, { "grad_norm": 0.3742899000644684, "learning_rate": 6.17204959702558e-05, "loss": 0.0281, "step": 9070 }, { "grad_norm": 0.30951905250549316, "learning_rate": 6.164010995591635e-05, "loss": 0.0256, "step": 9080 }, { "grad_norm": 0.3051861822605133, "learning_rate": 6.155969211796076e-05, "loss": 0.0252, "step": 9090 }, { "grad_norm": 0.2527798116207123, "learning_rate": 6.147924267624829e-05, "loss": 0.0232, "step": 9100 }, { "grad_norm": 0.2819882929325104, "learning_rate": 6.13987618507247e-05, "loss": 0.0247, "step": 9110 }, { "grad_norm": 0.33718517422676086, "learning_rate": 6.131824986142147e-05, "loss": 0.0251, "step": 9120 }, { "grad_norm": 0.24276159703731537, "learning_rate": 6.123770692845529e-05, "loss": 0.0215, "step": 9130 }, { "grad_norm": 0.24374131858348846, "learning_rate": 6.11571332720275e-05, "loss": 0.0232, "step": 9140 }, { "grad_norm": 0.2967292070388794, "learning_rate": 6.107652911242336e-05, "loss": 0.0265, "step": 9150 }, { "grad_norm": 0.2433336228132248, "learning_rate": 6.0995894670011586e-05, "loss": 0.0213, "step": 9160 }, { "grad_norm": 0.37553396821022034, "learning_rate": 6.091523016524368e-05, "loss": 0.0247, "step": 9170 }, { "grad_norm": 0.26473307609558105, "learning_rate": 6.083453581865328e-05, "loss": 0.025, "step": 9180 }, { "grad_norm": 0.3310491144657135, "learning_rate": 6.075381185085568e-05, "loss": 0.0237, "step": 9190 }, { "grad_norm": 0.3055602014064789, "learning_rate": 6.067305848254709e-05, "loss": 0.0252, "step": 9200 }, { "grad_norm": 0.41784533858299255, "learning_rate": 6.059227593450418e-05, "loss": 0.0245, "step": 9210 }, { "grad_norm": 0.2659528851509094, "learning_rate": 6.051146442758333e-05, "loss": 0.0262, "step": 9220 }, { "grad_norm": 0.3651646077632904, "learning_rate": 6.043062418272012e-05, "loss": 0.0195, "step": 9230 }, { "grad_norm": 0.3546158969402313, "learning_rate": 6.0349755420928666e-05, "loss": 0.0198, "step": 9240 }, { "grad_norm": 0.29854270815849304, "learning_rate": 6.0268858363301105e-05, "loss": 0.0251, "step": 9250 }, { "grad_norm": 0.30452248454093933, "learning_rate": 6.018793323100689e-05, "loss": 0.0208, "step": 9260 }, { "grad_norm": 0.28489115834236145, "learning_rate": 6.0106980245292255e-05, "loss": 0.0222, "step": 9270 }, { "grad_norm": 0.6275737881660461, "learning_rate": 6.002599962747957e-05, "loss": 0.0193, "step": 9280 }, { "grad_norm": 0.33012792468070984, "learning_rate": 5.994499159896673e-05, "loss": 0.0208, "step": 9290 }, { "grad_norm": 0.35391852259635925, "learning_rate": 5.9863956381226607e-05, "loss": 0.0229, "step": 9300 }, { "grad_norm": 0.2809222638607025, "learning_rate": 5.9782894195806394e-05, "loss": 0.0291, "step": 9310 }, { "grad_norm": 0.3259482681751251, "learning_rate": 5.9701805264327004e-05, "loss": 0.023, "step": 9320 }, { "grad_norm": 0.4358774721622467, "learning_rate": 5.96206898084825e-05, "loss": 0.0274, "step": 9330 }, { "grad_norm": 0.2565452456474304, "learning_rate": 5.953954805003942e-05, "loss": 0.0212, "step": 9340 }, { "grad_norm": 0.24253027141094208, "learning_rate": 5.945838021083623e-05, "loss": 0.0207, "step": 9350 }, { "grad_norm": 0.3400305509567261, "learning_rate": 5.9377186512782714e-05, "loss": 0.0239, "step": 9360 }, { "grad_norm": 0.32933956384658813, "learning_rate": 5.929596717785935e-05, "loss": 0.0236, "step": 9370 }, { "grad_norm": 0.3234010636806488, "learning_rate": 5.921472242811668e-05, "loss": 0.0228, "step": 9380 }, { "grad_norm": 0.3915545344352722, "learning_rate": 5.913345248567475e-05, "loss": 0.0245, "step": 9390 }, { "grad_norm": 0.2434956580400467, "learning_rate": 5.905215757272248e-05, "loss": 0.0188, "step": 9400 }, { "grad_norm": 0.3582800030708313, "learning_rate": 5.897083791151706e-05, "loss": 0.0218, "step": 9410 }, { "grad_norm": 0.2924223840236664, "learning_rate": 5.888949372438336e-05, "loss": 0.0222, "step": 9420 }, { "grad_norm": 0.32992318272590637, "learning_rate": 5.8808125233713255e-05, "loss": 0.0204, "step": 9430 }, { "grad_norm": 0.38133886456489563, "learning_rate": 5.872673266196509e-05, "loss": 0.028, "step": 9440 }, { "grad_norm": 0.3852446973323822, "learning_rate": 5.864531623166305e-05, "loss": 0.0232, "step": 9450 }, { "grad_norm": 0.47629016637802124, "learning_rate": 5.856387616539656e-05, "loss": 0.0265, "step": 9460 }, { "grad_norm": 0.2644617259502411, "learning_rate": 5.848241268581967e-05, "loss": 0.0273, "step": 9470 }, { "grad_norm": 0.2801165282726288, "learning_rate": 5.840092601565037e-05, "loss": 0.0246, "step": 9480 }, { "grad_norm": 0.35226529836654663, "learning_rate": 5.8319416377670144e-05, "loss": 0.0231, "step": 9490 }, { "grad_norm": 0.5133236646652222, "learning_rate": 5.82378839947232e-05, "loss": 0.0231, "step": 9500 }, { "grad_norm": 0.3302304744720459, "learning_rate": 5.815632908971599e-05, "loss": 0.0239, "step": 9510 }, { "grad_norm": 0.38231003284454346, "learning_rate": 5.80747518856165e-05, "loss": 0.0283, "step": 9520 }, { "grad_norm": 0.41780105233192444, "learning_rate": 5.799315260545367e-05, "loss": 0.0235, "step": 9530 }, { "grad_norm": 0.5042811632156372, "learning_rate": 5.791153147231686e-05, "loss": 0.0288, "step": 9540 }, { "grad_norm": 0.5355105400085449, "learning_rate": 5.782988870935509e-05, "loss": 0.0276, "step": 9550 }, { "grad_norm": 0.4194992482662201, "learning_rate": 5.774822453977657e-05, "loss": 0.027, "step": 9560 }, { "grad_norm": 0.481855183839798, "learning_rate": 5.7666539186848036e-05, "loss": 0.0259, "step": 9570 }, { "grad_norm": 0.4218195676803589, "learning_rate": 5.758483287389411e-05, "loss": 0.0259, "step": 9580 }, { "grad_norm": 0.49046674370765686, "learning_rate": 5.7503105824296735e-05, "loss": 0.0242, "step": 9590 }, { "grad_norm": 0.49532556533813477, "learning_rate": 5.742135826149453e-05, "loss": 0.0298, "step": 9600 }, { "grad_norm": 0.4005630612373352, "learning_rate": 5.7339590408982223e-05, "loss": 0.0284, "step": 9610 }, { "grad_norm": 0.31389182806015015, "learning_rate": 5.725780249031e-05, "loss": 0.026, "step": 9620 }, { "grad_norm": 0.26234158873558044, "learning_rate": 5.717599472908292e-05, "loss": 0.0293, "step": 9630 }, { "grad_norm": 0.3390723764896393, "learning_rate": 5.7094167348960237e-05, "loss": 0.022, "step": 9640 }, { "grad_norm": 0.327312707901001, "learning_rate": 5.7012320573654945e-05, "loss": 0.0257, "step": 9650 }, { "grad_norm": 0.2970992922782898, "learning_rate": 5.693045462693295e-05, "loss": 0.022, "step": 9660 }, { "grad_norm": 0.34595561027526855, "learning_rate": 5.684856973261266e-05, "loss": 0.0239, "step": 9670 }, { "grad_norm": 0.3990800082683563, "learning_rate": 5.6766666114564215e-05, "loss": 0.0314, "step": 9680 }, { "grad_norm": 0.3851526379585266, "learning_rate": 5.668474399670899e-05, "loss": 0.0222, "step": 9690 }, { "grad_norm": 0.33756446838378906, "learning_rate": 5.660280360301896e-05, "loss": 0.0222, "step": 9700 }, { "grad_norm": 0.2773386240005493, "learning_rate": 5.652084515751599e-05, "loss": 0.0255, "step": 9710 }, { "grad_norm": 0.36143872141838074, "learning_rate": 5.643886888427137e-05, "loss": 0.0236, "step": 9720 }, { "grad_norm": 0.39528948068618774, "learning_rate": 5.6356875007405074e-05, "loss": 0.0243, "step": 9730 }, { "grad_norm": 0.2650708258152008, "learning_rate": 5.627486375108525e-05, "loss": 0.0214, "step": 9740 }, { "grad_norm": 0.26079216599464417, "learning_rate": 5.619283533952754e-05, "loss": 0.0229, "step": 9750 }, { "grad_norm": 0.23435436189174652, "learning_rate": 5.6110789996994474e-05, "loss": 0.0206, "step": 9760 }, { "grad_norm": 0.2628357708454132, "learning_rate": 5.602872794779491e-05, "loss": 0.0208, "step": 9770 }, { "grad_norm": 0.24410578608512878, "learning_rate": 5.594664941628334e-05, "loss": 0.0194, "step": 9780 }, { "grad_norm": 0.2445981204509735, "learning_rate": 5.5864554626859324e-05, "loss": 0.0235, "step": 9790 }, { "grad_norm": 0.3314042389392853, "learning_rate": 5.578244380396691e-05, "loss": 0.0174, "step": 9800 }, { "grad_norm": 0.40565335750579834, "learning_rate": 5.570031717209394e-05, "loss": 0.0223, "step": 9810 }, { "grad_norm": 0.41558390855789185, "learning_rate": 5.561817495577147e-05, "loss": 0.0183, "step": 9820 }, { "grad_norm": 0.26809242367744446, "learning_rate": 5.5536017379573215e-05, "loss": 0.0233, "step": 9830 }, { "grad_norm": 0.2809429466724396, "learning_rate": 5.545384466811483e-05, "loss": 0.0216, "step": 9840 }, { "grad_norm": 0.2963339388370514, "learning_rate": 5.5371657046053384e-05, "loss": 0.0243, "step": 9850 }, { "grad_norm": 0.34849947690963745, "learning_rate": 5.528945473808669e-05, "loss": 0.0231, "step": 9860 }, { "grad_norm": 0.3530868589878082, "learning_rate": 5.520723796895272e-05, "loss": 0.025, "step": 9870 }, { "grad_norm": 0.3079748749732971, "learning_rate": 5.512500696342897e-05, "loss": 0.0297, "step": 9880 }, { "grad_norm": 0.3026134669780731, "learning_rate": 5.504276194633188e-05, "loss": 0.0345, "step": 9890 }, { "grad_norm": 0.27156180143356323, "learning_rate": 5.49605031425162e-05, "loss": 0.0235, "step": 9900 }, { "grad_norm": 0.692590057849884, "learning_rate": 5.487823077687434e-05, "loss": 0.0257, "step": 9910 }, { "grad_norm": 0.25466015934944153, "learning_rate": 5.4795945074335806e-05, "loss": 0.0248, "step": 9920 }, { "grad_norm": 0.2954048812389374, "learning_rate": 5.471364625986657e-05, "loss": 0.0178, "step": 9930 }, { "grad_norm": 1.1072252988815308, "learning_rate": 5.463133455846845e-05, "loss": 0.0269, "step": 9940 }, { "grad_norm": 0.2775861322879791, "learning_rate": 5.4549010195178505e-05, "loss": 0.0224, "step": 9950 }, { "grad_norm": 0.2639944553375244, "learning_rate": 5.446667339506838e-05, "loss": 0.0218, "step": 9960 }, { "grad_norm": 0.3166565001010895, "learning_rate": 5.4384324383243756e-05, "loss": 0.0211, "step": 9970 }, { "grad_norm": 0.36798015236854553, "learning_rate": 5.430196338484368e-05, "loss": 0.0229, "step": 9980 }, { "grad_norm": 0.37934407591819763, "learning_rate": 5.4219590625039975e-05, "loss": 0.0216, "step": 9990 }, { "grad_norm": 0.3425438702106476, "learning_rate": 5.413720632903664e-05, "loss": 0.0226, "step": 10000 }, { "grad_norm": 0.318021297454834, "learning_rate": 5.405481072206917e-05, "loss": 0.0206, "step": 10010 }, { "grad_norm": 0.2233188897371292, "learning_rate": 5.397240402940402e-05, "loss": 0.0214, "step": 10020 }, { "grad_norm": 0.2623114287853241, "learning_rate": 5.388998647633794e-05, "loss": 0.0215, "step": 10030 }, { "grad_norm": 0.31165894865989685, "learning_rate": 5.380755828819737e-05, "loss": 0.0232, "step": 10040 }, { "grad_norm": 0.22251485288143158, "learning_rate": 5.3725119690337846e-05, "loss": 0.0194, "step": 10050 }, { "grad_norm": 0.3188113868236542, "learning_rate": 5.3642670908143324e-05, "loss": 0.0221, "step": 10060 }, { "grad_norm": 0.30347996950149536, "learning_rate": 5.356021216702562e-05, "loss": 0.0194, "step": 10070 }, { "grad_norm": 0.28082916140556335, "learning_rate": 5.347774369242381e-05, "loss": 0.0218, "step": 10080 }, { "grad_norm": 0.31086722016334534, "learning_rate": 5.3395265709803545e-05, "loss": 0.0171, "step": 10090 }, { "grad_norm": 0.33062610030174255, "learning_rate": 5.331277844465647e-05, "loss": 0.0176, "step": 10100 }, { "grad_norm": 0.2999603748321533, "learning_rate": 5.323028212249963e-05, "loss": 0.02, "step": 10110 }, { "grad_norm": 0.3050406873226166, "learning_rate": 5.314777696887481e-05, "loss": 0.0181, "step": 10120 }, { "grad_norm": 0.312641978263855, "learning_rate": 5.306526320934796e-05, "loss": 0.0182, "step": 10130 }, { "grad_norm": 0.2689063847064972, "learning_rate": 5.298274106950854e-05, "loss": 0.022, "step": 10140 }, { "grad_norm": 0.23823900520801544, "learning_rate": 5.290021077496893e-05, "loss": 0.0175, "step": 10150 }, { "grad_norm": 0.3612326383590698, "learning_rate": 5.2817672551363816e-05, "loss": 0.0183, "step": 10160 }, { "grad_norm": 0.2823701500892639, "learning_rate": 5.273512662434952e-05, "loss": 0.0175, "step": 10170 }, { "grad_norm": 0.31416070461273193, "learning_rate": 5.265257321960349e-05, "loss": 0.0181, "step": 10180 }, { "grad_norm": 0.312044620513916, "learning_rate": 5.257001256282357e-05, "loss": 0.0189, "step": 10190 }, { "grad_norm": 0.335399329662323, "learning_rate": 5.248744487972742e-05, "loss": 0.0175, "step": 10200 }, { "grad_norm": 0.26139402389526367, "learning_rate": 5.240487039605196e-05, "loss": 0.0185, "step": 10210 }, { "grad_norm": 0.317781925201416, "learning_rate": 5.232228933755267e-05, "loss": 0.0178, "step": 10220 }, { "grad_norm": 0.2453029453754425, "learning_rate": 5.2239701930003006e-05, "loss": 0.0196, "step": 10230 }, { "grad_norm": 0.4008253514766693, "learning_rate": 5.215710839919379e-05, "loss": 0.0172, "step": 10240 }, { "grad_norm": 0.36088067293167114, "learning_rate": 5.207450897093257e-05, "loss": 0.0163, "step": 10250 }, { "grad_norm": 0.39461827278137207, "learning_rate": 5.1991903871043046e-05, "loss": 0.02, "step": 10260 }, { "grad_norm": 0.2623033821582794, "learning_rate": 5.190929332536439e-05, "loss": 0.0198, "step": 10270 }, { "grad_norm": 0.44567787647247314, "learning_rate": 5.182667755975071e-05, "loss": 0.0236, "step": 10280 }, { "grad_norm": 0.9886274933815002, "learning_rate": 5.1744056800070315e-05, "loss": 0.0221, "step": 10290 }, { "grad_norm": 0.36143478751182556, "learning_rate": 5.166143127220524e-05, "loss": 0.0186, "step": 10300 }, { "grad_norm": 0.3034328520298004, "learning_rate": 5.1578801202050485e-05, "loss": 0.026, "step": 10310 }, { "grad_norm": 0.43665948510169983, "learning_rate": 5.149616681551355e-05, "loss": 0.0247, "step": 10320 }, { "grad_norm": 0.28634530305862427, "learning_rate": 5.141352833851367e-05, "loss": 0.0177, "step": 10330 }, { "grad_norm": 0.33211153745651245, "learning_rate": 5.1330885996981285e-05, "loss": 0.0168, "step": 10340 }, { "grad_norm": 0.27655458450317383, "learning_rate": 5.124824001685741e-05, "loss": 0.0179, "step": 10350 }, { "grad_norm": 0.40463754534721375, "learning_rate": 5.116559062409298e-05, "loss": 0.0147, "step": 10360 }, { "grad_norm": 0.3564929962158203, "learning_rate": 5.10829380446483e-05, "loss": 0.0211, "step": 10370 }, { "grad_norm": 0.3644106090068817, "learning_rate": 5.100028250449235e-05, "loss": 0.0205, "step": 10380 }, { "grad_norm": 0.25284966826438904, "learning_rate": 5.0917624229602234e-05, "loss": 0.017, "step": 10390 }, { "grad_norm": 0.3355977237224579, "learning_rate": 5.0834963445962524e-05, "loss": 0.0193, "step": 10400 }, { "grad_norm": 0.2205965220928192, "learning_rate": 5.075230037956461e-05, "loss": 0.0182, "step": 10410 }, { "grad_norm": 0.26438575983047485, "learning_rate": 5.0669635256406213e-05, "loss": 0.0177, "step": 10420 }, { "grad_norm": 0.2780822813510895, "learning_rate": 5.058696830249058e-05, "loss": 0.0139, "step": 10430 }, { "grad_norm": 0.24444830417633057, "learning_rate": 5.050429974382602e-05, "loss": 0.0175, "step": 10440 }, { "grad_norm": 0.3179857134819031, "learning_rate": 5.042162980642523e-05, "loss": 0.0168, "step": 10450 }, { "grad_norm": 0.252248615026474, "learning_rate": 5.033895871630462e-05, "loss": 0.0187, "step": 10460 }, { "grad_norm": 0.22600539028644562, "learning_rate": 5.025628669948386e-05, "loss": 0.0175, "step": 10470 }, { "grad_norm": 0.2887779474258423, "learning_rate": 5.017361398198502e-05, "loss": 0.0181, "step": 10480 }, { "grad_norm": 0.28037160634994507, "learning_rate": 5.009094078983221e-05, "loss": 0.0148, "step": 10490 }, { "grad_norm": 0.21705150604248047, "learning_rate": 5.000826734905073e-05, "loss": 0.016, "step": 10500 }, { "grad_norm": 0.2511897385120392, "learning_rate": 4.9925593885666645e-05, "loss": 0.0229, "step": 10510 }, { "grad_norm": 0.3790608048439026, "learning_rate": 4.984292062570602e-05, "loss": 0.0192, "step": 10520 }, { "grad_norm": 0.261680543422699, "learning_rate": 4.976024779519442e-05, "loss": 0.0208, "step": 10530 }, { "grad_norm": 0.38352906703948975, "learning_rate": 4.9677575620156194e-05, "loss": 0.0204, "step": 10540 }, { "grad_norm": 0.3845357894897461, "learning_rate": 4.959490432661391e-05, "loss": 0.0177, "step": 10550 }, { "grad_norm": 0.4187433123588562, "learning_rate": 4.9512234140587726e-05, "loss": 0.023, "step": 10560 }, { "grad_norm": 0.2564837634563446, "learning_rate": 4.942956528809477e-05, "loss": 0.0189, "step": 10570 }, { "grad_norm": 0.2957102358341217, "learning_rate": 4.934689799514854e-05, "loss": 0.0166, "step": 10580 }, { "grad_norm": 0.34618788957595825, "learning_rate": 4.926423248775827e-05, "loss": 0.018, "step": 10590 }, { "grad_norm": 0.3520120084285736, "learning_rate": 4.918156899192826e-05, "loss": 0.0193, "step": 10600 }, { "grad_norm": 0.21666838228702545, "learning_rate": 4.909890773365738e-05, "loss": 0.017, "step": 10610 }, { "grad_norm": 0.27566906809806824, "learning_rate": 4.9016248938938344e-05, "loss": 0.0156, "step": 10620 }, { "grad_norm": 0.30554190278053284, "learning_rate": 4.8933592833757156e-05, "loss": 0.0177, "step": 10630 }, { "grad_norm": 0.26935797929763794, "learning_rate": 4.8850939644092435e-05, "loss": 0.0149, "step": 10640 }, { "grad_norm": 0.3062436282634735, "learning_rate": 4.876828959591485e-05, "loss": 0.0151, "step": 10650 }, { "grad_norm": 0.2223013937473297, "learning_rate": 4.8685642915186474e-05, "loss": 0.0185, "step": 10660 }, { "grad_norm": 0.417305588722229, "learning_rate": 4.860299982786018e-05, "loss": 0.0154, "step": 10670 }, { "grad_norm": 0.32130834460258484, "learning_rate": 4.852036055987901e-05, "loss": 0.0248, "step": 10680 }, { "grad_norm": 0.3461330235004425, "learning_rate": 4.843772533717558e-05, "loss": 0.02, "step": 10690 }, { "grad_norm": 0.2780469059944153, "learning_rate": 4.835509438567142e-05, "loss": 0.0216, "step": 10700 }, { "grad_norm": 1.695111632347107, "learning_rate": 4.827246793127639e-05, "loss": 0.0268, "step": 10710 }, { "grad_norm": 0.2820586860179901, "learning_rate": 4.818984619988807e-05, "loss": 0.0179, "step": 10720 }, { "grad_norm": 0.28568750619888306, "learning_rate": 4.810722941739115e-05, "loss": 0.0171, "step": 10730 }, { "grad_norm": 0.27737241983413696, "learning_rate": 4.8024617809656684e-05, "loss": 0.0215, "step": 10740 }, { "grad_norm": 0.34082654118537903, "learning_rate": 4.794201160254171e-05, "loss": 0.017, "step": 10750 }, { "grad_norm": 0.2441881000995636, "learning_rate": 4.785941102188844e-05, "loss": 0.016, "step": 10760 }, { "grad_norm": 0.2678302526473999, "learning_rate": 4.7776816293523686e-05, "loss": 0.0155, "step": 10770 }, { "grad_norm": 0.19535674154758453, "learning_rate": 4.769422764325832e-05, "loss": 0.0159, "step": 10780 }, { "grad_norm": 0.20579543709754944, "learning_rate": 4.76116452968865e-05, "loss": 0.0172, "step": 10790 }, { "grad_norm": 0.2529495060443878, "learning_rate": 4.752906948018525e-05, "loss": 0.0162, "step": 10800 }, { "grad_norm": 0.24260927736759186, "learning_rate": 4.7446500418913684e-05, "loss": 0.0149, "step": 10810 }, { "grad_norm": 0.27154073119163513, "learning_rate": 4.736393833881247e-05, "loss": 0.0175, "step": 10820 }, { "grad_norm": 0.3258320093154907, "learning_rate": 4.7281383465603194e-05, "loss": 0.0179, "step": 10830 }, { "grad_norm": 0.3219239115715027, "learning_rate": 4.71988360249877e-05, "loss": 0.0176, "step": 10840 }, { "grad_norm": 0.19306504726409912, "learning_rate": 4.7116296242647554e-05, "loss": 0.0185, "step": 10850 }, { "grad_norm": 0.23152144253253937, "learning_rate": 4.703376434424336e-05, "loss": 0.0175, "step": 10860 }, { "grad_norm": 0.31980451941490173, "learning_rate": 4.695124055541421e-05, "loss": 0.015, "step": 10870 }, { "grad_norm": 0.2631542980670929, "learning_rate": 4.6868725101776934e-05, "loss": 0.0156, "step": 10880 }, { "grad_norm": 0.28443050384521484, "learning_rate": 4.678621820892567e-05, "loss": 0.0141, "step": 10890 }, { "grad_norm": 0.26374179124832153, "learning_rate": 4.670372010243111e-05, "loss": 0.0154, "step": 10900 }, { "grad_norm": 0.2486492246389389, "learning_rate": 4.662123100783992e-05, "loss": 0.0139, "step": 10910 }, { "grad_norm": 0.24292884767055511, "learning_rate": 4.653875115067415e-05, "loss": 0.016, "step": 10920 }, { "grad_norm": 0.27140259742736816, "learning_rate": 4.6456280756430545e-05, "loss": 0.0158, "step": 10930 }, { "grad_norm": 0.27856114506721497, "learning_rate": 4.637382005058004e-05, "loss": 0.0132, "step": 10940 }, { "grad_norm": 0.24042432010173798, "learning_rate": 4.629136925856705e-05, "loss": 0.0174, "step": 10950 }, { "grad_norm": 0.3661767840385437, "learning_rate": 4.6208928605808895e-05, "loss": 0.0158, "step": 10960 }, { "grad_norm": 0.34262198209762573, "learning_rate": 4.612649831769519e-05, "loss": 0.0165, "step": 10970 }, { "grad_norm": 0.46506384015083313, "learning_rate": 4.604407861958715e-05, "loss": 0.0183, "step": 10980 }, { "grad_norm": 0.3665083050727844, "learning_rate": 4.5961669736817114e-05, "loss": 0.0206, "step": 10990 }, { "grad_norm": 0.3673674464225769, "learning_rate": 4.5879271894687814e-05, "loss": 0.0211, "step": 11000 }, { "grad_norm": 0.5436990261077881, "learning_rate": 4.5796885318471826e-05, "loss": 0.0167, "step": 11010 }, { "grad_norm": 0.403558611869812, "learning_rate": 4.571451023341086e-05, "loss": 0.0185, "step": 11020 }, { "grad_norm": 0.566264808177948, "learning_rate": 4.563214686471527e-05, "loss": 0.0218, "step": 11030 }, { "grad_norm": 0.4633435606956482, "learning_rate": 4.5549795437563365e-05, "loss": 0.0196, "step": 11040 }, { "grad_norm": 0.8563451766967773, "learning_rate": 4.546745617710081e-05, "loss": 0.0255, "step": 11050 }, { "grad_norm": 0.5167824625968933, "learning_rate": 4.5385129308440014e-05, "loss": 0.0229, "step": 11060 }, { "grad_norm": 0.38819336891174316, "learning_rate": 4.530281505665944e-05, "loss": 0.0249, "step": 11070 }, { "grad_norm": 0.28566646575927734, "learning_rate": 4.5220513646803134e-05, "loss": 0.0211, "step": 11080 }, { "grad_norm": 0.3810625672340393, "learning_rate": 4.513822530388003e-05, "loss": 0.0196, "step": 11090 }, { "grad_norm": 0.3390665054321289, "learning_rate": 4.5055950252863296e-05, "loss": 0.018, "step": 11100 }, { "grad_norm": 0.47705504298210144, "learning_rate": 4.4973688718689803e-05, "loss": 0.023, "step": 11110 }, { "grad_norm": 0.607129693031311, "learning_rate": 4.4891440926259406e-05, "loss": 0.0224, "step": 11120 }, { "grad_norm": 0.39328399300575256, "learning_rate": 4.480920710043443e-05, "loss": 0.0244, "step": 11130 }, { "grad_norm": 0.4403414726257324, "learning_rate": 4.4726987466039044e-05, "loss": 0.0284, "step": 11140 }, { "grad_norm": 0.4112926423549652, "learning_rate": 4.46447822478586e-05, "loss": 0.0312, "step": 11150 }, { "grad_norm": 0.48208391666412354, "learning_rate": 4.4562591670638974e-05, "loss": 0.0261, "step": 11160 }, { "grad_norm": 0.37383174896240234, "learning_rate": 4.4480415959086105e-05, "loss": 0.0226, "step": 11170 }, { "grad_norm": 0.4019623398780823, "learning_rate": 4.439825533786522e-05, "loss": 0.0192, "step": 11180 }, { "grad_norm": 0.2845710217952728, "learning_rate": 4.431611003160035e-05, "loss": 0.0271, "step": 11190 }, { "grad_norm": 0.3617405891418457, "learning_rate": 4.4233980264873636e-05, "loss": 0.0213, "step": 11200 }, { "grad_norm": 0.2555653750896454, "learning_rate": 4.4151866262224684e-05, "loss": 0.0227, "step": 11210 }, { "grad_norm": 0.34806498885154724, "learning_rate": 4.406976824815006e-05, "loss": 0.0164, "step": 11220 }, { "grad_norm": 0.36440926790237427, "learning_rate": 4.3987686447102595e-05, "loss": 0.0179, "step": 11230 }, { "grad_norm": 0.24866372346878052, "learning_rate": 4.3905621083490804e-05, "loss": 0.0175, "step": 11240 }, { "grad_norm": 0.7160053253173828, "learning_rate": 4.3823572381678286e-05, "loss": 0.0229, "step": 11250 }, { "grad_norm": 0.25373440980911255, "learning_rate": 4.374154056598301e-05, "loss": 0.0168, "step": 11260 }, { "grad_norm": 0.5825440883636475, "learning_rate": 4.3659525860676845e-05, "loss": 0.0151, "step": 11270 }, { "grad_norm": 0.1970502734184265, "learning_rate": 4.3577528489984854e-05, "loss": 0.0168, "step": 11280 }, { "grad_norm": 0.3317258358001709, "learning_rate": 4.349554867808476e-05, "loss": 0.0185, "step": 11290 }, { "grad_norm": 0.2642054557800293, "learning_rate": 4.34135866491062e-05, "loss": 0.0168, "step": 11300 }, { "grad_norm": 0.2874525785446167, "learning_rate": 4.333164262713022e-05, "loss": 0.0156, "step": 11310 }, { "grad_norm": 0.22513550519943237, "learning_rate": 4.324971683618868e-05, "loss": 0.0136, "step": 11320 }, { "grad_norm": 0.18595671653747559, "learning_rate": 4.316780950026354e-05, "loss": 0.0206, "step": 11330 }, { "grad_norm": 0.2699510455131531, "learning_rate": 4.308592084328637e-05, "loss": 0.0175, "step": 11340 }, { "grad_norm": 0.25458183884620667, "learning_rate": 4.3004051089137576e-05, "loss": 0.0185, "step": 11350 }, { "grad_norm": 0.28865379095077515, "learning_rate": 4.292220046164597e-05, "loss": 0.017, "step": 11360 }, { "grad_norm": 0.26905113458633423, "learning_rate": 4.2840369184588035e-05, "loss": 0.0329, "step": 11370 }, { "grad_norm": 0.342776358127594, "learning_rate": 4.2758557481687345e-05, "loss": 0.0189, "step": 11380 }, { "grad_norm": 0.25887781381607056, "learning_rate": 4.267676557661403e-05, "loss": 0.0155, "step": 11390 }, { "grad_norm": 0.3438360095024109, "learning_rate": 4.2594993692983955e-05, "loss": 0.0141, "step": 11400 }, { "grad_norm": 0.2482961267232895, "learning_rate": 4.251324205435837e-05, "loss": 0.0176, "step": 11410 }, { "grad_norm": 0.4010567367076874, "learning_rate": 4.243151088424312e-05, "loss": 0.0193, "step": 11420 }, { "grad_norm": 0.3231598436832428, "learning_rate": 4.234980040608813e-05, "loss": 0.0208, "step": 11430 }, { "grad_norm": 0.3377741575241089, "learning_rate": 4.22681108432867e-05, "loss": 0.0158, "step": 11440 }, { "grad_norm": 0.29296213388442993, "learning_rate": 4.2186442419174984e-05, "loss": 0.0177, "step": 11450 }, { "grad_norm": 0.4431559145450592, "learning_rate": 4.210479535703133e-05, "loss": 0.0212, "step": 11460 }, { "grad_norm": 0.23292309045791626, "learning_rate": 4.202316988007567e-05, "loss": 0.0183, "step": 11470 }, { "grad_norm": 0.33073052763938904, "learning_rate": 4.194156621146901e-05, "loss": 0.0175, "step": 11480 }, { "grad_norm": 0.16354107856750488, "learning_rate": 4.1859984574312596e-05, "loss": 0.0119, "step": 11490 }, { "grad_norm": 0.36742347478866577, "learning_rate": 4.177842519164752e-05, "loss": 0.0171, "step": 11500 }, { "grad_norm": 0.2743680477142334, "learning_rate": 4.169688828645404e-05, "loss": 0.0138, "step": 11510 }, { "grad_norm": 0.3238767683506012, "learning_rate": 4.161537408165092e-05, "loss": 0.0164, "step": 11520 }, { "grad_norm": 0.279371052980423, "learning_rate": 4.1533882800094924e-05, "loss": 0.0186, "step": 11530 }, { "grad_norm": 0.3474103510379791, "learning_rate": 4.145241466458005e-05, "loss": 0.0142, "step": 11540 }, { "grad_norm": 0.2706928849220276, "learning_rate": 4.13709698978371e-05, "loss": 0.0151, "step": 11550 }, { "grad_norm": 0.5051738619804382, "learning_rate": 4.1289548722532944e-05, "loss": 0.0161, "step": 11560 }, { "grad_norm": 0.20315232872962952, "learning_rate": 4.120815136126999e-05, "loss": 0.0165, "step": 11570 }, { "grad_norm": 0.31077179312705994, "learning_rate": 4.112677803658548e-05, "loss": 0.0138, "step": 11580 }, { "grad_norm": 0.2693970501422882, "learning_rate": 4.1045428970951e-05, "loss": 0.0154, "step": 11590 }, { "grad_norm": 0.3025025725364685, "learning_rate": 4.0964104386771785e-05, "loss": 0.0143, "step": 11600 }, { "grad_norm": 0.21475766599178314, "learning_rate": 4.0882804506386144e-05, "loss": 0.0154, "step": 11610 }, { "grad_norm": 0.24539002776145935, "learning_rate": 4.080152955206485e-05, "loss": 0.0163, "step": 11620 }, { "grad_norm": 0.2522529363632202, "learning_rate": 4.0720279746010505e-05, "loss": 0.0128, "step": 11630 }, { "grad_norm": 0.27717912197113037, "learning_rate": 4.063905531035699e-05, "loss": 0.0148, "step": 11640 }, { "grad_norm": 0.15500105917453766, "learning_rate": 4.055785646716882e-05, "loss": 0.0146, "step": 11650 }, { "grad_norm": 0.2712903618812561, "learning_rate": 4.047668343844051e-05, "loss": 0.0138, "step": 11660 }, { "grad_norm": 0.2597307562828064, "learning_rate": 4.039553644609604e-05, "loss": 0.0152, "step": 11670 }, { "grad_norm": 0.2404446005821228, "learning_rate": 4.0314415711988176e-05, "loss": 0.0134, "step": 11680 }, { "grad_norm": 0.2316468358039856, "learning_rate": 4.023332145789792e-05, "loss": 0.0138, "step": 11690 }, { "grad_norm": 0.3149365782737732, "learning_rate": 4.015225390553385e-05, "loss": 0.0143, "step": 11700 }, { "grad_norm": 0.23129740357398987, "learning_rate": 4.007121327653158e-05, "loss": 0.0173, "step": 11710 }, { "grad_norm": 0.20786169171333313, "learning_rate": 3.9990199792453064e-05, "loss": 0.0117, "step": 11720 }, { "grad_norm": 0.27979785203933716, "learning_rate": 3.9909213674786103e-05, "loss": 0.0137, "step": 11730 }, { "grad_norm": 0.27464741468429565, "learning_rate": 3.982825514494363e-05, "loss": 0.0134, "step": 11740 }, { "grad_norm": 0.26147720217704773, "learning_rate": 3.974732442426319e-05, "loss": 0.0128, "step": 11750 }, { "grad_norm": 0.33032962679862976, "learning_rate": 3.966642173400629e-05, "loss": 0.0149, "step": 11760 }, { "grad_norm": 0.27175289392471313, "learning_rate": 3.9585547295357764e-05, "loss": 0.0118, "step": 11770 }, { "grad_norm": 0.33374258875846863, "learning_rate": 3.950470132942526e-05, "loss": 0.0123, "step": 11780 }, { "grad_norm": 0.3060143291950226, "learning_rate": 3.942388405723856e-05, "loss": 0.0162, "step": 11790 }, { "grad_norm": 0.3007901906967163, "learning_rate": 3.9343095699749e-05, "loss": 0.0185, "step": 11800 }, { "grad_norm": 0.32913342118263245, "learning_rate": 3.9262336477828874e-05, "loss": 0.0169, "step": 11810 }, { "grad_norm": 0.47548192739486694, "learning_rate": 3.9181606612270794e-05, "loss": 0.0185, "step": 11820 }, { "grad_norm": 0.35671311616897583, "learning_rate": 3.910090632378713e-05, "loss": 0.017, "step": 11830 }, { "grad_norm": 0.25314125418663025, "learning_rate": 3.90202358330094e-05, "loss": 0.0146, "step": 11840 }, { "grad_norm": 0.1575855165719986, "learning_rate": 3.8939595360487656e-05, "loss": 0.016, "step": 11850 }, { "grad_norm": 0.26632601022720337, "learning_rate": 3.885898512668984e-05, "loss": 0.0152, "step": 11860 }, { "grad_norm": 0.33615413308143616, "learning_rate": 3.877840535200127e-05, "loss": 0.0165, "step": 11870 }, { "grad_norm": 0.3099479377269745, "learning_rate": 3.869785625672397e-05, "loss": 0.0121, "step": 11880 }, { "grad_norm": 0.27028030157089233, "learning_rate": 3.8617338061076094e-05, "loss": 0.0142, "step": 11890 }, { "grad_norm": 0.2619357109069824, "learning_rate": 3.853685098519132e-05, "loss": 0.0179, "step": 11900 }, { "grad_norm": 0.21196670830249786, "learning_rate": 3.845639524911823e-05, "loss": 0.0157, "step": 11910 }, { "grad_norm": 0.28729477524757385, "learning_rate": 3.837597107281974e-05, "loss": 0.0133, "step": 11920 }, { "grad_norm": 0.20308102667331696, "learning_rate": 3.829557867617247e-05, "loss": 0.0115, "step": 11930 }, { "grad_norm": 0.24132554233074188, "learning_rate": 3.821521827896618e-05, "loss": 0.0127, "step": 11940 }, { "grad_norm": 0.22524775564670563, "learning_rate": 3.81348901009031e-05, "loss": 0.0131, "step": 11950 }, { "grad_norm": 0.2080097496509552, "learning_rate": 3.805459436159741e-05, "loss": 0.0161, "step": 11960 }, { "grad_norm": 0.2566503882408142, "learning_rate": 3.797433128057461e-05, "loss": 0.0157, "step": 11970 }, { "grad_norm": 0.3335372507572174, "learning_rate": 3.789410107727089e-05, "loss": 0.0147, "step": 11980 }, { "grad_norm": 0.2251177579164505, "learning_rate": 3.781390397103257e-05, "loss": 0.0131, "step": 11990 }, { "grad_norm": 0.22937606275081635, "learning_rate": 3.7733740181115455e-05, "loss": 0.0126, "step": 12000 }, { "grad_norm": 0.3486773371696472, "learning_rate": 3.7653609926684306e-05, "loss": 0.0148, "step": 12010 }, { "grad_norm": 0.41840997338294983, "learning_rate": 3.757351342681217e-05, "loss": 0.0189, "step": 12020 }, { "grad_norm": 0.3011527359485626, "learning_rate": 3.749345090047982e-05, "loss": 0.0151, "step": 12030 }, { "grad_norm": 0.7086331844329834, "learning_rate": 3.741342256657515e-05, "loss": 0.0169, "step": 12040 }, { "grad_norm": 0.3649049401283264, "learning_rate": 3.7333428643892567e-05, "loss": 0.0193, "step": 12050 }, { "grad_norm": 0.621841549873352, "learning_rate": 3.725346935113239e-05, "loss": 0.0201, "step": 12060 }, { "grad_norm": 0.4488375186920166, "learning_rate": 3.717354490690029e-05, "loss": 0.0194, "step": 12070 }, { "grad_norm": 0.37582069635391235, "learning_rate": 3.709365552970664e-05, "loss": 0.0168, "step": 12080 }, { "grad_norm": 0.453851580619812, "learning_rate": 3.7013801437965945e-05, "loss": 0.0148, "step": 12090 }, { "grad_norm": 0.4050064980983734, "learning_rate": 3.693398284999623e-05, "loss": 0.0186, "step": 12100 }, { "grad_norm": 0.29123765230178833, "learning_rate": 3.6854199984018484e-05, "loss": 0.0138, "step": 12110 }, { "grad_norm": 0.3061200976371765, "learning_rate": 3.677445305815601e-05, "loss": 0.0142, "step": 12120 }, { "grad_norm": 0.26879048347473145, "learning_rate": 3.669474229043387e-05, "loss": 0.0115, "step": 12130 }, { "grad_norm": 0.18657571077346802, "learning_rate": 3.6615067898778235e-05, "loss": 0.0122, "step": 12140 }, { "grad_norm": 0.3175086975097656, "learning_rate": 3.6535430101015866e-05, "loss": 0.0132, "step": 12150 }, { "grad_norm": 0.22143538296222687, "learning_rate": 3.645582911487345e-05, "loss": 0.0195, "step": 12160 }, { "grad_norm": 0.18988686800003052, "learning_rate": 3.637626515797706e-05, "loss": 0.0129, "step": 12170 }, { "grad_norm": 0.2321450263261795, "learning_rate": 3.629673844785152e-05, "loss": 0.0152, "step": 12180 }, { "grad_norm": 0.21645230054855347, "learning_rate": 3.621724920191979e-05, "loss": 0.0164, "step": 12190 }, { "grad_norm": 0.22263093292713165, "learning_rate": 3.6137797637502444e-05, "loss": 0.0116, "step": 12200 }, { "grad_norm": 0.2953313887119293, "learning_rate": 3.6058383971817035e-05, "loss": 0.0142, "step": 12210 }, { "grad_norm": 0.22207173705101013, "learning_rate": 3.59790084219775e-05, "loss": 0.0131, "step": 12220 }, { "grad_norm": 1.6011451482772827, "learning_rate": 3.589967120499353e-05, "loss": 0.0176, "step": 12230 }, { "grad_norm": 0.3274059295654297, "learning_rate": 3.5820372537770075e-05, "loss": 0.0138, "step": 12240 }, { "grad_norm": 0.3755655884742737, "learning_rate": 3.5741112637106655e-05, "loss": 0.0134, "step": 12250 }, { "grad_norm": 0.37606120109558105, "learning_rate": 3.5661891719696804e-05, "loss": 0.0137, "step": 12260 }, { "grad_norm": 0.3550015687942505, "learning_rate": 3.5582710002127504e-05, "loss": 0.0148, "step": 12270 }, { "grad_norm": 0.33411192893981934, "learning_rate": 3.550356770087853e-05, "loss": 0.0144, "step": 12280 }, { "grad_norm": 0.37244799733161926, "learning_rate": 3.5424465032321914e-05, "loss": 0.0128, "step": 12290 }, { "grad_norm": 0.24733611941337585, "learning_rate": 3.5345402212721335e-05, "loss": 0.0106, "step": 12300 }, { "grad_norm": 0.2642270028591156, "learning_rate": 3.526637945823152e-05, "loss": 0.0125, "step": 12310 }, { "grad_norm": 0.2378457635641098, "learning_rate": 3.518739698489767e-05, "loss": 0.0117, "step": 12320 }, { "grad_norm": 0.3380733132362366, "learning_rate": 3.510845500865485e-05, "loss": 0.0115, "step": 12330 }, { "grad_norm": 0.1878862977027893, "learning_rate": 3.502955374532739e-05, "loss": 0.0115, "step": 12340 }, { "grad_norm": 0.34695613384246826, "learning_rate": 3.495069341062836e-05, "loss": 0.0116, "step": 12350 }, { "grad_norm": 0.20670433342456818, "learning_rate": 3.4871874220158896e-05, "loss": 0.0131, "step": 12360 }, { "grad_norm": 0.2801567614078522, "learning_rate": 3.479309638940762e-05, "loss": 0.013, "step": 12370 }, { "grad_norm": 0.24346186220645905, "learning_rate": 3.4714360133750146e-05, "loss": 0.0162, "step": 12380 }, { "grad_norm": 0.3051888644695282, "learning_rate": 3.463566566844839e-05, "loss": 0.0111, "step": 12390 }, { "grad_norm": 0.2538609504699707, "learning_rate": 3.4557013208650016e-05, "loss": 0.0148, "step": 12400 }, { "grad_norm": 0.3442012071609497, "learning_rate": 3.4478402969387857e-05, "loss": 0.0178, "step": 12410 }, { "grad_norm": 0.22276058793067932, "learning_rate": 3.4399835165579266e-05, "loss": 0.0157, "step": 12420 }, { "grad_norm": 0.29449889063835144, "learning_rate": 3.4321310012025645e-05, "loss": 0.0133, "step": 12430 }, { "grad_norm": 0.26229244470596313, "learning_rate": 3.424282772341176e-05, "loss": 0.0118, "step": 12440 }, { "grad_norm": 0.18634235858917236, "learning_rate": 3.416438851430519e-05, "loss": 0.012, "step": 12450 }, { "grad_norm": 0.2873656749725342, "learning_rate": 3.408599259915577e-05, "loss": 0.0166, "step": 12460 }, { "grad_norm": 0.30891790986061096, "learning_rate": 3.400764019229487e-05, "loss": 0.0117, "step": 12470 }, { "grad_norm": 0.4763983488082886, "learning_rate": 3.3929331507935035e-05, "loss": 0.0156, "step": 12480 }, { "grad_norm": 0.19338323175907135, "learning_rate": 3.3851066760169196e-05, "loss": 0.0121, "step": 12490 }, { "grad_norm": 0.32566213607788086, "learning_rate": 3.377284616297021e-05, "loss": 0.0112, "step": 12500 }, { "grad_norm": 0.2887798547744751, "learning_rate": 3.3694669930190166e-05, "loss": 0.0114, "step": 12510 }, { "grad_norm": 0.2245304435491562, "learning_rate": 3.36165382755599e-05, "loss": 0.011, "step": 12520 }, { "grad_norm": 0.17832738161087036, "learning_rate": 3.35384514126884e-05, "loss": 0.0124, "step": 12530 }, { "grad_norm": 0.454369455575943, "learning_rate": 3.3460409555062154e-05, "loss": 0.0122, "step": 12540 }, { "grad_norm": 0.34253284335136414, "learning_rate": 3.3382412916044645e-05, "loss": 0.0126, "step": 12550 }, { "grad_norm": 0.2619803845882416, "learning_rate": 3.330446170887566e-05, "loss": 0.0108, "step": 12560 }, { "grad_norm": 0.302083283662796, "learning_rate": 3.3226556146670834e-05, "loss": 0.0147, "step": 12570 }, { "grad_norm": 0.3751260042190552, "learning_rate": 3.314869644242102e-05, "loss": 0.0124, "step": 12580 }, { "grad_norm": 0.21409058570861816, "learning_rate": 3.3070882808991674e-05, "loss": 0.0117, "step": 12590 }, { "grad_norm": 0.27636516094207764, "learning_rate": 3.2993115459122305e-05, "loss": 0.0138, "step": 12600 }, { "grad_norm": 0.20102179050445557, "learning_rate": 3.2915394605425835e-05, "loss": 0.0108, "step": 12610 }, { "grad_norm": 0.19911763072013855, "learning_rate": 3.283772046038816e-05, "loss": 0.0111, "step": 12620 }, { "grad_norm": 0.31832626461982727, "learning_rate": 3.276009323636739e-05, "loss": 0.0154, "step": 12630 }, { "grad_norm": 0.1746872365474701, "learning_rate": 3.268251314559344e-05, "loss": 0.0141, "step": 12640 }, { "grad_norm": 0.1790608912706375, "learning_rate": 3.2604980400167254e-05, "loss": 0.0113, "step": 12650 }, { "grad_norm": 0.2451590895652771, "learning_rate": 3.252749521206042e-05, "loss": 0.0121, "step": 12660 }, { "grad_norm": 0.2962441146373749, "learning_rate": 3.2450057793114494e-05, "loss": 0.0115, "step": 12670 }, { "grad_norm": 0.2144993543624878, "learning_rate": 3.2372668355040435e-05, "loss": 0.0086, "step": 12680 }, { "grad_norm": 0.23686689138412476, "learning_rate": 3.2295327109418005e-05, "loss": 0.017, "step": 12690 }, { "grad_norm": 0.22344915568828583, "learning_rate": 3.221803426769518e-05, "loss": 0.0138, "step": 12700 }, { "grad_norm": 0.21418622136116028, "learning_rate": 3.214079004118768e-05, "loss": 0.011, "step": 12710 }, { "grad_norm": 0.21207351982593536, "learning_rate": 3.2063594641078234e-05, "loss": 0.0104, "step": 12720 }, { "grad_norm": 0.20806486904621124, "learning_rate": 3.198644827841616e-05, "loss": 0.0126, "step": 12730 }, { "grad_norm": 0.28037571907043457, "learning_rate": 3.1909351164116654e-05, "loss": 0.0129, "step": 12740 }, { "grad_norm": 0.20963534712791443, "learning_rate": 3.183230350896026e-05, "loss": 0.0103, "step": 12750 }, { "grad_norm": 0.18428194522857666, "learning_rate": 3.1755305523592337e-05, "loss": 0.0138, "step": 12760 }, { "grad_norm": 0.19743488729000092, "learning_rate": 3.167835741852245e-05, "loss": 0.0113, "step": 12770 }, { "grad_norm": 0.29466521739959717, "learning_rate": 3.160145940412378e-05, "loss": 0.0165, "step": 12780 }, { "grad_norm": 0.30787092447280884, "learning_rate": 3.1524611690632545e-05, "loss": 0.0152, "step": 12790 }, { "grad_norm": 0.2704426348209381, "learning_rate": 3.144781448814746e-05, "loss": 0.0168, "step": 12800 }, { "grad_norm": 0.38203495740890503, "learning_rate": 3.1371068006629145e-05, "loss": 0.0143, "step": 12810 }, { "grad_norm": 0.7181354761123657, "learning_rate": 3.129437245589956e-05, "loss": 0.0145, "step": 12820 }, { "grad_norm": 0.3190256953239441, "learning_rate": 3.121772804564143e-05, "loss": 0.0131, "step": 12830 }, { "grad_norm": 0.2996603846549988, "learning_rate": 3.11411349853976e-05, "loss": 0.0159, "step": 12840 }, { "grad_norm": 0.3407917320728302, "learning_rate": 3.10645934845706e-05, "loss": 0.0109, "step": 12850 }, { "grad_norm": 0.251583456993103, "learning_rate": 3.098810375242196e-05, "loss": 0.0128, "step": 12860 }, { "grad_norm": 0.1745646893978119, "learning_rate": 3.0911665998071704e-05, "loss": 0.0123, "step": 12870 }, { "grad_norm": 0.3165740370750427, "learning_rate": 3.083528043049774e-05, "loss": 0.0124, "step": 12880 }, { "grad_norm": 0.2555304169654846, "learning_rate": 3.0758947258535255e-05, "loss": 0.0112, "step": 12890 }, { "grad_norm": 0.19649581611156464, "learning_rate": 3.068266669087625e-05, "loss": 0.0114, "step": 12900 }, { "grad_norm": 0.28474751114845276, "learning_rate": 3.060643893606887e-05, "loss": 0.0107, "step": 12910 }, { "grad_norm": 0.2677845060825348, "learning_rate": 3.053026420251693e-05, "loss": 0.014, "step": 12920 }, { "grad_norm": 0.24403074383735657, "learning_rate": 3.0454142698479183e-05, "loss": 0.011, "step": 12930 }, { "grad_norm": 0.25357353687286377, "learning_rate": 3.0378074632068954e-05, "loss": 0.0113, "step": 12940 }, { "grad_norm": 0.2643353343009949, "learning_rate": 3.0302060211253408e-05, "loss": 0.0116, "step": 12950 }, { "grad_norm": 0.6871694326400757, "learning_rate": 3.0226099643853073e-05, "loss": 0.0132, "step": 12960 }, { "grad_norm": 0.2366318702697754, "learning_rate": 3.0150193137541283e-05, "loss": 0.0125, "step": 12970 }, { "grad_norm": 0.3239292800426483, "learning_rate": 3.0074340899843467e-05, "loss": 0.0139, "step": 12980 }, { "grad_norm": 0.2967315912246704, "learning_rate": 2.999854313813677e-05, "loss": 0.012, "step": 12990 }, { "grad_norm": 0.27742961049079895, "learning_rate": 2.9922800059649382e-05, "loss": 0.0127, "step": 13000 }, { "grad_norm": 0.23715196549892426, "learning_rate": 2.9847111871459976e-05, "loss": 0.0128, "step": 13010 }, { "grad_norm": 0.2771082818508148, "learning_rate": 2.977147878049721e-05, "loss": 0.0126, "step": 13020 }, { "grad_norm": 0.2596262991428375, "learning_rate": 2.9695900993539006e-05, "loss": 0.0118, "step": 13030 }, { "grad_norm": 0.21382467448711395, "learning_rate": 2.9620378717212183e-05, "loss": 0.0111, "step": 13040 }, { "grad_norm": 0.3130057454109192, "learning_rate": 2.9544912157991745e-05, "loss": 0.013, "step": 13050 }, { "grad_norm": 0.19073966145515442, "learning_rate": 2.9469501522200405e-05, "loss": 0.0127, "step": 13060 }, { "grad_norm": 0.22757293283939362, "learning_rate": 2.9394147016007946e-05, "loss": 0.0094, "step": 13070 }, { "grad_norm": 0.25903284549713135, "learning_rate": 2.9318848845430702e-05, "loss": 0.0104, "step": 13080 }, { "grad_norm": 0.2773730456829071, "learning_rate": 2.9243607216331013e-05, "loss": 0.0125, "step": 13090 }, { "grad_norm": 0.19922521710395813, "learning_rate": 2.916842233441661e-05, "loss": 0.0121, "step": 13100 }, { "grad_norm": 0.26156315207481384, "learning_rate": 2.90932944052401e-05, "loss": 0.0102, "step": 13110 }, { "grad_norm": 0.34598907828330994, "learning_rate": 2.9018223634198354e-05, "loss": 0.0117, "step": 13120 }, { "grad_norm": 0.5064572691917419, "learning_rate": 2.8943210226532025e-05, "loss": 0.0131, "step": 13130 }, { "grad_norm": 0.44640839099884033, "learning_rate": 2.8868254387324857e-05, "loss": 0.0125, "step": 13140 }, { "grad_norm": 0.6459829211235046, "learning_rate": 2.8793356321503306e-05, "loss": 0.0122, "step": 13150 }, { "grad_norm": 0.32009294629096985, "learning_rate": 2.87185162338358e-05, "loss": 0.014, "step": 13160 }, { "grad_norm": 0.46464258432388306, "learning_rate": 2.8643734328932253e-05, "loss": 0.0165, "step": 13170 }, { "grad_norm": 0.37637051939964294, "learning_rate": 2.856901081124359e-05, "loss": 0.0149, "step": 13180 }, { "grad_norm": 0.20249499380588531, "learning_rate": 2.8494345885061002e-05, "loss": 0.0144, "step": 13190 }, { "grad_norm": 0.2097480744123459, "learning_rate": 2.8419739754515616e-05, "loss": 0.0149, "step": 13200 }, { "grad_norm": 0.29072973132133484, "learning_rate": 2.8345192623577666e-05, "loss": 0.0114, "step": 13210 }, { "grad_norm": 0.23341014981269836, "learning_rate": 2.8270704696056193e-05, "loss": 0.0129, "step": 13220 }, { "grad_norm": 0.944629430770874, "learning_rate": 2.8196276175598367e-05, "loss": 0.0125, "step": 13230 }, { "grad_norm": 0.1528407335281372, "learning_rate": 2.8121907265688884e-05, "loss": 0.0129, "step": 13240 }, { "grad_norm": 0.21479587256908417, "learning_rate": 2.804759816964957e-05, "loss": 0.013, "step": 13250 }, { "grad_norm": 0.21966037154197693, "learning_rate": 2.797334909063857e-05, "loss": 0.0144, "step": 13260 }, { "grad_norm": 0.24202066659927368, "learning_rate": 2.7899160231650056e-05, "loss": 0.0124, "step": 13270 }, { "grad_norm": 0.2061278074979782, "learning_rate": 2.7825031795513585e-05, "loss": 0.0124, "step": 13280 }, { "grad_norm": 0.2511097192764282, "learning_rate": 2.775096398489341e-05, "loss": 0.0147, "step": 13290 }, { "grad_norm": 0.31790435314178467, "learning_rate": 2.7676957002288163e-05, "loss": 0.0149, "step": 13300 }, { "grad_norm": 0.20916563272476196, "learning_rate": 2.760301105003003e-05, "loss": 0.012, "step": 13310 }, { "grad_norm": 0.25818222761154175, "learning_rate": 2.752912633028446e-05, "loss": 0.0131, "step": 13320 }, { "grad_norm": 0.26836761832237244, "learning_rate": 2.7455303045049474e-05, "loss": 0.0135, "step": 13330 }, { "grad_norm": 0.3333231806755066, "learning_rate": 2.7381541396155098e-05, "loss": 0.0113, "step": 13340 }, { "grad_norm": 0.3425179123878479, "learning_rate": 2.730784158526286e-05, "loss": 0.0104, "step": 13350 }, { "grad_norm": 0.3231576681137085, "learning_rate": 2.723420381386521e-05, "loss": 0.0135, "step": 13360 }, { "grad_norm": 0.2460070103406906, "learning_rate": 2.7160628283285018e-05, "loss": 0.0125, "step": 13370 }, { "grad_norm": 0.19986805319786072, "learning_rate": 2.7087115194675007e-05, "loss": 0.0125, "step": 13380 }, { "grad_norm": 0.20227845013141632, "learning_rate": 2.701366474901712e-05, "loss": 0.0137, "step": 13390 }, { "grad_norm": 1.0397449731826782, "learning_rate": 2.6940277147122085e-05, "loss": 0.0116, "step": 13400 }, { "grad_norm": 0.32964572310447693, "learning_rate": 2.686695258962878e-05, "loss": 0.0114, "step": 13410 }, { "grad_norm": 0.6746490597724915, "learning_rate": 2.679369127700375e-05, "loss": 0.0125, "step": 13420 }, { "grad_norm": 0.275610089302063, "learning_rate": 2.672049340954067e-05, "loss": 0.0115, "step": 13430 }, { "grad_norm": 0.22082962095737457, "learning_rate": 2.6647359187359676e-05, "loss": 0.0141, "step": 13440 }, { "grad_norm": 0.2911020517349243, "learning_rate": 2.6574288810406946e-05, "loss": 0.0123, "step": 13450 }, { "grad_norm": 0.2505112588405609, "learning_rate": 2.6501282478454083e-05, "loss": 0.0142, "step": 13460 }, { "grad_norm": 0.24259142577648163, "learning_rate": 2.6428340391097618e-05, "loss": 0.0125, "step": 13470 }, { "grad_norm": 0.21417009830474854, "learning_rate": 2.6355462747758485e-05, "loss": 0.0127, "step": 13480 }, { "grad_norm": 0.2628515660762787, "learning_rate": 2.6282649747681304e-05, "loss": 0.0163, "step": 13490 }, { "grad_norm": 0.19195199012756348, "learning_rate": 2.620990158993406e-05, "loss": 0.0142, "step": 13500 }, { "grad_norm": 0.25083696842193604, "learning_rate": 2.6137218473407477e-05, "loss": 0.015, "step": 13510 }, { "grad_norm": 0.21779127418994904, "learning_rate": 2.606460059681436e-05, "loss": 0.0097, "step": 13520 }, { "grad_norm": 0.21074676513671875, "learning_rate": 2.599204815868928e-05, "loss": 0.0113, "step": 13530 }, { "grad_norm": 0.20686055719852448, "learning_rate": 2.5919561357387756e-05, "loss": 0.0168, "step": 13540 }, { "grad_norm": 0.29611101746559143, "learning_rate": 2.5847140391085972e-05, "loss": 0.0115, "step": 13550 }, { "grad_norm": 0.27347975969314575, "learning_rate": 2.5774785457780103e-05, "loss": 0.011, "step": 13560 }, { "grad_norm": 0.2213997095823288, "learning_rate": 2.5702496755285753e-05, "loss": 0.0076, "step": 13570 }, { "grad_norm": 0.2341260015964508, "learning_rate": 2.5630274481237483e-05, "loss": 0.0121, "step": 13580 }, { "grad_norm": 0.16575971245765686, "learning_rate": 2.5558118833088197e-05, "loss": 0.0106, "step": 13590 }, { "grad_norm": 0.2699681222438812, "learning_rate": 2.548603000810872e-05, "loss": 0.0113, "step": 13600 }, { "grad_norm": 0.22295697033405304, "learning_rate": 2.5414008203387152e-05, "loss": 0.0134, "step": 13610 }, { "grad_norm": 0.5171917676925659, "learning_rate": 2.534205361582834e-05, "loss": 0.0099, "step": 13620 }, { "grad_norm": 0.2957369089126587, "learning_rate": 2.527016644215338e-05, "loss": 0.0121, "step": 13630 }, { "grad_norm": 0.2450408637523651, "learning_rate": 2.519834687889905e-05, "loss": 0.012, "step": 13640 }, { "grad_norm": 0.2295277714729309, "learning_rate": 2.5126595122417295e-05, "loss": 0.01, "step": 13650 }, { "grad_norm": 0.28740793466567993, "learning_rate": 2.5054911368874713e-05, "loss": 0.0115, "step": 13660 }, { "grad_norm": 0.26296743750572205, "learning_rate": 2.4983295814251916e-05, "loss": 0.0081, "step": 13670 }, { "grad_norm": 0.28540363907814026, "learning_rate": 2.4911748654343105e-05, "loss": 0.0132, "step": 13680 }, { "grad_norm": 0.273831844329834, "learning_rate": 2.4840270084755463e-05, "loss": 0.0097, "step": 13690 }, { "grad_norm": 0.24562785029411316, "learning_rate": 2.4768860300908685e-05, "loss": 0.0099, "step": 13700 }, { "grad_norm": 0.29464760422706604, "learning_rate": 2.469751949803443e-05, "loss": 0.0135, "step": 13710 }, { "grad_norm": 2.670888662338257, "learning_rate": 2.4626247871175666e-05, "loss": 0.0119, "step": 13720 }, { "grad_norm": 0.30418896675109863, "learning_rate": 2.4555045615186346e-05, "loss": 0.0116, "step": 13730 }, { "grad_norm": 0.31108617782592773, "learning_rate": 2.4483912924730677e-05, "loss": 0.0123, "step": 13740 }, { "grad_norm": 0.4150383770465851, "learning_rate": 2.4412849994282742e-05, "loss": 0.0152, "step": 13750 }, { "grad_norm": 0.23592619597911835, "learning_rate": 2.434185701812592e-05, "loss": 0.0099, "step": 13760 }, { "grad_norm": 0.33602234721183777, "learning_rate": 2.4270934190352218e-05, "loss": 0.0098, "step": 13770 }, { "grad_norm": 0.24652159214019775, "learning_rate": 2.4200081704861998e-05, "loss": 0.0103, "step": 13780 }, { "grad_norm": 0.42568060755729675, "learning_rate": 2.412929975536321e-05, "loss": 0.0137, "step": 13790 }, { "grad_norm": 0.3073839545249939, "learning_rate": 2.4058588535371017e-05, "loss": 0.0136, "step": 13800 }, { "grad_norm": 0.3546251356601715, "learning_rate": 2.3987948238207243e-05, "loss": 0.0142, "step": 13810 }, { "grad_norm": 0.2533576488494873, "learning_rate": 2.3917379056999678e-05, "loss": 0.0114, "step": 13820 }, { "grad_norm": 0.43684619665145874, "learning_rate": 2.3846881184681824e-05, "loss": 0.014, "step": 13830 }, { "grad_norm": 0.23682211339473724, "learning_rate": 2.377645481399214e-05, "loss": 0.0107, "step": 13840 }, { "grad_norm": 0.29054197669029236, "learning_rate": 2.3706100137473667e-05, "loss": 0.01, "step": 13850 }, { "grad_norm": 0.44751688838005066, "learning_rate": 2.3635817347473394e-05, "loss": 0.0147, "step": 13860 }, { "grad_norm": 0.2930699288845062, "learning_rate": 2.3565606636141757e-05, "loss": 0.0102, "step": 13870 }, { "grad_norm": 0.22456783056259155, "learning_rate": 2.3495468195432203e-05, "loss": 0.0112, "step": 13880 }, { "grad_norm": 0.3287985622882843, "learning_rate": 2.3425402217100507e-05, "loss": 0.0104, "step": 13890 }, { "grad_norm": 0.22914916276931763, "learning_rate": 2.3355408892704424e-05, "loss": 0.0137, "step": 13900 }, { "grad_norm": 0.26880890130996704, "learning_rate": 2.3285488413603003e-05, "loss": 0.0089, "step": 13910 }, { "grad_norm": 0.20252776145935059, "learning_rate": 2.321564097095615e-05, "loss": 0.0127, "step": 13920 }, { "grad_norm": 0.27722981572151184, "learning_rate": 2.3145866755724142e-05, "loss": 0.0111, "step": 13930 }, { "grad_norm": 0.1980714350938797, "learning_rate": 2.307616595866699e-05, "loss": 0.011, "step": 13940 }, { "grad_norm": 0.25533467531204224, "learning_rate": 2.3006538770344032e-05, "loss": 0.0125, "step": 13950 }, { "grad_norm": 0.196397602558136, "learning_rate": 2.293698538111334e-05, "loss": 0.0122, "step": 13960 }, { "grad_norm": 0.22355696558952332, "learning_rate": 2.28675059811312e-05, "loss": 0.0093, "step": 13970 }, { "grad_norm": 0.21054349839687347, "learning_rate": 2.279810076035167e-05, "loss": 0.0104, "step": 13980 }, { "grad_norm": 0.1631699502468109, "learning_rate": 2.272876990852596e-05, "loss": 0.0123, "step": 13990 }, { "grad_norm": 0.1871449053287506, "learning_rate": 2.265951361520195e-05, "loss": 0.0091, "step": 14000 }, { "grad_norm": 0.21156345307826996, "learning_rate": 2.2590332069723748e-05, "loss": 0.0117, "step": 14010 }, { "grad_norm": 0.25709375739097595, "learning_rate": 2.2521225461231004e-05, "loss": 0.012, "step": 14020 }, { "grad_norm": 0.1850123554468155, "learning_rate": 2.2452193978658597e-05, "loss": 0.0099, "step": 14030 }, { "grad_norm": 0.2465033382177353, "learning_rate": 2.238323781073594e-05, "loss": 0.0085, "step": 14040 }, { "grad_norm": 0.20396371185779572, "learning_rate": 2.2314357145986552e-05, "loss": 0.0076, "step": 14050 }, { "grad_norm": 0.304443895816803, "learning_rate": 2.224555217272757e-05, "loss": 0.01, "step": 14060 }, { "grad_norm": 0.23927661776542664, "learning_rate": 2.2176823079069127e-05, "loss": 0.0121, "step": 14070 }, { "grad_norm": 0.17423300445079803, "learning_rate": 2.210817005291398e-05, "loss": 0.0101, "step": 14080 }, { "grad_norm": 0.23636728525161743, "learning_rate": 2.203959328195686e-05, "loss": 0.0096, "step": 14090 }, { "grad_norm": 0.2560155391693115, "learning_rate": 2.1971092953684026e-05, "loss": 0.0086, "step": 14100 }, { "grad_norm": 0.26385796070098877, "learning_rate": 2.1902669255372788e-05, "loss": 0.0098, "step": 14110 }, { "grad_norm": 0.22127318382263184, "learning_rate": 2.1834322374090897e-05, "loss": 0.0086, "step": 14120 }, { "grad_norm": 0.28687700629234314, "learning_rate": 2.1766052496696153e-05, "loss": 0.0115, "step": 14130 }, { "grad_norm": 0.2743799388408661, "learning_rate": 2.169785980983577e-05, "loss": 0.0105, "step": 14140 }, { "grad_norm": 0.24283431470394135, "learning_rate": 2.162974449994593e-05, "loss": 0.0096, "step": 14150 }, { "grad_norm": 0.2299596667289734, "learning_rate": 2.1561706753251337e-05, "loss": 0.011, "step": 14160 }, { "grad_norm": 0.257315456867218, "learning_rate": 2.1493746755764544e-05, "loss": 0.0107, "step": 14170 }, { "grad_norm": 0.19137918949127197, "learning_rate": 2.1425864693285635e-05, "loss": 0.009, "step": 14180 }, { "grad_norm": 0.16531063616275787, "learning_rate": 2.1358060751401547e-05, "loss": 0.0099, "step": 14190 }, { "grad_norm": 0.2593347430229187, "learning_rate": 2.129033511548566e-05, "loss": 0.0098, "step": 14200 }, { "grad_norm": 0.33044132590293884, "learning_rate": 2.1222687970697315e-05, "loss": 0.0106, "step": 14210 }, { "grad_norm": 0.272811621427536, "learning_rate": 2.1155119501981173e-05, "loss": 0.0101, "step": 14220 }, { "grad_norm": 0.21633927524089813, "learning_rate": 2.1087629894066895e-05, "loss": 0.0103, "step": 14230 }, { "grad_norm": 0.27812865376472473, "learning_rate": 2.1020219331468473e-05, "loss": 0.0096, "step": 14240 }, { "grad_norm": 0.28216585516929626, "learning_rate": 2.095288799848379e-05, "loss": 0.0103, "step": 14250 }, { "grad_norm": 0.2222496122121811, "learning_rate": 2.088563607919417e-05, "loss": 0.0113, "step": 14260 }, { "grad_norm": 0.17181536555290222, "learning_rate": 2.0818463757463786e-05, "loss": 0.0095, "step": 14270 }, { "grad_norm": 0.2349943071603775, "learning_rate": 2.0751371216939175e-05, "loss": 0.009, "step": 14280 }, { "grad_norm": 0.18707339465618134, "learning_rate": 2.068435864104882e-05, "loss": 0.0083, "step": 14290 }, { "grad_norm": 0.20596297085285187, "learning_rate": 2.0617426213002506e-05, "loss": 0.0091, "step": 14300 }, { "grad_norm": 0.2507011890411377, "learning_rate": 2.055057411579097e-05, "loss": 0.0088, "step": 14310 }, { "grad_norm": 0.3093428611755371, "learning_rate": 2.0483802532185286e-05, "loss": 0.0124, "step": 14320 }, { "grad_norm": 0.3834534287452698, "learning_rate": 2.041711164473638e-05, "loss": 0.0136, "step": 14330 }, { "grad_norm": 1.6089227199554443, "learning_rate": 2.0350501635774637e-05, "loss": 0.0126, "step": 14340 }, { "grad_norm": 0.5378519892692566, "learning_rate": 2.0283972687409247e-05, "loss": 0.0107, "step": 14350 }, { "grad_norm": 0.2899678349494934, "learning_rate": 2.021752498152784e-05, "loss": 0.0128, "step": 14360 }, { "grad_norm": 0.2069404274225235, "learning_rate": 2.015115869979589e-05, "loss": 0.0121, "step": 14370 }, { "grad_norm": 0.1375165581703186, "learning_rate": 2.0084874023656265e-05, "loss": 0.01, "step": 14380 }, { "grad_norm": 0.3515959084033966, "learning_rate": 2.001867113432877e-05, "loss": 0.0119, "step": 14390 }, { "grad_norm": 0.20674286782741547, "learning_rate": 1.995255021280954e-05, "loss": 0.0107, "step": 14400 }, { "grad_norm": 0.19261492788791656, "learning_rate": 1.9886511439870688e-05, "loss": 0.0112, "step": 14410 }, { "grad_norm": 0.16453218460083008, "learning_rate": 1.9820554996059675e-05, "loss": 0.009, "step": 14420 }, { "grad_norm": 0.2232791930437088, "learning_rate": 1.9754681061698893e-05, "loss": 0.0098, "step": 14430 }, { "grad_norm": 0.2972833812236786, "learning_rate": 1.9688889816885185e-05, "loss": 0.0109, "step": 14440 }, { "grad_norm": 0.25360363721847534, "learning_rate": 1.962318144148928e-05, "loss": 0.0138, "step": 14450 }, { "grad_norm": 0.22378063201904297, "learning_rate": 1.955755611515539e-05, "loss": 0.0108, "step": 14460 }, { "grad_norm": 0.2836368978023529, "learning_rate": 1.9492014017300642e-05, "loss": 0.0113, "step": 14470 }, { "grad_norm": 0.27458497881889343, "learning_rate": 1.942655532711461e-05, "loss": 0.0127, "step": 14480 }, { "grad_norm": 0.2615237832069397, "learning_rate": 1.9361180223558882e-05, "loss": 0.0097, "step": 14490 }, { "grad_norm": 0.20921631157398224, "learning_rate": 1.929588888536647e-05, "loss": 0.009, "step": 14500 }, { "grad_norm": 0.23652184009552002, "learning_rate": 1.9230681491041425e-05, "loss": 0.0099, "step": 14510 }, { "grad_norm": 0.2607056796550751, "learning_rate": 1.9165558218858264e-05, "loss": 0.0081, "step": 14520 }, { "grad_norm": 0.20434625446796417, "learning_rate": 1.9100519246861505e-05, "loss": 0.0101, "step": 14530 }, { "grad_norm": 0.3184315860271454, "learning_rate": 1.9035564752865248e-05, "loss": 0.0103, "step": 14540 }, { "grad_norm": 0.18696974217891693, "learning_rate": 1.897069491445258e-05, "loss": 0.0111, "step": 14550 }, { "grad_norm": 0.2383071482181549, "learning_rate": 1.890590990897515e-05, "loss": 0.0107, "step": 14560 }, { "grad_norm": 0.260602205991745, "learning_rate": 1.884120991355272e-05, "loss": 0.0096, "step": 14570 }, { "grad_norm": 0.1627962589263916, "learning_rate": 1.8776595105072576e-05, "loss": 0.0111, "step": 14580 }, { "grad_norm": 0.3067907392978668, "learning_rate": 1.8712065660189166e-05, "loss": 0.0092, "step": 14590 }, { "grad_norm": 0.22513465583324432, "learning_rate": 1.8647621755323513e-05, "loss": 0.0093, "step": 14600 }, { "grad_norm": 0.22123558819293976, "learning_rate": 1.858326356666278e-05, "loss": 0.0099, "step": 14610 }, { "grad_norm": 0.2746274471282959, "learning_rate": 1.851899127015983e-05, "loss": 0.0092, "step": 14620 }, { "grad_norm": 0.2846812903881073, "learning_rate": 1.8454805041532626e-05, "loss": 0.0133, "step": 14630 }, { "grad_norm": 0.35010507702827454, "learning_rate": 1.8390705056263906e-05, "loss": 0.0116, "step": 14640 }, { "grad_norm": 0.21462136507034302, "learning_rate": 1.832669148960057e-05, "loss": 0.0089, "step": 14650 }, { "grad_norm": 0.16179300844669342, "learning_rate": 1.8262764516553233e-05, "loss": 0.0073, "step": 14660 }, { "grad_norm": 0.245867058634758, "learning_rate": 1.8198924311895843e-05, "loss": 0.0081, "step": 14670 }, { "grad_norm": 0.17699289321899414, "learning_rate": 1.813517105016505e-05, "loss": 0.0088, "step": 14680 }, { "grad_norm": 0.18229536712169647, "learning_rate": 1.8071504905659888e-05, "loss": 0.0092, "step": 14690 }, { "grad_norm": 0.20560669898986816, "learning_rate": 1.800792605244109e-05, "loss": 0.009, "step": 14700 }, { "grad_norm": 0.2790718078613281, "learning_rate": 1.7944434664330844e-05, "loss": 0.0174, "step": 14710 }, { "grad_norm": 0.23581518232822418, "learning_rate": 1.7881030914912212e-05, "loss": 0.0099, "step": 14720 }, { "grad_norm": 0.2908840775489807, "learning_rate": 1.7817714977528577e-05, "loss": 0.0098, "step": 14730 }, { "grad_norm": 0.2973950207233429, "learning_rate": 1.7754487025283332e-05, "loss": 0.0127, "step": 14740 }, { "grad_norm": 0.32283443212509155, "learning_rate": 1.7691347231039275e-05, "loss": 0.01, "step": 14750 }, { "grad_norm": 0.2502584457397461, "learning_rate": 1.7628295767418164e-05, "loss": 0.0109, "step": 14760 }, { "grad_norm": 0.3543379604816437, "learning_rate": 1.7565332806800333e-05, "loss": 0.012, "step": 14770 }, { "grad_norm": 0.2470313161611557, "learning_rate": 1.750245852132408e-05, "loss": 0.0096, "step": 14780 }, { "grad_norm": 0.20778900384902954, "learning_rate": 1.7439673082885323e-05, "loss": 0.0102, "step": 14790 }, { "grad_norm": 0.16823366284370422, "learning_rate": 1.7376976663137047e-05, "loss": 0.0071, "step": 14800 }, { "grad_norm": 0.22880259156227112, "learning_rate": 1.7314369433488853e-05, "loss": 0.0098, "step": 14810 }, { "grad_norm": 0.17701993882656097, "learning_rate": 1.7251851565106548e-05, "loss": 0.0083, "step": 14820 }, { "grad_norm": 0.32732093334198, "learning_rate": 1.7189423228911574e-05, "loss": 0.0105, "step": 14830 }, { "grad_norm": 0.30788835883140564, "learning_rate": 1.7127084595580606e-05, "loss": 0.0102, "step": 14840 }, { "grad_norm": 0.27838122844696045, "learning_rate": 1.706483583554513e-05, "loss": 0.0086, "step": 14850 }, { "grad_norm": 0.2307506799697876, "learning_rate": 1.700267711899083e-05, "loss": 0.0114, "step": 14860 }, { "grad_norm": 0.37891414761543274, "learning_rate": 1.69406086158573e-05, "loss": 0.0099, "step": 14870 }, { "grad_norm": 0.38476869463920593, "learning_rate": 1.6878630495837455e-05, "loss": 0.0112, "step": 14880 }, { "grad_norm": 0.36826103925704956, "learning_rate": 1.681674292837707e-05, "loss": 0.0111, "step": 14890 }, { "grad_norm": 0.33632421493530273, "learning_rate": 1.6754946082674444e-05, "loss": 0.0095, "step": 14900 }, { "grad_norm": 0.4165757894515991, "learning_rate": 1.6693240127679748e-05, "loss": 0.0107, "step": 14910 }, { "grad_norm": 0.31078946590423584, "learning_rate": 1.663162523209475e-05, "loss": 0.0153, "step": 14920 }, { "grad_norm": 0.23191437125205994, "learning_rate": 1.6570101564372193e-05, "loss": 0.0098, "step": 14930 }, { "grad_norm": 0.24041399359703064, "learning_rate": 1.650866929271543e-05, "loss": 0.0102, "step": 14940 }, { "grad_norm": 0.24999092519283295, "learning_rate": 1.644732858507797e-05, "loss": 0.0099, "step": 14950 }, { "grad_norm": 0.2547627091407776, "learning_rate": 1.6386079609162943e-05, "loss": 0.011, "step": 14960 }, { "grad_norm": 0.42518699169158936, "learning_rate": 1.6324922532422742e-05, "loss": 0.009, "step": 14970 }, { "grad_norm": 0.316510409116745, "learning_rate": 1.6263857522058434e-05, "loss": 0.013, "step": 14980 }, { "grad_norm": 0.27327632904052734, "learning_rate": 1.6202884745019443e-05, "loss": 0.0126, "step": 14990 }, { "grad_norm": 0.3026086091995239, "learning_rate": 1.614200436800304e-05, "loss": 0.0112, "step": 15000 }, { "grad_norm": 0.29821979999542236, "learning_rate": 1.6081216557453814e-05, "loss": 0.0091, "step": 15010 }, { "grad_norm": 0.22507360577583313, "learning_rate": 1.6020521479563367e-05, "loss": 0.0082, "step": 15020 }, { "grad_norm": 0.21088339388370514, "learning_rate": 1.5959919300269654e-05, "loss": 0.013, "step": 15030 }, { "grad_norm": 0.3412456214427948, "learning_rate": 1.5899410185256764e-05, "loss": 0.0113, "step": 15040 }, { "grad_norm": 0.17136356234550476, "learning_rate": 1.583899429995431e-05, "loss": 0.0092, "step": 15050 }, { "grad_norm": 0.20491176843643188, "learning_rate": 1.5778671809536993e-05, "loss": 0.0085, "step": 15060 }, { "grad_norm": 0.1847924441099167, "learning_rate": 1.5718442878924246e-05, "loss": 0.0093, "step": 15070 }, { "grad_norm": 0.18935929238796234, "learning_rate": 1.5658307672779593e-05, "loss": 0.0099, "step": 15080 }, { "grad_norm": 0.16868913173675537, "learning_rate": 1.5598266355510427e-05, "loss": 0.009, "step": 15090 }, { "grad_norm": 0.1602712720632553, "learning_rate": 1.553831909126744e-05, "loss": 0.0086, "step": 15100 }, { "grad_norm": 1.5024904012680054, "learning_rate": 1.5478466043944135e-05, "loss": 0.0126, "step": 15110 }, { "grad_norm": 0.3340737819671631, "learning_rate": 1.5418707377176468e-05, "loss": 0.0106, "step": 15120 }, { "grad_norm": 0.2385675311088562, "learning_rate": 1.535904325434233e-05, "loss": 0.0097, "step": 15130 }, { "grad_norm": 0.1439037173986435, "learning_rate": 1.529947383856118e-05, "loss": 0.008, "step": 15140 }, { "grad_norm": 0.18164511024951935, "learning_rate": 1.5239999292693524e-05, "loss": 0.0083, "step": 15150 }, { "grad_norm": 0.2115994691848755, "learning_rate": 1.5180619779340505e-05, "loss": 0.0077, "step": 15160 }, { "grad_norm": 0.21105799078941345, "learning_rate": 1.5121335460843428e-05, "loss": 0.0082, "step": 15170 }, { "grad_norm": 0.2239411622285843, "learning_rate": 1.5062146499283347e-05, "loss": 0.0084, "step": 15180 }, { "grad_norm": 0.20827172696590424, "learning_rate": 1.5003053056480643e-05, "loss": 0.011, "step": 15190 }, { "grad_norm": 0.151203915476799, "learning_rate": 1.4944055293994551e-05, "loss": 0.0092, "step": 15200 }, { "grad_norm": 0.24095380306243896, "learning_rate": 1.4885153373122656e-05, "loss": 0.0078, "step": 15210 }, { "grad_norm": 0.21926866471767426, "learning_rate": 1.482634745490059e-05, "loss": 0.0087, "step": 15220 }, { "grad_norm": 0.23115688562393188, "learning_rate": 1.4767637700101466e-05, "loss": 0.0082, "step": 15230 }, { "grad_norm": 0.16895054280757904, "learning_rate": 1.4709024269235528e-05, "loss": 0.0088, "step": 15240 }, { "grad_norm": 0.1939065307378769, "learning_rate": 1.4650507322549684e-05, "loss": 0.0119, "step": 15250 }, { "grad_norm": 0.2074580192565918, "learning_rate": 1.4592087020026972e-05, "loss": 0.0106, "step": 15260 }, { "grad_norm": 0.20165348052978516, "learning_rate": 1.4533763521386318e-05, "loss": 0.0105, "step": 15270 }, { "grad_norm": 0.23927544057369232, "learning_rate": 1.44755369860819e-05, "loss": 0.0094, "step": 15280 }, { "grad_norm": 0.16270925104618073, "learning_rate": 1.441740757330287e-05, "loss": 0.0107, "step": 15290 }, { "grad_norm": 0.22936780750751495, "learning_rate": 1.4359375441972844e-05, "loss": 0.0085, "step": 15300 }, { "grad_norm": 0.3682059049606323, "learning_rate": 1.4301440750749395e-05, "loss": 0.0142, "step": 15310 }, { "grad_norm": 0.27441543340682983, "learning_rate": 1.4243603658023808e-05, "loss": 0.013, "step": 15320 }, { "grad_norm": 0.24856552481651306, "learning_rate": 1.4185864321920444e-05, "loss": 0.0103, "step": 15330 }, { "grad_norm": 0.21878738701343536, "learning_rate": 1.4128222900296485e-05, "loss": 0.0091, "step": 15340 }, { "grad_norm": 0.16864259541034698, "learning_rate": 1.407067955074135e-05, "loss": 0.0085, "step": 15350 }, { "grad_norm": 0.16964417695999146, "learning_rate": 1.4013234430576356e-05, "loss": 0.0112, "step": 15360 }, { "grad_norm": 0.18221834301948547, "learning_rate": 1.3955887696854286e-05, "loss": 0.008, "step": 15370 }, { "grad_norm": 0.206833153963089, "learning_rate": 1.38986395063589e-05, "loss": 0.0085, "step": 15380 }, { "grad_norm": 0.23563966155052185, "learning_rate": 1.3841490015604597e-05, "loss": 0.012, "step": 15390 }, { "grad_norm": 0.34755757451057434, "learning_rate": 1.3784439380835879e-05, "loss": 0.0108, "step": 15400 }, { "grad_norm": 0.22355645895004272, "learning_rate": 1.3727487758026986e-05, "loss": 0.0084, "step": 15410 }, { "grad_norm": 0.30632326006889343, "learning_rate": 1.3670635302881525e-05, "loss": 0.011, "step": 15420 }, { "grad_norm": 0.3979724645614624, "learning_rate": 1.3613882170831888e-05, "loss": 0.009, "step": 15430 }, { "grad_norm": 0.21477313339710236, "learning_rate": 1.355722851703901e-05, "loss": 0.0081, "step": 15440 }, { "grad_norm": 0.16144298017024994, "learning_rate": 1.3500674496391814e-05, "loss": 0.0087, "step": 15450 }, { "grad_norm": 0.15438497066497803, "learning_rate": 1.3444220263506795e-05, "loss": 0.0077, "step": 15460 }, { "grad_norm": 0.1932169646024704, "learning_rate": 1.3387865972727714e-05, "loss": 0.0075, "step": 15470 }, { "grad_norm": 0.2231285274028778, "learning_rate": 1.3331611778125036e-05, "loss": 0.0074, "step": 15480 }, { "grad_norm": 0.1794029176235199, "learning_rate": 1.3275457833495564e-05, "loss": 0.0116, "step": 15490 }, { "grad_norm": 0.18809859454631805, "learning_rate": 1.3219404292362065e-05, "loss": 0.0083, "step": 15500 }, { "grad_norm": 0.21087174117565155, "learning_rate": 1.3163451307972751e-05, "loss": 0.0094, "step": 15510 }, { "grad_norm": 0.19908000528812408, "learning_rate": 1.3107599033300977e-05, "loss": 0.0077, "step": 15520 }, { "grad_norm": 0.2235322743654251, "learning_rate": 1.305184762104471e-05, "loss": 0.013, "step": 15530 }, { "grad_norm": 0.2747052311897278, "learning_rate": 1.2996197223626178e-05, "loss": 0.0103, "step": 15540 }, { "grad_norm": 0.2411435842514038, "learning_rate": 1.2940647993191457e-05, "loss": 0.0094, "step": 15550 }, { "grad_norm": 0.14222177863121033, "learning_rate": 1.2885200081610005e-05, "loss": 0.0077, "step": 15560 }, { "grad_norm": 0.23416419327259064, "learning_rate": 1.2829853640474316e-05, "loss": 0.0089, "step": 15570 }, { "grad_norm": 0.2406393587589264, "learning_rate": 1.2774608821099438e-05, "loss": 0.0086, "step": 15580 }, { "grad_norm": 0.18380896747112274, "learning_rate": 1.2719465774522577e-05, "loss": 0.009, "step": 15590 }, { "grad_norm": 0.17795240879058838, "learning_rate": 1.2664424651502755e-05, "loss": 0.0061, "step": 15600 }, { "grad_norm": 0.18897061049938202, "learning_rate": 1.260948560252026e-05, "loss": 0.0102, "step": 15610 }, { "grad_norm": 0.20112966001033783, "learning_rate": 1.2554648777776396e-05, "loss": 0.0082, "step": 15620 }, { "grad_norm": 0.167032852768898, "learning_rate": 1.2499914327192919e-05, "loss": 0.0074, "step": 15630 }, { "grad_norm": 0.17840704321861267, "learning_rate": 1.2445282400411722e-05, "loss": 0.0078, "step": 15640 }, { "grad_norm": 0.1737533062696457, "learning_rate": 1.2390753146794437e-05, "loss": 0.0113, "step": 15650 }, { "grad_norm": 0.20213133096694946, "learning_rate": 1.2336326715421925e-05, "loss": 0.0103, "step": 15660 }, { "grad_norm": 0.1666155457496643, "learning_rate": 1.2282003255094005e-05, "loss": 0.0087, "step": 15670 }, { "grad_norm": 0.2510354518890381, "learning_rate": 1.2227782914328928e-05, "loss": 0.0077, "step": 15680 }, { "grad_norm": 0.16165825724601746, "learning_rate": 1.2173665841363018e-05, "loss": 0.0082, "step": 15690 }, { "grad_norm": 0.5454258322715759, "learning_rate": 1.211965218415032e-05, "loss": 0.0076, "step": 15700 }, { "grad_norm": 0.17377229034900665, "learning_rate": 1.2065742090362082e-05, "loss": 0.0079, "step": 15710 }, { "grad_norm": 0.28120264410972595, "learning_rate": 1.2011935707386457e-05, "loss": 0.0085, "step": 15720 }, { "grad_norm": 0.18770258128643036, "learning_rate": 1.1958233182328044e-05, "loss": 0.0069, "step": 15730 }, { "grad_norm": 0.21482449769973755, "learning_rate": 1.1904634662007474e-05, "loss": 0.0082, "step": 15740 }, { "grad_norm": 0.27096569538116455, "learning_rate": 1.1851140292961088e-05, "loss": 0.0068, "step": 15750 }, { "grad_norm": 0.3027457594871521, "learning_rate": 1.1797750221440424e-05, "loss": 0.0082, "step": 15760 }, { "grad_norm": 0.20425190031528473, "learning_rate": 1.1744464593411897e-05, "loss": 0.0084, "step": 15770 }, { "grad_norm": 0.1998683214187622, "learning_rate": 1.1691283554556399e-05, "loss": 0.0097, "step": 15780 }, { "grad_norm": 0.18402749300003052, "learning_rate": 1.1638207250268834e-05, "loss": 0.0068, "step": 15790 }, { "grad_norm": 0.21123050153255463, "learning_rate": 1.158523582565782e-05, "loss": 0.0076, "step": 15800 }, { "grad_norm": 0.1832370012998581, "learning_rate": 1.1532369425545192e-05, "loss": 0.0074, "step": 15810 }, { "grad_norm": 0.16383832693099976, "learning_rate": 1.1479608194465662e-05, "loss": 0.0099, "step": 15820 }, { "grad_norm": 0.26916202902793884, "learning_rate": 1.1426952276666442e-05, "loss": 0.0075, "step": 15830 }, { "grad_norm": 0.1489630490541458, "learning_rate": 1.1374401816106778e-05, "loss": 0.0083, "step": 15840 }, { "grad_norm": 0.1580963432788849, "learning_rate": 1.1321956956457646e-05, "loss": 0.0109, "step": 15850 }, { "grad_norm": 0.7102701663970947, "learning_rate": 1.1269617841101277e-05, "loss": 0.0073, "step": 15860 }, { "grad_norm": 0.2703263461589813, "learning_rate": 1.1217384613130804e-05, "loss": 0.0082, "step": 15870 }, { "grad_norm": 0.2174505740404129, "learning_rate": 1.11652574153499e-05, "loss": 0.0097, "step": 15880 }, { "grad_norm": 0.23385658860206604, "learning_rate": 1.1113236390272303e-05, "loss": 0.0086, "step": 15890 }, { "grad_norm": 0.23186172544956207, "learning_rate": 1.106132168012155e-05, "loss": 0.0094, "step": 15900 }, { "grad_norm": 0.21415439248085022, "learning_rate": 1.1009513426830448e-05, "loss": 0.0099, "step": 15910 }, { "grad_norm": 0.39905279874801636, "learning_rate": 1.0957811772040777e-05, "loss": 0.0094, "step": 15920 }, { "grad_norm": 0.17572318017482758, "learning_rate": 1.0906216857102913e-05, "loss": 0.0137, "step": 15930 }, { "grad_norm": 0.21807076036930084, "learning_rate": 1.0854728823075355e-05, "loss": 0.0076, "step": 15940 }, { "grad_norm": 0.19862854480743408, "learning_rate": 1.0803347810724452e-05, "loss": 0.0086, "step": 15950 }, { "grad_norm": 0.2252405285835266, "learning_rate": 1.0752073960523911e-05, "loss": 0.0099, "step": 15960 }, { "grad_norm": 0.2389063537120819, "learning_rate": 1.070090741265447e-05, "loss": 0.0075, "step": 15970 }, { "grad_norm": 0.2808586061000824, "learning_rate": 1.0649848307003547e-05, "loss": 0.0116, "step": 15980 }, { "grad_norm": 0.2047148048877716, "learning_rate": 1.0598896783164757e-05, "loss": 0.0075, "step": 15990 }, { "grad_norm": 0.17386089265346527, "learning_rate": 1.0548052980437645e-05, "loss": 0.0077, "step": 16000 }, { "grad_norm": 0.5847902297973633, "learning_rate": 1.049731703782722e-05, "loss": 0.0078, "step": 16010 }, { "grad_norm": 0.16801218688488007, "learning_rate": 1.0446689094043587e-05, "loss": 0.0065, "step": 16020 }, { "grad_norm": 0.12172616273164749, "learning_rate": 1.039616928750165e-05, "loss": 0.0067, "step": 16030 }, { "grad_norm": 0.18768148124217987, "learning_rate": 1.0345757756320612e-05, "loss": 0.0082, "step": 16040 }, { "grad_norm": 0.14117304980754852, "learning_rate": 1.0295454638323666e-05, "loss": 0.0087, "step": 16050 }, { "grad_norm": 0.14175529778003693, "learning_rate": 1.0245260071037632e-05, "loss": 0.0074, "step": 16060 }, { "grad_norm": 0.3167143762111664, "learning_rate": 1.0195174191692518e-05, "loss": 0.009, "step": 16070 }, { "grad_norm": 0.21622122824192047, "learning_rate": 1.014519713722124e-05, "loss": 0.0106, "step": 16080 }, { "grad_norm": 0.21439582109451294, "learning_rate": 1.0095329044259132e-05, "loss": 0.0095, "step": 16090 }, { "grad_norm": 0.2020285576581955, "learning_rate": 1.004557004914365e-05, "loss": 0.0077, "step": 16100 }, { "grad_norm": 0.30906355381011963, "learning_rate": 9.995920287914007e-06, "loss": 0.0092, "step": 16110 }, { "grad_norm": 0.18057408928871155, "learning_rate": 9.946379896310737e-06, "loss": 0.0093, "step": 16120 }, { "grad_norm": 0.19932439923286438, "learning_rate": 9.896949009775396e-06, "loss": 0.0081, "step": 16130 }, { "grad_norm": 0.1577359437942505, "learning_rate": 9.847627763450134e-06, "loss": 0.0087, "step": 16140 }, { "grad_norm": 0.1705850511789322, "learning_rate": 9.798416292177337e-06, "loss": 0.0075, "step": 16150 }, { "grad_norm": 0.142476886510849, "learning_rate": 9.74931473049932e-06, "loss": 0.0104, "step": 16160 }, { "grad_norm": 0.14589132368564606, "learning_rate": 9.700323212657847e-06, "loss": 0.0073, "step": 16170 }, { "grad_norm": 0.1523684710264206, "learning_rate": 9.65144187259388e-06, "loss": 0.0101, "step": 16180 }, { "grad_norm": 0.10377465188503265, "learning_rate": 9.602670843947132e-06, "loss": 0.0077, "step": 16190 }, { "grad_norm": 0.21292094886302948, "learning_rate": 9.554010260055713e-06, "loss": 0.0128, "step": 16200 }, { "grad_norm": 0.4125750660896301, "learning_rate": 9.505460253955834e-06, "loss": 0.0082, "step": 16210 }, { "grad_norm": 0.17852820456027985, "learning_rate": 9.457020958381324e-06, "loss": 0.0073, "step": 16220 }, { "grad_norm": 0.20929287374019623, "learning_rate": 9.408692505763395e-06, "loss": 0.0079, "step": 16230 }, { "grad_norm": 0.16328507661819458, "learning_rate": 9.360475028230181e-06, "loss": 0.0076, "step": 16240 }, { "grad_norm": 0.21511343121528625, "learning_rate": 9.312368657606412e-06, "loss": 0.0071, "step": 16250 }, { "grad_norm": 0.17969392240047455, "learning_rate": 9.264373525413096e-06, "loss": 0.008, "step": 16260 }, { "grad_norm": 0.12664836645126343, "learning_rate": 9.216489762867058e-06, "loss": 0.0069, "step": 16270 }, { "grad_norm": 0.23566000163555145, "learning_rate": 9.168717500880708e-06, "loss": 0.0084, "step": 16280 }, { "grad_norm": 0.1375821977853775, "learning_rate": 9.121056870061574e-06, "loss": 0.0092, "step": 16290 }, { "grad_norm": 0.24378149211406708, "learning_rate": 9.073508000711983e-06, "loss": 0.0063, "step": 16300 }, { "grad_norm": 1.1051721572875977, "learning_rate": 9.026071022828758e-06, "loss": 0.0087, "step": 16310 }, { "grad_norm": 0.18624262511730194, "learning_rate": 8.978746066102771e-06, "loss": 0.007, "step": 16320 }, { "grad_norm": 0.19161497056484222, "learning_rate": 8.931533259918634e-06, "loss": 0.0075, "step": 16330 }, { "grad_norm": 0.1784583181142807, "learning_rate": 8.884432733354382e-06, "loss": 0.0067, "step": 16340 }, { "grad_norm": 0.17507174611091614, "learning_rate": 8.837444615181029e-06, "loss": 0.0079, "step": 16350 }, { "grad_norm": 0.3699311912059784, "learning_rate": 8.790569033862323e-06, "loss": 0.0091, "step": 16360 }, { "grad_norm": 0.25687742233276367, "learning_rate": 8.7438061175543e-06, "loss": 0.0066, "step": 16370 }, { "grad_norm": 0.17475208640098572, "learning_rate": 8.697155994104978e-06, "loss": 0.0084, "step": 16380 }, { "grad_norm": 0.15809917449951172, "learning_rate": 8.650618791054033e-06, "loss": 0.0089, "step": 16390 }, { "grad_norm": 0.13109372556209564, "learning_rate": 8.604194635632373e-06, "loss": 0.0065, "step": 16400 }, { "grad_norm": 0.193354532122612, "learning_rate": 8.557883654761906e-06, "loss": 0.0084, "step": 16410 }, { "grad_norm": 0.18572674691677094, "learning_rate": 8.511685975055061e-06, "loss": 0.0066, "step": 16420 }, { "grad_norm": 0.10653883963823318, "learning_rate": 8.46560172281452e-06, "loss": 0.0073, "step": 16430 }, { "grad_norm": 0.15316790342330933, "learning_rate": 8.419631024032893e-06, "loss": 0.0083, "step": 16440 }, { "grad_norm": 0.18353162705898285, "learning_rate": 8.373774004392293e-06, "loss": 0.006, "step": 16450 }, { "grad_norm": 0.14753106236457825, "learning_rate": 8.32803078926409e-06, "loss": 0.0069, "step": 16460 }, { "grad_norm": 0.19316348433494568, "learning_rate": 8.282401503708454e-06, "loss": 0.0072, "step": 16470 }, { "grad_norm": 0.26579222083091736, "learning_rate": 8.23688627247412e-06, "loss": 0.0079, "step": 16480 }, { "grad_norm": 0.23526771366596222, "learning_rate": 8.191485219998007e-06, "loss": 0.0096, "step": 16490 }, { "grad_norm": 0.17327390611171722, "learning_rate": 8.146198470404843e-06, "loss": 0.0083, "step": 16500 }, { "grad_norm": 0.19906078279018402, "learning_rate": 8.101026147506897e-06, "loss": 0.0067, "step": 16510 }, { "grad_norm": 0.18870054185390472, "learning_rate": 8.05596837480353e-06, "loss": 0.007, "step": 16520 }, { "grad_norm": 0.12604646384716034, "learning_rate": 8.011025275480998e-06, "loss": 0.0066, "step": 16530 }, { "grad_norm": 0.10731339454650879, "learning_rate": 7.966196972412027e-06, "loss": 0.007, "step": 16540 }, { "grad_norm": 0.1516895741224289, "learning_rate": 7.92148358815547e-06, "loss": 0.0087, "step": 16550 }, { "grad_norm": 0.15892720222473145, "learning_rate": 7.87688524495604e-06, "loss": 0.0074, "step": 16560 }, { "grad_norm": 0.12480528652667999, "learning_rate": 7.83240206474386e-06, "loss": 0.0068, "step": 16570 }, { "grad_norm": 0.22631336748600006, "learning_rate": 7.788034169134272e-06, "loss": 0.0083, "step": 16580 }, { "grad_norm": 0.20984452962875366, "learning_rate": 7.743781679427414e-06, "loss": 0.0073, "step": 16590 }, { "grad_norm": 0.3785482943058014, "learning_rate": 7.699644716607895e-06, "loss": 0.0062, "step": 16600 }, { "grad_norm": 0.19197265803813934, "learning_rate": 7.655623401344486e-06, "loss": 0.0089, "step": 16610 }, { "grad_norm": 0.21280412375926971, "learning_rate": 7.611717853989775e-06, "loss": 0.0088, "step": 16620 }, { "grad_norm": 0.17781531810760498, "learning_rate": 7.567928194579854e-06, "loss": 0.0084, "step": 16630 }, { "grad_norm": 0.2932118773460388, "learning_rate": 7.524254542833997e-06, "loss": 0.0071, "step": 16640 }, { "grad_norm": 0.2246873378753662, "learning_rate": 7.480697018154286e-06, "loss": 0.0091, "step": 16650 }, { "grad_norm": 0.14911232888698578, "learning_rate": 7.437255739625332e-06, "loss": 0.0067, "step": 16660 }, { "grad_norm": 0.19599097967147827, "learning_rate": 7.393930826013923e-06, "loss": 0.0072, "step": 16670 }, { "grad_norm": 0.1558641791343689, "learning_rate": 7.350722395768722e-06, "loss": 0.0055, "step": 16680 }, { "grad_norm": 0.2673828899860382, "learning_rate": 7.307630567019963e-06, "loss": 0.0069, "step": 16690 }, { "grad_norm": 0.545863926410675, "learning_rate": 7.264655457579e-06, "loss": 0.0102, "step": 16700 }, { "grad_norm": 0.11445369571447372, "learning_rate": 7.221797184938184e-06, "loss": 0.0057, "step": 16710 }, { "grad_norm": 0.15090951323509216, "learning_rate": 7.179055866270373e-06, "loss": 0.0091, "step": 16720 }, { "grad_norm": 0.13491539657115936, "learning_rate": 7.136431618428707e-06, "loss": 0.0075, "step": 16730 }, { "grad_norm": 0.0993436947464943, "learning_rate": 7.09392455794628e-06, "loss": 0.0063, "step": 16740 }, { "grad_norm": 0.14264445006847382, "learning_rate": 7.051534801035725e-06, "loss": 0.0063, "step": 16750 }, { "grad_norm": 0.16763782501220703, "learning_rate": 7.00926246358905e-06, "loss": 0.0074, "step": 16760 }, { "grad_norm": 0.21594665944576263, "learning_rate": 6.967107661177191e-06, "loss": 0.0082, "step": 16770 }, { "grad_norm": 0.185106098651886, "learning_rate": 6.925070509049786e-06, "loss": 0.0064, "step": 16780 }, { "grad_norm": 0.1578800529241562, "learning_rate": 6.883151122134812e-06, "loss": 0.0068, "step": 16790 }, { "grad_norm": 0.12012939155101776, "learning_rate": 6.8413496150382394e-06, "loss": 0.0076, "step": 16800 }, { "grad_norm": 0.17197932302951813, "learning_rate": 6.7996661020438165e-06, "loss": 0.0096, "step": 16810 }, { "grad_norm": 0.15174546837806702, "learning_rate": 6.758100697112662e-06, "loss": 0.0075, "step": 16820 }, { "grad_norm": 0.25899171829223633, "learning_rate": 6.716653513883026e-06, "loss": 0.0072, "step": 16830 }, { "grad_norm": 0.14711317420005798, "learning_rate": 6.675324665669913e-06, "loss": 0.0063, "step": 16840 }, { "grad_norm": 0.1655847281217575, "learning_rate": 6.634114265464803e-06, "loss": 0.0065, "step": 16850 }, { "grad_norm": 0.10699893534183502, "learning_rate": 6.59302242593538e-06, "loss": 0.0064, "step": 16860 }, { "grad_norm": 0.13388961553573608, "learning_rate": 6.552049259425141e-06, "loss": 0.0073, "step": 16870 }, { "grad_norm": 0.14850012958049774, "learning_rate": 6.511194877953181e-06, "loss": 0.0078, "step": 16880 }, { "grad_norm": 0.23992480337619781, "learning_rate": 6.470459393213813e-06, "loss": 0.0102, "step": 16890 }, { "grad_norm": 0.1842438280582428, "learning_rate": 6.429842916576279e-06, "loss": 0.0104, "step": 16900 }, { "grad_norm": 0.14625371992588043, "learning_rate": 6.389345559084503e-06, "loss": 0.0065, "step": 16910 }, { "grad_norm": 0.14589239656925201, "learning_rate": 6.348967431456682e-06, "loss": 0.0069, "step": 16920 }, { "grad_norm": 0.15522103011608124, "learning_rate": 6.30870864408511e-06, "loss": 0.0069, "step": 16930 }, { "grad_norm": 0.2711852490901947, "learning_rate": 6.268569307035754e-06, "loss": 0.0071, "step": 16940 }, { "grad_norm": 0.13050232827663422, "learning_rate": 6.228549530048022e-06, "loss": 0.0058, "step": 16950 }, { "grad_norm": 0.11063165217638016, "learning_rate": 6.1886494225344814e-06, "loss": 0.0094, "step": 16960 }, { "grad_norm": 0.15676675736904144, "learning_rate": 6.148869093580479e-06, "loss": 0.0099, "step": 16970 }, { "grad_norm": 0.2107265144586563, "learning_rate": 6.109208651943921e-06, "loss": 0.0058, "step": 16980 }, { "grad_norm": 0.6548556685447693, "learning_rate": 6.069668206054946e-06, "loss": 0.0095, "step": 16990 }, { "grad_norm": 0.2878957986831665, "learning_rate": 6.0302478640156145e-06, "loss": 0.0088, "step": 17000 }, { "grad_norm": 0.17711052298545837, "learning_rate": 5.990947733599644e-06, "loss": 0.0068, "step": 17010 }, { "grad_norm": 0.16346819698810577, "learning_rate": 5.951767922252105e-06, "loss": 0.0079, "step": 17020 }, { "grad_norm": 0.36923104524612427, "learning_rate": 5.912708537089068e-06, "loss": 0.0075, "step": 17030 }, { "grad_norm": 0.26777365803718567, "learning_rate": 5.873769684897434e-06, "loss": 0.0085, "step": 17040 }, { "grad_norm": 0.1216161921620369, "learning_rate": 5.834951472134514e-06, "loss": 0.0083, "step": 17050 }, { "grad_norm": 0.19614116847515106, "learning_rate": 5.796254004927832e-06, "loss": 0.0067, "step": 17060 }, { "grad_norm": 0.19887468218803406, "learning_rate": 5.757677389074806e-06, "loss": 0.0085, "step": 17070 }, { "grad_norm": 0.19132661819458008, "learning_rate": 5.719221730042385e-06, "loss": 0.0087, "step": 17080 }, { "grad_norm": 0.15653230249881744, "learning_rate": 5.680887132966911e-06, "loss": 0.0059, "step": 17090 }, { "grad_norm": 0.10167554020881653, "learning_rate": 5.642673702653683e-06, "loss": 0.0072, "step": 17100 }, { "grad_norm": 0.3239072263240814, "learning_rate": 5.604581543576781e-06, "loss": 0.0075, "step": 17110 }, { "grad_norm": 0.23113033175468445, "learning_rate": 5.566610759878704e-06, "loss": 0.008, "step": 17120 }, { "grad_norm": 0.189833402633667, "learning_rate": 5.528761455370119e-06, "loss": 0.0088, "step": 17130 }, { "grad_norm": 0.13736112415790558, "learning_rate": 5.491033733529594e-06, "loss": 0.0118, "step": 17140 }, { "grad_norm": 0.14210128784179688, "learning_rate": 5.453427697503255e-06, "loss": 0.007, "step": 17150 }, { "grad_norm": 1.3866373300552368, "learning_rate": 5.415943450104599e-06, "loss": 0.0079, "step": 17160 }, { "grad_norm": 0.17086149752140045, "learning_rate": 5.378581093814111e-06, "loss": 0.0081, "step": 17170 }, { "grad_norm": 0.13963444530963898, "learning_rate": 5.3413407307790375e-06, "loss": 0.0075, "step": 17180 }, { "grad_norm": 0.13204753398895264, "learning_rate": 5.30422246281313e-06, "loss": 0.006, "step": 17190 }, { "grad_norm": 0.250595360994339, "learning_rate": 5.267226391396296e-06, "loss": 0.0067, "step": 17200 }, { "grad_norm": 0.25121212005615234, "learning_rate": 5.2303526176744e-06, "loss": 0.0114, "step": 17210 }, { "grad_norm": 0.14169013500213623, "learning_rate": 5.193601242458929e-06, "loss": 0.006, "step": 17220 }, { "grad_norm": 0.1313830018043518, "learning_rate": 5.156972366226714e-06, "loss": 0.0065, "step": 17230 }, { "grad_norm": 0.3507251739501953, "learning_rate": 5.120466089119735e-06, "loss": 0.0071, "step": 17240 }, { "grad_norm": 0.11924529075622559, "learning_rate": 5.084082510944749e-06, "loss": 0.0107, "step": 17250 }, { "grad_norm": 0.2524084150791168, "learning_rate": 5.047821731173058e-06, "loss": 0.0064, "step": 17260 }, { "grad_norm": 0.19415691494941711, "learning_rate": 5.011683848940274e-06, "loss": 0.0063, "step": 17270 }, { "grad_norm": 0.18039430677890778, "learning_rate": 4.975668963045954e-06, "loss": 0.0085, "step": 17280 }, { "grad_norm": 0.1924077570438385, "learning_rate": 4.9397771719534525e-06, "loss": 0.0075, "step": 17290 }, { "grad_norm": 0.1716841161251068, "learning_rate": 4.904008573789548e-06, "loss": 0.0082, "step": 17300 }, { "grad_norm": 0.31076234579086304, "learning_rate": 4.8683632663442005e-06, "loss": 0.009, "step": 17310 }, { "grad_norm": 0.12016285210847855, "learning_rate": 4.832841347070343e-06, "loss": 0.0071, "step": 17320 }, { "grad_norm": 0.17957791686058044, "learning_rate": 4.797442913083539e-06, "loss": 0.0081, "step": 17330 }, { "grad_norm": 0.5560227036476135, "learning_rate": 4.7621680611617596e-06, "loss": 0.0079, "step": 17340 }, { "grad_norm": 0.17144133150577545, "learning_rate": 4.727016887745095e-06, "loss": 0.0082, "step": 17350 }, { "grad_norm": 0.16800165176391602, "learning_rate": 4.691989488935511e-06, "loss": 0.0087, "step": 17360 }, { "grad_norm": 0.18115754425525665, "learning_rate": 4.657085960496588e-06, "loss": 0.0069, "step": 17370 }, { "grad_norm": 0.10738164931535721, "learning_rate": 4.6223063978532265e-06, "loss": 0.0068, "step": 17380 }, { "grad_norm": 0.12895676493644714, "learning_rate": 4.587650896091439e-06, "loss": 0.0059, "step": 17390 }, { "grad_norm": 0.15790238976478577, "learning_rate": 4.553119549958035e-06, "loss": 0.0053, "step": 17400 }, { "grad_norm": 0.14036139845848083, "learning_rate": 4.518712453860385e-06, "loss": 0.0069, "step": 17410 }, { "grad_norm": 0.20358788967132568, "learning_rate": 4.484429701866205e-06, "loss": 0.0081, "step": 17420 }, { "grad_norm": 0.17846588790416718, "learning_rate": 4.4502713877031975e-06, "loss": 0.0107, "step": 17430 }, { "grad_norm": 0.11227608472108841, "learning_rate": 4.416237604758911e-06, "loss": 0.005, "step": 17440 }, { "grad_norm": 0.1330096274614334, "learning_rate": 4.3823284460804025e-06, "loss": 0.007, "step": 17450 }, { "grad_norm": 0.1459423452615738, "learning_rate": 4.348544004374011e-06, "loss": 0.0075, "step": 17460 }, { "grad_norm": 0.8251057863235474, "learning_rate": 4.314884372005123e-06, "loss": 0.0099, "step": 17470 }, { "grad_norm": 0.1534908562898636, "learning_rate": 4.281349640997867e-06, "loss": 0.0092, "step": 17480 }, { "grad_norm": 0.13112404942512512, "learning_rate": 4.247939903034942e-06, "loss": 0.0077, "step": 17490 }, { "grad_norm": 0.1583923101425171, "learning_rate": 4.214655249457284e-06, "loss": 0.0048, "step": 17500 }, { "grad_norm": 0.09976863861083984, "learning_rate": 4.181495771263855e-06, "loss": 0.006, "step": 17510 }, { "grad_norm": 0.14303839206695557, "learning_rate": 4.148461559111427e-06, "loss": 0.0062, "step": 17520 }, { "grad_norm": 0.2476406842470169, "learning_rate": 4.115552703314252e-06, "loss": 0.0074, "step": 17530 }, { "grad_norm": 0.13226883113384247, "learning_rate": 4.082769293843886e-06, "loss": 0.0098, "step": 17540 }, { "grad_norm": 0.31039226055145264, "learning_rate": 4.050111420328939e-06, "loss": 0.0098, "step": 17550 }, { "grad_norm": 0.14855457842350006, "learning_rate": 4.017579172054764e-06, "loss": 0.008, "step": 17560 }, { "grad_norm": 0.10973980277776718, "learning_rate": 3.985172637963308e-06, "loss": 0.007, "step": 17570 }, { "grad_norm": 0.30389079451560974, "learning_rate": 3.952891906652784e-06, "loss": 0.0097, "step": 17580 }, { "grad_norm": 0.1407729536294937, "learning_rate": 3.920737066377478e-06, "loss": 0.0058, "step": 17590 }, { "grad_norm": 0.10627759248018265, "learning_rate": 3.888708205047509e-06, "loss": 0.0056, "step": 17600 }, { "grad_norm": 0.07397635281085968, "learning_rate": 3.856805410228542e-06, "loss": 0.0086, "step": 17610 }, { "grad_norm": 0.1141512393951416, "learning_rate": 3.82502876914162e-06, "loss": 0.0053, "step": 17620 }, { "grad_norm": 1.6786439418792725, "learning_rate": 3.7933783686628586e-06, "loss": 0.0063, "step": 17630 }, { "grad_norm": 0.18269212543964386, "learning_rate": 3.7618542953232306e-06, "loss": 0.0075, "step": 17640 }, { "grad_norm": 0.1342618614435196, "learning_rate": 3.7304566353083658e-06, "loss": 0.0065, "step": 17650 }, { "grad_norm": 0.1415589302778244, "learning_rate": 3.6991854744582555e-06, "loss": 0.0082, "step": 17660 }, { "grad_norm": 0.1393871158361435, "learning_rate": 3.6680408982670777e-06, "loss": 0.0112, "step": 17670 }, { "grad_norm": 0.14253245294094086, "learning_rate": 3.637022991882899e-06, "loss": 0.0071, "step": 17680 }, { "grad_norm": 0.1377224624156952, "learning_rate": 3.606131840107485e-06, "loss": 0.0062, "step": 17690 }, { "grad_norm": 0.15855330228805542, "learning_rate": 3.575367527396084e-06, "loss": 0.0096, "step": 17700 }, { "grad_norm": 0.11642663925886154, "learning_rate": 3.5447301378571386e-06, "loss": 0.0064, "step": 17710 }, { "grad_norm": 0.22609098255634308, "learning_rate": 3.514219755252113e-06, "loss": 0.0077, "step": 17720 }, { "grad_norm": 0.18561750650405884, "learning_rate": 3.4838364629952213e-06, "loss": 0.0078, "step": 17730 }, { "grad_norm": 0.1640578955411911, "learning_rate": 3.4535803441532123e-06, "loss": 0.008, "step": 17740 }, { "grad_norm": 0.21458394825458527, "learning_rate": 3.4234514814451836e-06, "loss": 0.0057, "step": 17750 }, { "grad_norm": 0.2046024054288864, "learning_rate": 3.393449957242273e-06, "loss": 0.0077, "step": 17760 }, { "grad_norm": 0.11352556198835373, "learning_rate": 3.363575853567524e-06, "loss": 0.009, "step": 17770 }, { "grad_norm": 0.15080828964710236, "learning_rate": 3.3338292520955826e-06, "loss": 0.0067, "step": 17780 }, { "grad_norm": 0.17963539063930511, "learning_rate": 3.304210234152516e-06, "loss": 0.0063, "step": 17790 }, { "grad_norm": 0.10138626396656036, "learning_rate": 3.2747188807155993e-06, "loss": 0.0086, "step": 17800 }, { "grad_norm": 0.11984547972679138, "learning_rate": 3.2453552724130643e-06, "loss": 0.007, "step": 17810 }, { "grad_norm": 0.24107912182807922, "learning_rate": 3.216119489523889e-06, "loss": 0.0058, "step": 17820 }, { "grad_norm": 0.14644724130630493, "learning_rate": 3.1870116119775917e-06, "loss": 0.0061, "step": 17830 }, { "grad_norm": 0.17007102072238922, "learning_rate": 3.158031719353999e-06, "loss": 0.0071, "step": 17840 }, { "grad_norm": 0.1260616034269333, "learning_rate": 3.1291798908830273e-06, "loss": 0.0115, "step": 17850 }, { "grad_norm": 0.13170063495635986, "learning_rate": 3.1004562054444853e-06, "loss": 0.0068, "step": 17860 }, { "grad_norm": 0.12411011010408401, "learning_rate": 3.071860741567806e-06, "loss": 0.0063, "step": 17870 }, { "grad_norm": 0.2326919138431549, "learning_rate": 3.04339357743193e-06, "loss": 0.0071, "step": 17880 }, { "grad_norm": 0.0947069376707077, "learning_rate": 3.0150547908649628e-06, "loss": 0.0058, "step": 17890 }, { "grad_norm": 0.1182367280125618, "learning_rate": 2.9868444593440957e-06, "loss": 0.0081, "step": 17900 }, { "grad_norm": 0.1294725090265274, "learning_rate": 2.9587626599952846e-06, "loss": 0.0053, "step": 17910 }, { "grad_norm": 0.13719303905963898, "learning_rate": 2.930809469593082e-06, "loss": 0.0091, "step": 17920 }, { "grad_norm": 0.4036208987236023, "learning_rate": 2.9029849645604733e-06, "loss": 0.0083, "step": 17930 }, { "grad_norm": 0.21328043937683105, "learning_rate": 2.8752892209685632e-06, "loss": 0.0083, "step": 17940 }, { "grad_norm": 0.0744176059961319, "learning_rate": 2.847722314536483e-06, "loss": 0.0053, "step": 17950 }, { "grad_norm": 0.21712268888950348, "learning_rate": 2.820284320631078e-06, "loss": 0.0078, "step": 17960 }, { "grad_norm": 0.13843274116516113, "learning_rate": 2.792975314266788e-06, "loss": 0.0094, "step": 17970 }, { "grad_norm": 0.1121164932847023, "learning_rate": 2.7657953701054007e-06, "loss": 0.0072, "step": 17980 }, { "grad_norm": 0.1180398091673851, "learning_rate": 2.7387445624558306e-06, "loss": 0.0088, "step": 17990 }, { "grad_norm": 0.21480044722557068, "learning_rate": 2.7118229652739747e-06, "loss": 0.0077, "step": 18000 }, { "grad_norm": 0.1409302055835724, "learning_rate": 2.6850306521624236e-06, "loss": 0.0067, "step": 18010 }, { "grad_norm": 1.126784086227417, "learning_rate": 2.6583676963703507e-06, "loss": 0.0094, "step": 18020 }, { "grad_norm": 0.13120611011981964, "learning_rate": 2.631834170793268e-06, "loss": 0.009, "step": 18030 }, { "grad_norm": 0.22308212518692017, "learning_rate": 2.6054301479728036e-06, "loss": 0.0075, "step": 18040 }, { "grad_norm": 0.1721251755952835, "learning_rate": 2.579155700096575e-06, "loss": 0.0094, "step": 18050 }, { "grad_norm": 0.08507943153381348, "learning_rate": 2.5530108989978873e-06, "loss": 0.0078, "step": 18060 }, { "grad_norm": 0.08137187361717224, "learning_rate": 2.5269958161556416e-06, "loss": 0.0064, "step": 18070 }, { "grad_norm": 0.1353299468755722, "learning_rate": 2.5011105226940888e-06, "loss": 0.0063, "step": 18080 }, { "grad_norm": 0.10429251939058304, "learning_rate": 2.4753550893826248e-06, "loss": 0.0053, "step": 18090 }, { "grad_norm": 0.3088083267211914, "learning_rate": 2.4497295866356296e-06, "loss": 0.0088, "step": 18100 }, { "grad_norm": 0.11143761873245239, "learning_rate": 2.424234084512228e-06, "loss": 0.0068, "step": 18110 }, { "grad_norm": 0.5420790314674377, "learning_rate": 2.3988686527161687e-06, "loss": 0.0074, "step": 18120 }, { "grad_norm": 0.08882468938827515, "learning_rate": 2.373633360595573e-06, "loss": 0.0052, "step": 18130 }, { "grad_norm": 0.10887418687343597, "learning_rate": 2.3485282771427585e-06, "loss": 0.0081, "step": 18140 }, { "grad_norm": 0.1721881479024887, "learning_rate": 2.3235534709940665e-06, "loss": 0.0065, "step": 18150 }, { "grad_norm": 0.12101589143276215, "learning_rate": 2.2987090104296617e-06, "loss": 0.0085, "step": 18160 }, { "grad_norm": 0.11604715138673782, "learning_rate": 2.273994963373355e-06, "loss": 0.007, "step": 18170 }, { "grad_norm": 0.18888771533966064, "learning_rate": 2.249411397392409e-06, "loss": 0.0074, "step": 18180 }, { "grad_norm": 0.16592435538768768, "learning_rate": 2.2249583796973506e-06, "loss": 0.0077, "step": 18190 }, { "grad_norm": 0.5546330809593201, "learning_rate": 2.200635977141796e-06, "loss": 0.005, "step": 18200 }, { "grad_norm": 0.2621038258075714, "learning_rate": 2.17644425622226e-06, "loss": 0.0048, "step": 18210 }, { "grad_norm": 0.1778993308544159, "learning_rate": 2.152383283077991e-06, "loss": 0.0067, "step": 18220 }, { "grad_norm": 0.09525787085294724, "learning_rate": 2.128453123490781e-06, "loss": 0.0094, "step": 18230 }, { "grad_norm": 0.3412826657295227, "learning_rate": 2.1046538428847462e-06, "loss": 0.0063, "step": 18240 }, { "grad_norm": 0.2745835781097412, "learning_rate": 2.0809855063262273e-06, "loss": 0.0073, "step": 18250 }, { "grad_norm": 1.0415291786193848, "learning_rate": 2.057448178523558e-06, "loss": 0.0057, "step": 18260 }, { "grad_norm": 0.11343783140182495, "learning_rate": 2.034041923826885e-06, "loss": 0.0069, "step": 18270 }, { "grad_norm": 0.6116946935653687, "learning_rate": 2.0107668062280204e-06, "loss": 0.0064, "step": 18280 }, { "grad_norm": 0.24730606377124786, "learning_rate": 1.9876228893602357e-06, "loss": 0.0104, "step": 18290 }, { "grad_norm": 0.2159283608198166, "learning_rate": 1.9646102364981266e-06, "loss": 0.0065, "step": 18300 }, { "grad_norm": 0.10998883098363876, "learning_rate": 1.9417289105574053e-06, "loss": 0.0063, "step": 18310 }, { "grad_norm": 0.14335983991622925, "learning_rate": 1.9189789740947427e-06, "loss": 0.0045, "step": 18320 }, { "grad_norm": 0.2824479639530182, "learning_rate": 1.896360489307597e-06, "loss": 0.0066, "step": 18330 }, { "grad_norm": 0.13925448060035706, "learning_rate": 1.8738735180340362e-06, "loss": 0.006, "step": 18340 }, { "grad_norm": 0.06703918427228928, "learning_rate": 1.8515181217525824e-06, "loss": 0.0051, "step": 18350 }, { "grad_norm": 0.09401879459619522, "learning_rate": 1.8292943615820457e-06, "loss": 0.0079, "step": 18360 }, { "grad_norm": 0.2919436991214752, "learning_rate": 1.8072022982813296e-06, "loss": 0.0082, "step": 18370 }, { "grad_norm": 0.20894771814346313, "learning_rate": 1.7852419922492925e-06, "loss": 0.0074, "step": 18380 }, { "grad_norm": 0.12923209369182587, "learning_rate": 1.763413503524569e-06, "loss": 0.0059, "step": 18390 }, { "grad_norm": 0.08016961067914963, "learning_rate": 1.7417168917854165e-06, "loss": 0.008, "step": 18400 }, { "grad_norm": 0.13532640039920807, "learning_rate": 1.720152216349552e-06, "loss": 0.0081, "step": 18410 }, { "grad_norm": 0.09424306452274323, "learning_rate": 1.6987195361739595e-06, "loss": 0.0074, "step": 18420 }, { "grad_norm": 0.13607047498226166, "learning_rate": 1.6774189098547832e-06, "loss": 0.0073, "step": 18430 }, { "grad_norm": 0.17515063285827637, "learning_rate": 1.6562503956271069e-06, "loss": 0.0072, "step": 18440 }, { "grad_norm": 0.11603332310914993, "learning_rate": 1.6352140513648417e-06, "loss": 0.0063, "step": 18450 }, { "grad_norm": 0.14426587522029877, "learning_rate": 1.6143099345805712e-06, "loss": 0.0082, "step": 18460 }, { "grad_norm": 0.19542574882507324, "learning_rate": 1.5935381024253293e-06, "loss": 0.0064, "step": 18470 }, { "grad_norm": 0.22276097536087036, "learning_rate": 1.572898611688517e-06, "loss": 0.0062, "step": 18480 }, { "grad_norm": 0.11537586152553558, "learning_rate": 1.5523915187977133e-06, "loss": 0.0071, "step": 18490 }, { "grad_norm": 0.30665817856788635, "learning_rate": 1.532016879818532e-06, "loss": 0.0064, "step": 18500 }, { "grad_norm": 0.3355673551559448, "learning_rate": 1.51177475045447e-06, "loss": 0.0113, "step": 18510 }, { "grad_norm": 0.0904407724738121, "learning_rate": 1.4916651860467035e-06, "loss": 0.0064, "step": 18520 }, { "grad_norm": 0.145564466714859, "learning_rate": 1.471688241574043e-06, "loss": 0.0077, "step": 18530 }, { "grad_norm": 0.18740566074848175, "learning_rate": 1.451843971652672e-06, "loss": 0.0112, "step": 18540 }, { "grad_norm": 0.13347408175468445, "learning_rate": 1.432132430536076e-06, "loss": 0.0062, "step": 18550 }, { "grad_norm": 0.12472502887248993, "learning_rate": 1.412553672114869e-06, "loss": 0.0099, "step": 18560 }, { "grad_norm": 0.09032303839921951, "learning_rate": 1.3931077499166056e-06, "loss": 0.0061, "step": 18570 }, { "grad_norm": 0.14776982367038727, "learning_rate": 1.3737947171057085e-06, "loss": 0.0061, "step": 18580 }, { "grad_norm": 0.1977526694536209, "learning_rate": 1.3546146264832582e-06, "loss": 0.0071, "step": 18590 }, { "grad_norm": 0.17422360181808472, "learning_rate": 1.3355675304869086e-06, "loss": 0.0082, "step": 18600 }, { "grad_norm": 0.18238909542560577, "learning_rate": 1.3166534811906827e-06, "loss": 0.0064, "step": 18610 }, { "grad_norm": 0.1807054579257965, "learning_rate": 1.2978725303048666e-06, "loss": 0.0064, "step": 18620 }, { "grad_norm": 0.41429853439331055, "learning_rate": 1.2792247291758762e-06, "loss": 0.007, "step": 18630 }, { "grad_norm": 0.09525076299905777, "learning_rate": 1.2607101287860635e-06, "loss": 0.0057, "step": 18640 }, { "grad_norm": 0.13367614150047302, "learning_rate": 1.2423287797536654e-06, "loss": 0.0067, "step": 18650 }, { "grad_norm": 0.2871861457824707, "learning_rate": 1.2240807323325776e-06, "loss": 0.0071, "step": 18660 }, { "grad_norm": 0.15559346973896027, "learning_rate": 1.205966036412254e-06, "loss": 0.0128, "step": 18670 }, { "grad_norm": 0.11369854211807251, "learning_rate": 1.1879847415175949e-06, "loss": 0.008, "step": 18680 }, { "grad_norm": 0.12126907706260681, "learning_rate": 1.1701368968087712e-06, "loss": 0.0079, "step": 18690 }, { "grad_norm": 0.09696295112371445, "learning_rate": 1.1524225510811116e-06, "loss": 0.0088, "step": 18700 }, { "grad_norm": 0.08779747039079666, "learning_rate": 1.1348417527649535e-06, "loss": 0.0073, "step": 18710 }, { "grad_norm": 0.15799100697040558, "learning_rate": 1.1173945499255268e-06, "loss": 0.0055, "step": 18720 }, { "grad_norm": 0.09780146181583405, "learning_rate": 1.1000809902628307e-06, "loss": 0.0064, "step": 18730 }, { "grad_norm": 0.16587544977664948, "learning_rate": 1.082901121111468e-06, "loss": 0.0065, "step": 18740 }, { "grad_norm": 0.07246270775794983, "learning_rate": 1.0658549894405456e-06, "loss": 0.0056, "step": 18750 }, { "grad_norm": 0.07960870862007141, "learning_rate": 1.0489426418535342e-06, "loss": 0.0056, "step": 18760 }, { "grad_norm": 0.12232284247875214, "learning_rate": 1.0321641245881474e-06, "loss": 0.0065, "step": 18770 }, { "grad_norm": 0.1591997891664505, "learning_rate": 1.015519483516214e-06, "loss": 0.0073, "step": 18780 }, { "grad_norm": 0.1743980348110199, "learning_rate": 9.990087641435443e-07, "loss": 0.0095, "step": 18790 }, { "grad_norm": 0.16895434260368347, "learning_rate": 9.826320116098132e-07, "loss": 0.0074, "step": 18800 }, { "grad_norm": 0.12505821883678436, "learning_rate": 9.663892706884447e-07, "loss": 0.0065, "step": 18810 }, { "grad_norm": 0.10744699090719223, "learning_rate": 9.502805857864616e-07, "loss": 0.0072, "step": 18820 }, { "grad_norm": 0.16336467862129211, "learning_rate": 9.34306000944396e-07, "loss": 0.0054, "step": 18830 }, { "grad_norm": 0.07005229592323303, "learning_rate": 9.184655598361624e-07, "loss": 0.0051, "step": 18840 }, { "grad_norm": 0.08955762535333633, "learning_rate": 9.027593057689076e-07, "loss": 0.0073, "step": 18850 }, { "grad_norm": 0.13053946197032928, "learning_rate": 8.871872816829441e-07, "loss": 0.0055, "step": 18860 }, { "grad_norm": 0.07814237475395203, "learning_rate": 8.717495301515777e-07, "loss": 0.0072, "step": 18870 }, { "grad_norm": 0.1309666484594345, "learning_rate": 8.564460933810415e-07, "loss": 0.0069, "step": 18880 }, { "grad_norm": 0.12929323315620422, "learning_rate": 8.412770132103453e-07, "loss": 0.0058, "step": 18890 }, { "grad_norm": 0.1032700389623642, "learning_rate": 8.262423311111711e-07, "loss": 0.008, "step": 18900 }, { "grad_norm": 0.1722853183746338, "learning_rate": 8.113420881877665e-07, "loss": 0.0054, "step": 18910 }, { "grad_norm": 0.17921482026576996, "learning_rate": 7.965763251768288e-07, "loss": 0.0074, "step": 18920 }, { "grad_norm": 0.08986055850982666, "learning_rate": 7.819450824473995e-07, "loss": 0.0072, "step": 18930 }, { "grad_norm": 0.12910540401935577, "learning_rate": 7.674484000007198e-07, "loss": 0.0058, "step": 18940 }, { "grad_norm": 0.10426624119281769, "learning_rate": 7.530863174701752e-07, "loss": 0.0052, "step": 18950 }, { "grad_norm": 0.16877539455890656, "learning_rate": 7.38858874121151e-07, "loss": 0.0075, "step": 18960 }, { "grad_norm": 0.2070782482624054, "learning_rate": 7.247661088509328e-07, "loss": 0.0054, "step": 18970 }, { "grad_norm": 0.06928875297307968, "learning_rate": 7.108080601886002e-07, "loss": 0.0057, "step": 18980 }, { "grad_norm": 0.11303035169839859, "learning_rate": 6.969847662949336e-07, "loss": 0.0047, "step": 18990 }, { "grad_norm": 0.2127775102853775, "learning_rate": 6.832962649622798e-07, "loss": 0.0055, "step": 19000 }, { "grad_norm": 0.46861031651496887, "learning_rate": 6.697425936144863e-07, "loss": 0.0077, "step": 19010 }, { "grad_norm": 0.17535817623138428, "learning_rate": 6.563237893067731e-07, "loss": 0.008, "step": 19020 }, { "grad_norm": 0.1297111213207245, "learning_rate": 6.430398887256328e-07, "loss": 0.0062, "step": 19030 }, { "grad_norm": 0.4480202794075012, "learning_rate": 6.298909281887478e-07, "loss": 0.0055, "step": 19040 }, { "grad_norm": 0.060463108122348785, "learning_rate": 6.168769436448673e-07, "loss": 0.0055, "step": 19050 }, { "grad_norm": 0.1461174339056015, "learning_rate": 6.03997970673742e-07, "loss": 0.0054, "step": 19060 }, { "grad_norm": 0.11906304955482483, "learning_rate": 5.912540444859782e-07, "loss": 0.0063, "step": 19070 }, { "grad_norm": 0.17703856527805328, "learning_rate": 5.786451999229837e-07, "loss": 0.0058, "step": 19080 }, { "grad_norm": 0.15030202269554138, "learning_rate": 5.661714714568722e-07, "loss": 0.0058, "step": 19090 }, { "grad_norm": 0.1666615754365921, "learning_rate": 5.538328931903259e-07, "loss": 0.0058, "step": 19100 }, { "grad_norm": 0.27413517236709595, "learning_rate": 5.416294988565551e-07, "loss": 0.0049, "step": 19110 }, { "grad_norm": 0.07719439268112183, "learning_rate": 5.29561321819172e-07, "loss": 0.0065, "step": 19120 }, { "grad_norm": 0.5233213901519775, "learning_rate": 5.176283950721061e-07, "loss": 0.0062, "step": 19130 }, { "grad_norm": 0.4687543511390686, "learning_rate": 5.058307512395332e-07, "loss": 0.0069, "step": 19140 }, { "grad_norm": 0.11426586657762527, "learning_rate": 4.941684225757526e-07, "loss": 0.0065, "step": 19150 }, { "grad_norm": 0.23970532417297363, "learning_rate": 4.826414409651314e-07, "loss": 0.0075, "step": 19160 }, { "grad_norm": 0.21478460729122162, "learning_rate": 4.712498379219943e-07, "loss": 0.0103, "step": 19170 }, { "grad_norm": 0.10797052830457687, "learning_rate": 4.599936445905506e-07, "loss": 0.0047, "step": 19180 }, { "grad_norm": 0.08822860568761826, "learning_rate": 4.4887289174480594e-07, "loss": 0.0051, "step": 19190 }, { "grad_norm": 0.10388945788145065, "learning_rate": 4.378876097884621e-07, "loss": 0.005, "step": 19200 }, { "grad_norm": 0.14567793905735016, "learning_rate": 4.2703782875487264e-07, "loss": 0.0063, "step": 19210 }, { "grad_norm": 0.08171377331018448, "learning_rate": 4.163235783069208e-07, "loss": 0.0061, "step": 19220 }, { "grad_norm": 0.16822920739650726, "learning_rate": 4.057448877369585e-07, "loss": 0.0089, "step": 19230 }, { "grad_norm": 0.07995481044054031, "learning_rate": 3.9530178596672295e-07, "loss": 0.0071, "step": 19240 }, { "grad_norm": 0.10448001325130463, "learning_rate": 3.849943015472479e-07, "loss": 0.0073, "step": 19250 }, { "grad_norm": 0.0782025083899498, "learning_rate": 3.748224626588137e-07, "loss": 0.0065, "step": 19260 }, { "grad_norm": 0.1974572390317917, "learning_rate": 3.647862971108307e-07, "loss": 0.0046, "step": 19270 }, { "grad_norm": 0.17926108837127686, "learning_rate": 3.5488583234179473e-07, "loss": 0.0073, "step": 19280 }, { "grad_norm": 0.1817624866962433, "learning_rate": 3.4512109541920413e-07, "loss": 0.0066, "step": 19290 }, { "grad_norm": 0.07127008587121964, "learning_rate": 3.354921130394706e-07, "loss": 0.0049, "step": 19300 }, { "grad_norm": 0.13600394129753113, "learning_rate": 3.259989115278639e-07, "loss": 0.0064, "step": 19310 }, { "grad_norm": 0.0802825540304184, "learning_rate": 3.1664151683843403e-07, "loss": 0.0054, "step": 19320 }, { "grad_norm": 0.07928349077701569, "learning_rate": 3.074199545539447e-07, "loss": 0.006, "step": 19330 }, { "grad_norm": 0.05396463721990585, "learning_rate": 2.983342498857955e-07, "loss": 0.0072, "step": 19340 }, { "grad_norm": 0.23302379250526428, "learning_rate": 2.893844276739499e-07, "loss": 0.0068, "step": 19350 }, { "grad_norm": 0.0736425593495369, "learning_rate": 2.8057051238688514e-07, "loss": 0.0056, "step": 19360 }, { "grad_norm": 0.10104979574680328, "learning_rate": 2.71892528121509e-07, "loss": 0.005, "step": 19370 }, { "grad_norm": 0.12184246629476547, "learning_rate": 2.633504986030988e-07, "loss": 0.0058, "step": 19380 }, { "grad_norm": 0.19412605464458466, "learning_rate": 2.549444471852347e-07, "loss": 0.0064, "step": 19390 }, { "grad_norm": 0.2881784439086914, "learning_rate": 2.4667439684974423e-07, "loss": 0.0055, "step": 19400 }, { "grad_norm": 0.23255540430545807, "learning_rate": 2.3854037020662467e-07, "loss": 0.009, "step": 19410 }, { "grad_norm": 0.14181499183177948, "learning_rate": 2.3054238949399288e-07, "loss": 0.0078, "step": 19420 }, { "grad_norm": 0.11480482667684555, "learning_rate": 2.2268047657802993e-07, "loss": 0.0056, "step": 19430 }, { "grad_norm": 0.19757556915283203, "learning_rate": 2.149546529529034e-07, "loss": 0.0071, "step": 19440 }, { "grad_norm": 0.10842618346214294, "learning_rate": 2.0736493974071736e-07, "loss": 0.0058, "step": 19450 }, { "grad_norm": 0.15002579987049103, "learning_rate": 1.9991135769145686e-07, "loss": 0.006, "step": 19460 }, { "grad_norm": 0.06764764338731766, "learning_rate": 1.9259392718293245e-07, "loss": 0.0066, "step": 19470 }, { "grad_norm": 0.2679527997970581, "learning_rate": 1.8541266822072467e-07, "loss": 0.0096, "step": 19480 }, { "grad_norm": 0.16565731167793274, "learning_rate": 1.7836760043811184e-07, "loss": 0.0076, "step": 19490 }, { "grad_norm": 0.17402954399585724, "learning_rate": 1.7145874309604792e-07, "loss": 0.0058, "step": 19500 }, { "grad_norm": 0.09997935593128204, "learning_rate": 1.6468611508308474e-07, "loss": 0.0079, "step": 19510 }, { "grad_norm": 0.10764401406049728, "learning_rate": 1.5804973491532204e-07, "loss": 0.0059, "step": 19520 }, { "grad_norm": 0.08543140441179276, "learning_rate": 1.5154962073637424e-07, "loss": 0.0067, "step": 19530 }, { "grad_norm": 0.09018760919570923, "learning_rate": 1.4518579031730372e-07, "loss": 0.0065, "step": 19540 }, { "grad_norm": 0.11289840936660767, "learning_rate": 1.389582610565876e-07, "loss": 0.0105, "step": 19550 }, { "grad_norm": 0.1827496588230133, "learning_rate": 1.3286704998003995e-07, "loss": 0.007, "step": 19560 }, { "grad_norm": 0.13279449939727783, "learning_rate": 1.2691217374080632e-07, "loss": 0.0086, "step": 19570 }, { "grad_norm": 0.10321550816297531, "learning_rate": 1.2109364861929705e-07, "loss": 0.0063, "step": 19580 }, { "grad_norm": 0.056855179369449615, "learning_rate": 1.1541149052312628e-07, "loss": 0.0054, "step": 19590 }, { "grad_norm": 0.07009416073560715, "learning_rate": 1.0986571498710074e-07, "loss": 0.0055, "step": 19600 }, { "grad_norm": 0.0947997197508812, "learning_rate": 1.0445633717316438e-07, "loss": 0.0046, "step": 19610 }, { "grad_norm": 0.10172976553440094, "learning_rate": 9.918337187034277e-08, "loss": 0.006, "step": 19620 }, { "grad_norm": 0.09158747643232346, "learning_rate": 9.404683349472643e-08, "loss": 0.0052, "step": 19630 }, { "grad_norm": 0.08260924369096756, "learning_rate": 8.904673608940983e-08, "loss": 0.0044, "step": 19640 }, { "grad_norm": 0.05803241208195686, "learning_rate": 8.418309332447471e-08, "loss": 0.0065, "step": 19650 }, { "grad_norm": 0.08447038382291794, "learning_rate": 7.945591849692902e-08, "loss": 0.0066, "step": 19660 }, { "grad_norm": 0.11308754235506058, "learning_rate": 7.486522453069578e-08, "loss": 0.0079, "step": 19670 }, { "grad_norm": 0.07578729838132858, "learning_rate": 7.041102397655208e-08, "loss": 0.0054, "step": 19680 }, { "grad_norm": 0.12008008360862732, "learning_rate": 6.609332901210685e-08, "loss": 0.0057, "step": 19690 }, { "grad_norm": 0.26330649852752686, "learning_rate": 6.191215144178419e-08, "loss": 0.012, "step": 19700 }, { "grad_norm": 0.390371710062027, "learning_rate": 5.786750269675678e-08, "loss": 0.0062, "step": 19710 }, { "grad_norm": 0.18639151751995087, "learning_rate": 5.395939383494031e-08, "loss": 0.006, "step": 19720 }, { "grad_norm": 0.07233691215515137, "learning_rate": 5.018783554095463e-08, "loss": 0.0064, "step": 19730 }, { "grad_norm": 0.16074158251285553, "learning_rate": 4.655283812610156e-08, "loss": 0.0078, "step": 19740 }, { "grad_norm": 0.079878069460392, "learning_rate": 4.305441152831491e-08, "loss": 0.0072, "step": 19750 }, { "grad_norm": 0.12221339344978333, "learning_rate": 3.9692565312171584e-08, "loss": 0.0063, "step": 19760 }, { "grad_norm": 0.10918987542390823, "learning_rate": 3.6467308668824975e-08, "loss": 0.0062, "step": 19770 }, { "grad_norm": 0.0999763160943985, "learning_rate": 3.3378650416004964e-08, "loss": 0.0083, "step": 19780 }, { "grad_norm": 0.16120582818984985, "learning_rate": 3.042659899797906e-08, "loss": 0.0077, "step": 19790 }, { "grad_norm": 0.09315381944179535, "learning_rate": 2.76111624855524e-08, "loss": 0.0049, "step": 19800 }, { "grad_norm": 0.06680784374475479, "learning_rate": 2.4932348576017784e-08, "loss": 0.0063, "step": 19810 }, { "grad_norm": 0.18044519424438477, "learning_rate": 2.239016459314458e-08, "loss": 0.0055, "step": 19820 }, { "grad_norm": 0.11231748759746552, "learning_rate": 1.9984617487173174e-08, "loss": 0.0073, "step": 19830 }, { "grad_norm": 0.13680022954940796, "learning_rate": 1.7715713834776105e-08, "loss": 0.0086, "step": 19840 }, { "grad_norm": 0.23710624873638153, "learning_rate": 1.5583459839046964e-08, "loss": 0.0066, "step": 19850 }, { "grad_norm": 0.12326239794492722, "learning_rate": 1.3587861329489304e-08, "loss": 0.0082, "step": 19860 }, { "grad_norm": 0.06511534750461578, "learning_rate": 1.1728923761994415e-08, "loss": 0.006, "step": 19870 }, { "grad_norm": 0.07283806800842285, "learning_rate": 1.0006652218819135e-08, "loss": 0.0054, "step": 19880 }, { "grad_norm": 0.2702879011631012, "learning_rate": 8.421051408596947e-09, "loss": 0.0065, "step": 19890 }, { "grad_norm": 0.1429319828748703, "learning_rate": 6.972125666299123e-09, "loss": 0.0071, "step": 19900 }, { "grad_norm": 0.09107927978038788, "learning_rate": 5.659878953229169e-09, "loss": 0.0053, "step": 19910 }, { "grad_norm": 0.07832154631614685, "learning_rate": 4.48431485701728e-09, "loss": 0.0058, "step": 19920 }, { "grad_norm": 0.06927736103534698, "learning_rate": 3.4454365916203322e-09, "loss": 0.0052, "step": 19930 }, { "grad_norm": 0.13584686815738678, "learning_rate": 2.5432469972830332e-09, "loss": 0.0054, "step": 19940 }, { "grad_norm": 0.08867272734642029, "learning_rate": 1.7777485405601203e-09, "loss": 0.006, "step": 19950 }, { "grad_norm": 0.12561294436454773, "learning_rate": 1.1489433142941597e-09, "loss": 0.0062, "step": 19960 }, { "grad_norm": 0.11785329133272171, "learning_rate": 6.568330376210963e-10, "loss": 0.0067, "step": 19970 }, { "grad_norm": 0.16057169437408447, "learning_rate": 3.0141905594249787e-10, "loss": 0.0057, "step": 19980 }, { "grad_norm": 1.7752858400344849, "learning_rate": 8.270234094776008e-11, "loss": 0.0075, "step": 19990 }, { "grad_norm": 0.08768580108880997, "learning_rate": 6.834906085551041e-13, "loss": 0.0049, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }