diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6666666666666666, + "eval_steps": 500, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "grad_norm": 4.756218910217285, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4157, + "step": 10 + }, + { + "grad_norm": 4.533816814422607, + "learning_rate": 6.333333333333333e-07, + "loss": 1.4125, + "step": 20 + }, + { + "grad_norm": 4.002695560455322, + "learning_rate": 9.666666666666668e-07, + "loss": 1.3849, + "step": 30 + }, + { + "grad_norm": 2.823151111602783, + "learning_rate": 1.3e-06, + "loss": 1.3157, + "step": 40 + }, + { + "grad_norm": 1.5573179721832275, + "learning_rate": 1.6333333333333333e-06, + "loss": 1.2429, + "step": 50 + }, + { + "grad_norm": 0.9189413189888, + "learning_rate": 1.9666666666666668e-06, + "loss": 1.1899, + "step": 60 + }, + { + "grad_norm": 0.5993644595146179, + "learning_rate": 2.3e-06, + "loss": 1.1445, + "step": 70 + }, + { + "grad_norm": 0.4388406276702881, + "learning_rate": 2.6333333333333337e-06, + "loss": 1.1259, + "step": 80 + }, + { + "grad_norm": 0.5618627071380615, + "learning_rate": 2.966666666666667e-06, + "loss": 1.1022, + "step": 90 + }, + { + "grad_norm": 0.5426422953605652, + "learning_rate": 3.3e-06, + "loss": 1.1008, + "step": 100 + }, + { + "grad_norm": 0.6314268708229065, + "learning_rate": 3.633333333333334e-06, + "loss": 1.0895, + "step": 110 + }, + { + "grad_norm": 0.5122863054275513, + "learning_rate": 3.966666666666667e-06, + "loss": 1.0792, + "step": 120 + }, + { + "grad_norm": 0.5248027443885803, + "learning_rate": 4.2999999999999995e-06, + "loss": 1.0694, + "step": 130 + }, + { + "grad_norm": 0.6071310639381409, + "learning_rate": 4.633333333333334e-06, + "loss": 1.0675, + "step": 140 + }, + { + "grad_norm": 0.566581666469574, + "learning_rate": 4.966666666666667e-06, + "loss": 1.0633, + "step": 150 + }, + { + "grad_norm": 0.6244418025016785, + "learning_rate": 5.3e-06, + "loss": 1.0559, + "step": 160 + }, + { + "grad_norm": 0.4709051549434662, + "learning_rate": 5.633333333333333e-06, + "loss": 1.0554, + "step": 170 + }, + { + "grad_norm": 0.6256153583526611, + "learning_rate": 5.9666666666666666e-06, + "loss": 1.0538, + "step": 180 + }, + { + "grad_norm": 0.6180979609489441, + "learning_rate": 6.300000000000001e-06, + "loss": 1.0465, + "step": 190 + }, + { + "grad_norm": 0.5741148591041565, + "learning_rate": 6.633333333333333e-06, + "loss": 1.0468, + "step": 200 + }, + { + "grad_norm": 0.5811980366706848, + "learning_rate": 6.966666666666667e-06, + "loss": 1.0448, + "step": 210 + }, + { + "grad_norm": 0.6171549558639526, + "learning_rate": 7.2999999999999996e-06, + "loss": 1.04, + "step": 220 + }, + { + "grad_norm": 0.6265953183174133, + "learning_rate": 7.633333333333334e-06, + "loss": 1.0324, + "step": 230 + }, + { + "grad_norm": 0.6889417767524719, + "learning_rate": 7.966666666666666e-06, + "loss": 1.0267, + "step": 240 + }, + { + "grad_norm": 0.8068269491195679, + "learning_rate": 8.3e-06, + "loss": 1.0159, + "step": 250 + }, + { + "grad_norm": 0.8558771014213562, + "learning_rate": 8.633333333333334e-06, + "loss": 0.9939, + "step": 260 + }, + { + "grad_norm": 0.8399918079376221, + "learning_rate": 8.966666666666668e-06, + "loss": 0.9742, + "step": 270 + }, + { + "grad_norm": 1.2498595714569092, + "learning_rate": 9.3e-06, + "loss": 0.9264, + "step": 280 + }, + { + "grad_norm": 1.109431266784668, + "learning_rate": 9.633333333333335e-06, + "loss": 0.8852, + "step": 290 + }, + { + "grad_norm": 1.3449949026107788, + "learning_rate": 9.966666666666667e-06, + "loss": 0.827, + "step": 300 + }, + { + "grad_norm": 1.495901346206665, + "learning_rate": 1.03e-05, + "loss": 0.7714, + "step": 310 + }, + { + "grad_norm": 1.4885666370391846, + "learning_rate": 1.0633333333333334e-05, + "loss": 0.7105, + "step": 320 + }, + { + "grad_norm": 1.8869973421096802, + "learning_rate": 1.0966666666666666e-05, + "loss": 0.6658, + "step": 330 + }, + { + "grad_norm": 1.8417017459869385, + "learning_rate": 1.13e-05, + "loss": 0.6087, + "step": 340 + }, + { + "grad_norm": 2.179997682571411, + "learning_rate": 1.1633333333333334e-05, + "loss": 0.5631, + "step": 350 + }, + { + "grad_norm": 1.715309500694275, + "learning_rate": 1.1966666666666668e-05, + "loss": 0.5301, + "step": 360 + }, + { + "grad_norm": 2.6223623752593994, + "learning_rate": 1.23e-05, + "loss": 0.5061, + "step": 370 + }, + { + "grad_norm": 2.5044667720794678, + "learning_rate": 1.2633333333333333e-05, + "loss": 0.4606, + "step": 380 + }, + { + "grad_norm": 2.0453903675079346, + "learning_rate": 1.2966666666666669e-05, + "loss": 0.4261, + "step": 390 + }, + { + "grad_norm": 1.932612419128418, + "learning_rate": 1.3300000000000001e-05, + "loss": 0.3947, + "step": 400 + }, + { + "grad_norm": 1.958109974861145, + "learning_rate": 1.3633333333333334e-05, + "loss": 0.3719, + "step": 410 + }, + { + "grad_norm": 3.0342814922332764, + "learning_rate": 1.3966666666666666e-05, + "loss": 0.3453, + "step": 420 + }, + { + "grad_norm": 2.7047736644744873, + "learning_rate": 1.43e-05, + "loss": 0.3351, + "step": 430 + }, + { + "grad_norm": 2.491868734359741, + "learning_rate": 1.4633333333333334e-05, + "loss": 0.3125, + "step": 440 + }, + { + "grad_norm": 2.332961320877075, + "learning_rate": 1.4966666666666668e-05, + "loss": 0.2964, + "step": 450 + }, + { + "grad_norm": 2.4805514812469482, + "learning_rate": 1.53e-05, + "loss": 0.28, + "step": 460 + }, + { + "grad_norm": 2.3569538593292236, + "learning_rate": 1.563333333333333e-05, + "loss": 0.2752, + "step": 470 + }, + { + "grad_norm": 2.5456480979919434, + "learning_rate": 1.5966666666666667e-05, + "loss": 0.2511, + "step": 480 + }, + { + "grad_norm": 2.4560647010803223, + "learning_rate": 1.63e-05, + "loss": 0.2392, + "step": 490 + }, + { + "grad_norm": 2.0467610359191895, + "learning_rate": 1.6633333333333336e-05, + "loss": 0.223, + "step": 500 + }, + { + "grad_norm": 2.2710206508636475, + "learning_rate": 1.6966666666666668e-05, + "loss": 0.2181, + "step": 510 + }, + { + "grad_norm": 2.6807422637939453, + "learning_rate": 1.73e-05, + "loss": 0.197, + "step": 520 + }, + { + "grad_norm": 2.534992218017578, + "learning_rate": 1.7633333333333336e-05, + "loss": 0.2032, + "step": 530 + }, + { + "grad_norm": 1.784435749053955, + "learning_rate": 1.796666666666667e-05, + "loss": 0.1766, + "step": 540 + }, + { + "grad_norm": 2.40442156791687, + "learning_rate": 1.83e-05, + "loss": 0.1762, + "step": 550 + }, + { + "grad_norm": 2.732968330383301, + "learning_rate": 1.8633333333333333e-05, + "loss": 0.1682, + "step": 560 + }, + { + "grad_norm": 2.360205888748169, + "learning_rate": 1.896666666666667e-05, + "loss": 0.1527, + "step": 570 + }, + { + "grad_norm": 2.251589775085449, + "learning_rate": 1.93e-05, + "loss": 0.1554, + "step": 580 + }, + { + "grad_norm": 2.385878562927246, + "learning_rate": 1.9633333333333334e-05, + "loss": 0.1616, + "step": 590 + }, + { + "grad_norm": 2.435128927230835, + "learning_rate": 1.9966666666666666e-05, + "loss": 0.155, + "step": 600 + }, + { + "grad_norm": 2.9539012908935547, + "learning_rate": 2.0300000000000002e-05, + "loss": 0.1641, + "step": 610 + }, + { + "grad_norm": 2.2021026611328125, + "learning_rate": 2.0633333333333335e-05, + "loss": 0.1413, + "step": 620 + }, + { + "grad_norm": 2.5571975708007812, + "learning_rate": 2.0966666666666667e-05, + "loss": 0.1299, + "step": 630 + }, + { + "grad_norm": 2.4322869777679443, + "learning_rate": 2.13e-05, + "loss": 0.1402, + "step": 640 + }, + { + "grad_norm": 2.5171878337860107, + "learning_rate": 2.1633333333333332e-05, + "loss": 0.1384, + "step": 650 + }, + { + "grad_norm": 2.2761356830596924, + "learning_rate": 2.1966666666666668e-05, + "loss": 0.1293, + "step": 660 + }, + { + "grad_norm": 2.6294305324554443, + "learning_rate": 2.23e-05, + "loss": 0.1545, + "step": 670 + }, + { + "grad_norm": 2.5020318031311035, + "learning_rate": 2.2633333333333336e-05, + "loss": 0.1362, + "step": 680 + }, + { + "grad_norm": 2.714468002319336, + "learning_rate": 2.2966666666666668e-05, + "loss": 0.1303, + "step": 690 + }, + { + "grad_norm": 2.5271618366241455, + "learning_rate": 2.3300000000000004e-05, + "loss": 0.1314, + "step": 700 + }, + { + "grad_norm": 2.491684913635254, + "learning_rate": 2.3633333333333336e-05, + "loss": 0.1422, + "step": 710 + }, + { + "grad_norm": 2.2820470333099365, + "learning_rate": 2.396666666666667e-05, + "loss": 0.1239, + "step": 720 + }, + { + "grad_norm": 2.5786640644073486, + "learning_rate": 2.43e-05, + "loss": 0.1261, + "step": 730 + }, + { + "grad_norm": 2.4451346397399902, + "learning_rate": 2.4633333333333334e-05, + "loss": 0.1094, + "step": 740 + }, + { + "grad_norm": 2.223487377166748, + "learning_rate": 2.496666666666667e-05, + "loss": 0.1253, + "step": 750 + }, + { + "grad_norm": 2.8065717220306396, + "learning_rate": 2.5300000000000002e-05, + "loss": 0.1143, + "step": 760 + }, + { + "grad_norm": 2.274339437484741, + "learning_rate": 2.5633333333333338e-05, + "loss": 0.1171, + "step": 770 + }, + { + "grad_norm": 1.814670205116272, + "learning_rate": 2.5966666666666667e-05, + "loss": 0.1179, + "step": 780 + }, + { + "grad_norm": 2.0471904277801514, + "learning_rate": 2.6300000000000002e-05, + "loss": 0.1096, + "step": 790 + }, + { + "grad_norm": 2.3558876514434814, + "learning_rate": 2.663333333333333e-05, + "loss": 0.1171, + "step": 800 + }, + { + "grad_norm": 2.887620449066162, + "learning_rate": 2.6966666666666667e-05, + "loss": 0.1218, + "step": 810 + }, + { + "grad_norm": 1.7995444536209106, + "learning_rate": 2.7300000000000003e-05, + "loss": 0.1136, + "step": 820 + }, + { + "grad_norm": 1.9670045375823975, + "learning_rate": 2.7633333333333332e-05, + "loss": 0.1135, + "step": 830 + }, + { + "grad_norm": 1.573832631111145, + "learning_rate": 2.7966666666666668e-05, + "loss": 0.1058, + "step": 840 + }, + { + "grad_norm": 2.520409345626831, + "learning_rate": 2.83e-05, + "loss": 0.1265, + "step": 850 + }, + { + "grad_norm": 2.2822840213775635, + "learning_rate": 2.8633333333333336e-05, + "loss": 0.1077, + "step": 860 + }, + { + "grad_norm": 2.5108864307403564, + "learning_rate": 2.8966666666666668e-05, + "loss": 0.1099, + "step": 870 + }, + { + "grad_norm": 2.2583436965942383, + "learning_rate": 2.93e-05, + "loss": 0.102, + "step": 880 + }, + { + "grad_norm": 1.973009705543518, + "learning_rate": 2.9633333333333336e-05, + "loss": 0.1006, + "step": 890 + }, + { + "grad_norm": 2.5470638275146484, + "learning_rate": 2.9966666666666672e-05, + "loss": 0.1162, + "step": 900 + }, + { + "grad_norm": 2.4625256061553955, + "learning_rate": 3.03e-05, + "loss": 0.0983, + "step": 910 + }, + { + "grad_norm": 2.042452096939087, + "learning_rate": 3.063333333333334e-05, + "loss": 0.1113, + "step": 920 + }, + { + "grad_norm": 2.270254373550415, + "learning_rate": 3.096666666666666e-05, + "loss": 0.1007, + "step": 930 + }, + { + "grad_norm": 1.8730031251907349, + "learning_rate": 3.13e-05, + "loss": 0.0979, + "step": 940 + }, + { + "grad_norm": 1.923866868019104, + "learning_rate": 3.1633333333333334e-05, + "loss": 0.1135, + "step": 950 + }, + { + "grad_norm": 2.1694090366363525, + "learning_rate": 3.196666666666667e-05, + "loss": 0.0968, + "step": 960 + }, + { + "grad_norm": 2.5053062438964844, + "learning_rate": 3.2300000000000006e-05, + "loss": 0.0962, + "step": 970 + }, + { + "grad_norm": 1.9679805040359497, + "learning_rate": 3.263333333333333e-05, + "loss": 0.1052, + "step": 980 + }, + { + "grad_norm": 2.5273709297180176, + "learning_rate": 3.296666666666667e-05, + "loss": 0.1073, + "step": 990 + }, + { + "grad_norm": 2.220262050628662, + "learning_rate": 3.33e-05, + "loss": 0.1015, + "step": 1000 + }, + { + "grad_norm": 2.0471818447113037, + "learning_rate": 3.3633333333333335e-05, + "loss": 0.0954, + "step": 1010 + }, + { + "grad_norm": 1.8970118761062622, + "learning_rate": 3.396666666666667e-05, + "loss": 0.0973, + "step": 1020 + }, + { + "grad_norm": 1.8871910572052002, + "learning_rate": 3.430000000000001e-05, + "loss": 0.0991, + "step": 1030 + }, + { + "grad_norm": 1.8973971605300903, + "learning_rate": 3.463333333333333e-05, + "loss": 0.0956, + "step": 1040 + }, + { + "grad_norm": 2.0561819076538086, + "learning_rate": 3.496666666666667e-05, + "loss": 0.0913, + "step": 1050 + }, + { + "grad_norm": 2.165154457092285, + "learning_rate": 3.53e-05, + "loss": 0.1109, + "step": 1060 + }, + { + "grad_norm": 2.1164817810058594, + "learning_rate": 3.563333333333334e-05, + "loss": 0.0997, + "step": 1070 + }, + { + "grad_norm": 2.1834139823913574, + "learning_rate": 3.596666666666667e-05, + "loss": 0.1002, + "step": 1080 + }, + { + "grad_norm": 1.7735356092453003, + "learning_rate": 3.63e-05, + "loss": 0.081, + "step": 1090 + }, + { + "grad_norm": 2.0510787963867188, + "learning_rate": 3.6633333333333334e-05, + "loss": 0.0949, + "step": 1100 + }, + { + "grad_norm": 2.2429168224334717, + "learning_rate": 3.6966666666666666e-05, + "loss": 0.0972, + "step": 1110 + }, + { + "grad_norm": 1.9839978218078613, + "learning_rate": 3.73e-05, + "loss": 0.0939, + "step": 1120 + }, + { + "grad_norm": 1.8827766180038452, + "learning_rate": 3.763333333333334e-05, + "loss": 0.1018, + "step": 1130 + }, + { + "grad_norm": 1.7244727611541748, + "learning_rate": 3.796666666666667e-05, + "loss": 0.096, + "step": 1140 + }, + { + "grad_norm": 1.6272896528244019, + "learning_rate": 3.83e-05, + "loss": 0.0962, + "step": 1150 + }, + { + "grad_norm": 1.6950806379318237, + "learning_rate": 3.8633333333333335e-05, + "loss": 0.0985, + "step": 1160 + }, + { + "grad_norm": 1.6659928560256958, + "learning_rate": 3.896666666666667e-05, + "loss": 0.0897, + "step": 1170 + }, + { + "grad_norm": 1.8704904317855835, + "learning_rate": 3.9300000000000007e-05, + "loss": 0.0916, + "step": 1180 + }, + { + "grad_norm": 2.13916277885437, + "learning_rate": 3.963333333333333e-05, + "loss": 0.093, + "step": 1190 + }, + { + "grad_norm": 1.256888508796692, + "learning_rate": 3.996666666666667e-05, + "loss": 0.0991, + "step": 1200 + }, + { + "grad_norm": 1.7913360595703125, + "learning_rate": 4.0300000000000004e-05, + "loss": 0.096, + "step": 1210 + }, + { + "grad_norm": 1.8980103731155396, + "learning_rate": 4.0633333333333336e-05, + "loss": 0.0886, + "step": 1220 + }, + { + "grad_norm": 1.5002257823944092, + "learning_rate": 4.096666666666667e-05, + "loss": 0.0946, + "step": 1230 + }, + { + "grad_norm": 1.6084682941436768, + "learning_rate": 4.13e-05, + "loss": 0.0892, + "step": 1240 + }, + { + "grad_norm": 1.6400270462036133, + "learning_rate": 4.1633333333333333e-05, + "loss": 0.0955, + "step": 1250 + }, + { + "grad_norm": 1.6958472728729248, + "learning_rate": 4.196666666666667e-05, + "loss": 0.0845, + "step": 1260 + }, + { + "grad_norm": 1.517759919166565, + "learning_rate": 4.23e-05, + "loss": 0.0965, + "step": 1270 + }, + { + "grad_norm": 1.7336673736572266, + "learning_rate": 4.263333333333334e-05, + "loss": 0.0946, + "step": 1280 + }, + { + "grad_norm": 1.8478679656982422, + "learning_rate": 4.296666666666666e-05, + "loss": 0.0963, + "step": 1290 + }, + { + "grad_norm": 1.453667402267456, + "learning_rate": 4.33e-05, + "loss": 0.0882, + "step": 1300 + }, + { + "grad_norm": 1.4964280128479004, + "learning_rate": 4.3633333333333335e-05, + "loss": 0.0831, + "step": 1310 + }, + { + "grad_norm": 1.585028052330017, + "learning_rate": 4.396666666666667e-05, + "loss": 0.0965, + "step": 1320 + }, + { + "grad_norm": 1.6701347827911377, + "learning_rate": 4.43e-05, + "loss": 0.0824, + "step": 1330 + }, + { + "grad_norm": 1.593643307685852, + "learning_rate": 4.463333333333334e-05, + "loss": 0.0888, + "step": 1340 + }, + { + "grad_norm": 1.9641443490982056, + "learning_rate": 4.496666666666667e-05, + "loss": 0.0832, + "step": 1350 + }, + { + "grad_norm": 2.0581891536712646, + "learning_rate": 4.53e-05, + "loss": 0.0925, + "step": 1360 + }, + { + "grad_norm": 1.3554548025131226, + "learning_rate": 4.5633333333333336e-05, + "loss": 0.0772, + "step": 1370 + }, + { + "grad_norm": 1.6955831050872803, + "learning_rate": 4.596666666666667e-05, + "loss": 0.0835, + "step": 1380 + }, + { + "grad_norm": 1.43825364112854, + "learning_rate": 4.630000000000001e-05, + "loss": 0.0824, + "step": 1390 + }, + { + "grad_norm": 1.4359674453735352, + "learning_rate": 4.663333333333333e-05, + "loss": 0.0919, + "step": 1400 + }, + { + "grad_norm": 1.3146463632583618, + "learning_rate": 4.696666666666667e-05, + "loss": 0.0747, + "step": 1410 + }, + { + "grad_norm": 1.3383302688598633, + "learning_rate": 4.73e-05, + "loss": 0.0852, + "step": 1420 + }, + { + "grad_norm": 1.4164702892303467, + "learning_rate": 4.763333333333334e-05, + "loss": 0.088, + "step": 1430 + }, + { + "grad_norm": 1.6717121601104736, + "learning_rate": 4.796666666666667e-05, + "loss": 0.0838, + "step": 1440 + }, + { + "grad_norm": 1.6992640495300293, + "learning_rate": 4.83e-05, + "loss": 0.0885, + "step": 1450 + }, + { + "grad_norm": 1.4425464868545532, + "learning_rate": 4.8633333333333334e-05, + "loss": 0.08, + "step": 1460 + }, + { + "grad_norm": 1.7380220890045166, + "learning_rate": 4.8966666666666667e-05, + "loss": 0.0857, + "step": 1470 + }, + { + "grad_norm": 1.3327770233154297, + "learning_rate": 4.93e-05, + "loss": 0.0857, + "step": 1480 + }, + { + "grad_norm": 1.3948441743850708, + "learning_rate": 4.963333333333334e-05, + "loss": 0.0768, + "step": 1490 + }, + { + "grad_norm": 1.4107425212860107, + "learning_rate": 4.996666666666667e-05, + "loss": 0.0956, + "step": 1500 + }, + { + "grad_norm": 1.5134875774383545, + "learning_rate": 5.03e-05, + "loss": 0.081, + "step": 1510 + }, + { + "grad_norm": 1.4226447343826294, + "learning_rate": 5.0633333333333335e-05, + "loss": 0.0835, + "step": 1520 + }, + { + "grad_norm": 1.7193541526794434, + "learning_rate": 5.0966666666666674e-05, + "loss": 0.0847, + "step": 1530 + }, + { + "grad_norm": 1.5719715356826782, + "learning_rate": 5.130000000000001e-05, + "loss": 0.0889, + "step": 1540 + }, + { + "grad_norm": 1.4408549070358276, + "learning_rate": 5.163333333333333e-05, + "loss": 0.084, + "step": 1550 + }, + { + "grad_norm": 1.4915046691894531, + "learning_rate": 5.196666666666667e-05, + "loss": 0.0844, + "step": 1560 + }, + { + "grad_norm": 1.3872088193893433, + "learning_rate": 5.2300000000000004e-05, + "loss": 0.0797, + "step": 1570 + }, + { + "grad_norm": 1.6842246055603027, + "learning_rate": 5.2633333333333336e-05, + "loss": 0.0743, + "step": 1580 + }, + { + "grad_norm": 1.87129807472229, + "learning_rate": 5.296666666666666e-05, + "loss": 0.0831, + "step": 1590 + }, + { + "grad_norm": 1.3865938186645508, + "learning_rate": 5.330000000000001e-05, + "loss": 0.0813, + "step": 1600 + }, + { + "grad_norm": 1.0861713886260986, + "learning_rate": 5.3633333333333334e-05, + "loss": 0.0744, + "step": 1610 + }, + { + "grad_norm": 1.3400224447250366, + "learning_rate": 5.3966666666666666e-05, + "loss": 0.0702, + "step": 1620 + }, + { + "grad_norm": 1.258410096168518, + "learning_rate": 5.4300000000000005e-05, + "loss": 0.0774, + "step": 1630 + }, + { + "grad_norm": 1.5131471157073975, + "learning_rate": 5.463333333333334e-05, + "loss": 0.076, + "step": 1640 + }, + { + "grad_norm": 1.4185198545455933, + "learning_rate": 5.496666666666666e-05, + "loss": 0.0849, + "step": 1650 + }, + { + "grad_norm": 1.3197180032730103, + "learning_rate": 5.530000000000001e-05, + "loss": 0.0678, + "step": 1660 + }, + { + "grad_norm": 1.8167043924331665, + "learning_rate": 5.5633333333333335e-05, + "loss": 0.0781, + "step": 1670 + }, + { + "grad_norm": 1.2916417121887207, + "learning_rate": 5.596666666666667e-05, + "loss": 0.0833, + "step": 1680 + }, + { + "grad_norm": 1.3026211261749268, + "learning_rate": 5.63e-05, + "loss": 0.0829, + "step": 1690 + }, + { + "grad_norm": 1.4835796356201172, + "learning_rate": 5.663333333333334e-05, + "loss": 0.08, + "step": 1700 + }, + { + "grad_norm": 1.2610836029052734, + "learning_rate": 5.696666666666667e-05, + "loss": 0.0745, + "step": 1710 + }, + { + "grad_norm": 1.566968321800232, + "learning_rate": 5.73e-05, + "loss": 0.0828, + "step": 1720 + }, + { + "grad_norm": 1.255800485610962, + "learning_rate": 5.7633333333333336e-05, + "loss": 0.0824, + "step": 1730 + }, + { + "grad_norm": 1.2247788906097412, + "learning_rate": 5.796666666666667e-05, + "loss": 0.0723, + "step": 1740 + }, + { + "grad_norm": 1.3425395488739014, + "learning_rate": 5.83e-05, + "loss": 0.0729, + "step": 1750 + }, + { + "grad_norm": 1.2652937173843384, + "learning_rate": 5.863333333333334e-05, + "loss": 0.0823, + "step": 1760 + }, + { + "grad_norm": 1.3104197978973389, + "learning_rate": 5.896666666666667e-05, + "loss": 0.0763, + "step": 1770 + }, + { + "grad_norm": 1.1734591722488403, + "learning_rate": 5.93e-05, + "loss": 0.0765, + "step": 1780 + }, + { + "grad_norm": 1.1746596097946167, + "learning_rate": 5.9633333333333344e-05, + "loss": 0.0852, + "step": 1790 + }, + { + "grad_norm": 1.4064711332321167, + "learning_rate": 5.996666666666667e-05, + "loss": 0.0745, + "step": 1800 + }, + { + "grad_norm": 0.9678300619125366, + "learning_rate": 6.03e-05, + "loss": 0.0756, + "step": 1810 + }, + { + "grad_norm": 1.1954820156097412, + "learning_rate": 6.063333333333333e-05, + "loss": 0.071, + "step": 1820 + }, + { + "grad_norm": 1.303094744682312, + "learning_rate": 6.0966666666666674e-05, + "loss": 0.0712, + "step": 1830 + }, + { + "grad_norm": 1.378692626953125, + "learning_rate": 6.13e-05, + "loss": 0.0734, + "step": 1840 + }, + { + "grad_norm": 1.252637505531311, + "learning_rate": 6.163333333333333e-05, + "loss": 0.0719, + "step": 1850 + }, + { + "grad_norm": 1.215796947479248, + "learning_rate": 6.196666666666668e-05, + "loss": 0.0832, + "step": 1860 + }, + { + "grad_norm": 1.1180384159088135, + "learning_rate": 6.23e-05, + "loss": 0.0809, + "step": 1870 + }, + { + "grad_norm": 1.3103326559066772, + "learning_rate": 6.263333333333333e-05, + "loss": 0.07, + "step": 1880 + }, + { + "grad_norm": 1.1420583724975586, + "learning_rate": 6.296666666666667e-05, + "loss": 0.0655, + "step": 1890 + }, + { + "grad_norm": 1.397101879119873, + "learning_rate": 6.330000000000001e-05, + "loss": 0.0888, + "step": 1900 + }, + { + "grad_norm": 1.2339445352554321, + "learning_rate": 6.363333333333334e-05, + "loss": 0.0784, + "step": 1910 + }, + { + "grad_norm": 1.3281197547912598, + "learning_rate": 6.396666666666667e-05, + "loss": 0.0742, + "step": 1920 + }, + { + "grad_norm": 1.4674954414367676, + "learning_rate": 6.43e-05, + "loss": 0.0753, + "step": 1930 + }, + { + "grad_norm": 1.1643928289413452, + "learning_rate": 6.463333333333334e-05, + "loss": 0.0774, + "step": 1940 + }, + { + "grad_norm": 1.4948179721832275, + "learning_rate": 6.496666666666667e-05, + "loss": 0.0777, + "step": 1950 + }, + { + "grad_norm": 1.1078697443008423, + "learning_rate": 6.53e-05, + "loss": 0.0628, + "step": 1960 + }, + { + "grad_norm": 1.1832149028778076, + "learning_rate": 6.563333333333333e-05, + "loss": 0.0764, + "step": 1970 + }, + { + "grad_norm": 1.3219122886657715, + "learning_rate": 6.596666666666667e-05, + "loss": 0.083, + "step": 1980 + }, + { + "grad_norm": 1.2816904783248901, + "learning_rate": 6.630000000000001e-05, + "loss": 0.073, + "step": 1990 + }, + { + "grad_norm": 0.922963559627533, + "learning_rate": 6.663333333333333e-05, + "loss": 0.0667, + "step": 2000 + }, + { + "grad_norm": 1.0142713785171509, + "learning_rate": 6.696666666666666e-05, + "loss": 0.0682, + "step": 2010 + }, + { + "grad_norm": 1.1142243146896362, + "learning_rate": 6.730000000000001e-05, + "loss": 0.0702, + "step": 2020 + }, + { + "grad_norm": 1.3863885402679443, + "learning_rate": 6.763333333333334e-05, + "loss": 0.0748, + "step": 2030 + }, + { + "grad_norm": 0.8603031635284424, + "learning_rate": 6.796666666666666e-05, + "loss": 0.0776, + "step": 2040 + }, + { + "grad_norm": 1.0744670629501343, + "learning_rate": 6.83e-05, + "loss": 0.0697, + "step": 2050 + }, + { + "grad_norm": 1.0202767848968506, + "learning_rate": 6.863333333333334e-05, + "loss": 0.0658, + "step": 2060 + }, + { + "grad_norm": 1.1308379173278809, + "learning_rate": 6.896666666666667e-05, + "loss": 0.0656, + "step": 2070 + }, + { + "grad_norm": 1.1652814149856567, + "learning_rate": 6.93e-05, + "loss": 0.0663, + "step": 2080 + }, + { + "grad_norm": 1.0037552118301392, + "learning_rate": 6.963333333333334e-05, + "loss": 0.0651, + "step": 2090 + }, + { + "grad_norm": 1.3986576795578003, + "learning_rate": 6.996666666666667e-05, + "loss": 0.0671, + "step": 2100 + }, + { + "grad_norm": 1.111999273300171, + "learning_rate": 7.03e-05, + "loss": 0.0658, + "step": 2110 + }, + { + "grad_norm": 0.9913539886474609, + "learning_rate": 7.063333333333333e-05, + "loss": 0.0624, + "step": 2120 + }, + { + "grad_norm": 1.0654922723770142, + "learning_rate": 7.096666666666667e-05, + "loss": 0.0738, + "step": 2130 + }, + { + "grad_norm": 1.2569122314453125, + "learning_rate": 7.13e-05, + "loss": 0.0717, + "step": 2140 + }, + { + "grad_norm": 1.0777548551559448, + "learning_rate": 7.163333333333334e-05, + "loss": 0.0788, + "step": 2150 + }, + { + "grad_norm": 1.2611500024795532, + "learning_rate": 7.196666666666668e-05, + "loss": 0.0681, + "step": 2160 + }, + { + "grad_norm": 1.1128934621810913, + "learning_rate": 7.23e-05, + "loss": 0.0647, + "step": 2170 + }, + { + "grad_norm": 1.0642884969711304, + "learning_rate": 7.263333333333334e-05, + "loss": 0.0755, + "step": 2180 + }, + { + "grad_norm": 1.2553468942642212, + "learning_rate": 7.296666666666667e-05, + "loss": 0.0685, + "step": 2190 + }, + { + "grad_norm": 0.9592381715774536, + "learning_rate": 7.33e-05, + "loss": 0.0639, + "step": 2200 + }, + { + "grad_norm": 1.0337885618209839, + "learning_rate": 7.363333333333334e-05, + "loss": 0.0634, + "step": 2210 + }, + { + "grad_norm": 1.0090278387069702, + "learning_rate": 7.396666666666667e-05, + "loss": 0.0692, + "step": 2220 + }, + { + "grad_norm": 1.358959436416626, + "learning_rate": 7.43e-05, + "loss": 0.0749, + "step": 2230 + }, + { + "grad_norm": 1.103554368019104, + "learning_rate": 7.463333333333334e-05, + "loss": 0.0619, + "step": 2240 + }, + { + "grad_norm": 0.8154667019844055, + "learning_rate": 7.496666666666667e-05, + "loss": 0.0714, + "step": 2250 + }, + { + "grad_norm": 0.9753686785697937, + "learning_rate": 7.53e-05, + "loss": 0.0685, + "step": 2260 + }, + { + "grad_norm": 1.071243405342102, + "learning_rate": 7.563333333333333e-05, + "loss": 0.0604, + "step": 2270 + }, + { + "grad_norm": 1.1003142595291138, + "learning_rate": 7.596666666666668e-05, + "loss": 0.0709, + "step": 2280 + }, + { + "grad_norm": 1.0595791339874268, + "learning_rate": 7.630000000000001e-05, + "loss": 0.0636, + "step": 2290 + }, + { + "grad_norm": 0.9559823870658875, + "learning_rate": 7.663333333333333e-05, + "loss": 0.0612, + "step": 2300 + }, + { + "grad_norm": 1.1679898500442505, + "learning_rate": 7.696666666666668e-05, + "loss": 0.0739, + "step": 2310 + }, + { + "grad_norm": 1.0987716913223267, + "learning_rate": 7.730000000000001e-05, + "loss": 0.0629, + "step": 2320 + }, + { + "grad_norm": 1.1461509466171265, + "learning_rate": 7.763333333333334e-05, + "loss": 0.0713, + "step": 2330 + }, + { + "grad_norm": 1.0480573177337646, + "learning_rate": 7.796666666666666e-05, + "loss": 0.0646, + "step": 2340 + }, + { + "grad_norm": 1.0487347841262817, + "learning_rate": 7.83e-05, + "loss": 0.0708, + "step": 2350 + }, + { + "grad_norm": 0.8615747094154358, + "learning_rate": 7.863333333333334e-05, + "loss": 0.0633, + "step": 2360 + }, + { + "grad_norm": 0.961333692073822, + "learning_rate": 7.896666666666667e-05, + "loss": 0.063, + "step": 2370 + }, + { + "grad_norm": 0.8947559595108032, + "learning_rate": 7.93e-05, + "loss": 0.0596, + "step": 2380 + }, + { + "grad_norm": 1.1710392236709595, + "learning_rate": 7.963333333333334e-05, + "loss": 0.0697, + "step": 2390 + }, + { + "grad_norm": 1.1182241439819336, + "learning_rate": 7.996666666666667e-05, + "loss": 0.0677, + "step": 2400 + }, + { + "grad_norm": 1.1741247177124023, + "learning_rate": 8.030000000000001e-05, + "loss": 0.0614, + "step": 2410 + }, + { + "grad_norm": 1.0437695980072021, + "learning_rate": 8.063333333333333e-05, + "loss": 0.0615, + "step": 2420 + }, + { + "grad_norm": 0.9634862542152405, + "learning_rate": 8.096666666666667e-05, + "loss": 0.0616, + "step": 2430 + }, + { + "grad_norm": 0.8846582174301147, + "learning_rate": 8.13e-05, + "loss": 0.0633, + "step": 2440 + }, + { + "grad_norm": 1.008697509765625, + "learning_rate": 8.163333333333334e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "grad_norm": 0.9665005803108215, + "learning_rate": 8.196666666666668e-05, + "loss": 0.0682, + "step": 2460 + }, + { + "grad_norm": 0.8760867714881897, + "learning_rate": 8.23e-05, + "loss": 0.0598, + "step": 2470 + }, + { + "grad_norm": 0.8840994238853455, + "learning_rate": 8.263333333333334e-05, + "loss": 0.0603, + "step": 2480 + }, + { + "grad_norm": 0.6845605373382568, + "learning_rate": 8.296666666666667e-05, + "loss": 0.0544, + "step": 2490 + }, + { + "grad_norm": 1.1670717000961304, + "learning_rate": 8.33e-05, + "loss": 0.0689, + "step": 2500 + }, + { + "grad_norm": 0.9465165734291077, + "learning_rate": 8.363333333333334e-05, + "loss": 0.068, + "step": 2510 + }, + { + "grad_norm": 0.7773513197898865, + "learning_rate": 8.396666666666667e-05, + "loss": 0.0636, + "step": 2520 + }, + { + "grad_norm": 0.9731705784797668, + "learning_rate": 8.43e-05, + "loss": 0.059, + "step": 2530 + }, + { + "grad_norm": 1.029721736907959, + "learning_rate": 8.463333333333335e-05, + "loss": 0.0691, + "step": 2540 + }, + { + "grad_norm": 0.88335120677948, + "learning_rate": 8.496666666666667e-05, + "loss": 0.0751, + "step": 2550 + }, + { + "grad_norm": 0.790886640548706, + "learning_rate": 8.53e-05, + "loss": 0.0679, + "step": 2560 + }, + { + "grad_norm": 0.7625932693481445, + "learning_rate": 8.563333333333333e-05, + "loss": 0.0597, + "step": 2570 + }, + { + "grad_norm": 0.8958044648170471, + "learning_rate": 8.596666666666668e-05, + "loss": 0.0641, + "step": 2580 + }, + { + "grad_norm": 0.877277672290802, + "learning_rate": 8.63e-05, + "loss": 0.0641, + "step": 2590 + }, + { + "grad_norm": 0.9416974782943726, + "learning_rate": 8.663333333333333e-05, + "loss": 0.0612, + "step": 2600 + }, + { + "grad_norm": 0.8281979560852051, + "learning_rate": 8.696666666666668e-05, + "loss": 0.059, + "step": 2610 + }, + { + "grad_norm": 0.8689745664596558, + "learning_rate": 8.730000000000001e-05, + "loss": 0.0615, + "step": 2620 + }, + { + "grad_norm": 0.8341608643531799, + "learning_rate": 8.763333333333334e-05, + "loss": 0.0561, + "step": 2630 + }, + { + "grad_norm": 0.698942244052887, + "learning_rate": 8.796666666666667e-05, + "loss": 0.0522, + "step": 2640 + }, + { + "grad_norm": 0.8354687094688416, + "learning_rate": 8.83e-05, + "loss": 0.0604, + "step": 2650 + }, + { + "grad_norm": 1.0889543294906616, + "learning_rate": 8.863333333333334e-05, + "loss": 0.0719, + "step": 2660 + }, + { + "grad_norm": 0.9338927268981934, + "learning_rate": 8.896666666666667e-05, + "loss": 0.0679, + "step": 2670 + }, + { + "grad_norm": 0.8350193500518799, + "learning_rate": 8.93e-05, + "loss": 0.0661, + "step": 2680 + }, + { + "grad_norm": 0.7167150378227234, + "learning_rate": 8.963333333333333e-05, + "loss": 0.0568, + "step": 2690 + }, + { + "grad_norm": 0.8887212872505188, + "learning_rate": 8.996666666666667e-05, + "loss": 0.0646, + "step": 2700 + }, + { + "grad_norm": 0.8746552467346191, + "learning_rate": 9.030000000000001e-05, + "loss": 0.0602, + "step": 2710 + }, + { + "grad_norm": 0.8915983438491821, + "learning_rate": 9.063333333333333e-05, + "loss": 0.061, + "step": 2720 + }, + { + "grad_norm": 0.7062487006187439, + "learning_rate": 9.096666666666666e-05, + "loss": 0.0578, + "step": 2730 + }, + { + "grad_norm": 0.9249823093414307, + "learning_rate": 9.130000000000001e-05, + "loss": 0.0597, + "step": 2740 + }, + { + "grad_norm": 0.8496858477592468, + "learning_rate": 9.163333333333334e-05, + "loss": 0.0543, + "step": 2750 + }, + { + "grad_norm": 0.8931301236152649, + "learning_rate": 9.196666666666666e-05, + "loss": 0.0565, + "step": 2760 + }, + { + "grad_norm": 0.8664289712905884, + "learning_rate": 9.230000000000001e-05, + "loss": 0.0569, + "step": 2770 + }, + { + "grad_norm": 0.7348942756652832, + "learning_rate": 9.263333333333334e-05, + "loss": 0.0564, + "step": 2780 + }, + { + "grad_norm": 0.8783203363418579, + "learning_rate": 9.296666666666667e-05, + "loss": 0.0525, + "step": 2790 + }, + { + "grad_norm": 0.8279241323471069, + "learning_rate": 9.33e-05, + "loss": 0.0551, + "step": 2800 + }, + { + "grad_norm": 0.8857786655426025, + "learning_rate": 9.363333333333334e-05, + "loss": 0.0566, + "step": 2810 + }, + { + "grad_norm": 0.8813221454620361, + "learning_rate": 9.396666666666667e-05, + "loss": 0.0562, + "step": 2820 + }, + { + "grad_norm": 0.8645294904708862, + "learning_rate": 9.43e-05, + "loss": 0.0549, + "step": 2830 + }, + { + "grad_norm": 0.8404380083084106, + "learning_rate": 9.463333333333333e-05, + "loss": 0.0546, + "step": 2840 + }, + { + "grad_norm": 0.914763867855072, + "learning_rate": 9.496666666666667e-05, + "loss": 0.0559, + "step": 2850 + }, + { + "grad_norm": 0.6793450117111206, + "learning_rate": 9.53e-05, + "loss": 0.0576, + "step": 2860 + }, + { + "grad_norm": 0.8766696453094482, + "learning_rate": 9.563333333333334e-05, + "loss": 0.0544, + "step": 2870 + }, + { + "grad_norm": 0.9002400040626526, + "learning_rate": 9.596666666666668e-05, + "loss": 0.0521, + "step": 2880 + }, + { + "grad_norm": 0.8387170433998108, + "learning_rate": 9.63e-05, + "loss": 0.0529, + "step": 2890 + }, + { + "grad_norm": 0.8228157162666321, + "learning_rate": 9.663333333333334e-05, + "loss": 0.0547, + "step": 2900 + }, + { + "grad_norm": 0.8644803166389465, + "learning_rate": 9.696666666666667e-05, + "loss": 0.0694, + "step": 2910 + }, + { + "grad_norm": 0.8468896150588989, + "learning_rate": 9.730000000000001e-05, + "loss": 0.0547, + "step": 2920 + }, + { + "grad_norm": 0.6621829271316528, + "learning_rate": 9.763333333333334e-05, + "loss": 0.0548, + "step": 2930 + }, + { + "grad_norm": 0.7150681614875793, + "learning_rate": 9.796666666666667e-05, + "loss": 0.0515, + "step": 2940 + }, + { + "grad_norm": 0.6284530758857727, + "learning_rate": 9.83e-05, + "loss": 0.0521, + "step": 2950 + }, + { + "grad_norm": 0.7855443358421326, + "learning_rate": 9.863333333333334e-05, + "loss": 0.0636, + "step": 2960 + }, + { + "grad_norm": 0.9308194518089294, + "learning_rate": 9.896666666666667e-05, + "loss": 0.0584, + "step": 2970 + }, + { + "grad_norm": 0.8599221110343933, + "learning_rate": 9.93e-05, + "loss": 0.0582, + "step": 2980 + }, + { + "grad_norm": 0.7483208179473877, + "learning_rate": 9.963333333333333e-05, + "loss": 0.0515, + "step": 2990 + }, + { + "grad_norm": 0.8008257746696472, + "learning_rate": 9.996666666666668e-05, + "loss": 0.0606, + "step": 3000 + }, + { + "grad_norm": 0.8120972514152527, + "learning_rate": 9.999999384858465e-05, + "loss": 0.0551, + "step": 3010 + }, + { + "grad_norm": 0.9966566562652588, + "learning_rate": 9.999997258443473e-05, + "loss": 0.0652, + "step": 3020 + }, + { + "grad_norm": 0.7767784595489502, + "learning_rate": 9.999993613161331e-05, + "loss": 0.0526, + "step": 3030 + }, + { + "grad_norm": 0.7611342072486877, + "learning_rate": 9.999988449013146e-05, + "loss": 0.0501, + "step": 3040 + }, + { + "grad_norm": 0.899966835975647, + "learning_rate": 9.99998176600049e-05, + "loss": 0.0582, + "step": 3050 + }, + { + "grad_norm": 0.7014909982681274, + "learning_rate": 9.999973564125389e-05, + "loss": 0.0603, + "step": 3060 + }, + { + "grad_norm": 0.8234820365905762, + "learning_rate": 9.999963843390335e-05, + "loss": 0.0625, + "step": 3070 + }, + { + "grad_norm": 0.8866924047470093, + "learning_rate": 9.999952603798282e-05, + "loss": 0.0541, + "step": 3080 + }, + { + "grad_norm": 0.6899054050445557, + "learning_rate": 9.999939845352646e-05, + "loss": 0.055, + "step": 3090 + }, + { + "grad_norm": 0.7839197516441345, + "learning_rate": 9.999925568057298e-05, + "loss": 0.0547, + "step": 3100 + }, + { + "grad_norm": 0.8694648146629333, + "learning_rate": 9.999909771916578e-05, + "loss": 0.0606, + "step": 3110 + }, + { + "grad_norm": 0.7287220358848572, + "learning_rate": 9.999892456935285e-05, + "loss": 0.052, + "step": 3120 + }, + { + "grad_norm": 0.7638460397720337, + "learning_rate": 9.999873623118679e-05, + "loss": 0.0569, + "step": 3130 + }, + { + "grad_norm": 0.8368794322013855, + "learning_rate": 9.999853270472479e-05, + "loss": 0.0579, + "step": 3140 + }, + { + "grad_norm": 0.8717802166938782, + "learning_rate": 9.999831399002871e-05, + "loss": 0.0553, + "step": 3150 + }, + { + "grad_norm": 0.7637776136398315, + "learning_rate": 9.999808008716494e-05, + "loss": 0.0507, + "step": 3160 + }, + { + "grad_norm": 0.733996570110321, + "learning_rate": 9.999783099620459e-05, + "loss": 0.0551, + "step": 3170 + }, + { + "grad_norm": 0.5987215042114258, + "learning_rate": 9.999756671722328e-05, + "loss": 0.0517, + "step": 3180 + }, + { + "grad_norm": 0.7850061058998108, + "learning_rate": 9.99972872503013e-05, + "loss": 0.0553, + "step": 3190 + }, + { + "grad_norm": 0.7434727549552917, + "learning_rate": 9.999699259552359e-05, + "loss": 0.0593, + "step": 3200 + }, + { + "grad_norm": 0.8338168263435364, + "learning_rate": 9.99966827529796e-05, + "loss": 0.0569, + "step": 3210 + }, + { + "grad_norm": 0.953469455242157, + "learning_rate": 9.999635772276348e-05, + "loss": 0.0502, + "step": 3220 + }, + { + "grad_norm": 0.7861298322677612, + "learning_rate": 9.999601750497396e-05, + "loss": 0.0529, + "step": 3230 + }, + { + "grad_norm": 0.7575981616973877, + "learning_rate": 9.99956620997144e-05, + "loss": 0.0459, + "step": 3240 + }, + { + "grad_norm": 0.7318950891494751, + "learning_rate": 9.999529150709275e-05, + "loss": 0.0484, + "step": 3250 + }, + { + "grad_norm": 0.858807384967804, + "learning_rate": 9.999490572722158e-05, + "loss": 0.0622, + "step": 3260 + }, + { + "grad_norm": 0.7145293354988098, + "learning_rate": 9.99945047602181e-05, + "loss": 0.0544, + "step": 3270 + }, + { + "grad_norm": 0.7078118920326233, + "learning_rate": 9.99940886062041e-05, + "loss": 0.0475, + "step": 3280 + }, + { + "grad_norm": 0.7223478555679321, + "learning_rate": 9.999365726530599e-05, + "loss": 0.0547, + "step": 3290 + }, + { + "grad_norm": 0.7214303612709045, + "learning_rate": 9.999321073765481e-05, + "loss": 0.0518, + "step": 3300 + }, + { + "grad_norm": 0.7803531885147095, + "learning_rate": 9.99927490233862e-05, + "loss": 0.0485, + "step": 3310 + }, + { + "grad_norm": 0.6373307704925537, + "learning_rate": 9.999227212264043e-05, + "loss": 0.0463, + "step": 3320 + }, + { + "grad_norm": 0.7561566829681396, + "learning_rate": 9.999178003556236e-05, + "loss": 0.049, + "step": 3330 + }, + { + "grad_norm": 0.5519773960113525, + "learning_rate": 9.999127276230146e-05, + "loss": 0.0475, + "step": 3340 + }, + { + "grad_norm": 0.6129007935523987, + "learning_rate": 9.999075030301184e-05, + "loss": 0.0539, + "step": 3350 + }, + { + "grad_norm": 0.9630881547927856, + "learning_rate": 9.999021265785221e-05, + "loss": 0.049, + "step": 3360 + }, + { + "grad_norm": 0.654624342918396, + "learning_rate": 9.998965982698589e-05, + "loss": 0.0608, + "step": 3370 + }, + { + "grad_norm": 0.6920844316482544, + "learning_rate": 9.998909181058082e-05, + "loss": 0.0532, + "step": 3380 + }, + { + "grad_norm": 0.7399725317955017, + "learning_rate": 9.998850860880953e-05, + "loss": 0.055, + "step": 3390 + }, + { + "grad_norm": 0.9067150354385376, + "learning_rate": 9.998791022184922e-05, + "loss": 0.0604, + "step": 3400 + }, + { + "grad_norm": 0.7762711048126221, + "learning_rate": 9.99872966498816e-05, + "loss": 0.0516, + "step": 3410 + }, + { + "grad_norm": 0.7466596961021423, + "learning_rate": 9.998666789309313e-05, + "loss": 0.053, + "step": 3420 + }, + { + "grad_norm": 0.6821275949478149, + "learning_rate": 9.998602395167475e-05, + "loss": 0.0472, + "step": 3430 + }, + { + "grad_norm": 0.7221843600273132, + "learning_rate": 9.998536482582213e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "grad_norm": 0.7387469410896301, + "learning_rate": 9.998469051573544e-05, + "loss": 0.0452, + "step": 3450 + }, + { + "grad_norm": 0.6583935022354126, + "learning_rate": 9.998400102161954e-05, + "loss": 0.0491, + "step": 3460 + }, + { + "grad_norm": 0.7039799690246582, + "learning_rate": 9.998329634368388e-05, + "loss": 0.0441, + "step": 3470 + }, + { + "grad_norm": 0.677544891834259, + "learning_rate": 9.998257648214253e-05, + "loss": 0.0504, + "step": 3480 + }, + { + "grad_norm": 0.6488243937492371, + "learning_rate": 9.998184143721417e-05, + "loss": 0.0462, + "step": 3490 + }, + { + "grad_norm": 0.7597591876983643, + "learning_rate": 9.998109120912206e-05, + "loss": 0.0563, + "step": 3500 + }, + { + "grad_norm": 0.7398865818977356, + "learning_rate": 9.998032579809411e-05, + "loss": 0.0491, + "step": 3510 + }, + { + "grad_norm": 0.68448406457901, + "learning_rate": 9.997954520436286e-05, + "loss": 0.0471, + "step": 3520 + }, + { + "grad_norm": 0.6528464555740356, + "learning_rate": 9.997874942816538e-05, + "loss": 0.0508, + "step": 3530 + }, + { + "grad_norm": 0.7047237157821655, + "learning_rate": 9.997793846974345e-05, + "loss": 0.057, + "step": 3540 + }, + { + "grad_norm": 0.7688554525375366, + "learning_rate": 9.997711232934341e-05, + "loss": 0.0502, + "step": 3550 + }, + { + "grad_norm": 0.6895895004272461, + "learning_rate": 9.99762710072162e-05, + "loss": 0.0501, + "step": 3560 + }, + { + "grad_norm": 0.7453030943870544, + "learning_rate": 9.997541450361743e-05, + "loss": 0.0543, + "step": 3570 + }, + { + "grad_norm": 0.5960200428962708, + "learning_rate": 9.997454281880723e-05, + "loss": 0.0538, + "step": 3580 + }, + { + "grad_norm": 0.707518994808197, + "learning_rate": 9.997365595305044e-05, + "loss": 0.0483, + "step": 3590 + }, + { + "grad_norm": 0.5813127756118774, + "learning_rate": 9.997275390661644e-05, + "loss": 0.0527, + "step": 3600 + }, + { + "grad_norm": 0.5622773766517639, + "learning_rate": 9.997183667977926e-05, + "loss": 0.0472, + "step": 3610 + }, + { + "grad_norm": 0.758303701877594, + "learning_rate": 9.997090427281752e-05, + "loss": 0.0485, + "step": 3620 + }, + { + "grad_norm": 0.6680123209953308, + "learning_rate": 9.996995668601448e-05, + "loss": 0.0497, + "step": 3630 + }, + { + "grad_norm": 0.6016977429389954, + "learning_rate": 9.996899391965798e-05, + "loss": 0.0493, + "step": 3640 + }, + { + "grad_norm": 0.5767600536346436, + "learning_rate": 9.996801597404048e-05, + "loss": 0.0486, + "step": 3650 + }, + { + "grad_norm": 0.7810037732124329, + "learning_rate": 9.996702284945905e-05, + "loss": 0.0417, + "step": 3660 + }, + { + "grad_norm": 0.6699069738388062, + "learning_rate": 9.996601454621539e-05, + "loss": 0.0454, + "step": 3670 + }, + { + "grad_norm": 0.6514731049537659, + "learning_rate": 9.996499106461577e-05, + "loss": 0.0518, + "step": 3680 + }, + { + "grad_norm": 0.6613548994064331, + "learning_rate": 9.996395240497112e-05, + "loss": 0.0504, + "step": 3690 + }, + { + "grad_norm": 0.6204959750175476, + "learning_rate": 9.996289856759696e-05, + "loss": 0.0459, + "step": 3700 + }, + { + "grad_norm": 0.7176038026809692, + "learning_rate": 9.996182955281342e-05, + "loss": 0.0506, + "step": 3710 + }, + { + "grad_norm": 0.5343673825263977, + "learning_rate": 9.996074536094519e-05, + "loss": 0.049, + "step": 3720 + }, + { + "grad_norm": 0.5605956315994263, + "learning_rate": 9.995964599232168e-05, + "loss": 0.0454, + "step": 3730 + }, + { + "grad_norm": 0.6410989761352539, + "learning_rate": 9.995853144727683e-05, + "loss": 0.0503, + "step": 3740 + }, + { + "grad_norm": 0.6830344200134277, + "learning_rate": 9.99574017261492e-05, + "loss": 0.0472, + "step": 3750 + }, + { + "grad_norm": 0.8887002468109131, + "learning_rate": 9.995625682928198e-05, + "loss": 0.0451, + "step": 3760 + }, + { + "grad_norm": 0.7443501949310303, + "learning_rate": 9.995509675702295e-05, + "loss": 0.0534, + "step": 3770 + }, + { + "grad_norm": 0.6347053050994873, + "learning_rate": 9.995392150972451e-05, + "loss": 0.0471, + "step": 3780 + }, + { + "grad_norm": 0.8508003354072571, + "learning_rate": 9.995273108774366e-05, + "loss": 0.0458, + "step": 3790 + }, + { + "grad_norm": 0.5818729996681213, + "learning_rate": 9.995152549144205e-05, + "loss": 0.0478, + "step": 3800 + }, + { + "grad_norm": 0.6865646243095398, + "learning_rate": 9.995030472118587e-05, + "loss": 0.0393, + "step": 3810 + }, + { + "grad_norm": 0.5445404052734375, + "learning_rate": 9.9949068777346e-05, + "loss": 0.0478, + "step": 3820 + }, + { + "grad_norm": 0.7957289218902588, + "learning_rate": 9.994781766029786e-05, + "loss": 0.0496, + "step": 3830 + }, + { + "grad_norm": 0.6646504402160645, + "learning_rate": 9.994655137042151e-05, + "loss": 0.0451, + "step": 3840 + }, + { + "grad_norm": 0.6870298981666565, + "learning_rate": 9.99452699081016e-05, + "loss": 0.0425, + "step": 3850 + }, + { + "grad_norm": 0.6771845817565918, + "learning_rate": 9.994397327372743e-05, + "loss": 0.0477, + "step": 3860 + }, + { + "grad_norm": 0.6277785301208496, + "learning_rate": 9.994266146769286e-05, + "loss": 0.0446, + "step": 3870 + }, + { + "grad_norm": 0.776437520980835, + "learning_rate": 9.994133449039642e-05, + "loss": 0.0442, + "step": 3880 + }, + { + "grad_norm": 0.6355234384536743, + "learning_rate": 9.993999234224118e-05, + "loss": 0.0446, + "step": 3890 + }, + { + "grad_norm": 0.6138414144515991, + "learning_rate": 9.993863502363485e-05, + "loss": 0.0459, + "step": 3900 + }, + { + "grad_norm": 0.5557674169540405, + "learning_rate": 9.993726253498976e-05, + "loss": 0.0454, + "step": 3910 + }, + { + "grad_norm": 0.7285213470458984, + "learning_rate": 9.993587487672282e-05, + "loss": 0.0426, + "step": 3920 + }, + { + "grad_norm": 0.7623494863510132, + "learning_rate": 9.993447204925558e-05, + "loss": 0.0439, + "step": 3930 + }, + { + "grad_norm": 0.6527449488639832, + "learning_rate": 9.993305405301416e-05, + "loss": 0.0452, + "step": 3940 + }, + { + "grad_norm": 0.8385573029518127, + "learning_rate": 9.993162088842935e-05, + "loss": 0.0468, + "step": 3950 + }, + { + "grad_norm": 0.6902685165405273, + "learning_rate": 9.993017255593646e-05, + "loss": 0.04, + "step": 3960 + }, + { + "grad_norm": 0.580554187297821, + "learning_rate": 9.992870905597548e-05, + "loss": 0.0456, + "step": 3970 + }, + { + "grad_norm": 0.6301660537719727, + "learning_rate": 9.9927230388991e-05, + "loss": 0.0398, + "step": 3980 + }, + { + "grad_norm": 0.7712828516960144, + "learning_rate": 9.992573655543215e-05, + "loss": 0.042, + "step": 3990 + }, + { + "grad_norm": 0.6661479473114014, + "learning_rate": 9.992422755575277e-05, + "loss": 0.0493, + "step": 4000 + }, + { + "grad_norm": 0.6758480668067932, + "learning_rate": 9.992270339041123e-05, + "loss": 0.0446, + "step": 4010 + }, + { + "grad_norm": 0.6842628717422485, + "learning_rate": 9.992116405987053e-05, + "loss": 0.0399, + "step": 4020 + }, + { + "grad_norm": 0.6388065814971924, + "learning_rate": 9.991960956459828e-05, + "loss": 0.047, + "step": 4030 + }, + { + "grad_norm": 0.8170465230941772, + "learning_rate": 9.991803990506669e-05, + "loss": 0.0456, + "step": 4040 + }, + { + "grad_norm": 0.7072161436080933, + "learning_rate": 9.991645508175258e-05, + "loss": 0.0423, + "step": 4050 + }, + { + "grad_norm": 0.5126902461051941, + "learning_rate": 9.99148550951374e-05, + "loss": 0.0497, + "step": 4060 + }, + { + "grad_norm": 0.5235459208488464, + "learning_rate": 9.991323994570716e-05, + "loss": 0.042, + "step": 4070 + }, + { + "grad_norm": 0.54132080078125, + "learning_rate": 9.99116096339525e-05, + "loss": 0.04, + "step": 4080 + }, + { + "grad_norm": 0.7617301344871521, + "learning_rate": 9.990996416036869e-05, + "loss": 0.0405, + "step": 4090 + }, + { + "grad_norm": 0.6913269758224487, + "learning_rate": 9.990830352545555e-05, + "loss": 0.0437, + "step": 4100 + }, + { + "grad_norm": 0.6376801133155823, + "learning_rate": 9.990662772971756e-05, + "loss": 0.043, + "step": 4110 + }, + { + "grad_norm": 0.6430414319038391, + "learning_rate": 9.990493677366376e-05, + "loss": 0.0422, + "step": 4120 + }, + { + "grad_norm": 0.497507780790329, + "learning_rate": 9.990323065780786e-05, + "loss": 0.041, + "step": 4130 + }, + { + "grad_norm": 0.6109529733657837, + "learning_rate": 9.990150938266808e-05, + "loss": 0.0455, + "step": 4140 + }, + { + "grad_norm": 0.6755995154380798, + "learning_rate": 9.989977294876733e-05, + "loss": 0.0416, + "step": 4150 + }, + { + "grad_norm": 0.6640646457672119, + "learning_rate": 9.989802135663308e-05, + "loss": 0.0402, + "step": 4160 + }, + { + "grad_norm": 0.6534554958343506, + "learning_rate": 9.989625460679743e-05, + "loss": 0.0503, + "step": 4170 + }, + { + "grad_norm": 0.769862711429596, + "learning_rate": 9.989447269979706e-05, + "loss": 0.0467, + "step": 4180 + }, + { + "grad_norm": 0.7438839673995972, + "learning_rate": 9.989267563617328e-05, + "loss": 0.0445, + "step": 4190 + }, + { + "grad_norm": 0.6223868727684021, + "learning_rate": 9.989086341647198e-05, + "loss": 0.0409, + "step": 4200 + }, + { + "grad_norm": 0.759129524230957, + "learning_rate": 9.988903604124366e-05, + "loss": 0.0391, + "step": 4210 + }, + { + "grad_norm": 0.7649771571159363, + "learning_rate": 9.988719351104343e-05, + "loss": 0.0426, + "step": 4220 + }, + { + "grad_norm": 0.5601566433906555, + "learning_rate": 9.9885335826431e-05, + "loss": 0.0389, + "step": 4230 + }, + { + "grad_norm": 0.657940149307251, + "learning_rate": 9.988346298797071e-05, + "loss": 0.0441, + "step": 4240 + }, + { + "grad_norm": 0.6650236248970032, + "learning_rate": 9.988157499623146e-05, + "loss": 0.045, + "step": 4250 + }, + { + "grad_norm": 0.5703973770141602, + "learning_rate": 9.987967185178677e-05, + "loss": 0.043, + "step": 4260 + }, + { + "grad_norm": 0.5973702073097229, + "learning_rate": 9.987775355521476e-05, + "loss": 0.0404, + "step": 4270 + }, + { + "grad_norm": 0.6352563500404358, + "learning_rate": 9.987582010709817e-05, + "loss": 0.0379, + "step": 4280 + }, + { + "grad_norm": 0.667305052280426, + "learning_rate": 9.987387150802431e-05, + "loss": 0.0416, + "step": 4290 + }, + { + "grad_norm": 0.6370511054992676, + "learning_rate": 9.987190775858517e-05, + "loss": 0.0424, + "step": 4300 + }, + { + "grad_norm": 0.7280203104019165, + "learning_rate": 9.98699288593772e-05, + "loss": 0.0376, + "step": 4310 + }, + { + "grad_norm": 0.6414651870727539, + "learning_rate": 9.986793481100161e-05, + "loss": 0.0405, + "step": 4320 + }, + { + "grad_norm": 0.5290331244468689, + "learning_rate": 9.986592561406412e-05, + "loss": 0.0371, + "step": 4330 + }, + { + "grad_norm": 0.5593175888061523, + "learning_rate": 9.986390126917503e-05, + "loss": 0.0397, + "step": 4340 + }, + { + "grad_norm": 0.6993313431739807, + "learning_rate": 9.986186177694933e-05, + "loss": 0.0411, + "step": 4350 + }, + { + "grad_norm": 0.7027745246887207, + "learning_rate": 9.985980713800656e-05, + "loss": 0.0395, + "step": 4360 + }, + { + "grad_norm": 0.5979444980621338, + "learning_rate": 9.985773735297084e-05, + "loss": 0.0385, + "step": 4370 + }, + { + "grad_norm": 0.5139358639717102, + "learning_rate": 9.985565242247092e-05, + "loss": 0.0369, + "step": 4380 + }, + { + "grad_norm": 0.7809072732925415, + "learning_rate": 9.985355234714016e-05, + "loss": 0.0413, + "step": 4390 + }, + { + "grad_norm": 0.696377158164978, + "learning_rate": 9.985143712761652e-05, + "loss": 0.0468, + "step": 4400 + }, + { + "grad_norm": 0.455167293548584, + "learning_rate": 9.984930676454252e-05, + "loss": 0.0397, + "step": 4410 + }, + { + "grad_norm": 0.7265823483467102, + "learning_rate": 9.984716125856532e-05, + "loss": 0.0439, + "step": 4420 + }, + { + "grad_norm": 0.584205687046051, + "learning_rate": 9.984500061033667e-05, + "loss": 0.0409, + "step": 4430 + }, + { + "grad_norm": 0.6065359711647034, + "learning_rate": 9.984282482051293e-05, + "loss": 0.0458, + "step": 4440 + }, + { + "grad_norm": 0.6577481627464294, + "learning_rate": 9.9840633889755e-05, + "loss": 0.0383, + "step": 4450 + }, + { + "grad_norm": 0.6028100848197937, + "learning_rate": 9.983842781872848e-05, + "loss": 0.0388, + "step": 4460 + }, + { + "grad_norm": 0.6861836314201355, + "learning_rate": 9.98362066081035e-05, + "loss": 0.0385, + "step": 4470 + }, + { + "grad_norm": 0.5267810821533203, + "learning_rate": 9.983397025855479e-05, + "loss": 0.0384, + "step": 4480 + }, + { + "grad_norm": 0.5755560994148254, + "learning_rate": 9.983171877076171e-05, + "loss": 0.0393, + "step": 4490 + }, + { + "grad_norm": 0.6919408440589905, + "learning_rate": 9.98294521454082e-05, + "loss": 0.0402, + "step": 4500 + }, + { + "grad_norm": 0.5324994325637817, + "learning_rate": 9.98271703831828e-05, + "loss": 0.0446, + "step": 4510 + }, + { + "grad_norm": 0.7347158789634705, + "learning_rate": 9.982487348477865e-05, + "loss": 0.0393, + "step": 4520 + }, + { + "grad_norm": 0.5538548231124878, + "learning_rate": 9.982256145089347e-05, + "loss": 0.0438, + "step": 4530 + }, + { + "grad_norm": 0.774620532989502, + "learning_rate": 9.982023428222962e-05, + "loss": 0.0384, + "step": 4540 + }, + { + "grad_norm": 0.6034471392631531, + "learning_rate": 9.981789197949403e-05, + "loss": 0.0385, + "step": 4550 + }, + { + "grad_norm": 0.6088311076164246, + "learning_rate": 9.98155345433982e-05, + "loss": 0.036, + "step": 4560 + }, + { + "grad_norm": 0.5806707739830017, + "learning_rate": 9.981316197465831e-05, + "loss": 0.0373, + "step": 4570 + }, + { + "grad_norm": 0.7010080814361572, + "learning_rate": 9.981077427399504e-05, + "loss": 0.0377, + "step": 4580 + }, + { + "grad_norm": 0.5499981641769409, + "learning_rate": 9.980837144213371e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "grad_norm": 0.5494488477706909, + "learning_rate": 9.980595347980426e-05, + "loss": 0.037, + "step": 4600 + }, + { + "grad_norm": 0.671785831451416, + "learning_rate": 9.980352038774119e-05, + "loss": 0.0362, + "step": 4610 + }, + { + "grad_norm": 0.6250209808349609, + "learning_rate": 9.98010721666836e-05, + "loss": 0.0407, + "step": 4620 + }, + { + "grad_norm": 0.6483927965164185, + "learning_rate": 9.979860881737523e-05, + "loss": 0.0387, + "step": 4630 + }, + { + "grad_norm": 0.7463237643241882, + "learning_rate": 9.979613034056434e-05, + "loss": 0.0413, + "step": 4640 + }, + { + "grad_norm": 0.7241187691688538, + "learning_rate": 9.979363673700386e-05, + "loss": 0.0391, + "step": 4650 + }, + { + "grad_norm": 0.6088847517967224, + "learning_rate": 9.979112800745124e-05, + "loss": 0.045, + "step": 4660 + }, + { + "grad_norm": 0.5330309271812439, + "learning_rate": 9.978860415266861e-05, + "loss": 0.0386, + "step": 4670 + }, + { + "grad_norm": 0.45378994941711426, + "learning_rate": 9.978606517342262e-05, + "loss": 0.0319, + "step": 4680 + }, + { + "grad_norm": 0.6310836672782898, + "learning_rate": 9.978351107048456e-05, + "loss": 0.0405, + "step": 4690 + }, + { + "grad_norm": 0.7219730615615845, + "learning_rate": 9.978094184463029e-05, + "loss": 0.0425, + "step": 4700 + }, + { + "grad_norm": 0.6553520560264587, + "learning_rate": 9.977835749664029e-05, + "loss": 0.0366, + "step": 4710 + }, + { + "grad_norm": 0.6998794078826904, + "learning_rate": 9.97757580272996e-05, + "loss": 0.0369, + "step": 4720 + }, + { + "grad_norm": 0.5620837211608887, + "learning_rate": 9.977314343739786e-05, + "loss": 0.0372, + "step": 4730 + }, + { + "grad_norm": 0.4794282615184784, + "learning_rate": 9.977051372772934e-05, + "loss": 0.0332, + "step": 4740 + }, + { + "grad_norm": 0.6408950686454773, + "learning_rate": 9.976786889909286e-05, + "loss": 0.0348, + "step": 4750 + }, + { + "grad_norm": 0.6270435452461243, + "learning_rate": 9.976520895229185e-05, + "loss": 0.0347, + "step": 4760 + }, + { + "grad_norm": 0.6506294012069702, + "learning_rate": 9.976253388813433e-05, + "loss": 0.0375, + "step": 4770 + }, + { + "grad_norm": 0.5918760299682617, + "learning_rate": 9.975984370743293e-05, + "loss": 0.04, + "step": 4780 + }, + { + "grad_norm": 0.742189347743988, + "learning_rate": 9.975713841100485e-05, + "loss": 0.0346, + "step": 4790 + }, + { + "grad_norm": 0.648874044418335, + "learning_rate": 9.975441799967187e-05, + "loss": 0.0346, + "step": 4800 + }, + { + "grad_norm": 0.5405378937721252, + "learning_rate": 9.975168247426039e-05, + "loss": 0.0334, + "step": 4810 + }, + { + "grad_norm": 0.7033109664916992, + "learning_rate": 9.974893183560139e-05, + "loss": 0.0308, + "step": 4820 + }, + { + "grad_norm": 0.5814917087554932, + "learning_rate": 9.974616608453045e-05, + "loss": 0.0388, + "step": 4830 + }, + { + "grad_norm": 0.5261088013648987, + "learning_rate": 9.974338522188772e-05, + "loss": 0.0424, + "step": 4840 + }, + { + "grad_norm": 0.5280689001083374, + "learning_rate": 9.974058924851797e-05, + "loss": 0.0367, + "step": 4850 + }, + { + "grad_norm": 0.6398464441299438, + "learning_rate": 9.973777816527051e-05, + "loss": 0.0346, + "step": 4860 + }, + { + "grad_norm": 0.5820260643959045, + "learning_rate": 9.973495197299931e-05, + "loss": 0.0322, + "step": 4870 + }, + { + "grad_norm": 0.5457234978675842, + "learning_rate": 9.973211067256287e-05, + "loss": 0.0388, + "step": 4880 + }, + { + "grad_norm": 0.646477997303009, + "learning_rate": 9.97292542648243e-05, + "loss": 0.035, + "step": 4890 + }, + { + "grad_norm": 0.6922654509544373, + "learning_rate": 9.972638275065131e-05, + "loss": 0.0351, + "step": 4900 + }, + { + "grad_norm": 0.6583635807037354, + "learning_rate": 9.972349613091621e-05, + "loss": 0.032, + "step": 4910 + }, + { + "grad_norm": 0.69245445728302, + "learning_rate": 9.972059440649584e-05, + "loss": 0.0309, + "step": 4920 + }, + { + "grad_norm": 0.6526948809623718, + "learning_rate": 9.971767757827168e-05, + "loss": 0.0357, + "step": 4930 + }, + { + "grad_norm": 0.5548874139785767, + "learning_rate": 9.971474564712982e-05, + "loss": 0.031, + "step": 4940 + }, + { + "grad_norm": 0.7070045471191406, + "learning_rate": 9.971179861396084e-05, + "loss": 0.0348, + "step": 4950 + }, + { + "grad_norm": 0.6669615507125854, + "learning_rate": 9.970883647966003e-05, + "loss": 0.0316, + "step": 4960 + }, + { + "grad_norm": 0.5660342574119568, + "learning_rate": 9.970585924512717e-05, + "loss": 0.0392, + "step": 4970 + }, + { + "grad_norm": 0.5953735709190369, + "learning_rate": 9.970286691126669e-05, + "loss": 0.0398, + "step": 4980 + }, + { + "grad_norm": 0.45345062017440796, + "learning_rate": 9.969985947898756e-05, + "loss": 0.0326, + "step": 4990 + }, + { + "grad_norm": 0.5851004123687744, + "learning_rate": 9.969683694920337e-05, + "loss": 0.0353, + "step": 5000 + }, + { + "grad_norm": 0.6838672757148743, + "learning_rate": 9.969379932283228e-05, + "loss": 0.0334, + "step": 5010 + }, + { + "grad_norm": 0.4194786250591278, + "learning_rate": 9.969074660079704e-05, + "loss": 0.0321, + "step": 5020 + }, + { + "grad_norm": 0.5828918814659119, + "learning_rate": 9.968767878402501e-05, + "loss": 0.0351, + "step": 5030 + }, + { + "grad_norm": 0.562113344669342, + "learning_rate": 9.968459587344808e-05, + "loss": 0.0333, + "step": 5040 + }, + { + "grad_norm": 0.629982590675354, + "learning_rate": 9.968149787000278e-05, + "loss": 0.0366, + "step": 5050 + }, + { + "grad_norm": 0.48110151290893555, + "learning_rate": 9.967838477463018e-05, + "loss": 0.0329, + "step": 5060 + }, + { + "grad_norm": 0.6148000359535217, + "learning_rate": 9.967525658827597e-05, + "loss": 0.029, + "step": 5070 + }, + { + "grad_norm": 0.5258086919784546, + "learning_rate": 9.967211331189042e-05, + "loss": 0.0331, + "step": 5080 + }, + { + "grad_norm": 0.46001553535461426, + "learning_rate": 9.966895494642834e-05, + "loss": 0.0325, + "step": 5090 + }, + { + "grad_norm": 0.6187411546707153, + "learning_rate": 9.96657814928492e-05, + "loss": 0.0346, + "step": 5100 + }, + { + "grad_norm": 0.5961857438087463, + "learning_rate": 9.966259295211697e-05, + "loss": 0.0307, + "step": 5110 + }, + { + "grad_norm": 0.5988697409629822, + "learning_rate": 9.965938932520028e-05, + "loss": 0.041, + "step": 5120 + }, + { + "grad_norm": 0.5525689125061035, + "learning_rate": 9.965617061307229e-05, + "loss": 0.0299, + "step": 5130 + }, + { + "grad_norm": 0.5115570425987244, + "learning_rate": 9.965293681671077e-05, + "loss": 0.0313, + "step": 5140 + }, + { + "grad_norm": 0.5240322947502136, + "learning_rate": 9.964968793709804e-05, + "loss": 0.0285, + "step": 5150 + }, + { + "grad_norm": 0.5910024046897888, + "learning_rate": 9.964642397522106e-05, + "loss": 0.0313, + "step": 5160 + }, + { + "grad_norm": 0.7692542672157288, + "learning_rate": 9.96431449320713e-05, + "loss": 0.025, + "step": 5170 + }, + { + "grad_norm": 0.6028837561607361, + "learning_rate": 9.963985080864486e-05, + "loss": 0.033, + "step": 5180 + }, + { + "grad_norm": 0.5340349674224854, + "learning_rate": 9.96365416059424e-05, + "loss": 0.0339, + "step": 5190 + }, + { + "grad_norm": 0.5638813376426697, + "learning_rate": 9.963321732496919e-05, + "loss": 0.0389, + "step": 5200 + }, + { + "grad_norm": 0.6781941652297974, + "learning_rate": 9.962987796673506e-05, + "loss": 0.0361, + "step": 5210 + }, + { + "grad_norm": 0.47857561707496643, + "learning_rate": 9.962652353225438e-05, + "loss": 0.0344, + "step": 5220 + }, + { + "grad_norm": 0.587444543838501, + "learning_rate": 9.962315402254619e-05, + "loss": 0.0334, + "step": 5230 + }, + { + "grad_norm": 0.5667542815208435, + "learning_rate": 9.9619769438634e-05, + "loss": 0.0287, + "step": 5240 + }, + { + "grad_norm": 0.6488659977912903, + "learning_rate": 9.9616369781546e-05, + "loss": 0.0276, + "step": 5250 + }, + { + "grad_norm": 0.5722289681434631, + "learning_rate": 9.961295505231491e-05, + "loss": 0.0348, + "step": 5260 + }, + { + "grad_norm": 0.501595675945282, + "learning_rate": 9.960952525197804e-05, + "loss": 0.0326, + "step": 5270 + }, + { + "grad_norm": 0.626385509967804, + "learning_rate": 9.960608038157724e-05, + "loss": 0.0325, + "step": 5280 + }, + { + "grad_norm": 0.581871509552002, + "learning_rate": 9.960262044215901e-05, + "loss": 0.0339, + "step": 5290 + }, + { + "grad_norm": 0.6409176588058472, + "learning_rate": 9.959914543477435e-05, + "loss": 0.0315, + "step": 5300 + }, + { + "grad_norm": 0.5938498377799988, + "learning_rate": 9.959565536047892e-05, + "loss": 0.0301, + "step": 5310 + }, + { + "grad_norm": 0.5349408984184265, + "learning_rate": 9.959215022033288e-05, + "loss": 0.0329, + "step": 5320 + }, + { + "grad_norm": 0.47248077392578125, + "learning_rate": 9.9588630015401e-05, + "loss": 0.0294, + "step": 5330 + }, + { + "grad_norm": 0.4678635001182556, + "learning_rate": 9.958509474675264e-05, + "loss": 0.0325, + "step": 5340 + }, + { + "grad_norm": 0.5386501550674438, + "learning_rate": 9.958154441546171e-05, + "loss": 0.0277, + "step": 5350 + }, + { + "grad_norm": 0.5271458029747009, + "learning_rate": 9.957797902260673e-05, + "loss": 0.0331, + "step": 5360 + }, + { + "grad_norm": 0.6122616529464722, + "learning_rate": 9.957439856927073e-05, + "loss": 0.0348, + "step": 5370 + }, + { + "grad_norm": 0.6210896968841553, + "learning_rate": 9.957080305654139e-05, + "loss": 0.0313, + "step": 5380 + }, + { + "grad_norm": 0.5200783610343933, + "learning_rate": 9.956719248551092e-05, + "loss": 0.0286, + "step": 5390 + }, + { + "grad_norm": 0.4495662748813629, + "learning_rate": 9.956356685727612e-05, + "loss": 0.0301, + "step": 5400 + }, + { + "grad_norm": 0.6084323525428772, + "learning_rate": 9.955992617293836e-05, + "loss": 0.0312, + "step": 5410 + }, + { + "grad_norm": 0.47591859102249146, + "learning_rate": 9.955627043360358e-05, + "loss": 0.0277, + "step": 5420 + }, + { + "grad_norm": 0.5773627758026123, + "learning_rate": 9.955259964038231e-05, + "loss": 0.0306, + "step": 5430 + }, + { + "grad_norm": 0.4637615382671356, + "learning_rate": 9.954891379438962e-05, + "loss": 0.0309, + "step": 5440 + }, + { + "grad_norm": 0.6408292055130005, + "learning_rate": 9.954521289674519e-05, + "loss": 0.0356, + "step": 5450 + }, + { + "grad_norm": 0.6400755643844604, + "learning_rate": 9.954149694857325e-05, + "loss": 0.0301, + "step": 5460 + }, + { + "grad_norm": 0.5934323072433472, + "learning_rate": 9.953776595100258e-05, + "loss": 0.032, + "step": 5470 + }, + { + "grad_norm": 0.5204159617424011, + "learning_rate": 9.95340199051666e-05, + "loss": 0.03, + "step": 5480 + }, + { + "grad_norm": 0.7988465428352356, + "learning_rate": 9.953025881220325e-05, + "loss": 0.0298, + "step": 5490 + }, + { + "grad_norm": 0.9485117197036743, + "learning_rate": 9.952648267325504e-05, + "loss": 0.0305, + "step": 5500 + }, + { + "grad_norm": 0.46983402967453003, + "learning_rate": 9.952269148946905e-05, + "loss": 0.0368, + "step": 5510 + }, + { + "grad_norm": 0.44692879915237427, + "learning_rate": 9.951888526199697e-05, + "loss": 0.0306, + "step": 5520 + }, + { + "grad_norm": 0.4629494547843933, + "learning_rate": 9.951506399199501e-05, + "loss": 0.0389, + "step": 5530 + }, + { + "grad_norm": 0.5767475962638855, + "learning_rate": 9.951122768062399e-05, + "loss": 0.0296, + "step": 5540 + }, + { + "grad_norm": 0.5135264992713928, + "learning_rate": 9.950737632904927e-05, + "loss": 0.0266, + "step": 5550 + }, + { + "grad_norm": 0.5246177911758423, + "learning_rate": 9.950350993844077e-05, + "loss": 0.0305, + "step": 5560 + }, + { + "grad_norm": 0.4465120732784271, + "learning_rate": 9.949962850997303e-05, + "loss": 0.029, + "step": 5570 + }, + { + "grad_norm": 0.5132999420166016, + "learning_rate": 9.949573204482512e-05, + "loss": 0.0306, + "step": 5580 + }, + { + "grad_norm": 0.514188289642334, + "learning_rate": 9.949182054418064e-05, + "loss": 0.0302, + "step": 5590 + }, + { + "grad_norm": 0.431026816368103, + "learning_rate": 9.948789400922787e-05, + "loss": 0.0343, + "step": 5600 + }, + { + "grad_norm": 0.6673880219459534, + "learning_rate": 9.948395244115953e-05, + "loss": 0.034, + "step": 5610 + }, + { + "grad_norm": 0.6397469639778137, + "learning_rate": 9.9479995841173e-05, + "loss": 0.0339, + "step": 5620 + }, + { + "grad_norm": 0.5425045490264893, + "learning_rate": 9.947602421047017e-05, + "loss": 0.0288, + "step": 5630 + }, + { + "grad_norm": 0.5681215524673462, + "learning_rate": 9.947203755025753e-05, + "loss": 0.0359, + "step": 5640 + }, + { + "grad_norm": 0.6781080961227417, + "learning_rate": 9.946803586174611e-05, + "loss": 0.0258, + "step": 5650 + }, + { + "grad_norm": 0.5687102675437927, + "learning_rate": 9.946401914615151e-05, + "loss": 0.0319, + "step": 5660 + }, + { + "grad_norm": 0.5427479147911072, + "learning_rate": 9.945998740469394e-05, + "loss": 0.029, + "step": 5670 + }, + { + "grad_norm": 0.521224856376648, + "learning_rate": 9.945594063859809e-05, + "loss": 0.0284, + "step": 5680 + }, + { + "grad_norm": 0.628860592842102, + "learning_rate": 9.94518788490933e-05, + "loss": 0.0343, + "step": 5690 + }, + { + "grad_norm": 0.5305071473121643, + "learning_rate": 9.944780203741341e-05, + "loss": 0.0263, + "step": 5700 + }, + { + "grad_norm": 0.656499981880188, + "learning_rate": 9.944371020479686e-05, + "loss": 0.0344, + "step": 5710 + }, + { + "grad_norm": 0.46921977400779724, + "learning_rate": 9.943960335248662e-05, + "loss": 0.0334, + "step": 5720 + }, + { + "grad_norm": 0.5762396454811096, + "learning_rate": 9.943548148173027e-05, + "loss": 0.0332, + "step": 5730 + }, + { + "grad_norm": 0.4832884669303894, + "learning_rate": 9.943134459377992e-05, + "loss": 0.0268, + "step": 5740 + }, + { + "grad_norm": 0.48663529753685, + "learning_rate": 9.942719268989222e-05, + "loss": 0.0268, + "step": 5750 + }, + { + "grad_norm": 0.4288635551929474, + "learning_rate": 9.942302577132844e-05, + "loss": 0.0278, + "step": 5760 + }, + { + "grad_norm": 0.5483534336090088, + "learning_rate": 9.941884383935438e-05, + "loss": 0.0303, + "step": 5770 + }, + { + "grad_norm": 0.590912401676178, + "learning_rate": 9.941464689524039e-05, + "loss": 0.0332, + "step": 5780 + }, + { + "grad_norm": 0.5242199301719666, + "learning_rate": 9.941043494026139e-05, + "loss": 0.0317, + "step": 5790 + }, + { + "grad_norm": 0.5846026539802551, + "learning_rate": 9.940620797569685e-05, + "loss": 0.0268, + "step": 5800 + }, + { + "grad_norm": 0.5359113812446594, + "learning_rate": 9.940196600283082e-05, + "loss": 0.026, + "step": 5810 + }, + { + "grad_norm": 0.5197345614433289, + "learning_rate": 9.939770902295192e-05, + "loss": 0.0265, + "step": 5820 + }, + { + "grad_norm": 0.4654234051704407, + "learning_rate": 9.939343703735329e-05, + "loss": 0.0322, + "step": 5830 + }, + { + "grad_norm": 0.5041200518608093, + "learning_rate": 9.938915004733264e-05, + "loss": 0.0278, + "step": 5840 + }, + { + "grad_norm": 0.5169036388397217, + "learning_rate": 9.938484805419224e-05, + "loss": 0.0354, + "step": 5850 + }, + { + "grad_norm": 0.35643133521080017, + "learning_rate": 9.938053105923894e-05, + "loss": 0.0268, + "step": 5860 + }, + { + "grad_norm": 0.508072555065155, + "learning_rate": 9.937619906378413e-05, + "loss": 0.0258, + "step": 5870 + }, + { + "grad_norm": 0.44759124517440796, + "learning_rate": 9.937185206914374e-05, + "loss": 0.0387, + "step": 5880 + }, + { + "grad_norm": 0.4900940954685211, + "learning_rate": 9.936749007663829e-05, + "loss": 0.0296, + "step": 5890 + }, + { + "grad_norm": 0.48753684759140015, + "learning_rate": 9.93631130875928e-05, + "loss": 0.0299, + "step": 5900 + }, + { + "grad_norm": 0.47119656205177307, + "learning_rate": 9.935872110333692e-05, + "loss": 0.03, + "step": 5910 + }, + { + "grad_norm": 0.4481375813484192, + "learning_rate": 9.935431412520484e-05, + "loss": 0.03, + "step": 5920 + }, + { + "grad_norm": 0.41306567192077637, + "learning_rate": 9.934989215453523e-05, + "loss": 0.0252, + "step": 5930 + }, + { + "grad_norm": 0.5313475728034973, + "learning_rate": 9.934545519267139e-05, + "loss": 0.0261, + "step": 5940 + }, + { + "grad_norm": 0.586621105670929, + "learning_rate": 9.934100324096117e-05, + "loss": 0.0251, + "step": 5950 + }, + { + "grad_norm": 0.5261890888214111, + "learning_rate": 9.933653630075692e-05, + "loss": 0.0247, + "step": 5960 + }, + { + "grad_norm": 0.5286491513252258, + "learning_rate": 9.93320543734156e-05, + "loss": 0.0223, + "step": 5970 + }, + { + "grad_norm": 0.42709657549858093, + "learning_rate": 9.932755746029871e-05, + "loss": 0.0281, + "step": 5980 + }, + { + "grad_norm": 0.5061715841293335, + "learning_rate": 9.932304556277228e-05, + "loss": 0.0255, + "step": 5990 + }, + { + "grad_norm": 0.6657582521438599, + "learning_rate": 9.93185186822069e-05, + "loss": 0.0284, + "step": 6000 + }, + { + "grad_norm": 0.4694940745830536, + "learning_rate": 9.931397681997773e-05, + "loss": 0.0252, + "step": 6010 + }, + { + "grad_norm": 0.7472635507583618, + "learning_rate": 9.930941997746446e-05, + "loss": 0.0294, + "step": 6020 + }, + { + "grad_norm": 0.584044337272644, + "learning_rate": 9.930484815605134e-05, + "loss": 0.0255, + "step": 6030 + }, + { + "grad_norm": 0.5345356464385986, + "learning_rate": 9.930026135712717e-05, + "loss": 0.0304, + "step": 6040 + }, + { + "grad_norm": 0.41178804636001587, + "learning_rate": 9.92956595820853e-05, + "loss": 0.0221, + "step": 6050 + }, + { + "grad_norm": 0.6209378242492676, + "learning_rate": 9.929104283232362e-05, + "loss": 0.0252, + "step": 6060 + }, + { + "grad_norm": 0.537927508354187, + "learning_rate": 9.92864111092446e-05, + "loss": 0.0247, + "step": 6070 + }, + { + "grad_norm": 0.4989626109600067, + "learning_rate": 9.92817644142552e-05, + "loss": 0.0301, + "step": 6080 + }, + { + "grad_norm": 0.48206672072410583, + "learning_rate": 9.927710274876698e-05, + "loss": 0.0268, + "step": 6090 + }, + { + "grad_norm": 0.4491099417209625, + "learning_rate": 9.927242611419603e-05, + "loss": 0.0228, + "step": 6100 + }, + { + "grad_norm": 0.5502435564994812, + "learning_rate": 9.926773451196301e-05, + "loss": 0.0231, + "step": 6110 + }, + { + "grad_norm": 0.602737307548523, + "learning_rate": 9.926302794349306e-05, + "loss": 0.0266, + "step": 6120 + }, + { + "grad_norm": 0.5966441035270691, + "learning_rate": 9.925830641021594e-05, + "loss": 0.0268, + "step": 6130 + }, + { + "grad_norm": 0.3484886884689331, + "learning_rate": 9.925356991356593e-05, + "loss": 0.0223, + "step": 6140 + }, + { + "grad_norm": 0.5503848791122437, + "learning_rate": 9.924881845498184e-05, + "loss": 0.0235, + "step": 6150 + }, + { + "grad_norm": 0.42554327845573425, + "learning_rate": 9.924405203590705e-05, + "loss": 0.0296, + "step": 6160 + }, + { + "grad_norm": 0.4679335355758667, + "learning_rate": 9.923927065778946e-05, + "loss": 0.0275, + "step": 6170 + }, + { + "grad_norm": 0.46980857849121094, + "learning_rate": 9.923447432208154e-05, + "loss": 0.0262, + "step": 6180 + }, + { + "grad_norm": 0.6113616824150085, + "learning_rate": 9.922966303024027e-05, + "loss": 0.0306, + "step": 6190 + }, + { + "grad_norm": 0.5694376826286316, + "learning_rate": 9.922483678372721e-05, + "loss": 0.0271, + "step": 6200 + }, + { + "grad_norm": 0.5350271463394165, + "learning_rate": 9.921999558400845e-05, + "loss": 0.0262, + "step": 6210 + }, + { + "grad_norm": 0.5522851943969727, + "learning_rate": 9.92151394325546e-05, + "loss": 0.0257, + "step": 6220 + }, + { + "grad_norm": 0.5042399764060974, + "learning_rate": 9.921026833084084e-05, + "loss": 0.025, + "step": 6230 + }, + { + "grad_norm": 0.522422730922699, + "learning_rate": 9.920538228034689e-05, + "loss": 0.027, + "step": 6240 + }, + { + "grad_norm": 0.5598258972167969, + "learning_rate": 9.920048128255699e-05, + "loss": 0.0281, + "step": 6250 + }, + { + "grad_norm": 0.45274388790130615, + "learning_rate": 9.919556533895995e-05, + "loss": 0.0294, + "step": 6260 + }, + { + "grad_norm": 0.5932326912879944, + "learning_rate": 9.919063445104907e-05, + "loss": 0.0268, + "step": 6270 + }, + { + "grad_norm": 0.5251244306564331, + "learning_rate": 9.918568862032227e-05, + "loss": 0.0279, + "step": 6280 + }, + { + "grad_norm": 0.5877266526222229, + "learning_rate": 9.918072784828194e-05, + "loss": 0.0313, + "step": 6290 + }, + { + "grad_norm": 0.48378267884254456, + "learning_rate": 9.917575213643501e-05, + "loss": 0.0245, + "step": 6300 + }, + { + "grad_norm": 0.5477020740509033, + "learning_rate": 9.917076148629302e-05, + "loss": 0.0271, + "step": 6310 + }, + { + "grad_norm": 0.6756936311721802, + "learning_rate": 9.916575589937196e-05, + "loss": 0.0237, + "step": 6320 + }, + { + "grad_norm": 0.5312181115150452, + "learning_rate": 9.916073537719239e-05, + "loss": 0.0254, + "step": 6330 + }, + { + "grad_norm": 0.7678133845329285, + "learning_rate": 9.915569992127944e-05, + "loss": 0.0285, + "step": 6340 + }, + { + "grad_norm": 0.4704437553882599, + "learning_rate": 9.915064953316273e-05, + "loss": 0.0217, + "step": 6350 + }, + { + "grad_norm": 0.5444080233573914, + "learning_rate": 9.914558421437645e-05, + "loss": 0.025, + "step": 6360 + }, + { + "grad_norm": 0.6273277997970581, + "learning_rate": 9.914050396645929e-05, + "loss": 0.0274, + "step": 6370 + }, + { + "grad_norm": 0.460055410861969, + "learning_rate": 9.913540879095452e-05, + "loss": 0.024, + "step": 6380 + }, + { + "grad_norm": 0.4514501392841339, + "learning_rate": 9.913029868940987e-05, + "loss": 0.0254, + "step": 6390 + }, + { + "grad_norm": 0.5933006405830383, + "learning_rate": 9.912517366337772e-05, + "loss": 0.0255, + "step": 6400 + }, + { + "grad_norm": 0.5432915687561035, + "learning_rate": 9.912003371441487e-05, + "loss": 0.0257, + "step": 6410 + }, + { + "grad_norm": 0.5635409355163574, + "learning_rate": 9.911487884408271e-05, + "loss": 0.0268, + "step": 6420 + }, + { + "grad_norm": 0.45503464341163635, + "learning_rate": 9.910970905394719e-05, + "loss": 0.0218, + "step": 6430 + }, + { + "grad_norm": 0.5414345860481262, + "learning_rate": 9.91045243455787e-05, + "loss": 0.0247, + "step": 6440 + }, + { + "grad_norm": 0.46933358907699585, + "learning_rate": 9.909932472055225e-05, + "loss": 0.0212, + "step": 6450 + }, + { + "grad_norm": 0.49070093035697937, + "learning_rate": 9.909411018044734e-05, + "loss": 0.0221, + "step": 6460 + }, + { + "grad_norm": 0.472852885723114, + "learning_rate": 9.908888072684802e-05, + "loss": 0.0269, + "step": 6470 + }, + { + "grad_norm": 0.5244875550270081, + "learning_rate": 9.908363636134285e-05, + "loss": 0.0219, + "step": 6480 + }, + { + "grad_norm": 0.49625301361083984, + "learning_rate": 9.907837708552493e-05, + "loss": 0.0246, + "step": 6490 + }, + { + "grad_norm": 0.529707133769989, + "learning_rate": 9.90731029009919e-05, + "loss": 0.0233, + "step": 6500 + }, + { + "grad_norm": 0.5402315855026245, + "learning_rate": 9.906781380934589e-05, + "loss": 0.0244, + "step": 6510 + }, + { + "grad_norm": 0.46394890546798706, + "learning_rate": 9.906250981219362e-05, + "loss": 0.0285, + "step": 6520 + }, + { + "grad_norm": 0.41746652126312256, + "learning_rate": 9.905719091114628e-05, + "loss": 0.0252, + "step": 6530 + }, + { + "grad_norm": 0.44857144355773926, + "learning_rate": 9.905185710781964e-05, + "loss": 0.0235, + "step": 6540 + }, + { + "grad_norm": 0.6205761432647705, + "learning_rate": 9.904650840383392e-05, + "loss": 0.0235, + "step": 6550 + }, + { + "grad_norm": 0.378711074590683, + "learning_rate": 9.904114480081397e-05, + "loss": 0.0259, + "step": 6560 + }, + { + "grad_norm": 0.5038026571273804, + "learning_rate": 9.903576630038906e-05, + "loss": 0.0288, + "step": 6570 + }, + { + "grad_norm": 0.5139516592025757, + "learning_rate": 9.903037290419309e-05, + "loss": 0.0343, + "step": 6580 + }, + { + "grad_norm": 0.4362604022026062, + "learning_rate": 9.902496461386439e-05, + "loss": 0.0256, + "step": 6590 + }, + { + "grad_norm": 0.5129692554473877, + "learning_rate": 9.901954143104588e-05, + "loss": 0.0238, + "step": 6600 + }, + { + "grad_norm": 0.5847436785697937, + "learning_rate": 9.901410335738496e-05, + "loss": 0.0242, + "step": 6610 + }, + { + "grad_norm": 0.5122170448303223, + "learning_rate": 9.900865039453358e-05, + "loss": 0.0233, + "step": 6620 + }, + { + "grad_norm": 0.5081768035888672, + "learning_rate": 9.900318254414821e-05, + "loss": 0.0279, + "step": 6630 + }, + { + "grad_norm": 0.480114609003067, + "learning_rate": 9.899769980788985e-05, + "loss": 0.0253, + "step": 6640 + }, + { + "grad_norm": 0.51370769739151, + "learning_rate": 9.899220218742398e-05, + "loss": 0.0222, + "step": 6650 + }, + { + "grad_norm": 0.446836918592453, + "learning_rate": 9.898668968442066e-05, + "loss": 0.0225, + "step": 6660 + }, + { + "grad_norm": 0.4553784728050232, + "learning_rate": 9.898116230055443e-05, + "loss": 0.0244, + "step": 6670 + }, + { + "grad_norm": 0.621615469455719, + "learning_rate": 9.897562003750437e-05, + "loss": 0.0276, + "step": 6680 + }, + { + "grad_norm": 0.5878480076789856, + "learning_rate": 9.897006289695407e-05, + "loss": 0.0243, + "step": 6690 + }, + { + "grad_norm": 0.4821608066558838, + "learning_rate": 9.896449088059164e-05, + "loss": 0.0237, + "step": 6700 + }, + { + "grad_norm": 0.4933816194534302, + "learning_rate": 9.89589039901097e-05, + "loss": 0.0287, + "step": 6710 + }, + { + "grad_norm": 0.49964186549186707, + "learning_rate": 9.895330222720542e-05, + "loss": 0.0279, + "step": 6720 + }, + { + "grad_norm": 0.49987050890922546, + "learning_rate": 9.894768559358047e-05, + "loss": 0.0276, + "step": 6730 + }, + { + "grad_norm": 0.40598466992378235, + "learning_rate": 9.894205409094101e-05, + "loss": 0.0289, + "step": 6740 + }, + { + "grad_norm": 0.4163476526737213, + "learning_rate": 9.893640772099777e-05, + "loss": 0.0219, + "step": 6750 + }, + { + "grad_norm": 0.5128793120384216, + "learning_rate": 9.893074648546595e-05, + "loss": 0.0231, + "step": 6760 + }, + { + "grad_norm": 0.5630953311920166, + "learning_rate": 9.892507038606528e-05, + "loss": 0.0246, + "step": 6770 + }, + { + "grad_norm": 0.4373803436756134, + "learning_rate": 9.891937942452003e-05, + "loss": 0.0245, + "step": 6780 + }, + { + "grad_norm": 0.4257532060146332, + "learning_rate": 9.891367360255895e-05, + "loss": 0.0252, + "step": 6790 + }, + { + "grad_norm": 0.5192376375198364, + "learning_rate": 9.890795292191532e-05, + "loss": 0.0323, + "step": 6800 + }, + { + "grad_norm": 0.5340325236320496, + "learning_rate": 9.890221738432694e-05, + "loss": 0.0241, + "step": 6810 + }, + { + "grad_norm": 0.6282180547714233, + "learning_rate": 9.88964669915361e-05, + "loss": 0.0256, + "step": 6820 + }, + { + "grad_norm": 0.513201892375946, + "learning_rate": 9.889070174528963e-05, + "loss": 0.0212, + "step": 6830 + }, + { + "grad_norm": 0.38406088948249817, + "learning_rate": 9.888492164733883e-05, + "loss": 0.0229, + "step": 6840 + }, + { + "grad_norm": 0.40800413489341736, + "learning_rate": 9.88791266994396e-05, + "loss": 0.0221, + "step": 6850 + }, + { + "grad_norm": 0.5369046330451965, + "learning_rate": 9.887331690335223e-05, + "loss": 0.0278, + "step": 6860 + }, + { + "grad_norm": 0.32878604531288147, + "learning_rate": 9.886749226084163e-05, + "loss": 0.028, + "step": 6870 + }, + { + "grad_norm": 0.4625433683395386, + "learning_rate": 9.886165277367714e-05, + "loss": 0.0301, + "step": 6880 + }, + { + "grad_norm": 0.4818950891494751, + "learning_rate": 9.885579844363265e-05, + "loss": 0.026, + "step": 6890 + }, + { + "grad_norm": 0.5412994623184204, + "learning_rate": 9.884992927248656e-05, + "loss": 0.0247, + "step": 6900 + }, + { + "grad_norm": 0.9018186330795288, + "learning_rate": 9.884404526202178e-05, + "loss": 0.026, + "step": 6910 + }, + { + "grad_norm": 1.1377595663070679, + "learning_rate": 9.883814641402568e-05, + "loss": 0.0423, + "step": 6920 + }, + { + "grad_norm": 0.5635457038879395, + "learning_rate": 9.88322327302902e-05, + "loss": 0.0298, + "step": 6930 + }, + { + "grad_norm": 0.4904860258102417, + "learning_rate": 9.882630421261176e-05, + "loss": 0.0273, + "step": 6940 + }, + { + "grad_norm": 0.49857255816459656, + "learning_rate": 9.88203608627913e-05, + "loss": 0.0339, + "step": 6950 + }, + { + "grad_norm": 0.5071297883987427, + "learning_rate": 9.881440268263422e-05, + "loss": 0.0315, + "step": 6960 + }, + { + "grad_norm": 0.5229262709617615, + "learning_rate": 9.880842967395048e-05, + "loss": 0.0295, + "step": 6970 + }, + { + "grad_norm": 0.8999877572059631, + "learning_rate": 9.880244183855452e-05, + "loss": 0.0267, + "step": 6980 + }, + { + "grad_norm": 0.5732887983322144, + "learning_rate": 9.879643917826527e-05, + "loss": 0.033, + "step": 6990 + }, + { + "grad_norm": 0.5922005772590637, + "learning_rate": 9.87904216949062e-05, + "loss": 0.0369, + "step": 7000 + }, + { + "grad_norm": 0.5878987908363342, + "learning_rate": 9.878438939030526e-05, + "loss": 0.0301, + "step": 7010 + }, + { + "grad_norm": 0.4599475860595703, + "learning_rate": 9.877834226629489e-05, + "loss": 0.026, + "step": 7020 + }, + { + "grad_norm": 0.4800131618976593, + "learning_rate": 9.877228032471206e-05, + "loss": 0.0274, + "step": 7030 + }, + { + "grad_norm": 0.45589321851730347, + "learning_rate": 9.876620356739823e-05, + "loss": 0.0284, + "step": 7040 + }, + { + "grad_norm": 0.622838020324707, + "learning_rate": 9.876011199619935e-05, + "loss": 0.0362, + "step": 7050 + }, + { + "grad_norm": 0.4409644603729248, + "learning_rate": 9.875400561296589e-05, + "loss": 0.0264, + "step": 7060 + }, + { + "grad_norm": 0.5039188265800476, + "learning_rate": 9.874788441955278e-05, + "loss": 0.0289, + "step": 7070 + }, + { + "grad_norm": 0.4262818396091461, + "learning_rate": 9.874174841781951e-05, + "loss": 0.0231, + "step": 7080 + }, + { + "grad_norm": 0.5230967998504639, + "learning_rate": 9.873559760963003e-05, + "loss": 0.024, + "step": 7090 + }, + { + "grad_norm": 0.3851624131202698, + "learning_rate": 9.872943199685278e-05, + "loss": 0.0267, + "step": 7100 + }, + { + "grad_norm": 0.8489354252815247, + "learning_rate": 9.872325158136071e-05, + "loss": 0.0268, + "step": 7110 + }, + { + "grad_norm": 0.4670383632183075, + "learning_rate": 9.871705636503128e-05, + "loss": 0.026, + "step": 7120 + }, + { + "grad_norm": 0.5082715749740601, + "learning_rate": 9.871084634974641e-05, + "loss": 0.0247, + "step": 7130 + }, + { + "grad_norm": 0.44554927945137024, + "learning_rate": 9.870462153739257e-05, + "loss": 0.0267, + "step": 7140 + }, + { + "grad_norm": 0.4707428514957428, + "learning_rate": 9.869838192986067e-05, + "loss": 0.0273, + "step": 7150 + }, + { + "grad_norm": 0.5337039232254028, + "learning_rate": 9.869212752904616e-05, + "loss": 0.0294, + "step": 7160 + }, + { + "grad_norm": 0.517871081829071, + "learning_rate": 9.868585833684894e-05, + "loss": 0.0251, + "step": 7170 + }, + { + "grad_norm": 0.4735605716705322, + "learning_rate": 9.867957435517342e-05, + "loss": 0.026, + "step": 7180 + }, + { + "grad_norm": 0.4602915048599243, + "learning_rate": 9.867327558592854e-05, + "loss": 0.0292, + "step": 7190 + }, + { + "grad_norm": 0.43385154008865356, + "learning_rate": 9.866696203102766e-05, + "loss": 0.0241, + "step": 7200 + }, + { + "grad_norm": 0.47179746627807617, + "learning_rate": 9.86606336923887e-05, + "loss": 0.0214, + "step": 7210 + }, + { + "grad_norm": 0.5017564296722412, + "learning_rate": 9.865429057193403e-05, + "loss": 0.0278, + "step": 7220 + }, + { + "grad_norm": 0.39125856757164, + "learning_rate": 9.864793267159053e-05, + "loss": 0.0221, + "step": 7230 + }, + { + "grad_norm": 0.4022477865219116, + "learning_rate": 9.864155999328957e-05, + "loss": 0.0238, + "step": 7240 + }, + { + "grad_norm": 0.4445357918739319, + "learning_rate": 9.8635172538967e-05, + "loss": 0.0213, + "step": 7250 + }, + { + "grad_norm": 0.5076184272766113, + "learning_rate": 9.862877031056312e-05, + "loss": 0.0221, + "step": 7260 + }, + { + "grad_norm": 0.4562908709049225, + "learning_rate": 9.862235331002279e-05, + "loss": 0.0241, + "step": 7270 + }, + { + "grad_norm": 0.4024653434753418, + "learning_rate": 9.861592153929533e-05, + "loss": 0.0205, + "step": 7280 + }, + { + "grad_norm": 0.5546900033950806, + "learning_rate": 9.860947500033455e-05, + "loss": 0.0218, + "step": 7290 + }, + { + "grad_norm": 0.37578102946281433, + "learning_rate": 9.86030136950987e-05, + "loss": 0.0241, + "step": 7300 + }, + { + "grad_norm": 0.44185230135917664, + "learning_rate": 9.85965376255506e-05, + "loss": 0.0272, + "step": 7310 + }, + { + "grad_norm": 0.581455647945404, + "learning_rate": 9.859004679365747e-05, + "loss": 0.023, + "step": 7320 + }, + { + "grad_norm": 0.5566856265068054, + "learning_rate": 9.858354120139108e-05, + "loss": 0.0274, + "step": 7330 + }, + { + "grad_norm": 0.45162466168403625, + "learning_rate": 9.857702085072764e-05, + "loss": 0.022, + "step": 7340 + }, + { + "grad_norm": 0.48306140303611755, + "learning_rate": 9.857048574364787e-05, + "loss": 0.021, + "step": 7350 + }, + { + "grad_norm": 0.42747581005096436, + "learning_rate": 9.856393588213698e-05, + "loss": 0.0215, + "step": 7360 + }, + { + "grad_norm": 0.46995165944099426, + "learning_rate": 9.855737126818458e-05, + "loss": 0.0232, + "step": 7370 + }, + { + "grad_norm": 0.4869796931743622, + "learning_rate": 9.855079190378491e-05, + "loss": 0.0255, + "step": 7380 + }, + { + "grad_norm": 0.6607803702354431, + "learning_rate": 9.854419779093655e-05, + "loss": 0.0285, + "step": 7390 + }, + { + "grad_norm": 0.506090521812439, + "learning_rate": 9.853758893164264e-05, + "loss": 0.0252, + "step": 7400 + }, + { + "grad_norm": 0.42590656876564026, + "learning_rate": 9.853096532791078e-05, + "loss": 0.0215, + "step": 7410 + }, + { + "grad_norm": 0.5458431243896484, + "learning_rate": 9.852432698175304e-05, + "loss": 0.026, + "step": 7420 + }, + { + "grad_norm": 0.500715434551239, + "learning_rate": 9.851767389518597e-05, + "loss": 0.0228, + "step": 7430 + }, + { + "grad_norm": 0.3433137834072113, + "learning_rate": 9.85110060702306e-05, + "loss": 0.0256, + "step": 7440 + }, + { + "grad_norm": 0.44231247901916504, + "learning_rate": 9.850432350891245e-05, + "loss": 0.0205, + "step": 7450 + }, + { + "grad_norm": 0.4089946746826172, + "learning_rate": 9.84976262132615e-05, + "loss": 0.0233, + "step": 7460 + }, + { + "grad_norm": 0.4727373421192169, + "learning_rate": 9.849091418531222e-05, + "loss": 0.0223, + "step": 7470 + }, + { + "grad_norm": 0.46963366866111755, + "learning_rate": 9.848418742710353e-05, + "loss": 0.0234, + "step": 7480 + }, + { + "grad_norm": 0.47123104333877563, + "learning_rate": 9.847744594067885e-05, + "loss": 0.0239, + "step": 7490 + }, + { + "grad_norm": 0.39739325642585754, + "learning_rate": 9.847068972808607e-05, + "loss": 0.0204, + "step": 7500 + }, + { + "grad_norm": 0.46723365783691406, + "learning_rate": 9.846391879137756e-05, + "loss": 0.0192, + "step": 7510 + }, + { + "grad_norm": 0.3652726709842682, + "learning_rate": 9.845713313261012e-05, + "loss": 0.0245, + "step": 7520 + }, + { + "grad_norm": 0.5234922170639038, + "learning_rate": 9.845033275384505e-05, + "loss": 0.0247, + "step": 7530 + }, + { + "grad_norm": 0.40047717094421387, + "learning_rate": 9.844351765714818e-05, + "loss": 0.0217, + "step": 7540 + }, + { + "grad_norm": 0.4735856354236603, + "learning_rate": 9.843668784458971e-05, + "loss": 0.0221, + "step": 7550 + }, + { + "grad_norm": 0.6301396489143372, + "learning_rate": 9.842984331824437e-05, + "loss": 0.0221, + "step": 7560 + }, + { + "grad_norm": 0.40476080775260925, + "learning_rate": 9.842298408019133e-05, + "loss": 0.0235, + "step": 7570 + }, + { + "grad_norm": 0.4858342707157135, + "learning_rate": 9.841611013251429e-05, + "loss": 0.0241, + "step": 7580 + }, + { + "grad_norm": 0.41062504053115845, + "learning_rate": 9.840922147730133e-05, + "loss": 0.022, + "step": 7590 + }, + { + "grad_norm": 0.39325666427612305, + "learning_rate": 9.840231811664506e-05, + "loss": 0.0225, + "step": 7600 + }, + { + "grad_norm": 0.43286484479904175, + "learning_rate": 9.839540005264252e-05, + "loss": 0.0183, + "step": 7610 + }, + { + "grad_norm": 0.4203701913356781, + "learning_rate": 9.838846728739527e-05, + "loss": 0.0194, + "step": 7620 + }, + { + "grad_norm": 0.3241867423057556, + "learning_rate": 9.838151982300927e-05, + "loss": 0.0186, + "step": 7630 + }, + { + "grad_norm": 0.472136527299881, + "learning_rate": 9.8374557661595e-05, + "loss": 0.0208, + "step": 7640 + }, + { + "grad_norm": 0.3961484134197235, + "learning_rate": 9.836758080526735e-05, + "loss": 0.0247, + "step": 7650 + }, + { + "grad_norm": 0.43421080708503723, + "learning_rate": 9.836058925614575e-05, + "loss": 0.0208, + "step": 7660 + }, + { + "grad_norm": 0.5333141684532166, + "learning_rate": 9.8353583016354e-05, + "loss": 0.0213, + "step": 7670 + }, + { + "grad_norm": 0.36163684725761414, + "learning_rate": 9.834656208802044e-05, + "loss": 0.0206, + "step": 7680 + }, + { + "grad_norm": 0.48670899868011475, + "learning_rate": 9.833952647327784e-05, + "loss": 0.021, + "step": 7690 + }, + { + "grad_norm": 0.46940380334854126, + "learning_rate": 9.833247617426342e-05, + "loss": 0.0194, + "step": 7700 + }, + { + "grad_norm": 0.4104388356208801, + "learning_rate": 9.832541119311889e-05, + "loss": 0.0213, + "step": 7710 + }, + { + "grad_norm": 0.40959322452545166, + "learning_rate": 9.83183315319904e-05, + "loss": 0.0193, + "step": 7720 + }, + { + "grad_norm": 0.4774872064590454, + "learning_rate": 9.831123719302855e-05, + "loss": 0.0252, + "step": 7730 + }, + { + "grad_norm": 0.5066968202590942, + "learning_rate": 9.830412817838842e-05, + "loss": 0.0224, + "step": 7740 + }, + { + "grad_norm": 0.4927401840686798, + "learning_rate": 9.829700449022956e-05, + "loss": 0.0204, + "step": 7750 + }, + { + "grad_norm": 0.4348791837692261, + "learning_rate": 9.828986613071593e-05, + "loss": 0.0199, + "step": 7760 + }, + { + "grad_norm": 0.44254836440086365, + "learning_rate": 9.828271310201601e-05, + "loss": 0.0169, + "step": 7770 + }, + { + "grad_norm": 0.4523863196372986, + "learning_rate": 9.827554540630268e-05, + "loss": 0.0232, + "step": 7780 + }, + { + "grad_norm": 0.3269347548484802, + "learning_rate": 9.826836304575329e-05, + "loss": 0.0195, + "step": 7790 + }, + { + "grad_norm": 0.38588622212409973, + "learning_rate": 9.826116602254966e-05, + "loss": 0.023, + "step": 7800 + }, + { + "grad_norm": 0.38418668508529663, + "learning_rate": 9.825395433887805e-05, + "loss": 0.0237, + "step": 7810 + }, + { + "grad_norm": 0.5125848054885864, + "learning_rate": 9.824672799692917e-05, + "loss": 0.0278, + "step": 7820 + }, + { + "grad_norm": 0.5025792717933655, + "learning_rate": 9.823948699889823e-05, + "loss": 0.0326, + "step": 7830 + }, + { + "grad_norm": 0.42523086071014404, + "learning_rate": 9.823223134698483e-05, + "loss": 0.0233, + "step": 7840 + }, + { + "grad_norm": 0.5269473791122437, + "learning_rate": 9.822496104339303e-05, + "loss": 0.0209, + "step": 7850 + }, + { + "grad_norm": 0.4181648790836334, + "learning_rate": 9.821767609033138e-05, + "loss": 0.0205, + "step": 7860 + }, + { + "grad_norm": 0.4053167402744293, + "learning_rate": 9.821037649001284e-05, + "loss": 0.0213, + "step": 7870 + }, + { + "grad_norm": 0.5356869101524353, + "learning_rate": 9.820306224465486e-05, + "loss": 0.023, + "step": 7880 + }, + { + "grad_norm": 0.47925180196762085, + "learning_rate": 9.819573335647928e-05, + "loss": 0.0214, + "step": 7890 + }, + { + "grad_norm": 0.5209689736366272, + "learning_rate": 9.818838982771246e-05, + "loss": 0.0195, + "step": 7900 + }, + { + "grad_norm": 0.41950997710227966, + "learning_rate": 9.818103166058514e-05, + "loss": 0.027, + "step": 7910 + }, + { + "grad_norm": 0.398406982421875, + "learning_rate": 9.817365885733254e-05, + "loss": 0.0205, + "step": 7920 + }, + { + "grad_norm": 0.451774924993515, + "learning_rate": 9.816627142019434e-05, + "loss": 0.0229, + "step": 7930 + }, + { + "grad_norm": 0.590817391872406, + "learning_rate": 9.815886935141463e-05, + "loss": 0.0213, + "step": 7940 + }, + { + "grad_norm": 0.43801072239875793, + "learning_rate": 9.8151452653242e-05, + "loss": 0.0271, + "step": 7950 + }, + { + "grad_norm": 0.3935137689113617, + "learning_rate": 9.814402132792939e-05, + "loss": 0.029, + "step": 7960 + }, + { + "grad_norm": 0.5044687986373901, + "learning_rate": 9.813657537773428e-05, + "loss": 0.0223, + "step": 7970 + }, + { + "grad_norm": 0.47347891330718994, + "learning_rate": 9.812911480491854e-05, + "loss": 0.0292, + "step": 7980 + }, + { + "grad_norm": 0.46765559911727905, + "learning_rate": 9.81216396117485e-05, + "loss": 0.0222, + "step": 7990 + }, + { + "grad_norm": 0.4351886212825775, + "learning_rate": 9.811414980049491e-05, + "loss": 0.0211, + "step": 8000 + }, + { + "grad_norm": 0.4591696262359619, + "learning_rate": 9.810664537343301e-05, + "loss": 0.0248, + "step": 8010 + }, + { + "grad_norm": 0.5217757821083069, + "learning_rate": 9.809912633284243e-05, + "loss": 0.0212, + "step": 8020 + }, + { + "grad_norm": 0.47767433524131775, + "learning_rate": 9.809159268100725e-05, + "loss": 0.0228, + "step": 8030 + }, + { + "grad_norm": 0.468953937292099, + "learning_rate": 9.808404442021599e-05, + "loss": 0.0185, + "step": 8040 + }, + { + "grad_norm": 0.406931072473526, + "learning_rate": 9.807648155276163e-05, + "loss": 0.017, + "step": 8050 + }, + { + "grad_norm": 0.5090564489364624, + "learning_rate": 9.806890408094156e-05, + "loss": 0.0203, + "step": 8060 + }, + { + "grad_norm": 0.4319576621055603, + "learning_rate": 9.806131200705761e-05, + "loss": 0.0194, + "step": 8070 + }, + { + "grad_norm": 0.34996670484542847, + "learning_rate": 9.805370533341605e-05, + "loss": 0.0191, + "step": 8080 + }, + { + "grad_norm": 0.3921457529067993, + "learning_rate": 9.804608406232762e-05, + "loss": 0.018, + "step": 8090 + }, + { + "grad_norm": 0.35749679803848267, + "learning_rate": 9.803844819610741e-05, + "loss": 0.0173, + "step": 8100 + }, + { + "grad_norm": 0.35104918479919434, + "learning_rate": 9.803079773707504e-05, + "loss": 0.0205, + "step": 8110 + }, + { + "grad_norm": 0.3688827455043793, + "learning_rate": 9.802313268755447e-05, + "loss": 0.0206, + "step": 8120 + }, + { + "grad_norm": 0.37073761224746704, + "learning_rate": 9.801545304987419e-05, + "loss": 0.0174, + "step": 8130 + }, + { + "grad_norm": 0.4200715720653534, + "learning_rate": 9.800775882636704e-05, + "loss": 0.0218, + "step": 8140 + }, + { + "grad_norm": 0.45175638794898987, + "learning_rate": 9.800005001937034e-05, + "loss": 0.021, + "step": 8150 + }, + { + "grad_norm": 0.3802504539489746, + "learning_rate": 9.79923266312258e-05, + "loss": 0.0216, + "step": 8160 + }, + { + "grad_norm": 0.43102043867111206, + "learning_rate": 9.79845886642796e-05, + "loss": 0.0208, + "step": 8170 + }, + { + "grad_norm": 0.3668661117553711, + "learning_rate": 9.797683612088233e-05, + "loss": 0.0221, + "step": 8180 + }, + { + "grad_norm": 0.4550149440765381, + "learning_rate": 9.796906900338898e-05, + "loss": 0.0203, + "step": 8190 + }, + { + "grad_norm": 0.45624905824661255, + "learning_rate": 9.796128731415903e-05, + "loss": 0.0212, + "step": 8200 + }, + { + "grad_norm": 0.4912322461605072, + "learning_rate": 9.795349105555634e-05, + "loss": 0.0209, + "step": 8210 + }, + { + "grad_norm": 0.5102741718292236, + "learning_rate": 9.794568022994922e-05, + "loss": 0.023, + "step": 8220 + }, + { + "grad_norm": 0.3720044195652008, + "learning_rate": 9.793785483971034e-05, + "loss": 0.0222, + "step": 8230 + }, + { + "grad_norm": 0.388742059469223, + "learning_rate": 9.793001488721691e-05, + "loss": 0.0176, + "step": 8240 + }, + { + "grad_norm": 0.41112661361694336, + "learning_rate": 9.792216037485047e-05, + "loss": 0.0173, + "step": 8250 + }, + { + "grad_norm": 0.540833592414856, + "learning_rate": 9.791429130499704e-05, + "loss": 0.0182, + "step": 8260 + }, + { + "grad_norm": 0.40391018986701965, + "learning_rate": 9.790640768004698e-05, + "loss": 0.027, + "step": 8270 + }, + { + "grad_norm": 0.44928425550460815, + "learning_rate": 9.789850950239518e-05, + "loss": 0.0216, + "step": 8280 + }, + { + "grad_norm": 0.4349338412284851, + "learning_rate": 9.789059677444089e-05, + "loss": 0.0212, + "step": 8290 + }, + { + "grad_norm": 0.3825474977493286, + "learning_rate": 9.788266949858776e-05, + "loss": 0.0196, + "step": 8300 + }, + { + "grad_norm": 0.4604344666004181, + "learning_rate": 9.787472767724392e-05, + "loss": 0.0257, + "step": 8310 + }, + { + "grad_norm": 0.488352507352829, + "learning_rate": 9.786677131282185e-05, + "loss": 0.0232, + "step": 8320 + }, + { + "grad_norm": 0.42431139945983887, + "learning_rate": 9.785880040773853e-05, + "loss": 0.0225, + "step": 8330 + }, + { + "grad_norm": 0.41543862223625183, + "learning_rate": 9.785081496441527e-05, + "loss": 0.0213, + "step": 8340 + }, + { + "grad_norm": 0.45707616209983826, + "learning_rate": 9.784281498527785e-05, + "loss": 0.0202, + "step": 8350 + }, + { + "grad_norm": 0.3349325656890869, + "learning_rate": 9.783480047275646e-05, + "loss": 0.0198, + "step": 8360 + }, + { + "grad_norm": 0.5466597080230713, + "learning_rate": 9.78267714292857e-05, + "loss": 0.0165, + "step": 8370 + }, + { + "grad_norm": 0.42251455783843994, + "learning_rate": 9.781872785730454e-05, + "loss": 0.0208, + "step": 8380 + }, + { + "grad_norm": 0.47086822986602783, + "learning_rate": 9.781066975925646e-05, + "loss": 0.0201, + "step": 8390 + }, + { + "grad_norm": 0.39013954997062683, + "learning_rate": 9.780259713758928e-05, + "loss": 0.017, + "step": 8400 + }, + { + "grad_norm": 0.43151065707206726, + "learning_rate": 9.779450999475524e-05, + "loss": 0.0192, + "step": 8410 + }, + { + "grad_norm": 0.6166801452636719, + "learning_rate": 9.7786408333211e-05, + "loss": 0.0205, + "step": 8420 + }, + { + "grad_norm": 0.6463521122932434, + "learning_rate": 9.777829215541764e-05, + "loss": 0.025, + "step": 8430 + }, + { + "grad_norm": 0.5083146691322327, + "learning_rate": 9.777016146384064e-05, + "loss": 0.0232, + "step": 8440 + }, + { + "grad_norm": 0.4979715943336487, + "learning_rate": 9.776201626094988e-05, + "loss": 0.0214, + "step": 8450 + }, + { + "grad_norm": 0.3771785497665405, + "learning_rate": 9.775385654921965e-05, + "loss": 0.0211, + "step": 8460 + }, + { + "grad_norm": 0.4617980420589447, + "learning_rate": 9.774568233112868e-05, + "loss": 0.0202, + "step": 8470 + }, + { + "grad_norm": 0.5535889863967896, + "learning_rate": 9.773749360916007e-05, + "loss": 0.0287, + "step": 8480 + }, + { + "grad_norm": 0.430027574300766, + "learning_rate": 9.772929038580134e-05, + "loss": 0.0272, + "step": 8490 + }, + { + "grad_norm": 0.5492871999740601, + "learning_rate": 9.772107266354439e-05, + "loss": 0.0222, + "step": 8500 + }, + { + "grad_norm": 0.45631569623947144, + "learning_rate": 9.77128404448856e-05, + "loss": 0.0238, + "step": 8510 + }, + { + "grad_norm": 0.4331152141094208, + "learning_rate": 9.770459373232565e-05, + "loss": 0.0225, + "step": 8520 + }, + { + "grad_norm": 0.4748595356941223, + "learning_rate": 9.769633252836969e-05, + "loss": 0.0187, + "step": 8530 + }, + { + "grad_norm": 0.4157826900482178, + "learning_rate": 9.768805683552724e-05, + "loss": 0.0175, + "step": 8540 + }, + { + "grad_norm": 0.3154793381690979, + "learning_rate": 9.767976665631228e-05, + "loss": 0.0231, + "step": 8550 + }, + { + "grad_norm": 0.4567323625087738, + "learning_rate": 9.767146199324311e-05, + "loss": 0.0197, + "step": 8560 + }, + { + "grad_norm": 0.4639695882797241, + "learning_rate": 9.766314284884249e-05, + "loss": 0.0242, + "step": 8570 + }, + { + "grad_norm": 0.45093733072280884, + "learning_rate": 9.765480922563752e-05, + "loss": 0.0207, + "step": 8580 + }, + { + "grad_norm": 0.41865789890289307, + "learning_rate": 9.764646112615978e-05, + "loss": 0.0201, + "step": 8590 + }, + { + "grad_norm": 0.37710443139076233, + "learning_rate": 9.763809855294517e-05, + "loss": 0.0231, + "step": 8600 + }, + { + "grad_norm": 0.4054875671863556, + "learning_rate": 9.762972150853404e-05, + "loss": 0.0232, + "step": 8610 + }, + { + "grad_norm": 0.46502891182899475, + "learning_rate": 9.762132999547111e-05, + "loss": 0.023, + "step": 8620 + }, + { + "grad_norm": 0.43600308895111084, + "learning_rate": 9.761292401630549e-05, + "loss": 0.0206, + "step": 8630 + }, + { + "grad_norm": 0.41320857405662537, + "learning_rate": 9.76045035735907e-05, + "loss": 0.0192, + "step": 8640 + }, + { + "grad_norm": 0.38332414627075195, + "learning_rate": 9.759606866988464e-05, + "loss": 0.0254, + "step": 8650 + }, + { + "grad_norm": 0.41022416949272156, + "learning_rate": 9.758761930774963e-05, + "loss": 0.0202, + "step": 8660 + }, + { + "grad_norm": 0.47089439630508423, + "learning_rate": 9.757915548975235e-05, + "loss": 0.0218, + "step": 8670 + }, + { + "grad_norm": 0.42582616209983826, + "learning_rate": 9.757067721846389e-05, + "loss": 0.0233, + "step": 8680 + }, + { + "grad_norm": 0.5691205859184265, + "learning_rate": 9.756218449645971e-05, + "loss": 0.0208, + "step": 8690 + }, + { + "grad_norm": 0.44527509808540344, + "learning_rate": 9.75536773263197e-05, + "loss": 0.0214, + "step": 8700 + }, + { + "grad_norm": 0.47421303391456604, + "learning_rate": 9.75451557106281e-05, + "loss": 0.0179, + "step": 8710 + }, + { + "grad_norm": 0.37851452827453613, + "learning_rate": 9.753661965197354e-05, + "loss": 0.0193, + "step": 8720 + }, + { + "grad_norm": 0.4480518698692322, + "learning_rate": 9.752806915294908e-05, + "loss": 0.0214, + "step": 8730 + }, + { + "grad_norm": 0.33988139033317566, + "learning_rate": 9.75195042161521e-05, + "loss": 0.0201, + "step": 8740 + }, + { + "grad_norm": 0.5393968820571899, + "learning_rate": 9.751092484418442e-05, + "loss": 0.0221, + "step": 8750 + }, + { + "grad_norm": 0.3613015413284302, + "learning_rate": 9.750233103965224e-05, + "loss": 0.0228, + "step": 8760 + }, + { + "grad_norm": 0.43525227904319763, + "learning_rate": 9.749372280516611e-05, + "loss": 0.0194, + "step": 8770 + }, + { + "grad_norm": 0.5981136560440063, + "learning_rate": 9.748510014334097e-05, + "loss": 0.0226, + "step": 8780 + }, + { + "grad_norm": 0.4568023979663849, + "learning_rate": 9.747646305679621e-05, + "loss": 0.0242, + "step": 8790 + }, + { + "grad_norm": 0.35805001854896545, + "learning_rate": 9.74678115481555e-05, + "loss": 0.0197, + "step": 8800 + }, + { + "grad_norm": 0.4945303499698639, + "learning_rate": 9.745914562004696e-05, + "loss": 0.0225, + "step": 8810 + }, + { + "grad_norm": 0.5129665732383728, + "learning_rate": 9.745046527510307e-05, + "loss": 0.0236, + "step": 8820 + }, + { + "grad_norm": 0.44416165351867676, + "learning_rate": 9.744177051596068e-05, + "loss": 0.021, + "step": 8830 + }, + { + "grad_norm": 0.41258132457733154, + "learning_rate": 9.743306134526105e-05, + "loss": 0.0176, + "step": 8840 + }, + { + "grad_norm": 0.719923198223114, + "learning_rate": 9.742433776564977e-05, + "loss": 0.0216, + "step": 8850 + }, + { + "grad_norm": 0.27094727754592896, + "learning_rate": 9.741559977977683e-05, + "loss": 0.0232, + "step": 8860 + }, + { + "grad_norm": 0.4060170650482178, + "learning_rate": 9.740684739029661e-05, + "loss": 0.0186, + "step": 8870 + }, + { + "grad_norm": 0.34258833527565, + "learning_rate": 9.739808059986789e-05, + "loss": 0.0214, + "step": 8880 + }, + { + "grad_norm": 0.4309740662574768, + "learning_rate": 9.738929941115373e-05, + "loss": 0.0167, + "step": 8890 + }, + { + "grad_norm": 0.4455328583717346, + "learning_rate": 9.738050382682167e-05, + "loss": 0.023, + "step": 8900 + }, + { + "grad_norm": 0.3460875451564789, + "learning_rate": 9.737169384954355e-05, + "loss": 0.0175, + "step": 8910 + }, + { + "grad_norm": 0.5134332180023193, + "learning_rate": 9.736286948199562e-05, + "loss": 0.0179, + "step": 8920 + }, + { + "grad_norm": 0.38894134759902954, + "learning_rate": 9.735403072685848e-05, + "loss": 0.017, + "step": 8930 + }, + { + "grad_norm": 0.35251039266586304, + "learning_rate": 9.734517758681712e-05, + "loss": 0.0176, + "step": 8940 + }, + { + "grad_norm": 0.41770267486572266, + "learning_rate": 9.733631006456088e-05, + "loss": 0.0201, + "step": 8950 + }, + { + "grad_norm": 0.4722731113433838, + "learning_rate": 9.732742816278348e-05, + "loss": 0.0216, + "step": 8960 + }, + { + "grad_norm": 0.3995599150657654, + "learning_rate": 9.731853188418302e-05, + "loss": 0.0153, + "step": 8970 + }, + { + "grad_norm": 0.4286731779575348, + "learning_rate": 9.730962123146194e-05, + "loss": 0.021, + "step": 8980 + }, + { + "grad_norm": 0.49526873230934143, + "learning_rate": 9.730069620732709e-05, + "loss": 0.0211, + "step": 8990 + }, + { + "grad_norm": 0.4411509335041046, + "learning_rate": 9.72917568144896e-05, + "loss": 0.0243, + "step": 9000 + }, + { + "grad_norm": 0.33901944756507874, + "learning_rate": 9.728280305566509e-05, + "loss": 0.0187, + "step": 9010 + }, + { + "grad_norm": 0.31252339482307434, + "learning_rate": 9.727383493357343e-05, + "loss": 0.0174, + "step": 9020 + }, + { + "grad_norm": 0.43856510519981384, + "learning_rate": 9.726485245093891e-05, + "loss": 0.0179, + "step": 9030 + }, + { + "grad_norm": 0.3634667694568634, + "learning_rate": 9.725585561049018e-05, + "loss": 0.0175, + "step": 9040 + }, + { + "grad_norm": 0.46502628922462463, + "learning_rate": 9.724684441496022e-05, + "loss": 0.0175, + "step": 9050 + }, + { + "grad_norm": 0.40700143575668335, + "learning_rate": 9.72378188670864e-05, + "loss": 0.0203, + "step": 9060 + }, + { + "grad_norm": 0.40098509192466736, + "learning_rate": 9.722877896961047e-05, + "loss": 0.0181, + "step": 9070 + }, + { + "grad_norm": 0.33963191509246826, + "learning_rate": 9.721972472527848e-05, + "loss": 0.0179, + "step": 9080 + }, + { + "grad_norm": 0.4348234534263611, + "learning_rate": 9.721065613684089e-05, + "loss": 0.0177, + "step": 9090 + }, + { + "grad_norm": 0.40149277448654175, + "learning_rate": 9.72015732070525e-05, + "loss": 0.0218, + "step": 9100 + }, + { + "grad_norm": 0.471910297870636, + "learning_rate": 9.719247593867244e-05, + "loss": 0.0205, + "step": 9110 + }, + { + "grad_norm": 0.4839973449707031, + "learning_rate": 9.718336433446423e-05, + "loss": 0.0207, + "step": 9120 + }, + { + "grad_norm": 0.43992653489112854, + "learning_rate": 9.717423839719574e-05, + "loss": 0.0153, + "step": 9130 + }, + { + "grad_norm": 0.32272520661354065, + "learning_rate": 9.71650981296392e-05, + "loss": 0.0201, + "step": 9140 + }, + { + "grad_norm": 0.46651262044906616, + "learning_rate": 9.715594353457118e-05, + "loss": 0.0228, + "step": 9150 + }, + { + "grad_norm": 0.3393571376800537, + "learning_rate": 9.714677461477257e-05, + "loss": 0.0185, + "step": 9160 + }, + { + "grad_norm": 0.5020555853843689, + "learning_rate": 9.713759137302869e-05, + "loss": 0.022, + "step": 9170 + }, + { + "grad_norm": 0.4701262414455414, + "learning_rate": 9.712839381212914e-05, + "loss": 0.0209, + "step": 9180 + }, + { + "grad_norm": 0.4095565676689148, + "learning_rate": 9.71191819348679e-05, + "loss": 0.0161, + "step": 9190 + }, + { + "grad_norm": 0.4011421203613281, + "learning_rate": 9.710995574404331e-05, + "loss": 0.0203, + "step": 9200 + }, + { + "grad_norm": 0.4591890573501587, + "learning_rate": 9.710071524245802e-05, + "loss": 0.0184, + "step": 9210 + }, + { + "grad_norm": 0.4001629054546356, + "learning_rate": 9.709146043291906e-05, + "loss": 0.0224, + "step": 9220 + }, + { + "grad_norm": 0.4999217689037323, + "learning_rate": 9.70821913182378e-05, + "loss": 0.0199, + "step": 9230 + }, + { + "grad_norm": 0.377604603767395, + "learning_rate": 9.707290790122995e-05, + "loss": 0.0188, + "step": 9240 + }, + { + "grad_norm": 0.3935345411300659, + "learning_rate": 9.706361018471557e-05, + "loss": 0.0229, + "step": 9250 + }, + { + "grad_norm": 0.3941493034362793, + "learning_rate": 9.705429817151906e-05, + "loss": 0.0196, + "step": 9260 + }, + { + "grad_norm": 0.38897705078125, + "learning_rate": 9.704497186446917e-05, + "loss": 0.0203, + "step": 9270 + }, + { + "grad_norm": 0.4263511598110199, + "learning_rate": 9.703563126639896e-05, + "loss": 0.0154, + "step": 9280 + }, + { + "grad_norm": 0.43895986676216125, + "learning_rate": 9.70262763801459e-05, + "loss": 0.0184, + "step": 9290 + }, + { + "grad_norm": 0.47976306080818176, + "learning_rate": 9.701690720855171e-05, + "loss": 0.0208, + "step": 9300 + }, + { + "grad_norm": 0.5062249898910522, + "learning_rate": 9.700752375446253e-05, + "loss": 0.0206, + "step": 9310 + }, + { + "grad_norm": 0.4274824261665344, + "learning_rate": 9.69981260207288e-05, + "loss": 0.0204, + "step": 9320 + }, + { + "grad_norm": 0.48724469542503357, + "learning_rate": 9.698871401020529e-05, + "loss": 0.0232, + "step": 9330 + }, + { + "grad_norm": 0.44530192017555237, + "learning_rate": 9.697928772575112e-05, + "loss": 0.0215, + "step": 9340 + }, + { + "grad_norm": 0.34917524456977844, + "learning_rate": 9.696984717022976e-05, + "loss": 0.0178, + "step": 9350 + }, + { + "grad_norm": 0.33441391587257385, + "learning_rate": 9.6960392346509e-05, + "loss": 0.021, + "step": 9360 + }, + { + "grad_norm": 0.48504024744033813, + "learning_rate": 9.695092325746097e-05, + "loss": 0.0245, + "step": 9370 + }, + { + "grad_norm": 0.407268226146698, + "learning_rate": 9.694143990596211e-05, + "loss": 0.022, + "step": 9380 + }, + { + "grad_norm": 0.3820374608039856, + "learning_rate": 9.693194229489325e-05, + "loss": 0.02, + "step": 9390 + }, + { + "grad_norm": 0.5052766799926758, + "learning_rate": 9.692243042713944e-05, + "loss": 0.0175, + "step": 9400 + }, + { + "grad_norm": 0.4108522832393646, + "learning_rate": 9.691290430559022e-05, + "loss": 0.0177, + "step": 9410 + }, + { + "grad_norm": 0.49897515773773193, + "learning_rate": 9.690336393313932e-05, + "loss": 0.0207, + "step": 9420 + }, + { + "grad_norm": 0.5770998001098633, + "learning_rate": 9.689380931268487e-05, + "loss": 0.0183, + "step": 9430 + }, + { + "grad_norm": 0.3491075336933136, + "learning_rate": 9.688424044712932e-05, + "loss": 0.0253, + "step": 9440 + }, + { + "grad_norm": 0.3772425055503845, + "learning_rate": 9.687465733937942e-05, + "loss": 0.0176, + "step": 9450 + }, + { + "grad_norm": 0.4027300477027893, + "learning_rate": 9.686505999234627e-05, + "loss": 0.0217, + "step": 9460 + }, + { + "grad_norm": 0.48696863651275635, + "learning_rate": 9.685544840894529e-05, + "loss": 0.0212, + "step": 9470 + }, + { + "grad_norm": 0.41473135352134705, + "learning_rate": 9.684582259209624e-05, + "loss": 0.0195, + "step": 9480 + }, + { + "grad_norm": 0.4275462329387665, + "learning_rate": 9.683618254472317e-05, + "loss": 0.0233, + "step": 9490 + }, + { + "grad_norm": 0.4512259364128113, + "learning_rate": 9.682652826975449e-05, + "loss": 0.0231, + "step": 9500 + }, + { + "grad_norm": 0.3336307108402252, + "learning_rate": 9.681685977012291e-05, + "loss": 0.019, + "step": 9510 + }, + { + "grad_norm": 0.33084091544151306, + "learning_rate": 9.680717704876546e-05, + "loss": 0.0223, + "step": 9520 + }, + { + "grad_norm": 0.39084622263908386, + "learning_rate": 9.679748010862349e-05, + "loss": 0.0196, + "step": 9530 + }, + { + "grad_norm": 0.4734356105327606, + "learning_rate": 9.678776895264267e-05, + "loss": 0.0214, + "step": 9540 + }, + { + "grad_norm": 0.4087027907371521, + "learning_rate": 9.6778043583773e-05, + "loss": 0.0221, + "step": 9550 + }, + { + "grad_norm": 0.4503321945667267, + "learning_rate": 9.67683040049688e-05, + "loss": 0.0229, + "step": 9560 + }, + { + "grad_norm": 0.516207218170166, + "learning_rate": 9.675855021918869e-05, + "loss": 0.0213, + "step": 9570 + }, + { + "grad_norm": 0.4708077311515808, + "learning_rate": 9.674878222939561e-05, + "loss": 0.0227, + "step": 9580 + }, + { + "grad_norm": 0.39283517003059387, + "learning_rate": 9.673900003855681e-05, + "loss": 0.0174, + "step": 9590 + }, + { + "grad_norm": 0.4651467204093933, + "learning_rate": 9.672920364964389e-05, + "loss": 0.0202, + "step": 9600 + }, + { + "grad_norm": 0.44438183307647705, + "learning_rate": 9.671939306563269e-05, + "loss": 0.0221, + "step": 9610 + }, + { + "grad_norm": 0.4396190047264099, + "learning_rate": 9.670956828950345e-05, + "loss": 0.0208, + "step": 9620 + }, + { + "grad_norm": 0.5852305889129639, + "learning_rate": 9.669972932424065e-05, + "loss": 0.0228, + "step": 9630 + }, + { + "grad_norm": 0.3958914577960968, + "learning_rate": 9.668987617283312e-05, + "loss": 0.0171, + "step": 9640 + }, + { + "grad_norm": 0.37439844012260437, + "learning_rate": 9.668000883827397e-05, + "loss": 0.0219, + "step": 9650 + }, + { + "grad_norm": 0.44193920493125916, + "learning_rate": 9.667012732356067e-05, + "loss": 0.0183, + "step": 9660 + }, + { + "grad_norm": 0.41145873069763184, + "learning_rate": 9.666023163169493e-05, + "loss": 0.0195, + "step": 9670 + }, + { + "grad_norm": 0.5343393087387085, + "learning_rate": 9.665032176568281e-05, + "loss": 0.019, + "step": 9680 + }, + { + "grad_norm": 0.496954083442688, + "learning_rate": 9.664039772853469e-05, + "loss": 0.0168, + "step": 9690 + }, + { + "grad_norm": 0.3513059616088867, + "learning_rate": 9.663045952326518e-05, + "loss": 0.0178, + "step": 9700 + }, + { + "grad_norm": 0.47588497400283813, + "learning_rate": 9.662050715289328e-05, + "loss": 0.0204, + "step": 9710 + }, + { + "grad_norm": 0.4222734868526459, + "learning_rate": 9.661054062044226e-05, + "loss": 0.0199, + "step": 9720 + }, + { + "grad_norm": 0.4997919201850891, + "learning_rate": 9.660055992893968e-05, + "loss": 0.0197, + "step": 9730 + }, + { + "grad_norm": 0.31947237253189087, + "learning_rate": 9.659056508141739e-05, + "loss": 0.0177, + "step": 9740 + }, + { + "grad_norm": 0.3846675157546997, + "learning_rate": 9.658055608091161e-05, + "loss": 0.018, + "step": 9750 + }, + { + "grad_norm": 0.36692550778388977, + "learning_rate": 9.657053293046276e-05, + "loss": 0.0183, + "step": 9760 + }, + { + "grad_norm": 0.5022429823875427, + "learning_rate": 9.656049563311564e-05, + "loss": 0.018, + "step": 9770 + }, + { + "grad_norm": 0.4174310564994812, + "learning_rate": 9.655044419191929e-05, + "loss": 0.019, + "step": 9780 + }, + { + "grad_norm": 0.4601549804210663, + "learning_rate": 9.654037860992711e-05, + "loss": 0.0212, + "step": 9790 + }, + { + "grad_norm": 0.38675203919410706, + "learning_rate": 9.653029889019672e-05, + "loss": 0.0154, + "step": 9800 + }, + { + "grad_norm": 0.5508227944374084, + "learning_rate": 9.65202050357901e-05, + "loss": 0.0199, + "step": 9810 + }, + { + "grad_norm": 0.3828381597995758, + "learning_rate": 9.651009704977347e-05, + "loss": 0.0159, + "step": 9820 + }, + { + "grad_norm": 0.426347941160202, + "learning_rate": 9.649997493521738e-05, + "loss": 0.0229, + "step": 9830 + }, + { + "grad_norm": 0.3594949543476105, + "learning_rate": 9.64898386951967e-05, + "loss": 0.0204, + "step": 9840 + }, + { + "grad_norm": 0.4823181927204132, + "learning_rate": 9.647968833279049e-05, + "loss": 0.0239, + "step": 9850 + }, + { + "grad_norm": 0.4254204034805298, + "learning_rate": 9.646952385108218e-05, + "loss": 0.0207, + "step": 9860 + }, + { + "grad_norm": 0.41789454221725464, + "learning_rate": 9.645934525315951e-05, + "loss": 0.0175, + "step": 9870 + }, + { + "grad_norm": 0.3172294795513153, + "learning_rate": 9.644915254211442e-05, + "loss": 0.0192, + "step": 9880 + }, + { + "grad_norm": 0.4198409616947174, + "learning_rate": 9.643894572104321e-05, + "loss": 0.0243, + "step": 9890 + }, + { + "grad_norm": 0.3813696503639221, + "learning_rate": 9.642872479304644e-05, + "loss": 0.0192, + "step": 9900 + }, + { + "grad_norm": 0.3871660530567169, + "learning_rate": 9.641848976122895e-05, + "loss": 0.0196, + "step": 9910 + }, + { + "grad_norm": 0.3427238464355469, + "learning_rate": 9.64082406286999e-05, + "loss": 0.0226, + "step": 9920 + }, + { + "grad_norm": 0.28634995222091675, + "learning_rate": 9.639797739857269e-05, + "loss": 0.015, + "step": 9930 + }, + { + "grad_norm": 0.399654746055603, + "learning_rate": 9.638770007396498e-05, + "loss": 0.0229, + "step": 9940 + }, + { + "grad_norm": 0.3774471580982208, + "learning_rate": 9.63774086579988e-05, + "loss": 0.0211, + "step": 9950 + }, + { + "grad_norm": 0.5255162119865417, + "learning_rate": 9.63671031538004e-05, + "loss": 0.0197, + "step": 9960 + }, + { + "grad_norm": 0.41620728373527527, + "learning_rate": 9.635678356450031e-05, + "loss": 0.0201, + "step": 9970 + }, + { + "grad_norm": 0.3821359872817993, + "learning_rate": 9.634644989323336e-05, + "loss": 0.018, + "step": 9980 + }, + { + "grad_norm": 0.476431280374527, + "learning_rate": 9.633610214313861e-05, + "loss": 0.023, + "step": 9990 + }, + { + "grad_norm": 0.3535378873348236, + "learning_rate": 9.632574031735951e-05, + "loss": 0.0249, + "step": 10000 + }, + { + "grad_norm": 0.4726323187351227, + "learning_rate": 9.631536441904364e-05, + "loss": 0.0225, + "step": 10010 + }, + { + "grad_norm": 0.4587574899196625, + "learning_rate": 9.630497445134293e-05, + "loss": 0.0211, + "step": 10020 + }, + { + "grad_norm": 0.36823904514312744, + "learning_rate": 9.62945704174136e-05, + "loss": 0.022, + "step": 10030 + }, + { + "grad_norm": 0.5425640940666199, + "learning_rate": 9.628415232041612e-05, + "loss": 0.0255, + "step": 10040 + }, + { + "grad_norm": 0.40507611632347107, + "learning_rate": 9.627372016351524e-05, + "loss": 0.0171, + "step": 10050 + }, + { + "grad_norm": 0.4909762144088745, + "learning_rate": 9.626327394987995e-05, + "loss": 0.0212, + "step": 10060 + }, + { + "grad_norm": 0.36966660618782043, + "learning_rate": 9.625281368268355e-05, + "loss": 0.022, + "step": 10070 + }, + { + "grad_norm": 0.5324448943138123, + "learning_rate": 9.624233936510357e-05, + "loss": 0.0208, + "step": 10080 + }, + { + "grad_norm": 0.44957393407821655, + "learning_rate": 9.623185100032187e-05, + "loss": 0.0207, + "step": 10090 + }, + { + "grad_norm": 0.3709962069988251, + "learning_rate": 9.62213485915245e-05, + "loss": 0.0213, + "step": 10100 + }, + { + "grad_norm": 0.6879405975341797, + "learning_rate": 9.621083214190186e-05, + "loss": 0.019, + "step": 10110 + }, + { + "grad_norm": 0.4251745045185089, + "learning_rate": 9.62003016546485e-05, + "loss": 0.0205, + "step": 10120 + }, + { + "grad_norm": 0.42113032937049866, + "learning_rate": 9.618975713296339e-05, + "loss": 0.0207, + "step": 10130 + }, + { + "grad_norm": 0.4057314097881317, + "learning_rate": 9.61791985800496e-05, + "loss": 0.0194, + "step": 10140 + }, + { + "grad_norm": 0.2854096591472626, + "learning_rate": 9.616862599911458e-05, + "loss": 0.0178, + "step": 10150 + }, + { + "grad_norm": 0.4426201283931732, + "learning_rate": 9.615803939337e-05, + "loss": 0.0177, + "step": 10160 + }, + { + "grad_norm": 0.4161039888858795, + "learning_rate": 9.614743876603178e-05, + "loss": 0.0208, + "step": 10170 + }, + { + "grad_norm": 0.44474175572395325, + "learning_rate": 9.613682412032013e-05, + "loss": 0.0186, + "step": 10180 + }, + { + "grad_norm": 0.3343549072742462, + "learning_rate": 9.612619545945947e-05, + "loss": 0.0198, + "step": 10190 + }, + { + "grad_norm": 0.4139963984489441, + "learning_rate": 9.611555278667852e-05, + "loss": 0.0198, + "step": 10200 + }, + { + "grad_norm": 0.47918879985809326, + "learning_rate": 9.610489610521024e-05, + "loss": 0.0222, + "step": 10210 + }, + { + "grad_norm": 0.39651069045066833, + "learning_rate": 9.609422541829187e-05, + "loss": 0.0201, + "step": 10220 + }, + { + "grad_norm": 0.393915593624115, + "learning_rate": 9.608354072916486e-05, + "loss": 0.0193, + "step": 10230 + }, + { + "grad_norm": 0.4271937608718872, + "learning_rate": 9.607284204107493e-05, + "loss": 0.0166, + "step": 10240 + }, + { + "grad_norm": 0.4442124366760254, + "learning_rate": 9.606212935727208e-05, + "loss": 0.0193, + "step": 10250 + }, + { + "grad_norm": 0.33683204650878906, + "learning_rate": 9.605140268101052e-05, + "loss": 0.0202, + "step": 10260 + }, + { + "grad_norm": 0.30396541953086853, + "learning_rate": 9.604066201554875e-05, + "loss": 0.0187, + "step": 10270 + }, + { + "grad_norm": 0.4092296063899994, + "learning_rate": 9.60299073641495e-05, + "loss": 0.0254, + "step": 10280 + }, + { + "grad_norm": 0.3376172184944153, + "learning_rate": 9.601913873007974e-05, + "loss": 0.0214, + "step": 10290 + }, + { + "grad_norm": 0.3814334571361542, + "learning_rate": 9.60083561166107e-05, + "loss": 0.0193, + "step": 10300 + }, + { + "grad_norm": 0.3993943929672241, + "learning_rate": 9.599755952701783e-05, + "loss": 0.0194, + "step": 10310 + }, + { + "grad_norm": 0.4468556344509125, + "learning_rate": 9.598674896458089e-05, + "loss": 0.0225, + "step": 10320 + }, + { + "grad_norm": 0.39103955030441284, + "learning_rate": 9.597592443258383e-05, + "loss": 0.0181, + "step": 10330 + }, + { + "grad_norm": 0.29275935888290405, + "learning_rate": 9.596508593431483e-05, + "loss": 0.0162, + "step": 10340 + }, + { + "grad_norm": 0.3820743262767792, + "learning_rate": 9.59542334730664e-05, + "loss": 0.0159, + "step": 10350 + }, + { + "grad_norm": 0.2735508978366852, + "learning_rate": 9.594336705213516e-05, + "loss": 0.0134, + "step": 10360 + }, + { + "grad_norm": 0.3563217222690582, + "learning_rate": 9.593248667482208e-05, + "loss": 0.0188, + "step": 10370 + }, + { + "grad_norm": 0.4323899447917938, + "learning_rate": 9.592159234443233e-05, + "loss": 0.0166, + "step": 10380 + }, + { + "grad_norm": 0.4390353858470917, + "learning_rate": 9.59106840642753e-05, + "loss": 0.0164, + "step": 10390 + }, + { + "grad_norm": 0.49410876631736755, + "learning_rate": 9.589976183766467e-05, + "loss": 0.0205, + "step": 10400 + }, + { + "grad_norm": 0.40017032623291016, + "learning_rate": 9.58888256679183e-05, + "loss": 0.0188, + "step": 10410 + }, + { + "grad_norm": 0.4119909703731537, + "learning_rate": 9.587787555835832e-05, + "loss": 0.0185, + "step": 10420 + }, + { + "grad_norm": 0.4083598554134369, + "learning_rate": 9.586691151231107e-05, + "loss": 0.0165, + "step": 10430 + }, + { + "grad_norm": 0.38529402017593384, + "learning_rate": 9.585593353310715e-05, + "loss": 0.0187, + "step": 10440 + }, + { + "grad_norm": 0.3211827576160431, + "learning_rate": 9.58449416240814e-05, + "loss": 0.0169, + "step": 10450 + }, + { + "grad_norm": 0.4595158100128174, + "learning_rate": 9.583393578857283e-05, + "loss": 0.0164, + "step": 10460 + }, + { + "grad_norm": 0.4236113131046295, + "learning_rate": 9.582291602992474e-05, + "loss": 0.0194, + "step": 10470 + }, + { + "grad_norm": 0.3500874936580658, + "learning_rate": 9.581188235148466e-05, + "loss": 0.0175, + "step": 10480 + }, + { + "grad_norm": 0.4048246145248413, + "learning_rate": 9.58008347566043e-05, + "loss": 0.0182, + "step": 10490 + }, + { + "grad_norm": 0.46567973494529724, + "learning_rate": 9.578977324863965e-05, + "loss": 0.0185, + "step": 10500 + }, + { + "grad_norm": 0.35758182406425476, + "learning_rate": 9.577869783095089e-05, + "loss": 0.0219, + "step": 10510 + }, + { + "grad_norm": 0.42754676938056946, + "learning_rate": 9.576760850690245e-05, + "loss": 0.0229, + "step": 10520 + }, + { + "grad_norm": 0.43167948722839355, + "learning_rate": 9.575650527986298e-05, + "loss": 0.0214, + "step": 10530 + }, + { + "grad_norm": 0.29197803139686584, + "learning_rate": 9.574538815320531e-05, + "loss": 0.0162, + "step": 10540 + }, + { + "grad_norm": 0.3847380578517914, + "learning_rate": 9.573425713030656e-05, + "loss": 0.0146, + "step": 10550 + }, + { + "grad_norm": 0.37215369939804077, + "learning_rate": 9.572311221454806e-05, + "loss": 0.0197, + "step": 10560 + }, + { + "grad_norm": 0.3433307707309723, + "learning_rate": 9.57119534093153e-05, + "loss": 0.0186, + "step": 10570 + }, + { + "grad_norm": 0.33722245693206787, + "learning_rate": 9.570078071799806e-05, + "loss": 0.0156, + "step": 10580 + }, + { + "grad_norm": 0.3338375687599182, + "learning_rate": 9.568959414399028e-05, + "loss": 0.018, + "step": 10590 + }, + { + "grad_norm": 0.3536750376224518, + "learning_rate": 9.567839369069018e-05, + "loss": 0.0166, + "step": 10600 + }, + { + "grad_norm": 0.3826412260532379, + "learning_rate": 9.566717936150013e-05, + "loss": 0.0189, + "step": 10610 + }, + { + "grad_norm": 0.3103746175765991, + "learning_rate": 9.565595115982678e-05, + "loss": 0.0168, + "step": 10620 + }, + { + "grad_norm": 0.3635064661502838, + "learning_rate": 9.564470908908094e-05, + "loss": 0.0183, + "step": 10630 + }, + { + "grad_norm": 0.28928908705711365, + "learning_rate": 9.563345315267764e-05, + "loss": 0.0162, + "step": 10640 + }, + { + "grad_norm": 0.4335481524467468, + "learning_rate": 9.562218335403616e-05, + "loss": 0.0146, + "step": 10650 + }, + { + "grad_norm": 0.3834221363067627, + "learning_rate": 9.561089969657999e-05, + "loss": 0.0184, + "step": 10660 + }, + { + "grad_norm": 0.35768556594848633, + "learning_rate": 9.559960218373673e-05, + "loss": 0.0167, + "step": 10670 + }, + { + "grad_norm": 0.43383631110191345, + "learning_rate": 9.558829081893836e-05, + "loss": 0.0214, + "step": 10680 + }, + { + "grad_norm": 0.4365203082561493, + "learning_rate": 9.55769656056209e-05, + "loss": 0.0154, + "step": 10690 + }, + { + "grad_norm": 0.3742186725139618, + "learning_rate": 9.556562654722469e-05, + "loss": 0.0157, + "step": 10700 + }, + { + "grad_norm": 0.4212026596069336, + "learning_rate": 9.555427364719422e-05, + "loss": 0.019, + "step": 10710 + }, + { + "grad_norm": 0.3102274537086487, + "learning_rate": 9.55429069089782e-05, + "loss": 0.0157, + "step": 10720 + }, + { + "grad_norm": 0.30959296226501465, + "learning_rate": 9.553152633602956e-05, + "loss": 0.0163, + "step": 10730 + }, + { + "grad_norm": 0.39335960149765015, + "learning_rate": 9.552013193180543e-05, + "loss": 0.0187, + "step": 10740 + }, + { + "grad_norm": 0.46019238233566284, + "learning_rate": 9.550872369976707e-05, + "loss": 0.0186, + "step": 10750 + }, + { + "grad_norm": 0.5118435025215149, + "learning_rate": 9.549730164338007e-05, + "loss": 0.0181, + "step": 10760 + }, + { + "grad_norm": 0.38193657994270325, + "learning_rate": 9.548586576611408e-05, + "loss": 0.0176, + "step": 10770 + }, + { + "grad_norm": 0.3409776985645294, + "learning_rate": 9.54744160714431e-05, + "loss": 0.0145, + "step": 10780 + }, + { + "grad_norm": 0.4351353347301483, + "learning_rate": 9.546295256284516e-05, + "loss": 0.0197, + "step": 10790 + }, + { + "grad_norm": 0.3557078242301941, + "learning_rate": 9.545147524380265e-05, + "loss": 0.0178, + "step": 10800 + }, + { + "grad_norm": 0.4559164047241211, + "learning_rate": 9.543998411780201e-05, + "loss": 0.0158, + "step": 10810 + }, + { + "grad_norm": 0.28311851620674133, + "learning_rate": 9.542847918833397e-05, + "loss": 0.0198, + "step": 10820 + }, + { + "grad_norm": 0.4107087254524231, + "learning_rate": 9.541696045889343e-05, + "loss": 0.0179, + "step": 10830 + }, + { + "grad_norm": 0.366896390914917, + "learning_rate": 9.540542793297947e-05, + "loss": 0.0184, + "step": 10840 + }, + { + "grad_norm": 0.4244709610939026, + "learning_rate": 9.539388161409537e-05, + "loss": 0.0179, + "step": 10850 + }, + { + "grad_norm": 0.31330156326293945, + "learning_rate": 9.538232150574857e-05, + "loss": 0.0192, + "step": 10860 + }, + { + "grad_norm": 0.24634262919425964, + "learning_rate": 9.537074761145076e-05, + "loss": 0.0161, + "step": 10870 + }, + { + "grad_norm": 0.2985272705554962, + "learning_rate": 9.535915993471778e-05, + "loss": 0.0154, + "step": 10880 + }, + { + "grad_norm": 0.41853320598602295, + "learning_rate": 9.534755847906964e-05, + "loss": 0.0162, + "step": 10890 + }, + { + "grad_norm": 0.34125640988349915, + "learning_rate": 9.533594324803057e-05, + "loss": 0.0152, + "step": 10900 + }, + { + "grad_norm": 0.4875860810279846, + "learning_rate": 9.532431424512895e-05, + "loss": 0.0153, + "step": 10910 + }, + { + "grad_norm": 0.40720805525779724, + "learning_rate": 9.531267147389741e-05, + "loss": 0.016, + "step": 10920 + }, + { + "grad_norm": 0.39791691303253174, + "learning_rate": 9.530101493787266e-05, + "loss": 0.0198, + "step": 10930 + }, + { + "grad_norm": 0.32826876640319824, + "learning_rate": 9.528934464059571e-05, + "loss": 0.0151, + "step": 10940 + }, + { + "grad_norm": 0.37639832496643066, + "learning_rate": 9.527766058561163e-05, + "loss": 0.0166, + "step": 10950 + }, + { + "grad_norm": 0.3262758255004883, + "learning_rate": 9.526596277646976e-05, + "loss": 0.0154, + "step": 10960 + }, + { + "grad_norm": 0.4586452841758728, + "learning_rate": 9.525425121672358e-05, + "loss": 0.0169, + "step": 10970 + }, + { + "grad_norm": 0.34545138478279114, + "learning_rate": 9.524252590993074e-05, + "loss": 0.017, + "step": 10980 + }, + { + "grad_norm": 0.3826800584793091, + "learning_rate": 9.523078685965309e-05, + "loss": 0.0186, + "step": 10990 + }, + { + "grad_norm": 0.3638732433319092, + "learning_rate": 9.521903406945664e-05, + "loss": 0.0208, + "step": 11000 + }, + { + "grad_norm": 0.29514259099960327, + "learning_rate": 9.520726754291158e-05, + "loss": 0.0161, + "step": 11010 + }, + { + "grad_norm": 0.4679412543773651, + "learning_rate": 9.519548728359227e-05, + "loss": 0.0183, + "step": 11020 + }, + { + "grad_norm": 0.3475341796875, + "learning_rate": 9.518369329507726e-05, + "loss": 0.0196, + "step": 11030 + }, + { + "grad_norm": 0.35793352127075195, + "learning_rate": 9.51718855809492e-05, + "loss": 0.0161, + "step": 11040 + }, + { + "grad_norm": 0.32648763060569763, + "learning_rate": 9.516006414479502e-05, + "loss": 0.0161, + "step": 11050 + }, + { + "grad_norm": 0.369131863117218, + "learning_rate": 9.514822899020572e-05, + "loss": 0.0148, + "step": 11060 + }, + { + "grad_norm": 0.5173619389533997, + "learning_rate": 9.513638012077654e-05, + "loss": 0.0221, + "step": 11070 + }, + { + "grad_norm": 0.29296427965164185, + "learning_rate": 9.512451754010683e-05, + "loss": 0.0175, + "step": 11080 + }, + { + "grad_norm": 0.342509925365448, + "learning_rate": 9.511264125180013e-05, + "loss": 0.0174, + "step": 11090 + }, + { + "grad_norm": 0.3526631295681, + "learning_rate": 9.510075125946414e-05, + "loss": 0.0181, + "step": 11100 + }, + { + "grad_norm": 0.3492644727230072, + "learning_rate": 9.508884756671075e-05, + "loss": 0.0213, + "step": 11110 + }, + { + "grad_norm": 0.4704457223415375, + "learning_rate": 9.507693017715596e-05, + "loss": 0.0194, + "step": 11120 + }, + { + "grad_norm": 0.38454246520996094, + "learning_rate": 9.506499909441997e-05, + "loss": 0.0168, + "step": 11130 + }, + { + "grad_norm": 0.42641520500183105, + "learning_rate": 9.505305432212713e-05, + "loss": 0.022, + "step": 11140 + }, + { + "grad_norm": 0.513241708278656, + "learning_rate": 9.504109586390595e-05, + "loss": 0.0248, + "step": 11150 + }, + { + "grad_norm": 0.4879985749721527, + "learning_rate": 9.502912372338908e-05, + "loss": 0.0206, + "step": 11160 + }, + { + "grad_norm": 0.4405342638492584, + "learning_rate": 9.501713790421335e-05, + "loss": 0.0194, + "step": 11170 + }, + { + "grad_norm": 0.44434618949890137, + "learning_rate": 9.500513841001974e-05, + "loss": 0.0165, + "step": 11180 + }, + { + "grad_norm": 0.2799209654331207, + "learning_rate": 9.499312524445336e-05, + "loss": 0.0221, + "step": 11190 + }, + { + "grad_norm": 0.42130419611930847, + "learning_rate": 9.498109841116351e-05, + "loss": 0.0217, + "step": 11200 + }, + { + "grad_norm": 0.3674059212207794, + "learning_rate": 9.496905791380363e-05, + "loss": 0.0222, + "step": 11210 + }, + { + "grad_norm": 0.3240606486797333, + "learning_rate": 9.495700375603129e-05, + "loss": 0.0163, + "step": 11220 + }, + { + "grad_norm": 0.3400369882583618, + "learning_rate": 9.494493594150822e-05, + "loss": 0.02, + "step": 11230 + }, + { + "grad_norm": 0.39605817198753357, + "learning_rate": 9.493285447390032e-05, + "loss": 0.0164, + "step": 11240 + }, + { + "grad_norm": 0.481381356716156, + "learning_rate": 9.492075935687761e-05, + "loss": 0.0205, + "step": 11250 + }, + { + "grad_norm": 0.4486084282398224, + "learning_rate": 9.490865059411427e-05, + "loss": 0.0164, + "step": 11260 + }, + { + "grad_norm": 0.3878118097782135, + "learning_rate": 9.489652818928863e-05, + "loss": 0.0161, + "step": 11270 + }, + { + "grad_norm": 0.4139678180217743, + "learning_rate": 9.488439214608315e-05, + "loss": 0.0169, + "step": 11280 + }, + { + "grad_norm": 0.5058616399765015, + "learning_rate": 9.487224246818444e-05, + "loss": 0.0182, + "step": 11290 + }, + { + "grad_norm": 0.31589049100875854, + "learning_rate": 9.486007915928325e-05, + "loss": 0.0187, + "step": 11300 + }, + { + "grad_norm": 0.38299834728240967, + "learning_rate": 9.484790222307448e-05, + "loss": 0.0188, + "step": 11310 + }, + { + "grad_norm": 0.3907677233219147, + "learning_rate": 9.483571166325716e-05, + "loss": 0.0164, + "step": 11320 + }, + { + "grad_norm": 0.27185311913490295, + "learning_rate": 9.482350748353444e-05, + "loss": 0.0183, + "step": 11330 + }, + { + "grad_norm": 0.40438348054885864, + "learning_rate": 9.481128968761363e-05, + "loss": 0.0204, + "step": 11340 + }, + { + "grad_norm": 0.3392781913280487, + "learning_rate": 9.479905827920621e-05, + "loss": 0.0194, + "step": 11350 + }, + { + "grad_norm": 0.46958860754966736, + "learning_rate": 9.478681326202773e-05, + "loss": 0.0173, + "step": 11360 + }, + { + "grad_norm": 0.4522368311882019, + "learning_rate": 9.477455463979791e-05, + "loss": 0.0201, + "step": 11370 + }, + { + "grad_norm": 0.313892126083374, + "learning_rate": 9.476228241624059e-05, + "loss": 0.0163, + "step": 11380 + }, + { + "grad_norm": 0.3670874834060669, + "learning_rate": 9.474999659508374e-05, + "loss": 0.014, + "step": 11390 + }, + { + "grad_norm": 0.3470529317855835, + "learning_rate": 9.47376971800595e-05, + "loss": 0.0139, + "step": 11400 + }, + { + "grad_norm": 0.33186304569244385, + "learning_rate": 9.472538417490409e-05, + "loss": 0.0181, + "step": 11410 + }, + { + "grad_norm": 0.369559109210968, + "learning_rate": 9.471305758335784e-05, + "loss": 0.0171, + "step": 11420 + }, + { + "grad_norm": 0.2526124119758606, + "learning_rate": 9.47007174091653e-05, + "loss": 0.0173, + "step": 11430 + }, + { + "grad_norm": 0.508969247341156, + "learning_rate": 9.468836365607507e-05, + "loss": 0.0147, + "step": 11440 + }, + { + "grad_norm": 0.414359450340271, + "learning_rate": 9.467599632783988e-05, + "loss": 0.0169, + "step": 11450 + }, + { + "grad_norm": 0.37647607922554016, + "learning_rate": 9.466361542821662e-05, + "loss": 0.023, + "step": 11460 + }, + { + "grad_norm": 0.33450043201446533, + "learning_rate": 9.465122096096625e-05, + "loss": 0.0198, + "step": 11470 + }, + { + "grad_norm": 0.33466044068336487, + "learning_rate": 9.463881292985391e-05, + "loss": 0.0189, + "step": 11480 + }, + { + "grad_norm": 0.32771435379981995, + "learning_rate": 9.462639133864881e-05, + "loss": 0.0154, + "step": 11490 + }, + { + "grad_norm": 0.2925075888633728, + "learning_rate": 9.461395619112432e-05, + "loss": 0.0185, + "step": 11500 + }, + { + "grad_norm": 0.3099575638771057, + "learning_rate": 9.460150749105791e-05, + "loss": 0.0156, + "step": 11510 + }, + { + "grad_norm": 0.3670234978199005, + "learning_rate": 9.458904524223116e-05, + "loss": 0.0179, + "step": 11520 + }, + { + "grad_norm": 0.5236250758171082, + "learning_rate": 9.457656944842976e-05, + "loss": 0.0198, + "step": 11530 + }, + { + "grad_norm": 0.48519566655158997, + "learning_rate": 9.456408011344353e-05, + "loss": 0.0166, + "step": 11540 + }, + { + "grad_norm": 0.4143960773944855, + "learning_rate": 9.455157724106643e-05, + "loss": 0.0169, + "step": 11550 + }, + { + "grad_norm": 0.3964592516422272, + "learning_rate": 9.453906083509647e-05, + "loss": 0.0165, + "step": 11560 + }, + { + "grad_norm": 0.42435070872306824, + "learning_rate": 9.45265308993358e-05, + "loss": 0.0164, + "step": 11570 + }, + { + "grad_norm": 0.4208449125289917, + "learning_rate": 9.451398743759071e-05, + "loss": 0.0147, + "step": 11580 + }, + { + "grad_norm": 0.3677162826061249, + "learning_rate": 9.450143045367156e-05, + "loss": 0.0184, + "step": 11590 + }, + { + "grad_norm": 0.35301387310028076, + "learning_rate": 9.448885995139283e-05, + "loss": 0.0159, + "step": 11600 + }, + { + "grad_norm": 0.3524612784385681, + "learning_rate": 9.44762759345731e-05, + "loss": 0.0172, + "step": 11610 + }, + { + "grad_norm": 0.36330240964889526, + "learning_rate": 9.446367840703509e-05, + "loss": 0.0191, + "step": 11620 + }, + { + "grad_norm": 0.3642682731151581, + "learning_rate": 9.445106737260556e-05, + "loss": 0.0157, + "step": 11630 + }, + { + "grad_norm": 0.37559136748313904, + "learning_rate": 9.443844283511543e-05, + "loss": 0.0185, + "step": 11640 + }, + { + "grad_norm": 0.36658918857574463, + "learning_rate": 9.442580479839968e-05, + "loss": 0.0141, + "step": 11650 + }, + { + "grad_norm": 0.38368797302246094, + "learning_rate": 9.441315326629745e-05, + "loss": 0.015, + "step": 11660 + }, + { + "grad_norm": 0.38235482573509216, + "learning_rate": 9.44004882426519e-05, + "loss": 0.0206, + "step": 11670 + }, + { + "grad_norm": 0.3051309585571289, + "learning_rate": 9.438780973131037e-05, + "loss": 0.0137, + "step": 11680 + }, + { + "grad_norm": 0.35470375418663025, + "learning_rate": 9.437511773612423e-05, + "loss": 0.017, + "step": 11690 + }, + { + "grad_norm": 0.34460386633872986, + "learning_rate": 9.436241226094896e-05, + "loss": 0.0188, + "step": 11700 + }, + { + "grad_norm": 0.40933752059936523, + "learning_rate": 9.434969330964418e-05, + "loss": 0.0216, + "step": 11710 + }, + { + "grad_norm": 0.40221714973449707, + "learning_rate": 9.433696088607356e-05, + "loss": 0.0157, + "step": 11720 + }, + { + "grad_norm": 0.30475372076034546, + "learning_rate": 9.432421499410486e-05, + "loss": 0.0165, + "step": 11730 + }, + { + "grad_norm": 0.30148208141326904, + "learning_rate": 9.431145563760998e-05, + "loss": 0.0152, + "step": 11740 + }, + { + "grad_norm": 0.3930191397666931, + "learning_rate": 9.429868282046484e-05, + "loss": 0.015, + "step": 11750 + }, + { + "grad_norm": 0.3486611843109131, + "learning_rate": 9.428589654654951e-05, + "loss": 0.0172, + "step": 11760 + }, + { + "grad_norm": 0.36533284187316895, + "learning_rate": 9.42730968197481e-05, + "loss": 0.0136, + "step": 11770 + }, + { + "grad_norm": 0.33773350715637207, + "learning_rate": 9.426028364394883e-05, + "loss": 0.0181, + "step": 11780 + }, + { + "grad_norm": 0.47433459758758545, + "learning_rate": 9.424745702304402e-05, + "loss": 0.0176, + "step": 11790 + }, + { + "grad_norm": 0.2880942225456238, + "learning_rate": 9.423461696093006e-05, + "loss": 0.017, + "step": 11800 + }, + { + "grad_norm": 0.29186585545539856, + "learning_rate": 9.422176346150741e-05, + "loss": 0.0156, + "step": 11810 + }, + { + "grad_norm": 0.36226364970207214, + "learning_rate": 9.420889652868063e-05, + "loss": 0.0208, + "step": 11820 + }, + { + "grad_norm": 0.44331395626068115, + "learning_rate": 9.419601616635836e-05, + "loss": 0.0173, + "step": 11830 + }, + { + "grad_norm": 0.32699716091156006, + "learning_rate": 9.418312237845331e-05, + "loss": 0.0148, + "step": 11840 + }, + { + "grad_norm": 0.333482027053833, + "learning_rate": 9.417021516888225e-05, + "loss": 0.0181, + "step": 11850 + }, + { + "grad_norm": 0.3288630545139313, + "learning_rate": 9.415729454156608e-05, + "loss": 0.0188, + "step": 11860 + }, + { + "grad_norm": 0.2743556797504425, + "learning_rate": 9.414436050042973e-05, + "loss": 0.0212, + "step": 11870 + }, + { + "grad_norm": 0.3627742528915405, + "learning_rate": 9.413141304940223e-05, + "loss": 0.0154, + "step": 11880 + }, + { + "grad_norm": 0.4373108446598053, + "learning_rate": 9.411845219241666e-05, + "loss": 0.0182, + "step": 11890 + }, + { + "grad_norm": 0.34265047311782837, + "learning_rate": 9.410547793341021e-05, + "loss": 0.0186, + "step": 11900 + }, + { + "grad_norm": 0.3257286548614502, + "learning_rate": 9.409249027632408e-05, + "loss": 0.017, + "step": 11910 + }, + { + "grad_norm": 0.3931034803390503, + "learning_rate": 9.407948922510362e-05, + "loss": 0.0168, + "step": 11920 + }, + { + "grad_norm": 0.3891640603542328, + "learning_rate": 9.406647478369817e-05, + "loss": 0.015, + "step": 11930 + }, + { + "grad_norm": 0.3381834030151367, + "learning_rate": 9.405344695606118e-05, + "loss": 0.0181, + "step": 11940 + }, + { + "grad_norm": 0.35451003909111023, + "learning_rate": 9.404040574615018e-05, + "loss": 0.0173, + "step": 11950 + }, + { + "grad_norm": 0.384470134973526, + "learning_rate": 9.402735115792674e-05, + "loss": 0.0177, + "step": 11960 + }, + { + "grad_norm": 0.39360836148262024, + "learning_rate": 9.401428319535649e-05, + "loss": 0.0191, + "step": 11970 + }, + { + "grad_norm": 0.4161492586135864, + "learning_rate": 9.400120186240912e-05, + "loss": 0.0158, + "step": 11980 + }, + { + "grad_norm": 0.4901255965232849, + "learning_rate": 9.398810716305844e-05, + "loss": 0.016, + "step": 11990 + }, + { + "grad_norm": 0.34617289900779724, + "learning_rate": 9.397499910128222e-05, + "loss": 0.0145, + "step": 12000 + }, + { + "grad_norm": 0.352186381816864, + "learning_rate": 9.396187768106237e-05, + "loss": 0.019, + "step": 12010 + }, + { + "grad_norm": 0.35461387038230896, + "learning_rate": 9.394874290638482e-05, + "loss": 0.0205, + "step": 12020 + }, + { + "grad_norm": 0.3936839699745178, + "learning_rate": 9.393559478123959e-05, + "loss": 0.016, + "step": 12030 + }, + { + "grad_norm": 0.4476394057273865, + "learning_rate": 9.39224333096207e-05, + "loss": 0.0142, + "step": 12040 + }, + { + "grad_norm": 0.37746569514274597, + "learning_rate": 9.390925849552629e-05, + "loss": 0.0173, + "step": 12050 + }, + { + "grad_norm": 0.47711020708084106, + "learning_rate": 9.389607034295849e-05, + "loss": 0.0195, + "step": 12060 + }, + { + "grad_norm": 0.3538389205932617, + "learning_rate": 9.388286885592355e-05, + "loss": 0.0156, + "step": 12070 + }, + { + "grad_norm": 0.35910990834236145, + "learning_rate": 9.386965403843168e-05, + "loss": 0.0151, + "step": 12080 + }, + { + "grad_norm": 0.30144861340522766, + "learning_rate": 9.385642589449726e-05, + "loss": 0.014, + "step": 12090 + }, + { + "grad_norm": 0.31444376707077026, + "learning_rate": 9.38431844281386e-05, + "loss": 0.0167, + "step": 12100 + }, + { + "grad_norm": 0.3443724811077118, + "learning_rate": 9.38299296433781e-05, + "loss": 0.0153, + "step": 12110 + }, + { + "grad_norm": 0.3032764494419098, + "learning_rate": 9.381666154424226e-05, + "loss": 0.0159, + "step": 12120 + }, + { + "grad_norm": 0.372444748878479, + "learning_rate": 9.380338013476157e-05, + "loss": 0.0144, + "step": 12130 + }, + { + "grad_norm": 0.35097065567970276, + "learning_rate": 9.379008541897054e-05, + "loss": 0.0159, + "step": 12140 + }, + { + "grad_norm": 0.39746779203414917, + "learning_rate": 9.377677740090777e-05, + "loss": 0.0163, + "step": 12150 + }, + { + "grad_norm": 0.33482182025909424, + "learning_rate": 9.376345608461588e-05, + "loss": 0.0238, + "step": 12160 + }, + { + "grad_norm": 0.2922261357307434, + "learning_rate": 9.375012147414155e-05, + "loss": 0.0145, + "step": 12170 + }, + { + "grad_norm": 0.3712657690048218, + "learning_rate": 9.373677357353545e-05, + "loss": 0.0189, + "step": 12180 + }, + { + "grad_norm": 0.36195287108421326, + "learning_rate": 9.372341238685237e-05, + "loss": 0.0196, + "step": 12190 + }, + { + "grad_norm": 0.3878498673439026, + "learning_rate": 9.371003791815102e-05, + "loss": 0.0146, + "step": 12200 + }, + { + "grad_norm": 0.4278899133205414, + "learning_rate": 9.369665017149429e-05, + "loss": 0.02, + "step": 12210 + }, + { + "grad_norm": 0.3481537997722626, + "learning_rate": 9.368324915094895e-05, + "loss": 0.0194, + "step": 12220 + }, + { + "grad_norm": 0.3668903410434723, + "learning_rate": 9.366983486058591e-05, + "loss": 0.0195, + "step": 12230 + }, + { + "grad_norm": 0.37120190262794495, + "learning_rate": 9.365640730448009e-05, + "loss": 0.015, + "step": 12240 + }, + { + "grad_norm": 0.33916807174682617, + "learning_rate": 9.36429664867104e-05, + "loss": 0.0179, + "step": 12250 + }, + { + "grad_norm": 0.37433159351348877, + "learning_rate": 9.362951241135982e-05, + "loss": 0.0165, + "step": 12260 + }, + { + "grad_norm": 0.3238275647163391, + "learning_rate": 9.361604508251534e-05, + "loss": 0.0193, + "step": 12270 + }, + { + "grad_norm": 0.4189596474170685, + "learning_rate": 9.360256450426799e-05, + "loss": 0.0182, + "step": 12280 + }, + { + "grad_norm": 0.3561323881149292, + "learning_rate": 9.358907068071279e-05, + "loss": 0.0149, + "step": 12290 + }, + { + "grad_norm": 0.32975974678993225, + "learning_rate": 9.357556361594882e-05, + "loss": 0.0144, + "step": 12300 + }, + { + "grad_norm": 0.4341329038143158, + "learning_rate": 9.356204331407917e-05, + "loss": 0.0164, + "step": 12310 + }, + { + "grad_norm": 0.35126253962516785, + "learning_rate": 9.354850977921094e-05, + "loss": 0.0143, + "step": 12320 + }, + { + "grad_norm": 0.3386474549770355, + "learning_rate": 9.353496301545529e-05, + "loss": 0.0147, + "step": 12330 + }, + { + "grad_norm": 0.43438291549682617, + "learning_rate": 9.352140302692733e-05, + "loss": 0.0147, + "step": 12340 + }, + { + "grad_norm": 0.3747648298740387, + "learning_rate": 9.350782981774627e-05, + "loss": 0.0161, + "step": 12350 + }, + { + "grad_norm": 0.3662484586238861, + "learning_rate": 9.349424339203526e-05, + "loss": 0.0146, + "step": 12360 + }, + { + "grad_norm": 0.32128167152404785, + "learning_rate": 9.34806437539215e-05, + "loss": 0.012, + "step": 12370 + }, + { + "grad_norm": 0.42437613010406494, + "learning_rate": 9.346703090753622e-05, + "loss": 0.0194, + "step": 12380 + }, + { + "grad_norm": 0.3569956123828888, + "learning_rate": 9.345340485701461e-05, + "loss": 0.0154, + "step": 12390 + }, + { + "grad_norm": 0.5556051135063171, + "learning_rate": 9.343976560649595e-05, + "loss": 0.0192, + "step": 12400 + }, + { + "grad_norm": 0.4483718276023865, + "learning_rate": 9.342611316012344e-05, + "loss": 0.0188, + "step": 12410 + }, + { + "grad_norm": 0.317126989364624, + "learning_rate": 9.341244752204437e-05, + "loss": 0.0189, + "step": 12420 + }, + { + "grad_norm": 0.306243896484375, + "learning_rate": 9.339876869640995e-05, + "loss": 0.0174, + "step": 12430 + }, + { + "grad_norm": 0.35216355323791504, + "learning_rate": 9.33850766873755e-05, + "loss": 0.0145, + "step": 12440 + }, + { + "grad_norm": 0.35221606492996216, + "learning_rate": 9.337137149910028e-05, + "loss": 0.0129, + "step": 12450 + }, + { + "grad_norm": 0.4230436682701111, + "learning_rate": 9.335765313574753e-05, + "loss": 0.0167, + "step": 12460 + }, + { + "grad_norm": 0.4327002763748169, + "learning_rate": 9.334392160148457e-05, + "loss": 0.0146, + "step": 12470 + }, + { + "grad_norm": 0.4073230028152466, + "learning_rate": 9.333017690048264e-05, + "loss": 0.0186, + "step": 12480 + }, + { + "grad_norm": 0.44986507296562195, + "learning_rate": 9.331641903691706e-05, + "loss": 0.0171, + "step": 12490 + }, + { + "grad_norm": 0.41319334506988525, + "learning_rate": 9.330264801496707e-05, + "loss": 0.0174, + "step": 12500 + }, + { + "grad_norm": 0.3380400836467743, + "learning_rate": 9.328886383881594e-05, + "loss": 0.0149, + "step": 12510 + }, + { + "grad_norm": 0.461952269077301, + "learning_rate": 9.327506651265095e-05, + "loss": 0.0148, + "step": 12520 + }, + { + "grad_norm": 0.285542368888855, + "learning_rate": 9.326125604066338e-05, + "loss": 0.0152, + "step": 12530 + }, + { + "grad_norm": 0.3334369659423828, + "learning_rate": 9.324743242704847e-05, + "loss": 0.014, + "step": 12540 + }, + { + "grad_norm": 0.3627273738384247, + "learning_rate": 9.323359567600546e-05, + "loss": 0.0165, + "step": 12550 + }, + { + "grad_norm": 0.4116191864013672, + "learning_rate": 9.321974579173761e-05, + "loss": 0.0129, + "step": 12560 + }, + { + "grad_norm": 0.3877717852592468, + "learning_rate": 9.320588277845213e-05, + "loss": 0.0172, + "step": 12570 + }, + { + "grad_norm": 0.3369559943675995, + "learning_rate": 9.319200664036026e-05, + "loss": 0.0153, + "step": 12580 + }, + { + "grad_norm": 0.446371853351593, + "learning_rate": 9.31781173816772e-05, + "loss": 0.0142, + "step": 12590 + }, + { + "grad_norm": 0.381043016910553, + "learning_rate": 9.316421500662212e-05, + "loss": 0.0156, + "step": 12600 + }, + { + "grad_norm": 0.2549687623977661, + "learning_rate": 9.31502995194182e-05, + "loss": 0.0145, + "step": 12610 + }, + { + "grad_norm": 0.29747188091278076, + "learning_rate": 9.31363709242926e-05, + "loss": 0.0135, + "step": 12620 + }, + { + "grad_norm": 0.38646069169044495, + "learning_rate": 9.312242922547647e-05, + "loss": 0.0167, + "step": 12630 + }, + { + "grad_norm": 0.33346793055534363, + "learning_rate": 9.310847442720492e-05, + "loss": 0.015, + "step": 12640 + }, + { + "grad_norm": 0.32923072576522827, + "learning_rate": 9.309450653371706e-05, + "loss": 0.0164, + "step": 12650 + }, + { + "grad_norm": 0.3573673665523529, + "learning_rate": 9.308052554925595e-05, + "loss": 0.018, + "step": 12660 + }, + { + "grad_norm": 0.4661363661289215, + "learning_rate": 9.306653147806867e-05, + "loss": 0.0181, + "step": 12670 + }, + { + "grad_norm": 0.41974154114723206, + "learning_rate": 9.305252432440622e-05, + "loss": 0.0146, + "step": 12680 + }, + { + "grad_norm": 0.41825205087661743, + "learning_rate": 9.303850409252361e-05, + "loss": 0.0213, + "step": 12690 + }, + { + "grad_norm": 0.38984861969947815, + "learning_rate": 9.302447078667985e-05, + "loss": 0.0222, + "step": 12700 + }, + { + "grad_norm": 0.33325374126434326, + "learning_rate": 9.301042441113783e-05, + "loss": 0.0139, + "step": 12710 + }, + { + "grad_norm": 0.40057238936424255, + "learning_rate": 9.299636497016451e-05, + "loss": 0.0165, + "step": 12720 + }, + { + "grad_norm": 0.3229687213897705, + "learning_rate": 9.298229246803076e-05, + "loss": 0.0155, + "step": 12730 + }, + { + "grad_norm": 0.37557703256607056, + "learning_rate": 9.296820690901144e-05, + "loss": 0.0171, + "step": 12740 + }, + { + "grad_norm": 0.36332058906555176, + "learning_rate": 9.295410829738539e-05, + "loss": 0.0153, + "step": 12750 + }, + { + "grad_norm": 0.3513367474079132, + "learning_rate": 9.293999663743535e-05, + "loss": 0.0164, + "step": 12760 + }, + { + "grad_norm": 0.29821816086769104, + "learning_rate": 9.292587193344813e-05, + "loss": 0.0155, + "step": 12770 + }, + { + "grad_norm": 0.4072168469429016, + "learning_rate": 9.291173418971437e-05, + "loss": 0.021, + "step": 12780 + }, + { + "grad_norm": 0.34375905990600586, + "learning_rate": 9.28975834105288e-05, + "loss": 0.0194, + "step": 12790 + }, + { + "grad_norm": 0.35275042057037354, + "learning_rate": 9.288341960019004e-05, + "loss": 0.0184, + "step": 12800 + }, + { + "grad_norm": 0.4296000897884369, + "learning_rate": 9.286924276300067e-05, + "loss": 0.0201, + "step": 12810 + }, + { + "grad_norm": 0.32279685139656067, + "learning_rate": 9.285505290326726e-05, + "loss": 0.017, + "step": 12820 + }, + { + "grad_norm": 0.3500145971775055, + "learning_rate": 9.284085002530027e-05, + "loss": 0.0162, + "step": 12830 + }, + { + "grad_norm": 0.34305018186569214, + "learning_rate": 9.282663413341422e-05, + "loss": 0.0173, + "step": 12840 + }, + { + "grad_norm": 0.3377833664417267, + "learning_rate": 9.281240523192747e-05, + "loss": 0.0154, + "step": 12850 + }, + { + "grad_norm": 0.3174279034137726, + "learning_rate": 9.279816332516242e-05, + "loss": 0.0143, + "step": 12860 + }, + { + "grad_norm": 0.3884032964706421, + "learning_rate": 9.278390841744536e-05, + "loss": 0.0153, + "step": 12870 + }, + { + "grad_norm": 0.39290687441825867, + "learning_rate": 9.276964051310658e-05, + "loss": 0.0166, + "step": 12880 + }, + { + "grad_norm": 0.33610856533050537, + "learning_rate": 9.275535961648027e-05, + "loss": 0.0162, + "step": 12890 + }, + { + "grad_norm": 0.39055031538009644, + "learning_rate": 9.274106573190459e-05, + "loss": 0.0158, + "step": 12900 + }, + { + "grad_norm": 0.40723916888237, + "learning_rate": 9.272675886372168e-05, + "loss": 0.0152, + "step": 12910 + }, + { + "grad_norm": 0.4116220772266388, + "learning_rate": 9.271243901627754e-05, + "loss": 0.0175, + "step": 12920 + }, + { + "grad_norm": 0.3669080138206482, + "learning_rate": 9.269810619392219e-05, + "loss": 0.0144, + "step": 12930 + }, + { + "grad_norm": 0.29769766330718994, + "learning_rate": 9.268376040100955e-05, + "loss": 0.0133, + "step": 12940 + }, + { + "grad_norm": 0.6282018423080444, + "learning_rate": 9.266940164189752e-05, + "loss": 0.02, + "step": 12950 + }, + { + "grad_norm": 0.39701253175735474, + "learning_rate": 9.265502992094787e-05, + "loss": 0.0208, + "step": 12960 + }, + { + "grad_norm": 0.3430003821849823, + "learning_rate": 9.264064524252638e-05, + "loss": 0.019, + "step": 12970 + }, + { + "grad_norm": 0.31682834029197693, + "learning_rate": 9.262624761100271e-05, + "loss": 0.0188, + "step": 12980 + }, + { + "grad_norm": 0.5807638168334961, + "learning_rate": 9.261183703075051e-05, + "loss": 0.0205, + "step": 12990 + }, + { + "grad_norm": 0.5272794961929321, + "learning_rate": 9.259741350614733e-05, + "loss": 0.0189, + "step": 13000 + }, + { + "grad_norm": 0.37830850481987, + "learning_rate": 9.258297704157464e-05, + "loss": 0.0221, + "step": 13010 + }, + { + "grad_norm": 0.37266066670417786, + "learning_rate": 9.256852764141786e-05, + "loss": 0.0182, + "step": 13020 + }, + { + "grad_norm": 0.3502638339996338, + "learning_rate": 9.255406531006634e-05, + "loss": 0.0176, + "step": 13030 + }, + { + "grad_norm": 0.3538574278354645, + "learning_rate": 9.253959005191335e-05, + "loss": 0.0197, + "step": 13040 + }, + { + "grad_norm": 0.5364757776260376, + "learning_rate": 9.25251018713561e-05, + "loss": 0.0167, + "step": 13050 + }, + { + "grad_norm": 0.3922223448753357, + "learning_rate": 9.251060077279571e-05, + "loss": 0.0186, + "step": 13060 + }, + { + "grad_norm": 0.3120754659175873, + "learning_rate": 9.249608676063724e-05, + "loss": 0.0133, + "step": 13070 + }, + { + "grad_norm": 0.3706241846084595, + "learning_rate": 9.248155983928964e-05, + "loss": 0.014, + "step": 13080 + }, + { + "grad_norm": 0.36041760444641113, + "learning_rate": 9.246702001316583e-05, + "loss": 0.0176, + "step": 13090 + }, + { + "grad_norm": 0.4849241077899933, + "learning_rate": 9.245246728668262e-05, + "loss": 0.0174, + "step": 13100 + }, + { + "grad_norm": 0.3944106996059418, + "learning_rate": 9.243790166426073e-05, + "loss": 0.0154, + "step": 13110 + }, + { + "grad_norm": 0.4160780608654022, + "learning_rate": 9.242332315032484e-05, + "loss": 0.0174, + "step": 13120 + }, + { + "grad_norm": 0.45664340257644653, + "learning_rate": 9.240873174930349e-05, + "loss": 0.0185, + "step": 13130 + }, + { + "grad_norm": 0.35323366522789, + "learning_rate": 9.239412746562917e-05, + "loss": 0.0168, + "step": 13140 + }, + { + "grad_norm": 0.3198740780353546, + "learning_rate": 9.237951030373828e-05, + "loss": 0.0146, + "step": 13150 + }, + { + "grad_norm": 0.36298424005508423, + "learning_rate": 9.236488026807113e-05, + "loss": 0.0116, + "step": 13160 + }, + { + "grad_norm": 0.3357863128185272, + "learning_rate": 9.235023736307193e-05, + "loss": 0.0156, + "step": 13170 + }, + { + "grad_norm": 0.27079761028289795, + "learning_rate": 9.233558159318881e-05, + "loss": 0.0136, + "step": 13180 + }, + { + "grad_norm": 0.31432172656059265, + "learning_rate": 9.232091296287382e-05, + "loss": 0.0123, + "step": 13190 + }, + { + "grad_norm": 0.3665439784526825, + "learning_rate": 9.230623147658288e-05, + "loss": 0.0205, + "step": 13200 + }, + { + "grad_norm": 0.36489400267601013, + "learning_rate": 9.229153713877586e-05, + "loss": 0.0148, + "step": 13210 + }, + { + "grad_norm": 0.4255466163158417, + "learning_rate": 9.227682995391649e-05, + "loss": 0.0151, + "step": 13220 + }, + { + "grad_norm": 0.4850270748138428, + "learning_rate": 9.226210992647243e-05, + "loss": 0.0201, + "step": 13230 + }, + { + "grad_norm": 0.4035317599773407, + "learning_rate": 9.224737706091525e-05, + "loss": 0.0224, + "step": 13240 + }, + { + "grad_norm": 0.31821107864379883, + "learning_rate": 9.223263136172039e-05, + "loss": 0.0204, + "step": 13250 + }, + { + "grad_norm": 0.31971925497055054, + "learning_rate": 9.22178728333672e-05, + "loss": 0.0213, + "step": 13260 + }, + { + "grad_norm": 0.4425103962421417, + "learning_rate": 9.220310148033897e-05, + "loss": 0.0194, + "step": 13270 + }, + { + "grad_norm": 0.3672041594982147, + "learning_rate": 9.21883173071228e-05, + "loss": 0.0168, + "step": 13280 + }, + { + "grad_norm": 0.27310770750045776, + "learning_rate": 9.217352031820976e-05, + "loss": 0.0193, + "step": 13290 + }, + { + "grad_norm": 0.3518291711807251, + "learning_rate": 9.215871051809477e-05, + "loss": 0.0192, + "step": 13300 + }, + { + "grad_norm": 0.27240341901779175, + "learning_rate": 9.214388791127666e-05, + "loss": 0.0161, + "step": 13310 + }, + { + "grad_norm": 0.35625964403152466, + "learning_rate": 9.212905250225814e-05, + "loss": 0.0158, + "step": 13320 + }, + { + "grad_norm": 0.3670119047164917, + "learning_rate": 9.211420429554583e-05, + "loss": 0.0175, + "step": 13330 + }, + { + "grad_norm": 0.2852018475532532, + "learning_rate": 9.209934329565022e-05, + "loss": 0.0147, + "step": 13340 + }, + { + "grad_norm": 0.3819805383682251, + "learning_rate": 9.208446950708568e-05, + "loss": 0.0136, + "step": 13350 + }, + { + "grad_norm": 0.37003952264785767, + "learning_rate": 9.20695829343705e-05, + "loss": 0.0157, + "step": 13360 + }, + { + "grad_norm": 0.30146515369415283, + "learning_rate": 9.205468358202678e-05, + "loss": 0.0142, + "step": 13370 + }, + { + "grad_norm": 0.312034547328949, + "learning_rate": 9.203977145458059e-05, + "loss": 0.0146, + "step": 13380 + }, + { + "grad_norm": 0.3290935456752777, + "learning_rate": 9.202484655656182e-05, + "loss": 0.0174, + "step": 13390 + }, + { + "grad_norm": 0.3005056381225586, + "learning_rate": 9.200990889250427e-05, + "loss": 0.0126, + "step": 13400 + }, + { + "grad_norm": 0.613081157207489, + "learning_rate": 9.19949584669456e-05, + "loss": 0.0163, + "step": 13410 + }, + { + "grad_norm": 0.31315794587135315, + "learning_rate": 9.197999528442738e-05, + "loss": 0.0168, + "step": 13420 + }, + { + "grad_norm": 0.5277458429336548, + "learning_rate": 9.196501934949499e-05, + "loss": 0.0182, + "step": 13430 + }, + { + "grad_norm": 0.4309273958206177, + "learning_rate": 9.195003066669776e-05, + "loss": 0.0209, + "step": 13440 + }, + { + "grad_norm": 0.4055766761302948, + "learning_rate": 9.193502924058884e-05, + "loss": 0.0164, + "step": 13450 + }, + { + "grad_norm": 0.3456658720970154, + "learning_rate": 9.192001507572526e-05, + "loss": 0.018, + "step": 13460 + }, + { + "grad_norm": 0.40551888942718506, + "learning_rate": 9.190498817666793e-05, + "loss": 0.0181, + "step": 13470 + }, + { + "grad_norm": 0.3050583600997925, + "learning_rate": 9.188994854798163e-05, + "loss": 0.0143, + "step": 13480 + }, + { + "grad_norm": 0.3459891378879547, + "learning_rate": 9.187489619423499e-05, + "loss": 0.0225, + "step": 13490 + }, + { + "grad_norm": 0.4638763666152954, + "learning_rate": 9.185983112000056e-05, + "loss": 0.02, + "step": 13500 + }, + { + "grad_norm": 0.3626478612422943, + "learning_rate": 9.184475332985464e-05, + "loss": 0.0207, + "step": 13510 + }, + { + "grad_norm": 0.34336400032043457, + "learning_rate": 9.182966282837754e-05, + "loss": 0.0127, + "step": 13520 + }, + { + "grad_norm": 0.4573110044002533, + "learning_rate": 9.18145596201533e-05, + "loss": 0.0151, + "step": 13530 + }, + { + "grad_norm": 0.3770318925380707, + "learning_rate": 9.179944370976991e-05, + "loss": 0.0199, + "step": 13540 + }, + { + "grad_norm": 0.39217254519462585, + "learning_rate": 9.178431510181918e-05, + "loss": 0.0172, + "step": 13550 + }, + { + "grad_norm": 0.39840951561927795, + "learning_rate": 9.176917380089675e-05, + "loss": 0.0181, + "step": 13560 + }, + { + "grad_norm": 0.3409793972969055, + "learning_rate": 9.175401981160219e-05, + "loss": 0.0131, + "step": 13570 + }, + { + "grad_norm": 0.4742024838924408, + "learning_rate": 9.173885313853885e-05, + "loss": 0.0187, + "step": 13580 + }, + { + "grad_norm": 0.2477242350578308, + "learning_rate": 9.172367378631398e-05, + "loss": 0.0137, + "step": 13590 + }, + { + "grad_norm": 0.3063543736934662, + "learning_rate": 9.170848175953866e-05, + "loss": 0.0167, + "step": 13600 + }, + { + "grad_norm": 0.32829663157463074, + "learning_rate": 9.169327706282784e-05, + "loss": 0.016, + "step": 13610 + }, + { + "grad_norm": 0.44849711656570435, + "learning_rate": 9.167805970080029e-05, + "loss": 0.0135, + "step": 13620 + }, + { + "grad_norm": 0.42181119322776794, + "learning_rate": 9.166282967807864e-05, + "loss": 0.0164, + "step": 13630 + }, + { + "grad_norm": 0.4155069589614868, + "learning_rate": 9.16475869992894e-05, + "loss": 0.0185, + "step": 13640 + }, + { + "grad_norm": 0.3853234052658081, + "learning_rate": 9.163233166906284e-05, + "loss": 0.0147, + "step": 13650 + }, + { + "grad_norm": 0.30451634526252747, + "learning_rate": 9.161706369203317e-05, + "loss": 0.0164, + "step": 13660 + }, + { + "grad_norm": 0.31337645649909973, + "learning_rate": 9.16017830728384e-05, + "loss": 0.0121, + "step": 13670 + }, + { + "grad_norm": 0.3939133882522583, + "learning_rate": 9.158648981612035e-05, + "loss": 0.0177, + "step": 13680 + }, + { + "grad_norm": 0.4016934335231781, + "learning_rate": 9.157118392652472e-05, + "loss": 0.0149, + "step": 13690 + }, + { + "grad_norm": 0.3945441246032715, + "learning_rate": 9.155586540870104e-05, + "loss": 0.0171, + "step": 13700 + }, + { + "grad_norm": 0.34516316652297974, + "learning_rate": 9.154053426730267e-05, + "loss": 0.0188, + "step": 13710 + }, + { + "grad_norm": 0.33230695128440857, + "learning_rate": 9.15251905069868e-05, + "loss": 0.0134, + "step": 13720 + }, + { + "grad_norm": 0.391711562871933, + "learning_rate": 9.150983413241446e-05, + "loss": 0.0162, + "step": 13730 + }, + { + "grad_norm": 0.34310588240623474, + "learning_rate": 9.149446514825051e-05, + "loss": 0.014, + "step": 13740 + }, + { + "grad_norm": 0.4484381079673767, + "learning_rate": 9.147908355916365e-05, + "loss": 0.0174, + "step": 13750 + }, + { + "grad_norm": 0.3011036217212677, + "learning_rate": 9.146368936982642e-05, + "loss": 0.0139, + "step": 13760 + }, + { + "grad_norm": 0.3673052489757538, + "learning_rate": 9.144828258491511e-05, + "loss": 0.0132, + "step": 13770 + }, + { + "grad_norm": 0.44669434428215027, + "learning_rate": 9.143286320910996e-05, + "loss": 0.0139, + "step": 13780 + }, + { + "grad_norm": 0.32562264800071716, + "learning_rate": 9.141743124709491e-05, + "loss": 0.0165, + "step": 13790 + }, + { + "grad_norm": 0.3485332131385803, + "learning_rate": 9.140198670355784e-05, + "loss": 0.0169, + "step": 13800 + }, + { + "grad_norm": 0.4706399738788605, + "learning_rate": 9.138652958319034e-05, + "loss": 0.0176, + "step": 13810 + }, + { + "grad_norm": 0.3889845609664917, + "learning_rate": 9.137105989068791e-05, + "loss": 0.0155, + "step": 13820 + }, + { + "grad_norm": 0.34890374541282654, + "learning_rate": 9.135557763074983e-05, + "loss": 0.0167, + "step": 13830 + }, + { + "grad_norm": 0.29354432225227356, + "learning_rate": 9.13400828080792e-05, + "loss": 0.0139, + "step": 13840 + }, + { + "grad_norm": 0.3396795392036438, + "learning_rate": 9.132457542738292e-05, + "loss": 0.0131, + "step": 13850 + }, + { + "grad_norm": 0.35870692133903503, + "learning_rate": 9.130905549337174e-05, + "loss": 0.0167, + "step": 13860 + }, + { + "grad_norm": 0.27290400862693787, + "learning_rate": 9.129352301076021e-05, + "loss": 0.0124, + "step": 13870 + }, + { + "grad_norm": 0.32596784830093384, + "learning_rate": 9.127797798426668e-05, + "loss": 0.014, + "step": 13880 + }, + { + "grad_norm": 0.37084895372390747, + "learning_rate": 9.126242041861333e-05, + "loss": 0.0159, + "step": 13890 + }, + { + "grad_norm": 0.34786126017570496, + "learning_rate": 9.124685031852611e-05, + "loss": 0.0167, + "step": 13900 + }, + { + "grad_norm": 0.3394896984100342, + "learning_rate": 9.123126768873482e-05, + "loss": 0.0142, + "step": 13910 + }, + { + "grad_norm": 0.3149508237838745, + "learning_rate": 9.121567253397308e-05, + "loss": 0.0159, + "step": 13920 + }, + { + "grad_norm": 0.2656719386577606, + "learning_rate": 9.120006485897824e-05, + "loss": 0.0136, + "step": 13930 + }, + { + "grad_norm": 0.3305380344390869, + "learning_rate": 9.118444466849152e-05, + "loss": 0.0155, + "step": 13940 + }, + { + "grad_norm": 0.2971353828907013, + "learning_rate": 9.116881196725793e-05, + "loss": 0.0166, + "step": 13950 + }, + { + "grad_norm": 0.36611509323120117, + "learning_rate": 9.115316676002627e-05, + "loss": 0.015, + "step": 13960 + }, + { + "grad_norm": 0.26781439781188965, + "learning_rate": 9.113750905154911e-05, + "loss": 0.0126, + "step": 13970 + }, + { + "grad_norm": 0.40599337220191956, + "learning_rate": 9.112183884658289e-05, + "loss": 0.0149, + "step": 13980 + }, + { + "grad_norm": 0.3325079679489136, + "learning_rate": 9.11061561498878e-05, + "loss": 0.0128, + "step": 13990 + }, + { + "grad_norm": 0.36591359972953796, + "learning_rate": 9.109046096622779e-05, + "loss": 0.0129, + "step": 14000 + }, + { + "grad_norm": 0.3110436499118805, + "learning_rate": 9.107475330037069e-05, + "loss": 0.0144, + "step": 14010 + }, + { + "grad_norm": 0.42516830563545227, + "learning_rate": 9.105903315708806e-05, + "loss": 0.0169, + "step": 14020 + }, + { + "grad_norm": 0.3548910617828369, + "learning_rate": 9.104330054115524e-05, + "loss": 0.0154, + "step": 14030 + }, + { + "grad_norm": 0.2738393247127533, + "learning_rate": 9.102755545735141e-05, + "loss": 0.0141, + "step": 14040 + }, + { + "grad_norm": 0.31736063957214355, + "learning_rate": 9.10117979104595e-05, + "loss": 0.0117, + "step": 14050 + }, + { + "grad_norm": 0.2881460189819336, + "learning_rate": 9.099602790526624e-05, + "loss": 0.0132, + "step": 14060 + }, + { + "grad_norm": 0.3362765610218048, + "learning_rate": 9.098024544656212e-05, + "loss": 0.0144, + "step": 14070 + }, + { + "grad_norm": 0.24119043350219727, + "learning_rate": 9.096445053914148e-05, + "loss": 0.015, + "step": 14080 + }, + { + "grad_norm": 0.41081079840660095, + "learning_rate": 9.094864318780236e-05, + "loss": 0.0139, + "step": 14090 + }, + { + "grad_norm": 0.35552293062210083, + "learning_rate": 9.093282339734663e-05, + "loss": 0.0167, + "step": 14100 + }, + { + "grad_norm": 0.29461267590522766, + "learning_rate": 9.091699117257992e-05, + "loss": 0.0122, + "step": 14110 + }, + { + "grad_norm": 0.312937468290329, + "learning_rate": 9.090114651831163e-05, + "loss": 0.0134, + "step": 14120 + }, + { + "grad_norm": 0.39879798889160156, + "learning_rate": 9.088528943935497e-05, + "loss": 0.0158, + "step": 14130 + }, + { + "grad_norm": 0.36404165625572205, + "learning_rate": 9.086941994052689e-05, + "loss": 0.0171, + "step": 14140 + }, + { + "grad_norm": 0.3130766451358795, + "learning_rate": 9.085353802664813e-05, + "loss": 0.015, + "step": 14150 + }, + { + "grad_norm": 0.3249577581882477, + "learning_rate": 9.08376437025432e-05, + "loss": 0.0155, + "step": 14160 + }, + { + "grad_norm": 0.32672080397605896, + "learning_rate": 9.082173697304035e-05, + "loss": 0.0164, + "step": 14170 + }, + { + "grad_norm": 0.3939981460571289, + "learning_rate": 9.080581784297166e-05, + "loss": 0.014, + "step": 14180 + }, + { + "grad_norm": 0.3205048441886902, + "learning_rate": 9.078988631717291e-05, + "loss": 0.0136, + "step": 14190 + }, + { + "grad_norm": 0.4194487929344177, + "learning_rate": 9.077394240048369e-05, + "loss": 0.0152, + "step": 14200 + }, + { + "grad_norm": 0.3741471469402313, + "learning_rate": 9.075798609774736e-05, + "loss": 0.0152, + "step": 14210 + }, + { + "grad_norm": 0.3669159412384033, + "learning_rate": 9.0742017413811e-05, + "loss": 0.0154, + "step": 14220 + }, + { + "grad_norm": 0.35949963331222534, + "learning_rate": 9.072603635352548e-05, + "loss": 0.0167, + "step": 14230 + }, + { + "grad_norm": 0.3275270164012909, + "learning_rate": 9.071004292174541e-05, + "loss": 0.0129, + "step": 14240 + }, + { + "grad_norm": 0.35712912678718567, + "learning_rate": 9.06940371233292e-05, + "loss": 0.0159, + "step": 14250 + }, + { + "grad_norm": 0.29049134254455566, + "learning_rate": 9.067801896313898e-05, + "loss": 0.0161, + "step": 14260 + }, + { + "grad_norm": 0.3525155484676361, + "learning_rate": 9.066198844604064e-05, + "loss": 0.0133, + "step": 14270 + }, + { + "grad_norm": 0.330254465341568, + "learning_rate": 9.06459455769038e-05, + "loss": 0.0141, + "step": 14280 + }, + { + "grad_norm": 0.2980960011482239, + "learning_rate": 9.062989036060193e-05, + "loss": 0.0105, + "step": 14290 + }, + { + "grad_norm": 0.305086612701416, + "learning_rate": 9.061382280201212e-05, + "loss": 0.0135, + "step": 14300 + }, + { + "grad_norm": 0.3218506872653961, + "learning_rate": 9.059774290601528e-05, + "loss": 0.0131, + "step": 14310 + }, + { + "grad_norm": 0.36173155903816223, + "learning_rate": 9.058165067749606e-05, + "loss": 0.0179, + "step": 14320 + }, + { + "grad_norm": 0.3954292833805084, + "learning_rate": 9.056554612134288e-05, + "loss": 0.0166, + "step": 14330 + }, + { + "grad_norm": 0.3722546398639679, + "learning_rate": 9.054942924244785e-05, + "loss": 0.0152, + "step": 14340 + }, + { + "grad_norm": 0.3163178861141205, + "learning_rate": 9.053330004570686e-05, + "loss": 0.0138, + "step": 14350 + }, + { + "grad_norm": 0.3141244649887085, + "learning_rate": 9.051715853601955e-05, + "loss": 0.0176, + "step": 14360 + }, + { + "grad_norm": 0.3372696340084076, + "learning_rate": 9.050100471828926e-05, + "loss": 0.0154, + "step": 14370 + }, + { + "grad_norm": 0.32954317331314087, + "learning_rate": 9.048483859742311e-05, + "loss": 0.0164, + "step": 14380 + }, + { + "grad_norm": 0.3441936671733856, + "learning_rate": 9.046866017833193e-05, + "loss": 0.0151, + "step": 14390 + }, + { + "grad_norm": 0.3688376843929291, + "learning_rate": 9.045246946593029e-05, + "loss": 0.014, + "step": 14400 + }, + { + "grad_norm": 0.3306489586830139, + "learning_rate": 9.043626646513652e-05, + "loss": 0.0157, + "step": 14410 + }, + { + "grad_norm": 0.3034699857234955, + "learning_rate": 9.042005118087267e-05, + "loss": 0.0147, + "step": 14420 + }, + { + "grad_norm": 0.2809334993362427, + "learning_rate": 9.040382361806448e-05, + "loss": 0.0123, + "step": 14430 + }, + { + "grad_norm": 0.3783026337623596, + "learning_rate": 9.038758378164148e-05, + "loss": 0.0137, + "step": 14440 + }, + { + "grad_norm": 0.4293455481529236, + "learning_rate": 9.037133167653691e-05, + "loss": 0.0229, + "step": 14450 + }, + { + "grad_norm": 0.3802051842212677, + "learning_rate": 9.035506730768771e-05, + "loss": 0.0191, + "step": 14460 + }, + { + "grad_norm": 0.42028242349624634, + "learning_rate": 9.033879068003458e-05, + "loss": 0.0196, + "step": 14470 + }, + { + "grad_norm": 0.4385514259338379, + "learning_rate": 9.032250179852193e-05, + "loss": 0.0195, + "step": 14480 + }, + { + "grad_norm": 0.3730545938014984, + "learning_rate": 9.030620066809787e-05, + "loss": 0.0165, + "step": 14490 + }, + { + "grad_norm": 0.393687903881073, + "learning_rate": 9.028988729371428e-05, + "loss": 0.0125, + "step": 14500 + }, + { + "grad_norm": 0.3687753975391388, + "learning_rate": 9.027356168032673e-05, + "loss": 0.0147, + "step": 14510 + }, + { + "grad_norm": 0.3404821753501892, + "learning_rate": 9.02572238328945e-05, + "loss": 0.0126, + "step": 14520 + }, + { + "grad_norm": 0.363484650850296, + "learning_rate": 9.02408737563806e-05, + "loss": 0.016, + "step": 14530 + }, + { + "grad_norm": 0.5019651651382446, + "learning_rate": 9.022451145575174e-05, + "loss": 0.0161, + "step": 14540 + }, + { + "grad_norm": 0.3429701626300812, + "learning_rate": 9.02081369359784e-05, + "loss": 0.0168, + "step": 14550 + }, + { + "grad_norm": 0.38273975253105164, + "learning_rate": 9.019175020203465e-05, + "loss": 0.0147, + "step": 14560 + }, + { + "grad_norm": 0.3277057409286499, + "learning_rate": 9.017535125889842e-05, + "loss": 0.0159, + "step": 14570 + }, + { + "grad_norm": 0.36684003472328186, + "learning_rate": 9.015894011155124e-05, + "loss": 0.0156, + "step": 14580 + }, + { + "grad_norm": 0.35012102127075195, + "learning_rate": 9.014251676497838e-05, + "loss": 0.0139, + "step": 14590 + }, + { + "grad_norm": 0.24774228036403656, + "learning_rate": 9.012608122416884e-05, + "loss": 0.0149, + "step": 14600 + }, + { + "grad_norm": 0.31932333111763, + "learning_rate": 9.010963349411529e-05, + "loss": 0.0164, + "step": 14610 + }, + { + "grad_norm": 0.28318291902542114, + "learning_rate": 9.00931735798141e-05, + "loss": 0.0134, + "step": 14620 + }, + { + "grad_norm": 0.3513835370540619, + "learning_rate": 9.00767014862654e-05, + "loss": 0.0188, + "step": 14630 + }, + { + "grad_norm": 0.43248969316482544, + "learning_rate": 9.006021721847295e-05, + "loss": 0.016, + "step": 14640 + }, + { + "grad_norm": 0.3420730233192444, + "learning_rate": 9.004372078144423e-05, + "loss": 0.0146, + "step": 14650 + }, + { + "grad_norm": 0.38896235823631287, + "learning_rate": 9.002721218019043e-05, + "loss": 0.0143, + "step": 14660 + }, + { + "grad_norm": 0.384267657995224, + "learning_rate": 9.001069141972642e-05, + "loss": 0.0131, + "step": 14670 + }, + { + "grad_norm": 0.3028285503387451, + "learning_rate": 8.99941585050708e-05, + "loss": 0.0142, + "step": 14680 + }, + { + "grad_norm": 0.2744828462600708, + "learning_rate": 8.997761344124578e-05, + "loss": 0.0117, + "step": 14690 + }, + { + "grad_norm": 0.27059218287467957, + "learning_rate": 8.996105623327737e-05, + "loss": 0.0166, + "step": 14700 + }, + { + "grad_norm": 0.34469544887542725, + "learning_rate": 8.994448688619517e-05, + "loss": 0.0242, + "step": 14710 + }, + { + "grad_norm": 0.4304150938987732, + "learning_rate": 8.992790540503253e-05, + "loss": 0.018, + "step": 14720 + }, + { + "grad_norm": 0.36087173223495483, + "learning_rate": 8.991131179482648e-05, + "loss": 0.0165, + "step": 14730 + }, + { + "grad_norm": 0.3183897137641907, + "learning_rate": 8.989470606061768e-05, + "loss": 0.0196, + "step": 14740 + }, + { + "grad_norm": 0.4256609082221985, + "learning_rate": 8.987808820745056e-05, + "loss": 0.0143, + "step": 14750 + }, + { + "grad_norm": 0.2877207398414612, + "learning_rate": 8.986145824037315e-05, + "loss": 0.0151, + "step": 14760 + }, + { + "grad_norm": 0.3889293968677521, + "learning_rate": 8.984481616443721e-05, + "loss": 0.0185, + "step": 14770 + }, + { + "grad_norm": 0.28330937027931213, + "learning_rate": 8.982816198469815e-05, + "loss": 0.0144, + "step": 14780 + }, + { + "grad_norm": 0.2906559407711029, + "learning_rate": 8.98114957062151e-05, + "loss": 0.0177, + "step": 14790 + }, + { + "grad_norm": 0.3713333308696747, + "learning_rate": 8.97948173340508e-05, + "loss": 0.0125, + "step": 14800 + }, + { + "grad_norm": 0.36049315333366394, + "learning_rate": 8.977812687327172e-05, + "loss": 0.0145, + "step": 14810 + }, + { + "grad_norm": 0.3280653655529022, + "learning_rate": 8.976142432894798e-05, + "loss": 0.0137, + "step": 14820 + }, + { + "grad_norm": 0.3982951045036316, + "learning_rate": 8.974470970615336e-05, + "loss": 0.0139, + "step": 14830 + }, + { + "grad_norm": 0.3464769721031189, + "learning_rate": 8.972798300996534e-05, + "loss": 0.012, + "step": 14840 + }, + { + "grad_norm": 0.39642462134361267, + "learning_rate": 8.971124424546504e-05, + "loss": 0.0126, + "step": 14850 + }, + { + "grad_norm": 0.29961955547332764, + "learning_rate": 8.969449341773724e-05, + "loss": 0.0138, + "step": 14860 + }, + { + "grad_norm": 0.3007674813270569, + "learning_rate": 8.967773053187042e-05, + "loss": 0.0132, + "step": 14870 + }, + { + "grad_norm": 0.41528192162513733, + "learning_rate": 8.966095559295668e-05, + "loss": 0.0136, + "step": 14880 + }, + { + "grad_norm": 0.29742881655693054, + "learning_rate": 8.964416860609184e-05, + "loss": 0.0144, + "step": 14890 + }, + { + "grad_norm": 0.3091881573200226, + "learning_rate": 8.962736957637532e-05, + "loss": 0.0144, + "step": 14900 + }, + { + "grad_norm": 0.30855152010917664, + "learning_rate": 8.96105585089102e-05, + "loss": 0.0114, + "step": 14910 + }, + { + "grad_norm": 0.4418489336967468, + "learning_rate": 8.959373540880329e-05, + "loss": 0.0197, + "step": 14920 + }, + { + "grad_norm": 0.3194306492805481, + "learning_rate": 8.957690028116495e-05, + "loss": 0.0149, + "step": 14930 + }, + { + "grad_norm": 0.34865105152130127, + "learning_rate": 8.956005313110928e-05, + "loss": 0.0147, + "step": 14940 + }, + { + "grad_norm": 0.3382810652256012, + "learning_rate": 8.9543193963754e-05, + "loss": 0.0142, + "step": 14950 + }, + { + "grad_norm": 0.3505389094352722, + "learning_rate": 8.952632278422048e-05, + "loss": 0.0125, + "step": 14960 + }, + { + "grad_norm": 0.3530474603176117, + "learning_rate": 8.95094395976337e-05, + "loss": 0.0117, + "step": 14970 + }, + { + "grad_norm": 0.4002842903137207, + "learning_rate": 8.949254440912239e-05, + "loss": 0.0143, + "step": 14980 + }, + { + "grad_norm": 0.3761323094367981, + "learning_rate": 8.94756372238188e-05, + "loss": 0.0186, + "step": 14990 + }, + { + "grad_norm": 0.4517865478992462, + "learning_rate": 8.945871804685892e-05, + "loss": 0.0137, + "step": 15000 + }, + { + "grad_norm": 0.3776147663593292, + "learning_rate": 8.944178688338236e-05, + "loss": 0.0126, + "step": 15010 + }, + { + "grad_norm": 0.2930288016796112, + "learning_rate": 8.942484373853233e-05, + "loss": 0.0116, + "step": 15020 + }, + { + "grad_norm": 0.2978624105453491, + "learning_rate": 8.940788861745572e-05, + "loss": 0.015, + "step": 15030 + }, + { + "grad_norm": 0.3503165543079376, + "learning_rate": 8.939092152530308e-05, + "loss": 0.0171, + "step": 15040 + }, + { + "grad_norm": 0.25890377163887024, + "learning_rate": 8.937394246722853e-05, + "loss": 0.0122, + "step": 15050 + }, + { + "grad_norm": 0.29408955574035645, + "learning_rate": 8.935695144838984e-05, + "loss": 0.0137, + "step": 15060 + }, + { + "grad_norm": 0.3139441907405853, + "learning_rate": 8.933994847394849e-05, + "loss": 0.0136, + "step": 15070 + }, + { + "grad_norm": 0.25799211859703064, + "learning_rate": 8.932293354906949e-05, + "loss": 0.0111, + "step": 15080 + }, + { + "grad_norm": 0.2083788514137268, + "learning_rate": 8.930590667892153e-05, + "loss": 0.0121, + "step": 15090 + }, + { + "grad_norm": 0.3402978777885437, + "learning_rate": 8.928886786867696e-05, + "loss": 0.0127, + "step": 15100 + }, + { + "grad_norm": 0.29362717270851135, + "learning_rate": 8.927181712351168e-05, + "loss": 0.0186, + "step": 15110 + }, + { + "grad_norm": 0.3671301603317261, + "learning_rate": 8.925475444860527e-05, + "loss": 0.0168, + "step": 15120 + }, + { + "grad_norm": 0.36894676089286804, + "learning_rate": 8.923767984914092e-05, + "loss": 0.0149, + "step": 15130 + }, + { + "grad_norm": 0.28404465317726135, + "learning_rate": 8.922059333030545e-05, + "loss": 0.012, + "step": 15140 + }, + { + "grad_norm": 0.24064715206623077, + "learning_rate": 8.920349489728928e-05, + "loss": 0.0145, + "step": 15150 + }, + { + "grad_norm": 0.3710125982761383, + "learning_rate": 8.918638455528646e-05, + "loss": 0.0124, + "step": 15160 + }, + { + "grad_norm": 0.3326571583747864, + "learning_rate": 8.916926230949468e-05, + "loss": 0.0141, + "step": 15170 + }, + { + "grad_norm": 0.3539113700389862, + "learning_rate": 8.915212816511522e-05, + "loss": 0.0117, + "step": 15180 + }, + { + "grad_norm": 0.3796995282173157, + "learning_rate": 8.913498212735296e-05, + "loss": 0.0173, + "step": 15190 + }, + { + "grad_norm": 0.37052762508392334, + "learning_rate": 8.911782420141643e-05, + "loss": 0.0148, + "step": 15200 + }, + { + "grad_norm": 0.2812616229057312, + "learning_rate": 8.910065439251775e-05, + "loss": 0.0126, + "step": 15210 + }, + { + "grad_norm": 0.34295588731765747, + "learning_rate": 8.908347270587268e-05, + "loss": 0.0127, + "step": 15220 + }, + { + "grad_norm": 0.35617485642433167, + "learning_rate": 8.906627914670054e-05, + "loss": 0.0127, + "step": 15230 + }, + { + "grad_norm": 0.3306311368942261, + "learning_rate": 8.904907372022427e-05, + "loss": 0.0162, + "step": 15240 + }, + { + "grad_norm": 0.33233681321144104, + "learning_rate": 8.903185643167042e-05, + "loss": 0.0182, + "step": 15250 + }, + { + "grad_norm": 0.32985642552375793, + "learning_rate": 8.901462728626919e-05, + "loss": 0.0141, + "step": 15260 + }, + { + "grad_norm": 0.4287532567977905, + "learning_rate": 8.899738628925429e-05, + "loss": 0.0176, + "step": 15270 + }, + { + "grad_norm": 0.3267117142677307, + "learning_rate": 8.898013344586312e-05, + "loss": 0.0141, + "step": 15280 + }, + { + "grad_norm": 0.3373110890388489, + "learning_rate": 8.896286876133661e-05, + "loss": 0.0173, + "step": 15290 + }, + { + "grad_norm": 0.3415488004684448, + "learning_rate": 8.894559224091933e-05, + "loss": 0.013, + "step": 15300 + }, + { + "grad_norm": 0.4470892548561096, + "learning_rate": 8.892830388985942e-05, + "loss": 0.0149, + "step": 15310 + }, + { + "grad_norm": 0.42608416080474854, + "learning_rate": 8.891100371340864e-05, + "loss": 0.0179, + "step": 15320 + }, + { + "grad_norm": 0.3032132387161255, + "learning_rate": 8.889369171682231e-05, + "loss": 0.0168, + "step": 15330 + }, + { + "grad_norm": 0.2819351851940155, + "learning_rate": 8.887636790535936e-05, + "loss": 0.015, + "step": 15340 + }, + { + "grad_norm": 0.3128340244293213, + "learning_rate": 8.885903228428231e-05, + "loss": 0.0138, + "step": 15350 + }, + { + "grad_norm": 0.3368318974971771, + "learning_rate": 8.884168485885727e-05, + "loss": 0.0141, + "step": 15360 + }, + { + "grad_norm": 0.3355594277381897, + "learning_rate": 8.882432563435393e-05, + "loss": 0.013, + "step": 15370 + }, + { + "grad_norm": 0.2632829546928406, + "learning_rate": 8.880695461604556e-05, + "loss": 0.0132, + "step": 15380 + }, + { + "grad_norm": 0.2651746869087219, + "learning_rate": 8.878957180920901e-05, + "loss": 0.0185, + "step": 15390 + }, + { + "grad_norm": 0.39096003770828247, + "learning_rate": 8.877217721912473e-05, + "loss": 0.0155, + "step": 15400 + }, + { + "grad_norm": 0.33127567172050476, + "learning_rate": 8.875477085107673e-05, + "loss": 0.0131, + "step": 15410 + }, + { + "grad_norm": 0.2860681116580963, + "learning_rate": 8.87373527103526e-05, + "loss": 0.0176, + "step": 15420 + }, + { + "grad_norm": 0.3429126441478729, + "learning_rate": 8.871992280224353e-05, + "loss": 0.0113, + "step": 15430 + }, + { + "grad_norm": 0.4041215777397156, + "learning_rate": 8.870248113204422e-05, + "loss": 0.0127, + "step": 15440 + }, + { + "grad_norm": 0.3912096619606018, + "learning_rate": 8.868502770505306e-05, + "loss": 0.0152, + "step": 15450 + }, + { + "grad_norm": 0.3423306345939636, + "learning_rate": 8.86675625265719e-05, + "loss": 0.0147, + "step": 15460 + }, + { + "grad_norm": 0.4177573323249817, + "learning_rate": 8.865008560190618e-05, + "loss": 0.0153, + "step": 15470 + }, + { + "grad_norm": 0.31812793016433716, + "learning_rate": 8.863259693636496e-05, + "loss": 0.0126, + "step": 15480 + }, + { + "grad_norm": 0.26776182651519775, + "learning_rate": 8.861509653526083e-05, + "loss": 0.0159, + "step": 15490 + }, + { + "grad_norm": 0.31965771317481995, + "learning_rate": 8.859758440390993e-05, + "loss": 0.0117, + "step": 15500 + }, + { + "grad_norm": 0.3053452670574188, + "learning_rate": 8.858006054763202e-05, + "loss": 0.0118, + "step": 15510 + }, + { + "grad_norm": 0.3347854018211365, + "learning_rate": 8.856252497175035e-05, + "loss": 0.0126, + "step": 15520 + }, + { + "grad_norm": 0.2923136055469513, + "learning_rate": 8.854497768159178e-05, + "loss": 0.0204, + "step": 15530 + }, + { + "grad_norm": 0.32943835854530334, + "learning_rate": 8.852741868248671e-05, + "loss": 0.0164, + "step": 15540 + }, + { + "grad_norm": 0.28264474868774414, + "learning_rate": 8.85098479797691e-05, + "loss": 0.0145, + "step": 15550 + }, + { + "grad_norm": 0.3556481897830963, + "learning_rate": 8.849226557877646e-05, + "loss": 0.0126, + "step": 15560 + }, + { + "grad_norm": 0.2995128035545349, + "learning_rate": 8.84746714848499e-05, + "loss": 0.012, + "step": 15570 + }, + { + "grad_norm": 0.40936970710754395, + "learning_rate": 8.845706570333397e-05, + "loss": 0.0126, + "step": 15580 + }, + { + "grad_norm": 0.38154274225234985, + "learning_rate": 8.84394482395769e-05, + "loss": 0.0141, + "step": 15590 + }, + { + "grad_norm": 0.32766827940940857, + "learning_rate": 8.842181909893038e-05, + "loss": 0.013, + "step": 15600 + }, + { + "grad_norm": 0.3190121650695801, + "learning_rate": 8.840417828674969e-05, + "loss": 0.0125, + "step": 15610 + }, + { + "grad_norm": 0.3312472701072693, + "learning_rate": 8.838652580839364e-05, + "loss": 0.0127, + "step": 15620 + }, + { + "grad_norm": 0.35937488079071045, + "learning_rate": 8.836886166922458e-05, + "loss": 0.0111, + "step": 15630 + }, + { + "grad_norm": 0.38958367705345154, + "learning_rate": 8.835118587460844e-05, + "loss": 0.0116, + "step": 15640 + }, + { + "grad_norm": 0.3366395831108093, + "learning_rate": 8.83334984299146e-05, + "loss": 0.0166, + "step": 15650 + }, + { + "grad_norm": 0.2910676896572113, + "learning_rate": 8.83157993405161e-05, + "loss": 0.0144, + "step": 15660 + }, + { + "grad_norm": 0.2860357463359833, + "learning_rate": 8.829808861178943e-05, + "loss": 0.0128, + "step": 15670 + }, + { + "grad_norm": 0.405865877866745, + "learning_rate": 8.828036624911464e-05, + "loss": 0.011, + "step": 15680 + }, + { + "grad_norm": 0.3807377219200134, + "learning_rate": 8.826263225787532e-05, + "loss": 0.0148, + "step": 15690 + }, + { + "grad_norm": 0.3049972355365753, + "learning_rate": 8.824488664345858e-05, + "loss": 0.0125, + "step": 15700 + }, + { + "grad_norm": 0.24119381606578827, + "learning_rate": 8.822712941125508e-05, + "loss": 0.0147, + "step": 15710 + }, + { + "grad_norm": 0.3006763756275177, + "learning_rate": 8.820936056665898e-05, + "loss": 0.0147, + "step": 15720 + }, + { + "grad_norm": 0.3721540868282318, + "learning_rate": 8.819158011506801e-05, + "loss": 0.0132, + "step": 15730 + }, + { + "grad_norm": 0.329062283039093, + "learning_rate": 8.81737880618834e-05, + "loss": 0.0148, + "step": 15740 + }, + { + "grad_norm": 0.330485999584198, + "learning_rate": 8.815598441250987e-05, + "loss": 0.0125, + "step": 15750 + }, + { + "grad_norm": 0.32903993129730225, + "learning_rate": 8.813816917235576e-05, + "loss": 0.0143, + "step": 15760 + }, + { + "grad_norm": 0.4610109329223633, + "learning_rate": 8.812034234683282e-05, + "loss": 0.0144, + "step": 15770 + }, + { + "grad_norm": 0.2858344316482544, + "learning_rate": 8.810250394135637e-05, + "loss": 0.0154, + "step": 15780 + }, + { + "grad_norm": 0.3090382218360901, + "learning_rate": 8.808465396134529e-05, + "loss": 0.0138, + "step": 15790 + }, + { + "grad_norm": 0.4043864607810974, + "learning_rate": 8.806679241222189e-05, + "loss": 0.0149, + "step": 15800 + }, + { + "grad_norm": 0.41810423135757446, + "learning_rate": 8.804891929941203e-05, + "loss": 0.0131, + "step": 15810 + }, + { + "grad_norm": 0.30472487211227417, + "learning_rate": 8.803103462834514e-05, + "loss": 0.015, + "step": 15820 + }, + { + "grad_norm": 0.4549807608127594, + "learning_rate": 8.801313840445408e-05, + "loss": 0.0154, + "step": 15830 + }, + { + "grad_norm": 0.29225510358810425, + "learning_rate": 8.799523063317524e-05, + "loss": 0.0131, + "step": 15840 + }, + { + "grad_norm": 0.3015229105949402, + "learning_rate": 8.797731131994854e-05, + "loss": 0.0162, + "step": 15850 + }, + { + "grad_norm": 0.3149995505809784, + "learning_rate": 8.795938047021739e-05, + "loss": 0.0117, + "step": 15860 + }, + { + "grad_norm": 0.33604562282562256, + "learning_rate": 8.794143808942872e-05, + "loss": 0.0126, + "step": 15870 + }, + { + "grad_norm": 0.3729907274246216, + "learning_rate": 8.792348418303296e-05, + "loss": 0.0177, + "step": 15880 + }, + { + "grad_norm": 0.49357935786247253, + "learning_rate": 8.790551875648398e-05, + "loss": 0.0133, + "step": 15890 + }, + { + "grad_norm": 0.4085029065608978, + "learning_rate": 8.788754181523926e-05, + "loss": 0.0166, + "step": 15900 + }, + { + "grad_norm": 0.33292534947395325, + "learning_rate": 8.78695533647597e-05, + "loss": 0.019, + "step": 15910 + }, + { + "grad_norm": 0.37383317947387695, + "learning_rate": 8.785155341050972e-05, + "loss": 0.0159, + "step": 15920 + }, + { + "grad_norm": 0.43480196595191956, + "learning_rate": 8.783354195795721e-05, + "loss": 0.0227, + "step": 15930 + }, + { + "grad_norm": 0.3275454342365265, + "learning_rate": 8.78155190125736e-05, + "loss": 0.014, + "step": 15940 + }, + { + "grad_norm": 0.2989807724952698, + "learning_rate": 8.779748457983378e-05, + "loss": 0.0159, + "step": 15950 + }, + { + "grad_norm": 0.40545862913131714, + "learning_rate": 8.777943866521612e-05, + "loss": 0.0182, + "step": 15960 + }, + { + "grad_norm": 0.43560394644737244, + "learning_rate": 8.77613812742025e-05, + "loss": 0.0139, + "step": 15970 + }, + { + "grad_norm": 0.3351658582687378, + "learning_rate": 8.774331241227829e-05, + "loss": 0.0143, + "step": 15980 + }, + { + "grad_norm": 0.27033624053001404, + "learning_rate": 8.772523208493232e-05, + "loss": 0.0123, + "step": 15990 + }, + { + "grad_norm": 0.2905561029911041, + "learning_rate": 8.770714029765692e-05, + "loss": 0.0114, + "step": 16000 + }, + { + "grad_norm": 0.300875186920166, + "learning_rate": 8.768903705594789e-05, + "loss": 0.0114, + "step": 16010 + }, + { + "grad_norm": 0.5580713152885437, + "learning_rate": 8.767092236530453e-05, + "loss": 0.0101, + "step": 16020 + }, + { + "grad_norm": 0.2916533648967743, + "learning_rate": 8.76527962312296e-05, + "loss": 0.0117, + "step": 16030 + }, + { + "grad_norm": 0.2396416813135147, + "learning_rate": 8.763465865922934e-05, + "loss": 0.0136, + "step": 16040 + }, + { + "grad_norm": 0.36523109674453735, + "learning_rate": 8.761650965481347e-05, + "loss": 0.014, + "step": 16050 + }, + { + "grad_norm": 0.2911982536315918, + "learning_rate": 8.759834922349516e-05, + "loss": 0.0123, + "step": 16060 + }, + { + "grad_norm": 0.32615402340888977, + "learning_rate": 8.758017737079108e-05, + "loss": 0.0136, + "step": 16070 + }, + { + "grad_norm": 0.32375550270080566, + "learning_rate": 8.756199410222137e-05, + "loss": 0.0153, + "step": 16080 + }, + { + "grad_norm": 0.3893955945968628, + "learning_rate": 8.754379942330963e-05, + "loss": 0.0143, + "step": 16090 + }, + { + "grad_norm": 0.2964588403701782, + "learning_rate": 8.75255933395829e-05, + "loss": 0.015, + "step": 16100 + }, + { + "grad_norm": 0.23883485794067383, + "learning_rate": 8.750737585657171e-05, + "loss": 0.0139, + "step": 16110 + }, + { + "grad_norm": 0.28332650661468506, + "learning_rate": 8.748914697981008e-05, + "loss": 0.0133, + "step": 16120 + }, + { + "grad_norm": 0.3785288333892822, + "learning_rate": 8.747090671483542e-05, + "loss": 0.0111, + "step": 16130 + }, + { + "grad_norm": 0.31500139832496643, + "learning_rate": 8.745265506718869e-05, + "loss": 0.0151, + "step": 16140 + }, + { + "grad_norm": 0.3582603633403778, + "learning_rate": 8.74343920424142e-05, + "loss": 0.0115, + "step": 16150 + }, + { + "grad_norm": 0.3371714949607849, + "learning_rate": 8.741611764605982e-05, + "loss": 0.0184, + "step": 16160 + }, + { + "grad_norm": 0.4119216799736023, + "learning_rate": 8.739783188367682e-05, + "loss": 0.0127, + "step": 16170 + }, + { + "grad_norm": 0.2599102854728699, + "learning_rate": 8.737953476081991e-05, + "loss": 0.0166, + "step": 16180 + }, + { + "grad_norm": 0.44500142335891724, + "learning_rate": 8.73612262830473e-05, + "loss": 0.0134, + "step": 16190 + }, + { + "grad_norm": 0.4116831123828888, + "learning_rate": 8.734290645592061e-05, + "loss": 0.0188, + "step": 16200 + }, + { + "grad_norm": 0.27751389145851135, + "learning_rate": 8.732457528500493e-05, + "loss": 0.011, + "step": 16210 + }, + { + "grad_norm": 0.3647110164165497, + "learning_rate": 8.730623277586875e-05, + "loss": 0.0135, + "step": 16220 + }, + { + "grad_norm": 0.3718293309211731, + "learning_rate": 8.72878789340841e-05, + "loss": 0.0134, + "step": 16230 + }, + { + "grad_norm": 0.2759992778301239, + "learning_rate": 8.726951376522635e-05, + "loss": 0.013, + "step": 16240 + }, + { + "grad_norm": 0.3869374394416809, + "learning_rate": 8.725113727487435e-05, + "loss": 0.0119, + "step": 16250 + }, + { + "grad_norm": 0.2692963778972626, + "learning_rate": 8.723274946861042e-05, + "loss": 0.0133, + "step": 16260 + }, + { + "grad_norm": 0.4123724400997162, + "learning_rate": 8.721435035202026e-05, + "loss": 0.0132, + "step": 16270 + }, + { + "grad_norm": 0.31288018822669983, + "learning_rate": 8.719593993069306e-05, + "loss": 0.0121, + "step": 16280 + }, + { + "grad_norm": 0.24861818552017212, + "learning_rate": 8.717751821022139e-05, + "loss": 0.0173, + "step": 16290 + }, + { + "grad_norm": 0.3155044615268707, + "learning_rate": 8.715908519620134e-05, + "loss": 0.0123, + "step": 16300 + }, + { + "grad_norm": 0.4249679148197174, + "learning_rate": 8.71406408942323e-05, + "loss": 0.0144, + "step": 16310 + }, + { + "grad_norm": 0.30136585235595703, + "learning_rate": 8.712218530991723e-05, + "loss": 0.0127, + "step": 16320 + }, + { + "grad_norm": 0.3271821141242981, + "learning_rate": 8.710371844886241e-05, + "loss": 0.0152, + "step": 16330 + }, + { + "grad_norm": 0.27764326333999634, + "learning_rate": 8.708524031667758e-05, + "loss": 0.012, + "step": 16340 + }, + { + "grad_norm": 0.4021338224411011, + "learning_rate": 8.706675091897592e-05, + "loss": 0.0129, + "step": 16350 + }, + { + "grad_norm": 0.30744513869285583, + "learning_rate": 8.704825026137404e-05, + "loss": 0.0139, + "step": 16360 + }, + { + "grad_norm": 0.322570264339447, + "learning_rate": 8.702973834949192e-05, + "loss": 0.0134, + "step": 16370 + }, + { + "grad_norm": 0.32617056369781494, + "learning_rate": 8.701121518895301e-05, + "loss": 0.0149, + "step": 16380 + }, + { + "grad_norm": 0.27813100814819336, + "learning_rate": 8.699268078538414e-05, + "loss": 0.0124, + "step": 16390 + }, + { + "grad_norm": 0.3387972414493561, + "learning_rate": 8.69741351444156e-05, + "loss": 0.0133, + "step": 16400 + }, + { + "grad_norm": 0.3083968460559845, + "learning_rate": 8.695557827168101e-05, + "loss": 0.0132, + "step": 16410 + }, + { + "grad_norm": 0.3003661036491394, + "learning_rate": 8.693701017281753e-05, + "loss": 0.0116, + "step": 16420 + }, + { + "grad_norm": 0.2909878194332123, + "learning_rate": 8.691843085346563e-05, + "loss": 0.0112, + "step": 16430 + }, + { + "grad_norm": 0.38834142684936523, + "learning_rate": 8.689984031926919e-05, + "loss": 0.0131, + "step": 16440 + }, + { + "grad_norm": 0.325787752866745, + "learning_rate": 8.688123857587555e-05, + "loss": 0.0116, + "step": 16450 + }, + { + "grad_norm": 0.2978907823562622, + "learning_rate": 8.686262562893544e-05, + "loss": 0.0138, + "step": 16460 + }, + { + "grad_norm": 0.2757437527179718, + "learning_rate": 8.684400148410294e-05, + "loss": 0.0139, + "step": 16470 + }, + { + "grad_norm": 0.27132338285446167, + "learning_rate": 8.682536614703562e-05, + "loss": 0.0155, + "step": 16480 + }, + { + "grad_norm": 0.3353572189807892, + "learning_rate": 8.680671962339437e-05, + "loss": 0.0158, + "step": 16490 + }, + { + "grad_norm": 0.347239226102829, + "learning_rate": 8.678806191884352e-05, + "loss": 0.0136, + "step": 16500 + }, + { + "grad_norm": 0.3027389943599701, + "learning_rate": 8.67693930390508e-05, + "loss": 0.0126, + "step": 16510 + }, + { + "grad_norm": 0.4040740132331848, + "learning_rate": 8.67507129896873e-05, + "loss": 0.0125, + "step": 16520 + }, + { + "grad_norm": 0.34633320569992065, + "learning_rate": 8.673202177642757e-05, + "loss": 0.0127, + "step": 16530 + }, + { + "grad_norm": 0.28745752573013306, + "learning_rate": 8.671331940494945e-05, + "loss": 0.0128, + "step": 16540 + }, + { + "grad_norm": 0.28032582998275757, + "learning_rate": 8.669460588093427e-05, + "loss": 0.0144, + "step": 16550 + }, + { + "grad_norm": 0.41900312900543213, + "learning_rate": 8.667588121006667e-05, + "loss": 0.0136, + "step": 16560 + }, + { + "grad_norm": 0.30153435468673706, + "learning_rate": 8.665714539803475e-05, + "loss": 0.013, + "step": 16570 + }, + { + "grad_norm": 0.30236735939979553, + "learning_rate": 8.663839845052993e-05, + "loss": 0.0155, + "step": 16580 + }, + { + "grad_norm": 0.2843511402606964, + "learning_rate": 8.661964037324703e-05, + "loss": 0.0125, + "step": 16590 + }, + { + "grad_norm": 0.3118680417537689, + "learning_rate": 8.660087117188427e-05, + "loss": 0.0114, + "step": 16600 + }, + { + "grad_norm": 0.32381507754325867, + "learning_rate": 8.658209085214325e-05, + "loss": 0.0147, + "step": 16610 + }, + { + "grad_norm": 0.3829399645328522, + "learning_rate": 8.656329941972891e-05, + "loss": 0.0139, + "step": 16620 + }, + { + "grad_norm": 0.27941471338272095, + "learning_rate": 8.654449688034963e-05, + "loss": 0.0128, + "step": 16630 + }, + { + "grad_norm": 0.3418424129486084, + "learning_rate": 8.652568323971706e-05, + "loss": 0.0121, + "step": 16640 + }, + { + "grad_norm": 0.3883926570415497, + "learning_rate": 8.650685850354636e-05, + "loss": 0.0139, + "step": 16650 + }, + { + "grad_norm": 0.32417985796928406, + "learning_rate": 8.648802267755593e-05, + "loss": 0.0125, + "step": 16660 + }, + { + "grad_norm": 0.42773956060409546, + "learning_rate": 8.646917576746764e-05, + "loss": 0.0133, + "step": 16670 + }, + { + "grad_norm": 0.31513354182243347, + "learning_rate": 8.645031777900666e-05, + "loss": 0.0117, + "step": 16680 + }, + { + "grad_norm": 0.2895990014076233, + "learning_rate": 8.643144871790154e-05, + "loss": 0.0125, + "step": 16690 + }, + { + "grad_norm": 0.42464783787727356, + "learning_rate": 8.641256858988424e-05, + "loss": 0.018, + "step": 16700 + }, + { + "grad_norm": 0.33051595091819763, + "learning_rate": 8.639367740069e-05, + "loss": 0.0122, + "step": 16710 + }, + { + "grad_norm": 0.3542407155036926, + "learning_rate": 8.63747751560575e-05, + "loss": 0.0172, + "step": 16720 + }, + { + "grad_norm": 0.2745692729949951, + "learning_rate": 8.635586186172871e-05, + "loss": 0.0124, + "step": 16730 + }, + { + "grad_norm": 0.3380177319049835, + "learning_rate": 8.633693752344902e-05, + "loss": 0.0107, + "step": 16740 + }, + { + "grad_norm": 0.36971718072891235, + "learning_rate": 8.631800214696713e-05, + "loss": 0.011, + "step": 16750 + }, + { + "grad_norm": 0.45011842250823975, + "learning_rate": 8.629905573803511e-05, + "loss": 0.0135, + "step": 16760 + }, + { + "grad_norm": 0.29282692074775696, + "learning_rate": 8.628009830240839e-05, + "loss": 0.0152, + "step": 16770 + }, + { + "grad_norm": 0.32043442130088806, + "learning_rate": 8.626112984584571e-05, + "loss": 0.0119, + "step": 16780 + }, + { + "grad_norm": 0.3498082756996155, + "learning_rate": 8.62421503741092e-05, + "loss": 0.0138, + "step": 16790 + }, + { + "grad_norm": 0.26590287685394287, + "learning_rate": 8.622315989296432e-05, + "loss": 0.0142, + "step": 16800 + }, + { + "grad_norm": 0.4875786602497101, + "learning_rate": 8.62041584081799e-05, + "loss": 0.0177, + "step": 16810 + }, + { + "grad_norm": 0.33064335584640503, + "learning_rate": 8.618514592552807e-05, + "loss": 0.0149, + "step": 16820 + }, + { + "grad_norm": 0.3084089457988739, + "learning_rate": 8.616612245078431e-05, + "loss": 0.0122, + "step": 16830 + }, + { + "grad_norm": 0.3607918620109558, + "learning_rate": 8.614708798972746e-05, + "loss": 0.0127, + "step": 16840 + }, + { + "grad_norm": 0.24627645313739777, + "learning_rate": 8.61280425481397e-05, + "loss": 0.0121, + "step": 16850 + }, + { + "grad_norm": 0.39730602502822876, + "learning_rate": 8.61089861318065e-05, + "loss": 0.0113, + "step": 16860 + }, + { + "grad_norm": 0.3092910945415497, + "learning_rate": 8.608991874651673e-05, + "loss": 0.0105, + "step": 16870 + }, + { + "grad_norm": 0.419026255607605, + "learning_rate": 8.607084039806255e-05, + "loss": 0.0161, + "step": 16880 + }, + { + "grad_norm": 0.3118002116680145, + "learning_rate": 8.605175109223944e-05, + "loss": 0.0183, + "step": 16890 + }, + { + "grad_norm": 0.4125400185585022, + "learning_rate": 8.603265083484624e-05, + "loss": 0.0213, + "step": 16900 + }, + { + "grad_norm": 0.3133799433708191, + "learning_rate": 8.60135396316851e-05, + "loss": 0.0149, + "step": 16910 + }, + { + "grad_norm": 0.32613906264305115, + "learning_rate": 8.599441748856152e-05, + "loss": 0.0152, + "step": 16920 + }, + { + "grad_norm": 0.3974539339542389, + "learning_rate": 8.597528441128427e-05, + "loss": 0.0135, + "step": 16930 + }, + { + "grad_norm": 0.22940893471240997, + "learning_rate": 8.595614040566549e-05, + "loss": 0.0152, + "step": 16940 + }, + { + "grad_norm": 0.21529826521873474, + "learning_rate": 8.593698547752063e-05, + "loss": 0.0105, + "step": 16950 + }, + { + "grad_norm": 0.359500914812088, + "learning_rate": 8.591781963266843e-05, + "loss": 0.0146, + "step": 16960 + }, + { + "grad_norm": 0.36617258191108704, + "learning_rate": 8.5898642876931e-05, + "loss": 0.0159, + "step": 16970 + }, + { + "grad_norm": 0.22257599234580994, + "learning_rate": 8.587945521613369e-05, + "loss": 0.0126, + "step": 16980 + }, + { + "grad_norm": 0.3628152906894684, + "learning_rate": 8.586025665610524e-05, + "loss": 0.0156, + "step": 16990 + }, + { + "grad_norm": 0.32624298334121704, + "learning_rate": 8.584104720267765e-05, + "loss": 0.0173, + "step": 17000 + }, + { + "grad_norm": 0.2702179551124573, + "learning_rate": 8.582182686168625e-05, + "loss": 0.0131, + "step": 17010 + }, + { + "grad_norm": 0.30716437101364136, + "learning_rate": 8.580259563896967e-05, + "loss": 0.0136, + "step": 17020 + }, + { + "grad_norm": 0.34903740882873535, + "learning_rate": 8.578335354036983e-05, + "loss": 0.0137, + "step": 17030 + }, + { + "grad_norm": 0.38253846764564514, + "learning_rate": 8.576410057173201e-05, + "loss": 0.0128, + "step": 17040 + }, + { + "grad_norm": 0.2828770577907562, + "learning_rate": 8.574483673890474e-05, + "loss": 0.0123, + "step": 17050 + }, + { + "grad_norm": 0.37984320521354675, + "learning_rate": 8.572556204773983e-05, + "loss": 0.0121, + "step": 17060 + }, + { + "grad_norm": 0.38941362500190735, + "learning_rate": 8.570627650409246e-05, + "loss": 0.0169, + "step": 17070 + }, + { + "grad_norm": 0.3261432647705078, + "learning_rate": 8.568698011382107e-05, + "loss": 0.0144, + "step": 17080 + }, + { + "grad_norm": 0.22355127334594727, + "learning_rate": 8.566767288278738e-05, + "loss": 0.0102, + "step": 17090 + }, + { + "grad_norm": 0.20584484934806824, + "learning_rate": 8.56483548168564e-05, + "loss": 0.0136, + "step": 17100 + }, + { + "grad_norm": 0.2694486975669861, + "learning_rate": 8.562902592189648e-05, + "loss": 0.0141, + "step": 17110 + }, + { + "grad_norm": 0.2758514881134033, + "learning_rate": 8.560968620377921e-05, + "loss": 0.0135, + "step": 17120 + }, + { + "grad_norm": 0.3015854060649872, + "learning_rate": 8.559033566837951e-05, + "loss": 0.0171, + "step": 17130 + }, + { + "grad_norm": 0.2922641932964325, + "learning_rate": 8.557097432157551e-05, + "loss": 0.0176, + "step": 17140 + }, + { + "grad_norm": 0.27565670013427734, + "learning_rate": 8.555160216924872e-05, + "loss": 0.014, + "step": 17150 + }, + { + "grad_norm": 0.31398844718933105, + "learning_rate": 8.55322192172839e-05, + "loss": 0.0137, + "step": 17160 + }, + { + "grad_norm": 0.3369823694229126, + "learning_rate": 8.551282547156902e-05, + "loss": 0.013, + "step": 17170 + }, + { + "grad_norm": 0.34948980808258057, + "learning_rate": 8.549342093799544e-05, + "loss": 0.0134, + "step": 17180 + }, + { + "grad_norm": 0.3204096555709839, + "learning_rate": 8.547400562245773e-05, + "loss": 0.0133, + "step": 17190 + }, + { + "grad_norm": 0.3625577390193939, + "learning_rate": 8.545457953085374e-05, + "loss": 0.0129, + "step": 17200 + }, + { + "grad_norm": 0.2795736491680145, + "learning_rate": 8.543514266908463e-05, + "loss": 0.0158, + "step": 17210 + }, + { + "grad_norm": 0.3084838390350342, + "learning_rate": 8.541569504305478e-05, + "loss": 0.0123, + "step": 17220 + }, + { + "grad_norm": 0.2944566607475281, + "learning_rate": 8.539623665867187e-05, + "loss": 0.012, + "step": 17230 + }, + { + "grad_norm": 0.27562278509140015, + "learning_rate": 8.537676752184685e-05, + "loss": 0.0133, + "step": 17240 + }, + { + "grad_norm": 0.31810757517814636, + "learning_rate": 8.53572876384939e-05, + "loss": 0.0168, + "step": 17250 + }, + { + "grad_norm": 0.23705923557281494, + "learning_rate": 8.533779701453056e-05, + "loss": 0.0117, + "step": 17260 + }, + { + "grad_norm": 0.37942931056022644, + "learning_rate": 8.53182956558775e-05, + "loss": 0.0118, + "step": 17270 + }, + { + "grad_norm": 0.35399091243743896, + "learning_rate": 8.529878356845877e-05, + "loss": 0.015, + "step": 17280 + }, + { + "grad_norm": 0.4677645266056061, + "learning_rate": 8.527926075820158e-05, + "loss": 0.0137, + "step": 17290 + }, + { + "grad_norm": 0.3171139657497406, + "learning_rate": 8.525972723103648e-05, + "loss": 0.0157, + "step": 17300 + }, + { + "grad_norm": 0.28595083951950073, + "learning_rate": 8.524018299289722e-05, + "loss": 0.0149, + "step": 17310 + }, + { + "grad_norm": 0.2826521396636963, + "learning_rate": 8.522062804972083e-05, + "loss": 0.0108, + "step": 17320 + }, + { + "grad_norm": 0.38181421160697937, + "learning_rate": 8.520106240744759e-05, + "loss": 0.0143, + "step": 17330 + }, + { + "grad_norm": 0.2895301878452301, + "learning_rate": 8.518148607202102e-05, + "loss": 0.0142, + "step": 17340 + }, + { + "grad_norm": 0.3157401978969574, + "learning_rate": 8.51618990493879e-05, + "loss": 0.0141, + "step": 17350 + }, + { + "grad_norm": 0.31055977940559387, + "learning_rate": 8.514230134549823e-05, + "loss": 0.0143, + "step": 17360 + }, + { + "grad_norm": 0.24353331327438354, + "learning_rate": 8.51226929663053e-05, + "loss": 0.011, + "step": 17370 + }, + { + "grad_norm": 0.23701763153076172, + "learning_rate": 8.51030739177656e-05, + "loss": 0.0112, + "step": 17380 + }, + { + "grad_norm": 0.2186122089624405, + "learning_rate": 8.508344420583889e-05, + "loss": 0.01, + "step": 17390 + }, + { + "grad_norm": 0.3303378224372864, + "learning_rate": 8.506380383648816e-05, + "loss": 0.0118, + "step": 17400 + }, + { + "grad_norm": 0.28739821910858154, + "learning_rate": 8.504415281567963e-05, + "loss": 0.0125, + "step": 17410 + }, + { + "grad_norm": 0.410834401845932, + "learning_rate": 8.502449114938275e-05, + "loss": 0.0131, + "step": 17420 + }, + { + "grad_norm": 0.40596508979797363, + "learning_rate": 8.500481884357025e-05, + "loss": 0.0167, + "step": 17430 + }, + { + "grad_norm": 0.26776716113090515, + "learning_rate": 8.498513590421801e-05, + "loss": 0.01, + "step": 17440 + }, + { + "grad_norm": 0.29014843702316284, + "learning_rate": 8.496544233730522e-05, + "loss": 0.0122, + "step": 17450 + }, + { + "grad_norm": 0.24734145402908325, + "learning_rate": 8.494573814881426e-05, + "loss": 0.0128, + "step": 17460 + }, + { + "grad_norm": 0.32968658208847046, + "learning_rate": 8.492602334473074e-05, + "loss": 0.0122, + "step": 17470 + }, + { + "grad_norm": 0.36287587881088257, + "learning_rate": 8.49062979310435e-05, + "loss": 0.0128, + "step": 17480 + }, + { + "grad_norm": 0.3090816140174866, + "learning_rate": 8.488656191374458e-05, + "loss": 0.0116, + "step": 17490 + }, + { + "grad_norm": 0.24727264046669006, + "learning_rate": 8.48668152988293e-05, + "loss": 0.0107, + "step": 17500 + }, + { + "grad_norm": 0.264192134141922, + "learning_rate": 8.484705809229612e-05, + "loss": 0.0117, + "step": 17510 + }, + { + "grad_norm": 0.2668319046497345, + "learning_rate": 8.482729030014677e-05, + "loss": 0.0119, + "step": 17520 + }, + { + "grad_norm": 0.3539884388446808, + "learning_rate": 8.48075119283862e-05, + "loss": 0.0139, + "step": 17530 + }, + { + "grad_norm": 0.3166970908641815, + "learning_rate": 8.478772298302254e-05, + "loss": 0.0154, + "step": 17540 + }, + { + "grad_norm": 0.4734901487827301, + "learning_rate": 8.476792347006716e-05, + "loss": 0.0159, + "step": 17550 + }, + { + "grad_norm": 0.36261293292045593, + "learning_rate": 8.474811339553462e-05, + "loss": 0.0138, + "step": 17560 + }, + { + "grad_norm": 0.2863885164260864, + "learning_rate": 8.47282927654427e-05, + "loss": 0.0126, + "step": 17570 + }, + { + "grad_norm": 0.30727434158325195, + "learning_rate": 8.470846158581238e-05, + "loss": 0.0149, + "step": 17580 + }, + { + "grad_norm": 0.3462671637535095, + "learning_rate": 8.468861986266787e-05, + "loss": 0.0113, + "step": 17590 + }, + { + "grad_norm": 0.3012579679489136, + "learning_rate": 8.466876760203654e-05, + "loss": 0.0109, + "step": 17600 + }, + { + "grad_norm": 0.2637174725532532, + "learning_rate": 8.464890480994898e-05, + "loss": 0.0113, + "step": 17610 + }, + { + "grad_norm": 0.33702734112739563, + "learning_rate": 8.462903149243899e-05, + "loss": 0.0112, + "step": 17620 + }, + { + "grad_norm": 0.34073346853256226, + "learning_rate": 8.460914765554357e-05, + "loss": 0.0118, + "step": 17630 + }, + { + "grad_norm": 0.3656296133995056, + "learning_rate": 8.458925330530288e-05, + "loss": 0.0125, + "step": 17640 + }, + { + "grad_norm": 0.31052330136299133, + "learning_rate": 8.456934844776032e-05, + "loss": 0.0117, + "step": 17650 + }, + { + "grad_norm": 0.32945623993873596, + "learning_rate": 8.454943308896246e-05, + "loss": 0.012, + "step": 17660 + }, + { + "grad_norm": 0.37586280703544617, + "learning_rate": 8.452950723495905e-05, + "loss": 0.0167, + "step": 17670 + }, + { + "grad_norm": 0.33211398124694824, + "learning_rate": 8.450957089180303e-05, + "loss": 0.0106, + "step": 17680 + }, + { + "grad_norm": 0.38755369186401367, + "learning_rate": 8.448962406555055e-05, + "loss": 0.011, + "step": 17690 + }, + { + "grad_norm": 0.3846781551837921, + "learning_rate": 8.446966676226093e-05, + "loss": 0.0189, + "step": 17700 + }, + { + "grad_norm": 0.37892523407936096, + "learning_rate": 8.444969898799667e-05, + "loss": 0.0124, + "step": 17710 + }, + { + "grad_norm": 0.34158629179000854, + "learning_rate": 8.442972074882343e-05, + "loss": 0.0132, + "step": 17720 + }, + { + "grad_norm": 0.33211708068847656, + "learning_rate": 8.44097320508101e-05, + "loss": 0.0131, + "step": 17730 + }, + { + "grad_norm": 0.37847191095352173, + "learning_rate": 8.43897329000287e-05, + "loss": 0.0134, + "step": 17740 + }, + { + "grad_norm": 0.35738256573677063, + "learning_rate": 8.436972330255448e-05, + "loss": 0.0108, + "step": 17750 + }, + { + "grad_norm": 0.40498390793800354, + "learning_rate": 8.434970326446579e-05, + "loss": 0.0129, + "step": 17760 + }, + { + "grad_norm": 0.23706281185150146, + "learning_rate": 8.432967279184418e-05, + "loss": 0.0167, + "step": 17770 + }, + { + "grad_norm": 0.2722736895084381, + "learning_rate": 8.430963189077441e-05, + "loss": 0.0123, + "step": 17780 + }, + { + "grad_norm": 0.28679096698760986, + "learning_rate": 8.428958056734437e-05, + "loss": 0.0111, + "step": 17790 + }, + { + "grad_norm": 0.2708021104335785, + "learning_rate": 8.426951882764513e-05, + "loss": 0.0152, + "step": 17800 + }, + { + "grad_norm": 0.25948208570480347, + "learning_rate": 8.424944667777089e-05, + "loss": 0.013, + "step": 17810 + }, + { + "grad_norm": 0.32522982358932495, + "learning_rate": 8.422936412381905e-05, + "loss": 0.0115, + "step": 17820 + }, + { + "grad_norm": 0.353273868560791, + "learning_rate": 8.420927117189017e-05, + "loss": 0.0124, + "step": 17830 + }, + { + "grad_norm": 0.31373462080955505, + "learning_rate": 8.418916782808795e-05, + "loss": 0.015, + "step": 17840 + }, + { + "grad_norm": 0.35310378670692444, + "learning_rate": 8.416905409851926e-05, + "loss": 0.0215, + "step": 17850 + }, + { + "grad_norm": 0.33689039945602417, + "learning_rate": 8.41489299892941e-05, + "loss": 0.0116, + "step": 17860 + }, + { + "grad_norm": 0.2950003147125244, + "learning_rate": 8.412879550652566e-05, + "loss": 0.0106, + "step": 17870 + }, + { + "grad_norm": 0.33894798159599304, + "learning_rate": 8.410865065633029e-05, + "loss": 0.0136, + "step": 17880 + }, + { + "grad_norm": 0.22040528059005737, + "learning_rate": 8.408849544482742e-05, + "loss": 0.0104, + "step": 17890 + }, + { + "grad_norm": 0.2856816053390503, + "learning_rate": 8.406832987813968e-05, + "loss": 0.0133, + "step": 17900 + }, + { + "grad_norm": 0.29003340005874634, + "learning_rate": 8.404815396239286e-05, + "loss": 0.0106, + "step": 17910 + }, + { + "grad_norm": 0.36010029911994934, + "learning_rate": 8.402796770371587e-05, + "loss": 0.0159, + "step": 17920 + }, + { + "grad_norm": 0.2954373061656952, + "learning_rate": 8.400777110824071e-05, + "loss": 0.0145, + "step": 17930 + }, + { + "grad_norm": 0.29758456349372864, + "learning_rate": 8.398756418210263e-05, + "loss": 0.0168, + "step": 17940 + }, + { + "grad_norm": 0.32659077644348145, + "learning_rate": 8.396734693143993e-05, + "loss": 0.0144, + "step": 17950 + }, + { + "grad_norm": 0.31495726108551025, + "learning_rate": 8.39471193623941e-05, + "loss": 0.0159, + "step": 17960 + }, + { + "grad_norm": 0.40374577045440674, + "learning_rate": 8.392688148110974e-05, + "loss": 0.0192, + "step": 17970 + }, + { + "grad_norm": 0.3350340723991394, + "learning_rate": 8.390663329373456e-05, + "loss": 0.014, + "step": 17980 + }, + { + "grad_norm": 0.3928029537200928, + "learning_rate": 8.388637480641944e-05, + "loss": 0.0142, + "step": 17990 + }, + { + "grad_norm": 0.3154008090496063, + "learning_rate": 8.386610602531837e-05, + "loss": 0.0141, + "step": 18000 + }, + { + "grad_norm": 0.30427834391593933, + "learning_rate": 8.384582695658847e-05, + "loss": 0.0144, + "step": 18010 + }, + { + "grad_norm": 0.30862873792648315, + "learning_rate": 8.382553760638999e-05, + "loss": 0.0169, + "step": 18020 + }, + { + "grad_norm": 0.22508589923381805, + "learning_rate": 8.380523798088631e-05, + "loss": 0.0172, + "step": 18030 + }, + { + "grad_norm": 0.301836222410202, + "learning_rate": 8.378492808624389e-05, + "loss": 0.0145, + "step": 18040 + }, + { + "grad_norm": 0.2518996596336365, + "learning_rate": 8.376460792863237e-05, + "loss": 0.0154, + "step": 18050 + }, + { + "grad_norm": 0.2455400824546814, + "learning_rate": 8.374427751422444e-05, + "loss": 0.0162, + "step": 18060 + }, + { + "grad_norm": 0.2845357358455658, + "learning_rate": 8.3723936849196e-05, + "loss": 0.0126, + "step": 18070 + }, + { + "grad_norm": 0.34579500555992126, + "learning_rate": 8.370358593972595e-05, + "loss": 0.012, + "step": 18080 + }, + { + "grad_norm": 0.2587417960166931, + "learning_rate": 8.36832247919964e-05, + "loss": 0.0118, + "step": 18090 + }, + { + "grad_norm": 0.25982800126075745, + "learning_rate": 8.36628534121925e-05, + "loss": 0.013, + "step": 18100 + }, + { + "grad_norm": 0.308040976524353, + "learning_rate": 8.364247180650254e-05, + "loss": 0.0122, + "step": 18110 + }, + { + "grad_norm": 0.3716839849948883, + "learning_rate": 8.362207998111794e-05, + "loss": 0.012, + "step": 18120 + }, + { + "grad_norm": 0.287418007850647, + "learning_rate": 8.360167794223318e-05, + "loss": 0.0097, + "step": 18130 + }, + { + "grad_norm": 0.274263471364975, + "learning_rate": 8.358126569604586e-05, + "loss": 0.015, + "step": 18140 + }, + { + "grad_norm": 0.24698516726493835, + "learning_rate": 8.356084324875668e-05, + "loss": 0.0101, + "step": 18150 + }, + { + "grad_norm": 0.3069823682308197, + "learning_rate": 8.354041060656945e-05, + "loss": 0.0154, + "step": 18160 + }, + { + "grad_norm": 0.18795974552631378, + "learning_rate": 8.351996777569106e-05, + "loss": 0.0098, + "step": 18170 + }, + { + "grad_norm": 0.23682774603366852, + "learning_rate": 8.349951476233148e-05, + "loss": 0.015, + "step": 18180 + }, + { + "grad_norm": 0.27422621846199036, + "learning_rate": 8.347905157270386e-05, + "loss": 0.0135, + "step": 18190 + }, + { + "grad_norm": 0.2154785692691803, + "learning_rate": 8.345857821302432e-05, + "loss": 0.0093, + "step": 18200 + }, + { + "grad_norm": 0.31889963150024414, + "learning_rate": 8.343809468951213e-05, + "loss": 0.0093, + "step": 18210 + }, + { + "grad_norm": 0.4687674343585968, + "learning_rate": 8.341760100838965e-05, + "loss": 0.0119, + "step": 18220 + }, + { + "grad_norm": 0.4000532031059265, + "learning_rate": 8.339709717588233e-05, + "loss": 0.0146, + "step": 18230 + }, + { + "grad_norm": 0.24362359941005707, + "learning_rate": 8.33765831982187e-05, + "loss": 0.0137, + "step": 18240 + }, + { + "grad_norm": 0.29831790924072266, + "learning_rate": 8.335605908163035e-05, + "loss": 0.0152, + "step": 18250 + }, + { + "grad_norm": 0.3350246250629425, + "learning_rate": 8.333552483235196e-05, + "loss": 0.0138, + "step": 18260 + }, + { + "grad_norm": 0.3507806956768036, + "learning_rate": 8.33149804566213e-05, + "loss": 0.014, + "step": 18270 + }, + { + "grad_norm": 0.2848481237888336, + "learning_rate": 8.329442596067921e-05, + "loss": 0.0121, + "step": 18280 + }, + { + "grad_norm": 0.3517798185348511, + "learning_rate": 8.32738613507696e-05, + "loss": 0.0141, + "step": 18290 + }, + { + "grad_norm": 0.37726861238479614, + "learning_rate": 8.325328663313946e-05, + "loss": 0.0152, + "step": 18300 + }, + { + "grad_norm": 0.3971586227416992, + "learning_rate": 8.323270181403884e-05, + "loss": 0.0137, + "step": 18310 + }, + { + "grad_norm": 0.22382038831710815, + "learning_rate": 8.321210689972086e-05, + "loss": 0.0102, + "step": 18320 + }, + { + "grad_norm": 0.3550434112548828, + "learning_rate": 8.319150189644174e-05, + "loss": 0.0147, + "step": 18330 + }, + { + "grad_norm": 0.36026695370674133, + "learning_rate": 8.31708868104607e-05, + "loss": 0.0114, + "step": 18340 + }, + { + "grad_norm": 0.26386353373527527, + "learning_rate": 8.315026164804007e-05, + "loss": 0.0106, + "step": 18350 + }, + { + "grad_norm": 0.2927246689796448, + "learning_rate": 8.312962641544524e-05, + "loss": 0.0129, + "step": 18360 + }, + { + "grad_norm": 0.29330456256866455, + "learning_rate": 8.310898111894465e-05, + "loss": 0.0146, + "step": 18370 + }, + { + "grad_norm": 0.4507358968257904, + "learning_rate": 8.308832576480977e-05, + "loss": 0.0113, + "step": 18380 + }, + { + "grad_norm": 0.37675946950912476, + "learning_rate": 8.306766035931519e-05, + "loss": 0.0118, + "step": 18390 + }, + { + "grad_norm": 0.2877444624900818, + "learning_rate": 8.304698490873847e-05, + "loss": 0.0153, + "step": 18400 + }, + { + "grad_norm": 0.33232858777046204, + "learning_rate": 8.30262994193603e-05, + "loss": 0.0158, + "step": 18410 + }, + { + "grad_norm": 0.3234443664550781, + "learning_rate": 8.300560389746438e-05, + "loss": 0.0137, + "step": 18420 + }, + { + "grad_norm": 0.22922036051750183, + "learning_rate": 8.298489834933745e-05, + "loss": 0.0126, + "step": 18430 + }, + { + "grad_norm": 0.38775700330734253, + "learning_rate": 8.296418278126934e-05, + "loss": 0.0114, + "step": 18440 + }, + { + "grad_norm": 0.30138495564460754, + "learning_rate": 8.294345719955284e-05, + "loss": 0.0114, + "step": 18450 + }, + { + "grad_norm": 0.348544180393219, + "learning_rate": 8.29227216104839e-05, + "loss": 0.014, + "step": 18460 + }, + { + "grad_norm": 0.40388041734695435, + "learning_rate": 8.290197602036137e-05, + "loss": 0.0108, + "step": 18470 + }, + { + "grad_norm": 0.2986717224121094, + "learning_rate": 8.288122043548725e-05, + "loss": 0.0101, + "step": 18480 + }, + { + "grad_norm": 0.19775034487247467, + "learning_rate": 8.286045486216657e-05, + "loss": 0.0128, + "step": 18490 + }, + { + "grad_norm": 0.3551136255264282, + "learning_rate": 8.283967930670733e-05, + "loss": 0.0122, + "step": 18500 + }, + { + "grad_norm": 0.37588241696357727, + "learning_rate": 8.281889377542058e-05, + "loss": 0.0169, + "step": 18510 + }, + { + "grad_norm": 0.34498050808906555, + "learning_rate": 8.279809827462045e-05, + "loss": 0.013, + "step": 18520 + }, + { + "grad_norm": 0.3164331614971161, + "learning_rate": 8.277729281062402e-05, + "loss": 0.0141, + "step": 18530 + }, + { + "grad_norm": 0.2591860294342041, + "learning_rate": 8.27564773897515e-05, + "loss": 0.0183, + "step": 18540 + }, + { + "grad_norm": 0.3670145869255066, + "learning_rate": 8.273565201832602e-05, + "loss": 0.0115, + "step": 18550 + }, + { + "grad_norm": 0.3703955411911011, + "learning_rate": 8.27148167026738e-05, + "loss": 0.0154, + "step": 18560 + }, + { + "grad_norm": 0.3050447106361389, + "learning_rate": 8.269397144912405e-05, + "loss": 0.0142, + "step": 18570 + }, + { + "grad_norm": 0.2702122628688812, + "learning_rate": 8.267311626400899e-05, + "loss": 0.0121, + "step": 18580 + }, + { + "grad_norm": 0.4275292456150055, + "learning_rate": 8.26522511536639e-05, + "loss": 0.0153, + "step": 18590 + }, + { + "grad_norm": 0.3297639787197113, + "learning_rate": 8.263137612442706e-05, + "loss": 0.0142, + "step": 18600 + }, + { + "grad_norm": 0.36414727568626404, + "learning_rate": 8.261049118263971e-05, + "loss": 0.0136, + "step": 18610 + }, + { + "grad_norm": 0.23326407372951508, + "learning_rate": 8.258959633464619e-05, + "loss": 0.0109, + "step": 18620 + }, + { + "grad_norm": 0.28079086542129517, + "learning_rate": 8.256869158679377e-05, + "loss": 0.013, + "step": 18630 + }, + { + "grad_norm": 0.33998361229896545, + "learning_rate": 8.254777694543278e-05, + "loss": 0.0098, + "step": 18640 + }, + { + "grad_norm": 0.5050308108329773, + "learning_rate": 8.252685241691651e-05, + "loss": 0.0116, + "step": 18650 + }, + { + "grad_norm": 0.4916621148586273, + "learning_rate": 8.250591800760133e-05, + "loss": 0.0148, + "step": 18660 + }, + { + "grad_norm": 0.46857693791389465, + "learning_rate": 8.248497372384649e-05, + "loss": 0.0233, + "step": 18670 + }, + { + "grad_norm": 0.32564258575439453, + "learning_rate": 8.246401957201437e-05, + "loss": 0.0197, + "step": 18680 + }, + { + "grad_norm": 0.3152516484260559, + "learning_rate": 8.244305555847027e-05, + "loss": 0.0178, + "step": 18690 + }, + { + "grad_norm": 0.3127182424068451, + "learning_rate": 8.24220816895825e-05, + "loss": 0.0168, + "step": 18700 + }, + { + "grad_norm": 0.3927346467971802, + "learning_rate": 8.240109797172237e-05, + "loss": 0.0166, + "step": 18710 + }, + { + "grad_norm": 0.37310805916786194, + "learning_rate": 8.238010441126416e-05, + "loss": 0.0137, + "step": 18720 + }, + { + "grad_norm": 0.31839457154273987, + "learning_rate": 8.23591010145852e-05, + "loss": 0.0128, + "step": 18730 + }, + { + "grad_norm": 0.31938979029655457, + "learning_rate": 8.233808778806571e-05, + "loss": 0.0141, + "step": 18740 + }, + { + "grad_norm": 0.2878010869026184, + "learning_rate": 8.231706473808903e-05, + "loss": 0.012, + "step": 18750 + }, + { + "grad_norm": 0.3095709979534149, + "learning_rate": 8.229603187104133e-05, + "loss": 0.0107, + "step": 18760 + }, + { + "grad_norm": 0.28657129406929016, + "learning_rate": 8.22749891933119e-05, + "loss": 0.0126, + "step": 18770 + }, + { + "grad_norm": 0.23694774508476257, + "learning_rate": 8.225393671129291e-05, + "loss": 0.0127, + "step": 18780 + }, + { + "grad_norm": 0.2705461084842682, + "learning_rate": 8.223287443137957e-05, + "loss": 0.0132, + "step": 18790 + }, + { + "grad_norm": 0.3463711142539978, + "learning_rate": 8.221180235997004e-05, + "loss": 0.0116, + "step": 18800 + }, + { + "grad_norm": 0.3288988173007965, + "learning_rate": 8.219072050346544e-05, + "loss": 0.0133, + "step": 18810 + }, + { + "grad_norm": 0.32352596521377563, + "learning_rate": 8.216962886826992e-05, + "loss": 0.0127, + "step": 18820 + }, + { + "grad_norm": 0.31411558389663696, + "learning_rate": 8.214852746079054e-05, + "loss": 0.0104, + "step": 18830 + }, + { + "grad_norm": 0.28551432490348816, + "learning_rate": 8.212741628743732e-05, + "loss": 0.01, + "step": 18840 + }, + { + "grad_norm": 0.3180924952030182, + "learning_rate": 8.210629535462333e-05, + "loss": 0.013, + "step": 18850 + }, + { + "grad_norm": 0.30611202120780945, + "learning_rate": 8.208516466876453e-05, + "loss": 0.0123, + "step": 18860 + }, + { + "grad_norm": 0.3751974403858185, + "learning_rate": 8.206402423627986e-05, + "loss": 0.0117, + "step": 18870 + }, + { + "grad_norm": 0.2996487021446228, + "learning_rate": 8.204287406359124e-05, + "loss": 0.0132, + "step": 18880 + }, + { + "grad_norm": 0.25353679060935974, + "learning_rate": 8.20217141571235e-05, + "loss": 0.0106, + "step": 18890 + }, + { + "grad_norm": 0.28663378953933716, + "learning_rate": 8.200054452330449e-05, + "loss": 0.0133, + "step": 18900 + }, + { + "grad_norm": 0.31912288069725037, + "learning_rate": 8.197936516856499e-05, + "loss": 0.0105, + "step": 18910 + }, + { + "grad_norm": 0.3114042282104492, + "learning_rate": 8.195817609933871e-05, + "loss": 0.0147, + "step": 18920 + }, + { + "grad_norm": 0.3462892770767212, + "learning_rate": 8.193697732206233e-05, + "loss": 0.0129, + "step": 18930 + }, + { + "grad_norm": 0.2560267448425293, + "learning_rate": 8.19157688431755e-05, + "loss": 0.0113, + "step": 18940 + }, + { + "grad_norm": 0.309586763381958, + "learning_rate": 8.189455066912077e-05, + "loss": 0.0104, + "step": 18950 + }, + { + "grad_norm": 0.2984698414802551, + "learning_rate": 8.187332280634369e-05, + "loss": 0.0116, + "step": 18960 + }, + { + "grad_norm": 0.3184657692909241, + "learning_rate": 8.18520852612927e-05, + "loss": 0.011, + "step": 18970 + }, + { + "grad_norm": 0.30604785680770874, + "learning_rate": 8.183083804041921e-05, + "loss": 0.0111, + "step": 18980 + }, + { + "grad_norm": 0.2852771580219269, + "learning_rate": 8.180958115017757e-05, + "loss": 0.0096, + "step": 18990 + }, + { + "grad_norm": 0.3685297966003418, + "learning_rate": 8.178831459702505e-05, + "loss": 0.0102, + "step": 19000 + }, + { + "grad_norm": 0.3843708038330078, + "learning_rate": 8.17670383874219e-05, + "loss": 0.0109, + "step": 19010 + }, + { + "grad_norm": 0.43317270278930664, + "learning_rate": 8.174575252783124e-05, + "loss": 0.0138, + "step": 19020 + }, + { + "grad_norm": 0.31003043055534363, + "learning_rate": 8.172445702471914e-05, + "loss": 0.0141, + "step": 19030 + }, + { + "grad_norm": 0.28035664558410645, + "learning_rate": 8.170315188455466e-05, + "loss": 0.011, + "step": 19040 + }, + { + "grad_norm": 0.31346672773361206, + "learning_rate": 8.168183711380969e-05, + "loss": 0.0101, + "step": 19050 + }, + { + "grad_norm": 0.2836418151855469, + "learning_rate": 8.166051271895913e-05, + "loss": 0.0102, + "step": 19060 + }, + { + "grad_norm": 0.3067103624343872, + "learning_rate": 8.163917870648075e-05, + "loss": 0.0116, + "step": 19070 + }, + { + "grad_norm": 0.32565218210220337, + "learning_rate": 8.161783508285526e-05, + "loss": 0.0104, + "step": 19080 + }, + { + "grad_norm": 0.23752239346504211, + "learning_rate": 8.159648185456628e-05, + "loss": 0.0092, + "step": 19090 + }, + { + "grad_norm": 0.3650795519351959, + "learning_rate": 8.157511902810038e-05, + "loss": 0.0108, + "step": 19100 + }, + { + "grad_norm": 0.23786623775959015, + "learning_rate": 8.155374660994701e-05, + "loss": 0.0098, + "step": 19110 + }, + { + "grad_norm": 0.22118094563484192, + "learning_rate": 8.153236460659857e-05, + "loss": 0.0108, + "step": 19120 + }, + { + "grad_norm": 0.32005828619003296, + "learning_rate": 8.151097302455031e-05, + "loss": 0.0135, + "step": 19130 + }, + { + "grad_norm": 0.3606661856174469, + "learning_rate": 8.148957187030044e-05, + "loss": 0.0122, + "step": 19140 + }, + { + "grad_norm": 0.28908205032348633, + "learning_rate": 8.146816115035006e-05, + "loss": 0.0137, + "step": 19150 + }, + { + "grad_norm": 0.3197175860404968, + "learning_rate": 8.14467408712032e-05, + "loss": 0.0137, + "step": 19160 + }, + { + "grad_norm": 0.3105429708957672, + "learning_rate": 8.142531103936678e-05, + "loss": 0.015, + "step": 19170 + }, + { + "grad_norm": 0.36727669835090637, + "learning_rate": 8.14038716613506e-05, + "loss": 0.0114, + "step": 19180 + }, + { + "grad_norm": 0.3169114589691162, + "learning_rate": 8.138242274366736e-05, + "loss": 0.0118, + "step": 19190 + }, + { + "grad_norm": 0.3648217022418976, + "learning_rate": 8.136096429283271e-05, + "loss": 0.0123, + "step": 19200 + }, + { + "grad_norm": 0.2427861988544464, + "learning_rate": 8.133949631536515e-05, + "loss": 0.0103, + "step": 19210 + }, + { + "grad_norm": 0.34194543957710266, + "learning_rate": 8.131801881778607e-05, + "loss": 0.0102, + "step": 19220 + }, + { + "grad_norm": 0.2932226061820984, + "learning_rate": 8.129653180661978e-05, + "loss": 0.0127, + "step": 19230 + }, + { + "grad_norm": 0.42991992831230164, + "learning_rate": 8.127503528839346e-05, + "loss": 0.0116, + "step": 19240 + }, + { + "grad_norm": 0.3318198025226593, + "learning_rate": 8.125352926963721e-05, + "loss": 0.0129, + "step": 19250 + }, + { + "grad_norm": 0.31246650218963623, + "learning_rate": 8.123201375688395e-05, + "loss": 0.0117, + "step": 19260 + }, + { + "grad_norm": 0.3080601394176483, + "learning_rate": 8.121048875666954e-05, + "loss": 0.0113, + "step": 19270 + }, + { + "grad_norm": 0.2473263144493103, + "learning_rate": 8.118895427553274e-05, + "loss": 0.0133, + "step": 19280 + }, + { + "grad_norm": 0.2663351893424988, + "learning_rate": 8.116741032001511e-05, + "loss": 0.0129, + "step": 19290 + }, + { + "grad_norm": 0.2721877694129944, + "learning_rate": 8.114585689666114e-05, + "loss": 0.0104, + "step": 19300 + }, + { + "grad_norm": 0.3005412220954895, + "learning_rate": 8.112429401201821e-05, + "loss": 0.0104, + "step": 19310 + }, + { + "grad_norm": 0.2333231419324875, + "learning_rate": 8.110272167263656e-05, + "loss": 0.0099, + "step": 19320 + }, + { + "grad_norm": 0.3086380362510681, + "learning_rate": 8.108113988506929e-05, + "loss": 0.0111, + "step": 19330 + }, + { + "grad_norm": 0.2621247470378876, + "learning_rate": 8.105954865587235e-05, + "loss": 0.0113, + "step": 19340 + }, + { + "grad_norm": 0.3123207092285156, + "learning_rate": 8.103794799160463e-05, + "loss": 0.0121, + "step": 19350 + }, + { + "grad_norm": 0.3243536353111267, + "learning_rate": 8.101633789882781e-05, + "loss": 0.0113, + "step": 19360 + }, + { + "grad_norm": 0.28597328066825867, + "learning_rate": 8.099471838410648e-05, + "loss": 0.01, + "step": 19370 + }, + { + "grad_norm": 0.3140491545200348, + "learning_rate": 8.097308945400806e-05, + "loss": 0.0105, + "step": 19380 + }, + { + "grad_norm": 0.29438039660453796, + "learning_rate": 8.095145111510288e-05, + "loss": 0.0116, + "step": 19390 + }, + { + "grad_norm": 0.27930429577827454, + "learning_rate": 8.092980337396406e-05, + "loss": 0.011, + "step": 19400 + }, + { + "grad_norm": 0.3430565595626831, + "learning_rate": 8.090814623716763e-05, + "loss": 0.0148, + "step": 19410 + }, + { + "grad_norm": 0.29080599546432495, + "learning_rate": 8.088647971129246e-05, + "loss": 0.0106, + "step": 19420 + }, + { + "grad_norm": 0.25844845175743103, + "learning_rate": 8.086480380292026e-05, + "loss": 0.0113, + "step": 19430 + }, + { + "grad_norm": 0.3026571571826935, + "learning_rate": 8.084311851863562e-05, + "loss": 0.0128, + "step": 19440 + }, + { + "grad_norm": 0.28641223907470703, + "learning_rate": 8.082142386502591e-05, + "loss": 0.0105, + "step": 19450 + }, + { + "grad_norm": 0.2502995729446411, + "learning_rate": 8.079971984868145e-05, + "loss": 0.011, + "step": 19460 + }, + { + "grad_norm": 0.36017531156539917, + "learning_rate": 8.077800647619532e-05, + "loss": 0.0143, + "step": 19470 + }, + { + "grad_norm": 0.2801513373851776, + "learning_rate": 8.075628375416345e-05, + "loss": 0.016, + "step": 19480 + }, + { + "grad_norm": 0.3113939166069031, + "learning_rate": 8.073455168918464e-05, + "loss": 0.0144, + "step": 19490 + }, + { + "grad_norm": 0.3204207420349121, + "learning_rate": 8.071281028786055e-05, + "loss": 0.0114, + "step": 19500 + }, + { + "grad_norm": 0.3569873869419098, + "learning_rate": 8.069105955679562e-05, + "loss": 0.0143, + "step": 19510 + }, + { + "grad_norm": 0.30875831842422485, + "learning_rate": 8.066929950259713e-05, + "loss": 0.0097, + "step": 19520 + }, + { + "grad_norm": 0.3048214614391327, + "learning_rate": 8.064753013187522e-05, + "loss": 0.013, + "step": 19530 + }, + { + "grad_norm": 0.3600217401981354, + "learning_rate": 8.062575145124289e-05, + "loss": 0.0119, + "step": 19540 + }, + { + "grad_norm": 0.30416548252105713, + "learning_rate": 8.060396346731587e-05, + "loss": 0.0156, + "step": 19550 + }, + { + "grad_norm": 0.30622172355651855, + "learning_rate": 8.058216618671281e-05, + "loss": 0.0152, + "step": 19560 + }, + { + "grad_norm": 0.2818950414657593, + "learning_rate": 8.056035961605514e-05, + "loss": 0.0141, + "step": 19570 + }, + { + "grad_norm": 0.41939905285835266, + "learning_rate": 8.05385437619671e-05, + "loss": 0.0114, + "step": 19580 + }, + { + "grad_norm": 0.2622312307357788, + "learning_rate": 8.05167186310758e-05, + "loss": 0.0105, + "step": 19590 + }, + { + "grad_norm": 0.2238624542951584, + "learning_rate": 8.049488423001113e-05, + "loss": 0.0101, + "step": 19600 + }, + { + "grad_norm": 0.2899109125137329, + "learning_rate": 8.047304056540581e-05, + "loss": 0.0094, + "step": 19610 + }, + { + "grad_norm": 0.39610743522644043, + "learning_rate": 8.045118764389534e-05, + "loss": 0.0118, + "step": 19620 + }, + { + "grad_norm": 0.2885169982910156, + "learning_rate": 8.042932547211809e-05, + "loss": 0.0115, + "step": 19630 + }, + { + "grad_norm": 0.2602410614490509, + "learning_rate": 8.04074540567152e-05, + "loss": 0.0095, + "step": 19640 + }, + { + "grad_norm": 0.24544866383075714, + "learning_rate": 8.038557340433063e-05, + "loss": 0.0091, + "step": 19650 + }, + { + "grad_norm": 0.35027745366096497, + "learning_rate": 8.036368352161115e-05, + "loss": 0.013, + "step": 19660 + }, + { + "grad_norm": 0.34638291597366333, + "learning_rate": 8.034178441520633e-05, + "loss": 0.013, + "step": 19670 + }, + { + "grad_norm": 0.3000337779521942, + "learning_rate": 8.031987609176852e-05, + "loss": 0.0094, + "step": 19680 + }, + { + "grad_norm": 0.3059388995170593, + "learning_rate": 8.02979585579529e-05, + "loss": 0.01, + "step": 19690 + }, + { + "grad_norm": 0.384428471326828, + "learning_rate": 8.027603182041745e-05, + "loss": 0.0177, + "step": 19700 + }, + { + "grad_norm": 0.28550684452056885, + "learning_rate": 8.025409588582292e-05, + "loss": 0.0092, + "step": 19710 + }, + { + "grad_norm": 0.24289412796497345, + "learning_rate": 8.023215076083288e-05, + "loss": 0.0103, + "step": 19720 + }, + { + "grad_norm": 0.31261584162712097, + "learning_rate": 8.021019645211367e-05, + "loss": 0.0126, + "step": 19730 + }, + { + "grad_norm": 0.3674880564212799, + "learning_rate": 8.018823296633441e-05, + "loss": 0.0138, + "step": 19740 + }, + { + "grad_norm": 0.31053563952445984, + "learning_rate": 8.016626031016708e-05, + "loss": 0.0128, + "step": 19750 + }, + { + "grad_norm": 0.48972266912460327, + "learning_rate": 8.014427849028636e-05, + "loss": 0.0131, + "step": 19760 + }, + { + "grad_norm": 0.24872824549674988, + "learning_rate": 8.012228751336974e-05, + "loss": 0.0122, + "step": 19770 + }, + { + "grad_norm": 0.2696179151535034, + "learning_rate": 8.01002873860975e-05, + "loss": 0.0148, + "step": 19780 + }, + { + "grad_norm": 0.29384976625442505, + "learning_rate": 8.00782781151527e-05, + "loss": 0.0154, + "step": 19790 + }, + { + "grad_norm": 0.29732707142829895, + "learning_rate": 8.005625970722119e-05, + "loss": 0.0112, + "step": 19800 + }, + { + "grad_norm": 0.3391878306865692, + "learning_rate": 8.003423216899158e-05, + "loss": 0.0134, + "step": 19810 + }, + { + "grad_norm": 0.36865752935409546, + "learning_rate": 8.001219550715522e-05, + "loss": 0.0103, + "step": 19820 + }, + { + "grad_norm": 0.23980113863945007, + "learning_rate": 7.999014972840632e-05, + "loss": 0.0141, + "step": 19830 + }, + { + "grad_norm": 0.35647082328796387, + "learning_rate": 7.996809483944174e-05, + "loss": 0.0153, + "step": 19840 + }, + { + "grad_norm": 0.36074432730674744, + "learning_rate": 7.994603084696124e-05, + "loss": 0.0148, + "step": 19850 + }, + { + "grad_norm": 0.3432263731956482, + "learning_rate": 7.992395775766724e-05, + "loss": 0.0158, + "step": 19860 + }, + { + "grad_norm": 0.3934478461742401, + "learning_rate": 7.990187557826497e-05, + "loss": 0.0124, + "step": 19870 + }, + { + "grad_norm": 0.28861114382743835, + "learning_rate": 7.987978431546242e-05, + "loss": 0.0109, + "step": 19880 + }, + { + "grad_norm": 0.2743509113788605, + "learning_rate": 7.985768397597031e-05, + "loss": 0.013, + "step": 19890 + }, + { + "grad_norm": 0.30137819051742554, + "learning_rate": 7.983557456650216e-05, + "loss": 0.0116, + "step": 19900 + }, + { + "grad_norm": 0.28131797909736633, + "learning_rate": 7.981345609377422e-05, + "loss": 0.0101, + "step": 19910 + }, + { + "grad_norm": 0.24351835250854492, + "learning_rate": 7.97913285645055e-05, + "loss": 0.0108, + "step": 19920 + }, + { + "grad_norm": 0.20535027980804443, + "learning_rate": 7.976919198541776e-05, + "loss": 0.0089, + "step": 19930 + }, + { + "grad_norm": 0.32849133014678955, + "learning_rate": 7.974704636323548e-05, + "loss": 0.0096, + "step": 19940 + }, + { + "grad_norm": 0.3754799962043762, + "learning_rate": 7.972489170468597e-05, + "loss": 0.0111, + "step": 19950 + }, + { + "grad_norm": 0.27436015009880066, + "learning_rate": 7.970272801649918e-05, + "loss": 0.0112, + "step": 19960 + }, + { + "grad_norm": 0.31236574053764343, + "learning_rate": 7.96805553054079e-05, + "loss": 0.0125, + "step": 19970 + }, + { + "grad_norm": 0.2469499409198761, + "learning_rate": 7.965837357814756e-05, + "loss": 0.0105, + "step": 19980 + }, + { + "grad_norm": 0.2964403033256531, + "learning_rate": 7.963618284145643e-05, + "loss": 0.0115, + "step": 19990 + }, + { + "grad_norm": 0.25933167338371277, + "learning_rate": 7.961398310207544e-05, + "loss": 0.0097, + "step": 20000 + }, + { + "grad_norm": 0.31781768798828125, + "learning_rate": 7.95917743667483e-05, + "loss": 0.0106, + "step": 20010 + }, + { + "grad_norm": 0.2968432605266571, + "learning_rate": 7.956955664222144e-05, + "loss": 0.0096, + "step": 20020 + }, + { + "grad_norm": 0.3397810459136963, + "learning_rate": 7.954732993524399e-05, + "loss": 0.0109, + "step": 20030 + }, + { + "grad_norm": 0.2422524094581604, + "learning_rate": 7.952509425256786e-05, + "loss": 0.011, + "step": 20040 + }, + { + "grad_norm": 0.20908181369304657, + "learning_rate": 7.950284960094767e-05, + "loss": 0.0109, + "step": 20050 + }, + { + "grad_norm": 0.28764376044273376, + "learning_rate": 7.948059598714076e-05, + "loss": 0.0116, + "step": 20060 + }, + { + "grad_norm": 0.3075757324695587, + "learning_rate": 7.945833341790717e-05, + "loss": 0.0103, + "step": 20070 + }, + { + "grad_norm": 0.2256336510181427, + "learning_rate": 7.94360619000097e-05, + "loss": 0.0097, + "step": 20080 + }, + { + "grad_norm": 0.4072895646095276, + "learning_rate": 7.941378144021381e-05, + "loss": 0.0103, + "step": 20090 + }, + { + "grad_norm": 0.31318414211273193, + "learning_rate": 7.939149204528777e-05, + "loss": 0.0113, + "step": 20100 + }, + { + "grad_norm": 0.22689473628997803, + "learning_rate": 7.936919372200246e-05, + "loss": 0.0112, + "step": 20110 + }, + { + "grad_norm": 0.2850944995880127, + "learning_rate": 7.934688647713158e-05, + "loss": 0.0093, + "step": 20120 + }, + { + "grad_norm": 0.32716110348701477, + "learning_rate": 7.932457031745143e-05, + "loss": 0.0093, + "step": 20130 + }, + { + "grad_norm": 0.2928125560283661, + "learning_rate": 7.930224524974108e-05, + "loss": 0.0119, + "step": 20140 + }, + { + "grad_norm": 0.3440922200679779, + "learning_rate": 7.927991128078232e-05, + "loss": 0.0121, + "step": 20150 + }, + { + "grad_norm": 0.31154802441596985, + "learning_rate": 7.925756841735958e-05, + "loss": 0.0131, + "step": 20160 + }, + { + "grad_norm": 0.43167048692703247, + "learning_rate": 7.923521666626008e-05, + "loss": 0.0113, + "step": 20170 + }, + { + "grad_norm": 0.4501725733280182, + "learning_rate": 7.921285603427366e-05, + "loss": 0.0141, + "step": 20180 + }, + { + "grad_norm": 0.3480492830276489, + "learning_rate": 7.91904865281929e-05, + "loss": 0.0121, + "step": 20190 + }, + { + "grad_norm": 0.24132727086544037, + "learning_rate": 7.916810815481307e-05, + "loss": 0.0162, + "step": 20200 + }, + { + "grad_norm": 0.3155970275402069, + "learning_rate": 7.914572092093211e-05, + "loss": 0.0104, + "step": 20210 + }, + { + "grad_norm": 0.26399046182632446, + "learning_rate": 7.912332483335068e-05, + "loss": 0.0124, + "step": 20220 + }, + { + "grad_norm": 0.2848663330078125, + "learning_rate": 7.910091989887213e-05, + "loss": 0.0102, + "step": 20230 + }, + { + "grad_norm": 0.22715935111045837, + "learning_rate": 7.907850612430248e-05, + "loss": 0.0121, + "step": 20240 + }, + { + "grad_norm": 0.2672522962093353, + "learning_rate": 7.905608351645044e-05, + "loss": 0.009, + "step": 20250 + }, + { + "grad_norm": 0.19731563329696655, + "learning_rate": 7.90336520821274e-05, + "loss": 0.0113, + "step": 20260 + }, + { + "grad_norm": 0.23456822335720062, + "learning_rate": 7.901121182814746e-05, + "loss": 0.011, + "step": 20270 + }, + { + "grad_norm": 0.2440684735774994, + "learning_rate": 7.898876276132736e-05, + "loss": 0.0097, + "step": 20280 + }, + { + "grad_norm": 0.24227331578731537, + "learning_rate": 7.896630488848654e-05, + "loss": 0.0088, + "step": 20290 + }, + { + "grad_norm": 0.3493794798851013, + "learning_rate": 7.89438382164471e-05, + "loss": 0.0109, + "step": 20300 + }, + { + "grad_norm": 0.2959541380405426, + "learning_rate": 7.892136275203383e-05, + "loss": 0.0101, + "step": 20310 + }, + { + "grad_norm": 0.2868903577327728, + "learning_rate": 7.889887850207418e-05, + "loss": 0.0111, + "step": 20320 + }, + { + "grad_norm": 0.23743665218353271, + "learning_rate": 7.887638547339827e-05, + "loss": 0.0117, + "step": 20330 + }, + { + "grad_norm": 0.3137747347354889, + "learning_rate": 7.885388367283891e-05, + "loss": 0.0176, + "step": 20340 + }, + { + "grad_norm": 0.320621520280838, + "learning_rate": 7.88313731072315e-05, + "loss": 0.0119, + "step": 20350 + }, + { + "grad_norm": 0.2784513831138611, + "learning_rate": 7.88088537834142e-05, + "loss": 0.0108, + "step": 20360 + }, + { + "grad_norm": 0.21401779353618622, + "learning_rate": 7.878632570822778e-05, + "loss": 0.01, + "step": 20370 + }, + { + "grad_norm": 0.3468092679977417, + "learning_rate": 7.876378888851567e-05, + "loss": 0.009, + "step": 20380 + }, + { + "grad_norm": 0.32845574617385864, + "learning_rate": 7.874124333112396e-05, + "loss": 0.0128, + "step": 20390 + }, + { + "grad_norm": 0.26932287216186523, + "learning_rate": 7.871868904290138e-05, + "loss": 0.0088, + "step": 20400 + }, + { + "grad_norm": 0.30503544211387634, + "learning_rate": 7.869612603069935e-05, + "loss": 0.0118, + "step": 20410 + }, + { + "grad_norm": 0.2474668323993683, + "learning_rate": 7.867355430137192e-05, + "loss": 0.0095, + "step": 20420 + }, + { + "grad_norm": 0.2821892499923706, + "learning_rate": 7.865097386177577e-05, + "loss": 0.0105, + "step": 20430 + }, + { + "grad_norm": 0.30608218908309937, + "learning_rate": 7.862838471877023e-05, + "loss": 0.0123, + "step": 20440 + }, + { + "grad_norm": 0.33611175417900085, + "learning_rate": 7.860578687921731e-05, + "loss": 0.0103, + "step": 20450 + }, + { + "grad_norm": 0.2612968683242798, + "learning_rate": 7.858318034998164e-05, + "loss": 0.0107, + "step": 20460 + }, + { + "grad_norm": 0.32673442363739014, + "learning_rate": 7.856056513793046e-05, + "loss": 0.0128, + "step": 20470 + }, + { + "grad_norm": 0.2968776822090149, + "learning_rate": 7.85379412499337e-05, + "loss": 0.0103, + "step": 20480 + }, + { + "grad_norm": 0.26962485909461975, + "learning_rate": 7.851530869286389e-05, + "loss": 0.0094, + "step": 20490 + }, + { + "grad_norm": 0.2564465403556824, + "learning_rate": 7.849266747359619e-05, + "loss": 0.009, + "step": 20500 + }, + { + "grad_norm": 0.2763496935367584, + "learning_rate": 7.847001759900843e-05, + "loss": 0.0136, + "step": 20510 + }, + { + "grad_norm": 0.34941181540489197, + "learning_rate": 7.844735907598102e-05, + "loss": 0.0123, + "step": 20520 + }, + { + "grad_norm": 0.3446839153766632, + "learning_rate": 7.842469191139703e-05, + "loss": 0.0097, + "step": 20530 + }, + { + "grad_norm": 0.24879102408885956, + "learning_rate": 7.840201611214215e-05, + "loss": 0.0118, + "step": 20540 + }, + { + "grad_norm": 0.2297147810459137, + "learning_rate": 7.837933168510469e-05, + "loss": 0.0116, + "step": 20550 + }, + { + "grad_norm": 0.2655164301395416, + "learning_rate": 7.835663863717559e-05, + "loss": 0.0131, + "step": 20560 + }, + { + "grad_norm": 0.29914048314094543, + "learning_rate": 7.833393697524838e-05, + "loss": 0.0107, + "step": 20570 + }, + { + "grad_norm": 0.30555757880210876, + "learning_rate": 7.831122670621922e-05, + "loss": 0.0144, + "step": 20580 + }, + { + "grad_norm": 0.26726293563842773, + "learning_rate": 7.82885078369869e-05, + "loss": 0.01, + "step": 20590 + }, + { + "grad_norm": 0.326022744178772, + "learning_rate": 7.826578037445283e-05, + "loss": 0.0132, + "step": 20600 + }, + { + "grad_norm": 0.3944125175476074, + "learning_rate": 7.824304432552097e-05, + "loss": 0.0126, + "step": 20610 + }, + { + "grad_norm": 0.24064595997333527, + "learning_rate": 7.822029969709798e-05, + "loss": 0.0101, + "step": 20620 + }, + { + "grad_norm": 0.23692484200000763, + "learning_rate": 7.819754649609306e-05, + "loss": 0.0095, + "step": 20630 + }, + { + "grad_norm": 0.32726019620895386, + "learning_rate": 7.817478472941802e-05, + "loss": 0.0096, + "step": 20640 + }, + { + "grad_norm": 0.32033249735832214, + "learning_rate": 7.815201440398727e-05, + "loss": 0.0097, + "step": 20650 + }, + { + "grad_norm": 0.28522205352783203, + "learning_rate": 7.812923552671789e-05, + "loss": 0.0141, + "step": 20660 + }, + { + "grad_norm": 0.39275965094566345, + "learning_rate": 7.810644810452945e-05, + "loss": 0.0118, + "step": 20670 + }, + { + "grad_norm": 0.24330002069473267, + "learning_rate": 7.808365214434417e-05, + "loss": 0.0104, + "step": 20680 + }, + { + "grad_norm": 0.30953559279441833, + "learning_rate": 7.80608476530869e-05, + "loss": 0.0105, + "step": 20690 + }, + { + "grad_norm": 0.24254587292671204, + "learning_rate": 7.8038034637685e-05, + "loss": 0.0105, + "step": 20700 + }, + { + "grad_norm": 0.34047412872314453, + "learning_rate": 7.801521310506848e-05, + "loss": 0.01, + "step": 20710 + }, + { + "grad_norm": 0.2513106167316437, + "learning_rate": 7.799238306216994e-05, + "loss": 0.0106, + "step": 20720 + }, + { + "grad_norm": 0.22358845174312592, + "learning_rate": 7.796954451592448e-05, + "loss": 0.0114, + "step": 20730 + }, + { + "grad_norm": 0.27987927198410034, + "learning_rate": 7.794669747326992e-05, + "loss": 0.0095, + "step": 20740 + }, + { + "grad_norm": 0.29166197776794434, + "learning_rate": 7.792384194114654e-05, + "loss": 0.0091, + "step": 20750 + }, + { + "grad_norm": 0.37438836693763733, + "learning_rate": 7.790097792649729e-05, + "loss": 0.0102, + "step": 20760 + }, + { + "grad_norm": 0.22963950037956238, + "learning_rate": 7.787810543626762e-05, + "loss": 0.0108, + "step": 20770 + }, + { + "grad_norm": 0.3060239553451538, + "learning_rate": 7.785522447740558e-05, + "loss": 0.0135, + "step": 20780 + }, + { + "grad_norm": 0.3001355230808258, + "learning_rate": 7.783233505686182e-05, + "loss": 0.013, + "step": 20790 + }, + { + "grad_norm": 0.24417084455490112, + "learning_rate": 7.780943718158955e-05, + "loss": 0.0129, + "step": 20800 + }, + { + "grad_norm": 0.2219369262456894, + "learning_rate": 7.778653085854453e-05, + "loss": 0.011, + "step": 20810 + }, + { + "grad_norm": 0.2586994469165802, + "learning_rate": 7.77636160946851e-05, + "loss": 0.0099, + "step": 20820 + }, + { + "grad_norm": 0.33123070001602173, + "learning_rate": 7.774069289697215e-05, + "loss": 0.0093, + "step": 20830 + }, + { + "grad_norm": 0.24347035586833954, + "learning_rate": 7.771776127236913e-05, + "loss": 0.0109, + "step": 20840 + }, + { + "grad_norm": 0.23947462439537048, + "learning_rate": 7.769482122784212e-05, + "loss": 0.0107, + "step": 20850 + }, + { + "grad_norm": 0.2801704406738281, + "learning_rate": 7.767187277035963e-05, + "loss": 0.0083, + "step": 20860 + }, + { + "grad_norm": 0.3863893449306488, + "learning_rate": 7.764891590689285e-05, + "loss": 0.0099, + "step": 20870 + }, + { + "grad_norm": 0.31459543108940125, + "learning_rate": 7.762595064441542e-05, + "loss": 0.0091, + "step": 20880 + }, + { + "grad_norm": 0.21968580782413483, + "learning_rate": 7.760297698990362e-05, + "loss": 0.0086, + "step": 20890 + }, + { + "grad_norm": 0.27186062932014465, + "learning_rate": 7.757999495033623e-05, + "loss": 0.0105, + "step": 20900 + }, + { + "grad_norm": 0.23455645143985748, + "learning_rate": 7.755700453269456e-05, + "loss": 0.0112, + "step": 20910 + }, + { + "grad_norm": 0.3652326762676239, + "learning_rate": 7.753400574396254e-05, + "loss": 0.0106, + "step": 20920 + }, + { + "grad_norm": 0.22082045674324036, + "learning_rate": 7.751099859112655e-05, + "loss": 0.0113, + "step": 20930 + }, + { + "grad_norm": 0.22481656074523926, + "learning_rate": 7.748798308117557e-05, + "loss": 0.0102, + "step": 20940 + }, + { + "grad_norm": 0.23719453811645508, + "learning_rate": 7.746495922110112e-05, + "loss": 0.0098, + "step": 20950 + }, + { + "grad_norm": 0.32906338572502136, + "learning_rate": 7.744192701789723e-05, + "loss": 0.0096, + "step": 20960 + }, + { + "grad_norm": 0.30039024353027344, + "learning_rate": 7.741888647856046e-05, + "loss": 0.0123, + "step": 20970 + }, + { + "grad_norm": 0.22510989010334015, + "learning_rate": 7.739583761008994e-05, + "loss": 0.0104, + "step": 20980 + }, + { + "grad_norm": 0.28414151072502136, + "learning_rate": 7.73727804194873e-05, + "loss": 0.0136, + "step": 20990 + }, + { + "grad_norm": 0.42576834559440613, + "learning_rate": 7.734971491375671e-05, + "loss": 0.0163, + "step": 21000 + }, + { + "grad_norm": 0.2854042649269104, + "learning_rate": 7.732664109990485e-05, + "loss": 0.0093, + "step": 21010 + }, + { + "grad_norm": 0.2741122841835022, + "learning_rate": 7.730355898494095e-05, + "loss": 0.0089, + "step": 21020 + }, + { + "grad_norm": 0.2783111333847046, + "learning_rate": 7.728046857587673e-05, + "loss": 0.011, + "step": 21030 + }, + { + "grad_norm": 0.2575124502182007, + "learning_rate": 7.725736987972647e-05, + "loss": 0.0087, + "step": 21040 + }, + { + "grad_norm": 0.4278194308280945, + "learning_rate": 7.723426290350691e-05, + "loss": 0.0096, + "step": 21050 + }, + { + "grad_norm": 0.3166457414627075, + "learning_rate": 7.721114765423736e-05, + "loss": 0.012, + "step": 21060 + }, + { + "grad_norm": 0.2779615819454193, + "learning_rate": 7.718802413893963e-05, + "loss": 0.0097, + "step": 21070 + }, + { + "grad_norm": 0.28359705209732056, + "learning_rate": 7.716489236463802e-05, + "loss": 0.0147, + "step": 21080 + }, + { + "grad_norm": 0.25712549686431885, + "learning_rate": 7.714175233835936e-05, + "loss": 0.0098, + "step": 21090 + }, + { + "grad_norm": 0.2537403106689453, + "learning_rate": 7.711860406713299e-05, + "loss": 0.0113, + "step": 21100 + }, + { + "grad_norm": 0.3189406394958496, + "learning_rate": 7.70954475579907e-05, + "loss": 0.0108, + "step": 21110 + }, + { + "grad_norm": 0.3373509645462036, + "learning_rate": 7.707228281796688e-05, + "loss": 0.0116, + "step": 21120 + }, + { + "grad_norm": 0.21083569526672363, + "learning_rate": 7.704910985409833e-05, + "loss": 0.0129, + "step": 21130 + }, + { + "grad_norm": 0.34385946393013, + "learning_rate": 7.702592867342439e-05, + "loss": 0.011, + "step": 21140 + }, + { + "grad_norm": 0.32396259903907776, + "learning_rate": 7.700273928298691e-05, + "loss": 0.0113, + "step": 21150 + }, + { + "grad_norm": 0.23247189819812775, + "learning_rate": 7.697954168983021e-05, + "loss": 0.0086, + "step": 21160 + }, + { + "grad_norm": 0.34575891494750977, + "learning_rate": 7.695633590100109e-05, + "loss": 0.0122, + "step": 21170 + }, + { + "grad_norm": 0.2856544852256775, + "learning_rate": 7.693312192354886e-05, + "loss": 0.0101, + "step": 21180 + }, + { + "grad_norm": 0.36304816603660583, + "learning_rate": 7.690989976452532e-05, + "loss": 0.0104, + "step": 21190 + }, + { + "grad_norm": 0.48910048604011536, + "learning_rate": 7.688666943098475e-05, + "loss": 0.0114, + "step": 21200 + }, + { + "grad_norm": 0.2329770177602768, + "learning_rate": 7.686343092998389e-05, + "loss": 0.0093, + "step": 21210 + }, + { + "grad_norm": 0.3122267723083496, + "learning_rate": 7.684018426858202e-05, + "loss": 0.0094, + "step": 21220 + }, + { + "grad_norm": 0.38143762946128845, + "learning_rate": 7.681692945384084e-05, + "loss": 0.0126, + "step": 21230 + }, + { + "grad_norm": 0.2841351330280304, + "learning_rate": 7.679366649282456e-05, + "loss": 0.0112, + "step": 21240 + }, + { + "grad_norm": 0.20977483689785004, + "learning_rate": 7.677039539259983e-05, + "loss": 0.01, + "step": 21250 + }, + { + "grad_norm": 0.3063945472240448, + "learning_rate": 7.674711616023581e-05, + "loss": 0.012, + "step": 21260 + }, + { + "grad_norm": 0.33445024490356445, + "learning_rate": 7.672382880280413e-05, + "loss": 0.0156, + "step": 21270 + }, + { + "grad_norm": 0.2996259927749634, + "learning_rate": 7.670053332737885e-05, + "loss": 0.0115, + "step": 21280 + }, + { + "grad_norm": 0.299744188785553, + "learning_rate": 7.667722974103654e-05, + "loss": 0.0115, + "step": 21290 + }, + { + "grad_norm": 0.2751113176345825, + "learning_rate": 7.66539180508562e-05, + "loss": 0.0152, + "step": 21300 + }, + { + "grad_norm": 0.32271459698677063, + "learning_rate": 7.663059826391932e-05, + "loss": 0.0103, + "step": 21310 + }, + { + "grad_norm": 0.36063945293426514, + "learning_rate": 7.660727038730981e-05, + "loss": 0.0124, + "step": 21320 + }, + { + "grad_norm": 0.45738816261291504, + "learning_rate": 7.65839344281141e-05, + "loss": 0.0126, + "step": 21330 + }, + { + "grad_norm": 0.36818403005599976, + "learning_rate": 7.656059039342101e-05, + "loss": 0.0151, + "step": 21340 + }, + { + "grad_norm": 0.25361382961273193, + "learning_rate": 7.653723829032187e-05, + "loss": 0.0123, + "step": 21350 + }, + { + "grad_norm": 0.34046679735183716, + "learning_rate": 7.65138781259104e-05, + "loss": 0.0149, + "step": 21360 + }, + { + "grad_norm": 0.2896510362625122, + "learning_rate": 7.649050990728279e-05, + "loss": 0.0144, + "step": 21370 + }, + { + "grad_norm": 0.3445018231868744, + "learning_rate": 7.646713364153774e-05, + "loss": 0.0121, + "step": 21380 + }, + { + "grad_norm": 0.3236828148365021, + "learning_rate": 7.64437493357763e-05, + "loss": 0.0114, + "step": 21390 + }, + { + "grad_norm": 0.3111863434314728, + "learning_rate": 7.642035699710202e-05, + "loss": 0.0154, + "step": 21400 + }, + { + "grad_norm": 0.3892368972301483, + "learning_rate": 7.639695663262089e-05, + "loss": 0.0141, + "step": 21410 + }, + { + "grad_norm": 0.34180542826652527, + "learning_rate": 7.637354824944128e-05, + "loss": 0.0133, + "step": 21420 + }, + { + "grad_norm": 0.4371061623096466, + "learning_rate": 7.635013185467408e-05, + "loss": 0.0131, + "step": 21430 + }, + { + "grad_norm": 0.3792824149131775, + "learning_rate": 7.632670745543256e-05, + "loss": 0.0157, + "step": 21440 + }, + { + "grad_norm": 0.2838890850543976, + "learning_rate": 7.630327505883242e-05, + "loss": 0.0177, + "step": 21450 + }, + { + "grad_norm": 0.2905552387237549, + "learning_rate": 7.627983467199182e-05, + "loss": 0.0133, + "step": 21460 + }, + { + "grad_norm": 0.31462568044662476, + "learning_rate": 7.625638630203132e-05, + "loss": 0.0173, + "step": 21470 + }, + { + "grad_norm": 0.28866228461265564, + "learning_rate": 7.623292995607394e-05, + "loss": 0.0151, + "step": 21480 + }, + { + "grad_norm": 0.33535662293434143, + "learning_rate": 7.620946564124507e-05, + "loss": 0.0145, + "step": 21490 + }, + { + "grad_norm": 0.30741414427757263, + "learning_rate": 7.618599336467256e-05, + "loss": 0.0134, + "step": 21500 + }, + { + "grad_norm": 0.34034204483032227, + "learning_rate": 7.616251313348666e-05, + "loss": 0.0135, + "step": 21510 + }, + { + "grad_norm": 0.2424885630607605, + "learning_rate": 7.613902495482005e-05, + "loss": 0.014, + "step": 21520 + }, + { + "grad_norm": 0.23943130671977997, + "learning_rate": 7.611552883580784e-05, + "loss": 0.0098, + "step": 21530 + }, + { + "grad_norm": 0.29430538415908813, + "learning_rate": 7.609202478358748e-05, + "loss": 0.0134, + "step": 21540 + }, + { + "grad_norm": 0.2681763768196106, + "learning_rate": 7.606851280529895e-05, + "loss": 0.0092, + "step": 21550 + }, + { + "grad_norm": 0.28645193576812744, + "learning_rate": 7.604499290808449e-05, + "loss": 0.0096, + "step": 21560 + }, + { + "grad_norm": 0.2646980583667755, + "learning_rate": 7.602146509908888e-05, + "loss": 0.0122, + "step": 21570 + }, + { + "grad_norm": 0.2294079065322876, + "learning_rate": 7.599792938545921e-05, + "loss": 0.0119, + "step": 21580 + }, + { + "grad_norm": 0.3118074834346771, + "learning_rate": 7.597438577434506e-05, + "loss": 0.0136, + "step": 21590 + }, + { + "grad_norm": 0.28577011823654175, + "learning_rate": 7.595083427289831e-05, + "loss": 0.0099, + "step": 21600 + }, + { + "grad_norm": 0.3082162141799927, + "learning_rate": 7.59272748882733e-05, + "loss": 0.0124, + "step": 21610 + }, + { + "grad_norm": 0.4013248682022095, + "learning_rate": 7.590370762762675e-05, + "loss": 0.0118, + "step": 21620 + }, + { + "grad_norm": 0.3080922067165375, + "learning_rate": 7.588013249811777e-05, + "loss": 0.0161, + "step": 21630 + }, + { + "grad_norm": 0.39616358280181885, + "learning_rate": 7.585654950690786e-05, + "loss": 0.0124, + "step": 21640 + }, + { + "grad_norm": 0.24922987818717957, + "learning_rate": 7.583295866116091e-05, + "loss": 0.0116, + "step": 21650 + }, + { + "grad_norm": 0.27102354168891907, + "learning_rate": 7.580935996804321e-05, + "loss": 0.0128, + "step": 21660 + }, + { + "grad_norm": 0.37369731068611145, + "learning_rate": 7.57857534347234e-05, + "loss": 0.0116, + "step": 21670 + }, + { + "grad_norm": 0.274298757314682, + "learning_rate": 7.576213906837254e-05, + "loss": 0.0106, + "step": 21680 + }, + { + "grad_norm": 0.27538877725601196, + "learning_rate": 7.573851687616403e-05, + "loss": 0.0123, + "step": 21690 + }, + { + "grad_norm": 0.31631436944007874, + "learning_rate": 7.571488686527368e-05, + "loss": 0.0135, + "step": 21700 + }, + { + "grad_norm": 0.30979371070861816, + "learning_rate": 7.569124904287968e-05, + "loss": 0.0109, + "step": 21710 + }, + { + "grad_norm": 0.24036063253879547, + "learning_rate": 7.566760341616254e-05, + "loss": 0.0103, + "step": 21720 + }, + { + "grad_norm": 0.24478644132614136, + "learning_rate": 7.564394999230519e-05, + "loss": 0.0124, + "step": 21730 + }, + { + "grad_norm": 0.33754533529281616, + "learning_rate": 7.562028877849294e-05, + "loss": 0.0106, + "step": 21740 + }, + { + "grad_norm": 0.3073609471321106, + "learning_rate": 7.559661978191341e-05, + "loss": 0.0121, + "step": 21750 + }, + { + "grad_norm": 0.2695678472518921, + "learning_rate": 7.557294300975664e-05, + "loss": 0.0113, + "step": 21760 + }, + { + "grad_norm": 0.2535015046596527, + "learning_rate": 7.554925846921499e-05, + "loss": 0.0108, + "step": 21770 + }, + { + "grad_norm": 0.23987755179405212, + "learning_rate": 7.552556616748321e-05, + "loss": 0.0114, + "step": 21780 + }, + { + "grad_norm": 0.2868041396141052, + "learning_rate": 7.550186611175838e-05, + "loss": 0.0111, + "step": 21790 + }, + { + "grad_norm": 0.2640947997570038, + "learning_rate": 7.547815830923998e-05, + "loss": 0.0114, + "step": 21800 + }, + { + "grad_norm": 0.36012017726898193, + "learning_rate": 7.54544427671298e-05, + "loss": 0.0126, + "step": 21810 + }, + { + "grad_norm": 0.2856258451938629, + "learning_rate": 7.543071949263198e-05, + "loss": 0.0111, + "step": 21820 + }, + { + "grad_norm": 0.27696359157562256, + "learning_rate": 7.540698849295305e-05, + "loss": 0.0112, + "step": 21830 + }, + { + "grad_norm": 0.26459866762161255, + "learning_rate": 7.538324977530183e-05, + "loss": 0.0097, + "step": 21840 + }, + { + "grad_norm": 0.3199159801006317, + "learning_rate": 7.535950334688955e-05, + "loss": 0.0122, + "step": 21850 + }, + { + "grad_norm": 0.29098185896873474, + "learning_rate": 7.533574921492972e-05, + "loss": 0.0103, + "step": 21860 + }, + { + "grad_norm": 0.3367266356945038, + "learning_rate": 7.531198738663824e-05, + "loss": 0.0106, + "step": 21870 + }, + { + "grad_norm": 0.30199435353279114, + "learning_rate": 7.528821786923333e-05, + "loss": 0.0109, + "step": 21880 + }, + { + "grad_norm": 0.32572558522224426, + "learning_rate": 7.52644406699355e-05, + "loss": 0.0107, + "step": 21890 + }, + { + "grad_norm": 0.23641718924045563, + "learning_rate": 7.524065579596766e-05, + "loss": 0.0117, + "step": 21900 + }, + { + "grad_norm": 0.29985103011131287, + "learning_rate": 7.521686325455506e-05, + "loss": 0.0107, + "step": 21910 + }, + { + "grad_norm": 0.32768669724464417, + "learning_rate": 7.51930630529252e-05, + "loss": 0.0085, + "step": 21920 + }, + { + "grad_norm": 0.3467303216457367, + "learning_rate": 7.516925519830797e-05, + "loss": 0.0149, + "step": 21930 + }, + { + "grad_norm": 0.3408523499965668, + "learning_rate": 7.514543969793557e-05, + "loss": 0.0114, + "step": 21940 + }, + { + "grad_norm": 0.3072498142719269, + "learning_rate": 7.512161655904251e-05, + "loss": 0.0122, + "step": 21950 + }, + { + "grad_norm": 0.3192087709903717, + "learning_rate": 7.509778578886563e-05, + "loss": 0.0108, + "step": 21960 + }, + { + "grad_norm": 0.3271007537841797, + "learning_rate": 7.507394739464412e-05, + "loss": 0.008, + "step": 21970 + }, + { + "grad_norm": 0.3040149211883545, + "learning_rate": 7.50501013836194e-05, + "loss": 0.0164, + "step": 21980 + }, + { + "grad_norm": 0.23984147608280182, + "learning_rate": 7.50262477630353e-05, + "loss": 0.0102, + "step": 21990 + }, + { + "grad_norm": 0.25129538774490356, + "learning_rate": 7.500238654013794e-05, + "loss": 0.0097, + "step": 22000 + }, + { + "grad_norm": 0.2837453782558441, + "learning_rate": 7.497851772217566e-05, + "loss": 0.0099, + "step": 22010 + }, + { + "grad_norm": 0.2572399377822876, + "learning_rate": 7.495464131639924e-05, + "loss": 0.0101, + "step": 22020 + }, + { + "grad_norm": 0.31302526593208313, + "learning_rate": 7.493075733006166e-05, + "loss": 0.0115, + "step": 22030 + }, + { + "grad_norm": 0.27019184827804565, + "learning_rate": 7.490686577041828e-05, + "loss": 0.0099, + "step": 22040 + }, + { + "grad_norm": 0.20474080741405487, + "learning_rate": 7.488296664472668e-05, + "loss": 0.011, + "step": 22050 + }, + { + "grad_norm": 0.2632288932800293, + "learning_rate": 7.485905996024682e-05, + "loss": 0.0082, + "step": 22060 + }, + { + "grad_norm": 0.31164175271987915, + "learning_rate": 7.483514572424093e-05, + "loss": 0.0089, + "step": 22070 + }, + { + "grad_norm": 0.2798174023628235, + "learning_rate": 7.481122394397349e-05, + "loss": 0.0094, + "step": 22080 + }, + { + "grad_norm": 0.2775808870792389, + "learning_rate": 7.478729462671131e-05, + "loss": 0.0101, + "step": 22090 + }, + { + "grad_norm": 0.2715277373790741, + "learning_rate": 7.47633577797235e-05, + "loss": 0.0092, + "step": 22100 + }, + { + "grad_norm": 0.29180750250816345, + "learning_rate": 7.473941341028144e-05, + "loss": 0.0136, + "step": 22110 + }, + { + "grad_norm": 0.28126060962677, + "learning_rate": 7.471546152565879e-05, + "loss": 0.0103, + "step": 22120 + }, + { + "grad_norm": 0.22522731125354767, + "learning_rate": 7.46915021331315e-05, + "loss": 0.0093, + "step": 22130 + }, + { + "grad_norm": 0.28509920835494995, + "learning_rate": 7.466753523997778e-05, + "loss": 0.0099, + "step": 22140 + }, + { + "grad_norm": 0.20971417427062988, + "learning_rate": 7.464356085347819e-05, + "loss": 0.0112, + "step": 22150 + }, + { + "grad_norm": 0.24829228222370148, + "learning_rate": 7.461957898091548e-05, + "loss": 0.0108, + "step": 22160 + }, + { + "grad_norm": 0.28717026114463806, + "learning_rate": 7.459558962957473e-05, + "loss": 0.0084, + "step": 22170 + }, + { + "grad_norm": 0.19303816556930542, + "learning_rate": 7.457159280674326e-05, + "loss": 0.0104, + "step": 22180 + }, + { + "grad_norm": 0.19239214062690735, + "learning_rate": 7.454758851971066e-05, + "loss": 0.0086, + "step": 22190 + }, + { + "grad_norm": 0.24407100677490234, + "learning_rate": 7.45235767757688e-05, + "loss": 0.0113, + "step": 22200 + }, + { + "grad_norm": 0.28287962079048157, + "learning_rate": 7.449955758221183e-05, + "loss": 0.0088, + "step": 22210 + }, + { + "grad_norm": 0.2901305854320526, + "learning_rate": 7.447553094633615e-05, + "loss": 0.0114, + "step": 22220 + }, + { + "grad_norm": 0.32799839973449707, + "learning_rate": 7.445149687544039e-05, + "loss": 0.0106, + "step": 22230 + }, + { + "grad_norm": 0.281703382730484, + "learning_rate": 7.44274553768255e-05, + "loss": 0.0107, + "step": 22240 + }, + { + "grad_norm": 0.35231515765190125, + "learning_rate": 7.440340645779464e-05, + "loss": 0.0109, + "step": 22250 + }, + { + "grad_norm": 0.21387960016727448, + "learning_rate": 7.437935012565322e-05, + "loss": 0.0118, + "step": 22260 + }, + { + "grad_norm": 0.2847549021244049, + "learning_rate": 7.435528638770893e-05, + "loss": 0.0091, + "step": 22270 + }, + { + "grad_norm": 0.26194649934768677, + "learning_rate": 7.433121525127171e-05, + "loss": 0.0098, + "step": 22280 + }, + { + "grad_norm": 0.24325090646743774, + "learning_rate": 7.430713672365371e-05, + "loss": 0.0089, + "step": 22290 + }, + { + "grad_norm": 0.24431732296943665, + "learning_rate": 7.428305081216938e-05, + "loss": 0.0077, + "step": 22300 + }, + { + "grad_norm": 0.24896951019763947, + "learning_rate": 7.425895752413536e-05, + "loss": 0.0082, + "step": 22310 + }, + { + "grad_norm": 0.18142609298229218, + "learning_rate": 7.423485686687057e-05, + "loss": 0.0113, + "step": 22320 + }, + { + "grad_norm": 0.2753553092479706, + "learning_rate": 7.421074884769616e-05, + "loss": 0.0117, + "step": 22330 + }, + { + "grad_norm": 0.2827988266944885, + "learning_rate": 7.418663347393548e-05, + "loss": 0.0109, + "step": 22340 + }, + { + "grad_norm": 0.29138875007629395, + "learning_rate": 7.416251075291418e-05, + "loss": 0.0102, + "step": 22350 + }, + { + "grad_norm": 0.287031352519989, + "learning_rate": 7.413838069196007e-05, + "loss": 0.008, + "step": 22360 + }, + { + "grad_norm": 0.2766510248184204, + "learning_rate": 7.411424329840324e-05, + "loss": 0.0109, + "step": 22370 + }, + { + "grad_norm": 0.3620835244655609, + "learning_rate": 7.409009857957601e-05, + "loss": 0.0132, + "step": 22380 + }, + { + "grad_norm": 0.20657455921173096, + "learning_rate": 7.40659465428129e-05, + "loss": 0.0105, + "step": 22390 + }, + { + "grad_norm": 0.31867966055870056, + "learning_rate": 7.404178719545063e-05, + "loss": 0.0116, + "step": 22400 + }, + { + "grad_norm": 0.42490971088409424, + "learning_rate": 7.401762054482822e-05, + "loss": 0.0125, + "step": 22410 + }, + { + "grad_norm": 0.2265114039182663, + "learning_rate": 7.39934465982868e-05, + "loss": 0.0098, + "step": 22420 + }, + { + "grad_norm": 0.2840220034122467, + "learning_rate": 7.396926536316984e-05, + "loss": 0.0102, + "step": 22430 + }, + { + "grad_norm": 0.28045937418937683, + "learning_rate": 7.394507684682293e-05, + "loss": 0.0086, + "step": 22440 + }, + { + "grad_norm": 0.294400155544281, + "learning_rate": 7.392088105659393e-05, + "loss": 0.0153, + "step": 22450 + }, + { + "grad_norm": 0.2399234175682068, + "learning_rate": 7.389667799983284e-05, + "loss": 0.0127, + "step": 22460 + }, + { + "grad_norm": 0.2573299705982208, + "learning_rate": 7.387246768389193e-05, + "loss": 0.0099, + "step": 22470 + }, + { + "grad_norm": 0.362834632396698, + "learning_rate": 7.384825011612563e-05, + "loss": 0.0102, + "step": 22480 + }, + { + "grad_norm": 0.37360212206840515, + "learning_rate": 7.382402530389066e-05, + "loss": 0.0108, + "step": 22490 + }, + { + "grad_norm": 0.28495290875434875, + "learning_rate": 7.379979325454582e-05, + "loss": 0.0091, + "step": 22500 + }, + { + "grad_norm": 0.3052418529987335, + "learning_rate": 7.37755539754522e-05, + "loss": 0.0097, + "step": 22510 + }, + { + "grad_norm": 0.27458980679512024, + "learning_rate": 7.375130747397302e-05, + "loss": 0.0096, + "step": 22520 + }, + { + "grad_norm": 0.3065735399723053, + "learning_rate": 7.372705375747377e-05, + "loss": 0.0108, + "step": 22530 + }, + { + "grad_norm": 0.2778896689414978, + "learning_rate": 7.370279283332205e-05, + "loss": 0.0081, + "step": 22540 + }, + { + "grad_norm": 0.29692456126213074, + "learning_rate": 7.36785247088877e-05, + "loss": 0.0088, + "step": 22550 + }, + { + "grad_norm": 0.24722807109355927, + "learning_rate": 7.365424939154275e-05, + "loss": 0.0081, + "step": 22560 + }, + { + "grad_norm": 0.2320321798324585, + "learning_rate": 7.362996688866138e-05, + "loss": 0.0109, + "step": 22570 + }, + { + "grad_norm": 0.18126583099365234, + "learning_rate": 7.360567720761999e-05, + "loss": 0.0071, + "step": 22580 + }, + { + "grad_norm": 0.22640566527843475, + "learning_rate": 7.358138035579711e-05, + "loss": 0.0084, + "step": 22590 + }, + { + "grad_norm": 0.2487303614616394, + "learning_rate": 7.355707634057354e-05, + "loss": 0.0091, + "step": 22600 + }, + { + "grad_norm": 0.34879443049430847, + "learning_rate": 7.353276516933215e-05, + "loss": 0.0103, + "step": 22610 + }, + { + "grad_norm": 0.2628817856311798, + "learning_rate": 7.350844684945806e-05, + "loss": 0.0088, + "step": 22620 + }, + { + "grad_norm": 0.21212516725063324, + "learning_rate": 7.348412138833851e-05, + "loss": 0.0074, + "step": 22630 + }, + { + "grad_norm": 0.3087599575519562, + "learning_rate": 7.345978879336295e-05, + "loss": 0.0113, + "step": 22640 + }, + { + "grad_norm": 0.27432340383529663, + "learning_rate": 7.343544907192296e-05, + "loss": 0.0101, + "step": 22650 + }, + { + "grad_norm": 0.25573548674583435, + "learning_rate": 7.341110223141235e-05, + "loss": 0.0096, + "step": 22660 + }, + { + "grad_norm": 0.2608964145183563, + "learning_rate": 7.3386748279227e-05, + "loss": 0.0097, + "step": 22670 + }, + { + "grad_norm": 0.2299220860004425, + "learning_rate": 7.336238722276501e-05, + "loss": 0.0102, + "step": 22680 + }, + { + "grad_norm": 0.26084446907043457, + "learning_rate": 7.333801906942663e-05, + "loss": 0.01, + "step": 22690 + }, + { + "grad_norm": 0.35514795780181885, + "learning_rate": 7.331364382661428e-05, + "loss": 0.0135, + "step": 22700 + }, + { + "grad_norm": 0.26130834221839905, + "learning_rate": 7.328926150173248e-05, + "loss": 0.0123, + "step": 22710 + }, + { + "grad_norm": 0.30046066641807556, + "learning_rate": 7.326487210218795e-05, + "loss": 0.0106, + "step": 22720 + }, + { + "grad_norm": 0.2628723084926605, + "learning_rate": 7.324047563538955e-05, + "loss": 0.0124, + "step": 22730 + }, + { + "grad_norm": 0.3095598816871643, + "learning_rate": 7.321607210874828e-05, + "loss": 0.012, + "step": 22740 + }, + { + "grad_norm": 0.23997288942337036, + "learning_rate": 7.31916615296773e-05, + "loss": 0.0107, + "step": 22750 + }, + { + "grad_norm": 0.27912813425064087, + "learning_rate": 7.316724390559188e-05, + "loss": 0.0109, + "step": 22760 + }, + { + "grad_norm": 0.20605476200580597, + "learning_rate": 7.314281924390946e-05, + "loss": 0.01, + "step": 22770 + }, + { + "grad_norm": 0.3583897054195404, + "learning_rate": 7.311838755204959e-05, + "loss": 0.0096, + "step": 22780 + }, + { + "grad_norm": 0.24017024040222168, + "learning_rate": 7.3093948837434e-05, + "loss": 0.013, + "step": 22790 + }, + { + "grad_norm": 0.3490748107433319, + "learning_rate": 7.306950310748651e-05, + "loss": 0.0109, + "step": 22800 + }, + { + "grad_norm": 0.3087576925754547, + "learning_rate": 7.304505036963311e-05, + "loss": 0.0126, + "step": 22810 + }, + { + "grad_norm": 0.2967706024646759, + "learning_rate": 7.302059063130186e-05, + "loss": 0.0112, + "step": 22820 + }, + { + "grad_norm": 0.2868776321411133, + "learning_rate": 7.2996123899923e-05, + "loss": 0.009, + "step": 22830 + }, + { + "grad_norm": 0.22777420282363892, + "learning_rate": 7.297165018292886e-05, + "loss": 0.01, + "step": 22840 + }, + { + "grad_norm": 0.19153814017772675, + "learning_rate": 7.294716948775396e-05, + "loss": 0.0076, + "step": 22850 + }, + { + "grad_norm": 0.2611812949180603, + "learning_rate": 7.292268182183484e-05, + "loss": 0.01, + "step": 22860 + }, + { + "grad_norm": 0.21843092143535614, + "learning_rate": 7.28981871926102e-05, + "loss": 0.008, + "step": 22870 + }, + { + "grad_norm": 0.2903699576854706, + "learning_rate": 7.28736856075209e-05, + "loss": 0.0097, + "step": 22880 + }, + { + "grad_norm": 0.23961956799030304, + "learning_rate": 7.284917707400985e-05, + "loss": 0.0122, + "step": 22890 + }, + { + "grad_norm": 0.2326366901397705, + "learning_rate": 7.282466159952212e-05, + "loss": 0.0102, + "step": 22900 + }, + { + "grad_norm": 0.3050483763217926, + "learning_rate": 7.280013919150483e-05, + "loss": 0.0113, + "step": 22910 + }, + { + "grad_norm": 0.2711528539657593, + "learning_rate": 7.277560985740728e-05, + "loss": 0.011, + "step": 22920 + }, + { + "grad_norm": 0.2670758366584778, + "learning_rate": 7.275107360468079e-05, + "loss": 0.0097, + "step": 22930 + }, + { + "grad_norm": 0.3290386199951172, + "learning_rate": 7.272653044077885e-05, + "loss": 0.0079, + "step": 22940 + }, + { + "grad_norm": 0.3536282777786255, + "learning_rate": 7.270198037315703e-05, + "loss": 0.0102, + "step": 22950 + }, + { + "grad_norm": 0.24289318919181824, + "learning_rate": 7.267742340927297e-05, + "loss": 0.0092, + "step": 22960 + }, + { + "grad_norm": 0.27409830689430237, + "learning_rate": 7.265285955658645e-05, + "loss": 0.0075, + "step": 22970 + }, + { + "grad_norm": 0.26687631011009216, + "learning_rate": 7.26282888225593e-05, + "loss": 0.0115, + "step": 22980 + }, + { + "grad_norm": 0.1967734843492508, + "learning_rate": 7.260371121465548e-05, + "loss": 0.0087, + "step": 22990 + }, + { + "grad_norm": 0.32088401913642883, + "learning_rate": 7.2579126740341e-05, + "loss": 0.0104, + "step": 23000 + }, + { + "grad_norm": 0.3110400438308716, + "learning_rate": 7.2554535407084e-05, + "loss": 0.0109, + "step": 23010 + }, + { + "grad_norm": 0.2756114900112152, + "learning_rate": 7.252993722235464e-05, + "loss": 0.0096, + "step": 23020 + }, + { + "grad_norm": 0.24227185547351837, + "learning_rate": 7.250533219362523e-05, + "loss": 0.0098, + "step": 23030 + }, + { + "grad_norm": 0.21470095217227936, + "learning_rate": 7.248072032837012e-05, + "loss": 0.0108, + "step": 23040 + }, + { + "grad_norm": 0.6737625598907471, + "learning_rate": 7.245610163406575e-05, + "loss": 0.0153, + "step": 23050 + }, + { + "grad_norm": 0.31704360246658325, + "learning_rate": 7.243147611819061e-05, + "loss": 0.0119, + "step": 23060 + }, + { + "grad_norm": 0.2883269488811493, + "learning_rate": 7.240684378822531e-05, + "loss": 0.0147, + "step": 23070 + }, + { + "grad_norm": 0.26238542795181274, + "learning_rate": 7.238220465165248e-05, + "loss": 0.0117, + "step": 23080 + }, + { + "grad_norm": 0.42589902877807617, + "learning_rate": 7.235755871595684e-05, + "loss": 0.0102, + "step": 23090 + }, + { + "grad_norm": 0.34058448672294617, + "learning_rate": 7.233290598862517e-05, + "loss": 0.0118, + "step": 23100 + }, + { + "grad_norm": 0.29225847125053406, + "learning_rate": 7.230824647714635e-05, + "loss": 0.0114, + "step": 23110 + }, + { + "grad_norm": 0.2492419332265854, + "learning_rate": 7.228358018901124e-05, + "loss": 0.0117, + "step": 23120 + }, + { + "grad_norm": 0.27833303809165955, + "learning_rate": 7.225890713171286e-05, + "loss": 0.0087, + "step": 23130 + }, + { + "grad_norm": 0.2806122601032257, + "learning_rate": 7.223422731274618e-05, + "loss": 0.01, + "step": 23140 + }, + { + "grad_norm": 0.2733665406703949, + "learning_rate": 7.220954073960832e-05, + "loss": 0.0126, + "step": 23150 + }, + { + "grad_norm": 0.295369952917099, + "learning_rate": 7.218484741979838e-05, + "loss": 0.0119, + "step": 23160 + }, + { + "grad_norm": 0.24726513028144836, + "learning_rate": 7.216014736081756e-05, + "loss": 0.0101, + "step": 23170 + }, + { + "grad_norm": 0.27534812688827515, + "learning_rate": 7.213544057016906e-05, + "loss": 0.012, + "step": 23180 + }, + { + "grad_norm": 0.29914525151252747, + "learning_rate": 7.211072705535819e-05, + "loss": 0.0091, + "step": 23190 + }, + { + "grad_norm": 0.3329485058784485, + "learning_rate": 7.208600682389224e-05, + "loss": 0.0116, + "step": 23200 + }, + { + "grad_norm": 0.26701098680496216, + "learning_rate": 7.206127988328055e-05, + "loss": 0.0147, + "step": 23210 + }, + { + "grad_norm": 0.20883502066135406, + "learning_rate": 7.203654624103453e-05, + "loss": 0.0107, + "step": 23220 + }, + { + "grad_norm": 0.3203555941581726, + "learning_rate": 7.201180590466761e-05, + "loss": 0.0164, + "step": 23230 + }, + { + "grad_norm": 0.288200706243515, + "learning_rate": 7.198705888169523e-05, + "loss": 0.0128, + "step": 23240 + }, + { + "grad_norm": 0.23264338076114655, + "learning_rate": 7.196230517963491e-05, + "loss": 0.0137, + "step": 23250 + }, + { + "grad_norm": 0.24267594516277313, + "learning_rate": 7.193754480600615e-05, + "loss": 0.0132, + "step": 23260 + }, + { + "grad_norm": 0.28565508127212524, + "learning_rate": 7.19127777683305e-05, + "loss": 0.0089, + "step": 23270 + }, + { + "grad_norm": 0.24211053550243378, + "learning_rate": 7.188800407413156e-05, + "loss": 0.0111, + "step": 23280 + }, + { + "grad_norm": 0.272942453622818, + "learning_rate": 7.186322373093489e-05, + "loss": 0.0125, + "step": 23290 + }, + { + "grad_norm": 0.26006919145584106, + "learning_rate": 7.18384367462681e-05, + "loss": 0.0084, + "step": 23300 + }, + { + "grad_norm": 0.2471270114183426, + "learning_rate": 7.181364312766085e-05, + "loss": 0.0088, + "step": 23310 + }, + { + "grad_norm": 0.3497978746891022, + "learning_rate": 7.178884288264477e-05, + "loss": 0.0099, + "step": 23320 + }, + { + "grad_norm": 0.31600794196128845, + "learning_rate": 7.176403601875353e-05, + "loss": 0.0108, + "step": 23330 + }, + { + "grad_norm": 0.23590435087680817, + "learning_rate": 7.173922254352279e-05, + "loss": 0.0096, + "step": 23340 + }, + { + "grad_norm": 0.3787817656993866, + "learning_rate": 7.171440246449024e-05, + "loss": 0.0108, + "step": 23350 + }, + { + "grad_norm": 0.26142868399620056, + "learning_rate": 7.168957578919555e-05, + "loss": 0.0081, + "step": 23360 + }, + { + "grad_norm": 0.2901480793952942, + "learning_rate": 7.16647425251804e-05, + "loss": 0.0096, + "step": 23370 + }, + { + "grad_norm": 0.28418490290641785, + "learning_rate": 7.163990267998852e-05, + "loss": 0.0103, + "step": 23380 + }, + { + "grad_norm": 0.31199508905410767, + "learning_rate": 7.161505626116556e-05, + "loss": 0.0111, + "step": 23390 + }, + { + "grad_norm": 0.3080494701862335, + "learning_rate": 7.159020327625923e-05, + "loss": 0.0126, + "step": 23400 + }, + { + "grad_norm": 0.21442535519599915, + "learning_rate": 7.15653437328192e-05, + "loss": 0.0099, + "step": 23410 + }, + { + "grad_norm": 0.25426360964775085, + "learning_rate": 7.154047763839713e-05, + "loss": 0.0093, + "step": 23420 + }, + { + "grad_norm": 0.21504269540309906, + "learning_rate": 7.15156050005467e-05, + "loss": 0.0098, + "step": 23430 + }, + { + "grad_norm": 0.29118266701698303, + "learning_rate": 7.149072582682357e-05, + "loss": 0.0095, + "step": 23440 + }, + { + "grad_norm": 0.1889866590499878, + "learning_rate": 7.146584012478535e-05, + "loss": 0.0093, + "step": 23450 + }, + { + "grad_norm": 0.24235953390598297, + "learning_rate": 7.144094790199169e-05, + "loss": 0.0119, + "step": 23460 + }, + { + "grad_norm": 0.23944562673568726, + "learning_rate": 7.141604916600415e-05, + "loss": 0.0098, + "step": 23470 + }, + { + "grad_norm": 0.24665893614292145, + "learning_rate": 7.139114392438635e-05, + "loss": 0.0114, + "step": 23480 + }, + { + "grad_norm": 0.2754136621952057, + "learning_rate": 7.136623218470382e-05, + "loss": 0.01, + "step": 23490 + }, + { + "grad_norm": 0.3582766652107239, + "learning_rate": 7.13413139545241e-05, + "loss": 0.0111, + "step": 23500 + }, + { + "grad_norm": 0.3257825970649719, + "learning_rate": 7.131638924141668e-05, + "loss": 0.0087, + "step": 23510 + }, + { + "grad_norm": 0.39841777086257935, + "learning_rate": 7.129145805295304e-05, + "loss": 0.0117, + "step": 23520 + }, + { + "grad_norm": 0.2691058814525604, + "learning_rate": 7.126652039670661e-05, + "loss": 0.0103, + "step": 23530 + }, + { + "grad_norm": 0.30880314111709595, + "learning_rate": 7.124157628025278e-05, + "loss": 0.0102, + "step": 23540 + }, + { + "grad_norm": 0.3092741072177887, + "learning_rate": 7.121662571116894e-05, + "loss": 0.009, + "step": 23550 + }, + { + "grad_norm": 0.28892335295677185, + "learning_rate": 7.119166869703441e-05, + "loss": 0.0098, + "step": 23560 + }, + { + "grad_norm": 0.21413643658161163, + "learning_rate": 7.116670524543044e-05, + "loss": 0.0108, + "step": 23570 + }, + { + "grad_norm": 0.30003151297569275, + "learning_rate": 7.114173536394032e-05, + "loss": 0.0107, + "step": 23580 + }, + { + "grad_norm": 0.23662851750850677, + "learning_rate": 7.111675906014917e-05, + "loss": 0.0111, + "step": 23590 + }, + { + "grad_norm": 0.30537134408950806, + "learning_rate": 7.109177634164421e-05, + "loss": 0.0122, + "step": 23600 + }, + { + "grad_norm": 0.3178028166294098, + "learning_rate": 7.106678721601449e-05, + "loss": 0.0122, + "step": 23610 + }, + { + "grad_norm": 0.2335910052061081, + "learning_rate": 7.104179169085103e-05, + "loss": 0.0115, + "step": 23620 + }, + { + "grad_norm": 0.26295721530914307, + "learning_rate": 7.101678977374683e-05, + "loss": 0.01, + "step": 23630 + }, + { + "grad_norm": 0.32382676005363464, + "learning_rate": 7.099178147229685e-05, + "loss": 0.0149, + "step": 23640 + }, + { + "grad_norm": 0.27652525901794434, + "learning_rate": 7.096676679409789e-05, + "loss": 0.0115, + "step": 23650 + }, + { + "grad_norm": 0.3778683543205261, + "learning_rate": 7.094174574674877e-05, + "loss": 0.0117, + "step": 23660 + }, + { + "grad_norm": 0.4018316864967346, + "learning_rate": 7.091671833785025e-05, + "loss": 0.0118, + "step": 23670 + }, + { + "grad_norm": 0.37560781836509705, + "learning_rate": 7.089168457500493e-05, + "loss": 0.0097, + "step": 23680 + }, + { + "grad_norm": 0.33410680294036865, + "learning_rate": 7.086664446581747e-05, + "loss": 0.0109, + "step": 23690 + }, + { + "grad_norm": 0.28020018339157104, + "learning_rate": 7.084159801789438e-05, + "loss": 0.0105, + "step": 23700 + }, + { + "grad_norm": 0.21257483959197998, + "learning_rate": 7.081654523884411e-05, + "loss": 0.0076, + "step": 23710 + }, + { + "grad_norm": 0.2456098049879074, + "learning_rate": 7.0791486136277e-05, + "loss": 0.0105, + "step": 23720 + }, + { + "grad_norm": 0.24972784519195557, + "learning_rate": 7.07664207178054e-05, + "loss": 0.0148, + "step": 23730 + }, + { + "grad_norm": 0.3468533158302307, + "learning_rate": 7.074134899104345e-05, + "loss": 0.0085, + "step": 23740 + }, + { + "grad_norm": 0.2879554331302643, + "learning_rate": 7.071627096360735e-05, + "loss": 0.0121, + "step": 23750 + }, + { + "grad_norm": 0.2748449444770813, + "learning_rate": 7.069118664311511e-05, + "loss": 0.0105, + "step": 23760 + }, + { + "grad_norm": 0.32097867131233215, + "learning_rate": 7.06660960371867e-05, + "loss": 0.0106, + "step": 23770 + }, + { + "grad_norm": 0.25515103340148926, + "learning_rate": 7.064099915344396e-05, + "loss": 0.01, + "step": 23780 + }, + { + "grad_norm": 0.2812367379665375, + "learning_rate": 7.061589599951066e-05, + "loss": 0.0112, + "step": 23790 + }, + { + "grad_norm": 0.2585762143135071, + "learning_rate": 7.05907865830125e-05, + "loss": 0.0094, + "step": 23800 + }, + { + "grad_norm": 0.3192617893218994, + "learning_rate": 7.056567091157703e-05, + "loss": 0.0106, + "step": 23810 + }, + { + "grad_norm": 0.2826465368270874, + "learning_rate": 7.054054899283375e-05, + "loss": 0.0097, + "step": 23820 + }, + { + "grad_norm": 0.21344923973083496, + "learning_rate": 7.051542083441403e-05, + "loss": 0.0107, + "step": 23830 + }, + { + "grad_norm": 0.2828599810600281, + "learning_rate": 7.049028644395113e-05, + "loss": 0.0094, + "step": 23840 + }, + { + "grad_norm": 0.3610101044178009, + "learning_rate": 7.046514582908024e-05, + "loss": 0.0143, + "step": 23850 + }, + { + "grad_norm": 0.3311883211135864, + "learning_rate": 7.043999899743838e-05, + "loss": 0.011, + "step": 23860 + }, + { + "grad_norm": 0.28398993611335754, + "learning_rate": 7.041484595666451e-05, + "loss": 0.0119, + "step": 23870 + }, + { + "grad_norm": 0.3013603389263153, + "learning_rate": 7.038968671439948e-05, + "loss": 0.0148, + "step": 23880 + }, + { + "grad_norm": 0.2967699468135834, + "learning_rate": 7.036452127828596e-05, + "loss": 0.0115, + "step": 23890 + }, + { + "grad_norm": 0.29695960879325867, + "learning_rate": 7.033934965596859e-05, + "loss": 0.0097, + "step": 23900 + }, + { + "grad_norm": 0.3152150511741638, + "learning_rate": 7.031417185509381e-05, + "loss": 0.0124, + "step": 23910 + }, + { + "grad_norm": 0.23919104039669037, + "learning_rate": 7.028898788331e-05, + "loss": 0.0131, + "step": 23920 + }, + { + "grad_norm": 0.27760595083236694, + "learning_rate": 7.026379774826736e-05, + "loss": 0.0113, + "step": 23930 + }, + { + "grad_norm": 0.2188689410686493, + "learning_rate": 7.0238601457618e-05, + "loss": 0.0101, + "step": 23940 + }, + { + "grad_norm": 0.39000511169433594, + "learning_rate": 7.02133990190159e-05, + "loss": 0.0108, + "step": 23950 + }, + { + "grad_norm": 0.2574511766433716, + "learning_rate": 7.018819044011687e-05, + "loss": 0.009, + "step": 23960 + }, + { + "grad_norm": 0.3448181748390198, + "learning_rate": 7.016297572857863e-05, + "loss": 0.0089, + "step": 23970 + }, + { + "grad_norm": 0.2615724205970764, + "learning_rate": 7.013775489206072e-05, + "loss": 0.0094, + "step": 23980 + }, + { + "grad_norm": 0.2373533844947815, + "learning_rate": 7.01125279382246e-05, + "loss": 0.0126, + "step": 23990 + }, + { + "grad_norm": 0.22340206801891327, + "learning_rate": 7.008729487473351e-05, + "loss": 0.0114, + "step": 24000 + }, + { + "grad_norm": 0.22240720689296722, + "learning_rate": 7.006205570925263e-05, + "loss": 0.0107, + "step": 24010 + }, + { + "grad_norm": 0.24717000126838684, + "learning_rate": 7.003681044944892e-05, + "loss": 0.0125, + "step": 24020 + }, + { + "grad_norm": 0.21538789570331573, + "learning_rate": 7.001155910299126e-05, + "loss": 0.0115, + "step": 24030 + }, + { + "grad_norm": 0.22406692802906036, + "learning_rate": 6.99863016775503e-05, + "loss": 0.0104, + "step": 24040 + }, + { + "grad_norm": 0.2345639169216156, + "learning_rate": 6.996103818079859e-05, + "loss": 0.0106, + "step": 24050 + }, + { + "grad_norm": 0.26302775740623474, + "learning_rate": 6.993576862041054e-05, + "loss": 0.0095, + "step": 24060 + }, + { + "grad_norm": 0.31514737010002136, + "learning_rate": 6.991049300406235e-05, + "loss": 0.0101, + "step": 24070 + }, + { + "grad_norm": 0.268582820892334, + "learning_rate": 6.988521133943209e-05, + "loss": 0.0086, + "step": 24080 + }, + { + "grad_norm": 0.38309189677238464, + "learning_rate": 6.985992363419966e-05, + "loss": 0.0132, + "step": 24090 + }, + { + "grad_norm": 0.2702401280403137, + "learning_rate": 6.983462989604682e-05, + "loss": 0.008, + "step": 24100 + }, + { + "grad_norm": 0.3436410129070282, + "learning_rate": 6.980933013265709e-05, + "loss": 0.0103, + "step": 24110 + }, + { + "grad_norm": 0.27887746691703796, + "learning_rate": 6.978402435171592e-05, + "loss": 0.0112, + "step": 24120 + }, + { + "grad_norm": 0.3148270845413208, + "learning_rate": 6.975871256091052e-05, + "loss": 0.009, + "step": 24130 + }, + { + "grad_norm": 0.3671909272670746, + "learning_rate": 6.973339476792995e-05, + "loss": 0.0083, + "step": 24140 + }, + { + "grad_norm": 0.29265913367271423, + "learning_rate": 6.970807098046505e-05, + "loss": 0.0095, + "step": 24150 + }, + { + "grad_norm": 0.35181355476379395, + "learning_rate": 6.968274120620858e-05, + "loss": 0.0095, + "step": 24160 + }, + { + "grad_norm": 0.27665621042251587, + "learning_rate": 6.965740545285499e-05, + "loss": 0.0095, + "step": 24170 + }, + { + "grad_norm": 0.3085362911224365, + "learning_rate": 6.963206372810068e-05, + "loss": 0.0094, + "step": 24180 + }, + { + "grad_norm": 0.3252646327018738, + "learning_rate": 6.960671603964375e-05, + "loss": 0.0104, + "step": 24190 + }, + { + "grad_norm": 0.2893024682998657, + "learning_rate": 6.958136239518418e-05, + "loss": 0.0078, + "step": 24200 + }, + { + "grad_norm": 0.21432098746299744, + "learning_rate": 6.955600280242371e-05, + "loss": 0.0104, + "step": 24210 + }, + { + "grad_norm": 0.2944180965423584, + "learning_rate": 6.953063726906596e-05, + "loss": 0.0095, + "step": 24220 + }, + { + "grad_norm": 0.2731104791164398, + "learning_rate": 6.950526580281626e-05, + "loss": 0.009, + "step": 24230 + }, + { + "grad_norm": 0.34196025133132935, + "learning_rate": 6.947988841138184e-05, + "loss": 0.0098, + "step": 24240 + }, + { + "grad_norm": 0.24402837455272675, + "learning_rate": 6.945450510247165e-05, + "loss": 0.0083, + "step": 24250 + }, + { + "grad_norm": 0.31558629870414734, + "learning_rate": 6.942911588379647e-05, + "loss": 0.0113, + "step": 24260 + }, + { + "grad_norm": 0.34831908345222473, + "learning_rate": 6.940372076306888e-05, + "loss": 0.0083, + "step": 24270 + }, + { + "grad_norm": 0.24995142221450806, + "learning_rate": 6.937831974800326e-05, + "loss": 0.0093, + "step": 24280 + }, + { + "grad_norm": 0.24983029067516327, + "learning_rate": 6.935291284631574e-05, + "loss": 0.0089, + "step": 24290 + }, + { + "grad_norm": 0.2778373658657074, + "learning_rate": 6.932750006572428e-05, + "loss": 0.0101, + "step": 24300 + }, + { + "grad_norm": 0.21077312529087067, + "learning_rate": 6.930208141394863e-05, + "loss": 0.0087, + "step": 24310 + }, + { + "grad_norm": 0.25653186440467834, + "learning_rate": 6.927665689871026e-05, + "loss": 0.0122, + "step": 24320 + }, + { + "grad_norm": 0.26916074752807617, + "learning_rate": 6.925122652773253e-05, + "loss": 0.0088, + "step": 24330 + }, + { + "grad_norm": 0.28328290581703186, + "learning_rate": 6.922579030874046e-05, + "loss": 0.0089, + "step": 24340 + }, + { + "grad_norm": 0.2448911815881729, + "learning_rate": 6.920034824946093e-05, + "loss": 0.0105, + "step": 24350 + }, + { + "grad_norm": 0.36229923367500305, + "learning_rate": 6.917490035762255e-05, + "loss": 0.0102, + "step": 24360 + }, + { + "grad_norm": 0.328237920999527, + "learning_rate": 6.914944664095573e-05, + "loss": 0.0087, + "step": 24370 + }, + { + "grad_norm": 0.33668920397758484, + "learning_rate": 6.912398710719264e-05, + "loss": 0.0086, + "step": 24380 + }, + { + "grad_norm": 0.23626187443733215, + "learning_rate": 6.90985217640672e-05, + "loss": 0.0137, + "step": 24390 + }, + { + "grad_norm": 0.270663857460022, + "learning_rate": 6.90730506193151e-05, + "loss": 0.009, + "step": 24400 + }, + { + "grad_norm": 0.25239109992980957, + "learning_rate": 6.904757368067384e-05, + "loss": 0.0087, + "step": 24410 + }, + { + "grad_norm": 0.28416988253593445, + "learning_rate": 6.90220909558826e-05, + "loss": 0.0125, + "step": 24420 + }, + { + "grad_norm": 0.2679899334907532, + "learning_rate": 6.899660245268237e-05, + "loss": 0.0079, + "step": 24430 + }, + { + "grad_norm": 0.3217419981956482, + "learning_rate": 6.897110817881592e-05, + "loss": 0.008, + "step": 24440 + }, + { + "grad_norm": 0.29920387268066406, + "learning_rate": 6.894560814202769e-05, + "loss": 0.0096, + "step": 24450 + }, + { + "grad_norm": 0.24745391309261322, + "learning_rate": 6.892010235006394e-05, + "loss": 0.0099, + "step": 24460 + }, + { + "grad_norm": 0.31302884221076965, + "learning_rate": 6.889459081067264e-05, + "loss": 0.0103, + "step": 24470 + }, + { + "grad_norm": 0.1890210211277008, + "learning_rate": 6.886907353160356e-05, + "loss": 0.0092, + "step": 24480 + }, + { + "grad_norm": 0.22642023861408234, + "learning_rate": 6.884355052060814e-05, + "loss": 0.0079, + "step": 24490 + }, + { + "grad_norm": 0.23838257789611816, + "learning_rate": 6.88180217854396e-05, + "loss": 0.0079, + "step": 24500 + }, + { + "grad_norm": 0.2048512101173401, + "learning_rate": 6.87924873338529e-05, + "loss": 0.0082, + "step": 24510 + }, + { + "grad_norm": 0.22101201117038727, + "learning_rate": 6.876694717360475e-05, + "loss": 0.0095, + "step": 24520 + }, + { + "grad_norm": 0.27038589119911194, + "learning_rate": 6.874140131245355e-05, + "loss": 0.0123, + "step": 24530 + }, + { + "grad_norm": 0.2969529628753662, + "learning_rate": 6.871584975815948e-05, + "loss": 0.0108, + "step": 24540 + }, + { + "grad_norm": 0.3149702548980713, + "learning_rate": 6.86902925184844e-05, + "loss": 0.0083, + "step": 24550 + }, + { + "grad_norm": 0.18586546182632446, + "learning_rate": 6.866472960119195e-05, + "loss": 0.0074, + "step": 24560 + }, + { + "grad_norm": 0.2549111545085907, + "learning_rate": 6.863916101404748e-05, + "loss": 0.0075, + "step": 24570 + }, + { + "grad_norm": 0.20822013914585114, + "learning_rate": 6.8613586764818e-05, + "loss": 0.0071, + "step": 24580 + }, + { + "grad_norm": 0.37086981534957886, + "learning_rate": 6.858800686127233e-05, + "loss": 0.0123, + "step": 24590 + }, + { + "grad_norm": 0.34554389119148254, + "learning_rate": 6.856242131118097e-05, + "loss": 0.0122, + "step": 24600 + }, + { + "grad_norm": 0.28629931807518005, + "learning_rate": 6.853683012231614e-05, + "loss": 0.012, + "step": 24610 + }, + { + "grad_norm": 0.2933083772659302, + "learning_rate": 6.851123330245173e-05, + "loss": 0.0118, + "step": 24620 + }, + { + "grad_norm": 0.23303160071372986, + "learning_rate": 6.848563085936343e-05, + "loss": 0.0101, + "step": 24630 + }, + { + "grad_norm": 0.31656932830810547, + "learning_rate": 6.846002280082853e-05, + "loss": 0.0072, + "step": 24640 + }, + { + "grad_norm": 0.2558818757534027, + "learning_rate": 6.843440913462614e-05, + "loss": 0.0084, + "step": 24650 + }, + { + "grad_norm": 0.2798789441585541, + "learning_rate": 6.840878986853698e-05, + "loss": 0.0095, + "step": 24660 + }, + { + "grad_norm": 0.23279936611652374, + "learning_rate": 6.838316501034352e-05, + "loss": 0.0101, + "step": 24670 + }, + { + "grad_norm": 0.20617307722568512, + "learning_rate": 6.83575345678299e-05, + "loss": 0.0093, + "step": 24680 + }, + { + "grad_norm": 0.2819904088973999, + "learning_rate": 6.833189854878196e-05, + "loss": 0.0113, + "step": 24690 + }, + { + "grad_norm": 0.27071455121040344, + "learning_rate": 6.83062569609873e-05, + "loss": 0.011, + "step": 24700 + }, + { + "grad_norm": 0.30463576316833496, + "learning_rate": 6.828060981223512e-05, + "loss": 0.0072, + "step": 24710 + }, + { + "grad_norm": 0.2996367812156677, + "learning_rate": 6.825495711031634e-05, + "loss": 0.0118, + "step": 24720 + }, + { + "grad_norm": 0.21536804735660553, + "learning_rate": 6.822929886302359e-05, + "loss": 0.0103, + "step": 24730 + }, + { + "grad_norm": 0.2069932371377945, + "learning_rate": 6.820363507815116e-05, + "loss": 0.0076, + "step": 24740 + }, + { + "grad_norm": 0.2908797562122345, + "learning_rate": 6.817796576349501e-05, + "loss": 0.0098, + "step": 24750 + }, + { + "grad_norm": 0.2410261332988739, + "learning_rate": 6.815229092685285e-05, + "loss": 0.0067, + "step": 24760 + }, + { + "grad_norm": 0.29949018359184265, + "learning_rate": 6.812661057602399e-05, + "loss": 0.0076, + "step": 24770 + }, + { + "grad_norm": 0.3082233667373657, + "learning_rate": 6.810092471880943e-05, + "loss": 0.0117, + "step": 24780 + }, + { + "grad_norm": 0.24515505135059357, + "learning_rate": 6.807523336301187e-05, + "loss": 0.0105, + "step": 24790 + }, + { + "grad_norm": 0.1797519028186798, + "learning_rate": 6.804953651643566e-05, + "loss": 0.0088, + "step": 24800 + }, + { + "grad_norm": 0.16697648167610168, + "learning_rate": 6.802383418688685e-05, + "loss": 0.0089, + "step": 24810 + }, + { + "grad_norm": 0.20068761706352234, + "learning_rate": 6.799812638217309e-05, + "loss": 0.0069, + "step": 24820 + }, + { + "grad_norm": 0.21536289155483246, + "learning_rate": 6.797241311010373e-05, + "loss": 0.0093, + "step": 24830 + }, + { + "grad_norm": 0.25599241256713867, + "learning_rate": 6.794669437848982e-05, + "loss": 0.009, + "step": 24840 + }, + { + "grad_norm": 0.2785956859588623, + "learning_rate": 6.792097019514402e-05, + "loss": 0.0106, + "step": 24850 + }, + { + "grad_norm": 0.2358226478099823, + "learning_rate": 6.789524056788064e-05, + "loss": 0.0107, + "step": 24860 + }, + { + "grad_norm": 0.24556805193424225, + "learning_rate": 6.786950550451567e-05, + "loss": 0.0065, + "step": 24870 + }, + { + "grad_norm": 0.25816965103149414, + "learning_rate": 6.784376501286676e-05, + "loss": 0.0097, + "step": 24880 + }, + { + "grad_norm": 0.32570332288742065, + "learning_rate": 6.781801910075316e-05, + "loss": 0.0085, + "step": 24890 + }, + { + "grad_norm": 0.30192995071411133, + "learning_rate": 6.779226777599581e-05, + "loss": 0.0084, + "step": 24900 + }, + { + "grad_norm": 0.24069426953792572, + "learning_rate": 6.776651104641729e-05, + "loss": 0.0087, + "step": 24910 + }, + { + "grad_norm": 0.2186458706855774, + "learning_rate": 6.774074891984183e-05, + "loss": 0.0081, + "step": 24920 + }, + { + "grad_norm": 0.35617315769195557, + "learning_rate": 6.771498140409526e-05, + "loss": 0.009, + "step": 24930 + }, + { + "grad_norm": 0.3431120216846466, + "learning_rate": 6.768920850700506e-05, + "loss": 0.0117, + "step": 24940 + }, + { + "grad_norm": 0.30532458424568176, + "learning_rate": 6.766343023640039e-05, + "loss": 0.0105, + "step": 24950 + }, + { + "grad_norm": 0.24238221347332, + "learning_rate": 6.763764660011198e-05, + "loss": 0.008, + "step": 24960 + }, + { + "grad_norm": 0.32051047682762146, + "learning_rate": 6.761185760597223e-05, + "loss": 0.0158, + "step": 24970 + }, + { + "grad_norm": 0.3396850824356079, + "learning_rate": 6.758606326181515e-05, + "loss": 0.0097, + "step": 24980 + }, + { + "grad_norm": 0.1894432008266449, + "learning_rate": 6.75602635754764e-05, + "loss": 0.0093, + "step": 24990 + }, + { + "grad_norm": 0.27720701694488525, + "learning_rate": 6.75344585547932e-05, + "loss": 0.0078, + "step": 25000 + }, + { + "grad_norm": 0.25347939133644104, + "learning_rate": 6.750864820760449e-05, + "loss": 0.0121, + "step": 25010 + }, + { + "grad_norm": 0.1976553350687027, + "learning_rate": 6.748283254175072e-05, + "loss": 0.0104, + "step": 25020 + }, + { + "grad_norm": 0.24601860344409943, + "learning_rate": 6.745701156507404e-05, + "loss": 0.0084, + "step": 25030 + }, + { + "grad_norm": 0.2013106346130371, + "learning_rate": 6.743118528541818e-05, + "loss": 0.0082, + "step": 25040 + }, + { + "grad_norm": 0.28903135657310486, + "learning_rate": 6.740535371062846e-05, + "loss": 0.0101, + "step": 25050 + }, + { + "grad_norm": 0.31321632862091064, + "learning_rate": 6.737951684855185e-05, + "loss": 0.0107, + "step": 25060 + }, + { + "grad_norm": 0.25952136516571045, + "learning_rate": 6.735367470703691e-05, + "loss": 0.0079, + "step": 25070 + }, + { + "grad_norm": 0.26057928800582886, + "learning_rate": 6.732782729393379e-05, + "loss": 0.0073, + "step": 25080 + }, + { + "grad_norm": 0.22828325629234314, + "learning_rate": 6.730197461709425e-05, + "loss": 0.0087, + "step": 25090 + }, + { + "grad_norm": 0.21687515079975128, + "learning_rate": 6.727611668437164e-05, + "loss": 0.0087, + "step": 25100 + }, + { + "grad_norm": 0.28764843940734863, + "learning_rate": 6.725025350362094e-05, + "loss": 0.0124, + "step": 25110 + }, + { + "grad_norm": 0.2035594880580902, + "learning_rate": 6.72243850826987e-05, + "loss": 0.0098, + "step": 25120 + }, + { + "grad_norm": 0.33995723724365234, + "learning_rate": 6.719851142946305e-05, + "loss": 0.0076, + "step": 25130 + }, + { + "grad_norm": 0.32248228788375854, + "learning_rate": 6.717263255177372e-05, + "loss": 0.0108, + "step": 25140 + }, + { + "grad_norm": 0.3214641213417053, + "learning_rate": 6.714674845749205e-05, + "loss": 0.0122, + "step": 25150 + }, + { + "grad_norm": 0.2932858467102051, + "learning_rate": 6.712085915448092e-05, + "loss": 0.011, + "step": 25160 + }, + { + "grad_norm": 0.23090533912181854, + "learning_rate": 6.709496465060486e-05, + "loss": 0.0101, + "step": 25170 + }, + { + "grad_norm": 0.2181621938943863, + "learning_rate": 6.706906495372987e-05, + "loss": 0.0075, + "step": 25180 + }, + { + "grad_norm": 0.23707233369350433, + "learning_rate": 6.704316007172365e-05, + "loss": 0.0086, + "step": 25190 + }, + { + "grad_norm": 0.2770235538482666, + "learning_rate": 6.701725001245539e-05, + "loss": 0.0105, + "step": 25200 + }, + { + "grad_norm": 0.3039032220840454, + "learning_rate": 6.699133478379588e-05, + "loss": 0.0068, + "step": 25210 + }, + { + "grad_norm": 0.25725725293159485, + "learning_rate": 6.69654143936175e-05, + "loss": 0.0075, + "step": 25220 + }, + { + "grad_norm": 0.23771362006664276, + "learning_rate": 6.693948884979419e-05, + "loss": 0.0095, + "step": 25230 + }, + { + "grad_norm": 0.20604613423347473, + "learning_rate": 6.691355816020142e-05, + "loss": 0.0072, + "step": 25240 + }, + { + "grad_norm": 0.18242569267749786, + "learning_rate": 6.688762233271624e-05, + "loss": 0.0109, + "step": 25250 + }, + { + "grad_norm": 0.24743270874023438, + "learning_rate": 6.68616813752173e-05, + "loss": 0.009, + "step": 25260 + }, + { + "grad_norm": 0.2938723862171173, + "learning_rate": 6.683573529558477e-05, + "loss": 0.012, + "step": 25270 + }, + { + "grad_norm": 0.3096805810928345, + "learning_rate": 6.680978410170037e-05, + "loss": 0.0084, + "step": 25280 + }, + { + "grad_norm": 0.25070205330848694, + "learning_rate": 6.678382780144741e-05, + "loss": 0.0085, + "step": 25290 + }, + { + "grad_norm": 0.22265048325061798, + "learning_rate": 6.675786640271071e-05, + "loss": 0.0084, + "step": 25300 + }, + { + "grad_norm": 0.2640604078769684, + "learning_rate": 6.673189991337665e-05, + "loss": 0.0098, + "step": 25310 + }, + { + "grad_norm": 0.2184966802597046, + "learning_rate": 6.670592834133317e-05, + "loss": 0.0088, + "step": 25320 + }, + { + "grad_norm": 0.26096197962760925, + "learning_rate": 6.667995169446979e-05, + "loss": 0.0107, + "step": 25330 + }, + { + "grad_norm": 0.30984076857566833, + "learning_rate": 6.665396998067747e-05, + "loss": 0.008, + "step": 25340 + }, + { + "grad_norm": 0.29806143045425415, + "learning_rate": 6.66279832078488e-05, + "loss": 0.0092, + "step": 25350 + }, + { + "grad_norm": 0.3218159079551697, + "learning_rate": 6.660199138387786e-05, + "loss": 0.0074, + "step": 25360 + }, + { + "grad_norm": 0.41157570481300354, + "learning_rate": 6.65759945166603e-05, + "loss": 0.0129, + "step": 25370 + }, + { + "grad_norm": 0.2378745824098587, + "learning_rate": 6.654999261409326e-05, + "loss": 0.0094, + "step": 25380 + }, + { + "grad_norm": 0.23097263276576996, + "learning_rate": 6.652398568407544e-05, + "loss": 0.0093, + "step": 25390 + }, + { + "grad_norm": 0.20936210453510284, + "learning_rate": 6.649797373450707e-05, + "loss": 0.0081, + "step": 25400 + }, + { + "grad_norm": 0.24232088029384613, + "learning_rate": 6.647195677328988e-05, + "loss": 0.0077, + "step": 25410 + }, + { + "grad_norm": 0.2789475619792938, + "learning_rate": 6.644593480832712e-05, + "loss": 0.0101, + "step": 25420 + }, + { + "grad_norm": 0.3859996497631073, + "learning_rate": 6.641990784752363e-05, + "loss": 0.0109, + "step": 25430 + }, + { + "grad_norm": 0.2903768718242645, + "learning_rate": 6.639387589878566e-05, + "loss": 0.0134, + "step": 25440 + }, + { + "grad_norm": 0.30548423528671265, + "learning_rate": 6.636783897002103e-05, + "loss": 0.012, + "step": 25450 + }, + { + "grad_norm": 0.2943495512008667, + "learning_rate": 6.63417970691391e-05, + "loss": 0.0127, + "step": 25460 + }, + { + "grad_norm": 0.21564921736717224, + "learning_rate": 6.63157502040507e-05, + "loss": 0.0075, + "step": 25470 + }, + { + "grad_norm": 0.32418978214263916, + "learning_rate": 6.628969838266819e-05, + "loss": 0.0077, + "step": 25480 + }, + { + "grad_norm": 0.25460556149482727, + "learning_rate": 6.626364161290541e-05, + "loss": 0.0097, + "step": 25490 + }, + { + "grad_norm": 0.3148524761199951, + "learning_rate": 6.623757990267774e-05, + "loss": 0.0103, + "step": 25500 + }, + { + "grad_norm": 0.289489209651947, + "learning_rate": 6.621151325990201e-05, + "loss": 0.0138, + "step": 25510 + }, + { + "grad_norm": 0.2804276943206787, + "learning_rate": 6.618544169249657e-05, + "loss": 0.0107, + "step": 25520 + }, + { + "grad_norm": 0.2988085448741913, + "learning_rate": 6.615936520838133e-05, + "loss": 0.0132, + "step": 25530 + }, + { + "grad_norm": 0.26156941056251526, + "learning_rate": 6.613328381547759e-05, + "loss": 0.0092, + "step": 25540 + }, + { + "grad_norm": 0.2910844385623932, + "learning_rate": 6.610719752170821e-05, + "loss": 0.0087, + "step": 25550 + }, + { + "grad_norm": 0.19885356724262238, + "learning_rate": 6.60811063349975e-05, + "loss": 0.0117, + "step": 25560 + }, + { + "grad_norm": 0.2849405109882355, + "learning_rate": 6.605501026327127e-05, + "loss": 0.0114, + "step": 25570 + }, + { + "grad_norm": 0.31312671303749084, + "learning_rate": 6.602890931445685e-05, + "loss": 0.0081, + "step": 25580 + }, + { + "grad_norm": 0.29845884442329407, + "learning_rate": 6.6002803496483e-05, + "loss": 0.0107, + "step": 25590 + }, + { + "grad_norm": 0.24458935856819153, + "learning_rate": 6.597669281727997e-05, + "loss": 0.0099, + "step": 25600 + }, + { + "grad_norm": 0.2881734371185303, + "learning_rate": 6.595057728477949e-05, + "loss": 0.0087, + "step": 25610 + }, + { + "grad_norm": 0.27456358075141907, + "learning_rate": 6.59244569069148e-05, + "loss": 0.0093, + "step": 25620 + }, + { + "grad_norm": 0.3113868832588196, + "learning_rate": 6.589833169162054e-05, + "loss": 0.0099, + "step": 25630 + }, + { + "grad_norm": 0.21675272285938263, + "learning_rate": 6.587220164683291e-05, + "loss": 0.0093, + "step": 25640 + }, + { + "grad_norm": 0.2244863361120224, + "learning_rate": 6.58460667804895e-05, + "loss": 0.0094, + "step": 25650 + }, + { + "grad_norm": 0.23073671758174896, + "learning_rate": 6.581992710052938e-05, + "loss": 0.0089, + "step": 25660 + }, + { + "grad_norm": 0.2773357629776001, + "learning_rate": 6.579378261489311e-05, + "loss": 0.0082, + "step": 25670 + }, + { + "grad_norm": 0.34319114685058594, + "learning_rate": 6.576763333152268e-05, + "loss": 0.0095, + "step": 25680 + }, + { + "grad_norm": 0.27868327498435974, + "learning_rate": 6.574147925836159e-05, + "loss": 0.0099, + "step": 25690 + }, + { + "grad_norm": 0.34104734659194946, + "learning_rate": 6.571532040335472e-05, + "loss": 0.0118, + "step": 25700 + }, + { + "grad_norm": 0.24662648141384125, + "learning_rate": 6.568915677444845e-05, + "loss": 0.0082, + "step": 25710 + }, + { + "grad_norm": 0.28442811965942383, + "learning_rate": 6.56629883795906e-05, + "loss": 0.0096, + "step": 25720 + }, + { + "grad_norm": 0.33433881402015686, + "learning_rate": 6.563681522673043e-05, + "loss": 0.0112, + "step": 25730 + }, + { + "grad_norm": 0.27193188667297363, + "learning_rate": 6.561063732381867e-05, + "loss": 0.0093, + "step": 25740 + }, + { + "grad_norm": 0.305787056684494, + "learning_rate": 6.558445467880745e-05, + "loss": 0.0086, + "step": 25750 + }, + { + "grad_norm": 0.28165528178215027, + "learning_rate": 6.55582672996504e-05, + "loss": 0.0094, + "step": 25760 + }, + { + "grad_norm": 0.29080915451049805, + "learning_rate": 6.553207519430253e-05, + "loss": 0.0078, + "step": 25770 + }, + { + "grad_norm": 0.3609759211540222, + "learning_rate": 6.550587837072032e-05, + "loss": 0.013, + "step": 25780 + }, + { + "grad_norm": 0.2628922760486603, + "learning_rate": 6.547967683686166e-05, + "loss": 0.0091, + "step": 25790 + }, + { + "grad_norm": 0.38569116592407227, + "learning_rate": 6.545347060068591e-05, + "loss": 0.0088, + "step": 25800 + }, + { + "grad_norm": 0.3310096859931946, + "learning_rate": 6.542725967015382e-05, + "loss": 0.0075, + "step": 25810 + }, + { + "grad_norm": 0.27954334020614624, + "learning_rate": 6.540104405322757e-05, + "loss": 0.0075, + "step": 25820 + }, + { + "grad_norm": 0.23628857731819153, + "learning_rate": 6.537482375787077e-05, + "loss": 0.0083, + "step": 25830 + }, + { + "grad_norm": 0.27185583114624023, + "learning_rate": 6.534859879204845e-05, + "loss": 0.01, + "step": 25840 + }, + { + "grad_norm": 0.35953018069267273, + "learning_rate": 6.532236916372709e-05, + "loss": 0.0104, + "step": 25850 + }, + { + "grad_norm": 0.2615014314651489, + "learning_rate": 6.529613488087454e-05, + "loss": 0.0107, + "step": 25860 + }, + { + "grad_norm": 0.256808340549469, + "learning_rate": 6.526989595146009e-05, + "loss": 0.0088, + "step": 25870 + }, + { + "grad_norm": 0.29117870330810547, + "learning_rate": 6.524365238345441e-05, + "loss": 0.0088, + "step": 25880 + }, + { + "grad_norm": 0.3596153259277344, + "learning_rate": 6.521740418482964e-05, + "loss": 0.0108, + "step": 25890 + }, + { + "grad_norm": 0.22378768026828766, + "learning_rate": 6.519115136355925e-05, + "loss": 0.0105, + "step": 25900 + }, + { + "grad_norm": 0.23168601095676422, + "learning_rate": 6.51648939276182e-05, + "loss": 0.0092, + "step": 25910 + }, + { + "grad_norm": 0.19651584327220917, + "learning_rate": 6.513863188498277e-05, + "loss": 0.0074, + "step": 25920 + }, + { + "grad_norm": 0.18426480889320374, + "learning_rate": 6.511236524363068e-05, + "loss": 0.0072, + "step": 25930 + }, + { + "grad_norm": 0.2757687270641327, + "learning_rate": 6.508609401154104e-05, + "loss": 0.0082, + "step": 25940 + }, + { + "grad_norm": 0.3203938901424408, + "learning_rate": 6.505981819669439e-05, + "loss": 0.0095, + "step": 25950 + }, + { + "grad_norm": 0.26939141750335693, + "learning_rate": 6.503353780707258e-05, + "loss": 0.0092, + "step": 25960 + }, + { + "grad_norm": 0.38569483160972595, + "learning_rate": 6.500725285065895e-05, + "loss": 0.0105, + "step": 25970 + }, + { + "grad_norm": 0.18550324440002441, + "learning_rate": 6.498096333543813e-05, + "loss": 0.0082, + "step": 25980 + }, + { + "grad_norm": 0.2904600203037262, + "learning_rate": 6.49546692693962e-05, + "loss": 0.0127, + "step": 25990 + }, + { + "grad_norm": 0.36599794030189514, + "learning_rate": 6.492837066052059e-05, + "loss": 0.0104, + "step": 26000 + }, + { + "grad_norm": 0.2258162796497345, + "learning_rate": 6.490206751680014e-05, + "loss": 0.0086, + "step": 26010 + }, + { + "grad_norm": 0.32688409090042114, + "learning_rate": 6.487575984622505e-05, + "loss": 0.0091, + "step": 26020 + }, + { + "grad_norm": 0.25327086448669434, + "learning_rate": 6.484944765678689e-05, + "loss": 0.0118, + "step": 26030 + }, + { + "grad_norm": 0.19642053544521332, + "learning_rate": 6.482313095647861e-05, + "loss": 0.008, + "step": 26040 + }, + { + "grad_norm": 0.18546327948570251, + "learning_rate": 6.479680975329451e-05, + "loss": 0.0075, + "step": 26050 + }, + { + "grad_norm": 0.23450107872486115, + "learning_rate": 6.477048405523031e-05, + "loss": 0.011, + "step": 26060 + }, + { + "grad_norm": 0.2481645941734314, + "learning_rate": 6.474415387028304e-05, + "loss": 0.0064, + "step": 26070 + }, + { + "grad_norm": 0.26536768674850464, + "learning_rate": 6.471781920645114e-05, + "loss": 0.0094, + "step": 26080 + }, + { + "grad_norm": 0.45446938276290894, + "learning_rate": 6.469148007173434e-05, + "loss": 0.0085, + "step": 26090 + }, + { + "grad_norm": 0.24124768376350403, + "learning_rate": 6.466513647413381e-05, + "loss": 0.0121, + "step": 26100 + }, + { + "grad_norm": 0.29521143436431885, + "learning_rate": 6.463878842165203e-05, + "loss": 0.0094, + "step": 26110 + }, + { + "grad_norm": 0.2847363352775574, + "learning_rate": 6.461243592229286e-05, + "loss": 0.0085, + "step": 26120 + }, + { + "grad_norm": 0.21861809492111206, + "learning_rate": 6.458607898406146e-05, + "loss": 0.0075, + "step": 26130 + }, + { + "grad_norm": 0.19682231545448303, + "learning_rate": 6.455971761496439e-05, + "loss": 0.0065, + "step": 26140 + }, + { + "grad_norm": 0.25846272706985474, + "learning_rate": 6.453335182300953e-05, + "loss": 0.0105, + "step": 26150 + }, + { + "grad_norm": 0.2956348657608032, + "learning_rate": 6.450698161620612e-05, + "loss": 0.009, + "step": 26160 + }, + { + "grad_norm": 0.28191423416137695, + "learning_rate": 6.448060700256473e-05, + "loss": 0.0097, + "step": 26170 + }, + { + "grad_norm": 0.23963071405887604, + "learning_rate": 6.445422799009726e-05, + "loss": 0.0123, + "step": 26180 + }, + { + "grad_norm": 0.28072118759155273, + "learning_rate": 6.442784458681699e-05, + "loss": 0.0073, + "step": 26190 + }, + { + "grad_norm": 0.2605874240398407, + "learning_rate": 6.440145680073847e-05, + "loss": 0.0104, + "step": 26200 + }, + { + "grad_norm": 0.24629166722297668, + "learning_rate": 6.437506463987762e-05, + "loss": 0.0096, + "step": 26210 + }, + { + "grad_norm": 0.25993961095809937, + "learning_rate": 6.434866811225168e-05, + "loss": 0.0102, + "step": 26220 + }, + { + "grad_norm": 0.3774401843547821, + "learning_rate": 6.432226722587923e-05, + "loss": 0.0101, + "step": 26230 + }, + { + "grad_norm": 0.2263116091489792, + "learning_rate": 6.429586198878015e-05, + "loss": 0.0078, + "step": 26240 + }, + { + "grad_norm": 0.24977029860019684, + "learning_rate": 6.426945240897566e-05, + "loss": 0.009, + "step": 26250 + }, + { + "grad_norm": 0.21061046421527863, + "learning_rate": 6.424303849448829e-05, + "loss": 0.0086, + "step": 26260 + }, + { + "grad_norm": 0.2580801546573639, + "learning_rate": 6.42166202533419e-05, + "loss": 0.0088, + "step": 26270 + }, + { + "grad_norm": 0.2564726769924164, + "learning_rate": 6.419019769356164e-05, + "loss": 0.0079, + "step": 26280 + }, + { + "grad_norm": 0.23917706310749054, + "learning_rate": 6.416377082317398e-05, + "loss": 0.0107, + "step": 26290 + }, + { + "grad_norm": 0.21972177922725677, + "learning_rate": 6.413733965020674e-05, + "loss": 0.0074, + "step": 26300 + }, + { + "grad_norm": 0.25634920597076416, + "learning_rate": 6.411090418268896e-05, + "loss": 0.0087, + "step": 26310 + }, + { + "grad_norm": 0.27690601348876953, + "learning_rate": 6.408446442865109e-05, + "loss": 0.0073, + "step": 26320 + }, + { + "grad_norm": 0.3255150318145752, + "learning_rate": 6.405802039612479e-05, + "loss": 0.0107, + "step": 26330 + }, + { + "grad_norm": 0.3836117684841156, + "learning_rate": 6.403157209314308e-05, + "loss": 0.0085, + "step": 26340 + }, + { + "grad_norm": 0.34810012578964233, + "learning_rate": 6.400511952774024e-05, + "loss": 0.009, + "step": 26350 + }, + { + "grad_norm": 0.23736333847045898, + "learning_rate": 6.397866270795187e-05, + "loss": 0.0083, + "step": 26360 + }, + { + "grad_norm": 0.2519403100013733, + "learning_rate": 6.395220164181489e-05, + "loss": 0.0091, + "step": 26370 + }, + { + "grad_norm": 0.292632520198822, + "learning_rate": 6.39257363373674e-05, + "loss": 0.0114, + "step": 26380 + }, + { + "grad_norm": 0.23322105407714844, + "learning_rate": 6.389926680264892e-05, + "loss": 0.0062, + "step": 26390 + }, + { + "grad_norm": 0.20453332364559174, + "learning_rate": 6.387279304570017e-05, + "loss": 0.0075, + "step": 26400 + }, + { + "grad_norm": 0.3414888381958008, + "learning_rate": 6.384631507456319e-05, + "loss": 0.0086, + "step": 26410 + }, + { + "grad_norm": 0.3276098072528839, + "learning_rate": 6.381983289728126e-05, + "loss": 0.0077, + "step": 26420 + }, + { + "grad_norm": 0.22395369410514832, + "learning_rate": 6.3793346521899e-05, + "loss": 0.009, + "step": 26430 + }, + { + "grad_norm": 0.17065633833408356, + "learning_rate": 6.376685595646226e-05, + "loss": 0.0085, + "step": 26440 + }, + { + "grad_norm": 0.31599608063697815, + "learning_rate": 6.374036120901816e-05, + "loss": 0.0101, + "step": 26450 + }, + { + "grad_norm": 0.34175431728363037, + "learning_rate": 6.371386228761514e-05, + "loss": 0.0087, + "step": 26460 + }, + { + "grad_norm": 0.27586376667022705, + "learning_rate": 6.368735920030283e-05, + "loss": 0.0072, + "step": 26470 + }, + { + "grad_norm": 0.3355049788951874, + "learning_rate": 6.366085195513218e-05, + "loss": 0.0076, + "step": 26480 + }, + { + "grad_norm": 0.22536924481391907, + "learning_rate": 6.363434056015543e-05, + "loss": 0.0081, + "step": 26490 + }, + { + "grad_norm": 0.2461698353290558, + "learning_rate": 6.360782502342599e-05, + "loss": 0.009, + "step": 26500 + }, + { + "grad_norm": 0.3334399163722992, + "learning_rate": 6.358130535299862e-05, + "loss": 0.0096, + "step": 26510 + }, + { + "grad_norm": 0.2501167953014374, + "learning_rate": 6.355478155692926e-05, + "loss": 0.012, + "step": 26520 + }, + { + "grad_norm": 0.16247116029262543, + "learning_rate": 6.352825364327517e-05, + "loss": 0.0073, + "step": 26530 + }, + { + "grad_norm": 0.27164220809936523, + "learning_rate": 6.350172162009482e-05, + "loss": 0.0105, + "step": 26540 + }, + { + "grad_norm": 0.33109942078590393, + "learning_rate": 6.347518549544793e-05, + "loss": 0.0097, + "step": 26550 + }, + { + "grad_norm": 0.3005988895893097, + "learning_rate": 6.344864527739547e-05, + "loss": 0.0082, + "step": 26560 + }, + { + "grad_norm": 0.28174588084220886, + "learning_rate": 6.342210097399966e-05, + "loss": 0.0101, + "step": 26570 + }, + { + "grad_norm": 0.3186158835887909, + "learning_rate": 6.339555259332398e-05, + "loss": 0.0105, + "step": 26580 + }, + { + "grad_norm": 0.22998090088367462, + "learning_rate": 6.33690001434331e-05, + "loss": 0.0082, + "step": 26590 + }, + { + "grad_norm": 0.2678934633731842, + "learning_rate": 6.334244363239296e-05, + "loss": 0.0115, + "step": 26600 + }, + { + "grad_norm": 0.2527500092983246, + "learning_rate": 6.331588306827073e-05, + "loss": 0.0097, + "step": 26610 + }, + { + "grad_norm": 0.2242431938648224, + "learning_rate": 6.328931845913483e-05, + "loss": 0.0083, + "step": 26620 + }, + { + "grad_norm": 0.23258468508720398, + "learning_rate": 6.326274981305484e-05, + "loss": 0.0091, + "step": 26630 + }, + { + "grad_norm": 0.2950723469257355, + "learning_rate": 6.323617713810166e-05, + "loss": 0.0136, + "step": 26640 + }, + { + "grad_norm": 0.2832319438457489, + "learning_rate": 6.320960044234734e-05, + "loss": 0.0102, + "step": 26650 + }, + { + "grad_norm": 0.22223219275474548, + "learning_rate": 6.318301973386518e-05, + "loss": 0.009, + "step": 26660 + }, + { + "grad_norm": 0.27043962478637695, + "learning_rate": 6.315643502072971e-05, + "loss": 0.0081, + "step": 26670 + }, + { + "grad_norm": 0.23619243502616882, + "learning_rate": 6.312984631101667e-05, + "loss": 0.0085, + "step": 26680 + }, + { + "grad_norm": 0.17916107177734375, + "learning_rate": 6.310325361280297e-05, + "loss": 0.0075, + "step": 26690 + }, + { + "grad_norm": 0.2921355664730072, + "learning_rate": 6.30766569341668e-05, + "loss": 0.0081, + "step": 26700 + }, + { + "grad_norm": 0.2554694712162018, + "learning_rate": 6.305005628318753e-05, + "loss": 0.0108, + "step": 26710 + }, + { + "grad_norm": 0.28823018074035645, + "learning_rate": 6.302345166794572e-05, + "loss": 0.01, + "step": 26720 + }, + { + "grad_norm": 0.3092280626296997, + "learning_rate": 6.299684309652316e-05, + "loss": 0.0105, + "step": 26730 + }, + { + "grad_norm": 0.21332131326198578, + "learning_rate": 6.297023057700283e-05, + "loss": 0.0078, + "step": 26740 + }, + { + "grad_norm": 0.2608703076839447, + "learning_rate": 6.294361411746891e-05, + "loss": 0.009, + "step": 26750 + }, + { + "grad_norm": 0.251953125, + "learning_rate": 6.291699372600677e-05, + "loss": 0.0097, + "step": 26760 + }, + { + "grad_norm": 0.18997237086296082, + "learning_rate": 6.2890369410703e-05, + "loss": 0.0088, + "step": 26770 + }, + { + "grad_norm": 0.22552882134914398, + "learning_rate": 6.286374117964534e-05, + "loss": 0.0088, + "step": 26780 + }, + { + "grad_norm": 0.347776859998703, + "learning_rate": 6.283710904092277e-05, + "loss": 0.0081, + "step": 26790 + }, + { + "grad_norm": 0.29550039768218994, + "learning_rate": 6.281047300262542e-05, + "loss": 0.0102, + "step": 26800 + }, + { + "grad_norm": 0.4152955412864685, + "learning_rate": 6.278383307284461e-05, + "loss": 0.01, + "step": 26810 + }, + { + "grad_norm": 0.29928719997406006, + "learning_rate": 6.275718925967284e-05, + "loss": 0.0083, + "step": 26820 + }, + { + "grad_norm": 0.3078267574310303, + "learning_rate": 6.273054157120382e-05, + "loss": 0.0096, + "step": 26830 + }, + { + "grad_norm": 0.28713661432266235, + "learning_rate": 6.270389001553238e-05, + "loss": 0.0108, + "step": 26840 + }, + { + "grad_norm": 0.3074244558811188, + "learning_rate": 6.26772346007546e-05, + "loss": 0.0098, + "step": 26850 + }, + { + "grad_norm": 0.32200419902801514, + "learning_rate": 6.265057533496767e-05, + "loss": 0.0117, + "step": 26860 + }, + { + "grad_norm": 0.3505547046661377, + "learning_rate": 6.262391222626997e-05, + "loss": 0.0098, + "step": 26870 + }, + { + "grad_norm": 0.3319917321205139, + "learning_rate": 6.259724528276106e-05, + "loss": 0.012, + "step": 26880 + }, + { + "grad_norm": 0.2519925534725189, + "learning_rate": 6.257057451254162e-05, + "loss": 0.0071, + "step": 26890 + }, + { + "grad_norm": 0.24205760657787323, + "learning_rate": 6.254389992371357e-05, + "loss": 0.0102, + "step": 26900 + }, + { + "grad_norm": 0.29665759205818176, + "learning_rate": 6.25172215243799e-05, + "loss": 0.0099, + "step": 26910 + }, + { + "grad_norm": 0.21631327271461487, + "learning_rate": 6.249053932264486e-05, + "loss": 0.0075, + "step": 26920 + }, + { + "grad_norm": 0.2641735374927521, + "learning_rate": 6.246385332661376e-05, + "loss": 0.0079, + "step": 26930 + }, + { + "grad_norm": 0.25667640566825867, + "learning_rate": 6.24371635443931e-05, + "loss": 0.0133, + "step": 26940 + }, + { + "grad_norm": 0.23632048070430756, + "learning_rate": 6.241046998409054e-05, + "loss": 0.0092, + "step": 26950 + }, + { + "grad_norm": 0.2546229660511017, + "learning_rate": 6.238377265381489e-05, + "loss": 0.0099, + "step": 26960 + }, + { + "grad_norm": 0.2508358657360077, + "learning_rate": 6.235707156167607e-05, + "loss": 0.0089, + "step": 26970 + }, + { + "grad_norm": 0.26633042097091675, + "learning_rate": 6.233036671578519e-05, + "loss": 0.0086, + "step": 26980 + }, + { + "grad_norm": 0.1944900006055832, + "learning_rate": 6.230365812425445e-05, + "loss": 0.0111, + "step": 26990 + }, + { + "grad_norm": 0.3162895441055298, + "learning_rate": 6.227694579519724e-05, + "loss": 0.0086, + "step": 27000 + }, + { + "grad_norm": 0.2586805820465088, + "learning_rate": 6.225022973672805e-05, + "loss": 0.0104, + "step": 27010 + }, + { + "grad_norm": 0.27403485774993896, + "learning_rate": 6.222350995696253e-05, + "loss": 0.0083, + "step": 27020 + }, + { + "grad_norm": 0.2791021466255188, + "learning_rate": 6.21967864640174e-05, + "loss": 0.0083, + "step": 27030 + }, + { + "grad_norm": 0.22853180766105652, + "learning_rate": 6.217005926601059e-05, + "loss": 0.0081, + "step": 27040 + }, + { + "grad_norm": 0.25925320386886597, + "learning_rate": 6.214332837106111e-05, + "loss": 0.008, + "step": 27050 + }, + { + "grad_norm": 0.24230362474918365, + "learning_rate": 6.21165937872891e-05, + "loss": 0.0072, + "step": 27060 + }, + { + "grad_norm": 0.2683258652687073, + "learning_rate": 6.208985552281582e-05, + "loss": 0.0083, + "step": 27070 + }, + { + "grad_norm": 0.299369752407074, + "learning_rate": 6.206311358576364e-05, + "loss": 0.0098, + "step": 27080 + }, + { + "grad_norm": 0.2882424592971802, + "learning_rate": 6.203636798425608e-05, + "loss": 0.0114, + "step": 27090 + }, + { + "grad_norm": 0.2645210325717926, + "learning_rate": 6.20096187264177e-05, + "loss": 0.0116, + "step": 27100 + }, + { + "grad_norm": 0.2970835566520691, + "learning_rate": 6.198286582037425e-05, + "loss": 0.011, + "step": 27110 + }, + { + "grad_norm": 0.19154250621795654, + "learning_rate": 6.195610927425256e-05, + "loss": 0.0095, + "step": 27120 + }, + { + "grad_norm": 0.26174408197402954, + "learning_rate": 6.192934909618056e-05, + "loss": 0.0109, + "step": 27130 + }, + { + "grad_norm": 0.2742542326450348, + "learning_rate": 6.190258529428728e-05, + "loss": 0.0092, + "step": 27140 + }, + { + "grad_norm": 0.2624475061893463, + "learning_rate": 6.187581787670285e-05, + "loss": 0.0097, + "step": 27150 + }, + { + "grad_norm": 0.19591739773750305, + "learning_rate": 6.184904685155852e-05, + "loss": 0.0092, + "step": 27160 + }, + { + "grad_norm": 0.270652174949646, + "learning_rate": 6.18222722269866e-05, + "loss": 0.0082, + "step": 27170 + }, + { + "grad_norm": 0.28079813718795776, + "learning_rate": 6.179549401112053e-05, + "loss": 0.0158, + "step": 27180 + }, + { + "grad_norm": 0.26096683740615845, + "learning_rate": 6.176871221209482e-05, + "loss": 0.0104, + "step": 27190 + }, + { + "grad_norm": 0.25848788022994995, + "learning_rate": 6.174192683804508e-05, + "loss": 0.0087, + "step": 27200 + }, + { + "grad_norm": 0.27694791555404663, + "learning_rate": 6.1715137897108e-05, + "loss": 0.0076, + "step": 27210 + }, + { + "grad_norm": 0.2889971137046814, + "learning_rate": 6.168834539742134e-05, + "loss": 0.0077, + "step": 27220 + }, + { + "grad_norm": 0.19497044384479523, + "learning_rate": 6.166154934712397e-05, + "loss": 0.0082, + "step": 27230 + }, + { + "grad_norm": 0.2787424325942993, + "learning_rate": 6.163474975435581e-05, + "loss": 0.0075, + "step": 27240 + }, + { + "grad_norm": 0.25081756711006165, + "learning_rate": 6.160794662725787e-05, + "loss": 0.0088, + "step": 27250 + }, + { + "grad_norm": 0.20029161870479584, + "learning_rate": 6.158113997397222e-05, + "loss": 0.0093, + "step": 27260 + }, + { + "grad_norm": 0.25263383984565735, + "learning_rate": 6.155432980264205e-05, + "loss": 0.0089, + "step": 27270 + }, + { + "grad_norm": 0.23779967427253723, + "learning_rate": 6.152751612141156e-05, + "loss": 0.0085, + "step": 27280 + }, + { + "grad_norm": 0.24605008959770203, + "learning_rate": 6.150069893842602e-05, + "loss": 0.0096, + "step": 27290 + }, + { + "grad_norm": 0.3267499804496765, + "learning_rate": 6.147387826183182e-05, + "loss": 0.0099, + "step": 27300 + }, + { + "grad_norm": 0.26676827669143677, + "learning_rate": 6.144705409977635e-05, + "loss": 0.0081, + "step": 27310 + }, + { + "grad_norm": 0.22183412313461304, + "learning_rate": 6.142022646040808e-05, + "loss": 0.01, + "step": 27320 + }, + { + "grad_norm": 0.30547916889190674, + "learning_rate": 6.139339535187653e-05, + "loss": 0.0076, + "step": 27330 + }, + { + "grad_norm": 0.23658980429172516, + "learning_rate": 6.136656078233232e-05, + "loss": 0.0086, + "step": 27340 + }, + { + "grad_norm": 0.20363584160804749, + "learning_rate": 6.133972275992707e-05, + "loss": 0.0117, + "step": 27350 + }, + { + "grad_norm": 0.22415928542613983, + "learning_rate": 6.131288129281342e-05, + "loss": 0.0099, + "step": 27360 + }, + { + "grad_norm": 0.283632755279541, + "learning_rate": 6.128603638914516e-05, + "loss": 0.0106, + "step": 27370 + }, + { + "grad_norm": 0.2705747187137604, + "learning_rate": 6.125918805707704e-05, + "loss": 0.0078, + "step": 27380 + }, + { + "grad_norm": 0.2821328639984131, + "learning_rate": 6.123233630476485e-05, + "loss": 0.0109, + "step": 27390 + }, + { + "grad_norm": 0.26058027148246765, + "learning_rate": 6.120548114036547e-05, + "loss": 0.0101, + "step": 27400 + }, + { + "grad_norm": 0.21678268909454346, + "learning_rate": 6.117862257203679e-05, + "loss": 0.009, + "step": 27410 + }, + { + "grad_norm": 0.24432334303855896, + "learning_rate": 6.115176060793771e-05, + "loss": 0.0107, + "step": 27420 + }, + { + "grad_norm": 0.21130980551242828, + "learning_rate": 6.112489525622822e-05, + "loss": 0.0072, + "step": 27430 + }, + { + "grad_norm": 0.18233679234981537, + "learning_rate": 6.109802652506928e-05, + "loss": 0.0069, + "step": 27440 + }, + { + "grad_norm": 0.1726444810628891, + "learning_rate": 6.107115442262291e-05, + "loss": 0.0083, + "step": 27450 + }, + { + "grad_norm": 0.26311635971069336, + "learning_rate": 6.104427895705214e-05, + "loss": 0.0141, + "step": 27460 + }, + { + "grad_norm": 0.2378009408712387, + "learning_rate": 6.101740013652103e-05, + "loss": 0.0073, + "step": 27470 + }, + { + "grad_norm": 0.238592267036438, + "learning_rate": 6.099051796919465e-05, + "loss": 0.0088, + "step": 27480 + }, + { + "grad_norm": 0.2625325620174408, + "learning_rate": 6.096363246323911e-05, + "loss": 0.0113, + "step": 27490 + }, + { + "grad_norm": 0.2785114645957947, + "learning_rate": 6.0936743626821504e-05, + "loss": 0.0101, + "step": 27500 + }, + { + "grad_norm": 0.28394466638565063, + "learning_rate": 6.090985146810996e-05, + "loss": 0.0087, + "step": 27510 + }, + { + "grad_norm": 0.2965015769004822, + "learning_rate": 6.088295599527357e-05, + "loss": 0.0101, + "step": 27520 + }, + { + "grad_norm": 0.2593602240085602, + "learning_rate": 6.085605721648252e-05, + "loss": 0.0059, + "step": 27530 + }, + { + "grad_norm": 0.34770137071609497, + "learning_rate": 6.082915513990792e-05, + "loss": 0.0088, + "step": 27540 + }, + { + "grad_norm": 0.2729474902153015, + "learning_rate": 6.080224977372192e-05, + "loss": 0.0065, + "step": 27550 + }, + { + "grad_norm": 0.3115181624889374, + "learning_rate": 6.0775341126097666e-05, + "loss": 0.0071, + "step": 27560 + }, + { + "grad_norm": 0.21085843443870544, + "learning_rate": 6.074842920520926e-05, + "loss": 0.009, + "step": 27570 + }, + { + "grad_norm": 0.21746951341629028, + "learning_rate": 6.072151401923186e-05, + "loss": 0.0066, + "step": 27580 + }, + { + "grad_norm": 0.21931703388690948, + "learning_rate": 6.069459557634159e-05, + "loss": 0.0064, + "step": 27590 + }, + { + "grad_norm": 0.22384639084339142, + "learning_rate": 6.066767388471557e-05, + "loss": 0.0082, + "step": 27600 + }, + { + "grad_norm": 0.3302254378795624, + "learning_rate": 6.064074895253188e-05, + "loss": 0.0091, + "step": 27610 + }, + { + "grad_norm": 0.18144726753234863, + "learning_rate": 6.061382078796961e-05, + "loss": 0.0067, + "step": 27620 + }, + { + "grad_norm": 0.20308656990528107, + "learning_rate": 6.0586889399208814e-05, + "loss": 0.0077, + "step": 27630 + }, + { + "grad_norm": 0.3091198205947876, + "learning_rate": 6.0559954794430565e-05, + "loss": 0.0084, + "step": 27640 + }, + { + "grad_norm": 0.3475000560283661, + "learning_rate": 6.053301698181687e-05, + "loss": 0.0117, + "step": 27650 + }, + { + "grad_norm": 0.259026437997818, + "learning_rate": 6.0506075969550725e-05, + "loss": 0.0087, + "step": 27660 + }, + { + "grad_norm": 0.2733376920223236, + "learning_rate": 6.047913176581609e-05, + "loss": 0.0084, + "step": 27670 + }, + { + "grad_norm": 0.24698317050933838, + "learning_rate": 6.0452184378797904e-05, + "loss": 0.0082, + "step": 27680 + }, + { + "grad_norm": 0.41196587681770325, + "learning_rate": 6.042523381668209e-05, + "loss": 0.0078, + "step": 27690 + }, + { + "grad_norm": 0.2769913077354431, + "learning_rate": 6.03982800876555e-05, + "loss": 0.009, + "step": 27700 + }, + { + "grad_norm": 0.18807680904865265, + "learning_rate": 6.0371323199905975e-05, + "loss": 0.0077, + "step": 27710 + }, + { + "grad_norm": 0.3096144199371338, + "learning_rate": 6.03443631616223e-05, + "loss": 0.008, + "step": 27720 + }, + { + "grad_norm": 0.22993217408657074, + "learning_rate": 6.031739998099421e-05, + "loss": 0.0124, + "step": 27730 + }, + { + "grad_norm": 0.2718513011932373, + "learning_rate": 6.029043366621243e-05, + "loss": 0.0093, + "step": 27740 + }, + { + "grad_norm": 0.21488773822784424, + "learning_rate": 6.0263464225468615e-05, + "loss": 0.0094, + "step": 27750 + }, + { + "grad_norm": 0.21636754274368286, + "learning_rate": 6.023649166695534e-05, + "loss": 0.0085, + "step": 27760 + }, + { + "grad_norm": 0.2927066683769226, + "learning_rate": 6.0209515998866186e-05, + "loss": 0.0082, + "step": 27770 + }, + { + "grad_norm": 0.26456037163734436, + "learning_rate": 6.018253722939563e-05, + "loss": 0.0085, + "step": 27780 + }, + { + "grad_norm": 0.233922079205513, + "learning_rate": 6.015555536673914e-05, + "loss": 0.0094, + "step": 27790 + }, + { + "grad_norm": 0.25748538970947266, + "learning_rate": 6.0128570419093054e-05, + "loss": 0.0073, + "step": 27800 + }, + { + "grad_norm": 0.2991805076599121, + "learning_rate": 6.010158239465471e-05, + "loss": 0.0114, + "step": 27810 + }, + { + "grad_norm": 0.27685466408729553, + "learning_rate": 6.007459130162235e-05, + "loss": 0.0084, + "step": 27820 + }, + { + "grad_norm": 0.21518674492835999, + "learning_rate": 6.004759714819516e-05, + "loss": 0.0088, + "step": 27830 + }, + { + "grad_norm": 0.1939920336008072, + "learning_rate": 6.002059994257323e-05, + "loss": 0.0084, + "step": 27840 + }, + { + "grad_norm": 0.21805702149868011, + "learning_rate": 5.999359969295764e-05, + "loss": 0.0073, + "step": 27850 + }, + { + "grad_norm": 0.24761803448200226, + "learning_rate": 5.9966596407550314e-05, + "loss": 0.0064, + "step": 27860 + }, + { + "grad_norm": 0.23991671204566956, + "learning_rate": 5.993959009455416e-05, + "loss": 0.008, + "step": 27870 + }, + { + "grad_norm": 0.259254515171051, + "learning_rate": 5.991258076217298e-05, + "loss": 0.0082, + "step": 27880 + }, + { + "grad_norm": 0.2831530272960663, + "learning_rate": 5.988556841861147e-05, + "loss": 0.0098, + "step": 27890 + }, + { + "grad_norm": 0.20290721952915192, + "learning_rate": 5.985855307207531e-05, + "loss": 0.0082, + "step": 27900 + }, + { + "grad_norm": 0.23735889792442322, + "learning_rate": 5.9831534730771e-05, + "loss": 0.0098, + "step": 27910 + }, + { + "grad_norm": 0.2403845489025116, + "learning_rate": 5.980451340290605e-05, + "loss": 0.0082, + "step": 27920 + }, + { + "grad_norm": 0.3064778745174408, + "learning_rate": 5.97774890966888e-05, + "loss": 0.011, + "step": 27930 + }, + { + "grad_norm": 0.2955588102340698, + "learning_rate": 5.975046182032851e-05, + "loss": 0.0087, + "step": 27940 + }, + { + "grad_norm": 0.29320523142814636, + "learning_rate": 5.972343158203537e-05, + "loss": 0.0094, + "step": 27950 + }, + { + "grad_norm": 0.1543625444173813, + "learning_rate": 5.969639839002045e-05, + "loss": 0.0076, + "step": 27960 + }, + { + "grad_norm": 0.25194329023361206, + "learning_rate": 5.966936225249572e-05, + "loss": 0.008, + "step": 27970 + }, + { + "grad_norm": 0.2639315724372864, + "learning_rate": 5.9642323177674044e-05, + "loss": 0.0086, + "step": 27980 + }, + { + "grad_norm": 0.19331061840057373, + "learning_rate": 5.9615281173769154e-05, + "loss": 0.0067, + "step": 27990 + }, + { + "grad_norm": 0.230178564786911, + "learning_rate": 5.958823624899574e-05, + "loss": 0.0079, + "step": 28000 + }, + { + "grad_norm": 0.2751917541027069, + "learning_rate": 5.956118841156933e-05, + "loss": 0.0062, + "step": 28010 + }, + { + "grad_norm": 0.2951553463935852, + "learning_rate": 5.953413766970631e-05, + "loss": 0.0118, + "step": 28020 + }, + { + "grad_norm": 0.2595328986644745, + "learning_rate": 5.9507084031624e-05, + "loss": 0.0091, + "step": 28030 + }, + { + "grad_norm": 0.28204429149627686, + "learning_rate": 5.948002750554058e-05, + "loss": 0.0096, + "step": 28040 + }, + { + "grad_norm": 0.2264937162399292, + "learning_rate": 5.9452968099675124e-05, + "loss": 0.0076, + "step": 28050 + }, + { + "grad_norm": 0.21322259306907654, + "learning_rate": 5.9425905822247527e-05, + "loss": 0.0085, + "step": 28060 + }, + { + "grad_norm": 0.20056018233299255, + "learning_rate": 5.939884068147864e-05, + "loss": 0.0062, + "step": 28070 + }, + { + "grad_norm": 0.19608286023139954, + "learning_rate": 5.937177268559011e-05, + "loss": 0.0085, + "step": 28080 + }, + { + "grad_norm": 0.20257672667503357, + "learning_rate": 5.934470184280448e-05, + "loss": 0.0073, + "step": 28090 + }, + { + "grad_norm": 0.24828822910785675, + "learning_rate": 5.931762816134516e-05, + "loss": 0.0106, + "step": 28100 + }, + { + "grad_norm": 0.26942571997642517, + "learning_rate": 5.9290551649436434e-05, + "loss": 0.0069, + "step": 28110 + }, + { + "grad_norm": 0.23649120330810547, + "learning_rate": 5.9263472315303416e-05, + "loss": 0.0079, + "step": 28120 + }, + { + "grad_norm": 0.2716841697692871, + "learning_rate": 5.9236390167172096e-05, + "loss": 0.0069, + "step": 28130 + }, + { + "grad_norm": 0.28548988699913025, + "learning_rate": 5.920930521326932e-05, + "loss": 0.0062, + "step": 28140 + }, + { + "grad_norm": 0.18524621427059174, + "learning_rate": 5.918221746182276e-05, + "loss": 0.0076, + "step": 28150 + }, + { + "grad_norm": 0.25055769085884094, + "learning_rate": 5.9155126921061e-05, + "loss": 0.0066, + "step": 28160 + }, + { + "grad_norm": 0.19401398301124573, + "learning_rate": 5.91280335992134e-05, + "loss": 0.0069, + "step": 28170 + }, + { + "grad_norm": 0.23617658019065857, + "learning_rate": 5.91009375045102e-05, + "loss": 0.0066, + "step": 28180 + }, + { + "grad_norm": 0.18584458529949188, + "learning_rate": 5.9073838645182476e-05, + "loss": 0.0066, + "step": 28190 + }, + { + "grad_norm": 0.2191765010356903, + "learning_rate": 5.904673702946217e-05, + "loss": 0.0092, + "step": 28200 + }, + { + "grad_norm": 0.2633270025253296, + "learning_rate": 5.9019632665582004e-05, + "loss": 0.0082, + "step": 28210 + }, + { + "grad_norm": 0.32821163535118103, + "learning_rate": 5.899252556177559e-05, + "loss": 0.0091, + "step": 28220 + }, + { + "grad_norm": 0.28726497292518616, + "learning_rate": 5.896541572627735e-05, + "loss": 0.0105, + "step": 28230 + }, + { + "grad_norm": 0.26289427280426025, + "learning_rate": 5.893830316732253e-05, + "loss": 0.0085, + "step": 28240 + }, + { + "grad_norm": 0.18791896104812622, + "learning_rate": 5.8911187893147214e-05, + "loss": 0.0086, + "step": 28250 + }, + { + "grad_norm": 0.31127044558525085, + "learning_rate": 5.888406991198828e-05, + "loss": 0.0077, + "step": 28260 + }, + { + "grad_norm": 0.21324124932289124, + "learning_rate": 5.885694923208349e-05, + "loss": 0.0077, + "step": 28270 + }, + { + "grad_norm": 0.2801651060581207, + "learning_rate": 5.882982586167138e-05, + "loss": 0.0079, + "step": 28280 + }, + { + "grad_norm": 0.26293638348579407, + "learning_rate": 5.880269980899131e-05, + "loss": 0.0065, + "step": 28290 + }, + { + "grad_norm": 0.24821090698242188, + "learning_rate": 5.8775571082283465e-05, + "loss": 0.0067, + "step": 28300 + }, + { + "grad_norm": 0.3018585443496704, + "learning_rate": 5.8748439689788824e-05, + "loss": 0.008, + "step": 28310 + }, + { + "grad_norm": 0.2888025939464569, + "learning_rate": 5.87213056397492e-05, + "loss": 0.0102, + "step": 28320 + }, + { + "grad_norm": 0.26619553565979004, + "learning_rate": 5.869416894040719e-05, + "loss": 0.011, + "step": 28330 + }, + { + "grad_norm": 0.3204382359981537, + "learning_rate": 5.866702960000621e-05, + "loss": 0.0067, + "step": 28340 + }, + { + "grad_norm": 0.23167967796325684, + "learning_rate": 5.863988762679048e-05, + "loss": 0.0082, + "step": 28350 + }, + { + "grad_norm": 0.3500116765499115, + "learning_rate": 5.8612743029005e-05, + "loss": 0.0078, + "step": 28360 + }, + { + "grad_norm": 0.2260109782218933, + "learning_rate": 5.858559581489561e-05, + "loss": 0.0083, + "step": 28370 + }, + { + "grad_norm": 0.21143600344657898, + "learning_rate": 5.85584459927089e-05, + "loss": 0.0098, + "step": 28380 + }, + { + "grad_norm": 0.28960177302360535, + "learning_rate": 5.853129357069227e-05, + "loss": 0.0074, + "step": 28390 + }, + { + "grad_norm": 0.28553298115730286, + "learning_rate": 5.8504138557093913e-05, + "loss": 0.0095, + "step": 28400 + }, + { + "grad_norm": 0.24120651185512543, + "learning_rate": 5.8476980960162784e-05, + "loss": 0.0087, + "step": 28410 + }, + { + "grad_norm": 0.22933678328990936, + "learning_rate": 5.844982078814868e-05, + "loss": 0.0095, + "step": 28420 + }, + { + "grad_norm": 0.2297295480966568, + "learning_rate": 5.842265804930211e-05, + "loss": 0.0067, + "step": 28430 + }, + { + "grad_norm": 0.25334498286247253, + "learning_rate": 5.839549275187444e-05, + "loss": 0.011, + "step": 28440 + }, + { + "grad_norm": 0.22509747743606567, + "learning_rate": 5.836832490411771e-05, + "loss": 0.0073, + "step": 28450 + }, + { + "grad_norm": 0.34018099308013916, + "learning_rate": 5.834115451428485e-05, + "loss": 0.0105, + "step": 28460 + }, + { + "grad_norm": 0.30208104848861694, + "learning_rate": 5.831398159062946e-05, + "loss": 0.0078, + "step": 28470 + }, + { + "grad_norm": 0.2879299819469452, + "learning_rate": 5.828680614140599e-05, + "loss": 0.0096, + "step": 28480 + }, + { + "grad_norm": 0.2353498935699463, + "learning_rate": 5.825962817486962e-05, + "loss": 0.0092, + "step": 28490 + }, + { + "grad_norm": 0.38227665424346924, + "learning_rate": 5.823244769927629e-05, + "loss": 0.008, + "step": 28500 + }, + { + "grad_norm": 0.2462458312511444, + "learning_rate": 5.8205264722882716e-05, + "loss": 0.0081, + "step": 28510 + }, + { + "grad_norm": 0.2623266875743866, + "learning_rate": 5.817807925394636e-05, + "loss": 0.0081, + "step": 28520 + }, + { + "grad_norm": 0.21388186514377594, + "learning_rate": 5.815089130072546e-05, + "loss": 0.0076, + "step": 28530 + }, + { + "grad_norm": 0.2926730513572693, + "learning_rate": 5.8123700871479e-05, + "loss": 0.0097, + "step": 28540 + }, + { + "grad_norm": 0.23351100087165833, + "learning_rate": 5.809650797446671e-05, + "loss": 0.0086, + "step": 28550 + }, + { + "grad_norm": 0.21775932610034943, + "learning_rate": 5.806931261794907e-05, + "loss": 0.0061, + "step": 28560 + }, + { + "grad_norm": 0.1772022247314453, + "learning_rate": 5.804211481018731e-05, + "loss": 0.0062, + "step": 28570 + }, + { + "grad_norm": 0.2332395613193512, + "learning_rate": 5.801491455944341e-05, + "loss": 0.0086, + "step": 28580 + }, + { + "grad_norm": 0.2631393074989319, + "learning_rate": 5.79877118739801e-05, + "loss": 0.0071, + "step": 28590 + }, + { + "grad_norm": 0.3406801223754883, + "learning_rate": 5.7960506762060816e-05, + "loss": 0.009, + "step": 28600 + }, + { + "grad_norm": 0.2846415042877197, + "learning_rate": 5.793329923194977e-05, + "loss": 0.0081, + "step": 28610 + }, + { + "grad_norm": 0.305441677570343, + "learning_rate": 5.790608929191187e-05, + "loss": 0.0134, + "step": 28620 + }, + { + "grad_norm": 0.28540533781051636, + "learning_rate": 5.78788769502128e-05, + "loss": 0.0097, + "step": 28630 + }, + { + "grad_norm": 0.18442299962043762, + "learning_rate": 5.785166221511894e-05, + "loss": 0.0079, + "step": 28640 + }, + { + "grad_norm": 0.21536025404930115, + "learning_rate": 5.7824445094897415e-05, + "loss": 0.0061, + "step": 28650 + }, + { + "grad_norm": 0.24934975802898407, + "learning_rate": 5.7797225597816065e-05, + "loss": 0.0062, + "step": 28660 + }, + { + "grad_norm": 0.27574193477630615, + "learning_rate": 5.777000373214345e-05, + "loss": 0.0099, + "step": 28670 + }, + { + "grad_norm": 0.28934425115585327, + "learning_rate": 5.774277950614885e-05, + "loss": 0.0083, + "step": 28680 + }, + { + "grad_norm": 0.22388799488544464, + "learning_rate": 5.771555292810227e-05, + "loss": 0.0073, + "step": 28690 + }, + { + "grad_norm": 0.24774283170700073, + "learning_rate": 5.768832400627444e-05, + "loss": 0.0082, + "step": 28700 + }, + { + "grad_norm": 0.2151235193014145, + "learning_rate": 5.7661092748936775e-05, + "loss": 0.0093, + "step": 28710 + }, + { + "grad_norm": 0.16786257922649384, + "learning_rate": 5.76338591643614e-05, + "loss": 0.0074, + "step": 28720 + }, + { + "grad_norm": 0.2493477314710617, + "learning_rate": 5.760662326082118e-05, + "loss": 0.0086, + "step": 28730 + }, + { + "grad_norm": 0.20675677061080933, + "learning_rate": 5.757938504658965e-05, + "loss": 0.007, + "step": 28740 + }, + { + "grad_norm": 0.27235719561576843, + "learning_rate": 5.755214452994107e-05, + "loss": 0.0071, + "step": 28750 + }, + { + "grad_norm": 0.2107953429222107, + "learning_rate": 5.752490171915039e-05, + "loss": 0.0103, + "step": 28760 + }, + { + "grad_norm": 0.23601463437080383, + "learning_rate": 5.749765662249324e-05, + "loss": 0.0101, + "step": 28770 + }, + { + "grad_norm": 0.3236654996871948, + "learning_rate": 5.747040924824596e-05, + "loss": 0.0091, + "step": 28780 + }, + { + "grad_norm": 0.26263535022735596, + "learning_rate": 5.7443159604685613e-05, + "loss": 0.0081, + "step": 28790 + }, + { + "grad_norm": 0.277301549911499, + "learning_rate": 5.74159077000899e-05, + "loss": 0.007, + "step": 28800 + }, + { + "grad_norm": 0.23350243270397186, + "learning_rate": 5.7388653542737235e-05, + "loss": 0.0067, + "step": 28810 + }, + { + "grad_norm": 0.22064347565174103, + "learning_rate": 5.736139714090672e-05, + "loss": 0.0056, + "step": 28820 + }, + { + "grad_norm": 0.17626850306987762, + "learning_rate": 5.73341385028781e-05, + "loss": 0.0049, + "step": 28830 + }, + { + "grad_norm": 0.15675033628940582, + "learning_rate": 5.7306877636931855e-05, + "loss": 0.0065, + "step": 28840 + }, + { + "grad_norm": 0.22760313749313354, + "learning_rate": 5.7279614551349125e-05, + "loss": 0.0076, + "step": 28850 + }, + { + "grad_norm": 0.21552512049674988, + "learning_rate": 5.725234925441169e-05, + "loss": 0.0068, + "step": 28860 + }, + { + "grad_norm": 0.2988857924938202, + "learning_rate": 5.7225081754402044e-05, + "loss": 0.0103, + "step": 28870 + }, + { + "grad_norm": 0.19499582052230835, + "learning_rate": 5.7197812059603326e-05, + "loss": 0.0074, + "step": 28880 + }, + { + "grad_norm": 0.24426379799842834, + "learning_rate": 5.717054017829934e-05, + "loss": 0.0082, + "step": 28890 + }, + { + "grad_norm": 0.2361854761838913, + "learning_rate": 5.7143266118774584e-05, + "loss": 0.0077, + "step": 28900 + }, + { + "grad_norm": 0.25014418363571167, + "learning_rate": 5.711598988931418e-05, + "loss": 0.008, + "step": 28910 + }, + { + "grad_norm": 0.25537556409835815, + "learning_rate": 5.7088711498203954e-05, + "loss": 0.0089, + "step": 28920 + }, + { + "grad_norm": 0.25293126702308655, + "learning_rate": 5.706143095373033e-05, + "loss": 0.0095, + "step": 28930 + }, + { + "grad_norm": 0.21512947976589203, + "learning_rate": 5.703414826418042e-05, + "loss": 0.0049, + "step": 28940 + }, + { + "grad_norm": 0.20542387664318085, + "learning_rate": 5.7006863437842007e-05, + "loss": 0.0065, + "step": 28950 + }, + { + "grad_norm": 0.224168062210083, + "learning_rate": 5.697957648300348e-05, + "loss": 0.0057, + "step": 28960 + }, + { + "grad_norm": 0.2938244938850403, + "learning_rate": 5.695228740795391e-05, + "loss": 0.0072, + "step": 28970 + }, + { + "grad_norm": 0.28032246232032776, + "learning_rate": 5.6924996220982985e-05, + "loss": 0.0091, + "step": 28980 + }, + { + "grad_norm": 0.37351134419441223, + "learning_rate": 5.6897702930381045e-05, + "loss": 0.0123, + "step": 28990 + }, + { + "grad_norm": 0.2779284417629242, + "learning_rate": 5.687040754443908e-05, + "loss": 0.0076, + "step": 29000 + }, + { + "grad_norm": 0.23601925373077393, + "learning_rate": 5.6843110071448725e-05, + "loss": 0.01, + "step": 29010 + }, + { + "grad_norm": 0.23414798080921173, + "learning_rate": 5.6815810519702194e-05, + "loss": 0.0082, + "step": 29020 + }, + { + "grad_norm": 0.19407857954502106, + "learning_rate": 5.6788508897492396e-05, + "loss": 0.0057, + "step": 29030 + }, + { + "grad_norm": 0.19704720377922058, + "learning_rate": 5.676120521311282e-05, + "loss": 0.0076, + "step": 29040 + }, + { + "grad_norm": 0.22677984833717346, + "learning_rate": 5.6733899474857634e-05, + "loss": 0.0061, + "step": 29050 + }, + { + "grad_norm": 0.2066618800163269, + "learning_rate": 5.670659169102157e-05, + "loss": 0.008, + "step": 29060 + }, + { + "grad_norm": 0.1862179934978485, + "learning_rate": 5.6679281869900044e-05, + "loss": 0.0076, + "step": 29070 + }, + { + "grad_norm": 0.20768149197101593, + "learning_rate": 5.6651970019789045e-05, + "loss": 0.0064, + "step": 29080 + }, + { + "grad_norm": 0.2686161696910858, + "learning_rate": 5.662465614898519e-05, + "loss": 0.011, + "step": 29090 + }, + { + "grad_norm": 0.25573837757110596, + "learning_rate": 5.6597340265785695e-05, + "loss": 0.0081, + "step": 29100 + }, + { + "grad_norm": 0.31042012572288513, + "learning_rate": 5.657002237848843e-05, + "loss": 0.012, + "step": 29110 + }, + { + "grad_norm": 0.251038521528244, + "learning_rate": 5.654270249539183e-05, + "loss": 0.0082, + "step": 29120 + }, + { + "grad_norm": 0.21690836548805237, + "learning_rate": 5.651538062479498e-05, + "loss": 0.0063, + "step": 29130 + }, + { + "grad_norm": 0.22868235409259796, + "learning_rate": 5.648805677499751e-05, + "loss": 0.0078, + "step": 29140 + }, + { + "grad_norm": 0.17891651391983032, + "learning_rate": 5.646073095429969e-05, + "loss": 0.0076, + "step": 29150 + }, + { + "grad_norm": 0.2311461716890335, + "learning_rate": 5.643340317100241e-05, + "loss": 0.0072, + "step": 29160 + }, + { + "grad_norm": 0.25308936834335327, + "learning_rate": 5.64060734334071e-05, + "loss": 0.0076, + "step": 29170 + }, + { + "grad_norm": 0.2328048199415207, + "learning_rate": 5.637874174981583e-05, + "loss": 0.0077, + "step": 29180 + }, + { + "grad_norm": 0.22059611976146698, + "learning_rate": 5.635140812853124e-05, + "loss": 0.0067, + "step": 29190 + }, + { + "grad_norm": 0.31788837909698486, + "learning_rate": 5.6324072577856544e-05, + "loss": 0.0105, + "step": 29200 + }, + { + "grad_norm": 0.2928001284599304, + "learning_rate": 5.629673510609559e-05, + "loss": 0.0091, + "step": 29210 + }, + { + "grad_norm": 0.29898592829704285, + "learning_rate": 5.626939572155276e-05, + "loss": 0.0104, + "step": 29220 + }, + { + "grad_norm": 0.3294891119003296, + "learning_rate": 5.6242054432533054e-05, + "loss": 0.0076, + "step": 29230 + }, + { + "grad_norm": 0.27903491258621216, + "learning_rate": 5.621471124734201e-05, + "loss": 0.0072, + "step": 29240 + }, + { + "grad_norm": 0.24635784327983856, + "learning_rate": 5.6187366174285794e-05, + "loss": 0.0073, + "step": 29250 + }, + { + "grad_norm": 0.2724165916442871, + "learning_rate": 5.616001922167109e-05, + "loss": 0.0102, + "step": 29260 + }, + { + "grad_norm": 0.3436252176761627, + "learning_rate": 5.61326703978052e-05, + "loss": 0.0091, + "step": 29270 + }, + { + "grad_norm": 0.3081994652748108, + "learning_rate": 5.6105319710995964e-05, + "loss": 0.0066, + "step": 29280 + }, + { + "grad_norm": 0.27519500255584717, + "learning_rate": 5.60779671695518e-05, + "loss": 0.0103, + "step": 29290 + }, + { + "grad_norm": 0.20871011912822723, + "learning_rate": 5.6050612781781684e-05, + "loss": 0.0067, + "step": 29300 + }, + { + "grad_norm": 0.21957652270793915, + "learning_rate": 5.602325655599516e-05, + "loss": 0.008, + "step": 29310 + }, + { + "grad_norm": 0.20027408003807068, + "learning_rate": 5.599589850050234e-05, + "loss": 0.0075, + "step": 29320 + }, + { + "grad_norm": 0.22094036638736725, + "learning_rate": 5.5968538623613874e-05, + "loss": 0.0095, + "step": 29330 + }, + { + "grad_norm": 0.2588355541229248, + "learning_rate": 5.594117693364095e-05, + "loss": 0.0068, + "step": 29340 + }, + { + "grad_norm": 0.30398425459861755, + "learning_rate": 5.591381343889535e-05, + "loss": 0.0098, + "step": 29350 + }, + { + "grad_norm": 0.18580931425094604, + "learning_rate": 5.5886448147689355e-05, + "loss": 0.0069, + "step": 29360 + }, + { + "grad_norm": 0.2529332935810089, + "learning_rate": 5.585908106833585e-05, + "loss": 0.0057, + "step": 29370 + }, + { + "grad_norm": 0.21863658726215363, + "learning_rate": 5.5831712209148226e-05, + "loss": 0.0095, + "step": 29380 + }, + { + "grad_norm": 0.2676388919353485, + "learning_rate": 5.58043415784404e-05, + "loss": 0.0073, + "step": 29390 + }, + { + "grad_norm": 0.30261924862861633, + "learning_rate": 5.577696918452686e-05, + "loss": 0.0088, + "step": 29400 + }, + { + "grad_norm": 0.2661668062210083, + "learning_rate": 5.5749595035722604e-05, + "loss": 0.0074, + "step": 29410 + }, + { + "grad_norm": 0.2667039632797241, + "learning_rate": 5.5722219140343193e-05, + "loss": 0.0066, + "step": 29420 + }, + { + "grad_norm": 0.2321736365556717, + "learning_rate": 5.56948415067047e-05, + "loss": 0.008, + "step": 29430 + }, + { + "grad_norm": 0.1692020744085312, + "learning_rate": 5.5667462143123704e-05, + "loss": 0.0055, + "step": 29440 + }, + { + "grad_norm": 0.23038333654403687, + "learning_rate": 5.564008105791737e-05, + "loss": 0.0082, + "step": 29450 + }, + { + "grad_norm": 0.2729283273220062, + "learning_rate": 5.5612698259403316e-05, + "loss": 0.0093, + "step": 29460 + }, + { + "grad_norm": 0.24506737291812897, + "learning_rate": 5.5585313755899724e-05, + "loss": 0.0074, + "step": 29470 + }, + { + "grad_norm": 0.2097046822309494, + "learning_rate": 5.5557927555725285e-05, + "loss": 0.0071, + "step": 29480 + }, + { + "grad_norm": 0.20105338096618652, + "learning_rate": 5.55305396671992e-05, + "loss": 0.0071, + "step": 29490 + }, + { + "grad_norm": 0.21321679651737213, + "learning_rate": 5.55031500986412e-05, + "loss": 0.0059, + "step": 29500 + }, + { + "grad_norm": 0.24555663764476776, + "learning_rate": 5.547575885837149e-05, + "loss": 0.011, + "step": 29510 + }, + { + "grad_norm": 0.18333902955055237, + "learning_rate": 5.5448365954710825e-05, + "loss": 0.0092, + "step": 29520 + }, + { + "grad_norm": 0.21150030195713043, + "learning_rate": 5.5420971395980446e-05, + "loss": 0.0078, + "step": 29530 + }, + { + "grad_norm": 0.25246310234069824, + "learning_rate": 5.539357519050209e-05, + "loss": 0.0068, + "step": 29540 + }, + { + "grad_norm": 0.1876419484615326, + "learning_rate": 5.536617734659799e-05, + "loss": 0.0063, + "step": 29550 + }, + { + "grad_norm": 0.262921005487442, + "learning_rate": 5.533877787259091e-05, + "loss": 0.0086, + "step": 29560 + }, + { + "grad_norm": 0.17875295877456665, + "learning_rate": 5.5311376776804044e-05, + "loss": 0.0076, + "step": 29570 + }, + { + "grad_norm": 0.35606858134269714, + "learning_rate": 5.528397406756118e-05, + "loss": 0.0088, + "step": 29580 + }, + { + "grad_norm": 0.31185194849967957, + "learning_rate": 5.525656975318652e-05, + "loss": 0.0086, + "step": 29590 + }, + { + "grad_norm": 0.2646799385547638, + "learning_rate": 5.522916384200474e-05, + "loss": 0.01, + "step": 29600 + }, + { + "grad_norm": 0.21735616028308868, + "learning_rate": 5.520175634234106e-05, + "loss": 0.0069, + "step": 29610 + }, + { + "grad_norm": 0.2125009149312973, + "learning_rate": 5.517434726252113e-05, + "loss": 0.0065, + "step": 29620 + }, + { + "grad_norm": 0.20805130898952484, + "learning_rate": 5.514693661087113e-05, + "loss": 0.0074, + "step": 29630 + }, + { + "grad_norm": 0.2838054597377777, + "learning_rate": 5.511952439571769e-05, + "loss": 0.006, + "step": 29640 + }, + { + "grad_norm": 0.24208292365074158, + "learning_rate": 5.509211062538791e-05, + "loss": 0.0073, + "step": 29650 + }, + { + "grad_norm": 0.23704229295253754, + "learning_rate": 5.506469530820939e-05, + "loss": 0.0086, + "step": 29660 + }, + { + "grad_norm": 0.2304278314113617, + "learning_rate": 5.503727845251014e-05, + "loss": 0.0082, + "step": 29670 + }, + { + "grad_norm": 0.2341359406709671, + "learning_rate": 5.50098600666187e-05, + "loss": 0.0085, + "step": 29680 + }, + { + "grad_norm": 0.23659272491931915, + "learning_rate": 5.498244015886406e-05, + "loss": 0.0064, + "step": 29690 + }, + { + "grad_norm": 0.2730538547039032, + "learning_rate": 5.495501873757565e-05, + "loss": 0.006, + "step": 29700 + }, + { + "grad_norm": 0.2138662487268448, + "learning_rate": 5.492759581108336e-05, + "loss": 0.0078, + "step": 29710 + }, + { + "grad_norm": 0.2787604331970215, + "learning_rate": 5.490017138771759e-05, + "loss": 0.0087, + "step": 29720 + }, + { + "grad_norm": 0.23710469901561737, + "learning_rate": 5.487274547580912e-05, + "loss": 0.0073, + "step": 29730 + }, + { + "grad_norm": 0.24497291445732117, + "learning_rate": 5.484531808368923e-05, + "loss": 0.0079, + "step": 29740 + }, + { + "grad_norm": 0.20636817812919617, + "learning_rate": 5.4817889219689656e-05, + "loss": 0.0071, + "step": 29750 + }, + { + "grad_norm": 0.2868874967098236, + "learning_rate": 5.4790458892142536e-05, + "loss": 0.009, + "step": 29760 + }, + { + "grad_norm": 0.24427495896816254, + "learning_rate": 5.476302710938048e-05, + "loss": 0.007, + "step": 29770 + }, + { + "grad_norm": 0.23827575147151947, + "learning_rate": 5.473559387973657e-05, + "loss": 0.0069, + "step": 29780 + }, + { + "grad_norm": 0.2940636873245239, + "learning_rate": 5.470815921154425e-05, + "loss": 0.0085, + "step": 29790 + }, + { + "grad_norm": 0.2819958031177521, + "learning_rate": 5.468072311313749e-05, + "loss": 0.0094, + "step": 29800 + }, + { + "grad_norm": 0.2547339200973511, + "learning_rate": 5.465328559285063e-05, + "loss": 0.0094, + "step": 29810 + }, + { + "grad_norm": 0.23722390830516815, + "learning_rate": 5.462584665901849e-05, + "loss": 0.01, + "step": 29820 + }, + { + "grad_norm": 0.30876991152763367, + "learning_rate": 5.4598406319976235e-05, + "loss": 0.0066, + "step": 29830 + }, + { + "grad_norm": 0.2791697382926941, + "learning_rate": 5.457096458405958e-05, + "loss": 0.0104, + "step": 29840 + }, + { + "grad_norm": 0.27456432580947876, + "learning_rate": 5.454352145960457e-05, + "loss": 0.0087, + "step": 29850 + }, + { + "grad_norm": 0.2564623951911926, + "learning_rate": 5.4516076954947715e-05, + "loss": 0.0094, + "step": 29860 + }, + { + "grad_norm": 0.23036301136016846, + "learning_rate": 5.448863107842591e-05, + "loss": 0.0097, + "step": 29870 + }, + { + "grad_norm": 0.2549186646938324, + "learning_rate": 5.446118383837651e-05, + "loss": 0.0074, + "step": 29880 + }, + { + "grad_norm": 0.17569203674793243, + "learning_rate": 5.443373524313722e-05, + "loss": 0.0057, + "step": 29890 + }, + { + "grad_norm": 0.2941034734249115, + "learning_rate": 5.440628530104626e-05, + "loss": 0.0099, + "step": 29900 + }, + { + "grad_norm": 0.1800224632024765, + "learning_rate": 5.4378834020442146e-05, + "loss": 0.0081, + "step": 29910 + }, + { + "grad_norm": 0.2607594132423401, + "learning_rate": 5.4351381409663884e-05, + "loss": 0.0056, + "step": 29920 + }, + { + "grad_norm": 0.2771557867527008, + "learning_rate": 5.432392747705084e-05, + "loss": 0.0088, + "step": 29930 + }, + { + "grad_norm": 0.28794682025909424, + "learning_rate": 5.429647223094278e-05, + "loss": 0.0071, + "step": 29940 + }, + { + "grad_norm": 0.2534019351005554, + "learning_rate": 5.4269015679679924e-05, + "loss": 0.0071, + "step": 29950 + }, + { + "grad_norm": 0.217961847782135, + "learning_rate": 5.424155783160281e-05, + "loss": 0.0087, + "step": 29960 + }, + { + "grad_norm": 0.28829023241996765, + "learning_rate": 5.4214098695052415e-05, + "loss": 0.0071, + "step": 29970 + }, + { + "grad_norm": 0.26356667280197144, + "learning_rate": 5.418663827837012e-05, + "loss": 0.0072, + "step": 29980 + }, + { + "grad_norm": 0.24365897476673126, + "learning_rate": 5.415917658989763e-05, + "loss": 0.007, + "step": 29990 + }, + { + "grad_norm": 0.22417080402374268, + "learning_rate": 5.413171363797713e-05, + "loss": 0.0076, + "step": 30000 + }, + { + "grad_norm": 0.20520949363708496, + "learning_rate": 5.4104249430951116e-05, + "loss": 0.0118, + "step": 30010 + }, + { + "grad_norm": 0.18385519087314606, + "learning_rate": 5.4076783977162494e-05, + "loss": 0.0082, + "step": 30020 + }, + { + "grad_norm": 0.307303786277771, + "learning_rate": 5.4049317284954525e-05, + "loss": 0.0087, + "step": 30030 + }, + { + "grad_norm": 0.2726621925830841, + "learning_rate": 5.4021849362670884e-05, + "loss": 0.0092, + "step": 30040 + }, + { + "grad_norm": 0.21024097502231598, + "learning_rate": 5.3994380218655604e-05, + "loss": 0.0069, + "step": 30050 + }, + { + "grad_norm": 0.24375587701797485, + "learning_rate": 5.396690986125309e-05, + "loss": 0.0083, + "step": 30060 + }, + { + "grad_norm": 0.24686092138290405, + "learning_rate": 5.3939438298808075e-05, + "loss": 0.0087, + "step": 30070 + }, + { + "grad_norm": 0.2115754634141922, + "learning_rate": 5.3911965539665744e-05, + "loss": 0.006, + "step": 30080 + }, + { + "grad_norm": 0.2142253965139389, + "learning_rate": 5.388449159217156e-05, + "loss": 0.0096, + "step": 30090 + }, + { + "grad_norm": 0.20955142378807068, + "learning_rate": 5.3857016464671385e-05, + "loss": 0.0088, + "step": 30100 + }, + { + "grad_norm": 0.24378670752048492, + "learning_rate": 5.382954016551146e-05, + "loss": 0.0087, + "step": 30110 + }, + { + "grad_norm": 0.19760951399803162, + "learning_rate": 5.380206270303835e-05, + "loss": 0.0088, + "step": 30120 + }, + { + "grad_norm": 0.20087166130542755, + "learning_rate": 5.377458408559897e-05, + "loss": 0.0095, + "step": 30130 + }, + { + "grad_norm": 0.21888969838619232, + "learning_rate": 5.374710432154061e-05, + "loss": 0.0095, + "step": 30140 + }, + { + "grad_norm": 0.3185484707355499, + "learning_rate": 5.3719623419210886e-05, + "loss": 0.0119, + "step": 30150 + }, + { + "grad_norm": 0.24816665053367615, + "learning_rate": 5.3692141386957786e-05, + "loss": 0.01, + "step": 30160 + }, + { + "grad_norm": 0.2567881941795349, + "learning_rate": 5.3664658233129616e-05, + "loss": 0.0087, + "step": 30170 + }, + { + "grad_norm": 0.3140625059604645, + "learning_rate": 5.363717396607504e-05, + "loss": 0.0109, + "step": 30180 + }, + { + "grad_norm": 0.25329121947288513, + "learning_rate": 5.360968859414305e-05, + "loss": 0.0097, + "step": 30190 + }, + { + "grad_norm": 0.19744175672531128, + "learning_rate": 5.358220212568295e-05, + "loss": 0.008, + "step": 30200 + }, + { + "grad_norm": 0.3458896577358246, + "learning_rate": 5.355471456904444e-05, + "loss": 0.0071, + "step": 30210 + }, + { + "grad_norm": 0.25304165482521057, + "learning_rate": 5.3527225932577495e-05, + "loss": 0.0116, + "step": 30220 + }, + { + "grad_norm": 0.2320389300584793, + "learning_rate": 5.349973622463246e-05, + "loss": 0.0086, + "step": 30230 + }, + { + "grad_norm": 0.24490326642990112, + "learning_rate": 5.3472245453559956e-05, + "loss": 0.0076, + "step": 30240 + }, + { + "grad_norm": 0.1762550175189972, + "learning_rate": 5.3444753627710955e-05, + "loss": 0.0061, + "step": 30250 + }, + { + "grad_norm": 0.3221755921840668, + "learning_rate": 5.341726075543676e-05, + "loss": 0.0077, + "step": 30260 + }, + { + "grad_norm": 0.2090737521648407, + "learning_rate": 5.338976684508898e-05, + "loss": 0.0073, + "step": 30270 + }, + { + "grad_norm": 0.23273541033267975, + "learning_rate": 5.336227190501953e-05, + "loss": 0.008, + "step": 30280 + }, + { + "grad_norm": 0.18089531362056732, + "learning_rate": 5.3334775943580664e-05, + "loss": 0.0064, + "step": 30290 + }, + { + "grad_norm": 0.22441692650318146, + "learning_rate": 5.330727896912491e-05, + "loss": 0.0066, + "step": 30300 + }, + { + "grad_norm": 0.2760082185268402, + "learning_rate": 5.327978099000511e-05, + "loss": 0.0077, + "step": 30310 + }, + { + "grad_norm": 0.2562609612941742, + "learning_rate": 5.3252282014574465e-05, + "loss": 0.0063, + "step": 30320 + }, + { + "grad_norm": 0.29413658380508423, + "learning_rate": 5.322478205118641e-05, + "loss": 0.009, + "step": 30330 + }, + { + "grad_norm": 0.29796522855758667, + "learning_rate": 5.3197281108194704e-05, + "loss": 0.0063, + "step": 30340 + }, + { + "grad_norm": 0.167960062623024, + "learning_rate": 5.316977919395342e-05, + "loss": 0.0054, + "step": 30350 + }, + { + "grad_norm": 0.264981210231781, + "learning_rate": 5.314227631681691e-05, + "loss": 0.0072, + "step": 30360 + }, + { + "grad_norm": 0.2510298192501068, + "learning_rate": 5.311477248513982e-05, + "loss": 0.0081, + "step": 30370 + }, + { + "grad_norm": 0.26735007762908936, + "learning_rate": 5.30872677072771e-05, + "loss": 0.0109, + "step": 30380 + }, + { + "grad_norm": 0.22081099450588226, + "learning_rate": 5.3059761991583954e-05, + "loss": 0.0078, + "step": 30390 + }, + { + "grad_norm": 0.22445517778396606, + "learning_rate": 5.303225534641592e-05, + "loss": 0.0078, + "step": 30400 + }, + { + "grad_norm": 0.2581021785736084, + "learning_rate": 5.300474778012875e-05, + "loss": 0.0065, + "step": 30410 + }, + { + "grad_norm": 0.2959626019001007, + "learning_rate": 5.297723930107855e-05, + "loss": 0.0081, + "step": 30420 + }, + { + "grad_norm": 0.28532201051712036, + "learning_rate": 5.294972991762167e-05, + "loss": 0.0068, + "step": 30430 + }, + { + "grad_norm": 0.22832205891609192, + "learning_rate": 5.292221963811472e-05, + "loss": 0.0077, + "step": 30440 + }, + { + "grad_norm": 0.2599642872810364, + "learning_rate": 5.28947084709146e-05, + "loss": 0.0064, + "step": 30450 + }, + { + "grad_norm": 0.22445081174373627, + "learning_rate": 5.2867196424378465e-05, + "loss": 0.01, + "step": 30460 + }, + { + "grad_norm": 0.31649258732795715, + "learning_rate": 5.2839683506863765e-05, + "loss": 0.009, + "step": 30470 + }, + { + "grad_norm": 0.23157016932964325, + "learning_rate": 5.281216972672821e-05, + "loss": 0.0077, + "step": 30480 + }, + { + "grad_norm": 0.2759285569190979, + "learning_rate": 5.278465509232973e-05, + "loss": 0.0086, + "step": 30490 + }, + { + "grad_norm": 0.3200608193874359, + "learning_rate": 5.275713961202655e-05, + "loss": 0.007, + "step": 30500 + }, + { + "grad_norm": 0.1513795554637909, + "learning_rate": 5.2729623294177165e-05, + "loss": 0.0083, + "step": 30510 + }, + { + "grad_norm": 0.19994240999221802, + "learning_rate": 5.270210614714028e-05, + "loss": 0.0075, + "step": 30520 + }, + { + "grad_norm": 0.28533345460891724, + "learning_rate": 5.267458817927491e-05, + "loss": 0.0102, + "step": 30530 + }, + { + "grad_norm": 0.3048255443572998, + "learning_rate": 5.264706939894026e-05, + "loss": 0.0072, + "step": 30540 + }, + { + "grad_norm": 0.22084756195545197, + "learning_rate": 5.261954981449584e-05, + "loss": 0.0091, + "step": 30550 + }, + { + "grad_norm": 0.22976526618003845, + "learning_rate": 5.2592029434301324e-05, + "loss": 0.0072, + "step": 30560 + }, + { + "grad_norm": 0.29769137501716614, + "learning_rate": 5.256450826671672e-05, + "loss": 0.0087, + "step": 30570 + }, + { + "grad_norm": 0.28434839844703674, + "learning_rate": 5.253698632010221e-05, + "loss": 0.0085, + "step": 30580 + }, + { + "grad_norm": 0.2321949154138565, + "learning_rate": 5.2509463602818246e-05, + "loss": 0.0097, + "step": 30590 + }, + { + "grad_norm": 0.26280343532562256, + "learning_rate": 5.248194012322549e-05, + "loss": 0.0102, + "step": 30600 + }, + { + "grad_norm": 0.19255560636520386, + "learning_rate": 5.245441588968486e-05, + "loss": 0.0085, + "step": 30610 + }, + { + "grad_norm": 0.18992942571640015, + "learning_rate": 5.242689091055748e-05, + "loss": 0.0063, + "step": 30620 + }, + { + "grad_norm": 0.19230274856090546, + "learning_rate": 5.239936519420473e-05, + "loss": 0.0056, + "step": 30630 + }, + { + "grad_norm": 0.24540460109710693, + "learning_rate": 5.2371838748988175e-05, + "loss": 0.005, + "step": 30640 + }, + { + "grad_norm": 0.19172360002994537, + "learning_rate": 5.234431158326965e-05, + "loss": 0.0058, + "step": 30650 + }, + { + "grad_norm": 0.23868797719478607, + "learning_rate": 5.231678370541115e-05, + "loss": 0.0078, + "step": 30660 + }, + { + "grad_norm": 0.2989415228366852, + "learning_rate": 5.228925512377495e-05, + "loss": 0.007, + "step": 30670 + }, + { + "grad_norm": 0.3222934901714325, + "learning_rate": 5.2261725846723465e-05, + "loss": 0.0088, + "step": 30680 + }, + { + "grad_norm": 0.23949749767780304, + "learning_rate": 5.22341958826194e-05, + "loss": 0.0088, + "step": 30690 + }, + { + "grad_norm": 0.23432166874408722, + "learning_rate": 5.22066652398256e-05, + "loss": 0.0071, + "step": 30700 + }, + { + "grad_norm": 0.22309230268001556, + "learning_rate": 5.2179133926705185e-05, + "loss": 0.0083, + "step": 30710 + }, + { + "grad_norm": 0.2281610369682312, + "learning_rate": 5.215160195162141e-05, + "loss": 0.0089, + "step": 30720 + }, + { + "grad_norm": 0.19826920330524445, + "learning_rate": 5.212406932293776e-05, + "loss": 0.0101, + "step": 30730 + }, + { + "grad_norm": 0.3359815180301666, + "learning_rate": 5.209653604901795e-05, + "loss": 0.0078, + "step": 30740 + }, + { + "grad_norm": 0.24799604713916779, + "learning_rate": 5.206900213822584e-05, + "loss": 0.0085, + "step": 30750 + }, + { + "grad_norm": 0.2714398503303528, + "learning_rate": 5.204146759892551e-05, + "loss": 0.0064, + "step": 30760 + }, + { + "grad_norm": 0.29669567942619324, + "learning_rate": 5.2013932439481216e-05, + "loss": 0.012, + "step": 30770 + }, + { + "grad_norm": 0.22248001396656036, + "learning_rate": 5.198639666825743e-05, + "loss": 0.0074, + "step": 30780 + }, + { + "grad_norm": 0.2481614351272583, + "learning_rate": 5.195886029361877e-05, + "loss": 0.0113, + "step": 30790 + }, + { + "grad_norm": 0.2154862880706787, + "learning_rate": 5.193132332393009e-05, + "loss": 0.0058, + "step": 30800 + }, + { + "grad_norm": 0.19185087084770203, + "learning_rate": 5.1903785767556376e-05, + "loss": 0.006, + "step": 30810 + }, + { + "grad_norm": 0.22475433349609375, + "learning_rate": 5.187624763286282e-05, + "loss": 0.0055, + "step": 30820 + }, + { + "grad_norm": 0.46180668473243713, + "learning_rate": 5.184870892821475e-05, + "loss": 0.0101, + "step": 30830 + }, + { + "grad_norm": 0.1878470629453659, + "learning_rate": 5.182116966197773e-05, + "loss": 0.0096, + "step": 30840 + }, + { + "grad_norm": 0.27362531423568726, + "learning_rate": 5.1793629842517466e-05, + "loss": 0.0079, + "step": 30850 + }, + { + "grad_norm": 0.22345761954784393, + "learning_rate": 5.17660894781998e-05, + "loss": 0.0064, + "step": 30860 + }, + { + "grad_norm": 0.3262491524219513, + "learning_rate": 5.173854857739079e-05, + "loss": 0.0117, + "step": 30870 + }, + { + "grad_norm": 0.2509889304637909, + "learning_rate": 5.171100714845661e-05, + "loss": 0.0074, + "step": 30880 + }, + { + "grad_norm": 0.2029818296432495, + "learning_rate": 5.1683465199763646e-05, + "loss": 0.0071, + "step": 30890 + }, + { + "grad_norm": 0.1668272316455841, + "learning_rate": 5.16559227396784e-05, + "loss": 0.0062, + "step": 30900 + }, + { + "grad_norm": 0.22027690708637238, + "learning_rate": 5.1628379776567556e-05, + "loss": 0.007, + "step": 30910 + }, + { + "grad_norm": 0.14922089874744415, + "learning_rate": 5.160083631879792e-05, + "loss": 0.0054, + "step": 30920 + }, + { + "grad_norm": 0.21996065974235535, + "learning_rate": 5.1573292374736484e-05, + "loss": 0.0053, + "step": 30930 + }, + { + "grad_norm": 0.21836510300636292, + "learning_rate": 5.1545747952750356e-05, + "loss": 0.0049, + "step": 30940 + }, + { + "grad_norm": 0.2488766610622406, + "learning_rate": 5.151820306120682e-05, + "loss": 0.0104, + "step": 30950 + }, + { + "grad_norm": 0.32899609208106995, + "learning_rate": 5.149065770847328e-05, + "loss": 0.0066, + "step": 30960 + }, + { + "grad_norm": 0.30933311581611633, + "learning_rate": 5.1463111902917297e-05, + "loss": 0.0085, + "step": 30970 + }, + { + "grad_norm": 0.22308926284313202, + "learning_rate": 5.143556565290654e-05, + "loss": 0.0085, + "step": 30980 + }, + { + "grad_norm": 0.3138863444328308, + "learning_rate": 5.140801896680882e-05, + "loss": 0.0129, + "step": 30990 + }, + { + "grad_norm": 0.20931702852249146, + "learning_rate": 5.1380471852992144e-05, + "loss": 0.0087, + "step": 31000 + }, + { + "grad_norm": 0.25267651677131653, + "learning_rate": 5.135292431982457e-05, + "loss": 0.0072, + "step": 31010 + }, + { + "grad_norm": 0.20820403099060059, + "learning_rate": 5.1325376375674294e-05, + "loss": 0.0086, + "step": 31020 + }, + { + "grad_norm": 0.3354862630367279, + "learning_rate": 5.129782802890968e-05, + "loss": 0.0079, + "step": 31030 + }, + { + "grad_norm": 0.2758081257343292, + "learning_rate": 5.127027928789916e-05, + "loss": 0.0098, + "step": 31040 + }, + { + "grad_norm": 0.21352413296699524, + "learning_rate": 5.124273016101135e-05, + "loss": 0.0064, + "step": 31050 + }, + { + "grad_norm": 0.3500865697860718, + "learning_rate": 5.121518065661492e-05, + "loss": 0.0089, + "step": 31060 + }, + { + "grad_norm": 0.24476422369480133, + "learning_rate": 5.11876307830787e-05, + "loss": 0.0078, + "step": 31070 + }, + { + "grad_norm": 0.23534061014652252, + "learning_rate": 5.1160080548771596e-05, + "loss": 0.0072, + "step": 31080 + }, + { + "grad_norm": 0.2848494350910187, + "learning_rate": 5.1132529962062656e-05, + "loss": 0.0083, + "step": 31090 + }, + { + "grad_norm": 0.23742736876010895, + "learning_rate": 5.110497903132101e-05, + "loss": 0.0074, + "step": 31100 + }, + { + "grad_norm": 0.28013914823532104, + "learning_rate": 5.107742776491592e-05, + "loss": 0.0125, + "step": 31110 + }, + { + "grad_norm": 0.20948348939418793, + "learning_rate": 5.104987617121673e-05, + "loss": 0.0064, + "step": 31120 + }, + { + "grad_norm": 0.2949061095714569, + "learning_rate": 5.102232425859287e-05, + "loss": 0.0089, + "step": 31130 + }, + { + "grad_norm": 0.4035622179508209, + "learning_rate": 5.09947720354139e-05, + "loss": 0.0081, + "step": 31140 + }, + { + "grad_norm": 0.2383720576763153, + "learning_rate": 5.096721951004942e-05, + "loss": 0.0105, + "step": 31150 + }, + { + "grad_norm": 0.22843797504901886, + "learning_rate": 5.0939666690869227e-05, + "loss": 0.0079, + "step": 31160 + }, + { + "grad_norm": 0.1961676925420761, + "learning_rate": 5.0912113586243096e-05, + "loss": 0.0063, + "step": 31170 + }, + { + "grad_norm": 0.2668704688549042, + "learning_rate": 5.0884560204540935e-05, + "loss": 0.0065, + "step": 31180 + }, + { + "grad_norm": 0.2266463041305542, + "learning_rate": 5.0857006554132736e-05, + "loss": 0.0069, + "step": 31190 + }, + { + "grad_norm": 0.23129190504550934, + "learning_rate": 5.0829452643388575e-05, + "loss": 0.0073, + "step": 31200 + }, + { + "grad_norm": 0.1816384643316269, + "learning_rate": 5.08018984806786e-05, + "loss": 0.0058, + "step": 31210 + }, + { + "grad_norm": 0.2779273986816406, + "learning_rate": 5.0774344074373036e-05, + "loss": 0.0076, + "step": 31220 + }, + { + "grad_norm": 0.20822004973888397, + "learning_rate": 5.07467894328422e-05, + "loss": 0.0049, + "step": 31230 + }, + { + "grad_norm": 0.15333092212677002, + "learning_rate": 5.0719234564456454e-05, + "loss": 0.009, + "step": 31240 + }, + { + "grad_norm": 0.21931354701519012, + "learning_rate": 5.0691679477586216e-05, + "loss": 0.0067, + "step": 31250 + }, + { + "grad_norm": 0.228150874376297, + "learning_rate": 5.0664124180602035e-05, + "loss": 0.0059, + "step": 31260 + }, + { + "grad_norm": 0.18881909549236298, + "learning_rate": 5.063656868187447e-05, + "loss": 0.0065, + "step": 31270 + }, + { + "grad_norm": 0.242721289396286, + "learning_rate": 5.060901298977413e-05, + "loss": 0.0067, + "step": 31280 + }, + { + "grad_norm": 0.24006229639053345, + "learning_rate": 5.0581457112671725e-05, + "loss": 0.0075, + "step": 31290 + }, + { + "grad_norm": 0.2504872679710388, + "learning_rate": 5.0553901058938016e-05, + "loss": 0.0065, + "step": 31300 + }, + { + "grad_norm": 0.20765410363674164, + "learning_rate": 5.052634483694377e-05, + "loss": 0.0056, + "step": 31310 + }, + { + "grad_norm": 0.26836851239204407, + "learning_rate": 5.049878845505988e-05, + "loss": 0.0098, + "step": 31320 + }, + { + "grad_norm": 0.290301650762558, + "learning_rate": 5.047123192165721e-05, + "loss": 0.0067, + "step": 31330 + }, + { + "grad_norm": 0.36293449997901917, + "learning_rate": 5.0443675245106735e-05, + "loss": 0.0065, + "step": 31340 + }, + { + "grad_norm": 0.246487557888031, + "learning_rate": 5.0416118433779426e-05, + "loss": 0.0059, + "step": 31350 + }, + { + "grad_norm": 0.2621065378189087, + "learning_rate": 5.038856149604633e-05, + "loss": 0.0094, + "step": 31360 + }, + { + "grad_norm": 0.22675533592700958, + "learning_rate": 5.03610044402785e-05, + "loss": 0.0069, + "step": 31370 + }, + { + "grad_norm": 0.18436338007450104, + "learning_rate": 5.033344727484707e-05, + "loss": 0.006, + "step": 31380 + }, + { + "grad_norm": 0.22949163615703583, + "learning_rate": 5.030589000812315e-05, + "loss": 0.0066, + "step": 31390 + }, + { + "grad_norm": 0.21328596770763397, + "learning_rate": 5.027833264847793e-05, + "loss": 0.0071, + "step": 31400 + }, + { + "grad_norm": 0.3004545271396637, + "learning_rate": 5.025077520428258e-05, + "loss": 0.0109, + "step": 31410 + }, + { + "grad_norm": 0.24109072983264923, + "learning_rate": 5.022321768390837e-05, + "loss": 0.0067, + "step": 31420 + }, + { + "grad_norm": 0.16669245064258575, + "learning_rate": 5.0195660095726516e-05, + "loss": 0.0068, + "step": 31430 + }, + { + "grad_norm": 0.2005222588777542, + "learning_rate": 5.016810244810829e-05, + "loss": 0.0078, + "step": 31440 + }, + { + "grad_norm": 0.16585180163383484, + "learning_rate": 5.0140544749424976e-05, + "loss": 0.009, + "step": 31450 + }, + { + "grad_norm": 0.1720247119665146, + "learning_rate": 5.0112987008047874e-05, + "loss": 0.007, + "step": 31460 + }, + { + "grad_norm": 0.22980570793151855, + "learning_rate": 5.008542923234831e-05, + "loss": 0.0074, + "step": 31470 + }, + { + "grad_norm": 0.21856798231601715, + "learning_rate": 5.00578714306976e-05, + "loss": 0.0065, + "step": 31480 + }, + { + "grad_norm": 0.2693788409233093, + "learning_rate": 5.0030313611467084e-05, + "loss": 0.0096, + "step": 31490 + }, + { + "grad_norm": 0.18391215801239014, + "learning_rate": 5.0002755783028074e-05, + "loss": 0.0088, + "step": 31500 + }, + { + "grad_norm": 0.2168341875076294, + "learning_rate": 4.997519795375194e-05, + "loss": 0.008, + "step": 31510 + }, + { + "grad_norm": 0.23137378692626953, + "learning_rate": 4.9947640132010016e-05, + "loss": 0.0057, + "step": 31520 + }, + { + "grad_norm": 0.18477937579154968, + "learning_rate": 4.9920082326173625e-05, + "loss": 0.0094, + "step": 31530 + }, + { + "grad_norm": 0.19932635128498077, + "learning_rate": 4.9892524544614114e-05, + "loss": 0.0073, + "step": 31540 + }, + { + "grad_norm": 0.217888742685318, + "learning_rate": 4.986496679570283e-05, + "loss": 0.0054, + "step": 31550 + }, + { + "grad_norm": 0.3126939535140991, + "learning_rate": 4.983740908781105e-05, + "loss": 0.0077, + "step": 31560 + }, + { + "grad_norm": 0.30557388067245483, + "learning_rate": 4.9809851429310116e-05, + "loss": 0.0096, + "step": 31570 + }, + { + "grad_norm": 0.2228098064661026, + "learning_rate": 4.9782293828571275e-05, + "loss": 0.0078, + "step": 31580 + }, + { + "grad_norm": 0.2402837574481964, + "learning_rate": 4.9754736293965846e-05, + "loss": 0.006, + "step": 31590 + }, + { + "grad_norm": 0.3416842520236969, + "learning_rate": 4.972717883386502e-05, + "loss": 0.0127, + "step": 31600 + }, + { + "grad_norm": 0.2635408341884613, + "learning_rate": 4.9699621456640075e-05, + "loss": 0.0083, + "step": 31610 + }, + { + "grad_norm": 0.2247954159975052, + "learning_rate": 4.9672064170662214e-05, + "loss": 0.0069, + "step": 31620 + }, + { + "grad_norm": 0.2249128520488739, + "learning_rate": 4.9644506984302583e-05, + "loss": 0.0069, + "step": 31630 + }, + { + "grad_norm": 0.2463538944721222, + "learning_rate": 4.9616949905932356e-05, + "loss": 0.0088, + "step": 31640 + }, + { + "grad_norm": 0.2651323676109314, + "learning_rate": 4.9589392943922615e-05, + "loss": 0.0077, + "step": 31650 + }, + { + "grad_norm": 0.22297944128513336, + "learning_rate": 4.956183610664447e-05, + "loss": 0.0083, + "step": 31660 + }, + { + "grad_norm": 0.22701004147529602, + "learning_rate": 4.9534279402468945e-05, + "loss": 0.0083, + "step": 31670 + }, + { + "grad_norm": 0.15203142166137695, + "learning_rate": 4.9506722839767036e-05, + "loss": 0.0061, + "step": 31680 + }, + { + "grad_norm": 0.32742583751678467, + "learning_rate": 4.947916642690972e-05, + "loss": 0.0089, + "step": 31690 + }, + { + "grad_norm": 0.2184576690196991, + "learning_rate": 4.9451610172267874e-05, + "loss": 0.0068, + "step": 31700 + }, + { + "grad_norm": 0.17164179682731628, + "learning_rate": 4.9424054084212376e-05, + "loss": 0.0084, + "step": 31710 + }, + { + "grad_norm": 0.24547478556632996, + "learning_rate": 4.939649817111407e-05, + "loss": 0.009, + "step": 31720 + }, + { + "grad_norm": 0.2324201464653015, + "learning_rate": 4.936894244134365e-05, + "loss": 0.0075, + "step": 31730 + }, + { + "grad_norm": 0.17113351821899414, + "learning_rate": 4.9341386903271886e-05, + "loss": 0.007, + "step": 31740 + }, + { + "grad_norm": 0.2053913027048111, + "learning_rate": 4.931383156526936e-05, + "loss": 0.0071, + "step": 31750 + }, + { + "grad_norm": 0.24972592294216156, + "learning_rate": 4.92862764357067e-05, + "loss": 0.0081, + "step": 31760 + }, + { + "grad_norm": 0.20848095417022705, + "learning_rate": 4.925872152295443e-05, + "loss": 0.007, + "step": 31770 + }, + { + "grad_norm": 0.2362770438194275, + "learning_rate": 4.923116683538296e-05, + "loss": 0.0112, + "step": 31780 + }, + { + "grad_norm": 0.18292094767093658, + "learning_rate": 4.920361238136273e-05, + "loss": 0.0067, + "step": 31790 + }, + { + "grad_norm": 0.2739775776863098, + "learning_rate": 4.9176058169264014e-05, + "loss": 0.0095, + "step": 31800 + }, + { + "grad_norm": 0.23343752324581146, + "learning_rate": 4.9148504207457074e-05, + "loss": 0.0067, + "step": 31810 + }, + { + "grad_norm": 0.2778758108615875, + "learning_rate": 4.912095050431208e-05, + "loss": 0.0075, + "step": 31820 + }, + { + "grad_norm": 0.1821853071451187, + "learning_rate": 4.909339706819911e-05, + "loss": 0.0065, + "step": 31830 + }, + { + "grad_norm": 0.23555830121040344, + "learning_rate": 4.906584390748819e-05, + "loss": 0.0073, + "step": 31840 + }, + { + "grad_norm": 0.24512019753456116, + "learning_rate": 4.9038291030549195e-05, + "loss": 0.0063, + "step": 31850 + }, + { + "grad_norm": 0.20886830985546112, + "learning_rate": 4.9010738445751995e-05, + "loss": 0.0085, + "step": 31860 + }, + { + "grad_norm": 0.2214621752500534, + "learning_rate": 4.8983186161466364e-05, + "loss": 0.0069, + "step": 31870 + }, + { + "grad_norm": 0.1699652373790741, + "learning_rate": 4.89556341860619e-05, + "loss": 0.0056, + "step": 31880 + }, + { + "grad_norm": 0.1831630915403366, + "learning_rate": 4.892808252790822e-05, + "loss": 0.007, + "step": 31890 + }, + { + "grad_norm": 0.3026648759841919, + "learning_rate": 4.890053119537475e-05, + "loss": 0.0093, + "step": 31900 + }, + { + "grad_norm": 0.2638152539730072, + "learning_rate": 4.887298019683087e-05, + "loss": 0.0066, + "step": 31910 + }, + { + "grad_norm": 0.2961420714855194, + "learning_rate": 4.884542954064587e-05, + "loss": 0.0066, + "step": 31920 + }, + { + "grad_norm": 0.22466962039470673, + "learning_rate": 4.881787923518887e-05, + "loss": 0.0073, + "step": 31930 + }, + { + "grad_norm": 0.2759089767932892, + "learning_rate": 4.879032928882896e-05, + "loss": 0.0088, + "step": 31940 + }, + { + "grad_norm": 0.2509841322898865, + "learning_rate": 4.876277970993505e-05, + "loss": 0.0067, + "step": 31950 + }, + { + "grad_norm": 0.30766138434410095, + "learning_rate": 4.873523050687602e-05, + "loss": 0.0103, + "step": 31960 + }, + { + "grad_norm": 0.21319961547851562, + "learning_rate": 4.870768168802056e-05, + "loss": 0.0061, + "step": 31970 + }, + { + "grad_norm": 0.22675667703151703, + "learning_rate": 4.868013326173728e-05, + "loss": 0.0089, + "step": 31980 + }, + { + "grad_norm": 0.17030957341194153, + "learning_rate": 4.865258523639468e-05, + "loss": 0.0055, + "step": 31990 + }, + { + "grad_norm": 0.23386850953102112, + "learning_rate": 4.862503762036109e-05, + "loss": 0.0082, + "step": 32000 + }, + { + "grad_norm": 0.2531217336654663, + "learning_rate": 4.859749042200478e-05, + "loss": 0.0063, + "step": 32010 + }, + { + "grad_norm": 0.2289196252822876, + "learning_rate": 4.856994364969384e-05, + "loss": 0.0064, + "step": 32020 + }, + { + "grad_norm": 0.22871866822242737, + "learning_rate": 4.854239731179625e-05, + "loss": 0.0078, + "step": 32030 + }, + { + "grad_norm": 0.26219645142555237, + "learning_rate": 4.85148514166799e-05, + "loss": 0.0073, + "step": 32040 + }, + { + "grad_norm": 0.22148561477661133, + "learning_rate": 4.8487305972712456e-05, + "loss": 0.006, + "step": 32050 + }, + { + "grad_norm": 0.24628743529319763, + "learning_rate": 4.8459760988261526e-05, + "loss": 0.0079, + "step": 32060 + }, + { + "grad_norm": 0.20057158172130585, + "learning_rate": 4.843221647169453e-05, + "loss": 0.0057, + "step": 32070 + }, + { + "grad_norm": 0.18639099597930908, + "learning_rate": 4.840467243137878e-05, + "loss": 0.008, + "step": 32080 + }, + { + "grad_norm": 0.18927448987960815, + "learning_rate": 4.837712887568143e-05, + "loss": 0.0068, + "step": 32090 + }, + { + "grad_norm": 0.2605811059474945, + "learning_rate": 4.8349585812969464e-05, + "loss": 0.0059, + "step": 32100 + }, + { + "grad_norm": 0.2559966444969177, + "learning_rate": 4.8322043251609775e-05, + "loss": 0.0074, + "step": 32110 + }, + { + "grad_norm": 0.38517898321151733, + "learning_rate": 4.8294501199969015e-05, + "loss": 0.0098, + "step": 32120 + }, + { + "grad_norm": 0.16990350186824799, + "learning_rate": 4.826695966641376e-05, + "loss": 0.0053, + "step": 32130 + }, + { + "grad_norm": 0.28396254777908325, + "learning_rate": 4.823941865931043e-05, + "loss": 0.0067, + "step": 32140 + }, + { + "grad_norm": 0.2862111032009125, + "learning_rate": 4.82118781870252e-05, + "loss": 0.0065, + "step": 32150 + }, + { + "grad_norm": 0.24131996929645538, + "learning_rate": 4.8184338257924185e-05, + "loss": 0.0088, + "step": 32160 + }, + { + "grad_norm": 0.26497340202331543, + "learning_rate": 4.815679888037324e-05, + "loss": 0.0105, + "step": 32170 + }, + { + "grad_norm": 0.2897150218486786, + "learning_rate": 4.8129260062738135e-05, + "loss": 0.0089, + "step": 32180 + }, + { + "grad_norm": 0.24623708426952362, + "learning_rate": 4.810172181338445e-05, + "loss": 0.0062, + "step": 32190 + }, + { + "grad_norm": 0.20998826622962952, + "learning_rate": 4.807418414067753e-05, + "loss": 0.0054, + "step": 32200 + }, + { + "grad_norm": 0.23517116904258728, + "learning_rate": 4.804664705298264e-05, + "loss": 0.0058, + "step": 32210 + }, + { + "grad_norm": 0.13893675804138184, + "learning_rate": 4.80191105586648e-05, + "loss": 0.0064, + "step": 32220 + }, + { + "grad_norm": 0.22650250792503357, + "learning_rate": 4.799157466608886e-05, + "loss": 0.0074, + "step": 32230 + }, + { + "grad_norm": 0.22311876714229584, + "learning_rate": 4.796403938361951e-05, + "loss": 0.0068, + "step": 32240 + }, + { + "grad_norm": 0.19218571484088898, + "learning_rate": 4.793650471962123e-05, + "loss": 0.0092, + "step": 32250 + }, + { + "grad_norm": 0.2698209285736084, + "learning_rate": 4.790897068245835e-05, + "loss": 0.0055, + "step": 32260 + }, + { + "grad_norm": 0.1621396243572235, + "learning_rate": 4.7881437280494954e-05, + "loss": 0.006, + "step": 32270 + }, + { + "grad_norm": 0.21912983059883118, + "learning_rate": 4.7853904522094965e-05, + "loss": 0.0073, + "step": 32280 + }, + { + "grad_norm": 0.2590363025665283, + "learning_rate": 4.782637241562215e-05, + "loss": 0.0087, + "step": 32290 + }, + { + "grad_norm": 0.1961211860179901, + "learning_rate": 4.779884096943997e-05, + "loss": 0.0056, + "step": 32300 + }, + { + "grad_norm": 0.2392387092113495, + "learning_rate": 4.777131019191182e-05, + "loss": 0.0072, + "step": 32310 + }, + { + "grad_norm": 0.23692595958709717, + "learning_rate": 4.774378009140076e-05, + "loss": 0.0061, + "step": 32320 + }, + { + "grad_norm": 0.13764940202236176, + "learning_rate": 4.7716250676269735e-05, + "loss": 0.0063, + "step": 32330 + }, + { + "grad_norm": 0.24975548684597015, + "learning_rate": 4.7688721954881485e-05, + "loss": 0.007, + "step": 32340 + }, + { + "grad_norm": 0.23488099873065948, + "learning_rate": 4.7661193935598446e-05, + "loss": 0.0061, + "step": 32350 + }, + { + "grad_norm": 0.2668916583061218, + "learning_rate": 4.763366662678296e-05, + "loss": 0.0077, + "step": 32360 + }, + { + "grad_norm": 0.2959752082824707, + "learning_rate": 4.7606140036797064e-05, + "loss": 0.0087, + "step": 32370 + }, + { + "grad_norm": 0.268684983253479, + "learning_rate": 4.7578614174002614e-05, + "loss": 0.0061, + "step": 32380 + }, + { + "grad_norm": 0.3234284222126007, + "learning_rate": 4.755108904676125e-05, + "loss": 0.0093, + "step": 32390 + }, + { + "grad_norm": 0.24526089429855347, + "learning_rate": 4.752356466343436e-05, + "loss": 0.0087, + "step": 32400 + }, + { + "grad_norm": 0.21310359239578247, + "learning_rate": 4.7496041032383174e-05, + "loss": 0.0068, + "step": 32410 + }, + { + "grad_norm": 0.2801964282989502, + "learning_rate": 4.746851816196858e-05, + "loss": 0.0075, + "step": 32420 + }, + { + "grad_norm": 0.17972113192081451, + "learning_rate": 4.744099606055135e-05, + "loss": 0.0079, + "step": 32430 + }, + { + "grad_norm": 0.30641815066337585, + "learning_rate": 4.741347473649193e-05, + "loss": 0.0086, + "step": 32440 + }, + { + "grad_norm": 0.2734004557132721, + "learning_rate": 4.738595419815058e-05, + "loss": 0.0067, + "step": 32450 + }, + { + "grad_norm": 0.2160828411579132, + "learning_rate": 4.7358434453887365e-05, + "loss": 0.0081, + "step": 32460 + }, + { + "grad_norm": 0.18313400447368622, + "learning_rate": 4.7330915512061976e-05, + "loss": 0.006, + "step": 32470 + }, + { + "grad_norm": 0.22924011945724487, + "learning_rate": 4.730339738103402e-05, + "loss": 0.0048, + "step": 32480 + }, + { + "grad_norm": 0.3271869719028473, + "learning_rate": 4.727588006916271e-05, + "loss": 0.0099, + "step": 32490 + }, + { + "grad_norm": 0.24933359026908875, + "learning_rate": 4.724836358480711e-05, + "loss": 0.0067, + "step": 32500 + }, + { + "grad_norm": 0.3346271514892578, + "learning_rate": 4.722084793632601e-05, + "loss": 0.0084, + "step": 32510 + }, + { + "grad_norm": 0.22423043847084045, + "learning_rate": 4.719333313207792e-05, + "loss": 0.0078, + "step": 32520 + }, + { + "grad_norm": 0.2899785041809082, + "learning_rate": 4.716581918042114e-05, + "loss": 0.0081, + "step": 32530 + }, + { + "grad_norm": 0.26931262016296387, + "learning_rate": 4.7138306089713636e-05, + "loss": 0.0087, + "step": 32540 + }, + { + "grad_norm": 0.22534888982772827, + "learning_rate": 4.7110793868313183e-05, + "loss": 0.0072, + "step": 32550 + }, + { + "grad_norm": 0.21676774322986603, + "learning_rate": 4.708328252457729e-05, + "loss": 0.0079, + "step": 32560 + }, + { + "grad_norm": 0.30667024850845337, + "learning_rate": 4.7055772066863135e-05, + "loss": 0.009, + "step": 32570 + }, + { + "grad_norm": 0.2854798436164856, + "learning_rate": 4.702826250352771e-05, + "loss": 0.0122, + "step": 32580 + }, + { + "grad_norm": 0.288504958152771, + "learning_rate": 4.7000753842927653e-05, + "loss": 0.0068, + "step": 32590 + }, + { + "grad_norm": 0.25443020462989807, + "learning_rate": 4.6973246093419384e-05, + "loss": 0.0087, + "step": 32600 + }, + { + "grad_norm": 0.18471738696098328, + "learning_rate": 4.694573926335906e-05, + "loss": 0.0063, + "step": 32610 + }, + { + "grad_norm": 0.20200151205062866, + "learning_rate": 4.6918233361102476e-05, + "loss": 0.0074, + "step": 32620 + }, + { + "grad_norm": 0.20897750556468964, + "learning_rate": 4.689072839500525e-05, + "loss": 0.0062, + "step": 32630 + }, + { + "grad_norm": 0.2191777229309082, + "learning_rate": 4.6863224373422635e-05, + "loss": 0.0079, + "step": 32640 + }, + { + "grad_norm": 0.24972093105316162, + "learning_rate": 4.683572130470962e-05, + "loss": 0.009, + "step": 32650 + }, + { + "grad_norm": 0.20447924733161926, + "learning_rate": 4.680821919722094e-05, + "loss": 0.0062, + "step": 32660 + }, + { + "grad_norm": 0.2813320457935333, + "learning_rate": 4.6780718059310975e-05, + "loss": 0.0072, + "step": 32670 + }, + { + "grad_norm": 0.27030473947525024, + "learning_rate": 4.675321789933389e-05, + "loss": 0.0064, + "step": 32680 + }, + { + "grad_norm": 0.11901049315929413, + "learning_rate": 4.6725718725643464e-05, + "loss": 0.0056, + "step": 32690 + }, + { + "grad_norm": 0.18552948534488678, + "learning_rate": 4.669822054659323e-05, + "loss": 0.0055, + "step": 32700 + }, + { + "grad_norm": 0.3296489119529724, + "learning_rate": 4.667072337053644e-05, + "loss": 0.009, + "step": 32710 + }, + { + "grad_norm": 0.2645881474018097, + "learning_rate": 4.6643227205825965e-05, + "loss": 0.0064, + "step": 32720 + }, + { + "grad_norm": 0.2363099753856659, + "learning_rate": 4.6615732060814454e-05, + "loss": 0.0059, + "step": 32730 + }, + { + "grad_norm": 0.2088632732629776, + "learning_rate": 4.658823794385417e-05, + "loss": 0.0076, + "step": 32740 + }, + { + "grad_norm": 0.1993638426065445, + "learning_rate": 4.6560744863297115e-05, + "loss": 0.0064, + "step": 32750 + }, + { + "grad_norm": 0.19181689620018005, + "learning_rate": 4.653325282749498e-05, + "loss": 0.0061, + "step": 32760 + }, + { + "grad_norm": 0.2350449115037918, + "learning_rate": 4.6505761844799075e-05, + "loss": 0.009, + "step": 32770 + }, + { + "grad_norm": 0.23427946865558624, + "learning_rate": 4.647827192356048e-05, + "loss": 0.008, + "step": 32780 + }, + { + "grad_norm": 0.20481781661510468, + "learning_rate": 4.645078307212989e-05, + "loss": 0.0054, + "step": 32790 + }, + { + "grad_norm": 0.24765610694885254, + "learning_rate": 4.642329529885768e-05, + "loss": 0.0068, + "step": 32800 + }, + { + "grad_norm": 0.2681053876876831, + "learning_rate": 4.639580861209393e-05, + "loss": 0.0072, + "step": 32810 + }, + { + "grad_norm": 0.2844184339046478, + "learning_rate": 4.636832302018835e-05, + "loss": 0.0053, + "step": 32820 + }, + { + "grad_norm": 0.24991938471794128, + "learning_rate": 4.6340838531490365e-05, + "loss": 0.0058, + "step": 32830 + }, + { + "grad_norm": 0.20992514491081238, + "learning_rate": 4.6313355154349e-05, + "loss": 0.0074, + "step": 32840 + }, + { + "grad_norm": 0.18317939341068268, + "learning_rate": 4.6285872897113025e-05, + "loss": 0.0096, + "step": 32850 + }, + { + "grad_norm": 0.17887622117996216, + "learning_rate": 4.625839176813077e-05, + "loss": 0.0059, + "step": 32860 + }, + { + "grad_norm": 0.2228119671344757, + "learning_rate": 4.623091177575031e-05, + "loss": 0.0059, + "step": 32870 + }, + { + "grad_norm": 0.19908063113689423, + "learning_rate": 4.620343292831936e-05, + "loss": 0.0066, + "step": 32880 + }, + { + "grad_norm": 0.28317978978157043, + "learning_rate": 4.6175955234185206e-05, + "loss": 0.0074, + "step": 32890 + }, + { + "grad_norm": 0.29716062545776367, + "learning_rate": 4.614847870169492e-05, + "loss": 0.007, + "step": 32900 + }, + { + "grad_norm": 0.24557605385780334, + "learning_rate": 4.612100333919509e-05, + "loss": 0.0062, + "step": 32910 + }, + { + "grad_norm": 0.2536318004131317, + "learning_rate": 4.609352915503202e-05, + "loss": 0.0054, + "step": 32920 + }, + { + "grad_norm": 0.27791568636894226, + "learning_rate": 4.606605615755166e-05, + "loss": 0.0058, + "step": 32930 + }, + { + "grad_norm": 0.16657011210918427, + "learning_rate": 4.6038584355099576e-05, + "loss": 0.0065, + "step": 32940 + }, + { + "grad_norm": 0.19968189299106598, + "learning_rate": 4.6011113756020964e-05, + "loss": 0.0086, + "step": 32950 + }, + { + "grad_norm": 0.2506800889968872, + "learning_rate": 4.598364436866066e-05, + "loss": 0.0071, + "step": 32960 + }, + { + "grad_norm": 0.33925187587738037, + "learning_rate": 4.595617620136316e-05, + "loss": 0.0091, + "step": 32970 + }, + { + "grad_norm": 0.24584905803203583, + "learning_rate": 4.592870926247257e-05, + "loss": 0.0085, + "step": 32980 + }, + { + "grad_norm": 0.22604088485240936, + "learning_rate": 4.5901243560332594e-05, + "loss": 0.0053, + "step": 32990 + }, + { + "grad_norm": 0.26294711232185364, + "learning_rate": 4.587377910328662e-05, + "loss": 0.0061, + "step": 33000 + }, + { + "grad_norm": 0.14909221231937408, + "learning_rate": 4.5846315899677586e-05, + "loss": 0.0078, + "step": 33010 + }, + { + "grad_norm": 0.21891048550605774, + "learning_rate": 4.5818853957848114e-05, + "loss": 0.0054, + "step": 33020 + }, + { + "grad_norm": 0.1842089593410492, + "learning_rate": 4.579139328614043e-05, + "loss": 0.0059, + "step": 33030 + }, + { + "grad_norm": 0.227177232503891, + "learning_rate": 4.576393389289633e-05, + "loss": 0.0064, + "step": 33040 + }, + { + "grad_norm": 0.22151440382003784, + "learning_rate": 4.573647578645728e-05, + "loss": 0.0067, + "step": 33050 + }, + { + "grad_norm": 0.25708913803100586, + "learning_rate": 4.57090189751643e-05, + "loss": 0.0064, + "step": 33060 + }, + { + "grad_norm": 0.22885015606880188, + "learning_rate": 4.568156346735806e-05, + "loss": 0.0061, + "step": 33070 + }, + { + "grad_norm": 0.1635526418685913, + "learning_rate": 4.565410927137882e-05, + "loss": 0.0051, + "step": 33080 + }, + { + "grad_norm": 0.16752150654792786, + "learning_rate": 4.562665639556644e-05, + "loss": 0.0056, + "step": 33090 + }, + { + "grad_norm": 0.2851332426071167, + "learning_rate": 4.559920484826037e-05, + "loss": 0.0066, + "step": 33100 + }, + { + "grad_norm": 0.23949535191059113, + "learning_rate": 4.5571754637799665e-05, + "loss": 0.0068, + "step": 33110 + }, + { + "grad_norm": 0.1821199208498001, + "learning_rate": 4.554430577252298e-05, + "loss": 0.0061, + "step": 33120 + }, + { + "grad_norm": 0.24349737167358398, + "learning_rate": 4.551685826076858e-05, + "loss": 0.0056, + "step": 33130 + }, + { + "grad_norm": 0.2185320407152176, + "learning_rate": 4.5489412110874246e-05, + "loss": 0.0069, + "step": 33140 + }, + { + "grad_norm": 0.28381046652793884, + "learning_rate": 4.5461967331177444e-05, + "loss": 0.0095, + "step": 33150 + }, + { + "grad_norm": 0.2769639194011688, + "learning_rate": 4.5434523930015115e-05, + "loss": 0.0059, + "step": 33160 + }, + { + "grad_norm": 0.1941869705915451, + "learning_rate": 4.540708191572388e-05, + "loss": 0.0076, + "step": 33170 + }, + { + "grad_norm": 0.21477438509464264, + "learning_rate": 4.537964129663991e-05, + "loss": 0.007, + "step": 33180 + }, + { + "grad_norm": 0.24743351340293884, + "learning_rate": 4.535220208109889e-05, + "loss": 0.0089, + "step": 33190 + }, + { + "grad_norm": 0.16789889335632324, + "learning_rate": 4.5324764277436194e-05, + "loss": 0.0085, + "step": 33200 + }, + { + "grad_norm": 0.21021337807178497, + "learning_rate": 4.529732789398664e-05, + "loss": 0.0069, + "step": 33210 + }, + { + "grad_norm": 0.18843376636505127, + "learning_rate": 4.526989293908472e-05, + "loss": 0.0067, + "step": 33220 + }, + { + "grad_norm": 0.1962655931711197, + "learning_rate": 4.524245942106442e-05, + "loss": 0.0066, + "step": 33230 + }, + { + "grad_norm": 0.20544129610061646, + "learning_rate": 4.5215027348259345e-05, + "loss": 0.0064, + "step": 33240 + }, + { + "grad_norm": 0.1872073858976364, + "learning_rate": 4.5187596729002616e-05, + "loss": 0.0059, + "step": 33250 + }, + { + "grad_norm": 0.18753452599048615, + "learning_rate": 4.516016757162693e-05, + "loss": 0.0068, + "step": 33260 + }, + { + "grad_norm": 0.19614441692829132, + "learning_rate": 4.513273988446457e-05, + "loss": 0.0089, + "step": 33270 + }, + { + "grad_norm": 0.2291834056377411, + "learning_rate": 4.5105313675847296e-05, + "loss": 0.0082, + "step": 33280 + }, + { + "grad_norm": 0.26047879457473755, + "learning_rate": 4.5077888954106495e-05, + "loss": 0.0055, + "step": 33290 + }, + { + "grad_norm": 0.2506044805049896, + "learning_rate": 4.505046572757309e-05, + "loss": 0.0075, + "step": 33300 + }, + { + "grad_norm": 0.17611005902290344, + "learning_rate": 4.502304400457749e-05, + "loss": 0.0054, + "step": 33310 + }, + { + "grad_norm": 0.31874096393585205, + "learning_rate": 4.499562379344973e-05, + "loss": 0.0073, + "step": 33320 + }, + { + "grad_norm": 0.2621985971927643, + "learning_rate": 4.4968205102519306e-05, + "loss": 0.0084, + "step": 33330 + }, + { + "grad_norm": 0.2626568675041199, + "learning_rate": 4.494078794011532e-05, + "loss": 0.0063, + "step": 33340 + }, + { + "grad_norm": 0.21751157939434052, + "learning_rate": 4.491337231456639e-05, + "loss": 0.008, + "step": 33350 + }, + { + "grad_norm": 0.1921292543411255, + "learning_rate": 4.4885958234200634e-05, + "loss": 0.0089, + "step": 33360 + }, + { + "grad_norm": 0.32345283031463623, + "learning_rate": 4.485854570734575e-05, + "loss": 0.0134, + "step": 33370 + }, + { + "grad_norm": 0.29324713349342346, + "learning_rate": 4.483113474232891e-05, + "loss": 0.0089, + "step": 33380 + }, + { + "grad_norm": 0.2626071572303772, + "learning_rate": 4.480372534747688e-05, + "loss": 0.0091, + "step": 33390 + }, + { + "grad_norm": 0.15506887435913086, + "learning_rate": 4.477631753111588e-05, + "loss": 0.0076, + "step": 33400 + }, + { + "grad_norm": 0.18630781769752502, + "learning_rate": 4.4748911301571686e-05, + "loss": 0.0058, + "step": 33410 + }, + { + "grad_norm": 0.2392105907201767, + "learning_rate": 4.472150666716961e-05, + "loss": 0.0074, + "step": 33420 + }, + { + "grad_norm": 0.20797984302043915, + "learning_rate": 4.469410363623442e-05, + "loss": 0.0074, + "step": 33430 + }, + { + "grad_norm": 0.2930055856704712, + "learning_rate": 4.466670221709044e-05, + "loss": 0.0064, + "step": 33440 + }, + { + "grad_norm": 0.22188888490200043, + "learning_rate": 4.463930241806154e-05, + "loss": 0.0055, + "step": 33450 + }, + { + "grad_norm": 0.279517263174057, + "learning_rate": 4.4611904247471006e-05, + "loss": 0.0075, + "step": 33460 + }, + { + "grad_norm": 0.21537578105926514, + "learning_rate": 4.458450771364171e-05, + "loss": 0.0052, + "step": 33470 + }, + { + "grad_norm": 0.1981678456068039, + "learning_rate": 4.4557112824895965e-05, + "loss": 0.0067, + "step": 33480 + }, + { + "grad_norm": 0.25552329421043396, + "learning_rate": 4.452971958955563e-05, + "loss": 0.0076, + "step": 33490 + }, + { + "grad_norm": 0.23445437848567963, + "learning_rate": 4.450232801594208e-05, + "loss": 0.0084, + "step": 33500 + }, + { + "grad_norm": 0.23738905787467957, + "learning_rate": 4.447493811237609e-05, + "loss": 0.0074, + "step": 33510 + }, + { + "grad_norm": 0.2786206901073456, + "learning_rate": 4.444754988717804e-05, + "loss": 0.006, + "step": 33520 + }, + { + "grad_norm": 0.2710059881210327, + "learning_rate": 4.442016334866771e-05, + "loss": 0.0075, + "step": 33530 + }, + { + "grad_norm": 0.20788449048995972, + "learning_rate": 4.4392778505164445e-05, + "loss": 0.0068, + "step": 33540 + }, + { + "grad_norm": 0.24663987755775452, + "learning_rate": 4.436539536498702e-05, + "loss": 0.0072, + "step": 33550 + }, + { + "grad_norm": 0.38234665989875793, + "learning_rate": 4.433801393645369e-05, + "loss": 0.0064, + "step": 33560 + }, + { + "grad_norm": 0.18163244426250458, + "learning_rate": 4.431063422788226e-05, + "loss": 0.0054, + "step": 33570 + }, + { + "grad_norm": 0.20825371146202087, + "learning_rate": 4.428325624758991e-05, + "loss": 0.0078, + "step": 33580 + }, + { + "grad_norm": 0.2456311136484146, + "learning_rate": 4.4255880003893366e-05, + "loss": 0.0075, + "step": 33590 + }, + { + "grad_norm": 0.2841171324253082, + "learning_rate": 4.422850550510884e-05, + "loss": 0.0089, + "step": 33600 + }, + { + "grad_norm": 0.18170909583568573, + "learning_rate": 4.4201132759551934e-05, + "loss": 0.0078, + "step": 33610 + }, + { + "grad_norm": 0.20780561864376068, + "learning_rate": 4.4173761775537804e-05, + "loss": 0.0108, + "step": 33620 + }, + { + "grad_norm": 0.15301473438739777, + "learning_rate": 4.414639256138099e-05, + "loss": 0.0063, + "step": 33630 + }, + { + "grad_norm": 0.16479627788066864, + "learning_rate": 4.411902512539557e-05, + "loss": 0.0092, + "step": 33640 + }, + { + "grad_norm": 0.24109962582588196, + "learning_rate": 4.4091659475895044e-05, + "loss": 0.0081, + "step": 33650 + }, + { + "grad_norm": 0.26153942942619324, + "learning_rate": 4.406429562119235e-05, + "loss": 0.0066, + "step": 33660 + }, + { + "grad_norm": 0.14684417843818665, + "learning_rate": 4.4036933569599945e-05, + "loss": 0.0064, + "step": 33670 + }, + { + "grad_norm": 0.1940925121307373, + "learning_rate": 4.400957332942965e-05, + "loss": 0.0054, + "step": 33680 + }, + { + "grad_norm": 0.1748809516429901, + "learning_rate": 4.3982214908992844e-05, + "loss": 0.006, + "step": 33690 + }, + { + "grad_norm": 0.23238727450370789, + "learning_rate": 4.3954858316600235e-05, + "loss": 0.0062, + "step": 33700 + }, + { + "grad_norm": 0.27522721886634827, + "learning_rate": 4.392750356056205e-05, + "loss": 0.006, + "step": 33710 + }, + { + "grad_norm": 0.17680269479751587, + "learning_rate": 4.390015064918798e-05, + "loss": 0.0055, + "step": 33720 + }, + { + "grad_norm": 0.21424515545368195, + "learning_rate": 4.387279959078705e-05, + "loss": 0.0049, + "step": 33730 + }, + { + "grad_norm": 0.1855633407831192, + "learning_rate": 4.384545039366786e-05, + "loss": 0.0091, + "step": 33740 + }, + { + "grad_norm": 0.20762841403484344, + "learning_rate": 4.381810306613831e-05, + "loss": 0.005, + "step": 33750 + }, + { + "grad_norm": 0.16944566369056702, + "learning_rate": 4.3790757616505826e-05, + "loss": 0.0067, + "step": 33760 + }, + { + "grad_norm": 0.20522117614746094, + "learning_rate": 4.376341405307725e-05, + "loss": 0.0057, + "step": 33770 + }, + { + "grad_norm": 0.19686530530452728, + "learning_rate": 4.37360723841588e-05, + "loss": 0.0069, + "step": 33780 + }, + { + "grad_norm": 0.19774030148983002, + "learning_rate": 4.370873261805619e-05, + "loss": 0.0055, + "step": 33790 + }, + { + "grad_norm": 0.30278995633125305, + "learning_rate": 4.368139476307449e-05, + "loss": 0.011, + "step": 33800 + }, + { + "grad_norm": 0.24399347603321075, + "learning_rate": 4.365405882751822e-05, + "loss": 0.0057, + "step": 33810 + }, + { + "grad_norm": 0.2786239981651306, + "learning_rate": 4.3626724819691326e-05, + "loss": 0.0072, + "step": 33820 + }, + { + "grad_norm": 0.22116117179393768, + "learning_rate": 4.359939274789715e-05, + "loss": 0.0054, + "step": 33830 + }, + { + "grad_norm": 0.24048860371112823, + "learning_rate": 4.357206262043848e-05, + "loss": 0.0071, + "step": 33840 + }, + { + "grad_norm": 0.1859717071056366, + "learning_rate": 4.354473444561745e-05, + "loss": 0.0062, + "step": 33850 + }, + { + "grad_norm": 0.22413165867328644, + "learning_rate": 4.3517408231735644e-05, + "loss": 0.0065, + "step": 33860 + }, + { + "grad_norm": 0.20817112922668457, + "learning_rate": 4.3490083987094086e-05, + "loss": 0.0053, + "step": 33870 + }, + { + "grad_norm": 0.24079784750938416, + "learning_rate": 4.34627617199931e-05, + "loss": 0.0051, + "step": 33880 + }, + { + "grad_norm": 0.2651442587375641, + "learning_rate": 4.3435441438732526e-05, + "loss": 0.0048, + "step": 33890 + }, + { + "grad_norm": 0.2585642337799072, + "learning_rate": 4.340812315161149e-05, + "loss": 0.0072, + "step": 33900 + }, + { + "grad_norm": 0.23831868171691895, + "learning_rate": 4.338080686692859e-05, + "loss": 0.0068, + "step": 33910 + }, + { + "grad_norm": 0.23950357735157013, + "learning_rate": 4.3353492592981816e-05, + "loss": 0.0056, + "step": 33920 + }, + { + "grad_norm": 0.1677539348602295, + "learning_rate": 4.3326180338068485e-05, + "loss": 0.0048, + "step": 33930 + }, + { + "grad_norm": 0.31859931349754333, + "learning_rate": 4.3298870110485356e-05, + "loss": 0.0058, + "step": 33940 + }, + { + "grad_norm": 0.2327817678451538, + "learning_rate": 4.3271561918528567e-05, + "loss": 0.0086, + "step": 33950 + }, + { + "grad_norm": 0.17561651766300201, + "learning_rate": 4.324425577049359e-05, + "loss": 0.007, + "step": 33960 + }, + { + "grad_norm": 0.2269870489835739, + "learning_rate": 4.321695167467535e-05, + "loss": 0.0107, + "step": 33970 + }, + { + "grad_norm": 0.2643167972564697, + "learning_rate": 4.3189649639368093e-05, + "loss": 0.0073, + "step": 33980 + }, + { + "grad_norm": 0.15537384152412415, + "learning_rate": 4.316234967286547e-05, + "loss": 0.0055, + "step": 33990 + }, + { + "grad_norm": 0.24303454160690308, + "learning_rate": 4.313505178346046e-05, + "loss": 0.0099, + "step": 34000 + }, + { + "grad_norm": 0.2226843237876892, + "learning_rate": 4.3107755979445465e-05, + "loss": 0.0069, + "step": 34010 + }, + { + "grad_norm": 0.22701045870780945, + "learning_rate": 4.308046226911224e-05, + "loss": 0.0077, + "step": 34020 + }, + { + "grad_norm": 0.2765866816043854, + "learning_rate": 4.305317066075185e-05, + "loss": 0.0087, + "step": 34030 + }, + { + "grad_norm": 0.21416576206684113, + "learning_rate": 4.302588116265482e-05, + "loss": 0.0079, + "step": 34040 + }, + { + "grad_norm": 0.2295892834663391, + "learning_rate": 4.299859378311094e-05, + "loss": 0.0061, + "step": 34050 + }, + { + "grad_norm": 0.22910450398921967, + "learning_rate": 4.2971308530409424e-05, + "loss": 0.0065, + "step": 34060 + }, + { + "grad_norm": 0.27474451065063477, + "learning_rate": 4.2944025412838765e-05, + "loss": 0.0094, + "step": 34070 + }, + { + "grad_norm": 0.20846879482269287, + "learning_rate": 4.291674443868689e-05, + "loss": 0.0064, + "step": 34080 + }, + { + "grad_norm": 0.2005152404308319, + "learning_rate": 4.288946561624104e-05, + "loss": 0.0054, + "step": 34090 + }, + { + "grad_norm": 0.15358655154705048, + "learning_rate": 4.2862188953787794e-05, + "loss": 0.0053, + "step": 34100 + }, + { + "grad_norm": 0.2720765173435211, + "learning_rate": 4.283491445961308e-05, + "loss": 0.008, + "step": 34110 + }, + { + "grad_norm": 0.19924184679985046, + "learning_rate": 4.2807642142002155e-05, + "loss": 0.0074, + "step": 34120 + }, + { + "grad_norm": 0.1904723048210144, + "learning_rate": 4.278037200923966e-05, + "loss": 0.0046, + "step": 34130 + }, + { + "grad_norm": 0.15538300573825836, + "learning_rate": 4.275310406960953e-05, + "loss": 0.006, + "step": 34140 + }, + { + "grad_norm": 0.19680018723011017, + "learning_rate": 4.272583833139502e-05, + "loss": 0.008, + "step": 34150 + }, + { + "grad_norm": 0.2198832929134369, + "learning_rate": 4.2698574802878794e-05, + "loss": 0.0055, + "step": 34160 + }, + { + "grad_norm": 0.21210193634033203, + "learning_rate": 4.2671313492342734e-05, + "loss": 0.0073, + "step": 34170 + }, + { + "grad_norm": 0.22931769490242004, + "learning_rate": 4.264405440806813e-05, + "loss": 0.0056, + "step": 34180 + }, + { + "grad_norm": 0.23736603558063507, + "learning_rate": 4.26167975583356e-05, + "loss": 0.0068, + "step": 34190 + }, + { + "grad_norm": 0.23222510516643524, + "learning_rate": 4.2589542951425e-05, + "loss": 0.007, + "step": 34200 + }, + { + "grad_norm": 0.27665337920188904, + "learning_rate": 4.2562290595615615e-05, + "loss": 0.0081, + "step": 34210 + }, + { + "grad_norm": 0.2881568372249603, + "learning_rate": 4.2535040499185946e-05, + "loss": 0.0069, + "step": 34220 + }, + { + "grad_norm": 0.18981459736824036, + "learning_rate": 4.250779267041387e-05, + "loss": 0.0051, + "step": 34230 + }, + { + "grad_norm": 0.2082643359899521, + "learning_rate": 4.248054711757657e-05, + "loss": 0.0059, + "step": 34240 + }, + { + "grad_norm": 0.24757909774780273, + "learning_rate": 4.245330384895052e-05, + "loss": 0.0071, + "step": 34250 + }, + { + "grad_norm": 0.1938125640153885, + "learning_rate": 4.242606287281151e-05, + "loss": 0.0056, + "step": 34260 + }, + { + "grad_norm": 0.24106638133525848, + "learning_rate": 4.2398824197434595e-05, + "loss": 0.0071, + "step": 34270 + }, + { + "grad_norm": 0.20034322142601013, + "learning_rate": 4.23715878310942e-05, + "loss": 0.006, + "step": 34280 + }, + { + "grad_norm": 0.23234619200229645, + "learning_rate": 4.234435378206402e-05, + "loss": 0.0065, + "step": 34290 + }, + { + "grad_norm": 0.18687517940998077, + "learning_rate": 4.2317122058617006e-05, + "loss": 0.0056, + "step": 34300 + }, + { + "grad_norm": 0.19216594099998474, + "learning_rate": 4.2289892669025485e-05, + "loss": 0.0073, + "step": 34310 + }, + { + "grad_norm": 0.194553405046463, + "learning_rate": 4.226266562156097e-05, + "loss": 0.0067, + "step": 34320 + }, + { + "grad_norm": 0.21808727085590363, + "learning_rate": 4.223544092449435e-05, + "loss": 0.0067, + "step": 34330 + }, + { + "grad_norm": 0.2061053067445755, + "learning_rate": 4.2208218586095784e-05, + "loss": 0.0058, + "step": 34340 + }, + { + "grad_norm": 0.21847741305828094, + "learning_rate": 4.218099861463466e-05, + "loss": 0.0071, + "step": 34350 + }, + { + "grad_norm": 0.26483115553855896, + "learning_rate": 4.215378101837972e-05, + "loss": 0.0059, + "step": 34360 + }, + { + "grad_norm": 0.23244349658489227, + "learning_rate": 4.2126565805598937e-05, + "loss": 0.0055, + "step": 34370 + }, + { + "grad_norm": 0.21753989160060883, + "learning_rate": 4.209935298455957e-05, + "loss": 0.0077, + "step": 34380 + }, + { + "grad_norm": 0.24541738629341125, + "learning_rate": 4.207214256352817e-05, + "loss": 0.0052, + "step": 34390 + }, + { + "grad_norm": 0.3085399866104126, + "learning_rate": 4.2044934550770524e-05, + "loss": 0.0064, + "step": 34400 + }, + { + "grad_norm": 0.2537444829940796, + "learning_rate": 4.201772895455174e-05, + "loss": 0.0065, + "step": 34410 + }, + { + "grad_norm": 0.2249620109796524, + "learning_rate": 4.199052578313613e-05, + "loss": 0.0063, + "step": 34420 + }, + { + "grad_norm": 0.26493632793426514, + "learning_rate": 4.1963325044787294e-05, + "loss": 0.0085, + "step": 34430 + }, + { + "grad_norm": 0.18385231494903564, + "learning_rate": 4.193612674776814e-05, + "loss": 0.0068, + "step": 34440 + }, + { + "grad_norm": 0.23978319764137268, + "learning_rate": 4.1908930900340745e-05, + "loss": 0.0073, + "step": 34450 + }, + { + "grad_norm": 0.19395625591278076, + "learning_rate": 4.1881737510766536e-05, + "loss": 0.0072, + "step": 34460 + }, + { + "grad_norm": 0.20736093819141388, + "learning_rate": 4.185454658730609e-05, + "loss": 0.0064, + "step": 34470 + }, + { + "grad_norm": 0.22516420483589172, + "learning_rate": 4.1827358138219355e-05, + "loss": 0.0056, + "step": 34480 + }, + { + "grad_norm": 0.274665504693985, + "learning_rate": 4.1800172171765404e-05, + "loss": 0.0083, + "step": 34490 + }, + { + "grad_norm": 0.1988043487071991, + "learning_rate": 4.177298869620264e-05, + "loss": 0.0071, + "step": 34500 + }, + { + "grad_norm": 0.37068867683410645, + "learning_rate": 4.1745807719788705e-05, + "loss": 0.0076, + "step": 34510 + }, + { + "grad_norm": 0.18292322754859924, + "learning_rate": 4.1718629250780445e-05, + "loss": 0.0072, + "step": 34520 + }, + { + "grad_norm": 0.2500819265842438, + "learning_rate": 4.1691453297433956e-05, + "loss": 0.0064, + "step": 34530 + }, + { + "grad_norm": 0.26304394006729126, + "learning_rate": 4.166427986800457e-05, + "loss": 0.0052, + "step": 34540 + }, + { + "grad_norm": 0.26817741990089417, + "learning_rate": 4.163710897074688e-05, + "loss": 0.0067, + "step": 34550 + }, + { + "grad_norm": 0.2203875333070755, + "learning_rate": 4.1609940613914686e-05, + "loss": 0.0064, + "step": 34560 + }, + { + "grad_norm": 0.2155834138393402, + "learning_rate": 4.1582774805760996e-05, + "loss": 0.0057, + "step": 34570 + }, + { + "grad_norm": 0.25557976961135864, + "learning_rate": 4.155561155453809e-05, + "loss": 0.0057, + "step": 34580 + }, + { + "grad_norm": 0.19896359741687775, + "learning_rate": 4.15284508684974e-05, + "loss": 0.0078, + "step": 34590 + }, + { + "grad_norm": 0.22413352131843567, + "learning_rate": 4.1501292755889675e-05, + "loss": 0.0072, + "step": 34600 + }, + { + "grad_norm": 0.18792198598384857, + "learning_rate": 4.1474137224964833e-05, + "loss": 0.0056, + "step": 34610 + }, + { + "grad_norm": 0.24979586899280548, + "learning_rate": 4.144698428397197e-05, + "loss": 0.0094, + "step": 34620 + }, + { + "grad_norm": 0.20292992889881134, + "learning_rate": 4.1419833941159466e-05, + "loss": 0.0063, + "step": 34630 + }, + { + "grad_norm": 0.21745379269123077, + "learning_rate": 4.1392686204774846e-05, + "loss": 0.0056, + "step": 34640 + }, + { + "grad_norm": 0.20036591589450836, + "learning_rate": 4.13655410830649e-05, + "loss": 0.0066, + "step": 34650 + }, + { + "grad_norm": 0.21364815533161163, + "learning_rate": 4.1338398584275594e-05, + "loss": 0.0059, + "step": 34660 + }, + { + "grad_norm": 0.20373791456222534, + "learning_rate": 4.1311258716652104e-05, + "loss": 0.0049, + "step": 34670 + }, + { + "grad_norm": 0.22007043659687042, + "learning_rate": 4.128412148843881e-05, + "loss": 0.0067, + "step": 34680 + }, + { + "grad_norm": 0.19879628717899323, + "learning_rate": 4.125698690787926e-05, + "loss": 0.0068, + "step": 34690 + }, + { + "grad_norm": 0.23853397369384766, + "learning_rate": 4.1229854983216245e-05, + "loss": 0.0063, + "step": 34700 + }, + { + "grad_norm": 0.20324555039405823, + "learning_rate": 4.120272572269175e-05, + "loss": 0.0048, + "step": 34710 + }, + { + "grad_norm": 0.23553842306137085, + "learning_rate": 4.117559913454687e-05, + "loss": 0.0065, + "step": 34720 + }, + { + "grad_norm": 0.24592702090740204, + "learning_rate": 4.114847522702201e-05, + "loss": 0.0063, + "step": 34730 + }, + { + "grad_norm": 0.2628866732120514, + "learning_rate": 4.112135400835664e-05, + "loss": 0.0061, + "step": 34740 + }, + { + "grad_norm": 0.17729780077934265, + "learning_rate": 4.109423548678949e-05, + "loss": 0.0068, + "step": 34750 + }, + { + "grad_norm": 0.26210513710975647, + "learning_rate": 4.106711967055848e-05, + "loss": 0.0061, + "step": 34760 + }, + { + "grad_norm": 0.23312030732631683, + "learning_rate": 4.1040006567900636e-05, + "loss": 0.005, + "step": 34770 + }, + { + "grad_norm": 0.2101515531539917, + "learning_rate": 4.101289618705224e-05, + "loss": 0.0081, + "step": 34780 + }, + { + "grad_norm": 0.19878444075584412, + "learning_rate": 4.0985788536248675e-05, + "loss": 0.0071, + "step": 34790 + }, + { + "grad_norm": 0.23772254586219788, + "learning_rate": 4.095868362372454e-05, + "loss": 0.0123, + "step": 34800 + }, + { + "grad_norm": 0.21559622883796692, + "learning_rate": 4.0931581457713614e-05, + "loss": 0.0052, + "step": 34810 + }, + { + "grad_norm": 0.2253088355064392, + "learning_rate": 4.09044820464488e-05, + "loss": 0.0063, + "step": 34820 + }, + { + "grad_norm": 0.24349753558635712, + "learning_rate": 4.087738539816219e-05, + "loss": 0.0046, + "step": 34830 + }, + { + "grad_norm": 0.2786179780960083, + "learning_rate": 4.085029152108501e-05, + "loss": 0.0068, + "step": 34840 + }, + { + "grad_norm": 0.1682848185300827, + "learning_rate": 4.0823200423447714e-05, + "loss": 0.0062, + "step": 34850 + }, + { + "grad_norm": 0.22498951852321625, + "learning_rate": 4.079611211347981e-05, + "loss": 0.0065, + "step": 34860 + }, + { + "grad_norm": 0.16856351494789124, + "learning_rate": 4.076902659941002e-05, + "loss": 0.0048, + "step": 34870 + }, + { + "grad_norm": 0.17777703702449799, + "learning_rate": 4.074194388946624e-05, + "loss": 0.005, + "step": 34880 + }, + { + "grad_norm": 0.2599576711654663, + "learning_rate": 4.071486399187545e-05, + "loss": 0.0056, + "step": 34890 + }, + { + "grad_norm": 0.2105918526649475, + "learning_rate": 4.0687786914863836e-05, + "loss": 0.0049, + "step": 34900 + }, + { + "grad_norm": 0.2822265923023224, + "learning_rate": 4.0660712666656666e-05, + "loss": 0.0064, + "step": 34910 + }, + { + "grad_norm": 0.2873954772949219, + "learning_rate": 4.0633641255478394e-05, + "loss": 0.008, + "step": 34920 + }, + { + "grad_norm": 0.22428452968597412, + "learning_rate": 4.0606572689552624e-05, + "loss": 0.0064, + "step": 34930 + }, + { + "grad_norm": 0.2152257263660431, + "learning_rate": 4.0579506977102036e-05, + "loss": 0.0064, + "step": 34940 + }, + { + "grad_norm": 0.22747565805912018, + "learning_rate": 4.055244412634849e-05, + "loss": 0.0053, + "step": 34950 + }, + { + "grad_norm": 0.23033103346824646, + "learning_rate": 4.052538414551298e-05, + "loss": 0.006, + "step": 34960 + }, + { + "grad_norm": 0.23219303786754608, + "learning_rate": 4.0498327042815596e-05, + "loss": 0.0065, + "step": 34970 + }, + { + "grad_norm": 0.21151939034461975, + "learning_rate": 4.047127282647559e-05, + "loss": 0.0087, + "step": 34980 + }, + { + "grad_norm": 0.13151350617408752, + "learning_rate": 4.04442215047113e-05, + "loss": 0.0044, + "step": 34990 + }, + { + "grad_norm": 0.15589280426502228, + "learning_rate": 4.041717308574023e-05, + "loss": 0.0063, + "step": 35000 + }, + { + "grad_norm": 0.17671816051006317, + "learning_rate": 4.039012757777893e-05, + "loss": 0.0048, + "step": 35010 + }, + { + "grad_norm": 0.23570609092712402, + "learning_rate": 4.036308498904314e-05, + "loss": 0.0053, + "step": 35020 + }, + { + "grad_norm": 0.229853555560112, + "learning_rate": 4.033604532774771e-05, + "loss": 0.0067, + "step": 35030 + }, + { + "grad_norm": 0.27073344588279724, + "learning_rate": 4.030900860210652e-05, + "loss": 0.0062, + "step": 35040 + }, + { + "grad_norm": 0.26767176389694214, + "learning_rate": 4.028197482033266e-05, + "loss": 0.0075, + "step": 35050 + }, + { + "grad_norm": 0.2794646620750427, + "learning_rate": 4.0254943990638246e-05, + "loss": 0.0078, + "step": 35060 + }, + { + "grad_norm": 0.20178599655628204, + "learning_rate": 4.022791612123454e-05, + "loss": 0.007, + "step": 35070 + }, + { + "grad_norm": 0.23101627826690674, + "learning_rate": 4.020089122033192e-05, + "loss": 0.0062, + "step": 35080 + }, + { + "grad_norm": 0.21919691562652588, + "learning_rate": 4.01738692961398e-05, + "loss": 0.0055, + "step": 35090 + }, + { + "grad_norm": 0.24761426448822021, + "learning_rate": 4.014685035686675e-05, + "loss": 0.0062, + "step": 35100 + }, + { + "grad_norm": 0.2112138569355011, + "learning_rate": 4.011983441072039e-05, + "loss": 0.0061, + "step": 35110 + }, + { + "grad_norm": 0.19377121329307556, + "learning_rate": 4.0092821465907485e-05, + "loss": 0.007, + "step": 35120 + }, + { + "grad_norm": 0.18435238301753998, + "learning_rate": 4.006581153063383e-05, + "loss": 0.0076, + "step": 35130 + }, + { + "grad_norm": 0.2108333855867386, + "learning_rate": 4.003880461310432e-05, + "loss": 0.0075, + "step": 35140 + }, + { + "grad_norm": 0.20133459568023682, + "learning_rate": 4.001180072152298e-05, + "loss": 0.0083, + "step": 35150 + }, + { + "grad_norm": 0.2420373260974884, + "learning_rate": 3.998479986409285e-05, + "loss": 0.0054, + "step": 35160 + }, + { + "grad_norm": 0.4009375274181366, + "learning_rate": 3.995780204901607e-05, + "loss": 0.008, + "step": 35170 + }, + { + "grad_norm": 0.2990924119949341, + "learning_rate": 3.993080728449391e-05, + "loss": 0.0055, + "step": 35180 + }, + { + "grad_norm": 0.23826587200164795, + "learning_rate": 3.990381557872661e-05, + "loss": 0.0067, + "step": 35190 + }, + { + "grad_norm": 0.16642212867736816, + "learning_rate": 3.987682693991359e-05, + "loss": 0.0061, + "step": 35200 + }, + { + "grad_norm": 0.24827103316783905, + "learning_rate": 3.9849841376253226e-05, + "loss": 0.0061, + "step": 35210 + }, + { + "grad_norm": 0.20943792164325714, + "learning_rate": 3.982285889594306e-05, + "loss": 0.0056, + "step": 35220 + }, + { + "grad_norm": 0.15458567440509796, + "learning_rate": 3.9795879507179665e-05, + "loss": 0.0044, + "step": 35230 + }, + { + "grad_norm": 0.15484702587127686, + "learning_rate": 3.9768903218158634e-05, + "loss": 0.0044, + "step": 35240 + }, + { + "grad_norm": 0.29244929552078247, + "learning_rate": 3.974193003707468e-05, + "loss": 0.0076, + "step": 35250 + }, + { + "grad_norm": 0.28837576508522034, + "learning_rate": 3.971495997212152e-05, + "loss": 0.0086, + "step": 35260 + }, + { + "grad_norm": 0.2749343514442444, + "learning_rate": 3.9687993031491985e-05, + "loss": 0.0075, + "step": 35270 + }, + { + "grad_norm": 0.27931755781173706, + "learning_rate": 3.966102922337787e-05, + "loss": 0.0071, + "step": 35280 + }, + { + "grad_norm": 0.23164190351963043, + "learning_rate": 3.963406855597009e-05, + "loss": 0.0071, + "step": 35290 + }, + { + "grad_norm": 0.2774469256401062, + "learning_rate": 3.960711103745861e-05, + "loss": 0.0081, + "step": 35300 + }, + { + "grad_norm": 0.28086864948272705, + "learning_rate": 3.958015667603237e-05, + "loss": 0.0088, + "step": 35310 + }, + { + "grad_norm": 0.21511541306972504, + "learning_rate": 3.955320547987943e-05, + "loss": 0.0067, + "step": 35320 + }, + { + "grad_norm": 0.16067419946193695, + "learning_rate": 3.952625745718681e-05, + "loss": 0.0045, + "step": 35330 + }, + { + "grad_norm": 0.22300614416599274, + "learning_rate": 3.949931261614064e-05, + "loss": 0.0051, + "step": 35340 + }, + { + "grad_norm": 0.18996204435825348, + "learning_rate": 3.947237096492605e-05, + "loss": 0.0056, + "step": 35350 + }, + { + "grad_norm": 0.1919393390417099, + "learning_rate": 3.944543251172719e-05, + "loss": 0.0061, + "step": 35360 + }, + { + "grad_norm": 0.1869780570268631, + "learning_rate": 3.941849726472725e-05, + "loss": 0.0095, + "step": 35370 + }, + { + "grad_norm": 0.2237592339515686, + "learning_rate": 3.939156523210846e-05, + "loss": 0.007, + "step": 35380 + }, + { + "grad_norm": 0.23426155745983124, + "learning_rate": 3.9364636422052046e-05, + "loss": 0.0069, + "step": 35390 + }, + { + "grad_norm": 0.2058716118335724, + "learning_rate": 3.933771084273828e-05, + "loss": 0.0045, + "step": 35400 + }, + { + "grad_norm": 0.20779933035373688, + "learning_rate": 3.931078850234643e-05, + "loss": 0.0054, + "step": 35410 + }, + { + "grad_norm": 0.20989027619361877, + "learning_rate": 3.928386940905483e-05, + "loss": 0.0043, + "step": 35420 + }, + { + "grad_norm": 0.17320884764194489, + "learning_rate": 3.925695357104073e-05, + "loss": 0.0053, + "step": 35430 + }, + { + "grad_norm": 0.23786145448684692, + "learning_rate": 3.923004099648049e-05, + "loss": 0.0056, + "step": 35440 + }, + { + "grad_norm": 0.2616221010684967, + "learning_rate": 3.920313169354944e-05, + "loss": 0.0067, + "step": 35450 + }, + { + "grad_norm": 0.31618523597717285, + "learning_rate": 3.9176225670421897e-05, + "loss": 0.0054, + "step": 35460 + }, + { + "grad_norm": 0.17155753076076508, + "learning_rate": 3.9149322935271224e-05, + "loss": 0.0052, + "step": 35470 + }, + { + "grad_norm": 0.2382754385471344, + "learning_rate": 3.9122423496269725e-05, + "loss": 0.0059, + "step": 35480 + }, + { + "grad_norm": 0.17716126143932343, + "learning_rate": 3.909552736158877e-05, + "loss": 0.005, + "step": 35490 + }, + { + "grad_norm": 0.2661457359790802, + "learning_rate": 3.90686345393987e-05, + "loss": 0.0065, + "step": 35500 + }, + { + "grad_norm": 0.20925626158714294, + "learning_rate": 3.9041745037868816e-05, + "loss": 0.0063, + "step": 35510 + }, + { + "grad_norm": 0.20722226798534393, + "learning_rate": 3.9014858865167465e-05, + "loss": 0.005, + "step": 35520 + }, + { + "grad_norm": 0.2820894420146942, + "learning_rate": 3.8987976029461935e-05, + "loss": 0.0057, + "step": 35530 + }, + { + "grad_norm": 0.2097587138414383, + "learning_rate": 3.896109653891853e-05, + "loss": 0.0052, + "step": 35540 + }, + { + "grad_norm": 0.1808178573846817, + "learning_rate": 3.893422040170254e-05, + "loss": 0.0055, + "step": 35550 + }, + { + "grad_norm": 0.18658258020877838, + "learning_rate": 3.8907347625978207e-05, + "loss": 0.006, + "step": 35560 + }, + { + "grad_norm": 0.2943296730518341, + "learning_rate": 3.88804782199088e-05, + "loss": 0.0058, + "step": 35570 + }, + { + "grad_norm": 0.21354100108146667, + "learning_rate": 3.8853612191656495e-05, + "loss": 0.007, + "step": 35580 + }, + { + "grad_norm": 0.15112528204917908, + "learning_rate": 3.88267495493825e-05, + "loss": 0.0044, + "step": 35590 + }, + { + "grad_norm": 0.22299808263778687, + "learning_rate": 3.8799890301247004e-05, + "loss": 0.0065, + "step": 35600 + }, + { + "grad_norm": 0.13837043941020966, + "learning_rate": 3.8773034455409096e-05, + "loss": 0.0086, + "step": 35610 + }, + { + "grad_norm": 0.16553260385990143, + "learning_rate": 3.8746182020026904e-05, + "loss": 0.0055, + "step": 35620 + }, + { + "grad_norm": 0.162267804145813, + "learning_rate": 3.871933300325745e-05, + "loss": 0.0057, + "step": 35630 + }, + { + "grad_norm": 0.23652200400829315, + "learning_rate": 3.869248741325679e-05, + "loss": 0.0073, + "step": 35640 + }, + { + "grad_norm": 0.22564180195331573, + "learning_rate": 3.866564525817992e-05, + "loss": 0.0056, + "step": 35650 + }, + { + "grad_norm": 0.20974014699459076, + "learning_rate": 3.8638806546180725e-05, + "loss": 0.0052, + "step": 35660 + }, + { + "grad_norm": 0.26222461462020874, + "learning_rate": 3.861197128541213e-05, + "loss": 0.0066, + "step": 35670 + }, + { + "grad_norm": 0.2708519995212555, + "learning_rate": 3.858513948402599e-05, + "loss": 0.0058, + "step": 35680 + }, + { + "grad_norm": 0.19602173566818237, + "learning_rate": 3.8558311150173077e-05, + "loss": 0.0055, + "step": 35690 + }, + { + "grad_norm": 0.1802608072757721, + "learning_rate": 3.853148629200312e-05, + "loss": 0.0067, + "step": 35700 + }, + { + "grad_norm": 0.21257595717906952, + "learning_rate": 3.850466491766482e-05, + "loss": 0.0063, + "step": 35710 + }, + { + "grad_norm": 0.18815435469150543, + "learning_rate": 3.847784703530583e-05, + "loss": 0.0057, + "step": 35720 + }, + { + "grad_norm": 0.21281659603118896, + "learning_rate": 3.845103265307266e-05, + "loss": 0.008, + "step": 35730 + }, + { + "grad_norm": 0.21290278434753418, + "learning_rate": 3.842422177911086e-05, + "loss": 0.0051, + "step": 35740 + }, + { + "grad_norm": 0.147047758102417, + "learning_rate": 3.8397414421564826e-05, + "loss": 0.0048, + "step": 35750 + }, + { + "grad_norm": 0.19908230006694794, + "learning_rate": 3.8370610588577935e-05, + "loss": 0.0074, + "step": 35760 + }, + { + "grad_norm": 0.22344379127025604, + "learning_rate": 3.834381028829251e-05, + "loss": 0.0065, + "step": 35770 + }, + { + "grad_norm": 0.23005081713199615, + "learning_rate": 3.8317013528849745e-05, + "loss": 0.0058, + "step": 35780 + }, + { + "grad_norm": 0.20861759781837463, + "learning_rate": 3.8290220318389815e-05, + "loss": 0.0067, + "step": 35790 + }, + { + "grad_norm": 0.2356967180967331, + "learning_rate": 3.8263430665051746e-05, + "loss": 0.0061, + "step": 35800 + }, + { + "grad_norm": 0.22956997156143188, + "learning_rate": 3.8236644576973554e-05, + "loss": 0.0058, + "step": 35810 + }, + { + "grad_norm": 0.198786661028862, + "learning_rate": 3.820986206229217e-05, + "loss": 0.0049, + "step": 35820 + }, + { + "grad_norm": 0.21523654460906982, + "learning_rate": 3.8183083129143384e-05, + "loss": 0.0071, + "step": 35830 + }, + { + "grad_norm": 0.37201812863349915, + "learning_rate": 3.815630778566193e-05, + "loss": 0.0065, + "step": 35840 + }, + { + "grad_norm": 0.19066141545772552, + "learning_rate": 3.812953603998145e-05, + "loss": 0.0068, + "step": 35850 + }, + { + "grad_norm": 0.22858557105064392, + "learning_rate": 3.8102767900234504e-05, + "loss": 0.0048, + "step": 35860 + }, + { + "grad_norm": 0.2726020812988281, + "learning_rate": 3.807600337455256e-05, + "loss": 0.0055, + "step": 35870 + }, + { + "grad_norm": 0.2349100410938263, + "learning_rate": 3.804924247106593e-05, + "loss": 0.006, + "step": 35880 + }, + { + "grad_norm": 0.15375220775604248, + "learning_rate": 3.8022485197903925e-05, + "loss": 0.0053, + "step": 35890 + }, + { + "grad_norm": 0.19056303799152374, + "learning_rate": 3.799573156319464e-05, + "loss": 0.0059, + "step": 35900 + }, + { + "grad_norm": 0.22171743214130402, + "learning_rate": 3.796898157506515e-05, + "loss": 0.0055, + "step": 35910 + }, + { + "grad_norm": 0.24871499836444855, + "learning_rate": 3.794223524164143e-05, + "loss": 0.0059, + "step": 35920 + }, + { + "grad_norm": 0.2852282226085663, + "learning_rate": 3.7915492571048245e-05, + "loss": 0.0072, + "step": 35930 + }, + { + "grad_norm": 0.20029722154140472, + "learning_rate": 3.788875357140937e-05, + "loss": 0.0065, + "step": 35940 + }, + { + "grad_norm": 0.16307583451271057, + "learning_rate": 3.786201825084736e-05, + "loss": 0.0053, + "step": 35950 + }, + { + "grad_norm": 0.15474049746990204, + "learning_rate": 3.783528661748372e-05, + "loss": 0.0044, + "step": 35960 + }, + { + "grad_norm": 0.19798745214939117, + "learning_rate": 3.780855867943882e-05, + "loss": 0.004, + "step": 35970 + }, + { + "grad_norm": 0.15268008410930634, + "learning_rate": 3.778183444483189e-05, + "loss": 0.0053, + "step": 35980 + }, + { + "grad_norm": 0.19586017727851868, + "learning_rate": 3.775511392178108e-05, + "loss": 0.0047, + "step": 35990 + }, + { + "grad_norm": 0.21341058611869812, + "learning_rate": 3.772839711840332e-05, + "loss": 0.0058, + "step": 36000 + }, + { + "grad_norm": 0.19834588468074799, + "learning_rate": 3.7701684042814515e-05, + "loss": 0.0058, + "step": 36010 + }, + { + "grad_norm": 0.20601443946361542, + "learning_rate": 3.76749747031294e-05, + "loss": 0.0072, + "step": 36020 + }, + { + "grad_norm": 0.19616585969924927, + "learning_rate": 3.764826910746152e-05, + "loss": 0.006, + "step": 36030 + }, + { + "grad_norm": 0.22022952139377594, + "learning_rate": 3.762156726392338e-05, + "loss": 0.0055, + "step": 36040 + }, + { + "grad_norm": 0.17854173481464386, + "learning_rate": 3.759486918062625e-05, + "loss": 0.0052, + "step": 36050 + }, + { + "grad_norm": 0.20402492582798004, + "learning_rate": 3.756817486568033e-05, + "loss": 0.0064, + "step": 36060 + }, + { + "grad_norm": 0.23468294739723206, + "learning_rate": 3.7541484327194654e-05, + "loss": 0.0064, + "step": 36070 + }, + { + "grad_norm": 0.27246394753456116, + "learning_rate": 3.751479757327707e-05, + "loss": 0.0069, + "step": 36080 + }, + { + "grad_norm": 0.23805776238441467, + "learning_rate": 3.7488114612034345e-05, + "loss": 0.0083, + "step": 36090 + }, + { + "grad_norm": 0.18809965252876282, + "learning_rate": 3.7461435451572044e-05, + "loss": 0.0056, + "step": 36100 + }, + { + "grad_norm": 0.21221695840358734, + "learning_rate": 3.743476009999459e-05, + "loss": 0.0062, + "step": 36110 + }, + { + "grad_norm": 0.23103338479995728, + "learning_rate": 3.7408088565405245e-05, + "loss": 0.0043, + "step": 36120 + }, + { + "grad_norm": 0.22648471593856812, + "learning_rate": 3.738142085590612e-05, + "loss": 0.0066, + "step": 36130 + }, + { + "grad_norm": 0.1975145936012268, + "learning_rate": 3.7354756979598194e-05, + "loss": 0.0059, + "step": 36140 + }, + { + "grad_norm": 0.19435888528823853, + "learning_rate": 3.7328096944581187e-05, + "loss": 0.0065, + "step": 36150 + }, + { + "grad_norm": 0.2177690714597702, + "learning_rate": 3.730144075895377e-05, + "loss": 0.0051, + "step": 36160 + }, + { + "grad_norm": 0.2232530415058136, + "learning_rate": 3.727478843081335e-05, + "loss": 0.0054, + "step": 36170 + }, + { + "grad_norm": 0.23869039118289948, + "learning_rate": 3.72481399682562e-05, + "loss": 0.0074, + "step": 36180 + }, + { + "grad_norm": 0.23044690489768982, + "learning_rate": 3.722149537937747e-05, + "loss": 0.006, + "step": 36190 + }, + { + "grad_norm": 0.251235693693161, + "learning_rate": 3.7194854672271015e-05, + "loss": 0.0051, + "step": 36200 + }, + { + "grad_norm": 0.20466117560863495, + "learning_rate": 3.7168217855029644e-05, + "loss": 0.006, + "step": 36210 + }, + { + "grad_norm": 0.24486492574214935, + "learning_rate": 3.7141584935744856e-05, + "loss": 0.006, + "step": 36220 + }, + { + "grad_norm": 0.2942899465560913, + "learning_rate": 3.7114955922507055e-05, + "loss": 0.0056, + "step": 36230 + }, + { + "grad_norm": 0.20960935950279236, + "learning_rate": 3.708833082340545e-05, + "loss": 0.0059, + "step": 36240 + }, + { + "grad_norm": 0.3838084638118744, + "learning_rate": 3.7061709646528034e-05, + "loss": 0.0044, + "step": 36250 + }, + { + "grad_norm": 0.27701202034950256, + "learning_rate": 3.7035092399961604e-05, + "loss": 0.0066, + "step": 36260 + }, + { + "grad_norm": 0.2270684391260147, + "learning_rate": 3.700847909179177e-05, + "loss": 0.0081, + "step": 36270 + }, + { + "grad_norm": 0.2737200856208801, + "learning_rate": 3.698186973010297e-05, + "loss": 0.0092, + "step": 36280 + }, + { + "grad_norm": 0.24267403781414032, + "learning_rate": 3.695526432297844e-05, + "loss": 0.0054, + "step": 36290 + }, + { + "grad_norm": 0.1955595165491104, + "learning_rate": 3.692866287850017e-05, + "loss": 0.0047, + "step": 36300 + }, + { + "grad_norm": 0.1758509874343872, + "learning_rate": 3.6902065404749006e-05, + "loss": 0.0065, + "step": 36310 + }, + { + "grad_norm": 0.21280477941036224, + "learning_rate": 3.6875471909804516e-05, + "loss": 0.0057, + "step": 36320 + }, + { + "grad_norm": 0.23110578954219818, + "learning_rate": 3.6848882401745135e-05, + "loss": 0.01, + "step": 36330 + }, + { + "grad_norm": 0.19947132468223572, + "learning_rate": 3.682229688864806e-05, + "loss": 0.0086, + "step": 36340 + }, + { + "grad_norm": 0.22780759632587433, + "learning_rate": 3.6795715378589235e-05, + "loss": 0.0063, + "step": 36350 + }, + { + "grad_norm": 0.15910424292087555, + "learning_rate": 3.676913787964345e-05, + "loss": 0.0058, + "step": 36360 + }, + { + "grad_norm": 0.21257714927196503, + "learning_rate": 3.674256439988423e-05, + "loss": 0.0062, + "step": 36370 + }, + { + "grad_norm": 0.2670106589794159, + "learning_rate": 3.6715994947383904e-05, + "loss": 0.0065, + "step": 36380 + }, + { + "grad_norm": 0.1675681173801422, + "learning_rate": 3.668942953021357e-05, + "loss": 0.0053, + "step": 36390 + }, + { + "grad_norm": 0.20900532603263855, + "learning_rate": 3.66628681564431e-05, + "loss": 0.005, + "step": 36400 + }, + { + "grad_norm": 0.261904239654541, + "learning_rate": 3.663631083414114e-05, + "loss": 0.0062, + "step": 36410 + }, + { + "grad_norm": 0.13752001523971558, + "learning_rate": 3.660975757137509e-05, + "loss": 0.0057, + "step": 36420 + }, + { + "grad_norm": 0.23134367167949677, + "learning_rate": 3.658320837621114e-05, + "loss": 0.0074, + "step": 36430 + }, + { + "grad_norm": 0.23288044333457947, + "learning_rate": 3.655666325671426e-05, + "loss": 0.0059, + "step": 36440 + }, + { + "grad_norm": 0.21831224858760834, + "learning_rate": 3.65301222209481e-05, + "loss": 0.0056, + "step": 36450 + }, + { + "grad_norm": 0.2047238051891327, + "learning_rate": 3.650358527697519e-05, + "loss": 0.0066, + "step": 36460 + }, + { + "grad_norm": 0.19113247096538544, + "learning_rate": 3.64770524328567e-05, + "loss": 0.0055, + "step": 36470 + }, + { + "grad_norm": 0.16633984446525574, + "learning_rate": 3.645052369665265e-05, + "loss": 0.0067, + "step": 36480 + }, + { + "grad_norm": 0.2797467112541199, + "learning_rate": 3.6423999076421724e-05, + "loss": 0.0058, + "step": 36490 + }, + { + "grad_norm": 0.24397028982639313, + "learning_rate": 3.639747858022142e-05, + "loss": 0.0056, + "step": 36500 + }, + { + "grad_norm": 0.2170751988887787, + "learning_rate": 3.637096221610799e-05, + "loss": 0.0059, + "step": 36510 + }, + { + "grad_norm": 0.27747640013694763, + "learning_rate": 3.634444999213638e-05, + "loss": 0.0055, + "step": 36520 + }, + { + "grad_norm": 0.1372748166322708, + "learning_rate": 3.6317941916360296e-05, + "loss": 0.0055, + "step": 36530 + }, + { + "grad_norm": 0.20448923110961914, + "learning_rate": 3.629143799683221e-05, + "loss": 0.007, + "step": 36540 + }, + { + "grad_norm": 0.21278725564479828, + "learning_rate": 3.626493824160331e-05, + "loss": 0.0085, + "step": 36550 + }, + { + "grad_norm": 0.23292264342308044, + "learning_rate": 3.623844265872352e-05, + "loss": 0.0047, + "step": 36560 + }, + { + "grad_norm": 0.245171919465065, + "learning_rate": 3.621195125624149e-05, + "loss": 0.0057, + "step": 36570 + }, + { + "grad_norm": 0.20896212756633759, + "learning_rate": 3.618546404220463e-05, + "loss": 0.0067, + "step": 36580 + }, + { + "grad_norm": 0.19054430723190308, + "learning_rate": 3.615898102465903e-05, + "loss": 0.0079, + "step": 36590 + }, + { + "grad_norm": 0.1752590835094452, + "learning_rate": 3.6132502211649544e-05, + "loss": 0.0055, + "step": 36600 + }, + { + "grad_norm": 0.1642097383737564, + "learning_rate": 3.610602761121975e-05, + "loss": 0.0059, + "step": 36610 + }, + { + "grad_norm": 0.33313947916030884, + "learning_rate": 3.6079557231411897e-05, + "loss": 0.0075, + "step": 36620 + }, + { + "grad_norm": 0.1732037365436554, + "learning_rate": 3.6053091080267035e-05, + "loss": 0.0041, + "step": 36630 + }, + { + "grad_norm": 0.18742002546787262, + "learning_rate": 3.602662916582483e-05, + "loss": 0.0069, + "step": 36640 + }, + { + "grad_norm": 0.21860133111476898, + "learning_rate": 3.600017149612375e-05, + "loss": 0.0078, + "step": 36650 + }, + { + "grad_norm": 0.24872207641601562, + "learning_rate": 3.5973718079200935e-05, + "loss": 0.0075, + "step": 36660 + }, + { + "grad_norm": 0.21307551860809326, + "learning_rate": 3.5947268923092216e-05, + "loss": 0.0053, + "step": 36670 + }, + { + "grad_norm": 0.24681150913238525, + "learning_rate": 3.592082403583216e-05, + "loss": 0.0064, + "step": 36680 + }, + { + "grad_norm": 0.17814095318317413, + "learning_rate": 3.5894383425454004e-05, + "loss": 0.0045, + "step": 36690 + }, + { + "grad_norm": 0.18989083170890808, + "learning_rate": 3.586794709998975e-05, + "loss": 0.0051, + "step": 36700 + }, + { + "grad_norm": 0.1905387043952942, + "learning_rate": 3.584151506747002e-05, + "loss": 0.007, + "step": 36710 + }, + { + "grad_norm": 0.1567084938287735, + "learning_rate": 3.581508733592418e-05, + "loss": 0.0039, + "step": 36720 + }, + { + "grad_norm": 0.22371873259544373, + "learning_rate": 3.5788663913380297e-05, + "loss": 0.0071, + "step": 36730 + }, + { + "grad_norm": 0.2086075097322464, + "learning_rate": 3.576224480786506e-05, + "loss": 0.0054, + "step": 36740 + }, + { + "grad_norm": 0.2463143765926361, + "learning_rate": 3.573583002740393e-05, + "loss": 0.0085, + "step": 36750 + }, + { + "grad_norm": 0.21759995818138123, + "learning_rate": 3.570941958002103e-05, + "loss": 0.0055, + "step": 36760 + }, + { + "grad_norm": 0.1807888150215149, + "learning_rate": 3.568301347373912e-05, + "loss": 0.0047, + "step": 36770 + }, + { + "grad_norm": 0.2299349159002304, + "learning_rate": 3.5656611716579726e-05, + "loss": 0.0075, + "step": 36780 + }, + { + "grad_norm": 0.20724748075008392, + "learning_rate": 3.5630214316562946e-05, + "loss": 0.0054, + "step": 36790 + }, + { + "grad_norm": 0.20229975879192352, + "learning_rate": 3.560382128170766e-05, + "loss": 0.0061, + "step": 36800 + }, + { + "grad_norm": 0.20782332122325897, + "learning_rate": 3.5577432620031374e-05, + "loss": 0.0095, + "step": 36810 + }, + { + "grad_norm": 0.1981372833251953, + "learning_rate": 3.5551048339550216e-05, + "loss": 0.0078, + "step": 36820 + }, + { + "grad_norm": 0.20795145630836487, + "learning_rate": 3.55246684482791e-05, + "loss": 0.005, + "step": 36830 + }, + { + "grad_norm": 0.2088252753019333, + "learning_rate": 3.5498292954231496e-05, + "loss": 0.006, + "step": 36840 + }, + { + "grad_norm": 0.2797907590866089, + "learning_rate": 3.54719218654196e-05, + "loss": 0.0086, + "step": 36850 + }, + { + "grad_norm": 0.20427638292312622, + "learning_rate": 3.544555518985425e-05, + "loss": 0.007, + "step": 36860 + }, + { + "grad_norm": 0.2264554351568222, + "learning_rate": 3.541919293554494e-05, + "loss": 0.0043, + "step": 36870 + }, + { + "grad_norm": 0.21132589876651764, + "learning_rate": 3.539283511049985e-05, + "loss": 0.0048, + "step": 36880 + }, + { + "grad_norm": 0.22764794528484344, + "learning_rate": 3.5366481722725755e-05, + "loss": 0.0076, + "step": 36890 + }, + { + "grad_norm": 0.26951301097869873, + "learning_rate": 3.534013278022816e-05, + "loss": 0.0058, + "step": 36900 + }, + { + "grad_norm": 0.2956538796424866, + "learning_rate": 3.531378829101113e-05, + "loss": 0.0051, + "step": 36910 + }, + { + "grad_norm": 0.21075810492038727, + "learning_rate": 3.528744826307746e-05, + "loss": 0.0068, + "step": 36920 + }, + { + "grad_norm": 0.20373030006885529, + "learning_rate": 3.5261112704428554e-05, + "loss": 0.0072, + "step": 36930 + }, + { + "grad_norm": 0.17024897038936615, + "learning_rate": 3.523478162306443e-05, + "loss": 0.0047, + "step": 36940 + }, + { + "grad_norm": 0.18920721113681793, + "learning_rate": 3.520845502698381e-05, + "loss": 0.0051, + "step": 36950 + }, + { + "grad_norm": 0.2483489066362381, + "learning_rate": 3.5182132924184005e-05, + "loss": 0.0055, + "step": 36960 + }, + { + "grad_norm": 0.18171939253807068, + "learning_rate": 3.5155815322660966e-05, + "loss": 0.0053, + "step": 36970 + }, + { + "grad_norm": 0.1687224805355072, + "learning_rate": 3.512950223040931e-05, + "loss": 0.0044, + "step": 36980 + }, + { + "grad_norm": 0.1768805831670761, + "learning_rate": 3.5103193655422216e-05, + "loss": 0.0057, + "step": 36990 + }, + { + "grad_norm": 0.13669216632843018, + "learning_rate": 3.5076889605691596e-05, + "loss": 0.005, + "step": 37000 + }, + { + "grad_norm": 0.20554442703723907, + "learning_rate": 3.505059008920787e-05, + "loss": 0.0066, + "step": 37010 + }, + { + "grad_norm": 0.21139760315418243, + "learning_rate": 3.502429511396016e-05, + "loss": 0.0061, + "step": 37020 + }, + { + "grad_norm": 0.19338373839855194, + "learning_rate": 3.4998004687936196e-05, + "loss": 0.0073, + "step": 37030 + }, + { + "grad_norm": 0.1779462993144989, + "learning_rate": 3.497171881912229e-05, + "loss": 0.0073, + "step": 37040 + }, + { + "grad_norm": 0.26644426584243774, + "learning_rate": 3.494543751550342e-05, + "loss": 0.0053, + "step": 37050 + }, + { + "grad_norm": 0.2014368176460266, + "learning_rate": 3.491916078506313e-05, + "loss": 0.0062, + "step": 37060 + }, + { + "grad_norm": 0.31424960494041443, + "learning_rate": 3.489288863578361e-05, + "loss": 0.0075, + "step": 37070 + }, + { + "grad_norm": 0.28273335099220276, + "learning_rate": 3.4866621075645646e-05, + "loss": 0.007, + "step": 37080 + }, + { + "grad_norm": 0.2067117989063263, + "learning_rate": 3.4840358112628614e-05, + "loss": 0.0051, + "step": 37090 + }, + { + "grad_norm": 0.24792727828025818, + "learning_rate": 3.481409975471053e-05, + "loss": 0.0053, + "step": 37100 + }, + { + "grad_norm": 0.2381804883480072, + "learning_rate": 3.4787846009867986e-05, + "loss": 0.0054, + "step": 37110 + }, + { + "grad_norm": 0.21490605175495148, + "learning_rate": 3.476159688607615e-05, + "loss": 0.0052, + "step": 37120 + }, + { + "grad_norm": 0.21271218359470367, + "learning_rate": 3.4735352391308854e-05, + "loss": 0.0055, + "step": 37130 + }, + { + "grad_norm": 0.24120795726776123, + "learning_rate": 3.4709112533538446e-05, + "loss": 0.0092, + "step": 37140 + }, + { + "grad_norm": 0.2060900628566742, + "learning_rate": 3.4682877320735934e-05, + "loss": 0.0055, + "step": 37150 + }, + { + "grad_norm": 0.15543334186077118, + "learning_rate": 3.465664676087085e-05, + "loss": 0.004, + "step": 37160 + }, + { + "grad_norm": 0.20778962969779968, + "learning_rate": 3.463042086191136e-05, + "loss": 0.0057, + "step": 37170 + }, + { + "grad_norm": 0.22885197401046753, + "learning_rate": 3.460419963182423e-05, + "loss": 0.0057, + "step": 37180 + }, + { + "grad_norm": 0.22553709149360657, + "learning_rate": 3.457798307857473e-05, + "loss": 0.0058, + "step": 37190 + }, + { + "grad_norm": 0.2610466182231903, + "learning_rate": 3.455177121012678e-05, + "loss": 0.0053, + "step": 37200 + }, + { + "grad_norm": 0.1642984300851822, + "learning_rate": 3.452556403444285e-05, + "loss": 0.0079, + "step": 37210 + }, + { + "grad_norm": 0.16797909140586853, + "learning_rate": 3.4499361559483975e-05, + "loss": 0.0055, + "step": 37220 + }, + { + "grad_norm": 0.2554571330547333, + "learning_rate": 3.44731637932098e-05, + "loss": 0.0051, + "step": 37230 + }, + { + "grad_norm": 0.13070277869701385, + "learning_rate": 3.44469707435785e-05, + "loss": 0.0042, + "step": 37240 + }, + { + "grad_norm": 0.16924156248569489, + "learning_rate": 3.4420782418546835e-05, + "loss": 0.0054, + "step": 37250 + }, + { + "grad_norm": 0.17158930003643036, + "learning_rate": 3.439459882607012e-05, + "loss": 0.0054, + "step": 37260 + }, + { + "grad_norm": 0.10545698553323746, + "learning_rate": 3.436841997410225e-05, + "loss": 0.0038, + "step": 37270 + }, + { + "grad_norm": 0.21364626288414001, + "learning_rate": 3.434224587059567e-05, + "loss": 0.0059, + "step": 37280 + }, + { + "grad_norm": 0.17869031429290771, + "learning_rate": 3.431607652350136e-05, + "loss": 0.0045, + "step": 37290 + }, + { + "grad_norm": 0.15119893848896027, + "learning_rate": 3.428991194076891e-05, + "loss": 0.0056, + "step": 37300 + }, + { + "grad_norm": 0.2110467553138733, + "learning_rate": 3.4263752130346394e-05, + "loss": 0.0058, + "step": 37310 + }, + { + "grad_norm": 0.12615124881267548, + "learning_rate": 3.4237597100180515e-05, + "loss": 0.0048, + "step": 37320 + }, + { + "grad_norm": 0.18072180449962616, + "learning_rate": 3.4211446858216427e-05, + "loss": 0.0035, + "step": 37330 + }, + { + "grad_norm": 0.1312078833580017, + "learning_rate": 3.4185301412397915e-05, + "loss": 0.0039, + "step": 37340 + }, + { + "grad_norm": 0.25086510181427, + "learning_rate": 3.415916077066729e-05, + "loss": 0.0078, + "step": 37350 + }, + { + "grad_norm": 0.1735914945602417, + "learning_rate": 3.413302494096535e-05, + "loss": 0.0055, + "step": 37360 + }, + { + "grad_norm": 0.21667224168777466, + "learning_rate": 3.410689393123151e-05, + "loss": 0.0045, + "step": 37370 + }, + { + "grad_norm": 0.18501809239387512, + "learning_rate": 3.408076774940364e-05, + "loss": 0.0077, + "step": 37380 + }, + { + "grad_norm": 0.23403215408325195, + "learning_rate": 3.40546464034182e-05, + "loss": 0.0041, + "step": 37390 + }, + { + "grad_norm": 0.16335240006446838, + "learning_rate": 3.4028529901210185e-05, + "loss": 0.0064, + "step": 37400 + }, + { + "grad_norm": 0.2166048139333725, + "learning_rate": 3.4002418250713086e-05, + "loss": 0.0049, + "step": 37410 + }, + { + "grad_norm": 0.31649965047836304, + "learning_rate": 3.3976311459858936e-05, + "loss": 0.0063, + "step": 37420 + }, + { + "grad_norm": 0.20769289135932922, + "learning_rate": 3.395020953657826e-05, + "loss": 0.0058, + "step": 37430 + }, + { + "grad_norm": 0.2356630265712738, + "learning_rate": 3.3924112488800165e-05, + "loss": 0.0057, + "step": 37440 + }, + { + "grad_norm": 0.255302369594574, + "learning_rate": 3.389802032445225e-05, + "loss": 0.0051, + "step": 37450 + }, + { + "grad_norm": 0.1806274950504303, + "learning_rate": 3.38719330514606e-05, + "loss": 0.0044, + "step": 37460 + }, + { + "grad_norm": 0.22856494784355164, + "learning_rate": 3.3845850677749866e-05, + "loss": 0.0084, + "step": 37470 + }, + { + "grad_norm": 0.15293876826763153, + "learning_rate": 3.3819773211243157e-05, + "loss": 0.0058, + "step": 37480 + }, + { + "grad_norm": 0.19884006679058075, + "learning_rate": 3.379370065986213e-05, + "loss": 0.007, + "step": 37490 + }, + { + "grad_norm": 0.19559234380722046, + "learning_rate": 3.3767633031526955e-05, + "loss": 0.0062, + "step": 37500 + }, + { + "grad_norm": 0.20624510943889618, + "learning_rate": 3.374157033415626e-05, + "loss": 0.0047, + "step": 37510 + }, + { + "grad_norm": 0.2057885378599167, + "learning_rate": 3.371551257566723e-05, + "loss": 0.004, + "step": 37520 + }, + { + "grad_norm": 0.20738784968852997, + "learning_rate": 3.36894597639755e-05, + "loss": 0.0062, + "step": 37530 + }, + { + "grad_norm": 0.22211676836013794, + "learning_rate": 3.366341190699523e-05, + "loss": 0.0052, + "step": 37540 + }, + { + "grad_norm": 0.2413415163755417, + "learning_rate": 3.36373690126391e-05, + "loss": 0.0057, + "step": 37550 + }, + { + "grad_norm": 0.21675710380077362, + "learning_rate": 3.3611331088818234e-05, + "loss": 0.0054, + "step": 37560 + }, + { + "grad_norm": 0.30930137634277344, + "learning_rate": 3.3585298143442265e-05, + "loss": 0.0078, + "step": 37570 + }, + { + "grad_norm": 0.27456626296043396, + "learning_rate": 3.35592701844193e-05, + "loss": 0.0048, + "step": 37580 + }, + { + "grad_norm": 0.20125313103199005, + "learning_rate": 3.353324721965596e-05, + "loss": 0.0057, + "step": 37590 + }, + { + "grad_norm": 0.2426241636276245, + "learning_rate": 3.350722925705736e-05, + "loss": 0.0072, + "step": 37600 + }, + { + "grad_norm": 0.11537458002567291, + "learning_rate": 3.348121630452703e-05, + "loss": 0.0077, + "step": 37610 + }, + { + "grad_norm": 0.17236436903476715, + "learning_rate": 3.3455208369967044e-05, + "loss": 0.0057, + "step": 37620 + }, + { + "grad_norm": 0.17111149430274963, + "learning_rate": 3.34292054612779e-05, + "loss": 0.0043, + "step": 37630 + }, + { + "grad_norm": 0.16594715416431427, + "learning_rate": 3.340320758635861e-05, + "loss": 0.0063, + "step": 37640 + }, + { + "grad_norm": 0.16533873975276947, + "learning_rate": 3.337721475310666e-05, + "loss": 0.0048, + "step": 37650 + }, + { + "grad_norm": 0.25538715720176697, + "learning_rate": 3.335122696941795e-05, + "loss": 0.0057, + "step": 37660 + }, + { + "grad_norm": 0.1986265778541565, + "learning_rate": 3.332524424318692e-05, + "loss": 0.0058, + "step": 37670 + }, + { + "grad_norm": 0.22327803075313568, + "learning_rate": 3.32992665823064e-05, + "loss": 0.0056, + "step": 37680 + }, + { + "grad_norm": 0.1464325487613678, + "learning_rate": 3.327329399466774e-05, + "loss": 0.0049, + "step": 37690 + }, + { + "grad_norm": 0.1957155019044876, + "learning_rate": 3.324732648816072e-05, + "loss": 0.0069, + "step": 37700 + }, + { + "grad_norm": 0.26993077993392944, + "learning_rate": 3.322136407067358e-05, + "loss": 0.0048, + "step": 37710 + }, + { + "grad_norm": 0.16770979762077332, + "learning_rate": 3.3195406750093036e-05, + "loss": 0.006, + "step": 37720 + }, + { + "grad_norm": 0.2029825896024704, + "learning_rate": 3.3169454534304205e-05, + "loss": 0.0052, + "step": 37730 + }, + { + "grad_norm": 0.1719246506690979, + "learning_rate": 3.3143507431190725e-05, + "loss": 0.0048, + "step": 37740 + }, + { + "grad_norm": 0.29215943813323975, + "learning_rate": 3.311756544863459e-05, + "loss": 0.0052, + "step": 37750 + }, + { + "grad_norm": 0.20363165438175201, + "learning_rate": 3.309162859451633e-05, + "loss": 0.0057, + "step": 37760 + }, + { + "grad_norm": 0.19112004339694977, + "learning_rate": 3.306569687671487e-05, + "loss": 0.0064, + "step": 37770 + }, + { + "grad_norm": 0.25219428539276123, + "learning_rate": 3.303977030310756e-05, + "loss": 0.0044, + "step": 37780 + }, + { + "grad_norm": 0.18868236243724823, + "learning_rate": 3.3013848881570245e-05, + "loss": 0.0067, + "step": 37790 + }, + { + "grad_norm": 0.151606485247612, + "learning_rate": 3.298793261997712e-05, + "loss": 0.0047, + "step": 37800 + }, + { + "grad_norm": 0.2017272263765335, + "learning_rate": 3.2962021526200893e-05, + "loss": 0.0041, + "step": 37810 + }, + { + "grad_norm": 0.1984231173992157, + "learning_rate": 3.293611560811268e-05, + "loss": 0.0054, + "step": 37820 + }, + { + "grad_norm": 0.19376057386398315, + "learning_rate": 3.291021487358199e-05, + "loss": 0.0066, + "step": 37830 + }, + { + "grad_norm": 0.21602368354797363, + "learning_rate": 3.28843193304768e-05, + "loss": 0.0067, + "step": 37840 + }, + { + "grad_norm": 0.1858169436454773, + "learning_rate": 3.2858428986663456e-05, + "loss": 0.0052, + "step": 37850 + }, + { + "grad_norm": 0.7198758125305176, + "learning_rate": 3.283254385000681e-05, + "loss": 0.006, + "step": 37860 + }, + { + "grad_norm": 0.1725969761610031, + "learning_rate": 3.2806663928370076e-05, + "loss": 0.0052, + "step": 37870 + }, + { + "grad_norm": 0.11645271629095078, + "learning_rate": 3.278078922961485e-05, + "loss": 0.004, + "step": 37880 + }, + { + "grad_norm": 0.18152737617492676, + "learning_rate": 3.275491976160123e-05, + "loss": 0.0057, + "step": 37890 + }, + { + "grad_norm": 0.23510292172431946, + "learning_rate": 3.2729055532187645e-05, + "loss": 0.0079, + "step": 37900 + }, + { + "grad_norm": 0.2650381922721863, + "learning_rate": 3.270319654923097e-05, + "loss": 0.0045, + "step": 37910 + }, + { + "grad_norm": 0.16366882622241974, + "learning_rate": 3.2677342820586506e-05, + "loss": 0.0053, + "step": 37920 + }, + { + "grad_norm": 0.19202515482902527, + "learning_rate": 3.2651494354107905e-05, + "loss": 0.0077, + "step": 37930 + }, + { + "grad_norm": 0.25940415263175964, + "learning_rate": 3.2625651157647266e-05, + "loss": 0.0071, + "step": 37940 + }, + { + "grad_norm": 0.19381456077098846, + "learning_rate": 3.259981323905505e-05, + "loss": 0.0048, + "step": 37950 + }, + { + "grad_norm": 0.1702100932598114, + "learning_rate": 3.257398060618014e-05, + "loss": 0.0075, + "step": 37960 + }, + { + "grad_norm": 0.1628592610359192, + "learning_rate": 3.254815326686983e-05, + "loss": 0.0057, + "step": 37970 + }, + { + "grad_norm": 0.2136130928993225, + "learning_rate": 3.2522331228969774e-05, + "loss": 0.0072, + "step": 37980 + }, + { + "grad_norm": 0.17628826200962067, + "learning_rate": 3.2496514500324006e-05, + "loss": 0.0058, + "step": 37990 + }, + { + "grad_norm": 0.16468124091625214, + "learning_rate": 3.247070308877498e-05, + "loss": 0.0042, + "step": 38000 + }, + { + "grad_norm": 0.1680259108543396, + "learning_rate": 3.2444897002163515e-05, + "loss": 0.0048, + "step": 38010 + }, + { + "grad_norm": 0.15244245529174805, + "learning_rate": 3.241909624832885e-05, + "loss": 0.0041, + "step": 38020 + }, + { + "grad_norm": 0.1866675168275833, + "learning_rate": 3.239330083510852e-05, + "loss": 0.0042, + "step": 38030 + }, + { + "grad_norm": 0.2662525177001953, + "learning_rate": 3.236751077033855e-05, + "loss": 0.0047, + "step": 38040 + }, + { + "grad_norm": 0.24083131551742554, + "learning_rate": 3.234172606185322e-05, + "loss": 0.0053, + "step": 38050 + }, + { + "grad_norm": 0.24699541926383972, + "learning_rate": 3.231594671748528e-05, + "loss": 0.0075, + "step": 38060 + }, + { + "grad_norm": 0.18466341495513916, + "learning_rate": 3.2290172745065815e-05, + "loss": 0.0058, + "step": 38070 + }, + { + "grad_norm": 0.19507554173469543, + "learning_rate": 3.226440415242426e-05, + "loss": 0.0078, + "step": 38080 + }, + { + "grad_norm": 0.19676101207733154, + "learning_rate": 3.223864094738846e-05, + "loss": 0.006, + "step": 38090 + }, + { + "grad_norm": 0.2458454966545105, + "learning_rate": 3.221288313778456e-05, + "loss": 0.0052, + "step": 38100 + }, + { + "grad_norm": 0.24716630578041077, + "learning_rate": 3.2187130731437125e-05, + "loss": 0.0089, + "step": 38110 + }, + { + "grad_norm": 0.22803299129009247, + "learning_rate": 3.216138373616905e-05, + "loss": 0.0075, + "step": 38120 + }, + { + "grad_norm": 0.21906951069831848, + "learning_rate": 3.21356421598016e-05, + "loss": 0.008, + "step": 38130 + }, + { + "grad_norm": 0.2550855576992035, + "learning_rate": 3.210990601015438e-05, + "loss": 0.0057, + "step": 38140 + }, + { + "grad_norm": 0.17309622466564178, + "learning_rate": 3.208417529504535e-05, + "loss": 0.0071, + "step": 38150 + }, + { + "grad_norm": 0.1554785519838333, + "learning_rate": 3.205845002229084e-05, + "loss": 0.0046, + "step": 38160 + }, + { + "grad_norm": 0.15522806346416473, + "learning_rate": 3.203273019970547e-05, + "loss": 0.0047, + "step": 38170 + }, + { + "grad_norm": 0.2033078521490097, + "learning_rate": 3.200701583510227e-05, + "loss": 0.0069, + "step": 38180 + }, + { + "grad_norm": 0.27065756916999817, + "learning_rate": 3.198130693629261e-05, + "loss": 0.0055, + "step": 38190 + }, + { + "grad_norm": 0.29369938373565674, + "learning_rate": 3.195560351108612e-05, + "loss": 0.0076, + "step": 38200 + }, + { + "grad_norm": 0.23765428364276886, + "learning_rate": 3.1929905567290865e-05, + "loss": 0.0103, + "step": 38210 + }, + { + "grad_norm": 0.20930853486061096, + "learning_rate": 3.1904213112713164e-05, + "loss": 0.0065, + "step": 38220 + }, + { + "grad_norm": 0.1556563526391983, + "learning_rate": 3.187852615515774e-05, + "loss": 0.0075, + "step": 38230 + }, + { + "grad_norm": 0.19510279595851898, + "learning_rate": 3.1852844702427606e-05, + "loss": 0.0049, + "step": 38240 + }, + { + "grad_norm": 0.20544369518756866, + "learning_rate": 3.18271687623241e-05, + "loss": 0.0051, + "step": 38250 + }, + { + "grad_norm": 0.24184343218803406, + "learning_rate": 3.1801498342646896e-05, + "loss": 0.0075, + "step": 38260 + }, + { + "grad_norm": 0.24722795188426971, + "learning_rate": 3.177583345119398e-05, + "loss": 0.0052, + "step": 38270 + }, + { + "grad_norm": 0.26140618324279785, + "learning_rate": 3.17501740957617e-05, + "loss": 0.0076, + "step": 38280 + }, + { + "grad_norm": 0.27534765005111694, + "learning_rate": 3.172452028414467e-05, + "loss": 0.007, + "step": 38290 + }, + { + "grad_norm": 0.19065791368484497, + "learning_rate": 3.169887202413583e-05, + "loss": 0.0068, + "step": 38300 + }, + { + "grad_norm": 0.2361488938331604, + "learning_rate": 3.167322932352646e-05, + "loss": 0.0067, + "step": 38310 + }, + { + "grad_norm": 0.159803107380867, + "learning_rate": 3.164759219010613e-05, + "loss": 0.0039, + "step": 38320 + }, + { + "grad_norm": 0.208531454205513, + "learning_rate": 3.1621960631662725e-05, + "loss": 0.0046, + "step": 38330 + }, + { + "grad_norm": 0.12291572988033295, + "learning_rate": 3.159633465598245e-05, + "loss": 0.004, + "step": 38340 + }, + { + "grad_norm": 0.2683856189250946, + "learning_rate": 3.1570714270849767e-05, + "loss": 0.0057, + "step": 38350 + }, + { + "grad_norm": 0.17563460767269135, + "learning_rate": 3.1545099484047516e-05, + "loss": 0.0041, + "step": 38360 + }, + { + "grad_norm": 0.21362178027629852, + "learning_rate": 3.151949030335674e-05, + "loss": 0.0064, + "step": 38370 + }, + { + "grad_norm": 0.20487098395824432, + "learning_rate": 3.149388673655687e-05, + "loss": 0.0063, + "step": 38380 + }, + { + "grad_norm": 0.2187328040599823, + "learning_rate": 3.146828879142559e-05, + "loss": 0.0073, + "step": 38390 + }, + { + "grad_norm": 0.15002216398715973, + "learning_rate": 3.1442696475738866e-05, + "loss": 0.0063, + "step": 38400 + }, + { + "grad_norm": 0.16359585523605347, + "learning_rate": 3.141710979727098e-05, + "loss": 0.0037, + "step": 38410 + }, + { + "grad_norm": 0.1778039187192917, + "learning_rate": 3.139152876379447e-05, + "loss": 0.0066, + "step": 38420 + }, + { + "grad_norm": 0.19601871073246002, + "learning_rate": 3.1365953383080214e-05, + "loss": 0.0056, + "step": 38430 + }, + { + "grad_norm": 0.26296091079711914, + "learning_rate": 3.134038366289731e-05, + "loss": 0.0056, + "step": 38440 + }, + { + "grad_norm": 0.241267129778862, + "learning_rate": 3.131481961101317e-05, + "loss": 0.005, + "step": 38450 + }, + { + "grad_norm": 0.210506871342659, + "learning_rate": 3.128926123519349e-05, + "loss": 0.005, + "step": 38460 + }, + { + "grad_norm": 0.15222042798995972, + "learning_rate": 3.1263708543202194e-05, + "loss": 0.0046, + "step": 38470 + }, + { + "grad_norm": 0.25686559081077576, + "learning_rate": 3.123816154280155e-05, + "loss": 0.0063, + "step": 38480 + }, + { + "grad_norm": 0.18360815942287445, + "learning_rate": 3.121262024175207e-05, + "loss": 0.0057, + "step": 38490 + }, + { + "grad_norm": 0.23407138884067535, + "learning_rate": 3.118708464781248e-05, + "loss": 0.0091, + "step": 38500 + }, + { + "grad_norm": 0.13351522386074066, + "learning_rate": 3.116155476873987e-05, + "loss": 0.005, + "step": 38510 + }, + { + "grad_norm": 0.21726030111312866, + "learning_rate": 3.11360306122895e-05, + "loss": 0.0081, + "step": 38520 + }, + { + "grad_norm": 0.22795534133911133, + "learning_rate": 3.1110512186214975e-05, + "loss": 0.0043, + "step": 38530 + }, + { + "grad_norm": 0.18992166221141815, + "learning_rate": 3.1084999498268095e-05, + "loss": 0.0042, + "step": 38540 + }, + { + "grad_norm": 0.19822633266448975, + "learning_rate": 3.1059492556198934e-05, + "loss": 0.0064, + "step": 38550 + }, + { + "grad_norm": 0.29902753233909607, + "learning_rate": 3.103399136775586e-05, + "loss": 0.0077, + "step": 38560 + }, + { + "grad_norm": 0.21984446048736572, + "learning_rate": 3.100849594068541e-05, + "loss": 0.005, + "step": 38570 + }, + { + "grad_norm": 0.22554202377796173, + "learning_rate": 3.0983006282732484e-05, + "loss": 0.0047, + "step": 38580 + }, + { + "grad_norm": 0.2012355923652649, + "learning_rate": 3.0957522401640116e-05, + "loss": 0.0072, + "step": 38590 + }, + { + "grad_norm": 0.24658608436584473, + "learning_rate": 3.0932044305149645e-05, + "loss": 0.0051, + "step": 38600 + }, + { + "grad_norm": 0.2997727394104004, + "learning_rate": 3.090657200100068e-05, + "loss": 0.0068, + "step": 38610 + }, + { + "grad_norm": 0.16216319799423218, + "learning_rate": 3.088110549693099e-05, + "loss": 0.0049, + "step": 38620 + }, + { + "grad_norm": 0.21875208616256714, + "learning_rate": 3.085564480067667e-05, + "loss": 0.0069, + "step": 38630 + }, + { + "grad_norm": 0.19128629565238953, + "learning_rate": 3.0830189919971955e-05, + "loss": 0.0069, + "step": 38640 + }, + { + "grad_norm": 0.2854331135749817, + "learning_rate": 3.080474086254939e-05, + "loss": 0.0056, + "step": 38650 + }, + { + "grad_norm": 0.23712821304798126, + "learning_rate": 3.077929763613975e-05, + "loss": 0.0051, + "step": 38660 + }, + { + "grad_norm": 0.1727815717458725, + "learning_rate": 3.075386024847198e-05, + "loss": 0.0055, + "step": 38670 + }, + { + "grad_norm": 0.14114877581596375, + "learning_rate": 3.072842870727331e-05, + "loss": 0.0042, + "step": 38680 + }, + { + "grad_norm": 0.18886399269104004, + "learning_rate": 3.070300302026916e-05, + "loss": 0.0048, + "step": 38690 + }, + { + "grad_norm": 0.1381268948316574, + "learning_rate": 3.067758319518318e-05, + "loss": 0.0044, + "step": 38700 + }, + { + "grad_norm": 0.164319708943367, + "learning_rate": 3.065216923973725e-05, + "loss": 0.0046, + "step": 38710 + }, + { + "grad_norm": 0.18960459530353546, + "learning_rate": 3.062676116165145e-05, + "loss": 0.004, + "step": 38720 + }, + { + "grad_norm": 0.2099323272705078, + "learning_rate": 3.06013589686441e-05, + "loss": 0.0071, + "step": 38730 + }, + { + "grad_norm": 0.26826804876327515, + "learning_rate": 3.05759626684317e-05, + "loss": 0.01, + "step": 38740 + }, + { + "grad_norm": 0.2196415662765503, + "learning_rate": 3.055057226872896e-05, + "loss": 0.005, + "step": 38750 + }, + { + "grad_norm": 0.15844856202602386, + "learning_rate": 3.052518777724887e-05, + "loss": 0.0055, + "step": 38760 + }, + { + "grad_norm": 0.2702769339084625, + "learning_rate": 3.04998092017025e-05, + "loss": 0.0076, + "step": 38770 + }, + { + "grad_norm": 0.2672969400882721, + "learning_rate": 3.0474436549799246e-05, + "loss": 0.0071, + "step": 38780 + }, + { + "grad_norm": 0.16617973148822784, + "learning_rate": 3.044906982924661e-05, + "loss": 0.0069, + "step": 38790 + }, + { + "grad_norm": 0.2865569293498993, + "learning_rate": 3.0423709047750337e-05, + "loss": 0.0048, + "step": 38800 + }, + { + "grad_norm": 0.24740882217884064, + "learning_rate": 3.03983542130144e-05, + "loss": 0.0079, + "step": 38810 + }, + { + "grad_norm": 0.16724632680416107, + "learning_rate": 3.0373005332740877e-05, + "loss": 0.0056, + "step": 38820 + }, + { + "grad_norm": 0.15566882491111755, + "learning_rate": 3.034766241463013e-05, + "loss": 0.0042, + "step": 38830 + }, + { + "grad_norm": 0.23041506111621857, + "learning_rate": 3.032232546638064e-05, + "loss": 0.0065, + "step": 38840 + }, + { + "grad_norm": 0.17141301929950714, + "learning_rate": 3.0296994495689114e-05, + "loss": 0.0049, + "step": 38850 + }, + { + "grad_norm": 0.26385313272476196, + "learning_rate": 3.0271669510250444e-05, + "loss": 0.0053, + "step": 38860 + }, + { + "grad_norm": 0.21630458533763885, + "learning_rate": 3.024635051775766e-05, + "loss": 0.0073, + "step": 38870 + }, + { + "grad_norm": 0.20403070747852325, + "learning_rate": 3.022103752590205e-05, + "loss": 0.0055, + "step": 38880 + }, + { + "grad_norm": 0.17463800311088562, + "learning_rate": 3.0195730542372992e-05, + "loss": 0.0042, + "step": 38890 + }, + { + "grad_norm": 0.15132248401641846, + "learning_rate": 3.0170429574858084e-05, + "loss": 0.0036, + "step": 38900 + }, + { + "grad_norm": 0.19021949172019958, + "learning_rate": 3.0145134631043127e-05, + "loss": 0.0053, + "step": 38910 + }, + { + "grad_norm": 0.20701666176319122, + "learning_rate": 3.0119845718612018e-05, + "loss": 0.0045, + "step": 38920 + }, + { + "grad_norm": 0.18600764870643616, + "learning_rate": 3.009456284524688e-05, + "loss": 0.0043, + "step": 38930 + }, + { + "grad_norm": 0.20948046445846558, + "learning_rate": 3.0069286018627967e-05, + "loss": 0.0046, + "step": 38940 + }, + { + "grad_norm": 0.18533526360988617, + "learning_rate": 3.0044015246433743e-05, + "loss": 0.0038, + "step": 38950 + }, + { + "grad_norm": 0.19902680814266205, + "learning_rate": 3.0018750536340755e-05, + "loss": 0.005, + "step": 38960 + }, + { + "grad_norm": 0.20158135890960693, + "learning_rate": 2.999349189602378e-05, + "loss": 0.0039, + "step": 38970 + }, + { + "grad_norm": 0.27020877599716187, + "learning_rate": 2.9968239333155733e-05, + "loss": 0.0044, + "step": 38980 + }, + { + "grad_norm": 0.20163144171237946, + "learning_rate": 2.994299285540767e-05, + "loss": 0.0041, + "step": 38990 + }, + { + "grad_norm": 0.185372456908226, + "learning_rate": 2.9917752470448813e-05, + "loss": 0.0041, + "step": 39000 + }, + { + "grad_norm": 0.26789581775665283, + "learning_rate": 2.9892518185946495e-05, + "loss": 0.0077, + "step": 39010 + }, + { + "grad_norm": 0.28187650442123413, + "learning_rate": 2.986729000956624e-05, + "loss": 0.0062, + "step": 39020 + }, + { + "grad_norm": 0.14906315505504608, + "learning_rate": 2.9842067948971736e-05, + "loss": 0.0039, + "step": 39030 + }, + { + "grad_norm": 0.22005410492420197, + "learning_rate": 2.9816852011824727e-05, + "loss": 0.0059, + "step": 39040 + }, + { + "grad_norm": 0.1969199776649475, + "learning_rate": 2.979164220578519e-05, + "loss": 0.0063, + "step": 39050 + }, + { + "grad_norm": 0.19907855987548828, + "learning_rate": 2.9766438538511165e-05, + "loss": 0.0049, + "step": 39060 + }, + { + "grad_norm": 0.22292138636112213, + "learning_rate": 2.9741241017658873e-05, + "loss": 0.0046, + "step": 39070 + }, + { + "grad_norm": 0.15400312840938568, + "learning_rate": 2.971604965088267e-05, + "loss": 0.0031, + "step": 39080 + }, + { + "grad_norm": 0.1860182136297226, + "learning_rate": 2.9690864445835008e-05, + "loss": 0.0057, + "step": 39090 + }, + { + "grad_norm": 0.26553893089294434, + "learning_rate": 2.966568541016651e-05, + "loss": 0.0044, + "step": 39100 + }, + { + "grad_norm": 0.1889704167842865, + "learning_rate": 2.9640512551525867e-05, + "loss": 0.0079, + "step": 39110 + }, + { + "grad_norm": 0.18434156477451324, + "learning_rate": 2.961534587755995e-05, + "loss": 0.0051, + "step": 39120 + }, + { + "grad_norm": 0.23027871549129486, + "learning_rate": 2.959018539591375e-05, + "loss": 0.005, + "step": 39130 + }, + { + "grad_norm": 0.2066088318824768, + "learning_rate": 2.9565031114230325e-05, + "loss": 0.008, + "step": 39140 + }, + { + "grad_norm": 0.22787030041217804, + "learning_rate": 2.9539883040150895e-05, + "loss": 0.005, + "step": 39150 + }, + { + "grad_norm": 0.21578925848007202, + "learning_rate": 2.9514741181314774e-05, + "loss": 0.0049, + "step": 39160 + }, + { + "grad_norm": 0.1817273646593094, + "learning_rate": 2.94896055453594e-05, + "loss": 0.0041, + "step": 39170 + }, + { + "grad_norm": 0.19556480646133423, + "learning_rate": 2.9464476139920332e-05, + "loss": 0.0038, + "step": 39180 + }, + { + "grad_norm": 0.2627725303173065, + "learning_rate": 2.9439352972631186e-05, + "loss": 0.0056, + "step": 39190 + }, + { + "grad_norm": 0.24497874081134796, + "learning_rate": 2.9414236051123757e-05, + "loss": 0.0059, + "step": 39200 + }, + { + "grad_norm": 0.2422783523797989, + "learning_rate": 2.938912538302785e-05, + "loss": 0.0048, + "step": 39210 + }, + { + "grad_norm": 0.15815244615077972, + "learning_rate": 2.9364020975971464e-05, + "loss": 0.0064, + "step": 39220 + }, + { + "grad_norm": 0.194553941488266, + "learning_rate": 2.9338922837580657e-05, + "loss": 0.0041, + "step": 39230 + }, + { + "grad_norm": 0.21852512657642365, + "learning_rate": 2.931383097547955e-05, + "loss": 0.0047, + "step": 39240 + }, + { + "grad_norm": 0.1858682930469513, + "learning_rate": 2.928874539729043e-05, + "loss": 0.0077, + "step": 39250 + }, + { + "grad_norm": 0.1445969045162201, + "learning_rate": 2.926366611063358e-05, + "loss": 0.0048, + "step": 39260 + }, + { + "grad_norm": 0.3509559631347656, + "learning_rate": 2.9238593123127463e-05, + "loss": 0.0046, + "step": 39270 + }, + { + "grad_norm": 0.20819500088691711, + "learning_rate": 2.9213526442388583e-05, + "loss": 0.0039, + "step": 39280 + }, + { + "grad_norm": 0.2361106276512146, + "learning_rate": 2.9188466076031545e-05, + "loss": 0.0093, + "step": 39290 + }, + { + "grad_norm": 0.18084196746349335, + "learning_rate": 2.9163412031669012e-05, + "loss": 0.0046, + "step": 39300 + }, + { + "grad_norm": 0.1923476755619049, + "learning_rate": 2.913836431691175e-05, + "loss": 0.007, + "step": 39310 + }, + { + "grad_norm": 0.26897430419921875, + "learning_rate": 2.9113322939368583e-05, + "loss": 0.0078, + "step": 39320 + }, + { + "grad_norm": 0.20294654369354248, + "learning_rate": 2.9088287906646427e-05, + "loss": 0.0045, + "step": 39330 + }, + { + "grad_norm": 0.2674720585346222, + "learning_rate": 2.906325922635024e-05, + "loss": 0.0092, + "step": 39340 + }, + { + "grad_norm": 0.3135108947753906, + "learning_rate": 2.903823690608313e-05, + "loss": 0.0064, + "step": 39350 + }, + { + "grad_norm": 0.3090534508228302, + "learning_rate": 2.9013220953446174e-05, + "loss": 0.0063, + "step": 39360 + }, + { + "grad_norm": 0.34263965487480164, + "learning_rate": 2.8988211376038564e-05, + "loss": 0.0049, + "step": 39370 + }, + { + "grad_norm": 0.2082008421421051, + "learning_rate": 2.8963208181457564e-05, + "loss": 0.0067, + "step": 39380 + }, + { + "grad_norm": 0.20318923890590668, + "learning_rate": 2.8938211377298453e-05, + "loss": 0.0048, + "step": 39390 + }, + { + "grad_norm": 0.19519469141960144, + "learning_rate": 2.8913220971154652e-05, + "loss": 0.0048, + "step": 39400 + }, + { + "grad_norm": 0.29822787642478943, + "learning_rate": 2.888823697061753e-05, + "loss": 0.0053, + "step": 39410 + }, + { + "grad_norm": 0.1345885992050171, + "learning_rate": 2.8863259383276618e-05, + "loss": 0.0035, + "step": 39420 + }, + { + "grad_norm": 0.16829681396484375, + "learning_rate": 2.8838288216719395e-05, + "loss": 0.0047, + "step": 39430 + }, + { + "grad_norm": 0.20874419808387756, + "learning_rate": 2.8813323478531484e-05, + "loss": 0.0047, + "step": 39440 + }, + { + "grad_norm": 0.1568189263343811, + "learning_rate": 2.8788365176296496e-05, + "loss": 0.0064, + "step": 39450 + }, + { + "grad_norm": 0.1999816745519638, + "learning_rate": 2.876341331759611e-05, + "loss": 0.0049, + "step": 39460 + }, + { + "grad_norm": 0.2438332736492157, + "learning_rate": 2.8738467910010036e-05, + "loss": 0.005, + "step": 39470 + }, + { + "grad_norm": 0.13956566154956818, + "learning_rate": 2.8713528961116032e-05, + "loss": 0.0056, + "step": 39480 + }, + { + "grad_norm": 0.19313320517539978, + "learning_rate": 2.8688596478489875e-05, + "loss": 0.0052, + "step": 39490 + }, + { + "grad_norm": 0.16613535583019257, + "learning_rate": 2.8663670469705434e-05, + "loss": 0.0052, + "step": 39500 + }, + { + "grad_norm": 0.174787238240242, + "learning_rate": 2.8638750942334546e-05, + "loss": 0.0077, + "step": 39510 + }, + { + "grad_norm": 0.13661451637744904, + "learning_rate": 2.8613837903947115e-05, + "loss": 0.009, + "step": 39520 + }, + { + "grad_norm": 0.23490449786186218, + "learning_rate": 2.858893136211106e-05, + "loss": 0.0048, + "step": 39530 + }, + { + "grad_norm": 0.2397747039794922, + "learning_rate": 2.8564031324392315e-05, + "loss": 0.0052, + "step": 39540 + }, + { + "grad_norm": 0.16338609158992767, + "learning_rate": 2.85391377983549e-05, + "loss": 0.0041, + "step": 39550 + }, + { + "grad_norm": 0.19758690893650055, + "learning_rate": 2.851425079156075e-05, + "loss": 0.0034, + "step": 39560 + }, + { + "grad_norm": 0.18963731825351715, + "learning_rate": 2.848937031156994e-05, + "loss": 0.0057, + "step": 39570 + }, + { + "grad_norm": 0.17410822212696075, + "learning_rate": 2.846449636594044e-05, + "loss": 0.0051, + "step": 39580 + }, + { + "grad_norm": 0.14894436299800873, + "learning_rate": 2.843962896222836e-05, + "loss": 0.0074, + "step": 39590 + }, + { + "grad_norm": 0.16451454162597656, + "learning_rate": 2.8414768107987722e-05, + "loss": 0.0052, + "step": 39600 + }, + { + "grad_norm": 0.24079783260822296, + "learning_rate": 2.838991381077061e-05, + "loss": 0.0063, + "step": 39610 + }, + { + "grad_norm": 0.1851942390203476, + "learning_rate": 2.83650660781271e-05, + "loss": 0.0052, + "step": 39620 + }, + { + "grad_norm": 0.22354325652122498, + "learning_rate": 2.8340224917605285e-05, + "loss": 0.0048, + "step": 39630 + }, + { + "grad_norm": 0.2125934362411499, + "learning_rate": 2.831539033675122e-05, + "loss": 0.0048, + "step": 39640 + }, + { + "grad_norm": 0.17922115325927734, + "learning_rate": 2.8290562343109038e-05, + "loss": 0.0053, + "step": 39650 + }, + { + "grad_norm": 0.23346026241779327, + "learning_rate": 2.826574094422082e-05, + "loss": 0.0063, + "step": 39660 + }, + { + "grad_norm": 0.18768727779388428, + "learning_rate": 2.8240926147626645e-05, + "loss": 0.0047, + "step": 39670 + }, + { + "grad_norm": 0.11735614389181137, + "learning_rate": 2.8216117960864586e-05, + "loss": 0.0049, + "step": 39680 + }, + { + "grad_norm": 0.1411198079586029, + "learning_rate": 2.8191316391470703e-05, + "loss": 0.0042, + "step": 39690 + }, + { + "grad_norm": 0.14979593455791473, + "learning_rate": 2.816652144697911e-05, + "loss": 0.0046, + "step": 39700 + }, + { + "grad_norm": 0.20470823347568512, + "learning_rate": 2.8141733134921783e-05, + "loss": 0.0056, + "step": 39710 + }, + { + "grad_norm": 0.11361949145793915, + "learning_rate": 2.811695146282884e-05, + "loss": 0.0047, + "step": 39720 + }, + { + "grad_norm": 0.14694096148014069, + "learning_rate": 2.8092176438228212e-05, + "loss": 0.0057, + "step": 39730 + }, + { + "grad_norm": 0.2079312652349472, + "learning_rate": 2.806740806864598e-05, + "loss": 0.0046, + "step": 39740 + }, + { + "grad_norm": 0.21088510751724243, + "learning_rate": 2.804264636160604e-05, + "loss": 0.0049, + "step": 39750 + }, + { + "grad_norm": 0.17210254073143005, + "learning_rate": 2.8017891324630402e-05, + "loss": 0.0052, + "step": 39760 + }, + { + "grad_norm": 0.1771102398633957, + "learning_rate": 2.7993142965238976e-05, + "loss": 0.0043, + "step": 39770 + }, + { + "grad_norm": 0.2409965991973877, + "learning_rate": 2.7968401290949665e-05, + "loss": 0.0044, + "step": 39780 + }, + { + "grad_norm": 0.31642183661460876, + "learning_rate": 2.7943666309278328e-05, + "loss": 0.0058, + "step": 39790 + }, + { + "grad_norm": 0.23338210582733154, + "learning_rate": 2.7918938027738783e-05, + "loss": 0.004, + "step": 39800 + }, + { + "grad_norm": 0.19371086359024048, + "learning_rate": 2.789421645384287e-05, + "loss": 0.0072, + "step": 39810 + }, + { + "grad_norm": 0.1881861537694931, + "learning_rate": 2.786950159510032e-05, + "loss": 0.005, + "step": 39820 + }, + { + "grad_norm": 0.1700982004404068, + "learning_rate": 2.7844793459018876e-05, + "loss": 0.0043, + "step": 39830 + }, + { + "grad_norm": 0.1657911092042923, + "learning_rate": 2.7820092053104195e-05, + "loss": 0.0057, + "step": 39840 + }, + { + "grad_norm": 0.2305365353822708, + "learning_rate": 2.7795397384859933e-05, + "loss": 0.0082, + "step": 39850 + }, + { + "grad_norm": 0.2516365051269531, + "learning_rate": 2.7770709461787638e-05, + "loss": 0.008, + "step": 39860 + }, + { + "grad_norm": 0.18513330817222595, + "learning_rate": 2.7746028291386915e-05, + "loss": 0.0046, + "step": 39870 + }, + { + "grad_norm": 0.15349088609218597, + "learning_rate": 2.772135388115519e-05, + "loss": 0.0043, + "step": 39880 + }, + { + "grad_norm": 0.18356260657310486, + "learning_rate": 2.7696686238587945e-05, + "loss": 0.0047, + "step": 39890 + }, + { + "grad_norm": 0.20053298771381378, + "learning_rate": 2.7672025371178505e-05, + "loss": 0.0054, + "step": 39900 + }, + { + "grad_norm": 0.13689036667346954, + "learning_rate": 2.7647371286418238e-05, + "loss": 0.008, + "step": 39910 + }, + { + "grad_norm": 0.2753556966781616, + "learning_rate": 2.762272399179639e-05, + "loss": 0.0072, + "step": 39920 + }, + { + "grad_norm": 0.14381039142608643, + "learning_rate": 2.7598083494800154e-05, + "loss": 0.0041, + "step": 39930 + }, + { + "grad_norm": 0.2688097655773163, + "learning_rate": 2.7573449802914664e-05, + "loss": 0.0042, + "step": 39940 + }, + { + "grad_norm": 0.10915149748325348, + "learning_rate": 2.7548822923622964e-05, + "loss": 0.006, + "step": 39950 + }, + { + "grad_norm": 0.2022131383419037, + "learning_rate": 2.752420286440609e-05, + "loss": 0.0046, + "step": 39960 + }, + { + "grad_norm": 0.1734929084777832, + "learning_rate": 2.749958963274295e-05, + "loss": 0.0042, + "step": 39970 + }, + { + "grad_norm": 0.19679979979991913, + "learning_rate": 2.747498323611039e-05, + "loss": 0.0053, + "step": 39980 + }, + { + "grad_norm": 0.15817561745643616, + "learning_rate": 2.7450383681983184e-05, + "loss": 0.0061, + "step": 39990 + }, + { + "grad_norm": 0.20190629363059998, + "learning_rate": 2.742579097783403e-05, + "loss": 0.0052, + "step": 40000 + } + ], + "logging_steps": 10, + "max_steps": 60000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}