diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "grad_norm": 4.993945121765137, + "learning_rate": 9e-07, + "loss": 1.3918, + "step": 10 + }, + { + "grad_norm": 3.423759698867798, + "learning_rate": 1.9e-06, + "loss": 1.3535, + "step": 20 + }, + { + "grad_norm": 1.7111748456954956, + "learning_rate": 2.9e-06, + "loss": 1.2344, + "step": 30 + }, + { + "grad_norm": 0.8323810696601868, + "learning_rate": 3.9e-06, + "loss": 1.1363, + "step": 40 + }, + { + "grad_norm": 0.44304579496383667, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.0656, + "step": 50 + }, + { + "grad_norm": 0.3622480034828186, + "learning_rate": 5.9e-06, + "loss": 1.0457, + "step": 60 + }, + { + "grad_norm": 0.3319099247455597, + "learning_rate": 6.900000000000001e-06, + "loss": 1.0367, + "step": 70 + }, + { + "grad_norm": 0.4650214612483978, + "learning_rate": 7.9e-06, + "loss": 1.0527, + "step": 80 + }, + { + "grad_norm": 0.4450676143169403, + "learning_rate": 8.9e-06, + "loss": 1.0531, + "step": 90 + }, + { + "grad_norm": 0.2956357002258301, + "learning_rate": 9.900000000000002e-06, + "loss": 1.0398, + "step": 100 + }, + { + "grad_norm": 0.4399329721927643, + "learning_rate": 1.09e-05, + "loss": 1.0428, + "step": 110 + }, + { + "grad_norm": 0.3126818835735321, + "learning_rate": 1.19e-05, + "loss": 1.0395, + "step": 120 + }, + { + "grad_norm": 0.5520121455192566, + "learning_rate": 1.29e-05, + "loss": 1.0127, + "step": 130 + }, + { + "grad_norm": 0.6042282581329346, + "learning_rate": 1.3900000000000002e-05, + "loss": 1.0027, + "step": 140 + }, + { + "grad_norm": 0.7426028251647949, + "learning_rate": 1.49e-05, + "loss": 0.9764, + "step": 150 + }, + { + "grad_norm": 1.1313856840133667, + "learning_rate": 1.59e-05, + "loss": 0.9197, + "step": 160 + }, + { + "grad_norm": 1.190605878829956, + "learning_rate": 1.69e-05, + "loss": 0.8305, + "step": 170 + }, + { + "grad_norm": 1.4809985160827637, + "learning_rate": 1.79e-05, + "loss": 0.7311, + "step": 180 + }, + { + "grad_norm": 1.4859532117843628, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.6406, + "step": 190 + }, + { + "grad_norm": 1.5570173263549805, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.5515, + "step": 200 + }, + { + "grad_norm": 1.6518566608428955, + "learning_rate": 2.09e-05, + "loss": 0.4848, + "step": 210 + }, + { + "grad_norm": 1.893057942390442, + "learning_rate": 2.19e-05, + "loss": 0.4215, + "step": 220 + }, + { + "grad_norm": 2.341607093811035, + "learning_rate": 2.29e-05, + "loss": 0.3814, + "step": 230 + }, + { + "grad_norm": 1.7942633628845215, + "learning_rate": 2.39e-05, + "loss": 0.3405, + "step": 240 + }, + { + "grad_norm": 2.3446788787841797, + "learning_rate": 2.4900000000000002e-05, + "loss": 0.3133, + "step": 250 + }, + { + "grad_norm": 1.7486273050308228, + "learning_rate": 2.5900000000000003e-05, + "loss": 0.2716, + "step": 260 + }, + { + "grad_norm": 2.3605430126190186, + "learning_rate": 2.6900000000000003e-05, + "loss": 0.2225, + "step": 270 + }, + { + "grad_norm": 2.1422431468963623, + "learning_rate": 2.7900000000000004e-05, + "loss": 0.2107, + "step": 280 + }, + { + "grad_norm": 2.0858869552612305, + "learning_rate": 2.8899999999999998e-05, + "loss": 0.1841, + "step": 290 + }, + { + "grad_norm": 2.1064820289611816, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.1817, + "step": 300 + }, + { + "grad_norm": 2.2429964542388916, + "learning_rate": 3.09e-05, + "loss": 0.1895, + "step": 310 + }, + { + "grad_norm": 1.8397496938705444, + "learning_rate": 3.19e-05, + "loss": 0.1719, + "step": 320 + }, + { + "grad_norm": 1.8902621269226074, + "learning_rate": 3.29e-05, + "loss": 0.1714, + "step": 330 + }, + { + "grad_norm": 2.0498616695404053, + "learning_rate": 3.3900000000000004e-05, + "loss": 0.1599, + "step": 340 + }, + { + "grad_norm": 2.1040191650390625, + "learning_rate": 3.49e-05, + "loss": 0.1516, + "step": 350 + }, + { + "grad_norm": 1.9790912866592407, + "learning_rate": 3.59e-05, + "loss": 0.15, + "step": 360 + }, + { + "grad_norm": 2.2093801498413086, + "learning_rate": 3.69e-05, + "loss": 0.143, + "step": 370 + }, + { + "grad_norm": 2.1348953247070312, + "learning_rate": 3.79e-05, + "loss": 0.1443, + "step": 380 + }, + { + "grad_norm": 1.7705327272415161, + "learning_rate": 3.8900000000000004e-05, + "loss": 0.1406, + "step": 390 + }, + { + "grad_norm": 1.7867079973220825, + "learning_rate": 3.99e-05, + "loss": 0.1352, + "step": 400 + }, + { + "grad_norm": 1.8254499435424805, + "learning_rate": 4.09e-05, + "loss": 0.1202, + "step": 410 + }, + { + "grad_norm": 2.074378728866577, + "learning_rate": 4.19e-05, + "loss": 0.0993, + "step": 420 + }, + { + "grad_norm": 1.596728801727295, + "learning_rate": 4.29e-05, + "loss": 0.1061, + "step": 430 + }, + { + "grad_norm": 2.318331480026245, + "learning_rate": 4.39e-05, + "loss": 0.0991, + "step": 440 + }, + { + "grad_norm": 2.02801513671875, + "learning_rate": 4.49e-05, + "loss": 0.1086, + "step": 450 + }, + { + "grad_norm": 1.9473150968551636, + "learning_rate": 4.5900000000000004e-05, + "loss": 0.0961, + "step": 460 + }, + { + "grad_norm": 1.9305334091186523, + "learning_rate": 4.69e-05, + "loss": 0.1091, + "step": 470 + }, + { + "grad_norm": 2.219036817550659, + "learning_rate": 4.79e-05, + "loss": 0.1059, + "step": 480 + }, + { + "grad_norm": 2.2115659713745117, + "learning_rate": 4.89e-05, + "loss": 0.0987, + "step": 490 + }, + { + "grad_norm": 1.901015281677246, + "learning_rate": 4.99e-05, + "loss": 0.0922, + "step": 500 + }, + { + "grad_norm": 2.1519229412078857, + "learning_rate": 5.0900000000000004e-05, + "loss": 0.0723, + "step": 510 + }, + { + "grad_norm": 1.873426079750061, + "learning_rate": 5.19e-05, + "loss": 0.0652, + "step": 520 + }, + { + "grad_norm": 2.1410093307495117, + "learning_rate": 5.2900000000000005e-05, + "loss": 0.0669, + "step": 530 + }, + { + "grad_norm": 2.186974048614502, + "learning_rate": 5.390000000000001e-05, + "loss": 0.0842, + "step": 540 + }, + { + "grad_norm": 1.8643074035644531, + "learning_rate": 5.4900000000000006e-05, + "loss": 0.0924, + "step": 550 + }, + { + "grad_norm": 1.8394888639450073, + "learning_rate": 5.590000000000001e-05, + "loss": 0.0999, + "step": 560 + }, + { + "grad_norm": 1.868140697479248, + "learning_rate": 5.69e-05, + "loss": 0.0993, + "step": 570 + }, + { + "grad_norm": 1.8147094249725342, + "learning_rate": 5.79e-05, + "loss": 0.0841, + "step": 580 + }, + { + "grad_norm": 1.7090058326721191, + "learning_rate": 5.89e-05, + "loss": 0.0718, + "step": 590 + }, + { + "grad_norm": 1.5713618993759155, + "learning_rate": 5.99e-05, + "loss": 0.0749, + "step": 600 + }, + { + "grad_norm": 1.8866902589797974, + "learning_rate": 6.09e-05, + "loss": 0.0751, + "step": 610 + }, + { + "grad_norm": 2.1187193393707275, + "learning_rate": 6.19e-05, + "loss": 0.0804, + "step": 620 + }, + { + "grad_norm": 1.685813546180725, + "learning_rate": 6.29e-05, + "loss": 0.0825, + "step": 630 + }, + { + "grad_norm": 1.4747114181518555, + "learning_rate": 6.390000000000001e-05, + "loss": 0.0875, + "step": 640 + }, + { + "grad_norm": 1.4714409112930298, + "learning_rate": 6.49e-05, + "loss": 0.0849, + "step": 650 + }, + { + "grad_norm": 2.2552554607391357, + "learning_rate": 6.59e-05, + "loss": 0.0937, + "step": 660 + }, + { + "grad_norm": 1.5055975914001465, + "learning_rate": 6.690000000000001e-05, + "loss": 0.0955, + "step": 670 + }, + { + "grad_norm": 1.5907299518585205, + "learning_rate": 6.790000000000001e-05, + "loss": 0.0928, + "step": 680 + }, + { + "grad_norm": 1.400766134262085, + "learning_rate": 6.89e-05, + "loss": 0.0932, + "step": 690 + }, + { + "grad_norm": 1.4266048669815063, + "learning_rate": 6.99e-05, + "loss": 0.0881, + "step": 700 + }, + { + "grad_norm": 1.6281338930130005, + "learning_rate": 7.09e-05, + "loss": 0.092, + "step": 710 + }, + { + "grad_norm": 1.6907267570495605, + "learning_rate": 7.19e-05, + "loss": 0.0835, + "step": 720 + }, + { + "grad_norm": 1.332542896270752, + "learning_rate": 7.29e-05, + "loss": 0.0705, + "step": 730 + }, + { + "grad_norm": 1.4303714036941528, + "learning_rate": 7.390000000000001e-05, + "loss": 0.0817, + "step": 740 + }, + { + "grad_norm": 1.973418951034546, + "learning_rate": 7.49e-05, + "loss": 0.0858, + "step": 750 + }, + { + "grad_norm": 2.1405203342437744, + "learning_rate": 7.59e-05, + "loss": 0.0879, + "step": 760 + }, + { + "grad_norm": 1.4094146490097046, + "learning_rate": 7.69e-05, + "loss": 0.0841, + "step": 770 + }, + { + "grad_norm": 1.3470752239227295, + "learning_rate": 7.790000000000001e-05, + "loss": 0.0771, + "step": 780 + }, + { + "grad_norm": 1.197171688079834, + "learning_rate": 7.890000000000001e-05, + "loss": 0.0756, + "step": 790 + }, + { + "grad_norm": 1.5943537950515747, + "learning_rate": 7.99e-05, + "loss": 0.0754, + "step": 800 + }, + { + "grad_norm": 1.5142279863357544, + "learning_rate": 8.090000000000001e-05, + "loss": 0.0736, + "step": 810 + }, + { + "grad_norm": 1.2352672815322876, + "learning_rate": 8.19e-05, + "loss": 0.0802, + "step": 820 + }, + { + "grad_norm": 1.5542058944702148, + "learning_rate": 8.29e-05, + "loss": 0.0868, + "step": 830 + }, + { + "grad_norm": 1.151941180229187, + "learning_rate": 8.39e-05, + "loss": 0.0795, + "step": 840 + }, + { + "grad_norm": 1.199670433998108, + "learning_rate": 8.49e-05, + "loss": 0.0728, + "step": 850 + }, + { + "grad_norm": 1.4130802154541016, + "learning_rate": 8.59e-05, + "loss": 0.0788, + "step": 860 + }, + { + "grad_norm": 1.3321387767791748, + "learning_rate": 8.69e-05, + "loss": 0.0754, + "step": 870 + }, + { + "grad_norm": 1.5437042713165283, + "learning_rate": 8.790000000000001e-05, + "loss": 0.0829, + "step": 880 + }, + { + "grad_norm": 1.2115956544876099, + "learning_rate": 8.89e-05, + "loss": 0.079, + "step": 890 + }, + { + "grad_norm": 1.4280129671096802, + "learning_rate": 8.99e-05, + "loss": 0.0723, + "step": 900 + }, + { + "grad_norm": 1.4627995491027832, + "learning_rate": 9.090000000000001e-05, + "loss": 0.072, + "step": 910 + }, + { + "grad_norm": 1.0619111061096191, + "learning_rate": 9.190000000000001e-05, + "loss": 0.0702, + "step": 920 + }, + { + "grad_norm": 1.3733621835708618, + "learning_rate": 9.290000000000001e-05, + "loss": 0.0722, + "step": 930 + }, + { + "grad_norm": 1.2505708932876587, + "learning_rate": 9.39e-05, + "loss": 0.0676, + "step": 940 + }, + { + "grad_norm": 1.37816321849823, + "learning_rate": 9.49e-05, + "loss": 0.076, + "step": 950 + }, + { + "grad_norm": 1.057328224182129, + "learning_rate": 9.59e-05, + "loss": 0.0772, + "step": 960 + }, + { + "grad_norm": 1.3616372346878052, + "learning_rate": 9.69e-05, + "loss": 0.0755, + "step": 970 + }, + { + "grad_norm": 1.223283052444458, + "learning_rate": 9.790000000000001e-05, + "loss": 0.0771, + "step": 980 + }, + { + "grad_norm": 1.2254199981689453, + "learning_rate": 9.89e-05, + "loss": 0.0719, + "step": 990 + }, + { + "grad_norm": 1.1356757879257202, + "learning_rate": 9.99e-05, + "loss": 0.0751, + "step": 1000 + }, + { + "grad_norm": 1.3630599975585938, + "learning_rate": 9.999994463727085e-05, + "loss": 0.0638, + "step": 1010 + }, + { + "grad_norm": 0.9833061695098877, + "learning_rate": 9.999975326009292e-05, + "loss": 0.063, + "step": 1020 + }, + { + "grad_norm": 1.3349156379699707, + "learning_rate": 9.999942518549879e-05, + "loss": 0.068, + "step": 1030 + }, + { + "grad_norm": 1.0759925842285156, + "learning_rate": 9.999896041438544e-05, + "loss": 0.0643, + "step": 1040 + }, + { + "grad_norm": 1.0783710479736328, + "learning_rate": 9.999835894802353e-05, + "loss": 0.0623, + "step": 1050 + }, + { + "grad_norm": 1.172471284866333, + "learning_rate": 9.999762078805743e-05, + "loss": 0.0672, + "step": 1060 + }, + { + "grad_norm": 1.135905385017395, + "learning_rate": 9.999674593650526e-05, + "loss": 0.0674, + "step": 1070 + }, + { + "grad_norm": 1.0808508396148682, + "learning_rate": 9.99957343957588e-05, + "loss": 0.057, + "step": 1080 + }, + { + "grad_norm": 0.9804912209510803, + "learning_rate": 9.99945861685836e-05, + "loss": 0.0613, + "step": 1090 + }, + { + "grad_norm": 1.3137835264205933, + "learning_rate": 9.999330125811884e-05, + "loss": 0.0653, + "step": 1100 + }, + { + "grad_norm": 0.9661508202552795, + "learning_rate": 9.999187966787744e-05, + "loss": 0.0578, + "step": 1110 + }, + { + "grad_norm": 1.1586745977401733, + "learning_rate": 9.999032140174595e-05, + "loss": 0.0635, + "step": 1120 + }, + { + "grad_norm": 1.1651030778884888, + "learning_rate": 9.998862646398464e-05, + "loss": 0.0649, + "step": 1130 + }, + { + "grad_norm": 1.070664405822754, + "learning_rate": 9.998679485922739e-05, + "loss": 0.0518, + "step": 1140 + }, + { + "grad_norm": 1.0347111225128174, + "learning_rate": 9.998482659248174e-05, + "loss": 0.0628, + "step": 1150 + }, + { + "grad_norm": 0.9090607762336731, + "learning_rate": 9.998272166912883e-05, + "loss": 0.0566, + "step": 1160 + }, + { + "grad_norm": 0.8385518193244934, + "learning_rate": 9.998048009492347e-05, + "loss": 0.0571, + "step": 1170 + }, + { + "grad_norm": 0.7850964069366455, + "learning_rate": 9.997810187599403e-05, + "loss": 0.057, + "step": 1180 + }, + { + "grad_norm": 0.7865731120109558, + "learning_rate": 9.997558701884249e-05, + "loss": 0.0585, + "step": 1190 + }, + { + "grad_norm": 0.9389501214027405, + "learning_rate": 9.997293553034433e-05, + "loss": 0.0602, + "step": 1200 + }, + { + "grad_norm": 0.951545774936676, + "learning_rate": 9.997014741774866e-05, + "loss": 0.0561, + "step": 1210 + }, + { + "grad_norm": 1.126580834388733, + "learning_rate": 9.996722268867803e-05, + "loss": 0.0579, + "step": 1220 + }, + { + "grad_norm": 0.8887141942977905, + "learning_rate": 9.996416135112858e-05, + "loss": 0.0596, + "step": 1230 + }, + { + "grad_norm": 0.9025460481643677, + "learning_rate": 9.996096341346988e-05, + "loss": 0.0662, + "step": 1240 + }, + { + "grad_norm": 0.8014842867851257, + "learning_rate": 9.995762888444495e-05, + "loss": 0.0635, + "step": 1250 + }, + { + "grad_norm": 0.9758684039115906, + "learning_rate": 9.995415777317027e-05, + "loss": 0.0598, + "step": 1260 + }, + { + "grad_norm": 0.9459826946258545, + "learning_rate": 9.995055008913574e-05, + "loss": 0.0746, + "step": 1270 + }, + { + "grad_norm": 0.8943505883216858, + "learning_rate": 9.994680584220463e-05, + "loss": 0.0627, + "step": 1280 + }, + { + "grad_norm": 1.0308504104614258, + "learning_rate": 9.994292504261355e-05, + "loss": 0.0653, + "step": 1290 + }, + { + "grad_norm": 0.883803129196167, + "learning_rate": 9.993890770097247e-05, + "loss": 0.0557, + "step": 1300 + }, + { + "grad_norm": 0.73323655128479, + "learning_rate": 9.993475382826467e-05, + "loss": 0.0527, + "step": 1310 + }, + { + "grad_norm": 0.9589157700538635, + "learning_rate": 9.993046343584664e-05, + "loss": 0.0684, + "step": 1320 + }, + { + "grad_norm": 0.8940895795822144, + "learning_rate": 9.992603653544816e-05, + "loss": 0.0509, + "step": 1330 + }, + { + "grad_norm": 0.9251660704612732, + "learning_rate": 9.992147313917222e-05, + "loss": 0.0474, + "step": 1340 + }, + { + "grad_norm": 0.6932914853096008, + "learning_rate": 9.991677325949497e-05, + "loss": 0.0475, + "step": 1350 + }, + { + "grad_norm": 0.8829203248023987, + "learning_rate": 9.991193690926568e-05, + "loss": 0.0512, + "step": 1360 + }, + { + "grad_norm": 0.7842630743980408, + "learning_rate": 9.990696410170678e-05, + "loss": 0.0467, + "step": 1370 + }, + { + "grad_norm": 0.9309704303741455, + "learning_rate": 9.990185485041371e-05, + "loss": 0.0504, + "step": 1380 + }, + { + "grad_norm": 0.8081430196762085, + "learning_rate": 9.989660916935498e-05, + "loss": 0.0505, + "step": 1390 + }, + { + "grad_norm": 0.7329186201095581, + "learning_rate": 9.989122707287208e-05, + "loss": 0.0526, + "step": 1400 + }, + { + "grad_norm": 0.8042072653770447, + "learning_rate": 9.988570857567945e-05, + "loss": 0.0536, + "step": 1410 + }, + { + "grad_norm": 0.7964304089546204, + "learning_rate": 9.988005369286446e-05, + "loss": 0.0512, + "step": 1420 + }, + { + "grad_norm": 0.8344742655754089, + "learning_rate": 9.987426243988734e-05, + "loss": 0.0578, + "step": 1430 + }, + { + "grad_norm": 0.9833702445030212, + "learning_rate": 9.986833483258114e-05, + "loss": 0.0525, + "step": 1440 + }, + { + "grad_norm": 0.8511132001876831, + "learning_rate": 9.986227088715173e-05, + "loss": 0.0511, + "step": 1450 + }, + { + "grad_norm": 0.8239938616752625, + "learning_rate": 9.98560706201777e-05, + "loss": 0.0585, + "step": 1460 + }, + { + "grad_norm": 0.8089591264724731, + "learning_rate": 9.984973404861036e-05, + "loss": 0.0491, + "step": 1470 + }, + { + "grad_norm": 0.8275749683380127, + "learning_rate": 9.984326118977361e-05, + "loss": 0.0454, + "step": 1480 + }, + { + "grad_norm": 0.6359297037124634, + "learning_rate": 9.983665206136406e-05, + "loss": 0.0483, + "step": 1490 + }, + { + "grad_norm": 0.8277090787887573, + "learning_rate": 9.982990668145075e-05, + "loss": 0.0471, + "step": 1500 + }, + { + "grad_norm": 1.0116573572158813, + "learning_rate": 9.982302506847534e-05, + "loss": 0.0557, + "step": 1510 + }, + { + "grad_norm": 0.9102075695991516, + "learning_rate": 9.981600724125189e-05, + "loss": 0.0552, + "step": 1520 + }, + { + "grad_norm": 0.7342894673347473, + "learning_rate": 9.980885321896685e-05, + "loss": 0.0566, + "step": 1530 + }, + { + "grad_norm": 0.9402394890785217, + "learning_rate": 9.980156302117905e-05, + "loss": 0.0639, + "step": 1540 + }, + { + "grad_norm": 0.9465509653091431, + "learning_rate": 9.979413666781963e-05, + "loss": 0.0578, + "step": 1550 + }, + { + "grad_norm": 0.6617011427879333, + "learning_rate": 9.978657417919193e-05, + "loss": 0.0516, + "step": 1560 + }, + { + "grad_norm": 0.681211531162262, + "learning_rate": 9.977887557597153e-05, + "loss": 0.0457, + "step": 1570 + }, + { + "grad_norm": 0.7713249921798706, + "learning_rate": 9.97710408792061e-05, + "loss": 0.0385, + "step": 1580 + }, + { + "grad_norm": 0.6770836710929871, + "learning_rate": 9.976307011031542e-05, + "loss": 0.0453, + "step": 1590 + }, + { + "grad_norm": 0.8198121786117554, + "learning_rate": 9.975496329109126e-05, + "loss": 0.0523, + "step": 1600 + }, + { + "grad_norm": 0.798035740852356, + "learning_rate": 9.974672044369732e-05, + "loss": 0.0512, + "step": 1610 + }, + { + "grad_norm": 0.7177817225456238, + "learning_rate": 9.97383415906693e-05, + "loss": 0.0497, + "step": 1620 + }, + { + "grad_norm": 0.7852838039398193, + "learning_rate": 9.97298267549146e-05, + "loss": 0.0502, + "step": 1630 + }, + { + "grad_norm": 0.760026216506958, + "learning_rate": 9.972117595971249e-05, + "loss": 0.0546, + "step": 1640 + }, + { + "grad_norm": 0.7583540081977844, + "learning_rate": 9.971238922871391e-05, + "loss": 0.0501, + "step": 1650 + }, + { + "grad_norm": 0.8529834747314453, + "learning_rate": 9.970346658594142e-05, + "loss": 0.0529, + "step": 1660 + }, + { + "grad_norm": 0.7892301678657532, + "learning_rate": 9.969440805578923e-05, + "loss": 0.0579, + "step": 1670 + }, + { + "grad_norm": 0.8929382562637329, + "learning_rate": 9.968521366302298e-05, + "loss": 0.0514, + "step": 1680 + }, + { + "grad_norm": 0.5422148108482361, + "learning_rate": 9.967588343277981e-05, + "loss": 0.0432, + "step": 1690 + }, + { + "grad_norm": 0.8520615696907043, + "learning_rate": 9.966641739056818e-05, + "loss": 0.0502, + "step": 1700 + }, + { + "grad_norm": 0.812414288520813, + "learning_rate": 9.965681556226793e-05, + "loss": 0.0448, + "step": 1710 + }, + { + "grad_norm": 0.6300027966499329, + "learning_rate": 9.964707797413006e-05, + "loss": 0.0457, + "step": 1720 + }, + { + "grad_norm": 0.6134947538375854, + "learning_rate": 9.963720465277679e-05, + "loss": 0.0479, + "step": 1730 + }, + { + "grad_norm": 0.7621533274650574, + "learning_rate": 9.96271956252014e-05, + "loss": 0.0442, + "step": 1740 + }, + { + "grad_norm": 0.7284694910049438, + "learning_rate": 9.961705091876816e-05, + "loss": 0.0414, + "step": 1750 + }, + { + "grad_norm": 0.5892438888549805, + "learning_rate": 9.960677056121235e-05, + "loss": 0.0405, + "step": 1760 + }, + { + "grad_norm": 0.6418530941009521, + "learning_rate": 9.959635458064005e-05, + "loss": 0.0499, + "step": 1770 + }, + { + "grad_norm": 0.7680502533912659, + "learning_rate": 9.958580300552815e-05, + "loss": 0.0512, + "step": 1780 + }, + { + "grad_norm": 0.689338207244873, + "learning_rate": 9.957511586472426e-05, + "loss": 0.0446, + "step": 1790 + }, + { + "grad_norm": 0.7646634578704834, + "learning_rate": 9.956429318744662e-05, + "loss": 0.0469, + "step": 1800 + }, + { + "grad_norm": 0.8325613737106323, + "learning_rate": 9.955333500328404e-05, + "loss": 0.0444, + "step": 1810 + }, + { + "grad_norm": 0.5923596620559692, + "learning_rate": 9.95422413421957e-05, + "loss": 0.0436, + "step": 1820 + }, + { + "grad_norm": 0.6565541625022888, + "learning_rate": 9.953101223451133e-05, + "loss": 0.0411, + "step": 1830 + }, + { + "grad_norm": 0.6565862894058228, + "learning_rate": 9.951964771093085e-05, + "loss": 0.0408, + "step": 1840 + }, + { + "grad_norm": 0.659924328327179, + "learning_rate": 9.950814780252442e-05, + "loss": 0.0462, + "step": 1850 + }, + { + "grad_norm": 0.61118084192276, + "learning_rate": 9.949651254073236e-05, + "loss": 0.0413, + "step": 1860 + }, + { + "grad_norm": 0.7443326115608215, + "learning_rate": 9.948474195736504e-05, + "loss": 0.0406, + "step": 1870 + }, + { + "grad_norm": 0.7604121565818787, + "learning_rate": 9.947283608460277e-05, + "loss": 0.0393, + "step": 1880 + }, + { + "grad_norm": 0.6163865923881531, + "learning_rate": 9.946079495499577e-05, + "loss": 0.0366, + "step": 1890 + }, + { + "grad_norm": 0.7986574769020081, + "learning_rate": 9.944861860146401e-05, + "loss": 0.0488, + "step": 1900 + }, + { + "grad_norm": 0.8695516586303711, + "learning_rate": 9.943630705729719e-05, + "loss": 0.0488, + "step": 1910 + }, + { + "grad_norm": 0.6148108243942261, + "learning_rate": 9.942386035615459e-05, + "loss": 0.0444, + "step": 1920 + }, + { + "grad_norm": 0.6883893013000488, + "learning_rate": 9.941127853206503e-05, + "loss": 0.0443, + "step": 1930 + }, + { + "grad_norm": 0.6547062397003174, + "learning_rate": 9.939856161942673e-05, + "loss": 0.0457, + "step": 1940 + }, + { + "grad_norm": 0.7619850635528564, + "learning_rate": 9.938570965300724e-05, + "loss": 0.038, + "step": 1950 + }, + { + "grad_norm": 0.6129763126373291, + "learning_rate": 9.937272266794335e-05, + "loss": 0.0376, + "step": 1960 + }, + { + "grad_norm": 0.7471209168434143, + "learning_rate": 9.935960069974096e-05, + "loss": 0.0369, + "step": 1970 + }, + { + "grad_norm": 0.6831609010696411, + "learning_rate": 9.934634378427506e-05, + "loss": 0.0387, + "step": 1980 + }, + { + "grad_norm": 0.5533860921859741, + "learning_rate": 9.933295195778954e-05, + "loss": 0.038, + "step": 1990 + }, + { + "grad_norm": 0.6541736125946045, + "learning_rate": 9.931942525689715e-05, + "loss": 0.0395, + "step": 2000 + }, + { + "grad_norm": 0.6567339897155762, + "learning_rate": 9.930576371857936e-05, + "loss": 0.0454, + "step": 2010 + }, + { + "grad_norm": 0.8603509068489075, + "learning_rate": 9.929196738018629e-05, + "loss": 0.0461, + "step": 2020 + }, + { + "grad_norm": 0.7555803656578064, + "learning_rate": 9.927803627943662e-05, + "loss": 0.0443, + "step": 2030 + }, + { + "grad_norm": 0.7651776671409607, + "learning_rate": 9.926397045441744e-05, + "loss": 0.0472, + "step": 2040 + }, + { + "grad_norm": 0.6635904908180237, + "learning_rate": 9.924976994358417e-05, + "loss": 0.0497, + "step": 2050 + }, + { + "grad_norm": 0.6158275604248047, + "learning_rate": 9.923543478576048e-05, + "loss": 0.0402, + "step": 2060 + }, + { + "grad_norm": 0.8399137854576111, + "learning_rate": 9.922096502013813e-05, + "loss": 0.033, + "step": 2070 + }, + { + "grad_norm": 0.7092491388320923, + "learning_rate": 9.92063606862769e-05, + "loss": 0.0351, + "step": 2080 + }, + { + "grad_norm": 0.6932843327522278, + "learning_rate": 9.919162182410453e-05, + "loss": 0.0401, + "step": 2090 + }, + { + "grad_norm": 0.5982862710952759, + "learning_rate": 9.917674847391645e-05, + "loss": 0.046, + "step": 2100 + }, + { + "grad_norm": 0.7870092988014221, + "learning_rate": 9.916174067637584e-05, + "loss": 0.0442, + "step": 2110 + }, + { + "grad_norm": 0.6547104120254517, + "learning_rate": 9.914659847251348e-05, + "loss": 0.0432, + "step": 2120 + }, + { + "grad_norm": 0.7595717906951904, + "learning_rate": 9.913132190372753e-05, + "loss": 0.0442, + "step": 2130 + }, + { + "grad_norm": 0.6262362599372864, + "learning_rate": 9.911591101178359e-05, + "loss": 0.0395, + "step": 2140 + }, + { + "grad_norm": 0.5770407319068909, + "learning_rate": 9.910036583881443e-05, + "loss": 0.0389, + "step": 2150 + }, + { + "grad_norm": 0.7978299260139465, + "learning_rate": 9.908468642731995e-05, + "loss": 0.0349, + "step": 2160 + }, + { + "grad_norm": 0.7276806831359863, + "learning_rate": 9.906887282016707e-05, + "loss": 0.0484, + "step": 2170 + }, + { + "grad_norm": 0.6408252120018005, + "learning_rate": 9.90529250605896e-05, + "loss": 0.0452, + "step": 2180 + }, + { + "grad_norm": 0.7085357904434204, + "learning_rate": 9.903684319218809e-05, + "loss": 0.0384, + "step": 2190 + }, + { + "grad_norm": 0.6670119166374207, + "learning_rate": 9.902062725892976e-05, + "loss": 0.0364, + "step": 2200 + }, + { + "grad_norm": 0.6009535193443298, + "learning_rate": 9.900427730514834e-05, + "loss": 0.0437, + "step": 2210 + }, + { + "grad_norm": 0.6931541562080383, + "learning_rate": 9.8987793375544e-05, + "loss": 0.0487, + "step": 2220 + }, + { + "grad_norm": 0.7688516974449158, + "learning_rate": 9.897117551518318e-05, + "loss": 0.0485, + "step": 2230 + }, + { + "grad_norm": 0.7647536993026733, + "learning_rate": 9.895442376949844e-05, + "loss": 0.0501, + "step": 2240 + }, + { + "grad_norm": 0.5345138311386108, + "learning_rate": 9.893753818428845e-05, + "loss": 0.0496, + "step": 2250 + }, + { + "grad_norm": 0.6083430051803589, + "learning_rate": 9.892051880571773e-05, + "loss": 0.0377, + "step": 2260 + }, + { + "grad_norm": 0.7751136422157288, + "learning_rate": 9.890336568031663e-05, + "loss": 0.0372, + "step": 2270 + }, + { + "grad_norm": 0.5571203231811523, + "learning_rate": 9.888607885498113e-05, + "loss": 0.0351, + "step": 2280 + }, + { + "grad_norm": 0.5467405915260315, + "learning_rate": 9.886865837697275e-05, + "loss": 0.0296, + "step": 2290 + }, + { + "grad_norm": 0.5539022088050842, + "learning_rate": 9.88511042939184e-05, + "loss": 0.0306, + "step": 2300 + }, + { + "grad_norm": 0.6038488149642944, + "learning_rate": 9.883341665381028e-05, + "loss": 0.0311, + "step": 2310 + }, + { + "grad_norm": 0.6266620755195618, + "learning_rate": 9.881559550500575e-05, + "loss": 0.0348, + "step": 2320 + }, + { + "grad_norm": 0.699028491973877, + "learning_rate": 9.879764089622712e-05, + "loss": 0.0463, + "step": 2330 + }, + { + "grad_norm": 0.6910221576690674, + "learning_rate": 9.87795528765616e-05, + "loss": 0.0447, + "step": 2340 + }, + { + "grad_norm": 0.5585054755210876, + "learning_rate": 9.876133149546118e-05, + "loss": 0.0433, + "step": 2350 + }, + { + "grad_norm": 0.818328320980072, + "learning_rate": 9.874297680274238e-05, + "loss": 0.0475, + "step": 2360 + }, + { + "grad_norm": 0.8087906837463379, + "learning_rate": 9.872448884858624e-05, + "loss": 0.0432, + "step": 2370 + }, + { + "grad_norm": 0.5674486756324768, + "learning_rate": 9.870586768353815e-05, + "loss": 0.0374, + "step": 2380 + }, + { + "grad_norm": 0.7746816873550415, + "learning_rate": 9.868711335850764e-05, + "loss": 0.0417, + "step": 2390 + }, + { + "grad_norm": 0.8778523206710815, + "learning_rate": 9.866822592476833e-05, + "loss": 0.0433, + "step": 2400 + }, + { + "grad_norm": 0.4442523717880249, + "learning_rate": 9.86492054339577e-05, + "loss": 0.0293, + "step": 2410 + }, + { + "grad_norm": 0.6952754259109497, + "learning_rate": 9.863005193807711e-05, + "loss": 0.0312, + "step": 2420 + }, + { + "grad_norm": 0.5432419180870056, + "learning_rate": 9.861076548949143e-05, + "loss": 0.0336, + "step": 2430 + }, + { + "grad_norm": 0.5944503545761108, + "learning_rate": 9.859134614092912e-05, + "loss": 0.0333, + "step": 2440 + }, + { + "grad_norm": 0.6865516304969788, + "learning_rate": 9.857179394548191e-05, + "loss": 0.0361, + "step": 2450 + }, + { + "grad_norm": 0.5804076790809631, + "learning_rate": 9.855210895660477e-05, + "loss": 0.0403, + "step": 2460 + }, + { + "grad_norm": 0.5694625377655029, + "learning_rate": 9.853229122811568e-05, + "loss": 0.0348, + "step": 2470 + }, + { + "grad_norm": 0.713255763053894, + "learning_rate": 9.851234081419559e-05, + "loss": 0.0448, + "step": 2480 + }, + { + "grad_norm": 0.7057042717933655, + "learning_rate": 9.849225776938814e-05, + "loss": 0.0422, + "step": 2490 + }, + { + "grad_norm": 0.7249994277954102, + "learning_rate": 9.847204214859964e-05, + "loss": 0.0441, + "step": 2500 + }, + { + "grad_norm": 0.692448616027832, + "learning_rate": 9.845169400709879e-05, + "loss": 0.0464, + "step": 2510 + }, + { + "grad_norm": 0.6840732097625732, + "learning_rate": 9.843121340051664e-05, + "loss": 0.0436, + "step": 2520 + }, + { + "grad_norm": 0.5917824506759644, + "learning_rate": 9.841060038484641e-05, + "loss": 0.0365, + "step": 2530 + }, + { + "grad_norm": 0.5731431841850281, + "learning_rate": 9.838985501644328e-05, + "loss": 0.0328, + "step": 2540 + }, + { + "grad_norm": 0.6873908042907715, + "learning_rate": 9.83689773520243e-05, + "loss": 0.0404, + "step": 2550 + }, + { + "grad_norm": 0.6992762088775635, + "learning_rate": 9.834796744866819e-05, + "loss": 0.0434, + "step": 2560 + }, + { + "grad_norm": 0.6579549312591553, + "learning_rate": 9.832682536381525e-05, + "loss": 0.0442, + "step": 2570 + }, + { + "grad_norm": 0.6423393487930298, + "learning_rate": 9.830555115526711e-05, + "loss": 0.0501, + "step": 2580 + }, + { + "grad_norm": 0.7137966156005859, + "learning_rate": 9.828414488118667e-05, + "loss": 0.0449, + "step": 2590 + }, + { + "grad_norm": 0.6467208862304688, + "learning_rate": 9.826260660009785e-05, + "loss": 0.0438, + "step": 2600 + }, + { + "grad_norm": 0.583688497543335, + "learning_rate": 9.824093637088547e-05, + "loss": 0.042, + "step": 2610 + }, + { + "grad_norm": 0.6194069981575012, + "learning_rate": 9.821913425279514e-05, + "loss": 0.0389, + "step": 2620 + }, + { + "grad_norm": 0.5874842405319214, + "learning_rate": 9.8197200305433e-05, + "loss": 0.042, + "step": 2630 + }, + { + "grad_norm": 0.6214905977249146, + "learning_rate": 9.817513458876564e-05, + "loss": 0.0427, + "step": 2640 + }, + { + "grad_norm": 0.6732086539268494, + "learning_rate": 9.815293716311987e-05, + "loss": 0.0419, + "step": 2650 + }, + { + "grad_norm": 0.6538270115852356, + "learning_rate": 9.813060808918262e-05, + "loss": 0.0397, + "step": 2660 + }, + { + "grad_norm": 0.6308801174163818, + "learning_rate": 9.810814742800069e-05, + "loss": 0.0447, + "step": 2670 + }, + { + "grad_norm": 0.6440849304199219, + "learning_rate": 9.808555524098074e-05, + "loss": 0.044, + "step": 2680 + }, + { + "grad_norm": 0.6012044548988342, + "learning_rate": 9.806283158988887e-05, + "loss": 0.038, + "step": 2690 + }, + { + "grad_norm": 0.685500979423523, + "learning_rate": 9.803997653685072e-05, + "loss": 0.0366, + "step": 2700 + }, + { + "grad_norm": 0.581657886505127, + "learning_rate": 9.801699014435112e-05, + "loss": 0.0396, + "step": 2710 + }, + { + "grad_norm": 0.5161563754081726, + "learning_rate": 9.799387247523398e-05, + "loss": 0.0374, + "step": 2720 + }, + { + "grad_norm": 0.6774253249168396, + "learning_rate": 9.797062359270215e-05, + "loss": 0.0392, + "step": 2730 + }, + { + "grad_norm": 0.5523225665092468, + "learning_rate": 9.794724356031715e-05, + "loss": 0.0392, + "step": 2740 + }, + { + "grad_norm": 0.623840868473053, + "learning_rate": 9.792373244199913e-05, + "loss": 0.039, + "step": 2750 + }, + { + "grad_norm": 0.7543627619743347, + "learning_rate": 9.790009030202658e-05, + "loss": 0.0435, + "step": 2760 + }, + { + "grad_norm": 0.5150243639945984, + "learning_rate": 9.78763172050362e-05, + "loss": 0.0394, + "step": 2770 + }, + { + "grad_norm": 0.7309544086456299, + "learning_rate": 9.785241321602274e-05, + "loss": 0.0462, + "step": 2780 + }, + { + "grad_norm": 0.6759349703788757, + "learning_rate": 9.782837840033879e-05, + "loss": 0.0468, + "step": 2790 + }, + { + "grad_norm": 0.6460655927658081, + "learning_rate": 9.780421282369461e-05, + "loss": 0.04, + "step": 2800 + }, + { + "grad_norm": 0.5792378187179565, + "learning_rate": 9.777991655215797e-05, + "loss": 0.0356, + "step": 2810 + }, + { + "grad_norm": 0.6339410543441772, + "learning_rate": 9.775548965215394e-05, + "loss": 0.0341, + "step": 2820 + }, + { + "grad_norm": 0.4722057580947876, + "learning_rate": 9.773093219046474e-05, + "loss": 0.0382, + "step": 2830 + }, + { + "grad_norm": 0.5367486476898193, + "learning_rate": 9.770624423422954e-05, + "loss": 0.0391, + "step": 2840 + }, + { + "grad_norm": 0.5035861730575562, + "learning_rate": 9.768142585094426e-05, + "loss": 0.0335, + "step": 2850 + }, + { + "grad_norm": 0.6247970461845398, + "learning_rate": 9.765647710846142e-05, + "loss": 0.0315, + "step": 2860 + }, + { + "grad_norm": 0.6283077597618103, + "learning_rate": 9.763139807498991e-05, + "loss": 0.0328, + "step": 2870 + }, + { + "grad_norm": 0.48914265632629395, + "learning_rate": 9.760618881909487e-05, + "loss": 0.0247, + "step": 2880 + }, + { + "grad_norm": 0.5127755999565125, + "learning_rate": 9.758084940969744e-05, + "loss": 0.0199, + "step": 2890 + }, + { + "grad_norm": 0.5711562037467957, + "learning_rate": 9.755537991607459e-05, + "loss": 0.0194, + "step": 2900 + }, + { + "grad_norm": 0.9706650972366333, + "learning_rate": 9.752978040785895e-05, + "loss": 0.0255, + "step": 2910 + }, + { + "grad_norm": 0.5636439323425293, + "learning_rate": 9.750405095503859e-05, + "loss": 0.0325, + "step": 2920 + }, + { + "grad_norm": 0.6135112047195435, + "learning_rate": 9.747819162795686e-05, + "loss": 0.037, + "step": 2930 + }, + { + "grad_norm": 0.562365710735321, + "learning_rate": 9.745220249731217e-05, + "loss": 0.0366, + "step": 2940 + }, + { + "grad_norm": 0.5705785751342773, + "learning_rate": 9.742608363415781e-05, + "loss": 0.0391, + "step": 2950 + }, + { + "grad_norm": 0.5236222743988037, + "learning_rate": 9.739983510990176e-05, + "loss": 0.0362, + "step": 2960 + }, + { + "grad_norm": 0.6130008697509766, + "learning_rate": 9.737345699630647e-05, + "loss": 0.0322, + "step": 2970 + }, + { + "grad_norm": 0.5343535542488098, + "learning_rate": 9.734694936548869e-05, + "loss": 0.0305, + "step": 2980 + }, + { + "grad_norm": 0.6245582103729248, + "learning_rate": 9.732031228991932e-05, + "loss": 0.0311, + "step": 2990 + }, + { + "grad_norm": 0.7065011858940125, + "learning_rate": 9.729354584242302e-05, + "loss": 0.0374, + "step": 3000 + }, + { + "grad_norm": 0.698158323764801, + "learning_rate": 9.726665009617832e-05, + "loss": 0.0445, + "step": 3010 + }, + { + "grad_norm": 0.6592923402786255, + "learning_rate": 9.723962512471714e-05, + "loss": 0.0429, + "step": 3020 + }, + { + "grad_norm": 0.5743712186813354, + "learning_rate": 9.72124710019247e-05, + "loss": 0.0403, + "step": 3030 + }, + { + "grad_norm": 0.5819076895713806, + "learning_rate": 9.718518780203934e-05, + "loss": 0.0378, + "step": 3040 + }, + { + "grad_norm": 0.5471735000610352, + "learning_rate": 9.715777559965228e-05, + "loss": 0.0407, + "step": 3050 + }, + { + "grad_norm": 0.5923493504524231, + "learning_rate": 9.713023446970746e-05, + "loss": 0.0403, + "step": 3060 + }, + { + "grad_norm": 0.7241693735122681, + "learning_rate": 9.710256448750126e-05, + "loss": 0.0409, + "step": 3070 + }, + { + "grad_norm": 0.8389741778373718, + "learning_rate": 9.707476572868235e-05, + "loss": 0.0357, + "step": 3080 + }, + { + "grad_norm": 0.5713964104652405, + "learning_rate": 9.704683826925149e-05, + "loss": 0.032, + "step": 3090 + }, + { + "grad_norm": 0.7913616895675659, + "learning_rate": 9.701878218556129e-05, + "loss": 0.0358, + "step": 3100 + }, + { + "grad_norm": 0.6317907571792603, + "learning_rate": 9.699059755431598e-05, + "loss": 0.0422, + "step": 3110 + }, + { + "grad_norm": 0.7856892347335815, + "learning_rate": 9.696228445257132e-05, + "loss": 0.0389, + "step": 3120 + }, + { + "grad_norm": 0.6063565015792847, + "learning_rate": 9.693384295773419e-05, + "loss": 0.0522, + "step": 3130 + }, + { + "grad_norm": 0.6263126134872437, + "learning_rate": 9.690527314756259e-05, + "loss": 0.0389, + "step": 3140 + }, + { + "grad_norm": 0.6750133037567139, + "learning_rate": 9.687657510016527e-05, + "loss": 0.0396, + "step": 3150 + }, + { + "grad_norm": 0.4193972945213318, + "learning_rate": 9.684774889400161e-05, + "loss": 0.0282, + "step": 3160 + }, + { + "grad_norm": 0.4982199966907501, + "learning_rate": 9.681879460788135e-05, + "loss": 0.0285, + "step": 3170 + }, + { + "grad_norm": 0.6515623927116394, + "learning_rate": 9.67897123209644e-05, + "loss": 0.0295, + "step": 3180 + }, + { + "grad_norm": 0.5716920495033264, + "learning_rate": 9.676050211276062e-05, + "loss": 0.0287, + "step": 3190 + }, + { + "grad_norm": 0.5825405120849609, + "learning_rate": 9.673116406312962e-05, + "loss": 0.0314, + "step": 3200 + }, + { + "grad_norm": 0.6698433756828308, + "learning_rate": 9.67016982522805e-05, + "loss": 0.0265, + "step": 3210 + }, + { + "grad_norm": 0.7237856984138489, + "learning_rate": 9.667210476077164e-05, + "loss": 0.0284, + "step": 3220 + }, + { + "grad_norm": 0.40168964862823486, + "learning_rate": 9.664238366951055e-05, + "loss": 0.0333, + "step": 3230 + }, + { + "grad_norm": 0.5081711411476135, + "learning_rate": 9.661253505975355e-05, + "loss": 0.0189, + "step": 3240 + }, + { + "grad_norm": 0.4516471028327942, + "learning_rate": 9.658255901310557e-05, + "loss": 0.0178, + "step": 3250 + }, + { + "grad_norm": 0.6510002613067627, + "learning_rate": 9.655245561152e-05, + "loss": 0.0208, + "step": 3260 + }, + { + "grad_norm": 0.584953248500824, + "learning_rate": 9.65222249372984e-05, + "loss": 0.0225, + "step": 3270 + }, + { + "grad_norm": 0.850547730922699, + "learning_rate": 9.649186707309026e-05, + "loss": 0.024, + "step": 3280 + }, + { + "grad_norm": 0.7082916498184204, + "learning_rate": 9.646138210189283e-05, + "loss": 0.0307, + "step": 3290 + }, + { + "grad_norm": 0.5292045474052429, + "learning_rate": 9.643077010705087e-05, + "loss": 0.0279, + "step": 3300 + }, + { + "grad_norm": 0.7349297404289246, + "learning_rate": 9.640003117225637e-05, + "loss": 0.0297, + "step": 3310 + }, + { + "grad_norm": 0.6695575714111328, + "learning_rate": 9.636916538154846e-05, + "loss": 0.0323, + "step": 3320 + }, + { + "grad_norm": 0.5382424592971802, + "learning_rate": 9.633817281931296e-05, + "loss": 0.0279, + "step": 3330 + }, + { + "grad_norm": 0.6347593069076538, + "learning_rate": 9.630705357028242e-05, + "loss": 0.0297, + "step": 3340 + }, + { + "grad_norm": 0.47336849570274353, + "learning_rate": 9.627580771953563e-05, + "loss": 0.0288, + "step": 3350 + }, + { + "grad_norm": 0.5129256248474121, + "learning_rate": 9.624443535249759e-05, + "loss": 0.0249, + "step": 3360 + }, + { + "grad_norm": 0.5860269069671631, + "learning_rate": 9.621293655493913e-05, + "loss": 0.0227, + "step": 3370 + }, + { + "grad_norm": 0.6489112377166748, + "learning_rate": 9.618131141297675e-05, + "loss": 0.0267, + "step": 3380 + }, + { + "grad_norm": 0.7155392169952393, + "learning_rate": 9.614956001307242e-05, + "loss": 0.027, + "step": 3390 + }, + { + "grad_norm": 0.6719536185264587, + "learning_rate": 9.611768244203321e-05, + "loss": 0.0284, + "step": 3400 + }, + { + "grad_norm": 0.5794150233268738, + "learning_rate": 9.60856787870112e-05, + "loss": 0.0305, + "step": 3410 + }, + { + "grad_norm": 0.5461472272872925, + "learning_rate": 9.605354913550318e-05, + "loss": 0.0208, + "step": 3420 + }, + { + "grad_norm": 0.51807701587677, + "learning_rate": 9.602129357535037e-05, + "loss": 0.0206, + "step": 3430 + }, + { + "grad_norm": 0.5499976277351379, + "learning_rate": 9.598891219473825e-05, + "loss": 0.0226, + "step": 3440 + }, + { + "grad_norm": 0.4805843234062195, + "learning_rate": 9.595640508219625e-05, + "loss": 0.0213, + "step": 3450 + }, + { + "grad_norm": 0.41881683468818665, + "learning_rate": 9.592377232659761e-05, + "loss": 0.0289, + "step": 3460 + }, + { + "grad_norm": 0.5759878158569336, + "learning_rate": 9.589101401715904e-05, + "loss": 0.0245, + "step": 3470 + }, + { + "grad_norm": 0.62054044008255, + "learning_rate": 9.585813024344045e-05, + "loss": 0.0207, + "step": 3480 + }, + { + "grad_norm": 0.5209581255912781, + "learning_rate": 9.58251210953449e-05, + "loss": 0.025, + "step": 3490 + }, + { + "grad_norm": 0.4041801393032074, + "learning_rate": 9.579198666311809e-05, + "loss": 0.0233, + "step": 3500 + }, + { + "grad_norm": 0.6874129176139832, + "learning_rate": 9.575872703734832e-05, + "loss": 0.0334, + "step": 3510 + }, + { + "grad_norm": 0.564410388469696, + "learning_rate": 9.572534230896611e-05, + "loss": 0.0306, + "step": 3520 + }, + { + "grad_norm": 0.4553097188472748, + "learning_rate": 9.569183256924403e-05, + "loss": 0.0307, + "step": 3530 + }, + { + "grad_norm": 0.5991028547286987, + "learning_rate": 9.565819790979646e-05, + "loss": 0.031, + "step": 3540 + }, + { + "grad_norm": 0.5057368874549866, + "learning_rate": 9.562443842257925e-05, + "loss": 0.0263, + "step": 3550 + }, + { + "grad_norm": 0.6050683856010437, + "learning_rate": 9.559055419988956e-05, + "loss": 0.0262, + "step": 3560 + }, + { + "grad_norm": 0.559066116809845, + "learning_rate": 9.555654533436557e-05, + "loss": 0.0213, + "step": 3570 + }, + { + "grad_norm": 0.583686113357544, + "learning_rate": 9.552241191898621e-05, + "loss": 0.0172, + "step": 3580 + }, + { + "grad_norm": 0.3482583165168762, + "learning_rate": 9.548815404707092e-05, + "loss": 0.0232, + "step": 3590 + }, + { + "grad_norm": 0.472091406583786, + "learning_rate": 9.545377181227942e-05, + "loss": 0.0237, + "step": 3600 + }, + { + "grad_norm": 0.5067651271820068, + "learning_rate": 9.541926530861145e-05, + "loss": 0.028, + "step": 3610 + }, + { + "grad_norm": 0.6541886925697327, + "learning_rate": 9.538463463040645e-05, + "loss": 0.0269, + "step": 3620 + }, + { + "grad_norm": 0.392147958278656, + "learning_rate": 9.534987987234337e-05, + "loss": 0.0308, + "step": 3630 + }, + { + "grad_norm": 0.650993287563324, + "learning_rate": 9.53150011294404e-05, + "loss": 0.0291, + "step": 3640 + }, + { + "grad_norm": 0.6371326446533203, + "learning_rate": 9.527999849705471e-05, + "loss": 0.0325, + "step": 3650 + }, + { + "grad_norm": 0.5590103268623352, + "learning_rate": 9.524487207088213e-05, + "loss": 0.0321, + "step": 3660 + }, + { + "grad_norm": 0.442900687456131, + "learning_rate": 9.520962194695698e-05, + "loss": 0.0246, + "step": 3670 + }, + { + "grad_norm": 0.3726986348628998, + "learning_rate": 9.517424822165175e-05, + "loss": 0.022, + "step": 3680 + }, + { + "grad_norm": 0.5511045455932617, + "learning_rate": 9.513875099167685e-05, + "loss": 0.0228, + "step": 3690 + }, + { + "grad_norm": 1.1515822410583496, + "learning_rate": 9.510313035408035e-05, + "loss": 0.0217, + "step": 3700 + }, + { + "grad_norm": 0.614046037197113, + "learning_rate": 9.506738640624775e-05, + "loss": 0.03, + "step": 3710 + }, + { + "grad_norm": 0.46794137358665466, + "learning_rate": 9.50315192459016e-05, + "loss": 0.0282, + "step": 3720 + }, + { + "grad_norm": 0.4732813537120819, + "learning_rate": 9.499552897110136e-05, + "loss": 0.028, + "step": 3730 + }, + { + "grad_norm": 0.40765368938446045, + "learning_rate": 9.495941568024304e-05, + "loss": 0.0251, + "step": 3740 + }, + { + "grad_norm": 0.4326068162918091, + "learning_rate": 9.492317947205904e-05, + "loss": 0.0214, + "step": 3750 + }, + { + "grad_norm": 0.7303733229637146, + "learning_rate": 9.488682044561775e-05, + "loss": 0.0228, + "step": 3760 + }, + { + "grad_norm": 0.5912054777145386, + "learning_rate": 9.485033870032335e-05, + "loss": 0.0183, + "step": 3770 + }, + { + "grad_norm": 0.40306001901626587, + "learning_rate": 9.481373433591556e-05, + "loss": 0.0222, + "step": 3780 + }, + { + "grad_norm": 0.48030319809913635, + "learning_rate": 9.47770074524693e-05, + "loss": 0.0227, + "step": 3790 + }, + { + "grad_norm": 0.5131433010101318, + "learning_rate": 9.474015815039446e-05, + "loss": 0.0225, + "step": 3800 + }, + { + "grad_norm": 0.5961817502975464, + "learning_rate": 9.470318653043565e-05, + "loss": 0.0262, + "step": 3810 + }, + { + "grad_norm": 0.7718464136123657, + "learning_rate": 9.466609269367185e-05, + "loss": 0.0263, + "step": 3820 + }, + { + "grad_norm": 0.43835383653640747, + "learning_rate": 9.46288767415162e-05, + "loss": 0.0254, + "step": 3830 + }, + { + "grad_norm": 0.6109564304351807, + "learning_rate": 9.459153877571567e-05, + "loss": 0.0276, + "step": 3840 + }, + { + "grad_norm": 0.5323156118392944, + "learning_rate": 9.455407889835087e-05, + "loss": 0.0294, + "step": 3850 + }, + { + "grad_norm": 0.4276140332221985, + "learning_rate": 9.451649721183564e-05, + "loss": 0.0298, + "step": 3860 + }, + { + "grad_norm": 0.5336406826972961, + "learning_rate": 9.447879381891692e-05, + "loss": 0.0247, + "step": 3870 + }, + { + "grad_norm": 0.4112587869167328, + "learning_rate": 9.444096882267428e-05, + "loss": 0.0228, + "step": 3880 + }, + { + "grad_norm": 0.7397602200508118, + "learning_rate": 9.440302232651988e-05, + "loss": 0.0272, + "step": 3890 + }, + { + "grad_norm": 0.4803454577922821, + "learning_rate": 9.436495443419795e-05, + "loss": 0.0325, + "step": 3900 + }, + { + "grad_norm": 0.5396085381507874, + "learning_rate": 9.432676524978466e-05, + "loss": 0.0287, + "step": 3910 + }, + { + "grad_norm": 0.4485945701599121, + "learning_rate": 9.42884548776878e-05, + "loss": 0.0271, + "step": 3920 + }, + { + "grad_norm": 0.44857850670814514, + "learning_rate": 9.425002342264646e-05, + "loss": 0.02, + "step": 3930 + }, + { + "grad_norm": 0.4943828582763672, + "learning_rate": 9.421147098973077e-05, + "loss": 0.0204, + "step": 3940 + }, + { + "grad_norm": 0.528380274772644, + "learning_rate": 9.41727976843416e-05, + "loss": 0.0168, + "step": 3950 + }, + { + "grad_norm": 0.5004056692123413, + "learning_rate": 9.413400361221029e-05, + "loss": 0.0161, + "step": 3960 + }, + { + "grad_norm": 0.7048073410987854, + "learning_rate": 9.409508887939835e-05, + "loss": 0.019, + "step": 3970 + }, + { + "grad_norm": 0.5115667581558228, + "learning_rate": 9.40560535922972e-05, + "loss": 0.022, + "step": 3980 + }, + { + "grad_norm": 0.5944473743438721, + "learning_rate": 9.40168978576278e-05, + "loss": 0.018, + "step": 3990 + }, + { + "grad_norm": 0.5083578824996948, + "learning_rate": 9.397762178244043e-05, + "loss": 0.0186, + "step": 4000 + }, + { + "grad_norm": 0.5265823006629944, + "learning_rate": 9.393822547411439e-05, + "loss": 0.0163, + "step": 4010 + }, + { + "grad_norm": 0.5557625889778137, + "learning_rate": 9.389870904035769e-05, + "loss": 0.0183, + "step": 4020 + }, + { + "grad_norm": 0.5121268033981323, + "learning_rate": 9.385907258920672e-05, + "loss": 0.0279, + "step": 4030 + }, + { + "grad_norm": 0.5747828483581543, + "learning_rate": 9.381931622902607e-05, + "loss": 0.0288, + "step": 4040 + }, + { + "grad_norm": 0.522057056427002, + "learning_rate": 9.377944006850807e-05, + "loss": 0.0312, + "step": 4050 + }, + { + "grad_norm": 0.5573875308036804, + "learning_rate": 9.373944421667265e-05, + "loss": 0.0291, + "step": 4060 + }, + { + "grad_norm": 0.706202507019043, + "learning_rate": 9.369932878286691e-05, + "loss": 0.0215, + "step": 4070 + }, + { + "grad_norm": 0.6053712368011475, + "learning_rate": 9.365909387676494e-05, + "loss": 0.0233, + "step": 4080 + }, + { + "grad_norm": 0.563764750957489, + "learning_rate": 9.361873960836744e-05, + "loss": 0.0217, + "step": 4090 + }, + { + "grad_norm": 0.37907326221466064, + "learning_rate": 9.357826608800142e-05, + "loss": 0.019, + "step": 4100 + }, + { + "grad_norm": 0.38764074444770813, + "learning_rate": 9.353767342631994e-05, + "loss": 0.0215, + "step": 4110 + }, + { + "grad_norm": 0.4152272641658783, + "learning_rate": 9.34969617343018e-05, + "loss": 0.0215, + "step": 4120 + }, + { + "grad_norm": 0.5082410573959351, + "learning_rate": 9.345613112325122e-05, + "loss": 0.0214, + "step": 4130 + }, + { + "grad_norm": 0.5109147429466248, + "learning_rate": 9.34151817047975e-05, + "loss": 0.0154, + "step": 4140 + }, + { + "grad_norm": 0.5129996538162231, + "learning_rate": 9.33741135908948e-05, + "loss": 0.0175, + "step": 4150 + }, + { + "grad_norm": 0.44588354229927063, + "learning_rate": 9.33329268938218e-05, + "loss": 0.0148, + "step": 4160 + }, + { + "grad_norm": 0.6070412993431091, + "learning_rate": 9.329162172618132e-05, + "loss": 0.0243, + "step": 4170 + }, + { + "grad_norm": 0.5743244290351868, + "learning_rate": 9.325019820090013e-05, + "loss": 0.0254, + "step": 4180 + }, + { + "grad_norm": 0.497286319732666, + "learning_rate": 9.320865643122855e-05, + "loss": 0.0267, + "step": 4190 + }, + { + "grad_norm": 0.49449023604393005, + "learning_rate": 9.316699653074023e-05, + "loss": 0.0207, + "step": 4200 + }, + { + "grad_norm": 0.5246172547340393, + "learning_rate": 9.312521861333172e-05, + "loss": 0.0198, + "step": 4210 + }, + { + "grad_norm": 0.490579754114151, + "learning_rate": 9.308332279322224e-05, + "loss": 0.0208, + "step": 4220 + }, + { + "grad_norm": 0.4154781401157379, + "learning_rate": 9.304130918495338e-05, + "loss": 0.0238, + "step": 4230 + }, + { + "grad_norm": 0.5204804539680481, + "learning_rate": 9.299917790338874e-05, + "loss": 0.0225, + "step": 4240 + }, + { + "grad_norm": 0.5310570001602173, + "learning_rate": 9.295692906371363e-05, + "loss": 0.0214, + "step": 4250 + }, + { + "grad_norm": 0.5856108665466309, + "learning_rate": 9.291456278143476e-05, + "loss": 0.0219, + "step": 4260 + }, + { + "grad_norm": 0.5235369801521301, + "learning_rate": 9.287207917237994e-05, + "loss": 0.0171, + "step": 4270 + }, + { + "grad_norm": 0.33213943243026733, + "learning_rate": 9.282947835269773e-05, + "loss": 0.0182, + "step": 4280 + }, + { + "grad_norm": 0.4885040819644928, + "learning_rate": 9.278676043885715e-05, + "loss": 0.0154, + "step": 4290 + }, + { + "grad_norm": 0.36702555418014526, + "learning_rate": 9.274392554764733e-05, + "loss": 0.0133, + "step": 4300 + }, + { + "grad_norm": 0.3722923994064331, + "learning_rate": 9.270097379617723e-05, + "loss": 0.0137, + "step": 4310 + }, + { + "grad_norm": 0.5792672634124756, + "learning_rate": 9.26579053018753e-05, + "loss": 0.0155, + "step": 4320 + }, + { + "grad_norm": 0.36514580249786377, + "learning_rate": 9.261472018248918e-05, + "loss": 0.0173, + "step": 4330 + }, + { + "grad_norm": 0.46200570464134216, + "learning_rate": 9.25714185560853e-05, + "loss": 0.0201, + "step": 4340 + }, + { + "grad_norm": 0.5175392031669617, + "learning_rate": 9.252800054104868e-05, + "loss": 0.023, + "step": 4350 + }, + { + "grad_norm": 0.41809120774269104, + "learning_rate": 9.248446625608252e-05, + "loss": 0.0207, + "step": 4360 + }, + { + "grad_norm": 0.5627315640449524, + "learning_rate": 9.244081582020789e-05, + "loss": 0.0208, + "step": 4370 + }, + { + "grad_norm": 0.42645886540412903, + "learning_rate": 9.239704935276339e-05, + "loss": 0.0258, + "step": 4380 + }, + { + "grad_norm": 0.48128244280815125, + "learning_rate": 9.235316697340489e-05, + "loss": 0.0322, + "step": 4390 + }, + { + "grad_norm": 0.4808783233165741, + "learning_rate": 9.230916880210512e-05, + "loss": 0.0285, + "step": 4400 + }, + { + "grad_norm": 0.6028614640235901, + "learning_rate": 9.226505495915342e-05, + "loss": 0.0259, + "step": 4410 + }, + { + "grad_norm": 0.5204566717147827, + "learning_rate": 9.222082556515536e-05, + "loss": 0.0259, + "step": 4420 + }, + { + "grad_norm": 0.4750819504261017, + "learning_rate": 9.217648074103242e-05, + "loss": 0.0232, + "step": 4430 + }, + { + "grad_norm": 0.4595358669757843, + "learning_rate": 9.213202060802161e-05, + "loss": 0.0196, + "step": 4440 + }, + { + "grad_norm": 0.5138223171234131, + "learning_rate": 9.208744528767528e-05, + "loss": 0.0187, + "step": 4450 + }, + { + "grad_norm": 0.4462665021419525, + "learning_rate": 9.204275490186064e-05, + "loss": 0.0143, + "step": 4460 + }, + { + "grad_norm": 0.4108940660953522, + "learning_rate": 9.199794957275949e-05, + "loss": 0.0098, + "step": 4470 + }, + { + "grad_norm": 0.3908710777759552, + "learning_rate": 9.19530294228679e-05, + "loss": 0.015, + "step": 4480 + }, + { + "grad_norm": 0.639202892780304, + "learning_rate": 9.190799457499583e-05, + "loss": 0.0189, + "step": 4490 + }, + { + "grad_norm": 0.5188295841217041, + "learning_rate": 9.186284515226686e-05, + "loss": 0.0205, + "step": 4500 + }, + { + "grad_norm": 0.6553609371185303, + "learning_rate": 9.181758127811777e-05, + "loss": 0.0194, + "step": 4510 + }, + { + "grad_norm": 0.3807510733604431, + "learning_rate": 9.177220307629825e-05, + "loss": 0.0147, + "step": 4520 + }, + { + "grad_norm": 0.4898533225059509, + "learning_rate": 9.172671067087059e-05, + "loss": 0.016, + "step": 4530 + }, + { + "grad_norm": 0.36453813314437866, + "learning_rate": 9.16811041862093e-05, + "loss": 0.0137, + "step": 4540 + }, + { + "grad_norm": 0.392630934715271, + "learning_rate": 9.163538374700076e-05, + "loss": 0.0162, + "step": 4550 + }, + { + "grad_norm": 0.5790966153144836, + "learning_rate": 9.158954947824287e-05, + "loss": 0.0233, + "step": 4560 + }, + { + "grad_norm": 0.44102975726127625, + "learning_rate": 9.154360150524482e-05, + "loss": 0.0261, + "step": 4570 + }, + { + "grad_norm": 0.38393422961235046, + "learning_rate": 9.14975399536266e-05, + "loss": 0.0286, + "step": 4580 + }, + { + "grad_norm": 0.536277711391449, + "learning_rate": 9.14513649493187e-05, + "loss": 0.026, + "step": 4590 + }, + { + "grad_norm": 0.5126002430915833, + "learning_rate": 9.140507661856187e-05, + "loss": 0.0232, + "step": 4600 + }, + { + "grad_norm": 0.4797167181968689, + "learning_rate": 9.135867508790661e-05, + "loss": 0.0266, + "step": 4610 + }, + { + "grad_norm": 0.5301699638366699, + "learning_rate": 9.131216048421291e-05, + "loss": 0.0258, + "step": 4620 + }, + { + "grad_norm": 0.6703513860702515, + "learning_rate": 9.126553293464998e-05, + "loss": 0.0272, + "step": 4630 + }, + { + "grad_norm": 0.6077861189842224, + "learning_rate": 9.121879256669572e-05, + "loss": 0.0303, + "step": 4640 + }, + { + "grad_norm": 0.49013379216194153, + "learning_rate": 9.117193950813652e-05, + "loss": 0.0249, + "step": 4650 + }, + { + "grad_norm": 0.4461021423339844, + "learning_rate": 9.112497388706685e-05, + "loss": 0.0255, + "step": 4660 + }, + { + "grad_norm": 0.5884753465652466, + "learning_rate": 9.10778958318889e-05, + "loss": 0.023, + "step": 4670 + }, + { + "grad_norm": 0.5286600589752197, + "learning_rate": 9.103070547131232e-05, + "loss": 0.0225, + "step": 4680 + }, + { + "grad_norm": 0.41834360361099243, + "learning_rate": 9.098340293435375e-05, + "loss": 0.023, + "step": 4690 + }, + { + "grad_norm": 0.41160669922828674, + "learning_rate": 9.093598835033649e-05, + "loss": 0.0269, + "step": 4700 + }, + { + "grad_norm": 0.3593420386314392, + "learning_rate": 9.088846184889021e-05, + "loss": 0.0209, + "step": 4710 + }, + { + "grad_norm": 0.34463873505592346, + "learning_rate": 9.084082355995057e-05, + "loss": 0.0176, + "step": 4720 + }, + { + "grad_norm": 0.5115851759910583, + "learning_rate": 9.079307361375882e-05, + "loss": 0.0205, + "step": 4730 + }, + { + "grad_norm": 0.5639398694038391, + "learning_rate": 9.074521214086149e-05, + "loss": 0.0269, + "step": 4740 + }, + { + "grad_norm": 0.4444732367992401, + "learning_rate": 9.069723927211001e-05, + "loss": 0.0286, + "step": 4750 + }, + { + "grad_norm": 0.44310614466667175, + "learning_rate": 9.064915513866037e-05, + "loss": 0.0263, + "step": 4760 + }, + { + "grad_norm": 0.40478503704071045, + "learning_rate": 9.060095987197279e-05, + "loss": 0.0216, + "step": 4770 + }, + { + "grad_norm": 0.33776071667671204, + "learning_rate": 9.055265360381126e-05, + "loss": 0.0096, + "step": 4780 + }, + { + "grad_norm": 0.283161997795105, + "learning_rate": 9.050423646624326e-05, + "loss": 0.012, + "step": 4790 + }, + { + "grad_norm": 0.4983021020889282, + "learning_rate": 9.045570859163943e-05, + "loss": 0.0156, + "step": 4800 + }, + { + "grad_norm": 0.5157310366630554, + "learning_rate": 9.04070701126731e-05, + "loss": 0.014, + "step": 4810 + }, + { + "grad_norm": 0.38858625292778015, + "learning_rate": 9.035832116232001e-05, + "loss": 0.0148, + "step": 4820 + }, + { + "grad_norm": 0.45956185460090637, + "learning_rate": 9.030946187385796e-05, + "loss": 0.0145, + "step": 4830 + }, + { + "grad_norm": 0.4608694016933441, + "learning_rate": 9.026049238086635e-05, + "loss": 0.016, + "step": 4840 + }, + { + "grad_norm": 0.4715012311935425, + "learning_rate": 9.021141281722591e-05, + "loss": 0.0234, + "step": 4850 + }, + { + "grad_norm": 0.5070418119430542, + "learning_rate": 9.01622233171183e-05, + "loss": 0.0237, + "step": 4860 + }, + { + "grad_norm": 0.5231166481971741, + "learning_rate": 9.011292401502574e-05, + "loss": 0.0275, + "step": 4870 + }, + { + "grad_norm": 0.4771530330181122, + "learning_rate": 9.006351504573063e-05, + "loss": 0.025, + "step": 4880 + }, + { + "grad_norm": 0.4553404152393341, + "learning_rate": 9.001399654431519e-05, + "loss": 0.0208, + "step": 4890 + }, + { + "grad_norm": 0.4051187038421631, + "learning_rate": 8.996436864616116e-05, + "loss": 0.0187, + "step": 4900 + }, + { + "grad_norm": 0.5219714045524597, + "learning_rate": 8.991463148694925e-05, + "loss": 0.017, + "step": 4910 + }, + { + "grad_norm": 0.5234586000442505, + "learning_rate": 8.986478520265902e-05, + "loss": 0.021, + "step": 4920 + }, + { + "grad_norm": 0.349272757768631, + "learning_rate": 8.981482992956827e-05, + "loss": 0.0243, + "step": 4930 + }, + { + "grad_norm": 0.3842460811138153, + "learning_rate": 8.976476580425282e-05, + "loss": 0.0196, + "step": 4940 + }, + { + "grad_norm": 0.42498040199279785, + "learning_rate": 8.971459296358606e-05, + "loss": 0.0227, + "step": 4950 + }, + { + "grad_norm": 0.5083599090576172, + "learning_rate": 8.966431154473864e-05, + "loss": 0.0234, + "step": 4960 + }, + { + "grad_norm": 0.39733007550239563, + "learning_rate": 8.961392168517803e-05, + "loss": 0.0151, + "step": 4970 + }, + { + "grad_norm": 0.47845280170440674, + "learning_rate": 8.956342352266821e-05, + "loss": 0.0214, + "step": 4980 + }, + { + "grad_norm": 0.37154051661491394, + "learning_rate": 8.95128171952692e-05, + "loss": 0.0243, + "step": 4990 + }, + { + "grad_norm": 0.4907708466053009, + "learning_rate": 8.946210284133676e-05, + "loss": 0.0231, + "step": 5000 + }, + { + "grad_norm": 0.4096941649913788, + "learning_rate": 8.941128059952201e-05, + "loss": 0.0267, + "step": 5010 + }, + { + "grad_norm": 0.47642868757247925, + "learning_rate": 8.936035060877102e-05, + "loss": 0.0277, + "step": 5020 + }, + { + "grad_norm": 0.5513421893119812, + "learning_rate": 8.930931300832443e-05, + "loss": 0.0207, + "step": 5030 + }, + { + "grad_norm": 0.40883830189704895, + "learning_rate": 8.925816793771711e-05, + "loss": 0.0204, + "step": 5040 + }, + { + "grad_norm": 0.4464835822582245, + "learning_rate": 8.92069155367777e-05, + "loss": 0.0219, + "step": 5050 + }, + { + "grad_norm": 0.39857494831085205, + "learning_rate": 8.915555594562834e-05, + "loss": 0.0233, + "step": 5060 + }, + { + "grad_norm": 0.4259535074234009, + "learning_rate": 8.910408930468416e-05, + "loss": 0.0208, + "step": 5070 + }, + { + "grad_norm": 0.403730571269989, + "learning_rate": 8.905251575465303e-05, + "loss": 0.0186, + "step": 5080 + }, + { + "grad_norm": 0.5403347015380859, + "learning_rate": 8.900083543653502e-05, + "loss": 0.0194, + "step": 5090 + }, + { + "grad_norm": 0.4551507830619812, + "learning_rate": 8.894904849162218e-05, + "loss": 0.0175, + "step": 5100 + }, + { + "grad_norm": 0.4399072825908661, + "learning_rate": 8.889715506149802e-05, + "loss": 0.0183, + "step": 5110 + }, + { + "grad_norm": 0.4139096736907959, + "learning_rate": 8.884515528803722e-05, + "loss": 0.0147, + "step": 5120 + }, + { + "grad_norm": 0.3700367510318756, + "learning_rate": 8.879304931340517e-05, + "loss": 0.013, + "step": 5130 + }, + { + "grad_norm": 0.4368840157985687, + "learning_rate": 8.874083728005759e-05, + "loss": 0.0124, + "step": 5140 + }, + { + "grad_norm": 0.42493119835853577, + "learning_rate": 8.868851933074021e-05, + "loss": 0.0131, + "step": 5150 + }, + { + "grad_norm": 0.37674298882484436, + "learning_rate": 8.863609560848829e-05, + "loss": 0.0136, + "step": 5160 + }, + { + "grad_norm": 0.5186234712600708, + "learning_rate": 8.85835662566263e-05, + "loss": 0.0132, + "step": 5170 + }, + { + "grad_norm": 0.4621662199497223, + "learning_rate": 8.853093141876747e-05, + "loss": 0.016, + "step": 5180 + }, + { + "grad_norm": 0.35149648785591125, + "learning_rate": 8.847819123881343e-05, + "loss": 0.0153, + "step": 5190 + }, + { + "grad_norm": 0.5584362149238586, + "learning_rate": 8.842534586095383e-05, + "loss": 0.0231, + "step": 5200 + }, + { + "grad_norm": 0.43347012996673584, + "learning_rate": 8.837239542966593e-05, + "loss": 0.0283, + "step": 5210 + }, + { + "grad_norm": 0.4329712986946106, + "learning_rate": 8.831934008971417e-05, + "loss": 0.0278, + "step": 5220 + }, + { + "grad_norm": 0.49545276165008545, + "learning_rate": 8.826617998614982e-05, + "loss": 0.0237, + "step": 5230 + }, + { + "grad_norm": 0.47251617908477783, + "learning_rate": 8.821291526431056e-05, + "loss": 0.0216, + "step": 5240 + }, + { + "grad_norm": 0.36570796370506287, + "learning_rate": 8.815954606982015e-05, + "loss": 0.0221, + "step": 5250 + }, + { + "grad_norm": 0.5365438461303711, + "learning_rate": 8.810607254858789e-05, + "loss": 0.0214, + "step": 5260 + }, + { + "grad_norm": 0.43386510014533997, + "learning_rate": 8.805249484680838e-05, + "loss": 0.0225, + "step": 5270 + }, + { + "grad_norm": 0.43828654289245605, + "learning_rate": 8.799881311096096e-05, + "loss": 0.0225, + "step": 5280 + }, + { + "grad_norm": 0.5182799100875854, + "learning_rate": 8.794502748780949e-05, + "loss": 0.0192, + "step": 5290 + }, + { + "grad_norm": 0.5659856200218201, + "learning_rate": 8.78911381244018e-05, + "loss": 0.0208, + "step": 5300 + }, + { + "grad_norm": 0.48078930377960205, + "learning_rate": 8.783714516806933e-05, + "loss": 0.0241, + "step": 5310 + }, + { + "grad_norm": 0.41057664155960083, + "learning_rate": 8.77830487664268e-05, + "loss": 0.0319, + "step": 5320 + }, + { + "grad_norm": 0.4401390850543976, + "learning_rate": 8.772884906737167e-05, + "loss": 0.0241, + "step": 5330 + }, + { + "grad_norm": 0.3672844469547272, + "learning_rate": 8.767454621908387e-05, + "loss": 0.024, + "step": 5340 + }, + { + "grad_norm": 0.3945058286190033, + "learning_rate": 8.76201403700253e-05, + "loss": 0.0225, + "step": 5350 + }, + { + "grad_norm": 0.33607563376426697, + "learning_rate": 8.756563166893949e-05, + "loss": 0.017, + "step": 5360 + }, + { + "grad_norm": 0.3968481123447418, + "learning_rate": 8.751102026485113e-05, + "loss": 0.018, + "step": 5370 + }, + { + "grad_norm": 0.44317641854286194, + "learning_rate": 8.745630630706571e-05, + "loss": 0.019, + "step": 5380 + }, + { + "grad_norm": 0.48730534315109253, + "learning_rate": 8.740148994516912e-05, + "loss": 0.0193, + "step": 5390 + }, + { + "grad_norm": 0.50745689868927, + "learning_rate": 8.73465713290272e-05, + "loss": 0.0218, + "step": 5400 + }, + { + "grad_norm": 0.45813968777656555, + "learning_rate": 8.729155060878533e-05, + "loss": 0.0259, + "step": 5410 + }, + { + "grad_norm": 0.36077168583869934, + "learning_rate": 8.723642793486809e-05, + "loss": 0.0241, + "step": 5420 + }, + { + "grad_norm": 0.42961910367012024, + "learning_rate": 8.718120345797873e-05, + "loss": 0.0271, + "step": 5430 + }, + { + "grad_norm": 0.4253261387348175, + "learning_rate": 8.712587732909889e-05, + "loss": 0.0188, + "step": 5440 + }, + { + "grad_norm": 0.4135109484195709, + "learning_rate": 8.707044969948806e-05, + "loss": 0.0147, + "step": 5450 + }, + { + "grad_norm": 0.4966714382171631, + "learning_rate": 8.701492072068329e-05, + "loss": 0.0138, + "step": 5460 + }, + { + "grad_norm": 0.4690430164337158, + "learning_rate": 8.695929054449869e-05, + "loss": 0.0117, + "step": 5470 + }, + { + "grad_norm": 0.33759796619415283, + "learning_rate": 8.690355932302501e-05, + "loss": 0.0148, + "step": 5480 + }, + { + "grad_norm": 0.3020997941493988, + "learning_rate": 8.684772720862931e-05, + "loss": 0.019, + "step": 5490 + }, + { + "grad_norm": 0.42248037457466125, + "learning_rate": 8.679179435395446e-05, + "loss": 0.0179, + "step": 5500 + }, + { + "grad_norm": 0.5242781043052673, + "learning_rate": 8.673576091191874e-05, + "loss": 0.0201, + "step": 5510 + }, + { + "grad_norm": 0.4367959797382355, + "learning_rate": 8.667962703571541e-05, + "loss": 0.0203, + "step": 5520 + }, + { + "grad_norm": 0.39691901206970215, + "learning_rate": 8.662339287881238e-05, + "loss": 0.011, + "step": 5530 + }, + { + "grad_norm": 0.4257650077342987, + "learning_rate": 8.656705859495169e-05, + "loss": 0.0124, + "step": 5540 + }, + { + "grad_norm": 0.3149842917919159, + "learning_rate": 8.651062433814912e-05, + "loss": 0.0133, + "step": 5550 + }, + { + "grad_norm": 0.43685126304626465, + "learning_rate": 8.645409026269375e-05, + "loss": 0.0194, + "step": 5560 + }, + { + "grad_norm": 0.47423988580703735, + "learning_rate": 8.639745652314759e-05, + "loss": 0.0226, + "step": 5570 + }, + { + "grad_norm": 0.4042307138442993, + "learning_rate": 8.634072327434515e-05, + "loss": 0.0186, + "step": 5580 + }, + { + "grad_norm": 0.38977760076522827, + "learning_rate": 8.628389067139294e-05, + "loss": 0.0171, + "step": 5590 + }, + { + "grad_norm": 0.4748629033565521, + "learning_rate": 8.622695886966911e-05, + "loss": 0.017, + "step": 5600 + }, + { + "grad_norm": 0.493661105632782, + "learning_rate": 8.616992802482308e-05, + "loss": 0.0196, + "step": 5610 + }, + { + "grad_norm": 0.5512261986732483, + "learning_rate": 8.611279829277496e-05, + "loss": 0.0215, + "step": 5620 + }, + { + "grad_norm": 0.47207796573638916, + "learning_rate": 8.605556982971528e-05, + "loss": 0.0188, + "step": 5630 + }, + { + "grad_norm": 0.46937695145606995, + "learning_rate": 8.599824279210447e-05, + "loss": 0.0167, + "step": 5640 + }, + { + "grad_norm": 0.4144851267337799, + "learning_rate": 8.594081733667243e-05, + "loss": 0.0145, + "step": 5650 + }, + { + "grad_norm": 0.30675607919692993, + "learning_rate": 8.58832936204182e-05, + "loss": 0.0212, + "step": 5660 + }, + { + "grad_norm": 0.3751591145992279, + "learning_rate": 8.582567180060942e-05, + "loss": 0.015, + "step": 5670 + }, + { + "grad_norm": 0.5419527888298035, + "learning_rate": 8.576795203478194e-05, + "loss": 0.021, + "step": 5680 + }, + { + "grad_norm": 0.49854937195777893, + "learning_rate": 8.571013448073939e-05, + "loss": 0.0207, + "step": 5690 + }, + { + "grad_norm": 0.5256165862083435, + "learning_rate": 8.565221929655275e-05, + "loss": 0.0243, + "step": 5700 + }, + { + "grad_norm": 0.5212571024894714, + "learning_rate": 8.559420664055992e-05, + "loss": 0.0234, + "step": 5710 + }, + { + "grad_norm": 0.45727217197418213, + "learning_rate": 8.553609667136532e-05, + "loss": 0.0207, + "step": 5720 + }, + { + "grad_norm": 0.41014188528060913, + "learning_rate": 8.547788954783936e-05, + "loss": 0.015, + "step": 5730 + }, + { + "grad_norm": 0.3546692132949829, + "learning_rate": 8.541958542911808e-05, + "loss": 0.0134, + "step": 5740 + }, + { + "grad_norm": 0.40963688492774963, + "learning_rate": 8.536118447460275e-05, + "loss": 0.0142, + "step": 5750 + }, + { + "grad_norm": 0.3467051386833191, + "learning_rate": 8.530268684395932e-05, + "loss": 0.0153, + "step": 5760 + }, + { + "grad_norm": 0.37610283493995667, + "learning_rate": 8.524409269711807e-05, + "loss": 0.0206, + "step": 5770 + }, + { + "grad_norm": 0.5440983176231384, + "learning_rate": 8.51854021942732e-05, + "loss": 0.0197, + "step": 5780 + }, + { + "grad_norm": 0.49030956625938416, + "learning_rate": 8.512661549588227e-05, + "loss": 0.0215, + "step": 5790 + }, + { + "grad_norm": 0.4372100234031677, + "learning_rate": 8.506773276266588e-05, + "loss": 0.0198, + "step": 5800 + }, + { + "grad_norm": 0.47856998443603516, + "learning_rate": 8.500875415560721e-05, + "loss": 0.0112, + "step": 5810 + }, + { + "grad_norm": 0.4301353693008423, + "learning_rate": 8.494967983595144e-05, + "loss": 0.0122, + "step": 5820 + }, + { + "grad_norm": 0.37016206979751587, + "learning_rate": 8.489050996520558e-05, + "loss": 0.0121, + "step": 5830 + }, + { + "grad_norm": 0.315185546875, + "learning_rate": 8.483124470513775e-05, + "loss": 0.0082, + "step": 5840 + }, + { + "grad_norm": 0.4160684645175934, + "learning_rate": 8.477188421777692e-05, + "loss": 0.0087, + "step": 5850 + }, + { + "grad_norm": 0.468124121427536, + "learning_rate": 8.47124286654124e-05, + "loss": 0.0095, + "step": 5860 + }, + { + "grad_norm": 0.3998025059700012, + "learning_rate": 8.465287821059341e-05, + "loss": 0.0121, + "step": 5870 + }, + { + "grad_norm": 0.41253435611724854, + "learning_rate": 8.45932330161286e-05, + "loss": 0.019, + "step": 5880 + }, + { + "grad_norm": 0.5324806571006775, + "learning_rate": 8.453349324508567e-05, + "loss": 0.0207, + "step": 5890 + }, + { + "grad_norm": 0.3645405173301697, + "learning_rate": 8.447365906079088e-05, + "loss": 0.0216, + "step": 5900 + }, + { + "grad_norm": 0.3897666931152344, + "learning_rate": 8.441373062682856e-05, + "loss": 0.0233, + "step": 5910 + }, + { + "grad_norm": 0.37299057841300964, + "learning_rate": 8.43537081070408e-05, + "loss": 0.0149, + "step": 5920 + }, + { + "grad_norm": 0.4036707282066345, + "learning_rate": 8.429359166552689e-05, + "loss": 0.0109, + "step": 5930 + }, + { + "grad_norm": 0.38577744364738464, + "learning_rate": 8.423338146664284e-05, + "loss": 0.0134, + "step": 5940 + }, + { + "grad_norm": 0.3633534908294678, + "learning_rate": 8.417307767500107e-05, + "loss": 0.0137, + "step": 5950 + }, + { + "grad_norm": 0.34327638149261475, + "learning_rate": 8.411268045546983e-05, + "loss": 0.0131, + "step": 5960 + }, + { + "grad_norm": 0.3706875741481781, + "learning_rate": 8.405218997317281e-05, + "loss": 0.016, + "step": 5970 + }, + { + "grad_norm": 0.34372249245643616, + "learning_rate": 8.399160639348869e-05, + "loss": 0.0148, + "step": 5980 + }, + { + "grad_norm": 0.475347638130188, + "learning_rate": 8.393092988205065e-05, + "loss": 0.0176, + "step": 5990 + }, + { + "grad_norm": 0.4359090328216553, + "learning_rate": 8.387016060474597e-05, + "loss": 0.019, + "step": 6000 + }, + { + "grad_norm": 0.3905183970928192, + "learning_rate": 8.380929872771551e-05, + "loss": 0.0167, + "step": 6010 + }, + { + "grad_norm": 0.418160617351532, + "learning_rate": 8.374834441735335e-05, + "loss": 0.0174, + "step": 6020 + }, + { + "grad_norm": 0.4518284797668457, + "learning_rate": 8.368729784030622e-05, + "loss": 0.0146, + "step": 6030 + }, + { + "grad_norm": 0.42254838347435, + "learning_rate": 8.362615916347315e-05, + "loss": 0.0158, + "step": 6040 + }, + { + "grad_norm": 0.5141122341156006, + "learning_rate": 8.356492855400493e-05, + "loss": 0.0165, + "step": 6050 + }, + { + "grad_norm": 0.5752711892127991, + "learning_rate": 8.350360617930371e-05, + "loss": 0.0148, + "step": 6060 + }, + { + "grad_norm": 0.4201089143753052, + "learning_rate": 8.344219220702255e-05, + "loss": 0.0149, + "step": 6070 + }, + { + "grad_norm": 0.3286941647529602, + "learning_rate": 8.338068680506485e-05, + "loss": 0.0124, + "step": 6080 + }, + { + "grad_norm": 0.40556249022483826, + "learning_rate": 8.33190901415841e-05, + "loss": 0.0094, + "step": 6090 + }, + { + "grad_norm": 0.4031883180141449, + "learning_rate": 8.325740238498317e-05, + "loss": 0.0125, + "step": 6100 + }, + { + "grad_norm": 0.3731491267681122, + "learning_rate": 8.319562370391406e-05, + "loss": 0.0151, + "step": 6110 + }, + { + "grad_norm": 0.5448814034461975, + "learning_rate": 8.31337542672773e-05, + "loss": 0.0179, + "step": 6120 + }, + { + "grad_norm": 0.34223783016204834, + "learning_rate": 8.307179424422158e-05, + "loss": 0.0204, + "step": 6130 + }, + { + "grad_norm": 0.4088946580886841, + "learning_rate": 8.300974380414327e-05, + "loss": 0.0187, + "step": 6140 + }, + { + "grad_norm": 0.557543933391571, + "learning_rate": 8.294760311668586e-05, + "loss": 0.0182, + "step": 6150 + }, + { + "grad_norm": 0.30980128049850464, + "learning_rate": 8.288537235173961e-05, + "loss": 0.0171, + "step": 6160 + }, + { + "grad_norm": 0.35932275652885437, + "learning_rate": 8.282305167944108e-05, + "loss": 0.0169, + "step": 6170 + }, + { + "grad_norm": 0.2954501807689667, + "learning_rate": 8.276064127017262e-05, + "loss": 0.0145, + "step": 6180 + }, + { + "grad_norm": 0.35944661498069763, + "learning_rate": 8.269814129456189e-05, + "loss": 0.0134, + "step": 6190 + }, + { + "grad_norm": 0.4353872537612915, + "learning_rate": 8.263555192348143e-05, + "loss": 0.0159, + "step": 6200 + }, + { + "grad_norm": 0.3751029968261719, + "learning_rate": 8.257287332804819e-05, + "loss": 0.0135, + "step": 6210 + }, + { + "grad_norm": 0.3151674270629883, + "learning_rate": 8.251010567962307e-05, + "loss": 0.0121, + "step": 6220 + }, + { + "grad_norm": 0.3966769576072693, + "learning_rate": 8.244724914981041e-05, + "loss": 0.011, + "step": 6230 + }, + { + "grad_norm": 0.45497483015060425, + "learning_rate": 8.238430391045757e-05, + "loss": 0.0152, + "step": 6240 + }, + { + "grad_norm": 0.4757539629936218, + "learning_rate": 8.232127013365445e-05, + "loss": 0.0187, + "step": 6250 + }, + { + "grad_norm": 0.48154377937316895, + "learning_rate": 8.225814799173295e-05, + "loss": 0.0218, + "step": 6260 + }, + { + "grad_norm": 0.38772448897361755, + "learning_rate": 8.219493765726663e-05, + "loss": 0.0163, + "step": 6270 + }, + { + "grad_norm": 0.38715583086013794, + "learning_rate": 8.21316393030701e-05, + "loss": 0.0203, + "step": 6280 + }, + { + "grad_norm": 0.34129858016967773, + "learning_rate": 8.206825310219865e-05, + "loss": 0.0162, + "step": 6290 + }, + { + "grad_norm": 0.38137829303741455, + "learning_rate": 8.200477922794776e-05, + "loss": 0.0219, + "step": 6300 + }, + { + "grad_norm": 0.43256160616874695, + "learning_rate": 8.194121785385256e-05, + "loss": 0.0193, + "step": 6310 + }, + { + "grad_norm": 0.30605167150497437, + "learning_rate": 8.187756915368741e-05, + "loss": 0.0149, + "step": 6320 + }, + { + "grad_norm": 0.4863114655017853, + "learning_rate": 8.181383330146544e-05, + "loss": 0.0115, + "step": 6330 + }, + { + "grad_norm": 0.4545869529247284, + "learning_rate": 8.175001047143804e-05, + "loss": 0.0138, + "step": 6340 + }, + { + "grad_norm": 0.4741712212562561, + "learning_rate": 8.168610083809438e-05, + "loss": 0.0155, + "step": 6350 + }, + { + "grad_norm": 0.5844265818595886, + "learning_rate": 8.162210457616095e-05, + "loss": 0.0173, + "step": 6360 + }, + { + "grad_norm": 0.49542292952537537, + "learning_rate": 8.155802186060109e-05, + "loss": 0.0166, + "step": 6370 + }, + { + "grad_norm": 0.35017192363739014, + "learning_rate": 8.149385286661453e-05, + "loss": 0.0141, + "step": 6380 + }, + { + "grad_norm": 0.45438051223754883, + "learning_rate": 8.14295977696368e-05, + "loss": 0.0128, + "step": 6390 + }, + { + "grad_norm": 0.4544942378997803, + "learning_rate": 8.13652567453389e-05, + "loss": 0.0146, + "step": 6400 + }, + { + "grad_norm": 0.28921496868133545, + "learning_rate": 8.130082996962676e-05, + "loss": 0.0173, + "step": 6410 + }, + { + "grad_norm": 0.3601858913898468, + "learning_rate": 8.123631761864068e-05, + "loss": 0.0188, + "step": 6420 + }, + { + "grad_norm": 0.39149102568626404, + "learning_rate": 8.1171719868755e-05, + "loss": 0.0153, + "step": 6430 + }, + { + "grad_norm": 0.34540727734565735, + "learning_rate": 8.110703689657748e-05, + "loss": 0.015, + "step": 6440 + }, + { + "grad_norm": 0.46488672494888306, + "learning_rate": 8.104226887894892e-05, + "loss": 0.0195, + "step": 6450 + }, + { + "grad_norm": 0.37852662801742554, + "learning_rate": 8.097741599294257e-05, + "loss": 0.0174, + "step": 6460 + }, + { + "grad_norm": 0.29327821731567383, + "learning_rate": 8.091247841586378e-05, + "loss": 0.0204, + "step": 6470 + }, + { + "grad_norm": 0.42606469988822937, + "learning_rate": 8.084745632524939e-05, + "loss": 0.021, + "step": 6480 + }, + { + "grad_norm": 0.424290269613266, + "learning_rate": 8.07823498988673e-05, + "loss": 0.0175, + "step": 6490 + }, + { + "grad_norm": 0.3854418396949768, + "learning_rate": 8.071715931471602e-05, + "loss": 0.0139, + "step": 6500 + }, + { + "grad_norm": 0.3748428225517273, + "learning_rate": 8.06518847510241e-05, + "loss": 0.0115, + "step": 6510 + }, + { + "grad_norm": 0.3839101493358612, + "learning_rate": 8.058652638624971e-05, + "loss": 0.0104, + "step": 6520 + }, + { + "grad_norm": 0.39358240365982056, + "learning_rate": 8.052108439908013e-05, + "loss": 0.0152, + "step": 6530 + }, + { + "grad_norm": 0.35504159331321716, + "learning_rate": 8.045555896843125e-05, + "loss": 0.0152, + "step": 6540 + }, + { + "grad_norm": 0.573060154914856, + "learning_rate": 8.03899502734471e-05, + "loss": 0.0212, + "step": 6550 + }, + { + "grad_norm": 0.5525245666503906, + "learning_rate": 8.032425849349931e-05, + "loss": 0.0228, + "step": 6560 + }, + { + "grad_norm": 0.4898419678211212, + "learning_rate": 8.025848380818674e-05, + "loss": 0.0216, + "step": 6570 + }, + { + "grad_norm": 0.34719106554985046, + "learning_rate": 8.019262639733487e-05, + "loss": 0.0166, + "step": 6580 + }, + { + "grad_norm": 0.45304349064826965, + "learning_rate": 8.012668644099531e-05, + "loss": 0.0186, + "step": 6590 + }, + { + "grad_norm": 0.3667837083339691, + "learning_rate": 8.006066411944542e-05, + "loss": 0.0203, + "step": 6600 + }, + { + "grad_norm": 0.5387156009674072, + "learning_rate": 7.999455961318769e-05, + "loss": 0.0228, + "step": 6610 + }, + { + "grad_norm": 0.44155004620552063, + "learning_rate": 7.992837310294932e-05, + "loss": 0.0149, + "step": 6620 + }, + { + "grad_norm": 0.3304447531700134, + "learning_rate": 7.986210476968167e-05, + "loss": 0.0247, + "step": 6630 + }, + { + "grad_norm": 0.45029908418655396, + "learning_rate": 7.97957547945599e-05, + "loss": 0.0247, + "step": 6640 + }, + { + "grad_norm": 0.46112605929374695, + "learning_rate": 7.972932335898226e-05, + "loss": 0.022, + "step": 6650 + }, + { + "grad_norm": 0.47679445147514343, + "learning_rate": 7.966281064456975e-05, + "loss": 0.0237, + "step": 6660 + }, + { + "grad_norm": 0.5341081023216248, + "learning_rate": 7.959621683316563e-05, + "loss": 0.0231, + "step": 6670 + }, + { + "grad_norm": 0.3254801332950592, + "learning_rate": 7.952954210683481e-05, + "loss": 0.0178, + "step": 6680 + }, + { + "grad_norm": 0.4155062139034271, + "learning_rate": 7.946278664786345e-05, + "loss": 0.0156, + "step": 6690 + }, + { + "grad_norm": 0.35035744309425354, + "learning_rate": 7.939595063875842e-05, + "loss": 0.0148, + "step": 6700 + }, + { + "grad_norm": 0.37214985489845276, + "learning_rate": 7.932903426224683e-05, + "loss": 0.0145, + "step": 6710 + }, + { + "grad_norm": 0.30615222454071045, + "learning_rate": 7.926203770127552e-05, + "loss": 0.0122, + "step": 6720 + }, + { + "grad_norm": 0.32838308811187744, + "learning_rate": 7.919496113901046e-05, + "loss": 0.0087, + "step": 6730 + }, + { + "grad_norm": 0.4435860812664032, + "learning_rate": 7.912780475883649e-05, + "loss": 0.0098, + "step": 6740 + }, + { + "grad_norm": 0.2991569936275482, + "learning_rate": 7.906056874435652e-05, + "loss": 0.0145, + "step": 6750 + }, + { + "grad_norm": 0.40926697850227356, + "learning_rate": 7.899325327939131e-05, + "loss": 0.0158, + "step": 6760 + }, + { + "grad_norm": 0.4230368733406067, + "learning_rate": 7.892585854797872e-05, + "loss": 0.0155, + "step": 6770 + }, + { + "grad_norm": 0.36268308758735657, + "learning_rate": 7.88583847343734e-05, + "loss": 0.0188, + "step": 6780 + }, + { + "grad_norm": 0.45804107189178467, + "learning_rate": 7.879083202304616e-05, + "loss": 0.0189, + "step": 6790 + }, + { + "grad_norm": 0.38222286105155945, + "learning_rate": 7.872320059868355e-05, + "loss": 0.0181, + "step": 6800 + }, + { + "grad_norm": 0.4070912003517151, + "learning_rate": 7.865549064618729e-05, + "loss": 0.0178, + "step": 6810 + }, + { + "grad_norm": 0.43297070264816284, + "learning_rate": 7.858770235067381e-05, + "loss": 0.0194, + "step": 6820 + }, + { + "grad_norm": 0.34508201479911804, + "learning_rate": 7.851983589747374e-05, + "loss": 0.0136, + "step": 6830 + }, + { + "grad_norm": 0.3884981870651245, + "learning_rate": 7.845189147213133e-05, + "loss": 0.0152, + "step": 6840 + }, + { + "grad_norm": 0.3865789771080017, + "learning_rate": 7.838386926040407e-05, + "loss": 0.0156, + "step": 6850 + }, + { + "grad_norm": 0.4127916097640991, + "learning_rate": 7.83157694482621e-05, + "loss": 0.0165, + "step": 6860 + }, + { + "grad_norm": 0.2912537753582001, + "learning_rate": 7.824759222188768e-05, + "loss": 0.0128, + "step": 6870 + }, + { + "grad_norm": 0.40327343344688416, + "learning_rate": 7.817933776767478e-05, + "loss": 0.0174, + "step": 6880 + }, + { + "grad_norm": 0.30888831615448, + "learning_rate": 7.811100627222842e-05, + "loss": 0.0128, + "step": 6890 + }, + { + "grad_norm": 0.32233473658561707, + "learning_rate": 7.804259792236435e-05, + "loss": 0.0097, + "step": 6900 + }, + { + "grad_norm": 0.3120769262313843, + "learning_rate": 7.797411290510835e-05, + "loss": 0.0091, + "step": 6910 + }, + { + "grad_norm": 0.3786951005458832, + "learning_rate": 7.790555140769586e-05, + "loss": 0.0106, + "step": 6920 + }, + { + "grad_norm": 0.4490700960159302, + "learning_rate": 7.78369136175714e-05, + "loss": 0.0172, + "step": 6930 + }, + { + "grad_norm": 0.389499306678772, + "learning_rate": 7.776819972238806e-05, + "loss": 0.0164, + "step": 6940 + }, + { + "grad_norm": 0.4003327786922455, + "learning_rate": 7.7699409910007e-05, + "loss": 0.0154, + "step": 6950 + }, + { + "grad_norm": 0.5661449432373047, + "learning_rate": 7.763054436849694e-05, + "loss": 0.0162, + "step": 6960 + }, + { + "grad_norm": 0.3841884434223175, + "learning_rate": 7.756160328613364e-05, + "loss": 0.0151, + "step": 6970 + }, + { + "grad_norm": 0.3295084238052368, + "learning_rate": 7.749258685139942e-05, + "loss": 0.0156, + "step": 6980 + }, + { + "grad_norm": 0.4176742136478424, + "learning_rate": 7.742349525298253e-05, + "loss": 0.0156, + "step": 6990 + }, + { + "grad_norm": 0.3022407591342926, + "learning_rate": 7.735432867977679e-05, + "loss": 0.0179, + "step": 7000 + }, + { + "grad_norm": 0.37116876244544983, + "learning_rate": 7.728508732088096e-05, + "loss": 0.0209, + "step": 7010 + }, + { + "grad_norm": 0.44029054045677185, + "learning_rate": 7.721577136559825e-05, + "loss": 0.0197, + "step": 7020 + }, + { + "grad_norm": 0.3499939739704132, + "learning_rate": 7.714638100343588e-05, + "loss": 0.0155, + "step": 7030 + }, + { + "grad_norm": 0.41017189621925354, + "learning_rate": 7.707691642410444e-05, + "loss": 0.0192, + "step": 7040 + }, + { + "grad_norm": 0.43974125385284424, + "learning_rate": 7.70073778175174e-05, + "loss": 0.0198, + "step": 7050 + }, + { + "grad_norm": 0.4120068848133087, + "learning_rate": 7.69377653737907e-05, + "loss": 0.0206, + "step": 7060 + }, + { + "grad_norm": 0.42474716901779175, + "learning_rate": 7.686807928324209e-05, + "loss": 0.0194, + "step": 7070 + }, + { + "grad_norm": 0.48563769459724426, + "learning_rate": 7.679831973639065e-05, + "loss": 0.0205, + "step": 7080 + }, + { + "grad_norm": 0.36655813455581665, + "learning_rate": 7.672848692395637e-05, + "loss": 0.0207, + "step": 7090 + }, + { + "grad_norm": 0.43131348490715027, + "learning_rate": 7.665858103685944e-05, + "loss": 0.0181, + "step": 7100 + }, + { + "grad_norm": 0.359139621257782, + "learning_rate": 7.658860226621991e-05, + "loss": 0.0214, + "step": 7110 + }, + { + "grad_norm": 0.3065771162509918, + "learning_rate": 7.651855080335708e-05, + "loss": 0.02, + "step": 7120 + }, + { + "grad_norm": 0.3586837649345398, + "learning_rate": 7.644842683978896e-05, + "loss": 0.0159, + "step": 7130 + }, + { + "grad_norm": 0.4576740562915802, + "learning_rate": 7.63782305672318e-05, + "loss": 0.0197, + "step": 7140 + }, + { + "grad_norm": 0.36861780285835266, + "learning_rate": 7.63079621775995e-05, + "loss": 0.0158, + "step": 7150 + }, + { + "grad_norm": 0.3511355221271515, + "learning_rate": 7.623762186300319e-05, + "loss": 0.0195, + "step": 7160 + }, + { + "grad_norm": 0.35798928141593933, + "learning_rate": 7.616720981575057e-05, + "loss": 0.0191, + "step": 7170 + }, + { + "grad_norm": 0.38436469435691833, + "learning_rate": 7.609672622834552e-05, + "loss": 0.0188, + "step": 7180 + }, + { + "grad_norm": 0.35056039690971375, + "learning_rate": 7.602617129348747e-05, + "loss": 0.0168, + "step": 7190 + }, + { + "grad_norm": 0.3569100797176361, + "learning_rate": 7.595554520407088e-05, + "loss": 0.0184, + "step": 7200 + }, + { + "grad_norm": 0.3778577446937561, + "learning_rate": 7.588484815318484e-05, + "loss": 0.0199, + "step": 7210 + }, + { + "grad_norm": 0.4807003140449524, + "learning_rate": 7.581408033411234e-05, + "loss": 0.0204, + "step": 7220 + }, + { + "grad_norm": 0.4152441918849945, + "learning_rate": 7.574324194032995e-05, + "loss": 0.0216, + "step": 7230 + }, + { + "grad_norm": 0.3196828365325928, + "learning_rate": 7.567233316550705e-05, + "loss": 0.0176, + "step": 7240 + }, + { + "grad_norm": 0.4675619304180145, + "learning_rate": 7.560135420350562e-05, + "loss": 0.0184, + "step": 7250 + }, + { + "grad_norm": 0.3448525071144104, + "learning_rate": 7.553030524837935e-05, + "loss": 0.0175, + "step": 7260 + }, + { + "grad_norm": 0.33041349053382874, + "learning_rate": 7.545918649437341e-05, + "loss": 0.0164, + "step": 7270 + }, + { + "grad_norm": 0.42748555541038513, + "learning_rate": 7.538799813592377e-05, + "loss": 0.0143, + "step": 7280 + }, + { + "grad_norm": 0.36006462574005127, + "learning_rate": 7.531674036765662e-05, + "loss": 0.0151, + "step": 7290 + }, + { + "grad_norm": 0.33960676193237305, + "learning_rate": 7.524541338438807e-05, + "loss": 0.0157, + "step": 7300 + }, + { + "grad_norm": 0.4089270532131195, + "learning_rate": 7.517401738112328e-05, + "loss": 0.0155, + "step": 7310 + }, + { + "grad_norm": 0.29973724484443665, + "learning_rate": 7.510255255305628e-05, + "loss": 0.0142, + "step": 7320 + }, + { + "grad_norm": 0.3888520300388336, + "learning_rate": 7.503101909556911e-05, + "loss": 0.019, + "step": 7330 + }, + { + "grad_norm": 0.3587034046649933, + "learning_rate": 7.495941720423154e-05, + "loss": 0.0136, + "step": 7340 + }, + { + "grad_norm": 0.4705655872821808, + "learning_rate": 7.488774707480042e-05, + "loss": 0.0171, + "step": 7350 + }, + { + "grad_norm": 0.2558979392051697, + "learning_rate": 7.481600890321911e-05, + "loss": 0.015, + "step": 7360 + }, + { + "grad_norm": 0.39154040813446045, + "learning_rate": 7.474420288561708e-05, + "loss": 0.0183, + "step": 7370 + }, + { + "grad_norm": 0.395163893699646, + "learning_rate": 7.467232921830921e-05, + "loss": 0.0193, + "step": 7380 + }, + { + "grad_norm": 0.43065279722213745, + "learning_rate": 7.460038809779537e-05, + "loss": 0.0208, + "step": 7390 + }, + { + "grad_norm": 0.43748345971107483, + "learning_rate": 7.452837972075983e-05, + "loss": 0.0211, + "step": 7400 + }, + { + "grad_norm": 0.43365004658699036, + "learning_rate": 7.445630428407074e-05, + "loss": 0.0222, + "step": 7410 + }, + { + "grad_norm": 0.35476717352867126, + "learning_rate": 7.43841619847796e-05, + "loss": 0.0198, + "step": 7420 + }, + { + "grad_norm": 0.3931501507759094, + "learning_rate": 7.431195302012072e-05, + "loss": 0.0246, + "step": 7430 + }, + { + "grad_norm": 0.37644949555397034, + "learning_rate": 7.423967758751061e-05, + "loss": 0.0188, + "step": 7440 + }, + { + "grad_norm": 0.3209872841835022, + "learning_rate": 7.416733588454758e-05, + "loss": 0.0162, + "step": 7450 + }, + { + "grad_norm": 0.3461327850818634, + "learning_rate": 7.409492810901106e-05, + "loss": 0.0148, + "step": 7460 + }, + { + "grad_norm": 0.32941320538520813, + "learning_rate": 7.402245445886116e-05, + "loss": 0.0174, + "step": 7470 + }, + { + "grad_norm": 0.3735780417919159, + "learning_rate": 7.394991513223806e-05, + "loss": 0.0139, + "step": 7480 + }, + { + "grad_norm": 0.29369816184043884, + "learning_rate": 7.38773103274615e-05, + "loss": 0.0126, + "step": 7490 + }, + { + "grad_norm": 0.35407641530036926, + "learning_rate": 7.380464024303028e-05, + "loss": 0.0123, + "step": 7500 + }, + { + "grad_norm": 0.30945441126823425, + "learning_rate": 7.373190507762162e-05, + "loss": 0.0123, + "step": 7510 + }, + { + "grad_norm": 0.39426395297050476, + "learning_rate": 7.365910503009066e-05, + "loss": 0.0115, + "step": 7520 + }, + { + "grad_norm": 0.34997260570526123, + "learning_rate": 7.358624029946996e-05, + "loss": 0.017, + "step": 7530 + }, + { + "grad_norm": 0.2550985813140869, + "learning_rate": 7.351331108496893e-05, + "loss": 0.0125, + "step": 7540 + }, + { + "grad_norm": 0.3133242726325989, + "learning_rate": 7.344031758597325e-05, + "loss": 0.0095, + "step": 7550 + }, + { + "grad_norm": 0.33384066820144653, + "learning_rate": 7.336726000204435e-05, + "loss": 0.0101, + "step": 7560 + }, + { + "grad_norm": 0.33045506477355957, + "learning_rate": 7.32941385329189e-05, + "loss": 0.0086, + "step": 7570 + }, + { + "grad_norm": 0.3414493501186371, + "learning_rate": 7.322095337850816e-05, + "loss": 0.0117, + "step": 7580 + }, + { + "grad_norm": 0.3123612701892853, + "learning_rate": 7.314770473889758e-05, + "loss": 0.0077, + "step": 7590 + }, + { + "grad_norm": 0.2817482352256775, + "learning_rate": 7.307439281434615e-05, + "loss": 0.0111, + "step": 7600 + }, + { + "grad_norm": 0.33544886112213135, + "learning_rate": 7.300101780528585e-05, + "loss": 0.0104, + "step": 7610 + }, + { + "grad_norm": 0.38472551107406616, + "learning_rate": 7.292757991232117e-05, + "loss": 0.0108, + "step": 7620 + }, + { + "grad_norm": 0.3105953335762024, + "learning_rate": 7.285407933622848e-05, + "loss": 0.0112, + "step": 7630 + }, + { + "grad_norm": 0.27845093607902527, + "learning_rate": 7.278051627795557e-05, + "loss": 0.0176, + "step": 7640 + }, + { + "grad_norm": 0.27918168902397156, + "learning_rate": 7.270689093862105e-05, + "loss": 0.0182, + "step": 7650 + }, + { + "grad_norm": 0.28590506315231323, + "learning_rate": 7.263320351951374e-05, + "loss": 0.0122, + "step": 7660 + }, + { + "grad_norm": 0.2704273760318756, + "learning_rate": 7.255945422209227e-05, + "loss": 0.0128, + "step": 7670 + }, + { + "grad_norm": 0.31531932950019836, + "learning_rate": 7.248564324798437e-05, + "loss": 0.0149, + "step": 7680 + }, + { + "grad_norm": 0.3159969449043274, + "learning_rate": 7.241177079898644e-05, + "loss": 0.0166, + "step": 7690 + }, + { + "grad_norm": 0.33888447284698486, + "learning_rate": 7.233783707706295e-05, + "loss": 0.0159, + "step": 7700 + }, + { + "grad_norm": 0.4522596001625061, + "learning_rate": 7.226384228434586e-05, + "loss": 0.0162, + "step": 7710 + }, + { + "grad_norm": 0.44001129269599915, + "learning_rate": 7.21897866231341e-05, + "loss": 0.0131, + "step": 7720 + }, + { + "grad_norm": 0.35431328415870667, + "learning_rate": 7.211567029589303e-05, + "loss": 0.015, + "step": 7730 + }, + { + "grad_norm": 0.35809576511383057, + "learning_rate": 7.204149350525387e-05, + "loss": 0.0146, + "step": 7740 + }, + { + "grad_norm": 0.47206911444664, + "learning_rate": 7.196725645401309e-05, + "loss": 0.0146, + "step": 7750 + }, + { + "grad_norm": 0.2520870566368103, + "learning_rate": 7.1892959345132e-05, + "loss": 0.0148, + "step": 7760 + }, + { + "grad_norm": 0.29022568464279175, + "learning_rate": 7.181860238173605e-05, + "loss": 0.0111, + "step": 7770 + }, + { + "grad_norm": 0.39016395807266235, + "learning_rate": 7.174418576711432e-05, + "loss": 0.013, + "step": 7780 + }, + { + "grad_norm": 0.36589184403419495, + "learning_rate": 7.1669709704719e-05, + "loss": 0.0112, + "step": 7790 + }, + { + "grad_norm": 0.3057953715324402, + "learning_rate": 7.159517439816481e-05, + "loss": 0.0138, + "step": 7800 + }, + { + "grad_norm": 0.3297235667705536, + "learning_rate": 7.152058005122842e-05, + "loss": 0.0131, + "step": 7810 + }, + { + "grad_norm": 0.23906858265399933, + "learning_rate": 7.144592686784793e-05, + "loss": 0.0081, + "step": 7820 + }, + { + "grad_norm": 0.3159496486186981, + "learning_rate": 7.137121505212229e-05, + "loss": 0.0086, + "step": 7830 + }, + { + "grad_norm": 0.33193764090538025, + "learning_rate": 7.129644480831077e-05, + "loss": 0.0124, + "step": 7840 + }, + { + "grad_norm": 0.35645991563796997, + "learning_rate": 7.122161634083234e-05, + "loss": 0.0113, + "step": 7850 + }, + { + "grad_norm": 0.46981170773506165, + "learning_rate": 7.114672985426516e-05, + "loss": 0.015, + "step": 7860 + }, + { + "grad_norm": 0.38671356439590454, + "learning_rate": 7.107178555334606e-05, + "loss": 0.0155, + "step": 7870 + }, + { + "grad_norm": 0.3616870939731598, + "learning_rate": 7.099678364296989e-05, + "loss": 0.0175, + "step": 7880 + }, + { + "grad_norm": 0.38888728618621826, + "learning_rate": 7.0921724328189e-05, + "loss": 0.0147, + "step": 7890 + }, + { + "grad_norm": 0.3530901074409485, + "learning_rate": 7.084660781421268e-05, + "loss": 0.0175, + "step": 7900 + }, + { + "grad_norm": 0.3785131871700287, + "learning_rate": 7.077143430640662e-05, + "loss": 0.0198, + "step": 7910 + }, + { + "grad_norm": 0.4906539022922516, + "learning_rate": 7.069620401029232e-05, + "loss": 0.0159, + "step": 7920 + }, + { + "grad_norm": 0.3655000925064087, + "learning_rate": 7.062091713154655e-05, + "loss": 0.0138, + "step": 7930 + }, + { + "grad_norm": 0.24046620726585388, + "learning_rate": 7.054557387600075e-05, + "loss": 0.0116, + "step": 7940 + }, + { + "grad_norm": 0.3180449306964874, + "learning_rate": 7.04701744496405e-05, + "loss": 0.0084, + "step": 7950 + }, + { + "grad_norm": 0.39981985092163086, + "learning_rate": 7.039471905860495e-05, + "loss": 0.0119, + "step": 7960 + }, + { + "grad_norm": 0.3383346498012543, + "learning_rate": 7.031920790918628e-05, + "loss": 0.0124, + "step": 7970 + }, + { + "grad_norm": 0.31623712182044983, + "learning_rate": 7.024364120782906e-05, + "loss": 0.0127, + "step": 7980 + }, + { + "grad_norm": 0.30611786246299744, + "learning_rate": 7.016801916112978e-05, + "loss": 0.0111, + "step": 7990 + }, + { + "grad_norm": 0.2620311677455902, + "learning_rate": 7.009234197583623e-05, + "loss": 0.0102, + "step": 8000 + }, + { + "grad_norm": 0.30000630021095276, + "learning_rate": 7.001660985884692e-05, + "loss": 0.0116, + "step": 8010 + }, + { + "grad_norm": 0.313591867685318, + "learning_rate": 6.994082301721063e-05, + "loss": 0.0153, + "step": 8020 + }, + { + "grad_norm": 0.27706649899482727, + "learning_rate": 6.986498165812563e-05, + "loss": 0.0123, + "step": 8030 + }, + { + "grad_norm": 0.32295501232147217, + "learning_rate": 6.978908598893932e-05, + "loss": 0.0157, + "step": 8040 + }, + { + "grad_norm": 0.3586593270301819, + "learning_rate": 6.971313621714756e-05, + "loss": 0.0107, + "step": 8050 + }, + { + "grad_norm": 0.30321335792541504, + "learning_rate": 6.96371325503941e-05, + "loss": 0.0113, + "step": 8060 + }, + { + "grad_norm": 0.4131782352924347, + "learning_rate": 6.956107519647014e-05, + "loss": 0.012, + "step": 8070 + }, + { + "grad_norm": 0.2817912995815277, + "learning_rate": 6.94849643633135e-05, + "loss": 0.0119, + "step": 8080 + }, + { + "grad_norm": 0.3419937789440155, + "learning_rate": 6.940880025900834e-05, + "loss": 0.0083, + "step": 8090 + }, + { + "grad_norm": 0.2906579375267029, + "learning_rate": 6.933258309178438e-05, + "loss": 0.0093, + "step": 8100 + }, + { + "grad_norm": 0.36081022024154663, + "learning_rate": 6.925631307001646e-05, + "loss": 0.0154, + "step": 8110 + }, + { + "grad_norm": 0.5059500336647034, + "learning_rate": 6.91799904022239e-05, + "loss": 0.0167, + "step": 8120 + }, + { + "grad_norm": 0.3011426627635956, + "learning_rate": 6.910361529706997e-05, + "loss": 0.0154, + "step": 8130 + }, + { + "grad_norm": 0.5015670657157898, + "learning_rate": 6.902718796336131e-05, + "loss": 0.0154, + "step": 8140 + }, + { + "grad_norm": 0.42444461584091187, + "learning_rate": 6.895070861004729e-05, + "loss": 0.0136, + "step": 8150 + }, + { + "grad_norm": 0.3741428256034851, + "learning_rate": 6.887417744621956e-05, + "loss": 0.0142, + "step": 8160 + }, + { + "grad_norm": 0.3352656066417694, + "learning_rate": 6.87975946811114e-05, + "loss": 0.0136, + "step": 8170 + }, + { + "grad_norm": 0.39785802364349365, + "learning_rate": 6.872096052409718e-05, + "loss": 0.0139, + "step": 8180 + }, + { + "grad_norm": 0.3535803556442261, + "learning_rate": 6.864427518469174e-05, + "loss": 0.0159, + "step": 8190 + }, + { + "grad_norm": 0.31960320472717285, + "learning_rate": 6.856753887254986e-05, + "loss": 0.0133, + "step": 8200 + }, + { + "grad_norm": 0.3019167482852936, + "learning_rate": 6.849075179746572e-05, + "loss": 0.0148, + "step": 8210 + }, + { + "grad_norm": 0.3369637727737427, + "learning_rate": 6.841391416937221e-05, + "loss": 0.0125, + "step": 8220 + }, + { + "grad_norm": 0.34352004528045654, + "learning_rate": 6.833702619834053e-05, + "loss": 0.0134, + "step": 8230 + }, + { + "grad_norm": 0.31633585691452026, + "learning_rate": 6.82600880945794e-05, + "loss": 0.0123, + "step": 8240 + }, + { + "grad_norm": 0.25650936365127563, + "learning_rate": 6.818310006843468e-05, + "loss": 0.0105, + "step": 8250 + }, + { + "grad_norm": 0.29332858324050903, + "learning_rate": 6.810606233038868e-05, + "loss": 0.0119, + "step": 8260 + }, + { + "grad_norm": 0.40779438614845276, + "learning_rate": 6.802897509105966e-05, + "loss": 0.0136, + "step": 8270 + }, + { + "grad_norm": 0.3060363233089447, + "learning_rate": 6.79518385612012e-05, + "loss": 0.0158, + "step": 8280 + }, + { + "grad_norm": 0.4033961892127991, + "learning_rate": 6.787465295170157e-05, + "loss": 0.0149, + "step": 8290 + }, + { + "grad_norm": 0.3450801968574524, + "learning_rate": 6.779741847358332e-05, + "loss": 0.0142, + "step": 8300 + }, + { + "grad_norm": 0.3422293961048126, + "learning_rate": 6.772013533800256e-05, + "loss": 0.0152, + "step": 8310 + }, + { + "grad_norm": 0.32153812050819397, + "learning_rate": 6.764280375624843e-05, + "loss": 0.0125, + "step": 8320 + }, + { + "grad_norm": 0.32539886236190796, + "learning_rate": 6.756542393974252e-05, + "loss": 0.0107, + "step": 8330 + }, + { + "grad_norm": 0.4207347631454468, + "learning_rate": 6.748799610003828e-05, + "loss": 0.0108, + "step": 8340 + }, + { + "grad_norm": 0.29526782035827637, + "learning_rate": 6.741052044882048e-05, + "loss": 0.0113, + "step": 8350 + }, + { + "grad_norm": 0.2972121238708496, + "learning_rate": 6.73329971979046e-05, + "loss": 0.0105, + "step": 8360 + }, + { + "grad_norm": 0.3182584047317505, + "learning_rate": 6.725542655923625e-05, + "loss": 0.0089, + "step": 8370 + }, + { + "grad_norm": 0.3145630955696106, + "learning_rate": 6.717780874489057e-05, + "loss": 0.0097, + "step": 8380 + }, + { + "grad_norm": 0.4869435429573059, + "learning_rate": 6.710014396707172e-05, + "loss": 0.0121, + "step": 8390 + }, + { + "grad_norm": 0.32635408639907837, + "learning_rate": 6.702243243811221e-05, + "loss": 0.0111, + "step": 8400 + }, + { + "grad_norm": 0.3574923872947693, + "learning_rate": 6.694467437047244e-05, + "loss": 0.0086, + "step": 8410 + }, + { + "grad_norm": 0.30649110674858093, + "learning_rate": 6.686686997673997e-05, + "loss": 0.0095, + "step": 8420 + }, + { + "grad_norm": 0.33591386675834656, + "learning_rate": 6.678901946962903e-05, + "loss": 0.0102, + "step": 8430 + }, + { + "grad_norm": 0.2789892256259918, + "learning_rate": 6.671112306197996e-05, + "loss": 0.0115, + "step": 8440 + }, + { + "grad_norm": 0.23750841617584229, + "learning_rate": 6.663318096675854e-05, + "loss": 0.0114, + "step": 8450 + }, + { + "grad_norm": 0.368899941444397, + "learning_rate": 6.655519339705552e-05, + "loss": 0.0131, + "step": 8460 + }, + { + "grad_norm": 0.30111491680145264, + "learning_rate": 6.647716056608588e-05, + "loss": 0.0132, + "step": 8470 + }, + { + "grad_norm": 0.27499163150787354, + "learning_rate": 6.639908268718843e-05, + "loss": 0.0139, + "step": 8480 + }, + { + "grad_norm": 0.3472214937210083, + "learning_rate": 6.632095997382514e-05, + "loss": 0.0137, + "step": 8490 + }, + { + "grad_norm": 0.32267826795578003, + "learning_rate": 6.624279263958047e-05, + "loss": 0.015, + "step": 8500 + }, + { + "grad_norm": 0.3251982033252716, + "learning_rate": 6.616458089816097e-05, + "loss": 0.0122, + "step": 8510 + }, + { + "grad_norm": 0.3112645149230957, + "learning_rate": 6.608632496339454e-05, + "loss": 0.0073, + "step": 8520 + }, + { + "grad_norm": 0.2545139789581299, + "learning_rate": 6.600802504922988e-05, + "loss": 0.0091, + "step": 8530 + }, + { + "grad_norm": 0.37264585494995117, + "learning_rate": 6.592968136973604e-05, + "loss": 0.0093, + "step": 8540 + }, + { + "grad_norm": 0.34885725378990173, + "learning_rate": 6.585129413910159e-05, + "loss": 0.0112, + "step": 8550 + }, + { + "grad_norm": 0.2983429431915283, + "learning_rate": 6.577286357163424e-05, + "loss": 0.0127, + "step": 8560 + }, + { + "grad_norm": 0.3230920732021332, + "learning_rate": 6.569438988176018e-05, + "loss": 0.0097, + "step": 8570 + }, + { + "grad_norm": 0.3835681676864624, + "learning_rate": 6.561587328402347e-05, + "loss": 0.0133, + "step": 8580 + }, + { + "grad_norm": 0.42620915174484253, + "learning_rate": 6.553731399308549e-05, + "loss": 0.0152, + "step": 8590 + }, + { + "grad_norm": 0.30636757612228394, + "learning_rate": 6.545871222372436e-05, + "loss": 0.0145, + "step": 8600 + }, + { + "grad_norm": 0.38285258412361145, + "learning_rate": 6.538006819083426e-05, + "loss": 0.015, + "step": 8610 + }, + { + "grad_norm": 0.405769944190979, + "learning_rate": 6.530138210942505e-05, + "loss": 0.0158, + "step": 8620 + }, + { + "grad_norm": 0.3663589358329773, + "learning_rate": 6.522265419462141e-05, + "loss": 0.0141, + "step": 8630 + }, + { + "grad_norm": 0.3903239965438843, + "learning_rate": 6.514388466166248e-05, + "loss": 0.0163, + "step": 8640 + }, + { + "grad_norm": 0.34105631709098816, + "learning_rate": 6.506507372590119e-05, + "loss": 0.0185, + "step": 8650 + }, + { + "grad_norm": 0.38227587938308716, + "learning_rate": 6.498622160280355e-05, + "loss": 0.0171, + "step": 8660 + }, + { + "grad_norm": 0.2439180314540863, + "learning_rate": 6.490732850794832e-05, + "loss": 0.0157, + "step": 8670 + }, + { + "grad_norm": 0.2519382834434509, + "learning_rate": 6.482839465702616e-05, + "loss": 0.0129, + "step": 8680 + }, + { + "grad_norm": 0.2761997580528259, + "learning_rate": 6.474942026583923e-05, + "loss": 0.0123, + "step": 8690 + }, + { + "grad_norm": 0.3216622769832611, + "learning_rate": 6.467040555030052e-05, + "loss": 0.0127, + "step": 8700 + }, + { + "grad_norm": 0.26096946001052856, + "learning_rate": 6.459135072643321e-05, + "loss": 0.0139, + "step": 8710 + }, + { + "grad_norm": 0.30985337495803833, + "learning_rate": 6.451225601037019e-05, + "loss": 0.0118, + "step": 8720 + }, + { + "grad_norm": 0.29444271326065063, + "learning_rate": 6.443312161835338e-05, + "loss": 0.0155, + "step": 8730 + }, + { + "grad_norm": 0.3172520697116852, + "learning_rate": 6.43539477667332e-05, + "loss": 0.0151, + "step": 8740 + }, + { + "grad_norm": 0.41361668705940247, + "learning_rate": 6.427473467196793e-05, + "loss": 0.0198, + "step": 8750 + }, + { + "grad_norm": 0.5138353705406189, + "learning_rate": 6.419548255062315e-05, + "loss": 0.0143, + "step": 8760 + }, + { + "grad_norm": 0.5104215145111084, + "learning_rate": 6.411619161937112e-05, + "loss": 0.0141, + "step": 8770 + }, + { + "grad_norm": 0.34711918234825134, + "learning_rate": 6.403686209499022e-05, + "loss": 0.0139, + "step": 8780 + }, + { + "grad_norm": 0.2875145673751831, + "learning_rate": 6.395749419436437e-05, + "loss": 0.0131, + "step": 8790 + }, + { + "grad_norm": 0.37919068336486816, + "learning_rate": 6.387808813448234e-05, + "loss": 0.0187, + "step": 8800 + }, + { + "grad_norm": 0.4263010621070862, + "learning_rate": 6.37986441324373e-05, + "loss": 0.0171, + "step": 8810 + }, + { + "grad_norm": 0.3483089506626129, + "learning_rate": 6.37191624054261e-05, + "loss": 0.0198, + "step": 8820 + }, + { + "grad_norm": 0.3389641046524048, + "learning_rate": 6.363964317074872e-05, + "loss": 0.0192, + "step": 8830 + }, + { + "grad_norm": 0.3463920056819916, + "learning_rate": 6.356008664580776e-05, + "loss": 0.0098, + "step": 8840 + }, + { + "grad_norm": 0.35007444024086, + "learning_rate": 6.348049304810771e-05, + "loss": 0.0109, + "step": 8850 + }, + { + "grad_norm": 0.3551989495754242, + "learning_rate": 6.340086259525442e-05, + "loss": 0.0089, + "step": 8860 + }, + { + "grad_norm": 0.27436158061027527, + "learning_rate": 6.332119550495448e-05, + "loss": 0.0114, + "step": 8870 + }, + { + "grad_norm": 0.23497121036052704, + "learning_rate": 6.324149199501473e-05, + "loss": 0.0102, + "step": 8880 + }, + { + "grad_norm": 0.2913462221622467, + "learning_rate": 6.316175228334146e-05, + "loss": 0.0129, + "step": 8890 + }, + { + "grad_norm": 0.40428781509399414, + "learning_rate": 6.308197658794003e-05, + "loss": 0.0124, + "step": 8900 + }, + { + "grad_norm": 0.3151707053184509, + "learning_rate": 6.300216512691417e-05, + "loss": 0.0104, + "step": 8910 + }, + { + "grad_norm": 0.368496298789978, + "learning_rate": 6.292231811846532e-05, + "loss": 0.0076, + "step": 8920 + }, + { + "grad_norm": 0.35116830468177795, + "learning_rate": 6.284243578089217e-05, + "loss": 0.0102, + "step": 8930 + }, + { + "grad_norm": 0.3099682629108429, + "learning_rate": 6.276251833258999e-05, + "loss": 0.0104, + "step": 8940 + }, + { + "grad_norm": 0.41350749135017395, + "learning_rate": 6.268256599205003e-05, + "loss": 0.0149, + "step": 8950 + }, + { + "grad_norm": 0.39295464754104614, + "learning_rate": 6.260257897785892e-05, + "loss": 0.0169, + "step": 8960 + }, + { + "grad_norm": 0.35960352420806885, + "learning_rate": 6.252255750869811e-05, + "loss": 0.0159, + "step": 8970 + }, + { + "grad_norm": 0.3055419325828552, + "learning_rate": 6.244250180334325e-05, + "loss": 0.0183, + "step": 8980 + }, + { + "grad_norm": 0.31011077761650085, + "learning_rate": 6.236241208066356e-05, + "loss": 0.0153, + "step": 8990 + }, + { + "grad_norm": 0.37592175602912903, + "learning_rate": 6.228228855962133e-05, + "loss": 0.0176, + "step": 9000 + }, + { + "grad_norm": 0.26465755701065063, + "learning_rate": 6.220213145927115e-05, + "loss": 0.0158, + "step": 9010 + }, + { + "grad_norm": 0.3471278250217438, + "learning_rate": 6.212194099875951e-05, + "loss": 0.0096, + "step": 9020 + }, + { + "grad_norm": 0.3856644034385681, + "learning_rate": 6.204171739732405e-05, + "loss": 0.0131, + "step": 9030 + }, + { + "grad_norm": 0.28634852170944214, + "learning_rate": 6.196146087429303e-05, + "loss": 0.0142, + "step": 9040 + }, + { + "grad_norm": 0.2684045135974884, + "learning_rate": 6.188117164908474e-05, + "loss": 0.0137, + "step": 9050 + }, + { + "grad_norm": 0.31617048382759094, + "learning_rate": 6.180084994120684e-05, + "loss": 0.0115, + "step": 9060 + }, + { + "grad_norm": 0.40079963207244873, + "learning_rate": 6.17204959702558e-05, + "loss": 0.0125, + "step": 9070 + }, + { + "grad_norm": 0.34378454089164734, + "learning_rate": 6.164010995591635e-05, + "loss": 0.013, + "step": 9080 + }, + { + "grad_norm": 0.3143358826637268, + "learning_rate": 6.155969211796076e-05, + "loss": 0.0121, + "step": 9090 + }, + { + "grad_norm": 0.3051069676876068, + "learning_rate": 6.147924267624829e-05, + "loss": 0.0121, + "step": 9100 + }, + { + "grad_norm": 0.4026600122451782, + "learning_rate": 6.13987618507247e-05, + "loss": 0.0124, + "step": 9110 + }, + { + "grad_norm": 0.20754925906658173, + "learning_rate": 6.131824986142147e-05, + "loss": 0.0127, + "step": 9120 + }, + { + "grad_norm": 0.3521782457828522, + "learning_rate": 6.123770692845529e-05, + "loss": 0.0136, + "step": 9130 + }, + { + "grad_norm": 0.36471572518348694, + "learning_rate": 6.11571332720275e-05, + "loss": 0.0169, + "step": 9140 + }, + { + "grad_norm": 0.34544607996940613, + "learning_rate": 6.107652911242336e-05, + "loss": 0.0127, + "step": 9150 + }, + { + "grad_norm": 0.33731943368911743, + "learning_rate": 6.0995894670011586e-05, + "loss": 0.0124, + "step": 9160 + }, + { + "grad_norm": 0.3496052622795105, + "learning_rate": 6.091523016524368e-05, + "loss": 0.0126, + "step": 9170 + }, + { + "grad_norm": 0.4604206681251526, + "learning_rate": 6.083453581865328e-05, + "loss": 0.0137, + "step": 9180 + }, + { + "grad_norm": 0.32989510893821716, + "learning_rate": 6.075381185085568e-05, + "loss": 0.014, + "step": 9190 + }, + { + "grad_norm": 0.3579842448234558, + "learning_rate": 6.067305848254709e-05, + "loss": 0.0156, + "step": 9200 + }, + { + "grad_norm": 0.2964962124824524, + "learning_rate": 6.059227593450418e-05, + "loss": 0.0167, + "step": 9210 + }, + { + "grad_norm": 0.3809475004673004, + "learning_rate": 6.051146442758333e-05, + "loss": 0.0163, + "step": 9220 + }, + { + "grad_norm": 0.2879127860069275, + "learning_rate": 6.043062418272012e-05, + "loss": 0.0153, + "step": 9230 + }, + { + "grad_norm": 0.35881927609443665, + "learning_rate": 6.0349755420928666e-05, + "loss": 0.0172, + "step": 9240 + }, + { + "grad_norm": 0.37010306119918823, + "learning_rate": 6.0268858363301105e-05, + "loss": 0.0149, + "step": 9250 + }, + { + "grad_norm": 0.3885497450828552, + "learning_rate": 6.018793323100689e-05, + "loss": 0.012, + "step": 9260 + }, + { + "grad_norm": 0.31309938430786133, + "learning_rate": 6.0106980245292255e-05, + "loss": 0.0114, + "step": 9270 + }, + { + "grad_norm": 0.336249440908432, + "learning_rate": 6.002599962747957e-05, + "loss": 0.0134, + "step": 9280 + }, + { + "grad_norm": 0.2684231400489807, + "learning_rate": 5.994499159896673e-05, + "loss": 0.0128, + "step": 9290 + }, + { + "grad_norm": 0.353425532579422, + "learning_rate": 5.9863956381226607e-05, + "loss": 0.0143, + "step": 9300 + }, + { + "grad_norm": 0.386186420917511, + "learning_rate": 5.9782894195806394e-05, + "loss": 0.0118, + "step": 9310 + }, + { + "grad_norm": 0.3526606857776642, + "learning_rate": 5.9701805264327004e-05, + "loss": 0.0158, + "step": 9320 + }, + { + "grad_norm": 0.40907028317451477, + "learning_rate": 5.96206898084825e-05, + "loss": 0.0145, + "step": 9330 + }, + { + "grad_norm": 0.37888333201408386, + "learning_rate": 5.953954805003942e-05, + "loss": 0.0137, + "step": 9340 + }, + { + "grad_norm": 0.4098068177700043, + "learning_rate": 5.945838021083623e-05, + "loss": 0.0142, + "step": 9350 + }, + { + "grad_norm": 0.34876784682273865, + "learning_rate": 5.9377186512782714e-05, + "loss": 0.0119, + "step": 9360 + }, + { + "grad_norm": 0.35310330986976624, + "learning_rate": 5.929596717785935e-05, + "loss": 0.0117, + "step": 9370 + }, + { + "grad_norm": 0.37008821964263916, + "learning_rate": 5.921472242811668e-05, + "loss": 0.0144, + "step": 9380 + }, + { + "grad_norm": 0.36046233773231506, + "learning_rate": 5.913345248567475e-05, + "loss": 0.012, + "step": 9390 + }, + { + "grad_norm": 0.33987924456596375, + "learning_rate": 5.905215757272248e-05, + "loss": 0.0166, + "step": 9400 + }, + { + "grad_norm": 0.3639644980430603, + "learning_rate": 5.897083791151706e-05, + "loss": 0.0113, + "step": 9410 + }, + { + "grad_norm": 0.4399937391281128, + "learning_rate": 5.888949372438336e-05, + "loss": 0.0147, + "step": 9420 + }, + { + "grad_norm": 0.3255448043346405, + "learning_rate": 5.8808125233713255e-05, + "loss": 0.0125, + "step": 9430 + }, + { + "grad_norm": 0.2753329873085022, + "learning_rate": 5.872673266196509e-05, + "loss": 0.0128, + "step": 9440 + }, + { + "grad_norm": 0.309054970741272, + "learning_rate": 5.864531623166305e-05, + "loss": 0.0166, + "step": 9450 + }, + { + "grad_norm": 0.33016565442085266, + "learning_rate": 5.856387616539656e-05, + "loss": 0.0134, + "step": 9460 + }, + { + "grad_norm": 0.32756826281547546, + "learning_rate": 5.848241268581967e-05, + "loss": 0.016, + "step": 9470 + }, + { + "grad_norm": 0.2993040978908539, + "learning_rate": 5.840092601565037e-05, + "loss": 0.0153, + "step": 9480 + }, + { + "grad_norm": 0.28896939754486084, + "learning_rate": 5.8319416377670144e-05, + "loss": 0.0139, + "step": 9490 + }, + { + "grad_norm": 0.3286594748497009, + "learning_rate": 5.82378839947232e-05, + "loss": 0.0122, + "step": 9500 + }, + { + "grad_norm": 0.21800272166728973, + "learning_rate": 5.815632908971599e-05, + "loss": 0.0124, + "step": 9510 + }, + { + "grad_norm": 0.29166051745414734, + "learning_rate": 5.80747518856165e-05, + "loss": 0.0092, + "step": 9520 + }, + { + "grad_norm": 0.24596701562404633, + "learning_rate": 5.799315260545367e-05, + "loss": 0.008, + "step": 9530 + }, + { + "grad_norm": 0.22884689271450043, + "learning_rate": 5.791153147231686e-05, + "loss": 0.0096, + "step": 9540 + }, + { + "grad_norm": 0.2536587417125702, + "learning_rate": 5.782988870935509e-05, + "loss": 0.0131, + "step": 9550 + }, + { + "grad_norm": 0.3100685477256775, + "learning_rate": 5.774822453977657e-05, + "loss": 0.0115, + "step": 9560 + }, + { + "grad_norm": 0.2796921133995056, + "learning_rate": 5.7666539186848036e-05, + "loss": 0.0104, + "step": 9570 + }, + { + "grad_norm": 0.3592562973499298, + "learning_rate": 5.758483287389411e-05, + "loss": 0.0125, + "step": 9580 + }, + { + "grad_norm": 0.27882611751556396, + "learning_rate": 5.7503105824296735e-05, + "loss": 0.0123, + "step": 9590 + }, + { + "grad_norm": 0.28244462609291077, + "learning_rate": 5.742135826149453e-05, + "loss": 0.011, + "step": 9600 + }, + { + "grad_norm": 0.298927366733551, + "learning_rate": 5.7339590408982223e-05, + "loss": 0.014, + "step": 9610 + }, + { + "grad_norm": 0.4934275448322296, + "learning_rate": 5.725780249031e-05, + "loss": 0.0114, + "step": 9620 + }, + { + "grad_norm": 0.3640587329864502, + "learning_rate": 5.717599472908292e-05, + "loss": 0.0122, + "step": 9630 + }, + { + "grad_norm": 0.3348456919193268, + "learning_rate": 5.7094167348960237e-05, + "loss": 0.0156, + "step": 9640 + }, + { + "grad_norm": 0.30459585785865784, + "learning_rate": 5.7012320573654945e-05, + "loss": 0.0128, + "step": 9650 + }, + { + "grad_norm": 0.32021665573120117, + "learning_rate": 5.693045462693295e-05, + "loss": 0.0122, + "step": 9660 + }, + { + "grad_norm": 0.2579132616519928, + "learning_rate": 5.684856973261266e-05, + "loss": 0.0091, + "step": 9670 + }, + { + "grad_norm": 0.2512996196746826, + "learning_rate": 5.6766666114564215e-05, + "loss": 0.009, + "step": 9680 + }, + { + "grad_norm": 0.26022472977638245, + "learning_rate": 5.668474399670899e-05, + "loss": 0.0103, + "step": 9690 + }, + { + "grad_norm": 0.39224788546562195, + "learning_rate": 5.660280360301896e-05, + "loss": 0.0113, + "step": 9700 + }, + { + "grad_norm": 0.427045613527298, + "learning_rate": 5.652084515751599e-05, + "loss": 0.0132, + "step": 9710 + }, + { + "grad_norm": 0.31992945075035095, + "learning_rate": 5.643886888427137e-05, + "loss": 0.0128, + "step": 9720 + }, + { + "grad_norm": 0.3383529484272003, + "learning_rate": 5.6356875007405074e-05, + "loss": 0.0118, + "step": 9730 + }, + { + "grad_norm": 0.27886223793029785, + "learning_rate": 5.627486375108525e-05, + "loss": 0.0127, + "step": 9740 + }, + { + "grad_norm": 0.3647865653038025, + "learning_rate": 5.619283533952754e-05, + "loss": 0.0124, + "step": 9750 + }, + { + "grad_norm": 0.4016250669956207, + "learning_rate": 5.6110789996994474e-05, + "loss": 0.011, + "step": 9760 + }, + { + "grad_norm": 0.2679634392261505, + "learning_rate": 5.602872794779491e-05, + "loss": 0.0069, + "step": 9770 + }, + { + "grad_norm": 0.2362307757139206, + "learning_rate": 5.594664941628334e-05, + "loss": 0.0057, + "step": 9780 + }, + { + "grad_norm": 0.23448504507541656, + "learning_rate": 5.5864554626859324e-05, + "loss": 0.0066, + "step": 9790 + }, + { + "grad_norm": 0.3009737730026245, + "learning_rate": 5.578244380396691e-05, + "loss": 0.0096, + "step": 9800 + }, + { + "grad_norm": 0.40142950415611267, + "learning_rate": 5.570031717209394e-05, + "loss": 0.0159, + "step": 9810 + }, + { + "grad_norm": 0.5059084892272949, + "learning_rate": 5.561817495577147e-05, + "loss": 0.0144, + "step": 9820 + }, + { + "grad_norm": 0.35968977212905884, + "learning_rate": 5.5536017379573215e-05, + "loss": 0.0167, + "step": 9830 + }, + { + "grad_norm": 0.34103095531463623, + "learning_rate": 5.545384466811483e-05, + "loss": 0.0169, + "step": 9840 + }, + { + "grad_norm": 0.3724645972251892, + "learning_rate": 5.5371657046053384e-05, + "loss": 0.0139, + "step": 9850 + }, + { + "grad_norm": 0.32530680298805237, + "learning_rate": 5.528945473808669e-05, + "loss": 0.0095, + "step": 9860 + }, + { + "grad_norm": 0.3214346170425415, + "learning_rate": 5.520723796895272e-05, + "loss": 0.0078, + "step": 9870 + }, + { + "grad_norm": 0.24627052247524261, + "learning_rate": 5.512500696342897e-05, + "loss": 0.0094, + "step": 9880 + }, + { + "grad_norm": 0.2178281992673874, + "learning_rate": 5.504276194633188e-05, + "loss": 0.0088, + "step": 9890 + }, + { + "grad_norm": 0.31041690707206726, + "learning_rate": 5.49605031425162e-05, + "loss": 0.0091, + "step": 9900 + }, + { + "grad_norm": 0.30245935916900635, + "learning_rate": 5.487823077687434e-05, + "loss": 0.0105, + "step": 9910 + }, + { + "grad_norm": 0.35192206501960754, + "learning_rate": 5.4795945074335806e-05, + "loss": 0.012, + "step": 9920 + }, + { + "grad_norm": 0.32029780745506287, + "learning_rate": 5.471364625986657e-05, + "loss": 0.0142, + "step": 9930 + }, + { + "grad_norm": 0.2275260090827942, + "learning_rate": 5.463133455846845e-05, + "loss": 0.0124, + "step": 9940 + }, + { + "grad_norm": 0.28003349900245667, + "learning_rate": 5.4549010195178505e-05, + "loss": 0.0182, + "step": 9950 + }, + { + "grad_norm": 0.3532939553260803, + "learning_rate": 5.446667339506838e-05, + "loss": 0.0128, + "step": 9960 + }, + { + "grad_norm": 0.33111679553985596, + "learning_rate": 5.4384324383243756e-05, + "loss": 0.0122, + "step": 9970 + }, + { + "grad_norm": 0.20867730677127838, + "learning_rate": 5.430196338484368e-05, + "loss": 0.0108, + "step": 9980 + }, + { + "grad_norm": 0.2819509506225586, + "learning_rate": 5.4219590625039975e-05, + "loss": 0.0126, + "step": 9990 + }, + { + "grad_norm": 0.31301000714302063, + "learning_rate": 5.413720632903664e-05, + "loss": 0.0119, + "step": 10000 + }, + { + "grad_norm": 0.22655394673347473, + "learning_rate": 5.405481072206917e-05, + "loss": 0.0088, + "step": 10010 + }, + { + "grad_norm": 0.3158484697341919, + "learning_rate": 5.397240402940402e-05, + "loss": 0.0103, + "step": 10020 + }, + { + "grad_norm": 0.3107994794845581, + "learning_rate": 5.388998647633794e-05, + "loss": 0.0132, + "step": 10030 + }, + { + "grad_norm": 0.30526578426361084, + "learning_rate": 5.380755828819737e-05, + "loss": 0.0139, + "step": 10040 + }, + { + "grad_norm": 0.31946560740470886, + "learning_rate": 5.3725119690337846e-05, + "loss": 0.015, + "step": 10050 + }, + { + "grad_norm": 0.3265150189399719, + "learning_rate": 5.3642670908143324e-05, + "loss": 0.0137, + "step": 10060 + }, + { + "grad_norm": 0.2688096761703491, + "learning_rate": 5.356021216702562e-05, + "loss": 0.0134, + "step": 10070 + }, + { + "grad_norm": 1.068115472793579, + "learning_rate": 5.347774369242381e-05, + "loss": 0.0219, + "step": 10080 + }, + { + "grad_norm": 0.38739392161369324, + "learning_rate": 5.3395265709803545e-05, + "loss": 0.0181, + "step": 10090 + }, + { + "grad_norm": 0.33883070945739746, + "learning_rate": 5.331277844465647e-05, + "loss": 0.0136, + "step": 10100 + }, + { + "grad_norm": 0.23752360045909882, + "learning_rate": 5.323028212249963e-05, + "loss": 0.0107, + "step": 10110 + }, + { + "grad_norm": 0.31511190533638, + "learning_rate": 5.314777696887481e-05, + "loss": 0.0129, + "step": 10120 + }, + { + "grad_norm": 0.28592154383659363, + "learning_rate": 5.306526320934796e-05, + "loss": 0.0139, + "step": 10130 + }, + { + "grad_norm": 0.27013254165649414, + "learning_rate": 5.298274106950854e-05, + "loss": 0.013, + "step": 10140 + }, + { + "grad_norm": 0.3462916314601898, + "learning_rate": 5.290021077496893e-05, + "loss": 0.0139, + "step": 10150 + }, + { + "grad_norm": 0.2938562333583832, + "learning_rate": 5.2817672551363816e-05, + "loss": 0.0148, + "step": 10160 + }, + { + "grad_norm": 0.30084943771362305, + "learning_rate": 5.273512662434952e-05, + "loss": 0.015, + "step": 10170 + }, + { + "grad_norm": 0.36582499742507935, + "learning_rate": 5.265257321960349e-05, + "loss": 0.0167, + "step": 10180 + }, + { + "grad_norm": 0.2982017695903778, + "learning_rate": 5.257001256282357e-05, + "loss": 0.0142, + "step": 10190 + }, + { + "grad_norm": 0.3169742822647095, + "learning_rate": 5.248744487972742e-05, + "loss": 0.0111, + "step": 10200 + }, + { + "grad_norm": 0.3239147365093231, + "learning_rate": 5.240487039605196e-05, + "loss": 0.012, + "step": 10210 + }, + { + "grad_norm": 0.3795771598815918, + "learning_rate": 5.232228933755267e-05, + "loss": 0.0116, + "step": 10220 + }, + { + "grad_norm": 0.3357098698616028, + "learning_rate": 5.2239701930003006e-05, + "loss": 0.0118, + "step": 10230 + }, + { + "grad_norm": 0.2874980866909027, + "learning_rate": 5.215710839919379e-05, + "loss": 0.0132, + "step": 10240 + }, + { + "grad_norm": 0.35038667917251587, + "learning_rate": 5.207450897093257e-05, + "loss": 0.0103, + "step": 10250 + }, + { + "grad_norm": 0.26190340518951416, + "learning_rate": 5.1991903871043046e-05, + "loss": 0.0143, + "step": 10260 + }, + { + "grad_norm": 0.28734883666038513, + "learning_rate": 5.190929332536439e-05, + "loss": 0.0084, + "step": 10270 + }, + { + "grad_norm": 0.38185805082321167, + "learning_rate": 5.182667755975071e-05, + "loss": 0.0072, + "step": 10280 + }, + { + "grad_norm": 0.25452402234077454, + "learning_rate": 5.1744056800070315e-05, + "loss": 0.0068, + "step": 10290 + }, + { + "grad_norm": 0.2729600667953491, + "learning_rate": 5.166143127220524e-05, + "loss": 0.0095, + "step": 10300 + }, + { + "grad_norm": 0.29867124557495117, + "learning_rate": 5.1578801202050485e-05, + "loss": 0.0088, + "step": 10310 + }, + { + "grad_norm": 0.21836042404174805, + "learning_rate": 5.149616681551355e-05, + "loss": 0.0097, + "step": 10320 + }, + { + "grad_norm": 0.18065130710601807, + "learning_rate": 5.141352833851367e-05, + "loss": 0.0112, + "step": 10330 + }, + { + "grad_norm": 0.32105210423469543, + "learning_rate": 5.1330885996981285e-05, + "loss": 0.0093, + "step": 10340 + }, + { + "grad_norm": 0.2557836174964905, + "learning_rate": 5.124824001685741e-05, + "loss": 0.0129, + "step": 10350 + }, + { + "grad_norm": 0.3318512737751007, + "learning_rate": 5.116559062409298e-05, + "loss": 0.0094, + "step": 10360 + }, + { + "grad_norm": 0.23337747156620026, + "learning_rate": 5.10829380446483e-05, + "loss": 0.0082, + "step": 10370 + }, + { + "grad_norm": 0.2705354690551758, + "learning_rate": 5.100028250449235e-05, + "loss": 0.009, + "step": 10380 + }, + { + "grad_norm": 0.3192671239376068, + "learning_rate": 5.0917624229602234e-05, + "loss": 0.0096, + "step": 10390 + }, + { + "grad_norm": 0.2905829846858978, + "learning_rate": 5.0834963445962524e-05, + "loss": 0.0113, + "step": 10400 + }, + { + "grad_norm": 0.265475332736969, + "learning_rate": 5.075230037956461e-05, + "loss": 0.0112, + "step": 10410 + }, + { + "grad_norm": 0.24429795145988464, + "learning_rate": 5.0669635256406213e-05, + "loss": 0.0079, + "step": 10420 + }, + { + "grad_norm": 0.21453680098056793, + "learning_rate": 5.058696830249058e-05, + "loss": 0.0096, + "step": 10430 + }, + { + "grad_norm": 0.2631129324436188, + "learning_rate": 5.050429974382602e-05, + "loss": 0.0077, + "step": 10440 + }, + { + "grad_norm": 0.38374248147010803, + "learning_rate": 5.042162980642523e-05, + "loss": 0.004, + "step": 10450 + }, + { + "grad_norm": 0.29487717151641846, + "learning_rate": 5.033895871630462e-05, + "loss": 0.0047, + "step": 10460 + }, + { + "grad_norm": 0.2708784341812134, + "learning_rate": 5.025628669948386e-05, + "loss": 0.0044, + "step": 10470 + }, + { + "grad_norm": 0.3040958642959595, + "learning_rate": 5.017361398198502e-05, + "loss": 0.0061, + "step": 10480 + }, + { + "grad_norm": 0.2770012617111206, + "learning_rate": 5.009094078983221e-05, + "loss": 0.0083, + "step": 10490 + }, + { + "grad_norm": 0.42853882908821106, + "learning_rate": 5.000826734905073e-05, + "loss": 0.0096, + "step": 10500 + }, + { + "grad_norm": 0.335793673992157, + "learning_rate": 4.9925593885666645e-05, + "loss": 0.0094, + "step": 10510 + }, + { + "grad_norm": 0.2871567904949188, + "learning_rate": 4.984292062570602e-05, + "loss": 0.0101, + "step": 10520 + }, + { + "grad_norm": 0.2691814601421356, + "learning_rate": 4.976024779519442e-05, + "loss": 0.0091, + "step": 10530 + }, + { + "grad_norm": 0.26194360852241516, + "learning_rate": 4.9677575620156194e-05, + "loss": 0.0128, + "step": 10540 + }, + { + "grad_norm": 0.281966894865036, + "learning_rate": 4.959490432661391e-05, + "loss": 0.0131, + "step": 10550 + }, + { + "grad_norm": 0.2620058059692383, + "learning_rate": 4.9512234140587726e-05, + "loss": 0.0104, + "step": 10560 + }, + { + "grad_norm": 0.3251452147960663, + "learning_rate": 4.942956528809477e-05, + "loss": 0.0115, + "step": 10570 + }, + { + "grad_norm": 0.2524576187133789, + "learning_rate": 4.934689799514854e-05, + "loss": 0.0067, + "step": 10580 + }, + { + "grad_norm": 0.23417805135250092, + "learning_rate": 4.926423248775827e-05, + "loss": 0.0086, + "step": 10590 + }, + { + "grad_norm": 0.3511345386505127, + "learning_rate": 4.918156899192826e-05, + "loss": 0.0092, + "step": 10600 + }, + { + "grad_norm": 0.28415340185165405, + "learning_rate": 4.909890773365738e-05, + "loss": 0.0143, + "step": 10610 + }, + { + "grad_norm": 0.40072086453437805, + "learning_rate": 4.9016248938938344e-05, + "loss": 0.0165, + "step": 10620 + }, + { + "grad_norm": 0.43341389298439026, + "learning_rate": 4.8933592833757156e-05, + "loss": 0.0121, + "step": 10630 + }, + { + "grad_norm": 0.37757453322410583, + "learning_rate": 4.8850939644092435e-05, + "loss": 0.0139, + "step": 10640 + }, + { + "grad_norm": 0.23439866304397583, + "learning_rate": 4.876828959591485e-05, + "loss": 0.0063, + "step": 10650 + }, + { + "grad_norm": 0.2898266911506653, + "learning_rate": 4.8685642915186474e-05, + "loss": 0.007, + "step": 10660 + }, + { + "grad_norm": 0.2527606189250946, + "learning_rate": 4.860299982786018e-05, + "loss": 0.0062, + "step": 10670 + }, + { + "grad_norm": 0.26961687207221985, + "learning_rate": 4.852036055987901e-05, + "loss": 0.0051, + "step": 10680 + }, + { + "grad_norm": 0.2623964846134186, + "learning_rate": 4.843772533717558e-05, + "loss": 0.0038, + "step": 10690 + }, + { + "grad_norm": 0.19879719614982605, + "learning_rate": 4.835509438567142e-05, + "loss": 0.0045, + "step": 10700 + }, + { + "grad_norm": 0.24892987310886383, + "learning_rate": 4.827246793127639e-05, + "loss": 0.0037, + "step": 10710 + }, + { + "grad_norm": 0.35000476241111755, + "learning_rate": 4.818984619988807e-05, + "loss": 0.0123, + "step": 10720 + }, + { + "grad_norm": 0.33420488238334656, + "learning_rate": 4.810722941739115e-05, + "loss": 0.0161, + "step": 10730 + }, + { + "grad_norm": 0.32986903190612793, + "learning_rate": 4.8024617809656684e-05, + "loss": 0.0142, + "step": 10740 + }, + { + "grad_norm": 0.3339530825614929, + "learning_rate": 4.794201160254171e-05, + "loss": 0.0142, + "step": 10750 + }, + { + "grad_norm": 0.25733357667922974, + "learning_rate": 4.785941102188844e-05, + "loss": 0.013, + "step": 10760 + }, + { + "grad_norm": 0.3128061592578888, + "learning_rate": 4.7776816293523686e-05, + "loss": 0.011, + "step": 10770 + }, + { + "grad_norm": 0.3187994956970215, + "learning_rate": 4.769422764325832e-05, + "loss": 0.0087, + "step": 10780 + }, + { + "grad_norm": 0.3972649574279785, + "learning_rate": 4.76116452968865e-05, + "loss": 0.0126, + "step": 10790 + }, + { + "grad_norm": 0.2713838815689087, + "learning_rate": 4.752906948018525e-05, + "loss": 0.0098, + "step": 10800 + }, + { + "grad_norm": 0.2532338500022888, + "learning_rate": 4.7446500418913684e-05, + "loss": 0.0058, + "step": 10810 + }, + { + "grad_norm": 0.2768974006175995, + "learning_rate": 4.736393833881247e-05, + "loss": 0.0069, + "step": 10820 + }, + { + "grad_norm": 0.28683599829673767, + "learning_rate": 4.7281383465603194e-05, + "loss": 0.0072, + "step": 10830 + }, + { + "grad_norm": 0.36243143677711487, + "learning_rate": 4.71988360249877e-05, + "loss": 0.0095, + "step": 10840 + }, + { + "grad_norm": 0.27696338295936584, + "learning_rate": 4.7116296242647554e-05, + "loss": 0.008, + "step": 10850 + }, + { + "grad_norm": 0.27943673729896545, + "learning_rate": 4.703376434424336e-05, + "loss": 0.0102, + "step": 10860 + }, + { + "grad_norm": 0.3049796521663666, + "learning_rate": 4.695124055541421e-05, + "loss": 0.0107, + "step": 10870 + }, + { + "grad_norm": 0.2998720705509186, + "learning_rate": 4.6868725101776934e-05, + "loss": 0.0062, + "step": 10880 + }, + { + "grad_norm": 0.3271496891975403, + "learning_rate": 4.678621820892567e-05, + "loss": 0.0075, + "step": 10890 + }, + { + "grad_norm": 0.23435017466545105, + "learning_rate": 4.670372010243111e-05, + "loss": 0.0068, + "step": 10900 + }, + { + "grad_norm": 0.2723783552646637, + "learning_rate": 4.662123100783992e-05, + "loss": 0.0078, + "step": 10910 + }, + { + "grad_norm": 0.24249881505966187, + "learning_rate": 4.653875115067415e-05, + "loss": 0.0073, + "step": 10920 + }, + { + "grad_norm": 0.268113374710083, + "learning_rate": 4.6456280756430545e-05, + "loss": 0.0076, + "step": 10930 + }, + { + "grad_norm": 0.2593485414981842, + "learning_rate": 4.637382005058004e-05, + "loss": 0.0072, + "step": 10940 + }, + { + "grad_norm": 0.24902157485485077, + "learning_rate": 4.629136925856705e-05, + "loss": 0.007, + "step": 10950 + }, + { + "grad_norm": 0.24148914217948914, + "learning_rate": 4.6208928605808895e-05, + "loss": 0.0066, + "step": 10960 + }, + { + "grad_norm": 0.2137160748243332, + "learning_rate": 4.612649831769519e-05, + "loss": 0.0053, + "step": 10970 + }, + { + "grad_norm": 0.2593444883823395, + "learning_rate": 4.604407861958715e-05, + "loss": 0.0076, + "step": 10980 + }, + { + "grad_norm": 0.35889652371406555, + "learning_rate": 4.5961669736817114e-05, + "loss": 0.01, + "step": 10990 + }, + { + "grad_norm": 0.23938077688217163, + "learning_rate": 4.5879271894687814e-05, + "loss": 0.0121, + "step": 11000 + }, + { + "grad_norm": 0.26064932346343994, + "learning_rate": 4.5796885318471826e-05, + "loss": 0.0096, + "step": 11010 + }, + { + "grad_norm": 0.21850089728832245, + "learning_rate": 4.571451023341086e-05, + "loss": 0.0101, + "step": 11020 + }, + { + "grad_norm": 0.4301821291446686, + "learning_rate": 4.563214686471527e-05, + "loss": 0.0135, + "step": 11030 + }, + { + "grad_norm": 0.2709929943084717, + "learning_rate": 4.5549795437563365e-05, + "loss": 0.011, + "step": 11040 + }, + { + "grad_norm": 0.3040259778499603, + "learning_rate": 4.546745617710081e-05, + "loss": 0.0121, + "step": 11050 + }, + { + "grad_norm": 0.2754867672920227, + "learning_rate": 4.5385129308440014e-05, + "loss": 0.0121, + "step": 11060 + }, + { + "grad_norm": 0.25341928005218506, + "learning_rate": 4.530281505665944e-05, + "loss": 0.011, + "step": 11070 + }, + { + "grad_norm": 0.3000570833683014, + "learning_rate": 4.5220513646803134e-05, + "loss": 0.0127, + "step": 11080 + }, + { + "grad_norm": 0.2561561167240143, + "learning_rate": 4.513822530388003e-05, + "loss": 0.0113, + "step": 11090 + }, + { + "grad_norm": 0.2941455841064453, + "learning_rate": 4.5055950252863296e-05, + "loss": 0.0108, + "step": 11100 + }, + { + "grad_norm": 0.2674684524536133, + "learning_rate": 4.4973688718689803e-05, + "loss": 0.0082, + "step": 11110 + }, + { + "grad_norm": 0.25140663981437683, + "learning_rate": 4.4891440926259406e-05, + "loss": 0.0084, + "step": 11120 + }, + { + "grad_norm": 0.2520742416381836, + "learning_rate": 4.480920710043443e-05, + "loss": 0.0092, + "step": 11130 + }, + { + "grad_norm": 0.20016080141067505, + "learning_rate": 4.4726987466039044e-05, + "loss": 0.0094, + "step": 11140 + }, + { + "grad_norm": 0.2580920457839966, + "learning_rate": 4.46447822478586e-05, + "loss": 0.0136, + "step": 11150 + }, + { + "grad_norm": 0.25687864422798157, + "learning_rate": 4.4562591670638974e-05, + "loss": 0.0071, + "step": 11160 + }, + { + "grad_norm": 0.29451465606689453, + "learning_rate": 4.4480415959086105e-05, + "loss": 0.0077, + "step": 11170 + }, + { + "grad_norm": 0.3275420367717743, + "learning_rate": 4.439825533786522e-05, + "loss": 0.0097, + "step": 11180 + }, + { + "grad_norm": 0.40727728605270386, + "learning_rate": 4.431611003160035e-05, + "loss": 0.0081, + "step": 11190 + }, + { + "grad_norm": 0.21377375721931458, + "learning_rate": 4.4233980264873636e-05, + "loss": 0.0065, + "step": 11200 + }, + { + "grad_norm": 0.25532540678977966, + "learning_rate": 4.4151866262224684e-05, + "loss": 0.0085, + "step": 11210 + }, + { + "grad_norm": 0.29716506600379944, + "learning_rate": 4.406976824815006e-05, + "loss": 0.0073, + "step": 11220 + }, + { + "grad_norm": 0.23029808700084686, + "learning_rate": 4.3987686447102595e-05, + "loss": 0.0102, + "step": 11230 + }, + { + "grad_norm": 0.23292392492294312, + "learning_rate": 4.3905621083490804e-05, + "loss": 0.0078, + "step": 11240 + }, + { + "grad_norm": 0.23699244856834412, + "learning_rate": 4.3823572381678286e-05, + "loss": 0.0096, + "step": 11250 + }, + { + "grad_norm": 0.2723095118999481, + "learning_rate": 4.374154056598301e-05, + "loss": 0.0107, + "step": 11260 + }, + { + "grad_norm": 0.28357529640197754, + "learning_rate": 4.3659525860676845e-05, + "loss": 0.0079, + "step": 11270 + }, + { + "grad_norm": 0.2124902755022049, + "learning_rate": 4.3577528489984854e-05, + "loss": 0.0076, + "step": 11280 + }, + { + "grad_norm": 0.23794962465763092, + "learning_rate": 4.349554867808476e-05, + "loss": 0.0074, + "step": 11290 + }, + { + "grad_norm": 0.20382577180862427, + "learning_rate": 4.34135866491062e-05, + "loss": 0.008, + "step": 11300 + }, + { + "grad_norm": 0.2991105318069458, + "learning_rate": 4.333164262713022e-05, + "loss": 0.011, + "step": 11310 + }, + { + "grad_norm": 0.2704513967037201, + "learning_rate": 4.324971683618868e-05, + "loss": 0.0076, + "step": 11320 + }, + { + "grad_norm": 0.23047420382499695, + "learning_rate": 4.316780950026354e-05, + "loss": 0.0093, + "step": 11330 + }, + { + "grad_norm": 0.29185932874679565, + "learning_rate": 4.308592084328637e-05, + "loss": 0.0142, + "step": 11340 + }, + { + "grad_norm": 0.3400135636329651, + "learning_rate": 4.3004051089137576e-05, + "loss": 0.013, + "step": 11350 + }, + { + "grad_norm": 0.24304379522800446, + "learning_rate": 4.292220046164597e-05, + "loss": 0.0099, + "step": 11360 + }, + { + "grad_norm": 0.18561001121997833, + "learning_rate": 4.2840369184588035e-05, + "loss": 0.0102, + "step": 11370 + }, + { + "grad_norm": 0.3126794099807739, + "learning_rate": 4.2758557481687345e-05, + "loss": 0.0122, + "step": 11380 + }, + { + "grad_norm": 0.34233587980270386, + "learning_rate": 4.267676557661403e-05, + "loss": 0.0125, + "step": 11390 + }, + { + "grad_norm": 0.21703346073627472, + "learning_rate": 4.2594993692983955e-05, + "loss": 0.0114, + "step": 11400 + }, + { + "grad_norm": 0.3167710602283478, + "learning_rate": 4.251324205435837e-05, + "loss": 0.0133, + "step": 11410 + }, + { + "grad_norm": 0.31768539547920227, + "learning_rate": 4.243151088424312e-05, + "loss": 0.0157, + "step": 11420 + }, + { + "grad_norm": 0.2940398156642914, + "learning_rate": 4.234980040608813e-05, + "loss": 0.0164, + "step": 11430 + }, + { + "grad_norm": 0.3499918282032013, + "learning_rate": 4.22681108432867e-05, + "loss": 0.015, + "step": 11440 + }, + { + "grad_norm": 0.31397590041160583, + "learning_rate": 4.2186442419174984e-05, + "loss": 0.0112, + "step": 11450 + }, + { + "grad_norm": 0.20803068578243256, + "learning_rate": 4.210479535703133e-05, + "loss": 0.0101, + "step": 11460 + }, + { + "grad_norm": 0.16176341474056244, + "learning_rate": 4.202316988007567e-05, + "loss": 0.0089, + "step": 11470 + }, + { + "grad_norm": 0.20148533582687378, + "learning_rate": 4.194156621146901e-05, + "loss": 0.0095, + "step": 11480 + }, + { + "grad_norm": 0.24103689193725586, + "learning_rate": 4.1859984574312596e-05, + "loss": 0.0097, + "step": 11490 + }, + { + "grad_norm": 0.3739573657512665, + "learning_rate": 4.177842519164752e-05, + "loss": 0.015, + "step": 11500 + }, + { + "grad_norm": 0.256573885679245, + "learning_rate": 4.169688828645404e-05, + "loss": 0.0112, + "step": 11510 + }, + { + "grad_norm": 0.2760864496231079, + "learning_rate": 4.161537408165092e-05, + "loss": 0.0123, + "step": 11520 + }, + { + "grad_norm": 0.34800949692726135, + "learning_rate": 4.1533882800094924e-05, + "loss": 0.0117, + "step": 11530 + }, + { + "grad_norm": 0.33044150471687317, + "learning_rate": 4.145241466458005e-05, + "loss": 0.0143, + "step": 11540 + }, + { + "grad_norm": 0.27878257632255554, + "learning_rate": 4.13709698978371e-05, + "loss": 0.0112, + "step": 11550 + }, + { + "grad_norm": 0.33194226026535034, + "learning_rate": 4.1289548722532944e-05, + "loss": 0.0161, + "step": 11560 + }, + { + "grad_norm": 0.3015202283859253, + "learning_rate": 4.120815136126999e-05, + "loss": 0.0129, + "step": 11570 + }, + { + "grad_norm": 0.21716511249542236, + "learning_rate": 4.112677803658548e-05, + "loss": 0.0062, + "step": 11580 + }, + { + "grad_norm": 0.21280543506145477, + "learning_rate": 4.1045428970951e-05, + "loss": 0.0058, + "step": 11590 + }, + { + "grad_norm": 0.2517472803592682, + "learning_rate": 4.0964104386771785e-05, + "loss": 0.0065, + "step": 11600 + }, + { + "grad_norm": 0.3086889982223511, + "learning_rate": 4.0882804506386144e-05, + "loss": 0.0088, + "step": 11610 + }, + { + "grad_norm": 0.28169530630111694, + "learning_rate": 4.080152955206485e-05, + "loss": 0.0135, + "step": 11620 + }, + { + "grad_norm": 0.26306819915771484, + "learning_rate": 4.0720279746010505e-05, + "loss": 0.008, + "step": 11630 + }, + { + "grad_norm": 0.32781359553337097, + "learning_rate": 4.063905531035699e-05, + "loss": 0.0061, + "step": 11640 + }, + { + "grad_norm": 0.26148703694343567, + "learning_rate": 4.055785646716882e-05, + "loss": 0.0061, + "step": 11650 + }, + { + "grad_norm": 0.27796825766563416, + "learning_rate": 4.047668343844051e-05, + "loss": 0.0075, + "step": 11660 + }, + { + "grad_norm": 0.2539880573749542, + "learning_rate": 4.039553644609604e-05, + "loss": 0.0064, + "step": 11670 + }, + { + "grad_norm": 0.2771129906177521, + "learning_rate": 4.0314415711988176e-05, + "loss": 0.0089, + "step": 11680 + }, + { + "grad_norm": 0.35224688053131104, + "learning_rate": 4.023332145789792e-05, + "loss": 0.0145, + "step": 11690 + }, + { + "grad_norm": 0.29217731952667236, + "learning_rate": 4.015225390553385e-05, + "loss": 0.0127, + "step": 11700 + }, + { + "grad_norm": 0.3133326768875122, + "learning_rate": 4.007121327653158e-05, + "loss": 0.009, + "step": 11710 + }, + { + "grad_norm": 0.24848033487796783, + "learning_rate": 3.9990199792453064e-05, + "loss": 0.0088, + "step": 11720 + }, + { + "grad_norm": 0.19317393004894257, + "learning_rate": 3.9909213674786103e-05, + "loss": 0.0048, + "step": 11730 + }, + { + "grad_norm": 0.1701091080904007, + "learning_rate": 3.982825514494363e-05, + "loss": 0.0079, + "step": 11740 + }, + { + "grad_norm": 0.25441068410873413, + "learning_rate": 3.974732442426319e-05, + "loss": 0.0072, + "step": 11750 + }, + { + "grad_norm": 0.25550833344459534, + "learning_rate": 3.966642173400629e-05, + "loss": 0.0085, + "step": 11760 + }, + { + "grad_norm": 0.3118865489959717, + "learning_rate": 3.9585547295357764e-05, + "loss": 0.0101, + "step": 11770 + }, + { + "grad_norm": 0.2191830724477768, + "learning_rate": 3.950470132942526e-05, + "loss": 0.0108, + "step": 11780 + }, + { + "grad_norm": 0.22503943741321564, + "learning_rate": 3.942388405723856e-05, + "loss": 0.0072, + "step": 11790 + }, + { + "grad_norm": 0.3025362193584442, + "learning_rate": 3.9343095699749e-05, + "loss": 0.0108, + "step": 11800 + }, + { + "grad_norm": 0.28737306594848633, + "learning_rate": 3.9262336477828874e-05, + "loss": 0.0091, + "step": 11810 + }, + { + "grad_norm": 0.33169567584991455, + "learning_rate": 3.9181606612270794e-05, + "loss": 0.012, + "step": 11820 + }, + { + "grad_norm": 0.25354838371276855, + "learning_rate": 3.910090632378713e-05, + "loss": 0.0113, + "step": 11830 + }, + { + "grad_norm": 0.2922271490097046, + "learning_rate": 3.90202358330094e-05, + "loss": 0.0105, + "step": 11840 + }, + { + "grad_norm": 0.19395069777965546, + "learning_rate": 3.8939595360487656e-05, + "loss": 0.0111, + "step": 11850 + }, + { + "grad_norm": 0.24549539387226105, + "learning_rate": 3.885898512668984e-05, + "loss": 0.0088, + "step": 11860 + }, + { + "grad_norm": 0.2970368564128876, + "learning_rate": 3.877840535200127e-05, + "loss": 0.0103, + "step": 11870 + }, + { + "grad_norm": 0.24673429131507874, + "learning_rate": 3.869785625672397e-05, + "loss": 0.0077, + "step": 11880 + }, + { + "grad_norm": 0.21303850412368774, + "learning_rate": 3.8617338061076094e-05, + "loss": 0.0087, + "step": 11890 + }, + { + "grad_norm": 0.2450762838125229, + "learning_rate": 3.853685098519132e-05, + "loss": 0.0134, + "step": 11900 + }, + { + "grad_norm": 0.21491692960262299, + "learning_rate": 3.845639524911823e-05, + "loss": 0.0081, + "step": 11910 + }, + { + "grad_norm": 0.2705429196357727, + "learning_rate": 3.837597107281974e-05, + "loss": 0.0108, + "step": 11920 + }, + { + "grad_norm": 0.27946725487709045, + "learning_rate": 3.829557867617247e-05, + "loss": 0.0127, + "step": 11930 + }, + { + "grad_norm": 0.29402947425842285, + "learning_rate": 3.821521827896618e-05, + "loss": 0.0107, + "step": 11940 + }, + { + "grad_norm": 0.42484575510025024, + "learning_rate": 3.81348901009031e-05, + "loss": 0.0168, + "step": 11950 + }, + { + "grad_norm": 0.29109349846839905, + "learning_rate": 3.805459436159741e-05, + "loss": 0.0136, + "step": 11960 + }, + { + "grad_norm": 0.25577837228775024, + "learning_rate": 3.797433128057461e-05, + "loss": 0.0104, + "step": 11970 + }, + { + "grad_norm": 0.18210501968860626, + "learning_rate": 3.789410107727089e-05, + "loss": 0.0104, + "step": 11980 + }, + { + "grad_norm": 0.27438291907310486, + "learning_rate": 3.781390397103257e-05, + "loss": 0.0083, + "step": 11990 + }, + { + "grad_norm": 0.2682974934577942, + "learning_rate": 3.7733740181115455e-05, + "loss": 0.0098, + "step": 12000 + }, + { + "grad_norm": 0.22834736108779907, + "learning_rate": 3.7653609926684306e-05, + "loss": 0.0145, + "step": 12010 + }, + { + "grad_norm": 0.22543223202228546, + "learning_rate": 3.757351342681217e-05, + "loss": 0.0122, + "step": 12020 + }, + { + "grad_norm": 0.3116738200187683, + "learning_rate": 3.749345090047982e-05, + "loss": 0.0121, + "step": 12030 + }, + { + "grad_norm": 0.2180703580379486, + "learning_rate": 3.741342256657515e-05, + "loss": 0.011, + "step": 12040 + }, + { + "grad_norm": 0.2077644318342209, + "learning_rate": 3.7333428643892567e-05, + "loss": 0.0075, + "step": 12050 + }, + { + "grad_norm": 0.18190646171569824, + "learning_rate": 3.725346935113239e-05, + "loss": 0.0075, + "step": 12060 + }, + { + "grad_norm": 0.2454993575811386, + "learning_rate": 3.717354490690029e-05, + "loss": 0.0088, + "step": 12070 + }, + { + "grad_norm": 0.27629631757736206, + "learning_rate": 3.709365552970664e-05, + "loss": 0.0109, + "step": 12080 + }, + { + "grad_norm": 0.2686729431152344, + "learning_rate": 3.7013801437965945e-05, + "loss": 0.0111, + "step": 12090 + }, + { + "grad_norm": 0.2243090271949768, + "learning_rate": 3.693398284999623e-05, + "loss": 0.0125, + "step": 12100 + }, + { + "grad_norm": 0.2539397180080414, + "learning_rate": 3.6854199984018484e-05, + "loss": 0.0125, + "step": 12110 + }, + { + "grad_norm": 0.28046292066574097, + "learning_rate": 3.677445305815601e-05, + "loss": 0.0144, + "step": 12120 + }, + { + "grad_norm": 0.26144519448280334, + "learning_rate": 3.669474229043387e-05, + "loss": 0.011, + "step": 12130 + }, + { + "grad_norm": 0.2847682237625122, + "learning_rate": 3.6615067898778235e-05, + "loss": 0.008, + "step": 12140 + }, + { + "grad_norm": 0.2857397794723511, + "learning_rate": 3.6535430101015866e-05, + "loss": 0.0096, + "step": 12150 + }, + { + "grad_norm": 0.2576258182525635, + "learning_rate": 3.645582911487345e-05, + "loss": 0.0129, + "step": 12160 + }, + { + "grad_norm": 0.3283599317073822, + "learning_rate": 3.637626515797706e-05, + "loss": 0.0137, + "step": 12170 + }, + { + "grad_norm": 0.2969585955142975, + "learning_rate": 3.629673844785152e-05, + "loss": 0.011, + "step": 12180 + }, + { + "grad_norm": 0.19169481098651886, + "learning_rate": 3.621724920191979e-05, + "loss": 0.0111, + "step": 12190 + }, + { + "grad_norm": 0.37894049286842346, + "learning_rate": 3.6137797637502444e-05, + "loss": 0.0141, + "step": 12200 + }, + { + "grad_norm": 0.2613828480243683, + "learning_rate": 3.6058383971817035e-05, + "loss": 0.0097, + "step": 12210 + }, + { + "grad_norm": 0.3283528983592987, + "learning_rate": 3.59790084219775e-05, + "loss": 0.0109, + "step": 12220 + }, + { + "grad_norm": 0.32406893372535706, + "learning_rate": 3.589967120499353e-05, + "loss": 0.0098, + "step": 12230 + }, + { + "grad_norm": 0.29020050168037415, + "learning_rate": 3.5820372537770075e-05, + "loss": 0.0089, + "step": 12240 + }, + { + "grad_norm": 0.2612827718257904, + "learning_rate": 3.5741112637106655e-05, + "loss": 0.0102, + "step": 12250 + }, + { + "grad_norm": 0.18452972173690796, + "learning_rate": 3.5661891719696804e-05, + "loss": 0.0075, + "step": 12260 + }, + { + "grad_norm": 0.2665806710720062, + "learning_rate": 3.5582710002127504e-05, + "loss": 0.0104, + "step": 12270 + }, + { + "grad_norm": 0.23523855209350586, + "learning_rate": 3.550356770087853e-05, + "loss": 0.0088, + "step": 12280 + }, + { + "grad_norm": 0.2624225616455078, + "learning_rate": 3.5424465032321914e-05, + "loss": 0.0084, + "step": 12290 + }, + { + "grad_norm": 0.26257362961769104, + "learning_rate": 3.5345402212721335e-05, + "loss": 0.009, + "step": 12300 + }, + { + "grad_norm": 0.2270408272743225, + "learning_rate": 3.526637945823152e-05, + "loss": 0.0092, + "step": 12310 + }, + { + "grad_norm": 0.24489372968673706, + "learning_rate": 3.518739698489767e-05, + "loss": 0.0062, + "step": 12320 + }, + { + "grad_norm": 0.2051270753145218, + "learning_rate": 3.510845500865485e-05, + "loss": 0.0063, + "step": 12330 + }, + { + "grad_norm": 0.2630171477794647, + "learning_rate": 3.502955374532739e-05, + "loss": 0.0089, + "step": 12340 + }, + { + "grad_norm": 0.22556303441524506, + "learning_rate": 3.495069341062836e-05, + "loss": 0.0064, + "step": 12350 + }, + { + "grad_norm": 0.35419902205467224, + "learning_rate": 3.4871874220158896e-05, + "loss": 0.0128, + "step": 12360 + }, + { + "grad_norm": 0.31963279843330383, + "learning_rate": 3.479309638940762e-05, + "loss": 0.0112, + "step": 12370 + }, + { + "grad_norm": 0.27576276659965515, + "learning_rate": 3.4714360133750146e-05, + "loss": 0.0114, + "step": 12380 + }, + { + "grad_norm": 0.2895192503929138, + "learning_rate": 3.463566566844839e-05, + "loss": 0.0098, + "step": 12390 + }, + { + "grad_norm": 0.3528740406036377, + "learning_rate": 3.4557013208650016e-05, + "loss": 0.0119, + "step": 12400 + }, + { + "grad_norm": 0.17280146479606628, + "learning_rate": 3.4478402969387857e-05, + "loss": 0.0072, + "step": 12410 + }, + { + "grad_norm": 0.16993805766105652, + "learning_rate": 3.4399835165579266e-05, + "loss": 0.0072, + "step": 12420 + }, + { + "grad_norm": 0.24575120210647583, + "learning_rate": 3.4321310012025645e-05, + "loss": 0.0065, + "step": 12430 + }, + { + "grad_norm": 0.23696482181549072, + "learning_rate": 3.424282772341176e-05, + "loss": 0.0063, + "step": 12440 + }, + { + "grad_norm": 0.2803720533847809, + "learning_rate": 3.416438851430519e-05, + "loss": 0.0123, + "step": 12450 + }, + { + "grad_norm": 0.25509169697761536, + "learning_rate": 3.408599259915577e-05, + "loss": 0.0073, + "step": 12460 + }, + { + "grad_norm": 0.2835988402366638, + "learning_rate": 3.400764019229487e-05, + "loss": 0.0101, + "step": 12470 + }, + { + "grad_norm": 0.23895034193992615, + "learning_rate": 3.3929331507935035e-05, + "loss": 0.0107, + "step": 12480 + }, + { + "grad_norm": 0.29957592487335205, + "learning_rate": 3.3851066760169196e-05, + "loss": 0.0096, + "step": 12490 + }, + { + "grad_norm": 0.2671041786670685, + "learning_rate": 3.377284616297021e-05, + "loss": 0.0087, + "step": 12500 + }, + { + "grad_norm": 0.2623506486415863, + "learning_rate": 3.3694669930190166e-05, + "loss": 0.0089, + "step": 12510 + }, + { + "grad_norm": 0.27085670828819275, + "learning_rate": 3.36165382755599e-05, + "loss": 0.0092, + "step": 12520 + }, + { + "grad_norm": 0.2244659960269928, + "learning_rate": 3.35384514126884e-05, + "loss": 0.0115, + "step": 12530 + }, + { + "grad_norm": 0.21095892786979675, + "learning_rate": 3.3460409555062154e-05, + "loss": 0.0097, + "step": 12540 + }, + { + "grad_norm": 0.28782668709754944, + "learning_rate": 3.3382412916044645e-05, + "loss": 0.0113, + "step": 12550 + }, + { + "grad_norm": 0.2779823839664459, + "learning_rate": 3.330446170887566e-05, + "loss": 0.0097, + "step": 12560 + }, + { + "grad_norm": 0.22629587352275848, + "learning_rate": 3.3226556146670834e-05, + "loss": 0.0086, + "step": 12570 + }, + { + "grad_norm": 0.24894531071186066, + "learning_rate": 3.314869644242102e-05, + "loss": 0.0086, + "step": 12580 + }, + { + "grad_norm": 0.22881974279880524, + "learning_rate": 3.3070882808991674e-05, + "loss": 0.0087, + "step": 12590 + }, + { + "grad_norm": 0.2523793876171112, + "learning_rate": 3.2993115459122305e-05, + "loss": 0.0087, + "step": 12600 + }, + { + "grad_norm": 0.234300434589386, + "learning_rate": 3.2915394605425835e-05, + "loss": 0.0078, + "step": 12610 + }, + { + "grad_norm": 0.22297240793704987, + "learning_rate": 3.283772046038816e-05, + "loss": 0.0081, + "step": 12620 + }, + { + "grad_norm": 0.2139202505350113, + "learning_rate": 3.276009323636739e-05, + "loss": 0.0073, + "step": 12630 + }, + { + "grad_norm": 0.25581812858581543, + "learning_rate": 3.268251314559344e-05, + "loss": 0.0077, + "step": 12640 + }, + { + "grad_norm": 0.2765572965145111, + "learning_rate": 3.2604980400167254e-05, + "loss": 0.0071, + "step": 12650 + }, + { + "grad_norm": 0.1986003816127777, + "learning_rate": 3.252749521206042e-05, + "loss": 0.0063, + "step": 12660 + }, + { + "grad_norm": 0.21367865800857544, + "learning_rate": 3.2450057793114494e-05, + "loss": 0.0079, + "step": 12670 + }, + { + "grad_norm": 0.2364194542169571, + "learning_rate": 3.2372668355040435e-05, + "loss": 0.007, + "step": 12680 + }, + { + "grad_norm": 0.2297334372997284, + "learning_rate": 3.2295327109418005e-05, + "loss": 0.0072, + "step": 12690 + }, + { + "grad_norm": 0.2503378093242645, + "learning_rate": 3.221803426769518e-05, + "loss": 0.0074, + "step": 12700 + }, + { + "grad_norm": 0.1948689967393875, + "learning_rate": 3.214079004118768e-05, + "loss": 0.0082, + "step": 12710 + }, + { + "grad_norm": 0.2144840508699417, + "learning_rate": 3.2063594641078234e-05, + "loss": 0.0089, + "step": 12720 + }, + { + "grad_norm": 0.22202593088150024, + "learning_rate": 3.198644827841616e-05, + "loss": 0.0064, + "step": 12730 + }, + { + "grad_norm": 0.2366345226764679, + "learning_rate": 3.1909351164116654e-05, + "loss": 0.0071, + "step": 12740 + }, + { + "grad_norm": 0.2428324669599533, + "learning_rate": 3.183230350896026e-05, + "loss": 0.0094, + "step": 12750 + }, + { + "grad_norm": 0.2547011077404022, + "learning_rate": 3.1755305523592337e-05, + "loss": 0.0091, + "step": 12760 + }, + { + "grad_norm": 0.2759574055671692, + "learning_rate": 3.167835741852245e-05, + "loss": 0.0096, + "step": 12770 + }, + { + "grad_norm": 0.25063660740852356, + "learning_rate": 3.160145940412378e-05, + "loss": 0.0109, + "step": 12780 + }, + { + "grad_norm": 0.23081599175930023, + "learning_rate": 3.1524611690632545e-05, + "loss": 0.0037, + "step": 12790 + }, + { + "grad_norm": 0.16078658401966095, + "learning_rate": 3.144781448814746e-05, + "loss": 0.0032, + "step": 12800 + }, + { + "grad_norm": 0.41102102398872375, + "learning_rate": 3.1371068006629145e-05, + "loss": 0.0029, + "step": 12810 + }, + { + "grad_norm": 0.2575628459453583, + "learning_rate": 3.129437245589956e-05, + "loss": 0.0055, + "step": 12820 + }, + { + "grad_norm": 0.26878824830055237, + "learning_rate": 3.121772804564143e-05, + "loss": 0.0085, + "step": 12830 + }, + { + "grad_norm": 0.24412623047828674, + "learning_rate": 3.11411349853976e-05, + "loss": 0.0096, + "step": 12840 + }, + { + "grad_norm": 0.25293606519699097, + "learning_rate": 3.10645934845706e-05, + "loss": 0.0086, + "step": 12850 + }, + { + "grad_norm": 0.29647108912467957, + "learning_rate": 3.098810375242196e-05, + "loss": 0.0092, + "step": 12860 + }, + { + "grad_norm": 0.21004535257816315, + "learning_rate": 3.0911665998071704e-05, + "loss": 0.0053, + "step": 12870 + }, + { + "grad_norm": 0.18413180112838745, + "learning_rate": 3.083528043049774e-05, + "loss": 0.0043, + "step": 12880 + }, + { + "grad_norm": 0.2547227144241333, + "learning_rate": 3.0758947258535255e-05, + "loss": 0.0056, + "step": 12890 + }, + { + "grad_norm": 0.27488479018211365, + "learning_rate": 3.068266669087625e-05, + "loss": 0.0052, + "step": 12900 + }, + { + "grad_norm": 0.28182438015937805, + "learning_rate": 3.060643893606887e-05, + "loss": 0.0044, + "step": 12910 + }, + { + "grad_norm": 0.22420568764209747, + "learning_rate": 3.053026420251693e-05, + "loss": 0.0048, + "step": 12920 + }, + { + "grad_norm": 0.21663665771484375, + "learning_rate": 3.0454142698479183e-05, + "loss": 0.0049, + "step": 12930 + }, + { + "grad_norm": 0.2472330629825592, + "learning_rate": 3.0378074632068954e-05, + "loss": 0.0071, + "step": 12940 + }, + { + "grad_norm": 0.25183096528053284, + "learning_rate": 3.0302060211253408e-05, + "loss": 0.0079, + "step": 12950 + }, + { + "grad_norm": 0.2250777781009674, + "learning_rate": 3.0226099643853073e-05, + "loss": 0.0094, + "step": 12960 + }, + { + "grad_norm": 0.23961250483989716, + "learning_rate": 3.0150193137541283e-05, + "loss": 0.0088, + "step": 12970 + }, + { + "grad_norm": 0.25967711210250854, + "learning_rate": 3.0074340899843467e-05, + "loss": 0.0108, + "step": 12980 + }, + { + "grad_norm": 0.28399622440338135, + "learning_rate": 2.999854313813677e-05, + "loss": 0.0157, + "step": 12990 + }, + { + "grad_norm": 0.36137092113494873, + "learning_rate": 2.9922800059649382e-05, + "loss": 0.0124, + "step": 13000 + }, + { + "grad_norm": 0.3184651732444763, + "learning_rate": 2.9847111871459976e-05, + "loss": 0.0147, + "step": 13010 + }, + { + "grad_norm": 0.2688327133655548, + "learning_rate": 2.977147878049721e-05, + "loss": 0.0121, + "step": 13020 + }, + { + "grad_norm": 0.22611524164676666, + "learning_rate": 2.9695900993539006e-05, + "loss": 0.0094, + "step": 13030 + }, + { + "grad_norm": 0.26716893911361694, + "learning_rate": 2.9620378717212183e-05, + "loss": 0.0142, + "step": 13040 + }, + { + "grad_norm": 0.26017555594444275, + "learning_rate": 2.9544912157991745e-05, + "loss": 0.0107, + "step": 13050 + }, + { + "grad_norm": 0.31374025344848633, + "learning_rate": 2.9469501522200405e-05, + "loss": 0.0077, + "step": 13060 + }, + { + "grad_norm": 0.23381245136260986, + "learning_rate": 2.9394147016007946e-05, + "loss": 0.0086, + "step": 13070 + }, + { + "grad_norm": 0.3111151158809662, + "learning_rate": 2.9318848845430702e-05, + "loss": 0.0072, + "step": 13080 + }, + { + "grad_norm": 0.26319414377212524, + "learning_rate": 2.9243607216331013e-05, + "loss": 0.0099, + "step": 13090 + }, + { + "grad_norm": 0.1965612769126892, + "learning_rate": 2.916842233441661e-05, + "loss": 0.0084, + "step": 13100 + }, + { + "grad_norm": 0.38237646222114563, + "learning_rate": 2.90932944052401e-05, + "loss": 0.0112, + "step": 13110 + }, + { + "grad_norm": 0.2099912017583847, + "learning_rate": 2.9018223634198354e-05, + "loss": 0.0082, + "step": 13120 + }, + { + "grad_norm": 0.22441068291664124, + "learning_rate": 2.8943210226532025e-05, + "loss": 0.0095, + "step": 13130 + }, + { + "grad_norm": 0.23737552762031555, + "learning_rate": 2.8868254387324857e-05, + "loss": 0.0087, + "step": 13140 + }, + { + "grad_norm": 0.2464762032032013, + "learning_rate": 2.8793356321503306e-05, + "loss": 0.0071, + "step": 13150 + }, + { + "grad_norm": 0.17072263360023499, + "learning_rate": 2.87185162338358e-05, + "loss": 0.0056, + "step": 13160 + }, + { + "grad_norm": 0.19708947837352753, + "learning_rate": 2.8643734328932253e-05, + "loss": 0.0072, + "step": 13170 + }, + { + "grad_norm": 0.24157832562923431, + "learning_rate": 2.856901081124359e-05, + "loss": 0.0049, + "step": 13180 + }, + { + "grad_norm": 0.2708200216293335, + "learning_rate": 2.8494345885061002e-05, + "loss": 0.0071, + "step": 13190 + }, + { + "grad_norm": 0.2672256827354431, + "learning_rate": 2.8419739754515616e-05, + "loss": 0.0066, + "step": 13200 + }, + { + "grad_norm": 0.20534801483154297, + "learning_rate": 2.8345192623577666e-05, + "loss": 0.0054, + "step": 13210 + }, + { + "grad_norm": 0.20907248556613922, + "learning_rate": 2.8270704696056193e-05, + "loss": 0.0107, + "step": 13220 + }, + { + "grad_norm": 0.28192463517189026, + "learning_rate": 2.8196276175598367e-05, + "loss": 0.0115, + "step": 13230 + }, + { + "grad_norm": 0.36617594957351685, + "learning_rate": 2.8121907265688884e-05, + "loss": 0.0125, + "step": 13240 + }, + { + "grad_norm": 0.2999415695667267, + "learning_rate": 2.804759816964957e-05, + "loss": 0.0098, + "step": 13250 + }, + { + "grad_norm": 0.22433225810527802, + "learning_rate": 2.797334909063857e-05, + "loss": 0.0092, + "step": 13260 + }, + { + "grad_norm": 0.21437174081802368, + "learning_rate": 2.7899160231650056e-05, + "loss": 0.0078, + "step": 13270 + }, + { + "grad_norm": 0.21821770071983337, + "learning_rate": 2.7825031795513585e-05, + "loss": 0.0086, + "step": 13280 + }, + { + "grad_norm": 0.2077859342098236, + "learning_rate": 2.775096398489341e-05, + "loss": 0.0056, + "step": 13290 + }, + { + "grad_norm": 0.17683841288089752, + "learning_rate": 2.7676957002288163e-05, + "loss": 0.0043, + "step": 13300 + }, + { + "grad_norm": 0.2242366075515747, + "learning_rate": 2.760301105003003e-05, + "loss": 0.0028, + "step": 13310 + }, + { + "grad_norm": 0.17156025767326355, + "learning_rate": 2.752912633028446e-05, + "loss": 0.0028, + "step": 13320 + }, + { + "grad_norm": 0.1717415601015091, + "learning_rate": 2.7455303045049474e-05, + "loss": 0.0029, + "step": 13330 + }, + { + "grad_norm": 0.19086353480815887, + "learning_rate": 2.7381541396155098e-05, + "loss": 0.003, + "step": 13340 + }, + { + "grad_norm": 0.1967656910419464, + "learning_rate": 2.730784158526286e-05, + "loss": 0.0043, + "step": 13350 + }, + { + "grad_norm": 0.2486424446105957, + "learning_rate": 2.723420381386521e-05, + "loss": 0.0048, + "step": 13360 + }, + { + "grad_norm": 0.22864516079425812, + "learning_rate": 2.7160628283285018e-05, + "loss": 0.0072, + "step": 13370 + }, + { + "grad_norm": 0.2557283639907837, + "learning_rate": 2.7087115194675007e-05, + "loss": 0.0082, + "step": 13380 + }, + { + "grad_norm": 0.17404481768608093, + "learning_rate": 2.701366474901712e-05, + "loss": 0.0059, + "step": 13390 + }, + { + "grad_norm": 0.1635066568851471, + "learning_rate": 2.6940277147122085e-05, + "loss": 0.007, + "step": 13400 + }, + { + "grad_norm": 0.20646320283412933, + "learning_rate": 2.686695258962878e-05, + "loss": 0.0063, + "step": 13410 + }, + { + "grad_norm": 0.20728632807731628, + "learning_rate": 2.679369127700375e-05, + "loss": 0.0071, + "step": 13420 + }, + { + "grad_norm": 0.23233364522457123, + "learning_rate": 2.672049340954067e-05, + "loss": 0.011, + "step": 13430 + }, + { + "grad_norm": 0.27887052297592163, + "learning_rate": 2.6647359187359676e-05, + "loss": 0.0125, + "step": 13440 + }, + { + "grad_norm": 0.22890055179595947, + "learning_rate": 2.6574288810406946e-05, + "loss": 0.0094, + "step": 13450 + }, + { + "grad_norm": 0.22939376533031464, + "learning_rate": 2.6501282478454083e-05, + "loss": 0.0087, + "step": 13460 + }, + { + "grad_norm": 0.20468787848949432, + "learning_rate": 2.6428340391097618e-05, + "loss": 0.0094, + "step": 13470 + }, + { + "grad_norm": 0.306951642036438, + "learning_rate": 2.6355462747758485e-05, + "loss": 0.0054, + "step": 13480 + }, + { + "grad_norm": 0.26637208461761475, + "learning_rate": 2.6282649747681304e-05, + "loss": 0.005, + "step": 13490 + }, + { + "grad_norm": 0.2567267417907715, + "learning_rate": 2.620990158993406e-05, + "loss": 0.0058, + "step": 13500 + }, + { + "grad_norm": 0.22235551476478577, + "learning_rate": 2.6137218473407477e-05, + "loss": 0.0079, + "step": 13510 + }, + { + "grad_norm": 0.22616904973983765, + "learning_rate": 2.606460059681436e-05, + "loss": 0.007, + "step": 13520 + }, + { + "grad_norm": 0.25589826703071594, + "learning_rate": 2.599204815868928e-05, + "loss": 0.008, + "step": 13530 + }, + { + "grad_norm": 0.3133316934108734, + "learning_rate": 2.5919561357387756e-05, + "loss": 0.0077, + "step": 13540 + }, + { + "grad_norm": 0.28082042932510376, + "learning_rate": 2.5847140391085972e-05, + "loss": 0.0092, + "step": 13550 + }, + { + "grad_norm": 0.2670704126358032, + "learning_rate": 2.5774785457780103e-05, + "loss": 0.0122, + "step": 13560 + }, + { + "grad_norm": 0.2751728296279907, + "learning_rate": 2.5702496755285753e-05, + "loss": 0.0089, + "step": 13570 + }, + { + "grad_norm": 0.2093401402235031, + "learning_rate": 2.5630274481237483e-05, + "loss": 0.0072, + "step": 13580 + }, + { + "grad_norm": 0.2111545354127884, + "learning_rate": 2.5558118833088197e-05, + "loss": 0.0065, + "step": 13590 + }, + { + "grad_norm": 0.24518761038780212, + "learning_rate": 2.548603000810872e-05, + "loss": 0.0061, + "step": 13600 + }, + { + "grad_norm": 0.23371608555316925, + "learning_rate": 2.5414008203387152e-05, + "loss": 0.0063, + "step": 13610 + }, + { + "grad_norm": 0.2396596074104309, + "learning_rate": 2.534205361582834e-05, + "loss": 0.0049, + "step": 13620 + }, + { + "grad_norm": 0.1661856323480606, + "learning_rate": 2.527016644215338e-05, + "loss": 0.0043, + "step": 13630 + }, + { + "grad_norm": 0.20579443871974945, + "learning_rate": 2.519834687889905e-05, + "loss": 0.0062, + "step": 13640 + }, + { + "grad_norm": 0.2730153203010559, + "learning_rate": 2.5126595122417295e-05, + "loss": 0.0071, + "step": 13650 + }, + { + "grad_norm": 0.18529944121837616, + "learning_rate": 2.5054911368874713e-05, + "loss": 0.0056, + "step": 13660 + }, + { + "grad_norm": 0.2196544110774994, + "learning_rate": 2.4983295814251916e-05, + "loss": 0.0083, + "step": 13670 + }, + { + "grad_norm": 0.26618191599845886, + "learning_rate": 2.4911748654343105e-05, + "loss": 0.0081, + "step": 13680 + }, + { + "grad_norm": 0.27508193254470825, + "learning_rate": 2.4840270084755463e-05, + "loss": 0.0086, + "step": 13690 + }, + { + "grad_norm": 0.2792375087738037, + "learning_rate": 2.4768860300908685e-05, + "loss": 0.0083, + "step": 13700 + }, + { + "grad_norm": 0.2946309447288513, + "learning_rate": 2.469751949803443e-05, + "loss": 0.0089, + "step": 13710 + }, + { + "grad_norm": 0.308317095041275, + "learning_rate": 2.4626247871175666e-05, + "loss": 0.0107, + "step": 13720 + }, + { + "grad_norm": 0.24969953298568726, + "learning_rate": 2.4555045615186346e-05, + "loss": 0.0055, + "step": 13730 + }, + { + "grad_norm": 0.27003851532936096, + "learning_rate": 2.4483912924730677e-05, + "loss": 0.0051, + "step": 13740 + }, + { + "grad_norm": 0.18226544559001923, + "learning_rate": 2.4412849994282742e-05, + "loss": 0.0044, + "step": 13750 + }, + { + "grad_norm": 0.1628267914056778, + "learning_rate": 2.434185701812592e-05, + "loss": 0.0071, + "step": 13760 + }, + { + "grad_norm": 0.19907641410827637, + "learning_rate": 2.4270934190352218e-05, + "loss": 0.0109, + "step": 13770 + }, + { + "grad_norm": 0.31852245330810547, + "learning_rate": 2.4200081704861998e-05, + "loss": 0.012, + "step": 13780 + }, + { + "grad_norm": 0.2391577810049057, + "learning_rate": 2.412929975536321e-05, + "loss": 0.0101, + "step": 13790 + }, + { + "grad_norm": 0.26120880246162415, + "learning_rate": 2.4058588535371017e-05, + "loss": 0.0108, + "step": 13800 + }, + { + "grad_norm": 0.25855064392089844, + "learning_rate": 2.3987948238207243e-05, + "loss": 0.0089, + "step": 13810 + }, + { + "grad_norm": 0.2583528459072113, + "learning_rate": 2.3917379056999678e-05, + "loss": 0.009, + "step": 13820 + }, + { + "grad_norm": 0.1983281522989273, + "learning_rate": 2.3846881184681824e-05, + "loss": 0.0068, + "step": 13830 + }, + { + "grad_norm": 0.24056018888950348, + "learning_rate": 2.377645481399214e-05, + "loss": 0.006, + "step": 13840 + }, + { + "grad_norm": 0.2152879685163498, + "learning_rate": 2.3706100137473667e-05, + "loss": 0.0084, + "step": 13850 + }, + { + "grad_norm": 0.22262324392795563, + "learning_rate": 2.3635817347473394e-05, + "loss": 0.01, + "step": 13860 + }, + { + "grad_norm": 0.1950044184923172, + "learning_rate": 2.3565606636141757e-05, + "loss": 0.0083, + "step": 13870 + }, + { + "grad_norm": 0.29592257738113403, + "learning_rate": 2.3495468195432203e-05, + "loss": 0.0105, + "step": 13880 + }, + { + "grad_norm": 0.20952005684375763, + "learning_rate": 2.3425402217100507e-05, + "loss": 0.0081, + "step": 13890 + }, + { + "grad_norm": 0.20623354613780975, + "learning_rate": 2.3355408892704424e-05, + "loss": 0.0087, + "step": 13900 + }, + { + "grad_norm": 0.2014545500278473, + "learning_rate": 2.3285488413603003e-05, + "loss": 0.007, + "step": 13910 + }, + { + "grad_norm": 0.18053171038627625, + "learning_rate": 2.321564097095615e-05, + "loss": 0.0065, + "step": 13920 + }, + { + "grad_norm": 0.2605699598789215, + "learning_rate": 2.3145866755724142e-05, + "loss": 0.0062, + "step": 13930 + }, + { + "grad_norm": 0.1765507310628891, + "learning_rate": 2.307616595866699e-05, + "loss": 0.0064, + "step": 13940 + }, + { + "grad_norm": 0.1836540400981903, + "learning_rate": 2.3006538770344032e-05, + "loss": 0.0066, + "step": 13950 + }, + { + "grad_norm": 0.24484200775623322, + "learning_rate": 2.293698538111334e-05, + "loss": 0.0073, + "step": 13960 + }, + { + "grad_norm": 0.318861722946167, + "learning_rate": 2.28675059811312e-05, + "loss": 0.0086, + "step": 13970 + }, + { + "grad_norm": 0.2755028009414673, + "learning_rate": 2.279810076035167e-05, + "loss": 0.0088, + "step": 13980 + }, + { + "grad_norm": 0.24208253622055054, + "learning_rate": 2.272876990852596e-05, + "loss": 0.0097, + "step": 13990 + }, + { + "grad_norm": 0.22539426386356354, + "learning_rate": 2.265951361520195e-05, + "loss": 0.0123, + "step": 14000 + }, + { + "grad_norm": 0.26704996824264526, + "learning_rate": 2.2590332069723748e-05, + "loss": 0.0068, + "step": 14010 + }, + { + "grad_norm": 0.19263841211795807, + "learning_rate": 2.2521225461231004e-05, + "loss": 0.0084, + "step": 14020 + }, + { + "grad_norm": 0.23723247647285461, + "learning_rate": 2.2452193978658597e-05, + "loss": 0.0074, + "step": 14030 + }, + { + "grad_norm": 0.21307164430618286, + "learning_rate": 2.238323781073594e-05, + "loss": 0.0091, + "step": 14040 + }, + { + "grad_norm": 0.21096162497997284, + "learning_rate": 2.2314357145986552e-05, + "loss": 0.007, + "step": 14050 + }, + { + "grad_norm": 0.1488759070634842, + "learning_rate": 2.224555217272757e-05, + "loss": 0.0068, + "step": 14060 + }, + { + "grad_norm": 0.19186030328273773, + "learning_rate": 2.2176823079069127e-05, + "loss": 0.0078, + "step": 14070 + }, + { + "grad_norm": 0.3067329227924347, + "learning_rate": 2.210817005291398e-05, + "loss": 0.0079, + "step": 14080 + }, + { + "grad_norm": 0.21808436512947083, + "learning_rate": 2.203959328195686e-05, + "loss": 0.0103, + "step": 14090 + }, + { + "grad_norm": 0.18855734169483185, + "learning_rate": 2.1971092953684026e-05, + "loss": 0.0095, + "step": 14100 + }, + { + "grad_norm": 0.2067151963710785, + "learning_rate": 2.1902669255372788e-05, + "loss": 0.0087, + "step": 14110 + }, + { + "grad_norm": 0.16560152173042297, + "learning_rate": 2.1834322374090897e-05, + "loss": 0.0082, + "step": 14120 + }, + { + "grad_norm": 0.2646588683128357, + "learning_rate": 2.1766052496696153e-05, + "loss": 0.0084, + "step": 14130 + }, + { + "grad_norm": 0.15644583106040955, + "learning_rate": 2.169785980983577e-05, + "loss": 0.0058, + "step": 14140 + }, + { + "grad_norm": 0.17688466608524323, + "learning_rate": 2.162974449994593e-05, + "loss": 0.0062, + "step": 14150 + }, + { + "grad_norm": 0.19696833193302155, + "learning_rate": 2.1561706753251337e-05, + "loss": 0.0048, + "step": 14160 + }, + { + "grad_norm": 0.34053754806518555, + "learning_rate": 2.1493746755764544e-05, + "loss": 0.0059, + "step": 14170 + }, + { + "grad_norm": 0.2938707172870636, + "learning_rate": 2.1425864693285635e-05, + "loss": 0.0078, + "step": 14180 + }, + { + "grad_norm": 0.21198679506778717, + "learning_rate": 2.1358060751401547e-05, + "loss": 0.0064, + "step": 14190 + }, + { + "grad_norm": 0.26710379123687744, + "learning_rate": 2.129033511548566e-05, + "loss": 0.0096, + "step": 14200 + }, + { + "grad_norm": 0.18252646923065186, + "learning_rate": 2.1222687970697315e-05, + "loss": 0.0047, + "step": 14210 + }, + { + "grad_norm": 0.17757602035999298, + "learning_rate": 2.1155119501981173e-05, + "loss": 0.0071, + "step": 14220 + }, + { + "grad_norm": 0.20141607522964478, + "learning_rate": 2.1087629894066895e-05, + "loss": 0.0071, + "step": 14230 + }, + { + "grad_norm": 0.2534432113170624, + "learning_rate": 2.1020219331468473e-05, + "loss": 0.0077, + "step": 14240 + }, + { + "grad_norm": 0.21556223928928375, + "learning_rate": 2.095288799848379e-05, + "loss": 0.0083, + "step": 14250 + }, + { + "grad_norm": 0.2195851057767868, + "learning_rate": 2.088563607919417e-05, + "loss": 0.006, + "step": 14260 + }, + { + "grad_norm": 0.24260036647319794, + "learning_rate": 2.0818463757463786e-05, + "loss": 0.0066, + "step": 14270 + }, + { + "grad_norm": 0.19472578167915344, + "learning_rate": 2.0751371216939175e-05, + "loss": 0.0067, + "step": 14280 + }, + { + "grad_norm": 0.28243696689605713, + "learning_rate": 2.068435864104882e-05, + "loss": 0.0047, + "step": 14290 + }, + { + "grad_norm": 0.22938968241214752, + "learning_rate": 2.0617426213002506e-05, + "loss": 0.0059, + "step": 14300 + }, + { + "grad_norm": 0.2343452125787735, + "learning_rate": 2.055057411579097e-05, + "loss": 0.0052, + "step": 14310 + }, + { + "grad_norm": 0.17390845715999603, + "learning_rate": 2.0483802532185286e-05, + "loss": 0.0067, + "step": 14320 + }, + { + "grad_norm": 0.19969569146633148, + "learning_rate": 2.041711164473638e-05, + "loss": 0.0095, + "step": 14330 + }, + { + "grad_norm": 0.1951562613248825, + "learning_rate": 2.0350501635774637e-05, + "loss": 0.009, + "step": 14340 + }, + { + "grad_norm": 0.1974617838859558, + "learning_rate": 2.0283972687409247e-05, + "loss": 0.01, + "step": 14350 + }, + { + "grad_norm": 0.18776968121528625, + "learning_rate": 2.021752498152784e-05, + "loss": 0.0098, + "step": 14360 + }, + { + "grad_norm": 0.2281571477651596, + "learning_rate": 2.015115869979589e-05, + "loss": 0.0066, + "step": 14370 + }, + { + "grad_norm": 0.15420253574848175, + "learning_rate": 2.0084874023656265e-05, + "loss": 0.007, + "step": 14380 + }, + { + "grad_norm": 0.21293659508228302, + "learning_rate": 2.001867113432877e-05, + "loss": 0.0095, + "step": 14390 + }, + { + "grad_norm": 0.24641384184360504, + "learning_rate": 1.995255021280954e-05, + "loss": 0.0087, + "step": 14400 + }, + { + "grad_norm": 0.19624438881874084, + "learning_rate": 1.9886511439870688e-05, + "loss": 0.0116, + "step": 14410 + }, + { + "grad_norm": 0.26385873556137085, + "learning_rate": 1.9820554996059675e-05, + "loss": 0.0097, + "step": 14420 + }, + { + "grad_norm": 0.19679303467273712, + "learning_rate": 1.9754681061698893e-05, + "loss": 0.007, + "step": 14430 + }, + { + "grad_norm": 0.19685669243335724, + "learning_rate": 1.9688889816885185e-05, + "loss": 0.0069, + "step": 14440 + }, + { + "grad_norm": 0.2579761743545532, + "learning_rate": 1.962318144148928e-05, + "loss": 0.0077, + "step": 14450 + }, + { + "grad_norm": 0.237992525100708, + "learning_rate": 1.955755611515539e-05, + "loss": 0.0086, + "step": 14460 + }, + { + "grad_norm": 0.20474004745483398, + "learning_rate": 1.9492014017300642e-05, + "loss": 0.0115, + "step": 14470 + }, + { + "grad_norm": 0.26352787017822266, + "learning_rate": 1.942655532711461e-05, + "loss": 0.0086, + "step": 14480 + }, + { + "grad_norm": 0.15515737235546112, + "learning_rate": 1.9361180223558882e-05, + "loss": 0.009, + "step": 14490 + }, + { + "grad_norm": 0.18205532431602478, + "learning_rate": 1.929588888536647e-05, + "loss": 0.0081, + "step": 14500 + }, + { + "grad_norm": 0.25051119923591614, + "learning_rate": 1.9230681491041425e-05, + "loss": 0.0087, + "step": 14510 + }, + { + "grad_norm": 0.24160636961460114, + "learning_rate": 1.9165558218858264e-05, + "loss": 0.0085, + "step": 14520 + }, + { + "grad_norm": 0.1899593472480774, + "learning_rate": 1.9100519246861505e-05, + "loss": 0.0088, + "step": 14530 + }, + { + "grad_norm": 0.20470772683620453, + "learning_rate": 1.9035564752865248e-05, + "loss": 0.009, + "step": 14540 + }, + { + "grad_norm": 0.24114367365837097, + "learning_rate": 1.897069491445258e-05, + "loss": 0.0057, + "step": 14550 + }, + { + "grad_norm": 0.17576472461223602, + "learning_rate": 1.890590990897515e-05, + "loss": 0.0044, + "step": 14560 + }, + { + "grad_norm": 0.1785479038953781, + "learning_rate": 1.884120991355272e-05, + "loss": 0.0045, + "step": 14570 + }, + { + "grad_norm": 0.24302341043949127, + "learning_rate": 1.8776595105072576e-05, + "loss": 0.0072, + "step": 14580 + }, + { + "grad_norm": 0.2177177518606186, + "learning_rate": 1.8712065660189166e-05, + "loss": 0.0075, + "step": 14590 + }, + { + "grad_norm": 0.28443092107772827, + "learning_rate": 1.8647621755323513e-05, + "loss": 0.0072, + "step": 14600 + }, + { + "grad_norm": 0.21479752659797668, + "learning_rate": 1.858326356666278e-05, + "loss": 0.0072, + "step": 14610 + }, + { + "grad_norm": 0.3177355229854584, + "learning_rate": 1.851899127015983e-05, + "loss": 0.0061, + "step": 14620 + }, + { + "grad_norm": 0.27653831243515015, + "learning_rate": 1.8454805041532626e-05, + "loss": 0.0074, + "step": 14630 + }, + { + "grad_norm": 0.2235942780971527, + "learning_rate": 1.8390705056263906e-05, + "loss": 0.0069, + "step": 14640 + }, + { + "grad_norm": 0.2275884449481964, + "learning_rate": 1.832669148960057e-05, + "loss": 0.005, + "step": 14650 + }, + { + "grad_norm": 0.20634378492832184, + "learning_rate": 1.8262764516553233e-05, + "loss": 0.0083, + "step": 14660 + }, + { + "grad_norm": 0.2052215337753296, + "learning_rate": 1.8198924311895843e-05, + "loss": 0.0056, + "step": 14670 + }, + { + "grad_norm": 0.16023024916648865, + "learning_rate": 1.813517105016505e-05, + "loss": 0.0059, + "step": 14680 + }, + { + "grad_norm": 0.22274315357208252, + "learning_rate": 1.8071504905659888e-05, + "loss": 0.0071, + "step": 14690 + }, + { + "grad_norm": 0.12132035940885544, + "learning_rate": 1.800792605244109e-05, + "loss": 0.0078, + "step": 14700 + }, + { + "grad_norm": 0.14513835310935974, + "learning_rate": 1.7944434664330844e-05, + "loss": 0.0051, + "step": 14710 + }, + { + "grad_norm": 0.17122113704681396, + "learning_rate": 1.7881030914912212e-05, + "loss": 0.0061, + "step": 14720 + }, + { + "grad_norm": 0.23022834956645966, + "learning_rate": 1.7817714977528577e-05, + "loss": 0.0056, + "step": 14730 + }, + { + "grad_norm": 0.2473231703042984, + "learning_rate": 1.7754487025283332e-05, + "loss": 0.0074, + "step": 14740 + }, + { + "grad_norm": 0.20677083730697632, + "learning_rate": 1.7691347231039275e-05, + "loss": 0.0061, + "step": 14750 + }, + { + "grad_norm": 0.15179438889026642, + "learning_rate": 1.7628295767418164e-05, + "loss": 0.004, + "step": 14760 + }, + { + "grad_norm": 0.1799698919057846, + "learning_rate": 1.7565332806800333e-05, + "loss": 0.005, + "step": 14770 + }, + { + "grad_norm": 0.277893990278244, + "learning_rate": 1.750245852132408e-05, + "loss": 0.0047, + "step": 14780 + }, + { + "grad_norm": 0.21974965929985046, + "learning_rate": 1.7439673082885323e-05, + "loss": 0.0044, + "step": 14790 + }, + { + "grad_norm": 0.15415234863758087, + "learning_rate": 1.7376976663137047e-05, + "loss": 0.0043, + "step": 14800 + }, + { + "grad_norm": 0.22690968215465546, + "learning_rate": 1.7314369433488853e-05, + "loss": 0.0047, + "step": 14810 + }, + { + "grad_norm": 0.19512154161930084, + "learning_rate": 1.7251851565106548e-05, + "loss": 0.0073, + "step": 14820 + }, + { + "grad_norm": 0.28436723351478577, + "learning_rate": 1.7189423228911574e-05, + "loss": 0.0099, + "step": 14830 + }, + { + "grad_norm": 0.3049081563949585, + "learning_rate": 1.7127084595580606e-05, + "loss": 0.0098, + "step": 14840 + }, + { + "grad_norm": 0.26535964012145996, + "learning_rate": 1.706483583554513e-05, + "loss": 0.0085, + "step": 14850 + }, + { + "grad_norm": 0.22091160714626312, + "learning_rate": 1.700267711899083e-05, + "loss": 0.0056, + "step": 14860 + }, + { + "grad_norm": 0.1770736128091812, + "learning_rate": 1.69406086158573e-05, + "loss": 0.0061, + "step": 14870 + }, + { + "grad_norm": 0.22627416253089905, + "learning_rate": 1.6878630495837455e-05, + "loss": 0.0081, + "step": 14880 + }, + { + "grad_norm": 0.22951920330524445, + "learning_rate": 1.681674292837707e-05, + "loss": 0.009, + "step": 14890 + }, + { + "grad_norm": 0.19670870900154114, + "learning_rate": 1.6754946082674444e-05, + "loss": 0.0084, + "step": 14900 + }, + { + "grad_norm": 0.17799118161201477, + "learning_rate": 1.6693240127679748e-05, + "loss": 0.0078, + "step": 14910 + }, + { + "grad_norm": 0.222540482878685, + "learning_rate": 1.663162523209475e-05, + "loss": 0.0115, + "step": 14920 + }, + { + "grad_norm": 0.24060840904712677, + "learning_rate": 1.6570101564372193e-05, + "loss": 0.0106, + "step": 14930 + }, + { + "grad_norm": 0.213928684592247, + "learning_rate": 1.650866929271543e-05, + "loss": 0.0112, + "step": 14940 + }, + { + "grad_norm": 0.22997981309890747, + "learning_rate": 1.644732858507797e-05, + "loss": 0.007, + "step": 14950 + }, + { + "grad_norm": 0.17118534445762634, + "learning_rate": 1.6386079609162943e-05, + "loss": 0.0065, + "step": 14960 + }, + { + "grad_norm": 0.2330489605665207, + "learning_rate": 1.6324922532422742e-05, + "loss": 0.0122, + "step": 14970 + }, + { + "grad_norm": 0.22197645902633667, + "learning_rate": 1.6263857522058434e-05, + "loss": 0.0063, + "step": 14980 + }, + { + "grad_norm": 0.13857628405094147, + "learning_rate": 1.6202884745019443e-05, + "loss": 0.0054, + "step": 14990 + }, + { + "grad_norm": 0.13685590028762817, + "learning_rate": 1.614200436800304e-05, + "loss": 0.0066, + "step": 15000 + }, + { + "grad_norm": 0.14680595695972443, + "learning_rate": 1.6081216557453814e-05, + "loss": 0.005, + "step": 15010 + }, + { + "grad_norm": 0.22602754831314087, + "learning_rate": 1.6020521479563367e-05, + "loss": 0.0078, + "step": 15020 + }, + { + "grad_norm": 0.20991118252277374, + "learning_rate": 1.5959919300269654e-05, + "loss": 0.0052, + "step": 15030 + }, + { + "grad_norm": 0.1853811889886856, + "learning_rate": 1.5899410185256764e-05, + "loss": 0.0054, + "step": 15040 + }, + { + "grad_norm": 0.17050021886825562, + "learning_rate": 1.583899429995431e-05, + "loss": 0.0069, + "step": 15050 + }, + { + "grad_norm": 0.2241840660572052, + "learning_rate": 1.5778671809536993e-05, + "loss": 0.0055, + "step": 15060 + }, + { + "grad_norm": 0.14445625245571136, + "learning_rate": 1.5718442878924246e-05, + "loss": 0.0048, + "step": 15070 + }, + { + "grad_norm": 0.16181202232837677, + "learning_rate": 1.5658307672779593e-05, + "loss": 0.0046, + "step": 15080 + }, + { + "grad_norm": 0.158863827586174, + "learning_rate": 1.5598266355510427e-05, + "loss": 0.0038, + "step": 15090 + }, + { + "grad_norm": 0.16581790149211884, + "learning_rate": 1.553831909126744e-05, + "loss": 0.0037, + "step": 15100 + }, + { + "grad_norm": 0.24619930982589722, + "learning_rate": 1.5478466043944135e-05, + "loss": 0.0029, + "step": 15110 + }, + { + "grad_norm": 0.1852288544178009, + "learning_rate": 1.5418707377176468e-05, + "loss": 0.0028, + "step": 15120 + }, + { + "grad_norm": 0.19994230568408966, + "learning_rate": 1.535904325434233e-05, + "loss": 0.0033, + "step": 15130 + }, + { + "grad_norm": 0.2706878185272217, + "learning_rate": 1.529947383856118e-05, + "loss": 0.0062, + "step": 15140 + }, + { + "grad_norm": 0.24223172664642334, + "learning_rate": 1.5239999292693524e-05, + "loss": 0.0087, + "step": 15150 + }, + { + "grad_norm": 0.2217617779970169, + "learning_rate": 1.5180619779340505e-05, + "loss": 0.0096, + "step": 15160 + }, + { + "grad_norm": 0.20301875472068787, + "learning_rate": 1.5121335460843428e-05, + "loss": 0.0075, + "step": 15170 + }, + { + "grad_norm": 0.2454066425561905, + "learning_rate": 1.5062146499283347e-05, + "loss": 0.0083, + "step": 15180 + }, + { + "grad_norm": 0.2645903527736664, + "learning_rate": 1.5003053056480643e-05, + "loss": 0.0082, + "step": 15190 + }, + { + "grad_norm": 0.2625524699687958, + "learning_rate": 1.4944055293994551e-05, + "loss": 0.0068, + "step": 15200 + }, + { + "grad_norm": 0.23997178673744202, + "learning_rate": 1.4885153373122656e-05, + "loss": 0.0085, + "step": 15210 + }, + { + "grad_norm": 0.20471638441085815, + "learning_rate": 1.482634745490059e-05, + "loss": 0.0076, + "step": 15220 + }, + { + "grad_norm": 0.1677587330341339, + "learning_rate": 1.4767637700101466e-05, + "loss": 0.006, + "step": 15230 + }, + { + "grad_norm": 0.19631387293338776, + "learning_rate": 1.4709024269235528e-05, + "loss": 0.0057, + "step": 15240 + }, + { + "grad_norm": 0.15639477968215942, + "learning_rate": 1.4650507322549684e-05, + "loss": 0.0054, + "step": 15250 + }, + { + "grad_norm": 0.2173965871334076, + "learning_rate": 1.4592087020026972e-05, + "loss": 0.0065, + "step": 15260 + }, + { + "grad_norm": 0.22251997888088226, + "learning_rate": 1.4533763521386318e-05, + "loss": 0.0069, + "step": 15270 + }, + { + "grad_norm": 0.2840924561023712, + "learning_rate": 1.44755369860819e-05, + "loss": 0.0084, + "step": 15280 + }, + { + "grad_norm": 0.20377834141254425, + "learning_rate": 1.441740757330287e-05, + "loss": 0.0064, + "step": 15290 + }, + { + "grad_norm": 0.18228556215763092, + "learning_rate": 1.4359375441972844e-05, + "loss": 0.0047, + "step": 15300 + }, + { + "grad_norm": 0.1474473476409912, + "learning_rate": 1.4301440750749395e-05, + "loss": 0.0034, + "step": 15310 + }, + { + "grad_norm": 0.1590013951063156, + "learning_rate": 1.4243603658023808e-05, + "loss": 0.0043, + "step": 15320 + }, + { + "grad_norm": 0.17313016951084137, + "learning_rate": 1.4185864321920444e-05, + "loss": 0.0047, + "step": 15330 + }, + { + "grad_norm": 0.23854900896549225, + "learning_rate": 1.4128222900296485e-05, + "loss": 0.0064, + "step": 15340 + }, + { + "grad_norm": 0.206364706158638, + "learning_rate": 1.407067955074135e-05, + "loss": 0.008, + "step": 15350 + }, + { + "grad_norm": 0.24199241399765015, + "learning_rate": 1.4013234430576356e-05, + "loss": 0.0073, + "step": 15360 + }, + { + "grad_norm": 0.16870100796222687, + "learning_rate": 1.3955887696854286e-05, + "loss": 0.011, + "step": 15370 + }, + { + "grad_norm": 0.26507139205932617, + "learning_rate": 1.38986395063589e-05, + "loss": 0.0089, + "step": 15380 + }, + { + "grad_norm": 0.200941264629364, + "learning_rate": 1.3841490015604597e-05, + "loss": 0.0053, + "step": 15390 + }, + { + "grad_norm": 0.27485617995262146, + "learning_rate": 1.3784439380835879e-05, + "loss": 0.0059, + "step": 15400 + }, + { + "grad_norm": 0.17240837216377258, + "learning_rate": 1.3727487758026986e-05, + "loss": 0.0052, + "step": 15410 + }, + { + "grad_norm": 0.1309075951576233, + "learning_rate": 1.3670635302881525e-05, + "loss": 0.0042, + "step": 15420 + }, + { + "grad_norm": 0.1304185539484024, + "learning_rate": 1.3613882170831888e-05, + "loss": 0.0049, + "step": 15430 + }, + { + "grad_norm": 0.19147582352161407, + "learning_rate": 1.355722851703901e-05, + "loss": 0.006, + "step": 15440 + }, + { + "grad_norm": 0.2168254852294922, + "learning_rate": 1.3500674496391814e-05, + "loss": 0.0061, + "step": 15450 + }, + { + "grad_norm": 0.18104854226112366, + "learning_rate": 1.3444220263506795e-05, + "loss": 0.0071, + "step": 15460 + }, + { + "grad_norm": 0.17319612205028534, + "learning_rate": 1.3387865972727714e-05, + "loss": 0.0066, + "step": 15470 + }, + { + "grad_norm": 0.14455774426460266, + "learning_rate": 1.3331611778125036e-05, + "loss": 0.007, + "step": 15480 + }, + { + "grad_norm": 0.18909773230552673, + "learning_rate": 1.3275457833495564e-05, + "loss": 0.0096, + "step": 15490 + }, + { + "grad_norm": 0.15317805111408234, + "learning_rate": 1.3219404292362065e-05, + "loss": 0.0078, + "step": 15500 + }, + { + "grad_norm": 0.20971821248531342, + "learning_rate": 1.3163451307972751e-05, + "loss": 0.0067, + "step": 15510 + }, + { + "grad_norm": 0.17310576140880585, + "learning_rate": 1.3107599033300977e-05, + "loss": 0.0061, + "step": 15520 + }, + { + "grad_norm": 0.189740389585495, + "learning_rate": 1.305184762104471e-05, + "loss": 0.0073, + "step": 15530 + }, + { + "grad_norm": 0.16094551980495453, + "learning_rate": 1.2996197223626178e-05, + "loss": 0.0068, + "step": 15540 + }, + { + "grad_norm": 0.2289162576198578, + "learning_rate": 1.2940647993191457e-05, + "loss": 0.0075, + "step": 15550 + }, + { + "grad_norm": 0.21944604814052582, + "learning_rate": 1.2885200081610005e-05, + "loss": 0.0074, + "step": 15560 + }, + { + "grad_norm": 0.1618151068687439, + "learning_rate": 1.2829853640474316e-05, + "loss": 0.0072, + "step": 15570 + }, + { + "grad_norm": 0.18821214139461517, + "learning_rate": 1.2774608821099438e-05, + "loss": 0.0064, + "step": 15580 + }, + { + "grad_norm": 0.22793236374855042, + "learning_rate": 1.2719465774522577e-05, + "loss": 0.0078, + "step": 15590 + }, + { + "grad_norm": 0.17744077742099762, + "learning_rate": 1.2664424651502755e-05, + "loss": 0.0079, + "step": 15600 + }, + { + "grad_norm": 0.14679332077503204, + "learning_rate": 1.260948560252026e-05, + "loss": 0.0069, + "step": 15610 + }, + { + "grad_norm": 0.12989553809165955, + "learning_rate": 1.2554648777776396e-05, + "loss": 0.0047, + "step": 15620 + }, + { + "grad_norm": 0.15548411011695862, + "learning_rate": 1.2499914327192919e-05, + "loss": 0.0058, + "step": 15630 + }, + { + "grad_norm": 0.14128291606903076, + "learning_rate": 1.2445282400411722e-05, + "loss": 0.0062, + "step": 15640 + }, + { + "grad_norm": 0.16301748156547546, + "learning_rate": 1.2390753146794437e-05, + "loss": 0.0081, + "step": 15650 + }, + { + "grad_norm": 0.2908998429775238, + "learning_rate": 1.2336326715421925e-05, + "loss": 0.0097, + "step": 15660 + }, + { + "grad_norm": 0.16072075068950653, + "learning_rate": 1.2282003255094005e-05, + "loss": 0.0084, + "step": 15670 + }, + { + "grad_norm": 0.19537751376628876, + "learning_rate": 1.2227782914328928e-05, + "loss": 0.0087, + "step": 15680 + }, + { + "grad_norm": 0.20008064806461334, + "learning_rate": 1.2173665841363018e-05, + "loss": 0.008, + "step": 15690 + }, + { + "grad_norm": 0.18201963603496552, + "learning_rate": 1.211965218415032e-05, + "loss": 0.0076, + "step": 15700 + }, + { + "grad_norm": 0.2208029329776764, + "learning_rate": 1.2065742090362082e-05, + "loss": 0.0088, + "step": 15710 + }, + { + "grad_norm": 0.1777421534061432, + "learning_rate": 1.2011935707386457e-05, + "loss": 0.0086, + "step": 15720 + }, + { + "grad_norm": 0.21700501441955566, + "learning_rate": 1.1958233182328044e-05, + "loss": 0.0086, + "step": 15730 + }, + { + "grad_norm": 0.20182208716869354, + "learning_rate": 1.1904634662007474e-05, + "loss": 0.0061, + "step": 15740 + }, + { + "grad_norm": 0.2010135054588318, + "learning_rate": 1.1851140292961088e-05, + "loss": 0.0057, + "step": 15750 + }, + { + "grad_norm": 0.21197208762168884, + "learning_rate": 1.1797750221440424e-05, + "loss": 0.007, + "step": 15760 + }, + { + "grad_norm": 0.19234375655651093, + "learning_rate": 1.1744464593411897e-05, + "loss": 0.0054, + "step": 15770 + }, + { + "grad_norm": 0.16805565357208252, + "learning_rate": 1.1691283554556399e-05, + "loss": 0.0021, + "step": 15780 + }, + { + "grad_norm": 0.16401079297065735, + "learning_rate": 1.1638207250268834e-05, + "loss": 0.0023, + "step": 15790 + }, + { + "grad_norm": 0.22445884346961975, + "learning_rate": 1.158523582565782e-05, + "loss": 0.0046, + "step": 15800 + }, + { + "grad_norm": 0.1363947093486786, + "learning_rate": 1.1532369425545192e-05, + "loss": 0.0046, + "step": 15810 + }, + { + "grad_norm": 0.14404959976673126, + "learning_rate": 1.1479608194465662e-05, + "loss": 0.0092, + "step": 15820 + }, + { + "grad_norm": 0.13788649439811707, + "learning_rate": 1.1426952276666442e-05, + "loss": 0.0047, + "step": 15830 + }, + { + "grad_norm": 0.19880564510822296, + "learning_rate": 1.1374401816106778e-05, + "loss": 0.0068, + "step": 15840 + }, + { + "grad_norm": 0.12659719586372375, + "learning_rate": 1.1321956956457646e-05, + "loss": 0.0054, + "step": 15850 + }, + { + "grad_norm": 0.22960048913955688, + "learning_rate": 1.1269617841101277e-05, + "loss": 0.0059, + "step": 15860 + }, + { + "grad_norm": 0.17524980008602142, + "learning_rate": 1.1217384613130804e-05, + "loss": 0.0053, + "step": 15870 + }, + { + "grad_norm": 0.15279366075992584, + "learning_rate": 1.11652574153499e-05, + "loss": 0.0059, + "step": 15880 + }, + { + "grad_norm": 0.2004585862159729, + "learning_rate": 1.1113236390272303e-05, + "loss": 0.0056, + "step": 15890 + }, + { + "grad_norm": 0.1381554901599884, + "learning_rate": 1.106132168012155e-05, + "loss": 0.0077, + "step": 15900 + }, + { + "grad_norm": 0.19211548566818237, + "learning_rate": 1.1009513426830448e-05, + "loss": 0.0052, + "step": 15910 + }, + { + "grad_norm": 0.2039334774017334, + "learning_rate": 1.0957811772040777e-05, + "loss": 0.0056, + "step": 15920 + }, + { + "grad_norm": 0.17506463825702667, + "learning_rate": 1.0906216857102913e-05, + "loss": 0.0037, + "step": 15930 + }, + { + "grad_norm": 0.17649434506893158, + "learning_rate": 1.0854728823075355e-05, + "loss": 0.0038, + "step": 15940 + }, + { + "grad_norm": 0.17953673005104065, + "learning_rate": 1.0803347810724452e-05, + "loss": 0.0069, + "step": 15950 + }, + { + "grad_norm": 0.18853695690631866, + "learning_rate": 1.0752073960523911e-05, + "loss": 0.0068, + "step": 15960 + }, + { + "grad_norm": 0.22256223857402802, + "learning_rate": 1.070090741265447e-05, + "loss": 0.0077, + "step": 15970 + }, + { + "grad_norm": 0.20178468525409698, + "learning_rate": 1.0649848307003547e-05, + "loss": 0.0068, + "step": 15980 + }, + { + "grad_norm": 0.1299094706773758, + "learning_rate": 1.0598896783164757e-05, + "loss": 0.0072, + "step": 15990 + }, + { + "grad_norm": 0.12188919633626938, + "learning_rate": 1.0548052980437645e-05, + "loss": 0.0086, + "step": 16000 + }, + { + "grad_norm": 0.16716693341732025, + "learning_rate": 1.049731703782722e-05, + "loss": 0.0059, + "step": 16010 + }, + { + "grad_norm": 0.14110815525054932, + "learning_rate": 1.0446689094043587e-05, + "loss": 0.0077, + "step": 16020 + }, + { + "grad_norm": 0.1794903725385666, + "learning_rate": 1.039616928750165e-05, + "loss": 0.0054, + "step": 16030 + }, + { + "grad_norm": 0.13544327020645142, + "learning_rate": 1.0345757756320612e-05, + "loss": 0.0052, + "step": 16040 + }, + { + "grad_norm": 0.12089123576879501, + "learning_rate": 1.0295454638323666e-05, + "loss": 0.0044, + "step": 16050 + }, + { + "grad_norm": 0.15249843895435333, + "learning_rate": 1.0245260071037632e-05, + "loss": 0.0043, + "step": 16060 + }, + { + "grad_norm": 0.3028264045715332, + "learning_rate": 1.0195174191692518e-05, + "loss": 0.0039, + "step": 16070 + }, + { + "grad_norm": 0.1937440037727356, + "learning_rate": 1.014519713722124e-05, + "loss": 0.0048, + "step": 16080 + }, + { + "grad_norm": 0.25855445861816406, + "learning_rate": 1.0095329044259132e-05, + "loss": 0.0044, + "step": 16090 + }, + { + "grad_norm": 0.18820537626743317, + "learning_rate": 1.004557004914365e-05, + "loss": 0.006, + "step": 16100 + }, + { + "grad_norm": 0.18947653472423553, + "learning_rate": 9.995920287914007e-06, + "loss": 0.0058, + "step": 16110 + }, + { + "grad_norm": 0.1625407636165619, + "learning_rate": 9.946379896310737e-06, + "loss": 0.0049, + "step": 16120 + }, + { + "grad_norm": 0.15378440916538239, + "learning_rate": 9.896949009775396e-06, + "loss": 0.0052, + "step": 16130 + }, + { + "grad_norm": 0.20766200125217438, + "learning_rate": 9.847627763450134e-06, + "loss": 0.0055, + "step": 16140 + }, + { + "grad_norm": 0.18093343079090118, + "learning_rate": 9.798416292177337e-06, + "loss": 0.0052, + "step": 16150 + }, + { + "grad_norm": 0.1980709433555603, + "learning_rate": 9.74931473049932e-06, + "loss": 0.0068, + "step": 16160 + }, + { + "grad_norm": 0.1757391095161438, + "learning_rate": 9.700323212657847e-06, + "loss": 0.0063, + "step": 16170 + }, + { + "grad_norm": 0.1837252527475357, + "learning_rate": 9.65144187259388e-06, + "loss": 0.0078, + "step": 16180 + }, + { + "grad_norm": 0.1645137220621109, + "learning_rate": 9.602670843947132e-06, + "loss": 0.0066, + "step": 16190 + }, + { + "grad_norm": 0.16566288471221924, + "learning_rate": 9.554010260055713e-06, + "loss": 0.0058, + "step": 16200 + }, + { + "grad_norm": 0.1449422538280487, + "learning_rate": 9.505460253955834e-06, + "loss": 0.0049, + "step": 16210 + }, + { + "grad_norm": 0.1919349879026413, + "learning_rate": 9.457020958381324e-06, + "loss": 0.0063, + "step": 16220 + }, + { + "grad_norm": 0.15887637436389923, + "learning_rate": 9.408692505763395e-06, + "loss": 0.0057, + "step": 16230 + }, + { + "grad_norm": 0.19135218858718872, + "learning_rate": 9.360475028230181e-06, + "loss": 0.0059, + "step": 16240 + }, + { + "grad_norm": 0.13729731738567352, + "learning_rate": 9.312368657606412e-06, + "loss": 0.004, + "step": 16250 + }, + { + "grad_norm": 0.1505630612373352, + "learning_rate": 9.264373525413096e-06, + "loss": 0.0078, + "step": 16260 + }, + { + "grad_norm": 0.21334412693977356, + "learning_rate": 9.216489762867058e-06, + "loss": 0.0054, + "step": 16270 + }, + { + "grad_norm": 0.27786457538604736, + "learning_rate": 9.168717500880708e-06, + "loss": 0.0082, + "step": 16280 + }, + { + "grad_norm": 0.16654618084430695, + "learning_rate": 9.121056870061574e-06, + "loss": 0.0045, + "step": 16290 + }, + { + "grad_norm": 0.20453572273254395, + "learning_rate": 9.073508000711983e-06, + "loss": 0.0064, + "step": 16300 + }, + { + "grad_norm": 0.18527251482009888, + "learning_rate": 9.026071022828758e-06, + "loss": 0.0083, + "step": 16310 + }, + { + "grad_norm": 0.19920963048934937, + "learning_rate": 8.978746066102771e-06, + "loss": 0.0068, + "step": 16320 + }, + { + "grad_norm": 0.15388308465480804, + "learning_rate": 8.931533259918634e-06, + "loss": 0.0057, + "step": 16330 + }, + { + "grad_norm": 0.139622300863266, + "learning_rate": 8.884432733354382e-06, + "loss": 0.0051, + "step": 16340 + }, + { + "grad_norm": 0.12960968911647797, + "learning_rate": 8.837444615181029e-06, + "loss": 0.0075, + "step": 16350 + }, + { + "grad_norm": 0.17558151483535767, + "learning_rate": 8.790569033862323e-06, + "loss": 0.0064, + "step": 16360 + }, + { + "grad_norm": 0.1749149113893509, + "learning_rate": 8.7438061175543e-06, + "loss": 0.0044, + "step": 16370 + }, + { + "grad_norm": 0.14643223583698273, + "learning_rate": 8.697155994104978e-06, + "loss": 0.004, + "step": 16380 + }, + { + "grad_norm": 0.1505119800567627, + "learning_rate": 8.650618791054033e-06, + "loss": 0.0039, + "step": 16390 + }, + { + "grad_norm": 0.14904901385307312, + "learning_rate": 8.604194635632373e-06, + "loss": 0.0038, + "step": 16400 + }, + { + "grad_norm": 0.1481541246175766, + "learning_rate": 8.557883654761906e-06, + "loss": 0.0044, + "step": 16410 + }, + { + "grad_norm": 0.20909498631954193, + "learning_rate": 8.511685975055061e-06, + "loss": 0.006, + "step": 16420 + }, + { + "grad_norm": 0.18976648151874542, + "learning_rate": 8.46560172281452e-06, + "loss": 0.0067, + "step": 16430 + }, + { + "grad_norm": 0.21085554361343384, + "learning_rate": 8.419631024032893e-06, + "loss": 0.0076, + "step": 16440 + }, + { + "grad_norm": 0.19114437699317932, + "learning_rate": 8.373774004392293e-06, + "loss": 0.0083, + "step": 16450 + }, + { + "grad_norm": 0.1837872713804245, + "learning_rate": 8.32803078926409e-06, + "loss": 0.008, + "step": 16460 + }, + { + "grad_norm": 0.1403045803308487, + "learning_rate": 8.282401503708454e-06, + "loss": 0.0057, + "step": 16470 + }, + { + "grad_norm": 0.16365502774715424, + "learning_rate": 8.23688627247412e-06, + "loss": 0.0054, + "step": 16480 + }, + { + "grad_norm": 0.1936427652835846, + "learning_rate": 8.191485219998007e-06, + "loss": 0.0045, + "step": 16490 + }, + { + "grad_norm": 0.14875873923301697, + "learning_rate": 8.146198470404843e-06, + "loss": 0.0053, + "step": 16500 + }, + { + "grad_norm": 0.19795140624046326, + "learning_rate": 8.101026147506897e-06, + "loss": 0.0046, + "step": 16510 + }, + { + "grad_norm": 0.18331268429756165, + "learning_rate": 8.05596837480353e-06, + "loss": 0.0071, + "step": 16520 + }, + { + "grad_norm": 0.14491984248161316, + "learning_rate": 8.011025275480998e-06, + "loss": 0.0064, + "step": 16530 + }, + { + "grad_norm": 0.19273407757282257, + "learning_rate": 7.966196972412027e-06, + "loss": 0.006, + "step": 16540 + }, + { + "grad_norm": 0.09642724692821503, + "learning_rate": 7.92148358815547e-06, + "loss": 0.0028, + "step": 16550 + }, + { + "grad_norm": 0.17913515865802765, + "learning_rate": 7.87688524495604e-06, + "loss": 0.0023, + "step": 16560 + }, + { + "grad_norm": 0.15309801697731018, + "learning_rate": 7.83240206474386e-06, + "loss": 0.0024, + "step": 16570 + }, + { + "grad_norm": 0.14498166739940643, + "learning_rate": 7.788034169134272e-06, + "loss": 0.0035, + "step": 16580 + }, + { + "grad_norm": 0.13242116570472717, + "learning_rate": 7.743781679427414e-06, + "loss": 0.0059, + "step": 16590 + }, + { + "grad_norm": 0.14567090570926666, + "learning_rate": 7.699644716607895e-06, + "loss": 0.0059, + "step": 16600 + }, + { + "grad_norm": 0.1842607706785202, + "learning_rate": 7.655623401344486e-06, + "loss": 0.0054, + "step": 16610 + }, + { + "grad_norm": 0.19166016578674316, + "learning_rate": 7.611717853989775e-06, + "loss": 0.0059, + "step": 16620 + }, + { + "grad_norm": 0.18736149370670319, + "learning_rate": 7.567928194579854e-06, + "loss": 0.0087, + "step": 16630 + }, + { + "grad_norm": 0.1717270463705063, + "learning_rate": 7.524254542833997e-06, + "loss": 0.0072, + "step": 16640 + }, + { + "grad_norm": 0.15437433123588562, + "learning_rate": 7.480697018154286e-06, + "loss": 0.0121, + "step": 16650 + }, + { + "grad_norm": 0.14670272171497345, + "learning_rate": 7.437255739625332e-06, + "loss": 0.0062, + "step": 16660 + }, + { + "grad_norm": 0.14184066653251648, + "learning_rate": 7.393930826013923e-06, + "loss": 0.0058, + "step": 16670 + }, + { + "grad_norm": 0.12911194562911987, + "learning_rate": 7.350722395768722e-06, + "loss": 0.0035, + "step": 16680 + }, + { + "grad_norm": 0.15619446337223053, + "learning_rate": 7.307630567019963e-06, + "loss": 0.0028, + "step": 16690 + }, + { + "grad_norm": 0.12753093242645264, + "learning_rate": 7.264655457579e-06, + "loss": 0.005, + "step": 16700 + }, + { + "grad_norm": 0.11561994254589081, + "learning_rate": 7.221797184938184e-06, + "loss": 0.0071, + "step": 16710 + }, + { + "grad_norm": 0.14108562469482422, + "learning_rate": 7.179055866270373e-06, + "loss": 0.004, + "step": 16720 + }, + { + "grad_norm": 0.15046365559101105, + "learning_rate": 7.136431618428707e-06, + "loss": 0.0058, + "step": 16730 + }, + { + "grad_norm": 0.17979326844215393, + "learning_rate": 7.09392455794628e-06, + "loss": 0.0068, + "step": 16740 + }, + { + "grad_norm": 0.1262209564447403, + "learning_rate": 7.051534801035725e-06, + "loss": 0.0066, + "step": 16750 + }, + { + "grad_norm": 0.15643055737018585, + "learning_rate": 7.00926246358905e-06, + "loss": 0.0074, + "step": 16760 + }, + { + "grad_norm": 0.24131742119789124, + "learning_rate": 6.967107661177191e-06, + "loss": 0.0089, + "step": 16770 + }, + { + "grad_norm": 0.2032814621925354, + "learning_rate": 6.925070509049786e-06, + "loss": 0.0087, + "step": 16780 + }, + { + "grad_norm": 0.12672913074493408, + "learning_rate": 6.883151122134812e-06, + "loss": 0.0045, + "step": 16790 + }, + { + "grad_norm": 0.13664676249027252, + "learning_rate": 6.8413496150382394e-06, + "loss": 0.005, + "step": 16800 + }, + { + "grad_norm": 0.09806268662214279, + "learning_rate": 6.7996661020438165e-06, + "loss": 0.0028, + "step": 16810 + }, + { + "grad_norm": 0.0869489312171936, + "learning_rate": 6.758100697112662e-06, + "loss": 0.0017, + "step": 16820 + }, + { + "grad_norm": 0.17327825725078583, + "learning_rate": 6.716653513883026e-06, + "loss": 0.0034, + "step": 16830 + }, + { + "grad_norm": 0.11516022682189941, + "learning_rate": 6.675324665669913e-06, + "loss": 0.0033, + "step": 16840 + }, + { + "grad_norm": 0.19973814487457275, + "learning_rate": 6.634114265464803e-06, + "loss": 0.0071, + "step": 16850 + }, + { + "grad_norm": 0.11164401471614838, + "learning_rate": 6.59302242593538e-06, + "loss": 0.0059, + "step": 16860 + }, + { + "grad_norm": 0.1508011668920517, + "learning_rate": 6.552049259425141e-06, + "loss": 0.0067, + "step": 16870 + }, + { + "grad_norm": 0.1377403438091278, + "learning_rate": 6.511194877953181e-06, + "loss": 0.006, + "step": 16880 + }, + { + "grad_norm": 0.10775049030780792, + "learning_rate": 6.470459393213813e-06, + "loss": 0.0046, + "step": 16890 + }, + { + "grad_norm": 0.15003006160259247, + "learning_rate": 6.429842916576279e-06, + "loss": 0.0052, + "step": 16900 + }, + { + "grad_norm": 0.10327060520648956, + "learning_rate": 6.389345559084503e-06, + "loss": 0.0038, + "step": 16910 + }, + { + "grad_norm": 0.1697254478931427, + "learning_rate": 6.348967431456682e-06, + "loss": 0.0046, + "step": 16920 + }, + { + "grad_norm": 0.11398360878229141, + "learning_rate": 6.30870864408511e-06, + "loss": 0.0041, + "step": 16930 + }, + { + "grad_norm": 0.12366661429405212, + "learning_rate": 6.268569307035754e-06, + "loss": 0.0044, + "step": 16940 + }, + { + "grad_norm": 0.11858753859996796, + "learning_rate": 6.228549530048022e-06, + "loss": 0.0052, + "step": 16950 + }, + { + "grad_norm": 0.1497092992067337, + "learning_rate": 6.1886494225344814e-06, + "loss": 0.0073, + "step": 16960 + }, + { + "grad_norm": 0.19247305393218994, + "learning_rate": 6.148869093580479e-06, + "loss": 0.0079, + "step": 16970 + }, + { + "grad_norm": 0.1588144451379776, + "learning_rate": 6.109208651943921e-06, + "loss": 0.0059, + "step": 16980 + }, + { + "grad_norm": 0.14048437774181366, + "learning_rate": 6.069668206054946e-06, + "loss": 0.0054, + "step": 16990 + }, + { + "grad_norm": 0.1959722936153412, + "learning_rate": 6.0302478640156145e-06, + "loss": 0.0071, + "step": 17000 + }, + { + "grad_norm": 0.15154221653938293, + "learning_rate": 5.990947733599644e-06, + "loss": 0.0092, + "step": 17010 + }, + { + "grad_norm": 0.1646411418914795, + "learning_rate": 5.951767922252105e-06, + "loss": 0.0078, + "step": 17020 + }, + { + "grad_norm": 0.151596337556839, + "learning_rate": 5.912708537089068e-06, + "loss": 0.0059, + "step": 17030 + }, + { + "grad_norm": 0.16531504690647125, + "learning_rate": 5.873769684897434e-06, + "loss": 0.0065, + "step": 17040 + }, + { + "grad_norm": 0.12935221195220947, + "learning_rate": 5.834951472134514e-06, + "loss": 0.0075, + "step": 17050 + }, + { + "grad_norm": 0.18192346394062042, + "learning_rate": 5.796254004927832e-06, + "loss": 0.0055, + "step": 17060 + }, + { + "grad_norm": 0.16416220366954803, + "learning_rate": 5.757677389074806e-06, + "loss": 0.0053, + "step": 17070 + }, + { + "grad_norm": 0.12015072256326675, + "learning_rate": 5.719221730042385e-06, + "loss": 0.004, + "step": 17080 + }, + { + "grad_norm": 0.12272200733423233, + "learning_rate": 5.680887132966911e-06, + "loss": 0.0055, + "step": 17090 + }, + { + "grad_norm": 0.15359535813331604, + "learning_rate": 5.642673702653683e-06, + "loss": 0.004, + "step": 17100 + }, + { + "grad_norm": 0.1548011302947998, + "learning_rate": 5.604581543576781e-06, + "loss": 0.0034, + "step": 17110 + }, + { + "grad_norm": 0.20427155494689941, + "learning_rate": 5.566610759878704e-06, + "loss": 0.0059, + "step": 17120 + }, + { + "grad_norm": 0.16857117414474487, + "learning_rate": 5.528761455370119e-06, + "loss": 0.0055, + "step": 17130 + }, + { + "grad_norm": 0.23944735527038574, + "learning_rate": 5.491033733529594e-06, + "loss": 0.007, + "step": 17140 + }, + { + "grad_norm": 0.17614318430423737, + "learning_rate": 5.453427697503255e-06, + "loss": 0.0067, + "step": 17150 + }, + { + "grad_norm": 0.18920522928237915, + "learning_rate": 5.415943450104599e-06, + "loss": 0.0073, + "step": 17160 + }, + { + "grad_norm": 0.19964274764060974, + "learning_rate": 5.378581093814111e-06, + "loss": 0.0063, + "step": 17170 + }, + { + "grad_norm": 0.1071854755282402, + "learning_rate": 5.3413407307790375e-06, + "loss": 0.0072, + "step": 17180 + }, + { + "grad_norm": 0.2243276834487915, + "learning_rate": 5.30422246281313e-06, + "loss": 0.0095, + "step": 17190 + }, + { + "grad_norm": 0.18748407065868378, + "learning_rate": 5.267226391396296e-06, + "loss": 0.0057, + "step": 17200 + }, + { + "grad_norm": 0.13631398975849152, + "learning_rate": 5.2303526176744e-06, + "loss": 0.0046, + "step": 17210 + }, + { + "grad_norm": 0.1465505063533783, + "learning_rate": 5.193601242458929e-06, + "loss": 0.005, + "step": 17220 + }, + { + "grad_norm": 0.14097915589809418, + "learning_rate": 5.156972366226714e-06, + "loss": 0.0056, + "step": 17230 + }, + { + "grad_norm": 0.13900354504585266, + "learning_rate": 5.120466089119735e-06, + "loss": 0.0045, + "step": 17240 + }, + { + "grad_norm": 0.13916902244091034, + "learning_rate": 5.084082510944749e-06, + "loss": 0.0045, + "step": 17250 + }, + { + "grad_norm": 0.1281646490097046, + "learning_rate": 5.047821731173058e-06, + "loss": 0.004, + "step": 17260 + }, + { + "grad_norm": 0.14342135190963745, + "learning_rate": 5.011683848940274e-06, + "loss": 0.0032, + "step": 17270 + }, + { + "grad_norm": 0.12058332562446594, + "learning_rate": 4.975668963045954e-06, + "loss": 0.0034, + "step": 17280 + }, + { + "grad_norm": 0.07655297219753265, + "learning_rate": 4.9397771719534525e-06, + "loss": 0.0029, + "step": 17290 + }, + { + "grad_norm": 0.13594765961170197, + "learning_rate": 4.904008573789548e-06, + "loss": 0.0039, + "step": 17300 + }, + { + "grad_norm": 0.077459916472435, + "learning_rate": 4.8683632663442005e-06, + "loss": 0.0032, + "step": 17310 + }, + { + "grad_norm": 0.1432296335697174, + "learning_rate": 4.832841347070343e-06, + "loss": 0.0053, + "step": 17320 + }, + { + "grad_norm": 0.19469566643238068, + "learning_rate": 4.797442913083539e-06, + "loss": 0.0076, + "step": 17330 + }, + { + "grad_norm": 0.13780337572097778, + "learning_rate": 4.7621680611617596e-06, + "loss": 0.0045, + "step": 17340 + }, + { + "grad_norm": 0.27491068840026855, + "learning_rate": 4.727016887745095e-06, + "loss": 0.0062, + "step": 17350 + }, + { + "grad_norm": 0.17727012932300568, + "learning_rate": 4.691989488935511e-06, + "loss": 0.0043, + "step": 17360 + }, + { + "grad_norm": 0.13536600768566132, + "learning_rate": 4.657085960496588e-06, + "loss": 0.0055, + "step": 17370 + }, + { + "grad_norm": 0.15148520469665527, + "learning_rate": 4.6223063978532265e-06, + "loss": 0.0069, + "step": 17380 + }, + { + "grad_norm": 0.1440393626689911, + "learning_rate": 4.587650896091439e-06, + "loss": 0.0066, + "step": 17390 + }, + { + "grad_norm": 0.27061715722084045, + "learning_rate": 4.553119549958035e-06, + "loss": 0.0099, + "step": 17400 + }, + { + "grad_norm": 0.14800067245960236, + "learning_rate": 4.518712453860385e-06, + "loss": 0.0076, + "step": 17410 + }, + { + "grad_norm": 0.14428620040416718, + "learning_rate": 4.484429701866205e-06, + "loss": 0.0065, + "step": 17420 + }, + { + "grad_norm": 0.16224883496761322, + "learning_rate": 4.4502713877031975e-06, + "loss": 0.0074, + "step": 17430 + }, + { + "grad_norm": 0.12787465751171112, + "learning_rate": 4.416237604758911e-06, + "loss": 0.0065, + "step": 17440 + }, + { + "grad_norm": 0.2244187295436859, + "learning_rate": 4.3823284460804025e-06, + "loss": 0.0041, + "step": 17450 + }, + { + "grad_norm": 0.09566570818424225, + "learning_rate": 4.348544004374011e-06, + "loss": 0.0029, + "step": 17460 + }, + { + "grad_norm": 0.1468641608953476, + "learning_rate": 4.314884372005123e-06, + "loss": 0.0038, + "step": 17470 + }, + { + "grad_norm": 0.16427893936634064, + "learning_rate": 4.281349640997867e-06, + "loss": 0.0059, + "step": 17480 + }, + { + "grad_norm": 0.13429954648017883, + "learning_rate": 4.247939903034942e-06, + "loss": 0.0048, + "step": 17490 + }, + { + "grad_norm": 0.09337612241506577, + "learning_rate": 4.214655249457284e-06, + "loss": 0.0045, + "step": 17500 + }, + { + "grad_norm": 0.14162364602088928, + "learning_rate": 4.181495771263855e-06, + "loss": 0.0048, + "step": 17510 + }, + { + "grad_norm": 0.1563166081905365, + "learning_rate": 4.148461559111427e-06, + "loss": 0.004, + "step": 17520 + }, + { + "grad_norm": 0.13071994483470917, + "learning_rate": 4.115552703314252e-06, + "loss": 0.0055, + "step": 17530 + }, + { + "grad_norm": 0.13444240391254425, + "learning_rate": 4.082769293843886e-06, + "loss": 0.0046, + "step": 17540 + }, + { + "grad_norm": 0.11349745094776154, + "learning_rate": 4.050111420328939e-06, + "loss": 0.0034, + "step": 17550 + }, + { + "grad_norm": 0.1408773511648178, + "learning_rate": 4.017579172054764e-06, + "loss": 0.0048, + "step": 17560 + }, + { + "grad_norm": 0.14414112269878387, + "learning_rate": 3.985172637963308e-06, + "loss": 0.0049, + "step": 17570 + }, + { + "grad_norm": 0.149555042386055, + "learning_rate": 3.952891906652784e-06, + "loss": 0.0043, + "step": 17580 + }, + { + "grad_norm": 0.1302613466978073, + "learning_rate": 3.920737066377478e-06, + "loss": 0.0069, + "step": 17590 + }, + { + "grad_norm": 0.11122079938650131, + "learning_rate": 3.888708205047509e-06, + "loss": 0.005, + "step": 17600 + }, + { + "grad_norm": 0.10337156802415848, + "learning_rate": 3.856805410228542e-06, + "loss": 0.005, + "step": 17610 + }, + { + "grad_norm": 0.10167747735977173, + "learning_rate": 3.82502876914162e-06, + "loss": 0.0047, + "step": 17620 + }, + { + "grad_norm": 0.13269159197807312, + "learning_rate": 3.7933783686628586e-06, + "loss": 0.0038, + "step": 17630 + }, + { + "grad_norm": 0.08322536200284958, + "learning_rate": 3.7618542953232306e-06, + "loss": 0.0048, + "step": 17640 + }, + { + "grad_norm": 0.09429629147052765, + "learning_rate": 3.7304566353083658e-06, + "loss": 0.0035, + "step": 17650 + }, + { + "grad_norm": 0.18311958014965057, + "learning_rate": 3.6991854744582555e-06, + "loss": 0.0042, + "step": 17660 + }, + { + "grad_norm": 0.141080841422081, + "learning_rate": 3.6680408982670777e-06, + "loss": 0.0055, + "step": 17670 + }, + { + "grad_norm": 0.11543253064155579, + "learning_rate": 3.637022991882899e-06, + "loss": 0.0043, + "step": 17680 + }, + { + "grad_norm": 0.09569806605577469, + "learning_rate": 3.606131840107485e-06, + "loss": 0.0041, + "step": 17690 + }, + { + "grad_norm": 0.12522606551647186, + "learning_rate": 3.575367527396084e-06, + "loss": 0.0037, + "step": 17700 + }, + { + "grad_norm": 0.1661101132631302, + "learning_rate": 3.5447301378571386e-06, + "loss": 0.0061, + "step": 17710 + }, + { + "grad_norm": 0.12468750774860382, + "learning_rate": 3.514219755252113e-06, + "loss": 0.0086, + "step": 17720 + }, + { + "grad_norm": 0.1639896184206009, + "learning_rate": 3.4838364629952213e-06, + "loss": 0.0075, + "step": 17730 + }, + { + "grad_norm": 0.10949661582708359, + "learning_rate": 3.4535803441532123e-06, + "loss": 0.0106, + "step": 17740 + }, + { + "grad_norm": 0.16847091913223267, + "learning_rate": 3.4234514814451836e-06, + "loss": 0.0057, + "step": 17750 + }, + { + "grad_norm": 0.1749618500471115, + "learning_rate": 3.393449957242273e-06, + "loss": 0.0098, + "step": 17760 + }, + { + "grad_norm": 0.1580338329076767, + "learning_rate": 3.363575853567524e-06, + "loss": 0.0067, + "step": 17770 + }, + { + "grad_norm": 0.2585931718349457, + "learning_rate": 3.3338292520955826e-06, + "loss": 0.0067, + "step": 17780 + }, + { + "grad_norm": 0.0917854756116867, + "learning_rate": 3.304210234152516e-06, + "loss": 0.0063, + "step": 17790 + }, + { + "grad_norm": 0.17603246867656708, + "learning_rate": 3.2747188807155993e-06, + "loss": 0.0038, + "step": 17800 + }, + { + "grad_norm": 0.13251322507858276, + "learning_rate": 3.2453552724130643e-06, + "loss": 0.0055, + "step": 17810 + }, + { + "grad_norm": 0.1688220202922821, + "learning_rate": 3.216119489523889e-06, + "loss": 0.005, + "step": 17820 + }, + { + "grad_norm": 0.23231695592403412, + "learning_rate": 3.1870116119775917e-06, + "loss": 0.0043, + "step": 17830 + }, + { + "grad_norm": 0.08771422505378723, + "learning_rate": 3.158031719353999e-06, + "loss": 0.0069, + "step": 17840 + }, + { + "grad_norm": 0.12282171845436096, + "learning_rate": 3.1291798908830273e-06, + "loss": 0.0059, + "step": 17850 + }, + { + "grad_norm": 0.24187803268432617, + "learning_rate": 3.1004562054444853e-06, + "loss": 0.0074, + "step": 17860 + }, + { + "grad_norm": 0.15783224999904633, + "learning_rate": 3.071860741567806e-06, + "loss": 0.009, + "step": 17870 + }, + { + "grad_norm": 0.2071942687034607, + "learning_rate": 3.04339357743193e-06, + "loss": 0.0055, + "step": 17880 + }, + { + "grad_norm": 0.18771645426750183, + "learning_rate": 3.0150547908649628e-06, + "loss": 0.0065, + "step": 17890 + }, + { + "grad_norm": 0.21918553113937378, + "learning_rate": 2.9868444593440957e-06, + "loss": 0.0067, + "step": 17900 + }, + { + "grad_norm": 0.13102947175502777, + "learning_rate": 2.9587626599952846e-06, + "loss": 0.0077, + "step": 17910 + }, + { + "grad_norm": 0.12785767018795013, + "learning_rate": 2.930809469593082e-06, + "loss": 0.0071, + "step": 17920 + }, + { + "grad_norm": 0.14695285260677338, + "learning_rate": 2.9029849645604733e-06, + "loss": 0.0056, + "step": 17930 + }, + { + "grad_norm": 0.13243861496448517, + "learning_rate": 2.8752892209685632e-06, + "loss": 0.0056, + "step": 17940 + }, + { + "grad_norm": 0.09605616331100464, + "learning_rate": 2.847722314536483e-06, + "loss": 0.0026, + "step": 17950 + }, + { + "grad_norm": 0.08582542836666107, + "learning_rate": 2.820284320631078e-06, + "loss": 0.0025, + "step": 17960 + }, + { + "grad_norm": 0.10980791598558426, + "learning_rate": 2.792975314266788e-06, + "loss": 0.0021, + "step": 17970 + }, + { + "grad_norm": 0.15544140338897705, + "learning_rate": 2.7657953701054007e-06, + "loss": 0.0041, + "step": 17980 + }, + { + "grad_norm": 0.1361827701330185, + "learning_rate": 2.7387445624558306e-06, + "loss": 0.0068, + "step": 17990 + }, + { + "grad_norm": 0.13008910417556763, + "learning_rate": 2.7118229652739747e-06, + "loss": 0.0069, + "step": 18000 + }, + { + "grad_norm": 0.1772952526807785, + "learning_rate": 2.6850306521624236e-06, + "loss": 0.008, + "step": 18010 + }, + { + "grad_norm": 0.09242834895849228, + "learning_rate": 2.6583676963703507e-06, + "loss": 0.0055, + "step": 18020 + }, + { + "grad_norm": 0.093389131128788, + "learning_rate": 2.631834170793268e-06, + "loss": 0.0054, + "step": 18030 + }, + { + "grad_norm": 0.11211761087179184, + "learning_rate": 2.6054301479728036e-06, + "loss": 0.005, + "step": 18040 + }, + { + "grad_norm": 0.09247152507305145, + "learning_rate": 2.579155700096575e-06, + "loss": 0.0062, + "step": 18050 + }, + { + "grad_norm": 0.16921767592430115, + "learning_rate": 2.5530108989978873e-06, + "loss": 0.0097, + "step": 18060 + }, + { + "grad_norm": 0.18001088500022888, + "learning_rate": 2.5269958161556416e-06, + "loss": 0.0101, + "step": 18070 + }, + { + "grad_norm": 0.10291004925966263, + "learning_rate": 2.5011105226940888e-06, + "loss": 0.0057, + "step": 18080 + }, + { + "grad_norm": 0.14669133722782135, + "learning_rate": 2.4753550893826248e-06, + "loss": 0.0064, + "step": 18090 + }, + { + "grad_norm": 0.10507582128047943, + "learning_rate": 2.4497295866356296e-06, + "loss": 0.0062, + "step": 18100 + }, + { + "grad_norm": 0.09868519008159637, + "learning_rate": 2.424234084512228e-06, + "loss": 0.0056, + "step": 18110 + }, + { + "grad_norm": 0.31316834688186646, + "learning_rate": 2.3988686527161687e-06, + "loss": 0.0108, + "step": 18120 + }, + { + "grad_norm": 0.18530811369419098, + "learning_rate": 2.373633360595573e-06, + "loss": 0.0087, + "step": 18130 + }, + { + "grad_norm": 0.1749511957168579, + "learning_rate": 2.3485282771427585e-06, + "loss": 0.0062, + "step": 18140 + }, + { + "grad_norm": 0.1277114450931549, + "learning_rate": 2.3235534709940665e-06, + "loss": 0.006, + "step": 18150 + }, + { + "grad_norm": 0.19273699820041656, + "learning_rate": 2.2987090104296617e-06, + "loss": 0.0058, + "step": 18160 + }, + { + "grad_norm": 0.11989979445934296, + "learning_rate": 2.273994963373355e-06, + "loss": 0.0055, + "step": 18170 + }, + { + "grad_norm": 0.11362680792808533, + "learning_rate": 2.249411397392409e-06, + "loss": 0.0055, + "step": 18180 + }, + { + "grad_norm": 0.2549756169319153, + "learning_rate": 2.2249583796973506e-06, + "loss": 0.0065, + "step": 18190 + }, + { + "grad_norm": 0.18598651885986328, + "learning_rate": 2.200635977141796e-06, + "loss": 0.0068, + "step": 18200 + }, + { + "grad_norm": 0.14610855281352997, + "learning_rate": 2.17644425622226e-06, + "loss": 0.0059, + "step": 18210 + }, + { + "grad_norm": 0.09029777348041534, + "learning_rate": 2.152383283077991e-06, + "loss": 0.0052, + "step": 18220 + }, + { + "grad_norm": 0.12696830928325653, + "learning_rate": 2.128453123490781e-06, + "loss": 0.0049, + "step": 18230 + }, + { + "grad_norm": 0.15461085736751556, + "learning_rate": 2.1046538428847462e-06, + "loss": 0.0048, + "step": 18240 + }, + { + "grad_norm": 0.11697833985090256, + "learning_rate": 2.0809855063262273e-06, + "loss": 0.0046, + "step": 18250 + }, + { + "grad_norm": 0.11618199199438095, + "learning_rate": 2.057448178523558e-06, + "loss": 0.0056, + "step": 18260 + }, + { + "grad_norm": 0.153721883893013, + "learning_rate": 2.034041923826885e-06, + "loss": 0.0066, + "step": 18270 + }, + { + "grad_norm": 0.14544463157653809, + "learning_rate": 2.0107668062280204e-06, + "loss": 0.0068, + "step": 18280 + }, + { + "grad_norm": 0.10187650471925735, + "learning_rate": 1.9876228893602357e-06, + "loss": 0.0054, + "step": 18290 + }, + { + "grad_norm": 0.09542325884103775, + "learning_rate": 1.9646102364981266e-06, + "loss": 0.0044, + "step": 18300 + }, + { + "grad_norm": 0.11613254994153976, + "learning_rate": 1.9417289105574053e-06, + "loss": 0.0069, + "step": 18310 + }, + { + "grad_norm": 0.09797412157058716, + "learning_rate": 1.9189789740947427e-06, + "loss": 0.0024, + "step": 18320 + }, + { + "grad_norm": 0.12735988199710846, + "learning_rate": 1.896360489307597e-06, + "loss": 0.0045, + "step": 18330 + }, + { + "grad_norm": 0.10587464272975922, + "learning_rate": 1.8738735180340362e-06, + "loss": 0.0046, + "step": 18340 + }, + { + "grad_norm": 0.10340233147144318, + "learning_rate": 1.8515181217525824e-06, + "loss": 0.0046, + "step": 18350 + }, + { + "grad_norm": 0.1418112814426422, + "learning_rate": 1.8292943615820457e-06, + "loss": 0.0053, + "step": 18360 + }, + { + "grad_norm": 0.08964535593986511, + "learning_rate": 1.8072022982813296e-06, + "loss": 0.006, + "step": 18370 + }, + { + "grad_norm": 0.15458083152770996, + "learning_rate": 1.7852419922492925e-06, + "loss": 0.0066, + "step": 18380 + }, + { + "grad_norm": 0.14070585370063782, + "learning_rate": 1.763413503524569e-06, + "loss": 0.0045, + "step": 18390 + }, + { + "grad_norm": 0.11768438667058945, + "learning_rate": 1.7417168917854165e-06, + "loss": 0.0047, + "step": 18400 + }, + { + "grad_norm": 0.10859528928995132, + "learning_rate": 1.720152216349552e-06, + "loss": 0.0042, + "step": 18410 + }, + { + "grad_norm": 0.06654351204633713, + "learning_rate": 1.6987195361739595e-06, + "loss": 0.0041, + "step": 18420 + }, + { + "grad_norm": 0.08936707675457001, + "learning_rate": 1.6774189098547832e-06, + "loss": 0.0061, + "step": 18430 + }, + { + "grad_norm": 0.12725003063678741, + "learning_rate": 1.6562503956271069e-06, + "loss": 0.0058, + "step": 18440 + }, + { + "grad_norm": 0.13364547491073608, + "learning_rate": 1.6352140513648417e-06, + "loss": 0.0061, + "step": 18450 + }, + { + "grad_norm": 0.11553805321455002, + "learning_rate": 1.6143099345805712e-06, + "loss": 0.0034, + "step": 18460 + }, + { + "grad_norm": 0.06523273885250092, + "learning_rate": 1.5935381024253293e-06, + "loss": 0.0026, + "step": 18470 + }, + { + "grad_norm": 0.04697663336992264, + "learning_rate": 1.572898611688517e-06, + "loss": 0.0025, + "step": 18480 + }, + { + "grad_norm": 0.08669311553239822, + "learning_rate": 1.5523915187977133e-06, + "loss": 0.0024, + "step": 18490 + }, + { + "grad_norm": 0.07350636273622513, + "learning_rate": 1.532016879818532e-06, + "loss": 0.0017, + "step": 18500 + }, + { + "grad_norm": 0.07322671264410019, + "learning_rate": 1.51177475045447e-06, + "loss": 0.0016, + "step": 18510 + }, + { + "grad_norm": 0.1319756805896759, + "learning_rate": 1.4916651860467035e-06, + "loss": 0.0039, + "step": 18520 + }, + { + "grad_norm": 0.07744371145963669, + "learning_rate": 1.471688241574043e-06, + "loss": 0.0041, + "step": 18530 + }, + { + "grad_norm": 0.1896924376487732, + "learning_rate": 1.451843971652672e-06, + "loss": 0.006, + "step": 18540 + }, + { + "grad_norm": 0.10496588051319122, + "learning_rate": 1.432132430536076e-06, + "loss": 0.0064, + "step": 18550 + }, + { + "grad_norm": 0.06767508387565613, + "learning_rate": 1.412553672114869e-06, + "loss": 0.0052, + "step": 18560 + }, + { + "grad_norm": 0.09917920082807541, + "learning_rate": 1.3931077499166056e-06, + "loss": 0.005, + "step": 18570 + }, + { + "grad_norm": 0.08152318000793457, + "learning_rate": 1.3737947171057085e-06, + "loss": 0.0042, + "step": 18580 + }, + { + "grad_norm": 0.14928488433361053, + "learning_rate": 1.3546146264832582e-06, + "loss": 0.0062, + "step": 18590 + }, + { + "grad_norm": 0.09968605637550354, + "learning_rate": 1.3355675304869086e-06, + "loss": 0.0047, + "step": 18600 + }, + { + "grad_norm": 0.11749617755413055, + "learning_rate": 1.3166534811906827e-06, + "loss": 0.0061, + "step": 18610 + }, + { + "grad_norm": 0.1802300065755844, + "learning_rate": 1.2978725303048666e-06, + "loss": 0.0068, + "step": 18620 + }, + { + "grad_norm": 0.1360599547624588, + "learning_rate": 1.2792247291758762e-06, + "loss": 0.0073, + "step": 18630 + }, + { + "grad_norm": 0.1120266318321228, + "learning_rate": 1.2607101287860635e-06, + "loss": 0.0063, + "step": 18640 + }, + { + "grad_norm": 0.11309847980737686, + "learning_rate": 1.2423287797536654e-06, + "loss": 0.0061, + "step": 18650 + }, + { + "grad_norm": 0.08905728161334991, + "learning_rate": 1.2240807323325776e-06, + "loss": 0.0054, + "step": 18660 + }, + { + "grad_norm": 0.24295245110988617, + "learning_rate": 1.205966036412254e-06, + "loss": 0.0046, + "step": 18670 + }, + { + "grad_norm": 0.1244949996471405, + "learning_rate": 1.1879847415175949e-06, + "loss": 0.0055, + "step": 18680 + }, + { + "grad_norm": 0.11482975631952286, + "learning_rate": 1.1701368968087712e-06, + "loss": 0.0046, + "step": 18690 + }, + { + "grad_norm": 0.06103646755218506, + "learning_rate": 1.1524225510811116e-06, + "loss": 0.0043, + "step": 18700 + }, + { + "grad_norm": 0.07703328877687454, + "learning_rate": 1.1348417527649535e-06, + "loss": 0.0044, + "step": 18710 + }, + { + "grad_norm": 0.07205721735954285, + "learning_rate": 1.1173945499255268e-06, + "loss": 0.0038, + "step": 18720 + }, + { + "grad_norm": 0.09051566570997238, + "learning_rate": 1.1000809902628307e-06, + "loss": 0.0044, + "step": 18730 + }, + { + "grad_norm": 0.0802495926618576, + "learning_rate": 1.082901121111468e-06, + "loss": 0.0026, + "step": 18740 + }, + { + "grad_norm": 0.09515158832073212, + "learning_rate": 1.0658549894405456e-06, + "loss": 0.003, + "step": 18750 + }, + { + "grad_norm": 0.07529661804437637, + "learning_rate": 1.0489426418535342e-06, + "loss": 0.0032, + "step": 18760 + }, + { + "grad_norm": 0.08873124420642853, + "learning_rate": 1.0321641245881474e-06, + "loss": 0.0032, + "step": 18770 + }, + { + "grad_norm": 0.14086146652698517, + "learning_rate": 1.015519483516214e-06, + "loss": 0.0037, + "step": 18780 + }, + { + "grad_norm": 0.10187302529811859, + "learning_rate": 9.990087641435443e-07, + "loss": 0.0038, + "step": 18790 + }, + { + "grad_norm": 0.1314893215894699, + "learning_rate": 9.826320116098132e-07, + "loss": 0.0066, + "step": 18800 + }, + { + "grad_norm": 0.24432112276554108, + "learning_rate": 9.663892706884447e-07, + "loss": 0.0056, + "step": 18810 + }, + { + "grad_norm": 0.12585218250751495, + "learning_rate": 9.502805857864616e-07, + "loss": 0.01, + "step": 18820 + }, + { + "grad_norm": 0.13484077155590057, + "learning_rate": 9.34306000944396e-07, + "loss": 0.0068, + "step": 18830 + }, + { + "grad_norm": 0.046812377870082855, + "learning_rate": 9.184655598361624e-07, + "loss": 0.0034, + "step": 18840 + }, + { + "grad_norm": 0.05349060893058777, + "learning_rate": 9.027593057689076e-07, + "loss": 0.0027, + "step": 18850 + }, + { + "grad_norm": 0.039122164249420166, + "learning_rate": 8.871872816829441e-07, + "loss": 0.0024, + "step": 18860 + }, + { + "grad_norm": 0.06817688047885895, + "learning_rate": 8.717495301515777e-07, + "loss": 0.0033, + "step": 18870 + }, + { + "grad_norm": 0.13046491146087646, + "learning_rate": 8.564460933810415e-07, + "loss": 0.006, + "step": 18880 + }, + { + "grad_norm": 0.12704172730445862, + "learning_rate": 8.412770132103453e-07, + "loss": 0.0076, + "step": 18890 + }, + { + "grad_norm": 0.10467687994241714, + "learning_rate": 8.262423311111711e-07, + "loss": 0.0057, + "step": 18900 + }, + { + "grad_norm": 0.11078772693872452, + "learning_rate": 8.113420881877665e-07, + "loss": 0.0056, + "step": 18910 + }, + { + "grad_norm": 0.09386801719665527, + "learning_rate": 7.965763251768288e-07, + "loss": 0.0045, + "step": 18920 + }, + { + "grad_norm": 0.13769307732582092, + "learning_rate": 7.819450824473995e-07, + "loss": 0.0047, + "step": 18930 + }, + { + "grad_norm": 0.09748882800340652, + "learning_rate": 7.674484000007198e-07, + "loss": 0.0026, + "step": 18940 + }, + { + "grad_norm": 0.06319034844636917, + "learning_rate": 7.530863174701752e-07, + "loss": 0.0031, + "step": 18950 + }, + { + "grad_norm": 0.06810025125741959, + "learning_rate": 7.38858874121151e-07, + "loss": 0.002, + "step": 18960 + }, + { + "grad_norm": 0.15080776810646057, + "learning_rate": 7.247661088509328e-07, + "loss": 0.0022, + "step": 18970 + }, + { + "grad_norm": 0.0982227623462677, + "learning_rate": 7.108080601886002e-07, + "loss": 0.0044, + "step": 18980 + }, + { + "grad_norm": 0.09390969574451447, + "learning_rate": 6.969847662949336e-07, + "loss": 0.0044, + "step": 18990 + }, + { + "grad_norm": 0.0818520113825798, + "learning_rate": 6.832962649622798e-07, + "loss": 0.0067, + "step": 19000 + }, + { + "grad_norm": 0.096654511988163, + "learning_rate": 6.697425936144863e-07, + "loss": 0.0058, + "step": 19010 + }, + { + "grad_norm": 0.06322211772203445, + "learning_rate": 6.563237893067731e-07, + "loss": 0.0044, + "step": 19020 + }, + { + "grad_norm": 0.10598164796829224, + "learning_rate": 6.430398887256328e-07, + "loss": 0.0053, + "step": 19030 + }, + { + "grad_norm": 0.05596324801445007, + "learning_rate": 6.298909281887478e-07, + "loss": 0.0038, + "step": 19040 + }, + { + "grad_norm": 0.13128861784934998, + "learning_rate": 6.168769436448673e-07, + "loss": 0.0046, + "step": 19050 + }, + { + "grad_norm": 0.11795146763324738, + "learning_rate": 6.03997970673742e-07, + "loss": 0.0042, + "step": 19060 + }, + { + "grad_norm": 0.12046348303556442, + "learning_rate": 5.912540444859782e-07, + "loss": 0.006, + "step": 19070 + }, + { + "grad_norm": 0.07680107653141022, + "learning_rate": 5.786451999229837e-07, + "loss": 0.0083, + "step": 19080 + }, + { + "grad_norm": 0.06225180998444557, + "learning_rate": 5.661714714568722e-07, + "loss": 0.0059, + "step": 19090 + }, + { + "grad_norm": 0.06460992246866226, + "learning_rate": 5.538328931903259e-07, + "loss": 0.0058, + "step": 19100 + }, + { + "grad_norm": 0.07668179273605347, + "learning_rate": 5.416294988565551e-07, + "loss": 0.005, + "step": 19110 + }, + { + "grad_norm": 0.09261664748191833, + "learning_rate": 5.29561321819172e-07, + "loss": 0.0058, + "step": 19120 + }, + { + "grad_norm": 0.07152815163135529, + "learning_rate": 5.176283950721061e-07, + "loss": 0.0049, + "step": 19130 + }, + { + "grad_norm": 0.059480778872966766, + "learning_rate": 5.058307512395332e-07, + "loss": 0.0054, + "step": 19140 + }, + { + "grad_norm": 0.1595279425382614, + "learning_rate": 4.941684225757526e-07, + "loss": 0.0068, + "step": 19150 + }, + { + "grad_norm": 0.07684699445962906, + "learning_rate": 4.826414409651314e-07, + "loss": 0.0058, + "step": 19160 + }, + { + "grad_norm": 0.15140311419963837, + "learning_rate": 4.712498379219943e-07, + "loss": 0.006, + "step": 19170 + }, + { + "grad_norm": 0.11094372719526291, + "learning_rate": 4.599936445905506e-07, + "loss": 0.0044, + "step": 19180 + }, + { + "grad_norm": 0.03870063275098801, + "learning_rate": 4.4887289174480594e-07, + "loss": 0.0024, + "step": 19190 + }, + { + "grad_norm": 0.07703114300966263, + "learning_rate": 4.378876097884621e-07, + "loss": 0.0016, + "step": 19200 + }, + { + "grad_norm": 0.04670195281505585, + "learning_rate": 4.2703782875487264e-07, + "loss": 0.0023, + "step": 19210 + }, + { + "grad_norm": 0.024242544546723366, + "learning_rate": 4.163235783069208e-07, + "loss": 0.0025, + "step": 19220 + }, + { + "grad_norm": 0.09502039849758148, + "learning_rate": 4.057448877369585e-07, + "loss": 0.0034, + "step": 19230 + }, + { + "grad_norm": 0.054932866245508194, + "learning_rate": 3.9530178596672295e-07, + "loss": 0.0027, + "step": 19240 + }, + { + "grad_norm": 0.03872097656130791, + "learning_rate": 3.849943015472479e-07, + "loss": 0.0046, + "step": 19250 + }, + { + "grad_norm": 0.05287490412592888, + "learning_rate": 3.748224626588137e-07, + "loss": 0.0039, + "step": 19260 + }, + { + "grad_norm": 0.08801569044589996, + "learning_rate": 3.647862971108307e-07, + "loss": 0.005, + "step": 19270 + }, + { + "grad_norm": 0.08919723331928253, + "learning_rate": 3.5488583234179473e-07, + "loss": 0.0066, + "step": 19280 + }, + { + "grad_norm": 0.06664815545082092, + "learning_rate": 3.4512109541920413e-07, + "loss": 0.0055, + "step": 19290 + }, + { + "grad_norm": 0.1845441609621048, + "learning_rate": 3.354921130394706e-07, + "loss": 0.0033, + "step": 19300 + }, + { + "grad_norm": 0.07908899337053299, + "learning_rate": 3.259989115278639e-07, + "loss": 0.0035, + "step": 19310 + }, + { + "grad_norm": 0.10014499723911285, + "learning_rate": 3.1664151683843403e-07, + "loss": 0.0041, + "step": 19320 + }, + { + "grad_norm": 0.12704405188560486, + "learning_rate": 3.074199545539447e-07, + "loss": 0.007, + "step": 19330 + }, + { + "grad_norm": 0.1298447549343109, + "learning_rate": 2.983342498857955e-07, + "loss": 0.0066, + "step": 19340 + }, + { + "grad_norm": 0.06533486396074295, + "learning_rate": 2.893844276739499e-07, + "loss": 0.0042, + "step": 19350 + }, + { + "grad_norm": 0.08153844624757767, + "learning_rate": 2.8057051238688514e-07, + "loss": 0.0037, + "step": 19360 + }, + { + "grad_norm": 0.05825541913509369, + "learning_rate": 2.71892528121509e-07, + "loss": 0.0041, + "step": 19370 + }, + { + "grad_norm": 0.16420342028141022, + "learning_rate": 2.633504986030988e-07, + "loss": 0.0057, + "step": 19380 + }, + { + "grad_norm": 0.13691189885139465, + "learning_rate": 2.549444471852347e-07, + "loss": 0.0065, + "step": 19390 + }, + { + "grad_norm": 0.1357102245092392, + "learning_rate": 2.4667439684974423e-07, + "loss": 0.0059, + "step": 19400 + }, + { + "grad_norm": 0.12307611852884293, + "learning_rate": 2.3854037020662467e-07, + "loss": 0.0068, + "step": 19410 + }, + { + "grad_norm": 0.030170906335115433, + "learning_rate": 2.3054238949399288e-07, + "loss": 0.0038, + "step": 19420 + }, + { + "grad_norm": 0.05144472420215607, + "learning_rate": 2.2268047657802993e-07, + "loss": 0.0044, + "step": 19430 + }, + { + "grad_norm": 0.05018848180770874, + "learning_rate": 2.149546529529034e-07, + "loss": 0.0049, + "step": 19440 + }, + { + "grad_norm": 0.1274074763059616, + "learning_rate": 2.0736493974071736e-07, + "loss": 0.0055, + "step": 19450 + }, + { + "grad_norm": 0.05872267857193947, + "learning_rate": 1.9991135769145686e-07, + "loss": 0.0046, + "step": 19460 + }, + { + "grad_norm": 0.07353901863098145, + "learning_rate": 1.9259392718293245e-07, + "loss": 0.0059, + "step": 19470 + }, + { + "grad_norm": 0.09787874668836594, + "learning_rate": 1.8541266822072467e-07, + "loss": 0.0071, + "step": 19480 + }, + { + "grad_norm": 0.08466354757547379, + "learning_rate": 1.7836760043811184e-07, + "loss": 0.0063, + "step": 19490 + }, + { + "grad_norm": 0.10716100037097931, + "learning_rate": 1.7145874309604792e-07, + "loss": 0.0059, + "step": 19500 + }, + { + "grad_norm": 0.12967455387115479, + "learning_rate": 1.6468611508308474e-07, + "loss": 0.0091, + "step": 19510 + }, + { + "grad_norm": 0.09471118450164795, + "learning_rate": 1.5804973491532204e-07, + "loss": 0.006, + "step": 19520 + }, + { + "grad_norm": 0.04478686302900314, + "learning_rate": 1.5154962073637424e-07, + "loss": 0.0052, + "step": 19530 + }, + { + "grad_norm": 0.028785180300474167, + "learning_rate": 1.4518579031730372e-07, + "loss": 0.0024, + "step": 19540 + }, + { + "grad_norm": 0.07058226317167282, + "learning_rate": 1.389582610565876e-07, + "loss": 0.0017, + "step": 19550 + }, + { + "grad_norm": 0.09576913714408875, + "learning_rate": 1.3286704998003995e-07, + "loss": 0.0023, + "step": 19560 + }, + { + "grad_norm": 0.06446996331214905, + "learning_rate": 1.2691217374080632e-07, + "loss": 0.0035, + "step": 19570 + }, + { + "grad_norm": 0.08097999542951584, + "learning_rate": 1.2109364861929705e-07, + "loss": 0.0026, + "step": 19580 + }, + { + "grad_norm": 0.15360364317893982, + "learning_rate": 1.1541149052312628e-07, + "loss": 0.0037, + "step": 19590 + }, + { + "grad_norm": 0.13213372230529785, + "learning_rate": 1.0986571498710074e-07, + "loss": 0.0033, + "step": 19600 + }, + { + "grad_norm": 0.057541485875844955, + "learning_rate": 1.0445633717316438e-07, + "loss": 0.0029, + "step": 19610 + }, + { + "grad_norm": 0.10008604824542999, + "learning_rate": 9.918337187034277e-08, + "loss": 0.0041, + "step": 19620 + }, + { + "grad_norm": 0.10044741630554199, + "learning_rate": 9.404683349472643e-08, + "loss": 0.0036, + "step": 19630 + }, + { + "grad_norm": 0.09773146361112595, + "learning_rate": 8.904673608940983e-08, + "loss": 0.0053, + "step": 19640 + }, + { + "grad_norm": 0.12441589683294296, + "learning_rate": 8.418309332447471e-08, + "loss": 0.005, + "step": 19650 + }, + { + "grad_norm": 0.10869863629341125, + "learning_rate": 7.945591849692902e-08, + "loss": 0.0053, + "step": 19660 + }, + { + "grad_norm": 0.11533400416374207, + "learning_rate": 7.486522453069578e-08, + "loss": 0.0062, + "step": 19670 + }, + { + "grad_norm": 0.14639101922512054, + "learning_rate": 7.041102397655208e-08, + "loss": 0.0076, + "step": 19680 + }, + { + "grad_norm": 0.08389224857091904, + "learning_rate": 6.609332901210685e-08, + "loss": 0.0047, + "step": 19690 + }, + { + "grad_norm": 0.07598240673542023, + "learning_rate": 6.191215144178419e-08, + "loss": 0.0045, + "step": 19700 + }, + { + "grad_norm": 0.08431699872016907, + "learning_rate": 5.786750269675678e-08, + "loss": 0.0065, + "step": 19710 + }, + { + "grad_norm": 0.10730160772800446, + "learning_rate": 5.395939383494031e-08, + "loss": 0.006, + "step": 19720 + }, + { + "grad_norm": 0.10121988505125046, + "learning_rate": 5.018783554095463e-08, + "loss": 0.0083, + "step": 19730 + }, + { + "grad_norm": 0.05829305946826935, + "learning_rate": 4.655283812610156e-08, + "loss": 0.0048, + "step": 19740 + }, + { + "grad_norm": 0.07935597747564316, + "learning_rate": 4.305441152831491e-08, + "loss": 0.0041, + "step": 19750 + }, + { + "grad_norm": 0.04502784460783005, + "learning_rate": 3.9692565312171584e-08, + "loss": 0.0035, + "step": 19760 + }, + { + "grad_norm": 0.16611430048942566, + "learning_rate": 3.6467308668824975e-08, + "loss": 0.0055, + "step": 19770 + }, + { + "grad_norm": 0.08389358222484589, + "learning_rate": 3.3378650416004964e-08, + "loss": 0.0049, + "step": 19780 + }, + { + "grad_norm": 0.07105663418769836, + "learning_rate": 3.042659899797906e-08, + "loss": 0.0059, + "step": 19790 + }, + { + "grad_norm": 0.1458117961883545, + "learning_rate": 2.76111624855524e-08, + "loss": 0.0048, + "step": 19800 + }, + { + "grad_norm": 0.09140418469905853, + "learning_rate": 2.4932348576017784e-08, + "loss": 0.0052, + "step": 19810 + }, + { + "grad_norm": 0.10562155395746231, + "learning_rate": 2.239016459314458e-08, + "loss": 0.0042, + "step": 19820 + }, + { + "grad_norm": 0.11864247173070908, + "learning_rate": 1.9984617487173174e-08, + "loss": 0.0046, + "step": 19830 + }, + { + "grad_norm": 0.08995606005191803, + "learning_rate": 1.7715713834776105e-08, + "loss": 0.0044, + "step": 19840 + }, + { + "grad_norm": 0.11056473106145859, + "learning_rate": 1.5583459839046964e-08, + "loss": 0.0034, + "step": 19850 + }, + { + "grad_norm": 0.1718575656414032, + "learning_rate": 1.3587861329489304e-08, + "loss": 0.0057, + "step": 19860 + }, + { + "grad_norm": 0.09055381268262863, + "learning_rate": 1.1728923761994415e-08, + "loss": 0.0064, + "step": 19870 + }, + { + "grad_norm": 0.10868086665868759, + "learning_rate": 1.0006652218819135e-08, + "loss": 0.0069, + "step": 19880 + }, + { + "grad_norm": 0.14595797657966614, + "learning_rate": 8.421051408596947e-09, + "loss": 0.0069, + "step": 19890 + }, + { + "grad_norm": 0.07357790321111679, + "learning_rate": 6.972125666299123e-09, + "loss": 0.0049, + "step": 19900 + }, + { + "grad_norm": 0.0999709889292717, + "learning_rate": 5.659878953229169e-09, + "loss": 0.0044, + "step": 19910 + }, + { + "grad_norm": 0.0717727541923523, + "learning_rate": 4.48431485701728e-09, + "loss": 0.0026, + "step": 19920 + }, + { + "grad_norm": 0.08553707599639893, + "learning_rate": 3.4454365916203322e-09, + "loss": 0.0033, + "step": 19930 + }, + { + "grad_norm": 0.1162085235118866, + "learning_rate": 2.5432469972830332e-09, + "loss": 0.0034, + "step": 19940 + }, + { + "grad_norm": 0.10553829371929169, + "learning_rate": 1.7777485405601203e-09, + "loss": 0.0034, + "step": 19950 + }, + { + "grad_norm": 0.05970882624387741, + "learning_rate": 1.1489433142941597e-09, + "loss": 0.0055, + "step": 19960 + }, + { + "grad_norm": 0.12160852551460266, + "learning_rate": 6.568330376210963e-10, + "loss": 0.0052, + "step": 19970 + }, + { + "grad_norm": 0.04389548674225807, + "learning_rate": 3.0141905594249787e-10, + "loss": 0.0037, + "step": 19980 + }, + { + "grad_norm": 0.03494492545723915, + "learning_rate": 8.270234094776008e-11, + "loss": 0.0031, + "step": 19990 + }, + { + "grad_norm": 0.04751284793019295, + "learning_rate": 6.834906085551041e-13, + "loss": 0.0032, + "step": 20000 + } + ], + "logging_steps": 10, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 48, + "trial_name": null, + "trial_params": null +}