diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 36.014405762304925, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.012004801920768308, + "grad_norm": 1.4862018823623657, + "learning_rate": 9e-07, + "loss": 0.2093, + "step": 10 + }, + { + "epoch": 0.024009603841536616, + "grad_norm": 1.199112892150879, + "learning_rate": 1.9e-06, + "loss": 0.1872, + "step": 20 + }, + { + "epoch": 0.03601440576230492, + "grad_norm": 1.1331433057785034, + "learning_rate": 2.9e-06, + "loss": 0.1672, + "step": 30 + }, + { + "epoch": 0.04801920768307323, + "grad_norm": 1.742417812347412, + "learning_rate": 3.9e-06, + "loss": 0.1551, + "step": 40 + }, + { + "epoch": 0.060024009603841535, + "grad_norm": 0.5030286908149719, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.1574, + "step": 50 + }, + { + "epoch": 0.07202881152460984, + "grad_norm": 0.8100131750106812, + "learning_rate": 5.9e-06, + "loss": 0.1435, + "step": 60 + }, + { + "epoch": 0.08403361344537816, + "grad_norm": 0.42576003074645996, + "learning_rate": 6.900000000000001e-06, + "loss": 0.1401, + "step": 70 + }, + { + "epoch": 0.09603841536614646, + "grad_norm": 0.7617576122283936, + "learning_rate": 7.9e-06, + "loss": 0.1394, + "step": 80 + }, + { + "epoch": 0.10804321728691477, + "grad_norm": 0.5240992307662964, + "learning_rate": 8.9e-06, + "loss": 0.1358, + "step": 90 + }, + { + "epoch": 0.12004801920768307, + "grad_norm": 0.4942385256290436, + "learning_rate": 9.900000000000002e-06, + "loss": 0.124, + "step": 100 + }, + { + "epoch": 0.13205282112845138, + "grad_norm": 0.5717485547065735, + "learning_rate": 1.09e-05, + "loss": 0.1382, + "step": 110 + }, + { + "epoch": 0.14405762304921968, + "grad_norm": 0.513937771320343, + "learning_rate": 1.19e-05, + "loss": 0.1172, + "step": 120 + }, + { + "epoch": 0.15606242496998798, + "grad_norm": 0.5057616233825684, + "learning_rate": 1.29e-05, + "loss": 0.1245, + "step": 130 + }, + { + "epoch": 0.16806722689075632, + "grad_norm": 0.6607367992401123, + "learning_rate": 1.3900000000000002e-05, + "loss": 0.1222, + "step": 140 + }, + { + "epoch": 0.18007202881152462, + "grad_norm": 0.6424652338027954, + "learning_rate": 1.49e-05, + "loss": 0.1275, + "step": 150 + }, + { + "epoch": 0.19207683073229292, + "grad_norm": 0.6333882808685303, + "learning_rate": 1.59e-05, + "loss": 0.1284, + "step": 160 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 0.7941071391105652, + "learning_rate": 1.69e-05, + "loss": 0.1138, + "step": 170 + }, + { + "epoch": 0.21608643457382953, + "grad_norm": 1.0874606370925903, + "learning_rate": 1.79e-05, + "loss": 0.1105, + "step": 180 + }, + { + "epoch": 0.22809123649459784, + "grad_norm": 0.3523195683956146, + "learning_rate": 1.8900000000000002e-05, + "loss": 0.1102, + "step": 190 + }, + { + "epoch": 0.24009603841536614, + "grad_norm": 0.6137881278991699, + "learning_rate": 1.9900000000000003e-05, + "loss": 0.114, + "step": 200 + }, + { + "epoch": 0.25210084033613445, + "grad_norm": 0.5785936117172241, + "learning_rate": 2.09e-05, + "loss": 0.1145, + "step": 210 + }, + { + "epoch": 0.26410564225690275, + "grad_norm": 0.49162647128105164, + "learning_rate": 2.19e-05, + "loss": 0.117, + "step": 220 + }, + { + "epoch": 0.27611044417767105, + "grad_norm": 0.49432259798049927, + "learning_rate": 2.29e-05, + "loss": 0.1087, + "step": 230 + }, + { + "epoch": 0.28811524609843936, + "grad_norm": 0.7169405221939087, + "learning_rate": 2.39e-05, + "loss": 0.1123, + "step": 240 + }, + { + "epoch": 0.30012004801920766, + "grad_norm": 0.6936995387077332, + "learning_rate": 2.4900000000000002e-05, + "loss": 0.0983, + "step": 250 + }, + { + "epoch": 0.31212484993997597, + "grad_norm": 0.5635675191879272, + "learning_rate": 2.5900000000000003e-05, + "loss": 0.1007, + "step": 260 + }, + { + "epoch": 0.3241296518607443, + "grad_norm": 0.7484883666038513, + "learning_rate": 2.6900000000000003e-05, + "loss": 0.1075, + "step": 270 + }, + { + "epoch": 0.33613445378151263, + "grad_norm": 0.8241913318634033, + "learning_rate": 2.7900000000000004e-05, + "loss": 0.1054, + "step": 280 + }, + { + "epoch": 0.34813925570228094, + "grad_norm": 0.547121524810791, + "learning_rate": 2.8899999999999998e-05, + "loss": 0.0964, + "step": 290 + }, + { + "epoch": 0.36014405762304924, + "grad_norm": 0.656216025352478, + "learning_rate": 2.9900000000000002e-05, + "loss": 0.1065, + "step": 300 + }, + { + "epoch": 0.37214885954381755, + "grad_norm": 0.8635461926460266, + "learning_rate": 3.09e-05, + "loss": 0.1023, + "step": 310 + }, + { + "epoch": 0.38415366146458585, + "grad_norm": 0.5938664078712463, + "learning_rate": 3.19e-05, + "loss": 0.1004, + "step": 320 + }, + { + "epoch": 0.39615846338535415, + "grad_norm": 0.43641048669815063, + "learning_rate": 3.29e-05, + "loss": 0.0985, + "step": 330 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 0.4528971016407013, + "learning_rate": 3.3900000000000004e-05, + "loss": 0.1056, + "step": 340 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 0.4909480810165405, + "learning_rate": 3.49e-05, + "loss": 0.0929, + "step": 350 + }, + { + "epoch": 0.43217286914765907, + "grad_norm": 0.7683784365653992, + "learning_rate": 3.59e-05, + "loss": 0.1031, + "step": 360 + }, + { + "epoch": 0.44417767106842737, + "grad_norm": 0.42116791009902954, + "learning_rate": 3.69e-05, + "loss": 0.0983, + "step": 370 + }, + { + "epoch": 0.4561824729891957, + "grad_norm": 0.5685861706733704, + "learning_rate": 3.79e-05, + "loss": 0.0973, + "step": 380 + }, + { + "epoch": 0.468187274909964, + "grad_norm": 0.5281659364700317, + "learning_rate": 3.8900000000000004e-05, + "loss": 0.0985, + "step": 390 + }, + { + "epoch": 0.4801920768307323, + "grad_norm": 0.5780090093612671, + "learning_rate": 3.99e-05, + "loss": 0.0914, + "step": 400 + }, + { + "epoch": 0.4921968787515006, + "grad_norm": 0.3591746389865875, + "learning_rate": 4.09e-05, + "loss": 0.0964, + "step": 410 + }, + { + "epoch": 0.5042016806722689, + "grad_norm": 0.42190349102020264, + "learning_rate": 4.19e-05, + "loss": 0.0942, + "step": 420 + }, + { + "epoch": 0.5162064825930373, + "grad_norm": 0.4464319944381714, + "learning_rate": 4.29e-05, + "loss": 0.0948, + "step": 430 + }, + { + "epoch": 0.5282112845138055, + "grad_norm": 0.6127314567565918, + "learning_rate": 4.39e-05, + "loss": 0.0878, + "step": 440 + }, + { + "epoch": 0.5402160864345739, + "grad_norm": 0.36070922017097473, + "learning_rate": 4.49e-05, + "loss": 0.0919, + "step": 450 + }, + { + "epoch": 0.5522208883553421, + "grad_norm": 0.6483412981033325, + "learning_rate": 4.5900000000000004e-05, + "loss": 0.0921, + "step": 460 + }, + { + "epoch": 0.5642256902761105, + "grad_norm": 0.41169866919517517, + "learning_rate": 4.69e-05, + "loss": 0.0872, + "step": 470 + }, + { + "epoch": 0.5762304921968787, + "grad_norm": 0.5746738910675049, + "learning_rate": 4.79e-05, + "loss": 0.0917, + "step": 480 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.5669548511505127, + "learning_rate": 4.89e-05, + "loss": 0.0924, + "step": 490 + }, + { + "epoch": 0.6002400960384153, + "grad_norm": 0.4199235737323761, + "learning_rate": 4.99e-05, + "loss": 0.0856, + "step": 500 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.3564973473548889, + "learning_rate": 5.0900000000000004e-05, + "loss": 0.0822, + "step": 510 + }, + { + "epoch": 0.6242496998799519, + "grad_norm": 0.4544888138771057, + "learning_rate": 5.19e-05, + "loss": 0.0914, + "step": 520 + }, + { + "epoch": 0.6362545018007203, + "grad_norm": 0.430405855178833, + "learning_rate": 5.2900000000000005e-05, + "loss": 0.0942, + "step": 530 + }, + { + "epoch": 0.6482593037214885, + "grad_norm": 0.39812228083610535, + "learning_rate": 5.390000000000001e-05, + "loss": 0.0909, + "step": 540 + }, + { + "epoch": 0.6602641056422569, + "grad_norm": 0.47429418563842773, + "learning_rate": 5.4900000000000006e-05, + "loss": 0.0869, + "step": 550 + }, + { + "epoch": 0.6722689075630253, + "grad_norm": 0.5669972896575928, + "learning_rate": 5.590000000000001e-05, + "loss": 0.0897, + "step": 560 + }, + { + "epoch": 0.6842737094837935, + "grad_norm": 0.5598543882369995, + "learning_rate": 5.69e-05, + "loss": 0.0909, + "step": 570 + }, + { + "epoch": 0.6962785114045619, + "grad_norm": 0.5149314403533936, + "learning_rate": 5.79e-05, + "loss": 0.0875, + "step": 580 + }, + { + "epoch": 0.7082833133253301, + "grad_norm": 0.5105926394462585, + "learning_rate": 5.89e-05, + "loss": 0.0878, + "step": 590 + }, + { + "epoch": 0.7202881152460985, + "grad_norm": 0.5025826096534729, + "learning_rate": 5.99e-05, + "loss": 0.0897, + "step": 600 + }, + { + "epoch": 0.7322929171668667, + "grad_norm": 0.41076698899269104, + "learning_rate": 6.09e-05, + "loss": 0.0805, + "step": 610 + }, + { + "epoch": 0.7442977190876351, + "grad_norm": 0.5647391080856323, + "learning_rate": 6.19e-05, + "loss": 0.0894, + "step": 620 + }, + { + "epoch": 0.7563025210084033, + "grad_norm": 0.3845747411251068, + "learning_rate": 6.29e-05, + "loss": 0.0852, + "step": 630 + }, + { + "epoch": 0.7683073229291717, + "grad_norm": 0.280582457780838, + "learning_rate": 6.390000000000001e-05, + "loss": 0.0801, + "step": 640 + }, + { + "epoch": 0.78031212484994, + "grad_norm": 0.5631809830665588, + "learning_rate": 6.49e-05, + "loss": 0.0908, + "step": 650 + }, + { + "epoch": 0.7923169267707083, + "grad_norm": 0.6072647571563721, + "learning_rate": 6.59e-05, + "loss": 0.0907, + "step": 660 + }, + { + "epoch": 0.8043217286914766, + "grad_norm": 0.37164241075515747, + "learning_rate": 6.690000000000001e-05, + "loss": 0.0875, + "step": 670 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.4169856905937195, + "learning_rate": 6.790000000000001e-05, + "loss": 0.0856, + "step": 680 + }, + { + "epoch": 0.8283313325330132, + "grad_norm": 0.5900633335113525, + "learning_rate": 6.89e-05, + "loss": 0.075, + "step": 690 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 0.5202320218086243, + "learning_rate": 6.99e-05, + "loss": 0.0823, + "step": 700 + }, + { + "epoch": 0.8523409363745498, + "grad_norm": 0.7741528153419495, + "learning_rate": 7.09e-05, + "loss": 0.093, + "step": 710 + }, + { + "epoch": 0.8643457382953181, + "grad_norm": 0.4348047971725464, + "learning_rate": 7.19e-05, + "loss": 0.0862, + "step": 720 + }, + { + "epoch": 0.8763505402160864, + "grad_norm": 0.25665780901908875, + "learning_rate": 7.29e-05, + "loss": 0.0859, + "step": 730 + }, + { + "epoch": 0.8883553421368547, + "grad_norm": 0.38728955388069153, + "learning_rate": 7.390000000000001e-05, + "loss": 0.0903, + "step": 740 + }, + { + "epoch": 0.9003601440576231, + "grad_norm": 0.30359673500061035, + "learning_rate": 7.49e-05, + "loss": 0.086, + "step": 750 + }, + { + "epoch": 0.9123649459783914, + "grad_norm": 0.5325232148170471, + "learning_rate": 7.59e-05, + "loss": 0.082, + "step": 760 + }, + { + "epoch": 0.9243697478991597, + "grad_norm": 0.3985685408115387, + "learning_rate": 7.69e-05, + "loss": 0.0871, + "step": 770 + }, + { + "epoch": 0.936374549819928, + "grad_norm": 0.3082857131958008, + "learning_rate": 7.790000000000001e-05, + "loss": 0.08, + "step": 780 + }, + { + "epoch": 0.9483793517406963, + "grad_norm": 0.4456673562526703, + "learning_rate": 7.890000000000001e-05, + "loss": 0.0886, + "step": 790 + }, + { + "epoch": 0.9603841536614646, + "grad_norm": 0.47051942348480225, + "learning_rate": 7.99e-05, + "loss": 0.0898, + "step": 800 + }, + { + "epoch": 0.9723889555822329, + "grad_norm": 0.4116154909133911, + "learning_rate": 8.090000000000001e-05, + "loss": 0.08, + "step": 810 + }, + { + "epoch": 0.9843937575030012, + "grad_norm": 0.45954954624176025, + "learning_rate": 8.19e-05, + "loss": 0.0811, + "step": 820 + }, + { + "epoch": 0.9963985594237695, + "grad_norm": 0.3990706503391266, + "learning_rate": 8.29e-05, + "loss": 0.08, + "step": 830 + }, + { + "epoch": 1.0084033613445378, + "grad_norm": 0.5437315106391907, + "learning_rate": 8.39e-05, + "loss": 0.093, + "step": 840 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 0.4281720221042633, + "learning_rate": 8.49e-05, + "loss": 0.0845, + "step": 850 + }, + { + "epoch": 1.0324129651860745, + "grad_norm": 0.45137566328048706, + "learning_rate": 8.59e-05, + "loss": 0.0902, + "step": 860 + }, + { + "epoch": 1.0444177671068426, + "grad_norm": 0.4543575644493103, + "learning_rate": 8.69e-05, + "loss": 0.0862, + "step": 870 + }, + { + "epoch": 1.056422569027611, + "grad_norm": 0.31135764718055725, + "learning_rate": 8.790000000000001e-05, + "loss": 0.086, + "step": 880 + }, + { + "epoch": 1.0684273709483794, + "grad_norm": 0.3942442238330841, + "learning_rate": 8.89e-05, + "loss": 0.0836, + "step": 890 + }, + { + "epoch": 1.0804321728691477, + "grad_norm": 0.35265403985977173, + "learning_rate": 8.99e-05, + "loss": 0.0833, + "step": 900 + }, + { + "epoch": 1.092436974789916, + "grad_norm": 0.36036571860313416, + "learning_rate": 9.090000000000001e-05, + "loss": 0.0776, + "step": 910 + }, + { + "epoch": 1.1044417767106842, + "grad_norm": 0.29071784019470215, + "learning_rate": 9.190000000000001e-05, + "loss": 0.0825, + "step": 920 + }, + { + "epoch": 1.1164465786314526, + "grad_norm": 0.36835888028144836, + "learning_rate": 9.290000000000001e-05, + "loss": 0.0861, + "step": 930 + }, + { + "epoch": 1.128451380552221, + "grad_norm": 0.449015349149704, + "learning_rate": 9.39e-05, + "loss": 0.0828, + "step": 940 + }, + { + "epoch": 1.140456182472989, + "grad_norm": 0.6073689460754395, + "learning_rate": 9.49e-05, + "loss": 0.0807, + "step": 950 + }, + { + "epoch": 1.1524609843937574, + "grad_norm": 0.6466233134269714, + "learning_rate": 9.59e-05, + "loss": 0.0815, + "step": 960 + }, + { + "epoch": 1.1644657863145258, + "grad_norm": 0.33926403522491455, + "learning_rate": 9.69e-05, + "loss": 0.0784, + "step": 970 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 0.2702747583389282, + "learning_rate": 9.790000000000001e-05, + "loss": 0.0825, + "step": 980 + }, + { + "epoch": 1.1884753901560625, + "grad_norm": 0.4769522547721863, + "learning_rate": 9.89e-05, + "loss": 0.0926, + "step": 990 + }, + { + "epoch": 1.2004801920768307, + "grad_norm": 0.37013694643974304, + "learning_rate": 9.99e-05, + "loss": 0.0799, + "step": 1000 + }, + { + "epoch": 1.212484993997599, + "grad_norm": 0.4509250521659851, + "learning_rate": 9.999994463727085e-05, + "loss": 0.0782, + "step": 1010 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 0.4381175935268402, + "learning_rate": 9.999975326009292e-05, + "loss": 0.0881, + "step": 1020 + }, + { + "epoch": 1.2364945978391357, + "grad_norm": 0.4765148460865021, + "learning_rate": 9.999942518549879e-05, + "loss": 0.0795, + "step": 1030 + }, + { + "epoch": 1.2484993997599039, + "grad_norm": 0.35724106431007385, + "learning_rate": 9.999896041438544e-05, + "loss": 0.0802, + "step": 1040 + }, + { + "epoch": 1.2605042016806722, + "grad_norm": 0.3528522551059723, + "learning_rate": 9.999835894802353e-05, + "loss": 0.079, + "step": 1050 + }, + { + "epoch": 1.2725090036014406, + "grad_norm": 0.3644739091396332, + "learning_rate": 9.999762078805743e-05, + "loss": 0.0801, + "step": 1060 + }, + { + "epoch": 1.284513805522209, + "grad_norm": 0.4036848247051239, + "learning_rate": 9.999674593650526e-05, + "loss": 0.0808, + "step": 1070 + }, + { + "epoch": 1.296518607442977, + "grad_norm": 0.324588805437088, + "learning_rate": 9.99957343957588e-05, + "loss": 0.0797, + "step": 1080 + }, + { + "epoch": 1.3085234093637454, + "grad_norm": 0.31259092688560486, + "learning_rate": 9.99945861685836e-05, + "loss": 0.0707, + "step": 1090 + }, + { + "epoch": 1.3205282112845138, + "grad_norm": 0.3430352509021759, + "learning_rate": 9.999330125811884e-05, + "loss": 0.079, + "step": 1100 + }, + { + "epoch": 1.3325330132052822, + "grad_norm": 0.31048154830932617, + "learning_rate": 9.999187966787744e-05, + "loss": 0.0751, + "step": 1110 + }, + { + "epoch": 1.3445378151260505, + "grad_norm": 0.3461169898509979, + "learning_rate": 9.999032140174595e-05, + "loss": 0.074, + "step": 1120 + }, + { + "epoch": 1.3565426170468187, + "grad_norm": 0.40333327651023865, + "learning_rate": 9.998862646398464e-05, + "loss": 0.0829, + "step": 1130 + }, + { + "epoch": 1.368547418967587, + "grad_norm": 0.43178826570510864, + "learning_rate": 9.998679485922739e-05, + "loss": 0.0807, + "step": 1140 + }, + { + "epoch": 1.3805522208883554, + "grad_norm": 0.3486270606517792, + "learning_rate": 9.998482659248174e-05, + "loss": 0.0743, + "step": 1150 + }, + { + "epoch": 1.3925570228091235, + "grad_norm": 0.5704210996627808, + "learning_rate": 9.998272166912883e-05, + "loss": 0.0706, + "step": 1160 + }, + { + "epoch": 1.4045618247298919, + "grad_norm": 0.2576042413711548, + "learning_rate": 9.998048009492347e-05, + "loss": 0.0796, + "step": 1170 + }, + { + "epoch": 1.4165666266506602, + "grad_norm": 0.32346630096435547, + "learning_rate": 9.997810187599403e-05, + "loss": 0.0765, + "step": 1180 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.44201409816741943, + "learning_rate": 9.997558701884249e-05, + "loss": 0.0764, + "step": 1190 + }, + { + "epoch": 1.440576230492197, + "grad_norm": 0.3704690635204315, + "learning_rate": 9.997293553034433e-05, + "loss": 0.0793, + "step": 1200 + }, + { + "epoch": 1.452581032412965, + "grad_norm": 0.25933682918548584, + "learning_rate": 9.997014741774866e-05, + "loss": 0.0747, + "step": 1210 + }, + { + "epoch": 1.4645858343337335, + "grad_norm": 0.3361830711364746, + "learning_rate": 9.996722268867803e-05, + "loss": 0.0707, + "step": 1220 + }, + { + "epoch": 1.4765906362545018, + "grad_norm": 0.4436805248260498, + "learning_rate": 9.996416135112858e-05, + "loss": 0.0693, + "step": 1230 + }, + { + "epoch": 1.4885954381752702, + "grad_norm": 0.37797558307647705, + "learning_rate": 9.996096341346988e-05, + "loss": 0.0768, + "step": 1240 + }, + { + "epoch": 1.5006002400960385, + "grad_norm": 0.44067806005477905, + "learning_rate": 9.995762888444495e-05, + "loss": 0.0767, + "step": 1250 + }, + { + "epoch": 1.5126050420168067, + "grad_norm": 0.30578285455703735, + "learning_rate": 9.995415777317027e-05, + "loss": 0.0745, + "step": 1260 + }, + { + "epoch": 1.524609843937575, + "grad_norm": 0.2689450979232788, + "learning_rate": 9.995055008913574e-05, + "loss": 0.0754, + "step": 1270 + }, + { + "epoch": 1.5366146458583434, + "grad_norm": 0.4041211009025574, + "learning_rate": 9.994680584220463e-05, + "loss": 0.0814, + "step": 1280 + }, + { + "epoch": 1.5486194477791115, + "grad_norm": 0.309457391500473, + "learning_rate": 9.994292504261355e-05, + "loss": 0.0774, + "step": 1290 + }, + { + "epoch": 1.5606242496998801, + "grad_norm": 0.4084872305393219, + "learning_rate": 9.993890770097247e-05, + "loss": 0.0799, + "step": 1300 + }, + { + "epoch": 1.5726290516206483, + "grad_norm": 0.3941647410392761, + "learning_rate": 9.993475382826467e-05, + "loss": 0.0732, + "step": 1310 + }, + { + "epoch": 1.5846338535414166, + "grad_norm": 0.3167079985141754, + "learning_rate": 9.993046343584664e-05, + "loss": 0.0741, + "step": 1320 + }, + { + "epoch": 1.596638655462185, + "grad_norm": 0.21385376155376434, + "learning_rate": 9.992603653544816e-05, + "loss": 0.0755, + "step": 1330 + }, + { + "epoch": 1.6086434573829531, + "grad_norm": 0.38671302795410156, + "learning_rate": 9.992147313917222e-05, + "loss": 0.072, + "step": 1340 + }, + { + "epoch": 1.6206482593037215, + "grad_norm": 0.33764365315437317, + "learning_rate": 9.991677325949497e-05, + "loss": 0.0744, + "step": 1350 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 0.2641427218914032, + "learning_rate": 9.991193690926568e-05, + "loss": 0.0773, + "step": 1360 + }, + { + "epoch": 1.644657863145258, + "grad_norm": 0.4288504421710968, + "learning_rate": 9.990696410170678e-05, + "loss": 0.0749, + "step": 1370 + }, + { + "epoch": 1.6566626650660266, + "grad_norm": 0.3868384063243866, + "learning_rate": 9.990185485041371e-05, + "loss": 0.0722, + "step": 1380 + }, + { + "epoch": 1.6686674669867947, + "grad_norm": 0.4435616731643677, + "learning_rate": 9.989660916935498e-05, + "loss": 0.0754, + "step": 1390 + }, + { + "epoch": 1.680672268907563, + "grad_norm": 0.40516623854637146, + "learning_rate": 9.989122707287208e-05, + "loss": 0.0749, + "step": 1400 + }, + { + "epoch": 1.6926770708283314, + "grad_norm": 0.3815052807331085, + "learning_rate": 9.988570857567945e-05, + "loss": 0.076, + "step": 1410 + }, + { + "epoch": 1.7046818727490995, + "grad_norm": 0.33355700969696045, + "learning_rate": 9.988005369286446e-05, + "loss": 0.0739, + "step": 1420 + }, + { + "epoch": 1.716686674669868, + "grad_norm": 0.4458111822605133, + "learning_rate": 9.987426243988734e-05, + "loss": 0.078, + "step": 1430 + }, + { + "epoch": 1.7286914765906363, + "grad_norm": 0.2785310447216034, + "learning_rate": 9.986833483258114e-05, + "loss": 0.0711, + "step": 1440 + }, + { + "epoch": 1.7406962785114044, + "grad_norm": 0.5463695526123047, + "learning_rate": 9.986227088715173e-05, + "loss": 0.0746, + "step": 1450 + }, + { + "epoch": 1.752701080432173, + "grad_norm": 0.30091533064842224, + "learning_rate": 9.98560706201777e-05, + "loss": 0.0797, + "step": 1460 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 0.41468554735183716, + "learning_rate": 9.984973404861036e-05, + "loss": 0.0744, + "step": 1470 + }, + { + "epoch": 1.7767106842737095, + "grad_norm": 0.28619542717933655, + "learning_rate": 9.984326118977361e-05, + "loss": 0.074, + "step": 1480 + }, + { + "epoch": 1.7887154861944778, + "grad_norm": 0.39195579290390015, + "learning_rate": 9.983665206136406e-05, + "loss": 0.0708, + "step": 1490 + }, + { + "epoch": 1.800720288115246, + "grad_norm": 0.4414312243461609, + "learning_rate": 9.982990668145075e-05, + "loss": 0.0719, + "step": 1500 + }, + { + "epoch": 1.8127250900360146, + "grad_norm": 0.27509427070617676, + "learning_rate": 9.982302506847534e-05, + "loss": 0.0689, + "step": 1510 + }, + { + "epoch": 1.8247298919567827, + "grad_norm": 0.33928677439689636, + "learning_rate": 9.981600724125189e-05, + "loss": 0.0675, + "step": 1520 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 0.26118186116218567, + "learning_rate": 9.980885321896685e-05, + "loss": 0.0732, + "step": 1530 + }, + { + "epoch": 1.8487394957983194, + "grad_norm": 0.21080715954303741, + "learning_rate": 9.980156302117905e-05, + "loss": 0.0699, + "step": 1540 + }, + { + "epoch": 1.8607442977190876, + "grad_norm": 0.3886878490447998, + "learning_rate": 9.979413666781963e-05, + "loss": 0.065, + "step": 1550 + }, + { + "epoch": 1.872749099639856, + "grad_norm": 0.3688771724700928, + "learning_rate": 9.978657417919193e-05, + "loss": 0.0751, + "step": 1560 + }, + { + "epoch": 1.8847539015606243, + "grad_norm": 0.26068076491355896, + "learning_rate": 9.977887557597153e-05, + "loss": 0.072, + "step": 1570 + }, + { + "epoch": 1.8967587034813924, + "grad_norm": 0.2787654399871826, + "learning_rate": 9.97710408792061e-05, + "loss": 0.0714, + "step": 1580 + }, + { + "epoch": 1.908763505402161, + "grad_norm": 0.33321550488471985, + "learning_rate": 9.976307011031542e-05, + "loss": 0.0747, + "step": 1590 + }, + { + "epoch": 1.9207683073229291, + "grad_norm": 0.30425941944122314, + "learning_rate": 9.975496329109126e-05, + "loss": 0.0732, + "step": 1600 + }, + { + "epoch": 1.9327731092436975, + "grad_norm": 0.27393001317977905, + "learning_rate": 9.974672044369732e-05, + "loss": 0.0701, + "step": 1610 + }, + { + "epoch": 1.9447779111644659, + "grad_norm": 0.313311368227005, + "learning_rate": 9.97383415906693e-05, + "loss": 0.071, + "step": 1620 + }, + { + "epoch": 1.956782713085234, + "grad_norm": 0.33065319061279297, + "learning_rate": 9.97298267549146e-05, + "loss": 0.0734, + "step": 1630 + }, + { + "epoch": 1.9687875150060024, + "grad_norm": 0.2630426585674286, + "learning_rate": 9.972117595971249e-05, + "loss": 0.0736, + "step": 1640 + }, + { + "epoch": 1.9807923169267707, + "grad_norm": 0.343834787607193, + "learning_rate": 9.971238922871391e-05, + "loss": 0.0688, + "step": 1650 + }, + { + "epoch": 1.9927971188475389, + "grad_norm": 0.34438538551330566, + "learning_rate": 9.970346658594142e-05, + "loss": 0.0695, + "step": 1660 + }, + { + "epoch": 2.0048019207683074, + "grad_norm": 0.30866822600364685, + "learning_rate": 9.969440805578923e-05, + "loss": 0.0693, + "step": 1670 + }, + { + "epoch": 2.0168067226890756, + "grad_norm": 0.241564080119133, + "learning_rate": 9.968521366302298e-05, + "loss": 0.0691, + "step": 1680 + }, + { + "epoch": 2.028811524609844, + "grad_norm": 0.3528682291507721, + "learning_rate": 9.967588343277981e-05, + "loss": 0.0753, + "step": 1690 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 0.30729350447654724, + "learning_rate": 9.966641739056818e-05, + "loss": 0.0756, + "step": 1700 + }, + { + "epoch": 2.0528211284513804, + "grad_norm": 0.23132391273975372, + "learning_rate": 9.965681556226793e-05, + "loss": 0.0713, + "step": 1710 + }, + { + "epoch": 2.064825930372149, + "grad_norm": 0.31978166103363037, + "learning_rate": 9.964707797413006e-05, + "loss": 0.0635, + "step": 1720 + }, + { + "epoch": 2.076830732292917, + "grad_norm": 0.47489672899246216, + "learning_rate": 9.963720465277679e-05, + "loss": 0.0727, + "step": 1730 + }, + { + "epoch": 2.0888355342136853, + "grad_norm": 0.3407437801361084, + "learning_rate": 9.96271956252014e-05, + "loss": 0.0672, + "step": 1740 + }, + { + "epoch": 2.100840336134454, + "grad_norm": 0.30153170228004456, + "learning_rate": 9.961705091876816e-05, + "loss": 0.0723, + "step": 1750 + }, + { + "epoch": 2.112845138055222, + "grad_norm": 0.2770678699016571, + "learning_rate": 9.960677056121235e-05, + "loss": 0.0664, + "step": 1760 + }, + { + "epoch": 2.1248499399759906, + "grad_norm": 0.4289170801639557, + "learning_rate": 9.959635458064005e-05, + "loss": 0.067, + "step": 1770 + }, + { + "epoch": 2.1368547418967587, + "grad_norm": 0.28723636269569397, + "learning_rate": 9.958580300552815e-05, + "loss": 0.0726, + "step": 1780 + }, + { + "epoch": 2.148859543817527, + "grad_norm": 0.19174404442310333, + "learning_rate": 9.957511586472426e-05, + "loss": 0.0636, + "step": 1790 + }, + { + "epoch": 2.1608643457382954, + "grad_norm": 0.3858477771282196, + "learning_rate": 9.956429318744662e-05, + "loss": 0.0639, + "step": 1800 + }, + { + "epoch": 2.1728691476590636, + "grad_norm": 0.3581012785434723, + "learning_rate": 9.955333500328404e-05, + "loss": 0.0706, + "step": 1810 + }, + { + "epoch": 2.184873949579832, + "grad_norm": 0.28819218277931213, + "learning_rate": 9.95422413421957e-05, + "loss": 0.0729, + "step": 1820 + }, + { + "epoch": 2.1968787515006003, + "grad_norm": 0.19548888504505157, + "learning_rate": 9.953101223451133e-05, + "loss": 0.0706, + "step": 1830 + }, + { + "epoch": 2.2088835534213684, + "grad_norm": 0.33176106214523315, + "learning_rate": 9.951964771093085e-05, + "loss": 0.0731, + "step": 1840 + }, + { + "epoch": 2.220888355342137, + "grad_norm": 0.2821992337703705, + "learning_rate": 9.950814780252442e-05, + "loss": 0.0731, + "step": 1850 + }, + { + "epoch": 2.232893157262905, + "grad_norm": 0.20839476585388184, + "learning_rate": 9.949651254073236e-05, + "loss": 0.0702, + "step": 1860 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.48061680793762207, + "learning_rate": 9.948474195736504e-05, + "loss": 0.0667, + "step": 1870 + }, + { + "epoch": 2.256902761104442, + "grad_norm": 0.2649798095226288, + "learning_rate": 9.947283608460277e-05, + "loss": 0.0671, + "step": 1880 + }, + { + "epoch": 2.26890756302521, + "grad_norm": 0.404539555311203, + "learning_rate": 9.946079495499577e-05, + "loss": 0.068, + "step": 1890 + }, + { + "epoch": 2.280912364945978, + "grad_norm": 0.2674334943294525, + "learning_rate": 9.944861860146401e-05, + "loss": 0.0624, + "step": 1900 + }, + { + "epoch": 2.2929171668667467, + "grad_norm": 0.31849735975265503, + "learning_rate": 9.943630705729719e-05, + "loss": 0.0683, + "step": 1910 + }, + { + "epoch": 2.304921968787515, + "grad_norm": 0.2920159697532654, + "learning_rate": 9.942386035615459e-05, + "loss": 0.0674, + "step": 1920 + }, + { + "epoch": 2.3169267707082835, + "grad_norm": 0.2671286463737488, + "learning_rate": 9.941127853206503e-05, + "loss": 0.0695, + "step": 1930 + }, + { + "epoch": 2.3289315726290516, + "grad_norm": 0.26992321014404297, + "learning_rate": 9.939856161942673e-05, + "loss": 0.0741, + "step": 1940 + }, + { + "epoch": 2.3409363745498197, + "grad_norm": 0.23803777992725372, + "learning_rate": 9.938570965300724e-05, + "loss": 0.065, + "step": 1950 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.23451553285121918, + "learning_rate": 9.937272266794335e-05, + "loss": 0.0698, + "step": 1960 + }, + { + "epoch": 2.3649459783913565, + "grad_norm": 0.2439742535352707, + "learning_rate": 9.935960069974096e-05, + "loss": 0.0689, + "step": 1970 + }, + { + "epoch": 2.376950780312125, + "grad_norm": 0.2882639169692993, + "learning_rate": 9.934634378427506e-05, + "loss": 0.0678, + "step": 1980 + }, + { + "epoch": 2.388955582232893, + "grad_norm": 0.3457421362400055, + "learning_rate": 9.933295195778954e-05, + "loss": 0.0695, + "step": 1990 + }, + { + "epoch": 2.4009603841536613, + "grad_norm": 0.28860682249069214, + "learning_rate": 9.931942525689715e-05, + "loss": 0.067, + "step": 2000 + }, + { + "epoch": 2.41296518607443, + "grad_norm": 0.2902769446372986, + "learning_rate": 9.930576371857936e-05, + "loss": 0.0672, + "step": 2010 + }, + { + "epoch": 2.424969987995198, + "grad_norm": 0.32137641310691833, + "learning_rate": 9.929196738018629e-05, + "loss": 0.0694, + "step": 2020 + }, + { + "epoch": 2.4369747899159666, + "grad_norm": 0.3820798695087433, + "learning_rate": 9.927803627943662e-05, + "loss": 0.0728, + "step": 2030 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.33662426471710205, + "learning_rate": 9.926397045441744e-05, + "loss": 0.0697, + "step": 2040 + }, + { + "epoch": 2.460984393757503, + "grad_norm": 0.43793460726737976, + "learning_rate": 9.924976994358417e-05, + "loss": 0.0643, + "step": 2050 + }, + { + "epoch": 2.4729891956782715, + "grad_norm": 0.2540702819824219, + "learning_rate": 9.923543478576048e-05, + "loss": 0.0692, + "step": 2060 + }, + { + "epoch": 2.4849939975990396, + "grad_norm": 0.18879561126232147, + "learning_rate": 9.922096502013813e-05, + "loss": 0.0652, + "step": 2070 + }, + { + "epoch": 2.4969987995198077, + "grad_norm": 0.27408313751220703, + "learning_rate": 9.92063606862769e-05, + "loss": 0.0628, + "step": 2080 + }, + { + "epoch": 2.5090036014405763, + "grad_norm": 0.4017021954059601, + "learning_rate": 9.919162182410453e-05, + "loss": 0.0699, + "step": 2090 + }, + { + "epoch": 2.5210084033613445, + "grad_norm": 0.40660732984542847, + "learning_rate": 9.917674847391645e-05, + "loss": 0.0697, + "step": 2100 + }, + { + "epoch": 2.5330132052821126, + "grad_norm": 0.5660555362701416, + "learning_rate": 9.916174067637584e-05, + "loss": 0.0692, + "step": 2110 + }, + { + "epoch": 2.545018007202881, + "grad_norm": 0.3002040982246399, + "learning_rate": 9.914659847251348e-05, + "loss": 0.0646, + "step": 2120 + }, + { + "epoch": 2.5570228091236493, + "grad_norm": 0.48013466596603394, + "learning_rate": 9.913132190372753e-05, + "loss": 0.0749, + "step": 2130 + }, + { + "epoch": 2.569027611044418, + "grad_norm": 0.2294633984565735, + "learning_rate": 9.911591101178359e-05, + "loss": 0.0717, + "step": 2140 + }, + { + "epoch": 2.581032412965186, + "grad_norm": 0.5493084788322449, + "learning_rate": 9.910036583881443e-05, + "loss": 0.0668, + "step": 2150 + }, + { + "epoch": 2.593037214885954, + "grad_norm": 0.23245786130428314, + "learning_rate": 9.908468642731995e-05, + "loss": 0.0616, + "step": 2160 + }, + { + "epoch": 2.6050420168067228, + "grad_norm": 0.2847057580947876, + "learning_rate": 9.906887282016707e-05, + "loss": 0.0656, + "step": 2170 + }, + { + "epoch": 2.617046818727491, + "grad_norm": 0.23932665586471558, + "learning_rate": 9.90529250605896e-05, + "loss": 0.0622, + "step": 2180 + }, + { + "epoch": 2.6290516206482595, + "grad_norm": 0.29925891757011414, + "learning_rate": 9.903684319218809e-05, + "loss": 0.0654, + "step": 2190 + }, + { + "epoch": 2.6410564225690276, + "grad_norm": 0.2648245692253113, + "learning_rate": 9.902062725892976e-05, + "loss": 0.0634, + "step": 2200 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.21922999620437622, + "learning_rate": 9.900427730514834e-05, + "loss": 0.0672, + "step": 2210 + }, + { + "epoch": 2.6650660264105643, + "grad_norm": 0.32792994379997253, + "learning_rate": 9.8987793375544e-05, + "loss": 0.0682, + "step": 2220 + }, + { + "epoch": 2.6770708283313325, + "grad_norm": 0.35955172777175903, + "learning_rate": 9.897117551518318e-05, + "loss": 0.0681, + "step": 2230 + }, + { + "epoch": 2.689075630252101, + "grad_norm": 0.47231119871139526, + "learning_rate": 9.895442376949844e-05, + "loss": 0.065, + "step": 2240 + }, + { + "epoch": 2.701080432172869, + "grad_norm": 0.3056893050670624, + "learning_rate": 9.893753818428845e-05, + "loss": 0.0653, + "step": 2250 + }, + { + "epoch": 2.7130852340936373, + "grad_norm": 0.3053362965583801, + "learning_rate": 9.892051880571773e-05, + "loss": 0.0724, + "step": 2260 + }, + { + "epoch": 2.725090036014406, + "grad_norm": 0.26312705874443054, + "learning_rate": 9.890336568031663e-05, + "loss": 0.0671, + "step": 2270 + }, + { + "epoch": 2.737094837935174, + "grad_norm": 0.23441407084465027, + "learning_rate": 9.888607885498113e-05, + "loss": 0.0616, + "step": 2280 + }, + { + "epoch": 2.7490996398559426, + "grad_norm": 0.24327152967453003, + "learning_rate": 9.886865837697275e-05, + "loss": 0.064, + "step": 2290 + }, + { + "epoch": 2.7611044417767108, + "grad_norm": 0.18542835116386414, + "learning_rate": 9.88511042939184e-05, + "loss": 0.0612, + "step": 2300 + }, + { + "epoch": 2.773109243697479, + "grad_norm": 0.22576577961444855, + "learning_rate": 9.883341665381028e-05, + "loss": 0.0639, + "step": 2310 + }, + { + "epoch": 2.785114045618247, + "grad_norm": 0.22635282576084137, + "learning_rate": 9.881559550500575e-05, + "loss": 0.0605, + "step": 2320 + }, + { + "epoch": 2.7971188475390156, + "grad_norm": 0.286115825176239, + "learning_rate": 9.879764089622712e-05, + "loss": 0.0708, + "step": 2330 + }, + { + "epoch": 2.8091236494597838, + "grad_norm": 0.22386610507965088, + "learning_rate": 9.87795528765616e-05, + "loss": 0.0617, + "step": 2340 + }, + { + "epoch": 2.8211284513805523, + "grad_norm": 0.27594274282455444, + "learning_rate": 9.876133149546118e-05, + "loss": 0.0661, + "step": 2350 + }, + { + "epoch": 2.8331332533013205, + "grad_norm": 0.2609950304031372, + "learning_rate": 9.874297680274238e-05, + "loss": 0.0572, + "step": 2360 + }, + { + "epoch": 2.8451380552220886, + "grad_norm": 0.22690969705581665, + "learning_rate": 9.872448884858624e-05, + "loss": 0.0635, + "step": 2370 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.24856036901474, + "learning_rate": 9.870586768353815e-05, + "loss": 0.0602, + "step": 2380 + }, + { + "epoch": 2.8691476590636253, + "grad_norm": 0.3139076828956604, + "learning_rate": 9.868711335850764e-05, + "loss": 0.0628, + "step": 2390 + }, + { + "epoch": 2.881152460984394, + "grad_norm": 0.2659068703651428, + "learning_rate": 9.866822592476833e-05, + "loss": 0.0616, + "step": 2400 + }, + { + "epoch": 2.893157262905162, + "grad_norm": 0.2549724578857422, + "learning_rate": 9.86492054339577e-05, + "loss": 0.0585, + "step": 2410 + }, + { + "epoch": 2.90516206482593, + "grad_norm": 0.22121308743953705, + "learning_rate": 9.863005193807711e-05, + "loss": 0.0633, + "step": 2420 + }, + { + "epoch": 2.917166866746699, + "grad_norm": 0.335601806640625, + "learning_rate": 9.861076548949143e-05, + "loss": 0.0545, + "step": 2430 + }, + { + "epoch": 2.929171668667467, + "grad_norm": 0.42563727498054504, + "learning_rate": 9.859134614092912e-05, + "loss": 0.0616, + "step": 2440 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.3198833763599396, + "learning_rate": 9.857179394548191e-05, + "loss": 0.0608, + "step": 2450 + }, + { + "epoch": 2.9531812725090036, + "grad_norm": 0.24733345210552216, + "learning_rate": 9.855210895660477e-05, + "loss": 0.064, + "step": 2460 + }, + { + "epoch": 2.965186074429772, + "grad_norm": 0.2628723084926605, + "learning_rate": 9.853229122811568e-05, + "loss": 0.059, + "step": 2470 + }, + { + "epoch": 2.9771908763505404, + "grad_norm": 0.23718471825122833, + "learning_rate": 9.851234081419559e-05, + "loss": 0.0623, + "step": 2480 + }, + { + "epoch": 2.9891956782713085, + "grad_norm": 0.2799265384674072, + "learning_rate": 9.849225776938814e-05, + "loss": 0.0713, + "step": 2490 + }, + { + "epoch": 3.0012004801920766, + "grad_norm": 0.2521497905254364, + "learning_rate": 9.847204214859964e-05, + "loss": 0.0641, + "step": 2500 + }, + { + "epoch": 3.013205282112845, + "grad_norm": 0.23328296840190887, + "learning_rate": 9.845169400709879e-05, + "loss": 0.0687, + "step": 2510 + }, + { + "epoch": 3.0252100840336134, + "grad_norm": 0.3600691556930542, + "learning_rate": 9.843121340051664e-05, + "loss": 0.0624, + "step": 2520 + }, + { + "epoch": 3.037214885954382, + "grad_norm": 0.17813469469547272, + "learning_rate": 9.841060038484641e-05, + "loss": 0.0602, + "step": 2530 + }, + { + "epoch": 3.04921968787515, + "grad_norm": 0.28349965810775757, + "learning_rate": 9.838985501644328e-05, + "loss": 0.0601, + "step": 2540 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 0.23676714301109314, + "learning_rate": 9.83689773520243e-05, + "loss": 0.0646, + "step": 2550 + }, + { + "epoch": 3.073229291716687, + "grad_norm": 0.26812148094177246, + "learning_rate": 9.834796744866819e-05, + "loss": 0.0611, + "step": 2560 + }, + { + "epoch": 3.085234093637455, + "grad_norm": 0.20052385330200195, + "learning_rate": 9.832682536381525e-05, + "loss": 0.0571, + "step": 2570 + }, + { + "epoch": 3.097238895558223, + "grad_norm": 0.3274983763694763, + "learning_rate": 9.830555115526711e-05, + "loss": 0.0607, + "step": 2580 + }, + { + "epoch": 3.1092436974789917, + "grad_norm": 0.19615769386291504, + "learning_rate": 9.828414488118667e-05, + "loss": 0.0597, + "step": 2590 + }, + { + "epoch": 3.12124849939976, + "grad_norm": 0.21695363521575928, + "learning_rate": 9.826260660009785e-05, + "loss": 0.0655, + "step": 2600 + }, + { + "epoch": 3.1332533013205284, + "grad_norm": 0.2715723514556885, + "learning_rate": 9.824093637088547e-05, + "loss": 0.0612, + "step": 2610 + }, + { + "epoch": 3.1452581032412965, + "grad_norm": 0.3415834903717041, + "learning_rate": 9.821913425279514e-05, + "loss": 0.0595, + "step": 2620 + }, + { + "epoch": 3.1572629051620646, + "grad_norm": 0.33938565850257874, + "learning_rate": 9.8197200305433e-05, + "loss": 0.0593, + "step": 2630 + }, + { + "epoch": 3.1692677070828332, + "grad_norm": 0.2142215073108673, + "learning_rate": 9.817513458876564e-05, + "loss": 0.0626, + "step": 2640 + }, + { + "epoch": 3.1812725090036014, + "grad_norm": 0.3336300551891327, + "learning_rate": 9.815293716311987e-05, + "loss": 0.0571, + "step": 2650 + }, + { + "epoch": 3.19327731092437, + "grad_norm": 0.3601575493812561, + "learning_rate": 9.813060808918262e-05, + "loss": 0.0666, + "step": 2660 + }, + { + "epoch": 3.205282112845138, + "grad_norm": 0.31689658761024475, + "learning_rate": 9.810814742800069e-05, + "loss": 0.0532, + "step": 2670 + }, + { + "epoch": 3.2172869147659062, + "grad_norm": 0.27273476123809814, + "learning_rate": 9.808555524098074e-05, + "loss": 0.0601, + "step": 2680 + }, + { + "epoch": 3.229291716686675, + "grad_norm": 0.3097611665725708, + "learning_rate": 9.806283158988887e-05, + "loss": 0.0636, + "step": 2690 + }, + { + "epoch": 3.241296518607443, + "grad_norm": 0.28053778409957886, + "learning_rate": 9.803997653685072e-05, + "loss": 0.0606, + "step": 2700 + }, + { + "epoch": 3.2533013205282115, + "grad_norm": 0.24679718911647797, + "learning_rate": 9.801699014435112e-05, + "loss": 0.0612, + "step": 2710 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 0.2612006962299347, + "learning_rate": 9.799387247523398e-05, + "loss": 0.0605, + "step": 2720 + }, + { + "epoch": 3.277310924369748, + "grad_norm": 0.23463095724582672, + "learning_rate": 9.797062359270215e-05, + "loss": 0.0652, + "step": 2730 + }, + { + "epoch": 3.2893157262905164, + "grad_norm": 0.2189844399690628, + "learning_rate": 9.794724356031715e-05, + "loss": 0.062, + "step": 2740 + }, + { + "epoch": 3.3013205282112845, + "grad_norm": 0.3142010271549225, + "learning_rate": 9.792373244199913e-05, + "loss": 0.0629, + "step": 2750 + }, + { + "epoch": 3.3133253301320527, + "grad_norm": 0.27976614236831665, + "learning_rate": 9.790009030202658e-05, + "loss": 0.0667, + "step": 2760 + }, + { + "epoch": 3.3253301320528212, + "grad_norm": 0.24450460076332092, + "learning_rate": 9.78763172050362e-05, + "loss": 0.0658, + "step": 2770 + }, + { + "epoch": 3.3373349339735894, + "grad_norm": 0.1786380112171173, + "learning_rate": 9.785241321602274e-05, + "loss": 0.0586, + "step": 2780 + }, + { + "epoch": 3.3493397358943575, + "grad_norm": 0.32171499729156494, + "learning_rate": 9.782837840033879e-05, + "loss": 0.054, + "step": 2790 + }, + { + "epoch": 3.361344537815126, + "grad_norm": 0.21520072221755981, + "learning_rate": 9.780421282369461e-05, + "loss": 0.059, + "step": 2800 + }, + { + "epoch": 3.3733493397358942, + "grad_norm": 0.29285305738449097, + "learning_rate": 9.777991655215797e-05, + "loss": 0.0575, + "step": 2810 + }, + { + "epoch": 3.385354141656663, + "grad_norm": 0.2678052484989166, + "learning_rate": 9.775548965215394e-05, + "loss": 0.059, + "step": 2820 + }, + { + "epoch": 3.397358943577431, + "grad_norm": 0.25555649399757385, + "learning_rate": 9.773093219046474e-05, + "loss": 0.0592, + "step": 2830 + }, + { + "epoch": 3.409363745498199, + "grad_norm": 0.2673632502555847, + "learning_rate": 9.770624423422954e-05, + "loss": 0.0624, + "step": 2840 + }, + { + "epoch": 3.4213685474189677, + "grad_norm": 0.26395389437675476, + "learning_rate": 9.768142585094426e-05, + "loss": 0.0655, + "step": 2850 + }, + { + "epoch": 3.433373349339736, + "grad_norm": 0.3288205564022064, + "learning_rate": 9.765647710846142e-05, + "loss": 0.0633, + "step": 2860 + }, + { + "epoch": 3.4453781512605044, + "grad_norm": 0.24022620916366577, + "learning_rate": 9.763139807498991e-05, + "loss": 0.0653, + "step": 2870 + }, + { + "epoch": 3.4573829531812725, + "grad_norm": 0.3421861231327057, + "learning_rate": 9.760618881909487e-05, + "loss": 0.0638, + "step": 2880 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 0.2175477296113968, + "learning_rate": 9.758084940969744e-05, + "loss": 0.0614, + "step": 2890 + }, + { + "epoch": 3.4813925570228093, + "grad_norm": 0.20153656601905823, + "learning_rate": 9.755537991607459e-05, + "loss": 0.0564, + "step": 2900 + }, + { + "epoch": 3.4933973589435774, + "grad_norm": 0.27558282017707825, + "learning_rate": 9.752978040785895e-05, + "loss": 0.0539, + "step": 2910 + }, + { + "epoch": 3.505402160864346, + "grad_norm": 0.29207199811935425, + "learning_rate": 9.750405095503859e-05, + "loss": 0.063, + "step": 2920 + }, + { + "epoch": 3.517406962785114, + "grad_norm": 0.3319151699542999, + "learning_rate": 9.747819162795686e-05, + "loss": 0.067, + "step": 2930 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 0.36516430974006653, + "learning_rate": 9.745220249731217e-05, + "loss": 0.0633, + "step": 2940 + }, + { + "epoch": 3.5414165666266504, + "grad_norm": 0.34975388646125793, + "learning_rate": 9.742608363415781e-05, + "loss": 0.0598, + "step": 2950 + }, + { + "epoch": 3.553421368547419, + "grad_norm": 0.2755946218967438, + "learning_rate": 9.739983510990176e-05, + "loss": 0.0595, + "step": 2960 + }, + { + "epoch": 3.5654261704681876, + "grad_norm": 0.2289118766784668, + "learning_rate": 9.737345699630647e-05, + "loss": 0.0595, + "step": 2970 + }, + { + "epoch": 3.5774309723889557, + "grad_norm": 0.25320547819137573, + "learning_rate": 9.734694936548869e-05, + "loss": 0.0504, + "step": 2980 + }, + { + "epoch": 3.589435774309724, + "grad_norm": 0.2039664387702942, + "learning_rate": 9.732031228991932e-05, + "loss": 0.0563, + "step": 2990 + }, + { + "epoch": 3.601440576230492, + "grad_norm": 0.24739137291908264, + "learning_rate": 9.729354584242302e-05, + "loss": 0.0605, + "step": 3000 + }, + { + "epoch": 3.6134453781512605, + "grad_norm": 0.27463802695274353, + "learning_rate": 9.726665009617832e-05, + "loss": 0.0575, + "step": 3010 + }, + { + "epoch": 3.6254501800720287, + "grad_norm": 0.3321676254272461, + "learning_rate": 9.723962512471714e-05, + "loss": 0.0604, + "step": 3020 + }, + { + "epoch": 3.6374549819927973, + "grad_norm": 0.27718716859817505, + "learning_rate": 9.72124710019247e-05, + "loss": 0.0573, + "step": 3030 + }, + { + "epoch": 3.6494597839135654, + "grad_norm": 0.208843395113945, + "learning_rate": 9.718518780203934e-05, + "loss": 0.0585, + "step": 3040 + }, + { + "epoch": 3.6614645858343335, + "grad_norm": 0.21609088778495789, + "learning_rate": 9.715777559965228e-05, + "loss": 0.0533, + "step": 3050 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 0.30417191982269287, + "learning_rate": 9.713023446970746e-05, + "loss": 0.0593, + "step": 3060 + }, + { + "epoch": 3.6854741896758703, + "grad_norm": 0.30469974875450134, + "learning_rate": 9.710256448750126e-05, + "loss": 0.0553, + "step": 3070 + }, + { + "epoch": 3.697478991596639, + "grad_norm": 0.2616933286190033, + "learning_rate": 9.707476572868235e-05, + "loss": 0.0531, + "step": 3080 + }, + { + "epoch": 3.709483793517407, + "grad_norm": 0.29858702421188354, + "learning_rate": 9.704683826925149e-05, + "loss": 0.0607, + "step": 3090 + }, + { + "epoch": 3.721488595438175, + "grad_norm": 0.24377647042274475, + "learning_rate": 9.701878218556129e-05, + "loss": 0.0542, + "step": 3100 + }, + { + "epoch": 3.7334933973589437, + "grad_norm": 0.30100560188293457, + "learning_rate": 9.699059755431598e-05, + "loss": 0.0582, + "step": 3110 + }, + { + "epoch": 3.745498199279712, + "grad_norm": 0.2628312110900879, + "learning_rate": 9.696228445257132e-05, + "loss": 0.0594, + "step": 3120 + }, + { + "epoch": 3.7575030012004804, + "grad_norm": 0.276719868183136, + "learning_rate": 9.693384295773419e-05, + "loss": 0.055, + "step": 3130 + }, + { + "epoch": 3.7695078031212486, + "grad_norm": 0.180617555975914, + "learning_rate": 9.690527314756259e-05, + "loss": 0.0572, + "step": 3140 + }, + { + "epoch": 3.7815126050420167, + "grad_norm": 0.23303565382957458, + "learning_rate": 9.687657510016527e-05, + "loss": 0.0549, + "step": 3150 + }, + { + "epoch": 3.7935174069627853, + "grad_norm": 0.22634033858776093, + "learning_rate": 9.684774889400161e-05, + "loss": 0.052, + "step": 3160 + }, + { + "epoch": 3.8055222088835534, + "grad_norm": 0.17961014807224274, + "learning_rate": 9.681879460788135e-05, + "loss": 0.0616, + "step": 3170 + }, + { + "epoch": 3.817527010804322, + "grad_norm": 0.26663339138031006, + "learning_rate": 9.67897123209644e-05, + "loss": 0.0578, + "step": 3180 + }, + { + "epoch": 3.82953181272509, + "grad_norm": 0.35115137696266174, + "learning_rate": 9.676050211276062e-05, + "loss": 0.0629, + "step": 3190 + }, + { + "epoch": 3.8415366146458583, + "grad_norm": 0.2587292492389679, + "learning_rate": 9.673116406312962e-05, + "loss": 0.057, + "step": 3200 + }, + { + "epoch": 3.8535414165666264, + "grad_norm": 0.47217655181884766, + "learning_rate": 9.67016982522805e-05, + "loss": 0.0565, + "step": 3210 + }, + { + "epoch": 3.865546218487395, + "grad_norm": 0.23443858325481415, + "learning_rate": 9.667210476077164e-05, + "loss": 0.0592, + "step": 3220 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 0.26991507411003113, + "learning_rate": 9.664238366951055e-05, + "loss": 0.0532, + "step": 3230 + }, + { + "epoch": 3.8895558223289317, + "grad_norm": 0.27663329243659973, + "learning_rate": 9.661253505975355e-05, + "loss": 0.061, + "step": 3240 + }, + { + "epoch": 3.9015606242497, + "grad_norm": 0.28021475672721863, + "learning_rate": 9.658255901310557e-05, + "loss": 0.0592, + "step": 3250 + }, + { + "epoch": 3.913565426170468, + "grad_norm": 0.21709518134593964, + "learning_rate": 9.655245561152e-05, + "loss": 0.0548, + "step": 3260 + }, + { + "epoch": 3.9255702280912366, + "grad_norm": 0.2568759024143219, + "learning_rate": 9.65222249372984e-05, + "loss": 0.0629, + "step": 3270 + }, + { + "epoch": 3.9375750300120047, + "grad_norm": 0.24051183462142944, + "learning_rate": 9.649186707309026e-05, + "loss": 0.0586, + "step": 3280 + }, + { + "epoch": 3.9495798319327733, + "grad_norm": 0.2518211305141449, + "learning_rate": 9.646138210189283e-05, + "loss": 0.0679, + "step": 3290 + }, + { + "epoch": 3.9615846338535414, + "grad_norm": 0.3275652229785919, + "learning_rate": 9.643077010705087e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 3.9735894357743096, + "grad_norm": 0.3162301480770111, + "learning_rate": 9.640003117225637e-05, + "loss": 0.0583, + "step": 3310 + }, + { + "epoch": 3.985594237695078, + "grad_norm": 0.321437269449234, + "learning_rate": 9.636916538154846e-05, + "loss": 0.0534, + "step": 3320 + }, + { + "epoch": 3.9975990396158463, + "grad_norm": 0.21063852310180664, + "learning_rate": 9.633817281931296e-05, + "loss": 0.0584, + "step": 3330 + }, + { + "epoch": 4.009603841536615, + "grad_norm": 0.30321308970451355, + "learning_rate": 9.630705357028242e-05, + "loss": 0.057, + "step": 3340 + }, + { + "epoch": 4.021608643457383, + "grad_norm": 0.36937403678894043, + "learning_rate": 9.627580771953563e-05, + "loss": 0.0663, + "step": 3350 + }, + { + "epoch": 4.033613445378151, + "grad_norm": 0.2652818560600281, + "learning_rate": 9.624443535249759e-05, + "loss": 0.0608, + "step": 3360 + }, + { + "epoch": 4.045618247298919, + "grad_norm": 0.3787773549556732, + "learning_rate": 9.621293655493913e-05, + "loss": 0.0566, + "step": 3370 + }, + { + "epoch": 4.057623049219688, + "grad_norm": 0.24754038453102112, + "learning_rate": 9.618131141297675e-05, + "loss": 0.0558, + "step": 3380 + }, + { + "epoch": 4.069627851140456, + "grad_norm": 0.30723315477371216, + "learning_rate": 9.614956001307242e-05, + "loss": 0.062, + "step": 3390 + }, + { + "epoch": 4.081632653061225, + "grad_norm": 0.23069646954536438, + "learning_rate": 9.611768244203321e-05, + "loss": 0.0607, + "step": 3400 + }, + { + "epoch": 4.093637454981993, + "grad_norm": 0.17749261856079102, + "learning_rate": 9.60856787870112e-05, + "loss": 0.0572, + "step": 3410 + }, + { + "epoch": 4.105642256902761, + "grad_norm": 0.2597220242023468, + "learning_rate": 9.605354913550318e-05, + "loss": 0.0503, + "step": 3420 + }, + { + "epoch": 4.117647058823529, + "grad_norm": 0.20197387039661407, + "learning_rate": 9.602129357535037e-05, + "loss": 0.06, + "step": 3430 + }, + { + "epoch": 4.129651860744298, + "grad_norm": 0.32154643535614014, + "learning_rate": 9.598891219473825e-05, + "loss": 0.0605, + "step": 3440 + }, + { + "epoch": 4.141656662665066, + "grad_norm": 0.23123258352279663, + "learning_rate": 9.595640508219625e-05, + "loss": 0.0572, + "step": 3450 + }, + { + "epoch": 4.153661464585834, + "grad_norm": 0.22109785676002502, + "learning_rate": 9.592377232659761e-05, + "loss": 0.0618, + "step": 3460 + }, + { + "epoch": 4.165666266506602, + "grad_norm": 0.270155131816864, + "learning_rate": 9.589101401715904e-05, + "loss": 0.0583, + "step": 3470 + }, + { + "epoch": 4.177671068427371, + "grad_norm": 0.18894092738628387, + "learning_rate": 9.585813024344045e-05, + "loss": 0.0534, + "step": 3480 + }, + { + "epoch": 4.18967587034814, + "grad_norm": 0.28107354044914246, + "learning_rate": 9.58251210953449e-05, + "loss": 0.0602, + "step": 3490 + }, + { + "epoch": 4.201680672268908, + "grad_norm": 0.284498393535614, + "learning_rate": 9.579198666311809e-05, + "loss": 0.0575, + "step": 3500 + }, + { + "epoch": 4.213685474189676, + "grad_norm": 0.25355949997901917, + "learning_rate": 9.575872703734832e-05, + "loss": 0.0567, + "step": 3510 + }, + { + "epoch": 4.225690276110444, + "grad_norm": 0.24327352643013, + "learning_rate": 9.572534230896611e-05, + "loss": 0.0554, + "step": 3520 + }, + { + "epoch": 4.237695078031212, + "grad_norm": 0.3286752998828888, + "learning_rate": 9.569183256924403e-05, + "loss": 0.064, + "step": 3530 + }, + { + "epoch": 4.249699879951981, + "grad_norm": 0.21528860926628113, + "learning_rate": 9.565819790979646e-05, + "loss": 0.0585, + "step": 3540 + }, + { + "epoch": 4.261704681872749, + "grad_norm": 0.2020454853773117, + "learning_rate": 9.562443842257925e-05, + "loss": 0.0543, + "step": 3550 + }, + { + "epoch": 4.2737094837935174, + "grad_norm": 0.16958212852478027, + "learning_rate": 9.559055419988956e-05, + "loss": 0.0545, + "step": 3560 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.18796652555465698, + "learning_rate": 9.555654533436557e-05, + "loss": 0.0565, + "step": 3570 + }, + { + "epoch": 4.297719087635054, + "grad_norm": 0.2671239674091339, + "learning_rate": 9.552241191898621e-05, + "loss": 0.0562, + "step": 3580 + }, + { + "epoch": 4.309723889555823, + "grad_norm": 0.24387720227241516, + "learning_rate": 9.548815404707092e-05, + "loss": 0.0581, + "step": 3590 + }, + { + "epoch": 4.321728691476591, + "grad_norm": 0.2632243037223816, + "learning_rate": 9.545377181227942e-05, + "loss": 0.0582, + "step": 3600 + }, + { + "epoch": 4.333733493397359, + "grad_norm": 0.2634962499141693, + "learning_rate": 9.541926530861145e-05, + "loss": 0.0546, + "step": 3610 + }, + { + "epoch": 4.345738295318127, + "grad_norm": 0.22657142579555511, + "learning_rate": 9.538463463040645e-05, + "loss": 0.0576, + "step": 3620 + }, + { + "epoch": 4.357743097238895, + "grad_norm": 0.24404332041740417, + "learning_rate": 9.534987987234337e-05, + "loss": 0.0535, + "step": 3630 + }, + { + "epoch": 4.369747899159664, + "grad_norm": 0.294133722782135, + "learning_rate": 9.53150011294404e-05, + "loss": 0.0535, + "step": 3640 + }, + { + "epoch": 4.3817527010804325, + "grad_norm": 0.1801920384168625, + "learning_rate": 9.527999849705471e-05, + "loss": 0.0571, + "step": 3650 + }, + { + "epoch": 4.393757503001201, + "grad_norm": 0.18878726661205292, + "learning_rate": 9.524487207088213e-05, + "loss": 0.0518, + "step": 3660 + }, + { + "epoch": 4.405762304921969, + "grad_norm": 0.33183103799819946, + "learning_rate": 9.520962194695698e-05, + "loss": 0.0588, + "step": 3670 + }, + { + "epoch": 4.417767106842737, + "grad_norm": 0.33265674114227295, + "learning_rate": 9.517424822165175e-05, + "loss": 0.0514, + "step": 3680 + }, + { + "epoch": 4.429771908763505, + "grad_norm": 0.2474125623703003, + "learning_rate": 9.513875099167685e-05, + "loss": 0.0566, + "step": 3690 + }, + { + "epoch": 4.441776710684274, + "grad_norm": 0.28438690304756165, + "learning_rate": 9.510313035408035e-05, + "loss": 0.0623, + "step": 3700 + }, + { + "epoch": 4.453781512605042, + "grad_norm": 0.2712539732456207, + "learning_rate": 9.506738640624775e-05, + "loss": 0.0546, + "step": 3710 + }, + { + "epoch": 4.46578631452581, + "grad_norm": 0.26627102494239807, + "learning_rate": 9.50315192459016e-05, + "loss": 0.0513, + "step": 3720 + }, + { + "epoch": 4.4777911164465785, + "grad_norm": 0.26080259680747986, + "learning_rate": 9.499552897110136e-05, + "loss": 0.0515, + "step": 3730 + }, + { + "epoch": 4.489795918367347, + "grad_norm": 0.28555986285209656, + "learning_rate": 9.495941568024304e-05, + "loss": 0.0565, + "step": 3740 + }, + { + "epoch": 4.501800720288116, + "grad_norm": 0.3143094480037689, + "learning_rate": 9.492317947205904e-05, + "loss": 0.0574, + "step": 3750 + }, + { + "epoch": 4.513805522208884, + "grad_norm": 0.38898929953575134, + "learning_rate": 9.488682044561775e-05, + "loss": 0.0606, + "step": 3760 + }, + { + "epoch": 4.525810324129652, + "grad_norm": 0.17756113409996033, + "learning_rate": 9.485033870032335e-05, + "loss": 0.061, + "step": 3770 + }, + { + "epoch": 4.53781512605042, + "grad_norm": 0.25813746452331543, + "learning_rate": 9.481373433591556e-05, + "loss": 0.056, + "step": 3780 + }, + { + "epoch": 4.549819927971188, + "grad_norm": 0.188607320189476, + "learning_rate": 9.47770074524693e-05, + "loss": 0.0536, + "step": 3790 + }, + { + "epoch": 4.561824729891956, + "grad_norm": 0.2389359474182129, + "learning_rate": 9.474015815039446e-05, + "loss": 0.0526, + "step": 3800 + }, + { + "epoch": 4.573829531812725, + "grad_norm": 0.17852452397346497, + "learning_rate": 9.470318653043565e-05, + "loss": 0.0589, + "step": 3810 + }, + { + "epoch": 4.5858343337334935, + "grad_norm": 0.23551581799983978, + "learning_rate": 9.466609269367185e-05, + "loss": 0.061, + "step": 3820 + }, + { + "epoch": 4.597839135654262, + "grad_norm": 0.193104088306427, + "learning_rate": 9.46288767415162e-05, + "loss": 0.0594, + "step": 3830 + }, + { + "epoch": 4.60984393757503, + "grad_norm": 0.32908573746681213, + "learning_rate": 9.459153877571567e-05, + "loss": 0.0562, + "step": 3840 + }, + { + "epoch": 4.621848739495798, + "grad_norm": 0.22577226161956787, + "learning_rate": 9.455407889835087e-05, + "loss": 0.0541, + "step": 3850 + }, + { + "epoch": 4.633853541416567, + "grad_norm": 0.1556607186794281, + "learning_rate": 9.451649721183564e-05, + "loss": 0.0539, + "step": 3860 + }, + { + "epoch": 4.645858343337335, + "grad_norm": 0.20491351187229156, + "learning_rate": 9.447879381891692e-05, + "loss": 0.0564, + "step": 3870 + }, + { + "epoch": 4.657863145258103, + "grad_norm": 0.25102701783180237, + "learning_rate": 9.444096882267428e-05, + "loss": 0.053, + "step": 3880 + }, + { + "epoch": 4.669867947178871, + "grad_norm": 0.22304119169712067, + "learning_rate": 9.440302232651988e-05, + "loss": 0.0518, + "step": 3890 + }, + { + "epoch": 4.6818727490996395, + "grad_norm": 0.23516465723514557, + "learning_rate": 9.436495443419795e-05, + "loss": 0.0549, + "step": 3900 + }, + { + "epoch": 4.6938775510204085, + "grad_norm": 0.2126174420118332, + "learning_rate": 9.432676524978466e-05, + "loss": 0.0578, + "step": 3910 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.228512242436409, + "learning_rate": 9.42884548776878e-05, + "loss": 0.059, + "step": 3920 + }, + { + "epoch": 4.717887154861945, + "grad_norm": 0.27132928371429443, + "learning_rate": 9.425002342264646e-05, + "loss": 0.0537, + "step": 3930 + }, + { + "epoch": 4.729891956782713, + "grad_norm": 0.20443004369735718, + "learning_rate": 9.421147098973077e-05, + "loss": 0.0488, + "step": 3940 + }, + { + "epoch": 4.741896758703481, + "grad_norm": 0.27069994807243347, + "learning_rate": 9.41727976843416e-05, + "loss": 0.0517, + "step": 3950 + }, + { + "epoch": 4.75390156062425, + "grad_norm": 0.29525721073150635, + "learning_rate": 9.413400361221029e-05, + "loss": 0.0556, + "step": 3960 + }, + { + "epoch": 4.765906362545018, + "grad_norm": 0.24084146320819855, + "learning_rate": 9.409508887939835e-05, + "loss": 0.054, + "step": 3970 + }, + { + "epoch": 4.777911164465786, + "grad_norm": 0.28273072838783264, + "learning_rate": 9.40560535922972e-05, + "loss": 0.0529, + "step": 3980 + }, + { + "epoch": 4.7899159663865545, + "grad_norm": 0.2077736258506775, + "learning_rate": 9.40168978576278e-05, + "loss": 0.0567, + "step": 3990 + }, + { + "epoch": 4.801920768307323, + "grad_norm": 0.1921141892671585, + "learning_rate": 9.397762178244043e-05, + "loss": 0.0574, + "step": 4000 + }, + { + "epoch": 4.813925570228092, + "grad_norm": 0.27245086431503296, + "learning_rate": 9.393822547411439e-05, + "loss": 0.0579, + "step": 4010 + }, + { + "epoch": 4.82593037214886, + "grad_norm": 0.3536801040172577, + "learning_rate": 9.389870904035769e-05, + "loss": 0.0566, + "step": 4020 + }, + { + "epoch": 4.837935174069628, + "grad_norm": 0.36953094601631165, + "learning_rate": 9.385907258920672e-05, + "loss": 0.0525, + "step": 4030 + }, + { + "epoch": 4.849939975990396, + "grad_norm": 0.32677337527275085, + "learning_rate": 9.381931622902607e-05, + "loss": 0.0516, + "step": 4040 + }, + { + "epoch": 4.861944777911164, + "grad_norm": 0.20237119495868683, + "learning_rate": 9.377944006850807e-05, + "loss": 0.0524, + "step": 4050 + }, + { + "epoch": 4.873949579831933, + "grad_norm": 0.2647162675857544, + "learning_rate": 9.373944421667265e-05, + "loss": 0.0494, + "step": 4060 + }, + { + "epoch": 4.885954381752701, + "grad_norm": 0.2096008062362671, + "learning_rate": 9.369932878286691e-05, + "loss": 0.0497, + "step": 4070 + }, + { + "epoch": 4.8979591836734695, + "grad_norm": 0.21764107048511505, + "learning_rate": 9.365909387676494e-05, + "loss": 0.0538, + "step": 4080 + }, + { + "epoch": 4.909963985594238, + "grad_norm": 0.2380916178226471, + "learning_rate": 9.361873960836744e-05, + "loss": 0.0497, + "step": 4090 + }, + { + "epoch": 4.921968787515006, + "grad_norm": 0.29858648777008057, + "learning_rate": 9.357826608800142e-05, + "loss": 0.0541, + "step": 4100 + }, + { + "epoch": 4.933973589435775, + "grad_norm": 0.34755244851112366, + "learning_rate": 9.353767342631994e-05, + "loss": 0.0547, + "step": 4110 + }, + { + "epoch": 4.945978391356543, + "grad_norm": 0.20405231416225433, + "learning_rate": 9.34969617343018e-05, + "loss": 0.0547, + "step": 4120 + }, + { + "epoch": 4.957983193277311, + "grad_norm": 0.19564193487167358, + "learning_rate": 9.345613112325122e-05, + "loss": 0.0511, + "step": 4130 + }, + { + "epoch": 4.969987995198079, + "grad_norm": 0.22039952874183655, + "learning_rate": 9.34151817047975e-05, + "loss": 0.0548, + "step": 4140 + }, + { + "epoch": 4.981992797118847, + "grad_norm": 0.1953112632036209, + "learning_rate": 9.33741135908948e-05, + "loss": 0.0533, + "step": 4150 + }, + { + "epoch": 4.9939975990396155, + "grad_norm": 0.19792640209197998, + "learning_rate": 9.33329268938218e-05, + "loss": 0.0507, + "step": 4160 + }, + { + "epoch": 5.0060024009603845, + "grad_norm": 0.19345709681510925, + "learning_rate": 9.329162172618132e-05, + "loss": 0.0507, + "step": 4170 + }, + { + "epoch": 5.018007202881153, + "grad_norm": 0.2091323435306549, + "learning_rate": 9.325019820090013e-05, + "loss": 0.0523, + "step": 4180 + }, + { + "epoch": 5.030012004801921, + "grad_norm": 0.23929379880428314, + "learning_rate": 9.320865643122855e-05, + "loss": 0.0561, + "step": 4190 + }, + { + "epoch": 5.042016806722689, + "grad_norm": 0.18234746158123016, + "learning_rate": 9.316699653074023e-05, + "loss": 0.0607, + "step": 4200 + }, + { + "epoch": 5.054021608643457, + "grad_norm": 0.17708313465118408, + "learning_rate": 9.312521861333172e-05, + "loss": 0.0587, + "step": 4210 + }, + { + "epoch": 5.066026410564226, + "grad_norm": 0.18894508481025696, + "learning_rate": 9.308332279322224e-05, + "loss": 0.0506, + "step": 4220 + }, + { + "epoch": 5.078031212484994, + "grad_norm": 0.2434273660182953, + "learning_rate": 9.304130918495338e-05, + "loss": 0.0559, + "step": 4230 + }, + { + "epoch": 5.090036014405762, + "grad_norm": 0.178195983171463, + "learning_rate": 9.299917790338874e-05, + "loss": 0.0506, + "step": 4240 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 0.26926079392433167, + "learning_rate": 9.295692906371363e-05, + "loss": 0.0544, + "step": 4250 + }, + { + "epoch": 5.114045618247299, + "grad_norm": 0.23360787332057953, + "learning_rate": 9.291456278143476e-05, + "loss": 0.0557, + "step": 4260 + }, + { + "epoch": 5.126050420168067, + "grad_norm": 0.1904219537973404, + "learning_rate": 9.287207917237994e-05, + "loss": 0.0527, + "step": 4270 + }, + { + "epoch": 5.138055222088836, + "grad_norm": 0.23298969864845276, + "learning_rate": 9.282947835269773e-05, + "loss": 0.0492, + "step": 4280 + }, + { + "epoch": 5.150060024009604, + "grad_norm": 0.18144750595092773, + "learning_rate": 9.278676043885715e-05, + "loss": 0.0516, + "step": 4290 + }, + { + "epoch": 5.162064825930372, + "grad_norm": 0.19554777443408966, + "learning_rate": 9.274392554764733e-05, + "loss": 0.0493, + "step": 4300 + }, + { + "epoch": 5.17406962785114, + "grad_norm": 0.20711618661880493, + "learning_rate": 9.270097379617723e-05, + "loss": 0.0541, + "step": 4310 + }, + { + "epoch": 5.186074429771908, + "grad_norm": 0.27183395624160767, + "learning_rate": 9.26579053018753e-05, + "loss": 0.0539, + "step": 4320 + }, + { + "epoch": 5.198079231692677, + "grad_norm": 0.2256668359041214, + "learning_rate": 9.261472018248918e-05, + "loss": 0.0508, + "step": 4330 + }, + { + "epoch": 5.2100840336134455, + "grad_norm": 0.3053922653198242, + "learning_rate": 9.25714185560853e-05, + "loss": 0.053, + "step": 4340 + }, + { + "epoch": 5.222088835534214, + "grad_norm": 0.24106553196907043, + "learning_rate": 9.252800054104868e-05, + "loss": 0.0542, + "step": 4350 + }, + { + "epoch": 5.234093637454982, + "grad_norm": 0.23704631626605988, + "learning_rate": 9.248446625608252e-05, + "loss": 0.0519, + "step": 4360 + }, + { + "epoch": 5.24609843937575, + "grad_norm": 0.24795225262641907, + "learning_rate": 9.244081582020789e-05, + "loss": 0.0487, + "step": 4370 + }, + { + "epoch": 5.258103241296519, + "grad_norm": 0.2051684409379959, + "learning_rate": 9.239704935276339e-05, + "loss": 0.0572, + "step": 4380 + }, + { + "epoch": 5.270108043217287, + "grad_norm": 0.31637611985206604, + "learning_rate": 9.235316697340489e-05, + "loss": 0.0553, + "step": 4390 + }, + { + "epoch": 5.282112845138055, + "grad_norm": 0.24383464455604553, + "learning_rate": 9.230916880210512e-05, + "loss": 0.06, + "step": 4400 + }, + { + "epoch": 5.294117647058823, + "grad_norm": 0.17547498643398285, + "learning_rate": 9.226505495915342e-05, + "loss": 0.0574, + "step": 4410 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 0.22994737327098846, + "learning_rate": 9.222082556515536e-05, + "loss": 0.0517, + "step": 4420 + }, + { + "epoch": 5.3181272509003605, + "grad_norm": 0.24206072092056274, + "learning_rate": 9.217648074103242e-05, + "loss": 0.0518, + "step": 4430 + }, + { + "epoch": 5.330132052821129, + "grad_norm": 0.1928659826517105, + "learning_rate": 9.213202060802161e-05, + "loss": 0.0479, + "step": 4440 + }, + { + "epoch": 5.342136854741897, + "grad_norm": 0.20406599342823029, + "learning_rate": 9.208744528767528e-05, + "loss": 0.0496, + "step": 4450 + }, + { + "epoch": 5.354141656662665, + "grad_norm": 0.19463993608951569, + "learning_rate": 9.204275490186064e-05, + "loss": 0.0502, + "step": 4460 + }, + { + "epoch": 5.366146458583433, + "grad_norm": 0.215300053358078, + "learning_rate": 9.199794957275949e-05, + "loss": 0.0521, + "step": 4470 + }, + { + "epoch": 5.378151260504202, + "grad_norm": 0.28928396105766296, + "learning_rate": 9.19530294228679e-05, + "loss": 0.0539, + "step": 4480 + }, + { + "epoch": 5.39015606242497, + "grad_norm": 0.22982926666736603, + "learning_rate": 9.190799457499583e-05, + "loss": 0.0483, + "step": 4490 + }, + { + "epoch": 5.402160864345738, + "grad_norm": 0.18290464580059052, + "learning_rate": 9.186284515226686e-05, + "loss": 0.0552, + "step": 4500 + }, + { + "epoch": 5.4141656662665065, + "grad_norm": 0.20831234753131866, + "learning_rate": 9.181758127811777e-05, + "loss": 0.0482, + "step": 4510 + }, + { + "epoch": 5.426170468187275, + "grad_norm": 0.22328080236911774, + "learning_rate": 9.177220307629825e-05, + "loss": 0.0499, + "step": 4520 + }, + { + "epoch": 5.438175270108043, + "grad_norm": 0.22057555615901947, + "learning_rate": 9.172671067087059e-05, + "loss": 0.049, + "step": 4530 + }, + { + "epoch": 5.450180072028812, + "grad_norm": 0.1394820660352707, + "learning_rate": 9.16811041862093e-05, + "loss": 0.0494, + "step": 4540 + }, + { + "epoch": 5.46218487394958, + "grad_norm": 0.21995262801647186, + "learning_rate": 9.163538374700076e-05, + "loss": 0.0469, + "step": 4550 + }, + { + "epoch": 5.474189675870348, + "grad_norm": 0.2124902755022049, + "learning_rate": 9.158954947824287e-05, + "loss": 0.0468, + "step": 4560 + }, + { + "epoch": 5.486194477791116, + "grad_norm": 0.17514242231845856, + "learning_rate": 9.154360150524482e-05, + "loss": 0.0461, + "step": 4570 + }, + { + "epoch": 5.498199279711884, + "grad_norm": 0.17899569869041443, + "learning_rate": 9.14975399536266e-05, + "loss": 0.0508, + "step": 4580 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 0.2177286595106125, + "learning_rate": 9.14513649493187e-05, + "loss": 0.0469, + "step": 4590 + }, + { + "epoch": 5.5222088835534215, + "grad_norm": 0.19954226911067963, + "learning_rate": 9.140507661856187e-05, + "loss": 0.0478, + "step": 4600 + }, + { + "epoch": 5.53421368547419, + "grad_norm": 0.2471907138824463, + "learning_rate": 9.135867508790661e-05, + "loss": 0.0543, + "step": 4610 + }, + { + "epoch": 5.546218487394958, + "grad_norm": 0.1807817965745926, + "learning_rate": 9.131216048421291e-05, + "loss": 0.0554, + "step": 4620 + }, + { + "epoch": 5.558223289315726, + "grad_norm": 0.25474002957344055, + "learning_rate": 9.126553293464998e-05, + "loss": 0.0539, + "step": 4630 + }, + { + "epoch": 5.570228091236495, + "grad_norm": 0.26001736521720886, + "learning_rate": 9.121879256669572e-05, + "loss": 0.0514, + "step": 4640 + }, + { + "epoch": 5.582232893157263, + "grad_norm": 0.22953104972839355, + "learning_rate": 9.117193950813652e-05, + "loss": 0.0538, + "step": 4650 + }, + { + "epoch": 5.594237695078031, + "grad_norm": 0.2753389775753021, + "learning_rate": 9.112497388706685e-05, + "loss": 0.0533, + "step": 4660 + }, + { + "epoch": 5.606242496998799, + "grad_norm": 0.2846970558166504, + "learning_rate": 9.10778958318889e-05, + "loss": 0.0546, + "step": 4670 + }, + { + "epoch": 5.6182472989195675, + "grad_norm": 0.2542758584022522, + "learning_rate": 9.103070547131232e-05, + "loss": 0.049, + "step": 4680 + }, + { + "epoch": 5.630252100840336, + "grad_norm": 0.2651291489601135, + "learning_rate": 9.098340293435375e-05, + "loss": 0.0525, + "step": 4690 + }, + { + "epoch": 5.642256902761105, + "grad_norm": 0.21426956355571747, + "learning_rate": 9.093598835033649e-05, + "loss": 0.0536, + "step": 4700 + }, + { + "epoch": 5.654261704681873, + "grad_norm": 0.2831646203994751, + "learning_rate": 9.088846184889021e-05, + "loss": 0.0499, + "step": 4710 + }, + { + "epoch": 5.666266506602641, + "grad_norm": 0.2005920112133026, + "learning_rate": 9.084082355995057e-05, + "loss": 0.0501, + "step": 4720 + }, + { + "epoch": 5.678271308523409, + "grad_norm": 0.2331245094537735, + "learning_rate": 9.079307361375882e-05, + "loss": 0.0456, + "step": 4730 + }, + { + "epoch": 5.690276110444177, + "grad_norm": 0.2295786440372467, + "learning_rate": 9.074521214086149e-05, + "loss": 0.054, + "step": 4740 + }, + { + "epoch": 5.702280912364946, + "grad_norm": 0.25073477625846863, + "learning_rate": 9.069723927211001e-05, + "loss": 0.0564, + "step": 4750 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 0.1839093416929245, + "learning_rate": 9.064915513866037e-05, + "loss": 0.0541, + "step": 4760 + }, + { + "epoch": 5.7262905162064826, + "grad_norm": 0.3122197687625885, + "learning_rate": 9.060095987197279e-05, + "loss": 0.0507, + "step": 4770 + }, + { + "epoch": 5.738295318127251, + "grad_norm": 0.2450714409351349, + "learning_rate": 9.055265360381126e-05, + "loss": 0.0533, + "step": 4780 + }, + { + "epoch": 5.750300120048019, + "grad_norm": 0.17997689545154572, + "learning_rate": 9.050423646624326e-05, + "loss": 0.0549, + "step": 4790 + }, + { + "epoch": 5.762304921968788, + "grad_norm": 0.3649781346321106, + "learning_rate": 9.045570859163943e-05, + "loss": 0.0533, + "step": 4800 + }, + { + "epoch": 5.774309723889556, + "grad_norm": 0.2855977416038513, + "learning_rate": 9.04070701126731e-05, + "loss": 0.0542, + "step": 4810 + }, + { + "epoch": 5.786314525810324, + "grad_norm": 0.3228994607925415, + "learning_rate": 9.035832116232001e-05, + "loss": 0.0533, + "step": 4820 + }, + { + "epoch": 5.798319327731092, + "grad_norm": 0.2657751739025116, + "learning_rate": 9.030946187385796e-05, + "loss": 0.0455, + "step": 4830 + }, + { + "epoch": 5.81032412965186, + "grad_norm": 0.14723694324493408, + "learning_rate": 9.026049238086635e-05, + "loss": 0.0545, + "step": 4840 + }, + { + "epoch": 5.822328931572629, + "grad_norm": 0.2386440932750702, + "learning_rate": 9.021141281722591e-05, + "loss": 0.0533, + "step": 4850 + }, + { + "epoch": 5.834333733493398, + "grad_norm": 0.2640148401260376, + "learning_rate": 9.01622233171183e-05, + "loss": 0.0494, + "step": 4860 + }, + { + "epoch": 5.846338535414166, + "grad_norm": 0.22823426127433777, + "learning_rate": 9.011292401502574e-05, + "loss": 0.0532, + "step": 4870 + }, + { + "epoch": 5.858343337334934, + "grad_norm": 0.23199144005775452, + "learning_rate": 9.006351504573063e-05, + "loss": 0.0484, + "step": 4880 + }, + { + "epoch": 5.870348139255702, + "grad_norm": 0.20410332083702087, + "learning_rate": 9.001399654431519e-05, + "loss": 0.0445, + "step": 4890 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 0.24380654096603394, + "learning_rate": 8.996436864616116e-05, + "loss": 0.0504, + "step": 4900 + }, + { + "epoch": 5.894357743097239, + "grad_norm": 0.24276883900165558, + "learning_rate": 8.991463148694925e-05, + "loss": 0.0446, + "step": 4910 + }, + { + "epoch": 5.906362545018007, + "grad_norm": 0.2329537570476532, + "learning_rate": 8.986478520265902e-05, + "loss": 0.0524, + "step": 4920 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 0.21545173227787018, + "learning_rate": 8.981482992956827e-05, + "loss": 0.0557, + "step": 4930 + }, + { + "epoch": 5.930372148859544, + "grad_norm": 0.23841170966625214, + "learning_rate": 8.976476580425282e-05, + "loss": 0.0537, + "step": 4940 + }, + { + "epoch": 5.942376950780313, + "grad_norm": 0.2711506485939026, + "learning_rate": 8.971459296358606e-05, + "loss": 0.0524, + "step": 4950 + }, + { + "epoch": 5.954381752701081, + "grad_norm": 0.23540785908699036, + "learning_rate": 8.966431154473864e-05, + "loss": 0.0539, + "step": 4960 + }, + { + "epoch": 5.966386554621849, + "grad_norm": 0.2052491456270218, + "learning_rate": 8.961392168517803e-05, + "loss": 0.0482, + "step": 4970 + }, + { + "epoch": 5.978391356542617, + "grad_norm": 0.21751144528388977, + "learning_rate": 8.956342352266821e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 5.990396158463385, + "grad_norm": 0.19977490603923798, + "learning_rate": 8.95128171952692e-05, + "loss": 0.0479, + "step": 4990 + }, + { + "epoch": 6.002400960384153, + "grad_norm": 0.2518291771411896, + "learning_rate": 8.946210284133676e-05, + "loss": 0.0483, + "step": 5000 + }, + { + "epoch": 6.014405762304922, + "grad_norm": 0.2199583500623703, + "learning_rate": 8.941128059952201e-05, + "loss": 0.0494, + "step": 5010 + }, + { + "epoch": 6.02641056422569, + "grad_norm": 0.2438848316669464, + "learning_rate": 8.936035060877102e-05, + "loss": 0.0508, + "step": 5020 + }, + { + "epoch": 6.038415366146459, + "grad_norm": 0.21843816339969635, + "learning_rate": 8.930931300832443e-05, + "loss": 0.053, + "step": 5030 + }, + { + "epoch": 6.050420168067227, + "grad_norm": 0.2176237553358078, + "learning_rate": 8.925816793771711e-05, + "loss": 0.0486, + "step": 5040 + }, + { + "epoch": 6.062424969987995, + "grad_norm": 0.2368723601102829, + "learning_rate": 8.92069155367777e-05, + "loss": 0.0471, + "step": 5050 + }, + { + "epoch": 6.074429771908764, + "grad_norm": 0.22987157106399536, + "learning_rate": 8.915555594562834e-05, + "loss": 0.0482, + "step": 5060 + }, + { + "epoch": 6.086434573829532, + "grad_norm": 0.25101688504219055, + "learning_rate": 8.910408930468416e-05, + "loss": 0.0501, + "step": 5070 + }, + { + "epoch": 6.0984393757503, + "grad_norm": 0.21880921721458435, + "learning_rate": 8.905251575465303e-05, + "loss": 0.0455, + "step": 5080 + }, + { + "epoch": 6.110444177671068, + "grad_norm": 0.3213087320327759, + "learning_rate": 8.900083543653502e-05, + "loss": 0.0481, + "step": 5090 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 0.18597747385501862, + "learning_rate": 8.894904849162218e-05, + "loss": 0.0463, + "step": 5100 + }, + { + "epoch": 6.1344537815126055, + "grad_norm": 0.18564783036708832, + "learning_rate": 8.889715506149802e-05, + "loss": 0.0468, + "step": 5110 + }, + { + "epoch": 6.146458583433374, + "grad_norm": 0.15209642052650452, + "learning_rate": 8.884515528803722e-05, + "loss": 0.046, + "step": 5120 + }, + { + "epoch": 6.158463385354142, + "grad_norm": 0.26202788949012756, + "learning_rate": 8.879304931340517e-05, + "loss": 0.0541, + "step": 5130 + }, + { + "epoch": 6.17046818727491, + "grad_norm": 0.17599712312221527, + "learning_rate": 8.874083728005759e-05, + "loss": 0.0504, + "step": 5140 + }, + { + "epoch": 6.182472989195678, + "grad_norm": 0.21072055399417877, + "learning_rate": 8.868851933074021e-05, + "loss": 0.0544, + "step": 5150 + }, + { + "epoch": 6.194477791116446, + "grad_norm": 0.22731515765190125, + "learning_rate": 8.863609560848829e-05, + "loss": 0.0476, + "step": 5160 + }, + { + "epoch": 6.206482593037215, + "grad_norm": 0.14214357733726501, + "learning_rate": 8.85835662566263e-05, + "loss": 0.0477, + "step": 5170 + }, + { + "epoch": 6.218487394957983, + "grad_norm": 0.3050270676612854, + "learning_rate": 8.853093141876747e-05, + "loss": 0.0514, + "step": 5180 + }, + { + "epoch": 6.230492196878751, + "grad_norm": 0.23398663103580475, + "learning_rate": 8.847819123881343e-05, + "loss": 0.0474, + "step": 5190 + }, + { + "epoch": 6.24249699879952, + "grad_norm": 0.1598176211118698, + "learning_rate": 8.842534586095383e-05, + "loss": 0.0443, + "step": 5200 + }, + { + "epoch": 6.254501800720288, + "grad_norm": 0.264351487159729, + "learning_rate": 8.837239542966593e-05, + "loss": 0.0515, + "step": 5210 + }, + { + "epoch": 6.266506602641057, + "grad_norm": 0.21067233383655548, + "learning_rate": 8.831934008971417e-05, + "loss": 0.0533, + "step": 5220 + }, + { + "epoch": 6.278511404561825, + "grad_norm": 0.17587143182754517, + "learning_rate": 8.826617998614982e-05, + "loss": 0.048, + "step": 5230 + }, + { + "epoch": 6.290516206482593, + "grad_norm": 0.22748790681362152, + "learning_rate": 8.821291526431056e-05, + "loss": 0.0526, + "step": 5240 + }, + { + "epoch": 6.302521008403361, + "grad_norm": 0.17208072543144226, + "learning_rate": 8.815954606982015e-05, + "loss": 0.0499, + "step": 5250 + }, + { + "epoch": 6.314525810324129, + "grad_norm": 0.22176377475261688, + "learning_rate": 8.810607254858789e-05, + "loss": 0.0486, + "step": 5260 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 0.1860537976026535, + "learning_rate": 8.805249484680838e-05, + "loss": 0.0499, + "step": 5270 + }, + { + "epoch": 6.3385354141656665, + "grad_norm": 0.2822646498680115, + "learning_rate": 8.799881311096096e-05, + "loss": 0.0487, + "step": 5280 + }, + { + "epoch": 6.350540216086435, + "grad_norm": 0.16547034680843353, + "learning_rate": 8.794502748780949e-05, + "loss": 0.0474, + "step": 5290 + }, + { + "epoch": 6.362545018007203, + "grad_norm": 0.22144688665866852, + "learning_rate": 8.78911381244018e-05, + "loss": 0.0522, + "step": 5300 + }, + { + "epoch": 6.374549819927971, + "grad_norm": 0.21748043596744537, + "learning_rate": 8.783714516806933e-05, + "loss": 0.0487, + "step": 5310 + }, + { + "epoch": 6.38655462184874, + "grad_norm": 0.21169507503509521, + "learning_rate": 8.77830487664268e-05, + "loss": 0.046, + "step": 5320 + }, + { + "epoch": 6.398559423769508, + "grad_norm": 0.27978330850601196, + "learning_rate": 8.772884906737167e-05, + "loss": 0.0516, + "step": 5330 + }, + { + "epoch": 6.410564225690276, + "grad_norm": 0.24806849658489227, + "learning_rate": 8.767454621908387e-05, + "loss": 0.0531, + "step": 5340 + }, + { + "epoch": 6.422569027611044, + "grad_norm": 0.2576009929180145, + "learning_rate": 8.76201403700253e-05, + "loss": 0.0516, + "step": 5350 + }, + { + "epoch": 6.4345738295318124, + "grad_norm": 0.3232913911342621, + "learning_rate": 8.756563166893949e-05, + "loss": 0.0491, + "step": 5360 + }, + { + "epoch": 6.4465786314525815, + "grad_norm": 0.2769327461719513, + "learning_rate": 8.751102026485113e-05, + "loss": 0.0461, + "step": 5370 + }, + { + "epoch": 6.45858343337335, + "grad_norm": 0.21254070103168488, + "learning_rate": 8.745630630706571e-05, + "loss": 0.0489, + "step": 5380 + }, + { + "epoch": 6.470588235294118, + "grad_norm": 0.21661731600761414, + "learning_rate": 8.740148994516912e-05, + "loss": 0.0481, + "step": 5390 + }, + { + "epoch": 6.482593037214886, + "grad_norm": 0.27693209052085876, + "learning_rate": 8.73465713290272e-05, + "loss": 0.0488, + "step": 5400 + }, + { + "epoch": 6.494597839135654, + "grad_norm": 0.19188067317008972, + "learning_rate": 8.729155060878533e-05, + "loss": 0.0453, + "step": 5410 + }, + { + "epoch": 6.506602641056423, + "grad_norm": 0.23460331559181213, + "learning_rate": 8.723642793486809e-05, + "loss": 0.0543, + "step": 5420 + }, + { + "epoch": 6.518607442977191, + "grad_norm": 0.1821882277727127, + "learning_rate": 8.718120345797873e-05, + "loss": 0.0466, + "step": 5430 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 0.23522675037384033, + "learning_rate": 8.712587732909889e-05, + "loss": 0.0471, + "step": 5440 + }, + { + "epoch": 6.5426170468187275, + "grad_norm": 0.25126388669013977, + "learning_rate": 8.707044969948806e-05, + "loss": 0.0446, + "step": 5450 + }, + { + "epoch": 6.554621848739496, + "grad_norm": 0.2243773341178894, + "learning_rate": 8.701492072068329e-05, + "loss": 0.047, + "step": 5460 + }, + { + "epoch": 6.566626650660264, + "grad_norm": 0.19499455392360687, + "learning_rate": 8.695929054449869e-05, + "loss": 0.05, + "step": 5470 + }, + { + "epoch": 6.578631452581033, + "grad_norm": 0.17084051668643951, + "learning_rate": 8.690355932302501e-05, + "loss": 0.0512, + "step": 5480 + }, + { + "epoch": 6.590636254501801, + "grad_norm": 0.17575982213020325, + "learning_rate": 8.684772720862931e-05, + "loss": 0.0496, + "step": 5490 + }, + { + "epoch": 6.602641056422569, + "grad_norm": 0.1919221580028534, + "learning_rate": 8.679179435395446e-05, + "loss": 0.05, + "step": 5500 + }, + { + "epoch": 6.614645858343337, + "grad_norm": 0.2396879941225052, + "learning_rate": 8.673576091191874e-05, + "loss": 0.053, + "step": 5510 + }, + { + "epoch": 6.626650660264105, + "grad_norm": 0.24176768958568573, + "learning_rate": 8.667962703571541e-05, + "loss": 0.0481, + "step": 5520 + }, + { + "epoch": 6.6386554621848735, + "grad_norm": 0.23963309824466705, + "learning_rate": 8.662339287881238e-05, + "loss": 0.047, + "step": 5530 + }, + { + "epoch": 6.6506602641056425, + "grad_norm": 0.2261841893196106, + "learning_rate": 8.656705859495169e-05, + "loss": 0.0509, + "step": 5540 + }, + { + "epoch": 6.662665066026411, + "grad_norm": 0.23767659068107605, + "learning_rate": 8.651062433814912e-05, + "loss": 0.0489, + "step": 5550 + }, + { + "epoch": 6.674669867947179, + "grad_norm": 0.1971079260110855, + "learning_rate": 8.645409026269375e-05, + "loss": 0.0535, + "step": 5560 + }, + { + "epoch": 6.686674669867947, + "grad_norm": 0.2351047545671463, + "learning_rate": 8.639745652314759e-05, + "loss": 0.0447, + "step": 5570 + }, + { + "epoch": 6.698679471788715, + "grad_norm": 0.2357843518257141, + "learning_rate": 8.634072327434515e-05, + "loss": 0.0482, + "step": 5580 + }, + { + "epoch": 6.710684273709484, + "grad_norm": 0.22226707637310028, + "learning_rate": 8.628389067139294e-05, + "loss": 0.0479, + "step": 5590 + }, + { + "epoch": 6.722689075630252, + "grad_norm": 0.21732674539089203, + "learning_rate": 8.622695886966911e-05, + "loss": 0.0479, + "step": 5600 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 0.341953307390213, + "learning_rate": 8.616992802482308e-05, + "loss": 0.0526, + "step": 5610 + }, + { + "epoch": 6.7466986794717885, + "grad_norm": 0.21553649008274078, + "learning_rate": 8.611279829277496e-05, + "loss": 0.0482, + "step": 5620 + }, + { + "epoch": 6.758703481392557, + "grad_norm": 0.2725537121295929, + "learning_rate": 8.605556982971528e-05, + "loss": 0.0481, + "step": 5630 + }, + { + "epoch": 6.770708283313326, + "grad_norm": 0.18180647492408752, + "learning_rate": 8.599824279210447e-05, + "loss": 0.0482, + "step": 5640 + }, + { + "epoch": 6.782713085234094, + "grad_norm": 0.23095999658107758, + "learning_rate": 8.594081733667243e-05, + "loss": 0.0506, + "step": 5650 + }, + { + "epoch": 6.794717887154862, + "grad_norm": 0.2137482762336731, + "learning_rate": 8.58832936204182e-05, + "loss": 0.0446, + "step": 5660 + }, + { + "epoch": 6.80672268907563, + "grad_norm": 0.2028329074382782, + "learning_rate": 8.582567180060942e-05, + "loss": 0.0469, + "step": 5670 + }, + { + "epoch": 6.818727490996398, + "grad_norm": 0.14150048792362213, + "learning_rate": 8.576795203478194e-05, + "loss": 0.0498, + "step": 5680 + }, + { + "epoch": 6.830732292917167, + "grad_norm": 0.2297830879688263, + "learning_rate": 8.571013448073939e-05, + "loss": 0.0506, + "step": 5690 + }, + { + "epoch": 6.842737094837935, + "grad_norm": 0.23628641664981842, + "learning_rate": 8.565221929655275e-05, + "loss": 0.0429, + "step": 5700 + }, + { + "epoch": 6.8547418967587035, + "grad_norm": 0.2210369110107422, + "learning_rate": 8.559420664055992e-05, + "loss": 0.0487, + "step": 5710 + }, + { + "epoch": 6.866746698679472, + "grad_norm": 0.2224275916814804, + "learning_rate": 8.553609667136532e-05, + "loss": 0.0512, + "step": 5720 + }, + { + "epoch": 6.87875150060024, + "grad_norm": 0.22376279532909393, + "learning_rate": 8.547788954783936e-05, + "loss": 0.0467, + "step": 5730 + }, + { + "epoch": 6.890756302521009, + "grad_norm": 0.23823443055152893, + "learning_rate": 8.541958542911808e-05, + "loss": 0.053, + "step": 5740 + }, + { + "epoch": 6.902761104441777, + "grad_norm": 0.1938626617193222, + "learning_rate": 8.536118447460275e-05, + "loss": 0.0489, + "step": 5750 + }, + { + "epoch": 6.914765906362545, + "grad_norm": 0.24903443455696106, + "learning_rate": 8.530268684395932e-05, + "loss": 0.0477, + "step": 5760 + }, + { + "epoch": 6.926770708283313, + "grad_norm": 0.16630490124225616, + "learning_rate": 8.524409269711807e-05, + "loss": 0.0448, + "step": 5770 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 0.29458174109458923, + "learning_rate": 8.51854021942732e-05, + "loss": 0.0498, + "step": 5780 + }, + { + "epoch": 6.95078031212485, + "grad_norm": 0.2044655978679657, + "learning_rate": 8.512661549588227e-05, + "loss": 0.0447, + "step": 5790 + }, + { + "epoch": 6.9627851140456185, + "grad_norm": 0.16037790477275848, + "learning_rate": 8.506773276266588e-05, + "loss": 0.0475, + "step": 5800 + }, + { + "epoch": 6.974789915966387, + "grad_norm": 0.21814365684986115, + "learning_rate": 8.500875415560721e-05, + "loss": 0.0427, + "step": 5810 + }, + { + "epoch": 6.986794717887155, + "grad_norm": 0.22529636323451996, + "learning_rate": 8.494967983595144e-05, + "loss": 0.0477, + "step": 5820 + }, + { + "epoch": 6.998799519807923, + "grad_norm": 0.2156190276145935, + "learning_rate": 8.489050996520558e-05, + "loss": 0.0438, + "step": 5830 + }, + { + "epoch": 7.010804321728691, + "grad_norm": 0.2061353474855423, + "learning_rate": 8.483124470513775e-05, + "loss": 0.0398, + "step": 5840 + }, + { + "epoch": 7.02280912364946, + "grad_norm": 0.31732502579689026, + "learning_rate": 8.477188421777692e-05, + "loss": 0.0479, + "step": 5850 + }, + { + "epoch": 7.034813925570228, + "grad_norm": 0.2355213165283203, + "learning_rate": 8.47124286654124e-05, + "loss": 0.0492, + "step": 5860 + }, + { + "epoch": 7.046818727490996, + "grad_norm": 0.23484276235103607, + "learning_rate": 8.465287821059341e-05, + "loss": 0.0475, + "step": 5870 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 0.31468620896339417, + "learning_rate": 8.45932330161286e-05, + "loss": 0.0408, + "step": 5880 + }, + { + "epoch": 7.070828331332533, + "grad_norm": 0.1752273142337799, + "learning_rate": 8.453349324508567e-05, + "loss": 0.0481, + "step": 5890 + }, + { + "epoch": 7.082833133253302, + "grad_norm": 0.26199156045913696, + "learning_rate": 8.447365906079088e-05, + "loss": 0.0455, + "step": 5900 + }, + { + "epoch": 7.09483793517407, + "grad_norm": 0.16147947311401367, + "learning_rate": 8.441373062682856e-05, + "loss": 0.0431, + "step": 5910 + }, + { + "epoch": 7.106842737094838, + "grad_norm": 0.20839284360408783, + "learning_rate": 8.43537081070408e-05, + "loss": 0.045, + "step": 5920 + }, + { + "epoch": 7.118847539015606, + "grad_norm": 0.1995694488286972, + "learning_rate": 8.429359166552689e-05, + "loss": 0.0444, + "step": 5930 + }, + { + "epoch": 7.130852340936374, + "grad_norm": 0.19153304398059845, + "learning_rate": 8.423338146664284e-05, + "loss": 0.049, + "step": 5940 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 0.14519654214382172, + "learning_rate": 8.417307767500107e-05, + "loss": 0.0462, + "step": 5950 + }, + { + "epoch": 7.154861944777911, + "grad_norm": 0.19724847376346588, + "learning_rate": 8.411268045546983e-05, + "loss": 0.0441, + "step": 5960 + }, + { + "epoch": 7.1668667466986795, + "grad_norm": 0.22474150359630585, + "learning_rate": 8.405218997317281e-05, + "loss": 0.0442, + "step": 5970 + }, + { + "epoch": 7.178871548619448, + "grad_norm": 0.2841247022151947, + "learning_rate": 8.399160639348869e-05, + "loss": 0.0461, + "step": 5980 + }, + { + "epoch": 7.190876350540216, + "grad_norm": 0.23602305352687836, + "learning_rate": 8.393092988205065e-05, + "loss": 0.0445, + "step": 5990 + }, + { + "epoch": 7.202881152460985, + "grad_norm": 0.28305262327194214, + "learning_rate": 8.387016060474597e-05, + "loss": 0.0443, + "step": 6000 + }, + { + "epoch": 7.214885954381753, + "grad_norm": 0.186979278922081, + "learning_rate": 8.380929872771551e-05, + "loss": 0.0405, + "step": 6010 + }, + { + "epoch": 7.226890756302521, + "grad_norm": 0.2467087209224701, + "learning_rate": 8.374834441735335e-05, + "loss": 0.0501, + "step": 6020 + }, + { + "epoch": 7.238895558223289, + "grad_norm": 0.19656221568584442, + "learning_rate": 8.368729784030622e-05, + "loss": 0.0435, + "step": 6030 + }, + { + "epoch": 7.250900360144057, + "grad_norm": 0.2309686541557312, + "learning_rate": 8.362615916347315e-05, + "loss": 0.0467, + "step": 6040 + }, + { + "epoch": 7.2629051620648255, + "grad_norm": 0.16139303147792816, + "learning_rate": 8.356492855400493e-05, + "loss": 0.046, + "step": 6050 + }, + { + "epoch": 7.2749099639855945, + "grad_norm": 0.2151726484298706, + "learning_rate": 8.350360617930371e-05, + "loss": 0.0449, + "step": 6060 + }, + { + "epoch": 7.286914765906363, + "grad_norm": 0.22129768133163452, + "learning_rate": 8.344219220702255e-05, + "loss": 0.0448, + "step": 6070 + }, + { + "epoch": 7.298919567827131, + "grad_norm": 0.26973530650138855, + "learning_rate": 8.338068680506485e-05, + "loss": 0.0508, + "step": 6080 + }, + { + "epoch": 7.310924369747899, + "grad_norm": 0.23678086698055267, + "learning_rate": 8.33190901415841e-05, + "loss": 0.0497, + "step": 6090 + }, + { + "epoch": 7.322929171668667, + "grad_norm": 0.23961761593818665, + "learning_rate": 8.325740238498317e-05, + "loss": 0.0464, + "step": 6100 + }, + { + "epoch": 7.334933973589436, + "grad_norm": 0.23793965578079224, + "learning_rate": 8.319562370391406e-05, + "loss": 0.0448, + "step": 6110 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 0.23221546411514282, + "learning_rate": 8.31337542672773e-05, + "loss": 0.0466, + "step": 6120 + }, + { + "epoch": 7.358943577430972, + "grad_norm": 0.24710150063037872, + "learning_rate": 8.307179424422158e-05, + "loss": 0.0456, + "step": 6130 + }, + { + "epoch": 7.3709483793517405, + "grad_norm": 0.24922527372837067, + "learning_rate": 8.300974380414327e-05, + "loss": 0.046, + "step": 6140 + }, + { + "epoch": 7.382953181272509, + "grad_norm": 0.2252652645111084, + "learning_rate": 8.294760311668586e-05, + "loss": 0.0426, + "step": 6150 + }, + { + "epoch": 7.394957983193278, + "grad_norm": 0.26154160499572754, + "learning_rate": 8.288537235173961e-05, + "loss": 0.0511, + "step": 6160 + }, + { + "epoch": 7.406962785114046, + "grad_norm": 0.16650453209877014, + "learning_rate": 8.282305167944108e-05, + "loss": 0.0425, + "step": 6170 + }, + { + "epoch": 7.418967587034814, + "grad_norm": 0.17807242274284363, + "learning_rate": 8.276064127017262e-05, + "loss": 0.0495, + "step": 6180 + }, + { + "epoch": 7.430972388955582, + "grad_norm": 0.16246449947357178, + "learning_rate": 8.269814129456189e-05, + "loss": 0.0402, + "step": 6190 + }, + { + "epoch": 7.44297719087635, + "grad_norm": 0.1720261126756668, + "learning_rate": 8.263555192348143e-05, + "loss": 0.0451, + "step": 6200 + }, + { + "epoch": 7.454981992797119, + "grad_norm": 0.23908481001853943, + "learning_rate": 8.257287332804819e-05, + "loss": 0.045, + "step": 6210 + }, + { + "epoch": 7.466986794717887, + "grad_norm": 0.24481044709682465, + "learning_rate": 8.251010567962307e-05, + "loss": 0.0454, + "step": 6220 + }, + { + "epoch": 7.4789915966386555, + "grad_norm": 0.20445145666599274, + "learning_rate": 8.244724914981041e-05, + "loss": 0.0432, + "step": 6230 + }, + { + "epoch": 7.490996398559424, + "grad_norm": 0.1989467889070511, + "learning_rate": 8.238430391045757e-05, + "loss": 0.0421, + "step": 6240 + }, + { + "epoch": 7.503001200480192, + "grad_norm": 0.18477782607078552, + "learning_rate": 8.232127013365445e-05, + "loss": 0.0461, + "step": 6250 + }, + { + "epoch": 7.515006002400961, + "grad_norm": 0.2631203532218933, + "learning_rate": 8.225814799173295e-05, + "loss": 0.0539, + "step": 6260 + }, + { + "epoch": 7.527010804321729, + "grad_norm": 0.26319631934165955, + "learning_rate": 8.219493765726663e-05, + "loss": 0.0472, + "step": 6270 + }, + { + "epoch": 7.539015606242497, + "grad_norm": 0.1524098515510559, + "learning_rate": 8.21316393030701e-05, + "loss": 0.0441, + "step": 6280 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 0.20412565767765045, + "learning_rate": 8.206825310219865e-05, + "loss": 0.0472, + "step": 6290 + }, + { + "epoch": 7.563025210084033, + "grad_norm": 0.17735373973846436, + "learning_rate": 8.200477922794776e-05, + "loss": 0.0417, + "step": 6300 + }, + { + "epoch": 7.575030012004802, + "grad_norm": 0.21120738983154297, + "learning_rate": 8.194121785385256e-05, + "loss": 0.0395, + "step": 6310 + }, + { + "epoch": 7.5870348139255706, + "grad_norm": 0.26850712299346924, + "learning_rate": 8.187756915368741e-05, + "loss": 0.0436, + "step": 6320 + }, + { + "epoch": 7.599039615846339, + "grad_norm": 0.21218596398830414, + "learning_rate": 8.181383330146544e-05, + "loss": 0.0424, + "step": 6330 + }, + { + "epoch": 7.611044417767107, + "grad_norm": 0.22262167930603027, + "learning_rate": 8.175001047143804e-05, + "loss": 0.0443, + "step": 6340 + }, + { + "epoch": 7.623049219687875, + "grad_norm": 0.17712774872779846, + "learning_rate": 8.168610083809438e-05, + "loss": 0.0473, + "step": 6350 + }, + { + "epoch": 7.635054021608643, + "grad_norm": 0.21082589030265808, + "learning_rate": 8.162210457616095e-05, + "loss": 0.0469, + "step": 6360 + }, + { + "epoch": 7.647058823529412, + "grad_norm": 0.22389423847198486, + "learning_rate": 8.155802186060109e-05, + "loss": 0.0446, + "step": 6370 + }, + { + "epoch": 7.65906362545018, + "grad_norm": 0.2564607262611389, + "learning_rate": 8.149385286661453e-05, + "loss": 0.0443, + "step": 6380 + }, + { + "epoch": 7.671068427370948, + "grad_norm": 0.1748933345079422, + "learning_rate": 8.14295977696368e-05, + "loss": 0.0425, + "step": 6390 + }, + { + "epoch": 7.6830732292917165, + "grad_norm": 0.26417699456214905, + "learning_rate": 8.13652567453389e-05, + "loss": 0.0493, + "step": 6400 + }, + { + "epoch": 7.695078031212485, + "grad_norm": 0.17291571199893951, + "learning_rate": 8.130082996962676e-05, + "loss": 0.0474, + "step": 6410 + }, + { + "epoch": 7.707082833133253, + "grad_norm": 0.2207215279340744, + "learning_rate": 8.123631761864068e-05, + "loss": 0.0398, + "step": 6420 + }, + { + "epoch": 7.719087635054022, + "grad_norm": 0.17853322625160217, + "learning_rate": 8.1171719868755e-05, + "loss": 0.0448, + "step": 6430 + }, + { + "epoch": 7.73109243697479, + "grad_norm": 0.23473280668258667, + "learning_rate": 8.110703689657748e-05, + "loss": 0.0456, + "step": 6440 + }, + { + "epoch": 7.743097238895558, + "grad_norm": 0.22121472656726837, + "learning_rate": 8.104226887894892e-05, + "loss": 0.0456, + "step": 6450 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 0.24212230741977692, + "learning_rate": 8.097741599294257e-05, + "loss": 0.0382, + "step": 6460 + }, + { + "epoch": 7.767106842737094, + "grad_norm": 0.20683586597442627, + "learning_rate": 8.091247841586378e-05, + "loss": 0.0461, + "step": 6470 + }, + { + "epoch": 7.779111644657863, + "grad_norm": 0.18293067812919617, + "learning_rate": 8.084745632524939e-05, + "loss": 0.0405, + "step": 6480 + }, + { + "epoch": 7.791116446578632, + "grad_norm": 0.2585102319717407, + "learning_rate": 8.07823498988673e-05, + "loss": 0.0495, + "step": 6490 + }, + { + "epoch": 7.8031212484994, + "grad_norm": 0.13571149110794067, + "learning_rate": 8.071715931471602e-05, + "loss": 0.0457, + "step": 6500 + }, + { + "epoch": 7.815126050420168, + "grad_norm": 0.2189849466085434, + "learning_rate": 8.06518847510241e-05, + "loss": 0.0447, + "step": 6510 + }, + { + "epoch": 7.827130852340936, + "grad_norm": 0.25842782855033875, + "learning_rate": 8.058652638624971e-05, + "loss": 0.0477, + "step": 6520 + }, + { + "epoch": 7.839135654261705, + "grad_norm": 0.22753950953483582, + "learning_rate": 8.052108439908013e-05, + "loss": 0.0453, + "step": 6530 + }, + { + "epoch": 7.851140456182473, + "grad_norm": 0.12762132287025452, + "learning_rate": 8.045555896843125e-05, + "loss": 0.0454, + "step": 6540 + }, + { + "epoch": 7.863145258103241, + "grad_norm": 0.1836220771074295, + "learning_rate": 8.03899502734471e-05, + "loss": 0.0432, + "step": 6550 + }, + { + "epoch": 7.875150060024009, + "grad_norm": 0.16427825391292572, + "learning_rate": 8.032425849349931e-05, + "loss": 0.0395, + "step": 6560 + }, + { + "epoch": 7.8871548619447776, + "grad_norm": 0.2556310296058655, + "learning_rate": 8.025848380818674e-05, + "loss": 0.0466, + "step": 6570 + }, + { + "epoch": 7.899159663865547, + "grad_norm": 0.19057200849056244, + "learning_rate": 8.019262639733487e-05, + "loss": 0.0462, + "step": 6580 + }, + { + "epoch": 7.911164465786315, + "grad_norm": 0.141492560505867, + "learning_rate": 8.012668644099531e-05, + "loss": 0.0467, + "step": 6590 + }, + { + "epoch": 7.923169267707083, + "grad_norm": 0.18849638104438782, + "learning_rate": 8.006066411944542e-05, + "loss": 0.043, + "step": 6600 + }, + { + "epoch": 7.935174069627851, + "grad_norm": 0.19564303755760193, + "learning_rate": 7.999455961318769e-05, + "loss": 0.0516, + "step": 6610 + }, + { + "epoch": 7.947178871548619, + "grad_norm": 0.27751031517982483, + "learning_rate": 7.992837310294932e-05, + "loss": 0.0453, + "step": 6620 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 0.17118938267230988, + "learning_rate": 7.986210476968167e-05, + "loss": 0.0482, + "step": 6630 + }, + { + "epoch": 7.971188475390156, + "grad_norm": 0.3086610734462738, + "learning_rate": 7.97957547945599e-05, + "loss": 0.0494, + "step": 6640 + }, + { + "epoch": 7.983193277310924, + "grad_norm": 0.2513780891895294, + "learning_rate": 7.972932335898226e-05, + "loss": 0.0459, + "step": 6650 + }, + { + "epoch": 7.995198079231693, + "grad_norm": 0.22606956958770752, + "learning_rate": 7.966281064456975e-05, + "loss": 0.0479, + "step": 6660 + }, + { + "epoch": 8.00720288115246, + "grad_norm": 0.19942642748355865, + "learning_rate": 7.959621683316563e-05, + "loss": 0.0412, + "step": 6670 + }, + { + "epoch": 8.01920768307323, + "grad_norm": 0.15617424249649048, + "learning_rate": 7.952954210683481e-05, + "loss": 0.0419, + "step": 6680 + }, + { + "epoch": 8.031212484993997, + "grad_norm": 0.18540449440479279, + "learning_rate": 7.946278664786345e-05, + "loss": 0.0422, + "step": 6690 + }, + { + "epoch": 8.043217286914766, + "grad_norm": 0.16553406417369843, + "learning_rate": 7.939595063875842e-05, + "loss": 0.0425, + "step": 6700 + }, + { + "epoch": 8.055222088835535, + "grad_norm": 0.18867523968219757, + "learning_rate": 7.932903426224683e-05, + "loss": 0.0406, + "step": 6710 + }, + { + "epoch": 8.067226890756302, + "grad_norm": 0.27043667435646057, + "learning_rate": 7.926203770127552e-05, + "loss": 0.042, + "step": 6720 + }, + { + "epoch": 8.079231692677071, + "grad_norm": 0.15894123911857605, + "learning_rate": 7.919496113901046e-05, + "loss": 0.0418, + "step": 6730 + }, + { + "epoch": 8.091236494597839, + "grad_norm": 0.17546898126602173, + "learning_rate": 7.912780475883649e-05, + "loss": 0.043, + "step": 6740 + }, + { + "epoch": 8.103241296518608, + "grad_norm": 0.237132266163826, + "learning_rate": 7.906056874435652e-05, + "loss": 0.0411, + "step": 6750 + }, + { + "epoch": 8.115246098439377, + "grad_norm": 0.17668040096759796, + "learning_rate": 7.899325327939131e-05, + "loss": 0.0422, + "step": 6760 + }, + { + "epoch": 8.127250900360144, + "grad_norm": 0.1808323711156845, + "learning_rate": 7.892585854797872e-05, + "loss": 0.0442, + "step": 6770 + }, + { + "epoch": 8.139255702280913, + "grad_norm": 0.22958463430404663, + "learning_rate": 7.88583847343734e-05, + "loss": 0.0425, + "step": 6780 + }, + { + "epoch": 8.15126050420168, + "grad_norm": 0.20874300599098206, + "learning_rate": 7.879083202304616e-05, + "loss": 0.0443, + "step": 6790 + }, + { + "epoch": 8.16326530612245, + "grad_norm": 0.18617822229862213, + "learning_rate": 7.872320059868355e-05, + "loss": 0.0416, + "step": 6800 + }, + { + "epoch": 8.175270108043218, + "grad_norm": 0.18889814615249634, + "learning_rate": 7.865549064618729e-05, + "loss": 0.0447, + "step": 6810 + }, + { + "epoch": 8.187274909963985, + "grad_norm": 0.16088111698627472, + "learning_rate": 7.858770235067381e-05, + "loss": 0.0414, + "step": 6820 + }, + { + "epoch": 8.199279711884754, + "grad_norm": 0.18715642392635345, + "learning_rate": 7.851983589747374e-05, + "loss": 0.0449, + "step": 6830 + }, + { + "epoch": 8.211284513805522, + "grad_norm": 0.20352406799793243, + "learning_rate": 7.845189147213133e-05, + "loss": 0.0407, + "step": 6840 + }, + { + "epoch": 8.22328931572629, + "grad_norm": 0.17456160485744476, + "learning_rate": 7.838386926040407e-05, + "loss": 0.0462, + "step": 6850 + }, + { + "epoch": 8.235294117647058, + "grad_norm": 0.2602746784687042, + "learning_rate": 7.83157694482621e-05, + "loss": 0.0437, + "step": 6860 + }, + { + "epoch": 8.247298919567827, + "grad_norm": 0.23365876078605652, + "learning_rate": 7.824759222188768e-05, + "loss": 0.0456, + "step": 6870 + }, + { + "epoch": 8.259303721488596, + "grad_norm": 0.280536949634552, + "learning_rate": 7.817933776767478e-05, + "loss": 0.0504, + "step": 6880 + }, + { + "epoch": 8.271308523409363, + "grad_norm": 0.2396528124809265, + "learning_rate": 7.811100627222842e-05, + "loss": 0.0449, + "step": 6890 + }, + { + "epoch": 8.283313325330132, + "grad_norm": 0.1960943341255188, + "learning_rate": 7.804259792236435e-05, + "loss": 0.0414, + "step": 6900 + }, + { + "epoch": 8.2953181272509, + "grad_norm": 0.26369932293891907, + "learning_rate": 7.797411290510835e-05, + "loss": 0.0468, + "step": 6910 + }, + { + "epoch": 8.307322929171669, + "grad_norm": 0.21671827137470245, + "learning_rate": 7.790555140769586e-05, + "loss": 0.045, + "step": 6920 + }, + { + "epoch": 8.319327731092438, + "grad_norm": 0.17270644009113312, + "learning_rate": 7.78369136175714e-05, + "loss": 0.0438, + "step": 6930 + }, + { + "epoch": 8.331332533013205, + "grad_norm": 0.19208544492721558, + "learning_rate": 7.776819972238806e-05, + "loss": 0.0427, + "step": 6940 + }, + { + "epoch": 8.343337334933974, + "grad_norm": 0.24133478105068207, + "learning_rate": 7.7699409910007e-05, + "loss": 0.0411, + "step": 6950 + }, + { + "epoch": 8.355342136854741, + "grad_norm": 0.15353649854660034, + "learning_rate": 7.763054436849694e-05, + "loss": 0.0475, + "step": 6960 + }, + { + "epoch": 8.36734693877551, + "grad_norm": 0.2783297300338745, + "learning_rate": 7.756160328613364e-05, + "loss": 0.047, + "step": 6970 + }, + { + "epoch": 8.37935174069628, + "grad_norm": 0.3205883502960205, + "learning_rate": 7.749258685139942e-05, + "loss": 0.0512, + "step": 6980 + }, + { + "epoch": 8.391356542617046, + "grad_norm": 0.16524124145507812, + "learning_rate": 7.742349525298253e-05, + "loss": 0.0391, + "step": 6990 + }, + { + "epoch": 8.403361344537815, + "grad_norm": 0.27185773849487305, + "learning_rate": 7.735432867977679e-05, + "loss": 0.0449, + "step": 7000 + }, + { + "epoch": 8.415366146458583, + "grad_norm": 0.21182462573051453, + "learning_rate": 7.728508732088096e-05, + "loss": 0.0399, + "step": 7010 + }, + { + "epoch": 8.427370948379352, + "grad_norm": 0.2224963903427124, + "learning_rate": 7.721577136559825e-05, + "loss": 0.0428, + "step": 7020 + }, + { + "epoch": 8.43937575030012, + "grad_norm": 0.3202022910118103, + "learning_rate": 7.714638100343588e-05, + "loss": 0.0423, + "step": 7030 + }, + { + "epoch": 8.451380552220888, + "grad_norm": 0.1908930391073227, + "learning_rate": 7.707691642410444e-05, + "loss": 0.0409, + "step": 7040 + }, + { + "epoch": 8.463385354141657, + "grad_norm": 0.17953453958034515, + "learning_rate": 7.70073778175174e-05, + "loss": 0.0398, + "step": 7050 + }, + { + "epoch": 8.475390156062424, + "grad_norm": 0.2502972185611725, + "learning_rate": 7.69377653737907e-05, + "loss": 0.0397, + "step": 7060 + }, + { + "epoch": 8.487394957983193, + "grad_norm": 0.1292092353105545, + "learning_rate": 7.686807928324209e-05, + "loss": 0.0422, + "step": 7070 + }, + { + "epoch": 8.499399759903962, + "grad_norm": 0.1878344863653183, + "learning_rate": 7.679831973639065e-05, + "loss": 0.0424, + "step": 7080 + }, + { + "epoch": 8.51140456182473, + "grad_norm": 0.2662566304206848, + "learning_rate": 7.672848692395637e-05, + "loss": 0.0414, + "step": 7090 + }, + { + "epoch": 8.523409363745499, + "grad_norm": 0.176138773560524, + "learning_rate": 7.665858103685944e-05, + "loss": 0.0406, + "step": 7100 + }, + { + "epoch": 8.535414165666266, + "grad_norm": 0.20609258115291595, + "learning_rate": 7.658860226621991e-05, + "loss": 0.0402, + "step": 7110 + }, + { + "epoch": 8.547418967587035, + "grad_norm": 0.32738399505615234, + "learning_rate": 7.651855080335708e-05, + "loss": 0.0511, + "step": 7120 + }, + { + "epoch": 8.559423769507804, + "grad_norm": 0.19472534954547882, + "learning_rate": 7.644842683978896e-05, + "loss": 0.0438, + "step": 7130 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 0.18683090806007385, + "learning_rate": 7.63782305672318e-05, + "loss": 0.0459, + "step": 7140 + }, + { + "epoch": 8.58343337334934, + "grad_norm": 0.14462962746620178, + "learning_rate": 7.63079621775995e-05, + "loss": 0.0426, + "step": 7150 + }, + { + "epoch": 8.595438175270107, + "grad_norm": 0.14628393948078156, + "learning_rate": 7.623762186300319e-05, + "loss": 0.0413, + "step": 7160 + }, + { + "epoch": 8.607442977190876, + "grad_norm": 0.2017265260219574, + "learning_rate": 7.616720981575057e-05, + "loss": 0.0446, + "step": 7170 + }, + { + "epoch": 8.619447779111646, + "grad_norm": 0.27595916390419006, + "learning_rate": 7.609672622834552e-05, + "loss": 0.0423, + "step": 7180 + }, + { + "epoch": 8.631452581032413, + "grad_norm": 0.146741583943367, + "learning_rate": 7.602617129348747e-05, + "loss": 0.0428, + "step": 7190 + }, + { + "epoch": 8.643457382953182, + "grad_norm": 0.15822121500968933, + "learning_rate": 7.595554520407088e-05, + "loss": 0.0428, + "step": 7200 + }, + { + "epoch": 8.655462184873949, + "grad_norm": 0.1491214781999588, + "learning_rate": 7.588484815318484e-05, + "loss": 0.0397, + "step": 7210 + }, + { + "epoch": 8.667466986794718, + "grad_norm": 0.14373914897441864, + "learning_rate": 7.581408033411234e-05, + "loss": 0.0397, + "step": 7220 + }, + { + "epoch": 8.679471788715485, + "grad_norm": 0.262336403131485, + "learning_rate": 7.574324194032995e-05, + "loss": 0.0416, + "step": 7230 + }, + { + "epoch": 8.691476590636254, + "grad_norm": 0.1920471042394638, + "learning_rate": 7.567233316550705e-05, + "loss": 0.0431, + "step": 7240 + }, + { + "epoch": 8.703481392557023, + "grad_norm": 0.23697906732559204, + "learning_rate": 7.560135420350562e-05, + "loss": 0.0443, + "step": 7250 + }, + { + "epoch": 8.71548619447779, + "grad_norm": 0.16066250205039978, + "learning_rate": 7.553030524837935e-05, + "loss": 0.0396, + "step": 7260 + }, + { + "epoch": 8.72749099639856, + "grad_norm": 0.22968637943267822, + "learning_rate": 7.545918649437341e-05, + "loss": 0.0374, + "step": 7270 + }, + { + "epoch": 8.739495798319329, + "grad_norm": 0.21314145624637604, + "learning_rate": 7.538799813592377e-05, + "loss": 0.0426, + "step": 7280 + }, + { + "epoch": 8.751500600240096, + "grad_norm": 0.20644252002239227, + "learning_rate": 7.531674036765662e-05, + "loss": 0.0466, + "step": 7290 + }, + { + "epoch": 8.763505402160865, + "grad_norm": 0.15254901349544525, + "learning_rate": 7.524541338438807e-05, + "loss": 0.0383, + "step": 7300 + }, + { + "epoch": 8.775510204081632, + "grad_norm": 0.1546887755393982, + "learning_rate": 7.517401738112328e-05, + "loss": 0.0411, + "step": 7310 + }, + { + "epoch": 8.787515006002401, + "grad_norm": 0.2441544085741043, + "learning_rate": 7.510255255305628e-05, + "loss": 0.0466, + "step": 7320 + }, + { + "epoch": 8.799519807923168, + "grad_norm": 0.17499303817749023, + "learning_rate": 7.503101909556911e-05, + "loss": 0.0439, + "step": 7330 + }, + { + "epoch": 8.811524609843937, + "grad_norm": 0.16554655134677887, + "learning_rate": 7.495941720423154e-05, + "loss": 0.04, + "step": 7340 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 0.2381986379623413, + "learning_rate": 7.488774707480042e-05, + "loss": 0.0386, + "step": 7350 + }, + { + "epoch": 8.835534213685474, + "grad_norm": 0.2022096812725067, + "learning_rate": 7.481600890321911e-05, + "loss": 0.0426, + "step": 7360 + }, + { + "epoch": 8.847539015606243, + "grad_norm": 0.2779201865196228, + "learning_rate": 7.474420288561708e-05, + "loss": 0.0445, + "step": 7370 + }, + { + "epoch": 8.85954381752701, + "grad_norm": 0.2550973892211914, + "learning_rate": 7.467232921830921e-05, + "loss": 0.0407, + "step": 7380 + }, + { + "epoch": 8.871548619447779, + "grad_norm": 0.22769583761692047, + "learning_rate": 7.460038809779537e-05, + "loss": 0.0445, + "step": 7390 + }, + { + "epoch": 8.883553421368548, + "grad_norm": 0.15999357402324677, + "learning_rate": 7.452837972075983e-05, + "loss": 0.0397, + "step": 7400 + }, + { + "epoch": 8.895558223289315, + "grad_norm": 0.24681740999221802, + "learning_rate": 7.445630428407074e-05, + "loss": 0.0437, + "step": 7410 + }, + { + "epoch": 8.907563025210084, + "grad_norm": 0.16008146107196808, + "learning_rate": 7.43841619847796e-05, + "loss": 0.0411, + "step": 7420 + }, + { + "epoch": 8.919567827130852, + "grad_norm": 0.1991858184337616, + "learning_rate": 7.431195302012072e-05, + "loss": 0.0373, + "step": 7430 + }, + { + "epoch": 8.93157262905162, + "grad_norm": 0.17133541405200958, + "learning_rate": 7.423967758751061e-05, + "loss": 0.0411, + "step": 7440 + }, + { + "epoch": 8.94357743097239, + "grad_norm": 0.25846225023269653, + "learning_rate": 7.416733588454758e-05, + "loss": 0.0429, + "step": 7450 + }, + { + "epoch": 8.955582232893157, + "grad_norm": 0.1933434158563614, + "learning_rate": 7.409492810901106e-05, + "loss": 0.0425, + "step": 7460 + }, + { + "epoch": 8.967587034813926, + "grad_norm": 0.20811417698860168, + "learning_rate": 7.402245445886116e-05, + "loss": 0.0451, + "step": 7470 + }, + { + "epoch": 8.979591836734693, + "grad_norm": 0.17759567499160767, + "learning_rate": 7.394991513223806e-05, + "loss": 0.0426, + "step": 7480 + }, + { + "epoch": 8.991596638655462, + "grad_norm": 0.1702919900417328, + "learning_rate": 7.38773103274615e-05, + "loss": 0.0388, + "step": 7490 + }, + { + "epoch": 9.003601440576231, + "grad_norm": 0.20090565085411072, + "learning_rate": 7.380464024303028e-05, + "loss": 0.0466, + "step": 7500 + }, + { + "epoch": 9.015606242496998, + "grad_norm": 0.14825013279914856, + "learning_rate": 7.373190507762162e-05, + "loss": 0.04, + "step": 7510 + }, + { + "epoch": 9.027611044417768, + "grad_norm": 0.21282115578651428, + "learning_rate": 7.365910503009066e-05, + "loss": 0.039, + "step": 7520 + }, + { + "epoch": 9.039615846338535, + "grad_norm": 0.1601579636335373, + "learning_rate": 7.358624029946996e-05, + "loss": 0.0421, + "step": 7530 + }, + { + "epoch": 9.051620648259304, + "grad_norm": 0.262635201215744, + "learning_rate": 7.351331108496893e-05, + "loss": 0.0391, + "step": 7540 + }, + { + "epoch": 9.063625450180073, + "grad_norm": 0.2068508416414261, + "learning_rate": 7.344031758597325e-05, + "loss": 0.0362, + "step": 7550 + }, + { + "epoch": 9.07563025210084, + "grad_norm": 0.24553176760673523, + "learning_rate": 7.336726000204435e-05, + "loss": 0.0379, + "step": 7560 + }, + { + "epoch": 9.087635054021609, + "grad_norm": 0.13104812800884247, + "learning_rate": 7.32941385329189e-05, + "loss": 0.0428, + "step": 7570 + }, + { + "epoch": 9.099639855942376, + "grad_norm": 0.20186753571033478, + "learning_rate": 7.322095337850816e-05, + "loss": 0.0386, + "step": 7580 + }, + { + "epoch": 9.111644657863145, + "grad_norm": 0.19174814224243164, + "learning_rate": 7.314770473889758e-05, + "loss": 0.0388, + "step": 7590 + }, + { + "epoch": 9.123649459783914, + "grad_norm": 0.16724060475826263, + "learning_rate": 7.307439281434615e-05, + "loss": 0.0378, + "step": 7600 + }, + { + "epoch": 9.135654261704682, + "grad_norm": 0.14335161447525024, + "learning_rate": 7.300101780528585e-05, + "loss": 0.0398, + "step": 7610 + }, + { + "epoch": 9.14765906362545, + "grad_norm": 0.13529323041439056, + "learning_rate": 7.292757991232117e-05, + "loss": 0.0407, + "step": 7620 + }, + { + "epoch": 9.159663865546218, + "grad_norm": 0.17677386105060577, + "learning_rate": 7.285407933622848e-05, + "loss": 0.0395, + "step": 7630 + }, + { + "epoch": 9.171668667466987, + "grad_norm": 0.1877378225326538, + "learning_rate": 7.278051627795557e-05, + "loss": 0.0431, + "step": 7640 + }, + { + "epoch": 9.183673469387756, + "grad_norm": 0.16036252677440643, + "learning_rate": 7.270689093862105e-05, + "loss": 0.0395, + "step": 7650 + }, + { + "epoch": 9.195678271308523, + "grad_norm": 0.16995829343795776, + "learning_rate": 7.263320351951374e-05, + "loss": 0.0383, + "step": 7660 + }, + { + "epoch": 9.207683073229292, + "grad_norm": 0.1889636218547821, + "learning_rate": 7.255945422209227e-05, + "loss": 0.0468, + "step": 7670 + }, + { + "epoch": 9.21968787515006, + "grad_norm": 0.1727803647518158, + "learning_rate": 7.248564324798437e-05, + "loss": 0.0419, + "step": 7680 + }, + { + "epoch": 9.231692677070829, + "grad_norm": 0.25876104831695557, + "learning_rate": 7.241177079898644e-05, + "loss": 0.043, + "step": 7690 + }, + { + "epoch": 9.243697478991596, + "grad_norm": 0.31286776065826416, + "learning_rate": 7.233783707706295e-05, + "loss": 0.0405, + "step": 7700 + }, + { + "epoch": 9.255702280912365, + "grad_norm": 0.1269267350435257, + "learning_rate": 7.226384228434586e-05, + "loss": 0.0416, + "step": 7710 + }, + { + "epoch": 9.267707082833134, + "grad_norm": 0.1755734384059906, + "learning_rate": 7.21897866231341e-05, + "loss": 0.0448, + "step": 7720 + }, + { + "epoch": 9.279711884753901, + "grad_norm": 0.19753843545913696, + "learning_rate": 7.211567029589303e-05, + "loss": 0.0388, + "step": 7730 + }, + { + "epoch": 9.29171668667467, + "grad_norm": 0.18516609072685242, + "learning_rate": 7.204149350525387e-05, + "loss": 0.042, + "step": 7740 + }, + { + "epoch": 9.303721488595437, + "grad_norm": 0.18606910109519958, + "learning_rate": 7.196725645401309e-05, + "loss": 0.0438, + "step": 7750 + }, + { + "epoch": 9.315726290516206, + "grad_norm": 0.209607794880867, + "learning_rate": 7.1892959345132e-05, + "loss": 0.0401, + "step": 7760 + }, + { + "epoch": 9.327731092436975, + "grad_norm": 0.19024908542633057, + "learning_rate": 7.181860238173605e-05, + "loss": 0.0422, + "step": 7770 + }, + { + "epoch": 9.339735894357743, + "grad_norm": 0.17595726251602173, + "learning_rate": 7.174418576711432e-05, + "loss": 0.0434, + "step": 7780 + }, + { + "epoch": 9.351740696278512, + "grad_norm": 0.2189684361219406, + "learning_rate": 7.1669709704719e-05, + "loss": 0.038, + "step": 7790 + }, + { + "epoch": 9.363745498199279, + "grad_norm": 0.18410488963127136, + "learning_rate": 7.159517439816481e-05, + "loss": 0.04, + "step": 7800 + }, + { + "epoch": 9.375750300120048, + "grad_norm": 0.18605315685272217, + "learning_rate": 7.152058005122842e-05, + "loss": 0.0397, + "step": 7810 + }, + { + "epoch": 9.387755102040817, + "grad_norm": 0.1876663863658905, + "learning_rate": 7.144592686784793e-05, + "loss": 0.0414, + "step": 7820 + }, + { + "epoch": 9.399759903961584, + "grad_norm": 0.176482692360878, + "learning_rate": 7.137121505212229e-05, + "loss": 0.0378, + "step": 7830 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 0.16468441486358643, + "learning_rate": 7.129644480831077e-05, + "loss": 0.038, + "step": 7840 + }, + { + "epoch": 9.42376950780312, + "grad_norm": 0.18569250404834747, + "learning_rate": 7.122161634083234e-05, + "loss": 0.0406, + "step": 7850 + }, + { + "epoch": 9.43577430972389, + "grad_norm": 0.2290879786014557, + "learning_rate": 7.114672985426516e-05, + "loss": 0.0434, + "step": 7860 + }, + { + "epoch": 9.447779111644659, + "grad_norm": 0.20217931270599365, + "learning_rate": 7.107178555334606e-05, + "loss": 0.0438, + "step": 7870 + }, + { + "epoch": 9.459783913565426, + "grad_norm": 0.1285896748304367, + "learning_rate": 7.099678364296989e-05, + "loss": 0.0366, + "step": 7880 + }, + { + "epoch": 9.471788715486195, + "grad_norm": 0.1850970983505249, + "learning_rate": 7.0921724328189e-05, + "loss": 0.0362, + "step": 7890 + }, + { + "epoch": 9.483793517406962, + "grad_norm": 0.1738177388906479, + "learning_rate": 7.084660781421268e-05, + "loss": 0.0406, + "step": 7900 + }, + { + "epoch": 9.495798319327731, + "grad_norm": 0.13374589383602142, + "learning_rate": 7.077143430640662e-05, + "loss": 0.04, + "step": 7910 + }, + { + "epoch": 9.5078031212485, + "grad_norm": 0.1623617559671402, + "learning_rate": 7.069620401029232e-05, + "loss": 0.0403, + "step": 7920 + }, + { + "epoch": 9.519807923169267, + "grad_norm": 0.1580892950296402, + "learning_rate": 7.062091713154655e-05, + "loss": 0.0417, + "step": 7930 + }, + { + "epoch": 9.531812725090036, + "grad_norm": 0.2093069702386856, + "learning_rate": 7.054557387600075e-05, + "loss": 0.0352, + "step": 7940 + }, + { + "epoch": 9.543817527010804, + "grad_norm": 0.22347690165042877, + "learning_rate": 7.04701744496405e-05, + "loss": 0.0415, + "step": 7950 + }, + { + "epoch": 9.555822328931573, + "grad_norm": 0.2036539614200592, + "learning_rate": 7.039471905860495e-05, + "loss": 0.0449, + "step": 7960 + }, + { + "epoch": 9.567827130852342, + "grad_norm": 0.29761895537376404, + "learning_rate": 7.031920790918628e-05, + "loss": 0.0419, + "step": 7970 + }, + { + "epoch": 9.579831932773109, + "grad_norm": 0.13356764614582062, + "learning_rate": 7.024364120782906e-05, + "loss": 0.0407, + "step": 7980 + }, + { + "epoch": 9.591836734693878, + "grad_norm": 0.14201948046684265, + "learning_rate": 7.016801916112978e-05, + "loss": 0.0394, + "step": 7990 + }, + { + "epoch": 9.603841536614645, + "grad_norm": 0.2083505541086197, + "learning_rate": 7.009234197583623e-05, + "loss": 0.0404, + "step": 8000 + }, + { + "epoch": 9.615846338535414, + "grad_norm": 0.12711897492408752, + "learning_rate": 7.001660985884692e-05, + "loss": 0.0359, + "step": 8010 + }, + { + "epoch": 9.627851140456183, + "grad_norm": 0.16792258620262146, + "learning_rate": 6.994082301721063e-05, + "loss": 0.0348, + "step": 8020 + }, + { + "epoch": 9.63985594237695, + "grad_norm": 0.17058788239955902, + "learning_rate": 6.986498165812563e-05, + "loss": 0.0411, + "step": 8030 + }, + { + "epoch": 9.65186074429772, + "grad_norm": 0.1682412028312683, + "learning_rate": 6.978908598893932e-05, + "loss": 0.042, + "step": 8040 + }, + { + "epoch": 9.663865546218487, + "grad_norm": 0.19554151594638824, + "learning_rate": 6.971313621714756e-05, + "loss": 0.0398, + "step": 8050 + }, + { + "epoch": 9.675870348139256, + "grad_norm": 0.15563726425170898, + "learning_rate": 6.96371325503941e-05, + "loss": 0.0409, + "step": 8060 + }, + { + "epoch": 9.687875150060023, + "grad_norm": 0.15052123367786407, + "learning_rate": 6.956107519647014e-05, + "loss": 0.0377, + "step": 8070 + }, + { + "epoch": 9.699879951980792, + "grad_norm": 0.14380216598510742, + "learning_rate": 6.94849643633135e-05, + "loss": 0.0401, + "step": 8080 + }, + { + "epoch": 9.711884753901561, + "grad_norm": 0.237024188041687, + "learning_rate": 6.940880025900834e-05, + "loss": 0.0412, + "step": 8090 + }, + { + "epoch": 9.723889555822328, + "grad_norm": 0.18788355588912964, + "learning_rate": 6.933258309178438e-05, + "loss": 0.0459, + "step": 8100 + }, + { + "epoch": 9.735894357743097, + "grad_norm": 0.15717405080795288, + "learning_rate": 6.925631307001646e-05, + "loss": 0.0391, + "step": 8110 + }, + { + "epoch": 9.747899159663866, + "grad_norm": 0.257963627576828, + "learning_rate": 6.91799904022239e-05, + "loss": 0.0385, + "step": 8120 + }, + { + "epoch": 9.759903961584634, + "grad_norm": 0.19174374639987946, + "learning_rate": 6.910361529706997e-05, + "loss": 0.0382, + "step": 8130 + }, + { + "epoch": 9.771908763505403, + "grad_norm": 0.18412497639656067, + "learning_rate": 6.902718796336131e-05, + "loss": 0.0371, + "step": 8140 + }, + { + "epoch": 9.78391356542617, + "grad_norm": 0.2239719182252884, + "learning_rate": 6.895070861004729e-05, + "loss": 0.0402, + "step": 8150 + }, + { + "epoch": 9.795918367346939, + "grad_norm": 0.19654016196727753, + "learning_rate": 6.887417744621956e-05, + "loss": 0.0404, + "step": 8160 + }, + { + "epoch": 9.807923169267706, + "grad_norm": 0.21114899218082428, + "learning_rate": 6.87975946811114e-05, + "loss": 0.0392, + "step": 8170 + }, + { + "epoch": 9.819927971188475, + "grad_norm": 0.1820380836725235, + "learning_rate": 6.872096052409718e-05, + "loss": 0.0384, + "step": 8180 + }, + { + "epoch": 9.831932773109244, + "grad_norm": 0.13072018325328827, + "learning_rate": 6.864427518469174e-05, + "loss": 0.0406, + "step": 8190 + }, + { + "epoch": 9.843937575030012, + "grad_norm": 0.16012217104434967, + "learning_rate": 6.856753887254986e-05, + "loss": 0.0408, + "step": 8200 + }, + { + "epoch": 9.85594237695078, + "grad_norm": 0.24535059928894043, + "learning_rate": 6.849075179746572e-05, + "loss": 0.0408, + "step": 8210 + }, + { + "epoch": 9.867947178871548, + "grad_norm": 0.1463879644870758, + "learning_rate": 6.841391416937221e-05, + "loss": 0.0357, + "step": 8220 + }, + { + "epoch": 9.879951980792317, + "grad_norm": 0.21045243740081787, + "learning_rate": 6.833702619834053e-05, + "loss": 0.0385, + "step": 8230 + }, + { + "epoch": 9.891956782713086, + "grad_norm": 0.1883695125579834, + "learning_rate": 6.82600880945794e-05, + "loss": 0.0409, + "step": 8240 + }, + { + "epoch": 9.903961584633853, + "grad_norm": 0.16713570058345795, + "learning_rate": 6.818310006843468e-05, + "loss": 0.0399, + "step": 8250 + }, + { + "epoch": 9.915966386554622, + "grad_norm": 0.15717430412769318, + "learning_rate": 6.810606233038868e-05, + "loss": 0.0393, + "step": 8260 + }, + { + "epoch": 9.92797118847539, + "grad_norm": 0.1836552768945694, + "learning_rate": 6.802897509105966e-05, + "loss": 0.0417, + "step": 8270 + }, + { + "epoch": 9.939975990396158, + "grad_norm": 0.14814554154872894, + "learning_rate": 6.79518385612012e-05, + "loss": 0.0349, + "step": 8280 + }, + { + "epoch": 9.951980792316927, + "grad_norm": 0.1343373954296112, + "learning_rate": 6.787465295170157e-05, + "loss": 0.0404, + "step": 8290 + }, + { + "epoch": 9.963985594237695, + "grad_norm": 0.2337896078824997, + "learning_rate": 6.779741847358332e-05, + "loss": 0.0414, + "step": 8300 + }, + { + "epoch": 9.975990396158464, + "grad_norm": 0.1882973462343216, + "learning_rate": 6.772013533800256e-05, + "loss": 0.0396, + "step": 8310 + }, + { + "epoch": 9.987995198079231, + "grad_norm": 0.36498594284057617, + "learning_rate": 6.764280375624843e-05, + "loss": 0.0349, + "step": 8320 + }, + { + "epoch": 10.0, + "grad_norm": 0.23099996149539948, + "learning_rate": 6.756542393974252e-05, + "loss": 0.0383, + "step": 8330 + }, + { + "epoch": 10.012004801920769, + "grad_norm": 0.16191919147968292, + "learning_rate": 6.748799610003828e-05, + "loss": 0.0377, + "step": 8340 + }, + { + "epoch": 10.024009603841536, + "grad_norm": 0.18549257516860962, + "learning_rate": 6.741052044882048e-05, + "loss": 0.0367, + "step": 8350 + }, + { + "epoch": 10.036014405762305, + "grad_norm": 0.13752786815166473, + "learning_rate": 6.73329971979046e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 10.048019207683073, + "grad_norm": 0.225381001830101, + "learning_rate": 6.725542655923625e-05, + "loss": 0.0376, + "step": 8370 + }, + { + "epoch": 10.060024009603842, + "grad_norm": 0.1487634778022766, + "learning_rate": 6.717780874489057e-05, + "loss": 0.0356, + "step": 8380 + }, + { + "epoch": 10.07202881152461, + "grad_norm": 0.14549127221107483, + "learning_rate": 6.710014396707172e-05, + "loss": 0.0381, + "step": 8390 + }, + { + "epoch": 10.084033613445378, + "grad_norm": 0.20784419775009155, + "learning_rate": 6.702243243811221e-05, + "loss": 0.0397, + "step": 8400 + }, + { + "epoch": 10.096038415366147, + "grad_norm": 0.22619032859802246, + "learning_rate": 6.694467437047244e-05, + "loss": 0.0417, + "step": 8410 + }, + { + "epoch": 10.108043217286914, + "grad_norm": 0.25021612644195557, + "learning_rate": 6.686686997673997e-05, + "loss": 0.0459, + "step": 8420 + }, + { + "epoch": 10.120048019207683, + "grad_norm": 0.16594727337360382, + "learning_rate": 6.678901946962903e-05, + "loss": 0.0398, + "step": 8430 + }, + { + "epoch": 10.132052821128452, + "grad_norm": 0.2029707282781601, + "learning_rate": 6.671112306197996e-05, + "loss": 0.0385, + "step": 8440 + }, + { + "epoch": 10.14405762304922, + "grad_norm": 0.1568644791841507, + "learning_rate": 6.663318096675854e-05, + "loss": 0.0402, + "step": 8450 + }, + { + "epoch": 10.156062424969988, + "grad_norm": 0.2367262989282608, + "learning_rate": 6.655519339705552e-05, + "loss": 0.0438, + "step": 8460 + }, + { + "epoch": 10.168067226890756, + "grad_norm": 0.17203769087791443, + "learning_rate": 6.647716056608588e-05, + "loss": 0.0362, + "step": 8470 + }, + { + "epoch": 10.180072028811525, + "grad_norm": 0.13694919645786285, + "learning_rate": 6.639908268718843e-05, + "loss": 0.0405, + "step": 8480 + }, + { + "epoch": 10.192076830732294, + "grad_norm": 0.15887856483459473, + "learning_rate": 6.632095997382514e-05, + "loss": 0.0431, + "step": 8490 + }, + { + "epoch": 10.204081632653061, + "grad_norm": 0.1840740293264389, + "learning_rate": 6.624279263958047e-05, + "loss": 0.0363, + "step": 8500 + }, + { + "epoch": 10.21608643457383, + "grad_norm": 0.18266797065734863, + "learning_rate": 6.616458089816097e-05, + "loss": 0.0388, + "step": 8510 + }, + { + "epoch": 10.228091236494597, + "grad_norm": 0.17088434100151062, + "learning_rate": 6.608632496339454e-05, + "loss": 0.038, + "step": 8520 + }, + { + "epoch": 10.240096038415366, + "grad_norm": 0.17704923450946808, + "learning_rate": 6.600802504922988e-05, + "loss": 0.0374, + "step": 8530 + }, + { + "epoch": 10.252100840336134, + "grad_norm": 0.14367282390594482, + "learning_rate": 6.592968136973604e-05, + "loss": 0.0393, + "step": 8540 + }, + { + "epoch": 10.264105642256903, + "grad_norm": 0.16419123113155365, + "learning_rate": 6.585129413910159e-05, + "loss": 0.0357, + "step": 8550 + }, + { + "epoch": 10.276110444177672, + "grad_norm": 0.15816110372543335, + "learning_rate": 6.577286357163424e-05, + "loss": 0.037, + "step": 8560 + }, + { + "epoch": 10.288115246098439, + "grad_norm": 0.20852789282798767, + "learning_rate": 6.569438988176018e-05, + "loss": 0.0451, + "step": 8570 + }, + { + "epoch": 10.300120048019208, + "grad_norm": 0.15505540370941162, + "learning_rate": 6.561587328402347e-05, + "loss": 0.038, + "step": 8580 + }, + { + "epoch": 10.312124849939975, + "grad_norm": 0.15013141930103302, + "learning_rate": 6.553731399308549e-05, + "loss": 0.0368, + "step": 8590 + }, + { + "epoch": 10.324129651860744, + "grad_norm": 0.19663149118423462, + "learning_rate": 6.545871222372436e-05, + "loss": 0.0397, + "step": 8600 + }, + { + "epoch": 10.336134453781513, + "grad_norm": 0.20689783990383148, + "learning_rate": 6.538006819083426e-05, + "loss": 0.0397, + "step": 8610 + }, + { + "epoch": 10.34813925570228, + "grad_norm": 0.2078137993812561, + "learning_rate": 6.530138210942505e-05, + "loss": 0.0364, + "step": 8620 + }, + { + "epoch": 10.36014405762305, + "grad_norm": 0.18468159437179565, + "learning_rate": 6.522265419462141e-05, + "loss": 0.0382, + "step": 8630 + }, + { + "epoch": 10.372148859543817, + "grad_norm": 0.16038332879543304, + "learning_rate": 6.514388466166248e-05, + "loss": 0.0382, + "step": 8640 + }, + { + "epoch": 10.384153661464586, + "grad_norm": 0.19802804291248322, + "learning_rate": 6.506507372590119e-05, + "loss": 0.0406, + "step": 8650 + }, + { + "epoch": 10.396158463385355, + "grad_norm": 0.18139250576496124, + "learning_rate": 6.498622160280355e-05, + "loss": 0.0387, + "step": 8660 + }, + { + "epoch": 10.408163265306122, + "grad_norm": 0.15806864202022552, + "learning_rate": 6.490732850794832e-05, + "loss": 0.037, + "step": 8670 + }, + { + "epoch": 10.420168067226891, + "grad_norm": 0.2164941132068634, + "learning_rate": 6.482839465702616e-05, + "loss": 0.043, + "step": 8680 + }, + { + "epoch": 10.432172869147658, + "grad_norm": 0.15754762291908264, + "learning_rate": 6.474942026583923e-05, + "loss": 0.0401, + "step": 8690 + }, + { + "epoch": 10.444177671068427, + "grad_norm": 0.20164723694324493, + "learning_rate": 6.467040555030052e-05, + "loss": 0.036, + "step": 8700 + }, + { + "epoch": 10.456182472989196, + "grad_norm": 0.14957737922668457, + "learning_rate": 6.459135072643321e-05, + "loss": 0.0351, + "step": 8710 + }, + { + "epoch": 10.468187274909964, + "grad_norm": 0.15689082443714142, + "learning_rate": 6.451225601037019e-05, + "loss": 0.0381, + "step": 8720 + }, + { + "epoch": 10.480192076830733, + "grad_norm": 0.1923455148935318, + "learning_rate": 6.443312161835338e-05, + "loss": 0.0375, + "step": 8730 + }, + { + "epoch": 10.4921968787515, + "grad_norm": 0.1614847183227539, + "learning_rate": 6.43539477667332e-05, + "loss": 0.0367, + "step": 8740 + }, + { + "epoch": 10.504201680672269, + "grad_norm": 0.1464577466249466, + "learning_rate": 6.427473467196793e-05, + "loss": 0.0366, + "step": 8750 + }, + { + "epoch": 10.516206482593038, + "grad_norm": 0.1844187080860138, + "learning_rate": 6.419548255062315e-05, + "loss": 0.0385, + "step": 8760 + }, + { + "epoch": 10.528211284513805, + "grad_norm": 0.1506308913230896, + "learning_rate": 6.411619161937112e-05, + "loss": 0.0385, + "step": 8770 + }, + { + "epoch": 10.540216086434574, + "grad_norm": 0.20905275642871857, + "learning_rate": 6.403686209499022e-05, + "loss": 0.0414, + "step": 8780 + }, + { + "epoch": 10.552220888355341, + "grad_norm": 0.23489916324615479, + "learning_rate": 6.395749419436437e-05, + "loss": 0.0418, + "step": 8790 + }, + { + "epoch": 10.56422569027611, + "grad_norm": 0.2155056744813919, + "learning_rate": 6.387808813448234e-05, + "loss": 0.0399, + "step": 8800 + }, + { + "epoch": 10.57623049219688, + "grad_norm": 0.13842663168907166, + "learning_rate": 6.37986441324373e-05, + "loss": 0.0318, + "step": 8810 + }, + { + "epoch": 10.588235294117647, + "grad_norm": 0.20978684723377228, + "learning_rate": 6.37191624054261e-05, + "loss": 0.0368, + "step": 8820 + }, + { + "epoch": 10.600240096038416, + "grad_norm": 0.17117398977279663, + "learning_rate": 6.363964317074872e-05, + "loss": 0.0387, + "step": 8830 + }, + { + "epoch": 10.612244897959183, + "grad_norm": 0.20055459439754486, + "learning_rate": 6.356008664580776e-05, + "loss": 0.0376, + "step": 8840 + }, + { + "epoch": 10.624249699879952, + "grad_norm": 0.198572039604187, + "learning_rate": 6.348049304810771e-05, + "loss": 0.038, + "step": 8850 + }, + { + "epoch": 10.636254501800721, + "grad_norm": 0.19392156600952148, + "learning_rate": 6.340086259525442e-05, + "loss": 0.0395, + "step": 8860 + }, + { + "epoch": 10.648259303721488, + "grad_norm": 0.14590661227703094, + "learning_rate": 6.332119550495448e-05, + "loss": 0.0355, + "step": 8870 + }, + { + "epoch": 10.660264105642257, + "grad_norm": 0.17346006631851196, + "learning_rate": 6.324149199501473e-05, + "loss": 0.039, + "step": 8880 + }, + { + "epoch": 10.672268907563025, + "grad_norm": 0.21799254417419434, + "learning_rate": 6.316175228334146e-05, + "loss": 0.0383, + "step": 8890 + }, + { + "epoch": 10.684273709483794, + "grad_norm": 0.1774076521396637, + "learning_rate": 6.308197658794003e-05, + "loss": 0.0349, + "step": 8900 + }, + { + "epoch": 10.696278511404563, + "grad_norm": 0.32159462571144104, + "learning_rate": 6.300216512691417e-05, + "loss": 0.0388, + "step": 8910 + }, + { + "epoch": 10.70828331332533, + "grad_norm": 0.18819457292556763, + "learning_rate": 6.292231811846532e-05, + "loss": 0.0352, + "step": 8920 + }, + { + "epoch": 10.720288115246099, + "grad_norm": 0.1994289755821228, + "learning_rate": 6.284243578089217e-05, + "loss": 0.0405, + "step": 8930 + }, + { + "epoch": 10.732292917166866, + "grad_norm": 0.20269493758678436, + "learning_rate": 6.276251833258999e-05, + "loss": 0.0356, + "step": 8940 + }, + { + "epoch": 10.744297719087635, + "grad_norm": 0.20259813964366913, + "learning_rate": 6.268256599205003e-05, + "loss": 0.0357, + "step": 8950 + }, + { + "epoch": 10.756302521008404, + "grad_norm": 0.11998016387224197, + "learning_rate": 6.260257897785892e-05, + "loss": 0.0362, + "step": 8960 + }, + { + "epoch": 10.768307322929171, + "grad_norm": 0.2009921669960022, + "learning_rate": 6.252255750869811e-05, + "loss": 0.0392, + "step": 8970 + }, + { + "epoch": 10.78031212484994, + "grad_norm": 0.1645919233560562, + "learning_rate": 6.244250180334325e-05, + "loss": 0.038, + "step": 8980 + }, + { + "epoch": 10.792316926770708, + "grad_norm": 0.20367179811000824, + "learning_rate": 6.236241208066356e-05, + "loss": 0.0374, + "step": 8990 + }, + { + "epoch": 10.804321728691477, + "grad_norm": 0.19548851251602173, + "learning_rate": 6.228228855962133e-05, + "loss": 0.0369, + "step": 9000 + }, + { + "epoch": 10.816326530612244, + "grad_norm": 0.30880749225616455, + "learning_rate": 6.220213145927115e-05, + "loss": 0.0411, + "step": 9010 + }, + { + "epoch": 10.828331332533013, + "grad_norm": 0.22037912905216217, + "learning_rate": 6.212194099875951e-05, + "loss": 0.0361, + "step": 9020 + }, + { + "epoch": 10.840336134453782, + "grad_norm": 0.1903771013021469, + "learning_rate": 6.204171739732405e-05, + "loss": 0.0388, + "step": 9030 + }, + { + "epoch": 10.85234093637455, + "grad_norm": 0.22356078028678894, + "learning_rate": 6.196146087429303e-05, + "loss": 0.0382, + "step": 9040 + }, + { + "epoch": 10.864345738295318, + "grad_norm": 0.16200630366802216, + "learning_rate": 6.188117164908474e-05, + "loss": 0.0367, + "step": 9050 + }, + { + "epoch": 10.876350540216086, + "grad_norm": 0.17379631102085114, + "learning_rate": 6.180084994120684e-05, + "loss": 0.0345, + "step": 9060 + }, + { + "epoch": 10.888355342136855, + "grad_norm": 0.20111346244812012, + "learning_rate": 6.17204959702558e-05, + "loss": 0.0404, + "step": 9070 + }, + { + "epoch": 10.900360144057624, + "grad_norm": 0.17776581645011902, + "learning_rate": 6.164010995591635e-05, + "loss": 0.0391, + "step": 9080 + }, + { + "epoch": 10.912364945978391, + "grad_norm": 0.2624545991420746, + "learning_rate": 6.155969211796076e-05, + "loss": 0.0383, + "step": 9090 + }, + { + "epoch": 10.92436974789916, + "grad_norm": 0.1517402082681656, + "learning_rate": 6.147924267624829e-05, + "loss": 0.0383, + "step": 9100 + }, + { + "epoch": 10.936374549819927, + "grad_norm": 0.21592414379119873, + "learning_rate": 6.13987618507247e-05, + "loss": 0.0387, + "step": 9110 + }, + { + "epoch": 10.948379351740696, + "grad_norm": 0.2960490584373474, + "learning_rate": 6.131824986142147e-05, + "loss": 0.0374, + "step": 9120 + }, + { + "epoch": 10.960384153661465, + "grad_norm": 0.23838748037815094, + "learning_rate": 6.123770692845529e-05, + "loss": 0.0397, + "step": 9130 + }, + { + "epoch": 10.972388955582232, + "grad_norm": 0.19429929554462433, + "learning_rate": 6.11571332720275e-05, + "loss": 0.0365, + "step": 9140 + }, + { + "epoch": 10.984393757503002, + "grad_norm": 0.1775694191455841, + "learning_rate": 6.107652911242336e-05, + "loss": 0.0354, + "step": 9150 + }, + { + "epoch": 10.996398559423769, + "grad_norm": 0.15581662952899933, + "learning_rate": 6.0995894670011586e-05, + "loss": 0.0382, + "step": 9160 + }, + { + "epoch": 11.008403361344538, + "grad_norm": 0.17194831371307373, + "learning_rate": 6.091523016524368e-05, + "loss": 0.0352, + "step": 9170 + }, + { + "epoch": 11.020408163265307, + "grad_norm": 0.16894415020942688, + "learning_rate": 6.083453581865328e-05, + "loss": 0.0334, + "step": 9180 + }, + { + "epoch": 11.032412965186074, + "grad_norm": 0.1998082995414734, + "learning_rate": 6.075381185085568e-05, + "loss": 0.0374, + "step": 9190 + }, + { + "epoch": 11.044417767106843, + "grad_norm": 0.19012099504470825, + "learning_rate": 6.067305848254709e-05, + "loss": 0.0397, + "step": 9200 + }, + { + "epoch": 11.05642256902761, + "grad_norm": 0.25975775718688965, + "learning_rate": 6.059227593450418e-05, + "loss": 0.0398, + "step": 9210 + }, + { + "epoch": 11.06842737094838, + "grad_norm": 0.24917170405387878, + "learning_rate": 6.051146442758333e-05, + "loss": 0.0351, + "step": 9220 + }, + { + "epoch": 11.080432172869148, + "grad_norm": 0.14517588913440704, + "learning_rate": 6.043062418272012e-05, + "loss": 0.0346, + "step": 9230 + }, + { + "epoch": 11.092436974789916, + "grad_norm": 0.38305822014808655, + "learning_rate": 6.0349755420928666e-05, + "loss": 0.0437, + "step": 9240 + }, + { + "epoch": 11.104441776710685, + "grad_norm": 0.20279426872730255, + "learning_rate": 6.0268858363301105e-05, + "loss": 0.034, + "step": 9250 + }, + { + "epoch": 11.116446578631452, + "grad_norm": 0.2026125192642212, + "learning_rate": 6.018793323100689e-05, + "loss": 0.0358, + "step": 9260 + }, + { + "epoch": 11.128451380552221, + "grad_norm": 0.14681707322597504, + "learning_rate": 6.0106980245292255e-05, + "loss": 0.0362, + "step": 9270 + }, + { + "epoch": 11.14045618247299, + "grad_norm": 0.16976365447044373, + "learning_rate": 6.002599962747957e-05, + "loss": 0.0394, + "step": 9280 + }, + { + "epoch": 11.152460984393757, + "grad_norm": 0.19450034201145172, + "learning_rate": 5.994499159896673e-05, + "loss": 0.0349, + "step": 9290 + }, + { + "epoch": 11.164465786314526, + "grad_norm": 0.1265704184770584, + "learning_rate": 5.9863956381226607e-05, + "loss": 0.0321, + "step": 9300 + }, + { + "epoch": 11.176470588235293, + "grad_norm": 0.1831366866827011, + "learning_rate": 5.9782894195806394e-05, + "loss": 0.0379, + "step": 9310 + }, + { + "epoch": 11.188475390156063, + "grad_norm": 0.2062634825706482, + "learning_rate": 5.9701805264327004e-05, + "loss": 0.0331, + "step": 9320 + }, + { + "epoch": 11.200480192076832, + "grad_norm": 0.17099066078662872, + "learning_rate": 5.96206898084825e-05, + "loss": 0.0349, + "step": 9330 + }, + { + "epoch": 11.212484993997599, + "grad_norm": 0.18083876371383667, + "learning_rate": 5.953954805003942e-05, + "loss": 0.035, + "step": 9340 + }, + { + "epoch": 11.224489795918368, + "grad_norm": 0.18587689101696014, + "learning_rate": 5.945838021083623e-05, + "loss": 0.0378, + "step": 9350 + }, + { + "epoch": 11.236494597839135, + "grad_norm": 0.18780159950256348, + "learning_rate": 5.9377186512782714e-05, + "loss": 0.0377, + "step": 9360 + }, + { + "epoch": 11.248499399759904, + "grad_norm": 0.14256691932678223, + "learning_rate": 5.929596717785935e-05, + "loss": 0.0368, + "step": 9370 + }, + { + "epoch": 11.260504201680673, + "grad_norm": 0.20968148112297058, + "learning_rate": 5.921472242811668e-05, + "loss": 0.0364, + "step": 9380 + }, + { + "epoch": 11.27250900360144, + "grad_norm": 0.16894619166851044, + "learning_rate": 5.913345248567475e-05, + "loss": 0.0353, + "step": 9390 + }, + { + "epoch": 11.28451380552221, + "grad_norm": 0.23655030131340027, + "learning_rate": 5.905215757272248e-05, + "loss": 0.0354, + "step": 9400 + }, + { + "epoch": 11.296518607442977, + "grad_norm": 0.13407224416732788, + "learning_rate": 5.897083791151706e-05, + "loss": 0.0347, + "step": 9410 + }, + { + "epoch": 11.308523409363746, + "grad_norm": 0.12209731340408325, + "learning_rate": 5.888949372438336e-05, + "loss": 0.0337, + "step": 9420 + }, + { + "epoch": 11.320528211284515, + "grad_norm": 0.2568415105342865, + "learning_rate": 5.8808125233713255e-05, + "loss": 0.033, + "step": 9430 + }, + { + "epoch": 11.332533013205282, + "grad_norm": 0.1995142251253128, + "learning_rate": 5.872673266196509e-05, + "loss": 0.0368, + "step": 9440 + }, + { + "epoch": 11.344537815126051, + "grad_norm": 0.14530596137046814, + "learning_rate": 5.864531623166305e-05, + "loss": 0.0376, + "step": 9450 + }, + { + "epoch": 11.356542617046818, + "grad_norm": 0.20431634783744812, + "learning_rate": 5.856387616539656e-05, + "loss": 0.0391, + "step": 9460 + }, + { + "epoch": 11.368547418967587, + "grad_norm": 0.16741293668746948, + "learning_rate": 5.848241268581967e-05, + "loss": 0.0384, + "step": 9470 + }, + { + "epoch": 11.380552220888354, + "grad_norm": 0.18009668588638306, + "learning_rate": 5.840092601565037e-05, + "loss": 0.0351, + "step": 9480 + }, + { + "epoch": 11.392557022809124, + "grad_norm": 0.1775333285331726, + "learning_rate": 5.8319416377670144e-05, + "loss": 0.0375, + "step": 9490 + }, + { + "epoch": 11.404561824729893, + "grad_norm": 0.28646835684776306, + "learning_rate": 5.82378839947232e-05, + "loss": 0.0376, + "step": 9500 + }, + { + "epoch": 11.41656662665066, + "grad_norm": 0.2090577930212021, + "learning_rate": 5.815632908971599e-05, + "loss": 0.0368, + "step": 9510 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 0.18116165697574615, + "learning_rate": 5.80747518856165e-05, + "loss": 0.0392, + "step": 9520 + }, + { + "epoch": 11.440576230492196, + "grad_norm": 0.17569953203201294, + "learning_rate": 5.799315260545367e-05, + "loss": 0.037, + "step": 9530 + }, + { + "epoch": 11.452581032412965, + "grad_norm": 0.18550145626068115, + "learning_rate": 5.791153147231686e-05, + "loss": 0.036, + "step": 9540 + }, + { + "epoch": 11.464585834333734, + "grad_norm": 0.18470260500907898, + "learning_rate": 5.782988870935509e-05, + "loss": 0.0337, + "step": 9550 + }, + { + "epoch": 11.476590636254501, + "grad_norm": 0.18307043612003326, + "learning_rate": 5.774822453977657e-05, + "loss": 0.0359, + "step": 9560 + }, + { + "epoch": 11.48859543817527, + "grad_norm": 0.1706153005361557, + "learning_rate": 5.7666539186848036e-05, + "loss": 0.0366, + "step": 9570 + }, + { + "epoch": 11.500600240096038, + "grad_norm": 0.16758893430233002, + "learning_rate": 5.758483287389411e-05, + "loss": 0.036, + "step": 9580 + }, + { + "epoch": 11.512605042016807, + "grad_norm": 0.15243500471115112, + "learning_rate": 5.7503105824296735e-05, + "loss": 0.0372, + "step": 9590 + }, + { + "epoch": 11.524609843937576, + "grad_norm": 0.1632862240076065, + "learning_rate": 5.742135826149453e-05, + "loss": 0.0355, + "step": 9600 + }, + { + "epoch": 11.536614645858343, + "grad_norm": 0.12307554483413696, + "learning_rate": 5.7339590408982223e-05, + "loss": 0.0336, + "step": 9610 + }, + { + "epoch": 11.548619447779112, + "grad_norm": 0.13798248767852783, + "learning_rate": 5.725780249031e-05, + "loss": 0.0319, + "step": 9620 + }, + { + "epoch": 11.56062424969988, + "grad_norm": 0.17078986763954163, + "learning_rate": 5.717599472908292e-05, + "loss": 0.0337, + "step": 9630 + }, + { + "epoch": 11.572629051620648, + "grad_norm": 0.17350313067436218, + "learning_rate": 5.7094167348960237e-05, + "loss": 0.0328, + "step": 9640 + }, + { + "epoch": 11.584633853541417, + "grad_norm": 0.16148975491523743, + "learning_rate": 5.7012320573654945e-05, + "loss": 0.0346, + "step": 9650 + }, + { + "epoch": 11.596638655462185, + "grad_norm": 0.15845711529254913, + "learning_rate": 5.693045462693295e-05, + "loss": 0.0345, + "step": 9660 + }, + { + "epoch": 11.608643457382954, + "grad_norm": 0.12360858172178268, + "learning_rate": 5.684856973261266e-05, + "loss": 0.0362, + "step": 9670 + }, + { + "epoch": 11.62064825930372, + "grad_norm": 0.16052955389022827, + "learning_rate": 5.6766666114564215e-05, + "loss": 0.0387, + "step": 9680 + }, + { + "epoch": 11.63265306122449, + "grad_norm": 0.1924574226140976, + "learning_rate": 5.668474399670899e-05, + "loss": 0.0377, + "step": 9690 + }, + { + "epoch": 11.644657863145259, + "grad_norm": 0.13616204261779785, + "learning_rate": 5.660280360301896e-05, + "loss": 0.0368, + "step": 9700 + }, + { + "epoch": 11.656662665066026, + "grad_norm": 0.1636907160282135, + "learning_rate": 5.652084515751599e-05, + "loss": 0.0376, + "step": 9710 + }, + { + "epoch": 11.668667466986795, + "grad_norm": 0.1801534742116928, + "learning_rate": 5.643886888427137e-05, + "loss": 0.0362, + "step": 9720 + }, + { + "epoch": 11.680672268907562, + "grad_norm": 0.15523841977119446, + "learning_rate": 5.6356875007405074e-05, + "loss": 0.0383, + "step": 9730 + }, + { + "epoch": 11.692677070828331, + "grad_norm": 0.21545585989952087, + "learning_rate": 5.627486375108525e-05, + "loss": 0.0357, + "step": 9740 + }, + { + "epoch": 11.7046818727491, + "grad_norm": 0.16575084626674652, + "learning_rate": 5.619283533952754e-05, + "loss": 0.0406, + "step": 9750 + }, + { + "epoch": 11.716686674669868, + "grad_norm": 0.27626457810401917, + "learning_rate": 5.6110789996994474e-05, + "loss": 0.0428, + "step": 9760 + }, + { + "epoch": 11.728691476590637, + "grad_norm": 0.18973006308078766, + "learning_rate": 5.602872794779491e-05, + "loss": 0.0341, + "step": 9770 + }, + { + "epoch": 11.740696278511404, + "grad_norm": 0.17076024413108826, + "learning_rate": 5.594664941628334e-05, + "loss": 0.038, + "step": 9780 + }, + { + "epoch": 11.752701080432173, + "grad_norm": 0.2138928323984146, + "learning_rate": 5.5864554626859324e-05, + "loss": 0.0425, + "step": 9790 + }, + { + "epoch": 11.764705882352942, + "grad_norm": 0.16127055883407593, + "learning_rate": 5.578244380396691e-05, + "loss": 0.0359, + "step": 9800 + }, + { + "epoch": 11.77671068427371, + "grad_norm": 0.14932304620742798, + "learning_rate": 5.570031717209394e-05, + "loss": 0.0315, + "step": 9810 + }, + { + "epoch": 11.788715486194478, + "grad_norm": 0.17230631411075592, + "learning_rate": 5.561817495577147e-05, + "loss": 0.0361, + "step": 9820 + }, + { + "epoch": 11.800720288115246, + "grad_norm": 0.1374058723449707, + "learning_rate": 5.5536017379573215e-05, + "loss": 0.0314, + "step": 9830 + }, + { + "epoch": 11.812725090036015, + "grad_norm": 0.14193862676620483, + "learning_rate": 5.545384466811483e-05, + "loss": 0.0367, + "step": 9840 + }, + { + "epoch": 11.824729891956782, + "grad_norm": 0.1421475112438202, + "learning_rate": 5.5371657046053384e-05, + "loss": 0.033, + "step": 9850 + }, + { + "epoch": 11.83673469387755, + "grad_norm": 0.17079542577266693, + "learning_rate": 5.528945473808669e-05, + "loss": 0.0392, + "step": 9860 + }, + { + "epoch": 11.84873949579832, + "grad_norm": 0.22039489448070526, + "learning_rate": 5.520723796895272e-05, + "loss": 0.0365, + "step": 9870 + }, + { + "epoch": 11.860744297719087, + "grad_norm": 0.17339077591896057, + "learning_rate": 5.512500696342897e-05, + "loss": 0.0355, + "step": 9880 + }, + { + "epoch": 11.872749099639856, + "grad_norm": 0.17991453409194946, + "learning_rate": 5.504276194633188e-05, + "loss": 0.0355, + "step": 9890 + }, + { + "epoch": 11.884753901560625, + "grad_norm": 0.15626837313175201, + "learning_rate": 5.49605031425162e-05, + "loss": 0.0381, + "step": 9900 + }, + { + "epoch": 11.896758703481392, + "grad_norm": 0.13403722643852234, + "learning_rate": 5.487823077687434e-05, + "loss": 0.0367, + "step": 9910 + }, + { + "epoch": 11.908763505402161, + "grad_norm": 0.19482530653476715, + "learning_rate": 5.4795945074335806e-05, + "loss": 0.0352, + "step": 9920 + }, + { + "epoch": 11.920768307322929, + "grad_norm": 0.16083987057209015, + "learning_rate": 5.471364625986657e-05, + "loss": 0.036, + "step": 9930 + }, + { + "epoch": 11.932773109243698, + "grad_norm": 0.24408821761608124, + "learning_rate": 5.463133455846845e-05, + "loss": 0.0372, + "step": 9940 + }, + { + "epoch": 11.944777911164465, + "grad_norm": 0.1262819916009903, + "learning_rate": 5.4549010195178505e-05, + "loss": 0.0345, + "step": 9950 + }, + { + "epoch": 11.956782713085234, + "grad_norm": 0.17886441946029663, + "learning_rate": 5.446667339506838e-05, + "loss": 0.0307, + "step": 9960 + }, + { + "epoch": 11.968787515006003, + "grad_norm": 0.20317071676254272, + "learning_rate": 5.4384324383243756e-05, + "loss": 0.0319, + "step": 9970 + }, + { + "epoch": 11.98079231692677, + "grad_norm": 0.13676953315734863, + "learning_rate": 5.430196338484368e-05, + "loss": 0.0341, + "step": 9980 + }, + { + "epoch": 11.99279711884754, + "grad_norm": 0.1989016979932785, + "learning_rate": 5.4219590625039975e-05, + "loss": 0.035, + "step": 9990 + }, + { + "epoch": 12.004801920768307, + "grad_norm": 0.1614992916584015, + "learning_rate": 5.413720632903664e-05, + "loss": 0.0339, + "step": 10000 + }, + { + "epoch": 12.016806722689076, + "grad_norm": 0.2011656016111374, + "learning_rate": 5.405481072206917e-05, + "loss": 0.0378, + "step": 10010 + }, + { + "epoch": 12.028811524609845, + "grad_norm": 0.1602124720811844, + "learning_rate": 5.397240402940402e-05, + "loss": 0.0343, + "step": 10020 + }, + { + "epoch": 12.040816326530612, + "grad_norm": 0.15234936773777008, + "learning_rate": 5.388998647633794e-05, + "loss": 0.0345, + "step": 10030 + }, + { + "epoch": 12.05282112845138, + "grad_norm": 0.16387160122394562, + "learning_rate": 5.380755828819737e-05, + "loss": 0.0325, + "step": 10040 + }, + { + "epoch": 12.064825930372148, + "grad_norm": 0.18789468705654144, + "learning_rate": 5.3725119690337846e-05, + "loss": 0.0355, + "step": 10050 + }, + { + "epoch": 12.076830732292917, + "grad_norm": 0.2006371021270752, + "learning_rate": 5.3642670908143324e-05, + "loss": 0.0393, + "step": 10060 + }, + { + "epoch": 12.088835534213686, + "grad_norm": 0.164114847779274, + "learning_rate": 5.356021216702562e-05, + "loss": 0.0314, + "step": 10070 + }, + { + "epoch": 12.100840336134453, + "grad_norm": 0.1718127578496933, + "learning_rate": 5.347774369242381e-05, + "loss": 0.031, + "step": 10080 + }, + { + "epoch": 12.112845138055222, + "grad_norm": 0.15339119732379913, + "learning_rate": 5.3395265709803545e-05, + "loss": 0.0337, + "step": 10090 + }, + { + "epoch": 12.12484993997599, + "grad_norm": 0.1851869821548462, + "learning_rate": 5.331277844465647e-05, + "loss": 0.0361, + "step": 10100 + }, + { + "epoch": 12.136854741896759, + "grad_norm": 0.1962175816297531, + "learning_rate": 5.323028212249963e-05, + "loss": 0.0353, + "step": 10110 + }, + { + "epoch": 12.148859543817528, + "grad_norm": 0.1800670325756073, + "learning_rate": 5.314777696887481e-05, + "loss": 0.0359, + "step": 10120 + }, + { + "epoch": 12.160864345738295, + "grad_norm": 0.14262789487838745, + "learning_rate": 5.306526320934796e-05, + "loss": 0.0331, + "step": 10130 + }, + { + "epoch": 12.172869147659064, + "grad_norm": 0.19176365435123444, + "learning_rate": 5.298274106950854e-05, + "loss": 0.0315, + "step": 10140 + }, + { + "epoch": 12.184873949579831, + "grad_norm": 0.1543601006269455, + "learning_rate": 5.290021077496893e-05, + "loss": 0.0341, + "step": 10150 + }, + { + "epoch": 12.1968787515006, + "grad_norm": 0.14513146877288818, + "learning_rate": 5.2817672551363816e-05, + "loss": 0.0335, + "step": 10160 + }, + { + "epoch": 12.20888355342137, + "grad_norm": 0.21399088203907013, + "learning_rate": 5.273512662434952e-05, + "loss": 0.0354, + "step": 10170 + }, + { + "epoch": 12.220888355342137, + "grad_norm": 0.1614575833082199, + "learning_rate": 5.265257321960349e-05, + "loss": 0.0324, + "step": 10180 + }, + { + "epoch": 12.232893157262906, + "grad_norm": 0.24782899022102356, + "learning_rate": 5.257001256282357e-05, + "loss": 0.0324, + "step": 10190 + }, + { + "epoch": 12.244897959183673, + "grad_norm": 0.15302206575870514, + "learning_rate": 5.248744487972742e-05, + "loss": 0.032, + "step": 10200 + }, + { + "epoch": 12.256902761104442, + "grad_norm": 0.1560119390487671, + "learning_rate": 5.240487039605196e-05, + "loss": 0.0316, + "step": 10210 + }, + { + "epoch": 12.268907563025211, + "grad_norm": 0.20011839270591736, + "learning_rate": 5.232228933755267e-05, + "loss": 0.0362, + "step": 10220 + }, + { + "epoch": 12.280912364945978, + "grad_norm": 0.15975980460643768, + "learning_rate": 5.2239701930003006e-05, + "loss": 0.0326, + "step": 10230 + }, + { + "epoch": 12.292917166866747, + "grad_norm": 0.1670454889535904, + "learning_rate": 5.215710839919379e-05, + "loss": 0.0351, + "step": 10240 + }, + { + "epoch": 12.304921968787514, + "grad_norm": 0.20251289010047913, + "learning_rate": 5.207450897093257e-05, + "loss": 0.0298, + "step": 10250 + }, + { + "epoch": 12.316926770708283, + "grad_norm": 0.18709607422351837, + "learning_rate": 5.1991903871043046e-05, + "loss": 0.0346, + "step": 10260 + }, + { + "epoch": 12.328931572629052, + "grad_norm": 0.1737508326768875, + "learning_rate": 5.190929332536439e-05, + "loss": 0.0322, + "step": 10270 + }, + { + "epoch": 12.34093637454982, + "grad_norm": 0.21039511263370514, + "learning_rate": 5.182667755975071e-05, + "loss": 0.0355, + "step": 10280 + }, + { + "epoch": 12.352941176470589, + "grad_norm": 0.1532686948776245, + "learning_rate": 5.1744056800070315e-05, + "loss": 0.0312, + "step": 10290 + }, + { + "epoch": 12.364945978391356, + "grad_norm": 0.13164696097373962, + "learning_rate": 5.166143127220524e-05, + "loss": 0.0342, + "step": 10300 + }, + { + "epoch": 12.376950780312125, + "grad_norm": 0.17526483535766602, + "learning_rate": 5.1578801202050485e-05, + "loss": 0.033, + "step": 10310 + }, + { + "epoch": 12.388955582232892, + "grad_norm": 0.18284663558006287, + "learning_rate": 5.149616681551355e-05, + "loss": 0.04, + "step": 10320 + }, + { + "epoch": 12.400960384153661, + "grad_norm": 0.2111562341451645, + "learning_rate": 5.141352833851367e-05, + "loss": 0.0317, + "step": 10330 + }, + { + "epoch": 12.41296518607443, + "grad_norm": 0.17556875944137573, + "learning_rate": 5.1330885996981285e-05, + "loss": 0.0381, + "step": 10340 + }, + { + "epoch": 12.424969987995198, + "grad_norm": 0.17409683763980865, + "learning_rate": 5.124824001685741e-05, + "loss": 0.0311, + "step": 10350 + }, + { + "epoch": 12.436974789915967, + "grad_norm": 0.28423818945884705, + "learning_rate": 5.116559062409298e-05, + "loss": 0.0394, + "step": 10360 + }, + { + "epoch": 12.448979591836734, + "grad_norm": 0.15619589388370514, + "learning_rate": 5.10829380446483e-05, + "loss": 0.0314, + "step": 10370 + }, + { + "epoch": 12.460984393757503, + "grad_norm": 0.1834740787744522, + "learning_rate": 5.100028250449235e-05, + "loss": 0.0359, + "step": 10380 + }, + { + "epoch": 12.472989195678272, + "grad_norm": 0.16582967340946198, + "learning_rate": 5.0917624229602234e-05, + "loss": 0.0292, + "step": 10390 + }, + { + "epoch": 12.48499399759904, + "grad_norm": 0.12122184783220291, + "learning_rate": 5.0834963445962524e-05, + "loss": 0.0312, + "step": 10400 + }, + { + "epoch": 12.496998799519808, + "grad_norm": 0.17128656804561615, + "learning_rate": 5.075230037956461e-05, + "loss": 0.0308, + "step": 10410 + }, + { + "epoch": 12.509003601440575, + "grad_norm": 0.17943637073040009, + "learning_rate": 5.0669635256406213e-05, + "loss": 0.0349, + "step": 10420 + }, + { + "epoch": 12.521008403361344, + "grad_norm": 0.14806485176086426, + "learning_rate": 5.058696830249058e-05, + "loss": 0.0352, + "step": 10430 + }, + { + "epoch": 12.533013205282113, + "grad_norm": 0.19095462560653687, + "learning_rate": 5.050429974382602e-05, + "loss": 0.0321, + "step": 10440 + }, + { + "epoch": 12.54501800720288, + "grad_norm": 0.17411953210830688, + "learning_rate": 5.042162980642523e-05, + "loss": 0.0306, + "step": 10450 + }, + { + "epoch": 12.55702280912365, + "grad_norm": 0.15987840294837952, + "learning_rate": 5.033895871630462e-05, + "loss": 0.0311, + "step": 10460 + }, + { + "epoch": 12.569027611044417, + "grad_norm": 0.2100231945514679, + "learning_rate": 5.025628669948386e-05, + "loss": 0.0353, + "step": 10470 + }, + { + "epoch": 12.581032412965186, + "grad_norm": 0.16145174205303192, + "learning_rate": 5.017361398198502e-05, + "loss": 0.0321, + "step": 10480 + }, + { + "epoch": 12.593037214885955, + "grad_norm": 0.14948807656764984, + "learning_rate": 5.009094078983221e-05, + "loss": 0.0332, + "step": 10490 + }, + { + "epoch": 12.605042016806722, + "grad_norm": 0.16770821809768677, + "learning_rate": 5.000826734905073e-05, + "loss": 0.0375, + "step": 10500 + }, + { + "epoch": 12.617046818727491, + "grad_norm": 0.18285676836967468, + "learning_rate": 4.9925593885666645e-05, + "loss": 0.0352, + "step": 10510 + }, + { + "epoch": 12.629051620648259, + "grad_norm": 0.2115688920021057, + "learning_rate": 4.984292062570602e-05, + "loss": 0.0319, + "step": 10520 + }, + { + "epoch": 12.641056422569028, + "grad_norm": 0.5114518404006958, + "learning_rate": 4.976024779519442e-05, + "loss": 0.0281, + "step": 10530 + }, + { + "epoch": 12.653061224489797, + "grad_norm": 0.13000498712062836, + "learning_rate": 4.9677575620156194e-05, + "loss": 0.033, + "step": 10540 + }, + { + "epoch": 12.665066026410564, + "grad_norm": 0.18074671924114227, + "learning_rate": 4.959490432661391e-05, + "loss": 0.035, + "step": 10550 + }, + { + "epoch": 12.677070828331333, + "grad_norm": 0.1617550253868103, + "learning_rate": 4.9512234140587726e-05, + "loss": 0.0366, + "step": 10560 + }, + { + "epoch": 12.6890756302521, + "grad_norm": 0.15372201800346375, + "learning_rate": 4.942956528809477e-05, + "loss": 0.0336, + "step": 10570 + }, + { + "epoch": 12.70108043217287, + "grad_norm": 0.24288564920425415, + "learning_rate": 4.934689799514854e-05, + "loss": 0.0344, + "step": 10580 + }, + { + "epoch": 12.713085234093638, + "grad_norm": 0.1557466983795166, + "learning_rate": 4.926423248775827e-05, + "loss": 0.0333, + "step": 10590 + }, + { + "epoch": 12.725090036014405, + "grad_norm": 0.14268739521503448, + "learning_rate": 4.918156899192826e-05, + "loss": 0.0345, + "step": 10600 + }, + { + "epoch": 12.737094837935174, + "grad_norm": 0.10624710470438004, + "learning_rate": 4.909890773365738e-05, + "loss": 0.0314, + "step": 10610 + }, + { + "epoch": 12.749099639855942, + "grad_norm": 0.1403711587190628, + "learning_rate": 4.9016248938938344e-05, + "loss": 0.0312, + "step": 10620 + }, + { + "epoch": 12.76110444177671, + "grad_norm": 0.15913717448711395, + "learning_rate": 4.8933592833757156e-05, + "loss": 0.0352, + "step": 10630 + }, + { + "epoch": 12.77310924369748, + "grad_norm": 0.1708512306213379, + "learning_rate": 4.8850939644092435e-05, + "loss": 0.0347, + "step": 10640 + }, + { + "epoch": 12.785114045618247, + "grad_norm": 0.1770499050617218, + "learning_rate": 4.876828959591485e-05, + "loss": 0.0321, + "step": 10650 + }, + { + "epoch": 12.797118847539016, + "grad_norm": 0.16435477137565613, + "learning_rate": 4.8685642915186474e-05, + "loss": 0.0302, + "step": 10660 + }, + { + "epoch": 12.809123649459783, + "grad_norm": 0.17166729271411896, + "learning_rate": 4.860299982786018e-05, + "loss": 0.0281, + "step": 10670 + }, + { + "epoch": 12.821128451380552, + "grad_norm": 0.19754496216773987, + "learning_rate": 4.852036055987901e-05, + "loss": 0.0316, + "step": 10680 + }, + { + "epoch": 12.83313325330132, + "grad_norm": 0.19292643666267395, + "learning_rate": 4.843772533717558e-05, + "loss": 0.0358, + "step": 10690 + }, + { + "epoch": 12.845138055222089, + "grad_norm": 0.17378729581832886, + "learning_rate": 4.835509438567142e-05, + "loss": 0.0362, + "step": 10700 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 0.16305848956108093, + "learning_rate": 4.827246793127639e-05, + "loss": 0.0335, + "step": 10710 + }, + { + "epoch": 12.869147659063625, + "grad_norm": 0.15893776714801788, + "learning_rate": 4.818984619988807e-05, + "loss": 0.0342, + "step": 10720 + }, + { + "epoch": 12.881152460984394, + "grad_norm": 0.15545840561389923, + "learning_rate": 4.810722941739115e-05, + "loss": 0.0333, + "step": 10730 + }, + { + "epoch": 12.893157262905163, + "grad_norm": 0.20558786392211914, + "learning_rate": 4.8024617809656684e-05, + "loss": 0.0375, + "step": 10740 + }, + { + "epoch": 12.90516206482593, + "grad_norm": 0.17814666032791138, + "learning_rate": 4.794201160254171e-05, + "loss": 0.0333, + "step": 10750 + }, + { + "epoch": 12.9171668667467, + "grad_norm": 0.16750440001487732, + "learning_rate": 4.785941102188844e-05, + "loss": 0.0333, + "step": 10760 + }, + { + "epoch": 12.929171668667466, + "grad_norm": 0.18934759497642517, + "learning_rate": 4.7776816293523686e-05, + "loss": 0.0318, + "step": 10770 + }, + { + "epoch": 12.941176470588236, + "grad_norm": 0.21593482792377472, + "learning_rate": 4.769422764325832e-05, + "loss": 0.03, + "step": 10780 + }, + { + "epoch": 12.953181272509003, + "grad_norm": 0.15560540556907654, + "learning_rate": 4.76116452968865e-05, + "loss": 0.0336, + "step": 10790 + }, + { + "epoch": 12.965186074429772, + "grad_norm": 0.19137658178806305, + "learning_rate": 4.752906948018525e-05, + "loss": 0.0358, + "step": 10800 + }, + { + "epoch": 12.97719087635054, + "grad_norm": 0.17795032262802124, + "learning_rate": 4.7446500418913684e-05, + "loss": 0.0319, + "step": 10810 + }, + { + "epoch": 12.989195678271308, + "grad_norm": 0.25058308243751526, + "learning_rate": 4.736393833881247e-05, + "loss": 0.037, + "step": 10820 + }, + { + "epoch": 13.001200480192077, + "grad_norm": 0.20166996121406555, + "learning_rate": 4.7281383465603194e-05, + "loss": 0.0301, + "step": 10830 + }, + { + "epoch": 13.013205282112844, + "grad_norm": 0.20873497426509857, + "learning_rate": 4.71988360249877e-05, + "loss": 0.0293, + "step": 10840 + }, + { + "epoch": 13.025210084033613, + "grad_norm": 0.19249585270881653, + "learning_rate": 4.7116296242647554e-05, + "loss": 0.0307, + "step": 10850 + }, + { + "epoch": 13.037214885954382, + "grad_norm": 0.14951194822788239, + "learning_rate": 4.703376434424336e-05, + "loss": 0.0348, + "step": 10860 + }, + { + "epoch": 13.04921968787515, + "grad_norm": 0.18014411628246307, + "learning_rate": 4.695124055541421e-05, + "loss": 0.0316, + "step": 10870 + }, + { + "epoch": 13.061224489795919, + "grad_norm": 0.15121670067310333, + "learning_rate": 4.6868725101776934e-05, + "loss": 0.0339, + "step": 10880 + }, + { + "epoch": 13.073229291716686, + "grad_norm": 0.16152657568454742, + "learning_rate": 4.678621820892567e-05, + "loss": 0.0278, + "step": 10890 + }, + { + "epoch": 13.085234093637455, + "grad_norm": 0.1307085007429123, + "learning_rate": 4.670372010243111e-05, + "loss": 0.0301, + "step": 10900 + }, + { + "epoch": 13.097238895558224, + "grad_norm": 0.19830501079559326, + "learning_rate": 4.662123100783992e-05, + "loss": 0.0312, + "step": 10910 + }, + { + "epoch": 13.109243697478991, + "grad_norm": 0.15733736753463745, + "learning_rate": 4.653875115067415e-05, + "loss": 0.0311, + "step": 10920 + }, + { + "epoch": 13.12124849939976, + "grad_norm": 0.18646450340747833, + "learning_rate": 4.6456280756430545e-05, + "loss": 0.033, + "step": 10930 + }, + { + "epoch": 13.133253301320527, + "grad_norm": 0.15286655724048615, + "learning_rate": 4.637382005058004e-05, + "loss": 0.0355, + "step": 10940 + }, + { + "epoch": 13.145258103241297, + "grad_norm": 0.1772819459438324, + "learning_rate": 4.629136925856705e-05, + "loss": 0.032, + "step": 10950 + }, + { + "epoch": 13.157262905162066, + "grad_norm": 0.1731722205877304, + "learning_rate": 4.6208928605808895e-05, + "loss": 0.0329, + "step": 10960 + }, + { + "epoch": 13.169267707082833, + "grad_norm": 0.20434404909610748, + "learning_rate": 4.612649831769519e-05, + "loss": 0.0287, + "step": 10970 + }, + { + "epoch": 13.181272509003602, + "grad_norm": 0.26561424136161804, + "learning_rate": 4.604407861958715e-05, + "loss": 0.0368, + "step": 10980 + }, + { + "epoch": 13.193277310924369, + "grad_norm": 0.13921359181404114, + "learning_rate": 4.5961669736817114e-05, + "loss": 0.0322, + "step": 10990 + }, + { + "epoch": 13.205282112845138, + "grad_norm": 0.2511043846607208, + "learning_rate": 4.5879271894687814e-05, + "loss": 0.0337, + "step": 11000 + }, + { + "epoch": 13.217286914765907, + "grad_norm": 0.13867050409317017, + "learning_rate": 4.5796885318471826e-05, + "loss": 0.033, + "step": 11010 + }, + { + "epoch": 13.229291716686674, + "grad_norm": 0.15021547675132751, + "learning_rate": 4.571451023341086e-05, + "loss": 0.0293, + "step": 11020 + }, + { + "epoch": 13.241296518607443, + "grad_norm": 0.14700748026371002, + "learning_rate": 4.563214686471527e-05, + "loss": 0.0311, + "step": 11030 + }, + { + "epoch": 13.25330132052821, + "grad_norm": 0.21747539937496185, + "learning_rate": 4.5549795437563365e-05, + "loss": 0.0277, + "step": 11040 + }, + { + "epoch": 13.26530612244898, + "grad_norm": 0.13461296260356903, + "learning_rate": 4.546745617710081e-05, + "loss": 0.0313, + "step": 11050 + }, + { + "epoch": 13.277310924369749, + "grad_norm": 0.1832558959722519, + "learning_rate": 4.5385129308440014e-05, + "loss": 0.0325, + "step": 11060 + }, + { + "epoch": 13.289315726290516, + "grad_norm": 0.12071637809276581, + "learning_rate": 4.530281505665944e-05, + "loss": 0.0321, + "step": 11070 + }, + { + "epoch": 13.301320528211285, + "grad_norm": 0.16151876747608185, + "learning_rate": 4.5220513646803134e-05, + "loss": 0.0313, + "step": 11080 + }, + { + "epoch": 13.313325330132052, + "grad_norm": 0.15702684223651886, + "learning_rate": 4.513822530388003e-05, + "loss": 0.0301, + "step": 11090 + }, + { + "epoch": 13.325330132052821, + "grad_norm": 0.232677161693573, + "learning_rate": 4.5055950252863296e-05, + "loss": 0.0306, + "step": 11100 + }, + { + "epoch": 13.33733493397359, + "grad_norm": 0.18851803243160248, + "learning_rate": 4.4973688718689803e-05, + "loss": 0.0321, + "step": 11110 + }, + { + "epoch": 13.349339735894358, + "grad_norm": 0.15643465518951416, + "learning_rate": 4.4891440926259406e-05, + "loss": 0.0327, + "step": 11120 + }, + { + "epoch": 13.361344537815127, + "grad_norm": 0.16762901842594147, + "learning_rate": 4.480920710043443e-05, + "loss": 0.0293, + "step": 11130 + }, + { + "epoch": 13.373349339735894, + "grad_norm": 0.16319607198238373, + "learning_rate": 4.4726987466039044e-05, + "loss": 0.0341, + "step": 11140 + }, + { + "epoch": 13.385354141656663, + "grad_norm": 0.14183306694030762, + "learning_rate": 4.46447822478586e-05, + "loss": 0.0285, + "step": 11150 + }, + { + "epoch": 13.39735894357743, + "grad_norm": 0.18502908945083618, + "learning_rate": 4.4562591670638974e-05, + "loss": 0.0267, + "step": 11160 + }, + { + "epoch": 13.409363745498199, + "grad_norm": 0.15558622777462006, + "learning_rate": 4.4480415959086105e-05, + "loss": 0.0289, + "step": 11170 + }, + { + "epoch": 13.421368547418968, + "grad_norm": 0.10113747417926788, + "learning_rate": 4.439825533786522e-05, + "loss": 0.0314, + "step": 11180 + }, + { + "epoch": 13.433373349339735, + "grad_norm": 0.22371365129947662, + "learning_rate": 4.431611003160035e-05, + "loss": 0.0324, + "step": 11190 + }, + { + "epoch": 13.445378151260504, + "grad_norm": 0.18475349247455597, + "learning_rate": 4.4233980264873636e-05, + "loss": 0.0334, + "step": 11200 + }, + { + "epoch": 13.457382953181273, + "grad_norm": 0.2153368592262268, + "learning_rate": 4.4151866262224684e-05, + "loss": 0.0313, + "step": 11210 + }, + { + "epoch": 13.46938775510204, + "grad_norm": 0.1269955188035965, + "learning_rate": 4.406976824815006e-05, + "loss": 0.0311, + "step": 11220 + }, + { + "epoch": 13.48139255702281, + "grad_norm": 0.1498347967863083, + "learning_rate": 4.3987686447102595e-05, + "loss": 0.0288, + "step": 11230 + }, + { + "epoch": 13.493397358943577, + "grad_norm": 0.15554887056350708, + "learning_rate": 4.3905621083490804e-05, + "loss": 0.0372, + "step": 11240 + }, + { + "epoch": 13.505402160864346, + "grad_norm": 0.18272952735424042, + "learning_rate": 4.3823572381678286e-05, + "loss": 0.0318, + "step": 11250 + }, + { + "epoch": 13.517406962785113, + "grad_norm": 0.17770974338054657, + "learning_rate": 4.374154056598301e-05, + "loss": 0.0292, + "step": 11260 + }, + { + "epoch": 13.529411764705882, + "grad_norm": 0.23590652644634247, + "learning_rate": 4.3659525860676845e-05, + "loss": 0.033, + "step": 11270 + }, + { + "epoch": 13.541416566626651, + "grad_norm": 0.16027045249938965, + "learning_rate": 4.3577528489984854e-05, + "loss": 0.0294, + "step": 11280 + }, + { + "epoch": 13.553421368547419, + "grad_norm": 0.11890041083097458, + "learning_rate": 4.349554867808476e-05, + "loss": 0.0295, + "step": 11290 + }, + { + "epoch": 13.565426170468188, + "grad_norm": 0.09067125618457794, + "learning_rate": 4.34135866491062e-05, + "loss": 0.0321, + "step": 11300 + }, + { + "epoch": 13.577430972388955, + "grad_norm": 0.14218930900096893, + "learning_rate": 4.333164262713022e-05, + "loss": 0.0316, + "step": 11310 + }, + { + "epoch": 13.589435774309724, + "grad_norm": 0.1378699094057083, + "learning_rate": 4.324971683618868e-05, + "loss": 0.0272, + "step": 11320 + }, + { + "epoch": 13.601440576230493, + "grad_norm": 0.12394767254590988, + "learning_rate": 4.316780950026354e-05, + "loss": 0.0298, + "step": 11330 + }, + { + "epoch": 13.61344537815126, + "grad_norm": 0.1398560106754303, + "learning_rate": 4.308592084328637e-05, + "loss": 0.0316, + "step": 11340 + }, + { + "epoch": 13.62545018007203, + "grad_norm": 0.10276425629854202, + "learning_rate": 4.3004051089137576e-05, + "loss": 0.032, + "step": 11350 + }, + { + "epoch": 13.637454981992796, + "grad_norm": 0.17750854790210724, + "learning_rate": 4.292220046164597e-05, + "loss": 0.0345, + "step": 11360 + }, + { + "epoch": 13.649459783913565, + "grad_norm": 0.15469428896903992, + "learning_rate": 4.2840369184588035e-05, + "loss": 0.0291, + "step": 11370 + }, + { + "epoch": 13.661464585834334, + "grad_norm": 0.16742867231369019, + "learning_rate": 4.2758557481687345e-05, + "loss": 0.0326, + "step": 11380 + }, + { + "epoch": 13.673469387755102, + "grad_norm": 0.1339283585548401, + "learning_rate": 4.267676557661403e-05, + "loss": 0.0311, + "step": 11390 + }, + { + "epoch": 13.68547418967587, + "grad_norm": 0.15246446430683136, + "learning_rate": 4.2594993692983955e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 13.697478991596638, + "grad_norm": 0.13272404670715332, + "learning_rate": 4.251324205435837e-05, + "loss": 0.0322, + "step": 11410 + }, + { + "epoch": 13.709483793517407, + "grad_norm": 0.20758327841758728, + "learning_rate": 4.243151088424312e-05, + "loss": 0.0342, + "step": 11420 + }, + { + "epoch": 13.721488595438176, + "grad_norm": 0.1464661806821823, + "learning_rate": 4.234980040608813e-05, + "loss": 0.029, + "step": 11430 + }, + { + "epoch": 13.733493397358943, + "grad_norm": 0.12145654112100601, + "learning_rate": 4.22681108432867e-05, + "loss": 0.0324, + "step": 11440 + }, + { + "epoch": 13.745498199279712, + "grad_norm": 0.2272828072309494, + "learning_rate": 4.2186442419174984e-05, + "loss": 0.0294, + "step": 11450 + }, + { + "epoch": 13.75750300120048, + "grad_norm": 0.14333264529705048, + "learning_rate": 4.210479535703133e-05, + "loss": 0.029, + "step": 11460 + }, + { + "epoch": 13.769507803121249, + "grad_norm": 0.1611536294221878, + "learning_rate": 4.202316988007567e-05, + "loss": 0.0309, + "step": 11470 + }, + { + "epoch": 13.781512605042018, + "grad_norm": 0.14833322167396545, + "learning_rate": 4.194156621146901e-05, + "loss": 0.0278, + "step": 11480 + }, + { + "epoch": 13.793517406962785, + "grad_norm": 0.13917264342308044, + "learning_rate": 4.1859984574312596e-05, + "loss": 0.0302, + "step": 11490 + }, + { + "epoch": 13.805522208883554, + "grad_norm": 0.13832975924015045, + "learning_rate": 4.177842519164752e-05, + "loss": 0.0298, + "step": 11500 + }, + { + "epoch": 13.817527010804321, + "grad_norm": 0.16262377798557281, + "learning_rate": 4.169688828645404e-05, + "loss": 0.0272, + "step": 11510 + }, + { + "epoch": 13.82953181272509, + "grad_norm": 0.18487125635147095, + "learning_rate": 4.161537408165092e-05, + "loss": 0.0302, + "step": 11520 + }, + { + "epoch": 13.84153661464586, + "grad_norm": 0.15162165462970734, + "learning_rate": 4.1533882800094924e-05, + "loss": 0.0311, + "step": 11530 + }, + { + "epoch": 13.853541416566626, + "grad_norm": 0.1538446992635727, + "learning_rate": 4.145241466458005e-05, + "loss": 0.032, + "step": 11540 + }, + { + "epoch": 13.865546218487395, + "grad_norm": 0.16242457926273346, + "learning_rate": 4.13709698978371e-05, + "loss": 0.032, + "step": 11550 + }, + { + "epoch": 13.877551020408163, + "grad_norm": 0.13888831436634064, + "learning_rate": 4.1289548722532944e-05, + "loss": 0.0296, + "step": 11560 + }, + { + "epoch": 13.889555822328932, + "grad_norm": 0.15558509528636932, + "learning_rate": 4.120815136126999e-05, + "loss": 0.0328, + "step": 11570 + }, + { + "epoch": 13.9015606242497, + "grad_norm": 0.13373999297618866, + "learning_rate": 4.112677803658548e-05, + "loss": 0.0316, + "step": 11580 + }, + { + "epoch": 13.913565426170468, + "grad_norm": 0.10202693194150925, + "learning_rate": 4.1045428970951e-05, + "loss": 0.031, + "step": 11590 + }, + { + "epoch": 13.925570228091237, + "grad_norm": 0.16313385963439941, + "learning_rate": 4.0964104386771785e-05, + "loss": 0.0296, + "step": 11600 + }, + { + "epoch": 13.937575030012004, + "grad_norm": 0.23488736152648926, + "learning_rate": 4.0882804506386144e-05, + "loss": 0.0294, + "step": 11610 + }, + { + "epoch": 13.949579831932773, + "grad_norm": 0.13064809143543243, + "learning_rate": 4.080152955206485e-05, + "loss": 0.0314, + "step": 11620 + }, + { + "epoch": 13.96158463385354, + "grad_norm": 0.18415741622447968, + "learning_rate": 4.0720279746010505e-05, + "loss": 0.0347, + "step": 11630 + }, + { + "epoch": 13.97358943577431, + "grad_norm": 0.16618268191814423, + "learning_rate": 4.063905531035699e-05, + "loss": 0.0309, + "step": 11640 + }, + { + "epoch": 13.985594237695079, + "grad_norm": 0.16811901330947876, + "learning_rate": 4.055785646716882e-05, + "loss": 0.034, + "step": 11650 + }, + { + "epoch": 13.997599039615846, + "grad_norm": 0.1653086096048355, + "learning_rate": 4.047668343844051e-05, + "loss": 0.0319, + "step": 11660 + }, + { + "epoch": 14.009603841536615, + "grad_norm": 0.10807102173566818, + "learning_rate": 4.039553644609604e-05, + "loss": 0.0286, + "step": 11670 + }, + { + "epoch": 14.021608643457382, + "grad_norm": 0.15677005052566528, + "learning_rate": 4.0314415711988176e-05, + "loss": 0.0271, + "step": 11680 + }, + { + "epoch": 14.033613445378151, + "grad_norm": 0.1050342544913292, + "learning_rate": 4.023332145789792e-05, + "loss": 0.0307, + "step": 11690 + }, + { + "epoch": 14.04561824729892, + "grad_norm": 0.15215755999088287, + "learning_rate": 4.015225390553385e-05, + "loss": 0.028, + "step": 11700 + }, + { + "epoch": 14.057623049219687, + "grad_norm": 0.2366170883178711, + "learning_rate": 4.007121327653158e-05, + "loss": 0.0286, + "step": 11710 + }, + { + "epoch": 14.069627851140456, + "grad_norm": 0.20510484278202057, + "learning_rate": 3.9990199792453064e-05, + "loss": 0.0285, + "step": 11720 + }, + { + "epoch": 14.081632653061224, + "grad_norm": 0.14430855214595795, + "learning_rate": 3.9909213674786103e-05, + "loss": 0.031, + "step": 11730 + }, + { + "epoch": 14.093637454981993, + "grad_norm": 0.18818648159503937, + "learning_rate": 3.982825514494363e-05, + "loss": 0.0315, + "step": 11740 + }, + { + "epoch": 14.105642256902762, + "grad_norm": 0.12248379737138748, + "learning_rate": 3.974732442426319e-05, + "loss": 0.0302, + "step": 11750 + }, + { + "epoch": 14.117647058823529, + "grad_norm": 0.17842459678649902, + "learning_rate": 3.966642173400629e-05, + "loss": 0.034, + "step": 11760 + }, + { + "epoch": 14.129651860744298, + "grad_norm": 0.15299005806446075, + "learning_rate": 3.9585547295357764e-05, + "loss": 0.0315, + "step": 11770 + }, + { + "epoch": 14.141656662665065, + "grad_norm": 0.16563361883163452, + "learning_rate": 3.950470132942526e-05, + "loss": 0.0267, + "step": 11780 + }, + { + "epoch": 14.153661464585834, + "grad_norm": 0.11991439014673233, + "learning_rate": 3.942388405723856e-05, + "loss": 0.0263, + "step": 11790 + }, + { + "epoch": 14.165666266506603, + "grad_norm": 0.16744470596313477, + "learning_rate": 3.9343095699749e-05, + "loss": 0.0329, + "step": 11800 + }, + { + "epoch": 14.17767106842737, + "grad_norm": 0.11292163282632828, + "learning_rate": 3.9262336477828874e-05, + "loss": 0.0298, + "step": 11810 + }, + { + "epoch": 14.18967587034814, + "grad_norm": 0.1414865255355835, + "learning_rate": 3.9181606612270794e-05, + "loss": 0.0309, + "step": 11820 + }, + { + "epoch": 14.201680672268907, + "grad_norm": 0.16028372943401337, + "learning_rate": 3.910090632378713e-05, + "loss": 0.0304, + "step": 11830 + }, + { + "epoch": 14.213685474189676, + "grad_norm": 0.1566227823495865, + "learning_rate": 3.90202358330094e-05, + "loss": 0.0315, + "step": 11840 + }, + { + "epoch": 14.225690276110445, + "grad_norm": 0.1410282850265503, + "learning_rate": 3.8939595360487656e-05, + "loss": 0.0296, + "step": 11850 + }, + { + "epoch": 14.237695078031212, + "grad_norm": 0.15207898616790771, + "learning_rate": 3.885898512668984e-05, + "loss": 0.0295, + "step": 11860 + }, + { + "epoch": 14.249699879951981, + "grad_norm": 0.17018800973892212, + "learning_rate": 3.877840535200127e-05, + "loss": 0.0298, + "step": 11870 + }, + { + "epoch": 14.261704681872748, + "grad_norm": 0.13706082105636597, + "learning_rate": 3.869785625672397e-05, + "loss": 0.0352, + "step": 11880 + }, + { + "epoch": 14.273709483793517, + "grad_norm": 0.17707355320453644, + "learning_rate": 3.8617338061076094e-05, + "loss": 0.0325, + "step": 11890 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 0.20351926982402802, + "learning_rate": 3.853685098519132e-05, + "loss": 0.027, + "step": 11900 + }, + { + "epoch": 14.297719087635054, + "grad_norm": 0.18687346577644348, + "learning_rate": 3.845639524911823e-05, + "loss": 0.0308, + "step": 11910 + }, + { + "epoch": 14.309723889555823, + "grad_norm": 0.1725032776594162, + "learning_rate": 3.837597107281974e-05, + "loss": 0.0283, + "step": 11920 + }, + { + "epoch": 14.32172869147659, + "grad_norm": 0.16488805413246155, + "learning_rate": 3.829557867617247e-05, + "loss": 0.0315, + "step": 11930 + }, + { + "epoch": 14.333733493397359, + "grad_norm": 0.2493484765291214, + "learning_rate": 3.821521827896618e-05, + "loss": 0.0324, + "step": 11940 + }, + { + "epoch": 14.345738295318128, + "grad_norm": 0.18446896970272064, + "learning_rate": 3.81348901009031e-05, + "loss": 0.0328, + "step": 11950 + }, + { + "epoch": 14.357743097238895, + "grad_norm": 0.1447513848543167, + "learning_rate": 3.805459436159741e-05, + "loss": 0.0277, + "step": 11960 + }, + { + "epoch": 14.369747899159664, + "grad_norm": 0.1754480004310608, + "learning_rate": 3.797433128057461e-05, + "loss": 0.0352, + "step": 11970 + }, + { + "epoch": 14.381752701080432, + "grad_norm": 0.14233237504959106, + "learning_rate": 3.789410107727089e-05, + "loss": 0.029, + "step": 11980 + }, + { + "epoch": 14.3937575030012, + "grad_norm": 0.14556150138378143, + "learning_rate": 3.781390397103257e-05, + "loss": 0.0286, + "step": 11990 + }, + { + "epoch": 14.40576230492197, + "grad_norm": 0.2353033423423767, + "learning_rate": 3.7733740181115455e-05, + "loss": 0.0305, + "step": 12000 + }, + { + "epoch": 14.417767106842737, + "grad_norm": 0.1686706840991974, + "learning_rate": 3.7653609926684306e-05, + "loss": 0.0305, + "step": 12010 + }, + { + "epoch": 14.429771908763506, + "grad_norm": 0.1818300485610962, + "learning_rate": 3.757351342681217e-05, + "loss": 0.0301, + "step": 12020 + }, + { + "epoch": 14.441776710684273, + "grad_norm": 0.22708076238632202, + "learning_rate": 3.749345090047982e-05, + "loss": 0.0303, + "step": 12030 + }, + { + "epoch": 14.453781512605042, + "grad_norm": 0.21978455781936646, + "learning_rate": 3.741342256657515e-05, + "loss": 0.0296, + "step": 12040 + }, + { + "epoch": 14.465786314525811, + "grad_norm": 0.1693282276391983, + "learning_rate": 3.7333428643892567e-05, + "loss": 0.0309, + "step": 12050 + }, + { + "epoch": 14.477791116446578, + "grad_norm": 0.15741223096847534, + "learning_rate": 3.725346935113239e-05, + "loss": 0.0299, + "step": 12060 + }, + { + "epoch": 14.489795918367347, + "grad_norm": 0.15638475120067596, + "learning_rate": 3.717354490690029e-05, + "loss": 0.0307, + "step": 12070 + }, + { + "epoch": 14.501800720288115, + "grad_norm": 0.17425619065761566, + "learning_rate": 3.709365552970664e-05, + "loss": 0.0259, + "step": 12080 + }, + { + "epoch": 14.513805522208884, + "grad_norm": 0.15588626265525818, + "learning_rate": 3.7013801437965945e-05, + "loss": 0.0313, + "step": 12090 + }, + { + "epoch": 14.525810324129651, + "grad_norm": 0.10860875248908997, + "learning_rate": 3.693398284999623e-05, + "loss": 0.0278, + "step": 12100 + }, + { + "epoch": 14.53781512605042, + "grad_norm": 0.13515517115592957, + "learning_rate": 3.6854199984018484e-05, + "loss": 0.032, + "step": 12110 + }, + { + "epoch": 14.549819927971189, + "grad_norm": 0.12871921062469482, + "learning_rate": 3.677445305815601e-05, + "loss": 0.0293, + "step": 12120 + }, + { + "epoch": 14.561824729891956, + "grad_norm": 0.1578647643327713, + "learning_rate": 3.669474229043387e-05, + "loss": 0.0288, + "step": 12130 + }, + { + "epoch": 14.573829531812725, + "grad_norm": 0.18135696649551392, + "learning_rate": 3.6615067898778235e-05, + "loss": 0.0285, + "step": 12140 + }, + { + "epoch": 14.585834333733493, + "grad_norm": 0.1525477021932602, + "learning_rate": 3.6535430101015866e-05, + "loss": 0.0298, + "step": 12150 + }, + { + "epoch": 14.597839135654262, + "grad_norm": 0.17381416261196136, + "learning_rate": 3.645582911487345e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 14.60984393757503, + "grad_norm": 0.13748063147068024, + "learning_rate": 3.637626515797706e-05, + "loss": 0.031, + "step": 12170 + }, + { + "epoch": 14.621848739495798, + "grad_norm": 0.14160384237766266, + "learning_rate": 3.629673844785152e-05, + "loss": 0.0294, + "step": 12180 + }, + { + "epoch": 14.633853541416567, + "grad_norm": 0.1339806765317917, + "learning_rate": 3.621724920191979e-05, + "loss": 0.0341, + "step": 12190 + }, + { + "epoch": 14.645858343337334, + "grad_norm": 0.12227331101894379, + "learning_rate": 3.6137797637502444e-05, + "loss": 0.0288, + "step": 12200 + }, + { + "epoch": 14.657863145258103, + "grad_norm": 0.1355089545249939, + "learning_rate": 3.6058383971817035e-05, + "loss": 0.0309, + "step": 12210 + }, + { + "epoch": 14.669867947178872, + "grad_norm": 0.20199604332447052, + "learning_rate": 3.59790084219775e-05, + "loss": 0.0331, + "step": 12220 + }, + { + "epoch": 14.68187274909964, + "grad_norm": 0.13081979751586914, + "learning_rate": 3.589967120499353e-05, + "loss": 0.0258, + "step": 12230 + }, + { + "epoch": 14.693877551020408, + "grad_norm": 0.24102774262428284, + "learning_rate": 3.5820372537770075e-05, + "loss": 0.0319, + "step": 12240 + }, + { + "epoch": 14.705882352941176, + "grad_norm": 0.18084609508514404, + "learning_rate": 3.5741112637106655e-05, + "loss": 0.0301, + "step": 12250 + }, + { + "epoch": 14.717887154861945, + "grad_norm": 0.1716890037059784, + "learning_rate": 3.5661891719696804e-05, + "loss": 0.0266, + "step": 12260 + }, + { + "epoch": 14.729891956782714, + "grad_norm": 0.14061002433300018, + "learning_rate": 3.5582710002127504e-05, + "loss": 0.026, + "step": 12270 + }, + { + "epoch": 14.741896758703481, + "grad_norm": 0.16676115989685059, + "learning_rate": 3.550356770087853e-05, + "loss": 0.0318, + "step": 12280 + }, + { + "epoch": 14.75390156062425, + "grad_norm": 0.09384230524301529, + "learning_rate": 3.5424465032321914e-05, + "loss": 0.0248, + "step": 12290 + }, + { + "epoch": 14.765906362545017, + "grad_norm": 0.2123066633939743, + "learning_rate": 3.5345402212721335e-05, + "loss": 0.0301, + "step": 12300 + }, + { + "epoch": 14.777911164465786, + "grad_norm": 0.16278235614299774, + "learning_rate": 3.526637945823152e-05, + "loss": 0.0327, + "step": 12310 + }, + { + "epoch": 14.789915966386555, + "grad_norm": 0.14843812584877014, + "learning_rate": 3.518739698489767e-05, + "loss": 0.0304, + "step": 12320 + }, + { + "epoch": 14.801920768307323, + "grad_norm": 0.13912516832351685, + "learning_rate": 3.510845500865485e-05, + "loss": 0.028, + "step": 12330 + }, + { + "epoch": 14.813925570228092, + "grad_norm": 0.17703533172607422, + "learning_rate": 3.502955374532739e-05, + "loss": 0.0272, + "step": 12340 + }, + { + "epoch": 14.825930372148859, + "grad_norm": 0.12442536652088165, + "learning_rate": 3.495069341062836e-05, + "loss": 0.0316, + "step": 12350 + }, + { + "epoch": 14.837935174069628, + "grad_norm": 0.21479922533035278, + "learning_rate": 3.4871874220158896e-05, + "loss": 0.0274, + "step": 12360 + }, + { + "epoch": 14.849939975990397, + "grad_norm": 0.10379406064748764, + "learning_rate": 3.479309638940762e-05, + "loss": 0.0273, + "step": 12370 + }, + { + "epoch": 14.861944777911164, + "grad_norm": 0.12830817699432373, + "learning_rate": 3.4714360133750146e-05, + "loss": 0.0315, + "step": 12380 + }, + { + "epoch": 14.873949579831933, + "grad_norm": 0.1771925687789917, + "learning_rate": 3.463566566844839e-05, + "loss": 0.0298, + "step": 12390 + }, + { + "epoch": 14.8859543817527, + "grad_norm": 0.16118048131465912, + "learning_rate": 3.4557013208650016e-05, + "loss": 0.0303, + "step": 12400 + }, + { + "epoch": 14.89795918367347, + "grad_norm": 0.2210869938135147, + "learning_rate": 3.4478402969387857e-05, + "loss": 0.031, + "step": 12410 + }, + { + "epoch": 14.909963985594239, + "grad_norm": 0.09756149351596832, + "learning_rate": 3.4399835165579266e-05, + "loss": 0.0266, + "step": 12420 + }, + { + "epoch": 14.921968787515006, + "grad_norm": 0.13837917149066925, + "learning_rate": 3.4321310012025645e-05, + "loss": 0.0281, + "step": 12430 + }, + { + "epoch": 14.933973589435775, + "grad_norm": 0.18422377109527588, + "learning_rate": 3.424282772341176e-05, + "loss": 0.0296, + "step": 12440 + }, + { + "epoch": 14.945978391356542, + "grad_norm": 0.1621708869934082, + "learning_rate": 3.416438851430519e-05, + "loss": 0.0274, + "step": 12450 + }, + { + "epoch": 14.957983193277311, + "grad_norm": 0.12897968292236328, + "learning_rate": 3.408599259915577e-05, + "loss": 0.0277, + "step": 12460 + }, + { + "epoch": 14.969987995198078, + "grad_norm": 0.18135420978069305, + "learning_rate": 3.400764019229487e-05, + "loss": 0.0262, + "step": 12470 + }, + { + "epoch": 14.981992797118847, + "grad_norm": 0.16789481043815613, + "learning_rate": 3.3929331507935035e-05, + "loss": 0.0314, + "step": 12480 + }, + { + "epoch": 14.993997599039616, + "grad_norm": 0.16956989467144012, + "learning_rate": 3.3851066760169196e-05, + "loss": 0.0329, + "step": 12490 + }, + { + "epoch": 15.006002400960384, + "grad_norm": 0.1477442979812622, + "learning_rate": 3.377284616297021e-05, + "loss": 0.034, + "step": 12500 + }, + { + "epoch": 15.018007202881153, + "grad_norm": 0.13558384776115417, + "learning_rate": 3.3694669930190166e-05, + "loss": 0.0277, + "step": 12510 + }, + { + "epoch": 15.03001200480192, + "grad_norm": 0.17038513720035553, + "learning_rate": 3.36165382755599e-05, + "loss": 0.0299, + "step": 12520 + }, + { + "epoch": 15.042016806722689, + "grad_norm": 0.1499556005001068, + "learning_rate": 3.35384514126884e-05, + "loss": 0.0268, + "step": 12530 + }, + { + "epoch": 15.054021608643458, + "grad_norm": 0.1540243923664093, + "learning_rate": 3.3460409555062154e-05, + "loss": 0.0325, + "step": 12540 + }, + { + "epoch": 15.066026410564225, + "grad_norm": 0.126227468252182, + "learning_rate": 3.3382412916044645e-05, + "loss": 0.0273, + "step": 12550 + }, + { + "epoch": 15.078031212484994, + "grad_norm": 0.10985071212053299, + "learning_rate": 3.330446170887566e-05, + "loss": 0.0264, + "step": 12560 + }, + { + "epoch": 15.090036014405761, + "grad_norm": 0.11205596476793289, + "learning_rate": 3.3226556146670834e-05, + "loss": 0.0301, + "step": 12570 + }, + { + "epoch": 15.10204081632653, + "grad_norm": 0.13825348019599915, + "learning_rate": 3.314869644242102e-05, + "loss": 0.0277, + "step": 12580 + }, + { + "epoch": 15.1140456182473, + "grad_norm": 0.13325649499893188, + "learning_rate": 3.3070882808991674e-05, + "loss": 0.0298, + "step": 12590 + }, + { + "epoch": 15.126050420168067, + "grad_norm": 0.16662487387657166, + "learning_rate": 3.2993115459122305e-05, + "loss": 0.0309, + "step": 12600 + }, + { + "epoch": 15.138055222088836, + "grad_norm": 0.1488875150680542, + "learning_rate": 3.2915394605425835e-05, + "loss": 0.0255, + "step": 12610 + }, + { + "epoch": 15.150060024009603, + "grad_norm": 0.16361911594867706, + "learning_rate": 3.283772046038816e-05, + "loss": 0.0249, + "step": 12620 + }, + { + "epoch": 15.162064825930372, + "grad_norm": 0.17871950566768646, + "learning_rate": 3.276009323636739e-05, + "loss": 0.0273, + "step": 12630 + }, + { + "epoch": 15.174069627851141, + "grad_norm": 0.12992435693740845, + "learning_rate": 3.268251314559344e-05, + "loss": 0.0275, + "step": 12640 + }, + { + "epoch": 15.186074429771908, + "grad_norm": 0.14480508863925934, + "learning_rate": 3.2604980400167254e-05, + "loss": 0.0276, + "step": 12650 + }, + { + "epoch": 15.198079231692677, + "grad_norm": 0.22693277895450592, + "learning_rate": 3.252749521206042e-05, + "loss": 0.0281, + "step": 12660 + }, + { + "epoch": 15.210084033613445, + "grad_norm": 0.20933814346790314, + "learning_rate": 3.2450057793114494e-05, + "loss": 0.0259, + "step": 12670 + }, + { + "epoch": 15.222088835534214, + "grad_norm": 0.1797967553138733, + "learning_rate": 3.2372668355040435e-05, + "loss": 0.0292, + "step": 12680 + }, + { + "epoch": 15.234093637454983, + "grad_norm": 0.17870421707630157, + "learning_rate": 3.2295327109418005e-05, + "loss": 0.028, + "step": 12690 + }, + { + "epoch": 15.24609843937575, + "grad_norm": 0.15499332547187805, + "learning_rate": 3.221803426769518e-05, + "loss": 0.0284, + "step": 12700 + }, + { + "epoch": 15.258103241296519, + "grad_norm": 0.17129755020141602, + "learning_rate": 3.214079004118768e-05, + "loss": 0.0256, + "step": 12710 + }, + { + "epoch": 15.270108043217286, + "grad_norm": 0.13312792778015137, + "learning_rate": 3.2063594641078234e-05, + "loss": 0.0262, + "step": 12720 + }, + { + "epoch": 15.282112845138055, + "grad_norm": 0.19515082240104675, + "learning_rate": 3.198644827841616e-05, + "loss": 0.0324, + "step": 12730 + }, + { + "epoch": 15.294117647058824, + "grad_norm": 0.13378573954105377, + "learning_rate": 3.1909351164116654e-05, + "loss": 0.0308, + "step": 12740 + }, + { + "epoch": 15.306122448979592, + "grad_norm": 0.19103966653347015, + "learning_rate": 3.183230350896026e-05, + "loss": 0.0246, + "step": 12750 + }, + { + "epoch": 15.31812725090036, + "grad_norm": 0.1666073054075241, + "learning_rate": 3.1755305523592337e-05, + "loss": 0.0277, + "step": 12760 + }, + { + "epoch": 15.330132052821128, + "grad_norm": 0.1381581425666809, + "learning_rate": 3.167835741852245e-05, + "loss": 0.0303, + "step": 12770 + }, + { + "epoch": 15.342136854741897, + "grad_norm": 0.165957510471344, + "learning_rate": 3.160145940412378e-05, + "loss": 0.0315, + "step": 12780 + }, + { + "epoch": 15.354141656662666, + "grad_norm": 0.2226068079471588, + "learning_rate": 3.1524611690632545e-05, + "loss": 0.0308, + "step": 12790 + }, + { + "epoch": 15.366146458583433, + "grad_norm": 0.12918299436569214, + "learning_rate": 3.144781448814746e-05, + "loss": 0.0273, + "step": 12800 + }, + { + "epoch": 15.378151260504202, + "grad_norm": 0.1401062309741974, + "learning_rate": 3.1371068006629145e-05, + "loss": 0.0283, + "step": 12810 + }, + { + "epoch": 15.39015606242497, + "grad_norm": 0.12118790298700333, + "learning_rate": 3.129437245589956e-05, + "loss": 0.027, + "step": 12820 + }, + { + "epoch": 15.402160864345738, + "grad_norm": 0.1875918060541153, + "learning_rate": 3.121772804564143e-05, + "loss": 0.0292, + "step": 12830 + }, + { + "epoch": 15.414165666266507, + "grad_norm": 0.1327325403690338, + "learning_rate": 3.11411349853976e-05, + "loss": 0.0281, + "step": 12840 + }, + { + "epoch": 15.426170468187275, + "grad_norm": 0.16790775954723358, + "learning_rate": 3.10645934845706e-05, + "loss": 0.0282, + "step": 12850 + }, + { + "epoch": 15.438175270108044, + "grad_norm": 0.1571805477142334, + "learning_rate": 3.098810375242196e-05, + "loss": 0.0279, + "step": 12860 + }, + { + "epoch": 15.450180072028811, + "grad_norm": 0.1049916222691536, + "learning_rate": 3.0911665998071704e-05, + "loss": 0.0284, + "step": 12870 + }, + { + "epoch": 15.46218487394958, + "grad_norm": 0.14685103297233582, + "learning_rate": 3.083528043049774e-05, + "loss": 0.0291, + "step": 12880 + }, + { + "epoch": 15.474189675870349, + "grad_norm": 0.12812204658985138, + "learning_rate": 3.0758947258535255e-05, + "loss": 0.0301, + "step": 12890 + }, + { + "epoch": 15.486194477791116, + "grad_norm": 0.22347183525562286, + "learning_rate": 3.068266669087625e-05, + "loss": 0.0282, + "step": 12900 + }, + { + "epoch": 15.498199279711885, + "grad_norm": 0.163607656955719, + "learning_rate": 3.060643893606887e-05, + "loss": 0.0285, + "step": 12910 + }, + { + "epoch": 15.510204081632653, + "grad_norm": 0.2703167796134949, + "learning_rate": 3.053026420251693e-05, + "loss": 0.0281, + "step": 12920 + }, + { + "epoch": 15.522208883553422, + "grad_norm": 0.1689223349094391, + "learning_rate": 3.0454142698479183e-05, + "loss": 0.0279, + "step": 12930 + }, + { + "epoch": 15.534213685474189, + "grad_norm": 0.1401679366827011, + "learning_rate": 3.0378074632068954e-05, + "loss": 0.0296, + "step": 12940 + }, + { + "epoch": 15.546218487394958, + "grad_norm": 0.14167532324790955, + "learning_rate": 3.0302060211253408e-05, + "loss": 0.0279, + "step": 12950 + }, + { + "epoch": 15.558223289315727, + "grad_norm": 0.11646001785993576, + "learning_rate": 3.0226099643853073e-05, + "loss": 0.0262, + "step": 12960 + }, + { + "epoch": 15.570228091236494, + "grad_norm": 0.15978814661502838, + "learning_rate": 3.0150193137541283e-05, + "loss": 0.0273, + "step": 12970 + }, + { + "epoch": 15.582232893157263, + "grad_norm": 0.26402613520622253, + "learning_rate": 3.0074340899843467e-05, + "loss": 0.0314, + "step": 12980 + }, + { + "epoch": 15.594237695078032, + "grad_norm": 0.16071078181266785, + "learning_rate": 2.999854313813677e-05, + "loss": 0.0251, + "step": 12990 + }, + { + "epoch": 15.6062424969988, + "grad_norm": 0.17882882058620453, + "learning_rate": 2.9922800059649382e-05, + "loss": 0.0298, + "step": 13000 + }, + { + "epoch": 15.618247298919568, + "grad_norm": 0.15540507435798645, + "learning_rate": 2.9847111871459976e-05, + "loss": 0.0311, + "step": 13010 + }, + { + "epoch": 15.630252100840336, + "grad_norm": 0.14137017726898193, + "learning_rate": 2.977147878049721e-05, + "loss": 0.031, + "step": 13020 + }, + { + "epoch": 15.642256902761105, + "grad_norm": 0.210642009973526, + "learning_rate": 2.9695900993539006e-05, + "loss": 0.0328, + "step": 13030 + }, + { + "epoch": 15.654261704681872, + "grad_norm": 0.16185788810253143, + "learning_rate": 2.9620378717212183e-05, + "loss": 0.0225, + "step": 13040 + }, + { + "epoch": 15.666266506602641, + "grad_norm": 0.13715985417366028, + "learning_rate": 2.9544912157991745e-05, + "loss": 0.0294, + "step": 13050 + }, + { + "epoch": 15.67827130852341, + "grad_norm": 0.14912548661231995, + "learning_rate": 2.9469501522200405e-05, + "loss": 0.0272, + "step": 13060 + }, + { + "epoch": 15.690276110444177, + "grad_norm": 0.11403299123048782, + "learning_rate": 2.9394147016007946e-05, + "loss": 0.0287, + "step": 13070 + }, + { + "epoch": 15.702280912364946, + "grad_norm": 0.10067326575517654, + "learning_rate": 2.9318848845430702e-05, + "loss": 0.0248, + "step": 13080 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 0.1353403627872467, + "learning_rate": 2.9243607216331013e-05, + "loss": 0.0281, + "step": 13090 + }, + { + "epoch": 15.726290516206483, + "grad_norm": 0.1366606205701828, + "learning_rate": 2.916842233441661e-05, + "loss": 0.0286, + "step": 13100 + }, + { + "epoch": 15.738295318127252, + "grad_norm": 0.1578763723373413, + "learning_rate": 2.90932944052401e-05, + "loss": 0.0278, + "step": 13110 + }, + { + "epoch": 15.750300120048019, + "grad_norm": 0.11557815223932266, + "learning_rate": 2.9018223634198354e-05, + "loss": 0.027, + "step": 13120 + }, + { + "epoch": 15.762304921968788, + "grad_norm": 0.13774777948856354, + "learning_rate": 2.8943210226532025e-05, + "loss": 0.026, + "step": 13130 + }, + { + "epoch": 15.774309723889555, + "grad_norm": 0.2051314264535904, + "learning_rate": 2.8868254387324857e-05, + "loss": 0.0305, + "step": 13140 + }, + { + "epoch": 15.786314525810324, + "grad_norm": 0.16215163469314575, + "learning_rate": 2.8793356321503306e-05, + "loss": 0.0272, + "step": 13150 + }, + { + "epoch": 15.798319327731093, + "grad_norm": 0.10462100803852081, + "learning_rate": 2.87185162338358e-05, + "loss": 0.03, + "step": 13160 + }, + { + "epoch": 15.81032412965186, + "grad_norm": 0.13645632565021515, + "learning_rate": 2.8643734328932253e-05, + "loss": 0.0262, + "step": 13170 + }, + { + "epoch": 15.82232893157263, + "grad_norm": 0.11811123788356781, + "learning_rate": 2.856901081124359e-05, + "loss": 0.0289, + "step": 13180 + }, + { + "epoch": 15.834333733493397, + "grad_norm": 0.18381276726722717, + "learning_rate": 2.8494345885061002e-05, + "loss": 0.0286, + "step": 13190 + }, + { + "epoch": 15.846338535414166, + "grad_norm": 0.1658981442451477, + "learning_rate": 2.8419739754515616e-05, + "loss": 0.0274, + "step": 13200 + }, + { + "epoch": 15.858343337334935, + "grad_norm": 0.11254499852657318, + "learning_rate": 2.8345192623577666e-05, + "loss": 0.0298, + "step": 13210 + }, + { + "epoch": 15.870348139255702, + "grad_norm": 0.1343431919813156, + "learning_rate": 2.8270704696056193e-05, + "loss": 0.0286, + "step": 13220 + }, + { + "epoch": 15.882352941176471, + "grad_norm": 0.1462465226650238, + "learning_rate": 2.8196276175598367e-05, + "loss": 0.0271, + "step": 13230 + }, + { + "epoch": 15.894357743097238, + "grad_norm": 0.13775040209293365, + "learning_rate": 2.8121907265688884e-05, + "loss": 0.0259, + "step": 13240 + }, + { + "epoch": 15.906362545018007, + "grad_norm": 0.12857657670974731, + "learning_rate": 2.804759816964957e-05, + "loss": 0.0242, + "step": 13250 + }, + { + "epoch": 15.918367346938776, + "grad_norm": 0.17099910974502563, + "learning_rate": 2.797334909063857e-05, + "loss": 0.0247, + "step": 13260 + }, + { + "epoch": 15.930372148859544, + "grad_norm": 0.139494851231575, + "learning_rate": 2.7899160231650056e-05, + "loss": 0.0276, + "step": 13270 + }, + { + "epoch": 15.942376950780313, + "grad_norm": 0.10354705899953842, + "learning_rate": 2.7825031795513585e-05, + "loss": 0.0262, + "step": 13280 + }, + { + "epoch": 15.95438175270108, + "grad_norm": 0.12300069630146027, + "learning_rate": 2.775096398489341e-05, + "loss": 0.0248, + "step": 13290 + }, + { + "epoch": 15.966386554621849, + "grad_norm": 0.13316982984542847, + "learning_rate": 2.7676957002288163e-05, + "loss": 0.0268, + "step": 13300 + }, + { + "epoch": 15.978391356542616, + "grad_norm": 0.18738071620464325, + "learning_rate": 2.760301105003003e-05, + "loss": 0.0285, + "step": 13310 + }, + { + "epoch": 15.990396158463385, + "grad_norm": 0.14831654727458954, + "learning_rate": 2.752912633028446e-05, + "loss": 0.0246, + "step": 13320 + }, + { + "epoch": 16.002400960384154, + "grad_norm": 0.1398947834968567, + "learning_rate": 2.7455303045049474e-05, + "loss": 0.027, + "step": 13330 + }, + { + "epoch": 16.01440576230492, + "grad_norm": 0.14773908257484436, + "learning_rate": 2.7381541396155098e-05, + "loss": 0.0305, + "step": 13340 + }, + { + "epoch": 16.02641056422569, + "grad_norm": 0.11121239513158798, + "learning_rate": 2.730784158526286e-05, + "loss": 0.0238, + "step": 13350 + }, + { + "epoch": 16.03841536614646, + "grad_norm": 0.13719697296619415, + "learning_rate": 2.723420381386521e-05, + "loss": 0.0247, + "step": 13360 + }, + { + "epoch": 16.050420168067227, + "grad_norm": 0.1372821033000946, + "learning_rate": 2.7160628283285018e-05, + "loss": 0.0281, + "step": 13370 + }, + { + "epoch": 16.062424969987994, + "grad_norm": 0.17600280046463013, + "learning_rate": 2.7087115194675007e-05, + "loss": 0.0273, + "step": 13380 + }, + { + "epoch": 16.074429771908765, + "grad_norm": 0.0941275805234909, + "learning_rate": 2.701366474901712e-05, + "loss": 0.0257, + "step": 13390 + }, + { + "epoch": 16.086434573829532, + "grad_norm": 0.10403145104646683, + "learning_rate": 2.6940277147122085e-05, + "loss": 0.029, + "step": 13400 + }, + { + "epoch": 16.0984393757503, + "grad_norm": 0.1806197464466095, + "learning_rate": 2.686695258962878e-05, + "loss": 0.0296, + "step": 13410 + }, + { + "epoch": 16.11044417767107, + "grad_norm": 0.13532018661499023, + "learning_rate": 2.679369127700375e-05, + "loss": 0.025, + "step": 13420 + }, + { + "epoch": 16.122448979591837, + "grad_norm": 0.1310899704694748, + "learning_rate": 2.672049340954067e-05, + "loss": 0.0284, + "step": 13430 + }, + { + "epoch": 16.134453781512605, + "grad_norm": 0.13828924298286438, + "learning_rate": 2.6647359187359676e-05, + "loss": 0.0277, + "step": 13440 + }, + { + "epoch": 16.146458583433372, + "grad_norm": 0.16724678874015808, + "learning_rate": 2.6574288810406946e-05, + "loss": 0.0268, + "step": 13450 + }, + { + "epoch": 16.158463385354143, + "grad_norm": 0.12728525698184967, + "learning_rate": 2.6501282478454083e-05, + "loss": 0.0259, + "step": 13460 + }, + { + "epoch": 16.17046818727491, + "grad_norm": 0.1488242894411087, + "learning_rate": 2.6428340391097618e-05, + "loss": 0.028, + "step": 13470 + }, + { + "epoch": 16.182472989195677, + "grad_norm": 0.13062691688537598, + "learning_rate": 2.6355462747758485e-05, + "loss": 0.0253, + "step": 13480 + }, + { + "epoch": 16.194477791116448, + "grad_norm": 0.11823634058237076, + "learning_rate": 2.6282649747681304e-05, + "loss": 0.028, + "step": 13490 + }, + { + "epoch": 16.206482593037215, + "grad_norm": 0.1720147430896759, + "learning_rate": 2.620990158993406e-05, + "loss": 0.0255, + "step": 13500 + }, + { + "epoch": 16.218487394957982, + "grad_norm": 0.13524721562862396, + "learning_rate": 2.6137218473407477e-05, + "loss": 0.0283, + "step": 13510 + }, + { + "epoch": 16.230492196878753, + "grad_norm": 0.14287839829921722, + "learning_rate": 2.606460059681436e-05, + "loss": 0.0288, + "step": 13520 + }, + { + "epoch": 16.24249699879952, + "grad_norm": 0.16513414680957794, + "learning_rate": 2.599204815868928e-05, + "loss": 0.0257, + "step": 13530 + }, + { + "epoch": 16.254501800720288, + "grad_norm": 0.10433600842952728, + "learning_rate": 2.5919561357387756e-05, + "loss": 0.0251, + "step": 13540 + }, + { + "epoch": 16.266506602641055, + "grad_norm": 0.1883321851491928, + "learning_rate": 2.5847140391085972e-05, + "loss": 0.0275, + "step": 13550 + }, + { + "epoch": 16.278511404561826, + "grad_norm": 0.17370057106018066, + "learning_rate": 2.5774785457780103e-05, + "loss": 0.0281, + "step": 13560 + }, + { + "epoch": 16.290516206482593, + "grad_norm": 0.1240851953625679, + "learning_rate": 2.5702496755285753e-05, + "loss": 0.0274, + "step": 13570 + }, + { + "epoch": 16.30252100840336, + "grad_norm": 0.19887356460094452, + "learning_rate": 2.5630274481237483e-05, + "loss": 0.0285, + "step": 13580 + }, + { + "epoch": 16.31452581032413, + "grad_norm": 0.1592557281255722, + "learning_rate": 2.5558118833088197e-05, + "loss": 0.0299, + "step": 13590 + }, + { + "epoch": 16.3265306122449, + "grad_norm": 0.22202648222446442, + "learning_rate": 2.548603000810872e-05, + "loss": 0.0272, + "step": 13600 + }, + { + "epoch": 16.338535414165666, + "grad_norm": 0.13548344373703003, + "learning_rate": 2.5414008203387152e-05, + "loss": 0.0275, + "step": 13610 + }, + { + "epoch": 16.350540216086436, + "grad_norm": 0.14391528069972992, + "learning_rate": 2.534205361582834e-05, + "loss": 0.0251, + "step": 13620 + }, + { + "epoch": 16.362545018007204, + "grad_norm": 0.10710681974887848, + "learning_rate": 2.527016644215338e-05, + "loss": 0.0264, + "step": 13630 + }, + { + "epoch": 16.37454981992797, + "grad_norm": 0.15897531807422638, + "learning_rate": 2.519834687889905e-05, + "loss": 0.0245, + "step": 13640 + }, + { + "epoch": 16.386554621848738, + "grad_norm": 0.17369158565998077, + "learning_rate": 2.5126595122417295e-05, + "loss": 0.0245, + "step": 13650 + }, + { + "epoch": 16.39855942376951, + "grad_norm": 0.1347324103116989, + "learning_rate": 2.5054911368874713e-05, + "loss": 0.0252, + "step": 13660 + }, + { + "epoch": 16.410564225690276, + "grad_norm": 0.11314976215362549, + "learning_rate": 2.4983295814251916e-05, + "loss": 0.026, + "step": 13670 + }, + { + "epoch": 16.422569027611043, + "grad_norm": 0.17597338557243347, + "learning_rate": 2.4911748654343105e-05, + "loss": 0.0264, + "step": 13680 + }, + { + "epoch": 16.434573829531814, + "grad_norm": 0.11181126534938812, + "learning_rate": 2.4840270084755463e-05, + "loss": 0.0272, + "step": 13690 + }, + { + "epoch": 16.44657863145258, + "grad_norm": 0.14090761542320251, + "learning_rate": 2.4768860300908685e-05, + "loss": 0.0274, + "step": 13700 + }, + { + "epoch": 16.45858343337335, + "grad_norm": 0.13033826649188995, + "learning_rate": 2.469751949803443e-05, + "loss": 0.0269, + "step": 13710 + }, + { + "epoch": 16.470588235294116, + "grad_norm": 0.17121310532093048, + "learning_rate": 2.4626247871175666e-05, + "loss": 0.0262, + "step": 13720 + }, + { + "epoch": 16.482593037214887, + "grad_norm": 0.1205611377954483, + "learning_rate": 2.4555045615186346e-05, + "loss": 0.0243, + "step": 13730 + }, + { + "epoch": 16.494597839135654, + "grad_norm": 0.15650363266468048, + "learning_rate": 2.4483912924730677e-05, + "loss": 0.0265, + "step": 13740 + }, + { + "epoch": 16.50660264105642, + "grad_norm": 0.10966300219297409, + "learning_rate": 2.4412849994282742e-05, + "loss": 0.0255, + "step": 13750 + }, + { + "epoch": 16.518607442977192, + "grad_norm": 0.11384443193674088, + "learning_rate": 2.434185701812592e-05, + "loss": 0.0258, + "step": 13760 + }, + { + "epoch": 16.53061224489796, + "grad_norm": 0.13343267142772675, + "learning_rate": 2.4270934190352218e-05, + "loss": 0.0261, + "step": 13770 + }, + { + "epoch": 16.542617046818727, + "grad_norm": 0.11565423756837845, + "learning_rate": 2.4200081704861998e-05, + "loss": 0.025, + "step": 13780 + }, + { + "epoch": 16.554621848739497, + "grad_norm": 0.10824315249919891, + "learning_rate": 2.412929975536321e-05, + "loss": 0.0256, + "step": 13790 + }, + { + "epoch": 16.566626650660265, + "grad_norm": 0.13425227999687195, + "learning_rate": 2.4058588535371017e-05, + "loss": 0.0255, + "step": 13800 + }, + { + "epoch": 16.578631452581032, + "grad_norm": 0.14090240001678467, + "learning_rate": 2.3987948238207243e-05, + "loss": 0.026, + "step": 13810 + }, + { + "epoch": 16.5906362545018, + "grad_norm": 0.16668745875358582, + "learning_rate": 2.3917379056999678e-05, + "loss": 0.025, + "step": 13820 + }, + { + "epoch": 16.60264105642257, + "grad_norm": 0.19378547370433807, + "learning_rate": 2.3846881184681824e-05, + "loss": 0.024, + "step": 13830 + }, + { + "epoch": 16.614645858343337, + "grad_norm": 0.15869243443012238, + "learning_rate": 2.377645481399214e-05, + "loss": 0.0299, + "step": 13840 + }, + { + "epoch": 16.626650660264104, + "grad_norm": 0.14208713173866272, + "learning_rate": 2.3706100137473667e-05, + "loss": 0.0273, + "step": 13850 + }, + { + "epoch": 16.638655462184875, + "grad_norm": 0.09438211470842361, + "learning_rate": 2.3635817347473394e-05, + "loss": 0.0245, + "step": 13860 + }, + { + "epoch": 16.650660264105642, + "grad_norm": 0.08857204765081406, + "learning_rate": 2.3565606636141757e-05, + "loss": 0.0221, + "step": 13870 + }, + { + "epoch": 16.66266506602641, + "grad_norm": 0.1975356489419937, + "learning_rate": 2.3495468195432203e-05, + "loss": 0.0251, + "step": 13880 + }, + { + "epoch": 16.67466986794718, + "grad_norm": 0.13867974281311035, + "learning_rate": 2.3425402217100507e-05, + "loss": 0.0275, + "step": 13890 + }, + { + "epoch": 16.686674669867948, + "grad_norm": 0.12511202692985535, + "learning_rate": 2.3355408892704424e-05, + "loss": 0.0256, + "step": 13900 + }, + { + "epoch": 16.698679471788715, + "grad_norm": 0.17822547256946564, + "learning_rate": 2.3285488413603003e-05, + "loss": 0.0265, + "step": 13910 + }, + { + "epoch": 16.710684273709482, + "grad_norm": 0.14072206616401672, + "learning_rate": 2.321564097095615e-05, + "loss": 0.0245, + "step": 13920 + }, + { + "epoch": 16.722689075630253, + "grad_norm": 0.14186719059944153, + "learning_rate": 2.3145866755724142e-05, + "loss": 0.0252, + "step": 13930 + }, + { + "epoch": 16.73469387755102, + "grad_norm": 0.12113595008850098, + "learning_rate": 2.307616595866699e-05, + "loss": 0.0215, + "step": 13940 + }, + { + "epoch": 16.746698679471788, + "grad_norm": 0.18919193744659424, + "learning_rate": 2.3006538770344032e-05, + "loss": 0.025, + "step": 13950 + }, + { + "epoch": 16.75870348139256, + "grad_norm": 0.16483502089977264, + "learning_rate": 2.293698538111334e-05, + "loss": 0.0272, + "step": 13960 + }, + { + "epoch": 16.770708283313326, + "grad_norm": 0.09266642481088638, + "learning_rate": 2.28675059811312e-05, + "loss": 0.0273, + "step": 13970 + }, + { + "epoch": 16.782713085234093, + "grad_norm": 0.17517396807670593, + "learning_rate": 2.279810076035167e-05, + "loss": 0.0267, + "step": 13980 + }, + { + "epoch": 16.79471788715486, + "grad_norm": 0.11813212931156158, + "learning_rate": 2.272876990852596e-05, + "loss": 0.0233, + "step": 13990 + }, + { + "epoch": 16.80672268907563, + "grad_norm": 0.14618849754333496, + "learning_rate": 2.265951361520195e-05, + "loss": 0.0249, + "step": 14000 + }, + { + "epoch": 16.818727490996398, + "grad_norm": 0.1662166267633438, + "learning_rate": 2.2590332069723748e-05, + "loss": 0.0248, + "step": 14010 + }, + { + "epoch": 16.830732292917165, + "grad_norm": 0.11348894983530045, + "learning_rate": 2.2521225461231004e-05, + "loss": 0.0233, + "step": 14020 + }, + { + "epoch": 16.842737094837936, + "grad_norm": 0.12286952883005142, + "learning_rate": 2.2452193978658597e-05, + "loss": 0.027, + "step": 14030 + }, + { + "epoch": 16.854741896758703, + "grad_norm": 0.12708669900894165, + "learning_rate": 2.238323781073594e-05, + "loss": 0.0249, + "step": 14040 + }, + { + "epoch": 16.86674669867947, + "grad_norm": 0.18003401160240173, + "learning_rate": 2.2314357145986552e-05, + "loss": 0.0254, + "step": 14050 + }, + { + "epoch": 16.87875150060024, + "grad_norm": 0.15146084129810333, + "learning_rate": 2.224555217272757e-05, + "loss": 0.027, + "step": 14060 + }, + { + "epoch": 16.89075630252101, + "grad_norm": 0.15525726974010468, + "learning_rate": 2.2176823079069127e-05, + "loss": 0.0243, + "step": 14070 + }, + { + "epoch": 16.902761104441776, + "grad_norm": 0.17208531498908997, + "learning_rate": 2.210817005291398e-05, + "loss": 0.0228, + "step": 14080 + }, + { + "epoch": 16.914765906362547, + "grad_norm": 0.17444483935832977, + "learning_rate": 2.203959328195686e-05, + "loss": 0.023, + "step": 14090 + }, + { + "epoch": 16.926770708283314, + "grad_norm": 0.1691613495349884, + "learning_rate": 2.1971092953684026e-05, + "loss": 0.024, + "step": 14100 + }, + { + "epoch": 16.93877551020408, + "grad_norm": 0.14441335201263428, + "learning_rate": 2.1902669255372788e-05, + "loss": 0.0245, + "step": 14110 + }, + { + "epoch": 16.95078031212485, + "grad_norm": 0.2062889039516449, + "learning_rate": 2.1834322374090897e-05, + "loss": 0.0252, + "step": 14120 + }, + { + "epoch": 16.96278511404562, + "grad_norm": 0.1729172170162201, + "learning_rate": 2.1766052496696153e-05, + "loss": 0.0262, + "step": 14130 + }, + { + "epoch": 16.974789915966387, + "grad_norm": 0.16257983446121216, + "learning_rate": 2.169785980983577e-05, + "loss": 0.0261, + "step": 14140 + }, + { + "epoch": 16.986794717887154, + "grad_norm": 0.10615845769643784, + "learning_rate": 2.162974449994593e-05, + "loss": 0.0234, + "step": 14150 + }, + { + "epoch": 16.998799519807925, + "grad_norm": 0.1503920704126358, + "learning_rate": 2.1561706753251337e-05, + "loss": 0.0267, + "step": 14160 + }, + { + "epoch": 17.010804321728692, + "grad_norm": 0.11082279682159424, + "learning_rate": 2.1493746755764544e-05, + "loss": 0.0242, + "step": 14170 + }, + { + "epoch": 17.02280912364946, + "grad_norm": 0.13873574137687683, + "learning_rate": 2.1425864693285635e-05, + "loss": 0.0265, + "step": 14180 + }, + { + "epoch": 17.034813925570226, + "grad_norm": 0.15051554143428802, + "learning_rate": 2.1358060751401547e-05, + "loss": 0.028, + "step": 14190 + }, + { + "epoch": 17.046818727490997, + "grad_norm": 0.16402918100357056, + "learning_rate": 2.129033511548566e-05, + "loss": 0.0266, + "step": 14200 + }, + { + "epoch": 17.058823529411764, + "grad_norm": 0.12769684195518494, + "learning_rate": 2.1222687970697315e-05, + "loss": 0.0249, + "step": 14210 + }, + { + "epoch": 17.07082833133253, + "grad_norm": 0.15550854802131653, + "learning_rate": 2.1155119501981173e-05, + "loss": 0.0233, + "step": 14220 + }, + { + "epoch": 17.082833133253303, + "grad_norm": 0.1189972385764122, + "learning_rate": 2.1087629894066895e-05, + "loss": 0.0227, + "step": 14230 + }, + { + "epoch": 17.09483793517407, + "grad_norm": 0.14321932196617126, + "learning_rate": 2.1020219331468473e-05, + "loss": 0.0254, + "step": 14240 + }, + { + "epoch": 17.106842737094837, + "grad_norm": 0.13443101942539215, + "learning_rate": 2.095288799848379e-05, + "loss": 0.0258, + "step": 14250 + }, + { + "epoch": 17.118847539015608, + "grad_norm": 0.15397369861602783, + "learning_rate": 2.088563607919417e-05, + "loss": 0.0247, + "step": 14260 + }, + { + "epoch": 17.130852340936375, + "grad_norm": 0.15878726541996002, + "learning_rate": 2.0818463757463786e-05, + "loss": 0.0259, + "step": 14270 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 0.12667106091976166, + "learning_rate": 2.0751371216939175e-05, + "loss": 0.0234, + "step": 14280 + }, + { + "epoch": 17.15486194477791, + "grad_norm": 0.1069289967417717, + "learning_rate": 2.068435864104882e-05, + "loss": 0.0252, + "step": 14290 + }, + { + "epoch": 17.16686674669868, + "grad_norm": 0.14224007725715637, + "learning_rate": 2.0617426213002506e-05, + "loss": 0.0238, + "step": 14300 + }, + { + "epoch": 17.178871548619448, + "grad_norm": 0.13502241671085358, + "learning_rate": 2.055057411579097e-05, + "loss": 0.0228, + "step": 14310 + }, + { + "epoch": 17.190876350540215, + "grad_norm": 0.25406429171562195, + "learning_rate": 2.0483802532185286e-05, + "loss": 0.0283, + "step": 14320 + }, + { + "epoch": 17.202881152460986, + "grad_norm": 0.1546534150838852, + "learning_rate": 2.041711164473638e-05, + "loss": 0.0265, + "step": 14330 + }, + { + "epoch": 17.214885954381753, + "grad_norm": 0.1338605135679245, + "learning_rate": 2.0350501635774637e-05, + "loss": 0.0239, + "step": 14340 + }, + { + "epoch": 17.22689075630252, + "grad_norm": 0.18312373757362366, + "learning_rate": 2.0283972687409247e-05, + "loss": 0.0265, + "step": 14350 + }, + { + "epoch": 17.23889555822329, + "grad_norm": 0.17564059793949127, + "learning_rate": 2.021752498152784e-05, + "loss": 0.0263, + "step": 14360 + }, + { + "epoch": 17.25090036014406, + "grad_norm": 0.36214643716812134, + "learning_rate": 2.015115869979589e-05, + "loss": 0.0241, + "step": 14370 + }, + { + "epoch": 17.262905162064826, + "grad_norm": 0.13008952140808105, + "learning_rate": 2.0084874023656265e-05, + "loss": 0.0281, + "step": 14380 + }, + { + "epoch": 17.274909963985593, + "grad_norm": 0.15419700741767883, + "learning_rate": 2.001867113432877e-05, + "loss": 0.0226, + "step": 14390 + }, + { + "epoch": 17.286914765906364, + "grad_norm": 0.13623179495334625, + "learning_rate": 1.995255021280954e-05, + "loss": 0.0222, + "step": 14400 + }, + { + "epoch": 17.29891956782713, + "grad_norm": 0.1466943919658661, + "learning_rate": 1.9886511439870688e-05, + "loss": 0.0249, + "step": 14410 + }, + { + "epoch": 17.310924369747898, + "grad_norm": 0.13143952190876007, + "learning_rate": 1.9820554996059675e-05, + "loss": 0.0232, + "step": 14420 + }, + { + "epoch": 17.32292917166867, + "grad_norm": 0.12615542113780975, + "learning_rate": 1.9754681061698893e-05, + "loss": 0.0247, + "step": 14430 + }, + { + "epoch": 17.334933973589436, + "grad_norm": 0.1040230393409729, + "learning_rate": 1.9688889816885185e-05, + "loss": 0.0237, + "step": 14440 + }, + { + "epoch": 17.346938775510203, + "grad_norm": 0.15820656716823578, + "learning_rate": 1.962318144148928e-05, + "loss": 0.0273, + "step": 14450 + }, + { + "epoch": 17.35894357743097, + "grad_norm": 0.12214459478855133, + "learning_rate": 1.955755611515539e-05, + "loss": 0.024, + "step": 14460 + }, + { + "epoch": 17.37094837935174, + "grad_norm": 0.11041007190942764, + "learning_rate": 1.9492014017300642e-05, + "loss": 0.0222, + "step": 14470 + }, + { + "epoch": 17.38295318127251, + "grad_norm": 0.1700160801410675, + "learning_rate": 1.942655532711461e-05, + "loss": 0.0261, + "step": 14480 + }, + { + "epoch": 17.394957983193276, + "grad_norm": 0.18096929788589478, + "learning_rate": 1.9361180223558882e-05, + "loss": 0.0221, + "step": 14490 + }, + { + "epoch": 17.406962785114047, + "grad_norm": 0.17094755172729492, + "learning_rate": 1.929588888536647e-05, + "loss": 0.0241, + "step": 14500 + }, + { + "epoch": 17.418967587034814, + "grad_norm": 0.11214415729045868, + "learning_rate": 1.9230681491041425e-05, + "loss": 0.0243, + "step": 14510 + }, + { + "epoch": 17.43097238895558, + "grad_norm": 0.17121051251888275, + "learning_rate": 1.9165558218858264e-05, + "loss": 0.025, + "step": 14520 + }, + { + "epoch": 17.442977190876352, + "grad_norm": 0.14093266427516937, + "learning_rate": 1.9100519246861505e-05, + "loss": 0.024, + "step": 14530 + }, + { + "epoch": 17.45498199279712, + "grad_norm": 0.12380301207304001, + "learning_rate": 1.9035564752865248e-05, + "loss": 0.0253, + "step": 14540 + }, + { + "epoch": 17.466986794717887, + "grad_norm": 0.18602147698402405, + "learning_rate": 1.897069491445258e-05, + "loss": 0.0281, + "step": 14550 + }, + { + "epoch": 17.478991596638654, + "grad_norm": 0.1355210840702057, + "learning_rate": 1.890590990897515e-05, + "loss": 0.0258, + "step": 14560 + }, + { + "epoch": 17.490996398559425, + "grad_norm": 0.1497170776128769, + "learning_rate": 1.884120991355272e-05, + "loss": 0.0256, + "step": 14570 + }, + { + "epoch": 17.503001200480192, + "grad_norm": 0.15526720881462097, + "learning_rate": 1.8776595105072576e-05, + "loss": 0.0235, + "step": 14580 + }, + { + "epoch": 17.51500600240096, + "grad_norm": 0.1422477513551712, + "learning_rate": 1.8712065660189166e-05, + "loss": 0.0243, + "step": 14590 + }, + { + "epoch": 17.52701080432173, + "grad_norm": 0.15879857540130615, + "learning_rate": 1.8647621755323513e-05, + "loss": 0.0263, + "step": 14600 + }, + { + "epoch": 17.539015606242497, + "grad_norm": 0.1457217037677765, + "learning_rate": 1.858326356666278e-05, + "loss": 0.0239, + "step": 14610 + }, + { + "epoch": 17.551020408163264, + "grad_norm": 0.21889209747314453, + "learning_rate": 1.851899127015983e-05, + "loss": 0.0278, + "step": 14620 + }, + { + "epoch": 17.563025210084035, + "grad_norm": 0.18363748490810394, + "learning_rate": 1.8454805041532626e-05, + "loss": 0.0253, + "step": 14630 + }, + { + "epoch": 17.575030012004802, + "grad_norm": 0.14242176711559296, + "learning_rate": 1.8390705056263906e-05, + "loss": 0.0228, + "step": 14640 + }, + { + "epoch": 17.58703481392557, + "grad_norm": 0.13325399160385132, + "learning_rate": 1.832669148960057e-05, + "loss": 0.0233, + "step": 14650 + }, + { + "epoch": 17.599039615846337, + "grad_norm": 0.11963097006082535, + "learning_rate": 1.8262764516553233e-05, + "loss": 0.0239, + "step": 14660 + }, + { + "epoch": 17.611044417767108, + "grad_norm": 0.1360022872686386, + "learning_rate": 1.8198924311895843e-05, + "loss": 0.0238, + "step": 14670 + }, + { + "epoch": 17.623049219687875, + "grad_norm": 0.2043529599905014, + "learning_rate": 1.813517105016505e-05, + "loss": 0.0232, + "step": 14680 + }, + { + "epoch": 17.635054021608642, + "grad_norm": 0.11965558677911758, + "learning_rate": 1.8071504905659888e-05, + "loss": 0.0236, + "step": 14690 + }, + { + "epoch": 17.647058823529413, + "grad_norm": 0.12434712797403336, + "learning_rate": 1.800792605244109e-05, + "loss": 0.0226, + "step": 14700 + }, + { + "epoch": 17.65906362545018, + "grad_norm": 0.1529577374458313, + "learning_rate": 1.7944434664330844e-05, + "loss": 0.0233, + "step": 14710 + }, + { + "epoch": 17.671068427370948, + "grad_norm": 0.1290818303823471, + "learning_rate": 1.7881030914912212e-05, + "loss": 0.0252, + "step": 14720 + }, + { + "epoch": 17.68307322929172, + "grad_norm": 0.15514028072357178, + "learning_rate": 1.7817714977528577e-05, + "loss": 0.0234, + "step": 14730 + }, + { + "epoch": 17.695078031212486, + "grad_norm": 0.1823279708623886, + "learning_rate": 1.7754487025283332e-05, + "loss": 0.0242, + "step": 14740 + }, + { + "epoch": 17.707082833133253, + "grad_norm": 0.15765243768692017, + "learning_rate": 1.7691347231039275e-05, + "loss": 0.0248, + "step": 14750 + }, + { + "epoch": 17.71908763505402, + "grad_norm": 0.21774451434612274, + "learning_rate": 1.7628295767418164e-05, + "loss": 0.0229, + "step": 14760 + }, + { + "epoch": 17.73109243697479, + "grad_norm": 0.1164470985531807, + "learning_rate": 1.7565332806800333e-05, + "loss": 0.0224, + "step": 14770 + }, + { + "epoch": 17.743097238895558, + "grad_norm": 0.1491265892982483, + "learning_rate": 1.750245852132408e-05, + "loss": 0.0223, + "step": 14780 + }, + { + "epoch": 17.755102040816325, + "grad_norm": 0.13243919610977173, + "learning_rate": 1.7439673082885323e-05, + "loss": 0.0227, + "step": 14790 + }, + { + "epoch": 17.767106842737096, + "grad_norm": 0.17193861305713654, + "learning_rate": 1.7376976663137047e-05, + "loss": 0.0238, + "step": 14800 + }, + { + "epoch": 17.779111644657863, + "grad_norm": 0.16078023612499237, + "learning_rate": 1.7314369433488853e-05, + "loss": 0.0229, + "step": 14810 + }, + { + "epoch": 17.79111644657863, + "grad_norm": 0.11704923957586288, + "learning_rate": 1.7251851565106548e-05, + "loss": 0.0191, + "step": 14820 + }, + { + "epoch": 17.8031212484994, + "grad_norm": 0.11911648511886597, + "learning_rate": 1.7189423228911574e-05, + "loss": 0.0231, + "step": 14830 + }, + { + "epoch": 17.81512605042017, + "grad_norm": 0.14266784489154816, + "learning_rate": 1.7127084595580606e-05, + "loss": 0.0242, + "step": 14840 + }, + { + "epoch": 17.827130852340936, + "grad_norm": 0.14386296272277832, + "learning_rate": 1.706483583554513e-05, + "loss": 0.0218, + "step": 14850 + }, + { + "epoch": 17.839135654261703, + "grad_norm": 0.11500339210033417, + "learning_rate": 1.700267711899083e-05, + "loss": 0.023, + "step": 14860 + }, + { + "epoch": 17.851140456182474, + "grad_norm": 0.14879795908927917, + "learning_rate": 1.69406086158573e-05, + "loss": 0.0257, + "step": 14870 + }, + { + "epoch": 17.86314525810324, + "grad_norm": 0.27994251251220703, + "learning_rate": 1.6878630495837455e-05, + "loss": 0.0242, + "step": 14880 + }, + { + "epoch": 17.87515006002401, + "grad_norm": 0.16550205647945404, + "learning_rate": 1.681674292837707e-05, + "loss": 0.0256, + "step": 14890 + }, + { + "epoch": 17.88715486194478, + "grad_norm": 0.11998721212148666, + "learning_rate": 1.6754946082674444e-05, + "loss": 0.0201, + "step": 14900 + }, + { + "epoch": 17.899159663865547, + "grad_norm": 0.12046772241592407, + "learning_rate": 1.6693240127679748e-05, + "loss": 0.0226, + "step": 14910 + }, + { + "epoch": 17.911164465786314, + "grad_norm": 0.182234525680542, + "learning_rate": 1.663162523209475e-05, + "loss": 0.0256, + "step": 14920 + }, + { + "epoch": 17.92316926770708, + "grad_norm": 0.11906658858060837, + "learning_rate": 1.6570101564372193e-05, + "loss": 0.0235, + "step": 14930 + }, + { + "epoch": 17.935174069627852, + "grad_norm": 0.13123486936092377, + "learning_rate": 1.650866929271543e-05, + "loss": 0.0212, + "step": 14940 + }, + { + "epoch": 17.94717887154862, + "grad_norm": 0.15743687748908997, + "learning_rate": 1.644732858507797e-05, + "loss": 0.0285, + "step": 14950 + }, + { + "epoch": 17.959183673469386, + "grad_norm": 0.15071967244148254, + "learning_rate": 1.6386079609162943e-05, + "loss": 0.0233, + "step": 14960 + }, + { + "epoch": 17.971188475390157, + "grad_norm": 0.12913894653320312, + "learning_rate": 1.6324922532422742e-05, + "loss": 0.0226, + "step": 14970 + }, + { + "epoch": 17.983193277310924, + "grad_norm": 0.1681075096130371, + "learning_rate": 1.6263857522058434e-05, + "loss": 0.0235, + "step": 14980 + }, + { + "epoch": 17.99519807923169, + "grad_norm": 0.15073584020137787, + "learning_rate": 1.6202884745019443e-05, + "loss": 0.0215, + "step": 14990 + }, + { + "epoch": 18.007202881152462, + "grad_norm": 0.2494090050458908, + "learning_rate": 1.614200436800304e-05, + "loss": 0.0234, + "step": 15000 + }, + { + "epoch": 18.01920768307323, + "grad_norm": 0.1600668579339981, + "learning_rate": 1.6081216557453814e-05, + "loss": 0.0232, + "step": 15010 + }, + { + "epoch": 18.031212484993997, + "grad_norm": 0.1933845579624176, + "learning_rate": 1.6020521479563367e-05, + "loss": 0.0252, + "step": 15020 + }, + { + "epoch": 18.043217286914764, + "grad_norm": 0.12341886013746262, + "learning_rate": 1.5959919300269654e-05, + "loss": 0.0226, + "step": 15030 + }, + { + "epoch": 18.055222088835535, + "grad_norm": 0.14448224008083344, + "learning_rate": 1.5899410185256764e-05, + "loss": 0.0242, + "step": 15040 + }, + { + "epoch": 18.067226890756302, + "grad_norm": 0.14678455889225006, + "learning_rate": 1.583899429995431e-05, + "loss": 0.0241, + "step": 15050 + }, + { + "epoch": 18.07923169267707, + "grad_norm": 0.14247359335422516, + "learning_rate": 1.5778671809536993e-05, + "loss": 0.0228, + "step": 15060 + }, + { + "epoch": 18.09123649459784, + "grad_norm": 0.20214083790779114, + "learning_rate": 1.5718442878924246e-05, + "loss": 0.0267, + "step": 15070 + }, + { + "epoch": 18.103241296518608, + "grad_norm": 0.14608459174633026, + "learning_rate": 1.5658307672779593e-05, + "loss": 0.0219, + "step": 15080 + }, + { + "epoch": 18.115246098439375, + "grad_norm": 0.13652068376541138, + "learning_rate": 1.5598266355510427e-05, + "loss": 0.0253, + "step": 15090 + }, + { + "epoch": 18.127250900360146, + "grad_norm": 0.11789446324110031, + "learning_rate": 1.553831909126744e-05, + "loss": 0.023, + "step": 15100 + }, + { + "epoch": 18.139255702280913, + "grad_norm": 0.15707841515541077, + "learning_rate": 1.5478466043944135e-05, + "loss": 0.0227, + "step": 15110 + }, + { + "epoch": 18.15126050420168, + "grad_norm": 0.14213168621063232, + "learning_rate": 1.5418707377176468e-05, + "loss": 0.0236, + "step": 15120 + }, + { + "epoch": 18.163265306122447, + "grad_norm": 0.1887068897485733, + "learning_rate": 1.535904325434233e-05, + "loss": 0.0249, + "step": 15130 + }, + { + "epoch": 18.175270108043218, + "grad_norm": 0.12500935792922974, + "learning_rate": 1.529947383856118e-05, + "loss": 0.0217, + "step": 15140 + }, + { + "epoch": 18.187274909963985, + "grad_norm": 0.17140403389930725, + "learning_rate": 1.5239999292693524e-05, + "loss": 0.0253, + "step": 15150 + }, + { + "epoch": 18.199279711884753, + "grad_norm": 0.11903663724660873, + "learning_rate": 1.5180619779340505e-05, + "loss": 0.0263, + "step": 15160 + }, + { + "epoch": 18.211284513805523, + "grad_norm": 0.12757651507854462, + "learning_rate": 1.5121335460843428e-05, + "loss": 0.0243, + "step": 15170 + }, + { + "epoch": 18.22328931572629, + "grad_norm": 0.1446487307548523, + "learning_rate": 1.5062146499283347e-05, + "loss": 0.0251, + "step": 15180 + }, + { + "epoch": 18.235294117647058, + "grad_norm": 0.10920180380344391, + "learning_rate": 1.5003053056480643e-05, + "loss": 0.0211, + "step": 15190 + }, + { + "epoch": 18.24729891956783, + "grad_norm": 0.15035903453826904, + "learning_rate": 1.4944055293994551e-05, + "loss": 0.0247, + "step": 15200 + }, + { + "epoch": 18.259303721488596, + "grad_norm": 0.13117322325706482, + "learning_rate": 1.4885153373122656e-05, + "loss": 0.0231, + "step": 15210 + }, + { + "epoch": 18.271308523409363, + "grad_norm": 0.1557263731956482, + "learning_rate": 1.482634745490059e-05, + "loss": 0.0232, + "step": 15220 + }, + { + "epoch": 18.28331332533013, + "grad_norm": 0.1315263956785202, + "learning_rate": 1.4767637700101466e-05, + "loss": 0.0231, + "step": 15230 + }, + { + "epoch": 18.2953181272509, + "grad_norm": 0.13536487519741058, + "learning_rate": 1.4709024269235528e-05, + "loss": 0.0228, + "step": 15240 + }, + { + "epoch": 18.30732292917167, + "grad_norm": 0.17168110609054565, + "learning_rate": 1.4650507322549684e-05, + "loss": 0.0239, + "step": 15250 + }, + { + "epoch": 18.319327731092436, + "grad_norm": 0.11863841116428375, + "learning_rate": 1.4592087020026972e-05, + "loss": 0.0225, + "step": 15260 + }, + { + "epoch": 18.331332533013207, + "grad_norm": 0.12125737220048904, + "learning_rate": 1.4533763521386318e-05, + "loss": 0.0251, + "step": 15270 + }, + { + "epoch": 18.343337334933974, + "grad_norm": 0.12233352661132812, + "learning_rate": 1.44755369860819e-05, + "loss": 0.0237, + "step": 15280 + }, + { + "epoch": 18.35534213685474, + "grad_norm": 0.1915338933467865, + "learning_rate": 1.441740757330287e-05, + "loss": 0.0228, + "step": 15290 + }, + { + "epoch": 18.367346938775512, + "grad_norm": 0.11918474733829498, + "learning_rate": 1.4359375441972844e-05, + "loss": 0.0232, + "step": 15300 + }, + { + "epoch": 18.37935174069628, + "grad_norm": 0.1683003008365631, + "learning_rate": 1.4301440750749395e-05, + "loss": 0.0234, + "step": 15310 + }, + { + "epoch": 18.391356542617046, + "grad_norm": 0.1486678272485733, + "learning_rate": 1.4243603658023808e-05, + "loss": 0.0241, + "step": 15320 + }, + { + "epoch": 18.403361344537814, + "grad_norm": 0.11867876350879669, + "learning_rate": 1.4185864321920444e-05, + "loss": 0.021, + "step": 15330 + }, + { + "epoch": 18.415366146458584, + "grad_norm": 0.14498692750930786, + "learning_rate": 1.4128222900296485e-05, + "loss": 0.0229, + "step": 15340 + }, + { + "epoch": 18.42737094837935, + "grad_norm": 0.0977136418223381, + "learning_rate": 1.407067955074135e-05, + "loss": 0.0193, + "step": 15350 + }, + { + "epoch": 18.43937575030012, + "grad_norm": 0.13501957058906555, + "learning_rate": 1.4013234430576356e-05, + "loss": 0.0211, + "step": 15360 + }, + { + "epoch": 18.45138055222089, + "grad_norm": 0.1485499143600464, + "learning_rate": 1.3955887696854286e-05, + "loss": 0.0233, + "step": 15370 + }, + { + "epoch": 18.463385354141657, + "grad_norm": 0.1897774487733841, + "learning_rate": 1.38986395063589e-05, + "loss": 0.0224, + "step": 15380 + }, + { + "epoch": 18.475390156062424, + "grad_norm": 0.1308542937040329, + "learning_rate": 1.3841490015604597e-05, + "loss": 0.0227, + "step": 15390 + }, + { + "epoch": 18.48739495798319, + "grad_norm": 0.1363823413848877, + "learning_rate": 1.3784439380835879e-05, + "loss": 0.0251, + "step": 15400 + }, + { + "epoch": 18.499399759903962, + "grad_norm": 0.1395791471004486, + "learning_rate": 1.3727487758026986e-05, + "loss": 0.0224, + "step": 15410 + }, + { + "epoch": 18.51140456182473, + "grad_norm": 0.1292416900396347, + "learning_rate": 1.3670635302881525e-05, + "loss": 0.0223, + "step": 15420 + }, + { + "epoch": 18.523409363745497, + "grad_norm": 0.1866629272699356, + "learning_rate": 1.3613882170831888e-05, + "loss": 0.0235, + "step": 15430 + }, + { + "epoch": 18.535414165666268, + "grad_norm": 0.1639423817396164, + "learning_rate": 1.355722851703901e-05, + "loss": 0.0245, + "step": 15440 + }, + { + "epoch": 18.547418967587035, + "grad_norm": 0.20189766585826874, + "learning_rate": 1.3500674496391814e-05, + "loss": 0.0262, + "step": 15450 + }, + { + "epoch": 18.559423769507802, + "grad_norm": 0.13088259100914001, + "learning_rate": 1.3444220263506795e-05, + "loss": 0.0241, + "step": 15460 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 0.1342611461877823, + "learning_rate": 1.3387865972727714e-05, + "loss": 0.0226, + "step": 15470 + }, + { + "epoch": 18.58343337334934, + "grad_norm": 0.09749628603458405, + "learning_rate": 1.3331611778125036e-05, + "loss": 0.0218, + "step": 15480 + }, + { + "epoch": 18.595438175270107, + "grad_norm": 0.13203932344913483, + "learning_rate": 1.3275457833495564e-05, + "loss": 0.0251, + "step": 15490 + }, + { + "epoch": 18.607442977190875, + "grad_norm": 0.1431536227464676, + "learning_rate": 1.3219404292362065e-05, + "loss": 0.0236, + "step": 15500 + }, + { + "epoch": 18.619447779111646, + "grad_norm": 0.15202537178993225, + "learning_rate": 1.3163451307972751e-05, + "loss": 0.0247, + "step": 15510 + }, + { + "epoch": 18.631452581032413, + "grad_norm": 0.13261103630065918, + "learning_rate": 1.3107599033300977e-05, + "loss": 0.0226, + "step": 15520 + }, + { + "epoch": 18.64345738295318, + "grad_norm": 0.19322539865970612, + "learning_rate": 1.305184762104471e-05, + "loss": 0.0254, + "step": 15530 + }, + { + "epoch": 18.65546218487395, + "grad_norm": 0.12468942999839783, + "learning_rate": 1.2996197223626178e-05, + "loss": 0.0221, + "step": 15540 + }, + { + "epoch": 18.667466986794718, + "grad_norm": 0.1039043739438057, + "learning_rate": 1.2940647993191457e-05, + "loss": 0.022, + "step": 15550 + }, + { + "epoch": 18.679471788715485, + "grad_norm": 0.14117790758609772, + "learning_rate": 1.2885200081610005e-05, + "loss": 0.0237, + "step": 15560 + }, + { + "epoch": 18.691476590636256, + "grad_norm": 0.12882880866527557, + "learning_rate": 1.2829853640474316e-05, + "loss": 0.0243, + "step": 15570 + }, + { + "epoch": 18.703481392557023, + "grad_norm": 0.13615860044956207, + "learning_rate": 1.2774608821099438e-05, + "loss": 0.0214, + "step": 15580 + }, + { + "epoch": 18.71548619447779, + "grad_norm": 0.09100856631994247, + "learning_rate": 1.2719465774522577e-05, + "loss": 0.0229, + "step": 15590 + }, + { + "epoch": 18.727490996398558, + "grad_norm": 0.1675051748752594, + "learning_rate": 1.2664424651502755e-05, + "loss": 0.0202, + "step": 15600 + }, + { + "epoch": 18.73949579831933, + "grad_norm": 0.10287220031023026, + "learning_rate": 1.260948560252026e-05, + "loss": 0.0238, + "step": 15610 + }, + { + "epoch": 18.751500600240096, + "grad_norm": 0.12613695859909058, + "learning_rate": 1.2554648777776396e-05, + "loss": 0.0218, + "step": 15620 + }, + { + "epoch": 18.763505402160863, + "grad_norm": 0.10444662719964981, + "learning_rate": 1.2499914327192919e-05, + "loss": 0.0209, + "step": 15630 + }, + { + "epoch": 18.775510204081634, + "grad_norm": 0.10434585809707642, + "learning_rate": 1.2445282400411722e-05, + "loss": 0.0207, + "step": 15640 + }, + { + "epoch": 18.7875150060024, + "grad_norm": 0.14399515092372894, + "learning_rate": 1.2390753146794437e-05, + "loss": 0.0206, + "step": 15650 + }, + { + "epoch": 18.79951980792317, + "grad_norm": 0.14965707063674927, + "learning_rate": 1.2336326715421925e-05, + "loss": 0.0244, + "step": 15660 + }, + { + "epoch": 18.81152460984394, + "grad_norm": 0.17010720074176788, + "learning_rate": 1.2282003255094005e-05, + "loss": 0.021, + "step": 15670 + }, + { + "epoch": 18.823529411764707, + "grad_norm": 0.12521466612815857, + "learning_rate": 1.2227782914328928e-05, + "loss": 0.0215, + "step": 15680 + }, + { + "epoch": 18.835534213685474, + "grad_norm": 0.1357702910900116, + "learning_rate": 1.2173665841363018e-05, + "loss": 0.0203, + "step": 15690 + }, + { + "epoch": 18.84753901560624, + "grad_norm": 0.11789427697658539, + "learning_rate": 1.211965218415032e-05, + "loss": 0.0216, + "step": 15700 + }, + { + "epoch": 18.859543817527012, + "grad_norm": 0.1321222484111786, + "learning_rate": 1.2065742090362082e-05, + "loss": 0.0212, + "step": 15710 + }, + { + "epoch": 18.87154861944778, + "grad_norm": 0.1272089034318924, + "learning_rate": 1.2011935707386457e-05, + "loss": 0.0238, + "step": 15720 + }, + { + "epoch": 18.883553421368546, + "grad_norm": 0.15831510722637177, + "learning_rate": 1.1958233182328044e-05, + "loss": 0.0214, + "step": 15730 + }, + { + "epoch": 18.895558223289317, + "grad_norm": 0.11801894754171371, + "learning_rate": 1.1904634662007474e-05, + "loss": 0.0229, + "step": 15740 + }, + { + "epoch": 18.907563025210084, + "grad_norm": 0.13816513121128082, + "learning_rate": 1.1851140292961088e-05, + "loss": 0.0216, + "step": 15750 + }, + { + "epoch": 18.91956782713085, + "grad_norm": 0.1323041170835495, + "learning_rate": 1.1797750221440424e-05, + "loss": 0.025, + "step": 15760 + }, + { + "epoch": 18.931572629051622, + "grad_norm": 0.16592179238796234, + "learning_rate": 1.1744464593411897e-05, + "loss": 0.0217, + "step": 15770 + }, + { + "epoch": 18.94357743097239, + "grad_norm": 0.11251413077116013, + "learning_rate": 1.1691283554556399e-05, + "loss": 0.0242, + "step": 15780 + }, + { + "epoch": 18.955582232893157, + "grad_norm": 0.1526908278465271, + "learning_rate": 1.1638207250268834e-05, + "loss": 0.021, + "step": 15790 + }, + { + "epoch": 18.967587034813924, + "grad_norm": 0.13970109820365906, + "learning_rate": 1.158523582565782e-05, + "loss": 0.0254, + "step": 15800 + }, + { + "epoch": 18.979591836734695, + "grad_norm": 0.15840257704257965, + "learning_rate": 1.1532369425545192e-05, + "loss": 0.0231, + "step": 15810 + }, + { + "epoch": 18.991596638655462, + "grad_norm": 0.08998453617095947, + "learning_rate": 1.1479608194465662e-05, + "loss": 0.0239, + "step": 15820 + }, + { + "epoch": 19.00360144057623, + "grad_norm": 0.12575764954090118, + "learning_rate": 1.1426952276666442e-05, + "loss": 0.0194, + "step": 15830 + }, + { + "epoch": 19.015606242497, + "grad_norm": 0.13630954921245575, + "learning_rate": 1.1374401816106778e-05, + "loss": 0.0228, + "step": 15840 + }, + { + "epoch": 19.027611044417768, + "grad_norm": 0.12498758733272552, + "learning_rate": 1.1321956956457646e-05, + "loss": 0.0219, + "step": 15850 + }, + { + "epoch": 19.039615846338535, + "grad_norm": 0.13562022149562836, + "learning_rate": 1.1269617841101277e-05, + "loss": 0.0209, + "step": 15860 + }, + { + "epoch": 19.051620648259302, + "grad_norm": 0.12894918024539948, + "learning_rate": 1.1217384613130804e-05, + "loss": 0.022, + "step": 15870 + }, + { + "epoch": 19.063625450180073, + "grad_norm": 0.12263838201761246, + "learning_rate": 1.11652574153499e-05, + "loss": 0.0232, + "step": 15880 + }, + { + "epoch": 19.07563025210084, + "grad_norm": 0.1654883623123169, + "learning_rate": 1.1113236390272303e-05, + "loss": 0.0215, + "step": 15890 + }, + { + "epoch": 19.087635054021607, + "grad_norm": 0.12265368551015854, + "learning_rate": 1.106132168012155e-05, + "loss": 0.0216, + "step": 15900 + }, + { + "epoch": 19.099639855942378, + "grad_norm": 0.18860219419002533, + "learning_rate": 1.1009513426830448e-05, + "loss": 0.0233, + "step": 15910 + }, + { + "epoch": 19.111644657863145, + "grad_norm": 0.17681626975536346, + "learning_rate": 1.0957811772040777e-05, + "loss": 0.0237, + "step": 15920 + }, + { + "epoch": 19.123649459783913, + "grad_norm": 0.10194648802280426, + "learning_rate": 1.0906216857102913e-05, + "loss": 0.0198, + "step": 15930 + }, + { + "epoch": 19.135654261704683, + "grad_norm": 0.12515147030353546, + "learning_rate": 1.0854728823075355e-05, + "loss": 0.0234, + "step": 15940 + }, + { + "epoch": 19.14765906362545, + "grad_norm": 0.10452139377593994, + "learning_rate": 1.0803347810724452e-05, + "loss": 0.0203, + "step": 15950 + }, + { + "epoch": 19.159663865546218, + "grad_norm": 0.16746190190315247, + "learning_rate": 1.0752073960523911e-05, + "loss": 0.0237, + "step": 15960 + }, + { + "epoch": 19.171668667466985, + "grad_norm": 0.15083172917366028, + "learning_rate": 1.070090741265447e-05, + "loss": 0.0228, + "step": 15970 + }, + { + "epoch": 19.183673469387756, + "grad_norm": 0.12854328751564026, + "learning_rate": 1.0649848307003547e-05, + "loss": 0.0197, + "step": 15980 + }, + { + "epoch": 19.195678271308523, + "grad_norm": 0.1582222729921341, + "learning_rate": 1.0598896783164757e-05, + "loss": 0.0256, + "step": 15990 + }, + { + "epoch": 19.20768307322929, + "grad_norm": 0.1523444503545761, + "learning_rate": 1.0548052980437645e-05, + "loss": 0.0207, + "step": 16000 + }, + { + "epoch": 19.21968787515006, + "grad_norm": 0.11608922481536865, + "learning_rate": 1.049731703782722e-05, + "loss": 0.0215, + "step": 16010 + }, + { + "epoch": 19.23169267707083, + "grad_norm": 0.16081954538822174, + "learning_rate": 1.0446689094043587e-05, + "loss": 0.0221, + "step": 16020 + }, + { + "epoch": 19.243697478991596, + "grad_norm": 0.152528315782547, + "learning_rate": 1.039616928750165e-05, + "loss": 0.0235, + "step": 16030 + }, + { + "epoch": 19.255702280912367, + "grad_norm": 0.15294034779071808, + "learning_rate": 1.0345757756320612e-05, + "loss": 0.0208, + "step": 16040 + }, + { + "epoch": 19.267707082833134, + "grad_norm": 0.16937744617462158, + "learning_rate": 1.0295454638323666e-05, + "loss": 0.0233, + "step": 16050 + }, + { + "epoch": 19.2797118847539, + "grad_norm": 0.16836854815483093, + "learning_rate": 1.0245260071037632e-05, + "loss": 0.0248, + "step": 16060 + }, + { + "epoch": 19.29171668667467, + "grad_norm": 0.13892076909542084, + "learning_rate": 1.0195174191692518e-05, + "loss": 0.0214, + "step": 16070 + }, + { + "epoch": 19.30372148859544, + "grad_norm": 0.14129987359046936, + "learning_rate": 1.014519713722124e-05, + "loss": 0.0252, + "step": 16080 + }, + { + "epoch": 19.315726290516206, + "grad_norm": 0.12809252738952637, + "learning_rate": 1.0095329044259132e-05, + "loss": 0.0213, + "step": 16090 + }, + { + "epoch": 19.327731092436974, + "grad_norm": 0.11890455335378647, + "learning_rate": 1.004557004914365e-05, + "loss": 0.0228, + "step": 16100 + }, + { + "epoch": 19.339735894357744, + "grad_norm": 0.13688954710960388, + "learning_rate": 9.995920287914007e-06, + "loss": 0.0227, + "step": 16110 + }, + { + "epoch": 19.35174069627851, + "grad_norm": 0.1897711157798767, + "learning_rate": 9.946379896310737e-06, + "loss": 0.0233, + "step": 16120 + }, + { + "epoch": 19.36374549819928, + "grad_norm": 0.11930824816226959, + "learning_rate": 9.896949009775396e-06, + "loss": 0.0213, + "step": 16130 + }, + { + "epoch": 19.37575030012005, + "grad_norm": 0.14525601267814636, + "learning_rate": 9.847627763450134e-06, + "loss": 0.0206, + "step": 16140 + }, + { + "epoch": 19.387755102040817, + "grad_norm": 0.12926612794399261, + "learning_rate": 9.798416292177337e-06, + "loss": 0.0191, + "step": 16150 + }, + { + "epoch": 19.399759903961584, + "grad_norm": 0.12119969725608826, + "learning_rate": 9.74931473049932e-06, + "loss": 0.022, + "step": 16160 + }, + { + "epoch": 19.41176470588235, + "grad_norm": 0.09913256764411926, + "learning_rate": 9.700323212657847e-06, + "loss": 0.0197, + "step": 16170 + }, + { + "epoch": 19.423769507803122, + "grad_norm": 0.1334698498249054, + "learning_rate": 9.65144187259388e-06, + "loss": 0.02, + "step": 16180 + }, + { + "epoch": 19.43577430972389, + "grad_norm": 0.09185495227575302, + "learning_rate": 9.602670843947132e-06, + "loss": 0.0229, + "step": 16190 + }, + { + "epoch": 19.447779111644657, + "grad_norm": 0.11436789482831955, + "learning_rate": 9.554010260055713e-06, + "loss": 0.0241, + "step": 16200 + }, + { + "epoch": 19.459783913565428, + "grad_norm": 0.13417653739452362, + "learning_rate": 9.505460253955834e-06, + "loss": 0.0241, + "step": 16210 + }, + { + "epoch": 19.471788715486195, + "grad_norm": 0.12267237901687622, + "learning_rate": 9.457020958381324e-06, + "loss": 0.0227, + "step": 16220 + }, + { + "epoch": 19.483793517406962, + "grad_norm": 0.1499747484922409, + "learning_rate": 9.408692505763395e-06, + "loss": 0.0243, + "step": 16230 + }, + { + "epoch": 19.495798319327733, + "grad_norm": 0.1374678760766983, + "learning_rate": 9.360475028230181e-06, + "loss": 0.0212, + "step": 16240 + }, + { + "epoch": 19.5078031212485, + "grad_norm": 0.14273658394813538, + "learning_rate": 9.312368657606412e-06, + "loss": 0.0226, + "step": 16250 + }, + { + "epoch": 19.519807923169267, + "grad_norm": 0.14158613979816437, + "learning_rate": 9.264373525413096e-06, + "loss": 0.02, + "step": 16260 + }, + { + "epoch": 19.531812725090035, + "grad_norm": 0.1393531858921051, + "learning_rate": 9.216489762867058e-06, + "loss": 0.0228, + "step": 16270 + }, + { + "epoch": 19.543817527010805, + "grad_norm": 0.15625661611557007, + "learning_rate": 9.168717500880708e-06, + "loss": 0.0197, + "step": 16280 + }, + { + "epoch": 19.555822328931573, + "grad_norm": 0.1778373271226883, + "learning_rate": 9.121056870061574e-06, + "loss": 0.0213, + "step": 16290 + }, + { + "epoch": 19.56782713085234, + "grad_norm": 0.15337654948234558, + "learning_rate": 9.073508000711983e-06, + "loss": 0.0222, + "step": 16300 + }, + { + "epoch": 19.57983193277311, + "grad_norm": 0.12559740245342255, + "learning_rate": 9.026071022828758e-06, + "loss": 0.0231, + "step": 16310 + }, + { + "epoch": 19.591836734693878, + "grad_norm": 0.14935651421546936, + "learning_rate": 8.978746066102771e-06, + "loss": 0.0211, + "step": 16320 + }, + { + "epoch": 19.603841536614645, + "grad_norm": 0.12193058431148529, + "learning_rate": 8.931533259918634e-06, + "loss": 0.021, + "step": 16330 + }, + { + "epoch": 19.615846338535412, + "grad_norm": 0.22444473206996918, + "learning_rate": 8.884432733354382e-06, + "loss": 0.0234, + "step": 16340 + }, + { + "epoch": 19.627851140456183, + "grad_norm": 0.11516913771629333, + "learning_rate": 8.837444615181029e-06, + "loss": 0.0224, + "step": 16350 + }, + { + "epoch": 19.63985594237695, + "grad_norm": 0.11824783682823181, + "learning_rate": 8.790569033862323e-06, + "loss": 0.0218, + "step": 16360 + }, + { + "epoch": 19.651860744297718, + "grad_norm": 0.11721570044755936, + "learning_rate": 8.7438061175543e-06, + "loss": 0.0216, + "step": 16370 + }, + { + "epoch": 19.66386554621849, + "grad_norm": 0.17987000942230225, + "learning_rate": 8.697155994104978e-06, + "loss": 0.02, + "step": 16380 + }, + { + "epoch": 19.675870348139256, + "grad_norm": 0.09437516331672668, + "learning_rate": 8.650618791054033e-06, + "loss": 0.0203, + "step": 16390 + }, + { + "epoch": 19.687875150060023, + "grad_norm": 0.18028759956359863, + "learning_rate": 8.604194635632373e-06, + "loss": 0.0241, + "step": 16400 + }, + { + "epoch": 19.699879951980794, + "grad_norm": 0.12108098715543747, + "learning_rate": 8.557883654761906e-06, + "loss": 0.0191, + "step": 16410 + }, + { + "epoch": 19.71188475390156, + "grad_norm": 0.10702801495790482, + "learning_rate": 8.511685975055061e-06, + "loss": 0.0242, + "step": 16420 + }, + { + "epoch": 19.72388955582233, + "grad_norm": 0.17390590906143188, + "learning_rate": 8.46560172281452e-06, + "loss": 0.0213, + "step": 16430 + }, + { + "epoch": 19.735894357743096, + "grad_norm": 0.12297596037387848, + "learning_rate": 8.419631024032893e-06, + "loss": 0.0197, + "step": 16440 + }, + { + "epoch": 19.747899159663866, + "grad_norm": 0.0994333028793335, + "learning_rate": 8.373774004392293e-06, + "loss": 0.0191, + "step": 16450 + }, + { + "epoch": 19.759903961584634, + "grad_norm": 0.1316864788532257, + "learning_rate": 8.32803078926409e-06, + "loss": 0.0214, + "step": 16460 + }, + { + "epoch": 19.7719087635054, + "grad_norm": 0.11829479038715363, + "learning_rate": 8.282401503708454e-06, + "loss": 0.0219, + "step": 16470 + }, + { + "epoch": 19.78391356542617, + "grad_norm": 0.12761950492858887, + "learning_rate": 8.23688627247412e-06, + "loss": 0.0194, + "step": 16480 + }, + { + "epoch": 19.79591836734694, + "grad_norm": 0.1503424495458603, + "learning_rate": 8.191485219998007e-06, + "loss": 0.0226, + "step": 16490 + }, + { + "epoch": 19.807923169267706, + "grad_norm": 0.10854245722293854, + "learning_rate": 8.146198470404843e-06, + "loss": 0.0211, + "step": 16500 + }, + { + "epoch": 19.819927971188477, + "grad_norm": 0.12020760029554367, + "learning_rate": 8.101026147506897e-06, + "loss": 0.021, + "step": 16510 + }, + { + "epoch": 19.831932773109244, + "grad_norm": 0.1152350977063179, + "learning_rate": 8.05596837480353e-06, + "loss": 0.0222, + "step": 16520 + }, + { + "epoch": 19.84393757503001, + "grad_norm": 0.220518097281456, + "learning_rate": 8.011025275480998e-06, + "loss": 0.0231, + "step": 16530 + }, + { + "epoch": 19.85594237695078, + "grad_norm": 0.11797124892473221, + "learning_rate": 7.966196972412027e-06, + "loss": 0.0194, + "step": 16540 + }, + { + "epoch": 19.86794717887155, + "grad_norm": 0.1151261106133461, + "learning_rate": 7.92148358815547e-06, + "loss": 0.021, + "step": 16550 + }, + { + "epoch": 19.879951980792317, + "grad_norm": 0.1691550612449646, + "learning_rate": 7.87688524495604e-06, + "loss": 0.0215, + "step": 16560 + }, + { + "epoch": 19.891956782713084, + "grad_norm": 0.14024755358695984, + "learning_rate": 7.83240206474386e-06, + "loss": 0.019, + "step": 16570 + }, + { + "epoch": 19.903961584633855, + "grad_norm": 0.14472675323486328, + "learning_rate": 7.788034169134272e-06, + "loss": 0.0211, + "step": 16580 + }, + { + "epoch": 19.915966386554622, + "grad_norm": 0.11227346956729889, + "learning_rate": 7.743781679427414e-06, + "loss": 0.0225, + "step": 16590 + }, + { + "epoch": 19.92797118847539, + "grad_norm": 0.13753509521484375, + "learning_rate": 7.699644716607895e-06, + "loss": 0.0204, + "step": 16600 + }, + { + "epoch": 19.939975990396157, + "grad_norm": 0.18487776815891266, + "learning_rate": 7.655623401344486e-06, + "loss": 0.0225, + "step": 16610 + }, + { + "epoch": 19.951980792316927, + "grad_norm": 0.1101747527718544, + "learning_rate": 7.611717853989775e-06, + "loss": 0.0231, + "step": 16620 + }, + { + "epoch": 19.963985594237695, + "grad_norm": 0.14449921250343323, + "learning_rate": 7.567928194579854e-06, + "loss": 0.0235, + "step": 16630 + }, + { + "epoch": 19.975990396158462, + "grad_norm": 0.1550910770893097, + "learning_rate": 7.524254542833997e-06, + "loss": 0.0207, + "step": 16640 + }, + { + "epoch": 19.987995198079233, + "grad_norm": 0.18484516441822052, + "learning_rate": 7.480697018154286e-06, + "loss": 0.0226, + "step": 16650 + }, + { + "epoch": 20.0, + "grad_norm": 0.2342870682477951, + "learning_rate": 7.437255739625332e-06, + "loss": 0.0197, + "step": 16660 + }, + { + "epoch": 20.012004801920767, + "grad_norm": 0.12062092870473862, + "learning_rate": 7.393930826013923e-06, + "loss": 0.0192, + "step": 16670 + }, + { + "epoch": 20.024009603841538, + "grad_norm": 0.2269190400838852, + "learning_rate": 7.350722395768722e-06, + "loss": 0.0245, + "step": 16680 + }, + { + "epoch": 20.036014405762305, + "grad_norm": 0.13567215204238892, + "learning_rate": 7.307630567019963e-06, + "loss": 0.0215, + "step": 16690 + }, + { + "epoch": 20.048019207683073, + "grad_norm": 0.2158946692943573, + "learning_rate": 7.264655457579e-06, + "loss": 0.0237, + "step": 16700 + }, + { + "epoch": 20.06002400960384, + "grad_norm": 0.12031452357769012, + "learning_rate": 7.221797184938184e-06, + "loss": 0.0186, + "step": 16710 + }, + { + "epoch": 20.07202881152461, + "grad_norm": 0.1283365786075592, + "learning_rate": 7.179055866270373e-06, + "loss": 0.0188, + "step": 16720 + }, + { + "epoch": 20.084033613445378, + "grad_norm": 0.11736376583576202, + "learning_rate": 7.136431618428707e-06, + "loss": 0.0197, + "step": 16730 + }, + { + "epoch": 20.096038415366145, + "grad_norm": 0.10194515436887741, + "learning_rate": 7.09392455794628e-06, + "loss": 0.02, + "step": 16740 + }, + { + "epoch": 20.108043217286916, + "grad_norm": 0.14640851318836212, + "learning_rate": 7.051534801035725e-06, + "loss": 0.0202, + "step": 16750 + }, + { + "epoch": 20.120048019207683, + "grad_norm": 0.12677054107189178, + "learning_rate": 7.00926246358905e-06, + "loss": 0.021, + "step": 16760 + }, + { + "epoch": 20.13205282112845, + "grad_norm": 0.16819103062152863, + "learning_rate": 6.967107661177191e-06, + "loss": 0.0235, + "step": 16770 + }, + { + "epoch": 20.14405762304922, + "grad_norm": 0.12303987145423889, + "learning_rate": 6.925070509049786e-06, + "loss": 0.0196, + "step": 16780 + }, + { + "epoch": 20.15606242496999, + "grad_norm": 0.11713185906410217, + "learning_rate": 6.883151122134812e-06, + "loss": 0.0231, + "step": 16790 + }, + { + "epoch": 20.168067226890756, + "grad_norm": 0.14809568226337433, + "learning_rate": 6.8413496150382394e-06, + "loss": 0.0199, + "step": 16800 + }, + { + "epoch": 20.180072028811523, + "grad_norm": 0.14293716847896576, + "learning_rate": 6.7996661020438165e-06, + "loss": 0.0241, + "step": 16810 + }, + { + "epoch": 20.192076830732294, + "grad_norm": 0.1441735178232193, + "learning_rate": 6.758100697112662e-06, + "loss": 0.0206, + "step": 16820 + }, + { + "epoch": 20.20408163265306, + "grad_norm": 0.1063256487250328, + "learning_rate": 6.716653513883026e-06, + "loss": 0.0232, + "step": 16830 + }, + { + "epoch": 20.21608643457383, + "grad_norm": 0.18371447920799255, + "learning_rate": 6.675324665669913e-06, + "loss": 0.0243, + "step": 16840 + }, + { + "epoch": 20.2280912364946, + "grad_norm": 0.07694540917873383, + "learning_rate": 6.634114265464803e-06, + "loss": 0.0226, + "step": 16850 + }, + { + "epoch": 20.240096038415366, + "grad_norm": 0.13379435241222382, + "learning_rate": 6.59302242593538e-06, + "loss": 0.0196, + "step": 16860 + }, + { + "epoch": 20.252100840336134, + "grad_norm": 0.13470886647701263, + "learning_rate": 6.552049259425141e-06, + "loss": 0.0248, + "step": 16870 + }, + { + "epoch": 20.264105642256904, + "grad_norm": 0.1712941825389862, + "learning_rate": 6.511194877953181e-06, + "loss": 0.0217, + "step": 16880 + }, + { + "epoch": 20.27611044417767, + "grad_norm": 0.1474715620279312, + "learning_rate": 6.470459393213813e-06, + "loss": 0.0216, + "step": 16890 + }, + { + "epoch": 20.28811524609844, + "grad_norm": 0.11019083857536316, + "learning_rate": 6.429842916576279e-06, + "loss": 0.022, + "step": 16900 + }, + { + "epoch": 20.300120048019206, + "grad_norm": 0.1208374872803688, + "learning_rate": 6.389345559084503e-06, + "loss": 0.0196, + "step": 16910 + }, + { + "epoch": 20.312124849939977, + "grad_norm": 0.09259053319692612, + "learning_rate": 6.348967431456682e-06, + "loss": 0.0227, + "step": 16920 + }, + { + "epoch": 20.324129651860744, + "grad_norm": 0.1276562213897705, + "learning_rate": 6.30870864408511e-06, + "loss": 0.0205, + "step": 16930 + }, + { + "epoch": 20.33613445378151, + "grad_norm": 0.1518024206161499, + "learning_rate": 6.268569307035754e-06, + "loss": 0.0208, + "step": 16940 + }, + { + "epoch": 20.348139255702282, + "grad_norm": 0.13045068085193634, + "learning_rate": 6.228549530048022e-06, + "loss": 0.0213, + "step": 16950 + }, + { + "epoch": 20.36014405762305, + "grad_norm": 0.1303977221250534, + "learning_rate": 6.1886494225344814e-06, + "loss": 0.0175, + "step": 16960 + }, + { + "epoch": 20.372148859543817, + "grad_norm": 0.15551415085792542, + "learning_rate": 6.148869093580479e-06, + "loss": 0.0196, + "step": 16970 + }, + { + "epoch": 20.384153661464588, + "grad_norm": 0.08998840302228928, + "learning_rate": 6.109208651943921e-06, + "loss": 0.0203, + "step": 16980 + }, + { + "epoch": 20.396158463385355, + "grad_norm": 0.14142873883247375, + "learning_rate": 6.069668206054946e-06, + "loss": 0.0201, + "step": 16990 + }, + { + "epoch": 20.408163265306122, + "grad_norm": 0.16088749468326569, + "learning_rate": 6.0302478640156145e-06, + "loss": 0.0219, + "step": 17000 + }, + { + "epoch": 20.42016806722689, + "grad_norm": 0.0903446152806282, + "learning_rate": 5.990947733599644e-06, + "loss": 0.0221, + "step": 17010 + }, + { + "epoch": 20.43217286914766, + "grad_norm": 0.11717907339334488, + "learning_rate": 5.951767922252105e-06, + "loss": 0.022, + "step": 17020 + }, + { + "epoch": 20.444177671068427, + "grad_norm": 0.10248741507530212, + "learning_rate": 5.912708537089068e-06, + "loss": 0.0202, + "step": 17030 + }, + { + "epoch": 20.456182472989195, + "grad_norm": 0.14726370573043823, + "learning_rate": 5.873769684897434e-06, + "loss": 0.0217, + "step": 17040 + }, + { + "epoch": 20.468187274909965, + "grad_norm": 0.14408385753631592, + "learning_rate": 5.834951472134514e-06, + "loss": 0.0215, + "step": 17050 + }, + { + "epoch": 20.480192076830733, + "grad_norm": 0.1271083801984787, + "learning_rate": 5.796254004927832e-06, + "loss": 0.0215, + "step": 17060 + }, + { + "epoch": 20.4921968787515, + "grad_norm": 0.11219154298305511, + "learning_rate": 5.757677389074806e-06, + "loss": 0.0191, + "step": 17070 + }, + { + "epoch": 20.504201680672267, + "grad_norm": 0.21058289706707, + "learning_rate": 5.719221730042385e-06, + "loss": 0.0205, + "step": 17080 + }, + { + "epoch": 20.516206482593038, + "grad_norm": 0.16587409377098083, + "learning_rate": 5.680887132966911e-06, + "loss": 0.0249, + "step": 17090 + }, + { + "epoch": 20.528211284513805, + "grad_norm": 0.12012436240911484, + "learning_rate": 5.642673702653683e-06, + "loss": 0.0186, + "step": 17100 + }, + { + "epoch": 20.540216086434572, + "grad_norm": 0.14798694849014282, + "learning_rate": 5.604581543576781e-06, + "loss": 0.0199, + "step": 17110 + }, + { + "epoch": 20.552220888355343, + "grad_norm": 0.11643534153699875, + "learning_rate": 5.566610759878704e-06, + "loss": 0.0222, + "step": 17120 + }, + { + "epoch": 20.56422569027611, + "grad_norm": 0.1652154177427292, + "learning_rate": 5.528761455370119e-06, + "loss": 0.0213, + "step": 17130 + }, + { + "epoch": 20.576230492196878, + "grad_norm": 0.12825289368629456, + "learning_rate": 5.491033733529594e-06, + "loss": 0.0206, + "step": 17140 + }, + { + "epoch": 20.58823529411765, + "grad_norm": 0.17129714787006378, + "learning_rate": 5.453427697503255e-06, + "loss": 0.0199, + "step": 17150 + }, + { + "epoch": 20.600240096038416, + "grad_norm": 0.15347318351268768, + "learning_rate": 5.415943450104599e-06, + "loss": 0.0226, + "step": 17160 + }, + { + "epoch": 20.612244897959183, + "grad_norm": 0.1529969424009323, + "learning_rate": 5.378581093814111e-06, + "loss": 0.0187, + "step": 17170 + }, + { + "epoch": 20.62424969987995, + "grad_norm": 0.10956297069787979, + "learning_rate": 5.3413407307790375e-06, + "loss": 0.019, + "step": 17180 + }, + { + "epoch": 20.63625450180072, + "grad_norm": 0.18802589178085327, + "learning_rate": 5.30422246281313e-06, + "loss": 0.0199, + "step": 17190 + }, + { + "epoch": 20.64825930372149, + "grad_norm": 0.1751340627670288, + "learning_rate": 5.267226391396296e-06, + "loss": 0.0215, + "step": 17200 + }, + { + "epoch": 20.660264105642256, + "grad_norm": 0.11301999539136887, + "learning_rate": 5.2303526176744e-06, + "loss": 0.0179, + "step": 17210 + }, + { + "epoch": 20.672268907563026, + "grad_norm": 0.135145902633667, + "learning_rate": 5.193601242458929e-06, + "loss": 0.02, + "step": 17220 + }, + { + "epoch": 20.684273709483794, + "grad_norm": 0.14437155425548553, + "learning_rate": 5.156972366226714e-06, + "loss": 0.0221, + "step": 17230 + }, + { + "epoch": 20.69627851140456, + "grad_norm": 0.12211228162050247, + "learning_rate": 5.120466089119735e-06, + "loss": 0.0173, + "step": 17240 + }, + { + "epoch": 20.70828331332533, + "grad_norm": 0.15170927345752716, + "learning_rate": 5.084082510944749e-06, + "loss": 0.0191, + "step": 17250 + }, + { + "epoch": 20.7202881152461, + "grad_norm": 0.11326973885297775, + "learning_rate": 5.047821731173058e-06, + "loss": 0.02, + "step": 17260 + }, + { + "epoch": 20.732292917166866, + "grad_norm": 0.15162760019302368, + "learning_rate": 5.011683848940274e-06, + "loss": 0.0209, + "step": 17270 + }, + { + "epoch": 20.744297719087633, + "grad_norm": 0.10142092406749725, + "learning_rate": 4.975668963045954e-06, + "loss": 0.0193, + "step": 17280 + }, + { + "epoch": 20.756302521008404, + "grad_norm": 0.1406378298997879, + "learning_rate": 4.9397771719534525e-06, + "loss": 0.0209, + "step": 17290 + }, + { + "epoch": 20.76830732292917, + "grad_norm": 0.15963228046894073, + "learning_rate": 4.904008573789548e-06, + "loss": 0.0202, + "step": 17300 + }, + { + "epoch": 20.78031212484994, + "grad_norm": 0.16755786538124084, + "learning_rate": 4.8683632663442005e-06, + "loss": 0.0215, + "step": 17310 + }, + { + "epoch": 20.79231692677071, + "grad_norm": 0.13985559344291687, + "learning_rate": 4.832841347070343e-06, + "loss": 0.0198, + "step": 17320 + }, + { + "epoch": 20.804321728691477, + "grad_norm": 0.18730008602142334, + "learning_rate": 4.797442913083539e-06, + "loss": 0.0228, + "step": 17330 + }, + { + "epoch": 20.816326530612244, + "grad_norm": 0.11879587173461914, + "learning_rate": 4.7621680611617596e-06, + "loss": 0.0204, + "step": 17340 + }, + { + "epoch": 20.828331332533015, + "grad_norm": 0.1258353888988495, + "learning_rate": 4.727016887745095e-06, + "loss": 0.0187, + "step": 17350 + }, + { + "epoch": 20.840336134453782, + "grad_norm": 0.11064130812883377, + "learning_rate": 4.691989488935511e-06, + "loss": 0.0212, + "step": 17360 + }, + { + "epoch": 20.85234093637455, + "grad_norm": 0.16020932793617249, + "learning_rate": 4.657085960496588e-06, + "loss": 0.0219, + "step": 17370 + }, + { + "epoch": 20.864345738295317, + "grad_norm": 0.12057816237211227, + "learning_rate": 4.6223063978532265e-06, + "loss": 0.0193, + "step": 17380 + }, + { + "epoch": 20.876350540216087, + "grad_norm": 0.10326166450977325, + "learning_rate": 4.587650896091439e-06, + "loss": 0.0203, + "step": 17390 + }, + { + "epoch": 20.888355342136855, + "grad_norm": 0.13974909484386444, + "learning_rate": 4.553119549958035e-06, + "loss": 0.021, + "step": 17400 + }, + { + "epoch": 20.900360144057622, + "grad_norm": 0.12561772763729095, + "learning_rate": 4.518712453860385e-06, + "loss": 0.0198, + "step": 17410 + }, + { + "epoch": 20.912364945978393, + "grad_norm": 0.12127188593149185, + "learning_rate": 4.484429701866205e-06, + "loss": 0.0231, + "step": 17420 + }, + { + "epoch": 20.92436974789916, + "grad_norm": 0.19657544791698456, + "learning_rate": 4.4502713877031975e-06, + "loss": 0.0217, + "step": 17430 + }, + { + "epoch": 20.936374549819927, + "grad_norm": 0.145889014005661, + "learning_rate": 4.416237604758911e-06, + "loss": 0.0227, + "step": 17440 + }, + { + "epoch": 20.948379351740698, + "grad_norm": 0.11928942054510117, + "learning_rate": 4.3823284460804025e-06, + "loss": 0.0163, + "step": 17450 + }, + { + "epoch": 20.960384153661465, + "grad_norm": 0.12276489287614822, + "learning_rate": 4.348544004374011e-06, + "loss": 0.0198, + "step": 17460 + }, + { + "epoch": 20.972388955582232, + "grad_norm": 0.1726096123456955, + "learning_rate": 4.314884372005123e-06, + "loss": 0.0215, + "step": 17470 + }, + { + "epoch": 20.984393757503, + "grad_norm": 0.10986056923866272, + "learning_rate": 4.281349640997867e-06, + "loss": 0.0238, + "step": 17480 + }, + { + "epoch": 20.99639855942377, + "grad_norm": 0.13153740763664246, + "learning_rate": 4.247939903034942e-06, + "loss": 0.02, + "step": 17490 + }, + { + "epoch": 21.008403361344538, + "grad_norm": 0.12088081240653992, + "learning_rate": 4.214655249457284e-06, + "loss": 0.0216, + "step": 17500 + }, + { + "epoch": 21.020408163265305, + "grad_norm": 0.11701159924268723, + "learning_rate": 4.181495771263855e-06, + "loss": 0.0209, + "step": 17510 + }, + { + "epoch": 21.032412965186076, + "grad_norm": 0.07905508577823639, + "learning_rate": 4.148461559111427e-06, + "loss": 0.0192, + "step": 17520 + }, + { + "epoch": 21.044417767106843, + "grad_norm": 0.17528478801250458, + "learning_rate": 4.115552703314252e-06, + "loss": 0.0212, + "step": 17530 + }, + { + "epoch": 21.05642256902761, + "grad_norm": 0.2229757010936737, + "learning_rate": 4.082769293843886e-06, + "loss": 0.0196, + "step": 17540 + }, + { + "epoch": 21.068427370948378, + "grad_norm": 0.11768965423107147, + "learning_rate": 4.050111420328939e-06, + "loss": 0.023, + "step": 17550 + }, + { + "epoch": 21.08043217286915, + "grad_norm": 0.12043041735887527, + "learning_rate": 4.017579172054764e-06, + "loss": 0.022, + "step": 17560 + }, + { + "epoch": 21.092436974789916, + "grad_norm": 0.127870112657547, + "learning_rate": 3.985172637963308e-06, + "loss": 0.0198, + "step": 17570 + }, + { + "epoch": 21.104441776710683, + "grad_norm": 0.11847793310880661, + "learning_rate": 3.952891906652784e-06, + "loss": 0.021, + "step": 17580 + }, + { + "epoch": 21.116446578631454, + "grad_norm": 0.12154414504766464, + "learning_rate": 3.920737066377478e-06, + "loss": 0.02, + "step": 17590 + }, + { + "epoch": 21.12845138055222, + "grad_norm": 0.13587813079357147, + "learning_rate": 3.888708205047509e-06, + "loss": 0.0192, + "step": 17600 + }, + { + "epoch": 21.140456182472988, + "grad_norm": 0.19377745687961578, + "learning_rate": 3.856805410228542e-06, + "loss": 0.0224, + "step": 17610 + }, + { + "epoch": 21.15246098439376, + "grad_norm": 0.10134905576705933, + "learning_rate": 3.82502876914162e-06, + "loss": 0.02, + "step": 17620 + }, + { + "epoch": 21.164465786314526, + "grad_norm": 0.12543246150016785, + "learning_rate": 3.7933783686628586e-06, + "loss": 0.0195, + "step": 17630 + }, + { + "epoch": 21.176470588235293, + "grad_norm": 0.11272493004798889, + "learning_rate": 3.7618542953232306e-06, + "loss": 0.0205, + "step": 17640 + }, + { + "epoch": 21.18847539015606, + "grad_norm": 0.09283991903066635, + "learning_rate": 3.7304566353083658e-06, + "loss": 0.0208, + "step": 17650 + }, + { + "epoch": 21.20048019207683, + "grad_norm": 0.16455183923244476, + "learning_rate": 3.6991854744582555e-06, + "loss": 0.0208, + "step": 17660 + }, + { + "epoch": 21.2124849939976, + "grad_norm": 0.1288999766111374, + "learning_rate": 3.6680408982670777e-06, + "loss": 0.0215, + "step": 17670 + }, + { + "epoch": 21.224489795918366, + "grad_norm": 0.10709884762763977, + "learning_rate": 3.637022991882899e-06, + "loss": 0.0183, + "step": 17680 + }, + { + "epoch": 21.236494597839137, + "grad_norm": 0.130048006772995, + "learning_rate": 3.606131840107485e-06, + "loss": 0.0193, + "step": 17690 + }, + { + "epoch": 21.248499399759904, + "grad_norm": 0.12556588649749756, + "learning_rate": 3.575367527396084e-06, + "loss": 0.0214, + "step": 17700 + }, + { + "epoch": 21.26050420168067, + "grad_norm": 0.13335424661636353, + "learning_rate": 3.5447301378571386e-06, + "loss": 0.0195, + "step": 17710 + }, + { + "epoch": 21.272509003601442, + "grad_norm": 0.15025152266025543, + "learning_rate": 3.514219755252113e-06, + "loss": 0.0185, + "step": 17720 + }, + { + "epoch": 21.28451380552221, + "grad_norm": 0.12791404128074646, + "learning_rate": 3.4838364629952213e-06, + "loss": 0.0194, + "step": 17730 + }, + { + "epoch": 21.296518607442977, + "grad_norm": 0.12671828269958496, + "learning_rate": 3.4535803441532123e-06, + "loss": 0.02, + "step": 17740 + }, + { + "epoch": 21.308523409363744, + "grad_norm": 0.14121338725090027, + "learning_rate": 3.4234514814451836e-06, + "loss": 0.0183, + "step": 17750 + }, + { + "epoch": 21.320528211284515, + "grad_norm": 0.1112084835767746, + "learning_rate": 3.393449957242273e-06, + "loss": 0.0187, + "step": 17760 + }, + { + "epoch": 21.332533013205282, + "grad_norm": 0.09233396500349045, + "learning_rate": 3.363575853567524e-06, + "loss": 0.0204, + "step": 17770 + }, + { + "epoch": 21.34453781512605, + "grad_norm": 0.16576342284679413, + "learning_rate": 3.3338292520955826e-06, + "loss": 0.0192, + "step": 17780 + }, + { + "epoch": 21.35654261704682, + "grad_norm": 0.14751233160495758, + "learning_rate": 3.304210234152516e-06, + "loss": 0.0201, + "step": 17790 + }, + { + "epoch": 21.368547418967587, + "grad_norm": 0.1318328082561493, + "learning_rate": 3.2747188807155993e-06, + "loss": 0.02, + "step": 17800 + }, + { + "epoch": 21.380552220888354, + "grad_norm": 0.11076328158378601, + "learning_rate": 3.2453552724130643e-06, + "loss": 0.0201, + "step": 17810 + }, + { + "epoch": 21.392557022809125, + "grad_norm": 0.15397198498249054, + "learning_rate": 3.216119489523889e-06, + "loss": 0.0201, + "step": 17820 + }, + { + "epoch": 21.404561824729893, + "grad_norm": 0.10876789689064026, + "learning_rate": 3.1870116119775917e-06, + "loss": 0.02, + "step": 17830 + }, + { + "epoch": 21.41656662665066, + "grad_norm": 0.1177273839712143, + "learning_rate": 3.158031719353999e-06, + "loss": 0.0207, + "step": 17840 + }, + { + "epoch": 21.428571428571427, + "grad_norm": 0.14359837770462036, + "learning_rate": 3.1291798908830273e-06, + "loss": 0.0214, + "step": 17850 + }, + { + "epoch": 21.440576230492198, + "grad_norm": 0.14639292657375336, + "learning_rate": 3.1004562054444853e-06, + "loss": 0.0187, + "step": 17860 + }, + { + "epoch": 21.452581032412965, + "grad_norm": 0.12852923572063446, + "learning_rate": 3.071860741567806e-06, + "loss": 0.0209, + "step": 17870 + }, + { + "epoch": 21.464585834333732, + "grad_norm": 0.14038079977035522, + "learning_rate": 3.04339357743193e-06, + "loss": 0.0196, + "step": 17880 + }, + { + "epoch": 21.476590636254503, + "grad_norm": 0.11521651595830917, + "learning_rate": 3.0150547908649628e-06, + "loss": 0.0194, + "step": 17890 + }, + { + "epoch": 21.48859543817527, + "grad_norm": 0.16090793907642365, + "learning_rate": 2.9868444593440957e-06, + "loss": 0.0205, + "step": 17900 + }, + { + "epoch": 21.500600240096038, + "grad_norm": 0.11025667935609818, + "learning_rate": 2.9587626599952846e-06, + "loss": 0.0192, + "step": 17910 + }, + { + "epoch": 21.51260504201681, + "grad_norm": 0.12520357966423035, + "learning_rate": 2.930809469593082e-06, + "loss": 0.0194, + "step": 17920 + }, + { + "epoch": 21.524609843937576, + "grad_norm": 0.09443973004817963, + "learning_rate": 2.9029849645604733e-06, + "loss": 0.0203, + "step": 17930 + }, + { + "epoch": 21.536614645858343, + "grad_norm": 0.1454503834247589, + "learning_rate": 2.8752892209685632e-06, + "loss": 0.0207, + "step": 17940 + }, + { + "epoch": 21.54861944777911, + "grad_norm": 0.11792952567338943, + "learning_rate": 2.847722314536483e-06, + "loss": 0.0187, + "step": 17950 + }, + { + "epoch": 21.56062424969988, + "grad_norm": 0.16851238906383514, + "learning_rate": 2.820284320631078e-06, + "loss": 0.0188, + "step": 17960 + }, + { + "epoch": 21.57262905162065, + "grad_norm": 0.2927883267402649, + "learning_rate": 2.792975314266788e-06, + "loss": 0.0212, + "step": 17970 + }, + { + "epoch": 21.584633853541416, + "grad_norm": 0.12130081653594971, + "learning_rate": 2.7657953701054007e-06, + "loss": 0.0194, + "step": 17980 + }, + { + "epoch": 21.596638655462186, + "grad_norm": 0.16011787950992584, + "learning_rate": 2.7387445624558306e-06, + "loss": 0.0179, + "step": 17990 + }, + { + "epoch": 21.608643457382954, + "grad_norm": 0.15392538905143738, + "learning_rate": 2.7118229652739747e-06, + "loss": 0.0171, + "step": 18000 + }, + { + "epoch": 21.62064825930372, + "grad_norm": 0.0959198921918869, + "learning_rate": 2.6850306521624236e-06, + "loss": 0.0187, + "step": 18010 + }, + { + "epoch": 21.632653061224488, + "grad_norm": 0.12777657806873322, + "learning_rate": 2.6583676963703507e-06, + "loss": 0.0202, + "step": 18020 + }, + { + "epoch": 21.64465786314526, + "grad_norm": 0.14035558700561523, + "learning_rate": 2.631834170793268e-06, + "loss": 0.0205, + "step": 18030 + }, + { + "epoch": 21.656662665066026, + "grad_norm": 0.12026821076869965, + "learning_rate": 2.6054301479728036e-06, + "loss": 0.0192, + "step": 18040 + }, + { + "epoch": 21.668667466986793, + "grad_norm": 0.11479002982378006, + "learning_rate": 2.579155700096575e-06, + "loss": 0.0224, + "step": 18050 + }, + { + "epoch": 21.680672268907564, + "grad_norm": 0.14522339403629303, + "learning_rate": 2.5530108989978873e-06, + "loss": 0.0231, + "step": 18060 + }, + { + "epoch": 21.69267707082833, + "grad_norm": 0.16385529935359955, + "learning_rate": 2.5269958161556416e-06, + "loss": 0.0217, + "step": 18070 + }, + { + "epoch": 21.7046818727491, + "grad_norm": 0.07365598529577255, + "learning_rate": 2.5011105226940888e-06, + "loss": 0.0198, + "step": 18080 + }, + { + "epoch": 21.71668667466987, + "grad_norm": 0.21005652844905853, + "learning_rate": 2.4753550893826248e-06, + "loss": 0.0199, + "step": 18090 + }, + { + "epoch": 21.728691476590637, + "grad_norm": 0.09551231563091278, + "learning_rate": 2.4497295866356296e-06, + "loss": 0.0174, + "step": 18100 + }, + { + "epoch": 21.740696278511404, + "grad_norm": 0.15059629082679749, + "learning_rate": 2.424234084512228e-06, + "loss": 0.0191, + "step": 18110 + }, + { + "epoch": 21.75270108043217, + "grad_norm": 0.09909665584564209, + "learning_rate": 2.3988686527161687e-06, + "loss": 0.0171, + "step": 18120 + }, + { + "epoch": 21.764705882352942, + "grad_norm": 0.11979085952043533, + "learning_rate": 2.373633360595573e-06, + "loss": 0.022, + "step": 18130 + }, + { + "epoch": 21.77671068427371, + "grad_norm": 0.08476952463388443, + "learning_rate": 2.3485282771427585e-06, + "loss": 0.0202, + "step": 18140 + }, + { + "epoch": 21.788715486194477, + "grad_norm": 0.2004087120294571, + "learning_rate": 2.3235534709940665e-06, + "loss": 0.0224, + "step": 18150 + }, + { + "epoch": 21.800720288115247, + "grad_norm": 0.11735227704048157, + "learning_rate": 2.2987090104296617e-06, + "loss": 0.021, + "step": 18160 + }, + { + "epoch": 21.812725090036015, + "grad_norm": 0.12657985091209412, + "learning_rate": 2.273994963373355e-06, + "loss": 0.0208, + "step": 18170 + }, + { + "epoch": 21.824729891956782, + "grad_norm": 0.13143570721149445, + "learning_rate": 2.249411397392409e-06, + "loss": 0.0196, + "step": 18180 + }, + { + "epoch": 21.836734693877553, + "grad_norm": 0.0994463786482811, + "learning_rate": 2.2249583796973506e-06, + "loss": 0.0196, + "step": 18190 + }, + { + "epoch": 21.84873949579832, + "grad_norm": 0.11054611951112747, + "learning_rate": 2.200635977141796e-06, + "loss": 0.0196, + "step": 18200 + }, + { + "epoch": 21.860744297719087, + "grad_norm": 0.09281840920448303, + "learning_rate": 2.17644425622226e-06, + "loss": 0.0202, + "step": 18210 + }, + { + "epoch": 21.872749099639854, + "grad_norm": 0.12430454045534134, + "learning_rate": 2.152383283077991e-06, + "loss": 0.0196, + "step": 18220 + }, + { + "epoch": 21.884753901560625, + "grad_norm": 0.11687669903039932, + "learning_rate": 2.128453123490781e-06, + "loss": 0.0195, + "step": 18230 + }, + { + "epoch": 21.896758703481392, + "grad_norm": 0.15283095836639404, + "learning_rate": 2.1046538428847462e-06, + "loss": 0.02, + "step": 18240 + }, + { + "epoch": 21.90876350540216, + "grad_norm": 0.1463906317949295, + "learning_rate": 2.0809855063262273e-06, + "loss": 0.0216, + "step": 18250 + }, + { + "epoch": 21.92076830732293, + "grad_norm": 0.17351923882961273, + "learning_rate": 2.057448178523558e-06, + "loss": 0.0196, + "step": 18260 + }, + { + "epoch": 21.932773109243698, + "grad_norm": 0.16090908646583557, + "learning_rate": 2.034041923826885e-06, + "loss": 0.0194, + "step": 18270 + }, + { + "epoch": 21.944777911164465, + "grad_norm": 0.11305760592222214, + "learning_rate": 2.0107668062280204e-06, + "loss": 0.0217, + "step": 18280 + }, + { + "epoch": 21.956782713085236, + "grad_norm": 0.105465367436409, + "learning_rate": 1.9876228893602357e-06, + "loss": 0.0212, + "step": 18290 + }, + { + "epoch": 21.968787515006003, + "grad_norm": 0.1431153267621994, + "learning_rate": 1.9646102364981266e-06, + "loss": 0.0208, + "step": 18300 + }, + { + "epoch": 21.98079231692677, + "grad_norm": 0.1424189954996109, + "learning_rate": 1.9417289105574053e-06, + "loss": 0.022, + "step": 18310 + }, + { + "epoch": 21.992797118847538, + "grad_norm": 0.11437863856554031, + "learning_rate": 1.9189789740947427e-06, + "loss": 0.0185, + "step": 18320 + }, + { + "epoch": 22.00480192076831, + "grad_norm": 0.13647331297397614, + "learning_rate": 1.896360489307597e-06, + "loss": 0.0196, + "step": 18330 + }, + { + "epoch": 22.016806722689076, + "grad_norm": 0.10018131881952286, + "learning_rate": 1.8738735180340362e-06, + "loss": 0.0198, + "step": 18340 + }, + { + "epoch": 22.028811524609843, + "grad_norm": 0.11378614604473114, + "learning_rate": 1.8515181217525824e-06, + "loss": 0.0192, + "step": 18350 + }, + { + "epoch": 22.040816326530614, + "grad_norm": 0.12390806525945663, + "learning_rate": 1.8292943615820457e-06, + "loss": 0.0193, + "step": 18360 + }, + { + "epoch": 22.05282112845138, + "grad_norm": 0.12644854187965393, + "learning_rate": 1.8072022982813296e-06, + "loss": 0.018, + "step": 18370 + }, + { + "epoch": 22.064825930372148, + "grad_norm": 0.11944210529327393, + "learning_rate": 1.7852419922492925e-06, + "loss": 0.0228, + "step": 18380 + }, + { + "epoch": 22.07683073229292, + "grad_norm": 0.16349533200263977, + "learning_rate": 1.763413503524569e-06, + "loss": 0.018, + "step": 18390 + }, + { + "epoch": 22.088835534213686, + "grad_norm": 0.1183457300066948, + "learning_rate": 1.7417168917854165e-06, + "loss": 0.0206, + "step": 18400 + }, + { + "epoch": 22.100840336134453, + "grad_norm": 0.12659980356693268, + "learning_rate": 1.720152216349552e-06, + "loss": 0.0182, + "step": 18410 + }, + { + "epoch": 22.11284513805522, + "grad_norm": 0.11066532880067825, + "learning_rate": 1.6987195361739595e-06, + "loss": 0.0191, + "step": 18420 + }, + { + "epoch": 22.12484993997599, + "grad_norm": 0.10759562253952026, + "learning_rate": 1.6774189098547832e-06, + "loss": 0.0199, + "step": 18430 + }, + { + "epoch": 22.13685474189676, + "grad_norm": 0.12089799344539642, + "learning_rate": 1.6562503956271069e-06, + "loss": 0.0225, + "step": 18440 + }, + { + "epoch": 22.148859543817526, + "grad_norm": 0.1707857847213745, + "learning_rate": 1.6352140513648417e-06, + "loss": 0.0198, + "step": 18450 + }, + { + "epoch": 22.160864345738297, + "grad_norm": 0.14675801992416382, + "learning_rate": 1.6143099345805712e-06, + "loss": 0.0187, + "step": 18460 + }, + { + "epoch": 22.172869147659064, + "grad_norm": 0.1096675843000412, + "learning_rate": 1.5935381024253293e-06, + "loss": 0.0201, + "step": 18470 + }, + { + "epoch": 22.18487394957983, + "grad_norm": 0.13198739290237427, + "learning_rate": 1.572898611688517e-06, + "loss": 0.0222, + "step": 18480 + }, + { + "epoch": 22.1968787515006, + "grad_norm": 0.13727837800979614, + "learning_rate": 1.5523915187977133e-06, + "loss": 0.0211, + "step": 18490 + }, + { + "epoch": 22.20888355342137, + "grad_norm": 0.15640442073345184, + "learning_rate": 1.532016879818532e-06, + "loss": 0.0181, + "step": 18500 + }, + { + "epoch": 22.220888355342137, + "grad_norm": 0.16688452661037445, + "learning_rate": 1.51177475045447e-06, + "loss": 0.0217, + "step": 18510 + }, + { + "epoch": 22.232893157262904, + "grad_norm": 0.15100187063217163, + "learning_rate": 1.4916651860467035e-06, + "loss": 0.0198, + "step": 18520 + }, + { + "epoch": 22.244897959183675, + "grad_norm": 0.1434854120016098, + "learning_rate": 1.471688241574043e-06, + "loss": 0.0196, + "step": 18530 + }, + { + "epoch": 22.256902761104442, + "grad_norm": 0.148894801735878, + "learning_rate": 1.451843971652672e-06, + "loss": 0.0203, + "step": 18540 + }, + { + "epoch": 22.26890756302521, + "grad_norm": 0.10717128962278366, + "learning_rate": 1.432132430536076e-06, + "loss": 0.0207, + "step": 18550 + }, + { + "epoch": 22.28091236494598, + "grad_norm": 0.12329607456922531, + "learning_rate": 1.412553672114869e-06, + "loss": 0.0222, + "step": 18560 + }, + { + "epoch": 22.292917166866747, + "grad_norm": 0.14204245805740356, + "learning_rate": 1.3931077499166056e-06, + "loss": 0.0207, + "step": 18570 + }, + { + "epoch": 22.304921968787514, + "grad_norm": 0.13327878713607788, + "learning_rate": 1.3737947171057085e-06, + "loss": 0.0214, + "step": 18580 + }, + { + "epoch": 22.31692677070828, + "grad_norm": 0.1496671438217163, + "learning_rate": 1.3546146264832582e-06, + "loss": 0.0197, + "step": 18590 + }, + { + "epoch": 22.328931572629052, + "grad_norm": 0.128550723195076, + "learning_rate": 1.3355675304869086e-06, + "loss": 0.0198, + "step": 18600 + }, + { + "epoch": 22.34093637454982, + "grad_norm": 0.13040071725845337, + "learning_rate": 1.3166534811906827e-06, + "loss": 0.0221, + "step": 18610 + }, + { + "epoch": 22.352941176470587, + "grad_norm": 0.11219468712806702, + "learning_rate": 1.2978725303048666e-06, + "loss": 0.02, + "step": 18620 + }, + { + "epoch": 22.364945978391358, + "grad_norm": 0.15855956077575684, + "learning_rate": 1.2792247291758762e-06, + "loss": 0.0193, + "step": 18630 + }, + { + "epoch": 22.376950780312125, + "grad_norm": 0.12162794172763824, + "learning_rate": 1.2607101287860635e-06, + "loss": 0.017, + "step": 18640 + }, + { + "epoch": 22.388955582232892, + "grad_norm": 0.13790981471538544, + "learning_rate": 1.2423287797536654e-06, + "loss": 0.023, + "step": 18650 + }, + { + "epoch": 22.400960384153663, + "grad_norm": 0.11997975409030914, + "learning_rate": 1.2240807323325776e-06, + "loss": 0.0197, + "step": 18660 + }, + { + "epoch": 22.41296518607443, + "grad_norm": 0.09589780867099762, + "learning_rate": 1.205966036412254e-06, + "loss": 0.0175, + "step": 18670 + }, + { + "epoch": 22.424969987995198, + "grad_norm": 0.15905794501304626, + "learning_rate": 1.1879847415175949e-06, + "loss": 0.0202, + "step": 18680 + }, + { + "epoch": 22.436974789915965, + "grad_norm": 0.13533008098602295, + "learning_rate": 1.1701368968087712e-06, + "loss": 0.0175, + "step": 18690 + }, + { + "epoch": 22.448979591836736, + "grad_norm": 0.11727448552846909, + "learning_rate": 1.1524225510811116e-06, + "loss": 0.0199, + "step": 18700 + }, + { + "epoch": 22.460984393757503, + "grad_norm": 0.1405397355556488, + "learning_rate": 1.1348417527649535e-06, + "loss": 0.0188, + "step": 18710 + }, + { + "epoch": 22.47298919567827, + "grad_norm": 0.12782089412212372, + "learning_rate": 1.1173945499255268e-06, + "loss": 0.0205, + "step": 18720 + }, + { + "epoch": 22.48499399759904, + "grad_norm": 0.11484133452177048, + "learning_rate": 1.1000809902628307e-06, + "loss": 0.0221, + "step": 18730 + }, + { + "epoch": 22.496998799519808, + "grad_norm": 0.11685503274202347, + "learning_rate": 1.082901121111468e-06, + "loss": 0.0192, + "step": 18740 + }, + { + "epoch": 22.509003601440575, + "grad_norm": 0.13611625134944916, + "learning_rate": 1.0658549894405456e-06, + "loss": 0.0196, + "step": 18750 + }, + { + "epoch": 22.521008403361346, + "grad_norm": 0.1831454485654831, + "learning_rate": 1.0489426418535342e-06, + "loss": 0.0207, + "step": 18760 + }, + { + "epoch": 22.533013205282113, + "grad_norm": 0.12360990792512894, + "learning_rate": 1.0321641245881474e-06, + "loss": 0.0189, + "step": 18770 + }, + { + "epoch": 22.54501800720288, + "grad_norm": 0.12142506241798401, + "learning_rate": 1.015519483516214e-06, + "loss": 0.0181, + "step": 18780 + }, + { + "epoch": 22.557022809123648, + "grad_norm": 0.11129321902990341, + "learning_rate": 9.990087641435443e-07, + "loss": 0.0197, + "step": 18790 + }, + { + "epoch": 22.56902761104442, + "grad_norm": 0.13116417825222015, + "learning_rate": 9.826320116098132e-07, + "loss": 0.0217, + "step": 18800 + }, + { + "epoch": 22.581032412965186, + "grad_norm": 0.27550944685935974, + "learning_rate": 9.663892706884447e-07, + "loss": 0.0236, + "step": 18810 + }, + { + "epoch": 22.593037214885953, + "grad_norm": 0.13565853238105774, + "learning_rate": 9.502805857864616e-07, + "loss": 0.0203, + "step": 18820 + }, + { + "epoch": 22.605042016806724, + "grad_norm": 0.11412777006626129, + "learning_rate": 9.34306000944396e-07, + "loss": 0.0188, + "step": 18830 + }, + { + "epoch": 22.61704681872749, + "grad_norm": 0.1044677197933197, + "learning_rate": 9.184655598361624e-07, + "loss": 0.0197, + "step": 18840 + }, + { + "epoch": 22.62905162064826, + "grad_norm": 0.13519439101219177, + "learning_rate": 9.027593057689076e-07, + "loss": 0.0184, + "step": 18850 + }, + { + "epoch": 22.64105642256903, + "grad_norm": 0.12446912378072739, + "learning_rate": 8.871872816829441e-07, + "loss": 0.0207, + "step": 18860 + }, + { + "epoch": 22.653061224489797, + "grad_norm": 0.12725788354873657, + "learning_rate": 8.717495301515777e-07, + "loss": 0.0185, + "step": 18870 + }, + { + "epoch": 22.665066026410564, + "grad_norm": 0.11546237766742706, + "learning_rate": 8.564460933810415e-07, + "loss": 0.0191, + "step": 18880 + }, + { + "epoch": 22.67707082833133, + "grad_norm": 0.12986241281032562, + "learning_rate": 8.412770132103453e-07, + "loss": 0.0178, + "step": 18890 + }, + { + "epoch": 22.689075630252102, + "grad_norm": 0.10942772775888443, + "learning_rate": 8.262423311111711e-07, + "loss": 0.0211, + "step": 18900 + }, + { + "epoch": 22.70108043217287, + "grad_norm": 0.1392371654510498, + "learning_rate": 8.113420881877665e-07, + "loss": 0.0204, + "step": 18910 + }, + { + "epoch": 22.713085234093636, + "grad_norm": 0.08893535286188126, + "learning_rate": 7.965763251768288e-07, + "loss": 0.0167, + "step": 18920 + }, + { + "epoch": 22.725090036014407, + "grad_norm": 0.17123554646968842, + "learning_rate": 7.819450824473995e-07, + "loss": 0.0197, + "step": 18930 + }, + { + "epoch": 22.737094837935174, + "grad_norm": 0.1633194088935852, + "learning_rate": 7.674484000007198e-07, + "loss": 0.02, + "step": 18940 + }, + { + "epoch": 22.74909963985594, + "grad_norm": 0.12930573523044586, + "learning_rate": 7.530863174701752e-07, + "loss": 0.0189, + "step": 18950 + }, + { + "epoch": 22.76110444177671, + "grad_norm": 0.12500694394111633, + "learning_rate": 7.38858874121151e-07, + "loss": 0.0183, + "step": 18960 + }, + { + "epoch": 22.77310924369748, + "grad_norm": 0.12329408526420593, + "learning_rate": 7.247661088509328e-07, + "loss": 0.0229, + "step": 18970 + }, + { + "epoch": 22.785114045618247, + "grad_norm": 0.12971265614032745, + "learning_rate": 7.108080601886002e-07, + "loss": 0.0196, + "step": 18980 + }, + { + "epoch": 22.797118847539014, + "grad_norm": 0.1274292916059494, + "learning_rate": 6.969847662949336e-07, + "loss": 0.0183, + "step": 18990 + }, + { + "epoch": 22.809123649459785, + "grad_norm": 0.10757575184106827, + "learning_rate": 6.832962649622798e-07, + "loss": 0.0192, + "step": 19000 + }, + { + "epoch": 22.821128451380552, + "grad_norm": 0.18035967648029327, + "learning_rate": 6.697425936144863e-07, + "loss": 0.0184, + "step": 19010 + }, + { + "epoch": 22.83313325330132, + "grad_norm": 0.10933173447847366, + "learning_rate": 6.563237893067731e-07, + "loss": 0.02, + "step": 19020 + }, + { + "epoch": 22.84513805522209, + "grad_norm": 0.145958811044693, + "learning_rate": 6.430398887256328e-07, + "loss": 0.018, + "step": 19030 + }, + { + "epoch": 22.857142857142858, + "grad_norm": 0.10585177689790726, + "learning_rate": 6.298909281887478e-07, + "loss": 0.0196, + "step": 19040 + }, + { + "epoch": 22.869147659063625, + "grad_norm": 0.14341247081756592, + "learning_rate": 6.168769436448673e-07, + "loss": 0.0222, + "step": 19050 + }, + { + "epoch": 22.881152460984392, + "grad_norm": 0.18222087621688843, + "learning_rate": 6.03997970673742e-07, + "loss": 0.0232, + "step": 19060 + }, + { + "epoch": 22.893157262905163, + "grad_norm": 0.15739181637763977, + "learning_rate": 5.912540444859782e-07, + "loss": 0.0217, + "step": 19070 + }, + { + "epoch": 22.90516206482593, + "grad_norm": 0.17346331477165222, + "learning_rate": 5.786451999229837e-07, + "loss": 0.021, + "step": 19080 + }, + { + "epoch": 22.917166866746697, + "grad_norm": 0.15542146563529968, + "learning_rate": 5.661714714568722e-07, + "loss": 0.0203, + "step": 19090 + }, + { + "epoch": 22.92917166866747, + "grad_norm": 0.19914181530475616, + "learning_rate": 5.538328931903259e-07, + "loss": 0.0188, + "step": 19100 + }, + { + "epoch": 22.941176470588236, + "grad_norm": 0.12493497133255005, + "learning_rate": 5.416294988565551e-07, + "loss": 0.0184, + "step": 19110 + }, + { + "epoch": 22.953181272509003, + "grad_norm": 0.14282716810703278, + "learning_rate": 5.29561321819172e-07, + "loss": 0.023, + "step": 19120 + }, + { + "epoch": 22.965186074429774, + "grad_norm": 0.12984885275363922, + "learning_rate": 5.176283950721061e-07, + "loss": 0.0183, + "step": 19130 + }, + { + "epoch": 22.97719087635054, + "grad_norm": 0.1364084929227829, + "learning_rate": 5.058307512395332e-07, + "loss": 0.0198, + "step": 19140 + }, + { + "epoch": 22.989195678271308, + "grad_norm": 0.14342455565929413, + "learning_rate": 4.941684225757526e-07, + "loss": 0.0201, + "step": 19150 + }, + { + "epoch": 23.001200480192075, + "grad_norm": 0.10892301052808762, + "learning_rate": 4.826414409651314e-07, + "loss": 0.0222, + "step": 19160 + }, + { + "epoch": 23.013205282112846, + "grad_norm": 0.0890151858329773, + "learning_rate": 4.712498379219943e-07, + "loss": 0.0226, + "step": 19170 + }, + { + "epoch": 23.025210084033613, + "grad_norm": 0.1374099850654602, + "learning_rate": 4.599936445905506e-07, + "loss": 0.0206, + "step": 19180 + }, + { + "epoch": 23.03721488595438, + "grad_norm": 0.12195596098899841, + "learning_rate": 4.4887289174480594e-07, + "loss": 0.0184, + "step": 19190 + }, + { + "epoch": 23.04921968787515, + "grad_norm": 0.15514099597930908, + "learning_rate": 4.378876097884621e-07, + "loss": 0.0194, + "step": 19200 + }, + { + "epoch": 23.06122448979592, + "grad_norm": 0.09526468813419342, + "learning_rate": 4.2703782875487264e-07, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 23.073229291716686, + "grad_norm": 0.1535826474428177, + "learning_rate": 4.163235783069208e-07, + "loss": 0.0183, + "step": 19220 + }, + { + "epoch": 23.085234093637457, + "grad_norm": 0.1993718296289444, + "learning_rate": 4.057448877369585e-07, + "loss": 0.0221, + "step": 19230 + }, + { + "epoch": 23.097238895558224, + "grad_norm": 0.16909180581569672, + "learning_rate": 3.9530178596672295e-07, + "loss": 0.0194, + "step": 19240 + }, + { + "epoch": 23.10924369747899, + "grad_norm": 0.13745202124118805, + "learning_rate": 3.849943015472479e-07, + "loss": 0.0208, + "step": 19250 + }, + { + "epoch": 23.12124849939976, + "grad_norm": 0.1437077820301056, + "learning_rate": 3.748224626588137e-07, + "loss": 0.0187, + "step": 19260 + }, + { + "epoch": 23.13325330132053, + "grad_norm": 0.10845635086297989, + "learning_rate": 3.647862971108307e-07, + "loss": 0.0198, + "step": 19270 + }, + { + "epoch": 23.145258103241297, + "grad_norm": 0.13410313427448273, + "learning_rate": 3.5488583234179473e-07, + "loss": 0.0186, + "step": 19280 + }, + { + "epoch": 23.157262905162064, + "grad_norm": 0.14658544957637787, + "learning_rate": 3.4512109541920413e-07, + "loss": 0.0201, + "step": 19290 + }, + { + "epoch": 23.169267707082835, + "grad_norm": 0.13896240293979645, + "learning_rate": 3.354921130394706e-07, + "loss": 0.0194, + "step": 19300 + }, + { + "epoch": 23.181272509003602, + "grad_norm": 0.13861556351184845, + "learning_rate": 3.259989115278639e-07, + "loss": 0.0212, + "step": 19310 + }, + { + "epoch": 23.19327731092437, + "grad_norm": 0.13458268344402313, + "learning_rate": 3.1664151683843403e-07, + "loss": 0.021, + "step": 19320 + }, + { + "epoch": 23.205282112845136, + "grad_norm": 0.1831841617822647, + "learning_rate": 3.074199545539447e-07, + "loss": 0.0205, + "step": 19330 + }, + { + "epoch": 23.217286914765907, + "grad_norm": 0.11917068809270859, + "learning_rate": 2.983342498857955e-07, + "loss": 0.0208, + "step": 19340 + }, + { + "epoch": 23.229291716686674, + "grad_norm": 0.14435696601867676, + "learning_rate": 2.893844276739499e-07, + "loss": 0.0202, + "step": 19350 + }, + { + "epoch": 23.24129651860744, + "grad_norm": 0.11840876936912537, + "learning_rate": 2.8057051238688514e-07, + "loss": 0.0194, + "step": 19360 + }, + { + "epoch": 23.253301320528212, + "grad_norm": 0.11659438908100128, + "learning_rate": 2.71892528121509e-07, + "loss": 0.02, + "step": 19370 + }, + { + "epoch": 23.26530612244898, + "grad_norm": 0.18473152816295624, + "learning_rate": 2.633504986030988e-07, + "loss": 0.0197, + "step": 19380 + }, + { + "epoch": 23.277310924369747, + "grad_norm": 0.12105758488178253, + "learning_rate": 2.549444471852347e-07, + "loss": 0.0195, + "step": 19390 + }, + { + "epoch": 23.289315726290518, + "grad_norm": 0.13981716334819794, + "learning_rate": 2.4667439684974423e-07, + "loss": 0.0224, + "step": 19400 + }, + { + "epoch": 23.301320528211285, + "grad_norm": 0.11723964661359787, + "learning_rate": 2.3854037020662467e-07, + "loss": 0.0188, + "step": 19410 + }, + { + "epoch": 23.313325330132052, + "grad_norm": 0.11525526642799377, + "learning_rate": 2.3054238949399288e-07, + "loss": 0.0175, + "step": 19420 + }, + { + "epoch": 23.32533013205282, + "grad_norm": 0.14112555980682373, + "learning_rate": 2.2268047657802993e-07, + "loss": 0.0207, + "step": 19430 + }, + { + "epoch": 23.33733493397359, + "grad_norm": 0.12036796659231186, + "learning_rate": 2.149546529529034e-07, + "loss": 0.0213, + "step": 19440 + }, + { + "epoch": 23.349339735894358, + "grad_norm": 0.11130403727293015, + "learning_rate": 2.0736493974071736e-07, + "loss": 0.0212, + "step": 19450 + }, + { + "epoch": 23.361344537815125, + "grad_norm": 0.10032478719949722, + "learning_rate": 1.9991135769145686e-07, + "loss": 0.0183, + "step": 19460 + }, + { + "epoch": 23.373349339735896, + "grad_norm": 0.12451103329658508, + "learning_rate": 1.9259392718293245e-07, + "loss": 0.0172, + "step": 19470 + }, + { + "epoch": 23.385354141656663, + "grad_norm": 0.15205015242099762, + "learning_rate": 1.8541266822072467e-07, + "loss": 0.0209, + "step": 19480 + }, + { + "epoch": 23.39735894357743, + "grad_norm": 0.18264006078243256, + "learning_rate": 1.7836760043811184e-07, + "loss": 0.0209, + "step": 19490 + }, + { + "epoch": 23.4093637454982, + "grad_norm": 0.10471325367689133, + "learning_rate": 1.7145874309604792e-07, + "loss": 0.0167, + "step": 19500 + }, + { + "epoch": 23.421368547418968, + "grad_norm": 0.09756117314100266, + "learning_rate": 1.6468611508308474e-07, + "loss": 0.0187, + "step": 19510 + }, + { + "epoch": 23.433373349339735, + "grad_norm": 0.24212947487831116, + "learning_rate": 1.5804973491532204e-07, + "loss": 0.0194, + "step": 19520 + }, + { + "epoch": 23.445378151260503, + "grad_norm": 0.11846151947975159, + "learning_rate": 1.5154962073637424e-07, + "loss": 0.0202, + "step": 19530 + }, + { + "epoch": 23.457382953181273, + "grad_norm": 0.16639481484889984, + "learning_rate": 1.4518579031730372e-07, + "loss": 0.0213, + "step": 19540 + }, + { + "epoch": 23.46938775510204, + "grad_norm": 0.15499268472194672, + "learning_rate": 1.389582610565876e-07, + "loss": 0.0186, + "step": 19550 + }, + { + "epoch": 23.481392557022808, + "grad_norm": 0.1157829612493515, + "learning_rate": 1.3286704998003995e-07, + "loss": 0.0156, + "step": 19560 + }, + { + "epoch": 23.49339735894358, + "grad_norm": 0.14183443784713745, + "learning_rate": 1.2691217374080632e-07, + "loss": 0.0206, + "step": 19570 + }, + { + "epoch": 23.505402160864346, + "grad_norm": 0.09687591344118118, + "learning_rate": 1.2109364861929705e-07, + "loss": 0.0178, + "step": 19580 + }, + { + "epoch": 23.517406962785113, + "grad_norm": 0.14289265871047974, + "learning_rate": 1.1541149052312628e-07, + "loss": 0.0194, + "step": 19590 + }, + { + "epoch": 23.529411764705884, + "grad_norm": 0.15969368815422058, + "learning_rate": 1.0986571498710074e-07, + "loss": 0.0201, + "step": 19600 + }, + { + "epoch": 23.54141656662665, + "grad_norm": 0.17914821207523346, + "learning_rate": 1.0445633717316438e-07, + "loss": 0.0184, + "step": 19610 + }, + { + "epoch": 23.55342136854742, + "grad_norm": 0.13034981489181519, + "learning_rate": 9.918337187034277e-08, + "loss": 0.0187, + "step": 19620 + }, + { + "epoch": 23.565426170468186, + "grad_norm": 0.1479901373386383, + "learning_rate": 9.404683349472643e-08, + "loss": 0.0182, + "step": 19630 + }, + { + "epoch": 23.577430972388957, + "grad_norm": 0.17344588041305542, + "learning_rate": 8.904673608940983e-08, + "loss": 0.021, + "step": 19640 + }, + { + "epoch": 23.589435774309724, + "grad_norm": 0.12211726605892181, + "learning_rate": 8.418309332447471e-08, + "loss": 0.0176, + "step": 19650 + }, + { + "epoch": 23.60144057623049, + "grad_norm": 0.11222328245639801, + "learning_rate": 7.945591849692902e-08, + "loss": 0.0169, + "step": 19660 + }, + { + "epoch": 23.613445378151262, + "grad_norm": 0.14869572222232819, + "learning_rate": 7.486522453069578e-08, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 23.62545018007203, + "grad_norm": 0.16160519421100616, + "learning_rate": 7.041102397655208e-08, + "loss": 0.0222, + "step": 19680 + }, + { + "epoch": 23.637454981992796, + "grad_norm": 0.13478219509124756, + "learning_rate": 6.609332901210685e-08, + "loss": 0.0204, + "step": 19690 + }, + { + "epoch": 23.649459783913564, + "grad_norm": 0.0830029621720314, + "learning_rate": 6.191215144178419e-08, + "loss": 0.0163, + "step": 19700 + }, + { + "epoch": 23.661464585834334, + "grad_norm": 0.190241739153862, + "learning_rate": 5.786750269675678e-08, + "loss": 0.0187, + "step": 19710 + }, + { + "epoch": 23.6734693877551, + "grad_norm": 0.15291255712509155, + "learning_rate": 5.395939383494031e-08, + "loss": 0.0182, + "step": 19720 + }, + { + "epoch": 23.68547418967587, + "grad_norm": 0.10593729466199875, + "learning_rate": 5.018783554095463e-08, + "loss": 0.0163, + "step": 19730 + }, + { + "epoch": 23.69747899159664, + "grad_norm": 0.12421827018260956, + "learning_rate": 4.655283812610156e-08, + "loss": 0.0198, + "step": 19740 + }, + { + "epoch": 23.709483793517407, + "grad_norm": 0.11830649524927139, + "learning_rate": 4.305441152831491e-08, + "loss": 0.0179, + "step": 19750 + }, + { + "epoch": 23.721488595438174, + "grad_norm": 0.09876494854688644, + "learning_rate": 3.9692565312171584e-08, + "loss": 0.0184, + "step": 19760 + }, + { + "epoch": 23.733493397358945, + "grad_norm": 0.16865037381649017, + "learning_rate": 3.6467308668824975e-08, + "loss": 0.0177, + "step": 19770 + }, + { + "epoch": 23.745498199279712, + "grad_norm": 0.11889670789241791, + "learning_rate": 3.3378650416004964e-08, + "loss": 0.0212, + "step": 19780 + }, + { + "epoch": 23.75750300120048, + "grad_norm": 0.1389048844575882, + "learning_rate": 3.042659899797906e-08, + "loss": 0.0191, + "step": 19790 + }, + { + "epoch": 23.769507803121247, + "grad_norm": 0.12379857152700424, + "learning_rate": 2.76111624855524e-08, + "loss": 0.0199, + "step": 19800 + }, + { + "epoch": 23.781512605042018, + "grad_norm": 0.12749259173870087, + "learning_rate": 2.4932348576017784e-08, + "loss": 0.0195, + "step": 19810 + }, + { + "epoch": 23.793517406962785, + "grad_norm": 0.13953399658203125, + "learning_rate": 2.239016459314458e-08, + "loss": 0.0192, + "step": 19820 + }, + { + "epoch": 23.805522208883552, + "grad_norm": 0.12417252361774445, + "learning_rate": 1.9984617487173174e-08, + "loss": 0.0199, + "step": 19830 + }, + { + "epoch": 23.817527010804323, + "grad_norm": 0.11624359339475632, + "learning_rate": 1.7715713834776105e-08, + "loss": 0.0189, + "step": 19840 + }, + { + "epoch": 23.82953181272509, + "grad_norm": 0.1727852076292038, + "learning_rate": 1.5583459839046964e-08, + "loss": 0.0193, + "step": 19850 + }, + { + "epoch": 23.841536614645857, + "grad_norm": 0.12833945453166962, + "learning_rate": 1.3587861329489304e-08, + "loss": 0.0187, + "step": 19860 + }, + { + "epoch": 23.853541416566628, + "grad_norm": 0.1439405381679535, + "learning_rate": 1.1728923761994415e-08, + "loss": 0.0211, + "step": 19870 + }, + { + "epoch": 23.865546218487395, + "grad_norm": 0.2081211358308792, + "learning_rate": 1.0006652218819135e-08, + "loss": 0.0206, + "step": 19880 + }, + { + "epoch": 23.877551020408163, + "grad_norm": 0.13637354969978333, + "learning_rate": 8.421051408596947e-09, + "loss": 0.0184, + "step": 19890 + }, + { + "epoch": 23.88955582232893, + "grad_norm": 0.10543417930603027, + "learning_rate": 6.972125666299123e-09, + "loss": 0.0183, + "step": 19900 + }, + { + "epoch": 23.9015606242497, + "grad_norm": 0.1255130171775818, + "learning_rate": 5.659878953229169e-09, + "loss": 0.0177, + "step": 19910 + }, + { + "epoch": 23.913565426170468, + "grad_norm": 0.16288472712039948, + "learning_rate": 4.48431485701728e-09, + "loss": 0.0199, + "step": 19920 + }, + { + "epoch": 23.925570228091235, + "grad_norm": 0.19040341675281525, + "learning_rate": 3.4454365916203322e-09, + "loss": 0.0193, + "step": 19930 + }, + { + "epoch": 23.937575030012006, + "grad_norm": 0.12367769330739975, + "learning_rate": 2.5432469972830332e-09, + "loss": 0.0207, + "step": 19940 + }, + { + "epoch": 23.949579831932773, + "grad_norm": 0.16760139167308807, + "learning_rate": 1.7777485405601203e-09, + "loss": 0.0198, + "step": 19950 + }, + { + "epoch": 23.96158463385354, + "grad_norm": 0.0778270959854126, + "learning_rate": 1.1489433142941597e-09, + "loss": 0.0194, + "step": 19960 + }, + { + "epoch": 23.97358943577431, + "grad_norm": 0.2117767482995987, + "learning_rate": 6.568330376210963e-10, + "loss": 0.0209, + "step": 19970 + }, + { + "epoch": 23.98559423769508, + "grad_norm": 0.1340763419866562, + "learning_rate": 3.0141905594249787e-10, + "loss": 0.0196, + "step": 19980 + }, + { + "epoch": 23.997599039615846, + "grad_norm": 0.09345092624425888, + "learning_rate": 8.270234094776008e-11, + "loss": 0.0171, + "step": 19990 + }, + { + "epoch": 24.009603841536613, + "grad_norm": 0.1269150823354721, + "learning_rate": 6.834906085551041e-13, + "loss": 0.016, + "step": 20000 + }, + { + "epoch": 24.021608643457384, + "grad_norm": 0.212513729929924, + "learning_rate": 2.7379083738394484e-05, + "loss": 0.02, + "step": 20010 + }, + { + "epoch": 24.03361344537815, + "grad_norm": 0.21590562164783478, + "learning_rate": 2.732994502354823e-05, + "loss": 0.0212, + "step": 20020 + }, + { + "epoch": 24.04561824729892, + "grad_norm": 0.15402567386627197, + "learning_rate": 2.72808338549564e-05, + "loss": 0.0202, + "step": 20030 + }, + { + "epoch": 24.05762304921969, + "grad_norm": 0.25655627250671387, + "learning_rate": 2.723175029229374e-05, + "loss": 0.0218, + "step": 20040 + }, + { + "epoch": 24.069627851140456, + "grad_norm": 0.11590568721294403, + "learning_rate": 2.718269439520138e-05, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 24.081632653061224, + "grad_norm": 0.13716264069080353, + "learning_rate": 2.7133666223286858e-05, + "loss": 0.0226, + "step": 20060 + }, + { + "epoch": 24.093637454981994, + "grad_norm": 0.14240233600139618, + "learning_rate": 2.7084665836124006e-05, + "loss": 0.0217, + "step": 20070 + }, + { + "epoch": 24.10564225690276, + "grad_norm": 0.2427178919315338, + "learning_rate": 2.703569329325296e-05, + "loss": 0.0255, + "step": 20080 + }, + { + "epoch": 24.11764705882353, + "grad_norm": 0.15021184086799622, + "learning_rate": 2.698674865417994e-05, + "loss": 0.025, + "step": 20090 + }, + { + "epoch": 24.129651860744296, + "grad_norm": 0.13237839937210083, + "learning_rate": 2.6937831978377325e-05, + "loss": 0.0238, + "step": 20100 + }, + { + "epoch": 24.141656662665067, + "grad_norm": 0.14939936995506287, + "learning_rate": 2.6888943325283485e-05, + "loss": 0.023, + "step": 20110 + }, + { + "epoch": 24.153661464585834, + "grad_norm": 0.11124823242425919, + "learning_rate": 2.6840082754302733e-05, + "loss": 0.0235, + "step": 20120 + }, + { + "epoch": 24.1656662665066, + "grad_norm": 0.16503958404064178, + "learning_rate": 2.679125032480525e-05, + "loss": 0.0246, + "step": 20130 + }, + { + "epoch": 24.177671068427372, + "grad_norm": 0.1901802271604538, + "learning_rate": 2.674244609612708e-05, + "loss": 0.0221, + "step": 20140 + }, + { + "epoch": 24.18967587034814, + "grad_norm": 0.18313367664813995, + "learning_rate": 2.6693670127569958e-05, + "loss": 0.0261, + "step": 20150 + }, + { + "epoch": 24.201680672268907, + "grad_norm": 0.18706026673316956, + "learning_rate": 2.6644922478401268e-05, + "loss": 0.0238, + "step": 20160 + }, + { + "epoch": 24.213685474189674, + "grad_norm": 0.19853872060775757, + "learning_rate": 2.6596203207854004e-05, + "loss": 0.026, + "step": 20170 + }, + { + "epoch": 24.225690276110445, + "grad_norm": 0.13625767827033997, + "learning_rate": 2.654751237512666e-05, + "loss": 0.0249, + "step": 20180 + }, + { + "epoch": 24.237695078031212, + "grad_norm": 0.2724413573741913, + "learning_rate": 2.649885003938323e-05, + "loss": 0.0278, + "step": 20190 + }, + { + "epoch": 24.24969987995198, + "grad_norm": 0.18878400325775146, + "learning_rate": 2.6450216259753e-05, + "loss": 0.0224, + "step": 20200 + }, + { + "epoch": 24.26170468187275, + "grad_norm": 0.14574143290519714, + "learning_rate": 2.640161109533063e-05, + "loss": 0.0233, + "step": 20210 + }, + { + "epoch": 24.273709483793517, + "grad_norm": 0.1667306274175644, + "learning_rate": 2.6353034605175936e-05, + "loss": 0.022, + "step": 20220 + }, + { + "epoch": 24.285714285714285, + "grad_norm": 0.21904948353767395, + "learning_rate": 2.630448684831398e-05, + "loss": 0.0266, + "step": 20230 + }, + { + "epoch": 24.297719087635056, + "grad_norm": 0.14322616159915924, + "learning_rate": 2.625596788373482e-05, + "loss": 0.0238, + "step": 20240 + }, + { + "epoch": 24.309723889555823, + "grad_norm": 0.15375246107578278, + "learning_rate": 2.620747777039363e-05, + "loss": 0.0248, + "step": 20250 + }, + { + "epoch": 24.32172869147659, + "grad_norm": 0.1456855982542038, + "learning_rate": 2.6159016567210424e-05, + "loss": 0.0258, + "step": 20260 + }, + { + "epoch": 24.333733493397357, + "grad_norm": 0.1255490630865097, + "learning_rate": 2.6110584333070153e-05, + "loss": 0.0238, + "step": 20270 + }, + { + "epoch": 24.345738295318128, + "grad_norm": 0.11033497005701065, + "learning_rate": 2.6062181126822537e-05, + "loss": 0.0228, + "step": 20280 + }, + { + "epoch": 24.357743097238895, + "grad_norm": 0.22265668213367462, + "learning_rate": 2.601380700728203e-05, + "loss": 0.0236, + "step": 20290 + }, + { + "epoch": 24.369747899159663, + "grad_norm": 0.1605672538280487, + "learning_rate": 2.596546203322777e-05, + "loss": 0.0235, + "step": 20300 + }, + { + "epoch": 24.381752701080433, + "grad_norm": 0.1429535299539566, + "learning_rate": 2.5917146263403457e-05, + "loss": 0.0239, + "step": 20310 + }, + { + "epoch": 24.3937575030012, + "grad_norm": 0.14756068587303162, + "learning_rate": 2.5868859756517296e-05, + "loss": 0.0223, + "step": 20320 + }, + { + "epoch": 24.405762304921968, + "grad_norm": 0.1598084717988968, + "learning_rate": 2.5820602571241947e-05, + "loss": 0.0271, + "step": 20330 + }, + { + "epoch": 24.41776710684274, + "grad_norm": 0.15689173340797424, + "learning_rate": 2.577237476621442e-05, + "loss": 0.0258, + "step": 20340 + }, + { + "epoch": 24.429771908763506, + "grad_norm": 0.10874620079994202, + "learning_rate": 2.572417640003609e-05, + "loss": 0.024, + "step": 20350 + }, + { + "epoch": 24.441776710684273, + "grad_norm": 0.1472356915473938, + "learning_rate": 2.5676007531272473e-05, + "loss": 0.0281, + "step": 20360 + }, + { + "epoch": 24.45378151260504, + "grad_norm": 0.14376282691955566, + "learning_rate": 2.5627868218453332e-05, + "loss": 0.0213, + "step": 20370 + }, + { + "epoch": 24.46578631452581, + "grad_norm": 0.20684099197387695, + "learning_rate": 2.5579758520072443e-05, + "loss": 0.0284, + "step": 20380 + }, + { + "epoch": 24.47779111644658, + "grad_norm": 0.15236109495162964, + "learning_rate": 2.5531678494587615e-05, + "loss": 0.0272, + "step": 20390 + }, + { + "epoch": 24.489795918367346, + "grad_norm": 0.3093228340148926, + "learning_rate": 2.5483628200420644e-05, + "loss": 0.0277, + "step": 20400 + }, + { + "epoch": 24.501800720288117, + "grad_norm": 0.1692466139793396, + "learning_rate": 2.5435607695957154e-05, + "loss": 0.0227, + "step": 20410 + }, + { + "epoch": 24.513805522208884, + "grad_norm": 0.16796539723873138, + "learning_rate": 2.5387617039546586e-05, + "loss": 0.0242, + "step": 20420 + }, + { + "epoch": 24.52581032412965, + "grad_norm": 0.133388951420784, + "learning_rate": 2.5339656289502106e-05, + "loss": 0.0197, + "step": 20430 + }, + { + "epoch": 24.537815126050422, + "grad_norm": 0.16043365001678467, + "learning_rate": 2.529172550410056e-05, + "loss": 0.0239, + "step": 20440 + }, + { + "epoch": 24.54981992797119, + "grad_norm": 0.11641258746385574, + "learning_rate": 2.524382474158234e-05, + "loss": 0.0241, + "step": 20450 + }, + { + "epoch": 24.561824729891956, + "grad_norm": 0.14615419507026672, + "learning_rate": 2.5195954060151433e-05, + "loss": 0.025, + "step": 20460 + }, + { + "epoch": 24.573829531812724, + "grad_norm": 0.18768751621246338, + "learning_rate": 2.5148113517975212e-05, + "loss": 0.0253, + "step": 20470 + }, + { + "epoch": 24.585834333733494, + "grad_norm": 0.1438913643360138, + "learning_rate": 2.5100303173184447e-05, + "loss": 0.0256, + "step": 20480 + }, + { + "epoch": 24.59783913565426, + "grad_norm": 0.22462992370128632, + "learning_rate": 2.5052523083873196e-05, + "loss": 0.0253, + "step": 20490 + }, + { + "epoch": 24.60984393757503, + "grad_norm": 0.19754095375537872, + "learning_rate": 2.500477330809881e-05, + "loss": 0.0247, + "step": 20500 + }, + { + "epoch": 24.6218487394958, + "grad_norm": 0.18137991428375244, + "learning_rate": 2.4957053903881733e-05, + "loss": 0.0224, + "step": 20510 + }, + { + "epoch": 24.633853541416567, + "grad_norm": 0.20568735897541046, + "learning_rate": 2.4909364929205576e-05, + "loss": 0.0246, + "step": 20520 + }, + { + "epoch": 24.645858343337334, + "grad_norm": 0.11533968150615692, + "learning_rate": 2.4861706442016923e-05, + "loss": 0.0243, + "step": 20530 + }, + { + "epoch": 24.657863145258105, + "grad_norm": 0.1309177428483963, + "learning_rate": 2.481407850022533e-05, + "loss": 0.0246, + "step": 20540 + }, + { + "epoch": 24.669867947178872, + "grad_norm": 0.1320311576128006, + "learning_rate": 2.4766481161703213e-05, + "loss": 0.027, + "step": 20550 + }, + { + "epoch": 24.68187274909964, + "grad_norm": 0.11327943205833435, + "learning_rate": 2.4718914484285876e-05, + "loss": 0.024, + "step": 20560 + }, + { + "epoch": 24.693877551020407, + "grad_norm": 0.18696773052215576, + "learning_rate": 2.467137852577129e-05, + "loss": 0.0233, + "step": 20570 + }, + { + "epoch": 24.705882352941178, + "grad_norm": 0.1381077766418457, + "learning_rate": 2.4623873343920123e-05, + "loss": 0.0241, + "step": 20580 + }, + { + "epoch": 24.717887154861945, + "grad_norm": 0.14449873566627502, + "learning_rate": 2.4576398996455656e-05, + "loss": 0.0253, + "step": 20590 + }, + { + "epoch": 24.729891956782712, + "grad_norm": 0.09883072972297668, + "learning_rate": 2.452895554106368e-05, + "loss": 0.0255, + "step": 20600 + }, + { + "epoch": 24.741896758703483, + "grad_norm": 0.13991789519786835, + "learning_rate": 2.4481543035392506e-05, + "loss": 0.0218, + "step": 20610 + }, + { + "epoch": 24.75390156062425, + "grad_norm": 0.14814579486846924, + "learning_rate": 2.4434161537052774e-05, + "loss": 0.0242, + "step": 20620 + }, + { + "epoch": 24.765906362545017, + "grad_norm": 0.11209749430418015, + "learning_rate": 2.4386811103617474e-05, + "loss": 0.0252, + "step": 20630 + }, + { + "epoch": 24.777911164465785, + "grad_norm": 0.15934473276138306, + "learning_rate": 2.4339491792621833e-05, + "loss": 0.0242, + "step": 20640 + }, + { + "epoch": 24.789915966386555, + "grad_norm": 0.1352858692407608, + "learning_rate": 2.4292203661563312e-05, + "loss": 0.0241, + "step": 20650 + }, + { + "epoch": 24.801920768307323, + "grad_norm": 0.26786303520202637, + "learning_rate": 2.424494676790141e-05, + "loss": 0.0247, + "step": 20660 + }, + { + "epoch": 24.81392557022809, + "grad_norm": 0.18072766065597534, + "learning_rate": 2.4197721169057745e-05, + "loss": 0.0223, + "step": 20670 + }, + { + "epoch": 24.82593037214886, + "grad_norm": 0.14933232963085175, + "learning_rate": 2.4150526922415857e-05, + "loss": 0.027, + "step": 20680 + }, + { + "epoch": 24.837935174069628, + "grad_norm": 0.15249791741371155, + "learning_rate": 2.41033640853212e-05, + "loss": 0.0227, + "step": 20690 + }, + { + "epoch": 24.849939975990395, + "grad_norm": 0.1219593808054924, + "learning_rate": 2.405623271508108e-05, + "loss": 0.0236, + "step": 20700 + }, + { + "epoch": 24.861944777911166, + "grad_norm": 0.17838618159294128, + "learning_rate": 2.4009132868964522e-05, + "loss": 0.0226, + "step": 20710 + }, + { + "epoch": 24.873949579831933, + "grad_norm": 0.141743466258049, + "learning_rate": 2.3962064604202327e-05, + "loss": 0.0247, + "step": 20720 + }, + { + "epoch": 24.8859543817527, + "grad_norm": 0.15161080658435822, + "learning_rate": 2.3915027977986858e-05, + "loss": 0.0259, + "step": 20730 + }, + { + "epoch": 24.897959183673468, + "grad_norm": 0.139906108379364, + "learning_rate": 2.386802304747205e-05, + "loss": 0.0272, + "step": 20740 + }, + { + "epoch": 24.90996398559424, + "grad_norm": 0.13783030211925507, + "learning_rate": 2.382104986977332e-05, + "loss": 0.0218, + "step": 20750 + }, + { + "epoch": 24.921968787515006, + "grad_norm": 0.1585591733455658, + "learning_rate": 2.377410850196749e-05, + "loss": 0.0261, + "step": 20760 + }, + { + "epoch": 24.933973589435773, + "grad_norm": 0.13926950097084045, + "learning_rate": 2.3727199001092797e-05, + "loss": 0.0228, + "step": 20770 + }, + { + "epoch": 24.945978391356544, + "grad_norm": 0.16638319194316864, + "learning_rate": 2.3680321424148677e-05, + "loss": 0.0212, + "step": 20780 + }, + { + "epoch": 24.95798319327731, + "grad_norm": 0.1644236147403717, + "learning_rate": 2.3633475828095793e-05, + "loss": 0.0234, + "step": 20790 + }, + { + "epoch": 24.96998799519808, + "grad_norm": 0.15463919937610626, + "learning_rate": 2.358666226985599e-05, + "loss": 0.025, + "step": 20800 + }, + { + "epoch": 24.98199279711885, + "grad_norm": 0.11844702810049057, + "learning_rate": 2.353988080631213e-05, + "loss": 0.0202, + "step": 20810 + }, + { + "epoch": 24.993997599039616, + "grad_norm": 0.12839071452617645, + "learning_rate": 2.3493131494308142e-05, + "loss": 0.023, + "step": 20820 + }, + { + "epoch": 25.006002400960384, + "grad_norm": 0.16909760236740112, + "learning_rate": 2.344641439064881e-05, + "loss": 0.0252, + "step": 20830 + }, + { + "epoch": 25.01800720288115, + "grad_norm": 0.22735287249088287, + "learning_rate": 2.339972955209984e-05, + "loss": 0.0241, + "step": 20840 + }, + { + "epoch": 25.03001200480192, + "grad_norm": 0.21028269827365875, + "learning_rate": 2.335307703538771e-05, + "loss": 0.0232, + "step": 20850 + }, + { + "epoch": 25.04201680672269, + "grad_norm": 0.1259613037109375, + "learning_rate": 2.330645689719962e-05, + "loss": 0.026, + "step": 20860 + }, + { + "epoch": 25.054021608643456, + "grad_norm": 0.1254626363515854, + "learning_rate": 2.3259869194183414e-05, + "loss": 0.0217, + "step": 20870 + }, + { + "epoch": 25.066026410564227, + "grad_norm": 0.21683155000209808, + "learning_rate": 2.321331398294759e-05, + "loss": 0.026, + "step": 20880 + }, + { + "epoch": 25.078031212484994, + "grad_norm": 0.1291912943124771, + "learning_rate": 2.3166791320061094e-05, + "loss": 0.0245, + "step": 20890 + }, + { + "epoch": 25.09003601440576, + "grad_norm": 0.16226927936077118, + "learning_rate": 2.312030126205335e-05, + "loss": 0.0253, + "step": 20900 + }, + { + "epoch": 25.102040816326532, + "grad_norm": 0.09405460953712463, + "learning_rate": 2.3073843865414162e-05, + "loss": 0.0207, + "step": 20910 + }, + { + "epoch": 25.1140456182473, + "grad_norm": 0.10082219541072845, + "learning_rate": 2.302741918659363e-05, + "loss": 0.0235, + "step": 20920 + }, + { + "epoch": 25.126050420168067, + "grad_norm": 0.13877896964550018, + "learning_rate": 2.2981027282002154e-05, + "loss": 0.0225, + "step": 20930 + }, + { + "epoch": 25.138055222088834, + "grad_norm": 0.11376660317182541, + "learning_rate": 2.2934668208010236e-05, + "loss": 0.0205, + "step": 20940 + }, + { + "epoch": 25.150060024009605, + "grad_norm": 0.12933725118637085, + "learning_rate": 2.2888342020948555e-05, + "loss": 0.0277, + "step": 20950 + }, + { + "epoch": 25.162064825930372, + "grad_norm": 0.13761496543884277, + "learning_rate": 2.2842048777107783e-05, + "loss": 0.0253, + "step": 20960 + }, + { + "epoch": 25.17406962785114, + "grad_norm": 0.11852158606052399, + "learning_rate": 2.2795788532738553e-05, + "loss": 0.0206, + "step": 20970 + }, + { + "epoch": 25.18607442977191, + "grad_norm": 0.28295785188674927, + "learning_rate": 2.2749561344051468e-05, + "loss": 0.0261, + "step": 20980 + }, + { + "epoch": 25.198079231692677, + "grad_norm": 0.17502450942993164, + "learning_rate": 2.2703367267216896e-05, + "loss": 0.0224, + "step": 20990 + }, + { + "epoch": 25.210084033613445, + "grad_norm": 0.14619095623493195, + "learning_rate": 2.2657206358364997e-05, + "loss": 0.0246, + "step": 21000 + }, + { + "epoch": 25.222088835534215, + "grad_norm": 0.13068312406539917, + "learning_rate": 2.261107867358563e-05, + "loss": 0.0208, + "step": 21010 + }, + { + "epoch": 25.234093637454983, + "grad_norm": 0.10725173354148865, + "learning_rate": 2.2564984268928263e-05, + "loss": 0.0217, + "step": 21020 + }, + { + "epoch": 25.24609843937575, + "grad_norm": 0.144377663731575, + "learning_rate": 2.251892320040198e-05, + "loss": 0.0233, + "step": 21030 + }, + { + "epoch": 25.258103241296517, + "grad_norm": 0.12852565944194794, + "learning_rate": 2.2472895523975312e-05, + "loss": 0.0212, + "step": 21040 + }, + { + "epoch": 25.270108043217288, + "grad_norm": 0.23375849425792694, + "learning_rate": 2.2426901295576212e-05, + "loss": 0.0247, + "step": 21050 + }, + { + "epoch": 25.282112845138055, + "grad_norm": 0.14433352649211884, + "learning_rate": 2.2380940571092013e-05, + "loss": 0.0232, + "step": 21060 + }, + { + "epoch": 25.294117647058822, + "grad_norm": 0.1706189215183258, + "learning_rate": 2.2335013406369302e-05, + "loss": 0.0244, + "step": 21070 + }, + { + "epoch": 25.306122448979593, + "grad_norm": 0.14465750753879547, + "learning_rate": 2.228911985721397e-05, + "loss": 0.0201, + "step": 21080 + }, + { + "epoch": 25.31812725090036, + "grad_norm": 0.1454075276851654, + "learning_rate": 2.224325997939095e-05, + "loss": 0.023, + "step": 21090 + }, + { + "epoch": 25.330132052821128, + "grad_norm": 0.16354124248027802, + "learning_rate": 2.219743382862437e-05, + "loss": 0.0233, + "step": 21100 + }, + { + "epoch": 25.342136854741895, + "grad_norm": 0.1235484927892685, + "learning_rate": 2.2151641460597294e-05, + "loss": 0.0221, + "step": 21110 + }, + { + "epoch": 25.354141656662666, + "grad_norm": 0.14026296138763428, + "learning_rate": 2.210588293095177e-05, + "loss": 0.0245, + "step": 21120 + }, + { + "epoch": 25.366146458583433, + "grad_norm": 0.1155179962515831, + "learning_rate": 2.2060158295288714e-05, + "loss": 0.0213, + "step": 21130 + }, + { + "epoch": 25.3781512605042, + "grad_norm": 0.16602090001106262, + "learning_rate": 2.2014467609167905e-05, + "loss": 0.0207, + "step": 21140 + }, + { + "epoch": 25.39015606242497, + "grad_norm": 0.16849586367607117, + "learning_rate": 2.1968810928107806e-05, + "loss": 0.0267, + "step": 21150 + }, + { + "epoch": 25.40216086434574, + "grad_norm": 0.17234639823436737, + "learning_rate": 2.1923188307585606e-05, + "loss": 0.0235, + "step": 21160 + }, + { + "epoch": 25.414165666266506, + "grad_norm": 0.14777640998363495, + "learning_rate": 2.187759980303708e-05, + "loss": 0.0232, + "step": 21170 + }, + { + "epoch": 25.426170468187276, + "grad_norm": 0.11016564816236496, + "learning_rate": 2.1832045469856545e-05, + "loss": 0.0221, + "step": 21180 + }, + { + "epoch": 25.438175270108044, + "grad_norm": 0.11999186128377914, + "learning_rate": 2.1786525363396844e-05, + "loss": 0.0204, + "step": 21190 + }, + { + "epoch": 25.45018007202881, + "grad_norm": 0.15144893527030945, + "learning_rate": 2.1741039538969183e-05, + "loss": 0.0231, + "step": 21200 + }, + { + "epoch": 25.462184873949578, + "grad_norm": 0.2556988000869751, + "learning_rate": 2.169558805184313e-05, + "loss": 0.0236, + "step": 21210 + }, + { + "epoch": 25.47418967587035, + "grad_norm": 0.13731789588928223, + "learning_rate": 2.165017095724651e-05, + "loss": 0.0227, + "step": 21220 + }, + { + "epoch": 25.486194477791116, + "grad_norm": 0.11760425567626953, + "learning_rate": 2.1604788310365405e-05, + "loss": 0.026, + "step": 21230 + }, + { + "epoch": 25.498199279711883, + "grad_norm": 0.20052097737789154, + "learning_rate": 2.155944016634398e-05, + "loss": 0.0232, + "step": 21240 + }, + { + "epoch": 25.510204081632654, + "grad_norm": 0.14892998337745667, + "learning_rate": 2.1514126580284538e-05, + "loss": 0.0238, + "step": 21250 + }, + { + "epoch": 25.52220888355342, + "grad_norm": 0.1541837453842163, + "learning_rate": 2.1468847607247344e-05, + "loss": 0.0239, + "step": 21260 + }, + { + "epoch": 25.53421368547419, + "grad_norm": 0.1341930329799652, + "learning_rate": 2.1423603302250623e-05, + "loss": 0.0205, + "step": 21270 + }, + { + "epoch": 25.54621848739496, + "grad_norm": 0.18611998856067657, + "learning_rate": 2.1378393720270468e-05, + "loss": 0.021, + "step": 21280 + }, + { + "epoch": 25.558223289315727, + "grad_norm": 0.26982957124710083, + "learning_rate": 2.133321891624076e-05, + "loss": 0.0222, + "step": 21290 + }, + { + "epoch": 25.570228091236494, + "grad_norm": 0.14069895446300507, + "learning_rate": 2.1288078945053193e-05, + "loss": 0.0251, + "step": 21300 + }, + { + "epoch": 25.58223289315726, + "grad_norm": 0.12149966508150101, + "learning_rate": 2.1242973861557064e-05, + "loss": 0.0211, + "step": 21310 + }, + { + "epoch": 25.594237695078032, + "grad_norm": 0.18742959201335907, + "learning_rate": 2.1197903720559304e-05, + "loss": 0.0212, + "step": 21320 + }, + { + "epoch": 25.6062424969988, + "grad_norm": 0.15431396663188934, + "learning_rate": 2.115286857682438e-05, + "loss": 0.0274, + "step": 21330 + }, + { + "epoch": 25.618247298919567, + "grad_norm": 0.2406674474477768, + "learning_rate": 2.1107868485074228e-05, + "loss": 0.0238, + "step": 21340 + }, + { + "epoch": 25.630252100840337, + "grad_norm": 0.1427590698003769, + "learning_rate": 2.1062903499988234e-05, + "loss": 0.0245, + "step": 21350 + }, + { + "epoch": 25.642256902761105, + "grad_norm": 0.143111452460289, + "learning_rate": 2.101797367620308e-05, + "loss": 0.0246, + "step": 21360 + }, + { + "epoch": 25.654261704681872, + "grad_norm": 0.11629022657871246, + "learning_rate": 2.0973079068312713e-05, + "loss": 0.0262, + "step": 21370 + }, + { + "epoch": 25.666266506602643, + "grad_norm": 0.1069294735789299, + "learning_rate": 2.0928219730868358e-05, + "loss": 0.0226, + "step": 21380 + }, + { + "epoch": 25.67827130852341, + "grad_norm": 0.1765851378440857, + "learning_rate": 2.0883395718378302e-05, + "loss": 0.0256, + "step": 21390 + }, + { + "epoch": 25.690276110444177, + "grad_norm": 0.14055021107196808, + "learning_rate": 2.083860708530798e-05, + "loss": 0.0247, + "step": 21400 + }, + { + "epoch": 25.702280912364944, + "grad_norm": 0.13715913891792297, + "learning_rate": 2.0793853886079796e-05, + "loss": 0.0222, + "step": 21410 + }, + { + "epoch": 25.714285714285715, + "grad_norm": 0.1487652063369751, + "learning_rate": 2.074913617507309e-05, + "loss": 0.0246, + "step": 21420 + }, + { + "epoch": 25.726290516206483, + "grad_norm": 0.10095977783203125, + "learning_rate": 2.070445400662411e-05, + "loss": 0.0216, + "step": 21430 + }, + { + "epoch": 25.73829531812725, + "grad_norm": 0.12987712025642395, + "learning_rate": 2.0659807435025907e-05, + "loss": 0.0222, + "step": 21440 + }, + { + "epoch": 25.75030012004802, + "grad_norm": 0.11840362101793289, + "learning_rate": 2.061519651452825e-05, + "loss": 0.0232, + "step": 21450 + }, + { + "epoch": 25.762304921968788, + "grad_norm": 0.12199442088603973, + "learning_rate": 2.0570621299337656e-05, + "loss": 0.0229, + "step": 21460 + }, + { + "epoch": 25.774309723889555, + "grad_norm": 0.14325737953186035, + "learning_rate": 2.0526081843617183e-05, + "loss": 0.0227, + "step": 21470 + }, + { + "epoch": 25.786314525810326, + "grad_norm": 0.12465966492891312, + "learning_rate": 2.0481578201486485e-05, + "loss": 0.0202, + "step": 21480 + }, + { + "epoch": 25.798319327731093, + "grad_norm": 0.15693214535713196, + "learning_rate": 2.0437110427021677e-05, + "loss": 0.0213, + "step": 21490 + }, + { + "epoch": 25.81032412965186, + "grad_norm": 0.15001654624938965, + "learning_rate": 2.0392678574255282e-05, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 25.822328931572628, + "grad_norm": 0.12148378789424896, + "learning_rate": 2.0348282697176217e-05, + "loss": 0.0211, + "step": 21510 + }, + { + "epoch": 25.8343337334934, + "grad_norm": 0.10216701030731201, + "learning_rate": 2.030392284972964e-05, + "loss": 0.0224, + "step": 21520 + }, + { + "epoch": 25.846338535414166, + "grad_norm": 0.17971721291542053, + "learning_rate": 2.0259599085816973e-05, + "loss": 0.0267, + "step": 21530 + }, + { + "epoch": 25.858343337334933, + "grad_norm": 0.164643794298172, + "learning_rate": 2.0215311459295755e-05, + "loss": 0.0245, + "step": 21540 + }, + { + "epoch": 25.870348139255704, + "grad_norm": 0.12775593996047974, + "learning_rate": 2.0171060023979606e-05, + "loss": 0.0236, + "step": 21550 + }, + { + "epoch": 25.88235294117647, + "grad_norm": 0.16631050407886505, + "learning_rate": 2.012684483363823e-05, + "loss": 0.02, + "step": 21560 + }, + { + "epoch": 25.89435774309724, + "grad_norm": 0.1412460058927536, + "learning_rate": 2.0082665941997237e-05, + "loss": 0.0213, + "step": 21570 + }, + { + "epoch": 25.906362545018006, + "grad_norm": 0.12517420947551727, + "learning_rate": 2.0038523402738146e-05, + "loss": 0.0243, + "step": 21580 + }, + { + "epoch": 25.918367346938776, + "grad_norm": 0.16646961867809296, + "learning_rate": 1.99944172694983e-05, + "loss": 0.0226, + "step": 21590 + }, + { + "epoch": 25.930372148859544, + "grad_norm": 0.13655860722064972, + "learning_rate": 1.99503475958708e-05, + "loss": 0.0218, + "step": 21600 + }, + { + "epoch": 25.94237695078031, + "grad_norm": 0.12885116040706635, + "learning_rate": 1.9906314435404484e-05, + "loss": 0.0241, + "step": 21610 + }, + { + "epoch": 25.95438175270108, + "grad_norm": 0.13592582941055298, + "learning_rate": 1.986231784160378e-05, + "loss": 0.0216, + "step": 21620 + }, + { + "epoch": 25.96638655462185, + "grad_norm": 0.13358740508556366, + "learning_rate": 1.9818357867928693e-05, + "loss": 0.0261, + "step": 21630 + }, + { + "epoch": 25.978391356542616, + "grad_norm": 0.1186671331524849, + "learning_rate": 1.977443456779474e-05, + "loss": 0.0216, + "step": 21640 + }, + { + "epoch": 25.990396158463387, + "grad_norm": 0.21757365763187408, + "learning_rate": 1.9730547994572858e-05, + "loss": 0.022, + "step": 21650 + }, + { + "epoch": 26.002400960384154, + "grad_norm": 0.1952618658542633, + "learning_rate": 1.9686698201589393e-05, + "loss": 0.0236, + "step": 21660 + }, + { + "epoch": 26.01440576230492, + "grad_norm": 0.13430652022361755, + "learning_rate": 1.9642885242125964e-05, + "loss": 0.0223, + "step": 21670 + }, + { + "epoch": 26.02641056422569, + "grad_norm": 0.13397128880023956, + "learning_rate": 1.9599109169419466e-05, + "loss": 0.0214, + "step": 21680 + }, + { + "epoch": 26.03841536614646, + "grad_norm": 0.1869993656873703, + "learning_rate": 1.9555370036661948e-05, + "loss": 0.0214, + "step": 21690 + }, + { + "epoch": 26.050420168067227, + "grad_norm": 0.176099956035614, + "learning_rate": 1.9511667897000574e-05, + "loss": 0.0241, + "step": 21700 + }, + { + "epoch": 26.062424969987994, + "grad_norm": 0.19610407948493958, + "learning_rate": 1.946800280353755e-05, + "loss": 0.0203, + "step": 21710 + }, + { + "epoch": 26.074429771908765, + "grad_norm": 0.24617117643356323, + "learning_rate": 1.9424374809330114e-05, + "loss": 0.0216, + "step": 21720 + }, + { + "epoch": 26.086434573829532, + "grad_norm": 0.13529929518699646, + "learning_rate": 1.9380783967390382e-05, + "loss": 0.0225, + "step": 21730 + }, + { + "epoch": 26.0984393757503, + "grad_norm": 0.13489143550395966, + "learning_rate": 1.933723033068533e-05, + "loss": 0.0232, + "step": 21740 + }, + { + "epoch": 26.11044417767107, + "grad_norm": 0.17133590579032898, + "learning_rate": 1.929371395213674e-05, + "loss": 0.0236, + "step": 21750 + }, + { + "epoch": 26.122448979591837, + "grad_norm": 0.15524515509605408, + "learning_rate": 1.9250234884621092e-05, + "loss": 0.0263, + "step": 21760 + }, + { + "epoch": 26.134453781512605, + "grad_norm": 0.12642420828342438, + "learning_rate": 1.920679318096959e-05, + "loss": 0.022, + "step": 21770 + }, + { + "epoch": 26.146458583433372, + "grad_norm": 0.1571064293384552, + "learning_rate": 1.9163388893967982e-05, + "loss": 0.0215, + "step": 21780 + }, + { + "epoch": 26.158463385354143, + "grad_norm": 0.10795789211988449, + "learning_rate": 1.9120022076356575e-05, + "loss": 0.0211, + "step": 21790 + }, + { + "epoch": 26.17046818727491, + "grad_norm": 0.09837262332439423, + "learning_rate": 1.9076692780830114e-05, + "loss": 0.0217, + "step": 21800 + }, + { + "epoch": 26.182472989195677, + "grad_norm": 0.17800407111644745, + "learning_rate": 1.903340106003782e-05, + "loss": 0.0223, + "step": 21810 + }, + { + "epoch": 26.194477791116448, + "grad_norm": 0.16858910024166107, + "learning_rate": 1.8990146966583183e-05, + "loss": 0.0228, + "step": 21820 + }, + { + "epoch": 26.206482593037215, + "grad_norm": 0.14710889756679535, + "learning_rate": 1.8946930553024035e-05, + "loss": 0.0225, + "step": 21830 + }, + { + "epoch": 26.218487394957982, + "grad_norm": 0.10849358886480331, + "learning_rate": 1.8903751871872378e-05, + "loss": 0.0211, + "step": 21840 + }, + { + "epoch": 26.230492196878753, + "grad_norm": 0.10527700930833817, + "learning_rate": 1.8860610975594382e-05, + "loss": 0.0207, + "step": 21850 + }, + { + "epoch": 26.24249699879952, + "grad_norm": 0.22927704453468323, + "learning_rate": 1.8817507916610307e-05, + "loss": 0.0236, + "step": 21860 + }, + { + "epoch": 26.254501800720288, + "grad_norm": 0.12823647260665894, + "learning_rate": 1.8774442747294407e-05, + "loss": 0.0231, + "step": 21870 + }, + { + "epoch": 26.266506602641055, + "grad_norm": 0.14417926967144012, + "learning_rate": 1.8731415519974965e-05, + "loss": 0.0224, + "step": 21880 + }, + { + "epoch": 26.278511404561826, + "grad_norm": 0.1430933028459549, + "learning_rate": 1.86884262869341e-05, + "loss": 0.0214, + "step": 21890 + }, + { + "epoch": 26.290516206482593, + "grad_norm": 0.09878337383270264, + "learning_rate": 1.8645475100407788e-05, + "loss": 0.0207, + "step": 21900 + }, + { + "epoch": 26.30252100840336, + "grad_norm": 0.13216178119182587, + "learning_rate": 1.8602562012585768e-05, + "loss": 0.0268, + "step": 21910 + }, + { + "epoch": 26.31452581032413, + "grad_norm": 0.10237500816583633, + "learning_rate": 1.8559687075611466e-05, + "loss": 0.0177, + "step": 21920 + }, + { + "epoch": 26.3265306122449, + "grad_norm": 0.16356079280376434, + "learning_rate": 1.8516850341582017e-05, + "loss": 0.0221, + "step": 21930 + }, + { + "epoch": 26.338535414165666, + "grad_norm": 0.13223497569561005, + "learning_rate": 1.847405186254807e-05, + "loss": 0.0222, + "step": 21940 + }, + { + "epoch": 26.350540216086436, + "grad_norm": 0.16122056543827057, + "learning_rate": 1.843129169051379e-05, + "loss": 0.0251, + "step": 21950 + }, + { + "epoch": 26.362545018007204, + "grad_norm": 0.15311673283576965, + "learning_rate": 1.8388569877436863e-05, + "loss": 0.0218, + "step": 21960 + }, + { + "epoch": 26.37454981992797, + "grad_norm": 0.15247122943401337, + "learning_rate": 1.834588647522828e-05, + "loss": 0.0205, + "step": 21970 + }, + { + "epoch": 26.386554621848738, + "grad_norm": 0.13160966336727142, + "learning_rate": 1.8303241535752435e-05, + "loss": 0.0221, + "step": 21980 + }, + { + "epoch": 26.39855942376951, + "grad_norm": 0.16289196908473969, + "learning_rate": 1.8260635110826934e-05, + "loss": 0.0248, + "step": 21990 + }, + { + "epoch": 26.410564225690276, + "grad_norm": 0.15324406325817108, + "learning_rate": 1.8218067252222597e-05, + "loss": 0.0262, + "step": 22000 + }, + { + "epoch": 26.422569027611043, + "grad_norm": 0.14814938604831696, + "learning_rate": 1.817553801166339e-05, + "loss": 0.02, + "step": 22010 + }, + { + "epoch": 26.434573829531814, + "grad_norm": 0.11543341726064682, + "learning_rate": 1.8133047440826333e-05, + "loss": 0.0233, + "step": 22020 + }, + { + "epoch": 26.44657863145258, + "grad_norm": 0.13095760345458984, + "learning_rate": 1.809059559134151e-05, + "loss": 0.0242, + "step": 22030 + }, + { + "epoch": 26.45858343337335, + "grad_norm": 0.14437781274318695, + "learning_rate": 1.80481825147919e-05, + "loss": 0.0237, + "step": 22040 + }, + { + "epoch": 26.470588235294116, + "grad_norm": 0.18396629393100739, + "learning_rate": 1.8005808262713396e-05, + "loss": 0.0254, + "step": 22050 + }, + { + "epoch": 26.482593037214887, + "grad_norm": 0.1603223979473114, + "learning_rate": 1.7963472886594714e-05, + "loss": 0.0228, + "step": 22060 + }, + { + "epoch": 26.494597839135654, + "grad_norm": 0.1437341570854187, + "learning_rate": 1.7921176437877302e-05, + "loss": 0.0217, + "step": 22070 + }, + { + "epoch": 26.50660264105642, + "grad_norm": 0.12557433545589447, + "learning_rate": 1.7878918967955366e-05, + "loss": 0.0206, + "step": 22080 + }, + { + "epoch": 26.518607442977192, + "grad_norm": 0.1266847550868988, + "learning_rate": 1.7836700528175693e-05, + "loss": 0.0207, + "step": 22090 + }, + { + "epoch": 26.53061224489796, + "grad_norm": 0.1408148556947708, + "learning_rate": 1.779452116983769e-05, + "loss": 0.0224, + "step": 22100 + }, + { + "epoch": 26.542617046818727, + "grad_norm": 0.1467968225479126, + "learning_rate": 1.775238094419325e-05, + "loss": 0.0212, + "step": 22110 + }, + { + "epoch": 26.554621848739497, + "grad_norm": 0.1709931641817093, + "learning_rate": 1.7710279902446708e-05, + "loss": 0.0217, + "step": 22120 + }, + { + "epoch": 26.566626650660265, + "grad_norm": 0.15106359124183655, + "learning_rate": 1.7668218095754795e-05, + "loss": 0.0225, + "step": 22130 + }, + { + "epoch": 26.578631452581032, + "grad_norm": 0.18498820066452026, + "learning_rate": 1.7626195575226594e-05, + "loss": 0.0214, + "step": 22140 + }, + { + "epoch": 26.5906362545018, + "grad_norm": 0.1458589881658554, + "learning_rate": 1.758421239192343e-05, + "loss": 0.0229, + "step": 22150 + }, + { + "epoch": 26.60264105642257, + "grad_norm": 0.14780539274215698, + "learning_rate": 1.7542268596858814e-05, + "loss": 0.0233, + "step": 22160 + }, + { + "epoch": 26.614645858343337, + "grad_norm": 0.14456841349601746, + "learning_rate": 1.750036424099841e-05, + "loss": 0.0222, + "step": 22170 + }, + { + "epoch": 26.626650660264104, + "grad_norm": 0.13536755740642548, + "learning_rate": 1.7458499375259956e-05, + "loss": 0.0227, + "step": 22180 + }, + { + "epoch": 26.638655462184875, + "grad_norm": 0.13823623955249786, + "learning_rate": 1.7416674050513245e-05, + "loss": 0.0225, + "step": 22190 + }, + { + "epoch": 26.650660264105642, + "grad_norm": 0.10132357478141785, + "learning_rate": 1.737488831757997e-05, + "loss": 0.0228, + "step": 22200 + }, + { + "epoch": 26.66266506602641, + "grad_norm": 0.18887630105018616, + "learning_rate": 1.733314222723373e-05, + "loss": 0.0224, + "step": 22210 + }, + { + "epoch": 26.67466986794718, + "grad_norm": 0.15095727145671844, + "learning_rate": 1.7291435830199954e-05, + "loss": 0.0215, + "step": 22220 + }, + { + "epoch": 26.686674669867948, + "grad_norm": 0.1657824069261551, + "learning_rate": 1.7249769177155878e-05, + "loss": 0.0221, + "step": 22230 + }, + { + "epoch": 26.698679471788715, + "grad_norm": 0.16824282705783844, + "learning_rate": 1.720814231873038e-05, + "loss": 0.0201, + "step": 22240 + }, + { + "epoch": 26.710684273709482, + "grad_norm": 0.16417892277240753, + "learning_rate": 1.716655530550405e-05, + "loss": 0.0209, + "step": 22250 + }, + { + "epoch": 26.722689075630253, + "grad_norm": 0.12105899304151535, + "learning_rate": 1.7125008188009017e-05, + "loss": 0.0198, + "step": 22260 + }, + { + "epoch": 26.73469387755102, + "grad_norm": 0.15529324114322662, + "learning_rate": 1.708350101672894e-05, + "loss": 0.0227, + "step": 22270 + }, + { + "epoch": 26.746698679471788, + "grad_norm": 0.20835082232952118, + "learning_rate": 1.7042033842098957e-05, + "loss": 0.0224, + "step": 22280 + }, + { + "epoch": 26.75870348139256, + "grad_norm": 0.1339913308620453, + "learning_rate": 1.7000606714505567e-05, + "loss": 0.0241, + "step": 22290 + }, + { + "epoch": 26.770708283313326, + "grad_norm": 0.11992942541837692, + "learning_rate": 1.695921968428668e-05, + "loss": 0.0215, + "step": 22300 + }, + { + "epoch": 26.782713085234093, + "grad_norm": 0.13212205469608307, + "learning_rate": 1.6917872801731416e-05, + "loss": 0.022, + "step": 22310 + }, + { + "epoch": 26.79471788715486, + "grad_norm": 0.18816453218460083, + "learning_rate": 1.687656611708014e-05, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 26.80672268907563, + "grad_norm": 0.09166266769170761, + "learning_rate": 1.6835299680524368e-05, + "loss": 0.0207, + "step": 22330 + }, + { + "epoch": 26.818727490996398, + "grad_norm": 0.17592717707157135, + "learning_rate": 1.679407354220669e-05, + "loss": 0.022, + "step": 22340 + }, + { + "epoch": 26.830732292917165, + "grad_norm": 0.14277255535125732, + "learning_rate": 1.6752887752220793e-05, + "loss": 0.0258, + "step": 22350 + }, + { + "epoch": 26.842737094837936, + "grad_norm": 0.15904197096824646, + "learning_rate": 1.6711742360611277e-05, + "loss": 0.0226, + "step": 22360 + }, + { + "epoch": 26.854741896758703, + "grad_norm": 0.1520892083644867, + "learning_rate": 1.6670637417373652e-05, + "loss": 0.0218, + "step": 22370 + }, + { + "epoch": 26.86674669867947, + "grad_norm": 0.1334032267332077, + "learning_rate": 1.6629572972454333e-05, + "loss": 0.0209, + "step": 22380 + }, + { + "epoch": 26.87875150060024, + "grad_norm": 0.17315423488616943, + "learning_rate": 1.6588549075750464e-05, + "loss": 0.0232, + "step": 22390 + }, + { + "epoch": 26.89075630252101, + "grad_norm": 0.1696779429912567, + "learning_rate": 1.6547565777109976e-05, + "loss": 0.0235, + "step": 22400 + }, + { + "epoch": 26.902761104441776, + "grad_norm": 0.21570031344890594, + "learning_rate": 1.650662312633143e-05, + "loss": 0.0232, + "step": 22410 + }, + { + "epoch": 26.914765906362547, + "grad_norm": 0.12661398947238922, + "learning_rate": 1.6465721173164002e-05, + "loss": 0.0211, + "step": 22420 + }, + { + "epoch": 26.926770708283314, + "grad_norm": 0.12434355914592743, + "learning_rate": 1.6424859967307428e-05, + "loss": 0.0214, + "step": 22430 + }, + { + "epoch": 26.93877551020408, + "grad_norm": 0.17637760937213898, + "learning_rate": 1.6384039558411903e-05, + "loss": 0.0248, + "step": 22440 + }, + { + "epoch": 26.95078031212485, + "grad_norm": 0.12867140769958496, + "learning_rate": 1.634325999607811e-05, + "loss": 0.0197, + "step": 22450 + }, + { + "epoch": 26.96278511404562, + "grad_norm": 0.12671659886837006, + "learning_rate": 1.6302521329857045e-05, + "loss": 0.0236, + "step": 22460 + }, + { + "epoch": 26.974789915966387, + "grad_norm": 0.14520388841629028, + "learning_rate": 1.6261823609250027e-05, + "loss": 0.0232, + "step": 22470 + }, + { + "epoch": 26.986794717887154, + "grad_norm": 0.1501356065273285, + "learning_rate": 1.6221166883708627e-05, + "loss": 0.0227, + "step": 22480 + }, + { + "epoch": 26.998799519807925, + "grad_norm": 0.11690602451562881, + "learning_rate": 1.6180551202634603e-05, + "loss": 0.0209, + "step": 22490 + }, + { + "epoch": 27.010804321728692, + "grad_norm": 0.11809932440519333, + "learning_rate": 1.613997661537981e-05, + "loss": 0.0194, + "step": 22500 + }, + { + "epoch": 27.02280912364946, + "grad_norm": 0.14049741625785828, + "learning_rate": 1.609944317124624e-05, + "loss": 0.0209, + "step": 22510 + }, + { + "epoch": 27.034813925570226, + "grad_norm": 0.1603565812110901, + "learning_rate": 1.605895091948582e-05, + "loss": 0.0215, + "step": 22520 + }, + { + "epoch": 27.046818727490997, + "grad_norm": 0.1341741532087326, + "learning_rate": 1.6018499909300477e-05, + "loss": 0.0197, + "step": 22530 + }, + { + "epoch": 27.058823529411764, + "grad_norm": 0.13977956771850586, + "learning_rate": 1.597809018984199e-05, + "loss": 0.0227, + "step": 22540 + }, + { + "epoch": 27.07082833133253, + "grad_norm": 0.10594061017036438, + "learning_rate": 1.593772181021196e-05, + "loss": 0.0215, + "step": 22550 + }, + { + "epoch": 27.082833133253303, + "grad_norm": 0.18941223621368408, + "learning_rate": 1.5897394819461813e-05, + "loss": 0.0224, + "step": 22560 + }, + { + "epoch": 27.09483793517407, + "grad_norm": 0.17227433621883392, + "learning_rate": 1.5857109266592625e-05, + "loss": 0.0251, + "step": 22570 + }, + { + "epoch": 27.106842737094837, + "grad_norm": 0.08682507276535034, + "learning_rate": 1.581686520055514e-05, + "loss": 0.0177, + "step": 22580 + }, + { + "epoch": 27.118847539015608, + "grad_norm": 0.15874603390693665, + "learning_rate": 1.5776662670249704e-05, + "loss": 0.0228, + "step": 22590 + }, + { + "epoch": 27.130852340936375, + "grad_norm": 0.1491602063179016, + "learning_rate": 1.573650172452615e-05, + "loss": 0.0222, + "step": 22600 + }, + { + "epoch": 27.142857142857142, + "grad_norm": 0.11198665201663971, + "learning_rate": 1.5696382412183852e-05, + "loss": 0.0207, + "step": 22610 + }, + { + "epoch": 27.15486194477791, + "grad_norm": 0.1200985386967659, + "learning_rate": 1.565630478197155e-05, + "loss": 0.0199, + "step": 22620 + }, + { + "epoch": 27.16686674669868, + "grad_norm": 0.1563582867383957, + "learning_rate": 1.5616268882587332e-05, + "loss": 0.0252, + "step": 22630 + }, + { + "epoch": 27.178871548619448, + "grad_norm": 0.13128525018692017, + "learning_rate": 1.5576274762678594e-05, + "loss": 0.0249, + "step": 22640 + }, + { + "epoch": 27.190876350540215, + "grad_norm": 0.17004545032978058, + "learning_rate": 1.5536322470841952e-05, + "loss": 0.0242, + "step": 22650 + }, + { + "epoch": 27.202881152460986, + "grad_norm": 0.1592366099357605, + "learning_rate": 1.549641205562324e-05, + "loss": 0.0234, + "step": 22660 + }, + { + "epoch": 27.214885954381753, + "grad_norm": 0.14700132608413696, + "learning_rate": 1.5456543565517334e-05, + "loss": 0.0197, + "step": 22670 + }, + { + "epoch": 27.22689075630252, + "grad_norm": 0.1386798471212387, + "learning_rate": 1.541671704896825e-05, + "loss": 0.0193, + "step": 22680 + }, + { + "epoch": 27.23889555822329, + "grad_norm": 0.12607847154140472, + "learning_rate": 1.5376932554368938e-05, + "loss": 0.0209, + "step": 22690 + }, + { + "epoch": 27.25090036014406, + "grad_norm": 0.09603700786828995, + "learning_rate": 1.5337190130061318e-05, + "loss": 0.0196, + "step": 22700 + }, + { + "epoch": 27.262905162064826, + "grad_norm": 0.1405154913663864, + "learning_rate": 1.5297489824336165e-05, + "loss": 0.0221, + "step": 22710 + }, + { + "epoch": 27.274909963985593, + "grad_norm": 0.13211777806282043, + "learning_rate": 1.5257831685433121e-05, + "loss": 0.0206, + "step": 22720 + }, + { + "epoch": 27.286914765906364, + "grad_norm": 0.15171808004379272, + "learning_rate": 1.521821576154055e-05, + "loss": 0.0231, + "step": 22730 + }, + { + "epoch": 27.29891956782713, + "grad_norm": 0.16554026305675507, + "learning_rate": 1.5178642100795542e-05, + "loss": 0.0223, + "step": 22740 + }, + { + "epoch": 27.310924369747898, + "grad_norm": 0.14101165533065796, + "learning_rate": 1.5139110751283819e-05, + "loss": 0.0185, + "step": 22750 + }, + { + "epoch": 27.32292917166867, + "grad_norm": 0.1337374746799469, + "learning_rate": 1.5099621761039683e-05, + "loss": 0.0226, + "step": 22760 + }, + { + "epoch": 27.334933973589436, + "grad_norm": 0.20172914862632751, + "learning_rate": 1.5060175178046016e-05, + "loss": 0.0208, + "step": 22770 + }, + { + "epoch": 27.346938775510203, + "grad_norm": 0.1428225189447403, + "learning_rate": 1.5020771050234117e-05, + "loss": 0.0222, + "step": 22780 + }, + { + "epoch": 27.35894357743097, + "grad_norm": 0.13419939577579498, + "learning_rate": 1.4981409425483716e-05, + "loss": 0.0202, + "step": 22790 + }, + { + "epoch": 27.37094837935174, + "grad_norm": 0.17382368445396423, + "learning_rate": 1.4942090351622883e-05, + "loss": 0.0247, + "step": 22800 + }, + { + "epoch": 27.38295318127251, + "grad_norm": 0.11841116100549698, + "learning_rate": 1.490281387642804e-05, + "loss": 0.0225, + "step": 22810 + }, + { + "epoch": 27.394957983193276, + "grad_norm": 0.16226187348365784, + "learning_rate": 1.4863580047623765e-05, + "loss": 0.0207, + "step": 22820 + }, + { + "epoch": 27.406962785114047, + "grad_norm": 0.17011821269989014, + "learning_rate": 1.4824388912882897e-05, + "loss": 0.022, + "step": 22830 + }, + { + "epoch": 27.418967587034814, + "grad_norm": 0.1687816083431244, + "learning_rate": 1.4785240519826344e-05, + "loss": 0.0163, + "step": 22840 + }, + { + "epoch": 27.43097238895558, + "grad_norm": 0.11351819336414337, + "learning_rate": 1.4746134916023097e-05, + "loss": 0.0198, + "step": 22850 + }, + { + "epoch": 27.442977190876352, + "grad_norm": 0.1797276884317398, + "learning_rate": 1.4707072148990141e-05, + "loss": 0.0218, + "step": 22860 + }, + { + "epoch": 27.45498199279712, + "grad_norm": 0.18256618082523346, + "learning_rate": 1.4668052266192423e-05, + "loss": 0.0221, + "step": 22870 + }, + { + "epoch": 27.466986794717887, + "grad_norm": 0.12626533210277557, + "learning_rate": 1.4629075315042795e-05, + "loss": 0.0206, + "step": 22880 + }, + { + "epoch": 27.478991596638654, + "grad_norm": 0.1450640708208084, + "learning_rate": 1.4590141342901925e-05, + "loss": 0.0226, + "step": 22890 + }, + { + "epoch": 27.490996398559425, + "grad_norm": 0.08083725720643997, + "learning_rate": 1.4551250397078253e-05, + "loss": 0.0192, + "step": 22900 + }, + { + "epoch": 27.503001200480192, + "grad_norm": 0.14513155817985535, + "learning_rate": 1.4512402524827945e-05, + "loss": 0.0223, + "step": 22910 + }, + { + "epoch": 27.51500600240096, + "grad_norm": 0.15383538603782654, + "learning_rate": 1.447359777335482e-05, + "loss": 0.0208, + "step": 22920 + }, + { + "epoch": 27.52701080432173, + "grad_norm": 0.10704786330461502, + "learning_rate": 1.4434836189810335e-05, + "loss": 0.0222, + "step": 22930 + }, + { + "epoch": 27.539015606242497, + "grad_norm": 0.16888022422790527, + "learning_rate": 1.4396117821293453e-05, + "loss": 0.0211, + "step": 22940 + }, + { + "epoch": 27.551020408163264, + "grad_norm": 0.10549420118331909, + "learning_rate": 1.4357442714850633e-05, + "loss": 0.0236, + "step": 22950 + }, + { + "epoch": 27.563025210084035, + "grad_norm": 0.14895986020565033, + "learning_rate": 1.43188109174758e-05, + "loss": 0.02, + "step": 22960 + }, + { + "epoch": 27.575030012004802, + "grad_norm": 0.15602143108844757, + "learning_rate": 1.4280222476110206e-05, + "loss": 0.0208, + "step": 22970 + }, + { + "epoch": 27.58703481392557, + "grad_norm": 0.12513324618339539, + "learning_rate": 1.4241677437642469e-05, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 27.599039615846337, + "grad_norm": 0.12730669975280762, + "learning_rate": 1.420317584890844e-05, + "loss": 0.0216, + "step": 22990 + }, + { + "epoch": 27.611044417767108, + "grad_norm": 0.12053617835044861, + "learning_rate": 1.4164717756691176e-05, + "loss": 0.0209, + "step": 23000 + }, + { + "epoch": 27.623049219687875, + "grad_norm": 0.09648242592811584, + "learning_rate": 1.4126303207720882e-05, + "loss": 0.0228, + "step": 23010 + }, + { + "epoch": 27.635054021608642, + "grad_norm": 0.14337508380413055, + "learning_rate": 1.408793224867484e-05, + "loss": 0.0196, + "step": 23020 + }, + { + "epoch": 27.647058823529413, + "grad_norm": 0.1492273062467575, + "learning_rate": 1.4049604926177423e-05, + "loss": 0.0201, + "step": 23030 + }, + { + "epoch": 27.65906362545018, + "grad_norm": 0.1435183882713318, + "learning_rate": 1.4011321286799916e-05, + "loss": 0.0213, + "step": 23040 + }, + { + "epoch": 27.671068427370948, + "grad_norm": 0.187921941280365, + "learning_rate": 1.3973081377060565e-05, + "loss": 0.0199, + "step": 23050 + }, + { + "epoch": 27.68307322929172, + "grad_norm": 0.09742706269025803, + "learning_rate": 1.393488524342445e-05, + "loss": 0.0198, + "step": 23060 + }, + { + "epoch": 27.695078031212486, + "grad_norm": 0.1481027901172638, + "learning_rate": 1.3896732932303486e-05, + "loss": 0.022, + "step": 23070 + }, + { + "epoch": 27.707082833133253, + "grad_norm": 0.14960584044456482, + "learning_rate": 1.3858624490056304e-05, + "loss": 0.0209, + "step": 23080 + }, + { + "epoch": 27.71908763505402, + "grad_norm": 0.13890418410301208, + "learning_rate": 1.3820559962988299e-05, + "loss": 0.0225, + "step": 23090 + }, + { + "epoch": 27.73109243697479, + "grad_norm": 0.1405343860387802, + "learning_rate": 1.3782539397351418e-05, + "loss": 0.02, + "step": 23100 + }, + { + "epoch": 27.743097238895558, + "grad_norm": 0.20176903903484344, + "learning_rate": 1.3744562839344266e-05, + "loss": 0.0197, + "step": 23110 + }, + { + "epoch": 27.755102040816325, + "grad_norm": 0.14295919239521027, + "learning_rate": 1.370663033511193e-05, + "loss": 0.0225, + "step": 23120 + }, + { + "epoch": 27.767106842737096, + "grad_norm": 0.20049460232257843, + "learning_rate": 1.3668741930745964e-05, + "loss": 0.0231, + "step": 23130 + }, + { + "epoch": 27.779111644657863, + "grad_norm": 0.17433352768421173, + "learning_rate": 1.3630897672284382e-05, + "loss": 0.0198, + "step": 23140 + }, + { + "epoch": 27.79111644657863, + "grad_norm": 0.15263788402080536, + "learning_rate": 1.3593097605711508e-05, + "loss": 0.0196, + "step": 23150 + }, + { + "epoch": 27.8031212484994, + "grad_norm": 0.1827925443649292, + "learning_rate": 1.3555341776957992e-05, + "loss": 0.018, + "step": 23160 + }, + { + "epoch": 27.81512605042017, + "grad_norm": 0.1324179470539093, + "learning_rate": 1.3517630231900724e-05, + "loss": 0.0213, + "step": 23170 + }, + { + "epoch": 27.827130852340936, + "grad_norm": 0.2099023312330246, + "learning_rate": 1.3479963016362767e-05, + "loss": 0.0212, + "step": 23180 + }, + { + "epoch": 27.839135654261703, + "grad_norm": 0.08758016675710678, + "learning_rate": 1.3442340176113377e-05, + "loss": 0.0194, + "step": 23190 + }, + { + "epoch": 27.851140456182474, + "grad_norm": 0.16907750070095062, + "learning_rate": 1.340476175686784e-05, + "loss": 0.0229, + "step": 23200 + }, + { + "epoch": 27.86314525810324, + "grad_norm": 0.13329680263996124, + "learning_rate": 1.3367227804287469e-05, + "loss": 0.0207, + "step": 23210 + }, + { + "epoch": 27.87515006002401, + "grad_norm": 0.10996707528829575, + "learning_rate": 1.332973836397956e-05, + "loss": 0.0199, + "step": 23220 + }, + { + "epoch": 27.88715486194478, + "grad_norm": 0.12896615266799927, + "learning_rate": 1.329229348149731e-05, + "loss": 0.0203, + "step": 23230 + }, + { + "epoch": 27.899159663865547, + "grad_norm": 0.1421300619840622, + "learning_rate": 1.3254893202339796e-05, + "loss": 0.0212, + "step": 23240 + }, + { + "epoch": 27.911164465786314, + "grad_norm": 0.2186657041311264, + "learning_rate": 1.3217537571951872e-05, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 27.92316926770708, + "grad_norm": 0.1265474408864975, + "learning_rate": 1.3180226635724169e-05, + "loss": 0.0197, + "step": 23260 + }, + { + "epoch": 27.935174069627852, + "grad_norm": 0.14413534104824066, + "learning_rate": 1.314296043899298e-05, + "loss": 0.0186, + "step": 23270 + }, + { + "epoch": 27.94717887154862, + "grad_norm": 0.11279849708080292, + "learning_rate": 1.3105739027040248e-05, + "loss": 0.0202, + "step": 23280 + }, + { + "epoch": 27.959183673469386, + "grad_norm": 0.10481148958206177, + "learning_rate": 1.3068562445093491e-05, + "loss": 0.0188, + "step": 23290 + }, + { + "epoch": 27.971188475390157, + "grad_norm": 0.1351894736289978, + "learning_rate": 1.3031430738325778e-05, + "loss": 0.0213, + "step": 23300 + }, + { + "epoch": 27.983193277310924, + "grad_norm": 0.11906658113002777, + "learning_rate": 1.299434395185563e-05, + "loss": 0.0202, + "step": 23310 + }, + { + "epoch": 27.99519807923169, + "grad_norm": 0.28073757886886597, + "learning_rate": 1.295730213074699e-05, + "loss": 0.0225, + "step": 23320 + }, + { + "epoch": 28.007202881152462, + "grad_norm": 0.13906429708003998, + "learning_rate": 1.2920305320009152e-05, + "loss": 0.0216, + "step": 23330 + }, + { + "epoch": 28.01920768307323, + "grad_norm": 0.14627285301685333, + "learning_rate": 1.2883353564596729e-05, + "loss": 0.0217, + "step": 23340 + }, + { + "epoch": 28.031212484993997, + "grad_norm": 0.11977987736463547, + "learning_rate": 1.2846446909409604e-05, + "loss": 0.0183, + "step": 23350 + }, + { + "epoch": 28.043217286914764, + "grad_norm": 0.11461391299962997, + "learning_rate": 1.280958539929284e-05, + "loss": 0.0216, + "step": 23360 + }, + { + "epoch": 28.055222088835535, + "grad_norm": 0.12852953374385834, + "learning_rate": 1.277276907903664e-05, + "loss": 0.0188, + "step": 23370 + }, + { + "epoch": 28.067226890756302, + "grad_norm": 0.11053499579429626, + "learning_rate": 1.2735997993376298e-05, + "loss": 0.0198, + "step": 23380 + }, + { + "epoch": 28.07923169267707, + "grad_norm": 0.11061467975378036, + "learning_rate": 1.2699272186992166e-05, + "loss": 0.0201, + "step": 23390 + }, + { + "epoch": 28.09123649459784, + "grad_norm": 0.14908309280872345, + "learning_rate": 1.2662591704509547e-05, + "loss": 0.021, + "step": 23400 + }, + { + "epoch": 28.103241296518608, + "grad_norm": 0.09530472010374069, + "learning_rate": 1.2625956590498711e-05, + "loss": 0.0224, + "step": 23410 + }, + { + "epoch": 28.115246098439375, + "grad_norm": 0.15175214409828186, + "learning_rate": 1.2589366889474757e-05, + "loss": 0.0199, + "step": 23420 + }, + { + "epoch": 28.127250900360146, + "grad_norm": 0.11612549424171448, + "learning_rate": 1.2552822645897621e-05, + "loss": 0.0201, + "step": 23430 + }, + { + "epoch": 28.139255702280913, + "grad_norm": 0.15989062190055847, + "learning_rate": 1.2516323904172e-05, + "loss": 0.0215, + "step": 23440 + }, + { + "epoch": 28.15126050420168, + "grad_norm": 0.12337902933359146, + "learning_rate": 1.2479870708647324e-05, + "loss": 0.0217, + "step": 23450 + }, + { + "epoch": 28.163265306122447, + "grad_norm": 0.1315169632434845, + "learning_rate": 1.2443463103617658e-05, + "loss": 0.019, + "step": 23460 + }, + { + "epoch": 28.175270108043218, + "grad_norm": 0.1630536913871765, + "learning_rate": 1.240710113332167e-05, + "loss": 0.0202, + "step": 23470 + }, + { + "epoch": 28.187274909963985, + "grad_norm": 0.14526192843914032, + "learning_rate": 1.2370784841942584e-05, + "loss": 0.0226, + "step": 23480 + }, + { + "epoch": 28.199279711884753, + "grad_norm": 0.20841313898563385, + "learning_rate": 1.2334514273608117e-05, + "loss": 0.0197, + "step": 23490 + }, + { + "epoch": 28.211284513805523, + "grad_norm": 0.09627457708120346, + "learning_rate": 1.2298289472390417e-05, + "loss": 0.0184, + "step": 23500 + }, + { + "epoch": 28.22328931572629, + "grad_norm": 0.12322700768709183, + "learning_rate": 1.226211048230606e-05, + "loss": 0.0184, + "step": 23510 + }, + { + "epoch": 28.235294117647058, + "grad_norm": 0.18626542389392853, + "learning_rate": 1.2225977347315921e-05, + "loss": 0.0205, + "step": 23520 + }, + { + "epoch": 28.24729891956783, + "grad_norm": 0.12849056720733643, + "learning_rate": 1.2189890111325148e-05, + "loss": 0.0199, + "step": 23530 + }, + { + "epoch": 28.259303721488596, + "grad_norm": 0.15195518732070923, + "learning_rate": 1.2153848818183161e-05, + "loss": 0.0207, + "step": 23540 + }, + { + "epoch": 28.271308523409363, + "grad_norm": 0.13248217105865479, + "learning_rate": 1.2117853511683507e-05, + "loss": 0.0203, + "step": 23550 + }, + { + "epoch": 28.28331332533013, + "grad_norm": 0.1054520532488823, + "learning_rate": 1.2081904235563906e-05, + "loss": 0.0201, + "step": 23560 + }, + { + "epoch": 28.2953181272509, + "grad_norm": 0.09792906045913696, + "learning_rate": 1.20460010335061e-05, + "loss": 0.0194, + "step": 23570 + }, + { + "epoch": 28.30732292917167, + "grad_norm": 0.10293040424585342, + "learning_rate": 1.2010143949135865e-05, + "loss": 0.0195, + "step": 23580 + }, + { + "epoch": 28.319327731092436, + "grad_norm": 0.13134272396564484, + "learning_rate": 1.1974333026022938e-05, + "loss": 0.022, + "step": 23590 + }, + { + "epoch": 28.331332533013207, + "grad_norm": 0.1568322330713272, + "learning_rate": 1.1938568307680964e-05, + "loss": 0.0226, + "step": 23600 + }, + { + "epoch": 28.343337334933974, + "grad_norm": 0.14987973868846893, + "learning_rate": 1.1902849837567464e-05, + "loss": 0.0218, + "step": 23610 + }, + { + "epoch": 28.35534213685474, + "grad_norm": 0.16833485662937164, + "learning_rate": 1.1867177659083739e-05, + "loss": 0.0175, + "step": 23620 + }, + { + "epoch": 28.367346938775512, + "grad_norm": 0.2087346762418747, + "learning_rate": 1.1831551815574848e-05, + "loss": 0.0217, + "step": 23630 + }, + { + "epoch": 28.37935174069628, + "grad_norm": 0.13651975989341736, + "learning_rate": 1.1795972350329554e-05, + "loss": 0.0217, + "step": 23640 + }, + { + "epoch": 28.391356542617046, + "grad_norm": 0.0861065611243248, + "learning_rate": 1.176043930658024e-05, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 28.403361344537814, + "grad_norm": 0.20981788635253906, + "learning_rate": 1.1724952727502942e-05, + "loss": 0.021, + "step": 23660 + }, + { + "epoch": 28.415366146458584, + "grad_norm": 0.12622465193271637, + "learning_rate": 1.1689512656217178e-05, + "loss": 0.018, + "step": 23670 + }, + { + "epoch": 28.42737094837935, + "grad_norm": 0.15506795048713684, + "learning_rate": 1.1654119135785962e-05, + "loss": 0.0234, + "step": 23680 + }, + { + "epoch": 28.43937575030012, + "grad_norm": 0.16747413575649261, + "learning_rate": 1.1618772209215794e-05, + "loss": 0.0208, + "step": 23690 + }, + { + "epoch": 28.45138055222089, + "grad_norm": 0.08106610924005508, + "learning_rate": 1.1583471919456506e-05, + "loss": 0.0167, + "step": 23700 + }, + { + "epoch": 28.463385354141657, + "grad_norm": 0.08258811384439468, + "learning_rate": 1.1548218309401266e-05, + "loss": 0.0193, + "step": 23710 + }, + { + "epoch": 28.475390156062424, + "grad_norm": 0.11492346227169037, + "learning_rate": 1.1513011421886555e-05, + "loss": 0.0185, + "step": 23720 + }, + { + "epoch": 28.48739495798319, + "grad_norm": 0.11232996731996536, + "learning_rate": 1.1477851299692056e-05, + "loss": 0.019, + "step": 23730 + }, + { + "epoch": 28.499399759903962, + "grad_norm": 0.09705265611410141, + "learning_rate": 1.144273798554063e-05, + "loss": 0.0211, + "step": 23740 + }, + { + "epoch": 28.51140456182473, + "grad_norm": 0.17131149768829346, + "learning_rate": 1.1407671522098262e-05, + "loss": 0.0228, + "step": 23750 + }, + { + "epoch": 28.523409363745497, + "grad_norm": 0.16281628608703613, + "learning_rate": 1.1372651951974e-05, + "loss": 0.023, + "step": 23760 + }, + { + "epoch": 28.535414165666268, + "grad_norm": 0.1357044130563736, + "learning_rate": 1.1337679317719952e-05, + "loss": 0.0214, + "step": 23770 + }, + { + "epoch": 28.547418967587035, + "grad_norm": 0.12954087555408478, + "learning_rate": 1.1302753661831151e-05, + "loss": 0.0216, + "step": 23780 + }, + { + "epoch": 28.559423769507802, + "grad_norm": 0.12253393977880478, + "learning_rate": 1.1267875026745561e-05, + "loss": 0.0198, + "step": 23790 + }, + { + "epoch": 28.571428571428573, + "grad_norm": 0.1305507868528366, + "learning_rate": 1.1233043454844016e-05, + "loss": 0.0201, + "step": 23800 + }, + { + "epoch": 28.58343337334934, + "grad_norm": 0.13246114552021027, + "learning_rate": 1.1198258988450144e-05, + "loss": 0.0193, + "step": 23810 + }, + { + "epoch": 28.595438175270107, + "grad_norm": 0.20463727414608002, + "learning_rate": 1.1163521669830368e-05, + "loss": 0.0221, + "step": 23820 + }, + { + "epoch": 28.607442977190875, + "grad_norm": 0.15630565583705902, + "learning_rate": 1.1128831541193818e-05, + "loss": 0.0208, + "step": 23830 + }, + { + "epoch": 28.619447779111646, + "grad_norm": 0.19153013825416565, + "learning_rate": 1.1094188644692255e-05, + "loss": 0.0211, + "step": 23840 + }, + { + "epoch": 28.631452581032413, + "grad_norm": 0.13796095550060272, + "learning_rate": 1.1059593022420067e-05, + "loss": 0.0204, + "step": 23850 + }, + { + "epoch": 28.64345738295318, + "grad_norm": 0.1939791887998581, + "learning_rate": 1.1025044716414184e-05, + "loss": 0.0219, + "step": 23860 + }, + { + "epoch": 28.65546218487395, + "grad_norm": 0.1383959949016571, + "learning_rate": 1.0990543768654082e-05, + "loss": 0.0215, + "step": 23870 + }, + { + "epoch": 28.667466986794718, + "grad_norm": 0.12495381385087967, + "learning_rate": 1.0956090221061655e-05, + "loss": 0.021, + "step": 23880 + }, + { + "epoch": 28.679471788715485, + "grad_norm": 0.14860762655735016, + "learning_rate": 1.0921684115501208e-05, + "loss": 0.0173, + "step": 23890 + }, + { + "epoch": 28.691476590636256, + "grad_norm": 0.1608656495809555, + "learning_rate": 1.0887325493779405e-05, + "loss": 0.019, + "step": 23900 + }, + { + "epoch": 28.703481392557023, + "grad_norm": 0.12299924343824387, + "learning_rate": 1.0853014397645207e-05, + "loss": 0.0186, + "step": 23910 + }, + { + "epoch": 28.71548619447779, + "grad_norm": 0.12257056683301926, + "learning_rate": 1.0818750868789829e-05, + "loss": 0.0173, + "step": 23920 + }, + { + "epoch": 28.727490996398558, + "grad_norm": 0.11613056063652039, + "learning_rate": 1.0784534948846703e-05, + "loss": 0.0184, + "step": 23930 + }, + { + "epoch": 28.73949579831933, + "grad_norm": 0.11793145537376404, + "learning_rate": 1.0750366679391393e-05, + "loss": 0.0174, + "step": 23940 + }, + { + "epoch": 28.751500600240096, + "grad_norm": 0.15315179526805878, + "learning_rate": 1.0716246101941557e-05, + "loss": 0.0196, + "step": 23950 + }, + { + "epoch": 28.763505402160863, + "grad_norm": 0.17354229092597961, + "learning_rate": 1.0682173257956934e-05, + "loss": 0.0204, + "step": 23960 + }, + { + "epoch": 28.775510204081634, + "grad_norm": 0.11484181135892868, + "learning_rate": 1.064814818883922e-05, + "loss": 0.0209, + "step": 23970 + }, + { + "epoch": 28.7875150060024, + "grad_norm": 0.11268807202577591, + "learning_rate": 1.06141709359321e-05, + "loss": 0.0194, + "step": 23980 + }, + { + "epoch": 28.79951980792317, + "grad_norm": 0.16162721812725067, + "learning_rate": 1.058024154052114e-05, + "loss": 0.0219, + "step": 23990 + }, + { + "epoch": 28.81152460984394, + "grad_norm": 0.1977904886007309, + "learning_rate": 1.0546360043833747e-05, + "loss": 0.0193, + "step": 24000 + }, + { + "epoch": 28.823529411764707, + "grad_norm": 0.10897177457809448, + "learning_rate": 1.0512526487039138e-05, + "loss": 0.02, + "step": 24010 + }, + { + "epoch": 28.835534213685474, + "grad_norm": 0.19861146807670593, + "learning_rate": 1.0478740911248258e-05, + "loss": 0.0194, + "step": 24020 + }, + { + "epoch": 28.84753901560624, + "grad_norm": 0.17273540794849396, + "learning_rate": 1.0445003357513793e-05, + "loss": 0.0231, + "step": 24030 + }, + { + "epoch": 28.859543817527012, + "grad_norm": 0.11945635080337524, + "learning_rate": 1.0411313866830042e-05, + "loss": 0.021, + "step": 24040 + }, + { + "epoch": 28.87154861944778, + "grad_norm": 0.12077473849058151, + "learning_rate": 1.0377672480132915e-05, + "loss": 0.0205, + "step": 24050 + }, + { + "epoch": 28.883553421368546, + "grad_norm": 0.18663369119167328, + "learning_rate": 1.0344079238299864e-05, + "loss": 0.0202, + "step": 24060 + }, + { + "epoch": 28.895558223289317, + "grad_norm": 0.15436522662639618, + "learning_rate": 1.0310534182149833e-05, + "loss": 0.0205, + "step": 24070 + }, + { + "epoch": 28.907563025210084, + "grad_norm": 0.1714785099029541, + "learning_rate": 1.0277037352443258e-05, + "loss": 0.0199, + "step": 24080 + }, + { + "epoch": 28.91956782713085, + "grad_norm": 0.1468949168920517, + "learning_rate": 1.024358878988193e-05, + "loss": 0.0212, + "step": 24090 + }, + { + "epoch": 28.931572629051622, + "grad_norm": 0.20846930146217346, + "learning_rate": 1.0210188535108995e-05, + "loss": 0.0216, + "step": 24100 + }, + { + "epoch": 28.94357743097239, + "grad_norm": 0.13612957298755646, + "learning_rate": 1.0176836628708935e-05, + "loss": 0.0201, + "step": 24110 + }, + { + "epoch": 28.955582232893157, + "grad_norm": 0.17105665802955627, + "learning_rate": 1.0143533111207454e-05, + "loss": 0.0193, + "step": 24120 + }, + { + "epoch": 28.967587034813924, + "grad_norm": 0.09850607067346573, + "learning_rate": 1.0110278023071446e-05, + "loss": 0.0202, + "step": 24130 + }, + { + "epoch": 28.979591836734695, + "grad_norm": 0.11277168989181519, + "learning_rate": 1.007707140470901e-05, + "loss": 0.0193, + "step": 24140 + }, + { + "epoch": 28.991596638655462, + "grad_norm": 0.10133060067892075, + "learning_rate": 1.0043913296469298e-05, + "loss": 0.0189, + "step": 24150 + }, + { + "epoch": 29.00360144057623, + "grad_norm": 0.1363169401884079, + "learning_rate": 1.001080373864255e-05, + "loss": 0.0201, + "step": 24160 + }, + { + "epoch": 29.015606242497, + "grad_norm": 0.1399025171995163, + "learning_rate": 9.977742771459991e-06, + "loss": 0.0184, + "step": 24170 + }, + { + "epoch": 29.027611044417768, + "grad_norm": 0.1256120353937149, + "learning_rate": 9.944730435093802e-06, + "loss": 0.0221, + "step": 24180 + }, + { + "epoch": 29.039615846338535, + "grad_norm": 0.09665780514478683, + "learning_rate": 9.911766769657115e-06, + "loss": 0.0159, + "step": 24190 + }, + { + "epoch": 29.051620648259302, + "grad_norm": 0.12509241700172424, + "learning_rate": 9.878851815203882e-06, + "loss": 0.0204, + "step": 24200 + }, + { + "epoch": 29.063625450180073, + "grad_norm": 0.10542166233062744, + "learning_rate": 9.845985611728864e-06, + "loss": 0.0207, + "step": 24210 + }, + { + "epoch": 29.07563025210084, + "grad_norm": 0.11653442680835724, + "learning_rate": 9.813168199167606e-06, + "loss": 0.0207, + "step": 24220 + }, + { + "epoch": 29.087635054021607, + "grad_norm": 0.12244458496570587, + "learning_rate": 9.780399617396341e-06, + "loss": 0.0213, + "step": 24230 + }, + { + "epoch": 29.099639855942378, + "grad_norm": 0.17382967472076416, + "learning_rate": 9.747679906232016e-06, + "loss": 0.0182, + "step": 24240 + }, + { + "epoch": 29.111644657863145, + "grad_norm": 0.14871448278427124, + "learning_rate": 9.715009105432138e-06, + "loss": 0.0193, + "step": 24250 + }, + { + "epoch": 29.123649459783913, + "grad_norm": 0.14277204871177673, + "learning_rate": 9.682387254694835e-06, + "loss": 0.0184, + "step": 24260 + }, + { + "epoch": 29.135654261704683, + "grad_norm": 0.10987228900194168, + "learning_rate": 9.649814393658723e-06, + "loss": 0.0178, + "step": 24270 + }, + { + "epoch": 29.14765906362545, + "grad_norm": 0.14144128561019897, + "learning_rate": 9.617290561902881e-06, + "loss": 0.019, + "step": 24280 + }, + { + "epoch": 29.159663865546218, + "grad_norm": 0.1072549968957901, + "learning_rate": 9.584815798946861e-06, + "loss": 0.0182, + "step": 24290 + }, + { + "epoch": 29.171668667466985, + "grad_norm": 0.14765198528766632, + "learning_rate": 9.55239014425055e-06, + "loss": 0.0234, + "step": 24300 + }, + { + "epoch": 29.183673469387756, + "grad_norm": 0.15673935413360596, + "learning_rate": 9.520013637214176e-06, + "loss": 0.0174, + "step": 24310 + }, + { + "epoch": 29.195678271308523, + "grad_norm": 0.13911116123199463, + "learning_rate": 9.48768631717824e-06, + "loss": 0.0201, + "step": 24320 + }, + { + "epoch": 29.20768307322929, + "grad_norm": 0.1415235698223114, + "learning_rate": 9.455408223423496e-06, + "loss": 0.019, + "step": 24330 + }, + { + "epoch": 29.21968787515006, + "grad_norm": 0.1514338254928589, + "learning_rate": 9.423179395170845e-06, + "loss": 0.0194, + "step": 24340 + }, + { + "epoch": 29.23169267707083, + "grad_norm": 0.15416865050792694, + "learning_rate": 9.390999871581391e-06, + "loss": 0.0176, + "step": 24350 + }, + { + "epoch": 29.243697478991596, + "grad_norm": 0.1261763870716095, + "learning_rate": 9.358869691756273e-06, + "loss": 0.0194, + "step": 24360 + }, + { + "epoch": 29.255702280912367, + "grad_norm": 0.10816393047571182, + "learning_rate": 9.326788894736688e-06, + "loss": 0.0198, + "step": 24370 + }, + { + "epoch": 29.267707082833134, + "grad_norm": 0.1320532113313675, + "learning_rate": 9.294757519503811e-06, + "loss": 0.0204, + "step": 24380 + }, + { + "epoch": 29.2797118847539, + "grad_norm": 0.13113060593605042, + "learning_rate": 9.262775604978819e-06, + "loss": 0.0181, + "step": 24390 + }, + { + "epoch": 29.29171668667467, + "grad_norm": 0.15414245426654816, + "learning_rate": 9.230843190022725e-06, + "loss": 0.0216, + "step": 24400 + }, + { + "epoch": 29.30372148859544, + "grad_norm": 0.1009291559457779, + "learning_rate": 9.198960313436445e-06, + "loss": 0.0166, + "step": 24410 + }, + { + "epoch": 29.315726290516206, + "grad_norm": 0.17796272039413452, + "learning_rate": 9.16712701396067e-06, + "loss": 0.0194, + "step": 24420 + }, + { + "epoch": 29.327731092436974, + "grad_norm": 0.1944301873445511, + "learning_rate": 9.135343330275864e-06, + "loss": 0.0216, + "step": 24430 + }, + { + "epoch": 29.339735894357744, + "grad_norm": 0.16785448789596558, + "learning_rate": 9.10360930100218e-06, + "loss": 0.0193, + "step": 24440 + }, + { + "epoch": 29.35174069627851, + "grad_norm": 0.13288968801498413, + "learning_rate": 9.07192496469949e-06, + "loss": 0.0187, + "step": 24450 + }, + { + "epoch": 29.36374549819928, + "grad_norm": 0.12969349324703217, + "learning_rate": 9.040290359867231e-06, + "loss": 0.0186, + "step": 24460 + }, + { + "epoch": 29.37575030012005, + "grad_norm": 0.17543071508407593, + "learning_rate": 9.008705524944438e-06, + "loss": 0.0204, + "step": 24470 + }, + { + "epoch": 29.387755102040817, + "grad_norm": 0.2040771245956421, + "learning_rate": 8.977170498309651e-06, + "loss": 0.016, + "step": 24480 + }, + { + "epoch": 29.399759903961584, + "grad_norm": 0.13787350058555603, + "learning_rate": 8.945685318280916e-06, + "loss": 0.0183, + "step": 24490 + }, + { + "epoch": 29.41176470588235, + "grad_norm": 0.10641298443078995, + "learning_rate": 8.914250023115672e-06, + "loss": 0.0195, + "step": 24500 + }, + { + "epoch": 29.423769507803122, + "grad_norm": 0.1658066362142563, + "learning_rate": 8.882864651010798e-06, + "loss": 0.0175, + "step": 24510 + }, + { + "epoch": 29.43577430972389, + "grad_norm": 0.11553323268890381, + "learning_rate": 8.851529240102462e-06, + "loss": 0.0224, + "step": 24520 + }, + { + "epoch": 29.447779111644657, + "grad_norm": 0.14457036554813385, + "learning_rate": 8.820243828466134e-06, + "loss": 0.0196, + "step": 24530 + }, + { + "epoch": 29.459783913565428, + "grad_norm": 0.14818976819515228, + "learning_rate": 8.789008454116566e-06, + "loss": 0.0211, + "step": 24540 + }, + { + "epoch": 29.471788715486195, + "grad_norm": 0.08137602359056473, + "learning_rate": 8.757823155007655e-06, + "loss": 0.0202, + "step": 24550 + }, + { + "epoch": 29.483793517406962, + "grad_norm": 0.15567627549171448, + "learning_rate": 8.72668796903251e-06, + "loss": 0.0195, + "step": 24560 + }, + { + "epoch": 29.495798319327733, + "grad_norm": 0.1126406267285347, + "learning_rate": 8.695602934023301e-06, + "loss": 0.0189, + "step": 24570 + }, + { + "epoch": 29.5078031212485, + "grad_norm": 0.14068357646465302, + "learning_rate": 8.664568087751274e-06, + "loss": 0.0177, + "step": 24580 + }, + { + "epoch": 29.519807923169267, + "grad_norm": 0.14690707623958588, + "learning_rate": 8.633583467926697e-06, + "loss": 0.021, + "step": 24590 + }, + { + "epoch": 29.531812725090035, + "grad_norm": 0.13957443833351135, + "learning_rate": 8.602649112198796e-06, + "loss": 0.0198, + "step": 24600 + }, + { + "epoch": 29.543817527010805, + "grad_norm": 0.1293996125459671, + "learning_rate": 8.571765058155745e-06, + "loss": 0.0185, + "step": 24610 + }, + { + "epoch": 29.555822328931573, + "grad_norm": 0.17345428466796875, + "learning_rate": 8.540931343324583e-06, + "loss": 0.0212, + "step": 24620 + }, + { + "epoch": 29.56782713085234, + "grad_norm": 0.124102383852005, + "learning_rate": 8.51014800517117e-06, + "loss": 0.0186, + "step": 24630 + }, + { + "epoch": 29.57983193277311, + "grad_norm": 0.11648328602313995, + "learning_rate": 8.479415081100167e-06, + "loss": 0.0207, + "step": 24640 + }, + { + "epoch": 29.591836734693878, + "grad_norm": 0.11063316464424133, + "learning_rate": 8.448732608454967e-06, + "loss": 0.0205, + "step": 24650 + }, + { + "epoch": 29.603841536614645, + "grad_norm": 0.13624629378318787, + "learning_rate": 8.418100624517688e-06, + "loss": 0.0195, + "step": 24660 + }, + { + "epoch": 29.615846338535412, + "grad_norm": 0.13353362679481506, + "learning_rate": 8.387519166509062e-06, + "loss": 0.0201, + "step": 24670 + }, + { + "epoch": 29.627851140456183, + "grad_norm": 0.15406091511249542, + "learning_rate": 8.356988271588445e-06, + "loss": 0.021, + "step": 24680 + }, + { + "epoch": 29.63985594237695, + "grad_norm": 0.11233891546726227, + "learning_rate": 8.326507976853765e-06, + "loss": 0.0187, + "step": 24690 + }, + { + "epoch": 29.651860744297718, + "grad_norm": 0.1324179470539093, + "learning_rate": 8.296078319341443e-06, + "loss": 0.0194, + "step": 24700 + }, + { + "epoch": 29.66386554621849, + "grad_norm": 0.15605786442756653, + "learning_rate": 8.265699336026383e-06, + "loss": 0.0221, + "step": 24710 + }, + { + "epoch": 29.675870348139256, + "grad_norm": 0.14126257598400116, + "learning_rate": 8.235371063821923e-06, + "loss": 0.0216, + "step": 24720 + }, + { + "epoch": 29.687875150060023, + "grad_norm": 0.11898759007453918, + "learning_rate": 8.205093539579767e-06, + "loss": 0.0209, + "step": 24730 + }, + { + "epoch": 29.699879951980794, + "grad_norm": 0.19369667768478394, + "learning_rate": 8.174866800089963e-06, + "loss": 0.021, + "step": 24740 + }, + { + "epoch": 29.71188475390156, + "grad_norm": 0.19075901806354523, + "learning_rate": 8.144690882080853e-06, + "loss": 0.0187, + "step": 24750 + }, + { + "epoch": 29.72388955582233, + "grad_norm": 0.1808328777551651, + "learning_rate": 8.114565822219006e-06, + "loss": 0.0196, + "step": 24760 + }, + { + "epoch": 29.735894357743096, + "grad_norm": 0.6989114284515381, + "learning_rate": 8.084491657109233e-06, + "loss": 0.0198, + "step": 24770 + }, + { + "epoch": 29.747899159663866, + "grad_norm": 0.12350083142518997, + "learning_rate": 8.054468423294476e-06, + "loss": 0.0176, + "step": 24780 + }, + { + "epoch": 29.759903961584634, + "grad_norm": 0.20627689361572266, + "learning_rate": 8.024496157255785e-06, + "loss": 0.0197, + "step": 24790 + }, + { + "epoch": 29.7719087635054, + "grad_norm": 0.13889089226722717, + "learning_rate": 7.994574895412293e-06, + "loss": 0.0204, + "step": 24800 + }, + { + "epoch": 29.78391356542617, + "grad_norm": 0.12232981622219086, + "learning_rate": 7.964704674121149e-06, + "loss": 0.0197, + "step": 24810 + }, + { + "epoch": 29.79591836734694, + "grad_norm": 0.21124371886253357, + "learning_rate": 7.934885529677505e-06, + "loss": 0.022, + "step": 24820 + }, + { + "epoch": 29.807923169267706, + "grad_norm": 0.12778808176517487, + "learning_rate": 7.905117498314412e-06, + "loss": 0.0195, + "step": 24830 + }, + { + "epoch": 29.819927971188477, + "grad_norm": 0.17972233891487122, + "learning_rate": 7.87540061620286e-06, + "loss": 0.0216, + "step": 24840 + }, + { + "epoch": 29.831932773109244, + "grad_norm": 0.13024699687957764, + "learning_rate": 7.845734919451647e-06, + "loss": 0.0198, + "step": 24850 + }, + { + "epoch": 29.84393757503001, + "grad_norm": 0.15809771418571472, + "learning_rate": 7.816120444107383e-06, + "loss": 0.0193, + "step": 24860 + }, + { + "epoch": 29.85594237695078, + "grad_norm": 0.16990631818771362, + "learning_rate": 7.786557226154473e-06, + "loss": 0.017, + "step": 24870 + }, + { + "epoch": 29.86794717887155, + "grad_norm": 0.14965873956680298, + "learning_rate": 7.757045301514998e-06, + "loss": 0.0228, + "step": 24880 + }, + { + "epoch": 29.879951980792317, + "grad_norm": 0.13267508149147034, + "learning_rate": 7.727584706048735e-06, + "loss": 0.0171, + "step": 24890 + }, + { + "epoch": 29.891956782713084, + "grad_norm": 0.18801560997962952, + "learning_rate": 7.698175475553077e-06, + "loss": 0.0212, + "step": 24900 + }, + { + "epoch": 29.903961584633855, + "grad_norm": 0.19377407431602478, + "learning_rate": 7.668817645763021e-06, + "loss": 0.02, + "step": 24910 + }, + { + "epoch": 29.915966386554622, + "grad_norm": 0.1398094892501831, + "learning_rate": 7.639511252351088e-06, + "loss": 0.0195, + "step": 24920 + }, + { + "epoch": 29.92797118847539, + "grad_norm": 0.15203358232975006, + "learning_rate": 7.610256330927324e-06, + "loss": 0.0227, + "step": 24930 + }, + { + "epoch": 29.939975990396157, + "grad_norm": 0.21784409880638123, + "learning_rate": 7.581052917039211e-06, + "loss": 0.0201, + "step": 24940 + }, + { + "epoch": 29.951980792316927, + "grad_norm": 0.13877440989017487, + "learning_rate": 7.551901046171645e-06, + "loss": 0.019, + "step": 24950 + }, + { + "epoch": 29.963985594237695, + "grad_norm": 0.16748753190040588, + "learning_rate": 7.522800753746895e-06, + "loss": 0.0209, + "step": 24960 + }, + { + "epoch": 29.975990396158462, + "grad_norm": 0.12670885026454926, + "learning_rate": 7.493752075124577e-06, + "loss": 0.0202, + "step": 24970 + }, + { + "epoch": 29.987995198079233, + "grad_norm": 0.12352591007947922, + "learning_rate": 7.464755045601557e-06, + "loss": 0.0202, + "step": 24980 + }, + { + "epoch": 30.0, + "grad_norm": 0.21619591116905212, + "learning_rate": 7.435809700411972e-06, + "loss": 0.0198, + "step": 24990 + }, + { + "epoch": 30.012004801920767, + "grad_norm": 0.15399694442749023, + "learning_rate": 7.40691607472715e-06, + "loss": 0.0215, + "step": 25000 + }, + { + "epoch": 30.024009603841538, + "grad_norm": 0.1199960932135582, + "learning_rate": 7.3780742036555606e-06, + "loss": 0.0176, + "step": 25010 + }, + { + "epoch": 30.036014405762305, + "grad_norm": 0.12439089268445969, + "learning_rate": 7.349284122242783e-06, + "loss": 0.0208, + "step": 25020 + }, + { + "epoch": 30.048019207683073, + "grad_norm": 0.11908674240112305, + "learning_rate": 7.320545865471512e-06, + "loss": 0.02, + "step": 25030 + }, + { + "epoch": 30.06002400960384, + "grad_norm": 0.10940992832183838, + "learning_rate": 7.291859468261425e-06, + "loss": 0.0186, + "step": 25040 + }, + { + "epoch": 30.07202881152461, + "grad_norm": 0.15589413046836853, + "learning_rate": 7.263224965469195e-06, + "loss": 0.0194, + "step": 25050 + }, + { + "epoch": 30.084033613445378, + "grad_norm": 0.11031153053045273, + "learning_rate": 7.234642391888446e-06, + "loss": 0.0191, + "step": 25060 + }, + { + "epoch": 30.096038415366145, + "grad_norm": 0.1261010617017746, + "learning_rate": 7.206111782249697e-06, + "loss": 0.0155, + "step": 25070 + }, + { + "epoch": 30.108043217286916, + "grad_norm": 0.09941606223583221, + "learning_rate": 7.1776331712203385e-06, + "loss": 0.0187, + "step": 25080 + }, + { + "epoch": 30.120048019207683, + "grad_norm": 0.12381261587142944, + "learning_rate": 7.149206593404561e-06, + "loss": 0.0171, + "step": 25090 + }, + { + "epoch": 30.13205282112845, + "grad_norm": 0.11987265944480896, + "learning_rate": 7.120832083343337e-06, + "loss": 0.0189, + "step": 25100 + }, + { + "epoch": 30.14405762304922, + "grad_norm": 0.1770896017551422, + "learning_rate": 7.092509675514369e-06, + "loss": 0.0195, + "step": 25110 + }, + { + "epoch": 30.15606242496999, + "grad_norm": 0.142696350812912, + "learning_rate": 7.064239404332063e-06, + "loss": 0.0201, + "step": 25120 + }, + { + "epoch": 30.168067226890756, + "grad_norm": 0.10881594568490982, + "learning_rate": 7.03602130414745e-06, + "loss": 0.0169, + "step": 25130 + }, + { + "epoch": 30.180072028811523, + "grad_norm": 0.1804417073726654, + "learning_rate": 7.007855409248198e-06, + "loss": 0.02, + "step": 25140 + }, + { + "epoch": 30.192076830732294, + "grad_norm": 0.15073950588703156, + "learning_rate": 6.979741753858521e-06, + "loss": 0.0178, + "step": 25150 + }, + { + "epoch": 30.20408163265306, + "grad_norm": 0.1482628434896469, + "learning_rate": 6.951680372139158e-06, + "loss": 0.018, + "step": 25160 + }, + { + "epoch": 30.21608643457383, + "grad_norm": 0.10974858701229095, + "learning_rate": 6.923671298187334e-06, + "loss": 0.0185, + "step": 25170 + }, + { + "epoch": 30.2280912364946, + "grad_norm": 0.11083883047103882, + "learning_rate": 6.895714566036704e-06, + "loss": 0.0169, + "step": 25180 + }, + { + "epoch": 30.240096038415366, + "grad_norm": 0.11346050351858139, + "learning_rate": 6.86781020965736e-06, + "loss": 0.0215, + "step": 25190 + }, + { + "epoch": 30.252100840336134, + "grad_norm": 0.12666265666484833, + "learning_rate": 6.839958262955709e-06, + "loss": 0.0187, + "step": 25200 + }, + { + "epoch": 30.264105642256904, + "grad_norm": 0.14662587642669678, + "learning_rate": 6.8121587597744894e-06, + "loss": 0.0188, + "step": 25210 + }, + { + "epoch": 30.27611044417767, + "grad_norm": 0.10752364248037338, + "learning_rate": 6.784411733892732e-06, + "loss": 0.0201, + "step": 25220 + }, + { + "epoch": 30.28811524609844, + "grad_norm": 0.15298420190811157, + "learning_rate": 6.756717219025666e-06, + "loss": 0.0212, + "step": 25230 + }, + { + "epoch": 30.300120048019206, + "grad_norm": 0.10791623592376709, + "learning_rate": 6.729075248824762e-06, + "loss": 0.0162, + "step": 25240 + }, + { + "epoch": 30.312124849939977, + "grad_norm": 0.13937567174434662, + "learning_rate": 6.701485856877615e-06, + "loss": 0.0218, + "step": 25250 + }, + { + "epoch": 30.324129651860744, + "grad_norm": 0.14098693430423737, + "learning_rate": 6.673949076707925e-06, + "loss": 0.0178, + "step": 25260 + }, + { + "epoch": 30.33613445378151, + "grad_norm": 0.11857035756111145, + "learning_rate": 6.646464941775499e-06, + "loss": 0.0203, + "step": 25270 + }, + { + "epoch": 30.348139255702282, + "grad_norm": 0.16003091633319855, + "learning_rate": 6.619033485476128e-06, + "loss": 0.0193, + "step": 25280 + }, + { + "epoch": 30.36014405762305, + "grad_norm": 0.1326630860567093, + "learning_rate": 6.59165474114164e-06, + "loss": 0.0195, + "step": 25290 + }, + { + "epoch": 30.372148859543817, + "grad_norm": 0.13425269722938538, + "learning_rate": 6.564328742039783e-06, + "loss": 0.0189, + "step": 25300 + }, + { + "epoch": 30.384153661464588, + "grad_norm": 0.20798276364803314, + "learning_rate": 6.537055521374219e-06, + "loss": 0.0191, + "step": 25310 + }, + { + "epoch": 30.396158463385355, + "grad_norm": 0.1252010017633438, + "learning_rate": 6.509835112284485e-06, + "loss": 0.0177, + "step": 25320 + }, + { + "epoch": 30.408163265306122, + "grad_norm": 0.12162921577692032, + "learning_rate": 6.482667547845944e-06, + "loss": 0.0168, + "step": 25330 + }, + { + "epoch": 30.42016806722689, + "grad_norm": 0.1621893048286438, + "learning_rate": 6.455552861069736e-06, + "loss": 0.018, + "step": 25340 + }, + { + "epoch": 30.43217286914766, + "grad_norm": 0.1525656282901764, + "learning_rate": 6.428491084902788e-06, + "loss": 0.0211, + "step": 25350 + }, + { + "epoch": 30.444177671068427, + "grad_norm": 0.1205819621682167, + "learning_rate": 6.401482252227698e-06, + "loss": 0.0183, + "step": 25360 + }, + { + "epoch": 30.456182472989195, + "grad_norm": 0.2098769098520279, + "learning_rate": 6.37452639586274e-06, + "loss": 0.0195, + "step": 25370 + }, + { + "epoch": 30.468187274909965, + "grad_norm": 0.244911327958107, + "learning_rate": 6.347623548561826e-06, + "loss": 0.0169, + "step": 25380 + }, + { + "epoch": 30.480192076830733, + "grad_norm": 0.12132042646408081, + "learning_rate": 6.320773743014441e-06, + "loss": 0.0167, + "step": 25390 + }, + { + "epoch": 30.4921968787515, + "grad_norm": 0.1744103878736496, + "learning_rate": 6.293977011845648e-06, + "loss": 0.0204, + "step": 25400 + }, + { + "epoch": 30.504201680672267, + "grad_norm": 0.15784427523612976, + "learning_rate": 6.2672333876159835e-06, + "loss": 0.019, + "step": 25410 + }, + { + "epoch": 30.516206482593038, + "grad_norm": 0.12489645928144455, + "learning_rate": 6.2405429028215e-06, + "loss": 0.017, + "step": 25420 + }, + { + "epoch": 30.528211284513805, + "grad_norm": 0.14216798543930054, + "learning_rate": 6.213905589893631e-06, + "loss": 0.0176, + "step": 25430 + }, + { + "epoch": 30.540216086434572, + "grad_norm": 0.13018181920051575, + "learning_rate": 6.187321481199221e-06, + "loss": 0.0191, + "step": 25440 + }, + { + "epoch": 30.552220888355343, + "grad_norm": 0.15112929046154022, + "learning_rate": 6.1607906090404895e-06, + "loss": 0.0215, + "step": 25450 + }, + { + "epoch": 30.56422569027611, + "grad_norm": 0.1496943235397339, + "learning_rate": 6.134313005654929e-06, + "loss": 0.0167, + "step": 25460 + }, + { + "epoch": 30.576230492196878, + "grad_norm": 0.12133301794528961, + "learning_rate": 6.107888703215337e-06, + "loss": 0.0199, + "step": 25470 + }, + { + "epoch": 30.58823529411765, + "grad_norm": 0.1294260174036026, + "learning_rate": 6.081517733829722e-06, + "loss": 0.0163, + "step": 25480 + }, + { + "epoch": 30.600240096038416, + "grad_norm": 0.18040244281291962, + "learning_rate": 6.055200129541294e-06, + "loss": 0.0187, + "step": 25490 + }, + { + "epoch": 30.612244897959183, + "grad_norm": 0.16459818184375763, + "learning_rate": 6.0289359223284434e-06, + "loss": 0.0178, + "step": 25500 + }, + { + "epoch": 30.62424969987995, + "grad_norm": 0.1303587555885315, + "learning_rate": 6.002725144104648e-06, + "loss": 0.0235, + "step": 25510 + }, + { + "epoch": 30.63625450180072, + "grad_norm": 0.18600085377693176, + "learning_rate": 5.976567826718477e-06, + "loss": 0.02, + "step": 25520 + }, + { + "epoch": 30.64825930372149, + "grad_norm": 0.17760637402534485, + "learning_rate": 5.9504640019535315e-06, + "loss": 0.0202, + "step": 25530 + }, + { + "epoch": 30.660264105642256, + "grad_norm": 0.13155382871627808, + "learning_rate": 5.924413701528414e-06, + "loss": 0.0207, + "step": 25540 + }, + { + "epoch": 30.672268907563026, + "grad_norm": 0.13184744119644165, + "learning_rate": 5.898416957096703e-06, + "loss": 0.0189, + "step": 25550 + }, + { + "epoch": 30.684273709483794, + "grad_norm": 0.12563081085681915, + "learning_rate": 5.872473800246913e-06, + "loss": 0.0185, + "step": 25560 + }, + { + "epoch": 30.69627851140456, + "grad_norm": 0.1271025389432907, + "learning_rate": 5.846584262502402e-06, + "loss": 0.0203, + "step": 25570 + }, + { + "epoch": 30.70828331332533, + "grad_norm": 0.159155935049057, + "learning_rate": 5.820748375321411e-06, + "loss": 0.0204, + "step": 25580 + }, + { + "epoch": 30.7202881152461, + "grad_norm": 0.1143256425857544, + "learning_rate": 5.794966170096977e-06, + "loss": 0.0171, + "step": 25590 + }, + { + "epoch": 30.732292917166866, + "grad_norm": 0.10695891082286835, + "learning_rate": 5.769237678156897e-06, + "loss": 0.018, + "step": 25600 + }, + { + "epoch": 30.744297719087633, + "grad_norm": 0.13480888307094574, + "learning_rate": 5.743562930763735e-06, + "loss": 0.02, + "step": 25610 + }, + { + "epoch": 30.756302521008404, + "grad_norm": 0.13040678203105927, + "learning_rate": 5.717941959114726e-06, + "loss": 0.0174, + "step": 25620 + }, + { + "epoch": 30.76830732292917, + "grad_norm": 0.19337281584739685, + "learning_rate": 5.69237479434176e-06, + "loss": 0.0199, + "step": 25630 + }, + { + "epoch": 30.78031212484994, + "grad_norm": 0.16972963511943817, + "learning_rate": 5.666861467511353e-06, + "loss": 0.0164, + "step": 25640 + }, + { + "epoch": 30.79231692677071, + "grad_norm": 0.1656317114830017, + "learning_rate": 5.641402009624591e-06, + "loss": 0.019, + "step": 25650 + }, + { + "epoch": 30.804321728691477, + "grad_norm": 0.12110815197229385, + "learning_rate": 5.615996451617145e-06, + "loss": 0.0178, + "step": 25660 + }, + { + "epoch": 30.816326530612244, + "grad_norm": 0.12171924114227295, + "learning_rate": 5.590644824359148e-06, + "loss": 0.0174, + "step": 25670 + }, + { + "epoch": 30.828331332533015, + "grad_norm": 0.09268311411142349, + "learning_rate": 5.56534715865521e-06, + "loss": 0.0164, + "step": 25680 + }, + { + "epoch": 30.840336134453782, + "grad_norm": 0.12478414922952652, + "learning_rate": 5.5401034852443965e-06, + "loss": 0.019, + "step": 25690 + }, + { + "epoch": 30.85234093637455, + "grad_norm": 0.15326730906963348, + "learning_rate": 5.514913834800134e-06, + "loss": 0.0189, + "step": 25700 + }, + { + "epoch": 30.864345738295317, + "grad_norm": 0.1470990628004074, + "learning_rate": 5.489778237930238e-06, + "loss": 0.0188, + "step": 25710 + }, + { + "epoch": 30.876350540216087, + "grad_norm": 0.1377541422843933, + "learning_rate": 5.464696725176827e-06, + "loss": 0.0204, + "step": 25720 + }, + { + "epoch": 30.888355342136855, + "grad_norm": 0.184840589761734, + "learning_rate": 5.439669327016294e-06, + "loss": 0.0178, + "step": 25730 + }, + { + "epoch": 30.900360144057622, + "grad_norm": 0.10961104929447174, + "learning_rate": 5.4146960738592985e-06, + "loss": 0.0171, + "step": 25740 + }, + { + "epoch": 30.912364945978393, + "grad_norm": 0.1440529078245163, + "learning_rate": 5.389776996050694e-06, + "loss": 0.017, + "step": 25750 + }, + { + "epoch": 30.92436974789916, + "grad_norm": 0.2221037596464157, + "learning_rate": 5.364912123869492e-06, + "loss": 0.0207, + "step": 25760 + }, + { + "epoch": 30.936374549819927, + "grad_norm": 0.18404117226600647, + "learning_rate": 5.3401014875288864e-06, + "loss": 0.0189, + "step": 25770 + }, + { + "epoch": 30.948379351740698, + "grad_norm": 0.13565605878829956, + "learning_rate": 5.315345117176129e-06, + "loss": 0.0176, + "step": 25780 + }, + { + "epoch": 30.960384153661465, + "grad_norm": 0.1935039460659027, + "learning_rate": 5.2906430428925405e-06, + "loss": 0.021, + "step": 25790 + }, + { + "epoch": 30.972388955582232, + "grad_norm": 0.12446468323469162, + "learning_rate": 5.265995294693471e-06, + "loss": 0.0188, + "step": 25800 + }, + { + "epoch": 30.984393757503, + "grad_norm": 0.11035905033349991, + "learning_rate": 5.241401902528254e-06, + "loss": 0.018, + "step": 25810 + }, + { + "epoch": 30.99639855942377, + "grad_norm": 0.15598323941230774, + "learning_rate": 5.216862896280189e-06, + "loss": 0.0207, + "step": 25820 + }, + { + "epoch": 31.008403361344538, + "grad_norm": 0.15806524455547333, + "learning_rate": 5.192378305766471e-06, + "loss": 0.0186, + "step": 25830 + }, + { + "epoch": 31.020408163265305, + "grad_norm": 0.16320617496967316, + "learning_rate": 5.1679481607382065e-06, + "loss": 0.0183, + "step": 25840 + }, + { + "epoch": 31.032412965186076, + "grad_norm": 0.12102925777435303, + "learning_rate": 5.1435724908803105e-06, + "loss": 0.0183, + "step": 25850 + }, + { + "epoch": 31.044417767106843, + "grad_norm": 0.14435209333896637, + "learning_rate": 5.119251325811514e-06, + "loss": 0.0212, + "step": 25860 + }, + { + "epoch": 31.05642256902761, + "grad_norm": 0.17728962004184723, + "learning_rate": 5.094984695084348e-06, + "loss": 0.0176, + "step": 25870 + }, + { + "epoch": 31.068427370948378, + "grad_norm": 0.1665002405643463, + "learning_rate": 5.070772628185039e-06, + "loss": 0.0186, + "step": 25880 + }, + { + "epoch": 31.08043217286915, + "grad_norm": 0.12240787595510483, + "learning_rate": 5.046615154533535e-06, + "loss": 0.0184, + "step": 25890 + }, + { + "epoch": 31.092436974789916, + "grad_norm": 0.1388007253408432, + "learning_rate": 5.022512303483451e-06, + "loss": 0.0184, + "step": 25900 + }, + { + "epoch": 31.104441776710683, + "grad_norm": 0.131166011095047, + "learning_rate": 4.998464104322015e-06, + "loss": 0.0203, + "step": 25910 + }, + { + "epoch": 31.116446578631454, + "grad_norm": 0.11990370601415634, + "learning_rate": 4.974470586270047e-06, + "loss": 0.0196, + "step": 25920 + }, + { + "epoch": 31.12845138055222, + "grad_norm": 0.20394404232501984, + "learning_rate": 4.950531778481965e-06, + "loss": 0.0179, + "step": 25930 + }, + { + "epoch": 31.140456182472988, + "grad_norm": 0.12968504428863525, + "learning_rate": 4.926647710045651e-06, + "loss": 0.0168, + "step": 25940 + }, + { + "epoch": 31.15246098439376, + "grad_norm": 0.11220231652259827, + "learning_rate": 4.9028184099825125e-06, + "loss": 0.0174, + "step": 25950 + }, + { + "epoch": 31.164465786314526, + "grad_norm": 0.13026553392410278, + "learning_rate": 4.879043907247382e-06, + "loss": 0.0188, + "step": 25960 + }, + { + "epoch": 31.176470588235293, + "grad_norm": 0.16307884454727173, + "learning_rate": 4.8553242307285414e-06, + "loss": 0.0179, + "step": 25970 + }, + { + "epoch": 31.18847539015606, + "grad_norm": 0.16425250470638275, + "learning_rate": 4.8316594092476185e-06, + "loss": 0.0181, + "step": 25980 + }, + { + "epoch": 31.20048019207683, + "grad_norm": 0.1445302814245224, + "learning_rate": 4.808049471559617e-06, + "loss": 0.0179, + "step": 25990 + }, + { + "epoch": 31.2124849939976, + "grad_norm": 0.1944800168275833, + "learning_rate": 4.784494446352833e-06, + "loss": 0.0191, + "step": 26000 + }, + { + "epoch": 31.224489795918366, + "grad_norm": 0.16115717589855194, + "learning_rate": 4.760994362248833e-06, + "loss": 0.0172, + "step": 26010 + }, + { + "epoch": 31.236494597839137, + "grad_norm": 0.10621725022792816, + "learning_rate": 4.737549247802442e-06, + "loss": 0.0156, + "step": 26020 + }, + { + "epoch": 31.248499399759904, + "grad_norm": 0.20379383862018585, + "learning_rate": 4.714159131501689e-06, + "loss": 0.0191, + "step": 26030 + }, + { + "epoch": 31.26050420168067, + "grad_norm": 0.10834300518035889, + "learning_rate": 4.690824041767766e-06, + "loss": 0.0176, + "step": 26040 + }, + { + "epoch": 31.272509003601442, + "grad_norm": 0.5878756642341614, + "learning_rate": 4.6675440069550105e-06, + "loss": 0.0188, + "step": 26050 + }, + { + "epoch": 31.28451380552221, + "grad_norm": 0.10366059839725494, + "learning_rate": 4.64431905535086e-06, + "loss": 0.0169, + "step": 26060 + }, + { + "epoch": 31.296518607442977, + "grad_norm": 0.15415158867835999, + "learning_rate": 4.621149215175808e-06, + "loss": 0.0203, + "step": 26070 + }, + { + "epoch": 31.308523409363744, + "grad_norm": 0.20967499911785126, + "learning_rate": 4.5980345145834154e-06, + "loss": 0.02, + "step": 26080 + }, + { + "epoch": 31.320528211284515, + "grad_norm": 0.13285119831562042, + "learning_rate": 4.574974981660213e-06, + "loss": 0.0185, + "step": 26090 + }, + { + "epoch": 31.332533013205282, + "grad_norm": 0.14206738770008087, + "learning_rate": 4.551970644425707e-06, + "loss": 0.0179, + "step": 26100 + }, + { + "epoch": 31.34453781512605, + "grad_norm": 0.13153153657913208, + "learning_rate": 4.529021530832328e-06, + "loss": 0.0173, + "step": 26110 + }, + { + "epoch": 31.35654261704682, + "grad_norm": 0.12980782985687256, + "learning_rate": 4.5061276687654306e-06, + "loss": 0.0192, + "step": 26120 + }, + { + "epoch": 31.368547418967587, + "grad_norm": 0.13394299149513245, + "learning_rate": 4.4832890860431965e-06, + "loss": 0.0193, + "step": 26130 + }, + { + "epoch": 31.380552220888354, + "grad_norm": 0.11264956742525101, + "learning_rate": 4.460505810416682e-06, + "loss": 0.0195, + "step": 26140 + }, + { + "epoch": 31.392557022809125, + "grad_norm": 0.27373942732810974, + "learning_rate": 4.4377778695696984e-06, + "loss": 0.0189, + "step": 26150 + }, + { + "epoch": 31.404561824729893, + "grad_norm": 0.09188127517700195, + "learning_rate": 4.415105291118843e-06, + "loss": 0.0169, + "step": 26160 + }, + { + "epoch": 31.41656662665066, + "grad_norm": 0.16078197956085205, + "learning_rate": 4.392488102613435e-06, + "loss": 0.0165, + "step": 26170 + }, + { + "epoch": 31.428571428571427, + "grad_norm": 0.10768096148967743, + "learning_rate": 4.369926331535473e-06, + "loss": 0.0186, + "step": 26180 + }, + { + "epoch": 31.440576230492198, + "grad_norm": 0.18812160193920135, + "learning_rate": 4.347420005299668e-06, + "loss": 0.0184, + "step": 26190 + }, + { + "epoch": 31.452581032412965, + "grad_norm": 0.08157604932785034, + "learning_rate": 4.324969151253317e-06, + "loss": 0.0151, + "step": 26200 + }, + { + "epoch": 31.464585834333732, + "grad_norm": 0.14993689954280853, + "learning_rate": 4.302573796676312e-06, + "loss": 0.0174, + "step": 26210 + }, + { + "epoch": 31.476590636254503, + "grad_norm": 0.11384280025959015, + "learning_rate": 4.2802339687811385e-06, + "loss": 0.0169, + "step": 26220 + }, + { + "epoch": 31.48859543817527, + "grad_norm": 0.08684442937374115, + "learning_rate": 4.257949694712771e-06, + "loss": 0.0168, + "step": 26230 + }, + { + "epoch": 31.500600240096038, + "grad_norm": 0.10112815350294113, + "learning_rate": 4.235721001548726e-06, + "loss": 0.0194, + "step": 26240 + }, + { + "epoch": 31.51260504201681, + "grad_norm": 0.10612502694129944, + "learning_rate": 4.21354791629896e-06, + "loss": 0.0176, + "step": 26250 + }, + { + "epoch": 31.524609843937576, + "grad_norm": 0.13567174971103668, + "learning_rate": 4.191430465905843e-06, + "loss": 0.018, + "step": 26260 + }, + { + "epoch": 31.536614645858343, + "grad_norm": 0.13079500198364258, + "learning_rate": 4.169368677244184e-06, + "loss": 0.0175, + "step": 26270 + }, + { + "epoch": 31.54861944777911, + "grad_norm": 0.11626488715410233, + "learning_rate": 4.147362577121122e-06, + "loss": 0.019, + "step": 26280 + }, + { + "epoch": 31.56062424969988, + "grad_norm": 0.23740357160568237, + "learning_rate": 4.125412192276157e-06, + "loss": 0.0194, + "step": 26290 + }, + { + "epoch": 31.57262905162065, + "grad_norm": 0.17691943049430847, + "learning_rate": 4.10351754938107e-06, + "loss": 0.0167, + "step": 26300 + }, + { + "epoch": 31.584633853541416, + "grad_norm": 0.10168479382991791, + "learning_rate": 4.081678675039913e-06, + "loss": 0.0166, + "step": 26310 + }, + { + "epoch": 31.596638655462186, + "grad_norm": 0.09372144192457199, + "learning_rate": 4.0598955957889876e-06, + "loss": 0.0186, + "step": 26320 + }, + { + "epoch": 31.608643457382954, + "grad_norm": 0.20000936090946198, + "learning_rate": 4.038168338096776e-06, + "loss": 0.0204, + "step": 26330 + }, + { + "epoch": 31.62064825930372, + "grad_norm": 0.1428755670785904, + "learning_rate": 4.016496928363944e-06, + "loss": 0.0185, + "step": 26340 + }, + { + "epoch": 31.632653061224488, + "grad_norm": 0.13235777616500854, + "learning_rate": 3.994881392923317e-06, + "loss": 0.0194, + "step": 26350 + }, + { + "epoch": 31.64465786314526, + "grad_norm": 0.12898778915405273, + "learning_rate": 3.9733217580397945e-06, + "loss": 0.0213, + "step": 26360 + }, + { + "epoch": 31.656662665066026, + "grad_norm": 0.12287908792495728, + "learning_rate": 3.951818049910366e-06, + "loss": 0.0187, + "step": 26370 + }, + { + "epoch": 31.668667466986793, + "grad_norm": 0.12642718851566315, + "learning_rate": 3.930370294664071e-06, + "loss": 0.0186, + "step": 26380 + }, + { + "epoch": 31.680672268907564, + "grad_norm": 0.1792992502450943, + "learning_rate": 3.908978518361938e-06, + "loss": 0.0165, + "step": 26390 + }, + { + "epoch": 31.69267707082833, + "grad_norm": 0.1395934671163559, + "learning_rate": 3.887642746997017e-06, + "loss": 0.0184, + "step": 26400 + }, + { + "epoch": 31.7046818727491, + "grad_norm": 0.12295208126306534, + "learning_rate": 3.866363006494255e-06, + "loss": 0.0226, + "step": 26410 + }, + { + "epoch": 31.71668667466987, + "grad_norm": 0.1951552927494049, + "learning_rate": 3.8451393227105725e-06, + "loss": 0.017, + "step": 26420 + }, + { + "epoch": 31.728691476590637, + "grad_norm": 0.08726563304662704, + "learning_rate": 3.823971721434727e-06, + "loss": 0.0183, + "step": 26430 + }, + { + "epoch": 31.740696278511404, + "grad_norm": 0.1342332810163498, + "learning_rate": 3.8028602283873505e-06, + "loss": 0.0164, + "step": 26440 + }, + { + "epoch": 31.75270108043217, + "grad_norm": 0.09991607069969177, + "learning_rate": 3.781804869220912e-06, + "loss": 0.0189, + "step": 26450 + }, + { + "epoch": 31.764705882352942, + "grad_norm": 0.09272430837154388, + "learning_rate": 3.760805669519646e-06, + "loss": 0.0171, + "step": 26460 + }, + { + "epoch": 31.77671068427371, + "grad_norm": 0.20625044405460358, + "learning_rate": 3.7398626547995587e-06, + "loss": 0.0185, + "step": 26470 + }, + { + "epoch": 31.788715486194477, + "grad_norm": 0.12922978401184082, + "learning_rate": 3.7189758505083995e-06, + "loss": 0.019, + "step": 26480 + }, + { + "epoch": 31.800720288115247, + "grad_norm": 0.19149243831634521, + "learning_rate": 3.6981452820255837e-06, + "loss": 0.02, + "step": 26490 + }, + { + "epoch": 31.812725090036015, + "grad_norm": 0.11010369658470154, + "learning_rate": 3.6773709746622363e-06, + "loss": 0.018, + "step": 26500 + }, + { + "epoch": 31.824729891956782, + "grad_norm": 0.116937555372715, + "learning_rate": 3.6566529536610995e-06, + "loss": 0.0177, + "step": 26510 + }, + { + "epoch": 31.836734693877553, + "grad_norm": 0.18079149723052979, + "learning_rate": 3.635991244196513e-06, + "loss": 0.0198, + "step": 26520 + }, + { + "epoch": 31.84873949579832, + "grad_norm": 0.1733437329530716, + "learning_rate": 3.615385871374405e-06, + "loss": 0.017, + "step": 26530 + }, + { + "epoch": 31.860744297719087, + "grad_norm": 0.111283078789711, + "learning_rate": 3.5948368602322367e-06, + "loss": 0.0214, + "step": 26540 + }, + { + "epoch": 31.872749099639854, + "grad_norm": 0.14094135165214539, + "learning_rate": 3.5743442357390187e-06, + "loss": 0.0201, + "step": 26550 + }, + { + "epoch": 31.884753901560625, + "grad_norm": 0.15101316571235657, + "learning_rate": 3.5539080227951937e-06, + "loss": 0.0177, + "step": 26560 + }, + { + "epoch": 31.896758703481392, + "grad_norm": 0.14024664461612701, + "learning_rate": 3.533528246232709e-06, + "loss": 0.0173, + "step": 26570 + }, + { + "epoch": 31.90876350540216, + "grad_norm": 0.1885640174150467, + "learning_rate": 3.5132049308149117e-06, + "loss": 0.0177, + "step": 26580 + }, + { + "epoch": 31.92076830732293, + "grad_norm": 0.2331913560628891, + "learning_rate": 3.4929381012365425e-06, + "loss": 0.02, + "step": 26590 + }, + { + "epoch": 31.932773109243698, + "grad_norm": 0.11406443268060684, + "learning_rate": 3.472727782123697e-06, + "loss": 0.0185, + "step": 26600 + }, + { + "epoch": 31.944777911164465, + "grad_norm": 0.14161522686481476, + "learning_rate": 3.4525739980338425e-06, + "loss": 0.0169, + "step": 26610 + }, + { + "epoch": 31.956782713085236, + "grad_norm": 0.14456471800804138, + "learning_rate": 3.432476773455712e-06, + "loss": 0.0196, + "step": 26620 + }, + { + "epoch": 31.968787515006003, + "grad_norm": 0.13935494422912598, + "learning_rate": 3.4124361328093378e-06, + "loss": 0.0181, + "step": 26630 + }, + { + "epoch": 31.98079231692677, + "grad_norm": 0.08713328093290329, + "learning_rate": 3.392452100445975e-06, + "loss": 0.0191, + "step": 26640 + }, + { + "epoch": 31.992797118847538, + "grad_norm": 0.30903124809265137, + "learning_rate": 3.3725247006481163e-06, + "loss": 0.0212, + "step": 26650 + }, + { + "epoch": 32.00480192076831, + "grad_norm": 0.16427691280841827, + "learning_rate": 3.3526539576294313e-06, + "loss": 0.0175, + "step": 26660 + }, + { + "epoch": 32.016806722689076, + "grad_norm": 0.13706345856189728, + "learning_rate": 3.332839895534745e-06, + "loss": 0.0174, + "step": 26670 + }, + { + "epoch": 32.02881152460984, + "grad_norm": 0.23733574151992798, + "learning_rate": 3.3130825384400155e-06, + "loss": 0.0176, + "step": 26680 + }, + { + "epoch": 32.04081632653061, + "grad_norm": 0.1475694477558136, + "learning_rate": 3.293381910352278e-06, + "loss": 0.0165, + "step": 26690 + }, + { + "epoch": 32.05282112845138, + "grad_norm": 0.13465484976768494, + "learning_rate": 3.273738035209678e-06, + "loss": 0.0161, + "step": 26700 + }, + { + "epoch": 32.06482593037215, + "grad_norm": 0.1145133376121521, + "learning_rate": 3.254150936881356e-06, + "loss": 0.0166, + "step": 26710 + }, + { + "epoch": 32.07683073229292, + "grad_norm": 0.39512962102890015, + "learning_rate": 3.234620639167496e-06, + "loss": 0.0176, + "step": 26720 + }, + { + "epoch": 32.088835534213686, + "grad_norm": 0.11924419552087784, + "learning_rate": 3.2151471657992482e-06, + "loss": 0.0161, + "step": 26730 + }, + { + "epoch": 32.10084033613445, + "grad_norm": 0.14395645260810852, + "learning_rate": 3.195730540438718e-06, + "loss": 0.0196, + "step": 26740 + }, + { + "epoch": 32.11284513805522, + "grad_norm": 0.24332188069820404, + "learning_rate": 3.1763707866789336e-06, + "loss": 0.0176, + "step": 26750 + }, + { + "epoch": 32.12484993997599, + "grad_norm": 0.140403151512146, + "learning_rate": 3.1570679280438165e-06, + "loss": 0.0172, + "step": 26760 + }, + { + "epoch": 32.136854741896755, + "grad_norm": 0.13930822908878326, + "learning_rate": 3.1378219879881767e-06, + "loss": 0.018, + "step": 26770 + }, + { + "epoch": 32.14885954381753, + "grad_norm": 0.1645224541425705, + "learning_rate": 3.11863298989763e-06, + "loss": 0.0185, + "step": 26780 + }, + { + "epoch": 32.1608643457383, + "grad_norm": 0.19584862887859344, + "learning_rate": 3.0995009570886303e-06, + "loss": 0.0173, + "step": 26790 + }, + { + "epoch": 32.172869147659064, + "grad_norm": 0.1859142929315567, + "learning_rate": 3.0804259128083936e-06, + "loss": 0.0187, + "step": 26800 + }, + { + "epoch": 32.18487394957983, + "grad_norm": 0.08499310910701752, + "learning_rate": 3.06140788023489e-06, + "loss": 0.018, + "step": 26810 + }, + { + "epoch": 32.1968787515006, + "grad_norm": 0.10492441058158875, + "learning_rate": 3.042446882476846e-06, + "loss": 0.017, + "step": 26820 + }, + { + "epoch": 32.208883553421366, + "grad_norm": 0.1621704250574112, + "learning_rate": 3.023542942573643e-06, + "loss": 0.0193, + "step": 26830 + }, + { + "epoch": 32.22088835534214, + "grad_norm": 0.126556396484375, + "learning_rate": 3.004696083495351e-06, + "loss": 0.0169, + "step": 26840 + }, + { + "epoch": 32.23289315726291, + "grad_norm": 0.11154916137456894, + "learning_rate": 2.98590632814269e-06, + "loss": 0.0185, + "step": 26850 + }, + { + "epoch": 32.244897959183675, + "grad_norm": 0.22268246114253998, + "learning_rate": 2.9671736993469702e-06, + "loss": 0.0188, + "step": 26860 + }, + { + "epoch": 32.25690276110444, + "grad_norm": 0.13880446553230286, + "learning_rate": 2.9484982198701216e-06, + "loss": 0.0198, + "step": 26870 + }, + { + "epoch": 32.26890756302521, + "grad_norm": 0.1584390252828598, + "learning_rate": 2.929879912404604e-06, + "loss": 0.0179, + "step": 26880 + }, + { + "epoch": 32.280912364945976, + "grad_norm": 0.1490800380706787, + "learning_rate": 2.9113187995734147e-06, + "loss": 0.02, + "step": 26890 + }, + { + "epoch": 32.292917166866744, + "grad_norm": 0.13767166435718536, + "learning_rate": 2.8928149039300524e-06, + "loss": 0.018, + "step": 26900 + }, + { + "epoch": 32.30492196878752, + "grad_norm": 0.15174472332000732, + "learning_rate": 2.8743682479584975e-06, + "loss": 0.0199, + "step": 26910 + }, + { + "epoch": 32.316926770708285, + "grad_norm": 0.09449215233325958, + "learning_rate": 2.8559788540731824e-06, + "loss": 0.0168, + "step": 26920 + }, + { + "epoch": 32.32893157262905, + "grad_norm": 0.1407683789730072, + "learning_rate": 2.8376467446189493e-06, + "loss": 0.0158, + "step": 26930 + }, + { + "epoch": 32.34093637454982, + "grad_norm": 0.13125914335250854, + "learning_rate": 2.8193719418710406e-06, + "loss": 0.0154, + "step": 26940 + }, + { + "epoch": 32.35294117647059, + "grad_norm": 0.16365598142147064, + "learning_rate": 2.8011544680350667e-06, + "loss": 0.0181, + "step": 26950 + }, + { + "epoch": 32.364945978391354, + "grad_norm": 0.10840900242328644, + "learning_rate": 2.7829943452469753e-06, + "loss": 0.0154, + "step": 26960 + }, + { + "epoch": 32.37695078031212, + "grad_norm": 0.1336267739534378, + "learning_rate": 2.7648915955730213e-06, + "loss": 0.0177, + "step": 26970 + }, + { + "epoch": 32.388955582232896, + "grad_norm": 0.16045232117176056, + "learning_rate": 2.746846241009765e-06, + "loss": 0.0162, + "step": 26980 + }, + { + "epoch": 32.40096038415366, + "grad_norm": 0.08603760600090027, + "learning_rate": 2.7288583034839945e-06, + "loss": 0.0173, + "step": 26990 + }, + { + "epoch": 32.41296518607443, + "grad_norm": 0.142605260014534, + "learning_rate": 2.7109278048527752e-06, + "loss": 0.0176, + "step": 27000 + }, + { + "epoch": 32.4249699879952, + "grad_norm": 0.12483145296573639, + "learning_rate": 2.6930547669033413e-06, + "loss": 0.0157, + "step": 27010 + }, + { + "epoch": 32.436974789915965, + "grad_norm": 0.15373776853084564, + "learning_rate": 2.675239211353109e-06, + "loss": 0.0195, + "step": 27020 + }, + { + "epoch": 32.44897959183673, + "grad_norm": 0.18854348361492157, + "learning_rate": 2.6574811598496786e-06, + "loss": 0.0205, + "step": 27030 + }, + { + "epoch": 32.460984393757506, + "grad_norm": 0.15175242722034454, + "learning_rate": 2.6397806339707456e-06, + "loss": 0.0173, + "step": 27040 + }, + { + "epoch": 32.472989195678274, + "grad_norm": 0.13470856845378876, + "learning_rate": 2.6221376552241217e-06, + "loss": 0.0187, + "step": 27050 + }, + { + "epoch": 32.48499399759904, + "grad_norm": 0.13947995007038116, + "learning_rate": 2.604552245047681e-06, + "loss": 0.0195, + "step": 27060 + }, + { + "epoch": 32.49699879951981, + "grad_norm": 0.1317974030971527, + "learning_rate": 2.587024424809359e-06, + "loss": 0.0177, + "step": 27070 + }, + { + "epoch": 32.509003601440575, + "grad_norm": 0.15560297667980194, + "learning_rate": 2.5695542158071186e-06, + "loss": 0.0185, + "step": 27080 + }, + { + "epoch": 32.52100840336134, + "grad_norm": 0.12232505530118942, + "learning_rate": 2.5521416392689068e-06, + "loss": 0.0196, + "step": 27090 + }, + { + "epoch": 32.53301320528211, + "grad_norm": 0.13473716378211975, + "learning_rate": 2.5347867163526384e-06, + "loss": 0.0195, + "step": 27100 + }, + { + "epoch": 32.545018007202884, + "grad_norm": 0.14156603813171387, + "learning_rate": 2.517489468146189e-06, + "loss": 0.0173, + "step": 27110 + }, + { + "epoch": 32.55702280912365, + "grad_norm": 0.12904831767082214, + "learning_rate": 2.500249915667341e-06, + "loss": 0.017, + "step": 27120 + }, + { + "epoch": 32.56902761104442, + "grad_norm": 0.1776820570230484, + "learning_rate": 2.4830680798637817e-06, + "loss": 0.0173, + "step": 27130 + }, + { + "epoch": 32.581032412965186, + "grad_norm": 0.11629913747310638, + "learning_rate": 2.465943981613056e-06, + "loss": 0.0176, + "step": 27140 + }, + { + "epoch": 32.59303721488595, + "grad_norm": 0.14469000697135925, + "learning_rate": 2.448877641722569e-06, + "loss": 0.0176, + "step": 27150 + }, + { + "epoch": 32.60504201680672, + "grad_norm": 0.1177934855222702, + "learning_rate": 2.431869080929522e-06, + "loss": 0.0164, + "step": 27160 + }, + { + "epoch": 32.61704681872749, + "grad_norm": 0.14740915596485138, + "learning_rate": 2.4149183199009216e-06, + "loss": 0.0173, + "step": 27170 + }, + { + "epoch": 32.62905162064826, + "grad_norm": 0.11035487055778503, + "learning_rate": 2.3980253792335427e-06, + "loss": 0.0175, + "step": 27180 + }, + { + "epoch": 32.64105642256903, + "grad_norm": 0.15819308161735535, + "learning_rate": 2.381190279453899e-06, + "loss": 0.0167, + "step": 27190 + }, + { + "epoch": 32.6530612244898, + "grad_norm": 0.23559576272964478, + "learning_rate": 2.364413041018232e-06, + "loss": 0.0194, + "step": 27200 + }, + { + "epoch": 32.665066026410564, + "grad_norm": 0.15436235070228577, + "learning_rate": 2.347693684312463e-06, + "loss": 0.0171, + "step": 27210 + }, + { + "epoch": 32.67707082833133, + "grad_norm": 0.15533028542995453, + "learning_rate": 2.331032229652186e-06, + "loss": 0.0163, + "step": 27220 + }, + { + "epoch": 32.6890756302521, + "grad_norm": 0.1790834665298462, + "learning_rate": 2.314428697282628e-06, + "loss": 0.019, + "step": 27230 + }, + { + "epoch": 32.70108043217287, + "grad_norm": 0.13931187987327576, + "learning_rate": 2.297883107378673e-06, + "loss": 0.0172, + "step": 27240 + }, + { + "epoch": 32.71308523409364, + "grad_norm": 0.13324110209941864, + "learning_rate": 2.2813954800447513e-06, + "loss": 0.0184, + "step": 27250 + }, + { + "epoch": 32.72509003601441, + "grad_norm": 0.17463816702365875, + "learning_rate": 2.2649658353148974e-06, + "loss": 0.0185, + "step": 27260 + }, + { + "epoch": 32.737094837935174, + "grad_norm": 0.15693993866443634, + "learning_rate": 2.2485941931526645e-06, + "loss": 0.017, + "step": 27270 + }, + { + "epoch": 32.74909963985594, + "grad_norm": 0.12308455258607864, + "learning_rate": 2.232280573451151e-06, + "loss": 0.0156, + "step": 27280 + }, + { + "epoch": 32.76110444177671, + "grad_norm": 0.1407734453678131, + "learning_rate": 2.216024996032945e-06, + "loss": 0.0178, + "step": 27290 + }, + { + "epoch": 32.773109243697476, + "grad_norm": 0.11400190740823746, + "learning_rate": 2.1998274806501074e-06, + "loss": 0.0166, + "step": 27300 + }, + { + "epoch": 32.78511404561825, + "grad_norm": 0.11955293267965317, + "learning_rate": 2.183688046984139e-06, + "loss": 0.0184, + "step": 27310 + }, + { + "epoch": 32.79711884753902, + "grad_norm": 0.1492682546377182, + "learning_rate": 2.167606714645981e-06, + "loss": 0.0215, + "step": 27320 + }, + { + "epoch": 32.809123649459785, + "grad_norm": 0.11040783673524857, + "learning_rate": 2.151583503175958e-06, + "loss": 0.0173, + "step": 27330 + }, + { + "epoch": 32.82112845138055, + "grad_norm": 0.16820047795772552, + "learning_rate": 2.1356184320437957e-06, + "loss": 0.0187, + "step": 27340 + }, + { + "epoch": 32.83313325330132, + "grad_norm": 0.14202429354190826, + "learning_rate": 2.119711520648554e-06, + "loss": 0.0186, + "step": 27350 + }, + { + "epoch": 32.84513805522209, + "grad_norm": 0.16774246096611023, + "learning_rate": 2.103862788318628e-06, + "loss": 0.0194, + "step": 27360 + }, + { + "epoch": 32.857142857142854, + "grad_norm": 0.16495777666568756, + "learning_rate": 2.088072254311729e-06, + "loss": 0.017, + "step": 27370 + }, + { + "epoch": 32.86914765906363, + "grad_norm": 0.20977935194969177, + "learning_rate": 2.0723399378148435e-06, + "loss": 0.0163, + "step": 27380 + }, + { + "epoch": 32.881152460984396, + "grad_norm": 0.1219455674290657, + "learning_rate": 2.0566658579442065e-06, + "loss": 0.0183, + "step": 27390 + }, + { + "epoch": 32.89315726290516, + "grad_norm": 0.12607364356517792, + "learning_rate": 2.0410500337453176e-06, + "loss": 0.0149, + "step": 27400 + }, + { + "epoch": 32.90516206482593, + "grad_norm": 0.13252420723438263, + "learning_rate": 2.0254924841928645e-06, + "loss": 0.0192, + "step": 27410 + }, + { + "epoch": 32.9171668667467, + "grad_norm": 0.15026678144931793, + "learning_rate": 2.009993228190754e-06, + "loss": 0.0167, + "step": 27420 + }, + { + "epoch": 32.929171668667465, + "grad_norm": 0.19522148370742798, + "learning_rate": 1.9945522845720323e-06, + "loss": 0.018, + "step": 27430 + }, + { + "epoch": 32.94117647058823, + "grad_norm": 0.14974676072597504, + "learning_rate": 1.9791696720988963e-06, + "loss": 0.0181, + "step": 27440 + }, + { + "epoch": 32.953181272509006, + "grad_norm": 0.15345865488052368, + "learning_rate": 1.9638454094626833e-06, + "loss": 0.0163, + "step": 27450 + }, + { + "epoch": 32.965186074429774, + "grad_norm": 0.10885081440210342, + "learning_rate": 1.9485795152838104e-06, + "loss": 0.0174, + "step": 27460 + }, + { + "epoch": 32.97719087635054, + "grad_norm": 0.12758468091487885, + "learning_rate": 1.933372008111778e-06, + "loss": 0.0185, + "step": 27470 + }, + { + "epoch": 32.98919567827131, + "grad_norm": 0.12225650250911713, + "learning_rate": 1.918222906425143e-06, + "loss": 0.0151, + "step": 27480 + }, + { + "epoch": 33.001200480192075, + "grad_norm": 0.18809345364570618, + "learning_rate": 1.903132228631488e-06, + "loss": 0.018, + "step": 27490 + }, + { + "epoch": 33.01320528211284, + "grad_norm": 0.18281248211860657, + "learning_rate": 1.8880999930674215e-06, + "loss": 0.0193, + "step": 27500 + }, + { + "epoch": 33.02521008403362, + "grad_norm": 0.11868562549352646, + "learning_rate": 1.8731262179985166e-06, + "loss": 0.0195, + "step": 27510 + }, + { + "epoch": 33.037214885954384, + "grad_norm": 0.1520184874534607, + "learning_rate": 1.8582109216193244e-06, + "loss": 0.0155, + "step": 27520 + }, + { + "epoch": 33.04921968787515, + "grad_norm": 0.10554101318120956, + "learning_rate": 1.8433541220533368e-06, + "loss": 0.0182, + "step": 27530 + }, + { + "epoch": 33.06122448979592, + "grad_norm": 0.1313094049692154, + "learning_rate": 1.8285558373529577e-06, + "loss": 0.0159, + "step": 27540 + }, + { + "epoch": 33.073229291716686, + "grad_norm": 0.13635049760341644, + "learning_rate": 1.8138160854995145e-06, + "loss": 0.0194, + "step": 27550 + }, + { + "epoch": 33.08523409363745, + "grad_norm": 0.1388721466064453, + "learning_rate": 1.7991348844031864e-06, + "loss": 0.0172, + "step": 27560 + }, + { + "epoch": 33.09723889555822, + "grad_norm": 0.10829999297857285, + "learning_rate": 1.78451225190302e-06, + "loss": 0.0188, + "step": 27570 + }, + { + "epoch": 33.109243697478995, + "grad_norm": 0.13645993173122406, + "learning_rate": 1.7699482057668914e-06, + "loss": 0.0173, + "step": 27580 + }, + { + "epoch": 33.12124849939976, + "grad_norm": 0.13191618025302887, + "learning_rate": 1.7554427636914895e-06, + "loss": 0.0159, + "step": 27590 + }, + { + "epoch": 33.13325330132053, + "grad_norm": 0.1862807422876358, + "learning_rate": 1.7409959433022871e-06, + "loss": 0.0169, + "step": 27600 + }, + { + "epoch": 33.1452581032413, + "grad_norm": 0.1412965953350067, + "learning_rate": 1.7266077621535481e-06, + "loss": 0.0178, + "step": 27610 + }, + { + "epoch": 33.157262905162064, + "grad_norm": 0.11301453411579132, + "learning_rate": 1.7122782377282597e-06, + "loss": 0.0168, + "step": 27620 + }, + { + "epoch": 33.16926770708283, + "grad_norm": 0.13718311488628387, + "learning_rate": 1.6980073874381497e-06, + "loss": 0.0183, + "step": 27630 + }, + { + "epoch": 33.1812725090036, + "grad_norm": 0.13113008439540863, + "learning_rate": 1.6837952286236413e-06, + "loss": 0.0199, + "step": 27640 + }, + { + "epoch": 33.19327731092437, + "grad_norm": 0.10493451356887817, + "learning_rate": 1.6696417785538488e-06, + "loss": 0.0175, + "step": 27650 + }, + { + "epoch": 33.20528211284514, + "grad_norm": 0.09792274981737137, + "learning_rate": 1.6555470544265538e-06, + "loss": 0.0175, + "step": 27660 + }, + { + "epoch": 33.21728691476591, + "grad_norm": 0.13710401952266693, + "learning_rate": 1.6415110733681738e-06, + "loss": 0.0172, + "step": 27670 + }, + { + "epoch": 33.229291716686674, + "grad_norm": 0.12097518146038055, + "learning_rate": 1.6275338524337436e-06, + "loss": 0.0199, + "step": 27680 + }, + { + "epoch": 33.24129651860744, + "grad_norm": 0.10726942867040634, + "learning_rate": 1.6136154086069056e-06, + "loss": 0.0163, + "step": 27690 + }, + { + "epoch": 33.25330132052821, + "grad_norm": 0.34843647480010986, + "learning_rate": 1.5997557587998868e-06, + "loss": 0.0195, + "step": 27700 + }, + { + "epoch": 33.265306122448976, + "grad_norm": 0.1835946887731552, + "learning_rate": 1.5859549198534606e-06, + "loss": 0.0185, + "step": 27710 + }, + { + "epoch": 33.27731092436975, + "grad_norm": 0.12322438508272171, + "learning_rate": 1.572212908536963e-06, + "loss": 0.0164, + "step": 27720 + }, + { + "epoch": 33.28931572629052, + "grad_norm": 0.1449376344680786, + "learning_rate": 1.5585297415482202e-06, + "loss": 0.017, + "step": 27730 + }, + { + "epoch": 33.301320528211285, + "grad_norm": 0.13018150627613068, + "learning_rate": 1.5449054355135717e-06, + "loss": 0.0189, + "step": 27740 + }, + { + "epoch": 33.31332533013205, + "grad_norm": 0.11550850421190262, + "learning_rate": 1.5313400069878415e-06, + "loss": 0.0184, + "step": 27750 + }, + { + "epoch": 33.32533013205282, + "grad_norm": 0.09491439163684845, + "learning_rate": 1.5178334724542887e-06, + "loss": 0.0181, + "step": 27760 + }, + { + "epoch": 33.33733493397359, + "grad_norm": 0.1730353683233261, + "learning_rate": 1.504385848324641e-06, + "loss": 0.0194, + "step": 27770 + }, + { + "epoch": 33.34933973589436, + "grad_norm": 0.15760737657546997, + "learning_rate": 1.490997150939033e-06, + "loss": 0.0164, + "step": 27780 + }, + { + "epoch": 33.36134453781513, + "grad_norm": 0.1634172946214676, + "learning_rate": 1.4776673965659793e-06, + "loss": 0.017, + "step": 27790 + }, + { + "epoch": 33.373349339735896, + "grad_norm": 0.15807469189167023, + "learning_rate": 1.4643966014023957e-06, + "loss": 0.0168, + "step": 27800 + }, + { + "epoch": 33.38535414165666, + "grad_norm": 0.11513369530439377, + "learning_rate": 1.4511847815735503e-06, + "loss": 0.0196, + "step": 27810 + }, + { + "epoch": 33.39735894357743, + "grad_norm": 0.13244424760341644, + "learning_rate": 1.438031953133051e-06, + "loss": 0.0162, + "step": 27820 + }, + { + "epoch": 33.4093637454982, + "grad_norm": 0.1517142653465271, + "learning_rate": 1.4249381320628197e-06, + "loss": 0.016, + "step": 27830 + }, + { + "epoch": 33.421368547418965, + "grad_norm": 0.17954257130622864, + "learning_rate": 1.4119033342730902e-06, + "loss": 0.019, + "step": 27840 + }, + { + "epoch": 33.43337334933974, + "grad_norm": 0.13092704117298126, + "learning_rate": 1.3989275756023656e-06, + "loss": 0.0166, + "step": 27850 + }, + { + "epoch": 33.445378151260506, + "grad_norm": 0.09787193685770035, + "learning_rate": 1.386010871817417e-06, + "loss": 0.0169, + "step": 27860 + }, + { + "epoch": 33.45738295318127, + "grad_norm": 0.13165122270584106, + "learning_rate": 1.3731532386132616e-06, + "loss": 0.0178, + "step": 27870 + }, + { + "epoch": 33.46938775510204, + "grad_norm": 0.11144757270812988, + "learning_rate": 1.360354691613136e-06, + "loss": 0.0156, + "step": 27880 + }, + { + "epoch": 33.48139255702281, + "grad_norm": 0.20582233369350433, + "learning_rate": 1.3476152463684776e-06, + "loss": 0.018, + "step": 27890 + }, + { + "epoch": 33.493397358943575, + "grad_norm": 0.25068753957748413, + "learning_rate": 1.3349349183589155e-06, + "loss": 0.0187, + "step": 27900 + }, + { + "epoch": 33.50540216086434, + "grad_norm": 0.16686247289180756, + "learning_rate": 1.3223137229922356e-06, + "loss": 0.0178, + "step": 27910 + }, + { + "epoch": 33.51740696278512, + "grad_norm": 0.14804691076278687, + "learning_rate": 1.3097516756043981e-06, + "loss": 0.0201, + "step": 27920 + }, + { + "epoch": 33.529411764705884, + "grad_norm": 0.1490117460489273, + "learning_rate": 1.29724879145946e-06, + "loss": 0.0163, + "step": 27930 + }, + { + "epoch": 33.54141656662665, + "grad_norm": 0.11172396689653397, + "learning_rate": 1.284805085749613e-06, + "loss": 0.0189, + "step": 27940 + }, + { + "epoch": 33.55342136854742, + "grad_norm": 0.14546017348766327, + "learning_rate": 1.2724205735951288e-06, + "loss": 0.0208, + "step": 27950 + }, + { + "epoch": 33.565426170468186, + "grad_norm": 0.18706950545310974, + "learning_rate": 1.2600952700443591e-06, + "loss": 0.0177, + "step": 27960 + }, + { + "epoch": 33.57743097238895, + "grad_norm": 0.1171332374215126, + "learning_rate": 1.247829190073707e-06, + "loss": 0.0148, + "step": 27970 + }, + { + "epoch": 33.58943577430973, + "grad_norm": 0.08883073180913925, + "learning_rate": 1.2356223485876173e-06, + "loss": 0.0169, + "step": 27980 + }, + { + "epoch": 33.601440576230495, + "grad_norm": 0.1247197762131691, + "learning_rate": 1.2234747604185526e-06, + "loss": 0.0158, + "step": 27990 + }, + { + "epoch": 33.61344537815126, + "grad_norm": 0.10286708176136017, + "learning_rate": 1.2113864403269836e-06, + "loss": 0.0182, + "step": 28000 + }, + { + "epoch": 33.62545018007203, + "grad_norm": 0.2448728084564209, + "learning_rate": 1.1993574030013555e-06, + "loss": 0.0164, + "step": 28010 + }, + { + "epoch": 33.637454981992796, + "grad_norm": 0.17016610503196716, + "learning_rate": 1.1873876630580816e-06, + "loss": 0.019, + "step": 28020 + }, + { + "epoch": 33.649459783913564, + "grad_norm": 0.18367940187454224, + "learning_rate": 1.1754772350415278e-06, + "loss": 0.0192, + "step": 28030 + }, + { + "epoch": 33.66146458583433, + "grad_norm": 0.13581907749176025, + "learning_rate": 1.163626133423984e-06, + "loss": 0.0171, + "step": 28040 + }, + { + "epoch": 33.673469387755105, + "grad_norm": 0.1342429220676422, + "learning_rate": 1.1518343726056591e-06, + "loss": 0.0185, + "step": 28050 + }, + { + "epoch": 33.68547418967587, + "grad_norm": 0.2210329920053482, + "learning_rate": 1.1401019669146474e-06, + "loss": 0.0159, + "step": 28060 + }, + { + "epoch": 33.69747899159664, + "grad_norm": 0.19868385791778564, + "learning_rate": 1.128428930606934e-06, + "loss": 0.0198, + "step": 28070 + }, + { + "epoch": 33.70948379351741, + "grad_norm": 0.13609790802001953, + "learning_rate": 1.1168152778663621e-06, + "loss": 0.0193, + "step": 28080 + }, + { + "epoch": 33.721488595438174, + "grad_norm": 0.10851321369409561, + "learning_rate": 1.10526102280461e-06, + "loss": 0.019, + "step": 28090 + }, + { + "epoch": 33.73349339735894, + "grad_norm": 0.12134771049022675, + "learning_rate": 1.0937661794611864e-06, + "loss": 0.0164, + "step": 28100 + }, + { + "epoch": 33.74549819927971, + "grad_norm": 0.10801027715206146, + "learning_rate": 1.082330761803413e-06, + "loss": 0.0167, + "step": 28110 + }, + { + "epoch": 33.75750300120048, + "grad_norm": 0.10802151262760162, + "learning_rate": 1.0709547837263966e-06, + "loss": 0.0172, + "step": 28120 + }, + { + "epoch": 33.76950780312125, + "grad_norm": 0.169146329164505, + "learning_rate": 1.059638259053025e-06, + "loss": 0.0189, + "step": 28130 + }, + { + "epoch": 33.78151260504202, + "grad_norm": 0.1413995772600174, + "learning_rate": 1.0483812015339434e-06, + "loss": 0.018, + "step": 28140 + }, + { + "epoch": 33.793517406962785, + "grad_norm": 0.16008447110652924, + "learning_rate": 1.037183624847543e-06, + "loss": 0.0185, + "step": 28150 + }, + { + "epoch": 33.80552220888355, + "grad_norm": 0.16158892214298248, + "learning_rate": 1.0260455425999238e-06, + "loss": 0.0182, + "step": 28160 + }, + { + "epoch": 33.81752701080432, + "grad_norm": 0.1534670889377594, + "learning_rate": 1.0149669683249208e-06, + "loss": 0.015, + "step": 28170 + }, + { + "epoch": 33.82953181272509, + "grad_norm": 0.1604253500699997, + "learning_rate": 1.003947915484027e-06, + "loss": 0.0185, + "step": 28180 + }, + { + "epoch": 33.84153661464586, + "grad_norm": 0.14298386871814728, + "learning_rate": 9.92988397466449e-07, + "loss": 0.019, + "step": 28190 + }, + { + "epoch": 33.85354141656663, + "grad_norm": 0.18651705980300903, + "learning_rate": 9.820884275890286e-07, + "loss": 0.0183, + "step": 28200 + }, + { + "epoch": 33.865546218487395, + "grad_norm": 0.10586712509393692, + "learning_rate": 9.712480190962546e-07, + "loss": 0.0173, + "step": 28210 + }, + { + "epoch": 33.87755102040816, + "grad_norm": 0.1703958362340927, + "learning_rate": 9.604671851602464e-07, + "loss": 0.0146, + "step": 28220 + }, + { + "epoch": 33.88955582232893, + "grad_norm": 0.1282270848751068, + "learning_rate": 9.497459388807306e-07, + "loss": 0.0152, + "step": 28230 + }, + { + "epoch": 33.9015606242497, + "grad_norm": 0.1261816918849945, + "learning_rate": 9.390842932850364e-07, + "loss": 0.0169, + "step": 28240 + }, + { + "epoch": 33.91356542617047, + "grad_norm": 0.13177306950092316, + "learning_rate": 9.28482261328073e-07, + "loss": 0.0172, + "step": 28250 + }, + { + "epoch": 33.92557022809124, + "grad_norm": 0.17305058240890503, + "learning_rate": 9.179398558923025e-07, + "loss": 0.0173, + "step": 28260 + }, + { + "epoch": 33.937575030012006, + "grad_norm": 0.09119313210248947, + "learning_rate": 9.074570897877388e-07, + "loss": 0.019, + "step": 28270 + }, + { + "epoch": 33.94957983193277, + "grad_norm": 0.1546953320503235, + "learning_rate": 8.970339757519375e-07, + "loss": 0.0191, + "step": 28280 + }, + { + "epoch": 33.96158463385354, + "grad_norm": 0.1518130749464035, + "learning_rate": 8.866705264499619e-07, + "loss": 0.0199, + "step": 28290 + }, + { + "epoch": 33.97358943577431, + "grad_norm": 0.10661707818508148, + "learning_rate": 8.763667544743836e-07, + "loss": 0.016, + "step": 28300 + }, + { + "epoch": 33.985594237695075, + "grad_norm": 0.23024709522724152, + "learning_rate": 8.661226723452543e-07, + "loss": 0.018, + "step": 28310 + }, + { + "epoch": 33.99759903961585, + "grad_norm": 0.13956500589847565, + "learning_rate": 8.559382925101001e-07, + "loss": 0.0165, + "step": 28320 + }, + { + "epoch": 34.00960384153662, + "grad_norm": 0.10925451666116714, + "learning_rate": 8.458136273438943e-07, + "loss": 0.0155, + "step": 28330 + }, + { + "epoch": 34.021608643457384, + "grad_norm": 0.11545053869485855, + "learning_rate": 8.357486891490795e-07, + "loss": 0.0161, + "step": 28340 + }, + { + "epoch": 34.03361344537815, + "grad_norm": 0.1675199568271637, + "learning_rate": 8.257434901554895e-07, + "loss": 0.0173, + "step": 28350 + }, + { + "epoch": 34.04561824729892, + "grad_norm": 0.12686236202716827, + "learning_rate": 8.157980425203937e-07, + "loss": 0.019, + "step": 28360 + }, + { + "epoch": 34.057623049219686, + "grad_norm": 0.1506708711385727, + "learning_rate": 8.059123583284367e-07, + "loss": 0.0152, + "step": 28370 + }, + { + "epoch": 34.06962785114045, + "grad_norm": 0.13418887555599213, + "learning_rate": 7.960864495916654e-07, + "loss": 0.0169, + "step": 28380 + }, + { + "epoch": 34.08163265306123, + "grad_norm": 0.18010973930358887, + "learning_rate": 7.863203282494847e-07, + "loss": 0.0166, + "step": 28390 + }, + { + "epoch": 34.093637454981994, + "grad_norm": 0.14312344789505005, + "learning_rate": 7.766140061686522e-07, + "loss": 0.0168, + "step": 28400 + }, + { + "epoch": 34.10564225690276, + "grad_norm": 0.19526341557502747, + "learning_rate": 7.669674951432615e-07, + "loss": 0.0198, + "step": 28410 + }, + { + "epoch": 34.11764705882353, + "grad_norm": 0.10972777009010315, + "learning_rate": 7.573808068947363e-07, + "loss": 0.0174, + "step": 28420 + }, + { + "epoch": 34.129651860744296, + "grad_norm": 0.0878688395023346, + "learning_rate": 7.478539530718087e-07, + "loss": 0.0173, + "step": 28430 + }, + { + "epoch": 34.14165666266506, + "grad_norm": 0.21302059292793274, + "learning_rate": 7.383869452504965e-07, + "loss": 0.0171, + "step": 28440 + }, + { + "epoch": 34.15366146458584, + "grad_norm": 0.22488252818584442, + "learning_rate": 7.289797949341204e-07, + "loss": 0.0173, + "step": 28450 + }, + { + "epoch": 34.165666266506605, + "grad_norm": 0.17183168232440948, + "learning_rate": 7.196325135532423e-07, + "loss": 0.0157, + "step": 28460 + }, + { + "epoch": 34.17767106842737, + "grad_norm": 0.1386650651693344, + "learning_rate": 7.10345112465699e-07, + "loss": 0.017, + "step": 28470 + }, + { + "epoch": 34.18967587034814, + "grad_norm": 0.1647529900074005, + "learning_rate": 7.011176029565525e-07, + "loss": 0.017, + "step": 28480 + }, + { + "epoch": 34.20168067226891, + "grad_norm": 0.12021121382713318, + "learning_rate": 6.919499962381004e-07, + "loss": 0.0159, + "step": 28490 + }, + { + "epoch": 34.213685474189674, + "grad_norm": 0.1927194446325302, + "learning_rate": 6.828423034498488e-07, + "loss": 0.0187, + "step": 28500 + }, + { + "epoch": 34.22569027611044, + "grad_norm": 0.10737032443284988, + "learning_rate": 6.737945356585007e-07, + "loss": 0.0167, + "step": 28510 + }, + { + "epoch": 34.237695078031216, + "grad_norm": 0.12984801828861237, + "learning_rate": 6.648067038579508e-07, + "loss": 0.0174, + "step": 28520 + }, + { + "epoch": 34.24969987995198, + "grad_norm": 0.14774048328399658, + "learning_rate": 6.558788189692578e-07, + "loss": 0.0162, + "step": 28530 + }, + { + "epoch": 34.26170468187275, + "grad_norm": 0.13456623256206512, + "learning_rate": 6.470108918406492e-07, + "loss": 0.0195, + "step": 28540 + }, + { + "epoch": 34.27370948379352, + "grad_norm": 0.11558368802070618, + "learning_rate": 6.382029332474893e-07, + "loss": 0.0142, + "step": 28550 + }, + { + "epoch": 34.285714285714285, + "grad_norm": 0.17792434990406036, + "learning_rate": 6.294549538922778e-07, + "loss": 0.016, + "step": 28560 + }, + { + "epoch": 34.29771908763505, + "grad_norm": 0.14221855998039246, + "learning_rate": 6.207669644046344e-07, + "loss": 0.0182, + "step": 28570 + }, + { + "epoch": 34.30972388955582, + "grad_norm": 0.1829891949892044, + "learning_rate": 6.121389753412865e-07, + "loss": 0.0172, + "step": 28580 + }, + { + "epoch": 34.321728691476594, + "grad_norm": 0.16149021685123444, + "learning_rate": 6.035709971860592e-07, + "loss": 0.0194, + "step": 28590 + }, + { + "epoch": 34.33373349339736, + "grad_norm": 0.104120634496212, + "learning_rate": 5.950630403498469e-07, + "loss": 0.0175, + "step": 28600 + }, + { + "epoch": 34.34573829531813, + "grad_norm": 0.16425937414169312, + "learning_rate": 5.866151151706189e-07, + "loss": 0.0166, + "step": 28610 + }, + { + "epoch": 34.357743097238895, + "grad_norm": 0.1300278604030609, + "learning_rate": 5.782272319134086e-07, + "loss": 0.0155, + "step": 28620 + }, + { + "epoch": 34.36974789915966, + "grad_norm": 0.10859335213899612, + "learning_rate": 5.698994007702796e-07, + "loss": 0.0168, + "step": 28630 + }, + { + "epoch": 34.38175270108043, + "grad_norm": 0.14533455669879913, + "learning_rate": 5.616316318603321e-07, + "loss": 0.0191, + "step": 28640 + }, + { + "epoch": 34.3937575030012, + "grad_norm": 0.16390067338943481, + "learning_rate": 5.534239352296799e-07, + "loss": 0.0178, + "step": 28650 + }, + { + "epoch": 34.40576230492197, + "grad_norm": 0.17487064003944397, + "learning_rate": 5.452763208514621e-07, + "loss": 0.0185, + "step": 28660 + }, + { + "epoch": 34.41776710684274, + "grad_norm": 0.15616562962532043, + "learning_rate": 5.371887986257873e-07, + "loss": 0.0183, + "step": 28670 + }, + { + "epoch": 34.429771908763506, + "grad_norm": 0.13650959730148315, + "learning_rate": 5.291613783797611e-07, + "loss": 0.0161, + "step": 28680 + }, + { + "epoch": 34.44177671068427, + "grad_norm": 0.14193479716777802, + "learning_rate": 5.211940698674534e-07, + "loss": 0.0161, + "step": 28690 + }, + { + "epoch": 34.45378151260504, + "grad_norm": 0.15755866467952728, + "learning_rate": 5.132868827698978e-07, + "loss": 0.0182, + "step": 28700 + }, + { + "epoch": 34.46578631452581, + "grad_norm": 0.26033398509025574, + "learning_rate": 5.054398266950755e-07, + "loss": 0.0184, + "step": 28710 + }, + { + "epoch": 34.47779111644658, + "grad_norm": 0.15898583829402924, + "learning_rate": 4.976529111778872e-07, + "loss": 0.0166, + "step": 28720 + }, + { + "epoch": 34.48979591836735, + "grad_norm": 0.11695744842290878, + "learning_rate": 4.899261456801862e-07, + "loss": 0.0162, + "step": 28730 + }, + { + "epoch": 34.50180072028812, + "grad_norm": 0.09466444700956345, + "learning_rate": 4.822595395907126e-07, + "loss": 0.015, + "step": 28740 + }, + { + "epoch": 34.513805522208884, + "grad_norm": 0.10087089240550995, + "learning_rate": 4.7465310222510886e-07, + "loss": 0.0159, + "step": 28750 + }, + { + "epoch": 34.52581032412965, + "grad_norm": 0.16671843826770782, + "learning_rate": 4.6710684282593175e-07, + "loss": 0.0176, + "step": 28760 + }, + { + "epoch": 34.53781512605042, + "grad_norm": 0.14230753481388092, + "learning_rate": 4.5962077056257993e-07, + "loss": 0.0171, + "step": 28770 + }, + { + "epoch": 34.549819927971186, + "grad_norm": 0.1633145809173584, + "learning_rate": 4.521948945313492e-07, + "loss": 0.0183, + "step": 28780 + }, + { + "epoch": 34.56182472989196, + "grad_norm": 0.10576315224170685, + "learning_rate": 4.4482922375537195e-07, + "loss": 0.0167, + "step": 28790 + }, + { + "epoch": 34.57382953181273, + "grad_norm": 0.12904444336891174, + "learning_rate": 4.375237671846333e-07, + "loss": 0.0205, + "step": 28800 + }, + { + "epoch": 34.585834333733494, + "grad_norm": 0.17749829590320587, + "learning_rate": 4.302785336959547e-07, + "loss": 0.0197, + "step": 28810 + }, + { + "epoch": 34.59783913565426, + "grad_norm": 0.21655932068824768, + "learning_rate": 4.2309353209297744e-07, + "loss": 0.0164, + "step": 28820 + }, + { + "epoch": 34.60984393757503, + "grad_norm": 0.1990879774093628, + "learning_rate": 4.159687711061566e-07, + "loss": 0.0173, + "step": 28830 + }, + { + "epoch": 34.621848739495796, + "grad_norm": 0.11417590826749802, + "learning_rate": 4.0890425939275055e-07, + "loss": 0.0168, + "step": 28840 + }, + { + "epoch": 34.63385354141656, + "grad_norm": 0.12365961819887161, + "learning_rate": 4.0190000553679827e-07, + "loss": 0.0185, + "step": 28850 + }, + { + "epoch": 34.64585834333734, + "grad_norm": 0.19679924845695496, + "learning_rate": 3.9495601804913627e-07, + "loss": 0.0174, + "step": 28860 + }, + { + "epoch": 34.657863145258105, + "grad_norm": 0.10893429070711136, + "learning_rate": 3.880723053673652e-07, + "loss": 0.0182, + "step": 28870 + }, + { + "epoch": 34.66986794717887, + "grad_norm": 0.18417173624038696, + "learning_rate": 3.812488758558386e-07, + "loss": 0.0198, + "step": 28880 + }, + { + "epoch": 34.68187274909964, + "grad_norm": 0.12079295516014099, + "learning_rate": 3.744857378056743e-07, + "loss": 0.0163, + "step": 28890 + }, + { + "epoch": 34.69387755102041, + "grad_norm": 0.164851114153862, + "learning_rate": 3.677828994347154e-07, + "loss": 0.0177, + "step": 28900 + }, + { + "epoch": 34.705882352941174, + "grad_norm": 0.1389251947402954, + "learning_rate": 3.61140368887547e-07, + "loss": 0.0188, + "step": 28910 + }, + { + "epoch": 34.71788715486194, + "grad_norm": 0.10565647482872009, + "learning_rate": 3.545581542354681e-07, + "loss": 0.0155, + "step": 28920 + }, + { + "epoch": 34.729891956782716, + "grad_norm": 0.12870840728282928, + "learning_rate": 3.480362634764922e-07, + "loss": 0.0173, + "step": 28930 + }, + { + "epoch": 34.74189675870348, + "grad_norm": 0.11793562769889832, + "learning_rate": 3.4157470453533014e-07, + "loss": 0.0165, + "step": 28940 + }, + { + "epoch": 34.75390156062425, + "grad_norm": 0.1203051209449768, + "learning_rate": 3.3517348526339034e-07, + "loss": 0.015, + "step": 28950 + }, + { + "epoch": 34.76590636254502, + "grad_norm": 0.12768325209617615, + "learning_rate": 3.288326134387454e-07, + "loss": 0.0192, + "step": 28960 + }, + { + "epoch": 34.777911164465785, + "grad_norm": 0.13317352533340454, + "learning_rate": 3.225520967661655e-07, + "loss": 0.0163, + "step": 28970 + }, + { + "epoch": 34.78991596638655, + "grad_norm": 0.11385384947061539, + "learning_rate": 3.163319428770628e-07, + "loss": 0.0151, + "step": 28980 + }, + { + "epoch": 34.801920768307326, + "grad_norm": 0.09968360513448715, + "learning_rate": 3.1017215932951374e-07, + "loss": 0.018, + "step": 28990 + }, + { + "epoch": 34.81392557022809, + "grad_norm": 0.23152846097946167, + "learning_rate": 3.040727536082366e-07, + "loss": 0.018, + "step": 29000 + }, + { + "epoch": 34.82593037214886, + "grad_norm": 0.13456237316131592, + "learning_rate": 2.980337331245864e-07, + "loss": 0.0184, + "step": 29010 + }, + { + "epoch": 34.83793517406963, + "grad_norm": 0.11099860072135925, + "learning_rate": 2.9205510521653213e-07, + "loss": 0.0187, + "step": 29020 + }, + { + "epoch": 34.849939975990395, + "grad_norm": 0.150923952460289, + "learning_rate": 2.86136877148685e-07, + "loss": 0.0168, + "step": 29030 + }, + { + "epoch": 34.86194477791116, + "grad_norm": 0.17132671177387238, + "learning_rate": 2.8027905611223704e-07, + "loss": 0.0165, + "step": 29040 + }, + { + "epoch": 34.87394957983193, + "grad_norm": 0.10934516042470932, + "learning_rate": 2.7448164922500573e-07, + "loss": 0.0164, + "step": 29050 + }, + { + "epoch": 34.885954381752704, + "grad_norm": 0.136897474527359, + "learning_rate": 2.687446635313784e-07, + "loss": 0.0157, + "step": 29060 + }, + { + "epoch": 34.89795918367347, + "grad_norm": 0.09857049584388733, + "learning_rate": 2.630681060023343e-07, + "loss": 0.0174, + "step": 29070 + }, + { + "epoch": 34.90996398559424, + "grad_norm": 0.11902054399251938, + "learning_rate": 2.5745198353542833e-07, + "loss": 0.0175, + "step": 29080 + }, + { + "epoch": 34.921968787515006, + "grad_norm": 0.14251264929771423, + "learning_rate": 2.518963029547794e-07, + "loss": 0.0185, + "step": 29090 + }, + { + "epoch": 34.93397358943577, + "grad_norm": 0.14585697650909424, + "learning_rate": 2.464010710110598e-07, + "loss": 0.0177, + "step": 29100 + }, + { + "epoch": 34.94597839135654, + "grad_norm": 0.12604349851608276, + "learning_rate": 2.4096629438150054e-07, + "loss": 0.018, + "step": 29110 + }, + { + "epoch": 34.95798319327731, + "grad_norm": 0.22701574862003326, + "learning_rate": 2.3559197966985802e-07, + "loss": 0.0167, + "step": 29120 + }, + { + "epoch": 34.96998799519808, + "grad_norm": 0.3165396451950073, + "learning_rate": 2.3027813340644188e-07, + "loss": 0.0172, + "step": 29130 + }, + { + "epoch": 34.98199279711885, + "grad_norm": 0.1718091070652008, + "learning_rate": 2.2502476204807055e-07, + "loss": 0.0182, + "step": 29140 + }, + { + "epoch": 34.993997599039616, + "grad_norm": 0.1482403427362442, + "learning_rate": 2.1983187197808786e-07, + "loss": 0.0166, + "step": 29150 + }, + { + "epoch": 35.006002400960384, + "grad_norm": 0.1734495609998703, + "learning_rate": 2.1469946950634644e-07, + "loss": 0.0201, + "step": 29160 + }, + { + "epoch": 35.01800720288115, + "grad_norm": 0.12780140340328217, + "learning_rate": 2.096275608691911e-07, + "loss": 0.0195, + "step": 29170 + }, + { + "epoch": 35.03001200480192, + "grad_norm": 0.14219538867473602, + "learning_rate": 2.046161522294754e-07, + "loss": 0.0189, + "step": 29180 + }, + { + "epoch": 35.04201680672269, + "grad_norm": 0.11961621791124344, + "learning_rate": 1.9966524967653944e-07, + "loss": 0.0186, + "step": 29190 + }, + { + "epoch": 35.05402160864346, + "grad_norm": 0.10733994096517563, + "learning_rate": 1.9477485922618222e-07, + "loss": 0.0189, + "step": 29200 + }, + { + "epoch": 35.06602641056423, + "grad_norm": 0.11169782280921936, + "learning_rate": 1.899449868207004e-07, + "loss": 0.0192, + "step": 29210 + }, + { + "epoch": 35.078031212484994, + "grad_norm": 0.09366128593683243, + "learning_rate": 1.851756383288439e-07, + "loss": 0.0144, + "step": 29220 + }, + { + "epoch": 35.09003601440576, + "grad_norm": 0.13988929986953735, + "learning_rate": 1.8046681954581035e-07, + "loss": 0.0166, + "step": 29230 + }, + { + "epoch": 35.10204081632653, + "grad_norm": 0.14205975830554962, + "learning_rate": 1.7581853619327294e-07, + "loss": 0.0196, + "step": 29240 + }, + { + "epoch": 35.114045618247296, + "grad_norm": 0.08790425956249237, + "learning_rate": 1.7123079391932472e-07, + "loss": 0.015, + "step": 29250 + }, + { + "epoch": 35.12605042016807, + "grad_norm": 0.14559130370616913, + "learning_rate": 1.6670359829850657e-07, + "loss": 0.0185, + "step": 29260 + }, + { + "epoch": 35.13805522208884, + "grad_norm": 0.1399141103029251, + "learning_rate": 1.6223695483179037e-07, + "loss": 0.0169, + "step": 29270 + }, + { + "epoch": 35.150060024009605, + "grad_norm": 0.14598587155342102, + "learning_rate": 1.5783086894656795e-07, + "loss": 0.0163, + "step": 29280 + }, + { + "epoch": 35.16206482593037, + "grad_norm": 0.1407136768102646, + "learning_rate": 1.5348534599665121e-07, + "loss": 0.0178, + "step": 29290 + }, + { + "epoch": 35.17406962785114, + "grad_norm": 0.1603529155254364, + "learning_rate": 1.4920039126225527e-07, + "loss": 0.0159, + "step": 29300 + }, + { + "epoch": 35.18607442977191, + "grad_norm": 0.19401371479034424, + "learning_rate": 1.449760099500097e-07, + "loss": 0.0177, + "step": 29310 + }, + { + "epoch": 35.198079231692674, + "grad_norm": 0.32375070452690125, + "learning_rate": 1.4081220719293075e-07, + "loss": 0.0171, + "step": 29320 + }, + { + "epoch": 35.21008403361345, + "grad_norm": 0.14571723341941833, + "learning_rate": 1.3670898805043797e-07, + "loss": 0.0186, + "step": 29330 + }, + { + "epoch": 35.222088835534215, + "grad_norm": 0.15478546917438507, + "learning_rate": 1.326663575083209e-07, + "loss": 0.0177, + "step": 29340 + }, + { + "epoch": 35.23409363745498, + "grad_norm": 0.12084118276834488, + "learning_rate": 1.2868432047876688e-07, + "loss": 0.0177, + "step": 29350 + }, + { + "epoch": 35.24609843937575, + "grad_norm": 0.14532648026943207, + "learning_rate": 1.2476288180032213e-07, + "loss": 0.0147, + "step": 29360 + }, + { + "epoch": 35.25810324129652, + "grad_norm": 0.13207855820655823, + "learning_rate": 1.209020462379029e-07, + "loss": 0.0176, + "step": 29370 + }, + { + "epoch": 35.270108043217284, + "grad_norm": 0.13063344359397888, + "learning_rate": 1.1710181848278435e-07, + "loss": 0.0151, + "step": 29380 + }, + { + "epoch": 35.28211284513806, + "grad_norm": 0.5598120093345642, + "learning_rate": 1.133622031526116e-07, + "loss": 0.0162, + "step": 29390 + }, + { + "epoch": 35.294117647058826, + "grad_norm": 0.1799507588148117, + "learning_rate": 1.0968320479136095e-07, + "loss": 0.0173, + "step": 29400 + }, + { + "epoch": 35.30612244897959, + "grad_norm": 0.11299262940883636, + "learning_rate": 1.0606482786936767e-07, + "loss": 0.0174, + "step": 29410 + }, + { + "epoch": 35.31812725090036, + "grad_norm": 0.18970942497253418, + "learning_rate": 1.0250707678329808e-07, + "loss": 0.0187, + "step": 29420 + }, + { + "epoch": 35.33013205282113, + "grad_norm": 0.16104525327682495, + "learning_rate": 9.900995585615525e-08, + "loss": 0.0154, + "step": 29430 + }, + { + "epoch": 35.342136854741895, + "grad_norm": 0.20845986902713776, + "learning_rate": 9.55734693372734e-08, + "loss": 0.0174, + "step": 29440 + }, + { + "epoch": 35.35414165666266, + "grad_norm": 0.15043649077415466, + "learning_rate": 9.219762140231236e-08, + "loss": 0.0186, + "step": 29450 + }, + { + "epoch": 35.36614645858344, + "grad_norm": 0.11743452399969101, + "learning_rate": 8.888241615322978e-08, + "loss": 0.0187, + "step": 29460 + }, + { + "epoch": 35.378151260504204, + "grad_norm": 0.10901769995689392, + "learning_rate": 8.562785761833114e-08, + "loss": 0.0172, + "step": 29470 + }, + { + "epoch": 35.39015606242497, + "grad_norm": 0.13122858107089996, + "learning_rate": 8.243394975219753e-08, + "loss": 0.016, + "step": 29480 + }, + { + "epoch": 35.40216086434574, + "grad_norm": 0.08673053234815598, + "learning_rate": 7.930069643573568e-08, + "loss": 0.0167, + "step": 29490 + }, + { + "epoch": 35.414165666266506, + "grad_norm": 0.11769472062587738, + "learning_rate": 7.622810147614456e-08, + "loss": 0.0178, + "step": 29500 + }, + { + "epoch": 35.42617046818727, + "grad_norm": 0.13723498582839966, + "learning_rate": 7.321616860690995e-08, + "loss": 0.0156, + "step": 29510 + }, + { + "epoch": 35.43817527010804, + "grad_norm": 0.11535853147506714, + "learning_rate": 7.026490148782095e-08, + "loss": 0.0169, + "step": 29520 + }, + { + "epoch": 35.450180072028814, + "grad_norm": 0.20017635822296143, + "learning_rate": 6.737430370494236e-08, + "loss": 0.0166, + "step": 29530 + }, + { + "epoch": 35.46218487394958, + "grad_norm": 0.1287733018398285, + "learning_rate": 6.454437877062569e-08, + "loss": 0.0165, + "step": 29540 + }, + { + "epoch": 35.47418967587035, + "grad_norm": 0.1383998543024063, + "learning_rate": 6.177513012349256e-08, + "loss": 0.0171, + "step": 29550 + }, + { + "epoch": 35.486194477791116, + "grad_norm": 0.16688060760498047, + "learning_rate": 5.9066561128445775e-08, + "loss": 0.017, + "step": 29560 + }, + { + "epoch": 35.49819927971188, + "grad_norm": 0.07065486162900925, + "learning_rate": 5.6418675076641556e-08, + "loss": 0.0143, + "step": 29570 + }, + { + "epoch": 35.51020408163265, + "grad_norm": 0.19293184578418732, + "learning_rate": 5.383147518552845e-08, + "loss": 0.0162, + "step": 29580 + }, + { + "epoch": 35.52220888355342, + "grad_norm": 0.13643693923950195, + "learning_rate": 5.1304964598786196e-08, + "loss": 0.0156, + "step": 29590 + }, + { + "epoch": 35.53421368547419, + "grad_norm": 0.1683458685874939, + "learning_rate": 4.883914638636467e-08, + "loss": 0.0183, + "step": 29600 + }, + { + "epoch": 35.54621848739496, + "grad_norm": 0.1551513373851776, + "learning_rate": 4.643402354446713e-08, + "loss": 0.017, + "step": 29610 + }, + { + "epoch": 35.55822328931573, + "grad_norm": 0.13039705157279968, + "learning_rate": 4.4089598995544766e-08, + "loss": 0.0149, + "step": 29620 + }, + { + "epoch": 35.570228091236494, + "grad_norm": 0.11003971099853516, + "learning_rate": 4.180587558829663e-08, + "loss": 0.0197, + "step": 29630 + }, + { + "epoch": 35.58223289315726, + "grad_norm": 0.11151973158121109, + "learning_rate": 3.958285609765855e-08, + "loss": 0.0153, + "step": 29640 + }, + { + "epoch": 35.59423769507803, + "grad_norm": 0.11616402119398117, + "learning_rate": 3.74205432248087e-08, + "loss": 0.0185, + "step": 29650 + }, + { + "epoch": 35.6062424969988, + "grad_norm": 0.14045804738998413, + "learning_rate": 3.531893959716204e-08, + "loss": 0.0189, + "step": 29660 + }, + { + "epoch": 35.61824729891957, + "grad_norm": 0.3478863835334778, + "learning_rate": 3.327804776837029e-08, + "loss": 0.0173, + "step": 29670 + }, + { + "epoch": 35.63025210084034, + "grad_norm": 0.09165357798337936, + "learning_rate": 3.129787021829977e-08, + "loss": 0.0161, + "step": 29680 + }, + { + "epoch": 35.642256902761105, + "grad_norm": 0.11958320438861847, + "learning_rate": 2.9378409353059133e-08, + "loss": 0.0158, + "step": 29690 + }, + { + "epoch": 35.65426170468187, + "grad_norm": 0.08874426037073135, + "learning_rate": 2.7519667504971593e-08, + "loss": 0.0162, + "step": 29700 + }, + { + "epoch": 35.66626650660264, + "grad_norm": 0.12089947611093521, + "learning_rate": 2.572164693258605e-08, + "loss": 0.0168, + "step": 29710 + }, + { + "epoch": 35.678271308523406, + "grad_norm": 0.1199173852801323, + "learning_rate": 2.3984349820665996e-08, + "loss": 0.0164, + "step": 29720 + }, + { + "epoch": 35.69027611044418, + "grad_norm": 0.11333424597978592, + "learning_rate": 2.2307778280189485e-08, + "loss": 0.0189, + "step": 29730 + }, + { + "epoch": 35.70228091236495, + "grad_norm": 0.14748457074165344, + "learning_rate": 2.0691934348354704e-08, + "loss": 0.0155, + "step": 29740 + }, + { + "epoch": 35.714285714285715, + "grad_norm": 0.1616397649049759, + "learning_rate": 1.9136819988557763e-08, + "loss": 0.0175, + "step": 29750 + }, + { + "epoch": 35.72629051620648, + "grad_norm": 0.14503079652786255, + "learning_rate": 1.7642437090414908e-08, + "loss": 0.018, + "step": 29760 + }, + { + "epoch": 35.73829531812725, + "grad_norm": 0.11860646307468414, + "learning_rate": 1.6208787469734755e-08, + "loss": 0.0162, + "step": 29770 + }, + { + "epoch": 35.75030012004802, + "grad_norm": 0.16019582748413086, + "learning_rate": 1.483587286854604e-08, + "loss": 0.0178, + "step": 29780 + }, + { + "epoch": 35.762304921968784, + "grad_norm": 0.3111690878868103, + "learning_rate": 1.3523694955064336e-08, + "loss": 0.0172, + "step": 29790 + }, + { + "epoch": 35.77430972388956, + "grad_norm": 0.11107563972473145, + "learning_rate": 1.2272255323708681e-08, + "loss": 0.0156, + "step": 29800 + }, + { + "epoch": 35.786314525810326, + "grad_norm": 0.14733462035655975, + "learning_rate": 1.1081555495096042e-08, + "loss": 0.0187, + "step": 29810 + }, + { + "epoch": 35.79831932773109, + "grad_norm": 0.17499785125255585, + "learning_rate": 9.951596916041306e-09, + "loss": 0.0181, + "step": 29820 + }, + { + "epoch": 35.81032412965186, + "grad_norm": 0.21558722853660583, + "learning_rate": 8.882380959551739e-09, + "loss": 0.023, + "step": 29830 + }, + { + "epoch": 35.82232893157263, + "grad_norm": 0.13777413964271545, + "learning_rate": 7.873908924821428e-09, + "loss": 0.0179, + "step": 29840 + }, + { + "epoch": 35.834333733493395, + "grad_norm": 0.1791856288909912, + "learning_rate": 6.926182037242379e-09, + "loss": 0.0178, + "step": 29850 + }, + { + "epoch": 35.84633853541416, + "grad_norm": 0.1433567851781845, + "learning_rate": 6.039201448387876e-09, + "loss": 0.0173, + "step": 29860 + }, + { + "epoch": 35.85834333733494, + "grad_norm": 0.08386564254760742, + "learning_rate": 5.212968236029125e-09, + "loss": 0.0175, + "step": 29870 + }, + { + "epoch": 35.870348139255704, + "grad_norm": 0.18354830145835876, + "learning_rate": 4.447483404118602e-09, + "loss": 0.0174, + "step": 29880 + }, + { + "epoch": 35.88235294117647, + "grad_norm": 0.34742647409439087, + "learning_rate": 3.742747882784503e-09, + "loss": 0.0174, + "step": 29890 + }, + { + "epoch": 35.89435774309724, + "grad_norm": 0.11249464750289917, + "learning_rate": 3.0987625283473987e-09, + "loss": 0.0155, + "step": 29900 + }, + { + "epoch": 35.906362545018006, + "grad_norm": 0.08600857853889465, + "learning_rate": 2.515528123320232e-09, + "loss": 0.0161, + "step": 29910 + }, + { + "epoch": 35.91836734693877, + "grad_norm": 0.15173259377479553, + "learning_rate": 1.9930453763750133e-09, + "loss": 0.0184, + "step": 29920 + }, + { + "epoch": 35.93037214885955, + "grad_norm": 0.1270127296447754, + "learning_rate": 1.5313149223872281e-09, + "loss": 0.0145, + "step": 29930 + }, + { + "epoch": 35.942376950780314, + "grad_norm": 0.12920384109020233, + "learning_rate": 1.1303373224025305e-09, + "loss": 0.018, + "step": 29940 + }, + { + "epoch": 35.95438175270108, + "grad_norm": 0.16484585404396057, + "learning_rate": 7.901130636367438e-10, + "loss": 0.018, + "step": 29950 + }, + { + "epoch": 35.96638655462185, + "grad_norm": 0.14851921796798706, + "learning_rate": 5.106425595036158e-10, + "loss": 0.0176, + "step": 29960 + }, + { + "epoch": 35.978391356542616, + "grad_norm": 0.14371319115161896, + "learning_rate": 2.9192614958706286e-10, + "loss": 0.0179, + "step": 29970 + }, + { + "epoch": 35.99039615846338, + "grad_norm": 0.15885087847709656, + "learning_rate": 1.3396409964117064e-10, + "loss": 0.0173, + "step": 29980 + }, + { + "epoch": 36.00240096038415, + "grad_norm": 0.12818215787410736, + "learning_rate": 3.6756601606846574e-11, + "loss": 0.0163, + "step": 29990 + }, + { + "epoch": 36.014405762304925, + "grad_norm": 0.12369412183761597, + "learning_rate": 3.0377360626943076e-13, + "loss": 0.0157, + "step": 30000 + }, + { + "epoch": 36.014405762304925, + "step": 30000, + "total_flos": 0.0, + "train_loss": 0.0066021180565158525, + "train_runtime": 17202.2188, + "train_samples_per_second": 111.614, + "train_steps_per_second": 1.744 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 37, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}