{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.398496240601503, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004699248120300752, "grad_norm": 4.17242956161499, "learning_rate": 1.0000000000000002e-06, "loss": 1.0329, "step": 10 }, { "epoch": 0.009398496240601503, "grad_norm": 6.51262092590332, "learning_rate": 2.0000000000000003e-06, "loss": 1.0723, "step": 20 }, { "epoch": 0.014097744360902255, "grad_norm": 4.060946464538574, "learning_rate": 3e-06, "loss": 1.0744, "step": 30 }, { "epoch": 0.018796992481203006, "grad_norm": 3.7678067684173584, "learning_rate": 4.000000000000001e-06, "loss": 1.0152, "step": 40 }, { "epoch": 0.023496240601503758, "grad_norm": 3.8775312900543213, "learning_rate": 5e-06, "loss": 0.8718, "step": 50 }, { "epoch": 0.02819548872180451, "grad_norm": 3.488631010055542, "learning_rate": 6e-06, "loss": 0.7328, "step": 60 }, { "epoch": 0.03289473684210526, "grad_norm": 2.0852737426757812, "learning_rate": 7.000000000000001e-06, "loss": 0.5037, "step": 70 }, { "epoch": 0.03759398496240601, "grad_norm": 1.5253463983535767, "learning_rate": 8.000000000000001e-06, "loss": 0.4341, "step": 80 }, { "epoch": 0.042293233082706765, "grad_norm": 1.1347906589508057, "learning_rate": 9e-06, "loss": 0.287, "step": 90 }, { "epoch": 0.046992481203007516, "grad_norm": 1.715220332145691, "learning_rate": 1e-05, "loss": 0.2604, "step": 100 }, { "epoch": 0.05169172932330827, "grad_norm": 1.325265645980835, "learning_rate": 1.1000000000000001e-05, "loss": 0.2482, "step": 110 }, { "epoch": 0.05639097744360902, "grad_norm": 1.0217766761779785, "learning_rate": 1.2e-05, "loss": 0.217, "step": 120 }, { "epoch": 0.06109022556390977, "grad_norm": 0.9909769296646118, "learning_rate": 1.3000000000000001e-05, "loss": 0.1512, "step": 130 }, { "epoch": 0.06578947368421052, "grad_norm": 0.549028217792511, "learning_rate": 1.4000000000000001e-05, "loss": 0.1569, "step": 140 }, { "epoch": 0.07048872180451128, "grad_norm": 0.8695999383926392, "learning_rate": 1.5e-05, "loss": 0.1652, "step": 150 }, { "epoch": 0.07518796992481203, "grad_norm": 0.6757495403289795, "learning_rate": 1.6000000000000003e-05, "loss": 0.1339, "step": 160 }, { "epoch": 0.07988721804511278, "grad_norm": 0.4814220666885376, "learning_rate": 1.7000000000000003e-05, "loss": 0.1316, "step": 170 }, { "epoch": 0.08458646616541353, "grad_norm": 0.8050209283828735, "learning_rate": 1.8e-05, "loss": 0.1224, "step": 180 }, { "epoch": 0.08928571428571429, "grad_norm": 0.49900326132774353, "learning_rate": 1.9e-05, "loss": 0.1269, "step": 190 }, { "epoch": 0.09398496240601503, "grad_norm": 0.7031654119491577, "learning_rate": 2e-05, "loss": 0.1288, "step": 200 }, { "epoch": 0.09868421052631579, "grad_norm": 0.5220195055007935, "learning_rate": 2.1e-05, "loss": 0.1111, "step": 210 }, { "epoch": 0.10338345864661654, "grad_norm": 0.4195871353149414, "learning_rate": 2.2000000000000003e-05, "loss": 0.114, "step": 220 }, { "epoch": 0.1080827067669173, "grad_norm": 0.5394448637962341, "learning_rate": 2.3000000000000003e-05, "loss": 0.1086, "step": 230 }, { "epoch": 0.11278195488721804, "grad_norm": 0.5818265676498413, "learning_rate": 2.4e-05, "loss": 0.1034, "step": 240 }, { "epoch": 0.1174812030075188, "grad_norm": 0.5092581510543823, "learning_rate": 2.5e-05, "loss": 0.1001, "step": 250 }, { "epoch": 0.12218045112781954, "grad_norm": 0.5820969939231873, "learning_rate": 2.6000000000000002e-05, "loss": 0.0985, "step": 260 }, { "epoch": 0.12687969924812031, "grad_norm": 0.4616420269012451, "learning_rate": 2.7000000000000002e-05, "loss": 0.095, "step": 270 }, { "epoch": 0.13157894736842105, "grad_norm": 0.5095752477645874, "learning_rate": 2.8000000000000003e-05, "loss": 0.1115, "step": 280 }, { "epoch": 0.1362781954887218, "grad_norm": 0.6476600170135498, "learning_rate": 2.9e-05, "loss": 0.0831, "step": 290 }, { "epoch": 0.14097744360902256, "grad_norm": 0.5189246535301208, "learning_rate": 3e-05, "loss": 0.1043, "step": 300 }, { "epoch": 0.14567669172932332, "grad_norm": 0.5447798371315002, "learning_rate": 3.1e-05, "loss": 0.0829, "step": 310 }, { "epoch": 0.15037593984962405, "grad_norm": 0.47478917241096497, "learning_rate": 3.2000000000000005e-05, "loss": 0.0844, "step": 320 }, { "epoch": 0.1550751879699248, "grad_norm": 0.577869713306427, "learning_rate": 3.3e-05, "loss": 0.0935, "step": 330 }, { "epoch": 0.15977443609022557, "grad_norm": 0.6907890439033508, "learning_rate": 3.4000000000000007e-05, "loss": 0.0773, "step": 340 }, { "epoch": 0.16447368421052633, "grad_norm": 0.5402064919471741, "learning_rate": 3.5e-05, "loss": 0.0852, "step": 350 }, { "epoch": 0.16917293233082706, "grad_norm": 0.5711252689361572, "learning_rate": 3.6e-05, "loss": 0.0821, "step": 360 }, { "epoch": 0.17387218045112782, "grad_norm": 0.5073336362838745, "learning_rate": 3.7e-05, "loss": 0.0793, "step": 370 }, { "epoch": 0.17857142857142858, "grad_norm": 0.4687930941581726, "learning_rate": 3.8e-05, "loss": 0.0788, "step": 380 }, { "epoch": 0.18327067669172933, "grad_norm": 0.46393144130706787, "learning_rate": 3.9000000000000006e-05, "loss": 0.0849, "step": 390 }, { "epoch": 0.18796992481203006, "grad_norm": 0.43402764201164246, "learning_rate": 4e-05, "loss": 0.0667, "step": 400 }, { "epoch": 0.19266917293233082, "grad_norm": 0.4862448275089264, "learning_rate": 4.1e-05, "loss": 0.074, "step": 410 }, { "epoch": 0.19736842105263158, "grad_norm": 0.4262375831604004, "learning_rate": 4.2e-05, "loss": 0.0825, "step": 420 }, { "epoch": 0.20206766917293234, "grad_norm": 0.4565044641494751, "learning_rate": 4.3e-05, "loss": 0.0786, "step": 430 }, { "epoch": 0.20676691729323307, "grad_norm": 0.43929043412208557, "learning_rate": 4.4000000000000006e-05, "loss": 0.0726, "step": 440 }, { "epoch": 0.21146616541353383, "grad_norm": 0.5057302713394165, "learning_rate": 4.5e-05, "loss": 0.0673, "step": 450 }, { "epoch": 0.2161654135338346, "grad_norm": 0.5209115147590637, "learning_rate": 4.600000000000001e-05, "loss": 0.0639, "step": 460 }, { "epoch": 0.22086466165413535, "grad_norm": 0.544666051864624, "learning_rate": 4.7e-05, "loss": 0.0716, "step": 470 }, { "epoch": 0.22556390977443608, "grad_norm": 0.47278276085853577, "learning_rate": 4.8e-05, "loss": 0.0631, "step": 480 }, { "epoch": 0.23026315789473684, "grad_norm": 0.5331622362136841, "learning_rate": 4.9e-05, "loss": 0.0668, "step": 490 }, { "epoch": 0.2349624060150376, "grad_norm": 0.5460582971572876, "learning_rate": 5e-05, "loss": 0.055, "step": 500 }, { "epoch": 0.23966165413533835, "grad_norm": 0.511077880859375, "learning_rate": 5.1000000000000006e-05, "loss": 0.0609, "step": 510 }, { "epoch": 0.24436090225563908, "grad_norm": 0.33869466185569763, "learning_rate": 5.2000000000000004e-05, "loss": 0.0532, "step": 520 }, { "epoch": 0.24906015037593984, "grad_norm": 0.4660898745059967, "learning_rate": 5.300000000000001e-05, "loss": 0.0613, "step": 530 }, { "epoch": 0.25375939849624063, "grad_norm": 0.5432083010673523, "learning_rate": 5.4000000000000005e-05, "loss": 0.0702, "step": 540 }, { "epoch": 0.25845864661654133, "grad_norm": 0.5141884684562683, "learning_rate": 5.500000000000001e-05, "loss": 0.0513, "step": 550 }, { "epoch": 0.2631578947368421, "grad_norm": 0.3961157500743866, "learning_rate": 5.6000000000000006e-05, "loss": 0.0609, "step": 560 }, { "epoch": 0.26785714285714285, "grad_norm": 0.5757367014884949, "learning_rate": 5.6999999999999996e-05, "loss": 0.0717, "step": 570 }, { "epoch": 0.2725563909774436, "grad_norm": 0.4370141625404358, "learning_rate": 5.8e-05, "loss": 0.0637, "step": 580 }, { "epoch": 0.27725563909774437, "grad_norm": 0.3731731176376343, "learning_rate": 5.9e-05, "loss": 0.0517, "step": 590 }, { "epoch": 0.2819548872180451, "grad_norm": 0.5891894102096558, "learning_rate": 6e-05, "loss": 0.0664, "step": 600 }, { "epoch": 0.2866541353383459, "grad_norm": 0.49936580657958984, "learning_rate": 6.1e-05, "loss": 0.071, "step": 610 }, { "epoch": 0.29135338345864664, "grad_norm": 0.5268176198005676, "learning_rate": 6.2e-05, "loss": 0.063, "step": 620 }, { "epoch": 0.29605263157894735, "grad_norm": 0.33853551745414734, "learning_rate": 6.3e-05, "loss": 0.057, "step": 630 }, { "epoch": 0.3007518796992481, "grad_norm": 0.47726792097091675, "learning_rate": 6.400000000000001e-05, "loss": 0.059, "step": 640 }, { "epoch": 0.30545112781954886, "grad_norm": 0.4239175021648407, "learning_rate": 6.500000000000001e-05, "loss": 0.0601, "step": 650 }, { "epoch": 0.3101503759398496, "grad_norm": 0.46040597558021545, "learning_rate": 6.6e-05, "loss": 0.0598, "step": 660 }, { "epoch": 0.3148496240601504, "grad_norm": 0.374403715133667, "learning_rate": 6.7e-05, "loss": 0.0422, "step": 670 }, { "epoch": 0.31954887218045114, "grad_norm": 0.5622545480728149, "learning_rate": 6.800000000000001e-05, "loss": 0.0642, "step": 680 }, { "epoch": 0.3242481203007519, "grad_norm": 0.4140852391719818, "learning_rate": 6.9e-05, "loss": 0.0475, "step": 690 }, { "epoch": 0.32894736842105265, "grad_norm": 0.46566590666770935, "learning_rate": 7e-05, "loss": 0.0576, "step": 700 }, { "epoch": 0.33364661654135336, "grad_norm": 0.6023309826850891, "learning_rate": 7.1e-05, "loss": 0.0566, "step": 710 }, { "epoch": 0.3383458646616541, "grad_norm": 0.5072154402732849, "learning_rate": 7.2e-05, "loss": 0.0493, "step": 720 }, { "epoch": 0.3430451127819549, "grad_norm": 0.439280241727829, "learning_rate": 7.3e-05, "loss": 0.0642, "step": 730 }, { "epoch": 0.34774436090225563, "grad_norm": 0.5976812839508057, "learning_rate": 7.4e-05, "loss": 0.0608, "step": 740 }, { "epoch": 0.3524436090225564, "grad_norm": 0.3581954538822174, "learning_rate": 7.500000000000001e-05, "loss": 0.0522, "step": 750 }, { "epoch": 0.35714285714285715, "grad_norm": 0.5385236740112305, "learning_rate": 7.6e-05, "loss": 0.0556, "step": 760 }, { "epoch": 0.3618421052631579, "grad_norm": 0.48683884739875793, "learning_rate": 7.7e-05, "loss": 0.0545, "step": 770 }, { "epoch": 0.36654135338345867, "grad_norm": 0.4413968622684479, "learning_rate": 7.800000000000001e-05, "loss": 0.0562, "step": 780 }, { "epoch": 0.37124060150375937, "grad_norm": 0.6093789935112, "learning_rate": 7.900000000000001e-05, "loss": 0.0414, "step": 790 }, { "epoch": 0.37593984962406013, "grad_norm": 0.32487279176712036, "learning_rate": 8e-05, "loss": 0.0547, "step": 800 }, { "epoch": 0.3806390977443609, "grad_norm": 0.43237757682800293, "learning_rate": 8.1e-05, "loss": 0.0473, "step": 810 }, { "epoch": 0.38533834586466165, "grad_norm": 0.3570401072502136, "learning_rate": 8.2e-05, "loss": 0.0638, "step": 820 }, { "epoch": 0.3900375939849624, "grad_norm": 0.3672104477882385, "learning_rate": 8.3e-05, "loss": 0.0554, "step": 830 }, { "epoch": 0.39473684210526316, "grad_norm": 0.4324614107608795, "learning_rate": 8.4e-05, "loss": 0.0432, "step": 840 }, { "epoch": 0.3994360902255639, "grad_norm": 0.31276965141296387, "learning_rate": 8.5e-05, "loss": 0.0458, "step": 850 }, { "epoch": 0.4041353383458647, "grad_norm": 0.36087697744369507, "learning_rate": 8.6e-05, "loss": 0.0525, "step": 860 }, { "epoch": 0.40883458646616544, "grad_norm": 0.33883997797966003, "learning_rate": 8.7e-05, "loss": 0.0468, "step": 870 }, { "epoch": 0.41353383458646614, "grad_norm": 0.43214815855026245, "learning_rate": 8.800000000000001e-05, "loss": 0.0565, "step": 880 }, { "epoch": 0.4182330827067669, "grad_norm": 0.34577423334121704, "learning_rate": 8.900000000000001e-05, "loss": 0.0518, "step": 890 }, { "epoch": 0.42293233082706766, "grad_norm": 0.45119839906692505, "learning_rate": 9e-05, "loss": 0.0496, "step": 900 }, { "epoch": 0.4276315789473684, "grad_norm": 0.47580137848854065, "learning_rate": 9.1e-05, "loss": 0.0555, "step": 910 }, { "epoch": 0.4323308270676692, "grad_norm": 0.4175976812839508, "learning_rate": 9.200000000000001e-05, "loss": 0.0567, "step": 920 }, { "epoch": 0.43703007518796994, "grad_norm": 0.4780498445034027, "learning_rate": 9.300000000000001e-05, "loss": 0.0678, "step": 930 }, { "epoch": 0.4417293233082707, "grad_norm": 0.45834606885910034, "learning_rate": 9.4e-05, "loss": 0.0379, "step": 940 }, { "epoch": 0.44642857142857145, "grad_norm": 0.33210793137550354, "learning_rate": 9.5e-05, "loss": 0.0461, "step": 950 }, { "epoch": 0.45112781954887216, "grad_norm": 0.5226761102676392, "learning_rate": 9.6e-05, "loss": 0.0542, "step": 960 }, { "epoch": 0.4558270676691729, "grad_norm": 0.4086574912071228, "learning_rate": 9.7e-05, "loss": 0.0443, "step": 970 }, { "epoch": 0.4605263157894737, "grad_norm": 0.2859014868736267, "learning_rate": 9.8e-05, "loss": 0.0458, "step": 980 }, { "epoch": 0.46522556390977443, "grad_norm": 0.3826181888580322, "learning_rate": 9.900000000000001e-05, "loss": 0.044, "step": 990 }, { "epoch": 0.4699248120300752, "grad_norm": 0.3559493124485016, "learning_rate": 0.0001, "loss": 0.0407, "step": 1000 }, { "epoch": 0.47462406015037595, "grad_norm": 0.2889259457588196, "learning_rate": 9.999993165095463e-05, "loss": 0.0568, "step": 1010 }, { "epoch": 0.4793233082706767, "grad_norm": 0.26828300952911377, "learning_rate": 9.999972660400536e-05, "loss": 0.0459, "step": 1020 }, { "epoch": 0.48402255639097747, "grad_norm": 0.45884090662002563, "learning_rate": 9.999938485971279e-05, "loss": 0.054, "step": 1030 }, { "epoch": 0.48872180451127817, "grad_norm": 0.5988617539405823, "learning_rate": 9.999890641901125e-05, "loss": 0.0455, "step": 1040 }, { "epoch": 0.4934210526315789, "grad_norm": 0.320173442363739, "learning_rate": 9.999829128320874e-05, "loss": 0.047, "step": 1050 }, { "epoch": 0.4981203007518797, "grad_norm": 0.34646356105804443, "learning_rate": 9.999753945398704e-05, "loss": 0.0567, "step": 1060 }, { "epoch": 0.5028195488721805, "grad_norm": 0.321715384721756, "learning_rate": 9.999665093340165e-05, "loss": 0.0418, "step": 1070 }, { "epoch": 0.5075187969924813, "grad_norm": 0.29327085614204407, "learning_rate": 9.99956257238817e-05, "loss": 0.0498, "step": 1080 }, { "epoch": 0.5122180451127819, "grad_norm": 0.2684969902038574, "learning_rate": 9.999446382823013e-05, "loss": 0.0507, "step": 1090 }, { "epoch": 0.5169172932330827, "grad_norm": 0.1975565105676651, "learning_rate": 9.999316524962345e-05, "loss": 0.0387, "step": 1100 }, { "epoch": 0.5216165413533834, "grad_norm": 0.31090304255485535, "learning_rate": 9.999172999161198e-05, "loss": 0.0378, "step": 1110 }, { "epoch": 0.5263157894736842, "grad_norm": 0.30241793394088745, "learning_rate": 9.999015805811965e-05, "loss": 0.0398, "step": 1120 }, { "epoch": 0.5310150375939849, "grad_norm": 0.28800079226493835, "learning_rate": 9.998844945344405e-05, "loss": 0.0409, "step": 1130 }, { "epoch": 0.5357142857142857, "grad_norm": 0.41590771079063416, "learning_rate": 9.998660418225645e-05, "loss": 0.0352, "step": 1140 }, { "epoch": 0.5404135338345865, "grad_norm": 0.3479129374027252, "learning_rate": 9.998462224960175e-05, "loss": 0.049, "step": 1150 }, { "epoch": 0.5451127819548872, "grad_norm": 0.26915863156318665, "learning_rate": 9.998250366089848e-05, "loss": 0.0347, "step": 1160 }, { "epoch": 0.549812030075188, "grad_norm": 0.2776716351509094, "learning_rate": 9.998024842193876e-05, "loss": 0.0461, "step": 1170 }, { "epoch": 0.5545112781954887, "grad_norm": 0.49566450715065, "learning_rate": 9.997785653888835e-05, "loss": 0.0437, "step": 1180 }, { "epoch": 0.5592105263157895, "grad_norm": 0.2881394922733307, "learning_rate": 9.997532801828658e-05, "loss": 0.0401, "step": 1190 }, { "epoch": 0.5639097744360902, "grad_norm": 0.30325883626937866, "learning_rate": 9.997266286704631e-05, "loss": 0.0484, "step": 1200 }, { "epoch": 0.568609022556391, "grad_norm": 0.25720658898353577, "learning_rate": 9.996986109245395e-05, "loss": 0.0572, "step": 1210 }, { "epoch": 0.5733082706766918, "grad_norm": 0.33928200602531433, "learning_rate": 9.996692270216947e-05, "loss": 0.0468, "step": 1220 }, { "epoch": 0.5780075187969925, "grad_norm": 0.21968646347522736, "learning_rate": 9.996384770422629e-05, "loss": 0.0401, "step": 1230 }, { "epoch": 0.5827067669172933, "grad_norm": 0.3939560651779175, "learning_rate": 9.996063610703137e-05, "loss": 0.0504, "step": 1240 }, { "epoch": 0.5874060150375939, "grad_norm": 0.23372384905815125, "learning_rate": 9.995728791936504e-05, "loss": 0.0371, "step": 1250 }, { "epoch": 0.5921052631578947, "grad_norm": 0.3492465019226074, "learning_rate": 9.995380315038119e-05, "loss": 0.0359, "step": 1260 }, { "epoch": 0.5968045112781954, "grad_norm": 0.31815361976623535, "learning_rate": 9.9950181809607e-05, "loss": 0.0387, "step": 1270 }, { "epoch": 0.6015037593984962, "grad_norm": 0.44413354992866516, "learning_rate": 9.994642390694308e-05, "loss": 0.0435, "step": 1280 }, { "epoch": 0.606203007518797, "grad_norm": 0.1992567628622055, "learning_rate": 9.99425294526634e-05, "loss": 0.0403, "step": 1290 }, { "epoch": 0.6109022556390977, "grad_norm": 0.2784936726093292, "learning_rate": 9.993849845741524e-05, "loss": 0.0441, "step": 1300 }, { "epoch": 0.6156015037593985, "grad_norm": 0.22886349260807037, "learning_rate": 9.99343309322192e-05, "loss": 0.0361, "step": 1310 }, { "epoch": 0.6203007518796992, "grad_norm": 0.29100221395492554, "learning_rate": 9.993002688846913e-05, "loss": 0.0439, "step": 1320 }, { "epoch": 0.625, "grad_norm": 0.356252521276474, "learning_rate": 9.992558633793212e-05, "loss": 0.0498, "step": 1330 }, { "epoch": 0.6296992481203008, "grad_norm": 0.3578437864780426, "learning_rate": 9.992100929274846e-05, "loss": 0.0572, "step": 1340 }, { "epoch": 0.6343984962406015, "grad_norm": 0.2747027575969696, "learning_rate": 9.991629576543163e-05, "loss": 0.0395, "step": 1350 }, { "epoch": 0.6390977443609023, "grad_norm": 0.31512099504470825, "learning_rate": 9.991144576886823e-05, "loss": 0.0532, "step": 1360 }, { "epoch": 0.643796992481203, "grad_norm": 0.34956079721450806, "learning_rate": 9.990645931631796e-05, "loss": 0.0433, "step": 1370 }, { "epoch": 0.6484962406015038, "grad_norm": 0.3578788638114929, "learning_rate": 9.990133642141359e-05, "loss": 0.0399, "step": 1380 }, { "epoch": 0.6531954887218046, "grad_norm": 0.4128391146659851, "learning_rate": 9.989607709816091e-05, "loss": 0.0438, "step": 1390 }, { "epoch": 0.6578947368421053, "grad_norm": 0.32965388894081116, "learning_rate": 9.989068136093873e-05, "loss": 0.0385, "step": 1400 }, { "epoch": 0.6625939849624061, "grad_norm": 0.345131516456604, "learning_rate": 9.988514922449879e-05, "loss": 0.0583, "step": 1410 }, { "epoch": 0.6672932330827067, "grad_norm": 0.24946953356266022, "learning_rate": 9.987948070396571e-05, "loss": 0.0432, "step": 1420 }, { "epoch": 0.6719924812030075, "grad_norm": 0.3567699193954468, "learning_rate": 9.987367581483705e-05, "loss": 0.0477, "step": 1430 }, { "epoch": 0.6766917293233082, "grad_norm": 0.3243180215358734, "learning_rate": 9.986773457298311e-05, "loss": 0.0356, "step": 1440 }, { "epoch": 0.681390977443609, "grad_norm": 0.31001994013786316, "learning_rate": 9.986165699464705e-05, "loss": 0.0485, "step": 1450 }, { "epoch": 0.6860902255639098, "grad_norm": 0.22086216509342194, "learning_rate": 9.985544309644475e-05, "loss": 0.041, "step": 1460 }, { "epoch": 0.6907894736842105, "grad_norm": 0.2946653962135315, "learning_rate": 9.984909289536473e-05, "loss": 0.0299, "step": 1470 }, { "epoch": 0.6954887218045113, "grad_norm": 0.29980820417404175, "learning_rate": 9.984260640876821e-05, "loss": 0.0369, "step": 1480 }, { "epoch": 0.700187969924812, "grad_norm": 0.31546637415885925, "learning_rate": 9.983598365438902e-05, "loss": 0.0365, "step": 1490 }, { "epoch": 0.7048872180451128, "grad_norm": 0.2951902747154236, "learning_rate": 9.98292246503335e-05, "loss": 0.0433, "step": 1500 }, { "epoch": 0.7095864661654135, "grad_norm": 0.24987751245498657, "learning_rate": 9.98223294150805e-05, "loss": 0.0322, "step": 1510 }, { "epoch": 0.7142857142857143, "grad_norm": 0.25791454315185547, "learning_rate": 9.981529796748134e-05, "loss": 0.0347, "step": 1520 }, { "epoch": 0.7189849624060151, "grad_norm": 0.2819271385669708, "learning_rate": 9.980813032675974e-05, "loss": 0.0393, "step": 1530 }, { "epoch": 0.7236842105263158, "grad_norm": 0.2781679630279541, "learning_rate": 9.980082651251175e-05, "loss": 0.032, "step": 1540 }, { "epoch": 0.7283834586466166, "grad_norm": 0.2625151574611664, "learning_rate": 9.979338654470569e-05, "loss": 0.0457, "step": 1550 }, { "epoch": 0.7330827067669173, "grad_norm": 0.23519033193588257, "learning_rate": 9.97858104436822e-05, "loss": 0.0293, "step": 1560 }, { "epoch": 0.7377819548872181, "grad_norm": 0.3268503248691559, "learning_rate": 9.977809823015401e-05, "loss": 0.0351, "step": 1570 }, { "epoch": 0.7424812030075187, "grad_norm": 0.26714998483657837, "learning_rate": 9.977024992520602e-05, "loss": 0.0354, "step": 1580 }, { "epoch": 0.7471804511278195, "grad_norm": 0.2875531017780304, "learning_rate": 9.976226555029522e-05, "loss": 0.0375, "step": 1590 }, { "epoch": 0.7518796992481203, "grad_norm": 0.29909199476242065, "learning_rate": 9.975414512725057e-05, "loss": 0.042, "step": 1600 }, { "epoch": 0.756578947368421, "grad_norm": 0.35534903407096863, "learning_rate": 9.974588867827301e-05, "loss": 0.0392, "step": 1610 }, { "epoch": 0.7612781954887218, "grad_norm": 0.2016129046678543, "learning_rate": 9.973749622593534e-05, "loss": 0.0386, "step": 1620 }, { "epoch": 0.7659774436090225, "grad_norm": 0.3945503234863281, "learning_rate": 9.972896779318219e-05, "loss": 0.0468, "step": 1630 }, { "epoch": 0.7706766917293233, "grad_norm": 0.2441464513540268, "learning_rate": 9.972030340333001e-05, "loss": 0.0378, "step": 1640 }, { "epoch": 0.775375939849624, "grad_norm": 0.24610991775989532, "learning_rate": 9.97115030800669e-05, "loss": 0.0324, "step": 1650 }, { "epoch": 0.7800751879699248, "grad_norm": 0.2868049144744873, "learning_rate": 9.970256684745258e-05, "loss": 0.0511, "step": 1660 }, { "epoch": 0.7847744360902256, "grad_norm": 0.36867856979370117, "learning_rate": 9.969349472991838e-05, "loss": 0.036, "step": 1670 }, { "epoch": 0.7894736842105263, "grad_norm": 0.2507390081882477, "learning_rate": 9.968428675226714e-05, "loss": 0.0325, "step": 1680 }, { "epoch": 0.7941729323308271, "grad_norm": 0.29328516125679016, "learning_rate": 9.967494293967312e-05, "loss": 0.0491, "step": 1690 }, { "epoch": 0.7988721804511278, "grad_norm": 0.26543503999710083, "learning_rate": 9.966546331768191e-05, "loss": 0.0397, "step": 1700 }, { "epoch": 0.8035714285714286, "grad_norm": 0.2858276963233948, "learning_rate": 9.965584791221048e-05, "loss": 0.0328, "step": 1710 }, { "epoch": 0.8082706766917294, "grad_norm": 0.3322197496891022, "learning_rate": 9.964609674954696e-05, "loss": 0.0437, "step": 1720 }, { "epoch": 0.8129699248120301, "grad_norm": 0.29589325189590454, "learning_rate": 9.963620985635065e-05, "loss": 0.033, "step": 1730 }, { "epoch": 0.8176691729323309, "grad_norm": 0.2513675093650818, "learning_rate": 9.962618725965196e-05, "loss": 0.0466, "step": 1740 }, { "epoch": 0.8223684210526315, "grad_norm": 0.24024534225463867, "learning_rate": 9.961602898685226e-05, "loss": 0.0316, "step": 1750 }, { "epoch": 0.8270676691729323, "grad_norm": 0.2825106680393219, "learning_rate": 9.96057350657239e-05, "loss": 0.0335, "step": 1760 }, { "epoch": 0.831766917293233, "grad_norm": 0.286940336227417, "learning_rate": 9.959530552441005e-05, "loss": 0.0373, "step": 1770 }, { "epoch": 0.8364661654135338, "grad_norm": 0.25305864214897156, "learning_rate": 9.95847403914247e-05, "loss": 0.0341, "step": 1780 }, { "epoch": 0.8411654135338346, "grad_norm": 0.3127809464931488, "learning_rate": 9.95740396956525e-05, "loss": 0.0382, "step": 1790 }, { "epoch": 0.8458646616541353, "grad_norm": 0.2425500899553299, "learning_rate": 9.956320346634876e-05, "loss": 0.0386, "step": 1800 }, { "epoch": 0.8505639097744361, "grad_norm": 0.3123094439506531, "learning_rate": 9.955223173313931e-05, "loss": 0.0335, "step": 1810 }, { "epoch": 0.8552631578947368, "grad_norm": 0.20537501573562622, "learning_rate": 9.954112452602045e-05, "loss": 0.025, "step": 1820 }, { "epoch": 0.8599624060150376, "grad_norm": 0.24950020015239716, "learning_rate": 9.952988187535886e-05, "loss": 0.0344, "step": 1830 }, { "epoch": 0.8646616541353384, "grad_norm": 0.27342480421066284, "learning_rate": 9.95185038118915e-05, "loss": 0.0414, "step": 1840 }, { "epoch": 0.8693609022556391, "grad_norm": 0.31957921385765076, "learning_rate": 9.950699036672559e-05, "loss": 0.0314, "step": 1850 }, { "epoch": 0.8740601503759399, "grad_norm": 0.2214190661907196, "learning_rate": 9.949534157133844e-05, "loss": 0.0328, "step": 1860 }, { "epoch": 0.8787593984962406, "grad_norm": 0.2547666132450104, "learning_rate": 9.948355745757741e-05, "loss": 0.0296, "step": 1870 }, { "epoch": 0.8834586466165414, "grad_norm": 0.20601588487625122, "learning_rate": 9.94716380576598e-05, "loss": 0.0319, "step": 1880 }, { "epoch": 0.8881578947368421, "grad_norm": 0.30237144231796265, "learning_rate": 9.945958340417283e-05, "loss": 0.0417, "step": 1890 }, { "epoch": 0.8928571428571429, "grad_norm": 0.24439987540245056, "learning_rate": 9.944739353007344e-05, "loss": 0.0414, "step": 1900 }, { "epoch": 0.8975563909774437, "grad_norm": 0.32059434056282043, "learning_rate": 9.943506846868826e-05, "loss": 0.0337, "step": 1910 }, { "epoch": 0.9022556390977443, "grad_norm": 0.24147386848926544, "learning_rate": 9.942260825371358e-05, "loss": 0.0257, "step": 1920 }, { "epoch": 0.9069548872180451, "grad_norm": 0.3102846145629883, "learning_rate": 9.941001291921512e-05, "loss": 0.0391, "step": 1930 }, { "epoch": 0.9116541353383458, "grad_norm": 0.2683519124984741, "learning_rate": 9.939728249962807e-05, "loss": 0.0249, "step": 1940 }, { "epoch": 0.9163533834586466, "grad_norm": 0.2480572909116745, "learning_rate": 9.938441702975689e-05, "loss": 0.0369, "step": 1950 }, { "epoch": 0.9210526315789473, "grad_norm": 0.26705268025398254, "learning_rate": 9.937141654477528e-05, "loss": 0.0374, "step": 1960 }, { "epoch": 0.9257518796992481, "grad_norm": 0.2704300880432129, "learning_rate": 9.93582810802261e-05, "loss": 0.0384, "step": 1970 }, { "epoch": 0.9304511278195489, "grad_norm": 0.3017674386501312, "learning_rate": 9.934501067202117e-05, "loss": 0.0393, "step": 1980 }, { "epoch": 0.9351503759398496, "grad_norm": 0.3015788495540619, "learning_rate": 9.93316053564413e-05, "loss": 0.0326, "step": 1990 }, { "epoch": 0.9398496240601504, "grad_norm": 0.30426859855651855, "learning_rate": 9.931806517013612e-05, "loss": 0.0329, "step": 2000 }, { "epoch": 0.9445488721804511, "grad_norm": 0.3226216435432434, "learning_rate": 9.930439015012396e-05, "loss": 0.0327, "step": 2010 }, { "epoch": 0.9492481203007519, "grad_norm": 0.31180641055107117, "learning_rate": 9.929058033379181e-05, "loss": 0.0284, "step": 2020 }, { "epoch": 0.9539473684210527, "grad_norm": 0.26425114274024963, "learning_rate": 9.927663575889521e-05, "loss": 0.033, "step": 2030 }, { "epoch": 0.9586466165413534, "grad_norm": 0.15367713570594788, "learning_rate": 9.926255646355804e-05, "loss": 0.0311, "step": 2040 }, { "epoch": 0.9633458646616542, "grad_norm": 0.21648705005645752, "learning_rate": 9.92483424862726e-05, "loss": 0.031, "step": 2050 }, { "epoch": 0.9680451127819549, "grad_norm": 0.2765688896179199, "learning_rate": 9.923399386589933e-05, "loss": 0.0389, "step": 2060 }, { "epoch": 0.9727443609022557, "grad_norm": 0.21116332709789276, "learning_rate": 9.921951064166684e-05, "loss": 0.0348, "step": 2070 }, { "epoch": 0.9774436090225563, "grad_norm": 0.2599218189716339, "learning_rate": 9.92048928531717e-05, "loss": 0.043, "step": 2080 }, { "epoch": 0.9821428571428571, "grad_norm": 0.2683297395706177, "learning_rate": 9.919014054037836e-05, "loss": 0.0285, "step": 2090 }, { "epoch": 0.9868421052631579, "grad_norm": 0.20241263508796692, "learning_rate": 9.917525374361912e-05, "loss": 0.033, "step": 2100 }, { "epoch": 0.9915413533834586, "grad_norm": 0.19710877537727356, "learning_rate": 9.91602325035939e-05, "loss": 0.0375, "step": 2110 }, { "epoch": 0.9962406015037594, "grad_norm": 0.29469940066337585, "learning_rate": 9.914507686137019e-05, "loss": 0.0435, "step": 2120 }, { "epoch": 1.0009398496240602, "grad_norm": 0.40180471539497375, "learning_rate": 9.912978685838294e-05, "loss": 0.0425, "step": 2130 }, { "epoch": 1.005639097744361, "grad_norm": 0.26152098178863525, "learning_rate": 9.911436253643445e-05, "loss": 0.0326, "step": 2140 }, { "epoch": 1.0103383458646618, "grad_norm": 0.31987690925598145, "learning_rate": 9.90988039376942e-05, "loss": 0.0318, "step": 2150 }, { "epoch": 1.0150375939849625, "grad_norm": 0.22263208031654358, "learning_rate": 9.90831111046988e-05, "loss": 0.0424, "step": 2160 }, { "epoch": 1.019736842105263, "grad_norm": 0.23681917786598206, "learning_rate": 9.90672840803519e-05, "loss": 0.0272, "step": 2170 }, { "epoch": 1.0244360902255638, "grad_norm": 0.210203617811203, "learning_rate": 9.905132290792394e-05, "loss": 0.0205, "step": 2180 }, { "epoch": 1.0291353383458646, "grad_norm": 0.3350231945514679, "learning_rate": 9.903522763105218e-05, "loss": 0.0308, "step": 2190 }, { "epoch": 1.0338345864661653, "grad_norm": 0.18159183859825134, "learning_rate": 9.901899829374047e-05, "loss": 0.032, "step": 2200 }, { "epoch": 1.038533834586466, "grad_norm": 0.23585166037082672, "learning_rate": 9.900263494035921e-05, "loss": 0.0381, "step": 2210 }, { "epoch": 1.0432330827067668, "grad_norm": 0.21487107872962952, "learning_rate": 9.89861376156452e-05, "loss": 0.0332, "step": 2220 }, { "epoch": 1.0479323308270676, "grad_norm": 0.29074567556381226, "learning_rate": 9.896950636470147e-05, "loss": 0.0363, "step": 2230 }, { "epoch": 1.0526315789473684, "grad_norm": 0.2568870782852173, "learning_rate": 9.895274123299723e-05, "loss": 0.0305, "step": 2240 }, { "epoch": 1.0573308270676691, "grad_norm": 0.3047011196613312, "learning_rate": 9.893584226636772e-05, "loss": 0.0335, "step": 2250 }, { "epoch": 1.0620300751879699, "grad_norm": 0.2922953963279724, "learning_rate": 9.891880951101407e-05, "loss": 0.0443, "step": 2260 }, { "epoch": 1.0667293233082706, "grad_norm": 0.29236820340156555, "learning_rate": 9.890164301350318e-05, "loss": 0.0339, "step": 2270 }, { "epoch": 1.0714285714285714, "grad_norm": 0.2855212688446045, "learning_rate": 9.888434282076758e-05, "loss": 0.0341, "step": 2280 }, { "epoch": 1.0761278195488722, "grad_norm": 0.2906155586242676, "learning_rate": 9.886690898010535e-05, "loss": 0.0366, "step": 2290 }, { "epoch": 1.080827067669173, "grad_norm": 0.24896125495433807, "learning_rate": 9.884934153917997e-05, "loss": 0.0386, "step": 2300 }, { "epoch": 1.0855263157894737, "grad_norm": 0.2523311674594879, "learning_rate": 9.883164054602012e-05, "loss": 0.0322, "step": 2310 }, { "epoch": 1.0902255639097744, "grad_norm": 0.3600609004497528, "learning_rate": 9.881380604901964e-05, "loss": 0.0394, "step": 2320 }, { "epoch": 1.0949248120300752, "grad_norm": 0.2090110331773758, "learning_rate": 9.879583809693738e-05, "loss": 0.0273, "step": 2330 }, { "epoch": 1.099624060150376, "grad_norm": 0.27230265736579895, "learning_rate": 9.877773673889701e-05, "loss": 0.0313, "step": 2340 }, { "epoch": 1.1043233082706767, "grad_norm": 0.24328522384166718, "learning_rate": 9.8759502024387e-05, "loss": 0.0366, "step": 2350 }, { "epoch": 1.1090225563909775, "grad_norm": 0.24177910387516022, "learning_rate": 9.87411340032603e-05, "loss": 0.0295, "step": 2360 }, { "epoch": 1.1137218045112782, "grad_norm": 0.22861960530281067, "learning_rate": 9.872263272573443e-05, "loss": 0.0359, "step": 2370 }, { "epoch": 1.118421052631579, "grad_norm": 0.2622787654399872, "learning_rate": 9.870399824239117e-05, "loss": 0.0274, "step": 2380 }, { "epoch": 1.1231203007518797, "grad_norm": 0.22869399189949036, "learning_rate": 9.868523060417646e-05, "loss": 0.0437, "step": 2390 }, { "epoch": 1.1278195488721805, "grad_norm": 0.22369208931922913, "learning_rate": 9.86663298624003e-05, "loss": 0.0313, "step": 2400 }, { "epoch": 1.1325187969924813, "grad_norm": 0.3054512143135071, "learning_rate": 9.864729606873663e-05, "loss": 0.0343, "step": 2410 }, { "epoch": 1.137218045112782, "grad_norm": 0.21700331568717957, "learning_rate": 9.862812927522309e-05, "loss": 0.0356, "step": 2420 }, { "epoch": 1.1419172932330828, "grad_norm": 0.1657762974500656, "learning_rate": 9.860882953426099e-05, "loss": 0.0273, "step": 2430 }, { "epoch": 1.1466165413533835, "grad_norm": 0.3012523949146271, "learning_rate": 9.858939689861506e-05, "loss": 0.0278, "step": 2440 }, { "epoch": 1.1513157894736843, "grad_norm": 0.1556554138660431, "learning_rate": 9.856983142141339e-05, "loss": 0.032, "step": 2450 }, { "epoch": 1.156015037593985, "grad_norm": 0.2980904281139374, "learning_rate": 9.855013315614725e-05, "loss": 0.0359, "step": 2460 }, { "epoch": 1.1607142857142858, "grad_norm": 0.23628155887126923, "learning_rate": 9.853030215667093e-05, "loss": 0.0253, "step": 2470 }, { "epoch": 1.1654135338345863, "grad_norm": 0.23128274083137512, "learning_rate": 9.851033847720166e-05, "loss": 0.0307, "step": 2480 }, { "epoch": 1.170112781954887, "grad_norm": 0.3112635314464569, "learning_rate": 9.849024217231935e-05, "loss": 0.0337, "step": 2490 }, { "epoch": 1.1748120300751879, "grad_norm": 0.30317866802215576, "learning_rate": 9.847001329696653e-05, "loss": 0.0349, "step": 2500 }, { "epoch": 1.1795112781954886, "grad_norm": 0.3040902614593506, "learning_rate": 9.844965190644817e-05, "loss": 0.035, "step": 2510 }, { "epoch": 1.1842105263157894, "grad_norm": 0.22820954024791718, "learning_rate": 9.842915805643155e-05, "loss": 0.0293, "step": 2520 }, { "epoch": 1.1889097744360901, "grad_norm": 0.25117453932762146, "learning_rate": 9.840853180294608e-05, "loss": 0.0295, "step": 2530 }, { "epoch": 1.193609022556391, "grad_norm": 0.28759878873825073, "learning_rate": 9.838777320238312e-05, "loss": 0.0423, "step": 2540 }, { "epoch": 1.1983082706766917, "grad_norm": 0.286913126707077, "learning_rate": 9.836688231149592e-05, "loss": 0.0284, "step": 2550 }, { "epoch": 1.2030075187969924, "grad_norm": 0.2655298411846161, "learning_rate": 9.834585918739936e-05, "loss": 0.0389, "step": 2560 }, { "epoch": 1.2077067669172932, "grad_norm": 0.2870301604270935, "learning_rate": 9.832470388756987e-05, "loss": 0.0354, "step": 2570 }, { "epoch": 1.212406015037594, "grad_norm": 0.4012272357940674, "learning_rate": 9.830341646984521e-05, "loss": 0.0331, "step": 2580 }, { "epoch": 1.2171052631578947, "grad_norm": 0.29325100779533386, "learning_rate": 9.82819969924244e-05, "loss": 0.0286, "step": 2590 }, { "epoch": 1.2218045112781954, "grad_norm": 0.27856117486953735, "learning_rate": 9.826044551386744e-05, "loss": 0.0342, "step": 2600 }, { "epoch": 1.2265037593984962, "grad_norm": 0.3191596567630768, "learning_rate": 9.823876209309527e-05, "loss": 0.0299, "step": 2610 }, { "epoch": 1.231203007518797, "grad_norm": 0.2671605944633484, "learning_rate": 9.821694678938953e-05, "loss": 0.0305, "step": 2620 }, { "epoch": 1.2359022556390977, "grad_norm": 0.29545503854751587, "learning_rate": 9.819499966239243e-05, "loss": 0.033, "step": 2630 }, { "epoch": 1.2406015037593985, "grad_norm": 0.3468911647796631, "learning_rate": 9.817292077210659e-05, "loss": 0.0362, "step": 2640 }, { "epoch": 1.2453007518796992, "grad_norm": 0.21935796737670898, "learning_rate": 9.815071017889482e-05, "loss": 0.0275, "step": 2650 }, { "epoch": 1.25, "grad_norm": 0.20226982235908508, "learning_rate": 9.812836794348004e-05, "loss": 0.0303, "step": 2660 }, { "epoch": 1.2546992481203008, "grad_norm": 0.24741894006729126, "learning_rate": 9.81058941269451e-05, "loss": 0.0333, "step": 2670 }, { "epoch": 1.2593984962406015, "grad_norm": 0.28749677538871765, "learning_rate": 9.808328879073251e-05, "loss": 0.0297, "step": 2680 }, { "epoch": 1.2640977443609023, "grad_norm": 0.14903762936592102, "learning_rate": 9.806055199664446e-05, "loss": 0.0261, "step": 2690 }, { "epoch": 1.268796992481203, "grad_norm": 0.2174089550971985, "learning_rate": 9.803768380684242e-05, "loss": 0.0336, "step": 2700 }, { "epoch": 1.2734962406015038, "grad_norm": 0.21511636674404144, "learning_rate": 9.801468428384716e-05, "loss": 0.028, "step": 2710 }, { "epoch": 1.2781954887218046, "grad_norm": 0.2507420778274536, "learning_rate": 9.799155349053851e-05, "loss": 0.0315, "step": 2720 }, { "epoch": 1.2828947368421053, "grad_norm": 0.22325477004051208, "learning_rate": 9.796829149015517e-05, "loss": 0.0242, "step": 2730 }, { "epoch": 1.287593984962406, "grad_norm": 0.324486643075943, "learning_rate": 9.794489834629455e-05, "loss": 0.0243, "step": 2740 }, { "epoch": 1.2922932330827068, "grad_norm": 0.24349677562713623, "learning_rate": 9.792137412291265e-05, "loss": 0.0294, "step": 2750 }, { "epoch": 1.2969924812030076, "grad_norm": 0.2508913576602936, "learning_rate": 9.789771888432375e-05, "loss": 0.0283, "step": 2760 }, { "epoch": 1.3016917293233083, "grad_norm": 0.210478737950325, "learning_rate": 9.787393269520039e-05, "loss": 0.0269, "step": 2770 }, { "epoch": 1.306390977443609, "grad_norm": 0.2785148620605469, "learning_rate": 9.785001562057309e-05, "loss": 0.0277, "step": 2780 }, { "epoch": 1.3110902255639099, "grad_norm": 0.30427059531211853, "learning_rate": 9.782596772583026e-05, "loss": 0.0335, "step": 2790 }, { "epoch": 1.3157894736842106, "grad_norm": 0.3535986840724945, "learning_rate": 9.780178907671789e-05, "loss": 0.035, "step": 2800 }, { "epoch": 1.3204887218045114, "grad_norm": 0.3639542758464813, "learning_rate": 9.777747973933948e-05, "loss": 0.0263, "step": 2810 }, { "epoch": 1.3251879699248121, "grad_norm": 0.3313891589641571, "learning_rate": 9.775303978015585e-05, "loss": 0.0302, "step": 2820 }, { "epoch": 1.329887218045113, "grad_norm": 0.266083300113678, "learning_rate": 9.772846926598491e-05, "loss": 0.0354, "step": 2830 }, { "epoch": 1.3345864661654137, "grad_norm": 0.3054597079753876, "learning_rate": 9.77037682640015e-05, "loss": 0.0344, "step": 2840 }, { "epoch": 1.3392857142857144, "grad_norm": 0.1686822921037674, "learning_rate": 9.767893684173721e-05, "loss": 0.0314, "step": 2850 }, { "epoch": 1.3439849624060152, "grad_norm": 0.3133353292942047, "learning_rate": 9.765397506708023e-05, "loss": 0.0314, "step": 2860 }, { "epoch": 1.3486842105263157, "grad_norm": 0.3505442142486572, "learning_rate": 9.762888300827507e-05, "loss": 0.0371, "step": 2870 }, { "epoch": 1.3533834586466165, "grad_norm": 0.23840002715587616, "learning_rate": 9.760366073392246e-05, "loss": 0.0278, "step": 2880 }, { "epoch": 1.3580827067669172, "grad_norm": 0.35968664288520813, "learning_rate": 9.757830831297914e-05, "loss": 0.0315, "step": 2890 }, { "epoch": 1.362781954887218, "grad_norm": 0.2002311795949936, "learning_rate": 9.755282581475769e-05, "loss": 0.0296, "step": 2900 }, { "epoch": 1.3674812030075187, "grad_norm": 0.31238964200019836, "learning_rate": 9.752721330892624e-05, "loss": 0.0279, "step": 2910 }, { "epoch": 1.3721804511278195, "grad_norm": 0.16220539808273315, "learning_rate": 9.750147086550844e-05, "loss": 0.031, "step": 2920 }, { "epoch": 1.3768796992481203, "grad_norm": 0.20878033339977264, "learning_rate": 9.747559855488313e-05, "loss": 0.0275, "step": 2930 }, { "epoch": 1.381578947368421, "grad_norm": 0.2526220679283142, "learning_rate": 9.744959644778422e-05, "loss": 0.0348, "step": 2940 }, { "epoch": 1.3862781954887218, "grad_norm": 0.15938736498355865, "learning_rate": 9.742346461530048e-05, "loss": 0.0393, "step": 2950 }, { "epoch": 1.3909774436090225, "grad_norm": 0.2751128375530243, "learning_rate": 9.739720312887535e-05, "loss": 0.031, "step": 2960 }, { "epoch": 1.3956766917293233, "grad_norm": 0.1849040985107422, "learning_rate": 9.73708120603067e-05, "loss": 0.0316, "step": 2970 }, { "epoch": 1.400375939849624, "grad_norm": 0.2610875964164734, "learning_rate": 9.734429148174675e-05, "loss": 0.0291, "step": 2980 }, { "epoch": 1.4050751879699248, "grad_norm": 0.23159807920455933, "learning_rate": 9.731764146570173e-05, "loss": 0.0322, "step": 2990 }, { "epoch": 1.4097744360902256, "grad_norm": 0.3438315689563751, "learning_rate": 9.729086208503174e-05, "loss": 0.0446, "step": 3000 }, { "epoch": 1.4144736842105263, "grad_norm": 0.1952996701002121, "learning_rate": 9.726395341295062e-05, "loss": 0.0291, "step": 3010 }, { "epoch": 1.419172932330827, "grad_norm": 0.29903092980384827, "learning_rate": 9.723691552302562e-05, "loss": 0.0291, "step": 3020 }, { "epoch": 1.4238721804511278, "grad_norm": 0.19270353019237518, "learning_rate": 9.720974848917735e-05, "loss": 0.0322, "step": 3030 }, { "epoch": 1.4285714285714286, "grad_norm": 0.24853135645389557, "learning_rate": 9.718245238567939e-05, "loss": 0.0352, "step": 3040 }, { "epoch": 1.4332706766917294, "grad_norm": 0.28170156478881836, "learning_rate": 9.715502728715826e-05, "loss": 0.0402, "step": 3050 }, { "epoch": 1.4379699248120301, "grad_norm": 0.2433410882949829, "learning_rate": 9.712747326859315e-05, "loss": 0.029, "step": 3060 }, { "epoch": 1.4426691729323309, "grad_norm": 0.272994726896286, "learning_rate": 9.709979040531569e-05, "loss": 0.0255, "step": 3070 }, { "epoch": 1.4473684210526316, "grad_norm": 0.3101017475128174, "learning_rate": 9.707197877300974e-05, "loss": 0.0464, "step": 3080 }, { "epoch": 1.4520676691729324, "grad_norm": 0.22507302463054657, "learning_rate": 9.704403844771128e-05, "loss": 0.0334, "step": 3090 }, { "epoch": 1.4567669172932332, "grad_norm": 0.17489475011825562, "learning_rate": 9.701596950580806e-05, "loss": 0.0357, "step": 3100 }, { "epoch": 1.461466165413534, "grad_norm": 0.20901988446712494, "learning_rate": 9.698777202403953e-05, "loss": 0.028, "step": 3110 }, { "epoch": 1.4661654135338344, "grad_norm": 0.22741763293743134, "learning_rate": 9.695944607949649e-05, "loss": 0.0329, "step": 3120 }, { "epoch": 1.4708646616541352, "grad_norm": 0.26588672399520874, "learning_rate": 9.693099174962103e-05, "loss": 0.029, "step": 3130 }, { "epoch": 1.475563909774436, "grad_norm": 0.25058892369270325, "learning_rate": 9.690240911220618e-05, "loss": 0.0273, "step": 3140 }, { "epoch": 1.4802631578947367, "grad_norm": 0.2134590893983841, "learning_rate": 9.687369824539577e-05, "loss": 0.034, "step": 3150 }, { "epoch": 1.4849624060150375, "grad_norm": 0.19278165698051453, "learning_rate": 9.684485922768422e-05, "loss": 0.0254, "step": 3160 }, { "epoch": 1.4896616541353382, "grad_norm": 0.17773672938346863, "learning_rate": 9.681589213791633e-05, "loss": 0.0324, "step": 3170 }, { "epoch": 1.494360902255639, "grad_norm": 0.2836746573448181, "learning_rate": 9.6786797055287e-05, "loss": 0.0331, "step": 3180 }, { "epoch": 1.4990601503759398, "grad_norm": 0.1892741471529007, "learning_rate": 9.675757405934103e-05, "loss": 0.0266, "step": 3190 }, { "epoch": 1.5037593984962405, "grad_norm": 0.26725977659225464, "learning_rate": 9.672822322997305e-05, "loss": 0.0392, "step": 3200 }, { "epoch": 1.5084586466165413, "grad_norm": 0.25514504313468933, "learning_rate": 9.669874464742705e-05, "loss": 0.0274, "step": 3210 }, { "epoch": 1.513157894736842, "grad_norm": 0.2787286043167114, "learning_rate": 9.66691383922964e-05, "loss": 0.0262, "step": 3220 }, { "epoch": 1.5178571428571428, "grad_norm": 0.16668666899204254, "learning_rate": 9.663940454552342e-05, "loss": 0.0244, "step": 3230 }, { "epoch": 1.5225563909774436, "grad_norm": 0.19877758622169495, "learning_rate": 9.660954318839933e-05, "loss": 0.0244, "step": 3240 }, { "epoch": 1.5272556390977443, "grad_norm": 0.19377648830413818, "learning_rate": 9.657955440256395e-05, "loss": 0.0302, "step": 3250 }, { "epoch": 1.531954887218045, "grad_norm": 0.2523846924304962, "learning_rate": 9.654943827000548e-05, "loss": 0.0231, "step": 3260 }, { "epoch": 1.5366541353383458, "grad_norm": 0.2247912436723709, "learning_rate": 9.651919487306025e-05, "loss": 0.0303, "step": 3270 }, { "epoch": 1.5413533834586466, "grad_norm": 0.28465062379837036, "learning_rate": 9.648882429441257e-05, "loss": 0.0264, "step": 3280 }, { "epoch": 1.5460526315789473, "grad_norm": 0.24437738955020905, "learning_rate": 9.645832661709444e-05, "loss": 0.0311, "step": 3290 }, { "epoch": 1.550751879699248, "grad_norm": 0.320959210395813, "learning_rate": 9.642770192448536e-05, "loss": 0.041, "step": 3300 }, { "epoch": 1.5554511278195489, "grad_norm": 0.19710564613342285, "learning_rate": 9.639695030031204e-05, "loss": 0.0249, "step": 3310 }, { "epoch": 1.5601503759398496, "grad_norm": 0.2785632908344269, "learning_rate": 9.636607182864827e-05, "loss": 0.026, "step": 3320 }, { "epoch": 1.5648496240601504, "grad_norm": 0.24171295762062073, "learning_rate": 9.63350665939146e-05, "loss": 0.0307, "step": 3330 }, { "epoch": 1.5695488721804511, "grad_norm": 0.25583142042160034, "learning_rate": 9.630393468087818e-05, "loss": 0.0258, "step": 3340 }, { "epoch": 1.574248120300752, "grad_norm": 0.20863236486911774, "learning_rate": 9.627267617465243e-05, "loss": 0.0271, "step": 3350 }, { "epoch": 1.5789473684210527, "grad_norm": 0.1731235533952713, "learning_rate": 9.624129116069694e-05, "loss": 0.0365, "step": 3360 }, { "epoch": 1.5836466165413534, "grad_norm": 0.3141409754753113, "learning_rate": 9.620977972481716e-05, "loss": 0.0375, "step": 3370 }, { "epoch": 1.5883458646616542, "grad_norm": 0.1895882934331894, "learning_rate": 9.617814195316411e-05, "loss": 0.0244, "step": 3380 }, { "epoch": 1.593045112781955, "grad_norm": 0.26948222517967224, "learning_rate": 9.614637793223425e-05, "loss": 0.0269, "step": 3390 }, { "epoch": 1.5977443609022557, "grad_norm": 0.2704524099826813, "learning_rate": 9.611448774886924e-05, "loss": 0.0305, "step": 3400 }, { "epoch": 1.6024436090225564, "grad_norm": 0.24672971665859222, "learning_rate": 9.60824714902556e-05, "loss": 0.0242, "step": 3410 }, { "epoch": 1.6071428571428572, "grad_norm": 0.2219143658876419, "learning_rate": 9.605032924392457e-05, "loss": 0.0323, "step": 3420 }, { "epoch": 1.611842105263158, "grad_norm": 0.29584020376205444, "learning_rate": 9.601806109775179e-05, "loss": 0.0279, "step": 3430 }, { "epoch": 1.6165413533834587, "grad_norm": 0.20713065564632416, "learning_rate": 9.598566713995718e-05, "loss": 0.0361, "step": 3440 }, { "epoch": 1.6212406015037595, "grad_norm": 0.23049211502075195, "learning_rate": 9.595314745910456e-05, "loss": 0.0302, "step": 3450 }, { "epoch": 1.6259398496240602, "grad_norm": 0.2782370150089264, "learning_rate": 9.59205021441015e-05, "loss": 0.0266, "step": 3460 }, { "epoch": 1.630639097744361, "grad_norm": 0.14473502337932587, "learning_rate": 9.588773128419906e-05, "loss": 0.0284, "step": 3470 }, { "epoch": 1.6353383458646618, "grad_norm": 0.16135524213314056, "learning_rate": 9.58548349689915e-05, "loss": 0.0254, "step": 3480 }, { "epoch": 1.6400375939849625, "grad_norm": 0.2047393023967743, "learning_rate": 9.582181328841611e-05, "loss": 0.0317, "step": 3490 }, { "epoch": 1.6447368421052633, "grad_norm": 0.1870441883802414, "learning_rate": 9.578866633275288e-05, "loss": 0.0285, "step": 3500 }, { "epoch": 1.649436090225564, "grad_norm": 0.2362082153558731, "learning_rate": 9.575539419262434e-05, "loss": 0.0299, "step": 3510 }, { "epoch": 1.6541353383458648, "grad_norm": 0.3200538158416748, "learning_rate": 9.572199695899522e-05, "loss": 0.0334, "step": 3520 }, { "epoch": 1.6588345864661656, "grad_norm": 0.22211411595344543, "learning_rate": 9.568847472317232e-05, "loss": 0.0243, "step": 3530 }, { "epoch": 1.6635338345864663, "grad_norm": 0.27358049154281616, "learning_rate": 9.565482757680415e-05, "loss": 0.037, "step": 3540 }, { "epoch": 1.668233082706767, "grad_norm": 0.23407623171806335, "learning_rate": 9.562105561188069e-05, "loss": 0.0278, "step": 3550 }, { "epoch": 1.6729323308270678, "grad_norm": 0.17386211454868317, "learning_rate": 9.558715892073323e-05, "loss": 0.0315, "step": 3560 }, { "epoch": 1.6776315789473686, "grad_norm": 0.2037278711795807, "learning_rate": 9.555313759603402e-05, "loss": 0.0359, "step": 3570 }, { "epoch": 1.6823308270676691, "grad_norm": 0.2268248200416565, "learning_rate": 9.551899173079607e-05, "loss": 0.0376, "step": 3580 }, { "epoch": 1.6870300751879699, "grad_norm": 0.23089633882045746, "learning_rate": 9.548472141837286e-05, "loss": 0.0376, "step": 3590 }, { "epoch": 1.6917293233082706, "grad_norm": 0.19939178228378296, "learning_rate": 9.545032675245813e-05, "loss": 0.0341, "step": 3600 }, { "epoch": 1.6964285714285714, "grad_norm": 0.24425296485424042, "learning_rate": 9.541580782708557e-05, "loss": 0.0333, "step": 3610 }, { "epoch": 1.7011278195488722, "grad_norm": 0.3158224821090698, "learning_rate": 9.538116473662861e-05, "loss": 0.0311, "step": 3620 }, { "epoch": 1.705827067669173, "grad_norm": 0.2468244433403015, "learning_rate": 9.534639757580013e-05, "loss": 0.0325, "step": 3630 }, { "epoch": 1.7105263157894737, "grad_norm": 0.27000561356544495, "learning_rate": 9.531150643965223e-05, "loss": 0.0258, "step": 3640 }, { "epoch": 1.7152255639097744, "grad_norm": 0.22154955565929413, "learning_rate": 9.527649142357596e-05, "loss": 0.0334, "step": 3650 }, { "epoch": 1.7199248120300752, "grad_norm": 0.2872529625892639, "learning_rate": 9.524135262330098e-05, "loss": 0.0276, "step": 3660 }, { "epoch": 1.724624060150376, "grad_norm": 0.2597917914390564, "learning_rate": 9.520609013489547e-05, "loss": 0.0333, "step": 3670 }, { "epoch": 1.7293233082706767, "grad_norm": 0.2070426195859909, "learning_rate": 9.517070405476575e-05, "loss": 0.0224, "step": 3680 }, { "epoch": 1.7340225563909775, "grad_norm": 0.25321757793426514, "learning_rate": 9.513519447965595e-05, "loss": 0.0283, "step": 3690 }, { "epoch": 1.7387218045112782, "grad_norm": 0.25070440769195557, "learning_rate": 9.509956150664796e-05, "loss": 0.0299, "step": 3700 }, { "epoch": 1.743421052631579, "grad_norm": 0.1794224977493286, "learning_rate": 9.50638052331609e-05, "loss": 0.033, "step": 3710 }, { "epoch": 1.7481203007518797, "grad_norm": 0.24292895197868347, "learning_rate": 9.502792575695112e-05, "loss": 0.0347, "step": 3720 }, { "epoch": 1.7528195488721805, "grad_norm": 0.16750149428844452, "learning_rate": 9.499192317611167e-05, "loss": 0.0269, "step": 3730 }, { "epoch": 1.7575187969924813, "grad_norm": 0.2226608693599701, "learning_rate": 9.49557975890723e-05, "loss": 0.032, "step": 3740 }, { "epoch": 1.7622180451127818, "grad_norm": 0.1929808109998703, "learning_rate": 9.491954909459895e-05, "loss": 0.0362, "step": 3750 }, { "epoch": 1.7669172932330826, "grad_norm": 0.2228640466928482, "learning_rate": 9.488317779179361e-05, "loss": 0.0216, "step": 3760 }, { "epoch": 1.7716165413533833, "grad_norm": 0.20736654102802277, "learning_rate": 9.484668378009408e-05, "loss": 0.0231, "step": 3770 }, { "epoch": 1.776315789473684, "grad_norm": 0.1721428632736206, "learning_rate": 9.481006715927351e-05, "loss": 0.0255, "step": 3780 }, { "epoch": 1.7810150375939848, "grad_norm": 0.21083158254623413, "learning_rate": 9.477332802944044e-05, "loss": 0.0303, "step": 3790 }, { "epoch": 1.7857142857142856, "grad_norm": 0.2370145171880722, "learning_rate": 9.473646649103818e-05, "loss": 0.0417, "step": 3800 }, { "epoch": 1.7904135338345863, "grad_norm": 0.2074321210384369, "learning_rate": 9.46994826448448e-05, "loss": 0.0337, "step": 3810 }, { "epoch": 1.795112781954887, "grad_norm": 0.32189542055130005, "learning_rate": 9.46623765919727e-05, "loss": 0.0285, "step": 3820 }, { "epoch": 1.7998120300751879, "grad_norm": 0.12926824390888214, "learning_rate": 9.462514843386845e-05, "loss": 0.0278, "step": 3830 }, { "epoch": 1.8045112781954886, "grad_norm": 0.2261073887348175, "learning_rate": 9.458779827231237e-05, "loss": 0.0286, "step": 3840 }, { "epoch": 1.8092105263157894, "grad_norm": 0.18350914120674133, "learning_rate": 9.45503262094184e-05, "loss": 0.0258, "step": 3850 }, { "epoch": 1.8139097744360901, "grad_norm": 0.2196865826845169, "learning_rate": 9.451273234763371e-05, "loss": 0.0376, "step": 3860 }, { "epoch": 1.818609022556391, "grad_norm": 0.18123769760131836, "learning_rate": 9.447501678973852e-05, "loss": 0.0356, "step": 3870 }, { "epoch": 1.8233082706766917, "grad_norm": 0.21060392260551453, "learning_rate": 9.443717963884569e-05, "loss": 0.0317, "step": 3880 }, { "epoch": 1.8280075187969924, "grad_norm": 0.2036607414484024, "learning_rate": 9.439922099840054e-05, "loss": 0.0315, "step": 3890 }, { "epoch": 1.8327067669172932, "grad_norm": 0.18484705686569214, "learning_rate": 9.43611409721806e-05, "loss": 0.0307, "step": 3900 }, { "epoch": 1.837406015037594, "grad_norm": 0.20937314629554749, "learning_rate": 9.432293966429514e-05, "loss": 0.026, "step": 3910 }, { "epoch": 1.8421052631578947, "grad_norm": 0.2676234245300293, "learning_rate": 9.428461717918511e-05, "loss": 0.0255, "step": 3920 }, { "epoch": 1.8468045112781954, "grad_norm": 0.20652443170547485, "learning_rate": 9.424617362162271e-05, "loss": 0.0288, "step": 3930 }, { "epoch": 1.8515037593984962, "grad_norm": 0.3014736473560333, "learning_rate": 9.420760909671118e-05, "loss": 0.0337, "step": 3940 }, { "epoch": 1.856203007518797, "grad_norm": 0.2299966812133789, "learning_rate": 9.416892370988444e-05, "loss": 0.0317, "step": 3950 }, { "epoch": 1.8609022556390977, "grad_norm": 0.2134593427181244, "learning_rate": 9.413011756690685e-05, "loss": 0.0416, "step": 3960 }, { "epoch": 1.8656015037593985, "grad_norm": 0.18940375745296478, "learning_rate": 9.409119077387294e-05, "loss": 0.0316, "step": 3970 }, { "epoch": 1.8703007518796992, "grad_norm": 0.25739434361457825, "learning_rate": 9.405214343720707e-05, "loss": 0.0252, "step": 3980 }, { "epoch": 1.875, "grad_norm": 0.21260878443717957, "learning_rate": 9.401297566366318e-05, "loss": 0.0241, "step": 3990 }, { "epoch": 1.8796992481203008, "grad_norm": 0.26923254132270813, "learning_rate": 9.397368756032445e-05, "loss": 0.0239, "step": 4000 }, { "epoch": 1.8843984962406015, "grad_norm": 0.18807107210159302, "learning_rate": 9.393427923460308e-05, "loss": 0.0266, "step": 4010 }, { "epoch": 1.8890977443609023, "grad_norm": 0.18463344871997833, "learning_rate": 9.389475079423988e-05, "loss": 0.0228, "step": 4020 }, { "epoch": 1.893796992481203, "grad_norm": 0.1900807023048401, "learning_rate": 9.385510234730415e-05, "loss": 0.0271, "step": 4030 }, { "epoch": 1.8984962406015038, "grad_norm": 0.22632406651973724, "learning_rate": 9.381533400219318e-05, "loss": 0.0262, "step": 4040 }, { "epoch": 1.9031954887218046, "grad_norm": 0.22867454588413239, "learning_rate": 9.377544586763215e-05, "loss": 0.0386, "step": 4050 }, { "epoch": 1.9078947368421053, "grad_norm": 0.21906429529190063, "learning_rate": 9.373543805267368e-05, "loss": 0.0313, "step": 4060 }, { "epoch": 1.912593984962406, "grad_norm": 0.21118290722370148, "learning_rate": 9.369531066669758e-05, "loss": 0.0368, "step": 4070 }, { "epoch": 1.9172932330827068, "grad_norm": 0.2828110456466675, "learning_rate": 9.365506381941066e-05, "loss": 0.0313, "step": 4080 }, { "epoch": 1.9219924812030076, "grad_norm": 0.29733067750930786, "learning_rate": 9.36146976208462e-05, "loss": 0.0287, "step": 4090 }, { "epoch": 1.9266917293233083, "grad_norm": 0.17516322433948517, "learning_rate": 9.357421218136386e-05, "loss": 0.0313, "step": 4100 }, { "epoch": 1.931390977443609, "grad_norm": 0.13433349132537842, "learning_rate": 9.353360761164931e-05, "loss": 0.0235, "step": 4110 }, { "epoch": 1.9360902255639099, "grad_norm": 0.19060957431793213, "learning_rate": 9.349288402271388e-05, "loss": 0.0267, "step": 4120 }, { "epoch": 1.9407894736842106, "grad_norm": 0.2099418193101883, "learning_rate": 9.345204152589428e-05, "loss": 0.022, "step": 4130 }, { "epoch": 1.9454887218045114, "grad_norm": 0.16266584396362305, "learning_rate": 9.341108023285238e-05, "loss": 0.023, "step": 4140 }, { "epoch": 1.9501879699248121, "grad_norm": 0.21736647188663483, "learning_rate": 9.337000025557476e-05, "loss": 0.03, "step": 4150 }, { "epoch": 1.954887218045113, "grad_norm": 0.19137585163116455, "learning_rate": 9.332880170637252e-05, "loss": 0.024, "step": 4160 }, { "epoch": 1.9595864661654137, "grad_norm": 0.22384802997112274, "learning_rate": 9.328748469788093e-05, "loss": 0.0251, "step": 4170 }, { "epoch": 1.9642857142857144, "grad_norm": 0.21163040399551392, "learning_rate": 9.32460493430591e-05, "loss": 0.0279, "step": 4180 }, { "epoch": 1.9689849624060152, "grad_norm": 0.21178165078163147, "learning_rate": 9.320449575518972e-05, "loss": 0.0279, "step": 4190 }, { "epoch": 1.973684210526316, "grad_norm": 0.17716725170612335, "learning_rate": 9.316282404787871e-05, "loss": 0.0211, "step": 4200 }, { "epoch": 1.9783834586466167, "grad_norm": 0.20710819959640503, "learning_rate": 9.31210343350549e-05, "loss": 0.0227, "step": 4210 }, { "epoch": 1.9830827067669174, "grad_norm": 0.21356646716594696, "learning_rate": 9.30791267309698e-05, "loss": 0.0262, "step": 4220 }, { "epoch": 1.9877819548872182, "grad_norm": 0.24351871013641357, "learning_rate": 9.30371013501972e-05, "loss": 0.0231, "step": 4230 }, { "epoch": 1.9924812030075187, "grad_norm": 0.1973067820072174, "learning_rate": 9.299495830763286e-05, "loss": 0.02, "step": 4240 }, { "epoch": 1.9971804511278195, "grad_norm": 0.24003633856773376, "learning_rate": 9.295269771849427e-05, "loss": 0.0296, "step": 4250 }, { "epoch": 2.0018796992481205, "grad_norm": 0.18152223527431488, "learning_rate": 9.291031969832026e-05, "loss": 0.0231, "step": 4260 }, { "epoch": 2.0065789473684212, "grad_norm": 0.18007569015026093, "learning_rate": 9.286782436297073e-05, "loss": 0.0223, "step": 4270 }, { "epoch": 2.011278195488722, "grad_norm": 0.1332777440547943, "learning_rate": 9.282521182862629e-05, "loss": 0.0263, "step": 4280 }, { "epoch": 2.0159774436090228, "grad_norm": 0.2537460923194885, "learning_rate": 9.278248221178798e-05, "loss": 0.0421, "step": 4290 }, { "epoch": 2.0206766917293235, "grad_norm": 0.25093963742256165, "learning_rate": 9.273963562927695e-05, "loss": 0.0238, "step": 4300 }, { "epoch": 2.0253759398496243, "grad_norm": 0.1842799037694931, "learning_rate": 9.269667219823412e-05, "loss": 0.0209, "step": 4310 }, { "epoch": 2.030075187969925, "grad_norm": 0.2580346465110779, "learning_rate": 9.265359203611987e-05, "loss": 0.0257, "step": 4320 }, { "epoch": 2.0347744360902253, "grad_norm": 0.17334118485450745, "learning_rate": 9.261039526071374e-05, "loss": 0.0273, "step": 4330 }, { "epoch": 2.039473684210526, "grad_norm": 0.19151276350021362, "learning_rate": 9.256708199011401e-05, "loss": 0.0212, "step": 4340 }, { "epoch": 2.044172932330827, "grad_norm": 0.23879149556159973, "learning_rate": 9.252365234273755e-05, "loss": 0.0221, "step": 4350 }, { "epoch": 2.0488721804511276, "grad_norm": 0.2724449634552002, "learning_rate": 9.248010643731935e-05, "loss": 0.0298, "step": 4360 }, { "epoch": 2.0535714285714284, "grad_norm": 0.16645994782447815, "learning_rate": 9.243644439291223e-05, "loss": 0.0253, "step": 4370 }, { "epoch": 2.058270676691729, "grad_norm": 0.19133131206035614, "learning_rate": 9.239266632888659e-05, "loss": 0.0262, "step": 4380 }, { "epoch": 2.06296992481203, "grad_norm": 0.2156103402376175, "learning_rate": 9.234877236492997e-05, "loss": 0.0334, "step": 4390 }, { "epoch": 2.0676691729323307, "grad_norm": 0.17188550531864166, "learning_rate": 9.230476262104677e-05, "loss": 0.0189, "step": 4400 }, { "epoch": 2.0723684210526314, "grad_norm": 0.2865235507488251, "learning_rate": 9.226063721755799e-05, "loss": 0.0203, "step": 4410 }, { "epoch": 2.077067669172932, "grad_norm": 0.16109667718410492, "learning_rate": 9.221639627510076e-05, "loss": 0.0212, "step": 4420 }, { "epoch": 2.081766917293233, "grad_norm": 0.22182904183864594, "learning_rate": 9.217203991462815e-05, "loss": 0.0242, "step": 4430 }, { "epoch": 2.0864661654135337, "grad_norm": 0.20680855214595795, "learning_rate": 9.212756825740873e-05, "loss": 0.0283, "step": 4440 }, { "epoch": 2.0911654135338344, "grad_norm": 0.12871260941028595, "learning_rate": 9.208298142502636e-05, "loss": 0.0268, "step": 4450 }, { "epoch": 2.095864661654135, "grad_norm": 0.2089470773935318, "learning_rate": 9.20382795393797e-05, "loss": 0.028, "step": 4460 }, { "epoch": 2.100563909774436, "grad_norm": 0.20934775471687317, "learning_rate": 9.199346272268199e-05, "loss": 0.0298, "step": 4470 }, { "epoch": 2.1052631578947367, "grad_norm": 0.24874700605869293, "learning_rate": 9.194853109746074e-05, "loss": 0.02, "step": 4480 }, { "epoch": 2.1099624060150375, "grad_norm": 0.2033921480178833, "learning_rate": 9.190348478655724e-05, "loss": 0.0226, "step": 4490 }, { "epoch": 2.1146616541353382, "grad_norm": 0.29149314761161804, "learning_rate": 9.185832391312644e-05, "loss": 0.0397, "step": 4500 }, { "epoch": 2.119360902255639, "grad_norm": 0.180942565202713, "learning_rate": 9.18130486006364e-05, "loss": 0.0206, "step": 4510 }, { "epoch": 2.1240601503759398, "grad_norm": 0.21293392777442932, "learning_rate": 9.176765897286813e-05, "loss": 0.0356, "step": 4520 }, { "epoch": 2.1287593984962405, "grad_norm": 0.23218363523483276, "learning_rate": 9.17221551539151e-05, "loss": 0.029, "step": 4530 }, { "epoch": 2.1334586466165413, "grad_norm": 0.19981394708156586, "learning_rate": 9.167653726818305e-05, "loss": 0.0204, "step": 4540 }, { "epoch": 2.138157894736842, "grad_norm": 0.1897270381450653, "learning_rate": 9.163080544038952e-05, "loss": 0.0321, "step": 4550 }, { "epoch": 2.142857142857143, "grad_norm": 0.20262902975082397, "learning_rate": 9.158495979556358e-05, "loss": 0.0284, "step": 4560 }, { "epoch": 2.1475563909774436, "grad_norm": 0.1939176321029663, "learning_rate": 9.153900045904549e-05, "loss": 0.0283, "step": 4570 }, { "epoch": 2.1522556390977443, "grad_norm": 0.23823733627796173, "learning_rate": 9.14929275564863e-05, "loss": 0.0291, "step": 4580 }, { "epoch": 2.156954887218045, "grad_norm": 0.22697608172893524, "learning_rate": 9.144674121384757e-05, "loss": 0.0226, "step": 4590 }, { "epoch": 2.161654135338346, "grad_norm": 0.19017039239406586, "learning_rate": 9.140044155740101e-05, "loss": 0.02, "step": 4600 }, { "epoch": 2.1663533834586466, "grad_norm": 0.16114512085914612, "learning_rate": 9.135402871372808e-05, "loss": 0.0298, "step": 4610 }, { "epoch": 2.1710526315789473, "grad_norm": 0.24711167812347412, "learning_rate": 9.130750280971978e-05, "loss": 0.0312, "step": 4620 }, { "epoch": 2.175751879699248, "grad_norm": 0.1635618507862091, "learning_rate": 9.126086397257612e-05, "loss": 0.0192, "step": 4630 }, { "epoch": 2.180451127819549, "grad_norm": 0.23969772458076477, "learning_rate": 9.121411232980588e-05, "loss": 0.0307, "step": 4640 }, { "epoch": 2.1851503759398496, "grad_norm": 0.21272121369838715, "learning_rate": 9.116724800922629e-05, "loss": 0.0295, "step": 4650 }, { "epoch": 2.1898496240601504, "grad_norm": 0.3185397684574127, "learning_rate": 9.112027113896262e-05, "loss": 0.0323, "step": 4660 }, { "epoch": 2.194548872180451, "grad_norm": 0.1866607964038849, "learning_rate": 9.107318184744781e-05, "loss": 0.0334, "step": 4670 }, { "epoch": 2.199248120300752, "grad_norm": 0.20084838569164276, "learning_rate": 9.102598026342222e-05, "loss": 0.0185, "step": 4680 }, { "epoch": 2.2039473684210527, "grad_norm": 0.13514482975006104, "learning_rate": 9.097866651593317e-05, "loss": 0.0297, "step": 4690 }, { "epoch": 2.2086466165413534, "grad_norm": 0.22178612649440765, "learning_rate": 9.093124073433463e-05, "loss": 0.0288, "step": 4700 }, { "epoch": 2.213345864661654, "grad_norm": 0.22371554374694824, "learning_rate": 9.088370304828685e-05, "loss": 0.0245, "step": 4710 }, { "epoch": 2.218045112781955, "grad_norm": 0.14162443578243256, "learning_rate": 9.083605358775612e-05, "loss": 0.0227, "step": 4720 }, { "epoch": 2.2227443609022557, "grad_norm": 0.1731516569852829, "learning_rate": 9.078829248301417e-05, "loss": 0.0211, "step": 4730 }, { "epoch": 2.2274436090225564, "grad_norm": 0.17966926097869873, "learning_rate": 9.074041986463808e-05, "loss": 0.0309, "step": 4740 }, { "epoch": 2.232142857142857, "grad_norm": 0.20831747353076935, "learning_rate": 9.069243586350975e-05, "loss": 0.0227, "step": 4750 }, { "epoch": 2.236842105263158, "grad_norm": 0.1533919721841812, "learning_rate": 9.064434061081562e-05, "loss": 0.0223, "step": 4760 }, { "epoch": 2.2415413533834587, "grad_norm": 0.2533239722251892, "learning_rate": 9.059613423804623e-05, "loss": 0.0257, "step": 4770 }, { "epoch": 2.2462406015037595, "grad_norm": 0.1808866560459137, "learning_rate": 9.0547816876996e-05, "loss": 0.0323, "step": 4780 }, { "epoch": 2.2509398496240602, "grad_norm": 0.2061844766139984, "learning_rate": 9.049938865976275e-05, "loss": 0.0365, "step": 4790 }, { "epoch": 2.255639097744361, "grad_norm": 0.18600964546203613, "learning_rate": 9.045084971874738e-05, "loss": 0.0202, "step": 4800 }, { "epoch": 2.2603383458646618, "grad_norm": 0.25376129150390625, "learning_rate": 9.040220018665347e-05, "loss": 0.0259, "step": 4810 }, { "epoch": 2.2650375939849625, "grad_norm": 0.2140718698501587, "learning_rate": 9.035344019648702e-05, "loss": 0.0196, "step": 4820 }, { "epoch": 2.2697368421052633, "grad_norm": 0.21682094037532806, "learning_rate": 9.030456988155596e-05, "loss": 0.0224, "step": 4830 }, { "epoch": 2.274436090225564, "grad_norm": 0.2152978926897049, "learning_rate": 9.025558937546988e-05, "loss": 0.0245, "step": 4840 }, { "epoch": 2.279135338345865, "grad_norm": 0.20505715906620026, "learning_rate": 9.020649881213958e-05, "loss": 0.0196, "step": 4850 }, { "epoch": 2.2838345864661656, "grad_norm": 0.19191820919513702, "learning_rate": 9.015729832577681e-05, "loss": 0.021, "step": 4860 }, { "epoch": 2.2885338345864663, "grad_norm": 0.2543664276599884, "learning_rate": 9.010798805089384e-05, "loss": 0.025, "step": 4870 }, { "epoch": 2.293233082706767, "grad_norm": 0.241551473736763, "learning_rate": 9.005856812230304e-05, "loss": 0.0261, "step": 4880 }, { "epoch": 2.297932330827068, "grad_norm": 0.19452232122421265, "learning_rate": 9.000903867511666e-05, "loss": 0.0256, "step": 4890 }, { "epoch": 2.3026315789473686, "grad_norm": 0.16211168467998505, "learning_rate": 8.995939984474624e-05, "loss": 0.0231, "step": 4900 }, { "epoch": 2.3073308270676693, "grad_norm": 0.15496976673603058, "learning_rate": 8.990965176690252e-05, "loss": 0.027, "step": 4910 }, { "epoch": 2.31203007518797, "grad_norm": 0.2289806455373764, "learning_rate": 8.98597945775948e-05, "loss": 0.0303, "step": 4920 }, { "epoch": 2.316729323308271, "grad_norm": 0.15668641030788422, "learning_rate": 8.980982841313074e-05, "loss": 0.0255, "step": 4930 }, { "epoch": 2.3214285714285716, "grad_norm": 0.2543923258781433, "learning_rate": 8.975975341011596e-05, "loss": 0.029, "step": 4940 }, { "epoch": 2.326127819548872, "grad_norm": 0.2210584133863449, "learning_rate": 8.970956970545355e-05, "loss": 0.0217, "step": 4950 }, { "epoch": 2.3308270676691727, "grad_norm": 0.25968605279922485, "learning_rate": 8.965927743634391e-05, "loss": 0.0283, "step": 4960 }, { "epoch": 2.3355263157894735, "grad_norm": 0.22940151393413544, "learning_rate": 8.96088767402841e-05, "loss": 0.0216, "step": 4970 }, { "epoch": 2.340225563909774, "grad_norm": 0.1916259378194809, "learning_rate": 8.955836775506776e-05, "loss": 0.0197, "step": 4980 }, { "epoch": 2.344924812030075, "grad_norm": 0.14167390763759613, "learning_rate": 8.950775061878453e-05, "loss": 0.0244, "step": 4990 }, { "epoch": 2.3496240601503757, "grad_norm": 0.22982220351696014, "learning_rate": 8.945702546981969e-05, "loss": 0.0267, "step": 5000 }, { "epoch": 2.3543233082706765, "grad_norm": 0.21236097812652588, "learning_rate": 8.940619244685388e-05, "loss": 0.0268, "step": 5010 }, { "epoch": 2.3590225563909772, "grad_norm": 0.16472011804580688, "learning_rate": 8.935525168886262e-05, "loss": 0.0279, "step": 5020 }, { "epoch": 2.363721804511278, "grad_norm": 0.18868985772132874, "learning_rate": 8.930420333511606e-05, "loss": 0.0232, "step": 5030 }, { "epoch": 2.3684210526315788, "grad_norm": 0.2023637294769287, "learning_rate": 8.92530475251784e-05, "loss": 0.0208, "step": 5040 }, { "epoch": 2.3731203007518795, "grad_norm": 0.22184379398822784, "learning_rate": 8.920178439890765e-05, "loss": 0.0249, "step": 5050 }, { "epoch": 2.3778195488721803, "grad_norm": 0.23359708487987518, "learning_rate": 8.91504140964553e-05, "loss": 0.0182, "step": 5060 }, { "epoch": 2.382518796992481, "grad_norm": 0.17309461534023285, "learning_rate": 8.909893675826574e-05, "loss": 0.0276, "step": 5070 }, { "epoch": 2.387218045112782, "grad_norm": 0.2259601503610611, "learning_rate": 8.90473525250761e-05, "loss": 0.0271, "step": 5080 }, { "epoch": 2.3919172932330826, "grad_norm": 0.24056588113307953, "learning_rate": 8.899566153791566e-05, "loss": 0.0313, "step": 5090 }, { "epoch": 2.3966165413533833, "grad_norm": 0.1999393254518509, "learning_rate": 8.894386393810563e-05, "loss": 0.0329, "step": 5100 }, { "epoch": 2.401315789473684, "grad_norm": 0.1775711178779602, "learning_rate": 8.889195986725865e-05, "loss": 0.0236, "step": 5110 }, { "epoch": 2.406015037593985, "grad_norm": 0.2337716966867447, "learning_rate": 8.883994946727849e-05, "loss": 0.0267, "step": 5120 }, { "epoch": 2.4107142857142856, "grad_norm": 0.3311557471752167, "learning_rate": 8.878783288035957e-05, "loss": 0.0206, "step": 5130 }, { "epoch": 2.4154135338345863, "grad_norm": 0.21677523851394653, "learning_rate": 8.873561024898668e-05, "loss": 0.028, "step": 5140 }, { "epoch": 2.420112781954887, "grad_norm": 0.2683987617492676, "learning_rate": 8.868328171593448e-05, "loss": 0.0251, "step": 5150 }, { "epoch": 2.424812030075188, "grad_norm": 0.18326067924499512, "learning_rate": 8.863084742426719e-05, "loss": 0.0245, "step": 5160 }, { "epoch": 2.4295112781954886, "grad_norm": 0.16746392846107483, "learning_rate": 8.857830751733815e-05, "loss": 0.0248, "step": 5170 }, { "epoch": 2.4342105263157894, "grad_norm": 0.1843288242816925, "learning_rate": 8.852566213878947e-05, "loss": 0.0283, "step": 5180 }, { "epoch": 2.43890977443609, "grad_norm": 0.2047811597585678, "learning_rate": 8.84729114325516e-05, "loss": 0.0205, "step": 5190 }, { "epoch": 2.443609022556391, "grad_norm": 0.2268257886171341, "learning_rate": 8.842005554284296e-05, "loss": 0.0255, "step": 5200 }, { "epoch": 2.4483082706766917, "grad_norm": 0.27279675006866455, "learning_rate": 8.836709461416952e-05, "loss": 0.0281, "step": 5210 }, { "epoch": 2.4530075187969924, "grad_norm": 0.14802409708499908, "learning_rate": 8.831402879132446e-05, "loss": 0.0195, "step": 5220 }, { "epoch": 2.457706766917293, "grad_norm": 0.1476491093635559, "learning_rate": 8.82608582193877e-05, "loss": 0.0255, "step": 5230 }, { "epoch": 2.462406015037594, "grad_norm": 0.1853826940059662, "learning_rate": 8.820758304372557e-05, "loss": 0.0245, "step": 5240 }, { "epoch": 2.4671052631578947, "grad_norm": 0.2378029078245163, "learning_rate": 8.815420340999033e-05, "loss": 0.0205, "step": 5250 }, { "epoch": 2.4718045112781954, "grad_norm": 0.1497645527124405, "learning_rate": 8.810071946411989e-05, "loss": 0.0239, "step": 5260 }, { "epoch": 2.476503759398496, "grad_norm": 0.23548933863639832, "learning_rate": 8.804713135233731e-05, "loss": 0.0217, "step": 5270 }, { "epoch": 2.481203007518797, "grad_norm": 0.2269853800535202, "learning_rate": 8.799343922115044e-05, "loss": 0.0183, "step": 5280 }, { "epoch": 2.4859022556390977, "grad_norm": 0.22214250266551971, "learning_rate": 8.79396432173515e-05, "loss": 0.0221, "step": 5290 }, { "epoch": 2.4906015037593985, "grad_norm": 0.2275986224412918, "learning_rate": 8.788574348801675e-05, "loss": 0.0311, "step": 5300 }, { "epoch": 2.4953007518796992, "grad_norm": 0.21562987565994263, "learning_rate": 8.783174018050594e-05, "loss": 0.0388, "step": 5310 }, { "epoch": 2.5, "grad_norm": 0.20339468121528625, "learning_rate": 8.77776334424621e-05, "loss": 0.0264, "step": 5320 }, { "epoch": 2.5046992481203008, "grad_norm": 0.1768190860748291, "learning_rate": 8.772342342181095e-05, "loss": 0.0278, "step": 5330 }, { "epoch": 2.5093984962406015, "grad_norm": 0.13602979481220245, "learning_rate": 8.766911026676064e-05, "loss": 0.0241, "step": 5340 }, { "epoch": 2.5140977443609023, "grad_norm": 0.34090307354927063, "learning_rate": 8.761469412580125e-05, "loss": 0.0304, "step": 5350 }, { "epoch": 2.518796992481203, "grad_norm": 0.2651212811470032, "learning_rate": 8.756017514770443e-05, "loss": 0.0264, "step": 5360 }, { "epoch": 2.523496240601504, "grad_norm": 0.1824772208929062, "learning_rate": 8.750555348152298e-05, "loss": 0.0189, "step": 5370 }, { "epoch": 2.5281954887218046, "grad_norm": 0.2285076379776001, "learning_rate": 8.745082927659047e-05, "loss": 0.0282, "step": 5380 }, { "epoch": 2.5328947368421053, "grad_norm": 0.23857127130031586, "learning_rate": 8.739600268252078e-05, "loss": 0.0309, "step": 5390 }, { "epoch": 2.537593984962406, "grad_norm": 0.20476020872592926, "learning_rate": 8.73410738492077e-05, "loss": 0.0265, "step": 5400 }, { "epoch": 2.542293233082707, "grad_norm": 0.2645573616027832, "learning_rate": 8.728604292682459e-05, "loss": 0.0245, "step": 5410 }, { "epoch": 2.5469924812030076, "grad_norm": 0.18955717980861664, "learning_rate": 8.723091006582389e-05, "loss": 0.0279, "step": 5420 }, { "epoch": 2.5516917293233083, "grad_norm": 0.20659516751766205, "learning_rate": 8.717567541693673e-05, "loss": 0.0191, "step": 5430 }, { "epoch": 2.556390977443609, "grad_norm": 0.19456426799297333, "learning_rate": 8.71203391311725e-05, "loss": 0.0286, "step": 5440 }, { "epoch": 2.56109022556391, "grad_norm": 0.15492534637451172, "learning_rate": 8.706490135981855e-05, "loss": 0.0258, "step": 5450 }, { "epoch": 2.5657894736842106, "grad_norm": 0.21852245926856995, "learning_rate": 8.700936225443959e-05, "loss": 0.0227, "step": 5460 }, { "epoch": 2.5704887218045114, "grad_norm": 0.17760106921195984, "learning_rate": 8.695372196687743e-05, "loss": 0.0284, "step": 5470 }, { "epoch": 2.575187969924812, "grad_norm": 0.20838047564029694, "learning_rate": 8.689798064925049e-05, "loss": 0.0239, "step": 5480 }, { "epoch": 2.579887218045113, "grad_norm": 0.19678470492362976, "learning_rate": 8.684213845395339e-05, "loss": 0.022, "step": 5490 }, { "epoch": 2.5845864661654137, "grad_norm": 0.15934813022613525, "learning_rate": 8.678619553365659e-05, "loss": 0.0192, "step": 5500 }, { "epoch": 2.5892857142857144, "grad_norm": 0.21788392961025238, "learning_rate": 8.673015204130586e-05, "loss": 0.0209, "step": 5510 }, { "epoch": 2.593984962406015, "grad_norm": 0.19440729916095734, "learning_rate": 8.6674008130122e-05, "loss": 0.0224, "step": 5520 }, { "epoch": 2.598684210526316, "grad_norm": 0.2586307227611542, "learning_rate": 8.661776395360029e-05, "loss": 0.0246, "step": 5530 }, { "epoch": 2.6033834586466167, "grad_norm": 0.22773458063602448, "learning_rate": 8.656141966551019e-05, "loss": 0.0232, "step": 5540 }, { "epoch": 2.6080827067669174, "grad_norm": 0.1501941978931427, "learning_rate": 8.650497541989482e-05, "loss": 0.0189, "step": 5550 }, { "epoch": 2.612781954887218, "grad_norm": 0.2053869515657425, "learning_rate": 8.644843137107059e-05, "loss": 0.0278, "step": 5560 }, { "epoch": 2.617481203007519, "grad_norm": 0.20903167128562927, "learning_rate": 8.639178767362676e-05, "loss": 0.0265, "step": 5570 }, { "epoch": 2.6221804511278197, "grad_norm": 0.16303761303424835, "learning_rate": 8.633504448242505e-05, "loss": 0.0174, "step": 5580 }, { "epoch": 2.6268796992481205, "grad_norm": 0.18065503239631653, "learning_rate": 8.627820195259918e-05, "loss": 0.0223, "step": 5590 }, { "epoch": 2.6315789473684212, "grad_norm": 0.18412037193775177, "learning_rate": 8.622126023955446e-05, "loss": 0.0287, "step": 5600 }, { "epoch": 2.636278195488722, "grad_norm": 0.1771506518125534, "learning_rate": 8.616421949896734e-05, "loss": 0.0221, "step": 5610 }, { "epoch": 2.6409774436090228, "grad_norm": 0.20367956161499023, "learning_rate": 8.610707988678503e-05, "loss": 0.0262, "step": 5620 }, { "epoch": 2.6456766917293235, "grad_norm": 0.2133282572031021, "learning_rate": 8.604984155922506e-05, "loss": 0.016, "step": 5630 }, { "epoch": 2.6503759398496243, "grad_norm": 0.2280464917421341, "learning_rate": 8.599250467277483e-05, "loss": 0.0226, "step": 5640 }, { "epoch": 2.655075187969925, "grad_norm": 0.20852112770080566, "learning_rate": 8.59350693841912e-05, "loss": 0.0273, "step": 5650 }, { "epoch": 2.659774436090226, "grad_norm": 0.11491677910089493, "learning_rate": 8.587753585050004e-05, "loss": 0.0231, "step": 5660 }, { "epoch": 2.6644736842105265, "grad_norm": 0.13895131647586823, "learning_rate": 8.581990422899585e-05, "loss": 0.0159, "step": 5670 }, { "epoch": 2.6691729323308273, "grad_norm": 0.1562396138906479, "learning_rate": 8.576217467724128e-05, "loss": 0.029, "step": 5680 }, { "epoch": 2.673872180451128, "grad_norm": 0.16841505467891693, "learning_rate": 8.570434735306671e-05, "loss": 0.0164, "step": 5690 }, { "epoch": 2.678571428571429, "grad_norm": 0.19897368550300598, "learning_rate": 8.564642241456986e-05, "loss": 0.03, "step": 5700 }, { "epoch": 2.6832706766917296, "grad_norm": 0.21792681515216827, "learning_rate": 8.558840002011528e-05, "loss": 0.031, "step": 5710 }, { "epoch": 2.6879699248120303, "grad_norm": 0.15453846752643585, "learning_rate": 8.553028032833397e-05, "loss": 0.0211, "step": 5720 }, { "epoch": 2.692669172932331, "grad_norm": 0.129106804728508, "learning_rate": 8.547206349812298e-05, "loss": 0.0238, "step": 5730 }, { "epoch": 2.6973684210526314, "grad_norm": 0.2648111581802368, "learning_rate": 8.541374968864487e-05, "loss": 0.0249, "step": 5740 }, { "epoch": 2.702067669172932, "grad_norm": 0.23966850340366364, "learning_rate": 8.535533905932738e-05, "loss": 0.0224, "step": 5750 }, { "epoch": 2.706766917293233, "grad_norm": 0.2106933295726776, "learning_rate": 8.529683176986295e-05, "loss": 0.02, "step": 5760 }, { "epoch": 2.7114661654135337, "grad_norm": 0.20059479773044586, "learning_rate": 8.523822798020827e-05, "loss": 0.0172, "step": 5770 }, { "epoch": 2.7161654135338344, "grad_norm": 0.14424027502536774, "learning_rate": 8.517952785058385e-05, "loss": 0.0256, "step": 5780 }, { "epoch": 2.720864661654135, "grad_norm": 0.1661170870065689, "learning_rate": 8.512073154147362e-05, "loss": 0.0246, "step": 5790 }, { "epoch": 2.725563909774436, "grad_norm": 0.23051202297210693, "learning_rate": 8.506183921362443e-05, "loss": 0.0223, "step": 5800 }, { "epoch": 2.7302631578947367, "grad_norm": 0.24159590899944305, "learning_rate": 8.500285102804568e-05, "loss": 0.0368, "step": 5810 }, { "epoch": 2.7349624060150375, "grad_norm": 0.20334158837795258, "learning_rate": 8.494376714600878e-05, "loss": 0.0337, "step": 5820 }, { "epoch": 2.7396616541353382, "grad_norm": 0.15019096434116364, "learning_rate": 8.488458772904684e-05, "loss": 0.0246, "step": 5830 }, { "epoch": 2.744360902255639, "grad_norm": 0.15891961753368378, "learning_rate": 8.482531293895412e-05, "loss": 0.0215, "step": 5840 }, { "epoch": 2.7490601503759398, "grad_norm": 0.16506938636302948, "learning_rate": 8.476594293778561e-05, "loss": 0.0256, "step": 5850 }, { "epoch": 2.7537593984962405, "grad_norm": 0.18463429808616638, "learning_rate": 8.470647788785665e-05, "loss": 0.0197, "step": 5860 }, { "epoch": 2.7584586466165413, "grad_norm": 0.186926007270813, "learning_rate": 8.46469179517424e-05, "loss": 0.0199, "step": 5870 }, { "epoch": 2.763157894736842, "grad_norm": 0.19735975563526154, "learning_rate": 8.458726329227747e-05, "loss": 0.0269, "step": 5880 }, { "epoch": 2.767857142857143, "grad_norm": 0.17075762152671814, "learning_rate": 8.452751407255541e-05, "loss": 0.0213, "step": 5890 }, { "epoch": 2.7725563909774436, "grad_norm": 0.25880053639411926, "learning_rate": 8.44676704559283e-05, "loss": 0.0222, "step": 5900 }, { "epoch": 2.7772556390977443, "grad_norm": 0.34592244029045105, "learning_rate": 8.44077326060063e-05, "loss": 0.0295, "step": 5910 }, { "epoch": 2.781954887218045, "grad_norm": 0.16668987274169922, "learning_rate": 8.434770068665723e-05, "loss": 0.0228, "step": 5920 }, { "epoch": 2.786654135338346, "grad_norm": 0.14265859127044678, "learning_rate": 8.428757486200603e-05, "loss": 0.0252, "step": 5930 }, { "epoch": 2.7913533834586466, "grad_norm": 0.2153574824333191, "learning_rate": 8.422735529643444e-05, "loss": 0.0268, "step": 5940 }, { "epoch": 2.7960526315789473, "grad_norm": 0.20310519635677338, "learning_rate": 8.416704215458043e-05, "loss": 0.0205, "step": 5950 }, { "epoch": 2.800751879699248, "grad_norm": 0.20665310323238373, "learning_rate": 8.410663560133784e-05, "loss": 0.022, "step": 5960 }, { "epoch": 2.805451127819549, "grad_norm": 0.23281921446323395, "learning_rate": 8.404613580185585e-05, "loss": 0.0215, "step": 5970 }, { "epoch": 2.8101503759398496, "grad_norm": 0.16719530522823334, "learning_rate": 8.398554292153866e-05, "loss": 0.0158, "step": 5980 }, { "epoch": 2.8148496240601504, "grad_norm": 0.15998616814613342, "learning_rate": 8.392485712604483e-05, "loss": 0.0224, "step": 5990 }, { "epoch": 2.819548872180451, "grad_norm": 0.15419815480709076, "learning_rate": 8.386407858128706e-05, "loss": 0.0285, "step": 6000 }, { "epoch": 2.824248120300752, "grad_norm": 0.18911899626255035, "learning_rate": 8.380320745343153e-05, "loss": 0.0227, "step": 6010 }, { "epoch": 2.8289473684210527, "grad_norm": 0.1339092254638672, "learning_rate": 8.37422439088976e-05, "loss": 0.0219, "step": 6020 }, { "epoch": 2.8336466165413534, "grad_norm": 0.22458699345588684, "learning_rate": 8.368118811435726e-05, "loss": 0.0334, "step": 6030 }, { "epoch": 2.838345864661654, "grad_norm": 0.1807672381401062, "learning_rate": 8.362004023673474e-05, "loss": 0.0302, "step": 6040 }, { "epoch": 2.843045112781955, "grad_norm": 0.19045840203762054, "learning_rate": 8.355880044320598e-05, "loss": 0.0262, "step": 6050 }, { "epoch": 2.8477443609022557, "grad_norm": 0.1385623812675476, "learning_rate": 8.349746890119826e-05, "loss": 0.0204, "step": 6060 }, { "epoch": 2.8524436090225564, "grad_norm": 0.13099630177021027, "learning_rate": 8.343604577838964e-05, "loss": 0.0219, "step": 6070 }, { "epoch": 2.857142857142857, "grad_norm": 0.16968640685081482, "learning_rate": 8.337453124270863e-05, "loss": 0.0179, "step": 6080 }, { "epoch": 2.861842105263158, "grad_norm": 0.19834963977336884, "learning_rate": 8.331292546233362e-05, "loss": 0.0246, "step": 6090 }, { "epoch": 2.8665413533834587, "grad_norm": 0.21443118155002594, "learning_rate": 8.32512286056924e-05, "loss": 0.0263, "step": 6100 }, { "epoch": 2.8712406015037595, "grad_norm": 0.21712978184223175, "learning_rate": 8.318944084146192e-05, "loss": 0.0311, "step": 6110 }, { "epoch": 2.8759398496240602, "grad_norm": 0.18117016553878784, "learning_rate": 8.31275623385675e-05, "loss": 0.0205, "step": 6120 }, { "epoch": 2.880639097744361, "grad_norm": 0.18793556094169617, "learning_rate": 8.306559326618259e-05, "loss": 0.0314, "step": 6130 }, { "epoch": 2.8853383458646618, "grad_norm": 0.18074850738048553, "learning_rate": 8.300353379372834e-05, "loss": 0.0219, "step": 6140 }, { "epoch": 2.8900375939849625, "grad_norm": 0.22259654104709625, "learning_rate": 8.29413840908729e-05, "loss": 0.0226, "step": 6150 }, { "epoch": 2.8947368421052633, "grad_norm": 0.24867089092731476, "learning_rate": 8.287914432753123e-05, "loss": 0.0231, "step": 6160 }, { "epoch": 2.899436090225564, "grad_norm": 0.21988849341869354, "learning_rate": 8.281681467386446e-05, "loss": 0.0257, "step": 6170 }, { "epoch": 2.904135338345865, "grad_norm": 0.18946348130702972, "learning_rate": 8.275439530027948e-05, "loss": 0.0238, "step": 6180 }, { "epoch": 2.9088345864661656, "grad_norm": 0.21483662724494934, "learning_rate": 8.269188637742846e-05, "loss": 0.0232, "step": 6190 }, { "epoch": 2.9135338345864663, "grad_norm": 0.24274252355098724, "learning_rate": 8.262928807620843e-05, "loss": 0.0292, "step": 6200 }, { "epoch": 2.918233082706767, "grad_norm": 0.1813192218542099, "learning_rate": 8.256660056776076e-05, "loss": 0.0223, "step": 6210 }, { "epoch": 2.922932330827068, "grad_norm": 0.25920167565345764, "learning_rate": 8.250382402347065e-05, "loss": 0.0341, "step": 6220 }, { "epoch": 2.9276315789473686, "grad_norm": 0.19504626095294952, "learning_rate": 8.244095861496686e-05, "loss": 0.0217, "step": 6230 }, { "epoch": 2.932330827067669, "grad_norm": 0.1617734581232071, "learning_rate": 8.237800451412095e-05, "loss": 0.0238, "step": 6240 }, { "epoch": 2.9370300751879697, "grad_norm": 0.15244753658771515, "learning_rate": 8.231496189304704e-05, "loss": 0.0244, "step": 6250 }, { "epoch": 2.9417293233082704, "grad_norm": 0.20108844339847565, "learning_rate": 8.225183092410128e-05, "loss": 0.0304, "step": 6260 }, { "epoch": 2.946428571428571, "grad_norm": 0.23991245031356812, "learning_rate": 8.218861177988129e-05, "loss": 0.0286, "step": 6270 }, { "epoch": 2.951127819548872, "grad_norm": 0.17502142488956451, "learning_rate": 8.212530463322583e-05, "loss": 0.0241, "step": 6280 }, { "epoch": 2.9558270676691727, "grad_norm": 0.1822666972875595, "learning_rate": 8.206190965721419e-05, "loss": 0.0276, "step": 6290 }, { "epoch": 2.9605263157894735, "grad_norm": 0.22813008725643158, "learning_rate": 8.199842702516583e-05, "loss": 0.026, "step": 6300 }, { "epoch": 2.965225563909774, "grad_norm": 0.18035642802715302, "learning_rate": 8.193485691063985e-05, "loss": 0.0196, "step": 6310 }, { "epoch": 2.969924812030075, "grad_norm": 0.20626391470432281, "learning_rate": 8.18711994874345e-05, "loss": 0.021, "step": 6320 }, { "epoch": 2.9746240601503757, "grad_norm": 0.23911024630069733, "learning_rate": 8.180745492958674e-05, "loss": 0.0269, "step": 6330 }, { "epoch": 2.9793233082706765, "grad_norm": 0.21398058533668518, "learning_rate": 8.174362341137177e-05, "loss": 0.0282, "step": 6340 }, { "epoch": 2.9840225563909772, "grad_norm": 0.14263466000556946, "learning_rate": 8.167970510730253e-05, "loss": 0.0192, "step": 6350 }, { "epoch": 2.988721804511278, "grad_norm": 0.11814016848802567, "learning_rate": 8.161570019212921e-05, "loss": 0.018, "step": 6360 }, { "epoch": 2.9934210526315788, "grad_norm": 0.17552949488162994, "learning_rate": 8.155160884083881e-05, "loss": 0.0198, "step": 6370 }, { "epoch": 2.9981203007518795, "grad_norm": 0.2032817304134369, "learning_rate": 8.148743122865463e-05, "loss": 0.0202, "step": 6380 }, { "epoch": 3.0028195488721803, "grad_norm": 0.21947944164276123, "learning_rate": 8.14231675310358e-05, "loss": 0.0322, "step": 6390 }, { "epoch": 3.007518796992481, "grad_norm": 0.18210892379283905, "learning_rate": 8.135881792367686e-05, "loss": 0.0278, "step": 6400 }, { "epoch": 3.012218045112782, "grad_norm": 0.16313527524471283, "learning_rate": 8.129438258250712e-05, "loss": 0.0277, "step": 6410 }, { "epoch": 3.0169172932330826, "grad_norm": 0.167706698179245, "learning_rate": 8.12298616836904e-05, "loss": 0.0217, "step": 6420 }, { "epoch": 3.0216165413533833, "grad_norm": 0.16999609768390656, "learning_rate": 8.116525540362434e-05, "loss": 0.0227, "step": 6430 }, { "epoch": 3.026315789473684, "grad_norm": 0.13280713558197021, "learning_rate": 8.110056391894005e-05, "loss": 0.0179, "step": 6440 }, { "epoch": 3.031015037593985, "grad_norm": 0.2560625970363617, "learning_rate": 8.103578740650156e-05, "loss": 0.0299, "step": 6450 }, { "epoch": 3.0357142857142856, "grad_norm": 0.12923632562160492, "learning_rate": 8.097092604340542e-05, "loss": 0.0177, "step": 6460 }, { "epoch": 3.0404135338345863, "grad_norm": 0.1939212530851364, "learning_rate": 8.090598000698009e-05, "loss": 0.0223, "step": 6470 }, { "epoch": 3.045112781954887, "grad_norm": 0.17978033423423767, "learning_rate": 8.084094947478556e-05, "loss": 0.0146, "step": 6480 }, { "epoch": 3.049812030075188, "grad_norm": 0.1513349711894989, "learning_rate": 8.077583462461283e-05, "loss": 0.0154, "step": 6490 }, { "epoch": 3.0545112781954886, "grad_norm": 0.20076999068260193, "learning_rate": 8.07106356344834e-05, "loss": 0.0193, "step": 6500 }, { "epoch": 3.0592105263157894, "grad_norm": 0.19262535870075226, "learning_rate": 8.064535268264883e-05, "loss": 0.0338, "step": 6510 }, { "epoch": 3.06390977443609, "grad_norm": 0.16921037435531616, "learning_rate": 8.057998594759022e-05, "loss": 0.0314, "step": 6520 }, { "epoch": 3.068609022556391, "grad_norm": 0.18066401779651642, "learning_rate": 8.051453560801772e-05, "loss": 0.0173, "step": 6530 }, { "epoch": 3.0733082706766917, "grad_norm": 0.1616375744342804, "learning_rate": 8.044900184287007e-05, "loss": 0.0286, "step": 6540 }, { "epoch": 3.0780075187969924, "grad_norm": 0.15176372230052948, "learning_rate": 8.038338483131407e-05, "loss": 0.0238, "step": 6550 }, { "epoch": 3.082706766917293, "grad_norm": 0.21256138384342194, "learning_rate": 8.031768475274413e-05, "loss": 0.014, "step": 6560 }, { "epoch": 3.087406015037594, "grad_norm": 0.19960719347000122, "learning_rate": 8.025190178678175e-05, "loss": 0.0279, "step": 6570 }, { "epoch": 3.0921052631578947, "grad_norm": 0.1822323501110077, "learning_rate": 8.018603611327504e-05, "loss": 0.0201, "step": 6580 }, { "epoch": 3.0968045112781954, "grad_norm": 0.16403251886367798, "learning_rate": 8.012008791229826e-05, "loss": 0.0194, "step": 6590 }, { "epoch": 3.101503759398496, "grad_norm": 0.19543980062007904, "learning_rate": 8.005405736415126e-05, "loss": 0.0208, "step": 6600 }, { "epoch": 3.106203007518797, "grad_norm": 0.19898360967636108, "learning_rate": 7.998794464935904e-05, "loss": 0.0185, "step": 6610 }, { "epoch": 3.1109022556390977, "grad_norm": 0.13113637268543243, "learning_rate": 7.992174994867123e-05, "loss": 0.0284, "step": 6620 }, { "epoch": 3.1156015037593985, "grad_norm": 0.21924848854541779, "learning_rate": 7.985547344306161e-05, "loss": 0.019, "step": 6630 }, { "epoch": 3.1203007518796992, "grad_norm": 0.13814175128936768, "learning_rate": 7.978911531372765e-05, "loss": 0.0192, "step": 6640 }, { "epoch": 3.125, "grad_norm": 0.17765717208385468, "learning_rate": 7.972267574208991e-05, "loss": 0.0165, "step": 6650 }, { "epoch": 3.1296992481203008, "grad_norm": 0.1574385166168213, "learning_rate": 7.965615490979163e-05, "loss": 0.0271, "step": 6660 }, { "epoch": 3.1343984962406015, "grad_norm": 0.17050959169864655, "learning_rate": 7.958955299869825e-05, "loss": 0.0246, "step": 6670 }, { "epoch": 3.1390977443609023, "grad_norm": 0.17193661630153656, "learning_rate": 7.952287019089685e-05, "loss": 0.0184, "step": 6680 }, { "epoch": 3.143796992481203, "grad_norm": 0.2032015323638916, "learning_rate": 7.945610666869568e-05, "loss": 0.0198, "step": 6690 }, { "epoch": 3.148496240601504, "grad_norm": 0.2095039039850235, "learning_rate": 7.938926261462366e-05, "loss": 0.0335, "step": 6700 }, { "epoch": 3.1531954887218046, "grad_norm": 0.18302005529403687, "learning_rate": 7.932233821142987e-05, "loss": 0.0215, "step": 6710 }, { "epoch": 3.1578947368421053, "grad_norm": 0.18510419130325317, "learning_rate": 7.925533364208309e-05, "loss": 0.0163, "step": 6720 }, { "epoch": 3.162593984962406, "grad_norm": 0.22432270646095276, "learning_rate": 7.918824908977123e-05, "loss": 0.0324, "step": 6730 }, { "epoch": 3.167293233082707, "grad_norm": 0.14236007630825043, "learning_rate": 7.912108473790092e-05, "loss": 0.0251, "step": 6740 }, { "epoch": 3.1719924812030076, "grad_norm": 0.21034064888954163, "learning_rate": 7.905384077009693e-05, "loss": 0.0255, "step": 6750 }, { "epoch": 3.1766917293233083, "grad_norm": 0.18089409172534943, "learning_rate": 7.898651737020166e-05, "loss": 0.0301, "step": 6760 }, { "epoch": 3.181390977443609, "grad_norm": 0.1281910240650177, "learning_rate": 7.891911472227478e-05, "loss": 0.0271, "step": 6770 }, { "epoch": 3.18609022556391, "grad_norm": 0.2349163144826889, "learning_rate": 7.88516330105925e-05, "loss": 0.0231, "step": 6780 }, { "epoch": 3.1907894736842106, "grad_norm": 0.15001404285430908, "learning_rate": 7.878407241964729e-05, "loss": 0.0196, "step": 6790 }, { "epoch": 3.1954887218045114, "grad_norm": 0.20565535128116608, "learning_rate": 7.871643313414718e-05, "loss": 0.0339, "step": 6800 }, { "epoch": 3.200187969924812, "grad_norm": 0.24226559698581696, "learning_rate": 7.864871533901544e-05, "loss": 0.0239, "step": 6810 }, { "epoch": 3.204887218045113, "grad_norm": 0.17688481509685516, "learning_rate": 7.858091921938988e-05, "loss": 0.021, "step": 6820 }, { "epoch": 3.2095864661654137, "grad_norm": 0.22673030197620392, "learning_rate": 7.851304496062254e-05, "loss": 0.0267, "step": 6830 }, { "epoch": 3.2142857142857144, "grad_norm": 0.21045391261577606, "learning_rate": 7.844509274827907e-05, "loss": 0.0199, "step": 6840 }, { "epoch": 3.218984962406015, "grad_norm": 0.14403589069843292, "learning_rate": 7.837706276813819e-05, "loss": 0.0214, "step": 6850 }, { "epoch": 3.223684210526316, "grad_norm": 0.1579636186361313, "learning_rate": 7.830895520619128e-05, "loss": 0.022, "step": 6860 }, { "epoch": 3.2283834586466167, "grad_norm": 0.1557644158601761, "learning_rate": 7.824077024864179e-05, "loss": 0.0192, "step": 6870 }, { "epoch": 3.2330827067669174, "grad_norm": 0.22842907905578613, "learning_rate": 7.817250808190483e-05, "loss": 0.0312, "step": 6880 }, { "epoch": 3.237781954887218, "grad_norm": 0.09012676775455475, "learning_rate": 7.810416889260653e-05, "loss": 0.0202, "step": 6890 }, { "epoch": 3.242481203007519, "grad_norm": 0.14062803983688354, "learning_rate": 7.803575286758364e-05, "loss": 0.0193, "step": 6900 }, { "epoch": 3.2471804511278197, "grad_norm": 0.20320133864879608, "learning_rate": 7.796726019388295e-05, "loss": 0.0269, "step": 6910 }, { "epoch": 3.2518796992481205, "grad_norm": 0.21432797610759735, "learning_rate": 7.789869105876083e-05, "loss": 0.0225, "step": 6920 }, { "epoch": 3.2565789473684212, "grad_norm": 0.1976686418056488, "learning_rate": 7.783004564968263e-05, "loss": 0.0225, "step": 6930 }, { "epoch": 3.261278195488722, "grad_norm": 0.12431466579437256, "learning_rate": 7.776132415432234e-05, "loss": 0.0201, "step": 6940 }, { "epoch": 3.2659774436090228, "grad_norm": 0.11760783195495605, "learning_rate": 7.769252676056187e-05, "loss": 0.0213, "step": 6950 }, { "epoch": 3.2706766917293235, "grad_norm": 0.2153691202402115, "learning_rate": 7.762365365649067e-05, "loss": 0.0239, "step": 6960 }, { "epoch": 3.2753759398496243, "grad_norm": 0.12900856137275696, "learning_rate": 7.755470503040516e-05, "loss": 0.0204, "step": 6970 }, { "epoch": 3.280075187969925, "grad_norm": 0.16941407322883606, "learning_rate": 7.748568107080832e-05, "loss": 0.0211, "step": 6980 }, { "epoch": 3.284774436090226, "grad_norm": 0.2066490203142166, "learning_rate": 7.741658196640892e-05, "loss": 0.0321, "step": 6990 }, { "epoch": 3.2894736842105265, "grad_norm": 0.1458861082792282, "learning_rate": 7.734740790612136e-05, "loss": 0.0245, "step": 7000 }, { "epoch": 3.2941729323308273, "grad_norm": 0.19808563590049744, "learning_rate": 7.727815907906481e-05, "loss": 0.0218, "step": 7010 }, { "epoch": 3.298872180451128, "grad_norm": 0.1738310009241104, "learning_rate": 7.720883567456298e-05, "loss": 0.02, "step": 7020 }, { "epoch": 3.3035714285714284, "grad_norm": 0.18204271793365479, "learning_rate": 7.713943788214337e-05, "loss": 0.0187, "step": 7030 }, { "epoch": 3.308270676691729, "grad_norm": 0.19523444771766663, "learning_rate": 7.70699658915369e-05, "loss": 0.0225, "step": 7040 }, { "epoch": 3.31296992481203, "grad_norm": 0.15334999561309814, "learning_rate": 7.700041989267736e-05, "loss": 0.0215, "step": 7050 }, { "epoch": 3.3176691729323307, "grad_norm": 0.19023378193378448, "learning_rate": 7.693080007570084e-05, "loss": 0.0205, "step": 7060 }, { "epoch": 3.3223684210526314, "grad_norm": 0.14410124719142914, "learning_rate": 7.686110663094525e-05, "loss": 0.0283, "step": 7070 }, { "epoch": 3.327067669172932, "grad_norm": 0.17559373378753662, "learning_rate": 7.679133974894983e-05, "loss": 0.0256, "step": 7080 }, { "epoch": 3.331766917293233, "grad_norm": 0.17834362387657166, "learning_rate": 7.672149962045457e-05, "loss": 0.0236, "step": 7090 }, { "epoch": 3.3364661654135337, "grad_norm": 0.22900669276714325, "learning_rate": 7.66515864363997e-05, "loss": 0.0251, "step": 7100 }, { "epoch": 3.3411654135338344, "grad_norm": 0.15852832794189453, "learning_rate": 7.658160038792518e-05, "loss": 0.0239, "step": 7110 }, { "epoch": 3.345864661654135, "grad_norm": 0.16738687455654144, "learning_rate": 7.651154166637025e-05, "loss": 0.0204, "step": 7120 }, { "epoch": 3.350563909774436, "grad_norm": 0.1924857199192047, "learning_rate": 7.644141046327271e-05, "loss": 0.0271, "step": 7130 }, { "epoch": 3.3552631578947367, "grad_norm": 0.15029247105121613, "learning_rate": 7.637120697036866e-05, "loss": 0.0249, "step": 7140 }, { "epoch": 3.3599624060150375, "grad_norm": 0.16167303919792175, "learning_rate": 7.630093137959171e-05, "loss": 0.0193, "step": 7150 }, { "epoch": 3.3646616541353382, "grad_norm": 0.16990002989768982, "learning_rate": 7.623058388307269e-05, "loss": 0.0305, "step": 7160 }, { "epoch": 3.369360902255639, "grad_norm": 0.22318102419376373, "learning_rate": 7.616016467313891e-05, "loss": 0.0186, "step": 7170 }, { "epoch": 3.3740601503759398, "grad_norm": 0.15061348676681519, "learning_rate": 7.608967394231387e-05, "loss": 0.0213, "step": 7180 }, { "epoch": 3.3787593984962405, "grad_norm": 0.2030361294746399, "learning_rate": 7.60191118833165e-05, "loss": 0.019, "step": 7190 }, { "epoch": 3.3834586466165413, "grad_norm": 0.09996183216571808, "learning_rate": 7.594847868906076e-05, "loss": 0.0297, "step": 7200 }, { "epoch": 3.388157894736842, "grad_norm": 0.22531284391880035, "learning_rate": 7.587777455265515e-05, "loss": 0.0189, "step": 7210 }, { "epoch": 3.392857142857143, "grad_norm": 0.10503160953521729, "learning_rate": 7.580699966740201e-05, "loss": 0.0174, "step": 7220 }, { "epoch": 3.3975563909774436, "grad_norm": 0.11209885030984879, "learning_rate": 7.573615422679726e-05, "loss": 0.0212, "step": 7230 }, { "epoch": 3.4022556390977443, "grad_norm": 0.16246545314788818, "learning_rate": 7.566523842452958e-05, "loss": 0.019, "step": 7240 }, { "epoch": 3.406954887218045, "grad_norm": 0.1651008427143097, "learning_rate": 7.559425245448006e-05, "loss": 0.0241, "step": 7250 }, { "epoch": 3.411654135338346, "grad_norm": 0.12217399477958679, "learning_rate": 7.552319651072164e-05, "loss": 0.0218, "step": 7260 }, { "epoch": 3.4163533834586466, "grad_norm": 0.16352419555187225, "learning_rate": 7.545207078751857e-05, "loss": 0.0269, "step": 7270 }, { "epoch": 3.4210526315789473, "grad_norm": 0.16248418390750885, "learning_rate": 7.538087547932585e-05, "loss": 0.0177, "step": 7280 }, { "epoch": 3.425751879699248, "grad_norm": 0.17357391119003296, "learning_rate": 7.530961078078873e-05, "loss": 0.0195, "step": 7290 }, { "epoch": 3.430451127819549, "grad_norm": 0.19342300295829773, "learning_rate": 7.52382768867422e-05, "loss": 0.029, "step": 7300 }, { "epoch": 3.4351503759398496, "grad_norm": 0.1559091955423355, "learning_rate": 7.516687399221037e-05, "loss": 0.0156, "step": 7310 }, { "epoch": 3.4398496240601504, "grad_norm": 0.156753808259964, "learning_rate": 7.509540229240601e-05, "loss": 0.0218, "step": 7320 }, { "epoch": 3.444548872180451, "grad_norm": 0.25587204098701477, "learning_rate": 7.50238619827301e-05, "loss": 0.0195, "step": 7330 }, { "epoch": 3.449248120300752, "grad_norm": 0.14930440485477448, "learning_rate": 7.495225325877103e-05, "loss": 0.0213, "step": 7340 }, { "epoch": 3.4539473684210527, "grad_norm": 0.1230517104268074, "learning_rate": 7.488057631630437e-05, "loss": 0.0253, "step": 7350 }, { "epoch": 3.4586466165413534, "grad_norm": 0.2551042437553406, "learning_rate": 7.480883135129211e-05, "loss": 0.0337, "step": 7360 }, { "epoch": 3.463345864661654, "grad_norm": 0.2262675166130066, "learning_rate": 7.473701855988227e-05, "loss": 0.0245, "step": 7370 }, { "epoch": 3.468045112781955, "grad_norm": 0.24242740869522095, "learning_rate": 7.466513813840825e-05, "loss": 0.0215, "step": 7380 }, { "epoch": 3.4727443609022557, "grad_norm": 0.13138991594314575, "learning_rate": 7.45931902833884e-05, "loss": 0.0215, "step": 7390 }, { "epoch": 3.4774436090225564, "grad_norm": 0.1672552078962326, "learning_rate": 7.452117519152542e-05, "loss": 0.0163, "step": 7400 }, { "epoch": 3.482142857142857, "grad_norm": 0.1623755842447281, "learning_rate": 7.444909305970578e-05, "loss": 0.0241, "step": 7410 }, { "epoch": 3.486842105263158, "grad_norm": 0.13752855360507965, "learning_rate": 7.437694408499933e-05, "loss": 0.0303, "step": 7420 }, { "epoch": 3.4915413533834587, "grad_norm": 0.23802992701530457, "learning_rate": 7.430472846465856e-05, "loss": 0.0263, "step": 7430 }, { "epoch": 3.4962406015037595, "grad_norm": 0.19350318610668182, "learning_rate": 7.423244639611826e-05, "loss": 0.0204, "step": 7440 }, { "epoch": 3.5009398496240602, "grad_norm": 0.15366598963737488, "learning_rate": 7.416009807699482e-05, "loss": 0.021, "step": 7450 }, { "epoch": 3.505639097744361, "grad_norm": 0.16275855898857117, "learning_rate": 7.408768370508576e-05, "loss": 0.0181, "step": 7460 }, { "epoch": 3.5103383458646618, "grad_norm": 0.22359305620193481, "learning_rate": 7.401520347836926e-05, "loss": 0.0238, "step": 7470 }, { "epoch": 3.5150375939849625, "grad_norm": 0.143357053399086, "learning_rate": 7.394265759500348e-05, "loss": 0.0231, "step": 7480 }, { "epoch": 3.5197368421052633, "grad_norm": 0.14645427465438843, "learning_rate": 7.387004625332608e-05, "loss": 0.0231, "step": 7490 }, { "epoch": 3.524436090225564, "grad_norm": 0.17870672047138214, "learning_rate": 7.379736965185368e-05, "loss": 0.0253, "step": 7500 }, { "epoch": 3.529135338345865, "grad_norm": 0.12523587048053741, "learning_rate": 7.372462798928137e-05, "loss": 0.0172, "step": 7510 }, { "epoch": 3.5338345864661656, "grad_norm": 0.15407587587833405, "learning_rate": 7.365182146448205e-05, "loss": 0.0302, "step": 7520 }, { "epoch": 3.5385338345864663, "grad_norm": 0.178106427192688, "learning_rate": 7.357895027650598e-05, "loss": 0.0266, "step": 7530 }, { "epoch": 3.543233082706767, "grad_norm": 0.1787080615758896, "learning_rate": 7.350601462458024e-05, "loss": 0.0244, "step": 7540 }, { "epoch": 3.547932330827068, "grad_norm": 0.12952303886413574, "learning_rate": 7.343301470810808e-05, "loss": 0.0274, "step": 7550 }, { "epoch": 3.5526315789473686, "grad_norm": 0.14345309138298035, "learning_rate": 7.335995072666848e-05, "loss": 0.0189, "step": 7560 }, { "epoch": 3.557330827067669, "grad_norm": 0.15413424372673035, "learning_rate": 7.328682288001561e-05, "loss": 0.0142, "step": 7570 }, { "epoch": 3.5620300751879697, "grad_norm": 0.19909968972206116, "learning_rate": 7.32136313680782e-05, "loss": 0.0193, "step": 7580 }, { "epoch": 3.5667293233082704, "grad_norm": 0.18164218962192535, "learning_rate": 7.3140376390959e-05, "loss": 0.0241, "step": 7590 }, { "epoch": 3.571428571428571, "grad_norm": 0.12364038825035095, "learning_rate": 7.30670581489344e-05, "loss": 0.023, "step": 7600 }, { "epoch": 3.576127819548872, "grad_norm": 0.14471431076526642, "learning_rate": 7.299367684245362e-05, "loss": 0.0196, "step": 7610 }, { "epoch": 3.5808270676691727, "grad_norm": 0.13688594102859497, "learning_rate": 7.292023267213835e-05, "loss": 0.0186, "step": 7620 }, { "epoch": 3.5855263157894735, "grad_norm": 0.13687384128570557, "learning_rate": 7.284672583878219e-05, "loss": 0.0287, "step": 7630 }, { "epoch": 3.590225563909774, "grad_norm": 0.16403521597385406, "learning_rate": 7.277315654334997e-05, "loss": 0.0195, "step": 7640 }, { "epoch": 3.594924812030075, "grad_norm": 0.15605804324150085, "learning_rate": 7.269952498697734e-05, "loss": 0.027, "step": 7650 }, { "epoch": 3.5996240601503757, "grad_norm": 0.16706325113773346, "learning_rate": 7.262583137097018e-05, "loss": 0.0186, "step": 7660 }, { "epoch": 3.6043233082706765, "grad_norm": 0.18399161100387573, "learning_rate": 7.255207589680402e-05, "loss": 0.0199, "step": 7670 }, { "epoch": 3.6090225563909772, "grad_norm": 0.14956754446029663, "learning_rate": 7.247825876612353e-05, "loss": 0.0223, "step": 7680 }, { "epoch": 3.613721804511278, "grad_norm": 0.16335678100585938, "learning_rate": 7.240438018074189e-05, "loss": 0.0205, "step": 7690 }, { "epoch": 3.6184210526315788, "grad_norm": 0.15078043937683105, "learning_rate": 7.233044034264034e-05, "loss": 0.0166, "step": 7700 }, { "epoch": 3.6231203007518795, "grad_norm": 0.1938415914773941, "learning_rate": 7.225643945396757e-05, "loss": 0.0218, "step": 7710 }, { "epoch": 3.6278195488721803, "grad_norm": 0.1588267982006073, "learning_rate": 7.218237771703921e-05, "loss": 0.031, "step": 7720 }, { "epoch": 3.632518796992481, "grad_norm": 0.17583388090133667, "learning_rate": 7.210825533433719e-05, "loss": 0.0244, "step": 7730 }, { "epoch": 3.637218045112782, "grad_norm": 0.2322985827922821, "learning_rate": 7.203407250850928e-05, "loss": 0.0181, "step": 7740 }, { "epoch": 3.6419172932330826, "grad_norm": 0.16796962916851044, "learning_rate": 7.195982944236851e-05, "loss": 0.0227, "step": 7750 }, { "epoch": 3.6466165413533833, "grad_norm": 0.14361505210399628, "learning_rate": 7.188552633889259e-05, "loss": 0.0223, "step": 7760 }, { "epoch": 3.651315789473684, "grad_norm": 0.13321304321289062, "learning_rate": 7.181116340122336e-05, "loss": 0.0165, "step": 7770 }, { "epoch": 3.656015037593985, "grad_norm": 0.1332216113805771, "learning_rate": 7.173674083266624e-05, "loss": 0.0223, "step": 7780 }, { "epoch": 3.6607142857142856, "grad_norm": 0.15823520720005035, "learning_rate": 7.166225883668969e-05, "loss": 0.0282, "step": 7790 }, { "epoch": 3.6654135338345863, "grad_norm": 0.18819200992584229, "learning_rate": 7.158771761692464e-05, "loss": 0.0156, "step": 7800 }, { "epoch": 3.670112781954887, "grad_norm": 0.1621289998292923, "learning_rate": 7.151311737716397e-05, "loss": 0.0222, "step": 7810 }, { "epoch": 3.674812030075188, "grad_norm": 0.12522225081920624, "learning_rate": 7.143845832136188e-05, "loss": 0.0204, "step": 7820 }, { "epoch": 3.6795112781954886, "grad_norm": 0.12633934617042542, "learning_rate": 7.136374065363334e-05, "loss": 0.0197, "step": 7830 }, { "epoch": 3.6842105263157894, "grad_norm": 0.12287107110023499, "learning_rate": 7.128896457825364e-05, "loss": 0.0218, "step": 7840 }, { "epoch": 3.68890977443609, "grad_norm": 0.17396919429302216, "learning_rate": 7.121413029965769e-05, "loss": 0.0214, "step": 7850 }, { "epoch": 3.693609022556391, "grad_norm": 0.15268878638744354, "learning_rate": 7.113923802243957e-05, "loss": 0.0225, "step": 7860 }, { "epoch": 3.6983082706766917, "grad_norm": 0.14335058629512787, "learning_rate": 7.10642879513519e-05, "loss": 0.0239, "step": 7870 }, { "epoch": 3.7030075187969924, "grad_norm": 0.13077248632907867, "learning_rate": 7.09892802913053e-05, "loss": 0.0208, "step": 7880 }, { "epoch": 3.707706766917293, "grad_norm": 0.16183945536613464, "learning_rate": 7.091421524736784e-05, "loss": 0.0232, "step": 7890 }, { "epoch": 3.712406015037594, "grad_norm": 0.20572522282600403, "learning_rate": 7.083909302476453e-05, "loss": 0.0263, "step": 7900 }, { "epoch": 3.7171052631578947, "grad_norm": 0.17196118831634521, "learning_rate": 7.076391382887661e-05, "loss": 0.0193, "step": 7910 }, { "epoch": 3.7218045112781954, "grad_norm": 0.16815915703773499, "learning_rate": 7.068867786524116e-05, "loss": 0.0175, "step": 7920 }, { "epoch": 3.726503759398496, "grad_norm": 0.20678561925888062, "learning_rate": 7.061338533955043e-05, "loss": 0.0229, "step": 7930 }, { "epoch": 3.731203007518797, "grad_norm": 0.12356437742710114, "learning_rate": 7.053803645765128e-05, "loss": 0.0273, "step": 7940 }, { "epoch": 3.7359022556390977, "grad_norm": 0.1457894891500473, "learning_rate": 7.04626314255447e-05, "loss": 0.0187, "step": 7950 }, { "epoch": 3.7406015037593985, "grad_norm": 0.149841770529747, "learning_rate": 7.038717044938519e-05, "loss": 0.0201, "step": 7960 }, { "epoch": 3.7453007518796992, "grad_norm": 0.22033792734146118, "learning_rate": 7.031165373548014e-05, "loss": 0.0246, "step": 7970 }, { "epoch": 3.75, "grad_norm": 0.13069063425064087, "learning_rate": 7.023608149028937e-05, "loss": 0.0155, "step": 7980 }, { "epoch": 3.7546992481203008, "grad_norm": 0.15247632563114166, "learning_rate": 7.016045392042452e-05, "loss": 0.0217, "step": 7990 }, { "epoch": 3.7593984962406015, "grad_norm": 0.1565304547548294, "learning_rate": 7.008477123264848e-05, "loss": 0.0212, "step": 8000 }, { "epoch": 3.7640977443609023, "grad_norm": 0.08946457505226135, "learning_rate": 7.000903363387482e-05, "loss": 0.0178, "step": 8010 }, { "epoch": 3.768796992481203, "grad_norm": 0.16338147222995758, "learning_rate": 6.993324133116726e-05, "loss": 0.0188, "step": 8020 }, { "epoch": 3.773496240601504, "grad_norm": 0.15769213438034058, "learning_rate": 6.985739453173903e-05, "loss": 0.0183, "step": 8030 }, { "epoch": 3.7781954887218046, "grad_norm": 0.18680426478385925, "learning_rate": 6.978149344295242e-05, "loss": 0.0183, "step": 8040 }, { "epoch": 3.7828947368421053, "grad_norm": 0.15789374709129333, "learning_rate": 6.97055382723181e-05, "loss": 0.0226, "step": 8050 }, { "epoch": 3.787593984962406, "grad_norm": 0.14424045383930206, "learning_rate": 6.962952922749457e-05, "loss": 0.0205, "step": 8060 }, { "epoch": 3.792293233082707, "grad_norm": 0.2000490128993988, "learning_rate": 6.955346651628771e-05, "loss": 0.022, "step": 8070 }, { "epoch": 3.7969924812030076, "grad_norm": 0.11632633209228516, "learning_rate": 6.947735034665002e-05, "loss": 0.03, "step": 8080 }, { "epoch": 3.8016917293233083, "grad_norm": 0.15219521522521973, "learning_rate": 6.940118092668022e-05, "loss": 0.0171, "step": 8090 }, { "epoch": 3.806390977443609, "grad_norm": 0.16450823843479156, "learning_rate": 6.932495846462261e-05, "loss": 0.0203, "step": 8100 }, { "epoch": 3.81109022556391, "grad_norm": 0.2347194105386734, "learning_rate": 6.924868316886649e-05, "loss": 0.0257, "step": 8110 }, { "epoch": 3.8157894736842106, "grad_norm": 0.14800488948822021, "learning_rate": 6.917235524794558e-05, "loss": 0.0263, "step": 8120 }, { "epoch": 3.8204887218045114, "grad_norm": 0.18087433278560638, "learning_rate": 6.909597491053751e-05, "loss": 0.0192, "step": 8130 }, { "epoch": 3.825187969924812, "grad_norm": 0.14640933275222778, "learning_rate": 6.901954236546323e-05, "loss": 0.0248, "step": 8140 }, { "epoch": 3.829887218045113, "grad_norm": 0.12826332449913025, "learning_rate": 6.894305782168638e-05, "loss": 0.0148, "step": 8150 }, { "epoch": 3.8345864661654137, "grad_norm": 0.165438711643219, "learning_rate": 6.886652148831279e-05, "loss": 0.0189, "step": 8160 }, { "epoch": 3.8392857142857144, "grad_norm": 0.17549659311771393, "learning_rate": 6.878993357458986e-05, "loss": 0.024, "step": 8170 }, { "epoch": 3.843984962406015, "grad_norm": 0.10705628246068954, "learning_rate": 6.871329428990602e-05, "loss": 0.0171, "step": 8180 }, { "epoch": 3.848684210526316, "grad_norm": 0.13803797960281372, "learning_rate": 6.863660384379017e-05, "loss": 0.0254, "step": 8190 }, { "epoch": 3.8533834586466167, "grad_norm": 0.21638810634613037, "learning_rate": 6.855986244591104e-05, "loss": 0.0247, "step": 8200 }, { "epoch": 3.8580827067669174, "grad_norm": 0.1485230177640915, "learning_rate": 6.84830703060767e-05, "loss": 0.0173, "step": 8210 }, { "epoch": 3.862781954887218, "grad_norm": 0.12228238582611084, "learning_rate": 6.840622763423391e-05, "loss": 0.0222, "step": 8220 }, { "epoch": 3.867481203007519, "grad_norm": 0.15566863119602203, "learning_rate": 6.83293346404676e-05, "loss": 0.0179, "step": 8230 }, { "epoch": 3.8721804511278197, "grad_norm": 0.1872120499610901, "learning_rate": 6.825239153500029e-05, "loss": 0.0245, "step": 8240 }, { "epoch": 3.8768796992481205, "grad_norm": 0.12243503332138062, "learning_rate": 6.817539852819149e-05, "loss": 0.0208, "step": 8250 }, { "epoch": 3.8815789473684212, "grad_norm": 0.204155832529068, "learning_rate": 6.809835583053715e-05, "loss": 0.0204, "step": 8260 }, { "epoch": 3.886278195488722, "grad_norm": 0.1609678864479065, "learning_rate": 6.802126365266905e-05, "loss": 0.0214, "step": 8270 }, { "epoch": 3.8909774436090228, "grad_norm": 0.17930862307548523, "learning_rate": 6.794412220535426e-05, "loss": 0.0268, "step": 8280 }, { "epoch": 3.8956766917293235, "grad_norm": 0.16851861774921417, "learning_rate": 6.786693169949455e-05, "loss": 0.0266, "step": 8290 }, { "epoch": 3.9003759398496243, "grad_norm": 0.16229747235774994, "learning_rate": 6.778969234612584e-05, "loss": 0.0138, "step": 8300 }, { "epoch": 3.905075187969925, "grad_norm": 0.17356090247631073, "learning_rate": 6.771240435641754e-05, "loss": 0.013, "step": 8310 }, { "epoch": 3.909774436090226, "grad_norm": 0.1390371471643448, "learning_rate": 6.763506794167208e-05, "loss": 0.0163, "step": 8320 }, { "epoch": 3.9144736842105265, "grad_norm": 0.17728669941425323, "learning_rate": 6.755768331332424e-05, "loss": 0.0254, "step": 8330 }, { "epoch": 3.9191729323308273, "grad_norm": 0.13212069869041443, "learning_rate": 6.748025068294067e-05, "loss": 0.0239, "step": 8340 }, { "epoch": 3.923872180451128, "grad_norm": 0.1477879285812378, "learning_rate": 6.740277026221923e-05, "loss": 0.0211, "step": 8350 }, { "epoch": 3.928571428571429, "grad_norm": 0.17585650086402893, "learning_rate": 6.732524226298841e-05, "loss": 0.0282, "step": 8360 }, { "epoch": 3.9332706766917296, "grad_norm": 0.20357094705104828, "learning_rate": 6.72476668972068e-05, "loss": 0.0249, "step": 8370 }, { "epoch": 3.9379699248120303, "grad_norm": 0.14865775406360626, "learning_rate": 6.71700443769625e-05, "loss": 0.0248, "step": 8380 }, { "epoch": 3.942669172932331, "grad_norm": 0.24721617996692657, "learning_rate": 6.709237491447249e-05, "loss": 0.019, "step": 8390 }, { "epoch": 3.9473684210526314, "grad_norm": 0.1690632849931717, "learning_rate": 6.701465872208216e-05, "loss": 0.0199, "step": 8400 }, { "epoch": 3.952067669172932, "grad_norm": 0.1842128187417984, "learning_rate": 6.693689601226458e-05, "loss": 0.0185, "step": 8410 }, { "epoch": 3.956766917293233, "grad_norm": 0.15533077716827393, "learning_rate": 6.685908699762002e-05, "loss": 0.0171, "step": 8420 }, { "epoch": 3.9614661654135337, "grad_norm": 0.19935789704322815, "learning_rate": 6.67812318908754e-05, "loss": 0.0166, "step": 8430 }, { "epoch": 3.9661654135338344, "grad_norm": 0.16483817994594574, "learning_rate": 6.670333090488356e-05, "loss": 0.0167, "step": 8440 }, { "epoch": 3.970864661654135, "grad_norm": 0.13967077434062958, "learning_rate": 6.662538425262285e-05, "loss": 0.0189, "step": 8450 }, { "epoch": 3.975563909774436, "grad_norm": 0.08983052521944046, "learning_rate": 6.654739214719641e-05, "loss": 0.0158, "step": 8460 }, { "epoch": 3.9802631578947367, "grad_norm": 0.11961586773395538, "learning_rate": 6.646935480183173e-05, "loss": 0.0195, "step": 8470 }, { "epoch": 3.9849624060150375, "grad_norm": 0.14519299566745758, "learning_rate": 6.639127242987988e-05, "loss": 0.0227, "step": 8480 }, { "epoch": 3.9896616541353382, "grad_norm": 0.1847597360610962, "learning_rate": 6.631314524481513e-05, "loss": 0.0215, "step": 8490 }, { "epoch": 3.994360902255639, "grad_norm": 0.16919434070587158, "learning_rate": 6.623497346023418e-05, "loss": 0.0233, "step": 8500 }, { "epoch": 3.9990601503759398, "grad_norm": 0.18585987389087677, "learning_rate": 6.615675728985572e-05, "loss": 0.0317, "step": 8510 }, { "epoch": 4.003759398496241, "grad_norm": 0.11600866168737411, "learning_rate": 6.607849694751977e-05, "loss": 0.0221, "step": 8520 }, { "epoch": 4.008458646616542, "grad_norm": 0.1685023158788681, "learning_rate": 6.600019264718713e-05, "loss": 0.0164, "step": 8530 }, { "epoch": 4.0131578947368425, "grad_norm": 0.13364779949188232, "learning_rate": 6.592184460293877e-05, "loss": 0.024, "step": 8540 }, { "epoch": 4.017857142857143, "grad_norm": 0.1409081667661667, "learning_rate": 6.584345302897523e-05, "loss": 0.0197, "step": 8550 }, { "epoch": 4.022556390977444, "grad_norm": 0.18445076048374176, "learning_rate": 6.576501813961609e-05, "loss": 0.0186, "step": 8560 }, { "epoch": 4.027255639097745, "grad_norm": 0.13780328631401062, "learning_rate": 6.568654014929932e-05, "loss": 0.0158, "step": 8570 }, { "epoch": 4.0319548872180455, "grad_norm": 0.1404091864824295, "learning_rate": 6.56080192725808e-05, "loss": 0.0178, "step": 8580 }, { "epoch": 4.036654135338346, "grad_norm": 0.1806766241788864, "learning_rate": 6.552945572413358e-05, "loss": 0.0212, "step": 8590 }, { "epoch": 4.041353383458647, "grad_norm": 0.12911942601203918, "learning_rate": 6.545084971874738e-05, "loss": 0.0162, "step": 8600 }, { "epoch": 4.046052631578948, "grad_norm": 0.1130770891904831, "learning_rate": 6.537220147132805e-05, "loss": 0.0204, "step": 8610 }, { "epoch": 4.0507518796992485, "grad_norm": 0.1793549507856369, "learning_rate": 6.529351119689688e-05, "loss": 0.0149, "step": 8620 }, { "epoch": 4.055451127819549, "grad_norm": 0.12300092726945877, "learning_rate": 6.521477911059008e-05, "loss": 0.0273, "step": 8630 }, { "epoch": 4.06015037593985, "grad_norm": 0.11498741805553436, "learning_rate": 6.513600542765817e-05, "loss": 0.0163, "step": 8640 }, { "epoch": 4.06484962406015, "grad_norm": 0.17557263374328613, "learning_rate": 6.505719036346539e-05, "loss": 0.0187, "step": 8650 }, { "epoch": 4.069548872180451, "grad_norm": 0.12561099231243134, "learning_rate": 6.497833413348909e-05, "loss": 0.0167, "step": 8660 }, { "epoch": 4.0742481203007515, "grad_norm": 0.14439953863620758, "learning_rate": 6.489943695331923e-05, "loss": 0.0181, "step": 8670 }, { "epoch": 4.078947368421052, "grad_norm": 0.2207750380039215, "learning_rate": 6.48204990386577e-05, "loss": 0.0187, "step": 8680 }, { "epoch": 4.083646616541353, "grad_norm": 0.1530761420726776, "learning_rate": 6.474152060531768e-05, "loss": 0.0188, "step": 8690 }, { "epoch": 4.088345864661654, "grad_norm": 0.18468138575553894, "learning_rate": 6.466250186922325e-05, "loss": 0.0153, "step": 8700 }, { "epoch": 4.0930451127819545, "grad_norm": 0.19658444821834564, "learning_rate": 6.458344304640858e-05, "loss": 0.0144, "step": 8710 }, { "epoch": 4.097744360902255, "grad_norm": 0.16969050467014313, "learning_rate": 6.450434435301751e-05, "loss": 0.0167, "step": 8720 }, { "epoch": 4.102443609022556, "grad_norm": 0.20715074241161346, "learning_rate": 6.44252060053028e-05, "loss": 0.0188, "step": 8730 }, { "epoch": 4.107142857142857, "grad_norm": 0.19607798755168915, "learning_rate": 6.43460282196257e-05, "loss": 0.0176, "step": 8740 }, { "epoch": 4.1118421052631575, "grad_norm": 0.13354948163032532, "learning_rate": 6.426681121245527e-05, "loss": 0.0175, "step": 8750 }, { "epoch": 4.116541353383458, "grad_norm": 0.16963709890842438, "learning_rate": 6.418755520036775e-05, "loss": 0.0184, "step": 8760 }, { "epoch": 4.121240601503759, "grad_norm": 0.17850276827812195, "learning_rate": 6.410826040004607e-05, "loss": 0.0206, "step": 8770 }, { "epoch": 4.12593984962406, "grad_norm": 0.13571204245090485, "learning_rate": 6.402892702827916e-05, "loss": 0.0206, "step": 8780 }, { "epoch": 4.1306390977443606, "grad_norm": 0.14652928709983826, "learning_rate": 6.394955530196147e-05, "loss": 0.0285, "step": 8790 }, { "epoch": 4.135338345864661, "grad_norm": 0.20904070138931274, "learning_rate": 6.387014543809223e-05, "loss": 0.0227, "step": 8800 }, { "epoch": 4.140037593984962, "grad_norm": 0.11015600711107254, "learning_rate": 6.3790697653775e-05, "loss": 0.0283, "step": 8810 }, { "epoch": 4.144736842105263, "grad_norm": 0.2260255068540573, "learning_rate": 6.371121216621698e-05, "loss": 0.018, "step": 8820 }, { "epoch": 4.149436090225564, "grad_norm": 0.16859593987464905, "learning_rate": 6.363168919272846e-05, "loss": 0.0179, "step": 8830 }, { "epoch": 4.154135338345864, "grad_norm": 0.11148626357316971, "learning_rate": 6.355212895072223e-05, "loss": 0.0303, "step": 8840 }, { "epoch": 4.158834586466165, "grad_norm": 0.1172458827495575, "learning_rate": 6.34725316577129e-05, "loss": 0.023, "step": 8850 }, { "epoch": 4.163533834586466, "grad_norm": 0.17377308011054993, "learning_rate": 6.339289753131649e-05, "loss": 0.0229, "step": 8860 }, { "epoch": 4.168233082706767, "grad_norm": 0.1693037748336792, "learning_rate": 6.331322678924962e-05, "loss": 0.0224, "step": 8870 }, { "epoch": 4.172932330827067, "grad_norm": 0.1303318440914154, "learning_rate": 6.323351964932908e-05, "loss": 0.0265, "step": 8880 }, { "epoch": 4.177631578947368, "grad_norm": 0.1451072245836258, "learning_rate": 6.315377632947115e-05, "loss": 0.0261, "step": 8890 }, { "epoch": 4.182330827067669, "grad_norm": 0.11508966982364655, "learning_rate": 6.307399704769099e-05, "loss": 0.0175, "step": 8900 }, { "epoch": 4.18703007518797, "grad_norm": 0.15271732211112976, "learning_rate": 6.299418202210214e-05, "loss": 0.0212, "step": 8910 }, { "epoch": 4.19172932330827, "grad_norm": 0.16247068345546722, "learning_rate": 6.291433147091583e-05, "loss": 0.0188, "step": 8920 }, { "epoch": 4.196428571428571, "grad_norm": 0.1361701935529709, "learning_rate": 6.283444561244042e-05, "loss": 0.0173, "step": 8930 }, { "epoch": 4.201127819548872, "grad_norm": 0.1367974579334259, "learning_rate": 6.275452466508077e-05, "loss": 0.0185, "step": 8940 }, { "epoch": 4.205827067669173, "grad_norm": 0.2000769078731537, "learning_rate": 6.26745688473377e-05, "loss": 0.0196, "step": 8950 }, { "epoch": 4.2105263157894735, "grad_norm": 0.18851491808891296, "learning_rate": 6.259457837780742e-05, "loss": 0.0224, "step": 8960 }, { "epoch": 4.215225563909774, "grad_norm": 0.13504041731357574, "learning_rate": 6.251455347518073e-05, "loss": 0.0185, "step": 8970 }, { "epoch": 4.219924812030075, "grad_norm": 0.21460330486297607, "learning_rate": 6.243449435824276e-05, "loss": 0.0231, "step": 8980 }, { "epoch": 4.224624060150376, "grad_norm": 0.18271034955978394, "learning_rate": 6.235440124587198e-05, "loss": 0.0208, "step": 8990 }, { "epoch": 4.2293233082706765, "grad_norm": 0.14157791435718536, "learning_rate": 6.227427435703997e-05, "loss": 0.0194, "step": 9000 }, { "epoch": 4.234022556390977, "grad_norm": 0.1823650449514389, "learning_rate": 6.219411391081055e-05, "loss": 0.025, "step": 9010 }, { "epoch": 4.238721804511278, "grad_norm": 0.10560489445924759, "learning_rate": 6.211392012633932e-05, "loss": 0.0209, "step": 9020 }, { "epoch": 4.243421052631579, "grad_norm": 0.13004441559314728, "learning_rate": 6.203369322287306e-05, "loss": 0.022, "step": 9030 }, { "epoch": 4.2481203007518795, "grad_norm": 0.189819797873497, "learning_rate": 6.195343341974899e-05, "loss": 0.018, "step": 9040 }, { "epoch": 4.25281954887218, "grad_norm": 0.1904393583536148, "learning_rate": 6.187314093639444e-05, "loss": 0.0225, "step": 9050 }, { "epoch": 4.257518796992481, "grad_norm": 0.1637134999036789, "learning_rate": 6.179281599232591e-05, "loss": 0.0229, "step": 9060 }, { "epoch": 4.262218045112782, "grad_norm": 0.1223156750202179, "learning_rate": 6.17124588071488e-05, "loss": 0.0159, "step": 9070 }, { "epoch": 4.2669172932330826, "grad_norm": 0.1365230232477188, "learning_rate": 6.163206960055651e-05, "loss": 0.0229, "step": 9080 }, { "epoch": 4.271616541353383, "grad_norm": 0.11772079765796661, "learning_rate": 6.155164859233012e-05, "loss": 0.0178, "step": 9090 }, { "epoch": 4.276315789473684, "grad_norm": 0.13973119854927063, "learning_rate": 6.147119600233758e-05, "loss": 0.0181, "step": 9100 }, { "epoch": 4.281015037593985, "grad_norm": 0.15319527685642242, "learning_rate": 6.13907120505332e-05, "loss": 0.0148, "step": 9110 }, { "epoch": 4.285714285714286, "grad_norm": 0.12740933895111084, "learning_rate": 6.131019695695702e-05, "loss": 0.0173, "step": 9120 }, { "epoch": 4.290413533834586, "grad_norm": 0.12393118441104889, "learning_rate": 6.122965094173424e-05, "loss": 0.0211, "step": 9130 }, { "epoch": 4.295112781954887, "grad_norm": 0.10120144486427307, "learning_rate": 6.11490742250746e-05, "loss": 0.019, "step": 9140 }, { "epoch": 4.299812030075188, "grad_norm": 0.12977886199951172, "learning_rate": 6.106846702727172e-05, "loss": 0.0188, "step": 9150 }, { "epoch": 4.304511278195489, "grad_norm": 0.21875134110450745, "learning_rate": 6.0987829568702656e-05, "loss": 0.0208, "step": 9160 }, { "epoch": 4.309210526315789, "grad_norm": 0.18392659723758698, "learning_rate": 6.090716206982714e-05, "loss": 0.0192, "step": 9170 }, { "epoch": 4.31390977443609, "grad_norm": 0.1815064251422882, "learning_rate": 6.0826464751186994e-05, "loss": 0.017, "step": 9180 }, { "epoch": 4.318609022556391, "grad_norm": 0.12864035367965698, "learning_rate": 6.074573783340562e-05, "loss": 0.0159, "step": 9190 }, { "epoch": 4.323308270676692, "grad_norm": 0.2170429825782776, "learning_rate": 6.066498153718735e-05, "loss": 0.0155, "step": 9200 }, { "epoch": 4.328007518796992, "grad_norm": 0.179051011800766, "learning_rate": 6.0584196083316794e-05, "loss": 0.0299, "step": 9210 }, { "epoch": 4.332706766917293, "grad_norm": 0.09811102598905563, "learning_rate": 6.05033816926583e-05, "loss": 0.0182, "step": 9220 }, { "epoch": 4.337406015037594, "grad_norm": 0.11130985617637634, "learning_rate": 6.042253858615532e-05, "loss": 0.017, "step": 9230 }, { "epoch": 4.342105263157895, "grad_norm": 0.19028566777706146, "learning_rate": 6.034166698482984e-05, "loss": 0.0273, "step": 9240 }, { "epoch": 4.3468045112781954, "grad_norm": 0.13545772433280945, "learning_rate": 6.026076710978171e-05, "loss": 0.0164, "step": 9250 }, { "epoch": 4.351503759398496, "grad_norm": 0.11075558513402939, "learning_rate": 6.017983918218812e-05, "loss": 0.02, "step": 9260 }, { "epoch": 4.356203007518797, "grad_norm": 0.15003326535224915, "learning_rate": 6.009888342330292e-05, "loss": 0.0224, "step": 9270 }, { "epoch": 4.360902255639098, "grad_norm": 0.1644802838563919, "learning_rate": 6.001790005445607e-05, "loss": 0.0227, "step": 9280 }, { "epoch": 4.3656015037593985, "grad_norm": 0.1330413520336151, "learning_rate": 5.9936889297052986e-05, "loss": 0.0151, "step": 9290 }, { "epoch": 4.370300751879699, "grad_norm": 0.1558411419391632, "learning_rate": 5.985585137257401e-05, "loss": 0.0227, "step": 9300 }, { "epoch": 4.375, "grad_norm": 0.16447797417640686, "learning_rate": 5.977478650257374e-05, "loss": 0.0234, "step": 9310 }, { "epoch": 4.379699248120301, "grad_norm": 0.13559196889400482, "learning_rate": 5.969369490868042e-05, "loss": 0.0338, "step": 9320 }, { "epoch": 4.3843984962406015, "grad_norm": 0.15715987980365753, "learning_rate": 5.961257681259535e-05, "loss": 0.0205, "step": 9330 }, { "epoch": 4.389097744360902, "grad_norm": 0.11574830114841461, "learning_rate": 5.953143243609235e-05, "loss": 0.0166, "step": 9340 }, { "epoch": 4.393796992481203, "grad_norm": 0.093317911028862, "learning_rate": 5.945026200101702e-05, "loss": 0.015, "step": 9350 }, { "epoch": 4.398496240601504, "grad_norm": 0.1592835932970047, "learning_rate": 5.9369065729286245e-05, "loss": 0.0164, "step": 9360 }, { "epoch": 4.4031954887218046, "grad_norm": 0.1047334372997284, "learning_rate": 5.92878438428875e-05, "loss": 0.0276, "step": 9370 }, { "epoch": 4.407894736842105, "grad_norm": 0.12860779464244843, "learning_rate": 5.9206596563878357e-05, "loss": 0.0132, "step": 9380 }, { "epoch": 4.412593984962406, "grad_norm": 0.15609225630760193, "learning_rate": 5.912532411438576e-05, "loss": 0.0246, "step": 9390 }, { "epoch": 4.417293233082707, "grad_norm": 0.13190561532974243, "learning_rate": 5.90440267166055e-05, "loss": 0.0239, "step": 9400 }, { "epoch": 4.421992481203008, "grad_norm": 0.15665863454341888, "learning_rate": 5.896270459280153e-05, "loss": 0.0188, "step": 9410 }, { "epoch": 4.426691729323308, "grad_norm": 0.1300572156906128, "learning_rate": 5.888135796530544e-05, "loss": 0.0174, "step": 9420 }, { "epoch": 4.431390977443609, "grad_norm": 0.16136862337589264, "learning_rate": 5.8799987056515804e-05, "loss": 0.0235, "step": 9430 }, { "epoch": 4.43609022556391, "grad_norm": 0.1629219949245453, "learning_rate": 5.871859208889759e-05, "loss": 0.0184, "step": 9440 }, { "epoch": 4.440789473684211, "grad_norm": 0.18689210712909698, "learning_rate": 5.8637173284981526e-05, "loss": 0.022, "step": 9450 }, { "epoch": 4.445488721804511, "grad_norm": 0.1344980001449585, "learning_rate": 5.85557308673635e-05, "loss": 0.0143, "step": 9460 }, { "epoch": 4.450187969924812, "grad_norm": 0.16175200045108795, "learning_rate": 5.847426505870399e-05, "loss": 0.0204, "step": 9470 }, { "epoch": 4.454887218045113, "grad_norm": 0.15878114104270935, "learning_rate": 5.8392776081727385e-05, "loss": 0.0155, "step": 9480 }, { "epoch": 4.459586466165414, "grad_norm": 0.15410248935222626, "learning_rate": 5.831126415922148e-05, "loss": 0.0176, "step": 9490 }, { "epoch": 4.464285714285714, "grad_norm": 0.19770678877830505, "learning_rate": 5.8229729514036705e-05, "loss": 0.0183, "step": 9500 }, { "epoch": 4.468984962406015, "grad_norm": 0.15174371004104614, "learning_rate": 5.8148172369085686e-05, "loss": 0.0254, "step": 9510 }, { "epoch": 4.473684210526316, "grad_norm": 0.1716819554567337, "learning_rate": 5.8066592947342555e-05, "loss": 0.0205, "step": 9520 }, { "epoch": 4.478383458646617, "grad_norm": 0.11503490060567856, "learning_rate": 5.798499147184233e-05, "loss": 0.0187, "step": 9530 }, { "epoch": 4.4830827067669174, "grad_norm": 0.14293760061264038, "learning_rate": 5.7903368165680327e-05, "loss": 0.0234, "step": 9540 }, { "epoch": 4.487781954887218, "grad_norm": 0.12031774967908859, "learning_rate": 5.782172325201155e-05, "loss": 0.0268, "step": 9550 }, { "epoch": 4.492481203007519, "grad_norm": 0.10676831752061844, "learning_rate": 5.7740056954050084e-05, "loss": 0.0156, "step": 9560 }, { "epoch": 4.49718045112782, "grad_norm": 0.11658485978841782, "learning_rate": 5.765836949506843e-05, "loss": 0.0219, "step": 9570 }, { "epoch": 4.5018796992481205, "grad_norm": 0.17919903993606567, "learning_rate": 5.757666109839702e-05, "loss": 0.012, "step": 9580 }, { "epoch": 4.506578947368421, "grad_norm": 0.1580527275800705, "learning_rate": 5.74949319874235e-05, "loss": 0.0151, "step": 9590 }, { "epoch": 4.511278195488722, "grad_norm": 0.14657042920589447, "learning_rate": 5.74131823855921e-05, "loss": 0.0215, "step": 9600 }, { "epoch": 4.515977443609023, "grad_norm": 0.1635216772556305, "learning_rate": 5.733141251640315e-05, "loss": 0.0145, "step": 9610 }, { "epoch": 4.5206766917293235, "grad_norm": 0.15993301570415497, "learning_rate": 5.72496226034123e-05, "loss": 0.0214, "step": 9620 }, { "epoch": 4.525375939849624, "grad_norm": 0.14244981110095978, "learning_rate": 5.7167812870230094e-05, "loss": 0.0194, "step": 9630 }, { "epoch": 4.530075187969925, "grad_norm": 0.1011502593755722, "learning_rate": 5.7085983540521216e-05, "loss": 0.0131, "step": 9640 }, { "epoch": 4.534774436090226, "grad_norm": 0.12874962389469147, "learning_rate": 5.70041348380039e-05, "loss": 0.02, "step": 9650 }, { "epoch": 4.5394736842105265, "grad_norm": 0.18720367550849915, "learning_rate": 5.692226698644938e-05, "loss": 0.0238, "step": 9660 }, { "epoch": 4.544172932330827, "grad_norm": 0.17449072003364563, "learning_rate": 5.6840380209681255e-05, "loss": 0.0319, "step": 9670 }, { "epoch": 4.548872180451128, "grad_norm": 0.15957769751548767, "learning_rate": 5.675847473157485e-05, "loss": 0.0234, "step": 9680 }, { "epoch": 4.553571428571429, "grad_norm": 0.13500474393367767, "learning_rate": 5.667655077605659e-05, "loss": 0.0217, "step": 9690 }, { "epoch": 4.55827067669173, "grad_norm": 0.1734580397605896, "learning_rate": 5.6594608567103456e-05, "loss": 0.0185, "step": 9700 }, { "epoch": 4.56296992481203, "grad_norm": 0.1234162226319313, "learning_rate": 5.65126483287423e-05, "loss": 0.0155, "step": 9710 }, { "epoch": 4.567669172932331, "grad_norm": 0.13775312900543213, "learning_rate": 5.6430670285049314e-05, "loss": 0.0246, "step": 9720 }, { "epoch": 4.572368421052632, "grad_norm": 0.13060703873634338, "learning_rate": 5.634867466014932e-05, "loss": 0.0206, "step": 9730 }, { "epoch": 4.577067669172933, "grad_norm": 0.27983561158180237, "learning_rate": 5.6266661678215216e-05, "loss": 0.0196, "step": 9740 }, { "epoch": 4.581766917293233, "grad_norm": 0.12983421981334686, "learning_rate": 5.618463156346739e-05, "loss": 0.0234, "step": 9750 }, { "epoch": 4.586466165413534, "grad_norm": 0.08709783852100372, "learning_rate": 5.6102584540173006e-05, "loss": 0.0207, "step": 9760 }, { "epoch": 4.591165413533835, "grad_norm": 0.1174778863787651, "learning_rate": 5.602052083264555e-05, "loss": 0.0145, "step": 9770 }, { "epoch": 4.595864661654136, "grad_norm": 0.1293332278728485, "learning_rate": 5.5938440665244006e-05, "loss": 0.0214, "step": 9780 }, { "epoch": 4.600563909774436, "grad_norm": 0.17360520362854004, "learning_rate": 5.585634426237246e-05, "loss": 0.0238, "step": 9790 }, { "epoch": 4.605263157894737, "grad_norm": 0.16103921830654144, "learning_rate": 5.577423184847932e-05, "loss": 0.0157, "step": 9800 }, { "epoch": 4.609962406015038, "grad_norm": 0.09659445285797119, "learning_rate": 5.569210364805677e-05, "loss": 0.0202, "step": 9810 }, { "epoch": 4.614661654135339, "grad_norm": 0.2310553640127182, "learning_rate": 5.560995988564023e-05, "loss": 0.0208, "step": 9820 }, { "epoch": 4.6193609022556394, "grad_norm": 0.11814546585083008, "learning_rate": 5.552780078580756e-05, "loss": 0.0151, "step": 9830 }, { "epoch": 4.62406015037594, "grad_norm": 0.15608763694763184, "learning_rate": 5.544562657317863e-05, "loss": 0.0142, "step": 9840 }, { "epoch": 4.628759398496241, "grad_norm": 0.1391037106513977, "learning_rate": 5.5363437472414595e-05, "loss": 0.0201, "step": 9850 }, { "epoch": 4.633458646616542, "grad_norm": 0.15986937284469604, "learning_rate": 5.52812337082173e-05, "loss": 0.0193, "step": 9860 }, { "epoch": 4.6381578947368425, "grad_norm": 0.14006660878658295, "learning_rate": 5.519901550532871e-05, "loss": 0.0201, "step": 9870 }, { "epoch": 4.642857142857143, "grad_norm": 0.19203944504261017, "learning_rate": 5.511678308853026e-05, "loss": 0.0156, "step": 9880 }, { "epoch": 4.647556390977444, "grad_norm": 0.24364925920963287, "learning_rate": 5.5034536682642224e-05, "loss": 0.0223, "step": 9890 }, { "epoch": 4.652255639097744, "grad_norm": 0.17941319942474365, "learning_rate": 5.495227651252315e-05, "loss": 0.0197, "step": 9900 }, { "epoch": 4.6569548872180455, "grad_norm": 0.1661718338727951, "learning_rate": 5.487000280306917e-05, "loss": 0.0203, "step": 9910 }, { "epoch": 4.661654135338345, "grad_norm": 0.20843440294265747, "learning_rate": 5.478771577921351e-05, "loss": 0.0145, "step": 9920 }, { "epoch": 4.666353383458647, "grad_norm": 0.15984666347503662, "learning_rate": 5.470541566592573e-05, "loss": 0.0218, "step": 9930 }, { "epoch": 4.671052631578947, "grad_norm": 0.17885951697826385, "learning_rate": 5.462310268821118e-05, "loss": 0.0251, "step": 9940 }, { "epoch": 4.6757518796992485, "grad_norm": 0.20301920175552368, "learning_rate": 5.454077707111042e-05, "loss": 0.0187, "step": 9950 }, { "epoch": 4.680451127819548, "grad_norm": 0.14082729816436768, "learning_rate": 5.445843903969854e-05, "loss": 0.024, "step": 9960 }, { "epoch": 4.68515037593985, "grad_norm": 0.14042581617832184, "learning_rate": 5.4376088819084556e-05, "loss": 0.0174, "step": 9970 }, { "epoch": 4.68984962406015, "grad_norm": 0.1587418168783188, "learning_rate": 5.4293726634410855e-05, "loss": 0.0192, "step": 9980 }, { "epoch": 4.694548872180452, "grad_norm": 0.14780429005622864, "learning_rate": 5.4211352710852495e-05, "loss": 0.0198, "step": 9990 }, { "epoch": 4.6992481203007515, "grad_norm": 0.1238761693239212, "learning_rate": 5.4128967273616625e-05, "loss": 0.0206, "step": 10000 }, { "epoch": 4.703947368421053, "grad_norm": 0.14264823496341705, "learning_rate": 5.404657054794189e-05, "loss": 0.02, "step": 10010 }, { "epoch": 4.708646616541353, "grad_norm": 0.11588063091039658, "learning_rate": 5.396416275909779e-05, "loss": 0.0258, "step": 10020 }, { "epoch": 4.713345864661655, "grad_norm": 0.11729754507541656, "learning_rate": 5.3881744132384104e-05, "loss": 0.0173, "step": 10030 }, { "epoch": 4.7180451127819545, "grad_norm": 0.1283014863729477, "learning_rate": 5.379931489313016e-05, "loss": 0.0205, "step": 10040 }, { "epoch": 4.722744360902256, "grad_norm": 0.11900748312473297, "learning_rate": 5.371687526669439e-05, "loss": 0.0204, "step": 10050 }, { "epoch": 4.727443609022556, "grad_norm": 0.2039898782968521, "learning_rate": 5.363442547846356e-05, "loss": 0.023, "step": 10060 }, { "epoch": 4.732142857142857, "grad_norm": 0.16698098182678223, "learning_rate": 5.355196575385225e-05, "loss": 0.0149, "step": 10070 }, { "epoch": 4.7368421052631575, "grad_norm": 0.20464769005775452, "learning_rate": 5.3469496318302204e-05, "loss": 0.0198, "step": 10080 }, { "epoch": 4.741541353383458, "grad_norm": 0.08524361997842789, "learning_rate": 5.3387017397281704e-05, "loss": 0.0185, "step": 10090 }, { "epoch": 4.746240601503759, "grad_norm": 0.1856192648410797, "learning_rate": 5.330452921628497e-05, "loss": 0.0198, "step": 10100 }, { "epoch": 4.75093984962406, "grad_norm": 0.1088978499174118, "learning_rate": 5.322203200083154e-05, "loss": 0.0156, "step": 10110 }, { "epoch": 4.7556390977443606, "grad_norm": 0.18362818658351898, "learning_rate": 5.313952597646568e-05, "loss": 0.0193, "step": 10120 }, { "epoch": 4.760338345864661, "grad_norm": 0.09336452186107635, "learning_rate": 5.305701136875566e-05, "loss": 0.0127, "step": 10130 }, { "epoch": 4.765037593984962, "grad_norm": 0.10276808589696884, "learning_rate": 5.297448840329329e-05, "loss": 0.0138, "step": 10140 }, { "epoch": 4.769736842105263, "grad_norm": 0.14418187737464905, "learning_rate": 5.2891957305693205e-05, "loss": 0.0188, "step": 10150 }, { "epoch": 4.774436090225564, "grad_norm": 0.1359757035970688, "learning_rate": 5.280941830159227e-05, "loss": 0.0183, "step": 10160 }, { "epoch": 4.779135338345864, "grad_norm": 0.21340955793857574, "learning_rate": 5.2726871616649e-05, "loss": 0.0195, "step": 10170 }, { "epoch": 4.783834586466165, "grad_norm": 0.1824106127023697, "learning_rate": 5.264431747654284e-05, "loss": 0.0221, "step": 10180 }, { "epoch": 4.788533834586466, "grad_norm": 0.1339014172554016, "learning_rate": 5.2561756106973656e-05, "loss": 0.0207, "step": 10190 }, { "epoch": 4.793233082706767, "grad_norm": 0.12776830792427063, "learning_rate": 5.247918773366112e-05, "loss": 0.0241, "step": 10200 }, { "epoch": 4.797932330827067, "grad_norm": 0.1569841355085373, "learning_rate": 5.2396612582343986e-05, "loss": 0.0166, "step": 10210 }, { "epoch": 4.802631578947368, "grad_norm": 0.1283421814441681, "learning_rate": 5.231403087877955e-05, "loss": 0.0177, "step": 10220 }, { "epoch": 4.807330827067669, "grad_norm": 0.16856853663921356, "learning_rate": 5.2231442848743064e-05, "loss": 0.027, "step": 10230 }, { "epoch": 4.81203007518797, "grad_norm": 0.22872963547706604, "learning_rate": 5.214884871802703e-05, "loss": 0.0257, "step": 10240 }, { "epoch": 4.81672932330827, "grad_norm": 0.14206314086914062, "learning_rate": 5.2066248712440656e-05, "loss": 0.0121, "step": 10250 }, { "epoch": 4.821428571428571, "grad_norm": 0.10408526659011841, "learning_rate": 5.198364305780922e-05, "loss": 0.0181, "step": 10260 }, { "epoch": 4.826127819548872, "grad_norm": 0.10545016825199127, "learning_rate": 5.1901031979973394e-05, "loss": 0.0169, "step": 10270 }, { "epoch": 4.830827067669173, "grad_norm": 0.10499098896980286, "learning_rate": 5.1818415704788725e-05, "loss": 0.0173, "step": 10280 }, { "epoch": 4.8355263157894735, "grad_norm": 0.10384233295917511, "learning_rate": 5.1735794458124956e-05, "loss": 0.0172, "step": 10290 }, { "epoch": 4.840225563909774, "grad_norm": 0.13764654099941254, "learning_rate": 5.165316846586541e-05, "loss": 0.0167, "step": 10300 }, { "epoch": 4.844924812030075, "grad_norm": 0.1662788838148117, "learning_rate": 5.157053795390642e-05, "loss": 0.0248, "step": 10310 }, { "epoch": 4.849624060150376, "grad_norm": 0.20149970054626465, "learning_rate": 5.148790314815663e-05, "loss": 0.0158, "step": 10320 }, { "epoch": 4.8543233082706765, "grad_norm": 0.14202655851840973, "learning_rate": 5.1405264274536445e-05, "loss": 0.0112, "step": 10330 }, { "epoch": 4.859022556390977, "grad_norm": 0.1373424530029297, "learning_rate": 5.132262155897739e-05, "loss": 0.0143, "step": 10340 }, { "epoch": 4.863721804511278, "grad_norm": 0.15334497392177582, "learning_rate": 5.123997522742151e-05, "loss": 0.018, "step": 10350 }, { "epoch": 4.868421052631579, "grad_norm": 0.09590376168489456, "learning_rate": 5.1157325505820694e-05, "loss": 0.0187, "step": 10360 }, { "epoch": 4.8731203007518795, "grad_norm": 0.11532887071371078, "learning_rate": 5.107467262013614e-05, "loss": 0.0187, "step": 10370 }, { "epoch": 4.87781954887218, "grad_norm": 0.13246522843837738, "learning_rate": 5.0992016796337686e-05, "loss": 0.0201, "step": 10380 }, { "epoch": 4.882518796992481, "grad_norm": 0.12044485658407211, "learning_rate": 5.0909358260403186e-05, "loss": 0.0259, "step": 10390 }, { "epoch": 4.887218045112782, "grad_norm": 0.14442338049411774, "learning_rate": 5.0826697238317935e-05, "loss": 0.0192, "step": 10400 }, { "epoch": 4.8919172932330826, "grad_norm": 0.20286041498184204, "learning_rate": 5.074403395607399e-05, "loss": 0.0168, "step": 10410 }, { "epoch": 4.896616541353383, "grad_norm": 0.1601938009262085, "learning_rate": 5.066136863966963e-05, "loss": 0.0208, "step": 10420 }, { "epoch": 4.901315789473684, "grad_norm": 0.12259446829557419, "learning_rate": 5.057870151510864e-05, "loss": 0.0208, "step": 10430 }, { "epoch": 4.906015037593985, "grad_norm": 0.1430525928735733, "learning_rate": 5.0496032808399815e-05, "loss": 0.0216, "step": 10440 }, { "epoch": 4.910714285714286, "grad_norm": 0.11188165843486786, "learning_rate": 5.041336274555625e-05, "loss": 0.0205, "step": 10450 }, { "epoch": 4.915413533834586, "grad_norm": 0.1343916654586792, "learning_rate": 5.033069155259471e-05, "loss": 0.0206, "step": 10460 }, { "epoch": 4.920112781954887, "grad_norm": 0.15581011772155762, "learning_rate": 5.02480194555351e-05, "loss": 0.0145, "step": 10470 }, { "epoch": 4.924812030075188, "grad_norm": 0.10869266837835312, "learning_rate": 5.016534668039976e-05, "loss": 0.0183, "step": 10480 }, { "epoch": 4.929511278195489, "grad_norm": 0.1791427731513977, "learning_rate": 5.0082673453212914e-05, "loss": 0.0145, "step": 10490 }, { "epoch": 4.934210526315789, "grad_norm": 0.12199488282203674, "learning_rate": 5e-05, "loss": 0.0142, "step": 10500 }, { "epoch": 4.93890977443609, "grad_norm": 0.1306176632642746, "learning_rate": 4.991732654678709e-05, "loss": 0.0187, "step": 10510 }, { "epoch": 4.943609022556391, "grad_norm": 0.11600327491760254, "learning_rate": 4.9834653319600246e-05, "loss": 0.0206, "step": 10520 }, { "epoch": 4.948308270676692, "grad_norm": 0.14572572708129883, "learning_rate": 4.975198054446492e-05, "loss": 0.017, "step": 10530 }, { "epoch": 4.953007518796992, "grad_norm": 0.1601126790046692, "learning_rate": 4.96693084474053e-05, "loss": 0.0219, "step": 10540 }, { "epoch": 4.957706766917293, "grad_norm": 0.1626318246126175, "learning_rate": 4.9586637254443756e-05, "loss": 0.0209, "step": 10550 }, { "epoch": 4.962406015037594, "grad_norm": 0.17098815739154816, "learning_rate": 4.950396719160018e-05, "loss": 0.0158, "step": 10560 }, { "epoch": 4.967105263157895, "grad_norm": 0.13434873521327972, "learning_rate": 4.942129848489137e-05, "loss": 0.0192, "step": 10570 }, { "epoch": 4.9718045112781954, "grad_norm": 0.15642866492271423, "learning_rate": 4.93386313603304e-05, "loss": 0.0187, "step": 10580 }, { "epoch": 4.976503759398496, "grad_norm": 0.15607450902462006, "learning_rate": 4.925596604392603e-05, "loss": 0.016, "step": 10590 }, { "epoch": 4.981203007518797, "grad_norm": 0.12438451498746872, "learning_rate": 4.917330276168208e-05, "loss": 0.0187, "step": 10600 }, { "epoch": 4.985902255639098, "grad_norm": 0.14996737241744995, "learning_rate": 4.909064173959681e-05, "loss": 0.0171, "step": 10610 }, { "epoch": 4.9906015037593985, "grad_norm": 0.11663806438446045, "learning_rate": 4.9007983203662326e-05, "loss": 0.0182, "step": 10620 }, { "epoch": 4.995300751879699, "grad_norm": 0.11488846689462662, "learning_rate": 4.892532737986387e-05, "loss": 0.0174, "step": 10630 }, { "epoch": 5.0, "grad_norm": 0.15718309581279755, "learning_rate": 4.884267449417931e-05, "loss": 0.0184, "step": 10640 }, { "epoch": 5.004699248120301, "grad_norm": 0.11409519612789154, "learning_rate": 4.87600247725785e-05, "loss": 0.018, "step": 10650 }, { "epoch": 5.0093984962406015, "grad_norm": 0.14960633218288422, "learning_rate": 4.867737844102261e-05, "loss": 0.0131, "step": 10660 }, { "epoch": 5.014097744360902, "grad_norm": 0.1907559037208557, "learning_rate": 4.8594735725463567e-05, "loss": 0.024, "step": 10670 }, { "epoch": 5.018796992481203, "grad_norm": 0.12973536550998688, "learning_rate": 4.851209685184338e-05, "loss": 0.0149, "step": 10680 }, { "epoch": 5.023496240601504, "grad_norm": 0.12642143666744232, "learning_rate": 4.8429462046093585e-05, "loss": 0.0218, "step": 10690 }, { "epoch": 5.0281954887218046, "grad_norm": 0.19579017162322998, "learning_rate": 4.834683153413459e-05, "loss": 0.0162, "step": 10700 }, { "epoch": 5.032894736842105, "grad_norm": 0.06657780706882477, "learning_rate": 4.826420554187506e-05, "loss": 0.0219, "step": 10710 }, { "epoch": 5.037593984962406, "grad_norm": 0.15197539329528809, "learning_rate": 4.818158429521129e-05, "loss": 0.0179, "step": 10720 }, { "epoch": 5.042293233082707, "grad_norm": 0.13582901656627655, "learning_rate": 4.809896802002662e-05, "loss": 0.0222, "step": 10730 }, { "epoch": 5.046992481203008, "grad_norm": 0.0987696573138237, "learning_rate": 4.801635694219079e-05, "loss": 0.02, "step": 10740 }, { "epoch": 5.051691729323308, "grad_norm": 0.1544206142425537, "learning_rate": 4.7933751287559335e-05, "loss": 0.0165, "step": 10750 }, { "epoch": 5.056390977443609, "grad_norm": 0.19164049625396729, "learning_rate": 4.785115128197298e-05, "loss": 0.0162, "step": 10760 }, { "epoch": 5.06109022556391, "grad_norm": 0.1684752255678177, "learning_rate": 4.776855715125694e-05, "loss": 0.0147, "step": 10770 }, { "epoch": 5.065789473684211, "grad_norm": 0.18184302747249603, "learning_rate": 4.7685969121220456e-05, "loss": 0.0183, "step": 10780 }, { "epoch": 5.070488721804511, "grad_norm": 0.15143953263759613, "learning_rate": 4.7603387417656026e-05, "loss": 0.0148, "step": 10790 }, { "epoch": 5.075187969924812, "grad_norm": 0.1459171175956726, "learning_rate": 4.7520812266338885e-05, "loss": 0.0175, "step": 10800 }, { "epoch": 5.079887218045113, "grad_norm": 0.11474397778511047, "learning_rate": 4.743824389302635e-05, "loss": 0.0191, "step": 10810 }, { "epoch": 5.084586466165414, "grad_norm": 0.10887852311134338, "learning_rate": 4.735568252345718e-05, "loss": 0.019, "step": 10820 }, { "epoch": 5.089285714285714, "grad_norm": 0.2430884689092636, "learning_rate": 4.7273128383351015e-05, "loss": 0.0184, "step": 10830 }, { "epoch": 5.093984962406015, "grad_norm": 0.14770224690437317, "learning_rate": 4.7190581698407725e-05, "loss": 0.0119, "step": 10840 }, { "epoch": 5.098684210526316, "grad_norm": 0.11698685586452484, "learning_rate": 4.710804269430681e-05, "loss": 0.0162, "step": 10850 }, { "epoch": 5.103383458646617, "grad_norm": 0.10466932505369186, "learning_rate": 4.702551159670672e-05, "loss": 0.0133, "step": 10860 }, { "epoch": 5.1080827067669174, "grad_norm": 0.21390311419963837, "learning_rate": 4.694298863124435e-05, "loss": 0.0184, "step": 10870 }, { "epoch": 5.112781954887218, "grad_norm": 0.12267635017633438, "learning_rate": 4.6860474023534335e-05, "loss": 0.0201, "step": 10880 }, { "epoch": 5.117481203007519, "grad_norm": 0.10549784451723099, "learning_rate": 4.677796799916845e-05, "loss": 0.0185, "step": 10890 }, { "epoch": 5.12218045112782, "grad_norm": 0.09418050944805145, "learning_rate": 4.669547078371504e-05, "loss": 0.0232, "step": 10900 }, { "epoch": 5.1268796992481205, "grad_norm": 0.10465802997350693, "learning_rate": 4.66129826027183e-05, "loss": 0.016, "step": 10910 }, { "epoch": 5.131578947368421, "grad_norm": 0.14196555316448212, "learning_rate": 4.65305036816978e-05, "loss": 0.0202, "step": 10920 }, { "epoch": 5.136278195488722, "grad_norm": 0.15972945094108582, "learning_rate": 4.6448034246147754e-05, "loss": 0.0164, "step": 10930 }, { "epoch": 5.140977443609023, "grad_norm": 0.1118629202246666, "learning_rate": 4.6365574521536445e-05, "loss": 0.0134, "step": 10940 }, { "epoch": 5.1456766917293235, "grad_norm": 0.08104313164949417, "learning_rate": 4.6283124733305624e-05, "loss": 0.0102, "step": 10950 }, { "epoch": 5.150375939849624, "grad_norm": 0.0983964204788208, "learning_rate": 4.620068510686985e-05, "loss": 0.021, "step": 10960 }, { "epoch": 5.155075187969925, "grad_norm": 0.13934247195720673, "learning_rate": 4.611825586761591e-05, "loss": 0.0123, "step": 10970 }, { "epoch": 5.159774436090226, "grad_norm": 0.10796473920345306, "learning_rate": 4.60358372409022e-05, "loss": 0.0137, "step": 10980 }, { "epoch": 5.1644736842105265, "grad_norm": 0.1407727152109146, "learning_rate": 4.5953429452058135e-05, "loss": 0.0138, "step": 10990 }, { "epoch": 5.169172932330827, "grad_norm": 0.1869104653596878, "learning_rate": 4.5871032726383386e-05, "loss": 0.0178, "step": 11000 }, { "epoch": 5.173872180451128, "grad_norm": 0.22772102057933807, "learning_rate": 4.5788647289147516e-05, "loss": 0.017, "step": 11010 }, { "epoch": 5.178571428571429, "grad_norm": 0.1635831892490387, "learning_rate": 4.570627336558915e-05, "loss": 0.0177, "step": 11020 }, { "epoch": 5.18327067669173, "grad_norm": 0.10193423926830292, "learning_rate": 4.562391118091544e-05, "loss": 0.0135, "step": 11030 }, { "epoch": 5.18796992481203, "grad_norm": 0.17310504615306854, "learning_rate": 4.554156096030149e-05, "loss": 0.0152, "step": 11040 }, { "epoch": 5.192669172932331, "grad_norm": 0.1305575966835022, "learning_rate": 4.545922292888959e-05, "loss": 0.0187, "step": 11050 }, { "epoch": 5.197368421052632, "grad_norm": 0.17065905034542084, "learning_rate": 4.537689731178883e-05, "loss": 0.025, "step": 11060 }, { "epoch": 5.202067669172933, "grad_norm": 0.13812799751758575, "learning_rate": 4.529458433407429e-05, "loss": 0.02, "step": 11070 }, { "epoch": 5.206766917293233, "grad_norm": 0.1573777198791504, "learning_rate": 4.5212284220786494e-05, "loss": 0.0163, "step": 11080 }, { "epoch": 5.211466165413534, "grad_norm": 0.13692985475063324, "learning_rate": 4.5129997196930845e-05, "loss": 0.0125, "step": 11090 }, { "epoch": 5.216165413533835, "grad_norm": 0.11966300755739212, "learning_rate": 4.504772348747687e-05, "loss": 0.0133, "step": 11100 }, { "epoch": 5.220864661654136, "grad_norm": 0.22742719948291779, "learning_rate": 4.496546331735778e-05, "loss": 0.0193, "step": 11110 }, { "epoch": 5.225563909774436, "grad_norm": 0.17194905877113342, "learning_rate": 4.488321691146975e-05, "loss": 0.0163, "step": 11120 }, { "epoch": 5.230263157894737, "grad_norm": 0.10848580300807953, "learning_rate": 4.480098449467132e-05, "loss": 0.011, "step": 11130 }, { "epoch": 5.234962406015038, "grad_norm": 0.12128433585166931, "learning_rate": 4.471876629178273e-05, "loss": 0.0153, "step": 11140 }, { "epoch": 5.239661654135339, "grad_norm": 0.19572050869464874, "learning_rate": 4.463656252758542e-05, "loss": 0.0174, "step": 11150 }, { "epoch": 5.2443609022556394, "grad_norm": 0.18351411819458008, "learning_rate": 4.4554373426821374e-05, "loss": 0.0197, "step": 11160 }, { "epoch": 5.24906015037594, "grad_norm": 0.14616544544696808, "learning_rate": 4.447219921419244e-05, "loss": 0.0186, "step": 11170 }, { "epoch": 5.253759398496241, "grad_norm": 0.1219051256775856, "learning_rate": 4.439004011435979e-05, "loss": 0.0167, "step": 11180 }, { "epoch": 5.258458646616542, "grad_norm": 0.1375686526298523, "learning_rate": 4.430789635194324e-05, "loss": 0.0223, "step": 11190 }, { "epoch": 5.2631578947368425, "grad_norm": 0.1277266889810562, "learning_rate": 4.4225768151520694e-05, "loss": 0.0132, "step": 11200 }, { "epoch": 5.267857142857143, "grad_norm": 0.16868142783641815, "learning_rate": 4.414365573762755e-05, "loss": 0.0292, "step": 11210 }, { "epoch": 5.272556390977444, "grad_norm": 0.091234490275383, "learning_rate": 4.406155933475599e-05, "loss": 0.0165, "step": 11220 }, { "epoch": 5.277255639097745, "grad_norm": 0.11484044045209885, "learning_rate": 4.3979479167354477e-05, "loss": 0.0167, "step": 11230 }, { "epoch": 5.2819548872180455, "grad_norm": 0.14860452711582184, "learning_rate": 4.3897415459827e-05, "loss": 0.019, "step": 11240 }, { "epoch": 5.286654135338346, "grad_norm": 0.13529516756534576, "learning_rate": 4.381536843653262e-05, "loss": 0.0158, "step": 11250 }, { "epoch": 5.291353383458647, "grad_norm": 0.09828982502222061, "learning_rate": 4.373333832178478e-05, "loss": 0.0129, "step": 11260 }, { "epoch": 5.296052631578947, "grad_norm": 0.09586787968873978, "learning_rate": 4.365132533985071e-05, "loss": 0.0189, "step": 11270 }, { "epoch": 5.3007518796992485, "grad_norm": 0.10568477213382721, "learning_rate": 4.3569329714950704e-05, "loss": 0.0194, "step": 11280 }, { "epoch": 5.305451127819548, "grad_norm": 0.14483776688575745, "learning_rate": 4.348735167125771e-05, "loss": 0.016, "step": 11290 }, { "epoch": 5.31015037593985, "grad_norm": 0.07663799077272415, "learning_rate": 4.3405391432896555e-05, "loss": 0.0125, "step": 11300 }, { "epoch": 5.31484962406015, "grad_norm": 0.18015138804912567, "learning_rate": 4.3323449223943416e-05, "loss": 0.0151, "step": 11310 }, { "epoch": 5.319548872180452, "grad_norm": 0.12221848219633102, "learning_rate": 4.324152526842517e-05, "loss": 0.0195, "step": 11320 }, { "epoch": 5.3242481203007515, "grad_norm": 0.10736917704343796, "learning_rate": 4.315961979031875e-05, "loss": 0.018, "step": 11330 }, { "epoch": 5.328947368421053, "grad_norm": 0.15660639107227325, "learning_rate": 4.307773301355062e-05, "loss": 0.0162, "step": 11340 }, { "epoch": 5.333646616541353, "grad_norm": 0.16791242361068726, "learning_rate": 4.2995865161996105e-05, "loss": 0.0146, "step": 11350 }, { "epoch": 5.338345864661654, "grad_norm": 0.08962516486644745, "learning_rate": 4.291401645947879e-05, "loss": 0.0169, "step": 11360 }, { "epoch": 5.3430451127819545, "grad_norm": 0.14809159934520721, "learning_rate": 4.283218712976992e-05, "loss": 0.0148, "step": 11370 }, { "epoch": 5.347744360902255, "grad_norm": 0.10135837644338608, "learning_rate": 4.275037739658771e-05, "loss": 0.0226, "step": 11380 }, { "epoch": 5.352443609022556, "grad_norm": 0.13660521805286407, "learning_rate": 4.2668587483596864e-05, "loss": 0.018, "step": 11390 }, { "epoch": 5.357142857142857, "grad_norm": 0.08671059459447861, "learning_rate": 4.2586817614407895e-05, "loss": 0.0124, "step": 11400 }, { "epoch": 5.3618421052631575, "grad_norm": 0.09788667410612106, "learning_rate": 4.250506801257653e-05, "loss": 0.0151, "step": 11410 }, { "epoch": 5.366541353383458, "grad_norm": 0.15231043100357056, "learning_rate": 4.2423338901602985e-05, "loss": 0.0176, "step": 11420 }, { "epoch": 5.371240601503759, "grad_norm": 0.13623958826065063, "learning_rate": 4.234163050493158e-05, "loss": 0.0238, "step": 11430 }, { "epoch": 5.37593984962406, "grad_norm": 0.16451093554496765, "learning_rate": 4.2259943045949934e-05, "loss": 0.0159, "step": 11440 }, { "epoch": 5.3806390977443606, "grad_norm": 0.1713268756866455, "learning_rate": 4.2178276747988446e-05, "loss": 0.0228, "step": 11450 }, { "epoch": 5.385338345864661, "grad_norm": 0.1485781967639923, "learning_rate": 4.209663183431969e-05, "loss": 0.0239, "step": 11460 }, { "epoch": 5.390037593984962, "grad_norm": 0.09829729050397873, "learning_rate": 4.201500852815768e-05, "loss": 0.0132, "step": 11470 }, { "epoch": 5.394736842105263, "grad_norm": 0.1141219288110733, "learning_rate": 4.1933407052657456e-05, "loss": 0.0158, "step": 11480 }, { "epoch": 5.399436090225564, "grad_norm": 0.11218131333589554, "learning_rate": 4.1851827630914305e-05, "loss": 0.0118, "step": 11490 }, { "epoch": 5.404135338345864, "grad_norm": 0.11761670559644699, "learning_rate": 4.17702704859633e-05, "loss": 0.0196, "step": 11500 }, { "epoch": 5.408834586466165, "grad_norm": 0.11419840902090073, "learning_rate": 4.1688735840778546e-05, "loss": 0.0143, "step": 11510 }, { "epoch": 5.413533834586466, "grad_norm": 0.10110355168581009, "learning_rate": 4.160722391827262e-05, "loss": 0.0177, "step": 11520 }, { "epoch": 5.418233082706767, "grad_norm": 0.08888833969831467, "learning_rate": 4.1525734941296026e-05, "loss": 0.0214, "step": 11530 }, { "epoch": 5.422932330827067, "grad_norm": 0.08826860040426254, "learning_rate": 4.14442691326365e-05, "loss": 0.0119, "step": 11540 }, { "epoch": 5.427631578947368, "grad_norm": 0.1564245969057083, "learning_rate": 4.13628267150185e-05, "loss": 0.0181, "step": 11550 }, { "epoch": 5.432330827067669, "grad_norm": 0.18380986154079437, "learning_rate": 4.1281407911102425e-05, "loss": 0.021, "step": 11560 }, { "epoch": 5.43703007518797, "grad_norm": 0.17931777238845825, "learning_rate": 4.120001294348421e-05, "loss": 0.0142, "step": 11570 }, { "epoch": 5.44172932330827, "grad_norm": 0.06229656562209129, "learning_rate": 4.111864203469457e-05, "loss": 0.0225, "step": 11580 }, { "epoch": 5.446428571428571, "grad_norm": 0.12182196229696274, "learning_rate": 4.103729540719847e-05, "loss": 0.0153, "step": 11590 }, { "epoch": 5.451127819548872, "grad_norm": 0.16493044793605804, "learning_rate": 4.095597328339452e-05, "loss": 0.0141, "step": 11600 }, { "epoch": 5.455827067669173, "grad_norm": 0.07192741334438324, "learning_rate": 4.087467588561424e-05, "loss": 0.0114, "step": 11610 }, { "epoch": 5.4605263157894735, "grad_norm": 0.14149798452854156, "learning_rate": 4.079340343612165e-05, "loss": 0.0167, "step": 11620 }, { "epoch": 5.465225563909774, "grad_norm": 0.14575918018817902, "learning_rate": 4.07121561571125e-05, "loss": 0.0162, "step": 11630 }, { "epoch": 5.469924812030075, "grad_norm": 0.15547528862953186, "learning_rate": 4.063093427071376e-05, "loss": 0.0154, "step": 11640 }, { "epoch": 5.474624060150376, "grad_norm": 0.12537823617458344, "learning_rate": 4.0549737998983e-05, "loss": 0.0202, "step": 11650 }, { "epoch": 5.4793233082706765, "grad_norm": 0.1138703003525734, "learning_rate": 4.046856756390767e-05, "loss": 0.02, "step": 11660 }, { "epoch": 5.484022556390977, "grad_norm": 0.13138170540332794, "learning_rate": 4.038742318740465e-05, "loss": 0.0204, "step": 11670 }, { "epoch": 5.488721804511278, "grad_norm": 0.10835835337638855, "learning_rate": 4.0306305091319595e-05, "loss": 0.0174, "step": 11680 }, { "epoch": 5.493421052631579, "grad_norm": 0.09958133846521378, "learning_rate": 4.0225213497426276e-05, "loss": 0.019, "step": 11690 }, { "epoch": 5.4981203007518795, "grad_norm": 0.14578290283679962, "learning_rate": 4.0144148627425993e-05, "loss": 0.0233, "step": 11700 }, { "epoch": 5.50281954887218, "grad_norm": 0.18853400647640228, "learning_rate": 4.006311070294702e-05, "loss": 0.0255, "step": 11710 }, { "epoch": 5.507518796992481, "grad_norm": 0.1436021775007248, "learning_rate": 3.9982099945543945e-05, "loss": 0.0158, "step": 11720 }, { "epoch": 5.512218045112782, "grad_norm": 0.10432905703783035, "learning_rate": 3.9901116576697083e-05, "loss": 0.0173, "step": 11730 }, { "epoch": 5.5169172932330826, "grad_norm": 0.13678883016109467, "learning_rate": 3.982016081781189e-05, "loss": 0.0174, "step": 11740 }, { "epoch": 5.521616541353383, "grad_norm": 0.15448826551437378, "learning_rate": 3.973923289021829e-05, "loss": 0.016, "step": 11750 }, { "epoch": 5.526315789473684, "grad_norm": 0.16062240302562714, "learning_rate": 3.965833301517017e-05, "loss": 0.0322, "step": 11760 }, { "epoch": 5.531015037593985, "grad_norm": 0.11189954727888107, "learning_rate": 3.9577461413844684e-05, "loss": 0.02, "step": 11770 }, { "epoch": 5.535714285714286, "grad_norm": 0.171811044216156, "learning_rate": 3.949661830734172e-05, "loss": 0.0175, "step": 11780 }, { "epoch": 5.540413533834586, "grad_norm": 0.12942734360694885, "learning_rate": 3.9415803916683224e-05, "loss": 0.0131, "step": 11790 }, { "epoch": 5.545112781954887, "grad_norm": 0.11269880831241608, "learning_rate": 3.933501846281267e-05, "loss": 0.0221, "step": 11800 }, { "epoch": 5.549812030075188, "grad_norm": 0.12639208137989044, "learning_rate": 3.925426216659438e-05, "loss": 0.0178, "step": 11810 }, { "epoch": 5.554511278195489, "grad_norm": 0.21581538021564484, "learning_rate": 3.917353524881302e-05, "loss": 0.0152, "step": 11820 }, { "epoch": 5.559210526315789, "grad_norm": 0.14881852269172668, "learning_rate": 3.9092837930172884e-05, "loss": 0.0156, "step": 11830 }, { "epoch": 5.56390977443609, "grad_norm": 0.18276169896125793, "learning_rate": 3.901217043129735e-05, "loss": 0.023, "step": 11840 }, { "epoch": 5.568609022556391, "grad_norm": 0.10481563955545425, "learning_rate": 3.8931532972728285e-05, "loss": 0.0139, "step": 11850 }, { "epoch": 5.573308270676692, "grad_norm": 0.1183846965432167, "learning_rate": 3.8850925774925425e-05, "loss": 0.0151, "step": 11860 }, { "epoch": 5.578007518796992, "grad_norm": 0.12934796512126923, "learning_rate": 3.877034905826577e-05, "loss": 0.0159, "step": 11870 }, { "epoch": 5.582706766917293, "grad_norm": 0.12058614939451218, "learning_rate": 3.8689803043043e-05, "loss": 0.0164, "step": 11880 }, { "epoch": 5.587406015037594, "grad_norm": 0.10819224268198013, "learning_rate": 3.860928794946682e-05, "loss": 0.0106, "step": 11890 }, { "epoch": 5.592105263157895, "grad_norm": 0.1796448975801468, "learning_rate": 3.852880399766243e-05, "loss": 0.0177, "step": 11900 }, { "epoch": 5.5968045112781954, "grad_norm": 0.14283166825771332, "learning_rate": 3.844835140766988e-05, "loss": 0.0136, "step": 11910 }, { "epoch": 5.601503759398496, "grad_norm": 0.14546529948711395, "learning_rate": 3.836793039944349e-05, "loss": 0.0117, "step": 11920 }, { "epoch": 5.606203007518797, "grad_norm": 0.1672084927558899, "learning_rate": 3.828754119285123e-05, "loss": 0.0126, "step": 11930 }, { "epoch": 5.610902255639098, "grad_norm": 0.08551529794931412, "learning_rate": 3.820718400767409e-05, "loss": 0.0162, "step": 11940 }, { "epoch": 5.6156015037593985, "grad_norm": 0.1129482090473175, "learning_rate": 3.812685906360557e-05, "loss": 0.0151, "step": 11950 }, { "epoch": 5.620300751879699, "grad_norm": 0.17113980650901794, "learning_rate": 3.8046566580251e-05, "loss": 0.0136, "step": 11960 }, { "epoch": 5.625, "grad_norm": 0.08610133826732635, "learning_rate": 3.796630677712697e-05, "loss": 0.0162, "step": 11970 }, { "epoch": 5.629699248120301, "grad_norm": 0.16481080651283264, "learning_rate": 3.788607987366069e-05, "loss": 0.0158, "step": 11980 }, { "epoch": 5.6343984962406015, "grad_norm": 0.1907389611005783, "learning_rate": 3.780588608918947e-05, "loss": 0.018, "step": 11990 }, { "epoch": 5.639097744360902, "grad_norm": 0.10017523914575577, "learning_rate": 3.772572564296005e-05, "loss": 0.0147, "step": 12000 }, { "epoch": 5.643796992481203, "grad_norm": 0.2201337069272995, "learning_rate": 3.764559875412803e-05, "loss": 0.0159, "step": 12010 }, { "epoch": 5.648496240601504, "grad_norm": 0.0887899100780487, "learning_rate": 3.756550564175727e-05, "loss": 0.0139, "step": 12020 }, { "epoch": 5.6531954887218046, "grad_norm": 0.1127086952328682, "learning_rate": 3.748544652481927e-05, "loss": 0.0146, "step": 12030 }, { "epoch": 5.657894736842105, "grad_norm": 0.2131458818912506, "learning_rate": 3.74054216221926e-05, "loss": 0.0215, "step": 12040 }, { "epoch": 5.662593984962406, "grad_norm": 0.10316726565361023, "learning_rate": 3.73254311526623e-05, "loss": 0.0129, "step": 12050 }, { "epoch": 5.667293233082707, "grad_norm": 0.1518326699733734, "learning_rate": 3.7245475334919246e-05, "loss": 0.0114, "step": 12060 }, { "epoch": 5.671992481203008, "grad_norm": 0.1073705404996872, "learning_rate": 3.716555438755961e-05, "loss": 0.0218, "step": 12070 }, { "epoch": 5.676691729323308, "grad_norm": 0.15070666372776031, "learning_rate": 3.7085668529084184e-05, "loss": 0.0182, "step": 12080 }, { "epoch": 5.681390977443609, "grad_norm": 0.09960098564624786, "learning_rate": 3.700581797789786e-05, "loss": 0.0128, "step": 12090 }, { "epoch": 5.68609022556391, "grad_norm": 0.13308414816856384, "learning_rate": 3.6926002952309016e-05, "loss": 0.0111, "step": 12100 }, { "epoch": 5.690789473684211, "grad_norm": 0.1321004182100296, "learning_rate": 3.684622367052887e-05, "loss": 0.0157, "step": 12110 }, { "epoch": 5.695488721804511, "grad_norm": 0.12728892266750336, "learning_rate": 3.676648035067093e-05, "loss": 0.0161, "step": 12120 }, { "epoch": 5.700187969924812, "grad_norm": 0.08147571980953217, "learning_rate": 3.6686773210750385e-05, "loss": 0.0134, "step": 12130 }, { "epoch": 5.704887218045113, "grad_norm": 0.1688104271888733, "learning_rate": 3.6607102468683526e-05, "loss": 0.0147, "step": 12140 }, { "epoch": 5.709586466165414, "grad_norm": 0.11916866153478622, "learning_rate": 3.65274683422871e-05, "loss": 0.0238, "step": 12150 }, { "epoch": 5.714285714285714, "grad_norm": 0.12698572874069214, "learning_rate": 3.6447871049277796e-05, "loss": 0.0151, "step": 12160 }, { "epoch": 5.718984962406015, "grad_norm": 0.09343686699867249, "learning_rate": 3.636831080727154e-05, "loss": 0.0104, "step": 12170 }, { "epoch": 5.723684210526316, "grad_norm": 0.13863399624824524, "learning_rate": 3.628878783378302e-05, "loss": 0.0251, "step": 12180 }, { "epoch": 5.728383458646617, "grad_norm": 0.10305328667163849, "learning_rate": 3.6209302346225006e-05, "loss": 0.0127, "step": 12190 }, { "epoch": 5.7330827067669174, "grad_norm": 0.13228711485862732, "learning_rate": 3.612985456190778e-05, "loss": 0.0106, "step": 12200 }, { "epoch": 5.737781954887218, "grad_norm": 0.18315477669239044, "learning_rate": 3.605044469803854e-05, "loss": 0.015, "step": 12210 }, { "epoch": 5.742481203007519, "grad_norm": 0.09973251819610596, "learning_rate": 3.597107297172084e-05, "loss": 0.0154, "step": 12220 }, { "epoch": 5.74718045112782, "grad_norm": 0.12596507370471954, "learning_rate": 3.5891739599953945e-05, "loss": 0.0165, "step": 12230 }, { "epoch": 5.7518796992481205, "grad_norm": 0.12065248191356659, "learning_rate": 3.581244479963225e-05, "loss": 0.0165, "step": 12240 }, { "epoch": 5.756578947368421, "grad_norm": 0.182419553399086, "learning_rate": 3.5733188787544745e-05, "loss": 0.0141, "step": 12250 }, { "epoch": 5.761278195488722, "grad_norm": 0.1837804764509201, "learning_rate": 3.5653971780374295e-05, "loss": 0.0214, "step": 12260 }, { "epoch": 5.765977443609023, "grad_norm": 0.10047575831413269, "learning_rate": 3.557479399469721e-05, "loss": 0.012, "step": 12270 }, { "epoch": 5.7706766917293235, "grad_norm": 0.10815638303756714, "learning_rate": 3.5495655646982505e-05, "loss": 0.0165, "step": 12280 }, { "epoch": 5.775375939849624, "grad_norm": 0.1140371561050415, "learning_rate": 3.541655695359142e-05, "loss": 0.01, "step": 12290 }, { "epoch": 5.780075187969925, "grad_norm": 0.10626989603042603, "learning_rate": 3.533749813077677e-05, "loss": 0.0158, "step": 12300 }, { "epoch": 5.784774436090226, "grad_norm": 0.15287335216999054, "learning_rate": 3.525847939468233e-05, "loss": 0.0152, "step": 12310 }, { "epoch": 5.7894736842105265, "grad_norm": 0.08385778963565826, "learning_rate": 3.517950096134232e-05, "loss": 0.0207, "step": 12320 }, { "epoch": 5.794172932330827, "grad_norm": 0.08987794816493988, "learning_rate": 3.5100563046680764e-05, "loss": 0.0141, "step": 12330 }, { "epoch": 5.798872180451128, "grad_norm": 0.11704299598932266, "learning_rate": 3.5021665866510925e-05, "loss": 0.021, "step": 12340 }, { "epoch": 5.803571428571429, "grad_norm": 0.15203259885311127, "learning_rate": 3.494280963653463e-05, "loss": 0.0169, "step": 12350 }, { "epoch": 5.80827067669173, "grad_norm": 0.07202760130167007, "learning_rate": 3.4863994572341843e-05, "loss": 0.0171, "step": 12360 }, { "epoch": 5.81296992481203, "grad_norm": 0.16119526326656342, "learning_rate": 3.478522088940993e-05, "loss": 0.0192, "step": 12370 }, { "epoch": 5.817669172932331, "grad_norm": 0.1303485631942749, "learning_rate": 3.470648880310313e-05, "loss": 0.0175, "step": 12380 }, { "epoch": 5.822368421052632, "grad_norm": 0.10909580439329147, "learning_rate": 3.462779852867197e-05, "loss": 0.0255, "step": 12390 }, { "epoch": 5.827067669172933, "grad_norm": 0.13761353492736816, "learning_rate": 3.4549150281252636e-05, "loss": 0.0156, "step": 12400 }, { "epoch": 5.831766917293233, "grad_norm": 0.11712969839572906, "learning_rate": 3.447054427586644e-05, "loss": 0.0127, "step": 12410 }, { "epoch": 5.836466165413534, "grad_norm": 0.1407334804534912, "learning_rate": 3.439198072741921e-05, "loss": 0.0161, "step": 12420 }, { "epoch": 5.841165413533835, "grad_norm": 0.1053805947303772, "learning_rate": 3.431345985070067e-05, "loss": 0.0176, "step": 12430 }, { "epoch": 5.845864661654136, "grad_norm": 0.17800423502922058, "learning_rate": 3.423498186038393e-05, "loss": 0.0182, "step": 12440 }, { "epoch": 5.850563909774436, "grad_norm": 0.09566630423069, "learning_rate": 3.4156546971024784e-05, "loss": 0.016, "step": 12450 }, { "epoch": 5.855263157894737, "grad_norm": 0.12941595911979675, "learning_rate": 3.407815539706124e-05, "loss": 0.0159, "step": 12460 }, { "epoch": 5.859962406015038, "grad_norm": 0.12009347975254059, "learning_rate": 3.399980735281286e-05, "loss": 0.0125, "step": 12470 }, { "epoch": 5.864661654135339, "grad_norm": 0.06218589097261429, "learning_rate": 3.392150305248024e-05, "loss": 0.0189, "step": 12480 }, { "epoch": 5.8693609022556394, "grad_norm": 0.1344054788351059, "learning_rate": 3.384324271014429e-05, "loss": 0.0176, "step": 12490 }, { "epoch": 5.87406015037594, "grad_norm": 0.11987704038619995, "learning_rate": 3.3765026539765834e-05, "loss": 0.0185, "step": 12500 }, { "epoch": 5.878759398496241, "grad_norm": 0.13597393035888672, "learning_rate": 3.368685475518488e-05, "loss": 0.018, "step": 12510 }, { "epoch": 5.883458646616542, "grad_norm": 0.10128919035196304, "learning_rate": 3.360872757012011e-05, "loss": 0.0177, "step": 12520 }, { "epoch": 5.8881578947368425, "grad_norm": 0.1099942997097969, "learning_rate": 3.3530645198168295e-05, "loss": 0.0184, "step": 12530 }, { "epoch": 5.892857142857143, "grad_norm": 0.149112269282341, "learning_rate": 3.3452607852803584e-05, "loss": 0.0123, "step": 12540 }, { "epoch": 5.897556390977444, "grad_norm": 0.20705215632915497, "learning_rate": 3.337461574737716e-05, "loss": 0.019, "step": 12550 }, { "epoch": 5.902255639097744, "grad_norm": 0.14608100056648254, "learning_rate": 3.329666909511645e-05, "loss": 0.0168, "step": 12560 }, { "epoch": 5.9069548872180455, "grad_norm": 0.10303889214992523, "learning_rate": 3.321876810912461e-05, "loss": 0.0217, "step": 12570 }, { "epoch": 5.911654135338345, "grad_norm": 0.13910971581935883, "learning_rate": 3.3140913002379995e-05, "loss": 0.0163, "step": 12580 }, { "epoch": 5.916353383458647, "grad_norm": 0.080747589468956, "learning_rate": 3.3063103987735433e-05, "loss": 0.0106, "step": 12590 }, { "epoch": 5.921052631578947, "grad_norm": 0.08607535809278488, "learning_rate": 3.298534127791785e-05, "loss": 0.0188, "step": 12600 }, { "epoch": 5.9257518796992485, "grad_norm": 0.11979581415653229, "learning_rate": 3.2907625085527503e-05, "loss": 0.0157, "step": 12610 }, { "epoch": 5.930451127819548, "grad_norm": 0.12914730608463287, "learning_rate": 3.282995562303754e-05, "loss": 0.0155, "step": 12620 }, { "epoch": 5.93515037593985, "grad_norm": 0.09366561472415924, "learning_rate": 3.275233310279321e-05, "loss": 0.0102, "step": 12630 }, { "epoch": 5.93984962406015, "grad_norm": 0.10397497564554214, "learning_rate": 3.267475773701161e-05, "loss": 0.0161, "step": 12640 }, { "epoch": 5.944548872180452, "grad_norm": 0.09889456629753113, "learning_rate": 3.2597229737780774e-05, "loss": 0.0143, "step": 12650 }, { "epoch": 5.9492481203007515, "grad_norm": 0.0786278024315834, "learning_rate": 3.251974931705933e-05, "loss": 0.013, "step": 12660 }, { "epoch": 5.953947368421053, "grad_norm": 0.1270080953836441, "learning_rate": 3.244231668667578e-05, "loss": 0.0204, "step": 12670 }, { "epoch": 5.958646616541353, "grad_norm": 0.10432140529155731, "learning_rate": 3.236493205832795e-05, "loss": 0.0138, "step": 12680 }, { "epoch": 5.963345864661655, "grad_norm": 0.12858955562114716, "learning_rate": 3.228759564358248e-05, "loss": 0.0136, "step": 12690 }, { "epoch": 5.9680451127819545, "grad_norm": 0.093807153403759, "learning_rate": 3.221030765387417e-05, "loss": 0.0101, "step": 12700 }, { "epoch": 5.972744360902256, "grad_norm": 0.09169401228427887, "learning_rate": 3.2133068300505455e-05, "loss": 0.0107, "step": 12710 }, { "epoch": 5.977443609022556, "grad_norm": 0.1795411854982376, "learning_rate": 3.205587779464576e-05, "loss": 0.0219, "step": 12720 }, { "epoch": 5.982142857142857, "grad_norm": 0.12922433018684387, "learning_rate": 3.197873634733096e-05, "loss": 0.0166, "step": 12730 }, { "epoch": 5.9868421052631575, "grad_norm": 0.08514426648616791, "learning_rate": 3.190164416946285e-05, "loss": 0.0165, "step": 12740 }, { "epoch": 5.991541353383458, "grad_norm": 0.09083666652441025, "learning_rate": 3.18246014718085e-05, "loss": 0.0123, "step": 12750 }, { "epoch": 5.996240601503759, "grad_norm": 0.14577679336071014, "learning_rate": 3.1747608464999725e-05, "loss": 0.018, "step": 12760 }, { "epoch": 6.00093984962406, "grad_norm": 0.09893287718296051, "learning_rate": 3.167066535953242e-05, "loss": 0.0163, "step": 12770 }, { "epoch": 6.0056390977443606, "grad_norm": 0.09252132475376129, "learning_rate": 3.1593772365766105e-05, "loss": 0.0147, "step": 12780 }, { "epoch": 6.010338345864661, "grad_norm": 0.09168685972690582, "learning_rate": 3.1516929693923315e-05, "loss": 0.0162, "step": 12790 }, { "epoch": 6.015037593984962, "grad_norm": 0.16114775836467743, "learning_rate": 3.144013755408895e-05, "loss": 0.0165, "step": 12800 }, { "epoch": 6.019736842105263, "grad_norm": 0.15589860081672668, "learning_rate": 3.136339615620985e-05, "loss": 0.0176, "step": 12810 }, { "epoch": 6.024436090225564, "grad_norm": 0.11967472732067108, "learning_rate": 3.128670571009399e-05, "loss": 0.017, "step": 12820 }, { "epoch": 6.029135338345864, "grad_norm": 0.10718057304620743, "learning_rate": 3.121006642541014e-05, "loss": 0.0134, "step": 12830 }, { "epoch": 6.033834586466165, "grad_norm": 0.13733793795108795, "learning_rate": 3.113347851168721e-05, "loss": 0.0191, "step": 12840 }, { "epoch": 6.038533834586466, "grad_norm": 0.17362140119075775, "learning_rate": 3.105694217831361e-05, "loss": 0.0113, "step": 12850 }, { "epoch": 6.043233082706767, "grad_norm": 0.09463926404714584, "learning_rate": 3.098045763453678e-05, "loss": 0.0138, "step": 12860 }, { "epoch": 6.047932330827067, "grad_norm": 0.14590539038181305, "learning_rate": 3.090402508946249e-05, "loss": 0.013, "step": 12870 }, { "epoch": 6.052631578947368, "grad_norm": 0.06830133497714996, "learning_rate": 3.082764475205442e-05, "loss": 0.0116, "step": 12880 }, { "epoch": 6.057330827067669, "grad_norm": 0.06772523373365402, "learning_rate": 3.075131683113352e-05, "loss": 0.0107, "step": 12890 }, { "epoch": 6.06203007518797, "grad_norm": 0.07648127526044846, "learning_rate": 3.0675041535377405e-05, "loss": 0.0112, "step": 12900 }, { "epoch": 6.06672932330827, "grad_norm": 0.10932531207799911, "learning_rate": 3.059881907331979e-05, "loss": 0.0125, "step": 12910 }, { "epoch": 6.071428571428571, "grad_norm": 0.18748976290225983, "learning_rate": 3.052264965335e-05, "loss": 0.0164, "step": 12920 }, { "epoch": 6.076127819548872, "grad_norm": 0.11473233997821808, "learning_rate": 3.0446533483712304e-05, "loss": 0.0143, "step": 12930 }, { "epoch": 6.080827067669173, "grad_norm": 0.1189013198018074, "learning_rate": 3.0370470772505433e-05, "loss": 0.0194, "step": 12940 }, { "epoch": 6.0855263157894735, "grad_norm": 0.05811845138669014, "learning_rate": 3.0294461727681932e-05, "loss": 0.0126, "step": 12950 }, { "epoch": 6.090225563909774, "grad_norm": 0.14157220721244812, "learning_rate": 3.0218506557047598e-05, "loss": 0.0127, "step": 12960 }, { "epoch": 6.094924812030075, "grad_norm": 0.06045945733785629, "learning_rate": 3.0142605468260978e-05, "loss": 0.0134, "step": 12970 }, { "epoch": 6.099624060150376, "grad_norm": 0.15809015929698944, "learning_rate": 3.006675866883275e-05, "loss": 0.016, "step": 12980 }, { "epoch": 6.1043233082706765, "grad_norm": 0.09324190765619278, "learning_rate": 2.999096636612518e-05, "loss": 0.0111, "step": 12990 }, { "epoch": 6.109022556390977, "grad_norm": 0.08457409590482712, "learning_rate": 2.991522876735154e-05, "loss": 0.0131, "step": 13000 }, { "epoch": 6.113721804511278, "grad_norm": 0.1202535331249237, "learning_rate": 2.9839546079575497e-05, "loss": 0.0169, "step": 13010 }, { "epoch": 6.118421052631579, "grad_norm": 0.13873109221458435, "learning_rate": 2.976391850971065e-05, "loss": 0.0159, "step": 13020 }, { "epoch": 6.1231203007518795, "grad_norm": 0.08397231251001358, "learning_rate": 2.9688346264519866e-05, "loss": 0.014, "step": 13030 }, { "epoch": 6.12781954887218, "grad_norm": 0.06623484939336777, "learning_rate": 2.9612829550614836e-05, "loss": 0.0156, "step": 13040 }, { "epoch": 6.132518796992481, "grad_norm": 0.11314690858125687, "learning_rate": 2.9537368574455304e-05, "loss": 0.0253, "step": 13050 }, { "epoch": 6.137218045112782, "grad_norm": 0.07116863131523132, "learning_rate": 2.9461963542348737e-05, "loss": 0.014, "step": 13060 }, { "epoch": 6.1419172932330826, "grad_norm": 0.1338961273431778, "learning_rate": 2.9386614660449596e-05, "loss": 0.0178, "step": 13070 }, { "epoch": 6.146616541353383, "grad_norm": 0.16874520480632782, "learning_rate": 2.931132213475884e-05, "loss": 0.0169, "step": 13080 }, { "epoch": 6.151315789473684, "grad_norm": 0.09894317388534546, "learning_rate": 2.9236086171123404e-05, "loss": 0.0117, "step": 13090 }, { "epoch": 6.156015037593985, "grad_norm": 0.14408692717552185, "learning_rate": 2.916090697523549e-05, "loss": 0.0174, "step": 13100 }, { "epoch": 6.160714285714286, "grad_norm": 0.16142411530017853, "learning_rate": 2.9085784752632157e-05, "loss": 0.02, "step": 13110 }, { "epoch": 6.165413533834586, "grad_norm": 0.10157699137926102, "learning_rate": 2.9010719708694722e-05, "loss": 0.0178, "step": 13120 }, { "epoch": 6.170112781954887, "grad_norm": 0.11045944690704346, "learning_rate": 2.8935712048648112e-05, "loss": 0.0141, "step": 13130 }, { "epoch": 6.174812030075188, "grad_norm": 0.16046114265918732, "learning_rate": 2.8860761977560436e-05, "loss": 0.026, "step": 13140 }, { "epoch": 6.179511278195489, "grad_norm": 0.10078492015600204, "learning_rate": 2.878586970034232e-05, "loss": 0.0205, "step": 13150 }, { "epoch": 6.184210526315789, "grad_norm": 0.12906494736671448, "learning_rate": 2.8711035421746367e-05, "loss": 0.0267, "step": 13160 }, { "epoch": 6.18890977443609, "grad_norm": 0.10753978043794632, "learning_rate": 2.8636259346366666e-05, "loss": 0.013, "step": 13170 }, { "epoch": 6.193609022556391, "grad_norm": 0.12442290037870407, "learning_rate": 2.8561541678638142e-05, "loss": 0.0126, "step": 13180 }, { "epoch": 6.198308270676692, "grad_norm": 0.16419479250907898, "learning_rate": 2.8486882622836026e-05, "loss": 0.0181, "step": 13190 }, { "epoch": 6.203007518796992, "grad_norm": 0.1196175292134285, "learning_rate": 2.8412282383075363e-05, "loss": 0.017, "step": 13200 }, { "epoch": 6.207706766917293, "grad_norm": 0.09084620326757431, "learning_rate": 2.8337741163310317e-05, "loss": 0.0122, "step": 13210 }, { "epoch": 6.212406015037594, "grad_norm": 0.08656840771436691, "learning_rate": 2.8263259167333777e-05, "loss": 0.0203, "step": 13220 }, { "epoch": 6.217105263157895, "grad_norm": 0.1413203328847885, "learning_rate": 2.8188836598776662e-05, "loss": 0.0155, "step": 13230 }, { "epoch": 6.2218045112781954, "grad_norm": 0.1380605250597, "learning_rate": 2.811447366110741e-05, "loss": 0.0154, "step": 13240 }, { "epoch": 6.226503759398496, "grad_norm": 0.13681164383888245, "learning_rate": 2.804017055763149e-05, "loss": 0.0131, "step": 13250 }, { "epoch": 6.231203007518797, "grad_norm": 0.10424366593360901, "learning_rate": 2.7965927491490705e-05, "loss": 0.0173, "step": 13260 }, { "epoch": 6.235902255639098, "grad_norm": 0.2111925184726715, "learning_rate": 2.7891744665662823e-05, "loss": 0.0165, "step": 13270 }, { "epoch": 6.2406015037593985, "grad_norm": 0.09070563316345215, "learning_rate": 2.7817622282960815e-05, "loss": 0.0136, "step": 13280 }, { "epoch": 6.245300751879699, "grad_norm": 0.1343836635351181, "learning_rate": 2.774356054603243e-05, "loss": 0.0137, "step": 13290 }, { "epoch": 6.25, "grad_norm": 0.17384490370750427, "learning_rate": 2.766955965735968e-05, "loss": 0.0141, "step": 13300 }, { "epoch": 6.254699248120301, "grad_norm": 0.07739756256341934, "learning_rate": 2.7595619819258116e-05, "loss": 0.0144, "step": 13310 }, { "epoch": 6.2593984962406015, "grad_norm": 0.14672629535198212, "learning_rate": 2.7521741233876496e-05, "loss": 0.0195, "step": 13320 }, { "epoch": 6.264097744360902, "grad_norm": 0.20344915986061096, "learning_rate": 2.7447924103195976e-05, "loss": 0.0133, "step": 13330 }, { "epoch": 6.268796992481203, "grad_norm": 0.20322270691394806, "learning_rate": 2.7374168629029813e-05, "loss": 0.0199, "step": 13340 }, { "epoch": 6.273496240601504, "grad_norm": 0.1515226662158966, "learning_rate": 2.7300475013022663e-05, "loss": 0.0193, "step": 13350 }, { "epoch": 6.2781954887218046, "grad_norm": 0.1119297668337822, "learning_rate": 2.7226843456650037e-05, "loss": 0.0192, "step": 13360 }, { "epoch": 6.282894736842105, "grad_norm": 0.0805499255657196, "learning_rate": 2.7153274161217846e-05, "loss": 0.009, "step": 13370 }, { "epoch": 6.287593984962406, "grad_norm": 0.07355663180351257, "learning_rate": 2.707976732786166e-05, "loss": 0.0172, "step": 13380 }, { "epoch": 6.292293233082707, "grad_norm": 0.06784632056951523, "learning_rate": 2.7006323157546386e-05, "loss": 0.0132, "step": 13390 }, { "epoch": 6.296992481203008, "grad_norm": 0.15547586977481842, "learning_rate": 2.693294185106562e-05, "loss": 0.0138, "step": 13400 }, { "epoch": 6.301691729323308, "grad_norm": 0.10337734967470169, "learning_rate": 2.6859623609040984e-05, "loss": 0.0117, "step": 13410 }, { "epoch": 6.306390977443609, "grad_norm": 0.09480666369199753, "learning_rate": 2.6786368631921836e-05, "loss": 0.016, "step": 13420 }, { "epoch": 6.31109022556391, "grad_norm": 0.16595226526260376, "learning_rate": 2.67131771199844e-05, "loss": 0.0192, "step": 13430 }, { "epoch": 6.315789473684211, "grad_norm": 0.07386650145053864, "learning_rate": 2.6640049273331515e-05, "loss": 0.0119, "step": 13440 }, { "epoch": 6.320488721804511, "grad_norm": 0.14563970267772675, "learning_rate": 2.656698529189193e-05, "loss": 0.017, "step": 13450 }, { "epoch": 6.325187969924812, "grad_norm": 0.08166952431201935, "learning_rate": 2.6493985375419778e-05, "loss": 0.0176, "step": 13460 }, { "epoch": 6.329887218045113, "grad_norm": 0.123371921479702, "learning_rate": 2.642104972349403e-05, "loss": 0.019, "step": 13470 }, { "epoch": 6.334586466165414, "grad_norm": 0.08788731694221497, "learning_rate": 2.6348178535517966e-05, "loss": 0.0203, "step": 13480 }, { "epoch": 6.339285714285714, "grad_norm": 0.07246407121419907, "learning_rate": 2.6275372010718635e-05, "loss": 0.0179, "step": 13490 }, { "epoch": 6.343984962406015, "grad_norm": 0.13420486450195312, "learning_rate": 2.6202630348146324e-05, "loss": 0.0143, "step": 13500 }, { "epoch": 6.348684210526316, "grad_norm": 0.10652023553848267, "learning_rate": 2.612995374667394e-05, "loss": 0.0084, "step": 13510 }, { "epoch": 6.353383458646617, "grad_norm": 0.1149219200015068, "learning_rate": 2.6057342404996522e-05, "loss": 0.01, "step": 13520 }, { "epoch": 6.3580827067669174, "grad_norm": 0.08880855143070221, "learning_rate": 2.5984796521630737e-05, "loss": 0.0138, "step": 13530 }, { "epoch": 6.362781954887218, "grad_norm": 0.153299942612648, "learning_rate": 2.591231629491423e-05, "loss": 0.0109, "step": 13540 }, { "epoch": 6.367481203007519, "grad_norm": 0.10760670900344849, "learning_rate": 2.5839901923005205e-05, "loss": 0.0125, "step": 13550 }, { "epoch": 6.37218045112782, "grad_norm": 0.15970246493816376, "learning_rate": 2.5767553603881767e-05, "loss": 0.0162, "step": 13560 }, { "epoch": 6.3768796992481205, "grad_norm": 0.16052334010601044, "learning_rate": 2.5695271535341443e-05, "loss": 0.0142, "step": 13570 }, { "epoch": 6.381578947368421, "grad_norm": 0.11121491342782974, "learning_rate": 2.562305591500069e-05, "loss": 0.0137, "step": 13580 }, { "epoch": 6.386278195488722, "grad_norm": 0.06344175338745117, "learning_rate": 2.555090694029421e-05, "loss": 0.0143, "step": 13590 }, { "epoch": 6.390977443609023, "grad_norm": 0.12826725840568542, "learning_rate": 2.547882480847461e-05, "loss": 0.024, "step": 13600 }, { "epoch": 6.3956766917293235, "grad_norm": 0.07199674844741821, "learning_rate": 2.540680971661161e-05, "loss": 0.0155, "step": 13610 }, { "epoch": 6.400375939849624, "grad_norm": 0.1330031007528305, "learning_rate": 2.5334861861591753e-05, "loss": 0.0123, "step": 13620 }, { "epoch": 6.405075187969925, "grad_norm": 0.09626597911119461, "learning_rate": 2.526298144011775e-05, "loss": 0.0184, "step": 13630 }, { "epoch": 6.409774436090226, "grad_norm": 0.18249697983264923, "learning_rate": 2.5191168648707887e-05, "loss": 0.0091, "step": 13640 }, { "epoch": 6.4144736842105265, "grad_norm": 0.08582185208797455, "learning_rate": 2.511942368369566e-05, "loss": 0.0097, "step": 13650 }, { "epoch": 6.419172932330827, "grad_norm": 0.09408201277256012, "learning_rate": 2.5047746741228978e-05, "loss": 0.0147, "step": 13660 }, { "epoch": 6.423872180451128, "grad_norm": 0.17775125801563263, "learning_rate": 2.4976138017269908e-05, "loss": 0.0078, "step": 13670 }, { "epoch": 6.428571428571429, "grad_norm": 0.09252524375915527, "learning_rate": 2.490459770759398e-05, "loss": 0.0159, "step": 13680 }, { "epoch": 6.43327067669173, "grad_norm": 0.08252805471420288, "learning_rate": 2.4833126007789653e-05, "loss": 0.0099, "step": 13690 }, { "epoch": 6.43796992481203, "grad_norm": 0.11147051304578781, "learning_rate": 2.476172311325783e-05, "loss": 0.0124, "step": 13700 }, { "epoch": 6.442669172932331, "grad_norm": 0.12817934155464172, "learning_rate": 2.4690389219211273e-05, "loss": 0.0136, "step": 13710 }, { "epoch": 6.447368421052632, "grad_norm": 0.15487729012966156, "learning_rate": 2.4619124520674146e-05, "loss": 0.0189, "step": 13720 }, { "epoch": 6.452067669172933, "grad_norm": 0.12925255298614502, "learning_rate": 2.4547929212481435e-05, "loss": 0.0178, "step": 13730 }, { "epoch": 6.456766917293233, "grad_norm": 0.12829270958900452, "learning_rate": 2.447680348927837e-05, "loss": 0.0208, "step": 13740 }, { "epoch": 6.461466165413534, "grad_norm": 0.1329372674226761, "learning_rate": 2.4405747545519963e-05, "loss": 0.0146, "step": 13750 }, { "epoch": 6.466165413533835, "grad_norm": 0.10055895149707794, "learning_rate": 2.433476157547044e-05, "loss": 0.0199, "step": 13760 }, { "epoch": 6.470864661654136, "grad_norm": 0.109032541513443, "learning_rate": 2.4263845773202736e-05, "loss": 0.0252, "step": 13770 }, { "epoch": 6.475563909774436, "grad_norm": 0.1153714656829834, "learning_rate": 2.419300033259798e-05, "loss": 0.0138, "step": 13780 }, { "epoch": 6.480263157894737, "grad_norm": 0.10681688040494919, "learning_rate": 2.4122225447344875e-05, "loss": 0.0155, "step": 13790 }, { "epoch": 6.484962406015038, "grad_norm": 0.12183579057455063, "learning_rate": 2.405152131093926e-05, "loss": 0.0109, "step": 13800 }, { "epoch": 6.489661654135339, "grad_norm": 0.13035376369953156, "learning_rate": 2.3980888116683515e-05, "loss": 0.0217, "step": 13810 }, { "epoch": 6.4943609022556394, "grad_norm": 0.06607703119516373, "learning_rate": 2.3910326057686127e-05, "loss": 0.0117, "step": 13820 }, { "epoch": 6.49906015037594, "grad_norm": 0.11934183537960052, "learning_rate": 2.3839835326861104e-05, "loss": 0.0165, "step": 13830 }, { "epoch": 6.503759398496241, "grad_norm": 0.13289965689182281, "learning_rate": 2.3769416116927335e-05, "loss": 0.0285, "step": 13840 }, { "epoch": 6.508458646616542, "grad_norm": 0.11643469333648682, "learning_rate": 2.3699068620408304e-05, "loss": 0.0134, "step": 13850 }, { "epoch": 6.5131578947368425, "grad_norm": 0.11470893025398254, "learning_rate": 2.362879302963135e-05, "loss": 0.0112, "step": 13860 }, { "epoch": 6.517857142857143, "grad_norm": 0.10102832317352295, "learning_rate": 2.3558589536727277e-05, "loss": 0.0125, "step": 13870 }, { "epoch": 6.522556390977444, "grad_norm": 0.1053953766822815, "learning_rate": 2.3488458333629777e-05, "loss": 0.0123, "step": 13880 }, { "epoch": 6.527255639097744, "grad_norm": 0.12388722598552704, "learning_rate": 2.341839961207482e-05, "loss": 0.0172, "step": 13890 }, { "epoch": 6.5319548872180455, "grad_norm": 0.13171012699604034, "learning_rate": 2.3348413563600325e-05, "loss": 0.017, "step": 13900 }, { "epoch": 6.536654135338345, "grad_norm": 0.10711341351270676, "learning_rate": 2.3278500379545436e-05, "loss": 0.0152, "step": 13910 }, { "epoch": 6.541353383458647, "grad_norm": 0.17922531068325043, "learning_rate": 2.3208660251050158e-05, "loss": 0.0132, "step": 13920 }, { "epoch": 6.546052631578947, "grad_norm": 0.07679407298564911, "learning_rate": 2.3138893369054766e-05, "loss": 0.0133, "step": 13930 }, { "epoch": 6.5507518796992485, "grad_norm": 0.16956987977027893, "learning_rate": 2.3069199924299174e-05, "loss": 0.0115, "step": 13940 }, { "epoch": 6.555451127819548, "grad_norm": 0.10801958292722702, "learning_rate": 2.2999580107322653e-05, "loss": 0.0212, "step": 13950 }, { "epoch": 6.56015037593985, "grad_norm": 0.14689181745052338, "learning_rate": 2.29300341084631e-05, "loss": 0.0141, "step": 13960 }, { "epoch": 6.56484962406015, "grad_norm": 0.10904575139284134, "learning_rate": 2.2860562117856647e-05, "loss": 0.0139, "step": 13970 }, { "epoch": 6.569548872180452, "grad_norm": 0.06999867409467697, "learning_rate": 2.279116432543705e-05, "loss": 0.0139, "step": 13980 }, { "epoch": 6.5742481203007515, "grad_norm": 0.09230833500623703, "learning_rate": 2.2721840920935196e-05, "loss": 0.0147, "step": 13990 }, { "epoch": 6.578947368421053, "grad_norm": 0.19919702410697937, "learning_rate": 2.2652592093878666e-05, "loss": 0.0162, "step": 14000 }, { "epoch": 6.583646616541353, "grad_norm": 0.0856180340051651, "learning_rate": 2.258341803359108e-05, "loss": 0.0118, "step": 14010 }, { "epoch": 6.588345864661655, "grad_norm": 0.11447542905807495, "learning_rate": 2.251431892919171e-05, "loss": 0.0176, "step": 14020 }, { "epoch": 6.5930451127819545, "grad_norm": 0.11393177509307861, "learning_rate": 2.2445294969594844e-05, "loss": 0.0122, "step": 14030 }, { "epoch": 6.597744360902256, "grad_norm": 0.06813038140535355, "learning_rate": 2.237634634350934e-05, "loss": 0.0112, "step": 14040 }, { "epoch": 6.602443609022556, "grad_norm": 0.1496971994638443, "learning_rate": 2.2307473239438154e-05, "loss": 0.0137, "step": 14050 }, { "epoch": 6.607142857142857, "grad_norm": 0.15726064145565033, "learning_rate": 2.2238675845677663e-05, "loss": 0.0148, "step": 14060 }, { "epoch": 6.6118421052631575, "grad_norm": 0.06907601654529572, "learning_rate": 2.2169954350317374e-05, "loss": 0.0104, "step": 14070 }, { "epoch": 6.616541353383458, "grad_norm": 0.10363949090242386, "learning_rate": 2.2101308941239203e-05, "loss": 0.0203, "step": 14080 }, { "epoch": 6.621240601503759, "grad_norm": 0.1385289579629898, "learning_rate": 2.2032739806117058e-05, "loss": 0.0154, "step": 14090 }, { "epoch": 6.62593984962406, "grad_norm": 0.10194943100214005, "learning_rate": 2.196424713241637e-05, "loss": 0.0198, "step": 14100 }, { "epoch": 6.6306390977443606, "grad_norm": 0.06067529693245888, "learning_rate": 2.1895831107393484e-05, "loss": 0.0124, "step": 14110 }, { "epoch": 6.635338345864661, "grad_norm": 0.0793943703174591, "learning_rate": 2.182749191809518e-05, "loss": 0.0121, "step": 14120 }, { "epoch": 6.640037593984962, "grad_norm": 0.09581856429576874, "learning_rate": 2.1759229751358217e-05, "loss": 0.0164, "step": 14130 }, { "epoch": 6.644736842105263, "grad_norm": 0.11547362804412842, "learning_rate": 2.1691044793808734e-05, "loss": 0.0156, "step": 14140 }, { "epoch": 6.649436090225564, "grad_norm": 0.12122860550880432, "learning_rate": 2.1622937231861822e-05, "loss": 0.01, "step": 14150 }, { "epoch": 6.654135338345864, "grad_norm": 0.10450417548418045, "learning_rate": 2.1554907251720945e-05, "loss": 0.0109, "step": 14160 }, { "epoch": 6.658834586466165, "grad_norm": 0.21685545146465302, "learning_rate": 2.148695503937745e-05, "loss": 0.0137, "step": 14170 }, { "epoch": 6.663533834586466, "grad_norm": 0.07424575835466385, "learning_rate": 2.1419080780610123e-05, "loss": 0.0092, "step": 14180 }, { "epoch": 6.668233082706767, "grad_norm": 0.10741780698299408, "learning_rate": 2.1351284660984572e-05, "loss": 0.0131, "step": 14190 }, { "epoch": 6.672932330827067, "grad_norm": 0.162250816822052, "learning_rate": 2.128356686585282e-05, "loss": 0.0154, "step": 14200 }, { "epoch": 6.677631578947368, "grad_norm": 0.10091499984264374, "learning_rate": 2.121592758035273e-05, "loss": 0.015, "step": 14210 }, { "epoch": 6.682330827067669, "grad_norm": 0.12936082482337952, "learning_rate": 2.1148366989407496e-05, "loss": 0.0115, "step": 14220 }, { "epoch": 6.68703007518797, "grad_norm": 0.05973741412162781, "learning_rate": 2.1080885277725236e-05, "loss": 0.0097, "step": 14230 }, { "epoch": 6.69172932330827, "grad_norm": 0.1663609743118286, "learning_rate": 2.1013482629798333e-05, "loss": 0.0204, "step": 14240 }, { "epoch": 6.696428571428571, "grad_norm": 0.13079845905303955, "learning_rate": 2.094615922990309e-05, "loss": 0.0128, "step": 14250 }, { "epoch": 6.701127819548872, "grad_norm": 0.133228600025177, "learning_rate": 2.0878915262099098e-05, "loss": 0.0117, "step": 14260 }, { "epoch": 6.705827067669173, "grad_norm": 0.13159476220607758, "learning_rate": 2.0811750910228774e-05, "loss": 0.0152, "step": 14270 }, { "epoch": 6.7105263157894735, "grad_norm": 0.17307148873806, "learning_rate": 2.0744666357916925e-05, "loss": 0.0149, "step": 14280 }, { "epoch": 6.715225563909774, "grad_norm": 0.14028523862361908, "learning_rate": 2.067766178857013e-05, "loss": 0.0249, "step": 14290 }, { "epoch": 6.719924812030075, "grad_norm": 0.11767170578241348, "learning_rate": 2.061073738537635e-05, "loss": 0.0104, "step": 14300 }, { "epoch": 6.724624060150376, "grad_norm": 0.09434834867715836, "learning_rate": 2.0543893331304333e-05, "loss": 0.0164, "step": 14310 }, { "epoch": 6.7293233082706765, "grad_norm": 0.13524094223976135, "learning_rate": 2.0477129809103147e-05, "loss": 0.0144, "step": 14320 }, { "epoch": 6.734022556390977, "grad_norm": 0.12902255356311798, "learning_rate": 2.0410447001301753e-05, "loss": 0.0126, "step": 14330 }, { "epoch": 6.738721804511278, "grad_norm": 0.12430833280086517, "learning_rate": 2.0343845090208368e-05, "loss": 0.0136, "step": 14340 }, { "epoch": 6.743421052631579, "grad_norm": 0.15644432604312897, "learning_rate": 2.0277324257910106e-05, "loss": 0.0175, "step": 14350 }, { "epoch": 6.7481203007518795, "grad_norm": 0.10792649537324905, "learning_rate": 2.0210884686272368e-05, "loss": 0.0097, "step": 14360 }, { "epoch": 6.75281954887218, "grad_norm": 0.11637542396783829, "learning_rate": 2.0144526556938387e-05, "loss": 0.0275, "step": 14370 }, { "epoch": 6.757518796992481, "grad_norm": 0.1322125643491745, "learning_rate": 2.0078250051328784e-05, "loss": 0.018, "step": 14380 }, { "epoch": 6.762218045112782, "grad_norm": 0.11292218416929245, "learning_rate": 2.0012055350640986e-05, "loss": 0.0098, "step": 14390 }, { "epoch": 6.7669172932330826, "grad_norm": 0.15685032308101654, "learning_rate": 1.9945942635848748e-05, "loss": 0.016, "step": 14400 }, { "epoch": 6.771616541353383, "grad_norm": 0.10153713822364807, "learning_rate": 1.9879912087701753e-05, "loss": 0.0094, "step": 14410 }, { "epoch": 6.776315789473684, "grad_norm": 0.0829312652349472, "learning_rate": 1.981396388672496e-05, "loss": 0.0104, "step": 14420 }, { "epoch": 6.781015037593985, "grad_norm": 0.19711576402187347, "learning_rate": 1.974809821321827e-05, "loss": 0.0178, "step": 14430 }, { "epoch": 6.785714285714286, "grad_norm": 0.07211043685674667, "learning_rate": 1.9682315247255894e-05, "loss": 0.017, "step": 14440 }, { "epoch": 6.790413533834586, "grad_norm": 0.1333143711090088, "learning_rate": 1.9616615168685943e-05, "loss": 0.0136, "step": 14450 }, { "epoch": 6.795112781954887, "grad_norm": 0.05626295506954193, "learning_rate": 1.9550998157129946e-05, "loss": 0.01, "step": 14460 }, { "epoch": 6.799812030075188, "grad_norm": 0.07672274112701416, "learning_rate": 1.9485464391982284e-05, "loss": 0.0097, "step": 14470 }, { "epoch": 6.804511278195489, "grad_norm": 0.08597870171070099, "learning_rate": 1.942001405240979e-05, "loss": 0.0157, "step": 14480 }, { "epoch": 6.809210526315789, "grad_norm": 0.0822441503405571, "learning_rate": 1.9354647317351188e-05, "loss": 0.0096, "step": 14490 }, { "epoch": 6.81390977443609, "grad_norm": 0.13318496942520142, "learning_rate": 1.928936436551661e-05, "loss": 0.0122, "step": 14500 }, { "epoch": 6.818609022556391, "grad_norm": 0.10445527732372284, "learning_rate": 1.9224165375387193e-05, "loss": 0.0107, "step": 14510 }, { "epoch": 6.823308270676692, "grad_norm": 0.18820570409297943, "learning_rate": 1.9159050525214452e-05, "loss": 0.012, "step": 14520 }, { "epoch": 6.828007518796992, "grad_norm": 0.14440257847309113, "learning_rate": 1.909401999301993e-05, "loss": 0.0151, "step": 14530 }, { "epoch": 6.832706766917293, "grad_norm": 0.1307608187198639, "learning_rate": 1.9029073956594606e-05, "loss": 0.0146, "step": 14540 }, { "epoch": 6.837406015037594, "grad_norm": 0.12377108633518219, "learning_rate": 1.8964212593498442e-05, "loss": 0.0139, "step": 14550 }, { "epoch": 6.842105263157895, "grad_norm": 0.11900335550308228, "learning_rate": 1.8899436081059975e-05, "loss": 0.0143, "step": 14560 }, { "epoch": 6.8468045112781954, "grad_norm": 0.12481661140918732, "learning_rate": 1.8834744596375666e-05, "loss": 0.0089, "step": 14570 }, { "epoch": 6.851503759398496, "grad_norm": 0.1504499465227127, "learning_rate": 1.877013831630961e-05, "loss": 0.0174, "step": 14580 }, { "epoch": 6.856203007518797, "grad_norm": 0.18421101570129395, "learning_rate": 1.8705617417492883e-05, "loss": 0.0225, "step": 14590 }, { "epoch": 6.860902255639098, "grad_norm": 0.09719163924455643, "learning_rate": 1.8641182076323148e-05, "loss": 0.0104, "step": 14600 }, { "epoch": 6.8656015037593985, "grad_norm": 0.10158328711986542, "learning_rate": 1.85768324689642e-05, "loss": 0.0151, "step": 14610 }, { "epoch": 6.870300751879699, "grad_norm": 0.08070684224367142, "learning_rate": 1.851256877134538e-05, "loss": 0.0183, "step": 14620 }, { "epoch": 6.875, "grad_norm": 0.06398982554674149, "learning_rate": 1.8448391159161204e-05, "loss": 0.0118, "step": 14630 }, { "epoch": 6.879699248120301, "grad_norm": 0.09589491784572601, "learning_rate": 1.838429980787081e-05, "loss": 0.0128, "step": 14640 }, { "epoch": 6.8843984962406015, "grad_norm": 0.1288967728614807, "learning_rate": 1.8320294892697478e-05, "loss": 0.0137, "step": 14650 }, { "epoch": 6.889097744360902, "grad_norm": 0.11724042892456055, "learning_rate": 1.8256376588628238e-05, "loss": 0.0207, "step": 14660 }, { "epoch": 6.893796992481203, "grad_norm": 0.13123568892478943, "learning_rate": 1.8192545070413282e-05, "loss": 0.011, "step": 14670 }, { "epoch": 6.898496240601504, "grad_norm": 0.09076406806707382, "learning_rate": 1.8128800512565513e-05, "loss": 0.013, "step": 14680 }, { "epoch": 6.9031954887218046, "grad_norm": 0.09277361631393433, "learning_rate": 1.8065143089360172e-05, "loss": 0.0125, "step": 14690 }, { "epoch": 6.907894736842105, "grad_norm": 0.10244913399219513, "learning_rate": 1.800157297483417e-05, "loss": 0.0111, "step": 14700 }, { "epoch": 6.912593984962406, "grad_norm": 0.06498057395219803, "learning_rate": 1.7938090342785817e-05, "loss": 0.0111, "step": 14710 }, { "epoch": 6.917293233082707, "grad_norm": 0.06595025211572647, "learning_rate": 1.787469536677419e-05, "loss": 0.0099, "step": 14720 }, { "epoch": 6.921992481203008, "grad_norm": 0.0671527311205864, "learning_rate": 1.7811388220118707e-05, "loss": 0.0144, "step": 14730 }, { "epoch": 6.926691729323308, "grad_norm": 0.07553205639123917, "learning_rate": 1.774816907589873e-05, "loss": 0.0137, "step": 14740 }, { "epoch": 6.931390977443609, "grad_norm": 0.13109339773654938, "learning_rate": 1.768503810695295e-05, "loss": 0.0123, "step": 14750 }, { "epoch": 6.93609022556391, "grad_norm": 0.09198206663131714, "learning_rate": 1.7621995485879062e-05, "loss": 0.0119, "step": 14760 }, { "epoch": 6.940789473684211, "grad_norm": 0.07400915026664734, "learning_rate": 1.755904138503316e-05, "loss": 0.0132, "step": 14770 }, { "epoch": 6.945488721804511, "grad_norm": 0.071579709649086, "learning_rate": 1.749617597652934e-05, "loss": 0.0175, "step": 14780 }, { "epoch": 6.950187969924812, "grad_norm": 0.11160339415073395, "learning_rate": 1.743339943223926e-05, "loss": 0.0153, "step": 14790 }, { "epoch": 6.954887218045113, "grad_norm": 0.1040780320763588, "learning_rate": 1.7370711923791567e-05, "loss": 0.0139, "step": 14800 }, { "epoch": 6.959586466165414, "grad_norm": 0.05674157291650772, "learning_rate": 1.7308113622571544e-05, "loss": 0.0064, "step": 14810 }, { "epoch": 6.964285714285714, "grad_norm": 0.08227528631687164, "learning_rate": 1.7245604699720535e-05, "loss": 0.0131, "step": 14820 }, { "epoch": 6.968984962406015, "grad_norm": 0.062148675322532654, "learning_rate": 1.7183185326135543e-05, "loss": 0.0142, "step": 14830 }, { "epoch": 6.973684210526316, "grad_norm": 0.13188673555850983, "learning_rate": 1.712085567246878e-05, "loss": 0.0198, "step": 14840 }, { "epoch": 6.978383458646617, "grad_norm": 0.09028539806604385, "learning_rate": 1.70586159091271e-05, "loss": 0.0131, "step": 14850 }, { "epoch": 6.9830827067669174, "grad_norm": 0.09629946202039719, "learning_rate": 1.699646620627168e-05, "loss": 0.0158, "step": 14860 }, { "epoch": 6.987781954887218, "grad_norm": 0.05366470292210579, "learning_rate": 1.6934406733817414e-05, "loss": 0.0186, "step": 14870 }, { "epoch": 6.992481203007519, "grad_norm": 0.09256349503993988, "learning_rate": 1.6872437661432517e-05, "loss": 0.0124, "step": 14880 }, { "epoch": 6.99718045112782, "grad_norm": 0.08437152206897736, "learning_rate": 1.6810559158538092e-05, "loss": 0.0125, "step": 14890 }, { "epoch": 7.0018796992481205, "grad_norm": 0.13958819210529327, "learning_rate": 1.6748771394307585e-05, "loss": 0.0126, "step": 14900 }, { "epoch": 7.006578947368421, "grad_norm": 0.10412228107452393, "learning_rate": 1.6687074537666398e-05, "loss": 0.0081, "step": 14910 }, { "epoch": 7.011278195488722, "grad_norm": 0.16561958193778992, "learning_rate": 1.662546875729138e-05, "loss": 0.0236, "step": 14920 }, { "epoch": 7.015977443609023, "grad_norm": 0.10665090382099152, "learning_rate": 1.6563954221610355e-05, "loss": 0.0126, "step": 14930 }, { "epoch": 7.0206766917293235, "grad_norm": 0.07634269446134567, "learning_rate": 1.6502531098801753e-05, "loss": 0.0112, "step": 14940 }, { "epoch": 7.025375939849624, "grad_norm": 0.0811878889799118, "learning_rate": 1.6441199556794033e-05, "loss": 0.0129, "step": 14950 }, { "epoch": 7.030075187969925, "grad_norm": 0.09860299527645111, "learning_rate": 1.637995976326527e-05, "loss": 0.0175, "step": 14960 }, { "epoch": 7.034774436090226, "grad_norm": 0.10491390526294708, "learning_rate": 1.631881188564275e-05, "loss": 0.0158, "step": 14970 }, { "epoch": 7.0394736842105265, "grad_norm": 0.17026887834072113, "learning_rate": 1.62577560911024e-05, "loss": 0.0148, "step": 14980 }, { "epoch": 7.044172932330827, "grad_norm": 0.10451708734035492, "learning_rate": 1.6196792546568472e-05, "loss": 0.0101, "step": 14990 }, { "epoch": 7.048872180451128, "grad_norm": 0.11327500641345978, "learning_rate": 1.6135921418712956e-05, "loss": 0.016, "step": 15000 }, { "epoch": 7.053571428571429, "grad_norm": 0.08484944701194763, "learning_rate": 1.6075142873955164e-05, "loss": 0.0103, "step": 15010 }, { "epoch": 7.05827067669173, "grad_norm": 0.07527375966310501, "learning_rate": 1.6014457078461353e-05, "loss": 0.0077, "step": 15020 }, { "epoch": 7.06296992481203, "grad_norm": 0.10955455899238586, "learning_rate": 1.5953864198144135e-05, "loss": 0.0202, "step": 15030 }, { "epoch": 7.067669172932331, "grad_norm": 0.10458432137966156, "learning_rate": 1.5893364398662176e-05, "loss": 0.0117, "step": 15040 }, { "epoch": 7.072368421052632, "grad_norm": 0.0628298744559288, "learning_rate": 1.583295784541958e-05, "loss": 0.0084, "step": 15050 }, { "epoch": 7.077067669172933, "grad_norm": 0.1378493756055832, "learning_rate": 1.5772644703565565e-05, "loss": 0.0204, "step": 15060 }, { "epoch": 7.081766917293233, "grad_norm": 0.13175398111343384, "learning_rate": 1.5712425137993973e-05, "loss": 0.0137, "step": 15070 }, { "epoch": 7.086466165413534, "grad_norm": 0.12167177349328995, "learning_rate": 1.5652299313342773e-05, "loss": 0.0102, "step": 15080 }, { "epoch": 7.091165413533835, "grad_norm": 0.0790913999080658, "learning_rate": 1.5592267393993716e-05, "loss": 0.0127, "step": 15090 }, { "epoch": 7.095864661654136, "grad_norm": 0.1287565529346466, "learning_rate": 1.553232954407171e-05, "loss": 0.0089, "step": 15100 }, { "epoch": 7.100563909774436, "grad_norm": 0.06657912582159042, "learning_rate": 1.5472485927444597e-05, "loss": 0.012, "step": 15110 }, { "epoch": 7.105263157894737, "grad_norm": 0.05287037044763565, "learning_rate": 1.5412736707722537e-05, "loss": 0.0077, "step": 15120 }, { "epoch": 7.109962406015038, "grad_norm": 0.1070045679807663, "learning_rate": 1.5353082048257596e-05, "loss": 0.0119, "step": 15130 }, { "epoch": 7.114661654135339, "grad_norm": 0.07215863466262817, "learning_rate": 1.5293522112143373e-05, "loss": 0.0133, "step": 15140 }, { "epoch": 7.1193609022556394, "grad_norm": 0.10572236031293869, "learning_rate": 1.5234057062214402e-05, "loss": 0.0188, "step": 15150 }, { "epoch": 7.12406015037594, "grad_norm": 0.12182480096817017, "learning_rate": 1.517468706104589e-05, "loss": 0.0166, "step": 15160 }, { "epoch": 7.128759398496241, "grad_norm": 0.1621452122926712, "learning_rate": 1.5115412270953167e-05, "loss": 0.0182, "step": 15170 }, { "epoch": 7.133458646616542, "grad_norm": 0.09231683611869812, "learning_rate": 1.5056232853991209e-05, "loss": 0.0093, "step": 15180 }, { "epoch": 7.1381578947368425, "grad_norm": 0.10214181244373322, "learning_rate": 1.4997148971954344e-05, "loss": 0.018, "step": 15190 }, { "epoch": 7.142857142857143, "grad_norm": 0.10214631259441376, "learning_rate": 1.4938160786375572e-05, "loss": 0.0173, "step": 15200 }, { "epoch": 7.147556390977444, "grad_norm": 0.11673356592655182, "learning_rate": 1.4879268458526379e-05, "loss": 0.0109, "step": 15210 }, { "epoch": 7.152255639097745, "grad_norm": 0.07134755700826645, "learning_rate": 1.4820472149416154e-05, "loss": 0.0132, "step": 15220 }, { "epoch": 7.1569548872180455, "grad_norm": 0.16357994079589844, "learning_rate": 1.4761772019791748e-05, "loss": 0.0142, "step": 15230 }, { "epoch": 7.161654135338346, "grad_norm": 0.05381006747484207, "learning_rate": 1.470316823013707e-05, "loss": 0.0094, "step": 15240 }, { "epoch": 7.166353383458647, "grad_norm": 0.09296070784330368, "learning_rate": 1.4644660940672627e-05, "loss": 0.0146, "step": 15250 }, { "epoch": 7.171052631578948, "grad_norm": 0.10752391815185547, "learning_rate": 1.4586250311355132e-05, "loss": 0.0124, "step": 15260 }, { "epoch": 7.1757518796992485, "grad_norm": 0.10608847439289093, "learning_rate": 1.4527936501877032e-05, "loss": 0.011, "step": 15270 }, { "epoch": 7.180451127819548, "grad_norm": 0.10653159022331238, "learning_rate": 1.4469719671666043e-05, "loss": 0.0132, "step": 15280 }, { "epoch": 7.18515037593985, "grad_norm": 0.09571955353021622, "learning_rate": 1.4411599979884744e-05, "loss": 0.0164, "step": 15290 }, { "epoch": 7.18984962406015, "grad_norm": 0.05526496097445488, "learning_rate": 1.435357758543015e-05, "loss": 0.0154, "step": 15300 }, { "epoch": 7.194548872180452, "grad_norm": 0.08336817473173141, "learning_rate": 1.4295652646933277e-05, "loss": 0.0151, "step": 15310 }, { "epoch": 7.1992481203007515, "grad_norm": 0.06959939748048782, "learning_rate": 1.4237825322758736e-05, "loss": 0.0155, "step": 15320 }, { "epoch": 7.203947368421052, "grad_norm": 0.15572324395179749, "learning_rate": 1.4180095771004154e-05, "loss": 0.019, "step": 15330 }, { "epoch": 7.208646616541353, "grad_norm": 0.14393118023872375, "learning_rate": 1.412246414949997e-05, "loss": 0.0085, "step": 15340 }, { "epoch": 7.213345864661654, "grad_norm": 0.06008661538362503, "learning_rate": 1.4064930615808808e-05, "loss": 0.011, "step": 15350 }, { "epoch": 7.2180451127819545, "grad_norm": 0.14957301318645477, "learning_rate": 1.4007495327225162e-05, "loss": 0.0129, "step": 15360 }, { "epoch": 7.222744360902255, "grad_norm": 0.1129375472664833, "learning_rate": 1.3950158440774957e-05, "loss": 0.0165, "step": 15370 }, { "epoch": 7.227443609022556, "grad_norm": 0.16532012820243835, "learning_rate": 1.389292011321498e-05, "loss": 0.0101, "step": 15380 }, { "epoch": 7.232142857142857, "grad_norm": 0.07673346996307373, "learning_rate": 1.383578050103268e-05, "loss": 0.0099, "step": 15390 }, { "epoch": 7.2368421052631575, "grad_norm": 0.06047337129712105, "learning_rate": 1.3778739760445552e-05, "loss": 0.0105, "step": 15400 }, { "epoch": 7.241541353383458, "grad_norm": 0.09481897950172424, "learning_rate": 1.3721798047400813e-05, "loss": 0.0136, "step": 15410 }, { "epoch": 7.246240601503759, "grad_norm": 0.13582715392112732, "learning_rate": 1.3664955517574968e-05, "loss": 0.0163, "step": 15420 }, { "epoch": 7.25093984962406, "grad_norm": 0.12135443836450577, "learning_rate": 1.3608212326373249e-05, "loss": 0.0126, "step": 15430 }, { "epoch": 7.2556390977443606, "grad_norm": 0.15014630556106567, "learning_rate": 1.3551568628929434e-05, "loss": 0.0138, "step": 15440 }, { "epoch": 7.260338345864661, "grad_norm": 0.08308325707912445, "learning_rate": 1.3495024580105192e-05, "loss": 0.0186, "step": 15450 }, { "epoch": 7.265037593984962, "grad_norm": 0.18257148563861847, "learning_rate": 1.343858033448982e-05, "loss": 0.0154, "step": 15460 }, { "epoch": 7.269736842105263, "grad_norm": 0.05905809998512268, "learning_rate": 1.3382236046399722e-05, "loss": 0.0097, "step": 15470 }, { "epoch": 7.274436090225564, "grad_norm": 0.15629065036773682, "learning_rate": 1.3325991869878013e-05, "loss": 0.014, "step": 15480 }, { "epoch": 7.279135338345864, "grad_norm": 0.10098100453615189, "learning_rate": 1.3269847958694148e-05, "loss": 0.015, "step": 15490 }, { "epoch": 7.283834586466165, "grad_norm": 0.08035392314195633, "learning_rate": 1.3213804466343421e-05, "loss": 0.0134, "step": 15500 }, { "epoch": 7.288533834586466, "grad_norm": 0.09680403023958206, "learning_rate": 1.3157861546046613e-05, "loss": 0.0132, "step": 15510 }, { "epoch": 7.293233082706767, "grad_norm": 0.12656839191913605, "learning_rate": 1.3102019350749528e-05, "loss": 0.0106, "step": 15520 }, { "epoch": 7.297932330827067, "grad_norm": 0.11206916719675064, "learning_rate": 1.3046278033122577e-05, "loss": 0.0092, "step": 15530 }, { "epoch": 7.302631578947368, "grad_norm": 0.09982562065124512, "learning_rate": 1.299063774556042e-05, "loss": 0.0079, "step": 15540 }, { "epoch": 7.307330827067669, "grad_norm": 0.05940884351730347, "learning_rate": 1.293509864018146e-05, "loss": 0.015, "step": 15550 }, { "epoch": 7.31203007518797, "grad_norm": 0.09302843362092972, "learning_rate": 1.2879660868827508e-05, "loss": 0.0133, "step": 15560 }, { "epoch": 7.31672932330827, "grad_norm": 0.07972191274166107, "learning_rate": 1.2824324583063302e-05, "loss": 0.0152, "step": 15570 }, { "epoch": 7.321428571428571, "grad_norm": 0.11651585251092911, "learning_rate": 1.2769089934176126e-05, "loss": 0.0085, "step": 15580 }, { "epoch": 7.326127819548872, "grad_norm": 0.09426891058683395, "learning_rate": 1.2713957073175425e-05, "loss": 0.0163, "step": 15590 }, { "epoch": 7.330827067669173, "grad_norm": 0.1127847209572792, "learning_rate": 1.2658926150792322e-05, "loss": 0.012, "step": 15600 }, { "epoch": 7.3355263157894735, "grad_norm": 0.08010071516036987, "learning_rate": 1.2603997317479238e-05, "loss": 0.0099, "step": 15610 }, { "epoch": 7.340225563909774, "grad_norm": 0.18087002635002136, "learning_rate": 1.2549170723409549e-05, "loss": 0.0179, "step": 15620 }, { "epoch": 7.344924812030075, "grad_norm": 0.12995214760303497, "learning_rate": 1.2494446518477022e-05, "loss": 0.0139, "step": 15630 }, { "epoch": 7.349624060150376, "grad_norm": 0.10053711384534836, "learning_rate": 1.243982485229559e-05, "loss": 0.0162, "step": 15640 }, { "epoch": 7.3543233082706765, "grad_norm": 0.07871995866298676, "learning_rate": 1.2385305874198776e-05, "loss": 0.0165, "step": 15650 }, { "epoch": 7.359022556390977, "grad_norm": 0.09473975747823715, "learning_rate": 1.233088973323937e-05, "loss": 0.0278, "step": 15660 }, { "epoch": 7.363721804511278, "grad_norm": 0.1175215020775795, "learning_rate": 1.2276576578189064e-05, "loss": 0.0091, "step": 15670 }, { "epoch": 7.368421052631579, "grad_norm": 0.11268990486860275, "learning_rate": 1.2222366557537911e-05, "loss": 0.0119, "step": 15680 }, { "epoch": 7.3731203007518795, "grad_norm": 0.12361589074134827, "learning_rate": 1.2168259819494066e-05, "loss": 0.0189, "step": 15690 }, { "epoch": 7.37781954887218, "grad_norm": 0.05146336555480957, "learning_rate": 1.2114256511983274e-05, "loss": 0.0087, "step": 15700 }, { "epoch": 7.382518796992481, "grad_norm": 0.05579405277967453, "learning_rate": 1.2060356782648503e-05, "loss": 0.0094, "step": 15710 }, { "epoch": 7.387218045112782, "grad_norm": 0.07007358223199844, "learning_rate": 1.2006560778849578e-05, "loss": 0.0097, "step": 15720 }, { "epoch": 7.3919172932330826, "grad_norm": 0.09807036072015762, "learning_rate": 1.1952868647662696e-05, "loss": 0.0118, "step": 15730 }, { "epoch": 7.396616541353383, "grad_norm": 0.1110677719116211, "learning_rate": 1.1899280535880119e-05, "loss": 0.0101, "step": 15740 }, { "epoch": 7.401315789473684, "grad_norm": 0.1247294470667839, "learning_rate": 1.1845796590009683e-05, "loss": 0.016, "step": 15750 }, { "epoch": 7.406015037593985, "grad_norm": 0.10053283721208572, "learning_rate": 1.1792416956274444e-05, "loss": 0.0115, "step": 15760 }, { "epoch": 7.410714285714286, "grad_norm": 0.08065032213926315, "learning_rate": 1.1739141780612306e-05, "loss": 0.0152, "step": 15770 }, { "epoch": 7.415413533834586, "grad_norm": 0.0791928693652153, "learning_rate": 1.1685971208675539e-05, "loss": 0.0087, "step": 15780 }, { "epoch": 7.420112781954887, "grad_norm": 0.12295614182949066, "learning_rate": 1.1632905385830484e-05, "loss": 0.0152, "step": 15790 }, { "epoch": 7.424812030075188, "grad_norm": 0.10082532465457916, "learning_rate": 1.157994445715706e-05, "loss": 0.0097, "step": 15800 }, { "epoch": 7.429511278195489, "grad_norm": 0.11935935169458389, "learning_rate": 1.1527088567448407e-05, "loss": 0.0153, "step": 15810 }, { "epoch": 7.434210526315789, "grad_norm": 0.15272879600524902, "learning_rate": 1.1474337861210543e-05, "loss": 0.0132, "step": 15820 }, { "epoch": 7.43890977443609, "grad_norm": 0.08860108256340027, "learning_rate": 1.1421692482661856e-05, "loss": 0.0166, "step": 15830 }, { "epoch": 7.443609022556391, "grad_norm": 0.10336878895759583, "learning_rate": 1.1369152575732822e-05, "loss": 0.0098, "step": 15840 }, { "epoch": 7.448308270676692, "grad_norm": 0.13501764833927155, "learning_rate": 1.1316718284065537e-05, "loss": 0.0125, "step": 15850 }, { "epoch": 7.453007518796992, "grad_norm": 0.12372566759586334, "learning_rate": 1.1264389751013326e-05, "loss": 0.0129, "step": 15860 }, { "epoch": 7.457706766917293, "grad_norm": 0.1402164101600647, "learning_rate": 1.1212167119640438e-05, "loss": 0.0159, "step": 15870 }, { "epoch": 7.462406015037594, "grad_norm": 0.1364455670118332, "learning_rate": 1.1160050532721528e-05, "loss": 0.0155, "step": 15880 }, { "epoch": 7.467105263157895, "grad_norm": 0.12234701961278915, "learning_rate": 1.1108040132741354e-05, "loss": 0.0155, "step": 15890 }, { "epoch": 7.4718045112781954, "grad_norm": 0.0902191773056984, "learning_rate": 1.1056136061894384e-05, "loss": 0.0209, "step": 15900 }, { "epoch": 7.476503759398496, "grad_norm": 0.060509975999593735, "learning_rate": 1.100433846208434e-05, "loss": 0.0201, "step": 15910 }, { "epoch": 7.481203007518797, "grad_norm": 0.15859703719615936, "learning_rate": 1.095264747492391e-05, "loss": 0.0179, "step": 15920 }, { "epoch": 7.485902255639098, "grad_norm": 0.05780461058020592, "learning_rate": 1.090106324173426e-05, "loss": 0.0136, "step": 15930 }, { "epoch": 7.4906015037593985, "grad_norm": 0.14311784505844116, "learning_rate": 1.0849585903544706e-05, "loss": 0.0147, "step": 15940 }, { "epoch": 7.495300751879699, "grad_norm": 0.08800586313009262, "learning_rate": 1.0798215601092354e-05, "loss": 0.0169, "step": 15950 }, { "epoch": 7.5, "grad_norm": 0.12583638727664948, "learning_rate": 1.0746952474821614e-05, "loss": 0.015, "step": 15960 }, { "epoch": 7.504699248120301, "grad_norm": 0.12645405530929565, "learning_rate": 1.069579666488395e-05, "loss": 0.0166, "step": 15970 }, { "epoch": 7.5093984962406015, "grad_norm": 0.08927604556083679, "learning_rate": 1.0644748311137376e-05, "loss": 0.0137, "step": 15980 }, { "epoch": 7.514097744360902, "grad_norm": 0.05305986478924751, "learning_rate": 1.059380755314613e-05, "loss": 0.016, "step": 15990 }, { "epoch": 7.518796992481203, "grad_norm": 0.05655212327837944, "learning_rate": 1.0542974530180327e-05, "loss": 0.0146, "step": 16000 }, { "epoch": 7.523496240601504, "grad_norm": 0.13975438475608826, "learning_rate": 1.049224938121548e-05, "loss": 0.0157, "step": 16010 }, { "epoch": 7.5281954887218046, "grad_norm": 0.16795960068702698, "learning_rate": 1.0441632244932237e-05, "loss": 0.0168, "step": 16020 }, { "epoch": 7.532894736842105, "grad_norm": 0.09216076880693436, "learning_rate": 1.0391123259715906e-05, "loss": 0.0162, "step": 16030 }, { "epoch": 7.537593984962406, "grad_norm": 0.10978179425001144, "learning_rate": 1.0340722563656107e-05, "loss": 0.0147, "step": 16040 }, { "epoch": 7.542293233082707, "grad_norm": 0.06818930059671402, "learning_rate": 1.0290430294546449e-05, "loss": 0.0138, "step": 16050 }, { "epoch": 7.546992481203008, "grad_norm": 0.11898943781852722, "learning_rate": 1.0240246589884044e-05, "loss": 0.0119, "step": 16060 }, { "epoch": 7.551691729323308, "grad_norm": 0.09943754971027374, "learning_rate": 1.0190171586869258e-05, "loss": 0.0129, "step": 16070 }, { "epoch": 7.556390977443609, "grad_norm": 0.05211075395345688, "learning_rate": 1.0140205422405214e-05, "loss": 0.0083, "step": 16080 }, { "epoch": 7.56109022556391, "grad_norm": 0.09384645521640778, "learning_rate": 1.009034823309749e-05, "loss": 0.0111, "step": 16090 }, { "epoch": 7.565789473684211, "grad_norm": 0.08874189853668213, "learning_rate": 1.0040600155253765e-05, "loss": 0.0063, "step": 16100 }, { "epoch": 7.570488721804511, "grad_norm": 0.05463829264044762, "learning_rate": 9.990961324883358e-06, "loss": 0.0121, "step": 16110 }, { "epoch": 7.575187969924812, "grad_norm": 0.16796351969242096, "learning_rate": 9.941431877696955e-06, "loss": 0.0151, "step": 16120 }, { "epoch": 7.579887218045113, "grad_norm": 0.05948880687355995, "learning_rate": 9.892011949106172e-06, "loss": 0.0144, "step": 16130 }, { "epoch": 7.584586466165414, "grad_norm": 0.06560337543487549, "learning_rate": 9.842701674223187e-06, "loss": 0.0089, "step": 16140 }, { "epoch": 7.589285714285714, "grad_norm": 0.07054495811462402, "learning_rate": 9.793501187860432e-06, "loss": 0.0107, "step": 16150 }, { "epoch": 7.593984962406015, "grad_norm": 0.07890637964010239, "learning_rate": 9.744410624530148e-06, "loss": 0.0138, "step": 16160 }, { "epoch": 7.598684210526316, "grad_norm": 0.10604366660118103, "learning_rate": 9.695430118444048e-06, "loss": 0.0085, "step": 16170 }, { "epoch": 7.603383458646617, "grad_norm": 0.07709035277366638, "learning_rate": 9.646559803512994e-06, "loss": 0.0105, "step": 16180 }, { "epoch": 7.6080827067669174, "grad_norm": 0.10427499562501907, "learning_rate": 9.597799813346525e-06, "loss": 0.0097, "step": 16190 }, { "epoch": 7.612781954887218, "grad_norm": 0.10048440843820572, "learning_rate": 9.549150281252633e-06, "loss": 0.0074, "step": 16200 }, { "epoch": 7.617481203007519, "grad_norm": 0.057601477950811386, "learning_rate": 9.500611340237258e-06, "loss": 0.011, "step": 16210 }, { "epoch": 7.62218045112782, "grad_norm": 0.06594853103160858, "learning_rate": 9.452183123004e-06, "loss": 0.0151, "step": 16220 }, { "epoch": 7.6268796992481205, "grad_norm": 0.06648577749729156, "learning_rate": 9.403865761953779e-06, "loss": 0.0117, "step": 16230 }, { "epoch": 7.631578947368421, "grad_norm": 0.08442472666501999, "learning_rate": 9.355659389184396e-06, "loss": 0.0181, "step": 16240 }, { "epoch": 7.636278195488722, "grad_norm": 0.07537990063428879, "learning_rate": 9.307564136490254e-06, "loss": 0.0112, "step": 16250 }, { "epoch": 7.640977443609023, "grad_norm": 0.1092437133193016, "learning_rate": 9.259580135361929e-06, "loss": 0.0084, "step": 16260 }, { "epoch": 7.6456766917293235, "grad_norm": 0.04986412823200226, "learning_rate": 9.211707516985829e-06, "loss": 0.0087, "step": 16270 }, { "epoch": 7.650375939849624, "grad_norm": 0.06437689810991287, "learning_rate": 9.163946412243896e-06, "loss": 0.0177, "step": 16280 }, { "epoch": 7.655075187969925, "grad_norm": 0.0936589166522026, "learning_rate": 9.116296951713133e-06, "loss": 0.0163, "step": 16290 }, { "epoch": 7.659774436090226, "grad_norm": 0.09114740043878555, "learning_rate": 9.068759265665384e-06, "loss": 0.0096, "step": 16300 }, { "epoch": 7.6644736842105265, "grad_norm": 0.093952976167202, "learning_rate": 9.02133348406684e-06, "loss": 0.0161, "step": 16310 }, { "epoch": 7.669172932330827, "grad_norm": 0.11028216034173965, "learning_rate": 8.974019736577777e-06, "loss": 0.0117, "step": 16320 }, { "epoch": 7.673872180451128, "grad_norm": 0.09880071133375168, "learning_rate": 8.92681815255219e-06, "loss": 0.0095, "step": 16330 }, { "epoch": 7.678571428571429, "grad_norm": 0.10285835713148117, "learning_rate": 8.879728861037384e-06, "loss": 0.0117, "step": 16340 }, { "epoch": 7.68327067669173, "grad_norm": 0.0487658828496933, "learning_rate": 8.832751990773714e-06, "loss": 0.0157, "step": 16350 }, { "epoch": 7.68796992481203, "grad_norm": 0.04792909696698189, "learning_rate": 8.785887670194138e-06, "loss": 0.0183, "step": 16360 }, { "epoch": 7.692669172932331, "grad_norm": 0.09301532804965973, "learning_rate": 8.739136027423894e-06, "loss": 0.0138, "step": 16370 }, { "epoch": 7.697368421052632, "grad_norm": 0.1253780871629715, "learning_rate": 8.692497190280224e-06, "loss": 0.016, "step": 16380 }, { "epoch": 7.702067669172933, "grad_norm": 0.14552897214889526, "learning_rate": 8.645971286271904e-06, "loss": 0.0128, "step": 16390 }, { "epoch": 7.706766917293233, "grad_norm": 0.11963018774986267, "learning_rate": 8.599558442598998e-06, "loss": 0.0112, "step": 16400 }, { "epoch": 7.711466165413534, "grad_norm": 0.13872599601745605, "learning_rate": 8.55325878615244e-06, "loss": 0.0113, "step": 16410 }, { "epoch": 7.716165413533835, "grad_norm": 0.06703979521989822, "learning_rate": 8.507072443513702e-06, "loss": 0.0141, "step": 16420 }, { "epoch": 7.720864661654136, "grad_norm": 0.12049257755279541, "learning_rate": 8.460999540954517e-06, "loss": 0.0145, "step": 16430 }, { "epoch": 7.725563909774436, "grad_norm": 0.09920763969421387, "learning_rate": 8.415040204436426e-06, "loss": 0.0119, "step": 16440 }, { "epoch": 7.730263157894737, "grad_norm": 0.20273980498313904, "learning_rate": 8.369194559610482e-06, "loss": 0.0131, "step": 16450 }, { "epoch": 7.734962406015038, "grad_norm": 0.1277891844511032, "learning_rate": 8.323462731816961e-06, "loss": 0.0163, "step": 16460 }, { "epoch": 7.739661654135339, "grad_norm": 0.09195142984390259, "learning_rate": 8.277844846084898e-06, "loss": 0.0089, "step": 16470 }, { "epoch": 7.7443609022556394, "grad_norm": 0.07057520747184753, "learning_rate": 8.232341027131885e-06, "loss": 0.0125, "step": 16480 }, { "epoch": 7.74906015037594, "grad_norm": 0.12380823493003845, "learning_rate": 8.186951399363613e-06, "loss": 0.0099, "step": 16490 }, { "epoch": 7.753759398496241, "grad_norm": 0.11303659528493881, "learning_rate": 8.141676086873572e-06, "loss": 0.0096, "step": 16500 }, { "epoch": 7.758458646616542, "grad_norm": 0.06908733397722244, "learning_rate": 8.096515213442762e-06, "loss": 0.014, "step": 16510 }, { "epoch": 7.7631578947368425, "grad_norm": 0.12358922511339188, "learning_rate": 8.051468902539272e-06, "loss": 0.0106, "step": 16520 }, { "epoch": 7.767857142857143, "grad_norm": 0.06847315281629562, "learning_rate": 8.00653727731801e-06, "loss": 0.0179, "step": 16530 }, { "epoch": 7.772556390977444, "grad_norm": 0.1353650540113449, "learning_rate": 7.96172046062032e-06, "loss": 0.0143, "step": 16540 }, { "epoch": 7.777255639097744, "grad_norm": 0.13592234253883362, "learning_rate": 7.917018574973645e-06, "loss": 0.0214, "step": 16550 }, { "epoch": 7.7819548872180455, "grad_norm": 0.1253795623779297, "learning_rate": 7.872431742591268e-06, "loss": 0.0095, "step": 16560 }, { "epoch": 7.786654135338345, "grad_norm": 0.2027290016412735, "learning_rate": 7.827960085371855e-06, "loss": 0.0151, "step": 16570 }, { "epoch": 7.791353383458647, "grad_norm": 0.15278691053390503, "learning_rate": 7.783603724899257e-06, "loss": 0.0119, "step": 16580 }, { "epoch": 7.796052631578947, "grad_norm": 0.04820878058671951, "learning_rate": 7.739362782442021e-06, "loss": 0.0102, "step": 16590 }, { "epoch": 7.8007518796992485, "grad_norm": 0.12460605800151825, "learning_rate": 7.695237378953223e-06, "loss": 0.0119, "step": 16600 }, { "epoch": 7.805451127819548, "grad_norm": 0.1490667760372162, "learning_rate": 7.651227635070041e-06, "loss": 0.0119, "step": 16610 }, { "epoch": 7.81015037593985, "grad_norm": 0.09200643748044968, "learning_rate": 7.607333671113409e-06, "loss": 0.0185, "step": 16620 }, { "epoch": 7.81484962406015, "grad_norm": 0.10247211903333664, "learning_rate": 7.56355560708778e-06, "loss": 0.0164, "step": 16630 }, { "epoch": 7.819548872180452, "grad_norm": 0.1183587983250618, "learning_rate": 7.519893562680663e-06, "loss": 0.0116, "step": 16640 }, { "epoch": 7.8242481203007515, "grad_norm": 0.08483126014471054, "learning_rate": 7.476347657262456e-06, "loss": 0.0105, "step": 16650 }, { "epoch": 7.828947368421053, "grad_norm": 0.10963563621044159, "learning_rate": 7.432918009885997e-06, "loss": 0.0148, "step": 16660 }, { "epoch": 7.833646616541353, "grad_norm": 0.03716852888464928, "learning_rate": 7.389604739286271e-06, "loss": 0.0125, "step": 16670 }, { "epoch": 7.838345864661655, "grad_norm": 0.05933445319533348, "learning_rate": 7.3464079638801365e-06, "loss": 0.012, "step": 16680 }, { "epoch": 7.8430451127819545, "grad_norm": 0.06644676625728607, "learning_rate": 7.30332780176588e-06, "loss": 0.0158, "step": 16690 }, { "epoch": 7.847744360902256, "grad_norm": 0.0786653533577919, "learning_rate": 7.260364370723044e-06, "loss": 0.0142, "step": 16700 }, { "epoch": 7.852443609022556, "grad_norm": 0.16818052530288696, "learning_rate": 7.217517788212025e-06, "loss": 0.0095, "step": 16710 }, { "epoch": 7.857142857142857, "grad_norm": 0.05747194588184357, "learning_rate": 7.174788171373731e-06, "loss": 0.0094, "step": 16720 }, { "epoch": 7.8618421052631575, "grad_norm": 0.06959807127714157, "learning_rate": 7.132175637029293e-06, "loss": 0.0095, "step": 16730 }, { "epoch": 7.866541353383458, "grad_norm": 0.0715508908033371, "learning_rate": 7.089680301679752e-06, "loss": 0.0106, "step": 16740 }, { "epoch": 7.871240601503759, "grad_norm": 0.11585424840450287, "learning_rate": 7.047302281505736e-06, "loss": 0.0104, "step": 16750 }, { "epoch": 7.87593984962406, "grad_norm": 0.09123794734477997, "learning_rate": 7.005041692367154e-06, "loss": 0.0151, "step": 16760 }, { "epoch": 7.8806390977443606, "grad_norm": 0.14770396053791046, "learning_rate": 6.962898649802823e-06, "loss": 0.0136, "step": 16770 }, { "epoch": 7.885338345864661, "grad_norm": 0.10611529648303986, "learning_rate": 6.92087326903022e-06, "loss": 0.0134, "step": 16780 }, { "epoch": 7.890037593984962, "grad_norm": 0.07197631895542145, "learning_rate": 6.878965664945108e-06, "loss": 0.0148, "step": 16790 }, { "epoch": 7.894736842105263, "grad_norm": 0.08622337877750397, "learning_rate": 6.837175952121306e-06, "loss": 0.0058, "step": 16800 }, { "epoch": 7.899436090225564, "grad_norm": 0.08402518182992935, "learning_rate": 6.795504244810285e-06, "loss": 0.0102, "step": 16810 }, { "epoch": 7.904135338345864, "grad_norm": 0.07237549871206284, "learning_rate": 6.753950656940905e-06, "loss": 0.0126, "step": 16820 }, { "epoch": 7.908834586466165, "grad_norm": 0.17599986493587494, "learning_rate": 6.712515302119077e-06, "loss": 0.0108, "step": 16830 }, { "epoch": 7.913533834586466, "grad_norm": 0.07870358228683472, "learning_rate": 6.671198293627479e-06, "loss": 0.012, "step": 16840 }, { "epoch": 7.918233082706767, "grad_norm": 0.05515943467617035, "learning_rate": 6.629999744425236e-06, "loss": 0.0073, "step": 16850 }, { "epoch": 7.922932330827067, "grad_norm": 0.1197136715054512, "learning_rate": 6.588919767147639e-06, "loss": 0.0096, "step": 16860 }, { "epoch": 7.927631578947368, "grad_norm": 0.055950380861759186, "learning_rate": 6.5479584741057255e-06, "loss": 0.0131, "step": 16870 }, { "epoch": 7.932330827067669, "grad_norm": 0.08994955569505692, "learning_rate": 6.5071159772861436e-06, "loss": 0.0114, "step": 16880 }, { "epoch": 7.93703007518797, "grad_norm": 0.059606000781059265, "learning_rate": 6.466392388350695e-06, "loss": 0.014, "step": 16890 }, { "epoch": 7.94172932330827, "grad_norm": 0.11336628347635269, "learning_rate": 6.425787818636131e-06, "loss": 0.0109, "step": 16900 }, { "epoch": 7.946428571428571, "grad_norm": 0.08738347887992859, "learning_rate": 6.385302379153818e-06, "loss": 0.0099, "step": 16910 }, { "epoch": 7.951127819548872, "grad_norm": 0.13885965943336487, "learning_rate": 6.344936180589351e-06, "loss": 0.0111, "step": 16920 }, { "epoch": 7.955827067669173, "grad_norm": 0.1789887249469757, "learning_rate": 6.304689333302416e-06, "loss": 0.0119, "step": 16930 }, { "epoch": 7.9605263157894735, "grad_norm": 0.08739109337329865, "learning_rate": 6.264561947326331e-06, "loss": 0.0083, "step": 16940 }, { "epoch": 7.965225563909774, "grad_norm": 0.07735337316989899, "learning_rate": 6.22455413236786e-06, "loss": 0.0112, "step": 16950 }, { "epoch": 7.969924812030075, "grad_norm": 0.15434323251247406, "learning_rate": 6.184665997806832e-06, "loss": 0.0189, "step": 16960 }, { "epoch": 7.974624060150376, "grad_norm": 0.07851307839155197, "learning_rate": 6.144897652695864e-06, "loss": 0.0143, "step": 16970 }, { "epoch": 7.9793233082706765, "grad_norm": 0.16692528128623962, "learning_rate": 6.1052492057601275e-06, "loss": 0.024, "step": 16980 }, { "epoch": 7.984022556390977, "grad_norm": 0.04660286381840706, "learning_rate": 6.0657207653969315e-06, "loss": 0.006, "step": 16990 }, { "epoch": 7.988721804511278, "grad_norm": 0.18530908226966858, "learning_rate": 6.026312439675552e-06, "loss": 0.0106, "step": 17000 }, { "epoch": 7.993421052631579, "grad_norm": 0.1558287888765335, "learning_rate": 5.9870243363368275e-06, "loss": 0.0131, "step": 17010 }, { "epoch": 7.9981203007518795, "grad_norm": 0.08138086646795273, "learning_rate": 5.947856562792925e-06, "loss": 0.0156, "step": 17020 }, { "epoch": 8.00281954887218, "grad_norm": 0.04826957359910011, "learning_rate": 5.908809226127054e-06, "loss": 0.0117, "step": 17030 }, { "epoch": 8.007518796992482, "grad_norm": 0.07623440772294998, "learning_rate": 5.869882433093155e-06, "loss": 0.0137, "step": 17040 }, { "epoch": 8.012218045112782, "grad_norm": 0.11140824854373932, "learning_rate": 5.831076290115573e-06, "loss": 0.0098, "step": 17050 }, { "epoch": 8.016917293233083, "grad_norm": 0.045380230993032455, "learning_rate": 5.79239090328883e-06, "loss": 0.0102, "step": 17060 }, { "epoch": 8.021616541353383, "grad_norm": 0.056830886751413345, "learning_rate": 5.753826378377286e-06, "loss": 0.0089, "step": 17070 }, { "epoch": 8.026315789473685, "grad_norm": 0.10038434714078903, "learning_rate": 5.715382820814885e-06, "loss": 0.0093, "step": 17080 }, { "epoch": 8.031015037593985, "grad_norm": 0.06969669461250305, "learning_rate": 5.67706033570487e-06, "loss": 0.0135, "step": 17090 }, { "epoch": 8.035714285714286, "grad_norm": 0.16016316413879395, "learning_rate": 5.6388590278194096e-06, "loss": 0.0121, "step": 17100 }, { "epoch": 8.040413533834586, "grad_norm": 0.08400869369506836, "learning_rate": 5.600779001599455e-06, "loss": 0.0114, "step": 17110 }, { "epoch": 8.045112781954888, "grad_norm": 0.0524422749876976, "learning_rate": 5.562820361154314e-06, "loss": 0.0084, "step": 17120 }, { "epoch": 8.049812030075188, "grad_norm": 0.1329491287469864, "learning_rate": 5.524983210261481e-06, "loss": 0.0126, "step": 17130 }, { "epoch": 8.05451127819549, "grad_norm": 0.04901084676384926, "learning_rate": 5.48726765236629e-06, "loss": 0.0172, "step": 17140 }, { "epoch": 8.05921052631579, "grad_norm": 0.06812364608049393, "learning_rate": 5.449673790581611e-06, "loss": 0.0113, "step": 17150 }, { "epoch": 8.063909774436091, "grad_norm": 0.04741634428501129, "learning_rate": 5.412201727687644e-06, "loss": 0.0109, "step": 17160 }, { "epoch": 8.068609022556391, "grad_norm": 0.0470438152551651, "learning_rate": 5.374851566131561e-06, "loss": 0.0092, "step": 17170 }, { "epoch": 8.073308270676693, "grad_norm": 0.10451999306678772, "learning_rate": 5.337623408027293e-06, "loss": 0.0115, "step": 17180 }, { "epoch": 8.078007518796992, "grad_norm": 0.08115876466035843, "learning_rate": 5.300517355155215e-06, "loss": 0.0156, "step": 17190 }, { "epoch": 8.082706766917294, "grad_norm": 0.0681278258562088, "learning_rate": 5.263533508961827e-06, "loss": 0.0151, "step": 17200 }, { "epoch": 8.087406015037594, "grad_norm": 0.11319839954376221, "learning_rate": 5.226671970559577e-06, "loss": 0.0129, "step": 17210 }, { "epoch": 8.092105263157896, "grad_norm": 0.0824664905667305, "learning_rate": 5.1899328407264855e-06, "loss": 0.0113, "step": 17220 }, { "epoch": 8.096804511278195, "grad_norm": 0.05062666907906532, "learning_rate": 5.153316219905946e-06, "loss": 0.0164, "step": 17230 }, { "epoch": 8.101503759398497, "grad_norm": 0.10200849920511246, "learning_rate": 5.116822208206396e-06, "loss": 0.0158, "step": 17240 }, { "epoch": 8.106203007518797, "grad_norm": 0.07733986526727676, "learning_rate": 5.080450905401057e-06, "loss": 0.0118, "step": 17250 }, { "epoch": 8.110902255639099, "grad_norm": 0.04709453135728836, "learning_rate": 5.044202410927706e-06, "loss": 0.0083, "step": 17260 }, { "epoch": 8.115601503759398, "grad_norm": 0.06094250828027725, "learning_rate": 5.008076823888319e-06, "loss": 0.0154, "step": 17270 }, { "epoch": 8.1203007518797, "grad_norm": 0.04622756689786911, "learning_rate": 4.972074243048897e-06, "loss": 0.0103, "step": 17280 }, { "epoch": 8.125, "grad_norm": 0.07227181643247604, "learning_rate": 4.936194766839103e-06, "loss": 0.0127, "step": 17290 }, { "epoch": 8.1296992481203, "grad_norm": 0.07325026392936707, "learning_rate": 4.900438493352055e-06, "loss": 0.0155, "step": 17300 }, { "epoch": 8.134398496240602, "grad_norm": 0.1052834540605545, "learning_rate": 4.864805520344051e-06, "loss": 0.0117, "step": 17310 }, { "epoch": 8.139097744360901, "grad_norm": 0.09175686538219452, "learning_rate": 4.829295945234258e-06, "loss": 0.0082, "step": 17320 }, { "epoch": 8.143796992481203, "grad_norm": 0.045797426253557205, "learning_rate": 4.7939098651045235e-06, "loss": 0.0142, "step": 17330 }, { "epoch": 8.148496240601503, "grad_norm": 0.06085168570280075, "learning_rate": 4.758647376699032e-06, "loss": 0.0072, "step": 17340 }, { "epoch": 8.153195488721805, "grad_norm": 0.15534856915473938, "learning_rate": 4.723508576424062e-06, "loss": 0.0157, "step": 17350 }, { "epoch": 8.157894736842104, "grad_norm": 0.1595873087644577, "learning_rate": 4.688493560347773e-06, "loss": 0.0167, "step": 17360 }, { "epoch": 8.162593984962406, "grad_norm": 0.05131183937191963, "learning_rate": 4.653602424199876e-06, "loss": 0.0114, "step": 17370 }, { "epoch": 8.167293233082706, "grad_norm": 0.12188933789730072, "learning_rate": 4.618835263371396e-06, "loss": 0.0119, "step": 17380 }, { "epoch": 8.171992481203008, "grad_norm": 0.12209612131118774, "learning_rate": 4.5841921729144424e-06, "loss": 0.0122, "step": 17390 }, { "epoch": 8.176691729323307, "grad_norm": 0.0490056648850441, "learning_rate": 4.549673247541875e-06, "loss": 0.0112, "step": 17400 }, { "epoch": 8.181390977443609, "grad_norm": 0.1899740844964981, "learning_rate": 4.515278581627141e-06, "loss": 0.0082, "step": 17410 }, { "epoch": 8.186090225563909, "grad_norm": 0.05114210397005081, "learning_rate": 4.48100826920394e-06, "loss": 0.0078, "step": 17420 }, { "epoch": 8.19078947368421, "grad_norm": 0.1333758533000946, "learning_rate": 4.446862403965984e-06, "loss": 0.0127, "step": 17430 }, { "epoch": 8.19548872180451, "grad_norm": 0.05620293319225311, "learning_rate": 4.412841079266777e-06, "loss": 0.012, "step": 17440 }, { "epoch": 8.200187969924812, "grad_norm": 0.08491794764995575, "learning_rate": 4.378944388119311e-06, "loss": 0.0162, "step": 17450 }, { "epoch": 8.204887218045112, "grad_norm": 0.08482471108436584, "learning_rate": 4.3451724231958644e-06, "loss": 0.0098, "step": 17460 }, { "epoch": 8.209586466165414, "grad_norm": 0.11990874260663986, "learning_rate": 4.311525276827682e-06, "loss": 0.01, "step": 17470 }, { "epoch": 8.214285714285714, "grad_norm": 0.07016388326883316, "learning_rate": 4.27800304100478e-06, "loss": 0.0142, "step": 17480 }, { "epoch": 8.218984962406015, "grad_norm": 0.14324408769607544, "learning_rate": 4.244605807375679e-06, "loss": 0.0184, "step": 17490 }, { "epoch": 8.223684210526315, "grad_norm": 0.06899172067642212, "learning_rate": 4.2113336672471245e-06, "loss": 0.0108, "step": 17500 }, { "epoch": 8.228383458646617, "grad_norm": 0.1519225388765335, "learning_rate": 4.178186711583904e-06, "loss": 0.015, "step": 17510 }, { "epoch": 8.233082706766917, "grad_norm": 0.0976828932762146, "learning_rate": 4.145165031008508e-06, "loss": 0.0147, "step": 17520 }, { "epoch": 8.237781954887218, "grad_norm": 0.060619693249464035, "learning_rate": 4.112268715800943e-06, "loss": 0.0137, "step": 17530 }, { "epoch": 8.242481203007518, "grad_norm": 0.05055955424904823, "learning_rate": 4.079497855898501e-06, "loss": 0.0148, "step": 17540 }, { "epoch": 8.24718045112782, "grad_norm": 0.11303461343050003, "learning_rate": 4.046852540895446e-06, "loss": 0.0124, "step": 17550 }, { "epoch": 8.25187969924812, "grad_norm": 0.10226847231388092, "learning_rate": 4.01433286004283e-06, "loss": 0.0148, "step": 17560 }, { "epoch": 8.256578947368421, "grad_norm": 0.13229216635227203, "learning_rate": 3.981938902248222e-06, "loss": 0.0131, "step": 17570 }, { "epoch": 8.261278195488721, "grad_norm": 0.10800975561141968, "learning_rate": 3.949670756075447e-06, "loss": 0.0193, "step": 17580 }, { "epoch": 8.265977443609023, "grad_norm": 0.049899160861968994, "learning_rate": 3.917528509744412e-06, "loss": 0.0089, "step": 17590 }, { "epoch": 8.270676691729323, "grad_norm": 0.052780695259571075, "learning_rate": 3.885512251130763e-06, "loss": 0.0118, "step": 17600 }, { "epoch": 8.275375939849624, "grad_norm": 0.10531821846961975, "learning_rate": 3.8536220677657495e-06, "loss": 0.0231, "step": 17610 }, { "epoch": 8.280075187969924, "grad_norm": 0.14322184026241302, "learning_rate": 3.821858046835913e-06, "loss": 0.0114, "step": 17620 }, { "epoch": 8.284774436090226, "grad_norm": 0.05661779269576073, "learning_rate": 3.790220275182854e-06, "loss": 0.0084, "step": 17630 }, { "epoch": 8.289473684210526, "grad_norm": 0.052848171442747116, "learning_rate": 3.75870883930306e-06, "loss": 0.0094, "step": 17640 }, { "epoch": 8.294172932330827, "grad_norm": 0.133419930934906, "learning_rate": 3.7273238253475785e-06, "loss": 0.0199, "step": 17650 }, { "epoch": 8.298872180451127, "grad_norm": 0.08947378396987915, "learning_rate": 3.696065319121833e-06, "loss": 0.0104, "step": 17660 }, { "epoch": 8.303571428571429, "grad_norm": 0.03752607852220535, "learning_rate": 3.664933406085402e-06, "loss": 0.009, "step": 17670 }, { "epoch": 8.308270676691729, "grad_norm": 0.08471754193305969, "learning_rate": 3.6339281713517303e-06, "loss": 0.0097, "step": 17680 }, { "epoch": 8.31296992481203, "grad_norm": 0.07993436604738235, "learning_rate": 3.60304969968796e-06, "loss": 0.0107, "step": 17690 }, { "epoch": 8.31766917293233, "grad_norm": 0.034584853798151016, "learning_rate": 3.5722980755146517e-06, "loss": 0.0141, "step": 17700 }, { "epoch": 8.322368421052632, "grad_norm": 0.13245198130607605, "learning_rate": 3.541673382905558e-06, "loss": 0.0179, "step": 17710 }, { "epoch": 8.327067669172932, "grad_norm": 0.09505411237478256, "learning_rate": 3.511175705587433e-06, "loss": 0.0125, "step": 17720 }, { "epoch": 8.331766917293233, "grad_norm": 0.12648499011993408, "learning_rate": 3.4808051269397512e-06, "loss": 0.0108, "step": 17730 }, { "epoch": 8.336466165413533, "grad_norm": 0.07255811244249344, "learning_rate": 3.4505617299945336e-06, "loss": 0.0073, "step": 17740 }, { "epoch": 8.341165413533835, "grad_norm": 0.07952384650707245, "learning_rate": 3.420445597436056e-06, "loss": 0.0087, "step": 17750 }, { "epoch": 8.345864661654135, "grad_norm": 0.061990268528461456, "learning_rate": 3.390456811600673e-06, "loss": 0.0118, "step": 17760 }, { "epoch": 8.350563909774436, "grad_norm": 0.14404335618019104, "learning_rate": 3.360595454476595e-06, "loss": 0.0179, "step": 17770 }, { "epoch": 8.355263157894736, "grad_norm": 0.13847926259040833, "learning_rate": 3.3308616077036115e-06, "loss": 0.0108, "step": 17780 }, { "epoch": 8.359962406015038, "grad_norm": 0.0559711791574955, "learning_rate": 3.301255352572946e-06, "loss": 0.0084, "step": 17790 }, { "epoch": 8.364661654135338, "grad_norm": 0.05780694633722305, "learning_rate": 3.271776770026963e-06, "loss": 0.0141, "step": 17800 }, { "epoch": 8.36936090225564, "grad_norm": 0.09520161896944046, "learning_rate": 3.2424259406589664e-06, "loss": 0.0138, "step": 17810 }, { "epoch": 8.37406015037594, "grad_norm": 0.07881022244691849, "learning_rate": 3.213202944713023e-06, "loss": 0.007, "step": 17820 }, { "epoch": 8.378759398496241, "grad_norm": 0.12898702919483185, "learning_rate": 3.1841078620836683e-06, "loss": 0.012, "step": 17830 }, { "epoch": 8.38345864661654, "grad_norm": 0.06967730820178986, "learning_rate": 3.155140772315773e-06, "loss": 0.0117, "step": 17840 }, { "epoch": 8.388157894736842, "grad_norm": 0.14333881437778473, "learning_rate": 3.126301754604233e-06, "loss": 0.0112, "step": 17850 }, { "epoch": 8.392857142857142, "grad_norm": 0.05870426073670387, "learning_rate": 3.0975908877938277e-06, "loss": 0.0082, "step": 17860 }, { "epoch": 8.397556390977444, "grad_norm": 0.04919710382819176, "learning_rate": 3.0690082503789742e-06, "loss": 0.0095, "step": 17870 }, { "epoch": 8.402255639097744, "grad_norm": 0.10509052872657776, "learning_rate": 3.040553920503503e-06, "loss": 0.0116, "step": 17880 }, { "epoch": 8.406954887218046, "grad_norm": 0.07657311856746674, "learning_rate": 3.0122279759604745e-06, "loss": 0.0137, "step": 17890 }, { "epoch": 8.411654135338345, "grad_norm": 0.14278799295425415, "learning_rate": 2.9840304941919415e-06, "loss": 0.0147, "step": 17900 }, { "epoch": 8.416353383458647, "grad_norm": 0.06424115598201752, "learning_rate": 2.9559615522887273e-06, "loss": 0.0125, "step": 17910 }, { "epoch": 8.421052631578947, "grad_norm": 0.140712708234787, "learning_rate": 2.928021226990263e-06, "loss": 0.0109, "step": 17920 }, { "epoch": 8.425751879699249, "grad_norm": 0.04652019590139389, "learning_rate": 2.9002095946843277e-06, "loss": 0.0125, "step": 17930 }, { "epoch": 8.430451127819548, "grad_norm": 0.07694724202156067, "learning_rate": 2.8725267314068495e-06, "loss": 0.0075, "step": 17940 }, { "epoch": 8.43515037593985, "grad_norm": 0.08953419327735901, "learning_rate": 2.844972712841737e-06, "loss": 0.0124, "step": 17950 }, { "epoch": 8.43984962406015, "grad_norm": 0.07722273468971252, "learning_rate": 2.817547614320615e-06, "loss": 0.0165, "step": 17960 }, { "epoch": 8.444548872180452, "grad_norm": 0.10457627475261688, "learning_rate": 2.790251510822661e-06, "loss": 0.0104, "step": 17970 }, { "epoch": 8.449248120300751, "grad_norm": 0.07237595319747925, "learning_rate": 2.7630844769743757e-06, "loss": 0.0139, "step": 17980 }, { "epoch": 8.453947368421053, "grad_norm": 0.07277540117502213, "learning_rate": 2.73604658704939e-06, "loss": 0.0141, "step": 17990 }, { "epoch": 8.458646616541353, "grad_norm": 0.0630272775888443, "learning_rate": 2.7091379149682685e-06, "loss": 0.0126, "step": 18000 }, { "epoch": 8.463345864661655, "grad_norm": 0.09985774755477905, "learning_rate": 2.682358534298285e-06, "loss": 0.0135, "step": 18010 }, { "epoch": 8.468045112781954, "grad_norm": 0.05475354194641113, "learning_rate": 2.6557085182532582e-06, "loss": 0.0094, "step": 18020 }, { "epoch": 8.472744360902256, "grad_norm": 0.145661398768425, "learning_rate": 2.6291879396933004e-06, "loss": 0.0104, "step": 18030 }, { "epoch": 8.477443609022556, "grad_norm": 0.04099111631512642, "learning_rate": 2.602796871124663e-06, "loss": 0.0068, "step": 18040 }, { "epoch": 8.482142857142858, "grad_norm": 0.09392759948968887, "learning_rate": 2.57653538469953e-06, "loss": 0.0115, "step": 18050 }, { "epoch": 8.486842105263158, "grad_norm": 0.06434937566518784, "learning_rate": 2.5504035522157854e-06, "loss": 0.0123, "step": 18060 }, { "epoch": 8.49154135338346, "grad_norm": 0.12161760032176971, "learning_rate": 2.5244014451168863e-06, "loss": 0.009, "step": 18070 }, { "epoch": 8.496240601503759, "grad_norm": 0.06177238002419472, "learning_rate": 2.4985291344915674e-06, "loss": 0.0085, "step": 18080 }, { "epoch": 8.50093984962406, "grad_norm": 0.10747835785150528, "learning_rate": 2.4727866910737583e-06, "loss": 0.0115, "step": 18090 }, { "epoch": 8.50563909774436, "grad_norm": 0.09225572645664215, "learning_rate": 2.4471741852423237e-06, "loss": 0.0126, "step": 18100 }, { "epoch": 8.510338345864662, "grad_norm": 0.11941071599721909, "learning_rate": 2.421691687020855e-06, "loss": 0.018, "step": 18110 }, { "epoch": 8.515037593984962, "grad_norm": 0.052437394857406616, "learning_rate": 2.3963392660775575e-06, "loss": 0.0105, "step": 18120 }, { "epoch": 8.519736842105264, "grad_norm": 0.13413766026496887, "learning_rate": 2.371116991724953e-06, "loss": 0.0083, "step": 18130 }, { "epoch": 8.524436090225564, "grad_norm": 0.0477786548435688, "learning_rate": 2.3460249329197824e-06, "loss": 0.0128, "step": 18140 }, { "epoch": 8.529135338345865, "grad_norm": 0.05250224471092224, "learning_rate": 2.321063158262793e-06, "loss": 0.0166, "step": 18150 }, { "epoch": 8.533834586466165, "grad_norm": 0.0531403049826622, "learning_rate": 2.296231735998511e-06, "loss": 0.0072, "step": 18160 }, { "epoch": 8.538533834586467, "grad_norm": 0.046157341450452805, "learning_rate": 2.271530734015104e-06, "loss": 0.0101, "step": 18170 }, { "epoch": 8.543233082706767, "grad_norm": 0.16874974966049194, "learning_rate": 2.2469602198441573e-06, "loss": 0.0169, "step": 18180 }, { "epoch": 8.547932330827068, "grad_norm": 0.08781706541776657, "learning_rate": 2.222520260660521e-06, "loss": 0.0087, "step": 18190 }, { "epoch": 8.552631578947368, "grad_norm": 0.08364150673151016, "learning_rate": 2.1982109232821178e-06, "loss": 0.0154, "step": 18200 }, { "epoch": 8.55733082706767, "grad_norm": 0.09896906465291977, "learning_rate": 2.174032274169746e-06, "loss": 0.0137, "step": 18210 }, { "epoch": 8.56203007518797, "grad_norm": 0.1566537767648697, "learning_rate": 2.149984379426906e-06, "loss": 0.0129, "step": 18220 }, { "epoch": 8.566729323308271, "grad_norm": 0.08905629068613052, "learning_rate": 2.1260673047996227e-06, "loss": 0.0164, "step": 18230 }, { "epoch": 8.571428571428571, "grad_norm": 0.1383201777935028, "learning_rate": 2.102281115676258e-06, "loss": 0.0088, "step": 18240 }, { "epoch": 8.576127819548873, "grad_norm": 0.054648809134960175, "learning_rate": 2.0786258770873647e-06, "loss": 0.0099, "step": 18250 }, { "epoch": 8.580827067669173, "grad_norm": 0.0630398541688919, "learning_rate": 2.0551016537054493e-06, "loss": 0.0077, "step": 18260 }, { "epoch": 8.585526315789474, "grad_norm": 0.12112493067979813, "learning_rate": 2.0317085098448372e-06, "loss": 0.0114, "step": 18270 }, { "epoch": 8.590225563909774, "grad_norm": 0.12041357904672623, "learning_rate": 2.008446509461498e-06, "loss": 0.009, "step": 18280 }, { "epoch": 8.594924812030076, "grad_norm": 0.10477473586797714, "learning_rate": 1.985315716152847e-06, "loss": 0.0115, "step": 18290 }, { "epoch": 8.599624060150376, "grad_norm": 0.05041252076625824, "learning_rate": 1.962316193157593e-06, "loss": 0.0218, "step": 18300 }, { "epoch": 8.604323308270677, "grad_norm": 0.027872784063220024, "learning_rate": 1.939448003355554e-06, "loss": 0.0138, "step": 18310 }, { "epoch": 8.609022556390977, "grad_norm": 0.048155125230550766, "learning_rate": 1.91671120926748e-06, "loss": 0.0119, "step": 18320 }, { "epoch": 8.613721804511279, "grad_norm": 0.10073873400688171, "learning_rate": 1.8941058730549132e-06, "loss": 0.0114, "step": 18330 }, { "epoch": 8.618421052631579, "grad_norm": 0.07115645706653595, "learning_rate": 1.8716320565199618e-06, "loss": 0.0122, "step": 18340 }, { "epoch": 8.62312030075188, "grad_norm": 0.16081885993480682, "learning_rate": 1.849289821105199e-06, "loss": 0.0192, "step": 18350 }, { "epoch": 8.62781954887218, "grad_norm": 0.11983854323625565, "learning_rate": 1.8270792278934302e-06, "loss": 0.0118, "step": 18360 }, { "epoch": 8.632518796992482, "grad_norm": 0.086446613073349, "learning_rate": 1.8050003376075707e-06, "loss": 0.012, "step": 18370 }, { "epoch": 8.637218045112782, "grad_norm": 0.10558143258094788, "learning_rate": 1.7830532106104747e-06, "loss": 0.0092, "step": 18380 }, { "epoch": 8.641917293233083, "grad_norm": 0.07899550348520279, "learning_rate": 1.7612379069047335e-06, "loss": 0.0104, "step": 18390 }, { "epoch": 8.646616541353383, "grad_norm": 0.07186214625835419, "learning_rate": 1.7395544861325718e-06, "loss": 0.0146, "step": 18400 }, { "epoch": 8.651315789473685, "grad_norm": 0.06743978708982468, "learning_rate": 1.7180030075756136e-06, "loss": 0.0087, "step": 18410 }, { "epoch": 8.656015037593985, "grad_norm": 0.06802039593458176, "learning_rate": 1.696583530154794e-06, "loss": 0.012, "step": 18420 }, { "epoch": 8.660714285714286, "grad_norm": 0.13704046607017517, "learning_rate": 1.6752961124301415e-06, "loss": 0.0184, "step": 18430 }, { "epoch": 8.665413533834586, "grad_norm": 0.07604125887155533, "learning_rate": 1.6541408126006463e-06, "loss": 0.01, "step": 18440 }, { "epoch": 8.670112781954888, "grad_norm": 0.09493885189294815, "learning_rate": 1.6331176885040878e-06, "loss": 0.0146, "step": 18450 }, { "epoch": 8.674812030075188, "grad_norm": 0.0435079000890255, "learning_rate": 1.6122267976168781e-06, "loss": 0.0119, "step": 18460 }, { "epoch": 8.67951127819549, "grad_norm": 0.06214023381471634, "learning_rate": 1.5914681970539192e-06, "loss": 0.0127, "step": 18470 }, { "epoch": 8.68421052631579, "grad_norm": 0.11379414051771164, "learning_rate": 1.5708419435684462e-06, "loss": 0.0124, "step": 18480 }, { "epoch": 8.688909774436091, "grad_norm": 0.08199574053287506, "learning_rate": 1.550348093551829e-06, "loss": 0.0173, "step": 18490 }, { "epoch": 8.693609022556391, "grad_norm": 0.04369651526212692, "learning_rate": 1.5299867030334814e-06, "loss": 0.0115, "step": 18500 }, { "epoch": 8.698308270676693, "grad_norm": 0.040135458111763, "learning_rate": 1.5097578276806633e-06, "loss": 0.0123, "step": 18510 }, { "epoch": 8.703007518796992, "grad_norm": 0.06963648647069931, "learning_rate": 1.4896615227983468e-06, "loss": 0.0098, "step": 18520 }, { "epoch": 8.707706766917294, "grad_norm": 0.048562925308942795, "learning_rate": 1.4696978433290653e-06, "loss": 0.0158, "step": 18530 }, { "epoch": 8.712406015037594, "grad_norm": 0.06549891829490662, "learning_rate": 1.4498668438527597e-06, "loss": 0.011, "step": 18540 }, { "epoch": 8.717105263157894, "grad_norm": 0.07452305406332016, "learning_rate": 1.4301685785866214e-06, "loss": 0.0083, "step": 18550 }, { "epoch": 8.721804511278195, "grad_norm": 0.1147482693195343, "learning_rate": 1.4106031013849496e-06, "loss": 0.0128, "step": 18560 }, { "epoch": 8.726503759398497, "grad_norm": 0.14861489832401276, "learning_rate": 1.3911704657390113e-06, "loss": 0.0117, "step": 18570 }, { "epoch": 8.731203007518797, "grad_norm": 0.1281604766845703, "learning_rate": 1.3718707247769135e-06, "loss": 0.0092, "step": 18580 }, { "epoch": 8.735902255639097, "grad_norm": 0.06106063723564148, "learning_rate": 1.3527039312633827e-06, "loss": 0.0061, "step": 18590 }, { "epoch": 8.740601503759398, "grad_norm": 0.1314494013786316, "learning_rate": 1.333670137599713e-06, "loss": 0.019, "step": 18600 }, { "epoch": 8.7453007518797, "grad_norm": 0.12703455984592438, "learning_rate": 1.3147693958235618e-06, "loss": 0.0085, "step": 18610 }, { "epoch": 8.75, "grad_norm": 0.05815136432647705, "learning_rate": 1.2960017576088446e-06, "loss": 0.0151, "step": 18620 }, { "epoch": 8.7546992481203, "grad_norm": 0.08019871264696121, "learning_rate": 1.2773672742655784e-06, "loss": 0.0127, "step": 18630 }, { "epoch": 8.759398496240602, "grad_norm": 0.12885844707489014, "learning_rate": 1.2588659967397e-06, "loss": 0.0094, "step": 18640 }, { "epoch": 8.764097744360903, "grad_norm": 0.0936972126364708, "learning_rate": 1.2404979756130142e-06, "loss": 0.0141, "step": 18650 }, { "epoch": 8.768796992481203, "grad_norm": 0.09744187444448471, "learning_rate": 1.222263261102985e-06, "loss": 0.0156, "step": 18660 }, { "epoch": 8.773496240601503, "grad_norm": 0.14510540664196014, "learning_rate": 1.2041619030626284e-06, "loss": 0.0147, "step": 18670 }, { "epoch": 8.778195488721805, "grad_norm": 0.05950562283396721, "learning_rate": 1.1861939509803687e-06, "loss": 0.0137, "step": 18680 }, { "epoch": 8.782894736842106, "grad_norm": 0.1445922553539276, "learning_rate": 1.1683594539798893e-06, "loss": 0.0108, "step": 18690 }, { "epoch": 8.787593984962406, "grad_norm": 0.05897856876254082, "learning_rate": 1.1506584608200367e-06, "loss": 0.007, "step": 18700 }, { "epoch": 8.792293233082706, "grad_norm": 0.06876976042985916, "learning_rate": 1.1330910198946442e-06, "loss": 0.0106, "step": 18710 }, { "epoch": 8.796992481203008, "grad_norm": 0.1327231377363205, "learning_rate": 1.1156571792324211e-06, "loss": 0.0147, "step": 18720 }, { "epoch": 8.801691729323307, "grad_norm": 0.11447696387767792, "learning_rate": 1.0983569864968346e-06, "loss": 0.0171, "step": 18730 }, { "epoch": 8.806390977443609, "grad_norm": 0.07379591464996338, "learning_rate": 1.0811904889859336e-06, "loss": 0.0151, "step": 18740 }, { "epoch": 8.811090225563909, "grad_norm": 0.11582107096910477, "learning_rate": 1.064157733632276e-06, "loss": 0.0119, "step": 18750 }, { "epoch": 8.81578947368421, "grad_norm": 0.15303994715213776, "learning_rate": 1.0472587670027678e-06, "loss": 0.015, "step": 18760 }, { "epoch": 8.82048872180451, "grad_norm": 0.12110389024019241, "learning_rate": 1.030493635298535e-06, "loss": 0.013, "step": 18770 }, { "epoch": 8.825187969924812, "grad_norm": 0.0908411517739296, "learning_rate": 1.0138623843548078e-06, "loss": 0.0098, "step": 18780 }, { "epoch": 8.829887218045112, "grad_norm": 0.08263817429542542, "learning_rate": 9.97365059640787e-07, "loss": 0.0151, "step": 18790 }, { "epoch": 8.834586466165414, "grad_norm": 0.09736278653144836, "learning_rate": 9.810017062595322e-07, "loss": 0.01, "step": 18800 }, { "epoch": 8.839285714285714, "grad_norm": 0.13054323196411133, "learning_rate": 9.647723689478305e-07, "loss": 0.0113, "step": 18810 }, { "epoch": 8.843984962406015, "grad_norm": 0.09773892909288406, "learning_rate": 9.486770920760668e-07, "loss": 0.0102, "step": 18820 }, { "epoch": 8.848684210526315, "grad_norm": 0.15423932671546936, "learning_rate": 9.327159196481138e-07, "loss": 0.0133, "step": 18830 }, { "epoch": 8.853383458646617, "grad_norm": 0.15199892222881317, "learning_rate": 9.168888953011989e-07, "loss": 0.0133, "step": 18840 }, { "epoch": 8.858082706766917, "grad_norm": 0.0423540361225605, "learning_rate": 9.011960623058202e-07, "loss": 0.0098, "step": 18850 }, { "epoch": 8.862781954887218, "grad_norm": 0.120146244764328, "learning_rate": 8.856374635655695e-07, "loss": 0.0126, "step": 18860 }, { "epoch": 8.867481203007518, "grad_norm": 0.06241992488503456, "learning_rate": 8.702131416170656e-07, "loss": 0.0084, "step": 18870 }, { "epoch": 8.87218045112782, "grad_norm": 0.137865349650383, "learning_rate": 8.549231386298151e-07, "loss": 0.0227, "step": 18880 }, { "epoch": 8.87687969924812, "grad_norm": 0.04242929443717003, "learning_rate": 8.397674964061075e-07, "loss": 0.0134, "step": 18890 }, { "epoch": 8.881578947368421, "grad_norm": 0.05079125985503197, "learning_rate": 8.247462563808817e-07, "loss": 0.0117, "step": 18900 }, { "epoch": 8.886278195488721, "grad_norm": 0.1343405693769455, "learning_rate": 8.098594596216424e-07, "loss": 0.014, "step": 18910 }, { "epoch": 8.890977443609023, "grad_norm": 0.05027283728122711, "learning_rate": 7.951071468283167e-07, "loss": 0.0117, "step": 18920 }, { "epoch": 8.895676691729323, "grad_norm": 0.0534767210483551, "learning_rate": 7.804893583331696e-07, "loss": 0.0134, "step": 18930 }, { "epoch": 8.900375939849624, "grad_norm": 0.15196926891803741, "learning_rate": 7.66006134100672e-07, "loss": 0.0141, "step": 18940 }, { "epoch": 8.905075187969924, "grad_norm": 0.05484990403056145, "learning_rate": 7.516575137274162e-07, "loss": 0.0105, "step": 18950 }, { "epoch": 8.909774436090226, "grad_norm": 0.07123623043298721, "learning_rate": 7.374435364419674e-07, "loss": 0.0112, "step": 18960 }, { "epoch": 8.914473684210526, "grad_norm": 0.1264994591474533, "learning_rate": 7.233642411048014e-07, "loss": 0.0135, "step": 18970 }, { "epoch": 8.919172932330827, "grad_norm": 0.07649128884077072, "learning_rate": 7.094196662081831e-07, "loss": 0.015, "step": 18980 }, { "epoch": 8.923872180451127, "grad_norm": 0.08238343894481659, "learning_rate": 6.956098498760389e-07, "loss": 0.0169, "step": 18990 }, { "epoch": 8.928571428571429, "grad_norm": 0.06182100623846054, "learning_rate": 6.819348298638839e-07, "loss": 0.0082, "step": 19000 }, { "epoch": 8.933270676691729, "grad_norm": 0.07743122428655624, "learning_rate": 6.683946435586952e-07, "loss": 0.0283, "step": 19010 }, { "epoch": 8.93796992481203, "grad_norm": 0.05746229737997055, "learning_rate": 6.549893279788277e-07, "loss": 0.0095, "step": 19020 }, { "epoch": 8.94266917293233, "grad_norm": 0.1159047782421112, "learning_rate": 6.417189197739093e-07, "loss": 0.015, "step": 19030 }, { "epoch": 8.947368421052632, "grad_norm": 0.13121691346168518, "learning_rate": 6.285834552247128e-07, "loss": 0.0136, "step": 19040 }, { "epoch": 8.952067669172932, "grad_norm": 0.10032495856285095, "learning_rate": 6.15582970243117e-07, "loss": 0.0108, "step": 19050 }, { "epoch": 8.956766917293233, "grad_norm": 0.08989301323890686, "learning_rate": 6.027175003719354e-07, "loss": 0.0147, "step": 19060 }, { "epoch": 8.961466165413533, "grad_norm": 0.08607086539268494, "learning_rate": 5.899870807848762e-07, "loss": 0.0162, "step": 19070 }, { "epoch": 8.966165413533835, "grad_norm": 0.07855894416570663, "learning_rate": 5.773917462864264e-07, "loss": 0.0154, "step": 19080 }, { "epoch": 8.970864661654135, "grad_norm": 0.17435581982135773, "learning_rate": 5.64931531311741e-07, "loss": 0.0158, "step": 19090 }, { "epoch": 8.975563909774436, "grad_norm": 0.09805602580308914, "learning_rate": 5.526064699265753e-07, "loss": 0.0158, "step": 19100 }, { "epoch": 8.980263157894736, "grad_norm": 0.0758136659860611, "learning_rate": 5.404165958271811e-07, "loss": 0.0129, "step": 19110 }, { "epoch": 8.984962406015038, "grad_norm": 0.14450609683990479, "learning_rate": 5.283619423401998e-07, "loss": 0.0197, "step": 19120 }, { "epoch": 8.989661654135338, "grad_norm": 0.054334018379449844, "learning_rate": 5.164425424226016e-07, "loss": 0.0079, "step": 19130 }, { "epoch": 8.99436090225564, "grad_norm": 0.1322498619556427, "learning_rate": 5.046584286615697e-07, "loss": 0.0155, "step": 19140 }, { "epoch": 8.99906015037594, "grad_norm": 0.10678986459970474, "learning_rate": 4.930096332744105e-07, "loss": 0.0112, "step": 19150 }, { "epoch": 9.003759398496241, "grad_norm": 0.12043260782957077, "learning_rate": 4.814961881085045e-07, "loss": 0.0139, "step": 19160 }, { "epoch": 9.00845864661654, "grad_norm": 0.14756977558135986, "learning_rate": 4.701181246411501e-07, "loss": 0.0164, "step": 19170 }, { "epoch": 9.013157894736842, "grad_norm": 0.04383913427591324, "learning_rate": 4.5887547397955864e-07, "loss": 0.0075, "step": 19180 }, { "epoch": 9.017857142857142, "grad_norm": 0.23295745253562927, "learning_rate": 4.4776826686069305e-07, "loss": 0.0109, "step": 19190 }, { "epoch": 9.022556390977444, "grad_norm": 0.09766188263893127, "learning_rate": 4.367965336512403e-07, "loss": 0.0096, "step": 19200 }, { "epoch": 9.027255639097744, "grad_norm": 0.2106829732656479, "learning_rate": 4.259603043475002e-07, "loss": 0.0166, "step": 19210 }, { "epoch": 9.031954887218046, "grad_norm": 0.04127572849392891, "learning_rate": 4.1525960857530243e-07, "loss": 0.0162, "step": 19220 }, { "epoch": 9.036654135338345, "grad_norm": 0.07691402733325958, "learning_rate": 4.0469447558995065e-07, "loss": 0.0186, "step": 19230 }, { "epoch": 9.041353383458647, "grad_norm": 0.18556630611419678, "learning_rate": 3.9426493427611177e-07, "loss": 0.0151, "step": 19240 }, { "epoch": 9.046052631578947, "grad_norm": 0.10369502753019333, "learning_rate": 3.839710131477492e-07, "loss": 0.0174, "step": 19250 }, { "epoch": 9.050751879699249, "grad_norm": 0.1650490164756775, "learning_rate": 3.738127403480507e-07, "loss": 0.01, "step": 19260 }, { "epoch": 9.055451127819548, "grad_norm": 0.12048038840293884, "learning_rate": 3.637901436493507e-07, "loss": 0.0108, "step": 19270 }, { "epoch": 9.06015037593985, "grad_norm": 0.049617137759923935, "learning_rate": 3.5390325045304706e-07, "loss": 0.0094, "step": 19280 }, { "epoch": 9.06484962406015, "grad_norm": 0.12197034060955048, "learning_rate": 3.441520877895288e-07, "loss": 0.0161, "step": 19290 }, { "epoch": 9.069548872180452, "grad_norm": 0.05120290815830231, "learning_rate": 3.3453668231809286e-07, "loss": 0.0109, "step": 19300 }, { "epoch": 9.074248120300751, "grad_norm": 0.04005982726812363, "learning_rate": 3.250570603268943e-07, "loss": 0.0133, "step": 19310 }, { "epoch": 9.078947368421053, "grad_norm": 0.10951024293899536, "learning_rate": 3.157132477328628e-07, "loss": 0.0111, "step": 19320 }, { "epoch": 9.083646616541353, "grad_norm": 0.12475921213626862, "learning_rate": 3.0650527008162513e-07, "loss": 0.0113, "step": 19330 }, { "epoch": 9.088345864661655, "grad_norm": 0.0481477752327919, "learning_rate": 2.9743315254743833e-07, "loss": 0.0079, "step": 19340 }, { "epoch": 9.093045112781954, "grad_norm": 0.12329145520925522, "learning_rate": 2.8849691993311777e-07, "loss": 0.0146, "step": 19350 }, { "epoch": 9.097744360902256, "grad_norm": 0.08423899859189987, "learning_rate": 2.796965966699927e-07, "loss": 0.0156, "step": 19360 }, { "epoch": 9.102443609022556, "grad_norm": 0.11121213436126709, "learning_rate": 2.7103220681780615e-07, "loss": 0.0111, "step": 19370 }, { "epoch": 9.107142857142858, "grad_norm": 0.10711877793073654, "learning_rate": 2.625037740646763e-07, "loss": 0.0107, "step": 19380 }, { "epoch": 9.111842105263158, "grad_norm": 0.05875176563858986, "learning_rate": 2.5411132172700194e-07, "loss": 0.0114, "step": 19390 }, { "epoch": 9.11654135338346, "grad_norm": 0.06813649088144302, "learning_rate": 2.458548727494292e-07, "loss": 0.009, "step": 19400 }, { "epoch": 9.121240601503759, "grad_norm": 0.047655753791332245, "learning_rate": 2.3773444970477955e-07, "loss": 0.0128, "step": 19410 }, { "epoch": 9.12593984962406, "grad_norm": 0.11299572139978409, "learning_rate": 2.2975007479397738e-07, "loss": 0.0134, "step": 19420 }, { "epoch": 9.13063909774436, "grad_norm": 0.08376818895339966, "learning_rate": 2.219017698460002e-07, "loss": 0.0177, "step": 19430 }, { "epoch": 9.135338345864662, "grad_norm": 0.15395602583885193, "learning_rate": 2.1418955631781202e-07, "loss": 0.0103, "step": 19440 }, { "epoch": 9.140037593984962, "grad_norm": 0.09228936582803726, "learning_rate": 2.0661345529430775e-07, "loss": 0.0129, "step": 19450 }, { "epoch": 9.144736842105264, "grad_norm": 0.1360449194908142, "learning_rate": 1.9917348748826335e-07, "loss": 0.0128, "step": 19460 }, { "epoch": 9.149436090225564, "grad_norm": 0.10789410024881363, "learning_rate": 1.918696732402636e-07, "loss": 0.0124, "step": 19470 }, { "epoch": 9.154135338345865, "grad_norm": 0.12555211782455444, "learning_rate": 1.847020325186577e-07, "loss": 0.0123, "step": 19480 }, { "epoch": 9.158834586466165, "grad_norm": 0.11357904225587845, "learning_rate": 1.776705849195037e-07, "loss": 0.0086, "step": 19490 }, { "epoch": 9.163533834586467, "grad_norm": 0.09295064955949783, "learning_rate": 1.7077534966650766e-07, "loss": 0.0147, "step": 19500 }, { "epoch": 9.168233082706767, "grad_norm": 0.1021043211221695, "learning_rate": 1.6401634561098444e-07, "loss": 0.0081, "step": 19510 }, { "epoch": 9.172932330827068, "grad_norm": 0.07909571379423141, "learning_rate": 1.5739359123178587e-07, "loss": 0.011, "step": 19520 }, { "epoch": 9.177631578947368, "grad_norm": 0.047228168696165085, "learning_rate": 1.5090710463527836e-07, "loss": 0.0141, "step": 19530 }, { "epoch": 9.18233082706767, "grad_norm": 0.08932602405548096, "learning_rate": 1.4455690355525964e-07, "loss": 0.01, "step": 19540 }, { "epoch": 9.18703007518797, "grad_norm": 0.1202157661318779, "learning_rate": 1.383430053529422e-07, "loss": 0.0112, "step": 19550 }, { "epoch": 9.191729323308271, "grad_norm": 0.027310887351632118, "learning_rate": 1.3226542701689215e-07, "loss": 0.0142, "step": 19560 }, { "epoch": 9.196428571428571, "grad_norm": 0.08331390470266342, "learning_rate": 1.2632418516296262e-07, "loss": 0.0088, "step": 19570 }, { "epoch": 9.201127819548873, "grad_norm": 0.12556742131710052, "learning_rate": 1.2051929603428825e-07, "loss": 0.0204, "step": 19580 }, { "epoch": 9.205827067669173, "grad_norm": 0.05147033929824829, "learning_rate": 1.1485077550122402e-07, "loss": 0.0075, "step": 19590 }, { "epoch": 9.210526315789474, "grad_norm": 0.061899200081825256, "learning_rate": 1.0931863906127327e-07, "loss": 0.0089, "step": 19600 }, { "epoch": 9.215225563909774, "grad_norm": 0.05099288746714592, "learning_rate": 1.0392290183909304e-07, "loss": 0.0115, "step": 19610 }, { "epoch": 9.219924812030076, "grad_norm": 0.04262029007077217, "learning_rate": 9.866357858642205e-08, "loss": 0.0097, "step": 19620 }, { "epoch": 9.224624060150376, "grad_norm": 0.045728232711553574, "learning_rate": 9.354068368204739e-08, "loss": 0.012, "step": 19630 }, { "epoch": 9.229323308270677, "grad_norm": 0.09205039590597153, "learning_rate": 8.855423113177664e-08, "loss": 0.0112, "step": 19640 }, { "epoch": 9.234022556390977, "grad_norm": 0.048479776829481125, "learning_rate": 8.37042345683714e-08, "loss": 0.0205, "step": 19650 }, { "epoch": 9.238721804511279, "grad_norm": 0.046253811568021774, "learning_rate": 7.899070725153613e-08, "loss": 0.0117, "step": 19660 }, { "epoch": 9.243421052631579, "grad_norm": 0.10509341955184937, "learning_rate": 7.44136620678848e-08, "loss": 0.0113, "step": 19670 }, { "epoch": 9.24812030075188, "grad_norm": 0.09187602251768112, "learning_rate": 6.997311153086883e-08, "loss": 0.0172, "step": 19680 }, { "epoch": 9.25281954887218, "grad_norm": 0.04167873412370682, "learning_rate": 6.566906778079917e-08, "loss": 0.0109, "step": 19690 }, { "epoch": 9.257518796992482, "grad_norm": 0.1210450828075409, "learning_rate": 6.150154258476315e-08, "loss": 0.012, "step": 19700 }, { "epoch": 9.262218045112782, "grad_norm": 0.03811037912964821, "learning_rate": 5.747054733660773e-08, "loss": 0.0223, "step": 19710 }, { "epoch": 9.266917293233083, "grad_norm": 0.10711174458265305, "learning_rate": 5.3576093056922906e-08, "loss": 0.0092, "step": 19720 }, { "epoch": 9.271616541353383, "grad_norm": 0.07966139167547226, "learning_rate": 4.981819039300284e-08, "loss": 0.011, "step": 19730 }, { "epoch": 9.276315789473685, "grad_norm": 0.11096978932619095, "learning_rate": 4.619684961881254e-08, "loss": 0.0075, "step": 19740 }, { "epoch": 9.281015037593985, "grad_norm": 0.08339618891477585, "learning_rate": 4.2712080634949024e-08, "loss": 0.0186, "step": 19750 }, { "epoch": 9.285714285714286, "grad_norm": 0.11398918181657791, "learning_rate": 3.936389296864129e-08, "loss": 0.0088, "step": 19760 }, { "epoch": 9.290413533834586, "grad_norm": 0.04430542141199112, "learning_rate": 3.615229577371149e-08, "loss": 0.0149, "step": 19770 }, { "epoch": 9.295112781954888, "grad_norm": 0.06718280166387558, "learning_rate": 3.3077297830541584e-08, "loss": 0.0172, "step": 19780 }, { "epoch": 9.299812030075188, "grad_norm": 0.11311540752649307, "learning_rate": 3.01389075460512e-08, "loss": 0.0123, "step": 19790 }, { "epoch": 9.30451127819549, "grad_norm": 0.06545548141002655, "learning_rate": 2.7337132953697554e-08, "loss": 0.0166, "step": 19800 }, { "epoch": 9.30921052631579, "grad_norm": 0.07823529839515686, "learning_rate": 2.467198171342e-08, "loss": 0.0132, "step": 19810 }, { "epoch": 9.313909774436091, "grad_norm": 0.06555328518152237, "learning_rate": 2.214346111164556e-08, "loss": 0.0115, "step": 19820 }, { "epoch": 9.318609022556391, "grad_norm": 0.04978393763303757, "learning_rate": 1.9751578061244504e-08, "loss": 0.0252, "step": 19830 }, { "epoch": 9.323308270676693, "grad_norm": 0.04655987396836281, "learning_rate": 1.749633910153592e-08, "loss": 0.0098, "step": 19840 }, { "epoch": 9.328007518796992, "grad_norm": 0.04883876070380211, "learning_rate": 1.5377750398265502e-08, "loss": 0.009, "step": 19850 }, { "epoch": 9.332706766917294, "grad_norm": 0.04160952940583229, "learning_rate": 1.3395817743561134e-08, "loss": 0.0088, "step": 19860 }, { "epoch": 9.337406015037594, "grad_norm": 0.14190863072872162, "learning_rate": 1.1550546555960662e-08, "loss": 0.0189, "step": 19870 }, { "epoch": 9.342105263157896, "grad_norm": 0.1261986643075943, "learning_rate": 9.841941880361916e-09, "loss": 0.0123, "step": 19880 }, { "epoch": 9.346804511278195, "grad_norm": 0.13026973605155945, "learning_rate": 8.270008388022721e-09, "loss": 0.019, "step": 19890 }, { "epoch": 9.351503759398497, "grad_norm": 0.04853995516896248, "learning_rate": 6.834750376549792e-09, "loss": 0.0128, "step": 19900 }, { "epoch": 9.356203007518797, "grad_norm": 0.06438510119915009, "learning_rate": 5.536171769887632e-09, "loss": 0.0175, "step": 19910 }, { "epoch": 9.360902255639097, "grad_norm": 0.0485045500099659, "learning_rate": 4.3742761183018784e-09, "loss": 0.0112, "step": 19920 }, { "epoch": 9.365601503759398, "grad_norm": 0.0634605661034584, "learning_rate": 3.349066598362649e-09, "loss": 0.0086, "step": 19930 }, { "epoch": 9.3703007518797, "grad_norm": 0.05341292545199394, "learning_rate": 2.4605460129556445e-09, "loss": 0.0113, "step": 19940 }, { "epoch": 9.375, "grad_norm": 0.10371940582990646, "learning_rate": 1.7087167912710478e-09, "loss": 0.0153, "step": 19950 }, { "epoch": 9.3796992481203, "grad_norm": 0.04511750862002373, "learning_rate": 1.0935809887702154e-09, "loss": 0.0112, "step": 19960 }, { "epoch": 9.384398496240602, "grad_norm": 0.0870666578412056, "learning_rate": 6.151402872134337e-10, "loss": 0.0169, "step": 19970 }, { "epoch": 9.389097744360903, "grad_norm": 0.2327416092157364, "learning_rate": 2.7339599464326627e-10, "loss": 0.0134, "step": 19980 }, { "epoch": 9.393796992481203, "grad_norm": 0.06986179947853088, "learning_rate": 6.834904537900144e-11, "loss": 0.0133, "step": 19990 }, { "epoch": 9.398496240601503, "grad_norm": 0.14632254838943481, "learning_rate": 0.0, "loss": 0.0165, "step": 20000 } ], "logging_steps": 10, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.892274366138223e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }