{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0282, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 12.427441596984863, "learning_rate": 1.8e-07, "loss": 0.4661, "step": 10 }, { "epoch": 0.004, "grad_norm": 10.644728660583496, "learning_rate": 3.8e-07, "loss": 0.43, "step": 20 }, { "epoch": 0.006, "grad_norm": 2.582233428955078, "learning_rate": 5.800000000000001e-07, "loss": 0.2937, "step": 30 }, { "epoch": 0.008, "grad_norm": 1.6104474067687988, "learning_rate": 7.8e-07, "loss": 0.2311, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.3486788272857666, "learning_rate": 9.800000000000001e-07, "loss": 0.1944, "step": 50 }, { "epoch": 0.012, "grad_norm": 2.3484883308410645, "learning_rate": 1.1800000000000001e-06, "loss": 0.178, "step": 60 }, { "epoch": 0.014, "grad_norm": 2.1255300045013428, "learning_rate": 1.3800000000000001e-06, "loss": 0.1663, "step": 70 }, { "epoch": 0.016, "grad_norm": 1.2671189308166504, "learning_rate": 1.5800000000000001e-06, "loss": 0.1521, "step": 80 }, { "epoch": 0.018, "grad_norm": 2.0237274169921875, "learning_rate": 1.7800000000000001e-06, "loss": 0.1515, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.4099950790405273, "learning_rate": 1.98e-06, "loss": 0.1481, "step": 100 }, { "epoch": 0.022, "grad_norm": 2.23960280418396, "learning_rate": 2.1800000000000003e-06, "loss": 0.1487, "step": 110 }, { "epoch": 0.024, "grad_norm": 1.7865636348724365, "learning_rate": 2.38e-06, "loss": 0.1359, "step": 120 }, { "epoch": 0.026, "grad_norm": 1.590512990951538, "learning_rate": 2.5800000000000003e-06, "loss": 0.1487, "step": 130 }, { "epoch": 0.028, "grad_norm": 3.781755208969116, "learning_rate": 2.7800000000000005e-06, "loss": 0.1381, "step": 140 }, { "epoch": 0.03, "grad_norm": 1.4385772943496704, "learning_rate": 2.9800000000000003e-06, "loss": 0.1384, "step": 150 }, { "epoch": 0.032, "grad_norm": 1.2209824323654175, "learning_rate": 3.1800000000000005e-06, "loss": 0.1407, "step": 160 }, { "epoch": 0.034, "grad_norm": 1.463581919670105, "learning_rate": 3.3800000000000007e-06, "loss": 0.1392, "step": 170 }, { "epoch": 0.036, "grad_norm": 0.9795161485671997, "learning_rate": 3.58e-06, "loss": 0.1335, "step": 180 }, { "epoch": 0.038, "grad_norm": 1.5629451274871826, "learning_rate": 3.7800000000000002e-06, "loss": 0.1355, "step": 190 }, { "epoch": 0.04, "grad_norm": 1.2003155946731567, "learning_rate": 3.980000000000001e-06, "loss": 0.136, "step": 200 }, { "epoch": 0.042, "grad_norm": 1.3819934129714966, "learning_rate": 4.18e-06, "loss": 0.1359, "step": 210 }, { "epoch": 0.044, "grad_norm": 1.797484278678894, "learning_rate": 4.38e-06, "loss": 0.1318, "step": 220 }, { "epoch": 0.046, "grad_norm": 1.15238618850708, "learning_rate": 4.58e-06, "loss": 0.1303, "step": 230 }, { "epoch": 0.048, "grad_norm": 0.8387578129768372, "learning_rate": 4.78e-06, "loss": 0.1359, "step": 240 }, { "epoch": 0.05, "grad_norm": 0.8264948129653931, "learning_rate": 4.980000000000001e-06, "loss": 0.1316, "step": 250 }, { "epoch": 0.052, "grad_norm": 0.9605500102043152, "learning_rate": 5.18e-06, "loss": 0.1287, "step": 260 }, { "epoch": 0.054, "grad_norm": 0.9171805381774902, "learning_rate": 5.380000000000001e-06, "loss": 0.135, "step": 270 }, { "epoch": 0.056, "grad_norm": 0.7537838220596313, "learning_rate": 5.580000000000001e-06, "loss": 0.1317, "step": 280 }, { "epoch": 0.058, "grad_norm": 0.9007892608642578, "learning_rate": 5.78e-06, "loss": 0.1295, "step": 290 }, { "epoch": 0.06, "grad_norm": 0.8143233060836792, "learning_rate": 5.98e-06, "loss": 0.1337, "step": 300 }, { "epoch": 0.062, "grad_norm": 0.7211500406265259, "learning_rate": 6.18e-06, "loss": 0.1228, "step": 310 }, { "epoch": 0.064, "grad_norm": 0.8526885509490967, "learning_rate": 6.380000000000001e-06, "loss": 0.1331, "step": 320 }, { "epoch": 0.066, "grad_norm": 1.7408725023269653, "learning_rate": 6.5800000000000005e-06, "loss": 0.127, "step": 330 }, { "epoch": 0.068, "grad_norm": 1.5572606325149536, "learning_rate": 6.780000000000001e-06, "loss": 0.1177, "step": 340 }, { "epoch": 0.07, "grad_norm": 2.8059284687042236, "learning_rate": 6.98e-06, "loss": 0.1267, "step": 350 }, { "epoch": 0.072, "grad_norm": 0.755484938621521, "learning_rate": 7.180000000000001e-06, "loss": 0.1278, "step": 360 }, { "epoch": 0.074, "grad_norm": 0.9175857901573181, "learning_rate": 7.3800000000000005e-06, "loss": 0.121, "step": 370 }, { "epoch": 0.076, "grad_norm": 0.8510663509368896, "learning_rate": 7.58e-06, "loss": 0.115, "step": 380 }, { "epoch": 0.078, "grad_norm": 0.9119841456413269, "learning_rate": 7.78e-06, "loss": 0.1218, "step": 390 }, { "epoch": 0.08, "grad_norm": 0.6241216063499451, "learning_rate": 7.980000000000002e-06, "loss": 0.1238, "step": 400 }, { "epoch": 0.082, "grad_norm": 0.8066564202308655, "learning_rate": 8.18e-06, "loss": 0.1205, "step": 410 }, { "epoch": 0.084, "grad_norm": 1.104693055152893, "learning_rate": 8.380000000000001e-06, "loss": 0.1225, "step": 420 }, { "epoch": 0.086, "grad_norm": 2.131037712097168, "learning_rate": 8.580000000000001e-06, "loss": 0.1321, "step": 430 }, { "epoch": 0.088, "grad_norm": 0.8443548679351807, "learning_rate": 8.78e-06, "loss": 0.1278, "step": 440 }, { "epoch": 0.09, "grad_norm": 0.8610478043556213, "learning_rate": 8.98e-06, "loss": 0.1169, "step": 450 }, { "epoch": 0.092, "grad_norm": 0.9445344805717468, "learning_rate": 9.180000000000002e-06, "loss": 0.1293, "step": 460 }, { "epoch": 0.094, "grad_norm": 0.8306660056114197, "learning_rate": 9.38e-06, "loss": 0.1213, "step": 470 }, { "epoch": 0.096, "grad_norm": 1.2315057516098022, "learning_rate": 9.58e-06, "loss": 0.1173, "step": 480 }, { "epoch": 0.098, "grad_norm": 0.7147797346115112, "learning_rate": 9.780000000000001e-06, "loss": 0.1138, "step": 490 }, { "epoch": 0.1, "grad_norm": 0.8065736293792725, "learning_rate": 9.980000000000001e-06, "loss": 0.1225, "step": 500 }, { "epoch": 0.102, "grad_norm": 0.6982239484786987, "learning_rate": 9.999901304280686e-06, "loss": 0.1225, "step": 510 }, { "epoch": 0.104, "grad_norm": 0.906414806842804, "learning_rate": 9.999560138895238e-06, "loss": 0.1225, "step": 520 }, { "epoch": 0.106, "grad_norm": 0.6281483769416809, "learning_rate": 9.99897530200195e-06, "loss": 0.1183, "step": 530 }, { "epoch": 0.108, "grad_norm": 0.626783549785614, "learning_rate": 9.998146822104943e-06, "loss": 0.124, "step": 540 }, { "epoch": 0.11, "grad_norm": 0.9497509598731995, "learning_rate": 9.997074739583162e-06, "loss": 0.1193, "step": 550 }, { "epoch": 0.112, "grad_norm": 0.7951166033744812, "learning_rate": 9.995759106688394e-06, "loss": 0.1151, "step": 560 }, { "epoch": 0.114, "grad_norm": 0.722017228603363, "learning_rate": 9.99419998754273e-06, "loss": 0.1205, "step": 570 }, { "epoch": 0.116, "grad_norm": 0.7518423795700073, "learning_rate": 9.992397458135438e-06, "loss": 0.1231, "step": 580 }, { "epoch": 0.118, "grad_norm": 3.67732834815979, "learning_rate": 9.990351606319261e-06, "loss": 0.116, "step": 590 }, { "epoch": 0.12, "grad_norm": 0.7335011959075928, "learning_rate": 9.988062531806127e-06, "loss": 0.1138, "step": 600 }, { "epoch": 0.122, "grad_norm": 0.6158178448677063, "learning_rate": 9.9855303461623e-06, "loss": 0.1116, "step": 610 }, { "epoch": 0.124, "grad_norm": 1.0562201738357544, "learning_rate": 9.982755172802933e-06, "loss": 0.1122, "step": 620 }, { "epoch": 0.126, "grad_norm": 0.7680226564407349, "learning_rate": 9.979737146986064e-06, "loss": 0.1108, "step": 630 }, { "epoch": 0.128, "grad_norm": 0.935513973236084, "learning_rate": 9.976476415806013e-06, "loss": 0.1189, "step": 640 }, { "epoch": 0.13, "grad_norm": 0.7690663933753967, "learning_rate": 9.972973138186217e-06, "loss": 0.1201, "step": 650 }, { "epoch": 0.132, "grad_norm": 0.8854688405990601, "learning_rate": 9.969227484871485e-06, "loss": 0.1144, "step": 660 }, { "epoch": 0.134, "grad_norm": 1.5783987045288086, "learning_rate": 9.965239638419673e-06, "loss": 0.1173, "step": 670 }, { "epoch": 0.136, "grad_norm": 0.7775712013244629, "learning_rate": 9.961009793192793e-06, "loss": 0.1199, "step": 680 }, { "epoch": 0.138, "grad_norm": 0.8870111107826233, "learning_rate": 9.956538155347534e-06, "loss": 0.1221, "step": 690 }, { "epoch": 0.14, "grad_norm": 0.9003025889396667, "learning_rate": 9.951824942825215e-06, "loss": 0.1144, "step": 700 }, { "epoch": 0.142, "grad_norm": 0.8907985687255859, "learning_rate": 9.946870385341167e-06, "loss": 0.111, "step": 710 }, { "epoch": 0.144, "grad_norm": 0.7512484192848206, "learning_rate": 9.94167472437353e-06, "loss": 0.1233, "step": 720 }, { "epoch": 0.146, "grad_norm": 1.176719069480896, "learning_rate": 9.936238213151491e-06, "loss": 0.1077, "step": 730 }, { "epoch": 0.148, "grad_norm": 0.5437051057815552, "learning_rate": 9.930561116642936e-06, "loss": 0.1159, "step": 740 }, { "epoch": 0.15, "grad_norm": 1.4601088762283325, "learning_rate": 9.92464371154154e-06, "loss": 0.1121, "step": 750 }, { "epoch": 0.152, "grad_norm": 0.6581608057022095, "learning_rate": 9.918486286253279e-06, "loss": 0.1155, "step": 760 }, { "epoch": 0.154, "grad_norm": 0.7549941539764404, "learning_rate": 9.912089140882377e-06, "loss": 0.1138, "step": 770 }, { "epoch": 0.156, "grad_norm": 0.8127408623695374, "learning_rate": 9.90545258721667e-06, "loss": 0.113, "step": 780 }, { "epoch": 0.158, "grad_norm": 0.6606084108352661, "learning_rate": 9.898576948712427e-06, "loss": 0.1171, "step": 790 }, { "epoch": 0.16, "grad_norm": 0.90671306848526, "learning_rate": 9.891462560478562e-06, "loss": 0.1096, "step": 800 }, { "epoch": 0.162, "grad_norm": 0.6799761652946472, "learning_rate": 9.884109769260326e-06, "loss": 0.1175, "step": 810 }, { "epoch": 0.164, "grad_norm": 0.6381652355194092, "learning_rate": 9.876518933422385e-06, "loss": 0.1148, "step": 820 }, { "epoch": 0.166, "grad_norm": 0.5381753444671631, "learning_rate": 9.868690422931372e-06, "loss": 0.121, "step": 830 }, { "epoch": 0.168, "grad_norm": 0.7047041654586792, "learning_rate": 9.860624619337844e-06, "loss": 0.1128, "step": 840 }, { "epoch": 0.17, "grad_norm": 0.9245678186416626, "learning_rate": 9.852321915757688e-06, "loss": 0.1172, "step": 850 }, { "epoch": 0.172, "grad_norm": 1.1964024305343628, "learning_rate": 9.843782716852963e-06, "loss": 0.1156, "step": 860 }, { "epoch": 0.174, "grad_norm": 0.6748765110969543, "learning_rate": 9.835007438812177e-06, "loss": 0.116, "step": 870 }, { "epoch": 0.176, "grad_norm": 0.9408971071243286, "learning_rate": 9.825996509330001e-06, "loss": 0.1134, "step": 880 }, { "epoch": 0.178, "grad_norm": 0.5978791117668152, "learning_rate": 9.816750367586424e-06, "loss": 0.1148, "step": 890 }, { "epoch": 0.18, "grad_norm": 0.8566645383834839, "learning_rate": 9.807269464225355e-06, "loss": 0.1117, "step": 900 }, { "epoch": 0.182, "grad_norm": 0.7490561604499817, "learning_rate": 9.797554261332644e-06, "loss": 0.1116, "step": 910 }, { "epoch": 0.184, "grad_norm": 0.5367133617401123, "learning_rate": 9.787605232413575e-06, "loss": 0.1132, "step": 920 }, { "epoch": 0.186, "grad_norm": 0.7432158589363098, "learning_rate": 9.777422862369782e-06, "loss": 0.1251, "step": 930 }, { "epoch": 0.188, "grad_norm": 0.7126122117042542, "learning_rate": 9.767007647475618e-06, "loss": 0.1066, "step": 940 }, { "epoch": 0.19, "grad_norm": 1.2850806713104248, "learning_rate": 9.756360095353957e-06, "loss": 0.1122, "step": 950 }, { "epoch": 0.192, "grad_norm": 0.4591011106967926, "learning_rate": 9.745480724951473e-06, "loss": 0.1101, "step": 960 }, { "epoch": 0.194, "grad_norm": 0.6034103035926819, "learning_rate": 9.73437006651333e-06, "loss": 0.1145, "step": 970 }, { "epoch": 0.196, "grad_norm": 0.6527671813964844, "learning_rate": 9.723028661557345e-06, "loss": 0.1074, "step": 980 }, { "epoch": 0.198, "grad_norm": 0.49524030089378357, "learning_rate": 9.711457062847596e-06, "loss": 0.1101, "step": 990 }, { "epoch": 0.2, "grad_norm": 0.7419334053993225, "learning_rate": 9.699655834367479e-06, "loss": 0.1133, "step": 1000 }, { "epoch": 0.202, "grad_norm": 0.508783221244812, "learning_rate": 9.687625551292219e-06, "loss": 0.1156, "step": 1010 }, { "epoch": 0.204, "grad_norm": 0.6890400648117065, "learning_rate": 9.675366799960842e-06, "loss": 0.1095, "step": 1020 }, { "epoch": 0.206, "grad_norm": 0.574763834476471, "learning_rate": 9.662880177847595e-06, "loss": 0.114, "step": 1030 }, { "epoch": 0.208, "grad_norm": 0.5926764607429504, "learning_rate": 9.650166293532822e-06, "loss": 0.1087, "step": 1040 }, { "epoch": 0.21, "grad_norm": 0.6142486333847046, "learning_rate": 9.637225766673309e-06, "loss": 0.1061, "step": 1050 }, { "epoch": 0.212, "grad_norm": 0.5107919573783875, "learning_rate": 9.624059227972077e-06, "loss": 0.1154, "step": 1060 }, { "epoch": 0.214, "grad_norm": 0.7692158818244934, "learning_rate": 9.610667319147648e-06, "loss": 0.1128, "step": 1070 }, { "epoch": 0.216, "grad_norm": 0.5525968074798584, "learning_rate": 9.597050692902765e-06, "loss": 0.1096, "step": 1080 }, { "epoch": 0.218, "grad_norm": 0.6440755724906921, "learning_rate": 9.583210012892582e-06, "loss": 0.1157, "step": 1090 }, { "epoch": 0.22, "grad_norm": 0.5787026882171631, "learning_rate": 9.569145953692316e-06, "loss": 0.1042, "step": 1100 }, { "epoch": 0.222, "grad_norm": 0.8566828966140747, "learning_rate": 9.554859200764371e-06, "loss": 0.109, "step": 1110 }, { "epoch": 0.224, "grad_norm": 0.5807632207870483, "learning_rate": 9.540350450424927e-06, "loss": 0.1094, "step": 1120 }, { "epoch": 0.226, "grad_norm": 0.5819889307022095, "learning_rate": 9.525620409810009e-06, "loss": 0.1065, "step": 1130 }, { "epoch": 0.228, "grad_norm": 0.46392130851745605, "learning_rate": 9.510669796841014e-06, "loss": 0.1058, "step": 1140 }, { "epoch": 0.23, "grad_norm": 1.063761830329895, "learning_rate": 9.495499340189729e-06, "loss": 0.1096, "step": 1150 }, { "epoch": 0.232, "grad_norm": 0.7276637554168701, "learning_rate": 9.480109779242805e-06, "loss": 0.118, "step": 1160 }, { "epoch": 0.234, "grad_norm": 0.5671316385269165, "learning_rate": 9.464501864065735e-06, "loss": 0.1077, "step": 1170 }, { "epoch": 0.236, "grad_norm": 0.7350747585296631, "learning_rate": 9.448676355366282e-06, "loss": 0.1105, "step": 1180 }, { "epoch": 0.238, "grad_norm": 0.5442182421684265, "learning_rate": 9.432634024457414e-06, "loss": 0.1058, "step": 1190 }, { "epoch": 0.24, "grad_norm": 0.9185284376144409, "learning_rate": 9.41637565321971e-06, "loss": 0.1026, "step": 1200 }, { "epoch": 0.242, "grad_norm": 0.7028173208236694, "learning_rate": 9.399902034063244e-06, "loss": 0.108, "step": 1210 }, { "epoch": 0.244, "grad_norm": 0.5107137560844421, "learning_rate": 9.383213969888972e-06, "loss": 0.1148, "step": 1220 }, { "epoch": 0.246, "grad_norm": 0.5467950701713562, "learning_rate": 9.366312274049602e-06, "loss": 0.1007, "step": 1230 }, { "epoch": 0.248, "grad_norm": 0.8054739832878113, "learning_rate": 9.349197770309942e-06, "loss": 0.1057, "step": 1240 }, { "epoch": 0.25, "grad_norm": 0.5686827301979065, "learning_rate": 9.33187129280676e-06, "loss": 0.1071, "step": 1250 }, { "epoch": 0.252, "grad_norm": 0.6124984622001648, "learning_rate": 9.314333686008125e-06, "loss": 0.1095, "step": 1260 }, { "epoch": 0.254, "grad_norm": 1.458778977394104, "learning_rate": 9.296585804672253e-06, "loss": 0.1072, "step": 1270 }, { "epoch": 0.256, "grad_norm": 0.6592323780059814, "learning_rate": 9.278628513805838e-06, "loss": 0.1009, "step": 1280 }, { "epoch": 0.258, "grad_norm": 0.5499700903892517, "learning_rate": 9.260462688621906e-06, "loss": 0.109, "step": 1290 }, { "epoch": 0.26, "grad_norm": 0.5235973596572876, "learning_rate": 9.242089214497146e-06, "loss": 0.1044, "step": 1300 }, { "epoch": 0.262, "grad_norm": 0.5331101417541504, "learning_rate": 9.223508986928766e-06, "loss": 0.103, "step": 1310 }, { "epoch": 0.264, "grad_norm": 0.4890216290950775, "learning_rate": 9.204722911490847e-06, "loss": 0.1059, "step": 1320 }, { "epoch": 0.266, "grad_norm": 0.7136434316635132, "learning_rate": 9.1857319037902e-06, "loss": 0.1063, "step": 1330 }, { "epoch": 0.268, "grad_norm": 0.5715893507003784, "learning_rate": 9.16653688942175e-06, "loss": 0.1147, "step": 1340 }, { "epoch": 0.27, "grad_norm": 0.6043694615364075, "learning_rate": 9.147138803923417e-06, "loss": 0.11, "step": 1350 }, { "epoch": 0.272, "grad_norm": 0.720506489276886, "learning_rate": 9.12753859273052e-06, "loss": 0.1042, "step": 1360 }, { "epoch": 0.274, "grad_norm": 0.8681615591049194, "learning_rate": 9.107737211129702e-06, "loss": 0.1041, "step": 1370 }, { "epoch": 0.276, "grad_norm": 0.49558717012405396, "learning_rate": 9.087735624212365e-06, "loss": 0.1109, "step": 1380 }, { "epoch": 0.278, "grad_norm": 0.5701916217803955, "learning_rate": 9.06753480682764e-06, "loss": 0.1118, "step": 1390 }, { "epoch": 0.28, "grad_norm": 0.503829836845398, "learning_rate": 9.047135743534866e-06, "loss": 0.1086, "step": 1400 }, { "epoch": 0.282, "grad_norm": 0.3906455636024475, "learning_rate": 9.026539428555609e-06, "loss": 0.1082, "step": 1410 }, { "epoch": 0.284, "grad_norm": 0.6386545300483704, "learning_rate": 9.005746865725206e-06, "loss": 0.1061, "step": 1420 }, { "epoch": 0.286, "grad_norm": 0.5736138224601746, "learning_rate": 8.984759068443832e-06, "loss": 0.1117, "step": 1430 }, { "epoch": 0.288, "grad_norm": 0.8654852509498596, "learning_rate": 8.963577059627117e-06, "loss": 0.1075, "step": 1440 }, { "epoch": 0.29, "grad_norm": 0.6457758545875549, "learning_rate": 8.942201871656292e-06, "loss": 0.1098, "step": 1450 }, { "epoch": 0.292, "grad_norm": 0.4607081115245819, "learning_rate": 8.920634546327857e-06, "loss": 0.1033, "step": 1460 }, { "epoch": 0.294, "grad_norm": 0.4616289734840393, "learning_rate": 8.898876134802827e-06, "loss": 0.1073, "step": 1470 }, { "epoch": 0.296, "grad_norm": 0.5038948655128479, "learning_rate": 8.87692769755548e-06, "loss": 0.1086, "step": 1480 }, { "epoch": 0.298, "grad_norm": 0.5729373097419739, "learning_rate": 8.854790304321682e-06, "loss": 0.1061, "step": 1490 }, { "epoch": 0.3, "grad_norm": 0.48975491523742676, "learning_rate": 8.83246503404675e-06, "loss": 0.1099, "step": 1500 }, { "epoch": 0.302, "grad_norm": 0.5203743577003479, "learning_rate": 8.80995297483286e-06, "loss": 0.1075, "step": 1510 }, { "epoch": 0.304, "grad_norm": 0.4938233196735382, "learning_rate": 8.78725522388602e-06, "loss": 0.105, "step": 1520 }, { "epoch": 0.306, "grad_norm": 0.5147393941879272, "learning_rate": 8.764372887462587e-06, "loss": 0.109, "step": 1530 }, { "epoch": 0.308, "grad_norm": 0.6048959493637085, "learning_rate": 8.741307080815357e-06, "loss": 0.1051, "step": 1540 }, { "epoch": 0.31, "grad_norm": 0.4202023446559906, "learning_rate": 8.718058928139205e-06, "loss": 0.1071, "step": 1550 }, { "epoch": 0.312, "grad_norm": 0.7164034247398376, "learning_rate": 8.694629562516295e-06, "loss": 0.1111, "step": 1560 }, { "epoch": 0.314, "grad_norm": 0.5305653214454651, "learning_rate": 8.671020125860851e-06, "loss": 0.0985, "step": 1570 }, { "epoch": 0.316, "grad_norm": 0.48124703764915466, "learning_rate": 8.647231768863513e-06, "loss": 0.1051, "step": 1580 }, { "epoch": 0.318, "grad_norm": 0.586521327495575, "learning_rate": 8.623265650935233e-06, "loss": 0.1056, "step": 1590 }, { "epoch": 0.32, "grad_norm": 0.7912370562553406, "learning_rate": 8.599122940150795e-06, "loss": 0.1042, "step": 1600 }, { "epoch": 0.322, "grad_norm": 0.5484232306480408, "learning_rate": 8.574804813191859e-06, "loss": 0.1008, "step": 1610 }, { "epoch": 0.324, "grad_norm": 0.4200885593891144, "learning_rate": 8.550312455289624e-06, "loss": 0.1058, "step": 1620 }, { "epoch": 0.326, "grad_norm": 0.5569013357162476, "learning_rate": 8.525647060167063e-06, "loss": 0.1087, "step": 1630 }, { "epoch": 0.328, "grad_norm": 0.42352986335754395, "learning_rate": 8.500809829980734e-06, "loss": 0.1003, "step": 1640 }, { "epoch": 0.33, "grad_norm": 0.5038197636604309, "learning_rate": 8.4758019752622e-06, "loss": 0.098, "step": 1650 }, { "epoch": 0.332, "grad_norm": 0.5340690612792969, "learning_rate": 8.450624714859016e-06, "loss": 0.107, "step": 1660 }, { "epoch": 0.334, "grad_norm": 0.5682323575019836, "learning_rate": 8.425279275875336e-06, "loss": 0.1034, "step": 1670 }, { "epoch": 0.336, "grad_norm": 0.5063655376434326, "learning_rate": 8.399766893612096e-06, "loss": 0.1005, "step": 1680 }, { "epoch": 0.338, "grad_norm": 0.6555968523025513, "learning_rate": 8.374088811506819e-06, "loss": 0.1074, "step": 1690 }, { "epoch": 0.34, "grad_norm": 0.4122048318386078, "learning_rate": 8.348246281072998e-06, "loss": 0.1035, "step": 1700 }, { "epoch": 0.342, "grad_norm": 0.6033665537834167, "learning_rate": 8.32224056183911e-06, "loss": 0.1004, "step": 1710 }, { "epoch": 0.344, "grad_norm": 0.6186772584915161, "learning_rate": 8.296072921287217e-06, "loss": 0.1059, "step": 1720 }, { "epoch": 0.346, "grad_norm": 0.5782063603401184, "learning_rate": 8.269744634791207e-06, "loss": 0.1026, "step": 1730 }, { "epoch": 0.348, "grad_norm": 0.5529699921607971, "learning_rate": 8.243256985554622e-06, "loss": 0.1052, "step": 1740 }, { "epoch": 0.35, "grad_norm": 0.43343502283096313, "learning_rate": 8.21661126454811e-06, "loss": 0.105, "step": 1750 }, { "epoch": 0.352, "grad_norm": 0.5857189297676086, "learning_rate": 8.189808770446528e-06, "loss": 0.1049, "step": 1760 }, { "epoch": 0.354, "grad_norm": 0.6525639891624451, "learning_rate": 8.162850809565623e-06, "loss": 0.0974, "step": 1770 }, { "epoch": 0.356, "grad_norm": 2.085735321044922, "learning_rate": 8.135738695798377e-06, "loss": 0.1067, "step": 1780 }, { "epoch": 0.358, "grad_norm": 0.5422115325927734, "learning_rate": 8.108473750550965e-06, "loss": 0.1086, "step": 1790 }, { "epoch": 0.36, "grad_norm": 0.4821435213088989, "learning_rate": 8.081057302678352e-06, "loss": 0.0956, "step": 1800 }, { "epoch": 0.362, "grad_norm": 0.6426023244857788, "learning_rate": 8.053490688419532e-06, "loss": 0.1072, "step": 1810 }, { "epoch": 0.364, "grad_norm": 0.5745382905006409, "learning_rate": 8.02577525133239e-06, "loss": 0.1077, "step": 1820 }, { "epoch": 0.366, "grad_norm": 0.5834154486656189, "learning_rate": 7.997912342228232e-06, "loss": 0.0991, "step": 1830 }, { "epoch": 0.368, "grad_norm": 0.4103539288043976, "learning_rate": 7.969903319105935e-06, "loss": 0.1022, "step": 1840 }, { "epoch": 0.37, "grad_norm": 0.7636476755142212, "learning_rate": 7.941749547085778e-06, "loss": 0.1028, "step": 1850 }, { "epoch": 0.372, "grad_norm": 0.4373137354850769, "learning_rate": 7.913452398342882e-06, "loss": 0.099, "step": 1860 }, { "epoch": 0.374, "grad_norm": 0.5366427302360535, "learning_rate": 7.88501325204036e-06, "loss": 0.1059, "step": 1870 }, { "epoch": 0.376, "grad_norm": 0.49244028329849243, "learning_rate": 7.856433494262078e-06, "loss": 0.1055, "step": 1880 }, { "epoch": 0.378, "grad_norm": 0.7082386016845703, "learning_rate": 7.827714517945116e-06, "loss": 0.1042, "step": 1890 }, { "epoch": 0.38, "grad_norm": 0.6199413537979126, "learning_rate": 7.798857722811857e-06, "loss": 0.1072, "step": 1900 }, { "epoch": 0.382, "grad_norm": 0.5586550831794739, "learning_rate": 7.769864515301787e-06, "loss": 0.1119, "step": 1910 }, { "epoch": 0.384, "grad_norm": 0.5374526977539062, "learning_rate": 7.740736308502939e-06, "loss": 0.1074, "step": 1920 }, { "epoch": 0.386, "grad_norm": 0.6747333407402039, "learning_rate": 7.711474522083015e-06, "loss": 0.1052, "step": 1930 }, { "epoch": 0.388, "grad_norm": 0.8605504631996155, "learning_rate": 7.682080582220206e-06, "loss": 0.1048, "step": 1940 }, { "epoch": 0.39, "grad_norm": 0.8798643350601196, "learning_rate": 7.652555921533671e-06, "loss": 0.0994, "step": 1950 }, { "epoch": 0.392, "grad_norm": 0.39183422923088074, "learning_rate": 7.622901979013717e-06, "loss": 0.1023, "step": 1960 }, { "epoch": 0.394, "grad_norm": 0.5172967314720154, "learning_rate": 7.5931201999516715e-06, "loss": 0.1029, "step": 1970 }, { "epoch": 0.396, "grad_norm": 0.4358147084712982, "learning_rate": 7.563212035869426e-06, "loss": 0.1112, "step": 1980 }, { "epoch": 0.398, "grad_norm": 0.9168479442596436, "learning_rate": 7.533178944448705e-06, "loss": 0.1026, "step": 1990 }, { "epoch": 0.4, "grad_norm": 1.4735708236694336, "learning_rate": 7.503022389460014e-06, "loss": 0.1075, "step": 2000 }, { "epoch": 0.402, "grad_norm": 2.389822244644165, "learning_rate": 7.4727438406912986e-06, "loss": 0.1052, "step": 2010 }, { "epoch": 0.404, "grad_norm": 0.5340117812156677, "learning_rate": 7.44234477387631e-06, "loss": 0.1036, "step": 2020 }, { "epoch": 0.406, "grad_norm": 0.4848586916923523, "learning_rate": 7.411826670622676e-06, "loss": 0.0993, "step": 2030 }, { "epoch": 0.408, "grad_norm": 0.5334826111793518, "learning_rate": 7.381191018339697e-06, "loss": 0.1037, "step": 2040 }, { "epoch": 0.41, "grad_norm": 0.6843870282173157, "learning_rate": 7.350439310165842e-06, "loss": 0.0961, "step": 2050 }, { "epoch": 0.412, "grad_norm": 0.4742767810821533, "learning_rate": 7.319573044895986e-06, "loss": 0.103, "step": 2060 }, { "epoch": 0.414, "grad_norm": 0.7538304924964905, "learning_rate": 7.288593726908351e-06, "loss": 0.1068, "step": 2070 }, { "epoch": 0.416, "grad_norm": 0.4505075216293335, "learning_rate": 7.257502866091192e-06, "loss": 0.0987, "step": 2080 }, { "epoch": 0.418, "grad_norm": 0.4845210015773773, "learning_rate": 7.226301977769199e-06, "loss": 0.1019, "step": 2090 }, { "epoch": 0.42, "grad_norm": 0.7197276949882507, "learning_rate": 7.194992582629654e-06, "loss": 0.1009, "step": 2100 }, { "epoch": 0.422, "grad_norm": 0.8190245032310486, "learning_rate": 7.1635762066483035e-06, "loss": 0.1066, "step": 2110 }, { "epoch": 0.424, "grad_norm": 0.4387541711330414, "learning_rate": 7.1320543810149945e-06, "loss": 0.0943, "step": 2120 }, { "epoch": 0.426, "grad_norm": 0.5509855151176453, "learning_rate": 7.100428642059033e-06, "loss": 0.104, "step": 2130 }, { "epoch": 0.428, "grad_norm": 0.5081747174263, "learning_rate": 7.0687005311743195e-06, "loss": 0.1016, "step": 2140 }, { "epoch": 0.43, "grad_norm": 0.5941420793533325, "learning_rate": 7.036871594744218e-06, "loss": 0.1094, "step": 2150 }, { "epoch": 0.432, "grad_norm": 0.47250431776046753, "learning_rate": 7.0049433840661875e-06, "loss": 0.1022, "step": 2160 }, { "epoch": 0.434, "grad_norm": 0.6311111450195312, "learning_rate": 6.97291745527617e-06, "loss": 0.1014, "step": 2170 }, { "epoch": 0.436, "grad_norm": 1.6460589170455933, "learning_rate": 6.940795369272754e-06, "loss": 0.1093, "step": 2180 }, { "epoch": 0.438, "grad_norm": 0.5381661653518677, "learning_rate": 6.908578691641092e-06, "loss": 0.0997, "step": 2190 }, { "epoch": 0.44, "grad_norm": 0.5386653542518616, "learning_rate": 6.876268992576605e-06, "loss": 0.0996, "step": 2200 }, { "epoch": 0.442, "grad_norm": 0.5878283381462097, "learning_rate": 6.843867846808438e-06, "loss": 0.1011, "step": 2210 }, { "epoch": 0.444, "grad_norm": 1.202819585800171, "learning_rate": 6.811376833522729e-06, "loss": 0.1015, "step": 2220 }, { "epoch": 0.446, "grad_norm": 0.6832873225212097, "learning_rate": 6.778797536285625e-06, "loss": 0.1007, "step": 2230 }, { "epoch": 0.448, "grad_norm": 0.7227466106414795, "learning_rate": 6.746131542966112e-06, "loss": 0.1036, "step": 2240 }, { "epoch": 0.45, "grad_norm": 0.462429016828537, "learning_rate": 6.713380445658618e-06, "loss": 0.1012, "step": 2250 }, { "epoch": 0.452, "grad_norm": 0.5341704487800598, "learning_rate": 6.680545840605423e-06, "loss": 0.103, "step": 2260 }, { "epoch": 0.454, "grad_norm": 0.5896704196929932, "learning_rate": 6.647629328118852e-06, "loss": 0.1045, "step": 2270 }, { "epoch": 0.456, "grad_norm": 4.748597145080566, "learning_rate": 6.614632512503289e-06, "loss": 0.0989, "step": 2280 }, { "epoch": 0.458, "grad_norm": 0.5261045098304749, "learning_rate": 6.58155700197697e-06, "loss": 0.1017, "step": 2290 }, { "epoch": 0.46, "grad_norm": 0.5400019884109497, "learning_rate": 6.548404408593622e-06, "loss": 0.1028, "step": 2300 }, { "epoch": 0.462, "grad_norm": 0.5167948603630066, "learning_rate": 6.5151763481638705e-06, "loss": 0.101, "step": 2310 }, { "epoch": 0.464, "grad_norm": 0.5450933575630188, "learning_rate": 6.481874440176506e-06, "loss": 0.1069, "step": 2320 }, { "epoch": 0.466, "grad_norm": 0.8595630526542664, "learning_rate": 6.448500307719537e-06, "loss": 0.1004, "step": 2330 }, { "epoch": 0.468, "grad_norm": 0.47659191489219666, "learning_rate": 6.415055577401101e-06, "loss": 0.0994, "step": 2340 }, { "epoch": 0.47, "grad_norm": 0.49259841442108154, "learning_rate": 6.3815418792701686e-06, "loss": 0.0985, "step": 2350 }, { "epoch": 0.472, "grad_norm": 0.5162966251373291, "learning_rate": 6.3479608467371055e-06, "loss": 0.1004, "step": 2360 }, { "epoch": 0.474, "grad_norm": 0.43801945447921753, "learning_rate": 6.314314116494061e-06, "loss": 0.1009, "step": 2370 }, { "epoch": 0.476, "grad_norm": 0.5107311010360718, "learning_rate": 6.280603328435199e-06, "loss": 0.0944, "step": 2380 }, { "epoch": 0.478, "grad_norm": 0.5112555027008057, "learning_rate": 6.24683012557677e-06, "loss": 0.1034, "step": 2390 }, { "epoch": 0.48, "grad_norm": 0.40618887543678284, "learning_rate": 6.212996153977038e-06, "loss": 0.1062, "step": 2400 }, { "epoch": 0.482, "grad_norm": 0.6284693479537964, "learning_rate": 6.179103062656042e-06, "loss": 0.0955, "step": 2410 }, { "epoch": 0.484, "grad_norm": 0.42741313576698303, "learning_rate": 6.145152503515239e-06, "loss": 0.1031, "step": 2420 }, { "epoch": 0.486, "grad_norm": 0.5457072854042053, "learning_rate": 6.111146131256983e-06, "loss": 0.1109, "step": 2430 }, { "epoch": 0.488, "grad_norm": 0.4269944727420807, "learning_rate": 6.077085603303883e-06, "loss": 0.098, "step": 2440 }, { "epoch": 0.49, "grad_norm": 0.5703105926513672, "learning_rate": 6.04297257971802e-06, "loss": 0.0988, "step": 2450 }, { "epoch": 0.492, "grad_norm": 0.3747485280036926, "learning_rate": 6.008808723120035e-06, "loss": 0.1019, "step": 2460 }, { "epoch": 0.494, "grad_norm": 0.4911485016345978, "learning_rate": 5.974595698608103e-06, "loss": 0.1007, "step": 2470 }, { "epoch": 0.496, "grad_norm": 0.5864652991294861, "learning_rate": 5.94033517367677e-06, "loss": 0.0988, "step": 2480 }, { "epoch": 0.498, "grad_norm": 0.5941429138183594, "learning_rate": 5.906028818135687e-06, "loss": 0.1036, "step": 2490 }, { "epoch": 0.5, "grad_norm": 0.6516585946083069, "learning_rate": 5.871678304028224e-06, "loss": 0.0895, "step": 2500 }, { "epoch": 0.502, "grad_norm": 0.4426342844963074, "learning_rate": 5.837285305549978e-06, "loss": 0.0968, "step": 2510 }, { "epoch": 0.504, "grad_norm": 0.4874410033226013, "learning_rate": 5.802851498967173e-06, "loss": 0.1018, "step": 2520 }, { "epoch": 0.506, "grad_norm": 0.5964179039001465, "learning_rate": 5.768378562534962e-06, "loss": 0.1037, "step": 2530 }, { "epoch": 0.508, "grad_norm": 0.43735530972480774, "learning_rate": 5.733868176415633e-06, "loss": 0.0993, "step": 2540 }, { "epoch": 0.51, "grad_norm": 0.6398305296897888, "learning_rate": 5.6993220225967214e-06, "loss": 0.0943, "step": 2550 }, { "epoch": 0.512, "grad_norm": 0.5659822225570679, "learning_rate": 5.6647417848090225e-06, "loss": 0.0991, "step": 2560 }, { "epoch": 0.514, "grad_norm": 0.6617985963821411, "learning_rate": 5.630129148444543e-06, "loss": 0.0957, "step": 2570 }, { "epoch": 0.516, "grad_norm": 0.3739423453807831, "learning_rate": 5.59548580047435e-06, "loss": 0.0951, "step": 2580 }, { "epoch": 0.518, "grad_norm": 0.9471829533576965, "learning_rate": 5.560813429366345e-06, "loss": 0.0965, "step": 2590 }, { "epoch": 0.52, "grad_norm": 0.7911657094955444, "learning_rate": 5.526113725002984e-06, "loss": 0.1031, "step": 2600 }, { "epoch": 0.522, "grad_norm": 0.6069086194038391, "learning_rate": 5.491388378598899e-06, "loss": 0.0976, "step": 2610 }, { "epoch": 0.524, "grad_norm": 0.47589465975761414, "learning_rate": 5.456639082618489e-06, "loss": 0.0918, "step": 2620 }, { "epoch": 0.526, "grad_norm": 0.4524126946926117, "learning_rate": 5.4218675306934145e-06, "loss": 0.0979, "step": 2630 }, { "epoch": 0.528, "grad_norm": 0.6384430527687073, "learning_rate": 5.3870754175400595e-06, "loss": 0.0987, "step": 2640 }, { "epoch": 0.53, "grad_norm": 0.40055084228515625, "learning_rate": 5.352264438876935e-06, "loss": 0.097, "step": 2650 }, { "epoch": 0.532, "grad_norm": 0.5200124382972717, "learning_rate": 5.317436291342031e-06, "loss": 0.0973, "step": 2660 }, { "epoch": 0.534, "grad_norm": 0.6301565766334534, "learning_rate": 5.282592672410124e-06, "loss": 0.105, "step": 2670 }, { "epoch": 0.536, "grad_norm": 0.48175889253616333, "learning_rate": 5.247735280310041e-06, "loss": 0.0941, "step": 2680 }, { "epoch": 0.538, "grad_norm": 0.3721906542778015, "learning_rate": 5.212865813941899e-06, "loss": 0.1017, "step": 2690 }, { "epoch": 0.54, "grad_norm": 0.38474878668785095, "learning_rate": 5.177985972794293e-06, "loss": 0.0914, "step": 2700 }, { "epoch": 0.542, "grad_norm": 0.5831676721572876, "learning_rate": 5.143097456861474e-06, "loss": 0.0959, "step": 2710 }, { "epoch": 0.544, "grad_norm": 0.5077164173126221, "learning_rate": 5.1082019665604895e-06, "loss": 0.0977, "step": 2720 }, { "epoch": 0.546, "grad_norm": 0.5812519788742065, "learning_rate": 5.073301202648304e-06, "loss": 0.0874, "step": 2730 }, { "epoch": 0.548, "grad_norm": 2.192858934402466, "learning_rate": 5.038396866138915e-06, "loss": 0.1039, "step": 2740 }, { "epoch": 0.55, "grad_norm": 0.8018079996109009, "learning_rate": 5.003490658220438e-06, "loss": 0.0999, "step": 2750 }, { "epoch": 0.552, "grad_norm": 0.4818994104862213, "learning_rate": 4.968584280172206e-06, "loss": 0.0895, "step": 2760 }, { "epoch": 0.554, "grad_norm": 0.7325239181518555, "learning_rate": 4.933679433281837e-06, "loss": 0.1034, "step": 2770 }, { "epoch": 0.556, "grad_norm": 0.6191554665565491, "learning_rate": 4.898777818762325e-06, "loss": 0.0995, "step": 2780 }, { "epoch": 0.558, "grad_norm": 0.775681734085083, "learning_rate": 4.863881137669123e-06, "loss": 0.0952, "step": 2790 }, { "epoch": 0.56, "grad_norm": 1.4037740230560303, "learning_rate": 4.828991090817238e-06, "loss": 0.0952, "step": 2800 }, { "epoch": 0.562, "grad_norm": 0.43431025743484497, "learning_rate": 4.794109378698327e-06, "loss": 0.1076, "step": 2810 }, { "epoch": 0.564, "grad_norm": 0.4915035665035248, "learning_rate": 4.759237701397831e-06, "loss": 0.1032, "step": 2820 }, { "epoch": 0.566, "grad_norm": 0.4426998794078827, "learning_rate": 4.7243777585121034e-06, "loss": 0.0979, "step": 2830 }, { "epoch": 0.568, "grad_norm": 0.44436115026474, "learning_rate": 4.689531249065581e-06, "loss": 0.0937, "step": 2840 }, { "epoch": 0.57, "grad_norm": 0.9476667642593384, "learning_rate": 4.654699871427972e-06, "loss": 0.1045, "step": 2850 }, { "epoch": 0.572, "grad_norm": 0.6479189395904541, "learning_rate": 4.619885323231484e-06, "loss": 0.1003, "step": 2860 }, { "epoch": 0.574, "grad_norm": 4.131319522857666, "learning_rate": 4.5850893012880806e-06, "loss": 0.1047, "step": 2870 }, { "epoch": 0.576, "grad_norm": 0.4022497832775116, "learning_rate": 4.5503135015067815e-06, "loss": 0.1002, "step": 2880 }, { "epoch": 0.578, "grad_norm": 0.6141453981399536, "learning_rate": 4.5155596188110055e-06, "loss": 0.0973, "step": 2890 }, { "epoch": 0.58, "grad_norm": 0.520523190498352, "learning_rate": 4.4808293470559645e-06, "loss": 0.1045, "step": 2900 }, { "epoch": 0.582, "grad_norm": 0.41964802145957947, "learning_rate": 4.446124378946108e-06, "loss": 0.1089, "step": 2910 }, { "epoch": 0.584, "grad_norm": 1.8227766752243042, "learning_rate": 4.4114464059526185e-06, "loss": 0.0872, "step": 2920 }, { "epoch": 0.586, "grad_norm": 0.427450031042099, "learning_rate": 4.376797118230978e-06, "loss": 0.0966, "step": 2930 }, { "epoch": 0.588, "grad_norm": 0.5866195559501648, "learning_rate": 4.342178204538588e-06, "loss": 0.1014, "step": 2940 }, { "epoch": 0.59, "grad_norm": 0.7429317235946655, "learning_rate": 4.307591352152459e-06, "loss": 0.1011, "step": 2950 }, { "epoch": 0.592, "grad_norm": 0.578780472278595, "learning_rate": 4.273038246786986e-06, "loss": 0.1034, "step": 2960 }, { "epoch": 0.594, "grad_norm": 0.45780378580093384, "learning_rate": 4.238520572511773e-06, "loss": 0.0988, "step": 2970 }, { "epoch": 0.596, "grad_norm": 0.4824381470680237, "learning_rate": 4.204040011669567e-06, "loss": 0.1011, "step": 2980 }, { "epoch": 0.598, "grad_norm": 0.6699690222740173, "learning_rate": 4.169598244794261e-06, "loss": 0.0958, "step": 2990 }, { "epoch": 0.6, "grad_norm": 0.6796635389328003, "learning_rate": 4.135196950528982e-06, "loss": 0.1013, "step": 3000 }, { "epoch": 0.602, "grad_norm": 0.4701448380947113, "learning_rate": 4.100837805544279e-06, "loss": 0.0899, "step": 3010 }, { "epoch": 0.604, "grad_norm": 0.4026479125022888, "learning_rate": 4.066522484456406e-06, "loss": 0.0996, "step": 3020 }, { "epoch": 0.606, "grad_norm": 0.6788782477378845, "learning_rate": 4.032252659745699e-06, "loss": 0.0988, "step": 3030 }, { "epoch": 0.608, "grad_norm": 0.45941832661628723, "learning_rate": 3.9980300016750696e-06, "loss": 0.1006, "step": 3040 }, { "epoch": 0.61, "grad_norm": 0.7753358483314514, "learning_rate": 3.963856178208588e-06, "loss": 0.0943, "step": 3050 }, { "epoch": 0.612, "grad_norm": 0.5039435625076294, "learning_rate": 3.9297328549302e-06, "loss": 0.0987, "step": 3060 }, { "epoch": 0.614, "grad_norm": 0.6418735384941101, "learning_rate": 3.895661694962542e-06, "loss": 0.0983, "step": 3070 }, { "epoch": 0.616, "grad_norm": 1.4340955018997192, "learning_rate": 3.86164435888588e-06, "loss": 0.1017, "step": 3080 }, { "epoch": 0.618, "grad_norm": 0.6281638145446777, "learning_rate": 3.827682504657187e-06, "loss": 0.1016, "step": 3090 }, { "epoch": 0.62, "grad_norm": 0.8099610805511475, "learning_rate": 3.793777787529325e-06, "loss": 0.1028, "step": 3100 }, { "epoch": 0.622, "grad_norm": 0.46766799688339233, "learning_rate": 3.759931859970374e-06, "loss": 0.0992, "step": 3110 }, { "epoch": 0.624, "grad_norm": 0.6429394483566284, "learning_rate": 3.7261463715830902e-06, "loss": 0.0993, "step": 3120 }, { "epoch": 0.626, "grad_norm": 0.5106363892555237, "learning_rate": 3.6924229690245163e-06, "loss": 0.0984, "step": 3130 }, { "epoch": 0.628, "grad_norm": 0.3694756031036377, "learning_rate": 3.6587632959257168e-06, "loss": 0.0919, "step": 3140 }, { "epoch": 0.63, "grad_norm": 0.38708898425102234, "learning_rate": 3.625168992811671e-06, "loss": 0.0992, "step": 3150 }, { "epoch": 0.632, "grad_norm": 0.4585551619529724, "learning_rate": 3.5916416970213173e-06, "loss": 0.0947, "step": 3160 }, { "epoch": 0.634, "grad_norm": 0.4839175045490265, "learning_rate": 3.5581830426277554e-06, "loss": 0.1006, "step": 3170 }, { "epoch": 0.636, "grad_norm": 0.6512119770050049, "learning_rate": 3.524794660358593e-06, "loss": 0.0975, "step": 3180 }, { "epoch": 0.638, "grad_norm": 0.6606001853942871, "learning_rate": 3.491478177516484e-06, "loss": 0.0965, "step": 3190 }, { "epoch": 0.64, "grad_norm": 0.5618588924407959, "learning_rate": 3.4582352178997937e-06, "loss": 0.0972, "step": 3200 }, { "epoch": 0.642, "grad_norm": 0.9269917607307434, "learning_rate": 3.4250674017234774e-06, "loss": 0.1047, "step": 3210 }, { "epoch": 0.644, "grad_norm": 0.4144897162914276, "learning_rate": 3.3919763455401016e-06, "loss": 0.1007, "step": 3220 }, { "epoch": 0.646, "grad_norm": 0.541193425655365, "learning_rate": 3.358963662161062e-06, "loss": 0.0927, "step": 3230 }, { "epoch": 0.648, "grad_norm": 0.7467935085296631, "learning_rate": 3.3260309605779717e-06, "loss": 0.0959, "step": 3240 }, { "epoch": 0.65, "grad_norm": 0.5478144288063049, "learning_rate": 3.293179845884245e-06, "loss": 0.102, "step": 3250 }, { "epoch": 0.652, "grad_norm": 0.4977323114871979, "learning_rate": 3.260411919196866e-06, "loss": 0.0961, "step": 3260 }, { "epoch": 0.654, "grad_norm": 0.5541924834251404, "learning_rate": 3.227728777578353e-06, "loss": 0.1031, "step": 3270 }, { "epoch": 0.656, "grad_norm": 0.570885419845581, "learning_rate": 3.195132013958918e-06, "loss": 0.0984, "step": 3280 }, { "epoch": 0.658, "grad_norm": 1.277539849281311, "learning_rate": 3.1626232170588343e-06, "loss": 0.0952, "step": 3290 }, { "epoch": 0.66, "grad_norm": 0.4125811457633972, "learning_rate": 3.130203971310999e-06, "loss": 0.0996, "step": 3300 }, { "epoch": 0.662, "grad_norm": 0.5309910774230957, "learning_rate": 3.097875856783713e-06, "loss": 0.0954, "step": 3310 }, { "epoch": 0.664, "grad_norm": 0.553604006767273, "learning_rate": 3.0656404491036696e-06, "loss": 0.1029, "step": 3320 }, { "epoch": 0.666, "grad_norm": 0.740545928478241, "learning_rate": 3.033499319379163e-06, "loss": 0.0964, "step": 3330 }, { "epoch": 0.668, "grad_norm": 0.37706542015075684, "learning_rate": 3.001454034123512e-06, "loss": 0.104, "step": 3340 }, { "epoch": 0.67, "grad_norm": 0.4644536077976227, "learning_rate": 2.969506155178711e-06, "loss": 0.0963, "step": 3350 }, { "epoch": 0.672, "grad_norm": 0.4521162509918213, "learning_rate": 2.9376572396393047e-06, "loss": 0.0977, "step": 3360 }, { "epoch": 0.674, "grad_norm": 0.5465760827064514, "learning_rate": 2.905908839776509e-06, "loss": 0.097, "step": 3370 }, { "epoch": 0.676, "grad_norm": 0.6061195135116577, "learning_rate": 2.874262502962537e-06, "loss": 0.0937, "step": 3380 }, { "epoch": 0.678, "grad_norm": 1.0016474723815918, "learning_rate": 2.8427197715952047e-06, "loss": 0.1016, "step": 3390 }, { "epoch": 0.68, "grad_norm": 0.5500060319900513, "learning_rate": 2.811282183022736e-06, "loss": 0.0996, "step": 3400 }, { "epoch": 0.682, "grad_norm": 0.5355184078216553, "learning_rate": 2.779951269468847e-06, "loss": 0.1028, "step": 3410 }, { "epoch": 0.684, "grad_norm": 0.5095385909080505, "learning_rate": 2.7487285579580635e-06, "loss": 0.095, "step": 3420 }, { "epoch": 0.686, "grad_norm": 0.6504735350608826, "learning_rate": 2.717615570241294e-06, "loss": 0.0971, "step": 3430 }, { "epoch": 0.688, "grad_norm": 0.6753683686256409, "learning_rate": 2.686613822721666e-06, "loss": 0.0995, "step": 3440 }, { "epoch": 0.69, "grad_norm": 0.6328535676002502, "learning_rate": 2.6557248263806175e-06, "loss": 0.1017, "step": 3450 }, { "epoch": 0.692, "grad_norm": 0.6124202013015747, "learning_rate": 2.6249500867042523e-06, "loss": 0.1084, "step": 3460 }, { "epoch": 0.694, "grad_norm": 0.510665774345398, "learning_rate": 2.5942911036099657e-06, "loss": 0.1038, "step": 3470 }, { "epoch": 0.696, "grad_norm": 0.5020245909690857, "learning_rate": 2.5637493713733376e-06, "loss": 0.096, "step": 3480 }, { "epoch": 0.698, "grad_norm": 0.6111953854560852, "learning_rate": 2.533326378555314e-06, "loss": 0.0977, "step": 3490 }, { "epoch": 0.7, "grad_norm": 0.49325230717658997, "learning_rate": 2.5030236079296443e-06, "loss": 0.0959, "step": 3500 }, { "epoch": 0.702, "grad_norm": 0.4303823709487915, "learning_rate": 2.4728425364106136e-06, "loss": 0.0924, "step": 3510 }, { "epoch": 0.704, "grad_norm": 0.5887752771377563, "learning_rate": 2.442784634981071e-06, "loss": 0.0964, "step": 3520 }, { "epoch": 0.706, "grad_norm": 0.7012388706207275, "learning_rate": 2.412851368620726e-06, "loss": 0.0958, "step": 3530 }, { "epoch": 0.708, "grad_norm": 0.4575226902961731, "learning_rate": 2.3830441962347528e-06, "loss": 0.1011, "step": 3540 }, { "epoch": 0.71, "grad_norm": 1.7122695446014404, "learning_rate": 2.353364570582681e-06, "loss": 0.0958, "step": 3550 }, { "epoch": 0.712, "grad_norm": 0.5158506631851196, "learning_rate": 2.323813938207593e-06, "loss": 0.094, "step": 3560 }, { "epoch": 0.714, "grad_norm": 1.2165895700454712, "learning_rate": 2.294393739365621e-06, "loss": 0.1015, "step": 3570 }, { "epoch": 0.716, "grad_norm": 0.4191378355026245, "learning_rate": 2.265105407955752e-06, "loss": 0.1023, "step": 3580 }, { "epoch": 0.718, "grad_norm": 0.5682792663574219, "learning_rate": 2.235950371449938e-06, "loss": 0.0987, "step": 3590 }, { "epoch": 0.72, "grad_norm": 0.4805779755115509, "learning_rate": 2.2069300508235273e-06, "loss": 0.0991, "step": 3600 }, { "epoch": 0.722, "grad_norm": 0.5979336500167847, "learning_rate": 2.1780458604860056e-06, "loss": 0.095, "step": 3610 }, { "epoch": 0.724, "grad_norm": 0.48464110493659973, "learning_rate": 2.14929920821206e-06, "loss": 0.1018, "step": 3620 }, { "epoch": 0.726, "grad_norm": 0.8631208539009094, "learning_rate": 2.1206914950729673e-06, "loss": 0.0965, "step": 3630 }, { "epoch": 0.728, "grad_norm": 0.5129687786102295, "learning_rate": 2.0922241153683064e-06, "loss": 0.1012, "step": 3640 }, { "epoch": 0.73, "grad_norm": 0.4799841642379761, "learning_rate": 2.063898456558002e-06, "loss": 0.0954, "step": 3650 }, { "epoch": 0.732, "grad_norm": 0.698362410068512, "learning_rate": 2.035715899194704e-06, "loss": 0.0922, "step": 3660 }, { "epoch": 0.734, "grad_norm": 0.5779376029968262, "learning_rate": 2.007677816856498e-06, "loss": 0.0908, "step": 3670 }, { "epoch": 0.736, "grad_norm": 0.37104320526123047, "learning_rate": 1.979785576079961e-06, "loss": 0.0907, "step": 3680 }, { "epoch": 0.738, "grad_norm": 0.9988909363746643, "learning_rate": 1.95204053629356e-06, "loss": 0.0991, "step": 3690 }, { "epoch": 0.74, "grad_norm": 0.6738276481628418, "learning_rate": 1.9244440497513895e-06, "loss": 0.0883, "step": 3700 }, { "epoch": 0.742, "grad_norm": 1.0084031820297241, "learning_rate": 1.896997461467272e-06, "loss": 0.0963, "step": 3710 }, { "epoch": 0.744, "grad_norm": 0.47921252250671387, "learning_rate": 1.8697021091491991e-06, "loss": 0.0951, "step": 3720 }, { "epoch": 0.746, "grad_norm": 0.697775661945343, "learning_rate": 1.842559323134136e-06, "loss": 0.0946, "step": 3730 }, { "epoch": 0.748, "grad_norm": 0.74412602186203, "learning_rate": 1.8155704263231777e-06, "loss": 0.0889, "step": 3740 }, { "epoch": 0.75, "grad_norm": 0.5915809273719788, "learning_rate": 1.7887367341170781e-06, "loss": 0.097, "step": 3750 }, { "epoch": 0.752, "grad_norm": 0.8403988480567932, "learning_rate": 1.762059554352143e-06, "loss": 0.1004, "step": 3760 }, { "epoch": 0.754, "grad_norm": 0.763522207736969, "learning_rate": 1.7355401872364759e-06, "loss": 0.0987, "step": 3770 }, { "epoch": 0.756, "grad_norm": 0.5236890316009521, "learning_rate": 1.709179925286617e-06, "loss": 0.0993, "step": 3780 }, { "epoch": 0.758, "grad_norm": 1.012764811515808, "learning_rate": 1.6829800532645447e-06, "loss": 0.1035, "step": 3790 }, { "epoch": 0.76, "grad_norm": 0.4698300361633301, "learning_rate": 1.6569418481150596e-06, "loss": 0.0944, "step": 3800 }, { "epoch": 0.762, "grad_norm": 0.545480489730835, "learning_rate": 1.6310665789035468e-06, "loss": 0.0961, "step": 3810 }, { "epoch": 0.764, "grad_norm": 1.2716432809829712, "learning_rate": 1.605355506754121e-06, "loss": 0.0964, "step": 3820 }, { "epoch": 0.766, "grad_norm": 0.6809777021408081, "learning_rate": 1.5798098847881664e-06, "loss": 0.0901, "step": 3830 }, { "epoch": 0.768, "grad_norm": 0.45785894989967346, "learning_rate": 1.554430958063259e-06, "loss": 0.0931, "step": 3840 }, { "epoch": 0.77, "grad_norm": 0.5503713488578796, "learning_rate": 1.529219963512481e-06, "loss": 0.0963, "step": 3850 }, { "epoch": 0.772, "grad_norm": 0.4882219731807709, "learning_rate": 1.5041781298841424e-06, "loss": 0.105, "step": 3860 }, { "epoch": 0.774, "grad_norm": 1.1814230680465698, "learning_rate": 1.4793066776818843e-06, "loss": 0.098, "step": 3870 }, { "epoch": 0.776, "grad_norm": 0.51359623670578, "learning_rate": 1.4546068191051988e-06, "loss": 0.0985, "step": 3880 }, { "epoch": 0.778, "grad_norm": 0.4637075960636139, "learning_rate": 1.4300797579903476e-06, "loss": 0.0955, "step": 3890 }, { "epoch": 0.78, "grad_norm": 0.7598605155944824, "learning_rate": 1.4057266897516842e-06, "loss": 0.103, "step": 3900 }, { "epoch": 0.782, "grad_norm": 0.4668324291706085, "learning_rate": 1.3815488013233986e-06, "loss": 0.0928, "step": 3910 }, { "epoch": 0.784, "grad_norm": 0.46290475130081177, "learning_rate": 1.3575472711016634e-06, "loss": 0.0921, "step": 3920 }, { "epoch": 0.786, "grad_norm": 0.5419790148735046, "learning_rate": 1.333723268887201e-06, "loss": 0.0909, "step": 3930 }, { "epoch": 0.788, "grad_norm": 0.6386012434959412, "learning_rate": 1.3100779558282673e-06, "loss": 0.0903, "step": 3940 }, { "epoch": 0.79, "grad_norm": 1.1641706228256226, "learning_rate": 1.2866124843640614e-06, "loss": 0.0965, "step": 3950 }, { "epoch": 0.792, "grad_norm": 0.8818916082382202, "learning_rate": 1.2633279981685608e-06, "loss": 0.1001, "step": 3960 }, { "epoch": 0.794, "grad_norm": 0.4657428562641144, "learning_rate": 1.240225632094773e-06, "loss": 0.0949, "step": 3970 }, { "epoch": 0.796, "grad_norm": 0.6301387548446655, "learning_rate": 1.217306512119425e-06, "loss": 0.1017, "step": 3980 }, { "epoch": 0.798, "grad_norm": 0.5192793011665344, "learning_rate": 1.1945717552880919e-06, "loss": 0.0949, "step": 3990 }, { "epoch": 0.8, "grad_norm": 0.445238322019577, "learning_rate": 1.1720224696607474e-06, "loss": 0.0933, "step": 4000 }, { "epoch": 0.802, "grad_norm": 0.7800849676132202, "learning_rate": 1.1496597542577603e-06, "loss": 0.0937, "step": 4010 }, { "epoch": 0.804, "grad_norm": 0.5425428748130798, "learning_rate": 1.1274846990063314e-06, "loss": 0.0927, "step": 4020 }, { "epoch": 0.806, "grad_norm": 0.5033382177352905, "learning_rate": 1.1054983846873684e-06, "loss": 0.0909, "step": 4030 }, { "epoch": 0.808, "grad_norm": 0.6877299547195435, "learning_rate": 1.0837018828828133e-06, "loss": 0.0905, "step": 4040 }, { "epoch": 0.81, "grad_norm": 0.5039352774620056, "learning_rate": 1.0620962559234144e-06, "loss": 0.0938, "step": 4050 }, { "epoch": 0.812, "grad_norm": 0.48495736718177795, "learning_rate": 1.0406825568369478e-06, "loss": 0.0947, "step": 4060 }, { "epoch": 0.814, "grad_norm": 0.3968163728713989, "learning_rate": 1.0194618292968972e-06, "loss": 0.0898, "step": 4070 }, { "epoch": 0.816, "grad_norm": 0.542452871799469, "learning_rate": 9.984351075715848e-07, "loss": 0.0916, "step": 4080 }, { "epoch": 0.818, "grad_norm": 0.6849305629730225, "learning_rate": 9.77603416473763e-07, "loss": 0.0968, "step": 4090 }, { "epoch": 0.82, "grad_norm": 1.279828667640686, "learning_rate": 9.569677713106673e-07, "loss": 0.0954, "step": 4100 }, { "epoch": 0.822, "grad_norm": 0.5056818723678589, "learning_rate": 9.365291778345303e-07, "loss": 0.0852, "step": 4110 }, { "epoch": 0.824, "grad_norm": 0.4796373248100281, "learning_rate": 9.162886321935632e-07, "loss": 0.0898, "step": 4120 }, { "epoch": 0.826, "grad_norm": 0.7538776397705078, "learning_rate": 8.962471208834056e-07, "loss": 0.0916, "step": 4130 }, { "epoch": 0.828, "grad_norm": 0.499845951795578, "learning_rate": 8.764056206990446e-07, "loss": 0.0985, "step": 4140 }, { "epoch": 0.83, "grad_norm": 1.0565054416656494, "learning_rate": 8.567650986872061e-07, "loss": 0.0913, "step": 4150 }, { "epoch": 0.832, "grad_norm": 0.47425204515457153, "learning_rate": 8.373265120992252e-07, "loss": 0.1015, "step": 4160 }, { "epoch": 0.834, "grad_norm": 0.5785176753997803, "learning_rate": 8.180908083443884e-07, "loss": 0.0904, "step": 4170 }, { "epoch": 0.836, "grad_norm": 0.5388156175613403, "learning_rate": 7.990589249437591e-07, "loss": 0.0947, "step": 4180 }, { "epoch": 0.838, "grad_norm": 0.4170565903186798, "learning_rate": 7.802317894844835e-07, "loss": 0.0899, "step": 4190 }, { "epoch": 0.84, "grad_norm": 0.5308743715286255, "learning_rate": 7.61610319574585e-07, "loss": 0.0929, "step": 4200 }, { "epoch": 0.842, "grad_norm": 0.583870530128479, "learning_rate": 7.43195422798233e-07, "loss": 0.0865, "step": 4210 }, { "epoch": 0.844, "grad_norm": 0.5489416122436523, "learning_rate": 7.249879966715174e-07, "loss": 0.1026, "step": 4220 }, { "epoch": 0.846, "grad_norm": 0.5027689933776855, "learning_rate": 7.069889285987025e-07, "loss": 0.1007, "step": 4230 }, { "epoch": 0.848, "grad_norm": 0.4889448583126068, "learning_rate": 6.891990958289724e-07, "loss": 0.0875, "step": 4240 }, { "epoch": 0.85, "grad_norm": 1.2699036598205566, "learning_rate": 6.716193654136788e-07, "loss": 0.0973, "step": 4250 }, { "epoch": 0.852, "grad_norm": 0.5371798276901245, "learning_rate": 6.542505941640803e-07, "loss": 0.0943, "step": 4260 }, { "epoch": 0.854, "grad_norm": 0.6962149739265442, "learning_rate": 6.370936286095842e-07, "loss": 0.0909, "step": 4270 }, { "epoch": 0.856, "grad_norm": 0.5354880094528198, "learning_rate": 6.201493049564883e-07, "loss": 0.093, "step": 4280 }, { "epoch": 0.858, "grad_norm": 0.627852737903595, "learning_rate": 6.034184490472195e-07, "loss": 0.1, "step": 4290 }, { "epoch": 0.86, "grad_norm": 0.5209593176841736, "learning_rate": 5.869018763200929e-07, "loss": 0.0944, "step": 4300 }, { "epoch": 0.862, "grad_norm": 0.5602114200592041, "learning_rate": 5.706003917695619e-07, "loss": 0.0944, "step": 4310 }, { "epoch": 0.864, "grad_norm": 0.4925883114337921, "learning_rate": 5.545147899069836e-07, "loss": 0.1006, "step": 4320 }, { "epoch": 0.866, "grad_norm": 0.5700224041938782, "learning_rate": 5.386458547219026e-07, "loss": 0.0908, "step": 4330 }, { "epoch": 0.868, "grad_norm": 1.1226917505264282, "learning_rate": 5.229943596438297e-07, "loss": 0.098, "step": 4340 }, { "epoch": 0.87, "grad_norm": 0.6325000524520874, "learning_rate": 5.075610675045567e-07, "loss": 0.0918, "step": 4350 }, { "epoch": 0.872, "grad_norm": 0.6880750060081482, "learning_rate": 4.92346730500966e-07, "loss": 0.0891, "step": 4360 }, { "epoch": 0.874, "grad_norm": 0.9880423545837402, "learning_rate": 4.773520901583801e-07, "loss": 0.0932, "step": 4370 }, { "epoch": 0.876, "grad_norm": 0.43653684854507446, "learning_rate": 4.625778772944156e-07, "loss": 0.0919, "step": 4380 }, { "epoch": 0.878, "grad_norm": 0.5817267298698425, "learning_rate": 4.480248119833641e-07, "loss": 0.0976, "step": 4390 }, { "epoch": 0.88, "grad_norm": 0.5401520729064941, "learning_rate": 4.33693603521097e-07, "loss": 0.0984, "step": 4400 }, { "epoch": 0.882, "grad_norm": 0.5201219320297241, "learning_rate": 4.195849503904975e-07, "loss": 0.0909, "step": 4410 }, { "epoch": 0.884, "grad_norm": 0.46396180987358093, "learning_rate": 4.056995402274122e-07, "loss": 0.092, "step": 4420 }, { "epoch": 0.886, "grad_norm": 0.8054051995277405, "learning_rate": 3.920380497871473e-07, "loss": 0.0924, "step": 4430 }, { "epoch": 0.888, "grad_norm": 1.6495290994644165, "learning_rate": 3.7860114491147017e-07, "loss": 0.0856, "step": 4440 }, { "epoch": 0.89, "grad_norm": 0.5194240212440491, "learning_rate": 3.6538948049616886e-07, "loss": 0.1016, "step": 4450 }, { "epoch": 0.892, "grad_norm": 0.9510396718978882, "learning_rate": 3.524037004591274e-07, "loss": 0.092, "step": 4460 }, { "epoch": 0.894, "grad_norm": 0.5282657146453857, "learning_rate": 3.396444377089453e-07, "loss": 0.1007, "step": 4470 }, { "epoch": 0.896, "grad_norm": 0.5371118187904358, "learning_rate": 3.271123141140886e-07, "loss": 0.0928, "step": 4480 }, { "epoch": 0.898, "grad_norm": 0.4065883457660675, "learning_rate": 3.148079404725801e-07, "loss": 0.0895, "step": 4490 }, { "epoch": 0.9, "grad_norm": 0.5543450117111206, "learning_rate": 3.027319164822329e-07, "loss": 0.0912, "step": 4500 }, { "epoch": 0.902, "grad_norm": 0.5739998817443848, "learning_rate": 2.908848307114198e-07, "loss": 0.0938, "step": 4510 }, { "epoch": 0.904, "grad_norm": 0.48197928071022034, "learning_rate": 2.792672605703867e-07, "loss": 0.0912, "step": 4520 }, { "epoch": 0.906, "grad_norm": 0.9697645902633667, "learning_rate": 2.6787977228311336e-07, "loss": 0.0929, "step": 4530 }, { "epoch": 0.908, "grad_norm": 0.7662408351898193, "learning_rate": 2.5672292085971276e-07, "loss": 0.0976, "step": 4540 }, { "epoch": 0.91, "grad_norm": 0.5820423364639282, "learning_rate": 2.457972500693834e-07, "loss": 0.0952, "step": 4550 }, { "epoch": 0.912, "grad_norm": 0.6280630826950073, "learning_rate": 2.351032924139063e-07, "loss": 0.0895, "step": 4560 }, { "epoch": 0.914, "grad_norm": 0.8524745106697083, "learning_rate": 2.2464156910168954e-07, "loss": 0.0931, "step": 4570 }, { "epoch": 0.916, "grad_norm": 0.8388736844062805, "learning_rate": 2.1441259002236924e-07, "loss": 0.0984, "step": 4580 }, { "epoch": 0.918, "grad_norm": 0.7623071074485779, "learning_rate": 2.0441685372195487e-07, "loss": 0.0914, "step": 4590 }, { "epoch": 0.92, "grad_norm": 0.7167821526527405, "learning_rate": 1.9465484737853092e-07, "loss": 0.092, "step": 4600 }, { "epoch": 0.922, "grad_norm": 0.9387440085411072, "learning_rate": 1.8512704677851489e-07, "loss": 0.0918, "step": 4610 }, { "epoch": 0.924, "grad_norm": 1.0186631679534912, "learning_rate": 1.758339162934658e-07, "loss": 0.0914, "step": 4620 }, { "epoch": 0.926, "grad_norm": 0.7060619592666626, "learning_rate": 1.6677590885745388e-07, "loss": 0.095, "step": 4630 }, { "epoch": 0.928, "grad_norm": 0.7647775411605835, "learning_rate": 1.5795346594498162e-07, "loss": 0.0942, "step": 4640 }, { "epoch": 0.93, "grad_norm": 0.5398228168487549, "learning_rate": 1.4936701754947104e-07, "loss": 0.0965, "step": 4650 }, { "epoch": 0.932, "grad_norm": 0.6493486762046814, "learning_rate": 1.4101698216230254e-07, "loss": 0.0975, "step": 4660 }, { "epoch": 0.934, "grad_norm": 0.5592340230941772, "learning_rate": 1.3290376675242022e-07, "loss": 0.091, "step": 4670 }, { "epoch": 0.936, "grad_norm": 0.6127602458000183, "learning_rate": 1.2502776674649776e-07, "loss": 0.0955, "step": 4680 }, { "epoch": 0.938, "grad_norm": 0.6314841508865356, "learning_rate": 1.1738936600966366e-07, "loss": 0.0947, "step": 4690 }, { "epoch": 0.94, "grad_norm": 0.6018589735031128, "learning_rate": 1.0998893682679479e-07, "loss": 0.1013, "step": 4700 }, { "epoch": 0.942, "grad_norm": 0.5869537591934204, "learning_rate": 1.0282683988436792e-07, "loss": 0.0952, "step": 4710 }, { "epoch": 0.944, "grad_norm": 0.5971767902374268, "learning_rate": 9.590342425288446e-08, "loss": 0.0855, "step": 4720 }, { "epoch": 0.946, "grad_norm": 0.6858376860618591, "learning_rate": 8.921902736985399e-08, "loss": 0.0964, "step": 4730 }, { "epoch": 0.948, "grad_norm": 0.6425265669822693, "learning_rate": 8.277397502335194e-08, "loss": 0.0873, "step": 4740 }, { "epoch": 0.95, "grad_norm": 0.6217800378799438, "learning_rate": 7.656858133613498e-08, "loss": 0.0932, "step": 4750 }, { "epoch": 0.952, "grad_norm": 0.7790467143058777, "learning_rate": 7.060314875033836e-08, "loss": 0.0884, "step": 4760 }, { "epoch": 0.954, "grad_norm": 0.7824919819831848, "learning_rate": 6.487796801272983e-08, "loss": 0.0994, "step": 4770 }, { "epoch": 0.956, "grad_norm": 1.0274417400360107, "learning_rate": 5.939331816054161e-08, "loss": 0.096, "step": 4780 }, { "epoch": 0.958, "grad_norm": 1.30277681350708, "learning_rate": 5.414946650786957e-08, "loss": 0.0875, "step": 4790 }, { "epoch": 0.96, "grad_norm": 0.7545578479766846, "learning_rate": 4.914666863264528e-08, "loss": 0.09, "step": 4800 }, { "epoch": 0.962, "grad_norm": 0.5341658592224121, "learning_rate": 4.438516836417994e-08, "loss": 0.0904, "step": 4810 }, { "epoch": 0.964, "grad_norm": 0.954708993434906, "learning_rate": 3.986519777127884e-08, "loss": 0.0878, "step": 4820 }, { "epoch": 0.966, "grad_norm": 3.0081846714019775, "learning_rate": 3.558697715093207e-08, "loss": 0.1018, "step": 4830 }, { "epoch": 0.968, "grad_norm": 0.6550202965736389, "learning_rate": 3.1550715017575895e-08, "loss": 0.0933, "step": 4840 }, { "epoch": 0.97, "grad_norm": 1.0143718719482422, "learning_rate": 2.7756608092933678e-08, "loss": 0.1012, "step": 4850 }, { "epoch": 1.0002, "grad_norm": 0.7272701859474182, "learning_rate": 2.4204841296424086e-08, "loss": 0.1023, "step": 4860 }, { "epoch": 1.0022, "grad_norm": 0.5501855611801147, "learning_rate": 2.0895587736149414e-08, "loss": 0.0984, "step": 4870 }, { "epoch": 1.0042, "grad_norm": 0.729179322719574, "learning_rate": 1.7829008700460116e-08, "loss": 0.0895, "step": 4880 }, { "epoch": 1.0062, "grad_norm": 0.6516703963279724, "learning_rate": 1.500525365009109e-08, "loss": 0.0921, "step": 4890 }, { "epoch": 1.0082, "grad_norm": 0.6412553191184998, "learning_rate": 1.2424460210881394e-08, "loss": 0.1001, "step": 4900 }, { "epoch": 1.0102, "grad_norm": 0.8296138644218445, "learning_rate": 1.008675416706073e-08, "loss": 0.0834, "step": 4910 }, { "epoch": 1.0122, "grad_norm": 0.6063240766525269, "learning_rate": 7.992249455124889e-09, "loss": 0.0831, "step": 4920 }, { "epoch": 1.0142, "grad_norm": 0.6920802593231201, "learning_rate": 6.141048158277429e-09, "loss": 0.0919, "step": 4930 }, { "epoch": 1.0162, "grad_norm": 0.6563531160354614, "learning_rate": 4.533240501459202e-09, "loss": 0.0952, "step": 4940 }, { "epoch": 1.0182, "grad_norm": 0.6383795142173767, "learning_rate": 3.1689048469457638e-09, "loss": 0.091, "step": 4950 }, { "epoch": 1.0202, "grad_norm": 0.6429073214530945, "learning_rate": 2.0481076905332074e-09, "loss": 0.0974, "step": 4960 }, { "epoch": 1.0222, "grad_norm": 0.4217958450317383, "learning_rate": 1.170903658293532e-09, "loss": 0.091, "step": 4970 }, { "epoch": 1.0242, "grad_norm": 0.648520827293396, "learning_rate": 5.373355039128836e-10, "loss": 0.0889, "step": 4980 }, { "epoch": 1.0262, "grad_norm": 0.6270639896392822, "learning_rate": 1.4743410661044454e-10, "loss": 0.0999, "step": 4990 }, { "epoch": 1.0282, "grad_norm": 0.7285539507865906, "learning_rate": 1.2184696296380083e-12, "loss": 0.1012, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4735039884017795e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }