| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0282, | |
| "eval_steps": 500, | |
| "global_step": 5000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 12.427441596984863, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.4661, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 10.644728660583496, | |
| "learning_rate": 3.8e-07, | |
| "loss": 0.43, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 2.582233428955078, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 0.2937, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 1.6104474067687988, | |
| "learning_rate": 7.8e-07, | |
| "loss": 0.2311, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.3486788272857666, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 0.1944, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 2.3484883308410645, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 0.178, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 2.1255300045013428, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 0.1663, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 1.2671189308166504, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 0.1521, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 2.0237274169921875, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 0.1515, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.4099950790405273, | |
| "learning_rate": 1.98e-06, | |
| "loss": 0.1481, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 2.23960280418396, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 0.1487, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.7865636348724365, | |
| "learning_rate": 2.38e-06, | |
| "loss": 0.1359, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 1.590512990951538, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 0.1487, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 3.781755208969116, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 0.1381, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.4385772943496704, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 0.1384, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.2209824323654175, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 0.1407, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 1.463581919670105, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 0.1392, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.9795161485671997, | |
| "learning_rate": 3.58e-06, | |
| "loss": 0.1335, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 1.5629451274871826, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 0.1355, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.2003155946731567, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 0.136, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 1.3819934129714966, | |
| "learning_rate": 4.18e-06, | |
| "loss": 0.1359, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 1.797484278678894, | |
| "learning_rate": 4.38e-06, | |
| "loss": 0.1318, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 1.15238618850708, | |
| "learning_rate": 4.58e-06, | |
| "loss": 0.1303, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.8387578129768372, | |
| "learning_rate": 4.78e-06, | |
| "loss": 0.1359, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.8264948129653931, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 0.1316, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.9605500102043152, | |
| "learning_rate": 5.18e-06, | |
| "loss": 0.1287, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.9171805381774902, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 0.135, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.7537838220596313, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 0.1317, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.9007892608642578, | |
| "learning_rate": 5.78e-06, | |
| "loss": 0.1295, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.8143233060836792, | |
| "learning_rate": 5.98e-06, | |
| "loss": 0.1337, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 0.7211500406265259, | |
| "learning_rate": 6.18e-06, | |
| "loss": 0.1228, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.8526885509490967, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 0.1331, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 1.7408725023269653, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 0.127, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 1.5572606325149536, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 0.1177, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.8059284687042236, | |
| "learning_rate": 6.98e-06, | |
| "loss": 0.1267, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.755484938621521, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 0.1278, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.9175857901573181, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 0.121, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 0.8510663509368896, | |
| "learning_rate": 7.58e-06, | |
| "loss": 0.115, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 0.9119841456413269, | |
| "learning_rate": 7.78e-06, | |
| "loss": 0.1218, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.6241216063499451, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 0.1238, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 0.8066564202308655, | |
| "learning_rate": 8.18e-06, | |
| "loss": 0.1205, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 1.104693055152893, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 0.1225, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 2.131037712097168, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 0.1321, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.8443548679351807, | |
| "learning_rate": 8.78e-06, | |
| "loss": 0.1278, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.8610478043556213, | |
| "learning_rate": 8.98e-06, | |
| "loss": 0.1169, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 0.9445344805717468, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 0.1293, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 0.8306660056114197, | |
| "learning_rate": 9.38e-06, | |
| "loss": 0.1213, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.2315057516098022, | |
| "learning_rate": 9.58e-06, | |
| "loss": 0.1173, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 0.7147797346115112, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 0.1138, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.8065736293792725, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 0.1225, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 0.6982239484786987, | |
| "learning_rate": 9.999901304280686e-06, | |
| "loss": 0.1225, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.906414806842804, | |
| "learning_rate": 9.999560138895238e-06, | |
| "loss": 0.1225, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 0.6281483769416809, | |
| "learning_rate": 9.99897530200195e-06, | |
| "loss": 0.1183, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 0.626783549785614, | |
| "learning_rate": 9.998146822104943e-06, | |
| "loss": 0.124, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.9497509598731995, | |
| "learning_rate": 9.997074739583162e-06, | |
| "loss": 0.1193, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.7951166033744812, | |
| "learning_rate": 9.995759106688394e-06, | |
| "loss": 0.1151, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 0.722017228603363, | |
| "learning_rate": 9.99419998754273e-06, | |
| "loss": 0.1205, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 0.7518423795700073, | |
| "learning_rate": 9.992397458135438e-06, | |
| "loss": 0.1231, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 3.67732834815979, | |
| "learning_rate": 9.990351606319261e-06, | |
| "loss": 0.116, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.7335011959075928, | |
| "learning_rate": 9.988062531806127e-06, | |
| "loss": 0.1138, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "grad_norm": 0.6158178448677063, | |
| "learning_rate": 9.9855303461623e-06, | |
| "loss": 0.1116, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 1.0562201738357544, | |
| "learning_rate": 9.982755172802933e-06, | |
| "loss": 0.1122, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "grad_norm": 0.7680226564407349, | |
| "learning_rate": 9.979737146986064e-06, | |
| "loss": 0.1108, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.935513973236084, | |
| "learning_rate": 9.976476415806013e-06, | |
| "loss": 0.1189, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 0.7690663933753967, | |
| "learning_rate": 9.972973138186217e-06, | |
| "loss": 0.1201, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 0.8854688405990601, | |
| "learning_rate": 9.969227484871485e-06, | |
| "loss": 0.1144, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "grad_norm": 1.5783987045288086, | |
| "learning_rate": 9.965239638419673e-06, | |
| "loss": 0.1173, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 0.7775712013244629, | |
| "learning_rate": 9.961009793192793e-06, | |
| "loss": 0.1199, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "grad_norm": 0.8870111107826233, | |
| "learning_rate": 9.956538155347534e-06, | |
| "loss": 0.1221, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 0.9003025889396667, | |
| "learning_rate": 9.951824942825215e-06, | |
| "loss": 0.1144, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "grad_norm": 0.8907985687255859, | |
| "learning_rate": 9.946870385341167e-06, | |
| "loss": 0.111, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.7512484192848206, | |
| "learning_rate": 9.94167472437353e-06, | |
| "loss": 0.1233, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "grad_norm": 1.176719069480896, | |
| "learning_rate": 9.936238213151491e-06, | |
| "loss": 0.1077, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 0.5437051057815552, | |
| "learning_rate": 9.930561116642936e-06, | |
| "loss": 0.1159, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.4601088762283325, | |
| "learning_rate": 9.92464371154154e-06, | |
| "loss": 0.1121, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.6581608057022095, | |
| "learning_rate": 9.918486286253279e-06, | |
| "loss": 0.1155, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "grad_norm": 0.7549941539764404, | |
| "learning_rate": 9.912089140882377e-06, | |
| "loss": 0.1138, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 0.8127408623695374, | |
| "learning_rate": 9.90545258721667e-06, | |
| "loss": 0.113, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "grad_norm": 0.6606084108352661, | |
| "learning_rate": 9.898576948712427e-06, | |
| "loss": 0.1171, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 0.90671306848526, | |
| "learning_rate": 9.891462560478562e-06, | |
| "loss": 0.1096, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "grad_norm": 0.6799761652946472, | |
| "learning_rate": 9.884109769260326e-06, | |
| "loss": 0.1175, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 0.6381652355194092, | |
| "learning_rate": 9.876518933422385e-06, | |
| "loss": 0.1148, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "grad_norm": 0.5381753444671631, | |
| "learning_rate": 9.868690422931372e-06, | |
| "loss": 0.121, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 0.7047041654586792, | |
| "learning_rate": 9.860624619337844e-06, | |
| "loss": 0.1128, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 0.9245678186416626, | |
| "learning_rate": 9.852321915757688e-06, | |
| "loss": 0.1172, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 1.1964024305343628, | |
| "learning_rate": 9.843782716852963e-06, | |
| "loss": 0.1156, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "grad_norm": 0.6748765110969543, | |
| "learning_rate": 9.835007438812177e-06, | |
| "loss": 0.116, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 0.9408971071243286, | |
| "learning_rate": 9.825996509330001e-06, | |
| "loss": 0.1134, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "grad_norm": 0.5978791117668152, | |
| "learning_rate": 9.816750367586424e-06, | |
| "loss": 0.1148, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.8566645383834839, | |
| "learning_rate": 9.807269464225355e-06, | |
| "loss": 0.1117, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.182, | |
| "grad_norm": 0.7490561604499817, | |
| "learning_rate": 9.797554261332644e-06, | |
| "loss": 0.1116, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 0.5367133617401123, | |
| "learning_rate": 9.787605232413575e-06, | |
| "loss": 0.1132, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.186, | |
| "grad_norm": 0.7432158589363098, | |
| "learning_rate": 9.777422862369782e-06, | |
| "loss": 0.1251, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": 0.7126122117042542, | |
| "learning_rate": 9.767007647475618e-06, | |
| "loss": 0.1066, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.2850806713104248, | |
| "learning_rate": 9.756360095353957e-06, | |
| "loss": 0.1122, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.4591011106967926, | |
| "learning_rate": 9.745480724951473e-06, | |
| "loss": 0.1101, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.194, | |
| "grad_norm": 0.6034103035926819, | |
| "learning_rate": 9.73437006651333e-06, | |
| "loss": 0.1145, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": 0.6527671813964844, | |
| "learning_rate": 9.723028661557345e-06, | |
| "loss": 0.1074, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.198, | |
| "grad_norm": 0.49524030089378357, | |
| "learning_rate": 9.711457062847596e-06, | |
| "loss": 0.1101, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.7419334053993225, | |
| "learning_rate": 9.699655834367479e-06, | |
| "loss": 0.1133, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.202, | |
| "grad_norm": 0.508783221244812, | |
| "learning_rate": 9.687625551292219e-06, | |
| "loss": 0.1156, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": 0.6890400648117065, | |
| "learning_rate": 9.675366799960842e-06, | |
| "loss": 0.1095, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.206, | |
| "grad_norm": 0.574763834476471, | |
| "learning_rate": 9.662880177847595e-06, | |
| "loss": 0.114, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 0.5926764607429504, | |
| "learning_rate": 9.650166293532822e-06, | |
| "loss": 0.1087, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.6142486333847046, | |
| "learning_rate": 9.637225766673309e-06, | |
| "loss": 0.1061, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": 0.5107919573783875, | |
| "learning_rate": 9.624059227972077e-06, | |
| "loss": 0.1154, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.214, | |
| "grad_norm": 0.7692158818244934, | |
| "learning_rate": 9.610667319147648e-06, | |
| "loss": 0.1128, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 0.5525968074798584, | |
| "learning_rate": 9.597050692902765e-06, | |
| "loss": 0.1096, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.218, | |
| "grad_norm": 0.6440755724906921, | |
| "learning_rate": 9.583210012892582e-06, | |
| "loss": 0.1157, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.5787026882171631, | |
| "learning_rate": 9.569145953692316e-06, | |
| "loss": 0.1042, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.222, | |
| "grad_norm": 0.8566828966140747, | |
| "learning_rate": 9.554859200764371e-06, | |
| "loss": 0.109, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.5807632207870483, | |
| "learning_rate": 9.540350450424927e-06, | |
| "loss": 0.1094, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.226, | |
| "grad_norm": 0.5819889307022095, | |
| "learning_rate": 9.525620409810009e-06, | |
| "loss": 0.1065, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": 0.46392130851745605, | |
| "learning_rate": 9.510669796841014e-06, | |
| "loss": 0.1058, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.063761830329895, | |
| "learning_rate": 9.495499340189729e-06, | |
| "loss": 0.1096, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 0.7276637554168701, | |
| "learning_rate": 9.480109779242805e-06, | |
| "loss": 0.118, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.234, | |
| "grad_norm": 0.5671316385269165, | |
| "learning_rate": 9.464501864065735e-06, | |
| "loss": 0.1077, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": 0.7350747585296631, | |
| "learning_rate": 9.448676355366282e-06, | |
| "loss": 0.1105, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.238, | |
| "grad_norm": 0.5442182421684265, | |
| "learning_rate": 9.432634024457414e-06, | |
| "loss": 0.1058, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.9185284376144409, | |
| "learning_rate": 9.41637565321971e-06, | |
| "loss": 0.1026, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.242, | |
| "grad_norm": 0.7028173208236694, | |
| "learning_rate": 9.399902034063244e-06, | |
| "loss": 0.108, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": 0.5107137560844421, | |
| "learning_rate": 9.383213969888972e-06, | |
| "loss": 0.1148, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.246, | |
| "grad_norm": 0.5467950701713562, | |
| "learning_rate": 9.366312274049602e-06, | |
| "loss": 0.1007, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.8054739832878113, | |
| "learning_rate": 9.349197770309942e-06, | |
| "loss": 0.1057, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.5686827301979065, | |
| "learning_rate": 9.33187129280676e-06, | |
| "loss": 0.1071, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": 0.6124984622001648, | |
| "learning_rate": 9.314333686008125e-06, | |
| "loss": 0.1095, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.254, | |
| "grad_norm": 1.458778977394104, | |
| "learning_rate": 9.296585804672253e-06, | |
| "loss": 0.1072, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.6592323780059814, | |
| "learning_rate": 9.278628513805838e-06, | |
| "loss": 0.1009, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.258, | |
| "grad_norm": 0.5499700903892517, | |
| "learning_rate": 9.260462688621906e-06, | |
| "loss": 0.109, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5235973596572876, | |
| "learning_rate": 9.242089214497146e-06, | |
| "loss": 0.1044, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.262, | |
| "grad_norm": 0.5331101417541504, | |
| "learning_rate": 9.223508986928766e-06, | |
| "loss": 0.103, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 0.4890216290950775, | |
| "learning_rate": 9.204722911490847e-06, | |
| "loss": 0.1059, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.266, | |
| "grad_norm": 0.7136434316635132, | |
| "learning_rate": 9.1857319037902e-06, | |
| "loss": 0.1063, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": 0.5715893507003784, | |
| "learning_rate": 9.16653688942175e-06, | |
| "loss": 0.1147, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.6043694615364075, | |
| "learning_rate": 9.147138803923417e-06, | |
| "loss": 0.11, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.720506489276886, | |
| "learning_rate": 9.12753859273052e-06, | |
| "loss": 0.1042, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.274, | |
| "grad_norm": 0.8681615591049194, | |
| "learning_rate": 9.107737211129702e-06, | |
| "loss": 0.1041, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": 0.49558717012405396, | |
| "learning_rate": 9.087735624212365e-06, | |
| "loss": 0.1109, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.278, | |
| "grad_norm": 0.5701916217803955, | |
| "learning_rate": 9.06753480682764e-06, | |
| "loss": 0.1118, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.503829836845398, | |
| "learning_rate": 9.047135743534866e-06, | |
| "loss": 0.1086, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.282, | |
| "grad_norm": 0.3906455636024475, | |
| "learning_rate": 9.026539428555609e-06, | |
| "loss": 0.1082, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": 0.6386545300483704, | |
| "learning_rate": 9.005746865725206e-06, | |
| "loss": 0.1061, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.286, | |
| "grad_norm": 0.5736138224601746, | |
| "learning_rate": 8.984759068443832e-06, | |
| "loss": 0.1117, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.8654852509498596, | |
| "learning_rate": 8.963577059627117e-06, | |
| "loss": 0.1075, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6457758545875549, | |
| "learning_rate": 8.942201871656292e-06, | |
| "loss": 0.1098, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": 0.4607081115245819, | |
| "learning_rate": 8.920634546327857e-06, | |
| "loss": 0.1033, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.294, | |
| "grad_norm": 0.4616289734840393, | |
| "learning_rate": 8.898876134802827e-06, | |
| "loss": 0.1073, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 0.5038948655128479, | |
| "learning_rate": 8.87692769755548e-06, | |
| "loss": 0.1086, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.298, | |
| "grad_norm": 0.5729373097419739, | |
| "learning_rate": 8.854790304321682e-06, | |
| "loss": 0.1061, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.48975491523742676, | |
| "learning_rate": 8.83246503404675e-06, | |
| "loss": 0.1099, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.302, | |
| "grad_norm": 0.5203743577003479, | |
| "learning_rate": 8.80995297483286e-06, | |
| "loss": 0.1075, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.4938233196735382, | |
| "learning_rate": 8.78725522388602e-06, | |
| "loss": 0.105, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.306, | |
| "grad_norm": 0.5147393941879272, | |
| "learning_rate": 8.764372887462587e-06, | |
| "loss": 0.109, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": 0.6048959493637085, | |
| "learning_rate": 8.741307080815357e-06, | |
| "loss": 0.1051, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.4202023446559906, | |
| "learning_rate": 8.718058928139205e-06, | |
| "loss": 0.1071, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.7164034247398376, | |
| "learning_rate": 8.694629562516295e-06, | |
| "loss": 0.1111, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.314, | |
| "grad_norm": 0.5305653214454651, | |
| "learning_rate": 8.671020125860851e-06, | |
| "loss": 0.0985, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": 0.48124703764915466, | |
| "learning_rate": 8.647231768863513e-06, | |
| "loss": 0.1051, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.318, | |
| "grad_norm": 0.586521327495575, | |
| "learning_rate": 8.623265650935233e-06, | |
| "loss": 0.1056, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.7912370562553406, | |
| "learning_rate": 8.599122940150795e-06, | |
| "loss": 0.1042, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.322, | |
| "grad_norm": 0.5484232306480408, | |
| "learning_rate": 8.574804813191859e-06, | |
| "loss": 0.1008, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": 0.4200885593891144, | |
| "learning_rate": 8.550312455289624e-06, | |
| "loss": 0.1058, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.326, | |
| "grad_norm": 0.5569013357162476, | |
| "learning_rate": 8.525647060167063e-06, | |
| "loss": 0.1087, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 0.42352986335754395, | |
| "learning_rate": 8.500809829980734e-06, | |
| "loss": 0.1003, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.5038197636604309, | |
| "learning_rate": 8.4758019752622e-06, | |
| "loss": 0.098, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": 0.5340690612792969, | |
| "learning_rate": 8.450624714859016e-06, | |
| "loss": 0.107, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.334, | |
| "grad_norm": 0.5682323575019836, | |
| "learning_rate": 8.425279275875336e-06, | |
| "loss": 0.1034, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.5063655376434326, | |
| "learning_rate": 8.399766893612096e-06, | |
| "loss": 0.1005, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.338, | |
| "grad_norm": 0.6555968523025513, | |
| "learning_rate": 8.374088811506819e-06, | |
| "loss": 0.1074, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.4122048318386078, | |
| "learning_rate": 8.348246281072998e-06, | |
| "loss": 0.1035, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.342, | |
| "grad_norm": 0.6033665537834167, | |
| "learning_rate": 8.32224056183911e-06, | |
| "loss": 0.1004, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.6186772584915161, | |
| "learning_rate": 8.296072921287217e-06, | |
| "loss": 0.1059, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.346, | |
| "grad_norm": 0.5782063603401184, | |
| "learning_rate": 8.269744634791207e-06, | |
| "loss": 0.1026, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": 0.5529699921607971, | |
| "learning_rate": 8.243256985554622e-06, | |
| "loss": 0.1052, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.43343502283096313, | |
| "learning_rate": 8.21661126454811e-06, | |
| "loss": 0.105, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.5857189297676086, | |
| "learning_rate": 8.189808770446528e-06, | |
| "loss": 0.1049, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.354, | |
| "grad_norm": 0.6525639891624451, | |
| "learning_rate": 8.162850809565623e-06, | |
| "loss": 0.0974, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": 2.085735321044922, | |
| "learning_rate": 8.135738695798377e-06, | |
| "loss": 0.1067, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.358, | |
| "grad_norm": 0.5422115325927734, | |
| "learning_rate": 8.108473750550965e-06, | |
| "loss": 0.1086, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.4821435213088989, | |
| "learning_rate": 8.081057302678352e-06, | |
| "loss": 0.0956, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.362, | |
| "grad_norm": 0.6426023244857788, | |
| "learning_rate": 8.053490688419532e-06, | |
| "loss": 0.1072, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": 0.5745382905006409, | |
| "learning_rate": 8.02577525133239e-06, | |
| "loss": 0.1077, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.366, | |
| "grad_norm": 0.5834154486656189, | |
| "learning_rate": 7.997912342228232e-06, | |
| "loss": 0.0991, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.4103539288043976, | |
| "learning_rate": 7.969903319105935e-06, | |
| "loss": 0.1022, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.7636476755142212, | |
| "learning_rate": 7.941749547085778e-06, | |
| "loss": 0.1028, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": 0.4373137354850769, | |
| "learning_rate": 7.913452398342882e-06, | |
| "loss": 0.099, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.374, | |
| "grad_norm": 0.5366427302360535, | |
| "learning_rate": 7.88501325204036e-06, | |
| "loss": 0.1059, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 0.49244028329849243, | |
| "learning_rate": 7.856433494262078e-06, | |
| "loss": 0.1055, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.378, | |
| "grad_norm": 0.7082386016845703, | |
| "learning_rate": 7.827714517945116e-06, | |
| "loss": 0.1042, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.6199413537979126, | |
| "learning_rate": 7.798857722811857e-06, | |
| "loss": 0.1072, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.382, | |
| "grad_norm": 0.5586550831794739, | |
| "learning_rate": 7.769864515301787e-06, | |
| "loss": 0.1119, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.5374526977539062, | |
| "learning_rate": 7.740736308502939e-06, | |
| "loss": 0.1074, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.386, | |
| "grad_norm": 0.6747333407402039, | |
| "learning_rate": 7.711474522083015e-06, | |
| "loss": 0.1052, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 0.8605504631996155, | |
| "learning_rate": 7.682080582220206e-06, | |
| "loss": 0.1048, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.8798643350601196, | |
| "learning_rate": 7.652555921533671e-06, | |
| "loss": 0.0994, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.39183422923088074, | |
| "learning_rate": 7.622901979013717e-06, | |
| "loss": 0.1023, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.394, | |
| "grad_norm": 0.5172967314720154, | |
| "learning_rate": 7.5931201999516715e-06, | |
| "loss": 0.1029, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 0.4358147084712982, | |
| "learning_rate": 7.563212035869426e-06, | |
| "loss": 0.1112, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.398, | |
| "grad_norm": 0.9168479442596436, | |
| "learning_rate": 7.533178944448705e-06, | |
| "loss": 0.1026, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.4735708236694336, | |
| "learning_rate": 7.503022389460014e-06, | |
| "loss": 0.1075, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.402, | |
| "grad_norm": 2.389822244644165, | |
| "learning_rate": 7.4727438406912986e-06, | |
| "loss": 0.1052, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 0.5340117812156677, | |
| "learning_rate": 7.44234477387631e-06, | |
| "loss": 0.1036, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.406, | |
| "grad_norm": 0.4848586916923523, | |
| "learning_rate": 7.411826670622676e-06, | |
| "loss": 0.0993, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.5334826111793518, | |
| "learning_rate": 7.381191018339697e-06, | |
| "loss": 0.1037, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.6843870282173157, | |
| "learning_rate": 7.350439310165842e-06, | |
| "loss": 0.0961, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 0.4742767810821533, | |
| "learning_rate": 7.319573044895986e-06, | |
| "loss": 0.103, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.414, | |
| "grad_norm": 0.7538304924964905, | |
| "learning_rate": 7.288593726908351e-06, | |
| "loss": 0.1068, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.4505075216293335, | |
| "learning_rate": 7.257502866091192e-06, | |
| "loss": 0.0987, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.418, | |
| "grad_norm": 0.4845210015773773, | |
| "learning_rate": 7.226301977769199e-06, | |
| "loss": 0.1019, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.7197276949882507, | |
| "learning_rate": 7.194992582629654e-06, | |
| "loss": 0.1009, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.422, | |
| "grad_norm": 0.8190245032310486, | |
| "learning_rate": 7.1635762066483035e-06, | |
| "loss": 0.1066, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.4387541711330414, | |
| "learning_rate": 7.1320543810149945e-06, | |
| "loss": 0.0943, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.426, | |
| "grad_norm": 0.5509855151176453, | |
| "learning_rate": 7.100428642059033e-06, | |
| "loss": 0.104, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 0.5081747174263, | |
| "learning_rate": 7.0687005311743195e-06, | |
| "loss": 0.1016, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.5941420793533325, | |
| "learning_rate": 7.036871594744218e-06, | |
| "loss": 0.1094, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.47250431776046753, | |
| "learning_rate": 7.0049433840661875e-06, | |
| "loss": 0.1022, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.434, | |
| "grad_norm": 0.6311111450195312, | |
| "learning_rate": 6.97291745527617e-06, | |
| "loss": 0.1014, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 1.6460589170455933, | |
| "learning_rate": 6.940795369272754e-06, | |
| "loss": 0.1093, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.438, | |
| "grad_norm": 0.5381661653518677, | |
| "learning_rate": 6.908578691641092e-06, | |
| "loss": 0.0997, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.5386653542518616, | |
| "learning_rate": 6.876268992576605e-06, | |
| "loss": 0.0996, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.442, | |
| "grad_norm": 0.5878283381462097, | |
| "learning_rate": 6.843867846808438e-06, | |
| "loss": 0.1011, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 1.202819585800171, | |
| "learning_rate": 6.811376833522729e-06, | |
| "loss": 0.1015, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.446, | |
| "grad_norm": 0.6832873225212097, | |
| "learning_rate": 6.778797536285625e-06, | |
| "loss": 0.1007, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.7227466106414795, | |
| "learning_rate": 6.746131542966112e-06, | |
| "loss": 0.1036, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.462429016828537, | |
| "learning_rate": 6.713380445658618e-06, | |
| "loss": 0.1012, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 0.5341704487800598, | |
| "learning_rate": 6.680545840605423e-06, | |
| "loss": 0.103, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.454, | |
| "grad_norm": 0.5896704196929932, | |
| "learning_rate": 6.647629328118852e-06, | |
| "loss": 0.1045, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 4.748597145080566, | |
| "learning_rate": 6.614632512503289e-06, | |
| "loss": 0.0989, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.458, | |
| "grad_norm": 0.5261045098304749, | |
| "learning_rate": 6.58155700197697e-06, | |
| "loss": 0.1017, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.5400019884109497, | |
| "learning_rate": 6.548404408593622e-06, | |
| "loss": 0.1028, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.462, | |
| "grad_norm": 0.5167948603630066, | |
| "learning_rate": 6.5151763481638705e-06, | |
| "loss": 0.101, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.5450933575630188, | |
| "learning_rate": 6.481874440176506e-06, | |
| "loss": 0.1069, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.466, | |
| "grad_norm": 0.8595630526542664, | |
| "learning_rate": 6.448500307719537e-06, | |
| "loss": 0.1004, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 0.47659191489219666, | |
| "learning_rate": 6.415055577401101e-06, | |
| "loss": 0.0994, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.49259841442108154, | |
| "learning_rate": 6.3815418792701686e-06, | |
| "loss": 0.0985, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.5162966251373291, | |
| "learning_rate": 6.3479608467371055e-06, | |
| "loss": 0.1004, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.474, | |
| "grad_norm": 0.43801945447921753, | |
| "learning_rate": 6.314314116494061e-06, | |
| "loss": 0.1009, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 0.5107311010360718, | |
| "learning_rate": 6.280603328435199e-06, | |
| "loss": 0.0944, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.478, | |
| "grad_norm": 0.5112555027008057, | |
| "learning_rate": 6.24683012557677e-06, | |
| "loss": 0.1034, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.40618887543678284, | |
| "learning_rate": 6.212996153977038e-06, | |
| "loss": 0.1062, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.482, | |
| "grad_norm": 0.6284693479537964, | |
| "learning_rate": 6.179103062656042e-06, | |
| "loss": 0.0955, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 0.42741313576698303, | |
| "learning_rate": 6.145152503515239e-06, | |
| "loss": 0.1031, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.486, | |
| "grad_norm": 0.5457072854042053, | |
| "learning_rate": 6.111146131256983e-06, | |
| "loss": 0.1109, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.4269944727420807, | |
| "learning_rate": 6.077085603303883e-06, | |
| "loss": 0.098, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.5703105926513672, | |
| "learning_rate": 6.04297257971802e-06, | |
| "loss": 0.0988, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 0.3747485280036926, | |
| "learning_rate": 6.008808723120035e-06, | |
| "loss": 0.1019, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.494, | |
| "grad_norm": 0.4911485016345978, | |
| "learning_rate": 5.974595698608103e-06, | |
| "loss": 0.1007, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.5864652991294861, | |
| "learning_rate": 5.94033517367677e-06, | |
| "loss": 0.0988, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.498, | |
| "grad_norm": 0.5941429138183594, | |
| "learning_rate": 5.906028818135687e-06, | |
| "loss": 0.1036, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.6516585946083069, | |
| "learning_rate": 5.871678304028224e-06, | |
| "loss": 0.0895, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.502, | |
| "grad_norm": 0.4426342844963074, | |
| "learning_rate": 5.837285305549978e-06, | |
| "loss": 0.0968, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "grad_norm": 0.4874410033226013, | |
| "learning_rate": 5.802851498967173e-06, | |
| "loss": 0.1018, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.506, | |
| "grad_norm": 0.5964179039001465, | |
| "learning_rate": 5.768378562534962e-06, | |
| "loss": 0.1037, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.508, | |
| "grad_norm": 0.43735530972480774, | |
| "learning_rate": 5.733868176415633e-06, | |
| "loss": 0.0993, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 0.6398305296897888, | |
| "learning_rate": 5.6993220225967214e-06, | |
| "loss": 0.0943, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.5659822225570679, | |
| "learning_rate": 5.6647417848090225e-06, | |
| "loss": 0.0991, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.514, | |
| "grad_norm": 0.6617985963821411, | |
| "learning_rate": 5.630129148444543e-06, | |
| "loss": 0.0957, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.516, | |
| "grad_norm": 0.3739423453807831, | |
| "learning_rate": 5.59548580047435e-06, | |
| "loss": 0.0951, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.518, | |
| "grad_norm": 0.9471829533576965, | |
| "learning_rate": 5.560813429366345e-06, | |
| "loss": 0.0965, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 0.7911657094955444, | |
| "learning_rate": 5.526113725002984e-06, | |
| "loss": 0.1031, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.522, | |
| "grad_norm": 0.6069086194038391, | |
| "learning_rate": 5.491388378598899e-06, | |
| "loss": 0.0976, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.524, | |
| "grad_norm": 0.47589465975761414, | |
| "learning_rate": 5.456639082618489e-06, | |
| "loss": 0.0918, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.526, | |
| "grad_norm": 0.4524126946926117, | |
| "learning_rate": 5.4218675306934145e-06, | |
| "loss": 0.0979, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 0.6384430527687073, | |
| "learning_rate": 5.3870754175400595e-06, | |
| "loss": 0.0987, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 0.40055084228515625, | |
| "learning_rate": 5.352264438876935e-06, | |
| "loss": 0.097, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.532, | |
| "grad_norm": 0.5200124382972717, | |
| "learning_rate": 5.317436291342031e-06, | |
| "loss": 0.0973, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.534, | |
| "grad_norm": 0.6301565766334534, | |
| "learning_rate": 5.282592672410124e-06, | |
| "loss": 0.105, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.536, | |
| "grad_norm": 0.48175889253616333, | |
| "learning_rate": 5.247735280310041e-06, | |
| "loss": 0.0941, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.538, | |
| "grad_norm": 0.3721906542778015, | |
| "learning_rate": 5.212865813941899e-06, | |
| "loss": 0.1017, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 0.38474878668785095, | |
| "learning_rate": 5.177985972794293e-06, | |
| "loss": 0.0914, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.542, | |
| "grad_norm": 0.5831676721572876, | |
| "learning_rate": 5.143097456861474e-06, | |
| "loss": 0.0959, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 0.5077164173126221, | |
| "learning_rate": 5.1082019665604895e-06, | |
| "loss": 0.0977, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.546, | |
| "grad_norm": 0.5812519788742065, | |
| "learning_rate": 5.073301202648304e-06, | |
| "loss": 0.0874, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.548, | |
| "grad_norm": 2.192858934402466, | |
| "learning_rate": 5.038396866138915e-06, | |
| "loss": 0.1039, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 0.8018079996109009, | |
| "learning_rate": 5.003490658220438e-06, | |
| "loss": 0.0999, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.552, | |
| "grad_norm": 0.4818994104862213, | |
| "learning_rate": 4.968584280172206e-06, | |
| "loss": 0.0895, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.554, | |
| "grad_norm": 0.7325239181518555, | |
| "learning_rate": 4.933679433281837e-06, | |
| "loss": 0.1034, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.556, | |
| "grad_norm": 0.6191554665565491, | |
| "learning_rate": 4.898777818762325e-06, | |
| "loss": 0.0995, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.558, | |
| "grad_norm": 0.775681734085083, | |
| "learning_rate": 4.863881137669123e-06, | |
| "loss": 0.0952, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.4037740230560303, | |
| "learning_rate": 4.828991090817238e-06, | |
| "loss": 0.0952, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.562, | |
| "grad_norm": 0.43431025743484497, | |
| "learning_rate": 4.794109378698327e-06, | |
| "loss": 0.1076, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.564, | |
| "grad_norm": 0.4915035665035248, | |
| "learning_rate": 4.759237701397831e-06, | |
| "loss": 0.1032, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.566, | |
| "grad_norm": 0.4426998794078827, | |
| "learning_rate": 4.7243777585121034e-06, | |
| "loss": 0.0979, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.568, | |
| "grad_norm": 0.44436115026474, | |
| "learning_rate": 4.689531249065581e-06, | |
| "loss": 0.0937, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 0.9476667642593384, | |
| "learning_rate": 4.654699871427972e-06, | |
| "loss": 0.1045, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.572, | |
| "grad_norm": 0.6479189395904541, | |
| "learning_rate": 4.619885323231484e-06, | |
| "loss": 0.1003, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.574, | |
| "grad_norm": 4.131319522857666, | |
| "learning_rate": 4.5850893012880806e-06, | |
| "loss": 0.1047, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 0.4022497832775116, | |
| "learning_rate": 4.5503135015067815e-06, | |
| "loss": 0.1002, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.578, | |
| "grad_norm": 0.6141453981399536, | |
| "learning_rate": 4.5155596188110055e-06, | |
| "loss": 0.0973, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 0.520523190498352, | |
| "learning_rate": 4.4808293470559645e-06, | |
| "loss": 0.1045, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.582, | |
| "grad_norm": 0.41964802145957947, | |
| "learning_rate": 4.446124378946108e-06, | |
| "loss": 0.1089, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.584, | |
| "grad_norm": 1.8227766752243042, | |
| "learning_rate": 4.4114464059526185e-06, | |
| "loss": 0.0872, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.586, | |
| "grad_norm": 0.427450031042099, | |
| "learning_rate": 4.376797118230978e-06, | |
| "loss": 0.0966, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.588, | |
| "grad_norm": 0.5866195559501648, | |
| "learning_rate": 4.342178204538588e-06, | |
| "loss": 0.1014, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 0.7429317235946655, | |
| "learning_rate": 4.307591352152459e-06, | |
| "loss": 0.1011, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 0.578780472278595, | |
| "learning_rate": 4.273038246786986e-06, | |
| "loss": 0.1034, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.594, | |
| "grad_norm": 0.45780378580093384, | |
| "learning_rate": 4.238520572511773e-06, | |
| "loss": 0.0988, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.596, | |
| "grad_norm": 0.4824381470680237, | |
| "learning_rate": 4.204040011669567e-06, | |
| "loss": 0.1011, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.598, | |
| "grad_norm": 0.6699690222740173, | |
| "learning_rate": 4.169598244794261e-06, | |
| "loss": 0.0958, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 0.6796635389328003, | |
| "learning_rate": 4.135196950528982e-06, | |
| "loss": 0.1013, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.602, | |
| "grad_norm": 0.4701448380947113, | |
| "learning_rate": 4.100837805544279e-06, | |
| "loss": 0.0899, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.604, | |
| "grad_norm": 0.4026479125022888, | |
| "learning_rate": 4.066522484456406e-06, | |
| "loss": 0.0996, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.606, | |
| "grad_norm": 0.6788782477378845, | |
| "learning_rate": 4.032252659745699e-06, | |
| "loss": 0.0988, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 0.45941832661628723, | |
| "learning_rate": 3.9980300016750696e-06, | |
| "loss": 0.1006, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 0.7753358483314514, | |
| "learning_rate": 3.963856178208588e-06, | |
| "loss": 0.0943, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.612, | |
| "grad_norm": 0.5039435625076294, | |
| "learning_rate": 3.9297328549302e-06, | |
| "loss": 0.0987, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.614, | |
| "grad_norm": 0.6418735384941101, | |
| "learning_rate": 3.895661694962542e-06, | |
| "loss": 0.0983, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.616, | |
| "grad_norm": 1.4340955018997192, | |
| "learning_rate": 3.86164435888588e-06, | |
| "loss": 0.1017, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.618, | |
| "grad_norm": 0.6281638145446777, | |
| "learning_rate": 3.827682504657187e-06, | |
| "loss": 0.1016, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 0.8099610805511475, | |
| "learning_rate": 3.793777787529325e-06, | |
| "loss": 0.1028, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.622, | |
| "grad_norm": 0.46766799688339233, | |
| "learning_rate": 3.759931859970374e-06, | |
| "loss": 0.0992, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 0.6429394483566284, | |
| "learning_rate": 3.7261463715830902e-06, | |
| "loss": 0.0993, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.626, | |
| "grad_norm": 0.5106363892555237, | |
| "learning_rate": 3.6924229690245163e-06, | |
| "loss": 0.0984, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.628, | |
| "grad_norm": 0.3694756031036377, | |
| "learning_rate": 3.6587632959257168e-06, | |
| "loss": 0.0919, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 0.38708898425102234, | |
| "learning_rate": 3.625168992811671e-06, | |
| "loss": 0.0992, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.632, | |
| "grad_norm": 0.4585551619529724, | |
| "learning_rate": 3.5916416970213173e-06, | |
| "loss": 0.0947, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.634, | |
| "grad_norm": 0.4839175045490265, | |
| "learning_rate": 3.5581830426277554e-06, | |
| "loss": 0.1006, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.636, | |
| "grad_norm": 0.6512119770050049, | |
| "learning_rate": 3.524794660358593e-06, | |
| "loss": 0.0975, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.638, | |
| "grad_norm": 0.6606001853942871, | |
| "learning_rate": 3.491478177516484e-06, | |
| "loss": 0.0965, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.5618588924407959, | |
| "learning_rate": 3.4582352178997937e-06, | |
| "loss": 0.0972, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.642, | |
| "grad_norm": 0.9269917607307434, | |
| "learning_rate": 3.4250674017234774e-06, | |
| "loss": 0.1047, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.644, | |
| "grad_norm": 0.4144897162914276, | |
| "learning_rate": 3.3919763455401016e-06, | |
| "loss": 0.1007, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.646, | |
| "grad_norm": 0.541193425655365, | |
| "learning_rate": 3.358963662161062e-06, | |
| "loss": 0.0927, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.648, | |
| "grad_norm": 0.7467935085296631, | |
| "learning_rate": 3.3260309605779717e-06, | |
| "loss": 0.0959, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 0.5478144288063049, | |
| "learning_rate": 3.293179845884245e-06, | |
| "loss": 0.102, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.652, | |
| "grad_norm": 0.4977323114871979, | |
| "learning_rate": 3.260411919196866e-06, | |
| "loss": 0.0961, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.654, | |
| "grad_norm": 0.5541924834251404, | |
| "learning_rate": 3.227728777578353e-06, | |
| "loss": 0.1031, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 0.570885419845581, | |
| "learning_rate": 3.195132013958918e-06, | |
| "loss": 0.0984, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.658, | |
| "grad_norm": 1.277539849281311, | |
| "learning_rate": 3.1626232170588343e-06, | |
| "loss": 0.0952, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.4125811457633972, | |
| "learning_rate": 3.130203971310999e-06, | |
| "loss": 0.0996, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.662, | |
| "grad_norm": 0.5309910774230957, | |
| "learning_rate": 3.097875856783713e-06, | |
| "loss": 0.0954, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.664, | |
| "grad_norm": 0.553604006767273, | |
| "learning_rate": 3.0656404491036696e-06, | |
| "loss": 0.1029, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.666, | |
| "grad_norm": 0.740545928478241, | |
| "learning_rate": 3.033499319379163e-06, | |
| "loss": 0.0964, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.668, | |
| "grad_norm": 0.37706542015075684, | |
| "learning_rate": 3.001454034123512e-06, | |
| "loss": 0.104, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.4644536077976227, | |
| "learning_rate": 2.969506155178711e-06, | |
| "loss": 0.0963, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 0.4521162509918213, | |
| "learning_rate": 2.9376572396393047e-06, | |
| "loss": 0.0977, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.674, | |
| "grad_norm": 0.5465760827064514, | |
| "learning_rate": 2.905908839776509e-06, | |
| "loss": 0.097, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.676, | |
| "grad_norm": 0.6061195135116577, | |
| "learning_rate": 2.874262502962537e-06, | |
| "loss": 0.0937, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.678, | |
| "grad_norm": 1.0016474723815918, | |
| "learning_rate": 2.8427197715952047e-06, | |
| "loss": 0.1016, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.5500060319900513, | |
| "learning_rate": 2.811282183022736e-06, | |
| "loss": 0.0996, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.682, | |
| "grad_norm": 0.5355184078216553, | |
| "learning_rate": 2.779951269468847e-06, | |
| "loss": 0.1028, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.684, | |
| "grad_norm": 0.5095385909080505, | |
| "learning_rate": 2.7487285579580635e-06, | |
| "loss": 0.095, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.686, | |
| "grad_norm": 0.6504735350608826, | |
| "learning_rate": 2.717615570241294e-06, | |
| "loss": 0.0971, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 0.6753683686256409, | |
| "learning_rate": 2.686613822721666e-06, | |
| "loss": 0.0995, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.6328535676002502, | |
| "learning_rate": 2.6557248263806175e-06, | |
| "loss": 0.1017, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.692, | |
| "grad_norm": 0.6124202013015747, | |
| "learning_rate": 2.6249500867042523e-06, | |
| "loss": 0.1084, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.694, | |
| "grad_norm": 0.510665774345398, | |
| "learning_rate": 2.5942911036099657e-06, | |
| "loss": 0.1038, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.696, | |
| "grad_norm": 0.5020245909690857, | |
| "learning_rate": 2.5637493713733376e-06, | |
| "loss": 0.096, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.698, | |
| "grad_norm": 0.6111953854560852, | |
| "learning_rate": 2.533326378555314e-06, | |
| "loss": 0.0977, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.49325230717658997, | |
| "learning_rate": 2.5030236079296443e-06, | |
| "loss": 0.0959, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.702, | |
| "grad_norm": 0.4303823709487915, | |
| "learning_rate": 2.4728425364106136e-06, | |
| "loss": 0.0924, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 0.5887752771377563, | |
| "learning_rate": 2.442784634981071e-06, | |
| "loss": 0.0964, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.706, | |
| "grad_norm": 0.7012388706207275, | |
| "learning_rate": 2.412851368620726e-06, | |
| "loss": 0.0958, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.708, | |
| "grad_norm": 0.4575226902961731, | |
| "learning_rate": 2.3830441962347528e-06, | |
| "loss": 0.1011, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.7122695446014404, | |
| "learning_rate": 2.353364570582681e-06, | |
| "loss": 0.0958, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.712, | |
| "grad_norm": 0.5158506631851196, | |
| "learning_rate": 2.323813938207593e-06, | |
| "loss": 0.094, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.714, | |
| "grad_norm": 1.2165895700454712, | |
| "learning_rate": 2.294393739365621e-06, | |
| "loss": 0.1015, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.716, | |
| "grad_norm": 0.4191378355026245, | |
| "learning_rate": 2.265105407955752e-06, | |
| "loss": 0.1023, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.718, | |
| "grad_norm": 0.5682792663574219, | |
| "learning_rate": 2.235950371449938e-06, | |
| "loss": 0.0987, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.4805779755115509, | |
| "learning_rate": 2.2069300508235273e-06, | |
| "loss": 0.0991, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.722, | |
| "grad_norm": 0.5979336500167847, | |
| "learning_rate": 2.1780458604860056e-06, | |
| "loss": 0.095, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.724, | |
| "grad_norm": 0.48464110493659973, | |
| "learning_rate": 2.14929920821206e-06, | |
| "loss": 0.1018, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.726, | |
| "grad_norm": 0.8631208539009094, | |
| "learning_rate": 2.1206914950729673e-06, | |
| "loss": 0.0965, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.728, | |
| "grad_norm": 0.5129687786102295, | |
| "learning_rate": 2.0922241153683064e-06, | |
| "loss": 0.1012, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.4799841642379761, | |
| "learning_rate": 2.063898456558002e-06, | |
| "loss": 0.0954, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.732, | |
| "grad_norm": 0.698362410068512, | |
| "learning_rate": 2.035715899194704e-06, | |
| "loss": 0.0922, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.734, | |
| "grad_norm": 0.5779376029968262, | |
| "learning_rate": 2.007677816856498e-06, | |
| "loss": 0.0908, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 0.37104320526123047, | |
| "learning_rate": 1.979785576079961e-06, | |
| "loss": 0.0907, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.738, | |
| "grad_norm": 0.9988909363746643, | |
| "learning_rate": 1.95204053629356e-06, | |
| "loss": 0.0991, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.6738276481628418, | |
| "learning_rate": 1.9244440497513895e-06, | |
| "loss": 0.0883, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.742, | |
| "grad_norm": 1.0084031820297241, | |
| "learning_rate": 1.896997461467272e-06, | |
| "loss": 0.0963, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.744, | |
| "grad_norm": 0.47921252250671387, | |
| "learning_rate": 1.8697021091491991e-06, | |
| "loss": 0.0951, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.746, | |
| "grad_norm": 0.697775661945343, | |
| "learning_rate": 1.842559323134136e-06, | |
| "loss": 0.0946, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.748, | |
| "grad_norm": 0.74412602186203, | |
| "learning_rate": 1.8155704263231777e-06, | |
| "loss": 0.0889, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.5915809273719788, | |
| "learning_rate": 1.7887367341170781e-06, | |
| "loss": 0.097, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 0.8403988480567932, | |
| "learning_rate": 1.762059554352143e-06, | |
| "loss": 0.1004, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.754, | |
| "grad_norm": 0.763522207736969, | |
| "learning_rate": 1.7355401872364759e-06, | |
| "loss": 0.0987, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.756, | |
| "grad_norm": 0.5236890316009521, | |
| "learning_rate": 1.709179925286617e-06, | |
| "loss": 0.0993, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.758, | |
| "grad_norm": 1.012764811515808, | |
| "learning_rate": 1.6829800532645447e-06, | |
| "loss": 0.1035, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.4698300361633301, | |
| "learning_rate": 1.6569418481150596e-06, | |
| "loss": 0.0944, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.762, | |
| "grad_norm": 0.545480489730835, | |
| "learning_rate": 1.6310665789035468e-06, | |
| "loss": 0.0961, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.764, | |
| "grad_norm": 1.2716432809829712, | |
| "learning_rate": 1.605355506754121e-06, | |
| "loss": 0.0964, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.766, | |
| "grad_norm": 0.6809777021408081, | |
| "learning_rate": 1.5798098847881664e-06, | |
| "loss": 0.0901, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.45785894989967346, | |
| "learning_rate": 1.554430958063259e-06, | |
| "loss": 0.0931, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.5503713488578796, | |
| "learning_rate": 1.529219963512481e-06, | |
| "loss": 0.0963, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.772, | |
| "grad_norm": 0.4882219731807709, | |
| "learning_rate": 1.5041781298841424e-06, | |
| "loss": 0.105, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.774, | |
| "grad_norm": 1.1814230680465698, | |
| "learning_rate": 1.4793066776818843e-06, | |
| "loss": 0.098, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.776, | |
| "grad_norm": 0.51359623670578, | |
| "learning_rate": 1.4546068191051988e-06, | |
| "loss": 0.0985, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.778, | |
| "grad_norm": 0.4637075960636139, | |
| "learning_rate": 1.4300797579903476e-06, | |
| "loss": 0.0955, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.7598605155944824, | |
| "learning_rate": 1.4057266897516842e-06, | |
| "loss": 0.103, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.782, | |
| "grad_norm": 0.4668324291706085, | |
| "learning_rate": 1.3815488013233986e-06, | |
| "loss": 0.0928, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 0.46290475130081177, | |
| "learning_rate": 1.3575472711016634e-06, | |
| "loss": 0.0921, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.786, | |
| "grad_norm": 0.5419790148735046, | |
| "learning_rate": 1.333723268887201e-06, | |
| "loss": 0.0909, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.788, | |
| "grad_norm": 0.6386012434959412, | |
| "learning_rate": 1.3100779558282673e-06, | |
| "loss": 0.0903, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.1641706228256226, | |
| "learning_rate": 1.2866124843640614e-06, | |
| "loss": 0.0965, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.792, | |
| "grad_norm": 0.8818916082382202, | |
| "learning_rate": 1.2633279981685608e-06, | |
| "loss": 0.1001, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.794, | |
| "grad_norm": 0.4657428562641144, | |
| "learning_rate": 1.240225632094773e-06, | |
| "loss": 0.0949, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.796, | |
| "grad_norm": 0.6301387548446655, | |
| "learning_rate": 1.217306512119425e-06, | |
| "loss": 0.1017, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.798, | |
| "grad_norm": 0.5192793011665344, | |
| "learning_rate": 1.1945717552880919e-06, | |
| "loss": 0.0949, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.445238322019577, | |
| "learning_rate": 1.1720224696607474e-06, | |
| "loss": 0.0933, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.802, | |
| "grad_norm": 0.7800849676132202, | |
| "learning_rate": 1.1496597542577603e-06, | |
| "loss": 0.0937, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.804, | |
| "grad_norm": 0.5425428748130798, | |
| "learning_rate": 1.1274846990063314e-06, | |
| "loss": 0.0927, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.806, | |
| "grad_norm": 0.5033382177352905, | |
| "learning_rate": 1.1054983846873684e-06, | |
| "loss": 0.0909, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.808, | |
| "grad_norm": 0.6877299547195435, | |
| "learning_rate": 1.0837018828828133e-06, | |
| "loss": 0.0905, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.5039352774620056, | |
| "learning_rate": 1.0620962559234144e-06, | |
| "loss": 0.0938, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.812, | |
| "grad_norm": 0.48495736718177795, | |
| "learning_rate": 1.0406825568369478e-06, | |
| "loss": 0.0947, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.814, | |
| "grad_norm": 0.3968163728713989, | |
| "learning_rate": 1.0194618292968972e-06, | |
| "loss": 0.0898, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 0.542452871799469, | |
| "learning_rate": 9.984351075715848e-07, | |
| "loss": 0.0916, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.818, | |
| "grad_norm": 0.6849305629730225, | |
| "learning_rate": 9.77603416473763e-07, | |
| "loss": 0.0968, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 1.279828667640686, | |
| "learning_rate": 9.569677713106673e-07, | |
| "loss": 0.0954, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.822, | |
| "grad_norm": 0.5056818723678589, | |
| "learning_rate": 9.365291778345303e-07, | |
| "loss": 0.0852, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.824, | |
| "grad_norm": 0.4796373248100281, | |
| "learning_rate": 9.162886321935632e-07, | |
| "loss": 0.0898, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.826, | |
| "grad_norm": 0.7538776397705078, | |
| "learning_rate": 8.962471208834056e-07, | |
| "loss": 0.0916, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.828, | |
| "grad_norm": 0.499845951795578, | |
| "learning_rate": 8.764056206990446e-07, | |
| "loss": 0.0985, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.0565054416656494, | |
| "learning_rate": 8.567650986872061e-07, | |
| "loss": 0.0913, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 0.47425204515457153, | |
| "learning_rate": 8.373265120992252e-07, | |
| "loss": 0.1015, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.834, | |
| "grad_norm": 0.5785176753997803, | |
| "learning_rate": 8.180908083443884e-07, | |
| "loss": 0.0904, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.836, | |
| "grad_norm": 0.5388156175613403, | |
| "learning_rate": 7.990589249437591e-07, | |
| "loss": 0.0947, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.838, | |
| "grad_norm": 0.4170565903186798, | |
| "learning_rate": 7.802317894844835e-07, | |
| "loss": 0.0899, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.5308743715286255, | |
| "learning_rate": 7.61610319574585e-07, | |
| "loss": 0.0929, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.842, | |
| "grad_norm": 0.583870530128479, | |
| "learning_rate": 7.43195422798233e-07, | |
| "loss": 0.0865, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.844, | |
| "grad_norm": 0.5489416122436523, | |
| "learning_rate": 7.249879966715174e-07, | |
| "loss": 0.1026, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.846, | |
| "grad_norm": 0.5027689933776855, | |
| "learning_rate": 7.069889285987025e-07, | |
| "loss": 0.1007, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 0.4889448583126068, | |
| "learning_rate": 6.891990958289724e-07, | |
| "loss": 0.0875, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 1.2699036598205566, | |
| "learning_rate": 6.716193654136788e-07, | |
| "loss": 0.0973, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.852, | |
| "grad_norm": 0.5371798276901245, | |
| "learning_rate": 6.542505941640803e-07, | |
| "loss": 0.0943, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.854, | |
| "grad_norm": 0.6962149739265442, | |
| "learning_rate": 6.370936286095842e-07, | |
| "loss": 0.0909, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.856, | |
| "grad_norm": 0.5354880094528198, | |
| "learning_rate": 6.201493049564883e-07, | |
| "loss": 0.093, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.858, | |
| "grad_norm": 0.627852737903595, | |
| "learning_rate": 6.034184490472195e-07, | |
| "loss": 0.1, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.5209593176841736, | |
| "learning_rate": 5.869018763200929e-07, | |
| "loss": 0.0944, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.862, | |
| "grad_norm": 0.5602114200592041, | |
| "learning_rate": 5.706003917695619e-07, | |
| "loss": 0.0944, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 0.4925883114337921, | |
| "learning_rate": 5.545147899069836e-07, | |
| "loss": 0.1006, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.866, | |
| "grad_norm": 0.5700224041938782, | |
| "learning_rate": 5.386458547219026e-07, | |
| "loss": 0.0908, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.868, | |
| "grad_norm": 1.1226917505264282, | |
| "learning_rate": 5.229943596438297e-07, | |
| "loss": 0.098, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.6325000524520874, | |
| "learning_rate": 5.075610675045567e-07, | |
| "loss": 0.0918, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.872, | |
| "grad_norm": 0.6880750060081482, | |
| "learning_rate": 4.92346730500966e-07, | |
| "loss": 0.0891, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.874, | |
| "grad_norm": 0.9880423545837402, | |
| "learning_rate": 4.773520901583801e-07, | |
| "loss": 0.0932, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.876, | |
| "grad_norm": 0.43653684854507446, | |
| "learning_rate": 4.625778772944156e-07, | |
| "loss": 0.0919, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.878, | |
| "grad_norm": 0.5817267298698425, | |
| "learning_rate": 4.480248119833641e-07, | |
| "loss": 0.0976, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.5401520729064941, | |
| "learning_rate": 4.33693603521097e-07, | |
| "loss": 0.0984, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.882, | |
| "grad_norm": 0.5201219320297241, | |
| "learning_rate": 4.195849503904975e-07, | |
| "loss": 0.0909, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.884, | |
| "grad_norm": 0.46396180987358093, | |
| "learning_rate": 4.056995402274122e-07, | |
| "loss": 0.092, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.886, | |
| "grad_norm": 0.8054051995277405, | |
| "learning_rate": 3.920380497871473e-07, | |
| "loss": 0.0924, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.888, | |
| "grad_norm": 1.6495290994644165, | |
| "learning_rate": 3.7860114491147017e-07, | |
| "loss": 0.0856, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.5194240212440491, | |
| "learning_rate": 3.6538948049616886e-07, | |
| "loss": 0.1016, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.892, | |
| "grad_norm": 0.9510396718978882, | |
| "learning_rate": 3.524037004591274e-07, | |
| "loss": 0.092, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.894, | |
| "grad_norm": 0.5282657146453857, | |
| "learning_rate": 3.396444377089453e-07, | |
| "loss": 0.1007, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 0.5371118187904358, | |
| "learning_rate": 3.271123141140886e-07, | |
| "loss": 0.0928, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.898, | |
| "grad_norm": 0.4065883457660675, | |
| "learning_rate": 3.148079404725801e-07, | |
| "loss": 0.0895, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.5543450117111206, | |
| "learning_rate": 3.027319164822329e-07, | |
| "loss": 0.0912, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.902, | |
| "grad_norm": 0.5739998817443848, | |
| "learning_rate": 2.908848307114198e-07, | |
| "loss": 0.0938, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.904, | |
| "grad_norm": 0.48197928071022034, | |
| "learning_rate": 2.792672605703867e-07, | |
| "loss": 0.0912, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.906, | |
| "grad_norm": 0.9697645902633667, | |
| "learning_rate": 2.6787977228311336e-07, | |
| "loss": 0.0929, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.908, | |
| "grad_norm": 0.7662408351898193, | |
| "learning_rate": 2.5672292085971276e-07, | |
| "loss": 0.0976, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.5820423364639282, | |
| "learning_rate": 2.457972500693834e-07, | |
| "loss": 0.0952, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 0.6280630826950073, | |
| "learning_rate": 2.351032924139063e-07, | |
| "loss": 0.0895, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.914, | |
| "grad_norm": 0.8524745106697083, | |
| "learning_rate": 2.2464156910168954e-07, | |
| "loss": 0.0931, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.916, | |
| "grad_norm": 0.8388736844062805, | |
| "learning_rate": 2.1441259002236924e-07, | |
| "loss": 0.0984, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.918, | |
| "grad_norm": 0.7623071074485779, | |
| "learning_rate": 2.0441685372195487e-07, | |
| "loss": 0.0914, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.7167821526527405, | |
| "learning_rate": 1.9465484737853092e-07, | |
| "loss": 0.092, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.922, | |
| "grad_norm": 0.9387440085411072, | |
| "learning_rate": 1.8512704677851489e-07, | |
| "loss": 0.0918, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.924, | |
| "grad_norm": 1.0186631679534912, | |
| "learning_rate": 1.758339162934658e-07, | |
| "loss": 0.0914, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.926, | |
| "grad_norm": 0.7060619592666626, | |
| "learning_rate": 1.6677590885745388e-07, | |
| "loss": 0.095, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 0.7647775411605835, | |
| "learning_rate": 1.5795346594498162e-07, | |
| "loss": 0.0942, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.5398228168487549, | |
| "learning_rate": 1.4936701754947104e-07, | |
| "loss": 0.0965, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.932, | |
| "grad_norm": 0.6493486762046814, | |
| "learning_rate": 1.4101698216230254e-07, | |
| "loss": 0.0975, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.934, | |
| "grad_norm": 0.5592340230941772, | |
| "learning_rate": 1.3290376675242022e-07, | |
| "loss": 0.091, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.936, | |
| "grad_norm": 0.6127602458000183, | |
| "learning_rate": 1.2502776674649776e-07, | |
| "loss": 0.0955, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.938, | |
| "grad_norm": 0.6314841508865356, | |
| "learning_rate": 1.1738936600966366e-07, | |
| "loss": 0.0947, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.6018589735031128, | |
| "learning_rate": 1.0998893682679479e-07, | |
| "loss": 0.1013, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.942, | |
| "grad_norm": 0.5869537591934204, | |
| "learning_rate": 1.0282683988436792e-07, | |
| "loss": 0.0952, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 0.5971767902374268, | |
| "learning_rate": 9.590342425288446e-08, | |
| "loss": 0.0855, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.946, | |
| "grad_norm": 0.6858376860618591, | |
| "learning_rate": 8.921902736985399e-08, | |
| "loss": 0.0964, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.948, | |
| "grad_norm": 0.6425265669822693, | |
| "learning_rate": 8.277397502335194e-08, | |
| "loss": 0.0873, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.6217800378799438, | |
| "learning_rate": 7.656858133613498e-08, | |
| "loss": 0.0932, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.952, | |
| "grad_norm": 0.7790467143058777, | |
| "learning_rate": 7.060314875033836e-08, | |
| "loss": 0.0884, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.954, | |
| "grad_norm": 0.7824919819831848, | |
| "learning_rate": 6.487796801272983e-08, | |
| "loss": 0.0994, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.956, | |
| "grad_norm": 1.0274417400360107, | |
| "learning_rate": 5.939331816054161e-08, | |
| "loss": 0.096, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.958, | |
| "grad_norm": 1.30277681350708, | |
| "learning_rate": 5.414946650786957e-08, | |
| "loss": 0.0875, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.7545578479766846, | |
| "learning_rate": 4.914666863264528e-08, | |
| "loss": 0.09, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.962, | |
| "grad_norm": 0.5341658592224121, | |
| "learning_rate": 4.438516836417994e-08, | |
| "loss": 0.0904, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.964, | |
| "grad_norm": 0.954708993434906, | |
| "learning_rate": 3.986519777127884e-08, | |
| "loss": 0.0878, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.966, | |
| "grad_norm": 3.0081846714019775, | |
| "learning_rate": 3.558697715093207e-08, | |
| "loss": 0.1018, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.968, | |
| "grad_norm": 0.6550202965736389, | |
| "learning_rate": 3.1550715017575895e-08, | |
| "loss": 0.0933, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.0143718719482422, | |
| "learning_rate": 2.7756608092933678e-08, | |
| "loss": 0.1012, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.0002, | |
| "grad_norm": 0.7272701859474182, | |
| "learning_rate": 2.4204841296424086e-08, | |
| "loss": 0.1023, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.0022, | |
| "grad_norm": 0.5501855611801147, | |
| "learning_rate": 2.0895587736149414e-08, | |
| "loss": 0.0984, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.0042, | |
| "grad_norm": 0.729179322719574, | |
| "learning_rate": 1.7829008700460116e-08, | |
| "loss": 0.0895, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.0062, | |
| "grad_norm": 0.6516703963279724, | |
| "learning_rate": 1.500525365009109e-08, | |
| "loss": 0.0921, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.0082, | |
| "grad_norm": 0.6412553191184998, | |
| "learning_rate": 1.2424460210881394e-08, | |
| "loss": 0.1001, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.0102, | |
| "grad_norm": 0.8296138644218445, | |
| "learning_rate": 1.008675416706073e-08, | |
| "loss": 0.0834, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.0122, | |
| "grad_norm": 0.6063240766525269, | |
| "learning_rate": 7.992249455124889e-09, | |
| "loss": 0.0831, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.0142, | |
| "grad_norm": 0.6920802593231201, | |
| "learning_rate": 6.141048158277429e-09, | |
| "loss": 0.0919, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.0162, | |
| "grad_norm": 0.6563531160354614, | |
| "learning_rate": 4.533240501459202e-09, | |
| "loss": 0.0952, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.0182, | |
| "grad_norm": 0.6383795142173767, | |
| "learning_rate": 3.1689048469457638e-09, | |
| "loss": 0.091, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.0202, | |
| "grad_norm": 0.6429073214530945, | |
| "learning_rate": 2.0481076905332074e-09, | |
| "loss": 0.0974, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.0222, | |
| "grad_norm": 0.4217958450317383, | |
| "learning_rate": 1.170903658293532e-09, | |
| "loss": 0.091, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.0242, | |
| "grad_norm": 0.648520827293396, | |
| "learning_rate": 5.373355039128836e-10, | |
| "loss": 0.0889, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.0262, | |
| "grad_norm": 0.6270639896392822, | |
| "learning_rate": 1.4743410661044454e-10, | |
| "loss": 0.0999, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.0282, | |
| "grad_norm": 0.7285539507865906, | |
| "learning_rate": 1.2184696296380083e-12, | |
| "loss": 0.1012, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.4735039884017795e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
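
The records above follow the shape of a Hugging Face Transformers `trainer_state.json`: a `log_history` list of per-logging-step entries (`epoch`, `grad_norm`, `learning_rate`, `loss`, `step`) plus trainer metadata such as `logging_steps`, `max_steps`, and `total_flos`. Over this span the training loss settles around 0.09 to 0.10 while the learning rate decays to effectively zero as the run reaches `max_steps` = 5000. Below is a minimal sketch for inspecting such a file, assuming the JSON is saved as `./trainer_state.json` (the path is an assumption) and that matplotlib is available; it is an illustrative reading of the data, not part of the original log.

```python
# Minimal sketch (assumptions: the state above is saved as ./trainer_state.json,
# and matplotlib is installed). Loads the trainer state and plots the logged
# training loss and learning-rate schedule against the optimizer step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # path is an assumption
    state = json.load(f)

# Keep only records that carry a training loss (evaluation records, if present,
# use different keys such as "eval_loss").
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()
```

With the values logged here, the loss curve is nearly flat in the final third of training while the learning-rate curve tapers toward zero, which is the expected picture for a decaying schedule that terminates exactly at `max_steps`.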