| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 20.837148463047743, | |
| "eval_steps": 500, | |
| "global_step": 4000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.052321778940483975, | |
| "grad_norm": 0.5172898769378662, | |
| "learning_rate": 4.9999695642048685e-05, | |
| "loss": 1.2268, | |
| "num_input_tokens_seen": 55264, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10464355788096795, | |
| "grad_norm": 0.2547127306461334, | |
| "learning_rate": 4.9998643550002796e-05, | |
| "loss": 0.163, | |
| "num_input_tokens_seen": 111136, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15696533682145192, | |
| "grad_norm": 0.19801495969295502, | |
| "learning_rate": 4.999683999797514e-05, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 167232, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.2092871157619359, | |
| "grad_norm": 0.17607145011425018, | |
| "learning_rate": 4.999428504018057e-05, | |
| "loss": 0.1571, | |
| "num_input_tokens_seen": 222368, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2616088947024199, | |
| "grad_norm": 0.123909592628479, | |
| "learning_rate": 4.999097875342117e-05, | |
| "loss": 0.1535, | |
| "num_input_tokens_seen": 280080, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.31393067364290383, | |
| "grad_norm": 0.15530619025230408, | |
| "learning_rate": 4.998692123708403e-05, | |
| "loss": 0.1592, | |
| "num_input_tokens_seen": 336144, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.36625245258338784, | |
| "grad_norm": 0.08514747023582458, | |
| "learning_rate": 4.998211261313822e-05, | |
| "loss": 0.1581, | |
| "num_input_tokens_seen": 392928, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4185742315238718, | |
| "grad_norm": 0.08711712062358856, | |
| "learning_rate": 4.997655302613111e-05, | |
| "loss": 0.1479, | |
| "num_input_tokens_seen": 448288, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.4708960104643558, | |
| "grad_norm": 0.14576469361782074, | |
| "learning_rate": 4.997024264318406e-05, | |
| "loss": 0.1574, | |
| "num_input_tokens_seen": 504112, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5232177894048398, | |
| "grad_norm": 0.14594055712223053, | |
| "learning_rate": 4.9963181653987373e-05, | |
| "loss": 0.1532, | |
| "num_input_tokens_seen": 559824, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5755395683453237, | |
| "grad_norm": 1.3401381969451904, | |
| "learning_rate": 4.99553702707946e-05, | |
| "loss": 0.1527, | |
| "num_input_tokens_seen": 615856, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6278613472858077, | |
| "grad_norm": 0.21902425587177277, | |
| "learning_rate": 4.9946808728416143e-05, | |
| "loss": 0.1464, | |
| "num_input_tokens_seen": 672448, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.6801831262262917, | |
| "grad_norm": 0.24769121408462524, | |
| "learning_rate": 4.993749728421224e-05, | |
| "loss": 0.1545, | |
| "num_input_tokens_seen": 727840, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7325049051667757, | |
| "grad_norm": 0.2848469018936157, | |
| "learning_rate": 4.992743621808518e-05, | |
| "loss": 0.1448, | |
| "num_input_tokens_seen": 784496, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7848266841072596, | |
| "grad_norm": 0.31721895933151245, | |
| "learning_rate": 4.991662583247092e-05, | |
| "loss": 0.1458, | |
| "num_input_tokens_seen": 840784, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8371484630477436, | |
| "grad_norm": 0.21474598348140717, | |
| "learning_rate": 4.9905066452329964e-05, | |
| "loss": 0.1445, | |
| "num_input_tokens_seen": 897920, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8894702419882276, | |
| "grad_norm": 0.5110652446746826, | |
| "learning_rate": 4.9892758425137643e-05, | |
| "loss": 0.1471, | |
| "num_input_tokens_seen": 953184, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9417920209287116, | |
| "grad_norm": 0.29877740144729614, | |
| "learning_rate": 4.987970212087363e-05, | |
| "loss": 0.1464, | |
| "num_input_tokens_seen": 1009168, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9941137998691956, | |
| "grad_norm": 0.16597020626068115, | |
| "learning_rate": 4.986589793201081e-05, | |
| "loss": 0.1407, | |
| "num_input_tokens_seen": 1065200, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0418574231523872, | |
| "grad_norm": 0.2527844309806824, | |
| "learning_rate": 4.985134627350353e-05, | |
| "loss": 0.1492, | |
| "num_input_tokens_seen": 1115736, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0941792020928711, | |
| "grad_norm": 0.19597752392292023, | |
| "learning_rate": 4.9836047582775084e-05, | |
| "loss": 0.1431, | |
| "num_input_tokens_seen": 1171656, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.146500981033355, | |
| "grad_norm": 0.23703065514564514, | |
| "learning_rate": 4.9820002319704576e-05, | |
| "loss": 0.1434, | |
| "num_input_tokens_seen": 1229224, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.198822759973839, | |
| "grad_norm": 0.19536039233207703, | |
| "learning_rate": 4.98032109666131e-05, | |
| "loss": 0.14, | |
| "num_input_tokens_seen": 1285992, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.251144538914323, | |
| "grad_norm": 0.3111051321029663, | |
| "learning_rate": 4.978567402824924e-05, | |
| "loss": 0.1355, | |
| "num_input_tokens_seen": 1341256, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.3034663178548072, | |
| "grad_norm": 0.19106437265872955, | |
| "learning_rate": 4.97673920317739e-05, | |
| "loss": 0.1425, | |
| "num_input_tokens_seen": 1396392, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.355788096795291, | |
| "grad_norm": 0.30303388833999634, | |
| "learning_rate": 4.9748365526744423e-05, | |
| "loss": 0.1343, | |
| "num_input_tokens_seen": 1454088, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.408109875735775, | |
| "grad_norm": 0.3765646815299988, | |
| "learning_rate": 4.972859508509816e-05, | |
| "loss": 0.1343, | |
| "num_input_tokens_seen": 1509656, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.460431654676259, | |
| "grad_norm": 0.3608955442905426, | |
| "learning_rate": 4.9708081301135155e-05, | |
| "loss": 0.1377, | |
| "num_input_tokens_seen": 1565976, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.512753433616743, | |
| "grad_norm": 0.19421738386154175, | |
| "learning_rate": 4.9686824791500396e-05, | |
| "loss": 0.1381, | |
| "num_input_tokens_seen": 1621544, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.565075212557227, | |
| "grad_norm": 0.6034450531005859, | |
| "learning_rate": 4.96648261951652e-05, | |
| "loss": 0.1339, | |
| "num_input_tokens_seen": 1677624, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6173969914977109, | |
| "grad_norm": 0.28776681423187256, | |
| "learning_rate": 4.964208617340803e-05, | |
| "loss": 0.1351, | |
| "num_input_tokens_seen": 1733192, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.669718770438195, | |
| "grad_norm": 0.15169182419776917, | |
| "learning_rate": 4.961860540979464e-05, | |
| "loss": 0.1406, | |
| "num_input_tokens_seen": 1788648, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7220405493786788, | |
| "grad_norm": 0.24536730349063873, | |
| "learning_rate": 4.9594384610157483e-05, | |
| "loss": 0.1354, | |
| "num_input_tokens_seen": 1844360, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.774362328319163, | |
| "grad_norm": 0.4901825487613678, | |
| "learning_rate": 4.9569424502574544e-05, | |
| "loss": 0.1318, | |
| "num_input_tokens_seen": 1900888, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.8266841072596467, | |
| "grad_norm": 0.33420756459236145, | |
| "learning_rate": 4.954372583734741e-05, | |
| "loss": 0.1324, | |
| "num_input_tokens_seen": 1956840, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8790058862001309, | |
| "grad_norm": 0.24992604553699493, | |
| "learning_rate": 4.951728938697872e-05, | |
| "loss": 0.1352, | |
| "num_input_tokens_seen": 2013624, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.9313276651406148, | |
| "grad_norm": 0.3095349669456482, | |
| "learning_rate": 4.9490115946148985e-05, | |
| "loss": 0.1332, | |
| "num_input_tokens_seen": 2069768, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9836494440810988, | |
| "grad_norm": 0.35465294122695923, | |
| "learning_rate": 4.946220633169266e-05, | |
| "loss": 0.128, | |
| "num_input_tokens_seen": 2125768, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.0313930673642906, | |
| "grad_norm": 0.2656111419200897, | |
| "learning_rate": 4.943356138257359e-05, | |
| "loss": 0.1356, | |
| "num_input_tokens_seen": 2176424, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.0837148463047743, | |
| "grad_norm": 0.16828453540802002, | |
| "learning_rate": 4.940418195985983e-05, | |
| "loss": 0.1332, | |
| "num_input_tokens_seen": 2232200, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.1360366252452585, | |
| "grad_norm": 0.22813495993614197, | |
| "learning_rate": 4.9374068946697695e-05, | |
| "loss": 0.1296, | |
| "num_input_tokens_seen": 2289432, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.1883584041857422, | |
| "grad_norm": 0.30772924423217773, | |
| "learning_rate": 4.934322324828529e-05, | |
| "loss": 0.1288, | |
| "num_input_tokens_seen": 2345576, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.2406801831262264, | |
| "grad_norm": 0.29362913966178894, | |
| "learning_rate": 4.931164579184523e-05, | |
| "loss": 0.1307, | |
| "num_input_tokens_seen": 2400696, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.29300196206671, | |
| "grad_norm": 0.21813659369945526, | |
| "learning_rate": 4.9279337526596814e-05, | |
| "loss": 0.1292, | |
| "num_input_tokens_seen": 2456552, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.3453237410071943, | |
| "grad_norm": 0.3129786550998688, | |
| "learning_rate": 4.924629942372748e-05, | |
| "loss": 0.1329, | |
| "num_input_tokens_seen": 2512808, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.397645519947678, | |
| "grad_norm": 0.19762465357780457, | |
| "learning_rate": 4.9212532476363596e-05, | |
| "loss": 0.1261, | |
| "num_input_tokens_seen": 2569016, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.4499672988881622, | |
| "grad_norm": 0.30003225803375244, | |
| "learning_rate": 4.917803769954062e-05, | |
| "loss": 0.124, | |
| "num_input_tokens_seen": 2625208, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.502289077828646, | |
| "grad_norm": 0.2731872797012329, | |
| "learning_rate": 4.9142816130172596e-05, | |
| "loss": 0.1285, | |
| "num_input_tokens_seen": 2680824, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.55461085676913, | |
| "grad_norm": 0.25849199295043945, | |
| "learning_rate": 4.9106868827020955e-05, | |
| "loss": 0.1323, | |
| "num_input_tokens_seen": 2737304, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.6069326357096143, | |
| "grad_norm": 0.24799242615699768, | |
| "learning_rate": 4.907019687066271e-05, | |
| "loss": 0.1299, | |
| "num_input_tokens_seen": 2793432, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.659254414650098, | |
| "grad_norm": 0.1981252282857895, | |
| "learning_rate": 4.9032801363458e-05, | |
| "loss": 0.1281, | |
| "num_input_tokens_seen": 2850008, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.711576193590582, | |
| "grad_norm": 0.40522608160972595, | |
| "learning_rate": 4.8994683429516896e-05, | |
| "loss": 0.1309, | |
| "num_input_tokens_seen": 2905304, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.763897972531066, | |
| "grad_norm": 0.2624233067035675, | |
| "learning_rate": 4.895584421466565e-05, | |
| "loss": 0.1271, | |
| "num_input_tokens_seen": 2961112, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.81621975147155, | |
| "grad_norm": 0.28190159797668457, | |
| "learning_rate": 4.8916284886412214e-05, | |
| "loss": 0.1222, | |
| "num_input_tokens_seen": 3017208, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.868541530412034, | |
| "grad_norm": 0.21756145358085632, | |
| "learning_rate": 4.887600663391122e-05, | |
| "loss": 0.1288, | |
| "num_input_tokens_seen": 3074216, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.920863309352518, | |
| "grad_norm": 0.33730047941207886, | |
| "learning_rate": 4.883501066792814e-05, | |
| "loss": 0.1267, | |
| "num_input_tokens_seen": 3129784, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.973185088293002, | |
| "grad_norm": 0.24758093059062958, | |
| "learning_rate": 4.8793298220802963e-05, | |
| "loss": 0.1288, | |
| "num_input_tokens_seen": 3187048, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 3.0209287115761936, | |
| "grad_norm": 0.15806905925273895, | |
| "learning_rate": 4.87508705464131e-05, | |
| "loss": 0.1316, | |
| "num_input_tokens_seen": 3238608, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.0732504905166778, | |
| "grad_norm": 0.28546327352523804, | |
| "learning_rate": 4.8707728920135744e-05, | |
| "loss": 0.1256, | |
| "num_input_tokens_seen": 3294400, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.1255722694571615, | |
| "grad_norm": 0.3583323359489441, | |
| "learning_rate": 4.866387463880947e-05, | |
| "loss": 0.1223, | |
| "num_input_tokens_seen": 3350560, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.1778940483976457, | |
| "grad_norm": 0.21477927267551422, | |
| "learning_rate": 4.861930902069531e-05, | |
| "loss": 0.1192, | |
| "num_input_tokens_seen": 3406256, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.2302158273381294, | |
| "grad_norm": 0.3112389147281647, | |
| "learning_rate": 4.8574033405437094e-05, | |
| "loss": 0.1209, | |
| "num_input_tokens_seen": 3461680, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.2825376062786136, | |
| "grad_norm": 0.22634848952293396, | |
| "learning_rate": 4.8528049154021186e-05, | |
| "loss": 0.1318, | |
| "num_input_tokens_seen": 3517984, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.3348593852190973, | |
| "grad_norm": 0.2349083423614502, | |
| "learning_rate": 4.848135764873557e-05, | |
| "loss": 0.1264, | |
| "num_input_tokens_seen": 3573376, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.3871811641595815, | |
| "grad_norm": 0.29488080739974976, | |
| "learning_rate": 4.843396029312832e-05, | |
| "loss": 0.1238, | |
| "num_input_tokens_seen": 3630544, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.439502943100065, | |
| "grad_norm": 0.28836989402770996, | |
| "learning_rate": 4.838585851196537e-05, | |
| "loss": 0.124, | |
| "num_input_tokens_seen": 3686432, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.4918247220405494, | |
| "grad_norm": 0.2634967267513275, | |
| "learning_rate": 4.833705375118772e-05, | |
| "loss": 0.1212, | |
| "num_input_tokens_seen": 3741552, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.544146500981033, | |
| "grad_norm": 0.30187705159187317, | |
| "learning_rate": 4.828754747786796e-05, | |
| "loss": 0.1225, | |
| "num_input_tokens_seen": 3797760, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.5964682799215173, | |
| "grad_norm": 0.31589755415916443, | |
| "learning_rate": 4.823734118016616e-05, | |
| "loss": 0.1236, | |
| "num_input_tokens_seen": 3854704, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.6487900588620015, | |
| "grad_norm": 0.24722936749458313, | |
| "learning_rate": 4.818643636728515e-05, | |
| "loss": 0.1154, | |
| "num_input_tokens_seen": 3910544, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.701111837802485, | |
| "grad_norm": 0.20096950232982635, | |
| "learning_rate": 4.813483456942515e-05, | |
| "loss": 0.119, | |
| "num_input_tokens_seen": 3966448, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.7534336167429694, | |
| "grad_norm": 0.22696824371814728, | |
| "learning_rate": 4.808253733773775e-05, | |
| "loss": 0.1277, | |
| "num_input_tokens_seen": 4022880, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.805755395683453, | |
| "grad_norm": 0.2600644826889038, | |
| "learning_rate": 4.8029546244279346e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 4079280, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.8580771746239373, | |
| "grad_norm": 0.3461725115776062, | |
| "learning_rate": 4.797586288196378e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 4136096, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.910398953564421, | |
| "grad_norm": 0.20807071030139923, | |
| "learning_rate": 4.792148886451456e-05, | |
| "loss": 0.1238, | |
| "num_input_tokens_seen": 4192832, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.962720732504905, | |
| "grad_norm": 0.25355812907218933, | |
| "learning_rate": 4.7866425826416316e-05, | |
| "loss": 0.1249, | |
| "num_input_tokens_seen": 4248368, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 4.010464355788097, | |
| "grad_norm": 0.28272193670272827, | |
| "learning_rate": 4.781067542286561e-05, | |
| "loss": 0.1245, | |
| "num_input_tokens_seen": 4299232, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 4.062786134728581, | |
| "grad_norm": 0.39174169301986694, | |
| "learning_rate": 4.7754239329721274e-05, | |
| "loss": 0.1216, | |
| "num_input_tokens_seen": 4356192, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.115107913669065, | |
| "grad_norm": 0.2807229459285736, | |
| "learning_rate": 4.769711924345397e-05, | |
| "loss": 0.1195, | |
| "num_input_tokens_seen": 4411648, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.167429692609549, | |
| "grad_norm": 0.24898973107337952, | |
| "learning_rate": 4.763931688109524e-05, | |
| "loss": 0.1174, | |
| "num_input_tokens_seen": 4467568, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.219751471550032, | |
| "grad_norm": 0.3005022704601288, | |
| "learning_rate": 4.7580833980185816e-05, | |
| "loss": 0.1251, | |
| "num_input_tokens_seen": 4522624, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 4.272073250490517, | |
| "grad_norm": 0.3487663269042969, | |
| "learning_rate": 4.7521672298723495e-05, | |
| "loss": 0.1182, | |
| "num_input_tokens_seen": 4578640, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.324395029431001, | |
| "grad_norm": 0.19975335896015167, | |
| "learning_rate": 4.7461833615110194e-05, | |
| "loss": 0.1211, | |
| "num_input_tokens_seen": 4633712, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 4.3767168083714845, | |
| "grad_norm": 0.2742094397544861, | |
| "learning_rate": 4.740131972809856e-05, | |
| "loss": 0.1208, | |
| "num_input_tokens_seen": 4690160, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.429038587311968, | |
| "grad_norm": 0.49232161045074463, | |
| "learning_rate": 4.734013245673788e-05, | |
| "loss": 0.1213, | |
| "num_input_tokens_seen": 4747104, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.481360366252453, | |
| "grad_norm": 0.3197735548019409, | |
| "learning_rate": 4.727827364031936e-05, | |
| "loss": 0.1137, | |
| "num_input_tokens_seen": 4803376, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.5336821451929366, | |
| "grad_norm": 0.43819674849510193, | |
| "learning_rate": 4.721574513832091e-05, | |
| "loss": 0.1163, | |
| "num_input_tokens_seen": 4859840, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 4.58600392413342, | |
| "grad_norm": 0.21390356123447418, | |
| "learning_rate": 4.715254883035119e-05, | |
| "loss": 0.121, | |
| "num_input_tokens_seen": 4916272, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.638325703073905, | |
| "grad_norm": 0.30768075585365295, | |
| "learning_rate": 4.708868661609314e-05, | |
| "loss": 0.1194, | |
| "num_input_tokens_seen": 4971728, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 4.690647482014389, | |
| "grad_norm": 0.34879791736602783, | |
| "learning_rate": 4.702416041524683e-05, | |
| "loss": 0.1223, | |
| "num_input_tokens_seen": 5027680, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.742969260954872, | |
| "grad_norm": 0.23973870277404785, | |
| "learning_rate": 4.695897216747183e-05, | |
| "loss": 0.1225, | |
| "num_input_tokens_seen": 5083392, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 4.795291039895356, | |
| "grad_norm": 0.33110499382019043, | |
| "learning_rate": 4.689312383232883e-05, | |
| "loss": 0.1248, | |
| "num_input_tokens_seen": 5140112, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.847612818835841, | |
| "grad_norm": 0.2787615954875946, | |
| "learning_rate": 4.682661738922078e-05, | |
| "loss": 0.1204, | |
| "num_input_tokens_seen": 5195072, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 4.8999345977763245, | |
| "grad_norm": 0.20854991674423218, | |
| "learning_rate": 4.6759454837333376e-05, | |
| "loss": 0.1181, | |
| "num_input_tokens_seen": 5251408, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.952256376716808, | |
| "grad_norm": 0.3582795262336731, | |
| "learning_rate": 4.6691638195574963e-05, | |
| "loss": 0.118, | |
| "num_input_tokens_seen": 5307776, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.18520788848400116, | |
| "learning_rate": 4.662316950251584e-05, | |
| "loss": 0.1192, | |
| "num_input_tokens_seen": 5359280, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 5.052321778940484, | |
| "grad_norm": 0.2574616074562073, | |
| "learning_rate": 4.655405081632699e-05, | |
| "loss": 0.1179, | |
| "num_input_tokens_seen": 5415968, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 5.104643557880968, | |
| "grad_norm": 0.3383731544017792, | |
| "learning_rate": 4.648428421471822e-05, | |
| "loss": 0.1137, | |
| "num_input_tokens_seen": 5471840, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.156965336821452, | |
| "grad_norm": 0.3150075376033783, | |
| "learning_rate": 4.641387179487569e-05, | |
| "loss": 0.1179, | |
| "num_input_tokens_seen": 5528224, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 5.209287115761936, | |
| "grad_norm": 0.2862718999385834, | |
| "learning_rate": 4.634281567339885e-05, | |
| "loss": 0.1117, | |
| "num_input_tokens_seen": 5583680, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.2616088947024195, | |
| "grad_norm": 0.32703572511672974, | |
| "learning_rate": 4.627111798623688e-05, | |
| "loss": 0.12, | |
| "num_input_tokens_seen": 5638640, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 5.313930673642904, | |
| "grad_norm": 0.3088448941707611, | |
| "learning_rate": 4.619878088862443e-05, | |
| "loss": 0.1134, | |
| "num_input_tokens_seen": 5694208, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.366252452583388, | |
| "grad_norm": 0.3116937279701233, | |
| "learning_rate": 4.612580655501683e-05, | |
| "loss": 0.1178, | |
| "num_input_tokens_seen": 5749696, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 5.418574231523872, | |
| "grad_norm": 0.3191389739513397, | |
| "learning_rate": 4.605219717902476e-05, | |
| "loss": 0.1136, | |
| "num_input_tokens_seen": 5805264, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.470896010464356, | |
| "grad_norm": 0.31546375155448914, | |
| "learning_rate": 4.5977954973348294e-05, | |
| "loss": 0.1167, | |
| "num_input_tokens_seen": 5861616, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.52321778940484, | |
| "grad_norm": 0.2683027386665344, | |
| "learning_rate": 4.590308216971038e-05, | |
| "loss": 0.1113, | |
| "num_input_tokens_seen": 5918816, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.575539568345324, | |
| "grad_norm": 0.26271599531173706, | |
| "learning_rate": 4.582758101878977e-05, | |
| "loss": 0.1113, | |
| "num_input_tokens_seen": 5975184, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 5.6278613472858074, | |
| "grad_norm": 0.2612985372543335, | |
| "learning_rate": 4.5751453790153325e-05, | |
| "loss": 0.1143, | |
| "num_input_tokens_seen": 6030736, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.680183126226292, | |
| "grad_norm": 0.29255977272987366, | |
| "learning_rate": 4.567470277218786e-05, | |
| "loss": 0.1159, | |
| "num_input_tokens_seen": 6086848, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 5.732504905166776, | |
| "grad_norm": 0.31331729888916016, | |
| "learning_rate": 4.55973302720313e-05, | |
| "loss": 0.1114, | |
| "num_input_tokens_seen": 6142544, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.7848266841072595, | |
| "grad_norm": 0.24986866116523743, | |
| "learning_rate": 4.551933861550333e-05, | |
| "loss": 0.1173, | |
| "num_input_tokens_seen": 6199856, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 5.837148463047743, | |
| "grad_norm": 0.26383090019226074, | |
| "learning_rate": 4.5440730147035516e-05, | |
| "loss": 0.1166, | |
| "num_input_tokens_seen": 6255488, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 5.889470241988228, | |
| "grad_norm": 0.28010377287864685, | |
| "learning_rate": 4.5361507229600784e-05, | |
| "loss": 0.1148, | |
| "num_input_tokens_seen": 6311696, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 5.941792020928712, | |
| "grad_norm": 0.2781098783016205, | |
| "learning_rate": 4.528167224464245e-05, | |
| "loss": 0.1152, | |
| "num_input_tokens_seen": 6368064, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 5.994113799869195, | |
| "grad_norm": 0.35192057490348816, | |
| "learning_rate": 4.520122759200256e-05, | |
| "loss": 0.1087, | |
| "num_input_tokens_seen": 6424048, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 6.041857423152387, | |
| "grad_norm": 0.408779114484787, | |
| "learning_rate": 4.512017568984982e-05, | |
| "loss": 0.1094, | |
| "num_input_tokens_seen": 6475464, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 6.094179202092871, | |
| "grad_norm": 0.5394303798675537, | |
| "learning_rate": 4.503851897460686e-05, | |
| "loss": 0.1005, | |
| "num_input_tokens_seen": 6531112, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 6.1465009810333555, | |
| "grad_norm": 0.4771885871887207, | |
| "learning_rate": 4.4956259900877005e-05, | |
| "loss": 0.107, | |
| "num_input_tokens_seen": 6587352, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.198822759973839, | |
| "grad_norm": 0.4851493835449219, | |
| "learning_rate": 4.4873400941370506e-05, | |
| "loss": 0.1093, | |
| "num_input_tokens_seen": 6643608, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 6.251144538914323, | |
| "grad_norm": 0.47404423356056213, | |
| "learning_rate": 4.4789944586830196e-05, | |
| "loss": 0.1082, | |
| "num_input_tokens_seen": 6700616, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.303466317854807, | |
| "grad_norm": 0.4265010356903076, | |
| "learning_rate": 4.470589334595662e-05, | |
| "loss": 0.1088, | |
| "num_input_tokens_seen": 6756344, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 6.355788096795291, | |
| "grad_norm": 0.2776981592178345, | |
| "learning_rate": 4.462124974533261e-05, | |
| "loss": 0.1124, | |
| "num_input_tokens_seen": 6813144, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.408109875735775, | |
| "grad_norm": 0.3706200122833252, | |
| "learning_rate": 4.453601632934737e-05, | |
| "loss": 0.1095, | |
| "num_input_tokens_seen": 6868280, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 6.460431654676259, | |
| "grad_norm": 0.4983665943145752, | |
| "learning_rate": 4.4450195660119965e-05, | |
| "loss": 0.1114, | |
| "num_input_tokens_seen": 6924296, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.5127534336167425, | |
| "grad_norm": 0.38131120800971985, | |
| "learning_rate": 4.4363790317422314e-05, | |
| "loss": 0.1141, | |
| "num_input_tokens_seen": 6980392, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.565075212557227, | |
| "grad_norm": 0.24885649979114532, | |
| "learning_rate": 4.427680289860163e-05, | |
| "loss": 0.1128, | |
| "num_input_tokens_seen": 7036056, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 6.617396991497711, | |
| "grad_norm": 0.4561365246772766, | |
| "learning_rate": 4.4189236018502356e-05, | |
| "loss": 0.1149, | |
| "num_input_tokens_seen": 7092152, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 6.669718770438195, | |
| "grad_norm": 0.548459529876709, | |
| "learning_rate": 4.410109230938755e-05, | |
| "loss": 0.1079, | |
| "num_input_tokens_seen": 7147096, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 6.722040549378679, | |
| "grad_norm": 0.46682003140449524, | |
| "learning_rate": 4.4012374420859786e-05, | |
| "loss": 0.1061, | |
| "num_input_tokens_seen": 7202584, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 6.774362328319163, | |
| "grad_norm": 0.32285481691360474, | |
| "learning_rate": 4.392308501978148e-05, | |
| "loss": 0.1098, | |
| "num_input_tokens_seen": 7258552, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.826684107259647, | |
| "grad_norm": 0.3166508078575134, | |
| "learning_rate": 4.383322679019472e-05, | |
| "loss": 0.1119, | |
| "num_input_tokens_seen": 7315768, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 6.87900588620013, | |
| "grad_norm": 0.35454803705215454, | |
| "learning_rate": 4.3742802433240625e-05, | |
| "loss": 0.1107, | |
| "num_input_tokens_seen": 7371352, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 6.931327665140615, | |
| "grad_norm": 0.2988300919532776, | |
| "learning_rate": 4.3651814667078086e-05, | |
| "loss": 0.1085, | |
| "num_input_tokens_seen": 7427800, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 6.983649444081099, | |
| "grad_norm": 0.3938722014427185, | |
| "learning_rate": 4.35602662268021e-05, | |
| "loss": 0.1064, | |
| "num_input_tokens_seen": 7484616, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 7.031393067364291, | |
| "grad_norm": 0.45549729466438293, | |
| "learning_rate": 4.346815986436158e-05, | |
| "loss": 0.1041, | |
| "num_input_tokens_seen": 7534896, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 7.083714846304774, | |
| "grad_norm": 0.6130596995353699, | |
| "learning_rate": 4.337549834847655e-05, | |
| "loss": 0.0977, | |
| "num_input_tokens_seen": 7591328, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 7.136036625245258, | |
| "grad_norm": 0.6437245607376099, | |
| "learning_rate": 4.328228446455498e-05, | |
| "loss": 0.0979, | |
| "num_input_tokens_seen": 7647760, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 7.188358404185743, | |
| "grad_norm": 0.5056234002113342, | |
| "learning_rate": 4.3188521014609054e-05, | |
| "loss": 0.0994, | |
| "num_input_tokens_seen": 7704672, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.240680183126226, | |
| "grad_norm": 0.5103983879089355, | |
| "learning_rate": 4.309421081717091e-05, | |
| "loss": 0.1001, | |
| "num_input_tokens_seen": 7761104, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 7.29300196206671, | |
| "grad_norm": 0.47358736395835876, | |
| "learning_rate": 4.299935670720794e-05, | |
| "loss": 0.0926, | |
| "num_input_tokens_seen": 7818224, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.345323741007194, | |
| "grad_norm": 0.6868699789047241, | |
| "learning_rate": 4.290396153603755e-05, | |
| "loss": 0.0975, | |
| "num_input_tokens_seen": 7874208, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 7.3976455199476785, | |
| "grad_norm": 0.39971715211868286, | |
| "learning_rate": 4.280802817124149e-05, | |
| "loss": 0.1012, | |
| "num_input_tokens_seen": 7930496, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.449967298888162, | |
| "grad_norm": 0.6476240754127502, | |
| "learning_rate": 4.271155949657959e-05, | |
| "loss": 0.1033, | |
| "num_input_tokens_seen": 7985552, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 7.502289077828646, | |
| "grad_norm": 0.5134351253509521, | |
| "learning_rate": 4.261455841190314e-05, | |
| "loss": 0.0966, | |
| "num_input_tokens_seen": 8041568, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 7.554610856769131, | |
| "grad_norm": 0.6232761144638062, | |
| "learning_rate": 4.2517027833067685e-05, | |
| "loss": 0.1001, | |
| "num_input_tokens_seen": 8098656, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 7.606932635709614, | |
| "grad_norm": 0.4906207025051117, | |
| "learning_rate": 4.241897069184537e-05, | |
| "loss": 0.1072, | |
| "num_input_tokens_seen": 8154032, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 7.659254414650098, | |
| "grad_norm": 0.6239527463912964, | |
| "learning_rate": 4.2320389935836836e-05, | |
| "loss": 0.1006, | |
| "num_input_tokens_seen": 8210032, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 7.711576193590582, | |
| "grad_norm": 1.2038999795913696, | |
| "learning_rate": 4.2221288528382584e-05, | |
| "loss": 0.1015, | |
| "num_input_tokens_seen": 8265296, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 7.763897972531066, | |
| "grad_norm": 0.365824431180954, | |
| "learning_rate": 4.212166944847392e-05, | |
| "loss": 0.0973, | |
| "num_input_tokens_seen": 8321840, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 7.81621975147155, | |
| "grad_norm": 0.4954347610473633, | |
| "learning_rate": 4.2021535690663414e-05, | |
| "loss": 0.1015, | |
| "num_input_tokens_seen": 8378064, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.868541530412034, | |
| "grad_norm": 0.5742694139480591, | |
| "learning_rate": 4.192089026497484e-05, | |
| "loss": 0.0997, | |
| "num_input_tokens_seen": 8434384, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 7.920863309352518, | |
| "grad_norm": 0.48765209317207336, | |
| "learning_rate": 4.181973619681276e-05, | |
| "loss": 0.1003, | |
| "num_input_tokens_seen": 8490496, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 7.973185088293002, | |
| "grad_norm": 0.4735221266746521, | |
| "learning_rate": 4.171807652687151e-05, | |
| "loss": 0.1051, | |
| "num_input_tokens_seen": 8545600, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 8.020928711576193, | |
| "grad_norm": 0.35401248931884766, | |
| "learning_rate": 4.1615914311043855e-05, | |
| "loss": 0.0967, | |
| "num_input_tokens_seen": 8595848, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 8.073250490516678, | |
| "grad_norm": 0.7497507929801941, | |
| "learning_rate": 4.151325262032908e-05, | |
| "loss": 0.0834, | |
| "num_input_tokens_seen": 8652536, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 8.125572269457162, | |
| "grad_norm": 0.7235395908355713, | |
| "learning_rate": 4.1410094540740726e-05, | |
| "loss": 0.0804, | |
| "num_input_tokens_seen": 8708952, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 8.177894048397645, | |
| "grad_norm": 0.9055391550064087, | |
| "learning_rate": 4.1306443173213785e-05, | |
| "loss": 0.085, | |
| "num_input_tokens_seen": 8765688, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 8.23021582733813, | |
| "grad_norm": 0.8378714919090271, | |
| "learning_rate": 4.1202301633511506e-05, | |
| "loss": 0.0813, | |
| "num_input_tokens_seen": 8822376, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 8.282537606278613, | |
| "grad_norm": 0.6385327577590942, | |
| "learning_rate": 4.109767305213173e-05, | |
| "loss": 0.0831, | |
| "num_input_tokens_seen": 8878456, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 8.334859385219097, | |
| "grad_norm": 0.8356183767318726, | |
| "learning_rate": 4.0992560574212764e-05, | |
| "loss": 0.0893, | |
| "num_input_tokens_seen": 8934088, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.387181164159582, | |
| "grad_norm": 0.8791753649711609, | |
| "learning_rate": 4.0886967359438885e-05, | |
| "loss": 0.087, | |
| "num_input_tokens_seen": 8990120, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 8.439502943100065, | |
| "grad_norm": 0.6865050792694092, | |
| "learning_rate": 4.078089658194533e-05, | |
| "loss": 0.0873, | |
| "num_input_tokens_seen": 9045848, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 8.49182472204055, | |
| "grad_norm": 0.7075687050819397, | |
| "learning_rate": 4.0674351430222864e-05, | |
| "loss": 0.0855, | |
| "num_input_tokens_seen": 9102456, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 8.544146500981034, | |
| "grad_norm": 0.7803745269775391, | |
| "learning_rate": 4.0567335107021986e-05, | |
| "loss": 0.0887, | |
| "num_input_tokens_seen": 9158712, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 8.596468279921517, | |
| "grad_norm": 0.6807383894920349, | |
| "learning_rate": 4.0459850829256604e-05, | |
| "loss": 0.0826, | |
| "num_input_tokens_seen": 9215496, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 8.648790058862001, | |
| "grad_norm": 0.777777373790741, | |
| "learning_rate": 4.035190182790738e-05, | |
| "loss": 0.0922, | |
| "num_input_tokens_seen": 9269720, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 8.701111837802486, | |
| "grad_norm": 0.6249901056289673, | |
| "learning_rate": 4.024349134792453e-05, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 9325624, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 8.753433616742969, | |
| "grad_norm": 0.7085604667663574, | |
| "learning_rate": 4.0134622648130394e-05, | |
| "loss": 0.0836, | |
| "num_input_tokens_seen": 9380984, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 8.805755395683454, | |
| "grad_norm": 0.9964501261711121, | |
| "learning_rate": 4.0025299001121365e-05, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 9437320, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 8.858077174623936, | |
| "grad_norm": 1.0215933322906494, | |
| "learning_rate": 3.991552369316958e-05, | |
| "loss": 0.0926, | |
| "num_input_tokens_seen": 9492888, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 8.910398953564421, | |
| "grad_norm": 0.7546895146369934, | |
| "learning_rate": 3.9805300024124125e-05, | |
| "loss": 0.0893, | |
| "num_input_tokens_seen": 9550504, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 8.962720732504906, | |
| "grad_norm": 0.8502705693244934, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 0.0948, | |
| "num_input_tokens_seen": 9606120, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 9.010464355788097, | |
| "grad_norm": 0.5291112065315247, | |
| "learning_rate": 3.9583520869437666e-05, | |
| "loss": 0.0893, | |
| "num_input_tokens_seen": 9656472, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 9.062786134728581, | |
| "grad_norm": 0.6509256958961487, | |
| "learning_rate": 3.9471972050484764e-05, | |
| "loss": 0.063, | |
| "num_input_tokens_seen": 9712712, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 9.115107913669064, | |
| "grad_norm": 1.1916331052780151, | |
| "learning_rate": 3.9359988203614e-05, | |
| "loss": 0.0602, | |
| "num_input_tokens_seen": 9768584, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 9.167429692609549, | |
| "grad_norm": 0.8077856302261353, | |
| "learning_rate": 3.924757269506319e-05, | |
| "loss": 0.0643, | |
| "num_input_tokens_seen": 9824360, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 9.219751471550033, | |
| "grad_norm": 1.0554853677749634, | |
| "learning_rate": 3.913472890404593e-05, | |
| "loss": 0.0668, | |
| "num_input_tokens_seen": 9880328, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 9.272073250490516, | |
| "grad_norm": 0.8842382431030273, | |
| "learning_rate": 3.9021460222649986e-05, | |
| "loss": 0.0633, | |
| "num_input_tokens_seen": 9936248, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 9.324395029431, | |
| "grad_norm": 0.9562489986419678, | |
| "learning_rate": 3.890777005573537e-05, | |
| "loss": 0.0625, | |
| "num_input_tokens_seen": 9992088, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 9.376716808371485, | |
| "grad_norm": 0.8844836950302124, | |
| "learning_rate": 3.8793661820831915e-05, | |
| "loss": 0.0718, | |
| "num_input_tokens_seen": 10048520, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.429038587311968, | |
| "grad_norm": 1.2325676679611206, | |
| "learning_rate": 3.867913894803663e-05, | |
| "loss": 0.0646, | |
| "num_input_tokens_seen": 10104776, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 9.481360366252453, | |
| "grad_norm": 0.7227656245231628, | |
| "learning_rate": 3.8564204879910535e-05, | |
| "loss": 0.0657, | |
| "num_input_tokens_seen": 10160888, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 9.533682145192937, | |
| "grad_norm": 0.9736649990081787, | |
| "learning_rate": 3.844886307137519e-05, | |
| "loss": 0.0691, | |
| "num_input_tokens_seen": 10216920, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 9.58600392413342, | |
| "grad_norm": 1.0427885055541992, | |
| "learning_rate": 3.833311698960888e-05, | |
| "loss": 0.0667, | |
| "num_input_tokens_seen": 10273880, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 9.638325703073905, | |
| "grad_norm": 0.9749732613563538, | |
| "learning_rate": 3.8216970113942284e-05, | |
| "loss": 0.0699, | |
| "num_input_tokens_seen": 10329752, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 9.690647482014388, | |
| "grad_norm": 0.6216735243797302, | |
| "learning_rate": 3.8100425935754025e-05, | |
| "loss": 0.072, | |
| "num_input_tokens_seen": 10387128, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 9.742969260954872, | |
| "grad_norm": 0.8047035932540894, | |
| "learning_rate": 3.798348795836562e-05, | |
| "loss": 0.0697, | |
| "num_input_tokens_seen": 10443144, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 9.795291039895357, | |
| "grad_norm": 0.793222188949585, | |
| "learning_rate": 3.786615969693621e-05, | |
| "loss": 0.0709, | |
| "num_input_tokens_seen": 10497784, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 9.84761281883584, | |
| "grad_norm": 0.7294532656669617, | |
| "learning_rate": 3.7748444678356886e-05, | |
| "loss": 0.0716, | |
| "num_input_tokens_seen": 10553416, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 9.899934597776324, | |
| "grad_norm": 1.014732003211975, | |
| "learning_rate": 3.7630346441144656e-05, | |
| "loss": 0.0695, | |
| "num_input_tokens_seen": 10610168, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 9.952256376716809, | |
| "grad_norm": 1.22804856300354, | |
| "learning_rate": 3.7511868535336134e-05, | |
| "loss": 0.0731, | |
| "num_input_tokens_seen": 10665912, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.7104299068450928, | |
| "learning_rate": 3.7393014522380734e-05, | |
| "loss": 0.0652, | |
| "num_input_tokens_seen": 10717424, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 10.052321778940485, | |
| "grad_norm": 0.9654238224029541, | |
| "learning_rate": 3.7273787975033686e-05, | |
| "loss": 0.0435, | |
| "num_input_tokens_seen": 10772656, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 10.104643557880967, | |
| "grad_norm": 1.2412711381912231, | |
| "learning_rate": 3.7154192477248614e-05, | |
| "loss": 0.0425, | |
| "num_input_tokens_seen": 10829200, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 10.156965336821452, | |
| "grad_norm": 0.6960983276367188, | |
| "learning_rate": 3.7034231624069796e-05, | |
| "loss": 0.0376, | |
| "num_input_tokens_seen": 10884064, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 10.209287115761937, | |
| "grad_norm": 1.214837908744812, | |
| "learning_rate": 3.691390902152412e-05, | |
| "loss": 0.0487, | |
| "num_input_tokens_seen": 10939824, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 10.26160889470242, | |
| "grad_norm": 0.7800888419151306, | |
| "learning_rate": 3.679322828651265e-05, | |
| "loss": 0.043, | |
| "num_input_tokens_seen": 10996128, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 10.313930673642904, | |
| "grad_norm": 1.8462598323822021, | |
| "learning_rate": 3.667219304670193e-05, | |
| "loss": 0.0454, | |
| "num_input_tokens_seen": 11053280, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 10.366252452583387, | |
| "grad_norm": 1.075537085533142, | |
| "learning_rate": 3.655080694041495e-05, | |
| "loss": 0.0479, | |
| "num_input_tokens_seen": 11109664, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 10.418574231523872, | |
| "grad_norm": 0.7474583983421326, | |
| "learning_rate": 3.642907361652172e-05, | |
| "loss": 0.0458, | |
| "num_input_tokens_seen": 11166016, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 10.470896010464356, | |
| "grad_norm": 1.0032005310058594, | |
| "learning_rate": 3.6306996734329656e-05, | |
| "loss": 0.0518, | |
| "num_input_tokens_seen": 11221648, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 10.523217789404839, | |
| "grad_norm": 1.0080397129058838, | |
| "learning_rate": 3.618457996347352e-05, | |
| "loss": 0.0501, | |
| "num_input_tokens_seen": 11277952, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 10.575539568345324, | |
| "grad_norm": 1.130409598350525, | |
| "learning_rate": 3.606182698380515e-05, | |
| "loss": 0.0476, | |
| "num_input_tokens_seen": 11334272, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 10.627861347285808, | |
| "grad_norm": 0.9204115867614746, | |
| "learning_rate": 3.593874148528284e-05, | |
| "loss": 0.0515, | |
| "num_input_tokens_seen": 11389760, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 10.680183126226291, | |
| "grad_norm": 0.8044779896736145, | |
| "learning_rate": 3.58153271678604e-05, | |
| "loss": 0.0477, | |
| "num_input_tokens_seen": 11446144, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 10.732504905166776, | |
| "grad_norm": 1.4428260326385498, | |
| "learning_rate": 3.5691587741375934e-05, | |
| "loss": 0.0488, | |
| "num_input_tokens_seen": 11502320, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 10.78482668410726, | |
| "grad_norm": 0.9769074320793152, | |
| "learning_rate": 3.5567526925440353e-05, | |
| "loss": 0.0565, | |
| "num_input_tokens_seen": 11559392, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 10.837148463047743, | |
| "grad_norm": 0.6837288737297058, | |
| "learning_rate": 3.5443148449325545e-05, | |
| "loss": 0.051, | |
| "num_input_tokens_seen": 11615824, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 10.889470241988228, | |
| "grad_norm": 1.1239960193634033, | |
| "learning_rate": 3.5318456051852264e-05, | |
| "loss": 0.0528, | |
| "num_input_tokens_seen": 11671968, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 10.941792020928713, | |
| "grad_norm": 1.2493700981140137, | |
| "learning_rate": 3.519345348127775e-05, | |
| "loss": 0.0479, | |
| "num_input_tokens_seen": 11727632, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 10.994113799869195, | |
| "grad_norm": 1.0371160507202148, | |
| "learning_rate": 3.506814449518306e-05, | |
| "loss": 0.0446, | |
| "num_input_tokens_seen": 11784032, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 11.041857423152388, | |
| "grad_norm": 0.6289274096488953, | |
| "learning_rate": 3.494253286036011e-05, | |
| "loss": 0.0392, | |
| "num_input_tokens_seen": 11835760, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 11.094179202092871, | |
| "grad_norm": 1.0801132917404175, | |
| "learning_rate": 3.481662235269844e-05, | |
| "loss": 0.0311, | |
| "num_input_tokens_seen": 11891584, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 11.146500981033356, | |
| "grad_norm": 0.7098966836929321, | |
| "learning_rate": 3.469041675707173e-05, | |
| "loss": 0.0253, | |
| "num_input_tokens_seen": 11947824, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 11.198822759973838, | |
| "grad_norm": 0.9716883897781372, | |
| "learning_rate": 3.4563919867224e-05, | |
| "loss": 0.0313, | |
| "num_input_tokens_seen": 12003328, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 11.251144538914323, | |
| "grad_norm": 1.6546238660812378, | |
| "learning_rate": 3.4437135485655575e-05, | |
| "loss": 0.0339, | |
| "num_input_tokens_seen": 12060512, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 11.303466317854808, | |
| "grad_norm": 0.8873046040534973, | |
| "learning_rate": 3.4310067423508815e-05, | |
| "loss": 0.0344, | |
| "num_input_tokens_seen": 12117584, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 11.35578809679529, | |
| "grad_norm": 1.2070609331130981, | |
| "learning_rate": 3.418271950045352e-05, | |
| "loss": 0.0274, | |
| "num_input_tokens_seen": 12172512, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 11.408109875735775, | |
| "grad_norm": 1.0395190715789795, | |
| "learning_rate": 3.405509554457211e-05, | |
| "loss": 0.0268, | |
| "num_input_tokens_seen": 12227744, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 11.46043165467626, | |
| "grad_norm": 1.2013176679611206, | |
| "learning_rate": 3.392719939224453e-05, | |
| "loss": 0.0363, | |
| "num_input_tokens_seen": 12283776, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 11.512753433616743, | |
| "grad_norm": 1.1089166402816772, | |
| "learning_rate": 3.379903488803304e-05, | |
| "loss": 0.0339, | |
| "num_input_tokens_seen": 12340288, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 11.565075212557227, | |
| "grad_norm": 1.4544755220413208, | |
| "learning_rate": 3.3670605884566484e-05, | |
| "loss": 0.0325, | |
| "num_input_tokens_seen": 12396368, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 11.61739699149771, | |
| "grad_norm": 1.0953842401504517, | |
| "learning_rate": 3.3541916242424606e-05, | |
| "loss": 0.0333, | |
| "num_input_tokens_seen": 12451872, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 11.669718770438195, | |
| "grad_norm": 0.7939156889915466, | |
| "learning_rate": 3.341296983002193e-05, | |
| "loss": 0.0336, | |
| "num_input_tokens_seen": 12507776, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 11.72204054937868, | |
| "grad_norm": 1.114825963973999, | |
| "learning_rate": 3.3283770523491535e-05, | |
| "loss": 0.0357, | |
| "num_input_tokens_seen": 12564320, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 11.774362328319162, | |
| "grad_norm": 1.0780022144317627, | |
| "learning_rate": 3.3154322206568475e-05, | |
| "loss": 0.0356, | |
| "num_input_tokens_seen": 12620912, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 11.826684107259647, | |
| "grad_norm": 0.9889864325523376, | |
| "learning_rate": 3.302462877047307e-05, | |
| "loss": 0.0318, | |
| "num_input_tokens_seen": 12676464, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 11.879005886200131, | |
| "grad_norm": 1.8913953304290771, | |
| "learning_rate": 3.2894694113793935e-05, | |
| "loss": 0.039, | |
| "num_input_tokens_seen": 12731408, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 11.931327665140614, | |
| "grad_norm": 0.854158341884613, | |
| "learning_rate": 3.27645221423708e-05, | |
| "loss": 0.0408, | |
| "num_input_tokens_seen": 12787552, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 11.983649444081099, | |
| "grad_norm": 0.7944401502609253, | |
| "learning_rate": 3.263411676917704e-05, | |
| "loss": 0.034, | |
| "num_input_tokens_seen": 12843808, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 12.03139306736429, | |
| "grad_norm": 1.3224999904632568, | |
| "learning_rate": 3.250348191420214e-05, | |
| "loss": 0.03, | |
| "num_input_tokens_seen": 12895184, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 12.083714846304774, | |
| "grad_norm": 0.8005937933921814, | |
| "learning_rate": 3.237262150433379e-05, | |
| "loss": 0.0219, | |
| "num_input_tokens_seen": 12951408, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 12.136036625245259, | |
| "grad_norm": 1.3628501892089844, | |
| "learning_rate": 3.224153947323987e-05, | |
| "loss": 0.0181, | |
| "num_input_tokens_seen": 13007776, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 12.188358404185742, | |
| "grad_norm": 0.7954509258270264, | |
| "learning_rate": 3.21102397612502e-05, | |
| "loss": 0.0183, | |
| "num_input_tokens_seen": 13064144, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 12.240680183126226, | |
| "grad_norm": 0.8565235733985901, | |
| "learning_rate": 3.1978726315238094e-05, | |
| "loss": 0.0183, | |
| "num_input_tokens_seen": 13120320, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 12.293001962066711, | |
| "grad_norm": 0.7555674910545349, | |
| "learning_rate": 3.1847003088501726e-05, | |
| "loss": 0.017, | |
| "num_input_tokens_seen": 13177168, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 12.345323741007194, | |
| "grad_norm": 0.7122445106506348, | |
| "learning_rate": 3.1715074040645275e-05, | |
| "loss": 0.0206, | |
| "num_input_tokens_seen": 13232784, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 12.397645519947678, | |
| "grad_norm": 0.9428816437721252, | |
| "learning_rate": 3.158294313745992e-05, | |
| "loss": 0.0194, | |
| "num_input_tokens_seen": 13287312, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 12.449967298888161, | |
| "grad_norm": 1.027761459350586, | |
| "learning_rate": 3.145061435080461e-05, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 13343616, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 12.502289077828646, | |
| "grad_norm": 0.9591146111488342, | |
| "learning_rate": 3.1318091658486655e-05, | |
| "loss": 0.0192, | |
| "num_input_tokens_seen": 13398656, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 12.55461085676913, | |
| "grad_norm": 2.0098116397857666, | |
| "learning_rate": 3.1185379044142225e-05, | |
| "loss": 0.0202, | |
| "num_input_tokens_seen": 13453888, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 12.606932635709613, | |
| "grad_norm": 1.243646502494812, | |
| "learning_rate": 3.105248049711651e-05, | |
| "loss": 0.0184, | |
| "num_input_tokens_seen": 13511168, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 12.659254414650098, | |
| "grad_norm": 0.7831906676292419, | |
| "learning_rate": 3.091940001234386e-05, | |
| "loss": 0.0215, | |
| "num_input_tokens_seen": 13567168, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 12.711576193590583, | |
| "grad_norm": 0.6232236623764038, | |
| "learning_rate": 3.078614159022767e-05, | |
| "loss": 0.0192, | |
| "num_input_tokens_seen": 13623200, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 12.763897972531066, | |
| "grad_norm": 1.3829624652862549, | |
| "learning_rate": 3.065270923652015e-05, | |
| "loss": 0.0222, | |
| "num_input_tokens_seen": 13678880, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 12.81621975147155, | |
| "grad_norm": 0.9216393232345581, | |
| "learning_rate": 3.051910696220188e-05, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 13734624, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 12.868541530412035, | |
| "grad_norm": 1.1284723281860352, | |
| "learning_rate": 3.0385338783361283e-05, | |
| "loss": 0.0248, | |
| "num_input_tokens_seen": 13790576, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 12.920863309352518, | |
| "grad_norm": 1.4107064008712769, | |
| "learning_rate": 3.025140872107386e-05, | |
| "loss": 0.0227, | |
| "num_input_tokens_seen": 13845984, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 12.973185088293002, | |
| "grad_norm": 0.7538278102874756, | |
| "learning_rate": 3.0117320801281335e-05, | |
| "loss": 0.0265, | |
| "num_input_tokens_seen": 13902400, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 13.020928711576193, | |
| "grad_norm": 0.9580565690994263, | |
| "learning_rate": 2.9983079054670627e-05, | |
| "loss": 0.0195, | |
| "num_input_tokens_seen": 13953344, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 13.073250490516678, | |
| "grad_norm": 0.4210267961025238, | |
| "learning_rate": 2.9848687516552725e-05, | |
| "loss": 0.0107, | |
| "num_input_tokens_seen": 14009424, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 13.125572269457162, | |
| "grad_norm": 0.534055233001709, | |
| "learning_rate": 2.9714150226741312e-05, | |
| "loss": 0.013, | |
| "num_input_tokens_seen": 14064880, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 13.177894048397645, | |
| "grad_norm": 0.8242612481117249, | |
| "learning_rate": 2.9579471229431394e-05, | |
| "loss": 0.0095, | |
| "num_input_tokens_seen": 14120896, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 13.23021582733813, | |
| "grad_norm": 1.1644961833953857, | |
| "learning_rate": 2.944465457307771e-05, | |
| "loss": 0.0125, | |
| "num_input_tokens_seen": 14176512, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 13.282537606278613, | |
| "grad_norm": 0.682830274105072, | |
| "learning_rate": 2.930970431027304e-05, | |
| "loss": 0.0106, | |
| "num_input_tokens_seen": 14232608, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 13.334859385219097, | |
| "grad_norm": 0.7904958724975586, | |
| "learning_rate": 2.9174624497626353e-05, | |
| "loss": 0.012, | |
| "num_input_tokens_seen": 14289360, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 13.387181164159582, | |
| "grad_norm": 0.8092711567878723, | |
| "learning_rate": 2.903941919564091e-05, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 14346096, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 13.439502943100065, | |
| "grad_norm": 0.4784017503261566, | |
| "learning_rate": 2.8904092468592187e-05, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 14401872, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 13.49182472204055, | |
| "grad_norm": 0.5194114446640015, | |
| "learning_rate": 2.8768648384405695e-05, | |
| "loss": 0.0101, | |
| "num_input_tokens_seen": 14458864, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 13.544146500981034, | |
| "grad_norm": 0.6601864099502563, | |
| "learning_rate": 2.863309101453469e-05, | |
| "loss": 0.0135, | |
| "num_input_tokens_seen": 14515664, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 13.596468279921517, | |
| "grad_norm": 0.9567685723304749, | |
| "learning_rate": 2.8497424433837833e-05, | |
| "loss": 0.0138, | |
| "num_input_tokens_seen": 14572256, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 13.648790058862001, | |
| "grad_norm": 0.5563291311264038, | |
| "learning_rate": 2.836165272045663e-05, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 14627248, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 13.701111837802486, | |
| "grad_norm": 0.9716143608093262, | |
| "learning_rate": 2.8225779955692905e-05, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 14683728, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 13.753433616742969, | |
| "grad_norm": 0.9606854915618896, | |
| "learning_rate": 2.8089810223886076e-05, | |
| "loss": 0.0154, | |
| "num_input_tokens_seen": 14740864, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 13.805755395683454, | |
| "grad_norm": 1.01091730594635, | |
| "learning_rate": 2.79537476122904e-05, | |
| "loss": 0.0121, | |
| "num_input_tokens_seen": 14796176, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 13.858077174623936, | |
| "grad_norm": 0.6134788990020752, | |
| "learning_rate": 2.781759621095209e-05, | |
| "loss": 0.0119, | |
| "num_input_tokens_seen": 14852304, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 13.910398953564421, | |
| "grad_norm": 1.1731514930725098, | |
| "learning_rate": 2.7681360112586403e-05, | |
| "loss": 0.0188, | |
| "num_input_tokens_seen": 14908624, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 13.962720732504906, | |
| "grad_norm": 0.7572017908096313, | |
| "learning_rate": 2.7545043412454568e-05, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 14964784, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 14.010464355788097, | |
| "grad_norm": 0.2877230942249298, | |
| "learning_rate": 2.7408650208240733e-05, | |
| "loss": 0.0093, | |
| "num_input_tokens_seen": 15016112, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 14.062786134728581, | |
| "grad_norm": 1.2057104110717773, | |
| "learning_rate": 2.7272184599928723e-05, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 15072240, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 14.115107913669064, | |
| "grad_norm": 0.2993405759334564, | |
| "learning_rate": 2.7135650689678873e-05, | |
| "loss": 0.0082, | |
| "num_input_tokens_seen": 15128432, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 14.167429692609549, | |
| "grad_norm": 0.4322413504123688, | |
| "learning_rate": 2.6999052581704643e-05, | |
| "loss": 0.0052, | |
| "num_input_tokens_seen": 15185232, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 14.219751471550033, | |
| "grad_norm": 0.4944108724594116, | |
| "learning_rate": 2.6862394382149308e-05, | |
| "loss": 0.0066, | |
| "num_input_tokens_seen": 15241040, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 14.272073250490516, | |
| "grad_norm": 0.6533095836639404, | |
| "learning_rate": 2.672568019896248e-05, | |
| "loss": 0.0088, | |
| "num_input_tokens_seen": 15297904, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 14.324395029431, | |
| "grad_norm": 0.316057026386261, | |
| "learning_rate": 2.6588914141776626e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 15355584, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 14.376716808371485, | |
| "grad_norm": 0.502487063407898, | |
| "learning_rate": 2.6452100321783585e-05, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 15410592, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 14.429038587311968, | |
| "grad_norm": 0.5012995004653931, | |
| "learning_rate": 2.6315242851610923e-05, | |
| "loss": 0.0109, | |
| "num_input_tokens_seen": 15466448, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 14.481360366252453, | |
| "grad_norm": 0.8451622128486633, | |
| "learning_rate": 2.6178345845198328e-05, | |
| "loss": 0.0057, | |
| "num_input_tokens_seen": 15522816, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 14.533682145192937, | |
| "grad_norm": 0.33364975452423096, | |
| "learning_rate": 2.6041413417673966e-05, | |
| "loss": 0.009, | |
| "num_input_tokens_seen": 15578672, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 14.58600392413342, | |
| "grad_norm": 0.3638412058353424, | |
| "learning_rate": 2.590444968523074e-05, | |
| "loss": 0.0089, | |
| "num_input_tokens_seen": 15635408, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 14.638325703073905, | |
| "grad_norm": 0.5961637496948242, | |
| "learning_rate": 2.5767458765002606e-05, | |
| "loss": 0.008, | |
| "num_input_tokens_seen": 15691648, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 14.690647482014388, | |
| "grad_norm": 0.7401494979858398, | |
| "learning_rate": 2.5630444774940765e-05, | |
| "loss": 0.0081, | |
| "num_input_tokens_seen": 15748032, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 14.742969260954872, | |
| "grad_norm": 0.18349328637123108, | |
| "learning_rate": 2.5493411833689907e-05, | |
| "loss": 0.0071, | |
| "num_input_tokens_seen": 15803232, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 14.795291039895357, | |
| "grad_norm": 1.5877436399459839, | |
| "learning_rate": 2.5356364060464398e-05, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 15859120, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 14.84761281883584, | |
| "grad_norm": 1.041685938835144, | |
| "learning_rate": 2.521930557492444e-05, | |
| "loss": 0.0089, | |
| "num_input_tokens_seen": 15915872, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 14.899934597776324, | |
| "grad_norm": 0.6710309982299805, | |
| "learning_rate": 2.5082240497052267e-05, | |
| "loss": 0.0088, | |
| "num_input_tokens_seen": 15973472, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 14.952256376716809, | |
| "grad_norm": 0.9839669466018677, | |
| "learning_rate": 2.494517294702826e-05, | |
| "loss": 0.0069, | |
| "num_input_tokens_seen": 16029920, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 1.3188862800598145, | |
| "learning_rate": 2.4808107045107123e-05, | |
| "loss": 0.0098, | |
| "num_input_tokens_seen": 16080272, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 15.052321778940485, | |
| "grad_norm": 0.41307705640792847, | |
| "learning_rate": 2.4671046911494025e-05, | |
| "loss": 0.0037, | |
| "num_input_tokens_seen": 16136752, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 15.104643557880967, | |
| "grad_norm": 0.8025826811790466, | |
| "learning_rate": 2.453399666622072e-05, | |
| "loss": 0.0032, | |
| "num_input_tokens_seen": 16191920, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 15.156965336821452, | |
| "grad_norm": 0.2138717621564865, | |
| "learning_rate": 2.4396960429021738e-05, | |
| "loss": 0.0028, | |
| "num_input_tokens_seen": 16246912, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 15.209287115761937, | |
| "grad_norm": 0.16525974869728088, | |
| "learning_rate": 2.4259942319210498e-05, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 16303520, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 15.26160889470242, | |
| "grad_norm": 0.22977286577224731, | |
| "learning_rate": 2.412294645555555e-05, | |
| "loss": 0.005, | |
| "num_input_tokens_seen": 16359888, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 15.313930673642904, | |
| "grad_norm": 1.8685427904129028, | |
| "learning_rate": 2.39859769561567e-05, | |
| "loss": 0.0048, | |
| "num_input_tokens_seen": 16416512, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 15.366252452583387, | |
| "grad_norm": 0.7653654217720032, | |
| "learning_rate": 2.3849037938321235e-05, | |
| "loss": 0.0056, | |
| "num_input_tokens_seen": 16473664, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 15.418574231523872, | |
| "grad_norm": 0.21836823225021362, | |
| "learning_rate": 2.3712133518440176e-05, | |
| "loss": 0.0072, | |
| "num_input_tokens_seen": 16529312, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 15.470896010464356, | |
| "grad_norm": 0.6065989136695862, | |
| "learning_rate": 2.3575267811864543e-05, | |
| "loss": 0.0074, | |
| "num_input_tokens_seen": 16586048, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 15.523217789404839, | |
| "grad_norm": 0.21767759323120117, | |
| "learning_rate": 2.34384449327816e-05, | |
| "loss": 0.0037, | |
| "num_input_tokens_seen": 16642560, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 15.575539568345324, | |
| "grad_norm": 0.35699793696403503, | |
| "learning_rate": 2.330166899409124e-05, | |
| "loss": 0.0039, | |
| "num_input_tokens_seen": 16699248, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 15.627861347285808, | |
| "grad_norm": 0.46648091077804565, | |
| "learning_rate": 2.3164944107282333e-05, | |
| "loss": 0.0067, | |
| "num_input_tokens_seen": 16755952, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 15.680183126226291, | |
| "grad_norm": 0.8464080691337585, | |
| "learning_rate": 2.3028274382309097e-05, | |
| "loss": 0.0061, | |
| "num_input_tokens_seen": 16811536, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 15.732504905166776, | |
| "grad_norm": 0.30818283557891846, | |
| "learning_rate": 2.2891663927467604e-05, | |
| "loss": 0.0046, | |
| "num_input_tokens_seen": 16867824, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 15.78482668410726, | |
| "grad_norm": 0.5573921799659729, | |
| "learning_rate": 2.2755116849272274e-05, | |
| "loss": 0.0041, | |
| "num_input_tokens_seen": 16924080, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 15.837148463047743, | |
| "grad_norm": 0.5058010816574097, | |
| "learning_rate": 2.2618637252332398e-05, | |
| "loss": 0.0065, | |
| "num_input_tokens_seen": 16979728, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 15.889470241988228, | |
| "grad_norm": 0.4849563241004944, | |
| "learning_rate": 2.2482229239228785e-05, | |
| "loss": 0.0047, | |
| "num_input_tokens_seen": 17035488, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 15.941792020928713, | |
| "grad_norm": 0.10410932451486588, | |
| "learning_rate": 2.234589691039046e-05, | |
| "loss": 0.0054, | |
| "num_input_tokens_seen": 17091072, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 15.994113799869195, | |
| "grad_norm": 0.31193605065345764, | |
| "learning_rate": 2.2209644363971337e-05, | |
| "loss": 0.0043, | |
| "num_input_tokens_seen": 17147328, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 16.041857423152386, | |
| "grad_norm": 0.15345972776412964, | |
| "learning_rate": 2.2073475695727096e-05, | |
| "loss": 0.0045, | |
| "num_input_tokens_seen": 17198200, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 16.09417920209287, | |
| "grad_norm": 0.8098782896995544, | |
| "learning_rate": 2.193739499889201e-05, | |
| "loss": 0.0042, | |
| "num_input_tokens_seen": 17254408, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 16.146500981033356, | |
| "grad_norm": 0.6010912656784058, | |
| "learning_rate": 2.1801406364055958e-05, | |
| "loss": 0.0049, | |
| "num_input_tokens_seen": 17311304, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 16.19882275997384, | |
| "grad_norm": 0.0812903568148613, | |
| "learning_rate": 2.1665513879041418e-05, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 17368152, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 16.251144538914325, | |
| "grad_norm": 0.08344978094100952, | |
| "learning_rate": 2.1529721628780593e-05, | |
| "loss": 0.0037, | |
| "num_input_tokens_seen": 17423480, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 16.303466317854806, | |
| "grad_norm": 0.3543494641780853, | |
| "learning_rate": 2.1394033695192645e-05, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 17478984, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 16.35578809679529, | |
| "grad_norm": 0.683672308921814, | |
| "learning_rate": 2.125845415706097e-05, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 17535592, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 16.408109875735775, | |
| "grad_norm": 0.07661443203687668, | |
| "learning_rate": 2.1122987089910577e-05, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 17591960, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 16.46043165467626, | |
| "grad_norm": 0.21092914044857025, | |
| "learning_rate": 2.0987636565885606e-05, | |
| "loss": 0.004, | |
| "num_input_tokens_seen": 17648504, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 16.512753433616744, | |
| "grad_norm": 0.8990269303321838, | |
| "learning_rate": 2.0852406653626916e-05, | |
| "loss": 0.003, | |
| "num_input_tokens_seen": 17705240, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 16.565075212557225, | |
| "grad_norm": 0.22914479672908783, | |
| "learning_rate": 2.0717301418149742e-05, | |
| "loss": 0.0028, | |
| "num_input_tokens_seen": 17760392, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 16.61739699149771, | |
| "grad_norm": 1.7169655561447144, | |
| "learning_rate": 2.058232492072157e-05, | |
| "loss": 0.0033, | |
| "num_input_tokens_seen": 17816744, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 16.669718770438195, | |
| "grad_norm": 0.07557539641857147, | |
| "learning_rate": 2.044748121874e-05, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 17872760, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 16.72204054937868, | |
| "grad_norm": 0.1961260586977005, | |
| "learning_rate": 2.0312774365610783e-05, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 17928696, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 16.774362328319164, | |
| "grad_norm": 0.6838825941085815, | |
| "learning_rate": 2.0178208410626006e-05, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 17984232, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 16.82668410725965, | |
| "grad_norm": 0.12144844979047775, | |
| "learning_rate": 2.0043787398842347e-05, | |
| "loss": 0.0022, | |
| "num_input_tokens_seen": 18040712, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 16.87900588620013, | |
| "grad_norm": 0.3987827003002167, | |
| "learning_rate": 1.9909515370959493e-05, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 18097016, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 16.931327665140614, | |
| "grad_norm": 0.10082973539829254, | |
| "learning_rate": 1.9775396363198654e-05, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 18152776, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 16.9836494440811, | |
| "grad_norm": 0.020283468067646027, | |
| "learning_rate": 1.9641434407181285e-05, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 18208456, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 17.03139306736429, | |
| "grad_norm": 0.23080122470855713, | |
| "learning_rate": 1.950763352980782e-05, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 18259784, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 17.083714846304776, | |
| "grad_norm": 0.00946098379790783, | |
| "learning_rate": 1.9373997753136695e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 18316008, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 17.136036625245257, | |
| "grad_norm": 0.033656761050224304, | |
| "learning_rate": 1.9240531094263388e-05, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 18372696, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 17.188358404185742, | |
| "grad_norm": 0.03647719696164131, | |
| "learning_rate": 1.9107237565199716e-05, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 18428488, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 17.240680183126226, | |
| "grad_norm": 0.030296266078948975, | |
| "learning_rate": 1.8974121172753192e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 18484120, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 17.29300196206671, | |
| "grad_norm": 0.043923936784267426, | |
| "learning_rate": 1.8841185918406594e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 18539976, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 17.345323741007196, | |
| "grad_norm": 0.038960348814725876, | |
| "learning_rate": 1.870843579819771e-05, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 18596792, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 17.397645519947677, | |
| "grad_norm": 0.010129265487194061, | |
| "learning_rate": 1.8575874802599162e-05, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 18652776, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 17.44996729888816, | |
| "grad_norm": 0.04958868771791458, | |
| "learning_rate": 1.8443506916398485e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 18709320, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 17.502289077828646, | |
| "grad_norm": 2.036484956741333, | |
| "learning_rate": 1.8311336118578355e-05, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 18766376, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 17.55461085676913, | |
| "grad_norm": 0.10162217170000076, | |
| "learning_rate": 1.8179366382196944e-05, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 18822440, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 17.606932635709615, | |
| "grad_norm": 0.41068536043167114, | |
| "learning_rate": 1.8047601674268522e-05, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 18877976, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 17.6592544146501, | |
| "grad_norm": 0.35680681467056274, | |
| "learning_rate": 1.7916045955644207e-05, | |
| "loss": 0.0015, | |
| "num_input_tokens_seen": 18934728, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 17.71157619359058, | |
| "grad_norm": 0.16506262123584747, | |
| "learning_rate": 1.7784703180892882e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 18990088, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 17.763897972531066, | |
| "grad_norm": 0.05834071710705757, | |
| "learning_rate": 1.7653577298182327e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 19046728, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 17.81621975147155, | |
| "grad_norm": 0.14426672458648682, | |
| "learning_rate": 1.752267224916055e-05, | |
| "loss": 0.0022, | |
| "num_input_tokens_seen": 19101672, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 17.868541530412035, | |
| "grad_norm": 0.2213674634695053, | |
| "learning_rate": 1.7391991968837272e-05, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 19159128, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 17.92086330935252, | |
| "grad_norm": 0.6867311000823975, | |
| "learning_rate": 1.726154038546569e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 19215448, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 17.973185088293, | |
| "grad_norm": 0.022834990173578262, | |
| "learning_rate": 1.713132142042434e-05, | |
| "loss": 0.0013, | |
| "num_input_tokens_seen": 19270328, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 18.020928711576193, | |
| "grad_norm": 0.017958860844373703, | |
| "learning_rate": 1.7001338988099264e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 19321096, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 18.073250490516678, | |
| "grad_norm": 0.013346249237656593, | |
| "learning_rate": 1.68715969957663e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 19377144, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 18.125572269457162, | |
| "grad_norm": 0.016560234129428864, | |
| "learning_rate": 1.6742099343473674e-05, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 19433080, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 18.177894048397647, | |
| "grad_norm": 0.3799145519733429, | |
| "learning_rate": 1.6612849923924723e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 19489176, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 18.230215827338128, | |
| "grad_norm": 0.9833922982215881, | |
| "learning_rate": 1.6483852622360923e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 19544920, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 18.282537606278613, | |
| "grad_norm": 0.017415842041373253, | |
| "learning_rate": 1.635511131644505e-05, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 19600888, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 18.334859385219097, | |
| "grad_norm": 0.05391722172498703, | |
| "learning_rate": 1.6226629876144657e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 19656168, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 18.387181164159582, | |
| "grad_norm": 0.32104507088661194, | |
| "learning_rate": 1.609841216361574e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 19711224, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 18.439502943100067, | |
| "grad_norm": 0.019494058564305305, | |
| "learning_rate": 1.597046203308662e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 19767768, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 18.491824722040548, | |
| "grad_norm": 0.03266040235757828, | |
| "learning_rate": 1.584278333074208e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 19824616, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 18.544146500981032, | |
| "grad_norm": 0.024637416005134583, | |
| "learning_rate": 1.571537989460779e-05, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 19880024, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 18.596468279921517, | |
| "grad_norm": 0.03031347133219242, | |
| "learning_rate": 1.5588255554434883e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 19936504, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 18.648790058862, | |
| "grad_norm": 0.005317925941199064, | |
| "learning_rate": 1.5461414131584873e-05, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 19992136, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 18.701111837802486, | |
| "grad_norm": 0.024280209094285965, | |
| "learning_rate": 1.533485943891478e-05, | |
| "loss": 0.0035, | |
| "num_input_tokens_seen": 20049128, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 18.75343361674297, | |
| "grad_norm": 0.9302756190299988, | |
| "learning_rate": 1.5208595280662497e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 20106488, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 18.805755395683452, | |
| "grad_norm": 0.013046924024820328, | |
| "learning_rate": 1.5082625452332433e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20162536, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 18.858077174623936, | |
| "grad_norm": 0.01642036624252796, | |
| "learning_rate": 1.4956953740581454e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20219032, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 18.91039895356442, | |
| "grad_norm": 0.01480098720639944, | |
| "learning_rate": 1.4831583923104999e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20275880, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 18.962720732504906, | |
| "grad_norm": 0.05232972651720047, | |
| "learning_rate": 1.4706519768523597e-05, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 20332264, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 19.0104643557881, | |
| "grad_norm": 0.008723029866814613, | |
| "learning_rate": 1.458176503626949e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 20382464, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 19.06278613472858, | |
| "grad_norm": 0.35439035296440125, | |
| "learning_rate": 1.4457323476473738e-05, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 20438720, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 19.115107913669064, | |
| "grad_norm": 0.02343558706343174, | |
| "learning_rate": 1.4333198829853394e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 20493616, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 19.16742969260955, | |
| "grad_norm": 0.014349430799484253, | |
| "learning_rate": 1.420939482759907e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20550000, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 19.219751471550033, | |
| "grad_norm": 0.044981323182582855, | |
| "learning_rate": 1.4085915191262832e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 20606144, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 19.272073250490518, | |
| "grad_norm": 0.023957155644893646, | |
| "learning_rate": 1.396276363264629e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 20662720, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 19.324395029431, | |
| "grad_norm": 0.015040691941976547, | |
| "learning_rate": 1.3839943853689024e-05, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 20718992, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 19.376716808371484, | |
| "grad_norm": 0.011137389577925205, | |
| "learning_rate": 1.3717459546357284e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 20776096, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 19.429038587311968, | |
| "grad_norm": 0.059972431510686874, | |
| "learning_rate": 1.3595314392533083e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20831584, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 19.481360366252453, | |
| "grad_norm": 0.003549647517502308, | |
| "learning_rate": 1.3473512063903432e-05, | |
| "loss": 0.0006, | |
| "num_input_tokens_seen": 20887408, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 19.533682145192937, | |
| "grad_norm": 0.01864522323012352, | |
| "learning_rate": 1.335205622185003e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 20944080, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 19.586003924133422, | |
| "grad_norm": 0.013788777403533459, | |
| "learning_rate": 1.3230950517339141e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21000576, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 19.638325703073903, | |
| "grad_norm": 0.007708332501351833, | |
| "learning_rate": 1.3110198590811918e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21056608, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 19.690647482014388, | |
| "grad_norm": 0.008117050863802433, | |
| "learning_rate": 1.2989804072074918e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21112528, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 19.742969260954872, | |
| "grad_norm": 0.005804878659546375, | |
| "learning_rate": 1.2869770580191051e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21169104, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 19.795291039895357, | |
| "grad_norm": 0.021145416423678398, | |
| "learning_rate": 1.2750101723370683e-05, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 21225440, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 19.84761281883584, | |
| "grad_norm": 0.031260546296834946, | |
| "learning_rate": 1.2630801098863284e-05, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 21281952, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 19.899934597776323, | |
| "grad_norm": 0.024749331176280975, | |
| "learning_rate": 1.2511872292849236e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21338448, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 19.952256376716807, | |
| "grad_norm": 0.11673085391521454, | |
| "learning_rate": 1.2393318880332062e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21394640, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.07176396250724792, | |
| "learning_rate": 1.2275144425030902e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 21445504, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 20.052321778940485, | |
| "grad_norm": 0.024757077917456627, | |
| "learning_rate": 1.2157352479273465e-05, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 21503072, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 20.10464355788097, | |
| "grad_norm": 0.016589034348726273, | |
| "learning_rate": 1.2039946583889225e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21559312, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 20.15696533682145, | |
| "grad_norm": 0.004230449441820383, | |
| "learning_rate": 1.1922930268102949e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21616032, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 20.209287115761935, | |
| "grad_norm": 0.007515770383179188, | |
| "learning_rate": 1.1806307049428616e-05, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 21671872, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 20.26160889470242, | |
| "grad_norm": 0.0041547054424881935, | |
| "learning_rate": 1.1690080433563716e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21727616, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 20.313930673642904, | |
| "grad_norm": 0.0042037139646708965, | |
| "learning_rate": 1.157425391428384e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 21784400, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 20.36625245258339, | |
| "grad_norm": 0.024101046845316887, | |
| "learning_rate": 1.145883097333767e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21841584, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 20.418574231523873, | |
| "grad_norm": 0.01927500218153, | |
| "learning_rate": 1.1343815080342279e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 21897120, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 20.470896010464354, | |
| "grad_norm": 0.005595839582383633, | |
| "learning_rate": 1.1229209692678921e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 21952320, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 20.52321778940484, | |
| "grad_norm": 0.00780284171923995, | |
| "learning_rate": 1.1115018255389006e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 22008432, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 20.575539568345324, | |
| "grad_norm": 0.0051256874576210976, | |
| "learning_rate": 1.1001244201070606e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 22063664, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 20.62786134728581, | |
| "grad_norm": 0.0060654510743916035, | |
| "learning_rate": 1.088789094977522e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 22119488, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 20.680183126226293, | |
| "grad_norm": 0.019679056480526924, | |
| "learning_rate": 1.077496190890502e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 22175568, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 20.732504905166774, | |
| "grad_norm": 0.02322162687778473, | |
| "learning_rate": 1.0662460473110384e-05, | |
| "loss": 0.0, | |
| "num_input_tokens_seen": 22231472, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 20.78482668410726, | |
| "grad_norm": 0.001980294706299901, | |
| "learning_rate": 1.0550390024187906e-05, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 22287120, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 20.837148463047743, | |
| "grad_norm": 0.3102468252182007, | |
| "learning_rate": 1.0438753930978643e-05, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 22342736, | |
| "step": 4000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5730, | |
| "num_input_tokens_seen": 22342736, | |
| "num_train_epochs": 30, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.588523420509798e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |