{
  "best_metric": 0.6116148751910341,
  "best_model_checkpoint": "outputs/t5-tiny/weak_tiny_poe/mnli_21/checkpoint-61360",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 61360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 1.0741028785705566,
      "learning_rate": 4.9592568448500655e-05,
      "loss": 1.2614,
      "step": 500
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.6355718374252319,
      "learning_rate": 4.918513689700131e-05,
      "loss": 1.2516,
      "step": 1000
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8562185168266296,
      "learning_rate": 4.877770534550195e-05,
      "loss": 1.2444,
      "step": 1500
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.435200810432434,
      "learning_rate": 4.8370273794002606e-05,
      "loss": 1.2294,
      "step": 2000
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.3618035316467285,
      "learning_rate": 4.7962842242503265e-05,
      "loss": 1.2231,
      "step": 2500
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.9568185806274414,
      "learning_rate": 4.755541069100392e-05,
      "loss": 1.2171,
      "step": 3000
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.025542974472046,
      "learning_rate": 4.7147979139504564e-05,
      "loss": 1.2158,
      "step": 3500
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.468313217163086,
      "learning_rate": 4.6740547588005216e-05,
      "loss": 1.1951,
      "step": 4000
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.4365932941436768,
      "learning_rate": 4.633311603650587e-05,
      "loss": 1.203,
      "step": 4500
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.7032877206802368,
      "learning_rate": 4.592568448500652e-05,
      "loss": 1.1952,
      "step": 5000
    },
    {
      "epoch": 0.45,
      "grad_norm": 2.0673813819885254,
      "learning_rate": 4.5518252933507174e-05,
      "loss": 1.1958,
      "step": 5500
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.7614480257034302,
      "learning_rate": 4.511082138200782e-05,
      "loss": 1.1933,
      "step": 6000
    },
    {
      "epoch": 0.53,
      "grad_norm": 3.3648741245269775,
      "learning_rate": 4.470338983050847e-05,
      "loss": 1.1862,
      "step": 6500
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.8836809396743774,
      "learning_rate": 4.429595827900913e-05,
      "loss": 1.1926,
      "step": 7000
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.070249080657959,
      "learning_rate": 4.3888526727509784e-05,
      "loss": 1.1841,
      "step": 7500
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.4235424995422363,
      "learning_rate": 4.348109517601043e-05,
      "loss": 1.1755,
      "step": 8000
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.2773147821426392,
      "learning_rate": 4.307366362451108e-05,
      "loss": 1.1906,
      "step": 8500
    },
    {
      "epoch": 0.73,
      "grad_norm": 4.709882736206055,
      "learning_rate": 4.2666232073011735e-05,
      "loss": 1.1683,
      "step": 9000
    },
    {
      "epoch": 0.77,
      "grad_norm": 2.064504384994507,
      "learning_rate": 4.225880052151239e-05,
      "loss": 1.1703,
      "step": 9500
    },
    {
      "epoch": 0.81,
      "grad_norm": 2.670032262802124,
      "learning_rate": 4.185136897001304e-05,
      "loss": 1.1622,
      "step": 10000
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.9821618795394897,
      "learning_rate": 4.144393741851369e-05,
      "loss": 1.1571,
      "step": 10500
    },
    {
      "epoch": 0.9,
      "grad_norm": 2.3936009407043457,
      "learning_rate": 4.103650586701434e-05,
      "loss": 1.1616,
      "step": 11000
    },
    {
      "epoch": 0.94,
      "grad_norm": 2.637899398803711,
      "learning_rate": 4.0629074315515e-05,
      "loss": 1.1677,
      "step": 11500
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.4801619052886963,
      "learning_rate": 4.022164276401565e-05,
      "loss": 1.1514,
      "step": 12000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.4945491594498217,
      "eval_combined_score": 0.4945491594498217,
      "eval_loss": 1.0730257034301758,
      "eval_runtime": 4.8512,
      "eval_samples_per_second": 2023.214,
      "eval_steps_per_second": 8.039,
      "step": 12272
    },
    {
      "epoch": 1.02,
      "grad_norm": 2.4936630725860596,
      "learning_rate": 3.98142112125163e-05,
      "loss": 1.1492,
      "step": 12500
    },
    {
      "epoch": 1.06,
      "grad_norm": 2.4125094413757324,
      "learning_rate": 3.940677966101695e-05,
      "loss": 1.1456,
      "step": 13000
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.6422048807144165,
      "learning_rate": 3.89993481095176e-05,
      "loss": 1.1404,
      "step": 13500
    },
    {
      "epoch": 1.14,
      "grad_norm": 2.566889762878418,
      "learning_rate": 3.8591916558018254e-05,
      "loss": 1.145,
      "step": 14000
    },
    {
      "epoch": 1.18,
      "grad_norm": 2.594127893447876,
      "learning_rate": 3.818448500651891e-05,
      "loss": 1.1498,
      "step": 14500
    },
    {
      "epoch": 1.22,
      "grad_norm": 2.0145516395568848,
      "learning_rate": 3.777705345501956e-05,
      "loss": 1.1434,
      "step": 15000
    },
    {
      "epoch": 1.26,
      "grad_norm": 2.318466901779175,
      "learning_rate": 3.7369621903520205e-05,
      "loss": 1.1339,
      "step": 15500
    },
    {
      "epoch": 1.3,
      "grad_norm": 2.0532548427581787,
      "learning_rate": 3.6962190352020865e-05,
      "loss": 1.1474,
      "step": 16000
    },
    {
      "epoch": 1.34,
      "grad_norm": 2.050147294998169,
      "learning_rate": 3.655475880052152e-05,
      "loss": 1.1398,
      "step": 16500
    },
    {
      "epoch": 1.39,
      "grad_norm": 2.1561617851257324,
      "learning_rate": 3.614732724902217e-05,
      "loss": 1.139,
      "step": 17000
    },
    {
      "epoch": 1.43,
      "grad_norm": 2.1412413120269775,
      "learning_rate": 3.5739895697522816e-05,
      "loss": 1.1429,
      "step": 17500
    },
    {
      "epoch": 1.47,
      "grad_norm": 1.999888300895691,
      "learning_rate": 3.533246414602347e-05,
      "loss": 1.1335,
      "step": 18000
    },
    {
      "epoch": 1.51,
      "grad_norm": 3.487457275390625,
      "learning_rate": 3.492503259452412e-05,
      "loss": 1.134,
      "step": 18500
    },
    {
      "epoch": 1.55,
      "grad_norm": 2.8775036334991455,
      "learning_rate": 3.451760104302477e-05,
      "loss": 1.1441,
      "step": 19000
    },
    {
      "epoch": 1.59,
      "grad_norm": 2.7384023666381836,
      "learning_rate": 3.4110169491525426e-05,
      "loss": 1.1341,
      "step": 19500
    },
    {
      "epoch": 1.63,
      "grad_norm": 3.0970985889434814,
      "learning_rate": 3.370273794002607e-05,
      "loss": 1.1268,
      "step": 20000
    },
    {
      "epoch": 1.67,
      "grad_norm": 3.6261987686157227,
      "learning_rate": 3.329530638852673e-05,
      "loss": 1.1328,
      "step": 20500
    },
    {
      "epoch": 1.71,
      "grad_norm": 2.8419606685638428,
      "learning_rate": 3.2887874837027384e-05,
      "loss": 1.1249,
      "step": 21000
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.8945815563201904,
      "learning_rate": 3.2480443285528036e-05,
      "loss": 1.1294,
      "step": 21500
    },
    {
      "epoch": 1.79,
      "grad_norm": 2.6052589416503906,
      "learning_rate": 3.207301173402868e-05,
      "loss": 1.1336,
      "step": 22000
    },
    {
      "epoch": 1.83,
      "grad_norm": 2.4897425174713135,
      "learning_rate": 3.1665580182529335e-05,
      "loss": 1.1297,
      "step": 22500
    },
    {
      "epoch": 1.87,
      "grad_norm": 2.1377058029174805,
      "learning_rate": 3.125814863102999e-05,
      "loss": 1.117,
      "step": 23000
    },
    {
      "epoch": 1.91,
      "grad_norm": 2.3489041328430176,
      "learning_rate": 3.085071707953064e-05,
      "loss": 1.1242,
      "step": 23500
    },
    {
      "epoch": 1.96,
      "grad_norm": 2.4663751125335693,
      "learning_rate": 3.044328552803129e-05,
      "loss": 1.1249,
      "step": 24000
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.7375434637069702,
      "learning_rate": 3.003585397653194e-05,
      "loss": 1.1148,
      "step": 24500
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5651553744268976,
      "eval_combined_score": 0.5651553744268976,
      "eval_loss": 0.956773579120636,
      "eval_runtime": 4.3422,
      "eval_samples_per_second": 2260.392,
      "eval_steps_per_second": 8.982,
      "step": 24544
    },
    {
      "epoch": 2.04,
      "grad_norm": 3.0487465858459473,
      "learning_rate": 2.9628422425032598e-05,
      "loss": 1.1135,
      "step": 25000
    },
    {
      "epoch": 2.08,
      "grad_norm": 2.7948238849639893,
      "learning_rate": 2.922099087353325e-05,
      "loss": 1.1193,
      "step": 25500
    },
    {
      "epoch": 2.12,
      "grad_norm": 2.9265005588531494,
      "learning_rate": 2.88135593220339e-05,
      "loss": 1.1221,
      "step": 26000
    },
    {
      "epoch": 2.16,
      "grad_norm": 2.115213632583618,
      "learning_rate": 2.8406127770534552e-05,
      "loss": 1.1178,
      "step": 26500
    },
    {
      "epoch": 2.2,
      "grad_norm": 2.7759642601013184,
      "learning_rate": 2.7998696219035204e-05,
      "loss": 1.1106,
      "step": 27000
    },
    {
      "epoch": 2.24,
      "grad_norm": 3.3535237312316895,
      "learning_rate": 2.7591264667535854e-05,
      "loss": 1.1223,
      "step": 27500
    },
    {
      "epoch": 2.28,
      "grad_norm": 4.240155220031738,
      "learning_rate": 2.7183833116036506e-05,
      "loss": 1.1006,
      "step": 28000
    },
    {
      "epoch": 2.32,
      "grad_norm": 2.35819411277771,
      "learning_rate": 2.6776401564537155e-05,
      "loss": 1.1082,
      "step": 28500
    },
    {
      "epoch": 2.36,
      "grad_norm": 2.635653257369995,
      "learning_rate": 2.6368970013037815e-05,
      "loss": 1.1047,
      "step": 29000
    },
    {
      "epoch": 2.4,
      "grad_norm": 2.2701103687286377,
      "learning_rate": 2.5961538461538464e-05,
      "loss": 1.1003,
      "step": 29500
    },
    {
      "epoch": 2.44,
      "grad_norm": 3.0462381839752197,
      "learning_rate": 2.5554106910039117e-05,
      "loss": 1.1127,
      "step": 30000
    },
    {
      "epoch": 2.49,
      "grad_norm": 2.489602565765381,
      "learning_rate": 2.5146675358539766e-05,
      "loss": 1.1028,
      "step": 30500
    },
    {
      "epoch": 2.53,
      "grad_norm": 2.6888344287872314,
      "learning_rate": 2.473924380704042e-05,
      "loss": 1.0982,
      "step": 31000
    },
    {
      "epoch": 2.57,
      "grad_norm": 3.001851797103882,
      "learning_rate": 2.433181225554107e-05,
      "loss": 1.1097,
      "step": 31500
    },
    {
      "epoch": 2.61,
      "grad_norm": 2.212657928466797,
      "learning_rate": 2.392438070404172e-05,
      "loss": 1.11,
      "step": 32000
    },
    {
      "epoch": 2.65,
      "grad_norm": 2.230945110321045,
      "learning_rate": 2.3516949152542376e-05,
      "loss": 1.0993,
      "step": 32500
    },
    {
      "epoch": 2.69,
      "grad_norm": 3.2973270416259766,
      "learning_rate": 2.3109517601043025e-05,
      "loss": 1.1137,
      "step": 33000
    },
    {
      "epoch": 2.73,
      "grad_norm": 3.0208470821380615,
      "learning_rate": 2.2702086049543678e-05,
      "loss": 1.1057,
      "step": 33500
    },
    {
      "epoch": 2.77,
      "grad_norm": 1.68820059299469,
      "learning_rate": 2.229465449804433e-05,
      "loss": 1.1048,
      "step": 34000
    },
    {
      "epoch": 2.81,
      "grad_norm": 3.023021936416626,
      "learning_rate": 2.188722294654498e-05,
      "loss": 1.091,
      "step": 34500
    },
    {
      "epoch": 2.85,
      "grad_norm": 2.254213571548462,
      "learning_rate": 2.1479791395045636e-05,
      "loss": 1.1049,
      "step": 35000
    },
    {
      "epoch": 2.89,
      "grad_norm": 3.3968422412872314,
      "learning_rate": 2.1072359843546285e-05,
      "loss": 1.0982,
      "step": 35500
    },
    {
      "epoch": 2.93,
      "grad_norm": 2.841524124145508,
      "learning_rate": 2.0664928292046937e-05,
      "loss": 1.0937,
      "step": 36000
    },
    {
      "epoch": 2.97,
      "grad_norm": 3.070187568664551,
      "learning_rate": 2.0257496740547587e-05,
      "loss": 1.1048,
      "step": 36500
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.5710646968925115,
      "eval_combined_score": 0.5710646968925115,
      "eval_loss": 0.9583811163902283,
      "eval_runtime": 4.5767,
      "eval_samples_per_second": 2144.543,
      "eval_steps_per_second": 8.521,
      "step": 36816
    },
    {
      "epoch": 3.01,
      "grad_norm": 2.1366419792175293,
      "learning_rate": 1.9850065189048242e-05,
      "loss": 1.094,
      "step": 37000
    },
    {
      "epoch": 3.06,
      "grad_norm": 3.5920097827911377,
      "learning_rate": 1.944263363754889e-05,
      "loss": 1.0943,
      "step": 37500
    },
    {
      "epoch": 3.1,
      "grad_norm": 3.099132537841797,
      "learning_rate": 1.9035202086049544e-05,
      "loss": 1.1039,
      "step": 38000
    },
    {
      "epoch": 3.14,
      "grad_norm": 2.7153160572052,
      "learning_rate": 1.8627770534550197e-05,
      "loss": 1.0928,
      "step": 38500
    },
    {
      "epoch": 3.18,
      "grad_norm": 2.752225875854492,
      "learning_rate": 1.8220338983050846e-05,
      "loss": 1.098,
      "step": 39000
    },
    {
      "epoch": 3.22,
      "grad_norm": 2.1102657318115234,
      "learning_rate": 1.7812907431551502e-05,
      "loss": 1.0968,
      "step": 39500
    },
    {
      "epoch": 3.26,
      "grad_norm": 2.7974390983581543,
      "learning_rate": 1.740547588005215e-05,
      "loss": 1.09,
      "step": 40000
    },
    {
      "epoch": 3.3,
      "grad_norm": 2.8349063396453857,
      "learning_rate": 1.6998044328552804e-05,
      "loss": 1.0862,
      "step": 40500
    },
    {
      "epoch": 3.34,
      "grad_norm": 2.360250234603882,
      "learning_rate": 1.6590612777053456e-05,
      "loss": 1.0953,
      "step": 41000
    },
    {
      "epoch": 3.38,
      "grad_norm": 3.601504325866699,
      "learning_rate": 1.618318122555411e-05,
      "loss": 1.0841,
      "step": 41500
    },
    {
      "epoch": 3.42,
      "grad_norm": 2.5209643840789795,
      "learning_rate": 1.577574967405476e-05,
      "loss": 1.0847,
      "step": 42000
    },
    {
      "epoch": 3.46,
      "grad_norm": 3.1257715225219727,
      "learning_rate": 1.536831812255541e-05,
      "loss": 1.0991,
      "step": 42500
    },
    {
      "epoch": 3.5,
      "grad_norm": 3.250303268432617,
      "learning_rate": 1.4960886571056063e-05,
      "loss": 1.098,
      "step": 43000
    },
    {
      "epoch": 3.54,
      "grad_norm": 3.4005048274993896,
      "learning_rate": 1.4553455019556716e-05,
      "loss": 1.0839,
      "step": 43500
    },
    {
      "epoch": 3.59,
      "grad_norm": 2.8816983699798584,
      "learning_rate": 1.4146023468057368e-05,
      "loss": 1.0849,
      "step": 44000
    },
    {
      "epoch": 3.63,
      "grad_norm": 2.870222568511963,
      "learning_rate": 1.373859191655802e-05,
      "loss": 1.0994,
      "step": 44500
    },
    {
      "epoch": 3.67,
      "grad_norm": 3.1101572513580322,
      "learning_rate": 1.333116036505867e-05,
      "loss": 1.0927,
      "step": 45000
    },
    {
      "epoch": 3.71,
      "grad_norm": 3.555572986602783,
      "learning_rate": 1.2923728813559321e-05,
      "loss": 1.0915,
      "step": 45500
    },
    {
      "epoch": 3.75,
      "grad_norm": 3.06459379196167,
      "learning_rate": 1.2516297262059975e-05,
      "loss": 1.0952,
      "step": 46000
    },
    {
      "epoch": 3.79,
      "grad_norm": 3.103626012802124,
      "learning_rate": 1.2108865710560626e-05,
      "loss": 1.0859,
      "step": 46500
    },
    {
      "epoch": 3.83,
      "grad_norm": 3.8208911418914795,
      "learning_rate": 1.1701434159061279e-05,
      "loss": 1.0966,
      "step": 47000
    },
    {
      "epoch": 3.87,
      "grad_norm": 2.8707382678985596,
      "learning_rate": 1.1294002607561931e-05,
      "loss": 1.0752,
      "step": 47500
    },
    {
      "epoch": 3.91,
      "grad_norm": 4.298058032989502,
      "learning_rate": 1.0886571056062582e-05,
      "loss": 1.092,
      "step": 48000
    },
    {
      "epoch": 3.95,
      "grad_norm": 2.011492967605591,
      "learning_rate": 1.0479139504563235e-05,
      "loss": 1.0789,
      "step": 48500
    },
    {
      "epoch": 3.99,
      "grad_norm": 3.2206387519836426,
      "learning_rate": 1.0071707953063886e-05,
      "loss": 1.075,
      "step": 49000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5978604177279674,
      "eval_combined_score": 0.5978604177279674,
      "eval_loss": 0.9132007360458374,
      "eval_runtime": 3.8437,
      "eval_samples_per_second": 2553.526,
      "eval_steps_per_second": 10.146,
      "step": 49088
    },
    {
      "epoch": 4.03,
      "grad_norm": 3.0595147609710693,
      "learning_rate": 9.664276401564537e-06,
      "loss": 1.0901,
      "step": 49500
    },
    {
      "epoch": 4.07,
      "grad_norm": 2.2460098266601562,
      "learning_rate": 9.25684485006519e-06,
      "loss": 1.0802,
      "step": 50000
    },
    {
      "epoch": 4.12,
      "grad_norm": 2.1211585998535156,
      "learning_rate": 8.84941329856584e-06,
      "loss": 1.0886,
      "step": 50500
    },
    {
      "epoch": 4.16,
      "grad_norm": 2.7433323860168457,
      "learning_rate": 8.441981747066494e-06,
      "loss": 1.0814,
      "step": 51000
    },
    {
      "epoch": 4.2,
      "grad_norm": 3.3154397010803223,
      "learning_rate": 8.034550195567145e-06,
      "loss": 1.0877,
      "step": 51500
    },
    {
      "epoch": 4.24,
      "grad_norm": 3.0281894207000732,
      "learning_rate": 7.627118644067798e-06,
      "loss": 1.0773,
      "step": 52000
    },
    {
      "epoch": 4.28,
      "grad_norm": 2.553225517272949,
      "learning_rate": 7.219687092568449e-06,
      "loss": 1.0869,
      "step": 52500
    },
    {
      "epoch": 4.32,
      "grad_norm": 2.8887219429016113,
      "learning_rate": 6.812255541069101e-06,
      "loss": 1.0756,
      "step": 53000
    },
    {
      "epoch": 4.36,
      "grad_norm": 2.1609950065612793,
      "learning_rate": 6.404823989569752e-06,
      "loss": 1.0884,
      "step": 53500
    },
    {
      "epoch": 4.4,
      "grad_norm": 3.0557539463043213,
      "learning_rate": 5.997392438070405e-06,
      "loss": 1.0886,
      "step": 54000
    },
    {
      "epoch": 4.44,
      "grad_norm": 2.783965587615967,
      "learning_rate": 5.5899608865710565e-06,
      "loss": 1.0827,
      "step": 54500
    },
    {
      "epoch": 4.48,
      "grad_norm": 3.0575108528137207,
      "learning_rate": 5.182529335071708e-06,
      "loss": 1.0886,
      "step": 55000
    },
    {
      "epoch": 4.52,
      "grad_norm": 2.5954906940460205,
      "learning_rate": 4.77509778357236e-06,
      "loss": 1.0773,
      "step": 55500
    },
    {
      "epoch": 4.56,
      "grad_norm": 2.1589512825012207,
      "learning_rate": 4.367666232073012e-06,
      "loss": 1.0693,
      "step": 56000
    },
    {
      "epoch": 4.6,
      "grad_norm": 2.5202431678771973,
      "learning_rate": 3.9602346805736635e-06,
      "loss": 1.0803,
      "step": 56500
    },
    {
      "epoch": 4.64,
      "grad_norm": 2.588782787322998,
      "learning_rate": 3.5528031290743156e-06,
      "loss": 1.0816,
      "step": 57000
    },
    {
      "epoch": 4.69,
      "grad_norm": 2.994828701019287,
      "learning_rate": 3.1453715775749674e-06,
      "loss": 1.0682,
      "step": 57500
    },
    {
      "epoch": 4.73,
      "grad_norm": 2.9718551635742188,
      "learning_rate": 2.7379400260756195e-06,
      "loss": 1.0786,
      "step": 58000
    },
    {
      "epoch": 4.77,
      "grad_norm": 3.181231737136841,
      "learning_rate": 2.3305084745762712e-06,
      "loss": 1.0818,
      "step": 58500
    },
    {
      "epoch": 4.81,
      "grad_norm": 2.3970162868499756,
      "learning_rate": 1.9230769230769234e-06,
      "loss": 1.0816,
      "step": 59000
    },
    {
      "epoch": 4.85,
      "grad_norm": 2.8263609409332275,
      "learning_rate": 1.5156453715775751e-06,
      "loss": 1.0755,
      "step": 59500
    },
    {
      "epoch": 4.89,
      "grad_norm": 3.2032511234283447,
      "learning_rate": 1.108213820078227e-06,
      "loss": 1.0764,
      "step": 60000
    },
    {
      "epoch": 4.93,
      "grad_norm": 3.6272828578948975,
      "learning_rate": 7.007822685788788e-07,
      "loss": 1.0855,
      "step": 60500
    },
    {
      "epoch": 4.97,
      "grad_norm": 2.672410249710083,
      "learning_rate": 2.9335071707953065e-07,
      "loss": 1.0755,
      "step": 61000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.6116148751910341,
      "eval_combined_score": 0.6116148751910341,
      "eval_loss": 0.8905634880065918,
      "eval_runtime": 4.8808,
      "eval_samples_per_second": 2010.92,
      "eval_steps_per_second": 7.99,
      "step": 61360
    },
    {
      "epoch": 5.0,
      "step": 61360,
      "total_flos": 0.0,
      "train_loss": 1.1215235922762437,
      "train_runtime": 4186.3943,
      "train_samples_per_second": 469.022,
      "train_steps_per_second": 14.657
    }
  ],
  "logging_steps": 500,
  "max_steps": 61360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 0.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}