{ "best_metric": 0.6116148751910341, "best_model_checkpoint": "outputs/t5-tiny/weak_tiny_poe/mnli_21/checkpoint-61360", "epoch": 5.0, "eval_steps": 500, "global_step": 61360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.0741028785705566, "learning_rate": 4.9592568448500655e-05, "loss": 1.2614, "step": 500 }, { "epoch": 0.08, "grad_norm": 0.6355718374252319, "learning_rate": 4.918513689700131e-05, "loss": 1.2516, "step": 1000 }, { "epoch": 0.12, "grad_norm": 0.8562185168266296, "learning_rate": 4.877770534550195e-05, "loss": 1.2444, "step": 1500 }, { "epoch": 0.16, "grad_norm": 1.435200810432434, "learning_rate": 4.8370273794002606e-05, "loss": 1.2294, "step": 2000 }, { "epoch": 0.2, "grad_norm": 2.3618035316467285, "learning_rate": 4.7962842242503265e-05, "loss": 1.2231, "step": 2500 }, { "epoch": 0.24, "grad_norm": 1.9568185806274414, "learning_rate": 4.755541069100392e-05, "loss": 1.2171, "step": 3000 }, { "epoch": 0.29, "grad_norm": 2.025542974472046, "learning_rate": 4.7147979139504564e-05, "loss": 1.2158, "step": 3500 }, { "epoch": 0.33, "grad_norm": 1.468313217163086, "learning_rate": 4.6740547588005216e-05, "loss": 1.1951, "step": 4000 }, { "epoch": 0.37, "grad_norm": 2.4365932941436768, "learning_rate": 4.633311603650587e-05, "loss": 1.203, "step": 4500 }, { "epoch": 0.41, "grad_norm": 1.7032877206802368, "learning_rate": 4.592568448500652e-05, "loss": 1.1952, "step": 5000 }, { "epoch": 0.45, "grad_norm": 2.0673813819885254, "learning_rate": 4.5518252933507174e-05, "loss": 1.1958, "step": 5500 }, { "epoch": 0.49, "grad_norm": 1.7614480257034302, "learning_rate": 4.511082138200782e-05, "loss": 1.1933, "step": 6000 }, { "epoch": 0.53, "grad_norm": 3.3648741245269775, "learning_rate": 4.470338983050847e-05, "loss": 1.1862, "step": 6500 }, { "epoch": 0.57, "grad_norm": 1.8836809396743774, "learning_rate": 4.429595827900913e-05, "loss": 1.1926, "step": 7000 }, { "epoch": 0.61, "grad_norm": 2.070249080657959, "learning_rate": 4.3888526727509784e-05, "loss": 1.1841, "step": 7500 }, { "epoch": 0.65, "grad_norm": 2.4235424995422363, "learning_rate": 4.348109517601043e-05, "loss": 1.1755, "step": 8000 }, { "epoch": 0.69, "grad_norm": 1.2773147821426392, "learning_rate": 4.307366362451108e-05, "loss": 1.1906, "step": 8500 }, { "epoch": 0.73, "grad_norm": 4.709882736206055, "learning_rate": 4.2666232073011735e-05, "loss": 1.1683, "step": 9000 }, { "epoch": 0.77, "grad_norm": 2.064504384994507, "learning_rate": 4.225880052151239e-05, "loss": 1.1703, "step": 9500 }, { "epoch": 0.81, "grad_norm": 2.670032262802124, "learning_rate": 4.185136897001304e-05, "loss": 1.1622, "step": 10000 }, { "epoch": 0.86, "grad_norm": 1.9821618795394897, "learning_rate": 4.144393741851369e-05, "loss": 1.1571, "step": 10500 }, { "epoch": 0.9, "grad_norm": 2.3936009407043457, "learning_rate": 4.103650586701434e-05, "loss": 1.1616, "step": 11000 }, { "epoch": 0.94, "grad_norm": 2.637899398803711, "learning_rate": 4.0629074315515e-05, "loss": 1.1677, "step": 11500 }, { "epoch": 0.98, "grad_norm": 2.4801619052886963, "learning_rate": 4.022164276401565e-05, "loss": 1.1514, "step": 12000 }, { "epoch": 1.0, "eval_accuracy": 0.4945491594498217, "eval_combined_score": 0.4945491594498217, "eval_loss": 1.0730257034301758, "eval_runtime": 4.8512, "eval_samples_per_second": 2023.214, "eval_steps_per_second": 8.039, "step": 12272 }, { "epoch": 1.02, "grad_norm": 2.4936630725860596, "learning_rate": 3.98142112125163e-05, "loss": 1.1492, "step": 12500 }, { "epoch": 1.06, "grad_norm": 2.4125094413757324, "learning_rate": 3.940677966101695e-05, "loss": 1.1456, "step": 13000 }, { "epoch": 1.1, "grad_norm": 1.6422048807144165, "learning_rate": 3.89993481095176e-05, "loss": 1.1404, "step": 13500 }, { "epoch": 1.14, "grad_norm": 2.566889762878418, "learning_rate": 3.8591916558018254e-05, "loss": 1.145, "step": 14000 }, { "epoch": 1.18, "grad_norm": 2.594127893447876, "learning_rate": 3.818448500651891e-05, "loss": 1.1498, "step": 14500 }, { "epoch": 1.22, "grad_norm": 2.0145516395568848, "learning_rate": 3.777705345501956e-05, "loss": 1.1434, "step": 15000 }, { "epoch": 1.26, "grad_norm": 2.318466901779175, "learning_rate": 3.7369621903520205e-05, "loss": 1.1339, "step": 15500 }, { "epoch": 1.3, "grad_norm": 2.0532548427581787, "learning_rate": 3.6962190352020865e-05, "loss": 1.1474, "step": 16000 }, { "epoch": 1.34, "grad_norm": 2.050147294998169, "learning_rate": 3.655475880052152e-05, "loss": 1.1398, "step": 16500 }, { "epoch": 1.39, "grad_norm": 2.1561617851257324, "learning_rate": 3.614732724902217e-05, "loss": 1.139, "step": 17000 }, { "epoch": 1.43, "grad_norm": 2.1412413120269775, "learning_rate": 3.5739895697522816e-05, "loss": 1.1429, "step": 17500 }, { "epoch": 1.47, "grad_norm": 1.999888300895691, "learning_rate": 3.533246414602347e-05, "loss": 1.1335, "step": 18000 }, { "epoch": 1.51, "grad_norm": 3.487457275390625, "learning_rate": 3.492503259452412e-05, "loss": 1.134, "step": 18500 }, { "epoch": 1.55, "grad_norm": 2.8775036334991455, "learning_rate": 3.451760104302477e-05, "loss": 1.1441, "step": 19000 }, { "epoch": 1.59, "grad_norm": 2.7384023666381836, "learning_rate": 3.4110169491525426e-05, "loss": 1.1341, "step": 19500 }, { "epoch": 1.63, "grad_norm": 3.0970985889434814, "learning_rate": 3.370273794002607e-05, "loss": 1.1268, "step": 20000 }, { "epoch": 1.67, "grad_norm": 3.6261987686157227, "learning_rate": 3.329530638852673e-05, "loss": 1.1328, "step": 20500 }, { "epoch": 1.71, "grad_norm": 2.8419606685638428, "learning_rate": 3.2887874837027384e-05, "loss": 1.1249, "step": 21000 }, { "epoch": 1.75, "grad_norm": 1.8945815563201904, "learning_rate": 3.2480443285528036e-05, "loss": 1.1294, "step": 21500 }, { "epoch": 1.79, "grad_norm": 2.6052589416503906, "learning_rate": 3.207301173402868e-05, "loss": 1.1336, "step": 22000 }, { "epoch": 1.83, "grad_norm": 2.4897425174713135, "learning_rate": 3.1665580182529335e-05, "loss": 1.1297, "step": 22500 }, { "epoch": 1.87, "grad_norm": 2.1377058029174805, "learning_rate": 3.125814863102999e-05, "loss": 1.117, "step": 23000 }, { "epoch": 1.91, "grad_norm": 2.3489041328430176, "learning_rate": 3.085071707953064e-05, "loss": 1.1242, "step": 23500 }, { "epoch": 1.96, "grad_norm": 2.4663751125335693, "learning_rate": 3.044328552803129e-05, "loss": 1.1249, "step": 24000 }, { "epoch": 2.0, "grad_norm": 1.7375434637069702, "learning_rate": 3.003585397653194e-05, "loss": 1.1148, "step": 24500 }, { "epoch": 2.0, "eval_accuracy": 0.5651553744268976, "eval_combined_score": 0.5651553744268976, "eval_loss": 0.956773579120636, "eval_runtime": 4.3422, "eval_samples_per_second": 2260.392, "eval_steps_per_second": 8.982, "step": 24544 }, { "epoch": 2.04, "grad_norm": 3.0487465858459473, "learning_rate": 2.9628422425032598e-05, "loss": 1.1135, "step": 25000 }, { "epoch": 2.08, "grad_norm": 2.7948238849639893, "learning_rate": 2.922099087353325e-05, "loss": 1.1193, "step": 25500 }, { "epoch": 2.12, "grad_norm": 2.9265005588531494, "learning_rate": 2.88135593220339e-05, "loss": 1.1221, "step": 26000 }, { "epoch": 2.16, "grad_norm": 2.115213632583618, "learning_rate": 2.8406127770534552e-05, "loss": 1.1178, "step": 26500 }, { "epoch": 2.2, "grad_norm": 2.7759642601013184, "learning_rate": 2.7998696219035204e-05, "loss": 1.1106, "step": 27000 }, { "epoch": 2.24, "grad_norm": 3.3535237312316895, "learning_rate": 2.7591264667535854e-05, "loss": 1.1223, "step": 27500 }, { "epoch": 2.28, "grad_norm": 4.240155220031738, "learning_rate": 2.7183833116036506e-05, "loss": 1.1006, "step": 28000 }, { "epoch": 2.32, "grad_norm": 2.35819411277771, "learning_rate": 2.6776401564537155e-05, "loss": 1.1082, "step": 28500 }, { "epoch": 2.36, "grad_norm": 2.635653257369995, "learning_rate": 2.6368970013037815e-05, "loss": 1.1047, "step": 29000 }, { "epoch": 2.4, "grad_norm": 2.2701103687286377, "learning_rate": 2.5961538461538464e-05, "loss": 1.1003, "step": 29500 }, { "epoch": 2.44, "grad_norm": 3.0462381839752197, "learning_rate": 2.5554106910039117e-05, "loss": 1.1127, "step": 30000 }, { "epoch": 2.49, "grad_norm": 2.489602565765381, "learning_rate": 2.5146675358539766e-05, "loss": 1.1028, "step": 30500 }, { "epoch": 2.53, "grad_norm": 2.6888344287872314, "learning_rate": 2.473924380704042e-05, "loss": 1.0982, "step": 31000 }, { "epoch": 2.57, "grad_norm": 3.001851797103882, "learning_rate": 2.433181225554107e-05, "loss": 1.1097, "step": 31500 }, { "epoch": 2.61, "grad_norm": 2.212657928466797, "learning_rate": 2.392438070404172e-05, "loss": 1.11, "step": 32000 }, { "epoch": 2.65, "grad_norm": 2.230945110321045, "learning_rate": 2.3516949152542376e-05, "loss": 1.0993, "step": 32500 }, { "epoch": 2.69, "grad_norm": 3.2973270416259766, "learning_rate": 2.3109517601043025e-05, "loss": 1.1137, "step": 33000 }, { "epoch": 2.73, "grad_norm": 3.0208470821380615, "learning_rate": 2.2702086049543678e-05, "loss": 1.1057, "step": 33500 }, { "epoch": 2.77, "grad_norm": 1.68820059299469, "learning_rate": 2.229465449804433e-05, "loss": 1.1048, "step": 34000 }, { "epoch": 2.81, "grad_norm": 3.023021936416626, "learning_rate": 2.188722294654498e-05, "loss": 1.091, "step": 34500 }, { "epoch": 2.85, "grad_norm": 2.254213571548462, "learning_rate": 2.1479791395045636e-05, "loss": 1.1049, "step": 35000 }, { "epoch": 2.89, "grad_norm": 3.3968422412872314, "learning_rate": 2.1072359843546285e-05, "loss": 1.0982, "step": 35500 }, { "epoch": 2.93, "grad_norm": 2.841524124145508, "learning_rate": 2.0664928292046937e-05, "loss": 1.0937, "step": 36000 }, { "epoch": 2.97, "grad_norm": 3.070187568664551, "learning_rate": 2.0257496740547587e-05, "loss": 1.1048, "step": 36500 }, { "epoch": 3.0, "eval_accuracy": 0.5710646968925115, "eval_combined_score": 0.5710646968925115, "eval_loss": 0.9583811163902283, "eval_runtime": 4.5767, "eval_samples_per_second": 2144.543, "eval_steps_per_second": 8.521, "step": 36816 }, { "epoch": 3.01, "grad_norm": 2.1366419792175293, "learning_rate": 1.9850065189048242e-05, "loss": 1.094, "step": 37000 }, { "epoch": 3.06, "grad_norm": 3.5920097827911377, "learning_rate": 1.944263363754889e-05, "loss": 1.0943, "step": 37500 }, { "epoch": 3.1, "grad_norm": 3.099132537841797, "learning_rate": 1.9035202086049544e-05, "loss": 1.1039, "step": 38000 }, { "epoch": 3.14, "grad_norm": 2.7153160572052, "learning_rate": 1.8627770534550197e-05, "loss": 1.0928, "step": 38500 }, { "epoch": 3.18, "grad_norm": 2.752225875854492, "learning_rate": 1.8220338983050846e-05, "loss": 1.098, "step": 39000 }, { "epoch": 3.22, "grad_norm": 2.1102657318115234, "learning_rate": 1.7812907431551502e-05, "loss": 1.0968, "step": 39500 }, { "epoch": 3.26, "grad_norm": 2.7974390983581543, "learning_rate": 1.740547588005215e-05, "loss": 1.09, "step": 40000 }, { "epoch": 3.3, "grad_norm": 2.8349063396453857, "learning_rate": 1.6998044328552804e-05, "loss": 1.0862, "step": 40500 }, { "epoch": 3.34, "grad_norm": 2.360250234603882, "learning_rate": 1.6590612777053456e-05, "loss": 1.0953, "step": 41000 }, { "epoch": 3.38, "grad_norm": 3.601504325866699, "learning_rate": 1.618318122555411e-05, "loss": 1.0841, "step": 41500 }, { "epoch": 3.42, "grad_norm": 2.5209643840789795, "learning_rate": 1.577574967405476e-05, "loss": 1.0847, "step": 42000 }, { "epoch": 3.46, "grad_norm": 3.1257715225219727, "learning_rate": 1.536831812255541e-05, "loss": 1.0991, "step": 42500 }, { "epoch": 3.5, "grad_norm": 3.250303268432617, "learning_rate": 1.4960886571056063e-05, "loss": 1.098, "step": 43000 }, { "epoch": 3.54, "grad_norm": 3.4005048274993896, "learning_rate": 1.4553455019556716e-05, "loss": 1.0839, "step": 43500 }, { "epoch": 3.59, "grad_norm": 2.8816983699798584, "learning_rate": 1.4146023468057368e-05, "loss": 1.0849, "step": 44000 }, { "epoch": 3.63, "grad_norm": 2.870222568511963, "learning_rate": 1.373859191655802e-05, "loss": 1.0994, "step": 44500 }, { "epoch": 3.67, "grad_norm": 3.1101572513580322, "learning_rate": 1.333116036505867e-05, "loss": 1.0927, "step": 45000 }, { "epoch": 3.71, "grad_norm": 3.555572986602783, "learning_rate": 1.2923728813559321e-05, "loss": 1.0915, "step": 45500 }, { "epoch": 3.75, "grad_norm": 3.06459379196167, "learning_rate": 1.2516297262059975e-05, "loss": 1.0952, "step": 46000 }, { "epoch": 3.79, "grad_norm": 3.103626012802124, "learning_rate": 1.2108865710560626e-05, "loss": 1.0859, "step": 46500 }, { "epoch": 3.83, "grad_norm": 3.8208911418914795, "learning_rate": 1.1701434159061279e-05, "loss": 1.0966, "step": 47000 }, { "epoch": 3.87, "grad_norm": 2.8707382678985596, "learning_rate": 1.1294002607561931e-05, "loss": 1.0752, "step": 47500 }, { "epoch": 3.91, "grad_norm": 4.298058032989502, "learning_rate": 1.0886571056062582e-05, "loss": 1.092, "step": 48000 }, { "epoch": 3.95, "grad_norm": 2.011492967605591, "learning_rate": 1.0479139504563235e-05, "loss": 1.0789, "step": 48500 }, { "epoch": 3.99, "grad_norm": 3.2206387519836426, "learning_rate": 1.0071707953063886e-05, "loss": 1.075, "step": 49000 }, { "epoch": 4.0, "eval_accuracy": 0.5978604177279674, "eval_combined_score": 0.5978604177279674, "eval_loss": 0.9132007360458374, "eval_runtime": 3.8437, "eval_samples_per_second": 2553.526, "eval_steps_per_second": 10.146, "step": 49088 }, { "epoch": 4.03, "grad_norm": 3.0595147609710693, "learning_rate": 9.664276401564537e-06, "loss": 1.0901, "step": 49500 }, { "epoch": 4.07, "grad_norm": 2.2460098266601562, "learning_rate": 9.25684485006519e-06, "loss": 1.0802, "step": 50000 }, { "epoch": 4.12, "grad_norm": 2.1211585998535156, "learning_rate": 8.84941329856584e-06, "loss": 1.0886, "step": 50500 }, { "epoch": 4.16, "grad_norm": 2.7433323860168457, "learning_rate": 8.441981747066494e-06, "loss": 1.0814, "step": 51000 }, { "epoch": 4.2, "grad_norm": 3.3154397010803223, "learning_rate": 8.034550195567145e-06, "loss": 1.0877, "step": 51500 }, { "epoch": 4.24, "grad_norm": 3.0281894207000732, "learning_rate": 7.627118644067798e-06, "loss": 1.0773, "step": 52000 }, { "epoch": 4.28, "grad_norm": 2.553225517272949, "learning_rate": 7.219687092568449e-06, "loss": 1.0869, "step": 52500 }, { "epoch": 4.32, "grad_norm": 2.8887219429016113, "learning_rate": 6.812255541069101e-06, "loss": 1.0756, "step": 53000 }, { "epoch": 4.36, "grad_norm": 2.1609950065612793, "learning_rate": 6.404823989569752e-06, "loss": 1.0884, "step": 53500 }, { "epoch": 4.4, "grad_norm": 3.0557539463043213, "learning_rate": 5.997392438070405e-06, "loss": 1.0886, "step": 54000 }, { "epoch": 4.44, "grad_norm": 2.783965587615967, "learning_rate": 5.5899608865710565e-06, "loss": 1.0827, "step": 54500 }, { "epoch": 4.48, "grad_norm": 3.0575108528137207, "learning_rate": 5.182529335071708e-06, "loss": 1.0886, "step": 55000 }, { "epoch": 4.52, "grad_norm": 2.5954906940460205, "learning_rate": 4.77509778357236e-06, "loss": 1.0773, "step": 55500 }, { "epoch": 4.56, "grad_norm": 2.1589512825012207, "learning_rate": 4.367666232073012e-06, "loss": 1.0693, "step": 56000 }, { "epoch": 4.6, "grad_norm": 2.5202431678771973, "learning_rate": 3.9602346805736635e-06, "loss": 1.0803, "step": 56500 }, { "epoch": 4.64, "grad_norm": 2.588782787322998, "learning_rate": 3.5528031290743156e-06, "loss": 1.0816, "step": 57000 }, { "epoch": 4.69, "grad_norm": 2.994828701019287, "learning_rate": 3.1453715775749674e-06, "loss": 1.0682, "step": 57500 }, { "epoch": 4.73, "grad_norm": 2.9718551635742188, "learning_rate": 2.7379400260756195e-06, "loss": 1.0786, "step": 58000 }, { "epoch": 4.77, "grad_norm": 3.181231737136841, "learning_rate": 2.3305084745762712e-06, "loss": 1.0818, "step": 58500 }, { "epoch": 4.81, "grad_norm": 2.3970162868499756, "learning_rate": 1.9230769230769234e-06, "loss": 1.0816, "step": 59000 }, { "epoch": 4.85, "grad_norm": 2.8263609409332275, "learning_rate": 1.5156453715775751e-06, "loss": 1.0755, "step": 59500 }, { "epoch": 4.89, "grad_norm": 3.2032511234283447, "learning_rate": 1.108213820078227e-06, "loss": 1.0764, "step": 60000 }, { "epoch": 4.93, "grad_norm": 3.6272828578948975, "learning_rate": 7.007822685788788e-07, "loss": 1.0855, "step": 60500 }, { "epoch": 4.97, "grad_norm": 2.672410249710083, "learning_rate": 2.9335071707953065e-07, "loss": 1.0755, "step": 61000 }, { "epoch": 5.0, "eval_accuracy": 0.6116148751910341, "eval_combined_score": 0.6116148751910341, "eval_loss": 0.8905634880065918, "eval_runtime": 4.8808, "eval_samples_per_second": 2010.92, "eval_steps_per_second": 7.99, "step": 61360 }, { "epoch": 5.0, "step": 61360, "total_flos": 0.0, "train_loss": 1.1215235922762437, "train_runtime": 4186.3943, "train_samples_per_second": 469.022, "train_steps_per_second": 14.657 } ], "logging_steps": 500, "max_steps": 61360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }