| { |
| "best_metric": 0.5003042221069336, |
| "best_model_checkpoint": "./t5_checkpoints_full/checkpoint-1000", |
| "epoch": 0.7841292244961969, |
| "eval_steps": 1000, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01, |
| "learning_rate": 2.88e-05, |
| "loss": 12.5022, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.02, |
| "learning_rate": 5.82e-05, |
| "loss": 10.3469, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.02, |
| "learning_rate": 8.819999999999999e-05, |
| "loss": 4.02, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03, |
| "learning_rate": 0.0001182, |
| "loss": 0.9201, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04, |
| "learning_rate": 0.0001482, |
| "loss": 0.7357, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.05, |
| "learning_rate": 0.00017699999999999997, |
| "loss": 0.6602, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.05, |
| "learning_rate": 0.00020699999999999996, |
| "loss": 0.6121, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.06, |
| "learning_rate": 0.000237, |
| "loss": 0.5817, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.07, |
| "learning_rate": 0.000267, |
| "loss": 0.5916, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.08, |
| "learning_rate": 0.00029699999999999996, |
| "loss": 0.5675, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.09, |
| "learning_rate": 0.0002992913893064204, |
| "loss": 0.57, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.09, |
| "learning_rate": 0.0002984861498818982, |
| "loss": 0.561, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1, |
| "learning_rate": 0.000297680910457376, |
| "loss": 0.5669, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.11, |
| "learning_rate": 0.00029687567103285373, |
| "loss": 0.5659, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.12, |
| "learning_rate": 0.0002960704316083315, |
| "loss": 0.5673, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 0.0002952651921838093, |
| "loss": 0.5619, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.13, |
| "learning_rate": 0.00029449216233626795, |
| "loss": 0.5719, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.14, |
| "learning_rate": 0.00029368692291174576, |
| "loss": 0.5576, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.15, |
| "learning_rate": 0.0002928816834872235, |
| "loss": 0.5567, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.16, |
| "learning_rate": 0.00029210865363968216, |
| "loss": 0.5597, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 0.5003042221069336, |
| "eval_runtime": 95.3235, |
| "eval_samples_per_second": 118.879, |
| "eval_steps_per_second": 7.438, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.16, |
| "learning_rate": 0.0002913195190036504, |
| "loss": 0.5565, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.17, |
| "learning_rate": 0.00029057869873308994, |
| "loss": 0.5767, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.18, |
| "learning_rate": 0.00028978956409705817, |
| "loss": 0.562, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.19, |
| "learning_rate": 0.0002890004294610264, |
| "loss": 0.5864, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.2, |
| "learning_rate": 0.00028819519003650415, |
| "loss": 0.626, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.2, |
| "learning_rate": 0.0002873899506119819, |
| "loss": 0.7742, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.21, |
| "learning_rate": 0.0002866169207644406, |
| "loss": 1.1101, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.22, |
| "learning_rate": 0.00028581168133991837, |
| "loss": 1.3211, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.23, |
| "learning_rate": 0.0002850064419153961, |
| "loss": 1.413, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.24, |
| "learning_rate": 0.00028420120249087393, |
| "loss": 1.4265, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.24, |
| "learning_rate": 0.0002834281726433326, |
| "loss": 1.47, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.25, |
| "learning_rate": 0.00028262293321881034, |
| "loss": 1.4561, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.26, |
| "learning_rate": 0.0002818337985827786, |
| "loss": 1.4693, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 0.0002810285591582563, |
| "loss": 1.4729, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.27, |
| "learning_rate": 0.00028022331973373413, |
| "loss": 1.4599, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.28, |
| "learning_rate": 0.0002794180803092119, |
| "loss": 1.4725, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.29, |
| "learning_rate": 0.0002786128408846897, |
| "loss": 1.4503, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.3, |
| "learning_rate": 0.0002778237062486579, |
| "loss": 1.4812, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.31, |
| "learning_rate": 0.0002770184668241357, |
| "loss": 1.4761, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.31, |
| "learning_rate": 0.00027621322739961344, |
| "loss": 1.496, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.31, |
| "eval_loss": 1.251204252243042, |
| "eval_runtime": 94.8348, |
| "eval_samples_per_second": 119.492, |
| "eval_steps_per_second": 7.476, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.32, |
| "learning_rate": 0.00027542409276358167, |
| "loss": 1.4488, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.33, |
| "learning_rate": 0.00027461885333905943, |
| "loss": 1.455, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.34, |
| "learning_rate": 0.00027381361391453724, |
| "loss": 1.4353, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.35, |
| "learning_rate": 0.000273008374490015, |
| "loss": 1.4524, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.35, |
| "learning_rate": 0.0002722192398539832, |
| "loss": 1.4701, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.36, |
| "learning_rate": 0.000271414000429461, |
| "loss": 1.4734, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.37, |
| "learning_rate": 0.0002706409705819197, |
| "loss": 1.5035, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 0.00026983573115739744, |
| "loss": 1.4513, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.38, |
| "learning_rate": 0.00026904659652136567, |
| "loss": 1.4641, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.39, |
| "learning_rate": 0.0002682413570968434, |
| "loss": 1.4585, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.4, |
| "learning_rate": 0.00026743611767232123, |
| "loss": 1.4673, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.41, |
| "learning_rate": 0.0002666469830362894, |
| "loss": 1.4671, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 0.0002658578484002577, |
| "loss": 1.4702, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.42, |
| "learning_rate": 0.00026508481855271634, |
| "loss": 1.4612, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.43, |
| "learning_rate": 0.0002642956839166845, |
| "loss": 1.4713, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.44, |
| "learning_rate": 0.00026350654928065275, |
| "loss": 1.4573, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.45, |
| "learning_rate": 0.000262717414644621, |
| "loss": 1.4586, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.45, |
| "learning_rate": 0.00026191217522009873, |
| "loss": 1.4674, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.46, |
| "learning_rate": 0.00026110693579557654, |
| "loss": 1.4466, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.47, |
| "learning_rate": 0.0002603339059480352, |
| "loss": 1.4897, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.47, |
| "eval_loss": 1.2417596578598022, |
| "eval_runtime": 94.8105, |
| "eval_samples_per_second": 119.523, |
| "eval_steps_per_second": 7.478, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.48, |
| "learning_rate": 0.0002595447713120034, |
| "loss": 1.4621, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.49, |
| "learning_rate": 0.0002587395318874812, |
| "loss": 1.4443, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.49, |
| "learning_rate": 0.0002579503972514494, |
| "loss": 1.4314, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.5, |
| "learning_rate": 0.0002571451578269272, |
| "loss": 1.4172, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.51, |
| "learning_rate": 0.00025637212797938587, |
| "loss": 1.4878, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.52, |
| "learning_rate": 0.00025558299334335404, |
| "loss": 1.4344, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.53, |
| "learning_rate": 0.00025477775391883185, |
| "loss": 1.4634, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.53, |
| "learning_rate": 0.0002539725144943096, |
| "loss": 1.4679, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.54, |
| "learning_rate": 0.00025318337985827784, |
| "loss": 1.4641, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.55, |
| "learning_rate": 0.00025239424522224607, |
| "loss": 1.4396, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.56, |
| "learning_rate": 0.00025160511058621425, |
| "loss": 1.485, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.56, |
| "learning_rate": 0.0002508159759501825, |
| "loss": 1.4355, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.57, |
| "learning_rate": 0.0002500107365256603, |
| "loss": 1.4419, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.58, |
| "learning_rate": 0.00024920549710113804, |
| "loss": 1.4224, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.59, |
| "learning_rate": 0.0002484002576766158, |
| "loss": 1.4473, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.6, |
| "learning_rate": 0.0002475950182520936, |
| "loss": 1.4341, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.6, |
| "learning_rate": 0.00024678977882757136, |
| "loss": 1.4463, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.61, |
| "learning_rate": 0.00024598453940304917, |
| "loss": 1.4348, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.62, |
| "learning_rate": 0.00024517929997852693, |
| "loss": 1.4326, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.63, |
| "learning_rate": 0.00024439016534249516, |
| "loss": 1.4586, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.63, |
| "eval_loss": 1.2329678535461426, |
| "eval_runtime": 94.8153, |
| "eval_samples_per_second": 119.517, |
| "eval_steps_per_second": 7.478, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.64, |
| "learning_rate": 0.00024360103070646336, |
| "loss": 1.4624, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.64, |
| "learning_rate": 0.00024282800085892204, |
| "loss": 1.455, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.65, |
| "learning_rate": 0.00024202276143439982, |
| "loss": 1.4294, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.66, |
| "learning_rate": 0.00024124973158685847, |
| "loss": 1.4675, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.67, |
| "learning_rate": 0.00024044449216233625, |
| "loss": 1.432, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.67, |
| "learning_rate": 0.00023963925273781404, |
| "loss": 1.4357, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.68, |
| "learning_rate": 0.0002388340133132918, |
| "loss": 1.4419, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.69, |
| "learning_rate": 0.00023802877388876957, |
| "loss": 1.4272, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.7, |
| "learning_rate": 0.00023722353446424736, |
| "loss": 1.4133, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.71, |
| "learning_rate": 0.0002364182950397251, |
| "loss": 1.434, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.71, |
| "learning_rate": 0.0002356130556152029, |
| "loss": 1.4218, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.72, |
| "learning_rate": 0.0002348239209791711, |
| "loss": 1.4682, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.73, |
| "learning_rate": 0.00023401868155464888, |
| "loss": 1.4333, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.74, |
| "learning_rate": 0.00023321344213012666, |
| "loss": 1.4359, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.74, |
| "learning_rate": 0.00023240820270560445, |
| "loss": 1.4054, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.75, |
| "learning_rate": 0.00023160296328108223, |
| "loss": 1.4215, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.76, |
| "learning_rate": 0.00023081382864505043, |
| "loss": 1.4471, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.77, |
| "learning_rate": 0.00023000858922052821, |
| "loss": 1.4238, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.78, |
| "learning_rate": 0.000229203349796006, |
| "loss": 1.4218, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.78, |
| "learning_rate": 0.00022839811037148378, |
| "loss": 1.4419, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.78, |
| "eval_loss": 1.22481107711792, |
| "eval_runtime": 95.042, |
| "eval_samples_per_second": 119.231, |
| "eval_steps_per_second": 7.46, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 19128, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 1000, |
| "total_flos": 9.830299336704e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|