{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.991501416430595,
  "eval_steps": 500,
  "global_step": 792,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03777148253068933,
      "grad_norm": 2.3805434235210083,
      "learning_rate": 5e-06,
      "loss": 1.0395,
      "step": 10
    },
    {
      "epoch": 0.07554296506137866,
      "grad_norm": 2.856513616647182,
      "learning_rate": 5e-06,
      "loss": 0.9048,
      "step": 20
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 1.44752023730039,
      "learning_rate": 5e-06,
      "loss": 0.8676,
      "step": 30
    },
    {
      "epoch": 0.1510859301227573,
      "grad_norm": 2.724838836111604,
      "learning_rate": 5e-06,
      "loss": 0.8452,
      "step": 40
    },
    {
      "epoch": 0.18885741265344666,
      "grad_norm": 1.5098321508092247,
      "learning_rate": 5e-06,
      "loss": 0.8261,
      "step": 50
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 1.1562207951359371,
      "learning_rate": 5e-06,
      "loss": 0.8055,
      "step": 60
    },
    {
      "epoch": 0.26440037771482533,
      "grad_norm": 1.0400869852117345,
      "learning_rate": 5e-06,
      "loss": 0.7953,
      "step": 70
    },
    {
      "epoch": 0.3021718602455146,
      "grad_norm": 0.9148701519417262,
      "learning_rate": 5e-06,
      "loss": 0.786,
      "step": 80
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 0.8722664324312153,
      "learning_rate": 5e-06,
      "loss": 0.7725,
      "step": 90
    },
    {
      "epoch": 0.3777148253068933,
      "grad_norm": 1.0427331036560754,
      "learning_rate": 5e-06,
      "loss": 0.7813,
      "step": 100
    },
    {
      "epoch": 0.4154863078375826,
      "grad_norm": 0.6602583497680015,
      "learning_rate": 5e-06,
      "loss": 0.7702,
      "step": 110
    },
    {
      "epoch": 0.45325779036827196,
      "grad_norm": 0.6261264742184848,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 120
    },
    {
      "epoch": 0.4910292728989613,
      "grad_norm": 0.8612076277718839,
      "learning_rate": 5e-06,
      "loss": 0.765,
      "step": 130
    },
    {
      "epoch": 0.5288007554296507,
      "grad_norm": 0.7081715175245555,
      "learning_rate": 5e-06,
      "loss": 0.7585,
      "step": 140
    },
    {
      "epoch": 0.56657223796034,
      "grad_norm": 0.6464092566763289,
      "learning_rate": 5e-06,
      "loss": 0.7582,
      "step": 150
    },
    {
      "epoch": 0.6043437204910292,
      "grad_norm": 0.7867913648175029,
      "learning_rate": 5e-06,
      "loss": 0.7535,
      "step": 160
    },
    {
      "epoch": 0.6421152030217187,
      "grad_norm": 0.8335496759284264,
      "learning_rate": 5e-06,
      "loss": 0.7506,
      "step": 170
    },
    {
      "epoch": 0.6798866855524079,
      "grad_norm": 0.9045739681846007,
      "learning_rate": 5e-06,
      "loss": 0.7538,
      "step": 180
    },
    {
      "epoch": 0.7176581680830972,
      "grad_norm": 0.642733706975797,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 190
    },
    {
      "epoch": 0.7554296506137866,
      "grad_norm": 0.7751098599889861,
      "learning_rate": 5e-06,
      "loss": 0.7432,
      "step": 200
    },
    {
      "epoch": 0.7932011331444759,
      "grad_norm": 0.9169213826519828,
      "learning_rate": 5e-06,
      "loss": 0.7523,
      "step": 210
    },
    {
      "epoch": 0.8309726156751652,
      "grad_norm": 0.751413803169088,
      "learning_rate": 5e-06,
      "loss": 0.7475,
      "step": 220
    },
    {
      "epoch": 0.8687440982058546,
      "grad_norm": 0.7640332235725673,
      "learning_rate": 5e-06,
      "loss": 0.7384,
      "step": 230
    },
    {
      "epoch": 0.9065155807365439,
      "grad_norm": 0.728990276372915,
      "learning_rate": 5e-06,
      "loss": 0.742,
      "step": 240
    },
    {
      "epoch": 0.9442870632672332,
      "grad_norm": 0.6854770933941848,
      "learning_rate": 5e-06,
      "loss": 0.7415,
      "step": 250
    },
    {
      "epoch": 0.9820585457979226,
      "grad_norm": 0.7112160685903344,
      "learning_rate": 5e-06,
      "loss": 0.7413,
      "step": 260
    },
    {
      "epoch": 0.9971671388101983,
      "eval_loss": 0.7381541132926941,
      "eval_runtime": 185.7861,
      "eval_samples_per_second": 38.399,
      "eval_steps_per_second": 0.603,
      "step": 264
    },
    {
      "epoch": 1.019830028328612,
      "grad_norm": 0.8806260533344807,
      "learning_rate": 5e-06,
      "loss": 0.7317,
      "step": 270
    },
    {
      "epoch": 1.0576015108593013,
      "grad_norm": 0.8463597906884003,
      "learning_rate": 5e-06,
      "loss": 0.6852,
      "step": 280
    },
    {
      "epoch": 1.0953729933899905,
      "grad_norm": 0.998627698983325,
      "learning_rate": 5e-06,
      "loss": 0.6903,
      "step": 290
    },
    {
      "epoch": 1.13314447592068,
      "grad_norm": 0.8372558817429967,
      "learning_rate": 5e-06,
      "loss": 0.684,
      "step": 300
    },
    {
      "epoch": 1.1709159584513693,
      "grad_norm": 0.6335303040398027,
      "learning_rate": 5e-06,
      "loss": 0.6899,
      "step": 310
    },
    {
      "epoch": 1.2086874409820585,
      "grad_norm": 0.8131802909561154,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 320
    },
    {
      "epoch": 1.246458923512748,
      "grad_norm": 0.8544395006729588,
      "learning_rate": 5e-06,
      "loss": 0.6876,
      "step": 330
    },
    {
      "epoch": 1.284230406043437,
      "grad_norm": 0.7496941658426767,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 340
    },
    {
      "epoch": 1.3220018885741265,
      "grad_norm": 0.6225060094101681,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 350
    },
    {
      "epoch": 1.3597733711048159,
      "grad_norm": 0.8041619905694252,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 360
    },
    {
      "epoch": 1.3975448536355053,
      "grad_norm": 0.6579082421853544,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 370
    },
    {
      "epoch": 1.4353163361661945,
      "grad_norm": 0.6914768615360496,
      "learning_rate": 5e-06,
      "loss": 0.6888,
      "step": 380
    },
    {
      "epoch": 1.4730878186968839,
      "grad_norm": 0.5488530522257256,
      "learning_rate": 5e-06,
      "loss": 0.6844,
      "step": 390
    },
    {
      "epoch": 1.510859301227573,
      "grad_norm": 0.5967231206297695,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 400
    },
    {
      "epoch": 1.5486307837582625,
      "grad_norm": 0.8534656126840098,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 410
    },
    {
      "epoch": 1.5864022662889519,
      "grad_norm": 0.8114796417385948,
      "learning_rate": 5e-06,
      "loss": 0.6892,
      "step": 420
    },
    {
      "epoch": 1.6241737488196413,
      "grad_norm": 0.7087354816053721,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 430
    },
    {
      "epoch": 1.6619452313503305,
      "grad_norm": 0.5687873916760361,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 440
    },
    {
      "epoch": 1.6997167138810199,
      "grad_norm": 0.7321157421532287,
      "learning_rate": 5e-06,
      "loss": 0.6891,
      "step": 450
    },
    {
      "epoch": 1.737488196411709,
      "grad_norm": 0.9059336248026789,
      "learning_rate": 5e-06,
      "loss": 0.6813,
      "step": 460
    },
    {
      "epoch": 1.7752596789423984,
      "grad_norm": 0.675756534213701,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 470
    },
    {
      "epoch": 1.8130311614730878,
      "grad_norm": 0.6005853815924641,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 480
    },
    {
      "epoch": 1.8508026440037773,
      "grad_norm": 0.7612226170902838,
      "learning_rate": 5e-06,
      "loss": 0.68,
      "step": 490
    },
    {
      "epoch": 1.8885741265344664,
      "grad_norm": 0.7562478445031421,
      "learning_rate": 5e-06,
      "loss": 0.6805,
      "step": 500
    },
    {
      "epoch": 1.9263456090651558,
      "grad_norm": 0.7011514213635397,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 510
    },
    {
      "epoch": 1.964117091595845,
      "grad_norm": 0.668105825093532,
      "learning_rate": 5e-06,
      "loss": 0.6815,
      "step": 520
    },
    {
      "epoch": 1.9981114258734656,
      "eval_loss": 0.7243772149085999,
      "eval_runtime": 177.4303,
      "eval_samples_per_second": 40.207,
      "eval_steps_per_second": 0.631,
      "step": 529
    },
    {
      "epoch": 2.0018885741265344,
      "grad_norm": 1.0061690465784974,
      "learning_rate": 5e-06,
      "loss": 0.7,
      "step": 530
    },
    {
      "epoch": 2.039660056657224,
      "grad_norm": 0.7877880100051793,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 540
    },
    {
      "epoch": 2.0774315391879132,
      "grad_norm": 0.793030648640271,
      "learning_rate": 5e-06,
      "loss": 0.6269,
      "step": 550
    },
    {
      "epoch": 2.1152030217186026,
      "grad_norm": 0.7589773401731925,
      "learning_rate": 5e-06,
      "loss": 0.6306,
      "step": 560
    },
    {
      "epoch": 2.1529745042492916,
      "grad_norm": 0.6507155946743034,
      "learning_rate": 5e-06,
      "loss": 0.6302,
      "step": 570
    },
    {
      "epoch": 2.190745986779981,
      "grad_norm": 0.7706192007874249,
      "learning_rate": 5e-06,
      "loss": 0.6278,
      "step": 580
    },
    {
      "epoch": 2.2285174693106704,
      "grad_norm": 0.6327752250601594,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 590
    },
    {
      "epoch": 2.26628895184136,
      "grad_norm": 0.6327979312894738,
      "learning_rate": 5e-06,
      "loss": 0.6326,
      "step": 600
    },
    {
      "epoch": 2.304060434372049,
      "grad_norm": 0.5816500449436098,
      "learning_rate": 5e-06,
      "loss": 0.6322,
      "step": 610
    },
    {
      "epoch": 2.3418319169027386,
      "grad_norm": 0.7685458410506589,
      "learning_rate": 5e-06,
      "loss": 0.632,
      "step": 620
    },
    {
      "epoch": 2.3796033994334276,
      "grad_norm": 0.7331975528938945,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 630
    },
    {
      "epoch": 2.417374881964117,
      "grad_norm": 0.6048367664881513,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 640
    },
    {
      "epoch": 2.4551463644948064,
      "grad_norm": 0.7225741561090323,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 650
    },
    {
      "epoch": 2.492917847025496,
      "grad_norm": 0.6762661427796176,
      "learning_rate": 5e-06,
      "loss": 0.6348,
      "step": 660
    },
    {
      "epoch": 2.530689329556185,
      "grad_norm": 0.6888475213512071,
      "learning_rate": 5e-06,
      "loss": 0.6335,
      "step": 670
    },
    {
      "epoch": 2.568460812086874,
      "grad_norm": 0.7082247476426633,
      "learning_rate": 5e-06,
      "loss": 0.6349,
      "step": 680
    },
    {
      "epoch": 2.6062322946175636,
      "grad_norm": 0.7648839755479956,
      "learning_rate": 5e-06,
      "loss": 0.6285,
      "step": 690
    },
    {
      "epoch": 2.644003777148253,
      "grad_norm": 0.7473110651002637,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 700
    },
    {
      "epoch": 2.6817752596789424,
      "grad_norm": 0.6406025398016005,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 710
    },
    {
      "epoch": 2.7195467422096318,
      "grad_norm": 0.6180603575164161,
      "learning_rate": 5e-06,
      "loss": 0.6397,
      "step": 720
    },
    {
      "epoch": 2.757318224740321,
      "grad_norm": 0.7188210996816503,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 730
    },
    {
      "epoch": 2.7950897072710106,
      "grad_norm": 0.7014279573066574,
      "learning_rate": 5e-06,
      "loss": 0.6394,
      "step": 740
    },
    {
      "epoch": 2.8328611898017,
      "grad_norm": 0.6189845652330979,
      "learning_rate": 5e-06,
      "loss": 0.6321,
      "step": 750
    },
    {
      "epoch": 2.870632672332389,
      "grad_norm": 0.6750791101100844,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 760
    },
    {
      "epoch": 2.9084041548630784,
      "grad_norm": 0.645156423238826,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 770
    },
    {
      "epoch": 2.9461756373937678,
      "grad_norm": 0.7328592259216773,
      "learning_rate": 5e-06,
      "loss": 0.6412,
      "step": 780
    },
    {
      "epoch": 2.983947119924457,
      "grad_norm": 0.5893339209675429,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 790
    },
    {
      "epoch": 2.991501416430595,
      "eval_loss": 0.7273694276809692,
      "eval_runtime": 177.6532,
      "eval_samples_per_second": 40.157,
      "eval_steps_per_second": 0.63,
      "step": 792
    },
    {
      "epoch": 2.991501416430595,
      "step": 792,
      "total_flos": 1326420118732800.0,
      "train_loss": 0.7023529005472107,
      "train_runtime": 26189.6442,
      "train_samples_per_second": 15.525,
      "train_steps_per_second": 0.03
    }
  ],
  "logging_steps": 10,
  "max_steps": 792,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1326420118732800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}