{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.991501416430595, "eval_steps": 500, "global_step": 792, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03777148253068933, "grad_norm": 2.3805434235210083, "learning_rate": 5e-06, "loss": 1.0395, "step": 10 }, { "epoch": 0.07554296506137866, "grad_norm": 2.856513616647182, "learning_rate": 5e-06, "loss": 0.9048, "step": 20 }, { "epoch": 0.11331444759206799, "grad_norm": 1.44752023730039, "learning_rate": 5e-06, "loss": 0.8676, "step": 30 }, { "epoch": 0.1510859301227573, "grad_norm": 2.724838836111604, "learning_rate": 5e-06, "loss": 0.8452, "step": 40 }, { "epoch": 0.18885741265344666, "grad_norm": 1.5098321508092247, "learning_rate": 5e-06, "loss": 0.8261, "step": 50 }, { "epoch": 0.22662889518413598, "grad_norm": 1.1562207951359371, "learning_rate": 5e-06, "loss": 0.8055, "step": 60 }, { "epoch": 0.26440037771482533, "grad_norm": 1.0400869852117345, "learning_rate": 5e-06, "loss": 0.7953, "step": 70 }, { "epoch": 0.3021718602455146, "grad_norm": 0.9148701519417262, "learning_rate": 5e-06, "loss": 0.786, "step": 80 }, { "epoch": 0.33994334277620397, "grad_norm": 0.8722664324312153, "learning_rate": 5e-06, "loss": 0.7725, "step": 90 }, { "epoch": 0.3777148253068933, "grad_norm": 1.0427331036560754, "learning_rate": 5e-06, "loss": 0.7813, "step": 100 }, { "epoch": 0.4154863078375826, "grad_norm": 0.6602583497680015, "learning_rate": 5e-06, "loss": 0.7702, "step": 110 }, { "epoch": 0.45325779036827196, "grad_norm": 0.6261264742184848, "learning_rate": 5e-06, "loss": 0.7601, "step": 120 }, { "epoch": 0.4910292728989613, "grad_norm": 0.8612076277718839, "learning_rate": 5e-06, "loss": 0.765, "step": 130 }, { "epoch": 0.5288007554296507, "grad_norm": 0.7081715175245555, "learning_rate": 5e-06, "loss": 0.7585, "step": 140 }, { "epoch": 0.56657223796034, "grad_norm": 0.6464092566763289, "learning_rate": 5e-06, "loss": 0.7582, "step": 150 }, { "epoch": 0.6043437204910292, "grad_norm": 0.7867913648175029, "learning_rate": 5e-06, "loss": 0.7535, "step": 160 }, { "epoch": 0.6421152030217187, "grad_norm": 0.8335496759284264, "learning_rate": 5e-06, "loss": 0.7506, "step": 170 }, { "epoch": 0.6798866855524079, "grad_norm": 0.9045739681846007, "learning_rate": 5e-06, "loss": 0.7538, "step": 180 }, { "epoch": 0.7176581680830972, "grad_norm": 0.642733706975797, "learning_rate": 5e-06, "loss": 0.7512, "step": 190 }, { "epoch": 0.7554296506137866, "grad_norm": 0.7751098599889861, "learning_rate": 5e-06, "loss": 0.7432, "step": 200 }, { "epoch": 0.7932011331444759, "grad_norm": 0.9169213826519828, "learning_rate": 5e-06, "loss": 0.7523, "step": 210 }, { "epoch": 0.8309726156751652, "grad_norm": 0.751413803169088, "learning_rate": 5e-06, "loss": 0.7475, "step": 220 }, { "epoch": 0.8687440982058546, "grad_norm": 0.7640332235725673, "learning_rate": 5e-06, "loss": 0.7384, "step": 230 }, { "epoch": 0.9065155807365439, "grad_norm": 0.728990276372915, "learning_rate": 5e-06, "loss": 0.742, "step": 240 }, { "epoch": 0.9442870632672332, "grad_norm": 0.6854770933941848, "learning_rate": 5e-06, "loss": 0.7415, "step": 250 }, { "epoch": 0.9820585457979226, "grad_norm": 0.7112160685903344, "learning_rate": 5e-06, "loss": 0.7413, "step": 260 }, { "epoch": 0.9971671388101983, "eval_loss": 0.7381541132926941, "eval_runtime": 185.7861, "eval_samples_per_second": 38.399, "eval_steps_per_second": 0.603, "step": 264 }, { "epoch": 1.019830028328612, "grad_norm": 0.8806260533344807, "learning_rate": 5e-06, "loss": 0.7317, "step": 270 }, { "epoch": 1.0576015108593013, "grad_norm": 0.8463597906884003, "learning_rate": 5e-06, "loss": 0.6852, "step": 280 }, { "epoch": 1.0953729933899905, "grad_norm": 0.998627698983325, "learning_rate": 5e-06, "loss": 0.6903, "step": 290 }, { "epoch": 1.13314447592068, "grad_norm": 0.8372558817429967, "learning_rate": 5e-06, "loss": 0.684, "step": 300 }, { "epoch": 1.1709159584513693, "grad_norm": 0.6335303040398027, "learning_rate": 5e-06, "loss": 0.6899, "step": 310 }, { "epoch": 1.2086874409820585, "grad_norm": 0.8131802909561154, "learning_rate": 5e-06, "loss": 0.6886, "step": 320 }, { "epoch": 1.246458923512748, "grad_norm": 0.8544395006729588, "learning_rate": 5e-06, "loss": 0.6876, "step": 330 }, { "epoch": 1.284230406043437, "grad_norm": 0.7496941658426767, "learning_rate": 5e-06, "loss": 0.6841, "step": 340 }, { "epoch": 1.3220018885741265, "grad_norm": 0.6225060094101681, "learning_rate": 5e-06, "loss": 0.6875, "step": 350 }, { "epoch": 1.3597733711048159, "grad_norm": 0.8041619905694252, "learning_rate": 5e-06, "loss": 0.6835, "step": 360 }, { "epoch": 1.3975448536355053, "grad_norm": 0.6579082421853544, "learning_rate": 5e-06, "loss": 0.6839, "step": 370 }, { "epoch": 1.4353163361661945, "grad_norm": 0.6914768615360496, "learning_rate": 5e-06, "loss": 0.6888, "step": 380 }, { "epoch": 1.4730878186968839, "grad_norm": 0.5488530522257256, "learning_rate": 5e-06, "loss": 0.6844, "step": 390 }, { "epoch": 1.510859301227573, "grad_norm": 0.5967231206297695, "learning_rate": 5e-06, "loss": 0.6835, "step": 400 }, { "epoch": 1.5486307837582625, "grad_norm": 0.8534656126840098, "learning_rate": 5e-06, "loss": 0.6894, "step": 410 }, { "epoch": 1.5864022662889519, "grad_norm": 0.8114796417385948, "learning_rate": 5e-06, "loss": 0.6892, "step": 420 }, { "epoch": 1.6241737488196413, "grad_norm": 0.7087354816053721, "learning_rate": 5e-06, "loss": 0.6854, "step": 430 }, { "epoch": 1.6619452313503305, "grad_norm": 0.5687873916760361, "learning_rate": 5e-06, "loss": 0.6874, "step": 440 }, { "epoch": 1.6997167138810199, "grad_norm": 0.7321157421532287, "learning_rate": 5e-06, "loss": 0.6891, "step": 450 }, { "epoch": 1.737488196411709, "grad_norm": 0.9059336248026789, "learning_rate": 5e-06, "loss": 0.6813, "step": 460 }, { "epoch": 1.7752596789423984, "grad_norm": 0.675756534213701, "learning_rate": 5e-06, "loss": 0.6835, "step": 470 }, { "epoch": 1.8130311614730878, "grad_norm": 0.6005853815924641, "learning_rate": 5e-06, "loss": 0.6835, "step": 480 }, { "epoch": 1.8508026440037773, "grad_norm": 0.7612226170902838, "learning_rate": 5e-06, "loss": 0.68, "step": 490 }, { "epoch": 1.8885741265344664, "grad_norm": 0.7562478445031421, "learning_rate": 5e-06, "loss": 0.6805, "step": 500 }, { "epoch": 1.9263456090651558, "grad_norm": 0.7011514213635397, "learning_rate": 5e-06, "loss": 0.6821, "step": 510 }, { "epoch": 1.964117091595845, "grad_norm": 0.668105825093532, "learning_rate": 5e-06, "loss": 0.6815, "step": 520 }, { "epoch": 1.9981114258734656, "eval_loss": 0.7243772149085999, "eval_runtime": 177.4303, "eval_samples_per_second": 40.207, "eval_steps_per_second": 0.631, "step": 529 }, { "epoch": 2.0018885741265344, "grad_norm": 1.0061690465784974, "learning_rate": 5e-06, "loss": 0.7, "step": 530 }, { "epoch": 2.039660056657224, "grad_norm": 0.7877880100051793, "learning_rate": 5e-06, "loss": 0.6295, "step": 540 }, { "epoch": 2.0774315391879132, "grad_norm": 0.793030648640271, "learning_rate": 5e-06, "loss": 0.6269, "step": 550 }, { "epoch": 2.1152030217186026, "grad_norm": 0.7589773401731925, "learning_rate": 5e-06, "loss": 0.6306, "step": 560 }, { "epoch": 2.1529745042492916, "grad_norm": 0.6507155946743034, "learning_rate": 5e-06, "loss": 0.6302, "step": 570 }, { "epoch": 2.190745986779981, "grad_norm": 0.7706192007874249, "learning_rate": 5e-06, "loss": 0.6278, "step": 580 }, { "epoch": 2.2285174693106704, "grad_norm": 0.6327752250601594, "learning_rate": 5e-06, "loss": 0.6364, "step": 590 }, { "epoch": 2.26628895184136, "grad_norm": 0.6327979312894738, "learning_rate": 5e-06, "loss": 0.6326, "step": 600 }, { "epoch": 2.304060434372049, "grad_norm": 0.5816500449436098, "learning_rate": 5e-06, "loss": 0.6322, "step": 610 }, { "epoch": 2.3418319169027386, "grad_norm": 0.7685458410506589, "learning_rate": 5e-06, "loss": 0.632, "step": 620 }, { "epoch": 2.3796033994334276, "grad_norm": 0.7331975528938945, "learning_rate": 5e-06, "loss": 0.6359, "step": 630 }, { "epoch": 2.417374881964117, "grad_norm": 0.6048367664881513, "learning_rate": 5e-06, "loss": 0.6351, "step": 640 }, { "epoch": 2.4551463644948064, "grad_norm": 0.7225741561090323, "learning_rate": 5e-06, "loss": 0.6304, "step": 650 }, { "epoch": 2.492917847025496, "grad_norm": 0.6762661427796176, "learning_rate": 5e-06, "loss": 0.6348, "step": 660 }, { "epoch": 2.530689329556185, "grad_norm": 0.6888475213512071, "learning_rate": 5e-06, "loss": 0.6335, "step": 670 }, { "epoch": 2.568460812086874, "grad_norm": 0.7082247476426633, "learning_rate": 5e-06, "loss": 0.6349, "step": 680 }, { "epoch": 2.6062322946175636, "grad_norm": 0.7648839755479956, "learning_rate": 5e-06, "loss": 0.6285, "step": 690 }, { "epoch": 2.644003777148253, "grad_norm": 0.7473110651002637, "learning_rate": 5e-06, "loss": 0.6341, "step": 700 }, { "epoch": 2.6817752596789424, "grad_norm": 0.6406025398016005, "learning_rate": 5e-06, "loss": 0.6393, "step": 710 }, { "epoch": 2.7195467422096318, "grad_norm": 0.6180603575164161, "learning_rate": 5e-06, "loss": 0.6397, "step": 720 }, { "epoch": 2.757318224740321, "grad_norm": 0.7188210996816503, "learning_rate": 5e-06, "loss": 0.6356, "step": 730 }, { "epoch": 2.7950897072710106, "grad_norm": 0.7014279573066574, "learning_rate": 5e-06, "loss": 0.6394, "step": 740 }, { "epoch": 2.8328611898017, "grad_norm": 0.6189845652330979, "learning_rate": 5e-06, "loss": 0.6321, "step": 750 }, { "epoch": 2.870632672332389, "grad_norm": 0.6750791101100844, "learning_rate": 5e-06, "loss": 0.6351, "step": 760 }, { "epoch": 2.9084041548630784, "grad_norm": 0.645156423238826, "learning_rate": 5e-06, "loss": 0.6333, "step": 770 }, { "epoch": 2.9461756373937678, "grad_norm": 0.7328592259216773, "learning_rate": 5e-06, "loss": 0.6412, "step": 780 }, { "epoch": 2.983947119924457, "grad_norm": 0.5893339209675429, "learning_rate": 5e-06, "loss": 0.6392, "step": 790 }, { "epoch": 2.991501416430595, "eval_loss": 0.7273694276809692, "eval_runtime": 177.6532, "eval_samples_per_second": 40.157, "eval_steps_per_second": 0.63, "step": 792 }, { "epoch": 2.991501416430595, "step": 792, "total_flos": 1326420118732800.0, "train_loss": 0.7023529005472107, "train_runtime": 26189.6442, "train_samples_per_second": 15.525, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 792, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1326420118732800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }