{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 722, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027710426047800486, "grad_norm": 9.779006004333496, "learning_rate": 2.6027397260273973e-06, "loss": 1.6182, "step": 20 }, { "epoch": 0.05542085209560097, "grad_norm": 1.9644428491592407, "learning_rate": 5.342465753424658e-06, "loss": 0.9549, "step": 40 }, { "epoch": 0.08313127814340146, "grad_norm": 1.7376688718795776, "learning_rate": 8.082191780821919e-06, "loss": 0.7816, "step": 60 }, { "epoch": 0.11084170419120194, "grad_norm": 3.079080104827881, "learning_rate": 9.997891263419896e-06, "loss": 0.6767, "step": 80 }, { "epoch": 0.13855213023900242, "grad_norm": 1.5074495077133179, "learning_rate": 9.960452074303327e-06, "loss": 0.5472, "step": 100 }, { "epoch": 0.16626255628680292, "grad_norm": 1.528727412223816, "learning_rate": 9.876555756875807e-06, "loss": 0.4724, "step": 120 }, { "epoch": 0.1939729823346034, "grad_norm": 1.4423059225082397, "learning_rate": 9.746988042341907e-06, "loss": 0.4629, "step": 140 }, { "epoch": 0.22168340838240388, "grad_norm": 1.143700122833252, "learning_rate": 9.57296239750846e-06, "loss": 0.4399, "step": 160 }, { "epoch": 0.24939383443020435, "grad_norm": 1.106168508529663, "learning_rate": 9.356108660057662e-06, "loss": 0.4209, "step": 180 }, { "epoch": 0.27710426047800485, "grad_norm": 1.386186957359314, "learning_rate": 9.09845777429752e-06, "loss": 0.4259, "step": 200 }, { "epoch": 0.30481468652580535, "grad_norm": 1.0028117895126343, "learning_rate": 8.802422770347044e-06, "loss": 0.4076, "step": 220 }, { "epoch": 0.33252511257360584, "grad_norm": 0.9471271634101868, "learning_rate": 8.47077616489565e-06, "loss": 0.4109, "step": 240 }, { "epoch": 0.3602355386214063, "grad_norm": 0.974446177482605, "learning_rate": 8.106623995190058e-06, "loss": 0.4002, "step": 260 }, { "epoch": 0.3879459646692068, "grad_norm": 0.9693788290023804, "learning_rate": 7.71337672943343e-06, "loss": 0.4071, "step": 280 }, { "epoch": 0.4156563907170073, "grad_norm": 0.9601064324378967, "learning_rate": 7.294717326035508e-06, "loss": 0.4062, "step": 300 }, { "epoch": 0.44336681676480777, "grad_norm": 1.0168567895889282, "learning_rate": 6.854566740854932e-06, "loss": 0.3955, "step": 320 }, { "epoch": 0.47107724281260827, "grad_norm": 1.0382839441299438, "learning_rate": 6.397047205475757e-06, "loss": 0.3957, "step": 340 }, { "epoch": 0.4987876688604087, "grad_norm": 0.986349880695343, "learning_rate": 5.926443620435572e-06, "loss": 0.3974, "step": 360 }, { "epoch": 0.5264980949082092, "grad_norm": 0.9706918001174927, "learning_rate": 5.447163424977076e-06, "loss": 0.395, "step": 380 }, { "epoch": 0.5542085209560097, "grad_norm": 0.882785439491272, "learning_rate": 4.963695319163041e-06, "loss": 0.4023, "step": 400 }, { "epoch": 0.5819189470038102, "grad_norm": 0.9751160144805908, "learning_rate": 4.480567224942845e-06, "loss": 0.3917, "step": 420 }, { "epoch": 0.6096293730516107, "grad_norm": 0.9817010164260864, "learning_rate": 4.002303879886288e-06, "loss": 0.3938, "step": 440 }, { "epoch": 0.6373397990994112, "grad_norm": 0.929880678653717, "learning_rate": 3.5333844607407497e-06, "loss": 0.3935, "step": 460 }, { "epoch": 0.6650502251472117, "grad_norm": 0.8725020885467529, "learning_rate": 3.078200633688352e-06, "loss": 0.3805, "step": 480 }, { "epoch": 0.6927606511950122, "grad_norm": 0.8537760376930237, "learning_rate": 2.6410154241835663e-06, "loss": 0.3809, "step": 500 }, { "epoch": 0.7204710772428126, "grad_norm": 0.9811424016952515, "learning_rate": 2.22592329157594e-06, "loss": 0.3745, "step": 520 }, { "epoch": 0.7481815032906131, "grad_norm": 0.9665875434875488, "learning_rate": 1.8368117824391623e-06, "loss": 0.3809, "step": 540 }, { "epoch": 0.7758919293384136, "grad_norm": 0.9251068234443665, "learning_rate": 1.4773251217423424e-06, "loss": 0.3839, "step": 560 }, { "epoch": 0.803602355386214, "grad_norm": 0.9169190526008606, "learning_rate": 1.1508300828504682e-06, "loss": 0.3684, "step": 580 }, { "epoch": 0.8313127814340145, "grad_norm": 0.9597139954566956, "learning_rate": 8.603844559986823e-07, "loss": 0.3814, "step": 600 }, { "epoch": 0.859023207481815, "grad_norm": 1.081009864807129, "learning_rate": 6.087084105489449e-07, "loss": 0.3864, "step": 620 }, { "epoch": 0.8867336335296155, "grad_norm": 0.9979385733604431, "learning_rate": 3.9815901923598354e-07, "loss": 0.377, "step": 640 }, { "epoch": 0.914444059577416, "grad_norm": 0.9564194679260254, "learning_rate": 2.3070818299573972e-07, "loss": 0.3775, "step": 660 }, { "epoch": 0.9421544856252165, "grad_norm": 0.9237750172615051, "learning_rate": 1.0792416312143172e-07, "loss": 0.3817, "step": 680 }, { "epoch": 0.969864911673017, "grad_norm": 0.8861544728279114, "learning_rate": 3.095689370785249e-08, "loss": 0.3707, "step": 700 }, { "epoch": 0.9975753377208174, "grad_norm": 0.9864802956581116, "learning_rate": 5.272119402693898e-10, "loss": 0.3693, "step": 720 }, { "epoch": 1.0, "step": 722, "total_flos": 7.079433197162856e+18, "train_loss": 0.4693530139995744, "train_runtime": 17945.1855, "train_samples_per_second": 5.148, "train_steps_per_second": 0.04 } ], "logging_steps": 20, "max_steps": 722, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.079433197162856e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }