{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9927360774818403, "eval_steps": 500, "global_step": 309, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09685230024213075, "grad_norm": 1.6106698543432953, "learning_rate": 5e-06, "loss": 0.649, "step": 10 }, { "epoch": 0.1937046004842615, "grad_norm": 0.9911008535692477, "learning_rate": 5e-06, "loss": 0.5864, "step": 20 }, { "epoch": 0.29055690072639223, "grad_norm": 0.6386745349148736, "learning_rate": 5e-06, "loss": 0.5582, "step": 30 }, { "epoch": 0.387409200968523, "grad_norm": 1.0205474281151061, "learning_rate": 5e-06, "loss": 0.5464, "step": 40 }, { "epoch": 0.48426150121065376, "grad_norm": 0.741493469056709, "learning_rate": 5e-06, "loss": 0.5311, "step": 50 }, { "epoch": 0.5811138014527845, "grad_norm": 0.7786501028026512, "learning_rate": 5e-06, "loss": 0.5181, "step": 60 }, { "epoch": 0.6779661016949152, "grad_norm": 0.6711455522499474, "learning_rate": 5e-06, "loss": 0.5158, "step": 70 }, { "epoch": 0.774818401937046, "grad_norm": 0.6218873174772125, "learning_rate": 5e-06, "loss": 0.5126, "step": 80 }, { "epoch": 0.8716707021791767, "grad_norm": 0.6652769231147759, "learning_rate": 5e-06, "loss": 0.5105, "step": 90 }, { "epoch": 0.9685230024213075, "grad_norm": 1.4354098566929865, "learning_rate": 5e-06, "loss": 0.5082, "step": 100 }, { "epoch": 0.9975786924939467, "eval_loss": 0.49066221714019775, "eval_runtime": 69.4631, "eval_samples_per_second": 40.021, "eval_steps_per_second": 0.633, "step": 103 }, { "epoch": 1.0653753026634383, "grad_norm": 0.8075871605198771, "learning_rate": 5e-06, "loss": 0.5159, "step": 110 }, { "epoch": 1.162227602905569, "grad_norm": 0.7654895903052866, "learning_rate": 5e-06, "loss": 0.4583, "step": 120 }, { "epoch": 1.2590799031476998, "grad_norm": 0.47351004510337863, "learning_rate": 5e-06, "loss": 0.4586, "step": 130 }, { "epoch": 1.3559322033898304, "grad_norm": 0.5062829494154636, "learning_rate": 5e-06, "loss": 0.4572, "step": 140 }, { "epoch": 1.4527845036319613, "grad_norm": 0.6119092771725125, "learning_rate": 5e-06, "loss": 0.4544, "step": 150 }, { "epoch": 1.549636803874092, "grad_norm": 0.6212058614890003, "learning_rate": 5e-06, "loss": 0.4561, "step": 160 }, { "epoch": 1.6464891041162226, "grad_norm": 0.5105359500584984, "learning_rate": 5e-06, "loss": 0.4518, "step": 170 }, { "epoch": 1.7433414043583535, "grad_norm": 0.5867880979483323, "learning_rate": 5e-06, "loss": 0.4551, "step": 180 }, { "epoch": 1.8401937046004844, "grad_norm": 0.4498960324504211, "learning_rate": 5e-06, "loss": 0.454, "step": 190 }, { "epoch": 1.937046004842615, "grad_norm": 0.5182866069406472, "learning_rate": 5e-06, "loss": 0.4499, "step": 200 }, { "epoch": 1.9951573849878934, "eval_loss": 0.47824251651763916, "eval_runtime": 71.5938, "eval_samples_per_second": 38.83, "eval_steps_per_second": 0.615, "step": 206 }, { "epoch": 2.0338983050847457, "grad_norm": 0.9414090883543634, "learning_rate": 5e-06, "loss": 0.4671, "step": 210 }, { "epoch": 2.1307506053268765, "grad_norm": 0.5171048417889069, "learning_rate": 5e-06, "loss": 0.4066, "step": 220 }, { "epoch": 2.2276029055690074, "grad_norm": 0.5123629438372025, "learning_rate": 5e-06, "loss": 0.4113, "step": 230 }, { "epoch": 2.324455205811138, "grad_norm": 0.5363285052863767, "learning_rate": 5e-06, "loss": 0.4081, "step": 240 }, { "epoch": 2.4213075060532687, "grad_norm": 0.4907788960865576, "learning_rate": 5e-06, "loss": 0.407, "step": 250 }, { "epoch": 2.5181598062953996, "grad_norm": 0.507228977380475, "learning_rate": 5e-06, "loss": 0.4051, "step": 260 }, { "epoch": 2.61501210653753, "grad_norm": 0.4923140802099653, "learning_rate": 5e-06, "loss": 0.4109, "step": 270 }, { "epoch": 2.711864406779661, "grad_norm": 0.5763086112386324, "learning_rate": 5e-06, "loss": 0.3986, "step": 280 }, { "epoch": 2.8087167070217918, "grad_norm": 0.4788239568139877, "learning_rate": 5e-06, "loss": 0.4115, "step": 290 }, { "epoch": 2.9055690072639226, "grad_norm": 0.5281993404834231, "learning_rate": 5e-06, "loss": 0.4124, "step": 300 }, { "epoch": 2.9927360774818403, "eval_loss": 0.47956007719039917, "eval_runtime": 68.3404, "eval_samples_per_second": 40.679, "eval_steps_per_second": 0.644, "step": 309 }, { "epoch": 2.9927360774818403, "step": 309, "total_flos": 517377129185280.0, "train_loss": 0.4712127952513957, "train_runtime": 10324.3717, "train_samples_per_second": 15.347, "train_steps_per_second": 0.03 } ], "logging_steps": 10, "max_steps": 309, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 517377129185280.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }