{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 21720, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04604051565377532, "grad_norm": 4.105458736419678, "learning_rate": 2.930939226519337e-05, "loss": 0.7338, "step": 500 }, { "epoch": 0.09208103130755065, "grad_norm": 6.632126331329346, "learning_rate": 2.861878453038674e-05, "loss": 0.6214, "step": 1000 }, { "epoch": 0.13812154696132597, "grad_norm": 5.542463302612305, "learning_rate": 2.7928176795580113e-05, "loss": 0.6065, "step": 1500 }, { "epoch": 0.1841620626151013, "grad_norm": 8.155370712280273, "learning_rate": 2.7237569060773482e-05, "loss": 0.5881, "step": 2000 }, { "epoch": 0.2302025782688766, "grad_norm": 3.674407720565796, "learning_rate": 2.6546961325966852e-05, "loss": 0.582, "step": 2500 }, { "epoch": 0.27624309392265195, "grad_norm": 3.3256947994232178, "learning_rate": 2.585635359116022e-05, "loss": 0.577, "step": 3000 }, { "epoch": 0.32228360957642727, "grad_norm": 4.420624732971191, "learning_rate": 2.516574585635359e-05, "loss": 0.5748, "step": 3500 }, { "epoch": 0.3683241252302026, "grad_norm": 5.609716892242432, "learning_rate": 2.4475138121546964e-05, "loss": 0.5532, "step": 4000 }, { "epoch": 0.4143646408839779, "grad_norm": 10.64582633972168, "learning_rate": 2.3784530386740334e-05, "loss": 0.5705, "step": 4500 }, { "epoch": 0.4604051565377532, "grad_norm": 4.55027437210083, "learning_rate": 2.3093922651933703e-05, "loss": 0.5542, "step": 5000 }, { "epoch": 0.5064456721915286, "grad_norm": 4.849785327911377, "learning_rate": 2.2403314917127073e-05, "loss": 0.5561, "step": 5500 }, { "epoch": 0.5524861878453039, "grad_norm": 7.6427154541015625, "learning_rate": 2.1712707182320442e-05, "loss": 0.5427, "step": 6000 }, { "epoch": 0.5985267034990792, "grad_norm": 4.943839073181152, "learning_rate": 2.1022099447513815e-05, "loss": 0.548, "step": 6500 }, { "epoch": 0.6445672191528545, "grad_norm": 3.1242663860321045, "learning_rate": 2.0331491712707185e-05, "loss": 0.5401, "step": 7000 }, { "epoch": 0.6906077348066298, "grad_norm": 4.478808403015137, "learning_rate": 1.9640883977900554e-05, "loss": 0.5444, "step": 7500 }, { "epoch": 0.7366482504604052, "grad_norm": 5.4113054275512695, "learning_rate": 1.8950276243093924e-05, "loss": 0.545, "step": 8000 }, { "epoch": 0.7826887661141805, "grad_norm": 3.117663860321045, "learning_rate": 1.825966850828729e-05, "loss": 0.5268, "step": 8500 }, { "epoch": 0.8287292817679558, "grad_norm": 4.178927421569824, "learning_rate": 1.7569060773480663e-05, "loss": 0.5441, "step": 9000 }, { "epoch": 0.8747697974217311, "grad_norm": 8.61349868774414, "learning_rate": 1.6878453038674033e-05, "loss": 0.5383, "step": 9500 }, { "epoch": 0.9208103130755064, "grad_norm": 5.246815204620361, "learning_rate": 1.6187845303867402e-05, "loss": 0.5386, "step": 10000 }, { "epoch": 0.9668508287292817, "grad_norm": 5.026543140411377, "learning_rate": 1.5497237569060772e-05, "loss": 0.5313, "step": 10500 }, { "epoch": 1.0128913443830572, "grad_norm": 3.600374937057495, "learning_rate": 1.4806629834254145e-05, "loss": 0.5252, "step": 11000 }, { "epoch": 1.0589318600368325, "grad_norm": 3.5238852500915527, "learning_rate": 1.4116022099447514e-05, "loss": 0.4928, "step": 11500 }, { "epoch": 1.1049723756906078, "grad_norm": 4.74073600769043, "learning_rate": 1.3425414364640886e-05, "loss": 0.5087, "step": 12000 }, { "epoch": 1.1510128913443831, "grad_norm": 8.856585502624512, "learning_rate": 1.2734806629834255e-05, "loss": 0.4898, "step": 12500 }, { "epoch": 1.1970534069981584, "grad_norm": 8.898761749267578, "learning_rate": 1.2044198895027623e-05, "loss": 0.5066, "step": 13000 }, { "epoch": 1.2430939226519337, "grad_norm": 5.918560028076172, "learning_rate": 1.1353591160220994e-05, "loss": 0.5149, "step": 13500 }, { "epoch": 1.289134438305709, "grad_norm": 4.796153545379639, "learning_rate": 1.0662983425414364e-05, "loss": 0.5006, "step": 14000 }, { "epoch": 1.3351749539594844, "grad_norm": 5.584662914276123, "learning_rate": 9.972375690607735e-06, "loss": 0.5058, "step": 14500 }, { "epoch": 1.3812154696132597, "grad_norm": 9.456035614013672, "learning_rate": 9.281767955801105e-06, "loss": 0.4885, "step": 15000 }, { "epoch": 1.427255985267035, "grad_norm": 3.6488196849823, "learning_rate": 8.591160220994474e-06, "loss": 0.4914, "step": 15500 }, { "epoch": 1.4732965009208103, "grad_norm": 3.0951404571533203, "learning_rate": 7.900552486187846e-06, "loss": 0.5016, "step": 16000 }, { "epoch": 1.5193370165745856, "grad_norm": 5.87822151184082, "learning_rate": 7.209944751381215e-06, "loss": 0.5117, "step": 16500 }, { "epoch": 1.565377532228361, "grad_norm": 5.016966819763184, "learning_rate": 6.519337016574586e-06, "loss": 0.5134, "step": 17000 }, { "epoch": 1.6114180478821363, "grad_norm": 8.312193870544434, "learning_rate": 5.828729281767956e-06, "loss": 0.4934, "step": 17500 }, { "epoch": 1.6574585635359116, "grad_norm": 5.323014736175537, "learning_rate": 5.1381215469613265e-06, "loss": 0.5025, "step": 18000 }, { "epoch": 1.703499079189687, "grad_norm": 4.238588809967041, "learning_rate": 4.447513812154697e-06, "loss": 0.4948, "step": 18500 }, { "epoch": 1.7495395948434622, "grad_norm": 3.3375260829925537, "learning_rate": 3.756906077348067e-06, "loss": 0.5086, "step": 19000 }, { "epoch": 1.7955801104972375, "grad_norm": 6.620930194854736, "learning_rate": 3.0662983425414365e-06, "loss": 0.4931, "step": 19500 }, { "epoch": 1.8416206261510129, "grad_norm": 4.153687000274658, "learning_rate": 2.375690607734807e-06, "loss": 0.4853, "step": 20000 }, { "epoch": 1.8876611418047882, "grad_norm": 5.683996677398682, "learning_rate": 1.6850828729281769e-06, "loss": 0.492, "step": 20500 }, { "epoch": 1.9337016574585635, "grad_norm": 3.776688575744629, "learning_rate": 9.944751381215469e-07, "loss": 0.4876, "step": 21000 }, { "epoch": 1.979742173112339, "grad_norm": 2.931673526763916, "learning_rate": 3.0386740331491715e-07, "loss": 0.5069, "step": 21500 }, { "epoch": 2.0, "step": 21720, "total_flos": 2.645641233904435e+16, "train_loss": 0.5331895410246172, "train_runtime": 7144.6567, "train_samples_per_second": 36.48, "train_steps_per_second": 3.04 } ], "logging_steps": 500, "max_steps": 21720, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.645641233904435e+16, "train_batch_size": 12, "trial_name": null, "trial_params": null }