{
  "best_global_step": 800,
  "best_metric": 0.8280864357948303,
  "best_model_checkpoint": "/workspace/model/finetuned/checkpoint-800",
  "epoch": 4.848484848484849,
  "eval_steps": 25,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.15151515151515152,
      "grad_norm": 180.98402404785156,
      "learning_rate": 2.7e-06,
      "loss": 30.8939,
      "step": 25
    },
    {
      "epoch": 0.15151515151515152,
      "eval_loss": 2.02256441116333,
      "eval_runtime": 26.9436,
      "eval_samples_per_second": 21.786,
      "eval_steps_per_second": 2.746,
      "step": 25
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 385.3758544921875,
      "learning_rate": 6.45e-06,
      "loss": 28.2375,
      "step": 50
    },
    {
      "epoch": 0.30303030303030304,
      "eval_loss": 1.5823099613189697,
      "eval_runtime": 27.0059,
      "eval_samples_per_second": 21.736,
      "eval_steps_per_second": 2.74,
      "step": 50
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 30.632709503173828,
      "learning_rate": 1.02e-05,
      "loss": 21.1384,
      "step": 75
    },
    {
      "epoch": 0.45454545454545453,
      "eval_loss": 1.216854214668274,
      "eval_runtime": 26.9681,
      "eval_samples_per_second": 21.766,
      "eval_steps_per_second": 2.744,
      "step": 75
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 9.063393592834473,
      "learning_rate": 1.395e-05,
      "loss": 18.6661,
      "step": 100
    },
    {
      "epoch": 0.6060606060606061,
      "eval_loss": 1.1212154626846313,
      "eval_runtime": 26.8008,
      "eval_samples_per_second": 21.902,
      "eval_steps_per_second": 2.761,
      "step": 100
    },
    {
      "epoch": 0.7575757575757576,
      "grad_norm": 10.971136093139648,
      "learning_rate": 1.77e-05,
      "loss": 17.3687,
      "step": 125
    },
    {
      "epoch": 0.7575757575757576,
      "eval_loss": 1.0298739671707153,
      "eval_runtime": 26.8143,
      "eval_samples_per_second": 21.891,
      "eval_steps_per_second": 2.76,
      "step": 125
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 8.818650245666504,
      "learning_rate": 2.145e-05,
      "loss": 15.8605,
      "step": 150
    },
    {
      "epoch": 0.9090909090909091,
      "eval_loss": 0.9787100553512573,
      "eval_runtime": 26.571,
      "eval_samples_per_second": 22.092,
      "eval_steps_per_second": 2.785,
      "step": 150
    },
    {
      "epoch": 1.0606060606060606,
      "grad_norm": 11.700923919677734,
      "learning_rate": 2.52e-05,
      "loss": 15.163,
      "step": 175
    },
    {
      "epoch": 1.0606060606060606,
      "eval_loss": 0.9414308667182922,
      "eval_runtime": 26.626,
      "eval_samples_per_second": 22.046,
      "eval_steps_per_second": 2.779,
      "step": 175
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 10.862292289733887,
      "learning_rate": 2.895e-05,
      "loss": 14.6871,
      "step": 200
    },
    {
      "epoch": 1.2121212121212122,
      "eval_loss": 0.9198422431945801,
      "eval_runtime": 26.7039,
      "eval_samples_per_second": 21.982,
      "eval_steps_per_second": 2.771,
      "step": 200
    },
    {
      "epoch": 1.3636363636363638,
      "grad_norm": 11.315472602844238,
      "learning_rate": 2.9136e-05,
      "loss": 14.8471,
      "step": 225
    },
    {
      "epoch": 1.3636363636363638,
      "eval_loss": 0.9041078686714172,
      "eval_runtime": 26.5947,
      "eval_samples_per_second": 22.072,
      "eval_steps_per_second": 2.783,
      "step": 225
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 12.701277732849121,
      "learning_rate": 2.7936e-05,
      "loss": 14.163,
      "step": 250
    },
    {
      "epoch": 1.5151515151515151,
      "eval_loss": 0.8926578760147095,
      "eval_runtime": 26.7046,
      "eval_samples_per_second": 21.981,
      "eval_steps_per_second": 2.771,
      "step": 250
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 11.456862449645996,
      "learning_rate": 2.6736e-05,
      "loss": 14.0006,
      "step": 275
    },
    {
      "epoch": 1.6666666666666665,
      "eval_loss": 0.8840105533599854,
      "eval_runtime": 26.5642,
      "eval_samples_per_second": 22.097,
      "eval_steps_per_second": 2.786,
      "step": 275
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 11.606865882873535,
      "learning_rate": 2.5536e-05,
      "loss": 14.0325,
      "step": 300
    },
    {
      "epoch": 1.8181818181818183,
      "eval_loss": 0.8764263987541199,
      "eval_runtime": 26.4677,
      "eval_samples_per_second": 22.178,
      "eval_steps_per_second": 2.796,
      "step": 300
    },
    {
      "epoch": 1.9696969696969697,
      "grad_norm": 11.702313423156738,
      "learning_rate": 2.4336000000000002e-05,
      "loss": 14.1239,
      "step": 325
    },
    {
      "epoch": 1.9696969696969697,
      "eval_loss": 0.8708490133285522,
      "eval_runtime": 26.714,
      "eval_samples_per_second": 21.973,
      "eval_steps_per_second": 2.77,
      "step": 325
    },
    {
      "epoch": 2.121212121212121,
      "grad_norm": 13.907278060913086,
      "learning_rate": 2.3136e-05,
      "loss": 13.6706,
      "step": 350
    },
    {
      "epoch": 2.121212121212121,
      "eval_loss": 0.8656915426254272,
      "eval_runtime": 26.7031,
      "eval_samples_per_second": 21.982,
      "eval_steps_per_second": 2.771,
      "step": 350
    },
    {
      "epoch": 2.2727272727272725,
      "grad_norm": 13.098384857177734,
      "learning_rate": 2.1935999999999998e-05,
      "loss": 13.5478,
      "step": 375
    },
    {
      "epoch": 2.2727272727272725,
      "eval_loss": 0.8606927394866943,
      "eval_runtime": 26.6066,
      "eval_samples_per_second": 22.062,
      "eval_steps_per_second": 2.781,
      "step": 375
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 15.584559440612793,
      "learning_rate": 2.0736000000000003e-05,
      "loss": 13.5654,
      "step": 400
    },
    {
      "epoch": 2.4242424242424243,
      "eval_loss": 0.8570966720581055,
      "eval_runtime": 26.7383,
      "eval_samples_per_second": 21.954,
      "eval_steps_per_second": 2.768,
      "step": 400
    },
    {
      "epoch": 2.5757575757575757,
      "grad_norm": 14.500994682312012,
      "learning_rate": 1.9536e-05,
      "loss": 13.4998,
      "step": 425
    },
    {
      "epoch": 2.5757575757575757,
      "eval_loss": 0.8537192940711975,
      "eval_runtime": 26.82,
      "eval_samples_per_second": 21.887,
      "eval_steps_per_second": 2.759,
      "step": 425
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 13.635045051574707,
      "learning_rate": 1.8336e-05,
      "loss": 13.3694,
      "step": 450
    },
    {
      "epoch": 2.7272727272727275,
      "eval_loss": 0.8501807451248169,
      "eval_runtime": 26.631,
      "eval_samples_per_second": 22.042,
      "eval_steps_per_second": 2.779,
      "step": 450
    },
    {
      "epoch": 2.878787878787879,
      "grad_norm": 14.899593353271484,
      "learning_rate": 1.7136000000000003e-05,
      "loss": 13.3274,
      "step": 475
    },
    {
      "epoch": 2.878787878787879,
      "eval_loss": 0.8472868204116821,
      "eval_runtime": 26.7572,
      "eval_samples_per_second": 21.938,
      "eval_steps_per_second": 2.766,
      "step": 475
    },
    {
      "epoch": 3.0303030303030303,
      "grad_norm": 14.57861614227295,
      "learning_rate": 1.5936e-05,
      "loss": 13.1797,
      "step": 500
    },
    {
      "epoch": 3.0303030303030303,
      "eval_loss": 0.8451663255691528,
      "eval_runtime": 26.9575,
      "eval_samples_per_second": 21.775,
      "eval_steps_per_second": 2.745,
      "step": 500
    },
    {
      "epoch": 3.1818181818181817,
      "grad_norm": 15.23614501953125,
      "learning_rate": 1.4736000000000001e-05,
      "loss": 13.221,
      "step": 525
    },
    {
      "epoch": 3.1818181818181817,
      "eval_loss": 0.8429368734359741,
      "eval_runtime": 26.6839,
      "eval_samples_per_second": 21.998,
      "eval_steps_per_second": 2.773,
      "step": 525
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 16.392993927001953,
      "learning_rate": 1.3536e-05,
      "loss": 13.1811,
      "step": 550
    },
    {
      "epoch": 3.3333333333333335,
      "eval_loss": 0.8409376740455627,
      "eval_runtime": 26.569,
      "eval_samples_per_second": 22.093,
      "eval_steps_per_second": 2.785,
      "step": 550
    },
    {
      "epoch": 3.484848484848485,
      "grad_norm": 14.45429515838623,
      "learning_rate": 1.2336e-05,
      "loss": 12.7355,
      "step": 575
    },
    {
      "epoch": 3.484848484848485,
      "eval_loss": 0.8386228084564209,
      "eval_runtime": 26.5967,
      "eval_samples_per_second": 22.07,
      "eval_steps_per_second": 2.782,
      "step": 575
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 15.168094635009766,
      "learning_rate": 1.1136e-05,
      "loss": 13.0834,
      "step": 600
    },
    {
      "epoch": 3.6363636363636362,
      "eval_loss": 0.8364977240562439,
      "eval_runtime": 26.5442,
      "eval_samples_per_second": 22.114,
      "eval_steps_per_second": 2.788,
      "step": 600
    },
    {
      "epoch": 3.787878787878788,
      "grad_norm": 16.040002822875977,
      "learning_rate": 9.936e-06,
      "loss": 13.1575,
      "step": 625
    },
    {
      "epoch": 3.787878787878788,
      "eval_loss": 0.8345832824707031,
      "eval_runtime": 26.5067,
      "eval_samples_per_second": 22.145,
      "eval_steps_per_second": 2.792,
      "step": 625
    },
    {
      "epoch": 3.9393939393939394,
      "grad_norm": 16.534528732299805,
      "learning_rate": 8.736e-06,
      "loss": 12.8282,
      "step": 650
    },
    {
      "epoch": 3.9393939393939394,
      "eval_loss": 0.832955539226532,
      "eval_runtime": 26.6209,
      "eval_samples_per_second": 22.05,
      "eval_steps_per_second": 2.78,
      "step": 650
    },
    {
      "epoch": 4.090909090909091,
      "grad_norm": 15.697587966918945,
      "learning_rate": 7.5359999999999995e-06,
      "loss": 12.707,
      "step": 675
    },
    {
      "epoch": 4.090909090909091,
      "eval_loss": 0.8321042656898499,
      "eval_runtime": 26.6706,
      "eval_samples_per_second": 22.009,
      "eval_steps_per_second": 2.775,
      "step": 675
    },
    {
      "epoch": 4.242424242424242,
      "grad_norm": 16.229135513305664,
      "learning_rate": 6.336e-06,
      "loss": 12.7864,
      "step": 700
    },
    {
      "epoch": 4.242424242424242,
      "eval_loss": 0.8310558795928955,
      "eval_runtime": 26.5267,
      "eval_samples_per_second": 22.129,
      "eval_steps_per_second": 2.79,
      "step": 700
    },
    {
      "epoch": 4.393939393939394,
      "grad_norm": 16.64604377746582,
      "learning_rate": 5.136e-06,
      "loss": 12.937,
      "step": 725
    },
    {
      "epoch": 4.393939393939394,
      "eval_loss": 0.8296888470649719,
      "eval_runtime": 26.5659,
      "eval_samples_per_second": 22.096,
      "eval_steps_per_second": 2.786,
      "step": 725
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 16.23679542541504,
      "learning_rate": 3.936e-06,
      "loss": 12.8632,
      "step": 750
    },
    {
      "epoch": 4.545454545454545,
      "eval_loss": 0.828894317150116,
      "eval_runtime": 26.6204,
      "eval_samples_per_second": 22.051,
      "eval_steps_per_second": 2.78,
      "step": 750
    },
    {
      "epoch": 4.696969696969697,
      "grad_norm": 16.246938705444336,
      "learning_rate": 2.736e-06,
      "loss": 13.111,
      "step": 775
    },
    {
      "epoch": 4.696969696969697,
      "eval_loss": 0.828315794467926,
      "eval_runtime": 26.5355,
      "eval_samples_per_second": 22.121,
      "eval_steps_per_second": 2.789,
      "step": 775
    },
    {
      "epoch": 4.848484848484849,
      "grad_norm": 17.31324577331543,
      "learning_rate": 1.5360000000000002e-06,
      "loss": 13.0132,
      "step": 800
    },
    {
      "epoch": 4.848484848484849,
      "eval_loss": 0.8280864357948303,
      "eval_runtime": 26.4945,
      "eval_samples_per_second": 22.156,
      "eval_steps_per_second": 2.793,
      "step": 800
    }
  ],
  "logging_steps": 25,
  "max_steps": 825,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.596536164371661e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}