{ "best_global_step": 800, "best_metric": 0.8280864357948303, "best_model_checkpoint": "/workspace/model/finetuned/checkpoint-800", "epoch": 4.848484848484849, "eval_steps": 25, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 180.98402404785156, "learning_rate": 2.7e-06, "loss": 30.8939, "step": 25 }, { "epoch": 0.15151515151515152, "eval_loss": 2.02256441116333, "eval_runtime": 26.9436, "eval_samples_per_second": 21.786, "eval_steps_per_second": 2.746, "step": 25 }, { "epoch": 0.30303030303030304, "grad_norm": 385.3758544921875, "learning_rate": 6.45e-06, "loss": 28.2375, "step": 50 }, { "epoch": 0.30303030303030304, "eval_loss": 1.5823099613189697, "eval_runtime": 27.0059, "eval_samples_per_second": 21.736, "eval_steps_per_second": 2.74, "step": 50 }, { "epoch": 0.45454545454545453, "grad_norm": 30.632709503173828, "learning_rate": 1.02e-05, "loss": 21.1384, "step": 75 }, { "epoch": 0.45454545454545453, "eval_loss": 1.216854214668274, "eval_runtime": 26.9681, "eval_samples_per_second": 21.766, "eval_steps_per_second": 2.744, "step": 75 }, { "epoch": 0.6060606060606061, "grad_norm": 9.063393592834473, "learning_rate": 1.395e-05, "loss": 18.6661, "step": 100 }, { "epoch": 0.6060606060606061, "eval_loss": 1.1212154626846313, "eval_runtime": 26.8008, "eval_samples_per_second": 21.902, "eval_steps_per_second": 2.761, "step": 100 }, { "epoch": 0.7575757575757576, "grad_norm": 10.971136093139648, "learning_rate": 1.77e-05, "loss": 17.3687, "step": 125 }, { "epoch": 0.7575757575757576, "eval_loss": 1.0298739671707153, "eval_runtime": 26.8143, "eval_samples_per_second": 21.891, "eval_steps_per_second": 2.76, "step": 125 }, { "epoch": 0.9090909090909091, "grad_norm": 8.818650245666504, "learning_rate": 2.145e-05, "loss": 15.8605, "step": 150 }, { "epoch": 0.9090909090909091, "eval_loss": 0.9787100553512573, "eval_runtime": 26.571, "eval_samples_per_second": 22.092, "eval_steps_per_second": 2.785, "step": 150 }, { "epoch": 1.0606060606060606, "grad_norm": 11.700923919677734, "learning_rate": 2.52e-05, "loss": 15.163, "step": 175 }, { "epoch": 1.0606060606060606, "eval_loss": 0.9414308667182922, "eval_runtime": 26.626, "eval_samples_per_second": 22.046, "eval_steps_per_second": 2.779, "step": 175 }, { "epoch": 1.2121212121212122, "grad_norm": 10.862292289733887, "learning_rate": 2.895e-05, "loss": 14.6871, "step": 200 }, { "epoch": 1.2121212121212122, "eval_loss": 0.9198422431945801, "eval_runtime": 26.7039, "eval_samples_per_second": 21.982, "eval_steps_per_second": 2.771, "step": 200 }, { "epoch": 1.3636363636363638, "grad_norm": 11.315472602844238, "learning_rate": 2.9136e-05, "loss": 14.8471, "step": 225 }, { "epoch": 1.3636363636363638, "eval_loss": 0.9041078686714172, "eval_runtime": 26.5947, "eval_samples_per_second": 22.072, "eval_steps_per_second": 2.783, "step": 225 }, { "epoch": 1.5151515151515151, "grad_norm": 12.701277732849121, "learning_rate": 2.7936e-05, "loss": 14.163, "step": 250 }, { "epoch": 1.5151515151515151, "eval_loss": 0.8926578760147095, "eval_runtime": 26.7046, "eval_samples_per_second": 21.981, "eval_steps_per_second": 2.771, "step": 250 }, { "epoch": 1.6666666666666665, "grad_norm": 11.456862449645996, "learning_rate": 2.6736e-05, "loss": 14.0006, "step": 275 }, { "epoch": 1.6666666666666665, "eval_loss": 0.8840105533599854, "eval_runtime": 26.5642, "eval_samples_per_second": 22.097, "eval_steps_per_second": 2.786, "step": 275 }, { "epoch": 1.8181818181818183, "grad_norm": 11.606865882873535, "learning_rate": 2.5536e-05, "loss": 14.0325, "step": 300 }, { "epoch": 1.8181818181818183, "eval_loss": 0.8764263987541199, "eval_runtime": 26.4677, "eval_samples_per_second": 22.178, "eval_steps_per_second": 2.796, "step": 300 }, { "epoch": 1.9696969696969697, "grad_norm": 11.702313423156738, "learning_rate": 2.4336000000000002e-05, "loss": 14.1239, "step": 325 }, { "epoch": 1.9696969696969697, "eval_loss": 0.8708490133285522, "eval_runtime": 26.714, "eval_samples_per_second": 21.973, "eval_steps_per_second": 2.77, "step": 325 }, { "epoch": 2.121212121212121, "grad_norm": 13.907278060913086, "learning_rate": 2.3136e-05, "loss": 13.6706, "step": 350 }, { "epoch": 2.121212121212121, "eval_loss": 0.8656915426254272, "eval_runtime": 26.7031, "eval_samples_per_second": 21.982, "eval_steps_per_second": 2.771, "step": 350 }, { "epoch": 2.2727272727272725, "grad_norm": 13.098384857177734, "learning_rate": 2.1935999999999998e-05, "loss": 13.5478, "step": 375 }, { "epoch": 2.2727272727272725, "eval_loss": 0.8606927394866943, "eval_runtime": 26.6066, "eval_samples_per_second": 22.062, "eval_steps_per_second": 2.781, "step": 375 }, { "epoch": 2.4242424242424243, "grad_norm": 15.584559440612793, "learning_rate": 2.0736000000000003e-05, "loss": 13.5654, "step": 400 }, { "epoch": 2.4242424242424243, "eval_loss": 0.8570966720581055, "eval_runtime": 26.7383, "eval_samples_per_second": 21.954, "eval_steps_per_second": 2.768, "step": 400 }, { "epoch": 2.5757575757575757, "grad_norm": 14.500994682312012, "learning_rate": 1.9536e-05, "loss": 13.4998, "step": 425 }, { "epoch": 2.5757575757575757, "eval_loss": 0.8537192940711975, "eval_runtime": 26.82, "eval_samples_per_second": 21.887, "eval_steps_per_second": 2.759, "step": 425 }, { "epoch": 2.7272727272727275, "grad_norm": 13.635045051574707, "learning_rate": 1.8336e-05, "loss": 13.3694, "step": 450 }, { "epoch": 2.7272727272727275, "eval_loss": 0.8501807451248169, "eval_runtime": 26.631, "eval_samples_per_second": 22.042, "eval_steps_per_second": 2.779, "step": 450 }, { "epoch": 2.878787878787879, "grad_norm": 14.899593353271484, "learning_rate": 1.7136000000000003e-05, "loss": 13.3274, "step": 475 }, { "epoch": 2.878787878787879, "eval_loss": 0.8472868204116821, "eval_runtime": 26.7572, "eval_samples_per_second": 21.938, "eval_steps_per_second": 2.766, "step": 475 }, { "epoch": 3.0303030303030303, "grad_norm": 14.57861614227295, "learning_rate": 1.5936e-05, "loss": 13.1797, "step": 500 }, { "epoch": 3.0303030303030303, "eval_loss": 0.8451663255691528, "eval_runtime": 26.9575, "eval_samples_per_second": 21.775, "eval_steps_per_second": 2.745, "step": 500 }, { "epoch": 3.1818181818181817, "grad_norm": 15.23614501953125, "learning_rate": 1.4736000000000001e-05, "loss": 13.221, "step": 525 }, { "epoch": 3.1818181818181817, "eval_loss": 0.8429368734359741, "eval_runtime": 26.6839, "eval_samples_per_second": 21.998, "eval_steps_per_second": 2.773, "step": 525 }, { "epoch": 3.3333333333333335, "grad_norm": 16.392993927001953, "learning_rate": 1.3536e-05, "loss": 13.1811, "step": 550 }, { "epoch": 3.3333333333333335, "eval_loss": 0.8409376740455627, "eval_runtime": 26.569, "eval_samples_per_second": 22.093, "eval_steps_per_second": 2.785, "step": 550 }, { "epoch": 3.484848484848485, "grad_norm": 14.45429515838623, "learning_rate": 1.2336e-05, "loss": 12.7355, "step": 575 }, { "epoch": 3.484848484848485, "eval_loss": 0.8386228084564209, "eval_runtime": 26.5967, "eval_samples_per_second": 22.07, "eval_steps_per_second": 2.782, "step": 575 }, { "epoch": 3.6363636363636362, "grad_norm": 15.168094635009766, "learning_rate": 1.1136e-05, "loss": 13.0834, "step": 600 }, { "epoch": 3.6363636363636362, "eval_loss": 0.8364977240562439, "eval_runtime": 26.5442, "eval_samples_per_second": 22.114, "eval_steps_per_second": 2.788, "step": 600 }, { "epoch": 3.787878787878788, "grad_norm": 16.040002822875977, "learning_rate": 9.936e-06, "loss": 13.1575, "step": 625 }, { "epoch": 3.787878787878788, "eval_loss": 0.8345832824707031, "eval_runtime": 26.5067, "eval_samples_per_second": 22.145, "eval_steps_per_second": 2.792, "step": 625 }, { "epoch": 3.9393939393939394, "grad_norm": 16.534528732299805, "learning_rate": 8.736e-06, "loss": 12.8282, "step": 650 }, { "epoch": 3.9393939393939394, "eval_loss": 0.832955539226532, "eval_runtime": 26.6209, "eval_samples_per_second": 22.05, "eval_steps_per_second": 2.78, "step": 650 }, { "epoch": 4.090909090909091, "grad_norm": 15.697587966918945, "learning_rate": 7.5359999999999995e-06, "loss": 12.707, "step": 675 }, { "epoch": 4.090909090909091, "eval_loss": 0.8321042656898499, "eval_runtime": 26.6706, "eval_samples_per_second": 22.009, "eval_steps_per_second": 2.775, "step": 675 }, { "epoch": 4.242424242424242, "grad_norm": 16.229135513305664, "learning_rate": 6.336e-06, "loss": 12.7864, "step": 700 }, { "epoch": 4.242424242424242, "eval_loss": 0.8310558795928955, "eval_runtime": 26.5267, "eval_samples_per_second": 22.129, "eval_steps_per_second": 2.79, "step": 700 }, { "epoch": 4.393939393939394, "grad_norm": 16.64604377746582, "learning_rate": 5.136e-06, "loss": 12.937, "step": 725 }, { "epoch": 4.393939393939394, "eval_loss": 0.8296888470649719, "eval_runtime": 26.5659, "eval_samples_per_second": 22.096, "eval_steps_per_second": 2.786, "step": 725 }, { "epoch": 4.545454545454545, "grad_norm": 16.23679542541504, "learning_rate": 3.936e-06, "loss": 12.8632, "step": 750 }, { "epoch": 4.545454545454545, "eval_loss": 0.828894317150116, "eval_runtime": 26.6204, "eval_samples_per_second": 22.051, "eval_steps_per_second": 2.78, "step": 750 }, { "epoch": 4.696969696969697, "grad_norm": 16.246938705444336, "learning_rate": 2.736e-06, "loss": 13.111, "step": 775 }, { "epoch": 4.696969696969697, "eval_loss": 0.828315794467926, "eval_runtime": 26.5355, "eval_samples_per_second": 22.121, "eval_steps_per_second": 2.789, "step": 775 }, { "epoch": 4.848484848484849, "grad_norm": 17.31324577331543, "learning_rate": 1.5360000000000002e-06, "loss": 13.0132, "step": 800 }, { "epoch": 4.848484848484849, "eval_loss": 0.8280864357948303, "eval_runtime": 26.4945, "eval_samples_per_second": 22.156, "eval_steps_per_second": 2.793, "step": 800 } ], "logging_steps": 25, "max_steps": 825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.596536164371661e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }