{ "best_metric": 0.6774637699127197, "best_model_checkpoint": "output/fine_tuning/checkpoints/Meta-Llama-3.1-8B-Instruct/sft/aixpa-ground-short-docs-checkpoint/checkpoint-340", "epoch": 1.8181818181818183, "eval_steps": 20, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10695187165775401, "grad_norm": 0.2608170211315155, "learning_rate": 1.4819165403057078e-05, "loss": 1.5177, "mean_token_accuracy": 0.6778935924172401, "step": 20 }, { "epoch": 0.10695187165775401, "eval_loss": 1.4139869213104248, "eval_mean_token_accuracy": 0.6904795635037306, "eval_runtime": 1035.6849, "eval_samples_per_second": 0.238, "eval_steps_per_second": 0.119, "step": 20 }, { "epoch": 0.21390374331550802, "grad_norm": 0.2079063504934311, "learning_rate": 1.8247997414535347e-05, "loss": 1.3019, "mean_token_accuracy": 0.7044360123574733, "step": 40 }, { "epoch": 0.21390374331550802, "eval_loss": 1.1822656393051147, "eval_mean_token_accuracy": 0.7299752424402934, "eval_runtime": 1036.3743, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 40 }, { "epoch": 0.32085561497326204, "grad_norm": 0.24613960087299347, "learning_rate": 2e-05, "loss": 1.1518, "mean_token_accuracy": 0.7350467927753925, "step": 60 }, { "epoch": 0.32085561497326204, "eval_loss": 1.0543467998504639, "eval_mean_token_accuracy": 0.7558261015550877, "eval_runtime": 1037.5825, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 60 }, { "epoch": 0.42780748663101603, "grad_norm": 0.21977411210536957, "learning_rate": 2e-05, "loss": 1.0451, "mean_token_accuracy": 0.758122804760933, "step": 80 }, { "epoch": 0.42780748663101603, "eval_loss": 0.9605845808982849, "eval_mean_token_accuracy": 0.7781746368098065, "eval_runtime": 1036.6763, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 80 }, { "epoch": 0.5347593582887701, "grad_norm": 0.29401177167892456, "learning_rate": 2e-05, "loss": 0.9781, "mean_token_accuracy": 0.7737521544098854, "step": 100 }, { "epoch": 0.5347593582887701, "eval_loss": 0.9001632332801819, "eval_mean_token_accuracy": 0.7922655029025504, "eval_runtime": 1037.6159, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 100 }, { "epoch": 0.6417112299465241, "grad_norm": 0.34717148542404175, "learning_rate": 2e-05, "loss": 0.9113, "mean_token_accuracy": 0.7882794156670571, "step": 120 }, { "epoch": 0.6417112299465241, "eval_loss": 0.8677236437797546, "eval_mean_token_accuracy": 0.7979402353123921, "eval_runtime": 1037.0163, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 120 }, { "epoch": 0.7486631016042781, "grad_norm": 0.3498166799545288, "learning_rate": 2e-05, "loss": 0.8725, "mean_token_accuracy": 0.7943845748901367, "step": 140 }, { "epoch": 0.7486631016042781, "eval_loss": 0.8351719379425049, "eval_mean_token_accuracy": 0.8040957174650053, "eval_runtime": 1036.2597, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 140 }, { "epoch": 0.8556149732620321, "grad_norm": 0.3868383467197418, "learning_rate": 2e-05, "loss": 0.8721, "mean_token_accuracy": 0.7931242920458317, "step": 160 }, { "epoch": 0.8556149732620321, "eval_loss": 0.8117150068283081, "eval_mean_token_accuracy": 0.808245500413383, "eval_runtime": 1036.3899, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 160 }, { "epoch": 0.9625668449197861, "grad_norm": 0.5154958367347717, "learning_rate": 2e-05, "loss": 0.83, "mean_token_accuracy": 0.8012300632894039, "step": 180 }, { "epoch": 0.9625668449197861, "eval_loss": 0.7896639108657837, "eval_mean_token_accuracy": 0.8132512133295943, "eval_runtime": 1036.5717, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 180 }, { "epoch": 1.0695187165775402, "grad_norm": 0.42563366889953613, "learning_rate": 2e-05, "loss": 0.8034, "mean_token_accuracy": 0.8062243178486824, "step": 200 }, { "epoch": 1.0695187165775402, "eval_loss": 0.7641515731811523, "eval_mean_token_accuracy": 0.8180098591781244, "eval_runtime": 1037.8429, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 200 }, { "epoch": 1.1764705882352942, "grad_norm": 0.39126402139663696, "learning_rate": 2e-05, "loss": 0.7637, "mean_token_accuracy": 0.8159952461719513, "step": 220 }, { "epoch": 1.1764705882352942, "eval_loss": 0.7506969571113586, "eval_mean_token_accuracy": 0.8211809416127399, "eval_runtime": 1036.9492, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 220 }, { "epoch": 1.2834224598930482, "grad_norm": 0.525314211845398, "learning_rate": 2e-05, "loss": 0.7014, "mean_token_accuracy": 0.8259521864354611, "step": 240 }, { "epoch": 1.2834224598930482, "eval_loss": 0.7359923124313354, "eval_mean_token_accuracy": 0.8241757876504727, "eval_runtime": 1037.3586, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 240 }, { "epoch": 1.3903743315508021, "grad_norm": 1.2710996866226196, "learning_rate": 2e-05, "loss": 0.7084, "mean_token_accuracy": 0.8261168003082275, "step": 260 }, { "epoch": 1.3903743315508021, "eval_loss": 0.7302640676498413, "eval_mean_token_accuracy": 0.8257462154559003, "eval_runtime": 1037.1577, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 260 }, { "epoch": 1.4973262032085561, "grad_norm": 0.5921723246574402, "learning_rate": 2e-05, "loss": 0.6984, "mean_token_accuracy": 0.8257287561893463, "step": 280 }, { "epoch": 1.4973262032085561, "eval_loss": 0.716602087020874, "eval_mean_token_accuracy": 0.8293129685448437, "eval_runtime": 1037.3171, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 280 }, { "epoch": 1.6042780748663101, "grad_norm": 0.6089026927947998, "learning_rate": 2e-05, "loss": 0.6591, "mean_token_accuracy": 0.8372392967343331, "step": 300 }, { "epoch": 1.6042780748663101, "eval_loss": 0.7121440768241882, "eval_mean_token_accuracy": 0.8315709296280775, "eval_runtime": 1038.1176, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.118, "step": 300 }, { "epoch": 1.7112299465240641, "grad_norm": 0.751674473285675, "learning_rate": 2e-05, "loss": 0.6036, "mean_token_accuracy": 0.8498695828020573, "step": 320 }, { "epoch": 1.7112299465240641, "eval_loss": 0.697968602180481, "eval_mean_token_accuracy": 0.8344496600027007, "eval_runtime": 1037.7814, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 320 }, { "epoch": 1.8181818181818183, "grad_norm": 1.045749545097351, "learning_rate": 2e-05, "loss": 0.6099, "mean_token_accuracy": 0.845644561946392, "step": 340 }, { "epoch": 1.8181818181818183, "eval_loss": 0.6774637699127197, "eval_mean_token_accuracy": 0.8406207769866881, "eval_runtime": 1037.5418, "eval_samples_per_second": 0.237, "eval_steps_per_second": 0.119, "step": 340 } ], "logging_steps": 20, "max_steps": 1870, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 20, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.857740816294871e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }