{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 3.0676674842834473, "learning_rate": 1.9520000000000003e-05, "loss": 1.1671, "step": 10 }, { "epoch": 0.16, "grad_norm": 2.477043628692627, "learning_rate": 1.898666666666667e-05, "loss": 0.737, "step": 20 }, { "epoch": 0.24, "grad_norm": 2.6543428897857666, "learning_rate": 1.8453333333333335e-05, "loss": 0.5566, "step": 30 }, { "epoch": 0.32, "grad_norm": 6.639544486999512, "learning_rate": 1.792e-05, "loss": 0.4373, "step": 40 }, { "epoch": 0.4, "grad_norm": 2.2970216274261475, "learning_rate": 1.7386666666666667e-05, "loss": 0.3546, "step": 50 }, { "epoch": 0.48, "grad_norm": 5.090140342712402, "learning_rate": 1.6853333333333333e-05, "loss": 0.3553, "step": 60 }, { "epoch": 0.56, "grad_norm": 2.696153163909912, "learning_rate": 1.632e-05, "loss": 0.3246, "step": 70 }, { "epoch": 0.64, "grad_norm": 5.513143062591553, "learning_rate": 1.578666666666667e-05, "loss": 0.3048, "step": 80 }, { "epoch": 0.72, "grad_norm": 5.173070430755615, "learning_rate": 1.5253333333333335e-05, "loss": 0.2941, "step": 90 }, { "epoch": 0.8, "grad_norm": 4.507210731506348, "learning_rate": 1.4720000000000001e-05, "loss": 0.293, "step": 100 }, { "epoch": 0.88, "grad_norm": 4.429561138153076, "learning_rate": 1.418666666666667e-05, "loss": 0.283, "step": 110 }, { "epoch": 0.96, "grad_norm": 5.397491455078125, "learning_rate": 1.3653333333333334e-05, "loss": 0.2456, "step": 120 }, { "epoch": 1.0, "eval_accuracy": 0.8911995942176009, "eval_f1": 0.8068669527896994, "eval_loss": 0.2935342788696289, "eval_precision": 0.7782377291543465, "eval_recall": 0.8376830044557607, "eval_runtime": 1.1986, "eval_samples_per_second": 125.141, "eval_steps_per_second": 8.343, "step": 125 }, { "epoch": 1.04, "grad_norm": 4.7523274421691895, "learning_rate": 1.3120000000000001e-05, "loss": 0.27, "step": 130 }, { "epoch": 1.12, "grad_norm": 5.5811262130737305, "learning_rate": 1.2586666666666668e-05, "loss": 0.2433, "step": 140 }, { "epoch": 1.2, "grad_norm": 4.444863319396973, "learning_rate": 1.2053333333333335e-05, "loss": 0.2797, "step": 150 }, { "epoch": 1.28, "grad_norm": 3.8088574409484863, "learning_rate": 1.152e-05, "loss": 0.2457, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 3.965343952178955, "learning_rate": 1.0986666666666668e-05, "loss": 0.2554, "step": 170 }, { "epoch": 1.44, "grad_norm": 5.77710485458374, "learning_rate": 1.0453333333333334e-05, "loss": 0.2369, "step": 180 }, { "epoch": 1.52, "grad_norm": 4.24776029586792, "learning_rate": 9.920000000000002e-06, "loss": 0.2475, "step": 190 }, { "epoch": 1.6, "grad_norm": 6.790882110595703, "learning_rate": 9.386666666666668e-06, "loss": 0.2885, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 2.6119062900543213, "learning_rate": 8.853333333333334e-06, "loss": 0.2386, "step": 210 }, { "epoch": 1.76, "grad_norm": 2.789976119995117, "learning_rate": 8.32e-06, "loss": 0.2163, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 2.7082512378692627, "learning_rate": 7.786666666666666e-06, "loss": 0.2211, "step": 230 }, { "epoch": 1.92, "grad_norm": 4.595925807952881, "learning_rate": 7.253333333333335e-06, "loss": 0.2195, "step": 240 }, { "epoch": 2.0, "grad_norm": 3.239081621170044, "learning_rate": 6.720000000000001e-06, "loss": 0.2221, "step": 250 }, { "epoch": 2.0, "eval_accuracy": 0.8984275932031448, "eval_f1": 0.8320817062180834, "eval_loss": 0.3072827160358429, "eval_precision": 0.7878270762229806, "eval_recall": 0.8816040738383195, "eval_runtime": 1.1612, "eval_samples_per_second": 129.177, "eval_steps_per_second": 8.612, "step": 250 }, { "epoch": 2.08, "grad_norm": 3.049429178237915, "learning_rate": 6.186666666666668e-06, "loss": 0.1524, "step": 260 }, { "epoch": 2.16, "grad_norm": 3.8909575939178467, "learning_rate": 5.653333333333334e-06, "loss": 0.2572, "step": 270 }, { "epoch": 2.24, "grad_norm": 4.024440288543701, "learning_rate": 5.12e-06, "loss": 0.2083, "step": 280 }, { "epoch": 2.32, "grad_norm": 2.7988221645355225, "learning_rate": 4.586666666666667e-06, "loss": 0.1893, "step": 290 }, { "epoch": 2.4, "grad_norm": 5.445796966552734, "learning_rate": 4.053333333333333e-06, "loss": 0.1918, "step": 300 }, { "epoch": 2.48, "grad_norm": 2.8356993198394775, "learning_rate": 3.52e-06, "loss": 0.2006, "step": 310 }, { "epoch": 2.56, "grad_norm": 2.6401376724243164, "learning_rate": 2.986666666666667e-06, "loss": 0.1883, "step": 320 }, { "epoch": 2.64, "grad_norm": 3.2241694927215576, "learning_rate": 2.4533333333333333e-06, "loss": 0.2185, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 1.8776860237121582, "learning_rate": 1.9200000000000003e-06, "loss": 0.1718, "step": 340 }, { "epoch": 2.8, "grad_norm": 1.8782577514648438, "learning_rate": 1.3866666666666668e-06, "loss": 0.2874, "step": 350 }, { "epoch": 2.88, "grad_norm": 3.1951863765716553, "learning_rate": 8.533333333333334e-07, "loss": 0.2036, "step": 360 }, { "epoch": 2.96, "grad_norm": 2.771134853363037, "learning_rate": 3.2e-07, "loss": 0.2104, "step": 370 } ], "logging_steps": 10, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 473453787413376.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }