{ "best_global_step": 11655, "best_metric": 0.06289209425449371, "best_model_checkpoint": "./results_albert_punctuation_casing/checkpoint-11655", "epoch": 3.0, "eval_steps": 500, "global_step": 11655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1287001287001287, "grad_norm": 4.2918195724487305, "learning_rate": 1.9143715143715144e-05, "loss": 0.2105482635498047, "step": 500 }, { "epoch": 0.2574002574002574, "grad_norm": 3.180814504623413, "learning_rate": 1.8285714285714288e-05, "loss": 0.09811102294921875, "step": 1000 }, { "epoch": 0.3861003861003861, "grad_norm": 4.069736480712891, "learning_rate": 1.742771342771343e-05, "loss": 0.08830223083496094, "step": 1500 }, { "epoch": 0.5148005148005148, "grad_norm": 2.3238089084625244, "learning_rate": 1.656971256971257e-05, "loss": 0.07262681579589844, "step": 2000 }, { "epoch": 0.6435006435006435, "grad_norm": 0.725167989730835, "learning_rate": 1.5711711711711713e-05, "loss": 0.07961682891845703, "step": 2500 }, { "epoch": 0.7722007722007722, "grad_norm": 13.671622276306152, "learning_rate": 1.4853710853710854e-05, "loss": 0.06993846893310547, "step": 3000 }, { "epoch": 0.9009009009009009, "grad_norm": 2.6098945140838623, "learning_rate": 1.3995709995709996e-05, "loss": 0.07217549133300781, "step": 3500 }, { "epoch": 1.0, "eval_casing_accuracy": 0.6387912059001499, "eval_loss": 0.07048454880714417, "eval_overall_accuracy": 0.6404223412931571, "eval_punctuation_accuracy": 0.6420534766861643, "eval_runtime": 86.2448, "eval_samples_per_second": 180.162, "eval_steps_per_second": 11.27, "step": 3885 }, { "epoch": 1.0296010296010296, "grad_norm": 1.678989052772522, "learning_rate": 1.3137709137709139e-05, "loss": 0.05899927520751953, "step": 4000 }, { "epoch": 1.1583011583011582, "grad_norm": 5.855215549468994, "learning_rate": 1.2279708279708281e-05, "loss": 0.05248377227783203, "step": 4500 }, { "epoch": 1.287001287001287, "grad_norm": 20.59808921813965, "learning_rate": 1.1421707421707422e-05, "loss": 0.05537939834594727, "step": 5000 }, { "epoch": 1.4157014157014158, "grad_norm": 3.922346830368042, "learning_rate": 1.0563706563706564e-05, "loss": 0.05087580490112305, "step": 5500 }, { "epoch": 1.5444015444015444, "grad_norm": 0.129458948969841, "learning_rate": 9.705705705705706e-06, "loss": 0.0524902229309082, "step": 6000 }, { "epoch": 1.673101673101673, "grad_norm": 0.10066387057304382, "learning_rate": 8.847704847704849e-06, "loss": 0.04880419921875, "step": 6500 }, { "epoch": 1.8018018018018018, "grad_norm": 1.1645872592926025, "learning_rate": 7.989703989703991e-06, "loss": 0.04735799407958984, "step": 7000 }, { "epoch": 1.9305019305019306, "grad_norm": 1.6507762670516968, "learning_rate": 7.1317031317031325e-06, "loss": 0.05284581756591797, "step": 7500 }, { "epoch": 2.0, "eval_casing_accuracy": 0.6404749585638992, "eval_loss": 0.06381073594093323, "eval_overall_accuracy": 0.6414089151195728, "eval_punctuation_accuracy": 0.6423428716752462, "eval_runtime": 85.5043, "eval_samples_per_second": 181.722, "eval_steps_per_second": 11.368, "step": 7770 }, { "epoch": 2.0592020592020592, "grad_norm": 0.3554779887199402, "learning_rate": 6.273702273702275e-06, "loss": 0.042236793518066404, "step": 8000 }, { "epoch": 2.187902187902188, "grad_norm": 0.3045165240764618, "learning_rate": 5.415701415701416e-06, "loss": 0.03922730255126953, "step": 8500 }, { "epoch": 2.3166023166023164, "grad_norm": 1.6675119400024414, "learning_rate": 4.557700557700558e-06, "loss": 0.034516990661621096, "step": 9000 }, { "epoch": 2.4453024453024454, "grad_norm": 1.5336593389511108, "learning_rate": 3.6996996996997e-06, "loss": 0.03220732116699219, "step": 9500 }, { "epoch": 2.574002574002574, "grad_norm": 2.9998581409454346, "learning_rate": 2.8416988416988417e-06, "loss": 0.033812404632568356, "step": 10000 }, { "epoch": 2.7027027027027026, "grad_norm": 3.3144118785858154, "learning_rate": 1.9836979836979837e-06, "loss": 0.03330759048461914, "step": 10500 }, { "epoch": 2.8314028314028317, "grad_norm": 2.0357117652893066, "learning_rate": 1.1256971256971258e-06, "loss": 0.03533472442626953, "step": 11000 }, { "epoch": 2.9601029601029603, "grad_norm": 3.5177860260009766, "learning_rate": 2.676962676962677e-07, "loss": 0.0314073543548584, "step": 11500 }, { "epoch": 3.0, "eval_casing_accuracy": 0.6400978681235804, "eval_loss": 0.06289209425449371, "eval_overall_accuracy": 0.6402776437986162, "eval_punctuation_accuracy": 0.6404574194736519, "eval_runtime": 85.5898, "eval_samples_per_second": 181.54, "eval_steps_per_second": 11.356, "step": 11655 } ], "logging_steps": 500, "max_steps": 11655, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }