{ "best_metric": 0.646373450756073, "best_model_checkpoint": "knowledge-Distillation/checkpoint-500", "epoch": 2.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.048, "grad_norm": 4.009671688079834, "learning_rate": 4.800000000000001e-06, "loss": 1.1378, "step": 12 }, { "epoch": 0.096, "grad_norm": 1.55279541015625, "learning_rate": 9.600000000000001e-06, "loss": 1.1198, "step": 24 }, { "epoch": 0.144, "grad_norm": 2.4240620136260986, "learning_rate": 1.44e-05, "loss": 1.0868, "step": 36 }, { "epoch": 0.192, "grad_norm": 4.712751388549805, "learning_rate": 1.9200000000000003e-05, "loss": 1.0798, "step": 48 }, { "epoch": 0.24, "grad_norm": 12.531917572021484, "learning_rate": 2.4e-05, "loss": 1.0255, "step": 60 }, { "epoch": 0.288, "grad_norm": 7.460992336273193, "learning_rate": 2.88e-05, "loss": 1.0152, "step": 72 }, { "epoch": 0.336, "grad_norm": 12.766624450683594, "learning_rate": 3.3600000000000004e-05, "loss": 0.8645, "step": 84 }, { "epoch": 0.384, "grad_norm": 8.186197280883789, "learning_rate": 3.8400000000000005e-05, "loss": 0.8678, "step": 96 }, { "epoch": 0.432, "grad_norm": 16.78915786743164, "learning_rate": 4.32e-05, "loss": 0.7689, "step": 108 }, { "epoch": 0.48, "grad_norm": 22.758197784423828, "learning_rate": 4.8e-05, "loss": 0.7054, "step": 120 }, { "epoch": 0.528, "grad_norm": 10.546210289001465, "learning_rate": 4.968888888888889e-05, "loss": 0.8524, "step": 132 }, { "epoch": 0.576, "grad_norm": 9.998793601989746, "learning_rate": 4.915555555555556e-05, "loss": 0.7466, "step": 144 }, { "epoch": 0.624, "grad_norm": 16.722980499267578, "learning_rate": 4.862222222222222e-05, "loss": 0.8522, "step": 156 }, { "epoch": 0.672, "grad_norm": 9.654285430908203, "learning_rate": 4.808888888888889e-05, "loss": 0.748, "step": 168 }, { "epoch": 0.72, "grad_norm": 23.497644424438477, "learning_rate": 4.755555555555556e-05, "loss": 0.8113, "step": 180 }, { "epoch": 0.768, "grad_norm": 29.93133544921875, "learning_rate": 4.702222222222222e-05, "loss": 0.7375, "step": 192 }, { "epoch": 0.816, "grad_norm": 21.097806930541992, "learning_rate": 4.648888888888889e-05, "loss": 0.6061, "step": 204 }, { "epoch": 0.864, "grad_norm": 11.148388862609863, "learning_rate": 4.5955555555555555e-05, "loss": 0.8176, "step": 216 }, { "epoch": 0.912, "grad_norm": 41.14344787597656, "learning_rate": 4.5422222222222225e-05, "loss": 0.7304, "step": 228 }, { "epoch": 0.96, "grad_norm": 11.007186889648438, "learning_rate": 4.4888888888888894e-05, "loss": 0.8662, "step": 240 }, { "epoch": 1.0, "eval_accuracy": 0.7207207207207207, "eval_f1_macro": 0.6400173200358859, "eval_f1_micro": 0.7207207207207207, "eval_f1_weighted": 0.6863741252230461, "eval_loss": 0.8032302260398865, "eval_precision_macro": 0.7771056423671171, "eval_precision_micro": 0.7207207207207207, "eval_precision_weighted": 0.754485589161179, "eval_recall_macro": 0.64472593537768, "eval_recall_micro": 0.7207207207207207, "eval_recall_weighted": 0.7207207207207207, "eval_runtime": 162.5414, "eval_samples_per_second": 6.146, "eval_steps_per_second": 0.197, "step": 250 }, { "epoch": 1.008, "grad_norm": 8.793386459350586, "learning_rate": 4.435555555555556e-05, "loss": 0.6726, "step": 252 }, { "epoch": 1.056, "grad_norm": 8.431066513061523, "learning_rate": 4.3822222222222227e-05, "loss": 0.728, "step": 264 }, { "epoch": 1.104, "grad_norm": 13.071714401245117, "learning_rate": 4.328888888888889e-05, "loss": 0.5935, "step": 276 }, { "epoch": 1.152, "grad_norm": 11.652353286743164, "learning_rate": 4.275555555555556e-05, "loss": 0.5623, "step": 288 }, { "epoch": 1.2, "grad_norm": 15.519989013671875, "learning_rate": 4.222222222222222e-05, "loss": 0.5258, "step": 300 }, { "epoch": 1.248, "grad_norm": 12.270814895629883, "learning_rate": 4.168888888888889e-05, "loss": 0.5467, "step": 312 }, { "epoch": 1.296, "grad_norm": 45.41197967529297, "learning_rate": 4.115555555555556e-05, "loss": 0.687, "step": 324 }, { "epoch": 1.3439999999999999, "grad_norm": 18.447298049926758, "learning_rate": 4.062222222222222e-05, "loss": 0.7523, "step": 336 }, { "epoch": 1.392, "grad_norm": 16.430402755737305, "learning_rate": 4.008888888888889e-05, "loss": 0.5786, "step": 348 }, { "epoch": 1.44, "grad_norm": 13.083454132080078, "learning_rate": 3.9555555555555556e-05, "loss": 0.4946, "step": 360 }, { "epoch": 1.488, "grad_norm": 17.238567352294922, "learning_rate": 3.9022222222222225e-05, "loss": 0.6058, "step": 372 }, { "epoch": 1.536, "grad_norm": 18.45892906188965, "learning_rate": 3.848888888888889e-05, "loss": 0.5246, "step": 384 }, { "epoch": 1.584, "grad_norm": 23.937816619873047, "learning_rate": 3.795555555555556e-05, "loss": 0.6662, "step": 396 }, { "epoch": 1.6320000000000001, "grad_norm": 17.260282516479492, "learning_rate": 3.742222222222223e-05, "loss": 0.6118, "step": 408 }, { "epoch": 1.6800000000000002, "grad_norm": 15.230131149291992, "learning_rate": 3.688888888888889e-05, "loss": 0.6536, "step": 420 }, { "epoch": 1.728, "grad_norm": 12.828449249267578, "learning_rate": 3.635555555555556e-05, "loss": 0.6201, "step": 432 }, { "epoch": 1.776, "grad_norm": 20.50193214416504, "learning_rate": 3.582222222222222e-05, "loss": 0.5495, "step": 444 }, { "epoch": 1.8239999999999998, "grad_norm": 15.018689155578613, "learning_rate": 3.528888888888889e-05, "loss": 0.4981, "step": 456 }, { "epoch": 1.8719999999999999, "grad_norm": 20.459606170654297, "learning_rate": 3.475555555555556e-05, "loss": 0.5907, "step": 468 }, { "epoch": 1.92, "grad_norm": 14.490087509155273, "learning_rate": 3.4222222222222224e-05, "loss": 0.519, "step": 480 }, { "epoch": 1.968, "grad_norm": 23.004623413085938, "learning_rate": 3.368888888888889e-05, "loss": 0.5737, "step": 492 }, { "epoch": 2.0, "eval_accuracy": 0.7447447447447447, "eval_f1_macro": 0.7298437618326189, "eval_f1_micro": 0.7447447447447447, "eval_f1_weighted": 0.7467396946425192, "eval_loss": 0.646373450756073, "eval_precision_macro": 0.7348636077090247, "eval_precision_micro": 0.7447447447447447, "eval_precision_weighted": 0.7644157119297172, "eval_recall_macro": 0.7382166799546654, "eval_recall_micro": 0.7447447447447447, "eval_recall_weighted": 0.7447447447447447, "eval_runtime": 163.7809, "eval_samples_per_second": 6.1, "eval_steps_per_second": 0.195, "step": 500 } ], "logging_steps": 12, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 525305938493952.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }