{
  "best_global_step": 16000,
  "best_metric": 0.042671315371990204,
  "best_model_checkpoint": "./training_output/checkpoint-16000",
  "epoch": 0.95,
  "eval_steps": 1000,
  "global_step": 19000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025,
      "grad_norm": 0.8834348917007446,
      "learning_rate": 1.9501000000000002e-05,
      "loss": 0.1437,
      "step": 500
    },
    {
      "epoch": 0.05,
      "grad_norm": 9.513919830322266,
      "learning_rate": 1.9001e-05,
      "loss": 0.1085,
      "step": 1000
    },
    {
      "epoch": 0.05,
      "eval_accuracy": 0.91835,
      "eval_loss": 0.10802757740020752,
      "eval_runtime": 381.1922,
      "eval_samples_per_second": 52.467,
      "eval_steps_per_second": 3.279,
      "step": 1000
    },
    {
      "epoch": 0.075,
      "grad_norm": 0.013394818641245365,
      "learning_rate": 1.8501e-05,
      "loss": 0.0965,
      "step": 1500
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.017180055379867554,
      "learning_rate": 1.8001000000000003e-05,
      "loss": 0.0716,
      "step": 2000
    },
    {
      "epoch": 0.1,
      "eval_accuracy": 0.8473,
      "eval_loss": 0.25237375497817993,
      "eval_runtime": 381.1616,
      "eval_samples_per_second": 52.471,
      "eval_steps_per_second": 3.279,
      "step": 2000
    },
    {
      "epoch": 0.125,
      "grad_norm": 0.017619747668504715,
      "learning_rate": 1.7501e-05,
      "loss": 0.0658,
      "step": 2500
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.527113676071167,
      "learning_rate": 1.7001000000000002e-05,
      "loss": 0.0615,
      "step": 3000
    },
    {
      "epoch": 0.15,
      "eval_accuracy": 0.9299,
      "eval_loss": 0.11818733811378479,
      "eval_runtime": 381.4889,
      "eval_samples_per_second": 52.426,
      "eval_steps_per_second": 3.277,
      "step": 3000
    },
    {
      "epoch": 0.175,
      "grad_norm": 140.78282165527344,
      "learning_rate": 1.6501e-05,
      "loss": 0.056,
      "step": 3500
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.9989501237869263,
      "learning_rate": 1.6001e-05,
      "loss": 0.0648,
      "step": 4000
    },
    {
      "epoch": 0.2,
      "eval_accuracy": 0.9498,
      "eval_loss": 0.07567641884088516,
      "eval_runtime": 380.8034,
      "eval_samples_per_second": 52.521,
      "eval_steps_per_second": 3.283,
      "step": 4000
    },
    {
      "epoch": 0.225,
      "grad_norm": 0.01812303625047207,
      "learning_rate": 1.5501000000000003e-05,
      "loss": 0.0487,
      "step": 4500
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.05552659556269646,
      "learning_rate": 1.5001000000000001e-05,
      "loss": 0.0522,
      "step": 5000
    },
    {
      "epoch": 0.25,
      "eval_accuracy": 0.92725,
      "eval_loss": 0.12006673216819763,
      "eval_runtime": 380.6188,
      "eval_samples_per_second": 52.546,
      "eval_steps_per_second": 3.284,
      "step": 5000
    },
    {
      "epoch": 0.275,
      "grad_norm": 0.14319103956222534,
      "learning_rate": 1.4501e-05,
      "loss": 0.0554,
      "step": 5500
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.012562028132379055,
      "learning_rate": 1.4001e-05,
      "loss": 0.0377,
      "step": 6000
    },
    {
      "epoch": 0.3,
      "eval_accuracy": 0.95545,
      "eval_loss": 0.08464282751083374,
      "eval_runtime": 380.6212,
      "eval_samples_per_second": 52.546,
      "eval_steps_per_second": 3.284,
      "step": 6000
    },
    {
      "epoch": 0.325,
      "grad_norm": 0.0012805273290723562,
      "learning_rate": 1.3501000000000002e-05,
      "loss": 0.0327,
      "step": 6500
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.024555200710892677,
      "learning_rate": 1.3001000000000001e-05,
      "loss": 0.0447,
      "step": 7000
    },
    {
      "epoch": 0.35,
      "eval_accuracy": 0.93225,
      "eval_loss": 0.10355959832668304,
      "eval_runtime": 380.8232,
      "eval_samples_per_second": 52.518,
      "eval_steps_per_second": 3.282,
      "step": 7000
    },
    {
      "epoch": 0.375,
      "grad_norm": 0.10476179420948029,
      "learning_rate": 1.2501000000000001e-05,
      "loss": 0.0379,
      "step": 7500
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.16991844773292542,
      "learning_rate": 1.2001e-05,
      "loss": 0.0421,
      "step": 8000
    },
    {
      "epoch": 0.4,
      "eval_accuracy": 0.89145,
      "eval_loss": 0.1804238110780716,
      "eval_runtime": 380.9125,
      "eval_samples_per_second": 52.505,
      "eval_steps_per_second": 3.282,
      "step": 8000
    },
    {
      "epoch": 0.425,
      "grad_norm": 0.017685526981949806,
      "learning_rate": 1.1501e-05,
      "loss": 0.0384,
      "step": 8500
    },
    {
      "epoch": 0.45,
      "grad_norm": 16.736478805541992,
      "learning_rate": 1.1001000000000002e-05,
      "loss": 0.0364,
      "step": 9000
    },
    {
      "epoch": 0.45,
      "eval_accuracy": 0.96285,
      "eval_loss": 0.04935265704989433,
      "eval_runtime": 380.5923,
      "eval_samples_per_second": 52.55,
      "eval_steps_per_second": 3.284,
      "step": 9000
    },
    {
      "epoch": 0.475,
      "grad_norm": 0.08418703079223633,
      "learning_rate": 1.0501000000000002e-05,
      "loss": 0.0262,
      "step": 9500
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.19593098759651184,
      "learning_rate": 1.0001000000000001e-05,
      "loss": 0.0301,
      "step": 10000
    },
    {
      "epoch": 0.5,
      "eval_accuracy": 0.9689,
      "eval_loss": 0.05831901729106903,
      "eval_runtime": 380.4459,
      "eval_samples_per_second": 52.57,
      "eval_steps_per_second": 3.286,
      "step": 10000
    },
    {
      "epoch": 0.525,
      "grad_norm": 0.005820299498736858,
      "learning_rate": 9.501000000000001e-06,
      "loss": 0.0359,
      "step": 10500
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.010061199776828289,
      "learning_rate": 9.001e-06,
      "loss": 0.0281,
      "step": 11000
    },
    {
      "epoch": 0.55,
      "eval_accuracy": 0.96895,
      "eval_loss": 0.05543893575668335,
      "eval_runtime": 380.3496,
      "eval_samples_per_second": 52.583,
      "eval_steps_per_second": 3.286,
      "step": 11000
    },
    {
      "epoch": 0.575,
      "grad_norm": 0.07327098399400711,
      "learning_rate": 8.501e-06,
      "loss": 0.022,
      "step": 11500
    },
    {
      "epoch": 0.6,
      "grad_norm": 32.43045425415039,
      "learning_rate": 8.001000000000002e-06,
      "loss": 0.0362,
      "step": 12000
    },
    {
      "epoch": 0.6,
      "eval_accuracy": 0.9428,
      "eval_loss": 0.08979687839746475,
      "eval_runtime": 380.3892,
      "eval_samples_per_second": 52.578,
      "eval_steps_per_second": 3.286,
      "step": 12000
    },
    {
      "epoch": 0.625,
      "grad_norm": 0.02456793375313282,
      "learning_rate": 7.501000000000001e-06,
      "loss": 0.0246,
      "step": 12500
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.001343235606327653,
      "learning_rate": 7.001e-06,
      "loss": 0.022,
      "step": 13000
    },
    {
      "epoch": 0.65,
      "eval_accuracy": 0.9687,
      "eval_loss": 0.0772382989525795,
      "eval_runtime": 380.3973,
      "eval_samples_per_second": 52.577,
      "eval_steps_per_second": 3.286,
      "step": 13000
    },
    {
      "epoch": 0.675,
      "grad_norm": 38.745670318603516,
      "learning_rate": 6.501000000000001e-06,
      "loss": 0.0232,
      "step": 13500
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.0009748099255375564,
      "learning_rate": 6.001e-06,
      "loss": 0.0221,
      "step": 14000
    },
    {
      "epoch": 0.7,
      "eval_accuracy": 0.96125,
      "eval_loss": 0.07062023133039474,
      "eval_runtime": 380.5787,
      "eval_samples_per_second": 52.552,
      "eval_steps_per_second": 3.284,
      "step": 14000
    },
    {
      "epoch": 0.725,
      "grad_norm": 0.1631036400794983,
      "learning_rate": 5.501000000000001e-06,
      "loss": 0.0197,
      "step": 14500
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.4217091202735901,
      "learning_rate": 5.001e-06,
      "loss": 0.0256,
      "step": 15000
    },
    {
      "epoch": 0.75,
      "eval_accuracy": 0.97195,
      "eval_loss": 0.048726025968790054,
      "eval_runtime": 380.4177,
      "eval_samples_per_second": 52.574,
      "eval_steps_per_second": 3.286,
      "step": 15000
    },
    {
      "epoch": 0.775,
      "grad_norm": 0.054990075528621674,
      "learning_rate": 4.501000000000001e-06,
      "loss": 0.0176,
      "step": 15500
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.00047703919699415565,
      "learning_rate": 4.001e-06,
      "loss": 0.0215,
      "step": 16000
    },
    {
      "epoch": 0.8,
      "eval_accuracy": 0.97645,
      "eval_loss": 0.042671315371990204,
      "eval_runtime": 380.4653,
      "eval_samples_per_second": 52.567,
      "eval_steps_per_second": 3.285,
      "step": 16000
    },
    {
      "epoch": 0.825,
      "grad_norm": 0.01499607227742672,
      "learning_rate": 3.5010000000000004e-06,
      "loss": 0.0253,
      "step": 16500
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.0003527141234371811,
      "learning_rate": 3.001e-06,
      "loss": 0.0162,
      "step": 17000
    },
    {
      "epoch": 0.85,
      "eval_accuracy": 0.97425,
      "eval_loss": 0.04368801414966583,
      "eval_runtime": 380.357,
      "eval_samples_per_second": 52.582,
      "eval_steps_per_second": 3.286,
      "step": 17000
    },
    {
      "epoch": 0.875,
      "grad_norm": 0.000398589443648234,
      "learning_rate": 2.5010000000000003e-06,
      "loss": 0.014,
      "step": 17500
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.0027746970299631357,
      "learning_rate": 2.001e-06,
      "loss": 0.0186,
      "step": 18000
    },
    {
      "epoch": 0.9,
      "eval_accuracy": 0.96805,
      "eval_loss": 0.06131412461400032,
      "eval_runtime": 380.5623,
      "eval_samples_per_second": 52.554,
      "eval_steps_per_second": 3.285,
      "step": 18000
    },
    {
      "epoch": 0.925,
      "grad_norm": 0.0007658928516320884,
      "learning_rate": 1.5010000000000003e-06,
      "loss": 0.0148,
      "step": 18500
    },
    {
      "epoch": 0.95,
      "grad_norm": 44.06643295288086,
      "learning_rate": 1.001e-06,
      "loss": 0.0211,
      "step": 19000
    },
    {
      "epoch": 0.95,
      "eval_accuracy": 0.95135,
      "eval_loss": 0.09501199424266815,
      "eval_runtime": 380.4677,
      "eval_samples_per_second": 52.567,
      "eval_steps_per_second": 3.285,
      "step": 19000
    }
  ],
  "logging_steps": 500,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.999644020736e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}