| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 88, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.045714285714285714, |
| "grad_norm": 14.417235374450684, |
| "learning_rate": 9.65909090909091e-06, |
| "loss": 12.365, |
| "mean_token_accuracy": 0.7070006560534239, |
| "num_tokens": 262144.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.09142857142857143, |
| "grad_norm": 8.5507230758667, |
| "learning_rate": 9.204545454545455e-06, |
| "loss": 10.3836, |
| "mean_token_accuracy": 0.7233078088611364, |
| "num_tokens": 524288.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.13714285714285715, |
| "grad_norm": 5.2309746742248535, |
| "learning_rate": 8.750000000000001e-06, |
| "loss": 9.9095, |
| "mean_token_accuracy": 0.7237512413412333, |
| "num_tokens": 786432.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.18285714285714286, |
| "grad_norm": 4.363022327423096, |
| "learning_rate": 8.295454545454547e-06, |
| "loss": 9.3507, |
| "mean_token_accuracy": 0.7317077834159136, |
| "num_tokens": 1048576.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.22857142857142856, |
| "grad_norm": 4.8386616706848145, |
| "learning_rate": 7.840909090909091e-06, |
| "loss": 8.6437, |
| "mean_token_accuracy": 0.7404739372432232, |
| "num_tokens": 1310720.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2742857142857143, |
| "grad_norm": 4.859585285186768, |
| "learning_rate": 7.386363636363637e-06, |
| "loss": 7.8244, |
| "mean_token_accuracy": 0.7579365894198418, |
| "num_tokens": 1572864.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 5.170243740081787, |
| "learning_rate": 6.931818181818183e-06, |
| "loss": 7.1408, |
| "mean_token_accuracy": 0.7731606159359217, |
| "num_tokens": 1835008.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.3657142857142857, |
| "grad_norm": 4.473582744598389, |
| "learning_rate": 6.477272727272727e-06, |
| "loss": 7.0147, |
| "mean_token_accuracy": 0.7715074121952057, |
| "num_tokens": 2097152.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4114285714285714, |
| "grad_norm": 4.883391380310059, |
| "learning_rate": 6.022727272727273e-06, |
| "loss": 6.2369, |
| "mean_token_accuracy": 0.7961227279156446, |
| "num_tokens": 2359296.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.45714285714285713, |
| "grad_norm": 4.0982818603515625, |
| "learning_rate": 5.568181818181818e-06, |
| "loss": 5.6263, |
| "mean_token_accuracy": 0.8100197333842516, |
| "num_tokens": 2621440.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5028571428571429, |
| "grad_norm": 5.6680827140808105, |
| "learning_rate": 5.113636363636364e-06, |
| "loss": 5.526, |
| "mean_token_accuracy": 0.8139757830649614, |
| "num_tokens": 2883584.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.5485714285714286, |
| "grad_norm": 4.965151309967041, |
| "learning_rate": 4.6590909090909095e-06, |
| "loss": 4.5715, |
| "mean_token_accuracy": 0.8425043243914843, |
| "num_tokens": 3145728.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.5942857142857143, |
| "grad_norm": 4.398440837860107, |
| "learning_rate": 4.204545454545455e-06, |
| "loss": 4.5646, |
| "mean_token_accuracy": 0.8441276364028454, |
| "num_tokens": 3407872.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 4.850263595581055, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 4.2065, |
| "mean_token_accuracy": 0.8565848618745804, |
| "num_tokens": 3670016.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.6857142857142857, |
| "grad_norm": 5.376770973205566, |
| "learning_rate": 3.2954545454545456e-06, |
| "loss": 3.8085, |
| "mean_token_accuracy": 0.8706343658268452, |
| "num_tokens": 3932160.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7314285714285714, |
| "grad_norm": 4.992929458618164, |
| "learning_rate": 2.8409090909090916e-06, |
| "loss": 3.5296, |
| "mean_token_accuracy": 0.8793725371360779, |
| "num_tokens": 4194304.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.7771428571428571, |
| "grad_norm": 5.099910736083984, |
| "learning_rate": 2.3863636363636367e-06, |
| "loss": 3.158, |
| "mean_token_accuracy": 0.8912728782743216, |
| "num_tokens": 4456448.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.8228571428571428, |
| "grad_norm": 4.732657432556152, |
| "learning_rate": 1.931818181818182e-06, |
| "loss": 3.4912, |
| "mean_token_accuracy": 0.8802231661975384, |
| "num_tokens": 4718592.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.8685714285714285, |
| "grad_norm": 4.845268726348877, |
| "learning_rate": 1.4772727272727275e-06, |
| "loss": 3.0872, |
| "mean_token_accuracy": 0.8950978331267834, |
| "num_tokens": 4980736.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.9142857142857143, |
| "grad_norm": 4.847484111785889, |
| "learning_rate": 1.0227272727272729e-06, |
| "loss": 2.8941, |
| "mean_token_accuracy": 0.8987292610108852, |
| "num_tokens": 5242880.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.96, |
| "grad_norm": 4.448861122131348, |
| "learning_rate": 5.681818181818182e-07, |
| "loss": 3.0148, |
| "mean_token_accuracy": 0.8966547567397356, |
| "num_tokens": 5505024.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 3.111453056335449, |
| "learning_rate": 1.1363636363636364e-07, |
| "loss": 2.7214, |
| "mean_token_accuracy": 0.8931009152105877, |
| "num_tokens": 5730304.0, |
| "step": 88 |
| } |
| ], |
| "logging_steps": 4, |
| "max_steps": 88, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 40, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.026946939511112e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|