HTH / last-checkpoint /trainer_state.json
mgh6's picture
Training in progress, epoch 9, checkpoint
7ddc6d2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.929233772571987,
"eval_steps": 50,
"global_step": 1280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3904343582235237,
"grad_norm": 0.5675944685935974,
"learning_rate": 9.609375e-05,
"loss": 1.5678,
"step": 50
},
{
"epoch": 0.3904343582235237,
"eval_loss": 1.53541898727417,
"eval_runtime": 11.6265,
"eval_samples_per_second": 37.156,
"eval_steps_per_second": 18.578,
"step": 50
},
{
"epoch": 0.7808687164470474,
"grad_norm": 0.5330150127410889,
"learning_rate": 9.21875e-05,
"loss": 1.5019,
"step": 100
},
{
"epoch": 0.7808687164470474,
"eval_loss": 1.4973269701004028,
"eval_runtime": 11.5507,
"eval_samples_per_second": 37.4,
"eval_steps_per_second": 18.7,
"step": 100
},
{
"epoch": 1.16398243045388,
"grad_norm": 0.5410779714584351,
"learning_rate": 8.828125000000001e-05,
"loss": 1.4244,
"step": 150
},
{
"epoch": 1.16398243045388,
"eval_loss": 1.43968665599823,
"eval_runtime": 11.5205,
"eval_samples_per_second": 37.498,
"eval_steps_per_second": 18.749,
"step": 150
},
{
"epoch": 1.5544167886774036,
"grad_norm": 0.5698382258415222,
"learning_rate": 8.4375e-05,
"loss": 1.4023,
"step": 200
},
{
"epoch": 1.5544167886774036,
"eval_loss": 1.4115360975265503,
"eval_runtime": 11.5294,
"eval_samples_per_second": 37.469,
"eval_steps_per_second": 18.735,
"step": 200
},
{
"epoch": 1.9448511469009273,
"grad_norm": 0.5761227607727051,
"learning_rate": 8.046875e-05,
"loss": 1.3693,
"step": 250
},
{
"epoch": 1.9448511469009273,
"eval_loss": 1.3649152517318726,
"eval_runtime": 11.5198,
"eval_samples_per_second": 37.501,
"eval_steps_per_second": 18.75,
"step": 250
},
{
"epoch": 2.32796486090776,
"grad_norm": 0.5856680274009705,
"learning_rate": 7.65625e-05,
"loss": 1.302,
"step": 300
},
{
"epoch": 2.32796486090776,
"eval_loss": 1.3940138816833496,
"eval_runtime": 11.5409,
"eval_samples_per_second": 37.432,
"eval_steps_per_second": 18.716,
"step": 300
},
{
"epoch": 2.7183992191312836,
"grad_norm": 0.5735114216804504,
"learning_rate": 7.265625000000001e-05,
"loss": 1.3063,
"step": 350
},
{
"epoch": 2.7183992191312836,
"eval_loss": 1.3489629030227661,
"eval_runtime": 11.5502,
"eval_samples_per_second": 37.402,
"eval_steps_per_second": 18.701,
"step": 350
},
{
"epoch": 3.101512933138116,
"grad_norm": 0.6013683676719666,
"learning_rate": 6.875e-05,
"loss": 1.2507,
"step": 400
},
{
"epoch": 3.101512933138116,
"eval_loss": 1.3298077583312988,
"eval_runtime": 11.5601,
"eval_samples_per_second": 37.37,
"eval_steps_per_second": 18.685,
"step": 400
},
{
"epoch": 3.49194729136164,
"grad_norm": 0.6187678575515747,
"learning_rate": 6.484375e-05,
"loss": 1.2463,
"step": 450
},
{
"epoch": 3.49194729136164,
"eval_loss": 1.2986701726913452,
"eval_runtime": 11.5395,
"eval_samples_per_second": 37.437,
"eval_steps_per_second": 18.718,
"step": 450
},
{
"epoch": 3.8823816495851635,
"grad_norm": 0.5973629951477051,
"learning_rate": 6.0937500000000004e-05,
"loss": 1.2315,
"step": 500
},
{
"epoch": 3.8823816495851635,
"eval_loss": 1.2973381280899048,
"eval_runtime": 11.5491,
"eval_samples_per_second": 37.405,
"eval_steps_per_second": 18.703,
"step": 500
},
{
"epoch": 4.265495363591996,
"grad_norm": 0.6226805448532104,
"learning_rate": 5.703125e-05,
"loss": 1.1778,
"step": 550
},
{
"epoch": 4.265495363591996,
"eval_loss": 1.2510361671447754,
"eval_runtime": 11.5281,
"eval_samples_per_second": 37.474,
"eval_steps_per_second": 18.737,
"step": 550
},
{
"epoch": 4.65592972181552,
"grad_norm": 0.64255690574646,
"learning_rate": 5.3125000000000004e-05,
"loss": 1.1819,
"step": 600
},
{
"epoch": 4.65592972181552,
"eval_loss": 1.2529098987579346,
"eval_runtime": 11.5409,
"eval_samples_per_second": 37.432,
"eval_steps_per_second": 18.716,
"step": 600
},
{
"epoch": 5.039043435822352,
"grad_norm": 0.6386131048202515,
"learning_rate": 4.921875e-05,
"loss": 1.1507,
"step": 650
},
{
"epoch": 5.039043435822352,
"eval_loss": 1.2271380424499512,
"eval_runtime": 11.5699,
"eval_samples_per_second": 37.338,
"eval_steps_per_second": 18.669,
"step": 650
},
{
"epoch": 5.4294777940458765,
"grad_norm": 0.6771230101585388,
"learning_rate": 4.5312500000000004e-05,
"loss": 1.134,
"step": 700
},
{
"epoch": 5.4294777940458765,
"eval_loss": 1.2191808223724365,
"eval_runtime": 11.5238,
"eval_samples_per_second": 37.488,
"eval_steps_per_second": 18.744,
"step": 700
},
{
"epoch": 5.819912152269399,
"grad_norm": 0.6427966952323914,
"learning_rate": 4.140625e-05,
"loss": 1.1258,
"step": 750
},
{
"epoch": 5.819912152269399,
"eval_loss": 1.2103700637817383,
"eval_runtime": 11.5062,
"eval_samples_per_second": 37.545,
"eval_steps_per_second": 18.772,
"step": 750
},
{
"epoch": 6.203025866276232,
"grad_norm": 0.6937867403030396,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.0941,
"step": 800
},
{
"epoch": 6.203025866276232,
"eval_loss": 1.2105975151062012,
"eval_runtime": 11.55,
"eval_samples_per_second": 37.403,
"eval_steps_per_second": 18.701,
"step": 800
},
{
"epoch": 6.593460224499756,
"grad_norm": 0.719428300857544,
"learning_rate": 3.359375e-05,
"loss": 1.0923,
"step": 850
},
{
"epoch": 6.593460224499756,
"eval_loss": 1.1799763441085815,
"eval_runtime": 11.536,
"eval_samples_per_second": 37.448,
"eval_steps_per_second": 18.724,
"step": 850
},
{
"epoch": 6.98389458272328,
"grad_norm": 0.65595942735672,
"learning_rate": 2.96875e-05,
"loss": 1.0796,
"step": 900
},
{
"epoch": 6.98389458272328,
"eval_loss": 1.1729077100753784,
"eval_runtime": 11.5363,
"eval_samples_per_second": 37.447,
"eval_steps_per_second": 18.724,
"step": 900
},
{
"epoch": 7.367008296730113,
"grad_norm": 0.7260088920593262,
"learning_rate": 2.578125e-05,
"loss": 1.0371,
"step": 950
},
{
"epoch": 7.367008296730113,
"eval_loss": 1.1634416580200195,
"eval_runtime": 11.5405,
"eval_samples_per_second": 37.433,
"eval_steps_per_second": 18.717,
"step": 950
},
{
"epoch": 7.7574426549536355,
"grad_norm": 0.6944181323051453,
"learning_rate": 2.1875e-05,
"loss": 1.0466,
"step": 1000
},
{
"epoch": 7.7574426549536355,
"eval_loss": 1.154969573020935,
"eval_runtime": 11.532,
"eval_samples_per_second": 37.461,
"eval_steps_per_second": 18.731,
"step": 1000
},
{
"epoch": 8.140556368960468,
"grad_norm": 0.7572025060653687,
"learning_rate": 1.796875e-05,
"loss": 1.0228,
"step": 1050
},
{
"epoch": 8.140556368960468,
"eval_loss": 1.1517966985702515,
"eval_runtime": 11.5429,
"eval_samples_per_second": 37.426,
"eval_steps_per_second": 18.713,
"step": 1050
},
{
"epoch": 8.530990727183992,
"grad_norm": 0.6960224509239197,
"learning_rate": 1.4062500000000001e-05,
"loss": 1.0231,
"step": 1100
},
{
"epoch": 8.530990727183992,
"eval_loss": 1.108694314956665,
"eval_runtime": 11.5162,
"eval_samples_per_second": 37.512,
"eval_steps_per_second": 18.756,
"step": 1100
},
{
"epoch": 8.921425085407517,
"grad_norm": 0.6743898391723633,
"learning_rate": 1.0156250000000001e-05,
"loss": 1.0164,
"step": 1150
},
{
"epoch": 8.921425085407517,
"eval_loss": 1.1212413311004639,
"eval_runtime": 11.5308,
"eval_samples_per_second": 37.465,
"eval_steps_per_second": 18.732,
"step": 1150
},
{
"epoch": 9.304538799414349,
"grad_norm": 0.7794139385223389,
"learning_rate": 6.25e-06,
"loss": 0.9863,
"step": 1200
},
{
"epoch": 9.304538799414349,
"eval_loss": 1.1227957010269165,
"eval_runtime": 11.5133,
"eval_samples_per_second": 37.522,
"eval_steps_per_second": 18.761,
"step": 1200
},
{
"epoch": 9.694973157637872,
"grad_norm": 0.7152210474014282,
"learning_rate": 2.3437500000000002e-06,
"loss": 1.005,
"step": 1250
},
{
"epoch": 9.694973157637872,
"eval_loss": 1.1129647493362427,
"eval_runtime": 11.5181,
"eval_samples_per_second": 37.506,
"eval_steps_per_second": 18.753,
"step": 1250
}
],
"logging_steps": 50,
"max_steps": 1280,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.541080433502454e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}