{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985211475894705,
"eval_steps": 500,
"global_step": 422,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023661638568470866,
"grad_norm": 6.501171170453925,
"learning_rate": 5e-06,
"loss": 0.8881,
"step": 10
},
{
"epoch": 0.04732327713694173,
"grad_norm": 5.9691618342357975,
"learning_rate": 5e-06,
"loss": 0.7939,
"step": 20
},
{
"epoch": 0.0709849157054126,
"grad_norm": 3.800672425152988,
"learning_rate": 5e-06,
"loss": 0.7688,
"step": 30
},
{
"epoch": 0.09464655427388347,
"grad_norm": 1.0801121583577953,
"learning_rate": 5e-06,
"loss": 0.7473,
"step": 40
},
{
"epoch": 0.11830819284235433,
"grad_norm": 0.9685892726062331,
"learning_rate": 5e-06,
"loss": 0.7309,
"step": 50
},
{
"epoch": 0.1419698314108252,
"grad_norm": 0.7335129796923299,
"learning_rate": 5e-06,
"loss": 0.7181,
"step": 60
},
{
"epoch": 0.16563146997929606,
"grad_norm": 0.5463303979273281,
"learning_rate": 5e-06,
"loss": 0.7118,
"step": 70
},
{
"epoch": 0.18929310854776693,
"grad_norm": 0.7971470498150557,
"learning_rate": 5e-06,
"loss": 0.7019,
"step": 80
},
{
"epoch": 0.2129547471162378,
"grad_norm": 0.5202406996039837,
"learning_rate": 5e-06,
"loss": 0.6862,
"step": 90
},
{
"epoch": 0.23661638568470866,
"grad_norm": 0.5912783446923657,
"learning_rate": 5e-06,
"loss": 0.6854,
"step": 100
},
{
"epoch": 0.26027802425317953,
"grad_norm": 0.5421067803132418,
"learning_rate": 5e-06,
"loss": 0.6902,
"step": 110
},
{
"epoch": 0.2839396628216504,
"grad_norm": 0.6577266833093017,
"learning_rate": 5e-06,
"loss": 0.6776,
"step": 120
},
{
"epoch": 0.30760130139012126,
"grad_norm": 0.7220884795089095,
"learning_rate": 5e-06,
"loss": 0.6736,
"step": 130
},
{
"epoch": 0.33126293995859213,
"grad_norm": 0.6298167534415481,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 140
},
{
"epoch": 0.354924578527063,
"grad_norm": 0.6024637677965929,
"learning_rate": 5e-06,
"loss": 0.6647,
"step": 150
},
{
"epoch": 0.37858621709553386,
"grad_norm": 0.5147721195366849,
"learning_rate": 5e-06,
"loss": 0.6691,
"step": 160
},
{
"epoch": 0.4022478556640047,
"grad_norm": 0.7032904919994801,
"learning_rate": 5e-06,
"loss": 0.6672,
"step": 170
},
{
"epoch": 0.4259094942324756,
"grad_norm": 0.5237180402877233,
"learning_rate": 5e-06,
"loss": 0.6739,
"step": 180
},
{
"epoch": 0.44957113280094646,
"grad_norm": 0.5717691530030693,
"learning_rate": 5e-06,
"loss": 0.6698,
"step": 190
},
{
"epoch": 0.4732327713694173,
"grad_norm": 0.4766660835336923,
"learning_rate": 5e-06,
"loss": 0.6688,
"step": 200
},
{
"epoch": 0.4968944099378882,
"grad_norm": 0.6790711796986141,
"learning_rate": 5e-06,
"loss": 0.6695,
"step": 210
},
{
"epoch": 0.5205560485063591,
"grad_norm": 0.5878534288094095,
"learning_rate": 5e-06,
"loss": 0.668,
"step": 220
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.4783030354173372,
"learning_rate": 5e-06,
"loss": 0.6603,
"step": 230
},
{
"epoch": 0.5678793256433008,
"grad_norm": 0.5394816933853074,
"learning_rate": 5e-06,
"loss": 0.6645,
"step": 240
},
{
"epoch": 0.5915409642117717,
"grad_norm": 0.6757841342023195,
"learning_rate": 5e-06,
"loss": 0.6616,
"step": 250
},
{
"epoch": 0.6152026027802425,
"grad_norm": 0.620819409974346,
"learning_rate": 5e-06,
"loss": 0.652,
"step": 260
},
{
"epoch": 0.6388642413487134,
"grad_norm": 0.4646800950253652,
"learning_rate": 5e-06,
"loss": 0.6639,
"step": 270
},
{
"epoch": 0.6625258799171843,
"grad_norm": 0.584735854517762,
"learning_rate": 5e-06,
"loss": 0.6651,
"step": 280
},
{
"epoch": 0.6861875184856552,
"grad_norm": 0.4893625135857586,
"learning_rate": 5e-06,
"loss": 0.6596,
"step": 290
},
{
"epoch": 0.709849157054126,
"grad_norm": 0.6392477367293743,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 300
},
{
"epoch": 0.7335107956225969,
"grad_norm": 0.5580856537983637,
"learning_rate": 5e-06,
"loss": 0.6619,
"step": 310
},
{
"epoch": 0.7571724341910677,
"grad_norm": 0.6586565023491409,
"learning_rate": 5e-06,
"loss": 0.6556,
"step": 320
},
{
"epoch": 0.7808340727595386,
"grad_norm": 0.7017087820497026,
"learning_rate": 5e-06,
"loss": 0.6541,
"step": 330
},
{
"epoch": 0.8044957113280095,
"grad_norm": 0.6201315444988488,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 340
},
{
"epoch": 0.8281573498964804,
"grad_norm": 0.5419722366809865,
"learning_rate": 5e-06,
"loss": 0.6602,
"step": 350
},
{
"epoch": 0.8518189884649512,
"grad_norm": 0.47923619949498586,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 360
},
{
"epoch": 0.8754806270334221,
"grad_norm": 0.48623500055139035,
"learning_rate": 5e-06,
"loss": 0.6565,
"step": 370
},
{
"epoch": 0.8991422656018929,
"grad_norm": 0.4927729455615553,
"learning_rate": 5e-06,
"loss": 0.6477,
"step": 380
},
{
"epoch": 0.9228039041703638,
"grad_norm": 0.47069043398418653,
"learning_rate": 5e-06,
"loss": 0.6552,
"step": 390
},
{
"epoch": 0.9464655427388347,
"grad_norm": 0.5081907383490093,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 400
},
{
"epoch": 0.9701271813073056,
"grad_norm": 0.4892217314838454,
"learning_rate": 5e-06,
"loss": 0.649,
"step": 410
},
{
"epoch": 0.9937888198757764,
"grad_norm": 0.5410064668246478,
"learning_rate": 5e-06,
"loss": 0.648,
"step": 420
},
{
"epoch": 0.9985211475894705,
"eval_loss": 0.6502951979637146,
"eval_runtime": 446.2763,
"eval_samples_per_second": 25.52,
"eval_steps_per_second": 0.399,
"step": 422
},
{
"epoch": 0.9985211475894705,
"step": 422,
"total_flos": 706656337920000.0,
"train_loss": 0.6818134089781774,
"train_runtime": 24762.6241,
"train_samples_per_second": 8.738,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 422,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 706656337920000.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}