{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9973380656610469,
"eval_steps": 500,
"global_step": 844,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023661638568470866,
"grad_norm": 1.2408271523327399,
"learning_rate": 5e-06,
"loss": 0.8878,
"step": 10
},
{
"epoch": 0.04732327713694173,
"grad_norm": 10.221763472546659,
"learning_rate": 5e-06,
"loss": 0.7989,
"step": 20
},
{
"epoch": 0.0709849157054126,
"grad_norm": 1.2230059638293505,
"learning_rate": 5e-06,
"loss": 0.773,
"step": 30
},
{
"epoch": 0.09464655427388347,
"grad_norm": 2.6694221939382583,
"learning_rate": 5e-06,
"loss": 0.7466,
"step": 40
},
{
"epoch": 0.11830819284235433,
"grad_norm": 0.7655921008659343,
"learning_rate": 5e-06,
"loss": 0.7308,
"step": 50
},
{
"epoch": 0.1419698314108252,
"grad_norm": 0.6751841899446792,
"learning_rate": 5e-06,
"loss": 0.7178,
"step": 60
},
{
"epoch": 0.16563146997929606,
"grad_norm": 0.545261037277831,
"learning_rate": 5e-06,
"loss": 0.7116,
"step": 70
},
{
"epoch": 0.18929310854776693,
"grad_norm": 0.8030523103589834,
"learning_rate": 5e-06,
"loss": 0.7017,
"step": 80
},
{
"epoch": 0.2129547471162378,
"grad_norm": 0.8014531123871866,
"learning_rate": 5e-06,
"loss": 0.6861,
"step": 90
},
{
"epoch": 0.23661638568470866,
"grad_norm": 0.4918470297754101,
"learning_rate": 5e-06,
"loss": 0.6852,
"step": 100
},
{
"epoch": 0.26027802425317953,
"grad_norm": 0.69215978679395,
"learning_rate": 5e-06,
"loss": 0.69,
"step": 110
},
{
"epoch": 0.2839396628216504,
"grad_norm": 0.8786435457825235,
"learning_rate": 5e-06,
"loss": 0.6773,
"step": 120
},
{
"epoch": 0.30760130139012126,
"grad_norm": 0.49069358486584114,
"learning_rate": 5e-06,
"loss": 0.6737,
"step": 130
},
{
"epoch": 0.33126293995859213,
"grad_norm": 0.7921488279977867,
"learning_rate": 5e-06,
"loss": 0.6821,
"step": 140
},
{
"epoch": 0.354924578527063,
"grad_norm": 0.77230991386959,
"learning_rate": 5e-06,
"loss": 0.6648,
"step": 150
},
{
"epoch": 0.37858621709553386,
"grad_norm": 0.47987920788300265,
"learning_rate": 5e-06,
"loss": 0.669,
"step": 160
},
{
"epoch": 0.4022478556640047,
"grad_norm": 0.5618200809563821,
"learning_rate": 5e-06,
"loss": 0.6668,
"step": 170
},
{
"epoch": 0.4259094942324756,
"grad_norm": 0.7304782642194491,
"learning_rate": 5e-06,
"loss": 0.6737,
"step": 180
},
{
"epoch": 0.44957113280094646,
"grad_norm": 0.46280184605813207,
"learning_rate": 5e-06,
"loss": 0.6697,
"step": 190
},
{
"epoch": 0.4732327713694173,
"grad_norm": 0.7079097684721737,
"learning_rate": 5e-06,
"loss": 0.6686,
"step": 200
},
{
"epoch": 0.4968944099378882,
"grad_norm": 0.774761573498746,
"learning_rate": 5e-06,
"loss": 0.6694,
"step": 210
},
{
"epoch": 0.5205560485063591,
"grad_norm": 0.576730626392715,
"learning_rate": 5e-06,
"loss": 0.6677,
"step": 220
},
{
"epoch": 0.54421768707483,
"grad_norm": 0.5744988270185307,
"learning_rate": 5e-06,
"loss": 0.6602,
"step": 230
},
{
"epoch": 0.5678793256433008,
"grad_norm": 0.5394481930250411,
"learning_rate": 5e-06,
"loss": 0.6644,
"step": 240
},
{
"epoch": 0.5915409642117717,
"grad_norm": 0.5182952984171931,
"learning_rate": 5e-06,
"loss": 0.6615,
"step": 250
},
{
"epoch": 0.6152026027802425,
"grad_norm": 0.6364320156443367,
"learning_rate": 5e-06,
"loss": 0.6519,
"step": 260
},
{
"epoch": 0.6388642413487134,
"grad_norm": 0.6324207034276161,
"learning_rate": 5e-06,
"loss": 0.6639,
"step": 270
},
{
"epoch": 0.6625258799171843,
"grad_norm": 0.6620182705762153,
"learning_rate": 5e-06,
"loss": 0.6651,
"step": 280
},
{
"epoch": 0.6861875184856552,
"grad_norm": 0.46128169756980925,
"learning_rate": 5e-06,
"loss": 0.6596,
"step": 290
},
{
"epoch": 0.709849157054126,
"grad_norm": 0.622188372470794,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 300
},
{
"epoch": 0.7335107956225969,
"grad_norm": 0.4904698615453566,
"learning_rate": 5e-06,
"loss": 0.6618,
"step": 310
},
{
"epoch": 0.7571724341910677,
"grad_norm": 0.4555806118897353,
"learning_rate": 5e-06,
"loss": 0.6554,
"step": 320
},
{
"epoch": 0.7808340727595386,
"grad_norm": 0.5273034701797177,
"learning_rate": 5e-06,
"loss": 0.654,
"step": 330
},
{
"epoch": 0.8044957113280095,
"grad_norm": 0.5442233535066454,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 340
},
{
"epoch": 0.8281573498964804,
"grad_norm": 0.6380409398524519,
"learning_rate": 5e-06,
"loss": 0.6601,
"step": 350
},
{
"epoch": 0.8518189884649512,
"grad_norm": 0.4389996927828098,
"learning_rate": 5e-06,
"loss": 0.6537,
"step": 360
},
{
"epoch": 0.8754806270334221,
"grad_norm": 0.4608268531740333,
"learning_rate": 5e-06,
"loss": 0.6565,
"step": 370
},
{
"epoch": 0.8991422656018929,
"grad_norm": 0.5330723429667825,
"learning_rate": 5e-06,
"loss": 0.6477,
"step": 380
},
{
"epoch": 0.9228039041703638,
"grad_norm": 0.5929849990200475,
"learning_rate": 5e-06,
"loss": 0.6552,
"step": 390
},
{
"epoch": 0.9464655427388347,
"grad_norm": 0.4773172047297779,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 400
},
{
"epoch": 0.9701271813073056,
"grad_norm": 0.4606137860127268,
"learning_rate": 5e-06,
"loss": 0.6489,
"step": 410
},
{
"epoch": 0.9937888198757764,
"grad_norm": 0.526120099445913,
"learning_rate": 5e-06,
"loss": 0.6478,
"step": 420
},
{
"epoch": 0.9985211475894705,
"eval_loss": 0.6501929759979248,
"eval_runtime": 449.6535,
"eval_samples_per_second": 25.328,
"eval_steps_per_second": 0.396,
"step": 422
},
{
"epoch": 1.0177462289263532,
"grad_norm": 0.5143362353922324,
"learning_rate": 5e-06,
"loss": 0.6515,
"step": 430
},
{
"epoch": 1.041407867494824,
"grad_norm": 0.5162162401792869,
"learning_rate": 5e-06,
"loss": 0.605,
"step": 440
},
{
"epoch": 1.0650695060632949,
"grad_norm": 0.7393357078452915,
"learning_rate": 5e-06,
"loss": 0.603,
"step": 450
},
{
"epoch": 1.0887311446317658,
"grad_norm": 0.649426932177774,
"learning_rate": 5e-06,
"loss": 0.6134,
"step": 460
},
{
"epoch": 1.1123927832002367,
"grad_norm": 0.5705639188659947,
"learning_rate": 5e-06,
"loss": 0.6106,
"step": 470
},
{
"epoch": 1.1360544217687074,
"grad_norm": 0.7543562567579628,
"learning_rate": 5e-06,
"loss": 0.611,
"step": 480
},
{
"epoch": 1.1597160603371783,
"grad_norm": 0.5499597181388575,
"learning_rate": 5e-06,
"loss": 0.6079,
"step": 490
},
{
"epoch": 1.1833776989056493,
"grad_norm": 0.5262121393467482,
"learning_rate": 5e-06,
"loss": 0.6036,
"step": 500
},
{
"epoch": 1.2070393374741202,
"grad_norm": 0.5683114548160128,
"learning_rate": 5e-06,
"loss": 0.6034,
"step": 510
},
{
"epoch": 1.2307009760425909,
"grad_norm": 0.6610172663362014,
"learning_rate": 5e-06,
"loss": 0.6099,
"step": 520
},
{
"epoch": 1.2543626146110618,
"grad_norm": 0.6007955010537178,
"learning_rate": 5e-06,
"loss": 0.6125,
"step": 530
},
{
"epoch": 1.2780242531795327,
"grad_norm": 0.5585264375543114,
"learning_rate": 5e-06,
"loss": 0.6121,
"step": 540
},
{
"epoch": 1.3016858917480034,
"grad_norm": 0.4689366084615487,
"learning_rate": 5e-06,
"loss": 0.6089,
"step": 550
},
{
"epoch": 1.3253475303164743,
"grad_norm": 0.443719906754886,
"learning_rate": 5e-06,
"loss": 0.6073,
"step": 560
},
{
"epoch": 1.3490091688849453,
"grad_norm": 0.8624897115990705,
"learning_rate": 5e-06,
"loss": 0.6084,
"step": 570
},
{
"epoch": 1.3726708074534162,
"grad_norm": 0.5498793437391156,
"learning_rate": 5e-06,
"loss": 0.611,
"step": 580
},
{
"epoch": 1.396332446021887,
"grad_norm": 0.44457160894446396,
"learning_rate": 5e-06,
"loss": 0.6115,
"step": 590
},
{
"epoch": 1.4199940845903578,
"grad_norm": 0.5196837986130378,
"learning_rate": 5e-06,
"loss": 0.6008,
"step": 600
},
{
"epoch": 1.4436557231588287,
"grad_norm": 0.40806642647037533,
"learning_rate": 5e-06,
"loss": 0.6002,
"step": 610
},
{
"epoch": 1.4673173617272997,
"grad_norm": 0.449778520265882,
"learning_rate": 5e-06,
"loss": 0.6037,
"step": 620
},
{
"epoch": 1.4909790002957704,
"grad_norm": 0.46760792115141014,
"learning_rate": 5e-06,
"loss": 0.6157,
"step": 630
},
{
"epoch": 1.5146406388642415,
"grad_norm": 0.4490152450206069,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 640
},
{
"epoch": 1.5383022774327122,
"grad_norm": 0.42442779950583953,
"learning_rate": 5e-06,
"loss": 0.6042,
"step": 650
},
{
"epoch": 1.5619639160011831,
"grad_norm": 0.5976128445381751,
"learning_rate": 5e-06,
"loss": 0.609,
"step": 660
},
{
"epoch": 1.585625554569654,
"grad_norm": 0.7381067199080075,
"learning_rate": 5e-06,
"loss": 0.6015,
"step": 670
},
{
"epoch": 1.6092871931381247,
"grad_norm": 0.4692365896477618,
"learning_rate": 5e-06,
"loss": 0.6098,
"step": 680
},
{
"epoch": 1.6329488317065957,
"grad_norm": 0.5475052095467955,
"learning_rate": 5e-06,
"loss": 0.601,
"step": 690
},
{
"epoch": 1.6566104702750666,
"grad_norm": 0.5706027825471482,
"learning_rate": 5e-06,
"loss": 0.6107,
"step": 700
},
{
"epoch": 1.6802721088435373,
"grad_norm": 0.5270197331562642,
"learning_rate": 5e-06,
"loss": 0.609,
"step": 710
},
{
"epoch": 1.7039337474120084,
"grad_norm": 0.6598391343305342,
"learning_rate": 5e-06,
"loss": 0.6118,
"step": 720
},
{
"epoch": 1.7275953859804791,
"grad_norm": 0.5570434796027114,
"learning_rate": 5e-06,
"loss": 0.6116,
"step": 730
},
{
"epoch": 1.75125702454895,
"grad_norm": 0.4955844130516369,
"learning_rate": 5e-06,
"loss": 0.6039,
"step": 740
},
{
"epoch": 1.774918663117421,
"grad_norm": 0.47770168087128073,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 750
},
{
"epoch": 1.7985803016858917,
"grad_norm": 0.4667370666965365,
"learning_rate": 5e-06,
"loss": 0.614,
"step": 760
},
{
"epoch": 1.8222419402543626,
"grad_norm": 0.4616819567056668,
"learning_rate": 5e-06,
"loss": 0.6158,
"step": 770
},
{
"epoch": 1.8459035788228335,
"grad_norm": 0.43467879051005953,
"learning_rate": 5e-06,
"loss": 0.6067,
"step": 780
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.48362881437134725,
"learning_rate": 5e-06,
"loss": 0.6054,
"step": 790
},
{
"epoch": 1.8932268559597754,
"grad_norm": 0.49747648081112666,
"learning_rate": 5e-06,
"loss": 0.6137,
"step": 800
},
{
"epoch": 1.916888494528246,
"grad_norm": 0.4097820122920606,
"learning_rate": 5e-06,
"loss": 0.6114,
"step": 810
},
{
"epoch": 1.940550133096717,
"grad_norm": 0.47535675742314604,
"learning_rate": 5e-06,
"loss": 0.5996,
"step": 820
},
{
"epoch": 1.964211771665188,
"grad_norm": 0.49949616004506914,
"learning_rate": 5e-06,
"loss": 0.6108,
"step": 830
},
{
"epoch": 1.9878734102336586,
"grad_norm": 0.4387152081138621,
"learning_rate": 5e-06,
"loss": 0.5981,
"step": 840
},
{
"epoch": 1.9973380656610469,
"eval_loss": 0.6398828029632568,
"eval_runtime": 449.4321,
"eval_samples_per_second": 25.341,
"eval_steps_per_second": 0.396,
"step": 844
},
{
"epoch": 1.9973380656610469,
"step": 844,
"total_flos": 1413522055495680.0,
"train_loss": 0.645099672378522,
"train_runtime": 50035.9585,
"train_samples_per_second": 8.649,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 844,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1413522055495680.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}