yuyuchily's picture
Upload E-8_baseline_seq_e8
fd8d96d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8027057497181511,
"eval_steps": 50,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04509582863585118,
"grad_norm": 4.880373001098633,
"learning_rate": 4e-07,
"loss": 1.7893,
"step": 10
},
{
"epoch": 0.09019165727170236,
"grad_norm": 4.680130958557129,
"learning_rate": 8.444444444444444e-07,
"loss": 1.7309,
"step": 20
},
{
"epoch": 0.13528748590755355,
"grad_norm": 4.52065372467041,
"learning_rate": 1.2888888888888889e-06,
"loss": 1.7019,
"step": 30
},
{
"epoch": 0.18038331454340473,
"grad_norm": 4.010641574859619,
"learning_rate": 1.7333333333333334e-06,
"loss": 1.5762,
"step": 40
},
{
"epoch": 0.2254791431792559,
"grad_norm": 2.6717236042022705,
"learning_rate": 1.9995040840893383e-06,
"loss": 1.6333,
"step": 50
},
{
"epoch": 0.2254791431792559,
"eval_loss": 1.5759482383728027,
"eval_runtime": 14.6699,
"eval_samples_per_second": 12.747,
"eval_steps_per_second": 6.408,
"step": 50
},
{
"epoch": 0.2705749718151071,
"grad_norm": 1.3333920240402222,
"learning_rate": 1.9939306773179494e-06,
"loss": 1.2989,
"step": 60
},
{
"epoch": 0.3156708004509583,
"grad_norm": 0.6459554433822632,
"learning_rate": 1.9821986184473754e-06,
"loss": 1.2498,
"step": 70
},
{
"epoch": 0.36076662908680945,
"grad_norm": 0.6363914608955383,
"learning_rate": 1.964380602355277e-06,
"loss": 1.157,
"step": 80
},
{
"epoch": 0.40586245772266066,
"grad_norm": 0.4646184742450714,
"learning_rate": 1.9405870340877135e-06,
"loss": 1.1613,
"step": 90
},
{
"epoch": 0.4509582863585118,
"grad_norm": 0.5635167956352234,
"learning_rate": 1.9109653447608605e-06,
"loss": 1.3009,
"step": 100
},
{
"epoch": 0.4509582863585118,
"eval_loss": 1.3309173583984375,
"eval_runtime": 14.1549,
"eval_samples_per_second": 13.211,
"eval_steps_per_second": 6.641,
"step": 100
},
{
"epoch": 0.496054114994363,
"grad_norm": 0.5266286134719849,
"learning_rate": 1.8756990780396006e-06,
"loss": 1.1237,
"step": 110
},
{
"epoch": 0.5411499436302142,
"grad_norm": 0.5904309153556824,
"learning_rate": 1.8350067528534024e-06,
"loss": 1.2938,
"step": 120
},
{
"epoch": 0.5862457722660653,
"grad_norm": 0.4533354938030243,
"learning_rate": 1.7891405093963937e-06,
"loss": 1.3771,
"step": 130
},
{
"epoch": 0.6313416009019166,
"grad_norm": 0.6127325892448425,
"learning_rate": 1.7383845468013654e-06,
"loss": 1.417,
"step": 140
},
{
"epoch": 0.6764374295377678,
"grad_norm": 0.5645148754119873,
"learning_rate": 1.683053362168282e-06,
"loss": 1.1581,
"step": 150
},
{
"epoch": 0.6764374295377678,
"eval_loss": 1.2584223747253418,
"eval_runtime": 14.0101,
"eval_samples_per_second": 13.347,
"eval_steps_per_second": 6.709,
"step": 150
},
{
"epoch": 0.7215332581736189,
"grad_norm": 0.5589331984519958,
"learning_rate": 1.6234898018587336e-06,
"loss": 1.1287,
"step": 160
},
{
"epoch": 0.7666290868094702,
"grad_norm": 0.5021057724952698,
"learning_rate": 1.5600629371310144e-06,
"loss": 1.105,
"step": 170
},
{
"epoch": 0.8117249154453213,
"grad_norm": 0.4609015882015228,
"learning_rate": 1.4931657772789457e-06,
"loss": 1.1352,
"step": 180
},
{
"epoch": 0.8568207440811725,
"grad_norm": 0.46195802092552185,
"learning_rate": 1.423212834444425e-06,
"loss": 1.0855,
"step": 190
},
{
"epoch": 0.9019165727170236,
"grad_norm": 0.527087926864624,
"learning_rate": 1.3506375551927544e-06,
"loss": 1.0202,
"step": 200
},
{
"epoch": 0.9019165727170236,
"eval_loss": 1.2102832794189453,
"eval_runtime": 14.1065,
"eval_samples_per_second": 13.256,
"eval_steps_per_second": 6.664,
"step": 200
},
{
"epoch": 0.9470124013528749,
"grad_norm": 1.0008864402770996,
"learning_rate": 1.2758896347653752e-06,
"loss": 1.2173,
"step": 210
},
{
"epoch": 0.992108229988726,
"grad_norm": 0.5077604651451111,
"learning_rate": 1.1994322306515925e-06,
"loss": 1.0537,
"step": 220
},
{
"epoch": 1.0360766629086808,
"grad_norm": 0.711245596408844,
"learning_rate": 1.1217390927447225e-06,
"loss": 1.1367,
"step": 230
},
{
"epoch": 1.0811724915445322,
"grad_norm": 0.510122537612915,
"learning_rate": 1.043291627864961e-06,
"loss": 1.1316,
"step": 240
},
{
"epoch": 1.1262683201803834,
"grad_norm": 0.5964261293411255,
"learning_rate": 9.645759168379461e-07,
"loss": 1.2521,
"step": 250
},
{
"epoch": 1.1262683201803834,
"eval_loss": 1.1702840328216553,
"eval_runtime": 14.1699,
"eval_samples_per_second": 13.197,
"eval_steps_per_second": 6.634,
"step": 250
},
{
"epoch": 1.1713641488162345,
"grad_norm": 0.39409753680229187,
"learning_rate": 8.860797026119721e-07,
"loss": 0.915,
"step": 260
},
{
"epoch": 1.2164599774520857,
"grad_norm": 0.4673859179019928,
"learning_rate": 8.082893680762618e-07,
"loss": 1.2371,
"step": 270
},
{
"epoch": 1.2615558060879368,
"grad_norm": 0.5883477926254272,
"learning_rate": 7.316869223065155e-07,
"loss": 1.1097,
"step": 280
},
{
"epoch": 1.306651634723788,
"grad_norm": 0.5270853638648987,
"learning_rate": 6.567470139117447e-07,
"loss": 1.0892,
"step": 290
},
{
"epoch": 1.3517474633596391,
"grad_norm": 0.5741926431655884,
"learning_rate": 5.839339899884628e-07,
"loss": 0.9821,
"step": 300
},
{
"epoch": 1.3517474633596391,
"eval_loss": 1.1472760438919067,
"eval_runtime": 14.092,
"eval_samples_per_second": 13.27,
"eval_steps_per_second": 6.67,
"step": 300
},
{
"epoch": 1.3968432919954905,
"grad_norm": 0.426485151052475,
"learning_rate": 5.136990189057187e-07,
"loss": 1.1382,
"step": 310
},
{
"epoch": 1.4419391206313417,
"grad_norm": 0.45604461431503296,
"learning_rate": 4.4647729474894123e-07,
"loss": 1.0356,
"step": 320
},
{
"epoch": 1.4870349492671928,
"grad_norm": 0.3508279621601105,
"learning_rate": 3.826853407445848e-07,
"loss": 1.0695,
"step": 330
},
{
"epoch": 1.532130777903044,
"grad_norm": 0.4223732352256775,
"learning_rate": 3.227184283742591e-07,
"loss": 0.9751,
"step": 340
},
{
"epoch": 1.5772266065388951,
"grad_norm": 0.6314070224761963,
"learning_rate": 2.6694812817017387e-07,
"loss": 1.0793,
"step": 350
},
{
"epoch": 1.5772266065388951,
"eval_loss": 1.136489748954773,
"eval_runtime": 14.3065,
"eval_samples_per_second": 13.071,
"eval_steps_per_second": 6.57,
"step": 350
},
{
"epoch": 1.6223224351747465,
"grad_norm": 0.5215230584144592,
"learning_rate": 2.157200073678137e-07,
"loss": 0.973,
"step": 360
},
{
"epoch": 1.6674182638105974,
"grad_norm": 0.5581642985343933,
"learning_rate": 1.6935148868177718e-07,
"loss": 0.9685,
"step": 370
},
{
"epoch": 1.7125140924464488,
"grad_norm": 0.5601069331169128,
"learning_rate": 1.2812988347236166e-07,
"loss": 1.0191,
"step": 380
},
{
"epoch": 1.7576099210822997,
"grad_norm": 0.4990064203739166,
"learning_rate": 9.231061148990648e-08,
"loss": 0.9674,
"step": 390
},
{
"epoch": 1.8027057497181511,
"grad_norm": 0.6436290144920349,
"learning_rate": 6.211561822781474e-08,
"loss": 0.826,
"step": 400
},
{
"epoch": 1.8027057497181511,
"eval_loss": 1.1320565938949585,
"eval_runtime": 14.0805,
"eval_samples_per_second": 13.281,
"eval_steps_per_second": 6.676,
"step": 400
}
],
"logging_steps": 10,
"max_steps": 444,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.339717225125376e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}