{
"best_metric": 3.141986846923828,
"best_model_checkpoint": "contract1/checkpoint-1455",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1455,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0859106529209622,
"grad_norm": 32.766380310058594,
"learning_rate": 6.849315068493151e-06,
"loss": 7.5498,
"step": 25
},
{
"epoch": 0.1718213058419244,
"grad_norm": 55.16777420043945,
"learning_rate": 1.5068493150684931e-05,
"loss": 7.7409,
"step": 50
},
{
"epoch": 0.25773195876288657,
"grad_norm": 19.554790496826172,
"learning_rate": 2.363013698630137e-05,
"loss": 6.8118,
"step": 75
},
{
"epoch": 0.3436426116838488,
"grad_norm": 11.124310493469238,
"learning_rate": 3.219178082191781e-05,
"loss": 5.94,
"step": 100
},
{
"epoch": 0.42955326460481097,
"grad_norm": 9.250848770141602,
"learning_rate": 4.075342465753425e-05,
"loss": 5.2277,
"step": 125
},
{
"epoch": 0.5154639175257731,
"grad_norm": 3.131469964981079,
"learning_rate": 4.9315068493150684e-05,
"loss": 4.6817,
"step": 150
},
{
"epoch": 0.6013745704467354,
"grad_norm": 4.209794998168945,
"learning_rate": 4.912146676852559e-05,
"loss": 4.3776,
"step": 175
},
{
"epoch": 0.6872852233676976,
"grad_norm": 3.0055902004241943,
"learning_rate": 4.816653934300993e-05,
"loss": 4.3974,
"step": 200
},
{
"epoch": 0.7731958762886598,
"grad_norm": 2.923142433166504,
"learning_rate": 4.7211611917494275e-05,
"loss": 4.2811,
"step": 225
},
{
"epoch": 0.8591065292096219,
"grad_norm": 3.110403060913086,
"learning_rate": 4.625668449197861e-05,
"loss": 4.1323,
"step": 250
},
{
"epoch": 0.9450171821305842,
"grad_norm": 2.941375970840454,
"learning_rate": 4.530175706646295e-05,
"loss": 4.1902,
"step": 275
},
{
"epoch": 1.0,
"eval_gen_len": 15.1115,
"eval_loss": 3.686694622039795,
"eval_rouge1": 17.9543,
"eval_rouge2": 4.0352,
"eval_rougeL": 16.3506,
"eval_rougeLsum": 16.4818,
"eval_runtime": 11.3626,
"eval_samples_per_second": 25.61,
"eval_steps_per_second": 3.256,
"step": 291
},
{
"epoch": 1.0309278350515463,
"grad_norm": 3.0971412658691406,
"learning_rate": 4.434682964094729e-05,
"loss": 4.0055,
"step": 300
},
{
"epoch": 1.1168384879725086,
"grad_norm": 3.2513363361358643,
"learning_rate": 4.339190221543163e-05,
"loss": 3.9592,
"step": 325
},
{
"epoch": 1.2027491408934707,
"grad_norm": 4.395771026611328,
"learning_rate": 4.2436974789915967e-05,
"loss": 3.8709,
"step": 350
},
{
"epoch": 1.2886597938144329,
"grad_norm": 2.7217512130737305,
"learning_rate": 4.1482047364400305e-05,
"loss": 3.7554,
"step": 375
},
{
"epoch": 1.3745704467353952,
"grad_norm": 3.703568696975708,
"learning_rate": 4.052711993888464e-05,
"loss": 3.7343,
"step": 400
},
{
"epoch": 1.4604810996563573,
"grad_norm": 2.7263598442077637,
"learning_rate": 3.957219251336899e-05,
"loss": 3.7497,
"step": 425
},
{
"epoch": 1.5463917525773194,
"grad_norm": 2.4919683933258057,
"learning_rate": 3.861726508785333e-05,
"loss": 3.7073,
"step": 450
},
{
"epoch": 1.6323024054982818,
"grad_norm": 2.5988521575927734,
"learning_rate": 3.7662337662337665e-05,
"loss": 3.6325,
"step": 475
},
{
"epoch": 1.718213058419244,
"grad_norm": 3.717288017272949,
"learning_rate": 3.6707410236822004e-05,
"loss": 3.687,
"step": 500
},
{
"epoch": 1.8041237113402062,
"grad_norm": 3.393786668777466,
"learning_rate": 3.575248281130634e-05,
"loss": 3.7349,
"step": 525
},
{
"epoch": 1.8900343642611683,
"grad_norm": 2.5332796573638916,
"learning_rate": 3.479755538579068e-05,
"loss": 3.7308,
"step": 550
},
{
"epoch": 1.9759450171821307,
"grad_norm": 3.4967894554138184,
"learning_rate": 3.384262796027502e-05,
"loss": 3.6033,
"step": 575
},
{
"epoch": 2.0,
"eval_gen_len": 14.7061,
"eval_loss": 3.3814778327941895,
"eval_rouge1": 20.6781,
"eval_rouge2": 5.109,
"eval_rougeL": 17.5025,
"eval_rougeLsum": 17.5956,
"eval_runtime": 11.6963,
"eval_samples_per_second": 24.88,
"eval_steps_per_second": 3.163,
"step": 582
},
{
"epoch": 2.0618556701030926,
"grad_norm": 3.7303099632263184,
"learning_rate": 3.288770053475936e-05,
"loss": 3.4857,
"step": 600
},
{
"epoch": 2.147766323024055,
"grad_norm": 2.58085036277771,
"learning_rate": 3.1932773109243696e-05,
"loss": 3.7377,
"step": 625
},
{
"epoch": 2.2336769759450172,
"grad_norm": 2.9038166999816895,
"learning_rate": 3.097784568372804e-05,
"loss": 3.4969,
"step": 650
},
{
"epoch": 2.319587628865979,
"grad_norm": 1.8798184394836426,
"learning_rate": 3.002291825821238e-05,
"loss": 3.3667,
"step": 675
},
{
"epoch": 2.4054982817869415,
"grad_norm": 2.5839955806732178,
"learning_rate": 2.9067990832696718e-05,
"loss": 3.5371,
"step": 700
},
{
"epoch": 2.491408934707904,
"grad_norm": 14.803485870361328,
"learning_rate": 2.8113063407181056e-05,
"loss": 3.4758,
"step": 725
},
{
"epoch": 2.5773195876288657,
"grad_norm": 2.901104688644409,
"learning_rate": 2.7158135981665394e-05,
"loss": 3.4274,
"step": 750
},
{
"epoch": 2.663230240549828,
"grad_norm": 3.5598862171173096,
"learning_rate": 2.6203208556149733e-05,
"loss": 3.5939,
"step": 775
},
{
"epoch": 2.7491408934707904,
"grad_norm": 2.656578540802002,
"learning_rate": 2.524828113063407e-05,
"loss": 3.5227,
"step": 800
},
{
"epoch": 2.8350515463917527,
"grad_norm": 2.2073974609375,
"learning_rate": 2.4293353705118413e-05,
"loss": 3.5447,
"step": 825
},
{
"epoch": 2.9209621993127146,
"grad_norm": 3.0660665035247803,
"learning_rate": 2.333842627960275e-05,
"loss": 3.4734,
"step": 850
},
{
"epoch": 3.0,
"eval_gen_len": 16.5439,
"eval_loss": 3.232574462890625,
"eval_rouge1": 20.2411,
"eval_rouge2": 5.2598,
"eval_rougeL": 17.2676,
"eval_rougeLsum": 17.4831,
"eval_runtime": 12.7924,
"eval_samples_per_second": 22.748,
"eval_steps_per_second": 2.892,
"step": 873
},
{
"epoch": 3.006872852233677,
"grad_norm": 2.2971296310424805,
"learning_rate": 2.238349885408709e-05,
"loss": 3.4626,
"step": 875
},
{
"epoch": 3.0927835051546393,
"grad_norm": 5.520618438720703,
"learning_rate": 2.1428571428571428e-05,
"loss": 3.4557,
"step": 900
},
{
"epoch": 3.178694158075601,
"grad_norm": 2.2981772422790527,
"learning_rate": 2.047364400305577e-05,
"loss": 3.2812,
"step": 925
},
{
"epoch": 3.2646048109965635,
"grad_norm": 6.0153069496154785,
"learning_rate": 1.951871657754011e-05,
"loss": 3.4321,
"step": 950
},
{
"epoch": 3.350515463917526,
"grad_norm": 2.2888569831848145,
"learning_rate": 1.8563789152024447e-05,
"loss": 3.392,
"step": 975
},
{
"epoch": 3.436426116838488,
"grad_norm": 5.259116172790527,
"learning_rate": 1.7608861726508785e-05,
"loss": 3.4009,
"step": 1000
},
{
"epoch": 3.52233676975945,
"grad_norm": 2.115800380706787,
"learning_rate": 1.6653934300993127e-05,
"loss": 3.3249,
"step": 1025
},
{
"epoch": 3.6082474226804124,
"grad_norm": 2.3146419525146484,
"learning_rate": 1.5699006875477465e-05,
"loss": 3.2829,
"step": 1050
},
{
"epoch": 3.6941580756013748,
"grad_norm": 2.9118130207061768,
"learning_rate": 1.4744079449961804e-05,
"loss": 3.4347,
"step": 1075
},
{
"epoch": 3.7800687285223367,
"grad_norm": 2.7317888736724854,
"learning_rate": 1.3789152024446142e-05,
"loss": 3.2167,
"step": 1100
},
{
"epoch": 3.865979381443299,
"grad_norm": 4.284421920776367,
"learning_rate": 1.2834224598930484e-05,
"loss": 3.431,
"step": 1125
},
{
"epoch": 3.9518900343642613,
"grad_norm": 3.761094808578491,
"learning_rate": 1.1879297173414822e-05,
"loss": 3.4635,
"step": 1150
},
{
"epoch": 4.0,
"eval_gen_len": 15.6284,
"eval_loss": 3.164484739303589,
"eval_rouge1": 20.158,
"eval_rouge2": 4.9421,
"eval_rougeL": 17.0338,
"eval_rougeLsum": 17.2585,
"eval_runtime": 11.6665,
"eval_samples_per_second": 24.943,
"eval_steps_per_second": 3.171,
"step": 1164
},
{
"epoch": 4.037800687285223,
"grad_norm": 2.3253726959228516,
"learning_rate": 1.092436974789916e-05,
"loss": 3.3823,
"step": 1175
},
{
"epoch": 4.123711340206185,
"grad_norm": 5.085910797119141,
"learning_rate": 9.969442322383499e-06,
"loss": 3.2498,
"step": 1200
},
{
"epoch": 4.209621993127148,
"grad_norm": 2.912647008895874,
"learning_rate": 9.014514896867839e-06,
"loss": 3.3191,
"step": 1225
},
{
"epoch": 4.29553264604811,
"grad_norm": 5.910384178161621,
"learning_rate": 8.059587471352178e-06,
"loss": 3.4222,
"step": 1250
},
{
"epoch": 4.381443298969073,
"grad_norm": 10.643930435180664,
"learning_rate": 7.104660045836517e-06,
"loss": 3.4691,
"step": 1275
},
{
"epoch": 4.4673539518900345,
"grad_norm": 2.9152700901031494,
"learning_rate": 6.149732620320856e-06,
"loss": 3.2257,
"step": 1300
},
{
"epoch": 4.553264604810996,
"grad_norm": 2.8727643489837646,
"learning_rate": 5.194805194805195e-06,
"loss": 3.3841,
"step": 1325
},
{
"epoch": 4.639175257731958,
"grad_norm": 8.290576934814453,
"learning_rate": 4.239877769289534e-06,
"loss": 3.1381,
"step": 1350
},
{
"epoch": 4.725085910652921,
"grad_norm": 2.3321030139923096,
"learning_rate": 3.2849503437738733e-06,
"loss": 3.3243,
"step": 1375
},
{
"epoch": 4.810996563573883,
"grad_norm": 3.101409912109375,
"learning_rate": 2.3300229182582125e-06,
"loss": 3.1536,
"step": 1400
},
{
"epoch": 4.896907216494846,
"grad_norm": 4.4823174476623535,
"learning_rate": 1.3750954927425516e-06,
"loss": 3.3531,
"step": 1425
},
{
"epoch": 4.982817869415808,
"grad_norm": 2.517242193222046,
"learning_rate": 4.2016806722689076e-07,
"loss": 3.4086,
"step": 1450
},
{
"epoch": 5.0,
"eval_gen_len": 15.5,
"eval_loss": 3.141986846923828,
"eval_rouge1": 19.8864,
"eval_rouge2": 4.9499,
"eval_rougeL": 16.8946,
"eval_rougeLsum": 17.1002,
"eval_runtime": 12.3635,
"eval_samples_per_second": 23.537,
"eval_steps_per_second": 2.993,
"step": 1455
}
],
"logging_steps": 25,
"max_steps": 1455,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9422115569664.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}