{
"best_metric": 0.5498164296150208,
"best_model_checkpoint": "ck3-localization/checkpoint-36492",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 36492,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.041104899704044726,
"grad_norm": 6.12867546081543,
"learning_rate": 2.7397260273972604e-06,
"loss": 2.3513,
"step": 500
},
{
"epoch": 0.08220979940808945,
"grad_norm": 6.480109691619873,
"learning_rate": 5.479452054794521e-06,
"loss": 1.4689,
"step": 1000
},
{
"epoch": 0.12331469911213416,
"grad_norm": 4.740694046020508,
"learning_rate": 8.219178082191782e-06,
"loss": 1.19,
"step": 1500
},
{
"epoch": 0.1644195988161789,
"grad_norm": 5.501825332641602,
"learning_rate": 1.0958904109589042e-05,
"loss": 1.0318,
"step": 2000
},
{
"epoch": 0.20552449852022361,
"grad_norm": 5.322640419006348,
"learning_rate": 1.3698630136986302e-05,
"loss": 0.9556,
"step": 2500
},
{
"epoch": 0.24662939822426833,
"grad_norm": 5.773024559020996,
"learning_rate": 1.6438356164383563e-05,
"loss": 0.9137,
"step": 3000
},
{
"epoch": 0.28773429792831307,
"grad_norm": 4.7130656242370605,
"learning_rate": 1.9178082191780822e-05,
"loss": 0.8705,
"step": 3500
},
{
"epoch": 0.3288391976323578,
"grad_norm": 3.756901741027832,
"learning_rate": 1.9786858291212475e-05,
"loss": 0.8531,
"step": 4000
},
{
"epoch": 0.3699440973364025,
"grad_norm": 4.033214092254639,
"learning_rate": 1.948237013580172e-05,
"loss": 0.8138,
"step": 4500
},
{
"epoch": 0.41104899704044723,
"grad_norm": 4.564835071563721,
"learning_rate": 1.9177881980390966e-05,
"loss": 0.7946,
"step": 5000
},
{
"epoch": 0.45215389674449197,
"grad_norm": 4.850255012512207,
"learning_rate": 1.887339382498021e-05,
"loss": 0.7833,
"step": 5500
},
{
"epoch": 0.49325879644853665,
"grad_norm": 4.716800689697266,
"learning_rate": 1.8568905669569456e-05,
"loss": 0.7662,
"step": 6000
},
{
"epoch": 0.5343636961525814,
"grad_norm": 4.0613274574279785,
"learning_rate": 1.82644175141587e-05,
"loss": 0.7668,
"step": 6500
},
{
"epoch": 0.5754685958566261,
"grad_norm": 4.6606831550598145,
"learning_rate": 1.7959929358747943e-05,
"loss": 0.7518,
"step": 7000
},
{
"epoch": 0.6165734955606709,
"grad_norm": 4.745144844055176,
"learning_rate": 1.765544120333719e-05,
"loss": 0.7477,
"step": 7500
},
{
"epoch": 0.6576783952647156,
"grad_norm": 4.414773464202881,
"learning_rate": 1.7350953047926437e-05,
"loss": 0.751,
"step": 8000
},
{
"epoch": 0.6987832949687602,
"grad_norm": 4.531556606292725,
"learning_rate": 1.704646489251568e-05,
"loss": 0.7424,
"step": 8500
},
{
"epoch": 0.739888194672805,
"grad_norm": 4.841054439544678,
"learning_rate": 1.6741976737104928e-05,
"loss": 0.7328,
"step": 9000
},
{
"epoch": 0.7809930943768497,
"grad_norm": 4.302633762359619,
"learning_rate": 1.6437488581694175e-05,
"loss": 0.725,
"step": 9500
},
{
"epoch": 0.8220979940808945,
"grad_norm": 3.530836343765259,
"learning_rate": 1.6133000426283418e-05,
"loss": 0.7072,
"step": 10000
},
{
"epoch": 0.8632028937849392,
"grad_norm": 4.2016191482543945,
"learning_rate": 1.5828512270872665e-05,
"loss": 0.6895,
"step": 10500
},
{
"epoch": 0.9043077934889839,
"grad_norm": 3.463226079940796,
"learning_rate": 1.5524024115461912e-05,
"loss": 0.7093,
"step": 11000
},
{
"epoch": 0.9454126931930286,
"grad_norm": 3.3261220455169678,
"learning_rate": 1.5219535960051155e-05,
"loss": 0.6941,
"step": 11500
},
{
"epoch": 0.9865175928970733,
"grad_norm": 3.778331756591797,
"learning_rate": 1.49150478046404e-05,
"loss": 0.7008,
"step": 12000
},
{
"epoch": 1.0,
"eval_gen_len": 33.8514,
"eval_loss": 0.6079365611076355,
"eval_rouge1": 31.9733,
"eval_rouge2": 27.2686,
"eval_rougeL": 31.3526,
"eval_rougeLsum": 31.4543,
"eval_runtime": 1591.2259,
"eval_samples_per_second": 13.59,
"eval_steps_per_second": 0.425,
"step": 12164
},
{
"epoch": 1.0276224926011182,
"grad_norm": 3.7499120235443115,
"learning_rate": 1.4610559649229646e-05,
"loss": 0.6284,
"step": 12500
},
{
"epoch": 1.0687273923051628,
"grad_norm": 3.52150821685791,
"learning_rate": 1.4306071493818891e-05,
"loss": 0.6337,
"step": 13000
},
{
"epoch": 1.1098322920092074,
"grad_norm": 4.520621299743652,
"learning_rate": 1.4001583338408138e-05,
"loss": 0.6292,
"step": 13500
},
{
"epoch": 1.1509371917132523,
"grad_norm": 4.275709629058838,
"learning_rate": 1.3697095182997382e-05,
"loss": 0.6195,
"step": 14000
},
{
"epoch": 1.192042091417297,
"grad_norm": 3.500743865966797,
"learning_rate": 1.3392607027586628e-05,
"loss": 0.6168,
"step": 14500
},
{
"epoch": 1.2331469911213417,
"grad_norm": 2.587315320968628,
"learning_rate": 1.3088118872175875e-05,
"loss": 0.6169,
"step": 15000
},
{
"epoch": 1.2742518908253864,
"grad_norm": 3.4539663791656494,
"learning_rate": 1.2783630716765119e-05,
"loss": 0.619,
"step": 15500
},
{
"epoch": 1.3153567905294312,
"grad_norm": 3.759758710861206,
"learning_rate": 1.2479142561354364e-05,
"loss": 0.612,
"step": 16000
},
{
"epoch": 1.3564616902334758,
"grad_norm": 2.6160857677459717,
"learning_rate": 1.217465440594361e-05,
"loss": 0.6179,
"step": 16500
},
{
"epoch": 1.3975665899375205,
"grad_norm": 3.3576176166534424,
"learning_rate": 1.1870166250532855e-05,
"loss": 0.6203,
"step": 17000
},
{
"epoch": 1.4386714896415653,
"grad_norm": 2.6305747032165527,
"learning_rate": 1.1565678095122101e-05,
"loss": 0.609,
"step": 17500
},
{
"epoch": 1.47977638934561,
"grad_norm": 3.3986129760742188,
"learning_rate": 1.1261189939711345e-05,
"loss": 0.6076,
"step": 18000
},
{
"epoch": 1.5208812890496546,
"grad_norm": 5.06044864654541,
"learning_rate": 1.0956701784300592e-05,
"loss": 0.6187,
"step": 18500
},
{
"epoch": 1.5619861887536994,
"grad_norm": 4.3670549392700195,
"learning_rate": 1.0652213628889839e-05,
"loss": 0.6063,
"step": 19000
},
{
"epoch": 1.6030910884577443,
"grad_norm": 8.928343772888184,
"learning_rate": 1.0347725473479082e-05,
"loss": 0.5948,
"step": 19500
},
{
"epoch": 1.644195988161789,
"grad_norm": 3.5316221714019775,
"learning_rate": 1.004323731806833e-05,
"loss": 0.6108,
"step": 20000
},
{
"epoch": 1.6853008878658335,
"grad_norm": 3.8091230392456055,
"learning_rate": 9.738749162657574e-06,
"loss": 0.6024,
"step": 20500
},
{
"epoch": 1.7264057875698784,
"grad_norm": 2.5065314769744873,
"learning_rate": 9.43426100724682e-06,
"loss": 0.595,
"step": 21000
},
{
"epoch": 1.767510687273923,
"grad_norm": 4.371850490570068,
"learning_rate": 9.129772851836063e-06,
"loss": 0.6063,
"step": 21500
},
{
"epoch": 1.8086155869779676,
"grad_norm": 3.6098592281341553,
"learning_rate": 8.82528469642531e-06,
"loss": 0.6158,
"step": 22000
},
{
"epoch": 1.8497204866820125,
"grad_norm": 4.037623405456543,
"learning_rate": 8.520796541014555e-06,
"loss": 0.6045,
"step": 22500
},
{
"epoch": 1.8908253863860573,
"grad_norm": 4.389125823974609,
"learning_rate": 8.2163083856038e-06,
"loss": 0.5876,
"step": 23000
},
{
"epoch": 1.931930286090102,
"grad_norm": 4.564250946044922,
"learning_rate": 7.911820230193046e-06,
"loss": 0.5857,
"step": 23500
},
{
"epoch": 1.9730351857941466,
"grad_norm": 4.7007365226745605,
"learning_rate": 7.607332074782292e-06,
"loss": 0.6188,
"step": 24000
},
{
"epoch": 2.0,
"eval_gen_len": 34.1084,
"eval_loss": 0.5606569051742554,
"eval_rouge1": 32.2894,
"eval_rouge2": 27.8387,
"eval_rougeL": 31.7251,
"eval_rougeLsum": 31.8152,
"eval_runtime": 1524.3799,
"eval_samples_per_second": 14.185,
"eval_steps_per_second": 0.443,
"step": 24328
},
{
"epoch": 2.0141400854981915,
"grad_norm": 1.9762619733810425,
"learning_rate": 7.302843919371537e-06,
"loss": 0.5736,
"step": 24500
},
{
"epoch": 2.0552449852022363,
"grad_norm": 4.161807060241699,
"learning_rate": 6.998355763960782e-06,
"loss": 0.5458,
"step": 25000
},
{
"epoch": 2.0963498849062807,
"grad_norm": 2.221466541290283,
"learning_rate": 6.693867608550027e-06,
"loss": 0.5468,
"step": 25500
},
{
"epoch": 2.1374547846103256,
"grad_norm": 3.642603874206543,
"learning_rate": 6.3893794531392735e-06,
"loss": 0.5556,
"step": 26000
},
{
"epoch": 2.1785596843143704,
"grad_norm": 3.3420753479003906,
"learning_rate": 6.084891297728519e-06,
"loss": 0.5451,
"step": 26500
},
{
"epoch": 2.219664584018415,
"grad_norm": 3.2656307220458984,
"learning_rate": 5.780403142317764e-06,
"loss": 0.5303,
"step": 27000
},
{
"epoch": 2.2607694837224597,
"grad_norm": 3.4211599826812744,
"learning_rate": 5.475914986907009e-06,
"loss": 0.5496,
"step": 27500
},
{
"epoch": 2.3018743834265045,
"grad_norm": 4.965065956115723,
"learning_rate": 5.171426831496256e-06,
"loss": 0.5392,
"step": 28000
},
{
"epoch": 2.342979283130549,
"grad_norm": 2.4368653297424316,
"learning_rate": 4.866938676085501e-06,
"loss": 0.5443,
"step": 28500
},
{
"epoch": 2.384084182834594,
"grad_norm": 2.6363422870635986,
"learning_rate": 4.562450520674746e-06,
"loss": 0.5418,
"step": 29000
},
{
"epoch": 2.4251890825386386,
"grad_norm": 4.417261123657227,
"learning_rate": 4.257962365263992e-06,
"loss": 0.5377,
"step": 29500
},
{
"epoch": 2.4662939822426835,
"grad_norm": 2.850874900817871,
"learning_rate": 3.953474209853237e-06,
"loss": 0.5406,
"step": 30000
},
{
"epoch": 2.507398881946728,
"grad_norm": 3.0470714569091797,
"learning_rate": 3.6489860544424825e-06,
"loss": 0.5277,
"step": 30500
},
{
"epoch": 2.5485037816507727,
"grad_norm": 3.77066707611084,
"learning_rate": 3.3444978990317278e-06,
"loss": 0.5404,
"step": 31000
},
{
"epoch": 2.5896086813548176,
"grad_norm": 2.103886127471924,
"learning_rate": 3.0400097436209734e-06,
"loss": 0.5511,
"step": 31500
},
{
"epoch": 2.6307135810588624,
"grad_norm": 2.784128427505493,
"learning_rate": 2.7355215882102186e-06,
"loss": 0.5466,
"step": 32000
},
{
"epoch": 2.671818480762907,
"grad_norm": 2.9157445430755615,
"learning_rate": 2.4310334327994643e-06,
"loss": 0.5502,
"step": 32500
},
{
"epoch": 2.7129233804669517,
"grad_norm": 3.7201719284057617,
"learning_rate": 2.1265452773887095e-06,
"loss": 0.5435,
"step": 33000
},
{
"epoch": 2.7540282801709965,
"grad_norm": 3.0712995529174805,
"learning_rate": 1.8220571219779551e-06,
"loss": 0.5442,
"step": 33500
},
{
"epoch": 2.795133179875041,
"grad_norm": 3.5120229721069336,
"learning_rate": 1.5175689665672005e-06,
"loss": 0.5392,
"step": 34000
},
{
"epoch": 2.836238079579086,
"grad_norm": 3.09930682182312,
"learning_rate": 1.2130808111564462e-06,
"loss": 0.5355,
"step": 34500
},
{
"epoch": 2.8773429792831307,
"grad_norm": 2.3439698219299316,
"learning_rate": 9.085926557456916e-07,
"loss": 0.5531,
"step": 35000
},
{
"epoch": 2.918447878987175,
"grad_norm": 2.935701608657837,
"learning_rate": 6.04104500334937e-07,
"loss": 0.5355,
"step": 35500
},
{
"epoch": 2.95955277869122,
"grad_norm": 3.3096513748168945,
"learning_rate": 2.9961634492418246e-07,
"loss": 0.5528,
"step": 36000
},
{
"epoch": 3.0,
"eval_gen_len": 33.8306,
"eval_loss": 0.5498164296150208,
"eval_rouge1": 32.6478,
"eval_rouge2": 28.3022,
"eval_rougeL": 32.1001,
"eval_rougeLsum": 32.1838,
"eval_runtime": 1535.5456,
"eval_samples_per_second": 14.082,
"eval_steps_per_second": 0.44,
"step": 36492
}
],
"logging_steps": 500,
"max_steps": 36492,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2577075189776384e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}