AI_Checkpoint / checkpoint-2200 /trainer_state.json
ntnmmm's picture
Upload 12 files
4071f9d verified
{
"best_global_step": 600,
"best_metric": 72.28,
"best_model_checkpoint": "/content/drive/MyDrive/Summarization/checkpoints/cur_cp/checkpoint-600",
"epoch": 8.835341365461847,
"eval_steps": 100,
"global_step": 2200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.20080321285140562,
"grad_norm": 10.723053932189941,
"learning_rate": 4.995027130745321e-05,
"loss": 0.8179,
"step": 50
},
{
"epoch": 0.40160642570281124,
"grad_norm": 13.537382125854492,
"learning_rate": 4.980128306524183e-05,
"loss": 0.7085,
"step": 100
},
{
"epoch": 0.40160642570281124,
"eval_f1": 70.31,
"eval_gen_len": 42.92338709677419,
"eval_loss": 0.6963455080986023,
"eval_p": 74.73,
"eval_r": 70.54,
"eval_runtime": 46.3873,
"eval_samples_per_second": 10.693,
"eval_steps_per_second": 0.668,
"step": 100
},
{
"epoch": 0.6024096385542169,
"grad_norm": 3.749103307723999,
"learning_rate": 4.9553627992605066e-05,
"loss": 0.6667,
"step": 150
},
{
"epoch": 0.8032128514056225,
"grad_norm": 7.421478748321533,
"learning_rate": 4.9208291334580104e-05,
"loss": 0.5991,
"step": 200
},
{
"epoch": 0.8032128514056225,
"eval_f1": 68.09,
"eval_gen_len": 40.17741935483871,
"eval_loss": 0.7214160561561584,
"eval_p": 75.9,
"eval_r": 66.2,
"eval_runtime": 45.6263,
"eval_samples_per_second": 10.871,
"eval_steps_per_second": 0.679,
"step": 200
},
{
"epoch": 1.0040160642570282,
"grad_norm": 9.242626190185547,
"learning_rate": 4.87666469424063e-05,
"loss": 0.752,
"step": 250
},
{
"epoch": 1.2048192771084336,
"grad_norm": 4.659054756164551,
"learning_rate": 4.8230451807939135e-05,
"loss": 0.4472,
"step": 300
},
{
"epoch": 1.2048192771084336,
"eval_f1": 67.64,
"eval_gen_len": 39.01411290322581,
"eval_loss": 0.7062053084373474,
"eval_p": 75.6,
"eval_r": 65.19,
"eval_runtime": 42.4185,
"eval_samples_per_second": 11.693,
"eval_steps_per_second": 0.731,
"step": 300
},
{
"epoch": 1.4056224899598393,
"grad_norm": 11.880289077758789,
"learning_rate": 4.760183907381757e-05,
"loss": 0.5069,
"step": 350
},
{
"epoch": 1.606425702811245,
"grad_norm": 9.056407928466797,
"learning_rate": 4.6883309547192476e-05,
"loss": 0.519,
"step": 400
},
{
"epoch": 1.606425702811245,
"eval_f1": 71.67,
"eval_gen_len": 46.93346774193548,
"eval_loss": 0.7584968209266663,
"eval_p": 73.71,
"eval_r": 74.1,
"eval_runtime": 55.9551,
"eval_samples_per_second": 8.864,
"eval_steps_per_second": 0.554,
"step": 400
},
{
"epoch": 1.8072289156626506,
"grad_norm": 7.793162822723389,
"learning_rate": 4.607772175077711e-05,
"loss": 0.4939,
"step": 450
},
{
"epoch": 2.0080321285140563,
"grad_norm": 7.212235927581787,
"learning_rate": 4.518828055079925e-05,
"loss": 0.4646,
"step": 500
},
{
"epoch": 2.0080321285140563,
"eval_f1": 69.01,
"eval_gen_len": 42.16129032258065,
"eval_loss": 0.7297435998916626,
"eval_p": 73.93,
"eval_r": 68.57,
"eval_runtime": 48.8895,
"eval_samples_per_second": 10.145,
"eval_steps_per_second": 0.634,
"step": 500
},
{
"epoch": 2.208835341365462,
"grad_norm": 5.176413536071777,
"learning_rate": 4.421852440709666e-05,
"loss": 0.2646,
"step": 550
},
{
"epoch": 2.4096385542168672,
"grad_norm": 19.735258102416992,
"learning_rate": 4.3172311296078595e-05,
"loss": 0.3214,
"step": 600
},
{
"epoch": 2.4096385542168672,
"eval_f1": 72.28,
"eval_gen_len": 46.564516129032256,
"eval_loss": 0.788872241973877,
"eval_p": 74.17,
"eval_r": 74.3,
"eval_runtime": 54.3998,
"eval_samples_per_second": 9.118,
"eval_steps_per_second": 0.57,
"step": 600
},
{
"epoch": 2.610441767068273,
"grad_norm": 9.665975570678711,
"learning_rate": 4.205380336255594e-05,
"loss": 0.4161,
"step": 650
},
{
"epoch": 2.8112449799196786,
"grad_norm": 8.65356731414795,
"learning_rate": 4.08674503614997e-05,
"loss": 0.2973,
"step": 700
},
{
"epoch": 2.8112449799196786,
"eval_f1": 68.05,
"eval_gen_len": 41.256048387096776,
"eval_loss": 0.770168662071228,
"eval_p": 74.23,
"eval_r": 66.93,
"eval_runtime": 44.2971,
"eval_samples_per_second": 11.197,
"eval_steps_per_second": 0.7,
"step": 700
},
{
"epoch": 3.0120481927710845,
"grad_norm": 4.7805867195129395,
"learning_rate": 3.961797195560118e-05,
"loss": 0.3477,
"step": 750
},
{
"epoch": 3.21285140562249,
"grad_norm": 3.864029884338379,
"learning_rate": 3.8310338939059644e-05,
"loss": 0.2447,
"step": 800
},
{
"epoch": 3.21285140562249,
"eval_f1": 69.3,
"eval_gen_len": 42.681451612903224,
"eval_loss": 0.8243445158004761,
"eval_p": 74.09,
"eval_r": 68.98,
"eval_runtime": 47.774,
"eval_samples_per_second": 10.382,
"eval_steps_per_second": 0.649,
"step": 800
},
{
"epoch": 3.4136546184738954,
"grad_norm": 1.5391823053359985,
"learning_rate": 3.694975346229458e-05,
"loss": 0.2272,
"step": 850
},
{
"epoch": 3.6144578313253013,
"grad_norm": 12.502788543701172,
"learning_rate": 3.55416283362546e-05,
"loss": 0.2136,
"step": 900
},
{
"epoch": 3.6144578313253013,
"eval_f1": 72.18,
"eval_gen_len": 45.850806451612904,
"eval_loss": 0.8359085917472839,
"eval_p": 74.24,
"eval_r": 73.77,
"eval_runtime": 51.4887,
"eval_samples_per_second": 9.633,
"eval_steps_per_second": 0.602,
"step": 900
},
{
"epoch": 3.8152610441767068,
"grad_norm": 6.718369960784912,
"learning_rate": 3.409156549865654e-05,
"loss": 0.1959,
"step": 950
},
{
"epoch": 4.016064257028113,
"grad_norm": 32.431217193603516,
"learning_rate": 3.260533372782234e-05,
"loss": 0.2571,
"step": 1000
},
{
"epoch": 4.016064257028113,
"eval_f1": 69.38,
"eval_gen_len": 42.243951612903224,
"eval_loss": 0.8177807927131653,
"eval_p": 73.87,
"eval_r": 69.22,
"eval_runtime": 48.2955,
"eval_samples_per_second": 10.27,
"eval_steps_per_second": 0.642,
"step": 1000
},
{
"epoch": 4.216867469879518,
"grad_norm": 5.5347065925598145,
"learning_rate": 3.10888456927748e-05,
"loss": 0.1427,
"step": 1050
},
{
"epoch": 4.417670682730924,
"grad_norm": 4.728146076202393,
"learning_rate": 2.9548134430893604e-05,
"loss": 0.1356,
"step": 1100
},
{
"epoch": 4.417670682730924,
"eval_f1": 70.43,
"eval_gen_len": 44.792338709677416,
"eval_loss": 0.8745766878128052,
"eval_p": 73.6,
"eval_r": 71.48,
"eval_runtime": 49.7707,
"eval_samples_per_second": 9.966,
"eval_steps_per_second": 0.623,
"step": 1100
},
{
"epoch": 4.618473895582329,
"grad_norm": 4.265488147735596,
"learning_rate": 2.7989329346710375e-05,
"loss": 0.1644,
"step": 1150
},
{
"epoch": 4.8192771084337345,
"grad_norm": 5.412672996520996,
"learning_rate": 2.6418631827326857e-05,
"loss": 0.1564,
"step": 1200
},
{
"epoch": 4.8192771084337345,
"eval_f1": 70.56,
"eval_gen_len": 45.20967741935484,
"eval_loss": 0.8563244342803955,
"eval_p": 73.64,
"eval_r": 71.76,
"eval_runtime": 50.2355,
"eval_samples_per_second": 9.873,
"eval_steps_per_second": 0.617,
"step": 1200
},
{
"epoch": 5.020080321285141,
"grad_norm": 5.465022087097168,
"learning_rate": 2.484229057146507e-05,
"loss": 0.1396,
"step": 1250
},
{
"epoch": 5.220883534136546,
"grad_norm": 4.947805881500244,
"learning_rate": 2.3266576730297956e-05,
"loss": 0.0937,
"step": 1300
},
{
"epoch": 5.220883534136546,
"eval_f1": 70.17,
"eval_gen_len": 42.51411290322581,
"eval_loss": 0.8945086002349854,
"eval_p": 74.73,
"eval_r": 69.95,
"eval_runtime": 47.3409,
"eval_samples_per_second": 10.477,
"eval_steps_per_second": 0.655,
"step": 1300
},
{
"epoch": 5.421686746987952,
"grad_norm": 2.938572883605957,
"learning_rate": 2.1697758958957448e-05,
"loss": 0.0842,
"step": 1350
},
{
"epoch": 5.622489959839357,
"grad_norm": 7.102436542510986,
"learning_rate": 2.014207847797256e-05,
"loss": 0.1031,
"step": 1400
},
{
"epoch": 5.622489959839357,
"eval_f1": 71.86,
"eval_gen_len": 47.534274193548384,
"eval_loss": 0.8877253532409668,
"eval_p": 73.18,
"eval_r": 74.71,
"eval_runtime": 53.5518,
"eval_samples_per_second": 9.262,
"eval_steps_per_second": 0.579,
"step": 1400
},
{
"epoch": 5.823293172690763,
"grad_norm": 2.2575900554656982,
"learning_rate": 1.8605724243850502e-05,
"loss": 0.1126,
"step": 1450
},
{
"epoch": 6.024096385542169,
"grad_norm": 3.7854058742523193,
"learning_rate": 1.70948083275794e-05,
"loss": 0.095,
"step": 1500
},
{
"epoch": 6.024096385542169,
"eval_f1": 70.87,
"eval_gen_len": 45.21774193548387,
"eval_loss": 0.9038512706756592,
"eval_p": 73.83,
"eval_r": 72.16,
"eval_runtime": 52.1132,
"eval_samples_per_second": 9.518,
"eval_steps_per_second": 0.595,
"step": 1500
},
{
"epoch": 6.224899598393574,
"grad_norm": 3.2422773838043213,
"learning_rate": 1.561534159900441e-05,
"loss": 0.0689,
"step": 1550
},
{
"epoch": 6.42570281124498,
"grad_norm": 6.3022990226745605,
"learning_rate": 1.4173209813811788e-05,
"loss": 0.0764,
"step": 1600
},
{
"epoch": 6.42570281124498,
"eval_f1": 71.12,
"eval_gen_len": 46.310483870967744,
"eval_loss": 0.9024052619934082,
"eval_p": 73.55,
"eval_r": 73.2,
"eval_runtime": 51.8511,
"eval_samples_per_second": 9.566,
"eval_steps_per_second": 0.598,
"step": 1600
},
{
"epoch": 6.626506024096385,
"grad_norm": 3.3996074199676514,
"learning_rate": 1.277415019825417e-05,
"loss": 0.0639,
"step": 1650
},
{
"epoch": 6.827309236947791,
"grad_norm": 1.7633498907089233,
"learning_rate": 1.1423728624769695e-05,
"loss": 0.0499,
"step": 1700
},
{
"epoch": 6.827309236947791,
"eval_f1": 71.1,
"eval_gen_len": 45.52620967741935,
"eval_loss": 0.9023270010948181,
"eval_p": 73.73,
"eval_r": 72.63,
"eval_runtime": 48.5153,
"eval_samples_per_second": 10.224,
"eval_steps_per_second": 0.639,
"step": 1700
},
{
"epoch": 7.028112449799197,
"grad_norm": 0.14504443109035492,
"learning_rate": 1.0127317469297277e-05,
"loss": 0.0707,
"step": 1750
},
{
"epoch": 7.228915662650603,
"grad_norm": 0.9350752830505371,
"learning_rate": 8.890074238378074e-06,
"loss": 0.0422,
"step": 1800
},
{
"epoch": 7.228915662650603,
"eval_f1": 71.74,
"eval_gen_len": 46.16935483870968,
"eval_loss": 0.9106847643852234,
"eval_p": 73.84,
"eval_r": 73.54,
"eval_runtime": 53.7841,
"eval_samples_per_second": 9.222,
"eval_steps_per_second": 0.576,
"step": 1800
},
{
"epoch": 7.429718875502008,
"grad_norm": 2.7118020057678223,
"learning_rate": 7.71692105107098e-06,
"loss": 0.0442,
"step": 1850
},
{
"epoch": 7.6305220883534135,
"grad_norm": 3.540323495864868,
"learning_rate": 6.612525057308949e-06,
"loss": 0.0469,
"step": 1900
},
{
"epoch": 7.6305220883534135,
"eval_f1": 71.51,
"eval_gen_len": 45.40120967741935,
"eval_loss": 0.9106153845787048,
"eval_p": 74.15,
"eval_r": 73.09,
"eval_runtime": 51.3332,
"eval_samples_per_second": 9.662,
"eval_steps_per_second": 0.604,
"step": 1900
},
{
"epoch": 7.831325301204819,
"grad_norm": 1.7734179496765137,
"learning_rate": 5.581279870597867e-06,
"loss": 0.0368,
"step": 1950
},
{
"epoch": 8.032128514056225,
"grad_norm": 2.24417781829834,
"learning_rate": 4.627288088924156e-06,
"loss": 0.0379,
"step": 2000
},
{
"epoch": 8.032128514056225,
"eval_f1": 70.84,
"eval_gen_len": 45.127016129032256,
"eval_loss": 0.9157423973083496,
"eval_p": 73.79,
"eval_r": 72.15,
"eval_runtime": 51.0735,
"eval_samples_per_second": 9.711,
"eval_steps_per_second": 0.607,
"step": 2000
},
{
"epoch": 8.23293172690763,
"grad_norm": 1.954770565032959,
"learning_rate": 3.754344973408064e-06,
"loss": 0.0335,
"step": 2050
},
{
"epoch": 8.433734939759036,
"grad_norm": 1.5767686367034912,
"learning_rate": 2.9659233496337786e-06,
"loss": 0.0301,
"step": 2100
},
{
"epoch": 8.433734939759036,
"eval_f1": 71.39,
"eval_gen_len": 46.381048387096776,
"eval_loss": 0.9240424633026123,
"eval_p": 73.77,
"eval_r": 73.37,
"eval_runtime": 53.2399,
"eval_samples_per_second": 9.316,
"eval_steps_per_second": 0.582,
"step": 2100
},
{
"epoch": 8.634538152610443,
"grad_norm": 0.2601850628852844,
"learning_rate": 2.265159791723373e-06,
"loss": 0.0358,
"step": 2150
},
{
"epoch": 8.835341365461847,
"grad_norm": 5.757479667663574,
"learning_rate": 1.6548421441183875e-06,
"loss": 0.0254,
"step": 2200
},
{
"epoch": 8.835341365461847,
"eval_f1": 71.82,
"eval_gen_len": 46.47782258064516,
"eval_loss": 0.9292237162590027,
"eval_p": 73.8,
"eval_r": 74.04,
"eval_runtime": 54.2877,
"eval_samples_per_second": 9.137,
"eval_steps_per_second": 0.571,
"step": 2200
}
],
"logging_steps": 50,
"max_steps": 2490,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 490953080733696.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}