SFT-mBERT / trainer_state.json
HannahGoossens's picture
Upload 10 files
3dcdc1c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 4290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.7125847339630127,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.2965,
"step": 143
},
{
"epoch": 1.0,
"eval_accuracy": 0.9559476075153481,
"eval_f1": 0.0,
"eval_loss": 0.27843931317329407,
"eval_precision": 0.0,
"eval_recall": 0.0,
"eval_runtime": 24.1849,
"eval_samples_per_second": 94.15,
"eval_steps_per_second": 5.913,
"step": 143
},
{
"epoch": 2.0,
"grad_norm": 3.002716064453125,
"learning_rate": 4.666666666666667e-05,
"loss": 0.2066,
"step": 286
},
{
"epoch": 2.0,
"eval_accuracy": 0.960383493145901,
"eval_f1": 0.06783493499152063,
"eval_loss": 0.20140178501605988,
"eval_precision": 0.1901743264659271,
"eval_recall": 0.0412796697626419,
"eval_runtime": 24.1804,
"eval_samples_per_second": 94.167,
"eval_steps_per_second": 5.914,
"step": 286
},
{
"epoch": 3.0,
"grad_norm": 1.8616505861282349,
"learning_rate": 4.5e-05,
"loss": 0.1505,
"step": 429
},
{
"epoch": 3.0,
"eval_accuracy": 0.968307364681969,
"eval_f1": 0.2345959595959596,
"eval_loss": 0.14462772011756897,
"eval_precision": 0.260991712319146,
"eval_recall": 0.21304896227496847,
"eval_runtime": 24.263,
"eval_samples_per_second": 93.846,
"eval_steps_per_second": 5.894,
"step": 429
},
{
"epoch": 4.0,
"grad_norm": 1.670642375946045,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.1091,
"step": 572
},
{
"epoch": 4.0,
"eval_accuracy": 0.9783935879216193,
"eval_f1": 0.3906774174563004,
"eval_loss": 0.09360472857952118,
"eval_precision": 0.40133010882708586,
"eval_recall": 0.38057562206169016,
"eval_runtime": 24.3034,
"eval_samples_per_second": 93.691,
"eval_steps_per_second": 5.884,
"step": 572
},
{
"epoch": 5.0,
"grad_norm": 3.672482967376709,
"learning_rate": 4.166666666666667e-05,
"loss": 0.0836,
"step": 715
},
{
"epoch": 5.0,
"eval_accuracy": 0.9833169648393345,
"eval_f1": 0.5013577732518669,
"eval_loss": 0.07448223978281021,
"eval_precision": 0.4948073701842546,
"eval_recall": 0.5080839353285174,
"eval_runtime": 24.3059,
"eval_samples_per_second": 93.681,
"eval_steps_per_second": 5.883,
"step": 715
},
{
"epoch": 6.0,
"grad_norm": 1.8329507112503052,
"learning_rate": 4e-05,
"loss": 0.0659,
"step": 858
},
{
"epoch": 6.0,
"eval_accuracy": 0.9875376130887135,
"eval_f1": 0.6012593016599884,
"eval_loss": 0.05498756095767021,
"eval_precision": 0.600297176820208,
"eval_recall": 0.602224515537209,
"eval_runtime": 24.3002,
"eval_samples_per_second": 93.703,
"eval_steps_per_second": 5.885,
"step": 858
},
{
"epoch": 7.0,
"grad_norm": 0.9239581823348999,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.0527,
"step": 1001
},
{
"epoch": 7.0,
"eval_accuracy": 0.9897184951628606,
"eval_f1": 0.6688816308142809,
"eval_loss": 0.04328082129359245,
"eval_precision": 0.6571871196193427,
"eval_recall": 0.6809998853342507,
"eval_runtime": 24.2837,
"eval_samples_per_second": 93.767,
"eval_steps_per_second": 5.889,
"step": 1001
},
{
"epoch": 8.0,
"grad_norm": 0.2763989269733429,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0431,
"step": 1144
},
{
"epoch": 8.0,
"eval_accuracy": 0.9925051778131597,
"eval_f1": 0.735930735930736,
"eval_loss": 0.03102906234562397,
"eval_precision": 0.7311827956989247,
"eval_recall": 0.7407407407407407,
"eval_runtime": 24.3205,
"eval_samples_per_second": 93.625,
"eval_steps_per_second": 5.88,
"step": 1144
},
{
"epoch": 9.0,
"grad_norm": 0.40178802609443665,
"learning_rate": 3.5e-05,
"loss": 0.0335,
"step": 1287
},
{
"epoch": 9.0,
"eval_accuracy": 0.9941914415345428,
"eval_f1": 0.794018817204301,
"eval_loss": 0.024718057364225388,
"eval_precision": 0.7760262725779967,
"eval_recall": 0.8128654970760234,
"eval_runtime": 24.3191,
"eval_samples_per_second": 93.63,
"eval_steps_per_second": 5.88,
"step": 1287
},
{
"epoch": 10.0,
"grad_norm": 1.2496235370635986,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0268,
"step": 1430
},
{
"epoch": 10.0,
"eval_accuracy": 0.9952833079847366,
"eval_f1": 0.833757277711831,
"eval_loss": 0.019720738753676414,
"eval_precision": 0.8221850613154961,
"eval_recall": 0.8456599013874556,
"eval_runtime": 24.5248,
"eval_samples_per_second": 92.845,
"eval_steps_per_second": 5.831,
"step": 1430
},
{
"epoch": 11.0,
"grad_norm": 1.4848353862762451,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.0229,
"step": 1573
},
{
"epoch": 11.0,
"eval_accuracy": 0.996325284975718,
"eval_f1": 0.8729726664398321,
"eval_loss": 0.015156798996031284,
"eval_precision": 0.8635700661954449,
"eval_recall": 0.8825822726751519,
"eval_runtime": 24.3164,
"eval_samples_per_second": 93.64,
"eval_steps_per_second": 5.881,
"step": 1573
},
{
"epoch": 12.0,
"grad_norm": 0.7821120023727417,
"learning_rate": 3e-05,
"loss": 0.0174,
"step": 1716
},
{
"epoch": 12.0,
"eval_accuracy": 0.9967543343249458,
"eval_f1": 0.898472596585804,
"eval_loss": 0.01352603081613779,
"eval_precision": 0.8803785627819962,
"eval_recall": 0.9173259947253756,
"eval_runtime": 24.33,
"eval_samples_per_second": 93.588,
"eval_steps_per_second": 5.878,
"step": 1716
},
{
"epoch": 13.0,
"grad_norm": 5.437506675720215,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.0156,
"step": 1859
},
{
"epoch": 13.0,
"eval_accuracy": 0.9975953280659567,
"eval_f1": 0.9254986645450929,
"eval_loss": 0.009106193669140339,
"eval_precision": 0.9174177557458315,
"eval_recall": 0.9337231968810916,
"eval_runtime": 24.313,
"eval_samples_per_second": 93.654,
"eval_steps_per_second": 5.882,
"step": 1859
},
{
"epoch": 14.0,
"grad_norm": 0.3559066355228424,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.0126,
"step": 2002
},
{
"epoch": 14.0,
"eval_accuracy": 0.9979673608903701,
"eval_f1": 0.9396295449348272,
"eval_loss": 0.007928353734314442,
"eval_precision": 0.936951316839585,
"eval_recall": 0.942323128081642,
"eval_runtime": 24.3095,
"eval_samples_per_second": 93.667,
"eval_steps_per_second": 5.882,
"step": 2002
},
{
"epoch": 15.0,
"grad_norm": 0.4652678966522217,
"learning_rate": 2.5e-05,
"loss": 0.0108,
"step": 2145
},
{
"epoch": 15.0,
"eval_accuracy": 0.998198277815868,
"eval_f1": 0.9444254640701515,
"eval_loss": 0.006473761051893234,
"eval_precision": 0.938016061531501,
"eval_recall": 0.9509230592821925,
"eval_runtime": 24.3088,
"eval_samples_per_second": 93.67,
"eval_steps_per_second": 5.883,
"step": 2145
},
{
"epoch": 16.0,
"grad_norm": 1.0989309549331665,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.0088,
"step": 2288
},
{
"epoch": 16.0,
"eval_accuracy": 0.998377879869033,
"eval_f1": 0.9585400627316796,
"eval_loss": 0.006167967803776264,
"eval_precision": 0.9534830950760155,
"eval_recall": 0.963650957459007,
"eval_runtime": 24.3119,
"eval_samples_per_second": 93.658,
"eval_steps_per_second": 5.882,
"step": 2288
},
{
"epoch": 17.0,
"grad_norm": 0.5213710069656372,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.0088,
"step": 2431
},
{
"epoch": 17.0,
"eval_accuracy": 0.9980870955924801,
"eval_f1": 0.950275002806151,
"eval_loss": 0.006758366245776415,
"eval_precision": 0.930636473562713,
"eval_recall": 0.9707602339181286,
"eval_runtime": 24.3235,
"eval_samples_per_second": 93.613,
"eval_steps_per_second": 5.879,
"step": 2431
},
{
"epoch": 18.0,
"grad_norm": 0.7017818689346313,
"learning_rate": 2e-05,
"loss": 0.0074,
"step": 2574
},
{
"epoch": 18.0,
"eval_accuracy": 0.9985745868796424,
"eval_f1": 0.9625844546641685,
"eval_loss": 0.0046825287863612175,
"eval_precision": 0.9533288349077823,
"eval_recall": 0.972021557160876,
"eval_runtime": 24.3136,
"eval_samples_per_second": 93.651,
"eval_steps_per_second": 5.881,
"step": 2574
},
{
"epoch": 19.0,
"grad_norm": 0.434883713722229,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.0064,
"step": 2717
},
{
"epoch": 19.0,
"eval_accuracy": 0.998613073033892,
"eval_f1": 0.9669478003191248,
"eval_loss": 0.0045896186493337154,
"eval_precision": 0.9611419508326725,
"eval_recall": 0.9728242174062608,
"eval_runtime": 24.326,
"eval_samples_per_second": 93.604,
"eval_steps_per_second": 5.878,
"step": 2717
},
{
"epoch": 20.0,
"grad_norm": 0.06784375011920929,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0061,
"step": 2860
},
{
"epoch": 20.0,
"eval_accuracy": 0.9986572608406231,
"eval_f1": 0.9679410252014401,
"eval_loss": 0.004400221165269613,
"eval_precision": 0.9647983595352017,
"eval_recall": 0.9711042311661506,
"eval_runtime": 24.7155,
"eval_samples_per_second": 92.128,
"eval_steps_per_second": 5.786,
"step": 2860
},
{
"epoch": 21.0,
"grad_norm": 1.0147221088409424,
"learning_rate": 1.5e-05,
"loss": 0.0056,
"step": 3003
},
{
"epoch": 21.0,
"eval_accuracy": 0.9989024318973246,
"eval_f1": 0.9738651994497937,
"eval_loss": 0.003714313032105565,
"eval_precision": 0.9735304228257133,
"eval_recall": 0.9742002063983488,
"eval_runtime": 24.5602,
"eval_samples_per_second": 92.711,
"eval_steps_per_second": 5.822,
"step": 3003
},
{
"epoch": 22.0,
"grad_norm": 0.09583359956741333,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0048,
"step": 3146
},
{
"epoch": 22.0,
"eval_accuracy": 0.9988710728086768,
"eval_f1": 0.9748474305595163,
"eval_loss": 0.003601672360673547,
"eval_precision": 0.9698138901497957,
"eval_recall": 0.9799334938653824,
"eval_runtime": 24.3093,
"eval_samples_per_second": 93.668,
"eval_steps_per_second": 5.883,
"step": 3146
},
{
"epoch": 23.0,
"grad_norm": 0.09160174429416656,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.0046,
"step": 3289
},
{
"epoch": 23.0,
"eval_accuracy": 0.9989608738352593,
"eval_f1": 0.9748989697763105,
"eval_loss": 0.0032980283722281456,
"eval_precision": 0.9679023508137432,
"eval_recall": 0.9819974773535145,
"eval_runtime": 24.3088,
"eval_samples_per_second": 93.67,
"eval_steps_per_second": 5.883,
"step": 3289
},
{
"epoch": 24.0,
"grad_norm": 0.23728908598423004,
"learning_rate": 1e-05,
"loss": 0.0041,
"step": 3432
},
{
"epoch": 24.0,
"eval_accuracy": 0.9989323655728521,
"eval_f1": 0.9761823361823362,
"eval_loss": 0.0033988505601882935,
"eval_precision": 0.9702118020160834,
"eval_recall": 0.9822268088521958,
"eval_runtime": 24.2751,
"eval_samples_per_second": 93.8,
"eval_steps_per_second": 5.891,
"step": 3432
},
{
"epoch": 25.0,
"grad_norm": 0.29415127635002136,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0038,
"step": 3575
},
{
"epoch": 25.0,
"eval_accuracy": 0.9989822550320646,
"eval_f1": 0.9797035347776512,
"eval_loss": 0.0032090507447719574,
"eval_precision": 0.9742601201950335,
"eval_recall": 0.9852081183350533,
"eval_runtime": 24.269,
"eval_samples_per_second": 93.823,
"eval_steps_per_second": 5.892,
"step": 3575
},
{
"epoch": 26.0,
"grad_norm": 0.5604017972946167,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0036,
"step": 3718
},
{
"epoch": 26.0,
"eval_accuracy": 0.9990150395338329,
"eval_f1": 0.979810653587316,
"eval_loss": 0.0030433752108365297,
"eval_precision": 0.9746964711222058,
"eval_recall": 0.984978786836372,
"eval_runtime": 24.3179,
"eval_samples_per_second": 93.635,
"eval_steps_per_second": 5.88,
"step": 3718
},
{
"epoch": 27.0,
"grad_norm": 0.12406046688556671,
"learning_rate": 5e-06,
"loss": 0.0035,
"step": 3861
},
{
"epoch": 27.0,
"eval_accuracy": 0.9989965091632682,
"eval_f1": 0.9798382503702017,
"eval_loss": 0.003074992448091507,
"eval_precision": 0.9734072649089057,
"eval_recall": 0.98635477582846,
"eval_runtime": 24.3593,
"eval_samples_per_second": 93.476,
"eval_steps_per_second": 5.87,
"step": 3861
},
{
"epoch": 28.0,
"grad_norm": 0.4010084569454193,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0033,
"step": 4004
},
{
"epoch": 28.0,
"eval_accuracy": 0.999039271556879,
"eval_f1": 0.9810675182481752,
"eval_loss": 0.002982645994052291,
"eval_precision": 0.9758366420873511,
"eval_recall": 0.98635477582846,
"eval_runtime": 24.2989,
"eval_samples_per_second": 93.708,
"eval_steps_per_second": 5.885,
"step": 4004
},
{
"epoch": 29.0,
"grad_norm": 0.22495581209659576,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0031,
"step": 4147
},
{
"epoch": 29.0,
"eval_accuracy": 0.9990335699043975,
"eval_f1": 0.9815342528211557,
"eval_loss": 0.0029558425303548574,
"eval_precision": 0.9757507082152974,
"eval_recall": 0.9873867675725261,
"eval_runtime": 24.3323,
"eval_samples_per_second": 93.579,
"eval_steps_per_second": 5.877,
"step": 4147
},
{
"epoch": 30.0,
"grad_norm": 0.4037317931652069,
"learning_rate": 0.0,
"loss": 0.0031,
"step": 4290
},
{
"epoch": 30.0,
"eval_accuracy": 0.999039271556879,
"eval_f1": 0.982488163824083,
"eval_loss": 0.002941250102594495,
"eval_precision": 0.9775255391600454,
"eval_recall": 0.9875014333218668,
"eval_runtime": 24.2995,
"eval_samples_per_second": 93.706,
"eval_steps_per_second": 5.885,
"step": 4290
}
],
"logging_steps": 500,
"max_steps": 4290,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.785257029315584e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}