TCS_MLM / last-checkpoint /trainer_state.json
mgh6's picture
Training in progress, step 7000, checkpoint
90f84b5 verified
{
"best_metric": 0.7487396597862244,
"best_model_checkpoint": "mgh6/TCS_MLM/checkpoint-6500",
"epoch": 9.370816599732262,
"eval_steps": 100,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13386880856760375,
"grad_norm": 0.18324387073516846,
"learning_rate": 0.0009866131191432397,
"loss": 1.432,
"step": 100
},
{
"epoch": 0.13386880856760375,
"eval_loss": 1.1609667539596558,
"eval_runtime": 6.3619,
"eval_samples_per_second": 894.697,
"eval_steps_per_second": 3.615,
"step": 100
},
{
"epoch": 0.2677376171352075,
"grad_norm": 0.24928592145442963,
"learning_rate": 0.0009732262382864793,
"loss": 1.195,
"step": 200
},
{
"epoch": 0.2677376171352075,
"eval_loss": 1.0809524059295654,
"eval_runtime": 6.3583,
"eval_samples_per_second": 895.204,
"eval_steps_per_second": 3.617,
"step": 200
},
{
"epoch": 0.40160642570281124,
"grad_norm": 0.2209886759519577,
"learning_rate": 0.0009598393574297188,
"loss": 1.124,
"step": 300
},
{
"epoch": 0.40160642570281124,
"eval_loss": 1.0328294038772583,
"eval_runtime": 6.3831,
"eval_samples_per_second": 891.734,
"eval_steps_per_second": 3.603,
"step": 300
},
{
"epoch": 0.535475234270415,
"grad_norm": 0.21808107197284698,
"learning_rate": 0.0009464524765729585,
"loss": 1.0729,
"step": 400
},
{
"epoch": 0.535475234270415,
"eval_loss": 1.0003758668899536,
"eval_runtime": 6.3823,
"eval_samples_per_second": 891.836,
"eval_steps_per_second": 3.604,
"step": 400
},
{
"epoch": 0.6693440428380187,
"grad_norm": 0.19088135659694672,
"learning_rate": 0.0009330655957161981,
"loss": 1.0403,
"step": 500
},
{
"epoch": 0.6693440428380187,
"eval_loss": 0.9832900762557983,
"eval_runtime": 6.38,
"eval_samples_per_second": 892.158,
"eval_steps_per_second": 3.605,
"step": 500
},
{
"epoch": 0.8032128514056225,
"grad_norm": 0.19500073790550232,
"learning_rate": 0.0009196787148594378,
"loss": 1.0145,
"step": 600
},
{
"epoch": 0.8032128514056225,
"eval_loss": 0.9555763602256775,
"eval_runtime": 6.3688,
"eval_samples_per_second": 893.733,
"eval_steps_per_second": 3.611,
"step": 600
},
{
"epoch": 0.9370816599732262,
"grad_norm": 0.21632438898086548,
"learning_rate": 0.0009062918340026773,
"loss": 0.9922,
"step": 700
},
{
"epoch": 0.9370816599732262,
"eval_loss": 0.9372277855873108,
"eval_runtime": 6.3894,
"eval_samples_per_second": 890.852,
"eval_steps_per_second": 3.6,
"step": 700
},
{
"epoch": 1.07095046854083,
"grad_norm": 0.19669267535209656,
"learning_rate": 0.000892904953145917,
"loss": 0.9746,
"step": 800
},
{
"epoch": 1.07095046854083,
"eval_loss": 0.9347544312477112,
"eval_runtime": 6.3997,
"eval_samples_per_second": 889.412,
"eval_steps_per_second": 3.594,
"step": 800
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.19168391823768616,
"learning_rate": 0.0008795180722891566,
"loss": 0.956,
"step": 900
},
{
"epoch": 1.2048192771084336,
"eval_loss": 0.9178469181060791,
"eval_runtime": 6.3662,
"eval_samples_per_second": 894.095,
"eval_steps_per_second": 3.613,
"step": 900
},
{
"epoch": 1.3386880856760375,
"grad_norm": 0.18636095523834229,
"learning_rate": 0.0008661311914323963,
"loss": 0.9389,
"step": 1000
},
{
"epoch": 1.3386880856760375,
"eval_loss": 0.9004408717155457,
"eval_runtime": 6.385,
"eval_samples_per_second": 891.467,
"eval_steps_per_second": 3.602,
"step": 1000
},
{
"epoch": 1.4725568942436413,
"grad_norm": 0.2045995146036148,
"learning_rate": 0.0008527443105756359,
"loss": 0.9242,
"step": 1100
},
{
"epoch": 1.4725568942436413,
"eval_loss": 0.8951759934425354,
"eval_runtime": 6.39,
"eval_samples_per_second": 890.768,
"eval_steps_per_second": 3.599,
"step": 1100
},
{
"epoch": 1.606425702811245,
"grad_norm": 0.18196602165699005,
"learning_rate": 0.0008393574297188755,
"loss": 0.9153,
"step": 1200
},
{
"epoch": 1.606425702811245,
"eval_loss": 0.8943730592727661,
"eval_runtime": 6.3848,
"eval_samples_per_second": 891.496,
"eval_steps_per_second": 3.602,
"step": 1200
},
{
"epoch": 1.7402945113788486,
"grad_norm": 0.18478406965732574,
"learning_rate": 0.0008259705488621151,
"loss": 0.9034,
"step": 1300
},
{
"epoch": 1.7402945113788486,
"eval_loss": 0.8838639259338379,
"eval_runtime": 6.3769,
"eval_samples_per_second": 892.596,
"eval_steps_per_second": 3.607,
"step": 1300
},
{
"epoch": 1.8741633199464525,
"grad_norm": 0.196046844124794,
"learning_rate": 0.0008125836680053548,
"loss": 0.8933,
"step": 1400
},
{
"epoch": 1.8741633199464525,
"eval_loss": 0.8721866607666016,
"eval_runtime": 6.3772,
"eval_samples_per_second": 892.551,
"eval_steps_per_second": 3.607,
"step": 1400
},
{
"epoch": 2.0080321285140563,
"grad_norm": 0.19012120366096497,
"learning_rate": 0.0007991967871485943,
"loss": 0.885,
"step": 1500
},
{
"epoch": 2.0080321285140563,
"eval_loss": 0.8718409538269043,
"eval_runtime": 6.3647,
"eval_samples_per_second": 894.312,
"eval_steps_per_second": 3.614,
"step": 1500
},
{
"epoch": 2.14190093708166,
"grad_norm": 0.21610258519649506,
"learning_rate": 0.0007858099062918341,
"loss": 0.8716,
"step": 1600
},
{
"epoch": 2.14190093708166,
"eval_loss": 0.8591811060905457,
"eval_runtime": 6.3979,
"eval_samples_per_second": 889.671,
"eval_steps_per_second": 3.595,
"step": 1600
},
{
"epoch": 2.2757697456492636,
"grad_norm": 0.1839440017938614,
"learning_rate": 0.0007724230254350736,
"loss": 0.8628,
"step": 1700
},
{
"epoch": 2.2757697456492636,
"eval_loss": 0.8537179231643677,
"eval_runtime": 6.3793,
"eval_samples_per_second": 892.265,
"eval_steps_per_second": 3.605,
"step": 1700
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.18840855360031128,
"learning_rate": 0.0007590361445783132,
"loss": 0.858,
"step": 1800
},
{
"epoch": 2.4096385542168672,
"eval_loss": 0.8528442978858948,
"eval_runtime": 6.3969,
"eval_samples_per_second": 889.806,
"eval_steps_per_second": 3.595,
"step": 1800
},
{
"epoch": 2.5435073627844713,
"grad_norm": 0.18157944083213806,
"learning_rate": 0.0007456492637215529,
"loss": 0.8556,
"step": 1900
},
{
"epoch": 2.5435073627844713,
"eval_loss": 0.8446890115737915,
"eval_runtime": 6.3767,
"eval_samples_per_second": 892.622,
"eval_steps_per_second": 3.607,
"step": 1900
},
{
"epoch": 2.677376171352075,
"grad_norm": 0.1894202083349228,
"learning_rate": 0.0007322623828647925,
"loss": 0.8463,
"step": 2000
},
{
"epoch": 2.677376171352075,
"eval_loss": 0.8482021689414978,
"eval_runtime": 6.3911,
"eval_samples_per_second": 890.609,
"eval_steps_per_second": 3.599,
"step": 2000
},
{
"epoch": 2.8112449799196786,
"grad_norm": 0.2042614370584488,
"learning_rate": 0.000718875502008032,
"loss": 0.8372,
"step": 2100
},
{
"epoch": 2.8112449799196786,
"eval_loss": 0.8368203639984131,
"eval_runtime": 6.3674,
"eval_samples_per_second": 893.931,
"eval_steps_per_second": 3.612,
"step": 2100
},
{
"epoch": 2.9451137884872827,
"grad_norm": 0.1776580661535263,
"learning_rate": 0.0007054886211512718,
"loss": 0.8327,
"step": 2200
},
{
"epoch": 2.9451137884872827,
"eval_loss": 0.8410006165504456,
"eval_runtime": 6.4205,
"eval_samples_per_second": 886.529,
"eval_steps_per_second": 3.582,
"step": 2200
},
{
"epoch": 3.0789825970548863,
"grad_norm": 0.2126319259405136,
"learning_rate": 0.0006921017402945113,
"loss": 0.8298,
"step": 2300
},
{
"epoch": 3.0789825970548863,
"eval_loss": 0.8286001682281494,
"eval_runtime": 6.4466,
"eval_samples_per_second": 882.942,
"eval_steps_per_second": 3.568,
"step": 2300
},
{
"epoch": 3.21285140562249,
"grad_norm": 0.1851571649312973,
"learning_rate": 0.000678714859437751,
"loss": 0.8181,
"step": 2400
},
{
"epoch": 3.21285140562249,
"eval_loss": 0.8264986872673035,
"eval_runtime": 6.3671,
"eval_samples_per_second": 893.972,
"eval_steps_per_second": 3.612,
"step": 2400
},
{
"epoch": 3.3467202141900936,
"grad_norm": 0.21075735986232758,
"learning_rate": 0.0006653279785809906,
"loss": 0.814,
"step": 2500
},
{
"epoch": 3.3467202141900936,
"eval_loss": 0.8332136869430542,
"eval_runtime": 6.3889,
"eval_samples_per_second": 890.924,
"eval_steps_per_second": 3.6,
"step": 2500
},
{
"epoch": 3.480589022757697,
"grad_norm": 0.1842898279428482,
"learning_rate": 0.0006519410977242302,
"loss": 0.8072,
"step": 2600
},
{
"epoch": 3.480589022757697,
"eval_loss": 0.8202800750732422,
"eval_runtime": 6.3504,
"eval_samples_per_second": 896.32,
"eval_steps_per_second": 3.622,
"step": 2600
},
{
"epoch": 3.6144578313253013,
"grad_norm": 0.18545600771903992,
"learning_rate": 0.0006385542168674699,
"loss": 0.8071,
"step": 2700
},
{
"epoch": 3.6144578313253013,
"eval_loss": 0.8289027214050293,
"eval_runtime": 6.4542,
"eval_samples_per_second": 881.901,
"eval_steps_per_second": 3.564,
"step": 2700
},
{
"epoch": 3.748326639892905,
"grad_norm": 0.19937384128570557,
"learning_rate": 0.0006251673360107095,
"loss": 0.8,
"step": 2800
},
{
"epoch": 3.748326639892905,
"eval_loss": 0.8232221007347107,
"eval_runtime": 6.3959,
"eval_samples_per_second": 889.945,
"eval_steps_per_second": 3.596,
"step": 2800
},
{
"epoch": 3.8821954484605086,
"grad_norm": 0.22407300770282745,
"learning_rate": 0.0006117804551539491,
"loss": 0.7964,
"step": 2900
},
{
"epoch": 3.8821954484605086,
"eval_loss": 0.8169597387313843,
"eval_runtime": 6.4097,
"eval_samples_per_second": 888.036,
"eval_steps_per_second": 3.588,
"step": 2900
},
{
"epoch": 4.016064257028113,
"grad_norm": 0.20041291415691376,
"learning_rate": 0.0005983935742971888,
"loss": 0.7909,
"step": 3000
},
{
"epoch": 4.016064257028113,
"eval_loss": 0.819555401802063,
"eval_runtime": 6.3716,
"eval_samples_per_second": 893.337,
"eval_steps_per_second": 3.61,
"step": 3000
},
{
"epoch": 4.149933065595716,
"grad_norm": 0.19783490896224976,
"learning_rate": 0.0005850066934404283,
"loss": 0.7826,
"step": 3100
},
{
"epoch": 4.149933065595716,
"eval_loss": 0.8133747577667236,
"eval_runtime": 6.3555,
"eval_samples_per_second": 895.607,
"eval_steps_per_second": 3.619,
"step": 3100
},
{
"epoch": 4.28380187416332,
"grad_norm": 0.19236040115356445,
"learning_rate": 0.000571619812583668,
"loss": 0.7805,
"step": 3200
},
{
"epoch": 4.28380187416332,
"eval_loss": 0.805473268032074,
"eval_runtime": 6.4107,
"eval_samples_per_second": 887.894,
"eval_steps_per_second": 3.588,
"step": 3200
},
{
"epoch": 4.417670682730924,
"grad_norm": 0.20242071151733398,
"learning_rate": 0.0005582329317269076,
"loss": 0.775,
"step": 3300
},
{
"epoch": 4.417670682730924,
"eval_loss": 0.8090841174125671,
"eval_runtime": 6.3804,
"eval_samples_per_second": 892.112,
"eval_steps_per_second": 3.605,
"step": 3300
},
{
"epoch": 4.551539491298527,
"grad_norm": 0.18258976936340332,
"learning_rate": 0.0005448460508701473,
"loss": 0.7677,
"step": 3400
},
{
"epoch": 4.551539491298527,
"eval_loss": 0.8068288564682007,
"eval_runtime": 6.3962,
"eval_samples_per_second": 889.896,
"eval_steps_per_second": 3.596,
"step": 3400
},
{
"epoch": 4.685408299866131,
"grad_norm": 0.2070203423500061,
"learning_rate": 0.0005314591700133868,
"loss": 0.7658,
"step": 3500
},
{
"epoch": 4.685408299866131,
"eval_loss": 0.7966070771217346,
"eval_runtime": 6.3599,
"eval_samples_per_second": 894.984,
"eval_steps_per_second": 3.616,
"step": 3500
},
{
"epoch": 4.8192771084337345,
"grad_norm": 0.19489823281764984,
"learning_rate": 0.0005180722891566265,
"loss": 0.768,
"step": 3600
},
{
"epoch": 4.8192771084337345,
"eval_loss": 0.800128698348999,
"eval_runtime": 6.3948,
"eval_samples_per_second": 890.099,
"eval_steps_per_second": 3.597,
"step": 3600
},
{
"epoch": 4.953145917001339,
"grad_norm": 0.18897077441215515,
"learning_rate": 0.0005046854082998661,
"loss": 0.765,
"step": 3700
},
{
"epoch": 4.953145917001339,
"eval_loss": 0.7916857600212097,
"eval_runtime": 6.3763,
"eval_samples_per_second": 892.674,
"eval_steps_per_second": 3.607,
"step": 3700
},
{
"epoch": 5.087014725568943,
"grad_norm": 0.19471462070941925,
"learning_rate": 0.0004912985274431057,
"loss": 0.7532,
"step": 3800
},
{
"epoch": 5.087014725568943,
"eval_loss": 0.8013682961463928,
"eval_runtime": 6.4045,
"eval_samples_per_second": 888.745,
"eval_steps_per_second": 3.591,
"step": 3800
},
{
"epoch": 5.220883534136546,
"grad_norm": 0.19813202321529388,
"learning_rate": 0.0004779116465863454,
"loss": 0.75,
"step": 3900
},
{
"epoch": 5.220883534136546,
"eval_loss": 0.7911626696586609,
"eval_runtime": 6.3995,
"eval_samples_per_second": 889.442,
"eval_steps_per_second": 3.594,
"step": 3900
},
{
"epoch": 5.35475234270415,
"grad_norm": 0.199341282248497,
"learning_rate": 0.000464524765729585,
"loss": 0.7462,
"step": 4000
},
{
"epoch": 5.35475234270415,
"eval_loss": 0.7948747873306274,
"eval_runtime": 6.3721,
"eval_samples_per_second": 893.27,
"eval_steps_per_second": 3.609,
"step": 4000
},
{
"epoch": 5.4886211512717535,
"grad_norm": 0.22185169160366058,
"learning_rate": 0.00045113788487282465,
"loss": 0.7461,
"step": 4100
},
{
"epoch": 5.4886211512717535,
"eval_loss": 0.7832607626914978,
"eval_runtime": 6.3959,
"eval_samples_per_second": 889.94,
"eval_steps_per_second": 3.596,
"step": 4100
},
{
"epoch": 5.622489959839357,
"grad_norm": 0.19276629388332367,
"learning_rate": 0.0004377510040160643,
"loss": 0.7411,
"step": 4200
},
{
"epoch": 5.622489959839357,
"eval_loss": 0.78049236536026,
"eval_runtime": 6.3726,
"eval_samples_per_second": 893.205,
"eval_steps_per_second": 3.609,
"step": 4200
},
{
"epoch": 5.756358768406961,
"grad_norm": 0.19334332644939423,
"learning_rate": 0.00042436412315930387,
"loss": 0.7389,
"step": 4300
},
{
"epoch": 5.756358768406961,
"eval_loss": 0.7910569906234741,
"eval_runtime": 6.3418,
"eval_samples_per_second": 897.535,
"eval_steps_per_second": 3.627,
"step": 4300
},
{
"epoch": 5.890227576974565,
"grad_norm": 0.19738435745239258,
"learning_rate": 0.0004109772423025435,
"loss": 0.7339,
"step": 4400
},
{
"epoch": 5.890227576974565,
"eval_loss": 0.7912316918373108,
"eval_runtime": 6.4227,
"eval_samples_per_second": 886.234,
"eval_steps_per_second": 3.581,
"step": 4400
},
{
"epoch": 6.024096385542169,
"grad_norm": 0.19529978930950165,
"learning_rate": 0.00039759036144578315,
"loss": 0.7329,
"step": 4500
},
{
"epoch": 6.024096385542169,
"eval_loss": 0.7827839851379395,
"eval_runtime": 6.3652,
"eval_samples_per_second": 894.234,
"eval_steps_per_second": 3.613,
"step": 4500
},
{
"epoch": 6.157965194109773,
"grad_norm": 0.18886443972587585,
"learning_rate": 0.0003842034805890228,
"loss": 0.7246,
"step": 4600
},
{
"epoch": 6.157965194109773,
"eval_loss": 0.7793735861778259,
"eval_runtime": 6.3661,
"eval_samples_per_second": 894.112,
"eval_steps_per_second": 3.613,
"step": 4600
},
{
"epoch": 6.291834002677376,
"grad_norm": 0.20140951871871948,
"learning_rate": 0.0003708165997322624,
"loss": 0.7186,
"step": 4700
},
{
"epoch": 6.291834002677376,
"eval_loss": 0.7824135422706604,
"eval_runtime": 6.379,
"eval_samples_per_second": 892.303,
"eval_steps_per_second": 3.606,
"step": 4700
},
{
"epoch": 6.42570281124498,
"grad_norm": 0.19508065283298492,
"learning_rate": 0.000357429718875502,
"loss": 0.7196,
"step": 4800
},
{
"epoch": 6.42570281124498,
"eval_loss": 0.7769716382026672,
"eval_runtime": 6.3587,
"eval_samples_per_second": 895.148,
"eval_steps_per_second": 3.617,
"step": 4800
},
{
"epoch": 6.5595716198125835,
"grad_norm": 0.2040824443101883,
"learning_rate": 0.00034404283801874166,
"loss": 0.7194,
"step": 4900
},
{
"epoch": 6.5595716198125835,
"eval_loss": 0.775974452495575,
"eval_runtime": 6.415,
"eval_samples_per_second": 887.297,
"eval_steps_per_second": 3.585,
"step": 4900
},
{
"epoch": 6.693440428380187,
"grad_norm": 0.21073400974273682,
"learning_rate": 0.00033065595716198125,
"loss": 0.7166,
"step": 5000
},
{
"epoch": 6.693440428380187,
"eval_loss": 0.7732182145118713,
"eval_runtime": 6.3552,
"eval_samples_per_second": 895.647,
"eval_steps_per_second": 3.619,
"step": 5000
},
{
"epoch": 6.827309236947791,
"grad_norm": 0.19911488890647888,
"learning_rate": 0.0003172690763052209,
"loss": 0.7113,
"step": 5100
},
{
"epoch": 6.827309236947791,
"eval_loss": 0.7706419825553894,
"eval_runtime": 6.3916,
"eval_samples_per_second": 890.547,
"eval_steps_per_second": 3.598,
"step": 5100
},
{
"epoch": 6.961178045515394,
"grad_norm": 0.1983019858598709,
"learning_rate": 0.00030388219544846053,
"loss": 0.7077,
"step": 5200
},
{
"epoch": 6.961178045515394,
"eval_loss": 0.7824519276618958,
"eval_runtime": 6.3567,
"eval_samples_per_second": 895.429,
"eval_steps_per_second": 3.618,
"step": 5200
},
{
"epoch": 7.095046854082999,
"grad_norm": 0.19971829652786255,
"learning_rate": 0.0002904953145917001,
"loss": 0.6997,
"step": 5300
},
{
"epoch": 7.095046854082999,
"eval_loss": 0.7725899815559387,
"eval_runtime": 6.3729,
"eval_samples_per_second": 893.153,
"eval_steps_per_second": 3.609,
"step": 5300
},
{
"epoch": 7.228915662650603,
"grad_norm": 0.2070448100566864,
"learning_rate": 0.00027710843373493976,
"loss": 0.6983,
"step": 5400
},
{
"epoch": 7.228915662650603,
"eval_loss": 0.7650670409202576,
"eval_runtime": 6.4167,
"eval_samples_per_second": 887.065,
"eval_steps_per_second": 3.584,
"step": 5400
},
{
"epoch": 7.362784471218206,
"grad_norm": 0.19670027494430542,
"learning_rate": 0.0002637215528781794,
"loss": 0.6967,
"step": 5500
},
{
"epoch": 7.362784471218206,
"eval_loss": 0.7688850164413452,
"eval_runtime": 6.3788,
"eval_samples_per_second": 892.334,
"eval_steps_per_second": 3.606,
"step": 5500
},
{
"epoch": 7.49665327978581,
"grad_norm": 0.22708941996097565,
"learning_rate": 0.00025033467202141904,
"loss": 0.6978,
"step": 5600
},
{
"epoch": 7.49665327978581,
"eval_loss": 0.7693562507629395,
"eval_runtime": 6.3747,
"eval_samples_per_second": 892.903,
"eval_steps_per_second": 3.608,
"step": 5600
},
{
"epoch": 7.6305220883534135,
"grad_norm": 0.2055513709783554,
"learning_rate": 0.00023694779116465866,
"loss": 0.69,
"step": 5700
},
{
"epoch": 7.6305220883534135,
"eval_loss": 0.7648805975914001,
"eval_runtime": 6.3649,
"eval_samples_per_second": 894.277,
"eval_steps_per_second": 3.614,
"step": 5700
},
{
"epoch": 7.764390896921017,
"grad_norm": 0.19769689440727234,
"learning_rate": 0.00022356091030789827,
"loss": 0.6921,
"step": 5800
},
{
"epoch": 7.764390896921017,
"eval_loss": 0.7649876475334167,
"eval_runtime": 6.3876,
"eval_samples_per_second": 891.102,
"eval_steps_per_second": 3.601,
"step": 5800
},
{
"epoch": 7.898259705488621,
"grad_norm": 0.20442116260528564,
"learning_rate": 0.00021017402945113788,
"loss": 0.6876,
"step": 5900
},
{
"epoch": 7.898259705488621,
"eval_loss": 0.7717822790145874,
"eval_runtime": 6.3521,
"eval_samples_per_second": 896.089,
"eval_steps_per_second": 3.621,
"step": 5900
},
{
"epoch": 8.032128514056225,
"grad_norm": 0.21449404954910278,
"learning_rate": 0.00019678714859437752,
"loss": 0.6838,
"step": 6000
},
{
"epoch": 8.032128514056225,
"eval_loss": 0.7580859661102295,
"eval_runtime": 6.4002,
"eval_samples_per_second": 889.346,
"eval_steps_per_second": 3.594,
"step": 6000
},
{
"epoch": 8.165997322623829,
"grad_norm": 0.2097356915473938,
"learning_rate": 0.00018340026773761714,
"loss": 0.6789,
"step": 6100
},
{
"epoch": 8.165997322623829,
"eval_loss": 0.7637941241264343,
"eval_runtime": 6.4327,
"eval_samples_per_second": 884.859,
"eval_steps_per_second": 3.576,
"step": 6100
},
{
"epoch": 8.299866131191433,
"grad_norm": 0.19806508719921112,
"learning_rate": 0.00017001338688085678,
"loss": 0.6774,
"step": 6200
},
{
"epoch": 8.299866131191433,
"eval_loss": 0.757574200630188,
"eval_runtime": 6.4053,
"eval_samples_per_second": 888.642,
"eval_steps_per_second": 3.591,
"step": 6200
},
{
"epoch": 8.433734939759036,
"grad_norm": 0.1967461109161377,
"learning_rate": 0.0001566265060240964,
"loss": 0.672,
"step": 6300
},
{
"epoch": 8.433734939759036,
"eval_loss": 0.7565015554428101,
"eval_runtime": 6.3503,
"eval_samples_per_second": 896.341,
"eval_steps_per_second": 3.622,
"step": 6300
},
{
"epoch": 8.56760374832664,
"grad_norm": 0.21538911759853363,
"learning_rate": 0.000143239625167336,
"loss": 0.6759,
"step": 6400
},
{
"epoch": 8.56760374832664,
"eval_loss": 0.7605956792831421,
"eval_runtime": 6.4074,
"eval_samples_per_second": 888.342,
"eval_steps_per_second": 3.59,
"step": 6400
},
{
"epoch": 8.701472556894243,
"grad_norm": 0.20278280973434448,
"learning_rate": 0.00012985274431057565,
"loss": 0.6707,
"step": 6500
},
{
"epoch": 8.701472556894243,
"eval_loss": 0.7487396597862244,
"eval_runtime": 6.3846,
"eval_samples_per_second": 891.523,
"eval_steps_per_second": 3.602,
"step": 6500
},
{
"epoch": 8.835341365461847,
"grad_norm": 0.20785841345787048,
"learning_rate": 0.00011646586345381527,
"loss": 0.6697,
"step": 6600
},
{
"epoch": 8.835341365461847,
"eval_loss": 0.755291223526001,
"eval_runtime": 6.3631,
"eval_samples_per_second": 894.526,
"eval_steps_per_second": 3.615,
"step": 6600
},
{
"epoch": 8.96921017402945,
"grad_norm": 0.20792551338672638,
"learning_rate": 0.00010307898259705489,
"loss": 0.6629,
"step": 6700
},
{
"epoch": 8.96921017402945,
"eval_loss": 0.7500482201576233,
"eval_runtime": 6.3736,
"eval_samples_per_second": 893.061,
"eval_steps_per_second": 3.609,
"step": 6700
},
{
"epoch": 9.103078982597054,
"grad_norm": 0.2049485594034195,
"learning_rate": 8.969210174029451e-05,
"loss": 0.6629,
"step": 6800
},
{
"epoch": 9.103078982597054,
"eval_loss": 0.7550941705703735,
"eval_runtime": 6.3709,
"eval_samples_per_second": 893.433,
"eval_steps_per_second": 3.61,
"step": 6800
},
{
"epoch": 9.236947791164658,
"grad_norm": 0.21241113543510437,
"learning_rate": 7.630522088353414e-05,
"loss": 0.6629,
"step": 6900
},
{
"epoch": 9.236947791164658,
"eval_loss": 0.7512398958206177,
"eval_runtime": 6.3681,
"eval_samples_per_second": 893.836,
"eval_steps_per_second": 3.612,
"step": 6900
},
{
"epoch": 9.370816599732262,
"grad_norm": 0.2047683447599411,
"learning_rate": 6.291834002677377e-05,
"loss": 0.6625,
"step": 7000
},
{
"epoch": 9.370816599732262,
"eval_loss": 0.7502346038818359,
"eval_runtime": 6.3912,
"eval_samples_per_second": 890.597,
"eval_steps_per_second": 3.599,
"step": 7000
}
],
"logging_steps": 100,
"max_steps": 7470,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 5
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.5418589536256e+17,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}