{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9932279909706545,
"eval_steps": 500,
"global_step": 996,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015048908954100828,
"grad_norm": 1.2988319396972656,
"learning_rate": 4.9996890990217804e-05,
"loss": 2.4707,
"num_input_tokens_seen": 5864,
"step": 5
},
{
"epoch": 0.030097817908201655,
"grad_norm": 1.8058427572250366,
"learning_rate": 4.9987564734146566e-05,
"loss": 2.2509,
"num_input_tokens_seen": 11432,
"step": 10
},
{
"epoch": 0.045146726862302484,
"grad_norm": 0.8231738209724426,
"learning_rate": 4.997202355141999e-05,
"loss": 1.6895,
"num_input_tokens_seen": 17000,
"step": 15
},
{
"epoch": 0.06019563581640331,
"grad_norm": 0.7266705632209778,
"learning_rate": 4.995027130745321e-05,
"loss": 1.4876,
"num_input_tokens_seen": 22840,
"step": 20
},
{
"epoch": 0.07524454477050414,
"grad_norm": 1.1722582578659058,
"learning_rate": 4.992231341248137e-05,
"loss": 1.4812,
"num_input_tokens_seen": 28984,
"step": 25
},
{
"epoch": 0.09029345372460497,
"grad_norm": 0.9262341260910034,
"learning_rate": 4.9888156820213974e-05,
"loss": 1.3642,
"num_input_tokens_seen": 34856,
"step": 30
},
{
"epoch": 0.1053423626787058,
"grad_norm": 0.8832902908325195,
"learning_rate": 4.9847810026105394e-05,
"loss": 1.3651,
"num_input_tokens_seen": 41216,
"step": 35
},
{
"epoch": 0.12039127163280662,
"grad_norm": 0.8503655791282654,
"learning_rate": 4.980128306524183e-05,
"loss": 1.1321,
"num_input_tokens_seen": 47304,
"step": 40
},
{
"epoch": 0.13544018058690746,
"grad_norm": 1.348948359489441,
"learning_rate": 4.97485875098454e-05,
"loss": 1.3012,
"num_input_tokens_seen": 53184,
"step": 45
},
{
"epoch": 0.1504890895410083,
"grad_norm": 0.7177269458770752,
"learning_rate": 4.968973646639589e-05,
"loss": 0.9827,
"num_input_tokens_seen": 59024,
"step": 50
},
{
"epoch": 0.1655379984951091,
"grad_norm": 0.6005258560180664,
"learning_rate": 4.9624744572370865e-05,
"loss": 1.2313,
"num_input_tokens_seen": 64816,
"step": 55
},
{
"epoch": 0.18058690744920994,
"grad_norm": 0.6153081059455872,
"learning_rate": 4.9553627992605066e-05,
"loss": 1.0347,
"num_input_tokens_seen": 70848,
"step": 60
},
{
"epoch": 0.19563581640331076,
"grad_norm": 0.7796200513839722,
"learning_rate": 4.947640441526989e-05,
"loss": 1.0422,
"num_input_tokens_seen": 76888,
"step": 65
},
{
"epoch": 0.2106847253574116,
"grad_norm": 0.7273033857345581,
"learning_rate": 4.939309304747391e-05,
"loss": 0.9996,
"num_input_tokens_seen": 82840,
"step": 70
},
{
"epoch": 0.22573363431151242,
"grad_norm": 0.7943289875984192,
"learning_rate": 4.930371461048571e-05,
"loss": 1.0755,
"num_input_tokens_seen": 88824,
"step": 75
},
{
"epoch": 0.24078254326561324,
"grad_norm": 0.6128024458885193,
"learning_rate": 4.9208291334580104e-05,
"loss": 1.026,
"num_input_tokens_seen": 94264,
"step": 80
},
{
"epoch": 0.2558314522197141,
"grad_norm": 0.7087495923042297,
"learning_rate": 4.910684695350895e-05,
"loss": 1.1307,
"num_input_tokens_seen": 99896,
"step": 85
},
{
"epoch": 0.2708803611738149,
"grad_norm": 0.711476743221283,
"learning_rate": 4.8999406698598074e-05,
"loss": 1.0221,
"num_input_tokens_seen": 105640,
"step": 90
},
{
"epoch": 0.28592927012791575,
"grad_norm": 0.5772566795349121,
"learning_rate": 4.8885997292471774e-05,
"loss": 1.012,
"num_input_tokens_seen": 111280,
"step": 95
},
{
"epoch": 0.3009781790820166,
"grad_norm": 0.6769325137138367,
"learning_rate": 4.87666469424063e-05,
"loss": 1.0151,
"num_input_tokens_seen": 116640,
"step": 100
},
{
"epoch": 0.3160270880361174,
"grad_norm": 0.679373025894165,
"learning_rate": 4.86413853333141e-05,
"loss": 1.0028,
"num_input_tokens_seen": 121864,
"step": 105
},
{
"epoch": 0.3310759969902182,
"grad_norm": 0.9181504845619202,
"learning_rate": 4.851024362036064e-05,
"loss": 1.143,
"num_input_tokens_seen": 127384,
"step": 110
},
{
"epoch": 0.34612490594431905,
"grad_norm": 0.7842696905136108,
"learning_rate": 4.837325442121538e-05,
"loss": 0.9695,
"num_input_tokens_seen": 133008,
"step": 115
},
{
"epoch": 0.3611738148984199,
"grad_norm": 0.6459535360336304,
"learning_rate": 4.8230451807939135e-05,
"loss": 0.9017,
"num_input_tokens_seen": 139144,
"step": 120
},
{
"epoch": 0.3762227238525207,
"grad_norm": 0.6695935726165771,
"learning_rate": 4.808187129850963e-05,
"loss": 1.035,
"num_input_tokens_seen": 144848,
"step": 125
},
{
"epoch": 0.3912716328066215,
"grad_norm": 0.9289236664772034,
"learning_rate": 4.792754984798745e-05,
"loss": 1.0128,
"num_input_tokens_seen": 150480,
"step": 130
},
{
"epoch": 0.40632054176072235,
"grad_norm": 0.6192979216575623,
"learning_rate": 4.776752583932454e-05,
"loss": 0.9432,
"num_input_tokens_seen": 156336,
"step": 135
},
{
"epoch": 0.4213694507148232,
"grad_norm": 0.7946303486824036,
"learning_rate": 4.760183907381757e-05,
"loss": 1.0344,
"num_input_tokens_seen": 162440,
"step": 140
},
{
"epoch": 0.436418359668924,
"grad_norm": 0.6548484563827515,
"learning_rate": 4.7430530761208494e-05,
"loss": 0.9452,
"num_input_tokens_seen": 168304,
"step": 145
},
{
"epoch": 0.45146726862302483,
"grad_norm": 0.9075986742973328,
"learning_rate": 4.725364350943492e-05,
"loss": 0.9559,
"num_input_tokens_seen": 173984,
"step": 150
},
{
"epoch": 0.46651617757712566,
"grad_norm": 0.8047800660133362,
"learning_rate": 4.707122131403251e-05,
"loss": 0.9726,
"num_input_tokens_seen": 179896,
"step": 155
},
{
"epoch": 0.4815650865312265,
"grad_norm": 0.6954847574234009,
"learning_rate": 4.6883309547192476e-05,
"loss": 0.9344,
"num_input_tokens_seen": 185296,
"step": 160
},
{
"epoch": 0.4966139954853273,
"grad_norm": 0.7912609577178955,
"learning_rate": 4.668995494647653e-05,
"loss": 0.9497,
"num_input_tokens_seen": 190928,
"step": 165
},
{
"epoch": 0.5116629044394282,
"grad_norm": 0.7360678315162659,
"learning_rate": 4.649120560319225e-05,
"loss": 1.057,
"num_input_tokens_seen": 197352,
"step": 170
},
{
"epoch": 0.526711813393529,
"grad_norm": 0.7325194478034973,
"learning_rate": 4.6287110950431865e-05,
"loss": 0.9847,
"num_input_tokens_seen": 203216,
"step": 175
},
{
"epoch": 0.5417607223476298,
"grad_norm": 0.7140082120895386,
"learning_rate": 4.607772175077711e-05,
"loss": 1.001,
"num_input_tokens_seen": 208624,
"step": 180
},
{
"epoch": 0.5568096313017307,
"grad_norm": 0.9454194903373718,
"learning_rate": 4.586309008367359e-05,
"loss": 0.9384,
"num_input_tokens_seen": 214552,
"step": 185
},
{
"epoch": 0.5718585402558315,
"grad_norm": 0.9370235800743103,
"learning_rate": 4.564326933247752e-05,
"loss": 1.0312,
"num_input_tokens_seen": 220704,
"step": 190
},
{
"epoch": 0.5869074492099323,
"grad_norm": 0.7274216413497925,
"learning_rate": 4.541831417117815e-05,
"loss": 0.9112,
"num_input_tokens_seen": 226480,
"step": 195
},
{
"epoch": 0.6019563581640331,
"grad_norm": 0.9026529788970947,
"learning_rate": 4.518828055079925e-05,
"loss": 0.9967,
"num_input_tokens_seen": 232136,
"step": 200
},
{
"epoch": 0.617005267118134,
"grad_norm": 0.9668667316436768,
"learning_rate": 4.4953225685482904e-05,
"loss": 1.0905,
"num_input_tokens_seen": 238072,
"step": 205
},
{
"epoch": 0.6320541760722348,
"grad_norm": 0.7728851437568665,
"learning_rate": 4.471320803825915e-05,
"loss": 0.9487,
"num_input_tokens_seen": 243680,
"step": 210
},
{
"epoch": 0.6471030850263356,
"grad_norm": 0.7141396999359131,
"learning_rate": 4.4468287306505045e-05,
"loss": 0.8675,
"num_input_tokens_seen": 249376,
"step": 215
},
{
"epoch": 0.6621519939804364,
"grad_norm": 0.7524191737174988,
"learning_rate": 4.421852440709666e-05,
"loss": 0.8624,
"num_input_tokens_seen": 255288,
"step": 220
},
{
"epoch": 0.6772009029345373,
"grad_norm": 1.1502355337142944,
"learning_rate": 4.39639814612578e-05,
"loss": 1.0489,
"num_input_tokens_seen": 261592,
"step": 225
},
{
"epoch": 0.6922498118886381,
"grad_norm": 0.7467320561408997,
"learning_rate": 4.370472177910914e-05,
"loss": 0.9139,
"num_input_tokens_seen": 267192,
"step": 230
},
{
"epoch": 0.7072987208427389,
"grad_norm": 0.6400129795074463,
"learning_rate": 4.3440809843921725e-05,
"loss": 0.9905,
"num_input_tokens_seen": 272712,
"step": 235
},
{
"epoch": 0.7223476297968398,
"grad_norm": 0.6654481291770935,
"learning_rate": 4.3172311296078595e-05,
"loss": 0.8974,
"num_input_tokens_seen": 278720,
"step": 240
},
{
"epoch": 0.7373965387509406,
"grad_norm": 0.7487585544586182,
"learning_rate": 4.28992929167487e-05,
"loss": 0.999,
"num_input_tokens_seen": 284584,
"step": 245
},
{
"epoch": 0.7524454477050414,
"grad_norm": 0.6885581612586975,
"learning_rate": 4.2621822611277e-05,
"loss": 0.9916,
"num_input_tokens_seen": 290408,
"step": 250
},
{
"epoch": 0.7674943566591422,
"grad_norm": 0.774027407169342,
"learning_rate": 4.233996939229502e-05,
"loss": 0.9242,
"num_input_tokens_seen": 295776,
"step": 255
},
{
"epoch": 0.782543265613243,
"grad_norm": 0.8608073592185974,
"learning_rate": 4.205380336255594e-05,
"loss": 1.0426,
"num_input_tokens_seen": 301736,
"step": 260
},
{
"epoch": 0.7975921745673439,
"grad_norm": 0.6539498567581177,
"learning_rate": 4.176339569749865e-05,
"loss": 0.8625,
"num_input_tokens_seen": 307224,
"step": 265
},
{
"epoch": 0.8126410835214447,
"grad_norm": 0.8432996273040771,
"learning_rate": 4.1468818627544845e-05,
"loss": 0.9959,
"num_input_tokens_seen": 313040,
"step": 270
},
{
"epoch": 0.8276899924755455,
"grad_norm": 0.877001166343689,
"learning_rate": 4.11701454201339e-05,
"loss": 0.939,
"num_input_tokens_seen": 319112,
"step": 275
},
{
"epoch": 0.8427389014296464,
"grad_norm": 0.9003238081932068,
"learning_rate": 4.08674503614997e-05,
"loss": 0.9741,
"num_input_tokens_seen": 325040,
"step": 280
},
{
"epoch": 0.8577878103837472,
"grad_norm": 0.8585950136184692,
"learning_rate": 4.0560808738194114e-05,
"loss": 0.98,
"num_input_tokens_seen": 330904,
"step": 285
},
{
"epoch": 0.872836719337848,
"grad_norm": 0.8015385270118713,
"learning_rate": 4.0250296818361647e-05,
"loss": 0.8898,
"num_input_tokens_seen": 336392,
"step": 290
},
{
"epoch": 0.8878856282919488,
"grad_norm": 0.8380082845687866,
"learning_rate": 3.993599183277001e-05,
"loss": 0.953,
"num_input_tokens_seen": 342832,
"step": 295
},
{
"epoch": 0.9029345372460497,
"grad_norm": 0.8890098929405212,
"learning_rate": 3.961797195560118e-05,
"loss": 0.9311,
"num_input_tokens_seen": 348944,
"step": 300
},
{
"epoch": 0.9179834462001505,
"grad_norm": 0.9356483221054077,
"learning_rate": 3.9296316285007887e-05,
"loss": 0.9114,
"num_input_tokens_seen": 354680,
"step": 305
},
{
"epoch": 0.9330323551542513,
"grad_norm": 0.8241044878959656,
"learning_rate": 3.897110482344024e-05,
"loss": 0.9674,
"num_input_tokens_seen": 361008,
"step": 310
},
{
"epoch": 0.9480812641083521,
"grad_norm": 0.7882922887802124,
"learning_rate": 3.864241845774746e-05,
"loss": 0.9582,
"num_input_tokens_seen": 366760,
"step": 315
},
{
"epoch": 0.963130173062453,
"grad_norm": 0.7503064274787903,
"learning_rate": 3.8310338939059644e-05,
"loss": 0.9863,
"num_input_tokens_seen": 372448,
"step": 320
},
{
"epoch": 0.9781790820165538,
"grad_norm": 0.6487952470779419,
"learning_rate": 3.797494886245456e-05,
"loss": 0.906,
"num_input_tokens_seen": 378520,
"step": 325
},
{
"epoch": 0.9932279909706546,
"grad_norm": 0.8584316968917847,
"learning_rate": 3.7636331646414524e-05,
"loss": 0.8958,
"num_input_tokens_seen": 384272,
"step": 330
},
{
"epoch": 1.0060195635816402,
"grad_norm": 0.8825767040252686,
"learning_rate": 3.7294571512078506e-05,
"loss": 0.8349,
"num_input_tokens_seen": 389280,
"step": 335
},
{
"epoch": 1.021068472535741,
"grad_norm": 0.8422874808311462,
"learning_rate": 3.694975346229458e-05,
"loss": 0.8507,
"num_input_tokens_seen": 394944,
"step": 340
},
{
"epoch": 1.036117381489842,
"grad_norm": 0.8337146639823914,
"learning_rate": 3.6601963260477924e-05,
"loss": 0.9287,
"num_input_tokens_seen": 400800,
"step": 345
},
{
"epoch": 1.0511662904439427,
"grad_norm": 0.936469316482544,
"learning_rate": 3.625128740927971e-05,
"loss": 0.9107,
"num_input_tokens_seen": 406728,
"step": 350
},
{
"epoch": 1.0662151993980435,
"grad_norm": 0.8475446105003357,
"learning_rate": 3.589781312907207e-05,
"loss": 0.952,
"num_input_tokens_seen": 412656,
"step": 355
},
{
"epoch": 1.0812641083521444,
"grad_norm": 0.7245047092437744,
"learning_rate": 3.55416283362546e-05,
"loss": 0.9526,
"num_input_tokens_seen": 418488,
"step": 360
},
{
"epoch": 1.0963130173062452,
"grad_norm": 1.0173735618591309,
"learning_rate": 3.518282162138772e-05,
"loss": 0.8775,
"num_input_tokens_seen": 424192,
"step": 365
},
{
"epoch": 1.111361926260346,
"grad_norm": 0.9992531538009644,
"learning_rate": 3.482148222715835e-05,
"loss": 0.883,
"num_input_tokens_seen": 430312,
"step": 370
},
{
"epoch": 1.1264108352144468,
"grad_norm": 1.0938397645950317,
"learning_rate": 3.4457700026183374e-05,
"loss": 1.0032,
"num_input_tokens_seen": 436128,
"step": 375
},
{
"epoch": 1.141459744168548,
"grad_norm": 0.8988808989524841,
"learning_rate": 3.409156549865654e-05,
"loss": 0.943,
"num_input_tokens_seen": 441928,
"step": 380
},
{
"epoch": 1.1565086531226485,
"grad_norm": 0.9952559471130371,
"learning_rate": 3.3723169709844026e-05,
"loss": 0.801,
"num_input_tokens_seen": 447560,
"step": 385
},
{
"epoch": 1.1715575620767495,
"grad_norm": 0.7556662559509277,
"learning_rate": 3.335260428743475e-05,
"loss": 0.9294,
"num_input_tokens_seen": 453296,
"step": 390
},
{
"epoch": 1.1866064710308502,
"grad_norm": 0.8362197279930115,
"learning_rate": 3.297996139875055e-05,
"loss": 0.9528,
"num_input_tokens_seen": 459336,
"step": 395
},
{
"epoch": 1.2016553799849512,
"grad_norm": 0.9389665722846985,
"learning_rate": 3.260533372782234e-05,
"loss": 0.8981,
"num_input_tokens_seen": 464944,
"step": 400
},
{
"epoch": 1.2167042889390518,
"grad_norm": 1.1821860074996948,
"learning_rate": 3.222881445233759e-05,
"loss": 0.9823,
"num_input_tokens_seen": 470992,
"step": 405
},
{
"epoch": 1.2317531978931529,
"grad_norm": 1.0015898942947388,
"learning_rate": 3.185049722046516e-05,
"loss": 0.9047,
"num_input_tokens_seen": 476216,
"step": 410
},
{
"epoch": 1.2468021068472535,
"grad_norm": 0.8765709400177002,
"learning_rate": 3.147047612756302e-05,
"loss": 0.8582,
"num_input_tokens_seen": 481824,
"step": 415
},
{
"epoch": 1.2618510158013545,
"grad_norm": 0.9712916612625122,
"learning_rate": 3.10888456927748e-05,
"loss": 0.8787,
"num_input_tokens_seen": 487576,
"step": 420
},
{
"epoch": 1.276899924755455,
"grad_norm": 1.1555066108703613,
"learning_rate": 3.0705700835520895e-05,
"loss": 0.8729,
"num_input_tokens_seen": 493336,
"step": 425
},
{
"epoch": 1.2919488337095562,
"grad_norm": 1.1198400259017944,
"learning_rate": 3.0321136851890036e-05,
"loss": 0.8772,
"num_input_tokens_seen": 499760,
"step": 430
},
{
"epoch": 1.3069977426636568,
"grad_norm": 1.1468943357467651,
"learning_rate": 2.9935249390937183e-05,
"loss": 0.9451,
"num_input_tokens_seen": 505400,
"step": 435
},
{
"epoch": 1.3220466516177578,
"grad_norm": 0.8468641042709351,
"learning_rate": 2.9548134430893604e-05,
"loss": 0.8202,
"num_input_tokens_seen": 511760,
"step": 440
},
{
"epoch": 1.3370955605718584,
"grad_norm": 1.3206151723861694,
"learning_rate": 2.9159888255295116e-05,
"loss": 0.9773,
"num_input_tokens_seen": 517616,
"step": 445
},
{
"epoch": 1.3521444695259595,
"grad_norm": 1.1996040344238281,
"learning_rate": 2.8770607429034352e-05,
"loss": 0.9101,
"num_input_tokens_seen": 522744,
"step": 450
},
{
"epoch": 1.36719337848006,
"grad_norm": 1.1539313793182373,
"learning_rate": 2.8380388774343047e-05,
"loss": 0.9633,
"num_input_tokens_seen": 528648,
"step": 455
},
{
"epoch": 1.382242287434161,
"grad_norm": 1.021848440170288,
"learning_rate": 2.7989329346710375e-05,
"loss": 0.8886,
"num_input_tokens_seen": 534000,
"step": 460
},
{
"epoch": 1.3972911963882617,
"grad_norm": 0.8612179160118103,
"learning_rate": 2.759752641074322e-05,
"loss": 0.9258,
"num_input_tokens_seen": 539688,
"step": 465
},
{
"epoch": 1.4123401053423628,
"grad_norm": 1.0109293460845947,
"learning_rate": 2.7205077415974416e-05,
"loss": 0.9039,
"num_input_tokens_seen": 545112,
"step": 470
},
{
"epoch": 1.4273890142964636,
"grad_norm": 1.1920832395553589,
"learning_rate": 2.6812079972625077e-05,
"loss": 1.0116,
"num_input_tokens_seen": 551328,
"step": 475
},
{
"epoch": 1.4424379232505644,
"grad_norm": 1.0512142181396484,
"learning_rate": 2.6418631827326857e-05,
"loss": 0.8218,
"num_input_tokens_seen": 556816,
"step": 480
},
{
"epoch": 1.4574868322046652,
"grad_norm": 1.146946907043457,
"learning_rate": 2.602483083881035e-05,
"loss": 0.8604,
"num_input_tokens_seen": 562552,
"step": 485
},
{
"epoch": 1.472535741158766,
"grad_norm": 1.1064790487289429,
"learning_rate": 2.563077495356561e-05,
"loss": 0.8044,
"num_input_tokens_seen": 568480,
"step": 490
},
{
"epoch": 1.487584650112867,
"grad_norm": 0.9678347110748291,
"learning_rate": 2.5236562181480794e-05,
"loss": 0.9198,
"num_input_tokens_seen": 574072,
"step": 495
},
{
"epoch": 1.5026335590669677,
"grad_norm": 0.9460956454277039,
"learning_rate": 2.484229057146507e-05,
"loss": 0.9181,
"num_input_tokens_seen": 580040,
"step": 500
},
{
"epoch": 1.5176824680210683,
"grad_norm": 1.175920844078064,
"learning_rate": 2.4448058187061835e-05,
"loss": 0.8644,
"num_input_tokens_seen": 586128,
"step": 505
},
{
"epoch": 1.5327313769751694,
"grad_norm": 1.2150397300720215,
"learning_rate": 2.4053963082058244e-05,
"loss": 1.0127,
"num_input_tokens_seen": 592256,
"step": 510
},
{
"epoch": 1.54778028592927,
"grad_norm": 0.9520708918571472,
"learning_rate": 2.3660103276097232e-05,
"loss": 0.7937,
"num_input_tokens_seen": 597704,
"step": 515
},
{
"epoch": 1.562829194883371,
"grad_norm": 1.0742231607437134,
"learning_rate": 2.3266576730297956e-05,
"loss": 0.9806,
"num_input_tokens_seen": 603240,
"step": 520
},
{
"epoch": 1.5778781038374716,
"grad_norm": 1.0484352111816406,
"learning_rate": 2.2873481322890862e-05,
"loss": 0.934,
"num_input_tokens_seen": 609616,
"step": 525
},
{
"epoch": 1.5929270127915727,
"grad_norm": 0.8829598426818848,
"learning_rate": 2.2480914824873297e-05,
"loss": 0.9288,
"num_input_tokens_seen": 615520,
"step": 530
},
{
"epoch": 1.6079759217456733,
"grad_norm": 0.9222884178161621,
"learning_rate": 2.2088974875691863e-05,
"loss": 0.8597,
"num_input_tokens_seen": 621208,
"step": 535
},
{
"epoch": 1.6230248306997743,
"grad_norm": 0.894801914691925,
"learning_rate": 2.1697758958957448e-05,
"loss": 0.8817,
"num_input_tokens_seen": 627176,
"step": 540
},
{
"epoch": 1.6380737396538751,
"grad_norm": 1.1703195571899414,
"learning_rate": 2.1307364378199005e-05,
"loss": 0.777,
"num_input_tokens_seen": 633248,
"step": 545
},
{
"epoch": 1.653122648607976,
"grad_norm": 1.0596733093261719,
"learning_rate": 2.0917888232662196e-05,
"loss": 0.798,
"num_input_tokens_seen": 639000,
"step": 550
},
{
"epoch": 1.6681715575620768,
"grad_norm": 1.0426228046417236,
"learning_rate": 2.0529427393158705e-05,
"loss": 0.9104,
"num_input_tokens_seen": 645280,
"step": 555
},
{
"epoch": 1.6832204665161776,
"grad_norm": 1.3300392627716064,
"learning_rate": 2.014207847797256e-05,
"loss": 0.8293,
"num_input_tokens_seen": 651760,
"step": 560
},
{
"epoch": 1.6982693754702785,
"grad_norm": 1.2664028406143188,
"learning_rate": 1.9755937828829067e-05,
"loss": 0.8821,
"num_input_tokens_seen": 657272,
"step": 565
},
{
"epoch": 1.7133182844243793,
"grad_norm": 0.9889734983444214,
"learning_rate": 1.937110148693265e-05,
"loss": 0.8253,
"num_input_tokens_seen": 663336,
"step": 570
},
{
"epoch": 1.72836719337848,
"grad_norm": 1.0789241790771484,
"learning_rate": 1.8987665169079454e-05,
"loss": 0.9391,
"num_input_tokens_seen": 668936,
"step": 575
},
{
"epoch": 1.743416102332581,
"grad_norm": 1.2337504625320435,
"learning_rate": 1.8605724243850502e-05,
"loss": 0.8711,
"num_input_tokens_seen": 675000,
"step": 580
},
{
"epoch": 1.7584650112866818,
"grad_norm": 0.905838668346405,
"learning_rate": 1.822537370789163e-05,
"loss": 0.8346,
"num_input_tokens_seen": 680584,
"step": 585
},
{
"epoch": 1.7735139202407826,
"grad_norm": 1.1633321046829224,
"learning_rate": 1.7846708162285785e-05,
"loss": 0.8275,
"num_input_tokens_seen": 686416,
"step": 590
},
{
"epoch": 1.7885628291948834,
"grad_norm": 0.9946597814559937,
"learning_rate": 1.7469821789023815e-05,
"loss": 0.9435,
"num_input_tokens_seen": 692016,
"step": 595
},
{
"epoch": 1.8036117381489842,
"grad_norm": 1.0259568691253662,
"learning_rate": 1.70948083275794e-05,
"loss": 0.8584,
"num_input_tokens_seen": 697984,
"step": 600
},
{
"epoch": 1.818660647103085,
"grad_norm": 1.0644334554672241,
"learning_rate": 1.672176105159417e-05,
"loss": 0.88,
"num_input_tokens_seen": 704056,
"step": 605
},
{
"epoch": 1.8337095560571859,
"grad_norm": 1.0443474054336548,
"learning_rate": 1.635077274567854e-05,
"loss": 0.8825,
"num_input_tokens_seen": 709760,
"step": 610
},
{
"epoch": 1.8487584650112867,
"grad_norm": 1.0267105102539062,
"learning_rate": 1.5981935682334264e-05,
"loss": 0.9978,
"num_input_tokens_seen": 715872,
"step": 615
},
{
"epoch": 1.8638073739653875,
"grad_norm": 1.3127869367599487,
"learning_rate": 1.561534159900441e-05,
"loss": 0.9626,
"num_input_tokens_seen": 722184,
"step": 620
},
{
"epoch": 1.8788562829194884,
"grad_norm": 1.2093840837478638,
"learning_rate": 1.525108167525624e-05,
"loss": 0.9308,
"num_input_tokens_seen": 727776,
"step": 625
},
{
"epoch": 1.8939051918735892,
"grad_norm": 0.982764482498169,
"learning_rate": 1.4889246510103077e-05,
"loss": 0.9757,
"num_input_tokens_seen": 733760,
"step": 630
},
{
"epoch": 1.90895410082769,
"grad_norm": 1.111680507659912,
"learning_rate": 1.4529926099470348e-05,
"loss": 0.767,
"num_input_tokens_seen": 740024,
"step": 635
},
{
"epoch": 1.9240030097817908,
"grad_norm": 1.218017578125,
"learning_rate": 1.4173209813811788e-05,
"loss": 0.9272,
"num_input_tokens_seen": 745480,
"step": 640
},
{
"epoch": 1.9390519187358917,
"grad_norm": 1.3443623781204224,
"learning_rate": 1.381918637588112e-05,
"loss": 0.7941,
"num_input_tokens_seen": 751384,
"step": 645
},
{
"epoch": 1.9541008276899925,
"grad_norm": 0.9702039361000061,
"learning_rate": 1.3467943838664863e-05,
"loss": 0.8408,
"num_input_tokens_seen": 756920,
"step": 650
},
{
"epoch": 1.9691497366440933,
"grad_norm": 1.1215064525604248,
"learning_rate": 1.311956956348177e-05,
"loss": 0.8459,
"num_input_tokens_seen": 762424,
"step": 655
},
{
"epoch": 1.9841986455981941,
"grad_norm": 1.3830626010894775,
"learning_rate": 1.277415019825417e-05,
"loss": 1.0117,
"num_input_tokens_seen": 768224,
"step": 660
},
{
"epoch": 1.999247554552295,
"grad_norm": 1.028895616531372,
"learning_rate": 1.2431771655956925e-05,
"loss": 0.9665,
"num_input_tokens_seen": 773568,
"step": 665
},
{
"epoch": 2.0120391271632805,
"grad_norm": 1.1555911302566528,
"learning_rate": 1.2092519093248988e-05,
"loss": 0.7625,
"num_input_tokens_seen": 778672,
"step": 670
},
{
"epoch": 2.0270880361173815,
"grad_norm": 1.037429690361023,
"learning_rate": 1.1756476889293269e-05,
"loss": 0.8667,
"num_input_tokens_seen": 784488,
"step": 675
},
{
"epoch": 2.042136945071482,
"grad_norm": 1.053051471710205,
"learning_rate": 1.1423728624769695e-05,
"loss": 0.8297,
"num_input_tokens_seen": 790304,
"step": 680
},
{
"epoch": 2.057185854025583,
"grad_norm": 1.0523649454116821,
"learning_rate": 1.1094357061087033e-05,
"loss": 0.8774,
"num_input_tokens_seen": 796192,
"step": 685
},
{
"epoch": 2.072234762979684,
"grad_norm": 1.0367976427078247,
"learning_rate": 1.0768444119798357e-05,
"loss": 0.8476,
"num_input_tokens_seen": 802144,
"step": 690
},
{
"epoch": 2.087283671933785,
"grad_norm": 1.4130756855010986,
"learning_rate": 1.0446070862225463e-05,
"loss": 0.8641,
"num_input_tokens_seen": 807768,
"step": 695
},
{
"epoch": 2.1023325808878854,
"grad_norm": 1.1584120988845825,
"learning_rate": 1.0127317469297277e-05,
"loss": 0.8383,
"num_input_tokens_seen": 813712,
"step": 700
},
{
"epoch": 2.1173814898419865,
"grad_norm": 1.2318339347839355,
"learning_rate": 9.812263221607112e-06,
"loss": 0.9123,
"num_input_tokens_seen": 819360,
"step": 705
},
{
"epoch": 2.132430398796087,
"grad_norm": 1.6237512826919556,
"learning_rate": 9.500986479694036e-06,
"loss": 0.9635,
"num_input_tokens_seen": 824584,
"step": 710
},
{
"epoch": 2.147479307750188,
"grad_norm": 1.106604814529419,
"learning_rate": 9.19356466455287e-06,
"loss": 0.9221,
"num_input_tokens_seen": 830600,
"step": 715
},
{
"epoch": 2.1625282167042887,
"grad_norm": 0.8615310788154602,
"learning_rate": 8.890074238378074e-06,
"loss": 0.8757,
"num_input_tokens_seen": 836856,
"step": 720
},
{
"epoch": 2.17757712565839,
"grad_norm": 0.8537486791610718,
"learning_rate": 8.590590685545946e-06,
"loss": 0.7958,
"num_input_tokens_seen": 842872,
"step": 725
},
{
"epoch": 2.1926260346124904,
"grad_norm": 0.8556107878684998,
"learning_rate": 8.295188493840104e-06,
"loss": 0.7993,
"num_input_tokens_seen": 848664,
"step": 730
},
{
"epoch": 2.2076749435665914,
"grad_norm": 1.093944787979126,
"learning_rate": 8.003941135924858e-06,
"loss": 0.8436,
"num_input_tokens_seen": 854712,
"step": 735
},
{
"epoch": 2.222723852520692,
"grad_norm": 1.2639975547790527,
"learning_rate": 7.71692105107098e-06,
"loss": 0.896,
"num_input_tokens_seen": 860648,
"step": 740
},
{
"epoch": 2.237772761474793,
"grad_norm": 1.177778720855713,
"learning_rate": 7.434199627138602e-06,
"loss": 0.8948,
"num_input_tokens_seen": 866080,
"step": 745
},
{
"epoch": 2.2528216704288937,
"grad_norm": 0.9701932668685913,
"learning_rate": 7.155847182821523e-06,
"loss": 0.8546,
"num_input_tokens_seen": 871560,
"step": 750
},
{
"epoch": 2.2678705793829947,
"grad_norm": 1.0232161283493042,
"learning_rate": 6.881932950157538e-06,
"loss": 0.8494,
"num_input_tokens_seen": 877568,
"step": 755
},
{
"epoch": 2.282919488337096,
"grad_norm": 1.119441270828247,
"learning_rate": 6.612525057308949e-06,
"loss": 0.7723,
"num_input_tokens_seen": 883808,
"step": 760
},
{
"epoch": 2.2979683972911964,
"grad_norm": 1.5488731861114502,
"learning_rate": 6.347690511617693e-06,
"loss": 0.9168,
"num_input_tokens_seen": 889296,
"step": 765
},
{
"epoch": 2.313017306245297,
"grad_norm": 1.2143895626068115,
"learning_rate": 6.0874951829392234e-06,
"loss": 0.8831,
"num_input_tokens_seen": 895120,
"step": 770
},
{
"epoch": 2.328066215199398,
"grad_norm": 1.157663106918335,
"learning_rate": 5.832003787259327e-06,
"loss": 0.854,
"num_input_tokens_seen": 900320,
"step": 775
},
{
"epoch": 2.343115124153499,
"grad_norm": 1.4496403932571411,
"learning_rate": 5.581279870597867e-06,
"loss": 0.8843,
"num_input_tokens_seen": 905928,
"step": 780
},
{
"epoch": 2.3581640331075997,
"grad_norm": 0.8820686936378479,
"learning_rate": 5.335385793203604e-06,
"loss": 0.862,
"num_input_tokens_seen": 911976,
"step": 785
},
{
"epoch": 2.3732129420617003,
"grad_norm": 1.622916579246521,
"learning_rate": 5.094382714043907e-06,
"loss": 0.985,
"num_input_tokens_seen": 917840,
"step": 790
},
{
"epoch": 2.3882618510158014,
"grad_norm": 1.0603710412979126,
"learning_rate": 4.85833057559322e-06,
"loss": 0.7679,
"num_input_tokens_seen": 923168,
"step": 795
},
{
"epoch": 2.4033107599699024,
"grad_norm": 1.0989526510238647,
"learning_rate": 4.627288088924156e-06,
"loss": 0.8198,
"num_input_tokens_seen": 928720,
"step": 800
},
{
"epoch": 2.418359668924003,
"grad_norm": 0.9745952486991882,
"learning_rate": 4.401312719104802e-06,
"loss": 0.7773,
"num_input_tokens_seen": 934568,
"step": 805
},
{
"epoch": 2.4334085778781036,
"grad_norm": 1.529707670211792,
"learning_rate": 4.180460670905978e-06,
"loss": 0.9312,
"num_input_tokens_seen": 940264,
"step": 810
},
{
"epoch": 2.4484574868322047,
"grad_norm": 1.2537649869918823,
"learning_rate": 3.964786874821955e-06,
"loss": 0.8497,
"num_input_tokens_seen": 946128,
"step": 815
},
{
"epoch": 2.4635063957863057,
"grad_norm": 1.0871232748031616,
"learning_rate": 3.754344973408064e-06,
"loss": 0.782,
"num_input_tokens_seen": 952032,
"step": 820
},
{
"epoch": 2.4785553047404063,
"grad_norm": 1.2940268516540527,
"learning_rate": 3.5491873079387256e-06,
"loss": 0.8937,
"num_input_tokens_seen": 957960,
"step": 825
},
{
"epoch": 2.493604213694507,
"grad_norm": 1.2327598333358765,
"learning_rate": 3.3493649053890326e-06,
"loss": 0.7039,
"num_input_tokens_seen": 964336,
"step": 830
},
{
"epoch": 2.508653122648608,
"grad_norm": 1.516093373298645,
"learning_rate": 3.1549274657433375e-06,
"loss": 0.9265,
"num_input_tokens_seen": 970168,
"step": 835
},
{
"epoch": 2.523702031602709,
"grad_norm": 1.1418204307556152,
"learning_rate": 2.9659233496337786e-06,
"loss": 0.8669,
"num_input_tokens_seen": 975752,
"step": 840
},
{
"epoch": 2.5387509405568096,
"grad_norm": 1.3584462404251099,
"learning_rate": 2.7823995663120327e-06,
"loss": 0.9174,
"num_input_tokens_seen": 981672,
"step": 845
},
{
"epoch": 2.55379984951091,
"grad_norm": 1.1911269426345825,
"learning_rate": 2.6044017619571065e-06,
"loss": 0.8718,
"num_input_tokens_seen": 987560,
"step": 850
},
{
"epoch": 2.5688487584650113,
"grad_norm": 1.3048710823059082,
"learning_rate": 2.431974208322191e-06,
"loss": 0.8634,
"num_input_tokens_seen": 993200,
"step": 855
},
{
"epoch": 2.5838976674191123,
"grad_norm": 1.1356749534606934,
"learning_rate": 2.265159791723373e-06,
"loss": 0.845,
"num_input_tokens_seen": 999192,
"step": 860
},
{
"epoch": 2.598946576373213,
"grad_norm": 1.2655149698257446,
"learning_rate": 2.104000002372886e-06,
"loss": 0.8008,
"num_input_tokens_seen": 1004576,
"step": 865
},
{
"epoch": 2.6139954853273135,
"grad_norm": 1.354706048965454,
"learning_rate": 1.9485349240596613e-06,
"loss": 0.8797,
"num_input_tokens_seen": 1010352,
"step": 870
},
{
"epoch": 2.6290443942814146,
"grad_norm": 1.0957777500152588,
"learning_rate": 1.7988032241796376e-06,
"loss": 0.946,
"num_input_tokens_seen": 1016272,
"step": 875
},
{
"epoch": 2.6440933032355156,
"grad_norm": 1.3322904109954834,
"learning_rate": 1.6548421441183875e-06,
"loss": 0.8032,
"num_input_tokens_seen": 1021896,
"step": 880
},
{
"epoch": 2.659142212189616,
"grad_norm": 1.1363080739974976,
"learning_rate": 1.5166874899884053e-06,
"loss": 0.8892,
"num_input_tokens_seen": 1027704,
"step": 885
},
{
"epoch": 2.674191121143717,
"grad_norm": 1.2706754207611084,
"learning_rate": 1.3843736237233784e-06,
"loss": 0.856,
"num_input_tokens_seen": 1033800,
"step": 890
},
{
"epoch": 2.689240030097818,
"grad_norm": 1.1934438943862915,
"learning_rate": 1.2579334545316733e-06,
"loss": 0.8617,
"num_input_tokens_seen": 1040008,
"step": 895
},
{
"epoch": 2.704288939051919,
"grad_norm": 1.4581674337387085,
"learning_rate": 1.137398430711123e-06,
"loss": 0.9117,
"num_input_tokens_seen": 1046272,
"step": 900
},
{
"epoch": 2.7193378480060195,
"grad_norm": 1.080992579460144,
"learning_rate": 1.0227985318271682e-06,
"loss": 0.7855,
"num_input_tokens_seen": 1052032,
"step": 905
},
{
"epoch": 2.73438675696012,
"grad_norm": 1.0012861490249634,
"learning_rate": 9.141622612563571e-07,
"loss": 0.8212,
"num_input_tokens_seen": 1057584,
"step": 910
},
{
"epoch": 2.749435665914221,
"grad_norm": 1.1472314596176147,
"learning_rate": 8.115166390969125e-07,
"loss": 0.8404,
"num_input_tokens_seen": 1063760,
"step": 915
},
{
"epoch": 2.764484574868322,
"grad_norm": 1.2558523416519165,
"learning_rate": 7.148871954483105e-07,
"loss": 0.7782,
"num_input_tokens_seen": 1069544,
"step": 920
},
{
"epoch": 2.779533483822423,
"grad_norm": 1.1380338668823242,
"learning_rate": 6.242979640613933e-07,
"loss": 0.7847,
"num_input_tokens_seen": 1075472,
"step": 925
},
{
"epoch": 2.7945823927765234,
"grad_norm": 0.972878098487854,
"learning_rate": 5.397714763606843e-07,
"loss": 0.8857,
"num_input_tokens_seen": 1081464,
"step": 930
},
{
"epoch": 2.8096313017306245,
"grad_norm": 1.2546579837799072,
"learning_rate": 4.613287558403512e-07,
"loss": 0.8029,
"num_input_tokens_seen": 1087464,
"step": 935
},
{
"epoch": 2.8246802106847255,
"grad_norm": 1.1165034770965576,
"learning_rate": 3.8898931283523344e-07,
"loss": 0.8154,
"num_input_tokens_seen": 1092888,
"step": 940
},
{
"epoch": 2.839729119638826,
"grad_norm": 1.3924362659454346,
"learning_rate": 3.227711396682015e-07,
"loss": 0.8791,
"num_input_tokens_seen": 1098808,
"step": 945
},
{
"epoch": 2.854778028592927,
"grad_norm": 1.021448016166687,
"learning_rate": 2.626907061751116e-07,
"loss": 0.787,
"num_input_tokens_seen": 1104688,
"step": 950
},
{
"epoch": 2.869826937547028,
"grad_norm": 1.3344382047653198,
"learning_rate": 2.0876295560839364e-07,
"loss": 0.8831,
"num_input_tokens_seen": 1110960,
"step": 955
},
{
"epoch": 2.884875846501129,
"grad_norm": 1.3956490755081177,
"learning_rate": 1.6100130092037703e-07,
"loss": 0.7677,
"num_input_tokens_seen": 1116800,
"step": 960
},
{
"epoch": 2.8999247554552294,
"grad_norm": 1.1644206047058105,
"learning_rate": 1.194176214271897e-07,
"loss": 0.7567,
"num_input_tokens_seen": 1122248,
"step": 965
},
{
"epoch": 2.9149736644093305,
"grad_norm": 1.2540746927261353,
"learning_rate": 8.402225985413848e-08,
"loss": 0.8944,
"num_input_tokens_seen": 1127928,
"step": 970
},
{
"epoch": 2.930022573363431,
"grad_norm": 1.1684881448745728,
"learning_rate": 5.4824019763252685e-08,
"loss": 0.9737,
"num_input_tokens_seen": 1133336,
"step": 975
},
{
"epoch": 2.945071482317532,
"grad_norm": 1.072198510169983,
"learning_rate": 3.1830163363655296e-08,
"loss": 0.8965,
"num_input_tokens_seen": 1139048,
"step": 980
},
{
"epoch": 2.9601203912716327,
"grad_norm": 1.7171086072921753,
"learning_rate": 1.504640970531046e-08,
"loss": 0.837,
"num_input_tokens_seen": 1144456,
"step": 985
},
{
"epoch": 2.975169300225734,
"grad_norm": 1.4984806776046753,
"learning_rate": 4.4769332565558485e-09,
"loss": 0.7812,
"num_input_tokens_seen": 1150160,
"step": 990
},
{
"epoch": 2.9902182091798344,
"grad_norm": 1.2322272062301636,
"learning_rate": 1.2436286584982527e-10,
"loss": 0.8613,
"num_input_tokens_seen": 1156704,
"step": 995
},
{
"epoch": 2.9932279909706545,
"num_input_tokens_seen": 1157808,
"step": 996,
"total_flos": 1.3788411572404224e+16,
"train_loss": 0.939127180590687,
"train_runtime": 10484.6402,
"train_samples_per_second": 0.761,
"train_steps_per_second": 0.095
}
],
"logging_steps": 5,
"max_steps": 996,
"num_input_tokens_seen": 1157808,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3788411572404224e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}