{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999849510910459,
"eval_steps": 1000,
"global_step": 3322,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030097817908201654,
"grad_norm": 0.5666791796684265,
"learning_rate": 6.006006006006006e-06,
"loss": 1.0909,
"step": 10
},
{
"epoch": 0.006019563581640331,
"grad_norm": 0.5926806926727295,
"learning_rate": 1.2012012012012012e-05,
"loss": 1.0813,
"step": 20
},
{
"epoch": 0.009029345372460496,
"grad_norm": 0.5680918097496033,
"learning_rate": 1.801801801801802e-05,
"loss": 1.051,
"step": 30
},
{
"epoch": 0.012039127163280662,
"grad_norm": 0.4019787907600403,
"learning_rate": 2.4024024024024024e-05,
"loss": 0.9406,
"step": 40
},
{
"epoch": 0.015048908954100828,
"grad_norm": 0.289296418428421,
"learning_rate": 3.0030030030030033e-05,
"loss": 0.9232,
"step": 50
},
{
"epoch": 0.01805869074492099,
"grad_norm": 0.23350511491298676,
"learning_rate": 3.603603603603604e-05,
"loss": 0.8762,
"step": 60
},
{
"epoch": 0.021068472535741158,
"grad_norm": 0.17287714779376984,
"learning_rate": 4.204204204204204e-05,
"loss": 0.8496,
"step": 70
},
{
"epoch": 0.024078254326561323,
"grad_norm": 0.16750749945640564,
"learning_rate": 4.804804804804805e-05,
"loss": 0.8424,
"step": 80
},
{
"epoch": 0.02708803611738149,
"grad_norm": 0.15370824933052063,
"learning_rate": 5.405405405405406e-05,
"loss": 0.8601,
"step": 90
},
{
"epoch": 0.030097817908201655,
"grad_norm": 0.16766728460788727,
"learning_rate": 6.0060060060060066e-05,
"loss": 0.8179,
"step": 100
},
{
"epoch": 0.03310759969902182,
"grad_norm": 0.2106652557849884,
"learning_rate": 6.606606606606607e-05,
"loss": 0.8094,
"step": 110
},
{
"epoch": 0.03611738148984198,
"grad_norm": 0.22345463931560516,
"learning_rate": 7.207207207207208e-05,
"loss": 0.8096,
"step": 120
},
{
"epoch": 0.03912716328066215,
"grad_norm": 0.23186658322811127,
"learning_rate": 7.807807807807808e-05,
"loss": 0.8195,
"step": 130
},
{
"epoch": 0.042136945071482315,
"grad_norm": 0.3011253774166107,
"learning_rate": 8.408408408408409e-05,
"loss": 0.8184,
"step": 140
},
{
"epoch": 0.045146726862302484,
"grad_norm": 0.21401682496070862,
"learning_rate": 9.009009009009009e-05,
"loss": 0.8,
"step": 150
},
{
"epoch": 0.04815650865312265,
"grad_norm": 0.2243528664112091,
"learning_rate": 9.60960960960961e-05,
"loss": 0.7739,
"step": 160
},
{
"epoch": 0.051166290443942816,
"grad_norm": 0.46177345514297485,
"learning_rate": 0.00010210210210210212,
"loss": 0.8172,
"step": 170
},
{
"epoch": 0.05417607223476298,
"grad_norm": 0.2084399163722992,
"learning_rate": 0.00010810810810810812,
"loss": 0.8181,
"step": 180
},
{
"epoch": 0.05718585402558315,
"grad_norm": 0.24123747646808624,
"learning_rate": 0.00011411411411411413,
"loss": 0.8032,
"step": 190
},
{
"epoch": 0.06019563581640331,
"grad_norm": 0.23546789586544037,
"learning_rate": 0.00012012012012012013,
"loss": 0.7775,
"step": 200
},
{
"epoch": 0.06320541760722348,
"grad_norm": 0.23672956228256226,
"learning_rate": 0.00012612612612612612,
"loss": 0.821,
"step": 210
},
{
"epoch": 0.06621519939804364,
"grad_norm": 0.22630153596401215,
"learning_rate": 0.00013213213213213214,
"loss": 0.7892,
"step": 220
},
{
"epoch": 0.0692249811888638,
"grad_norm": 0.2221691757440567,
"learning_rate": 0.00013813813813813813,
"loss": 0.7932,
"step": 230
},
{
"epoch": 0.07223476297968397,
"grad_norm": 0.2853372097015381,
"learning_rate": 0.00014414414414414415,
"loss": 0.7993,
"step": 240
},
{
"epoch": 0.07524454477050414,
"grad_norm": 0.24529512226581573,
"learning_rate": 0.00015015015015015014,
"loss": 0.8281,
"step": 250
},
{
"epoch": 0.0782543265613243,
"grad_norm": 0.20747515559196472,
"learning_rate": 0.00015615615615615616,
"loss": 0.7695,
"step": 260
},
{
"epoch": 0.08126410835214447,
"grad_norm": 0.21736431121826172,
"learning_rate": 0.00016216216216216218,
"loss": 0.7863,
"step": 270
},
{
"epoch": 0.08427389014296463,
"grad_norm": 0.25259608030319214,
"learning_rate": 0.00016816816816816817,
"loss": 0.7616,
"step": 280
},
{
"epoch": 0.0872836719337848,
"grad_norm": 0.19296181201934814,
"learning_rate": 0.0001741741741741742,
"loss": 0.7707,
"step": 290
},
{
"epoch": 0.09029345372460497,
"grad_norm": 0.26890674233436584,
"learning_rate": 0.00018018018018018018,
"loss": 0.7744,
"step": 300
},
{
"epoch": 0.09330323551542513,
"grad_norm": 0.2053564041852951,
"learning_rate": 0.0001861861861861862,
"loss": 0.743,
"step": 310
},
{
"epoch": 0.0963130173062453,
"grad_norm": 0.20555047690868378,
"learning_rate": 0.0001921921921921922,
"loss": 0.8089,
"step": 320
},
{
"epoch": 0.09932279909706546,
"grad_norm": 0.2425510585308075,
"learning_rate": 0.0001981981981981982,
"loss": 0.8012,
"step": 330
},
{
"epoch": 0.10233258088788563,
"grad_norm": 0.2260735034942627,
"learning_rate": 0.00019999729347501484,
"loss": 0.7843,
"step": 340
},
{
"epoch": 0.1053423626787058,
"grad_norm": 0.21512670814990997,
"learning_rate": 0.0001999840373787939,
"loss": 0.7992,
"step": 350
},
{
"epoch": 0.10835214446952596,
"grad_norm": 0.22542227804660797,
"learning_rate": 0.0001999597360570722,
"loss": 0.772,
"step": 360
},
{
"epoch": 0.11136192626034612,
"grad_norm": 0.15299277007579803,
"learning_rate": 0.0001999243921944139,
"loss": 0.7511,
"step": 370
},
{
"epoch": 0.1143717080511663,
"grad_norm": 0.27176716923713684,
"learning_rate": 0.00019987800969525164,
"loss": 0.7735,
"step": 380
},
{
"epoch": 0.11738148984198646,
"grad_norm": 0.28357240557670593,
"learning_rate": 0.00019982059368345496,
"loss": 0.7729,
"step": 390
},
{
"epoch": 0.12039127163280662,
"grad_norm": 0.18754735589027405,
"learning_rate": 0.00019975215050176433,
"loss": 0.795,
"step": 400
},
{
"epoch": 0.12340105342362678,
"grad_norm": 0.15982478857040405,
"learning_rate": 0.00019967268771109035,
"loss": 0.7671,
"step": 410
},
{
"epoch": 0.12641083521444696,
"grad_norm": 0.18540535867214203,
"learning_rate": 0.00019958221408967875,
"loss": 0.7491,
"step": 420
},
{
"epoch": 0.1294206170052671,
"grad_norm": 0.4123195707798004,
"learning_rate": 0.00019948073963214043,
"loss": 0.7766,
"step": 430
},
{
"epoch": 0.13243039879608728,
"grad_norm": 0.1608065813779831,
"learning_rate": 0.00019936827554834738,
"loss": 0.7904,
"step": 440
},
{
"epoch": 0.13544018058690746,
"grad_norm": 0.21908819675445557,
"learning_rate": 0.00019924483426219452,
"loss": 0.7816,
"step": 450
},
{
"epoch": 0.1384499623777276,
"grad_norm": 0.17256338894367218,
"learning_rate": 0.00019911042941022695,
"loss": 0.7982,
"step": 460
},
{
"epoch": 0.14145974416854779,
"grad_norm": 0.15286709368228912,
"learning_rate": 0.00019896507584013376,
"loss": 0.793,
"step": 470
},
{
"epoch": 0.14446952595936793,
"grad_norm": 0.1949639767408371,
"learning_rate": 0.00019880878960910772,
"loss": 0.745,
"step": 480
},
{
"epoch": 0.1474793077501881,
"grad_norm": 0.22766350209712982,
"learning_rate": 0.00019864158798207137,
"loss": 0.765,
"step": 490
},
{
"epoch": 0.1504890895410083,
"grad_norm": 0.17246346175670624,
"learning_rate": 0.0001984634894297699,
"loss": 0.7494,
"step": 500
},
{
"epoch": 0.15349887133182843,
"grad_norm": 0.18809442222118378,
"learning_rate": 0.00019827451362673052,
"loss": 0.7906,
"step": 510
},
{
"epoch": 0.1565086531226486,
"grad_norm": 0.287610799074173,
"learning_rate": 0.00019807468144908928,
"loss": 0.7991,
"step": 520
},
{
"epoch": 0.1595184349134688,
"grad_norm": 0.2345225214958191,
"learning_rate": 0.00019786401497228466,
"loss": 0.7593,
"step": 530
},
{
"epoch": 0.16252821670428894,
"grad_norm": 0.24265146255493164,
"learning_rate": 0.00019764253746861886,
"loss": 0.7966,
"step": 540
},
{
"epoch": 0.1655379984951091,
"grad_norm": 0.1692132204771042,
"learning_rate": 0.00019741027340468715,
"loss": 0.7525,
"step": 550
},
{
"epoch": 0.16854778028592926,
"grad_norm": 0.18456414341926575,
"learning_rate": 0.00019716724843867487,
"loss": 0.7706,
"step": 560
},
{
"epoch": 0.17155756207674944,
"grad_norm": 0.2189481407403946,
"learning_rate": 0.000196913489417523,
"loss": 0.7683,
"step": 570
},
{
"epoch": 0.1745673438675696,
"grad_norm": 0.24484622478485107,
"learning_rate": 0.00019664902437396245,
"loss": 0.821,
"step": 580
},
{
"epoch": 0.17757712565838976,
"grad_norm": 0.1760026067495346,
"learning_rate": 0.00019637388252341715,
"loss": 0.7744,
"step": 590
},
{
"epoch": 0.18058690744920994,
"grad_norm": 0.18553385138511658,
"learning_rate": 0.00019608809426077678,
"loss": 0.7607,
"step": 600
},
{
"epoch": 0.1835966892400301,
"grad_norm": 0.23498745262622833,
"learning_rate": 0.000195791691157039,
"loss": 0.7707,
"step": 610
},
{
"epoch": 0.18660647103085026,
"grad_norm": 0.20103472471237183,
"learning_rate": 0.00019548470595582166,
"loss": 0.7487,
"step": 620
},
{
"epoch": 0.18961625282167044,
"grad_norm": 0.18829959630966187,
"learning_rate": 0.00019516717256974592,
"loss": 0.7653,
"step": 630
},
{
"epoch": 0.1926260346124906,
"grad_norm": 0.24024085700511932,
"learning_rate": 0.00019483912607668965,
"loss": 0.7918,
"step": 640
},
{
"epoch": 0.19563581640331076,
"grad_norm": 0.18580889701843262,
"learning_rate": 0.00019450060271591243,
"loss": 0.8022,
"step": 650
},
{
"epoch": 0.1986455981941309,
"grad_norm": 0.2515565752983093,
"learning_rate": 0.0001941516398840524,
"loss": 0.7548,
"step": 660
},
{
"epoch": 0.2016553799849511,
"grad_norm": 0.18456920981407166,
"learning_rate": 0.00019379227613099473,
"loss": 0.7903,
"step": 670
},
{
"epoch": 0.20466516177577126,
"grad_norm": 0.23653873801231384,
"learning_rate": 0.00019342255115561337,
"loss": 0.7917,
"step": 680
},
{
"epoch": 0.2076749435665914,
"grad_norm": 0.26416492462158203,
"learning_rate": 0.00019304250580138524,
"loss": 0.7879,
"step": 690
},
{
"epoch": 0.2106847253574116,
"grad_norm": 0.22712339460849762,
"learning_rate": 0.0001926521820518784,
"loss": 0.8014,
"step": 700
},
{
"epoch": 0.21369450714823177,
"grad_norm": 0.17389413714408875,
"learning_rate": 0.00019225162302611412,
"loss": 0.7403,
"step": 710
},
{
"epoch": 0.21670428893905191,
"grad_norm": 0.23703350126743317,
"learning_rate": 0.00019184087297380344,
"loss": 0.7795,
"step": 720
},
{
"epoch": 0.2197140707298721,
"grad_norm": 0.20507369935512543,
"learning_rate": 0.000191419977270459,
"loss": 0.8047,
"step": 730
},
{
"epoch": 0.22272385252069224,
"grad_norm": 0.1989058554172516,
"learning_rate": 0.0001909889824123824,
"loss": 0.7786,
"step": 740
},
{
"epoch": 0.22573363431151242,
"grad_norm": 0.1992356926202774,
"learning_rate": 0.00019054793601152773,
"loss": 0.7352,
"step": 750
},
{
"epoch": 0.2287434161023326,
"grad_norm": 0.1841665804386139,
"learning_rate": 0.0001900968867902419,
"loss": 0.778,
"step": 760
},
{
"epoch": 0.23175319789315274,
"grad_norm": 0.1729171723127365,
"learning_rate": 0.00018963588457588228,
"loss": 0.7776,
"step": 770
},
{
"epoch": 0.23476297968397292,
"grad_norm": 0.18624809384346008,
"learning_rate": 0.00018916498029531223,
"loss": 0.7841,
"step": 780
},
{
"epoch": 0.23777276147479307,
"grad_norm": 0.19757080078125,
"learning_rate": 0.00018868422596927535,
"loss": 0.7628,
"step": 790
},
{
"epoch": 0.24078254326561324,
"grad_norm": 0.1995578110218048,
"learning_rate": 0.00018819367470664862,
"loss": 0.7403,
"step": 800
},
{
"epoch": 0.24379232505643342,
"grad_norm": 0.2151431441307068,
"learning_rate": 0.00018769338069857548,
"loss": 0.7581,
"step": 810
},
{
"epoch": 0.24680210684725357,
"grad_norm": 0.2272290140390396,
"learning_rate": 0.00018718339921247945,
"loss": 0.7914,
"step": 820
},
{
"epoch": 0.24981188863807374,
"grad_norm": 0.1463918834924698,
"learning_rate": 0.0001866637865859586,
"loss": 0.7953,
"step": 830
},
{
"epoch": 0.2528216704288939,
"grad_norm": 0.2657776176929474,
"learning_rate": 0.00018613460022056215,
"loss": 0.7576,
"step": 840
},
{
"epoch": 0.2558314522197141,
"grad_norm": 0.2566029131412506,
"learning_rate": 0.000185595898575449,
"loss": 0.7508,
"step": 850
},
{
"epoch": 0.2588412340105342,
"grad_norm": 0.2356068342924118,
"learning_rate": 0.00018504774116093008,
"loss": 0.7332,
"step": 860
},
{
"epoch": 0.2618510158013544,
"grad_norm": 0.20745734870433807,
"learning_rate": 0.00018449018853189403,
"loss": 0.756,
"step": 870
},
{
"epoch": 0.26486079759217457,
"grad_norm": 0.1839013397693634,
"learning_rate": 0.0001839233022811179,
"loss": 0.7776,
"step": 880
},
{
"epoch": 0.26787057938299474,
"grad_norm": 0.238608717918396,
"learning_rate": 0.00018334714503246273,
"loss": 0.7771,
"step": 890
},
{
"epoch": 0.2708803611738149,
"grad_norm": 0.17599129676818848,
"learning_rate": 0.00018276178043395586,
"loss": 0.7544,
"step": 900
},
{
"epoch": 0.27389014296463504,
"grad_norm": 0.23767121136188507,
"learning_rate": 0.00018216727315075945,
"loss": 0.7946,
"step": 910
},
{
"epoch": 0.2768999247554552,
"grad_norm": 0.180665984749794,
"learning_rate": 0.00018156368885802695,
"loss": 0.8202,
"step": 920
},
{
"epoch": 0.2799097065462754,
"grad_norm": 0.20493340492248535,
"learning_rate": 0.00018095109423364817,
"loss": 0.7823,
"step": 930
},
{
"epoch": 0.28291948833709557,
"grad_norm": 0.29762348532676697,
"learning_rate": 0.0001803295569508832,
"loss": 0.7637,
"step": 940
},
{
"epoch": 0.28592927012791575,
"grad_norm": 0.2135084718465805,
"learning_rate": 0.0001796991456708866,
"loss": 0.768,
"step": 950
},
{
"epoch": 0.28893905191873587,
"grad_norm": 0.21105533838272095,
"learning_rate": 0.0001790599300351225,
"loss": 0.7492,
"step": 960
},
{
"epoch": 0.29194883370955604,
"grad_norm": 0.2163008451461792,
"learning_rate": 0.00017841198065767107,
"loss": 0.758,
"step": 970
},
{
"epoch": 0.2949586155003762,
"grad_norm": 0.16340641677379608,
"learning_rate": 0.00017775536911742806,
"loss": 0.7739,
"step": 980
},
{
"epoch": 0.2979683972911964,
"grad_norm": 0.20751433074474335,
"learning_rate": 0.00017709016795019742,
"loss": 0.7692,
"step": 990
},
{
"epoch": 0.3009781790820166,
"grad_norm": 0.1802573949098587,
"learning_rate": 0.00017641645064067816,
"loss": 0.7886,
"step": 1000
},
{
"epoch": 0.3009781790820166,
"eval_loss": 0.8210044503211975,
"eval_runtime": 143.0609,
"eval_samples_per_second": 39.116,
"eval_steps_per_second": 4.893,
"step": 1000
},
{
"epoch": 0.3039879608728367,
"grad_norm": 0.17218339443206787,
"learning_rate": 0.0001757342916143466,
"loss": 0.7595,
"step": 1010
},
{
"epoch": 0.30699774266365687,
"grad_norm": 0.3174282908439636,
"learning_rate": 0.00017504376622923465,
"loss": 0.7821,
"step": 1020
},
{
"epoch": 0.31000752445447705,
"grad_norm": 0.1862519532442093,
"learning_rate": 0.00017434495076760483,
"loss": 0.7982,
"step": 1030
},
{
"epoch": 0.3130173062452972,
"grad_norm": 0.19561271369457245,
"learning_rate": 0.00017363792242752353,
"loss": 0.7422,
"step": 1040
},
{
"epoch": 0.3160270880361174,
"grad_norm": 0.24750228226184845,
"learning_rate": 0.000172922759314333,
"loss": 0.7425,
"step": 1050
},
{
"epoch": 0.3190368698269376,
"grad_norm": 0.22299301624298096,
"learning_rate": 0.0001721995404320228,
"loss": 0.7392,
"step": 1060
},
{
"epoch": 0.3220466516177577,
"grad_norm": 0.38551756739616394,
"learning_rate": 0.0001714683456745026,
"loss": 0.7913,
"step": 1070
},
{
"epoch": 0.32505643340857787,
"grad_norm": 0.19068562984466553,
"learning_rate": 0.00017072925581677594,
"loss": 0.7368,
"step": 1080
},
{
"epoch": 0.32806621519939805,
"grad_norm": 0.14717283844947815,
"learning_rate": 0.0001699823525060174,
"loss": 0.7938,
"step": 1090
},
{
"epoch": 0.3310759969902182,
"grad_norm": 0.20433548092842102,
"learning_rate": 0.00016922771825255263,
"loss": 0.7672,
"step": 1100
},
{
"epoch": 0.3340857787810384,
"grad_norm": 0.24607650935649872,
"learning_rate": 0.0001684654364207438,
"loss": 0.7971,
"step": 1110
},
{
"epoch": 0.3370955605718585,
"grad_norm": 0.1704825758934021,
"learning_rate": 0.00016769559121978026,
"loss": 0.7283,
"step": 1120
},
{
"epoch": 0.3401053423626787,
"grad_norm": 0.18218447268009186,
"learning_rate": 0.0001669182676943757,
"loss": 0.7405,
"step": 1130
},
{
"epoch": 0.3431151241534989,
"grad_norm": 0.25802308320999146,
"learning_rate": 0.0001661335517153737,
"loss": 0.7821,
"step": 1140
},
{
"epoch": 0.34612490594431905,
"grad_norm": 0.2192961871623993,
"learning_rate": 0.00016534152997026125,
"loss": 0.7392,
"step": 1150
},
{
"epoch": 0.3491346877351392,
"grad_norm": 0.19438660144805908,
"learning_rate": 0.00016454228995359252,
"loss": 0.7928,
"step": 1160
},
{
"epoch": 0.35214446952595935,
"grad_norm": 0.16023948788642883,
"learning_rate": 0.00016373591995732338,
"loss": 0.7542,
"step": 1170
},
{
"epoch": 0.3551542513167795,
"grad_norm": 0.32932865619659424,
"learning_rate": 0.0001629225090610577,
"loss": 0.7766,
"step": 1180
},
{
"epoch": 0.3581640331075997,
"grad_norm": 0.282248854637146,
"learning_rate": 0.00016210214712220687,
"loss": 0.7528,
"step": 1190
},
{
"epoch": 0.3611738148984199,
"grad_norm": 0.2124871164560318,
"learning_rate": 0.00016127492476606308,
"loss": 0.7874,
"step": 1200
},
{
"epoch": 0.36418359668924005,
"grad_norm": 0.23827847838401794,
"learning_rate": 0.00016044093337578815,
"loss": 0.7599,
"step": 1210
},
{
"epoch": 0.3671933784800602,
"grad_norm": 0.22505390644073486,
"learning_rate": 0.00015960026508231824,
"loss": 0.7707,
"step": 1220
},
{
"epoch": 0.37020316027088035,
"grad_norm": 0.17258504033088684,
"learning_rate": 0.00015875301275418638,
"loss": 0.8102,
"step": 1230
},
{
"epoch": 0.3732129420617005,
"grad_norm": 0.24378369748592377,
"learning_rate": 0.00015789926998726315,
"loss": 0.7388,
"step": 1240
},
{
"epoch": 0.3762227238525207,
"grad_norm": 0.22882601618766785,
"learning_rate": 0.00015703913109441713,
"loss": 0.7707,
"step": 1250
},
{
"epoch": 0.3792325056433409,
"grad_norm": 0.20203706622123718,
"learning_rate": 0.0001561726910950962,
"loss": 0.7444,
"step": 1260
},
{
"epoch": 0.382242287434161,
"grad_norm": 0.25836071372032166,
"learning_rate": 0.00015530004570483093,
"loss": 0.7838,
"step": 1270
},
{
"epoch": 0.3852520692249812,
"grad_norm": 0.16395032405853271,
"learning_rate": 0.00015442129132466054,
"loss": 0.7281,
"step": 1280
},
{
"epoch": 0.38826185101580135,
"grad_norm": 0.19731546938419342,
"learning_rate": 0.00015353652503048384,
"loss": 0.7471,
"step": 1290
},
{
"epoch": 0.3912716328066215,
"grad_norm": 0.23747088015079498,
"learning_rate": 0.00015264584456233502,
"loss": 0.7469,
"step": 1300
},
{
"epoch": 0.3942814145974417,
"grad_norm": 0.222218319773674,
"learning_rate": 0.0001517493483135864,
"loss": 0.7945,
"step": 1310
},
{
"epoch": 0.3972911963882618,
"grad_norm": 0.21308743953704834,
"learning_rate": 0.00015084713532007905,
"loss": 0.7659,
"step": 1320
},
{
"epoch": 0.400300978179082,
"grad_norm": 0.25059592723846436,
"learning_rate": 0.00014993930524918208,
"loss": 0.7718,
"step": 1330
},
{
"epoch": 0.4033107599699022,
"grad_norm": 0.2029498815536499,
"learning_rate": 0.00014902595838878256,
"loss": 0.759,
"step": 1340
},
{
"epoch": 0.40632054176072235,
"grad_norm": 0.26108860969543457,
"learning_rate": 0.0001481071956362067,
"loss": 0.7568,
"step": 1350
},
{
"epoch": 0.40933032355154253,
"grad_norm": 0.16724595427513123,
"learning_rate": 0.0001471831184870737,
"loss": 0.7504,
"step": 1360
},
{
"epoch": 0.4123401053423627,
"grad_norm": 0.2351473867893219,
"learning_rate": 0.00014625382902408356,
"loss": 0.7365,
"step": 1370
},
{
"epoch": 0.4153498871331828,
"grad_norm": 0.20350322127342224,
"learning_rate": 0.00014531942990573998,
"loss": 0.7444,
"step": 1380
},
{
"epoch": 0.418359668924003,
"grad_norm": 0.2595962882041931,
"learning_rate": 0.00014438002435500979,
"loss": 0.7574,
"step": 1390
},
{
"epoch": 0.4213694507148232,
"grad_norm": 0.28216540813446045,
"learning_rate": 0.0001434357161479198,
"loss": 0.74,
"step": 1400
},
{
"epoch": 0.42437923250564336,
"grad_norm": 0.19514194130897522,
"learning_rate": 0.0001424866096020927,
"loss": 0.7761,
"step": 1410
},
{
"epoch": 0.42738901429646353,
"grad_norm": 0.2227347493171692,
"learning_rate": 0.00014153280956522322,
"loss": 0.7895,
"step": 1420
},
{
"epoch": 0.43039879608728365,
"grad_norm": 0.21557843685150146,
"learning_rate": 0.00014057442140349543,
"loss": 0.794,
"step": 1430
},
{
"epoch": 0.43340857787810383,
"grad_norm": 0.1980540156364441,
"learning_rate": 0.00013961155098994309,
"loss": 0.7471,
"step": 1440
},
{
"epoch": 0.436418359668924,
"grad_norm": 0.2989167869091034,
"learning_rate": 0.00013864430469275377,
"loss": 0.745,
"step": 1450
},
{
"epoch": 0.4394281414597442,
"grad_norm": 0.2816371023654938,
"learning_rate": 0.00013767278936351854,
"loss": 0.7392,
"step": 1460
},
{
"epoch": 0.44243792325056436,
"grad_norm": 0.20301824808120728,
"learning_rate": 0.00013669711232542776,
"loss": 0.7486,
"step": 1470
},
{
"epoch": 0.4454477050413845,
"grad_norm": 0.18164554238319397,
"learning_rate": 0.00013571738136141555,
"loss": 0.7571,
"step": 1480
},
{
"epoch": 0.44845748683220465,
"grad_norm": 0.20167161524295807,
"learning_rate": 0.0001347337047022526,
"loss": 0.76,
"step": 1490
},
{
"epoch": 0.45146726862302483,
"grad_norm": 0.18913504481315613,
"learning_rate": 0.00013374619101459012,
"loss": 0.7444,
"step": 1500
},
{
"epoch": 0.454477050413845,
"grad_norm": 0.2532965838909149,
"learning_rate": 0.00013275494938895556,
"loss": 0.7755,
"step": 1510
},
{
"epoch": 0.4574868322046652,
"grad_norm": 0.16350123286247253,
"learning_rate": 0.00013176008932770113,
"loss": 0.755,
"step": 1520
},
{
"epoch": 0.4604966139954853,
"grad_norm": 0.18730787932872772,
"learning_rate": 0.00013076172073290724,
"loss": 0.7369,
"step": 1530
},
{
"epoch": 0.4635063957863055,
"grad_norm": 0.21689856052398682,
"learning_rate": 0.00012975995389424166,
"loss": 0.7773,
"step": 1540
},
{
"epoch": 0.46651617757712566,
"grad_norm": 0.21407072246074677,
"learning_rate": 0.0001287548994767758,
"loss": 0.7287,
"step": 1550
},
{
"epoch": 0.46952595936794583,
"grad_norm": 0.2083187699317932,
"learning_rate": 0.00012774666850875942,
"loss": 0.7717,
"step": 1560
},
{
"epoch": 0.472535741158766,
"grad_norm": 0.2223493456840515,
"learning_rate": 0.00012673537236935556,
"loss": 0.7613,
"step": 1570
},
{
"epoch": 0.47554552294958613,
"grad_norm": 0.2172088325023651,
"learning_rate": 0.00012572112277633649,
"loss": 0.7602,
"step": 1580
},
{
"epoch": 0.4785553047404063,
"grad_norm": 0.22197513282299042,
"learning_rate": 0.0001247040317737419,
"loss": 0.7241,
"step": 1590
},
{
"epoch": 0.4815650865312265,
"grad_norm": 0.1837187558412552,
"learning_rate": 0.00012368421171950192,
"loss": 0.7313,
"step": 1600
},
{
"epoch": 0.48457486832204666,
"grad_norm": 0.23295271396636963,
"learning_rate": 0.00012266177527302472,
"loss": 0.7432,
"step": 1610
},
{
"epoch": 0.48758465011286684,
"grad_norm": 0.21737614274024963,
"learning_rate": 0.0001216368353827508,
"loss": 0.7599,
"step": 1620
},
{
"epoch": 0.49059443190368696,
"grad_norm": 0.1803632527589798,
"learning_rate": 0.00012060950527367603,
"loss": 0.7386,
"step": 1630
},
{
"epoch": 0.49360421369450713,
"grad_norm": 0.1795688271522522,
"learning_rate": 0.00011957989843484345,
"loss": 0.7548,
"step": 1640
},
{
"epoch": 0.4966139954853273,
"grad_norm": 0.2615033984184265,
"learning_rate": 0.00011854812860680613,
"loss": 0.7838,
"step": 1650
},
{
"epoch": 0.4996237772761475,
"grad_norm": 0.22325466573238373,
"learning_rate": 0.00011751430976906233,
"loss": 0.7492,
"step": 1660
},
{
"epoch": 0.5026335590669676,
"grad_norm": 0.20187042653560638,
"learning_rate": 0.00011647855612746423,
"loss": 0.7897,
"step": 1670
},
{
"epoch": 0.5056433408577878,
"grad_norm": 0.3646783232688904,
"learning_rate": 0.00011544098210160152,
"loss": 0.7847,
"step": 1680
},
{
"epoch": 0.508653122648608,
"grad_norm": 0.23362590372562408,
"learning_rate": 0.00011440170231216154,
"loss": 0.7624,
"step": 1690
},
{
"epoch": 0.5116629044394282,
"grad_norm": 0.2684471011161804,
"learning_rate": 0.00011336083156826722,
"loss": 0.7973,
"step": 1700
},
{
"epoch": 0.5146726862302483,
"grad_norm": 0.15820720791816711,
"learning_rate": 0.00011231848485479395,
"loss": 0.7322,
"step": 1710
},
{
"epoch": 0.5176824680210684,
"grad_norm": 0.3005324900150299,
"learning_rate": 0.00011127477731966735,
"loss": 0.7449,
"step": 1720
},
{
"epoch": 0.5206922498118887,
"grad_norm": 0.19756928086280823,
"learning_rate": 0.00011022982426114292,
"loss": 0.7988,
"step": 1730
},
{
"epoch": 0.5237020316027088,
"grad_norm": 0.20102205872535706,
"learning_rate": 0.00010918374111506893,
"loss": 0.7273,
"step": 1740
},
{
"epoch": 0.526711813393529,
"grad_norm": 0.2193961888551712,
"learning_rate": 0.00010813664344213427,
"loss": 0.7367,
"step": 1750
},
{
"epoch": 0.5297215951843491,
"grad_norm": 0.2881150245666504,
"learning_rate": 0.00010708864691510254,
"loss": 0.7702,
"step": 1760
},
{
"epoch": 0.5327313769751693,
"grad_norm": 0.24800467491149902,
"learning_rate": 0.00010603986730603368,
"loss": 0.7853,
"step": 1770
},
{
"epoch": 0.5357411587659895,
"grad_norm": 0.18105538189411163,
"learning_rate": 0.00010499042047349455,
"loss": 0.7576,
"step": 1780
},
{
"epoch": 0.5387509405568096,
"grad_norm": 0.16151605546474457,
"learning_rate": 0.00010394042234976016,
"loss": 0.7363,
"step": 1790
},
{
"epoch": 0.5417607223476298,
"grad_norm": 0.2590145170688629,
"learning_rate": 0.00010288998892800657,
"loss": 0.7501,
"step": 1800
},
{
"epoch": 0.54477050413845,
"grad_norm": 0.22977079451084137,
"learning_rate": 0.0001018392362494972,
"loss": 0.7768,
"step": 1810
},
{
"epoch": 0.5477802859292701,
"grad_norm": 0.23535679280757904,
"learning_rate": 0.00010078828039076367,
"loss": 0.7803,
"step": 1820
},
{
"epoch": 0.5507900677200903,
"grad_norm": 0.43004941940307617,
"learning_rate": 9.973723745078296e-05,
"loss": 0.7686,
"step": 1830
},
{
"epoch": 0.5537998495109104,
"grad_norm": 0.25901973247528076,
"learning_rate": 9.868622353815188e-05,
"loss": 0.7623,
"step": 1840
},
{
"epoch": 0.5568096313017307,
"grad_norm": 0.2264987975358963,
"learning_rate": 9.763535475826054e-05,
"loss": 0.7439,
"step": 1850
},
{
"epoch": 0.5598194130925508,
"grad_norm": 0.24374960362911224,
"learning_rate": 9.658474720046637e-05,
"loss": 0.7825,
"step": 1860
},
{
"epoch": 0.5628291948833709,
"grad_norm": 0.20084655284881592,
"learning_rate": 9.553451692526954e-05,
"loss": 0.7802,
"step": 1870
},
{
"epoch": 0.5658389766741911,
"grad_norm": 0.1831783950328827,
"learning_rate": 9.448477995149182e-05,
"loss": 0.7328,
"step": 1880
},
{
"epoch": 0.5688487584650113,
"grad_norm": 0.18079884350299835,
"learning_rate": 9.343565224346013e-05,
"loss": 0.7464,
"step": 1890
},
{
"epoch": 0.5718585402558315,
"grad_norm": 0.248003751039505,
"learning_rate": 9.238724969819579e-05,
"loss": 0.7387,
"step": 1900
},
{
"epoch": 0.5748683220466516,
"grad_norm": 0.18932189047336578,
"learning_rate": 9.13396881326115e-05,
"loss": 0.7136,
"step": 1910
},
{
"epoch": 0.5778781038374717,
"grad_norm": 0.20788271725177765,
"learning_rate": 9.029308327071702e-05,
"loss": 0.7702,
"step": 1920
},
{
"epoch": 0.580887885628292,
"grad_norm": 0.19441328942775726,
"learning_rate": 8.924755073083517e-05,
"loss": 0.7901,
"step": 1930
},
{
"epoch": 0.5838976674191121,
"grad_norm": 0.21718983352184296,
"learning_rate": 8.820320601282949e-05,
"loss": 0.7771,
"step": 1940
},
{
"epoch": 0.5869074492099323,
"grad_norm": 0.19140474498271942,
"learning_rate": 8.71601644853449e-05,
"loss": 0.7601,
"step": 1950
},
{
"epoch": 0.5899172310007524,
"grad_norm": 0.26861098408699036,
"learning_rate": 8.61185413730631e-05,
"loss": 0.7575,
"step": 1960
},
{
"epoch": 0.5929270127915726,
"grad_norm": 0.19967325031757355,
"learning_rate": 8.507845174397357e-05,
"loss": 0.8136,
"step": 1970
},
{
"epoch": 0.5959367945823928,
"grad_norm": 0.19900915026664734,
"learning_rate": 8.404001049666211e-05,
"loss": 0.7205,
"step": 1980
},
{
"epoch": 0.5989465763732129,
"grad_norm": 0.25308361649513245,
"learning_rate": 8.300333234761787e-05,
"loss": 0.768,
"step": 1990
},
{
"epoch": 0.6019563581640331,
"grad_norm": 0.16838368773460388,
"learning_rate": 8.196853181856081e-05,
"loss": 0.7502,
"step": 2000
},
{
"epoch": 0.6019563581640331,
"eval_loss": 0.8055371046066284,
"eval_runtime": 143.0386,
"eval_samples_per_second": 39.122,
"eval_steps_per_second": 4.894,
"step": 2000
},
{
"epoch": 0.6049661399548533,
"grad_norm": 0.22218653559684753,
"learning_rate": 8.093572322379045e-05,
"loss": 0.7597,
"step": 2010
},
{
"epoch": 0.6079759217456734,
"grad_norm": 0.19360238313674927,
"learning_rate": 7.990502065755748e-05,
"loss": 0.7592,
"step": 2020
},
{
"epoch": 0.6109857035364936,
"grad_norm": 0.25584983825683594,
"learning_rate": 7.887653798145987e-05,
"loss": 0.7432,
"step": 2030
},
{
"epoch": 0.6139954853273137,
"grad_norm": 0.23388119041919708,
"learning_rate": 7.785038881186462e-05,
"loss": 0.7638,
"step": 2040
},
{
"epoch": 0.617005267118134,
"grad_norm": 0.24461683630943298,
"learning_rate": 7.682668650735645e-05,
"loss": 0.7451,
"step": 2050
},
{
"epoch": 0.6200150489089541,
"grad_norm": 0.20549526810646057,
"learning_rate": 7.580554415621522e-05,
"loss": 0.7432,
"step": 2060
},
{
"epoch": 0.6230248306997742,
"grad_norm": 0.22572945058345795,
"learning_rate": 7.478707456392302e-05,
"loss": 0.7924,
"step": 2070
},
{
"epoch": 0.6260346124905944,
"grad_norm": 0.33990249037742615,
"learning_rate": 7.377139024070254e-05,
"loss": 0.7471,
"step": 2080
},
{
"epoch": 0.6290443942814146,
"grad_norm": 0.2138250321149826,
"learning_rate": 7.275860338908815e-05,
"loss": 0.7535,
"step": 2090
},
{
"epoch": 0.6320541760722348,
"grad_norm": 0.17337530851364136,
"learning_rate": 7.174882589153076e-05,
"loss": 0.7523,
"step": 2100
},
{
"epoch": 0.6350639578630549,
"grad_norm": 0.2286282777786255,
"learning_rate": 7.07421692980384e-05,
"loss": 0.7507,
"step": 2110
},
{
"epoch": 0.6380737396538751,
"grad_norm": 0.19162622094154358,
"learning_rate": 6.973874481385312e-05,
"loss": 0.7671,
"step": 2120
},
{
"epoch": 0.6410835214446953,
"grad_norm": 0.22725722193717957,
"learning_rate": 6.873866328716614e-05,
"loss": 0.7595,
"step": 2130
},
{
"epoch": 0.6440933032355154,
"grad_norm": 0.28031495213508606,
"learning_rate": 6.774203519687265e-05,
"loss": 0.7384,
"step": 2140
},
{
"epoch": 0.6471030850263356,
"grad_norm": 0.31639155745506287,
"learning_rate": 6.674897064036706e-05,
"loss": 0.7629,
"step": 2150
},
{
"epoch": 0.6501128668171557,
"grad_norm": 0.2253294438123703,
"learning_rate": 6.575957932138057e-05,
"loss": 0.7793,
"step": 2160
},
{
"epoch": 0.653122648607976,
"grad_norm": 0.2402772158384323,
"learning_rate": 6.47739705378623e-05,
"loss": 0.7586,
"step": 2170
},
{
"epoch": 0.6561324303987961,
"grad_norm": 0.1558917760848999,
"learning_rate": 6.379225316990505e-05,
"loss": 0.7564,
"step": 2180
},
{
"epoch": 0.6591422121896162,
"grad_norm": 0.17613032460212708,
"learning_rate": 6.281453566771735e-05,
"loss": 0.7354,
"step": 2190
},
{
"epoch": 0.6621519939804364,
"grad_norm": 0.28283971548080444,
"learning_rate": 6.184092603964308e-05,
"loss": 0.8203,
"step": 2200
},
{
"epoch": 0.6651617757712566,
"grad_norm": 0.2295006364583969,
"learning_rate": 6.087153184022969e-05,
"loss": 0.7466,
"step": 2210
},
{
"epoch": 0.6681715575620768,
"grad_norm": 0.24166414141654968,
"learning_rate": 5.990646015834668e-05,
"loss": 0.76,
"step": 2220
},
{
"epoch": 0.6711813393528969,
"grad_norm": 0.5943595170974731,
"learning_rate": 5.894581760535549e-05,
"loss": 0.7662,
"step": 2230
},
{
"epoch": 0.674191121143717,
"grad_norm": 0.23785801231861115,
"learning_rate": 5.798971030333227e-05,
"loss": 0.7742,
"step": 2240
},
{
"epoch": 0.6772009029345373,
"grad_norm": 0.20206815004348755,
"learning_rate": 5.703824387334442e-05,
"loss": 0.7837,
"step": 2250
},
{
"epoch": 0.6802106847253574,
"grad_norm": 0.21082885563373566,
"learning_rate": 5.609152342378278e-05,
"loss": 0.758,
"step": 2260
},
{
"epoch": 0.6832204665161776,
"grad_norm": 0.1953180432319641,
"learning_rate": 5.514965353875019e-05,
"loss": 0.726,
"step": 2270
},
{
"epoch": 0.6862302483069977,
"grad_norm": 0.22035697102546692,
"learning_rate": 5.4212738266508245e-05,
"loss": 0.7199,
"step": 2280
},
{
"epoch": 0.6892400300978179,
"grad_norm": 0.17208510637283325,
"learning_rate": 5.3280881107982946e-05,
"loss": 0.7379,
"step": 2290
},
{
"epoch": 0.6922498118886381,
"grad_norm": 0.18434813618659973,
"learning_rate": 5.235418500533109e-05,
"loss": 0.7574,
"step": 2300
},
{
"epoch": 0.6952595936794582,
"grad_norm": 0.35970667004585266,
"learning_rate": 5.143275233056817e-05,
"loss": 0.7737,
"step": 2310
},
{
"epoch": 0.6982693754702785,
"grad_norm": 0.19193576276302338,
"learning_rate": 5.051668487425938e-05,
"loss": 0.7268,
"step": 2320
},
{
"epoch": 0.7012791572610986,
"grad_norm": 0.21549300849437714,
"learning_rate": 4.960608383427481e-05,
"loss": 0.7542,
"step": 2330
},
{
"epoch": 0.7042889390519187,
"grad_norm": 0.1781996786594391,
"learning_rate": 4.8701049804610265e-05,
"loss": 0.7493,
"step": 2340
},
{
"epoch": 0.7072987208427389,
"grad_norm": 0.17246761918067932,
"learning_rate": 4.780168276427441e-05,
"loss": 0.7468,
"step": 2350
},
{
"epoch": 0.710308502633559,
"grad_norm": 0.22511474788188934,
"learning_rate": 4.6908082066244275e-05,
"loss": 0.7667,
"step": 2360
},
{
"epoch": 0.7133182844243793,
"grad_norm": 0.22506214678287506,
"learning_rate": 4.602034642648968e-05,
"loss": 0.788,
"step": 2370
},
{
"epoch": 0.7163280662151994,
"grad_norm": 0.23567315936088562,
"learning_rate": 4.513857391306812e-05,
"loss": 0.772,
"step": 2380
},
{
"epoch": 0.7193378480060195,
"grad_norm": 0.21594233810901642,
"learning_rate": 4.4262861935291144e-05,
"loss": 0.7257,
"step": 2390
},
{
"epoch": 0.7223476297968398,
"grad_norm": 0.2095227688550949,
"learning_rate": 4.339330723296373e-05,
"loss": 0.7456,
"step": 2400
},
{
"epoch": 0.7253574115876599,
"grad_norm": 0.2402193695306778,
"learning_rate": 4.25300058656972e-05,
"loss": 0.7722,
"step": 2410
},
{
"epoch": 0.7283671933784801,
"grad_norm": 0.2426275908946991,
"learning_rate": 4.1673053202297676e-05,
"loss": 0.7871,
"step": 2420
},
{
"epoch": 0.7313769751693002,
"grad_norm": 0.1813143640756607,
"learning_rate": 4.0822543910230674e-05,
"loss": 0.7394,
"step": 2430
},
{
"epoch": 0.7343867569601203,
"grad_norm": 0.29781273007392883,
"learning_rate": 3.997857194516319e-05,
"loss": 0.7318,
"step": 2440
},
{
"epoch": 0.7373965387509406,
"grad_norm": 0.2380327582359314,
"learning_rate": 3.914123054058446e-05,
"loss": 0.7711,
"step": 2450
},
{
"epoch": 0.7404063205417607,
"grad_norm": 0.2532583773136139,
"learning_rate": 3.831061219750636e-05,
"loss": 0.7341,
"step": 2460
},
{
"epoch": 0.7434161023325809,
"grad_norm": 0.18625672161579132,
"learning_rate": 3.7486808674245047e-05,
"loss": 0.7349,
"step": 2470
},
{
"epoch": 0.746425884123401,
"grad_norm": 0.2749151289463043,
"learning_rate": 3.666991097628416e-05,
"loss": 0.7551,
"step": 2480
},
{
"epoch": 0.7494356659142212,
"grad_norm": 0.28592735528945923,
"learning_rate": 3.586000934622166e-05,
"loss": 0.7485,
"step": 2490
},
{
"epoch": 0.7524454477050414,
"grad_norm": 0.22789210081100464,
"learning_rate": 3.5057193253800624e-05,
"loss": 0.7308,
"step": 2500
},
{
"epoch": 0.7554552294958615,
"grad_norm": 0.2588096559047699,
"learning_rate": 3.426155138602558e-05,
"loss": 0.7717,
"step": 2510
},
{
"epoch": 0.7584650112866818,
"grad_norm": 0.2197035402059555,
"learning_rate": 3.347317163736524e-05,
"loss": 0.753,
"step": 2520
},
{
"epoch": 0.7614747930775019,
"grad_norm": 0.19545480608940125,
"learning_rate": 3.269214110004293e-05,
"loss": 0.7552,
"step": 2530
},
{
"epoch": 0.764484574868322,
"grad_norm": 0.1901889145374298,
"learning_rate": 3.191854605441527e-05,
"loss": 0.7146,
"step": 2540
},
{
"epoch": 0.7674943566591422,
"grad_norm": 0.19916728138923645,
"learning_rate": 3.115247195944102e-05,
"loss": 0.7733,
"step": 2550
},
{
"epoch": 0.7705041384499624,
"grad_norm": 0.2248535454273224,
"learning_rate": 3.039400344324035e-05,
"loss": 0.7948,
"step": 2560
},
{
"epoch": 0.7735139202407826,
"grad_norm": 0.238205224275589,
"learning_rate": 2.9643224293745954e-05,
"loss": 0.7637,
"step": 2570
},
{
"epoch": 0.7765237020316027,
"grad_norm": 0.26717284321784973,
"learning_rate": 2.8900217449447074e-05,
"loss": 0.7326,
"step": 2580
},
{
"epoch": 0.7795334838224228,
"grad_norm": 0.2582661509513855,
"learning_rate": 2.8165064990227252e-05,
"loss": 0.7571,
"step": 2590
},
{
"epoch": 0.782543265613243,
"grad_norm": 0.22693775594234467,
"learning_rate": 2.7437848128296982e-05,
"loss": 0.75,
"step": 2600
},
{
"epoch": 0.7855530474040632,
"grad_norm": 0.21676841378211975,
"learning_rate": 2.6718647199222214e-05,
"loss": 0.7693,
"step": 2610
},
{
"epoch": 0.7885628291948834,
"grad_norm": 0.21357332170009613,
"learning_rate": 2.600754165304966e-05,
"loss": 0.7499,
"step": 2620
},
{
"epoch": 0.7915726109857035,
"grad_norm": 0.2448240965604782,
"learning_rate": 2.530461004553001e-05,
"loss": 0.7565,
"step": 2630
},
{
"epoch": 0.7945823927765236,
"grad_norm": 0.16155965626239777,
"learning_rate": 2.460993002943983e-05,
"loss": 0.738,
"step": 2640
},
{
"epoch": 0.7975921745673439,
"grad_norm": 0.22366459667682648,
"learning_rate": 2.3923578346003363e-05,
"loss": 0.746,
"step": 2650
},
{
"epoch": 0.800601956358164,
"grad_norm": 0.21000875532627106,
"learning_rate": 2.32456308164148e-05,
"loss": 0.7615,
"step": 2660
},
{
"epoch": 0.8036117381489842,
"grad_norm": 0.23698626458644867,
"learning_rate": 2.2576162333462402e-05,
"loss": 0.8101,
"step": 2670
},
{
"epoch": 0.8066215199398044,
"grad_norm": 0.17663483321666718,
"learning_rate": 2.191524685325512e-05,
"loss": 0.7551,
"step": 2680
},
{
"epoch": 0.8096313017306245,
"grad_norm": 0.193350687623024,
"learning_rate": 2.126295738705262e-05,
"loss": 0.7621,
"step": 2690
},
{
"epoch": 0.8126410835214447,
"grad_norm": 0.2506263256072998,
"learning_rate": 2.0619365993199747e-05,
"loss": 0.7789,
"step": 2700
},
{
"epoch": 0.8156508653122648,
"grad_norm": 0.23012906312942505,
"learning_rate": 1.9984543769166265e-05,
"loss": 0.7246,
"step": 2710
},
{
"epoch": 0.8186606471030851,
"grad_norm": 0.18312327563762665,
"learning_rate": 1.9358560843692787e-05,
"loss": 0.7295,
"step": 2720
},
{
"epoch": 0.8216704288939052,
"grad_norm": 0.22966724634170532,
"learning_rate": 1.8741486369043505e-05,
"loss": 0.7851,
"step": 2730
},
{
"epoch": 0.8246802106847254,
"grad_norm": 0.160865917801857,
"learning_rate": 1.8133388513367078e-05,
"loss": 0.7278,
"step": 2740
},
{
"epoch": 0.8276899924755455,
"grad_norm": 0.26794126629829407,
"learning_rate": 1.7534334453166068e-05,
"loss": 0.7247,
"step": 2750
},
{
"epoch": 0.8306997742663657,
"grad_norm": 0.23241771757602692,
"learning_rate": 1.6944390365875952e-05,
"loss": 0.7741,
"step": 2760
},
{
"epoch": 0.8337095560571859,
"grad_norm": 0.20465601980686188,
"learning_rate": 1.6363621422554476e-05,
"loss": 0.7267,
"step": 2770
},
{
"epoch": 0.836719337848006,
"grad_norm": 0.21765820682048798,
"learning_rate": 1.579209178068234e-05,
"loss": 0.7566,
"step": 2780
},
{
"epoch": 0.8397291196388262,
"grad_norm": 0.21495480835437775,
"learning_rate": 1.5229864577075547e-05,
"loss": 0.8005,
"step": 2790
},
{
"epoch": 0.8427389014296464,
"grad_norm": 0.19566848874092102,
"learning_rate": 1.4677001920910827e-05,
"loss": 0.7494,
"step": 2800
},
{
"epoch": 0.8457486832204665,
"grad_norm": 0.1817813217639923,
"learning_rate": 1.4133564886864381e-05,
"loss": 0.7289,
"step": 2810
},
{
"epoch": 0.8487584650112867,
"grad_norm": 0.2588653266429901,
"learning_rate": 1.3599613508364984e-05,
"loss": 0.7493,
"step": 2820
},
{
"epoch": 0.8517682468021068,
"grad_norm": 0.1954454928636551,
"learning_rate": 1.307520677096209e-05,
"loss": 0.7504,
"step": 2830
},
{
"epoch": 0.8547780285929271,
"grad_norm": 0.17466256022453308,
"learning_rate": 1.2560402605809707e-05,
"loss": 0.7705,
"step": 2840
},
{
"epoch": 0.8577878103837472,
"grad_norm": 0.2213626205921173,
"learning_rate": 1.2055257883266791e-05,
"loss": 0.7307,
"step": 2850
},
{
"epoch": 0.8607975921745673,
"grad_norm": 0.19768081605434418,
"learning_rate": 1.1559828406614714e-05,
"loss": 0.7741,
"step": 2860
},
{
"epoch": 0.8638073739653875,
"grad_norm": 0.22028213739395142,
"learning_rate": 1.1074168905892702e-05,
"loss": 0.7238,
"step": 2870
},
{
"epoch": 0.8668171557562077,
"grad_norm": 0.2675631046295166,
"learning_rate": 1.0598333031851881e-05,
"loss": 0.7315,
"step": 2880
},
{
"epoch": 0.8698269375470279,
"grad_norm": 0.20498494803905487,
"learning_rate": 1.0132373350028313e-05,
"loss": 0.751,
"step": 2890
},
{
"epoch": 0.872836719337848,
"grad_norm": 0.22774334251880646,
"learning_rate": 9.676341334936346e-06,
"loss": 0.72,
"step": 2900
},
{
"epoch": 0.8758465011286681,
"grad_norm": 0.18757307529449463,
"learning_rate": 9.230287364382007e-06,
"loss": 0.7433,
"step": 2910
},
{
"epoch": 0.8788562829194884,
"grad_norm": 0.23557019233703613,
"learning_rate": 8.794260713897862e-06,
"loss": 0.7473,
"step": 2920
},
{
"epoch": 0.8818660647103085,
"grad_norm": 0.24945016205310822,
"learning_rate": 8.368309551299536e-06,
"loss": 0.746,
"step": 2930
},
{
"epoch": 0.8848758465011287,
"grad_norm": 0.5168988704681396,
"learning_rate": 7.952480931364658e-06,
"loss": 0.7523,
"step": 2940
},
{
"epoch": 0.8878856282919488,
"grad_norm": 0.21044060587882996,
"learning_rate": 7.546820790634646e-06,
"loss": 0.7359,
"step": 2950
},
{
"epoch": 0.890895410082769,
"grad_norm": 0.2304990291595459,
"learning_rate": 7.1513739423402e-06,
"loss": 0.7201,
"step": 2960
},
{
"epoch": 0.8939051918735892,
"grad_norm": 0.5010607242584229,
"learning_rate": 6.766184071450721e-06,
"loss": 0.7619,
"step": 2970
},
{
"epoch": 0.8969149736644093,
"grad_norm": 0.1829194277524948,
"learning_rate": 6.391293729848435e-06,
"loss": 0.7654,
"step": 2980
},
{
"epoch": 0.8999247554552295,
"grad_norm": 0.4193117320537567,
"learning_rate": 6.026744331627731e-06,
"loss": 0.7416,
"step": 2990
},
{
"epoch": 0.9029345372460497,
"grad_norm": 0.2364160269498825,
"learning_rate": 5.672576148520137e-06,
"loss": 0.7516,
"step": 3000
},
{
"epoch": 0.9029345372460497,
"eval_loss": 0.7987983226776123,
"eval_runtime": 142.8043,
"eval_samples_per_second": 39.187,
"eval_steps_per_second": 4.902,
"step": 3000
},
{
"epoch": 0.9059443190368698,
"grad_norm": 0.1936904937028885,
"learning_rate": 5.328828305445477e-06,
"loss": 0.7357,
"step": 3010
},
{
"epoch": 0.90895410082769,
"grad_norm": 0.194386288523674,
"learning_rate": 4.9955387761897785e-06,
"loss": 0.7725,
"step": 3020
},
{
"epoch": 0.9119638826185101,
"grad_norm": 0.24116984009742737,
"learning_rate": 4.672744379210336e-06,
"loss": 0.776,
"step": 3030
},
{
"epoch": 0.9149736644093304,
"grad_norm": 0.1920221745967865,
"learning_rate": 4.360480773568321e-06,
"loss": 0.7323,
"step": 3040
},
{
"epoch": 0.9179834462001505,
"grad_norm": 0.19343051314353943,
"learning_rate": 4.058782454989529e-06,
"loss": 0.7436,
"step": 3050
},
{
"epoch": 0.9209932279909706,
"grad_norm": 0.2893059253692627,
"learning_rate": 3.767682752053714e-06,
"loss": 0.7538,
"step": 3060
},
{
"epoch": 0.9240030097817908,
"grad_norm": 0.23565465211868286,
"learning_rate": 3.487213822512714e-06,
"loss": 0.7535,
"step": 3070
},
{
"epoch": 0.927012791572611,
"grad_norm": 0.2228977531194687,
"learning_rate": 3.2174066497380086e-06,
"loss": 0.7572,
"step": 3080
},
{
"epoch": 0.9300225733634312,
"grad_norm": 0.20145723223686218,
"learning_rate": 2.958291039298e-06,
"loss": 0.7114,
"step": 3090
},
{
"epoch": 0.9330323551542513,
"grad_norm": 0.2287394404411316,
"learning_rate": 2.709895615665392e-06,
"loss": 0.7656,
"step": 3100
},
{
"epoch": 0.9360421369450714,
"grad_norm": 0.21397553384304047,
"learning_rate": 2.472247819055029e-06,
"loss": 0.7361,
"step": 3110
},
{
"epoch": 0.9390519187358917,
"grad_norm": 0.22272509336471558,
"learning_rate": 2.2453739023926113e-06,
"loss": 0.7789,
"step": 3120
},
{
"epoch": 0.9420617005267118,
"grad_norm": 0.2435009628534317,
"learning_rate": 2.0292989284144915e-06,
"loss": 0.7756,
"step": 3130
},
{
"epoch": 0.945071482317532,
"grad_norm": 0.17544378340244293,
"learning_rate": 1.8240467668990457e-06,
"loss": 0.7319,
"step": 3140
},
{
"epoch": 0.9480812641083521,
"grad_norm": 0.19456711411476135,
"learning_rate": 1.6296400920297383e-06,
"loss": 0.7469,
"step": 3150
},
{
"epoch": 0.9510910458991723,
"grad_norm": 0.21780775487422943,
"learning_rate": 1.4461003798903695e-06,
"loss": 0.7587,
"step": 3160
},
{
"epoch": 0.9541008276899925,
"grad_norm": 0.21393659710884094,
"learning_rate": 1.2734479060925397e-06,
"loss": 0.7504,
"step": 3170
},
{
"epoch": 0.9571106094808126,
"grad_norm": 0.21127097308635712,
"learning_rate": 1.1117017435358423e-06,
"loss": 0.7628,
"step": 3180
},
{
"epoch": 0.9601203912716328,
"grad_norm": 0.19377408921718597,
"learning_rate": 9.608797603008812e-07,
"loss": 0.7195,
"step": 3190
},
{
"epoch": 0.963130173062453,
"grad_norm": 0.27100813388824463,
"learning_rate": 8.209986176753948e-07,
"loss": 0.7476,
"step": 3200
},
{
"epoch": 0.9661399548532731,
"grad_norm": 0.1997697949409485,
"learning_rate": 6.920737683136613e-07,
"loss": 0.7578,
"step": 3210
},
{
"epoch": 0.9691497366440933,
"grad_norm": 0.3180257976055145,
"learning_rate": 5.741194545294648e-07,
"loss": 0.7537,
"step": 3220
},
{
"epoch": 0.9721595184349134,
"grad_norm": 0.20285405218601227,
"learning_rate": 4.671487067227531e-07,
"loss": 0.7226,
"step": 3230
},
{
"epoch": 0.9751693002257337,
"grad_norm": 0.1977706104516983,
"learning_rate": 3.711733419401453e-07,
"loss": 0.7487,
"step": 3240
},
{
"epoch": 0.9781790820165538,
"grad_norm": 0.2490547150373459,
"learning_rate": 2.8620396256953117e-07,
"loss": 0.773,
"step": 3250
},
{
"epoch": 0.9811888638073739,
"grad_norm": 0.18216556310653687,
"learning_rate": 2.122499551688084e-07,
"loss": 0.7348,
"step": 3260
},
{
"epoch": 0.9841986455981941,
"grad_norm": 0.20423290133476257,
"learning_rate": 1.4931948942895624e-07,
"loss": 0.7116,
"step": 3270
},
{
"epoch": 0.9872084273890143,
"grad_norm": 0.3691667914390564,
"learning_rate": 9.741951727152421e-08,
"loss": 0.7242,
"step": 3280
},
{
"epoch": 0.9902182091798345,
"grad_norm": 0.1876600682735443,
"learning_rate": 5.655577208069085e-08,
"loss": 0.755,
"step": 3290
},
{
"epoch": 0.9932279909706546,
"grad_norm": 0.23210102319717407,
"learning_rate": 2.6732768069825943e-08,
"loss": 0.7697,
"step": 3300
},
{
"epoch": 0.9962377727614747,
"grad_norm": 0.18966606259346008,
"learning_rate": 7.953799782889349e-09,
"loss": 0.7166,
"step": 3310
},
{
"epoch": 0.999247554552295,
"grad_norm": 0.5191490650177002,
"learning_rate": 2.2094173039999277e-10,
"loss": 0.7366,
"step": 3320
},
{
"epoch": 0.999849510910459,
"step": 3322,
"total_flos": 3.156428776263385e+18,
"train_loss": 0.7673604141007986,
"train_runtime": 8289.1597,
"train_samples_per_second": 12.826,
"train_steps_per_second": 0.401
}
],
"logging_steps": 10,
"max_steps": 3322,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.156428776263385e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}