haihp02's picture
Upload task output 1
0dd32d0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.997907949790795,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010460251046025104,
"grad_norm": 26.005483627319336,
"learning_rate": 9.704121543300004e-06,
"loss": 3.13,
"mean_token_accuracy": 0.5278321027755737,
"num_tokens": 168987.0,
"step": 5
},
{
"epoch": 0.02092050209205021,
"grad_norm": 2.9515199661254883,
"learning_rate": 2.1834273472425008e-05,
"loss": 0.5661,
"mean_token_accuracy": 0.8022153377532959,
"num_tokens": 336124.0,
"step": 10
},
{
"epoch": 0.03138075313807531,
"grad_norm": 0.42573603987693787,
"learning_rate": 3.3964425401550014e-05,
"loss": 0.197,
"mean_token_accuracy": 0.8857087612152099,
"num_tokens": 502882.0,
"step": 15
},
{
"epoch": 0.04184100418410042,
"grad_norm": 0.2469477653503418,
"learning_rate": 4.6094577330675016e-05,
"loss": 0.1797,
"mean_token_accuracy": 0.8904994368553162,
"num_tokens": 670415.0,
"step": 20
},
{
"epoch": 0.05230125523012552,
"grad_norm": 0.2182134985923767,
"learning_rate": 5.8224729259800026e-05,
"loss": 0.1846,
"mean_token_accuracy": 0.8807026505470276,
"num_tokens": 835982.0,
"step": 25
},
{
"epoch": 0.06276150627615062,
"grad_norm": 0.13391320407390594,
"learning_rate": 7.035488118892503e-05,
"loss": 0.1834,
"mean_token_accuracy": 0.889725637435913,
"num_tokens": 1002333.0,
"step": 30
},
{
"epoch": 0.07322175732217573,
"grad_norm": 0.08877617120742798,
"learning_rate": 8.248503311805003e-05,
"loss": 0.1698,
"mean_token_accuracy": 0.8972177863121032,
"num_tokens": 1170788.0,
"step": 35
},
{
"epoch": 0.08368200836820083,
"grad_norm": 0.13727827370166779,
"learning_rate": 8.489825351636572e-05,
"loss": 0.193,
"mean_token_accuracy": 0.8714782238006592,
"num_tokens": 1334630.0,
"step": 40
},
{
"epoch": 0.09414225941422594,
"grad_norm": 0.06334780901670456,
"learning_rate": 8.484623060641788e-05,
"loss": 0.1784,
"mean_token_accuracy": 0.8926929354667663,
"num_tokens": 1502752.0,
"step": 45
},
{
"epoch": 0.10460251046025104,
"grad_norm": 0.17026661336421967,
"learning_rate": 8.47542595000357e-05,
"loss": 0.1794,
"mean_token_accuracy": 0.8852181315422059,
"num_tokens": 1667330.0,
"step": 50
},
{
"epoch": 0.11506276150627615,
"grad_norm": 0.20800355076789856,
"learning_rate": 8.462245581861537e-05,
"loss": 0.1688,
"mean_token_accuracy": 0.8963077902793884,
"num_tokens": 1835933.0,
"step": 55
},
{
"epoch": 0.12552301255230125,
"grad_norm": 0.09019959717988968,
"learning_rate": 8.445098525904016e-05,
"loss": 0.1766,
"mean_token_accuracy": 0.8914227962493897,
"num_tokens": 2002682.0,
"step": 60
},
{
"epoch": 0.13598326359832635,
"grad_norm": 0.10760070383548737,
"learning_rate": 8.424006338537455e-05,
"loss": 0.1811,
"mean_token_accuracy": 0.8861293196678162,
"num_tokens": 2168156.0,
"step": 65
},
{
"epoch": 0.14644351464435146,
"grad_norm": 0.054771434515714645,
"learning_rate": 8.398995535786821e-05,
"loss": 0.1788,
"mean_token_accuracy": 0.8940866708755493,
"num_tokens": 2334898.0,
"step": 70
},
{
"epoch": 0.15690376569037656,
"grad_norm": 0.11987103521823883,
"learning_rate": 8.370097559960994e-05,
"loss": 0.1761,
"mean_token_accuracy": 0.8878394961357117,
"num_tokens": 2498966.0,
"step": 75
},
{
"epoch": 0.16736401673640167,
"grad_norm": 0.08062479645013809,
"learning_rate": 8.337348740125097e-05,
"loss": 0.1803,
"mean_token_accuracy": 0.8793545484542846,
"num_tokens": 2666878.0,
"step": 80
},
{
"epoch": 0.17782426778242677,
"grad_norm": 0.06624352186918259,
"learning_rate": 8.300790246429446e-05,
"loss": 0.1706,
"mean_token_accuracy": 0.8921630859375,
"num_tokens": 2837571.0,
"step": 85
},
{
"epoch": 0.18828451882845187,
"grad_norm": 0.07486005127429962,
"learning_rate": 8.260468038352525e-05,
"loss": 0.1806,
"mean_token_accuracy": 0.8846845865249634,
"num_tokens": 3004684.0,
"step": 90
},
{
"epoch": 0.19874476987447698,
"grad_norm": 0.0758485421538353,
"learning_rate": 8.216432806923077e-05,
"loss": 0.1784,
"mean_token_accuracy": 0.8852211952209472,
"num_tokens": 3171448.0,
"step": 95
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.04164596274495125,
"learning_rate": 8.168739910993914e-05,
"loss": 0.1758,
"mean_token_accuracy": 0.8886502146720886,
"num_tokens": 3338844.0,
"step": 100
},
{
"epoch": 0.2196652719665272,
"grad_norm": 0.05184919759631157,
"learning_rate": 8.117449307647588e-05,
"loss": 0.1777,
"mean_token_accuracy": 0.8897549748420716,
"num_tokens": 3504755.0,
"step": 105
},
{
"epoch": 0.2301255230125523,
"grad_norm": 0.1987534463405609,
"learning_rate": 8.062625476821391e-05,
"loss": 0.1776,
"mean_token_accuracy": 0.8872043371200562,
"num_tokens": 3672204.0,
"step": 110
},
{
"epoch": 0.2405857740585774,
"grad_norm": 0.05174088105559349,
"learning_rate": 8.004337340246469e-05,
"loss": 0.1773,
"mean_token_accuracy": 0.8921713590621948,
"num_tokens": 3838638.0,
"step": 115
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.045758504420518875,
"learning_rate": 7.942658174802911e-05,
"loss": 0.1716,
"mean_token_accuracy": 0.8969062924385071,
"num_tokens": 4007015.0,
"step": 120
},
{
"epoch": 0.2615062761506276,
"grad_norm": 0.06127613037824631,
"learning_rate": 7.8776655203998e-05,
"loss": 0.1747,
"mean_token_accuracy": 0.8894635438919067,
"num_tokens": 4173967.0,
"step": 125
},
{
"epoch": 0.2719665271966527,
"grad_norm": 0.05971810221672058,
"learning_rate": 7.809441082495976e-05,
"loss": 0.1736,
"mean_token_accuracy": 0.8915351152420044,
"num_tokens": 4341522.0,
"step": 130
},
{
"epoch": 0.2824267782426778,
"grad_norm": 0.042919233441352844,
"learning_rate": 7.738070629384086e-05,
"loss": 0.1739,
"mean_token_accuracy": 0.894553828239441,
"num_tokens": 4510338.0,
"step": 135
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.04007980227470398,
"learning_rate": 7.66364388436706e-05,
"loss": 0.1787,
"mean_token_accuracy": 0.8844435572624206,
"num_tokens": 4676920.0,
"step": 140
},
{
"epoch": 0.303347280334728,
"grad_norm": 0.06786065548658371,
"learning_rate": 7.586254412962525e-05,
"loss": 0.1753,
"mean_token_accuracy": 0.8855065107345581,
"num_tokens": 4842703.0,
"step": 145
},
{
"epoch": 0.3138075313807531,
"grad_norm": 0.05222710967063904,
"learning_rate": 7.505999505277015e-05,
"loss": 0.1815,
"mean_token_accuracy": 0.8847941517829895,
"num_tokens": 5007057.0,
"step": 150
},
{
"epoch": 0.32426778242677823,
"grad_norm": 0.04080546274781227,
"learning_rate": 7.422980053697774e-05,
"loss": 0.1808,
"mean_token_accuracy": 0.8832746624946595,
"num_tokens": 5169568.0,
"step": 155
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.06257312744855881,
"learning_rate": 7.337300426056003e-05,
"loss": 0.174,
"mean_token_accuracy": 0.8812252283096313,
"num_tokens": 5337925.0,
"step": 160
},
{
"epoch": 0.34518828451882844,
"grad_norm": 0.03837789222598076,
"learning_rate": 7.24906833442092e-05,
"loss": 0.1806,
"mean_token_accuracy": 0.8963685274124146,
"num_tokens": 5505474.0,
"step": 165
},
{
"epoch": 0.35564853556485354,
"grad_norm": 0.03723612427711487,
"learning_rate": 7.158394699689619e-05,
"loss": 0.1718,
"mean_token_accuracy": 0.8872930765151977,
"num_tokens": 5673076.0,
"step": 170
},
{
"epoch": 0.36610878661087864,
"grad_norm": 0.07191859185695648,
"learning_rate": 7.065393512142981e-05,
"loss": 0.171,
"mean_token_accuracy": 0.892008101940155,
"num_tokens": 5839360.0,
"step": 175
},
{
"epoch": 0.37656903765690375,
"grad_norm": 0.06374694406986237,
"learning_rate": 6.970181688142875e-05,
"loss": 0.1802,
"mean_token_accuracy": 0.8897754430770874,
"num_tokens": 6005773.0,
"step": 180
},
{
"epoch": 0.38702928870292885,
"grad_norm": 0.06770812720060349,
"learning_rate": 6.872878923150857e-05,
"loss": 0.171,
"mean_token_accuracy": 0.8885648608207702,
"num_tokens": 6176000.0,
"step": 185
},
{
"epoch": 0.39748953974895396,
"grad_norm": 0.10405594855546951,
"learning_rate": 6.773607541253137e-05,
"loss": 0.1723,
"mean_token_accuracy": 0.8896305203437805,
"num_tokens": 6342906.0,
"step": 190
},
{
"epoch": 0.40794979079497906,
"grad_norm": 0.0766349583864212,
"learning_rate": 6.672492341380945e-05,
"loss": 0.1788,
"mean_token_accuracy": 0.8878316879272461,
"num_tokens": 6509517.0,
"step": 195
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.04177647829055786,
"learning_rate": 6.569660440419675e-05,
"loss": 0.1743,
"mean_token_accuracy": 0.888104259967804,
"num_tokens": 6678003.0,
"step": 200
},
{
"epoch": 0.42887029288702927,
"grad_norm": 0.05627443641424179,
"learning_rate": 6.465241113403996e-05,
"loss": 0.1752,
"mean_token_accuracy": 0.8931498408317566,
"num_tokens": 6844693.0,
"step": 205
},
{
"epoch": 0.4393305439330544,
"grad_norm": 0.052942972630262375,
"learning_rate": 6.359365630999862e-05,
"loss": 0.171,
"mean_token_accuracy": 0.8925157785415649,
"num_tokens": 7011865.0,
"step": 210
},
{
"epoch": 0.4497907949790795,
"grad_norm": 0.04482652246952057,
"learning_rate": 6.252167094477696e-05,
"loss": 0.178,
"mean_token_accuracy": 0.8899808645248413,
"num_tokens": 7177778.0,
"step": 215
},
{
"epoch": 0.4602510460251046,
"grad_norm": 0.09079255163669586,
"learning_rate": 6.143780268384265e-05,
"loss": 0.1747,
"mean_token_accuracy": 0.8923329830169677,
"num_tokens": 7346200.0,
"step": 220
},
{
"epoch": 0.4707112970711297,
"grad_norm": 0.050432514399290085,
"learning_rate": 6.0343414111235234e-05,
"loss": 0.1723,
"mean_token_accuracy": 0.8974303126335144,
"num_tokens": 7516567.0,
"step": 225
},
{
"epoch": 0.4811715481171548,
"grad_norm": 0.06964699923992157,
"learning_rate": 5.923988103659495e-05,
"loss": 0.179,
"mean_token_accuracy": 0.8864234924316406,
"num_tokens": 7682943.0,
"step": 230
},
{
"epoch": 0.4916317991631799,
"grad_norm": 0.04765457287430763,
"learning_rate": 5.8128590765564834e-05,
"loss": 0.173,
"mean_token_accuracy": 0.8972329378128052,
"num_tokens": 7847531.0,
"step": 235
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.1056985929608345,
"learning_rate": 5.701094035574053e-05,
"loss": 0.1778,
"mean_token_accuracy": 0.889188575744629,
"num_tokens": 8012755.0,
"step": 240
},
{
"epoch": 0.5125523012552301,
"grad_norm": 0.08160027116537094,
"learning_rate": 5.5888334860360676e-05,
"loss": 0.1793,
"mean_token_accuracy": 0.8878963470458985,
"num_tokens": 8177170.0,
"step": 245
},
{
"epoch": 0.5230125523012552,
"grad_norm": 0.06495994329452515,
"learning_rate": 5.476218556194539e-05,
"loss": 0.1705,
"mean_token_accuracy": 0.8930387020111084,
"num_tokens": 8346248.0,
"step": 250
},
{
"epoch": 0.5334728033472803,
"grad_norm": 0.09615645557641983,
"learning_rate": 5.363390819810368e-05,
"loss": 0.1705,
"mean_token_accuracy": 0.9004521608352661,
"num_tokens": 8513864.0,
"step": 255
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.090224988758564,
"learning_rate": 5.2504921181740125e-05,
"loss": 0.1691,
"mean_token_accuracy": 0.8958656072616578,
"num_tokens": 8682760.0,
"step": 260
},
{
"epoch": 0.5543933054393305,
"grad_norm": 0.09133970737457275,
"learning_rate": 5.137664381789841e-05,
"loss": 0.1768,
"mean_token_accuracy": 0.8868197798728943,
"num_tokens": 8851699.0,
"step": 265
},
{
"epoch": 0.5648535564853556,
"grad_norm": 0.0644727423787117,
"learning_rate": 5.025049451948312e-05,
"loss": 0.1734,
"mean_token_accuracy": 0.8917650938034057,
"num_tokens": 9019980.0,
"step": 270
},
{
"epoch": 0.5753138075313807,
"grad_norm": 0.07377820461988449,
"learning_rate": 4.912788902410327e-05,
"loss": 0.1719,
"mean_token_accuracy": 0.8914459586143494,
"num_tokens": 9186970.0,
"step": 275
},
{
"epoch": 0.5857740585774058,
"grad_norm": 0.08277697116136551,
"learning_rate": 4.8010238614278966e-05,
"loss": 0.1685,
"mean_token_accuracy": 0.9000778555870056,
"num_tokens": 9355396.0,
"step": 280
},
{
"epoch": 0.5962343096234309,
"grad_norm": 0.06693063676357269,
"learning_rate": 4.689894834324886e-05,
"loss": 0.1679,
"mean_token_accuracy": 0.8984725117683411,
"num_tokens": 9522411.0,
"step": 285
},
{
"epoch": 0.606694560669456,
"grad_norm": 0.04146610572934151,
"learning_rate": 4.5795415268608574e-05,
"loss": 0.1719,
"mean_token_accuracy": 0.8927254438400268,
"num_tokens": 9691113.0,
"step": 290
},
{
"epoch": 0.6171548117154811,
"grad_norm": 0.038201991468667984,
"learning_rate": 4.470102669600116e-05,
"loss": 0.1745,
"mean_token_accuracy": 0.8879514336585999,
"num_tokens": 9858976.0,
"step": 295
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.054663266986608505,
"learning_rate": 4.361715843506684e-05,
"loss": 0.1723,
"mean_token_accuracy": 0.8976601362228394,
"num_tokens": 10024348.0,
"step": 300
},
{
"epoch": 0.6380753138075314,
"grad_norm": 0.05738237872719765,
"learning_rate": 4.254517306984519e-05,
"loss": 0.1758,
"mean_token_accuracy": 0.8909697771072388,
"num_tokens": 10191229.0,
"step": 305
},
{
"epoch": 0.6485355648535565,
"grad_norm": 0.05679866299033165,
"learning_rate": 4.148641824580384e-05,
"loss": 0.1769,
"mean_token_accuracy": 0.885345721244812,
"num_tokens": 10356265.0,
"step": 310
},
{
"epoch": 0.6589958158995816,
"grad_norm": 0.05231667682528496,
"learning_rate": 4.044222497564706e-05,
"loss": 0.17,
"mean_token_accuracy": 0.8863656997680665,
"num_tokens": 10523797.0,
"step": 315
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.047176092863082886,
"learning_rate": 3.9413905966034353e-05,
"loss": 0.1757,
"mean_token_accuracy": 0.8950721144676208,
"num_tokens": 10690980.0,
"step": 320
},
{
"epoch": 0.6799163179916318,
"grad_norm": 0.07135423272848129,
"learning_rate": 3.8402753967312435e-05,
"loss": 0.1716,
"mean_token_accuracy": 0.8992363691329956,
"num_tokens": 10856917.0,
"step": 325
},
{
"epoch": 0.6903765690376569,
"grad_norm": 0.05982697755098343,
"learning_rate": 3.741004014833522e-05,
"loss": 0.1756,
"mean_token_accuracy": 0.8893219828605652,
"num_tokens": 11026019.0,
"step": 330
},
{
"epoch": 0.700836820083682,
"grad_norm": 0.07223058491945267,
"learning_rate": 3.6437012498415074e-05,
"loss": 0.1761,
"mean_token_accuracy": 0.8935187578201294,
"num_tokens": 11191221.0,
"step": 335
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.07034377753734589,
"learning_rate": 3.548489425841399e-05,
"loss": 0.174,
"mean_token_accuracy": 0.8920967102050781,
"num_tokens": 11358663.0,
"step": 340
},
{
"epoch": 0.7217573221757322,
"grad_norm": 0.04658304527401924,
"learning_rate": 3.455488238294761e-05,
"loss": 0.1705,
"mean_token_accuracy": 0.8918556332588196,
"num_tokens": 11526106.0,
"step": 345
},
{
"epoch": 0.7322175732217573,
"grad_norm": 0.05488230288028717,
"learning_rate": 3.3648146035634615e-05,
"loss": 0.1717,
"mean_token_accuracy": 0.9017421960830688,
"num_tokens": 11693884.0,
"step": 350
},
{
"epoch": 0.7426778242677824,
"grad_norm": 0.0530734583735466,
"learning_rate": 3.2765825119283766e-05,
"loss": 0.166,
"mean_token_accuracy": 0.8987428307533264,
"num_tokens": 11859724.0,
"step": 355
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.04387292638421059,
"learning_rate": 3.190902884286605e-05,
"loss": 0.174,
"mean_token_accuracy": 0.8880913257598877,
"num_tokens": 12027250.0,
"step": 360
},
{
"epoch": 0.7635983263598326,
"grad_norm": 0.04933469370007515,
"learning_rate": 3.107883432707365e-05,
"loss": 0.1725,
"mean_token_accuracy": 0.892216432094574,
"num_tokens": 12192702.0,
"step": 365
},
{
"epoch": 0.7740585774058577,
"grad_norm": 0.06139339134097099,
"learning_rate": 3.027628525021854e-05,
"loss": 0.1736,
"mean_token_accuracy": 0.8933562159538269,
"num_tokens": 12356921.0,
"step": 370
},
{
"epoch": 0.7845188284518828,
"grad_norm": 0.06699731200933456,
"learning_rate": 2.9502390536173207e-05,
"loss": 0.175,
"mean_token_accuracy": 0.8892877340316773,
"num_tokens": 12522966.0,
"step": 375
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.05798710510134697,
"learning_rate": 2.8758123086002943e-05,
"loss": 0.1689,
"mean_token_accuracy": 0.8937100172042847,
"num_tokens": 12690457.0,
"step": 380
},
{
"epoch": 0.805439330543933,
"grad_norm": 0.06085468828678131,
"learning_rate": 2.8044418554884045e-05,
"loss": 0.1717,
"mean_token_accuracy": 0.8984749555587769,
"num_tokens": 12858447.0,
"step": 385
},
{
"epoch": 0.8158995815899581,
"grad_norm": 0.05883463844656944,
"learning_rate": 2.7362174175845794e-05,
"loss": 0.173,
"mean_token_accuracy": 0.8884599447250366,
"num_tokens": 13023325.0,
"step": 390
},
{
"epoch": 0.8263598326359832,
"grad_norm": 0.05903568118810654,
"learning_rate": 2.6712247631814693e-05,
"loss": 0.1739,
"mean_token_accuracy": 0.8846121549606323,
"num_tokens": 13190986.0,
"step": 395
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.05720416456460953,
"learning_rate": 2.609545597737912e-05,
"loss": 0.1722,
"mean_token_accuracy": 0.8995614528656006,
"num_tokens": 13356356.0,
"step": 400
},
{
"epoch": 0.8472803347280334,
"grad_norm": 0.06948366016149521,
"learning_rate": 2.5512574611629883e-05,
"loss": 0.1719,
"mean_token_accuracy": 0.8946127891540527,
"num_tokens": 13523833.0,
"step": 405
},
{
"epoch": 0.8577405857740585,
"grad_norm": 0.07984772324562073,
"learning_rate": 2.4964336303367922e-05,
"loss": 0.173,
"mean_token_accuracy": 0.8907502651214599,
"num_tokens": 13689718.0,
"step": 410
},
{
"epoch": 0.8682008368200836,
"grad_norm": 0.06918308138847351,
"learning_rate": 2.445143026990465e-05,
"loss": 0.1674,
"mean_token_accuracy": 0.8978348135948181,
"num_tokens": 13859335.0,
"step": 415
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.04599495977163315,
"learning_rate": 2.397450131061303e-05,
"loss": 0.1725,
"mean_token_accuracy": 0.8939827084541321,
"num_tokens": 14027381.0,
"step": 420
},
{
"epoch": 0.8891213389121339,
"grad_norm": 0.0907023623585701,
"learning_rate": 2.3534148996318554e-05,
"loss": 0.1742,
"mean_token_accuracy": 0.8938085794448852,
"num_tokens": 14191223.0,
"step": 425
},
{
"epoch": 0.899581589958159,
"grad_norm": 0.05372009053826332,
"learning_rate": 2.3130926915549348e-05,
"loss": 0.1722,
"mean_token_accuracy": 0.888364052772522,
"num_tokens": 14357594.0,
"step": 430
},
{
"epoch": 0.9100418410041841,
"grad_norm": 0.06079407036304474,
"learning_rate": 2.2765341978592826e-05,
"loss": 0.1673,
"mean_token_accuracy": 0.9027839779853821,
"num_tokens": 14526417.0,
"step": 435
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.04136047139763832,
"learning_rate": 2.243785378023386e-05,
"loss": 0.1662,
"mean_token_accuracy": 0.8995121359825134,
"num_tokens": 14696043.0,
"step": 440
},
{
"epoch": 0.9309623430962343,
"grad_norm": 0.06056118756532669,
"learning_rate": 2.2148874021975585e-05,
"loss": 0.1695,
"mean_token_accuracy": 0.8912408828735352,
"num_tokens": 14862935.0,
"step": 445
},
{
"epoch": 0.9414225941422594,
"grad_norm": 0.05185782536864281,
"learning_rate": 2.189876599446925e-05,
"loss": 0.1721,
"mean_token_accuracy": 0.887525987625122,
"num_tokens": 15031337.0,
"step": 450
},
{
"epoch": 0.9518828451882845,
"grad_norm": 0.04410529136657715,
"learning_rate": 2.1687844120803645e-05,
"loss": 0.1688,
"mean_token_accuracy": 0.893509566783905,
"num_tokens": 15200425.0,
"step": 455
},
{
"epoch": 0.9623430962343096,
"grad_norm": 0.050916578620672226,
"learning_rate": 2.1516373561228417e-05,
"loss": 0.1722,
"mean_token_accuracy": 0.892509925365448,
"num_tokens": 15367638.0,
"step": 460
},
{
"epoch": 0.9728033472803347,
"grad_norm": 0.04820539057254791,
"learning_rate": 2.1384569879808095e-05,
"loss": 0.1687,
"mean_token_accuracy": 0.8972745537757874,
"num_tokens": 15535531.0,
"step": 465
},
{
"epoch": 0.9832635983263598,
"grad_norm": 0.04627377539873123,
"learning_rate": 2.1292598773425907e-05,
"loss": 0.1729,
"mean_token_accuracy": 0.8949442863464355,
"num_tokens": 15702112.0,
"step": 470
},
{
"epoch": 0.9937238493723849,
"grad_norm": 0.06575358659029007,
"learning_rate": 2.1240575863478074e-05,
"loss": 0.1739,
"mean_token_accuracy": 0.8913851857185364,
"num_tokens": 15868770.0,
"step": 475
}
],
"logging_steps": 5,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.623087653111726e+17,
"train_batch_size": 28,
"trial_name": null,
"trial_params": null
}