Jais2-random / trainer_state.json
elozeiri's picture
Upload folder using huggingface_hub
ac261af verified
Invalid JSON:Unexpected token 'N', ..."al_loss": NaN, "... is not valid JSON
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9660792116732992,
"eval_steps": 1000,
"global_step": 41500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2857219433784486,
"epoch": 0.002368770134546144,
"grad_norm": 9.544422149658203,
"learning_rate": 1.1605873993368073e-06,
"loss": 2.8961,
"mean_token_accuracy": 0.6633071088790894,
"num_tokens": 1213722.0,
"step": 50
},
{
"entropy": 1.5439852488040924,
"epoch": 0.004737540269092288,
"grad_norm": 1.5218281745910645,
"learning_rate": 2.3448602558029374e-06,
"loss": 2.0745,
"mean_token_accuracy": 0.6788832449913025,
"num_tokens": 2440523.0,
"step": 100
},
{
"entropy": 1.9061457157135009,
"epoch": 0.007106310403638431,
"grad_norm": 1.1667029857635498,
"learning_rate": 3.529133112269067e-06,
"loss": 1.5243,
"mean_token_accuracy": 0.6922976732254028,
"num_tokens": 3673044.0,
"step": 150
},
{
"entropy": 1.8150238823890685,
"epoch": 0.009475080538184575,
"grad_norm": 1.093643307685852,
"learning_rate": 4.713405968735197e-06,
"loss": 1.4194,
"mean_token_accuracy": 0.7095924293994904,
"num_tokens": 4883924.0,
"step": 200
},
{
"entropy": 1.7932313251495362,
"epoch": 0.011843850672730718,
"grad_norm": 0.9533292055130005,
"learning_rate": 5.897678825201327e-06,
"loss": 1.4224,
"mean_token_accuracy": 0.7063059556484222,
"num_tokens": 6115751.0,
"step": 250
},
{
"entropy": 1.7181093657016755,
"epoch": 0.014212620807276862,
"grad_norm": 1.2956347465515137,
"learning_rate": 7.0819516816674565e-06,
"loss": 1.3637,
"mean_token_accuracy": 0.7182639849185943,
"num_tokens": 7344600.0,
"step": 300
},
{
"entropy": 1.7070274019241334,
"epoch": 0.016581390941823006,
"grad_norm": 1.0166376829147339,
"learning_rate": 8.266224538133587e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.7153780800104141,
"num_tokens": 8564072.0,
"step": 350
},
{
"entropy": 1.7232308828830718,
"epoch": 0.01895016107636915,
"grad_norm": 0.9041787981987,
"learning_rate": 9.450497394599716e-06,
"loss": 1.4065,
"mean_token_accuracy": 0.7088368773460388,
"num_tokens": 9774641.0,
"step": 400
},
{
"entropy": 1.691476699113846,
"epoch": 0.02131893121091529,
"grad_norm": 1.104785680770874,
"learning_rate": 1.0634770251065847e-05,
"loss": 1.3464,
"mean_token_accuracy": 0.7208190321922302,
"num_tokens": 11023876.0,
"step": 450
},
{
"entropy": 1.7134545636177063,
"epoch": 0.023687701345461436,
"grad_norm": 1.029800295829773,
"learning_rate": 1.1819043107531975e-05,
"loss": 1.3758,
"mean_token_accuracy": 0.7151617485284806,
"num_tokens": 12267386.0,
"step": 500
},
{
"entropy": 1.6694060420989991,
"epoch": 0.02605647148000758,
"grad_norm": 0.9847853183746338,
"learning_rate": 1.3003315963998106e-05,
"loss": 1.3374,
"mean_token_accuracy": 0.7215989363193512,
"num_tokens": 13530115.0,
"step": 550
},
{
"entropy": 1.6969332695007324,
"epoch": 0.028425241614553724,
"grad_norm": 0.9018113017082214,
"learning_rate": 1.4187588820464234e-05,
"loss": 1.3446,
"mean_token_accuracy": 0.720185512304306,
"num_tokens": 14757305.0,
"step": 600
},
{
"entropy": 1.7514292740821837,
"epoch": 0.03079401174909987,
"grad_norm": 0.803989827632904,
"learning_rate": 1.5371861676930365e-05,
"loss": 1.4127,
"mean_token_accuracy": 0.7072590082883835,
"num_tokens": 15958099.0,
"step": 650
},
{
"entropy": 1.7325732719898224,
"epoch": 0.03316278188364601,
"grad_norm": 0.865963876247406,
"learning_rate": 1.6556134533396493e-05,
"loss": 1.3412,
"mean_token_accuracy": 0.71946579515934,
"num_tokens": 17188325.0,
"step": 700
},
{
"entropy": 1.707287894487381,
"epoch": 0.03553155201819216,
"grad_norm": 0.8039044141769409,
"learning_rate": 1.7740407389862628e-05,
"loss": 1.3502,
"mean_token_accuracy": 0.7187299233675003,
"num_tokens": 18423224.0,
"step": 750
},
{
"entropy": 1.708844404220581,
"epoch": 0.0379003221527383,
"grad_norm": 0.8659459948539734,
"learning_rate": 1.8924680246328755e-05,
"loss": 1.3596,
"mean_token_accuracy": 0.7151121199131012,
"num_tokens": 19656464.0,
"step": 800
},
{
"entropy": 1.7051723492145539,
"epoch": 0.04026909228728444,
"grad_norm": 0.8470927476882935,
"learning_rate": 2.0108953102794883e-05,
"loss": 1.309,
"mean_token_accuracy": 0.723027645945549,
"num_tokens": 20908664.0,
"step": 850
},
{
"entropy": 1.6959775292873382,
"epoch": 0.04263786242183058,
"grad_norm": 1.0128496885299683,
"learning_rate": 2.1293225959261014e-05,
"loss": 1.3337,
"mean_token_accuracy": 0.720455265045166,
"num_tokens": 22165141.0,
"step": 900
},
{
"entropy": 1.7231877827644348,
"epoch": 0.04500663255637673,
"grad_norm": 0.9077188968658447,
"learning_rate": 2.2477498815727142e-05,
"loss": 1.3309,
"mean_token_accuracy": 0.7221323251724243,
"num_tokens": 23408996.0,
"step": 950
},
{
"entropy": 1.7131755888462066,
"epoch": 0.04737540269092287,
"grad_norm": 0.9538567066192627,
"learning_rate": 2.3661771672193277e-05,
"loss": 1.3434,
"mean_token_accuracy": 0.718843805193901,
"num_tokens": 24626341.0,
"step": 1000
},
{
"epoch": 0.04737540269092287,
"eval_entropy": 1.1688038776145298,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7489374687524091,
"eval_num_tokens": 24626341.0,
"eval_runtime": 739.6274,
"eval_samples_per_second": 33.551,
"eval_steps_per_second": 4.194,
"step": 1000
},
{
"entropy": 1.7510324358940124,
"epoch": 0.049744172825469016,
"grad_norm": 0.8725846409797668,
"learning_rate": 2.4846044528659405e-05,
"loss": 1.3756,
"mean_token_accuracy": 0.714400834441185,
"num_tokens": 25873918.0,
"step": 1050
},
{
"entropy": 1.7741026413440704,
"epoch": 0.05211294296001516,
"grad_norm": 0.9376819729804993,
"learning_rate": 2.6030317385125536e-05,
"loss": 1.3554,
"mean_token_accuracy": 0.7169699442386627,
"num_tokens": 27067052.0,
"step": 1100
},
{
"entropy": 1.7553015303611756,
"epoch": 0.054481713094561304,
"grad_norm": 0.7493100166320801,
"learning_rate": 2.7214590241591663e-05,
"loss": 1.3385,
"mean_token_accuracy": 0.721844300031662,
"num_tokens": 28308409.0,
"step": 1150
},
{
"entropy": 1.7428362345695496,
"epoch": 0.05685048322910745,
"grad_norm": 0.7889260053634644,
"learning_rate": 2.8398863098057795e-05,
"loss": 1.3293,
"mean_token_accuracy": 0.7205719447135925,
"num_tokens": 29542172.0,
"step": 1200
},
{
"entropy": 1.727198257446289,
"epoch": 0.05921925336365359,
"grad_norm": 0.8638942837715149,
"learning_rate": 2.9583135954523922e-05,
"loss": 1.3176,
"mean_token_accuracy": 0.7266954278945923,
"num_tokens": 30777592.0,
"step": 1250
},
{
"entropy": 1.7321817111968993,
"epoch": 0.06158802349819974,
"grad_norm": 0.8561661839485168,
"learning_rate": 3.076740881099006e-05,
"loss": 1.3116,
"mean_token_accuracy": 0.7248632162809372,
"num_tokens": 32008108.0,
"step": 1300
},
{
"entropy": 1.7352443253993988,
"epoch": 0.06395679363274588,
"grad_norm": 0.8668932914733887,
"learning_rate": 3.1951681667456185e-05,
"loss": 1.3422,
"mean_token_accuracy": 0.7209743493795395,
"num_tokens": 33257843.0,
"step": 1350
},
{
"entropy": 1.7172068011760713,
"epoch": 0.06632556376729203,
"grad_norm": 1.030638337135315,
"learning_rate": 3.313595452392231e-05,
"loss": 1.3161,
"mean_token_accuracy": 0.7226567584276199,
"num_tokens": 34486893.0,
"step": 1400
},
{
"entropy": 1.7621229577064514,
"epoch": 0.06869433390183817,
"grad_norm": 1.0166395902633667,
"learning_rate": 3.432022738038844e-05,
"loss": 1.3644,
"mean_token_accuracy": 0.7157200646400451,
"num_tokens": 35701555.0,
"step": 1450
},
{
"entropy": 1.739331885576248,
"epoch": 0.07106310403638431,
"grad_norm": 0.6904604434967041,
"learning_rate": 3.550450023685457e-05,
"loss": 1.3599,
"mean_token_accuracy": 0.716941955089569,
"num_tokens": 36936472.0,
"step": 1500
},
{
"entropy": 1.7242075634002685,
"epoch": 0.07343187417093046,
"grad_norm": 0.8110722303390503,
"learning_rate": 3.66887730933207e-05,
"loss": 1.3374,
"mean_token_accuracy": 0.7217743951082229,
"num_tokens": 38178198.0,
"step": 1550
},
{
"entropy": 1.7162011814117433,
"epoch": 0.0758006443054766,
"grad_norm": 0.8524773716926575,
"learning_rate": 3.787304594978684e-05,
"loss": 1.3228,
"mean_token_accuracy": 0.7227801591157913,
"num_tokens": 39412346.0,
"step": 1600
},
{
"entropy": 1.7332351410388946,
"epoch": 0.07816941444002275,
"grad_norm": 0.7344287037849426,
"learning_rate": 3.9057318806252965e-05,
"loss": 1.3343,
"mean_token_accuracy": 0.721068668961525,
"num_tokens": 40640112.0,
"step": 1650
},
{
"entropy": 1.7606168591976166,
"epoch": 0.08053818457456888,
"grad_norm": 0.8270729184150696,
"learning_rate": 4.024159166271909e-05,
"loss": 1.3587,
"mean_token_accuracy": 0.7182448714971542,
"num_tokens": 41866196.0,
"step": 1700
},
{
"entropy": 1.7047703182697296,
"epoch": 0.08290695470911502,
"grad_norm": 0.8977941870689392,
"learning_rate": 4.142586451918522e-05,
"loss": 1.3278,
"mean_token_accuracy": 0.7222372907400131,
"num_tokens": 43124550.0,
"step": 1750
},
{
"entropy": 1.7705276823043823,
"epoch": 0.08527572484366117,
"grad_norm": 0.8741844892501831,
"learning_rate": 4.2610137375651355e-05,
"loss": 1.3573,
"mean_token_accuracy": 0.7164474505186081,
"num_tokens": 44315237.0,
"step": 1800
},
{
"entropy": 1.7388102066516877,
"epoch": 0.08764449497820731,
"grad_norm": 0.8004917502403259,
"learning_rate": 4.379441023211748e-05,
"loss": 1.3342,
"mean_token_accuracy": 0.7217385923862457,
"num_tokens": 45536080.0,
"step": 1850
},
{
"entropy": 1.7502016520500183,
"epoch": 0.09001326511275345,
"grad_norm": 0.8822757005691528,
"learning_rate": 4.497868308858361e-05,
"loss": 1.336,
"mean_token_accuracy": 0.7232905811071396,
"num_tokens": 46770718.0,
"step": 1900
},
{
"entropy": 1.767259726524353,
"epoch": 0.0923820352472996,
"grad_norm": 0.7748751640319824,
"learning_rate": 4.616295594504974e-05,
"loss": 1.3553,
"mean_token_accuracy": 0.7182161051034928,
"num_tokens": 47996075.0,
"step": 1950
},
{
"entropy": 1.6973827588558197,
"epoch": 0.09475080538184574,
"grad_norm": 0.8286657333374023,
"learning_rate": 4.7347228801515866e-05,
"loss": 1.3257,
"mean_token_accuracy": 0.7228487819433213,
"num_tokens": 49257764.0,
"step": 2000
},
{
"epoch": 0.09475080538184574,
"eval_entropy": 1.204862184174056,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7511153636268459,
"eval_num_tokens": 49257764.0,
"eval_runtime": 739.5936,
"eval_samples_per_second": 33.552,
"eval_steps_per_second": 4.194,
"step": 2000
},
{
"entropy": 1.7507979416847228,
"epoch": 0.09711957551639189,
"grad_norm": 0.8749801516532898,
"learning_rate": 4.8531501657982e-05,
"loss": 1.3637,
"mean_token_accuracy": 0.7162546402215958,
"num_tokens": 50488090.0,
"step": 2050
},
{
"entropy": 1.7249326765537263,
"epoch": 0.09948834565093803,
"grad_norm": 0.9060792922973633,
"learning_rate": 4.9715774514448135e-05,
"loss": 1.325,
"mean_token_accuracy": 0.7223574507236481,
"num_tokens": 51760869.0,
"step": 2100
},
{
"entropy": 1.7315819489955901,
"epoch": 0.10185711578548418,
"grad_norm": 0.7879400253295898,
"learning_rate": 5.090004737091426e-05,
"loss": 1.3279,
"mean_token_accuracy": 0.7231736582517624,
"num_tokens": 52971004.0,
"step": 2150
},
{
"entropy": 1.721841138601303,
"epoch": 0.10422588592003032,
"grad_norm": 0.6798914074897766,
"learning_rate": 5.208432022738039e-05,
"loss": 1.3486,
"mean_token_accuracy": 0.7193009465932846,
"num_tokens": 54200559.0,
"step": 2200
},
{
"entropy": 1.732561513185501,
"epoch": 0.10659465605457646,
"grad_norm": 0.9104458689689636,
"learning_rate": 5.326859308384652e-05,
"loss": 1.313,
"mean_token_accuracy": 0.7249197036027908,
"num_tokens": 55431492.0,
"step": 2250
},
{
"entropy": 1.745149908065796,
"epoch": 0.10896342618912261,
"grad_norm": 1.0170321464538574,
"learning_rate": 5.4452865940312646e-05,
"loss": 1.3327,
"mean_token_accuracy": 0.7201522195339203,
"num_tokens": 56685982.0,
"step": 2300
},
{
"entropy": 1.7817789494991303,
"epoch": 0.11133219632366875,
"grad_norm": 0.8275519013404846,
"learning_rate": 5.5637138796778774e-05,
"loss": 1.3902,
"mean_token_accuracy": 0.7141750353574753,
"num_tokens": 57930317.0,
"step": 2350
},
{
"entropy": 1.7808838784694672,
"epoch": 0.1137009664582149,
"grad_norm": 0.8482922315597534,
"learning_rate": 5.6821411653244915e-05,
"loss": 1.3555,
"mean_token_accuracy": 0.7165615385770798,
"num_tokens": 59178908.0,
"step": 2400
},
{
"entropy": 1.7483756732940674,
"epoch": 0.11606973659276104,
"grad_norm": 1.1124041080474854,
"learning_rate": 5.800568450971104e-05,
"loss": 1.3078,
"mean_token_accuracy": 0.7251748180389405,
"num_tokens": 60407544.0,
"step": 2450
},
{
"entropy": 1.7590064382553101,
"epoch": 0.11843850672730719,
"grad_norm": 0.8734819889068604,
"learning_rate": 5.918995736617717e-05,
"loss": 1.3515,
"mean_token_accuracy": 0.7170740348100663,
"num_tokens": 61646580.0,
"step": 2500
},
{
"entropy": 1.7470902466773988,
"epoch": 0.12080727686185333,
"grad_norm": 0.9874738454818726,
"learning_rate": 6.03742302226433e-05,
"loss": 1.3449,
"mean_token_accuracy": 0.7202254205942153,
"num_tokens": 62847914.0,
"step": 2550
},
{
"entropy": 1.7077771651744842,
"epoch": 0.12317604699639947,
"grad_norm": 0.7741467952728271,
"learning_rate": 6.155850307910943e-05,
"loss": 1.3253,
"mean_token_accuracy": 0.721799430847168,
"num_tokens": 64110298.0,
"step": 2600
},
{
"entropy": 1.7353104615211488,
"epoch": 0.12554481713094562,
"grad_norm": 0.878971517086029,
"learning_rate": 6.274277593557556e-05,
"loss": 1.3365,
"mean_token_accuracy": 0.7212816894054412,
"num_tokens": 65347419.0,
"step": 2650
},
{
"entropy": 1.734018679857254,
"epoch": 0.12791358726549176,
"grad_norm": 0.9191023111343384,
"learning_rate": 6.392704879204168e-05,
"loss": 1.3317,
"mean_token_accuracy": 0.7217869812250137,
"num_tokens": 66593613.0,
"step": 2700
},
{
"entropy": 1.7515969347953797,
"epoch": 0.1302823574000379,
"grad_norm": 0.9526401162147522,
"learning_rate": 6.511132164850782e-05,
"loss": 1.3381,
"mean_token_accuracy": 0.7212713253498078,
"num_tokens": 67818628.0,
"step": 2750
},
{
"entropy": 1.732736051082611,
"epoch": 0.13265112753458405,
"grad_norm": 0.9250634908676147,
"learning_rate": 6.629559450497395e-05,
"loss": 1.3267,
"mean_token_accuracy": 0.7237467241287231,
"num_tokens": 69063328.0,
"step": 2800
},
{
"entropy": 1.7254245734214784,
"epoch": 0.1350198976691302,
"grad_norm": 0.8667979836463928,
"learning_rate": 6.747986736144007e-05,
"loss": 1.3669,
"mean_token_accuracy": 0.7158471101522446,
"num_tokens": 70325294.0,
"step": 2850
},
{
"entropy": 1.7342670309543609,
"epoch": 0.13738866780367634,
"grad_norm": 0.9424638748168945,
"learning_rate": 6.866414021790622e-05,
"loss": 1.352,
"mean_token_accuracy": 0.7186112779378891,
"num_tokens": 71578584.0,
"step": 2900
},
{
"entropy": 1.717711169719696,
"epoch": 0.13975743793822248,
"grad_norm": 0.8827985525131226,
"learning_rate": 6.984841307437234e-05,
"loss": 1.3708,
"mean_token_accuracy": 0.7152537268400192,
"num_tokens": 72849472.0,
"step": 2950
},
{
"entropy": 1.7510717618465423,
"epoch": 0.14212620807276863,
"grad_norm": 0.8640701174736023,
"learning_rate": 7.103268593083848e-05,
"loss": 1.3576,
"mean_token_accuracy": 0.7165306961536407,
"num_tokens": 74089751.0,
"step": 3000
},
{
"epoch": 0.14212620807276863,
"eval_entropy": 1.1560545717993527,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7539014699553613,
"eval_num_tokens": 74089751.0,
"eval_runtime": 739.5182,
"eval_samples_per_second": 33.556,
"eval_steps_per_second": 4.195,
"step": 3000
},
{
"entropy": 1.7481833016872406,
"epoch": 0.14449497820731477,
"grad_norm": 0.8835089206695557,
"learning_rate": 7.22169587873046e-05,
"loss": 1.3735,
"mean_token_accuracy": 0.7145194208621979,
"num_tokens": 75300868.0,
"step": 3050
},
{
"entropy": 1.7272928488254546,
"epoch": 0.14686374834186092,
"grad_norm": 0.858995258808136,
"learning_rate": 7.340123164377073e-05,
"loss": 1.3456,
"mean_token_accuracy": 0.7180565488338471,
"num_tokens": 76542456.0,
"step": 3100
},
{
"entropy": 1.7239915192127229,
"epoch": 0.14923251847640706,
"grad_norm": 1.038167953491211,
"learning_rate": 7.458550450023685e-05,
"loss": 1.3364,
"mean_token_accuracy": 0.7197621566057205,
"num_tokens": 77797173.0,
"step": 3150
},
{
"entropy": 1.7232668161392213,
"epoch": 0.1516012886109532,
"grad_norm": 0.7863021492958069,
"learning_rate": 7.576977735670299e-05,
"loss": 1.3252,
"mean_token_accuracy": 0.7219555181264877,
"num_tokens": 79036677.0,
"step": 3200
},
{
"entropy": 1.7575169241428374,
"epoch": 0.15397005874549935,
"grad_norm": 1.122582197189331,
"learning_rate": 7.695405021316912e-05,
"loss": 1.3513,
"mean_token_accuracy": 0.7171193498373032,
"num_tokens": 80257412.0,
"step": 3250
},
{
"entropy": 1.7478620946407317,
"epoch": 0.1563388288800455,
"grad_norm": 0.8442687392234802,
"learning_rate": 7.813832306963524e-05,
"loss": 1.3689,
"mean_token_accuracy": 0.7146556586027145,
"num_tokens": 81452995.0,
"step": 3300
},
{
"entropy": 1.7156465804576875,
"epoch": 0.1587075990145916,
"grad_norm": 0.8353444337844849,
"learning_rate": 7.932259592610138e-05,
"loss": 1.341,
"mean_token_accuracy": 0.7198166775703431,
"num_tokens": 82720102.0,
"step": 3350
},
{
"entropy": 1.6997444534301758,
"epoch": 0.16107636914913775,
"grad_norm": 1.0969985723495483,
"learning_rate": 8.050686878256751e-05,
"loss": 1.3462,
"mean_token_accuracy": 0.7181236177682877,
"num_tokens": 83970755.0,
"step": 3400
},
{
"entropy": 1.7316451609134673,
"epoch": 0.1634451392836839,
"grad_norm": 1.048732876777649,
"learning_rate": 8.169114163903365e-05,
"loss": 1.3286,
"mean_token_accuracy": 0.7228594154119492,
"num_tokens": 85196313.0,
"step": 3450
},
{
"entropy": 1.6936505138874054,
"epoch": 0.16581390941823004,
"grad_norm": 0.9473629593849182,
"learning_rate": 8.287541449549977e-05,
"loss": 1.3178,
"mean_token_accuracy": 0.7219874155521393,
"num_tokens": 86444399.0,
"step": 3500
},
{
"entropy": 1.7614711892604829,
"epoch": 0.1681826795527762,
"grad_norm": 1.0644205808639526,
"learning_rate": 8.40596873519659e-05,
"loss": 1.3485,
"mean_token_accuracy": 0.7177842026948928,
"num_tokens": 87668028.0,
"step": 3550
},
{
"entropy": 1.760385752916336,
"epoch": 0.17055144968732233,
"grad_norm": 0.8554447293281555,
"learning_rate": 8.524396020843202e-05,
"loss": 1.3655,
"mean_token_accuracy": 0.7148052769899368,
"num_tokens": 88876622.0,
"step": 3600
},
{
"entropy": 1.7391455006599426,
"epoch": 0.17292021982186848,
"grad_norm": 0.8997156023979187,
"learning_rate": 8.642823306489816e-05,
"loss": 1.3729,
"mean_token_accuracy": 0.7139494162797928,
"num_tokens": 90122997.0,
"step": 3650
},
{
"entropy": 1.6978785967826844,
"epoch": 0.17528898995641462,
"grad_norm": 0.9306835532188416,
"learning_rate": 8.761250592136429e-05,
"loss": 1.3101,
"mean_token_accuracy": 0.7250325381755829,
"num_tokens": 91386702.0,
"step": 3700
},
{
"entropy": 1.7522969400882722,
"epoch": 0.17765776009096076,
"grad_norm": 0.8477308750152588,
"learning_rate": 8.879677877783041e-05,
"loss": 1.3688,
"mean_token_accuracy": 0.715588583946228,
"num_tokens": 92610301.0,
"step": 3750
},
{
"entropy": 1.7447042429447175,
"epoch": 0.1800265302255069,
"grad_norm": 1.0386239290237427,
"learning_rate": 8.998105163429655e-05,
"loss": 1.3431,
"mean_token_accuracy": 0.718235713839531,
"num_tokens": 93824209.0,
"step": 3800
},
{
"entropy": 1.7764518535137177,
"epoch": 0.18239530036005305,
"grad_norm": 1.1934313774108887,
"learning_rate": 9.116532449076267e-05,
"loss": 1.377,
"mean_token_accuracy": 0.7121944260597229,
"num_tokens": 95073205.0,
"step": 3850
},
{
"entropy": 1.7176523733139037,
"epoch": 0.1847640704945992,
"grad_norm": 0.9109567403793335,
"learning_rate": 9.234959734722882e-05,
"loss": 1.3269,
"mean_token_accuracy": 0.7219234961271286,
"num_tokens": 96282593.0,
"step": 3900
},
{
"entropy": 1.7826895797252655,
"epoch": 0.18713284062914534,
"grad_norm": 0.8581134080886841,
"learning_rate": 9.353387020369494e-05,
"loss": 1.4049,
"mean_token_accuracy": 0.7105602127313614,
"num_tokens": 97491980.0,
"step": 3950
},
{
"entropy": 1.7406088852882384,
"epoch": 0.18950161076369149,
"grad_norm": 0.8592116236686707,
"learning_rate": 9.471814306016107e-05,
"loss": 1.3514,
"mean_token_accuracy": 0.7164700603485108,
"num_tokens": 98726298.0,
"step": 4000
},
{
"epoch": 0.18950161076369149,
"eval_entropy": 1.1647965725124612,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7535035212354303,
"eval_num_tokens": 98726298.0,
"eval_runtime": 741.2483,
"eval_samples_per_second": 33.477,
"eval_steps_per_second": 4.185,
"step": 4000
},
{
"entropy": 1.7602094197273255,
"epoch": 0.19187038089823763,
"grad_norm": 1.0990040302276611,
"learning_rate": 9.590241591662719e-05,
"loss": 1.3787,
"mean_token_accuracy": 0.7126823592185975,
"num_tokens": 99961531.0,
"step": 4050
},
{
"entropy": 1.736720016002655,
"epoch": 0.19423915103278377,
"grad_norm": 0.886349081993103,
"learning_rate": 9.708668877309333e-05,
"loss": 1.3486,
"mean_token_accuracy": 0.7173503488302231,
"num_tokens": 101187732.0,
"step": 4100
},
{
"entropy": 1.772811095714569,
"epoch": 0.19660792116732992,
"grad_norm": 1.147083044052124,
"learning_rate": 9.827096162955945e-05,
"loss": 1.4016,
"mean_token_accuracy": 0.7078080683946609,
"num_tokens": 102372338.0,
"step": 4150
},
{
"entropy": 1.737178726196289,
"epoch": 0.19897669130187606,
"grad_norm": 1.1906094551086426,
"learning_rate": 9.945523448602558e-05,
"loss": 1.3823,
"mean_token_accuracy": 0.7129582542181016,
"num_tokens": 103594020.0,
"step": 4200
},
{
"entropy": 1.739368189573288,
"epoch": 0.2013454614364222,
"grad_norm": 0.8465049862861633,
"learning_rate": 9.999987539454218e-05,
"loss": 1.384,
"mean_token_accuracy": 0.7121477049589157,
"num_tokens": 104839897.0,
"step": 4250
},
{
"entropy": 1.728913918733597,
"epoch": 0.20371423157096835,
"grad_norm": 1.0396977663040161,
"learning_rate": 9.999898657946416e-05,
"loss": 1.4049,
"mean_token_accuracy": 0.709805850982666,
"num_tokens": 106084752.0,
"step": 4300
},
{
"entropy": 1.7548469495773316,
"epoch": 0.2060830017055145,
"grad_norm": 1.1842293739318848,
"learning_rate": 9.999724314980077e-05,
"loss": 1.3883,
"mean_token_accuracy": 0.7103092032670975,
"num_tokens": 107308027.0,
"step": 4350
},
{
"entropy": 1.7609304535388945,
"epoch": 0.20845177184006064,
"grad_norm": 0.8410583138465881,
"learning_rate": 9.999464513535188e-05,
"loss": 1.3632,
"mean_token_accuracy": 0.7141008460521698,
"num_tokens": 108532695.0,
"step": 4400
},
{
"entropy": 1.7612316942214965,
"epoch": 0.21082054197460678,
"grad_norm": 0.9074947237968445,
"learning_rate": 9.999119258052436e-05,
"loss": 1.3728,
"mean_token_accuracy": 0.7128197175264358,
"num_tokens": 109768914.0,
"step": 4450
},
{
"entropy": 1.7695635759830475,
"epoch": 0.21318931210915293,
"grad_norm": 0.9042698740959167,
"learning_rate": 9.99868855443315e-05,
"loss": 1.3519,
"mean_token_accuracy": 0.7166950708627701,
"num_tokens": 110984584.0,
"step": 4500
},
{
"entropy": 1.7432436084747314,
"epoch": 0.21555808224369907,
"grad_norm": 1.2357442378997803,
"learning_rate": 9.99817241003919e-05,
"loss": 1.334,
"mean_token_accuracy": 0.7201163339614868,
"num_tokens": 112235932.0,
"step": 4550
},
{
"entropy": 1.7642862284183503,
"epoch": 0.21792685237824522,
"grad_norm": 1.0687198638916016,
"learning_rate": 9.997570833692829e-05,
"loss": 1.3798,
"mean_token_accuracy": 0.7113319665193558,
"num_tokens": 113455353.0,
"step": 4600
},
{
"entropy": 1.7590344095230102,
"epoch": 0.22029562251279136,
"grad_norm": 1.1026127338409424,
"learning_rate": 9.996883835676589e-05,
"loss": 1.3825,
"mean_token_accuracy": 0.7098899132013321,
"num_tokens": 114694421.0,
"step": 4650
},
{
"entropy": 1.7447860455513,
"epoch": 0.2226643926473375,
"grad_norm": 1.0826524496078491,
"learning_rate": 9.99611142773308e-05,
"loss": 1.3484,
"mean_token_accuracy": 0.7184046697616577,
"num_tokens": 115913968.0,
"step": 4700
},
{
"entropy": 1.7905651438236236,
"epoch": 0.22503316278188365,
"grad_norm": 1.1828806400299072,
"learning_rate": 9.995253623064793e-05,
"loss": 1.4072,
"mean_token_accuracy": 0.7065826892852783,
"num_tokens": 117100168.0,
"step": 4750
},
{
"entropy": 1.7732587778568267,
"epoch": 0.2274019329164298,
"grad_norm": 0.8388417959213257,
"learning_rate": 9.994310436333872e-05,
"loss": 1.3876,
"mean_token_accuracy": 0.7099131292104721,
"num_tokens": 118323063.0,
"step": 4800
},
{
"entropy": 1.7498207116127014,
"epoch": 0.22977070305097594,
"grad_norm": 0.9928333759307861,
"learning_rate": 9.993281883661866e-05,
"loss": 1.3248,
"mean_token_accuracy": 0.7209601724147796,
"num_tokens": 119542247.0,
"step": 4850
},
{
"entropy": 1.7807526588439941,
"epoch": 0.23213947318552208,
"grad_norm": 1.180126428604126,
"learning_rate": 9.992167982629455e-05,
"loss": 1.3807,
"mean_token_accuracy": 0.7116306042671203,
"num_tokens": 120783656.0,
"step": 4900
},
{
"entropy": 1.7660968756675721,
"epoch": 0.23450824332006823,
"grad_norm": 1.035225749015808,
"learning_rate": 9.990968752276143e-05,
"loss": 1.3906,
"mean_token_accuracy": 0.7096653944253921,
"num_tokens": 122014053.0,
"step": 4950
},
{
"entropy": 1.7683662581443786,
"epoch": 0.23687701345461437,
"grad_norm": 0.8732820153236389,
"learning_rate": 9.989684213099944e-05,
"loss": 1.363,
"mean_token_accuracy": 0.7147561728954315,
"num_tokens": 123247491.0,
"step": 5000
},
{
"epoch": 0.23687701345461437,
"eval_entropy": 1.1902209509963915,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7525513527433308,
"eval_num_tokens": 123247491.0,
"eval_runtime": 749.4439,
"eval_samples_per_second": 33.111,
"eval_steps_per_second": 4.139,
"step": 5000
},
{
"entropy": 1.7724631798267365,
"epoch": 0.23924578358916052,
"grad_norm": 1.2394686937332153,
"learning_rate": 9.988314387057021e-05,
"loss": 1.4029,
"mean_token_accuracy": 0.7083960479497909,
"num_tokens": 124486744.0,
"step": 5050
},
{
"entropy": 1.7794454956054688,
"epoch": 0.24161455372370666,
"grad_norm": 1.031551718711853,
"learning_rate": 9.986859297561312e-05,
"loss": 1.3872,
"mean_token_accuracy": 0.7082083231210708,
"num_tokens": 125689651.0,
"step": 5100
},
{
"entropy": 1.8115082442760468,
"epoch": 0.2439833238582528,
"grad_norm": 1.238067388534546,
"learning_rate": 9.985318969484139e-05,
"loss": 1.4075,
"mean_token_accuracy": 0.7077406024932862,
"num_tokens": 126912476.0,
"step": 5150
},
{
"entropy": 1.7362813007831575,
"epoch": 0.24635209399279895,
"grad_norm": 0.9080651998519897,
"learning_rate": 9.983693429153769e-05,
"loss": 1.3715,
"mean_token_accuracy": 0.7125364172458649,
"num_tokens": 128141273.0,
"step": 5200
},
{
"entropy": 1.7462396609783173,
"epoch": 0.2487208641273451,
"grad_norm": 0.9258147478103638,
"learning_rate": 9.981982704354978e-05,
"loss": 1.3539,
"mean_token_accuracy": 0.7153694558143616,
"num_tokens": 129367296.0,
"step": 5250
},
{
"entropy": 1.7526134848594666,
"epoch": 0.25108963426189124,
"grad_norm": 1.0741764307022095,
"learning_rate": 9.980186824328563e-05,
"loss": 1.3639,
"mean_token_accuracy": 0.7122530096769333,
"num_tokens": 130622992.0,
"step": 5300
},
{
"entropy": 1.8136487221717834,
"epoch": 0.25345840439643735,
"grad_norm": 1.079744815826416,
"learning_rate": 9.978305819770852e-05,
"loss": 1.3934,
"mean_token_accuracy": 0.7090709501504898,
"num_tokens": 131844647.0,
"step": 5350
},
{
"entropy": 1.7428915858268739,
"epoch": 0.2558271745309835,
"grad_norm": 1.0281189680099487,
"learning_rate": 9.976339722833178e-05,
"loss": 1.357,
"mean_token_accuracy": 0.7154221564531327,
"num_tokens": 133100147.0,
"step": 5400
},
{
"entropy": 1.8012803518772125,
"epoch": 0.25819594466552964,
"grad_norm": 1.2619256973266602,
"learning_rate": 9.974288567121322e-05,
"loss": 1.4075,
"mean_token_accuracy": 0.7054576027393341,
"num_tokens": 134303236.0,
"step": 5450
},
{
"entropy": 1.7738253235816956,
"epoch": 0.2605647148000758,
"grad_norm": 1.0344356298446655,
"learning_rate": 9.972152387694946e-05,
"loss": 1.3516,
"mean_token_accuracy": 0.7141925716400146,
"num_tokens": 135527480.0,
"step": 5500
},
{
"entropy": 1.7168458807468414,
"epoch": 0.26293348493462193,
"grad_norm": 1.062092661857605,
"learning_rate": 9.969931221066992e-05,
"loss": 1.3439,
"mean_token_accuracy": 0.7171407097578049,
"num_tokens": 136777268.0,
"step": 5550
},
{
"entropy": 1.7599689650535584,
"epoch": 0.2653022550691681,
"grad_norm": 0.9637967348098755,
"learning_rate": 9.96762510520306e-05,
"loss": 1.3794,
"mean_token_accuracy": 0.7112497627735138,
"num_tokens": 137993796.0,
"step": 5600
},
{
"entropy": 1.7565060186386108,
"epoch": 0.2676710252037142,
"grad_norm": 0.9759653806686401,
"learning_rate": 9.965234079520751e-05,
"loss": 1.3797,
"mean_token_accuracy": 0.7126868903636933,
"num_tokens": 139236029.0,
"step": 5650
},
{
"entropy": 1.7332323002815246,
"epoch": 0.2700397953382604,
"grad_norm": 1.1588467359542847,
"learning_rate": 9.962758184889003e-05,
"loss": 1.3803,
"mean_token_accuracy": 0.710934864282608,
"num_tokens": 140453476.0,
"step": 5700
},
{
"entropy": 1.763832380771637,
"epoch": 0.2724085654728065,
"grad_norm": 0.8555989861488342,
"learning_rate": 9.960197463627388e-05,
"loss": 1.3641,
"mean_token_accuracy": 0.7138992995023727,
"num_tokens": 141647360.0,
"step": 5750
},
{
"entropy": 1.7501352691650391,
"epoch": 0.2747773356073527,
"grad_norm": 0.9515321850776672,
"learning_rate": 9.957551959505387e-05,
"loss": 1.4013,
"mean_token_accuracy": 0.7074063158035279,
"num_tokens": 142881658.0,
"step": 5800
},
{
"entropy": 1.7732372057437897,
"epoch": 0.2771461057418988,
"grad_norm": 1.0687644481658936,
"learning_rate": 9.954821717741643e-05,
"loss": 1.3726,
"mean_token_accuracy": 0.7110266560316085,
"num_tokens": 144097656.0,
"step": 5850
},
{
"entropy": 1.8280004715919496,
"epoch": 0.27951487587644497,
"grad_norm": 0.9914586544036865,
"learning_rate": 9.952006785003194e-05,
"loss": 1.4141,
"mean_token_accuracy": 0.7043382048606872,
"num_tokens": 145304660.0,
"step": 5900
},
{
"entropy": 1.7428024232387542,
"epoch": 0.2818836460109911,
"grad_norm": 0.9063569903373718,
"learning_rate": 9.949107209404665e-05,
"loss": 1.3871,
"mean_token_accuracy": 0.7085927510261536,
"num_tokens": 146556399.0,
"step": 5950
},
{
"entropy": 1.7348086619377137,
"epoch": 0.28425241614553726,
"grad_norm": 1.1388063430786133,
"learning_rate": 9.946123040507451e-05,
"loss": 1.4059,
"mean_token_accuracy": 0.7068395394086838,
"num_tokens": 147781528.0,
"step": 6000
},
{
"epoch": 0.28425241614553726,
"eval_entropy": 1.1620629866956358,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7536570436719308,
"eval_num_tokens": 147781528.0,
"eval_runtime": 742.238,
"eval_samples_per_second": 33.433,
"eval_steps_per_second": 4.179,
"step": 6000
},
{
"entropy": 1.7318350422382354,
"epoch": 0.2866211862800834,
"grad_norm": 1.0696161985397339,
"learning_rate": 9.943054329318873e-05,
"loss": 1.3689,
"mean_token_accuracy": 0.7137463581562042,
"num_tokens": 148993131.0,
"step": 6050
},
{
"entropy": 1.7341815280914306,
"epoch": 0.28898995641462955,
"grad_norm": 1.211084246635437,
"learning_rate": 9.9399011282913e-05,
"loss": 1.3396,
"mean_token_accuracy": 0.7190863400697708,
"num_tokens": 150231439.0,
"step": 6100
},
{
"entropy": 1.8086679303646087,
"epoch": 0.29135872654917566,
"grad_norm": 0.997982919216156,
"learning_rate": 9.936663491321256e-05,
"loss": 1.3991,
"mean_token_accuracy": 0.7076171565055848,
"num_tokens": 151425872.0,
"step": 6150
},
{
"entropy": 1.7646045112609863,
"epoch": 0.29372749668372183,
"grad_norm": 1.0052849054336548,
"learning_rate": 9.9333414737485e-05,
"loss": 1.3833,
"mean_token_accuracy": 0.7115501266717911,
"num_tokens": 152649154.0,
"step": 6200
},
{
"entropy": 1.7603888380527497,
"epoch": 0.29609626681826795,
"grad_norm": 1.1485621929168701,
"learning_rate": 9.929935132355075e-05,
"loss": 1.3774,
"mean_token_accuracy": 0.7107777494192123,
"num_tokens": 153909546.0,
"step": 6250
},
{
"entropy": 1.810437490940094,
"epoch": 0.2984650369528141,
"grad_norm": 1.1413508653640747,
"learning_rate": 9.926444525364341e-05,
"loss": 1.378,
"mean_token_accuracy": 0.711902762055397,
"num_tokens": 155120315.0,
"step": 6300
},
{
"entropy": 1.7656940996646882,
"epoch": 0.30083380708736024,
"grad_norm": 0.8839899897575378,
"learning_rate": 9.922869712439981e-05,
"loss": 1.3904,
"mean_token_accuracy": 0.7087905770540237,
"num_tokens": 156368001.0,
"step": 6350
},
{
"entropy": 1.7679949700832367,
"epoch": 0.3032025772219064,
"grad_norm": 1.285138726234436,
"learning_rate": 9.91921075468498e-05,
"loss": 1.3891,
"mean_token_accuracy": 0.7098447853326797,
"num_tokens": 157568259.0,
"step": 6400
},
{
"entropy": 1.775840550661087,
"epoch": 0.3055713473564525,
"grad_norm": 1.10303795337677,
"learning_rate": 9.915467714640578e-05,
"loss": 1.3918,
"mean_token_accuracy": 0.7079905581474304,
"num_tokens": 158791523.0,
"step": 6450
},
{
"entropy": 1.7338063383102418,
"epoch": 0.3079401174909987,
"grad_norm": 1.0604420900344849,
"learning_rate": 9.911640656285203e-05,
"loss": 1.3554,
"mean_token_accuracy": 0.714795948266983,
"num_tokens": 160073528.0,
"step": 6500
},
{
"entropy": 1.7426982474327088,
"epoch": 0.3103088876255448,
"grad_norm": 0.9847440123558044,
"learning_rate": 9.907729645033379e-05,
"loss": 1.3512,
"mean_token_accuracy": 0.7151961398124694,
"num_tokens": 161312761.0,
"step": 6550
},
{
"entropy": 1.8005949878692626,
"epoch": 0.312677657760091,
"grad_norm": 1.2713630199432373,
"learning_rate": 9.903734747734607e-05,
"loss": 1.3597,
"mean_token_accuracy": 0.7128104782104492,
"num_tokens": 162512008.0,
"step": 6600
},
{
"entropy": 1.8041615283489227,
"epoch": 0.3150464278946371,
"grad_norm": 0.99453204870224,
"learning_rate": 9.899656032672221e-05,
"loss": 1.3642,
"mean_token_accuracy": 0.7122291630506515,
"num_tokens": 163702726.0,
"step": 6650
},
{
"entropy": 1.7597569704055787,
"epoch": 0.3174151980291832,
"grad_norm": 1.2227306365966797,
"learning_rate": 9.895493569562221e-05,
"loss": 1.3276,
"mean_token_accuracy": 0.7197510945796967,
"num_tokens": 164943131.0,
"step": 6700
},
{
"entropy": 1.7358013463020325,
"epoch": 0.3197839681637294,
"grad_norm": 1.1400933265686035,
"learning_rate": 9.891247429552082e-05,
"loss": 1.384,
"mean_token_accuracy": 0.7089168894290924,
"num_tokens": 166167321.0,
"step": 6750
},
{
"entropy": 1.7530862140655517,
"epoch": 0.3221527382982755,
"grad_norm": 1.2036629915237427,
"learning_rate": 9.886917685219541e-05,
"loss": 1.3398,
"mean_token_accuracy": 0.7184527868032455,
"num_tokens": 167397732.0,
"step": 6800
},
{
"entropy": 1.7854076647758483,
"epoch": 0.3245215084328217,
"grad_norm": 1.2987496852874756,
"learning_rate": 9.88250441057135e-05,
"loss": 1.394,
"mean_token_accuracy": 0.7086141872406005,
"num_tokens": 168618527.0,
"step": 6850
},
{
"entropy": 1.7509974801540376,
"epoch": 0.3268902785673678,
"grad_norm": 1.056751012802124,
"learning_rate": 9.878007681042014e-05,
"loss": 1.3389,
"mean_token_accuracy": 0.7182145416736603,
"num_tokens": 169856441.0,
"step": 6900
},
{
"entropy": 1.740920853614807,
"epoch": 0.32925904870191397,
"grad_norm": 1.1730940341949463,
"learning_rate": 9.873427573492507e-05,
"loss": 1.3387,
"mean_token_accuracy": 0.718420038819313,
"num_tokens": 171123051.0,
"step": 6950
},
{
"entropy": 1.7686040151119231,
"epoch": 0.3316278188364601,
"grad_norm": 1.139112949371338,
"learning_rate": 9.868764166208946e-05,
"loss": 1.373,
"mean_token_accuracy": 0.7120095008611679,
"num_tokens": 172342540.0,
"step": 7000
},
{
"epoch": 0.3316278188364601,
"eval_entropy": 1.1930562926445525,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7542100738587493,
"eval_num_tokens": 172342540.0,
"eval_runtime": 741.3646,
"eval_samples_per_second": 33.472,
"eval_steps_per_second": 4.184,
"step": 7000
},
{
"entropy": 1.7598109781742095,
"epoch": 0.33399658897100626,
"grad_norm": 1.2180997133255005,
"learning_rate": 9.864017538901267e-05,
"loss": 1.4032,
"mean_token_accuracy": 0.7083274441957473,
"num_tokens": 173589426.0,
"step": 7050
},
{
"entropy": 1.7652622890472411,
"epoch": 0.3363653591055524,
"grad_norm": 1.3037455081939697,
"learning_rate": 9.859187772701853e-05,
"loss": 1.369,
"mean_token_accuracy": 0.7140497547388077,
"num_tokens": 174848839.0,
"step": 7100
},
{
"entropy": 1.7913592505455016,
"epoch": 0.33873412924009855,
"grad_norm": 1.1562169790267944,
"learning_rate": 9.854274950164149e-05,
"loss": 1.3837,
"mean_token_accuracy": 0.7107916122674942,
"num_tokens": 176055919.0,
"step": 7150
},
{
"entropy": 1.7908745443820953,
"epoch": 0.34110289937464466,
"grad_norm": 1.2559897899627686,
"learning_rate": 9.849279155261252e-05,
"loss": 1.3907,
"mean_token_accuracy": 0.7087368202209473,
"num_tokens": 177277309.0,
"step": 7200
},
{
"entropy": 1.753930516242981,
"epoch": 0.34347166950919084,
"grad_norm": 0.9901047348976135,
"learning_rate": 9.844200473384479e-05,
"loss": 1.3527,
"mean_token_accuracy": 0.716761229634285,
"num_tokens": 178518563.0,
"step": 7250
},
{
"entropy": 1.739516668319702,
"epoch": 0.34584043964373695,
"grad_norm": 1.2106683254241943,
"learning_rate": 9.8390389913419e-05,
"loss": 1.3725,
"mean_token_accuracy": 0.7121469175815582,
"num_tokens": 179742683.0,
"step": 7300
},
{
"entropy": 1.756061052083969,
"epoch": 0.3482092097782831,
"grad_norm": 1.0457638502120972,
"learning_rate": 9.833794797356861e-05,
"loss": 1.3701,
"mean_token_accuracy": 0.7125989294052124,
"num_tokens": 180940666.0,
"step": 7350
},
{
"entropy": 1.7689040386676789,
"epoch": 0.35057797991282924,
"grad_norm": 0.9141308069229126,
"learning_rate": 9.828467981066472e-05,
"loss": 1.3718,
"mean_token_accuracy": 0.7115379917621613,
"num_tokens": 182184090.0,
"step": 7400
},
{
"entropy": 1.7089093339443207,
"epoch": 0.3529467500473754,
"grad_norm": 0.8629412055015564,
"learning_rate": 9.823058633520074e-05,
"loss": 1.3324,
"mean_token_accuracy": 0.7187563890218734,
"num_tokens": 183446222.0,
"step": 7450
},
{
"entropy": 1.776807938814163,
"epoch": 0.35531552018192153,
"grad_norm": 0.9498484134674072,
"learning_rate": 9.817566847177689e-05,
"loss": 1.375,
"mean_token_accuracy": 0.7121974611282349,
"num_tokens": 184676077.0,
"step": 7500
},
{
"entropy": 1.8064971625804902,
"epoch": 0.3576842903164677,
"grad_norm": 1.0395594835281372,
"learning_rate": 9.811992715908434e-05,
"loss": 1.3748,
"mean_token_accuracy": 0.7101496076583862,
"num_tokens": 185903667.0,
"step": 7550
},
{
"entropy": 1.756836792230606,
"epoch": 0.3600530604510138,
"grad_norm": 0.9577502608299255,
"learning_rate": 9.806336334988918e-05,
"loss": 1.3556,
"mean_token_accuracy": 0.7159949284791947,
"num_tokens": 187154538.0,
"step": 7600
},
{
"entropy": 1.7699120783805846,
"epoch": 0.36242183058556,
"grad_norm": 1.4034383296966553,
"learning_rate": 9.800597801101612e-05,
"loss": 1.3911,
"mean_token_accuracy": 0.7097045290470123,
"num_tokens": 188378482.0,
"step": 7650
},
{
"entropy": 1.787111645936966,
"epoch": 0.3647906007201061,
"grad_norm": 1.0781787633895874,
"learning_rate": 9.794777212333202e-05,
"loss": 1.3937,
"mean_token_accuracy": 0.7096772521734238,
"num_tokens": 189611171.0,
"step": 7700
},
{
"entropy": 1.777391802072525,
"epoch": 0.3671593708546523,
"grad_norm": 1.1259112358093262,
"learning_rate": 9.7888746681729e-05,
"loss": 1.3448,
"mean_token_accuracy": 0.7169349992275238,
"num_tokens": 190834562.0,
"step": 7750
},
{
"entropy": 1.7815845644474029,
"epoch": 0.3695281409891984,
"grad_norm": 1.1464273929595947,
"learning_rate": 9.782890269510765e-05,
"loss": 1.4057,
"mean_token_accuracy": 0.7066523498296737,
"num_tokens": 192054556.0,
"step": 7800
},
{
"entropy": 1.7850996911525727,
"epoch": 0.37189691112374457,
"grad_norm": 1.0448256731033325,
"learning_rate": 9.776824118635952e-05,
"loss": 1.3829,
"mean_token_accuracy": 0.7095517975091934,
"num_tokens": 193268475.0,
"step": 7850
},
{
"entropy": 1.8061986804008483,
"epoch": 0.3742656812582907,
"grad_norm": 0.9750792384147644,
"learning_rate": 9.770676319234984e-05,
"loss": 1.3863,
"mean_token_accuracy": 0.7090413582324981,
"num_tokens": 194477246.0,
"step": 7900
},
{
"entropy": 1.7675806987285614,
"epoch": 0.37663445139283686,
"grad_norm": 1.0662715435028076,
"learning_rate": 9.764446976389974e-05,
"loss": 1.3617,
"mean_token_accuracy": 0.712408259510994,
"num_tokens": 195727604.0,
"step": 7950
},
{
"entropy": 1.7661338579654693,
"epoch": 0.37900322152738297,
"grad_norm": 1.0620079040527344,
"learning_rate": 9.758136196576822e-05,
"loss": 1.3594,
"mean_token_accuracy": 0.7141281938552857,
"num_tokens": 196957775.0,
"step": 8000
},
{
"epoch": 0.37900322152738297,
"eval_entropy": 1.1821323013705334,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7540838839660376,
"eval_num_tokens": 196957775.0,
"eval_runtime": 746.7244,
"eval_samples_per_second": 33.232,
"eval_steps_per_second": 4.154,
"step": 8000
},
{
"entropy": 1.8103419041633606,
"epoch": 0.38137199166192914,
"grad_norm": 0.9513231515884399,
"learning_rate": 9.751744087663406e-05,
"loss": 1.3912,
"mean_token_accuracy": 0.7097796177864075,
"num_tokens": 198135404.0,
"step": 8050
},
{
"entropy": 1.7960492491722106,
"epoch": 0.38374076179647526,
"grad_norm": 1.0502028465270996,
"learning_rate": 9.74527075890773e-05,
"loss": 1.4075,
"mean_token_accuracy": 0.7074997735023498,
"num_tokens": 199322966.0,
"step": 8100
},
{
"entropy": 1.8010617554187776,
"epoch": 0.38610953193102143,
"grad_norm": 1.0754374265670776,
"learning_rate": 9.73871632095606e-05,
"loss": 1.3893,
"mean_token_accuracy": 0.7116775345802308,
"num_tokens": 200538368.0,
"step": 8150
},
{
"entropy": 1.7480302667617797,
"epoch": 0.38847830206556755,
"grad_norm": 1.074485421180725,
"learning_rate": 9.732080885841031e-05,
"loss": 1.3824,
"mean_token_accuracy": 0.7114830583333969,
"num_tokens": 201768017.0,
"step": 8200
},
{
"entropy": 1.7346595871448516,
"epoch": 0.3908470722001137,
"grad_norm": 1.2857214212417603,
"learning_rate": 9.725364566979737e-05,
"loss": 1.3483,
"mean_token_accuracy": 0.7171267950534821,
"num_tokens": 203001309.0,
"step": 8250
},
{
"entropy": 1.7614091503620148,
"epoch": 0.39321584233465984,
"grad_norm": 0.9842163324356079,
"learning_rate": 9.718567479171784e-05,
"loss": 1.3712,
"mean_token_accuracy": 0.7125260305404663,
"num_tokens": 204234311.0,
"step": 8300
},
{
"entropy": 1.7672381138801574,
"epoch": 0.395584612469206,
"grad_norm": 1.098926067352295,
"learning_rate": 9.711689738597335e-05,
"loss": 1.4068,
"mean_token_accuracy": 0.7051201003789902,
"num_tokens": 205440916.0,
"step": 8350
},
{
"entropy": 1.7645212149620055,
"epoch": 0.3979533826037521,
"grad_norm": 1.0630714893341064,
"learning_rate": 9.70473146281512e-05,
"loss": 1.3971,
"mean_token_accuracy": 0.7092112845182419,
"num_tokens": 206679396.0,
"step": 8400
},
{
"entropy": 1.7202996456623076,
"epoch": 0.4003221527382983,
"grad_norm": 0.9493738412857056,
"learning_rate": 9.697692770760431e-05,
"loss": 1.349,
"mean_token_accuracy": 0.7158361315727234,
"num_tokens": 207946846.0,
"step": 8450
},
{
"entropy": 1.7327898812294007,
"epoch": 0.4026909228728444,
"grad_norm": 0.8810617327690125,
"learning_rate": 9.690573782743082e-05,
"loss": 1.3631,
"mean_token_accuracy": 0.7150676685571671,
"num_tokens": 209162939.0,
"step": 8500
},
{
"entropy": 1.7277316284179687,
"epoch": 0.4050596930073906,
"grad_norm": 1.0136702060699463,
"learning_rate": 9.683374620445361e-05,
"loss": 1.3714,
"mean_token_accuracy": 0.7120017749071121,
"num_tokens": 210427784.0,
"step": 8550
},
{
"entropy": 1.7886472380161285,
"epoch": 0.4074284631419367,
"grad_norm": 1.0549664497375488,
"learning_rate": 9.676095406919943e-05,
"loss": 1.3664,
"mean_token_accuracy": 0.7133744984865189,
"num_tokens": 211638614.0,
"step": 8600
},
{
"entropy": 1.747572809457779,
"epoch": 0.4097972332764829,
"grad_norm": 1.1670211553573608,
"learning_rate": 9.668736266587792e-05,
"loss": 1.3495,
"mean_token_accuracy": 0.7146046167612076,
"num_tokens": 212839094.0,
"step": 8650
},
{
"entropy": 1.7699014341831207,
"epoch": 0.412166003411029,
"grad_norm": 1.0434460639953613,
"learning_rate": 9.66129732523603e-05,
"loss": 1.3686,
"mean_token_accuracy": 0.713732448220253,
"num_tokens": 214078185.0,
"step": 8700
},
{
"entropy": 1.7851570510864259,
"epoch": 0.41453477354557516,
"grad_norm": 1.0788432359695435,
"learning_rate": 9.653778710015788e-05,
"loss": 1.3735,
"mean_token_accuracy": 0.7115869015455246,
"num_tokens": 215291596.0,
"step": 8750
},
{
"entropy": 1.771958166360855,
"epoch": 0.4169035436801213,
"grad_norm": 0.9727463722229004,
"learning_rate": 9.646180549440038e-05,
"loss": 1.3858,
"mean_token_accuracy": 0.7092594999074936,
"num_tokens": 216522630.0,
"step": 8800
},
{
"entropy": 1.7790643846988679,
"epoch": 0.41927231381466745,
"grad_norm": 1.125771403312683,
"learning_rate": 9.638502973381389e-05,
"loss": 1.3779,
"mean_token_accuracy": 0.7110064566135407,
"num_tokens": 217765170.0,
"step": 8850
},
{
"entropy": 1.7827233350276948,
"epoch": 0.42164108394921357,
"grad_norm": 1.0116534233093262,
"learning_rate": 9.63074611306987e-05,
"loss": 1.3625,
"mean_token_accuracy": 0.7156530952453614,
"num_tokens": 218976869.0,
"step": 8900
},
{
"entropy": 1.75447958111763,
"epoch": 0.4240098540837597,
"grad_norm": 1.3180460929870605,
"learning_rate": 9.622910101090686e-05,
"loss": 1.3936,
"mean_token_accuracy": 0.7107756125926972,
"num_tokens": 220217849.0,
"step": 8950
},
{
"entropy": 1.748335200548172,
"epoch": 0.42637862421830586,
"grad_norm": 0.986765444278717,
"learning_rate": 9.614995071381956e-05,
"loss": 1.3734,
"mean_token_accuracy": 0.7136638331413269,
"num_tokens": 221451171.0,
"step": 9000
},
{
"epoch": 0.42637862421830586,
"eval_entropy": 1.1782315272926747,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7542952842323493,
"eval_num_tokens": 221451171.0,
"eval_runtime": 744.3949,
"eval_samples_per_second": 33.336,
"eval_steps_per_second": 4.167,
"step": 9000
},
{
"entropy": 1.762521461248398,
"epoch": 0.428747394352852,
"grad_norm": 1.1056315898895264,
"learning_rate": 9.607001159232418e-05,
"loss": 1.3411,
"mean_token_accuracy": 0.7177901411056519,
"num_tokens": 222644153.0,
"step": 9050
},
{
"entropy": 1.7698546504974366,
"epoch": 0.43111616448739815,
"grad_norm": 1.0218158960342407,
"learning_rate": 9.59892850127912e-05,
"loss": 1.3568,
"mean_token_accuracy": 0.7160427170991898,
"num_tokens": 223885271.0,
"step": 9100
},
{
"entropy": 1.7873007321357728,
"epoch": 0.43348493462194426,
"grad_norm": 1.0137804746627808,
"learning_rate": 9.590777235505085e-05,
"loss": 1.3578,
"mean_token_accuracy": 0.7130710703134536,
"num_tokens": 225093029.0,
"step": 9150
},
{
"entropy": 1.7597880065441132,
"epoch": 0.43585370475649043,
"grad_norm": 1.0279192924499512,
"learning_rate": 9.582547501236947e-05,
"loss": 1.3552,
"mean_token_accuracy": 0.7151528036594391,
"num_tokens": 226339608.0,
"step": 9200
},
{
"entropy": 1.7488136601448059,
"epoch": 0.43822247489103655,
"grad_norm": 1.2627191543579102,
"learning_rate": 9.574239439142576e-05,
"loss": 1.3368,
"mean_token_accuracy": 0.7172259968519211,
"num_tokens": 227578157.0,
"step": 9250
},
{
"entropy": 1.8161335122585296,
"epoch": 0.4405912450255827,
"grad_norm": 1.4642895460128784,
"learning_rate": 9.56585319122867e-05,
"loss": 1.3891,
"mean_token_accuracy": 0.7093940156698227,
"num_tokens": 228810604.0,
"step": 9300
},
{
"entropy": 1.7846631932258605,
"epoch": 0.44296001516012884,
"grad_norm": 1.0811119079589844,
"learning_rate": 9.557388900838334e-05,
"loss": 1.3681,
"mean_token_accuracy": 0.7125671052932739,
"num_tokens": 230055004.0,
"step": 9350
},
{
"entropy": 1.748952749967575,
"epoch": 0.445328785294675,
"grad_norm": 1.0202217102050781,
"learning_rate": 9.548846712648616e-05,
"loss": 1.355,
"mean_token_accuracy": 0.7164496505260467,
"num_tokens": 231284769.0,
"step": 9400
},
{
"entropy": 1.754820455312729,
"epoch": 0.44769755542922113,
"grad_norm": 1.2328052520751953,
"learning_rate": 9.540226772668053e-05,
"loss": 1.3402,
"mean_token_accuracy": 0.7169833314418793,
"num_tokens": 232505637.0,
"step": 9450
},
{
"entropy": 1.7367425131797791,
"epoch": 0.4500663255637673,
"grad_norm": 1.0527913570404053,
"learning_rate": 9.531529228234155e-05,
"loss": 1.3576,
"mean_token_accuracy": 0.7145136260986328,
"num_tokens": 233725437.0,
"step": 9500
},
{
"entropy": 1.733099582195282,
"epoch": 0.4524350956983134,
"grad_norm": 0.8144567608833313,
"learning_rate": 9.522754228010906e-05,
"loss": 1.3282,
"mean_token_accuracy": 0.720543931722641,
"num_tokens": 234955358.0,
"step": 9550
},
{
"entropy": 1.733365514278412,
"epoch": 0.4548038658328596,
"grad_norm": 1.0677859783172607,
"learning_rate": 9.513901921986206e-05,
"loss": 1.3275,
"mean_token_accuracy": 0.7202348792552948,
"num_tokens": 236197729.0,
"step": 9600
},
{
"entropy": 1.7879818844795228,
"epoch": 0.4571726359674057,
"grad_norm": 1.0054843425750732,
"learning_rate": 9.504972461469319e-05,
"loss": 1.3617,
"mean_token_accuracy": 0.7137482041120529,
"num_tokens": 237418727.0,
"step": 9650
},
{
"entropy": 1.7609144997596742,
"epoch": 0.4595414061019519,
"grad_norm": 1.252611756324768,
"learning_rate": 9.495965999088285e-05,
"loss": 1.3773,
"mean_token_accuracy": 0.7108764094114304,
"num_tokens": 238640440.0,
"step": 9700
},
{
"entropy": 1.7785017716884612,
"epoch": 0.461910176236498,
"grad_norm": 1.1619056463241577,
"learning_rate": 9.486882688787305e-05,
"loss": 1.3769,
"mean_token_accuracy": 0.7111158293485641,
"num_tokens": 239845699.0,
"step": 9750
},
{
"entropy": 1.7622488391399385,
"epoch": 0.46427894637104417,
"grad_norm": 1.2110604047775269,
"learning_rate": 9.477722685824114e-05,
"loss": 1.3853,
"mean_token_accuracy": 0.7111801999807358,
"num_tokens": 241057039.0,
"step": 9800
},
{
"entropy": 1.771475486755371,
"epoch": 0.4666477165055903,
"grad_norm": 0.9056064486503601,
"learning_rate": 9.46848614676733e-05,
"loss": 1.3612,
"mean_token_accuracy": 0.7140835148096084,
"num_tokens": 242271603.0,
"step": 9850
},
{
"entropy": 1.7718469250202178,
"epoch": 0.46901648664013645,
"grad_norm": 1.2525917291641235,
"learning_rate": 9.459173229493772e-05,
"loss": 1.3937,
"mean_token_accuracy": 0.7090546947717666,
"num_tokens": 243506199.0,
"step": 9900
},
{
"entropy": 1.7770234513282777,
"epoch": 0.47138525677468257,
"grad_norm": 1.0945196151733398,
"learning_rate": 9.449784093185765e-05,
"loss": 1.3913,
"mean_token_accuracy": 0.7097006791830063,
"num_tokens": 244728720.0,
"step": 9950
},
{
"entropy": 1.7675727343559264,
"epoch": 0.47375402690922874,
"grad_norm": 0.9690260291099548,
"learning_rate": 9.440318898328419e-05,
"loss": 1.3915,
"mean_token_accuracy": 0.7102323162555695,
"num_tokens": 245938116.0,
"step": 10000
},
{
"epoch": 0.47375402690922874,
"eval_entropy": 1.1980976138977295,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7557047149064385,
"eval_num_tokens": 245938116.0,
"eval_runtime": 744.8001,
"eval_samples_per_second": 33.318,
"eval_steps_per_second": 4.165,
"step": 10000
},
{
"entropy": 1.8029465341567994,
"epoch": 0.47612279704377486,
"grad_norm": 1.1126075983047485,
"learning_rate": 9.430777806706885e-05,
"loss": 1.425,
"mean_token_accuracy": 0.7034233027696609,
"num_tokens": 247144026.0,
"step": 10050
},
{
"entropy": 1.7569603097438813,
"epoch": 0.47849156717832103,
"grad_norm": 1.0091259479522705,
"learning_rate": 9.421160981403587e-05,
"loss": 1.3778,
"mean_token_accuracy": 0.7116102015972138,
"num_tokens": 248387083.0,
"step": 10100
},
{
"entropy": 1.730289832353592,
"epoch": 0.48086033731286715,
"grad_norm": 0.9621230959892273,
"learning_rate": 9.411468586795443e-05,
"loss": 1.3592,
"mean_token_accuracy": 0.7129039680957794,
"num_tokens": 249644502.0,
"step": 10150
},
{
"entropy": 1.7547185254096984,
"epoch": 0.4832291074474133,
"grad_norm": 1.0875402688980103,
"learning_rate": 9.401700788551047e-05,
"loss": 1.3664,
"mean_token_accuracy": 0.7126635414361954,
"num_tokens": 250876166.0,
"step": 10200
},
{
"entropy": 1.7428116750717164,
"epoch": 0.48559787758195944,
"grad_norm": 1.006138563156128,
"learning_rate": 9.391857753627837e-05,
"loss": 1.3673,
"mean_token_accuracy": 0.7143008214235306,
"num_tokens": 252091179.0,
"step": 10250
},
{
"entropy": 1.7458369052410125,
"epoch": 0.4879666477165056,
"grad_norm": 1.001531720161438,
"learning_rate": 9.381939650269249e-05,
"loss": 1.3674,
"mean_token_accuracy": 0.7141269159317016,
"num_tokens": 253307058.0,
"step": 10300
},
{
"entropy": 1.7744270980358123,
"epoch": 0.4903354178510517,
"grad_norm": 1.0080331563949585,
"learning_rate": 9.371946648001835e-05,
"loss": 1.383,
"mean_token_accuracy": 0.7098779672384262,
"num_tokens": 254550553.0,
"step": 10350
},
{
"entropy": 1.7707626497745514,
"epoch": 0.4927041879855979,
"grad_norm": 1.0779789686203003,
"learning_rate": 9.361878917632365e-05,
"loss": 1.3529,
"mean_token_accuracy": 0.7156933480501175,
"num_tokens": 255800272.0,
"step": 10400
},
{
"entropy": 1.7735213398933412,
"epoch": 0.495072958120144,
"grad_norm": 0.9861488342285156,
"learning_rate": 9.351736631244914e-05,
"loss": 1.352,
"mean_token_accuracy": 0.7177729392051697,
"num_tokens": 257029917.0,
"step": 10450
},
{
"entropy": 1.7585961294174195,
"epoch": 0.4974417282546902,
"grad_norm": 1.0564011335372925,
"learning_rate": 9.341519962197912e-05,
"loss": 1.3421,
"mean_token_accuracy": 0.7166235017776489,
"num_tokens": 258269464.0,
"step": 10500
},
{
"entropy": 1.745290095806122,
"epoch": 0.4998104983892363,
"grad_norm": 1.0212265253067017,
"learning_rate": 9.331229085121185e-05,
"loss": 1.3898,
"mean_token_accuracy": 0.710235812664032,
"num_tokens": 259531127.0,
"step": 10550
},
{
"entropy": 1.7529479134082795,
"epoch": 0.5021792685237825,
"grad_norm": 1.391863226890564,
"learning_rate": 9.320864175912972e-05,
"loss": 1.378,
"mean_token_accuracy": 0.7103132110834122,
"num_tokens": 260797490.0,
"step": 10600
},
{
"entropy": 1.75710902094841,
"epoch": 0.5045480386583286,
"grad_norm": 1.0978041887283325,
"learning_rate": 9.310425411736916e-05,
"loss": 1.3386,
"mean_token_accuracy": 0.7186200088262558,
"num_tokens": 262023377.0,
"step": 10650
},
{
"entropy": 1.7746818363666534,
"epoch": 0.5069168087928747,
"grad_norm": 1.0323866605758667,
"learning_rate": 9.299912971019036e-05,
"loss": 1.3641,
"mean_token_accuracy": 0.7137188649177552,
"num_tokens": 263266765.0,
"step": 10700
},
{
"entropy": 1.754684933423996,
"epoch": 0.5092855789274209,
"grad_norm": 0.9584967494010925,
"learning_rate": 9.289327033444674e-05,
"loss": 1.3668,
"mean_token_accuracy": 0.7127582091093063,
"num_tokens": 264493871.0,
"step": 10750
},
{
"entropy": 1.7670053398609162,
"epoch": 0.511654349061967,
"grad_norm": 1.0315459966659546,
"learning_rate": 9.278667779955437e-05,
"loss": 1.3966,
"mean_token_accuracy": 0.7078602635860443,
"num_tokens": 265716107.0,
"step": 10800
},
{
"entropy": 1.7422236442565917,
"epoch": 0.5140231191965132,
"grad_norm": 1.066741943359375,
"learning_rate": 9.267935392746081e-05,
"loss": 1.3224,
"mean_token_accuracy": 0.7229005527496338,
"num_tokens": 266969953.0,
"step": 10850
},
{
"entropy": 1.7432917177677154,
"epoch": 0.5163918893310593,
"grad_norm": 1.0382195711135864,
"learning_rate": 9.25713005526142e-05,
"loss": 1.3466,
"mean_token_accuracy": 0.7158039021492004,
"num_tokens": 268225977.0,
"step": 10900
},
{
"entropy": 1.7296686470508575,
"epoch": 0.5187606594656055,
"grad_norm": 1.1235915422439575,
"learning_rate": 9.246251952193176e-05,
"loss": 1.3222,
"mean_token_accuracy": 0.7224133855104446,
"num_tokens": 269466793.0,
"step": 10950
},
{
"entropy": 1.735136388540268,
"epoch": 0.5211294296001516,
"grad_norm": 0.990793764591217,
"learning_rate": 9.235301269476832e-05,
"loss": 1.3191,
"mean_token_accuracy": 0.7210667967796326,
"num_tokens": 270708159.0,
"step": 11000
},
{
"epoch": 0.5211294296001516,
"eval_entropy": 1.181336676109844,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7557077165440849,
"eval_num_tokens": 270708159.0,
"eval_runtime": 747.4423,
"eval_samples_per_second": 33.2,
"eval_steps_per_second": 4.15,
"step": 11000
},
{
"entropy": 1.751279581785202,
"epoch": 0.5234981997346978,
"grad_norm": 1.3285608291625977,
"learning_rate": 9.224278194288444e-05,
"loss": 1.3385,
"mean_token_accuracy": 0.7199172627925873,
"num_tokens": 271927990.0,
"step": 11050
},
{
"entropy": 1.7399055349826813,
"epoch": 0.5258669698692439,
"grad_norm": 1.395338535308838,
"learning_rate": 9.213182915041445e-05,
"loss": 1.3314,
"mean_token_accuracy": 0.7194273501634598,
"num_tokens": 273153187.0,
"step": 11100
},
{
"entropy": 1.73090322971344,
"epoch": 0.52823574000379,
"grad_norm": 1.6059190034866333,
"learning_rate": 9.202015621383431e-05,
"loss": 1.3223,
"mean_token_accuracy": 0.7193130904436111,
"num_tokens": 274381622.0,
"step": 11150
},
{
"entropy": 1.7387698328495025,
"epoch": 0.5306045101383362,
"grad_norm": 0.9370666742324829,
"learning_rate": 9.190776504192909e-05,
"loss": 1.3606,
"mean_token_accuracy": 0.7134118205308915,
"num_tokens": 275611193.0,
"step": 11200
},
{
"entropy": 1.7551235890388488,
"epoch": 0.5329732802728823,
"grad_norm": 0.972176730632782,
"learning_rate": 9.179465755576045e-05,
"loss": 1.4027,
"mean_token_accuracy": 0.7102609771490097,
"num_tokens": 276860392.0,
"step": 11250
},
{
"entropy": 1.742105484008789,
"epoch": 0.5353420504074284,
"grad_norm": 1.3585799932479858,
"learning_rate": 9.16808356886337e-05,
"loss": 1.3869,
"mean_token_accuracy": 0.7101844340562821,
"num_tokens": 278102635.0,
"step": 11300
},
{
"entropy": 1.7393697941303252,
"epoch": 0.5377108205419746,
"grad_norm": 0.9401509165763855,
"learning_rate": 9.156630138606484e-05,
"loss": 1.3764,
"mean_token_accuracy": 0.7136105120182037,
"num_tokens": 279342491.0,
"step": 11350
},
{
"entropy": 1.7620924258232116,
"epoch": 0.5400795906765208,
"grad_norm": 1.03669273853302,
"learning_rate": 9.145105660574725e-05,
"loss": 1.3836,
"mean_token_accuracy": 0.7112589359283448,
"num_tokens": 280562523.0,
"step": 11400
},
{
"entropy": 1.7693402111530303,
"epoch": 0.5424483608110668,
"grad_norm": 1.0556858777999878,
"learning_rate": 9.133510331751828e-05,
"loss": 1.3543,
"mean_token_accuracy": 0.7159368151426315,
"num_tokens": 281804551.0,
"step": 11450
},
{
"entropy": 1.7435523355007172,
"epoch": 0.544817130945613,
"grad_norm": 1.36162531375885,
"learning_rate": 9.121844350332549e-05,
"loss": 1.3505,
"mean_token_accuracy": 0.7172021287679672,
"num_tokens": 283039847.0,
"step": 11500
},
{
"entropy": 1.7690045988559724,
"epoch": 0.5471859010801592,
"grad_norm": 1.1119062900543213,
"learning_rate": 9.110107915719292e-05,
"loss": 1.3536,
"mean_token_accuracy": 0.7164638632535935,
"num_tokens": 284295808.0,
"step": 11550
},
{
"entropy": 1.7988950431346893,
"epoch": 0.5495546712147054,
"grad_norm": 1.2980992794036865,
"learning_rate": 9.098301228518683e-05,
"loss": 1.387,
"mean_token_accuracy": 0.7079293090105057,
"num_tokens": 285481962.0,
"step": 11600
},
{
"entropy": 1.7422871506214141,
"epoch": 0.5519234413492514,
"grad_norm": 1.0130205154418945,
"learning_rate": 9.086424490538157e-05,
"loss": 1.3488,
"mean_token_accuracy": 0.7166511958837509,
"num_tokens": 286739692.0,
"step": 11650
},
{
"entropy": 1.7421106839179992,
"epoch": 0.5542922114837976,
"grad_norm": 1.0390921831130981,
"learning_rate": 9.074477904782495e-05,
"loss": 1.3213,
"mean_token_accuracy": 0.7222142660617829,
"num_tokens": 287953436.0,
"step": 11700
},
{
"entropy": 1.7164568746089934,
"epoch": 0.5566609816183438,
"grad_norm": 0.9376536011695862,
"learning_rate": 9.062461675450366e-05,
"loss": 1.3204,
"mean_token_accuracy": 0.7219431722164154,
"num_tokens": 289187059.0,
"step": 11750
},
{
"entropy": 1.7607939064502716,
"epoch": 0.5590297517528899,
"grad_norm": 1.1221693754196167,
"learning_rate": 9.050376007930831e-05,
"loss": 1.358,
"mean_token_accuracy": 0.7148712009191514,
"num_tokens": 290395472.0,
"step": 11800
},
{
"entropy": 1.7365293169021607,
"epoch": 0.561398521887436,
"grad_norm": 1.2102606296539307,
"learning_rate": 9.038221108799832e-05,
"loss": 1.3362,
"mean_token_accuracy": 0.7193614053726196,
"num_tokens": 291650032.0,
"step": 11850
},
{
"entropy": 1.7262990617752074,
"epoch": 0.5637672920219822,
"grad_norm": 1.1103631258010864,
"learning_rate": 9.025997185816662e-05,
"loss": 1.3304,
"mean_token_accuracy": 0.7197805154323578,
"num_tokens": 292891757.0,
"step": 11900
},
{
"entropy": 1.7870515859127045,
"epoch": 0.5661360621565283,
"grad_norm": 1.2359330654144287,
"learning_rate": 9.013704447920407e-05,
"loss": 1.3947,
"mean_token_accuracy": 0.7112246352434158,
"num_tokens": 294108078.0,
"step": 11950
},
{
"entropy": 1.7402713179588318,
"epoch": 0.5685048322910745,
"grad_norm": 1.2696958780288696,
"learning_rate": 9.001343105226397e-05,
"loss": 1.3456,
"mean_token_accuracy": 0.7186821699142456,
"num_tokens": 295347523.0,
"step": 12000
},
{
"epoch": 0.5685048322910745,
"eval_entropy": 1.1782386238578055,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.755168300715283,
"eval_num_tokens": 295347523.0,
"eval_runtime": 747.9571,
"eval_samples_per_second": 33.177,
"eval_steps_per_second": 4.147,
"step": 12000
},
{
"entropy": 1.7823381924629211,
"epoch": 0.5708736024256206,
"grad_norm": 1.090854287147522,
"learning_rate": 8.988913369022585e-05,
"loss": 1.3752,
"mean_token_accuracy": 0.7125837200880051,
"num_tokens": 296596547.0,
"step": 12050
},
{
"entropy": 1.7829466736316681,
"epoch": 0.5732423725601667,
"grad_norm": 0.9252607226371765,
"learning_rate": 8.976415451765952e-05,
"loss": 1.3646,
"mean_token_accuracy": 0.7142701143026352,
"num_tokens": 297794903.0,
"step": 12100
},
{
"entropy": 1.7680902397632599,
"epoch": 0.5756111426947129,
"grad_norm": 1.0627940893173218,
"learning_rate": 8.96384956707888e-05,
"loss": 1.3628,
"mean_token_accuracy": 0.7150187093019486,
"num_tokens": 299029935.0,
"step": 12150
},
{
"entropy": 1.7492178344726563,
"epoch": 0.5779799128292591,
"grad_norm": 1.2822635173797607,
"learning_rate": 8.951215929745486e-05,
"loss": 1.3594,
"mean_token_accuracy": 0.71372334420681,
"num_tokens": 300256525.0,
"step": 12200
},
{
"entropy": 1.7705177330970765,
"epoch": 0.5803486829638052,
"grad_norm": 1.1303389072418213,
"learning_rate": 8.93851475570796e-05,
"loss": 1.3498,
"mean_token_accuracy": 0.7154391181468963,
"num_tokens": 301466189.0,
"step": 12250
},
{
"entropy": 1.7693954205513,
"epoch": 0.5827174530983513,
"grad_norm": 1.0360733270645142,
"learning_rate": 8.925746262062879e-05,
"loss": 1.3523,
"mean_token_accuracy": 0.71549709379673,
"num_tokens": 302677547.0,
"step": 12300
},
{
"entropy": 1.7499237847328186,
"epoch": 0.5850862232328975,
"grad_norm": 1.2163889408111572,
"learning_rate": 8.912910667057482e-05,
"loss": 1.3219,
"mean_token_accuracy": 0.7233135092258454,
"num_tokens": 303884552.0,
"step": 12350
},
{
"entropy": 1.7528709161281586,
"epoch": 0.5874549933674437,
"grad_norm": 1.0694142580032349,
"learning_rate": 8.900008190085946e-05,
"loss": 1.3695,
"mean_token_accuracy": 0.7140274894237518,
"num_tokens": 305112064.0,
"step": 12400
},
{
"entropy": 1.7593362927436829,
"epoch": 0.5898237635019897,
"grad_norm": 0.9559013247489929,
"learning_rate": 8.887039051685646e-05,
"loss": 1.3538,
"mean_token_accuracy": 0.7164691358804702,
"num_tokens": 306349750.0,
"step": 12450
},
{
"entropy": 1.7250176286697387,
"epoch": 0.5921925336365359,
"grad_norm": 1.0856672525405884,
"learning_rate": 8.874003473533372e-05,
"loss": 1.3617,
"mean_token_accuracy": 0.7142321610450745,
"num_tokens": 307589875.0,
"step": 12500
},
{
"entropy": 1.7204779553413392,
"epoch": 0.5945613037710821,
"grad_norm": 1.0638339519500732,
"learning_rate": 8.860901678441542e-05,
"loss": 1.3523,
"mean_token_accuracy": 0.7161801540851593,
"num_tokens": 308844739.0,
"step": 12550
},
{
"entropy": 1.7241905891895295,
"epoch": 0.5969300739056282,
"grad_norm": 1.0859177112579346,
"learning_rate": 8.847733890354397e-05,
"loss": 1.3558,
"mean_token_accuracy": 0.714522579908371,
"num_tokens": 310070098.0,
"step": 12600
},
{
"entropy": 1.735662100315094,
"epoch": 0.5992988440401743,
"grad_norm": 1.1100165843963623,
"learning_rate": 8.834500334344178e-05,
"loss": 1.363,
"mean_token_accuracy": 0.7140331470966339,
"num_tokens": 311292251.0,
"step": 12650
},
{
"entropy": 1.7426686155796052,
"epoch": 0.6016676141747205,
"grad_norm": 1.109788179397583,
"learning_rate": 8.821201236607266e-05,
"loss": 1.3491,
"mean_token_accuracy": 0.7144311499595642,
"num_tokens": 312573175.0,
"step": 12700
},
{
"entropy": 1.7310996508598329,
"epoch": 0.6040363843092666,
"grad_norm": 1.4260696172714233,
"learning_rate": 8.807836824460329e-05,
"loss": 1.3352,
"mean_token_accuracy": 0.7185973340272903,
"num_tokens": 313821355.0,
"step": 12750
},
{
"entropy": 1.7413757181167602,
"epoch": 0.6064051544438128,
"grad_norm": 0.9746555685997009,
"learning_rate": 8.794407326336427e-05,
"loss": 1.3168,
"mean_token_accuracy": 0.7220592141151428,
"num_tokens": 315041303.0,
"step": 12800
},
{
"entropy": 1.7303865098953246,
"epoch": 0.6087739245783589,
"grad_norm": 0.892135739326477,
"learning_rate": 8.780912971781112e-05,
"loss": 1.3201,
"mean_token_accuracy": 0.7211132681369782,
"num_tokens": 316288409.0,
"step": 12850
},
{
"entropy": 1.7497126710414888,
"epoch": 0.611142694712905,
"grad_norm": 1.199959397315979,
"learning_rate": 8.767353991448503e-05,
"loss": 1.3052,
"mean_token_accuracy": 0.7245729100704194,
"num_tokens": 317526338.0,
"step": 12900
},
{
"entropy": 1.7545914590358733,
"epoch": 0.6135114648474512,
"grad_norm": 0.9794778227806091,
"learning_rate": 8.753730617097342e-05,
"loss": 1.3417,
"mean_token_accuracy": 0.7178518337011337,
"num_tokens": 318776423.0,
"step": 12950
},
{
"entropy": 1.761199436187744,
"epoch": 0.6158802349819974,
"grad_norm": 1.115660548210144,
"learning_rate": 8.740043081587043e-05,
"loss": 1.3428,
"mean_token_accuracy": 0.71872696518898,
"num_tokens": 319970665.0,
"step": 13000
},
{
"epoch": 0.6158802349819974,
"eval_entropy": 1.173806337542414,
"eval_loss": null,
"eval_mean_token_accuracy": 0.755712005311562,
"eval_num_tokens": 319970665.0,
"eval_runtime": 729.7637,
"eval_samples_per_second": 34.004,
"eval_steps_per_second": 4.251,
"step": 13000
},
{
"entropy": 1.7386520493030548,
"epoch": 0.6182490051165435,
"grad_norm": 1.0832492113113403,
"learning_rate": 8.726291618873692e-05,
"loss": 1.3185,
"mean_token_accuracy": 0.7225498640537262,
"num_tokens": 321195496.0,
"step": 13050
},
{
"entropy": 1.788149139881134,
"epoch": 0.6206177752510896,
"grad_norm": 1.0728507041931152,
"learning_rate": 8.712476464006069e-05,
"loss": 1.3687,
"mean_token_accuracy": 0.7138838738203048,
"num_tokens": 322394051.0,
"step": 13100
},
{
"entropy": 1.7250337314605713,
"epoch": 0.6229865453856358,
"grad_norm": 0.9454106688499451,
"learning_rate": 8.698597853121613e-05,
"loss": 1.3206,
"mean_token_accuracy": 0.7232500827312469,
"num_tokens": 323646049.0,
"step": 13150
},
{
"entropy": 1.7228785872459411,
"epoch": 0.625355315520182,
"grad_norm": 1.074063777923584,
"learning_rate": 8.684656023442404e-05,
"loss": 1.3416,
"mean_token_accuracy": 0.7188435053825378,
"num_tokens": 324901290.0,
"step": 13200
},
{
"entropy": 1.7498575222492219,
"epoch": 0.627724085654728,
"grad_norm": 1.3152785301208496,
"learning_rate": 8.670651213271087e-05,
"loss": 1.3495,
"mean_token_accuracy": 0.7163092708587646,
"num_tokens": 326143599.0,
"step": 13250
},
{
"entropy": 1.773080164194107,
"epoch": 0.6300928557892742,
"grad_norm": 1.117574691772461,
"learning_rate": 8.656583661986815e-05,
"loss": 1.3716,
"mean_token_accuracy": 0.7143875294923783,
"num_tokens": 327369948.0,
"step": 13300
},
{
"entropy": 1.756659119129181,
"epoch": 0.6324616259238204,
"grad_norm": 1.0091075897216797,
"learning_rate": 8.642453610041152e-05,
"loss": 1.3815,
"mean_token_accuracy": 0.7113278949260712,
"num_tokens": 328609411.0,
"step": 13350
},
{
"entropy": 1.7753915119171142,
"epoch": 0.6348303960583664,
"grad_norm": 0.9333689212799072,
"learning_rate": 8.628261298953963e-05,
"loss": 1.3478,
"mean_token_accuracy": 0.7168344795703888,
"num_tokens": 329812629.0,
"step": 13400
},
{
"entropy": 1.7191705119609832,
"epoch": 0.6371991661929126,
"grad_norm": 0.9254161715507507,
"learning_rate": 8.614006971309287e-05,
"loss": 1.32,
"mean_token_accuracy": 0.7235176879167556,
"num_tokens": 331045306.0,
"step": 13450
},
{
"entropy": 1.7539090728759765,
"epoch": 0.6395679363274588,
"grad_norm": 1.135908842086792,
"learning_rate": 8.599690870751189e-05,
"loss": 1.2991,
"mean_token_accuracy": 0.7238886666297912,
"num_tokens": 332265198.0,
"step": 13500
},
{
"entropy": 1.7759091782569885,
"epoch": 0.641936706462005,
"grad_norm": 0.9939352869987488,
"learning_rate": 8.585313241979593e-05,
"loss": 1.3446,
"mean_token_accuracy": 0.7167073094844818,
"num_tokens": 333478621.0,
"step": 13550
},
{
"entropy": 1.8198255062103272,
"epoch": 0.644305476596551,
"grad_norm": 1.1110658645629883,
"learning_rate": 8.570874330746109e-05,
"loss": 1.3429,
"mean_token_accuracy": 0.7163071328401566,
"num_tokens": 334679776.0,
"step": 13600
},
{
"entropy": 1.7491872441768646,
"epoch": 0.6466742467310972,
"grad_norm": 1.102397084236145,
"learning_rate": 8.556374383849815e-05,
"loss": 1.3429,
"mean_token_accuracy": 0.7170016378164291,
"num_tokens": 335924366.0,
"step": 13650
},
{
"entropy": 1.7320797193050383,
"epoch": 0.6490430168656434,
"grad_norm": 0.9770281910896301,
"learning_rate": 8.541813649133064e-05,
"loss": 1.3012,
"mean_token_accuracy": 0.7252740359306336,
"num_tokens": 337177387.0,
"step": 13700
},
{
"entropy": 1.7851051843166352,
"epoch": 0.6514117870001895,
"grad_norm": 1.2061119079589844,
"learning_rate": 8.52719237547722e-05,
"loss": 1.3423,
"mean_token_accuracy": 0.7174914568662644,
"num_tokens": 338406873.0,
"step": 13750
},
{
"entropy": 1.7197813856601716,
"epoch": 0.6537805571347356,
"grad_norm": 1.0583444833755493,
"learning_rate": 8.512510812798426e-05,
"loss": 1.3451,
"mean_token_accuracy": 0.7177790975570679,
"num_tokens": 339627417.0,
"step": 13800
},
{
"entropy": 1.7355473148822784,
"epoch": 0.6561493272692818,
"grad_norm": 1.1621958017349243,
"learning_rate": 8.49776921204332e-05,
"loss": 1.3587,
"mean_token_accuracy": 0.7146015846729279,
"num_tokens": 340857014.0,
"step": 13850
},
{
"entropy": 1.7453387939929963,
"epoch": 0.6585180974038279,
"grad_norm": 1.0361634492874146,
"learning_rate": 8.48296782518475e-05,
"loss": 1.3769,
"mean_token_accuracy": 0.7130674320459366,
"num_tokens": 342093808.0,
"step": 13900
},
{
"entropy": 1.7622779953479766,
"epoch": 0.6608868675383741,
"grad_norm": 1.2546138763427734,
"learning_rate": 8.468106905217465e-05,
"loss": 1.348,
"mean_token_accuracy": 0.7160887461900711,
"num_tokens": 343326476.0,
"step": 13950
},
{
"entropy": 1.7349292349815368,
"epoch": 0.6632556376729202,
"grad_norm": 0.9984197020530701,
"learning_rate": 8.453186706153789e-05,
"loss": 1.301,
"mean_token_accuracy": 0.7255065280199051,
"num_tokens": 344557978.0,
"step": 14000
},
{
"epoch": 0.6632556376729202,
"eval_entropy": 1.1754964998965876,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7568369234841843,
"eval_num_tokens": 344557978.0,
"eval_runtime": 728.4569,
"eval_samples_per_second": 34.065,
"eval_steps_per_second": 4.258,
"step": 14000
},
{
"entropy": 1.786487684249878,
"epoch": 0.6656244078074663,
"grad_norm": 1.1214771270751953,
"learning_rate": 8.438207483019291e-05,
"loss": 1.3981,
"mean_token_accuracy": 0.7088551700115204,
"num_tokens": 345789604.0,
"step": 14050
},
{
"entropy": 1.7951791512966155,
"epoch": 0.6679931779420125,
"grad_norm": 1.0142500400543213,
"learning_rate": 8.42316949184841e-05,
"loss": 1.3948,
"mean_token_accuracy": 0.7093900120258332,
"num_tokens": 347013721.0,
"step": 14100
},
{
"entropy": 1.7639971029758454,
"epoch": 0.6703619480765587,
"grad_norm": 1.0026280879974365,
"learning_rate": 8.408072989680087e-05,
"loss": 1.3031,
"mean_token_accuracy": 0.7246174013614655,
"num_tokens": 348211806.0,
"step": 14150
},
{
"entropy": 1.7443763566017152,
"epoch": 0.6727307182111048,
"grad_norm": 1.0735307931900024,
"learning_rate": 8.39291823455337e-05,
"loss": 1.3052,
"mean_token_accuracy": 0.7236789721250534,
"num_tokens": 349491496.0,
"step": 14200
},
{
"entropy": 1.7840288174152374,
"epoch": 0.6750994883456509,
"grad_norm": 1.1240233182907104,
"learning_rate": 8.377705485503007e-05,
"loss": 1.3545,
"mean_token_accuracy": 0.7152435338497162,
"num_tokens": 350709829.0,
"step": 14250
},
{
"entropy": 1.7674752044677735,
"epoch": 0.6774682584801971,
"grad_norm": 0.9507238864898682,
"learning_rate": 8.36243500255501e-05,
"loss": 1.3193,
"mean_token_accuracy": 0.7238988935947418,
"num_tokens": 351910781.0,
"step": 14300
},
{
"entropy": 1.7877480947971345,
"epoch": 0.6798370286147433,
"grad_norm": 0.9499313831329346,
"learning_rate": 8.34710704672222e-05,
"loss": 1.3392,
"mean_token_accuracy": 0.7172271001338959,
"num_tokens": 353134751.0,
"step": 14350
},
{
"entropy": 1.7766230964660645,
"epoch": 0.6822057987492893,
"grad_norm": 1.2760356664657593,
"learning_rate": 8.331721879999841e-05,
"loss": 1.3595,
"mean_token_accuracy": 0.7147215807437897,
"num_tokens": 354350033.0,
"step": 14400
},
{
"entropy": 1.7614100205898284,
"epoch": 0.6845745688838355,
"grad_norm": 1.043785572052002,
"learning_rate": 8.316279765360957e-05,
"loss": 1.3879,
"mean_token_accuracy": 0.7108203011751175,
"num_tokens": 355573758.0,
"step": 14450
},
{
"entropy": 1.7683514368534088,
"epoch": 0.6869433390183817,
"grad_norm": 1.1136603355407715,
"learning_rate": 8.300780966752049e-05,
"loss": 1.3451,
"mean_token_accuracy": 0.7161549615859986,
"num_tokens": 356822721.0,
"step": 14500
},
{
"entropy": 1.7504412484169007,
"epoch": 0.6893121091529278,
"grad_norm": 1.132605791091919,
"learning_rate": 8.28522574908847e-05,
"loss": 1.3433,
"mean_token_accuracy": 0.7193030816316605,
"num_tokens": 358090609.0,
"step": 14550
},
{
"entropy": 1.7415921115875244,
"epoch": 0.6916808792874739,
"grad_norm": 0.901418924331665,
"learning_rate": 8.269614378249932e-05,
"loss": 1.3098,
"mean_token_accuracy": 0.7223669987916946,
"num_tokens": 359334849.0,
"step": 14600
},
{
"entropy": 1.72568878531456,
"epoch": 0.6940496494220201,
"grad_norm": 1.2013583183288574,
"learning_rate": 8.253947121075942e-05,
"loss": 1.3413,
"mean_token_accuracy": 0.7166631370782852,
"num_tokens": 360565890.0,
"step": 14650
},
{
"entropy": 1.7351550233364106,
"epoch": 0.6964184195565662,
"grad_norm": 0.9248843193054199,
"learning_rate": 8.238224245361262e-05,
"loss": 1.3269,
"mean_token_accuracy": 0.7205180561542511,
"num_tokens": 361780402.0,
"step": 14700
},
{
"entropy": 1.7353229641914367,
"epoch": 0.6987871896911124,
"grad_norm": 0.9147818088531494,
"learning_rate": 8.222446019851314e-05,
"loss": 1.3239,
"mean_token_accuracy": 0.7209709006547927,
"num_tokens": 362998310.0,
"step": 14750
},
{
"entropy": 1.7628222048282622,
"epoch": 0.7011559598256585,
"grad_norm": 1.0660256147384644,
"learning_rate": 8.206612714237601e-05,
"loss": 1.3736,
"mean_token_accuracy": 0.7127251303195954,
"num_tokens": 364192705.0,
"step": 14800
},
{
"entropy": 1.7622958242893219,
"epoch": 0.7035247299602047,
"grad_norm": 1.133527398109436,
"learning_rate": 8.190724599153083e-05,
"loss": 1.3252,
"mean_token_accuracy": 0.7197421258687973,
"num_tokens": 365419544.0,
"step": 14850
},
{
"entropy": 1.7733166551589965,
"epoch": 0.7058935000947508,
"grad_norm": 1.0449475049972534,
"learning_rate": 8.174781946167563e-05,
"loss": 1.3422,
"mean_token_accuracy": 0.7184215635061264,
"num_tokens": 366668472.0,
"step": 14900
},
{
"entropy": 1.7824318826198577,
"epoch": 0.708262270229297,
"grad_norm": 0.9425482749938965,
"learning_rate": 8.158785027783038e-05,
"loss": 1.351,
"mean_token_accuracy": 0.7144128715991974,
"num_tokens": 367883921.0,
"step": 14950
},
{
"entropy": 1.7383503484725953,
"epoch": 0.7106310403638431,
"grad_norm": 1.0266870260238647,
"learning_rate": 8.14273411742905e-05,
"loss": 1.3003,
"mean_token_accuracy": 0.7255125510692596,
"num_tokens": 369125857.0,
"step": 15000
},
{
"epoch": 0.7106310403638431,
"eval_entropy": 1.1784183711370755,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7566592082102632,
"eval_num_tokens": 369125857.0,
"eval_runtime": 728.4679,
"eval_samples_per_second": 34.065,
"eval_steps_per_second": 4.258,
"step": 15000
},
{
"entropy": 1.7569481348991394,
"epoch": 0.7129998104983892,
"grad_norm": 1.0710499286651611,
"learning_rate": 8.126629489457998e-05,
"loss": 1.3493,
"mean_token_accuracy": 0.7171416920423508,
"num_tokens": 370360655.0,
"step": 15050
},
{
"entropy": 1.7639206099510192,
"epoch": 0.7153685806329354,
"grad_norm": 1.023205041885376,
"learning_rate": 8.110471419140461e-05,
"loss": 1.3816,
"mean_token_accuracy": 0.7107687264680862,
"num_tokens": 371611998.0,
"step": 15100
},
{
"entropy": 1.7124925446510315,
"epoch": 0.7177373507674816,
"grad_norm": 1.4796391725540161,
"learning_rate": 8.094260182660491e-05,
"loss": 1.3103,
"mean_token_accuracy": 0.7245303303003311,
"num_tokens": 372852886.0,
"step": 15150
},
{
"entropy": 1.7330104005336762,
"epoch": 0.7201061209020276,
"grad_norm": 1.1054223775863647,
"learning_rate": 8.077996057110881e-05,
"loss": 1.3446,
"mean_token_accuracy": 0.7186214071512222,
"num_tokens": 374060791.0,
"step": 15200
},
{
"entropy": 1.781588876247406,
"epoch": 0.7224748910365738,
"grad_norm": 1.2375303506851196,
"learning_rate": 8.06167932048845e-05,
"loss": 1.3815,
"mean_token_accuracy": 0.7106660062074661,
"num_tokens": 375260162.0,
"step": 15250
},
{
"entropy": 1.7753887116909026,
"epoch": 0.72484366117112,
"grad_norm": 1.0260518789291382,
"learning_rate": 8.045310251689269e-05,
"loss": 1.3782,
"mean_token_accuracy": 0.7120629328489304,
"num_tokens": 376480540.0,
"step": 15300
},
{
"entropy": 1.7605856931209565,
"epoch": 0.7272124313056662,
"grad_norm": 1.0135972499847412,
"learning_rate": 8.028889130503908e-05,
"loss": 1.3664,
"mean_token_accuracy": 0.714390983581543,
"num_tokens": 377707870.0,
"step": 15350
},
{
"entropy": 1.7473648416996002,
"epoch": 0.7295812014402122,
"grad_norm": 1.5319159030914307,
"learning_rate": 8.012416237612651e-05,
"loss": 1.3251,
"mean_token_accuracy": 0.7199866360425949,
"num_tokens": 378945180.0,
"step": 15400
},
{
"entropy": 1.7613112390041352,
"epoch": 0.7319499715747584,
"grad_norm": 1.1516921520233154,
"learning_rate": 7.995891854580694e-05,
"loss": 1.3398,
"mean_token_accuracy": 0.7185401087999344,
"num_tokens": 380202318.0,
"step": 15450
},
{
"entropy": 1.7631869399547577,
"epoch": 0.7343187417093046,
"grad_norm": 1.2842717170715332,
"learning_rate": 7.979316263853338e-05,
"loss": 1.3246,
"mean_token_accuracy": 0.7208184325695037,
"num_tokens": 381422244.0,
"step": 15500
},
{
"entropy": 1.7426608395576477,
"epoch": 0.7366875118438506,
"grad_norm": 1.2845314741134644,
"learning_rate": 7.962689748751158e-05,
"loss": 1.3073,
"mean_token_accuracy": 0.7258092379570007,
"num_tokens": 382656317.0,
"step": 15550
},
{
"entropy": 1.7413418543338777,
"epoch": 0.7390562819783968,
"grad_norm": 1.1051653623580933,
"learning_rate": 7.94601259346516e-05,
"loss": 1.3248,
"mean_token_accuracy": 0.7227061313390731,
"num_tokens": 383887770.0,
"step": 15600
},
{
"entropy": 1.7754042732715607,
"epoch": 0.741425052112943,
"grad_norm": 1.0099495649337769,
"learning_rate": 7.929285083051921e-05,
"loss": 1.3818,
"mean_token_accuracy": 0.713128559589386,
"num_tokens": 385130213.0,
"step": 15650
},
{
"entropy": 1.7583998191356658,
"epoch": 0.7437938222474891,
"grad_norm": 1.0357869863510132,
"learning_rate": 7.912507503428728e-05,
"loss": 1.3513,
"mean_token_accuracy": 0.716811910867691,
"num_tokens": 386352005.0,
"step": 15700
},
{
"entropy": 1.7743099415302277,
"epoch": 0.7461625923820352,
"grad_norm": 1.10836660861969,
"learning_rate": 7.895680141368678e-05,
"loss": 1.3314,
"mean_token_accuracy": 0.7205884575843811,
"num_tokens": 387565047.0,
"step": 15750
},
{
"entropy": 1.7909239864349364,
"epoch": 0.7485313625165814,
"grad_norm": 1.0026726722717285,
"learning_rate": 7.87880328449578e-05,
"loss": 1.3547,
"mean_token_accuracy": 0.7177285236120224,
"num_tokens": 388787505.0,
"step": 15800
},
{
"entropy": 1.7829215788841248,
"epoch": 0.7509001326511275,
"grad_norm": 1.3079992532730103,
"learning_rate": 7.86187722128004e-05,
"loss": 1.329,
"mean_token_accuracy": 0.720749350786209,
"num_tokens": 390046573.0,
"step": 15850
},
{
"entropy": 1.7563760423660277,
"epoch": 0.7532689027856737,
"grad_norm": 1.1663581132888794,
"learning_rate": 7.844902241032535e-05,
"loss": 1.3364,
"mean_token_accuracy": 0.7199984455108642,
"num_tokens": 391284239.0,
"step": 15900
},
{
"entropy": 1.7583712506294251,
"epoch": 0.7556376729202198,
"grad_norm": 1.0669708251953125,
"learning_rate": 7.827878633900461e-05,
"loss": 1.3233,
"mean_token_accuracy": 0.7232204431295395,
"num_tokens": 392511286.0,
"step": 15950
},
{
"entropy": 1.7710711109638213,
"epoch": 0.7580064430547659,
"grad_norm": 1.1993380784988403,
"learning_rate": 7.81080669086217e-05,
"loss": 1.3633,
"mean_token_accuracy": 0.7153352189064026,
"num_tokens": 393762386.0,
"step": 16000
},
{
"epoch": 0.7580064430547659,
"eval_entropy": 1.205138478718751,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7560521679240454,
"eval_num_tokens": 393762386.0,
"eval_runtime": 728.478,
"eval_samples_per_second": 34.064,
"eval_steps_per_second": 4.258,
"step": 16000
},
{
"entropy": 1.7863057303428649,
"epoch": 0.7603752131893121,
"grad_norm": 0.9311954975128174,
"learning_rate": 7.793686703722212e-05,
"loss": 1.3477,
"mean_token_accuracy": 0.7189880239963532,
"num_tokens": 394971495.0,
"step": 16050
},
{
"entropy": 1.747820656299591,
"epoch": 0.7627439833238583,
"grad_norm": 1.0288971662521362,
"learning_rate": 7.776518965106327e-05,
"loss": 1.3034,
"mean_token_accuracy": 0.7258507144451142,
"num_tokens": 396221548.0,
"step": 16100
},
{
"entropy": 1.7468533515930176,
"epoch": 0.7651127534584043,
"grad_norm": 1.0182217359542847,
"learning_rate": 7.759303768456463e-05,
"loss": 1.3123,
"mean_token_accuracy": 0.7229688459634781,
"num_tokens": 397439687.0,
"step": 16150
},
{
"entropy": 1.7685637962818146,
"epoch": 0.7674815235929505,
"grad_norm": 1.1658631563186646,
"learning_rate": 7.742041408025747e-05,
"loss": 1.3163,
"mean_token_accuracy": 0.7229421508312225,
"num_tokens": 398648499.0,
"step": 16200
},
{
"entropy": 1.74519140958786,
"epoch": 0.7698502937274967,
"grad_norm": 1.0480293035507202,
"learning_rate": 7.724732178873456e-05,
"loss": 1.3396,
"mean_token_accuracy": 0.7191978305578232,
"num_tokens": 399900933.0,
"step": 16250
},
{
"entropy": 1.73216095328331,
"epoch": 0.7722190638620429,
"grad_norm": 1.105089783668518,
"learning_rate": 7.707376376859984e-05,
"loss": 1.3092,
"mean_token_accuracy": 0.7250679528713226,
"num_tokens": 401117830.0,
"step": 16300
},
{
"entropy": 1.76406853556633,
"epoch": 0.7745878339965889,
"grad_norm": 1.2632781267166138,
"learning_rate": 7.689974298641773e-05,
"loss": 1.3509,
"mean_token_accuracy": 0.7167004567384719,
"num_tokens": 402347744.0,
"step": 16350
},
{
"entropy": 1.7978839790821075,
"epoch": 0.7769566041311351,
"grad_norm": 1.0637677907943726,
"learning_rate": 7.672526241666248e-05,
"loss": 1.3469,
"mean_token_accuracy": 0.71729552090168,
"num_tokens": 403549647.0,
"step": 16400
},
{
"entropy": 1.764045135974884,
"epoch": 0.7793253742656813,
"grad_norm": 0.9130464196205139,
"learning_rate": 7.655032504166735e-05,
"loss": 1.3204,
"mean_token_accuracy": 0.7207730168104172,
"num_tokens": 404774771.0,
"step": 16450
},
{
"entropy": 1.7537085354328155,
"epoch": 0.7816941444002274,
"grad_norm": 1.1020361185073853,
"learning_rate": 7.637493385157358e-05,
"loss": 1.327,
"mean_token_accuracy": 0.7206742608547211,
"num_tokens": 406011265.0,
"step": 16500
},
{
"entropy": 1.755173259973526,
"epoch": 0.7840629145347735,
"grad_norm": 0.9496687650680542,
"learning_rate": 7.619909184427934e-05,
"loss": 1.3013,
"mean_token_accuracy": 0.7237769782543182,
"num_tokens": 407262276.0,
"step": 16550
},
{
"entropy": 1.8000263261795044,
"epoch": 0.7864316846693197,
"grad_norm": 1.295494556427002,
"learning_rate": 7.602280202538839e-05,
"loss": 1.3753,
"mean_token_accuracy": 0.7130093973875046,
"num_tokens": 408508718.0,
"step": 16600
},
{
"entropy": 1.746513249874115,
"epoch": 0.7888004548038658,
"grad_norm": 1.1544225215911865,
"learning_rate": 7.584606740815885e-05,
"loss": 1.3246,
"mean_token_accuracy": 0.7214300912618637,
"num_tokens": 409745538.0,
"step": 16650
},
{
"entropy": 1.8098858451843263,
"epoch": 0.791169224938412,
"grad_norm": 0.9912792444229126,
"learning_rate": 7.566889101345156e-05,
"loss": 1.3452,
"mean_token_accuracy": 0.7167094177007676,
"num_tokens": 410988780.0,
"step": 16700
},
{
"entropy": 1.735760669708252,
"epoch": 0.7935379950729581,
"grad_norm": 0.9103946685791016,
"learning_rate": 7.549127586967853e-05,
"loss": 1.3319,
"mean_token_accuracy": 0.7208295828104019,
"num_tokens": 412261045.0,
"step": 16750
},
{
"entropy": 1.7235812985897063,
"epoch": 0.7959067652075043,
"grad_norm": 0.9902112483978271,
"learning_rate": 7.531322501275114e-05,
"loss": 1.3523,
"mean_token_accuracy": 0.7184983837604523,
"num_tokens": 413490577.0,
"step": 16800
},
{
"entropy": 1.7275058662891387,
"epoch": 0.7982755353420504,
"grad_norm": 0.857623279094696,
"learning_rate": 7.513474148602826e-05,
"loss": 1.3474,
"mean_token_accuracy": 0.71783855676651,
"num_tokens": 414734324.0,
"step": 16850
},
{
"entropy": 1.712952392101288,
"epoch": 0.8006443054765966,
"grad_norm": 0.8611600399017334,
"learning_rate": 7.495582834026421e-05,
"loss": 1.3284,
"mean_token_accuracy": 0.7218652653694153,
"num_tokens": 415979550.0,
"step": 16900
},
{
"entropy": 1.7426096272468568,
"epoch": 0.8030130756111427,
"grad_norm": 1.153913140296936,
"learning_rate": 7.47764886335567e-05,
"loss": 1.3673,
"mean_token_accuracy": 0.7147996026277542,
"num_tokens": 417172690.0,
"step": 16950
},
{
"entropy": 1.7134468042850495,
"epoch": 0.8053818457456888,
"grad_norm": 0.9624414443969727,
"learning_rate": 7.459672543129438e-05,
"loss": 1.3301,
"mean_token_accuracy": 0.7208444583415985,
"num_tokens": 418396867.0,
"step": 17000
},
{
"epoch": 0.8053818457456888,
"eval_entropy": 1.1603839400696954,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7568193746719877,
"eval_num_tokens": 418396867.0,
"eval_runtime": 730.007,
"eval_samples_per_second": 33.993,
"eval_steps_per_second": 4.249,
"step": 17000
},
{
"entropy": 1.7083112740516662,
"epoch": 0.807750615880235,
"grad_norm": 1.0578949451446533,
"learning_rate": 7.441654180610466e-05,
"loss": 1.3116,
"mean_token_accuracy": 0.7241713929176331,
"num_tokens": 419620242.0,
"step": 17050
},
{
"entropy": 1.7101111936569213,
"epoch": 0.8101193860147812,
"grad_norm": 1.4528478384017944,
"learning_rate": 7.423594083780106e-05,
"loss": 1.2894,
"mean_token_accuracy": 0.7281060153245926,
"num_tokens": 420840365.0,
"step": 17100
},
{
"entropy": 1.7118054771423339,
"epoch": 0.8124881561493272,
"grad_norm": 0.9906657338142395,
"learning_rate": 7.405492561333052e-05,
"loss": 1.3313,
"mean_token_accuracy": 0.7208691501617431,
"num_tokens": 422103109.0,
"step": 17150
},
{
"entropy": 1.743634682893753,
"epoch": 0.8148569262838734,
"grad_norm": 1.1013976335525513,
"learning_rate": 7.387349922672082e-05,
"loss": 1.3435,
"mean_token_accuracy": 0.7182679337263107,
"num_tokens": 423336510.0,
"step": 17200
},
{
"entropy": 1.7483525812625884,
"epoch": 0.8172256964184196,
"grad_norm": 1.1249130964279175,
"learning_rate": 7.369166477902753e-05,
"loss": 1.3356,
"mean_token_accuracy": 0.7192718476057053,
"num_tokens": 424558544.0,
"step": 17250
},
{
"entropy": 1.7446792232990265,
"epoch": 0.8195944665529658,
"grad_norm": 0.9279561042785645,
"learning_rate": 7.350942537828105e-05,
"loss": 1.357,
"mean_token_accuracy": 0.7167576777935029,
"num_tokens": 425791336.0,
"step": 17300
},
{
"entropy": 1.7697773826122285,
"epoch": 0.8219632366875118,
"grad_norm": 1.0355454683303833,
"learning_rate": 7.332678413943352e-05,
"loss": 1.3279,
"mean_token_accuracy": 0.7211151129007339,
"num_tokens": 427017026.0,
"step": 17350
},
{
"entropy": 1.769633387327194,
"epoch": 0.824332006822058,
"grad_norm": 1.1847100257873535,
"learning_rate": 7.314374418430554e-05,
"loss": 1.3239,
"mean_token_accuracy": 0.7223846167325974,
"num_tokens": 428272562.0,
"step": 17400
},
{
"entropy": 1.754239571094513,
"epoch": 0.8267007769566042,
"grad_norm": 1.0456291437149048,
"learning_rate": 7.296030864153286e-05,
"loss": 1.3136,
"mean_token_accuracy": 0.7257154327630997,
"num_tokens": 429502230.0,
"step": 17450
},
{
"entropy": 1.727893146276474,
"epoch": 0.8290695470911503,
"grad_norm": 1.0988260507583618,
"learning_rate": 7.277648064651281e-05,
"loss": 1.3325,
"mean_token_accuracy": 0.7202855634689331,
"num_tokens": 430738126.0,
"step": 17500
},
{
"entropy": 1.7441512525081635,
"epoch": 0.8314383172256964,
"grad_norm": 1.4698035717010498,
"learning_rate": 7.259226334135079e-05,
"loss": 1.303,
"mean_token_accuracy": 0.7249046045541764,
"num_tokens": 431957649.0,
"step": 17550
},
{
"entropy": 1.7701557087898254,
"epoch": 0.8338070873602426,
"grad_norm": 0.8762974143028259,
"learning_rate": 7.240765987480654e-05,
"loss": 1.3501,
"mean_token_accuracy": 0.7148396277427673,
"num_tokens": 433171928.0,
"step": 17600
},
{
"entropy": 1.771160396337509,
"epoch": 0.8361758574947887,
"grad_norm": 0.9736217260360718,
"learning_rate": 7.222267340224034e-05,
"loss": 1.324,
"mean_token_accuracy": 0.7225921380519867,
"num_tokens": 434354105.0,
"step": 17650
},
{
"entropy": 1.7395762205123901,
"epoch": 0.8385446276293349,
"grad_norm": 1.1522939205169678,
"learning_rate": 7.203730708555897e-05,
"loss": 1.3243,
"mean_token_accuracy": 0.7200587207078933,
"num_tokens": 435556247.0,
"step": 17700
},
{
"entropy": 1.7756438231468201,
"epoch": 0.840913397763881,
"grad_norm": 1.2958649396896362,
"learning_rate": 7.185156409316186e-05,
"loss": 1.374,
"mean_token_accuracy": 0.7119175827503205,
"num_tokens": 436745432.0,
"step": 17750
},
{
"entropy": 1.7542549967765808,
"epoch": 0.8432821678984271,
"grad_norm": 1.1340820789337158,
"learning_rate": 7.166544759988676e-05,
"loss": 1.3066,
"mean_token_accuracy": 0.7246117842197418,
"num_tokens": 437965467.0,
"step": 17800
},
{
"entropy": 1.7582876706123352,
"epoch": 0.8456509380329733,
"grad_norm": 1.3607231378555298,
"learning_rate": 7.147896078695551e-05,
"loss": 1.304,
"mean_token_accuracy": 0.724840202331543,
"num_tokens": 439190298.0,
"step": 17850
},
{
"entropy": 1.7604767334461213,
"epoch": 0.8480197081675194,
"grad_norm": 1.2566254138946533,
"learning_rate": 7.129210684191973e-05,
"loss": 1.3237,
"mean_token_accuracy": 0.7207924181222916,
"num_tokens": 440410573.0,
"step": 17900
},
{
"entropy": 1.7759632766246796,
"epoch": 0.8503884783020655,
"grad_norm": 1.3075144290924072,
"learning_rate": 7.110488895860633e-05,
"loss": 1.3476,
"mean_token_accuracy": 0.7191230463981628,
"num_tokens": 441635031.0,
"step": 17950
},
{
"entropy": 1.7343137776851654,
"epoch": 0.8527572484366117,
"grad_norm": 0.9776571393013,
"learning_rate": 7.091731033706281e-05,
"loss": 1.3101,
"mean_token_accuracy": 0.7254330676794052,
"num_tokens": 442870901.0,
"step": 18000
},
{
"epoch": 0.8527572484366117,
"eval_entropy": 1.1681382538694325,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7576727671230478,
"eval_num_tokens": 442870901.0,
"eval_runtime": 728.745,
"eval_samples_per_second": 34.052,
"eval_steps_per_second": 4.257,
"step": 18000
},
{
"entropy": 1.7468896472454072,
"epoch": 0.8551260185711579,
"grad_norm": 1.1819489002227783,
"learning_rate": 7.072937418350267e-05,
"loss": 1.3424,
"mean_token_accuracy": 0.7205572354793549,
"num_tokens": 444113226.0,
"step": 18050
},
{
"entropy": 1.752768530845642,
"epoch": 0.857494788705704,
"grad_norm": 1.1274313926696777,
"learning_rate": 7.05410837102506e-05,
"loss": 1.3276,
"mean_token_accuracy": 0.7209989041090011,
"num_tokens": 445334171.0,
"step": 18100
},
{
"entropy": 1.7303791618347169,
"epoch": 0.8598635588402501,
"grad_norm": 1.3661671876907349,
"learning_rate": 7.035244213568752e-05,
"loss": 1.2946,
"mean_token_accuracy": 0.7279473000764847,
"num_tokens": 446563863.0,
"step": 18150
},
{
"entropy": 1.7404825651645661,
"epoch": 0.8622323289747963,
"grad_norm": 1.2854536771774292,
"learning_rate": 7.016345268419559e-05,
"loss": 1.3414,
"mean_token_accuracy": 0.7202253836393356,
"num_tokens": 447805772.0,
"step": 18200
},
{
"entropy": 1.7380101013183593,
"epoch": 0.8646010991093425,
"grad_norm": 1.1535879373550415,
"learning_rate": 6.997411858610311e-05,
"loss": 1.3059,
"mean_token_accuracy": 0.7250276601314545,
"num_tokens": 449010921.0,
"step": 18250
},
{
"entropy": 1.7006947088241577,
"epoch": 0.8669698692438885,
"grad_norm": 1.076830506324768,
"learning_rate": 6.978444307762932e-05,
"loss": 1.2936,
"mean_token_accuracy": 0.7278600412607193,
"num_tokens": 450222021.0,
"step": 18300
},
{
"entropy": 1.7165203237533568,
"epoch": 0.8693386393784347,
"grad_norm": 1.1687458753585815,
"learning_rate": 6.959442940082907e-05,
"loss": 1.3093,
"mean_token_accuracy": 0.7266046351194382,
"num_tokens": 451437320.0,
"step": 18350
},
{
"entropy": 1.7525631844997407,
"epoch": 0.8717074095129809,
"grad_norm": 1.0274993181228638,
"learning_rate": 6.940408080353737e-05,
"loss": 1.3405,
"mean_token_accuracy": 0.7197101265192032,
"num_tokens": 452637999.0,
"step": 18400
},
{
"entropy": 1.6950951242446899,
"epoch": 0.874076179647527,
"grad_norm": 1.170281171798706,
"learning_rate": 6.921340053931389e-05,
"loss": 1.322,
"mean_token_accuracy": 0.7230970364809036,
"num_tokens": 453872583.0,
"step": 18450
},
{
"entropy": 1.7251943707466126,
"epoch": 0.8764449497820731,
"grad_norm": 1.0783965587615967,
"learning_rate": 6.902239186738742e-05,
"loss": 1.3077,
"mean_token_accuracy": 0.7254717952013016,
"num_tokens": 455115487.0,
"step": 18500
},
{
"entropy": 1.748286772966385,
"epoch": 0.8788137199166193,
"grad_norm": 0.9898918867111206,
"learning_rate": 6.883105805260006e-05,
"loss": 1.336,
"mean_token_accuracy": 0.7198050141334533,
"num_tokens": 456357289.0,
"step": 18550
},
{
"entropy": 1.7271603178977966,
"epoch": 0.8811824900511654,
"grad_norm": 1.0102683305740356,
"learning_rate": 6.863940236535146e-05,
"loss": 1.2972,
"mean_token_accuracy": 0.7267592811584472,
"num_tokens": 457601954.0,
"step": 18600
},
{
"entropy": 1.7312583494186402,
"epoch": 0.8835512601857116,
"grad_norm": 1.1015334129333496,
"learning_rate": 6.844742808154297e-05,
"loss": 1.3264,
"mean_token_accuracy": 0.7216020065546036,
"num_tokens": 458836210.0,
"step": 18650
},
{
"entropy": 1.7203202879428863,
"epoch": 0.8859200303202577,
"grad_norm": 1.2608678340911865,
"learning_rate": 6.82551384825215e-05,
"loss": 1.3112,
"mean_token_accuracy": 0.7242467325925827,
"num_tokens": 460059995.0,
"step": 18700
},
{
"entropy": 1.73845316529274,
"epoch": 0.8882888004548039,
"grad_norm": 1.2071726322174072,
"learning_rate": 6.806253685502361e-05,
"loss": 1.3422,
"mean_token_accuracy": 0.7193791323900223,
"num_tokens": 461321260.0,
"step": 18750
},
{
"entropy": 1.7286129772663117,
"epoch": 0.89065757058935,
"grad_norm": 0.9281275868415833,
"learning_rate": 6.786962649111926e-05,
"loss": 1.3346,
"mean_token_accuracy": 0.7215994411706924,
"num_tokens": 462547797.0,
"step": 18800
},
{
"entropy": 1.7290022671222687,
"epoch": 0.8930263407238962,
"grad_norm": 1.4838134050369263,
"learning_rate": 6.767641068815546e-05,
"loss": 1.2936,
"mean_token_accuracy": 0.7260348951816559,
"num_tokens": 463769872.0,
"step": 18850
},
{
"entropy": 1.7164359045028688,
"epoch": 0.8953951108584423,
"grad_norm": 0.9822611808776855,
"learning_rate": 6.748289274870001e-05,
"loss": 1.2841,
"mean_token_accuracy": 0.7294727778434753,
"num_tokens": 465012929.0,
"step": 18900
},
{
"entropy": 1.778987593650818,
"epoch": 0.8977638809929884,
"grad_norm": 1.056518793106079,
"learning_rate": 6.728907598048503e-05,
"loss": 1.3276,
"mean_token_accuracy": 0.7213660079240799,
"num_tokens": 466199667.0,
"step": 18950
},
{
"entropy": 1.7454373347759247,
"epoch": 0.9001326511275346,
"grad_norm": 1.1590458154678345,
"learning_rate": 6.709496369635043e-05,
"loss": 1.3057,
"mean_token_accuracy": 0.7262363374233246,
"num_tokens": 467441804.0,
"step": 19000
},
{
"epoch": 0.9001326511275346,
"eval_entropy": 1.1678229505634554,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7581798996773941,
"eval_num_tokens": 467441804.0,
"eval_runtime": 729.0934,
"eval_samples_per_second": 34.035,
"eval_steps_per_second": 4.255,
"step": 19000
},
{
"entropy": 1.7560745322704314,
"epoch": 0.9025014212620808,
"grad_norm": 1.1119104623794556,
"learning_rate": 6.69005592141872e-05,
"loss": 1.3103,
"mean_token_accuracy": 0.7248196619749069,
"num_tokens": 468672442.0,
"step": 19050
},
{
"entropy": 1.7368404233455659,
"epoch": 0.9048701913966268,
"grad_norm": 1.0841938257217407,
"learning_rate": 6.670586585688086e-05,
"loss": 1.3168,
"mean_token_accuracy": 0.723661498427391,
"num_tokens": 469911642.0,
"step": 19100
},
{
"entropy": 1.7659292578697205,
"epoch": 0.907238961531173,
"grad_norm": 1.3782135248184204,
"learning_rate": 6.651088695225447e-05,
"loss": 1.3044,
"mean_token_accuracy": 0.7251561576128006,
"num_tokens": 471110856.0,
"step": 19150
},
{
"entropy": 1.7336669373512268,
"epoch": 0.9096077316657192,
"grad_norm": 1.1556999683380127,
"learning_rate": 6.631562583301191e-05,
"loss": 1.297,
"mean_token_accuracy": 0.7274081045389176,
"num_tokens": 472320155.0,
"step": 19200
},
{
"entropy": 1.684252212047577,
"epoch": 0.9119765018002653,
"grad_norm": 0.9659352898597717,
"learning_rate": 6.612008583668082e-05,
"loss": 1.3105,
"mean_token_accuracy": 0.7258839225769043,
"num_tokens": 473560540.0,
"step": 19250
},
{
"entropy": 1.736447709798813,
"epoch": 0.9143452719348114,
"grad_norm": 1.231652855873108,
"learning_rate": 6.592427030555565e-05,
"loss": 1.3364,
"mean_token_accuracy": 0.7204890990257263,
"num_tokens": 474795749.0,
"step": 19300
},
{
"entropy": 1.6979431188106537,
"epoch": 0.9167140420693576,
"grad_norm": 1.0034390687942505,
"learning_rate": 6.572818258664035e-05,
"loss": 1.321,
"mean_token_accuracy": 0.7222663134336471,
"num_tokens": 476048351.0,
"step": 19350
},
{
"entropy": 1.7359539401531219,
"epoch": 0.9190828122039038,
"grad_norm": 1.0725759267807007,
"learning_rate": 6.55318260315914e-05,
"loss": 1.3228,
"mean_token_accuracy": 0.7220259785652161,
"num_tokens": 477260246.0,
"step": 19400
},
{
"entropy": 1.6657227408885955,
"epoch": 0.9214515823384499,
"grad_norm": 0.9826326370239258,
"learning_rate": 6.533520399666033e-05,
"loss": 1.2904,
"mean_token_accuracy": 0.7296865725517273,
"num_tokens": 478504094.0,
"step": 19450
},
{
"entropy": 1.7167856967449189,
"epoch": 0.923820352472996,
"grad_norm": 0.9942904710769653,
"learning_rate": 6.513831984263641e-05,
"loss": 1.2708,
"mean_token_accuracy": 0.7317487215995788,
"num_tokens": 479728318.0,
"step": 19500
},
{
"entropy": 1.7254127764701843,
"epoch": 0.9261891226075422,
"grad_norm": 1.4505666494369507,
"learning_rate": 6.494117693478926e-05,
"loss": 1.2893,
"mean_token_accuracy": 0.7286518901586533,
"num_tokens": 480937077.0,
"step": 19550
},
{
"entropy": 1.7521008849143982,
"epoch": 0.9285578927420883,
"grad_norm": 1.066002607345581,
"learning_rate": 6.474377864281127e-05,
"loss": 1.3244,
"mean_token_accuracy": 0.7240516602993011,
"num_tokens": 482172564.0,
"step": 19600
},
{
"entropy": 1.7225322866439818,
"epoch": 0.9309266628766345,
"grad_norm": 1.1396028995513916,
"learning_rate": 6.454612834076e-05,
"loss": 1.3052,
"mean_token_accuracy": 0.7258065021038056,
"num_tokens": 483406518.0,
"step": 19650
},
{
"entropy": 1.7187459325790406,
"epoch": 0.9332954330111806,
"grad_norm": 0.8960033655166626,
"learning_rate": 6.434822940700057e-05,
"loss": 1.297,
"mean_token_accuracy": 0.7268172729015351,
"num_tokens": 484643697.0,
"step": 19700
},
{
"entropy": 1.7082497942447663,
"epoch": 0.9356642031457267,
"grad_norm": 1.1821448802947998,
"learning_rate": 6.415008522414782e-05,
"loss": 1.292,
"mean_token_accuracy": 0.7285556894540787,
"num_tokens": 485855707.0,
"step": 19750
},
{
"entropy": 1.7341230428218841,
"epoch": 0.9380329732802729,
"grad_norm": 1.0824941396713257,
"learning_rate": 6.395169917900858e-05,
"loss": 1.3135,
"mean_token_accuracy": 0.723016293644905,
"num_tokens": 487075872.0,
"step": 19800
},
{
"entropy": 1.712151471376419,
"epoch": 0.9404017434148191,
"grad_norm": 1.4998127222061157,
"learning_rate": 6.375307466252372e-05,
"loss": 1.3492,
"mean_token_accuracy": 0.7187636381387711,
"num_tokens": 488272477.0,
"step": 19850
},
{
"entropy": 1.683067034482956,
"epoch": 0.9427705135493651,
"grad_norm": 0.9722391963005066,
"learning_rate": 6.355421506971025e-05,
"loss": 1.2899,
"mean_token_accuracy": 0.728040627837181,
"num_tokens": 489486559.0,
"step": 19900
},
{
"entropy": 1.6999699199199676,
"epoch": 0.9451392836839113,
"grad_norm": 1.150619626045227,
"learning_rate": 6.335512379960322e-05,
"loss": 1.2776,
"mean_token_accuracy": 0.7324709689617157,
"num_tokens": 490706066.0,
"step": 19950
},
{
"entropy": 1.7298948228359223,
"epoch": 0.9475080538184575,
"grad_norm": 1.2315185070037842,
"learning_rate": 6.315580425519766e-05,
"loss": 1.3312,
"mean_token_accuracy": 0.7208713871240616,
"num_tokens": 491918291.0,
"step": 20000
},
{
"epoch": 0.9475080538184575,
"eval_entropy": 1.15051956327787,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7579382187335588,
"eval_num_tokens": 491918291.0,
"eval_runtime": 726.7621,
"eval_samples_per_second": 34.145,
"eval_steps_per_second": 4.268,
"step": 20000
},
{
"entropy": 1.7038512194156648,
"epoch": 0.9498768239530035,
"grad_norm": 1.163555383682251,
"learning_rate": 6.295625984339043e-05,
"loss": 1.3204,
"mean_token_accuracy": 0.7242659759521485,
"num_tokens": 493116821.0,
"step": 20050
},
{
"entropy": 1.741351603269577,
"epoch": 0.9522455940875497,
"grad_norm": 1.1677212715148926,
"learning_rate": 6.275649397492195e-05,
"loss": 1.3061,
"mean_token_accuracy": 0.7256824851036072,
"num_tokens": 494289241.0,
"step": 20100
},
{
"entropy": 1.718240325450897,
"epoch": 0.9546143642220959,
"grad_norm": 0.9395654797554016,
"learning_rate": 6.255651006431793e-05,
"loss": 1.2979,
"mean_token_accuracy": 0.7259613001346588,
"num_tokens": 495524606.0,
"step": 20150
},
{
"entropy": 1.6947869229316712,
"epoch": 0.9569831343566421,
"grad_norm": 0.9097754955291748,
"learning_rate": 6.235631152983098e-05,
"loss": 1.3067,
"mean_token_accuracy": 0.7251908606290818,
"num_tokens": 496753832.0,
"step": 20200
},
{
"entropy": 1.6904695987701417,
"epoch": 0.9593519044911881,
"grad_norm": 1.0224709510803223,
"learning_rate": 6.215590179338221e-05,
"loss": 1.2916,
"mean_token_accuracy": 0.7274919444322586,
"num_tokens": 497977841.0,
"step": 20250
},
{
"entropy": 1.7162616491317748,
"epoch": 0.9617206746257343,
"grad_norm": 1.1449992656707764,
"learning_rate": 6.195528428050273e-05,
"loss": 1.3412,
"mean_token_accuracy": 0.7184383940696716,
"num_tokens": 499188236.0,
"step": 20300
},
{
"entropy": 1.6742710149288178,
"epoch": 0.9640894447602805,
"grad_norm": 1.100428581237793,
"learning_rate": 6.17544624202751e-05,
"loss": 1.2547,
"mean_token_accuracy": 0.7356196337938309,
"num_tokens": 500446815.0,
"step": 20350
},
{
"entropy": 1.6939631617069244,
"epoch": 0.9664582148948266,
"grad_norm": 0.9926633238792419,
"learning_rate": 6.15534396452747e-05,
"loss": 1.3121,
"mean_token_accuracy": 0.7248954975605011,
"num_tokens": 501679112.0,
"step": 20400
},
{
"entropy": 1.739109193086624,
"epoch": 0.9688269850293727,
"grad_norm": 1.270719051361084,
"learning_rate": 6.135221939151108e-05,
"loss": 1.3404,
"mean_token_accuracy": 0.7209612077474594,
"num_tokens": 502912575.0,
"step": 20450
},
{
"entropy": 1.7274162566661835,
"epoch": 0.9711957551639189,
"grad_norm": 1.2614290714263916,
"learning_rate": 6.115080509836923e-05,
"loss": 1.334,
"mean_token_accuracy": 0.7216370838880539,
"num_tokens": 504141410.0,
"step": 20500
},
{
"entropy": 1.708759593963623,
"epoch": 0.973564525298465,
"grad_norm": 1.2522040605545044,
"learning_rate": 6.09492002085508e-05,
"loss": 1.3175,
"mean_token_accuracy": 0.7249479728937149,
"num_tokens": 505345687.0,
"step": 20550
},
{
"entropy": 1.6911339461803436,
"epoch": 0.9759332954330112,
"grad_norm": 1.0709445476531982,
"learning_rate": 6.074740816801516e-05,
"loss": 1.2945,
"mean_token_accuracy": 0.7283177155256272,
"num_tokens": 506583420.0,
"step": 20600
},
{
"entropy": 1.7511263823509216,
"epoch": 0.9783020655675573,
"grad_norm": 1.1028821468353271,
"learning_rate": 6.054543242592071e-05,
"loss": 1.3661,
"mean_token_accuracy": 0.7142648506164551,
"num_tokens": 507769373.0,
"step": 20650
},
{
"entropy": 1.7048313403129578,
"epoch": 0.9806708357021034,
"grad_norm": 1.2044216394424438,
"learning_rate": 6.034327643456569e-05,
"loss": 1.2878,
"mean_token_accuracy": 0.7300767368078231,
"num_tokens": 508986124.0,
"step": 20700
},
{
"entropy": 1.732351886034012,
"epoch": 0.9830396058366496,
"grad_norm": 1.118547797203064,
"learning_rate": 6.014094364932931e-05,
"loss": 1.3298,
"mean_token_accuracy": 0.7219525814056397,
"num_tokens": 510216131.0,
"step": 20750
},
{
"entropy": 1.7391897797584535,
"epoch": 0.9854083759711958,
"grad_norm": 1.134662389755249,
"learning_rate": 5.993843752861266e-05,
"loss": 1.349,
"mean_token_accuracy": 0.7185146582126617,
"num_tokens": 511452480.0,
"step": 20800
},
{
"entropy": 1.7354848337173463,
"epoch": 0.9877771461057419,
"grad_norm": 1.2413026094436646,
"learning_rate": 5.9735761533779575e-05,
"loss": 1.3117,
"mean_token_accuracy": 0.7226764589548111,
"num_tokens": 512677249.0,
"step": 20850
},
{
"entropy": 1.7198446631431579,
"epoch": 0.990145916240288,
"grad_norm": 1.094545602798462,
"learning_rate": 5.953291912909751e-05,
"loss": 1.271,
"mean_token_accuracy": 0.7310113716125488,
"num_tokens": 513916521.0,
"step": 20900
},
{
"entropy": 1.7085091185569763,
"epoch": 0.9925146863748342,
"grad_norm": 1.1231886148452759,
"learning_rate": 5.932991378167827e-05,
"loss": 1.2842,
"mean_token_accuracy": 0.7295077663660049,
"num_tokens": 515136725.0,
"step": 20950
},
{
"entropy": 1.6870656645298003,
"epoch": 0.9948834565093804,
"grad_norm": 1.1282687187194824,
"learning_rate": 5.912674896141883e-05,
"loss": 1.3022,
"mean_token_accuracy": 0.7291330778598786,
"num_tokens": 516364497.0,
"step": 21000
},
{
"epoch": 0.9948834565093804,
"eval_entropy": 1.1225133023863065,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7580950103464779,
"eval_num_tokens": 516364497.0,
"eval_runtime": 728.4546,
"eval_samples_per_second": 34.065,
"eval_steps_per_second": 4.258,
"step": 21000
},
{
"entropy": 1.7055912351608276,
"epoch": 0.9972522266439264,
"grad_norm": 1.0038437843322754,
"learning_rate": 5.892342814094193e-05,
"loss": 1.3364,
"mean_token_accuracy": 0.7213913726806641,
"num_tokens": 517597413.0,
"step": 21050
},
{
"entropy": 1.6995044994354247,
"epoch": 0.9996209967784726,
"grad_norm": 1.0054970979690552,
"learning_rate": 5.871995479553676e-05,
"loss": 1.3426,
"mean_token_accuracy": 0.7205459761619568,
"num_tokens": 518808510.0,
"step": 21100
},
{
"entropy": 1.6123450350761415,
"epoch": 1.0019897669130187,
"grad_norm": 1.1733577251434326,
"learning_rate": 5.851633240309963e-05,
"loss": 1.2043,
"mean_token_accuracy": 0.7436364030838013,
"num_tokens": 520050715.0,
"step": 21150
},
{
"entropy": 1.6008358299732208,
"epoch": 1.004358537047565,
"grad_norm": 1.2594228982925415,
"learning_rate": 5.8312564444074366e-05,
"loss": 1.1962,
"mean_token_accuracy": 0.7485791981220246,
"num_tokens": 521272519.0,
"step": 21200
},
{
"entropy": 1.5864481520652771,
"epoch": 1.006727307182111,
"grad_norm": 1.1308941841125488,
"learning_rate": 5.810865440139299e-05,
"loss": 1.2014,
"mean_token_accuracy": 0.7478245437145233,
"num_tokens": 522505953.0,
"step": 21250
},
{
"entropy": 1.6050166165828705,
"epoch": 1.0090960773166573,
"grad_norm": 1.0653034448623657,
"learning_rate": 5.790460576041608e-05,
"loss": 1.2219,
"mean_token_accuracy": 0.7426986521482468,
"num_tokens": 523767803.0,
"step": 21300
},
{
"entropy": 1.598244547843933,
"epoch": 1.0114648474512034,
"grad_norm": 1.1732730865478516,
"learning_rate": 5.77004220088732e-05,
"loss": 1.2003,
"mean_token_accuracy": 0.7464343649148941,
"num_tokens": 525030135.0,
"step": 21350
},
{
"entropy": 1.5887214350700378,
"epoch": 1.0138336175857494,
"grad_norm": 1.0639592409133911,
"learning_rate": 5.749610663680334e-05,
"loss": 1.1959,
"mean_token_accuracy": 0.7482451206445694,
"num_tokens": 526245960.0,
"step": 21400
},
{
"entropy": 1.5872322118282318,
"epoch": 1.0162023877202957,
"grad_norm": 1.4078749418258667,
"learning_rate": 5.729166313649523e-05,
"loss": 1.1928,
"mean_token_accuracy": 0.747797891497612,
"num_tokens": 527461999.0,
"step": 21450
},
{
"entropy": 1.5684971618652344,
"epoch": 1.0185711578548418,
"grad_norm": 1.0040647983551025,
"learning_rate": 5.7087095002427614e-05,
"loss": 1.1636,
"mean_token_accuracy": 0.7527302461862564,
"num_tokens": 528693347.0,
"step": 21500
},
{
"entropy": 1.5993961930274962,
"epoch": 1.0209399279893878,
"grad_norm": 1.519827127456665,
"learning_rate": 5.688240573120962e-05,
"loss": 1.1996,
"mean_token_accuracy": 0.7477444261312485,
"num_tokens": 529916988.0,
"step": 21550
},
{
"entropy": 1.6100480878353118,
"epoch": 1.023308698123934,
"grad_norm": 1.298337459564209,
"learning_rate": 5.6677598821520886e-05,
"loss": 1.1941,
"mean_token_accuracy": 0.746188434958458,
"num_tokens": 531136613.0,
"step": 21600
},
{
"entropy": 1.608763552904129,
"epoch": 1.0256774682584802,
"grad_norm": 1.2754813432693481,
"learning_rate": 5.647267777405177e-05,
"loss": 1.1801,
"mean_token_accuracy": 0.7486988466978073,
"num_tokens": 532395495.0,
"step": 21650
},
{
"entropy": 1.5974150121212005,
"epoch": 1.0280462383930264,
"grad_norm": 1.306957721710205,
"learning_rate": 5.626764609144364e-05,
"loss": 1.229,
"mean_token_accuracy": 0.7420145213603974,
"num_tokens": 533626086.0,
"step": 21700
},
{
"entropy": 1.5830400812625884,
"epoch": 1.0304150085275725,
"grad_norm": 1.2734113931655884,
"learning_rate": 5.606250727822883e-05,
"loss": 1.2002,
"mean_token_accuracy": 0.7472029691934585,
"num_tokens": 534872278.0,
"step": 21750
},
{
"entropy": 1.5808272886276244,
"epoch": 1.0327837786621186,
"grad_norm": 0.9578977227210999,
"learning_rate": 5.585726484077085e-05,
"loss": 1.2118,
"mean_token_accuracy": 0.745669018626213,
"num_tokens": 536104060.0,
"step": 21800
},
{
"entropy": 1.546217747926712,
"epoch": 1.0351525487966649,
"grad_norm": 1.1195182800292969,
"learning_rate": 5.565192228720439e-05,
"loss": 1.1738,
"mean_token_accuracy": 0.7508551919460297,
"num_tokens": 537338574.0,
"step": 21850
},
{
"entropy": 1.5846721458435058,
"epoch": 1.037521318931211,
"grad_norm": 1.2577298879623413,
"learning_rate": 5.544648312737547e-05,
"loss": 1.1778,
"mean_token_accuracy": 0.7510464614629746,
"num_tokens": 538557980.0,
"step": 21900
},
{
"entropy": 1.5822556126117706,
"epoch": 1.039890089065757,
"grad_norm": 1.1481040716171265,
"learning_rate": 5.524095087278126e-05,
"loss": 1.1848,
"mean_token_accuracy": 0.7497791868448257,
"num_tokens": 539784677.0,
"step": 21950
},
{
"entropy": 1.627133835554123,
"epoch": 1.0422588592003033,
"grad_norm": 1.1411134004592896,
"learning_rate": 5.503532903651023e-05,
"loss": 1.2608,
"mean_token_accuracy": 0.7371292334794998,
"num_tokens": 541002283.0,
"step": 22000
},
{
"epoch": 1.0422588592003033,
"eval_entropy": 1.0797893660903208,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7580676496720636,
"eval_num_tokens": 541002283.0,
"eval_runtime": 728.5593,
"eval_samples_per_second": 34.06,
"eval_steps_per_second": 4.258,
"step": 22000
},
{
"entropy": 1.5906290924549102,
"epoch": 1.0446276293348493,
"grad_norm": 1.164602518081665,
"learning_rate": 5.482962113318203e-05,
"loss": 1.2085,
"mean_token_accuracy": 0.744525915980339,
"num_tokens": 542190090.0,
"step": 22050
},
{
"entropy": 1.5971219801902772,
"epoch": 1.0469963994693954,
"grad_norm": 1.2099922895431519,
"learning_rate": 5.462383067888741e-05,
"loss": 1.2251,
"mean_token_accuracy": 0.7439098012447357,
"num_tokens": 543425520.0,
"step": 22100
},
{
"entropy": 1.5796366775035857,
"epoch": 1.0493651696039417,
"grad_norm": 1.0977132320404053,
"learning_rate": 5.441796119112814e-05,
"loss": 1.1964,
"mean_token_accuracy": 0.7492218172550201,
"num_tokens": 544655034.0,
"step": 22150
},
{
"entropy": 1.6276609122753143,
"epoch": 1.0517339397384877,
"grad_norm": 1.2101308107376099,
"learning_rate": 5.421201618875689e-05,
"loss": 1.2242,
"mean_token_accuracy": 0.7425367647409439,
"num_tokens": 545867278.0,
"step": 22200
},
{
"entropy": 1.6007154369354248,
"epoch": 1.054102709873034,
"grad_norm": 1.2157655954360962,
"learning_rate": 5.4005999191917034e-05,
"loss": 1.2258,
"mean_token_accuracy": 0.7422733837366104,
"num_tokens": 547117555.0,
"step": 22250
},
{
"entropy": 1.6148575782775878,
"epoch": 1.05647148000758,
"grad_norm": 1.2759310007095337,
"learning_rate": 5.379991372198259e-05,
"loss": 1.187,
"mean_token_accuracy": 0.7483934825658798,
"num_tokens": 548337677.0,
"step": 22300
},
{
"entropy": 1.6164679837226867,
"epoch": 1.0588402501421261,
"grad_norm": 1.041208267211914,
"learning_rate": 5.359376330149789e-05,
"loss": 1.2082,
"mean_token_accuracy": 0.7465775471925735,
"num_tokens": 549541289.0,
"step": 22350
},
{
"entropy": 1.5865103662014008,
"epoch": 1.0612090202766724,
"grad_norm": 1.0899627208709717,
"learning_rate": 5.338755145411749e-05,
"loss": 1.1928,
"mean_token_accuracy": 0.747542524933815,
"num_tokens": 550805086.0,
"step": 22400
},
{
"entropy": 1.6212577140331268,
"epoch": 1.0635777904112185,
"grad_norm": 1.08705472946167,
"learning_rate": 5.318128170454589e-05,
"loss": 1.1852,
"mean_token_accuracy": 0.7487930029630661,
"num_tokens": 552036289.0,
"step": 22450
},
{
"entropy": 1.620989305973053,
"epoch": 1.0659465605457648,
"grad_norm": 1.2424880266189575,
"learning_rate": 5.297495757847727e-05,
"loss": 1.1865,
"mean_token_accuracy": 0.750239091515541,
"num_tokens": 553267770.0,
"step": 22500
},
{
"entropy": 1.5796532726287842,
"epoch": 1.0683153306803108,
"grad_norm": 1.4131051301956177,
"learning_rate": 5.2768582602535246e-05,
"loss": 1.177,
"mean_token_accuracy": 0.7511077529191971,
"num_tokens": 554500484.0,
"step": 22550
},
{
"entropy": 1.6192417418956757,
"epoch": 1.0706841008148569,
"grad_norm": 1.2125214338302612,
"learning_rate": 5.25621603042126e-05,
"loss": 1.2202,
"mean_token_accuracy": 0.7429804271459579,
"num_tokens": 555724501.0,
"step": 22600
},
{
"entropy": 1.6070238423347474,
"epoch": 1.0730528709494032,
"grad_norm": 1.1681252717971802,
"learning_rate": 5.235569421181103e-05,
"loss": 1.1896,
"mean_token_accuracy": 0.748240845799446,
"num_tokens": 556951175.0,
"step": 22650
},
{
"entropy": 1.6153511393070221,
"epoch": 1.0754216410839492,
"grad_norm": 1.2313953638076782,
"learning_rate": 5.21491878543807e-05,
"loss": 1.1826,
"mean_token_accuracy": 0.7484846365451813,
"num_tokens": 558174509.0,
"step": 22700
},
{
"entropy": 1.6176446998119354,
"epoch": 1.0777904112184953,
"grad_norm": 1.202515959739685,
"learning_rate": 5.194264476166006e-05,
"loss": 1.2147,
"mean_token_accuracy": 0.7431507217884064,
"num_tokens": 559404128.0,
"step": 22750
},
{
"entropy": 1.6383704769611358,
"epoch": 1.0801591813530416,
"grad_norm": 1.1402473449707031,
"learning_rate": 5.1736068464015463e-05,
"loss": 1.2216,
"mean_token_accuracy": 0.7412754154205322,
"num_tokens": 560601861.0,
"step": 22800
},
{
"entropy": 1.6056468284130097,
"epoch": 1.0825279514875876,
"grad_norm": 0.9191189408302307,
"learning_rate": 5.152946249238082e-05,
"loss": 1.1687,
"mean_token_accuracy": 0.751121336221695,
"num_tokens": 561852684.0,
"step": 22850
},
{
"entropy": 1.5993254363536835,
"epoch": 1.0848967216221337,
"grad_norm": 1.460180640220642,
"learning_rate": 5.132283037819723e-05,
"loss": 1.2194,
"mean_token_accuracy": 0.7445776867866516,
"num_tokens": 563087919.0,
"step": 22900
},
{
"entropy": 1.6140161871910095,
"epoch": 1.08726549175668,
"grad_norm": 1.2920235395431519,
"learning_rate": 5.111617565335264e-05,
"loss": 1.2139,
"mean_token_accuracy": 0.7439986896514893,
"num_tokens": 564333108.0,
"step": 22950
},
{
"entropy": 1.599245457649231,
"epoch": 1.089634261891226,
"grad_norm": 1.0727440118789673,
"learning_rate": 5.090950185012152e-05,
"loss": 1.1895,
"mean_token_accuracy": 0.7461957842111587,
"num_tokens": 565584977.0,
"step": 23000
},
{
"epoch": 1.089634261891226,
"eval_entropy": 1.0720901108034038,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7579072969698429,
"eval_num_tokens": 565584977.0,
"eval_runtime": 729.1236,
"eval_samples_per_second": 34.034,
"eval_steps_per_second": 4.254,
"step": 23000
},
{
"entropy": 1.5884887778759003,
"epoch": 1.0920030320257723,
"grad_norm": 1.2358678579330444,
"learning_rate": 5.070281250110437e-05,
"loss": 1.2144,
"mean_token_accuracy": 0.7440959370136261,
"num_tokens": 566830458.0,
"step": 23050
},
{
"entropy": 1.6139474022388458,
"epoch": 1.0943718021603184,
"grad_norm": 1.1043397188186646,
"learning_rate": 5.049611113916745e-05,
"loss": 1.2277,
"mean_token_accuracy": 0.7419761300086976,
"num_tokens": 568033795.0,
"step": 23100
},
{
"entropy": 1.5839227229356765,
"epoch": 1.0967405722948644,
"grad_norm": 1.267622470855713,
"learning_rate": 5.028940129738234e-05,
"loss": 1.1841,
"mean_token_accuracy": 0.7496823114156723,
"num_tokens": 569296146.0,
"step": 23150
},
{
"entropy": 1.5836478543281556,
"epoch": 1.0991093424294107,
"grad_norm": 1.3109345436096191,
"learning_rate": 5.0082686508965594e-05,
"loss": 1.2001,
"mean_token_accuracy": 0.7444102185964584,
"num_tokens": 570538060.0,
"step": 23200
},
{
"entropy": 1.6264153301715851,
"epoch": 1.1014781125639568,
"grad_norm": 1.166844129562378,
"learning_rate": 4.987597030721826e-05,
"loss": 1.2166,
"mean_token_accuracy": 0.7437608361244201,
"num_tokens": 571764299.0,
"step": 23250
},
{
"entropy": 1.593840502500534,
"epoch": 1.1038468826985028,
"grad_norm": 1.313543677330017,
"learning_rate": 4.966925622546559e-05,
"loss": 1.1976,
"mean_token_accuracy": 0.7477673798799515,
"num_tokens": 572986252.0,
"step": 23300
},
{
"entropy": 1.6218383753299712,
"epoch": 1.1062156528330491,
"grad_norm": 1.3079559803009033,
"learning_rate": 4.9462547796996554e-05,
"loss": 1.2085,
"mean_token_accuracy": 0.7460601913928986,
"num_tokens": 574203486.0,
"step": 23350
},
{
"entropy": 1.6025708365440368,
"epoch": 1.1085844229675952,
"grad_norm": 1.045305848121643,
"learning_rate": 4.925584855500357e-05,
"loss": 1.1834,
"mean_token_accuracy": 0.7496994876861572,
"num_tokens": 575431997.0,
"step": 23400
},
{
"entropy": 1.6023164546489717,
"epoch": 1.1109531931021415,
"grad_norm": 1.6260581016540527,
"learning_rate": 4.904916203252196e-05,
"loss": 1.1972,
"mean_token_accuracy": 0.7476231580972672,
"num_tokens": 576655765.0,
"step": 23450
},
{
"entropy": 1.6074532234668732,
"epoch": 1.1133219632366875,
"grad_norm": 1.1137516498565674,
"learning_rate": 4.884249176236966e-05,
"loss": 1.2031,
"mean_token_accuracy": 0.7456535613536834,
"num_tokens": 577896889.0,
"step": 23500
},
{
"entropy": 1.6064541089534758,
"epoch": 1.1156907333712336,
"grad_norm": 1.0753376483917236,
"learning_rate": 4.8635841277086823e-05,
"loss": 1.2093,
"mean_token_accuracy": 0.7460182595252991,
"num_tokens": 579123368.0,
"step": 23550
},
{
"entropy": 1.6451520609855652,
"epoch": 1.1180595035057799,
"grad_norm": 1.2830525636672974,
"learning_rate": 4.842921410887541e-05,
"loss": 1.2173,
"mean_token_accuracy": 0.7460962778329849,
"num_tokens": 580343576.0,
"step": 23600
},
{
"entropy": 1.594506859779358,
"epoch": 1.120428273640326,
"grad_norm": 1.2104618549346924,
"learning_rate": 4.822261378953884e-05,
"loss": 1.1846,
"mean_token_accuracy": 0.7500998550653457,
"num_tokens": 581571230.0,
"step": 23650
},
{
"entropy": 1.5977623069286346,
"epoch": 1.122797043774872,
"grad_norm": 1.0635625123977661,
"learning_rate": 4.8016043850421614e-05,
"loss": 1.2121,
"mean_token_accuracy": 0.7432440650463105,
"num_tokens": 582786589.0,
"step": 23700
},
{
"entropy": 1.6117420196533203,
"epoch": 1.1251658139094183,
"grad_norm": 1.2150670289993286,
"learning_rate": 4.7809507822348967e-05,
"loss": 1.1995,
"mean_token_accuracy": 0.746940575838089,
"num_tokens": 583979707.0,
"step": 23750
},
{
"entropy": 1.6140946924686432,
"epoch": 1.1275345840439643,
"grad_norm": 1.1496037244796753,
"learning_rate": 4.7603009235566465e-05,
"loss": 1.1965,
"mean_token_accuracy": 0.7485955774784088,
"num_tokens": 585198089.0,
"step": 23800
},
{
"entropy": 1.612507269382477,
"epoch": 1.1299033541785106,
"grad_norm": 1.1946005821228027,
"learning_rate": 4.7396551619679735e-05,
"loss": 1.1963,
"mean_token_accuracy": 0.7465278053283692,
"num_tokens": 586406915.0,
"step": 23850
},
{
"entropy": 1.614688711166382,
"epoch": 1.1322721243130567,
"grad_norm": 1.3998786211013794,
"learning_rate": 4.719013850359412e-05,
"loss": 1.202,
"mean_token_accuracy": 0.7469007116556168,
"num_tokens": 587625422.0,
"step": 23900
},
{
"entropy": 1.64091064453125,
"epoch": 1.1346408944476027,
"grad_norm": 1.1203569173812866,
"learning_rate": 4.69837734154543e-05,
"loss": 1.1882,
"mean_token_accuracy": 0.7487949818372727,
"num_tokens": 588838858.0,
"step": 23950
},
{
"entropy": 1.6168962919712067,
"epoch": 1.137009664582149,
"grad_norm": 1.4909604787826538,
"learning_rate": 4.677745988258406e-05,
"loss": 1.1948,
"mean_token_accuracy": 0.7495903551578522,
"num_tokens": 590081510.0,
"step": 24000
},
{
"epoch": 1.137009664582149,
"eval_entropy": 1.0897794357236166,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7586307351547699,
"eval_num_tokens": 590081510.0,
"eval_runtime": 728.9602,
"eval_samples_per_second": 34.042,
"eval_steps_per_second": 4.255,
"step": 24000
},
{
"entropy": 1.6240591382980347,
"epoch": 1.139378434716695,
"grad_norm": 0.9998461604118347,
"learning_rate": 4.657120143142597e-05,
"loss": 1.1922,
"mean_token_accuracy": 0.7492350846529007,
"num_tokens": 591297702.0,
"step": 24050
},
{
"entropy": 1.655022679567337,
"epoch": 1.1417472048512411,
"grad_norm": 1.124312162399292,
"learning_rate": 4.636500158748109e-05,
"loss": 1.2067,
"mean_token_accuracy": 0.7455829763412476,
"num_tokens": 592508714.0,
"step": 24100
},
{
"entropy": 1.5910465133190155,
"epoch": 1.1441159749857874,
"grad_norm": 1.11435866355896,
"learning_rate": 4.6158863875248734e-05,
"loss": 1.1684,
"mean_token_accuracy": 0.7527882248163223,
"num_tokens": 593747081.0,
"step": 24150
},
{
"entropy": 1.6349923205375672,
"epoch": 1.1464847451203335,
"grad_norm": 1.070635437965393,
"learning_rate": 4.595279181816624e-05,
"loss": 1.1916,
"mean_token_accuracy": 0.749586900472641,
"num_tokens": 594943747.0,
"step": 24200
},
{
"entropy": 1.601109493970871,
"epoch": 1.1488535152548796,
"grad_norm": 1.1581242084503174,
"learning_rate": 4.574678893854871e-05,
"loss": 1.1818,
"mean_token_accuracy": 0.7509790074825287,
"num_tokens": 596175131.0,
"step": 24250
},
{
"entropy": 1.6024657559394837,
"epoch": 1.1512222853894258,
"grad_norm": 1.2476531267166138,
"learning_rate": 4.554085875752879e-05,
"loss": 1.1997,
"mean_token_accuracy": 0.747204402089119,
"num_tokens": 597415232.0,
"step": 24300
},
{
"entropy": 1.636058064699173,
"epoch": 1.153591055523972,
"grad_norm": 1.6576809883117676,
"learning_rate": 4.533500479499661e-05,
"loss": 1.248,
"mean_token_accuracy": 0.7378575146198273,
"num_tokens": 598627396.0,
"step": 24350
},
{
"entropy": 1.6401480340957642,
"epoch": 1.1559598256585182,
"grad_norm": 1.2274305820465088,
"learning_rate": 4.512923056953941e-05,
"loss": 1.2219,
"mean_token_accuracy": 0.7444866347312927,
"num_tokens": 599864565.0,
"step": 24400
},
{
"entropy": 1.5943049252033235,
"epoch": 1.1583285957930642,
"grad_norm": 1.2362310886383057,
"learning_rate": 4.49235395983816e-05,
"loss": 1.1675,
"mean_token_accuracy": 0.7534567403793335,
"num_tokens": 601090785.0,
"step": 24450
},
{
"entropy": 1.5798736822605133,
"epoch": 1.1606973659276103,
"grad_norm": 0.9551867842674255,
"learning_rate": 4.4717935397324504e-05,
"loss": 1.1633,
"mean_token_accuracy": 0.7534276330471039,
"num_tokens": 602347409.0,
"step": 24500
},
{
"entropy": 1.612923823595047,
"epoch": 1.1630661360621566,
"grad_norm": 1.167639970779419,
"learning_rate": 4.4512421480686334e-05,
"loss": 1.1752,
"mean_token_accuracy": 0.7525352644920349,
"num_tokens": 603557548.0,
"step": 24550
},
{
"entropy": 1.5877990233898163,
"epoch": 1.1654349061967026,
"grad_norm": 1.1369372606277466,
"learning_rate": 4.430700136124209e-05,
"loss": 1.1781,
"mean_token_accuracy": 0.7510064965486527,
"num_tokens": 604816361.0,
"step": 24600
},
{
"entropy": 1.5707013046741485,
"epoch": 1.167803676331249,
"grad_norm": 1.3217617273330688,
"learning_rate": 4.410167855016356e-05,
"loss": 1.1578,
"mean_token_accuracy": 0.7544564688205719,
"num_tokens": 606031595.0,
"step": 24650
},
{
"entropy": 1.6142506301403046,
"epoch": 1.170172446465795,
"grad_norm": 1.0833561420440674,
"learning_rate": 4.3896456556959245e-05,
"loss": 1.1882,
"mean_token_accuracy": 0.7481600660085678,
"num_tokens": 607260243.0,
"step": 24700
},
{
"entropy": 1.5962833178043365,
"epoch": 1.172541216600341,
"grad_norm": 1.2788193225860596,
"learning_rate": 4.369133888941442e-05,
"loss": 1.1685,
"mean_token_accuracy": 0.7528700757026673,
"num_tokens": 608463931.0,
"step": 24750
},
{
"entropy": 1.6402158641815185,
"epoch": 1.1749099867348873,
"grad_norm": 1.275524616241455,
"learning_rate": 4.348632905353116e-05,
"loss": 1.1968,
"mean_token_accuracy": 0.7491856187582016,
"num_tokens": 609655302.0,
"step": 24800
},
{
"entropy": 1.614987963438034,
"epoch": 1.1772787568694334,
"grad_norm": 0.945978581905365,
"learning_rate": 4.32814305534684e-05,
"loss": 1.209,
"mean_token_accuracy": 0.7443074882030487,
"num_tokens": 610898851.0,
"step": 24850
},
{
"entropy": 1.636513990163803,
"epoch": 1.1796475270039795,
"grad_norm": 1.0590687990188599,
"learning_rate": 4.307664689148205e-05,
"loss": 1.2299,
"mean_token_accuracy": 0.7430419319868088,
"num_tokens": 612116242.0,
"step": 24900
},
{
"entropy": 1.6237515592575074,
"epoch": 1.1820162971385257,
"grad_norm": 1.3774100542068481,
"learning_rate": 4.287198156786516e-05,
"loss": 1.1786,
"mean_token_accuracy": 0.7511296081542969,
"num_tokens": 613335390.0,
"step": 24950
},
{
"entropy": 1.596169695854187,
"epoch": 1.1843850672730718,
"grad_norm": 1.1338964700698853,
"learning_rate": 4.2667438080888036e-05,
"loss": 1.1616,
"mean_token_accuracy": 0.7531165385246277,
"num_tokens": 614572046.0,
"step": 25000
},
{
"epoch": 1.1843850672730718,
"eval_entropy": 1.0971692777848259,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7585837088482061,
"eval_num_tokens": 614572046.0,
"eval_runtime": 728.9341,
"eval_samples_per_second": 34.043,
"eval_steps_per_second": 4.256,
"step": 25000
},
{
"entropy": 1.6357793831825256,
"epoch": 1.1867538374076179,
"grad_norm": 1.3235797882080078,
"learning_rate": 4.24630199267385e-05,
"loss": 1.2175,
"mean_token_accuracy": 0.7434951883554458,
"num_tokens": 615813138.0,
"step": 25050
},
{
"entropy": 1.610986716747284,
"epoch": 1.1891226075421641,
"grad_norm": 0.9810039401054382,
"learning_rate": 4.225873059946206e-05,
"loss": 1.183,
"mean_token_accuracy": 0.7497907614707947,
"num_tokens": 617052724.0,
"step": 25100
},
{
"entropy": 1.5992292380332946,
"epoch": 1.1914913776767102,
"grad_norm": 1.3308528661727905,
"learning_rate": 4.2054573590902295e-05,
"loss": 1.1455,
"mean_token_accuracy": 0.7566713351011276,
"num_tokens": 618273598.0,
"step": 25150
},
{
"entropy": 1.6277151501178742,
"epoch": 1.1938601478112565,
"grad_norm": 1.3727302551269531,
"learning_rate": 4.1850552390641076e-05,
"loss": 1.2243,
"mean_token_accuracy": 0.7431273967027664,
"num_tokens": 619483045.0,
"step": 25200
},
{
"entropy": 1.622536163330078,
"epoch": 1.1962289179458026,
"grad_norm": 1.319136142730713,
"learning_rate": 4.164667048593892e-05,
"loss": 1.1947,
"mean_token_accuracy": 0.7481009513139725,
"num_tokens": 620705194.0,
"step": 25250
},
{
"entropy": 1.6237769031524658,
"epoch": 1.1985976880803486,
"grad_norm": 1.2181921005249023,
"learning_rate": 4.144293136167549e-05,
"loss": 1.1737,
"mean_token_accuracy": 0.7511861574649811,
"num_tokens": 621924964.0,
"step": 25300
},
{
"entropy": 1.6047194039821624,
"epoch": 1.200966458214895,
"grad_norm": 1.2977948188781738,
"learning_rate": 4.123933850028991e-05,
"loss": 1.2143,
"mean_token_accuracy": 0.7442529916763305,
"num_tokens": 623143777.0,
"step": 25350
},
{
"entropy": 1.6068167972564698,
"epoch": 1.203335228349441,
"grad_norm": 1.313650369644165,
"learning_rate": 4.103589538172127e-05,
"loss": 1.2124,
"mean_token_accuracy": 0.7447144162654876,
"num_tokens": 624380253.0,
"step": 25400
},
{
"entropy": 1.5983986258506775,
"epoch": 1.2057039984839872,
"grad_norm": 1.6129273176193237,
"learning_rate": 4.0832605483349193e-05,
"loss": 1.1634,
"mean_token_accuracy": 0.7545448428392411,
"num_tokens": 625600084.0,
"step": 25450
},
{
"entropy": 1.6317675995826721,
"epoch": 1.2080727686185333,
"grad_norm": 1.2153606414794922,
"learning_rate": 4.062947227993433e-05,
"loss": 1.1998,
"mean_token_accuracy": 0.7479275733232498,
"num_tokens": 626816439.0,
"step": 25500
},
{
"entropy": 1.591133669614792,
"epoch": 1.2104415387530794,
"grad_norm": 1.3711997270584106,
"learning_rate": 4.042649924355905e-05,
"loss": 1.1747,
"mean_token_accuracy": 0.7510749793052673,
"num_tokens": 628060275.0,
"step": 25550
},
{
"entropy": 1.616237759590149,
"epoch": 1.2128103088876254,
"grad_norm": 1.317814588546753,
"learning_rate": 4.022368984356801e-05,
"loss": 1.1964,
"mean_token_accuracy": 0.7473501098155976,
"num_tokens": 629291699.0,
"step": 25600
},
{
"entropy": 1.618736606836319,
"epoch": 1.2151790790221717,
"grad_norm": 1.1058121919631958,
"learning_rate": 4.002104754650887e-05,
"loss": 1.2022,
"mean_token_accuracy": 0.74667718231678,
"num_tokens": 630538034.0,
"step": 25650
},
{
"entropy": 1.6311498081684113,
"epoch": 1.2175478491567178,
"grad_norm": 1.0992521047592163,
"learning_rate": 3.981857581607313e-05,
"loss": 1.1851,
"mean_token_accuracy": 0.7504255121946335,
"num_tokens": 631771489.0,
"step": 25700
},
{
"entropy": 1.6083201706409453,
"epoch": 1.219916619291264,
"grad_norm": 1.2340748310089111,
"learning_rate": 3.9616278113036786e-05,
"loss": 1.1595,
"mean_token_accuracy": 0.7537871873378754,
"num_tokens": 632996983.0,
"step": 25750
},
{
"entropy": 1.6100276720523834,
"epoch": 1.22228538942581,
"grad_norm": 1.2286880016326904,
"learning_rate": 3.9414157895201273e-05,
"loss": 1.2196,
"mean_token_accuracy": 0.7460716181993484,
"num_tokens": 634207237.0,
"step": 25800
},
{
"entropy": 1.6090706491470337,
"epoch": 1.2246541595603562,
"grad_norm": 1.2269102334976196,
"learning_rate": 3.9212218617334356e-05,
"loss": 1.19,
"mean_token_accuracy": 0.7494009816646576,
"num_tokens": 635435267.0,
"step": 25850
},
{
"entropy": 1.5722477328777313,
"epoch": 1.2270229296949025,
"grad_norm": 1.0874273777008057,
"learning_rate": 3.901046373111103e-05,
"loss": 1.1665,
"mean_token_accuracy": 0.7549541050195694,
"num_tokens": 636668958.0,
"step": 25900
},
{
"entropy": 1.6207891261577607,
"epoch": 1.2293916998294485,
"grad_norm": 1.4808669090270996,
"learning_rate": 3.880889668505455e-05,
"loss": 1.2441,
"mean_token_accuracy": 0.7397470092773437,
"num_tokens": 637878020.0,
"step": 25950
},
{
"entropy": 1.5923774099349977,
"epoch": 1.2317604699639948,
"grad_norm": 1.1652626991271973,
"learning_rate": 3.860752092447749e-05,
"loss": 1.1818,
"mean_token_accuracy": 0.7510163110494613,
"num_tokens": 639111025.0,
"step": 26000
},
{
"epoch": 1.2317604699639948,
"eval_entropy": 1.0619611897482402,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7597063543775787,
"eval_num_tokens": 639111025.0,
"eval_runtime": 728.2888,
"eval_samples_per_second": 34.073,
"eval_steps_per_second": 4.259,
"step": 26000
},
{
"entropy": 1.6090249001979828,
"epoch": 1.2341292400985409,
"grad_norm": 1.140541434288025,
"learning_rate": 3.840633989142289e-05,
"loss": 1.2208,
"mean_token_accuracy": 0.7445061576366424,
"num_tokens": 640330754.0,
"step": 26050
},
{
"entropy": 1.5675190496444702,
"epoch": 1.236498010233087,
"grad_norm": 1.1996732950210571,
"learning_rate": 3.820535702460533e-05,
"loss": 1.1648,
"mean_token_accuracy": 0.7530361658334732,
"num_tokens": 641577483.0,
"step": 26100
},
{
"entropy": 1.5749832487106323,
"epoch": 1.2388667803676332,
"grad_norm": 1.3973077535629272,
"learning_rate": 3.800457575935222e-05,
"loss": 1.172,
"mean_token_accuracy": 0.7515737456083298,
"num_tokens": 642810578.0,
"step": 26150
},
{
"entropy": 1.5630564618110656,
"epoch": 1.2412355505021793,
"grad_norm": 1.0359597206115723,
"learning_rate": 3.780399952754507e-05,
"loss": 1.1647,
"mean_token_accuracy": 0.7527536135911942,
"num_tokens": 644066110.0,
"step": 26200
},
{
"entropy": 1.592404429912567,
"epoch": 1.2436043206367253,
"grad_norm": 1.076479434967041,
"learning_rate": 3.7603631757560855e-05,
"loss": 1.1732,
"mean_token_accuracy": 0.7527641028165817,
"num_tokens": 645301566.0,
"step": 26250
},
{
"entropy": 1.6117449808120727,
"epoch": 1.2459730907712716,
"grad_norm": 1.218085527420044,
"learning_rate": 3.7403475874213354e-05,
"loss": 1.2315,
"mean_token_accuracy": 0.7417248862981797,
"num_tokens": 646546442.0,
"step": 26300
},
{
"entropy": 1.5983488774299621,
"epoch": 1.2483418609058177,
"grad_norm": 1.2915470600128174,
"learning_rate": 3.7203535298694656e-05,
"loss": 1.2024,
"mean_token_accuracy": 0.7482427370548248,
"num_tokens": 647787345.0,
"step": 26350
},
{
"entropy": 1.5954162907600402,
"epoch": 1.2507106310403637,
"grad_norm": 1.0252054929733276,
"learning_rate": 3.700381344851665e-05,
"loss": 1.1757,
"mean_token_accuracy": 0.752863358259201,
"num_tokens": 649019906.0,
"step": 26400
},
{
"entropy": 1.5740117967128753,
"epoch": 1.25307940117491,
"grad_norm": 1.2225929498672485,
"learning_rate": 3.6804313737452686e-05,
"loss": 1.1731,
"mean_token_accuracy": 0.7517155534029007,
"num_tokens": 650242573.0,
"step": 26450
},
{
"entropy": 1.5935308575630187,
"epoch": 1.255448171309456,
"grad_norm": 1.3027613162994385,
"learning_rate": 3.66050395754791e-05,
"loss": 1.1661,
"mean_token_accuracy": 0.7530785751342773,
"num_tokens": 651477593.0,
"step": 26500
},
{
"entropy": 1.5640760624408723,
"epoch": 1.2578169414440024,
"grad_norm": 0.9961079955101013,
"learning_rate": 3.6405994368717054e-05,
"loss": 1.1706,
"mean_token_accuracy": 0.7543714487552643,
"num_tokens": 652736186.0,
"step": 26550
},
{
"entropy": 1.6221139824390411,
"epoch": 1.2601857115785484,
"grad_norm": 1.6644654273986816,
"learning_rate": 3.620718151937425e-05,
"loss": 1.1881,
"mean_token_accuracy": 0.7484775596857071,
"num_tokens": 653945306.0,
"step": 26600
},
{
"entropy": 1.627434605360031,
"epoch": 1.2625544817130945,
"grad_norm": 1.1984070539474487,
"learning_rate": 3.6008604425686766e-05,
"loss": 1.2087,
"mean_token_accuracy": 0.744699953198433,
"num_tokens": 655163994.0,
"step": 26650
},
{
"entropy": 1.595815200805664,
"epoch": 1.2649232518476408,
"grad_norm": 0.942965030670166,
"learning_rate": 3.581026648186101e-05,
"loss": 1.2047,
"mean_token_accuracy": 0.7466594022512436,
"num_tokens": 656389078.0,
"step": 26700
},
{
"entropy": 1.5596436941623688,
"epoch": 1.2672920219821868,
"grad_norm": 1.0333982706069946,
"learning_rate": 3.561217107801568e-05,
"loss": 1.1366,
"mean_token_accuracy": 0.7599540430307389,
"num_tokens": 657628840.0,
"step": 26750
},
{
"entropy": 1.582226196527481,
"epoch": 1.269660792116733,
"grad_norm": 1.3895862102508545,
"learning_rate": 3.5414321600123854e-05,
"loss": 1.1594,
"mean_token_accuracy": 0.7542477381229401,
"num_tokens": 658863710.0,
"step": 26800
},
{
"entropy": 1.6000851714611053,
"epoch": 1.2720295622512792,
"grad_norm": 1.2000585794448853,
"learning_rate": 3.521672142995506e-05,
"loss": 1.1862,
"mean_token_accuracy": 0.7507990497350693,
"num_tokens": 660068012.0,
"step": 26850
},
{
"entropy": 1.6038016283512115,
"epoch": 1.2743983323858252,
"grad_norm": 1.0799274444580078,
"learning_rate": 3.501937394501747e-05,
"loss": 1.1911,
"mean_token_accuracy": 0.7496211153268814,
"num_tokens": 661305265.0,
"step": 26900
},
{
"entropy": 1.6001941812038423,
"epoch": 1.2767671025203713,
"grad_norm": 1.0266954898834229,
"learning_rate": 3.4822282518500286e-05,
"loss": 1.1319,
"mean_token_accuracy": 0.7590432322025299,
"num_tokens": 662525430.0,
"step": 26950
},
{
"entropy": 1.6020025527477264,
"epoch": 1.2791358726549176,
"grad_norm": 1.4219437837600708,
"learning_rate": 3.4625450519215915e-05,
"loss": 1.1896,
"mean_token_accuracy": 0.7499201774597168,
"num_tokens": 663708332.0,
"step": 27000
},
{
"epoch": 1.2791358726549176,
"eval_entropy": 1.0667552875749993,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7588769892467213,
"eval_num_tokens": 663708332.0,
"eval_runtime": 730.6419,
"eval_samples_per_second": 33.963,
"eval_steps_per_second": 4.246,
"step": 27000
},
{
"entropy": 1.5845714378356934,
"epoch": 1.2815046427894636,
"grad_norm": 1.2918035984039307,
"learning_rate": 3.4428881311542485e-05,
"loss": 1.2018,
"mean_token_accuracy": 0.7470276898145676,
"num_tokens": 664918181.0,
"step": 27050
},
{
"entropy": 1.567339129447937,
"epoch": 1.28387341292401,
"grad_norm": 1.3722400665283203,
"learning_rate": 3.423257825536637e-05,
"loss": 1.1469,
"mean_token_accuracy": 0.7578269052505493,
"num_tokens": 666127940.0,
"step": 27100
},
{
"entropy": 1.5779772651195527,
"epoch": 1.286242183058556,
"grad_norm": 1.2678896188735962,
"learning_rate": 3.403654470602463e-05,
"loss": 1.2057,
"mean_token_accuracy": 0.7468883281946183,
"num_tokens": 667363858.0,
"step": 27150
},
{
"entropy": 1.6206267583370209,
"epoch": 1.288610953193102,
"grad_norm": 1.415664553642273,
"learning_rate": 3.3840784014247825e-05,
"loss": 1.1709,
"mean_token_accuracy": 0.7533509171009064,
"num_tokens": 668586984.0,
"step": 27200
},
{
"entropy": 1.58282252907753,
"epoch": 1.2909797233276483,
"grad_norm": 1.0562044382095337,
"learning_rate": 3.3645299526102625e-05,
"loss": 1.1525,
"mean_token_accuracy": 0.7587102675437927,
"num_tokens": 669848008.0,
"step": 27250
},
{
"entropy": 1.6004004609584808,
"epoch": 1.2933484934621944,
"grad_norm": 1.3321800231933594,
"learning_rate": 3.3450094582934624e-05,
"loss": 1.168,
"mean_token_accuracy": 0.7531924885511398,
"num_tokens": 671055921.0,
"step": 27300
},
{
"entropy": 1.6070753967761993,
"epoch": 1.2957172635967407,
"grad_norm": 1.1480814218521118,
"learning_rate": 3.3255172521311296e-05,
"loss": 1.1957,
"mean_token_accuracy": 0.7474820953607559,
"num_tokens": 672291814.0,
"step": 27350
},
{
"entropy": 1.618386241197586,
"epoch": 1.2980860337312867,
"grad_norm": 1.3218634128570557,
"learning_rate": 3.306053667296491e-05,
"loss": 1.1813,
"mean_token_accuracy": 0.749809256196022,
"num_tokens": 673529686.0,
"step": 27400
},
{
"entropy": 1.5894237875938415,
"epoch": 1.3004548038658328,
"grad_norm": 1.2133702039718628,
"learning_rate": 3.286619036473557e-05,
"loss": 1.1527,
"mean_token_accuracy": 0.7563208711147308,
"num_tokens": 674737012.0,
"step": 27450
},
{
"entropy": 1.5680401778221131,
"epoch": 1.302823574000379,
"grad_norm": 1.3504135608673096,
"learning_rate": 3.267213691851443e-05,
"loss": 1.1453,
"mean_token_accuracy": 0.7576598930358887,
"num_tokens": 676016669.0,
"step": 27500
},
{
"entropy": 1.5564317107200623,
"epoch": 1.3051923441349251,
"grad_norm": 1.2370836734771729,
"learning_rate": 3.2478379651186814e-05,
"loss": 1.151,
"mean_token_accuracy": 0.7560758543014526,
"num_tokens": 677240518.0,
"step": 27550
},
{
"entropy": 1.5811952316761018,
"epoch": 1.3075611142694714,
"grad_norm": 1.161582589149475,
"learning_rate": 3.228492187457557e-05,
"loss": 1.1623,
"mean_token_accuracy": 0.7540548771619797,
"num_tokens": 678477906.0,
"step": 27600
},
{
"entropy": 1.616460270881653,
"epoch": 1.3099298844040175,
"grad_norm": 1.2357761859893799,
"learning_rate": 3.209176689538448e-05,
"loss": 1.203,
"mean_token_accuracy": 0.7480911284685134,
"num_tokens": 679717449.0,
"step": 27650
},
{
"entropy": 1.6137902176380157,
"epoch": 1.3122986545385635,
"grad_norm": 1.1097781658172607,
"learning_rate": 3.189891801514171e-05,
"loss": 1.1877,
"mean_token_accuracy": 0.7503223437070846,
"num_tokens": 680910674.0,
"step": 27700
},
{
"entropy": 1.6009632289409637,
"epoch": 1.3146674246731096,
"grad_norm": 1.260872721672058,
"learning_rate": 3.1706378530143385e-05,
"loss": 1.1725,
"mean_token_accuracy": 0.7530629223585129,
"num_tokens": 682144950.0,
"step": 27750
},
{
"entropy": 1.6063115882873535,
"epoch": 1.3170361948076559,
"grad_norm": 1.1645578145980835,
"learning_rate": 3.1514151731397246e-05,
"loss": 1.1647,
"mean_token_accuracy": 0.753446283340454,
"num_tokens": 683390865.0,
"step": 27800
},
{
"entropy": 1.6270235812664031,
"epoch": 1.319404964942202,
"grad_norm": 1.2020407915115356,
"learning_rate": 3.1322240904566426e-05,
"loss": 1.1735,
"mean_token_accuracy": 0.7529788100719452,
"num_tokens": 684605889.0,
"step": 27850
},
{
"entropy": 1.607979006767273,
"epoch": 1.3217737350767482,
"grad_norm": 1.137190580368042,
"learning_rate": 3.1130649329913225e-05,
"loss": 1.2056,
"mean_token_accuracy": 0.7471660190820694,
"num_tokens": 685842856.0,
"step": 27900
},
{
"entropy": 1.6045704185962677,
"epoch": 1.3241425052112943,
"grad_norm": 1.21959388256073,
"learning_rate": 3.09393802822431e-05,
"loss": 1.1506,
"mean_token_accuracy": 0.7561245012283325,
"num_tokens": 687059905.0,
"step": 27950
},
{
"entropy": 1.6008513212203979,
"epoch": 1.3265112753458403,
"grad_norm": 0.969918429851532,
"learning_rate": 3.074843703084869e-05,
"loss": 1.1717,
"mean_token_accuracy": 0.7522702825069427,
"num_tokens": 688293184.0,
"step": 28000
},
{
"epoch": 1.3265112753458403,
"eval_entropy": 1.1074502345902315,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7599941444139493,
"eval_num_tokens": 688293184.0,
"eval_runtime": 728.3714,
"eval_samples_per_second": 34.069,
"eval_steps_per_second": 4.259,
"step": 28000
},
{
"entropy": 1.613970617055893,
"epoch": 1.3288800454803866,
"grad_norm": 1.2240134477615356,
"learning_rate": 3.0557822839453874e-05,
"loss": 1.1618,
"mean_token_accuracy": 0.7536975979804993,
"num_tokens": 689517314.0,
"step": 28050
},
{
"entropy": 1.5815585339069367,
"epoch": 1.3312488156149327,
"grad_norm": 1.1150712966918945,
"learning_rate": 3.036754096615807e-05,
"loss": 1.1704,
"mean_token_accuracy": 0.753908543586731,
"num_tokens": 690755165.0,
"step": 28100
},
{
"entropy": 1.5578910648822784,
"epoch": 1.333617585749479,
"grad_norm": 1.2640388011932373,
"learning_rate": 3.017759466338046e-05,
"loss": 1.1623,
"mean_token_accuracy": 0.7535911196470261,
"num_tokens": 692007773.0,
"step": 28150
},
{
"entropy": 1.5919007360935211,
"epoch": 1.335986355884025,
"grad_norm": 1.1327555179595947,
"learning_rate": 2.9987987177804494e-05,
"loss": 1.1729,
"mean_token_accuracy": 0.7511158144474029,
"num_tokens": 693232662.0,
"step": 28200
},
{
"entropy": 1.6085839200019836,
"epoch": 1.338355126018571,
"grad_norm": 0.9447433352470398,
"learning_rate": 2.979872175032231e-05,
"loss": 1.1558,
"mean_token_accuracy": 0.7552832061052323,
"num_tokens": 694432399.0,
"step": 28250
},
{
"entropy": 1.6203950083255767,
"epoch": 1.3407238961531174,
"grad_norm": 1.1614621877670288,
"learning_rate": 2.960980161597936e-05,
"loss": 1.1892,
"mean_token_accuracy": 0.7479177683591842,
"num_tokens": 695664920.0,
"step": 28300
},
{
"entropy": 1.588459266424179,
"epoch": 1.3430926662876634,
"grad_norm": 1.6014941930770874,
"learning_rate": 2.9421230003919155e-05,
"loss": 1.1604,
"mean_token_accuracy": 0.75300128698349,
"num_tokens": 696895792.0,
"step": 28350
},
{
"entropy": 1.6118758118152618,
"epoch": 1.3454614364222097,
"grad_norm": 1.1947180032730103,
"learning_rate": 2.923301013732799e-05,
"loss": 1.1825,
"mean_token_accuracy": 0.7502673131227493,
"num_tokens": 698079475.0,
"step": 28400
},
{
"entropy": 1.6115264117717742,
"epoch": 1.3478302065567558,
"grad_norm": 1.2438665628433228,
"learning_rate": 2.9045145233379976e-05,
"loss": 1.2001,
"mean_token_accuracy": 0.7489022916555405,
"num_tokens": 699305883.0,
"step": 28450
},
{
"entropy": 1.5930208683013916,
"epoch": 1.3501989766913018,
"grad_norm": 1.2472587823867798,
"learning_rate": 2.885763850318193e-05,
"loss": 1.1455,
"mean_token_accuracy": 0.7588497418165207,
"num_tokens": 700517157.0,
"step": 28500
},
{
"entropy": 1.611283905506134,
"epoch": 1.352567746825848,
"grad_norm": 1.1896998882293701,
"learning_rate": 2.8670493151718526e-05,
"loss": 1.2069,
"mean_token_accuracy": 0.7471307969093323,
"num_tokens": 701725293.0,
"step": 28550
},
{
"entropy": 1.5695743489265441,
"epoch": 1.3549365169603942,
"grad_norm": 1.1057043075561523,
"learning_rate": 2.8483712377797544e-05,
"loss": 1.1538,
"mean_token_accuracy": 0.7563241708278656,
"num_tokens": 702969110.0,
"step": 28600
},
{
"entropy": 1.5800132751464844,
"epoch": 1.3573052870949402,
"grad_norm": 1.1600664854049683,
"learning_rate": 2.829729937399515e-05,
"loss": 1.1533,
"mean_token_accuracy": 0.7582338035106659,
"num_tokens": 704225571.0,
"step": 28650
},
{
"entropy": 1.6222402799129485,
"epoch": 1.3596740572294865,
"grad_norm": 0.993548572063446,
"learning_rate": 2.8111257326601402e-05,
"loss": 1.2294,
"mean_token_accuracy": 0.742488032579422,
"num_tokens": 705467457.0,
"step": 28700
},
{
"entropy": 1.568291175365448,
"epoch": 1.3620428273640326,
"grad_norm": 1.0379763841629028,
"learning_rate": 2.7925589415565666e-05,
"loss": 1.1593,
"mean_token_accuracy": 0.7555217838287354,
"num_tokens": 706689479.0,
"step": 28750
},
{
"entropy": 1.6043275892734528,
"epoch": 1.3644115974985787,
"grad_norm": 1.43356454372406,
"learning_rate": 2.774029881444238e-05,
"loss": 1.2127,
"mean_token_accuracy": 0.7451710641384125,
"num_tokens": 707935708.0,
"step": 28800
},
{
"entropy": 1.5973702204227447,
"epoch": 1.366780367633125,
"grad_norm": 1.2377339601516724,
"learning_rate": 2.7555388690336725e-05,
"loss": 1.163,
"mean_token_accuracy": 0.7523965907096862,
"num_tokens": 709186556.0,
"step": 28850
},
{
"entropy": 1.5913502633571626,
"epoch": 1.369149137767671,
"grad_norm": 1.2148689031600952,
"learning_rate": 2.737086220385055e-05,
"loss": 1.128,
"mean_token_accuracy": 0.7610643255710602,
"num_tokens": 710387868.0,
"step": 28900
},
{
"entropy": 1.629042412042618,
"epoch": 1.3715179079022173,
"grad_norm": 1.3843477964401245,
"learning_rate": 2.7186722509028294e-05,
"loss": 1.1943,
"mean_token_accuracy": 0.7486888426542282,
"num_tokens": 711599301.0,
"step": 28950
},
{
"entropy": 1.5498824548721313,
"epoch": 1.3738866780367633,
"grad_norm": 1.2801408767700195,
"learning_rate": 2.7002972753303167e-05,
"loss": 1.1466,
"mean_token_accuracy": 0.7573213475942612,
"num_tokens": 712853561.0,
"step": 29000
},
{
"epoch": 1.3738866780367633,
"eval_entropy": 1.08869860577322,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7604253247985526,
"eval_num_tokens": 712853561.0,
"eval_runtime": 728.768,
"eval_samples_per_second": 34.051,
"eval_steps_per_second": 4.256,
"step": 29000
},
{
"entropy": 1.5806612002849578,
"epoch": 1.3762554481713094,
"grad_norm": 1.1750032901763916,
"learning_rate": 2.6819616077443243e-05,
"loss": 1.1608,
"mean_token_accuracy": 0.754431728720665,
"num_tokens": 714067279.0,
"step": 29050
},
{
"entropy": 1.5694472527503966,
"epoch": 1.3786242183058555,
"grad_norm": 1.0282950401306152,
"learning_rate": 2.6636655615497808e-05,
"loss": 1.1345,
"mean_token_accuracy": 0.7596888369321824,
"num_tokens": 715313924.0,
"step": 29100
},
{
"entropy": 1.5819503486156463,
"epoch": 1.3809929884404017,
"grad_norm": 1.0335078239440918,
"learning_rate": 2.6454094494743865e-05,
"loss": 1.172,
"mean_token_accuracy": 0.7516703462600708,
"num_tokens": 716539594.0,
"step": 29150
},
{
"entropy": 1.5596589314937592,
"epoch": 1.3833617585749478,
"grad_norm": 1.1024916172027588,
"learning_rate": 2.627193583563259e-05,
"loss": 1.1515,
"mean_token_accuracy": 0.7565756964683533,
"num_tokens": 717779064.0,
"step": 29200
},
{
"entropy": 1.5737035143375397,
"epoch": 1.385730528709494,
"grad_norm": 1.3154332637786865,
"learning_rate": 2.609018275173601e-05,
"loss": 1.1313,
"mean_token_accuracy": 0.7610451829433441,
"num_tokens": 719013380.0,
"step": 29250
},
{
"entropy": 1.5879342305660247,
"epoch": 1.3880992988440402,
"grad_norm": 1.1045840978622437,
"learning_rate": 2.590883834969383e-05,
"loss": 1.1607,
"mean_token_accuracy": 0.753515048623085,
"num_tokens": 720213990.0,
"step": 29300
},
{
"entropy": 1.5969963049888611,
"epoch": 1.3904680689785862,
"grad_norm": 1.360352635383606,
"learning_rate": 2.5727905729160274e-05,
"loss": 1.2105,
"mean_token_accuracy": 0.7468285745382309,
"num_tokens": 721454429.0,
"step": 29350
},
{
"entropy": 1.5682587361335754,
"epoch": 1.3928368391131325,
"grad_norm": 1.2134160995483398,
"learning_rate": 2.5547387982751186e-05,
"loss": 1.1674,
"mean_token_accuracy": 0.7542579096555709,
"num_tokens": 722678187.0,
"step": 29400
},
{
"entropy": 1.5867646288871766,
"epoch": 1.3952056092476786,
"grad_norm": 1.316106915473938,
"learning_rate": 2.536728819599108e-05,
"loss": 1.1752,
"mean_token_accuracy": 0.7528412294387817,
"num_tokens": 723910197.0,
"step": 29450
},
{
"entropy": 1.5651627695560455,
"epoch": 1.3975743793822248,
"grad_norm": 1.184169054031372,
"learning_rate": 2.5187609447260417e-05,
"loss": 1.1439,
"mean_token_accuracy": 0.7595011454820633,
"num_tokens": 725117786.0,
"step": 29500
},
{
"entropy": 1.5885691118240357,
"epoch": 1.399943149516771,
"grad_norm": 1.026950716972351,
"learning_rate": 2.5008354807743063e-05,
"loss": 1.1624,
"mean_token_accuracy": 0.7540817469358444,
"num_tokens": 726361382.0,
"step": 29550
},
{
"entropy": 1.6078854203224182,
"epoch": 1.402311919651317,
"grad_norm": 1.105989694595337,
"learning_rate": 2.482952734137369e-05,
"loss": 1.1846,
"mean_token_accuracy": 0.7512462210655212,
"num_tokens": 727584011.0,
"step": 29600
},
{
"entropy": 1.5742497992515565,
"epoch": 1.4046806897858632,
"grad_norm": 1.0175246000289917,
"learning_rate": 2.4651130104785464e-05,
"loss": 1.1383,
"mean_token_accuracy": 0.7599206572771072,
"num_tokens": 728859452.0,
"step": 29650
},
{
"entropy": 1.5539786064624785,
"epoch": 1.4070494599204093,
"grad_norm": 1.19257652759552,
"learning_rate": 2.447316614725779e-05,
"loss": 1.1285,
"mean_token_accuracy": 0.7605455183982849,
"num_tokens": 730093871.0,
"step": 29700
},
{
"entropy": 1.575538364648819,
"epoch": 1.4094182300549556,
"grad_norm": 1.3367068767547607,
"learning_rate": 2.429563851066423e-05,
"loss": 1.1549,
"mean_token_accuracy": 0.7566698521375657,
"num_tokens": 731296865.0,
"step": 29750
},
{
"entropy": 1.5920424699783324,
"epoch": 1.4117870001895017,
"grad_norm": 1.1951195001602173,
"learning_rate": 2.411855022942043e-05,
"loss": 1.1863,
"mean_token_accuracy": 0.7511163413524627,
"num_tokens": 732528311.0,
"step": 29800
},
{
"entropy": 1.5808912098407746,
"epoch": 1.4141557703240477,
"grad_norm": 1.2582076787948608,
"learning_rate": 2.394190433043228e-05,
"loss": 1.1524,
"mean_token_accuracy": 0.756331347823143,
"num_tokens": 733754679.0,
"step": 29850
},
{
"entropy": 1.6012385189533234,
"epoch": 1.4165245404585938,
"grad_norm": 1.2719967365264893,
"learning_rate": 2.376570383304423e-05,
"loss": 1.1689,
"mean_token_accuracy": 0.7530979549884796,
"num_tokens": 734988780.0,
"step": 29900
},
{
"entropy": 1.5944563674926757,
"epoch": 1.41889331059314,
"grad_norm": 1.168672800064087,
"learning_rate": 2.3589951748987615e-05,
"loss": 1.1874,
"mean_token_accuracy": 0.7496302407979966,
"num_tokens": 736210496.0,
"step": 29950
},
{
"entropy": 1.5730518507957458,
"epoch": 1.4212620807276861,
"grad_norm": 1.2104912996292114,
"learning_rate": 2.3414651082329214e-05,
"loss": 1.1672,
"mean_token_accuracy": 0.7543781703710556,
"num_tokens": 737427744.0,
"step": 30000
},
{
"epoch": 1.4212620807276861,
"eval_entropy": 1.0714102481581333,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7607390975986735,
"eval_num_tokens": 737427744.0,
"eval_runtime": 727.6986,
"eval_samples_per_second": 34.101,
"eval_steps_per_second": 4.263,
"step": 30000
},
{
"entropy": 1.5904272723197936,
"epoch": 1.4236308508622324,
"grad_norm": 1.3191770315170288,
"learning_rate": 2.323980482941987e-05,
"loss": 1.1451,
"mean_token_accuracy": 0.7584353858232498,
"num_tokens": 738655939.0,
"step": 30050
},
{
"entropy": 1.5946367990970611,
"epoch": 1.4259996209967785,
"grad_norm": 1.6591154336929321,
"learning_rate": 2.3065415978843334e-05,
"loss": 1.1805,
"mean_token_accuracy": 0.7520586925745011,
"num_tokens": 739862826.0,
"step": 30100
},
{
"entropy": 1.5869379675388335,
"epoch": 1.4283683911313245,
"grad_norm": 1.4352805614471436,
"learning_rate": 2.2891487511365144e-05,
"loss": 1.1486,
"mean_token_accuracy": 0.7577965116500854,
"num_tokens": 741119357.0,
"step": 30150
},
{
"entropy": 1.5365791404247284,
"epoch": 1.4307371612658708,
"grad_norm": 1.2801551818847656,
"learning_rate": 2.2718022399881637e-05,
"loss": 1.142,
"mean_token_accuracy": 0.7584273481369018,
"num_tokens": 742333607.0,
"step": 30200
},
{
"entropy": 1.5572332954406738,
"epoch": 1.4331059314004169,
"grad_norm": 1.212966799736023,
"learning_rate": 2.2545023609369202e-05,
"loss": 1.1619,
"mean_token_accuracy": 0.7548242086172103,
"num_tokens": 743565034.0,
"step": 30250
},
{
"entropy": 1.566081155538559,
"epoch": 1.4354747015349631,
"grad_norm": 1.0920426845550537,
"learning_rate": 2.237249409683356e-05,
"loss": 1.1783,
"mean_token_accuracy": 0.7530256235599517,
"num_tokens": 744804057.0,
"step": 30300
},
{
"entropy": 1.6045558285713195,
"epoch": 1.4378434716695092,
"grad_norm": 0.9273141026496887,
"learning_rate": 2.220043681125924e-05,
"loss": 1.1419,
"mean_token_accuracy": 0.7590651035308837,
"num_tokens": 746006768.0,
"step": 30350
},
{
"entropy": 1.5942323195934296,
"epoch": 1.4402122418040553,
"grad_norm": 1.1541792154312134,
"learning_rate": 2.202885469355916e-05,
"loss": 1.1921,
"mean_token_accuracy": 0.7489143800735474,
"num_tokens": 747223675.0,
"step": 30400
},
{
"entropy": 1.5869334352016449,
"epoch": 1.4425810119386016,
"grad_norm": 1.462320327758789,
"learning_rate": 2.1857750676524357e-05,
"loss": 1.1442,
"mean_token_accuracy": 0.7573701620101929,
"num_tokens": 748430497.0,
"step": 30450
},
{
"entropy": 1.586618103981018,
"epoch": 1.4449497820731476,
"grad_norm": 1.0793588161468506,
"learning_rate": 2.168712768477392e-05,
"loss": 1.1743,
"mean_token_accuracy": 0.7522615754604339,
"num_tokens": 749647006.0,
"step": 30500
},
{
"entropy": 1.6103046894073487,
"epoch": 1.447318552207694,
"grad_norm": 1.2154242992401123,
"learning_rate": 2.1516988634704882e-05,
"loss": 1.19,
"mean_token_accuracy": 0.7501159131526947,
"num_tokens": 750853602.0,
"step": 30550
},
{
"entropy": 1.5543176436424255,
"epoch": 1.44968732234224,
"grad_norm": 1.1655502319335938,
"learning_rate": 2.1347336434442467e-05,
"loss": 1.1284,
"mean_token_accuracy": 0.7604024815559387,
"num_tokens": 752063383.0,
"step": 30600
},
{
"entropy": 1.5402367627620697,
"epoch": 1.452056092476786,
"grad_norm": 0.9396981000900269,
"learning_rate": 2.1178173983790333e-05,
"loss": 1.1413,
"mean_token_accuracy": 0.7587384188175201,
"num_tokens": 753297932.0,
"step": 30650
},
{
"entropy": 1.565843381881714,
"epoch": 1.454424862611332,
"grad_norm": 1.2412699460983276,
"learning_rate": 2.100950417418105e-05,
"loss": 1.1336,
"mean_token_accuracy": 0.76046923995018,
"num_tokens": 754534333.0,
"step": 30700
},
{
"entropy": 1.5800429701805114,
"epoch": 1.4567936327458784,
"grad_norm": 1.3534191846847534,
"learning_rate": 2.084132988862663e-05,
"loss": 1.168,
"mean_token_accuracy": 0.7545898991823197,
"num_tokens": 755771112.0,
"step": 30750
},
{
"entropy": 1.5431535518169404,
"epoch": 1.4591624028804244,
"grad_norm": 1.1893748044967651,
"learning_rate": 2.067365400166928e-05,
"loss": 1.1317,
"mean_token_accuracy": 0.7592762231826782,
"num_tokens": 757016170.0,
"step": 30800
},
{
"entropy": 1.5234503149986267,
"epoch": 1.4615311730149707,
"grad_norm": 1.2661027908325195,
"learning_rate": 2.0506479379332277e-05,
"loss": 1.1197,
"mean_token_accuracy": 0.7625928592681884,
"num_tokens": 758267588.0,
"step": 30850
},
{
"entropy": 1.595642819404602,
"epoch": 1.4638999431495168,
"grad_norm": 1.3147796392440796,
"learning_rate": 2.0339808879070942e-05,
"loss": 1.1943,
"mean_token_accuracy": 0.7485580265522003,
"num_tokens": 759488024.0,
"step": 30900
},
{
"entropy": 1.581249178647995,
"epoch": 1.4662687132840628,
"grad_norm": 1.1915379762649536,
"learning_rate": 2.0173645349723823e-05,
"loss": 1.1843,
"mean_token_accuracy": 0.751889705657959,
"num_tokens": 760705547.0,
"step": 30950
},
{
"entropy": 1.547069821357727,
"epoch": 1.4686374834186091,
"grad_norm": 1.3527320623397827,
"learning_rate": 2.0007991631463985e-05,
"loss": 1.124,
"mean_token_accuracy": 0.7617496418952941,
"num_tokens": 761946385.0,
"step": 31000
},
{
"epoch": 1.4686374834186091,
"eval_entropy": 1.0745692070571167,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7606811193421454,
"eval_num_tokens": 761946385.0,
"eval_runtime": 750.8644,
"eval_samples_per_second": 33.049,
"eval_steps_per_second": 4.131,
"step": 31000
},
{
"entropy": 1.5860296082496643,
"epoch": 1.4710062535531552,
"grad_norm": 1.1028927564620972,
"learning_rate": 1.984285055575052e-05,
"loss": 1.1477,
"mean_token_accuracy": 0.757946463227272,
"num_tokens": 763145229.0,
"step": 31050
},
{
"entropy": 1.5409555327892304,
"epoch": 1.4733750236877015,
"grad_norm": 1.3105554580688477,
"learning_rate": 1.967822494528007e-05,
"loss": 1.1388,
"mean_token_accuracy": 0.7595143103599549,
"num_tokens": 764368780.0,
"step": 31100
},
{
"entropy": 1.5342436349391937,
"epoch": 1.4757437938222475,
"grad_norm": 1.175134539604187,
"learning_rate": 1.9514117613938625e-05,
"loss": 1.1376,
"mean_token_accuracy": 0.7598193883895874,
"num_tokens": 765622021.0,
"step": 31150
},
{
"entropy": 1.568356648683548,
"epoch": 1.4781125639567936,
"grad_norm": 1.1193444728851318,
"learning_rate": 1.935053136675339e-05,
"loss": 1.1488,
"mean_token_accuracy": 0.7594327408075333,
"num_tokens": 766856360.0,
"step": 31200
},
{
"entropy": 1.5699161183834076,
"epoch": 1.4804813340913396,
"grad_norm": 1.0560715198516846,
"learning_rate": 1.9187468999844936e-05,
"loss": 1.1459,
"mean_token_accuracy": 0.7583613079786301,
"num_tokens": 768058852.0,
"step": 31250
},
{
"entropy": 1.588351196050644,
"epoch": 1.482850104225886,
"grad_norm": 1.2862846851348877,
"learning_rate": 1.9024933300379277e-05,
"loss": 1.1692,
"mean_token_accuracy": 0.7539873999357224,
"num_tokens": 769278437.0,
"step": 31300
},
{
"entropy": 1.5751077544689178,
"epoch": 1.485218874360432,
"grad_norm": 1.1171611547470093,
"learning_rate": 1.8862927046520312e-05,
"loss": 1.1468,
"mean_token_accuracy": 0.7574340969324111,
"num_tokens": 770491005.0,
"step": 31350
},
{
"entropy": 1.5523167753219604,
"epoch": 1.4875876444949783,
"grad_norm": 1.1966944932937622,
"learning_rate": 1.8701453007382314e-05,
"loss": 1.1322,
"mean_token_accuracy": 0.7628469413518906,
"num_tokens": 771700031.0,
"step": 31400
},
{
"entropy": 1.5793978321552276,
"epoch": 1.4899564146295243,
"grad_norm": 1.404768466949463,
"learning_rate": 1.8540513942982602e-05,
"loss": 1.1309,
"mean_token_accuracy": 0.7610709732770919,
"num_tokens": 772941795.0,
"step": 31450
},
{
"entropy": 1.5804509365558623,
"epoch": 1.4923251847640704,
"grad_norm": 1.3773914575576782,
"learning_rate": 1.838011260419435e-05,
"loss": 1.1556,
"mean_token_accuracy": 0.7568354111909866,
"num_tokens": 774162687.0,
"step": 31500
},
{
"entropy": 1.5457658851146698,
"epoch": 1.4946939548986167,
"grad_norm": 0.9370711445808411,
"learning_rate": 1.822025173269964e-05,
"loss": 1.1291,
"mean_token_accuracy": 0.7615066528320312,
"num_tokens": 775426714.0,
"step": 31550
},
{
"entropy": 1.550627862215042,
"epoch": 1.4970627250331627,
"grad_norm": 1.1992812156677246,
"learning_rate": 1.8060934060942487e-05,
"loss": 1.1443,
"mean_token_accuracy": 0.7579187524318695,
"num_tokens": 776645207.0,
"step": 31600
},
{
"entropy": 1.5496788358688354,
"epoch": 1.499431495167709,
"grad_norm": 1.290854811668396,
"learning_rate": 1.7902162312082194e-05,
"loss": 1.1542,
"mean_token_accuracy": 0.7575876170396805,
"num_tokens": 777890539.0,
"step": 31650
},
{
"entropy": 1.583651841878891,
"epoch": 1.501800265302255,
"grad_norm": 1.4201711416244507,
"learning_rate": 1.7743939199946818e-05,
"loss": 1.1669,
"mean_token_accuracy": 0.7559886735677719,
"num_tokens": 779106659.0,
"step": 31700
},
{
"entropy": 1.5621719944477082,
"epoch": 1.5041690354368011,
"grad_norm": 1.0013508796691895,
"learning_rate": 1.7586267428986763e-05,
"loss": 1.1622,
"mean_token_accuracy": 0.7543949365615845,
"num_tokens": 780313881.0,
"step": 31750
},
{
"entropy": 1.5784629476070404,
"epoch": 1.5065378055713472,
"grad_norm": 1.293186068534851,
"learning_rate": 1.742914969422856e-05,
"loss": 1.1484,
"mean_token_accuracy": 0.7578674453496933,
"num_tokens": 781544604.0,
"step": 31800
},
{
"entropy": 1.5572881984710694,
"epoch": 1.5089065757058935,
"grad_norm": 1.1909185647964478,
"learning_rate": 1.7272588681228767e-05,
"loss": 1.1025,
"mean_token_accuracy": 0.7669892936944962,
"num_tokens": 782765240.0,
"step": 31850
},
{
"entropy": 1.5987060451507569,
"epoch": 1.5112753458404398,
"grad_norm": 1.3331712484359741,
"learning_rate": 1.7116587066028172e-05,
"loss": 1.1787,
"mean_token_accuracy": 0.7533298796415329,
"num_tokens": 783994667.0,
"step": 31900
},
{
"entropy": 1.5784035372734069,
"epoch": 1.5136441159749858,
"grad_norm": 1.3433549404144287,
"learning_rate": 1.6961147515105897e-05,
"loss": 1.1539,
"mean_token_accuracy": 0.7583291745185852,
"num_tokens": 785241722.0,
"step": 31950
},
{
"entropy": 1.5924919998645783,
"epoch": 1.5160128861095319,
"grad_norm": 0.9708880186080933,
"learning_rate": 1.6806272685333967e-05,
"loss": 1.168,
"mean_token_accuracy": 0.7547562402486802,
"num_tokens": 786450293.0,
"step": 32000
},
{
"epoch": 1.5160128861095319,
"eval_entropy": 1.0883530624676796,
"eval_loss": null,
"eval_mean_token_accuracy": 0.760518561690796,
"eval_num_tokens": 786450293.0,
"eval_runtime": 728.4178,
"eval_samples_per_second": 34.067,
"eval_steps_per_second": 4.259,
"step": 32000
},
{
"entropy": 1.5847830092906952,
"epoch": 1.518381656244078,
"grad_norm": 1.0674691200256348,
"learning_rate": 1.6651965223931798e-05,
"loss": 1.122,
"mean_token_accuracy": 0.7640283882617951,
"num_tokens": 787651249.0,
"step": 32050
},
{
"entropy": 1.6060099351406096,
"epoch": 1.5207504263786242,
"grad_norm": 1.3451073169708252,
"learning_rate": 1.6498227768420986e-05,
"loss": 1.1986,
"mean_token_accuracy": 0.7503712397813797,
"num_tokens": 788894856.0,
"step": 32100
},
{
"entropy": 1.5495011293888092,
"epoch": 1.5231191965131705,
"grad_norm": 1.184458613395691,
"learning_rate": 1.634506294658023e-05,
"loss": 1.1241,
"mean_token_accuracy": 0.7635609942674637,
"num_tokens": 790124279.0,
"step": 32150
},
{
"entropy": 1.5789199233055116,
"epoch": 1.5254879666477166,
"grad_norm": 1.4204998016357422,
"learning_rate": 1.619247337640041e-05,
"loss": 1.1481,
"mean_token_accuracy": 0.7602039396762847,
"num_tokens": 791346787.0,
"step": 32200
},
{
"entropy": 1.5563798201084138,
"epoch": 1.5278567367822626,
"grad_norm": 1.1505266427993774,
"learning_rate": 1.6040461666039808e-05,
"loss": 1.1499,
"mean_token_accuracy": 0.7575086969137191,
"num_tokens": 792593563.0,
"step": 32250
},
{
"entropy": 1.6074644064903258,
"epoch": 1.5302255069168087,
"grad_norm": 1.185583472251892,
"learning_rate": 1.5889030413779622e-05,
"loss": 1.156,
"mean_token_accuracy": 0.7562423485517502,
"num_tokens": 793829790.0,
"step": 32300
},
{
"entropy": 1.5752575540542602,
"epoch": 1.532594277051355,
"grad_norm": 1.4243769645690918,
"learning_rate": 1.5738182207979435e-05,
"loss": 1.1459,
"mean_token_accuracy": 0.7583240360021591,
"num_tokens": 795055789.0,
"step": 32350
},
{
"entropy": 1.5923644971847535,
"epoch": 1.534963047185901,
"grad_norm": 1.6619261503219604,
"learning_rate": 1.558791962703304e-05,
"loss": 1.154,
"mean_token_accuracy": 0.7567561262845993,
"num_tokens": 796275636.0,
"step": 32400
},
{
"entropy": 1.583539651632309,
"epoch": 1.5373318173204473,
"grad_norm": 1.5260084867477417,
"learning_rate": 1.5438245239324372e-05,
"loss": 1.1293,
"mean_token_accuracy": 0.7631356823444366,
"num_tokens": 797503738.0,
"step": 32450
},
{
"entropy": 1.5718154168128968,
"epoch": 1.5397005874549934,
"grad_norm": 1.1916577816009521,
"learning_rate": 1.5289161603183565e-05,
"loss": 1.1556,
"mean_token_accuracy": 0.756939308643341,
"num_tokens": 798743606.0,
"step": 32500
},
{
"entropy": 1.5819120156764983,
"epoch": 1.5420693575895394,
"grad_norm": 1.1773018836975098,
"learning_rate": 1.5140671266843276e-05,
"loss": 1.1722,
"mean_token_accuracy": 0.7551066309213639,
"num_tokens": 799964473.0,
"step": 32550
},
{
"entropy": 1.575450291633606,
"epoch": 1.5444381277240855,
"grad_norm": 1.0022114515304565,
"learning_rate": 1.4992776768395073e-05,
"loss": 1.1449,
"mean_token_accuracy": 0.7597598391771316,
"num_tokens": 801188088.0,
"step": 32600
},
{
"entropy": 1.5459000968933105,
"epoch": 1.5468068978586318,
"grad_norm": 1.2957897186279297,
"learning_rate": 1.4845480635746129e-05,
"loss": 1.1227,
"mean_token_accuracy": 0.7632001984119415,
"num_tokens": 802438523.0,
"step": 32650
},
{
"entropy": 1.5870135259628295,
"epoch": 1.549175667993178,
"grad_norm": 1.3867087364196777,
"learning_rate": 1.469878538657593e-05,
"loss": 1.1395,
"mean_token_accuracy": 0.7591842120885849,
"num_tokens": 803649365.0,
"step": 32700
},
{
"entropy": 1.5655232286453247,
"epoch": 1.5515444381277241,
"grad_norm": 0.9858147501945496,
"learning_rate": 1.4552693528293287e-05,
"loss": 1.1343,
"mean_token_accuracy": 0.7619771939516068,
"num_tokens": 804874548.0,
"step": 32750
},
{
"entropy": 1.5606813442707062,
"epoch": 1.5539132082622702,
"grad_norm": 1.1506080627441406,
"learning_rate": 1.4407207557993468e-05,
"loss": 1.1358,
"mean_token_accuracy": 0.7607605350017548,
"num_tokens": 806110451.0,
"step": 32800
},
{
"entropy": 1.5905800759792328,
"epoch": 1.5562819783968163,
"grad_norm": 1.2425259351730347,
"learning_rate": 1.4262329962415521e-05,
"loss": 1.1416,
"mean_token_accuracy": 0.7600742274522782,
"num_tokens": 807342732.0,
"step": 32850
},
{
"entropy": 1.5680935847759248,
"epoch": 1.5586507485313625,
"grad_norm": 1.153823733329773,
"learning_rate": 1.4118063217899746e-05,
"loss": 1.1335,
"mean_token_accuracy": 0.7605480921268463,
"num_tokens": 808586619.0,
"step": 32900
},
{
"entropy": 1.5620489943027496,
"epoch": 1.5610195186659088,
"grad_norm": 1.050882339477539,
"learning_rate": 1.397440979034544e-05,
"loss": 1.1522,
"mean_token_accuracy": 0.756206591129303,
"num_tokens": 809832674.0,
"step": 32950
},
{
"entropy": 1.605532693862915,
"epoch": 1.5633882888004549,
"grad_norm": 1.3032130002975464,
"learning_rate": 1.383137213516862e-05,
"loss": 1.1758,
"mean_token_accuracy": 0.7512508201599121,
"num_tokens": 811045391.0,
"step": 33000
},
{
"epoch": 1.5633882888004549,
"eval_entropy": 1.0869283330279116,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7607752118838518,
"eval_num_tokens": 811045391.0,
"eval_runtime": 725.7757,
"eval_samples_per_second": 34.191,
"eval_steps_per_second": 4.274,
"step": 33000
},
{
"entropy": 1.592424702644348,
"epoch": 1.565757058935001,
"grad_norm": 1.4170438051223755,
"learning_rate": 1.36889526972602e-05,
"loss": 1.1393,
"mean_token_accuracy": 0.7614479339122773,
"num_tokens": 812284992.0,
"step": 33050
},
{
"entropy": 1.5892753875255585,
"epoch": 1.568125829069547,
"grad_norm": 1.1543865203857422,
"learning_rate": 1.3547153910944083e-05,
"loss": 1.1511,
"mean_token_accuracy": 0.7588121032714844,
"num_tokens": 813547664.0,
"step": 33100
},
{
"entropy": 1.584560890197754,
"epoch": 1.5704945992040933,
"grad_norm": 0.9274981021881104,
"learning_rate": 1.3405978199935615e-05,
"loss": 1.1374,
"mean_token_accuracy": 0.7598586791753769,
"num_tokens": 814802491.0,
"step": 33150
},
{
"entropy": 1.6207016718387603,
"epoch": 1.5728633693386393,
"grad_norm": 1.2314906120300293,
"learning_rate": 1.3265427977300137e-05,
"loss": 1.1615,
"mean_token_accuracy": 0.7563327193260193,
"num_tokens": 816017932.0,
"step": 33200
},
{
"entropy": 1.586843957901001,
"epoch": 1.5752321394731856,
"grad_norm": 1.1286264657974243,
"learning_rate": 1.3125505645411745e-05,
"loss": 1.1426,
"mean_token_accuracy": 0.7603234398365021,
"num_tokens": 817264849.0,
"step": 33250
},
{
"entropy": 1.591685062646866,
"epoch": 1.5776009096077317,
"grad_norm": 1.0517480373382568,
"learning_rate": 1.2986213595912234e-05,
"loss": 1.1207,
"mean_token_accuracy": 0.7635256379842759,
"num_tokens": 818484337.0,
"step": 33300
},
{
"entropy": 1.6172384572029115,
"epoch": 1.5799696797422778,
"grad_norm": 1.360130786895752,
"learning_rate": 1.2847554209670182e-05,
"loss": 1.161,
"mean_token_accuracy": 0.7545110338926315,
"num_tokens": 819712425.0,
"step": 33350
},
{
"entropy": 1.5726615214347839,
"epoch": 1.5823384498768238,
"grad_norm": 0.9944781064987183,
"learning_rate": 1.2709529856740331e-05,
"loss": 1.1207,
"mean_token_accuracy": 0.7627239066362381,
"num_tokens": 820956336.0,
"step": 33400
},
{
"entropy": 1.58195317029953,
"epoch": 1.58470722001137,
"grad_norm": 1.3657851219177246,
"learning_rate": 1.2572142896322991e-05,
"loss": 1.1318,
"mean_token_accuracy": 0.7620331639051438,
"num_tokens": 822181235.0,
"step": 33450
},
{
"entropy": 1.5519816017150878,
"epoch": 1.5870759901459164,
"grad_norm": 1.1192278861999512,
"learning_rate": 1.2435395676723765e-05,
"loss": 1.1255,
"mean_token_accuracy": 0.763149077296257,
"num_tokens": 823394039.0,
"step": 33500
},
{
"entropy": 1.5487982165813445,
"epoch": 1.5894447602804624,
"grad_norm": 0.9175589084625244,
"learning_rate": 1.229929053531339e-05,
"loss": 1.1266,
"mean_token_accuracy": 0.762880043387413,
"num_tokens": 824629826.0,
"step": 33550
},
{
"entropy": 1.5455662417411804,
"epoch": 1.5918135304150085,
"grad_norm": 1.543500542640686,
"learning_rate": 1.2163829798487796e-05,
"loss": 1.1179,
"mean_token_accuracy": 0.7645809006690979,
"num_tokens": 825885699.0,
"step": 33600
},
{
"entropy": 1.545372655391693,
"epoch": 1.5941823005495546,
"grad_norm": 1.042571783065796,
"learning_rate": 1.2029015781628333e-05,
"loss": 1.1253,
"mean_token_accuracy": 0.7624981206655502,
"num_tokens": 827144142.0,
"step": 33650
},
{
"entropy": 1.5542615973949432,
"epoch": 1.5965510706841008,
"grad_norm": 1.2017757892608643,
"learning_rate": 1.1894850789062234e-05,
"loss": 1.1095,
"mean_token_accuracy": 0.7662106871604919,
"num_tokens": 828358780.0,
"step": 33700
},
{
"entropy": 1.5387887310981752,
"epoch": 1.598919840818647,
"grad_norm": 1.2897499799728394,
"learning_rate": 1.1761337114023157e-05,
"loss": 1.1393,
"mean_token_accuracy": 0.7597699278593063,
"num_tokens": 829617688.0,
"step": 33750
},
{
"entropy": 1.57563338637352,
"epoch": 1.6012886109531932,
"grad_norm": 1.7554948329925537,
"learning_rate": 1.1628477038612035e-05,
"loss": 1.1186,
"mean_token_accuracy": 0.7649687886238098,
"num_tokens": 830817095.0,
"step": 33800
},
{
"entropy": 1.5856982839107514,
"epoch": 1.6036573810877393,
"grad_norm": 0.9697763919830322,
"learning_rate": 1.1496272833758042e-05,
"loss": 1.1803,
"mean_token_accuracy": 0.7541155385971069,
"num_tokens": 832068396.0,
"step": 33850
},
{
"entropy": 1.582381078004837,
"epoch": 1.6060261512222853,
"grad_norm": 1.2476204633712769,
"learning_rate": 1.1364726759179856e-05,
"loss": 1.1366,
"mean_token_accuracy": 0.7601368808746338,
"num_tokens": 833258832.0,
"step": 33900
},
{
"entropy": 1.5806366765499116,
"epoch": 1.6083949213568314,
"grad_norm": 1.464986801147461,
"learning_rate": 1.12338410633469e-05,
"loss": 1.1401,
"mean_token_accuracy": 0.7602562707662582,
"num_tokens": 834452824.0,
"step": 33950
},
{
"entropy": 1.5599962186813354,
"epoch": 1.6107636914913777,
"grad_norm": 1.1796619892120361,
"learning_rate": 1.1103617983441017e-05,
"loss": 1.1369,
"mean_token_accuracy": 0.7609011316299439,
"num_tokens": 835712240.0,
"step": 34000
},
{
"epoch": 1.6107636914913777,
"eval_entropy": 1.0767018733141425,
"eval_loss": null,
"eval_mean_token_accuracy": 0.760854017967028,
"eval_num_tokens": 835712240.0,
"eval_runtime": 727.3112,
"eval_samples_per_second": 34.119,
"eval_steps_per_second": 4.265,
"step": 34000
},
{
"entropy": 1.579708174467087,
"epoch": 1.613132461625924,
"grad_norm": 1.2132450342178345,
"learning_rate": 1.0974059745318177e-05,
"loss": 1.1412,
"mean_token_accuracy": 0.7581533867120743,
"num_tokens": 836951903.0,
"step": 34050
},
{
"entropy": 1.5457484829425812,
"epoch": 1.61550123176047,
"grad_norm": 0.9479733109474182,
"learning_rate": 1.0845168563470492e-05,
"loss": 1.1319,
"mean_token_accuracy": 0.7595140463113785,
"num_tokens": 838214157.0,
"step": 34100
},
{
"entropy": 1.5466408836841583,
"epoch": 1.617870001895016,
"grad_norm": 1.7005789279937744,
"learning_rate": 1.071694664098828e-05,
"loss": 1.1175,
"mean_token_accuracy": 0.7649167954921723,
"num_tokens": 839471563.0,
"step": 34150
},
{
"entropy": 1.5700657200813293,
"epoch": 1.6202387720295621,
"grad_norm": 1.059528112411499,
"learning_rate": 1.0589396169522465e-05,
"loss": 1.1615,
"mean_token_accuracy": 0.7565118598937989,
"num_tokens": 840715891.0,
"step": 34200
},
{
"entropy": 1.5375142538547515,
"epoch": 1.6226075421641084,
"grad_norm": 1.559818148612976,
"learning_rate": 1.0462519329247094e-05,
"loss": 1.1356,
"mean_token_accuracy": 0.7605293154716491,
"num_tokens": 841942053.0,
"step": 34250
},
{
"entropy": 1.587239592075348,
"epoch": 1.6249763122986547,
"grad_norm": 1.1134872436523438,
"learning_rate": 1.03363182888221e-05,
"loss": 1.1564,
"mean_token_accuracy": 0.7570129364728928,
"num_tokens": 843161472.0,
"step": 34300
},
{
"entropy": 1.5743994867801667,
"epoch": 1.6273450824332008,
"grad_norm": 1.3683475255966187,
"learning_rate": 1.021079520535619e-05,
"loss": 1.159,
"mean_token_accuracy": 0.7565000504255295,
"num_tokens": 844415080.0,
"step": 34350
},
{
"entropy": 1.5728708267211915,
"epoch": 1.6297138525677468,
"grad_norm": 1.1361534595489502,
"learning_rate": 1.0085952224369998e-05,
"loss": 1.1464,
"mean_token_accuracy": 0.7604904717206955,
"num_tokens": 845652767.0,
"step": 34400
},
{
"entropy": 1.555993628501892,
"epoch": 1.6320826227022929,
"grad_norm": 1.2197624444961548,
"learning_rate": 9.961791479759453e-06,
"loss": 1.1094,
"mean_token_accuracy": 0.7654684072732926,
"num_tokens": 846861078.0,
"step": 34450
},
{
"entropy": 1.5712345719337464,
"epoch": 1.6344513928368392,
"grad_norm": 1.2012556791305542,
"learning_rate": 9.83831509375922e-06,
"loss": 1.1318,
"mean_token_accuracy": 0.7618235784769058,
"num_tokens": 848079801.0,
"step": 34500
},
{
"entropy": 1.5478745126724243,
"epoch": 1.6368201629713852,
"grad_norm": 1.1320964097976685,
"learning_rate": 9.715525176906482e-06,
"loss": 1.1156,
"mean_token_accuracy": 0.763830555677414,
"num_tokens": 849324814.0,
"step": 34550
},
{
"entropy": 1.5780341172218322,
"epoch": 1.6391889331059315,
"grad_norm": 1.025578498840332,
"learning_rate": 9.59342382800486e-06,
"loss": 1.1426,
"mean_token_accuracy": 0.7610353720188141,
"num_tokens": 850545817.0,
"step": 34600
},
{
"entropy": 1.5829120945930482,
"epoch": 1.6415577032404776,
"grad_norm": 1.0543193817138672,
"learning_rate": 9.472013134088525e-06,
"loss": 1.1659,
"mean_token_accuracy": 0.7564774835109711,
"num_tokens": 851771892.0,
"step": 34650
},
{
"entropy": 1.5125657570362092,
"epoch": 1.6439264733750236,
"grad_norm": 1.047337532043457,
"learning_rate": 9.351295170386536e-06,
"loss": 1.1436,
"mean_token_accuracy": 0.7618407100439072,
"num_tokens": 853004916.0,
"step": 34700
},
{
"entropy": 1.5835665547847748,
"epoch": 1.6462952435095697,
"grad_norm": 1.4141535758972168,
"learning_rate": 9.231272000287355e-06,
"loss": 1.1394,
"mean_token_accuracy": 0.7607215863466262,
"num_tokens": 854213875.0,
"step": 34750
},
{
"entropy": 1.5290465533733368,
"epoch": 1.648664013644116,
"grad_norm": 1.578470230102539,
"learning_rate": 9.111945675303619e-06,
"loss": 1.0863,
"mean_token_accuracy": 0.7710424029827118,
"num_tokens": 855445223.0,
"step": 34800
},
{
"entropy": 1.549327657222748,
"epoch": 1.6510327837786622,
"grad_norm": 1.0670899152755737,
"learning_rate": 8.993318235037001e-06,
"loss": 1.1251,
"mean_token_accuracy": 0.7622793889045716,
"num_tokens": 856681494.0,
"step": 34850
},
{
"entropy": 1.5306969308853149,
"epoch": 1.6534015539132083,
"grad_norm": 1.3352553844451904,
"learning_rate": 8.875391707143432e-06,
"loss": 1.1102,
"mean_token_accuracy": 0.7646553814411163,
"num_tokens": 857925423.0,
"step": 34900
},
{
"entropy": 1.534108463525772,
"epoch": 1.6557703240477544,
"grad_norm": 1.3138459920883179,
"learning_rate": 8.75816810729837e-06,
"loss": 1.1059,
"mean_token_accuracy": 0.7676987838745117,
"num_tokens": 859172535.0,
"step": 34950
},
{
"entropy": 1.561165556907654,
"epoch": 1.6581390941823004,
"grad_norm": 1.1682779788970947,
"learning_rate": 8.641649439162396e-06,
"loss": 1.1193,
"mean_token_accuracy": 0.7643628352880478,
"num_tokens": 860388305.0,
"step": 35000
},
{
"epoch": 1.6581390941823004,
"eval_entropy": 1.0728673784675942,
"eval_loss": null,
"eval_mean_token_accuracy": 0.761188728099558,
"eval_num_tokens": 860388305.0,
"eval_runtime": 727.1748,
"eval_samples_per_second": 34.125,
"eval_steps_per_second": 4.266,
"step": 35000
},
{
"entropy": 1.5597219800949096,
"epoch": 1.6605078643168467,
"grad_norm": 1.1003355979919434,
"learning_rate": 8.525837694346932e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.7606293076276779,
"num_tokens": 861636981.0,
"step": 35050
},
{
"entropy": 1.5564636278152466,
"epoch": 1.662876634451393,
"grad_norm": 1.22114098072052,
"learning_rate": 8.410734852380231e-06,
"loss": 1.1478,
"mean_token_accuracy": 0.7590148377418519,
"num_tokens": 862893390.0,
"step": 35100
},
{
"entropy": 1.5701312077045442,
"epoch": 1.665245404585939,
"grad_norm": 1.4223356246948242,
"learning_rate": 8.296342880673513e-06,
"loss": 1.1266,
"mean_token_accuracy": 0.763382934331894,
"num_tokens": 864117153.0,
"step": 35150
},
{
"entropy": 1.587481471300125,
"epoch": 1.6676141747204851,
"grad_norm": 1.5688437223434448,
"learning_rate": 8.182663734487372e-06,
"loss": 1.1656,
"mean_token_accuracy": 0.7555125683546067,
"num_tokens": 865348622.0,
"step": 35200
},
{
"entropy": 1.5680071783065797,
"epoch": 1.6699829448550312,
"grad_norm": 1.2558252811431885,
"learning_rate": 8.069699356898309e-06,
"loss": 1.151,
"mean_token_accuracy": 0.7581069612503052,
"num_tokens": 866584445.0,
"step": 35250
},
{
"entropy": 1.5600192046165466,
"epoch": 1.6723517149895775,
"grad_norm": 1.3348325490951538,
"learning_rate": 7.95745167876556e-06,
"loss": 1.1564,
"mean_token_accuracy": 0.7571524727344513,
"num_tokens": 867822403.0,
"step": 35300
},
{
"entropy": 1.562008023262024,
"epoch": 1.6747204851241235,
"grad_norm": 1.3110319375991821,
"learning_rate": 7.84592261869806e-06,
"loss": 1.1462,
"mean_token_accuracy": 0.7590863239765168,
"num_tokens": 869072939.0,
"step": 35350
},
{
"entropy": 1.5816670620441438,
"epoch": 1.6770892552586698,
"grad_norm": 1.2804386615753174,
"learning_rate": 7.735114083021683e-06,
"loss": 1.1353,
"mean_token_accuracy": 0.7603730088472367,
"num_tokens": 870288358.0,
"step": 35400
},
{
"entropy": 1.546754379272461,
"epoch": 1.6794580253932159,
"grad_norm": 1.1443445682525635,
"learning_rate": 7.625027965746634e-06,
"loss": 1.1473,
"mean_token_accuracy": 0.7597916102409363,
"num_tokens": 871537045.0,
"step": 35450
},
{
"entropy": 1.5389190435409545,
"epoch": 1.681826795527762,
"grad_norm": 1.2796293497085571,
"learning_rate": 7.515666148535067e-06,
"loss": 1.1159,
"mean_token_accuracy": 0.7650646787881851,
"num_tokens": 872759023.0,
"step": 35500
},
{
"entropy": 1.5631573498249054,
"epoch": 1.684195565662308,
"grad_norm": 1.503520131111145,
"learning_rate": 7.407030500668971e-06,
"loss": 1.1688,
"mean_token_accuracy": 0.7553135341405869,
"num_tokens": 873995801.0,
"step": 35550
},
{
"entropy": 1.5810019493103027,
"epoch": 1.6865643357968543,
"grad_norm": 1.126876950263977,
"learning_rate": 7.299122879018155e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.7582389563322067,
"num_tokens": 875225780.0,
"step": 35600
},
{
"entropy": 1.5617643618583679,
"epoch": 1.6889331059314006,
"grad_norm": 1.041165828704834,
"learning_rate": 7.191945128008548e-06,
"loss": 1.1599,
"mean_token_accuracy": 0.7565414899587631,
"num_tokens": 876441973.0,
"step": 35650
},
{
"entropy": 1.5521779787540435,
"epoch": 1.6913018760659466,
"grad_norm": 1.024032711982727,
"learning_rate": 7.085499079590674e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.7614186578989028,
"num_tokens": 877691572.0,
"step": 35700
},
{
"entropy": 1.5647426414489747,
"epoch": 1.6936706462004927,
"grad_norm": 1.4068409204483032,
"learning_rate": 6.979786553208306e-06,
"loss": 1.1434,
"mean_token_accuracy": 0.7604501461982727,
"num_tokens": 878910690.0,
"step": 35750
},
{
"entropy": 1.5498666989803314,
"epoch": 1.6960394163350387,
"grad_norm": 1.2371301651000977,
"learning_rate": 6.8748093557674084e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.7612047231197357,
"num_tokens": 880160478.0,
"step": 35800
},
{
"entropy": 1.5569170558452605,
"epoch": 1.698408186469585,
"grad_norm": 1.2279973030090332,
"learning_rate": 6.770569281605244e-06,
"loss": 1.1249,
"mean_token_accuracy": 0.7620308262109756,
"num_tokens": 881367218.0,
"step": 35850
},
{
"entropy": 1.5841618192195892,
"epoch": 1.700776956604131,
"grad_norm": 1.173069953918457,
"learning_rate": 6.667068112459662e-06,
"loss": 1.1585,
"mean_token_accuracy": 0.7556693691015244,
"num_tokens": 882584025.0,
"step": 35900
},
{
"entropy": 1.5525937521457671,
"epoch": 1.7031457267386774,
"grad_norm": 1.2860488891601562,
"learning_rate": 6.56430761743872e-06,
"loss": 1.1681,
"mean_token_accuracy": 0.7552351075410842,
"num_tokens": 883859871.0,
"step": 35950
},
{
"entropy": 1.5703179001808167,
"epoch": 1.7055144968732234,
"grad_norm": 1.2826309204101562,
"learning_rate": 6.462289552990353e-06,
"loss": 1.1341,
"mean_token_accuracy": 0.7611742705106735,
"num_tokens": 885071859.0,
"step": 36000
},
{
"epoch": 1.7055144968732234,
"eval_entropy": 1.0768880580103375,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7609103475268005,
"eval_num_tokens": 885071859.0,
"eval_runtime": 728.1701,
"eval_samples_per_second": 34.079,
"eval_steps_per_second": 4.26,
"step": 36000
},
{
"entropy": 1.5391144120693208,
"epoch": 1.7078832670077695,
"grad_norm": 1.2603217363357544,
"learning_rate": 6.361015662872433e-06,
"loss": 1.1158,
"mean_token_accuracy": 0.7669140672683716,
"num_tokens": 886316413.0,
"step": 36050
},
{
"entropy": 1.5330300676822661,
"epoch": 1.7102520371423156,
"grad_norm": 1.2248510122299194,
"learning_rate": 6.260487678122911e-06,
"loss": 1.0644,
"mean_token_accuracy": 0.7753306698799133,
"num_tokens": 887565349.0,
"step": 36100
},
{
"entropy": 1.5597666120529174,
"epoch": 1.7126208072768618,
"grad_norm": 1.0783295631408691,
"learning_rate": 6.160707317030256e-06,
"loss": 1.109,
"mean_token_accuracy": 0.7654628306627274,
"num_tokens": 888770171.0,
"step": 36150
},
{
"entropy": 1.5886062026023864,
"epoch": 1.7149895774114081,
"grad_norm": 1.518917202949524,
"learning_rate": 6.0616762851040675e-06,
"loss": 1.1602,
"mean_token_accuracy": 0.7575176376104354,
"num_tokens": 889998256.0,
"step": 36200
},
{
"entropy": 1.5691123294830323,
"epoch": 1.7173583475459542,
"grad_norm": 1.1943705081939697,
"learning_rate": 5.963396275045951e-06,
"loss": 1.1476,
"mean_token_accuracy": 0.758755573630333,
"num_tokens": 891199335.0,
"step": 36250
},
{
"entropy": 1.5685584223270417,
"epoch": 1.7197271176805002,
"grad_norm": 1.2932955026626587,
"learning_rate": 5.865868966720556e-06,
"loss": 1.1354,
"mean_token_accuracy": 0.7614442694187165,
"num_tokens": 892434722.0,
"step": 36300
},
{
"entropy": 1.5710162222385406,
"epoch": 1.7220958878150463,
"grad_norm": 1.200039267539978,
"learning_rate": 5.769096027126869e-06,
"loss": 1.1766,
"mean_token_accuracy": 0.7540206718444824,
"num_tokens": 893676597.0,
"step": 36350
},
{
"entropy": 1.562828722000122,
"epoch": 1.7244646579495926,
"grad_norm": 1.2064563035964966,
"learning_rate": 5.673079110369722e-06,
"loss": 1.121,
"mean_token_accuracy": 0.7634602183103562,
"num_tokens": 894910050.0,
"step": 36400
},
{
"entropy": 1.5540617489814759,
"epoch": 1.7268334280841389,
"grad_norm": 1.4902048110961914,
"learning_rate": 5.577819857631539e-06,
"loss": 1.1201,
"mean_token_accuracy": 0.7639645302295685,
"num_tokens": 896142711.0,
"step": 36450
},
{
"entropy": 1.554260642528534,
"epoch": 1.729202198218685,
"grad_norm": 1.2376636266708374,
"learning_rate": 5.483319897144257e-06,
"loss": 1.141,
"mean_token_accuracy": 0.7609711056947708,
"num_tokens": 897387745.0,
"step": 36500
},
{
"entropy": 1.5512582790851592,
"epoch": 1.731570968353231,
"grad_norm": 1.0070257186889648,
"learning_rate": 5.389580844161491e-06,
"loss": 1.151,
"mean_token_accuracy": 0.7582071113586426,
"num_tokens": 898612694.0,
"step": 36550
},
{
"entropy": 1.5260178673267364,
"epoch": 1.733939738487777,
"grad_norm": 1.035585880279541,
"learning_rate": 5.296604300930968e-06,
"loss": 1.1097,
"mean_token_accuracy": 0.7681008791923523,
"num_tokens": 899864115.0,
"step": 36600
},
{
"entropy": 1.557324800491333,
"epoch": 1.7363085086223233,
"grad_norm": 1.2301568984985352,
"learning_rate": 5.204391856667101e-06,
"loss": 1.1191,
"mean_token_accuracy": 0.7642790126800537,
"num_tokens": 901100268.0,
"step": 36650
},
{
"entropy": 1.5380194628238677,
"epoch": 1.7386772787568694,
"grad_norm": 1.246462345123291,
"learning_rate": 5.112945087523824e-06,
"loss": 1.1108,
"mean_token_accuracy": 0.7644780373573303,
"num_tokens": 902310249.0,
"step": 36700
},
{
"entropy": 1.5637565624713898,
"epoch": 1.7410460488914157,
"grad_norm": 1.1634399890899658,
"learning_rate": 5.022265556567668e-06,
"loss": 1.1319,
"mean_token_accuracy": 0.7625255084037781,
"num_tokens": 903523545.0,
"step": 36750
},
{
"entropy": 1.5375991368293762,
"epoch": 1.7434148190259617,
"grad_norm": 1.3280473947525024,
"learning_rate": 4.9323548137510555e-06,
"loss": 1.1053,
"mean_token_accuracy": 0.7662223023176193,
"num_tokens": 904774364.0,
"step": 36800
},
{
"entropy": 1.5782112526893615,
"epoch": 1.7457835891605078,
"grad_norm": 1.3013827800750732,
"learning_rate": 4.843214395885776e-06,
"loss": 1.1594,
"mean_token_accuracy": 0.758129763007164,
"num_tokens": 906007167.0,
"step": 36850
},
{
"entropy": 1.541445196866989,
"epoch": 1.7481523592950539,
"grad_norm": 0.9861883521080017,
"learning_rate": 4.754845826616727e-06,
"loss": 1.1442,
"mean_token_accuracy": 0.7601429998874665,
"num_tokens": 907201311.0,
"step": 36900
},
{
"entropy": 1.5591549813747405,
"epoch": 1.7505211294296001,
"grad_norm": 1.1912263631820679,
"learning_rate": 4.667250616395885e-06,
"loss": 1.1229,
"mean_token_accuracy": 0.7642272913455963,
"num_tokens": 908429689.0,
"step": 36950
},
{
"entropy": 1.53731192111969,
"epoch": 1.7528898995641464,
"grad_norm": 1.2835556268692017,
"learning_rate": 4.580430262456503e-06,
"loss": 1.0855,
"mean_token_accuracy": 0.770037140250206,
"num_tokens": 909656463.0,
"step": 37000
},
{
"epoch": 1.7528898995641464,
"eval_entropy": 1.0678023306651703,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7609675386542278,
"eval_num_tokens": 909656463.0,
"eval_runtime": 744.4803,
"eval_samples_per_second": 33.332,
"eval_steps_per_second": 4.167,
"step": 37000
},
{
"entropy": 1.555112097263336,
"epoch": 1.7552586696986925,
"grad_norm": 1.195916771888733,
"learning_rate": 4.4943862487874575e-06,
"loss": 1.1449,
"mean_token_accuracy": 0.7592443466186524,
"num_tokens": 910867530.0,
"step": 37050
},
{
"entropy": 1.5418341505527495,
"epoch": 1.7576274398332385,
"grad_norm": 1.139011025428772,
"learning_rate": 4.409120046107945e-06,
"loss": 1.1017,
"mean_token_accuracy": 0.7676555049419403,
"num_tokens": 912088709.0,
"step": 37100
},
{
"entropy": 1.5542387223243714,
"epoch": 1.7599962099677846,
"grad_norm": 1.1180921792984009,
"learning_rate": 4.324633111842308e-06,
"loss": 1.1473,
"mean_token_accuracy": 0.759157150387764,
"num_tokens": 913332056.0,
"step": 37150
},
{
"entropy": 1.5596051335334777,
"epoch": 1.762364980102331,
"grad_norm": 1.006624460220337,
"learning_rate": 4.240926890095148e-06,
"loss": 1.1482,
"mean_token_accuracy": 0.7598807489871979,
"num_tokens": 914537591.0,
"step": 37200
},
{
"entropy": 1.5528207927942277,
"epoch": 1.7647337502368772,
"grad_norm": 0.971926748752594,
"learning_rate": 4.158002811626621e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.7576300024986267,
"num_tokens": 915743333.0,
"step": 37250
},
{
"entropy": 1.5758486306667328,
"epoch": 1.7671025203714232,
"grad_norm": 1.1977986097335815,
"learning_rate": 4.075862293827986e-06,
"loss": 1.1495,
"mean_token_accuracy": 0.7577051311731339,
"num_tokens": 916959683.0,
"step": 37300
},
{
"entropy": 1.544589899778366,
"epoch": 1.7694712905059693,
"grad_norm": 1.3282675743103027,
"learning_rate": 3.994506740697407e-06,
"loss": 1.1269,
"mean_token_accuracy": 0.7617994117736816,
"num_tokens": 918211320.0,
"step": 37350
},
{
"entropy": 1.5542384481430054,
"epoch": 1.7718400606405154,
"grad_norm": 1.4619874954223633,
"learning_rate": 3.9139375428159095e-06,
"loss": 1.1173,
"mean_token_accuracy": 0.7629494529962539,
"num_tokens": 919446181.0,
"step": 37400
},
{
"entropy": 1.5291326987743377,
"epoch": 1.7742088307750616,
"grad_norm": 1.3056997060775757,
"learning_rate": 3.834156077323636e-06,
"loss": 1.0887,
"mean_token_accuracy": 0.7687935763597489,
"num_tokens": 920685182.0,
"step": 37450
},
{
"entropy": 1.5420164275169372,
"epoch": 1.7765776009096077,
"grad_norm": 1.2205777168273926,
"learning_rate": 3.7551637078963085e-06,
"loss": 1.1142,
"mean_token_accuracy": 0.7653926169872284,
"num_tokens": 921939020.0,
"step": 37500
},
{
"entropy": 1.567058709859848,
"epoch": 1.778946371044154,
"grad_norm": 0.9547618627548218,
"learning_rate": 3.6769617847219164e-06,
"loss": 1.1223,
"mean_token_accuracy": 0.7639624851942063,
"num_tokens": 923177265.0,
"step": 37550
},
{
"entropy": 1.5774991846084594,
"epoch": 1.7813151411787,
"grad_norm": 1.2139365673065186,
"learning_rate": 3.5995516444776276e-06,
"loss": 1.1457,
"mean_token_accuracy": 0.7596712547540665,
"num_tokens": 924378635.0,
"step": 37600
},
{
"entropy": 1.5741923189163207,
"epoch": 1.783683911313246,
"grad_norm": 1.3455299139022827,
"learning_rate": 3.5229346103069547e-06,
"loss": 1.1265,
"mean_token_accuracy": 0.7622561120986938,
"num_tokens": 925558387.0,
"step": 37650
},
{
"entropy": 1.5316821897029878,
"epoch": 1.7860526814477922,
"grad_norm": 1.833621859550476,
"learning_rate": 3.4471119917971473e-06,
"loss": 1.1108,
"mean_token_accuracy": 0.7637511855363845,
"num_tokens": 926797544.0,
"step": 37700
},
{
"entropy": 1.5604101026058197,
"epoch": 1.7884214515823385,
"grad_norm": 1.2970396280288696,
"learning_rate": 3.3720850849567944e-06,
"loss": 1.112,
"mean_token_accuracy": 0.7661514669656754,
"num_tokens": 928034501.0,
"step": 37750
},
{
"entropy": 1.543112144470215,
"epoch": 1.7907902217168847,
"grad_norm": 0.9984686970710754,
"learning_rate": 3.297855172193659e-06,
"loss": 1.1264,
"mean_token_accuracy": 0.7631747448444366,
"num_tokens": 929281453.0,
"step": 37800
},
{
"entropy": 1.5535426819324494,
"epoch": 1.7931589918514308,
"grad_norm": 1.123579740524292,
"learning_rate": 3.22442352229278e-06,
"loss": 1.1449,
"mean_token_accuracy": 0.760178684592247,
"num_tokens": 930526986.0,
"step": 37850
},
{
"entropy": 1.533100154399872,
"epoch": 1.7955277619859769,
"grad_norm": 1.1597360372543335,
"learning_rate": 3.1517913903947707e-06,
"loss": 1.1216,
"mean_token_accuracy": 0.7638274627923965,
"num_tokens": 931736763.0,
"step": 37900
},
{
"entropy": 1.5554070842266083,
"epoch": 1.797896532120523,
"grad_norm": 1.2038190364837646,
"learning_rate": 3.0799600179743927e-06,
"loss": 1.1308,
"mean_token_accuracy": 0.7614258807897568,
"num_tokens": 932923483.0,
"step": 37950
},
{
"entropy": 1.540804421901703,
"epoch": 1.8002653022550692,
"grad_norm": 1.0928473472595215,
"learning_rate": 3.00893063281929e-06,
"loss": 1.1338,
"mean_token_accuracy": 0.7619293278455734,
"num_tokens": 934170514.0,
"step": 38000
},
{
"epoch": 1.8002653022550692,
"eval_entropy": 1.0719800475410766,
"eval_loss": null,
"eval_mean_token_accuracy": 0.7611359905939115,
"eval_num_tokens": 934170514.0,
"eval_runtime": 723.3133,
"eval_samples_per_second": 34.307,
"eval_steps_per_second": 4.289,
"step": 38000
},
{
"entropy": 1.5533955585956574,
"epoch": 1.8026340723896153,
"grad_norm": 1.0474472045898438,
"learning_rate": 2.9387044490090385e-06,
"loss": 1.1715,
"mean_token_accuracy": 0.7546948331594467,
"num_tokens": 935423080.0,
"step": 38050
},
{
"entropy": 1.5497667694091797,
"epoch": 1.8050028425241615,
"grad_norm": 1.1220715045928955,
"learning_rate": 2.869282666894402e-06,
"loss": 1.1327,
"mean_token_accuracy": 0.7612757116556168,
"num_tokens": 936650729.0,
"step": 38100
},
{
"entropy": 1.5674825513362884,
"epoch": 1.8073716126587076,
"grad_norm": 1.4526468515396118,
"learning_rate": 2.8006664730767683e-06,
"loss": 1.1427,
"mean_token_accuracy": 0.7606144285202027,
"num_tokens": 937892138.0,
"step": 38150
},
{
"entropy": 1.5554044562578202,
"epoch": 1.8097403827932537,
"grad_norm": 1.2976021766662598,
"learning_rate": 2.7328570403879205e-06,
"loss": 1.1397,
"mean_token_accuracy": 0.7616138017177582,
"num_tokens": 939126149.0,
"step": 38200
},
{
"entropy": 1.568613636493683,
"epoch": 1.8121091529277997,
"grad_norm": 1.0613607168197632,
"learning_rate": 2.665855527869948e-06,
"loss": 1.1463,
"mean_token_accuracy": 0.759678093791008,
"num_tokens": 940346079.0,
"step": 38250
},
{
"entropy": 1.5337522840499878,
"epoch": 1.814477923062346,
"grad_norm": 1.267045259475708,
"learning_rate": 2.59966308075546e-06,
"loss": 1.1493,
"mean_token_accuracy": 0.7598909741640091,
"num_tokens": 941597120.0,
"step": 38300
},
{
"entropy": 1.5623528015613557,
"epoch": 1.8168466931968923,
"grad_norm": 1.3286635875701904,
"learning_rate": 2.5342808304479993e-06,
"loss": 1.1713,
"mean_token_accuracy": 0.754482525587082,
"num_tokens": 942822640.0,
"step": 38350
},
{
"entropy": 1.5387673115730285,
"epoch": 1.8192154633314384,
"grad_norm": 1.1870768070220947,
"learning_rate": 2.46970989450272e-06,
"loss": 1.1121,
"mean_token_accuracy": 0.7645811969041825,
"num_tokens": 944054652.0,
"step": 38400
},
{
"entropy": 1.5411259424686432,
"epoch": 1.8215842334659844,
"grad_norm": 1.487240195274353,
"learning_rate": 2.405951376607257e-06,
"loss": 1.0865,
"mean_token_accuracy": 0.7703835678100586,
"num_tokens": 945284822.0,
"step": 38450
},
{
"entropy": 1.586391316652298,
"epoch": 1.8239530036005305,
"grad_norm": 1.198615312576294,
"learning_rate": 2.3430063665628943e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7574268835783005,
"num_tokens": 946506870.0,
"step": 38500
},
{
"entropy": 1.5495548892021178,
"epoch": 1.8263217737350768,
"grad_norm": 1.2452329397201538,
"learning_rate": 2.280875940265903e-06,
"loss": 1.1172,
"mean_token_accuracy": 0.7652907830476761,
"num_tokens": 947752324.0,
"step": 38550
},
{
"entropy": 1.5577371573448182,
"epoch": 1.828690543869623,
"grad_norm": 1.1785380840301514,
"learning_rate": 2.2195611596891872e-06,
"loss": 1.113,
"mean_token_accuracy": 0.7650933820009231,
"num_tokens": 948980177.0,
"step": 38600
},
{
"entropy": 1.5553138053417206,
"epoch": 1.831059314004169,
"grad_norm": 1.3556625843048096,
"learning_rate": 2.159063072864087e-06,
"loss": 1.1187,
"mean_token_accuracy": 0.7657572621107102,
"num_tokens": 950168267.0,
"step": 38650
},
{
"entropy": 1.5441812425851822,
"epoch": 1.8334280841387152,
"grad_norm": 1.2191582918167114,
"learning_rate": 2.09938271386253e-06,
"loss": 1.132,
"mean_token_accuracy": 0.7622571617364884,
"num_tokens": 951380706.0,
"step": 38700
},
{
"entropy": 1.5715741848945617,
"epoch": 1.8357968542732612,
"grad_norm": 1.22894287109375,
"learning_rate": 2.040521102779286e-06,
"loss": 1.1266,
"mean_token_accuracy": 0.7630192279815674,
"num_tokens": 952592270.0,
"step": 38750
},
{
"entropy": 1.5562168991565704,
"epoch": 1.8381656244078075,
"grad_norm": 1.5664132833480835,
"learning_rate": 1.982479245714569e-06,
"loss": 1.1185,
"mean_token_accuracy": 0.765923129916191,
"num_tokens": 953815987.0,
"step": 38800
},
{
"entropy": 1.574343602657318,
"epoch": 1.8405343945423536,
"grad_norm": 1.1616158485412598,
"learning_rate": 1.925258134756858e-06,
"loss": 1.1508,
"mean_token_accuracy": 0.7590267878770828,
"num_tokens": 955053412.0,
"step": 38850
},
{
"entropy": 1.5616822016239167,
"epoch": 1.8429031646768999,
"grad_norm": 1.0465819835662842,
"learning_rate": 1.8688587479658793e-06,
"loss": 1.1668,
"mean_token_accuracy": 0.7562251263856887,
"num_tokens": 956294592.0,
"step": 38900
},
{
"entropy": 1.533266224861145,
"epoch": 1.845271934811446,
"grad_norm": 1.3192518949508667,
"learning_rate": 1.8132820493559521e-06,
"loss": 1.1001,
"mean_token_accuracy": 0.7673191577196121,
"num_tokens": 957522550.0,
"step": 38950
},
{
"entropy": 1.552718700170517,
"epoch": 1.847640704945992,
"grad_norm": 1.5773288011550903,
"learning_rate": 1.758528988879471e-06,
"loss": 1.1048,
"mean_token_accuracy": 0.76726045191288,
"num_tokens": 958757267.0,
"step": 39000
},
{
"epoch": 1.847640704945992,
"eval_entropy": 1.0707699135186055,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7611362978498987,
"eval_num_tokens": 958757267.0,
"eval_runtime": 728.2004,
"eval_samples_per_second": 34.077,
"eval_steps_per_second": 4.26,
"step": 39000
},
{
"entropy": 1.5483810114860534,
"epoch": 1.850009475080538,
"grad_norm": 1.4050700664520264,
"learning_rate": 1.704600502410686e-06,
"loss": 1.1036,
"mean_token_accuracy": 0.7662443941831589,
"num_tokens": 959971891.0,
"step": 39050
},
{
"entropy": 1.538731288909912,
"epoch": 1.8523782452150843,
"grad_norm": 1.1801916360855103,
"learning_rate": 1.6514975117296994e-06,
"loss": 1.101,
"mean_token_accuracy": 0.7680959284305573,
"num_tokens": 961210335.0,
"step": 39100
},
{
"entropy": 1.5468962013721466,
"epoch": 1.8547470153496306,
"grad_norm": 1.2052723169326782,
"learning_rate": 1.599220924506728e-06,
"loss": 1.1602,
"mean_token_accuracy": 0.7576704436540603,
"num_tokens": 962476786.0,
"step": 39150
},
{
"entropy": 1.5435814583301544,
"epoch": 1.8571157854841767,
"grad_norm": 1.056043267250061,
"learning_rate": 1.547771634286549e-06,
"loss": 1.1059,
"mean_token_accuracy": 0.7669939565658569,
"num_tokens": 963730472.0,
"step": 39200
},
{
"entropy": 1.557974625825882,
"epoch": 1.8594845556187227,
"grad_norm": 1.2635796070098877,
"learning_rate": 1.4971505204732673e-06,
"loss": 1.1212,
"mean_token_accuracy": 0.7648367810249329,
"num_tokens": 964964095.0,
"step": 39250
},
{
"entropy": 1.5005019557476045,
"epoch": 1.8618533257532688,
"grad_norm": 1.2261488437652588,
"learning_rate": 1.4473584483152614e-06,
"loss": 1.0945,
"mean_token_accuracy": 0.7694985699653626,
"num_tokens": 966211342.0,
"step": 39300
},
{
"entropy": 1.595631295442581,
"epoch": 1.864222095887815,
"grad_norm": 1.1187533140182495,
"learning_rate": 1.3983962688904062e-06,
"loss": 1.1547,
"mean_token_accuracy": 0.7579811322689056,
"num_tokens": 967415509.0,
"step": 39350
},
{
"entropy": 1.5132013654708862,
"epoch": 1.8665908660223614,
"grad_norm": 1.1046701669692993,
"learning_rate": 1.3502648190915124e-06,
"loss": 1.1251,
"mean_token_accuracy": 0.7633352410793305,
"num_tokens": 968645582.0,
"step": 39400
},
{
"entropy": 1.5683543026447295,
"epoch": 1.8689596361569074,
"grad_norm": 0.9930199384689331,
"learning_rate": 1.3029649216120376e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.7611577039957047,
"num_tokens": 969861208.0,
"step": 39450
},
{
"entropy": 1.5816809368133544,
"epoch": 1.8713284062914535,
"grad_norm": 1.0561200380325317,
"learning_rate": 1.2564973849320204e-06,
"loss": 1.14,
"mean_token_accuracy": 0.7613069009780884,
"num_tokens": 971090749.0,
"step": 39500
},
{
"entropy": 1.5379055535793305,
"epoch": 1.8736971764259995,
"grad_norm": 1.1951854228973389,
"learning_rate": 1.2108630033042412e-06,
"loss": 1.1165,
"mean_token_accuracy": 0.7650814574956893,
"num_tokens": 972346968.0,
"step": 39550
},
{
"entropy": 1.5865603411197662,
"epoch": 1.8760659465605458,
"grad_norm": 1.2471119165420532,
"learning_rate": 1.1660625567406768e-06,
"loss": 1.1328,
"mean_token_accuracy": 0.7614058357477188,
"num_tokens": 973571764.0,
"step": 39600
},
{
"entropy": 1.5396224319934846,
"epoch": 1.8784347166950919,
"grad_norm": 1.348791241645813,
"learning_rate": 1.1220968109991515e-06,
"loss": 1.0901,
"mean_token_accuracy": 0.769990593791008,
"num_tokens": 974799757.0,
"step": 39650
},
{
"entropy": 1.553302252292633,
"epoch": 1.8808034868296382,
"grad_norm": 1.2576488256454468,
"learning_rate": 1.0789665175702456e-06,
"loss": 1.1081,
"mean_token_accuracy": 0.7674774092435837,
"num_tokens": 976007380.0,
"step": 39700
},
{
"entropy": 1.5356010353565217,
"epoch": 1.8831722569641842,
"grad_norm": 1.2327417135238647,
"learning_rate": 1.036672413664458e-06,
"loss": 1.1085,
"mean_token_accuracy": 0.7655727046728135,
"num_tokens": 977252116.0,
"step": 39750
},
{
"entropy": 1.5608667409420014,
"epoch": 1.8855410270987303,
"grad_norm": 1.2925286293029785,
"learning_rate": 9.952152221996024e-07,
"loss": 1.1201,
"mean_token_accuracy": 0.7642844372987747,
"num_tokens": 978457729.0,
"step": 39800
},
{
"entropy": 1.5483964371681214,
"epoch": 1.8879097972332763,
"grad_norm": 1.4176242351531982,
"learning_rate": 9.54595651788448e-07,
"loss": 1.1321,
"mean_token_accuracy": 0.7614534211158752,
"num_tokens": 979721797.0,
"step": 39850
},
{
"entropy": 1.556614215373993,
"epoch": 1.8902785673678226,
"grad_norm": 1.2233829498291016,
"learning_rate": 9.148143967266209e-07,
"loss": 1.1657,
"mean_token_accuracy": 0.7561358803510666,
"num_tokens": 980985021.0,
"step": 39900
},
{
"entropy": 1.5509166061878203,
"epoch": 1.892647337502369,
"grad_norm": 1.1454182863235474,
"learning_rate": 8.758721369807099e-07,
"loss": 1.103,
"mean_token_accuracy": 0.7666506910324097,
"num_tokens": 982221836.0,
"step": 39950
},
{
"entropy": 1.5321409046649932,
"epoch": 1.895016107636915,
"grad_norm": 1.1777846813201904,
"learning_rate": 8.377695381766804e-07,
"loss": 1.1016,
"mean_token_accuracy": 0.7675345009565353,
"num_tokens": 983496374.0,
"step": 40000
},
{
"epoch": 1.895016107636915,
"eval_entropy": 1.0718190068778954,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7612931992664097,
"eval_num_tokens": 983496374.0,
"eval_runtime": 728.5673,
"eval_samples_per_second": 34.06,
"eval_steps_per_second": 4.258,
"step": 40000
},
{
"entropy": 1.563276962041855,
"epoch": 1.897384877771461,
"grad_norm": 1.0903818607330322,
"learning_rate": 8.00507251588456e-07,
"loss": 1.1352,
"mean_token_accuracy": 0.761111610531807,
"num_tokens": 984743304.0,
"step": 40050
},
{
"entropy": 1.5520950222015382,
"epoch": 1.899753647906007,
"grad_norm": 1.4384498596191406,
"learning_rate": 7.64085914126822e-07,
"loss": 1.105,
"mean_token_accuracy": 0.7678878873586654,
"num_tokens": 985955766.0,
"step": 40100
},
{
"entropy": 1.522156765460968,
"epoch": 1.9021224180405534,
"grad_norm": 1.1247589588165283,
"learning_rate": 7.285061483285227e-07,
"loss": 1.0875,
"mean_token_accuracy": 0.7697209113836289,
"num_tokens": 987194885.0,
"step": 40150
},
{
"entropy": 1.5538606441020966,
"epoch": 1.9044911881750994,
"grad_norm": 1.1288164854049683,
"learning_rate": 6.937685623456147e-07,
"loss": 1.1135,
"mean_token_accuracy": 0.7657132083177567,
"num_tokens": 988419847.0,
"step": 40200
},
{
"entropy": 1.5719120705127716,
"epoch": 1.9068599583096457,
"grad_norm": 1.0764816999435425,
"learning_rate": 6.598737499350915e-07,
"loss": 1.1339,
"mean_token_accuracy": 0.7616824221611023,
"num_tokens": 989644278.0,
"step": 40250
},
{
"entropy": 1.531646077632904,
"epoch": 1.9092287284441918,
"grad_norm": 1.442927360534668,
"learning_rate": 6.268222904487087e-07,
"loss": 1.1163,
"mean_token_accuracy": 0.7660135048627853,
"num_tokens": 990874401.0,
"step": 40300
},
{
"entropy": 1.5490214943885803,
"epoch": 1.9115974985787378,
"grad_norm": 1.215856671333313,
"learning_rate": 5.946147488231135e-07,
"loss": 1.1019,
"mean_token_accuracy": 0.7664029818773269,
"num_tokens": 992095108.0,
"step": 40350
},
{
"entropy": 1.5366157114505767,
"epoch": 1.913966268713284,
"grad_norm": 1.2367199659347534,
"learning_rate": 5.632516755701588e-07,
"loss": 1.1322,
"mean_token_accuracy": 0.763009768128395,
"num_tokens": 993348612.0,
"step": 40400
},
{
"entropy": 1.5416124892234802,
"epoch": 1.9163350388478302,
"grad_norm": 1.5094550848007202,
"learning_rate": 5.327336067674992e-07,
"loss": 1.121,
"mean_token_accuracy": 0.7645821911096573,
"num_tokens": 994580681.0,
"step": 40450
},
{
"entropy": 1.5396121156215667,
"epoch": 1.9187038089823765,
"grad_norm": 1.1620501279830933,
"learning_rate": 5.030610640494427e-07,
"loss": 1.0964,
"mean_token_accuracy": 0.7677625626325607,
"num_tokens": 995811651.0,
"step": 40500
},
{
"entropy": 1.551676151752472,
"epoch": 1.9210725791169225,
"grad_norm": 1.1541757583618164,
"learning_rate": 4.7423455459803536e-07,
"loss": 1.1273,
"mean_token_accuracy": 0.7630288958549499,
"num_tokens": 997009574.0,
"step": 40550
},
{
"entropy": 1.5580232727527619,
"epoch": 1.9234413492514686,
"grad_norm": 1.1015989780426025,
"learning_rate": 4.46254571134358e-07,
"loss": 1.1074,
"mean_token_accuracy": 0.7681276690959931,
"num_tokens": 998241396.0,
"step": 40600
},
{
"entropy": 1.5923674273490906,
"epoch": 1.9258101193860147,
"grad_norm": 1.3336706161499023,
"learning_rate": 4.1912159191015433e-07,
"loss": 1.1296,
"mean_token_accuracy": 0.7642046666145325,
"num_tokens": 999467429.0,
"step": 40650
},
{
"entropy": 1.5437606346607209,
"epoch": 1.928178889520561,
"grad_norm": 1.2162717580795288,
"learning_rate": 3.928360806996212e-07,
"loss": 1.1328,
"mean_token_accuracy": 0.7629256331920624,
"num_tokens": 1000699068.0,
"step": 40700
},
{
"entropy": 1.5417852425575256,
"epoch": 1.9305476596551072,
"grad_norm": 1.6152098178863525,
"learning_rate": 3.673984867914815e-07,
"loss": 1.1152,
"mean_token_accuracy": 0.7649567657709122,
"num_tokens": 1001935400.0,
"step": 40750
},
{
"entropy": 1.5360798180103301,
"epoch": 1.9329164297896533,
"grad_norm": 1.1792229413986206,
"learning_rate": 3.4280924498132917e-07,
"loss": 1.0897,
"mean_token_accuracy": 0.7702373021841049,
"num_tokens": 1003148072.0,
"step": 40800
},
{
"entropy": 1.5614196360111237,
"epoch": 1.9352851999241993,
"grad_norm": 1.244520664215088,
"learning_rate": 3.1906877556417414e-07,
"loss": 1.1636,
"mean_token_accuracy": 0.7582697266340256,
"num_tokens": 1004371846.0,
"step": 40850
},
{
"entropy": 1.5311019134521484,
"epoch": 1.9376539700587454,
"grad_norm": 1.064553141593933,
"learning_rate": 2.961774843272702e-07,
"loss": 1.0873,
"mean_token_accuracy": 0.7693702638149261,
"num_tokens": 1005607072.0,
"step": 40900
},
{
"entropy": 1.5764271855354308,
"epoch": 1.9400227401932917,
"grad_norm": 1.3938215970993042,
"learning_rate": 2.7413576254317065e-07,
"loss": 1.1587,
"mean_token_accuracy": 0.7578467607498169,
"num_tokens": 1006850136.0,
"step": 40950
},
{
"entropy": 1.5480285215377807,
"epoch": 1.9423915103278377,
"grad_norm": 1.2054554224014282,
"learning_rate": 2.529439869630612e-07,
"loss": 1.1434,
"mean_token_accuracy": 0.7606914877891541,
"num_tokens": 1008107025.0,
"step": 41000
},
{
"epoch": 1.9423915103278377,
"eval_entropy": 1.068777510758218,
"eval_loss": NaN,
"eval_mean_token_accuracy": 0.7612478421396935,
"eval_num_tokens": 1008107025.0,
"eval_runtime": 729.2965,
"eval_samples_per_second": 34.026,
"eval_steps_per_second": 4.253,
"step": 41000
},
{
"entropy": 1.5319510400295258,
"epoch": 1.944760280462384,
"grad_norm": 1.2180676460266113,
"learning_rate": 2.326025198102877e-07,
"loss": 1.1041,
"mean_token_accuracy": 0.7670031028985977,
"num_tokens": 1009369117.0,
"step": 41050
},
{
"entropy": 1.5614441645145416,
"epoch": 1.94712905059693,
"grad_norm": 1.0921282768249512,
"learning_rate": 2.1311170877418296e-07,
"loss": 1.1394,
"mean_token_accuracy": 0.7615219783782959,
"num_tokens": 1010587367.0,
"step": 41100
},
{
"entropy": 1.5393009448051453,
"epoch": 1.9494978207314761,
"grad_norm": 0.9701796770095825,
"learning_rate": 1.9447188700413287e-07,
"loss": 1.097,
"mean_token_accuracy": 0.7700717490911484,
"num_tokens": 1011833377.0,
"step": 41150
},
{
"entropy": 1.541386902332306,
"epoch": 1.9518665908660222,
"grad_norm": 1.24893319606781,
"learning_rate": 1.7668337310386418e-07,
"loss": 1.1177,
"mean_token_accuracy": 0.7653918391466141,
"num_tokens": 1013068815.0,
"step": 41200
},
{
"entropy": 1.5547258961200714,
"epoch": 1.9542353610005685,
"grad_norm": 1.079546570777893,
"learning_rate": 1.5974647112600994e-07,
"loss": 1.123,
"mean_token_accuracy": 0.7637561255693436,
"num_tokens": 1014274097.0,
"step": 41250
},
{
"entropy": 1.5249077999591827,
"epoch": 1.9566041311351148,
"grad_norm": 1.5189976692199707,
"learning_rate": 1.436614705669026e-07,
"loss": 1.1098,
"mean_token_accuracy": 0.7675369191169739,
"num_tokens": 1015545120.0,
"step": 41300
},
{
"entropy": 1.569046869277954,
"epoch": 1.9589729012696608,
"grad_norm": 1.0235174894332886,
"learning_rate": 1.2842864636164464e-07,
"loss": 1.1425,
"mean_token_accuracy": 0.7613073486089706,
"num_tokens": 1016784041.0,
"step": 41350
},
{
"entropy": 1.5418232583999634,
"epoch": 1.961341671404207,
"grad_norm": 1.1395779848098755,
"learning_rate": 1.1404825887937898e-07,
"loss": 1.1096,
"mean_token_accuracy": 0.7651181477308273,
"num_tokens": 1018023715.0,
"step": 41400
},
{
"entropy": 1.5499148654937744,
"epoch": 1.963710441538753,
"grad_norm": 1.103959560394287,
"learning_rate": 1.0052055391887027e-07,
"loss": 1.1536,
"mean_token_accuracy": 0.7575215709209442,
"num_tokens": 1019247827.0,
"step": 41450
},
{
"entropy": 1.5448267459869385,
"epoch": 1.9660792116732992,
"grad_norm": 1.159401297569275,
"learning_rate": 8.784576270428058e-08,
"loss": 1.1252,
"mean_token_accuracy": 0.7638620465993882,
"num_tokens": 1020467713.0,
"step": 41500
}
],
"logging_steps": 50,
"max_steps": 42216,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1122127009405626e+20,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}