camilablank's picture
Upload shakespearean_L16_a150 seed_42 (final adapter + all intermediate checkpoints)
f09ad50 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 5201,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.148803424835205,
"epoch": 0.013458950201884253,
"grad_norm": 3.8571324348449707,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.5379337310791016,
"mean_token_accuracy": 0.8677904605865479,
"num_tokens": 206424.0,
"step": 10
},
{
"entropy": 1.1465779304504395,
"epoch": 0.026917900403768506,
"grad_norm": 3.910707712173462,
"learning_rate": 5.1075268817204305e-06,
"loss": 0.512223768234253,
"mean_token_accuracy": 0.8692944288253784,
"num_tokens": 413440.0,
"step": 20
},
{
"entropy": 1.1525423288345338,
"epoch": 0.040376850605652756,
"grad_norm": 1.4944534301757812,
"learning_rate": 7.795698924731183e-06,
"loss": 0.3680122852325439,
"mean_token_accuracy": 0.8862209141254425,
"num_tokens": 620527.0,
"step": 30
},
{
"entropy": 1.1328181862831115,
"epoch": 0.05383580080753701,
"grad_norm": 0.6623373031616211,
"learning_rate": 1.0483870967741936e-05,
"loss": 0.2167149305343628,
"mean_token_accuracy": 0.9302092254161834,
"num_tokens": 827181.0,
"step": 40
},
{
"entropy": 1.1270673632621766,
"epoch": 0.06729475100942127,
"grad_norm": 0.475289911031723,
"learning_rate": 1.3172043010752688e-05,
"loss": 0.16388083696365358,
"mean_token_accuracy": 0.9458359956741333,
"num_tokens": 1034474.0,
"step": 50
},
{
"entropy": 1.1342731595039368,
"epoch": 0.08075370121130551,
"grad_norm": 0.3283269703388214,
"learning_rate": 1.586021505376344e-05,
"loss": 0.1500812292098999,
"mean_token_accuracy": 0.9489317536354065,
"num_tokens": 1241682.0,
"step": 60
},
{
"entropy": 1.1313621759414674,
"epoch": 0.09421265141318977,
"grad_norm": 0.26236599683761597,
"learning_rate": 1.8548387096774193e-05,
"loss": 0.1297295331954956,
"mean_token_accuracy": 0.9544126331806183,
"num_tokens": 1448340.0,
"step": 70
},
{
"entropy": 1.1243879675865174,
"epoch": 0.10767160161507403,
"grad_norm": 0.3455806374549866,
"learning_rate": 2.1236559139784946e-05,
"loss": 0.12099775075912475,
"mean_token_accuracy": 0.957859605550766,
"num_tokens": 1655485.0,
"step": 80
},
{
"entropy": 1.1236204266548158,
"epoch": 0.12113055181695828,
"grad_norm": 0.34308287501335144,
"learning_rate": 2.39247311827957e-05,
"loss": 0.12203640937805176,
"mean_token_accuracy": 0.9569462358951568,
"num_tokens": 1862721.0,
"step": 90
},
{
"entropy": 1.1212542057037354,
"epoch": 0.13458950201884254,
"grad_norm": 0.4861944913864136,
"learning_rate": 2.661290322580645e-05,
"loss": 0.11177135705947876,
"mean_token_accuracy": 0.9603476166725159,
"num_tokens": 2070235.0,
"step": 100
},
{
"entropy": 1.1195749163627624,
"epoch": 0.1480484522207268,
"grad_norm": 0.45371124148368835,
"learning_rate": 2.9301075268817207e-05,
"loss": 0.10732686519622803,
"mean_token_accuracy": 0.9610694587230683,
"num_tokens": 2276699.0,
"step": 110
},
{
"entropy": 1.117960274219513,
"epoch": 0.16150740242261102,
"grad_norm": 0.34547603130340576,
"learning_rate": 3.198924731182796e-05,
"loss": 0.10359159708023072,
"mean_token_accuracy": 0.9620514333248138,
"num_tokens": 2483409.0,
"step": 120
},
{
"entropy": 1.1100458979606629,
"epoch": 0.17496635262449528,
"grad_norm": 0.4161316752433777,
"learning_rate": 3.467741935483872e-05,
"loss": 0.09693117141723633,
"mean_token_accuracy": 0.9630799531936646,
"num_tokens": 2689832.0,
"step": 130
},
{
"entropy": 1.104417872428894,
"epoch": 0.18842530282637954,
"grad_norm": 0.36697497963905334,
"learning_rate": 3.736559139784947e-05,
"loss": 0.08809277415275574,
"mean_token_accuracy": 0.966953593492508,
"num_tokens": 2896836.0,
"step": 140
},
{
"entropy": 1.1028227806091309,
"epoch": 0.2018842530282638,
"grad_norm": 0.44922295212745667,
"learning_rate": 4.005376344086022e-05,
"loss": 0.08628258109092712,
"mean_token_accuracy": 0.9677423119544983,
"num_tokens": 3103952.0,
"step": 150
},
{
"entropy": 1.1022148132324219,
"epoch": 0.21534320323014805,
"grad_norm": 0.49055221676826477,
"learning_rate": 4.2741935483870973e-05,
"loss": 0.08765531778335571,
"mean_token_accuracy": 0.9671335756778717,
"num_tokens": 3310678.0,
"step": 160
},
{
"entropy": 1.1029130339622497,
"epoch": 0.2288021534320323,
"grad_norm": 0.9099652767181396,
"learning_rate": 4.543010752688172e-05,
"loss": 0.08238034844398498,
"mean_token_accuracy": 0.9691508829593658,
"num_tokens": 3517265.0,
"step": 170
},
{
"entropy": 1.1018678188323974,
"epoch": 0.24226110363391656,
"grad_norm": 0.6304323673248291,
"learning_rate": 4.811827956989248e-05,
"loss": 0.0836961030960083,
"mean_token_accuracy": 0.9679419934749603,
"num_tokens": 3724354.0,
"step": 180
},
{
"entropy": 1.0950765252113341,
"epoch": 0.2557200538358008,
"grad_norm": 0.6410331130027771,
"learning_rate": 5.080645161290323e-05,
"loss": 0.08708053231239318,
"mean_token_accuracy": 0.9672979772090912,
"num_tokens": 3931323.0,
"step": 190
},
{
"entropy": 1.1028541207313538,
"epoch": 0.2691790040376851,
"grad_norm": 0.5233514904975891,
"learning_rate": 5.349462365591398e-05,
"loss": 0.09297683238983154,
"mean_token_accuracy": 0.9647954285144806,
"num_tokens": 4137581.0,
"step": 200
},
{
"entropy": 1.0980924248695374,
"epoch": 0.28263795423956933,
"grad_norm": 0.452391117811203,
"learning_rate": 5.618279569892473e-05,
"loss": 0.07667996883392333,
"mean_token_accuracy": 0.9710327327251435,
"num_tokens": 4344516.0,
"step": 210
},
{
"entropy": 1.094509518146515,
"epoch": 0.2960969044414536,
"grad_norm": 0.36206117272377014,
"learning_rate": 5.887096774193549e-05,
"loss": 0.0823606014251709,
"mean_token_accuracy": 0.9687092125415802,
"num_tokens": 4551381.0,
"step": 220
},
{
"entropy": 1.100801420211792,
"epoch": 0.30955585464333785,
"grad_norm": 0.5433716177940369,
"learning_rate": 6.155913978494624e-05,
"loss": 0.07860409021377564,
"mean_token_accuracy": 0.9693577349185943,
"num_tokens": 4757696.0,
"step": 230
},
{
"entropy": 1.0913572549819945,
"epoch": 0.32301480484522205,
"grad_norm": 0.49431684613227844,
"learning_rate": 6.4247311827957e-05,
"loss": 0.07448963522911071,
"mean_token_accuracy": 0.9722888886928558,
"num_tokens": 4964472.0,
"step": 240
},
{
"entropy": 1.09194358587265,
"epoch": 0.3364737550471063,
"grad_norm": 0.49218472838401794,
"learning_rate": 6.693548387096774e-05,
"loss": 0.07904107570648193,
"mean_token_accuracy": 0.9706218123435975,
"num_tokens": 5171079.0,
"step": 250
},
{
"entropy": 1.0910104513168335,
"epoch": 0.34993270524899056,
"grad_norm": 0.4496209919452667,
"learning_rate": 6.962365591397851e-05,
"loss": 0.07965055108070374,
"mean_token_accuracy": 0.9694360017776489,
"num_tokens": 5377563.0,
"step": 260
},
{
"entropy": 1.0903191208839416,
"epoch": 0.3633916554508748,
"grad_norm": 0.6361143589019775,
"learning_rate": 7.231182795698926e-05,
"loss": 0.07528930306434631,
"mean_token_accuracy": 0.9712541162967682,
"num_tokens": 5584588.0,
"step": 270
},
{
"entropy": 1.088301658630371,
"epoch": 0.3768506056527591,
"grad_norm": 0.48176145553588867,
"learning_rate": 7.500000000000001e-05,
"loss": 0.07463239431381226,
"mean_token_accuracy": 0.9709499478340149,
"num_tokens": 5791987.0,
"step": 280
},
{
"entropy": 1.0902979731559754,
"epoch": 0.39030955585464333,
"grad_norm": 0.3715272843837738,
"learning_rate": 7.768817204301076e-05,
"loss": 0.07689260840415954,
"mean_token_accuracy": 0.9711870729923249,
"num_tokens": 5999048.0,
"step": 290
},
{
"entropy": 1.0841062188148498,
"epoch": 0.4037685060565276,
"grad_norm": 0.41488343477249146,
"learning_rate": 8.037634408602151e-05,
"loss": 0.07187070846557617,
"mean_token_accuracy": 0.9723010718822479,
"num_tokens": 6206324.0,
"step": 300
},
{
"entropy": 1.0855528712272644,
"epoch": 0.41722745625841184,
"grad_norm": 0.46184614300727844,
"learning_rate": 8.306451612903227e-05,
"loss": 0.07177088856697082,
"mean_token_accuracy": 0.9727262139320374,
"num_tokens": 6412954.0,
"step": 310
},
{
"entropy": 1.082417607307434,
"epoch": 0.4306864064602961,
"grad_norm": 0.4955524206161499,
"learning_rate": 8.575268817204302e-05,
"loss": 0.07426758408546448,
"mean_token_accuracy": 0.9719507157802582,
"num_tokens": 6619325.0,
"step": 320
},
{
"entropy": 1.079432439804077,
"epoch": 0.44414535666218036,
"grad_norm": 0.4565524458885193,
"learning_rate": 8.844086021505377e-05,
"loss": 0.07158007621765136,
"mean_token_accuracy": 0.9722371101379395,
"num_tokens": 6825686.0,
"step": 330
},
{
"entropy": 1.0759860038757325,
"epoch": 0.4576043068640646,
"grad_norm": 0.3511630892753601,
"learning_rate": 9.112903225806452e-05,
"loss": 0.06483781337738037,
"mean_token_accuracy": 0.9750715494155884,
"num_tokens": 7033233.0,
"step": 340
},
{
"entropy": 1.0776028990745545,
"epoch": 0.47106325706594887,
"grad_norm": 0.4951312243938446,
"learning_rate": 9.381720430107528e-05,
"loss": 0.06857329607009888,
"mean_token_accuracy": 0.973279458284378,
"num_tokens": 7240102.0,
"step": 350
},
{
"entropy": 1.0731443285942077,
"epoch": 0.4845222072678331,
"grad_norm": 0.4344209134578705,
"learning_rate": 9.650537634408603e-05,
"loss": 0.06759830713272094,
"mean_token_accuracy": 0.9735908925533294,
"num_tokens": 7446371.0,
"step": 360
},
{
"entropy": 1.0724977254867554,
"epoch": 0.4979811574697174,
"grad_norm": 0.366580605506897,
"learning_rate": 9.919354838709678e-05,
"loss": 0.07093892097473145,
"mean_token_accuracy": 0.9729514002799988,
"num_tokens": 7653471.0,
"step": 370
},
{
"entropy": 1.0748721718788148,
"epoch": 0.5114401076716016,
"grad_norm": 0.5749452710151672,
"learning_rate": 9.999975729865971e-05,
"loss": 0.07341647148132324,
"mean_token_accuracy": 0.9716718792915344,
"num_tokens": 7859948.0,
"step": 380
},
{
"entropy": 1.0717879056930542,
"epoch": 0.5248990578734859,
"grad_norm": 0.3533707857131958,
"learning_rate": 9.999856856307314e-05,
"loss": 0.0660275936126709,
"mean_token_accuracy": 0.9741848886013031,
"num_tokens": 8066342.0,
"step": 390
},
{
"entropy": 1.06906156539917,
"epoch": 0.5383580080753702,
"grad_norm": 0.37785258889198303,
"learning_rate": 9.999638923896533e-05,
"loss": 0.07019535303115845,
"mean_token_accuracy": 0.9730446755886077,
"num_tokens": 8273143.0,
"step": 400
},
{
"entropy": 1.0653064250946045,
"epoch": 0.5518169582772544,
"grad_norm": 0.5148960947990417,
"learning_rate": 9.999321936951374e-05,
"loss": 0.06534629464149475,
"mean_token_accuracy": 0.9743636608123779,
"num_tokens": 8479886.0,
"step": 410
},
{
"entropy": 1.0702444195747376,
"epoch": 0.5652759084791387,
"grad_norm": 0.3438984453678131,
"learning_rate": 9.998905901752091e-05,
"loss": 0.07364909052848816,
"mean_token_accuracy": 0.9711242079734802,
"num_tokens": 8686924.0,
"step": 420
},
{
"entropy": 1.0661863446235658,
"epoch": 0.5787348586810229,
"grad_norm": 0.406575083732605,
"learning_rate": 9.998390826541315e-05,
"loss": 0.07197093963623047,
"mean_token_accuracy": 0.9721520006656647,
"num_tokens": 8894083.0,
"step": 430
},
{
"entropy": 1.0637916922569275,
"epoch": 0.5921938088829072,
"grad_norm": 0.428232878446579,
"learning_rate": 9.997776721523888e-05,
"loss": 0.06615663766860962,
"mean_token_accuracy": 0.9749840319156646,
"num_tokens": 9101394.0,
"step": 440
},
{
"entropy": 1.0706895470619202,
"epoch": 0.6056527590847914,
"grad_norm": 0.4631795585155487,
"learning_rate": 9.99706359886667e-05,
"loss": 0.06901694536209106,
"mean_token_accuracy": 0.9723595201969146,
"num_tokens": 9307988.0,
"step": 450
},
{
"entropy": 1.0587412118911743,
"epoch": 0.6191117092866757,
"grad_norm": 0.34737110137939453,
"learning_rate": 9.996251472698281e-05,
"loss": 0.060923701524734496,
"mean_token_accuracy": 0.9760171294212341,
"num_tokens": 9515151.0,
"step": 460
},
{
"entropy": 1.0579840540885925,
"epoch": 0.6325706594885598,
"grad_norm": 0.45198148488998413,
"learning_rate": 9.995340359108844e-05,
"loss": 0.06548458337783813,
"mean_token_accuracy": 0.9744683504104614,
"num_tokens": 9721875.0,
"step": 470
},
{
"entropy": 1.0573879599571228,
"epoch": 0.6460296096904441,
"grad_norm": 0.3072788715362549,
"learning_rate": 9.994330276149649e-05,
"loss": 0.06333011984825135,
"mean_token_accuracy": 0.9758989095687867,
"num_tokens": 9928940.0,
"step": 480
},
{
"entropy": 1.053052794933319,
"epoch": 0.6594885598923284,
"grad_norm": 0.36213016510009766,
"learning_rate": 9.993221243832797e-05,
"loss": 0.06300820708274842,
"mean_token_accuracy": 0.9755940675735474,
"num_tokens": 10136064.0,
"step": 490
},
{
"entropy": 1.0509449362754821,
"epoch": 0.6729475100942126,
"grad_norm": 0.3727293610572815,
"learning_rate": 9.992013284130816e-05,
"loss": 0.0638339638710022,
"mean_token_accuracy": 0.9754894852638245,
"num_tokens": 10343159.0,
"step": 500
},
{
"entropy": 1.0530992031097413,
"epoch": 0.6864064602960969,
"grad_norm": 0.3802219331264496,
"learning_rate": 9.990706420976206e-05,
"loss": 0.06384505033493042,
"mean_token_accuracy": 0.9755050659179687,
"num_tokens": 10550244.0,
"step": 510
},
{
"entropy": 1.0434999823570252,
"epoch": 0.6998654104979811,
"grad_norm": 0.3587857186794281,
"learning_rate": 9.989300680260985e-05,
"loss": 0.06402989625930786,
"mean_token_accuracy": 0.9750327467918396,
"num_tokens": 10756685.0,
"step": 520
},
{
"entropy": 1.0373775839805603,
"epoch": 0.7133243606998654,
"grad_norm": 0.5431106686592102,
"learning_rate": 9.98779608983616e-05,
"loss": 0.06214879155158996,
"mean_token_accuracy": 0.975670325756073,
"num_tokens": 10963777.0,
"step": 530
},
{
"entropy": 1.042518949508667,
"epoch": 0.7267833109017496,
"grad_norm": 0.42488718032836914,
"learning_rate": 9.986192679511189e-05,
"loss": 0.06648873090744019,
"mean_token_accuracy": 0.9741427898406982,
"num_tokens": 11170113.0,
"step": 540
},
{
"entropy": 1.0383589148521424,
"epoch": 0.7402422611036339,
"grad_norm": 0.4388017952442169,
"learning_rate": 9.984490481053372e-05,
"loss": 0.06373496651649475,
"mean_token_accuracy": 0.9753153324127197,
"num_tokens": 11377177.0,
"step": 550
},
{
"entropy": 1.030055034160614,
"epoch": 0.7537012113055181,
"grad_norm": 0.5177302360534668,
"learning_rate": 9.982689528187244e-05,
"loss": 0.06524116396903992,
"mean_token_accuracy": 0.974441123008728,
"num_tokens": 11584297.0,
"step": 560
},
{
"entropy": 1.0238368034362793,
"epoch": 0.7671601615074024,
"grad_norm": 0.3689085841178894,
"learning_rate": 9.98078985659389e-05,
"loss": 0.057816171646118165,
"mean_token_accuracy": 0.9771308839321137,
"num_tokens": 11791221.0,
"step": 570
},
{
"entropy": 1.0260837078094482,
"epoch": 0.7806191117092867,
"grad_norm": 0.5362858772277832,
"learning_rate": 9.978791503910246e-05,
"loss": 0.07015071511268615,
"mean_token_accuracy": 0.9720354020595551,
"num_tokens": 11997267.0,
"step": 580
},
{
"entropy": 1.0203055262565612,
"epoch": 0.7940780619111709,
"grad_norm": 0.3500514328479767,
"learning_rate": 9.97669450972835e-05,
"loss": 0.0703177034854889,
"mean_token_accuracy": 0.9722893536090851,
"num_tokens": 12204122.0,
"step": 590
},
{
"entropy": 1.0201836228370667,
"epoch": 0.8075370121130552,
"grad_norm": 0.4321291744709015,
"learning_rate": 9.974498915594557e-05,
"loss": 0.06507774591445922,
"mean_token_accuracy": 0.9742036819458008,
"num_tokens": 12410976.0,
"step": 600
},
{
"entropy": 1.016060435771942,
"epoch": 0.8209959623149394,
"grad_norm": 0.30269697308540344,
"learning_rate": 9.97220476500872e-05,
"loss": 0.06292833685874939,
"mean_token_accuracy": 0.9752326369285583,
"num_tokens": 12617715.0,
"step": 610
},
{
"entropy": 1.0146049499511718,
"epoch": 0.8344549125168237,
"grad_norm": 0.3690892457962036,
"learning_rate": 9.969812103423325e-05,
"loss": 0.06385961771011353,
"mean_token_accuracy": 0.9751694440841675,
"num_tokens": 12824043.0,
"step": 620
},
{
"entropy": 1.0002728581428528,
"epoch": 0.847913862718708,
"grad_norm": 0.38600072264671326,
"learning_rate": 9.967320978242592e-05,
"loss": 0.0620916485786438,
"mean_token_accuracy": 0.9763328671455384,
"num_tokens": 13031364.0,
"step": 630
},
{
"entropy": 1.0015868484973907,
"epoch": 0.8613728129205922,
"grad_norm": 0.2936280369758606,
"learning_rate": 9.964731438821533e-05,
"loss": 0.05919194221496582,
"mean_token_accuracy": 0.9767723500728607,
"num_tokens": 13238320.0,
"step": 640
},
{
"entropy": 0.9941371619701386,
"epoch": 0.8748317631224765,
"grad_norm": 0.3988257050514221,
"learning_rate": 9.962043536464978e-05,
"loss": 0.06394914388656617,
"mean_token_accuracy": 0.9750961601734162,
"num_tokens": 13444929.0,
"step": 650
},
{
"entropy": 0.9894050359725952,
"epoch": 0.8882907133243607,
"grad_norm": 0.373199462890625,
"learning_rate": 9.959257324426556e-05,
"loss": 0.062080603837966916,
"mean_token_accuracy": 0.9750838100910186,
"num_tokens": 13652058.0,
"step": 660
},
{
"entropy": 0.9806465029716491,
"epoch": 0.901749663526245,
"grad_norm": 0.4831550121307373,
"learning_rate": 9.95637285790764e-05,
"loss": 0.058683907985687254,
"mean_token_accuracy": 0.9773052394390106,
"num_tokens": 13859138.0,
"step": 670
},
{
"entropy": 0.9766267120838166,
"epoch": 0.9152086137281292,
"grad_norm": 0.2987309396266937,
"learning_rate": 9.953390194056258e-05,
"loss": 0.05572218298912048,
"mean_token_accuracy": 0.9783341228961945,
"num_tokens": 14066689.0,
"step": 680
},
{
"entropy": 0.9726546585559845,
"epoch": 0.9286675639300135,
"grad_norm": 0.49207931756973267,
"learning_rate": 9.950309391965947e-05,
"loss": 0.060380947589874265,
"mean_token_accuracy": 0.9769295156002045,
"num_tokens": 14273683.0,
"step": 690
},
{
"entropy": 0.9802318632602691,
"epoch": 0.9421265141318977,
"grad_norm": 0.45118796825408936,
"learning_rate": 9.947130512674602e-05,
"loss": 0.061833739280700684,
"mean_token_accuracy": 0.9755691647529602,
"num_tokens": 14480450.0,
"step": 700
},
{
"entropy": 0.9772093653678894,
"epoch": 0.955585464333782,
"grad_norm": 0.32441404461860657,
"learning_rate": 9.943853619163255e-05,
"loss": 0.06221691370010376,
"mean_token_accuracy": 0.97582648396492,
"num_tokens": 14687394.0,
"step": 710
},
{
"entropy": 0.9712013483047486,
"epoch": 0.9690444145356663,
"grad_norm": 0.4250052273273468,
"learning_rate": 9.94047877635482e-05,
"loss": 0.06065620183944702,
"mean_token_accuracy": 0.9765822350978851,
"num_tokens": 14894507.0,
"step": 720
},
{
"entropy": 0.9658361196517944,
"epoch": 0.9825033647375505,
"grad_norm": 0.32111653685569763,
"learning_rate": 9.93700605111283e-05,
"loss": 0.05561348795890808,
"mean_token_accuracy": 0.977992582321167,
"num_tokens": 15101391.0,
"step": 730
},
{
"entropy": 0.9521977305412292,
"epoch": 0.9959623149394348,
"grad_norm": 0.4938078224658966,
"learning_rate": 9.933435512240084e-05,
"loss": 0.054716891050338744,
"mean_token_accuracy": 0.9787404000759125,
"num_tokens": 15308546.0,
"step": 740
},
{
"epoch": 1.0,
"eval_entropy": 0.953010377610565,
"eval_loss": 0.057268138974905014,
"eval_mean_token_accuracy": 0.9773533947908195,
"eval_num_tokens": 15370753.0,
"eval_runtime": 14.2376,
"eval_samples_per_second": 351.183,
"eval_steps_per_second": 11.027,
"step": 743
},
{
"entropy": 0.9481509625911713,
"epoch": 1.009421265141319,
"grad_norm": 0.45019322633743286,
"learning_rate": 9.929767230477305e-05,
"loss": 0.05444328188896179,
"mean_token_accuracy": 0.9790198564529419,
"num_tokens": 15516109.0,
"step": 750
},
{
"entropy": 0.9414805412292481,
"epoch": 1.0228802153432033,
"grad_norm": 0.4308762848377228,
"learning_rate": 9.92600127850173e-05,
"loss": 0.05448293685913086,
"mean_token_accuracy": 0.9784913420677185,
"num_tokens": 15723217.0,
"step": 760
},
{
"entropy": 0.9518636524677276,
"epoch": 1.0363391655450875,
"grad_norm": 0.35742712020874023,
"learning_rate": 9.922137730925673e-05,
"loss": 0.050360894203186034,
"mean_token_accuracy": 0.9803691744804383,
"num_tokens": 15929676.0,
"step": 770
},
{
"entropy": 0.9442292392253876,
"epoch": 1.0497981157469718,
"grad_norm": 0.3914043605327606,
"learning_rate": 9.918176664295041e-05,
"loss": 0.049952417612075806,
"mean_token_accuracy": 0.9808734893798828,
"num_tokens": 16136561.0,
"step": 780
},
{
"entropy": 0.9437739491462708,
"epoch": 1.063257065948856,
"grad_norm": 0.5056980848312378,
"learning_rate": 9.914118157087824e-05,
"loss": 0.04833500385284424,
"mean_token_accuracy": 0.9812267363071442,
"num_tokens": 16343444.0,
"step": 790
},
{
"entropy": 0.9437163591384887,
"epoch": 1.0767160161507403,
"grad_norm": 0.4277653396129608,
"learning_rate": 9.909962289712538e-05,
"loss": 0.04925972819328308,
"mean_token_accuracy": 0.9810784041881562,
"num_tokens": 16550530.0,
"step": 800
},
{
"entropy": 0.9458538055419922,
"epoch": 1.0901749663526246,
"grad_norm": 0.3851556181907654,
"learning_rate": 9.905709144506629e-05,
"loss": 0.05031745433807373,
"mean_token_accuracy": 0.9802758753299713,
"num_tokens": 16757340.0,
"step": 810
},
{
"entropy": 0.9309192001819611,
"epoch": 1.1036339165545088,
"grad_norm": 0.3910880982875824,
"learning_rate": 9.901358805734846e-05,
"loss": 0.052798688411712646,
"mean_token_accuracy": 0.9791285336017609,
"num_tokens": 16963736.0,
"step": 820
},
{
"entropy": 0.9286812126636506,
"epoch": 1.117092866756393,
"grad_norm": 0.40629255771636963,
"learning_rate": 9.89691135958757e-05,
"loss": 0.0501678466796875,
"mean_token_accuracy": 0.9807449519634247,
"num_tokens": 17170342.0,
"step": 830
},
{
"entropy": 0.9351256549358368,
"epoch": 1.1305518169582773,
"grad_norm": 0.43279024958610535,
"learning_rate": 9.892366894179105e-05,
"loss": 0.054589086771011354,
"mean_token_accuracy": 0.9786225914955139,
"num_tokens": 17377459.0,
"step": 840
},
{
"entropy": 0.9256517946720123,
"epoch": 1.1440107671601616,
"grad_norm": 0.34225529432296753,
"learning_rate": 9.887725499545937e-05,
"loss": 0.05118160843849182,
"mean_token_accuracy": 0.9801044940948487,
"num_tokens": 17584121.0,
"step": 850
},
{
"entropy": 0.9092171311378479,
"epoch": 1.1574697173620458,
"grad_norm": 0.4391540586948395,
"learning_rate": 9.882987267644939e-05,
"loss": 0.050338762998580935,
"mean_token_accuracy": 0.9804662644863129,
"num_tokens": 17791221.0,
"step": 860
},
{
"entropy": 0.9084291815757751,
"epoch": 1.17092866756393,
"grad_norm": 0.36910438537597656,
"learning_rate": 9.878152292351563e-05,
"loss": 0.05027775764465332,
"mean_token_accuracy": 0.9796700894832611,
"num_tokens": 17997756.0,
"step": 870
},
{
"entropy": 0.9082553148269653,
"epoch": 1.1843876177658144,
"grad_norm": 0.6938081383705139,
"learning_rate": 9.873220669457975e-05,
"loss": 0.0528791606426239,
"mean_token_accuracy": 0.9790813088417053,
"num_tokens": 18204162.0,
"step": 880
},
{
"entropy": 0.9058304250240325,
"epoch": 1.1978465679676986,
"grad_norm": 0.45121562480926514,
"learning_rate": 9.868192496671147e-05,
"loss": 0.05375846028327942,
"mean_token_accuracy": 0.978945928812027,
"num_tokens": 18410866.0,
"step": 890
},
{
"entropy": 0.8980253875255585,
"epoch": 1.2113055181695827,
"grad_norm": 0.38418707251548767,
"learning_rate": 9.86306787361094e-05,
"loss": 0.04832733869552612,
"mean_token_accuracy": 0.981135094165802,
"num_tokens": 18617340.0,
"step": 900
},
{
"entropy": 0.9037271738052368,
"epoch": 1.224764468371467,
"grad_norm": 0.4018036127090454,
"learning_rate": 9.857846901808117e-05,
"loss": 0.0564426064491272,
"mean_token_accuracy": 0.978033185005188,
"num_tokens": 18823367.0,
"step": 910
},
{
"entropy": 0.8979138970375061,
"epoch": 1.2382234185733512,
"grad_norm": 0.40662312507629395,
"learning_rate": 9.852529684702329e-05,
"loss": 0.051144140958786014,
"mean_token_accuracy": 0.9796078085899353,
"num_tokens": 19030313.0,
"step": 920
},
{
"entropy": 0.8905315518379211,
"epoch": 1.2516823687752354,
"grad_norm": 0.4605758786201477,
"learning_rate": 9.847116327640082e-05,
"loss": 0.04797832369804382,
"mean_token_accuracy": 0.9815882265567779,
"num_tokens": 19237742.0,
"step": 930
},
{
"entropy": 0.8781926572322846,
"epoch": 1.2651413189771197,
"grad_norm": 0.41945287585258484,
"learning_rate": 9.841606937872632e-05,
"loss": 0.05179686546325683,
"mean_token_accuracy": 0.9793125987052917,
"num_tokens": 19444394.0,
"step": 940
},
{
"entropy": 0.8800916612148285,
"epoch": 1.278600269179004,
"grad_norm": 0.44151630997657776,
"learning_rate": 9.836001624553869e-05,
"loss": 0.05166807770729065,
"mean_token_accuracy": 0.97959805727005,
"num_tokens": 19651243.0,
"step": 950
},
{
"entropy": 0.8785765290260314,
"epoch": 1.2920592193808882,
"grad_norm": 0.35568565130233765,
"learning_rate": 9.830300498738152e-05,
"loss": 0.05172090530395508,
"mean_token_accuracy": 0.9796277642250061,
"num_tokens": 19858323.0,
"step": 960
},
{
"entropy": 0.8748304843902588,
"epoch": 1.3055181695827724,
"grad_norm": 0.5144789814949036,
"learning_rate": 9.824503673378112e-05,
"loss": 0.05269254446029663,
"mean_token_accuracy": 0.9794202625751496,
"num_tokens": 20064779.0,
"step": 970
},
{
"entropy": 0.878674441576004,
"epoch": 1.3189771197846567,
"grad_norm": 0.3717172145843506,
"learning_rate": 9.81861126332241e-05,
"loss": 0.05027352571487427,
"mean_token_accuracy": 0.9804058611392975,
"num_tokens": 20271663.0,
"step": 980
},
{
"entropy": 0.874711400270462,
"epoch": 1.332436069986541,
"grad_norm": 0.5428106784820557,
"learning_rate": 9.812623385313461e-05,
"loss": 0.04746338129043579,
"mean_token_accuracy": 0.9809837818145752,
"num_tokens": 20479093.0,
"step": 990
},
{
"entropy": 0.8767077684402466,
"epoch": 1.3458950201884252,
"grad_norm": 0.4203868806362152,
"learning_rate": 9.806540157985131e-05,
"loss": 0.047260144352912904,
"mean_token_accuracy": 0.981168121099472,
"num_tokens": 20686058.0,
"step": 1000
},
{
"entropy": 0.8626155018806457,
"epoch": 1.3593539703903095,
"grad_norm": 0.5116966962814331,
"learning_rate": 9.800361701860368e-05,
"loss": 0.04717045724391937,
"mean_token_accuracy": 0.9814311504364014,
"num_tokens": 20892972.0,
"step": 1010
},
{
"entropy": 0.8597043097019196,
"epoch": 1.3728129205921937,
"grad_norm": 0.5143622159957886,
"learning_rate": 9.794088139348835e-05,
"loss": 0.04667789340019226,
"mean_token_accuracy": 0.9814274728298187,
"num_tokens": 21100415.0,
"step": 1020
},
{
"entropy": 0.8572463095188141,
"epoch": 1.386271870794078,
"grad_norm": 0.553600549697876,
"learning_rate": 9.787719594744468e-05,
"loss": 0.050517672300338747,
"mean_token_accuracy": 0.9801969349384307,
"num_tokens": 21307218.0,
"step": 1030
},
{
"entropy": 0.8552091658115387,
"epoch": 1.3997308209959622,
"grad_norm": 0.4543687403202057,
"learning_rate": 9.781256194223023e-05,
"loss": 0.05095386505126953,
"mean_token_accuracy": 0.9798394560813903,
"num_tokens": 21514406.0,
"step": 1040
},
{
"entropy": 0.8547879338264466,
"epoch": 1.4131897711978465,
"grad_norm": 0.3877199590206146,
"learning_rate": 9.774698065839577e-05,
"loss": 0.05205919742584229,
"mean_token_accuracy": 0.9795322000980378,
"num_tokens": 21721292.0,
"step": 1050
},
{
"entropy": 0.8591440379619598,
"epoch": 1.4266487213997308,
"grad_norm": 0.4777373671531677,
"learning_rate": 9.768045339525979e-05,
"loss": 0.04756388366222382,
"mean_token_accuracy": 0.9814973652362824,
"num_tokens": 21927753.0,
"step": 1060
},
{
"entropy": 0.83706636428833,
"epoch": 1.440107671601615,
"grad_norm": 0.3106399178504944,
"learning_rate": 9.76129814708829e-05,
"loss": 0.04771801829338074,
"mean_token_accuracy": 0.9811239123344422,
"num_tokens": 22134047.0,
"step": 1070
},
{
"entropy": 0.8355492770671844,
"epoch": 1.4535666218034993,
"grad_norm": 0.5079196691513062,
"learning_rate": 9.754456622204167e-05,
"loss": 0.04518579244613648,
"mean_token_accuracy": 0.9819768607616425,
"num_tokens": 22340843.0,
"step": 1080
},
{
"entropy": 0.8377915918827057,
"epoch": 1.4670255720053835,
"grad_norm": 0.38823193311691284,
"learning_rate": 9.747520900420209e-05,
"loss": 0.04822598397731781,
"mean_token_accuracy": 0.9811451613903046,
"num_tokens": 22547662.0,
"step": 1090
},
{
"entropy": 0.8314657747745514,
"epoch": 1.4804845222072678,
"grad_norm": 0.6800487041473389,
"learning_rate": 9.740491119149277e-05,
"loss": 0.04973709583282471,
"mean_token_accuracy": 0.9797703623771667,
"num_tokens": 22754487.0,
"step": 1100
},
{
"entropy": 0.8429964482784271,
"epoch": 1.493943472409152,
"grad_norm": 0.46988436579704285,
"learning_rate": 9.733367417667773e-05,
"loss": 0.0486497312784195,
"mean_token_accuracy": 0.9810753047466279,
"num_tokens": 22961947.0,
"step": 1110
},
{
"entropy": 0.8371677458286285,
"epoch": 1.5074024226110363,
"grad_norm": 0.4881710410118103,
"learning_rate": 9.726149937112873e-05,
"loss": 0.04747593402862549,
"mean_token_accuracy": 0.9815979599952698,
"num_tokens": 23169342.0,
"step": 1120
},
{
"entropy": 0.8210869550704956,
"epoch": 1.5208613728129206,
"grad_norm": 0.4130207896232605,
"learning_rate": 9.718838820479743e-05,
"loss": 0.049933090806007385,
"mean_token_accuracy": 0.9803601801395416,
"num_tokens": 23376509.0,
"step": 1130
},
{
"entropy": 0.8233297169208527,
"epoch": 1.5343203230148048,
"grad_norm": 0.47688028216362,
"learning_rate": 9.711434212618691e-05,
"loss": 0.049675410985946654,
"mean_token_accuracy": 0.9803059160709381,
"num_tokens": 23582681.0,
"step": 1140
},
{
"entropy": 0.8203558087348938,
"epoch": 1.547779273216689,
"grad_norm": 0.45380693674087524,
"learning_rate": 9.703936260232308e-05,
"loss": 0.04854576587677002,
"mean_token_accuracy": 0.9805228471755981,
"num_tokens": 23789426.0,
"step": 1150
},
{
"entropy": 0.8290607392787933,
"epoch": 1.5612382234185733,
"grad_norm": 0.4461262822151184,
"learning_rate": 9.696345111872557e-05,
"loss": 0.04563935399055481,
"mean_token_accuracy": 0.9819368541240692,
"num_tokens": 23996347.0,
"step": 1160
},
{
"entropy": 0.8161819994449615,
"epoch": 1.5746971736204576,
"grad_norm": 0.46854355931282043,
"learning_rate": 9.688660917937838e-05,
"loss": 0.04675787091255188,
"mean_token_accuracy": 0.98145791888237,
"num_tokens": 24203599.0,
"step": 1170
},
{
"entropy": 0.8060347080230713,
"epoch": 1.5881561238223418,
"grad_norm": 0.41180017590522766,
"learning_rate": 9.68088383066999e-05,
"loss": 0.04404969215393066,
"mean_token_accuracy": 0.9832783460617065,
"num_tokens": 24410522.0,
"step": 1180
},
{
"entropy": 0.8048585295677185,
"epoch": 1.601615074024226,
"grad_norm": 0.491583913564682,
"learning_rate": 9.673014004151292e-05,
"loss": 0.045036160945892335,
"mean_token_accuracy": 0.9821190059185028,
"num_tokens": 24617845.0,
"step": 1190
},
{
"entropy": 0.8028138160705567,
"epoch": 1.6150740242261103,
"grad_norm": 0.4175598919391632,
"learning_rate": 9.665051594301407e-05,
"loss": 0.04619626402854919,
"mean_token_accuracy": 0.9816527247428894,
"num_tokens": 24824416.0,
"step": 1200
},
{
"entropy": 0.8100907325744628,
"epoch": 1.6285329744279946,
"grad_norm": 0.4809361696243286,
"learning_rate": 9.656996758874284e-05,
"loss": 0.04774007797241211,
"mean_token_accuracy": 0.9811590790748597,
"num_tokens": 25031469.0,
"step": 1210
},
{
"entropy": 0.8034710586071014,
"epoch": 1.6419919246298789,
"grad_norm": 0.37044280767440796,
"learning_rate": 9.648849657455044e-05,
"loss": 0.0478478193283081,
"mean_token_accuracy": 0.9812443971633911,
"num_tokens": 25238653.0,
"step": 1220
},
{
"entropy": 0.816223555803299,
"epoch": 1.6554508748317631,
"grad_norm": 0.3459724187850952,
"learning_rate": 9.640610451456811e-05,
"loss": 0.047498634457588194,
"mean_token_accuracy": 0.9809110701084137,
"num_tokens": 25445715.0,
"step": 1230
},
{
"entropy": 0.8094809174537658,
"epoch": 1.6689098250336474,
"grad_norm": 0.4526945650577545,
"learning_rate": 9.632279304117517e-05,
"loss": 0.04413290619850159,
"mean_token_accuracy": 0.9828720092773438,
"num_tokens": 25652990.0,
"step": 1240
},
{
"entropy": 0.8075843989849091,
"epoch": 1.6823687752355316,
"grad_norm": 0.3427794575691223,
"learning_rate": 9.623856380496664e-05,
"loss": 0.04015462398529053,
"mean_token_accuracy": 0.984161913394928,
"num_tokens": 25859529.0,
"step": 1250
},
{
"entropy": 0.7957218885421753,
"epoch": 1.695827725437416,
"grad_norm": 0.4648391902446747,
"learning_rate": 9.615341847472059e-05,
"loss": 0.04236364960670471,
"mean_token_accuracy": 0.9831009984016419,
"num_tokens": 26066638.0,
"step": 1260
},
{
"entropy": 0.7911267936229706,
"epoch": 1.7092866756393001,
"grad_norm": 0.35517409443855286,
"learning_rate": 9.606735873736505e-05,
"loss": 0.04558713436126709,
"mean_token_accuracy": 0.982138192653656,
"num_tokens": 26273299.0,
"step": 1270
},
{
"entropy": 0.7834631562232971,
"epoch": 1.7227456258411844,
"grad_norm": 0.4664056599140167,
"learning_rate": 9.598038629794461e-05,
"loss": 0.04543980956077576,
"mean_token_accuracy": 0.9820871353149414,
"num_tokens": 26480619.0,
"step": 1280
},
{
"entropy": 0.7766066193580627,
"epoch": 1.7362045760430687,
"grad_norm": 0.46915021538734436,
"learning_rate": 9.589250287958657e-05,
"loss": 0.048852354288101196,
"mean_token_accuracy": 0.9804103672504425,
"num_tokens": 26687502.0,
"step": 1290
},
{
"entropy": 0.7885740280151368,
"epoch": 1.749663526244953,
"grad_norm": 0.5565235614776611,
"learning_rate": 9.580371022346693e-05,
"loss": 0.04616362452507019,
"mean_token_accuracy": 0.9811102449893951,
"num_tokens": 26894776.0,
"step": 1300
},
{
"entropy": 0.7828902661800384,
"epoch": 1.7631224764468372,
"grad_norm": 0.5803515911102295,
"learning_rate": 9.571401008877572e-05,
"loss": 0.04636417329311371,
"mean_token_accuracy": 0.9814817011356354,
"num_tokens": 27101361.0,
"step": 1310
},
{
"entropy": 0.7779680609703064,
"epoch": 1.7765814266487214,
"grad_norm": 0.46953341364860535,
"learning_rate": 9.562340425268233e-05,
"loss": 0.043124464154243466,
"mean_token_accuracy": 0.9823956906795501,
"num_tokens": 27308139.0,
"step": 1320
},
{
"entropy": 0.7818998336791992,
"epoch": 1.7900403768506057,
"grad_norm": 0.43444475531578064,
"learning_rate": 9.553189451030019e-05,
"loss": 0.04699342250823975,
"mean_token_accuracy": 0.981617671251297,
"num_tokens": 27515135.0,
"step": 1330
},
{
"entropy": 0.7814963579177856,
"epoch": 1.80349932705249,
"grad_norm": 0.7200784087181091,
"learning_rate": 9.543948267465115e-05,
"loss": 0.04428764283657074,
"mean_token_accuracy": 0.9820694208145142,
"num_tokens": 27722318.0,
"step": 1340
},
{
"entropy": 0.7804880917072297,
"epoch": 1.8169582772543742,
"grad_norm": 0.4098879396915436,
"learning_rate": 9.534617057662977e-05,
"loss": 0.04612208902835846,
"mean_token_accuracy": 0.981588214635849,
"num_tokens": 27929006.0,
"step": 1350
},
{
"entropy": 0.7731973230838776,
"epoch": 1.8304172274562585,
"grad_norm": 0.46128687262535095,
"learning_rate": 9.525196006496679e-05,
"loss": 0.04260898232460022,
"mean_token_accuracy": 0.9835397064685821,
"num_tokens": 28136443.0,
"step": 1360
},
{
"entropy": 0.7819123864173889,
"epoch": 1.8438761776581427,
"grad_norm": 0.41956645250320435,
"learning_rate": 9.515685300619271e-05,
"loss": 0.045637202262878415,
"mean_token_accuracy": 0.981810599565506,
"num_tokens": 28343644.0,
"step": 1370
},
{
"entropy": 0.7694122910499572,
"epoch": 1.857335127860027,
"grad_norm": 0.3839203417301178,
"learning_rate": 9.506085128460065e-05,
"loss": 0.044372576475143435,
"mean_token_accuracy": 0.9824269771575928,
"num_tokens": 28550070.0,
"step": 1380
},
{
"entropy": 0.7737328886985779,
"epoch": 1.8707940780619112,
"grad_norm": 0.3746931254863739,
"learning_rate": 9.496395680220918e-05,
"loss": 0.043450570106506346,
"mean_token_accuracy": 0.9827447474002838,
"num_tokens": 28756798.0,
"step": 1390
},
{
"entropy": 0.783908200263977,
"epoch": 1.8842530282637955,
"grad_norm": 0.4228839874267578,
"learning_rate": 9.486617147872446e-05,
"loss": 0.04549018144607544,
"mean_token_accuracy": 0.9819923400878906,
"num_tokens": 28963808.0,
"step": 1400
},
{
"entropy": 0.7758362650871277,
"epoch": 1.8977119784656797,
"grad_norm": 0.5251613855361938,
"learning_rate": 9.476749725150235e-05,
"loss": 0.04514434039592743,
"mean_token_accuracy": 0.9820655703544616,
"num_tokens": 29170348.0,
"step": 1410
},
{
"entropy": 0.7961021900177002,
"epoch": 1.911170928667564,
"grad_norm": 0.3529505133628845,
"learning_rate": 9.466793607550995e-05,
"loss": 0.044498777389526366,
"mean_token_accuracy": 0.9821643531322479,
"num_tokens": 29377369.0,
"step": 1420
},
{
"entropy": 0.7776929080486298,
"epoch": 1.9246298788694483,
"grad_norm": 0.6512665152549744,
"learning_rate": 9.45674899232869e-05,
"loss": 0.043723279237747194,
"mean_token_accuracy": 0.9822627186775208,
"num_tokens": 29583698.0,
"step": 1430
},
{
"entropy": 0.7827112972736359,
"epoch": 1.9380888290713325,
"grad_norm": 0.42146649956703186,
"learning_rate": 9.446616078490626e-05,
"loss": 0.04473748207092285,
"mean_token_accuracy": 0.9821819305419922,
"num_tokens": 29790624.0,
"step": 1440
},
{
"entropy": 0.7946831583976746,
"epoch": 1.9515477792732168,
"grad_norm": 0.47883015871047974,
"learning_rate": 9.436395066793518e-05,
"loss": 0.04658020734786987,
"mean_token_accuracy": 0.9815436899662018,
"num_tokens": 29997548.0,
"step": 1450
},
{
"entropy": 0.7833279132843017,
"epoch": 1.965006729475101,
"grad_norm": 0.4905496835708618,
"learning_rate": 9.426086159739496e-05,
"loss": 0.04289662837982178,
"mean_token_accuracy": 0.9827987909317016,
"num_tokens": 30203894.0,
"step": 1460
},
{
"entropy": 0.7804967284202575,
"epoch": 1.9784656796769853,
"grad_norm": 0.3333725929260254,
"learning_rate": 9.415689561572107e-05,
"loss": 0.044277578592300415,
"mean_token_accuracy": 0.982471638917923,
"num_tokens": 30410743.0,
"step": 1470
},
{
"entropy": 0.7721736073493958,
"epoch": 1.9919246298788695,
"grad_norm": 0.5358064770698547,
"learning_rate": 9.405205478272267e-05,
"loss": 0.042890912294387816,
"mean_token_accuracy": 0.9828305661678314,
"num_tokens": 30617524.0,
"step": 1480
},
{
"epoch": 2.0,
"eval_entropy": 0.7763412169590118,
"eval_loss": 0.045948393642902374,
"eval_mean_token_accuracy": 0.9816623083345449,
"eval_num_tokens": 30741532.0,
"eval_runtime": 13.9729,
"eval_samples_per_second": 357.835,
"eval_steps_per_second": 11.236,
"step": 1486
},
{
"entropy": 0.7704778194427491,
"epoch": 2.005383580080754,
"grad_norm": 0.4395267367362976,
"learning_rate": 9.394634117554173e-05,
"loss": 0.04141359925270081,
"mean_token_accuracy": 0.9836697518825531,
"num_tokens": 30824156.0,
"step": 1490
},
{
"entropy": 0.7459299206733704,
"epoch": 2.018842530282638,
"grad_norm": 0.4781450629234314,
"learning_rate": 9.38397568886119e-05,
"loss": 0.03474531769752502,
"mean_token_accuracy": 0.9868264079093934,
"num_tokens": 31031057.0,
"step": 1500
},
{
"entropy": 0.7422619521617889,
"epoch": 2.0323014804845223,
"grad_norm": 0.47168222069740295,
"learning_rate": 9.373230403361712e-05,
"loss": 0.03591993451118469,
"mean_token_accuracy": 0.9855878591537476,
"num_tokens": 31238416.0,
"step": 1510
},
{
"entropy": 0.7497646749019623,
"epoch": 2.0457604306864066,
"grad_norm": 0.37677422165870667,
"learning_rate": 9.362398473944958e-05,
"loss": 0.03205449879169464,
"mean_token_accuracy": 0.9876303613185883,
"num_tokens": 31445048.0,
"step": 1520
},
{
"entropy": 0.7456450223922729,
"epoch": 2.059219380888291,
"grad_norm": 0.4814225733280182,
"learning_rate": 9.35148011521677e-05,
"loss": 0.03433309197425842,
"mean_token_accuracy": 0.9864069163799286,
"num_tokens": 31652249.0,
"step": 1530
},
{
"entropy": 0.7520359694957733,
"epoch": 2.072678331090175,
"grad_norm": 0.5464296936988831,
"learning_rate": 9.340475543495364e-05,
"loss": 0.03631590604782105,
"mean_token_accuracy": 0.9858682453632355,
"num_tokens": 31859054.0,
"step": 1540
},
{
"entropy": 0.754530417919159,
"epoch": 2.0861372812920593,
"grad_norm": 0.4758802652359009,
"learning_rate": 9.329384976807023e-05,
"loss": 0.03384111821651459,
"mean_token_accuracy": 0.9866842925548553,
"num_tokens": 32066326.0,
"step": 1550
},
{
"entropy": 0.7495070815086364,
"epoch": 2.0995962314939436,
"grad_norm": 0.4748813211917877,
"learning_rate": 9.318208634881802e-05,
"loss": 0.03490318655967713,
"mean_token_accuracy": 0.9859294176101685,
"num_tokens": 32273034.0,
"step": 1560
},
{
"entropy": 0.7660306870937348,
"epoch": 2.113055181695828,
"grad_norm": 0.41709309816360474,
"learning_rate": 9.306946739149161e-05,
"loss": 0.03640688955783844,
"mean_token_accuracy": 0.9853965878486634,
"num_tokens": 32480411.0,
"step": 1570
},
{
"entropy": 0.7681133329868317,
"epoch": 2.126514131897712,
"grad_norm": 0.5109054446220398,
"learning_rate": 9.29559951273358e-05,
"loss": 0.03681868314743042,
"mean_token_accuracy": 0.9852465927600861,
"num_tokens": 32687538.0,
"step": 1580
},
{
"entropy": 0.7571729898452759,
"epoch": 2.1399730820995964,
"grad_norm": 0.5264601707458496,
"learning_rate": 9.284167180450141e-05,
"loss": 0.035235798358917235,
"mean_token_accuracy": 0.9863873898983002,
"num_tokens": 32894684.0,
"step": 1590
},
{
"entropy": 0.7517239153385162,
"epoch": 2.1534320323014806,
"grad_norm": 0.44705790281295776,
"learning_rate": 9.272649968800069e-05,
"loss": 0.0375944584608078,
"mean_token_accuracy": 0.9852746367454529,
"num_tokens": 33101905.0,
"step": 1600
},
{
"entropy": 0.763954496383667,
"epoch": 2.166890982503365,
"grad_norm": 0.42811813950538635,
"learning_rate": 9.26104810596625e-05,
"loss": 0.03505580127239227,
"mean_token_accuracy": 0.9862452685832978,
"num_tokens": 33308533.0,
"step": 1610
},
{
"entropy": 0.766820102930069,
"epoch": 2.180349932705249,
"grad_norm": 0.4892241656780243,
"learning_rate": 9.249361821808708e-05,
"loss": 0.03619093894958496,
"mean_token_accuracy": 0.9859104573726654,
"num_tokens": 33515373.0,
"step": 1620
},
{
"entropy": 0.7709326684474945,
"epoch": 2.1938088829071334,
"grad_norm": 0.5112985372543335,
"learning_rate": 9.237591347860052e-05,
"loss": 0.03710994720458984,
"mean_token_accuracy": 0.9851432621479035,
"num_tokens": 33722411.0,
"step": 1630
},
{
"entropy": 0.7663405299186706,
"epoch": 2.2072678331090176,
"grad_norm": 0.4687499403953552,
"learning_rate": 9.225736917320886e-05,
"loss": 0.034553620219230655,
"mean_token_accuracy": 0.986689567565918,
"num_tokens": 33929078.0,
"step": 1640
},
{
"entropy": 0.7591994524002075,
"epoch": 2.220726783310902,
"grad_norm": 0.4145253598690033,
"learning_rate": 9.213798765055187e-05,
"loss": 0.0364462822675705,
"mean_token_accuracy": 0.9857377469539642,
"num_tokens": 34136005.0,
"step": 1650
},
{
"entropy": 0.7645731925964355,
"epoch": 2.234185733512786,
"grad_norm": 0.41732943058013916,
"learning_rate": 9.20177712758566e-05,
"loss": 0.034900492429733275,
"mean_token_accuracy": 0.9864438831806183,
"num_tokens": 34342605.0,
"step": 1660
},
{
"entropy": 0.769444328546524,
"epoch": 2.2476446837146704,
"grad_norm": 0.5293087959289551,
"learning_rate": 9.189672243089046e-05,
"loss": 0.03702518343925476,
"mean_token_accuracy": 0.9857267081737519,
"num_tokens": 34549514.0,
"step": 1670
},
{
"entropy": 0.7718922853469848,
"epoch": 2.2611036339165547,
"grad_norm": 0.469811350107193,
"learning_rate": 9.177484351391402e-05,
"loss": 0.03630037009716034,
"mean_token_accuracy": 0.9853004455566406,
"num_tokens": 34756057.0,
"step": 1680
},
{
"entropy": 0.7716727793216706,
"epoch": 2.274562584118439,
"grad_norm": 0.6152373552322388,
"learning_rate": 9.165213693963355e-05,
"loss": 0.033915793895721434,
"mean_token_accuracy": 0.9866912722587585,
"num_tokens": 34962988.0,
"step": 1690
},
{
"entropy": 0.7670650899410247,
"epoch": 2.288021534320323,
"grad_norm": 0.7092661261558533,
"learning_rate": 9.152860513915314e-05,
"loss": 0.03407727777957916,
"mean_token_accuracy": 0.9864597499370575,
"num_tokens": 35169933.0,
"step": 1700
},
{
"entropy": 0.7746156573295593,
"epoch": 2.3014804845222074,
"grad_norm": 0.4469820261001587,
"learning_rate": 9.140425055992648e-05,
"loss": 0.03753764033317566,
"mean_token_accuracy": 0.9847545504570008,
"num_tokens": 35376647.0,
"step": 1710
},
{
"entropy": 0.7733055353164673,
"epoch": 2.3149394347240917,
"grad_norm": 0.46848854422569275,
"learning_rate": 9.127907566570853e-05,
"loss": 0.033966490626335145,
"mean_token_accuracy": 0.9865934252738953,
"num_tokens": 35583818.0,
"step": 1720
},
{
"entropy": 0.7706968247890472,
"epoch": 2.328398384925976,
"grad_norm": 0.5058269500732422,
"learning_rate": 9.115308293650653e-05,
"loss": 0.033354413509368894,
"mean_token_accuracy": 0.9870194375514985,
"num_tokens": 35791234.0,
"step": 1730
},
{
"entropy": 0.7614737451076508,
"epoch": 2.34185733512786,
"grad_norm": 0.3979508578777313,
"learning_rate": 9.102627486853099e-05,
"loss": 0.03707956075668335,
"mean_token_accuracy": 0.9849399268627167,
"num_tokens": 35997886.0,
"step": 1740
},
{
"entropy": 0.7629947364330292,
"epoch": 2.3553162853297445,
"grad_norm": 0.5079808831214905,
"learning_rate": 9.089865397414614e-05,
"loss": 0.03364085555076599,
"mean_token_accuracy": 0.9865766227245331,
"num_tokens": 36204490.0,
"step": 1750
},
{
"entropy": 0.7689139068126678,
"epoch": 2.3687752355316287,
"grad_norm": 0.5184679627418518,
"learning_rate": 9.077022278182024e-05,
"loss": 0.03795020878314972,
"mean_token_accuracy": 0.9854429662227631,
"num_tokens": 36411233.0,
"step": 1760
},
{
"entropy": 0.7789969980716706,
"epoch": 2.382234185733513,
"grad_norm": 0.4910341203212738,
"learning_rate": 9.064098383607545e-05,
"loss": 0.036249291896820066,
"mean_token_accuracy": 0.9861413240432739,
"num_tokens": 36618763.0,
"step": 1770
},
{
"entropy": 0.7814954161643982,
"epoch": 2.3956931359353972,
"grad_norm": 0.5686230659484863,
"learning_rate": 9.051093969743738e-05,
"loss": 0.036424264311790466,
"mean_token_accuracy": 0.9857628643512726,
"num_tokens": 36825280.0,
"step": 1780
},
{
"entropy": 0.7767263531684876,
"epoch": 2.409152086137281,
"grad_norm": 0.453492134809494,
"learning_rate": 9.03800929423844e-05,
"loss": 0.03373002707958221,
"mean_token_accuracy": 0.9867543041706085,
"num_tokens": 37032080.0,
"step": 1790
},
{
"entropy": 0.7687392771244049,
"epoch": 2.4226110363391653,
"grad_norm": 0.558427631855011,
"learning_rate": 9.024844616329662e-05,
"loss": 0.03517001271247864,
"mean_token_accuracy": 0.9863903641700744,
"num_tokens": 37238601.0,
"step": 1800
},
{
"entropy": 0.7732832372188568,
"epoch": 2.4360699865410496,
"grad_norm": 0.5509903430938721,
"learning_rate": 9.011600196840447e-05,
"loss": 0.036943814158439635,
"mean_token_accuracy": 0.9852350294589997,
"num_tokens": 37445537.0,
"step": 1810
},
{
"entropy": 0.7711590051651,
"epoch": 2.449528936742934,
"grad_norm": 0.4789018929004669,
"learning_rate": 8.998276298173707e-05,
"loss": 0.03290043473243713,
"mean_token_accuracy": 0.986678397655487,
"num_tokens": 37652275.0,
"step": 1820
},
{
"entropy": 0.7612685561180115,
"epoch": 2.462987886944818,
"grad_norm": 0.4346236288547516,
"learning_rate": 8.984873184307017e-05,
"loss": 0.034582901000976565,
"mean_token_accuracy": 0.986914598941803,
"num_tokens": 37858877.0,
"step": 1830
},
{
"entropy": 0.756990659236908,
"epoch": 2.4764468371467023,
"grad_norm": 0.49942365288734436,
"learning_rate": 8.971391120787397e-05,
"loss": 0.03583601713180542,
"mean_token_accuracy": 0.9852246820926667,
"num_tokens": 38065791.0,
"step": 1840
},
{
"entropy": 0.7551932036876678,
"epoch": 2.4899057873485866,
"grad_norm": 0.6017034649848938,
"learning_rate": 8.957830374726042e-05,
"loss": 0.0383810430765152,
"mean_token_accuracy": 0.9845924258232117,
"num_tokens": 38272316.0,
"step": 1850
},
{
"entropy": 0.7488674581050873,
"epoch": 2.503364737550471,
"grad_norm": 0.40909528732299805,
"learning_rate": 8.944191214793028e-05,
"loss": 0.03308407068252563,
"mean_token_accuracy": 0.9869977355003356,
"num_tokens": 38479157.0,
"step": 1860
},
{
"entropy": 0.7536681890487671,
"epoch": 2.516823687752355,
"grad_norm": 0.6133081912994385,
"learning_rate": 8.930473911212e-05,
"loss": 0.036927449703216556,
"mean_token_accuracy": 0.9848987877368927,
"num_tokens": 38685562.0,
"step": 1870
},
{
"entropy": 0.7497753620147705,
"epoch": 2.5302826379542394,
"grad_norm": 0.4397526681423187,
"learning_rate": 8.916678735754809e-05,
"loss": 0.03293933868408203,
"mean_token_accuracy": 0.9865592002868653,
"num_tokens": 38892182.0,
"step": 1880
},
{
"entropy": 0.768702232837677,
"epoch": 2.5437415881561236,
"grad_norm": 0.35570991039276123,
"learning_rate": 8.902805961736123e-05,
"loss": 0.03172276020050049,
"mean_token_accuracy": 0.9870541751384735,
"num_tokens": 39098643.0,
"step": 1890
},
{
"entropy": 0.7794657349586487,
"epoch": 2.557200538358008,
"grad_norm": 0.39434006810188293,
"learning_rate": 8.88885586400803e-05,
"loss": 0.032149982452392575,
"mean_token_accuracy": 0.9873526275157929,
"num_tokens": 39305419.0,
"step": 1900
},
{
"entropy": 0.7983768939971924,
"epoch": 2.570659488559892,
"grad_norm": 0.4141407907009125,
"learning_rate": 8.874828718954576e-05,
"loss": 0.03556913733482361,
"mean_token_accuracy": 0.986035841703415,
"num_tokens": 39512111.0,
"step": 1910
},
{
"entropy": 0.8069267928600311,
"epoch": 2.5841184387617764,
"grad_norm": 0.5249026417732239,
"learning_rate": 8.86072480448629e-05,
"loss": 0.035189300775527954,
"mean_token_accuracy": 0.9859413146972656,
"num_tokens": 39718796.0,
"step": 1920
},
{
"entropy": 0.8054641425609589,
"epoch": 2.5975773889636606,
"grad_norm": 0.341441810131073,
"learning_rate": 8.84654440003469e-05,
"loss": 0.035634788870811465,
"mean_token_accuracy": 0.9858911037445068,
"num_tokens": 39925706.0,
"step": 1930
},
{
"entropy": 0.791653448343277,
"epoch": 2.611036339165545,
"grad_norm": 0.4683961272239685,
"learning_rate": 8.83228778654674e-05,
"loss": 0.031418097019195554,
"mean_token_accuracy": 0.9873055160045624,
"num_tokens": 40132455.0,
"step": 1940
},
{
"entropy": 0.7851328790187836,
"epoch": 2.624495289367429,
"grad_norm": 0.5501745343208313,
"learning_rate": 8.817955246479276e-05,
"loss": 0.03469682037830353,
"mean_token_accuracy": 0.9861454546451569,
"num_tokens": 40339095.0,
"step": 1950
},
{
"entropy": 0.7846890926361084,
"epoch": 2.6379542395693134,
"grad_norm": 0.46256187558174133,
"learning_rate": 8.803547063793422e-05,
"loss": 0.03551913499832153,
"mean_token_accuracy": 0.9861681580543518,
"num_tokens": 40545821.0,
"step": 1960
},
{
"entropy": 0.7767995774745942,
"epoch": 2.6514131897711977,
"grad_norm": 0.3970808684825897,
"learning_rate": 8.789063523948958e-05,
"loss": 0.031726115942001344,
"mean_token_accuracy": 0.9874750375747681,
"num_tokens": 40753075.0,
"step": 1970
},
{
"entropy": 0.771297287940979,
"epoch": 2.664872139973082,
"grad_norm": 0.5771340727806091,
"learning_rate": 8.774504913898663e-05,
"loss": 0.033945786952972415,
"mean_token_accuracy": 0.9866833508014679,
"num_tokens": 40960109.0,
"step": 1980
},
{
"entropy": 0.7735596299171448,
"epoch": 2.678331090174966,
"grad_norm": 0.5560683012008667,
"learning_rate": 8.75987152208264e-05,
"loss": 0.033962175250053406,
"mean_token_accuracy": 0.9866875231266021,
"num_tokens": 41167093.0,
"step": 1990
},
{
"entropy": 0.769331830739975,
"epoch": 2.6917900403768504,
"grad_norm": 0.4328141510486603,
"learning_rate": 8.745163638422583e-05,
"loss": 0.03420340120792389,
"mean_token_accuracy": 0.986853563785553,
"num_tokens": 41373741.0,
"step": 2000
},
{
"entropy": 0.7721189796924591,
"epoch": 2.7052489905787347,
"grad_norm": 0.5219032764434814,
"learning_rate": 8.730381554316051e-05,
"loss": 0.03331426382064819,
"mean_token_accuracy": 0.9866217851638794,
"num_tokens": 41580817.0,
"step": 2010
},
{
"entropy": 0.7982444167137146,
"epoch": 2.718707940780619,
"grad_norm": 0.4722937345504761,
"learning_rate": 8.715525562630687e-05,
"loss": 0.034999901056289674,
"mean_token_accuracy": 0.9859172761440277,
"num_tokens": 41787433.0,
"step": 2020
},
{
"entropy": 0.796376746892929,
"epoch": 2.732166890982503,
"grad_norm": 0.4517671465873718,
"learning_rate": 8.700595957698411e-05,
"loss": 0.03431849479675293,
"mean_token_accuracy": 0.9859098434448242,
"num_tokens": 41994291.0,
"step": 2030
},
{
"entropy": 0.7885730385780334,
"epoch": 2.7456258411843875,
"grad_norm": 0.4771164357662201,
"learning_rate": 8.685593035309598e-05,
"loss": 0.03396539688110352,
"mean_token_accuracy": 0.9864761888980865,
"num_tokens": 42200986.0,
"step": 2040
},
{
"entropy": 0.7897965848445893,
"epoch": 2.7590847913862717,
"grad_norm": 0.35293588042259216,
"learning_rate": 8.670517092707213e-05,
"loss": 0.03421284556388855,
"mean_token_accuracy": 0.9866907477378846,
"num_tokens": 42407972.0,
"step": 2050
},
{
"entropy": 0.7963090658187866,
"epoch": 2.772543741588156,
"grad_norm": 0.480851948261261,
"learning_rate": 8.655368428580919e-05,
"loss": 0.03702861964702606,
"mean_token_accuracy": 0.9853508174419403,
"num_tokens": 42615046.0,
"step": 2060
},
{
"entropy": 0.7992978096008301,
"epoch": 2.7860026917900402,
"grad_norm": 0.5696701407432556,
"learning_rate": 8.640147343061165e-05,
"loss": 0.034851402044296265,
"mean_token_accuracy": 0.9860165119171143,
"num_tokens": 42821882.0,
"step": 2070
},
{
"entropy": 0.7952683210372925,
"epoch": 2.7994616419919245,
"grad_norm": 0.4716683030128479,
"learning_rate": 8.624854137713234e-05,
"loss": 0.03539964556694031,
"mean_token_accuracy": 0.9860508978366852,
"num_tokens": 43028269.0,
"step": 2080
},
{
"entropy": 0.787241518497467,
"epoch": 2.8129205921938087,
"grad_norm": 0.6041930913925171,
"learning_rate": 8.609489115531278e-05,
"loss": 0.033989882469177245,
"mean_token_accuracy": 0.9860147058963775,
"num_tokens": 43235021.0,
"step": 2090
},
{
"entropy": 0.7926058828830719,
"epoch": 2.826379542395693,
"grad_norm": 0.5247143507003784,
"learning_rate": 8.594052580932301e-05,
"loss": 0.03367411494255066,
"mean_token_accuracy": 0.9865197122097016,
"num_tokens": 43442265.0,
"step": 2100
},
{
"entropy": 0.7915597558021545,
"epoch": 2.8398384925975773,
"grad_norm": 0.48869433999061584,
"learning_rate": 8.578544839750141e-05,
"loss": 0.03161753416061401,
"mean_token_accuracy": 0.9873407542705536,
"num_tokens": 43648881.0,
"step": 2110
},
{
"entropy": 0.7891027152538299,
"epoch": 2.8532974427994615,
"grad_norm": 0.6239877939224243,
"learning_rate": 8.562966199229399e-05,
"loss": 0.03200874626636505,
"mean_token_accuracy": 0.9873006939888,
"num_tokens": 43855736.0,
"step": 2120
},
{
"entropy": 0.7923749804496765,
"epoch": 2.8667563930013458,
"grad_norm": 0.42834678292274475,
"learning_rate": 8.547316968019363e-05,
"loss": 0.03362695872783661,
"mean_token_accuracy": 0.9866919219493866,
"num_tokens": 44062736.0,
"step": 2130
},
{
"entropy": 0.7924418032169342,
"epoch": 2.88021534320323,
"grad_norm": 0.41659072041511536,
"learning_rate": 8.531597456167885e-05,
"loss": 0.032884901762008666,
"mean_token_accuracy": 0.9865517973899841,
"num_tokens": 44269673.0,
"step": 2140
},
{
"entropy": 0.7898743569850921,
"epoch": 2.8936742934051143,
"grad_norm": 0.5111353397369385,
"learning_rate": 8.515807975115239e-05,
"loss": 0.032617968320846555,
"mean_token_accuracy": 0.9871149003505707,
"num_tokens": 44477238.0,
"step": 2150
},
{
"entropy": 0.8053750157356262,
"epoch": 2.9071332436069985,
"grad_norm": 0.5145390033721924,
"learning_rate": 8.499948837687959e-05,
"loss": 0.035998404026031494,
"mean_token_accuracy": 0.985955685377121,
"num_tokens": 44684355.0,
"step": 2160
},
{
"entropy": 0.8018061518669128,
"epoch": 2.920592193808883,
"grad_norm": 0.5930307507514954,
"learning_rate": 8.484020358092625e-05,
"loss": 0.03492251336574555,
"mean_token_accuracy": 0.9859745800495148,
"num_tokens": 44891085.0,
"step": 2170
},
{
"entropy": 0.7959860146045685,
"epoch": 2.934051144010767,
"grad_norm": 0.527142345905304,
"learning_rate": 8.468022851909657e-05,
"loss": 0.032577240467071535,
"mean_token_accuracy": 0.9865381300449372,
"num_tokens": 45098305.0,
"step": 2180
},
{
"entropy": 0.797534954547882,
"epoch": 2.9475100942126513,
"grad_norm": 0.3939096927642822,
"learning_rate": 8.451956636087046e-05,
"loss": 0.03261285424232483,
"mean_token_accuracy": 0.9867972791194916,
"num_tokens": 45305134.0,
"step": 2190
},
{
"entropy": 0.7963755249977111,
"epoch": 2.9609690444145356,
"grad_norm": 0.37216880917549133,
"learning_rate": 8.435822028934087e-05,
"loss": 0.033119088411331175,
"mean_token_accuracy": 0.987492960691452,
"num_tokens": 45512139.0,
"step": 2200
},
{
"entropy": 0.7935790359973908,
"epoch": 2.97442799461642,
"grad_norm": 0.5378938317298889,
"learning_rate": 8.41961935011506e-05,
"loss": 0.033530765771865846,
"mean_token_accuracy": 0.986775541305542,
"num_tokens": 45718897.0,
"step": 2210
},
{
"entropy": 0.7964731454849243,
"epoch": 2.987886944818304,
"grad_norm": 0.5635746121406555,
"learning_rate": 8.403348920642911e-05,
"loss": 0.03444778919219971,
"mean_token_accuracy": 0.9864307165145874,
"num_tokens": 45925982.0,
"step": 2220
},
{
"epoch": 3.0,
"eval_entropy": 0.7897109795527854,
"eval_loss": 0.045999836176633835,
"eval_mean_token_accuracy": 0.9822407535686615,
"eval_num_tokens": 46112342.0,
"eval_runtime": 13.9838,
"eval_samples_per_second": 357.557,
"eval_steps_per_second": 11.227,
"step": 2229
},
{
"entropy": 0.7922163069248199,
"epoch": 3.0013458950201883,
"grad_norm": 0.5085620284080505,
"learning_rate": 8.387011062872883e-05,
"loss": 0.03196645081043244,
"mean_token_accuracy": 0.9873932301998138,
"num_tokens": 46133123.0,
"step": 2230
},
{
"entropy": 0.7857943475246429,
"epoch": 3.0148048452220726,
"grad_norm": 0.7587371468544006,
"learning_rate": 8.370606100496128e-05,
"loss": 0.024139750003814697,
"mean_token_accuracy": 0.9905063509941101,
"num_tokens": 46339878.0,
"step": 2240
},
{
"entropy": 0.7751803815364837,
"epoch": 3.028263795423957,
"grad_norm": 0.648463249206543,
"learning_rate": 8.354134358533301e-05,
"loss": 0.027016639709472656,
"mean_token_accuracy": 0.9895779073238373,
"num_tokens": 46546959.0,
"step": 2250
},
{
"entropy": 0.7765661358833313,
"epoch": 3.041722745625841,
"grad_norm": 0.5772325992584229,
"learning_rate": 8.337596163328114e-05,
"loss": 0.023444092273712157,
"mean_token_accuracy": 0.9911140978336335,
"num_tokens": 46753901.0,
"step": 2260
},
{
"entropy": 0.7815487205982208,
"epoch": 3.0551816958277254,
"grad_norm": 0.5210555791854858,
"learning_rate": 8.320991842540875e-05,
"loss": 0.025440862774848937,
"mean_token_accuracy": 0.9899839282035827,
"num_tokens": 46960507.0,
"step": 2270
},
{
"entropy": 0.7857287228107452,
"epoch": 3.0686406460296096,
"grad_norm": 0.41927286982536316,
"learning_rate": 8.304321725141995e-05,
"loss": 0.024293258786201477,
"mean_token_accuracy": 0.9906916856765747,
"num_tokens": 47167445.0,
"step": 2280
},
{
"entropy": 0.7888360917568207,
"epoch": 3.082099596231494,
"grad_norm": 0.4237120747566223,
"learning_rate": 8.287586141405464e-05,
"loss": 0.025221824645996094,
"mean_token_accuracy": 0.99046990275383,
"num_tokens": 47374418.0,
"step": 2290
},
{
"entropy": 0.7833469033241272,
"epoch": 3.095558546433378,
"grad_norm": 0.4873560070991516,
"learning_rate": 8.27078542290232e-05,
"loss": 0.025454607605934144,
"mean_token_accuracy": 0.9896302163600922,
"num_tokens": 47580519.0,
"step": 2300
},
{
"entropy": 0.7796760380268097,
"epoch": 3.1090174966352624,
"grad_norm": 0.5142725110054016,
"learning_rate": 8.253919902494071e-05,
"loss": 0.026190632581710817,
"mean_token_accuracy": 0.9899985671043396,
"num_tokens": 47786773.0,
"step": 2310
},
{
"entropy": 0.7891262769699097,
"epoch": 3.1224764468371466,
"grad_norm": 0.46983572840690613,
"learning_rate": 8.236989914326101e-05,
"loss": 0.02558264136314392,
"mean_token_accuracy": 0.9899591863155365,
"num_tokens": 47993610.0,
"step": 2320
},
{
"entropy": 0.7877647399902343,
"epoch": 3.135935397039031,
"grad_norm": 0.3744112253189087,
"learning_rate": 8.21999579382105e-05,
"loss": 0.026258507370948793,
"mean_token_accuracy": 0.9900372087955475,
"num_tokens": 48200562.0,
"step": 2330
},
{
"entropy": 0.7951374232769013,
"epoch": 3.149394347240915,
"grad_norm": 0.430062472820282,
"learning_rate": 8.202937877672175e-05,
"loss": 0.025594598054885863,
"mean_token_accuracy": 0.9902561128139495,
"num_tokens": 48406674.0,
"step": 2340
},
{
"entropy": 0.7907680094242096,
"epoch": 3.1628532974427994,
"grad_norm": 0.4720000624656677,
"learning_rate": 8.185816503836665e-05,
"loss": 0.02532147169113159,
"mean_token_accuracy": 0.9900774240493775,
"num_tokens": 48613281.0,
"step": 2350
},
{
"entropy": 0.7871745467185974,
"epoch": 3.1763122476446837,
"grad_norm": 0.5302978157997131,
"learning_rate": 8.168632011528961e-05,
"loss": 0.024092340469360353,
"mean_token_accuracy": 0.9907279312610626,
"num_tokens": 48820645.0,
"step": 2360
},
{
"entropy": 0.7865763783454895,
"epoch": 3.189771197846568,
"grad_norm": 0.40404385328292847,
"learning_rate": 8.15138474121403e-05,
"loss": 0.026713091135025024,
"mean_token_accuracy": 0.9897278249263763,
"num_tokens": 49028272.0,
"step": 2370
},
{
"entropy": 0.7893508970737457,
"epoch": 3.203230148048452,
"grad_norm": 0.5449444651603699,
"learning_rate": 8.134075034600609e-05,
"loss": 0.02622288465499878,
"mean_token_accuracy": 0.9898015022277832,
"num_tokens": 49235527.0,
"step": 2380
},
{
"entropy": 0.7904633581638336,
"epoch": 3.2166890982503364,
"grad_norm": 0.5542995929718018,
"learning_rate": 8.116703234634453e-05,
"loss": 0.026536452770233154,
"mean_token_accuracy": 0.9896604716777802,
"num_tokens": 49442369.0,
"step": 2390
},
{
"entropy": 0.7914480745792389,
"epoch": 3.2301480484522207,
"grad_norm": 0.37255948781967163,
"learning_rate": 8.099269685491528e-05,
"loss": 0.027568697929382324,
"mean_token_accuracy": 0.9894173204898834,
"num_tokens": 49649512.0,
"step": 2400
},
{
"entropy": 0.7816898763179779,
"epoch": 3.243606998654105,
"grad_norm": 0.4851721525192261,
"learning_rate": 8.081774732571196e-05,
"loss": 0.023995181918144225,
"mean_token_accuracy": 0.9908584892749787,
"num_tokens": 49856522.0,
"step": 2410
},
{
"entropy": 0.7849020719528198,
"epoch": 3.257065948855989,
"grad_norm": 0.5768998265266418,
"learning_rate": 8.06421872248937e-05,
"loss": 0.02669338583946228,
"mean_token_accuracy": 0.9900073111057281,
"num_tokens": 50062538.0,
"step": 2420
},
{
"entropy": 0.7905935168266296,
"epoch": 3.2705248990578735,
"grad_norm": 0.49046942591667175,
"learning_rate": 8.046602003071648e-05,
"loss": 0.026200637221336365,
"mean_token_accuracy": 0.9896903157234191,
"num_tokens": 50269493.0,
"step": 2430
},
{
"entropy": 0.7887889266014099,
"epoch": 3.2839838492597577,
"grad_norm": 0.6130094528198242,
"learning_rate": 8.028924923346426e-05,
"loss": 0.02587394416332245,
"mean_token_accuracy": 0.9894962787628174,
"num_tokens": 50476453.0,
"step": 2440
},
{
"entropy": 0.7816741824150085,
"epoch": 3.297442799461642,
"grad_norm": 0.47344231605529785,
"learning_rate": 8.011187833537972e-05,
"loss": 0.025678065419197083,
"mean_token_accuracy": 0.9897482633590698,
"num_tokens": 50683361.0,
"step": 2450
},
{
"entropy": 0.7746505677700043,
"epoch": 3.3109017496635262,
"grad_norm": 0.46171092987060547,
"learning_rate": 7.993391085059502e-05,
"loss": 0.026066750288009644,
"mean_token_accuracy": 0.9896942138671875,
"num_tokens": 50890676.0,
"step": 2460
},
{
"entropy": 0.7777635037899018,
"epoch": 3.3243606998654105,
"grad_norm": 0.48620864748954773,
"learning_rate": 7.975535030506203e-05,
"loss": 0.02398068457841873,
"mean_token_accuracy": 0.9904903531074524,
"num_tokens": 51097653.0,
"step": 2470
},
{
"entropy": 0.7784659445285798,
"epoch": 3.3378196500672948,
"grad_norm": 0.6068832874298096,
"learning_rate": 7.957620023648256e-05,
"loss": 0.02655583620071411,
"mean_token_accuracy": 0.989698976278305,
"num_tokens": 51304259.0,
"step": 2480
},
{
"entropy": 0.776263278722763,
"epoch": 3.351278600269179,
"grad_norm": 0.3608818054199219,
"learning_rate": 7.939646419423826e-05,
"loss": 0.025072038173675537,
"mean_token_accuracy": 0.9902963459491729,
"num_tokens": 51510973.0,
"step": 2490
},
{
"entropy": 0.7764017939567566,
"epoch": 3.3647375504710633,
"grad_norm": 0.4125957489013672,
"learning_rate": 7.92161457393203e-05,
"loss": 0.0256451278924942,
"mean_token_accuracy": 0.990142571926117,
"num_tokens": 51717824.0,
"step": 2500
},
{
"entropy": 0.7756951570510864,
"epoch": 3.3781965006729475,
"grad_norm": 0.4129035770893097,
"learning_rate": 7.903524844425878e-05,
"loss": 0.025212505459785463,
"mean_token_accuracy": 0.9900435447692871,
"num_tokens": 51924442.0,
"step": 2510
},
{
"entropy": 0.7646311402320862,
"epoch": 3.391655450874832,
"grad_norm": 0.47579509019851685,
"learning_rate": 7.885377589305197e-05,
"loss": 0.02700415551662445,
"mean_token_accuracy": 0.9895658552646637,
"num_tokens": 52131314.0,
"step": 2520
},
{
"entropy": 0.7609292089939117,
"epoch": 3.405114401076716,
"grad_norm": 0.4114918112754822,
"learning_rate": 7.867173168109534e-05,
"loss": 0.024971812963485718,
"mean_token_accuracy": 0.9904350519180298,
"num_tokens": 52337910.0,
"step": 2530
},
{
"entropy": 0.7537091553211213,
"epoch": 3.4185733512786003,
"grad_norm": 0.3902633786201477,
"learning_rate": 7.84891194151103e-05,
"loss": 0.02551236152648926,
"mean_token_accuracy": 0.990143883228302,
"num_tokens": 52545485.0,
"step": 2540
},
{
"entropy": 0.7640065968036651,
"epoch": 3.4320323014804845,
"grad_norm": 0.5460127592086792,
"learning_rate": 7.830594271307267e-05,
"loss": 0.029222273826599122,
"mean_token_accuracy": 0.9886435449123383,
"num_tokens": 52751594.0,
"step": 2550
},
{
"entropy": 0.763840240240097,
"epoch": 3.445491251682369,
"grad_norm": 0.35068443417549133,
"learning_rate": 7.812220520414115e-05,
"loss": 0.024993129074573517,
"mean_token_accuracy": 0.9905041277408599,
"num_tokens": 52958485.0,
"step": 2560
},
{
"entropy": 0.764543867111206,
"epoch": 3.458950201884253,
"grad_norm": 0.38305431604385376,
"learning_rate": 7.793791052858528e-05,
"loss": 0.024963854253292082,
"mean_token_accuracy": 0.9902342796325684,
"num_tokens": 53165775.0,
"step": 2570
},
{
"entropy": 0.7618798077106476,
"epoch": 3.4724091520861373,
"grad_norm": 0.3988240659236908,
"learning_rate": 7.775306233771343e-05,
"loss": 0.025615721940994263,
"mean_token_accuracy": 0.9900647640228272,
"num_tokens": 53373241.0,
"step": 2580
},
{
"entropy": 0.7694470942020416,
"epoch": 3.4858681022880216,
"grad_norm": 0.40997764468193054,
"learning_rate": 7.756766429380033e-05,
"loss": 0.02432440221309662,
"mean_token_accuracy": 0.9905587792396545,
"num_tokens": 53580570.0,
"step": 2590
},
{
"entropy": 0.7734194874763489,
"epoch": 3.499327052489906,
"grad_norm": 0.5883393883705139,
"learning_rate": 7.738172007001465e-05,
"loss": 0.0254077672958374,
"mean_token_accuracy": 0.9899859607219696,
"num_tokens": 53786644.0,
"step": 2600
},
{
"entropy": 0.770265918970108,
"epoch": 3.51278600269179,
"grad_norm": 0.5596480965614319,
"learning_rate": 7.719523335034612e-05,
"loss": 0.025699737668037414,
"mean_token_accuracy": 0.9896622657775879,
"num_tokens": 53993906.0,
"step": 2610
},
{
"entropy": 0.7764162719249725,
"epoch": 3.5262449528936743,
"grad_norm": 0.5137752294540405,
"learning_rate": 7.70082078295326e-05,
"loss": 0.02807466983795166,
"mean_token_accuracy": 0.9887792706489563,
"num_tokens": 54200102.0,
"step": 2620
},
{
"entropy": 0.7647269427776336,
"epoch": 3.5397039030955586,
"grad_norm": 0.46396756172180176,
"learning_rate": 7.682064721298683e-05,
"loss": 0.026442068815231323,
"mean_token_accuracy": 0.9896844685077667,
"num_tokens": 54406930.0,
"step": 2630
},
{
"entropy": 0.7612848520278931,
"epoch": 3.553162853297443,
"grad_norm": 0.45611244440078735,
"learning_rate": 7.663255521672308e-05,
"loss": 0.025213491916656495,
"mean_token_accuracy": 0.990059632062912,
"num_tokens": 54613415.0,
"step": 2640
},
{
"entropy": 0.7642963767051697,
"epoch": 3.566621803499327,
"grad_norm": 0.40814533829689026,
"learning_rate": 7.64439355672835e-05,
"loss": 0.025692886114120482,
"mean_token_accuracy": 0.9896343588829041,
"num_tokens": 54820508.0,
"step": 2650
},
{
"entropy": 0.7696209371089935,
"epoch": 3.5800807537012114,
"grad_norm": 0.5956735014915466,
"learning_rate": 7.625479200166425e-05,
"loss": 0.024135774374008177,
"mean_token_accuracy": 0.990184074640274,
"num_tokens": 55027129.0,
"step": 2660
},
{
"entropy": 0.7715358138084412,
"epoch": 3.5935397039030956,
"grad_norm": 0.4354693293571472,
"learning_rate": 7.606512826724155e-05,
"loss": 0.026834815740585327,
"mean_token_accuracy": 0.9893686950206757,
"num_tokens": 55233486.0,
"step": 2670
},
{
"entropy": 0.7687399387359619,
"epoch": 3.60699865410498,
"grad_norm": 0.4370516836643219,
"learning_rate": 7.587494812169728e-05,
"loss": 0.02458384484052658,
"mean_token_accuracy": 0.9904316484928131,
"num_tokens": 55440146.0,
"step": 2680
},
{
"entropy": 0.7663004577159882,
"epoch": 3.620457604306864,
"grad_norm": 0.4597774147987366,
"learning_rate": 7.568425533294476e-05,
"loss": 0.02627093195915222,
"mean_token_accuracy": 0.9895556032657623,
"num_tokens": 55646688.0,
"step": 2690
},
{
"entropy": 0.763479220867157,
"epoch": 3.6339165545087484,
"grad_norm": 0.6173406839370728,
"learning_rate": 7.549305367905385e-05,
"loss": 0.02632114291191101,
"mean_token_accuracy": 0.9895108163356781,
"num_tokens": 55854045.0,
"step": 2700
},
{
"entropy": 0.7665097296237946,
"epoch": 3.6473755047106327,
"grad_norm": 0.6147279143333435,
"learning_rate": 7.53013469481763e-05,
"loss": 0.026433029770851137,
"mean_token_accuracy": 0.9895636439323425,
"num_tokens": 56060469.0,
"step": 2710
},
{
"entropy": 0.7681903898715973,
"epoch": 3.660834454912517,
"grad_norm": 0.6082019209861755,
"learning_rate": 7.510913893847058e-05,
"loss": 0.024667418003082274,
"mean_token_accuracy": 0.9905052244663238,
"num_tokens": 56267217.0,
"step": 2720
},
{
"entropy": 0.7673894047737122,
"epoch": 3.674293405114401,
"grad_norm": 0.5873395800590515,
"learning_rate": 7.491643345802667e-05,
"loss": 0.026074895262718202,
"mean_token_accuracy": 0.9895181894302368,
"num_tokens": 56473879.0,
"step": 2730
},
{
"entropy": 0.76564120054245,
"epoch": 3.6877523553162854,
"grad_norm": 0.39172571897506714,
"learning_rate": 7.472323432479062e-05,
"loss": 0.02590138018131256,
"mean_token_accuracy": 0.9894755899906158,
"num_tokens": 56680576.0,
"step": 2740
},
{
"entropy": 0.7723122954368591,
"epoch": 3.7012113055181697,
"grad_norm": 0.45653218030929565,
"learning_rate": 7.452954536648888e-05,
"loss": 0.02483226954936981,
"mean_token_accuracy": 0.98988236784935,
"num_tokens": 56887721.0,
"step": 2750
},
{
"entropy": 0.7695056736469269,
"epoch": 3.714670255720054,
"grad_norm": 0.522171139717102,
"learning_rate": 7.433537042055248e-05,
"loss": 0.025389373302459717,
"mean_token_accuracy": 0.9898433744907379,
"num_tokens": 57094335.0,
"step": 2760
},
{
"entropy": 0.7618607759475708,
"epoch": 3.728129205921938,
"grad_norm": 0.4176262319087982,
"learning_rate": 7.414071333404104e-05,
"loss": 0.025304454565048217,
"mean_token_accuracy": 0.9903385758399963,
"num_tokens": 57301425.0,
"step": 2770
},
{
"entropy": 0.7558614552021027,
"epoch": 3.7415881561238225,
"grad_norm": 0.35137420892715454,
"learning_rate": 7.394557796356644e-05,
"loss": 0.023388701677322387,
"mean_token_accuracy": 0.990731543302536,
"num_tokens": 57508536.0,
"step": 2780
},
{
"entropy": 0.7547170579433441,
"epoch": 3.7550471063257067,
"grad_norm": 0.4727605879306793,
"learning_rate": 7.374996817521653e-05,
"loss": 0.02676147222518921,
"mean_token_accuracy": 0.9894650936126709,
"num_tokens": 57715682.0,
"step": 2790
},
{
"entropy": 0.7561111927032471,
"epoch": 3.768506056527591,
"grad_norm": 0.42272260785102844,
"learning_rate": 7.35538878444785e-05,
"loss": 0.02376638352870941,
"mean_token_accuracy": 0.9906246423721313,
"num_tokens": 57922961.0,
"step": 2800
},
{
"entropy": 0.765377151966095,
"epoch": 3.781965006729475,
"grad_norm": 0.4535030424594879,
"learning_rate": 7.335734085616206e-05,
"loss": 0.025309956073760985,
"mean_token_accuracy": 0.990121603012085,
"num_tokens": 58130145.0,
"step": 2810
},
{
"entropy": 0.7608742535114288,
"epoch": 3.7954239569313595,
"grad_norm": 0.45810800790786743,
"learning_rate": 7.316033110432249e-05,
"loss": 0.025189366936683655,
"mean_token_accuracy": 0.9903482019901275,
"num_tokens": 58337200.0,
"step": 2820
},
{
"entropy": 0.7654636561870575,
"epoch": 3.8088829071332437,
"grad_norm": 0.44589853286743164,
"learning_rate": 7.296286249218352e-05,
"loss": 0.023104313015937804,
"mean_token_accuracy": 0.9910144686698914,
"num_tokens": 58544177.0,
"step": 2830
},
{
"entropy": 0.7657354652881623,
"epoch": 3.822341857335128,
"grad_norm": 0.4494393765926361,
"learning_rate": 7.276493893205995e-05,
"loss": 0.024192404747009278,
"mean_token_accuracy": 0.9902793347835541,
"num_tokens": 58751848.0,
"step": 2840
},
{
"entropy": 0.7766771256923676,
"epoch": 3.8358008075370122,
"grad_norm": 0.4393499493598938,
"learning_rate": 7.256656434528018e-05,
"loss": 0.02485754042863846,
"mean_token_accuracy": 0.9902261734008789,
"num_tokens": 58958848.0,
"step": 2850
},
{
"entropy": 0.7848943591117858,
"epoch": 3.8492597577388965,
"grad_norm": 0.3587435185909271,
"learning_rate": 7.236774266210852e-05,
"loss": 0.026044368743896484,
"mean_token_accuracy": 0.9900348424911499,
"num_tokens": 59165462.0,
"step": 2860
},
{
"entropy": 0.7800885438919067,
"epoch": 3.8627187079407808,
"grad_norm": 0.6072045564651489,
"learning_rate": 7.216847782166727e-05,
"loss": 0.025104761123657227,
"mean_token_accuracy": 0.9903142392635346,
"num_tokens": 59372176.0,
"step": 2870
},
{
"entropy": 0.7828551828861237,
"epoch": 3.876177658142665,
"grad_norm": 0.3913317620754242,
"learning_rate": 7.196877377185872e-05,
"loss": 0.022907453775405883,
"mean_token_accuracy": 0.9911571919918061,
"num_tokens": 59578745.0,
"step": 2880
},
{
"entropy": 0.7775038599967956,
"epoch": 3.8896366083445493,
"grad_norm": 0.590624988079071,
"learning_rate": 7.176863446928694e-05,
"loss": 0.025417977571487428,
"mean_token_accuracy": 0.990390133857727,
"num_tokens": 59786043.0,
"step": 2890
},
{
"entropy": 0.7806641340255738,
"epoch": 3.9030955585464335,
"grad_norm": 0.5686063766479492,
"learning_rate": 7.156806387917937e-05,
"loss": 0.026116243004798888,
"mean_token_accuracy": 0.9900750458240509,
"num_tokens": 59992818.0,
"step": 2900
},
{
"entropy": 0.7746634542942047,
"epoch": 3.916554508748318,
"grad_norm": 0.4476391673088074,
"learning_rate": 7.136706597530825e-05,
"loss": 0.025634783506393432,
"mean_token_accuracy": 0.990031260251999,
"num_tokens": 60200512.0,
"step": 2910
},
{
"entropy": 0.7778063654899597,
"epoch": 3.930013458950202,
"grad_norm": 0.45968928933143616,
"learning_rate": 7.116564473991192e-05,
"loss": 0.023689424991607665,
"mean_token_accuracy": 0.9907764077186585,
"num_tokens": 60407416.0,
"step": 2920
},
{
"entropy": 0.7707861483097076,
"epoch": 3.9434724091520863,
"grad_norm": 0.43096625804901123,
"learning_rate": 7.096380416361588e-05,
"loss": 0.026516634225845336,
"mean_token_accuracy": 0.9897350788116455,
"num_tokens": 60614233.0,
"step": 2930
},
{
"entropy": 0.7748425483703614,
"epoch": 3.9569313593539706,
"grad_norm": 0.4846278131008148,
"learning_rate": 7.076154824535381e-05,
"loss": 0.024681851267814636,
"mean_token_accuracy": 0.9901362180709838,
"num_tokens": 60821054.0,
"step": 2940
},
{
"entropy": 0.7784582912921906,
"epoch": 3.970390309555855,
"grad_norm": 0.5366997122764587,
"learning_rate": 7.055888099228825e-05,
"loss": 0.02371453642845154,
"mean_token_accuracy": 0.9906255662441253,
"num_tokens": 61027779.0,
"step": 2950
},
{
"entropy": 0.7729446291923523,
"epoch": 3.983849259757739,
"grad_norm": 0.4675896167755127,
"learning_rate": 7.035580641973119e-05,
"loss": 0.025786811113357545,
"mean_token_accuracy": 0.9903778076171875,
"num_tokens": 61234796.0,
"step": 2960
},
{
"entropy": 0.7831340730190277,
"epoch": 3.9973082099596233,
"grad_norm": 0.5064438581466675,
"learning_rate": 7.015232855106468e-05,
"loss": 0.025132563710212708,
"mean_token_accuracy": 0.9904121816158294,
"num_tokens": 61441775.0,
"step": 2970
},
{
"epoch": 4.0,
"eval_entropy": 0.7909596406730117,
"eval_loss": 0.041984885931015015,
"eval_mean_token_accuracy": 0.9838614156291743,
"eval_num_tokens": 61483186.0,
"eval_runtime": 13.9502,
"eval_samples_per_second": 358.417,
"eval_steps_per_second": 11.254,
"step": 2972
},
{
"entropy": 0.7836226105690003,
"epoch": 4.010767160161508,
"grad_norm": 0.4120180606842041,
"learning_rate": 6.994845141766093e-05,
"loss": 0.020024305582046507,
"mean_token_accuracy": 0.9928817927837372,
"num_tokens": 61648895.0,
"step": 2980
},
{
"entropy": 0.7865998208522796,
"epoch": 4.024226110363392,
"grad_norm": 0.5804464817047119,
"learning_rate": 6.974417905880255e-05,
"loss": 0.01588797867298126,
"mean_token_accuracy": 0.9940837860107422,
"num_tokens": 61855674.0,
"step": 2990
},
{
"entropy": 0.791904878616333,
"epoch": 4.037685060565276,
"grad_norm": 0.5396016836166382,
"learning_rate": 6.953951552160248e-05,
"loss": 0.016767729818820954,
"mean_token_accuracy": 0.9937978744506836,
"num_tokens": 62063108.0,
"step": 3000
},
{
"entropy": 0.7812646925449371,
"epoch": 4.05114401076716,
"grad_norm": 0.42720192670822144,
"learning_rate": 6.933446486092381e-05,
"loss": 0.017768773436546325,
"mean_token_accuracy": 0.9928662478923798,
"num_tokens": 62269786.0,
"step": 3010
},
{
"entropy": 0.7833780586719513,
"epoch": 4.064602960969045,
"grad_norm": 0.4398343861103058,
"learning_rate": 6.912903113929947e-05,
"loss": 0.01689823865890503,
"mean_token_accuracy": 0.9940429389476776,
"num_tokens": 62476729.0,
"step": 3020
},
{
"entropy": 0.7901590645313263,
"epoch": 4.078061911170929,
"grad_norm": 0.4574897289276123,
"learning_rate": 6.892321842685171e-05,
"loss": 0.018462637066841127,
"mean_token_accuracy": 0.9929858446121216,
"num_tokens": 62683484.0,
"step": 3030
},
{
"entropy": 0.7843089640140534,
"epoch": 4.091520861372813,
"grad_norm": 0.4095018208026886,
"learning_rate": 6.871703080121148e-05,
"loss": 0.01854866147041321,
"mean_token_accuracy": 0.9926708579063416,
"num_tokens": 62890197.0,
"step": 3040
},
{
"entropy": 0.7845255315303803,
"epoch": 4.104979811574697,
"grad_norm": 0.49721238017082214,
"learning_rate": 6.851047234743763e-05,
"loss": 0.017842569947242738,
"mean_token_accuracy": 0.9929874241352081,
"num_tokens": 63096782.0,
"step": 3050
},
{
"entropy": 0.8012731611728668,
"epoch": 4.118438761776582,
"grad_norm": 0.3944236636161804,
"learning_rate": 6.830354715793598e-05,
"loss": 0.017182928323745728,
"mean_token_accuracy": 0.9936406493186951,
"num_tokens": 63302887.0,
"step": 3060
},
{
"entropy": 0.8002919495105744,
"epoch": 4.131897711978466,
"grad_norm": 0.4541427493095398,
"learning_rate": 6.809625933237826e-05,
"loss": 0.016480381786823272,
"mean_token_accuracy": 0.9933510184288025,
"num_tokens": 63509930.0,
"step": 3070
},
{
"entropy": 0.8035316348075867,
"epoch": 4.14535666218035,
"grad_norm": 0.44510146975517273,
"learning_rate": 6.788861297762086e-05,
"loss": 0.01899719089269638,
"mean_token_accuracy": 0.99237060546875,
"num_tokens": 63716805.0,
"step": 3080
},
{
"entropy": 0.7973145604133606,
"epoch": 4.158815612382234,
"grad_norm": 0.4416787624359131,
"learning_rate": 6.768061220762345e-05,
"loss": 0.017631854116916656,
"mean_token_accuracy": 0.993181437253952,
"num_tokens": 63923594.0,
"step": 3090
},
{
"entropy": 0.785740852355957,
"epoch": 4.172274562584119,
"grad_norm": 0.44484615325927734,
"learning_rate": 6.747226114336753e-05,
"loss": 0.017793142795562746,
"mean_token_accuracy": 0.9931233644485473,
"num_tokens": 64130006.0,
"step": 3100
},
{
"entropy": 0.769168746471405,
"epoch": 4.185733512786003,
"grad_norm": 0.5192431807518005,
"learning_rate": 6.726356391277471e-05,
"loss": 0.017981474101543427,
"mean_token_accuracy": 0.9931439876556396,
"num_tokens": 64337175.0,
"step": 3110
},
{
"entropy": 0.7719715535640717,
"epoch": 4.199192462987887,
"grad_norm": 0.6079246401786804,
"learning_rate": 6.7054524650625e-05,
"loss": 0.01674233376979828,
"mean_token_accuracy": 0.9935880184173584,
"num_tokens": 64544044.0,
"step": 3120
},
{
"entropy": 0.7753833174705506,
"epoch": 4.212651413189771,
"grad_norm": 0.4159577786922455,
"learning_rate": 6.684514749847482e-05,
"loss": 0.01796432286500931,
"mean_token_accuracy": 0.992877596616745,
"num_tokens": 64751161.0,
"step": 3130
},
{
"entropy": 0.7759944558143616,
"epoch": 4.226110363391656,
"grad_norm": 0.5487306714057922,
"learning_rate": 6.663543660457503e-05,
"loss": 0.019616091251373292,
"mean_token_accuracy": 0.9922689974308014,
"num_tokens": 64957845.0,
"step": 3140
},
{
"entropy": 0.7630272626876831,
"epoch": 4.23956931359354,
"grad_norm": 0.5038378238677979,
"learning_rate": 6.642539612378863e-05,
"loss": 0.01772879660129547,
"mean_token_accuracy": 0.9931340932846069,
"num_tokens": 65165108.0,
"step": 3150
},
{
"entropy": 0.7629122614860535,
"epoch": 4.253028263795424,
"grad_norm": 0.4874984622001648,
"learning_rate": 6.621503021750858e-05,
"loss": 0.018539264798164368,
"mean_token_accuracy": 0.9927669823169708,
"num_tokens": 65371838.0,
"step": 3160
},
{
"entropy": 0.764429771900177,
"epoch": 4.2664872139973085,
"grad_norm": 0.4427169859409332,
"learning_rate": 6.600434305357521e-05,
"loss": 0.01739906668663025,
"mean_token_accuracy": 0.9933329343795776,
"num_tokens": 65578601.0,
"step": 3170
},
{
"entropy": 0.77511927485466,
"epoch": 4.279946164199193,
"grad_norm": 0.6782849431037903,
"learning_rate": 6.579333880619376e-05,
"loss": 0.017153051495552064,
"mean_token_accuracy": 0.9933244109153747,
"num_tokens": 65785157.0,
"step": 3180
},
{
"entropy": 0.7763962566852569,
"epoch": 4.293405114401077,
"grad_norm": 0.3921811878681183,
"learning_rate": 6.558202165585161e-05,
"loss": 0.01766786277294159,
"mean_token_accuracy": 0.9931640028953552,
"num_tokens": 65992605.0,
"step": 3190
},
{
"entropy": 0.7710970163345336,
"epoch": 4.306864064602961,
"grad_norm": 0.4926285147666931,
"learning_rate": 6.53703957892355e-05,
"loss": 0.017494969069957733,
"mean_token_accuracy": 0.993228965997696,
"num_tokens": 66199512.0,
"step": 3200
},
{
"entropy": 0.7679614186286926,
"epoch": 4.3203230148048455,
"grad_norm": 0.4171614646911621,
"learning_rate": 6.515846539914854e-05,
"loss": 0.018978503346443177,
"mean_token_accuracy": 0.9928891599178314,
"num_tokens": 66405729.0,
"step": 3210
},
{
"entropy": 0.7584276318550109,
"epoch": 4.33378196500673,
"grad_norm": 0.45492124557495117,
"learning_rate": 6.494623468442718e-05,
"loss": 0.017702722549438478,
"mean_token_accuracy": 0.993442302942276,
"num_tokens": 66612712.0,
"step": 3220
},
{
"entropy": 0.7790658175945282,
"epoch": 4.347240915208614,
"grad_norm": 0.49954837560653687,
"learning_rate": 6.473370784985798e-05,
"loss": 0.016838689148426057,
"mean_token_accuracy": 0.9936484694480896,
"num_tokens": 66819289.0,
"step": 3230
},
{
"entropy": 0.8013432800769806,
"epoch": 4.360699865410498,
"grad_norm": 0.46975138783454895,
"learning_rate": 6.452088910609436e-05,
"loss": 0.0176068514585495,
"mean_token_accuracy": 0.9932349562644959,
"num_tokens": 67025841.0,
"step": 3240
},
{
"entropy": 0.7898520290851593,
"epoch": 4.3741588156123825,
"grad_norm": 0.38737940788269043,
"learning_rate": 6.430778266957312e-05,
"loss": 0.01757241189479828,
"mean_token_accuracy": 0.9932108998298645,
"num_tokens": 67232968.0,
"step": 3250
},
{
"entropy": 0.7705098152160644,
"epoch": 4.387617765814267,
"grad_norm": 0.4774077832698822,
"learning_rate": 6.409439276243092e-05,
"loss": 0.02002309709787369,
"mean_token_accuracy": 0.9923814833164215,
"num_tokens": 67439528.0,
"step": 3260
},
{
"entropy": 0.7659395933151245,
"epoch": 4.401076716016151,
"grad_norm": 0.551794171333313,
"learning_rate": 6.388072361242067e-05,
"loss": 0.019037923216819762,
"mean_token_accuracy": 0.9924685060977936,
"num_tokens": 67646641.0,
"step": 3270
},
{
"entropy": 0.7678803563117981,
"epoch": 4.414535666218035,
"grad_norm": 0.742902934551239,
"learning_rate": 6.366677945282769e-05,
"loss": 0.019253468513488768,
"mean_token_accuracy": 0.9927851676940918,
"num_tokens": 67853488.0,
"step": 3280
},
{
"entropy": 0.765274178981781,
"epoch": 4.4279946164199195,
"grad_norm": 0.4489310085773468,
"learning_rate": 6.345256452238591e-05,
"loss": 0.019601096212863923,
"mean_token_accuracy": 0.9925384223461151,
"num_tokens": 68060312.0,
"step": 3290
},
{
"entropy": 0.7568464636802673,
"epoch": 4.441453566621804,
"grad_norm": 0.5335469841957092,
"learning_rate": 6.323808306519385e-05,
"loss": 0.01803976595401764,
"mean_token_accuracy": 0.9932062149047851,
"num_tokens": 68267066.0,
"step": 3300
},
{
"entropy": 0.7597867846488953,
"epoch": 4.454912516823688,
"grad_norm": 0.5096181035041809,
"learning_rate": 6.302333933063057e-05,
"loss": 0.017728343605995178,
"mean_token_accuracy": 0.9933403611183167,
"num_tokens": 68474119.0,
"step": 3310
},
{
"entropy": 0.7647906184196472,
"epoch": 4.468371467025572,
"grad_norm": 0.5999929904937744,
"learning_rate": 6.280833757327142e-05,
"loss": 0.01955530196428299,
"mean_token_accuracy": 0.9925283312797546,
"num_tokens": 68680905.0,
"step": 3320
},
{
"entropy": 0.7616272032260895,
"epoch": 4.481830417227457,
"grad_norm": 0.4349266588687897,
"learning_rate": 6.259308205280383e-05,
"loss": 0.018944016098976134,
"mean_token_accuracy": 0.9924477934837341,
"num_tokens": 68887234.0,
"step": 3330
},
{
"entropy": 0.7569435834884644,
"epoch": 4.495289367429341,
"grad_norm": 0.5471845865249634,
"learning_rate": 6.237757703394283e-05,
"loss": 0.016963809728622437,
"mean_token_accuracy": 0.9936882734298706,
"num_tokens": 69094075.0,
"step": 3340
},
{
"entropy": 0.7671379745006561,
"epoch": 4.508748317631225,
"grad_norm": 0.5381366610527039,
"learning_rate": 6.216182678634664e-05,
"loss": 0.016747696697711943,
"mean_token_accuracy": 0.9936853110790252,
"num_tokens": 69300893.0,
"step": 3350
},
{
"entropy": 0.7783296048641205,
"epoch": 4.522207267833109,
"grad_norm": 0.5440670251846313,
"learning_rate": 6.194583558453199e-05,
"loss": 0.017477567493915557,
"mean_token_accuracy": 0.9932597935199737,
"num_tokens": 69508129.0,
"step": 3360
},
{
"entropy": 0.7844888031482696,
"epoch": 4.535666218034994,
"grad_norm": 0.5029262900352478,
"learning_rate": 6.172960770778948e-05,
"loss": 0.016931781172752382,
"mean_token_accuracy": 0.99377281665802,
"num_tokens": 69714928.0,
"step": 3370
},
{
"entropy": 0.794099646806717,
"epoch": 4.549125168236878,
"grad_norm": 0.48055407404899597,
"learning_rate": 6.151314744009885e-05,
"loss": 0.018056708574295043,
"mean_token_accuracy": 0.9930496513843536,
"num_tokens": 69921790.0,
"step": 3380
},
{
"entropy": 0.7762019991874695,
"epoch": 4.562584118438762,
"grad_norm": 0.5742813944816589,
"learning_rate": 6.129645907004395e-05,
"loss": 0.018931837379932405,
"mean_token_accuracy": 0.9926802217960358,
"num_tokens": 70128934.0,
"step": 3390
},
{
"entropy": 0.7701474964618683,
"epoch": 4.576043068640646,
"grad_norm": 0.4316072463989258,
"learning_rate": 6.107954689072796e-05,
"loss": 0.017746394872665404,
"mean_token_accuracy": 0.9927630186080932,
"num_tokens": 70336124.0,
"step": 3400
},
{
"entropy": 0.7807201445102692,
"epoch": 4.589502018842531,
"grad_norm": 0.4106089174747467,
"learning_rate": 6.086241519968822e-05,
"loss": 0.01805357336997986,
"mean_token_accuracy": 0.9930870115756989,
"num_tokens": 70542565.0,
"step": 3410
},
{
"entropy": 0.7832616925239563,
"epoch": 4.602960969044415,
"grad_norm": 0.443533331155777,
"learning_rate": 6.064506829881109e-05,
"loss": 0.018582865595817566,
"mean_token_accuracy": 0.9929667472839355,
"num_tokens": 70749808.0,
"step": 3420
},
{
"entropy": 0.7755731046199799,
"epoch": 4.616419919246299,
"grad_norm": 0.3619384169578552,
"learning_rate": 6.042751049424675e-05,
"loss": 0.01779957562685013,
"mean_token_accuracy": 0.9932062923908234,
"num_tokens": 70956795.0,
"step": 3430
},
{
"entropy": 0.7820507228374481,
"epoch": 4.629878869448183,
"grad_norm": 0.4131346344947815,
"learning_rate": 6.02097460963239e-05,
"loss": 0.01847451776266098,
"mean_token_accuracy": 0.9929662108421325,
"num_tokens": 71163524.0,
"step": 3440
},
{
"entropy": 0.7895468533039093,
"epoch": 4.643337819650068,
"grad_norm": 0.5880967378616333,
"learning_rate": 5.999177941946429e-05,
"loss": 0.018063023686408997,
"mean_token_accuracy": 0.9928370118141174,
"num_tokens": 71370916.0,
"step": 3450
},
{
"entropy": 0.7732610762119293,
"epoch": 4.656796769851952,
"grad_norm": 0.4882413148880005,
"learning_rate": 5.977361478209732e-05,
"loss": 0.01768999993801117,
"mean_token_accuracy": 0.9935171842575073,
"num_tokens": 71577910.0,
"step": 3460
},
{
"entropy": 0.7638017177581787,
"epoch": 4.670255720053836,
"grad_norm": 0.36234211921691895,
"learning_rate": 5.955525650657444e-05,
"loss": 0.01707223504781723,
"mean_token_accuracy": 0.9930117666721344,
"num_tokens": 71784872.0,
"step": 3470
},
{
"entropy": 0.7676116108894349,
"epoch": 4.68371467025572,
"grad_norm": 0.4788723587989807,
"learning_rate": 5.933670891908355e-05,
"loss": 0.018473857641220094,
"mean_token_accuracy": 0.9927574157714844,
"num_tokens": 71991745.0,
"step": 3480
},
{
"entropy": 0.7711163818836212,
"epoch": 4.697173620457605,
"grad_norm": 0.4580574333667755,
"learning_rate": 5.9117976349563206e-05,
"loss": 0.017843346297740936,
"mean_token_accuracy": 0.9930699467658997,
"num_tokens": 72198649.0,
"step": 3490
},
{
"entropy": 0.7644717395305634,
"epoch": 4.710632570659489,
"grad_norm": 0.6007668375968933,
"learning_rate": 5.889906313161696e-05,
"loss": 0.019403815269470215,
"mean_token_accuracy": 0.9927784681320191,
"num_tokens": 72405701.0,
"step": 3500
},
{
"entropy": 0.7628248929977417,
"epoch": 4.724091520861373,
"grad_norm": 0.37293317914009094,
"learning_rate": 5.8679973602427376e-05,
"loss": 0.017194029688835145,
"mean_token_accuracy": 0.9929509341716767,
"num_tokens": 72612800.0,
"step": 3510
},
{
"entropy": 0.7724367201328277,
"epoch": 4.737550471063257,
"grad_norm": 0.3919832706451416,
"learning_rate": 5.846071210267018e-05,
"loss": 0.018434235453605653,
"mean_token_accuracy": 0.9928217232227325,
"num_tokens": 72819618.0,
"step": 3520
},
{
"entropy": 0.7768795371055603,
"epoch": 4.751009421265142,
"grad_norm": 0.5469701886177063,
"learning_rate": 5.824128297642823e-05,
"loss": 0.017534643411636353,
"mean_token_accuracy": 0.9931679129600525,
"num_tokens": 73026521.0,
"step": 3530
},
{
"entropy": 0.7600254356861115,
"epoch": 4.764468371467026,
"grad_norm": 0.4579508602619171,
"learning_rate": 5.802169057110548e-05,
"loss": 0.018480783700942992,
"mean_token_accuracy": 0.9927056908607483,
"num_tokens": 73233512.0,
"step": 3540
},
{
"entropy": 0.750648146867752,
"epoch": 4.77792732166891,
"grad_norm": 0.5560661554336548,
"learning_rate": 5.7801939237340786e-05,
"loss": 0.01876264363527298,
"mean_token_accuracy": 0.9926550030708313,
"num_tokens": 73440515.0,
"step": 3550
},
{
"entropy": 0.7545541048049926,
"epoch": 4.7913862718707945,
"grad_norm": 0.45055732131004333,
"learning_rate": 5.758203332892177e-05,
"loss": 0.01602725088596344,
"mean_token_accuracy": 0.993874579668045,
"num_tokens": 73647478.0,
"step": 3560
},
{
"entropy": 0.7609083950519562,
"epoch": 4.804845222072679,
"grad_norm": 0.549947202205658,
"learning_rate": 5.736197720269855e-05,
"loss": 0.01784185767173767,
"mean_token_accuracy": 0.9930622160434723,
"num_tokens": 73854294.0,
"step": 3570
},
{
"entropy": 0.7656745433807373,
"epoch": 4.818304172274562,
"grad_norm": 0.4915235936641693,
"learning_rate": 5.714177521849736e-05,
"loss": 0.01744186580181122,
"mean_token_accuracy": 0.9929351925849914,
"num_tokens": 74060324.0,
"step": 3580
},
{
"entropy": 0.7592082619667053,
"epoch": 4.831763122476447,
"grad_norm": 0.5766359567642212,
"learning_rate": 5.69214317390343e-05,
"loss": 0.018355701863765717,
"mean_token_accuracy": 0.9929331362247467,
"num_tokens": 74267384.0,
"step": 3590
},
{
"entropy": 0.7560055673122406,
"epoch": 4.845222072678331,
"grad_norm": 0.4810236990451813,
"learning_rate": 5.670095112982875e-05,
"loss": 0.01801997423171997,
"mean_token_accuracy": 0.9928223788738251,
"num_tokens": 74474563.0,
"step": 3600
},
{
"entropy": 0.7539739072322845,
"epoch": 4.858681022880216,
"grad_norm": 0.39908966422080994,
"learning_rate": 5.648033775911701e-05,
"loss": 0.017030759155750273,
"mean_token_accuracy": 0.9934890806674957,
"num_tokens": 74681852.0,
"step": 3610
},
{
"entropy": 0.7632683336734771,
"epoch": 4.872139973082099,
"grad_norm": 0.4063929319381714,
"learning_rate": 5.625959599776564e-05,
"loss": 0.017804598808288573,
"mean_token_accuracy": 0.9931166589260101,
"num_tokens": 74888593.0,
"step": 3620
},
{
"entropy": 0.7684237122535705,
"epoch": 4.885598923283984,
"grad_norm": 0.4352034628391266,
"learning_rate": 5.603873021918493e-05,
"loss": 0.016362884640693666,
"mean_token_accuracy": 0.9936956346035004,
"num_tokens": 75096000.0,
"step": 3630
},
{
"entropy": 0.7725315749645233,
"epoch": 4.899057873485868,
"grad_norm": 0.5453092455863953,
"learning_rate": 5.581774479924229e-05,
"loss": 0.016236093640327454,
"mean_token_accuracy": 0.993706214427948,
"num_tokens": 75302578.0,
"step": 3640
},
{
"entropy": 0.7653468787670136,
"epoch": 4.912516823687753,
"grad_norm": 0.668065071105957,
"learning_rate": 5.5596644116175444e-05,
"loss": 0.016758397221565247,
"mean_token_accuracy": 0.9932759523391723,
"num_tokens": 75509213.0,
"step": 3650
},
{
"entropy": 0.7591579735279084,
"epoch": 4.925975773889636,
"grad_norm": 0.5401139259338379,
"learning_rate": 5.537543255050579e-05,
"loss": 0.017928582429885865,
"mean_token_accuracy": 0.9927070438861847,
"num_tokens": 75716260.0,
"step": 3660
},
{
"entropy": 0.7587584853172302,
"epoch": 4.939434724091521,
"grad_norm": 0.4384903609752655,
"learning_rate": 5.5154114484951556e-05,
"loss": 0.017501908540725707,
"mean_token_accuracy": 0.9937355935573577,
"num_tokens": 75923089.0,
"step": 3670
},
{
"entropy": 0.7553621172904968,
"epoch": 4.952893674293405,
"grad_norm": 0.5839459896087646,
"learning_rate": 5.4932694304340985e-05,
"loss": 0.016943465173244476,
"mean_token_accuracy": 0.9933674991130829,
"num_tokens": 76129808.0,
"step": 3680
},
{
"entropy": 0.7534486532211304,
"epoch": 4.96635262449529,
"grad_norm": 0.5708627700805664,
"learning_rate": 5.471117639552543e-05,
"loss": 0.01732933968305588,
"mean_token_accuracy": 0.993400925397873,
"num_tokens": 76336215.0,
"step": 3690
},
{
"entropy": 0.7495142102241517,
"epoch": 4.979811574697173,
"grad_norm": 0.6189046502113342,
"learning_rate": 5.448956514729251e-05,
"loss": 0.01899988055229187,
"mean_token_accuracy": 0.9925648808479309,
"num_tokens": 76543715.0,
"step": 3700
},
{
"entropy": 0.751545512676239,
"epoch": 4.993270524899058,
"grad_norm": 0.5944417715072632,
"learning_rate": 5.426786495027908e-05,
"loss": 0.017178787291049956,
"mean_token_accuracy": 0.9932264804840087,
"num_tokens": 76750375.0,
"step": 3710
},
{
"epoch": 5.0,
"eval_entropy": 0.7505513182870901,
"eval_loss": 0.044372886419296265,
"eval_mean_token_accuracy": 0.984428718591192,
"eval_num_tokens": 76853983.0,
"eval_runtime": 13.9555,
"eval_samples_per_second": 358.281,
"eval_steps_per_second": 11.25,
"step": 3715
},
{
"entropy": 0.7473571479320527,
"epoch": 5.006729475100942,
"grad_norm": 0.3315245509147644,
"learning_rate": 5.404608019688432e-05,
"loss": 0.013754059374332429,
"mean_token_accuracy": 0.9947969734668731,
"num_tokens": 76957311.0,
"step": 3720
},
{
"entropy": 0.7534472703933716,
"epoch": 5.020188425302826,
"grad_norm": 0.5313814282417297,
"learning_rate": 5.382421528118262e-05,
"loss": 0.01147187352180481,
"mean_token_accuracy": 0.9957284986972809,
"num_tokens": 77164715.0,
"step": 3730
},
{
"entropy": 0.801529586315155,
"epoch": 5.03364737550471,
"grad_norm": 0.4467467665672302,
"learning_rate": 5.360227459883662e-05,
"loss": 0.010463708639144897,
"mean_token_accuracy": 0.9959668934345245,
"num_tokens": 77371640.0,
"step": 3740
},
{
"entropy": 0.828974598646164,
"epoch": 5.0471063257065945,
"grad_norm": 0.5245316624641418,
"learning_rate": 5.338026254701003e-05,
"loss": 0.011523199081420899,
"mean_token_accuracy": 0.9956273198127746,
"num_tokens": 77577924.0,
"step": 3750
},
{
"entropy": 0.7982487976551056,
"epoch": 5.060565275908479,
"grad_norm": 0.39915889501571655,
"learning_rate": 5.31581835242806e-05,
"loss": 0.011306057125329972,
"mean_token_accuracy": 0.9958861231803894,
"num_tokens": 77784880.0,
"step": 3760
},
{
"entropy": 0.7692394316196441,
"epoch": 5.074024226110363,
"grad_norm": 0.34463298320770264,
"learning_rate": 5.293604193055289e-05,
"loss": 0.012429548054933548,
"mean_token_accuracy": 0.995696759223938,
"num_tokens": 77992197.0,
"step": 3770
},
{
"entropy": 0.7731356203556061,
"epoch": 5.087483176312247,
"grad_norm": 0.46471163630485535,
"learning_rate": 5.2713842166971165e-05,
"loss": 0.011507495492696761,
"mean_token_accuracy": 0.9956724286079407,
"num_tokens": 78199265.0,
"step": 3780
},
{
"entropy": 0.7799284398555756,
"epoch": 5.1009421265141315,
"grad_norm": 0.3777540326118469,
"learning_rate": 5.249158863583216e-05,
"loss": 0.01075114607810974,
"mean_token_accuracy": 0.9960379660129547,
"num_tokens": 78406087.0,
"step": 3790
},
{
"entropy": 0.7837011277675628,
"epoch": 5.114401076716016,
"grad_norm": 0.4946545660495758,
"learning_rate": 5.2269285740497876e-05,
"loss": 0.012666209042072296,
"mean_token_accuracy": 0.995057487487793,
"num_tokens": 78613260.0,
"step": 3800
},
{
"entropy": 0.7703280210494995,
"epoch": 5.1278600269179,
"grad_norm": 0.32546892762184143,
"learning_rate": 5.204693788530832e-05,
"loss": 0.01195569708943367,
"mean_token_accuracy": 0.9958400011062623,
"num_tokens": 78819967.0,
"step": 3810
},
{
"entropy": 0.7721941411495209,
"epoch": 5.141318977119784,
"grad_norm": 0.5005202293395996,
"learning_rate": 5.182454947549428e-05,
"loss": 0.010894721746444702,
"mean_token_accuracy": 0.9959119021892547,
"num_tokens": 79026842.0,
"step": 3820
},
{
"entropy": 0.7904182612895966,
"epoch": 5.1547779273216685,
"grad_norm": 0.37900179624557495,
"learning_rate": 5.160212491709002e-05,
"loss": 0.010722023993730545,
"mean_token_accuracy": 0.9961172580718994,
"num_tokens": 79234103.0,
"step": 3830
},
{
"entropy": 0.7988847613334655,
"epoch": 5.168236877523553,
"grad_norm": 0.43181365728378296,
"learning_rate": 5.1379668616845975e-05,
"loss": 0.010627251863479615,
"mean_token_accuracy": 0.9959731698036194,
"num_tokens": 79440411.0,
"step": 3840
},
{
"entropy": 0.7862568140029907,
"epoch": 5.181695827725437,
"grad_norm": 0.3639049232006073,
"learning_rate": 5.115718498214148e-05,
"loss": 0.010891681909561158,
"mean_token_accuracy": 0.9956743359565735,
"num_tokens": 79647596.0,
"step": 3850
},
{
"entropy": 0.7846800267696381,
"epoch": 5.195154777927321,
"grad_norm": 0.5998150110244751,
"learning_rate": 5.093467842089742e-05,
"loss": 0.011233718693256378,
"mean_token_accuracy": 0.9956465005874634,
"num_tokens": 79854600.0,
"step": 3860
},
{
"entropy": 0.7855194449424744,
"epoch": 5.2086137281292055,
"grad_norm": 0.48699286580085754,
"learning_rate": 5.071215334148891e-05,
"loss": 0.011192862689495087,
"mean_token_accuracy": 0.9958138644695282,
"num_tokens": 80061989.0,
"step": 3870
},
{
"entropy": 0.7742628574371337,
"epoch": 5.22207267833109,
"grad_norm": 0.5599384307861328,
"learning_rate": 5.048961415265797e-05,
"loss": 0.011506590247154235,
"mean_token_accuracy": 0.9957414209842682,
"num_tokens": 80268385.0,
"step": 3880
},
{
"entropy": 0.7598394274711608,
"epoch": 5.235531628532974,
"grad_norm": 0.4192953407764435,
"learning_rate": 5.0267065263426125e-05,
"loss": 0.012534248828887939,
"mean_token_accuracy": 0.9951974868774414,
"num_tokens": 80475339.0,
"step": 3890
},
{
"entropy": 0.7494603753089905,
"epoch": 5.248990578734858,
"grad_norm": 0.6345610022544861,
"learning_rate": 5.00445110830071e-05,
"loss": 0.01246308833360672,
"mean_token_accuracy": 0.9951446115970611,
"num_tokens": 80682026.0,
"step": 3900
},
{
"entropy": 0.7482762694358825,
"epoch": 5.262449528936743,
"grad_norm": 0.3931730389595032,
"learning_rate": 4.9821956020719474e-05,
"loss": 0.012075445801019668,
"mean_token_accuracy": 0.9952457189559937,
"num_tokens": 80888988.0,
"step": 3910
},
{
"entropy": 0.7452390134334564,
"epoch": 5.275908479138627,
"grad_norm": 0.4273200035095215,
"learning_rate": 4.959940448589928e-05,
"loss": 0.012490732222795486,
"mean_token_accuracy": 0.9952689290046692,
"num_tokens": 81095986.0,
"step": 3920
},
{
"entropy": 0.752661383152008,
"epoch": 5.289367429340511,
"grad_norm": 0.48054444789886475,
"learning_rate": 4.9376860887812666e-05,
"loss": 0.01093846783041954,
"mean_token_accuracy": 0.9958311855793,
"num_tokens": 81303243.0,
"step": 3930
},
{
"entropy": 0.7650678336620331,
"epoch": 5.302826379542395,
"grad_norm": 0.44087842106819153,
"learning_rate": 4.915432963556853e-05,
"loss": 0.011806166172027588,
"mean_token_accuracy": 0.9957118093967438,
"num_tokens": 81510139.0,
"step": 3940
},
{
"entropy": 0.7748130619525909,
"epoch": 5.31628532974428,
"grad_norm": 0.4736451804637909,
"learning_rate": 4.8931815138031173e-05,
"loss": 0.010728248953819275,
"mean_token_accuracy": 0.9961952686309814,
"num_tokens": 81717034.0,
"step": 3950
},
{
"entropy": 0.7736305713653564,
"epoch": 5.329744279946164,
"grad_norm": 0.4647705852985382,
"learning_rate": 4.870932180373296e-05,
"loss": 0.01173371821641922,
"mean_token_accuracy": 0.9954262256622315,
"num_tokens": 81924216.0,
"step": 3960
},
{
"entropy": 0.7643655180931092,
"epoch": 5.343203230148048,
"grad_norm": 0.5442004799842834,
"learning_rate": 4.8486854040786926e-05,
"loss": 0.011154447495937348,
"mean_token_accuracy": 0.9956519722938537,
"num_tokens": 82130859.0,
"step": 3970
},
{
"entropy": 0.7573722243309021,
"epoch": 5.356662180349932,
"grad_norm": 0.5777075886726379,
"learning_rate": 4.826441625679953e-05,
"loss": 0.010990817844867707,
"mean_token_accuracy": 0.995562344789505,
"num_tokens": 82338043.0,
"step": 3980
},
{
"entropy": 0.755225020647049,
"epoch": 5.370121130551817,
"grad_norm": 0.5585280060768127,
"learning_rate": 4.8042012858783223e-05,
"loss": 0.011740070581436158,
"mean_token_accuracy": 0.9957562625408173,
"num_tokens": 82545102.0,
"step": 3990
},
{
"entropy": 0.7581014811992646,
"epoch": 5.383580080753701,
"grad_norm": 0.4570481777191162,
"learning_rate": 4.781964825306923e-05,
"loss": 0.010656823217868806,
"mean_token_accuracy": 0.996157032251358,
"num_tokens": 82751505.0,
"step": 4000
},
{
"entropy": 0.759818023443222,
"epoch": 5.397039030955585,
"grad_norm": 0.4566776156425476,
"learning_rate": 4.7597326845220206e-05,
"loss": 0.011652395129203796,
"mean_token_accuracy": 0.995825469493866,
"num_tokens": 82958553.0,
"step": 4010
},
{
"entropy": 0.7625110745429993,
"epoch": 5.410497981157469,
"grad_norm": 0.47552627325057983,
"learning_rate": 4.737505303994292e-05,
"loss": 0.01105176955461502,
"mean_token_accuracy": 0.995843505859375,
"num_tokens": 83165855.0,
"step": 4020
},
{
"entropy": 0.7676438748836517,
"epoch": 5.423956931359354,
"grad_norm": 0.5029258131980896,
"learning_rate": 4.7152831241001065e-05,
"loss": 0.012058970332145692,
"mean_token_accuracy": 0.9953496634960175,
"num_tokens": 83372907.0,
"step": 4030
},
{
"entropy": 0.7582855522632599,
"epoch": 5.437415881561238,
"grad_norm": 0.40411460399627686,
"learning_rate": 4.693066585112795e-05,
"loss": 0.011585983633995055,
"mean_token_accuracy": 0.995668250322342,
"num_tokens": 83579883.0,
"step": 4040
},
{
"entropy": 0.7627740204334259,
"epoch": 5.450874831763122,
"grad_norm": 0.4695113003253937,
"learning_rate": 4.670856127193928e-05,
"loss": 0.012437713891267776,
"mean_token_accuracy": 0.9953029453754425,
"num_tokens": 83786422.0,
"step": 4050
},
{
"entropy": 0.7630968987941742,
"epoch": 5.464333781965006,
"grad_norm": 0.6173301339149475,
"learning_rate": 4.648652190384597e-05,
"loss": 0.011899437010288238,
"mean_token_accuracy": 0.9953746914863586,
"num_tokens": 83993873.0,
"step": 4060
},
{
"entropy": 0.7567782461643219,
"epoch": 5.477792732166891,
"grad_norm": 0.35925814509391785,
"learning_rate": 4.626455214596695e-05,
"loss": 0.011643938720226288,
"mean_token_accuracy": 0.9955205857753754,
"num_tokens": 84201046.0,
"step": 4070
},
{
"entropy": 0.7553957641124726,
"epoch": 5.491251682368775,
"grad_norm": 0.4315947890281677,
"learning_rate": 4.6042656396042e-05,
"loss": 0.011083407700061798,
"mean_token_accuracy": 0.9957548975944519,
"num_tokens": 84408311.0,
"step": 4080
},
{
"entropy": 0.7516207814216613,
"epoch": 5.504710632570659,
"grad_norm": 0.5060084462165833,
"learning_rate": 4.5820839050344643e-05,
"loss": 0.01217108517885208,
"mean_token_accuracy": 0.9953884065151215,
"num_tokens": 84615252.0,
"step": 4090
},
{
"entropy": 0.7487595736980438,
"epoch": 5.518169582772543,
"grad_norm": 0.35507968068122864,
"learning_rate": 4.559910450359502e-05,
"loss": 0.01100190356373787,
"mean_token_accuracy": 0.995825058221817,
"num_tokens": 84822094.0,
"step": 4100
},
{
"entropy": 0.754398238658905,
"epoch": 5.531628532974428,
"grad_norm": 0.4639805555343628,
"learning_rate": 4.5377457148872837e-05,
"loss": 0.012004619836807251,
"mean_token_accuracy": 0.9956164836883545,
"num_tokens": 85028800.0,
"step": 4110
},
{
"entropy": 0.7597955822944641,
"epoch": 5.545087483176312,
"grad_norm": 0.52503901720047,
"learning_rate": 4.515590137753032e-05,
"loss": 0.01215251013636589,
"mean_token_accuracy": 0.9956142961978912,
"num_tokens": 85234972.0,
"step": 4120
},
{
"entropy": 0.7510636806488037,
"epoch": 5.558546433378196,
"grad_norm": 0.5228842496871948,
"learning_rate": 4.493444157910521e-05,
"loss": 0.011907792836427688,
"mean_token_accuracy": 0.9956765532493591,
"num_tokens": 85442303.0,
"step": 4130
},
{
"entropy": 0.7509365022182465,
"epoch": 5.5720053835800805,
"grad_norm": 0.3873786926269531,
"learning_rate": 4.471308214123381e-05,
"loss": 0.010966875404119492,
"mean_token_accuracy": 0.995771324634552,
"num_tokens": 85648692.0,
"step": 4140
},
{
"entropy": 0.7490605235099792,
"epoch": 5.585464333781965,
"grad_norm": 0.5178571939468384,
"learning_rate": 4.449182744956403e-05,
"loss": 0.012869048118591308,
"mean_token_accuracy": 0.9949001133441925,
"num_tokens": 85855568.0,
"step": 4150
},
{
"entropy": 0.7433833122253418,
"epoch": 5.598923283983849,
"grad_norm": 0.47041696310043335,
"learning_rate": 4.4270681887668544e-05,
"loss": 0.012250985950231552,
"mean_token_accuracy": 0.9952748596668244,
"num_tokens": 86061971.0,
"step": 4160
},
{
"entropy": 0.7359922230243683,
"epoch": 5.612382234185733,
"grad_norm": 0.33492347598075867,
"learning_rate": 4.404964983695786e-05,
"loss": 0.010909316688776016,
"mean_token_accuracy": 0.9960612893104553,
"num_tokens": 86269179.0,
"step": 4170
},
{
"entropy": 0.7412485539913177,
"epoch": 5.6258411843876175,
"grad_norm": 0.26885855197906494,
"learning_rate": 4.382873567659361e-05,
"loss": 0.010674068331718444,
"mean_token_accuracy": 0.9960609257221222,
"num_tokens": 86476072.0,
"step": 4180
},
{
"entropy": 0.7493269741535187,
"epoch": 5.639300134589502,
"grad_norm": 0.4859159588813782,
"learning_rate": 4.3607943783401736e-05,
"loss": 0.012274256348609925,
"mean_token_accuracy": 0.9954603254795075,
"num_tokens": 86682931.0,
"step": 4190
},
{
"entropy": 0.7531224548816681,
"epoch": 5.652759084791386,
"grad_norm": 0.45160216093063354,
"learning_rate": 4.3387278531785747e-05,
"loss": 0.01201828122138977,
"mean_token_accuracy": 0.9953709781169892,
"num_tokens": 86889799.0,
"step": 4200
},
{
"entropy": 0.7501720905303955,
"epoch": 5.66621803499327,
"grad_norm": 0.6317504048347473,
"learning_rate": 4.3166744293640134e-05,
"loss": 0.011446791887283325,
"mean_token_accuracy": 0.9956945776939392,
"num_tokens": 87096029.0,
"step": 4210
},
{
"entropy": 0.7444611549377441,
"epoch": 5.6796769851951545,
"grad_norm": 0.4731798768043518,
"learning_rate": 4.2946345438263665e-05,
"loss": 0.011316908895969391,
"mean_token_accuracy": 0.9955646634101868,
"num_tokens": 87303163.0,
"step": 4220
},
{
"entropy": 0.7472650587558747,
"epoch": 5.693135935397039,
"grad_norm": 0.4799041152000427,
"learning_rate": 4.272608633227287e-05,
"loss": 0.012147380411624909,
"mean_token_accuracy": 0.9951766729354858,
"num_tokens": 87509492.0,
"step": 4230
},
{
"entropy": 0.7370569348335266,
"epoch": 5.706594885598923,
"grad_norm": 0.36718371510505676,
"learning_rate": 4.250597133951554e-05,
"loss": 0.011406458914279938,
"mean_token_accuracy": 0.9959278583526612,
"num_tokens": 87716755.0,
"step": 4240
},
{
"entropy": 0.73809614777565,
"epoch": 5.720053835800807,
"grad_norm": 0.4381415843963623,
"learning_rate": 4.228600482098423e-05,
"loss": 0.011750604957342148,
"mean_token_accuracy": 0.9957403659820556,
"num_tokens": 87923530.0,
"step": 4250
},
{
"entropy": 0.7487466990947723,
"epoch": 5.7335127860026915,
"grad_norm": 0.4299047291278839,
"learning_rate": 4.206619113472986e-05,
"loss": 0.01089790165424347,
"mean_token_accuracy": 0.9957152485847474,
"num_tokens": 88130348.0,
"step": 4260
},
{
"entropy": 0.7592370331287384,
"epoch": 5.746971736204576,
"grad_norm": 0.502768874168396,
"learning_rate": 4.18465346357754e-05,
"loss": 0.01133182942867279,
"mean_token_accuracy": 0.9958466529846192,
"num_tokens": 88337724.0,
"step": 4270
},
{
"entropy": 0.7617475092411041,
"epoch": 5.76043068640646,
"grad_norm": 0.419840544462204,
"learning_rate": 4.16270396760296e-05,
"loss": 0.011695357412099839,
"mean_token_accuracy": 0.99588702917099,
"num_tokens": 88544195.0,
"step": 4280
},
{
"entropy": 0.7570286870002747,
"epoch": 5.773889636608344,
"grad_norm": 0.5709792375564575,
"learning_rate": 4.140771060420066e-05,
"loss": 0.01149986982345581,
"mean_token_accuracy": 0.995744913816452,
"num_tokens": 88751100.0,
"step": 4290
},
{
"entropy": 0.7453589260578155,
"epoch": 5.787348586810229,
"grad_norm": 0.47392651438713074,
"learning_rate": 4.118855176571021e-05,
"loss": 0.01253439038991928,
"mean_token_accuracy": 0.9950320243835449,
"num_tokens": 88958170.0,
"step": 4300
},
{
"entropy": 0.7403925895690918,
"epoch": 5.800807537012113,
"grad_norm": 0.5566080808639526,
"learning_rate": 4.096956750260718e-05,
"loss": 0.013252611458301543,
"mean_token_accuracy": 0.9950306355953217,
"num_tokens": 89164712.0,
"step": 4310
},
{
"entropy": 0.7351096451282502,
"epoch": 5.814266487213997,
"grad_norm": 0.4116886854171753,
"learning_rate": 4.07507621534817e-05,
"loss": 0.012794888019561768,
"mean_token_accuracy": 0.9951526165008545,
"num_tokens": 89371398.0,
"step": 4320
},
{
"entropy": 0.7350869536399841,
"epoch": 5.827725437415881,
"grad_norm": 0.37677955627441406,
"learning_rate": 4.053214005337924e-05,
"loss": 0.010807374119758606,
"mean_token_accuracy": 0.9960130155086517,
"num_tokens": 89578181.0,
"step": 4330
},
{
"entropy": 0.7448731124401092,
"epoch": 5.841184387617766,
"grad_norm": 0.4849918484687805,
"learning_rate": 4.031370553371465e-05,
"loss": 0.011473974585533142,
"mean_token_accuracy": 0.9955580830574036,
"num_tokens": 89785614.0,
"step": 4340
},
{
"entropy": 0.7568134427070617,
"epoch": 5.85464333781965,
"grad_norm": 0.4150851368904114,
"learning_rate": 4.0095462922186385e-05,
"loss": 0.01176745593547821,
"mean_token_accuracy": 0.9955409824848175,
"num_tokens": 89991753.0,
"step": 4350
},
{
"entropy": 0.7487130522727966,
"epoch": 5.868102288021534,
"grad_norm": 0.4874976873397827,
"learning_rate": 3.9877416542690746e-05,
"loss": 0.011722442507743836,
"mean_token_accuracy": 0.9955667078495025,
"num_tokens": 90198501.0,
"step": 4360
},
{
"entropy": 0.7395595133304596,
"epoch": 5.881561238223418,
"grad_norm": 0.42832764983177185,
"learning_rate": 3.9659570715236234e-05,
"loss": 0.010632017254829406,
"mean_token_accuracy": 0.9957972228527069,
"num_tokens": 90405216.0,
"step": 4370
},
{
"entropy": 0.7417290508747101,
"epoch": 5.895020188425303,
"grad_norm": 0.4343918263912201,
"learning_rate": 3.944192975585792e-05,
"loss": 0.011097145825624466,
"mean_token_accuracy": 0.9955754935741424,
"num_tokens": 90611675.0,
"step": 4380
},
{
"entropy": 0.7389219880104065,
"epoch": 5.908479138627187,
"grad_norm": 0.4636954367160797,
"learning_rate": 3.922449797653198e-05,
"loss": 0.0111973837018013,
"mean_token_accuracy": 0.9958441913127899,
"num_tokens": 90819047.0,
"step": 4390
},
{
"entropy": 0.7388913333415985,
"epoch": 5.921938088829071,
"grad_norm": 0.35796335339546204,
"learning_rate": 3.900727968509024e-05,
"loss": 0.009988104552030563,
"mean_token_accuracy": 0.9962919235229493,
"num_tokens": 91025844.0,
"step": 4400
},
{
"entropy": 0.7422888994216919,
"epoch": 5.935397039030955,
"grad_norm": 0.39291512966156006,
"learning_rate": 3.879027918513483e-05,
"loss": 0.010695169866085052,
"mean_token_accuracy": 0.9959912061691284,
"num_tokens": 91232013.0,
"step": 4410
},
{
"entropy": 0.7449814796447753,
"epoch": 5.94885598923284,
"grad_norm": 0.539179801940918,
"learning_rate": 3.857350077595289e-05,
"loss": 0.011691944301128387,
"mean_token_accuracy": 0.9955081939697266,
"num_tokens": 91438515.0,
"step": 4420
},
{
"entropy": 0.7385162234306335,
"epoch": 5.962314939434724,
"grad_norm": 0.7186609506607056,
"learning_rate": 3.835694875243149e-05,
"loss": 0.012110927700996399,
"mean_token_accuracy": 0.9952600479125977,
"num_tokens": 91645188.0,
"step": 4430
},
{
"entropy": 0.731717336177826,
"epoch": 5.975773889636608,
"grad_norm": 0.509856104850769,
"learning_rate": 3.814062740497243e-05,
"loss": 0.011971230804920196,
"mean_token_accuracy": 0.9953718304634094,
"num_tokens": 91851890.0,
"step": 4440
},
{
"entropy": 0.724138367176056,
"epoch": 5.989232839838492,
"grad_norm": 0.563079297542572,
"learning_rate": 3.7924541019407264e-05,
"loss": 0.012176686525344848,
"mean_token_accuracy": 0.9949949264526368,
"num_tokens": 92059147.0,
"step": 4450
},
{
"epoch": 6.0,
"eval_entropy": 0.7367308508059022,
"eval_loss": 0.050371766090393066,
"eval_mean_token_accuracy": 0.9841803448974706,
"eval_num_tokens": 92224773.0,
"eval_runtime": 14.0163,
"eval_samples_per_second": 356.729,
"eval_steps_per_second": 11.201,
"step": 4458
},
{
"entropy": 0.7296845495700837,
"epoch": 6.002691790040377,
"grad_norm": 0.4457104206085205,
"learning_rate": 3.7708693876912435e-05,
"loss": 0.01225304752588272,
"mean_token_accuracy": 0.9953745543956757,
"num_tokens": 92266330.0,
"step": 4460
},
{
"entropy": 0.7346499502658844,
"epoch": 6.016150740242261,
"grad_norm": 0.36918142437934875,
"learning_rate": 3.74930902539244e-05,
"loss": 0.00721738263964653,
"mean_token_accuracy": 0.9974470973014832,
"num_tokens": 92473315.0,
"step": 4470
},
{
"entropy": 0.7537191212177277,
"epoch": 6.029609690444145,
"grad_norm": 0.3166503310203552,
"learning_rate": 3.727773442205493e-05,
"loss": 0.006283954530954361,
"mean_token_accuracy": 0.9978337228298187,
"num_tokens": 92680500.0,
"step": 4480
},
{
"entropy": 0.7762713313102723,
"epoch": 6.043068640646029,
"grad_norm": 0.35029998421669006,
"learning_rate": 3.7062630648006485e-05,
"loss": 0.007443505525588989,
"mean_token_accuracy": 0.9972987174987793,
"num_tokens": 92887510.0,
"step": 4490
},
{
"entropy": 0.7617170631885528,
"epoch": 6.056527590847914,
"grad_norm": 0.34308943152427673,
"learning_rate": 3.684778319348765e-05,
"loss": 0.006808276474475861,
"mean_token_accuracy": 0.9976418673992157,
"num_tokens": 93094920.0,
"step": 4500
},
{
"entropy": 0.7614555537700654,
"epoch": 6.069986541049798,
"grad_norm": 0.5395469069480896,
"learning_rate": 3.663319631512874e-05,
"loss": 0.007348373532295227,
"mean_token_accuracy": 0.9975483357906342,
"num_tokens": 93301300.0,
"step": 4510
},
{
"entropy": 0.7552687644958496,
"epoch": 6.083445491251682,
"grad_norm": 0.3406250476837158,
"learning_rate": 3.641887426439743e-05,
"loss": 0.006675070524215699,
"mean_token_accuracy": 0.9976675331592559,
"num_tokens": 93508326.0,
"step": 4520
},
{
"entropy": 0.7513856589794159,
"epoch": 6.0969044414535665,
"grad_norm": 0.5376906394958496,
"learning_rate": 3.620482128751456e-05,
"loss": 0.00666118860244751,
"mean_token_accuracy": 0.9976118326187133,
"num_tokens": 93715792.0,
"step": 4530
},
{
"entropy": 0.7578739166259766,
"epoch": 6.110363391655451,
"grad_norm": 0.3736390471458435,
"learning_rate": 3.599104162536997e-05,
"loss": 0.006261451542377472,
"mean_token_accuracy": 0.9977061867713928,
"num_tokens": 93923078.0,
"step": 4540
},
{
"entropy": 0.7500752210617065,
"epoch": 6.123822341857335,
"grad_norm": 0.5576215386390686,
"learning_rate": 3.577753951343851e-05,
"loss": 0.006978290528059006,
"mean_token_accuracy": 0.9975865662097931,
"num_tokens": 94129100.0,
"step": 4550
},
{
"entropy": 0.7302877962589264,
"epoch": 6.137281292059219,
"grad_norm": 0.40667852759361267,
"learning_rate": 3.55643191816961e-05,
"loss": 0.007030657678842545,
"mean_token_accuracy": 0.9972686529159546,
"num_tokens": 94335652.0,
"step": 4560
},
{
"entropy": 0.7341212391853332,
"epoch": 6.1507402422611035,
"grad_norm": 0.3302972614765167,
"learning_rate": 3.535138485453595e-05,
"loss": 0.006430496275424957,
"mean_token_accuracy": 0.9978137254714966,
"num_tokens": 94542535.0,
"step": 4570
},
{
"entropy": 0.7521607518196106,
"epoch": 6.164199192462988,
"grad_norm": 0.3833800256252289,
"learning_rate": 3.513874075068484e-05,
"loss": 0.006178916990756988,
"mean_token_accuracy": 0.9977231621742249,
"num_tokens": 94748991.0,
"step": 4580
},
{
"entropy": 0.7546293139457703,
"epoch": 6.177658142664872,
"grad_norm": 0.39571505784988403,
"learning_rate": 3.492639108311955e-05,
"loss": 0.007638230919837952,
"mean_token_accuracy": 0.9973102450370789,
"num_tokens": 94955693.0,
"step": 4590
},
{
"entropy": 0.7400491952896118,
"epoch": 6.191117092866756,
"grad_norm": 0.4599838852882385,
"learning_rate": 3.471434005898339e-05,
"loss": 0.007158774137496948,
"mean_token_accuracy": 0.9974660098552703,
"num_tokens": 95162378.0,
"step": 4600
},
{
"entropy": 0.7313100039958954,
"epoch": 6.2045760430686405,
"grad_norm": 0.27486035227775574,
"learning_rate": 3.450259187950283e-05,
"loss": 0.006652024388313293,
"mean_token_accuracy": 0.9974403619766236,
"num_tokens": 95368816.0,
"step": 4610
},
{
"entropy": 0.723324716091156,
"epoch": 6.218034993270525,
"grad_norm": 0.3503046929836273,
"learning_rate": 3.429115073990431e-05,
"loss": 0.006369487941265106,
"mean_token_accuracy": 0.9978444695472717,
"num_tokens": 95576191.0,
"step": 4620
},
{
"entropy": 0.7397123396396637,
"epoch": 6.231493943472409,
"grad_norm": 0.3686009347438812,
"learning_rate": 3.408002082933107e-05,
"loss": 0.006405235826969146,
"mean_token_accuracy": 0.9976122856140137,
"num_tokens": 95782987.0,
"step": 4630
},
{
"entropy": 0.7597997963428498,
"epoch": 6.244952893674293,
"grad_norm": 0.35962435603141785,
"learning_rate": 3.3869206330760187e-05,
"loss": 0.005804166570305825,
"mean_token_accuracy": 0.9979152083396912,
"num_tokens": 95989600.0,
"step": 4640
},
{
"entropy": 0.7672134041786194,
"epoch": 6.2584118438761775,
"grad_norm": 0.4252372980117798,
"learning_rate": 3.365871142091968e-05,
"loss": 0.006617816537618637,
"mean_token_accuracy": 0.9977438449859619,
"num_tokens": 96196971.0,
"step": 4650
},
{
"entropy": 0.745790833234787,
"epoch": 6.271870794078062,
"grad_norm": 0.4085806608200073,
"learning_rate": 3.3448540270205766e-05,
"loss": 0.006394334137439728,
"mean_token_accuracy": 0.9979225277900696,
"num_tokens": 96403441.0,
"step": 4660
},
{
"entropy": 0.7209958374500275,
"epoch": 6.285329744279946,
"grad_norm": 0.327882319688797,
"learning_rate": 3.323869704260025e-05,
"loss": 0.005710349604487419,
"mean_token_accuracy": 0.9980106890201569,
"num_tokens": 96610397.0,
"step": 4670
},
{
"entropy": 0.7231827795505523,
"epoch": 6.29878869448183,
"grad_norm": 0.4412100315093994,
"learning_rate": 3.302918589558801e-05,
"loss": 0.006291732937097549,
"mean_token_accuracy": 0.9975874960422516,
"num_tokens": 96817025.0,
"step": 4680
},
{
"entropy": 0.7325192034244538,
"epoch": 6.312247644683715,
"grad_norm": 0.40209078788757324,
"learning_rate": 3.282001098007462e-05,
"loss": 0.00627792626619339,
"mean_token_accuracy": 0.9977674365043641,
"num_tokens": 97023802.0,
"step": 4690
},
{
"entropy": 0.7360162317752839,
"epoch": 6.325706594885599,
"grad_norm": 0.38745999336242676,
"learning_rate": 3.261117644030412e-05,
"loss": 0.00608489103615284,
"mean_token_accuracy": 0.997675096988678,
"num_tokens": 97230943.0,
"step": 4700
},
{
"entropy": 0.7425191223621368,
"epoch": 6.339165545087483,
"grad_norm": 0.39276620745658875,
"learning_rate": 3.240268641377694e-05,
"loss": 0.0066022701561450955,
"mean_token_accuracy": 0.9976878345012665,
"num_tokens": 97437362.0,
"step": 4710
},
{
"entropy": 0.7288519084453583,
"epoch": 6.352624495289367,
"grad_norm": 0.3171084225177765,
"learning_rate": 3.2194545031167866e-05,
"loss": 0.00590059943497181,
"mean_token_accuracy": 0.9977839410305023,
"num_tokens": 97644977.0,
"step": 4720
},
{
"entropy": 0.7246840715408325,
"epoch": 6.366083445491252,
"grad_norm": 0.5965446829795837,
"learning_rate": 3.1986756416244245e-05,
"loss": 0.0069613344967365265,
"mean_token_accuracy": 0.9974348545074463,
"num_tokens": 97851595.0,
"step": 4730
},
{
"entropy": 0.731091046333313,
"epoch": 6.379542395693136,
"grad_norm": 0.4378451108932495,
"learning_rate": 3.177932468578426e-05,
"loss": 0.0070390477776527405,
"mean_token_accuracy": 0.997273427248001,
"num_tokens": 98058641.0,
"step": 4740
},
{
"entropy": 0.7363841950893402,
"epoch": 6.39300134589502,
"grad_norm": 0.42163100838661194,
"learning_rate": 3.157225394949542e-05,
"loss": 0.00652310848236084,
"mean_token_accuracy": 0.99771688580513,
"num_tokens": 98265432.0,
"step": 4750
},
{
"entropy": 0.748940235376358,
"epoch": 6.406460296096904,
"grad_norm": 0.48964178562164307,
"learning_rate": 3.136554830993304e-05,
"loss": 0.007442094385623932,
"mean_token_accuracy": 0.996977812051773,
"num_tokens": 98472309.0,
"step": 4760
},
{
"entropy": 0.7387059688568115,
"epoch": 6.419919246298789,
"grad_norm": 0.4443625211715698,
"learning_rate": 3.115921186241906e-05,
"loss": 0.0067341111600399016,
"mean_token_accuracy": 0.9978115797042847,
"num_tokens": 98679369.0,
"step": 4770
},
{
"entropy": 0.7278500914573669,
"epoch": 6.433378196500673,
"grad_norm": 0.31731724739074707,
"learning_rate": 3.0953248694960824e-05,
"loss": 0.006909257918596268,
"mean_token_accuracy": 0.9976158678531647,
"num_tokens": 98886649.0,
"step": 4780
},
{
"entropy": 0.7249270141124725,
"epoch": 6.446837146702557,
"grad_norm": 0.5315210223197937,
"learning_rate": 3.0747662888170146e-05,
"loss": 0.006646881997585297,
"mean_token_accuracy": 0.9975297033786774,
"num_tokens": 99093341.0,
"step": 4790
},
{
"entropy": 0.7275424122810363,
"epoch": 6.460296096904441,
"grad_norm": 0.36423051357269287,
"learning_rate": 3.054245851518246e-05,
"loss": 0.006122813746333122,
"mean_token_accuracy": 0.9977832078933716,
"num_tokens": 99300096.0,
"step": 4800
},
{
"entropy": 0.7336158037185669,
"epoch": 6.473755047106326,
"grad_norm": 0.5945335626602173,
"learning_rate": 3.0337639641576065e-05,
"loss": 0.006734760850667954,
"mean_token_accuracy": 0.9974972426891326,
"num_tokens": 99506798.0,
"step": 4810
},
{
"entropy": 0.7330217719078064,
"epoch": 6.48721399730821,
"grad_norm": 0.4753057360649109,
"learning_rate": 3.0133210325291662e-05,
"loss": 0.0071483604609966275,
"mean_token_accuracy": 0.9972701132297516,
"num_tokens": 99713316.0,
"step": 4820
},
{
"entropy": 0.7293879866600037,
"epoch": 6.500672947510094,
"grad_norm": 0.44765713810920715,
"learning_rate": 2.9929174616551857e-05,
"loss": 0.0069991335272789005,
"mean_token_accuracy": 0.9972541630268097,
"num_tokens": 99920106.0,
"step": 4830
},
{
"entropy": 0.7240224123001099,
"epoch": 6.514131897711978,
"grad_norm": 0.40643468499183655,
"learning_rate": 2.9725536557781008e-05,
"loss": 0.006019005179405212,
"mean_token_accuracy": 0.9977357804775238,
"num_tokens": 100127187.0,
"step": 4840
},
{
"entropy": 0.7268623948097229,
"epoch": 6.527590847913863,
"grad_norm": 0.4088582396507263,
"learning_rate": 2.9522300183525097e-05,
"loss": 0.006773443520069122,
"mean_token_accuracy": 0.9972230792045593,
"num_tokens": 100334160.0,
"step": 4850
},
{
"entropy": 0.7315291166305542,
"epoch": 6.541049798115747,
"grad_norm": 0.3768236041069031,
"learning_rate": 2.931946952037179e-05,
"loss": 0.006892500817775727,
"mean_token_accuracy": 0.9973698258399963,
"num_tokens": 100541088.0,
"step": 4860
},
{
"entropy": 0.7324226260185241,
"epoch": 6.554508748317631,
"grad_norm": 0.5730559825897217,
"learning_rate": 2.9117048586870654e-05,
"loss": 0.007101823389530182,
"mean_token_accuracy": 0.9975831270217895,
"num_tokens": 100748086.0,
"step": 4870
},
{
"entropy": 0.7297871172428131,
"epoch": 6.5679676985195155,
"grad_norm": 0.42341160774230957,
"learning_rate": 2.891504139345358e-05,
"loss": 0.0055365487933158875,
"mean_token_accuracy": 0.9979402482509613,
"num_tokens": 100954751.0,
"step": 4880
},
{
"entropy": 0.7403721213340759,
"epoch": 6.5814266487214,
"grad_norm": 0.6541618704795837,
"learning_rate": 2.8713451942355285e-05,
"loss": 0.0077784605324268345,
"mean_token_accuracy": 0.9971624433994293,
"num_tokens": 101161323.0,
"step": 4890
},
{
"entropy": 0.7294679641723633,
"epoch": 6.594885598923284,
"grad_norm": 0.5061529278755188,
"learning_rate": 2.8512284227534027e-05,
"loss": 0.0068440020084381105,
"mean_token_accuracy": 0.9972440838813782,
"num_tokens": 101369013.0,
"step": 4900
},
{
"entropy": 0.7254895687103271,
"epoch": 6.608344549125168,
"grad_norm": 0.6346190571784973,
"learning_rate": 2.8311542234592497e-05,
"loss": 0.006360020488500595,
"mean_token_accuracy": 0.9975525736808777,
"num_tokens": 101575775.0,
"step": 4910
},
{
"entropy": 0.7256027460098267,
"epoch": 6.6218034993270525,
"grad_norm": 0.5224611163139343,
"learning_rate": 2.8111229940698842e-05,
"loss": 0.0073788858950138096,
"mean_token_accuracy": 0.9969935595989228,
"num_tokens": 101782675.0,
"step": 4920
},
{
"entropy": 0.7316436052322388,
"epoch": 6.635262449528937,
"grad_norm": 0.49549564719200134,
"learning_rate": 2.791135131450785e-05,
"loss": 0.007514142990112304,
"mean_token_accuracy": 0.9972495913505555,
"num_tokens": 101989297.0,
"step": 4930
},
{
"entropy": 0.7356618404388428,
"epoch": 6.648721399730821,
"grad_norm": 0.40386277437210083,
"learning_rate": 2.7711910316082357e-05,
"loss": 0.007244438678026199,
"mean_token_accuracy": 0.9973312318325043,
"num_tokens": 102195867.0,
"step": 4940
},
{
"entropy": 0.7294074118137359,
"epoch": 6.662180349932705,
"grad_norm": 0.26524385809898376,
"learning_rate": 2.7512910896814747e-05,
"loss": 0.006912172585725784,
"mean_token_accuracy": 0.9975458741188049,
"num_tokens": 102402820.0,
"step": 4950
},
{
"entropy": 0.7264546751976013,
"epoch": 6.6756393001345895,
"grad_norm": 0.3604874908924103,
"learning_rate": 2.7314356999348713e-05,
"loss": 0.006820497661828994,
"mean_token_accuracy": 0.9975083887577056,
"num_tokens": 102609492.0,
"step": 4960
},
{
"entropy": 0.7235826432704926,
"epoch": 6.689098250336474,
"grad_norm": 0.4252055883407593,
"learning_rate": 2.711625255750111e-05,
"loss": 0.006822900474071502,
"mean_token_accuracy": 0.9974255442619324,
"num_tokens": 102816432.0,
"step": 4970
},
{
"entropy": 0.7244568645954133,
"epoch": 6.702557200538358,
"grad_norm": 0.4580247104167938,
"learning_rate": 2.691860149618402e-05,
"loss": 0.00701884925365448,
"mean_token_accuracy": 0.9972208499908447,
"num_tokens": 103023278.0,
"step": 4980
},
{
"entropy": 0.727864944934845,
"epoch": 6.716016150740242,
"grad_norm": 0.35556018352508545,
"learning_rate": 2.6721407731327004e-05,
"loss": 0.006804829835891724,
"mean_token_accuracy": 0.9975570499897003,
"num_tokens": 103230198.0,
"step": 4990
},
{
"entropy": 0.7289696097373962,
"epoch": 6.7294751009421265,
"grad_norm": 0.357962042093277,
"learning_rate": 2.6524675169799506e-05,
"loss": 0.006180135905742646,
"mean_token_accuracy": 0.9977405965328217,
"num_tokens": 103437417.0,
"step": 5000
},
{
"entropy": 0.7292522251605987,
"epoch": 6.742934051144011,
"grad_norm": 0.3891056180000305,
"learning_rate": 2.6328407709333463e-05,
"loss": 0.005779655277729034,
"mean_token_accuracy": 0.9977752268314362,
"num_tokens": 103644773.0,
"step": 5010
},
{
"entropy": 0.7364232182502747,
"epoch": 6.756393001345895,
"grad_norm": 0.4316810369491577,
"learning_rate": 2.6132609238446072e-05,
"loss": 0.006378757953643799,
"mean_token_accuracy": 0.9975412786006927,
"num_tokens": 103851437.0,
"step": 5020
},
{
"entropy": 0.7294283807277679,
"epoch": 6.769851951547779,
"grad_norm": 0.28501173853874207,
"learning_rate": 2.5937283636362724e-05,
"loss": 0.006514771282672882,
"mean_token_accuracy": 0.9976726651191712,
"num_tokens": 104058632.0,
"step": 5030
},
{
"entropy": 0.7193506896495819,
"epoch": 6.783310901749664,
"grad_norm": 0.4643396735191345,
"learning_rate": 2.5742434772940216e-05,
"loss": 0.006748485565185547,
"mean_token_accuracy": 0.9975197494029999,
"num_tokens": 104265547.0,
"step": 5040
},
{
"entropy": 0.7154334783554077,
"epoch": 6.796769851951548,
"grad_norm": 0.6031412482261658,
"learning_rate": 2.5548066508590007e-05,
"loss": 0.0062897935509681705,
"mean_token_accuracy": 0.9975806593894958,
"num_tokens": 104472797.0,
"step": 5050
},
{
"entropy": 0.7141842901706695,
"epoch": 6.810228802153432,
"grad_norm": 0.3686530292034149,
"learning_rate": 2.535418269420178e-05,
"loss": 0.006743116676807404,
"mean_token_accuracy": 0.9975346326828003,
"num_tokens": 104679303.0,
"step": 5060
},
{
"entropy": 0.7162917137145997,
"epoch": 6.823687752355316,
"grad_norm": 0.3937174081802368,
"learning_rate": 2.5160787171067126e-05,
"loss": 0.006931714713573456,
"mean_token_accuracy": 0.9975650846958161,
"num_tokens": 104886129.0,
"step": 5070
},
{
"entropy": 0.7184829175472259,
"epoch": 6.837146702557201,
"grad_norm": 0.3974623382091522,
"learning_rate": 2.4967883770803413e-05,
"loss": 0.006391709297895431,
"mean_token_accuracy": 0.9977280080318451,
"num_tokens": 105092511.0,
"step": 5080
},
{
"entropy": 0.7110052049160004,
"epoch": 6.850605652759085,
"grad_norm": 0.3725025951862335,
"learning_rate": 2.477547631527799e-05,
"loss": 0.006394396722316742,
"mean_token_accuracy": 0.9976818323135376,
"num_tokens": 105299187.0,
"step": 5090
},
{
"entropy": 0.7120427250862121,
"epoch": 6.864064602960969,
"grad_norm": 0.43178483843803406,
"learning_rate": 2.45835686165323e-05,
"loss": 0.006332089006900787,
"mean_token_accuracy": 0.9976500451564789,
"num_tokens": 105505571.0,
"step": 5100
},
{
"entropy": 0.7212999403476715,
"epoch": 6.877523553162853,
"grad_norm": 0.4976682960987091,
"learning_rate": 2.4392164476706468e-05,
"loss": 0.005901779979467392,
"mean_token_accuracy": 0.9980143666267395,
"num_tokens": 105712714.0,
"step": 5110
},
{
"entropy": 0.7251917839050293,
"epoch": 6.890982503364738,
"grad_norm": 0.37038931250572205,
"learning_rate": 2.4201267687963935e-05,
"loss": 0.006622254848480225,
"mean_token_accuracy": 0.9975569725036622,
"num_tokens": 105919685.0,
"step": 5120
},
{
"entropy": 0.7244008183479309,
"epoch": 6.904441453566622,
"grad_norm": 0.4485451877117157,
"learning_rate": 2.4010882032416332e-05,
"loss": 0.00556907095015049,
"mean_token_accuracy": 0.9980982482433319,
"num_tokens": 106126360.0,
"step": 5130
},
{
"entropy": 0.7264035522937775,
"epoch": 6.917900403768506,
"grad_norm": 0.3821909427642822,
"learning_rate": 2.3821011282048545e-05,
"loss": 0.005761538445949554,
"mean_token_accuracy": 0.9980070352554321,
"num_tokens": 106332315.0,
"step": 5140
},
{
"entropy": 0.7214965403079987,
"epoch": 6.93135935397039,
"grad_norm": 0.5453296899795532,
"learning_rate": 2.3631659198643985e-05,
"loss": 0.006120505183935166,
"mean_token_accuracy": 0.9977015674114227,
"num_tokens": 106539439.0,
"step": 5150
},
{
"entropy": 0.7204740345478058,
"epoch": 6.944818304172275,
"grad_norm": 0.5277069807052612,
"learning_rate": 2.344282953371006e-05,
"loss": 0.0063091211020946504,
"mean_token_accuracy": 0.99763303399086,
"num_tokens": 106746217.0,
"step": 5160
},
{
"entropy": 0.7176619052886963,
"epoch": 6.958277254374159,
"grad_norm": 0.4055790901184082,
"learning_rate": 2.325452602840385e-05,
"loss": 0.005276233702898025,
"mean_token_accuracy": 0.9980817437171936,
"num_tokens": 106953032.0,
"step": 5170
},
{
"entropy": 0.7127950310707092,
"epoch": 6.971736204576043,
"grad_norm": 0.336780309677124,
"learning_rate": 2.306675241345797e-05,
"loss": 0.005725700408220291,
"mean_token_accuracy": 0.9980043768882751,
"num_tokens": 107160130.0,
"step": 5180
},
{
"entropy": 0.715798842906952,
"epoch": 6.985195154777927,
"grad_norm": 0.38042059540748596,
"learning_rate": 2.287951240910668e-05,
"loss": 0.0058772068470716475,
"mean_token_accuracy": 0.9980284452438355,
"num_tokens": 107367185.0,
"step": 5190
},
{
"entropy": 0.7066363096237183,
"epoch": 6.998654104979812,
"grad_norm": 0.44882187247276306,
"learning_rate": 2.269280972501217e-05,
"loss": 0.005587458610534668,
"mean_token_accuracy": 0.9979583859443665,
"num_tokens": 107574860.0,
"step": 5200
},
{
"epoch": 7.0,
"eval_entropy": 0.7108410179235374,
"eval_loss": 0.05958922207355499,
"eval_mean_token_accuracy": 0.9847923464076535,
"eval_num_tokens": 107595742.0,
"eval_runtime": 13.9793,
"eval_samples_per_second": 357.671,
"eval_steps_per_second": 11.231,
"step": 5201
}
],
"logging_steps": 10,
"max_steps": 7430,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.280718172519924e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}