Qwen3-4B-Answer-Agent / trainer_state.json
a-k-aAiMGoD's picture
Upload folder using huggingface_hub
dacda63 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 24.934228187919462,
"eval_steps": 500,
"global_step": 9300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.026845637583892617,
"grad_norm": 3.416386604309082,
"learning_rate": 3.2258064516129035e-07,
"loss": 2.4117,
"mean_token_accuracy": 0.6175116240978241,
"num_tokens": 24255.0,
"step": 10
},
{
"epoch": 0.053691275167785234,
"grad_norm": 3.772399663925171,
"learning_rate": 6.810035842293908e-07,
"loss": 2.4628,
"mean_token_accuracy": 0.6110778599977493,
"num_tokens": 47320.0,
"step": 20
},
{
"epoch": 0.08053691275167785,
"grad_norm": 3.545194625854492,
"learning_rate": 1.039426523297491e-06,
"loss": 2.5032,
"mean_token_accuracy": 0.6162596017122268,
"num_tokens": 69404.0,
"step": 30
},
{
"epoch": 0.10738255033557047,
"grad_norm": 3.8445169925689697,
"learning_rate": 1.3978494623655913e-06,
"loss": 2.655,
"mean_token_accuracy": 0.5948106974363327,
"num_tokens": 90245.0,
"step": 40
},
{
"epoch": 0.1342281879194631,
"grad_norm": 3.7103028297424316,
"learning_rate": 1.7562724014336918e-06,
"loss": 2.7566,
"mean_token_accuracy": 0.5874058037996293,
"num_tokens": 109437.0,
"step": 50
},
{
"epoch": 0.1610738255033557,
"grad_norm": 3.137907028198242,
"learning_rate": 2.1146953405017924e-06,
"loss": 2.3534,
"mean_token_accuracy": 0.6221999943256378,
"num_tokens": 133516.0,
"step": 60
},
{
"epoch": 0.18791946308724833,
"grad_norm": 3.168318748474121,
"learning_rate": 2.4731182795698927e-06,
"loss": 2.3546,
"mean_token_accuracy": 0.6222240030765533,
"num_tokens": 156399.0,
"step": 70
},
{
"epoch": 0.21476510067114093,
"grad_norm": 2.6762025356292725,
"learning_rate": 2.831541218637993e-06,
"loss": 2.2976,
"mean_token_accuracy": 0.6325597822666168,
"num_tokens": 178233.0,
"step": 80
},
{
"epoch": 0.24161073825503357,
"grad_norm": 2.528851270675659,
"learning_rate": 3.1899641577060937e-06,
"loss": 2.3289,
"mean_token_accuracy": 0.6177177101373672,
"num_tokens": 198867.0,
"step": 90
},
{
"epoch": 0.2684563758389262,
"grad_norm": 2.180152654647827,
"learning_rate": 3.548387096774194e-06,
"loss": 2.2287,
"mean_token_accuracy": 0.6231148332357407,
"num_tokens": 218120.0,
"step": 100
},
{
"epoch": 0.2953020134228188,
"grad_norm": 1.8343068361282349,
"learning_rate": 3.906810035842294e-06,
"loss": 1.8435,
"mean_token_accuracy": 0.6591462314128875,
"num_tokens": 242238.0,
"step": 110
},
{
"epoch": 0.3221476510067114,
"grad_norm": 1.6624016761779785,
"learning_rate": 4.265232974910394e-06,
"loss": 1.6839,
"mean_token_accuracy": 0.6699382126331329,
"num_tokens": 265176.0,
"step": 120
},
{
"epoch": 0.348993288590604,
"grad_norm": 1.4153294563293457,
"learning_rate": 4.623655913978495e-06,
"loss": 1.4978,
"mean_token_accuracy": 0.6955495923757553,
"num_tokens": 287280.0,
"step": 130
},
{
"epoch": 0.37583892617449666,
"grad_norm": 1.4868972301483154,
"learning_rate": 4.982078853046595e-06,
"loss": 1.37,
"mean_token_accuracy": 0.7068142563104629,
"num_tokens": 308485.0,
"step": 140
},
{
"epoch": 0.40268456375838924,
"grad_norm": 1.5649815797805786,
"learning_rate": 5.340501792114696e-06,
"loss": 1.1729,
"mean_token_accuracy": 0.7447071671485901,
"num_tokens": 327940.0,
"step": 150
},
{
"epoch": 0.42953020134228187,
"grad_norm": 1.2401491403579712,
"learning_rate": 5.698924731182796e-06,
"loss": 0.9397,
"mean_token_accuracy": 0.7840387046337127,
"num_tokens": 352050.0,
"step": 160
},
{
"epoch": 0.4563758389261745,
"grad_norm": 1.1661632061004639,
"learning_rate": 6.057347670250897e-06,
"loss": 0.7066,
"mean_token_accuracy": 0.8346484929323197,
"num_tokens": 374886.0,
"step": 170
},
{
"epoch": 0.48322147651006714,
"grad_norm": 1.1714835166931152,
"learning_rate": 6.415770609318996e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8801979720592499,
"num_tokens": 396874.0,
"step": 180
},
{
"epoch": 0.5100671140939598,
"grad_norm": 0.5148192048072815,
"learning_rate": 6.774193548387097e-06,
"loss": 0.4031,
"mean_token_accuracy": 0.9046478867530823,
"num_tokens": 417713.0,
"step": 190
},
{
"epoch": 0.5369127516778524,
"grad_norm": 0.43239253759384155,
"learning_rate": 7.1326164874551975e-06,
"loss": 0.3182,
"mean_token_accuracy": 0.9254436582326889,
"num_tokens": 437046.0,
"step": 200
},
{
"epoch": 0.5637583892617449,
"grad_norm": 0.417090505361557,
"learning_rate": 7.491039426523297e-06,
"loss": 0.3797,
"mean_token_accuracy": 0.9017556846141815,
"num_tokens": 461134.0,
"step": 210
},
{
"epoch": 0.5906040268456376,
"grad_norm": 0.3316299617290497,
"learning_rate": 7.849462365591398e-06,
"loss": 0.304,
"mean_token_accuracy": 0.9168222814798355,
"num_tokens": 484137.0,
"step": 220
},
{
"epoch": 0.6174496644295302,
"grad_norm": 0.3776572644710541,
"learning_rate": 8.207885304659498e-06,
"loss": 0.2544,
"mean_token_accuracy": 0.9300281196832657,
"num_tokens": 506286.0,
"step": 230
},
{
"epoch": 0.6442953020134228,
"grad_norm": 0.5413280725479126,
"learning_rate": 8.5663082437276e-06,
"loss": 0.2859,
"mean_token_accuracy": 0.9251022487878799,
"num_tokens": 527336.0,
"step": 240
},
{
"epoch": 0.6711409395973155,
"grad_norm": 0.5370568037033081,
"learning_rate": 8.9247311827957e-06,
"loss": 0.2791,
"mean_token_accuracy": 0.9306178212165832,
"num_tokens": 546762.0,
"step": 250
},
{
"epoch": 0.697986577181208,
"grad_norm": 0.4821404218673706,
"learning_rate": 9.2831541218638e-06,
"loss": 0.2656,
"mean_token_accuracy": 0.9255098283290863,
"num_tokens": 570752.0,
"step": 260
},
{
"epoch": 0.7248322147651006,
"grad_norm": 0.49469488859176636,
"learning_rate": 9.641577060931901e-06,
"loss": 0.2273,
"mean_token_accuracy": 0.9351761728525162,
"num_tokens": 593533.0,
"step": 270
},
{
"epoch": 0.7516778523489933,
"grad_norm": 0.5606919527053833,
"learning_rate": 1e-05,
"loss": 0.2094,
"mean_token_accuracy": 0.9411542862653732,
"num_tokens": 615365.0,
"step": 280
},
{
"epoch": 0.7785234899328859,
"grad_norm": 0.39467954635620117,
"learning_rate": 9.999969679947463e-06,
"loss": 0.2808,
"mean_token_accuracy": 0.9259979039430618,
"num_tokens": 635705.0,
"step": 290
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.29947659373283386,
"learning_rate": 9.999878720157571e-06,
"loss": 0.21,
"mean_token_accuracy": 0.9384137988090515,
"num_tokens": 654906.0,
"step": 300
},
{
"epoch": 0.8322147651006712,
"grad_norm": 0.3251740634441376,
"learning_rate": 9.99972712173349e-06,
"loss": 0.2109,
"mean_token_accuracy": 0.9341928958892822,
"num_tokens": 679030.0,
"step": 310
},
{
"epoch": 0.8590604026845637,
"grad_norm": 0.2648128271102905,
"learning_rate": 9.999514886513808e-06,
"loss": 0.192,
"mean_token_accuracy": 0.9401267766952515,
"num_tokens": 701960.0,
"step": 320
},
{
"epoch": 0.8859060402684564,
"grad_norm": 0.289673775434494,
"learning_rate": 9.999242017072517e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.9540554910898209,
"num_tokens": 724008.0,
"step": 330
},
{
"epoch": 0.912751677852349,
"grad_norm": 0.3500693440437317,
"learning_rate": 9.998908516718984e-06,
"loss": 0.24,
"mean_token_accuracy": 0.9336700767278672,
"num_tokens": 744883.0,
"step": 340
},
{
"epoch": 0.9395973154362416,
"grad_norm": 0.285081684589386,
"learning_rate": 9.998514389497907e-06,
"loss": 0.2127,
"mean_token_accuracy": 0.9403306484222412,
"num_tokens": 764242.0,
"step": 350
},
{
"epoch": 0.9664429530201343,
"grad_norm": 0.5001574754714966,
"learning_rate": 9.99805964018927e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.944737109541893,
"num_tokens": 787542.0,
"step": 360
},
{
"epoch": 0.9932885906040269,
"grad_norm": 0.4394875764846802,
"learning_rate": 9.997544274308282e-06,
"loss": 0.2054,
"mean_token_accuracy": 0.9406224071979523,
"num_tokens": 808350.0,
"step": 370
},
{
"epoch": 1.018791946308725,
"grad_norm": 0.3800680637359619,
"learning_rate": 9.996968298105313e-06,
"loss": 0.1784,
"mean_token_accuracy": 0.9486302858904788,
"num_tokens": 829396.0,
"step": 380
},
{
"epoch": 1.0456375838926175,
"grad_norm": 0.29024776816368103,
"learning_rate": 9.996331718565812e-06,
"loss": 0.1667,
"mean_token_accuracy": 0.9462663173675537,
"num_tokens": 852700.0,
"step": 390
},
{
"epoch": 1.07248322147651,
"grad_norm": 0.3312149941921234,
"learning_rate": 9.995634543410231e-06,
"loss": 0.1696,
"mean_token_accuracy": 0.9464462220668792,
"num_tokens": 875028.0,
"step": 400
},
{
"epoch": 1.0993288590604027,
"grad_norm": 0.41938960552215576,
"learning_rate": 9.994876781093923e-06,
"loss": 0.2146,
"mean_token_accuracy": 0.937797623872757,
"num_tokens": 896237.0,
"step": 410
},
{
"epoch": 1.1261744966442953,
"grad_norm": 0.3066196143627167,
"learning_rate": 9.994058440807047e-06,
"loss": 0.2241,
"mean_token_accuracy": 0.9370527178049087,
"num_tokens": 915897.0,
"step": 420
},
{
"epoch": 1.1530201342281878,
"grad_norm": 0.3139925003051758,
"learning_rate": 9.99317953247445e-06,
"loss": 0.1757,
"mean_token_accuracy": 0.9461874514818192,
"num_tokens": 938499.0,
"step": 430
},
{
"epoch": 1.1798657718120806,
"grad_norm": 0.3327377438545227,
"learning_rate": 9.992240066755554e-06,
"loss": 0.181,
"mean_token_accuracy": 0.9421871662139892,
"num_tokens": 961628.0,
"step": 440
},
{
"epoch": 1.2067114093959732,
"grad_norm": 0.30847689509391785,
"learning_rate": 9.991240055044214e-06,
"loss": 0.1556,
"mean_token_accuracy": 0.9528691500425339,
"num_tokens": 983811.0,
"step": 450
},
{
"epoch": 1.2335570469798658,
"grad_norm": 0.4445633292198181,
"learning_rate": 9.990179509468595e-06,
"loss": 0.2034,
"mean_token_accuracy": 0.9405299901962281,
"num_tokens": 1005030.0,
"step": 460
},
{
"epoch": 1.2604026845637584,
"grad_norm": 0.35106924176216125,
"learning_rate": 9.989058442891018e-06,
"loss": 0.2153,
"mean_token_accuracy": 0.93738272190094,
"num_tokens": 1024629.0,
"step": 470
},
{
"epoch": 1.287248322147651,
"grad_norm": 0.30040764808654785,
"learning_rate": 9.9878768689078e-06,
"loss": 0.1688,
"mean_token_accuracy": 0.9484542042016983,
"num_tokens": 1047241.0,
"step": 480
},
{
"epoch": 1.3140939597315437,
"grad_norm": 0.29934918880462646,
"learning_rate": 9.986634801849093e-06,
"loss": 0.1672,
"mean_token_accuracy": 0.9444483011960983,
"num_tokens": 1070350.0,
"step": 490
},
{
"epoch": 1.3409395973154363,
"grad_norm": 0.3902275860309601,
"learning_rate": 9.985332256778719e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.9500592857599258,
"num_tokens": 1092615.0,
"step": 500
},
{
"epoch": 1.367785234899329,
"grad_norm": 0.4580014944076538,
"learning_rate": 9.983969249493964e-06,
"loss": 0.1825,
"mean_token_accuracy": 0.9451580941677094,
"num_tokens": 1114038.0,
"step": 510
},
{
"epoch": 1.3946308724832215,
"grad_norm": 0.3057954013347626,
"learning_rate": 9.982545796525416e-06,
"loss": 0.2153,
"mean_token_accuracy": 0.9356414407491684,
"num_tokens": 1133791.0,
"step": 520
},
{
"epoch": 1.421476510067114,
"grad_norm": 0.317940890789032,
"learning_rate": 9.981061915136737e-06,
"loss": 0.1642,
"mean_token_accuracy": 0.950609314441681,
"num_tokens": 1156476.0,
"step": 530
},
{
"epoch": 1.4483221476510066,
"grad_norm": 0.29728659987449646,
"learning_rate": 9.979517623324475e-06,
"loss": 0.1585,
"mean_token_accuracy": 0.9496109008789062,
"num_tokens": 1179818.0,
"step": 540
},
{
"epoch": 1.4751677852348992,
"grad_norm": 0.3028015196323395,
"learning_rate": 9.977912939817833e-06,
"loss": 0.1547,
"mean_token_accuracy": 0.9496413081884384,
"num_tokens": 1202050.0,
"step": 550
},
{
"epoch": 1.5020134228187918,
"grad_norm": 0.5277287364006042,
"learning_rate": 9.976247884078445e-06,
"loss": 0.1941,
"mean_token_accuracy": 0.9424160838127136,
"num_tokens": 1223368.0,
"step": 560
},
{
"epoch": 1.5288590604026846,
"grad_norm": 0.34933868050575256,
"learning_rate": 9.974522476300144e-06,
"loss": 0.2241,
"mean_token_accuracy": 0.9350442677736283,
"num_tokens": 1243174.0,
"step": 570
},
{
"epoch": 1.5557046979865772,
"grad_norm": 0.34490832686424255,
"learning_rate": 9.97273673740871e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.9518986463546752,
"num_tokens": 1265937.0,
"step": 580
},
{
"epoch": 1.5825503355704698,
"grad_norm": 0.3113323748111725,
"learning_rate": 9.970890689061622e-06,
"loss": 0.1523,
"mean_token_accuracy": 0.9522456258535386,
"num_tokens": 1289186.0,
"step": 590
},
{
"epoch": 1.6093959731543626,
"grad_norm": 0.2945059835910797,
"learning_rate": 9.968984353647796e-06,
"loss": 0.1594,
"mean_token_accuracy": 0.950118288397789,
"num_tokens": 1311437.0,
"step": 600
},
{
"epoch": 1.6362416107382551,
"grad_norm": 0.40153175592422485,
"learning_rate": 9.967017754287303e-06,
"loss": 0.1924,
"mean_token_accuracy": 0.9425857335329055,
"num_tokens": 1332682.0,
"step": 610
},
{
"epoch": 1.6630872483221477,
"grad_norm": 0.31573766469955444,
"learning_rate": 9.964990914831104e-06,
"loss": 0.2142,
"mean_token_accuracy": 0.93615363240242,
"num_tokens": 1352512.0,
"step": 620
},
{
"epoch": 1.6899328859060403,
"grad_norm": 0.33446043729782104,
"learning_rate": 9.96290385986075e-06,
"loss": 0.154,
"mean_token_accuracy": 0.9519079566001892,
"num_tokens": 1375202.0,
"step": 630
},
{
"epoch": 1.7167785234899329,
"grad_norm": 0.2740935683250427,
"learning_rate": 9.960756614688089e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.9505283504724502,
"num_tokens": 1398311.0,
"step": 640
},
{
"epoch": 1.7436241610738255,
"grad_norm": 0.2914665639400482,
"learning_rate": 9.958549205354956e-06,
"loss": 0.1472,
"mean_token_accuracy": 0.9522065281867981,
"num_tokens": 1420467.0,
"step": 650
},
{
"epoch": 1.770469798657718,
"grad_norm": 0.48217689990997314,
"learning_rate": 9.956281658632856e-06,
"loss": 0.1726,
"mean_token_accuracy": 0.9457607984542846,
"num_tokens": 1441750.0,
"step": 660
},
{
"epoch": 1.7973154362416106,
"grad_norm": 0.3143758773803711,
"learning_rate": 9.953954002022643e-06,
"loss": 0.2142,
"mean_token_accuracy": 0.9394429564476013,
"num_tokens": 1461475.0,
"step": 670
},
{
"epoch": 1.8241610738255034,
"grad_norm": 0.32394182682037354,
"learning_rate": 9.951566263754184e-06,
"loss": 0.1495,
"mean_token_accuracy": 0.9530263602733612,
"num_tokens": 1484202.0,
"step": 680
},
{
"epoch": 1.851006711409396,
"grad_norm": 0.32636895775794983,
"learning_rate": 9.949118472786024e-06,
"loss": 0.1461,
"mean_token_accuracy": 0.9514715582132339,
"num_tokens": 1507538.0,
"step": 690
},
{
"epoch": 1.8778523489932886,
"grad_norm": 0.2944177985191345,
"learning_rate": 9.946610658805018e-06,
"loss": 0.1629,
"mean_token_accuracy": 0.949513065814972,
"num_tokens": 1529841.0,
"step": 700
},
{
"epoch": 1.9046979865771814,
"grad_norm": 0.36996525526046753,
"learning_rate": 9.944042852225991e-06,
"loss": 0.1742,
"mean_token_accuracy": 0.9453697711229324,
"num_tokens": 1551120.0,
"step": 710
},
{
"epoch": 1.931543624161074,
"grad_norm": 0.3420737087726593,
"learning_rate": 9.94141508419135e-06,
"loss": 0.1999,
"mean_token_accuracy": 0.9439471960067749,
"num_tokens": 1570757.0,
"step": 720
},
{
"epoch": 1.9583892617449665,
"grad_norm": 0.3249684274196625,
"learning_rate": 9.938727386570727e-06,
"loss": 0.144,
"mean_token_accuracy": 0.9547680050134659,
"num_tokens": 1593057.0,
"step": 730
},
{
"epoch": 1.985234899328859,
"grad_norm": 0.39704540371894836,
"learning_rate": 9.935979791960571e-06,
"loss": 0.191,
"mean_token_accuracy": 0.9425975173711777,
"num_tokens": 1614741.0,
"step": 740
},
{
"epoch": 2.010738255033557,
"grad_norm": 0.29178890585899353,
"learning_rate": 9.933172333683768e-06,
"loss": 0.1716,
"mean_token_accuracy": 0.9465354273193761,
"num_tokens": 1634588.0,
"step": 750
},
{
"epoch": 2.03758389261745,
"grad_norm": 0.2685829699039459,
"learning_rate": 9.93030504578923e-06,
"loss": 0.1492,
"mean_token_accuracy": 0.9501548141241074,
"num_tokens": 1658163.0,
"step": 760
},
{
"epoch": 2.0644295302013425,
"grad_norm": 0.2868586778640747,
"learning_rate": 9.927377963051488e-06,
"loss": 0.148,
"mean_token_accuracy": 0.9499404489994049,
"num_tokens": 1680633.0,
"step": 770
},
{
"epoch": 2.091275167785235,
"grad_norm": 0.3987250030040741,
"learning_rate": 9.924391120970262e-06,
"loss": 0.1737,
"mean_token_accuracy": 0.9475592643022537,
"num_tokens": 1702264.0,
"step": 780
},
{
"epoch": 2.1181208053691276,
"grad_norm": 0.330563485622406,
"learning_rate": 9.921344555770033e-06,
"loss": 0.2076,
"mean_token_accuracy": 0.9379633396863938,
"num_tokens": 1722321.0,
"step": 790
},
{
"epoch": 2.14496644295302,
"grad_norm": 0.32567116618156433,
"learning_rate": 9.91823830439961e-06,
"loss": 0.1545,
"mean_token_accuracy": 0.9527941286563874,
"num_tokens": 1743586.0,
"step": 800
},
{
"epoch": 2.1718120805369128,
"grad_norm": 0.280172735452652,
"learning_rate": 9.915072404531675e-06,
"loss": 0.1337,
"mean_token_accuracy": 0.9554655253887177,
"num_tokens": 1767189.0,
"step": 810
},
{
"epoch": 2.1986577181208053,
"grad_norm": 0.37030693888664246,
"learning_rate": 9.911846894562325e-06,
"loss": 0.1634,
"mean_token_accuracy": 0.9463264971971512,
"num_tokens": 1789747.0,
"step": 820
},
{
"epoch": 2.225503355704698,
"grad_norm": 0.38652828335762024,
"learning_rate": 9.908561813610615e-06,
"loss": 0.1542,
"mean_token_accuracy": 0.9503043502569198,
"num_tokens": 1811448.0,
"step": 830
},
{
"epoch": 2.2523489932885905,
"grad_norm": 0.31967300176620483,
"learning_rate": 9.905217201518079e-06,
"loss": 0.2012,
"mean_token_accuracy": 0.9394483864307404,
"num_tokens": 1831513.0,
"step": 840
},
{
"epoch": 2.279194630872483,
"grad_norm": 0.3107980489730835,
"learning_rate": 9.901813098848238e-06,
"loss": 0.1536,
"mean_token_accuracy": 0.9533493936061859,
"num_tokens": 1852698.0,
"step": 850
},
{
"epoch": 2.3060402684563757,
"grad_norm": 0.28780773282051086,
"learning_rate": 9.898349546886123e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.955009663105011,
"num_tokens": 1876231.0,
"step": 860
},
{
"epoch": 2.3328859060402687,
"grad_norm": 0.2966178059577942,
"learning_rate": 9.894826587637764e-06,
"loss": 0.1367,
"mean_token_accuracy": 0.9548171132802963,
"num_tokens": 1898748.0,
"step": 870
},
{
"epoch": 2.3597315436241613,
"grad_norm": 0.3404034972190857,
"learning_rate": 9.891244263829685e-06,
"loss": 0.1484,
"mean_token_accuracy": 0.9537680149078369,
"num_tokens": 1920372.0,
"step": 880
},
{
"epoch": 2.386577181208054,
"grad_norm": 0.3345947861671448,
"learning_rate": 9.887602618908384e-06,
"loss": 0.2133,
"mean_token_accuracy": 0.9383688002824784,
"num_tokens": 1940538.0,
"step": 890
},
{
"epoch": 2.4134228187919464,
"grad_norm": 0.3276788294315338,
"learning_rate": 9.883901697039809e-06,
"loss": 0.1556,
"mean_token_accuracy": 0.9519165605306625,
"num_tokens": 1961802.0,
"step": 900
},
{
"epoch": 2.440268456375839,
"grad_norm": 0.30133068561553955,
"learning_rate": 9.880141543108816e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9523742258548736,
"num_tokens": 1985264.0,
"step": 910
},
{
"epoch": 2.4671140939597316,
"grad_norm": 0.33560168743133545,
"learning_rate": 9.876322202718633e-06,
"loss": 0.1488,
"mean_token_accuracy": 0.9502911448478699,
"num_tokens": 2007692.0,
"step": 920
},
{
"epoch": 2.493959731543624,
"grad_norm": 0.33598729968070984,
"learning_rate": 9.8724437221903e-06,
"loss": 0.1609,
"mean_token_accuracy": 0.9487929224967957,
"num_tokens": 2029244.0,
"step": 930
},
{
"epoch": 2.5208053691275167,
"grad_norm": 0.3846241533756256,
"learning_rate": 9.868506148562107e-06,
"loss": 0.2107,
"mean_token_accuracy": 0.9374660611152649,
"num_tokens": 2049380.0,
"step": 940
},
{
"epoch": 2.5476510067114093,
"grad_norm": 0.33732545375823975,
"learning_rate": 9.864509529589034e-06,
"loss": 0.1465,
"mean_token_accuracy": 0.9547136723995209,
"num_tokens": 2070666.0,
"step": 950
},
{
"epoch": 2.574496644295302,
"grad_norm": 0.310533732175827,
"learning_rate": 9.860453913742158e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.952840319275856,
"num_tokens": 2094161.0,
"step": 960
},
{
"epoch": 2.6013422818791945,
"grad_norm": 0.2950354218482971,
"learning_rate": 9.856339350208073e-06,
"loss": 0.1539,
"mean_token_accuracy": 0.9479620784521103,
"num_tokens": 2116615.0,
"step": 970
},
{
"epoch": 2.6281879194630875,
"grad_norm": 0.37915998697280884,
"learning_rate": 9.852165888888294e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9523311793804169,
"num_tokens": 2138198.0,
"step": 980
},
{
"epoch": 2.6550335570469796,
"grad_norm": 0.39890754222869873,
"learning_rate": 9.847933580398645e-06,
"loss": 0.2259,
"mean_token_accuracy": 0.9327600687742233,
"num_tokens": 2158392.0,
"step": 990
},
{
"epoch": 2.6818791946308727,
"grad_norm": 0.38095587491989136,
"learning_rate": 9.843642476068654e-06,
"loss": 0.1577,
"mean_token_accuracy": 0.9513375163078308,
"num_tokens": 2179772.0,
"step": 1000
},
{
"epoch": 2.7087248322147652,
"grad_norm": 0.2671376168727875,
"learning_rate": 9.839292627940924e-06,
"loss": 0.1362,
"mean_token_accuracy": 0.9555162519216538,
"num_tokens": 2203441.0,
"step": 1010
},
{
"epoch": 2.735570469798658,
"grad_norm": 0.3001173436641693,
"learning_rate": 9.834884088770504e-06,
"loss": 0.1532,
"mean_token_accuracy": 0.9491879791021347,
"num_tokens": 2226076.0,
"step": 1020
},
{
"epoch": 2.7624161073825504,
"grad_norm": 0.36988887190818787,
"learning_rate": 9.83041691202425e-06,
"loss": 0.1406,
"mean_token_accuracy": 0.9558630377054215,
"num_tokens": 2247831.0,
"step": 1030
},
{
"epoch": 2.789261744966443,
"grad_norm": 0.3370855450630188,
"learning_rate": 9.825891151880176e-06,
"loss": 0.2107,
"mean_token_accuracy": 0.9384757906198502,
"num_tokens": 2268043.0,
"step": 1040
},
{
"epoch": 2.8161073825503355,
"grad_norm": 0.40412017703056335,
"learning_rate": 9.821306863226796e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.95028136074543,
"num_tokens": 2289294.0,
"step": 1050
},
{
"epoch": 2.842953020134228,
"grad_norm": 0.32156631350517273,
"learning_rate": 9.816664101662458e-06,
"loss": 0.1344,
"mean_token_accuracy": 0.9564105361700058,
"num_tokens": 2312830.0,
"step": 1060
},
{
"epoch": 2.8697986577181207,
"grad_norm": 0.3493017554283142,
"learning_rate": 9.811962923494674e-06,
"loss": 0.1551,
"mean_token_accuracy": 0.9494620323181152,
"num_tokens": 2335317.0,
"step": 1070
},
{
"epoch": 2.8966442953020133,
"grad_norm": 0.36799290776252747,
"learning_rate": 9.80720338573943e-06,
"loss": 0.1578,
"mean_token_accuracy": 0.9477977395057678,
"num_tokens": 2356912.0,
"step": 1080
},
{
"epoch": 2.9234899328859063,
"grad_norm": 0.34753313660621643,
"learning_rate": 9.802385546120498e-06,
"loss": 0.2169,
"mean_token_accuracy": 0.9361104846000672,
"num_tokens": 2377088.0,
"step": 1090
},
{
"epoch": 2.9503355704697984,
"grad_norm": 0.3603476881980896,
"learning_rate": 9.797509463068743e-06,
"loss": 0.1571,
"mean_token_accuracy": 0.9502560168504715,
"num_tokens": 2398159.0,
"step": 1100
},
{
"epoch": 2.9771812080536915,
"grad_norm": 0.3038477301597595,
"learning_rate": 9.7925751957214e-06,
"loss": 0.152,
"mean_token_accuracy": 0.949856498837471,
"num_tokens": 2420698.0,
"step": 1110
},
{
"epoch": 3.002684563758389,
"grad_norm": 0.33368000388145447,
"learning_rate": 9.787582803921366e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.9435529865716633,
"num_tokens": 2439628.0,
"step": 1120
},
{
"epoch": 3.029530201342282,
"grad_norm": 0.2882815897464752,
"learning_rate": 9.782532348216475e-06,
"loss": 0.1371,
"mean_token_accuracy": 0.9531569749116897,
"num_tokens": 2463670.0,
"step": 1130
},
{
"epoch": 3.0563758389261744,
"grad_norm": 0.3468344211578369,
"learning_rate": 9.777423889858759e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9515006303787231,
"num_tokens": 2486545.0,
"step": 1140
},
{
"epoch": 3.083221476510067,
"grad_norm": 0.3093087077140808,
"learning_rate": 9.77225749080371e-06,
"loss": 0.1362,
"mean_token_accuracy": 0.9559379696846009,
"num_tokens": 2508549.0,
"step": 1150
},
{
"epoch": 3.1100671140939595,
"grad_norm": 0.4076331853866577,
"learning_rate": 9.767033213709525e-06,
"loss": 0.1911,
"mean_token_accuracy": 0.9427547425031662,
"num_tokens": 2529303.0,
"step": 1160
},
{
"epoch": 3.1369127516778526,
"grad_norm": 0.4216303825378418,
"learning_rate": 9.761751121936342e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.947964558005333,
"num_tokens": 2549099.0,
"step": 1170
},
{
"epoch": 3.163758389261745,
"grad_norm": 0.2814992070198059,
"learning_rate": 9.756411279545486e-06,
"loss": 0.1329,
"mean_token_accuracy": 0.9554579347372055,
"num_tokens": 2573053.0,
"step": 1180
},
{
"epoch": 3.1906040268456377,
"grad_norm": 0.31067585945129395,
"learning_rate": 9.751013751298674e-06,
"loss": 0.1416,
"mean_token_accuracy": 0.951316300034523,
"num_tokens": 2595891.0,
"step": 1190
},
{
"epoch": 3.2174496644295303,
"grad_norm": 0.3256937265396118,
"learning_rate": 9.745558602657244e-06,
"loss": 0.1418,
"mean_token_accuracy": 0.952664366364479,
"num_tokens": 2617778.0,
"step": 1200
},
{
"epoch": 3.244295302013423,
"grad_norm": 0.35490092635154724,
"learning_rate": 9.740045899781353e-06,
"loss": 0.1947,
"mean_token_accuracy": 0.9404710203409195,
"num_tokens": 2638355.0,
"step": 1210
},
{
"epoch": 3.2711409395973154,
"grad_norm": 0.34426459670066833,
"learning_rate": 9.734475709529177e-06,
"loss": 0.1674,
"mean_token_accuracy": 0.9495470136404037,
"num_tokens": 2658221.0,
"step": 1220
},
{
"epoch": 3.297986577181208,
"grad_norm": 0.3160524070262909,
"learning_rate": 9.7288480994561e-06,
"loss": 0.1375,
"mean_token_accuracy": 0.9538306951522827,
"num_tokens": 2682262.0,
"step": 1230
},
{
"epoch": 3.3248322147651006,
"grad_norm": 0.33500564098358154,
"learning_rate": 9.723163137813898e-06,
"loss": 0.1418,
"mean_token_accuracy": 0.9511039316654205,
"num_tokens": 2705103.0,
"step": 1240
},
{
"epoch": 3.351677852348993,
"grad_norm": 0.34578338265419006,
"learning_rate": 9.717420893549902e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9548570722341537,
"num_tokens": 2727045.0,
"step": 1250
},
{
"epoch": 3.3785234899328858,
"grad_norm": 0.33035096526145935,
"learning_rate": 9.711621436306172e-06,
"loss": 0.198,
"mean_token_accuracy": 0.9407137960195542,
"num_tokens": 2747667.0,
"step": 1260
},
{
"epoch": 3.4053691275167783,
"grad_norm": 0.34439653158187866,
"learning_rate": 9.705764836418648e-06,
"loss": 0.1598,
"mean_token_accuracy": 0.9528309881687165,
"num_tokens": 2767458.0,
"step": 1270
},
{
"epoch": 3.432214765100671,
"grad_norm": 0.32619956135749817,
"learning_rate": 9.699851164916296e-06,
"loss": 0.1356,
"mean_token_accuracy": 0.9537905603647232,
"num_tokens": 2791419.0,
"step": 1280
},
{
"epoch": 3.459060402684564,
"grad_norm": 0.33809950947761536,
"learning_rate": 9.69388049352025e-06,
"loss": 0.1447,
"mean_token_accuracy": 0.9516083031892777,
"num_tokens": 2814299.0,
"step": 1290
},
{
"epoch": 3.4859060402684565,
"grad_norm": 0.34236571192741394,
"learning_rate": 9.687852894642932e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9545169979333877,
"num_tokens": 2836250.0,
"step": 1300
},
{
"epoch": 3.512751677852349,
"grad_norm": 0.35245418548583984,
"learning_rate": 9.681768441387195e-06,
"loss": 0.1905,
"mean_token_accuracy": 0.9411760956048966,
"num_tokens": 2856939.0,
"step": 1310
},
{
"epoch": 3.5395973154362417,
"grad_norm": 0.33470526337623596,
"learning_rate": 9.675627207545415e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9509621292352677,
"num_tokens": 2876789.0,
"step": 1320
},
{
"epoch": 3.5664429530201343,
"grad_norm": 0.29923608899116516,
"learning_rate": 9.669429267598603e-06,
"loss": 0.1301,
"mean_token_accuracy": 0.9571765452623368,
"num_tokens": 2900661.0,
"step": 1330
},
{
"epoch": 3.593288590604027,
"grad_norm": 0.30686208605766296,
"learning_rate": 9.663174696715502e-06,
"loss": 0.144,
"mean_token_accuracy": 0.9498992472887039,
"num_tokens": 2923496.0,
"step": 1340
},
{
"epoch": 3.6201342281879194,
"grad_norm": 0.34822651743888855,
"learning_rate": 9.656863570751687e-06,
"loss": 0.1491,
"mean_token_accuracy": 0.9523457109928131,
"num_tokens": 2945425.0,
"step": 1350
},
{
"epoch": 3.646979865771812,
"grad_norm": 0.411933034658432,
"learning_rate": 9.650495966248618e-06,
"loss": 0.2078,
"mean_token_accuracy": 0.9376524448394775,
"num_tokens": 2965952.0,
"step": 1360
},
{
"epoch": 3.6738255033557046,
"grad_norm": 0.33340126276016235,
"learning_rate": 9.644071960432741e-06,
"loss": 0.161,
"mean_token_accuracy": 0.9514278948307038,
"num_tokens": 2985755.0,
"step": 1370
},
{
"epoch": 3.7006711409395976,
"grad_norm": 0.3414279520511627,
"learning_rate": 9.637591631214535e-06,
"loss": 0.1323,
"mean_token_accuracy": 0.9548977851867676,
"num_tokens": 3009591.0,
"step": 1380
},
{
"epoch": 3.7275167785234897,
"grad_norm": 0.3117181658744812,
"learning_rate": 9.631055057187564e-06,
"loss": 0.1443,
"mean_token_accuracy": 0.951587375998497,
"num_tokens": 3032349.0,
"step": 1390
},
{
"epoch": 3.7543624161073827,
"grad_norm": 0.3485643267631531,
"learning_rate": 9.624462317627538e-06,
"loss": 0.1343,
"mean_token_accuracy": 0.9559377074241638,
"num_tokens": 3054236.0,
"step": 1400
},
{
"epoch": 3.7812080536912753,
"grad_norm": 0.40424180030822754,
"learning_rate": 9.61781349249134e-06,
"loss": 0.2071,
"mean_token_accuracy": 0.935996612906456,
"num_tokens": 3075030.0,
"step": 1410
},
{
"epoch": 3.808053691275168,
"grad_norm": 0.36545875668525696,
"learning_rate": 9.611108662416064e-06,
"loss": 0.1588,
"mean_token_accuracy": 0.9502278298139573,
"num_tokens": 3094888.0,
"step": 1420
},
{
"epoch": 3.8348993288590605,
"grad_norm": 0.3279842138290405,
"learning_rate": 9.604347908718026e-06,
"loss": 0.131,
"mean_token_accuracy": 0.9550918698310852,
"num_tokens": 3118692.0,
"step": 1430
},
{
"epoch": 3.861744966442953,
"grad_norm": 0.310715913772583,
"learning_rate": 9.59753131339179e-06,
"loss": 0.146,
"mean_token_accuracy": 0.9505369186401367,
"num_tokens": 3141385.0,
"step": 1440
},
{
"epoch": 3.8885906040268456,
"grad_norm": 0.3354150652885437,
"learning_rate": 9.590658959109168e-06,
"loss": 0.1311,
"mean_token_accuracy": 0.9580255270004272,
"num_tokens": 3163253.0,
"step": 1450
},
{
"epoch": 3.915436241610738,
"grad_norm": 0.3825208842754364,
"learning_rate": 9.583730929218218e-06,
"loss": 0.2052,
"mean_token_accuracy": 0.9373064041137695,
"num_tokens": 3183796.0,
"step": 1460
},
{
"epoch": 3.942281879194631,
"grad_norm": 0.36658021807670593,
"learning_rate": 9.576747307742231e-06,
"loss": 0.1639,
"mean_token_accuracy": 0.9496467560529709,
"num_tokens": 3203577.0,
"step": 1470
},
{
"epoch": 3.9691275167785234,
"grad_norm": 0.31084001064300537,
"learning_rate": 9.569708179378716e-06,
"loss": 0.1406,
"mean_token_accuracy": 0.9523321092128754,
"num_tokens": 3226669.0,
"step": 1480
},
{
"epoch": 3.995973154362416,
"grad_norm": 0.36023128032684326,
"learning_rate": 9.562613629498367e-06,
"loss": 0.1767,
"mean_token_accuracy": 0.9448588013648986,
"num_tokens": 3247335.0,
"step": 1490
},
{
"epoch": 4.021476510067114,
"grad_norm": 0.3163889944553375,
"learning_rate": 9.555463744144037e-06,
"loss": 0.1318,
"mean_token_accuracy": 0.9579417329085501,
"num_tokens": 3268914.0,
"step": 1500
},
{
"epoch": 4.048322147651007,
"grad_norm": 0.3244967758655548,
"learning_rate": 9.548258610029684e-06,
"loss": 0.1397,
"mean_token_accuracy": 0.9520616948604583,
"num_tokens": 3292003.0,
"step": 1510
},
{
"epoch": 4.0751677852349,
"grad_norm": 0.3769804835319519,
"learning_rate": 9.540998314539327e-06,
"loss": 0.1414,
"mean_token_accuracy": 0.9535090059041977,
"num_tokens": 3314200.0,
"step": 1520
},
{
"epoch": 4.102013422818792,
"grad_norm": 0.4516792595386505,
"learning_rate": 9.533682945725984e-06,
"loss": 0.1624,
"mean_token_accuracy": 0.9462276846170425,
"num_tokens": 3335382.0,
"step": 1530
},
{
"epoch": 4.128859060402685,
"grad_norm": 0.37825873494148254,
"learning_rate": 9.526312592310597e-06,
"loss": 0.1781,
"mean_token_accuracy": 0.9461793184280396,
"num_tokens": 3354942.0,
"step": 1540
},
{
"epoch": 4.155704697986577,
"grad_norm": 0.28725379705429077,
"learning_rate": 9.518887343680971e-06,
"loss": 0.1305,
"mean_token_accuracy": 0.9573172956705094,
"num_tokens": 3378126.0,
"step": 1550
},
{
"epoch": 4.18255033557047,
"grad_norm": 0.3456558585166931,
"learning_rate": 9.511407289890678e-06,
"loss": 0.1397,
"mean_token_accuracy": 0.9526260673999787,
"num_tokens": 3401323.0,
"step": 1560
},
{
"epoch": 4.209395973154362,
"grad_norm": 0.3543192148208618,
"learning_rate": 9.503872521657964e-06,
"loss": 0.1453,
"mean_token_accuracy": 0.9511166512966156,
"num_tokens": 3423565.0,
"step": 1570
},
{
"epoch": 4.236241610738255,
"grad_norm": 0.45514950156211853,
"learning_rate": 9.496283130364658e-06,
"loss": 0.17,
"mean_token_accuracy": 0.9460733950138092,
"num_tokens": 3444827.0,
"step": 1580
},
{
"epoch": 4.263087248322147,
"grad_norm": 0.3951168358325958,
"learning_rate": 9.488639208055059e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9429652452468872,
"num_tokens": 3464503.0,
"step": 1590
},
{
"epoch": 4.28993288590604,
"grad_norm": 0.31996405124664307,
"learning_rate": 9.480940847434814e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.9602243095636368,
"num_tokens": 3487664.0,
"step": 1600
},
{
"epoch": 4.3167785234899325,
"grad_norm": 0.36431530117988586,
"learning_rate": 9.473188141869804e-06,
"loss": 0.1465,
"mean_token_accuracy": 0.949748307466507,
"num_tokens": 3510658.0,
"step": 1610
},
{
"epoch": 4.3436241610738255,
"grad_norm": 0.3072509765625,
"learning_rate": 9.465381185385008e-06,
"loss": 0.1297,
"mean_token_accuracy": 0.9579643219709396,
"num_tokens": 3532695.0,
"step": 1620
},
{
"epoch": 4.370469798657718,
"grad_norm": 0.46137064695358276,
"learning_rate": 9.457520072663353e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9446452677249908,
"num_tokens": 3553814.0,
"step": 1630
},
{
"epoch": 4.397315436241611,
"grad_norm": 0.39676016569137573,
"learning_rate": 9.449604899044583e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.9439645022153854,
"num_tokens": 3573399.0,
"step": 1640
},
{
"epoch": 4.424161073825504,
"grad_norm": 0.3147033751010895,
"learning_rate": 9.441635760524087e-06,
"loss": 0.13,
"mean_token_accuracy": 0.9564067393541336,
"num_tokens": 3596524.0,
"step": 1650
},
{
"epoch": 4.451006711409396,
"grad_norm": 0.32451656460762024,
"learning_rate": 9.433612753751748e-06,
"loss": 0.1347,
"mean_token_accuracy": 0.9525512009859085,
"num_tokens": 3619695.0,
"step": 1660
},
{
"epoch": 4.477852348993289,
"grad_norm": 0.34751221537590027,
"learning_rate": 9.425535976030758e-06,
"loss": 0.1346,
"mean_token_accuracy": 0.9561400681734085,
"num_tokens": 3641796.0,
"step": 1670
},
{
"epoch": 4.504697986577181,
"grad_norm": 0.476945161819458,
"learning_rate": 9.417405525316448e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9473035573959351,
"num_tokens": 3662816.0,
"step": 1680
},
{
"epoch": 4.531543624161074,
"grad_norm": 0.3843275308609009,
"learning_rate": 9.409221500215096e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9466674089431762,
"num_tokens": 3682421.0,
"step": 1690
},
{
"epoch": 4.558389261744966,
"grad_norm": 0.2982370853424072,
"learning_rate": 9.400983999982729e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9580628424882889,
"num_tokens": 3705605.0,
"step": 1700
},
{
"epoch": 4.585234899328859,
"grad_norm": 0.36844635009765625,
"learning_rate": 9.392693124523925e-06,
"loss": 0.1346,
"mean_token_accuracy": 0.9527536749839782,
"num_tokens": 3728772.0,
"step": 1710
},
{
"epoch": 4.612080536912751,
"grad_norm": 0.3254833519458771,
"learning_rate": 9.38434897439059e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9538274705410004,
"num_tokens": 3750862.0,
"step": 1720
},
{
"epoch": 4.638926174496644,
"grad_norm": 0.40839093923568726,
"learning_rate": 9.375951650780759e-06,
"loss": 0.1698,
"mean_token_accuracy": 0.946934774518013,
"num_tokens": 3772020.0,
"step": 1730
},
{
"epoch": 4.665771812080537,
"grad_norm": 0.4056667685508728,
"learning_rate": 9.367501255537347e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.9489806234836579,
"num_tokens": 3791528.0,
"step": 1740
},
{
"epoch": 4.6926174496644295,
"grad_norm": 0.30787405371665955,
"learning_rate": 9.358997891146924e-06,
"loss": 0.1335,
"mean_token_accuracy": 0.9545555591583252,
"num_tokens": 3814594.0,
"step": 1750
},
{
"epoch": 4.7194630872483225,
"grad_norm": 0.33151739835739136,
"learning_rate": 9.350441660738472e-06,
"loss": 0.1354,
"mean_token_accuracy": 0.9516535818576812,
"num_tokens": 3837666.0,
"step": 1760
},
{
"epoch": 4.746308724832215,
"grad_norm": 0.35133394598960876,
"learning_rate": 9.341832668082136e-06,
"loss": 0.1361,
"mean_token_accuracy": 0.9538627177476883,
"num_tokens": 3859873.0,
"step": 1770
},
{
"epoch": 4.773154362416108,
"grad_norm": 0.4053596556186676,
"learning_rate": 9.333171017587956e-06,
"loss": 0.1711,
"mean_token_accuracy": 0.94505635201931,
"num_tokens": 3880988.0,
"step": 1780
},
{
"epoch": 4.8,
"grad_norm": 0.40388065576553345,
"learning_rate": 9.324456814304614e-06,
"loss": 0.187,
"mean_token_accuracy": 0.9447333663702011,
"num_tokens": 3900559.0,
"step": 1790
},
{
"epoch": 4.826845637583893,
"grad_norm": 0.3266175389289856,
"learning_rate": 9.315690163918147e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9574593305587769,
"num_tokens": 3923718.0,
"step": 1800
},
{
"epoch": 4.853691275167785,
"grad_norm": 0.2933380603790283,
"learning_rate": 9.30687117275068e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9510160237550735,
"num_tokens": 3946770.0,
"step": 1810
},
{
"epoch": 4.880536912751678,
"grad_norm": 0.36102089285850525,
"learning_rate": 9.29799994775912e-06,
"loss": 0.1404,
"mean_token_accuracy": 0.9519519448280335,
"num_tokens": 3968801.0,
"step": 1820
},
{
"epoch": 4.90738255033557,
"grad_norm": 0.41366609930992126,
"learning_rate": 9.289076596533873e-06,
"loss": 0.1868,
"mean_token_accuracy": 0.9425925433635711,
"num_tokens": 3989599.0,
"step": 1830
},
{
"epoch": 4.934228187919463,
"grad_norm": 0.391011118888855,
"learning_rate": 9.280101227297526e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9462247163057327,
"num_tokens": 4009110.0,
"step": 1840
},
{
"epoch": 4.961073825503355,
"grad_norm": 0.3075045645236969,
"learning_rate": 9.271073948903548e-06,
"loss": 0.1345,
"mean_token_accuracy": 0.9573867440223693,
"num_tokens": 4031827.0,
"step": 1850
},
{
"epoch": 4.987919463087248,
"grad_norm": 0.3926655054092407,
"learning_rate": 9.26199487083496e-06,
"loss": 0.1438,
"mean_token_accuracy": 0.9512310355901719,
"num_tokens": 4053816.0,
"step": 1860
},
{
"epoch": 5.0134228187919465,
"grad_norm": 0.31910833716392517,
"learning_rate": 9.252864103203015e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9562591439799258,
"num_tokens": 4074082.0,
"step": 1870
},
{
"epoch": 5.040268456375839,
"grad_norm": 0.3303743898868561,
"learning_rate": 9.243681756745851e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9561623185873032,
"num_tokens": 4097543.0,
"step": 1880
},
{
"epoch": 5.067114093959732,
"grad_norm": 0.3294975459575653,
"learning_rate": 9.23444794282716e-06,
"loss": 0.1429,
"mean_token_accuracy": 0.9520007222890854,
"num_tokens": 4119984.0,
"step": 1890
},
{
"epoch": 5.093959731543624,
"grad_norm": 0.43249550461769104,
"learning_rate": 9.225162773434831e-06,
"loss": 0.1441,
"mean_token_accuracy": 0.9515981763601303,
"num_tokens": 4141494.0,
"step": 1900
},
{
"epoch": 5.120805369127517,
"grad_norm": 0.3819236159324646,
"learning_rate": 9.215826361179596e-06,
"loss": 0.1759,
"mean_token_accuracy": 0.945612245798111,
"num_tokens": 4161327.0,
"step": 1910
},
{
"epoch": 5.14765100671141,
"grad_norm": 0.3246872127056122,
"learning_rate": 9.206438819293654e-06,
"loss": 0.1335,
"mean_token_accuracy": 0.9576434195041656,
"num_tokens": 4182978.0,
"step": 1920
},
{
"epoch": 5.174496644295302,
"grad_norm": 0.33821627497673035,
"learning_rate": 9.197000261629314e-06,
"loss": 0.1318,
"mean_token_accuracy": 0.9542952626943588,
"num_tokens": 4206394.0,
"step": 1930
},
{
"epoch": 5.201342281879195,
"grad_norm": 0.3124513328075409,
"learning_rate": 9.187510802657601e-06,
"loss": 0.1353,
"mean_token_accuracy": 0.9533496767282486,
"num_tokens": 4228836.0,
"step": 1940
},
{
"epoch": 5.228187919463087,
"grad_norm": 0.44123515486717224,
"learning_rate": 9.177970557466873e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9535221606492996,
"num_tokens": 4250312.0,
"step": 1950
},
{
"epoch": 5.25503355704698,
"grad_norm": 0.43715864419937134,
"learning_rate": 9.168379641761425e-06,
"loss": 0.1894,
"mean_token_accuracy": 0.9410462647676467,
"num_tokens": 4270261.0,
"step": 1960
},
{
"epoch": 5.281879194630872,
"grad_norm": 0.3679194748401642,
"learning_rate": 9.158738171860081e-06,
"loss": 0.1346,
"mean_token_accuracy": 0.9566879123449326,
"num_tokens": 4291980.0,
"step": 1970
},
{
"epoch": 5.308724832214765,
"grad_norm": 0.3097969889640808,
"learning_rate": 9.149046264694795e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9558116465806961,
"num_tokens": 4315357.0,
"step": 1980
},
{
"epoch": 5.3355704697986575,
"grad_norm": 0.3311108350753784,
"learning_rate": 9.139304037809216e-06,
"loss": 0.1361,
"mean_token_accuracy": 0.953630456328392,
"num_tokens": 4337801.0,
"step": 1990
},
{
"epoch": 5.3624161073825505,
"grad_norm": 0.3906519114971161,
"learning_rate": 9.12951160935728e-06,
"loss": 0.1506,
"mean_token_accuracy": 0.9501311779022217,
"num_tokens": 4359347.0,
"step": 2000
},
{
"epoch": 5.389261744966443,
"grad_norm": 0.44028791785240173,
"learning_rate": 9.119669098101764e-06,
"loss": 0.1911,
"mean_token_accuracy": 0.9415321052074432,
"num_tokens": 4379414.0,
"step": 2010
},
{
"epoch": 5.416107382550336,
"grad_norm": 0.3951742649078369,
"learning_rate": 9.10977662341285e-06,
"loss": 0.1363,
"mean_token_accuracy": 0.9573418200016022,
"num_tokens": 4401148.0,
"step": 2020
},
{
"epoch": 5.442953020134228,
"grad_norm": 0.3792262077331543,
"learning_rate": 9.099834305266681e-06,
"loss": 0.1286,
"mean_token_accuracy": 0.9557218492031098,
"num_tokens": 4424751.0,
"step": 2030
},
{
"epoch": 5.469798657718121,
"grad_norm": 0.39373981952667236,
"learning_rate": 9.0898422642439e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9514533162117005,
"num_tokens": 4447202.0,
"step": 2040
},
{
"epoch": 5.496644295302014,
"grad_norm": 0.38414108753204346,
"learning_rate": 9.07980062152819e-06,
"loss": 0.1523,
"mean_token_accuracy": 0.9517021149396896,
"num_tokens": 4468742.0,
"step": 2050
},
{
"epoch": 5.523489932885906,
"grad_norm": 0.42570510506629944,
"learning_rate": 9.069709498904803e-06,
"loss": 0.1928,
"mean_token_accuracy": 0.9422121673822403,
"num_tokens": 4488651.0,
"step": 2060
},
{
"epoch": 5.550335570469799,
"grad_norm": 0.3122999370098114,
"learning_rate": 9.059569018759092e-06,
"loss": 0.1347,
"mean_token_accuracy": 0.9576286196708679,
"num_tokens": 4510368.0,
"step": 2070
},
{
"epoch": 5.577181208053691,
"grad_norm": 0.342123419046402,
"learning_rate": 9.049379304075009e-06,
"loss": 0.1243,
"mean_token_accuracy": 0.9563952952623367,
"num_tokens": 4533853.0,
"step": 2080
},
{
"epoch": 5.604026845637584,
"grad_norm": 0.3129931390285492,
"learning_rate": 9.039140478433625e-06,
"loss": 0.1435,
"mean_token_accuracy": 0.9513378292322159,
"num_tokens": 4556327.0,
"step": 2090
},
{
"epoch": 5.630872483221476,
"grad_norm": 0.4187992215156555,
"learning_rate": 9.028852666011638e-06,
"loss": 0.145,
"mean_token_accuracy": 0.9517636507749557,
"num_tokens": 4577971.0,
"step": 2100
},
{
"epoch": 5.657718120805369,
"grad_norm": 0.422378808259964,
"learning_rate": 9.018515991579851e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9434764713048935,
"num_tokens": 4598122.0,
"step": 2110
},
{
"epoch": 5.684563758389261,
"grad_norm": 0.36106374859809875,
"learning_rate": 9.008130580501669e-06,
"loss": 0.143,
"mean_token_accuracy": 0.9556879609823227,
"num_tokens": 4619856.0,
"step": 2120
},
{
"epoch": 5.7114093959731544,
"grad_norm": 0.38176390528678894,
"learning_rate": 8.997696558731575e-06,
"loss": 0.1264,
"mean_token_accuracy": 0.9565722495317459,
"num_tokens": 4643297.0,
"step": 2130
},
{
"epoch": 5.7382550335570475,
"grad_norm": 0.41863518953323364,
"learning_rate": 8.987214052813605e-06,
"loss": 0.1361,
"mean_token_accuracy": 0.9535513520240784,
"num_tokens": 4665658.0,
"step": 2140
},
{
"epoch": 5.76510067114094,
"grad_norm": 0.4007248282432556,
"learning_rate": 8.976683189879811e-06,
"loss": 0.1335,
"mean_token_accuracy": 0.9553972989320755,
"num_tokens": 4687201.0,
"step": 2150
},
{
"epoch": 5.791946308724832,
"grad_norm": 0.4537723958492279,
"learning_rate": 8.966104097648721e-06,
"loss": 0.1943,
"mean_token_accuracy": 0.9408634662628174,
"num_tokens": 4707358.0,
"step": 2160
},
{
"epoch": 5.818791946308725,
"grad_norm": 0.3353871703147888,
"learning_rate": 8.955476904423785e-06,
"loss": 0.1388,
"mean_token_accuracy": 0.9545477360486985,
"num_tokens": 4729147.0,
"step": 2170
},
{
"epoch": 5.845637583892618,
"grad_norm": 0.42482447624206543,
"learning_rate": 8.944801739091831e-06,
"loss": 0.1331,
"mean_token_accuracy": 0.9544930905103683,
"num_tokens": 4752510.0,
"step": 2180
},
{
"epoch": 5.87248322147651,
"grad_norm": 0.3644810616970062,
"learning_rate": 8.934078731121482e-06,
"loss": 0.1315,
"mean_token_accuracy": 0.9561907082796097,
"num_tokens": 4774905.0,
"step": 2190
},
{
"epoch": 5.899328859060403,
"grad_norm": 0.48736846446990967,
"learning_rate": 8.923308010561608e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.9533969014883041,
"num_tokens": 4796405.0,
"step": 2200
},
{
"epoch": 5.926174496644295,
"grad_norm": 0.429516464471817,
"learning_rate": 8.912489708039734e-06,
"loss": 0.1918,
"mean_token_accuracy": 0.9409243553876877,
"num_tokens": 4816438.0,
"step": 2210
},
{
"epoch": 5.953020134228188,
"grad_norm": 0.3750719130039215,
"learning_rate": 8.90162395476046e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9537917494773864,
"num_tokens": 4838019.0,
"step": 2220
},
{
"epoch": 5.97986577181208,
"grad_norm": 0.431819349527359,
"learning_rate": 8.89071088250387e-06,
"loss": 0.1419,
"mean_token_accuracy": 0.9530791282653809,
"num_tokens": 4860114.0,
"step": 2230
},
{
"epoch": 6.005369127516778,
"grad_norm": 0.35386669635772705,
"learning_rate": 8.879750623623932e-06,
"loss": 0.1549,
"mean_token_accuracy": 0.9517395119918021,
"num_tokens": 4879178.0,
"step": 2240
},
{
"epoch": 6.0322147651006714,
"grad_norm": 0.3473912179470062,
"learning_rate": 8.8687433110469e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.9572397619485855,
"num_tokens": 4903000.0,
"step": 2250
},
{
"epoch": 6.059060402684564,
"grad_norm": 0.3461743891239166,
"learning_rate": 8.857689078269688e-06,
"loss": 0.1367,
"mean_token_accuracy": 0.9519617527723312,
"num_tokens": 4925763.0,
"step": 2260
},
{
"epoch": 6.085906040268457,
"grad_norm": 0.38205376267433167,
"learning_rate": 8.846588059358265e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.9563361287117005,
"num_tokens": 4947525.0,
"step": 2270
},
{
"epoch": 6.112751677852349,
"grad_norm": 0.410349041223526,
"learning_rate": 8.835440388946025e-06,
"loss": 0.183,
"mean_token_accuracy": 0.9442010521888733,
"num_tokens": 4967890.0,
"step": 2280
},
{
"epoch": 6.139597315436242,
"grad_norm": 0.4093804657459259,
"learning_rate": 8.824246202232142e-06,
"loss": 0.1428,
"mean_token_accuracy": 0.9542681604623795,
"num_tokens": 4988239.0,
"step": 2290
},
{
"epoch": 6.166442953020134,
"grad_norm": 0.39240196347236633,
"learning_rate": 8.813005634979954e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9575669139623642,
"num_tokens": 5012161.0,
"step": 2300
},
{
"epoch": 6.193288590604027,
"grad_norm": 0.3647288680076599,
"learning_rate": 8.801718823515293e-06,
"loss": 0.1303,
"mean_token_accuracy": 0.955005195736885,
"num_tokens": 5034857.0,
"step": 2310
},
{
"epoch": 6.220134228187919,
"grad_norm": 0.41007348895072937,
"learning_rate": 8.790385904724848e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.9576222687959671,
"num_tokens": 5056763.0,
"step": 2320
},
{
"epoch": 6.246979865771812,
"grad_norm": 0.45653489232063293,
"learning_rate": 8.779007016054496e-06,
"loss": 0.1887,
"mean_token_accuracy": 0.9412017434835434,
"num_tokens": 5077221.0,
"step": 2330
},
{
"epoch": 6.273825503355705,
"grad_norm": 0.39891692996025085,
"learning_rate": 8.767582295507637e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9543320029973984,
"num_tokens": 5097569.0,
"step": 2340
},
{
"epoch": 6.300671140939597,
"grad_norm": 0.3695172071456909,
"learning_rate": 8.75611188164352e-06,
"loss": 0.1224,
"mean_token_accuracy": 0.9566197484731674,
"num_tokens": 5121371.0,
"step": 2350
},
{
"epoch": 6.32751677852349,
"grad_norm": 0.39350956678390503,
"learning_rate": 8.744595913575572e-06,
"loss": 0.1374,
"mean_token_accuracy": 0.9534388244152069,
"num_tokens": 5144160.0,
"step": 2360
},
{
"epoch": 6.354362416107382,
"grad_norm": 0.42946743965148926,
"learning_rate": 8.733034530969688e-06,
"loss": 0.1352,
"mean_token_accuracy": 0.9549015939235688,
"num_tokens": 5166042.0,
"step": 2370
},
{
"epoch": 6.381208053691275,
"grad_norm": 0.5302273035049438,
"learning_rate": 8.721427874042563e-06,
"loss": 0.1971,
"mean_token_accuracy": 0.9400767356157302,
"num_tokens": 5186480.0,
"step": 2380
},
{
"epoch": 6.4080536912751676,
"grad_norm": 0.4178345799446106,
"learning_rate": 8.709776083559978e-06,
"loss": 0.1486,
"mean_token_accuracy": 0.9536987513303756,
"num_tokens": 5206790.0,
"step": 2390
},
{
"epoch": 6.434899328859061,
"grad_norm": 0.408120334148407,
"learning_rate": 8.698079300835088e-06,
"loss": 0.1213,
"mean_token_accuracy": 0.9586103349924088,
"num_tokens": 5230497.0,
"step": 2400
},
{
"epoch": 6.461744966442953,
"grad_norm": 0.34466466307640076,
"learning_rate": 8.686337667726723e-06,
"loss": 0.1321,
"mean_token_accuracy": 0.953861802816391,
"num_tokens": 5253213.0,
"step": 2410
},
{
"epoch": 6.488590604026846,
"grad_norm": 0.43265581130981445,
"learning_rate": 8.674551326637655e-06,
"loss": 0.125,
"mean_token_accuracy": 0.958708542585373,
"num_tokens": 5274989.0,
"step": 2420
},
{
"epoch": 6.515436241610738,
"grad_norm": 0.5245379209518433,
"learning_rate": 8.662720420512877e-06,
"loss": 0.1889,
"mean_token_accuracy": 0.9418635576963424,
"num_tokens": 5295319.0,
"step": 2430
},
{
"epoch": 6.542281879194631,
"grad_norm": 0.3666956126689911,
"learning_rate": 8.650845092837867e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9555400878190994,
"num_tokens": 5315538.0,
"step": 2440
},
{
"epoch": 6.569127516778524,
"grad_norm": 0.4082365930080414,
"learning_rate": 8.638925487636847e-06,
"loss": 0.1319,
"mean_token_accuracy": 0.9544797509908676,
"num_tokens": 5339222.0,
"step": 2450
},
{
"epoch": 6.595973154362416,
"grad_norm": 0.3805497884750366,
"learning_rate": 8.626961749471044e-06,
"loss": 0.1421,
"mean_token_accuracy": 0.9522867351770401,
"num_tokens": 5361763.0,
"step": 2460
},
{
"epoch": 6.622818791946309,
"grad_norm": 0.3796834945678711,
"learning_rate": 8.61495402343692e-06,
"loss": 0.1271,
"mean_token_accuracy": 0.9572306245565414,
"num_tokens": 5383570.0,
"step": 2470
},
{
"epoch": 6.649664429530201,
"grad_norm": 0.5255109667778015,
"learning_rate": 8.602902455164432e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.9410823851823806,
"num_tokens": 5404105.0,
"step": 2480
},
{
"epoch": 6.676510067114094,
"grad_norm": 0.39988207817077637,
"learning_rate": 8.590807190815254e-06,
"loss": 0.1472,
"mean_token_accuracy": 0.9552285671234131,
"num_tokens": 5424459.0,
"step": 2490
},
{
"epoch": 6.703355704697986,
"grad_norm": 0.37969970703125,
"learning_rate": 8.578668377081001e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9572932302951813,
"num_tokens": 5448334.0,
"step": 2500
},
{
"epoch": 6.730201342281879,
"grad_norm": 0.36511608958244324,
"learning_rate": 8.56648616118147e-06,
"loss": 0.1309,
"mean_token_accuracy": 0.9542735308408737,
"num_tokens": 5471058.0,
"step": 2510
},
{
"epoch": 6.7570469798657715,
"grad_norm": 0.48128047585487366,
"learning_rate": 8.554260690862824e-06,
"loss": 0.134,
"mean_token_accuracy": 0.9557673066854477,
"num_tokens": 5492789.0,
"step": 2520
},
{
"epoch": 6.7838926174496645,
"grad_norm": 0.46509677171707153,
"learning_rate": 8.541992114395825e-06,
"loss": 0.1749,
"mean_token_accuracy": 0.9453763455152512,
"num_tokens": 5513232.0,
"step": 2530
},
{
"epoch": 6.810738255033557,
"grad_norm": 0.3718424439430237,
"learning_rate": 8.529680580574028e-06,
"loss": 0.1456,
"mean_token_accuracy": 0.9548163831233978,
"num_tokens": 5533489.0,
"step": 2540
},
{
"epoch": 6.83758389261745,
"grad_norm": 0.40027740597724915,
"learning_rate": 8.517326238711976e-06,
"loss": 0.1222,
"mean_token_accuracy": 0.9584755569696426,
"num_tokens": 5557178.0,
"step": 2550
},
{
"epoch": 6.864429530201342,
"grad_norm": 0.4006039798259735,
"learning_rate": 8.504929238643381e-06,
"loss": 0.1362,
"mean_token_accuracy": 0.9527265220880509,
"num_tokens": 5579955.0,
"step": 2560
},
{
"epoch": 6.891275167785235,
"grad_norm": 0.371866375207901,
"learning_rate": 8.492489730719325e-06,
"loss": 0.1299,
"mean_token_accuracy": 0.9573649376630783,
"num_tokens": 5601887.0,
"step": 2570
},
{
"epoch": 6.918120805369128,
"grad_norm": 0.5413601398468018,
"learning_rate": 8.48000786580642e-06,
"loss": 0.1712,
"mean_token_accuracy": 0.944629642367363,
"num_tokens": 5622673.0,
"step": 2580
},
{
"epoch": 6.94496644295302,
"grad_norm": 0.3805997669696808,
"learning_rate": 8.467483795284987e-06,
"loss": 0.1388,
"mean_token_accuracy": 0.9575351625680923,
"num_tokens": 5643003.0,
"step": 2590
},
{
"epoch": 6.971812080536913,
"grad_norm": 0.3734448552131653,
"learning_rate": 8.454917671047213e-06,
"loss": 0.1271,
"mean_token_accuracy": 0.9564762502908707,
"num_tokens": 5665927.0,
"step": 2600
},
{
"epoch": 6.998657718120805,
"grad_norm": 0.5456552505493164,
"learning_rate": 8.442309645495322e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9499319314956665,
"num_tokens": 5686349.0,
"step": 2610
},
{
"epoch": 7.024161073825503,
"grad_norm": 0.40639278292655945,
"learning_rate": 8.429659871539709e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9616394952723855,
"num_tokens": 5708418.0,
"step": 2620
},
{
"epoch": 7.051006711409396,
"grad_norm": 0.44582584500312805,
"learning_rate": 8.416968502597101e-06,
"loss": 0.1311,
"mean_token_accuracy": 0.9550166130065918,
"num_tokens": 5731562.0,
"step": 2630
},
{
"epoch": 7.0778523489932885,
"grad_norm": 0.42072245478630066,
"learning_rate": 8.404235692588682e-06,
"loss": 0.1242,
"mean_token_accuracy": 0.9576824128627777,
"num_tokens": 5753681.0,
"step": 2640
},
{
"epoch": 7.1046979865771815,
"grad_norm": 0.5467314720153809,
"learning_rate": 8.391461595938245e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9464216500520706,
"num_tokens": 5774718.0,
"step": 2650
},
{
"epoch": 7.131543624161074,
"grad_norm": 0.45191726088523865,
"learning_rate": 8.378646367570302e-06,
"loss": 0.1661,
"mean_token_accuracy": 0.9496298342943191,
"num_tokens": 5794201.0,
"step": 2660
},
{
"epoch": 7.158389261744967,
"grad_norm": 0.4229516088962555,
"learning_rate": 8.36579016290821e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9608386904001236,
"num_tokens": 5817768.0,
"step": 2670
},
{
"epoch": 7.185234899328859,
"grad_norm": 0.42818018794059753,
"learning_rate": 8.352893137872292e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.9569632887840271,
"num_tokens": 5840800.0,
"step": 2680
},
{
"epoch": 7.212080536912752,
"grad_norm": 0.3931344151496887,
"learning_rate": 8.339955448877934e-06,
"loss": 0.1284,
"mean_token_accuracy": 0.9573963195085525,
"num_tokens": 5862870.0,
"step": 2690
},
{
"epoch": 7.238926174496644,
"grad_norm": 0.5881303548812866,
"learning_rate": 8.326977252833704e-06,
"loss": 0.1682,
"mean_token_accuracy": 0.9461307436227798,
"num_tokens": 5883809.0,
"step": 2700
},
{
"epoch": 7.265771812080537,
"grad_norm": 0.44616442918777466,
"learning_rate": 8.313958707139434e-06,
"loss": 0.1528,
"mean_token_accuracy": 0.9527498662471772,
"num_tokens": 5903277.0,
"step": 2710
},
{
"epoch": 7.292617449664429,
"grad_norm": 0.39954400062561035,
"learning_rate": 8.300899969684322e-06,
"loss": 0.1201,
"mean_token_accuracy": 0.9591354012489319,
"num_tokens": 5926773.0,
"step": 2720
},
{
"epoch": 7.319463087248322,
"grad_norm": 0.4336886405944824,
"learning_rate": 8.28780119884501e-06,
"loss": 0.1326,
"mean_token_accuracy": 0.9533321857452393,
"num_tokens": 5949796.0,
"step": 2730
},
{
"epoch": 7.346308724832214,
"grad_norm": 0.45649003982543945,
"learning_rate": 8.274662553483662e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9590807974338531,
"num_tokens": 5971946.0,
"step": 2740
},
{
"epoch": 7.373154362416107,
"grad_norm": 0.5981083512306213,
"learning_rate": 8.26148419294605e-06,
"loss": 0.1539,
"mean_token_accuracy": 0.9494126617908478,
"num_tokens": 5993002.0,
"step": 2750
},
{
"epoch": 7.4,
"grad_norm": 0.5191400051116943,
"learning_rate": 8.248266277059607e-06,
"loss": 0.1636,
"mean_token_accuracy": 0.9508023709058762,
"num_tokens": 6012457.0,
"step": 2760
},
{
"epoch": 7.4268456375838925,
"grad_norm": 0.45730146765708923,
"learning_rate": 8.235008966131492e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.9582333266735077,
"num_tokens": 6035995.0,
"step": 2770
},
{
"epoch": 7.4536912751677855,
"grad_norm": 0.4261474907398224,
"learning_rate": 8.221712420946651e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9531764149665832,
"num_tokens": 6058903.0,
"step": 2780
},
{
"epoch": 7.480536912751678,
"grad_norm": 0.458927720785141,
"learning_rate": 8.208376802765866e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9571897268295289,
"num_tokens": 6080945.0,
"step": 2790
},
{
"epoch": 7.507382550335571,
"grad_norm": 0.6553865671157837,
"learning_rate": 8.195002273323792e-06,
"loss": 0.1593,
"mean_token_accuracy": 0.9487423598766327,
"num_tokens": 6101890.0,
"step": 2800
},
{
"epoch": 7.534228187919463,
"grad_norm": 0.4802383780479431,
"learning_rate": 8.181588994827005e-06,
"loss": 0.1583,
"mean_token_accuracy": 0.9515509098768234,
"num_tokens": 6121346.0,
"step": 2810
},
{
"epoch": 7.561073825503356,
"grad_norm": 0.4876711666584015,
"learning_rate": 8.168137129952027e-06,
"loss": 0.1225,
"mean_token_accuracy": 0.9582916587591171,
"num_tokens": 6144990.0,
"step": 2820
},
{
"epoch": 7.587919463087248,
"grad_norm": 0.45218655467033386,
"learning_rate": 8.154646841843358e-06,
"loss": 0.1328,
"mean_token_accuracy": 0.9530595809221267,
"num_tokens": 6168024.0,
"step": 2830
},
{
"epoch": 7.614765100671141,
"grad_norm": 0.463436484336853,
"learning_rate": 8.141118294111496e-06,
"loss": 0.1327,
"mean_token_accuracy": 0.9549932539463043,
"num_tokens": 6190149.0,
"step": 2840
},
{
"epoch": 7.641610738255034,
"grad_norm": 0.6307859420776367,
"learning_rate": 8.127551650830954e-06,
"loss": 0.1635,
"mean_token_accuracy": 0.9493362814188003,
"num_tokens": 6211195.0,
"step": 2850
},
{
"epoch": 7.668456375838926,
"grad_norm": 0.4908117949962616,
"learning_rate": 8.113947076538264e-06,
"loss": 0.1687,
"mean_token_accuracy": 0.9480661511421203,
"num_tokens": 6230721.0,
"step": 2860
},
{
"epoch": 7.695302013422819,
"grad_norm": 0.3950389623641968,
"learning_rate": 8.100304736229991e-06,
"loss": 0.1197,
"mean_token_accuracy": 0.9598756283521652,
"num_tokens": 6254447.0,
"step": 2870
},
{
"epoch": 7.722147651006711,
"grad_norm": 0.4120844006538391,
"learning_rate": 8.086624795360723e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9543819516897202,
"num_tokens": 6277445.0,
"step": 2880
},
{
"epoch": 7.748993288590604,
"grad_norm": 0.3652952313423157,
"learning_rate": 8.07290741984107e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9587823182344437,
"num_tokens": 6299402.0,
"step": 2890
},
{
"epoch": 7.7758389261744965,
"grad_norm": 0.5701743960380554,
"learning_rate": 8.059152776035653e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.9458971083164215,
"num_tokens": 6320105.0,
"step": 2900
},
{
"epoch": 7.8026845637583895,
"grad_norm": 0.4986805319786072,
"learning_rate": 8.045361030761082e-06,
"loss": 0.1462,
"mean_token_accuracy": 0.9533874779939652,
"num_tokens": 6339460.0,
"step": 2910
},
{
"epoch": 7.829530201342282,
"grad_norm": 0.45700371265411377,
"learning_rate": 8.03153235128393e-06,
"loss": 0.1116,
"mean_token_accuracy": 0.9626628488302231,
"num_tokens": 6363130.0,
"step": 2920
},
{
"epoch": 7.856375838926175,
"grad_norm": 0.4490987956523895,
"learning_rate": 8.017666905318712e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9540461421012878,
"num_tokens": 6386118.0,
"step": 2930
},
{
"epoch": 7.883221476510067,
"grad_norm": 0.3797796666622162,
"learning_rate": 8.003764861025853e-06,
"loss": 0.1231,
"mean_token_accuracy": 0.9578628242015839,
"num_tokens": 6408209.0,
"step": 2940
},
{
"epoch": 7.91006711409396,
"grad_norm": 0.5887638330459595,
"learning_rate": 7.989826387009634e-06,
"loss": 0.1537,
"mean_token_accuracy": 0.9492853492498398,
"num_tokens": 6429237.0,
"step": 2950
},
{
"epoch": 7.936912751677852,
"grad_norm": 0.5216886401176453,
"learning_rate": 7.975851652316162e-06,
"loss": 0.1568,
"mean_token_accuracy": 0.9521301418542862,
"num_tokens": 6448673.0,
"step": 2960
},
{
"epoch": 7.963758389261745,
"grad_norm": 0.4166746735572815,
"learning_rate": 7.961840826431314e-06,
"loss": 0.1264,
"mean_token_accuracy": 0.957261809706688,
"num_tokens": 6471731.0,
"step": 2970
},
{
"epoch": 7.990604026845638,
"grad_norm": 0.5581911206245422,
"learning_rate": 7.947794079278678e-06,
"loss": 0.1457,
"mean_token_accuracy": 0.9534464627504349,
"num_tokens": 6492971.0,
"step": 2980
},
{
"epoch": 8.016107382550336,
"grad_norm": 0.4519020915031433,
"learning_rate": 7.933711581217501e-06,
"loss": 0.119,
"mean_token_accuracy": 0.9589253067970276,
"num_tokens": 6513617.0,
"step": 2990
},
{
"epoch": 8.042953020134227,
"grad_norm": 0.44895055890083313,
"learning_rate": 7.919593503040616e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9571912050247192,
"num_tokens": 6536816.0,
"step": 3000
},
{
"epoch": 8.06979865771812,
"grad_norm": 0.4848621189594269,
"learning_rate": 7.905440015972372e-06,
"loss": 0.1255,
"mean_token_accuracy": 0.9568954467773437,
"num_tokens": 6559060.0,
"step": 3010
},
{
"epoch": 8.096644295302013,
"grad_norm": 0.7106390595436096,
"learning_rate": 7.891251291666554e-06,
"loss": 0.1547,
"mean_token_accuracy": 0.9491069823503494,
"num_tokens": 6580168.0,
"step": 3020
},
{
"epoch": 8.123489932885906,
"grad_norm": 0.5733054876327515,
"learning_rate": 7.877027502204311e-06,
"loss": 0.1588,
"mean_token_accuracy": 0.9492790251970291,
"num_tokens": 6599902.0,
"step": 3030
},
{
"epoch": 8.1503355704698,
"grad_norm": 0.5422059297561646,
"learning_rate": 7.862768820092061e-06,
"loss": 0.1234,
"mean_token_accuracy": 0.9600847691297532,
"num_tokens": 6622088.0,
"step": 3040
},
{
"epoch": 8.17718120805369,
"grad_norm": 0.4836737811565399,
"learning_rate": 7.848475418259399e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9583497941493988,
"num_tokens": 6645425.0,
"step": 3050
},
{
"epoch": 8.204026845637584,
"grad_norm": 0.4681214988231659,
"learning_rate": 7.834147470057006e-06,
"loss": 0.1252,
"mean_token_accuracy": 0.957104617357254,
"num_tokens": 6667702.0,
"step": 3060
},
{
"epoch": 8.230872483221477,
"grad_norm": 0.6612587571144104,
"learning_rate": 7.819785149254534e-06,
"loss": 0.135,
"mean_token_accuracy": 0.9549255698919297,
"num_tokens": 6689095.0,
"step": 3070
},
{
"epoch": 8.25771812080537,
"grad_norm": 0.5453577637672424,
"learning_rate": 7.805388630038512e-06,
"loss": 0.1644,
"mean_token_accuracy": 0.9499281167984008,
"num_tokens": 6709062.0,
"step": 3080
},
{
"epoch": 8.284563758389261,
"grad_norm": 0.5287258625030518,
"learning_rate": 7.790958087010234e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.9588546067476272,
"num_tokens": 6731280.0,
"step": 3090
},
{
"epoch": 8.311409395973154,
"grad_norm": 0.5968199372291565,
"learning_rate": 7.776493695183623e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.9554890125989914,
"num_tokens": 6754515.0,
"step": 3100
},
{
"epoch": 8.338255033557047,
"grad_norm": 0.4913158118724823,
"learning_rate": 7.761995629983129e-06,
"loss": 0.1253,
"mean_token_accuracy": 0.9566126644611359,
"num_tokens": 6776753.0,
"step": 3110
},
{
"epoch": 8.36510067114094,
"grad_norm": 0.853453516960144,
"learning_rate": 7.74746406724159e-06,
"loss": 0.1529,
"mean_token_accuracy": 0.9489505797624588,
"num_tokens": 6797907.0,
"step": 3120
},
{
"epoch": 8.391946308724831,
"grad_norm": 0.5716426968574524,
"learning_rate": 7.732899183198108e-06,
"loss": 0.1569,
"mean_token_accuracy": 0.9509849786758423,
"num_tokens": 6817642.0,
"step": 3130
},
{
"epoch": 8.418791946308724,
"grad_norm": 0.4912736117839813,
"learning_rate": 7.718301154495897e-06,
"loss": 0.1211,
"mean_token_accuracy": 0.9595335066318512,
"num_tokens": 6839853.0,
"step": 3140
},
{
"epoch": 8.445637583892617,
"grad_norm": 0.5178288817405701,
"learning_rate": 7.70367015818016e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.9598266303539276,
"num_tokens": 6863268.0,
"step": 3150
},
{
"epoch": 8.47248322147651,
"grad_norm": 0.4956505000591278,
"learning_rate": 7.689006371695928e-06,
"loss": 0.1278,
"mean_token_accuracy": 0.957508260011673,
"num_tokens": 6885638.0,
"step": 3160
},
{
"epoch": 8.499328859060403,
"grad_norm": 0.7209757566452026,
"learning_rate": 7.674309972885909e-06,
"loss": 0.1411,
"mean_token_accuracy": 0.9534810066223145,
"num_tokens": 6907038.0,
"step": 3170
},
{
"epoch": 8.526174496644295,
"grad_norm": 0.6008272767066956,
"learning_rate": 7.659581139988339e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.9486708849668503,
"num_tokens": 6926858.0,
"step": 3180
},
{
"epoch": 8.553020134228188,
"grad_norm": 0.5501519441604614,
"learning_rate": 7.644820051634813e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.9615104466676712,
"num_tokens": 6949093.0,
"step": 3190
},
{
"epoch": 8.57986577181208,
"grad_norm": 0.6659561991691589,
"learning_rate": 7.630026886848118e-06,
"loss": 0.1237,
"mean_token_accuracy": 0.9558080345392227,
"num_tokens": 6972467.0,
"step": 3200
},
{
"epoch": 8.606711409395974,
"grad_norm": 0.5133163332939148,
"learning_rate": 7.61520182504007e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.957629781961441,
"num_tokens": 6994729.0,
"step": 3210
},
{
"epoch": 8.633557046979865,
"grad_norm": 0.7774618864059448,
"learning_rate": 7.60034504600933e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9532318562269211,
"num_tokens": 7015998.0,
"step": 3220
},
{
"epoch": 8.660402684563758,
"grad_norm": 0.604790985584259,
"learning_rate": 7.585456729939225e-06,
"loss": 0.1663,
"mean_token_accuracy": 0.9489589869976044,
"num_tokens": 7035712.0,
"step": 3230
},
{
"epoch": 8.687248322147651,
"grad_norm": 0.5196418166160583,
"learning_rate": 7.570537057395566e-06,
"loss": 0.1141,
"mean_token_accuracy": 0.9628522455692291,
"num_tokens": 7057968.0,
"step": 3240
},
{
"epoch": 8.714093959731544,
"grad_norm": 0.5068167448043823,
"learning_rate": 7.555586209324455e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.9594800651073456,
"num_tokens": 7081387.0,
"step": 3250
},
{
"epoch": 8.740939597315435,
"grad_norm": 0.504896879196167,
"learning_rate": 7.540604367050091e-06,
"loss": 0.1341,
"mean_token_accuracy": 0.9530034631490707,
"num_tokens": 7103855.0,
"step": 3260
},
{
"epoch": 8.767785234899328,
"grad_norm": 0.5507974624633789,
"learning_rate": 7.525591712272574e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9571410089731216,
"num_tokens": 7125521.0,
"step": 3270
},
{
"epoch": 8.794630872483221,
"grad_norm": 0.5960702300071716,
"learning_rate": 7.510548427065693e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.9465545684099197,
"num_tokens": 7145574.0,
"step": 3280
},
{
"epoch": 8.821476510067114,
"grad_norm": 0.4573246240615845,
"learning_rate": 7.495474693874731e-06,
"loss": 0.1201,
"mean_token_accuracy": 0.9599078118801116,
"num_tokens": 7167649.0,
"step": 3290
},
{
"epoch": 8.848322147651007,
"grad_norm": 0.4620523154735565,
"learning_rate": 7.4803706955142385e-06,
"loss": 0.1269,
"mean_token_accuracy": 0.9555114895105362,
"num_tokens": 7190906.0,
"step": 3300
},
{
"epoch": 8.875167785234899,
"grad_norm": 0.438799113035202,
"learning_rate": 7.465236615165826e-06,
"loss": 0.1204,
"mean_token_accuracy": 0.9585326343774796,
"num_tokens": 7213279.0,
"step": 3310
},
{
"epoch": 8.902013422818792,
"grad_norm": 0.7440369129180908,
"learning_rate": 7.450072636375939e-06,
"loss": 0.1266,
"mean_token_accuracy": 0.9576111942529678,
"num_tokens": 7234946.0,
"step": 3320
},
{
"epoch": 8.928859060402685,
"grad_norm": 0.6318687796592712,
"learning_rate": 7.4348789430536275e-06,
"loss": 0.1714,
"mean_token_accuracy": 0.9475503444671631,
"num_tokens": 7255133.0,
"step": 3330
},
{
"epoch": 8.955704697986578,
"grad_norm": 0.523369312286377,
"learning_rate": 7.4196557194683265e-06,
"loss": 0.1218,
"mean_token_accuracy": 0.9601493507623673,
"num_tokens": 7277115.0,
"step": 3340
},
{
"epoch": 8.982550335570469,
"grad_norm": 0.592207133769989,
"learning_rate": 7.40440315024761e-06,
"loss": 0.1281,
"mean_token_accuracy": 0.9569019913673401,
"num_tokens": 7299266.0,
"step": 3350
},
{
"epoch": 9.008053691275167,
"grad_norm": 0.656379759311676,
"learning_rate": 7.389121420374961e-06,
"loss": 0.1389,
"mean_token_accuracy": 0.9550388643616124,
"num_tokens": 7318761.0,
"step": 3360
},
{
"epoch": 9.03489932885906,
"grad_norm": 0.5115005970001221,
"learning_rate": 7.373810715187516e-06,
"loss": 0.1067,
"mean_token_accuracy": 0.9614180713891983,
"num_tokens": 7342447.0,
"step": 3370
},
{
"epoch": 9.061744966442953,
"grad_norm": 0.4467087984085083,
"learning_rate": 7.358471220373831e-06,
"loss": 0.1185,
"mean_token_accuracy": 0.9584500521421433,
"num_tokens": 7365127.0,
"step": 3380
},
{
"epoch": 9.088590604026846,
"grad_norm": 0.6290796399116516,
"learning_rate": 7.343103121971623e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9590140283107758,
"num_tokens": 7386969.0,
"step": 3390
},
{
"epoch": 9.115436241610738,
"grad_norm": 0.684598445892334,
"learning_rate": 7.327706606365512e-06,
"loss": 0.1561,
"mean_token_accuracy": 0.9502658367156982,
"num_tokens": 7407550.0,
"step": 3400
},
{
"epoch": 9.14228187919463,
"grad_norm": 0.7307904362678528,
"learning_rate": 7.3122818602847624e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9587647318840027,
"num_tokens": 7428380.0,
"step": 3410
},
{
"epoch": 9.169127516778524,
"grad_norm": 0.6057212352752686,
"learning_rate": 7.296829070801017e-06,
"loss": 0.1098,
"mean_token_accuracy": 0.9636854767799378,
"num_tokens": 7452040.0,
"step": 3420
},
{
"epoch": 9.195973154362417,
"grad_norm": 0.589817225933075,
"learning_rate": 7.281348425326034e-06,
"loss": 0.1226,
"mean_token_accuracy": 0.9586849749088288,
"num_tokens": 7474538.0,
"step": 3430
},
{
"epoch": 9.22281879194631,
"grad_norm": 0.8453562259674072,
"learning_rate": 7.265840111609405e-06,
"loss": 0.1187,
"mean_token_accuracy": 0.9598051875829696,
"num_tokens": 7496163.0,
"step": 3440
},
{
"epoch": 9.2496644295302,
"grad_norm": 0.6440286636352539,
"learning_rate": 7.250304317736286e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.9493000030517578,
"num_tokens": 7516288.0,
"step": 3450
},
{
"epoch": 9.276510067114094,
"grad_norm": 0.7584598064422607,
"learning_rate": 7.234741232125111e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9604197144508362,
"num_tokens": 7537035.0,
"step": 3460
},
{
"epoch": 9.303355704697987,
"grad_norm": 0.623458743095398,
"learning_rate": 7.219151043525311e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9581877291202545,
"num_tokens": 7560587.0,
"step": 3470
},
{
"epoch": 9.33020134228188,
"grad_norm": 0.5248258113861084,
"learning_rate": 7.203533941015019e-06,
"loss": 0.1235,
"mean_token_accuracy": 0.9568084627389908,
"num_tokens": 7583176.0,
"step": 3480
},
{
"epoch": 9.357046979865771,
"grad_norm": 0.8949034214019775,
"learning_rate": 7.1878901139987826e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9586084365844727,
"num_tokens": 7604880.0,
"step": 3490
},
{
"epoch": 9.383892617449664,
"grad_norm": 0.7485925555229187,
"learning_rate": 7.172219752205265e-06,
"loss": 0.1614,
"mean_token_accuracy": 0.9506101936101914,
"num_tokens": 7625182.0,
"step": 3500
},
{
"epoch": 9.410738255033557,
"grad_norm": 0.6359798312187195,
"learning_rate": 7.156523045684944e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9604356437921524,
"num_tokens": 7645893.0,
"step": 3510
},
{
"epoch": 9.43758389261745,
"grad_norm": 0.6602040529251099,
"learning_rate": 7.140800184807805e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9612102717161178,
"num_tokens": 7669544.0,
"step": 3520
},
{
"epoch": 9.464429530201341,
"grad_norm": 0.5637961030006409,
"learning_rate": 7.1250513602610364e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9569024622440339,
"num_tokens": 7692213.0,
"step": 3530
},
{
"epoch": 9.491275167785235,
"grad_norm": 0.7590091228485107,
"learning_rate": 7.109276763046713e-06,
"loss": 0.1172,
"mean_token_accuracy": 0.9609732508659363,
"num_tokens": 7713906.0,
"step": 3540
},
{
"epoch": 9.518120805369128,
"grad_norm": 0.8090452551841736,
"learning_rate": 7.09347658447948e-06,
"loss": 0.1623,
"mean_token_accuracy": 0.947560715675354,
"num_tokens": 7734201.0,
"step": 3550
},
{
"epoch": 9.54496644295302,
"grad_norm": 0.5834486484527588,
"learning_rate": 7.077651016184235e-06,
"loss": 0.1259,
"mean_token_accuracy": 0.9605411350727081,
"num_tokens": 7755049.0,
"step": 3560
},
{
"epoch": 9.571812080536914,
"grad_norm": 0.541755735874176,
"learning_rate": 7.061800250093804e-06,
"loss": 0.116,
"mean_token_accuracy": 0.9606775552034378,
"num_tokens": 7778737.0,
"step": 3570
},
{
"epoch": 9.598657718120805,
"grad_norm": 0.5823907852172852,
"learning_rate": 7.0459244784466115e-06,
"loss": 0.1249,
"mean_token_accuracy": 0.9562828868627549,
"num_tokens": 7801497.0,
"step": 3580
},
{
"epoch": 9.625503355704698,
"grad_norm": 0.7929330468177795,
"learning_rate": 7.03002389378435e-06,
"loss": 0.1255,
"mean_token_accuracy": 0.9591152399778367,
"num_tokens": 7823289.0,
"step": 3590
},
{
"epoch": 9.65234899328859,
"grad_norm": 0.7745299935340881,
"learning_rate": 7.014098688949643e-06,
"loss": 0.168,
"mean_token_accuracy": 0.9466694802045822,
"num_tokens": 7843625.0,
"step": 3600
},
{
"epoch": 9.679194630872484,
"grad_norm": 0.6910697817802429,
"learning_rate": 6.998149057083711e-06,
"loss": 0.1208,
"mean_token_accuracy": 0.9632305800914764,
"num_tokens": 7864423.0,
"step": 3610
},
{
"epoch": 9.706040268456375,
"grad_norm": 0.7385261654853821,
"learning_rate": 6.982175191624022e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9590048015117645,
"num_tokens": 7888184.0,
"step": 3620
},
{
"epoch": 9.732885906040268,
"grad_norm": 0.6523067355155945,
"learning_rate": 6.966177286301954e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9572295129299164,
"num_tokens": 7910819.0,
"step": 3630
},
{
"epoch": 9.759731543624161,
"grad_norm": 0.7242245674133301,
"learning_rate": 6.950155535140439e-06,
"loss": 0.1198,
"mean_token_accuracy": 0.9601730585098267,
"num_tokens": 7932588.0,
"step": 3640
},
{
"epoch": 9.786577181208054,
"grad_norm": 0.724774181842804,
"learning_rate": 6.934110132451611e-06,
"loss": 0.1595,
"mean_token_accuracy": 0.9477441519498825,
"num_tokens": 7953004.0,
"step": 3650
},
{
"epoch": 9.813422818791945,
"grad_norm": 0.5886031985282898,
"learning_rate": 6.918041272834451e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.9589419364929199,
"num_tokens": 7973860.0,
"step": 3660
},
{
"epoch": 9.840268456375838,
"grad_norm": 0.5724550485610962,
"learning_rate": 6.901949151172427e-06,
"loss": 0.1139,
"mean_token_accuracy": 0.9589464545249939,
"num_tokens": 7997506.0,
"step": 3670
},
{
"epoch": 9.867114093959731,
"grad_norm": 0.6673897504806519,
"learning_rate": 6.885833962631126e-06,
"loss": 0.1317,
"mean_token_accuracy": 0.954210615158081,
"num_tokens": 8020020.0,
"step": 3680
},
{
"epoch": 9.893959731543625,
"grad_norm": 0.7852922081947327,
"learning_rate": 6.869695902655898e-06,
"loss": 0.1195,
"mean_token_accuracy": 0.9600458711385726,
"num_tokens": 8041729.0,
"step": 3690
},
{
"epoch": 9.920805369127518,
"grad_norm": 0.6804185509681702,
"learning_rate": 6.8535351669694694e-06,
"loss": 0.1654,
"mean_token_accuracy": 0.947740015387535,
"num_tokens": 8061952.0,
"step": 3700
},
{
"epoch": 9.947651006711409,
"grad_norm": 0.5824105143547058,
"learning_rate": 6.837351951569584e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.9590909868478775,
"num_tokens": 8082665.0,
"step": 3710
},
{
"epoch": 9.974496644295302,
"grad_norm": 0.5961267948150635,
"learning_rate": 6.821146452726617e-06,
"loss": 0.1166,
"mean_token_accuracy": 0.959626880288124,
"num_tokens": 8105297.0,
"step": 3720
},
{
"epoch": 10.0,
"grad_norm": 1.5416700839996338,
"learning_rate": 6.8049188669812024e-06,
"loss": 0.1366,
"mean_token_accuracy": 0.9568514949397037,
"num_tokens": 8123690.0,
"step": 3730
},
{
"epoch": 10.026845637583893,
"grad_norm": 0.5632114410400391,
"learning_rate": 6.788669391141837e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9649410545825958,
"num_tokens": 8147756.0,
"step": 3740
},
{
"epoch": 10.053691275167786,
"grad_norm": 0.6619052290916443,
"learning_rate": 6.772398222282507e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9557583898305893,
"num_tokens": 8170704.0,
"step": 3750
},
{
"epoch": 10.080536912751677,
"grad_norm": 0.7061659097671509,
"learning_rate": 6.756105557740289e-06,
"loss": 0.1112,
"mean_token_accuracy": 0.961877191066742,
"num_tokens": 8192810.0,
"step": 3760
},
{
"epoch": 10.10738255033557,
"grad_norm": 0.8877546787261963,
"learning_rate": 6.739791595112964e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9570236325263977,
"num_tokens": 8213881.0,
"step": 3770
},
{
"epoch": 10.134228187919463,
"grad_norm": 0.7186614871025085,
"learning_rate": 6.7234565322566116e-06,
"loss": 0.1394,
"mean_token_accuracy": 0.9572271972894668,
"num_tokens": 8233288.0,
"step": 3780
},
{
"epoch": 10.161073825503356,
"grad_norm": 0.8258345723152161,
"learning_rate": 6.707100567283217e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9627465546131134,
"num_tokens": 8257423.0,
"step": 3790
},
{
"epoch": 10.187919463087248,
"grad_norm": 0.721269428730011,
"learning_rate": 6.690723898558267e-06,
"loss": 0.1125,
"mean_token_accuracy": 0.959991529583931,
"num_tokens": 8280282.0,
"step": 3800
},
{
"epoch": 10.21476510067114,
"grad_norm": 0.6415335536003113,
"learning_rate": 6.6743267246983445e-06,
"loss": 0.1063,
"mean_token_accuracy": 0.9647485375404358,
"num_tokens": 8302156.0,
"step": 3810
},
{
"epoch": 10.241610738255034,
"grad_norm": 1.024770736694336,
"learning_rate": 6.657909244568721e-06,
"loss": 0.1499,
"mean_token_accuracy": 0.9528948366641998,
"num_tokens": 8322759.0,
"step": 3820
},
{
"epoch": 10.268456375838927,
"grad_norm": 0.6429427266120911,
"learning_rate": 6.641471657280937e-06,
"loss": 0.132,
"mean_token_accuracy": 0.959844994544983,
"num_tokens": 8342010.0,
"step": 3830
},
{
"epoch": 10.29530201342282,
"grad_norm": 0.7093128561973572,
"learning_rate": 6.625014162190397e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9622471898794174,
"num_tokens": 8366113.0,
"step": 3840
},
{
"epoch": 10.322147651006711,
"grad_norm": 0.5344433784484863,
"learning_rate": 6.608536958893948e-06,
"loss": 0.1119,
"mean_token_accuracy": 0.9615410745143891,
"num_tokens": 8388950.0,
"step": 3850
},
{
"epoch": 10.348993288590604,
"grad_norm": 0.6051499843597412,
"learning_rate": 6.59204024722746e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9628863126039505,
"num_tokens": 8410787.0,
"step": 3860
},
{
"epoch": 10.375838926174497,
"grad_norm": 0.9594293236732483,
"learning_rate": 6.575524227263397e-06,
"loss": 0.1526,
"mean_token_accuracy": 0.951352596282959,
"num_tokens": 8431450.0,
"step": 3870
},
{
"epoch": 10.40268456375839,
"grad_norm": 0.7486820816993713,
"learning_rate": 6.5589890993083934e-06,
"loss": 0.1314,
"mean_token_accuracy": 0.9604231595993042,
"num_tokens": 8450653.0,
"step": 3880
},
{
"epoch": 10.429530201342281,
"grad_norm": 0.8094140887260437,
"learning_rate": 6.542435063900834e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9626900613307953,
"num_tokens": 8474784.0,
"step": 3890
},
{
"epoch": 10.456375838926174,
"grad_norm": 1.2699894905090332,
"learning_rate": 6.525862321808403e-06,
"loss": 0.1137,
"mean_token_accuracy": 0.9591136366128922,
"num_tokens": 8497679.0,
"step": 3900
},
{
"epoch": 10.483221476510067,
"grad_norm": 0.7747824788093567,
"learning_rate": 6.509271074025668e-06,
"loss": 0.1107,
"mean_token_accuracy": 0.9634034723043442,
"num_tokens": 8519621.0,
"step": 3910
},
{
"epoch": 10.51006711409396,
"grad_norm": 0.882911741733551,
"learning_rate": 6.49266152177163e-06,
"loss": 0.153,
"mean_token_accuracy": 0.9512349933385849,
"num_tokens": 8540214.0,
"step": 3920
},
{
"epoch": 10.536912751677852,
"grad_norm": 0.6655781865119934,
"learning_rate": 6.476033866487287e-06,
"loss": 0.1283,
"mean_token_accuracy": 0.9606866925954819,
"num_tokens": 8559450.0,
"step": 3930
},
{
"epoch": 10.563758389261745,
"grad_norm": 0.6774150729179382,
"learning_rate": 6.459388309833193e-06,
"loss": 0.1069,
"mean_token_accuracy": 0.9630162745714188,
"num_tokens": 8583592.0,
"step": 3940
},
{
"epoch": 10.590604026845638,
"grad_norm": 0.764737606048584,
"learning_rate": 6.442725053687009e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9564787149429321,
"num_tokens": 8606534.0,
"step": 3950
},
{
"epoch": 10.61744966442953,
"grad_norm": 0.6622138023376465,
"learning_rate": 6.426044300141054e-06,
"loss": 0.1116,
"mean_token_accuracy": 0.9616225004196167,
"num_tokens": 8628580.0,
"step": 3960
},
{
"epoch": 10.644295302013422,
"grad_norm": 1.1456904411315918,
"learning_rate": 6.409346251499859e-06,
"loss": 0.147,
"mean_token_accuracy": 0.9527129501104354,
"num_tokens": 8649304.0,
"step": 3970
},
{
"epoch": 10.671140939597315,
"grad_norm": 0.723003625869751,
"learning_rate": 6.392631110277707e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9604730904102325,
"num_tokens": 8668520.0,
"step": 3980
},
{
"epoch": 10.697986577181208,
"grad_norm": 0.81331866979599,
"learning_rate": 6.375899079196184e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9621582269668579,
"num_tokens": 8692652.0,
"step": 3990
},
{
"epoch": 10.724832214765101,
"grad_norm": 0.6479800343513489,
"learning_rate": 6.3591503611817155e-06,
"loss": 0.1157,
"mean_token_accuracy": 0.9594815254211426,
"num_tokens": 8715659.0,
"step": 4000
},
{
"epoch": 10.751677852348994,
"grad_norm": 0.7673507332801819,
"learning_rate": 6.342385159363102e-06,
"loss": 0.1183,
"mean_token_accuracy": 0.959958502650261,
"num_tokens": 8737763.0,
"step": 4010
},
{
"epoch": 10.778523489932885,
"grad_norm": 1.0533620119094849,
"learning_rate": 6.325603677069067e-06,
"loss": 0.1515,
"mean_token_accuracy": 0.9495168924331665,
"num_tokens": 8758611.0,
"step": 4020
},
{
"epoch": 10.805369127516778,
"grad_norm": 0.8211329579353333,
"learning_rate": 6.308806117825777e-06,
"loss": 0.1358,
"mean_token_accuracy": 0.9576447039842606,
"num_tokens": 8777989.0,
"step": 4030
},
{
"epoch": 10.832214765100671,
"grad_norm": 0.7280214428901672,
"learning_rate": 6.291992685354386e-06,
"loss": 0.1117,
"mean_token_accuracy": 0.961353474855423,
"num_tokens": 8802066.0,
"step": 4040
},
{
"epoch": 10.859060402684564,
"grad_norm": 0.7263833284378052,
"learning_rate": 6.2751635835685575e-06,
"loss": 0.1164,
"mean_token_accuracy": 0.9582950919866562,
"num_tokens": 8824929.0,
"step": 4050
},
{
"epoch": 10.885906040268456,
"grad_norm": 0.7844878435134888,
"learning_rate": 6.25831901657199e-06,
"loss": 0.1105,
"mean_token_accuracy": 0.9616876095533371,
"num_tokens": 8846953.0,
"step": 4060
},
{
"epoch": 10.912751677852349,
"grad_norm": 1.225616693496704,
"learning_rate": 6.241459188655944e-06,
"loss": 0.1451,
"mean_token_accuracy": 0.9530714869499206,
"num_tokens": 8867986.0,
"step": 4070
},
{
"epoch": 10.939597315436242,
"grad_norm": 0.8232021927833557,
"learning_rate": 6.224584304296769e-06,
"loss": 0.1358,
"mean_token_accuracy": 0.9574162900447846,
"num_tokens": 8887395.0,
"step": 4080
},
{
"epoch": 10.966442953020135,
"grad_norm": 0.7063604593276978,
"learning_rate": 6.207694568153418e-06,
"loss": 0.1131,
"mean_token_accuracy": 0.9610060393810272,
"num_tokens": 8910823.0,
"step": 4090
},
{
"epoch": 10.993288590604028,
"grad_norm": 0.9226030707359314,
"learning_rate": 6.1907901850649636e-06,
"loss": 0.1344,
"mean_token_accuracy": 0.9559124350547791,
"num_tokens": 8931972.0,
"step": 4100
},
{
"epoch": 11.018791946308724,
"grad_norm": 0.6862479448318481,
"learning_rate": 6.1738713600481205e-06,
"loss": 0.106,
"mean_token_accuracy": 0.9632336026743838,
"num_tokens": 8953111.0,
"step": 4110
},
{
"epoch": 11.045637583892617,
"grad_norm": 0.8745356202125549,
"learning_rate": 6.156938298294752e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.9619537621736527,
"num_tokens": 8976409.0,
"step": 4120
},
{
"epoch": 11.07248322147651,
"grad_norm": 0.7044370770454407,
"learning_rate": 6.139991205169391e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9622800439596176,
"num_tokens": 8998696.0,
"step": 4130
},
{
"epoch": 11.099328859060403,
"grad_norm": 1.0560659170150757,
"learning_rate": 6.123030286206736e-06,
"loss": 0.1132,
"mean_token_accuracy": 0.9616888105869293,
"num_tokens": 9020113.0,
"step": 4140
},
{
"epoch": 11.126174496644296,
"grad_norm": 0.7895893454551697,
"learning_rate": 6.106055747109169e-06,
"loss": 0.1325,
"mean_token_accuracy": 0.9582968652248383,
"num_tokens": 9039986.0,
"step": 4150
},
{
"epoch": 11.153020134228187,
"grad_norm": 0.9621571898460388,
"learning_rate": 6.089067793744258e-06,
"loss": 0.1044,
"mean_token_accuracy": 0.9661745488643646,
"num_tokens": 9062685.0,
"step": 4160
},
{
"epoch": 11.17986577181208,
"grad_norm": 0.9362059831619263,
"learning_rate": 6.0720666321422574e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.9622334897518158,
"num_tokens": 9085989.0,
"step": 4170
},
{
"epoch": 11.206711409395973,
"grad_norm": 1.2377800941467285,
"learning_rate": 6.055052468493614e-06,
"loss": 0.114,
"mean_token_accuracy": 0.9611568659543991,
"num_tokens": 9108336.0,
"step": 4180
},
{
"epoch": 11.233557046979866,
"grad_norm": 1.4495049715042114,
"learning_rate": 6.038025509146459e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9592485100030899,
"num_tokens": 9129631.0,
"step": 4190
},
{
"epoch": 11.260402684563758,
"grad_norm": 0.8522602915763855,
"learning_rate": 6.020985960604115e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9578620493412018,
"num_tokens": 9149360.0,
"step": 4200
},
{
"epoch": 11.28724832214765,
"grad_norm": 1.1033200025558472,
"learning_rate": 6.0039340295225845e-06,
"loss": 0.1053,
"mean_token_accuracy": 0.9642931789159774,
"num_tokens": 9172104.0,
"step": 4210
},
{
"epoch": 11.314093959731544,
"grad_norm": 0.9398940205574036,
"learning_rate": 5.986869922708048e-06,
"loss": 0.1073,
"mean_token_accuracy": 0.9612972408533096,
"num_tokens": 9195342.0,
"step": 4220
},
{
"epoch": 11.340939597315437,
"grad_norm": 0.7499808073043823,
"learning_rate": 5.969793847114349e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9620550066232681,
"num_tokens": 9217655.0,
"step": 4230
},
{
"epoch": 11.367785234899328,
"grad_norm": 1.2773305177688599,
"learning_rate": 5.952706009840491e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9608386933803559,
"num_tokens": 9239082.0,
"step": 4240
},
{
"epoch": 11.394630872483221,
"grad_norm": 0.9053451418876648,
"learning_rate": 5.935606618128124e-06,
"loss": 0.1434,
"mean_token_accuracy": 0.9550743252038956,
"num_tokens": 9258824.0,
"step": 4250
},
{
"epoch": 11.421476510067114,
"grad_norm": 0.9365808367729187,
"learning_rate": 5.918495879359032e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9643994092941284,
"num_tokens": 9281610.0,
"step": 4260
},
{
"epoch": 11.448322147651007,
"grad_norm": 0.958976149559021,
"learning_rate": 5.901374001052614e-06,
"loss": 0.1048,
"mean_token_accuracy": 0.9623458862304688,
"num_tokens": 9304906.0,
"step": 4270
},
{
"epoch": 11.4751677852349,
"grad_norm": 1.1884407997131348,
"learning_rate": 5.884241190863367e-06,
"loss": 0.1097,
"mean_token_accuracy": 0.9642301768064498,
"num_tokens": 9327141.0,
"step": 4280
},
{
"epoch": 11.502013422818791,
"grad_norm": 1.3043391704559326,
"learning_rate": 5.867097656578375e-06,
"loss": 0.1276,
"mean_token_accuracy": 0.9575917005538941,
"num_tokens": 9348426.0,
"step": 4290
},
{
"epoch": 11.528859060402684,
"grad_norm": 0.8189520835876465,
"learning_rate": 5.849943606114782e-06,
"loss": 0.1394,
"mean_token_accuracy": 0.9571978777647019,
"num_tokens": 9368198.0,
"step": 4300
},
{
"epoch": 11.555704697986577,
"grad_norm": 0.9798721075057983,
"learning_rate": 5.832779247517273e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.9647763341665268,
"num_tokens": 9390810.0,
"step": 4310
},
{
"epoch": 11.58255033557047,
"grad_norm": 0.7926787734031677,
"learning_rate": 5.815604788955549e-06,
"loss": 0.1155,
"mean_token_accuracy": 0.9591783225536347,
"num_tokens": 9413905.0,
"step": 4320
},
{
"epoch": 11.609395973154362,
"grad_norm": 0.7051096558570862,
"learning_rate": 5.798420438721804e-06,
"loss": 0.1091,
"mean_token_accuracy": 0.9627108782529831,
"num_tokens": 9436047.0,
"step": 4330
},
{
"epoch": 11.636241610738255,
"grad_norm": 1.5358829498291016,
"learning_rate": 5.781226405228201e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.956219607591629,
"num_tokens": 9457284.0,
"step": 4340
},
{
"epoch": 11.663087248322148,
"grad_norm": 0.8946168422698975,
"learning_rate": 5.764022897004336e-06,
"loss": 0.1376,
"mean_token_accuracy": 0.957264369726181,
"num_tokens": 9477014.0,
"step": 4350
},
{
"epoch": 11.68993288590604,
"grad_norm": 0.8450965285301208,
"learning_rate": 5.74681012269472e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.9647704660892487,
"num_tokens": 9499686.0,
"step": 4360
},
{
"epoch": 11.716778523489932,
"grad_norm": 0.8520516157150269,
"learning_rate": 5.729588291056243e-06,
"loss": 0.1049,
"mean_token_accuracy": 0.9633782804012299,
"num_tokens": 9522983.0,
"step": 4370
},
{
"epoch": 11.743624161073825,
"grad_norm": 0.7502100467681885,
"learning_rate": 5.7123576109556386e-06,
"loss": 0.1043,
"mean_token_accuracy": 0.9639011263847351,
"num_tokens": 9545186.0,
"step": 4380
},
{
"epoch": 11.770469798657718,
"grad_norm": 1.220030665397644,
"learning_rate": 5.695118291366959e-06,
"loss": 0.1245,
"mean_token_accuracy": 0.9588271796703338,
"num_tokens": 9566314.0,
"step": 4390
},
{
"epoch": 11.797315436241611,
"grad_norm": 1.0362236499786377,
"learning_rate": 5.677870541369034e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9561160743236542,
"num_tokens": 9585969.0,
"step": 4400
},
{
"epoch": 11.824161073825504,
"grad_norm": 1.0146335363388062,
"learning_rate": 5.660614570142938e-06,
"loss": 0.1032,
"mean_token_accuracy": 0.9659360885620117,
"num_tokens": 9608480.0,
"step": 4410
},
{
"epoch": 11.851006711409395,
"grad_norm": 1.0298837423324585,
"learning_rate": 5.643350586969453e-06,
"loss": 0.1067,
"mean_token_accuracy": 0.9613613903522491,
"num_tokens": 9631569.0,
"step": 4420
},
{
"epoch": 11.877852348993288,
"grad_norm": 0.7821735739707947,
"learning_rate": 5.626078801226528e-06,
"loss": 0.1113,
"mean_token_accuracy": 0.961196494102478,
"num_tokens": 9653749.0,
"step": 4430
},
{
"epoch": 11.904697986577181,
"grad_norm": 2.3024230003356934,
"learning_rate": 5.608799422386744e-06,
"loss": 0.1292,
"mean_token_accuracy": 0.9577797710895538,
"num_tokens": 9674872.0,
"step": 4440
},
{
"epoch": 11.931543624161074,
"grad_norm": 0.9050582051277161,
"learning_rate": 5.591512660014773e-06,
"loss": 0.1369,
"mean_token_accuracy": 0.9575184613466263,
"num_tokens": 9694480.0,
"step": 4450
},
{
"epoch": 11.958389261744966,
"grad_norm": 0.9048384428024292,
"learning_rate": 5.57421872376483e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.9633950978517533,
"num_tokens": 9716740.0,
"step": 4460
},
{
"epoch": 11.985234899328859,
"grad_norm": 1.0750577449798584,
"learning_rate": 5.5569178233781384e-06,
"loss": 0.1131,
"mean_token_accuracy": 0.9621215164661407,
"num_tokens": 9738481.0,
"step": 4470
},
{
"epoch": 12.010738255033557,
"grad_norm": 0.8347861170768738,
"learning_rate": 5.539610168680381e-06,
"loss": 0.1152,
"mean_token_accuracy": 0.964070222879711,
"num_tokens": 9758315.0,
"step": 4480
},
{
"epoch": 12.03758389261745,
"grad_norm": 0.9832549691200256,
"learning_rate": 5.522295969579157e-06,
"loss": 0.0956,
"mean_token_accuracy": 0.9664029866456986,
"num_tokens": 9782015.0,
"step": 4490
},
{
"epoch": 12.064429530201343,
"grad_norm": 0.913514256477356,
"learning_rate": 5.50497543606144e-06,
"loss": 0.1038,
"mean_token_accuracy": 0.9633048325777054,
"num_tokens": 9804720.0,
"step": 4500
},
{
"epoch": 12.091275167785234,
"grad_norm": 2.7291688919067383,
"learning_rate": 5.487648778191021e-06,
"loss": 0.1018,
"mean_token_accuracy": 0.9650433152914047,
"num_tokens": 9826495.0,
"step": 4510
},
{
"epoch": 12.118120805369127,
"grad_norm": 1.1719425916671753,
"learning_rate": 5.470316206105971e-06,
"loss": 0.131,
"mean_token_accuracy": 0.9595217138528824,
"num_tokens": 9846765.0,
"step": 4520
},
{
"epoch": 12.14496644295302,
"grad_norm": 1.3220218420028687,
"learning_rate": 5.45297793001609e-06,
"loss": 0.1046,
"mean_token_accuracy": 0.9682448267936706,
"num_tokens": 9868041.0,
"step": 4530
},
{
"epoch": 12.171812080536913,
"grad_norm": 1.0875526666641235,
"learning_rate": 5.435634160200355e-06,
"loss": 0.0976,
"mean_token_accuracy": 0.9661436587572098,
"num_tokens": 9891664.0,
"step": 4540
},
{
"epoch": 12.198657718120806,
"grad_norm": 1.1350823640823364,
"learning_rate": 5.418285107004372e-06,
"loss": 0.1048,
"mean_token_accuracy": 0.9631434679031372,
"num_tokens": 9914268.0,
"step": 4550
},
{
"epoch": 12.225503355704697,
"grad_norm": 1.8183422088623047,
"learning_rate": 5.4009309808378185e-06,
"loss": 0.0967,
"mean_token_accuracy": 0.9672815710306167,
"num_tokens": 9936030.0,
"step": 4560
},
{
"epoch": 12.25234899328859,
"grad_norm": 1.1410795450210571,
"learning_rate": 5.383571992171904e-06,
"loss": 0.1385,
"mean_token_accuracy": 0.9560069739818573,
"num_tokens": 9956185.0,
"step": 4570
},
{
"epoch": 12.279194630872484,
"grad_norm": 1.5358262062072754,
"learning_rate": 5.366208351536809e-06,
"loss": 0.1037,
"mean_token_accuracy": 0.9659923285245895,
"num_tokens": 9977425.0,
"step": 4580
},
{
"epoch": 12.306040268456377,
"grad_norm": 1.0602080821990967,
"learning_rate": 5.34884026951913e-06,
"loss": 0.0997,
"mean_token_accuracy": 0.9650217086076737,
"num_tokens": 10001018.0,
"step": 4590
},
{
"epoch": 12.332885906040268,
"grad_norm": 1.0410484075546265,
"learning_rate": 5.331467956759331e-06,
"loss": 0.1144,
"mean_token_accuracy": 0.9604771822690964,
"num_tokens": 10023449.0,
"step": 4600
},
{
"epoch": 12.35973154362416,
"grad_norm": 1.281563401222229,
"learning_rate": 5.314091623949187e-06,
"loss": 0.1055,
"mean_token_accuracy": 0.9640044838190078,
"num_tokens": 10045006.0,
"step": 4610
},
{
"epoch": 12.386577181208054,
"grad_norm": 1.2597219944000244,
"learning_rate": 5.296711481829227e-06,
"loss": 0.1392,
"mean_token_accuracy": 0.9558876633644104,
"num_tokens": 10065200.0,
"step": 4620
},
{
"epoch": 12.413422818791947,
"grad_norm": 1.0641788244247437,
"learning_rate": 5.279327741186179e-06,
"loss": 0.1021,
"mean_token_accuracy": 0.9678240925073623,
"num_tokens": 10086432.0,
"step": 4630
},
{
"epoch": 12.440268456375838,
"grad_norm": 1.1310027837753296,
"learning_rate": 5.261940612850418e-06,
"loss": 0.0987,
"mean_token_accuracy": 0.9654311060905456,
"num_tokens": 10109941.0,
"step": 4640
},
{
"epoch": 12.467114093959731,
"grad_norm": 0.8842890858650208,
"learning_rate": 5.244550307693398e-06,
"loss": 0.1013,
"mean_token_accuracy": 0.9641594380140305,
"num_tokens": 10132421.0,
"step": 4650
},
{
"epoch": 12.493959731543624,
"grad_norm": 1.3634861707687378,
"learning_rate": 5.227157036625108e-06,
"loss": 0.1003,
"mean_token_accuracy": 0.9646014750003815,
"num_tokens": 10153998.0,
"step": 4660
},
{
"epoch": 12.520805369127517,
"grad_norm": 1.1904743909835815,
"learning_rate": 5.209761010591503e-06,
"loss": 0.1374,
"mean_token_accuracy": 0.9562786787748336,
"num_tokens": 10173970.0,
"step": 4670
},
{
"epoch": 12.54765100671141,
"grad_norm": 1.3061332702636719,
"learning_rate": 5.192362440571955e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.965582725405693,
"num_tokens": 10195289.0,
"step": 4680
},
{
"epoch": 12.574496644295301,
"grad_norm": 1.0577377080917358,
"learning_rate": 5.174961537576685e-06,
"loss": 0.1013,
"mean_token_accuracy": 0.9635748207569123,
"num_tokens": 10218795.0,
"step": 4690
},
{
"epoch": 12.601342281879194,
"grad_norm": 0.8413182497024536,
"learning_rate": 5.15755851264421e-06,
"loss": 0.1114,
"mean_token_accuracy": 0.9613074272871017,
"num_tokens": 10241329.0,
"step": 4700
},
{
"epoch": 12.628187919463087,
"grad_norm": 3.429868221282959,
"learning_rate": 5.140153576838781e-06,
"loss": 0.1125,
"mean_token_accuracy": 0.9631948232650757,
"num_tokens": 10262842.0,
"step": 4710
},
{
"epoch": 12.65503355704698,
"grad_norm": 1.2405115365982056,
"learning_rate": 5.122746941247828e-06,
"loss": 0.1364,
"mean_token_accuracy": 0.958396029472351,
"num_tokens": 10282940.0,
"step": 4720
},
{
"epoch": 12.681879194630872,
"grad_norm": 1.0881459712982178,
"learning_rate": 5.105338816979393e-06,
"loss": 0.1116,
"mean_token_accuracy": 0.9615271121263504,
"num_tokens": 10304123.0,
"step": 4730
},
{
"epoch": 12.708724832214765,
"grad_norm": 1.1950105428695679,
"learning_rate": 5.087929415159571e-06,
"loss": 0.1009,
"mean_token_accuracy": 0.9633850902318954,
"num_tokens": 10327510.0,
"step": 4740
},
{
"epoch": 12.735570469798658,
"grad_norm": 0.8143473267555237,
"learning_rate": 5.070518946929954e-06,
"loss": 0.1052,
"mean_token_accuracy": 0.9625776678323745,
"num_tokens": 10349965.0,
"step": 4750
},
{
"epoch": 12.76241610738255,
"grad_norm": 1.3037388324737549,
"learning_rate": 5.053107623445067e-06,
"loss": 0.101,
"mean_token_accuracy": 0.9653885364532471,
"num_tokens": 10371686.0,
"step": 4760
},
{
"epoch": 12.789261744966442,
"grad_norm": 1.3862019777297974,
"learning_rate": 5.035695655869808e-06,
"loss": 0.137,
"mean_token_accuracy": 0.9558083891868592,
"num_tokens": 10392121.0,
"step": 4770
},
{
"epoch": 12.816107382550335,
"grad_norm": 1.0056508779525757,
"learning_rate": 5.018283255376882e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9667860418558121,
"num_tokens": 10413476.0,
"step": 4780
},
{
"epoch": 12.842953020134228,
"grad_norm": 1.75967538356781,
"learning_rate": 5.000870633144252e-06,
"loss": 0.0992,
"mean_token_accuracy": 0.965215852856636,
"num_tokens": 10437053.0,
"step": 4790
},
{
"epoch": 12.869798657718121,
"grad_norm": 0.9346075654029846,
"learning_rate": 4.983458000352565e-06,
"loss": 0.1126,
"mean_token_accuracy": 0.96172194480896,
"num_tokens": 10459596.0,
"step": 4800
},
{
"epoch": 12.896644295302014,
"grad_norm": 1.8567789793014526,
"learning_rate": 4.966045568182596e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9629136115312577,
"num_tokens": 10481183.0,
"step": 4810
},
{
"epoch": 12.923489932885905,
"grad_norm": 3.24837064743042,
"learning_rate": 4.948633547812691e-06,
"loss": 0.1334,
"mean_token_accuracy": 0.958159077167511,
"num_tokens": 10501315.0,
"step": 4820
},
{
"epoch": 12.950335570469798,
"grad_norm": 0.9896105527877808,
"learning_rate": 4.931222150416197e-06,
"loss": 0.0997,
"mean_token_accuracy": 0.9687875539064408,
"num_tokens": 10522428.0,
"step": 4830
},
{
"epoch": 12.977181208053691,
"grad_norm": 1.075074553489685,
"learning_rate": 4.913811587158908e-06,
"loss": 0.1053,
"mean_token_accuracy": 0.9634338021278381,
"num_tokens": 10544661.0,
"step": 4840
},
{
"epoch": 13.00268456375839,
"grad_norm": 0.8715270757675171,
"learning_rate": 4.896402069196502e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.9614172013182389,
"num_tokens": 10563307.0,
"step": 4850
},
{
"epoch": 13.029530201342283,
"grad_norm": 0.9880079627037048,
"learning_rate": 4.878993807671976e-06,
"loss": 0.0906,
"mean_token_accuracy": 0.9682508319616318,
"num_tokens": 10587248.0,
"step": 4860
},
{
"epoch": 13.056375838926174,
"grad_norm": 1.9502968788146973,
"learning_rate": 4.861587013713096e-06,
"loss": 0.1007,
"mean_token_accuracy": 0.9648101240396499,
"num_tokens": 10610106.0,
"step": 4870
},
{
"epoch": 13.083221476510067,
"grad_norm": 0.9801583290100098,
"learning_rate": 4.8441818984298204e-06,
"loss": 0.0912,
"mean_token_accuracy": 0.9686092883348465,
"num_tokens": 10632149.0,
"step": 4880
},
{
"epoch": 13.11006711409396,
"grad_norm": 1.2754795551300049,
"learning_rate": 4.826778672911757e-06,
"loss": 0.1184,
"mean_token_accuracy": 0.9613798499107361,
"num_tokens": 10653072.0,
"step": 4890
},
{
"epoch": 13.136912751677853,
"grad_norm": 1.2431989908218384,
"learning_rate": 4.809377548225589e-06,
"loss": 0.1108,
"mean_token_accuracy": 0.9650191992521286,
"num_tokens": 10672975.0,
"step": 4900
},
{
"epoch": 13.163758389261744,
"grad_norm": 1.0186421871185303,
"learning_rate": 4.79197873541252e-06,
"loss": 0.0892,
"mean_token_accuracy": 0.9677982062101365,
"num_tokens": 10696744.0,
"step": 4910
},
{
"epoch": 13.190604026845637,
"grad_norm": 1.6983952522277832,
"learning_rate": 4.774582445485721e-06,
"loss": 0.0998,
"mean_token_accuracy": 0.9657606661319733,
"num_tokens": 10719417.0,
"step": 4920
},
{
"epoch": 13.21744966442953,
"grad_norm": 1.8801636695861816,
"learning_rate": 4.757188889427761e-06,
"loss": 0.0969,
"mean_token_accuracy": 0.9664499372243881,
"num_tokens": 10741268.0,
"step": 4930
},
{
"epoch": 13.244295302013423,
"grad_norm": 1.604750633239746,
"learning_rate": 4.73979827818805e-06,
"loss": 0.1261,
"mean_token_accuracy": 0.9594014555215835,
"num_tokens": 10761836.0,
"step": 4940
},
{
"epoch": 13.271140939597316,
"grad_norm": 1.1507794857025146,
"learning_rate": 4.7224108226802915e-06,
"loss": 0.1045,
"mean_token_accuracy": 0.9674178540706635,
"num_tokens": 10781662.0,
"step": 4950
},
{
"epoch": 13.297986577181208,
"grad_norm": 1.8615467548370361,
"learning_rate": 4.7050267337799074e-06,
"loss": 0.0936,
"mean_token_accuracy": 0.9667486816644668,
"num_tokens": 10805617.0,
"step": 4960
},
{
"epoch": 13.3248322147651,
"grad_norm": 1.1684675216674805,
"learning_rate": 4.687646222321496e-06,
"loss": 0.1075,
"mean_token_accuracy": 0.9631021231412887,
"num_tokens": 10828423.0,
"step": 4970
},
{
"epoch": 13.351677852348994,
"grad_norm": 0.9275521039962769,
"learning_rate": 4.670269499096266e-06,
"loss": 0.0935,
"mean_token_accuracy": 0.9685831665992737,
"num_tokens": 10850355.0,
"step": 4980
},
{
"epoch": 13.378523489932887,
"grad_norm": 1.5694960355758667,
"learning_rate": 4.652896774849477e-06,
"loss": 0.1288,
"mean_token_accuracy": 0.9589329659938812,
"num_tokens": 10870919.0,
"step": 4990
},
{
"epoch": 13.405369127516778,
"grad_norm": 1.0490679740905762,
"learning_rate": 4.635528260277899e-06,
"loss": 0.1067,
"mean_token_accuracy": 0.9686432272195816,
"num_tokens": 10890675.0,
"step": 5000
},
{
"epoch": 13.432214765100671,
"grad_norm": 1.5049335956573486,
"learning_rate": 4.618164166027238e-06,
"loss": 0.094,
"mean_token_accuracy": 0.9673222094774246,
"num_tokens": 10914713.0,
"step": 5010
},
{
"epoch": 13.459060402684564,
"grad_norm": 1.205875277519226,
"learning_rate": 4.600804702689598e-06,
"loss": 0.1022,
"mean_token_accuracy": 0.9632074743509292,
"num_tokens": 10937577.0,
"step": 5020
},
{
"epoch": 13.485906040268457,
"grad_norm": 1.0139328241348267,
"learning_rate": 4.583450080800912e-06,
"loss": 0.0957,
"mean_token_accuracy": 0.9681812196969986,
"num_tokens": 10959493.0,
"step": 5030
},
{
"epoch": 13.512751677852348,
"grad_norm": 1.8275400400161743,
"learning_rate": 4.5661005108384e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9604158759117126,
"num_tokens": 10980260.0,
"step": 5040
},
{
"epoch": 13.539597315436241,
"grad_norm": 1.1797428131103516,
"learning_rate": 4.54875620321801e-06,
"loss": 0.1142,
"mean_token_accuracy": 0.9651793152093887,
"num_tokens": 11000267.0,
"step": 5050
},
{
"epoch": 13.566442953020134,
"grad_norm": 1.460530161857605,
"learning_rate": 4.5314173682918704e-06,
"loss": 0.0945,
"mean_token_accuracy": 0.9683007091283798,
"num_tokens": 11024206.0,
"step": 5060
},
{
"epoch": 13.593288590604027,
"grad_norm": 1.042712926864624,
"learning_rate": 4.514084216345736e-06,
"loss": 0.1014,
"mean_token_accuracy": 0.9641075730323792,
"num_tokens": 11046962.0,
"step": 5070
},
{
"epoch": 13.620134228187919,
"grad_norm": 1.3945727348327637,
"learning_rate": 4.496756957596438e-06,
"loss": 0.0934,
"mean_token_accuracy": 0.9680504709482193,
"num_tokens": 11068787.0,
"step": 5080
},
{
"epoch": 13.646979865771812,
"grad_norm": 1.7909672260284424,
"learning_rate": 4.479435802189332e-06,
"loss": 0.1278,
"mean_token_accuracy": 0.9593035578727722,
"num_tokens": 11089125.0,
"step": 5090
},
{
"epoch": 13.673825503355705,
"grad_norm": 1.2251805067062378,
"learning_rate": 4.4621209601957585e-06,
"loss": 0.1054,
"mean_token_accuracy": 0.9684852123260498,
"num_tokens": 11108756.0,
"step": 5100
},
{
"epoch": 13.700671140939598,
"grad_norm": 1.1306813955307007,
"learning_rate": 4.444812641610482e-06,
"loss": 0.0941,
"mean_token_accuracy": 0.9665086060762406,
"num_tokens": 11132698.0,
"step": 5110
},
{
"epoch": 13.72751677852349,
"grad_norm": 1.2338292598724365,
"learning_rate": 4.427511056349157e-06,
"loss": 0.1035,
"mean_token_accuracy": 0.9634651213884353,
"num_tokens": 11155580.0,
"step": 5120
},
{
"epoch": 13.754362416107382,
"grad_norm": 1.1115108728408813,
"learning_rate": 4.410216414245771e-06,
"loss": 0.0954,
"mean_token_accuracy": 0.9664014279842377,
"num_tokens": 11177543.0,
"step": 5130
},
{
"epoch": 13.781208053691275,
"grad_norm": 1.7830060720443726,
"learning_rate": 4.392928925050106e-06,
"loss": 0.1326,
"mean_token_accuracy": 0.9572756230831146,
"num_tokens": 11198259.0,
"step": 5140
},
{
"epoch": 13.808053691275168,
"grad_norm": 1.142449140548706,
"learning_rate": 4.375648798425197e-06,
"loss": 0.1074,
"mean_token_accuracy": 0.96616330742836,
"num_tokens": 11218098.0,
"step": 5150
},
{
"epoch": 13.834899328859061,
"grad_norm": 1.3761417865753174,
"learning_rate": 4.358376243944782e-06,
"loss": 0.0923,
"mean_token_accuracy": 0.9684065580368042,
"num_tokens": 11242063.0,
"step": 5160
},
{
"epoch": 13.861744966442952,
"grad_norm": 0.9292601346969604,
"learning_rate": 4.341111471090762e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.9653068006038665,
"num_tokens": 11264924.0,
"step": 5170
},
{
"epoch": 13.888590604026845,
"grad_norm": 0.9202330708503723,
"learning_rate": 4.323854689250669e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9676958501338959,
"num_tokens": 11286770.0,
"step": 5180
},
{
"epoch": 13.915436241610738,
"grad_norm": 1.5448764562606812,
"learning_rate": 4.3066061077151124e-06,
"loss": 0.1261,
"mean_token_accuracy": 0.9603673964738846,
"num_tokens": 11307393.0,
"step": 5190
},
{
"epoch": 13.942281879194631,
"grad_norm": 1.1237270832061768,
"learning_rate": 4.289365935675255e-06,
"loss": 0.109,
"mean_token_accuracy": 0.9644716501235961,
"num_tokens": 11327277.0,
"step": 5200
},
{
"epoch": 13.969127516778524,
"grad_norm": 0.9842494130134583,
"learning_rate": 4.272134382220263e-06,
"loss": 0.099,
"mean_token_accuracy": 0.9658454984426499,
"num_tokens": 11350369.0,
"step": 5210
},
{
"epoch": 13.995973154362416,
"grad_norm": 1.408265233039856,
"learning_rate": 4.254911656334778e-06,
"loss": 0.1152,
"mean_token_accuracy": 0.963257348537445,
"num_tokens": 11371043.0,
"step": 5220
},
{
"epoch": 14.021476510067114,
"grad_norm": 1.1198426485061646,
"learning_rate": 4.237697966896385e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.971764558239987,
"num_tokens": 11392630.0,
"step": 5230
},
{
"epoch": 14.048322147651007,
"grad_norm": 1.1393156051635742,
"learning_rate": 4.220493522673067e-06,
"loss": 0.0969,
"mean_token_accuracy": 0.9652393728494644,
"num_tokens": 11415708.0,
"step": 5240
},
{
"epoch": 14.0751677852349,
"grad_norm": 1.3301637172698975,
"learning_rate": 4.20329853232069e-06,
"loss": 0.095,
"mean_token_accuracy": 0.9665391176939011,
"num_tokens": 11437808.0,
"step": 5250
},
{
"epoch": 14.102013422818793,
"grad_norm": 2.872422218322754,
"learning_rate": 4.1861132043804555e-06,
"loss": 0.1017,
"mean_token_accuracy": 0.9674932867288589,
"num_tokens": 11458991.0,
"step": 5260
},
{
"epoch": 14.128859060402684,
"grad_norm": 1.053300142288208,
"learning_rate": 4.168937747276381e-06,
"loss": 0.1138,
"mean_token_accuracy": 0.965873995423317,
"num_tokens": 11478611.0,
"step": 5270
},
{
"epoch": 14.155704697986577,
"grad_norm": 1.5345606803894043,
"learning_rate": 4.151772369312772e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9710632592439652,
"num_tokens": 11501870.0,
"step": 5280
},
{
"epoch": 14.18255033557047,
"grad_norm": 1.1510871648788452,
"learning_rate": 4.134617278671694e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9647044003009796,
"num_tokens": 11525013.0,
"step": 5290
},
{
"epoch": 14.209395973154363,
"grad_norm": 1.239437460899353,
"learning_rate": 4.117472683410446e-06,
"loss": 0.0911,
"mean_token_accuracy": 0.9685541450977325,
"num_tokens": 11547218.0,
"step": 5300
},
{
"epoch": 14.236241610738254,
"grad_norm": 2.640573263168335,
"learning_rate": 4.100338791459042e-06,
"loss": 0.1,
"mean_token_accuracy": 0.9668088883161545,
"num_tokens": 11568484.0,
"step": 5310
},
{
"epoch": 14.263087248322147,
"grad_norm": 1.1691937446594238,
"learning_rate": 4.083215810617678e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9638256758451462,
"num_tokens": 11588180.0,
"step": 5320
},
{
"epoch": 14.28993288590604,
"grad_norm": 1.7287318706512451,
"learning_rate": 4.0661039485542326e-06,
"loss": 0.0895,
"mean_token_accuracy": 0.9701910257339478,
"num_tokens": 11611340.0,
"step": 5330
},
{
"epoch": 14.316778523489933,
"grad_norm": 1.492462396621704,
"learning_rate": 4.049003412801724e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9668333351612091,
"num_tokens": 11634481.0,
"step": 5340
},
{
"epoch": 14.343624161073825,
"grad_norm": 1.4218974113464355,
"learning_rate": 4.031914410755809e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9681075811386108,
"num_tokens": 11656604.0,
"step": 5350
},
{
"epoch": 14.370469798657718,
"grad_norm": 2.2234318256378174,
"learning_rate": 4.014837149672266e-06,
"loss": 0.1087,
"mean_token_accuracy": 0.9646641999483109,
"num_tokens": 11677627.0,
"step": 5360
},
{
"epoch": 14.39731543624161,
"grad_norm": 1.1645985841751099,
"learning_rate": 3.997771836664473e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9664602816104889,
"num_tokens": 11697187.0,
"step": 5370
},
{
"epoch": 14.424161073825504,
"grad_norm": 1.700769305229187,
"learning_rate": 3.980718678700909e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9712590277194977,
"num_tokens": 11720310.0,
"step": 5380
},
{
"epoch": 14.451006711409397,
"grad_norm": 1.1342298984527588,
"learning_rate": 3.96367788260263e-06,
"loss": 0.094,
"mean_token_accuracy": 0.9667182564735413,
"num_tokens": 11743414.0,
"step": 5390
},
{
"epoch": 14.477852348993288,
"grad_norm": 1.848952054977417,
"learning_rate": 3.9466496550407675e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.9685674637556076,
"num_tokens": 11765609.0,
"step": 5400
},
{
"epoch": 14.504697986577181,
"grad_norm": 1.5314711332321167,
"learning_rate": 3.929634202534026e-06,
"loss": 0.1052,
"mean_token_accuracy": 0.9641919821500778,
"num_tokens": 11786687.0,
"step": 5410
},
{
"epoch": 14.531543624161074,
"grad_norm": 1.143330693244934,
"learning_rate": 3.912631731446168e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9661551773548126,
"num_tokens": 11806174.0,
"step": 5420
},
{
"epoch": 14.558389261744967,
"grad_norm": 1.3766965866088867,
"learning_rate": 3.895642447983525e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.9679251462221146,
"num_tokens": 11829254.0,
"step": 5430
},
{
"epoch": 14.585234899328858,
"grad_norm": 1.2601746320724487,
"learning_rate": 3.8786665581924805e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.965695607662201,
"num_tokens": 11852275.0,
"step": 5440
},
{
"epoch": 14.612080536912751,
"grad_norm": 1.1301180124282837,
"learning_rate": 3.8617042679569805e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9697641730308533,
"num_tokens": 11874374.0,
"step": 5450
},
{
"epoch": 14.638926174496644,
"grad_norm": 2.486135959625244,
"learning_rate": 3.844755782996043e-06,
"loss": 0.1104,
"mean_token_accuracy": 0.9635231077671051,
"num_tokens": 11895508.0,
"step": 5460
},
{
"epoch": 14.665771812080537,
"grad_norm": 1.2749981880187988,
"learning_rate": 3.827821308861244e-06,
"loss": 0.1214,
"mean_token_accuracy": 0.9636542230844498,
"num_tokens": 11915109.0,
"step": 5470
},
{
"epoch": 14.692617449664429,
"grad_norm": 1.4531991481781006,
"learning_rate": 3.810901050934247e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9712864845991135,
"num_tokens": 11938194.0,
"step": 5480
},
{
"epoch": 14.719463087248322,
"grad_norm": 1.8714344501495361,
"learning_rate": 3.793995214424292e-06,
"loss": 0.0918,
"mean_token_accuracy": 0.9681427836418152,
"num_tokens": 11961341.0,
"step": 5490
},
{
"epoch": 14.746308724832215,
"grad_norm": 1.3651467561721802,
"learning_rate": 3.777104004365721e-06,
"loss": 0.0932,
"mean_token_accuracy": 0.9683119237422944,
"num_tokens": 11983507.0,
"step": 5500
},
{
"epoch": 14.773154362416108,
"grad_norm": 1.960383415222168,
"learning_rate": 3.7602276256154852e-06,
"loss": 0.1098,
"mean_token_accuracy": 0.9644652128219604,
"num_tokens": 12004614.0,
"step": 5510
},
{
"epoch": 14.8,
"grad_norm": 1.2198858261108398,
"learning_rate": 3.7433662828506577e-06,
"loss": 0.1114,
"mean_token_accuracy": 0.965359115600586,
"num_tokens": 12024189.0,
"step": 5520
},
{
"epoch": 14.826845637583892,
"grad_norm": 1.7983379364013672,
"learning_rate": 3.7265201805659618e-06,
"loss": 0.0863,
"mean_token_accuracy": 0.9709195643663406,
"num_tokens": 12047241.0,
"step": 5530
},
{
"epoch": 14.853691275167785,
"grad_norm": 1.500483751296997,
"learning_rate": 3.709689523071277e-06,
"loss": 0.0965,
"mean_token_accuracy": 0.965771397948265,
"num_tokens": 12070280.0,
"step": 5540
},
{
"epoch": 14.880536912751678,
"grad_norm": 1.2716580629348755,
"learning_rate": 3.6928745144891733e-06,
"loss": 0.0954,
"mean_token_accuracy": 0.9673113852739335,
"num_tokens": 12092414.0,
"step": 5550
},
{
"epoch": 14.907382550335571,
"grad_norm": 1.794506549835205,
"learning_rate": 3.676075358752426e-06,
"loss": 0.1128,
"mean_token_accuracy": 0.9624302506446838,
"num_tokens": 12113463.0,
"step": 5560
},
{
"epoch": 14.934228187919462,
"grad_norm": 1.2952619791030884,
"learning_rate": 3.6592922596015516e-06,
"loss": 0.1053,
"mean_token_accuracy": 0.9676823377609253,
"num_tokens": 12132960.0,
"step": 5570
},
{
"epoch": 14.961073825503355,
"grad_norm": 1.7229140996932983,
"learning_rate": 3.6425254205823256e-06,
"loss": 0.0879,
"mean_token_accuracy": 0.9696457594633102,
"num_tokens": 12155835.0,
"step": 5580
},
{
"epoch": 14.987919463087248,
"grad_norm": 5.81431245803833,
"learning_rate": 3.6257750450433284e-06,
"loss": 0.1062,
"mean_token_accuracy": 0.9640632271766663,
"num_tokens": 12177517.0,
"step": 5590
},
{
"epoch": 15.013422818791947,
"grad_norm": 1.7150989770889282,
"learning_rate": 3.609041336133462e-06,
"loss": 0.0895,
"mean_token_accuracy": 0.9702811994050679,
"num_tokens": 12197826.0,
"step": 5600
},
{
"epoch": 15.04026845637584,
"grad_norm": 1.458148717880249,
"learning_rate": 3.5923244967995045e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.9716112166643143,
"num_tokens": 12221328.0,
"step": 5610
},
{
"epoch": 15.06711409395973,
"grad_norm": 1.431036114692688,
"learning_rate": 3.575624729783632e-06,
"loss": 0.0909,
"mean_token_accuracy": 0.9683985263109207,
"num_tokens": 12243817.0,
"step": 5620
},
{
"epoch": 15.093959731543624,
"grad_norm": 1.949104905128479,
"learning_rate": 3.558942237620968e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9721886366605759,
"num_tokens": 12265391.0,
"step": 5630
},
{
"epoch": 15.120805369127517,
"grad_norm": 1.5431227684020996,
"learning_rate": 3.5422772226371315e-06,
"loss": 0.1096,
"mean_token_accuracy": 0.9656341940164566,
"num_tokens": 12285431.0,
"step": 5640
},
{
"epoch": 15.14765100671141,
"grad_norm": 1.9090901613235474,
"learning_rate": 3.5256298869457715e-06,
"loss": 0.0866,
"mean_token_accuracy": 0.9727503031492233,
"num_tokens": 12307130.0,
"step": 5650
},
{
"epoch": 15.174496644295303,
"grad_norm": 1.5246036052703857,
"learning_rate": 3.509000432446128e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9687469393014908,
"num_tokens": 12330587.0,
"step": 5660
},
{
"epoch": 15.201342281879194,
"grad_norm": 1.1196630001068115,
"learning_rate": 3.492389060820574e-06,
"loss": 0.0874,
"mean_token_accuracy": 0.9701620578765869,
"num_tokens": 12352973.0,
"step": 5670
},
{
"epoch": 15.228187919463087,
"grad_norm": 2.1184024810791016,
"learning_rate": 3.47579597353217e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.9701414495706558,
"num_tokens": 12374519.0,
"step": 5680
},
{
"epoch": 15.25503355704698,
"grad_norm": 1.739675760269165,
"learning_rate": 3.4592213718222335e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.9656203925609589,
"num_tokens": 12394483.0,
"step": 5690
},
{
"epoch": 15.281879194630873,
"grad_norm": 1.7435755729675293,
"learning_rate": 3.4426654567078753e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9721616327762603,
"num_tokens": 12416218.0,
"step": 5700
},
{
"epoch": 15.308724832214764,
"grad_norm": 1.2572535276412964,
"learning_rate": 3.426128428979589e-06,
"loss": 0.0898,
"mean_token_accuracy": 0.9683011502027512,
"num_tokens": 12439695.0,
"step": 5710
},
{
"epoch": 15.335570469798657,
"grad_norm": 1.2672688961029053,
"learning_rate": 3.4096104891987903e-06,
"loss": 0.0936,
"mean_token_accuracy": 0.9671826392412186,
"num_tokens": 12462245.0,
"step": 5720
},
{
"epoch": 15.36241610738255,
"grad_norm": 1.6163733005523682,
"learning_rate": 3.3931118376953986e-06,
"loss": 0.0903,
"mean_token_accuracy": 0.9696721345186233,
"num_tokens": 12483912.0,
"step": 5730
},
{
"epoch": 15.389261744966444,
"grad_norm": 1.7301290035247803,
"learning_rate": 3.376632674565411e-06,
"loss": 0.1248,
"mean_token_accuracy": 0.9635996103286744,
"num_tokens": 12504064.0,
"step": 5740
},
{
"epoch": 15.416107382550335,
"grad_norm": 1.6507128477096558,
"learning_rate": 3.3601731996684584e-06,
"loss": 0.0857,
"mean_token_accuracy": 0.97254838347435,
"num_tokens": 12525725.0,
"step": 5750
},
{
"epoch": 15.442953020134228,
"grad_norm": 2.455329656600952,
"learning_rate": 3.343733612625404e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9704270303249359,
"num_tokens": 12549124.0,
"step": 5760
},
{
"epoch": 15.46979865771812,
"grad_norm": 2.2060821056365967,
"learning_rate": 3.3273141128159005e-06,
"loss": 0.0907,
"mean_token_accuracy": 0.9694718390703201,
"num_tokens": 12571502.0,
"step": 5770
},
{
"epoch": 15.496644295302014,
"grad_norm": 1.91788911819458,
"learning_rate": 3.310914899375989e-06,
"loss": 0.089,
"mean_token_accuracy": 0.9696966052055359,
"num_tokens": 12592999.0,
"step": 5780
},
{
"epoch": 15.523489932885907,
"grad_norm": 1.589574933052063,
"learning_rate": 3.294536171195673e-06,
"loss": 0.1121,
"mean_token_accuracy": 0.9663246095180511,
"num_tokens": 12613099.0,
"step": 5790
},
{
"epoch": 15.550335570469798,
"grad_norm": 1.5774067640304565,
"learning_rate": 3.278178126916515e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9742209672927856,
"num_tokens": 12634907.0,
"step": 5800
},
{
"epoch": 15.577181208053691,
"grad_norm": 1.7764393091201782,
"learning_rate": 3.2618409649292153e-06,
"loss": 0.0894,
"mean_token_accuracy": 0.9685896605253219,
"num_tokens": 12658442.0,
"step": 5810
},
{
"epoch": 15.604026845637584,
"grad_norm": 1.441229224205017,
"learning_rate": 3.2455248833712226e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9684127897024155,
"num_tokens": 12680988.0,
"step": 5820
},
{
"epoch": 15.630872483221477,
"grad_norm": 1.5752300024032593,
"learning_rate": 3.2292300801243133e-06,
"loss": 0.091,
"mean_token_accuracy": 0.969354122877121,
"num_tokens": 12702567.0,
"step": 5830
},
{
"epoch": 15.657718120805368,
"grad_norm": 1.5445083379745483,
"learning_rate": 3.212956752812206e-06,
"loss": 0.1182,
"mean_token_accuracy": 0.9640820533037185,
"num_tokens": 12722654.0,
"step": 5840
},
{
"epoch": 15.684563758389261,
"grad_norm": 1.6099363565444946,
"learning_rate": 3.196705098798156e-06,
"loss": 0.0893,
"mean_token_accuracy": 0.9708084911108017,
"num_tokens": 12744530.0,
"step": 5850
},
{
"epoch": 15.711409395973154,
"grad_norm": 1.6343265771865845,
"learning_rate": 3.180475315182563e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9693089485168457,
"num_tokens": 12767788.0,
"step": 5860
},
{
"epoch": 15.738255033557047,
"grad_norm": 1.7334041595458984,
"learning_rate": 3.1642675988005854e-06,
"loss": 0.0929,
"mean_token_accuracy": 0.9685852974653244,
"num_tokens": 12790039.0,
"step": 5870
},
{
"epoch": 15.765100671140939,
"grad_norm": 1.8182802200317383,
"learning_rate": 3.1480821462197464e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9677254110574722,
"num_tokens": 12811427.0,
"step": 5880
},
{
"epoch": 15.791946308724832,
"grad_norm": 2.699237585067749,
"learning_rate": 3.1319191537375577e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9659045994281769,
"num_tokens": 12831229.0,
"step": 5890
},
{
"epoch": 15.818791946308725,
"grad_norm": 1.9692622423171997,
"learning_rate": 3.1157788173791303e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9693997651338577,
"num_tokens": 12852938.0,
"step": 5900
},
{
"epoch": 15.845637583892618,
"grad_norm": 1.4347225427627563,
"learning_rate": 3.0996613328948006e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9709456175565719,
"num_tokens": 12876425.0,
"step": 5910
},
{
"epoch": 15.87248322147651,
"grad_norm": 1.4364287853240967,
"learning_rate": 3.0835668957577636e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9675709336996079,
"num_tokens": 12898803.0,
"step": 5920
},
{
"epoch": 15.899328859060402,
"grad_norm": 1.7437158823013306,
"learning_rate": 3.067495701161686e-06,
"loss": 0.0937,
"mean_token_accuracy": 0.9686466962099075,
"num_tokens": 12920276.0,
"step": 5930
},
{
"epoch": 15.926174496644295,
"grad_norm": 1.5922681093215942,
"learning_rate": 3.051447944018359e-06,
"loss": 0.1165,
"mean_token_accuracy": 0.9641006350517273,
"num_tokens": 12940184.0,
"step": 5940
},
{
"epoch": 15.953020134228188,
"grad_norm": 1.3997697830200195,
"learning_rate": 3.035423818955316e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9725989639759064,
"num_tokens": 12961557.0,
"step": 5950
},
{
"epoch": 15.979865771812081,
"grad_norm": 1.8167921304702759,
"learning_rate": 3.01942352031348e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9666165590286255,
"num_tokens": 12983780.0,
"step": 5960
},
{
"epoch": 16.00536912751678,
"grad_norm": 1.1939505338668823,
"learning_rate": 3.0034472421448134e-06,
"loss": 0.0984,
"mean_token_accuracy": 0.9686258435249329,
"num_tokens": 13002890.0,
"step": 5970
},
{
"epoch": 16.032214765100672,
"grad_norm": 1.3747490644454956,
"learning_rate": 2.987495178209951e-06,
"loss": 0.0782,
"mean_token_accuracy": 0.9740465998649597,
"num_tokens": 13026596.0,
"step": 5980
},
{
"epoch": 16.059060402684565,
"grad_norm": 1.6247341632843018,
"learning_rate": 2.9715675219758598e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9689295053482055,
"num_tokens": 13049138.0,
"step": 5990
},
{
"epoch": 16.085906040268455,
"grad_norm": 1.530810832977295,
"learning_rate": 2.9556644666134903e-06,
"loss": 0.072,
"mean_token_accuracy": 0.9749242842197419,
"num_tokens": 13070906.0,
"step": 6000
},
{
"epoch": 16.112751677852348,
"grad_norm": 1.8794249296188354,
"learning_rate": 2.9397862049954307e-06,
"loss": 0.1095,
"mean_token_accuracy": 0.9656890034675598,
"num_tokens": 13091256.0,
"step": 6010
},
{
"epoch": 16.13959731543624,
"grad_norm": 3.501293659210205,
"learning_rate": 2.9239329296935726e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.973246031999588,
"num_tokens": 13111492.0,
"step": 6020
},
{
"epoch": 16.166442953020134,
"grad_norm": 1.5681910514831543,
"learning_rate": 2.908104832976773e-06,
"loss": 0.0822,
"mean_token_accuracy": 0.9714659541845322,
"num_tokens": 13135289.0,
"step": 6030
},
{
"epoch": 16.193288590604027,
"grad_norm": 1.32642662525177,
"learning_rate": 2.892302106808519e-06,
"loss": 0.0867,
"mean_token_accuracy": 0.9693911045789718,
"num_tokens": 13158186.0,
"step": 6040
},
{
"epoch": 16.22013422818792,
"grad_norm": 1.3679111003875732,
"learning_rate": 2.8765249428446074e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.9712060689926147,
"num_tokens": 13180185.0,
"step": 6050
},
{
"epoch": 16.246979865771813,
"grad_norm": 1.9701911211013794,
"learning_rate": 2.860773532430814e-06,
"loss": 0.1007,
"mean_token_accuracy": 0.9682783782482147,
"num_tokens": 13200699.0,
"step": 6060
},
{
"epoch": 16.273825503355706,
"grad_norm": 1.7542141675949097,
"learning_rate": 2.8450480666005743e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9719853222370147,
"num_tokens": 13220970.0,
"step": 6070
},
{
"epoch": 16.3006711409396,
"grad_norm": 1.3732870817184448,
"learning_rate": 2.8293487360726703e-06,
"loss": 0.0778,
"mean_token_accuracy": 0.9729690462350845,
"num_tokens": 13244725.0,
"step": 6080
},
{
"epoch": 16.32751677852349,
"grad_norm": 1.513179898262024,
"learning_rate": 2.8136757312489104e-06,
"loss": 0.0874,
"mean_token_accuracy": 0.9692198872566223,
"num_tokens": 13267350.0,
"step": 6090
},
{
"epoch": 16.35436241610738,
"grad_norm": 1.625817060470581,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.0788,
"mean_token_accuracy": 0.9728165179491043,
"num_tokens": 13289168.0,
"step": 6100
},
{
"epoch": 16.381208053691275,
"grad_norm": 2.6137592792510986,
"learning_rate": 2.782409458722371e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.9670576930046082,
"num_tokens": 13309717.0,
"step": 6110
},
{
"epoch": 16.408053691275168,
"grad_norm": 1.512478232383728,
"learning_rate": 2.7668165702176007e-06,
"loss": 0.0881,
"mean_token_accuracy": 0.9715318799018859,
"num_tokens": 13330146.0,
"step": 6120
},
{
"epoch": 16.43489932885906,
"grad_norm": 1.7888556718826294,
"learning_rate": 2.7512507658083996e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9712006777524949,
"num_tokens": 13354002.0,
"step": 6130
},
{
"epoch": 16.461744966442954,
"grad_norm": 1.4530067443847656,
"learning_rate": 2.735712234277165e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9709073066711426,
"num_tokens": 13376841.0,
"step": 6140
},
{
"epoch": 16.488590604026847,
"grad_norm": 1.5421147346496582,
"learning_rate": 2.72020116407554e-06,
"loss": 0.0817,
"mean_token_accuracy": 0.9726418137550354,
"num_tokens": 13398706.0,
"step": 6150
},
{
"epoch": 16.51543624161074,
"grad_norm": 2.2229270935058594,
"learning_rate": 2.704717743322104e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9643840521574021,
"num_tokens": 13419292.0,
"step": 6160
},
{
"epoch": 16.542281879194633,
"grad_norm": 1.7774611711502075,
"learning_rate": 2.6892621598001157e-06,
"loss": 0.0896,
"mean_token_accuracy": 0.9724551647901535,
"num_tokens": 13439730.0,
"step": 6170
},
{
"epoch": 16.569127516778522,
"grad_norm": 1.6754381656646729,
"learning_rate": 2.673834600955212e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.971163061261177,
"num_tokens": 13463427.0,
"step": 6180
},
{
"epoch": 16.595973154362415,
"grad_norm": 1.3491127490997314,
"learning_rate": 2.6584352538931523e-06,
"loss": 0.0913,
"mean_token_accuracy": 0.9694003283977508,
"num_tokens": 13486109.0,
"step": 6190
},
{
"epoch": 16.622818791946308,
"grad_norm": 1.4250203371047974,
"learning_rate": 2.643064305377542e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9725743800401687,
"num_tokens": 13507988.0,
"step": 6200
},
{
"epoch": 16.6496644295302,
"grad_norm": 5.250053882598877,
"learning_rate": 2.627721941827568e-06,
"loss": 0.1066,
"mean_token_accuracy": 0.9665826559066772,
"num_tokens": 13528436.0,
"step": 6210
},
{
"epoch": 16.676510067114094,
"grad_norm": 2.1783525943756104,
"learning_rate": 2.612408349315734e-06,
"loss": 0.0907,
"mean_token_accuracy": 0.9724477410316468,
"num_tokens": 13548694.0,
"step": 6220
},
{
"epoch": 16.703355704697987,
"grad_norm": 1.5104924440383911,
"learning_rate": 2.597123713565618e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9716423004865646,
"num_tokens": 13572456.0,
"step": 6230
},
{
"epoch": 16.73020134228188,
"grad_norm": 1.8810374736785889,
"learning_rate": 2.581868219949597e-06,
"loss": 0.0873,
"mean_token_accuracy": 0.9687312304973602,
"num_tokens": 13595077.0,
"step": 6240
},
{
"epoch": 16.757046979865773,
"grad_norm": 3.1727447509765625,
"learning_rate": 2.5666420534866256e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9711047947406769,
"num_tokens": 13616808.0,
"step": 6250
},
{
"epoch": 16.783892617449663,
"grad_norm": 2.609102725982666,
"learning_rate": 2.551445398839964e-06,
"loss": 0.1102,
"mean_token_accuracy": 0.9661127954721451,
"num_tokens": 13637241.0,
"step": 6260
},
{
"epoch": 16.810738255033556,
"grad_norm": 1.5663530826568604,
"learning_rate": 2.536278440314962e-06,
"loss": 0.0907,
"mean_token_accuracy": 0.9729632198810577,
"num_tokens": 13657546.0,
"step": 6270
},
{
"epoch": 16.83758389261745,
"grad_norm": 3.344959020614624,
"learning_rate": 2.5211413618568114e-06,
"loss": 0.0807,
"mean_token_accuracy": 0.9719722241163253,
"num_tokens": 13681423.0,
"step": 6280
},
{
"epoch": 16.864429530201342,
"grad_norm": 1.5950130224227905,
"learning_rate": 2.5060343470483173e-06,
"loss": 0.0918,
"mean_token_accuracy": 0.9679073393344879,
"num_tokens": 13704189.0,
"step": 6290
},
{
"epoch": 16.891275167785235,
"grad_norm": 1.7153663635253906,
"learning_rate": 2.490957579107673e-06,
"loss": 0.078,
"mean_token_accuracy": 0.9733668386936187,
"num_tokens": 13726058.0,
"step": 6300
},
{
"epoch": 16.918120805369128,
"grad_norm": 4.971838474273682,
"learning_rate": 2.4759112408862366e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.9651578456163407,
"num_tokens": 13746683.0,
"step": 6310
},
{
"epoch": 16.94496644295302,
"grad_norm": 1.5700833797454834,
"learning_rate": 2.460895514866315e-06,
"loss": 0.0913,
"mean_token_accuracy": 0.9729659497737885,
"num_tokens": 13766985.0,
"step": 6320
},
{
"epoch": 16.971812080536914,
"grad_norm": 4.7659382820129395,
"learning_rate": 2.445910583158948e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9713144838809967,
"num_tokens": 13789886.0,
"step": 6330
},
{
"epoch": 16.998657718120807,
"grad_norm": 1.5581632852554321,
"learning_rate": 2.4309566275017027e-06,
"loss": 0.0994,
"mean_token_accuracy": 0.9688288152217865,
"num_tokens": 13810043.0,
"step": 6340
},
{
"epoch": 17.024161073825503,
"grad_norm": 1.6904538869857788,
"learning_rate": 2.4160338292564685e-06,
"loss": 0.0794,
"mean_token_accuracy": 0.9736049488971108,
"num_tokens": 13832080.0,
"step": 6350
},
{
"epoch": 17.051006711409396,
"grad_norm": 1.6213840246200562,
"learning_rate": 2.401142369407256e-06,
"loss": 0.0797,
"mean_token_accuracy": 0.9715674847364426,
"num_tokens": 13854971.0,
"step": 6360
},
{
"epoch": 17.07785234899329,
"grad_norm": 1.3479030132293701,
"learning_rate": 2.386282428558001e-06,
"loss": 0.0758,
"mean_token_accuracy": 0.9742376655340195,
"num_tokens": 13876881.0,
"step": 6370
},
{
"epoch": 17.104697986577182,
"grad_norm": 2.28167724609375,
"learning_rate": 2.37145418693038e-06,
"loss": 0.1018,
"mean_token_accuracy": 0.9682648777961731,
"num_tokens": 13897674.0,
"step": 6380
},
{
"epoch": 17.131543624161075,
"grad_norm": 1.3986122608184814,
"learning_rate": 2.3566578243616184e-06,
"loss": 0.0943,
"mean_token_accuracy": 0.9730989217758179,
"num_tokens": 13917109.0,
"step": 6390
},
{
"epoch": 17.158389261744965,
"grad_norm": 1.8013077974319458,
"learning_rate": 2.341893520302313e-06,
"loss": 0.0748,
"mean_token_accuracy": 0.9754731118679046,
"num_tokens": 13940804.0,
"step": 6400
},
{
"epoch": 17.185234899328858,
"grad_norm": 1.4591470956802368,
"learning_rate": 2.327161453814254e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9693176746368408,
"num_tokens": 13963800.0,
"step": 6410
},
{
"epoch": 17.21208053691275,
"grad_norm": 1.3876402378082275,
"learning_rate": 2.3124618035682523e-06,
"loss": 0.0772,
"mean_token_accuracy": 0.9730733752250671,
"num_tokens": 13985821.0,
"step": 6420
},
{
"epoch": 17.238926174496644,
"grad_norm": 2.060427665710449,
"learning_rate": 2.297794747841976e-06,
"loss": 0.1029,
"mean_token_accuracy": 0.9671726226806641,
"num_tokens": 14006527.0,
"step": 6430
},
{
"epoch": 17.265771812080537,
"grad_norm": 1.4501941204071045,
"learning_rate": 2.2831604645177867e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9718647956848144,
"num_tokens": 14026022.0,
"step": 6440
},
{
"epoch": 17.29261744966443,
"grad_norm": 4.501188278198242,
"learning_rate": 2.2685591310805743e-06,
"loss": 0.0745,
"mean_token_accuracy": 0.9747655302286148,
"num_tokens": 14049610.0,
"step": 6450
},
{
"epoch": 17.319463087248323,
"grad_norm": 1.406905174255371,
"learning_rate": 2.2539909246156257e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.9729764729738235,
"num_tokens": 14072599.0,
"step": 6460
},
{
"epoch": 17.346308724832216,
"grad_norm": 1.2447459697723389,
"learning_rate": 2.2394560218064464e-06,
"loss": 0.0757,
"mean_token_accuracy": 0.9738614737987519,
"num_tokens": 14094676.0,
"step": 6470
},
{
"epoch": 17.37315436241611,
"grad_norm": 3.209287643432617,
"learning_rate": 2.2249545989326516e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9688991487026215,
"num_tokens": 14115606.0,
"step": 6480
},
{
"epoch": 17.4,
"grad_norm": 1.2575204372406006,
"learning_rate": 2.2104868318677963e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9742684334516525,
"num_tokens": 14134927.0,
"step": 6490
},
{
"epoch": 17.42684563758389,
"grad_norm": 3.82460880279541,
"learning_rate": 2.1960528960772666e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9734666824340821,
"num_tokens": 14158556.0,
"step": 6500
},
{
"epoch": 17.453691275167785,
"grad_norm": 2.3200035095214844,
"learning_rate": 2.1816529666161378e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9702016830444335,
"num_tokens": 14181722.0,
"step": 6510
},
{
"epoch": 17.480536912751678,
"grad_norm": 2.4491753578186035,
"learning_rate": 2.1672872181270575e-06,
"loss": 0.0806,
"mean_token_accuracy": 0.9737491935491562,
"num_tokens": 14203907.0,
"step": 6520
},
{
"epoch": 17.50738255033557,
"grad_norm": 4.129950046539307,
"learning_rate": 2.1529558248381254e-06,
"loss": 0.0952,
"mean_token_accuracy": 0.9714792817831039,
"num_tokens": 14224816.0,
"step": 6530
},
{
"epoch": 17.534228187919464,
"grad_norm": 1.8445805311203003,
"learning_rate": 2.1386589605607826e-06,
"loss": 0.0908,
"mean_token_accuracy": 0.9711948812007904,
"num_tokens": 14244208.0,
"step": 6540
},
{
"epoch": 17.561073825503357,
"grad_norm": 1.3207927942276,
"learning_rate": 2.1243967986876933e-06,
"loss": 0.071,
"mean_token_accuracy": 0.9752148389816284,
"num_tokens": 14267815.0,
"step": 6550
},
{
"epoch": 17.58791946308725,
"grad_norm": 1.794206142425537,
"learning_rate": 2.110169512190664e-06,
"loss": 0.0793,
"mean_token_accuracy": 0.9713789284229278,
"num_tokens": 14290856.0,
"step": 6560
},
{
"epoch": 17.614765100671143,
"grad_norm": 1.4987738132476807,
"learning_rate": 2.0959772736185174e-06,
"loss": 0.0809,
"mean_token_accuracy": 0.9734682083129883,
"num_tokens": 14312993.0,
"step": 6570
},
{
"epoch": 17.641610738255032,
"grad_norm": 2.3780603408813477,
"learning_rate": 2.081820255095028e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9696302711963654,
"num_tokens": 14334097.0,
"step": 6580
},
{
"epoch": 17.668456375838925,
"grad_norm": 1.3051165342330933,
"learning_rate": 2.0676986283168083e-06,
"loss": 0.0973,
"mean_token_accuracy": 0.9718139797449112,
"num_tokens": 14353612.0,
"step": 6590
},
{
"epoch": 17.69530201342282,
"grad_norm": 1.6877503395080566,
"learning_rate": 2.0536125645512473e-06,
"loss": 0.079,
"mean_token_accuracy": 0.9738190263509751,
"num_tokens": 14377191.0,
"step": 6600
},
{
"epoch": 17.72214765100671,
"grad_norm": 1.623205304145813,
"learning_rate": 2.0395622346344213e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9697360932826996,
"num_tokens": 14400189.0,
"step": 6610
},
{
"epoch": 17.748993288590604,
"grad_norm": 1.23772394657135,
"learning_rate": 2.025547808969028e-06,
"loss": 0.0741,
"mean_token_accuracy": 0.974305123090744,
"num_tokens": 14422242.0,
"step": 6620
},
{
"epoch": 17.775838926174497,
"grad_norm": 3.2685928344726562,
"learning_rate": 2.011569457522315e-06,
"loss": 0.0987,
"mean_token_accuracy": 0.968069052696228,
"num_tokens": 14443178.0,
"step": 6630
},
{
"epoch": 17.80268456375839,
"grad_norm": 1.7577261924743652,
"learning_rate": 1.9976273498240234e-06,
"loss": 0.095,
"mean_token_accuracy": 0.9713614732027054,
"num_tokens": 14462670.0,
"step": 6640
},
{
"epoch": 17.829530201342283,
"grad_norm": 1.8098061084747314,
"learning_rate": 1.9837216549643285e-06,
"loss": 0.076,
"mean_token_accuracy": 0.9737594068050385,
"num_tokens": 14486332.0,
"step": 6650
},
{
"epoch": 17.856375838926173,
"grad_norm": 1.7305619716644287,
"learning_rate": 1.969852541591789e-06,
"loss": 0.0883,
"mean_token_accuracy": 0.9701765835285187,
"num_tokens": 14509387.0,
"step": 6660
},
{
"epoch": 17.883221476510066,
"grad_norm": 3.777503728866577,
"learning_rate": 1.9560201779113056e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.972148808836937,
"num_tokens": 14531516.0,
"step": 6670
},
{
"epoch": 17.91006711409396,
"grad_norm": 1.9363383054733276,
"learning_rate": 1.94222473168207e-06,
"loss": 0.0969,
"mean_token_accuracy": 0.9687934070825577,
"num_tokens": 14552725.0,
"step": 6680
},
{
"epoch": 17.936912751677852,
"grad_norm": 1.2105554342269897,
"learning_rate": 1.928466370215552e-06,
"loss": 0.0959,
"mean_token_accuracy": 0.971455842256546,
"num_tokens": 14572133.0,
"step": 6690
},
{
"epoch": 17.963758389261745,
"grad_norm": 1.5113040208816528,
"learning_rate": 1.9147452603734402e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.9724821716547012,
"num_tokens": 14595076.0,
"step": 6700
},
{
"epoch": 17.990604026845638,
"grad_norm": 2.3052010536193848,
"learning_rate": 1.9010615685656514e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9719449728727341,
"num_tokens": 14616603.0,
"step": 6710
},
{
"epoch": 18.016107382550334,
"grad_norm": 1.6915111541748047,
"learning_rate": 1.8874154607482815e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9732024732388949,
"num_tokens": 14637290.0,
"step": 6720
},
{
"epoch": 18.042953020134227,
"grad_norm": 1.7393372058868408,
"learning_rate": 1.8738071024216141e-06,
"loss": 0.0734,
"mean_token_accuracy": 0.9738708108663559,
"num_tokens": 14660686.0,
"step": 6730
},
{
"epoch": 18.06979865771812,
"grad_norm": 1.477663516998291,
"learning_rate": 1.8602366586281063e-06,
"loss": 0.0787,
"mean_token_accuracy": 0.9723863214254379,
"num_tokens": 14683101.0,
"step": 6740
},
{
"epoch": 18.096644295302013,
"grad_norm": 2.2596659660339355,
"learning_rate": 1.8467042939503844e-06,
"loss": 0.0765,
"mean_token_accuracy": 0.9746802181005478,
"num_tokens": 14704571.0,
"step": 6750
},
{
"epoch": 18.123489932885906,
"grad_norm": 1.6076984405517578,
"learning_rate": 1.8332101725092522e-06,
"loss": 0.1067,
"mean_token_accuracy": 0.96855249106884,
"num_tokens": 14724451.0,
"step": 6760
},
{
"epoch": 18.1503355704698,
"grad_norm": 2.0869836807250977,
"learning_rate": 1.8197544579616998e-06,
"loss": 0.0792,
"mean_token_accuracy": 0.9745613276958466,
"num_tokens": 14746707.0,
"step": 6770
},
{
"epoch": 18.177181208053693,
"grad_norm": 2.008155345916748,
"learning_rate": 1.8063373134989104e-06,
"loss": 0.0758,
"mean_token_accuracy": 0.9732336699962616,
"num_tokens": 14770082.0,
"step": 6780
},
{
"epoch": 18.204026845637586,
"grad_norm": 1.347522497177124,
"learning_rate": 1.7929589018443016e-06,
"loss": 0.0788,
"mean_token_accuracy": 0.9738054692745208,
"num_tokens": 14792471.0,
"step": 6790
},
{
"epoch": 18.230872483221475,
"grad_norm": 2.535391092300415,
"learning_rate": 1.7796193852515258e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9719479292631149,
"num_tokens": 14813925.0,
"step": 6800
},
{
"epoch": 18.257718120805368,
"grad_norm": 1.823934555053711,
"learning_rate": 1.766318925502522e-06,
"loss": 0.0927,
"mean_token_accuracy": 0.973075395822525,
"num_tokens": 14833771.0,
"step": 6810
},
{
"epoch": 18.28456375838926,
"grad_norm": 1.6548361778259277,
"learning_rate": 1.7530576839055453e-06,
"loss": 0.072,
"mean_token_accuracy": 0.9755393981933593,
"num_tokens": 14855886.0,
"step": 6820
},
{
"epoch": 18.311409395973154,
"grad_norm": 1.5816959142684937,
"learning_rate": 1.7398358212932132e-06,
"loss": 0.0779,
"mean_token_accuracy": 0.9722195774316787,
"num_tokens": 14879050.0,
"step": 6830
},
{
"epoch": 18.338255033557047,
"grad_norm": 1.3728562593460083,
"learning_rate": 1.7266534980205524e-06,
"loss": 0.0788,
"mean_token_accuracy": 0.9737544000148773,
"num_tokens": 14901117.0,
"step": 6840
},
{
"epoch": 18.36510067114094,
"grad_norm": 2.1892294883728027,
"learning_rate": 1.7135108739630573e-06,
"loss": 0.0858,
"mean_token_accuracy": 0.9720165610313416,
"num_tokens": 14922262.0,
"step": 6850
},
{
"epoch": 18.391946308724833,
"grad_norm": 1.304505705833435,
"learning_rate": 1.7004081085147483e-06,
"loss": 0.096,
"mean_token_accuracy": 0.9711766183376312,
"num_tokens": 14942032.0,
"step": 6860
},
{
"epoch": 18.418791946308726,
"grad_norm": 1.7754546403884888,
"learning_rate": 1.6873453605862405e-06,
"loss": 0.0738,
"mean_token_accuracy": 0.9774259269237519,
"num_tokens": 14964280.0,
"step": 6870
},
{
"epoch": 18.44563758389262,
"grad_norm": 3.5867326259613037,
"learning_rate": 1.6743227886028152e-06,
"loss": 0.0756,
"mean_token_accuracy": 0.9743267685174942,
"num_tokens": 14987755.0,
"step": 6880
},
{
"epoch": 18.47248322147651,
"grad_norm": 1.8171364068984985,
"learning_rate": 1.6613405505024987e-06,
"loss": 0.075,
"mean_token_accuracy": 0.974465224146843,
"num_tokens": 15010023.0,
"step": 6890
},
{
"epoch": 18.4993288590604,
"grad_norm": 2.2199792861938477,
"learning_rate": 1.6483988037341497e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9714995324611664,
"num_tokens": 15031292.0,
"step": 6900
},
{
"epoch": 18.526174496644295,
"grad_norm": 1.3972761631011963,
"learning_rate": 1.6354977052555393e-06,
"loss": 0.0983,
"mean_token_accuracy": 0.9719545841217041,
"num_tokens": 15051069.0,
"step": 6910
},
{
"epoch": 18.553020134228188,
"grad_norm": 1.4796215295791626,
"learning_rate": 1.622637411531468e-06,
"loss": 0.0687,
"mean_token_accuracy": 0.9775502204895019,
"num_tokens": 15073310.0,
"step": 6920
},
{
"epoch": 18.57986577181208,
"grad_norm": 1.5705747604370117,
"learning_rate": 1.6098180785318424e-06,
"loss": 0.074,
"mean_token_accuracy": 0.9737792462110519,
"num_tokens": 15096715.0,
"step": 6930
},
{
"epoch": 18.606711409395974,
"grad_norm": 1.4334312677383423,
"learning_rate": 1.5970398617298078e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9727539092302322,
"num_tokens": 15119210.0,
"step": 6940
},
{
"epoch": 18.633557046979867,
"grad_norm": 1.7822301387786865,
"learning_rate": 1.584302916099842e-06,
"loss": 0.0761,
"mean_token_accuracy": 0.9743141323328018,
"num_tokens": 15140845.0,
"step": 6950
},
{
"epoch": 18.66040268456376,
"grad_norm": 1.785094141960144,
"learning_rate": 1.5716073961158907e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9698344320058823,
"num_tokens": 15160876.0,
"step": 6960
},
{
"epoch": 18.68724832214765,
"grad_norm": 1.5996724367141724,
"learning_rate": 1.5589534557494868e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9763054817914962,
"num_tokens": 15183031.0,
"step": 6970
},
{
"epoch": 18.714093959731542,
"grad_norm": 2.5582118034362793,
"learning_rate": 1.5463412484678858e-06,
"loss": 0.0789,
"mean_token_accuracy": 0.9721554785966873,
"num_tokens": 15206301.0,
"step": 6980
},
{
"epoch": 18.740939597315435,
"grad_norm": 1.7245941162109375,
"learning_rate": 1.5337709272322015e-06,
"loss": 0.0811,
"mean_token_accuracy": 0.9717311263084412,
"num_tokens": 15228690.0,
"step": 6990
},
{
"epoch": 18.76778523489933,
"grad_norm": 2.050199270248413,
"learning_rate": 1.5212426444955569e-06,
"loss": 0.0835,
"mean_token_accuracy": 0.9718193680047988,
"num_tokens": 15250081.0,
"step": 7000
},
{
"epoch": 18.79463087248322,
"grad_norm": 2.7452526092529297,
"learning_rate": 1.5087565522012226e-06,
"loss": 0.0967,
"mean_token_accuracy": 0.9709857195615769,
"num_tokens": 15269934.0,
"step": 7010
},
{
"epoch": 18.821476510067114,
"grad_norm": 1.5559078454971313,
"learning_rate": 1.496312801780795e-06,
"loss": 0.0709,
"mean_token_accuracy": 0.976904445886612,
"num_tokens": 15292176.0,
"step": 7020
},
{
"epoch": 18.848322147651007,
"grad_norm": 3.355046272277832,
"learning_rate": 1.4839115441523355e-06,
"loss": 0.0761,
"mean_token_accuracy": 0.9721087634563446,
"num_tokens": 15315512.0,
"step": 7030
},
{
"epoch": 18.8751677852349,
"grad_norm": 1.4108015298843384,
"learning_rate": 1.47155292971856e-06,
"loss": 0.0818,
"mean_token_accuracy": 0.9717017352581024,
"num_tokens": 15337909.0,
"step": 7040
},
{
"epoch": 18.902013422818793,
"grad_norm": 2.1846816539764404,
"learning_rate": 1.459237108365003e-06,
"loss": 0.0779,
"mean_token_accuracy": 0.9738930046558381,
"num_tokens": 15359470.0,
"step": 7050
},
{
"epoch": 18.928859060402683,
"grad_norm": 1.6539279222488403,
"learning_rate": 1.4469642294582048e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9687478452920913,
"num_tokens": 15379489.0,
"step": 7060
},
{
"epoch": 18.955704697986576,
"grad_norm": 1.7294431924819946,
"learning_rate": 1.434734441843899e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9736314594745636,
"num_tokens": 15401287.0,
"step": 7070
},
{
"epoch": 18.98255033557047,
"grad_norm": 2.668654441833496,
"learning_rate": 1.4225478938452064e-06,
"loss": 0.0805,
"mean_token_accuracy": 0.9734222680330277,
"num_tokens": 15423058.0,
"step": 7080
},
{
"epoch": 19.00805369127517,
"grad_norm": 1.6047618389129639,
"learning_rate": 1.4104047332608379e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9735144627721686,
"num_tokens": 15442432.0,
"step": 7090
},
{
"epoch": 19.034899328859062,
"grad_norm": 1.4699209928512573,
"learning_rate": 1.3983051073632996e-06,
"loss": 0.0695,
"mean_token_accuracy": 0.9758471369743347,
"num_tokens": 15466193.0,
"step": 7100
},
{
"epoch": 19.06174496644295,
"grad_norm": 1.302131175994873,
"learning_rate": 1.3862491628971097e-06,
"loss": 0.0739,
"mean_token_accuracy": 0.9741963982582093,
"num_tokens": 15488932.0,
"step": 7110
},
{
"epoch": 19.088590604026844,
"grad_norm": 1.373273491859436,
"learning_rate": 1.3742370460770144e-06,
"loss": 0.0685,
"mean_token_accuracy": 0.9762805610895157,
"num_tokens": 15510761.0,
"step": 7120
},
{
"epoch": 19.115436241610738,
"grad_norm": 1.9326364994049072,
"learning_rate": 1.3622689025862219e-06,
"loss": 0.0929,
"mean_token_accuracy": 0.9708808869123459,
"num_tokens": 15531273.0,
"step": 7130
},
{
"epoch": 19.14228187919463,
"grad_norm": 2.295323371887207,
"learning_rate": 1.3503448775746226e-06,
"loss": 0.0754,
"mean_token_accuracy": 0.976592805981636,
"num_tokens": 15552169.0,
"step": 7140
},
{
"epoch": 19.169127516778524,
"grad_norm": 2.1047043800354004,
"learning_rate": 1.3384651156570483e-06,
"loss": 0.0684,
"mean_token_accuracy": 0.9758545637130738,
"num_tokens": 15575849.0,
"step": 7150
},
{
"epoch": 19.195973154362417,
"grad_norm": 1.449154019355774,
"learning_rate": 1.3266297609114965e-06,
"loss": 0.0751,
"mean_token_accuracy": 0.9727135717868804,
"num_tokens": 15598502.0,
"step": 7160
},
{
"epoch": 19.22281879194631,
"grad_norm": 1.5876628160476685,
"learning_rate": 1.3148389568774022e-06,
"loss": 0.0707,
"mean_token_accuracy": 0.9776216924190522,
"num_tokens": 15620289.0,
"step": 7170
},
{
"epoch": 19.249664429530203,
"grad_norm": 1.8609250783920288,
"learning_rate": 1.3030928465538822e-06,
"loss": 0.0948,
"mean_token_accuracy": 0.9705761075019836,
"num_tokens": 15640772.0,
"step": 7180
},
{
"epoch": 19.276510067114096,
"grad_norm": 2.3664748668670654,
"learning_rate": 1.291391572398009e-06,
"loss": 0.0746,
"mean_token_accuracy": 0.9772118955850602,
"num_tokens": 15661593.0,
"step": 7190
},
{
"epoch": 19.303355704697985,
"grad_norm": 2.159987449645996,
"learning_rate": 1.279735276323083e-06,
"loss": 0.073,
"mean_token_accuracy": 0.9758386135101318,
"num_tokens": 15685231.0,
"step": 7200
},
{
"epoch": 19.330201342281878,
"grad_norm": 4.445925712585449,
"learning_rate": 1.2681240996969085e-06,
"loss": 0.0794,
"mean_token_accuracy": 0.9732168436050415,
"num_tokens": 15707769.0,
"step": 7210
},
{
"epoch": 19.35704697986577,
"grad_norm": 1.9557926654815674,
"learning_rate": 1.2565581833400753e-06,
"loss": 0.0734,
"mean_token_accuracy": 0.975311490893364,
"num_tokens": 15729487.0,
"step": 7220
},
{
"epoch": 19.383892617449664,
"grad_norm": 1.8642995357513428,
"learning_rate": 1.2450376675242658e-06,
"loss": 0.1031,
"mean_token_accuracy": 0.9692795485258102,
"num_tokens": 15749855.0,
"step": 7230
},
{
"epoch": 19.410738255033557,
"grad_norm": 3.3018038272857666,
"learning_rate": 1.233562691970533e-06,
"loss": 0.0789,
"mean_token_accuracy": 0.9755080968141556,
"num_tokens": 15770677.0,
"step": 7240
},
{
"epoch": 19.43758389261745,
"grad_norm": 3.031343460083008,
"learning_rate": 1.2221333958476261e-06,
"loss": 0.0717,
"mean_token_accuracy": 0.9755002796649933,
"num_tokens": 15794295.0,
"step": 7250
},
{
"epoch": 19.464429530201343,
"grad_norm": 1.6727511882781982,
"learning_rate": 1.2107499177702852e-06,
"loss": 0.0779,
"mean_token_accuracy": 0.9729755282402038,
"num_tokens": 15816964.0,
"step": 7260
},
{
"epoch": 19.491275167785236,
"grad_norm": 2.095463991165161,
"learning_rate": 1.1994123957975722e-06,
"loss": 0.0734,
"mean_token_accuracy": 0.9745409727096558,
"num_tokens": 15838814.0,
"step": 7270
},
{
"epoch": 19.51812080536913,
"grad_norm": 1.4033610820770264,
"learning_rate": 1.1881209674311934e-06,
"loss": 0.093,
"mean_token_accuracy": 0.9723419904708862,
"num_tokens": 15859106.0,
"step": 7280
},
{
"epoch": 19.54496644295302,
"grad_norm": 2.11256742477417,
"learning_rate": 1.1768757696138278e-06,
"loss": 0.0767,
"mean_token_accuracy": 0.9770674705505371,
"num_tokens": 15879928.0,
"step": 7290
},
{
"epoch": 19.571812080536912,
"grad_norm": 2.040146827697754,
"learning_rate": 1.1656769387274714e-06,
"loss": 0.0718,
"mean_token_accuracy": 0.9746215373277665,
"num_tokens": 15903526.0,
"step": 7300
},
{
"epoch": 19.598657718120805,
"grad_norm": 2.3293981552124023,
"learning_rate": 1.1545246105917807e-06,
"loss": 0.0793,
"mean_token_accuracy": 0.9727816581726074,
"num_tokens": 15925985.0,
"step": 7310
},
{
"epoch": 19.625503355704698,
"grad_norm": 2.2348577976226807,
"learning_rate": 1.143418920462425e-06,
"loss": 0.0796,
"mean_token_accuracy": 0.9728397846221923,
"num_tokens": 15947589.0,
"step": 7320
},
{
"epoch": 19.65234899328859,
"grad_norm": 1.796433448791504,
"learning_rate": 1.132360003029449e-06,
"loss": 0.1048,
"mean_token_accuracy": 0.9691227197647094,
"num_tokens": 15967874.0,
"step": 7330
},
{
"epoch": 19.679194630872484,
"grad_norm": 2.2440576553344727,
"learning_rate": 1.1213479924156346e-06,
"loss": 0.0776,
"mean_token_accuracy": 0.9765941977500916,
"num_tokens": 15988720.0,
"step": 7340
},
{
"epoch": 19.706040268456377,
"grad_norm": 2.9480559825897217,
"learning_rate": 1.1103830221748774e-06,
"loss": 0.0759,
"mean_token_accuracy": 0.973601347208023,
"num_tokens": 16012428.0,
"step": 7350
},
{
"epoch": 19.73288590604027,
"grad_norm": 1.705248236656189,
"learning_rate": 1.0994652252905695e-06,
"loss": 0.0825,
"mean_token_accuracy": 0.9710620373487473,
"num_tokens": 16035074.0,
"step": 7360
},
{
"epoch": 19.75973154362416,
"grad_norm": 1.7874031066894531,
"learning_rate": 1.0885947341739768e-06,
"loss": 0.068,
"mean_token_accuracy": 0.9776312798261643,
"num_tokens": 16056867.0,
"step": 7370
},
{
"epoch": 19.786577181208052,
"grad_norm": 1.9907939434051514,
"learning_rate": 1.0777716806626488e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9716785818338394,
"num_tokens": 16077235.0,
"step": 7380
},
{
"epoch": 19.813422818791945,
"grad_norm": 2.154221773147583,
"learning_rate": 1.0669961960188008e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9756328999996186,
"num_tokens": 16098064.0,
"step": 7390
},
{
"epoch": 19.84026845637584,
"grad_norm": 2.000872850418091,
"learning_rate": 1.0562684109277426e-06,
"loss": 0.0728,
"mean_token_accuracy": 0.9748990356922149,
"num_tokens": 16121764.0,
"step": 7400
},
{
"epoch": 19.86711409395973,
"grad_norm": 1.618493676185608,
"learning_rate": 1.0455884554962725e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9751930832862854,
"num_tokens": 16144396.0,
"step": 7410
},
{
"epoch": 19.893959731543625,
"grad_norm": 2.0999059677124023,
"learning_rate": 1.0349564592511162e-06,
"loss": 0.0724,
"mean_token_accuracy": 0.9754237473011017,
"num_tokens": 16166124.0,
"step": 7420
},
{
"epoch": 19.920805369127518,
"grad_norm": 1.7069863080978394,
"learning_rate": 1.024372551137348e-06,
"loss": 0.0942,
"mean_token_accuracy": 0.9713965833187104,
"num_tokens": 16186295.0,
"step": 7430
},
{
"epoch": 19.94765100671141,
"grad_norm": 1.8244096040725708,
"learning_rate": 1.0138368595168291e-06,
"loss": 0.0781,
"mean_token_accuracy": 0.9758277833461761,
"num_tokens": 16206865.0,
"step": 7440
},
{
"epoch": 19.974496644295304,
"grad_norm": 2.282172441482544,
"learning_rate": 1.0033495121666442e-06,
"loss": 0.0764,
"mean_token_accuracy": 0.9733882695436478,
"num_tokens": 16229364.0,
"step": 7450
},
{
"epoch": 20.0,
"grad_norm": 2.2477612495422363,
"learning_rate": 9.929106362775681e-07,
"loss": 0.0866,
"mean_token_accuracy": 0.9732856374037894,
"num_tokens": 16247380.0,
"step": 7460
},
{
"epoch": 20.026845637583893,
"grad_norm": 1.7040324211120605,
"learning_rate": 9.825203584525022e-07,
"loss": 0.0688,
"mean_token_accuracy": 0.9761479705572128,
"num_tokens": 16271513.0,
"step": 7470
},
{
"epoch": 20.053691275167786,
"grad_norm": 1.6158870458602905,
"learning_rate": 9.721788047049586e-07,
"loss": 0.0757,
"mean_token_accuracy": 0.9738033145666123,
"num_tokens": 16294453.0,
"step": 7480
},
{
"epoch": 20.08053691275168,
"grad_norm": 1.5813090801239014,
"learning_rate": 9.618861004575154e-07,
"loss": 0.0676,
"mean_token_accuracy": 0.9767954677343369,
"num_tokens": 16316505.0,
"step": 7490
},
{
"epoch": 20.107382550335572,
"grad_norm": 2.368957042694092,
"learning_rate": 9.516423705403066e-07,
"loss": 0.0876,
"mean_token_accuracy": 0.9725225150585175,
"num_tokens": 16337346.0,
"step": 7500
},
{
"epoch": 20.13422818791946,
"grad_norm": 0.9680088758468628,
"learning_rate": 9.414477391895044e-07,
"loss": 0.0766,
"mean_token_accuracy": 0.9777394503355026,
"num_tokens": 16356618.0,
"step": 7510
},
{
"epoch": 20.161073825503355,
"grad_norm": 1.9878125190734863,
"learning_rate": 9.313023300458118e-07,
"loss": 0.0697,
"mean_token_accuracy": 0.9754358917474747,
"num_tokens": 16380670.0,
"step": 7520
},
{
"epoch": 20.187919463087248,
"grad_norm": 1.7701399326324463,
"learning_rate": 9.212062661529641e-07,
"loss": 0.0697,
"mean_token_accuracy": 0.975850623846054,
"num_tokens": 16403537.0,
"step": 7530
},
{
"epoch": 20.21476510067114,
"grad_norm": 1.4228025674819946,
"learning_rate": 9.11159669956237e-07,
"loss": 0.0704,
"mean_token_accuracy": 0.9756285309791565,
"num_tokens": 16425451.0,
"step": 7540
},
{
"epoch": 20.241610738255034,
"grad_norm": 2.2829296588897705,
"learning_rate": 9.011626633009596e-07,
"loss": 0.0976,
"mean_token_accuracy": 0.9698988974094391,
"num_tokens": 16446094.0,
"step": 7550
},
{
"epoch": 20.268456375838927,
"grad_norm": 1.0587600469589233,
"learning_rate": 8.912153674310386e-07,
"loss": 0.0794,
"mean_token_accuracy": 0.9759812623262405,
"num_tokens": 16465369.0,
"step": 7560
},
{
"epoch": 20.29530201342282,
"grad_norm": 1.7993144989013672,
"learning_rate": 8.813179029874874e-07,
"loss": 0.0684,
"mean_token_accuracy": 0.9770451784133911,
"num_tokens": 16489501.0,
"step": 7570
},
{
"epoch": 20.322147651006713,
"grad_norm": 1.5915813446044922,
"learning_rate": 8.714703900069638e-07,
"loss": 0.0816,
"mean_token_accuracy": 0.971697872877121,
"num_tokens": 16512298.0,
"step": 7580
},
{
"epoch": 20.348993288590606,
"grad_norm": 1.3499330282211304,
"learning_rate": 8.616729479203123e-07,
"loss": 0.0674,
"mean_token_accuracy": 0.9774249851703644,
"num_tokens": 16534247.0,
"step": 7590
},
{
"epoch": 20.375838926174495,
"grad_norm": 3.021082878112793,
"learning_rate": 8.51925695551113e-07,
"loss": 0.0877,
"mean_token_accuracy": 0.9728184163570404,
"num_tokens": 16555124.0,
"step": 7600
},
{
"epoch": 20.40268456375839,
"grad_norm": 1.2314085960388184,
"learning_rate": 8.422287511142524e-07,
"loss": 0.0852,
"mean_token_accuracy": 0.9761320263147354,
"num_tokens": 16574493.0,
"step": 7610
},
{
"epoch": 20.42953020134228,
"grad_norm": 2.3866217136383057,
"learning_rate": 8.325822322144728e-07,
"loss": 0.0707,
"mean_token_accuracy": 0.9761977344751358,
"num_tokens": 16598519.0,
"step": 7620
},
{
"epoch": 20.456375838926174,
"grad_norm": 1.8169933557510376,
"learning_rate": 8.229862558449592e-07,
"loss": 0.076,
"mean_token_accuracy": 0.9738662779331207,
"num_tokens": 16621364.0,
"step": 7630
},
{
"epoch": 20.483221476510067,
"grad_norm": 1.4940258264541626,
"learning_rate": 8.134409383859149e-07,
"loss": 0.0715,
"mean_token_accuracy": 0.9758548170328141,
"num_tokens": 16643312.0,
"step": 7640
},
{
"epoch": 20.51006711409396,
"grad_norm": 2.932368516921997,
"learning_rate": 8.039463956031501e-07,
"loss": 0.0867,
"mean_token_accuracy": 0.9723454862833023,
"num_tokens": 16664147.0,
"step": 7650
},
{
"epoch": 20.536912751677853,
"grad_norm": 1.3653275966644287,
"learning_rate": 7.945027426466801e-07,
"loss": 0.0782,
"mean_token_accuracy": 0.9777654260396957,
"num_tokens": 16683356.0,
"step": 7660
},
{
"epoch": 20.563758389261746,
"grad_norm": 2.0917046070098877,
"learning_rate": 7.851100940493273e-07,
"loss": 0.066,
"mean_token_accuracy": 0.9783705800771714,
"num_tokens": 16707545.0,
"step": 7670
},
{
"epoch": 20.59060402684564,
"grad_norm": 2.3530590534210205,
"learning_rate": 7.757685637253271e-07,
"loss": 0.0744,
"mean_token_accuracy": 0.973720371723175,
"num_tokens": 16730426.0,
"step": 7680
},
{
"epoch": 20.61744966442953,
"grad_norm": 1.8383070230484009,
"learning_rate": 7.664782649689611e-07,
"loss": 0.071,
"mean_token_accuracy": 0.9763635188341141,
"num_tokens": 16752377.0,
"step": 7690
},
{
"epoch": 20.644295302013422,
"grad_norm": 2.579986095428467,
"learning_rate": 7.572393104531622e-07,
"loss": 0.0929,
"mean_token_accuracy": 0.9709441542625428,
"num_tokens": 16773192.0,
"step": 7700
},
{
"epoch": 20.671140939597315,
"grad_norm": 1.4985390901565552,
"learning_rate": 7.480518122281711e-07,
"loss": 0.0785,
"mean_token_accuracy": 0.9765917301177979,
"num_tokens": 16792508.0,
"step": 7710
},
{
"epoch": 20.697986577181208,
"grad_norm": 1.8128042221069336,
"learning_rate": 7.389158817201541e-07,
"loss": 0.0685,
"mean_token_accuracy": 0.9761598974466323,
"num_tokens": 16816671.0,
"step": 7720
},
{
"epoch": 20.7248322147651,
"grad_norm": 1.72760009765625,
"learning_rate": 7.298316297298713e-07,
"loss": 0.0728,
"mean_token_accuracy": 0.9747570604085922,
"num_tokens": 16839673.0,
"step": 7730
},
{
"epoch": 20.751677852348994,
"grad_norm": 1.655516505241394,
"learning_rate": 7.207991664313202e-07,
"loss": 0.0712,
"mean_token_accuracy": 0.9755067259073258,
"num_tokens": 16861795.0,
"step": 7740
},
{
"epoch": 20.778523489932887,
"grad_norm": 2.935474395751953,
"learning_rate": 7.118186013704065e-07,
"loss": 0.0822,
"mean_token_accuracy": 0.973547288775444,
"num_tokens": 16882823.0,
"step": 7750
},
{
"epoch": 20.80536912751678,
"grad_norm": 1.1629414558410645,
"learning_rate": 7.028900434636116e-07,
"loss": 0.0847,
"mean_token_accuracy": 0.9749088943004608,
"num_tokens": 16902242.0,
"step": 7760
},
{
"epoch": 20.83221476510067,
"grad_norm": 2.0850579738616943,
"learning_rate": 6.940136009966735e-07,
"loss": 0.0659,
"mean_token_accuracy": 0.9775604158639908,
"num_tokens": 16926364.0,
"step": 7770
},
{
"epoch": 20.859060402684563,
"grad_norm": 1.6004384756088257,
"learning_rate": 6.851893816232729e-07,
"loss": 0.0757,
"mean_token_accuracy": 0.974211847782135,
"num_tokens": 16949390.0,
"step": 7780
},
{
"epoch": 20.885906040268456,
"grad_norm": 1.4713903665542603,
"learning_rate": 6.764174923637279e-07,
"loss": 0.0704,
"mean_token_accuracy": 0.9770765691995621,
"num_tokens": 16971409.0,
"step": 7790
},
{
"epoch": 20.91275167785235,
"grad_norm": 2.419337272644043,
"learning_rate": 6.676980396036953e-07,
"loss": 0.0895,
"mean_token_accuracy": 0.9723514080047607,
"num_tokens": 16992049.0,
"step": 7800
},
{
"epoch": 20.93959731543624,
"grad_norm": 1.4086570739746094,
"learning_rate": 6.590311290928825e-07,
"loss": 0.0836,
"mean_token_accuracy": 0.9762147784233093,
"num_tokens": 17011358.0,
"step": 7810
},
{
"epoch": 20.966442953020135,
"grad_norm": 1.5443168878555298,
"learning_rate": 6.504168659437627e-07,
"loss": 0.0738,
"mean_token_accuracy": 0.9730790197849274,
"num_tokens": 17034767.0,
"step": 7820
},
{
"epoch": 20.993288590604028,
"grad_norm": 1.7910735607147217,
"learning_rate": 6.418553546302964e-07,
"loss": 0.0856,
"mean_token_accuracy": 0.9733808606863021,
"num_tokens": 17055742.0,
"step": 7830
},
{
"epoch": 21.018791946308724,
"grad_norm": 1.4867401123046875,
"learning_rate": 6.333466989866787e-07,
"loss": 0.0695,
"mean_token_accuracy": 0.9786910979371322,
"num_tokens": 17076765.0,
"step": 7840
},
{
"epoch": 21.045637583892617,
"grad_norm": 2.1171696186065674,
"learning_rate": 6.248910022060595e-07,
"loss": 0.0726,
"mean_token_accuracy": 0.9751413524150848,
"num_tokens": 17099992.0,
"step": 7850
},
{
"epoch": 21.07248322147651,
"grad_norm": 1.8590487241744995,
"learning_rate": 6.16488366839309e-07,
"loss": 0.0683,
"mean_token_accuracy": 0.9762388318777084,
"num_tokens": 17122206.0,
"step": 7860
},
{
"epoch": 21.099328859060403,
"grad_norm": 2.488386631011963,
"learning_rate": 6.08138894793765e-07,
"loss": 0.0813,
"mean_token_accuracy": 0.9725316137075424,
"num_tokens": 17143334.0,
"step": 7870
},
{
"epoch": 21.126174496644296,
"grad_norm": 1.9426281452178955,
"learning_rate": 5.998426873320001e-07,
"loss": 0.0815,
"mean_token_accuracy": 0.9752875089645385,
"num_tokens": 17162878.0,
"step": 7880
},
{
"epoch": 21.15302013422819,
"grad_norm": 1.9060635566711426,
"learning_rate": 5.915998450705923e-07,
"loss": 0.0685,
"mean_token_accuracy": 0.9771264225244523,
"num_tokens": 17185525.0,
"step": 7890
},
{
"epoch": 21.179865771812082,
"grad_norm": 2.2453320026397705,
"learning_rate": 5.834104679789077e-07,
"loss": 0.0737,
"mean_token_accuracy": 0.9742690682411194,
"num_tokens": 17208605.0,
"step": 7900
},
{
"epoch": 21.20671140939597,
"grad_norm": 1.5044374465942383,
"learning_rate": 5.752746553778798e-07,
"loss": 0.0682,
"mean_token_accuracy": 0.9773426532745362,
"num_tokens": 17230705.0,
"step": 7910
},
{
"epoch": 21.233557046979865,
"grad_norm": 2.717085123062134,
"learning_rate": 5.671925059388195e-07,
"loss": 0.0772,
"mean_token_accuracy": 0.9750842124223709,
"num_tokens": 17251855.0,
"step": 7920
},
{
"epoch": 21.260402684563758,
"grad_norm": 1.5443522930145264,
"learning_rate": 5.591641176822005e-07,
"loss": 0.0874,
"mean_token_accuracy": 0.9740733593702317,
"num_tokens": 17271579.0,
"step": 7930
},
{
"epoch": 21.28724832214765,
"grad_norm": 1.8659101724624634,
"learning_rate": 5.511895879764883e-07,
"loss": 0.07,
"mean_token_accuracy": 0.9775551408529282,
"num_tokens": 17294183.0,
"step": 7940
},
{
"epoch": 21.314093959731544,
"grad_norm": 1.8989914655685425,
"learning_rate": 5.432690135369445e-07,
"loss": 0.0796,
"mean_token_accuracy": 0.9734296709299087,
"num_tokens": 17317241.0,
"step": 7950
},
{
"epoch": 21.340939597315437,
"grad_norm": 1.711384892463684,
"learning_rate": 5.354024904244632e-07,
"loss": 0.072,
"mean_token_accuracy": 0.9772316753864289,
"num_tokens": 17339366.0,
"step": 7960
},
{
"epoch": 21.36778523489933,
"grad_norm": 2.298988103866577,
"learning_rate": 5.275901140444012e-07,
"loss": 0.0757,
"mean_token_accuracy": 0.9755023777484894,
"num_tokens": 17360584.0,
"step": 7970
},
{
"epoch": 21.394630872483223,
"grad_norm": 1.5071649551391602,
"learning_rate": 5.198319791454237e-07,
"loss": 0.0942,
"mean_token_accuracy": 0.9728712201118469,
"num_tokens": 17380386.0,
"step": 7980
},
{
"epoch": 21.421476510067116,
"grad_norm": 1.6493000984191895,
"learning_rate": 5.121281798183547e-07,
"loss": 0.0671,
"mean_token_accuracy": 0.9769768297672272,
"num_tokens": 17403090.0,
"step": 7990
},
{
"epoch": 21.448322147651005,
"grad_norm": 1.9484785795211792,
"learning_rate": 5.044788094950332e-07,
"loss": 0.0721,
"mean_token_accuracy": 0.9745801717042923,
"num_tokens": 17426401.0,
"step": 8000
},
{
"epoch": 21.4751677852349,
"grad_norm": 4.09341287612915,
"learning_rate": 4.968839609471837e-07,
"loss": 0.0701,
"mean_token_accuracy": 0.9758915185928345,
"num_tokens": 17448699.0,
"step": 8010
},
{
"epoch": 21.50201342281879,
"grad_norm": 2.4515931606292725,
"learning_rate": 4.893437262852885e-07,
"loss": 0.0729,
"mean_token_accuracy": 0.9776129275560379,
"num_tokens": 17470206.0,
"step": 8020
},
{
"epoch": 21.528859060402684,
"grad_norm": 1.707498550415039,
"learning_rate": 4.818581969574743e-07,
"loss": 0.0947,
"mean_token_accuracy": 0.9731051385402679,
"num_tokens": 17490055.0,
"step": 8030
},
{
"epoch": 21.555704697986577,
"grad_norm": 1.5686323642730713,
"learning_rate": 4.7442746374839363e-07,
"loss": 0.0622,
"mean_token_accuracy": 0.979521569609642,
"num_tokens": 17512695.0,
"step": 8040
},
{
"epoch": 21.58255033557047,
"grad_norm": 2.5754616260528564,
"learning_rate": 4.6705161677814024e-07,
"loss": 0.0717,
"mean_token_accuracy": 0.9750470906496048,
"num_tokens": 17535874.0,
"step": 8050
},
{
"epoch": 21.609395973154363,
"grad_norm": 1.9909005165100098,
"learning_rate": 4.597307455011363e-07,
"loss": 0.0722,
"mean_token_accuracy": 0.9758717834949493,
"num_tokens": 17558072.0,
"step": 8060
},
{
"epoch": 21.636241610738256,
"grad_norm": 2.4641635417938232,
"learning_rate": 4.524649387050667e-07,
"loss": 0.0812,
"mean_token_accuracy": 0.9734772562980651,
"num_tokens": 17579394.0,
"step": 8070
},
{
"epoch": 21.663087248322146,
"grad_norm": 1.7697465419769287,
"learning_rate": 4.4525428450978627e-07,
"loss": 0.0872,
"mean_token_accuracy": 0.973640301823616,
"num_tokens": 17599105.0,
"step": 8080
},
{
"epoch": 21.68993288590604,
"grad_norm": 4.2104268074035645,
"learning_rate": 4.380988703662614e-07,
"loss": 0.0662,
"mean_token_accuracy": 0.9779145032167434,
"num_tokens": 17621804.0,
"step": 8090
},
{
"epoch": 21.716778523489932,
"grad_norm": 2.5988218784332275,
"learning_rate": 4.309987830555057e-07,
"loss": 0.0647,
"mean_token_accuracy": 0.9763138085603714,
"num_tokens": 17645036.0,
"step": 8100
},
{
"epoch": 21.743624161073825,
"grad_norm": 1.6398255825042725,
"learning_rate": 4.239541086875265e-07,
"loss": 0.0715,
"mean_token_accuracy": 0.9762193262577057,
"num_tokens": 17667306.0,
"step": 8110
},
{
"epoch": 21.770469798657718,
"grad_norm": 2.159721612930298,
"learning_rate": 4.1696493270028284e-07,
"loss": 0.078,
"mean_token_accuracy": 0.9745160311460495,
"num_tokens": 17688460.0,
"step": 8120
},
{
"epoch": 21.79731543624161,
"grad_norm": 1.4078185558319092,
"learning_rate": 4.1003133985864864e-07,
"loss": 0.0805,
"mean_token_accuracy": 0.976104524731636,
"num_tokens": 17708085.0,
"step": 8130
},
{
"epoch": 21.824161073825504,
"grad_norm": 1.8352998495101929,
"learning_rate": 4.031534142533816e-07,
"loss": 0.0706,
"mean_token_accuracy": 0.9771550267934799,
"num_tokens": 17730815.0,
"step": 8140
},
{
"epoch": 21.851006711409397,
"grad_norm": 2.1755294799804688,
"learning_rate": 3.9633123930011065e-07,
"loss": 0.0707,
"mean_token_accuracy": 0.9745010644197464,
"num_tokens": 17754175.0,
"step": 8150
},
{
"epoch": 21.87785234899329,
"grad_norm": 1.4615411758422852,
"learning_rate": 3.895648977383143e-07,
"loss": 0.0751,
"mean_token_accuracy": 0.9746832549571991,
"num_tokens": 17776562.0,
"step": 8160
},
{
"epoch": 21.90469798657718,
"grad_norm": 3.1907434463500977,
"learning_rate": 3.828544716303284e-07,
"loss": 0.0698,
"mean_token_accuracy": 0.9775834113359452,
"num_tokens": 17797911.0,
"step": 8170
},
{
"epoch": 21.931543624161073,
"grad_norm": 1.4641789197921753,
"learning_rate": 3.76200042360339e-07,
"loss": 0.0877,
"mean_token_accuracy": 0.9736798137426377,
"num_tokens": 17817668.0,
"step": 8180
},
{
"epoch": 21.958389261744966,
"grad_norm": 1.6593701839447021,
"learning_rate": 3.6960169063340543e-07,
"loss": 0.0706,
"mean_token_accuracy": 0.9777141779661178,
"num_tokens": 17840084.0,
"step": 8190
},
{
"epoch": 21.98523489932886,
"grad_norm": 2.6766629219055176,
"learning_rate": 3.6305949647447545e-07,
"loss": 0.0797,
"mean_token_accuracy": 0.9745386809110641,
"num_tokens": 17862053.0,
"step": 8200
},
{
"epoch": 22.01073825503356,
"grad_norm": 2.444695234298706,
"learning_rate": 3.5657353922741834e-07,
"loss": 0.0739,
"mean_token_accuracy": 0.9770089576118871,
"num_tokens": 17881976.0,
"step": 8210
},
{
"epoch": 22.037583892617448,
"grad_norm": 1.9715152978897095,
"learning_rate": 3.501438975540583e-07,
"loss": 0.0623,
"mean_token_accuracy": 0.9785895735025406,
"num_tokens": 17905589.0,
"step": 8220
},
{
"epoch": 22.06442953020134,
"grad_norm": 1.3546748161315918,
"learning_rate": 3.437706494332266e-07,
"loss": 0.0713,
"mean_token_accuracy": 0.9752832442522049,
"num_tokens": 17928115.0,
"step": 8230
},
{
"epoch": 22.091275167785234,
"grad_norm": 2.851863384246826,
"learning_rate": 3.374538721598086e-07,
"loss": 0.0662,
"mean_token_accuracy": 0.9777217179536819,
"num_tokens": 17949802.0,
"step": 8240
},
{
"epoch": 22.118120805369127,
"grad_norm": 1.7194266319274902,
"learning_rate": 3.311936423438128e-07,
"loss": 0.0856,
"mean_token_accuracy": 0.9752306431531906,
"num_tokens": 17969952.0,
"step": 8250
},
{
"epoch": 22.14496644295302,
"grad_norm": 2.0014491081237793,
"learning_rate": 3.249900359094388e-07,
"loss": 0.0763,
"mean_token_accuracy": 0.9760718226432801,
"num_tokens": 17991224.0,
"step": 8260
},
{
"epoch": 22.171812080536913,
"grad_norm": 2.113339900970459,
"learning_rate": 3.188431280941529e-07,
"loss": 0.0676,
"mean_token_accuracy": 0.9763350129127503,
"num_tokens": 18014820.0,
"step": 8270
},
{
"epoch": 22.198657718120806,
"grad_norm": 1.4311920404434204,
"learning_rate": 3.1275299344778576e-07,
"loss": 0.0727,
"mean_token_accuracy": 0.9759140908718109,
"num_tokens": 18037322.0,
"step": 8280
},
{
"epoch": 22.2255033557047,
"grad_norm": 2.7011280059814453,
"learning_rate": 3.067197058316157e-07,
"loss": 0.0708,
"mean_token_accuracy": 0.9770938664674759,
"num_tokens": 18058945.0,
"step": 8290
},
{
"epoch": 22.252348993288592,
"grad_norm": 1.7201358079910278,
"learning_rate": 3.007433384174835e-07,
"loss": 0.0952,
"mean_token_accuracy": 0.9714390218257904,
"num_tokens": 18079153.0,
"step": 8300
},
{
"epoch": 22.27919463087248,
"grad_norm": 1.7672648429870605,
"learning_rate": 2.948239636868977e-07,
"loss": 0.0691,
"mean_token_accuracy": 0.9796588033437729,
"num_tokens": 18100388.0,
"step": 8310
},
{
"epoch": 22.306040268456375,
"grad_norm": 2.4666810035705566,
"learning_rate": 2.889616534301598e-07,
"loss": 0.0715,
"mean_token_accuracy": 0.9756601005792618,
"num_tokens": 18123969.0,
"step": 8320
},
{
"epoch": 22.332885906040268,
"grad_norm": 1.4548521041870117,
"learning_rate": 2.831564787454916e-07,
"loss": 0.0739,
"mean_token_accuracy": 0.9751757919788361,
"num_tokens": 18146448.0,
"step": 8330
},
{
"epoch": 22.35973154362416,
"grad_norm": 2.3725903034210205,
"learning_rate": 2.774085100381735e-07,
"loss": 0.0694,
"mean_token_accuracy": 0.9762961208820343,
"num_tokens": 18168098.0,
"step": 8340
},
{
"epoch": 22.386577181208054,
"grad_norm": 1.802985668182373,
"learning_rate": 2.717178170196916e-07,
"loss": 0.0924,
"mean_token_accuracy": 0.9725849449634552,
"num_tokens": 18188314.0,
"step": 8350
},
{
"epoch": 22.413422818791947,
"grad_norm": 2.313931465148926,
"learning_rate": 2.660844687068903e-07,
"loss": 0.0682,
"mean_token_accuracy": 0.979394719004631,
"num_tokens": 18209585.0,
"step": 8360
},
{
"epoch": 22.44026845637584,
"grad_norm": 2.165256977081299,
"learning_rate": 2.6050853342113437e-07,
"loss": 0.0687,
"mean_token_accuracy": 0.9766799122095108,
"num_tokens": 18233015.0,
"step": 8370
},
{
"epoch": 22.467114093959733,
"grad_norm": 1.488602876663208,
"learning_rate": 2.549900787874876e-07,
"loss": 0.0748,
"mean_token_accuracy": 0.9740068465471268,
"num_tokens": 18255469.0,
"step": 8380
},
{
"epoch": 22.493959731543626,
"grad_norm": 2.5867998600006104,
"learning_rate": 2.4952917173387993e-07,
"loss": 0.0716,
"mean_token_accuracy": 0.9767491906881333,
"num_tokens": 18276911.0,
"step": 8390
},
{
"epoch": 22.520805369127515,
"grad_norm": 1.6677324771881104,
"learning_rate": 2.4412587849031e-07,
"loss": 0.0911,
"mean_token_accuracy": 0.9713105499744416,
"num_tokens": 18296904.0,
"step": 8400
},
{
"epoch": 22.54765100671141,
"grad_norm": 3.195547342300415,
"learning_rate": 2.3878026458803047e-07,
"loss": 0.0669,
"mean_token_accuracy": 0.9800841093063355,
"num_tokens": 18318172.0,
"step": 8410
},
{
"epoch": 22.5744966442953,
"grad_norm": 1.8589850664138794,
"learning_rate": 2.3349239485875918e-07,
"loss": 0.0679,
"mean_token_accuracy": 0.975395730137825,
"num_tokens": 18341711.0,
"step": 8420
},
{
"epoch": 22.601342281879194,
"grad_norm": 1.5052769184112549,
"learning_rate": 2.282623334338907e-07,
"loss": 0.0706,
"mean_token_accuracy": 0.974920055270195,
"num_tokens": 18364282.0,
"step": 8430
},
{
"epoch": 22.628187919463087,
"grad_norm": 2.210737943649292,
"learning_rate": 2.2309014374372106e-07,
"loss": 0.0699,
"mean_token_accuracy": 0.9770695507526398,
"num_tokens": 18385999.0,
"step": 8440
},
{
"epoch": 22.65503355704698,
"grad_norm": 1.5828299522399902,
"learning_rate": 2.1797588851667494e-07,
"loss": 0.0906,
"mean_token_accuracy": 0.9725321441888809,
"num_tokens": 18406305.0,
"step": 8450
},
{
"epoch": 22.681879194630874,
"grad_norm": 1.828018307685852,
"learning_rate": 2.129196297785474e-07,
"loss": 0.0656,
"mean_token_accuracy": 0.979384246468544,
"num_tokens": 18427591.0,
"step": 8460
},
{
"epoch": 22.708724832214767,
"grad_norm": 1.981468915939331,
"learning_rate": 2.079214288517506e-07,
"loss": 0.0674,
"mean_token_accuracy": 0.9769616097211837,
"num_tokens": 18451157.0,
"step": 8470
},
{
"epoch": 22.735570469798656,
"grad_norm": 2.4667210578918457,
"learning_rate": 2.029813463545699e-07,
"loss": 0.0742,
"mean_token_accuracy": 0.9744071394205094,
"num_tokens": 18473755.0,
"step": 8480
},
{
"epoch": 22.76241610738255,
"grad_norm": 2.268716812133789,
"learning_rate": 1.980994422004312e-07,
"loss": 0.0686,
"mean_token_accuracy": 0.9764893770217895,
"num_tokens": 18495455.0,
"step": 8490
},
{
"epoch": 22.789261744966442,
"grad_norm": 1.5990172624588013,
"learning_rate": 1.9327577559716815e-07,
"loss": 0.0869,
"mean_token_accuracy": 0.9736561328172684,
"num_tokens": 18515761.0,
"step": 8500
},
{
"epoch": 22.816107382550335,
"grad_norm": 1.634722352027893,
"learning_rate": 1.8851040504631325e-07,
"loss": 0.0707,
"mean_token_accuracy": 0.9781961172819138,
"num_tokens": 18537018.0,
"step": 8510
},
{
"epoch": 22.842953020134228,
"grad_norm": 2.427337408065796,
"learning_rate": 1.8380338834237842e-07,
"loss": 0.0695,
"mean_token_accuracy": 0.9761110007762909,
"num_tokens": 18560550.0,
"step": 8520
},
{
"epoch": 22.86979865771812,
"grad_norm": 1.6152406930923462,
"learning_rate": 1.79154782572164e-07,
"loss": 0.0746,
"mean_token_accuracy": 0.9747624099254608,
"num_tokens": 18583052.0,
"step": 8530
},
{
"epoch": 22.896644295302014,
"grad_norm": 1.99691641330719,
"learning_rate": 1.7456464411405527e-07,
"loss": 0.0706,
"mean_token_accuracy": 0.9768070250749588,
"num_tokens": 18604669.0,
"step": 8540
},
{
"epoch": 22.923489932885907,
"grad_norm": 2.081984043121338,
"learning_rate": 1.7003302863735028e-07,
"loss": 0.0929,
"mean_token_accuracy": 0.9729080408811569,
"num_tokens": 18624806.0,
"step": 8550
},
{
"epoch": 22.9503355704698,
"grad_norm": 2.1356115341186523,
"learning_rate": 1.655599911015754e-07,
"loss": 0.068,
"mean_token_accuracy": 0.9790314078330994,
"num_tokens": 18645962.0,
"step": 8560
},
{
"epoch": 22.97718120805369,
"grad_norm": 1.4790443181991577,
"learning_rate": 1.6114558575582418e-07,
"loss": 0.0679,
"mean_token_accuracy": 0.9775549441576004,
"num_tokens": 18668370.0,
"step": 8570
},
{
"epoch": 23.00268456375839,
"grad_norm": 1.7534834146499634,
"learning_rate": 1.5678986613809788e-07,
"loss": 0.0758,
"mean_token_accuracy": 0.9766696377804405,
"num_tokens": 18686993.0,
"step": 8580
},
{
"epoch": 23.029530201342283,
"grad_norm": 1.7369558811187744,
"learning_rate": 1.52492885074656e-07,
"loss": 0.0661,
"mean_token_accuracy": 0.9780718445777893,
"num_tokens": 18710832.0,
"step": 8590
},
{
"epoch": 23.056375838926176,
"grad_norm": 1.6902610063552856,
"learning_rate": 1.4825469467937336e-07,
"loss": 0.08,
"mean_token_accuracy": 0.9725459694862366,
"num_tokens": 18733644.0,
"step": 8600
},
{
"epoch": 23.08322147651007,
"grad_norm": 1.5242305994033813,
"learning_rate": 1.4407534635311415e-07,
"loss": 0.0672,
"mean_token_accuracy": 0.9776864409446716,
"num_tokens": 18755633.0,
"step": 8610
},
{
"epoch": 23.110067114093958,
"grad_norm": 2.102566719055176,
"learning_rate": 1.3995489078310055e-07,
"loss": 0.0855,
"mean_token_accuracy": 0.9734326243400574,
"num_tokens": 18776371.0,
"step": 8620
},
{
"epoch": 23.13691275167785,
"grad_norm": 1.5999020338058472,
"learning_rate": 1.358933779423066e-07,
"loss": 0.0732,
"mean_token_accuracy": 0.9789162427186966,
"num_tokens": 18796212.0,
"step": 8630
},
{
"epoch": 23.163758389261744,
"grad_norm": 1.7957699298858643,
"learning_rate": 1.3189085708884387e-07,
"loss": 0.0654,
"mean_token_accuracy": 0.976372754573822,
"num_tokens": 18819968.0,
"step": 8640
},
{
"epoch": 23.190604026845637,
"grad_norm": 1.6005198955535889,
"learning_rate": 1.2794737676536993e-07,
"loss": 0.0713,
"mean_token_accuracy": 0.9749877661466598,
"num_tokens": 18842576.0,
"step": 8650
},
{
"epoch": 23.21744966442953,
"grad_norm": 1.7131704092025757,
"learning_rate": 1.24062984798497e-07,
"loss": 0.0654,
"mean_token_accuracy": 0.978054803609848,
"num_tokens": 18864395.0,
"step": 8660
},
{
"epoch": 23.244295302013423,
"grad_norm": 2.195054769515991,
"learning_rate": 1.2023772829821202e-07,
"loss": 0.0976,
"mean_token_accuracy": 0.9702393293380738,
"num_tokens": 18884905.0,
"step": 8670
},
{
"epoch": 23.271140939597316,
"grad_norm": 1.513555645942688,
"learning_rate": 1.164716536573074e-07,
"loss": 0.0731,
"mean_token_accuracy": 0.9793914705514908,
"num_tokens": 18904747.0,
"step": 8680
},
{
"epoch": 23.29798657718121,
"grad_norm": 1.324312686920166,
"learning_rate": 1.1276480655081412e-07,
"loss": 0.0649,
"mean_token_accuracy": 0.9773990035057067,
"num_tokens": 18928841.0,
"step": 8690
},
{
"epoch": 23.324832214765102,
"grad_norm": 4.074679374694824,
"learning_rate": 1.091172319354522e-07,
"loss": 0.0689,
"mean_token_accuracy": 0.9761066138744354,
"num_tokens": 18951836.0,
"step": 8700
},
{
"epoch": 23.351677852348992,
"grad_norm": 1.2226976156234741,
"learning_rate": 1.0552897404908391e-07,
"loss": 0.0659,
"mean_token_accuracy": 0.9781163841485977,
"num_tokens": 18973843.0,
"step": 8710
},
{
"epoch": 23.378523489932885,
"grad_norm": 4.224064350128174,
"learning_rate": 1.0200007641017583e-07,
"loss": 0.083,
"mean_token_accuracy": 0.9745501130819321,
"num_tokens": 18994538.0,
"step": 8720
},
{
"epoch": 23.405369127516778,
"grad_norm": 2.0320780277252197,
"learning_rate": 9.853058181727215e-08,
"loss": 0.0797,
"mean_token_accuracy": 0.9776176422834396,
"num_tokens": 19014345.0,
"step": 8730
},
{
"epoch": 23.43221476510067,
"grad_norm": 1.7470709085464478,
"learning_rate": 9.512053234847774e-08,
"loss": 0.0678,
"mean_token_accuracy": 0.9759983509778977,
"num_tokens": 19038219.0,
"step": 8740
},
{
"epoch": 23.459060402684564,
"grad_norm": 1.6720378398895264,
"learning_rate": 9.176996936094195e-08,
"loss": 0.0749,
"mean_token_accuracy": 0.9732575833797454,
"num_tokens": 19061030.0,
"step": 8750
},
{
"epoch": 23.485906040268457,
"grad_norm": 1.4237157106399536,
"learning_rate": 8.847893349036518e-08,
"loss": 0.0657,
"mean_token_accuracy": 0.9781557589769363,
"num_tokens": 19083034.0,
"step": 8760
},
{
"epoch": 23.51275167785235,
"grad_norm": 2.249169111251831,
"learning_rate": 8.52474646504986e-08,
"loss": 0.0853,
"mean_token_accuracy": 0.9733679562807083,
"num_tokens": 19103836.0,
"step": 8770
},
{
"epoch": 23.539597315436243,
"grad_norm": 1.4655739068984985,
"learning_rate": 8.207560203266462e-08,
"loss": 0.0762,
"mean_token_accuracy": 0.9779812008142471,
"num_tokens": 19123721.0,
"step": 8780
},
{
"epoch": 23.566442953020136,
"grad_norm": 2.0239083766937256,
"learning_rate": 7.896338410527948e-08,
"loss": 0.0662,
"mean_token_accuracy": 0.976813143491745,
"num_tokens": 19147587.0,
"step": 8790
},
{
"epoch": 23.593288590604026,
"grad_norm": 1.9883896112442017,
"learning_rate": 7.591084861338749e-08,
"loss": 0.0681,
"mean_token_accuracy": 0.9767774403095245,
"num_tokens": 19170369.0,
"step": 8800
},
{
"epoch": 23.62013422818792,
"grad_norm": 1.4335416555404663,
"learning_rate": 7.291803257820307e-08,
"loss": 0.0706,
"mean_token_accuracy": 0.9769880920648575,
"num_tokens": 19192301.0,
"step": 8810
},
{
"epoch": 23.64697986577181,
"grad_norm": 1.5386704206466675,
"learning_rate": 6.998497229666334e-08,
"loss": 0.0836,
"mean_token_accuracy": 0.9734211921691894,
"num_tokens": 19212985.0,
"step": 8820
},
{
"epoch": 23.673825503355705,
"grad_norm": 1.4760531187057495,
"learning_rate": 6.711170334098294e-08,
"loss": 0.0671,
"mean_token_accuracy": 0.979541563987732,
"num_tokens": 19232801.0,
"step": 8830
},
{
"epoch": 23.700671140939598,
"grad_norm": 1.768637776374817,
"learning_rate": 6.429826055822985e-08,
"loss": 0.0632,
"mean_token_accuracy": 0.9791298300027848,
"num_tokens": 19256797.0,
"step": 8840
},
{
"epoch": 23.72751677852349,
"grad_norm": 1.8322697877883911,
"learning_rate": 6.154467806989639e-08,
"loss": 0.0691,
"mean_token_accuracy": 0.9751640826463699,
"num_tokens": 19279549.0,
"step": 8850
},
{
"epoch": 23.754362416107384,
"grad_norm": 1.681655764579773,
"learning_rate": 5.885098927148947e-08,
"loss": 0.0626,
"mean_token_accuracy": 0.9791503757238388,
"num_tokens": 19301365.0,
"step": 8860
},
{
"epoch": 23.781208053691277,
"grad_norm": 2.793856620788574,
"learning_rate": 5.6217226832122605e-08,
"loss": 0.0892,
"mean_token_accuracy": 0.9728416323661804,
"num_tokens": 19321863.0,
"step": 8870
},
{
"epoch": 23.808053691275166,
"grad_norm": 1.3198052644729614,
"learning_rate": 5.364342269412237e-08,
"loss": 0.0747,
"mean_token_accuracy": 0.9781924426555634,
"num_tokens": 19341690.0,
"step": 8880
},
{
"epoch": 23.83489932885906,
"grad_norm": 1.6709011793136597,
"learning_rate": 5.112960807263978e-08,
"loss": 0.07,
"mean_token_accuracy": 0.9769271492958069,
"num_tokens": 19365725.0,
"step": 8890
},
{
"epoch": 23.861744966442952,
"grad_norm": 1.718581199645996,
"learning_rate": 4.867581345527117e-08,
"loss": 0.0698,
"mean_token_accuracy": 0.9760184645652771,
"num_tokens": 19388641.0,
"step": 8900
},
{
"epoch": 23.888590604026845,
"grad_norm": 1.6949131488800049,
"learning_rate": 4.62820686016896e-08,
"loss": 0.0695,
"mean_token_accuracy": 0.9765161216259003,
"num_tokens": 19410558.0,
"step": 8910
},
{
"epoch": 23.915436241610738,
"grad_norm": 2.4173476696014404,
"learning_rate": 4.3948402543282366e-08,
"loss": 0.0886,
"mean_token_accuracy": 0.9729428887367249,
"num_tokens": 19431305.0,
"step": 8920
},
{
"epoch": 23.94228187919463,
"grad_norm": 1.3827898502349854,
"learning_rate": 4.167484358280016e-08,
"loss": 0.0721,
"mean_token_accuracy": 0.9788724452257156,
"num_tokens": 19451159.0,
"step": 8930
},
{
"epoch": 23.969127516778524,
"grad_norm": 1.6722241640090942,
"learning_rate": 3.946141929401459e-08,
"loss": 0.0733,
"mean_token_accuracy": 0.9754825055599212,
"num_tokens": 19474310.0,
"step": 8940
},
{
"epoch": 23.995973154362417,
"grad_norm": 1.3889379501342773,
"learning_rate": 3.730815652138231e-08,
"loss": 0.0809,
"mean_token_accuracy": 0.9765669792890549,
"num_tokens": 19494756.0,
"step": 8950
},
{
"epoch": 24.021476510067114,
"grad_norm": 1.9823448657989502,
"learning_rate": 3.521508137971807e-08,
"loss": 0.0587,
"mean_token_accuracy": 0.9801463171055442,
"num_tokens": 19516280.0,
"step": 8960
},
{
"epoch": 24.048322147651007,
"grad_norm": 1.6457539796829224,
"learning_rate": 3.3182219253882766e-08,
"loss": 0.0644,
"mean_token_accuracy": 0.9775006264448166,
"num_tokens": 19539394.0,
"step": 8970
},
{
"epoch": 24.0751677852349,
"grad_norm": 1.7060420513153076,
"learning_rate": 3.120959479846919e-08,
"loss": 0.0658,
"mean_token_accuracy": 0.9774825513362885,
"num_tokens": 19561443.0,
"step": 8980
},
{
"epoch": 24.102013422818793,
"grad_norm": 2.1901304721832275,
"learning_rate": 2.9297231937510107e-08,
"loss": 0.0833,
"mean_token_accuracy": 0.9740053981542587,
"num_tokens": 19582429.0,
"step": 8990
},
{
"epoch": 24.128859060402686,
"grad_norm": 1.383569598197937,
"learning_rate": 2.7445153864180674e-08,
"loss": 0.0806,
"mean_token_accuracy": 0.9769510418176651,
"num_tokens": 19601915.0,
"step": 9000
},
{
"epoch": 24.15570469798658,
"grad_norm": 4.088254451751709,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.0658,
"mean_token_accuracy": 0.9784015566110611,
"num_tokens": 19625008.0,
"step": 9010
},
{
"epoch": 24.18255033557047,
"grad_norm": 1.6868436336517334,
"learning_rate": 2.392194119717417e-08,
"loss": 0.071,
"mean_token_accuracy": 0.9756916046142579,
"num_tokens": 19648138.0,
"step": 9020
},
{
"epoch": 24.20939597315436,
"grad_norm": 1.6547763347625732,
"learning_rate": 2.225084933309363e-08,
"loss": 0.067,
"mean_token_accuracy": 0.976289376616478,
"num_tokens": 19670309.0,
"step": 9030
},
{
"epoch": 24.236241610738254,
"grad_norm": 2.064862012863159,
"learning_rate": 2.064012771532009e-08,
"loss": 0.0829,
"mean_token_accuracy": 0.9720039278268814,
"num_tokens": 19691223.0,
"step": 9040
},
{
"epoch": 24.263087248322147,
"grad_norm": 1.1820263862609863,
"learning_rate": 1.9089795878718953e-08,
"loss": 0.083,
"mean_token_accuracy": 0.9768513649702072,
"num_tokens": 19710783.0,
"step": 9050
},
{
"epoch": 24.28993288590604,
"grad_norm": 3.2465732097625732,
"learning_rate": 1.7599872625747583e-08,
"loss": 0.0646,
"mean_token_accuracy": 0.9789623349905014,
"num_tokens": 19734062.0,
"step": 9060
},
{
"epoch": 24.316778523489933,
"grad_norm": 1.9160860776901245,
"learning_rate": 1.6170376026226065e-08,
"loss": 0.0712,
"mean_token_accuracy": 0.975100401043892,
"num_tokens": 19757347.0,
"step": 9070
},
{
"epoch": 24.343624161073826,
"grad_norm": 1.6455693244934082,
"learning_rate": 1.4801323417119595e-08,
"loss": 0.0684,
"mean_token_accuracy": 0.9768495559692383,
"num_tokens": 19779529.0,
"step": 9080
},
{
"epoch": 24.37046979865772,
"grad_norm": 3.2963931560516357,
"learning_rate": 1.3492731402326431e-08,
"loss": 0.086,
"mean_token_accuracy": 0.9731739908456802,
"num_tokens": 19800407.0,
"step": 9090
},
{
"epoch": 24.397315436241612,
"grad_norm": 1.495845913887024,
"learning_rate": 1.2244615852479158e-08,
"loss": 0.0828,
"mean_token_accuracy": 0.9758796036243439,
"num_tokens": 19819948.0,
"step": 9100
},
{
"epoch": 24.424161073825502,
"grad_norm": 2.0350308418273926,
"learning_rate": 1.1056991904748182e-08,
"loss": 0.07,
"mean_token_accuracy": 0.9763268619775772,
"num_tokens": 19843153.0,
"step": 9110
},
{
"epoch": 24.451006711409395,
"grad_norm": 1.525877833366394,
"learning_rate": 9.929873962661873e-09,
"loss": 0.0722,
"mean_token_accuracy": 0.9746161639690399,
"num_tokens": 19866266.0,
"step": 9120
},
{
"epoch": 24.477852348993288,
"grad_norm": 1.7038556337356567,
"learning_rate": 8.86327569593115e-09,
"loss": 0.0665,
"mean_token_accuracy": 0.9768165737390518,
"num_tokens": 19888403.0,
"step": 9130
},
{
"epoch": 24.50469798657718,
"grad_norm": 4.7443366050720215,
"learning_rate": 7.857210040281838e-09,
"loss": 0.0799,
"mean_token_accuracy": 0.9739553958177567,
"num_tokens": 19909460.0,
"step": 9140
},
{
"epoch": 24.531543624161074,
"grad_norm": 1.6406482458114624,
"learning_rate": 6.9116891972986766e-09,
"loss": 0.0787,
"mean_token_accuracy": 0.9759374588727951,
"num_tokens": 19928999.0,
"step": 9150
},
{
"epoch": 24.558389261744967,
"grad_norm": 1.7716859579086304,
"learning_rate": 6.026724634279335e-09,
"loss": 0.0651,
"mean_token_accuracy": 0.978615865111351,
"num_tokens": 19952230.0,
"step": 9160
},
{
"epoch": 24.58523489932886,
"grad_norm": 1.912937045097351,
"learning_rate": 5.20232708409174e-09,
"loss": 0.0694,
"mean_token_accuracy": 0.9758041888475418,
"num_tokens": 19975383.0,
"step": 9170
},
{
"epoch": 24.612080536912753,
"grad_norm": 1.484447717666626,
"learning_rate": 4.438506545046961e-09,
"loss": 0.0655,
"mean_token_accuracy": 0.9781613409519195,
"num_tokens": 19997602.0,
"step": 9180
},
{
"epoch": 24.638926174496646,
"grad_norm": 2.091848134994507,
"learning_rate": 3.73527228077708e-09,
"loss": 0.0741,
"mean_token_accuracy": 0.9758224546909332,
"num_tokens": 20018904.0,
"step": 9190
},
{
"epoch": 24.665771812080536,
"grad_norm": 1.6126073598861694,
"learning_rate": 3.0926328201213996e-09,
"loss": 0.0821,
"mean_token_accuracy": 0.9765258640050888,
"num_tokens": 20038577.0,
"step": 9200
},
{
"epoch": 24.69261744966443,
"grad_norm": 2.2654337882995605,
"learning_rate": 2.510595957025408e-09,
"loss": 0.0664,
"mean_token_accuracy": 0.9781230628490448,
"num_tokens": 20061623.0,
"step": 9210
},
{
"epoch": 24.71946308724832,
"grad_norm": 1.9814746379852295,
"learning_rate": 1.9891687504436373e-09,
"loss": 0.0701,
"mean_token_accuracy": 0.9765848129987716,
"num_tokens": 20084607.0,
"step": 9220
},
{
"epoch": 24.746308724832215,
"grad_norm": 1.474151372909546,
"learning_rate": 1.5283575242569514e-09,
"loss": 0.0732,
"mean_token_accuracy": 0.9761769473552704,
"num_tokens": 20106707.0,
"step": 9230
},
{
"epoch": 24.773154362416108,
"grad_norm": 3.660369634628296,
"learning_rate": 1.1281678671931639e-09,
"loss": 0.0765,
"mean_token_accuracy": 0.9770137190818786,
"num_tokens": 20127881.0,
"step": 9240
},
{
"epoch": 24.8,
"grad_norm": 1.50706946849823,
"learning_rate": 7.886046327609809e-10,
"loss": 0.0816,
"mean_token_accuracy": 0.9761880010366439,
"num_tokens": 20147458.0,
"step": 9250
},
{
"epoch": 24.826845637583894,
"grad_norm": 1.9096148014068604,
"learning_rate": 5.096719391900484e-10,
"loss": 0.0637,
"mean_token_accuracy": 0.9798777759075165,
"num_tokens": 20170537.0,
"step": 9260
},
{
"epoch": 24.853691275167787,
"grad_norm": 4.045505523681641,
"learning_rate": 2.9137316938265826e-10,
"loss": 0.0722,
"mean_token_accuracy": 0.9747423082590103,
"num_tokens": 20193545.0,
"step": 9270
},
{
"epoch": 24.880536912751676,
"grad_norm": 1.777500867843628,
"learning_rate": 1.337109708704487e-10,
"loss": 0.071,
"mean_token_accuracy": 0.9765003561973572,
"num_tokens": 20215713.0,
"step": 9280
},
{
"epoch": 24.90738255033557,
"grad_norm": 2.0628652572631836,
"learning_rate": 3.6687255783873775e-11,
"loss": 0.0722,
"mean_token_accuracy": 0.9766353726387024,
"num_tokens": 20237151.0,
"step": 9290
},
{
"epoch": 24.934228187919462,
"grad_norm": 1.4208660125732422,
"learning_rate": 3.0320082888835036e-13,
"loss": 0.087,
"mean_token_accuracy": 0.9745760560035706,
"num_tokens": 20256982.0,
"step": 9300
}
],
"logging_steps": 10,
"max_steps": 9300,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.462878792121057e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}