sublim-phase4-combo-01 / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
89099d4 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1308949291706085,
"epoch": 0.0037313432835820895,
"grad_norm": 1.683108925819397,
"learning_rate": 0.0002,
"loss": 2.489936590194702,
"mean_token_accuracy": 0.5359140038490295,
"num_tokens": 16356.0,
"step": 1
},
{
"entropy": 1.2256053388118744,
"epoch": 0.007462686567164179,
"grad_norm": 1.5088376998901367,
"learning_rate": 0.0002,
"loss": 2.162245273590088,
"mean_token_accuracy": 0.5673863738775253,
"num_tokens": 32718.0,
"step": 2
},
{
"entropy": 1.4011717438697815,
"epoch": 0.011194029850746268,
"grad_norm": 1.1495057344436646,
"learning_rate": 0.0002,
"loss": 1.7410045862197876,
"mean_token_accuracy": 0.5877877026796341,
"num_tokens": 49086.0,
"step": 3
},
{
"entropy": 1.3629191517829895,
"epoch": 0.014925373134328358,
"grad_norm": 0.909584105014801,
"learning_rate": 0.0002,
"loss": 1.410053014755249,
"mean_token_accuracy": 0.6416480243206024,
"num_tokens": 65483.0,
"step": 4
},
{
"entropy": 1.345184564590454,
"epoch": 0.018656716417910446,
"grad_norm": 1.1788593530654907,
"learning_rate": 0.0002,
"loss": 1.2843377590179443,
"mean_token_accuracy": 0.6425914913415909,
"num_tokens": 81705.0,
"step": 5
},
{
"entropy": 1.2523848712444305,
"epoch": 0.022388059701492536,
"grad_norm": 0.7064197659492493,
"learning_rate": 0.0002,
"loss": 1.175342082977295,
"mean_token_accuracy": 0.6635853946208954,
"num_tokens": 97918.0,
"step": 6
},
{
"entropy": 1.199697583913803,
"epoch": 0.026119402985074626,
"grad_norm": 0.4158240854740143,
"learning_rate": 0.0002,
"loss": 1.1010812520980835,
"mean_token_accuracy": 0.6607878506183624,
"num_tokens": 114455.0,
"step": 7
},
{
"entropy": 1.0897426307201385,
"epoch": 0.029850746268656716,
"grad_norm": 0.4258277118206024,
"learning_rate": 0.0002,
"loss": 1.0245436429977417,
"mean_token_accuracy": 0.682918444275856,
"num_tokens": 130921.0,
"step": 8
},
{
"entropy": 0.9851540327072144,
"epoch": 0.033582089552238806,
"grad_norm": 0.6931905150413513,
"learning_rate": 0.0002,
"loss": 0.972236692905426,
"mean_token_accuracy": 0.690200999379158,
"num_tokens": 147028.0,
"step": 9
},
{
"entropy": 0.9809075742959976,
"epoch": 0.03731343283582089,
"grad_norm": 0.4386370778083801,
"learning_rate": 0.0002,
"loss": 0.9174745082855225,
"mean_token_accuracy": 0.6927480399608612,
"num_tokens": 163432.0,
"step": 10
},
{
"entropy": 0.911684438586235,
"epoch": 0.041044776119402986,
"grad_norm": 4.369440078735352,
"learning_rate": 0.0002,
"loss": 0.8261430263519287,
"mean_token_accuracy": 0.7205553501844406,
"num_tokens": 179455.0,
"step": 11
},
{
"entropy": 0.8916845321655273,
"epoch": 0.04477611940298507,
"grad_norm": 0.5139093399047852,
"learning_rate": 0.0002,
"loss": 0.8168894648551941,
"mean_token_accuracy": 0.714234933257103,
"num_tokens": 195668.0,
"step": 12
},
{
"entropy": 0.8192363679409027,
"epoch": 0.048507462686567165,
"grad_norm": 0.5154215097427368,
"learning_rate": 0.0002,
"loss": 0.7735035419464111,
"mean_token_accuracy": 0.7252469956874847,
"num_tokens": 211417.0,
"step": 13
},
{
"entropy": 0.8060386925935745,
"epoch": 0.05223880597014925,
"grad_norm": 0.3869208097457886,
"learning_rate": 0.0002,
"loss": 0.7496379017829895,
"mean_token_accuracy": 0.7249694466590881,
"num_tokens": 228014.0,
"step": 14
},
{
"entropy": 0.7358367741107941,
"epoch": 0.055970149253731345,
"grad_norm": 0.3804072439670563,
"learning_rate": 0.0002,
"loss": 0.7129448652267456,
"mean_token_accuracy": 0.7322827130556107,
"num_tokens": 244548.0,
"step": 15
},
{
"entropy": 0.6891884654760361,
"epoch": 0.05970149253731343,
"grad_norm": 0.4262757897377014,
"learning_rate": 0.0002,
"loss": 0.7087160348892212,
"mean_token_accuracy": 0.7325101941823959,
"num_tokens": 260927.0,
"step": 16
},
{
"entropy": 0.6646793335676193,
"epoch": 0.06343283582089553,
"grad_norm": 0.3463515639305115,
"learning_rate": 0.0002,
"loss": 0.6711890697479248,
"mean_token_accuracy": 0.743767574429512,
"num_tokens": 277478.0,
"step": 17
},
{
"entropy": 0.6615253239870071,
"epoch": 0.06716417910447761,
"grad_norm": 0.3623281419277191,
"learning_rate": 0.0002,
"loss": 0.6425697803497314,
"mean_token_accuracy": 0.7528071999549866,
"num_tokens": 293828.0,
"step": 18
},
{
"entropy": 0.6510400027036667,
"epoch": 0.0708955223880597,
"grad_norm": 0.3351263701915741,
"learning_rate": 0.0002,
"loss": 0.6357494592666626,
"mean_token_accuracy": 0.7543895989656448,
"num_tokens": 309962.0,
"step": 19
},
{
"entropy": 0.6420271843671799,
"epoch": 0.07462686567164178,
"grad_norm": 0.3311758041381836,
"learning_rate": 0.0002,
"loss": 0.6307370662689209,
"mean_token_accuracy": 0.7545324862003326,
"num_tokens": 326597.0,
"step": 20
},
{
"entropy": 0.6174459308385849,
"epoch": 0.07835820895522388,
"grad_norm": 0.35250842571258545,
"learning_rate": 0.0002,
"loss": 0.6103197336196899,
"mean_token_accuracy": 0.7592763751745224,
"num_tokens": 342917.0,
"step": 21
},
{
"entropy": 0.6289893835783005,
"epoch": 0.08208955223880597,
"grad_norm": 0.25894996523857117,
"learning_rate": 0.0002,
"loss": 0.6157230734825134,
"mean_token_accuracy": 0.7587940841913223,
"num_tokens": 359567.0,
"step": 22
},
{
"entropy": 0.6118573248386383,
"epoch": 0.08582089552238806,
"grad_norm": 0.29135045409202576,
"learning_rate": 0.0002,
"loss": 0.6002258658409119,
"mean_token_accuracy": 0.7654120922088623,
"num_tokens": 375565.0,
"step": 23
},
{
"entropy": 0.5791880339384079,
"epoch": 0.08955223880597014,
"grad_norm": 0.2720821499824524,
"learning_rate": 0.0002,
"loss": 0.5813120603561401,
"mean_token_accuracy": 0.7713776230812073,
"num_tokens": 391864.0,
"step": 24
},
{
"entropy": 0.6053604930639267,
"epoch": 0.09328358208955224,
"grad_norm": 0.2560279667377472,
"learning_rate": 0.0002,
"loss": 0.6105175018310547,
"mean_token_accuracy": 0.7615619450807571,
"num_tokens": 408354.0,
"step": 25
},
{
"entropy": 0.5867195874452591,
"epoch": 0.09701492537313433,
"grad_norm": 0.22600652277469635,
"learning_rate": 0.0002,
"loss": 0.5860370993614197,
"mean_token_accuracy": 0.7677419036626816,
"num_tokens": 424712.0,
"step": 26
},
{
"entropy": 0.5918123573064804,
"epoch": 0.10074626865671642,
"grad_norm": 0.256405770778656,
"learning_rate": 0.0002,
"loss": 0.5865331888198853,
"mean_token_accuracy": 0.7698597609996796,
"num_tokens": 441249.0,
"step": 27
},
{
"entropy": 0.5696172267198563,
"epoch": 0.1044776119402985,
"grad_norm": 0.22032174468040466,
"learning_rate": 0.0002,
"loss": 0.5604762434959412,
"mean_token_accuracy": 0.7779532968997955,
"num_tokens": 457602.0,
"step": 28
},
{
"entropy": 0.5602490454912186,
"epoch": 0.10820895522388059,
"grad_norm": 0.20871949195861816,
"learning_rate": 0.0002,
"loss": 0.5587727427482605,
"mean_token_accuracy": 0.7771614342927933,
"num_tokens": 473785.0,
"step": 29
},
{
"entropy": 0.5850763767957687,
"epoch": 0.11194029850746269,
"grad_norm": 0.23072806000709534,
"learning_rate": 0.0002,
"loss": 0.5962345004081726,
"mean_token_accuracy": 0.762176513671875,
"num_tokens": 490054.0,
"step": 30
},
{
"entropy": 0.5698783695697784,
"epoch": 0.11567164179104478,
"grad_norm": 0.20846784114837646,
"learning_rate": 0.0002,
"loss": 0.5793903470039368,
"mean_token_accuracy": 0.7701146155595779,
"num_tokens": 506525.0,
"step": 31
},
{
"entropy": 0.5649833828210831,
"epoch": 0.11940298507462686,
"grad_norm": 0.20395582914352417,
"learning_rate": 0.0002,
"loss": 0.5709314942359924,
"mean_token_accuracy": 0.7762356698513031,
"num_tokens": 522952.0,
"step": 32
},
{
"entropy": 0.5790712088346481,
"epoch": 0.12313432835820895,
"grad_norm": 0.21085898578166962,
"learning_rate": 0.0002,
"loss": 0.5755910873413086,
"mean_token_accuracy": 0.7691536694765091,
"num_tokens": 539151.0,
"step": 33
},
{
"entropy": 0.5798842161893845,
"epoch": 0.12686567164179105,
"grad_norm": 0.1799822747707367,
"learning_rate": 0.0002,
"loss": 0.5749096274375916,
"mean_token_accuracy": 0.7671291828155518,
"num_tokens": 555566.0,
"step": 34
},
{
"entropy": 0.568429708480835,
"epoch": 0.13059701492537312,
"grad_norm": 0.21928845345973969,
"learning_rate": 0.0002,
"loss": 0.5717220306396484,
"mean_token_accuracy": 0.771720290184021,
"num_tokens": 572125.0,
"step": 35
},
{
"entropy": 0.5658127665519714,
"epoch": 0.13432835820895522,
"grad_norm": 0.22536930441856384,
"learning_rate": 0.0002,
"loss": 0.5656446218490601,
"mean_token_accuracy": 0.7756934762001038,
"num_tokens": 588539.0,
"step": 36
},
{
"entropy": 0.5779189765453339,
"epoch": 0.13805970149253732,
"grad_norm": 0.18143770098686218,
"learning_rate": 0.0002,
"loss": 0.5782102942466736,
"mean_token_accuracy": 0.768736332654953,
"num_tokens": 604927.0,
"step": 37
},
{
"entropy": 0.5695452243089676,
"epoch": 0.1417910447761194,
"grad_norm": 0.18897166848182678,
"learning_rate": 0.0002,
"loss": 0.5745816230773926,
"mean_token_accuracy": 0.7676017582416534,
"num_tokens": 621213.0,
"step": 38
},
{
"entropy": 0.5704480558633804,
"epoch": 0.1455223880597015,
"grad_norm": 0.20254790782928467,
"learning_rate": 0.0002,
"loss": 0.573440432548523,
"mean_token_accuracy": 0.769940122961998,
"num_tokens": 637694.0,
"step": 39
},
{
"entropy": 0.5526881515979767,
"epoch": 0.14925373134328357,
"grad_norm": 0.2001330703496933,
"learning_rate": 0.0002,
"loss": 0.5598679780960083,
"mean_token_accuracy": 0.7767495959997177,
"num_tokens": 653791.0,
"step": 40
},
{
"entropy": 0.572973906993866,
"epoch": 0.15298507462686567,
"grad_norm": 0.1802511364221573,
"learning_rate": 0.0002,
"loss": 0.5720363855361938,
"mean_token_accuracy": 0.7737791240215302,
"num_tokens": 669970.0,
"step": 41
},
{
"entropy": 0.5880100876092911,
"epoch": 0.15671641791044777,
"grad_norm": 0.190653994679451,
"learning_rate": 0.0002,
"loss": 0.5839952826499939,
"mean_token_accuracy": 0.7667653411626816,
"num_tokens": 686164.0,
"step": 42
},
{
"entropy": 0.5611717849969864,
"epoch": 0.16044776119402984,
"grad_norm": 0.18095986545085907,
"learning_rate": 0.0002,
"loss": 0.5529768466949463,
"mean_token_accuracy": 0.7791769355535507,
"num_tokens": 702271.0,
"step": 43
},
{
"entropy": 0.5776362270116806,
"epoch": 0.16417910447761194,
"grad_norm": 0.20184266567230225,
"learning_rate": 0.0002,
"loss": 0.572957456111908,
"mean_token_accuracy": 0.772771418094635,
"num_tokens": 718759.0,
"step": 44
},
{
"entropy": 0.5637746602296829,
"epoch": 0.16791044776119404,
"grad_norm": 0.16902145743370056,
"learning_rate": 0.0002,
"loss": 0.564084529876709,
"mean_token_accuracy": 0.7736680209636688,
"num_tokens": 735087.0,
"step": 45
},
{
"entropy": 0.5521982908248901,
"epoch": 0.17164179104477612,
"grad_norm": 0.16458934545516968,
"learning_rate": 0.0002,
"loss": 0.5616670846939087,
"mean_token_accuracy": 0.7762537449598312,
"num_tokens": 751513.0,
"step": 46
},
{
"entropy": 0.5518182516098022,
"epoch": 0.17537313432835822,
"grad_norm": 0.22303543984889984,
"learning_rate": 0.0002,
"loss": 0.5712406039237976,
"mean_token_accuracy": 0.7692597359418869,
"num_tokens": 767651.0,
"step": 47
},
{
"entropy": 0.5570991486310959,
"epoch": 0.1791044776119403,
"grad_norm": 0.1629144549369812,
"learning_rate": 0.0002,
"loss": 0.5624895095825195,
"mean_token_accuracy": 0.7735912799835205,
"num_tokens": 783757.0,
"step": 48
},
{
"entropy": 0.549803838133812,
"epoch": 0.1828358208955224,
"grad_norm": 0.1366954892873764,
"learning_rate": 0.0002,
"loss": 0.5442911982536316,
"mean_token_accuracy": 0.7778248488903046,
"num_tokens": 800127.0,
"step": 49
},
{
"entropy": 0.5679125189781189,
"epoch": 0.1865671641791045,
"grad_norm": 0.1564488559961319,
"learning_rate": 0.0002,
"loss": 0.5563010573387146,
"mean_token_accuracy": 0.7781310826539993,
"num_tokens": 816490.0,
"step": 50
},
{
"entropy": 0.5595380216836929,
"epoch": 0.19029850746268656,
"grad_norm": 0.1663539558649063,
"learning_rate": 0.0002,
"loss": 0.5474997758865356,
"mean_token_accuracy": 0.778365820646286,
"num_tokens": 832576.0,
"step": 51
},
{
"entropy": 0.5542885512113571,
"epoch": 0.19402985074626866,
"grad_norm": 0.15933850407600403,
"learning_rate": 0.0002,
"loss": 0.5465819239616394,
"mean_token_accuracy": 0.781011700630188,
"num_tokens": 848529.0,
"step": 52
},
{
"entropy": 0.570631816983223,
"epoch": 0.19776119402985073,
"grad_norm": 0.15335530042648315,
"learning_rate": 0.0002,
"loss": 0.5733448266983032,
"mean_token_accuracy": 0.7690710127353668,
"num_tokens": 864787.0,
"step": 53
},
{
"entropy": 0.5657172054052353,
"epoch": 0.20149253731343283,
"grad_norm": 0.15320488810539246,
"learning_rate": 0.0002,
"loss": 0.5716187357902527,
"mean_token_accuracy": 0.7727480232715607,
"num_tokens": 881120.0,
"step": 54
},
{
"entropy": 0.5566735565662384,
"epoch": 0.20522388059701493,
"grad_norm": 0.174886554479599,
"learning_rate": 0.0002,
"loss": 0.5643004775047302,
"mean_token_accuracy": 0.7743579894304276,
"num_tokens": 897598.0,
"step": 55
},
{
"entropy": 0.5483224838972092,
"epoch": 0.208955223880597,
"grad_norm": 0.14539019763469696,
"learning_rate": 0.0002,
"loss": 0.5542981028556824,
"mean_token_accuracy": 0.7777313590049744,
"num_tokens": 913970.0,
"step": 56
},
{
"entropy": 0.5746322274208069,
"epoch": 0.2126865671641791,
"grad_norm": 0.1465657502412796,
"learning_rate": 0.0002,
"loss": 0.5676500201225281,
"mean_token_accuracy": 0.7716732025146484,
"num_tokens": 930515.0,
"step": 57
},
{
"entropy": 0.5645405799150467,
"epoch": 0.21641791044776118,
"grad_norm": 0.17157647013664246,
"learning_rate": 0.0002,
"loss": 0.554180383682251,
"mean_token_accuracy": 0.7776309847831726,
"num_tokens": 946699.0,
"step": 58
},
{
"entropy": 0.5437158495187759,
"epoch": 0.22014925373134328,
"grad_norm": 0.14779002964496613,
"learning_rate": 0.0002,
"loss": 0.5412948131561279,
"mean_token_accuracy": 0.7830284535884857,
"num_tokens": 962929.0,
"step": 59
},
{
"entropy": 0.5478496849536896,
"epoch": 0.22388059701492538,
"grad_norm": 0.16550469398498535,
"learning_rate": 0.0002,
"loss": 0.546680212020874,
"mean_token_accuracy": 0.7801186293363571,
"num_tokens": 979336.0,
"step": 60
},
{
"entropy": 0.5491016507148743,
"epoch": 0.22761194029850745,
"grad_norm": 0.17403647303581238,
"learning_rate": 0.0002,
"loss": 0.5650719404220581,
"mean_token_accuracy": 0.7729975134134293,
"num_tokens": 995774.0,
"step": 61
},
{
"entropy": 0.5622769743204117,
"epoch": 0.23134328358208955,
"grad_norm": 0.17750802636146545,
"learning_rate": 0.0002,
"loss": 0.5718308687210083,
"mean_token_accuracy": 0.7699476927518845,
"num_tokens": 1012510.0,
"step": 62
},
{
"entropy": 0.5333654135465622,
"epoch": 0.23507462686567165,
"grad_norm": 0.13930155336856842,
"learning_rate": 0.0002,
"loss": 0.5345954895019531,
"mean_token_accuracy": 0.7855408787727356,
"num_tokens": 1028613.0,
"step": 63
},
{
"entropy": 0.5784197896718979,
"epoch": 0.23880597014925373,
"grad_norm": 0.16901279985904694,
"learning_rate": 0.0002,
"loss": 0.56936115026474,
"mean_token_accuracy": 0.7703966796398163,
"num_tokens": 1045046.0,
"step": 64
},
{
"entropy": 0.5690423101186752,
"epoch": 0.24253731343283583,
"grad_norm": 0.16224578022956848,
"learning_rate": 0.0002,
"loss": 0.559661865234375,
"mean_token_accuracy": 0.7719420939683914,
"num_tokens": 1061419.0,
"step": 65
},
{
"entropy": 0.5822959691286087,
"epoch": 0.2462686567164179,
"grad_norm": 0.16501320898532867,
"learning_rate": 0.0002,
"loss": 0.5733515620231628,
"mean_token_accuracy": 0.7682919055223465,
"num_tokens": 1077724.0,
"step": 66
},
{
"entropy": 0.5663120746612549,
"epoch": 0.25,
"grad_norm": 0.15710598230361938,
"learning_rate": 0.0002,
"loss": 0.5739370584487915,
"mean_token_accuracy": 0.7685963213443756,
"num_tokens": 1094309.0,
"step": 67
},
{
"entropy": 0.5416915565729141,
"epoch": 0.2537313432835821,
"grad_norm": 0.1652906835079193,
"learning_rate": 0.0002,
"loss": 0.5546884536743164,
"mean_token_accuracy": 0.7781604677438736,
"num_tokens": 1110812.0,
"step": 68
},
{
"entropy": 0.5604560673236847,
"epoch": 0.2574626865671642,
"grad_norm": 0.1823517084121704,
"learning_rate": 0.0002,
"loss": 0.565848708152771,
"mean_token_accuracy": 0.7732205092906952,
"num_tokens": 1126983.0,
"step": 69
},
{
"entropy": 0.5681725591421127,
"epoch": 0.26119402985074625,
"grad_norm": 0.15536344051361084,
"learning_rate": 0.0002,
"loss": 0.5707790851593018,
"mean_token_accuracy": 0.7711602002382278,
"num_tokens": 1143690.0,
"step": 70
},
{
"entropy": 0.5554168075323105,
"epoch": 0.26492537313432835,
"grad_norm": 0.1691257208585739,
"learning_rate": 0.0002,
"loss": 0.5645061135292053,
"mean_token_accuracy": 0.7751206457614899,
"num_tokens": 1159930.0,
"step": 71
},
{
"entropy": 0.5698556303977966,
"epoch": 0.26865671641791045,
"grad_norm": 0.17756199836730957,
"learning_rate": 0.0002,
"loss": 0.5670963525772095,
"mean_token_accuracy": 0.7744691073894501,
"num_tokens": 1176287.0,
"step": 72
},
{
"entropy": 0.558213621377945,
"epoch": 0.27238805970149255,
"grad_norm": 0.14214132726192474,
"learning_rate": 0.0002,
"loss": 0.5565056204795837,
"mean_token_accuracy": 0.7759946286678314,
"num_tokens": 1192733.0,
"step": 73
},
{
"entropy": 0.5587260574102402,
"epoch": 0.27611940298507465,
"grad_norm": 0.1475045531988144,
"learning_rate": 0.0002,
"loss": 0.5534224510192871,
"mean_token_accuracy": 0.7787353843450546,
"num_tokens": 1209413.0,
"step": 74
},
{
"entropy": 0.5601568818092346,
"epoch": 0.2798507462686567,
"grad_norm": 0.17161411046981812,
"learning_rate": 0.0002,
"loss": 0.5623729825019836,
"mean_token_accuracy": 0.773567259311676,
"num_tokens": 1225838.0,
"step": 75
},
{
"entropy": 0.5421780049800873,
"epoch": 0.2835820895522388,
"grad_norm": 0.1444474756717682,
"learning_rate": 0.0002,
"loss": 0.5297126173973083,
"mean_token_accuracy": 0.7893946915864944,
"num_tokens": 1242213.0,
"step": 76
},
{
"entropy": 0.5718793421983719,
"epoch": 0.2873134328358209,
"grad_norm": 0.14322321116924286,
"learning_rate": 0.0002,
"loss": 0.5714331865310669,
"mean_token_accuracy": 0.7688785791397095,
"num_tokens": 1258461.0,
"step": 77
},
{
"entropy": 0.5419993549585342,
"epoch": 0.291044776119403,
"grad_norm": 0.1524474024772644,
"learning_rate": 0.0002,
"loss": 0.5490943193435669,
"mean_token_accuracy": 0.779272273182869,
"num_tokens": 1274449.0,
"step": 78
},
{
"entropy": 0.5585939884185791,
"epoch": 0.2947761194029851,
"grad_norm": 0.1510787457227707,
"learning_rate": 0.0002,
"loss": 0.5654528141021729,
"mean_token_accuracy": 0.772942066192627,
"num_tokens": 1290949.0,
"step": 79
},
{
"entropy": 0.563146710395813,
"epoch": 0.29850746268656714,
"grad_norm": 0.1482156217098236,
"learning_rate": 0.0002,
"loss": 0.5777900218963623,
"mean_token_accuracy": 0.7702645510435104,
"num_tokens": 1307187.0,
"step": 80
},
{
"entropy": 0.5600180923938751,
"epoch": 0.30223880597014924,
"grad_norm": 0.15022550523281097,
"learning_rate": 0.0002,
"loss": 0.5632287859916687,
"mean_token_accuracy": 0.7716066837310791,
"num_tokens": 1323407.0,
"step": 81
},
{
"entropy": 0.5598095804452896,
"epoch": 0.30597014925373134,
"grad_norm": 0.1322828084230423,
"learning_rate": 0.0002,
"loss": 0.5537106394767761,
"mean_token_accuracy": 0.7764421850442886,
"num_tokens": 1339664.0,
"step": 82
},
{
"entropy": 0.5458928942680359,
"epoch": 0.30970149253731344,
"grad_norm": 0.1319894790649414,
"learning_rate": 0.0002,
"loss": 0.5423555374145508,
"mean_token_accuracy": 0.7807362526655197,
"num_tokens": 1356260.0,
"step": 83
},
{
"entropy": 0.5659633129835129,
"epoch": 0.31343283582089554,
"grad_norm": 0.13246627151966095,
"learning_rate": 0.0002,
"loss": 0.557287335395813,
"mean_token_accuracy": 0.7743117958307266,
"num_tokens": 1372821.0,
"step": 84
},
{
"entropy": 0.5452462434768677,
"epoch": 0.31716417910447764,
"grad_norm": 0.16196919977664948,
"learning_rate": 0.0002,
"loss": 0.543107271194458,
"mean_token_accuracy": 0.7795177549123764,
"num_tokens": 1388889.0,
"step": 85
},
{
"entropy": 0.5466109812259674,
"epoch": 0.3208955223880597,
"grad_norm": 0.12639470398426056,
"learning_rate": 0.0002,
"loss": 0.5396162271499634,
"mean_token_accuracy": 0.7834953665733337,
"num_tokens": 1405139.0,
"step": 86
},
{
"entropy": 0.551815465092659,
"epoch": 0.3246268656716418,
"grad_norm": 0.18058188259601593,
"learning_rate": 0.0002,
"loss": 0.5637637972831726,
"mean_token_accuracy": 0.7716487348079681,
"num_tokens": 1421439.0,
"step": 87
},
{
"entropy": 0.543148547410965,
"epoch": 0.3283582089552239,
"grad_norm": 0.14002034068107605,
"learning_rate": 0.0002,
"loss": 0.549104630947113,
"mean_token_accuracy": 0.7779115587472916,
"num_tokens": 1437695.0,
"step": 88
},
{
"entropy": 0.5655066221952438,
"epoch": 0.332089552238806,
"grad_norm": 0.13395759463310242,
"learning_rate": 0.0002,
"loss": 0.5683454871177673,
"mean_token_accuracy": 0.7728030234575272,
"num_tokens": 1453991.0,
"step": 89
},
{
"entropy": 0.5676597952842712,
"epoch": 0.3358208955223881,
"grad_norm": 0.14229720830917358,
"learning_rate": 0.0002,
"loss": 0.5701878070831299,
"mean_token_accuracy": 0.7698987573385239,
"num_tokens": 1470371.0,
"step": 90
},
{
"entropy": 0.5576249063014984,
"epoch": 0.33955223880597013,
"grad_norm": 0.1365518420934677,
"learning_rate": 0.0002,
"loss": 0.560733437538147,
"mean_token_accuracy": 0.7742054760456085,
"num_tokens": 1486891.0,
"step": 91
},
{
"entropy": 0.5476901531219482,
"epoch": 0.34328358208955223,
"grad_norm": 0.12286433577537537,
"learning_rate": 0.0002,
"loss": 0.5540446639060974,
"mean_token_accuracy": 0.7757776081562042,
"num_tokens": 1503153.0,
"step": 92
},
{
"entropy": 0.5445209294557571,
"epoch": 0.34701492537313433,
"grad_norm": 0.13203619420528412,
"learning_rate": 0.0002,
"loss": 0.5416238903999329,
"mean_token_accuracy": 0.7820428013801575,
"num_tokens": 1519248.0,
"step": 93
},
{
"entropy": 0.5732006430625916,
"epoch": 0.35074626865671643,
"grad_norm": 0.14288392663002014,
"learning_rate": 0.0002,
"loss": 0.5734184980392456,
"mean_token_accuracy": 0.7677003741264343,
"num_tokens": 1535616.0,
"step": 94
},
{
"entropy": 0.5645585656166077,
"epoch": 0.35447761194029853,
"grad_norm": 0.1253618448972702,
"learning_rate": 0.0002,
"loss": 0.5549549460411072,
"mean_token_accuracy": 0.7756840586662292,
"num_tokens": 1552040.0,
"step": 95
},
{
"entropy": 0.5686955749988556,
"epoch": 0.3582089552238806,
"grad_norm": 0.12725889682769775,
"learning_rate": 0.0002,
"loss": 0.573272705078125,
"mean_token_accuracy": 0.7684734165668488,
"num_tokens": 1568381.0,
"step": 96
},
{
"entropy": 0.547907680273056,
"epoch": 0.3619402985074627,
"grad_norm": 0.13573119044303894,
"learning_rate": 0.0002,
"loss": 0.5526182055473328,
"mean_token_accuracy": 0.7779877185821533,
"num_tokens": 1584726.0,
"step": 97
},
{
"entropy": 0.5658805668354034,
"epoch": 0.3656716417910448,
"grad_norm": 0.13501696288585663,
"learning_rate": 0.0002,
"loss": 0.5696231722831726,
"mean_token_accuracy": 0.7706904113292694,
"num_tokens": 1601142.0,
"step": 98
},
{
"entropy": 0.5553559362888336,
"epoch": 0.3694029850746269,
"grad_norm": 0.12036850303411484,
"learning_rate": 0.0002,
"loss": 0.5520588159561157,
"mean_token_accuracy": 0.7781549990177155,
"num_tokens": 1617184.0,
"step": 99
},
{
"entropy": 0.5559379458427429,
"epoch": 0.373134328358209,
"grad_norm": 0.12556730210781097,
"learning_rate": 0.0002,
"loss": 0.5582664608955383,
"mean_token_accuracy": 0.7744826525449753,
"num_tokens": 1633573.0,
"step": 100
},
{
"entropy": 0.5321817249059677,
"epoch": 0.376865671641791,
"grad_norm": 0.1410171091556549,
"learning_rate": 0.0002,
"loss": 0.531158447265625,
"mean_token_accuracy": 0.7867954224348068,
"num_tokens": 1649580.0,
"step": 101
},
{
"entropy": 0.5629207491874695,
"epoch": 0.3805970149253731,
"grad_norm": 0.1320696920156479,
"learning_rate": 0.0002,
"loss": 0.5548203587532043,
"mean_token_accuracy": 0.777129277586937,
"num_tokens": 1665914.0,
"step": 102
},
{
"entropy": 0.5625062435865402,
"epoch": 0.3843283582089552,
"grad_norm": 0.15022383630275726,
"learning_rate": 0.0002,
"loss": 0.559231698513031,
"mean_token_accuracy": 0.7755367606878281,
"num_tokens": 1682572.0,
"step": 103
},
{
"entropy": 0.55105359852314,
"epoch": 0.3880597014925373,
"grad_norm": 0.13816320896148682,
"learning_rate": 0.0002,
"loss": 0.5513999462127686,
"mean_token_accuracy": 0.7777303904294968,
"num_tokens": 1698800.0,
"step": 104
},
{
"entropy": 0.5433051884174347,
"epoch": 0.3917910447761194,
"grad_norm": 0.13852182030677795,
"learning_rate": 0.0002,
"loss": 0.5473951101303101,
"mean_token_accuracy": 0.7787780612707138,
"num_tokens": 1715089.0,
"step": 105
},
{
"entropy": 0.5638341754674911,
"epoch": 0.39552238805970147,
"grad_norm": 0.13244302570819855,
"learning_rate": 0.0002,
"loss": 0.5711042284965515,
"mean_token_accuracy": 0.7705479264259338,
"num_tokens": 1731289.0,
"step": 106
},
{
"entropy": 0.5590131878852844,
"epoch": 0.39925373134328357,
"grad_norm": 0.14187560975551605,
"learning_rate": 0.0002,
"loss": 0.5588455200195312,
"mean_token_accuracy": 0.775245189666748,
"num_tokens": 1747777.0,
"step": 107
},
{
"entropy": 0.5456477552652359,
"epoch": 0.40298507462686567,
"grad_norm": 0.12155073136091232,
"learning_rate": 0.0002,
"loss": 0.5477449297904968,
"mean_token_accuracy": 0.7793276309967041,
"num_tokens": 1764099.0,
"step": 108
},
{
"entropy": 0.5533221960067749,
"epoch": 0.40671641791044777,
"grad_norm": 0.14932067692279816,
"learning_rate": 0.0002,
"loss": 0.550473153591156,
"mean_token_accuracy": 0.7792102247476578,
"num_tokens": 1780092.0,
"step": 109
},
{
"entropy": 0.5685938596725464,
"epoch": 0.41044776119402987,
"grad_norm": 0.11824015527963638,
"learning_rate": 0.0002,
"loss": 0.567302942276001,
"mean_token_accuracy": 0.768885999917984,
"num_tokens": 1796553.0,
"step": 110
},
{
"entropy": 0.558070957660675,
"epoch": 0.4141791044776119,
"grad_norm": 0.13145862519741058,
"learning_rate": 0.0002,
"loss": 0.5594078302383423,
"mean_token_accuracy": 0.7714920043945312,
"num_tokens": 1812976.0,
"step": 111
},
{
"entropy": 0.5445801764726639,
"epoch": 0.417910447761194,
"grad_norm": 0.1538373976945877,
"learning_rate": 0.0002,
"loss": 0.5507169365882874,
"mean_token_accuracy": 0.7795748263597488,
"num_tokens": 1829496.0,
"step": 112
},
{
"entropy": 0.5546134263277054,
"epoch": 0.4216417910447761,
"grad_norm": 0.14499837160110474,
"learning_rate": 0.0002,
"loss": 0.5621107816696167,
"mean_token_accuracy": 0.772913932800293,
"num_tokens": 1845899.0,
"step": 113
},
{
"entropy": 0.5376207381486893,
"epoch": 0.4253731343283582,
"grad_norm": 0.12395139783620834,
"learning_rate": 0.0002,
"loss": 0.5408076643943787,
"mean_token_accuracy": 0.7826146930456161,
"num_tokens": 1862102.0,
"step": 114
},
{
"entropy": 0.5709025114774704,
"epoch": 0.4291044776119403,
"grad_norm": 0.14900445938110352,
"learning_rate": 0.0002,
"loss": 0.5688319206237793,
"mean_token_accuracy": 0.7712048441171646,
"num_tokens": 1878466.0,
"step": 115
},
{
"entropy": 0.5531350374221802,
"epoch": 0.43283582089552236,
"grad_norm": 0.14944979548454285,
"learning_rate": 0.0002,
"loss": 0.5533212423324585,
"mean_token_accuracy": 0.7762057036161423,
"num_tokens": 1894613.0,
"step": 116
},
{
"entropy": 0.5613852292299271,
"epoch": 0.43656716417910446,
"grad_norm": 0.14122174680233002,
"learning_rate": 0.0002,
"loss": 0.5625326633453369,
"mean_token_accuracy": 0.7721518725156784,
"num_tokens": 1910791.0,
"step": 117
},
{
"entropy": 0.5606949478387833,
"epoch": 0.44029850746268656,
"grad_norm": 0.11353051662445068,
"learning_rate": 0.0002,
"loss": 0.5561124682426453,
"mean_token_accuracy": 0.7774701118469238,
"num_tokens": 1927342.0,
"step": 118
},
{
"entropy": 0.5748601853847504,
"epoch": 0.44402985074626866,
"grad_norm": 0.13328969478607178,
"learning_rate": 0.0002,
"loss": 0.5738563537597656,
"mean_token_accuracy": 0.7660426646471024,
"num_tokens": 1944009.0,
"step": 119
},
{
"entropy": 0.5331175327301025,
"epoch": 0.44776119402985076,
"grad_norm": 0.14304570853710175,
"learning_rate": 0.0002,
"loss": 0.535332441329956,
"mean_token_accuracy": 0.7843142002820969,
"num_tokens": 1960275.0,
"step": 120
},
{
"entropy": 0.5579216629266739,
"epoch": 0.45149253731343286,
"grad_norm": 0.12545879185199738,
"learning_rate": 0.0002,
"loss": 0.5590261220932007,
"mean_token_accuracy": 0.7733252346515656,
"num_tokens": 1976578.0,
"step": 121
},
{
"entropy": 0.5593921393156052,
"epoch": 0.4552238805970149,
"grad_norm": 0.13857485353946686,
"learning_rate": 0.0002,
"loss": 0.5631604194641113,
"mean_token_accuracy": 0.7736008018255234,
"num_tokens": 1993053.0,
"step": 122
},
{
"entropy": 0.5660806745290756,
"epoch": 0.458955223880597,
"grad_norm": 0.11944495886564255,
"learning_rate": 0.0002,
"loss": 0.5569764375686646,
"mean_token_accuracy": 0.7737946212291718,
"num_tokens": 2009442.0,
"step": 123
},
{
"entropy": 0.5681817382574081,
"epoch": 0.4626865671641791,
"grad_norm": 0.14172527194023132,
"learning_rate": 0.0002,
"loss": 0.5605779886245728,
"mean_token_accuracy": 0.7750114947557449,
"num_tokens": 2025901.0,
"step": 124
},
{
"entropy": 0.5467974990606308,
"epoch": 0.4664179104477612,
"grad_norm": 0.1252705603837967,
"learning_rate": 0.0002,
"loss": 0.5515766739845276,
"mean_token_accuracy": 0.7760580778121948,
"num_tokens": 2042208.0,
"step": 125
},
{
"entropy": 0.5420515686273575,
"epoch": 0.4701492537313433,
"grad_norm": 0.13870663940906525,
"learning_rate": 0.0002,
"loss": 0.5480060577392578,
"mean_token_accuracy": 0.7764822095632553,
"num_tokens": 2058681.0,
"step": 126
},
{
"entropy": 0.5362897217273712,
"epoch": 0.47388059701492535,
"grad_norm": 0.13995425403118134,
"learning_rate": 0.0002,
"loss": 0.5513206720352173,
"mean_token_accuracy": 0.7750497758388519,
"num_tokens": 2075000.0,
"step": 127
},
{
"entropy": 0.5329284965991974,
"epoch": 0.47761194029850745,
"grad_norm": 0.16524387896060944,
"learning_rate": 0.0002,
"loss": 0.5436174273490906,
"mean_token_accuracy": 0.7792856246232986,
"num_tokens": 2091221.0,
"step": 128
},
{
"entropy": 0.5539916902780533,
"epoch": 0.48134328358208955,
"grad_norm": 0.12479358166456223,
"learning_rate": 0.0002,
"loss": 0.5608515739440918,
"mean_token_accuracy": 0.7734991759061813,
"num_tokens": 2107664.0,
"step": 129
},
{
"entropy": 0.5594889521598816,
"epoch": 0.48507462686567165,
"grad_norm": 0.14481139183044434,
"learning_rate": 0.0002,
"loss": 0.5508875846862793,
"mean_token_accuracy": 0.7767421901226044,
"num_tokens": 2123952.0,
"step": 130
},
{
"entropy": 0.5442296341061592,
"epoch": 0.48880597014925375,
"grad_norm": 0.12281627953052521,
"learning_rate": 0.0002,
"loss": 0.5368722677230835,
"mean_token_accuracy": 0.7826971709728241,
"num_tokens": 2139985.0,
"step": 131
},
{
"entropy": 0.562851145863533,
"epoch": 0.4925373134328358,
"grad_norm": 0.14453750848770142,
"learning_rate": 0.0002,
"loss": 0.5439143180847168,
"mean_token_accuracy": 0.7809209376573563,
"num_tokens": 2156312.0,
"step": 132
},
{
"entropy": 0.5531761199235916,
"epoch": 0.4962686567164179,
"grad_norm": 0.13650745153427124,
"learning_rate": 0.0002,
"loss": 0.5565841197967529,
"mean_token_accuracy": 0.7758718878030777,
"num_tokens": 2172756.0,
"step": 133
},
{
"entropy": 0.5456132292747498,
"epoch": 0.5,
"grad_norm": 0.13749481737613678,
"learning_rate": 0.0002,
"loss": 0.5540860295295715,
"mean_token_accuracy": 0.7755758464336395,
"num_tokens": 2189086.0,
"step": 134
},
{
"entropy": 0.5647578835487366,
"epoch": 0.503731343283582,
"grad_norm": 0.145718514919281,
"learning_rate": 0.0002,
"loss": 0.5744016766548157,
"mean_token_accuracy": 0.7706383019685745,
"num_tokens": 2205658.0,
"step": 135
},
{
"entropy": 0.5253579095005989,
"epoch": 0.5074626865671642,
"grad_norm": 0.1236543357372284,
"learning_rate": 0.0002,
"loss": 0.5327446460723877,
"mean_token_accuracy": 0.7834168970584869,
"num_tokens": 2221900.0,
"step": 136
},
{
"entropy": 0.5625722110271454,
"epoch": 0.5111940298507462,
"grad_norm": 0.1114581972360611,
"learning_rate": 0.0002,
"loss": 0.5667597651481628,
"mean_token_accuracy": 0.7699635177850723,
"num_tokens": 2238309.0,
"step": 137
},
{
"entropy": 0.5476242303848267,
"epoch": 0.5149253731343284,
"grad_norm": 0.1360960304737091,
"learning_rate": 0.0002,
"loss": 0.5452396273612976,
"mean_token_accuracy": 0.7796155512332916,
"num_tokens": 2254713.0,
"step": 138
},
{
"entropy": 0.5573885440826416,
"epoch": 0.5186567164179104,
"grad_norm": 0.11950599402189255,
"learning_rate": 0.0002,
"loss": 0.5531854629516602,
"mean_token_accuracy": 0.7765035033226013,
"num_tokens": 2271164.0,
"step": 139
},
{
"entropy": 0.5644345581531525,
"epoch": 0.5223880597014925,
"grad_norm": 0.11840134114027023,
"learning_rate": 0.0002,
"loss": 0.5575224161148071,
"mean_token_accuracy": 0.7718838900327682,
"num_tokens": 2287762.0,
"step": 140
},
{
"entropy": 0.5466153174638748,
"epoch": 0.5261194029850746,
"grad_norm": 0.1688532829284668,
"learning_rate": 0.0002,
"loss": 0.5499178171157837,
"mean_token_accuracy": 0.777469664812088,
"num_tokens": 2304348.0,
"step": 141
},
{
"entropy": 0.5427221059799194,
"epoch": 0.5298507462686567,
"grad_norm": 0.14760567247867584,
"learning_rate": 0.0002,
"loss": 0.5492222905158997,
"mean_token_accuracy": 0.778323158621788,
"num_tokens": 2320490.0,
"step": 142
},
{
"entropy": 0.5470593422651291,
"epoch": 0.5335820895522388,
"grad_norm": 0.19991202652454376,
"learning_rate": 0.0002,
"loss": 0.5513626933097839,
"mean_token_accuracy": 0.7774471044540405,
"num_tokens": 2337221.0,
"step": 143
},
{
"entropy": 0.5426470190286636,
"epoch": 0.5373134328358209,
"grad_norm": 0.11571265757083893,
"learning_rate": 0.0002,
"loss": 0.5405253767967224,
"mean_token_accuracy": 0.7813504189252853,
"num_tokens": 2353353.0,
"step": 144
},
{
"entropy": 0.5667431056499481,
"epoch": 0.5410447761194029,
"grad_norm": 0.12742455303668976,
"learning_rate": 0.0002,
"loss": 0.5593273043632507,
"mean_token_accuracy": 0.7729441076517105,
"num_tokens": 2369753.0,
"step": 145
},
{
"entropy": 0.5697275847196579,
"epoch": 0.5447761194029851,
"grad_norm": 0.1348797082901001,
"learning_rate": 0.0002,
"loss": 0.5684511661529541,
"mean_token_accuracy": 0.7724753767251968,
"num_tokens": 2386156.0,
"step": 146
},
{
"entropy": 0.5411224067211151,
"epoch": 0.5485074626865671,
"grad_norm": 0.1279442012310028,
"learning_rate": 0.0002,
"loss": 0.5420435667037964,
"mean_token_accuracy": 0.782076433300972,
"num_tokens": 2402488.0,
"step": 147
},
{
"entropy": 0.5458887368440628,
"epoch": 0.5522388059701493,
"grad_norm": 0.15301373600959778,
"learning_rate": 0.0002,
"loss": 0.5421918630599976,
"mean_token_accuracy": 0.7805485129356384,
"num_tokens": 2418800.0,
"step": 148
},
{
"entropy": 0.5494910031557083,
"epoch": 0.5559701492537313,
"grad_norm": 0.13024193048477173,
"learning_rate": 0.0002,
"loss": 0.5560234189033508,
"mean_token_accuracy": 0.7752619981765747,
"num_tokens": 2435229.0,
"step": 149
},
{
"entropy": 0.5497897416353226,
"epoch": 0.5597014925373134,
"grad_norm": 0.140470951795578,
"learning_rate": 0.0002,
"loss": 0.5513492226600647,
"mean_token_accuracy": 0.775757297873497,
"num_tokens": 2451762.0,
"step": 150
},
{
"entropy": 0.5479221642017365,
"epoch": 0.5634328358208955,
"grad_norm": 0.11884977668523788,
"learning_rate": 0.0002,
"loss": 0.5478861331939697,
"mean_token_accuracy": 0.782090038061142,
"num_tokens": 2468180.0,
"step": 151
},
{
"entropy": 0.5405495166778564,
"epoch": 0.5671641791044776,
"grad_norm": 0.12883080542087555,
"learning_rate": 0.0002,
"loss": 0.5406085252761841,
"mean_token_accuracy": 0.7832252681255341,
"num_tokens": 2484444.0,
"step": 152
},
{
"entropy": 0.5454452037811279,
"epoch": 0.5708955223880597,
"grad_norm": 0.12270363420248032,
"learning_rate": 0.0002,
"loss": 0.5502068400382996,
"mean_token_accuracy": 0.7790153920650482,
"num_tokens": 2500846.0,
"step": 153
},
{
"entropy": 0.5570302158594131,
"epoch": 0.5746268656716418,
"grad_norm": 0.1269625872373581,
"learning_rate": 0.0002,
"loss": 0.5548018217086792,
"mean_token_accuracy": 0.778030514717102,
"num_tokens": 2517083.0,
"step": 154
},
{
"entropy": 0.5605379194021225,
"epoch": 0.5783582089552238,
"grad_norm": 0.1287340223789215,
"learning_rate": 0.0002,
"loss": 0.561842143535614,
"mean_token_accuracy": 0.7721278667449951,
"num_tokens": 2533804.0,
"step": 155
},
{
"entropy": 0.5481511801481247,
"epoch": 0.582089552238806,
"grad_norm": 0.13460931181907654,
"learning_rate": 0.0002,
"loss": 0.5473400950431824,
"mean_token_accuracy": 0.7798450142145157,
"num_tokens": 2550301.0,
"step": 156
},
{
"entropy": 0.5569665729999542,
"epoch": 0.585820895522388,
"grad_norm": 0.1167525053024292,
"learning_rate": 0.0002,
"loss": 0.5591033697128296,
"mean_token_accuracy": 0.7743667513132095,
"num_tokens": 2566630.0,
"step": 157
},
{
"entropy": 0.5529917627573013,
"epoch": 0.5895522388059702,
"grad_norm": 0.1454092264175415,
"learning_rate": 0.0002,
"loss": 0.5575821399688721,
"mean_token_accuracy": 0.7714344263076782,
"num_tokens": 2583278.0,
"step": 158
},
{
"entropy": 0.5369462221860886,
"epoch": 0.5932835820895522,
"grad_norm": 0.12713587284088135,
"learning_rate": 0.0002,
"loss": 0.541353702545166,
"mean_token_accuracy": 0.7810934484004974,
"num_tokens": 2599680.0,
"step": 159
},
{
"entropy": 0.5471956133842468,
"epoch": 0.5970149253731343,
"grad_norm": 0.1193249523639679,
"learning_rate": 0.0002,
"loss": 0.544399619102478,
"mean_token_accuracy": 0.777627244591713,
"num_tokens": 2615971.0,
"step": 160
},
{
"entropy": 0.5561826080083847,
"epoch": 0.6007462686567164,
"grad_norm": 0.1412789523601532,
"learning_rate": 0.0002,
"loss": 0.5533403754234314,
"mean_token_accuracy": 0.774614229798317,
"num_tokens": 2632402.0,
"step": 161
},
{
"entropy": 0.5589349567890167,
"epoch": 0.6044776119402985,
"grad_norm": 0.12422283738851547,
"learning_rate": 0.0002,
"loss": 0.5584982633590698,
"mean_token_accuracy": 0.772629901766777,
"num_tokens": 2648936.0,
"step": 162
},
{
"entropy": 0.5598675608634949,
"epoch": 0.6082089552238806,
"grad_norm": 0.14433413743972778,
"learning_rate": 0.0002,
"loss": 0.5596426725387573,
"mean_token_accuracy": 0.7740431576967239,
"num_tokens": 2665475.0,
"step": 163
},
{
"entropy": 0.5221775621175766,
"epoch": 0.6119402985074627,
"grad_norm": 0.12392512708902359,
"learning_rate": 0.0002,
"loss": 0.5226801037788391,
"mean_token_accuracy": 0.7883991152048111,
"num_tokens": 2681739.0,
"step": 164
},
{
"entropy": 0.5390211492776871,
"epoch": 0.6156716417910447,
"grad_norm": 0.1389789581298828,
"learning_rate": 0.0002,
"loss": 0.5467759370803833,
"mean_token_accuracy": 0.7787502557039261,
"num_tokens": 2698224.0,
"step": 165
},
{
"entropy": 0.5343765914440155,
"epoch": 0.6194029850746269,
"grad_norm": 0.15462790429592133,
"learning_rate": 0.0002,
"loss": 0.5523170232772827,
"mean_token_accuracy": 0.7789429575204849,
"num_tokens": 2714480.0,
"step": 166
},
{
"entropy": 0.5412632822990417,
"epoch": 0.6231343283582089,
"grad_norm": 0.13078634440898895,
"learning_rate": 0.0002,
"loss": 0.5461232662200928,
"mean_token_accuracy": 0.7796546518802643,
"num_tokens": 2730804.0,
"step": 167
},
{
"entropy": 0.5592486709356308,
"epoch": 0.6268656716417911,
"grad_norm": 0.11671686917543411,
"learning_rate": 0.0002,
"loss": 0.5556939244270325,
"mean_token_accuracy": 0.7750763148069382,
"num_tokens": 2747189.0,
"step": 168
},
{
"entropy": 0.5645984709262848,
"epoch": 0.6305970149253731,
"grad_norm": 0.11404155939817429,
"learning_rate": 0.0002,
"loss": 0.5586551427841187,
"mean_token_accuracy": 0.7756913602352142,
"num_tokens": 2763561.0,
"step": 169
},
{
"entropy": 0.5689886808395386,
"epoch": 0.6343283582089553,
"grad_norm": 0.13602924346923828,
"learning_rate": 0.0002,
"loss": 0.571495771408081,
"mean_token_accuracy": 0.7653735727071762,
"num_tokens": 2780048.0,
"step": 170
},
{
"entropy": 0.56998710334301,
"epoch": 0.6380597014925373,
"grad_norm": 0.15131747722625732,
"learning_rate": 0.0002,
"loss": 0.5685769319534302,
"mean_token_accuracy": 0.770746722817421,
"num_tokens": 2796401.0,
"step": 171
},
{
"entropy": 0.5340622663497925,
"epoch": 0.6417910447761194,
"grad_norm": 0.10990842431783676,
"learning_rate": 0.0002,
"loss": 0.5300686955451965,
"mean_token_accuracy": 0.7831304669380188,
"num_tokens": 2812688.0,
"step": 172
},
{
"entropy": 0.5546266734600067,
"epoch": 0.6455223880597015,
"grad_norm": 0.14243000745773315,
"learning_rate": 0.0002,
"loss": 0.5531081557273865,
"mean_token_accuracy": 0.7720183730125427,
"num_tokens": 2828912.0,
"step": 173
},
{
"entropy": 0.530887708067894,
"epoch": 0.6492537313432836,
"grad_norm": 0.14285673201084137,
"learning_rate": 0.0002,
"loss": 0.5329350233078003,
"mean_token_accuracy": 0.7844198048114777,
"num_tokens": 2845032.0,
"step": 174
},
{
"entropy": 0.5529126077890396,
"epoch": 0.6529850746268657,
"grad_norm": 0.12663516402244568,
"learning_rate": 0.0002,
"loss": 0.5582675337791443,
"mean_token_accuracy": 0.775692343711853,
"num_tokens": 2861233.0,
"step": 175
},
{
"entropy": 0.5530151873826981,
"epoch": 0.6567164179104478,
"grad_norm": 0.1777547299861908,
"learning_rate": 0.0002,
"loss": 0.5580370426177979,
"mean_token_accuracy": 0.7773808538913727,
"num_tokens": 2877595.0,
"step": 176
},
{
"entropy": 0.5517453551292419,
"epoch": 0.6604477611940298,
"grad_norm": 0.12728020548820496,
"learning_rate": 0.0002,
"loss": 0.549347996711731,
"mean_token_accuracy": 0.7813896834850311,
"num_tokens": 2893885.0,
"step": 177
},
{
"entropy": 0.5581229478120804,
"epoch": 0.664179104477612,
"grad_norm": 0.12608157098293304,
"learning_rate": 0.0002,
"loss": 0.5528551936149597,
"mean_token_accuracy": 0.774133637547493,
"num_tokens": 2910402.0,
"step": 178
},
{
"entropy": 0.5545129030942917,
"epoch": 0.667910447761194,
"grad_norm": 0.14164696633815765,
"learning_rate": 0.0002,
"loss": 0.5471103191375732,
"mean_token_accuracy": 0.7807044833898544,
"num_tokens": 2927020.0,
"step": 179
},
{
"entropy": 0.5679615437984467,
"epoch": 0.6716417910447762,
"grad_norm": 0.11040110141038895,
"learning_rate": 0.0002,
"loss": 0.5661795139312744,
"mean_token_accuracy": 0.7697756141424179,
"num_tokens": 2943445.0,
"step": 180
},
{
"entropy": 0.5358923226594925,
"epoch": 0.6753731343283582,
"grad_norm": 0.12206491082906723,
"learning_rate": 0.0002,
"loss": 0.5459122061729431,
"mean_token_accuracy": 0.7805617302656174,
"num_tokens": 2959987.0,
"step": 181
},
{
"entropy": 0.5579689890146255,
"epoch": 0.6791044776119403,
"grad_norm": 0.14179477095603943,
"learning_rate": 0.0002,
"loss": 0.5636488199234009,
"mean_token_accuracy": 0.7736007869243622,
"num_tokens": 2976751.0,
"step": 182
},
{
"entropy": 0.5510261654853821,
"epoch": 0.6828358208955224,
"grad_norm": 0.12091591209173203,
"learning_rate": 0.0002,
"loss": 0.561327338218689,
"mean_token_accuracy": 0.776558443903923,
"num_tokens": 2993041.0,
"step": 183
},
{
"entropy": 0.5457663834095001,
"epoch": 0.6865671641791045,
"grad_norm": 0.12697891891002655,
"learning_rate": 0.0002,
"loss": 0.5465325117111206,
"mean_token_accuracy": 0.7786546349525452,
"num_tokens": 3009436.0,
"step": 184
},
{
"entropy": 0.5649427324533463,
"epoch": 0.6902985074626866,
"grad_norm": 0.13892695307731628,
"learning_rate": 0.0002,
"loss": 0.5654124617576599,
"mean_token_accuracy": 0.7703604251146317,
"num_tokens": 3025787.0,
"step": 185
},
{
"entropy": 0.5688793361186981,
"epoch": 0.6940298507462687,
"grad_norm": 0.11656537652015686,
"learning_rate": 0.0002,
"loss": 0.5590483546257019,
"mean_token_accuracy": 0.7758390307426453,
"num_tokens": 3042147.0,
"step": 186
},
{
"entropy": 0.5568420886993408,
"epoch": 0.6977611940298507,
"grad_norm": 0.1266399472951889,
"learning_rate": 0.0002,
"loss": 0.5490051507949829,
"mean_token_accuracy": 0.7778443545103073,
"num_tokens": 3058479.0,
"step": 187
},
{
"entropy": 0.5504391342401505,
"epoch": 0.7014925373134329,
"grad_norm": 0.15510344505310059,
"learning_rate": 0.0002,
"loss": 0.5499662756919861,
"mean_token_accuracy": 0.7750896066427231,
"num_tokens": 3074684.0,
"step": 188
},
{
"entropy": 0.5515661090612411,
"epoch": 0.7052238805970149,
"grad_norm": 0.1378200650215149,
"learning_rate": 0.0002,
"loss": 0.5564606189727783,
"mean_token_accuracy": 0.7740965932607651,
"num_tokens": 3091070.0,
"step": 189
},
{
"entropy": 0.5522360950708389,
"epoch": 0.7089552238805971,
"grad_norm": 0.1490645706653595,
"learning_rate": 0.0002,
"loss": 0.5577459335327148,
"mean_token_accuracy": 0.7747645527124405,
"num_tokens": 3107501.0,
"step": 190
},
{
"entropy": 0.5528729557991028,
"epoch": 0.7126865671641791,
"grad_norm": 0.14538180828094482,
"learning_rate": 0.0002,
"loss": 0.5618550777435303,
"mean_token_accuracy": 0.7729964852333069,
"num_tokens": 3123822.0,
"step": 191
},
{
"entropy": 0.5486249774694443,
"epoch": 0.7164179104477612,
"grad_norm": 0.12265278398990631,
"learning_rate": 0.0002,
"loss": 0.5423588752746582,
"mean_token_accuracy": 0.7789205312728882,
"num_tokens": 3140334.0,
"step": 192
},
{
"entropy": 0.5567969381809235,
"epoch": 0.7201492537313433,
"grad_norm": 0.13273917138576508,
"learning_rate": 0.0002,
"loss": 0.5613058805465698,
"mean_token_accuracy": 0.7748401314020157,
"num_tokens": 3156490.0,
"step": 193
},
{
"entropy": 0.558370977640152,
"epoch": 0.7238805970149254,
"grad_norm": 0.1269926130771637,
"learning_rate": 0.0002,
"loss": 0.548975944519043,
"mean_token_accuracy": 0.7803195267915726,
"num_tokens": 3172917.0,
"step": 194
},
{
"entropy": 0.5645796656608582,
"epoch": 0.7276119402985075,
"grad_norm": 0.12320506572723389,
"learning_rate": 0.0002,
"loss": 0.5635199546813965,
"mean_token_accuracy": 0.773562416434288,
"num_tokens": 3189322.0,
"step": 195
},
{
"entropy": 0.5316331535577774,
"epoch": 0.7313432835820896,
"grad_norm": 0.1522948294878006,
"learning_rate": 0.0002,
"loss": 0.5410732626914978,
"mean_token_accuracy": 0.7790966629981995,
"num_tokens": 3205551.0,
"step": 196
},
{
"entropy": 0.5493949502706528,
"epoch": 0.7350746268656716,
"grad_norm": 0.119343101978302,
"learning_rate": 0.0002,
"loss": 0.5500541925430298,
"mean_token_accuracy": 0.7768760919570923,
"num_tokens": 3222029.0,
"step": 197
},
{
"entropy": 0.5477159917354584,
"epoch": 0.7388059701492538,
"grad_norm": 0.119729183614254,
"learning_rate": 0.0002,
"loss": 0.5477977991104126,
"mean_token_accuracy": 0.7788135707378387,
"num_tokens": 3238421.0,
"step": 198
},
{
"entropy": 0.5607248842716217,
"epoch": 0.7425373134328358,
"grad_norm": 0.13485661149024963,
"learning_rate": 0.0002,
"loss": 0.5701273083686829,
"mean_token_accuracy": 0.7674471586942673,
"num_tokens": 3254789.0,
"step": 199
},
{
"entropy": 0.5362051874399185,
"epoch": 0.746268656716418,
"grad_norm": 0.11599450558423996,
"learning_rate": 0.0002,
"loss": 0.5382620692253113,
"mean_token_accuracy": 0.7804013192653656,
"num_tokens": 3270902.0,
"step": 200
},
{
"entropy": 0.5385442525148392,
"epoch": 0.75,
"grad_norm": 0.11722157150506973,
"learning_rate": 0.0002,
"loss": 0.5425242185592651,
"mean_token_accuracy": 0.7779103666543961,
"num_tokens": 3287148.0,
"step": 201
},
{
"entropy": 0.5608718395233154,
"epoch": 0.753731343283582,
"grad_norm": 0.11743324995040894,
"learning_rate": 0.0002,
"loss": 0.5605480670928955,
"mean_token_accuracy": 0.7718753963708878,
"num_tokens": 3303602.0,
"step": 202
},
{
"entropy": 0.5647395998239517,
"epoch": 0.7574626865671642,
"grad_norm": 0.12360575795173645,
"learning_rate": 0.0002,
"loss": 0.565830409526825,
"mean_token_accuracy": 0.7734925150871277,
"num_tokens": 3319914.0,
"step": 203
},
{
"entropy": 0.5613357871770859,
"epoch": 0.7611940298507462,
"grad_norm": 0.12299378216266632,
"learning_rate": 0.0002,
"loss": 0.5502001643180847,
"mean_token_accuracy": 0.7780173420906067,
"num_tokens": 3336266.0,
"step": 204
},
{
"entropy": 0.5557620376348495,
"epoch": 0.7649253731343284,
"grad_norm": 0.13515423238277435,
"learning_rate": 0.0002,
"loss": 0.5513977408409119,
"mean_token_accuracy": 0.7768134474754333,
"num_tokens": 3352828.0,
"step": 205
},
{
"entropy": 0.5312158316373825,
"epoch": 0.7686567164179104,
"grad_norm": 0.1245652511715889,
"learning_rate": 0.0002,
"loss": 0.5331584215164185,
"mean_token_accuracy": 0.783508375287056,
"num_tokens": 3368900.0,
"step": 206
},
{
"entropy": 0.5540332049131393,
"epoch": 0.7723880597014925,
"grad_norm": 0.12260495871305466,
"learning_rate": 0.0002,
"loss": 0.5610563158988953,
"mean_token_accuracy": 0.772364541888237,
"num_tokens": 3385392.0,
"step": 207
},
{
"entropy": 0.5408795922994614,
"epoch": 0.7761194029850746,
"grad_norm": 0.1623620092868805,
"learning_rate": 0.0002,
"loss": 0.5433046221733093,
"mean_token_accuracy": 0.7798032164573669,
"num_tokens": 3401604.0,
"step": 208
},
{
"entropy": 0.5390565246343613,
"epoch": 0.7798507462686567,
"grad_norm": 0.13042029738426208,
"learning_rate": 0.0002,
"loss": 0.5478684902191162,
"mean_token_accuracy": 0.7792101353406906,
"num_tokens": 3417639.0,
"step": 209
},
{
"entropy": 0.5241924300789833,
"epoch": 0.7835820895522388,
"grad_norm": 0.13064046204090118,
"learning_rate": 0.0002,
"loss": 0.5299482941627502,
"mean_token_accuracy": 0.7843270599842072,
"num_tokens": 3433827.0,
"step": 210
},
{
"entropy": 0.545391634106636,
"epoch": 0.7873134328358209,
"grad_norm": 0.14404848217964172,
"learning_rate": 0.0002,
"loss": 0.539533257484436,
"mean_token_accuracy": 0.7797930389642715,
"num_tokens": 3450075.0,
"step": 211
},
{
"entropy": 0.5748691409826279,
"epoch": 0.7910447761194029,
"grad_norm": 0.13996216654777527,
"learning_rate": 0.0002,
"loss": 0.5611885786056519,
"mean_token_accuracy": 0.7745807766914368,
"num_tokens": 3466557.0,
"step": 212
},
{
"entropy": 0.5685983300209045,
"epoch": 0.7947761194029851,
"grad_norm": 0.12288983166217804,
"learning_rate": 0.0002,
"loss": 0.5553888082504272,
"mean_token_accuracy": 0.7752144187688828,
"num_tokens": 3482978.0,
"step": 213
},
{
"entropy": 0.5502497553825378,
"epoch": 0.7985074626865671,
"grad_norm": 0.12848587334156036,
"learning_rate": 0.0002,
"loss": 0.549103856086731,
"mean_token_accuracy": 0.7791820466518402,
"num_tokens": 3499378.0,
"step": 214
},
{
"entropy": 0.5424053594470024,
"epoch": 0.8022388059701493,
"grad_norm": 0.12519471347332,
"learning_rate": 0.0002,
"loss": 0.5496050119400024,
"mean_token_accuracy": 0.7755117863416672,
"num_tokens": 3515899.0,
"step": 215
},
{
"entropy": 0.5332234650850296,
"epoch": 0.8059701492537313,
"grad_norm": 0.17385068535804749,
"learning_rate": 0.0002,
"loss": 0.5551385283470154,
"mean_token_accuracy": 0.7749006897211075,
"num_tokens": 3532197.0,
"step": 216
},
{
"entropy": 0.5355218946933746,
"epoch": 0.8097014925373134,
"grad_norm": 0.1355784386396408,
"learning_rate": 0.0002,
"loss": 0.5417052507400513,
"mean_token_accuracy": 0.7785830944776535,
"num_tokens": 3548584.0,
"step": 217
},
{
"entropy": 0.545543447136879,
"epoch": 0.8134328358208955,
"grad_norm": 0.10903589427471161,
"learning_rate": 0.0002,
"loss": 0.5351961255073547,
"mean_token_accuracy": 0.7840810418128967,
"num_tokens": 3564973.0,
"step": 218
},
{
"entropy": 0.5678307712078094,
"epoch": 0.8171641791044776,
"grad_norm": 0.13619016110897064,
"learning_rate": 0.0002,
"loss": 0.5577275156974792,
"mean_token_accuracy": 0.7739268988370895,
"num_tokens": 3581436.0,
"step": 219
},
{
"entropy": 0.5452380776405334,
"epoch": 0.8208955223880597,
"grad_norm": 0.12011487782001495,
"learning_rate": 0.0002,
"loss": 0.5431472063064575,
"mean_token_accuracy": 0.7790575325489044,
"num_tokens": 3597661.0,
"step": 220
},
{
"entropy": 0.5536454021930695,
"epoch": 0.8246268656716418,
"grad_norm": 0.10391338169574738,
"learning_rate": 0.0002,
"loss": 0.5514202117919922,
"mean_token_accuracy": 0.7758155465126038,
"num_tokens": 3614221.0,
"step": 221
},
{
"entropy": 0.5350385755300522,
"epoch": 0.8283582089552238,
"grad_norm": 0.1497930884361267,
"learning_rate": 0.0002,
"loss": 0.5447626709938049,
"mean_token_accuracy": 0.778772234916687,
"num_tokens": 3630441.0,
"step": 222
},
{
"entropy": 0.5551132708787918,
"epoch": 0.832089552238806,
"grad_norm": 0.12266736477613449,
"learning_rate": 0.0002,
"loss": 0.558661937713623,
"mean_token_accuracy": 0.773910716176033,
"num_tokens": 3647039.0,
"step": 223
},
{
"entropy": 0.5643535554409027,
"epoch": 0.835820895522388,
"grad_norm": 0.11532776802778244,
"learning_rate": 0.0002,
"loss": 0.5642860531806946,
"mean_token_accuracy": 0.7725937813520432,
"num_tokens": 3663412.0,
"step": 224
},
{
"entropy": 0.5549684166908264,
"epoch": 0.8395522388059702,
"grad_norm": 0.12639960646629333,
"learning_rate": 0.0002,
"loss": 0.5532217025756836,
"mean_token_accuracy": 0.7739283442497253,
"num_tokens": 3679945.0,
"step": 225
},
{
"entropy": 0.560679629445076,
"epoch": 0.8432835820895522,
"grad_norm": 0.13600312173366547,
"learning_rate": 0.0002,
"loss": 0.5514844059944153,
"mean_token_accuracy": 0.7773452550172806,
"num_tokens": 3696613.0,
"step": 226
},
{
"entropy": 0.5458584129810333,
"epoch": 0.8470149253731343,
"grad_norm": 0.10419101268053055,
"learning_rate": 0.0002,
"loss": 0.5424168109893799,
"mean_token_accuracy": 0.7833174467086792,
"num_tokens": 3713158.0,
"step": 227
},
{
"entropy": 0.542242094874382,
"epoch": 0.8507462686567164,
"grad_norm": 0.1483229100704193,
"learning_rate": 0.0002,
"loss": 0.5505244731903076,
"mean_token_accuracy": 0.7768149822950363,
"num_tokens": 3729484.0,
"step": 228
},
{
"entropy": 0.5342283248901367,
"epoch": 0.8544776119402985,
"grad_norm": 0.16167280077934265,
"learning_rate": 0.0002,
"loss": 0.5423468947410583,
"mean_token_accuracy": 0.781244620680809,
"num_tokens": 3745710.0,
"step": 229
},
{
"entropy": 0.5557206273078918,
"epoch": 0.8582089552238806,
"grad_norm": 0.10992418974637985,
"learning_rate": 0.0002,
"loss": 0.5555332899093628,
"mean_token_accuracy": 0.7740505337715149,
"num_tokens": 3761974.0,
"step": 230
},
{
"entropy": 0.5301929265260696,
"epoch": 0.8619402985074627,
"grad_norm": 0.20067644119262695,
"learning_rate": 0.0002,
"loss": 0.5325175523757935,
"mean_token_accuracy": 0.7839723825454712,
"num_tokens": 3777980.0,
"step": 231
},
{
"entropy": 0.5519733354449272,
"epoch": 0.8656716417910447,
"grad_norm": 0.11584831774234772,
"learning_rate": 0.0002,
"loss": 0.547998309135437,
"mean_token_accuracy": 0.7752280086278915,
"num_tokens": 3794210.0,
"step": 232
},
{
"entropy": 0.5573844611644745,
"epoch": 0.8694029850746269,
"grad_norm": 0.14681567251682281,
"learning_rate": 0.0002,
"loss": 0.5630576014518738,
"mean_token_accuracy": 0.7713348120450974,
"num_tokens": 3810625.0,
"step": 233
},
{
"entropy": 0.5614193379878998,
"epoch": 0.8731343283582089,
"grad_norm": 0.3717029392719269,
"learning_rate": 0.0002,
"loss": 0.5614831447601318,
"mean_token_accuracy": 0.7718814015388489,
"num_tokens": 3826871.0,
"step": 234
},
{
"entropy": 0.5552587062120438,
"epoch": 0.8768656716417911,
"grad_norm": 0.1315956562757492,
"learning_rate": 0.0002,
"loss": 0.5541540384292603,
"mean_token_accuracy": 0.7746177315711975,
"num_tokens": 3843187.0,
"step": 235
},
{
"entropy": 0.5387386232614517,
"epoch": 0.8805970149253731,
"grad_norm": 0.4729621112346649,
"learning_rate": 0.0002,
"loss": 0.5513001084327698,
"mean_token_accuracy": 0.777639240026474,
"num_tokens": 3859659.0,
"step": 236
},
{
"entropy": 0.5589011460542679,
"epoch": 0.8843283582089553,
"grad_norm": 0.11313692480325699,
"learning_rate": 0.0002,
"loss": 0.550857424736023,
"mean_token_accuracy": 0.7776817381381989,
"num_tokens": 3876082.0,
"step": 237
},
{
"entropy": 0.5506832748651505,
"epoch": 0.8880597014925373,
"grad_norm": 0.15838703513145447,
"learning_rate": 0.0002,
"loss": 0.5493965148925781,
"mean_token_accuracy": 0.774595633149147,
"num_tokens": 3892310.0,
"step": 238
},
{
"entropy": 0.5482196658849716,
"epoch": 0.8917910447761194,
"grad_norm": 0.16354775428771973,
"learning_rate": 0.0002,
"loss": 0.549696147441864,
"mean_token_accuracy": 0.7784011512994766,
"num_tokens": 3908561.0,
"step": 239
},
{
"entropy": 0.5474406778812408,
"epoch": 0.8955223880597015,
"grad_norm": 0.11488547921180725,
"learning_rate": 0.0002,
"loss": 0.5442180037498474,
"mean_token_accuracy": 0.7787186056375504,
"num_tokens": 3924971.0,
"step": 240
},
{
"entropy": 0.5576506555080414,
"epoch": 0.8992537313432836,
"grad_norm": 0.11725704371929169,
"learning_rate": 0.0002,
"loss": 0.5556765794754028,
"mean_token_accuracy": 0.7754130512475967,
"num_tokens": 3941384.0,
"step": 241
},
{
"entropy": 0.5686157792806625,
"epoch": 0.9029850746268657,
"grad_norm": 0.1209690198302269,
"learning_rate": 0.0002,
"loss": 0.5740119218826294,
"mean_token_accuracy": 0.7644337117671967,
"num_tokens": 3957527.0,
"step": 242
},
{
"entropy": 0.5520821809768677,
"epoch": 0.9067164179104478,
"grad_norm": 0.1097254753112793,
"learning_rate": 0.0002,
"loss": 0.5524159669876099,
"mean_token_accuracy": 0.7778758704662323,
"num_tokens": 3973803.0,
"step": 243
},
{
"entropy": 0.5603332817554474,
"epoch": 0.9104477611940298,
"grad_norm": 0.13421349227428436,
"learning_rate": 0.0002,
"loss": 0.5633103251457214,
"mean_token_accuracy": 0.7723569422960281,
"num_tokens": 3990124.0,
"step": 244
},
{
"entropy": 0.5404402911663055,
"epoch": 0.914179104477612,
"grad_norm": 0.12017542868852615,
"learning_rate": 0.0002,
"loss": 0.5424325466156006,
"mean_token_accuracy": 0.7823856174945831,
"num_tokens": 4006560.0,
"step": 245
},
{
"entropy": 0.5605191737413406,
"epoch": 0.917910447761194,
"grad_norm": 0.14128640294075012,
"learning_rate": 0.0002,
"loss": 0.5602733492851257,
"mean_token_accuracy": 0.7735545933246613,
"num_tokens": 4022966.0,
"step": 246
},
{
"entropy": 0.5599958896636963,
"epoch": 0.9216417910447762,
"grad_norm": 0.11880706995725632,
"learning_rate": 0.0002,
"loss": 0.5598034858703613,
"mean_token_accuracy": 0.7717109471559525,
"num_tokens": 4039261.0,
"step": 247
},
{
"entropy": 0.5408921539783478,
"epoch": 0.9253731343283582,
"grad_norm": 0.12040922045707703,
"learning_rate": 0.0002,
"loss": 0.5460969805717468,
"mean_token_accuracy": 0.7793735712766647,
"num_tokens": 4055343.0,
"step": 248
},
{
"entropy": 0.5573666542768478,
"epoch": 0.9291044776119403,
"grad_norm": 0.12093377858400345,
"learning_rate": 0.0002,
"loss": 0.556143045425415,
"mean_token_accuracy": 0.7752596288919449,
"num_tokens": 4071770.0,
"step": 249
},
{
"entropy": 0.563015878200531,
"epoch": 0.9328358208955224,
"grad_norm": 0.11447741836309433,
"learning_rate": 0.0002,
"loss": 0.5647203922271729,
"mean_token_accuracy": 0.7692370861768723,
"num_tokens": 4088034.0,
"step": 250
},
{
"entropy": 0.548077866435051,
"epoch": 0.9365671641791045,
"grad_norm": 0.11981664597988129,
"learning_rate": 0.0002,
"loss": 0.5454928278923035,
"mean_token_accuracy": 0.7787458151578903,
"num_tokens": 4104196.0,
"step": 251
},
{
"entropy": 0.5375737547874451,
"epoch": 0.9402985074626866,
"grad_norm": 0.12071040272712708,
"learning_rate": 0.0002,
"loss": 0.5404340028762817,
"mean_token_accuracy": 0.7799674719572067,
"num_tokens": 4120470.0,
"step": 252
},
{
"entropy": 0.547912061214447,
"epoch": 0.9440298507462687,
"grad_norm": 0.12739375233650208,
"learning_rate": 0.0002,
"loss": 0.5530076026916504,
"mean_token_accuracy": 0.7753598988056183,
"num_tokens": 4136885.0,
"step": 253
},
{
"entropy": 0.5538879930973053,
"epoch": 0.9477611940298507,
"grad_norm": 0.12144653499126434,
"learning_rate": 0.0002,
"loss": 0.5514186024665833,
"mean_token_accuracy": 0.7753842920064926,
"num_tokens": 4153216.0,
"step": 254
},
{
"entropy": 0.5411302447319031,
"epoch": 0.9514925373134329,
"grad_norm": 0.11099912226200104,
"learning_rate": 0.0002,
"loss": 0.5385034084320068,
"mean_token_accuracy": 0.7812628000974655,
"num_tokens": 4169402.0,
"step": 255
},
{
"entropy": 0.5564829558134079,
"epoch": 0.9552238805970149,
"grad_norm": 0.12310667335987091,
"learning_rate": 0.0002,
"loss": 0.5534285306930542,
"mean_token_accuracy": 0.7745526880025864,
"num_tokens": 4185847.0,
"step": 256
},
{
"entropy": 0.5459543019533157,
"epoch": 0.9589552238805971,
"grad_norm": 0.1408655047416687,
"learning_rate": 0.0002,
"loss": 0.539636492729187,
"mean_token_accuracy": 0.7817695140838623,
"num_tokens": 4202324.0,
"step": 257
},
{
"entropy": 0.5483512580394745,
"epoch": 0.9626865671641791,
"grad_norm": 0.1329817920923233,
"learning_rate": 0.0002,
"loss": 0.5545552968978882,
"mean_token_accuracy": 0.7754471302032471,
"num_tokens": 4218485.0,
"step": 258
},
{
"entropy": 0.5507388859987259,
"epoch": 0.9664179104477612,
"grad_norm": 0.14522868394851685,
"learning_rate": 0.0002,
"loss": 0.5539411306381226,
"mean_token_accuracy": 0.776690736413002,
"num_tokens": 4234830.0,
"step": 259
},
{
"entropy": 0.5551155656576157,
"epoch": 0.9701492537313433,
"grad_norm": 0.1110503152012825,
"learning_rate": 0.0002,
"loss": 0.5517114996910095,
"mean_token_accuracy": 0.7778125107288361,
"num_tokens": 4251249.0,
"step": 260
},
{
"entropy": 0.5606275051832199,
"epoch": 0.9738805970149254,
"grad_norm": 0.11907053738832474,
"learning_rate": 0.0002,
"loss": 0.5583968162536621,
"mean_token_accuracy": 0.7729120701551437,
"num_tokens": 4267571.0,
"step": 261
},
{
"entropy": 0.5697215348482132,
"epoch": 0.9776119402985075,
"grad_norm": 0.11226138472557068,
"learning_rate": 0.0002,
"loss": 0.5654243230819702,
"mean_token_accuracy": 0.7697847783565521,
"num_tokens": 4283938.0,
"step": 262
},
{
"entropy": 0.5614341050386429,
"epoch": 0.9813432835820896,
"grad_norm": 0.12085731327533722,
"learning_rate": 0.0002,
"loss": 0.5629435777664185,
"mean_token_accuracy": 0.7714052200317383,
"num_tokens": 4300727.0,
"step": 263
},
{
"entropy": 0.5495717078447342,
"epoch": 0.9850746268656716,
"grad_norm": 0.1363348811864853,
"learning_rate": 0.0002,
"loss": 0.5549257397651672,
"mean_token_accuracy": 0.7735868841409683,
"num_tokens": 4316903.0,
"step": 264
},
{
"entropy": 0.5352297425270081,
"epoch": 0.9888059701492538,
"grad_norm": 0.1429988294839859,
"learning_rate": 0.0002,
"loss": 0.5460555553436279,
"mean_token_accuracy": 0.7814377993345261,
"num_tokens": 4333143.0,
"step": 265
},
{
"entropy": 0.5603132396936417,
"epoch": 0.9925373134328358,
"grad_norm": 0.14986178278923035,
"learning_rate": 0.0002,
"loss": 0.5551425218582153,
"mean_token_accuracy": 0.7773159593343735,
"num_tokens": 4349576.0,
"step": 266
},
{
"entropy": 0.5535064339637756,
"epoch": 0.996268656716418,
"grad_norm": 0.1105998232960701,
"learning_rate": 0.0002,
"loss": 0.5442855954170227,
"mean_token_accuracy": 0.7821661084890366,
"num_tokens": 4365977.0,
"step": 267
},
{
"entropy": 0.5614945888519287,
"epoch": 1.0,
"grad_norm": 0.12907235324382782,
"learning_rate": 0.0002,
"loss": 0.5476444959640503,
"mean_token_accuracy": 0.7792651057243347,
"num_tokens": 4382526.0,
"step": 268
},
{
"entropy": 0.548059806227684,
"epoch": 1.0037313432835822,
"grad_norm": 0.12145893275737762,
"learning_rate": 0.0002,
"loss": 0.5402656197547913,
"mean_token_accuracy": 0.7813442945480347,
"num_tokens": 4399005.0,
"step": 269
},
{
"entropy": 0.5212839543819427,
"epoch": 1.007462686567164,
"grad_norm": 0.1396404206752777,
"learning_rate": 0.0002,
"loss": 0.5315491557121277,
"mean_token_accuracy": 0.7839601635932922,
"num_tokens": 4415205.0,
"step": 270
},
{
"entropy": 0.5132785737514496,
"epoch": 1.0111940298507462,
"grad_norm": 0.1433689296245575,
"learning_rate": 0.0002,
"loss": 0.5299959778785706,
"mean_token_accuracy": 0.7853466272354126,
"num_tokens": 4431512.0,
"step": 271
},
{
"entropy": 0.5394517332315445,
"epoch": 1.0149253731343284,
"grad_norm": 0.11504881829023361,
"learning_rate": 0.0002,
"loss": 0.5439318418502808,
"mean_token_accuracy": 0.7786544561386108,
"num_tokens": 4447878.0,
"step": 272
},
{
"entropy": 0.5173204094171524,
"epoch": 1.0186567164179103,
"grad_norm": 0.12369395047426224,
"learning_rate": 0.0002,
"loss": 0.525097668170929,
"mean_token_accuracy": 0.7878104597330093,
"num_tokens": 4464069.0,
"step": 273
},
{
"entropy": 0.5443273782730103,
"epoch": 1.0223880597014925,
"grad_norm": 0.12611854076385498,
"learning_rate": 0.0002,
"loss": 0.5425093770027161,
"mean_token_accuracy": 0.7833482921123505,
"num_tokens": 4480510.0,
"step": 274
},
{
"entropy": 0.5319035351276398,
"epoch": 1.0261194029850746,
"grad_norm": 0.11637023091316223,
"learning_rate": 0.0002,
"loss": 0.5231828093528748,
"mean_token_accuracy": 0.788045197725296,
"num_tokens": 4496734.0,
"step": 275
},
{
"entropy": 0.5645869076251984,
"epoch": 1.0298507462686568,
"grad_norm": 0.11970556527376175,
"learning_rate": 0.0002,
"loss": 0.556399405002594,
"mean_token_accuracy": 0.7753234058618546,
"num_tokens": 4513272.0,
"step": 276
},
{
"entropy": 0.5412048548460007,
"epoch": 1.0335820895522387,
"grad_norm": 0.12889669835567474,
"learning_rate": 0.0002,
"loss": 0.5352495908737183,
"mean_token_accuracy": 0.7822704613208771,
"num_tokens": 4529760.0,
"step": 277
},
{
"entropy": 0.5433377772569656,
"epoch": 1.037313432835821,
"grad_norm": 0.15610089898109436,
"learning_rate": 0.0002,
"loss": 0.5424712896347046,
"mean_token_accuracy": 0.7791996449232101,
"num_tokens": 4546065.0,
"step": 278
},
{
"entropy": 0.5367715954780579,
"epoch": 1.041044776119403,
"grad_norm": 0.1712978631258011,
"learning_rate": 0.0002,
"loss": 0.5500761270523071,
"mean_token_accuracy": 0.7774211019277573,
"num_tokens": 4562404.0,
"step": 279
},
{
"entropy": 0.5348818898200989,
"epoch": 1.044776119402985,
"grad_norm": 0.14415498077869415,
"learning_rate": 0.0002,
"loss": 0.5458697080612183,
"mean_token_accuracy": 0.7776882946491241,
"num_tokens": 4578594.0,
"step": 280
},
{
"entropy": 0.5394753366708755,
"epoch": 1.0485074626865671,
"grad_norm": 0.17060807347297668,
"learning_rate": 0.0002,
"loss": 0.5428628921508789,
"mean_token_accuracy": 0.7797123193740845,
"num_tokens": 4594918.0,
"step": 281
},
{
"entropy": 0.5477339029312134,
"epoch": 1.0522388059701493,
"grad_norm": 0.12646426260471344,
"learning_rate": 0.0002,
"loss": 0.5376375913619995,
"mean_token_accuracy": 0.7846843749284744,
"num_tokens": 4611225.0,
"step": 282
},
{
"entropy": 0.553899347782135,
"epoch": 1.0559701492537314,
"grad_norm": 0.14560198783874512,
"learning_rate": 0.0002,
"loss": 0.5442871451377869,
"mean_token_accuracy": 0.779757484793663,
"num_tokens": 4627515.0,
"step": 283
},
{
"entropy": 0.544152095913887,
"epoch": 1.0597014925373134,
"grad_norm": 0.14532814919948578,
"learning_rate": 0.0002,
"loss": 0.5495354533195496,
"mean_token_accuracy": 0.7756282091140747,
"num_tokens": 4644151.0,
"step": 284
},
{
"entropy": 0.5467684864997864,
"epoch": 1.0634328358208955,
"grad_norm": 0.14399303495883942,
"learning_rate": 0.0002,
"loss": 0.5551741123199463,
"mean_token_accuracy": 0.7747452855110168,
"num_tokens": 4660349.0,
"step": 285
},
{
"entropy": 0.5328090041875839,
"epoch": 1.0671641791044777,
"grad_norm": 0.1490914672613144,
"learning_rate": 0.0002,
"loss": 0.5371617674827576,
"mean_token_accuracy": 0.7852603644132614,
"num_tokens": 4676682.0,
"step": 286
},
{
"entropy": 0.5549953877925873,
"epoch": 1.0708955223880596,
"grad_norm": 0.13986609876155853,
"learning_rate": 0.0002,
"loss": 0.5485588312149048,
"mean_token_accuracy": 0.7786588221788406,
"num_tokens": 4693087.0,
"step": 287
},
{
"entropy": 0.5441232770681381,
"epoch": 1.0746268656716418,
"grad_norm": 0.13744987547397614,
"learning_rate": 0.0002,
"loss": 0.5352811813354492,
"mean_token_accuracy": 0.7830296456813812,
"num_tokens": 4709482.0,
"step": 288
},
{
"entropy": 0.5388935655355453,
"epoch": 1.078358208955224,
"grad_norm": 0.12793688476085663,
"learning_rate": 0.0002,
"loss": 0.5364757776260376,
"mean_token_accuracy": 0.780993863940239,
"num_tokens": 4725929.0,
"step": 289
},
{
"entropy": 0.5281359702348709,
"epoch": 1.0820895522388059,
"grad_norm": 0.11734890192747116,
"learning_rate": 0.0002,
"loss": 0.5293084979057312,
"mean_token_accuracy": 0.7876105159521103,
"num_tokens": 4742317.0,
"step": 290
},
{
"entropy": 0.5459820628166199,
"epoch": 1.085820895522388,
"grad_norm": 0.12839624285697937,
"learning_rate": 0.0002,
"loss": 0.5461269617080688,
"mean_token_accuracy": 0.7763439863920212,
"num_tokens": 4758682.0,
"step": 291
},
{
"entropy": 0.5111119300127029,
"epoch": 1.0895522388059702,
"grad_norm": 0.1377914845943451,
"learning_rate": 0.0002,
"loss": 0.5165018439292908,
"mean_token_accuracy": 0.792814165353775,
"num_tokens": 4775165.0,
"step": 292
},
{
"entropy": 0.5256515890359879,
"epoch": 1.0932835820895523,
"grad_norm": 0.13310879468917847,
"learning_rate": 0.0002,
"loss": 0.5263264179229736,
"mean_token_accuracy": 0.7891132682561874,
"num_tokens": 4791249.0,
"step": 293
},
{
"entropy": 0.5361033976078033,
"epoch": 1.0970149253731343,
"grad_norm": 0.11920680850744247,
"learning_rate": 0.0002,
"loss": 0.5344924926757812,
"mean_token_accuracy": 0.7844657897949219,
"num_tokens": 4807722.0,
"step": 294
},
{
"entropy": 0.547529011964798,
"epoch": 1.1007462686567164,
"grad_norm": 0.15012222528457642,
"learning_rate": 0.0002,
"loss": 0.5434770584106445,
"mean_token_accuracy": 0.7794990837574005,
"num_tokens": 4824221.0,
"step": 295
},
{
"entropy": 0.5387088805437088,
"epoch": 1.1044776119402986,
"grad_norm": 0.11607323586940765,
"learning_rate": 0.0002,
"loss": 0.5379114151000977,
"mean_token_accuracy": 0.7820580452680588,
"num_tokens": 4840561.0,
"step": 296
},
{
"entropy": 0.5285296589136124,
"epoch": 1.1082089552238805,
"grad_norm": 0.16472671926021576,
"learning_rate": 0.0002,
"loss": 0.5286039710044861,
"mean_token_accuracy": 0.7859488725662231,
"num_tokens": 4856739.0,
"step": 297
},
{
"entropy": 0.5467464625835419,
"epoch": 1.1119402985074627,
"grad_norm": 0.12136011570692062,
"learning_rate": 0.0002,
"loss": 0.5486158132553101,
"mean_token_accuracy": 0.7766989320516586,
"num_tokens": 4873254.0,
"step": 298
},
{
"entropy": 0.5323450714349747,
"epoch": 1.1156716417910448,
"grad_norm": 0.15763746201992035,
"learning_rate": 0.0002,
"loss": 0.53644198179245,
"mean_token_accuracy": 0.7847353965044022,
"num_tokens": 4889763.0,
"step": 299
},
{
"entropy": 0.5294622331857681,
"epoch": 1.1194029850746268,
"grad_norm": 0.14253245294094086,
"learning_rate": 0.0002,
"loss": 0.5327939987182617,
"mean_token_accuracy": 0.7873322665691376,
"num_tokens": 4905780.0,
"step": 300
},
{
"entropy": 0.5500210523605347,
"epoch": 1.123134328358209,
"grad_norm": 0.1611548215150833,
"learning_rate": 0.0002,
"loss": 0.55262291431427,
"mean_token_accuracy": 0.7771656811237335,
"num_tokens": 4921935.0,
"step": 301
},
{
"entropy": 0.5608504116535187,
"epoch": 1.126865671641791,
"grad_norm": 0.14609341323375702,
"learning_rate": 0.0002,
"loss": 0.5597085952758789,
"mean_token_accuracy": 0.773489698767662,
"num_tokens": 4938566.0,
"step": 302
},
{
"entropy": 0.541571170091629,
"epoch": 1.1305970149253732,
"grad_norm": 0.11906211823225021,
"learning_rate": 0.0002,
"loss": 0.541067361831665,
"mean_token_accuracy": 0.7795013040304184,
"num_tokens": 4954995.0,
"step": 303
},
{
"entropy": 0.5374023020267487,
"epoch": 1.1343283582089552,
"grad_norm": 0.191620334982872,
"learning_rate": 0.0002,
"loss": 0.540854811668396,
"mean_token_accuracy": 0.783530056476593,
"num_tokens": 4971285.0,
"step": 304
},
{
"entropy": 0.5237517058849335,
"epoch": 1.1380597014925373,
"grad_norm": 0.13355116546154022,
"learning_rate": 0.0002,
"loss": 0.5256230235099792,
"mean_token_accuracy": 0.7869999557733536,
"num_tokens": 4987629.0,
"step": 305
},
{
"entropy": 0.5161513015627861,
"epoch": 1.1417910447761195,
"grad_norm": 0.14180561900138855,
"learning_rate": 0.0002,
"loss": 0.5189639925956726,
"mean_token_accuracy": 0.7884562611579895,
"num_tokens": 5003816.0,
"step": 306
},
{
"entropy": 0.5333078503608704,
"epoch": 1.1455223880597014,
"grad_norm": 0.11995179206132889,
"learning_rate": 0.0002,
"loss": 0.5338060259819031,
"mean_token_accuracy": 0.7834619730710983,
"num_tokens": 5020179.0,
"step": 307
},
{
"entropy": 0.5374015420675278,
"epoch": 1.1492537313432836,
"grad_norm": 0.14065897464752197,
"learning_rate": 0.0002,
"loss": 0.541375994682312,
"mean_token_accuracy": 0.7836798280477524,
"num_tokens": 5036421.0,
"step": 308
},
{
"entropy": 0.5318789333105087,
"epoch": 1.1529850746268657,
"grad_norm": 0.15007704496383667,
"learning_rate": 0.0002,
"loss": 0.5320872664451599,
"mean_token_accuracy": 0.7854835838079453,
"num_tokens": 5052767.0,
"step": 309
},
{
"entropy": 0.5555961728096008,
"epoch": 1.1567164179104479,
"grad_norm": 0.12327966094017029,
"learning_rate": 0.0002,
"loss": 0.5514441728591919,
"mean_token_accuracy": 0.775398313999176,
"num_tokens": 5069219.0,
"step": 310
},
{
"entropy": 0.5369515269994736,
"epoch": 1.1604477611940298,
"grad_norm": 0.13790592551231384,
"learning_rate": 0.0002,
"loss": 0.5307064652442932,
"mean_token_accuracy": 0.7870743423700333,
"num_tokens": 5085637.0,
"step": 311
},
{
"entropy": 0.5395635664463043,
"epoch": 1.164179104477612,
"grad_norm": 0.12657856941223145,
"learning_rate": 0.0002,
"loss": 0.539893388748169,
"mean_token_accuracy": 0.7809743881225586,
"num_tokens": 5101984.0,
"step": 312
},
{
"entropy": 0.5528725534677505,
"epoch": 1.1679104477611941,
"grad_norm": 0.15744967758655548,
"learning_rate": 0.0002,
"loss": 0.5551643967628479,
"mean_token_accuracy": 0.7749461233615875,
"num_tokens": 5118457.0,
"step": 313
},
{
"entropy": 0.5547244101762772,
"epoch": 1.171641791044776,
"grad_norm": 0.14667753875255585,
"learning_rate": 0.0002,
"loss": 0.5545704364776611,
"mean_token_accuracy": 0.7767890095710754,
"num_tokens": 5135070.0,
"step": 314
},
{
"entropy": 0.5513405501842499,
"epoch": 1.1753731343283582,
"grad_norm": 0.13363401591777802,
"learning_rate": 0.0002,
"loss": 0.5478935241699219,
"mean_token_accuracy": 0.7782707363367081,
"num_tokens": 5151457.0,
"step": 315
},
{
"entropy": 0.5504343062639236,
"epoch": 1.1791044776119404,
"grad_norm": 0.14427515864372253,
"learning_rate": 0.0002,
"loss": 0.5503411293029785,
"mean_token_accuracy": 0.7759760916233063,
"num_tokens": 5167918.0,
"step": 316
},
{
"entropy": 0.5411941558122635,
"epoch": 1.1828358208955223,
"grad_norm": 0.13475076854228973,
"learning_rate": 0.0002,
"loss": 0.5334619283676147,
"mean_token_accuracy": 0.7848760634660721,
"num_tokens": 5184250.0,
"step": 317
},
{
"entropy": 0.5534447133541107,
"epoch": 1.1865671641791045,
"grad_norm": 0.14666007459163666,
"learning_rate": 0.0002,
"loss": 0.5606579184532166,
"mean_token_accuracy": 0.7732094079256058,
"num_tokens": 5200728.0,
"step": 318
},
{
"entropy": 0.5172414779663086,
"epoch": 1.1902985074626866,
"grad_norm": 0.1494058072566986,
"learning_rate": 0.0002,
"loss": 0.5262372493743896,
"mean_token_accuracy": 0.787101224064827,
"num_tokens": 5216948.0,
"step": 319
},
{
"entropy": 0.5277577340602875,
"epoch": 1.1940298507462686,
"grad_norm": 0.15135720372200012,
"learning_rate": 0.0002,
"loss": 0.5401796102523804,
"mean_token_accuracy": 0.7809148728847504,
"num_tokens": 5233422.0,
"step": 320
},
{
"entropy": 0.5246866941452026,
"epoch": 1.1977611940298507,
"grad_norm": 0.12589603662490845,
"learning_rate": 0.0002,
"loss": 0.5281919836997986,
"mean_token_accuracy": 0.7868399173021317,
"num_tokens": 5249730.0,
"step": 321
},
{
"entropy": 0.5274995267391205,
"epoch": 1.2014925373134329,
"grad_norm": 0.11834204196929932,
"learning_rate": 0.0002,
"loss": 0.5278512835502625,
"mean_token_accuracy": 0.7852350920438766,
"num_tokens": 5266115.0,
"step": 322
},
{
"entropy": 0.5320824682712555,
"epoch": 1.205223880597015,
"grad_norm": 0.13883750140666962,
"learning_rate": 0.0002,
"loss": 0.5280960202217102,
"mean_token_accuracy": 0.7858837693929672,
"num_tokens": 5282462.0,
"step": 323
},
{
"entropy": 0.5404033660888672,
"epoch": 1.208955223880597,
"grad_norm": 0.13842950761318207,
"learning_rate": 0.0002,
"loss": 0.5391522645950317,
"mean_token_accuracy": 0.7815057188272476,
"num_tokens": 5299103.0,
"step": 324
},
{
"entropy": 0.5260981917381287,
"epoch": 1.212686567164179,
"grad_norm": 0.14888468384742737,
"learning_rate": 0.0002,
"loss": 0.5250783562660217,
"mean_token_accuracy": 0.7861860394477844,
"num_tokens": 5315339.0,
"step": 325
},
{
"entropy": 0.5244043916463852,
"epoch": 1.2164179104477613,
"grad_norm": 0.12871688604354858,
"learning_rate": 0.0002,
"loss": 0.5234277844429016,
"mean_token_accuracy": 0.787299633026123,
"num_tokens": 5331854.0,
"step": 326
},
{
"entropy": 0.5336845368146896,
"epoch": 1.2201492537313432,
"grad_norm": 0.1279512345790863,
"learning_rate": 0.0002,
"loss": 0.5357816815376282,
"mean_token_accuracy": 0.7811597734689713,
"num_tokens": 5348268.0,
"step": 327
},
{
"entropy": 0.5396746844053268,
"epoch": 1.2238805970149254,
"grad_norm": 0.1272435188293457,
"learning_rate": 0.0002,
"loss": 0.5367811322212219,
"mean_token_accuracy": 0.7815662026405334,
"num_tokens": 5364832.0,
"step": 328
},
{
"entropy": 0.5355321317911148,
"epoch": 1.2276119402985075,
"grad_norm": 0.12457006424665451,
"learning_rate": 0.0002,
"loss": 0.5324679613113403,
"mean_token_accuracy": 0.7855342030525208,
"num_tokens": 5381181.0,
"step": 329
},
{
"entropy": 0.5404689311981201,
"epoch": 1.2313432835820897,
"grad_norm": 0.1616295725107193,
"learning_rate": 0.0002,
"loss": 0.5461254715919495,
"mean_token_accuracy": 0.7793011963367462,
"num_tokens": 5397689.0,
"step": 330
},
{
"entropy": 0.5573465675115585,
"epoch": 1.2350746268656716,
"grad_norm": 0.1567206233739853,
"learning_rate": 0.0002,
"loss": 0.5680751204490662,
"mean_token_accuracy": 0.7683437466621399,
"num_tokens": 5414063.0,
"step": 331
},
{
"entropy": 0.5585090219974518,
"epoch": 1.2388059701492538,
"grad_norm": 0.13362006843090057,
"learning_rate": 0.0002,
"loss": 0.5544182658195496,
"mean_token_accuracy": 0.7759232968091965,
"num_tokens": 5430545.0,
"step": 332
},
{
"entropy": 0.5479722023010254,
"epoch": 1.242537313432836,
"grad_norm": 0.16734908521175385,
"learning_rate": 0.0002,
"loss": 0.5447990298271179,
"mean_token_accuracy": 0.7797949612140656,
"num_tokens": 5446700.0,
"step": 333
},
{
"entropy": 0.5607796311378479,
"epoch": 1.2462686567164178,
"grad_norm": 0.1450573354959488,
"learning_rate": 0.0002,
"loss": 0.556632936000824,
"mean_token_accuracy": 0.7769130021333694,
"num_tokens": 5463137.0,
"step": 334
},
{
"entropy": 0.5538843423128128,
"epoch": 1.25,
"grad_norm": 0.12896743416786194,
"learning_rate": 0.0002,
"loss": 0.5562998056411743,
"mean_token_accuracy": 0.7745624631643295,
"num_tokens": 5479659.0,
"step": 335
},
{
"entropy": 0.5309284329414368,
"epoch": 1.2537313432835822,
"grad_norm": 0.1323668360710144,
"learning_rate": 0.0002,
"loss": 0.5389367341995239,
"mean_token_accuracy": 0.7794619351625443,
"num_tokens": 5495884.0,
"step": 336
},
{
"entropy": 0.5279457420110703,
"epoch": 1.2574626865671643,
"grad_norm": 0.16464678943157196,
"learning_rate": 0.0002,
"loss": 0.540420413017273,
"mean_token_accuracy": 0.7797137498855591,
"num_tokens": 5512288.0,
"step": 337
},
{
"entropy": 0.5431296676397324,
"epoch": 1.2611940298507462,
"grad_norm": 0.15366457402706146,
"learning_rate": 0.0002,
"loss": 0.5533568263053894,
"mean_token_accuracy": 0.7777420580387115,
"num_tokens": 5528739.0,
"step": 338
},
{
"entropy": 0.5533888936042786,
"epoch": 1.2649253731343284,
"grad_norm": 0.15439164638519287,
"learning_rate": 0.0002,
"loss": 0.5407285690307617,
"mean_token_accuracy": 0.7848910838365555,
"num_tokens": 5545180.0,
"step": 339
},
{
"entropy": 0.5363039374351501,
"epoch": 1.2686567164179103,
"grad_norm": 0.14024227857589722,
"learning_rate": 0.0002,
"loss": 0.5247921943664551,
"mean_token_accuracy": 0.7866441905498505,
"num_tokens": 5561365.0,
"step": 340
},
{
"entropy": 0.5282331109046936,
"epoch": 1.2723880597014925,
"grad_norm": 0.15727277100086212,
"learning_rate": 0.0002,
"loss": 0.5256697535514832,
"mean_token_accuracy": 0.7857891470193863,
"num_tokens": 5577609.0,
"step": 341
},
{
"entropy": 0.5532326549291611,
"epoch": 1.2761194029850746,
"grad_norm": 0.14312665164470673,
"learning_rate": 0.0002,
"loss": 0.5558714866638184,
"mean_token_accuracy": 0.776502713561058,
"num_tokens": 5593922.0,
"step": 342
},
{
"entropy": 0.5117308422923088,
"epoch": 1.2798507462686568,
"grad_norm": 0.13982926309108734,
"learning_rate": 0.0002,
"loss": 0.5216178894042969,
"mean_token_accuracy": 0.7898732572793961,
"num_tokens": 5610160.0,
"step": 343
},
{
"entropy": 0.5327529311180115,
"epoch": 1.2835820895522387,
"grad_norm": 0.1600239872932434,
"learning_rate": 0.0002,
"loss": 0.54588383436203,
"mean_token_accuracy": 0.7827021777629852,
"num_tokens": 5626483.0,
"step": 344
},
{
"entropy": 0.5456168502569199,
"epoch": 1.287313432835821,
"grad_norm": 0.1314232498407364,
"learning_rate": 0.0002,
"loss": 0.5445138216018677,
"mean_token_accuracy": 0.7821621298789978,
"num_tokens": 5642705.0,
"step": 345
},
{
"entropy": 0.5568868517875671,
"epoch": 1.291044776119403,
"grad_norm": 0.12736710906028748,
"learning_rate": 0.0002,
"loss": 0.5469453930854797,
"mean_token_accuracy": 0.7784760594367981,
"num_tokens": 5659144.0,
"step": 346
},
{
"entropy": 0.5525211989879608,
"epoch": 1.294776119402985,
"grad_norm": 0.11654646694660187,
"learning_rate": 0.0002,
"loss": 0.542698323726654,
"mean_token_accuracy": 0.7785234600305557,
"num_tokens": 5675452.0,
"step": 347
},
{
"entropy": 0.5460808724164963,
"epoch": 1.2985074626865671,
"grad_norm": 0.1318521350622177,
"learning_rate": 0.0002,
"loss": 0.5390938520431519,
"mean_token_accuracy": 0.7815311253070831,
"num_tokens": 5691735.0,
"step": 348
},
{
"entropy": 0.5437112301588058,
"epoch": 1.3022388059701493,
"grad_norm": 0.13485990464687347,
"learning_rate": 0.0002,
"loss": 0.5420966148376465,
"mean_token_accuracy": 0.7827932983636856,
"num_tokens": 5708102.0,
"step": 349
},
{
"entropy": 0.5493648052215576,
"epoch": 1.3059701492537314,
"grad_norm": 0.14354610443115234,
"learning_rate": 0.0002,
"loss": 0.5561747550964355,
"mean_token_accuracy": 0.7761517763137817,
"num_tokens": 5724350.0,
"step": 350
},
{
"entropy": 0.5344854891300201,
"epoch": 1.3097014925373134,
"grad_norm": 0.15943452715873718,
"learning_rate": 0.0002,
"loss": 0.5391569137573242,
"mean_token_accuracy": 0.7805770933628082,
"num_tokens": 5740954.0,
"step": 351
},
{
"entropy": 0.5242450833320618,
"epoch": 1.3134328358208955,
"grad_norm": 0.13654360175132751,
"learning_rate": 0.0002,
"loss": 0.5292847156524658,
"mean_token_accuracy": 0.784620076417923,
"num_tokens": 5757385.0,
"step": 352
},
{
"entropy": 0.5383377820253372,
"epoch": 1.3171641791044777,
"grad_norm": 0.13651302456855774,
"learning_rate": 0.0002,
"loss": 0.5413467288017273,
"mean_token_accuracy": 0.7786675840616226,
"num_tokens": 5773852.0,
"step": 353
},
{
"entropy": 0.5402452051639557,
"epoch": 1.3208955223880596,
"grad_norm": 0.13241973519325256,
"learning_rate": 0.0002,
"loss": 0.5419248938560486,
"mean_token_accuracy": 0.778145432472229,
"num_tokens": 5790055.0,
"step": 354
},
{
"entropy": 0.5536379665136337,
"epoch": 1.3246268656716418,
"grad_norm": 0.13762575387954712,
"learning_rate": 0.0002,
"loss": 0.5484678745269775,
"mean_token_accuracy": 0.7766116708517075,
"num_tokens": 5806738.0,
"step": 355
},
{
"entropy": 0.5532735884189606,
"epoch": 1.328358208955224,
"grad_norm": 0.12154927849769592,
"learning_rate": 0.0002,
"loss": 0.5548056960105896,
"mean_token_accuracy": 0.7753622978925705,
"num_tokens": 5823183.0,
"step": 356
},
{
"entropy": 0.5448320060968399,
"epoch": 1.332089552238806,
"grad_norm": 0.144795224070549,
"learning_rate": 0.0002,
"loss": 0.5448752641677856,
"mean_token_accuracy": 0.7790551483631134,
"num_tokens": 5839499.0,
"step": 357
},
{
"entropy": 0.5511485040187836,
"epoch": 1.335820895522388,
"grad_norm": 0.13511039316654205,
"learning_rate": 0.0002,
"loss": 0.5528499484062195,
"mean_token_accuracy": 0.776659682393074,
"num_tokens": 5855921.0,
"step": 358
},
{
"entropy": 0.5290715843439102,
"epoch": 1.3395522388059702,
"grad_norm": 0.11837369203567505,
"learning_rate": 0.0002,
"loss": 0.5328022241592407,
"mean_token_accuracy": 0.7826089113950729,
"num_tokens": 5872142.0,
"step": 359
},
{
"entropy": 0.5363620519638062,
"epoch": 1.3432835820895521,
"grad_norm": 0.12029700726270676,
"learning_rate": 0.0002,
"loss": 0.534315824508667,
"mean_token_accuracy": 0.7845976501703262,
"num_tokens": 5888484.0,
"step": 360
},
{
"entropy": 0.5347290933132172,
"epoch": 1.3470149253731343,
"grad_norm": 0.13828180730342865,
"learning_rate": 0.0002,
"loss": 0.5338245630264282,
"mean_token_accuracy": 0.7808255851268768,
"num_tokens": 5904613.0,
"step": 361
},
{
"entropy": 0.5324546545743942,
"epoch": 1.3507462686567164,
"grad_norm": 0.12894095480442047,
"learning_rate": 0.0002,
"loss": 0.5361336469650269,
"mean_token_accuracy": 0.7821396291255951,
"num_tokens": 5920864.0,
"step": 362
},
{
"entropy": 0.5308556854724884,
"epoch": 1.3544776119402986,
"grad_norm": 0.11929216980934143,
"learning_rate": 0.0002,
"loss": 0.5275416374206543,
"mean_token_accuracy": 0.7852365076541901,
"num_tokens": 5937108.0,
"step": 363
},
{
"entropy": 0.53159399330616,
"epoch": 1.3582089552238805,
"grad_norm": 0.14378131926059723,
"learning_rate": 0.0002,
"loss": 0.5424759387969971,
"mean_token_accuracy": 0.7792777568101883,
"num_tokens": 5953233.0,
"step": 364
},
{
"entropy": 0.5450653731822968,
"epoch": 1.3619402985074627,
"grad_norm": 0.14581741392612457,
"learning_rate": 0.0002,
"loss": 0.5530756115913391,
"mean_token_accuracy": 0.7765647917985916,
"num_tokens": 5969681.0,
"step": 365
},
{
"entropy": 0.5418213754892349,
"epoch": 1.3656716417910448,
"grad_norm": 0.13764694333076477,
"learning_rate": 0.0002,
"loss": 0.5494720935821533,
"mean_token_accuracy": 0.7783620804548264,
"num_tokens": 5985895.0,
"step": 366
},
{
"entropy": 0.5528892427682877,
"epoch": 1.3694029850746268,
"grad_norm": 0.14292745292186737,
"learning_rate": 0.0002,
"loss": 0.5427901148796082,
"mean_token_accuracy": 0.7794772684574127,
"num_tokens": 6002104.0,
"step": 367
},
{
"entropy": 0.5515422970056534,
"epoch": 1.373134328358209,
"grad_norm": 0.12165708839893341,
"learning_rate": 0.0002,
"loss": 0.5388676524162292,
"mean_token_accuracy": 0.7821601629257202,
"num_tokens": 6018297.0,
"step": 368
},
{
"entropy": 0.5522115230560303,
"epoch": 1.376865671641791,
"grad_norm": 0.16414624452590942,
"learning_rate": 0.0002,
"loss": 0.5514496564865112,
"mean_token_accuracy": 0.7735963463783264,
"num_tokens": 6034469.0,
"step": 369
},
{
"entropy": 0.5200467556715012,
"epoch": 1.3805970149253732,
"grad_norm": 0.11550547927618027,
"learning_rate": 0.0002,
"loss": 0.5164188146591187,
"mean_token_accuracy": 0.7926855981349945,
"num_tokens": 6050831.0,
"step": 370
},
{
"entropy": 0.5372455269098282,
"epoch": 1.3843283582089552,
"grad_norm": 0.15535052120685577,
"learning_rate": 0.0002,
"loss": 0.5430443286895752,
"mean_token_accuracy": 0.7787685394287109,
"num_tokens": 6067185.0,
"step": 371
},
{
"entropy": 0.5356560945510864,
"epoch": 1.3880597014925373,
"grad_norm": 0.13415579497814178,
"learning_rate": 0.0002,
"loss": 0.5381686091423035,
"mean_token_accuracy": 0.7826534509658813,
"num_tokens": 6083549.0,
"step": 372
},
{
"entropy": 0.5160757750272751,
"epoch": 1.3917910447761195,
"grad_norm": 0.21146361529827118,
"learning_rate": 0.0002,
"loss": 0.5265405774116516,
"mean_token_accuracy": 0.7884284406900406,
"num_tokens": 6099748.0,
"step": 373
},
{
"entropy": 0.5486676543951035,
"epoch": 1.3955223880597014,
"grad_norm": 0.17727814614772797,
"learning_rate": 0.0002,
"loss": 0.5486956834793091,
"mean_token_accuracy": 0.774789959192276,
"num_tokens": 6116173.0,
"step": 374
},
{
"entropy": 0.5379186123609543,
"epoch": 1.3992537313432836,
"grad_norm": 0.14094142615795135,
"learning_rate": 0.0002,
"loss": 0.5390832424163818,
"mean_token_accuracy": 0.7824152857065201,
"num_tokens": 6132499.0,
"step": 375
},
{
"entropy": 0.5322713851928711,
"epoch": 1.4029850746268657,
"grad_norm": 0.20512345433235168,
"learning_rate": 0.0002,
"loss": 0.5319615602493286,
"mean_token_accuracy": 0.7856654673814774,
"num_tokens": 6148777.0,
"step": 376
},
{
"entropy": 0.5522319674491882,
"epoch": 1.4067164179104479,
"grad_norm": 0.23706185817718506,
"learning_rate": 0.0002,
"loss": 0.5542993545532227,
"mean_token_accuracy": 0.7750299721956253,
"num_tokens": 6165444.0,
"step": 377
},
{
"entropy": 0.5360774844884872,
"epoch": 1.4104477611940298,
"grad_norm": 0.11965668946504593,
"learning_rate": 0.0002,
"loss": 0.5302645564079285,
"mean_token_accuracy": 0.7849837243556976,
"num_tokens": 6181897.0,
"step": 378
},
{
"entropy": 0.546858549118042,
"epoch": 1.414179104477612,
"grad_norm": 0.16231459379196167,
"learning_rate": 0.0002,
"loss": 0.5448977947235107,
"mean_token_accuracy": 0.7800662368535995,
"num_tokens": 6198254.0,
"step": 379
},
{
"entropy": 0.5505042523145676,
"epoch": 1.417910447761194,
"grad_norm": 0.16832560300827026,
"learning_rate": 0.0002,
"loss": 0.560795247554779,
"mean_token_accuracy": 0.7732271403074265,
"num_tokens": 6214773.0,
"step": 380
},
{
"entropy": 0.5255255252122879,
"epoch": 1.421641791044776,
"grad_norm": 0.14621268212795258,
"learning_rate": 0.0002,
"loss": 0.5310673117637634,
"mean_token_accuracy": 0.7856626063585281,
"num_tokens": 6230937.0,
"step": 381
},
{
"entropy": 0.550481304526329,
"epoch": 1.4253731343283582,
"grad_norm": 0.13561075925827026,
"learning_rate": 0.0002,
"loss": 0.552341103553772,
"mean_token_accuracy": 0.7767930179834366,
"num_tokens": 6247144.0,
"step": 382
},
{
"entropy": 0.5227905362844467,
"epoch": 1.4291044776119404,
"grad_norm": 0.13489387929439545,
"learning_rate": 0.0002,
"loss": 0.523324191570282,
"mean_token_accuracy": 0.7840524315834045,
"num_tokens": 6263392.0,
"step": 383
},
{
"entropy": 0.5366068184375763,
"epoch": 1.4328358208955223,
"grad_norm": 0.14153233170509338,
"learning_rate": 0.0002,
"loss": 0.5320409536361694,
"mean_token_accuracy": 0.7857052683830261,
"num_tokens": 6279611.0,
"step": 384
},
{
"entropy": 0.5510872900485992,
"epoch": 1.4365671641791045,
"grad_norm": 0.16421180963516235,
"learning_rate": 0.0002,
"loss": 0.5412197709083557,
"mean_token_accuracy": 0.7806995958089828,
"num_tokens": 6296025.0,
"step": 385
},
{
"entropy": 0.5504460334777832,
"epoch": 1.4402985074626866,
"grad_norm": 0.12805409729480743,
"learning_rate": 0.0002,
"loss": 0.5456997156143188,
"mean_token_accuracy": 0.7775121033191681,
"num_tokens": 6312415.0,
"step": 386
},
{
"entropy": 0.5504113882780075,
"epoch": 1.4440298507462686,
"grad_norm": 0.1690564602613449,
"learning_rate": 0.0002,
"loss": 0.5432727932929993,
"mean_token_accuracy": 0.7804221510887146,
"num_tokens": 6328728.0,
"step": 387
},
{
"entropy": 0.5279664844274521,
"epoch": 1.4477611940298507,
"grad_norm": 0.14327631890773773,
"learning_rate": 0.0002,
"loss": 0.5324951410293579,
"mean_token_accuracy": 0.7857986390590668,
"num_tokens": 6344947.0,
"step": 388
},
{
"entropy": 0.529266320168972,
"epoch": 1.4514925373134329,
"grad_norm": 0.14441367983818054,
"learning_rate": 0.0002,
"loss": 0.5360409021377563,
"mean_token_accuracy": 0.7844860553741455,
"num_tokens": 6361481.0,
"step": 389
},
{
"entropy": 0.5474697202444077,
"epoch": 1.455223880597015,
"grad_norm": 0.17411169409751892,
"learning_rate": 0.0002,
"loss": 0.553131103515625,
"mean_token_accuracy": 0.774516150355339,
"num_tokens": 6378114.0,
"step": 390
},
{
"entropy": 0.53204146027565,
"epoch": 1.458955223880597,
"grad_norm": 0.13096541166305542,
"learning_rate": 0.0002,
"loss": 0.5311554074287415,
"mean_token_accuracy": 0.7832191288471222,
"num_tokens": 6394618.0,
"step": 391
},
{
"entropy": 0.5468081682920456,
"epoch": 1.462686567164179,
"grad_norm": 0.1281428337097168,
"learning_rate": 0.0002,
"loss": 0.5487358570098877,
"mean_token_accuracy": 0.7784566432237625,
"num_tokens": 6411033.0,
"step": 392
},
{
"entropy": 0.5141153857111931,
"epoch": 1.4664179104477613,
"grad_norm": 0.12739789485931396,
"learning_rate": 0.0002,
"loss": 0.5161206126213074,
"mean_token_accuracy": 0.7879614979028702,
"num_tokens": 6427279.0,
"step": 393
},
{
"entropy": 0.5423916280269623,
"epoch": 1.4701492537313432,
"grad_norm": 0.13173308968544006,
"learning_rate": 0.0002,
"loss": 0.5459262132644653,
"mean_token_accuracy": 0.7773706614971161,
"num_tokens": 6443618.0,
"step": 394
},
{
"entropy": 0.5373747050762177,
"epoch": 1.4738805970149254,
"grad_norm": 0.13537272810935974,
"learning_rate": 0.0002,
"loss": 0.5413709878921509,
"mean_token_accuracy": 0.7808920592069626,
"num_tokens": 6459976.0,
"step": 395
},
{
"entropy": 0.5321269482374191,
"epoch": 1.4776119402985075,
"grad_norm": 0.14240136742591858,
"learning_rate": 0.0002,
"loss": 0.5354140400886536,
"mean_token_accuracy": 0.7839590162038803,
"num_tokens": 6476177.0,
"step": 396
},
{
"entropy": 0.5257603526115417,
"epoch": 1.4813432835820897,
"grad_norm": 0.13054870069026947,
"learning_rate": 0.0002,
"loss": 0.5284422636032104,
"mean_token_accuracy": 0.7869588881731033,
"num_tokens": 6492490.0,
"step": 397
},
{
"entropy": 0.5265851616859436,
"epoch": 1.4850746268656716,
"grad_norm": 0.13740919530391693,
"learning_rate": 0.0002,
"loss": 0.526134192943573,
"mean_token_accuracy": 0.7872523069381714,
"num_tokens": 6508878.0,
"step": 398
},
{
"entropy": 0.5212059766054153,
"epoch": 1.4888059701492538,
"grad_norm": 0.13339075446128845,
"learning_rate": 0.0002,
"loss": 0.5221821665763855,
"mean_token_accuracy": 0.7905861139297485,
"num_tokens": 6525084.0,
"step": 399
},
{
"entropy": 0.537382185459137,
"epoch": 1.4925373134328357,
"grad_norm": 0.13736183941364288,
"learning_rate": 0.0002,
"loss": 0.5351852774620056,
"mean_token_accuracy": 0.7818522453308105,
"num_tokens": 6541545.0,
"step": 400
},
{
"entropy": 0.5340493619441986,
"epoch": 1.4962686567164178,
"grad_norm": 0.1368023306131363,
"learning_rate": 0.0002,
"loss": 0.5317674279212952,
"mean_token_accuracy": 0.7867089211940765,
"num_tokens": 6557867.0,
"step": 401
},
{
"entropy": 0.5713642686605453,
"epoch": 1.5,
"grad_norm": 0.12573114037513733,
"learning_rate": 0.0002,
"loss": 0.5638826489448547,
"mean_token_accuracy": 0.773875430226326,
"num_tokens": 6574428.0,
"step": 402
},
{
"entropy": 0.5415615439414978,
"epoch": 1.5037313432835822,
"grad_norm": 0.14824476838111877,
"learning_rate": 0.0002,
"loss": 0.5452718734741211,
"mean_token_accuracy": 0.7793742418289185,
"num_tokens": 6590740.0,
"step": 403
},
{
"entropy": 0.5316762626171112,
"epoch": 1.5074626865671643,
"grad_norm": 0.13510265946388245,
"learning_rate": 0.0002,
"loss": 0.5399596691131592,
"mean_token_accuracy": 0.7803886234760284,
"num_tokens": 6606963.0,
"step": 404
},
{
"entropy": 0.5310466289520264,
"epoch": 1.5111940298507462,
"grad_norm": 0.1413303166627884,
"learning_rate": 0.0002,
"loss": 0.532017707824707,
"mean_token_accuracy": 0.7846063524484634,
"num_tokens": 6623504.0,
"step": 405
},
{
"entropy": 0.5623253732919693,
"epoch": 1.5149253731343284,
"grad_norm": 0.1327054351568222,
"learning_rate": 0.0002,
"loss": 0.5590583682060242,
"mean_token_accuracy": 0.7741520255804062,
"num_tokens": 6639880.0,
"step": 406
},
{
"entropy": 0.5222483575344086,
"epoch": 1.5186567164179103,
"grad_norm": 0.14219273626804352,
"learning_rate": 0.0002,
"loss": 0.5221630930900574,
"mean_token_accuracy": 0.7884060740470886,
"num_tokens": 6656372.0,
"step": 407
},
{
"entropy": 0.5361650884151459,
"epoch": 1.5223880597014925,
"grad_norm": 0.14150315523147583,
"learning_rate": 0.0002,
"loss": 0.5426543951034546,
"mean_token_accuracy": 0.7794915586709976,
"num_tokens": 6672460.0,
"step": 408
},
{
"entropy": 0.5405853539705276,
"epoch": 1.5261194029850746,
"grad_norm": 0.12867780029773712,
"learning_rate": 0.0002,
"loss": 0.545219361782074,
"mean_token_accuracy": 0.7802143394947052,
"num_tokens": 6688740.0,
"step": 409
},
{
"entropy": 0.5196312442421913,
"epoch": 1.5298507462686568,
"grad_norm": 0.12933768332004547,
"learning_rate": 0.0002,
"loss": 0.524722695350647,
"mean_token_accuracy": 0.7893691807985306,
"num_tokens": 6704798.0,
"step": 410
},
{
"entropy": 0.5358741357922554,
"epoch": 1.533582089552239,
"grad_norm": 0.14841386675834656,
"learning_rate": 0.0002,
"loss": 0.5425981879234314,
"mean_token_accuracy": 0.7796852141618729,
"num_tokens": 6720982.0,
"step": 411
},
{
"entropy": 0.5389422178268433,
"epoch": 1.537313432835821,
"grad_norm": 0.12372686713933945,
"learning_rate": 0.0002,
"loss": 0.5368393063545227,
"mean_token_accuracy": 0.7788573652505875,
"num_tokens": 6737135.0,
"step": 412
},
{
"entropy": 0.5395499765872955,
"epoch": 1.5410447761194028,
"grad_norm": 0.1355394721031189,
"learning_rate": 0.0002,
"loss": 0.5324706435203552,
"mean_token_accuracy": 0.7823397219181061,
"num_tokens": 6753507.0,
"step": 413
},
{
"entropy": 0.5506737977266312,
"epoch": 1.544776119402985,
"grad_norm": 0.11822586506605148,
"learning_rate": 0.0002,
"loss": 0.5447027087211609,
"mean_token_accuracy": 0.7776395529508591,
"num_tokens": 6769726.0,
"step": 414
},
{
"entropy": 0.5393240600824356,
"epoch": 1.5485074626865671,
"grad_norm": 0.1220259889960289,
"learning_rate": 0.0002,
"loss": 0.5348957180976868,
"mean_token_accuracy": 0.7820345014333725,
"num_tokens": 6786148.0,
"step": 415
},
{
"entropy": 0.5258119255304337,
"epoch": 1.5522388059701493,
"grad_norm": 0.15211379528045654,
"learning_rate": 0.0002,
"loss": 0.5274648666381836,
"mean_token_accuracy": 0.7861866801977158,
"num_tokens": 6802290.0,
"step": 416
},
{
"entropy": 0.5310887396335602,
"epoch": 1.5559701492537314,
"grad_norm": 0.1319982260465622,
"learning_rate": 0.0002,
"loss": 0.5339083075523376,
"mean_token_accuracy": 0.7847474962472916,
"num_tokens": 6818697.0,
"step": 417
},
{
"entropy": 0.5216883644461632,
"epoch": 1.5597014925373134,
"grad_norm": 0.13150501251220703,
"learning_rate": 0.0002,
"loss": 0.5250256061553955,
"mean_token_accuracy": 0.7854708880186081,
"num_tokens": 6834860.0,
"step": 418
},
{
"entropy": 0.5280915200710297,
"epoch": 1.5634328358208955,
"grad_norm": 0.13087767362594604,
"learning_rate": 0.0002,
"loss": 0.5294699668884277,
"mean_token_accuracy": 0.7844147831201553,
"num_tokens": 6850977.0,
"step": 419
},
{
"entropy": 0.5455043911933899,
"epoch": 1.5671641791044775,
"grad_norm": 0.13152527809143066,
"learning_rate": 0.0002,
"loss": 0.5411855578422546,
"mean_token_accuracy": 0.7831065207719803,
"num_tokens": 6867436.0,
"step": 420
},
{
"entropy": 0.5421444773674011,
"epoch": 1.5708955223880596,
"grad_norm": 0.12552635371685028,
"learning_rate": 0.0002,
"loss": 0.5404070615768433,
"mean_token_accuracy": 0.7799917608499527,
"num_tokens": 6883739.0,
"step": 421
},
{
"entropy": 0.5469988659024239,
"epoch": 1.5746268656716418,
"grad_norm": 0.12713049352169037,
"learning_rate": 0.0002,
"loss": 0.5506969690322876,
"mean_token_accuracy": 0.7773310244083405,
"num_tokens": 6899931.0,
"step": 422
},
{
"entropy": 0.5409539192914963,
"epoch": 1.578358208955224,
"grad_norm": 0.12043388932943344,
"learning_rate": 0.0002,
"loss": 0.5393781661987305,
"mean_token_accuracy": 0.7821668684482574,
"num_tokens": 6916555.0,
"step": 423
},
{
"entropy": 0.5323537066578865,
"epoch": 1.582089552238806,
"grad_norm": 0.15053188800811768,
"learning_rate": 0.0002,
"loss": 0.5387845039367676,
"mean_token_accuracy": 0.7825682461261749,
"num_tokens": 6932929.0,
"step": 424
},
{
"entropy": 0.5519883185625076,
"epoch": 1.585820895522388,
"grad_norm": 0.1525130718946457,
"learning_rate": 0.0002,
"loss": 0.56787109375,
"mean_token_accuracy": 0.7703519463539124,
"num_tokens": 6949313.0,
"step": 425
},
{
"entropy": 0.5393707901239395,
"epoch": 1.5895522388059702,
"grad_norm": 0.14073340594768524,
"learning_rate": 0.0002,
"loss": 0.5375410914421082,
"mean_token_accuracy": 0.7814988791942596,
"num_tokens": 6965684.0,
"step": 426
},
{
"entropy": 0.5354568511247635,
"epoch": 1.5932835820895521,
"grad_norm": 0.13749349117279053,
"learning_rate": 0.0002,
"loss": 0.5318333506584167,
"mean_token_accuracy": 0.7864338159561157,
"num_tokens": 6982013.0,
"step": 427
},
{
"entropy": 0.5405145287513733,
"epoch": 1.5970149253731343,
"grad_norm": 0.12070662528276443,
"learning_rate": 0.0002,
"loss": 0.5362390279769897,
"mean_token_accuracy": 0.7832798510789871,
"num_tokens": 6998503.0,
"step": 428
},
{
"entropy": 0.5447606593370438,
"epoch": 1.6007462686567164,
"grad_norm": 0.1386427879333496,
"learning_rate": 0.0002,
"loss": 0.5441482663154602,
"mean_token_accuracy": 0.778590589761734,
"num_tokens": 7014770.0,
"step": 429
},
{
"entropy": 0.5470203310251236,
"epoch": 1.6044776119402986,
"grad_norm": 0.13212502002716064,
"learning_rate": 0.0002,
"loss": 0.5490391850471497,
"mean_token_accuracy": 0.7765385806560516,
"num_tokens": 7030922.0,
"step": 430
},
{
"entropy": 0.5170739889144897,
"epoch": 1.6082089552238807,
"grad_norm": 0.13961301743984222,
"learning_rate": 0.0002,
"loss": 0.5210376381874084,
"mean_token_accuracy": 0.7884235680103302,
"num_tokens": 7047216.0,
"step": 431
},
{
"entropy": 0.5377417504787445,
"epoch": 1.6119402985074627,
"grad_norm": 0.13901281356811523,
"learning_rate": 0.0002,
"loss": 0.5376747846603394,
"mean_token_accuracy": 0.7830623835325241,
"num_tokens": 7063307.0,
"step": 432
},
{
"entropy": 0.5414564162492752,
"epoch": 1.6156716417910446,
"grad_norm": 0.1463043987751007,
"learning_rate": 0.0002,
"loss": 0.5473238825798035,
"mean_token_accuracy": 0.7770842909812927,
"num_tokens": 7079707.0,
"step": 433
},
{
"entropy": 0.5415572673082352,
"epoch": 1.6194029850746268,
"grad_norm": 0.11891120672225952,
"learning_rate": 0.0002,
"loss": 0.5387373566627502,
"mean_token_accuracy": 0.779969111084938,
"num_tokens": 7095980.0,
"step": 434
},
{
"entropy": 0.5542661100625992,
"epoch": 1.623134328358209,
"grad_norm": 0.13271500170230865,
"learning_rate": 0.0002,
"loss": 0.5507120490074158,
"mean_token_accuracy": 0.7779867500066757,
"num_tokens": 7112556.0,
"step": 435
},
{
"entropy": 0.54887755215168,
"epoch": 1.626865671641791,
"grad_norm": 0.13373985886573792,
"learning_rate": 0.0002,
"loss": 0.5447692275047302,
"mean_token_accuracy": 0.7798765897750854,
"num_tokens": 7128802.0,
"step": 436
},
{
"entropy": 0.5222520381212234,
"epoch": 1.6305970149253732,
"grad_norm": 0.1277901828289032,
"learning_rate": 0.0002,
"loss": 0.5239554643630981,
"mean_token_accuracy": 0.785177692770958,
"num_tokens": 7145060.0,
"step": 437
},
{
"entropy": 0.53469417989254,
"epoch": 1.6343283582089554,
"grad_norm": 0.20547546446323395,
"learning_rate": 0.0002,
"loss": 0.5367586612701416,
"mean_token_accuracy": 0.7803931534290314,
"num_tokens": 7161527.0,
"step": 438
},
{
"entropy": 0.521802693605423,
"epoch": 1.6380597014925373,
"grad_norm": 0.16560786962509155,
"learning_rate": 0.0002,
"loss": 0.5228012204170227,
"mean_token_accuracy": 0.7887944877147675,
"num_tokens": 7178091.0,
"step": 439
},
{
"entropy": 0.5338825434446335,
"epoch": 1.6417910447761193,
"grad_norm": 0.1590629667043686,
"learning_rate": 0.0002,
"loss": 0.5402793288230896,
"mean_token_accuracy": 0.7781020998954773,
"num_tokens": 7194244.0,
"step": 440
},
{
"entropy": 0.5395276695489883,
"epoch": 1.6455223880597014,
"grad_norm": 0.14088116586208344,
"learning_rate": 0.0002,
"loss": 0.5401326417922974,
"mean_token_accuracy": 0.781720831990242,
"num_tokens": 7210451.0,
"step": 441
},
{
"entropy": 0.5567539632320404,
"epoch": 1.6492537313432836,
"grad_norm": 0.19292442500591278,
"learning_rate": 0.0002,
"loss": 0.5627314448356628,
"mean_token_accuracy": 0.7719661146402359,
"num_tokens": 7226996.0,
"step": 442
},
{
"entropy": 0.534116804599762,
"epoch": 1.6529850746268657,
"grad_norm": 0.1254442036151886,
"learning_rate": 0.0002,
"loss": 0.533519983291626,
"mean_token_accuracy": 0.7840958386659622,
"num_tokens": 7243430.0,
"step": 443
},
{
"entropy": 0.5330116599798203,
"epoch": 1.6567164179104479,
"grad_norm": 0.1718529760837555,
"learning_rate": 0.0002,
"loss": 0.5330148339271545,
"mean_token_accuracy": 0.7830322086811066,
"num_tokens": 7259764.0,
"step": 444
},
{
"entropy": 0.5424318462610245,
"epoch": 1.6604477611940298,
"grad_norm": 0.13064436614513397,
"learning_rate": 0.0002,
"loss": 0.5422405004501343,
"mean_token_accuracy": 0.7796443551778793,
"num_tokens": 7276147.0,
"step": 445
},
{
"entropy": 0.555829331278801,
"epoch": 1.664179104477612,
"grad_norm": 0.12649741768836975,
"learning_rate": 0.0002,
"loss": 0.5439899563789368,
"mean_token_accuracy": 0.7798557877540588,
"num_tokens": 7292719.0,
"step": 446
},
{
"entropy": 0.5564119815826416,
"epoch": 1.667910447761194,
"grad_norm": 0.140034019947052,
"learning_rate": 0.0002,
"loss": 0.5546625256538391,
"mean_token_accuracy": 0.7761601060628891,
"num_tokens": 7309242.0,
"step": 447
},
{
"entropy": 0.5416673123836517,
"epoch": 1.671641791044776,
"grad_norm": 0.1388692855834961,
"learning_rate": 0.0002,
"loss": 0.541693389415741,
"mean_token_accuracy": 0.7807905972003937,
"num_tokens": 7325872.0,
"step": 448
},
{
"entropy": 0.5325654745101929,
"epoch": 1.6753731343283582,
"grad_norm": 0.1330399215221405,
"learning_rate": 0.0002,
"loss": 0.5375967025756836,
"mean_token_accuracy": 0.780772253870964,
"num_tokens": 7342461.0,
"step": 449
},
{
"entropy": 0.5460408478975296,
"epoch": 1.6791044776119404,
"grad_norm": 0.1698281317949295,
"learning_rate": 0.0002,
"loss": 0.5483989119529724,
"mean_token_accuracy": 0.7757564038038254,
"num_tokens": 7358926.0,
"step": 450
},
{
"entropy": 0.5587185472249985,
"epoch": 1.6828358208955225,
"grad_norm": 0.150365948677063,
"learning_rate": 0.0002,
"loss": 0.5607273578643799,
"mean_token_accuracy": 0.7735442072153091,
"num_tokens": 7375472.0,
"step": 451
},
{
"entropy": 0.5546591132879257,
"epoch": 1.6865671641791045,
"grad_norm": 0.13346362113952637,
"learning_rate": 0.0002,
"loss": 0.5498383045196533,
"mean_token_accuracy": 0.7771503031253815,
"num_tokens": 7391758.0,
"step": 452
},
{
"entropy": 0.5380023121833801,
"epoch": 1.6902985074626866,
"grad_norm": 0.15642641484737396,
"learning_rate": 0.0002,
"loss": 0.540310263633728,
"mean_token_accuracy": 0.7800187021493912,
"num_tokens": 7407943.0,
"step": 453
},
{
"entropy": 0.5107243284583092,
"epoch": 1.6940298507462686,
"grad_norm": 0.1413007378578186,
"learning_rate": 0.0002,
"loss": 0.5198100209236145,
"mean_token_accuracy": 0.7903516441583633,
"num_tokens": 7424142.0,
"step": 454
},
{
"entropy": 0.5318749994039536,
"epoch": 1.6977611940298507,
"grad_norm": 0.13885854184627533,
"learning_rate": 0.0002,
"loss": 0.5412630438804626,
"mean_token_accuracy": 0.7793916463851929,
"num_tokens": 7440451.0,
"step": 455
},
{
"entropy": 0.5525089502334595,
"epoch": 1.7014925373134329,
"grad_norm": 0.12943100929260254,
"learning_rate": 0.0002,
"loss": 0.5551573634147644,
"mean_token_accuracy": 0.7760037779808044,
"num_tokens": 7456977.0,
"step": 456
},
{
"entropy": 0.5402176976203918,
"epoch": 1.705223880597015,
"grad_norm": 0.15211442112922668,
"learning_rate": 0.0002,
"loss": 0.5398398041725159,
"mean_token_accuracy": 0.779134064912796,
"num_tokens": 7473154.0,
"step": 457
},
{
"entropy": 0.5625119209289551,
"epoch": 1.7089552238805972,
"grad_norm": 0.12840458750724792,
"learning_rate": 0.0002,
"loss": 0.5544787645339966,
"mean_token_accuracy": 0.7756093442440033,
"num_tokens": 7489492.0,
"step": 458
},
{
"entropy": 0.5442609488964081,
"epoch": 1.712686567164179,
"grad_norm": 0.13839711248874664,
"learning_rate": 0.0002,
"loss": 0.5437784194946289,
"mean_token_accuracy": 0.7818922996520996,
"num_tokens": 7505874.0,
"step": 459
},
{
"entropy": 0.5575658231973648,
"epoch": 1.716417910447761,
"grad_norm": 0.14238221943378448,
"learning_rate": 0.0002,
"loss": 0.5612136125564575,
"mean_token_accuracy": 0.7718513458967209,
"num_tokens": 7522288.0,
"step": 460
},
{
"entropy": 0.535207062959671,
"epoch": 1.7201492537313432,
"grad_norm": 0.13308024406433105,
"learning_rate": 0.0002,
"loss": 0.5384257435798645,
"mean_token_accuracy": 0.7802019715309143,
"num_tokens": 7538764.0,
"step": 461
},
{
"entropy": 0.5290672108530998,
"epoch": 1.7238805970149254,
"grad_norm": 0.14699077606201172,
"learning_rate": 0.0002,
"loss": 0.533920168876648,
"mean_token_accuracy": 0.7809716016054153,
"num_tokens": 7555048.0,
"step": 462
},
{
"entropy": 0.5349759012460709,
"epoch": 1.7276119402985075,
"grad_norm": 0.13993169367313385,
"learning_rate": 0.0002,
"loss": 0.5397127866744995,
"mean_token_accuracy": 0.781706914305687,
"num_tokens": 7571331.0,
"step": 463
},
{
"entropy": 0.5471459329128265,
"epoch": 1.7313432835820897,
"grad_norm": 0.1270606368780136,
"learning_rate": 0.0002,
"loss": 0.5457655191421509,
"mean_token_accuracy": 0.7785040736198425,
"num_tokens": 7587268.0,
"step": 464
},
{
"entropy": 0.5576677769422531,
"epoch": 1.7350746268656716,
"grad_norm": 0.13001851737499237,
"learning_rate": 0.0002,
"loss": 0.5535344481468201,
"mean_token_accuracy": 0.7747954726219177,
"num_tokens": 7603468.0,
"step": 465
},
{
"entropy": 0.5527965128421783,
"epoch": 1.7388059701492538,
"grad_norm": 0.11191874742507935,
"learning_rate": 0.0002,
"loss": 0.5493273138999939,
"mean_token_accuracy": 0.7783663272857666,
"num_tokens": 7619861.0,
"step": 466
},
{
"entropy": 0.5458428710699081,
"epoch": 1.7425373134328357,
"grad_norm": 0.12890613079071045,
"learning_rate": 0.0002,
"loss": 0.5422653555870056,
"mean_token_accuracy": 0.7804641127586365,
"num_tokens": 7636365.0,
"step": 467
},
{
"entropy": 0.5396646112203598,
"epoch": 1.7462686567164178,
"grad_norm": 0.14643065631389618,
"learning_rate": 0.0002,
"loss": 0.540531575679779,
"mean_token_accuracy": 0.7787915468215942,
"num_tokens": 7652695.0,
"step": 468
},
{
"entropy": 0.5489283800125122,
"epoch": 1.75,
"grad_norm": 0.12856297194957733,
"learning_rate": 0.0002,
"loss": 0.5493489503860474,
"mean_token_accuracy": 0.7765475660562515,
"num_tokens": 7669417.0,
"step": 469
},
{
"entropy": 0.5371540188789368,
"epoch": 1.7537313432835822,
"grad_norm": 0.1448490023612976,
"learning_rate": 0.0002,
"loss": 0.5445014238357544,
"mean_token_accuracy": 0.7786155045032501,
"num_tokens": 7685950.0,
"step": 470
},
{
"entropy": 0.5441175699234009,
"epoch": 1.7574626865671643,
"grad_norm": 0.1417449563741684,
"learning_rate": 0.0002,
"loss": 0.5456334352493286,
"mean_token_accuracy": 0.7806714922189713,
"num_tokens": 7702096.0,
"step": 471
},
{
"entropy": 0.534687414765358,
"epoch": 1.7611940298507462,
"grad_norm": 0.13397443294525146,
"learning_rate": 0.0002,
"loss": 0.5369069576263428,
"mean_token_accuracy": 0.7817386239767075,
"num_tokens": 7718461.0,
"step": 472
},
{
"entropy": 0.5490274131298065,
"epoch": 1.7649253731343284,
"grad_norm": 0.1352432817220688,
"learning_rate": 0.0002,
"loss": 0.5512405633926392,
"mean_token_accuracy": 0.7781344056129456,
"num_tokens": 7734927.0,
"step": 473
},
{
"entropy": 0.5476491898298264,
"epoch": 1.7686567164179103,
"grad_norm": 0.13750651478767395,
"learning_rate": 0.0002,
"loss": 0.5536763668060303,
"mean_token_accuracy": 0.7743410021066666,
"num_tokens": 7751415.0,
"step": 474
},
{
"entropy": 0.524419367313385,
"epoch": 1.7723880597014925,
"grad_norm": 0.13306710124015808,
"learning_rate": 0.0002,
"loss": 0.5263890624046326,
"mean_token_accuracy": 0.7842015773057938,
"num_tokens": 7767584.0,
"step": 475
},
{
"entropy": 0.5515109747648239,
"epoch": 1.7761194029850746,
"grad_norm": 0.13014942407608032,
"learning_rate": 0.0002,
"loss": 0.546906590461731,
"mean_token_accuracy": 0.7791758924722672,
"num_tokens": 7783929.0,
"step": 476
},
{
"entropy": 0.5460219085216522,
"epoch": 1.7798507462686568,
"grad_norm": 0.12750543653964996,
"learning_rate": 0.0002,
"loss": 0.5416713953018188,
"mean_token_accuracy": 0.7796966135501862,
"num_tokens": 7800322.0,
"step": 477
},
{
"entropy": 0.5496509969234467,
"epoch": 1.783582089552239,
"grad_norm": 0.14019764959812164,
"learning_rate": 0.0002,
"loss": 0.5501259565353394,
"mean_token_accuracy": 0.7778430730104446,
"num_tokens": 7816728.0,
"step": 478
},
{
"entropy": 0.5484806597232819,
"epoch": 1.787313432835821,
"grad_norm": 0.12671294808387756,
"learning_rate": 0.0002,
"loss": 0.546718418598175,
"mean_token_accuracy": 0.7767283469438553,
"num_tokens": 7833182.0,
"step": 479
},
{
"entropy": 0.5313283354043961,
"epoch": 1.7910447761194028,
"grad_norm": 0.16472716629505157,
"learning_rate": 0.0002,
"loss": 0.5414275527000427,
"mean_token_accuracy": 0.7815513163805008,
"num_tokens": 7849402.0,
"step": 480
},
{
"entropy": 0.516701802611351,
"epoch": 1.794776119402985,
"grad_norm": 0.157722607254982,
"learning_rate": 0.0002,
"loss": 0.5291575789451599,
"mean_token_accuracy": 0.7844545841217041,
"num_tokens": 7865503.0,
"step": 481
},
{
"entropy": 0.5476036965847015,
"epoch": 1.7985074626865671,
"grad_norm": 0.16708603501319885,
"learning_rate": 0.0002,
"loss": 0.5535966157913208,
"mean_token_accuracy": 0.7750539481639862,
"num_tokens": 7881822.0,
"step": 482
},
{
"entropy": 0.5405763983726501,
"epoch": 1.8022388059701493,
"grad_norm": 0.12333223968744278,
"learning_rate": 0.0002,
"loss": 0.5385177731513977,
"mean_token_accuracy": 0.7838984429836273,
"num_tokens": 7898111.0,
"step": 483
},
{
"entropy": 0.5573789775371552,
"epoch": 1.8059701492537314,
"grad_norm": 0.14407449960708618,
"learning_rate": 0.0002,
"loss": 0.541386067867279,
"mean_token_accuracy": 0.7797874957323074,
"num_tokens": 7914518.0,
"step": 484
},
{
"entropy": 0.5439587533473969,
"epoch": 1.8097014925373134,
"grad_norm": 0.1654428094625473,
"learning_rate": 0.0002,
"loss": 0.5336223244667053,
"mean_token_accuracy": 0.7846554070711136,
"num_tokens": 7930884.0,
"step": 485
},
{
"entropy": 0.536734089255333,
"epoch": 1.8134328358208955,
"grad_norm": 0.15028727054595947,
"learning_rate": 0.0002,
"loss": 0.5363267660140991,
"mean_token_accuracy": 0.786723256111145,
"num_tokens": 7947486.0,
"step": 486
},
{
"entropy": 0.5316303819417953,
"epoch": 1.8171641791044775,
"grad_norm": 0.2185370773077011,
"learning_rate": 0.0002,
"loss": 0.5426980257034302,
"mean_token_accuracy": 0.7816258370876312,
"num_tokens": 7963754.0,
"step": 487
},
{
"entropy": 0.5372888445854187,
"epoch": 1.8208955223880596,
"grad_norm": 0.14039121568202972,
"learning_rate": 0.0002,
"loss": 0.5452357530593872,
"mean_token_accuracy": 0.7777333706617355,
"num_tokens": 7980178.0,
"step": 488
},
{
"entropy": 0.561303973197937,
"epoch": 1.8246268656716418,
"grad_norm": 0.2095021903514862,
"learning_rate": 0.0002,
"loss": 0.5606201887130737,
"mean_token_accuracy": 0.7701640874147415,
"num_tokens": 7996414.0,
"step": 489
},
{
"entropy": 0.5401351600885391,
"epoch": 1.828358208955224,
"grad_norm": 0.13168978691101074,
"learning_rate": 0.0002,
"loss": 0.5416175723075867,
"mean_token_accuracy": 0.7801533341407776,
"num_tokens": 8012578.0,
"step": 490
},
{
"entropy": 0.5480149686336517,
"epoch": 1.832089552238806,
"grad_norm": 0.18209180235862732,
"learning_rate": 0.0002,
"loss": 0.5433698892593384,
"mean_token_accuracy": 0.7793498337268829,
"num_tokens": 8029063.0,
"step": 491
},
{
"entropy": 0.5556472986936569,
"epoch": 1.835820895522388,
"grad_norm": 0.14936800301074982,
"learning_rate": 0.0002,
"loss": 0.5554640293121338,
"mean_token_accuracy": 0.7756128907203674,
"num_tokens": 8045335.0,
"step": 492
},
{
"entropy": 0.551779106259346,
"epoch": 1.8395522388059702,
"grad_norm": 0.16466236114501953,
"learning_rate": 0.0002,
"loss": 0.5527586936950684,
"mean_token_accuracy": 0.7768742144107819,
"num_tokens": 8061746.0,
"step": 493
},
{
"entropy": 0.5395959764719009,
"epoch": 1.8432835820895521,
"grad_norm": 0.17139406502246857,
"learning_rate": 0.0002,
"loss": 0.5481644868850708,
"mean_token_accuracy": 0.7803965657949448,
"num_tokens": 8078227.0,
"step": 494
},
{
"entropy": 0.544280469417572,
"epoch": 1.8470149253731343,
"grad_norm": 0.14393140375614166,
"learning_rate": 0.0002,
"loss": 0.55059415102005,
"mean_token_accuracy": 0.7759814560413361,
"num_tokens": 8094667.0,
"step": 495
},
{
"entropy": 0.5303434431552887,
"epoch": 1.8507462686567164,
"grad_norm": 0.16556651890277863,
"learning_rate": 0.0002,
"loss": 0.530941903591156,
"mean_token_accuracy": 0.7859343141317368,
"num_tokens": 8110787.0,
"step": 496
},
{
"entropy": 0.5236229598522186,
"epoch": 1.8544776119402986,
"grad_norm": 0.12482267618179321,
"learning_rate": 0.0002,
"loss": 0.5197535753250122,
"mean_token_accuracy": 0.7890704125165939,
"num_tokens": 8127133.0,
"step": 497
},
{
"entropy": 0.5396426022052765,
"epoch": 1.8582089552238807,
"grad_norm": 0.1538504958152771,
"learning_rate": 0.0002,
"loss": 0.5361296534538269,
"mean_token_accuracy": 0.7814654260873795,
"num_tokens": 8143434.0,
"step": 498
},
{
"entropy": 0.5484279841184616,
"epoch": 1.8619402985074627,
"grad_norm": 0.14813822507858276,
"learning_rate": 0.0002,
"loss": 0.5464996695518494,
"mean_token_accuracy": 0.7787739634513855,
"num_tokens": 8159903.0,
"step": 499
},
{
"entropy": 0.519238218665123,
"epoch": 1.8656716417910446,
"grad_norm": 0.13267366588115692,
"learning_rate": 0.0002,
"loss": 0.5259124040603638,
"mean_token_accuracy": 0.7888814806938171,
"num_tokens": 8176179.0,
"step": 500
},
{
"entropy": 0.5393799841403961,
"epoch": 1.8694029850746268,
"grad_norm": 0.1923193484544754,
"learning_rate": 0.0002,
"loss": 0.5401571989059448,
"mean_token_accuracy": 0.7801343649625778,
"num_tokens": 8192554.0,
"step": 501
},
{
"entropy": 0.532251313328743,
"epoch": 1.873134328358209,
"grad_norm": 0.13894309103488922,
"learning_rate": 0.0002,
"loss": 0.527220606803894,
"mean_token_accuracy": 0.7864662110805511,
"num_tokens": 8208849.0,
"step": 502
},
{
"entropy": 0.5306680351495743,
"epoch": 1.876865671641791,
"grad_norm": 0.1474749892950058,
"learning_rate": 0.0002,
"loss": 0.5287739038467407,
"mean_token_accuracy": 0.7855399250984192,
"num_tokens": 8225218.0,
"step": 503
},
{
"entropy": 0.5300537943840027,
"epoch": 1.8805970149253732,
"grad_norm": 0.1491105705499649,
"learning_rate": 0.0002,
"loss": 0.5314114093780518,
"mean_token_accuracy": 0.7854063659906387,
"num_tokens": 8241422.0,
"step": 504
},
{
"entropy": 0.5309967398643494,
"epoch": 1.8843283582089554,
"grad_norm": 0.15464921295642853,
"learning_rate": 0.0002,
"loss": 0.5415985584259033,
"mean_token_accuracy": 0.7829921096563339,
"num_tokens": 8257677.0,
"step": 505
},
{
"entropy": 0.5376427173614502,
"epoch": 1.8880597014925373,
"grad_norm": 0.1445028930902481,
"learning_rate": 0.0002,
"loss": 0.5402049422264099,
"mean_token_accuracy": 0.781824991106987,
"num_tokens": 8274079.0,
"step": 506
},
{
"entropy": 0.5335574001073837,
"epoch": 1.8917910447761193,
"grad_norm": 0.12303903698921204,
"learning_rate": 0.0002,
"loss": 0.530457079410553,
"mean_token_accuracy": 0.7857005745172501,
"num_tokens": 8290576.0,
"step": 507
},
{
"entropy": 0.5357225090265274,
"epoch": 1.8955223880597014,
"grad_norm": 0.14474186301231384,
"learning_rate": 0.0002,
"loss": 0.5326468348503113,
"mean_token_accuracy": 0.7827298194169998,
"num_tokens": 8306959.0,
"step": 508
},
{
"entropy": 0.5418558418750763,
"epoch": 1.8992537313432836,
"grad_norm": 0.13205651938915253,
"learning_rate": 0.0002,
"loss": 0.5394735932350159,
"mean_token_accuracy": 0.7811231166124344,
"num_tokens": 8323198.0,
"step": 509
},
{
"entropy": 0.5494632720947266,
"epoch": 1.9029850746268657,
"grad_norm": 0.13867227733135223,
"learning_rate": 0.0002,
"loss": 0.5512980818748474,
"mean_token_accuracy": 0.7792128920555115,
"num_tokens": 8339407.0,
"step": 510
},
{
"entropy": 0.527800902724266,
"epoch": 1.9067164179104479,
"grad_norm": 0.1300196498632431,
"learning_rate": 0.0002,
"loss": 0.5310680866241455,
"mean_token_accuracy": 0.7856706976890564,
"num_tokens": 8355694.0,
"step": 511
},
{
"entropy": 0.5433302372694016,
"epoch": 1.9104477611940298,
"grad_norm": 0.16294771432876587,
"learning_rate": 0.0002,
"loss": 0.5532437562942505,
"mean_token_accuracy": 0.7759810388088226,
"num_tokens": 8371710.0,
"step": 512
},
{
"entropy": 0.5244318097829819,
"epoch": 1.914179104477612,
"grad_norm": 0.13300037384033203,
"learning_rate": 0.0002,
"loss": 0.5271862149238586,
"mean_token_accuracy": 0.7844917327165604,
"num_tokens": 8387964.0,
"step": 513
},
{
"entropy": 0.5421733111143112,
"epoch": 1.917910447761194,
"grad_norm": 0.12434980273246765,
"learning_rate": 0.0002,
"loss": 0.5377052426338196,
"mean_token_accuracy": 0.7836858928203583,
"num_tokens": 8404373.0,
"step": 514
},
{
"entropy": 0.5615102648735046,
"epoch": 1.921641791044776,
"grad_norm": 0.1264066845178604,
"learning_rate": 0.0002,
"loss": 0.558891236782074,
"mean_token_accuracy": 0.7723990976810455,
"num_tokens": 8420907.0,
"step": 515
},
{
"entropy": 0.5428318381309509,
"epoch": 1.9253731343283582,
"grad_norm": 0.13190090656280518,
"learning_rate": 0.0002,
"loss": 0.5374886393547058,
"mean_token_accuracy": 0.7830605953931808,
"num_tokens": 8437255.0,
"step": 516
},
{
"entropy": 0.5324592739343643,
"epoch": 1.9291044776119404,
"grad_norm": 0.13782039284706116,
"learning_rate": 0.0002,
"loss": 0.5368908643722534,
"mean_token_accuracy": 0.7810968607664108,
"num_tokens": 8453657.0,
"step": 517
},
{
"entropy": 0.563809260725975,
"epoch": 1.9328358208955225,
"grad_norm": 0.11932537704706192,
"learning_rate": 0.0002,
"loss": 0.5596674680709839,
"mean_token_accuracy": 0.7723207473754883,
"num_tokens": 8470566.0,
"step": 518
},
{
"entropy": 0.550938680768013,
"epoch": 1.9365671641791045,
"grad_norm": 0.13882781565189362,
"learning_rate": 0.0002,
"loss": 0.5502666234970093,
"mean_token_accuracy": 0.7773875147104263,
"num_tokens": 8486896.0,
"step": 519
},
{
"entropy": 0.5509646236896515,
"epoch": 1.9402985074626866,
"grad_norm": 0.11496590822935104,
"learning_rate": 0.0002,
"loss": 0.5537518262863159,
"mean_token_accuracy": 0.7762430608272552,
"num_tokens": 8503486.0,
"step": 520
},
{
"entropy": 0.5208418220281601,
"epoch": 1.9440298507462686,
"grad_norm": 0.12605132162570953,
"learning_rate": 0.0002,
"loss": 0.5253016948699951,
"mean_token_accuracy": 0.7866884917020798,
"num_tokens": 8519722.0,
"step": 521
},
{
"entropy": 0.5348703861236572,
"epoch": 1.9477611940298507,
"grad_norm": 0.13436545431613922,
"learning_rate": 0.0002,
"loss": 0.5429031252861023,
"mean_token_accuracy": 0.7784363180398941,
"num_tokens": 8536094.0,
"step": 522
},
{
"entropy": 0.5374516993761063,
"epoch": 1.9514925373134329,
"grad_norm": 0.1355811506509781,
"learning_rate": 0.0002,
"loss": 0.5394662618637085,
"mean_token_accuracy": 0.7806121855974197,
"num_tokens": 8552288.0,
"step": 523
},
{
"entropy": 0.5625811666250229,
"epoch": 1.955223880597015,
"grad_norm": 0.11836230754852295,
"learning_rate": 0.0002,
"loss": 0.5579893589019775,
"mean_token_accuracy": 0.7714975476264954,
"num_tokens": 8568760.0,
"step": 524
},
{
"entropy": 0.5421487241983414,
"epoch": 1.9589552238805972,
"grad_norm": 0.1359013170003891,
"learning_rate": 0.0002,
"loss": 0.5385461449623108,
"mean_token_accuracy": 0.7821292132139206,
"num_tokens": 8585317.0,
"step": 525
},
{
"entropy": 0.5259972438216209,
"epoch": 1.962686567164179,
"grad_norm": 0.1390962302684784,
"learning_rate": 0.0002,
"loss": 0.5276076793670654,
"mean_token_accuracy": 0.785026952624321,
"num_tokens": 8601637.0,
"step": 526
},
{
"entropy": 0.5354560762643814,
"epoch": 1.966417910447761,
"grad_norm": 0.13758784532546997,
"learning_rate": 0.0002,
"loss": 0.5364598035812378,
"mean_token_accuracy": 0.782847136259079,
"num_tokens": 8617902.0,
"step": 527
},
{
"entropy": 0.5353007912635803,
"epoch": 1.9701492537313432,
"grad_norm": 0.16679321229457855,
"learning_rate": 0.0002,
"loss": 0.5458345413208008,
"mean_token_accuracy": 0.7779222279787064,
"num_tokens": 8634235.0,
"step": 528
},
{
"entropy": 0.5326858758926392,
"epoch": 1.9738805970149254,
"grad_norm": 0.1427498161792755,
"learning_rate": 0.0002,
"loss": 0.5339992642402649,
"mean_token_accuracy": 0.7820619940757751,
"num_tokens": 8650417.0,
"step": 529
},
{
"entropy": 0.5444169491529465,
"epoch": 1.9776119402985075,
"grad_norm": 0.12751619517803192,
"learning_rate": 0.0002,
"loss": 0.5337543487548828,
"mean_token_accuracy": 0.7827389687299728,
"num_tokens": 8666763.0,
"step": 530
},
{
"entropy": 0.5495491325855255,
"epoch": 1.9813432835820897,
"grad_norm": 0.13329073786735535,
"learning_rate": 0.0002,
"loss": 0.5403661131858826,
"mean_token_accuracy": 0.7817551493644714,
"num_tokens": 8683086.0,
"step": 531
},
{
"entropy": 0.545268103480339,
"epoch": 1.9850746268656716,
"grad_norm": 0.1334519237279892,
"learning_rate": 0.0002,
"loss": 0.5446645021438599,
"mean_token_accuracy": 0.7789036780595779,
"num_tokens": 8699314.0,
"step": 532
},
{
"entropy": 0.5360117256641388,
"epoch": 1.9888059701492538,
"grad_norm": 0.1417427510023117,
"learning_rate": 0.0002,
"loss": 0.5377262830734253,
"mean_token_accuracy": 0.782628983259201,
"num_tokens": 8715712.0,
"step": 533
},
{
"entropy": 0.539160817861557,
"epoch": 1.9925373134328357,
"grad_norm": 0.13969334959983826,
"learning_rate": 0.0002,
"loss": 0.5430911779403687,
"mean_token_accuracy": 0.7803932130336761,
"num_tokens": 8732278.0,
"step": 534
},
{
"entropy": 0.5323211252689362,
"epoch": 1.9962686567164178,
"grad_norm": 0.13230480253696442,
"learning_rate": 0.0002,
"loss": 0.5352569818496704,
"mean_token_accuracy": 0.7800516188144684,
"num_tokens": 8748639.0,
"step": 535
},
{
"entropy": 0.5396020114421844,
"epoch": 2.0,
"grad_norm": 0.13588403165340424,
"learning_rate": 0.0002,
"loss": 0.5420472025871277,
"mean_token_accuracy": 0.7812368422746658,
"num_tokens": 8765023.0,
"step": 536
},
{
"entropy": 0.5363707542419434,
"epoch": 2.003731343283582,
"grad_norm": 0.13683520257472992,
"learning_rate": 0.0002,
"loss": 0.5242169499397278,
"mean_token_accuracy": 0.7884830236434937,
"num_tokens": 8781503.0,
"step": 537
},
{
"entropy": 0.5355663001537323,
"epoch": 2.0074626865671643,
"grad_norm": 0.1606767475605011,
"learning_rate": 0.0002,
"loss": 0.5340245962142944,
"mean_token_accuracy": 0.7837463468313217,
"num_tokens": 8797833.0,
"step": 538
},
{
"entropy": 0.5198972821235657,
"epoch": 2.0111940298507465,
"grad_norm": 0.1832306683063507,
"learning_rate": 0.0002,
"loss": 0.5226503014564514,
"mean_token_accuracy": 0.7878277599811554,
"num_tokens": 8814387.0,
"step": 539
},
{
"entropy": 0.5145581886172295,
"epoch": 2.014925373134328,
"grad_norm": 0.14004163444042206,
"learning_rate": 0.0002,
"loss": 0.5142262578010559,
"mean_token_accuracy": 0.7930136620998383,
"num_tokens": 8830769.0,
"step": 540
},
{
"entropy": 0.518964596092701,
"epoch": 2.0186567164179103,
"grad_norm": 0.2391389012336731,
"learning_rate": 0.0002,
"loss": 0.5318617224693298,
"mean_token_accuracy": 0.7879888862371445,
"num_tokens": 8847079.0,
"step": 541
},
{
"entropy": 0.5112362876534462,
"epoch": 2.0223880597014925,
"grad_norm": 0.1571192741394043,
"learning_rate": 0.0002,
"loss": 0.5111895799636841,
"mean_token_accuracy": 0.7941466271877289,
"num_tokens": 8863455.0,
"step": 542
},
{
"entropy": 0.5289383679628372,
"epoch": 2.0261194029850746,
"grad_norm": 0.18859665095806122,
"learning_rate": 0.0002,
"loss": 0.5321269035339355,
"mean_token_accuracy": 0.7850861251354218,
"num_tokens": 8879933.0,
"step": 543
},
{
"entropy": 0.5038495659828186,
"epoch": 2.029850746268657,
"grad_norm": 0.1459927260875702,
"learning_rate": 0.0002,
"loss": 0.5009663105010986,
"mean_token_accuracy": 0.800191804766655,
"num_tokens": 8896279.0,
"step": 544
},
{
"entropy": 0.5393158346414566,
"epoch": 2.033582089552239,
"grad_norm": 0.18940559029579163,
"learning_rate": 0.0002,
"loss": 0.5331785678863525,
"mean_token_accuracy": 0.785183385014534,
"num_tokens": 8912807.0,
"step": 545
},
{
"entropy": 0.5186864137649536,
"epoch": 2.0373134328358207,
"grad_norm": 0.13405749201774597,
"learning_rate": 0.0002,
"loss": 0.5130364894866943,
"mean_token_accuracy": 0.7902890145778656,
"num_tokens": 8929085.0,
"step": 546
},
{
"entropy": 0.517152339220047,
"epoch": 2.041044776119403,
"grad_norm": 0.2357271909713745,
"learning_rate": 0.0002,
"loss": 0.5223183631896973,
"mean_token_accuracy": 0.7909936606884003,
"num_tokens": 8945205.0,
"step": 547
},
{
"entropy": 0.504429779946804,
"epoch": 2.044776119402985,
"grad_norm": 0.16896866261959076,
"learning_rate": 0.0002,
"loss": 0.5084525942802429,
"mean_token_accuracy": 0.7927258014678955,
"num_tokens": 8961586.0,
"step": 548
},
{
"entropy": 0.5195313468575478,
"epoch": 2.048507462686567,
"grad_norm": 0.16998501121997833,
"learning_rate": 0.0002,
"loss": 0.5220100283622742,
"mean_token_accuracy": 0.7873262912034988,
"num_tokens": 8978096.0,
"step": 549
},
{
"entropy": 0.5092991963028908,
"epoch": 2.0522388059701493,
"grad_norm": 0.18961496651172638,
"learning_rate": 0.0002,
"loss": 0.5134435892105103,
"mean_token_accuracy": 0.7906353622674942,
"num_tokens": 8994217.0,
"step": 550
},
{
"entropy": 0.5130208507180214,
"epoch": 2.0559701492537314,
"grad_norm": 0.15812328457832336,
"learning_rate": 0.0002,
"loss": 0.5057437419891357,
"mean_token_accuracy": 0.7933137118816376,
"num_tokens": 9010450.0,
"step": 551
},
{
"entropy": 0.5244034826755524,
"epoch": 2.0597014925373136,
"grad_norm": 0.17014764249324799,
"learning_rate": 0.0002,
"loss": 0.5208017230033875,
"mean_token_accuracy": 0.7864028364419937,
"num_tokens": 9026690.0,
"step": 552
},
{
"entropy": 0.524794228374958,
"epoch": 2.0634328358208953,
"grad_norm": 0.1528615653514862,
"learning_rate": 0.0002,
"loss": 0.5251787900924683,
"mean_token_accuracy": 0.7868095934391022,
"num_tokens": 9042889.0,
"step": 553
},
{
"entropy": 0.525935024023056,
"epoch": 2.0671641791044775,
"grad_norm": 0.1623958796262741,
"learning_rate": 0.0002,
"loss": 0.5336424708366394,
"mean_token_accuracy": 0.7855145633220673,
"num_tokens": 9059267.0,
"step": 554
},
{
"entropy": 0.5195625573396683,
"epoch": 2.0708955223880596,
"grad_norm": 0.17523802816867828,
"learning_rate": 0.0002,
"loss": 0.5209751725196838,
"mean_token_accuracy": 0.7891881316900253,
"num_tokens": 9075744.0,
"step": 555
},
{
"entropy": 0.5318533927202225,
"epoch": 2.074626865671642,
"grad_norm": 0.16624799370765686,
"learning_rate": 0.0002,
"loss": 0.5274427533149719,
"mean_token_accuracy": 0.7851865887641907,
"num_tokens": 9092196.0,
"step": 556
},
{
"entropy": 0.5313673615455627,
"epoch": 2.078358208955224,
"grad_norm": 0.16823066771030426,
"learning_rate": 0.0002,
"loss": 0.5263111591339111,
"mean_token_accuracy": 0.7885167598724365,
"num_tokens": 9108431.0,
"step": 557
},
{
"entropy": 0.518197163939476,
"epoch": 2.082089552238806,
"grad_norm": 0.18068267405033112,
"learning_rate": 0.0002,
"loss": 0.5193851590156555,
"mean_token_accuracy": 0.7903801500797272,
"num_tokens": 9124741.0,
"step": 558
},
{
"entropy": 0.5107997804880142,
"epoch": 2.0858208955223883,
"grad_norm": 0.15915489196777344,
"learning_rate": 0.0002,
"loss": 0.5146846771240234,
"mean_token_accuracy": 0.7921037524938583,
"num_tokens": 9141112.0,
"step": 559
},
{
"entropy": 0.5317652374505997,
"epoch": 2.08955223880597,
"grad_norm": 0.18767035007476807,
"learning_rate": 0.0002,
"loss": 0.5400185585021973,
"mean_token_accuracy": 0.7800605148077011,
"num_tokens": 9157563.0,
"step": 560
},
{
"entropy": 0.5086512267589569,
"epoch": 2.093283582089552,
"grad_norm": 0.1544736921787262,
"learning_rate": 0.0002,
"loss": 0.508223831653595,
"mean_token_accuracy": 0.7939174175262451,
"num_tokens": 9173854.0,
"step": 561
},
{
"entropy": 0.52768574655056,
"epoch": 2.0970149253731343,
"grad_norm": 0.17799650132656097,
"learning_rate": 0.0002,
"loss": 0.5289405584335327,
"mean_token_accuracy": 0.7851383984088898,
"num_tokens": 9190112.0,
"step": 562
},
{
"entropy": 0.5307039618492126,
"epoch": 2.1007462686567164,
"grad_norm": 0.1469665914773941,
"learning_rate": 0.0002,
"loss": 0.5241371989250183,
"mean_token_accuracy": 0.7877105623483658,
"num_tokens": 9206476.0,
"step": 563
},
{
"entropy": 0.517830565571785,
"epoch": 2.1044776119402986,
"grad_norm": 0.1440608948469162,
"learning_rate": 0.0002,
"loss": 0.5123553276062012,
"mean_token_accuracy": 0.7936355024576187,
"num_tokens": 9222843.0,
"step": 564
},
{
"entropy": 0.523407056927681,
"epoch": 2.1082089552238807,
"grad_norm": 0.21014799177646637,
"learning_rate": 0.0002,
"loss": 0.5186851620674133,
"mean_token_accuracy": 0.792457640171051,
"num_tokens": 9239327.0,
"step": 565
},
{
"entropy": 0.5128730833530426,
"epoch": 2.111940298507463,
"grad_norm": 0.2577928602695465,
"learning_rate": 0.0002,
"loss": 0.5269497632980347,
"mean_token_accuracy": 0.7877898067235947,
"num_tokens": 9255586.0,
"step": 566
},
{
"entropy": 0.5238759815692902,
"epoch": 2.1156716417910446,
"grad_norm": 0.1416473388671875,
"learning_rate": 0.0002,
"loss": 0.5266433954238892,
"mean_token_accuracy": 0.7873618602752686,
"num_tokens": 9272236.0,
"step": 567
},
{
"entropy": 0.5273244455456734,
"epoch": 2.1194029850746268,
"grad_norm": 0.1742546260356903,
"learning_rate": 0.0002,
"loss": 0.5227883458137512,
"mean_token_accuracy": 0.7893139868974686,
"num_tokens": 9288429.0,
"step": 568
},
{
"entropy": 0.5123281329870224,
"epoch": 2.123134328358209,
"grad_norm": 0.17472973465919495,
"learning_rate": 0.0002,
"loss": 0.5086967945098877,
"mean_token_accuracy": 0.7941555231809616,
"num_tokens": 9304696.0,
"step": 569
},
{
"entropy": 0.5038742050528526,
"epoch": 2.126865671641791,
"grad_norm": 0.15990978479385376,
"learning_rate": 0.0002,
"loss": 0.5093705058097839,
"mean_token_accuracy": 0.7927817106246948,
"num_tokens": 9320823.0,
"step": 570
},
{
"entropy": 0.5118470937013626,
"epoch": 2.1305970149253732,
"grad_norm": 0.15983271598815918,
"learning_rate": 0.0002,
"loss": 0.5105957388877869,
"mean_token_accuracy": 0.7947766035795212,
"num_tokens": 9337178.0,
"step": 571
},
{
"entropy": 0.5117835849523544,
"epoch": 2.1343283582089554,
"grad_norm": 0.17154565453529358,
"learning_rate": 0.0002,
"loss": 0.5166530609130859,
"mean_token_accuracy": 0.7898510247468948,
"num_tokens": 9353541.0,
"step": 572
},
{
"entropy": 0.524290457367897,
"epoch": 2.138059701492537,
"grad_norm": 0.1809605062007904,
"learning_rate": 0.0002,
"loss": 0.5276108980178833,
"mean_token_accuracy": 0.7894007414579391,
"num_tokens": 9370257.0,
"step": 573
},
{
"entropy": 0.5326485335826874,
"epoch": 2.1417910447761193,
"grad_norm": 0.17269255220890045,
"learning_rate": 0.0002,
"loss": 0.5320166349411011,
"mean_token_accuracy": 0.7842083424329758,
"num_tokens": 9386645.0,
"step": 574
},
{
"entropy": 0.5396575331687927,
"epoch": 2.1455223880597014,
"grad_norm": 0.19763849675655365,
"learning_rate": 0.0002,
"loss": 0.5302010774612427,
"mean_token_accuracy": 0.7843988239765167,
"num_tokens": 9403107.0,
"step": 575
},
{
"entropy": 0.53758405148983,
"epoch": 2.1492537313432836,
"grad_norm": 0.1403210610151291,
"learning_rate": 0.0002,
"loss": 0.5297962427139282,
"mean_token_accuracy": 0.7875841557979584,
"num_tokens": 9419679.0,
"step": 576
},
{
"entropy": 0.5233541131019592,
"epoch": 2.1529850746268657,
"grad_norm": 0.18504074215888977,
"learning_rate": 0.0002,
"loss": 0.5262290835380554,
"mean_token_accuracy": 0.7859254032373428,
"num_tokens": 9436038.0,
"step": 577
},
{
"entropy": 0.5059448033571243,
"epoch": 2.156716417910448,
"grad_norm": 0.18249362707138062,
"learning_rate": 0.0002,
"loss": 0.5139797329902649,
"mean_token_accuracy": 0.7936645895242691,
"num_tokens": 9452416.0,
"step": 578
},
{
"entropy": 0.5189633667469025,
"epoch": 2.16044776119403,
"grad_norm": 0.21265490353107452,
"learning_rate": 0.0002,
"loss": 0.533969521522522,
"mean_token_accuracy": 0.7854558378458023,
"num_tokens": 9468830.0,
"step": 579
},
{
"entropy": 0.5293581038713455,
"epoch": 2.1641791044776117,
"grad_norm": 0.16064560413360596,
"learning_rate": 0.0002,
"loss": 0.5302042961120605,
"mean_token_accuracy": 0.7855220139026642,
"num_tokens": 9485369.0,
"step": 580
},
{
"entropy": 0.5367814004421234,
"epoch": 2.167910447761194,
"grad_norm": 0.1988399475812912,
"learning_rate": 0.0002,
"loss": 0.5316881537437439,
"mean_token_accuracy": 0.7867899537086487,
"num_tokens": 9501506.0,
"step": 581
},
{
"entropy": 0.530438095331192,
"epoch": 2.171641791044776,
"grad_norm": 0.16211427748203278,
"learning_rate": 0.0002,
"loss": 0.5204508304595947,
"mean_token_accuracy": 0.7928901314735413,
"num_tokens": 9517998.0,
"step": 582
},
{
"entropy": 0.538342297077179,
"epoch": 2.175373134328358,
"grad_norm": 0.200654536485672,
"learning_rate": 0.0002,
"loss": 0.5368824005126953,
"mean_token_accuracy": 0.7828831076622009,
"num_tokens": 9534418.0,
"step": 583
},
{
"entropy": 0.5067318677902222,
"epoch": 2.1791044776119404,
"grad_norm": 0.18536439538002014,
"learning_rate": 0.0002,
"loss": 0.5152954459190369,
"mean_token_accuracy": 0.7947442531585693,
"num_tokens": 9550929.0,
"step": 584
},
{
"entropy": 0.5143613219261169,
"epoch": 2.1828358208955225,
"grad_norm": 0.18734246492385864,
"learning_rate": 0.0002,
"loss": 0.5320346355438232,
"mean_token_accuracy": 0.7830832600593567,
"num_tokens": 9567052.0,
"step": 585
},
{
"entropy": 0.5134065821766853,
"epoch": 2.1865671641791047,
"grad_norm": 0.1658649444580078,
"learning_rate": 0.0002,
"loss": 0.5137937664985657,
"mean_token_accuracy": 0.792109802365303,
"num_tokens": 9583328.0,
"step": 586
},
{
"entropy": 0.5145891755819321,
"epoch": 2.1902985074626864,
"grad_norm": 0.20381639897823334,
"learning_rate": 0.0002,
"loss": 0.5113189220428467,
"mean_token_accuracy": 0.791796863079071,
"num_tokens": 9599639.0,
"step": 587
},
{
"entropy": 0.5297699421644211,
"epoch": 2.1940298507462686,
"grad_norm": 0.1610771119594574,
"learning_rate": 0.0002,
"loss": 0.5239428877830505,
"mean_token_accuracy": 0.7868966311216354,
"num_tokens": 9616107.0,
"step": 588
},
{
"entropy": 0.5139229521155357,
"epoch": 2.1977611940298507,
"grad_norm": 0.16601988673210144,
"learning_rate": 0.0002,
"loss": 0.5093111991882324,
"mean_token_accuracy": 0.7953454554080963,
"num_tokens": 9632478.0,
"step": 589
},
{
"entropy": 0.5277693122625351,
"epoch": 2.201492537313433,
"grad_norm": 0.15310561656951904,
"learning_rate": 0.0002,
"loss": 0.5306464433670044,
"mean_token_accuracy": 0.785234808921814,
"num_tokens": 9648606.0,
"step": 590
},
{
"entropy": 0.5277083218097687,
"epoch": 2.205223880597015,
"grad_norm": 0.17894159257411957,
"learning_rate": 0.0002,
"loss": 0.5229562520980835,
"mean_token_accuracy": 0.7855621576309204,
"num_tokens": 9664853.0,
"step": 591
},
{
"entropy": 0.5369253158569336,
"epoch": 2.208955223880597,
"grad_norm": 0.17260174453258514,
"learning_rate": 0.0002,
"loss": 0.5379320383071899,
"mean_token_accuracy": 0.785187691450119,
"num_tokens": 9681395.0,
"step": 592
},
{
"entropy": 0.51601941883564,
"epoch": 2.2126865671641793,
"grad_norm": 0.19144131243228912,
"learning_rate": 0.0002,
"loss": 0.525420606136322,
"mean_token_accuracy": 0.7879699319601059,
"num_tokens": 9697832.0,
"step": 593
},
{
"entropy": 0.5305543690919876,
"epoch": 2.216417910447761,
"grad_norm": 0.152136892080307,
"learning_rate": 0.0002,
"loss": 0.5263657569885254,
"mean_token_accuracy": 0.7852640599012375,
"num_tokens": 9714327.0,
"step": 594
},
{
"entropy": 0.5374766737222672,
"epoch": 2.220149253731343,
"grad_norm": 0.18577203154563904,
"learning_rate": 0.0002,
"loss": 0.538034975528717,
"mean_token_accuracy": 0.7831636220216751,
"num_tokens": 9730796.0,
"step": 595
},
{
"entropy": 0.5116140991449356,
"epoch": 2.2238805970149254,
"grad_norm": 0.15658536553382874,
"learning_rate": 0.0002,
"loss": 0.5068283081054688,
"mean_token_accuracy": 0.7946771383285522,
"num_tokens": 9747017.0,
"step": 596
},
{
"entropy": 0.5136987864971161,
"epoch": 2.2276119402985075,
"grad_norm": 0.15834017097949982,
"learning_rate": 0.0002,
"loss": 0.518505334854126,
"mean_token_accuracy": 0.7908380329608917,
"num_tokens": 9763200.0,
"step": 597
},
{
"entropy": 0.48786860704421997,
"epoch": 2.2313432835820897,
"grad_norm": 0.16836979985237122,
"learning_rate": 0.0002,
"loss": 0.4918700158596039,
"mean_token_accuracy": 0.8017545938491821,
"num_tokens": 9779342.0,
"step": 598
},
{
"entropy": 0.511562891304493,
"epoch": 2.235074626865672,
"grad_norm": 0.19002674520015717,
"learning_rate": 0.0002,
"loss": 0.5156916975975037,
"mean_token_accuracy": 0.7910201996564865,
"num_tokens": 9795546.0,
"step": 599
},
{
"entropy": 0.5209366902709007,
"epoch": 2.2388059701492535,
"grad_norm": 0.17156340181827545,
"learning_rate": 0.0002,
"loss": 0.515453040599823,
"mean_token_accuracy": 0.7911808788776398,
"num_tokens": 9811678.0,
"step": 600
},
{
"entropy": 0.5190790444612503,
"epoch": 2.2425373134328357,
"grad_norm": 0.16390037536621094,
"learning_rate": 0.0002,
"loss": 0.5197610259056091,
"mean_token_accuracy": 0.791000559926033,
"num_tokens": 9827971.0,
"step": 601
},
{
"entropy": 0.534053236246109,
"epoch": 2.246268656716418,
"grad_norm": 0.17688144743442535,
"learning_rate": 0.0002,
"loss": 0.5342822074890137,
"mean_token_accuracy": 0.7848292291164398,
"num_tokens": 9844391.0,
"step": 602
},
{
"entropy": 0.5072491243481636,
"epoch": 2.25,
"grad_norm": 0.15552373230457306,
"learning_rate": 0.0002,
"loss": 0.5125934481620789,
"mean_token_accuracy": 0.79164819419384,
"num_tokens": 9860695.0,
"step": 603
},
{
"entropy": 0.5196588039398193,
"epoch": 2.253731343283582,
"grad_norm": 0.20500463247299194,
"learning_rate": 0.0002,
"loss": 0.5203579664230347,
"mean_token_accuracy": 0.7872295528650284,
"num_tokens": 9876962.0,
"step": 604
},
{
"entropy": 0.5224801748991013,
"epoch": 2.2574626865671643,
"grad_norm": 0.16438624262809753,
"learning_rate": 0.0002,
"loss": 0.517778217792511,
"mean_token_accuracy": 0.7902567535638809,
"num_tokens": 9893378.0,
"step": 605
},
{
"entropy": 0.5315049141645432,
"epoch": 2.2611940298507465,
"grad_norm": 0.19314803183078766,
"learning_rate": 0.0002,
"loss": 0.5378735065460205,
"mean_token_accuracy": 0.7826669216156006,
"num_tokens": 9909658.0,
"step": 606
},
{
"entropy": 0.5268717259168625,
"epoch": 2.264925373134328,
"grad_norm": 0.1703607141971588,
"learning_rate": 0.0002,
"loss": 0.5323152542114258,
"mean_token_accuracy": 0.7835480719804764,
"num_tokens": 9926026.0,
"step": 607
},
{
"entropy": 0.5275075733661652,
"epoch": 2.2686567164179103,
"grad_norm": 0.1891828328371048,
"learning_rate": 0.0002,
"loss": 0.523108959197998,
"mean_token_accuracy": 0.7864743769168854,
"num_tokens": 9942362.0,
"step": 608
},
{
"entropy": 0.5301201939582825,
"epoch": 2.2723880597014925,
"grad_norm": 0.16404391825199127,
"learning_rate": 0.0002,
"loss": 0.5282193422317505,
"mean_token_accuracy": 0.7837762832641602,
"num_tokens": 9958517.0,
"step": 609
},
{
"entropy": 0.5198077484965324,
"epoch": 2.2761194029850746,
"grad_norm": 0.1796608716249466,
"learning_rate": 0.0002,
"loss": 0.5138813853263855,
"mean_token_accuracy": 0.7904112935066223,
"num_tokens": 9974864.0,
"step": 610
},
{
"entropy": 0.5151881948113441,
"epoch": 2.279850746268657,
"grad_norm": 0.1921297013759613,
"learning_rate": 0.0002,
"loss": 0.5276269912719727,
"mean_token_accuracy": 0.7861463725566864,
"num_tokens": 9990982.0,
"step": 611
},
{
"entropy": 0.5268184095621109,
"epoch": 2.283582089552239,
"grad_norm": 0.3107461929321289,
"learning_rate": 0.0002,
"loss": 0.5354833006858826,
"mean_token_accuracy": 0.7860495001077652,
"num_tokens": 10007390.0,
"step": 612
},
{
"entropy": 0.5362572968006134,
"epoch": 2.2873134328358207,
"grad_norm": 0.2291727513074875,
"learning_rate": 0.0002,
"loss": 0.5278795957565308,
"mean_token_accuracy": 0.7864319235086441,
"num_tokens": 10023741.0,
"step": 613
},
{
"entropy": 0.5297401547431946,
"epoch": 2.291044776119403,
"grad_norm": 0.22683671116828918,
"learning_rate": 0.0002,
"loss": 0.5257067680358887,
"mean_token_accuracy": 0.7868115305900574,
"num_tokens": 10040185.0,
"step": 614
},
{
"entropy": 0.5152234882116318,
"epoch": 2.294776119402985,
"grad_norm": 0.20225822925567627,
"learning_rate": 0.0002,
"loss": 0.5109996795654297,
"mean_token_accuracy": 0.7922611236572266,
"num_tokens": 10056416.0,
"step": 615
},
{
"entropy": 0.5397164672613144,
"epoch": 2.298507462686567,
"grad_norm": 0.21879570186138153,
"learning_rate": 0.0002,
"loss": 0.53910893201828,
"mean_token_accuracy": 0.7829782217741013,
"num_tokens": 10073119.0,
"step": 616
},
{
"entropy": 0.523445226252079,
"epoch": 2.3022388059701493,
"grad_norm": 0.2043614238500595,
"learning_rate": 0.0002,
"loss": 0.5277411341667175,
"mean_token_accuracy": 0.7879920601844788,
"num_tokens": 10089539.0,
"step": 617
},
{
"entropy": 0.5420306473970413,
"epoch": 2.3059701492537314,
"grad_norm": 0.16890020668506622,
"learning_rate": 0.0002,
"loss": 0.5416600704193115,
"mean_token_accuracy": 0.7815042287111282,
"num_tokens": 10105674.0,
"step": 618
},
{
"entropy": 0.5223758220672607,
"epoch": 2.3097014925373136,
"grad_norm": 0.187328040599823,
"learning_rate": 0.0002,
"loss": 0.5208746790885925,
"mean_token_accuracy": 0.7938240319490433,
"num_tokens": 10121685.0,
"step": 619
},
{
"entropy": 0.5317254960536957,
"epoch": 2.3134328358208958,
"grad_norm": 0.17246371507644653,
"learning_rate": 0.0002,
"loss": 0.5258828997612,
"mean_token_accuracy": 0.7855419665575027,
"num_tokens": 10138380.0,
"step": 620
},
{
"entropy": 0.510456420481205,
"epoch": 2.3171641791044775,
"grad_norm": 0.17611362040042877,
"learning_rate": 0.0002,
"loss": 0.5174400806427002,
"mean_token_accuracy": 0.790027379989624,
"num_tokens": 10154639.0,
"step": 621
},
{
"entropy": 0.5146428272128105,
"epoch": 2.3208955223880596,
"grad_norm": 0.19471095502376556,
"learning_rate": 0.0002,
"loss": 0.5222116708755493,
"mean_token_accuracy": 0.7890471816062927,
"num_tokens": 10170992.0,
"step": 622
},
{
"entropy": 0.5554968118667603,
"epoch": 2.324626865671642,
"grad_norm": 0.15456657111644745,
"learning_rate": 0.0002,
"loss": 0.5553091168403625,
"mean_token_accuracy": 0.7767172753810883,
"num_tokens": 10187415.0,
"step": 623
},
{
"entropy": 0.5297296196222305,
"epoch": 2.328358208955224,
"grad_norm": 0.17202581465244293,
"learning_rate": 0.0002,
"loss": 0.5306862592697144,
"mean_token_accuracy": 0.7859676033258438,
"num_tokens": 10204041.0,
"step": 624
},
{
"entropy": 0.5107762217521667,
"epoch": 2.332089552238806,
"grad_norm": 0.17404352128505707,
"learning_rate": 0.0002,
"loss": 0.5129390358924866,
"mean_token_accuracy": 0.7931138426065445,
"num_tokens": 10220300.0,
"step": 625
},
{
"entropy": 0.5258396938443184,
"epoch": 2.3358208955223883,
"grad_norm": 0.18174229562282562,
"learning_rate": 0.0002,
"loss": 0.5229369401931763,
"mean_token_accuracy": 0.7888091504573822,
"num_tokens": 10236649.0,
"step": 626
},
{
"entropy": 0.5380365252494812,
"epoch": 2.33955223880597,
"grad_norm": 0.17537739872932434,
"learning_rate": 0.0002,
"loss": 0.5373145937919617,
"mean_token_accuracy": 0.7832024991512299,
"num_tokens": 10252909.0,
"step": 627
},
{
"entropy": 0.5075801610946655,
"epoch": 2.343283582089552,
"grad_norm": 0.22284290194511414,
"learning_rate": 0.0002,
"loss": 0.511396586894989,
"mean_token_accuracy": 0.7928276360034943,
"num_tokens": 10269280.0,
"step": 628
},
{
"entropy": 0.5164258778095245,
"epoch": 2.3470149253731343,
"grad_norm": 0.18526744842529297,
"learning_rate": 0.0002,
"loss": 0.5178982019424438,
"mean_token_accuracy": 0.7898775935173035,
"num_tokens": 10285761.0,
"step": 629
},
{
"entropy": 0.5200358033180237,
"epoch": 2.3507462686567164,
"grad_norm": 0.20576190948486328,
"learning_rate": 0.0002,
"loss": 0.5253298878669739,
"mean_token_accuracy": 0.7885328382253647,
"num_tokens": 10301941.0,
"step": 630
},
{
"entropy": 0.5383775234222412,
"epoch": 2.3544776119402986,
"grad_norm": 0.17617975175380707,
"learning_rate": 0.0002,
"loss": 0.5448250770568848,
"mean_token_accuracy": 0.782653346657753,
"num_tokens": 10318486.0,
"step": 631
},
{
"entropy": 0.5118822678923607,
"epoch": 2.3582089552238807,
"grad_norm": 0.18932130932807922,
"learning_rate": 0.0002,
"loss": 0.5223209857940674,
"mean_token_accuracy": 0.7917590737342834,
"num_tokens": 10334530.0,
"step": 632
},
{
"entropy": 0.5191465318202972,
"epoch": 2.361940298507463,
"grad_norm": 0.18021032214164734,
"learning_rate": 0.0002,
"loss": 0.5152462124824524,
"mean_token_accuracy": 0.791267067193985,
"num_tokens": 10350724.0,
"step": 633
},
{
"entropy": 0.5144938305020332,
"epoch": 2.3656716417910446,
"grad_norm": 0.15109598636627197,
"learning_rate": 0.0002,
"loss": 0.4982617497444153,
"mean_token_accuracy": 0.7967542856931686,
"num_tokens": 10366875.0,
"step": 634
},
{
"entropy": 0.5065358951687813,
"epoch": 2.3694029850746268,
"grad_norm": 0.18718236684799194,
"learning_rate": 0.0002,
"loss": 0.4973527193069458,
"mean_token_accuracy": 0.8017638623714447,
"num_tokens": 10383005.0,
"step": 635
},
{
"entropy": 0.530413880944252,
"epoch": 2.373134328358209,
"grad_norm": 0.1718485951423645,
"learning_rate": 0.0002,
"loss": 0.5324255228042603,
"mean_token_accuracy": 0.7831610143184662,
"num_tokens": 10399588.0,
"step": 636
},
{
"entropy": 0.5436315685510635,
"epoch": 2.376865671641791,
"grad_norm": 0.20064882934093475,
"learning_rate": 0.0002,
"loss": 0.5518239140510559,
"mean_token_accuracy": 0.7763282507658005,
"num_tokens": 10416058.0,
"step": 637
},
{
"entropy": 0.5224271416664124,
"epoch": 2.3805970149253732,
"grad_norm": 0.18303366005420685,
"learning_rate": 0.0002,
"loss": 0.5248957872390747,
"mean_token_accuracy": 0.7867279052734375,
"num_tokens": 10432139.0,
"step": 638
},
{
"entropy": 0.5115847885608673,
"epoch": 2.3843283582089554,
"grad_norm": 0.18415044248104095,
"learning_rate": 0.0002,
"loss": 0.5158942937850952,
"mean_token_accuracy": 0.7931726425886154,
"num_tokens": 10448181.0,
"step": 639
},
{
"entropy": 0.5335763245820999,
"epoch": 2.388059701492537,
"grad_norm": 0.17970694601535797,
"learning_rate": 0.0002,
"loss": 0.5286952257156372,
"mean_token_accuracy": 0.7878449261188507,
"num_tokens": 10464583.0,
"step": 640
},
{
"entropy": 0.5233506336808205,
"epoch": 2.3917910447761193,
"grad_norm": 0.19122423231601715,
"learning_rate": 0.0002,
"loss": 0.5172105431556702,
"mean_token_accuracy": 0.7892956882715225,
"num_tokens": 10481023.0,
"step": 641
},
{
"entropy": 0.5129317939281464,
"epoch": 2.3955223880597014,
"grad_norm": 0.16389286518096924,
"learning_rate": 0.0002,
"loss": 0.5165532231330872,
"mean_token_accuracy": 0.7895939499139786,
"num_tokens": 10497404.0,
"step": 642
},
{
"entropy": 0.5067487806081772,
"epoch": 2.3992537313432836,
"grad_norm": 0.17685648798942566,
"learning_rate": 0.0002,
"loss": 0.5114090442657471,
"mean_token_accuracy": 0.79579958319664,
"num_tokens": 10513777.0,
"step": 643
},
{
"entropy": 0.5056411698460579,
"epoch": 2.4029850746268657,
"grad_norm": 0.20632798969745636,
"learning_rate": 0.0002,
"loss": 0.512579083442688,
"mean_token_accuracy": 0.7917985171079636,
"num_tokens": 10530002.0,
"step": 644
},
{
"entropy": 0.503575325012207,
"epoch": 2.406716417910448,
"grad_norm": 0.18627490103244781,
"learning_rate": 0.0002,
"loss": 0.5137442350387573,
"mean_token_accuracy": 0.7893558740615845,
"num_tokens": 10546273.0,
"step": 645
},
{
"entropy": 0.5291843265295029,
"epoch": 2.41044776119403,
"grad_norm": 0.16846197843551636,
"learning_rate": 0.0002,
"loss": 0.5265457630157471,
"mean_token_accuracy": 0.7875650376081467,
"num_tokens": 10562590.0,
"step": 646
},
{
"entropy": 0.5421585887670517,
"epoch": 2.4141791044776117,
"grad_norm": 0.17224395275115967,
"learning_rate": 0.0002,
"loss": 0.5339004993438721,
"mean_token_accuracy": 0.7843624651432037,
"num_tokens": 10578951.0,
"step": 647
},
{
"entropy": 0.5322060137987137,
"epoch": 2.417910447761194,
"grad_norm": 0.15629476308822632,
"learning_rate": 0.0002,
"loss": 0.5219835638999939,
"mean_token_accuracy": 0.7886752039194107,
"num_tokens": 10595214.0,
"step": 648
},
{
"entropy": 0.5281577706336975,
"epoch": 2.421641791044776,
"grad_norm": 0.18105372786521912,
"learning_rate": 0.0002,
"loss": 0.5306849479675293,
"mean_token_accuracy": 0.7853680700063705,
"num_tokens": 10611701.0,
"step": 649
},
{
"entropy": 0.5248554199934006,
"epoch": 2.425373134328358,
"grad_norm": 0.16688814759254456,
"learning_rate": 0.0002,
"loss": 0.5278753638267517,
"mean_token_accuracy": 0.7852373868227005,
"num_tokens": 10628217.0,
"step": 650
},
{
"entropy": 0.5284415632486343,
"epoch": 2.4291044776119404,
"grad_norm": 0.1766011267900467,
"learning_rate": 0.0002,
"loss": 0.5336297750473022,
"mean_token_accuracy": 0.7854758203029633,
"num_tokens": 10644808.0,
"step": 651
},
{
"entropy": 0.522301472723484,
"epoch": 2.4328358208955225,
"grad_norm": 0.1673455685377121,
"learning_rate": 0.0002,
"loss": 0.5260990262031555,
"mean_token_accuracy": 0.7875321805477142,
"num_tokens": 10661415.0,
"step": 652
},
{
"entropy": 0.5340454131364822,
"epoch": 2.4365671641791042,
"grad_norm": 0.1705857813358307,
"learning_rate": 0.0002,
"loss": 0.5287991166114807,
"mean_token_accuracy": 0.7848271727561951,
"num_tokens": 10678098.0,
"step": 653
},
{
"entropy": 0.5536000281572342,
"epoch": 2.4402985074626864,
"grad_norm": 0.16633524000644684,
"learning_rate": 0.0002,
"loss": 0.5458575487136841,
"mean_token_accuracy": 0.7790239751338959,
"num_tokens": 10694453.0,
"step": 654
},
{
"entropy": 0.5396594703197479,
"epoch": 2.4440298507462686,
"grad_norm": 0.1658376157283783,
"learning_rate": 0.0002,
"loss": 0.5348730683326721,
"mean_token_accuracy": 0.7840123027563095,
"num_tokens": 10710682.0,
"step": 655
},
{
"entropy": 0.5132960826158524,
"epoch": 2.4477611940298507,
"grad_norm": 0.16822409629821777,
"learning_rate": 0.0002,
"loss": 0.5173973441123962,
"mean_token_accuracy": 0.7915854156017303,
"num_tokens": 10726882.0,
"step": 656
},
{
"entropy": 0.504063256084919,
"epoch": 2.451492537313433,
"grad_norm": 0.21201510727405548,
"learning_rate": 0.0002,
"loss": 0.5162043571472168,
"mean_token_accuracy": 0.7916038483381271,
"num_tokens": 10743326.0,
"step": 657
},
{
"entropy": 0.5151261985301971,
"epoch": 2.455223880597015,
"grad_norm": 0.22159790992736816,
"learning_rate": 0.0002,
"loss": 0.5307928323745728,
"mean_token_accuracy": 0.783583402633667,
"num_tokens": 10759068.0,
"step": 658
},
{
"entropy": 0.5228653997182846,
"epoch": 2.458955223880597,
"grad_norm": 0.1764376312494278,
"learning_rate": 0.0002,
"loss": 0.526711106300354,
"mean_token_accuracy": 0.785754069685936,
"num_tokens": 10775538.0,
"step": 659
},
{
"entropy": 0.5352444350719452,
"epoch": 2.4626865671641793,
"grad_norm": 0.1673639416694641,
"learning_rate": 0.0002,
"loss": 0.53009432554245,
"mean_token_accuracy": 0.7853073179721832,
"num_tokens": 10791878.0,
"step": 660
},
{
"entropy": 0.5250429213047028,
"epoch": 2.466417910447761,
"grad_norm": 0.1584668755531311,
"learning_rate": 0.0002,
"loss": 0.5163600444793701,
"mean_token_accuracy": 0.7921949625015259,
"num_tokens": 10808194.0,
"step": 661
},
{
"entropy": 0.531511977314949,
"epoch": 2.470149253731343,
"grad_norm": 0.15331409871578217,
"learning_rate": 0.0002,
"loss": 0.52297043800354,
"mean_token_accuracy": 0.7875395864248276,
"num_tokens": 10824487.0,
"step": 662
},
{
"entropy": 0.5337095707654953,
"epoch": 2.4738805970149254,
"grad_norm": 0.1537831574678421,
"learning_rate": 0.0002,
"loss": 0.5269461870193481,
"mean_token_accuracy": 0.7883634269237518,
"num_tokens": 10840768.0,
"step": 663
},
{
"entropy": 0.5136477053165436,
"epoch": 2.4776119402985075,
"grad_norm": 0.1710546612739563,
"learning_rate": 0.0002,
"loss": 0.5147293210029602,
"mean_token_accuracy": 0.790741965174675,
"num_tokens": 10857093.0,
"step": 664
},
{
"entropy": 0.5279193222522736,
"epoch": 2.4813432835820897,
"grad_norm": 0.18926194310188293,
"learning_rate": 0.0002,
"loss": 0.5373238921165466,
"mean_token_accuracy": 0.7801239043474197,
"num_tokens": 10873516.0,
"step": 665
},
{
"entropy": 0.5202833041548729,
"epoch": 2.485074626865672,
"grad_norm": 0.18720589578151703,
"learning_rate": 0.0002,
"loss": 0.5260710120201111,
"mean_token_accuracy": 0.7854216694831848,
"num_tokens": 10889866.0,
"step": 666
},
{
"entropy": 0.5342879593372345,
"epoch": 2.4888059701492535,
"grad_norm": 0.16395018994808197,
"learning_rate": 0.0002,
"loss": 0.5291630625724792,
"mean_token_accuracy": 0.786442369222641,
"num_tokens": 10906265.0,
"step": 667
},
{
"entropy": 0.5179769471287727,
"epoch": 2.4925373134328357,
"grad_norm": 0.18135614693164825,
"learning_rate": 0.0002,
"loss": 0.5105394721031189,
"mean_token_accuracy": 0.7919545620679855,
"num_tokens": 10922859.0,
"step": 668
},
{
"entropy": 0.5149218291044235,
"epoch": 2.496268656716418,
"grad_norm": 0.16995131969451904,
"learning_rate": 0.0002,
"loss": 0.5147515535354614,
"mean_token_accuracy": 0.7931389808654785,
"num_tokens": 10938918.0,
"step": 669
},
{
"entropy": 0.5330513119697571,
"epoch": 2.5,
"grad_norm": 0.1602948158979416,
"learning_rate": 0.0002,
"loss": 0.5284178256988525,
"mean_token_accuracy": 0.7882454097270966,
"num_tokens": 10955263.0,
"step": 670
},
{
"entropy": 0.5100918263196945,
"epoch": 2.503731343283582,
"grad_norm": 0.1638704538345337,
"learning_rate": 0.0002,
"loss": 0.5109102725982666,
"mean_token_accuracy": 0.7914802730083466,
"num_tokens": 10971573.0,
"step": 671
},
{
"entropy": 0.5232444852590561,
"epoch": 2.5074626865671643,
"grad_norm": 0.17863468825817108,
"learning_rate": 0.0002,
"loss": 0.527701735496521,
"mean_token_accuracy": 0.7854352295398712,
"num_tokens": 10987693.0,
"step": 672
},
{
"entropy": 0.5050330087542534,
"epoch": 2.5111940298507465,
"grad_norm": 0.18801726400852203,
"learning_rate": 0.0002,
"loss": 0.5186895728111267,
"mean_token_accuracy": 0.7896755188703537,
"num_tokens": 11003802.0,
"step": 673
},
{
"entropy": 0.5354911088943481,
"epoch": 2.5149253731343286,
"grad_norm": 0.1630580574274063,
"learning_rate": 0.0002,
"loss": 0.5393661856651306,
"mean_token_accuracy": 0.7806737869977951,
"num_tokens": 11020382.0,
"step": 674
},
{
"entropy": 0.5103952214121819,
"epoch": 2.5186567164179103,
"grad_norm": 0.16479070484638214,
"learning_rate": 0.0002,
"loss": 0.5052312016487122,
"mean_token_accuracy": 0.79300656914711,
"num_tokens": 11036684.0,
"step": 675
},
{
"entropy": 0.5548539459705353,
"epoch": 2.5223880597014925,
"grad_norm": 0.15993361175060272,
"learning_rate": 0.0002,
"loss": 0.5424168109893799,
"mean_token_accuracy": 0.7810866236686707,
"num_tokens": 11053105.0,
"step": 676
},
{
"entropy": 0.5318550616502762,
"epoch": 2.5261194029850746,
"grad_norm": 0.17689482867717743,
"learning_rate": 0.0002,
"loss": 0.5247601270675659,
"mean_token_accuracy": 0.7856518179178238,
"num_tokens": 11069578.0,
"step": 677
},
{
"entropy": 0.5139466673135757,
"epoch": 2.529850746268657,
"grad_norm": 0.17671139538288116,
"learning_rate": 0.0002,
"loss": 0.5161247253417969,
"mean_token_accuracy": 0.7908915132284164,
"num_tokens": 11085697.0,
"step": 678
},
{
"entropy": 0.5080201476812363,
"epoch": 2.533582089552239,
"grad_norm": 0.2036965787410736,
"learning_rate": 0.0002,
"loss": 0.5175144672393799,
"mean_token_accuracy": 0.791350468993187,
"num_tokens": 11101902.0,
"step": 679
},
{
"entropy": 0.5312675833702087,
"epoch": 2.5373134328358207,
"grad_norm": 0.19512657821178436,
"learning_rate": 0.0002,
"loss": 0.5406134128570557,
"mean_token_accuracy": 0.7809882313013077,
"num_tokens": 11118259.0,
"step": 680
},
{
"entropy": 0.5147824436426163,
"epoch": 2.541044776119403,
"grad_norm": 0.223260298371315,
"learning_rate": 0.0002,
"loss": 0.5146397948265076,
"mean_token_accuracy": 0.7933319509029388,
"num_tokens": 11134757.0,
"step": 681
},
{
"entropy": 0.5265121906995773,
"epoch": 2.544776119402985,
"grad_norm": 0.17229494452476501,
"learning_rate": 0.0002,
"loss": 0.5215858221054077,
"mean_token_accuracy": 0.7878258526325226,
"num_tokens": 11150969.0,
"step": 682
},
{
"entropy": 0.5460138469934464,
"epoch": 2.548507462686567,
"grad_norm": 0.16450214385986328,
"learning_rate": 0.0002,
"loss": 0.5474146604537964,
"mean_token_accuracy": 0.7795809954404831,
"num_tokens": 11167094.0,
"step": 683
},
{
"entropy": 0.5366989523172379,
"epoch": 2.5522388059701493,
"grad_norm": 0.20410536229610443,
"learning_rate": 0.0002,
"loss": 0.5371419787406921,
"mean_token_accuracy": 0.7853393852710724,
"num_tokens": 11183515.0,
"step": 684
},
{
"entropy": 0.5475771278142929,
"epoch": 2.5559701492537314,
"grad_norm": 0.1698704957962036,
"learning_rate": 0.0002,
"loss": 0.5460457801818848,
"mean_token_accuracy": 0.781210407614708,
"num_tokens": 11200139.0,
"step": 685
},
{
"entropy": 0.5389831364154816,
"epoch": 2.5597014925373136,
"grad_norm": 0.22744543850421906,
"learning_rate": 0.0002,
"loss": 0.5387647747993469,
"mean_token_accuracy": 0.7828833609819412,
"num_tokens": 11216497.0,
"step": 686
},
{
"entropy": 0.531368613243103,
"epoch": 2.5634328358208958,
"grad_norm": 0.17488178610801697,
"learning_rate": 0.0002,
"loss": 0.5309722423553467,
"mean_token_accuracy": 0.7842755913734436,
"num_tokens": 11232676.0,
"step": 687
},
{
"entropy": 0.5410369485616684,
"epoch": 2.5671641791044775,
"grad_norm": 0.1710905283689499,
"learning_rate": 0.0002,
"loss": 0.5380433797836304,
"mean_token_accuracy": 0.7851070165634155,
"num_tokens": 11249092.0,
"step": 688
},
{
"entropy": 0.5218508541584015,
"epoch": 2.5708955223880596,
"grad_norm": 0.2351209968328476,
"learning_rate": 0.0002,
"loss": 0.5304785966873169,
"mean_token_accuracy": 0.7837776988744736,
"num_tokens": 11265168.0,
"step": 689
},
{
"entropy": 0.5149262696504593,
"epoch": 2.574626865671642,
"grad_norm": 0.15611964464187622,
"learning_rate": 0.0002,
"loss": 0.5160297155380249,
"mean_token_accuracy": 0.7932045161724091,
"num_tokens": 11281641.0,
"step": 690
},
{
"entropy": 0.5153379887342453,
"epoch": 2.578358208955224,
"grad_norm": 0.23146718740463257,
"learning_rate": 0.0002,
"loss": 0.5226321220397949,
"mean_token_accuracy": 0.787521630525589,
"num_tokens": 11298142.0,
"step": 691
},
{
"entropy": 0.5393347591161728,
"epoch": 2.582089552238806,
"grad_norm": 0.16657157242298126,
"learning_rate": 0.0002,
"loss": 0.5344167351722717,
"mean_token_accuracy": 0.7832511067390442,
"num_tokens": 11314425.0,
"step": 692
},
{
"entropy": 0.5284578949213028,
"epoch": 2.585820895522388,
"grad_norm": 0.2301884889602661,
"learning_rate": 0.0002,
"loss": 0.5258397459983826,
"mean_token_accuracy": 0.787845253944397,
"num_tokens": 11330672.0,
"step": 693
},
{
"entropy": 0.5345947295427322,
"epoch": 2.58955223880597,
"grad_norm": 0.17253969609737396,
"learning_rate": 0.0002,
"loss": 0.5329262018203735,
"mean_token_accuracy": 0.783668577671051,
"num_tokens": 11346999.0,
"step": 694
},
{
"entropy": 0.5287525057792664,
"epoch": 2.593283582089552,
"grad_norm": 0.1584477573633194,
"learning_rate": 0.0002,
"loss": 0.5283543467521667,
"mean_token_accuracy": 0.7880005240440369,
"num_tokens": 11363488.0,
"step": 695
},
{
"entropy": 0.5259083658456802,
"epoch": 2.5970149253731343,
"grad_norm": 0.18429915606975555,
"learning_rate": 0.0002,
"loss": 0.5257930159568787,
"mean_token_accuracy": 0.7871210873126984,
"num_tokens": 11379993.0,
"step": 696
},
{
"entropy": 0.5198669880628586,
"epoch": 2.6007462686567164,
"grad_norm": 0.19845134019851685,
"learning_rate": 0.0002,
"loss": 0.5221295356750488,
"mean_token_accuracy": 0.7895113527774811,
"num_tokens": 11396236.0,
"step": 697
},
{
"entropy": 0.5398612767457962,
"epoch": 2.6044776119402986,
"grad_norm": 0.19270583987236023,
"learning_rate": 0.0002,
"loss": 0.5429852604866028,
"mean_token_accuracy": 0.7811529338359833,
"num_tokens": 11412613.0,
"step": 698
},
{
"entropy": 0.5187375992536545,
"epoch": 2.6082089552238807,
"grad_norm": 0.18094319105148315,
"learning_rate": 0.0002,
"loss": 0.5167657136917114,
"mean_token_accuracy": 0.790035143494606,
"num_tokens": 11428870.0,
"step": 699
},
{
"entropy": 0.5331326425075531,
"epoch": 2.611940298507463,
"grad_norm": 0.16809140145778656,
"learning_rate": 0.0002,
"loss": 0.5311716794967651,
"mean_token_accuracy": 0.7813376784324646,
"num_tokens": 11445541.0,
"step": 700
},
{
"entropy": 0.5317347943782806,
"epoch": 2.6156716417910446,
"grad_norm": 0.2061910331249237,
"learning_rate": 0.0002,
"loss": 0.5366970896720886,
"mean_token_accuracy": 0.7823969423770905,
"num_tokens": 11461869.0,
"step": 701
},
{
"entropy": 0.5304048359394073,
"epoch": 2.6194029850746268,
"grad_norm": 0.15473014116287231,
"learning_rate": 0.0002,
"loss": 0.5267943143844604,
"mean_token_accuracy": 0.7864733040332794,
"num_tokens": 11478245.0,
"step": 702
},
{
"entropy": 0.528009369969368,
"epoch": 2.623134328358209,
"grad_norm": 0.2206811010837555,
"learning_rate": 0.0002,
"loss": 0.528520941734314,
"mean_token_accuracy": 0.7848467379808426,
"num_tokens": 11494601.0,
"step": 703
},
{
"entropy": 0.5367393791675568,
"epoch": 2.626865671641791,
"grad_norm": 0.17169888317584991,
"learning_rate": 0.0002,
"loss": 0.5352901816368103,
"mean_token_accuracy": 0.7826301157474518,
"num_tokens": 11510824.0,
"step": 704
},
{
"entropy": 0.5446508675813675,
"epoch": 2.6305970149253732,
"grad_norm": 0.23117929697036743,
"learning_rate": 0.0002,
"loss": 0.5552783608436584,
"mean_token_accuracy": 0.7762233018875122,
"num_tokens": 11527111.0,
"step": 705
},
{
"entropy": 0.5259118974208832,
"epoch": 2.6343283582089554,
"grad_norm": 0.17237775027751923,
"learning_rate": 0.0002,
"loss": 0.5258082747459412,
"mean_token_accuracy": 0.7888418883085251,
"num_tokens": 11543508.0,
"step": 706
},
{
"entropy": 0.5134415403008461,
"epoch": 2.638059701492537,
"grad_norm": 0.1968804895877838,
"learning_rate": 0.0002,
"loss": 0.516159176826477,
"mean_token_accuracy": 0.7919125109910965,
"num_tokens": 11559764.0,
"step": 707
},
{
"entropy": 0.5164712592959404,
"epoch": 2.6417910447761193,
"grad_norm": 0.18034212291240692,
"learning_rate": 0.0002,
"loss": 0.5184696316719055,
"mean_token_accuracy": 0.7913271486759186,
"num_tokens": 11576280.0,
"step": 708
},
{
"entropy": 0.5396228730678558,
"epoch": 2.6455223880597014,
"grad_norm": 0.16111285984516144,
"learning_rate": 0.0002,
"loss": 0.536095142364502,
"mean_token_accuracy": 0.7845699042081833,
"num_tokens": 11592548.0,
"step": 709
},
{
"entropy": 0.5335683822631836,
"epoch": 2.6492537313432836,
"grad_norm": 0.18878330290317535,
"learning_rate": 0.0002,
"loss": 0.533022403717041,
"mean_token_accuracy": 0.7858745902776718,
"num_tokens": 11608718.0,
"step": 710
},
{
"entropy": 0.5291629135608673,
"epoch": 2.6529850746268657,
"grad_norm": 0.15525634586811066,
"learning_rate": 0.0002,
"loss": 0.5270857214927673,
"mean_token_accuracy": 0.7867603600025177,
"num_tokens": 11624984.0,
"step": 711
},
{
"entropy": 0.5291008502244949,
"epoch": 2.656716417910448,
"grad_norm": 0.2215014100074768,
"learning_rate": 0.0002,
"loss": 0.5335924029350281,
"mean_token_accuracy": 0.7852614969015121,
"num_tokens": 11641414.0,
"step": 712
},
{
"entropy": 0.5195610374212265,
"epoch": 2.66044776119403,
"grad_norm": 0.1840248554944992,
"learning_rate": 0.0002,
"loss": 0.5272573828697205,
"mean_token_accuracy": 0.7856255769729614,
"num_tokens": 11657606.0,
"step": 713
},
{
"entropy": 0.5212601721286774,
"epoch": 2.664179104477612,
"grad_norm": 0.2194834053516388,
"learning_rate": 0.0002,
"loss": 0.5225985050201416,
"mean_token_accuracy": 0.7896359115839005,
"num_tokens": 11673978.0,
"step": 714
},
{
"entropy": 0.5267243683338165,
"epoch": 2.667910447761194,
"grad_norm": 0.18111757934093475,
"learning_rate": 0.0002,
"loss": 0.5297276973724365,
"mean_token_accuracy": 0.7850082814693451,
"num_tokens": 11690084.0,
"step": 715
},
{
"entropy": 0.5318636000156403,
"epoch": 2.671641791044776,
"grad_norm": 0.1797971874475479,
"learning_rate": 0.0002,
"loss": 0.5307915806770325,
"mean_token_accuracy": 0.7851123064756393,
"num_tokens": 11706504.0,
"step": 716
},
{
"entropy": 0.5428463369607925,
"epoch": 2.675373134328358,
"grad_norm": 0.1636015772819519,
"learning_rate": 0.0002,
"loss": 0.534479558467865,
"mean_token_accuracy": 0.7838175147771835,
"num_tokens": 11722988.0,
"step": 717
},
{
"entropy": 0.5360075086355209,
"epoch": 2.6791044776119404,
"grad_norm": 0.15919257700443268,
"learning_rate": 0.0002,
"loss": 0.5305730700492859,
"mean_token_accuracy": 0.7855097204446793,
"num_tokens": 11739438.0,
"step": 718
},
{
"entropy": 0.5359227359294891,
"epoch": 2.6828358208955225,
"grad_norm": 0.14643317461013794,
"learning_rate": 0.0002,
"loss": 0.532948911190033,
"mean_token_accuracy": 0.7826716750860214,
"num_tokens": 11755793.0,
"step": 719
},
{
"entropy": 0.508900836110115,
"epoch": 2.6865671641791042,
"grad_norm": 0.18424049019813538,
"learning_rate": 0.0002,
"loss": 0.5087383985519409,
"mean_token_accuracy": 0.7960971295833588,
"num_tokens": 11772140.0,
"step": 720
},
{
"entropy": 0.5278252959251404,
"epoch": 2.6902985074626864,
"grad_norm": 0.16620668768882751,
"learning_rate": 0.0002,
"loss": 0.5323323011398315,
"mean_token_accuracy": 0.7838071584701538,
"num_tokens": 11788187.0,
"step": 721
},
{
"entropy": 0.5286207944154739,
"epoch": 2.6940298507462686,
"grad_norm": 0.18285532295703888,
"learning_rate": 0.0002,
"loss": 0.5379830598831177,
"mean_token_accuracy": 0.7834362238645554,
"num_tokens": 11804853.0,
"step": 722
},
{
"entropy": 0.5304315537214279,
"epoch": 2.6977611940298507,
"grad_norm": 0.1528841108083725,
"learning_rate": 0.0002,
"loss": 0.53291916847229,
"mean_token_accuracy": 0.7848697453737259,
"num_tokens": 11821372.0,
"step": 723
},
{
"entropy": 0.5269036293029785,
"epoch": 2.701492537313433,
"grad_norm": 0.16717489063739777,
"learning_rate": 0.0002,
"loss": 0.5263969898223877,
"mean_token_accuracy": 0.7880866229534149,
"num_tokens": 11837581.0,
"step": 724
},
{
"entropy": 0.5256982818245888,
"epoch": 2.705223880597015,
"grad_norm": 0.15457774698734283,
"learning_rate": 0.0002,
"loss": 0.5219148993492126,
"mean_token_accuracy": 0.7873740494251251,
"num_tokens": 11853896.0,
"step": 725
},
{
"entropy": 0.534528449177742,
"epoch": 2.708955223880597,
"grad_norm": 0.15566900372505188,
"learning_rate": 0.0002,
"loss": 0.5313507318496704,
"mean_token_accuracy": 0.7871876060962677,
"num_tokens": 11869979.0,
"step": 726
},
{
"entropy": 0.5365303605794907,
"epoch": 2.7126865671641793,
"grad_norm": 0.16134414076805115,
"learning_rate": 0.0002,
"loss": 0.5403051972389221,
"mean_token_accuracy": 0.7792389243841171,
"num_tokens": 11886540.0,
"step": 727
},
{
"entropy": 0.5314591228961945,
"epoch": 2.716417910447761,
"grad_norm": 0.20206789672374725,
"learning_rate": 0.0002,
"loss": 0.5367040038108826,
"mean_token_accuracy": 0.785218358039856,
"num_tokens": 11902636.0,
"step": 728
},
{
"entropy": 0.5247315615415573,
"epoch": 2.720149253731343,
"grad_norm": 0.17510657012462616,
"learning_rate": 0.0002,
"loss": 0.5183426141738892,
"mean_token_accuracy": 0.7929788678884506,
"num_tokens": 11918809.0,
"step": 729
},
{
"entropy": 0.531570702791214,
"epoch": 2.7238805970149254,
"grad_norm": 0.19654951989650726,
"learning_rate": 0.0002,
"loss": 0.5312444567680359,
"mean_token_accuracy": 0.7852945178747177,
"num_tokens": 11934918.0,
"step": 730
},
{
"entropy": 0.5167503207921982,
"epoch": 2.7276119402985075,
"grad_norm": 0.18647317588329315,
"learning_rate": 0.0002,
"loss": 0.521633505821228,
"mean_token_accuracy": 0.7868699729442596,
"num_tokens": 11951418.0,
"step": 731
},
{
"entropy": 0.5409902930259705,
"epoch": 2.7313432835820897,
"grad_norm": 0.16911281645298004,
"learning_rate": 0.0002,
"loss": 0.5437517166137695,
"mean_token_accuracy": 0.7801080495119095,
"num_tokens": 11967971.0,
"step": 732
},
{
"entropy": 0.5430471152067184,
"epoch": 2.7350746268656714,
"grad_norm": 0.15203061699867249,
"learning_rate": 0.0002,
"loss": 0.5399286150932312,
"mean_token_accuracy": 0.7798464447259903,
"num_tokens": 11984465.0,
"step": 733
},
{
"entropy": 0.5305036455392838,
"epoch": 2.7388059701492535,
"grad_norm": 0.19002215564250946,
"learning_rate": 0.0002,
"loss": 0.526854932308197,
"mean_token_accuracy": 0.788349375128746,
"num_tokens": 12000894.0,
"step": 734
},
{
"entropy": 0.5385335683822632,
"epoch": 2.7425373134328357,
"grad_norm": 0.1556226909160614,
"learning_rate": 0.0002,
"loss": 0.536300003528595,
"mean_token_accuracy": 0.7823566943407059,
"num_tokens": 12017341.0,
"step": 735
},
{
"entropy": 0.5280898958444595,
"epoch": 2.746268656716418,
"grad_norm": 0.22629927098751068,
"learning_rate": 0.0002,
"loss": 0.5357972979545593,
"mean_token_accuracy": 0.7819354236125946,
"num_tokens": 12033592.0,
"step": 736
},
{
"entropy": 0.5210496559739113,
"epoch": 2.75,
"grad_norm": 0.14672952890396118,
"learning_rate": 0.0002,
"loss": 0.5192467570304871,
"mean_token_accuracy": 0.7897329777479172,
"num_tokens": 12050029.0,
"step": 737
},
{
"entropy": 0.5315113514661789,
"epoch": 2.753731343283582,
"grad_norm": 0.179401695728302,
"learning_rate": 0.0002,
"loss": 0.5297517776489258,
"mean_token_accuracy": 0.7900628596544266,
"num_tokens": 12066356.0,
"step": 738
},
{
"entropy": 0.5152995735406876,
"epoch": 2.7574626865671643,
"grad_norm": 0.20404104888439178,
"learning_rate": 0.0002,
"loss": 0.523341953754425,
"mean_token_accuracy": 0.7902668565511703,
"num_tokens": 12082476.0,
"step": 739
},
{
"entropy": 0.5357868671417236,
"epoch": 2.7611940298507465,
"grad_norm": 0.21347877383232117,
"learning_rate": 0.0002,
"loss": 0.5397475361824036,
"mean_token_accuracy": 0.7817140519618988,
"num_tokens": 12098813.0,
"step": 740
},
{
"entropy": 0.5294998437166214,
"epoch": 2.7649253731343286,
"grad_norm": 0.19437092542648315,
"learning_rate": 0.0002,
"loss": 0.5309361219406128,
"mean_token_accuracy": 0.785544291138649,
"num_tokens": 12115108.0,
"step": 741
},
{
"entropy": 0.5339842438697815,
"epoch": 2.7686567164179103,
"grad_norm": 0.211222842335701,
"learning_rate": 0.0002,
"loss": 0.5336329340934753,
"mean_token_accuracy": 0.7840461581945419,
"num_tokens": 12131657.0,
"step": 742
},
{
"entropy": 0.5063766092061996,
"epoch": 2.7723880597014925,
"grad_norm": 0.18974091112613678,
"learning_rate": 0.0002,
"loss": 0.5003129243850708,
"mean_token_accuracy": 0.7983057200908661,
"num_tokens": 12147977.0,
"step": 743
},
{
"entropy": 0.5348393470048904,
"epoch": 2.7761194029850746,
"grad_norm": 0.17940539121627808,
"learning_rate": 0.0002,
"loss": 0.5325519442558289,
"mean_token_accuracy": 0.7843880504369736,
"num_tokens": 12164476.0,
"step": 744
},
{
"entropy": 0.5319767147302628,
"epoch": 2.779850746268657,
"grad_norm": 0.21841664612293243,
"learning_rate": 0.0002,
"loss": 0.5384219884872437,
"mean_token_accuracy": 0.7829115390777588,
"num_tokens": 12180665.0,
"step": 745
},
{
"entropy": 0.5276842713356018,
"epoch": 2.783582089552239,
"grad_norm": 0.15762406587600708,
"learning_rate": 0.0002,
"loss": 0.5222536325454712,
"mean_token_accuracy": 0.7876606732606888,
"num_tokens": 12196994.0,
"step": 746
},
{
"entropy": 0.5283003747463226,
"epoch": 2.7873134328358207,
"grad_norm": 0.1740235984325409,
"learning_rate": 0.0002,
"loss": 0.5262863039970398,
"mean_token_accuracy": 0.7871444076299667,
"num_tokens": 12213146.0,
"step": 747
},
{
"entropy": 0.5243652537465096,
"epoch": 2.791044776119403,
"grad_norm": 0.17303697764873505,
"learning_rate": 0.0002,
"loss": 0.5288724303245544,
"mean_token_accuracy": 0.7889265865087509,
"num_tokens": 12229495.0,
"step": 748
},
{
"entropy": 0.5307216495275497,
"epoch": 2.794776119402985,
"grad_norm": 0.17367562651634216,
"learning_rate": 0.0002,
"loss": 0.5350364446640015,
"mean_token_accuracy": 0.7828467786312103,
"num_tokens": 12245731.0,
"step": 749
},
{
"entropy": 0.5053429380059242,
"epoch": 2.798507462686567,
"grad_norm": 0.18273597955703735,
"learning_rate": 0.0002,
"loss": 0.5170458555221558,
"mean_token_accuracy": 0.7908547967672348,
"num_tokens": 12261995.0,
"step": 750
},
{
"entropy": 0.5304894745349884,
"epoch": 2.8022388059701493,
"grad_norm": 0.19946977496147156,
"learning_rate": 0.0002,
"loss": 0.5361734628677368,
"mean_token_accuracy": 0.7829707115888596,
"num_tokens": 12278393.0,
"step": 751
},
{
"entropy": 0.5383865833282471,
"epoch": 2.8059701492537314,
"grad_norm": 0.18991155922412872,
"learning_rate": 0.0002,
"loss": 0.5307108163833618,
"mean_token_accuracy": 0.7821619510650635,
"num_tokens": 12294798.0,
"step": 752
},
{
"entropy": 0.5184406042098999,
"epoch": 2.8097014925373136,
"grad_norm": 0.1910092979669571,
"learning_rate": 0.0002,
"loss": 0.5096916556358337,
"mean_token_accuracy": 0.7956021875143051,
"num_tokens": 12311283.0,
"step": 753
},
{
"entropy": 0.5503049492835999,
"epoch": 2.8134328358208958,
"grad_norm": 0.16047552227973938,
"learning_rate": 0.0002,
"loss": 0.5400866270065308,
"mean_token_accuracy": 0.781381756067276,
"num_tokens": 12327796.0,
"step": 754
},
{
"entropy": 0.5367267429828644,
"epoch": 2.8171641791044775,
"grad_norm": 0.17214973270893097,
"learning_rate": 0.0002,
"loss": 0.533517062664032,
"mean_token_accuracy": 0.7842586189508438,
"num_tokens": 12344276.0,
"step": 755
},
{
"entropy": 0.5231245383620262,
"epoch": 2.8208955223880596,
"grad_norm": 0.20261810719966888,
"learning_rate": 0.0002,
"loss": 0.5310981869697571,
"mean_token_accuracy": 0.7863229364156723,
"num_tokens": 12360664.0,
"step": 756
},
{
"entropy": 0.5025655254721642,
"epoch": 2.824626865671642,
"grad_norm": 0.23269020020961761,
"learning_rate": 0.0002,
"loss": 0.5136131644248962,
"mean_token_accuracy": 0.7932915538549423,
"num_tokens": 12377108.0,
"step": 757
},
{
"entropy": 0.5385118275880814,
"epoch": 2.828358208955224,
"grad_norm": 0.17557309567928314,
"learning_rate": 0.0002,
"loss": 0.5468243956565857,
"mean_token_accuracy": 0.7773942649364471,
"num_tokens": 12393477.0,
"step": 758
},
{
"entropy": 0.5556999295949936,
"epoch": 2.832089552238806,
"grad_norm": 0.18836821615695953,
"learning_rate": 0.0002,
"loss": 0.5542982816696167,
"mean_token_accuracy": 0.7759236544370651,
"num_tokens": 12409945.0,
"step": 759
},
{
"entropy": 0.5397951006889343,
"epoch": 2.835820895522388,
"grad_norm": 0.16869579255580902,
"learning_rate": 0.0002,
"loss": 0.5345804691314697,
"mean_token_accuracy": 0.7828676253557205,
"num_tokens": 12426172.0,
"step": 760
},
{
"entropy": 0.5465898215770721,
"epoch": 2.83955223880597,
"grad_norm": 0.1971413791179657,
"learning_rate": 0.0002,
"loss": 0.5406813621520996,
"mean_token_accuracy": 0.7830551862716675,
"num_tokens": 12442539.0,
"step": 761
},
{
"entropy": 0.5412090718746185,
"epoch": 2.843283582089552,
"grad_norm": 0.16916459798812866,
"learning_rate": 0.0002,
"loss": 0.5298109650611877,
"mean_token_accuracy": 0.7871081382036209,
"num_tokens": 12458926.0,
"step": 762
},
{
"entropy": 0.5222381502389908,
"epoch": 2.8470149253731343,
"grad_norm": 0.19241978228092194,
"learning_rate": 0.0002,
"loss": 0.5193473100662231,
"mean_token_accuracy": 0.7926554083824158,
"num_tokens": 12475192.0,
"step": 763
},
{
"entropy": 0.5114666819572449,
"epoch": 2.8507462686567164,
"grad_norm": 0.2026778608560562,
"learning_rate": 0.0002,
"loss": 0.5210025906562805,
"mean_token_accuracy": 0.7880990207195282,
"num_tokens": 12491486.0,
"step": 764
},
{
"entropy": 0.5318130105733871,
"epoch": 2.8544776119402986,
"grad_norm": 0.18366879224777222,
"learning_rate": 0.0002,
"loss": 0.5408880710601807,
"mean_token_accuracy": 0.7821989059448242,
"num_tokens": 12508110.0,
"step": 765
},
{
"entropy": 0.5178861618041992,
"epoch": 2.8582089552238807,
"grad_norm": 0.22393299639225006,
"learning_rate": 0.0002,
"loss": 0.5233381986618042,
"mean_token_accuracy": 0.7875554710626602,
"num_tokens": 12524419.0,
"step": 766
},
{
"entropy": 0.5129977464675903,
"epoch": 2.861940298507463,
"grad_norm": 0.16486415266990662,
"learning_rate": 0.0002,
"loss": 0.5123316645622253,
"mean_token_accuracy": 0.7945219725370407,
"num_tokens": 12540623.0,
"step": 767
},
{
"entropy": 0.5352810174226761,
"epoch": 2.8656716417910446,
"grad_norm": 0.16391848027706146,
"learning_rate": 0.0002,
"loss": 0.5287078619003296,
"mean_token_accuracy": 0.7864142656326294,
"num_tokens": 12556769.0,
"step": 768
},
{
"entropy": 0.5213837772607803,
"epoch": 2.8694029850746268,
"grad_norm": 0.15605109930038452,
"learning_rate": 0.0002,
"loss": 0.5177993774414062,
"mean_token_accuracy": 0.791528195142746,
"num_tokens": 12572975.0,
"step": 769
},
{
"entropy": 0.5254454612731934,
"epoch": 2.873134328358209,
"grad_norm": 0.17228880524635315,
"learning_rate": 0.0002,
"loss": 0.5218878388404846,
"mean_token_accuracy": 0.790112167596817,
"num_tokens": 12589664.0,
"step": 770
},
{
"entropy": 0.5180996954441071,
"epoch": 2.876865671641791,
"grad_norm": 0.1603233963251114,
"learning_rate": 0.0002,
"loss": 0.5153653621673584,
"mean_token_accuracy": 0.7935372442007065,
"num_tokens": 12606393.0,
"step": 771
},
{
"entropy": 0.5220412835478783,
"epoch": 2.8805970149253732,
"grad_norm": 0.19191837310791016,
"learning_rate": 0.0002,
"loss": 0.5350449085235596,
"mean_token_accuracy": 0.7817320823669434,
"num_tokens": 12622915.0,
"step": 772
},
{
"entropy": 0.5260520726442337,
"epoch": 2.8843283582089554,
"grad_norm": 0.1964220553636551,
"learning_rate": 0.0002,
"loss": 0.5347790718078613,
"mean_token_accuracy": 0.7870497107505798,
"num_tokens": 12639438.0,
"step": 773
},
{
"entropy": 0.5259631350636482,
"epoch": 2.888059701492537,
"grad_norm": 0.1590423583984375,
"learning_rate": 0.0002,
"loss": 0.5264297723770142,
"mean_token_accuracy": 0.7856660634279251,
"num_tokens": 12656043.0,
"step": 774
},
{
"entropy": 0.5494396686553955,
"epoch": 2.8917910447761193,
"grad_norm": 0.166259765625,
"learning_rate": 0.0002,
"loss": 0.541179895401001,
"mean_token_accuracy": 0.7822139710187912,
"num_tokens": 12672530.0,
"step": 775
},
{
"entropy": 0.5362062454223633,
"epoch": 2.8955223880597014,
"grad_norm": 0.16349440813064575,
"learning_rate": 0.0002,
"loss": 0.530780017375946,
"mean_token_accuracy": 0.7863557487726212,
"num_tokens": 12689021.0,
"step": 776
},
{
"entropy": 0.5223592668771744,
"epoch": 2.8992537313432836,
"grad_norm": 0.15761977434158325,
"learning_rate": 0.0002,
"loss": 0.5155429244041443,
"mean_token_accuracy": 0.7907254546880722,
"num_tokens": 12705262.0,
"step": 777
},
{
"entropy": 0.5258801132440567,
"epoch": 2.9029850746268657,
"grad_norm": 0.1883028894662857,
"learning_rate": 0.0002,
"loss": 0.529833972454071,
"mean_token_accuracy": 0.7863512486219406,
"num_tokens": 12721511.0,
"step": 778
},
{
"entropy": 0.5216899961233139,
"epoch": 2.906716417910448,
"grad_norm": 0.16059532761573792,
"learning_rate": 0.0002,
"loss": 0.522499680519104,
"mean_token_accuracy": 0.7899018228054047,
"num_tokens": 12738089.0,
"step": 779
},
{
"entropy": 0.520403303205967,
"epoch": 2.91044776119403,
"grad_norm": 0.1771392673254013,
"learning_rate": 0.0002,
"loss": 0.5236196517944336,
"mean_token_accuracy": 0.7879007905721664,
"num_tokens": 12754592.0,
"step": 780
},
{
"entropy": 0.5242541432380676,
"epoch": 2.914179104477612,
"grad_norm": 0.17634879052639008,
"learning_rate": 0.0002,
"loss": 0.5289914011955261,
"mean_token_accuracy": 0.7824440151453018,
"num_tokens": 12770734.0,
"step": 781
},
{
"entropy": 0.5201637446880341,
"epoch": 2.917910447761194,
"grad_norm": 0.17048649489879608,
"learning_rate": 0.0002,
"loss": 0.5211310386657715,
"mean_token_accuracy": 0.7937574684619904,
"num_tokens": 12787160.0,
"step": 782
},
{
"entropy": 0.5204057991504669,
"epoch": 2.921641791044776,
"grad_norm": 0.15417909622192383,
"learning_rate": 0.0002,
"loss": 0.517360508441925,
"mean_token_accuracy": 0.7929933965206146,
"num_tokens": 12803683.0,
"step": 783
},
{
"entropy": 0.545757845044136,
"epoch": 2.925373134328358,
"grad_norm": 0.1549869030714035,
"learning_rate": 0.0002,
"loss": 0.5414532423019409,
"mean_token_accuracy": 0.7788090705871582,
"num_tokens": 12819951.0,
"step": 784
},
{
"entropy": 0.5228646248579025,
"epoch": 2.9291044776119404,
"grad_norm": 0.15743686258792877,
"learning_rate": 0.0002,
"loss": 0.516430675983429,
"mean_token_accuracy": 0.7925095409154892,
"num_tokens": 12836413.0,
"step": 785
},
{
"entropy": 0.5214046537876129,
"epoch": 2.9328358208955225,
"grad_norm": 0.16672447323799133,
"learning_rate": 0.0002,
"loss": 0.5222574472427368,
"mean_token_accuracy": 0.7870719730854034,
"num_tokens": 12852872.0,
"step": 786
},
{
"entropy": 0.5317943245172501,
"epoch": 2.9365671641791042,
"grad_norm": 0.21642933785915375,
"learning_rate": 0.0002,
"loss": 0.5372959971427917,
"mean_token_accuracy": 0.7832164466381073,
"num_tokens": 12869405.0,
"step": 787
},
{
"entropy": 0.5113082602620125,
"epoch": 2.9402985074626864,
"grad_norm": 0.22133168578147888,
"learning_rate": 0.0002,
"loss": 0.522553563117981,
"mean_token_accuracy": 0.7871409952640533,
"num_tokens": 12885593.0,
"step": 788
},
{
"entropy": 0.5275594145059586,
"epoch": 2.9440298507462686,
"grad_norm": 0.20494818687438965,
"learning_rate": 0.0002,
"loss": 0.5326835513114929,
"mean_token_accuracy": 0.7843892127275467,
"num_tokens": 12901950.0,
"step": 789
},
{
"entropy": 0.5371553599834442,
"epoch": 2.9477611940298507,
"grad_norm": 0.16483525931835175,
"learning_rate": 0.0002,
"loss": 0.5343260765075684,
"mean_token_accuracy": 0.7844540178775787,
"num_tokens": 12918538.0,
"step": 790
},
{
"entropy": 0.5248367339372635,
"epoch": 2.951492537313433,
"grad_norm": 0.20370911061763763,
"learning_rate": 0.0002,
"loss": 0.5262700915336609,
"mean_token_accuracy": 0.7856797575950623,
"num_tokens": 12935041.0,
"step": 791
},
{
"entropy": 0.5536757409572601,
"epoch": 2.955223880597015,
"grad_norm": 0.15302392840385437,
"learning_rate": 0.0002,
"loss": 0.5451865196228027,
"mean_token_accuracy": 0.781255841255188,
"num_tokens": 12951793.0,
"step": 792
},
{
"entropy": 0.5070596486330032,
"epoch": 2.958955223880597,
"grad_norm": 0.20451144874095917,
"learning_rate": 0.0002,
"loss": 0.5115755796432495,
"mean_token_accuracy": 0.7904744446277618,
"num_tokens": 12968060.0,
"step": 793
},
{
"entropy": 0.5260060653090477,
"epoch": 2.9626865671641793,
"grad_norm": 0.16183388233184814,
"learning_rate": 0.0002,
"loss": 0.5244185328483582,
"mean_token_accuracy": 0.7878494709730148,
"num_tokens": 12984541.0,
"step": 794
},
{
"entropy": 0.5389718413352966,
"epoch": 2.966417910447761,
"grad_norm": 0.17704468965530396,
"learning_rate": 0.0002,
"loss": 0.5415879487991333,
"mean_token_accuracy": 0.7840642035007477,
"num_tokens": 13000817.0,
"step": 795
},
{
"entropy": 0.5400192737579346,
"epoch": 2.970149253731343,
"grad_norm": 0.16612157225608826,
"learning_rate": 0.0002,
"loss": 0.5336055755615234,
"mean_token_accuracy": 0.7857667803764343,
"num_tokens": 13016973.0,
"step": 796
},
{
"entropy": 0.5179389715194702,
"epoch": 2.9738805970149254,
"grad_norm": 0.16657505929470062,
"learning_rate": 0.0002,
"loss": 0.5218580365180969,
"mean_token_accuracy": 0.7903915345668793,
"num_tokens": 13033299.0,
"step": 797
},
{
"entropy": 0.5229775831103325,
"epoch": 2.9776119402985075,
"grad_norm": 0.1601499617099762,
"learning_rate": 0.0002,
"loss": 0.5244333744049072,
"mean_token_accuracy": 0.7875324189662933,
"num_tokens": 13049754.0,
"step": 798
},
{
"entropy": 0.5364563912153244,
"epoch": 2.9813432835820897,
"grad_norm": 0.17928777635097504,
"learning_rate": 0.0002,
"loss": 0.5421883463859558,
"mean_token_accuracy": 0.7822880148887634,
"num_tokens": 13066045.0,
"step": 799
},
{
"entropy": 0.5202258825302124,
"epoch": 2.9850746268656714,
"grad_norm": 0.1714518666267395,
"learning_rate": 0.0002,
"loss": 0.5221466422080994,
"mean_token_accuracy": 0.7896016389131546,
"num_tokens": 13082398.0,
"step": 800
},
{
"entropy": 0.526955708861351,
"epoch": 2.9888059701492535,
"grad_norm": 0.1565951555967331,
"learning_rate": 0.0002,
"loss": 0.521065354347229,
"mean_token_accuracy": 0.7919437438249588,
"num_tokens": 13098966.0,
"step": 801
},
{
"entropy": 0.5393194705247879,
"epoch": 2.9925373134328357,
"grad_norm": 0.1675749570131302,
"learning_rate": 0.0002,
"loss": 0.5336388945579529,
"mean_token_accuracy": 0.7851084172725677,
"num_tokens": 13115333.0,
"step": 802
},
{
"entropy": 0.5270961374044418,
"epoch": 2.996268656716418,
"grad_norm": 0.17216360569000244,
"learning_rate": 0.0002,
"loss": 0.5220625400543213,
"mean_token_accuracy": 0.7888612896203995,
"num_tokens": 13131491.0,
"step": 803
},
{
"entropy": 0.5005228817462921,
"epoch": 3.0,
"grad_norm": 0.1877554953098297,
"learning_rate": 0.0002,
"loss": 0.5059037208557129,
"mean_token_accuracy": 0.797055795788765,
"num_tokens": 13147551.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2254562163611402e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}