{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1308949291706085, "epoch": 0.0037313432835820895, "grad_norm": 1.683108925819397, "learning_rate": 0.0002, "loss": 2.489936590194702, "mean_token_accuracy": 0.5359140038490295, "num_tokens": 16356.0, "step": 1 }, { "entropy": 1.2256053388118744, "epoch": 0.007462686567164179, "grad_norm": 1.5088376998901367, "learning_rate": 0.0002, "loss": 2.162245273590088, "mean_token_accuracy": 0.5673863738775253, "num_tokens": 32718.0, "step": 2 }, { "entropy": 1.4011717438697815, "epoch": 0.011194029850746268, "grad_norm": 1.1495057344436646, "learning_rate": 0.0002, "loss": 1.7410045862197876, "mean_token_accuracy": 0.5877877026796341, "num_tokens": 49086.0, "step": 3 }, { "entropy": 1.3629191517829895, "epoch": 0.014925373134328358, "grad_norm": 0.909584105014801, "learning_rate": 0.0002, "loss": 1.410053014755249, "mean_token_accuracy": 0.6416480243206024, "num_tokens": 65483.0, "step": 4 }, { "entropy": 1.345184564590454, "epoch": 0.018656716417910446, "grad_norm": 1.1788593530654907, "learning_rate": 0.0002, "loss": 1.2843377590179443, "mean_token_accuracy": 0.6425914913415909, "num_tokens": 81705.0, "step": 5 }, { "entropy": 1.2523848712444305, "epoch": 0.022388059701492536, "grad_norm": 0.7064197659492493, "learning_rate": 0.0002, "loss": 1.175342082977295, "mean_token_accuracy": 0.6635853946208954, "num_tokens": 97918.0, "step": 6 }, { "entropy": 1.199697583913803, "epoch": 0.026119402985074626, "grad_norm": 0.4158240854740143, "learning_rate": 0.0002, "loss": 1.1010812520980835, "mean_token_accuracy": 0.6607878506183624, "num_tokens": 114455.0, "step": 7 }, { "entropy": 1.0897426307201385, "epoch": 0.029850746268656716, "grad_norm": 0.4258277118206024, "learning_rate": 0.0002, "loss": 1.0245436429977417, "mean_token_accuracy": 0.682918444275856, "num_tokens": 130921.0, "step": 8 }, { "entropy": 0.9851540327072144, "epoch": 0.033582089552238806, "grad_norm": 0.6931905150413513, "learning_rate": 0.0002, "loss": 0.972236692905426, "mean_token_accuracy": 0.690200999379158, "num_tokens": 147028.0, "step": 9 }, { "entropy": 0.9809075742959976, "epoch": 0.03731343283582089, "grad_norm": 0.4386370778083801, "learning_rate": 0.0002, "loss": 0.9174745082855225, "mean_token_accuracy": 0.6927480399608612, "num_tokens": 163432.0, "step": 10 }, { "entropy": 0.911684438586235, "epoch": 0.041044776119402986, "grad_norm": 4.369440078735352, "learning_rate": 0.0002, "loss": 0.8261430263519287, "mean_token_accuracy": 0.7205553501844406, "num_tokens": 179455.0, "step": 11 }, { "entropy": 0.8916845321655273, "epoch": 0.04477611940298507, "grad_norm": 0.5139093399047852, "learning_rate": 0.0002, "loss": 0.8168894648551941, "mean_token_accuracy": 0.714234933257103, "num_tokens": 195668.0, "step": 12 }, { "entropy": 0.8192363679409027, "epoch": 0.048507462686567165, "grad_norm": 0.5154215097427368, "learning_rate": 0.0002, "loss": 0.7735035419464111, "mean_token_accuracy": 0.7252469956874847, "num_tokens": 211417.0, "step": 13 }, { "entropy": 0.8060386925935745, "epoch": 0.05223880597014925, "grad_norm": 0.3869208097457886, "learning_rate": 0.0002, "loss": 0.7496379017829895, "mean_token_accuracy": 0.7249694466590881, "num_tokens": 228014.0, "step": 14 }, { "entropy": 0.7358367741107941, "epoch": 0.055970149253731345, "grad_norm": 0.3804072439670563, "learning_rate": 0.0002, "loss": 0.7129448652267456, "mean_token_accuracy": 0.7322827130556107, "num_tokens": 244548.0, "step": 15 }, { "entropy": 0.6891884654760361, "epoch": 0.05970149253731343, "grad_norm": 0.4262757897377014, "learning_rate": 0.0002, "loss": 0.7087160348892212, "mean_token_accuracy": 0.7325101941823959, "num_tokens": 260927.0, "step": 16 }, { "entropy": 0.6646793335676193, "epoch": 0.06343283582089553, "grad_norm": 0.3463515639305115, "learning_rate": 0.0002, "loss": 0.6711890697479248, "mean_token_accuracy": 0.743767574429512, "num_tokens": 277478.0, "step": 17 }, { "entropy": 0.6615253239870071, "epoch": 0.06716417910447761, "grad_norm": 0.3623281419277191, "learning_rate": 0.0002, "loss": 0.6425697803497314, "mean_token_accuracy": 0.7528071999549866, "num_tokens": 293828.0, "step": 18 }, { "entropy": 0.6510400027036667, "epoch": 0.0708955223880597, "grad_norm": 0.3351263701915741, "learning_rate": 0.0002, "loss": 0.6357494592666626, "mean_token_accuracy": 0.7543895989656448, "num_tokens": 309962.0, "step": 19 }, { "entropy": 0.6420271843671799, "epoch": 0.07462686567164178, "grad_norm": 0.3311758041381836, "learning_rate": 0.0002, "loss": 0.6307370662689209, "mean_token_accuracy": 0.7545324862003326, "num_tokens": 326597.0, "step": 20 }, { "entropy": 0.6174459308385849, "epoch": 0.07835820895522388, "grad_norm": 0.35250842571258545, "learning_rate": 0.0002, "loss": 0.6103197336196899, "mean_token_accuracy": 0.7592763751745224, "num_tokens": 342917.0, "step": 21 }, { "entropy": 0.6289893835783005, "epoch": 0.08208955223880597, "grad_norm": 0.25894996523857117, "learning_rate": 0.0002, "loss": 0.6157230734825134, "mean_token_accuracy": 0.7587940841913223, "num_tokens": 359567.0, "step": 22 }, { "entropy": 0.6118573248386383, "epoch": 0.08582089552238806, "grad_norm": 0.29135045409202576, "learning_rate": 0.0002, "loss": 0.6002258658409119, "mean_token_accuracy": 0.7654120922088623, "num_tokens": 375565.0, "step": 23 }, { "entropy": 0.5791880339384079, "epoch": 0.08955223880597014, "grad_norm": 0.2720821499824524, "learning_rate": 0.0002, "loss": 0.5813120603561401, "mean_token_accuracy": 0.7713776230812073, "num_tokens": 391864.0, "step": 24 }, { "entropy": 0.6053604930639267, "epoch": 0.09328358208955224, "grad_norm": 0.2560279667377472, "learning_rate": 0.0002, "loss": 0.6105175018310547, "mean_token_accuracy": 0.7615619450807571, "num_tokens": 408354.0, "step": 25 }, { "entropy": 0.5867195874452591, "epoch": 0.09701492537313433, "grad_norm": 0.22600652277469635, "learning_rate": 0.0002, "loss": 0.5860370993614197, "mean_token_accuracy": 0.7677419036626816, "num_tokens": 424712.0, "step": 26 }, { "entropy": 0.5918123573064804, "epoch": 0.10074626865671642, "grad_norm": 0.256405770778656, "learning_rate": 0.0002, "loss": 0.5865331888198853, "mean_token_accuracy": 0.7698597609996796, "num_tokens": 441249.0, "step": 27 }, { "entropy": 0.5696172267198563, "epoch": 0.1044776119402985, "grad_norm": 0.22032174468040466, "learning_rate": 0.0002, "loss": 0.5604762434959412, "mean_token_accuracy": 0.7779532968997955, "num_tokens": 457602.0, "step": 28 }, { "entropy": 0.5602490454912186, "epoch": 0.10820895522388059, "grad_norm": 0.20871949195861816, "learning_rate": 0.0002, "loss": 0.5587727427482605, "mean_token_accuracy": 0.7771614342927933, "num_tokens": 473785.0, "step": 29 }, { "entropy": 0.5850763767957687, "epoch": 0.11194029850746269, "grad_norm": 0.23072806000709534, "learning_rate": 0.0002, "loss": 0.5962345004081726, "mean_token_accuracy": 0.762176513671875, "num_tokens": 490054.0, "step": 30 }, { "entropy": 0.5698783695697784, "epoch": 0.11567164179104478, "grad_norm": 0.20846784114837646, "learning_rate": 0.0002, "loss": 0.5793903470039368, "mean_token_accuracy": 0.7701146155595779, "num_tokens": 506525.0, "step": 31 }, { "entropy": 0.5649833828210831, "epoch": 0.11940298507462686, "grad_norm": 0.20395582914352417, "learning_rate": 0.0002, "loss": 0.5709314942359924, "mean_token_accuracy": 0.7762356698513031, "num_tokens": 522952.0, "step": 32 }, { "entropy": 0.5790712088346481, "epoch": 0.12313432835820895, "grad_norm": 0.21085898578166962, "learning_rate": 0.0002, "loss": 0.5755910873413086, "mean_token_accuracy": 0.7691536694765091, "num_tokens": 539151.0, "step": 33 }, { "entropy": 0.5798842161893845, "epoch": 0.12686567164179105, "grad_norm": 0.1799822747707367, "learning_rate": 0.0002, "loss": 0.5749096274375916, "mean_token_accuracy": 0.7671291828155518, "num_tokens": 555566.0, "step": 34 }, { "entropy": 0.568429708480835, "epoch": 0.13059701492537312, "grad_norm": 0.21928845345973969, "learning_rate": 0.0002, "loss": 0.5717220306396484, "mean_token_accuracy": 0.771720290184021, "num_tokens": 572125.0, "step": 35 }, { "entropy": 0.5658127665519714, "epoch": 0.13432835820895522, "grad_norm": 0.22536930441856384, "learning_rate": 0.0002, "loss": 0.5656446218490601, "mean_token_accuracy": 0.7756934762001038, "num_tokens": 588539.0, "step": 36 }, { "entropy": 0.5779189765453339, "epoch": 0.13805970149253732, "grad_norm": 0.18143770098686218, "learning_rate": 0.0002, "loss": 0.5782102942466736, "mean_token_accuracy": 0.768736332654953, "num_tokens": 604927.0, "step": 37 }, { "entropy": 0.5695452243089676, "epoch": 0.1417910447761194, "grad_norm": 0.18897166848182678, "learning_rate": 0.0002, "loss": 0.5745816230773926, "mean_token_accuracy": 0.7676017582416534, "num_tokens": 621213.0, "step": 38 }, { "entropy": 0.5704480558633804, "epoch": 0.1455223880597015, "grad_norm": 0.20254790782928467, "learning_rate": 0.0002, "loss": 0.573440432548523, "mean_token_accuracy": 0.769940122961998, "num_tokens": 637694.0, "step": 39 }, { "entropy": 0.5526881515979767, "epoch": 0.14925373134328357, "grad_norm": 0.2001330703496933, "learning_rate": 0.0002, "loss": 0.5598679780960083, "mean_token_accuracy": 0.7767495959997177, "num_tokens": 653791.0, "step": 40 }, { "entropy": 0.572973906993866, "epoch": 0.15298507462686567, "grad_norm": 0.1802511364221573, "learning_rate": 0.0002, "loss": 0.5720363855361938, "mean_token_accuracy": 0.7737791240215302, "num_tokens": 669970.0, "step": 41 }, { "entropy": 0.5880100876092911, "epoch": 0.15671641791044777, "grad_norm": 0.190653994679451, "learning_rate": 0.0002, "loss": 0.5839952826499939, "mean_token_accuracy": 0.7667653411626816, "num_tokens": 686164.0, "step": 42 }, { "entropy": 0.5611717849969864, "epoch": 0.16044776119402984, "grad_norm": 0.18095986545085907, "learning_rate": 0.0002, "loss": 0.5529768466949463, "mean_token_accuracy": 0.7791769355535507, "num_tokens": 702271.0, "step": 43 }, { "entropy": 0.5776362270116806, "epoch": 0.16417910447761194, "grad_norm": 0.20184266567230225, "learning_rate": 0.0002, "loss": 0.572957456111908, "mean_token_accuracy": 0.772771418094635, "num_tokens": 718759.0, "step": 44 }, { "entropy": 0.5637746602296829, "epoch": 0.16791044776119404, "grad_norm": 0.16902145743370056, "learning_rate": 0.0002, "loss": 0.564084529876709, "mean_token_accuracy": 0.7736680209636688, "num_tokens": 735087.0, "step": 45 }, { "entropy": 0.5521982908248901, "epoch": 0.17164179104477612, "grad_norm": 0.16458934545516968, "learning_rate": 0.0002, "loss": 0.5616670846939087, "mean_token_accuracy": 0.7762537449598312, "num_tokens": 751513.0, "step": 46 }, { "entropy": 0.5518182516098022, "epoch": 0.17537313432835822, "grad_norm": 0.22303543984889984, "learning_rate": 0.0002, "loss": 0.5712406039237976, "mean_token_accuracy": 0.7692597359418869, "num_tokens": 767651.0, "step": 47 }, { "entropy": 0.5570991486310959, "epoch": 0.1791044776119403, "grad_norm": 0.1629144549369812, "learning_rate": 0.0002, "loss": 0.5624895095825195, "mean_token_accuracy": 0.7735912799835205, "num_tokens": 783757.0, "step": 48 }, { "entropy": 0.549803838133812, "epoch": 0.1828358208955224, "grad_norm": 0.1366954892873764, "learning_rate": 0.0002, "loss": 0.5442911982536316, "mean_token_accuracy": 0.7778248488903046, "num_tokens": 800127.0, "step": 49 }, { "entropy": 0.5679125189781189, "epoch": 0.1865671641791045, "grad_norm": 0.1564488559961319, "learning_rate": 0.0002, "loss": 0.5563010573387146, "mean_token_accuracy": 0.7781310826539993, "num_tokens": 816490.0, "step": 50 }, { "entropy": 0.5595380216836929, "epoch": 0.19029850746268656, "grad_norm": 0.1663539558649063, "learning_rate": 0.0002, "loss": 0.5474997758865356, "mean_token_accuracy": 0.778365820646286, "num_tokens": 832576.0, "step": 51 }, { "entropy": 0.5542885512113571, "epoch": 0.19402985074626866, "grad_norm": 0.15933850407600403, "learning_rate": 0.0002, "loss": 0.5465819239616394, "mean_token_accuracy": 0.781011700630188, "num_tokens": 848529.0, "step": 52 }, { "entropy": 0.570631816983223, "epoch": 0.19776119402985073, "grad_norm": 0.15335530042648315, "learning_rate": 0.0002, "loss": 0.5733448266983032, "mean_token_accuracy": 0.7690710127353668, "num_tokens": 864787.0, "step": 53 }, { "entropy": 0.5657172054052353, "epoch": 0.20149253731343283, "grad_norm": 0.15320488810539246, "learning_rate": 0.0002, "loss": 0.5716187357902527, "mean_token_accuracy": 0.7727480232715607, "num_tokens": 881120.0, "step": 54 }, { "entropy": 0.5566735565662384, "epoch": 0.20522388059701493, "grad_norm": 0.174886554479599, "learning_rate": 0.0002, "loss": 0.5643004775047302, "mean_token_accuracy": 0.7743579894304276, "num_tokens": 897598.0, "step": 55 }, { "entropy": 0.5483224838972092, "epoch": 0.208955223880597, "grad_norm": 0.14539019763469696, "learning_rate": 0.0002, "loss": 0.5542981028556824, "mean_token_accuracy": 0.7777313590049744, "num_tokens": 913970.0, "step": 56 }, { "entropy": 0.5746322274208069, "epoch": 0.2126865671641791, "grad_norm": 0.1465657502412796, "learning_rate": 0.0002, "loss": 0.5676500201225281, "mean_token_accuracy": 0.7716732025146484, "num_tokens": 930515.0, "step": 57 }, { "entropy": 0.5645405799150467, "epoch": 0.21641791044776118, "grad_norm": 0.17157647013664246, "learning_rate": 0.0002, "loss": 0.554180383682251, "mean_token_accuracy": 0.7776309847831726, "num_tokens": 946699.0, "step": 58 }, { "entropy": 0.5437158495187759, "epoch": 0.22014925373134328, "grad_norm": 0.14779002964496613, "learning_rate": 0.0002, "loss": 0.5412948131561279, "mean_token_accuracy": 0.7830284535884857, "num_tokens": 962929.0, "step": 59 }, { "entropy": 0.5478496849536896, "epoch": 0.22388059701492538, "grad_norm": 0.16550469398498535, "learning_rate": 0.0002, "loss": 0.546680212020874, "mean_token_accuracy": 0.7801186293363571, "num_tokens": 979336.0, "step": 60 }, { "entropy": 0.5491016507148743, "epoch": 0.22761194029850745, "grad_norm": 0.17403647303581238, "learning_rate": 0.0002, "loss": 0.5650719404220581, "mean_token_accuracy": 0.7729975134134293, "num_tokens": 995774.0, "step": 61 }, { "entropy": 0.5622769743204117, "epoch": 0.23134328358208955, "grad_norm": 0.17750802636146545, "learning_rate": 0.0002, "loss": 0.5718308687210083, "mean_token_accuracy": 0.7699476927518845, "num_tokens": 1012510.0, "step": 62 }, { "entropy": 0.5333654135465622, "epoch": 0.23507462686567165, "grad_norm": 0.13930155336856842, "learning_rate": 0.0002, "loss": 0.5345954895019531, "mean_token_accuracy": 0.7855408787727356, "num_tokens": 1028613.0, "step": 63 }, { "entropy": 0.5784197896718979, "epoch": 0.23880597014925373, "grad_norm": 0.16901279985904694, "learning_rate": 0.0002, "loss": 0.56936115026474, "mean_token_accuracy": 0.7703966796398163, "num_tokens": 1045046.0, "step": 64 }, { "entropy": 0.5690423101186752, "epoch": 0.24253731343283583, "grad_norm": 0.16224578022956848, "learning_rate": 0.0002, "loss": 0.559661865234375, "mean_token_accuracy": 0.7719420939683914, "num_tokens": 1061419.0, "step": 65 }, { "entropy": 0.5822959691286087, "epoch": 0.2462686567164179, "grad_norm": 0.16501320898532867, "learning_rate": 0.0002, "loss": 0.5733515620231628, "mean_token_accuracy": 0.7682919055223465, "num_tokens": 1077724.0, "step": 66 }, { "entropy": 0.5663120746612549, "epoch": 0.25, "grad_norm": 0.15710598230361938, "learning_rate": 0.0002, "loss": 0.5739370584487915, "mean_token_accuracy": 0.7685963213443756, "num_tokens": 1094309.0, "step": 67 }, { "entropy": 0.5416915565729141, "epoch": 0.2537313432835821, "grad_norm": 0.1652906835079193, "learning_rate": 0.0002, "loss": 0.5546884536743164, "mean_token_accuracy": 0.7781604677438736, "num_tokens": 1110812.0, "step": 68 }, { "entropy": 0.5604560673236847, "epoch": 0.2574626865671642, "grad_norm": 0.1823517084121704, "learning_rate": 0.0002, "loss": 0.565848708152771, "mean_token_accuracy": 0.7732205092906952, "num_tokens": 1126983.0, "step": 69 }, { "entropy": 0.5681725591421127, "epoch": 0.26119402985074625, "grad_norm": 0.15536344051361084, "learning_rate": 0.0002, "loss": 0.5707790851593018, "mean_token_accuracy": 0.7711602002382278, "num_tokens": 1143690.0, "step": 70 }, { "entropy": 0.5554168075323105, "epoch": 0.26492537313432835, "grad_norm": 0.1691257208585739, "learning_rate": 0.0002, "loss": 0.5645061135292053, "mean_token_accuracy": 0.7751206457614899, "num_tokens": 1159930.0, "step": 71 }, { "entropy": 0.5698556303977966, "epoch": 0.26865671641791045, "grad_norm": 0.17756199836730957, "learning_rate": 0.0002, "loss": 0.5670963525772095, "mean_token_accuracy": 0.7744691073894501, "num_tokens": 1176287.0, "step": 72 }, { "entropy": 0.558213621377945, "epoch": 0.27238805970149255, "grad_norm": 0.14214132726192474, "learning_rate": 0.0002, "loss": 0.5565056204795837, "mean_token_accuracy": 0.7759946286678314, "num_tokens": 1192733.0, "step": 73 }, { "entropy": 0.5587260574102402, "epoch": 0.27611940298507465, "grad_norm": 0.1475045531988144, "learning_rate": 0.0002, "loss": 0.5534224510192871, "mean_token_accuracy": 0.7787353843450546, "num_tokens": 1209413.0, "step": 74 }, { "entropy": 0.5601568818092346, "epoch": 0.2798507462686567, "grad_norm": 0.17161411046981812, "learning_rate": 0.0002, "loss": 0.5623729825019836, "mean_token_accuracy": 0.773567259311676, "num_tokens": 1225838.0, "step": 75 }, { "entropy": 0.5421780049800873, "epoch": 0.2835820895522388, "grad_norm": 0.1444474756717682, "learning_rate": 0.0002, "loss": 0.5297126173973083, "mean_token_accuracy": 0.7893946915864944, "num_tokens": 1242213.0, "step": 76 }, { "entropy": 0.5718793421983719, "epoch": 0.2873134328358209, "grad_norm": 0.14322321116924286, "learning_rate": 0.0002, "loss": 0.5714331865310669, "mean_token_accuracy": 0.7688785791397095, "num_tokens": 1258461.0, "step": 77 }, { "entropy": 0.5419993549585342, "epoch": 0.291044776119403, "grad_norm": 0.1524474024772644, "learning_rate": 0.0002, "loss": 0.5490943193435669, "mean_token_accuracy": 0.779272273182869, "num_tokens": 1274449.0, "step": 78 }, { "entropy": 0.5585939884185791, "epoch": 0.2947761194029851, "grad_norm": 0.1510787457227707, "learning_rate": 0.0002, "loss": 0.5654528141021729, "mean_token_accuracy": 0.772942066192627, "num_tokens": 1290949.0, "step": 79 }, { "entropy": 0.563146710395813, "epoch": 0.29850746268656714, "grad_norm": 0.1482156217098236, "learning_rate": 0.0002, "loss": 0.5777900218963623, "mean_token_accuracy": 0.7702645510435104, "num_tokens": 1307187.0, "step": 80 }, { "entropy": 0.5600180923938751, "epoch": 0.30223880597014924, "grad_norm": 0.15022550523281097, "learning_rate": 0.0002, "loss": 0.5632287859916687, "mean_token_accuracy": 0.7716066837310791, "num_tokens": 1323407.0, "step": 81 }, { "entropy": 0.5598095804452896, "epoch": 0.30597014925373134, "grad_norm": 0.1322828084230423, "learning_rate": 0.0002, "loss": 0.5537106394767761, "mean_token_accuracy": 0.7764421850442886, "num_tokens": 1339664.0, "step": 82 }, { "entropy": 0.5458928942680359, "epoch": 0.30970149253731344, "grad_norm": 0.1319894790649414, "learning_rate": 0.0002, "loss": 0.5423555374145508, "mean_token_accuracy": 0.7807362526655197, "num_tokens": 1356260.0, "step": 83 }, { "entropy": 0.5659633129835129, "epoch": 0.31343283582089554, "grad_norm": 0.13246627151966095, "learning_rate": 0.0002, "loss": 0.557287335395813, "mean_token_accuracy": 0.7743117958307266, "num_tokens": 1372821.0, "step": 84 }, { "entropy": 0.5452462434768677, "epoch": 0.31716417910447764, "grad_norm": 0.16196919977664948, "learning_rate": 0.0002, "loss": 0.543107271194458, "mean_token_accuracy": 0.7795177549123764, "num_tokens": 1388889.0, "step": 85 }, { "entropy": 0.5466109812259674, "epoch": 0.3208955223880597, "grad_norm": 0.12639470398426056, "learning_rate": 0.0002, "loss": 0.5396162271499634, "mean_token_accuracy": 0.7834953665733337, "num_tokens": 1405139.0, "step": 86 }, { "entropy": 0.551815465092659, "epoch": 0.3246268656716418, "grad_norm": 0.18058188259601593, "learning_rate": 0.0002, "loss": 0.5637637972831726, "mean_token_accuracy": 0.7716487348079681, "num_tokens": 1421439.0, "step": 87 }, { "entropy": 0.543148547410965, "epoch": 0.3283582089552239, "grad_norm": 0.14002034068107605, "learning_rate": 0.0002, "loss": 0.549104630947113, "mean_token_accuracy": 0.7779115587472916, "num_tokens": 1437695.0, "step": 88 }, { "entropy": 0.5655066221952438, "epoch": 0.332089552238806, "grad_norm": 0.13395759463310242, "learning_rate": 0.0002, "loss": 0.5683454871177673, "mean_token_accuracy": 0.7728030234575272, "num_tokens": 1453991.0, "step": 89 }, { "entropy": 0.5676597952842712, "epoch": 0.3358208955223881, "grad_norm": 0.14229720830917358, "learning_rate": 0.0002, "loss": 0.5701878070831299, "mean_token_accuracy": 0.7698987573385239, "num_tokens": 1470371.0, "step": 90 }, { "entropy": 0.5576249063014984, "epoch": 0.33955223880597013, "grad_norm": 0.1365518420934677, "learning_rate": 0.0002, "loss": 0.560733437538147, "mean_token_accuracy": 0.7742054760456085, "num_tokens": 1486891.0, "step": 91 }, { "entropy": 0.5476901531219482, "epoch": 0.34328358208955223, "grad_norm": 0.12286433577537537, "learning_rate": 0.0002, "loss": 0.5540446639060974, "mean_token_accuracy": 0.7757776081562042, "num_tokens": 1503153.0, "step": 92 }, { "entropy": 0.5445209294557571, "epoch": 0.34701492537313433, "grad_norm": 0.13203619420528412, "learning_rate": 0.0002, "loss": 0.5416238903999329, "mean_token_accuracy": 0.7820428013801575, "num_tokens": 1519248.0, "step": 93 }, { "entropy": 0.5732006430625916, "epoch": 0.35074626865671643, "grad_norm": 0.14288392663002014, "learning_rate": 0.0002, "loss": 0.5734184980392456, "mean_token_accuracy": 0.7677003741264343, "num_tokens": 1535616.0, "step": 94 }, { "entropy": 0.5645585656166077, "epoch": 0.35447761194029853, "grad_norm": 0.1253618448972702, "learning_rate": 0.0002, "loss": 0.5549549460411072, "mean_token_accuracy": 0.7756840586662292, "num_tokens": 1552040.0, "step": 95 }, { "entropy": 0.5686955749988556, "epoch": 0.3582089552238806, "grad_norm": 0.12725889682769775, "learning_rate": 0.0002, "loss": 0.573272705078125, "mean_token_accuracy": 0.7684734165668488, "num_tokens": 1568381.0, "step": 96 }, { "entropy": 0.547907680273056, "epoch": 0.3619402985074627, "grad_norm": 0.13573119044303894, "learning_rate": 0.0002, "loss": 0.5526182055473328, "mean_token_accuracy": 0.7779877185821533, "num_tokens": 1584726.0, "step": 97 }, { "entropy": 0.5658805668354034, "epoch": 0.3656716417910448, "grad_norm": 0.13501696288585663, "learning_rate": 0.0002, "loss": 0.5696231722831726, "mean_token_accuracy": 0.7706904113292694, "num_tokens": 1601142.0, "step": 98 }, { "entropy": 0.5553559362888336, "epoch": 0.3694029850746269, "grad_norm": 0.12036850303411484, "learning_rate": 0.0002, "loss": 0.5520588159561157, "mean_token_accuracy": 0.7781549990177155, "num_tokens": 1617184.0, "step": 99 }, { "entropy": 0.5559379458427429, "epoch": 0.373134328358209, "grad_norm": 0.12556730210781097, "learning_rate": 0.0002, "loss": 0.5582664608955383, "mean_token_accuracy": 0.7744826525449753, "num_tokens": 1633573.0, "step": 100 }, { "entropy": 0.5321817249059677, "epoch": 0.376865671641791, "grad_norm": 0.1410171091556549, "learning_rate": 0.0002, "loss": 0.531158447265625, "mean_token_accuracy": 0.7867954224348068, "num_tokens": 1649580.0, "step": 101 }, { "entropy": 0.5629207491874695, "epoch": 0.3805970149253731, "grad_norm": 0.1320696920156479, "learning_rate": 0.0002, "loss": 0.5548203587532043, "mean_token_accuracy": 0.777129277586937, "num_tokens": 1665914.0, "step": 102 }, { "entropy": 0.5625062435865402, "epoch": 0.3843283582089552, "grad_norm": 0.15022383630275726, "learning_rate": 0.0002, "loss": 0.559231698513031, "mean_token_accuracy": 0.7755367606878281, "num_tokens": 1682572.0, "step": 103 }, { "entropy": 0.55105359852314, "epoch": 0.3880597014925373, "grad_norm": 0.13816320896148682, "learning_rate": 0.0002, "loss": 0.5513999462127686, "mean_token_accuracy": 0.7777303904294968, "num_tokens": 1698800.0, "step": 104 }, { "entropy": 0.5433051884174347, "epoch": 0.3917910447761194, "grad_norm": 0.13852182030677795, "learning_rate": 0.0002, "loss": 0.5473951101303101, "mean_token_accuracy": 0.7787780612707138, "num_tokens": 1715089.0, "step": 105 }, { "entropy": 0.5638341754674911, "epoch": 0.39552238805970147, "grad_norm": 0.13244302570819855, "learning_rate": 0.0002, "loss": 0.5711042284965515, "mean_token_accuracy": 0.7705479264259338, "num_tokens": 1731289.0, "step": 106 }, { "entropy": 0.5590131878852844, "epoch": 0.39925373134328357, "grad_norm": 0.14187560975551605, "learning_rate": 0.0002, "loss": 0.5588455200195312, "mean_token_accuracy": 0.775245189666748, "num_tokens": 1747777.0, "step": 107 }, { "entropy": 0.5456477552652359, "epoch": 0.40298507462686567, "grad_norm": 0.12155073136091232, "learning_rate": 0.0002, "loss": 0.5477449297904968, "mean_token_accuracy": 0.7793276309967041, "num_tokens": 1764099.0, "step": 108 }, { "entropy": 0.5533221960067749, "epoch": 0.40671641791044777, "grad_norm": 0.14932067692279816, "learning_rate": 0.0002, "loss": 0.550473153591156, "mean_token_accuracy": 0.7792102247476578, "num_tokens": 1780092.0, "step": 109 }, { "entropy": 0.5685938596725464, "epoch": 0.41044776119402987, "grad_norm": 0.11824015527963638, "learning_rate": 0.0002, "loss": 0.567302942276001, "mean_token_accuracy": 0.768885999917984, "num_tokens": 1796553.0, "step": 110 }, { "entropy": 0.558070957660675, "epoch": 0.4141791044776119, "grad_norm": 0.13145862519741058, "learning_rate": 0.0002, "loss": 0.5594078302383423, "mean_token_accuracy": 0.7714920043945312, "num_tokens": 1812976.0, "step": 111 }, { "entropy": 0.5445801764726639, "epoch": 0.417910447761194, "grad_norm": 0.1538373976945877, "learning_rate": 0.0002, "loss": 0.5507169365882874, "mean_token_accuracy": 0.7795748263597488, "num_tokens": 1829496.0, "step": 112 }, { "entropy": 0.5546134263277054, "epoch": 0.4216417910447761, "grad_norm": 0.14499837160110474, "learning_rate": 0.0002, "loss": 0.5621107816696167, "mean_token_accuracy": 0.772913932800293, "num_tokens": 1845899.0, "step": 113 }, { "entropy": 0.5376207381486893, "epoch": 0.4253731343283582, "grad_norm": 0.12395139783620834, "learning_rate": 0.0002, "loss": 0.5408076643943787, "mean_token_accuracy": 0.7826146930456161, "num_tokens": 1862102.0, "step": 114 }, { "entropy": 0.5709025114774704, "epoch": 0.4291044776119403, "grad_norm": 0.14900445938110352, "learning_rate": 0.0002, "loss": 0.5688319206237793, "mean_token_accuracy": 0.7712048441171646, "num_tokens": 1878466.0, "step": 115 }, { "entropy": 0.5531350374221802, "epoch": 0.43283582089552236, "grad_norm": 0.14944979548454285, "learning_rate": 0.0002, "loss": 0.5533212423324585, "mean_token_accuracy": 0.7762057036161423, "num_tokens": 1894613.0, "step": 116 }, { "entropy": 0.5613852292299271, "epoch": 0.43656716417910446, "grad_norm": 0.14122174680233002, "learning_rate": 0.0002, "loss": 0.5625326633453369, "mean_token_accuracy": 0.7721518725156784, "num_tokens": 1910791.0, "step": 117 }, { "entropy": 0.5606949478387833, "epoch": 0.44029850746268656, "grad_norm": 0.11353051662445068, "learning_rate": 0.0002, "loss": 0.5561124682426453, "mean_token_accuracy": 0.7774701118469238, "num_tokens": 1927342.0, "step": 118 }, { "entropy": 0.5748601853847504, "epoch": 0.44402985074626866, "grad_norm": 0.13328969478607178, "learning_rate": 0.0002, "loss": 0.5738563537597656, "mean_token_accuracy": 0.7660426646471024, "num_tokens": 1944009.0, "step": 119 }, { "entropy": 0.5331175327301025, "epoch": 0.44776119402985076, "grad_norm": 0.14304570853710175, "learning_rate": 0.0002, "loss": 0.535332441329956, "mean_token_accuracy": 0.7843142002820969, "num_tokens": 1960275.0, "step": 120 }, { "entropy": 0.5579216629266739, "epoch": 0.45149253731343286, "grad_norm": 0.12545879185199738, "learning_rate": 0.0002, "loss": 0.5590261220932007, "mean_token_accuracy": 0.7733252346515656, "num_tokens": 1976578.0, "step": 121 }, { "entropy": 0.5593921393156052, "epoch": 0.4552238805970149, "grad_norm": 0.13857485353946686, "learning_rate": 0.0002, "loss": 0.5631604194641113, "mean_token_accuracy": 0.7736008018255234, "num_tokens": 1993053.0, "step": 122 }, { "entropy": 0.5660806745290756, "epoch": 0.458955223880597, "grad_norm": 0.11944495886564255, "learning_rate": 0.0002, "loss": 0.5569764375686646, "mean_token_accuracy": 0.7737946212291718, "num_tokens": 2009442.0, "step": 123 }, { "entropy": 0.5681817382574081, "epoch": 0.4626865671641791, "grad_norm": 0.14172527194023132, "learning_rate": 0.0002, "loss": 0.5605779886245728, "mean_token_accuracy": 0.7750114947557449, "num_tokens": 2025901.0, "step": 124 }, { "entropy": 0.5467974990606308, "epoch": 0.4664179104477612, "grad_norm": 0.1252705603837967, "learning_rate": 0.0002, "loss": 0.5515766739845276, "mean_token_accuracy": 0.7760580778121948, "num_tokens": 2042208.0, "step": 125 }, { "entropy": 0.5420515686273575, "epoch": 0.4701492537313433, "grad_norm": 0.13870663940906525, "learning_rate": 0.0002, "loss": 0.5480060577392578, "mean_token_accuracy": 0.7764822095632553, "num_tokens": 2058681.0, "step": 126 }, { "entropy": 0.5362897217273712, "epoch": 0.47388059701492535, "grad_norm": 0.13995425403118134, "learning_rate": 0.0002, "loss": 0.5513206720352173, "mean_token_accuracy": 0.7750497758388519, "num_tokens": 2075000.0, "step": 127 }, { "entropy": 0.5329284965991974, "epoch": 0.47761194029850745, "grad_norm": 0.16524387896060944, "learning_rate": 0.0002, "loss": 0.5436174273490906, "mean_token_accuracy": 0.7792856246232986, "num_tokens": 2091221.0, "step": 128 }, { "entropy": 0.5539916902780533, "epoch": 0.48134328358208955, "grad_norm": 0.12479358166456223, "learning_rate": 0.0002, "loss": 0.5608515739440918, "mean_token_accuracy": 0.7734991759061813, "num_tokens": 2107664.0, "step": 129 }, { "entropy": 0.5594889521598816, "epoch": 0.48507462686567165, "grad_norm": 0.14481139183044434, "learning_rate": 0.0002, "loss": 0.5508875846862793, "mean_token_accuracy": 0.7767421901226044, "num_tokens": 2123952.0, "step": 130 }, { "entropy": 0.5442296341061592, "epoch": 0.48880597014925375, "grad_norm": 0.12281627953052521, "learning_rate": 0.0002, "loss": 0.5368722677230835, "mean_token_accuracy": 0.7826971709728241, "num_tokens": 2139985.0, "step": 131 }, { "entropy": 0.562851145863533, "epoch": 0.4925373134328358, "grad_norm": 0.14453750848770142, "learning_rate": 0.0002, "loss": 0.5439143180847168, "mean_token_accuracy": 0.7809209376573563, "num_tokens": 2156312.0, "step": 132 }, { "entropy": 0.5531761199235916, "epoch": 0.4962686567164179, "grad_norm": 0.13650745153427124, "learning_rate": 0.0002, "loss": 0.5565841197967529, "mean_token_accuracy": 0.7758718878030777, "num_tokens": 2172756.0, "step": 133 }, { "entropy": 0.5456132292747498, "epoch": 0.5, "grad_norm": 0.13749481737613678, "learning_rate": 0.0002, "loss": 0.5540860295295715, "mean_token_accuracy": 0.7755758464336395, "num_tokens": 2189086.0, "step": 134 }, { "entropy": 0.5647578835487366, "epoch": 0.503731343283582, "grad_norm": 0.145718514919281, "learning_rate": 0.0002, "loss": 0.5744016766548157, "mean_token_accuracy": 0.7706383019685745, "num_tokens": 2205658.0, "step": 135 }, { "entropy": 0.5253579095005989, "epoch": 0.5074626865671642, "grad_norm": 0.1236543357372284, "learning_rate": 0.0002, "loss": 0.5327446460723877, "mean_token_accuracy": 0.7834168970584869, "num_tokens": 2221900.0, "step": 136 }, { "entropy": 0.5625722110271454, "epoch": 0.5111940298507462, "grad_norm": 0.1114581972360611, "learning_rate": 0.0002, "loss": 0.5667597651481628, "mean_token_accuracy": 0.7699635177850723, "num_tokens": 2238309.0, "step": 137 }, { "entropy": 0.5476242303848267, "epoch": 0.5149253731343284, "grad_norm": 0.1360960304737091, "learning_rate": 0.0002, "loss": 0.5452396273612976, "mean_token_accuracy": 0.7796155512332916, "num_tokens": 2254713.0, "step": 138 }, { "entropy": 0.5573885440826416, "epoch": 0.5186567164179104, "grad_norm": 0.11950599402189255, "learning_rate": 0.0002, "loss": 0.5531854629516602, "mean_token_accuracy": 0.7765035033226013, "num_tokens": 2271164.0, "step": 139 }, { "entropy": 0.5644345581531525, "epoch": 0.5223880597014925, "grad_norm": 0.11840134114027023, "learning_rate": 0.0002, "loss": 0.5575224161148071, "mean_token_accuracy": 0.7718838900327682, "num_tokens": 2287762.0, "step": 140 }, { "entropy": 0.5466153174638748, "epoch": 0.5261194029850746, "grad_norm": 0.1688532829284668, "learning_rate": 0.0002, "loss": 0.5499178171157837, "mean_token_accuracy": 0.777469664812088, "num_tokens": 2304348.0, "step": 141 }, { "entropy": 0.5427221059799194, "epoch": 0.5298507462686567, "grad_norm": 0.14760567247867584, "learning_rate": 0.0002, "loss": 0.5492222905158997, "mean_token_accuracy": 0.778323158621788, "num_tokens": 2320490.0, "step": 142 }, { "entropy": 0.5470593422651291, "epoch": 0.5335820895522388, "grad_norm": 0.19991202652454376, "learning_rate": 0.0002, "loss": 0.5513626933097839, "mean_token_accuracy": 0.7774471044540405, "num_tokens": 2337221.0, "step": 143 }, { "entropy": 0.5426470190286636, "epoch": 0.5373134328358209, "grad_norm": 0.11571265757083893, "learning_rate": 0.0002, "loss": 0.5405253767967224, "mean_token_accuracy": 0.7813504189252853, "num_tokens": 2353353.0, "step": 144 }, { "entropy": 0.5667431056499481, "epoch": 0.5410447761194029, "grad_norm": 0.12742455303668976, "learning_rate": 0.0002, "loss": 0.5593273043632507, "mean_token_accuracy": 0.7729441076517105, "num_tokens": 2369753.0, "step": 145 }, { "entropy": 0.5697275847196579, "epoch": 0.5447761194029851, "grad_norm": 0.1348797082901001, "learning_rate": 0.0002, "loss": 0.5684511661529541, "mean_token_accuracy": 0.7724753767251968, "num_tokens": 2386156.0, "step": 146 }, { "entropy": 0.5411224067211151, "epoch": 0.5485074626865671, "grad_norm": 0.1279442012310028, "learning_rate": 0.0002, "loss": 0.5420435667037964, "mean_token_accuracy": 0.782076433300972, "num_tokens": 2402488.0, "step": 147 }, { "entropy": 0.5458887368440628, "epoch": 0.5522388059701493, "grad_norm": 0.15301373600959778, "learning_rate": 0.0002, "loss": 0.5421918630599976, "mean_token_accuracy": 0.7805485129356384, "num_tokens": 2418800.0, "step": 148 }, { "entropy": 0.5494910031557083, "epoch": 0.5559701492537313, "grad_norm": 0.13024193048477173, "learning_rate": 0.0002, "loss": 0.5560234189033508, "mean_token_accuracy": 0.7752619981765747, "num_tokens": 2435229.0, "step": 149 }, { "entropy": 0.5497897416353226, "epoch": 0.5597014925373134, "grad_norm": 0.140470951795578, "learning_rate": 0.0002, "loss": 0.5513492226600647, "mean_token_accuracy": 0.775757297873497, "num_tokens": 2451762.0, "step": 150 }, { "entropy": 0.5479221642017365, "epoch": 0.5634328358208955, "grad_norm": 0.11884977668523788, "learning_rate": 0.0002, "loss": 0.5478861331939697, "mean_token_accuracy": 0.782090038061142, "num_tokens": 2468180.0, "step": 151 }, { "entropy": 0.5405495166778564, "epoch": 0.5671641791044776, "grad_norm": 0.12883080542087555, "learning_rate": 0.0002, "loss": 0.5406085252761841, "mean_token_accuracy": 0.7832252681255341, "num_tokens": 2484444.0, "step": 152 }, { "entropy": 0.5454452037811279, "epoch": 0.5708955223880597, "grad_norm": 0.12270363420248032, "learning_rate": 0.0002, "loss": 0.5502068400382996, "mean_token_accuracy": 0.7790153920650482, "num_tokens": 2500846.0, "step": 153 }, { "entropy": 0.5570302158594131, "epoch": 0.5746268656716418, "grad_norm": 0.1269625872373581, "learning_rate": 0.0002, "loss": 0.5548018217086792, "mean_token_accuracy": 0.778030514717102, "num_tokens": 2517083.0, "step": 154 }, { "entropy": 0.5605379194021225, "epoch": 0.5783582089552238, "grad_norm": 0.1287340223789215, "learning_rate": 0.0002, "loss": 0.561842143535614, "mean_token_accuracy": 0.7721278667449951, "num_tokens": 2533804.0, "step": 155 }, { "entropy": 0.5481511801481247, "epoch": 0.582089552238806, "grad_norm": 0.13460931181907654, "learning_rate": 0.0002, "loss": 0.5473400950431824, "mean_token_accuracy": 0.7798450142145157, "num_tokens": 2550301.0, "step": 156 }, { "entropy": 0.5569665729999542, "epoch": 0.585820895522388, "grad_norm": 0.1167525053024292, "learning_rate": 0.0002, "loss": 0.5591033697128296, "mean_token_accuracy": 0.7743667513132095, "num_tokens": 2566630.0, "step": 157 }, { "entropy": 0.5529917627573013, "epoch": 0.5895522388059702, "grad_norm": 0.1454092264175415, "learning_rate": 0.0002, "loss": 0.5575821399688721, "mean_token_accuracy": 0.7714344263076782, "num_tokens": 2583278.0, "step": 158 }, { "entropy": 0.5369462221860886, "epoch": 0.5932835820895522, "grad_norm": 0.12713587284088135, "learning_rate": 0.0002, "loss": 0.541353702545166, "mean_token_accuracy": 0.7810934484004974, "num_tokens": 2599680.0, "step": 159 }, { "entropy": 0.5471956133842468, "epoch": 0.5970149253731343, "grad_norm": 0.1193249523639679, "learning_rate": 0.0002, "loss": 0.544399619102478, "mean_token_accuracy": 0.777627244591713, "num_tokens": 2615971.0, "step": 160 }, { "entropy": 0.5561826080083847, "epoch": 0.6007462686567164, "grad_norm": 0.1412789523601532, "learning_rate": 0.0002, "loss": 0.5533403754234314, "mean_token_accuracy": 0.774614229798317, "num_tokens": 2632402.0, "step": 161 }, { "entropy": 0.5589349567890167, "epoch": 0.6044776119402985, "grad_norm": 0.12422283738851547, "learning_rate": 0.0002, "loss": 0.5584982633590698, "mean_token_accuracy": 0.772629901766777, "num_tokens": 2648936.0, "step": 162 }, { "entropy": 0.5598675608634949, "epoch": 0.6082089552238806, "grad_norm": 0.14433413743972778, "learning_rate": 0.0002, "loss": 0.5596426725387573, "mean_token_accuracy": 0.7740431576967239, "num_tokens": 2665475.0, "step": 163 }, { "entropy": 0.5221775621175766, "epoch": 0.6119402985074627, "grad_norm": 0.12392512708902359, "learning_rate": 0.0002, "loss": 0.5226801037788391, "mean_token_accuracy": 0.7883991152048111, "num_tokens": 2681739.0, "step": 164 }, { "entropy": 0.5390211492776871, "epoch": 0.6156716417910447, "grad_norm": 0.1389789581298828, "learning_rate": 0.0002, "loss": 0.5467759370803833, "mean_token_accuracy": 0.7787502557039261, "num_tokens": 2698224.0, "step": 165 }, { "entropy": 0.5343765914440155, "epoch": 0.6194029850746269, "grad_norm": 0.15462790429592133, "learning_rate": 0.0002, "loss": 0.5523170232772827, "mean_token_accuracy": 0.7789429575204849, "num_tokens": 2714480.0, "step": 166 }, { "entropy": 0.5412632822990417, "epoch": 0.6231343283582089, "grad_norm": 0.13078634440898895, "learning_rate": 0.0002, "loss": 0.5461232662200928, "mean_token_accuracy": 0.7796546518802643, "num_tokens": 2730804.0, "step": 167 }, { "entropy": 0.5592486709356308, "epoch": 0.6268656716417911, "grad_norm": 0.11671686917543411, "learning_rate": 0.0002, "loss": 0.5556939244270325, "mean_token_accuracy": 0.7750763148069382, "num_tokens": 2747189.0, "step": 168 }, { "entropy": 0.5645984709262848, "epoch": 0.6305970149253731, "grad_norm": 0.11404155939817429, "learning_rate": 0.0002, "loss": 0.5586551427841187, "mean_token_accuracy": 0.7756913602352142, "num_tokens": 2763561.0, "step": 169 }, { "entropy": 0.5689886808395386, "epoch": 0.6343283582089553, "grad_norm": 0.13602924346923828, "learning_rate": 0.0002, "loss": 0.571495771408081, "mean_token_accuracy": 0.7653735727071762, "num_tokens": 2780048.0, "step": 170 }, { "entropy": 0.56998710334301, "epoch": 0.6380597014925373, "grad_norm": 0.15131747722625732, "learning_rate": 0.0002, "loss": 0.5685769319534302, "mean_token_accuracy": 0.770746722817421, "num_tokens": 2796401.0, "step": 171 }, { "entropy": 0.5340622663497925, "epoch": 0.6417910447761194, "grad_norm": 0.10990842431783676, "learning_rate": 0.0002, "loss": 0.5300686955451965, "mean_token_accuracy": 0.7831304669380188, "num_tokens": 2812688.0, "step": 172 }, { "entropy": 0.5546266734600067, "epoch": 0.6455223880597015, "grad_norm": 0.14243000745773315, "learning_rate": 0.0002, "loss": 0.5531081557273865, "mean_token_accuracy": 0.7720183730125427, "num_tokens": 2828912.0, "step": 173 }, { "entropy": 0.530887708067894, "epoch": 0.6492537313432836, "grad_norm": 0.14285673201084137, "learning_rate": 0.0002, "loss": 0.5329350233078003, "mean_token_accuracy": 0.7844198048114777, "num_tokens": 2845032.0, "step": 174 }, { "entropy": 0.5529126077890396, "epoch": 0.6529850746268657, "grad_norm": 0.12663516402244568, "learning_rate": 0.0002, "loss": 0.5582675337791443, "mean_token_accuracy": 0.775692343711853, "num_tokens": 2861233.0, "step": 175 }, { "entropy": 0.5530151873826981, "epoch": 0.6567164179104478, "grad_norm": 0.1777547299861908, "learning_rate": 0.0002, "loss": 0.5580370426177979, "mean_token_accuracy": 0.7773808538913727, "num_tokens": 2877595.0, "step": 176 }, { "entropy": 0.5517453551292419, "epoch": 0.6604477611940298, "grad_norm": 0.12728020548820496, "learning_rate": 0.0002, "loss": 0.549347996711731, "mean_token_accuracy": 0.7813896834850311, "num_tokens": 2893885.0, "step": 177 }, { "entropy": 0.5581229478120804, "epoch": 0.664179104477612, "grad_norm": 0.12608157098293304, "learning_rate": 0.0002, "loss": 0.5528551936149597, "mean_token_accuracy": 0.774133637547493, "num_tokens": 2910402.0, "step": 178 }, { "entropy": 0.5545129030942917, "epoch": 0.667910447761194, "grad_norm": 0.14164696633815765, "learning_rate": 0.0002, "loss": 0.5471103191375732, "mean_token_accuracy": 0.7807044833898544, "num_tokens": 2927020.0, "step": 179 }, { "entropy": 0.5679615437984467, "epoch": 0.6716417910447762, "grad_norm": 0.11040110141038895, "learning_rate": 0.0002, "loss": 0.5661795139312744, "mean_token_accuracy": 0.7697756141424179, "num_tokens": 2943445.0, "step": 180 }, { "entropy": 0.5358923226594925, "epoch": 0.6753731343283582, "grad_norm": 0.12206491082906723, "learning_rate": 0.0002, "loss": 0.5459122061729431, "mean_token_accuracy": 0.7805617302656174, "num_tokens": 2959987.0, "step": 181 }, { "entropy": 0.5579689890146255, "epoch": 0.6791044776119403, "grad_norm": 0.14179477095603943, "learning_rate": 0.0002, "loss": 0.5636488199234009, "mean_token_accuracy": 0.7736007869243622, "num_tokens": 2976751.0, "step": 182 }, { "entropy": 0.5510261654853821, "epoch": 0.6828358208955224, "grad_norm": 0.12091591209173203, "learning_rate": 0.0002, "loss": 0.561327338218689, "mean_token_accuracy": 0.776558443903923, "num_tokens": 2993041.0, "step": 183 }, { "entropy": 0.5457663834095001, "epoch": 0.6865671641791045, "grad_norm": 0.12697891891002655, "learning_rate": 0.0002, "loss": 0.5465325117111206, "mean_token_accuracy": 0.7786546349525452, "num_tokens": 3009436.0, "step": 184 }, { "entropy": 0.5649427324533463, "epoch": 0.6902985074626866, "grad_norm": 0.13892695307731628, "learning_rate": 0.0002, "loss": 0.5654124617576599, "mean_token_accuracy": 0.7703604251146317, "num_tokens": 3025787.0, "step": 185 }, { "entropy": 0.5688793361186981, "epoch": 0.6940298507462687, "grad_norm": 0.11656537652015686, "learning_rate": 0.0002, "loss": 0.5590483546257019, "mean_token_accuracy": 0.7758390307426453, "num_tokens": 3042147.0, "step": 186 }, { "entropy": 0.5568420886993408, "epoch": 0.6977611940298507, "grad_norm": 0.1266399472951889, "learning_rate": 0.0002, "loss": 0.5490051507949829, "mean_token_accuracy": 0.7778443545103073, "num_tokens": 3058479.0, "step": 187 }, { "entropy": 0.5504391342401505, "epoch": 0.7014925373134329, "grad_norm": 0.15510344505310059, "learning_rate": 0.0002, "loss": 0.5499662756919861, "mean_token_accuracy": 0.7750896066427231, "num_tokens": 3074684.0, "step": 188 }, { "entropy": 0.5515661090612411, "epoch": 0.7052238805970149, "grad_norm": 0.1378200650215149, "learning_rate": 0.0002, "loss": 0.5564606189727783, "mean_token_accuracy": 0.7740965932607651, "num_tokens": 3091070.0, "step": 189 }, { "entropy": 0.5522360950708389, "epoch": 0.7089552238805971, "grad_norm": 0.1490645706653595, "learning_rate": 0.0002, "loss": 0.5577459335327148, "mean_token_accuracy": 0.7747645527124405, "num_tokens": 3107501.0, "step": 190 }, { "entropy": 0.5528729557991028, "epoch": 0.7126865671641791, "grad_norm": 0.14538180828094482, "learning_rate": 0.0002, "loss": 0.5618550777435303, "mean_token_accuracy": 0.7729964852333069, "num_tokens": 3123822.0, "step": 191 }, { "entropy": 0.5486249774694443, "epoch": 0.7164179104477612, "grad_norm": 0.12265278398990631, "learning_rate": 0.0002, "loss": 0.5423588752746582, "mean_token_accuracy": 0.7789205312728882, "num_tokens": 3140334.0, "step": 192 }, { "entropy": 0.5567969381809235, "epoch": 0.7201492537313433, "grad_norm": 0.13273917138576508, "learning_rate": 0.0002, "loss": 0.5613058805465698, "mean_token_accuracy": 0.7748401314020157, "num_tokens": 3156490.0, "step": 193 }, { "entropy": 0.558370977640152, "epoch": 0.7238805970149254, "grad_norm": 0.1269926130771637, "learning_rate": 0.0002, "loss": 0.548975944519043, "mean_token_accuracy": 0.7803195267915726, "num_tokens": 3172917.0, "step": 194 }, { "entropy": 0.5645796656608582, "epoch": 0.7276119402985075, "grad_norm": 0.12320506572723389, "learning_rate": 0.0002, "loss": 0.5635199546813965, "mean_token_accuracy": 0.773562416434288, "num_tokens": 3189322.0, "step": 195 }, { "entropy": 0.5316331535577774, "epoch": 0.7313432835820896, "grad_norm": 0.1522948294878006, "learning_rate": 0.0002, "loss": 0.5410732626914978, "mean_token_accuracy": 0.7790966629981995, "num_tokens": 3205551.0, "step": 196 }, { "entropy": 0.5493949502706528, "epoch": 0.7350746268656716, "grad_norm": 0.119343101978302, "learning_rate": 0.0002, "loss": 0.5500541925430298, "mean_token_accuracy": 0.7768760919570923, "num_tokens": 3222029.0, "step": 197 }, { "entropy": 0.5477159917354584, "epoch": 0.7388059701492538, "grad_norm": 0.119729183614254, "learning_rate": 0.0002, "loss": 0.5477977991104126, "mean_token_accuracy": 0.7788135707378387, "num_tokens": 3238421.0, "step": 198 }, { "entropy": 0.5607248842716217, "epoch": 0.7425373134328358, "grad_norm": 0.13485661149024963, "learning_rate": 0.0002, "loss": 0.5701273083686829, "mean_token_accuracy": 0.7674471586942673, "num_tokens": 3254789.0, "step": 199 }, { "entropy": 0.5362051874399185, "epoch": 0.746268656716418, "grad_norm": 0.11599450558423996, "learning_rate": 0.0002, "loss": 0.5382620692253113, "mean_token_accuracy": 0.7804013192653656, "num_tokens": 3270902.0, "step": 200 }, { "entropy": 0.5385442525148392, "epoch": 0.75, "grad_norm": 0.11722157150506973, "learning_rate": 0.0002, "loss": 0.5425242185592651, "mean_token_accuracy": 0.7779103666543961, "num_tokens": 3287148.0, "step": 201 }, { "entropy": 0.5608718395233154, "epoch": 0.753731343283582, "grad_norm": 0.11743324995040894, "learning_rate": 0.0002, "loss": 0.5605480670928955, "mean_token_accuracy": 0.7718753963708878, "num_tokens": 3303602.0, "step": 202 }, { "entropy": 0.5647395998239517, "epoch": 0.7574626865671642, "grad_norm": 0.12360575795173645, "learning_rate": 0.0002, "loss": 0.565830409526825, "mean_token_accuracy": 0.7734925150871277, "num_tokens": 3319914.0, "step": 203 }, { "entropy": 0.5613357871770859, "epoch": 0.7611940298507462, "grad_norm": 0.12299378216266632, "learning_rate": 0.0002, "loss": 0.5502001643180847, "mean_token_accuracy": 0.7780173420906067, "num_tokens": 3336266.0, "step": 204 }, { "entropy": 0.5557620376348495, "epoch": 0.7649253731343284, "grad_norm": 0.13515423238277435, "learning_rate": 0.0002, "loss": 0.5513977408409119, "mean_token_accuracy": 0.7768134474754333, "num_tokens": 3352828.0, "step": 205 }, { "entropy": 0.5312158316373825, "epoch": 0.7686567164179104, "grad_norm": 0.1245652511715889, "learning_rate": 0.0002, "loss": 0.5331584215164185, "mean_token_accuracy": 0.783508375287056, "num_tokens": 3368900.0, "step": 206 }, { "entropy": 0.5540332049131393, "epoch": 0.7723880597014925, "grad_norm": 0.12260495871305466, "learning_rate": 0.0002, "loss": 0.5610563158988953, "mean_token_accuracy": 0.772364541888237, "num_tokens": 3385392.0, "step": 207 }, { "entropy": 0.5408795922994614, "epoch": 0.7761194029850746, "grad_norm": 0.1623620092868805, "learning_rate": 0.0002, "loss": 0.5433046221733093, "mean_token_accuracy": 0.7798032164573669, "num_tokens": 3401604.0, "step": 208 }, { "entropy": 0.5390565246343613, "epoch": 0.7798507462686567, "grad_norm": 0.13042029738426208, "learning_rate": 0.0002, "loss": 0.5478684902191162, "mean_token_accuracy": 0.7792101353406906, "num_tokens": 3417639.0, "step": 209 }, { "entropy": 0.5241924300789833, "epoch": 0.7835820895522388, "grad_norm": 0.13064046204090118, "learning_rate": 0.0002, "loss": 0.5299482941627502, "mean_token_accuracy": 0.7843270599842072, "num_tokens": 3433827.0, "step": 210 }, { "entropy": 0.545391634106636, "epoch": 0.7873134328358209, "grad_norm": 0.14404848217964172, "learning_rate": 0.0002, "loss": 0.539533257484436, "mean_token_accuracy": 0.7797930389642715, "num_tokens": 3450075.0, "step": 211 }, { "entropy": 0.5748691409826279, "epoch": 0.7910447761194029, "grad_norm": 0.13996216654777527, "learning_rate": 0.0002, "loss": 0.5611885786056519, "mean_token_accuracy": 0.7745807766914368, "num_tokens": 3466557.0, "step": 212 }, { "entropy": 0.5685983300209045, "epoch": 0.7947761194029851, "grad_norm": 0.12288983166217804, "learning_rate": 0.0002, "loss": 0.5553888082504272, "mean_token_accuracy": 0.7752144187688828, "num_tokens": 3482978.0, "step": 213 }, { "entropy": 0.5502497553825378, "epoch": 0.7985074626865671, "grad_norm": 0.12848587334156036, "learning_rate": 0.0002, "loss": 0.549103856086731, "mean_token_accuracy": 0.7791820466518402, "num_tokens": 3499378.0, "step": 214 }, { "entropy": 0.5424053594470024, "epoch": 0.8022388059701493, "grad_norm": 0.12519471347332, "learning_rate": 0.0002, "loss": 0.5496050119400024, "mean_token_accuracy": 0.7755117863416672, "num_tokens": 3515899.0, "step": 215 }, { "entropy": 0.5332234650850296, "epoch": 0.8059701492537313, "grad_norm": 0.17385068535804749, "learning_rate": 0.0002, "loss": 0.5551385283470154, "mean_token_accuracy": 0.7749006897211075, "num_tokens": 3532197.0, "step": 216 }, { "entropy": 0.5355218946933746, "epoch": 0.8097014925373134, "grad_norm": 0.1355784386396408, "learning_rate": 0.0002, "loss": 0.5417052507400513, "mean_token_accuracy": 0.7785830944776535, "num_tokens": 3548584.0, "step": 217 }, { "entropy": 0.545543447136879, "epoch": 0.8134328358208955, "grad_norm": 0.10903589427471161, "learning_rate": 0.0002, "loss": 0.5351961255073547, "mean_token_accuracy": 0.7840810418128967, "num_tokens": 3564973.0, "step": 218 }, { "entropy": 0.5678307712078094, "epoch": 0.8171641791044776, "grad_norm": 0.13619016110897064, "learning_rate": 0.0002, "loss": 0.5577275156974792, "mean_token_accuracy": 0.7739268988370895, "num_tokens": 3581436.0, "step": 219 }, { "entropy": 0.5452380776405334, "epoch": 0.8208955223880597, "grad_norm": 0.12011487782001495, "learning_rate": 0.0002, "loss": 0.5431472063064575, "mean_token_accuracy": 0.7790575325489044, "num_tokens": 3597661.0, "step": 220 }, { "entropy": 0.5536454021930695, "epoch": 0.8246268656716418, "grad_norm": 0.10391338169574738, "learning_rate": 0.0002, "loss": 0.5514202117919922, "mean_token_accuracy": 0.7758155465126038, "num_tokens": 3614221.0, "step": 221 }, { "entropy": 0.5350385755300522, "epoch": 0.8283582089552238, "grad_norm": 0.1497930884361267, "learning_rate": 0.0002, "loss": 0.5447626709938049, "mean_token_accuracy": 0.778772234916687, "num_tokens": 3630441.0, "step": 222 }, { "entropy": 0.5551132708787918, "epoch": 0.832089552238806, "grad_norm": 0.12266736477613449, "learning_rate": 0.0002, "loss": 0.558661937713623, "mean_token_accuracy": 0.773910716176033, "num_tokens": 3647039.0, "step": 223 }, { "entropy": 0.5643535554409027, "epoch": 0.835820895522388, "grad_norm": 0.11532776802778244, "learning_rate": 0.0002, "loss": 0.5642860531806946, "mean_token_accuracy": 0.7725937813520432, "num_tokens": 3663412.0, "step": 224 }, { "entropy": 0.5549684166908264, "epoch": 0.8395522388059702, "grad_norm": 0.12639960646629333, "learning_rate": 0.0002, "loss": 0.5532217025756836, "mean_token_accuracy": 0.7739283442497253, "num_tokens": 3679945.0, "step": 225 }, { "entropy": 0.560679629445076, "epoch": 0.8432835820895522, "grad_norm": 0.13600312173366547, "learning_rate": 0.0002, "loss": 0.5514844059944153, "mean_token_accuracy": 0.7773452550172806, "num_tokens": 3696613.0, "step": 226 }, { "entropy": 0.5458584129810333, "epoch": 0.8470149253731343, "grad_norm": 0.10419101268053055, "learning_rate": 0.0002, "loss": 0.5424168109893799, "mean_token_accuracy": 0.7833174467086792, "num_tokens": 3713158.0, "step": 227 }, { "entropy": 0.542242094874382, "epoch": 0.8507462686567164, "grad_norm": 0.1483229100704193, "learning_rate": 0.0002, "loss": 0.5505244731903076, "mean_token_accuracy": 0.7768149822950363, "num_tokens": 3729484.0, "step": 228 }, { "entropy": 0.5342283248901367, "epoch": 0.8544776119402985, "grad_norm": 0.16167280077934265, "learning_rate": 0.0002, "loss": 0.5423468947410583, "mean_token_accuracy": 0.781244620680809, "num_tokens": 3745710.0, "step": 229 }, { "entropy": 0.5557206273078918, "epoch": 0.8582089552238806, "grad_norm": 0.10992418974637985, "learning_rate": 0.0002, "loss": 0.5555332899093628, "mean_token_accuracy": 0.7740505337715149, "num_tokens": 3761974.0, "step": 230 }, { "entropy": 0.5301929265260696, "epoch": 0.8619402985074627, "grad_norm": 0.20067644119262695, "learning_rate": 0.0002, "loss": 0.5325175523757935, "mean_token_accuracy": 0.7839723825454712, "num_tokens": 3777980.0, "step": 231 }, { "entropy": 0.5519733354449272, "epoch": 0.8656716417910447, "grad_norm": 0.11584831774234772, "learning_rate": 0.0002, "loss": 0.547998309135437, "mean_token_accuracy": 0.7752280086278915, "num_tokens": 3794210.0, "step": 232 }, { "entropy": 0.5573844611644745, "epoch": 0.8694029850746269, "grad_norm": 0.14681567251682281, "learning_rate": 0.0002, "loss": 0.5630576014518738, "mean_token_accuracy": 0.7713348120450974, "num_tokens": 3810625.0, "step": 233 }, { "entropy": 0.5614193379878998, "epoch": 0.8731343283582089, "grad_norm": 0.3717029392719269, "learning_rate": 0.0002, "loss": 0.5614831447601318, "mean_token_accuracy": 0.7718814015388489, "num_tokens": 3826871.0, "step": 234 }, { "entropy": 0.5552587062120438, "epoch": 0.8768656716417911, "grad_norm": 0.1315956562757492, "learning_rate": 0.0002, "loss": 0.5541540384292603, "mean_token_accuracy": 0.7746177315711975, "num_tokens": 3843187.0, "step": 235 }, { "entropy": 0.5387386232614517, "epoch": 0.8805970149253731, "grad_norm": 0.4729621112346649, "learning_rate": 0.0002, "loss": 0.5513001084327698, "mean_token_accuracy": 0.777639240026474, "num_tokens": 3859659.0, "step": 236 }, { "entropy": 0.5589011460542679, "epoch": 0.8843283582089553, "grad_norm": 0.11313692480325699, "learning_rate": 0.0002, "loss": 0.550857424736023, "mean_token_accuracy": 0.7776817381381989, "num_tokens": 3876082.0, "step": 237 }, { "entropy": 0.5506832748651505, "epoch": 0.8880597014925373, "grad_norm": 0.15838703513145447, "learning_rate": 0.0002, "loss": 0.5493965148925781, "mean_token_accuracy": 0.774595633149147, "num_tokens": 3892310.0, "step": 238 }, { "entropy": 0.5482196658849716, "epoch": 0.8917910447761194, "grad_norm": 0.16354775428771973, "learning_rate": 0.0002, "loss": 0.549696147441864, "mean_token_accuracy": 0.7784011512994766, "num_tokens": 3908561.0, "step": 239 }, { "entropy": 0.5474406778812408, "epoch": 0.8955223880597015, "grad_norm": 0.11488547921180725, "learning_rate": 0.0002, "loss": 0.5442180037498474, "mean_token_accuracy": 0.7787186056375504, "num_tokens": 3924971.0, "step": 240 }, { "entropy": 0.5576506555080414, "epoch": 0.8992537313432836, "grad_norm": 0.11725704371929169, "learning_rate": 0.0002, "loss": 0.5556765794754028, "mean_token_accuracy": 0.7754130512475967, "num_tokens": 3941384.0, "step": 241 }, { "entropy": 0.5686157792806625, "epoch": 0.9029850746268657, "grad_norm": 0.1209690198302269, "learning_rate": 0.0002, "loss": 0.5740119218826294, "mean_token_accuracy": 0.7644337117671967, "num_tokens": 3957527.0, "step": 242 }, { "entropy": 0.5520821809768677, "epoch": 0.9067164179104478, "grad_norm": 0.1097254753112793, "learning_rate": 0.0002, "loss": 0.5524159669876099, "mean_token_accuracy": 0.7778758704662323, "num_tokens": 3973803.0, "step": 243 }, { "entropy": 0.5603332817554474, "epoch": 0.9104477611940298, "grad_norm": 0.13421349227428436, "learning_rate": 0.0002, "loss": 0.5633103251457214, "mean_token_accuracy": 0.7723569422960281, "num_tokens": 3990124.0, "step": 244 }, { "entropy": 0.5404402911663055, "epoch": 0.914179104477612, "grad_norm": 0.12017542868852615, "learning_rate": 0.0002, "loss": 0.5424325466156006, "mean_token_accuracy": 0.7823856174945831, "num_tokens": 4006560.0, "step": 245 }, { "entropy": 0.5605191737413406, "epoch": 0.917910447761194, "grad_norm": 0.14128640294075012, "learning_rate": 0.0002, "loss": 0.5602733492851257, "mean_token_accuracy": 0.7735545933246613, "num_tokens": 4022966.0, "step": 246 }, { "entropy": 0.5599958896636963, "epoch": 0.9216417910447762, "grad_norm": 0.11880706995725632, "learning_rate": 0.0002, "loss": 0.5598034858703613, "mean_token_accuracy": 0.7717109471559525, "num_tokens": 4039261.0, "step": 247 }, { "entropy": 0.5408921539783478, "epoch": 0.9253731343283582, "grad_norm": 0.12040922045707703, "learning_rate": 0.0002, "loss": 0.5460969805717468, "mean_token_accuracy": 0.7793735712766647, "num_tokens": 4055343.0, "step": 248 }, { "entropy": 0.5573666542768478, "epoch": 0.9291044776119403, "grad_norm": 0.12093377858400345, "learning_rate": 0.0002, "loss": 0.556143045425415, "mean_token_accuracy": 0.7752596288919449, "num_tokens": 4071770.0, "step": 249 }, { "entropy": 0.563015878200531, "epoch": 0.9328358208955224, "grad_norm": 0.11447741836309433, "learning_rate": 0.0002, "loss": 0.5647203922271729, "mean_token_accuracy": 0.7692370861768723, "num_tokens": 4088034.0, "step": 250 }, { "entropy": 0.548077866435051, "epoch": 0.9365671641791045, "grad_norm": 0.11981664597988129, "learning_rate": 0.0002, "loss": 0.5454928278923035, "mean_token_accuracy": 0.7787458151578903, "num_tokens": 4104196.0, "step": 251 }, { "entropy": 0.5375737547874451, "epoch": 0.9402985074626866, "grad_norm": 0.12071040272712708, "learning_rate": 0.0002, "loss": 0.5404340028762817, "mean_token_accuracy": 0.7799674719572067, "num_tokens": 4120470.0, "step": 252 }, { "entropy": 0.547912061214447, "epoch": 0.9440298507462687, "grad_norm": 0.12739375233650208, "learning_rate": 0.0002, "loss": 0.5530076026916504, "mean_token_accuracy": 0.7753598988056183, "num_tokens": 4136885.0, "step": 253 }, { "entropy": 0.5538879930973053, "epoch": 0.9477611940298507, "grad_norm": 0.12144653499126434, "learning_rate": 0.0002, "loss": 0.5514186024665833, "mean_token_accuracy": 0.7753842920064926, "num_tokens": 4153216.0, "step": 254 }, { "entropy": 0.5411302447319031, "epoch": 0.9514925373134329, "grad_norm": 0.11099912226200104, "learning_rate": 0.0002, "loss": 0.5385034084320068, "mean_token_accuracy": 0.7812628000974655, "num_tokens": 4169402.0, "step": 255 }, { "entropy": 0.5564829558134079, "epoch": 0.9552238805970149, "grad_norm": 0.12310667335987091, "learning_rate": 0.0002, "loss": 0.5534285306930542, "mean_token_accuracy": 0.7745526880025864, "num_tokens": 4185847.0, "step": 256 }, { "entropy": 0.5459543019533157, "epoch": 0.9589552238805971, "grad_norm": 0.1408655047416687, "learning_rate": 0.0002, "loss": 0.539636492729187, "mean_token_accuracy": 0.7817695140838623, "num_tokens": 4202324.0, "step": 257 }, { "entropy": 0.5483512580394745, "epoch": 0.9626865671641791, "grad_norm": 0.1329817920923233, "learning_rate": 0.0002, "loss": 0.5545552968978882, "mean_token_accuracy": 0.7754471302032471, "num_tokens": 4218485.0, "step": 258 }, { "entropy": 0.5507388859987259, "epoch": 0.9664179104477612, "grad_norm": 0.14522868394851685, "learning_rate": 0.0002, "loss": 0.5539411306381226, "mean_token_accuracy": 0.776690736413002, "num_tokens": 4234830.0, "step": 259 }, { "entropy": 0.5551155656576157, "epoch": 0.9701492537313433, "grad_norm": 0.1110503152012825, "learning_rate": 0.0002, "loss": 0.5517114996910095, "mean_token_accuracy": 0.7778125107288361, "num_tokens": 4251249.0, "step": 260 }, { "entropy": 0.5606275051832199, "epoch": 0.9738805970149254, "grad_norm": 0.11907053738832474, "learning_rate": 0.0002, "loss": 0.5583968162536621, "mean_token_accuracy": 0.7729120701551437, "num_tokens": 4267571.0, "step": 261 }, { "entropy": 0.5697215348482132, "epoch": 0.9776119402985075, "grad_norm": 0.11226138472557068, "learning_rate": 0.0002, "loss": 0.5654243230819702, "mean_token_accuracy": 0.7697847783565521, "num_tokens": 4283938.0, "step": 262 }, { "entropy": 0.5614341050386429, "epoch": 0.9813432835820896, "grad_norm": 0.12085731327533722, "learning_rate": 0.0002, "loss": 0.5629435777664185, "mean_token_accuracy": 0.7714052200317383, "num_tokens": 4300727.0, "step": 263 }, { "entropy": 0.5495717078447342, "epoch": 0.9850746268656716, "grad_norm": 0.1363348811864853, "learning_rate": 0.0002, "loss": 0.5549257397651672, "mean_token_accuracy": 0.7735868841409683, "num_tokens": 4316903.0, "step": 264 }, { "entropy": 0.5352297425270081, "epoch": 0.9888059701492538, "grad_norm": 0.1429988294839859, "learning_rate": 0.0002, "loss": 0.5460555553436279, "mean_token_accuracy": 0.7814377993345261, "num_tokens": 4333143.0, "step": 265 }, { "entropy": 0.5603132396936417, "epoch": 0.9925373134328358, "grad_norm": 0.14986178278923035, "learning_rate": 0.0002, "loss": 0.5551425218582153, "mean_token_accuracy": 0.7773159593343735, "num_tokens": 4349576.0, "step": 266 }, { "entropy": 0.5535064339637756, "epoch": 0.996268656716418, "grad_norm": 0.1105998232960701, "learning_rate": 0.0002, "loss": 0.5442855954170227, "mean_token_accuracy": 0.7821661084890366, "num_tokens": 4365977.0, "step": 267 }, { "entropy": 0.5614945888519287, "epoch": 1.0, "grad_norm": 0.12907235324382782, "learning_rate": 0.0002, "loss": 0.5476444959640503, "mean_token_accuracy": 0.7792651057243347, "num_tokens": 4382526.0, "step": 268 }, { "entropy": 0.548059806227684, "epoch": 1.0037313432835822, "grad_norm": 0.12145893275737762, "learning_rate": 0.0002, "loss": 0.5402656197547913, "mean_token_accuracy": 0.7813442945480347, "num_tokens": 4399005.0, "step": 269 }, { "entropy": 0.5212839543819427, "epoch": 1.007462686567164, "grad_norm": 0.1396404206752777, "learning_rate": 0.0002, "loss": 0.5315491557121277, "mean_token_accuracy": 0.7839601635932922, "num_tokens": 4415205.0, "step": 270 }, { "entropy": 0.5132785737514496, "epoch": 1.0111940298507462, "grad_norm": 0.1433689296245575, "learning_rate": 0.0002, "loss": 0.5299959778785706, "mean_token_accuracy": 0.7853466272354126, "num_tokens": 4431512.0, "step": 271 }, { "entropy": 0.5394517332315445, "epoch": 1.0149253731343284, "grad_norm": 0.11504881829023361, "learning_rate": 0.0002, "loss": 0.5439318418502808, "mean_token_accuracy": 0.7786544561386108, "num_tokens": 4447878.0, "step": 272 }, { "entropy": 0.5173204094171524, "epoch": 1.0186567164179103, "grad_norm": 0.12369395047426224, "learning_rate": 0.0002, "loss": 0.525097668170929, "mean_token_accuracy": 0.7878104597330093, "num_tokens": 4464069.0, "step": 273 }, { "entropy": 0.5443273782730103, "epoch": 1.0223880597014925, "grad_norm": 0.12611854076385498, "learning_rate": 0.0002, "loss": 0.5425093770027161, "mean_token_accuracy": 0.7833482921123505, "num_tokens": 4480510.0, "step": 274 }, { "entropy": 0.5319035351276398, "epoch": 1.0261194029850746, "grad_norm": 0.11637023091316223, "learning_rate": 0.0002, "loss": 0.5231828093528748, "mean_token_accuracy": 0.788045197725296, "num_tokens": 4496734.0, "step": 275 }, { "entropy": 0.5645869076251984, "epoch": 1.0298507462686568, "grad_norm": 0.11970556527376175, "learning_rate": 0.0002, "loss": 0.556399405002594, "mean_token_accuracy": 0.7753234058618546, "num_tokens": 4513272.0, "step": 276 }, { "entropy": 0.5412048548460007, "epoch": 1.0335820895522387, "grad_norm": 0.12889669835567474, "learning_rate": 0.0002, "loss": 0.5352495908737183, "mean_token_accuracy": 0.7822704613208771, "num_tokens": 4529760.0, "step": 277 }, { "entropy": 0.5433377772569656, "epoch": 1.037313432835821, "grad_norm": 0.15610089898109436, "learning_rate": 0.0002, "loss": 0.5424712896347046, "mean_token_accuracy": 0.7791996449232101, "num_tokens": 4546065.0, "step": 278 }, { "entropy": 0.5367715954780579, "epoch": 1.041044776119403, "grad_norm": 0.1712978631258011, "learning_rate": 0.0002, "loss": 0.5500761270523071, "mean_token_accuracy": 0.7774211019277573, "num_tokens": 4562404.0, "step": 279 }, { "entropy": 0.5348818898200989, "epoch": 1.044776119402985, "grad_norm": 0.14415498077869415, "learning_rate": 0.0002, "loss": 0.5458697080612183, "mean_token_accuracy": 0.7776882946491241, "num_tokens": 4578594.0, "step": 280 }, { "entropy": 0.5394753366708755, "epoch": 1.0485074626865671, "grad_norm": 0.17060807347297668, "learning_rate": 0.0002, "loss": 0.5428628921508789, "mean_token_accuracy": 0.7797123193740845, "num_tokens": 4594918.0, "step": 281 }, { "entropy": 0.5477339029312134, "epoch": 1.0522388059701493, "grad_norm": 0.12646426260471344, "learning_rate": 0.0002, "loss": 0.5376375913619995, "mean_token_accuracy": 0.7846843749284744, "num_tokens": 4611225.0, "step": 282 }, { "entropy": 0.553899347782135, "epoch": 1.0559701492537314, "grad_norm": 0.14560198783874512, "learning_rate": 0.0002, "loss": 0.5442871451377869, "mean_token_accuracy": 0.779757484793663, "num_tokens": 4627515.0, "step": 283 }, { "entropy": 0.544152095913887, "epoch": 1.0597014925373134, "grad_norm": 0.14532814919948578, "learning_rate": 0.0002, "loss": 0.5495354533195496, "mean_token_accuracy": 0.7756282091140747, "num_tokens": 4644151.0, "step": 284 }, { "entropy": 0.5467684864997864, "epoch": 1.0634328358208955, "grad_norm": 0.14399303495883942, "learning_rate": 0.0002, "loss": 0.5551741123199463, "mean_token_accuracy": 0.7747452855110168, "num_tokens": 4660349.0, "step": 285 }, { "entropy": 0.5328090041875839, "epoch": 1.0671641791044777, "grad_norm": 0.1490914672613144, "learning_rate": 0.0002, "loss": 0.5371617674827576, "mean_token_accuracy": 0.7852603644132614, "num_tokens": 4676682.0, "step": 286 }, { "entropy": 0.5549953877925873, "epoch": 1.0708955223880596, "grad_norm": 0.13986609876155853, "learning_rate": 0.0002, "loss": 0.5485588312149048, "mean_token_accuracy": 0.7786588221788406, "num_tokens": 4693087.0, "step": 287 }, { "entropy": 0.5441232770681381, "epoch": 1.0746268656716418, "grad_norm": 0.13744987547397614, "learning_rate": 0.0002, "loss": 0.5352811813354492, "mean_token_accuracy": 0.7830296456813812, "num_tokens": 4709482.0, "step": 288 }, { "entropy": 0.5388935655355453, "epoch": 1.078358208955224, "grad_norm": 0.12793688476085663, "learning_rate": 0.0002, "loss": 0.5364757776260376, "mean_token_accuracy": 0.780993863940239, "num_tokens": 4725929.0, "step": 289 }, { "entropy": 0.5281359702348709, "epoch": 1.0820895522388059, "grad_norm": 0.11734890192747116, "learning_rate": 0.0002, "loss": 0.5293084979057312, "mean_token_accuracy": 0.7876105159521103, "num_tokens": 4742317.0, "step": 290 }, { "entropy": 0.5459820628166199, "epoch": 1.085820895522388, "grad_norm": 0.12839624285697937, "learning_rate": 0.0002, "loss": 0.5461269617080688, "mean_token_accuracy": 0.7763439863920212, "num_tokens": 4758682.0, "step": 291 }, { "entropy": 0.5111119300127029, "epoch": 1.0895522388059702, "grad_norm": 0.1377914845943451, "learning_rate": 0.0002, "loss": 0.5165018439292908, "mean_token_accuracy": 0.792814165353775, "num_tokens": 4775165.0, "step": 292 }, { "entropy": 0.5256515890359879, "epoch": 1.0932835820895523, "grad_norm": 0.13310879468917847, "learning_rate": 0.0002, "loss": 0.5263264179229736, "mean_token_accuracy": 0.7891132682561874, "num_tokens": 4791249.0, "step": 293 }, { "entropy": 0.5361033976078033, "epoch": 1.0970149253731343, "grad_norm": 0.11920680850744247, "learning_rate": 0.0002, "loss": 0.5344924926757812, "mean_token_accuracy": 0.7844657897949219, "num_tokens": 4807722.0, "step": 294 }, { "entropy": 0.547529011964798, "epoch": 1.1007462686567164, "grad_norm": 0.15012222528457642, "learning_rate": 0.0002, "loss": 0.5434770584106445, "mean_token_accuracy": 0.7794990837574005, "num_tokens": 4824221.0, "step": 295 }, { "entropy": 0.5387088805437088, "epoch": 1.1044776119402986, "grad_norm": 0.11607323586940765, "learning_rate": 0.0002, "loss": 0.5379114151000977, "mean_token_accuracy": 0.7820580452680588, "num_tokens": 4840561.0, "step": 296 }, { "entropy": 0.5285296589136124, "epoch": 1.1082089552238805, "grad_norm": 0.16472671926021576, "learning_rate": 0.0002, "loss": 0.5286039710044861, "mean_token_accuracy": 0.7859488725662231, "num_tokens": 4856739.0, "step": 297 }, { "entropy": 0.5467464625835419, "epoch": 1.1119402985074627, "grad_norm": 0.12136011570692062, "learning_rate": 0.0002, "loss": 0.5486158132553101, "mean_token_accuracy": 0.7766989320516586, "num_tokens": 4873254.0, "step": 298 }, { "entropy": 0.5323450714349747, "epoch": 1.1156716417910448, "grad_norm": 0.15763746201992035, "learning_rate": 0.0002, "loss": 0.53644198179245, "mean_token_accuracy": 0.7847353965044022, "num_tokens": 4889763.0, "step": 299 }, { "entropy": 0.5294622331857681, "epoch": 1.1194029850746268, "grad_norm": 0.14253245294094086, "learning_rate": 0.0002, "loss": 0.5327939987182617, "mean_token_accuracy": 0.7873322665691376, "num_tokens": 4905780.0, "step": 300 }, { "entropy": 0.5500210523605347, "epoch": 1.123134328358209, "grad_norm": 0.1611548215150833, "learning_rate": 0.0002, "loss": 0.55262291431427, "mean_token_accuracy": 0.7771656811237335, "num_tokens": 4921935.0, "step": 301 }, { "entropy": 0.5608504116535187, "epoch": 1.126865671641791, "grad_norm": 0.14609341323375702, "learning_rate": 0.0002, "loss": 0.5597085952758789, "mean_token_accuracy": 0.773489698767662, "num_tokens": 4938566.0, "step": 302 }, { "entropy": 0.541571170091629, "epoch": 1.1305970149253732, "grad_norm": 0.11906211823225021, "learning_rate": 0.0002, "loss": 0.541067361831665, "mean_token_accuracy": 0.7795013040304184, "num_tokens": 4954995.0, "step": 303 }, { "entropy": 0.5374023020267487, "epoch": 1.1343283582089552, "grad_norm": 0.191620334982872, "learning_rate": 0.0002, "loss": 0.540854811668396, "mean_token_accuracy": 0.783530056476593, "num_tokens": 4971285.0, "step": 304 }, { "entropy": 0.5237517058849335, "epoch": 1.1380597014925373, "grad_norm": 0.13355116546154022, "learning_rate": 0.0002, "loss": 0.5256230235099792, "mean_token_accuracy": 0.7869999557733536, "num_tokens": 4987629.0, "step": 305 }, { "entropy": 0.5161513015627861, "epoch": 1.1417910447761195, "grad_norm": 0.14180561900138855, "learning_rate": 0.0002, "loss": 0.5189639925956726, "mean_token_accuracy": 0.7884562611579895, "num_tokens": 5003816.0, "step": 306 }, { "entropy": 0.5333078503608704, "epoch": 1.1455223880597014, "grad_norm": 0.11995179206132889, "learning_rate": 0.0002, "loss": 0.5338060259819031, "mean_token_accuracy": 0.7834619730710983, "num_tokens": 5020179.0, "step": 307 }, { "entropy": 0.5374015420675278, "epoch": 1.1492537313432836, "grad_norm": 0.14065897464752197, "learning_rate": 0.0002, "loss": 0.541375994682312, "mean_token_accuracy": 0.7836798280477524, "num_tokens": 5036421.0, "step": 308 }, { "entropy": 0.5318789333105087, "epoch": 1.1529850746268657, "grad_norm": 0.15007704496383667, "learning_rate": 0.0002, "loss": 0.5320872664451599, "mean_token_accuracy": 0.7854835838079453, "num_tokens": 5052767.0, "step": 309 }, { "entropy": 0.5555961728096008, "epoch": 1.1567164179104479, "grad_norm": 0.12327966094017029, "learning_rate": 0.0002, "loss": 0.5514441728591919, "mean_token_accuracy": 0.775398313999176, "num_tokens": 5069219.0, "step": 310 }, { "entropy": 0.5369515269994736, "epoch": 1.1604477611940298, "grad_norm": 0.13790592551231384, "learning_rate": 0.0002, "loss": 0.5307064652442932, "mean_token_accuracy": 0.7870743423700333, "num_tokens": 5085637.0, "step": 311 }, { "entropy": 0.5395635664463043, "epoch": 1.164179104477612, "grad_norm": 0.12657856941223145, "learning_rate": 0.0002, "loss": 0.539893388748169, "mean_token_accuracy": 0.7809743881225586, "num_tokens": 5101984.0, "step": 312 }, { "entropy": 0.5528725534677505, "epoch": 1.1679104477611941, "grad_norm": 0.15744967758655548, "learning_rate": 0.0002, "loss": 0.5551643967628479, "mean_token_accuracy": 0.7749461233615875, "num_tokens": 5118457.0, "step": 313 }, { "entropy": 0.5547244101762772, "epoch": 1.171641791044776, "grad_norm": 0.14667753875255585, "learning_rate": 0.0002, "loss": 0.5545704364776611, "mean_token_accuracy": 0.7767890095710754, "num_tokens": 5135070.0, "step": 314 }, { "entropy": 0.5513405501842499, "epoch": 1.1753731343283582, "grad_norm": 0.13363401591777802, "learning_rate": 0.0002, "loss": 0.5478935241699219, "mean_token_accuracy": 0.7782707363367081, "num_tokens": 5151457.0, "step": 315 }, { "entropy": 0.5504343062639236, "epoch": 1.1791044776119404, "grad_norm": 0.14427515864372253, "learning_rate": 0.0002, "loss": 0.5503411293029785, "mean_token_accuracy": 0.7759760916233063, "num_tokens": 5167918.0, "step": 316 }, { "entropy": 0.5411941558122635, "epoch": 1.1828358208955223, "grad_norm": 0.13475076854228973, "learning_rate": 0.0002, "loss": 0.5334619283676147, "mean_token_accuracy": 0.7848760634660721, "num_tokens": 5184250.0, "step": 317 }, { "entropy": 0.5534447133541107, "epoch": 1.1865671641791045, "grad_norm": 0.14666007459163666, "learning_rate": 0.0002, "loss": 0.5606579184532166, "mean_token_accuracy": 0.7732094079256058, "num_tokens": 5200728.0, "step": 318 }, { "entropy": 0.5172414779663086, "epoch": 1.1902985074626866, "grad_norm": 0.1494058072566986, "learning_rate": 0.0002, "loss": 0.5262372493743896, "mean_token_accuracy": 0.787101224064827, "num_tokens": 5216948.0, "step": 319 }, { "entropy": 0.5277577340602875, "epoch": 1.1940298507462686, "grad_norm": 0.15135720372200012, "learning_rate": 0.0002, "loss": 0.5401796102523804, "mean_token_accuracy": 0.7809148728847504, "num_tokens": 5233422.0, "step": 320 }, { "entropy": 0.5246866941452026, "epoch": 1.1977611940298507, "grad_norm": 0.12589603662490845, "learning_rate": 0.0002, "loss": 0.5281919836997986, "mean_token_accuracy": 0.7868399173021317, "num_tokens": 5249730.0, "step": 321 }, { "entropy": 0.5274995267391205, "epoch": 1.2014925373134329, "grad_norm": 0.11834204196929932, "learning_rate": 0.0002, "loss": 0.5278512835502625, "mean_token_accuracy": 0.7852350920438766, "num_tokens": 5266115.0, "step": 322 }, { "entropy": 0.5320824682712555, "epoch": 1.205223880597015, "grad_norm": 0.13883750140666962, "learning_rate": 0.0002, "loss": 0.5280960202217102, "mean_token_accuracy": 0.7858837693929672, "num_tokens": 5282462.0, "step": 323 }, { "entropy": 0.5404033660888672, "epoch": 1.208955223880597, "grad_norm": 0.13842950761318207, "learning_rate": 0.0002, "loss": 0.5391522645950317, "mean_token_accuracy": 0.7815057188272476, "num_tokens": 5299103.0, "step": 324 }, { "entropy": 0.5260981917381287, "epoch": 1.212686567164179, "grad_norm": 0.14888468384742737, "learning_rate": 0.0002, "loss": 0.5250783562660217, "mean_token_accuracy": 0.7861860394477844, "num_tokens": 5315339.0, "step": 325 }, { "entropy": 0.5244043916463852, "epoch": 1.2164179104477613, "grad_norm": 0.12871688604354858, "learning_rate": 0.0002, "loss": 0.5234277844429016, "mean_token_accuracy": 0.787299633026123, "num_tokens": 5331854.0, "step": 326 }, { "entropy": 0.5336845368146896, "epoch": 1.2201492537313432, "grad_norm": 0.1279512345790863, "learning_rate": 0.0002, "loss": 0.5357816815376282, "mean_token_accuracy": 0.7811597734689713, "num_tokens": 5348268.0, "step": 327 }, { "entropy": 0.5396746844053268, "epoch": 1.2238805970149254, "grad_norm": 0.1272435188293457, "learning_rate": 0.0002, "loss": 0.5367811322212219, "mean_token_accuracy": 0.7815662026405334, "num_tokens": 5364832.0, "step": 328 }, { "entropy": 0.5355321317911148, "epoch": 1.2276119402985075, "grad_norm": 0.12457006424665451, "learning_rate": 0.0002, "loss": 0.5324679613113403, "mean_token_accuracy": 0.7855342030525208, "num_tokens": 5381181.0, "step": 329 }, { "entropy": 0.5404689311981201, "epoch": 1.2313432835820897, "grad_norm": 0.1616295725107193, "learning_rate": 0.0002, "loss": 0.5461254715919495, "mean_token_accuracy": 0.7793011963367462, "num_tokens": 5397689.0, "step": 330 }, { "entropy": 0.5573465675115585, "epoch": 1.2350746268656716, "grad_norm": 0.1567206233739853, "learning_rate": 0.0002, "loss": 0.5680751204490662, "mean_token_accuracy": 0.7683437466621399, "num_tokens": 5414063.0, "step": 331 }, { "entropy": 0.5585090219974518, "epoch": 1.2388059701492538, "grad_norm": 0.13362006843090057, "learning_rate": 0.0002, "loss": 0.5544182658195496, "mean_token_accuracy": 0.7759232968091965, "num_tokens": 5430545.0, "step": 332 }, { "entropy": 0.5479722023010254, "epoch": 1.242537313432836, "grad_norm": 0.16734908521175385, "learning_rate": 0.0002, "loss": 0.5447990298271179, "mean_token_accuracy": 0.7797949612140656, "num_tokens": 5446700.0, "step": 333 }, { "entropy": 0.5607796311378479, "epoch": 1.2462686567164178, "grad_norm": 0.1450573354959488, "learning_rate": 0.0002, "loss": 0.556632936000824, "mean_token_accuracy": 0.7769130021333694, "num_tokens": 5463137.0, "step": 334 }, { "entropy": 0.5538843423128128, "epoch": 1.25, "grad_norm": 0.12896743416786194, "learning_rate": 0.0002, "loss": 0.5562998056411743, "mean_token_accuracy": 0.7745624631643295, "num_tokens": 5479659.0, "step": 335 }, { "entropy": 0.5309284329414368, "epoch": 1.2537313432835822, "grad_norm": 0.1323668360710144, "learning_rate": 0.0002, "loss": 0.5389367341995239, "mean_token_accuracy": 0.7794619351625443, "num_tokens": 5495884.0, "step": 336 }, { "entropy": 0.5279457420110703, "epoch": 1.2574626865671643, "grad_norm": 0.16464678943157196, "learning_rate": 0.0002, "loss": 0.540420413017273, "mean_token_accuracy": 0.7797137498855591, "num_tokens": 5512288.0, "step": 337 }, { "entropy": 0.5431296676397324, "epoch": 1.2611940298507462, "grad_norm": 0.15366457402706146, "learning_rate": 0.0002, "loss": 0.5533568263053894, "mean_token_accuracy": 0.7777420580387115, "num_tokens": 5528739.0, "step": 338 }, { "entropy": 0.5533888936042786, "epoch": 1.2649253731343284, "grad_norm": 0.15439164638519287, "learning_rate": 0.0002, "loss": 0.5407285690307617, "mean_token_accuracy": 0.7848910838365555, "num_tokens": 5545180.0, "step": 339 }, { "entropy": 0.5363039374351501, "epoch": 1.2686567164179103, "grad_norm": 0.14024227857589722, "learning_rate": 0.0002, "loss": 0.5247921943664551, "mean_token_accuracy": 0.7866441905498505, "num_tokens": 5561365.0, "step": 340 }, { "entropy": 0.5282331109046936, "epoch": 1.2723880597014925, "grad_norm": 0.15727277100086212, "learning_rate": 0.0002, "loss": 0.5256697535514832, "mean_token_accuracy": 0.7857891470193863, "num_tokens": 5577609.0, "step": 341 }, { "entropy": 0.5532326549291611, "epoch": 1.2761194029850746, "grad_norm": 0.14312665164470673, "learning_rate": 0.0002, "loss": 0.5558714866638184, "mean_token_accuracy": 0.776502713561058, "num_tokens": 5593922.0, "step": 342 }, { "entropy": 0.5117308422923088, "epoch": 1.2798507462686568, "grad_norm": 0.13982926309108734, "learning_rate": 0.0002, "loss": 0.5216178894042969, "mean_token_accuracy": 0.7898732572793961, "num_tokens": 5610160.0, "step": 343 }, { "entropy": 0.5327529311180115, "epoch": 1.2835820895522387, "grad_norm": 0.1600239872932434, "learning_rate": 0.0002, "loss": 0.54588383436203, "mean_token_accuracy": 0.7827021777629852, "num_tokens": 5626483.0, "step": 344 }, { "entropy": 0.5456168502569199, "epoch": 1.287313432835821, "grad_norm": 0.1314232498407364, "learning_rate": 0.0002, "loss": 0.5445138216018677, "mean_token_accuracy": 0.7821621298789978, "num_tokens": 5642705.0, "step": 345 }, { "entropy": 0.5568868517875671, "epoch": 1.291044776119403, "grad_norm": 0.12736710906028748, "learning_rate": 0.0002, "loss": 0.5469453930854797, "mean_token_accuracy": 0.7784760594367981, "num_tokens": 5659144.0, "step": 346 }, { "entropy": 0.5525211989879608, "epoch": 1.294776119402985, "grad_norm": 0.11654646694660187, "learning_rate": 0.0002, "loss": 0.542698323726654, "mean_token_accuracy": 0.7785234600305557, "num_tokens": 5675452.0, "step": 347 }, { "entropy": 0.5460808724164963, "epoch": 1.2985074626865671, "grad_norm": 0.1318521350622177, "learning_rate": 0.0002, "loss": 0.5390938520431519, "mean_token_accuracy": 0.7815311253070831, "num_tokens": 5691735.0, "step": 348 }, { "entropy": 0.5437112301588058, "epoch": 1.3022388059701493, "grad_norm": 0.13485990464687347, "learning_rate": 0.0002, "loss": 0.5420966148376465, "mean_token_accuracy": 0.7827932983636856, "num_tokens": 5708102.0, "step": 349 }, { "entropy": 0.5493648052215576, "epoch": 1.3059701492537314, "grad_norm": 0.14354610443115234, "learning_rate": 0.0002, "loss": 0.5561747550964355, "mean_token_accuracy": 0.7761517763137817, "num_tokens": 5724350.0, "step": 350 }, { "entropy": 0.5344854891300201, "epoch": 1.3097014925373134, "grad_norm": 0.15943452715873718, "learning_rate": 0.0002, "loss": 0.5391569137573242, "mean_token_accuracy": 0.7805770933628082, "num_tokens": 5740954.0, "step": 351 }, { "entropy": 0.5242450833320618, "epoch": 1.3134328358208955, "grad_norm": 0.13654360175132751, "learning_rate": 0.0002, "loss": 0.5292847156524658, "mean_token_accuracy": 0.784620076417923, "num_tokens": 5757385.0, "step": 352 }, { "entropy": 0.5383377820253372, "epoch": 1.3171641791044777, "grad_norm": 0.13651302456855774, "learning_rate": 0.0002, "loss": 0.5413467288017273, "mean_token_accuracy": 0.7786675840616226, "num_tokens": 5773852.0, "step": 353 }, { "entropy": 0.5402452051639557, "epoch": 1.3208955223880596, "grad_norm": 0.13241973519325256, "learning_rate": 0.0002, "loss": 0.5419248938560486, "mean_token_accuracy": 0.778145432472229, "num_tokens": 5790055.0, "step": 354 }, { "entropy": 0.5536379665136337, "epoch": 1.3246268656716418, "grad_norm": 0.13762575387954712, "learning_rate": 0.0002, "loss": 0.5484678745269775, "mean_token_accuracy": 0.7766116708517075, "num_tokens": 5806738.0, "step": 355 }, { "entropy": 0.5532735884189606, "epoch": 1.328358208955224, "grad_norm": 0.12154927849769592, "learning_rate": 0.0002, "loss": 0.5548056960105896, "mean_token_accuracy": 0.7753622978925705, "num_tokens": 5823183.0, "step": 356 }, { "entropy": 0.5448320060968399, "epoch": 1.332089552238806, "grad_norm": 0.144795224070549, "learning_rate": 0.0002, "loss": 0.5448752641677856, "mean_token_accuracy": 0.7790551483631134, "num_tokens": 5839499.0, "step": 357 }, { "entropy": 0.5511485040187836, "epoch": 1.335820895522388, "grad_norm": 0.13511039316654205, "learning_rate": 0.0002, "loss": 0.5528499484062195, "mean_token_accuracy": 0.776659682393074, "num_tokens": 5855921.0, "step": 358 }, { "entropy": 0.5290715843439102, "epoch": 1.3395522388059702, "grad_norm": 0.11837369203567505, "learning_rate": 0.0002, "loss": 0.5328022241592407, "mean_token_accuracy": 0.7826089113950729, "num_tokens": 5872142.0, "step": 359 }, { "entropy": 0.5363620519638062, "epoch": 1.3432835820895521, "grad_norm": 0.12029700726270676, "learning_rate": 0.0002, "loss": 0.534315824508667, "mean_token_accuracy": 0.7845976501703262, "num_tokens": 5888484.0, "step": 360 }, { "entropy": 0.5347290933132172, "epoch": 1.3470149253731343, "grad_norm": 0.13828180730342865, "learning_rate": 0.0002, "loss": 0.5338245630264282, "mean_token_accuracy": 0.7808255851268768, "num_tokens": 5904613.0, "step": 361 }, { "entropy": 0.5324546545743942, "epoch": 1.3507462686567164, "grad_norm": 0.12894095480442047, "learning_rate": 0.0002, "loss": 0.5361336469650269, "mean_token_accuracy": 0.7821396291255951, "num_tokens": 5920864.0, "step": 362 }, { "entropy": 0.5308556854724884, "epoch": 1.3544776119402986, "grad_norm": 0.11929216980934143, "learning_rate": 0.0002, "loss": 0.5275416374206543, "mean_token_accuracy": 0.7852365076541901, "num_tokens": 5937108.0, "step": 363 }, { "entropy": 0.53159399330616, "epoch": 1.3582089552238805, "grad_norm": 0.14378131926059723, "learning_rate": 0.0002, "loss": 0.5424759387969971, "mean_token_accuracy": 0.7792777568101883, "num_tokens": 5953233.0, "step": 364 }, { "entropy": 0.5450653731822968, "epoch": 1.3619402985074627, "grad_norm": 0.14581741392612457, "learning_rate": 0.0002, "loss": 0.5530756115913391, "mean_token_accuracy": 0.7765647917985916, "num_tokens": 5969681.0, "step": 365 }, { "entropy": 0.5418213754892349, "epoch": 1.3656716417910448, "grad_norm": 0.13764694333076477, "learning_rate": 0.0002, "loss": 0.5494720935821533, "mean_token_accuracy": 0.7783620804548264, "num_tokens": 5985895.0, "step": 366 }, { "entropy": 0.5528892427682877, "epoch": 1.3694029850746268, "grad_norm": 0.14292745292186737, "learning_rate": 0.0002, "loss": 0.5427901148796082, "mean_token_accuracy": 0.7794772684574127, "num_tokens": 6002104.0, "step": 367 }, { "entropy": 0.5515422970056534, "epoch": 1.373134328358209, "grad_norm": 0.12165708839893341, "learning_rate": 0.0002, "loss": 0.5388676524162292, "mean_token_accuracy": 0.7821601629257202, "num_tokens": 6018297.0, "step": 368 }, { "entropy": 0.5522115230560303, "epoch": 1.376865671641791, "grad_norm": 0.16414624452590942, "learning_rate": 0.0002, "loss": 0.5514496564865112, "mean_token_accuracy": 0.7735963463783264, "num_tokens": 6034469.0, "step": 369 }, { "entropy": 0.5200467556715012, "epoch": 1.3805970149253732, "grad_norm": 0.11550547927618027, "learning_rate": 0.0002, "loss": 0.5164188146591187, "mean_token_accuracy": 0.7926855981349945, "num_tokens": 6050831.0, "step": 370 }, { "entropy": 0.5372455269098282, "epoch": 1.3843283582089552, "grad_norm": 0.15535052120685577, "learning_rate": 0.0002, "loss": 0.5430443286895752, "mean_token_accuracy": 0.7787685394287109, "num_tokens": 6067185.0, "step": 371 }, { "entropy": 0.5356560945510864, "epoch": 1.3880597014925373, "grad_norm": 0.13415579497814178, "learning_rate": 0.0002, "loss": 0.5381686091423035, "mean_token_accuracy": 0.7826534509658813, "num_tokens": 6083549.0, "step": 372 }, { "entropy": 0.5160757750272751, "epoch": 1.3917910447761195, "grad_norm": 0.21146361529827118, "learning_rate": 0.0002, "loss": 0.5265405774116516, "mean_token_accuracy": 0.7884284406900406, "num_tokens": 6099748.0, "step": 373 }, { "entropy": 0.5486676543951035, "epoch": 1.3955223880597014, "grad_norm": 0.17727814614772797, "learning_rate": 0.0002, "loss": 0.5486956834793091, "mean_token_accuracy": 0.774789959192276, "num_tokens": 6116173.0, "step": 374 }, { "entropy": 0.5379186123609543, "epoch": 1.3992537313432836, "grad_norm": 0.14094142615795135, "learning_rate": 0.0002, "loss": 0.5390832424163818, "mean_token_accuracy": 0.7824152857065201, "num_tokens": 6132499.0, "step": 375 }, { "entropy": 0.5322713851928711, "epoch": 1.4029850746268657, "grad_norm": 0.20512345433235168, "learning_rate": 0.0002, "loss": 0.5319615602493286, "mean_token_accuracy": 0.7856654673814774, "num_tokens": 6148777.0, "step": 376 }, { "entropy": 0.5522319674491882, "epoch": 1.4067164179104479, "grad_norm": 0.23706185817718506, "learning_rate": 0.0002, "loss": 0.5542993545532227, "mean_token_accuracy": 0.7750299721956253, "num_tokens": 6165444.0, "step": 377 }, { "entropy": 0.5360774844884872, "epoch": 1.4104477611940298, "grad_norm": 0.11965668946504593, "learning_rate": 0.0002, "loss": 0.5302645564079285, "mean_token_accuracy": 0.7849837243556976, "num_tokens": 6181897.0, "step": 378 }, { "entropy": 0.546858549118042, "epoch": 1.414179104477612, "grad_norm": 0.16231459379196167, "learning_rate": 0.0002, "loss": 0.5448977947235107, "mean_token_accuracy": 0.7800662368535995, "num_tokens": 6198254.0, "step": 379 }, { "entropy": 0.5505042523145676, "epoch": 1.417910447761194, "grad_norm": 0.16832560300827026, "learning_rate": 0.0002, "loss": 0.560795247554779, "mean_token_accuracy": 0.7732271403074265, "num_tokens": 6214773.0, "step": 380 }, { "entropy": 0.5255255252122879, "epoch": 1.421641791044776, "grad_norm": 0.14621268212795258, "learning_rate": 0.0002, "loss": 0.5310673117637634, "mean_token_accuracy": 0.7856626063585281, "num_tokens": 6230937.0, "step": 381 }, { "entropy": 0.550481304526329, "epoch": 1.4253731343283582, "grad_norm": 0.13561075925827026, "learning_rate": 0.0002, "loss": 0.552341103553772, "mean_token_accuracy": 0.7767930179834366, "num_tokens": 6247144.0, "step": 382 }, { "entropy": 0.5227905362844467, "epoch": 1.4291044776119404, "grad_norm": 0.13489387929439545, "learning_rate": 0.0002, "loss": 0.523324191570282, "mean_token_accuracy": 0.7840524315834045, "num_tokens": 6263392.0, "step": 383 }, { "entropy": 0.5366068184375763, "epoch": 1.4328358208955223, "grad_norm": 0.14153233170509338, "learning_rate": 0.0002, "loss": 0.5320409536361694, "mean_token_accuracy": 0.7857052683830261, "num_tokens": 6279611.0, "step": 384 }, { "entropy": 0.5510872900485992, "epoch": 1.4365671641791045, "grad_norm": 0.16421180963516235, "learning_rate": 0.0002, "loss": 0.5412197709083557, "mean_token_accuracy": 0.7806995958089828, "num_tokens": 6296025.0, "step": 385 }, { "entropy": 0.5504460334777832, "epoch": 1.4402985074626866, "grad_norm": 0.12805409729480743, "learning_rate": 0.0002, "loss": 0.5456997156143188, "mean_token_accuracy": 0.7775121033191681, "num_tokens": 6312415.0, "step": 386 }, { "entropy": 0.5504113882780075, "epoch": 1.4440298507462686, "grad_norm": 0.1690564602613449, "learning_rate": 0.0002, "loss": 0.5432727932929993, "mean_token_accuracy": 0.7804221510887146, "num_tokens": 6328728.0, "step": 387 }, { "entropy": 0.5279664844274521, "epoch": 1.4477611940298507, "grad_norm": 0.14327631890773773, "learning_rate": 0.0002, "loss": 0.5324951410293579, "mean_token_accuracy": 0.7857986390590668, "num_tokens": 6344947.0, "step": 388 }, { "entropy": 0.529266320168972, "epoch": 1.4514925373134329, "grad_norm": 0.14441367983818054, "learning_rate": 0.0002, "loss": 0.5360409021377563, "mean_token_accuracy": 0.7844860553741455, "num_tokens": 6361481.0, "step": 389 }, { "entropy": 0.5474697202444077, "epoch": 1.455223880597015, "grad_norm": 0.17411169409751892, "learning_rate": 0.0002, "loss": 0.553131103515625, "mean_token_accuracy": 0.774516150355339, "num_tokens": 6378114.0, "step": 390 }, { "entropy": 0.53204146027565, "epoch": 1.458955223880597, "grad_norm": 0.13096541166305542, "learning_rate": 0.0002, "loss": 0.5311554074287415, "mean_token_accuracy": 0.7832191288471222, "num_tokens": 6394618.0, "step": 391 }, { "entropy": 0.5468081682920456, "epoch": 1.462686567164179, "grad_norm": 0.1281428337097168, "learning_rate": 0.0002, "loss": 0.5487358570098877, "mean_token_accuracy": 0.7784566432237625, "num_tokens": 6411033.0, "step": 392 }, { "entropy": 0.5141153857111931, "epoch": 1.4664179104477613, "grad_norm": 0.12739789485931396, "learning_rate": 0.0002, "loss": 0.5161206126213074, "mean_token_accuracy": 0.7879614979028702, "num_tokens": 6427279.0, "step": 393 }, { "entropy": 0.5423916280269623, "epoch": 1.4701492537313432, "grad_norm": 0.13173308968544006, "learning_rate": 0.0002, "loss": 0.5459262132644653, "mean_token_accuracy": 0.7773706614971161, "num_tokens": 6443618.0, "step": 394 }, { "entropy": 0.5373747050762177, "epoch": 1.4738805970149254, "grad_norm": 0.13537272810935974, "learning_rate": 0.0002, "loss": 0.5413709878921509, "mean_token_accuracy": 0.7808920592069626, "num_tokens": 6459976.0, "step": 395 }, { "entropy": 0.5321269482374191, "epoch": 1.4776119402985075, "grad_norm": 0.14240136742591858, "learning_rate": 0.0002, "loss": 0.5354140400886536, "mean_token_accuracy": 0.7839590162038803, "num_tokens": 6476177.0, "step": 396 }, { "entropy": 0.5257603526115417, "epoch": 1.4813432835820897, "grad_norm": 0.13054870069026947, "learning_rate": 0.0002, "loss": 0.5284422636032104, "mean_token_accuracy": 0.7869588881731033, "num_tokens": 6492490.0, "step": 397 }, { "entropy": 0.5265851616859436, "epoch": 1.4850746268656716, "grad_norm": 0.13740919530391693, "learning_rate": 0.0002, "loss": 0.526134192943573, "mean_token_accuracy": 0.7872523069381714, "num_tokens": 6508878.0, "step": 398 }, { "entropy": 0.5212059766054153, "epoch": 1.4888059701492538, "grad_norm": 0.13339075446128845, "learning_rate": 0.0002, "loss": 0.5221821665763855, "mean_token_accuracy": 0.7905861139297485, "num_tokens": 6525084.0, "step": 399 }, { "entropy": 0.537382185459137, "epoch": 1.4925373134328357, "grad_norm": 0.13736183941364288, "learning_rate": 0.0002, "loss": 0.5351852774620056, "mean_token_accuracy": 0.7818522453308105, "num_tokens": 6541545.0, "step": 400 }, { "entropy": 0.5340493619441986, "epoch": 1.4962686567164178, "grad_norm": 0.1368023306131363, "learning_rate": 0.0002, "loss": 0.5317674279212952, "mean_token_accuracy": 0.7867089211940765, "num_tokens": 6557867.0, "step": 401 }, { "entropy": 0.5713642686605453, "epoch": 1.5, "grad_norm": 0.12573114037513733, "learning_rate": 0.0002, "loss": 0.5638826489448547, "mean_token_accuracy": 0.773875430226326, "num_tokens": 6574428.0, "step": 402 }, { "entropy": 0.5415615439414978, "epoch": 1.5037313432835822, "grad_norm": 0.14824476838111877, "learning_rate": 0.0002, "loss": 0.5452718734741211, "mean_token_accuracy": 0.7793742418289185, "num_tokens": 6590740.0, "step": 403 }, { "entropy": 0.5316762626171112, "epoch": 1.5074626865671643, "grad_norm": 0.13510265946388245, "learning_rate": 0.0002, "loss": 0.5399596691131592, "mean_token_accuracy": 0.7803886234760284, "num_tokens": 6606963.0, "step": 404 }, { "entropy": 0.5310466289520264, "epoch": 1.5111940298507462, "grad_norm": 0.1413303166627884, "learning_rate": 0.0002, "loss": 0.532017707824707, "mean_token_accuracy": 0.7846063524484634, "num_tokens": 6623504.0, "step": 405 }, { "entropy": 0.5623253732919693, "epoch": 1.5149253731343284, "grad_norm": 0.1327054351568222, "learning_rate": 0.0002, "loss": 0.5590583682060242, "mean_token_accuracy": 0.7741520255804062, "num_tokens": 6639880.0, "step": 406 }, { "entropy": 0.5222483575344086, "epoch": 1.5186567164179103, "grad_norm": 0.14219273626804352, "learning_rate": 0.0002, "loss": 0.5221630930900574, "mean_token_accuracy": 0.7884060740470886, "num_tokens": 6656372.0, "step": 407 }, { "entropy": 0.5361650884151459, "epoch": 1.5223880597014925, "grad_norm": 0.14150315523147583, "learning_rate": 0.0002, "loss": 0.5426543951034546, "mean_token_accuracy": 0.7794915586709976, "num_tokens": 6672460.0, "step": 408 }, { "entropy": 0.5405853539705276, "epoch": 1.5261194029850746, "grad_norm": 0.12867780029773712, "learning_rate": 0.0002, "loss": 0.545219361782074, "mean_token_accuracy": 0.7802143394947052, "num_tokens": 6688740.0, "step": 409 }, { "entropy": 0.5196312442421913, "epoch": 1.5298507462686568, "grad_norm": 0.12933768332004547, "learning_rate": 0.0002, "loss": 0.524722695350647, "mean_token_accuracy": 0.7893691807985306, "num_tokens": 6704798.0, "step": 410 }, { "entropy": 0.5358741357922554, "epoch": 1.533582089552239, "grad_norm": 0.14841386675834656, "learning_rate": 0.0002, "loss": 0.5425981879234314, "mean_token_accuracy": 0.7796852141618729, "num_tokens": 6720982.0, "step": 411 }, { "entropy": 0.5389422178268433, "epoch": 1.537313432835821, "grad_norm": 0.12372686713933945, "learning_rate": 0.0002, "loss": 0.5368393063545227, "mean_token_accuracy": 0.7788573652505875, "num_tokens": 6737135.0, "step": 412 }, { "entropy": 0.5395499765872955, "epoch": 1.5410447761194028, "grad_norm": 0.1355394721031189, "learning_rate": 0.0002, "loss": 0.5324706435203552, "mean_token_accuracy": 0.7823397219181061, "num_tokens": 6753507.0, "step": 413 }, { "entropy": 0.5506737977266312, "epoch": 1.544776119402985, "grad_norm": 0.11822586506605148, "learning_rate": 0.0002, "loss": 0.5447027087211609, "mean_token_accuracy": 0.7776395529508591, "num_tokens": 6769726.0, "step": 414 }, { "entropy": 0.5393240600824356, "epoch": 1.5485074626865671, "grad_norm": 0.1220259889960289, "learning_rate": 0.0002, "loss": 0.5348957180976868, "mean_token_accuracy": 0.7820345014333725, "num_tokens": 6786148.0, "step": 415 }, { "entropy": 0.5258119255304337, "epoch": 1.5522388059701493, "grad_norm": 0.15211379528045654, "learning_rate": 0.0002, "loss": 0.5274648666381836, "mean_token_accuracy": 0.7861866801977158, "num_tokens": 6802290.0, "step": 416 }, { "entropy": 0.5310887396335602, "epoch": 1.5559701492537314, "grad_norm": 0.1319982260465622, "learning_rate": 0.0002, "loss": 0.5339083075523376, "mean_token_accuracy": 0.7847474962472916, "num_tokens": 6818697.0, "step": 417 }, { "entropy": 0.5216883644461632, "epoch": 1.5597014925373134, "grad_norm": 0.13150501251220703, "learning_rate": 0.0002, "loss": 0.5250256061553955, "mean_token_accuracy": 0.7854708880186081, "num_tokens": 6834860.0, "step": 418 }, { "entropy": 0.5280915200710297, "epoch": 1.5634328358208955, "grad_norm": 0.13087767362594604, "learning_rate": 0.0002, "loss": 0.5294699668884277, "mean_token_accuracy": 0.7844147831201553, "num_tokens": 6850977.0, "step": 419 }, { "entropy": 0.5455043911933899, "epoch": 1.5671641791044775, "grad_norm": 0.13152527809143066, "learning_rate": 0.0002, "loss": 0.5411855578422546, "mean_token_accuracy": 0.7831065207719803, "num_tokens": 6867436.0, "step": 420 }, { "entropy": 0.5421444773674011, "epoch": 1.5708955223880596, "grad_norm": 0.12552635371685028, "learning_rate": 0.0002, "loss": 0.5404070615768433, "mean_token_accuracy": 0.7799917608499527, "num_tokens": 6883739.0, "step": 421 }, { "entropy": 0.5469988659024239, "epoch": 1.5746268656716418, "grad_norm": 0.12713049352169037, "learning_rate": 0.0002, "loss": 0.5506969690322876, "mean_token_accuracy": 0.7773310244083405, "num_tokens": 6899931.0, "step": 422 }, { "entropy": 0.5409539192914963, "epoch": 1.578358208955224, "grad_norm": 0.12043388932943344, "learning_rate": 0.0002, "loss": 0.5393781661987305, "mean_token_accuracy": 0.7821668684482574, "num_tokens": 6916555.0, "step": 423 }, { "entropy": 0.5323537066578865, "epoch": 1.582089552238806, "grad_norm": 0.15053188800811768, "learning_rate": 0.0002, "loss": 0.5387845039367676, "mean_token_accuracy": 0.7825682461261749, "num_tokens": 6932929.0, "step": 424 }, { "entropy": 0.5519883185625076, "epoch": 1.585820895522388, "grad_norm": 0.1525130718946457, "learning_rate": 0.0002, "loss": 0.56787109375, "mean_token_accuracy": 0.7703519463539124, "num_tokens": 6949313.0, "step": 425 }, { "entropy": 0.5393707901239395, "epoch": 1.5895522388059702, "grad_norm": 0.14073340594768524, "learning_rate": 0.0002, "loss": 0.5375410914421082, "mean_token_accuracy": 0.7814988791942596, "num_tokens": 6965684.0, "step": 426 }, { "entropy": 0.5354568511247635, "epoch": 1.5932835820895521, "grad_norm": 0.13749349117279053, "learning_rate": 0.0002, "loss": 0.5318333506584167, "mean_token_accuracy": 0.7864338159561157, "num_tokens": 6982013.0, "step": 427 }, { "entropy": 0.5405145287513733, "epoch": 1.5970149253731343, "grad_norm": 0.12070662528276443, "learning_rate": 0.0002, "loss": 0.5362390279769897, "mean_token_accuracy": 0.7832798510789871, "num_tokens": 6998503.0, "step": 428 }, { "entropy": 0.5447606593370438, "epoch": 1.6007462686567164, "grad_norm": 0.1386427879333496, "learning_rate": 0.0002, "loss": 0.5441482663154602, "mean_token_accuracy": 0.778590589761734, "num_tokens": 7014770.0, "step": 429 }, { "entropy": 0.5470203310251236, "epoch": 1.6044776119402986, "grad_norm": 0.13212502002716064, "learning_rate": 0.0002, "loss": 0.5490391850471497, "mean_token_accuracy": 0.7765385806560516, "num_tokens": 7030922.0, "step": 430 }, { "entropy": 0.5170739889144897, "epoch": 1.6082089552238807, "grad_norm": 0.13961301743984222, "learning_rate": 0.0002, "loss": 0.5210376381874084, "mean_token_accuracy": 0.7884235680103302, "num_tokens": 7047216.0, "step": 431 }, { "entropy": 0.5377417504787445, "epoch": 1.6119402985074627, "grad_norm": 0.13901281356811523, "learning_rate": 0.0002, "loss": 0.5376747846603394, "mean_token_accuracy": 0.7830623835325241, "num_tokens": 7063307.0, "step": 432 }, { "entropy": 0.5414564162492752, "epoch": 1.6156716417910446, "grad_norm": 0.1463043987751007, "learning_rate": 0.0002, "loss": 0.5473238825798035, "mean_token_accuracy": 0.7770842909812927, "num_tokens": 7079707.0, "step": 433 }, { "entropy": 0.5415572673082352, "epoch": 1.6194029850746268, "grad_norm": 0.11891120672225952, "learning_rate": 0.0002, "loss": 0.5387373566627502, "mean_token_accuracy": 0.779969111084938, "num_tokens": 7095980.0, "step": 434 }, { "entropy": 0.5542661100625992, "epoch": 1.623134328358209, "grad_norm": 0.13271500170230865, "learning_rate": 0.0002, "loss": 0.5507120490074158, "mean_token_accuracy": 0.7779867500066757, "num_tokens": 7112556.0, "step": 435 }, { "entropy": 0.54887755215168, "epoch": 1.626865671641791, "grad_norm": 0.13373985886573792, "learning_rate": 0.0002, "loss": 0.5447692275047302, "mean_token_accuracy": 0.7798765897750854, "num_tokens": 7128802.0, "step": 436 }, { "entropy": 0.5222520381212234, "epoch": 1.6305970149253732, "grad_norm": 0.1277901828289032, "learning_rate": 0.0002, "loss": 0.5239554643630981, "mean_token_accuracy": 0.785177692770958, "num_tokens": 7145060.0, "step": 437 }, { "entropy": 0.53469417989254, "epoch": 1.6343283582089554, "grad_norm": 0.20547546446323395, "learning_rate": 0.0002, "loss": 0.5367586612701416, "mean_token_accuracy": 0.7803931534290314, "num_tokens": 7161527.0, "step": 438 }, { "entropy": 0.521802693605423, "epoch": 1.6380597014925373, "grad_norm": 0.16560786962509155, "learning_rate": 0.0002, "loss": 0.5228012204170227, "mean_token_accuracy": 0.7887944877147675, "num_tokens": 7178091.0, "step": 439 }, { "entropy": 0.5338825434446335, "epoch": 1.6417910447761193, "grad_norm": 0.1590629667043686, "learning_rate": 0.0002, "loss": 0.5402793288230896, "mean_token_accuracy": 0.7781020998954773, "num_tokens": 7194244.0, "step": 440 }, { "entropy": 0.5395276695489883, "epoch": 1.6455223880597014, "grad_norm": 0.14088116586208344, "learning_rate": 0.0002, "loss": 0.5401326417922974, "mean_token_accuracy": 0.781720831990242, "num_tokens": 7210451.0, "step": 441 }, { "entropy": 0.5567539632320404, "epoch": 1.6492537313432836, "grad_norm": 0.19292442500591278, "learning_rate": 0.0002, "loss": 0.5627314448356628, "mean_token_accuracy": 0.7719661146402359, "num_tokens": 7226996.0, "step": 442 }, { "entropy": 0.534116804599762, "epoch": 1.6529850746268657, "grad_norm": 0.1254442036151886, "learning_rate": 0.0002, "loss": 0.533519983291626, "mean_token_accuracy": 0.7840958386659622, "num_tokens": 7243430.0, "step": 443 }, { "entropy": 0.5330116599798203, "epoch": 1.6567164179104479, "grad_norm": 0.1718529760837555, "learning_rate": 0.0002, "loss": 0.5330148339271545, "mean_token_accuracy": 0.7830322086811066, "num_tokens": 7259764.0, "step": 444 }, { "entropy": 0.5424318462610245, "epoch": 1.6604477611940298, "grad_norm": 0.13064436614513397, "learning_rate": 0.0002, "loss": 0.5422405004501343, "mean_token_accuracy": 0.7796443551778793, "num_tokens": 7276147.0, "step": 445 }, { "entropy": 0.555829331278801, "epoch": 1.664179104477612, "grad_norm": 0.12649741768836975, "learning_rate": 0.0002, "loss": 0.5439899563789368, "mean_token_accuracy": 0.7798557877540588, "num_tokens": 7292719.0, "step": 446 }, { "entropy": 0.5564119815826416, "epoch": 1.667910447761194, "grad_norm": 0.140034019947052, "learning_rate": 0.0002, "loss": 0.5546625256538391, "mean_token_accuracy": 0.7761601060628891, "num_tokens": 7309242.0, "step": 447 }, { "entropy": 0.5416673123836517, "epoch": 1.671641791044776, "grad_norm": 0.1388692855834961, "learning_rate": 0.0002, "loss": 0.541693389415741, "mean_token_accuracy": 0.7807905972003937, "num_tokens": 7325872.0, "step": 448 }, { "entropy": 0.5325654745101929, "epoch": 1.6753731343283582, "grad_norm": 0.1330399215221405, "learning_rate": 0.0002, "loss": 0.5375967025756836, "mean_token_accuracy": 0.780772253870964, "num_tokens": 7342461.0, "step": 449 }, { "entropy": 0.5460408478975296, "epoch": 1.6791044776119404, "grad_norm": 0.1698281317949295, "learning_rate": 0.0002, "loss": 0.5483989119529724, "mean_token_accuracy": 0.7757564038038254, "num_tokens": 7358926.0, "step": 450 }, { "entropy": 0.5587185472249985, "epoch": 1.6828358208955225, "grad_norm": 0.150365948677063, "learning_rate": 0.0002, "loss": 0.5607273578643799, "mean_token_accuracy": 0.7735442072153091, "num_tokens": 7375472.0, "step": 451 }, { "entropy": 0.5546591132879257, "epoch": 1.6865671641791045, "grad_norm": 0.13346362113952637, "learning_rate": 0.0002, "loss": 0.5498383045196533, "mean_token_accuracy": 0.7771503031253815, "num_tokens": 7391758.0, "step": 452 }, { "entropy": 0.5380023121833801, "epoch": 1.6902985074626866, "grad_norm": 0.15642641484737396, "learning_rate": 0.0002, "loss": 0.540310263633728, "mean_token_accuracy": 0.7800187021493912, "num_tokens": 7407943.0, "step": 453 }, { "entropy": 0.5107243284583092, "epoch": 1.6940298507462686, "grad_norm": 0.1413007378578186, "learning_rate": 0.0002, "loss": 0.5198100209236145, "mean_token_accuracy": 0.7903516441583633, "num_tokens": 7424142.0, "step": 454 }, { "entropy": 0.5318749994039536, "epoch": 1.6977611940298507, "grad_norm": 0.13885854184627533, "learning_rate": 0.0002, "loss": 0.5412630438804626, "mean_token_accuracy": 0.7793916463851929, "num_tokens": 7440451.0, "step": 455 }, { "entropy": 0.5525089502334595, "epoch": 1.7014925373134329, "grad_norm": 0.12943100929260254, "learning_rate": 0.0002, "loss": 0.5551573634147644, "mean_token_accuracy": 0.7760037779808044, "num_tokens": 7456977.0, "step": 456 }, { "entropy": 0.5402176976203918, "epoch": 1.705223880597015, "grad_norm": 0.15211442112922668, "learning_rate": 0.0002, "loss": 0.5398398041725159, "mean_token_accuracy": 0.779134064912796, "num_tokens": 7473154.0, "step": 457 }, { "entropy": 0.5625119209289551, "epoch": 1.7089552238805972, "grad_norm": 0.12840458750724792, "learning_rate": 0.0002, "loss": 0.5544787645339966, "mean_token_accuracy": 0.7756093442440033, "num_tokens": 7489492.0, "step": 458 }, { "entropy": 0.5442609488964081, "epoch": 1.712686567164179, "grad_norm": 0.13839711248874664, "learning_rate": 0.0002, "loss": 0.5437784194946289, "mean_token_accuracy": 0.7818922996520996, "num_tokens": 7505874.0, "step": 459 }, { "entropy": 0.5575658231973648, "epoch": 1.716417910447761, "grad_norm": 0.14238221943378448, "learning_rate": 0.0002, "loss": 0.5612136125564575, "mean_token_accuracy": 0.7718513458967209, "num_tokens": 7522288.0, "step": 460 }, { "entropy": 0.535207062959671, "epoch": 1.7201492537313432, "grad_norm": 0.13308024406433105, "learning_rate": 0.0002, "loss": 0.5384257435798645, "mean_token_accuracy": 0.7802019715309143, "num_tokens": 7538764.0, "step": 461 }, { "entropy": 0.5290672108530998, "epoch": 1.7238805970149254, "grad_norm": 0.14699077606201172, "learning_rate": 0.0002, "loss": 0.533920168876648, "mean_token_accuracy": 0.7809716016054153, "num_tokens": 7555048.0, "step": 462 }, { "entropy": 0.5349759012460709, "epoch": 1.7276119402985075, "grad_norm": 0.13993169367313385, "learning_rate": 0.0002, "loss": 0.5397127866744995, "mean_token_accuracy": 0.781706914305687, "num_tokens": 7571331.0, "step": 463 }, { "entropy": 0.5471459329128265, "epoch": 1.7313432835820897, "grad_norm": 0.1270606368780136, "learning_rate": 0.0002, "loss": 0.5457655191421509, "mean_token_accuracy": 0.7785040736198425, "num_tokens": 7587268.0, "step": 464 }, { "entropy": 0.5576677769422531, "epoch": 1.7350746268656716, "grad_norm": 0.13001851737499237, "learning_rate": 0.0002, "loss": 0.5535344481468201, "mean_token_accuracy": 0.7747954726219177, "num_tokens": 7603468.0, "step": 465 }, { "entropy": 0.5527965128421783, "epoch": 1.7388059701492538, "grad_norm": 0.11191874742507935, "learning_rate": 0.0002, "loss": 0.5493273138999939, "mean_token_accuracy": 0.7783663272857666, "num_tokens": 7619861.0, "step": 466 }, { "entropy": 0.5458428710699081, "epoch": 1.7425373134328357, "grad_norm": 0.12890613079071045, "learning_rate": 0.0002, "loss": 0.5422653555870056, "mean_token_accuracy": 0.7804641127586365, "num_tokens": 7636365.0, "step": 467 }, { "entropy": 0.5396646112203598, "epoch": 1.7462686567164178, "grad_norm": 0.14643065631389618, "learning_rate": 0.0002, "loss": 0.540531575679779, "mean_token_accuracy": 0.7787915468215942, "num_tokens": 7652695.0, "step": 468 }, { "entropy": 0.5489283800125122, "epoch": 1.75, "grad_norm": 0.12856297194957733, "learning_rate": 0.0002, "loss": 0.5493489503860474, "mean_token_accuracy": 0.7765475660562515, "num_tokens": 7669417.0, "step": 469 }, { "entropy": 0.5371540188789368, "epoch": 1.7537313432835822, "grad_norm": 0.1448490023612976, "learning_rate": 0.0002, "loss": 0.5445014238357544, "mean_token_accuracy": 0.7786155045032501, "num_tokens": 7685950.0, "step": 470 }, { "entropy": 0.5441175699234009, "epoch": 1.7574626865671643, "grad_norm": 0.1417449563741684, "learning_rate": 0.0002, "loss": 0.5456334352493286, "mean_token_accuracy": 0.7806714922189713, "num_tokens": 7702096.0, "step": 471 }, { "entropy": 0.534687414765358, "epoch": 1.7611940298507462, "grad_norm": 0.13397443294525146, "learning_rate": 0.0002, "loss": 0.5369069576263428, "mean_token_accuracy": 0.7817386239767075, "num_tokens": 7718461.0, "step": 472 }, { "entropy": 0.5490274131298065, "epoch": 1.7649253731343284, "grad_norm": 0.1352432817220688, "learning_rate": 0.0002, "loss": 0.5512405633926392, "mean_token_accuracy": 0.7781344056129456, "num_tokens": 7734927.0, "step": 473 }, { "entropy": 0.5476491898298264, "epoch": 1.7686567164179103, "grad_norm": 0.13750651478767395, "learning_rate": 0.0002, "loss": 0.5536763668060303, "mean_token_accuracy": 0.7743410021066666, "num_tokens": 7751415.0, "step": 474 }, { "entropy": 0.524419367313385, "epoch": 1.7723880597014925, "grad_norm": 0.13306710124015808, "learning_rate": 0.0002, "loss": 0.5263890624046326, "mean_token_accuracy": 0.7842015773057938, "num_tokens": 7767584.0, "step": 475 }, { "entropy": 0.5515109747648239, "epoch": 1.7761194029850746, "grad_norm": 0.13014942407608032, "learning_rate": 0.0002, "loss": 0.546906590461731, "mean_token_accuracy": 0.7791758924722672, "num_tokens": 7783929.0, "step": 476 }, { "entropy": 0.5460219085216522, "epoch": 1.7798507462686568, "grad_norm": 0.12750543653964996, "learning_rate": 0.0002, "loss": 0.5416713953018188, "mean_token_accuracy": 0.7796966135501862, "num_tokens": 7800322.0, "step": 477 }, { "entropy": 0.5496509969234467, "epoch": 1.783582089552239, "grad_norm": 0.14019764959812164, "learning_rate": 0.0002, "loss": 0.5501259565353394, "mean_token_accuracy": 0.7778430730104446, "num_tokens": 7816728.0, "step": 478 }, { "entropy": 0.5484806597232819, "epoch": 1.787313432835821, "grad_norm": 0.12671294808387756, "learning_rate": 0.0002, "loss": 0.546718418598175, "mean_token_accuracy": 0.7767283469438553, "num_tokens": 7833182.0, "step": 479 }, { "entropy": 0.5313283354043961, "epoch": 1.7910447761194028, "grad_norm": 0.16472716629505157, "learning_rate": 0.0002, "loss": 0.5414275527000427, "mean_token_accuracy": 0.7815513163805008, "num_tokens": 7849402.0, "step": 480 }, { "entropy": 0.516701802611351, "epoch": 1.794776119402985, "grad_norm": 0.157722607254982, "learning_rate": 0.0002, "loss": 0.5291575789451599, "mean_token_accuracy": 0.7844545841217041, "num_tokens": 7865503.0, "step": 481 }, { "entropy": 0.5476036965847015, "epoch": 1.7985074626865671, "grad_norm": 0.16708603501319885, "learning_rate": 0.0002, "loss": 0.5535966157913208, "mean_token_accuracy": 0.7750539481639862, "num_tokens": 7881822.0, "step": 482 }, { "entropy": 0.5405763983726501, "epoch": 1.8022388059701493, "grad_norm": 0.12333223968744278, "learning_rate": 0.0002, "loss": 0.5385177731513977, "mean_token_accuracy": 0.7838984429836273, "num_tokens": 7898111.0, "step": 483 }, { "entropy": 0.5573789775371552, "epoch": 1.8059701492537314, "grad_norm": 0.14407449960708618, "learning_rate": 0.0002, "loss": 0.541386067867279, "mean_token_accuracy": 0.7797874957323074, "num_tokens": 7914518.0, "step": 484 }, { "entropy": 0.5439587533473969, "epoch": 1.8097014925373134, "grad_norm": 0.1654428094625473, "learning_rate": 0.0002, "loss": 0.5336223244667053, "mean_token_accuracy": 0.7846554070711136, "num_tokens": 7930884.0, "step": 485 }, { "entropy": 0.536734089255333, "epoch": 1.8134328358208955, "grad_norm": 0.15028727054595947, "learning_rate": 0.0002, "loss": 0.5363267660140991, "mean_token_accuracy": 0.786723256111145, "num_tokens": 7947486.0, "step": 486 }, { "entropy": 0.5316303819417953, "epoch": 1.8171641791044775, "grad_norm": 0.2185370773077011, "learning_rate": 0.0002, "loss": 0.5426980257034302, "mean_token_accuracy": 0.7816258370876312, "num_tokens": 7963754.0, "step": 487 }, { "entropy": 0.5372888445854187, "epoch": 1.8208955223880596, "grad_norm": 0.14039121568202972, "learning_rate": 0.0002, "loss": 0.5452357530593872, "mean_token_accuracy": 0.7777333706617355, "num_tokens": 7980178.0, "step": 488 }, { "entropy": 0.561303973197937, "epoch": 1.8246268656716418, "grad_norm": 0.2095021903514862, "learning_rate": 0.0002, "loss": 0.5606201887130737, "mean_token_accuracy": 0.7701640874147415, "num_tokens": 7996414.0, "step": 489 }, { "entropy": 0.5401351600885391, "epoch": 1.828358208955224, "grad_norm": 0.13168978691101074, "learning_rate": 0.0002, "loss": 0.5416175723075867, "mean_token_accuracy": 0.7801533341407776, "num_tokens": 8012578.0, "step": 490 }, { "entropy": 0.5480149686336517, "epoch": 1.832089552238806, "grad_norm": 0.18209180235862732, "learning_rate": 0.0002, "loss": 0.5433698892593384, "mean_token_accuracy": 0.7793498337268829, "num_tokens": 8029063.0, "step": 491 }, { "entropy": 0.5556472986936569, "epoch": 1.835820895522388, "grad_norm": 0.14936800301074982, "learning_rate": 0.0002, "loss": 0.5554640293121338, "mean_token_accuracy": 0.7756128907203674, "num_tokens": 8045335.0, "step": 492 }, { "entropy": 0.551779106259346, "epoch": 1.8395522388059702, "grad_norm": 0.16466236114501953, "learning_rate": 0.0002, "loss": 0.5527586936950684, "mean_token_accuracy": 0.7768742144107819, "num_tokens": 8061746.0, "step": 493 }, { "entropy": 0.5395959764719009, "epoch": 1.8432835820895521, "grad_norm": 0.17139406502246857, "learning_rate": 0.0002, "loss": 0.5481644868850708, "mean_token_accuracy": 0.7803965657949448, "num_tokens": 8078227.0, "step": 494 }, { "entropy": 0.544280469417572, "epoch": 1.8470149253731343, "grad_norm": 0.14393140375614166, "learning_rate": 0.0002, "loss": 0.55059415102005, "mean_token_accuracy": 0.7759814560413361, "num_tokens": 8094667.0, "step": 495 }, { "entropy": 0.5303434431552887, "epoch": 1.8507462686567164, "grad_norm": 0.16556651890277863, "learning_rate": 0.0002, "loss": 0.530941903591156, "mean_token_accuracy": 0.7859343141317368, "num_tokens": 8110787.0, "step": 496 }, { "entropy": 0.5236229598522186, "epoch": 1.8544776119402986, "grad_norm": 0.12482267618179321, "learning_rate": 0.0002, "loss": 0.5197535753250122, "mean_token_accuracy": 0.7890704125165939, "num_tokens": 8127133.0, "step": 497 }, { "entropy": 0.5396426022052765, "epoch": 1.8582089552238807, "grad_norm": 0.1538504958152771, "learning_rate": 0.0002, "loss": 0.5361296534538269, "mean_token_accuracy": 0.7814654260873795, "num_tokens": 8143434.0, "step": 498 }, { "entropy": 0.5484279841184616, "epoch": 1.8619402985074627, "grad_norm": 0.14813822507858276, "learning_rate": 0.0002, "loss": 0.5464996695518494, "mean_token_accuracy": 0.7787739634513855, "num_tokens": 8159903.0, "step": 499 }, { "entropy": 0.519238218665123, "epoch": 1.8656716417910446, "grad_norm": 0.13267366588115692, "learning_rate": 0.0002, "loss": 0.5259124040603638, "mean_token_accuracy": 0.7888814806938171, "num_tokens": 8176179.0, "step": 500 }, { "entropy": 0.5393799841403961, "epoch": 1.8694029850746268, "grad_norm": 0.1923193484544754, "learning_rate": 0.0002, "loss": 0.5401571989059448, "mean_token_accuracy": 0.7801343649625778, "num_tokens": 8192554.0, "step": 501 }, { "entropy": 0.532251313328743, "epoch": 1.873134328358209, "grad_norm": 0.13894309103488922, "learning_rate": 0.0002, "loss": 0.527220606803894, "mean_token_accuracy": 0.7864662110805511, "num_tokens": 8208849.0, "step": 502 }, { "entropy": 0.5306680351495743, "epoch": 1.876865671641791, "grad_norm": 0.1474749892950058, "learning_rate": 0.0002, "loss": 0.5287739038467407, "mean_token_accuracy": 0.7855399250984192, "num_tokens": 8225218.0, "step": 503 }, { "entropy": 0.5300537943840027, "epoch": 1.8805970149253732, "grad_norm": 0.1491105705499649, "learning_rate": 0.0002, "loss": 0.5314114093780518, "mean_token_accuracy": 0.7854063659906387, "num_tokens": 8241422.0, "step": 504 }, { "entropy": 0.5309967398643494, "epoch": 1.8843283582089554, "grad_norm": 0.15464921295642853, "learning_rate": 0.0002, "loss": 0.5415985584259033, "mean_token_accuracy": 0.7829921096563339, "num_tokens": 8257677.0, "step": 505 }, { "entropy": 0.5376427173614502, "epoch": 1.8880597014925373, "grad_norm": 0.1445028930902481, "learning_rate": 0.0002, "loss": 0.5402049422264099, "mean_token_accuracy": 0.781824991106987, "num_tokens": 8274079.0, "step": 506 }, { "entropy": 0.5335574001073837, "epoch": 1.8917910447761193, "grad_norm": 0.12303903698921204, "learning_rate": 0.0002, "loss": 0.530457079410553, "mean_token_accuracy": 0.7857005745172501, "num_tokens": 8290576.0, "step": 507 }, { "entropy": 0.5357225090265274, "epoch": 1.8955223880597014, "grad_norm": 0.14474186301231384, "learning_rate": 0.0002, "loss": 0.5326468348503113, "mean_token_accuracy": 0.7827298194169998, "num_tokens": 8306959.0, "step": 508 }, { "entropy": 0.5418558418750763, "epoch": 1.8992537313432836, "grad_norm": 0.13205651938915253, "learning_rate": 0.0002, "loss": 0.5394735932350159, "mean_token_accuracy": 0.7811231166124344, "num_tokens": 8323198.0, "step": 509 }, { "entropy": 0.5494632720947266, "epoch": 1.9029850746268657, "grad_norm": 0.13867227733135223, "learning_rate": 0.0002, "loss": 0.5512980818748474, "mean_token_accuracy": 0.7792128920555115, "num_tokens": 8339407.0, "step": 510 }, { "entropy": 0.527800902724266, "epoch": 1.9067164179104479, "grad_norm": 0.1300196498632431, "learning_rate": 0.0002, "loss": 0.5310680866241455, "mean_token_accuracy": 0.7856706976890564, "num_tokens": 8355694.0, "step": 511 }, { "entropy": 0.5433302372694016, "epoch": 1.9104477611940298, "grad_norm": 0.16294771432876587, "learning_rate": 0.0002, "loss": 0.5532437562942505, "mean_token_accuracy": 0.7759810388088226, "num_tokens": 8371710.0, "step": 512 }, { "entropy": 0.5244318097829819, "epoch": 1.914179104477612, "grad_norm": 0.13300037384033203, "learning_rate": 0.0002, "loss": 0.5271862149238586, "mean_token_accuracy": 0.7844917327165604, "num_tokens": 8387964.0, "step": 513 }, { "entropy": 0.5421733111143112, "epoch": 1.917910447761194, "grad_norm": 0.12434980273246765, "learning_rate": 0.0002, "loss": 0.5377052426338196, "mean_token_accuracy": 0.7836858928203583, "num_tokens": 8404373.0, "step": 514 }, { "entropy": 0.5615102648735046, "epoch": 1.921641791044776, "grad_norm": 0.1264066845178604, "learning_rate": 0.0002, "loss": 0.558891236782074, "mean_token_accuracy": 0.7723990976810455, "num_tokens": 8420907.0, "step": 515 }, { "entropy": 0.5428318381309509, "epoch": 1.9253731343283582, "grad_norm": 0.13190090656280518, "learning_rate": 0.0002, "loss": 0.5374886393547058, "mean_token_accuracy": 0.7830605953931808, "num_tokens": 8437255.0, "step": 516 }, { "entropy": 0.5324592739343643, "epoch": 1.9291044776119404, "grad_norm": 0.13782039284706116, "learning_rate": 0.0002, "loss": 0.5368908643722534, "mean_token_accuracy": 0.7810968607664108, "num_tokens": 8453657.0, "step": 517 }, { "entropy": 0.563809260725975, "epoch": 1.9328358208955225, "grad_norm": 0.11932537704706192, "learning_rate": 0.0002, "loss": 0.5596674680709839, "mean_token_accuracy": 0.7723207473754883, "num_tokens": 8470566.0, "step": 518 }, { "entropy": 0.550938680768013, "epoch": 1.9365671641791045, "grad_norm": 0.13882781565189362, "learning_rate": 0.0002, "loss": 0.5502666234970093, "mean_token_accuracy": 0.7773875147104263, "num_tokens": 8486896.0, "step": 519 }, { "entropy": 0.5509646236896515, "epoch": 1.9402985074626866, "grad_norm": 0.11496590822935104, "learning_rate": 0.0002, "loss": 0.5537518262863159, "mean_token_accuracy": 0.7762430608272552, "num_tokens": 8503486.0, "step": 520 }, { "entropy": 0.5208418220281601, "epoch": 1.9440298507462686, "grad_norm": 0.12605132162570953, "learning_rate": 0.0002, "loss": 0.5253016948699951, "mean_token_accuracy": 0.7866884917020798, "num_tokens": 8519722.0, "step": 521 }, { "entropy": 0.5348703861236572, "epoch": 1.9477611940298507, "grad_norm": 0.13436545431613922, "learning_rate": 0.0002, "loss": 0.5429031252861023, "mean_token_accuracy": 0.7784363180398941, "num_tokens": 8536094.0, "step": 522 }, { "entropy": 0.5374516993761063, "epoch": 1.9514925373134329, "grad_norm": 0.1355811506509781, "learning_rate": 0.0002, "loss": 0.5394662618637085, "mean_token_accuracy": 0.7806121855974197, "num_tokens": 8552288.0, "step": 523 }, { "entropy": 0.5625811666250229, "epoch": 1.955223880597015, "grad_norm": 0.11836230754852295, "learning_rate": 0.0002, "loss": 0.5579893589019775, "mean_token_accuracy": 0.7714975476264954, "num_tokens": 8568760.0, "step": 524 }, { "entropy": 0.5421487241983414, "epoch": 1.9589552238805972, "grad_norm": 0.1359013170003891, "learning_rate": 0.0002, "loss": 0.5385461449623108, "mean_token_accuracy": 0.7821292132139206, "num_tokens": 8585317.0, "step": 525 }, { "entropy": 0.5259972438216209, "epoch": 1.962686567164179, "grad_norm": 0.1390962302684784, "learning_rate": 0.0002, "loss": 0.5276076793670654, "mean_token_accuracy": 0.785026952624321, "num_tokens": 8601637.0, "step": 526 }, { "entropy": 0.5354560762643814, "epoch": 1.966417910447761, "grad_norm": 0.13758784532546997, "learning_rate": 0.0002, "loss": 0.5364598035812378, "mean_token_accuracy": 0.782847136259079, "num_tokens": 8617902.0, "step": 527 }, { "entropy": 0.5353007912635803, "epoch": 1.9701492537313432, "grad_norm": 0.16679321229457855, "learning_rate": 0.0002, "loss": 0.5458345413208008, "mean_token_accuracy": 0.7779222279787064, "num_tokens": 8634235.0, "step": 528 }, { "entropy": 0.5326858758926392, "epoch": 1.9738805970149254, "grad_norm": 0.1427498161792755, "learning_rate": 0.0002, "loss": 0.5339992642402649, "mean_token_accuracy": 0.7820619940757751, "num_tokens": 8650417.0, "step": 529 }, { "entropy": 0.5444169491529465, "epoch": 1.9776119402985075, "grad_norm": 0.12751619517803192, "learning_rate": 0.0002, "loss": 0.5337543487548828, "mean_token_accuracy": 0.7827389687299728, "num_tokens": 8666763.0, "step": 530 }, { "entropy": 0.5495491325855255, "epoch": 1.9813432835820897, "grad_norm": 0.13329073786735535, "learning_rate": 0.0002, "loss": 0.5403661131858826, "mean_token_accuracy": 0.7817551493644714, "num_tokens": 8683086.0, "step": 531 }, { "entropy": 0.545268103480339, "epoch": 1.9850746268656716, "grad_norm": 0.1334519237279892, "learning_rate": 0.0002, "loss": 0.5446645021438599, "mean_token_accuracy": 0.7789036780595779, "num_tokens": 8699314.0, "step": 532 }, { "entropy": 0.5360117256641388, "epoch": 1.9888059701492538, "grad_norm": 0.1417427510023117, "learning_rate": 0.0002, "loss": 0.5377262830734253, "mean_token_accuracy": 0.782628983259201, "num_tokens": 8715712.0, "step": 533 }, { "entropy": 0.539160817861557, "epoch": 1.9925373134328357, "grad_norm": 0.13969334959983826, "learning_rate": 0.0002, "loss": 0.5430911779403687, "mean_token_accuracy": 0.7803932130336761, "num_tokens": 8732278.0, "step": 534 }, { "entropy": 0.5323211252689362, "epoch": 1.9962686567164178, "grad_norm": 0.13230480253696442, "learning_rate": 0.0002, "loss": 0.5352569818496704, "mean_token_accuracy": 0.7800516188144684, "num_tokens": 8748639.0, "step": 535 }, { "entropy": 0.5396020114421844, "epoch": 2.0, "grad_norm": 0.13588403165340424, "learning_rate": 0.0002, "loss": 0.5420472025871277, "mean_token_accuracy": 0.7812368422746658, "num_tokens": 8765023.0, "step": 536 }, { "entropy": 0.5363707542419434, "epoch": 2.003731343283582, "grad_norm": 0.13683520257472992, "learning_rate": 0.0002, "loss": 0.5242169499397278, "mean_token_accuracy": 0.7884830236434937, "num_tokens": 8781503.0, "step": 537 }, { "entropy": 0.5355663001537323, "epoch": 2.0074626865671643, "grad_norm": 0.1606767475605011, "learning_rate": 0.0002, "loss": 0.5340245962142944, "mean_token_accuracy": 0.7837463468313217, "num_tokens": 8797833.0, "step": 538 }, { "entropy": 0.5198972821235657, "epoch": 2.0111940298507465, "grad_norm": 0.1832306683063507, "learning_rate": 0.0002, "loss": 0.5226503014564514, "mean_token_accuracy": 0.7878277599811554, "num_tokens": 8814387.0, "step": 539 }, { "entropy": 0.5145581886172295, "epoch": 2.014925373134328, "grad_norm": 0.14004163444042206, "learning_rate": 0.0002, "loss": 0.5142262578010559, "mean_token_accuracy": 0.7930136620998383, "num_tokens": 8830769.0, "step": 540 }, { "entropy": 0.518964596092701, "epoch": 2.0186567164179103, "grad_norm": 0.2391389012336731, "learning_rate": 0.0002, "loss": 0.5318617224693298, "mean_token_accuracy": 0.7879888862371445, "num_tokens": 8847079.0, "step": 541 }, { "entropy": 0.5112362876534462, "epoch": 2.0223880597014925, "grad_norm": 0.1571192741394043, "learning_rate": 0.0002, "loss": 0.5111895799636841, "mean_token_accuracy": 0.7941466271877289, "num_tokens": 8863455.0, "step": 542 }, { "entropy": 0.5289383679628372, "epoch": 2.0261194029850746, "grad_norm": 0.18859665095806122, "learning_rate": 0.0002, "loss": 0.5321269035339355, "mean_token_accuracy": 0.7850861251354218, "num_tokens": 8879933.0, "step": 543 }, { "entropy": 0.5038495659828186, "epoch": 2.029850746268657, "grad_norm": 0.1459927260875702, "learning_rate": 0.0002, "loss": 0.5009663105010986, "mean_token_accuracy": 0.800191804766655, "num_tokens": 8896279.0, "step": 544 }, { "entropy": 0.5393158346414566, "epoch": 2.033582089552239, "grad_norm": 0.18940559029579163, "learning_rate": 0.0002, "loss": 0.5331785678863525, "mean_token_accuracy": 0.785183385014534, "num_tokens": 8912807.0, "step": 545 }, { "entropy": 0.5186864137649536, "epoch": 2.0373134328358207, "grad_norm": 0.13405749201774597, "learning_rate": 0.0002, "loss": 0.5130364894866943, "mean_token_accuracy": 0.7902890145778656, "num_tokens": 8929085.0, "step": 546 }, { "entropy": 0.517152339220047, "epoch": 2.041044776119403, "grad_norm": 0.2357271909713745, "learning_rate": 0.0002, "loss": 0.5223183631896973, "mean_token_accuracy": 0.7909936606884003, "num_tokens": 8945205.0, "step": 547 }, { "entropy": 0.504429779946804, "epoch": 2.044776119402985, "grad_norm": 0.16896866261959076, "learning_rate": 0.0002, "loss": 0.5084525942802429, "mean_token_accuracy": 0.7927258014678955, "num_tokens": 8961586.0, "step": 548 }, { "entropy": 0.5195313468575478, "epoch": 2.048507462686567, "grad_norm": 0.16998501121997833, "learning_rate": 0.0002, "loss": 0.5220100283622742, "mean_token_accuracy": 0.7873262912034988, "num_tokens": 8978096.0, "step": 549 }, { "entropy": 0.5092991963028908, "epoch": 2.0522388059701493, "grad_norm": 0.18961496651172638, "learning_rate": 0.0002, "loss": 0.5134435892105103, "mean_token_accuracy": 0.7906353622674942, "num_tokens": 8994217.0, "step": 550 }, { "entropy": 0.5130208507180214, "epoch": 2.0559701492537314, "grad_norm": 0.15812328457832336, "learning_rate": 0.0002, "loss": 0.5057437419891357, "mean_token_accuracy": 0.7933137118816376, "num_tokens": 9010450.0, "step": 551 }, { "entropy": 0.5244034826755524, "epoch": 2.0597014925373136, "grad_norm": 0.17014764249324799, "learning_rate": 0.0002, "loss": 0.5208017230033875, "mean_token_accuracy": 0.7864028364419937, "num_tokens": 9026690.0, "step": 552 }, { "entropy": 0.524794228374958, "epoch": 2.0634328358208953, "grad_norm": 0.1528615653514862, "learning_rate": 0.0002, "loss": 0.5251787900924683, "mean_token_accuracy": 0.7868095934391022, "num_tokens": 9042889.0, "step": 553 }, { "entropy": 0.525935024023056, "epoch": 2.0671641791044775, "grad_norm": 0.1623958796262741, "learning_rate": 0.0002, "loss": 0.5336424708366394, "mean_token_accuracy": 0.7855145633220673, "num_tokens": 9059267.0, "step": 554 }, { "entropy": 0.5195625573396683, "epoch": 2.0708955223880596, "grad_norm": 0.17523802816867828, "learning_rate": 0.0002, "loss": 0.5209751725196838, "mean_token_accuracy": 0.7891881316900253, "num_tokens": 9075744.0, "step": 555 }, { "entropy": 0.5318533927202225, "epoch": 2.074626865671642, "grad_norm": 0.16624799370765686, "learning_rate": 0.0002, "loss": 0.5274427533149719, "mean_token_accuracy": 0.7851865887641907, "num_tokens": 9092196.0, "step": 556 }, { "entropy": 0.5313673615455627, "epoch": 2.078358208955224, "grad_norm": 0.16823066771030426, "learning_rate": 0.0002, "loss": 0.5263111591339111, "mean_token_accuracy": 0.7885167598724365, "num_tokens": 9108431.0, "step": 557 }, { "entropy": 0.518197163939476, "epoch": 2.082089552238806, "grad_norm": 0.18068267405033112, "learning_rate": 0.0002, "loss": 0.5193851590156555, "mean_token_accuracy": 0.7903801500797272, "num_tokens": 9124741.0, "step": 558 }, { "entropy": 0.5107997804880142, "epoch": 2.0858208955223883, "grad_norm": 0.15915489196777344, "learning_rate": 0.0002, "loss": 0.5146846771240234, "mean_token_accuracy": 0.7921037524938583, "num_tokens": 9141112.0, "step": 559 }, { "entropy": 0.5317652374505997, "epoch": 2.08955223880597, "grad_norm": 0.18767035007476807, "learning_rate": 0.0002, "loss": 0.5400185585021973, "mean_token_accuracy": 0.7800605148077011, "num_tokens": 9157563.0, "step": 560 }, { "entropy": 0.5086512267589569, "epoch": 2.093283582089552, "grad_norm": 0.1544736921787262, "learning_rate": 0.0002, "loss": 0.508223831653595, "mean_token_accuracy": 0.7939174175262451, "num_tokens": 9173854.0, "step": 561 }, { "entropy": 0.52768574655056, "epoch": 2.0970149253731343, "grad_norm": 0.17799650132656097, "learning_rate": 0.0002, "loss": 0.5289405584335327, "mean_token_accuracy": 0.7851383984088898, "num_tokens": 9190112.0, "step": 562 }, { "entropy": 0.5307039618492126, "epoch": 2.1007462686567164, "grad_norm": 0.1469665914773941, "learning_rate": 0.0002, "loss": 0.5241371989250183, "mean_token_accuracy": 0.7877105623483658, "num_tokens": 9206476.0, "step": 563 }, { "entropy": 0.517830565571785, "epoch": 2.1044776119402986, "grad_norm": 0.1440608948469162, "learning_rate": 0.0002, "loss": 0.5123553276062012, "mean_token_accuracy": 0.7936355024576187, "num_tokens": 9222843.0, "step": 564 }, { "entropy": 0.523407056927681, "epoch": 2.1082089552238807, "grad_norm": 0.21014799177646637, "learning_rate": 0.0002, "loss": 0.5186851620674133, "mean_token_accuracy": 0.792457640171051, "num_tokens": 9239327.0, "step": 565 }, { "entropy": 0.5128730833530426, "epoch": 2.111940298507463, "grad_norm": 0.2577928602695465, "learning_rate": 0.0002, "loss": 0.5269497632980347, "mean_token_accuracy": 0.7877898067235947, "num_tokens": 9255586.0, "step": 566 }, { "entropy": 0.5238759815692902, "epoch": 2.1156716417910446, "grad_norm": 0.1416473388671875, "learning_rate": 0.0002, "loss": 0.5266433954238892, "mean_token_accuracy": 0.7873618602752686, "num_tokens": 9272236.0, "step": 567 }, { "entropy": 0.5273244455456734, "epoch": 2.1194029850746268, "grad_norm": 0.1742546260356903, "learning_rate": 0.0002, "loss": 0.5227883458137512, "mean_token_accuracy": 0.7893139868974686, "num_tokens": 9288429.0, "step": 568 }, { "entropy": 0.5123281329870224, "epoch": 2.123134328358209, "grad_norm": 0.17472973465919495, "learning_rate": 0.0002, "loss": 0.5086967945098877, "mean_token_accuracy": 0.7941555231809616, "num_tokens": 9304696.0, "step": 569 }, { "entropy": 0.5038742050528526, "epoch": 2.126865671641791, "grad_norm": 0.15990978479385376, "learning_rate": 0.0002, "loss": 0.5093705058097839, "mean_token_accuracy": 0.7927817106246948, "num_tokens": 9320823.0, "step": 570 }, { "entropy": 0.5118470937013626, "epoch": 2.1305970149253732, "grad_norm": 0.15983271598815918, "learning_rate": 0.0002, "loss": 0.5105957388877869, "mean_token_accuracy": 0.7947766035795212, "num_tokens": 9337178.0, "step": 571 }, { "entropy": 0.5117835849523544, "epoch": 2.1343283582089554, "grad_norm": 0.17154565453529358, "learning_rate": 0.0002, "loss": 0.5166530609130859, "mean_token_accuracy": 0.7898510247468948, "num_tokens": 9353541.0, "step": 572 }, { "entropy": 0.524290457367897, "epoch": 2.138059701492537, "grad_norm": 0.1809605062007904, "learning_rate": 0.0002, "loss": 0.5276108980178833, "mean_token_accuracy": 0.7894007414579391, "num_tokens": 9370257.0, "step": 573 }, { "entropy": 0.5326485335826874, "epoch": 2.1417910447761193, "grad_norm": 0.17269255220890045, "learning_rate": 0.0002, "loss": 0.5320166349411011, "mean_token_accuracy": 0.7842083424329758, "num_tokens": 9386645.0, "step": 574 }, { "entropy": 0.5396575331687927, "epoch": 2.1455223880597014, "grad_norm": 0.19763849675655365, "learning_rate": 0.0002, "loss": 0.5302010774612427, "mean_token_accuracy": 0.7843988239765167, "num_tokens": 9403107.0, "step": 575 }, { "entropy": 0.53758405148983, "epoch": 2.1492537313432836, "grad_norm": 0.1403210610151291, "learning_rate": 0.0002, "loss": 0.5297962427139282, "mean_token_accuracy": 0.7875841557979584, "num_tokens": 9419679.0, "step": 576 }, { "entropy": 0.5233541131019592, "epoch": 2.1529850746268657, "grad_norm": 0.18504074215888977, "learning_rate": 0.0002, "loss": 0.5262290835380554, "mean_token_accuracy": 0.7859254032373428, "num_tokens": 9436038.0, "step": 577 }, { "entropy": 0.5059448033571243, "epoch": 2.156716417910448, "grad_norm": 0.18249362707138062, "learning_rate": 0.0002, "loss": 0.5139797329902649, "mean_token_accuracy": 0.7936645895242691, "num_tokens": 9452416.0, "step": 578 }, { "entropy": 0.5189633667469025, "epoch": 2.16044776119403, "grad_norm": 0.21265490353107452, "learning_rate": 0.0002, "loss": 0.533969521522522, "mean_token_accuracy": 0.7854558378458023, "num_tokens": 9468830.0, "step": 579 }, { "entropy": 0.5293581038713455, "epoch": 2.1641791044776117, "grad_norm": 0.16064560413360596, "learning_rate": 0.0002, "loss": 0.5302042961120605, "mean_token_accuracy": 0.7855220139026642, "num_tokens": 9485369.0, "step": 580 }, { "entropy": 0.5367814004421234, "epoch": 2.167910447761194, "grad_norm": 0.1988399475812912, "learning_rate": 0.0002, "loss": 0.5316881537437439, "mean_token_accuracy": 0.7867899537086487, "num_tokens": 9501506.0, "step": 581 }, { "entropy": 0.530438095331192, "epoch": 2.171641791044776, "grad_norm": 0.16211427748203278, "learning_rate": 0.0002, "loss": 0.5204508304595947, "mean_token_accuracy": 0.7928901314735413, "num_tokens": 9517998.0, "step": 582 }, { "entropy": 0.538342297077179, "epoch": 2.175373134328358, "grad_norm": 0.200654536485672, "learning_rate": 0.0002, "loss": 0.5368824005126953, "mean_token_accuracy": 0.7828831076622009, "num_tokens": 9534418.0, "step": 583 }, { "entropy": 0.5067318677902222, "epoch": 2.1791044776119404, "grad_norm": 0.18536439538002014, "learning_rate": 0.0002, "loss": 0.5152954459190369, "mean_token_accuracy": 0.7947442531585693, "num_tokens": 9550929.0, "step": 584 }, { "entropy": 0.5143613219261169, "epoch": 2.1828358208955225, "grad_norm": 0.18734246492385864, "learning_rate": 0.0002, "loss": 0.5320346355438232, "mean_token_accuracy": 0.7830832600593567, "num_tokens": 9567052.0, "step": 585 }, { "entropy": 0.5134065821766853, "epoch": 2.1865671641791047, "grad_norm": 0.1658649444580078, "learning_rate": 0.0002, "loss": 0.5137937664985657, "mean_token_accuracy": 0.792109802365303, "num_tokens": 9583328.0, "step": 586 }, { "entropy": 0.5145891755819321, "epoch": 2.1902985074626864, "grad_norm": 0.20381639897823334, "learning_rate": 0.0002, "loss": 0.5113189220428467, "mean_token_accuracy": 0.791796863079071, "num_tokens": 9599639.0, "step": 587 }, { "entropy": 0.5297699421644211, "epoch": 2.1940298507462686, "grad_norm": 0.1610771119594574, "learning_rate": 0.0002, "loss": 0.5239428877830505, "mean_token_accuracy": 0.7868966311216354, "num_tokens": 9616107.0, "step": 588 }, { "entropy": 0.5139229521155357, "epoch": 2.1977611940298507, "grad_norm": 0.16601988673210144, "learning_rate": 0.0002, "loss": 0.5093111991882324, "mean_token_accuracy": 0.7953454554080963, "num_tokens": 9632478.0, "step": 589 }, { "entropy": 0.5277693122625351, "epoch": 2.201492537313433, "grad_norm": 0.15310561656951904, "learning_rate": 0.0002, "loss": 0.5306464433670044, "mean_token_accuracy": 0.785234808921814, "num_tokens": 9648606.0, "step": 590 }, { "entropy": 0.5277083218097687, "epoch": 2.205223880597015, "grad_norm": 0.17894159257411957, "learning_rate": 0.0002, "loss": 0.5229562520980835, "mean_token_accuracy": 0.7855621576309204, "num_tokens": 9664853.0, "step": 591 }, { "entropy": 0.5369253158569336, "epoch": 2.208955223880597, "grad_norm": 0.17260174453258514, "learning_rate": 0.0002, "loss": 0.5379320383071899, "mean_token_accuracy": 0.785187691450119, "num_tokens": 9681395.0, "step": 592 }, { "entropy": 0.51601941883564, "epoch": 2.2126865671641793, "grad_norm": 0.19144131243228912, "learning_rate": 0.0002, "loss": 0.525420606136322, "mean_token_accuracy": 0.7879699319601059, "num_tokens": 9697832.0, "step": 593 }, { "entropy": 0.5305543690919876, "epoch": 2.216417910447761, "grad_norm": 0.152136892080307, "learning_rate": 0.0002, "loss": 0.5263657569885254, "mean_token_accuracy": 0.7852640599012375, "num_tokens": 9714327.0, "step": 594 }, { "entropy": 0.5374766737222672, "epoch": 2.220149253731343, "grad_norm": 0.18577203154563904, "learning_rate": 0.0002, "loss": 0.538034975528717, "mean_token_accuracy": 0.7831636220216751, "num_tokens": 9730796.0, "step": 595 }, { "entropy": 0.5116140991449356, "epoch": 2.2238805970149254, "grad_norm": 0.15658536553382874, "learning_rate": 0.0002, "loss": 0.5068283081054688, "mean_token_accuracy": 0.7946771383285522, "num_tokens": 9747017.0, "step": 596 }, { "entropy": 0.5136987864971161, "epoch": 2.2276119402985075, "grad_norm": 0.15834017097949982, "learning_rate": 0.0002, "loss": 0.518505334854126, "mean_token_accuracy": 0.7908380329608917, "num_tokens": 9763200.0, "step": 597 }, { "entropy": 0.48786860704421997, "epoch": 2.2313432835820897, "grad_norm": 0.16836979985237122, "learning_rate": 0.0002, "loss": 0.4918700158596039, "mean_token_accuracy": 0.8017545938491821, "num_tokens": 9779342.0, "step": 598 }, { "entropy": 0.511562891304493, "epoch": 2.235074626865672, "grad_norm": 0.19002674520015717, "learning_rate": 0.0002, "loss": 0.5156916975975037, "mean_token_accuracy": 0.7910201996564865, "num_tokens": 9795546.0, "step": 599 }, { "entropy": 0.5209366902709007, "epoch": 2.2388059701492535, "grad_norm": 0.17156340181827545, "learning_rate": 0.0002, "loss": 0.515453040599823, "mean_token_accuracy": 0.7911808788776398, "num_tokens": 9811678.0, "step": 600 }, { "entropy": 0.5190790444612503, "epoch": 2.2425373134328357, "grad_norm": 0.16390037536621094, "learning_rate": 0.0002, "loss": 0.5197610259056091, "mean_token_accuracy": 0.791000559926033, "num_tokens": 9827971.0, "step": 601 }, { "entropy": 0.534053236246109, "epoch": 2.246268656716418, "grad_norm": 0.17688144743442535, "learning_rate": 0.0002, "loss": 0.5342822074890137, "mean_token_accuracy": 0.7848292291164398, "num_tokens": 9844391.0, "step": 602 }, { "entropy": 0.5072491243481636, "epoch": 2.25, "grad_norm": 0.15552373230457306, "learning_rate": 0.0002, "loss": 0.5125934481620789, "mean_token_accuracy": 0.79164819419384, "num_tokens": 9860695.0, "step": 603 }, { "entropy": 0.5196588039398193, "epoch": 2.253731343283582, "grad_norm": 0.20500463247299194, "learning_rate": 0.0002, "loss": 0.5203579664230347, "mean_token_accuracy": 0.7872295528650284, "num_tokens": 9876962.0, "step": 604 }, { "entropy": 0.5224801748991013, "epoch": 2.2574626865671643, "grad_norm": 0.16438624262809753, "learning_rate": 0.0002, "loss": 0.517778217792511, "mean_token_accuracy": 0.7902567535638809, "num_tokens": 9893378.0, "step": 605 }, { "entropy": 0.5315049141645432, "epoch": 2.2611940298507465, "grad_norm": 0.19314803183078766, "learning_rate": 0.0002, "loss": 0.5378735065460205, "mean_token_accuracy": 0.7826669216156006, "num_tokens": 9909658.0, "step": 606 }, { "entropy": 0.5268717259168625, "epoch": 2.264925373134328, "grad_norm": 0.1703607141971588, "learning_rate": 0.0002, "loss": 0.5323152542114258, "mean_token_accuracy": 0.7835480719804764, "num_tokens": 9926026.0, "step": 607 }, { "entropy": 0.5275075733661652, "epoch": 2.2686567164179103, "grad_norm": 0.1891828328371048, "learning_rate": 0.0002, "loss": 0.523108959197998, "mean_token_accuracy": 0.7864743769168854, "num_tokens": 9942362.0, "step": 608 }, { "entropy": 0.5301201939582825, "epoch": 2.2723880597014925, "grad_norm": 0.16404391825199127, "learning_rate": 0.0002, "loss": 0.5282193422317505, "mean_token_accuracy": 0.7837762832641602, "num_tokens": 9958517.0, "step": 609 }, { "entropy": 0.5198077484965324, "epoch": 2.2761194029850746, "grad_norm": 0.1796608716249466, "learning_rate": 0.0002, "loss": 0.5138813853263855, "mean_token_accuracy": 0.7904112935066223, "num_tokens": 9974864.0, "step": 610 }, { "entropy": 0.5151881948113441, "epoch": 2.279850746268657, "grad_norm": 0.1921297013759613, "learning_rate": 0.0002, "loss": 0.5276269912719727, "mean_token_accuracy": 0.7861463725566864, "num_tokens": 9990982.0, "step": 611 }, { "entropy": 0.5268184095621109, "epoch": 2.283582089552239, "grad_norm": 0.3107461929321289, "learning_rate": 0.0002, "loss": 0.5354833006858826, "mean_token_accuracy": 0.7860495001077652, "num_tokens": 10007390.0, "step": 612 }, { "entropy": 0.5362572968006134, "epoch": 2.2873134328358207, "grad_norm": 0.2291727513074875, "learning_rate": 0.0002, "loss": 0.5278795957565308, "mean_token_accuracy": 0.7864319235086441, "num_tokens": 10023741.0, "step": 613 }, { "entropy": 0.5297401547431946, "epoch": 2.291044776119403, "grad_norm": 0.22683671116828918, "learning_rate": 0.0002, "loss": 0.5257067680358887, "mean_token_accuracy": 0.7868115305900574, "num_tokens": 10040185.0, "step": 614 }, { "entropy": 0.5152234882116318, "epoch": 2.294776119402985, "grad_norm": 0.20225822925567627, "learning_rate": 0.0002, "loss": 0.5109996795654297, "mean_token_accuracy": 0.7922611236572266, "num_tokens": 10056416.0, "step": 615 }, { "entropy": 0.5397164672613144, "epoch": 2.298507462686567, "grad_norm": 0.21879570186138153, "learning_rate": 0.0002, "loss": 0.53910893201828, "mean_token_accuracy": 0.7829782217741013, "num_tokens": 10073119.0, "step": 616 }, { "entropy": 0.523445226252079, "epoch": 2.3022388059701493, "grad_norm": 0.2043614238500595, "learning_rate": 0.0002, "loss": 0.5277411341667175, "mean_token_accuracy": 0.7879920601844788, "num_tokens": 10089539.0, "step": 617 }, { "entropy": 0.5420306473970413, "epoch": 2.3059701492537314, "grad_norm": 0.16890020668506622, "learning_rate": 0.0002, "loss": 0.5416600704193115, "mean_token_accuracy": 0.7815042287111282, "num_tokens": 10105674.0, "step": 618 }, { "entropy": 0.5223758220672607, "epoch": 2.3097014925373136, "grad_norm": 0.187328040599823, "learning_rate": 0.0002, "loss": 0.5208746790885925, "mean_token_accuracy": 0.7938240319490433, "num_tokens": 10121685.0, "step": 619 }, { "entropy": 0.5317254960536957, "epoch": 2.3134328358208958, "grad_norm": 0.17246371507644653, "learning_rate": 0.0002, "loss": 0.5258828997612, "mean_token_accuracy": 0.7855419665575027, "num_tokens": 10138380.0, "step": 620 }, { "entropy": 0.510456420481205, "epoch": 2.3171641791044775, "grad_norm": 0.17611362040042877, "learning_rate": 0.0002, "loss": 0.5174400806427002, "mean_token_accuracy": 0.790027379989624, "num_tokens": 10154639.0, "step": 621 }, { "entropy": 0.5146428272128105, "epoch": 2.3208955223880596, "grad_norm": 0.19471095502376556, "learning_rate": 0.0002, "loss": 0.5222116708755493, "mean_token_accuracy": 0.7890471816062927, "num_tokens": 10170992.0, "step": 622 }, { "entropy": 0.5554968118667603, "epoch": 2.324626865671642, "grad_norm": 0.15456657111644745, "learning_rate": 0.0002, "loss": 0.5553091168403625, "mean_token_accuracy": 0.7767172753810883, "num_tokens": 10187415.0, "step": 623 }, { "entropy": 0.5297296196222305, "epoch": 2.328358208955224, "grad_norm": 0.17202581465244293, "learning_rate": 0.0002, "loss": 0.5306862592697144, "mean_token_accuracy": 0.7859676033258438, "num_tokens": 10204041.0, "step": 624 }, { "entropy": 0.5107762217521667, "epoch": 2.332089552238806, "grad_norm": 0.17404352128505707, "learning_rate": 0.0002, "loss": 0.5129390358924866, "mean_token_accuracy": 0.7931138426065445, "num_tokens": 10220300.0, "step": 625 }, { "entropy": 0.5258396938443184, "epoch": 2.3358208955223883, "grad_norm": 0.18174229562282562, "learning_rate": 0.0002, "loss": 0.5229369401931763, "mean_token_accuracy": 0.7888091504573822, "num_tokens": 10236649.0, "step": 626 }, { "entropy": 0.5380365252494812, "epoch": 2.33955223880597, "grad_norm": 0.17537739872932434, "learning_rate": 0.0002, "loss": 0.5373145937919617, "mean_token_accuracy": 0.7832024991512299, "num_tokens": 10252909.0, "step": 627 }, { "entropy": 0.5075801610946655, "epoch": 2.343283582089552, "grad_norm": 0.22284290194511414, "learning_rate": 0.0002, "loss": 0.511396586894989, "mean_token_accuracy": 0.7928276360034943, "num_tokens": 10269280.0, "step": 628 }, { "entropy": 0.5164258778095245, "epoch": 2.3470149253731343, "grad_norm": 0.18526744842529297, "learning_rate": 0.0002, "loss": 0.5178982019424438, "mean_token_accuracy": 0.7898775935173035, "num_tokens": 10285761.0, "step": 629 }, { "entropy": 0.5200358033180237, "epoch": 2.3507462686567164, "grad_norm": 0.20576190948486328, "learning_rate": 0.0002, "loss": 0.5253298878669739, "mean_token_accuracy": 0.7885328382253647, "num_tokens": 10301941.0, "step": 630 }, { "entropy": 0.5383775234222412, "epoch": 2.3544776119402986, "grad_norm": 0.17617975175380707, "learning_rate": 0.0002, "loss": 0.5448250770568848, "mean_token_accuracy": 0.782653346657753, "num_tokens": 10318486.0, "step": 631 }, { "entropy": 0.5118822678923607, "epoch": 2.3582089552238807, "grad_norm": 0.18932130932807922, "learning_rate": 0.0002, "loss": 0.5223209857940674, "mean_token_accuracy": 0.7917590737342834, "num_tokens": 10334530.0, "step": 632 }, { "entropy": 0.5191465318202972, "epoch": 2.361940298507463, "grad_norm": 0.18021032214164734, "learning_rate": 0.0002, "loss": 0.5152462124824524, "mean_token_accuracy": 0.791267067193985, "num_tokens": 10350724.0, "step": 633 }, { "entropy": 0.5144938305020332, "epoch": 2.3656716417910446, "grad_norm": 0.15109598636627197, "learning_rate": 0.0002, "loss": 0.4982617497444153, "mean_token_accuracy": 0.7967542856931686, "num_tokens": 10366875.0, "step": 634 }, { "entropy": 0.5065358951687813, "epoch": 2.3694029850746268, "grad_norm": 0.18718236684799194, "learning_rate": 0.0002, "loss": 0.4973527193069458, "mean_token_accuracy": 0.8017638623714447, "num_tokens": 10383005.0, "step": 635 }, { "entropy": 0.530413880944252, "epoch": 2.373134328358209, "grad_norm": 0.1718485951423645, "learning_rate": 0.0002, "loss": 0.5324255228042603, "mean_token_accuracy": 0.7831610143184662, "num_tokens": 10399588.0, "step": 636 }, { "entropy": 0.5436315685510635, "epoch": 2.376865671641791, "grad_norm": 0.20064882934093475, "learning_rate": 0.0002, "loss": 0.5518239140510559, "mean_token_accuracy": 0.7763282507658005, "num_tokens": 10416058.0, "step": 637 }, { "entropy": 0.5224271416664124, "epoch": 2.3805970149253732, "grad_norm": 0.18303366005420685, "learning_rate": 0.0002, "loss": 0.5248957872390747, "mean_token_accuracy": 0.7867279052734375, "num_tokens": 10432139.0, "step": 638 }, { "entropy": 0.5115847885608673, "epoch": 2.3843283582089554, "grad_norm": 0.18415044248104095, "learning_rate": 0.0002, "loss": 0.5158942937850952, "mean_token_accuracy": 0.7931726425886154, "num_tokens": 10448181.0, "step": 639 }, { "entropy": 0.5335763245820999, "epoch": 2.388059701492537, "grad_norm": 0.17970694601535797, "learning_rate": 0.0002, "loss": 0.5286952257156372, "mean_token_accuracy": 0.7878449261188507, "num_tokens": 10464583.0, "step": 640 }, { "entropy": 0.5233506336808205, "epoch": 2.3917910447761193, "grad_norm": 0.19122423231601715, "learning_rate": 0.0002, "loss": 0.5172105431556702, "mean_token_accuracy": 0.7892956882715225, "num_tokens": 10481023.0, "step": 641 }, { "entropy": 0.5129317939281464, "epoch": 2.3955223880597014, "grad_norm": 0.16389286518096924, "learning_rate": 0.0002, "loss": 0.5165532231330872, "mean_token_accuracy": 0.7895939499139786, "num_tokens": 10497404.0, "step": 642 }, { "entropy": 0.5067487806081772, "epoch": 2.3992537313432836, "grad_norm": 0.17685648798942566, "learning_rate": 0.0002, "loss": 0.5114090442657471, "mean_token_accuracy": 0.79579958319664, "num_tokens": 10513777.0, "step": 643 }, { "entropy": 0.5056411698460579, "epoch": 2.4029850746268657, "grad_norm": 0.20632798969745636, "learning_rate": 0.0002, "loss": 0.512579083442688, "mean_token_accuracy": 0.7917985171079636, "num_tokens": 10530002.0, "step": 644 }, { "entropy": 0.503575325012207, "epoch": 2.406716417910448, "grad_norm": 0.18627490103244781, "learning_rate": 0.0002, "loss": 0.5137442350387573, "mean_token_accuracy": 0.7893558740615845, "num_tokens": 10546273.0, "step": 645 }, { "entropy": 0.5291843265295029, "epoch": 2.41044776119403, "grad_norm": 0.16846197843551636, "learning_rate": 0.0002, "loss": 0.5265457630157471, "mean_token_accuracy": 0.7875650376081467, "num_tokens": 10562590.0, "step": 646 }, { "entropy": 0.5421585887670517, "epoch": 2.4141791044776117, "grad_norm": 0.17224395275115967, "learning_rate": 0.0002, "loss": 0.5339004993438721, "mean_token_accuracy": 0.7843624651432037, "num_tokens": 10578951.0, "step": 647 }, { "entropy": 0.5322060137987137, "epoch": 2.417910447761194, "grad_norm": 0.15629476308822632, "learning_rate": 0.0002, "loss": 0.5219835638999939, "mean_token_accuracy": 0.7886752039194107, "num_tokens": 10595214.0, "step": 648 }, { "entropy": 0.5281577706336975, "epoch": 2.421641791044776, "grad_norm": 0.18105372786521912, "learning_rate": 0.0002, "loss": 0.5306849479675293, "mean_token_accuracy": 0.7853680700063705, "num_tokens": 10611701.0, "step": 649 }, { "entropy": 0.5248554199934006, "epoch": 2.425373134328358, "grad_norm": 0.16688814759254456, "learning_rate": 0.0002, "loss": 0.5278753638267517, "mean_token_accuracy": 0.7852373868227005, "num_tokens": 10628217.0, "step": 650 }, { "entropy": 0.5284415632486343, "epoch": 2.4291044776119404, "grad_norm": 0.1766011267900467, "learning_rate": 0.0002, "loss": 0.5336297750473022, "mean_token_accuracy": 0.7854758203029633, "num_tokens": 10644808.0, "step": 651 }, { "entropy": 0.522301472723484, "epoch": 2.4328358208955225, "grad_norm": 0.1673455685377121, "learning_rate": 0.0002, "loss": 0.5260990262031555, "mean_token_accuracy": 0.7875321805477142, "num_tokens": 10661415.0, "step": 652 }, { "entropy": 0.5340454131364822, "epoch": 2.4365671641791042, "grad_norm": 0.1705857813358307, "learning_rate": 0.0002, "loss": 0.5287991166114807, "mean_token_accuracy": 0.7848271727561951, "num_tokens": 10678098.0, "step": 653 }, { "entropy": 0.5536000281572342, "epoch": 2.4402985074626864, "grad_norm": 0.16633524000644684, "learning_rate": 0.0002, "loss": 0.5458575487136841, "mean_token_accuracy": 0.7790239751338959, "num_tokens": 10694453.0, "step": 654 }, { "entropy": 0.5396594703197479, "epoch": 2.4440298507462686, "grad_norm": 0.1658376157283783, "learning_rate": 0.0002, "loss": 0.5348730683326721, "mean_token_accuracy": 0.7840123027563095, "num_tokens": 10710682.0, "step": 655 }, { "entropy": 0.5132960826158524, "epoch": 2.4477611940298507, "grad_norm": 0.16822409629821777, "learning_rate": 0.0002, "loss": 0.5173973441123962, "mean_token_accuracy": 0.7915854156017303, "num_tokens": 10726882.0, "step": 656 }, { "entropy": 0.504063256084919, "epoch": 2.451492537313433, "grad_norm": 0.21201510727405548, "learning_rate": 0.0002, "loss": 0.5162043571472168, "mean_token_accuracy": 0.7916038483381271, "num_tokens": 10743326.0, "step": 657 }, { "entropy": 0.5151261985301971, "epoch": 2.455223880597015, "grad_norm": 0.22159790992736816, "learning_rate": 0.0002, "loss": 0.5307928323745728, "mean_token_accuracy": 0.783583402633667, "num_tokens": 10759068.0, "step": 658 }, { "entropy": 0.5228653997182846, "epoch": 2.458955223880597, "grad_norm": 0.1764376312494278, "learning_rate": 0.0002, "loss": 0.526711106300354, "mean_token_accuracy": 0.785754069685936, "num_tokens": 10775538.0, "step": 659 }, { "entropy": 0.5352444350719452, "epoch": 2.4626865671641793, "grad_norm": 0.1673639416694641, "learning_rate": 0.0002, "loss": 0.53009432554245, "mean_token_accuracy": 0.7853073179721832, "num_tokens": 10791878.0, "step": 660 }, { "entropy": 0.5250429213047028, "epoch": 2.466417910447761, "grad_norm": 0.1584668755531311, "learning_rate": 0.0002, "loss": 0.5163600444793701, "mean_token_accuracy": 0.7921949625015259, "num_tokens": 10808194.0, "step": 661 }, { "entropy": 0.531511977314949, "epoch": 2.470149253731343, "grad_norm": 0.15331409871578217, "learning_rate": 0.0002, "loss": 0.52297043800354, "mean_token_accuracy": 0.7875395864248276, "num_tokens": 10824487.0, "step": 662 }, { "entropy": 0.5337095707654953, "epoch": 2.4738805970149254, "grad_norm": 0.1537831574678421, "learning_rate": 0.0002, "loss": 0.5269461870193481, "mean_token_accuracy": 0.7883634269237518, "num_tokens": 10840768.0, "step": 663 }, { "entropy": 0.5136477053165436, "epoch": 2.4776119402985075, "grad_norm": 0.1710546612739563, "learning_rate": 0.0002, "loss": 0.5147293210029602, "mean_token_accuracy": 0.790741965174675, "num_tokens": 10857093.0, "step": 664 }, { "entropy": 0.5279193222522736, "epoch": 2.4813432835820897, "grad_norm": 0.18926194310188293, "learning_rate": 0.0002, "loss": 0.5373238921165466, "mean_token_accuracy": 0.7801239043474197, "num_tokens": 10873516.0, "step": 665 }, { "entropy": 0.5202833041548729, "epoch": 2.485074626865672, "grad_norm": 0.18720589578151703, "learning_rate": 0.0002, "loss": 0.5260710120201111, "mean_token_accuracy": 0.7854216694831848, "num_tokens": 10889866.0, "step": 666 }, { "entropy": 0.5342879593372345, "epoch": 2.4888059701492535, "grad_norm": 0.16395018994808197, "learning_rate": 0.0002, "loss": 0.5291630625724792, "mean_token_accuracy": 0.786442369222641, "num_tokens": 10906265.0, "step": 667 }, { "entropy": 0.5179769471287727, "epoch": 2.4925373134328357, "grad_norm": 0.18135614693164825, "learning_rate": 0.0002, "loss": 0.5105394721031189, "mean_token_accuracy": 0.7919545620679855, "num_tokens": 10922859.0, "step": 668 }, { "entropy": 0.5149218291044235, "epoch": 2.496268656716418, "grad_norm": 0.16995131969451904, "learning_rate": 0.0002, "loss": 0.5147515535354614, "mean_token_accuracy": 0.7931389808654785, "num_tokens": 10938918.0, "step": 669 }, { "entropy": 0.5330513119697571, "epoch": 2.5, "grad_norm": 0.1602948158979416, "learning_rate": 0.0002, "loss": 0.5284178256988525, "mean_token_accuracy": 0.7882454097270966, "num_tokens": 10955263.0, "step": 670 }, { "entropy": 0.5100918263196945, "epoch": 2.503731343283582, "grad_norm": 0.1638704538345337, "learning_rate": 0.0002, "loss": 0.5109102725982666, "mean_token_accuracy": 0.7914802730083466, "num_tokens": 10971573.0, "step": 671 }, { "entropy": 0.5232444852590561, "epoch": 2.5074626865671643, "grad_norm": 0.17863468825817108, "learning_rate": 0.0002, "loss": 0.527701735496521, "mean_token_accuracy": 0.7854352295398712, "num_tokens": 10987693.0, "step": 672 }, { "entropy": 0.5050330087542534, "epoch": 2.5111940298507465, "grad_norm": 0.18801726400852203, "learning_rate": 0.0002, "loss": 0.5186895728111267, "mean_token_accuracy": 0.7896755188703537, "num_tokens": 11003802.0, "step": 673 }, { "entropy": 0.5354911088943481, "epoch": 2.5149253731343286, "grad_norm": 0.1630580574274063, "learning_rate": 0.0002, "loss": 0.5393661856651306, "mean_token_accuracy": 0.7806737869977951, "num_tokens": 11020382.0, "step": 674 }, { "entropy": 0.5103952214121819, "epoch": 2.5186567164179103, "grad_norm": 0.16479070484638214, "learning_rate": 0.0002, "loss": 0.5052312016487122, "mean_token_accuracy": 0.79300656914711, "num_tokens": 11036684.0, "step": 675 }, { "entropy": 0.5548539459705353, "epoch": 2.5223880597014925, "grad_norm": 0.15993361175060272, "learning_rate": 0.0002, "loss": 0.5424168109893799, "mean_token_accuracy": 0.7810866236686707, "num_tokens": 11053105.0, "step": 676 }, { "entropy": 0.5318550616502762, "epoch": 2.5261194029850746, "grad_norm": 0.17689482867717743, "learning_rate": 0.0002, "loss": 0.5247601270675659, "mean_token_accuracy": 0.7856518179178238, "num_tokens": 11069578.0, "step": 677 }, { "entropy": 0.5139466673135757, "epoch": 2.529850746268657, "grad_norm": 0.17671139538288116, "learning_rate": 0.0002, "loss": 0.5161247253417969, "mean_token_accuracy": 0.7908915132284164, "num_tokens": 11085697.0, "step": 678 }, { "entropy": 0.5080201476812363, "epoch": 2.533582089552239, "grad_norm": 0.2036965787410736, "learning_rate": 0.0002, "loss": 0.5175144672393799, "mean_token_accuracy": 0.791350468993187, "num_tokens": 11101902.0, "step": 679 }, { "entropy": 0.5312675833702087, "epoch": 2.5373134328358207, "grad_norm": 0.19512657821178436, "learning_rate": 0.0002, "loss": 0.5406134128570557, "mean_token_accuracy": 0.7809882313013077, "num_tokens": 11118259.0, "step": 680 }, { "entropy": 0.5147824436426163, "epoch": 2.541044776119403, "grad_norm": 0.223260298371315, "learning_rate": 0.0002, "loss": 0.5146397948265076, "mean_token_accuracy": 0.7933319509029388, "num_tokens": 11134757.0, "step": 681 }, { "entropy": 0.5265121906995773, "epoch": 2.544776119402985, "grad_norm": 0.17229494452476501, "learning_rate": 0.0002, "loss": 0.5215858221054077, "mean_token_accuracy": 0.7878258526325226, "num_tokens": 11150969.0, "step": 682 }, { "entropy": 0.5460138469934464, "epoch": 2.548507462686567, "grad_norm": 0.16450214385986328, "learning_rate": 0.0002, "loss": 0.5474146604537964, "mean_token_accuracy": 0.7795809954404831, "num_tokens": 11167094.0, "step": 683 }, { "entropy": 0.5366989523172379, "epoch": 2.5522388059701493, "grad_norm": 0.20410536229610443, "learning_rate": 0.0002, "loss": 0.5371419787406921, "mean_token_accuracy": 0.7853393852710724, "num_tokens": 11183515.0, "step": 684 }, { "entropy": 0.5475771278142929, "epoch": 2.5559701492537314, "grad_norm": 0.1698704957962036, "learning_rate": 0.0002, "loss": 0.5460457801818848, "mean_token_accuracy": 0.781210407614708, "num_tokens": 11200139.0, "step": 685 }, { "entropy": 0.5389831364154816, "epoch": 2.5597014925373136, "grad_norm": 0.22744543850421906, "learning_rate": 0.0002, "loss": 0.5387647747993469, "mean_token_accuracy": 0.7828833609819412, "num_tokens": 11216497.0, "step": 686 }, { "entropy": 0.531368613243103, "epoch": 2.5634328358208958, "grad_norm": 0.17488178610801697, "learning_rate": 0.0002, "loss": 0.5309722423553467, "mean_token_accuracy": 0.7842755913734436, "num_tokens": 11232676.0, "step": 687 }, { "entropy": 0.5410369485616684, "epoch": 2.5671641791044775, "grad_norm": 0.1710905283689499, "learning_rate": 0.0002, "loss": 0.5380433797836304, "mean_token_accuracy": 0.7851070165634155, "num_tokens": 11249092.0, "step": 688 }, { "entropy": 0.5218508541584015, "epoch": 2.5708955223880596, "grad_norm": 0.2351209968328476, "learning_rate": 0.0002, "loss": 0.5304785966873169, "mean_token_accuracy": 0.7837776988744736, "num_tokens": 11265168.0, "step": 689 }, { "entropy": 0.5149262696504593, "epoch": 2.574626865671642, "grad_norm": 0.15611964464187622, "learning_rate": 0.0002, "loss": 0.5160297155380249, "mean_token_accuracy": 0.7932045161724091, "num_tokens": 11281641.0, "step": 690 }, { "entropy": 0.5153379887342453, "epoch": 2.578358208955224, "grad_norm": 0.23146718740463257, "learning_rate": 0.0002, "loss": 0.5226321220397949, "mean_token_accuracy": 0.787521630525589, "num_tokens": 11298142.0, "step": 691 }, { "entropy": 0.5393347591161728, "epoch": 2.582089552238806, "grad_norm": 0.16657157242298126, "learning_rate": 0.0002, "loss": 0.5344167351722717, "mean_token_accuracy": 0.7832511067390442, "num_tokens": 11314425.0, "step": 692 }, { "entropy": 0.5284578949213028, "epoch": 2.585820895522388, "grad_norm": 0.2301884889602661, "learning_rate": 0.0002, "loss": 0.5258397459983826, "mean_token_accuracy": 0.787845253944397, "num_tokens": 11330672.0, "step": 693 }, { "entropy": 0.5345947295427322, "epoch": 2.58955223880597, "grad_norm": 0.17253969609737396, "learning_rate": 0.0002, "loss": 0.5329262018203735, "mean_token_accuracy": 0.783668577671051, "num_tokens": 11346999.0, "step": 694 }, { "entropy": 0.5287525057792664, "epoch": 2.593283582089552, "grad_norm": 0.1584477573633194, "learning_rate": 0.0002, "loss": 0.5283543467521667, "mean_token_accuracy": 0.7880005240440369, "num_tokens": 11363488.0, "step": 695 }, { "entropy": 0.5259083658456802, "epoch": 2.5970149253731343, "grad_norm": 0.18429915606975555, "learning_rate": 0.0002, "loss": 0.5257930159568787, "mean_token_accuracy": 0.7871210873126984, "num_tokens": 11379993.0, "step": 696 }, { "entropy": 0.5198669880628586, "epoch": 2.6007462686567164, "grad_norm": 0.19845134019851685, "learning_rate": 0.0002, "loss": 0.5221295356750488, "mean_token_accuracy": 0.7895113527774811, "num_tokens": 11396236.0, "step": 697 }, { "entropy": 0.5398612767457962, "epoch": 2.6044776119402986, "grad_norm": 0.19270583987236023, "learning_rate": 0.0002, "loss": 0.5429852604866028, "mean_token_accuracy": 0.7811529338359833, "num_tokens": 11412613.0, "step": 698 }, { "entropy": 0.5187375992536545, "epoch": 2.6082089552238807, "grad_norm": 0.18094319105148315, "learning_rate": 0.0002, "loss": 0.5167657136917114, "mean_token_accuracy": 0.790035143494606, "num_tokens": 11428870.0, "step": 699 }, { "entropy": 0.5331326425075531, "epoch": 2.611940298507463, "grad_norm": 0.16809140145778656, "learning_rate": 0.0002, "loss": 0.5311716794967651, "mean_token_accuracy": 0.7813376784324646, "num_tokens": 11445541.0, "step": 700 }, { "entropy": 0.5317347943782806, "epoch": 2.6156716417910446, "grad_norm": 0.2061910331249237, "learning_rate": 0.0002, "loss": 0.5366970896720886, "mean_token_accuracy": 0.7823969423770905, "num_tokens": 11461869.0, "step": 701 }, { "entropy": 0.5304048359394073, "epoch": 2.6194029850746268, "grad_norm": 0.15473014116287231, "learning_rate": 0.0002, "loss": 0.5267943143844604, "mean_token_accuracy": 0.7864733040332794, "num_tokens": 11478245.0, "step": 702 }, { "entropy": 0.528009369969368, "epoch": 2.623134328358209, "grad_norm": 0.2206811010837555, "learning_rate": 0.0002, "loss": 0.528520941734314, "mean_token_accuracy": 0.7848467379808426, "num_tokens": 11494601.0, "step": 703 }, { "entropy": 0.5367393791675568, "epoch": 2.626865671641791, "grad_norm": 0.17169888317584991, "learning_rate": 0.0002, "loss": 0.5352901816368103, "mean_token_accuracy": 0.7826301157474518, "num_tokens": 11510824.0, "step": 704 }, { "entropy": 0.5446508675813675, "epoch": 2.6305970149253732, "grad_norm": 0.23117929697036743, "learning_rate": 0.0002, "loss": 0.5552783608436584, "mean_token_accuracy": 0.7762233018875122, "num_tokens": 11527111.0, "step": 705 }, { "entropy": 0.5259118974208832, "epoch": 2.6343283582089554, "grad_norm": 0.17237775027751923, "learning_rate": 0.0002, "loss": 0.5258082747459412, "mean_token_accuracy": 0.7888418883085251, "num_tokens": 11543508.0, "step": 706 }, { "entropy": 0.5134415403008461, "epoch": 2.638059701492537, "grad_norm": 0.1968804895877838, "learning_rate": 0.0002, "loss": 0.516159176826477, "mean_token_accuracy": 0.7919125109910965, "num_tokens": 11559764.0, "step": 707 }, { "entropy": 0.5164712592959404, "epoch": 2.6417910447761193, "grad_norm": 0.18034212291240692, "learning_rate": 0.0002, "loss": 0.5184696316719055, "mean_token_accuracy": 0.7913271486759186, "num_tokens": 11576280.0, "step": 708 }, { "entropy": 0.5396228730678558, "epoch": 2.6455223880597014, "grad_norm": 0.16111285984516144, "learning_rate": 0.0002, "loss": 0.536095142364502, "mean_token_accuracy": 0.7845699042081833, "num_tokens": 11592548.0, "step": 709 }, { "entropy": 0.5335683822631836, "epoch": 2.6492537313432836, "grad_norm": 0.18878330290317535, "learning_rate": 0.0002, "loss": 0.533022403717041, "mean_token_accuracy": 0.7858745902776718, "num_tokens": 11608718.0, "step": 710 }, { "entropy": 0.5291629135608673, "epoch": 2.6529850746268657, "grad_norm": 0.15525634586811066, "learning_rate": 0.0002, "loss": 0.5270857214927673, "mean_token_accuracy": 0.7867603600025177, "num_tokens": 11624984.0, "step": 711 }, { "entropy": 0.5291008502244949, "epoch": 2.656716417910448, "grad_norm": 0.2215014100074768, "learning_rate": 0.0002, "loss": 0.5335924029350281, "mean_token_accuracy": 0.7852614969015121, "num_tokens": 11641414.0, "step": 712 }, { "entropy": 0.5195610374212265, "epoch": 2.66044776119403, "grad_norm": 0.1840248554944992, "learning_rate": 0.0002, "loss": 0.5272573828697205, "mean_token_accuracy": 0.7856255769729614, "num_tokens": 11657606.0, "step": 713 }, { "entropy": 0.5212601721286774, "epoch": 2.664179104477612, "grad_norm": 0.2194834053516388, "learning_rate": 0.0002, "loss": 0.5225985050201416, "mean_token_accuracy": 0.7896359115839005, "num_tokens": 11673978.0, "step": 714 }, { "entropy": 0.5267243683338165, "epoch": 2.667910447761194, "grad_norm": 0.18111757934093475, "learning_rate": 0.0002, "loss": 0.5297276973724365, "mean_token_accuracy": 0.7850082814693451, "num_tokens": 11690084.0, "step": 715 }, { "entropy": 0.5318636000156403, "epoch": 2.671641791044776, "grad_norm": 0.1797971874475479, "learning_rate": 0.0002, "loss": 0.5307915806770325, "mean_token_accuracy": 0.7851123064756393, "num_tokens": 11706504.0, "step": 716 }, { "entropy": 0.5428463369607925, "epoch": 2.675373134328358, "grad_norm": 0.1636015772819519, "learning_rate": 0.0002, "loss": 0.534479558467865, "mean_token_accuracy": 0.7838175147771835, "num_tokens": 11722988.0, "step": 717 }, { "entropy": 0.5360075086355209, "epoch": 2.6791044776119404, "grad_norm": 0.15919257700443268, "learning_rate": 0.0002, "loss": 0.5305730700492859, "mean_token_accuracy": 0.7855097204446793, "num_tokens": 11739438.0, "step": 718 }, { "entropy": 0.5359227359294891, "epoch": 2.6828358208955225, "grad_norm": 0.14643317461013794, "learning_rate": 0.0002, "loss": 0.532948911190033, "mean_token_accuracy": 0.7826716750860214, "num_tokens": 11755793.0, "step": 719 }, { "entropy": 0.508900836110115, "epoch": 2.6865671641791042, "grad_norm": 0.18424049019813538, "learning_rate": 0.0002, "loss": 0.5087383985519409, "mean_token_accuracy": 0.7960971295833588, "num_tokens": 11772140.0, "step": 720 }, { "entropy": 0.5278252959251404, "epoch": 2.6902985074626864, "grad_norm": 0.16620668768882751, "learning_rate": 0.0002, "loss": 0.5323323011398315, "mean_token_accuracy": 0.7838071584701538, "num_tokens": 11788187.0, "step": 721 }, { "entropy": 0.5286207944154739, "epoch": 2.6940298507462686, "grad_norm": 0.18285532295703888, "learning_rate": 0.0002, "loss": 0.5379830598831177, "mean_token_accuracy": 0.7834362238645554, "num_tokens": 11804853.0, "step": 722 }, { "entropy": 0.5304315537214279, "epoch": 2.6977611940298507, "grad_norm": 0.1528841108083725, "learning_rate": 0.0002, "loss": 0.53291916847229, "mean_token_accuracy": 0.7848697453737259, "num_tokens": 11821372.0, "step": 723 }, { "entropy": 0.5269036293029785, "epoch": 2.701492537313433, "grad_norm": 0.16717489063739777, "learning_rate": 0.0002, "loss": 0.5263969898223877, "mean_token_accuracy": 0.7880866229534149, "num_tokens": 11837581.0, "step": 724 }, { "entropy": 0.5256982818245888, "epoch": 2.705223880597015, "grad_norm": 0.15457774698734283, "learning_rate": 0.0002, "loss": 0.5219148993492126, "mean_token_accuracy": 0.7873740494251251, "num_tokens": 11853896.0, "step": 725 }, { "entropy": 0.534528449177742, "epoch": 2.708955223880597, "grad_norm": 0.15566900372505188, "learning_rate": 0.0002, "loss": 0.5313507318496704, "mean_token_accuracy": 0.7871876060962677, "num_tokens": 11869979.0, "step": 726 }, { "entropy": 0.5365303605794907, "epoch": 2.7126865671641793, "grad_norm": 0.16134414076805115, "learning_rate": 0.0002, "loss": 0.5403051972389221, "mean_token_accuracy": 0.7792389243841171, "num_tokens": 11886540.0, "step": 727 }, { "entropy": 0.5314591228961945, "epoch": 2.716417910447761, "grad_norm": 0.20206789672374725, "learning_rate": 0.0002, "loss": 0.5367040038108826, "mean_token_accuracy": 0.785218358039856, "num_tokens": 11902636.0, "step": 728 }, { "entropy": 0.5247315615415573, "epoch": 2.720149253731343, "grad_norm": 0.17510657012462616, "learning_rate": 0.0002, "loss": 0.5183426141738892, "mean_token_accuracy": 0.7929788678884506, "num_tokens": 11918809.0, "step": 729 }, { "entropy": 0.531570702791214, "epoch": 2.7238805970149254, "grad_norm": 0.19654951989650726, "learning_rate": 0.0002, "loss": 0.5312444567680359, "mean_token_accuracy": 0.7852945178747177, "num_tokens": 11934918.0, "step": 730 }, { "entropy": 0.5167503207921982, "epoch": 2.7276119402985075, "grad_norm": 0.18647317588329315, "learning_rate": 0.0002, "loss": 0.521633505821228, "mean_token_accuracy": 0.7868699729442596, "num_tokens": 11951418.0, "step": 731 }, { "entropy": 0.5409902930259705, "epoch": 2.7313432835820897, "grad_norm": 0.16911281645298004, "learning_rate": 0.0002, "loss": 0.5437517166137695, "mean_token_accuracy": 0.7801080495119095, "num_tokens": 11967971.0, "step": 732 }, { "entropy": 0.5430471152067184, "epoch": 2.7350746268656714, "grad_norm": 0.15203061699867249, "learning_rate": 0.0002, "loss": 0.5399286150932312, "mean_token_accuracy": 0.7798464447259903, "num_tokens": 11984465.0, "step": 733 }, { "entropy": 0.5305036455392838, "epoch": 2.7388059701492535, "grad_norm": 0.19002215564250946, "learning_rate": 0.0002, "loss": 0.526854932308197, "mean_token_accuracy": 0.788349375128746, "num_tokens": 12000894.0, "step": 734 }, { "entropy": 0.5385335683822632, "epoch": 2.7425373134328357, "grad_norm": 0.1556226909160614, "learning_rate": 0.0002, "loss": 0.536300003528595, "mean_token_accuracy": 0.7823566943407059, "num_tokens": 12017341.0, "step": 735 }, { "entropy": 0.5280898958444595, "epoch": 2.746268656716418, "grad_norm": 0.22629927098751068, "learning_rate": 0.0002, "loss": 0.5357972979545593, "mean_token_accuracy": 0.7819354236125946, "num_tokens": 12033592.0, "step": 736 }, { "entropy": 0.5210496559739113, "epoch": 2.75, "grad_norm": 0.14672952890396118, "learning_rate": 0.0002, "loss": 0.5192467570304871, "mean_token_accuracy": 0.7897329777479172, "num_tokens": 12050029.0, "step": 737 }, { "entropy": 0.5315113514661789, "epoch": 2.753731343283582, "grad_norm": 0.179401695728302, "learning_rate": 0.0002, "loss": 0.5297517776489258, "mean_token_accuracy": 0.7900628596544266, "num_tokens": 12066356.0, "step": 738 }, { "entropy": 0.5152995735406876, "epoch": 2.7574626865671643, "grad_norm": 0.20404104888439178, "learning_rate": 0.0002, "loss": 0.523341953754425, "mean_token_accuracy": 0.7902668565511703, "num_tokens": 12082476.0, "step": 739 }, { "entropy": 0.5357868671417236, "epoch": 2.7611940298507465, "grad_norm": 0.21347877383232117, "learning_rate": 0.0002, "loss": 0.5397475361824036, "mean_token_accuracy": 0.7817140519618988, "num_tokens": 12098813.0, "step": 740 }, { "entropy": 0.5294998437166214, "epoch": 2.7649253731343286, "grad_norm": 0.19437092542648315, "learning_rate": 0.0002, "loss": 0.5309361219406128, "mean_token_accuracy": 0.785544291138649, "num_tokens": 12115108.0, "step": 741 }, { "entropy": 0.5339842438697815, "epoch": 2.7686567164179103, "grad_norm": 0.211222842335701, "learning_rate": 0.0002, "loss": 0.5336329340934753, "mean_token_accuracy": 0.7840461581945419, "num_tokens": 12131657.0, "step": 742 }, { "entropy": 0.5063766092061996, "epoch": 2.7723880597014925, "grad_norm": 0.18974091112613678, "learning_rate": 0.0002, "loss": 0.5003129243850708, "mean_token_accuracy": 0.7983057200908661, "num_tokens": 12147977.0, "step": 743 }, { "entropy": 0.5348393470048904, "epoch": 2.7761194029850746, "grad_norm": 0.17940539121627808, "learning_rate": 0.0002, "loss": 0.5325519442558289, "mean_token_accuracy": 0.7843880504369736, "num_tokens": 12164476.0, "step": 744 }, { "entropy": 0.5319767147302628, "epoch": 2.779850746268657, "grad_norm": 0.21841664612293243, "learning_rate": 0.0002, "loss": 0.5384219884872437, "mean_token_accuracy": 0.7829115390777588, "num_tokens": 12180665.0, "step": 745 }, { "entropy": 0.5276842713356018, "epoch": 2.783582089552239, "grad_norm": 0.15762406587600708, "learning_rate": 0.0002, "loss": 0.5222536325454712, "mean_token_accuracy": 0.7876606732606888, "num_tokens": 12196994.0, "step": 746 }, { "entropy": 0.5283003747463226, "epoch": 2.7873134328358207, "grad_norm": 0.1740235984325409, "learning_rate": 0.0002, "loss": 0.5262863039970398, "mean_token_accuracy": 0.7871444076299667, "num_tokens": 12213146.0, "step": 747 }, { "entropy": 0.5243652537465096, "epoch": 2.791044776119403, "grad_norm": 0.17303697764873505, "learning_rate": 0.0002, "loss": 0.5288724303245544, "mean_token_accuracy": 0.7889265865087509, "num_tokens": 12229495.0, "step": 748 }, { "entropy": 0.5307216495275497, "epoch": 2.794776119402985, "grad_norm": 0.17367562651634216, "learning_rate": 0.0002, "loss": 0.5350364446640015, "mean_token_accuracy": 0.7828467786312103, "num_tokens": 12245731.0, "step": 749 }, { "entropy": 0.5053429380059242, "epoch": 2.798507462686567, "grad_norm": 0.18273597955703735, "learning_rate": 0.0002, "loss": 0.5170458555221558, "mean_token_accuracy": 0.7908547967672348, "num_tokens": 12261995.0, "step": 750 }, { "entropy": 0.5304894745349884, "epoch": 2.8022388059701493, "grad_norm": 0.19946977496147156, "learning_rate": 0.0002, "loss": 0.5361734628677368, "mean_token_accuracy": 0.7829707115888596, "num_tokens": 12278393.0, "step": 751 }, { "entropy": 0.5383865833282471, "epoch": 2.8059701492537314, "grad_norm": 0.18991155922412872, "learning_rate": 0.0002, "loss": 0.5307108163833618, "mean_token_accuracy": 0.7821619510650635, "num_tokens": 12294798.0, "step": 752 }, { "entropy": 0.5184406042098999, "epoch": 2.8097014925373136, "grad_norm": 0.1910092979669571, "learning_rate": 0.0002, "loss": 0.5096916556358337, "mean_token_accuracy": 0.7956021875143051, "num_tokens": 12311283.0, "step": 753 }, { "entropy": 0.5503049492835999, "epoch": 2.8134328358208958, "grad_norm": 0.16047552227973938, "learning_rate": 0.0002, "loss": 0.5400866270065308, "mean_token_accuracy": 0.781381756067276, "num_tokens": 12327796.0, "step": 754 }, { "entropy": 0.5367267429828644, "epoch": 2.8171641791044775, "grad_norm": 0.17214973270893097, "learning_rate": 0.0002, "loss": 0.533517062664032, "mean_token_accuracy": 0.7842586189508438, "num_tokens": 12344276.0, "step": 755 }, { "entropy": 0.5231245383620262, "epoch": 2.8208955223880596, "grad_norm": 0.20261810719966888, "learning_rate": 0.0002, "loss": 0.5310981869697571, "mean_token_accuracy": 0.7863229364156723, "num_tokens": 12360664.0, "step": 756 }, { "entropy": 0.5025655254721642, "epoch": 2.824626865671642, "grad_norm": 0.23269020020961761, "learning_rate": 0.0002, "loss": 0.5136131644248962, "mean_token_accuracy": 0.7932915538549423, "num_tokens": 12377108.0, "step": 757 }, { "entropy": 0.5385118275880814, "epoch": 2.828358208955224, "grad_norm": 0.17557309567928314, "learning_rate": 0.0002, "loss": 0.5468243956565857, "mean_token_accuracy": 0.7773942649364471, "num_tokens": 12393477.0, "step": 758 }, { "entropy": 0.5556999295949936, "epoch": 2.832089552238806, "grad_norm": 0.18836821615695953, "learning_rate": 0.0002, "loss": 0.5542982816696167, "mean_token_accuracy": 0.7759236544370651, "num_tokens": 12409945.0, "step": 759 }, { "entropy": 0.5397951006889343, "epoch": 2.835820895522388, "grad_norm": 0.16869579255580902, "learning_rate": 0.0002, "loss": 0.5345804691314697, "mean_token_accuracy": 0.7828676253557205, "num_tokens": 12426172.0, "step": 760 }, { "entropy": 0.5465898215770721, "epoch": 2.83955223880597, "grad_norm": 0.1971413791179657, "learning_rate": 0.0002, "loss": 0.5406813621520996, "mean_token_accuracy": 0.7830551862716675, "num_tokens": 12442539.0, "step": 761 }, { "entropy": 0.5412090718746185, "epoch": 2.843283582089552, "grad_norm": 0.16916459798812866, "learning_rate": 0.0002, "loss": 0.5298109650611877, "mean_token_accuracy": 0.7871081382036209, "num_tokens": 12458926.0, "step": 762 }, { "entropy": 0.5222381502389908, "epoch": 2.8470149253731343, "grad_norm": 0.19241978228092194, "learning_rate": 0.0002, "loss": 0.5193473100662231, "mean_token_accuracy": 0.7926554083824158, "num_tokens": 12475192.0, "step": 763 }, { "entropy": 0.5114666819572449, "epoch": 2.8507462686567164, "grad_norm": 0.2026778608560562, "learning_rate": 0.0002, "loss": 0.5210025906562805, "mean_token_accuracy": 0.7880990207195282, "num_tokens": 12491486.0, "step": 764 }, { "entropy": 0.5318130105733871, "epoch": 2.8544776119402986, "grad_norm": 0.18366879224777222, "learning_rate": 0.0002, "loss": 0.5408880710601807, "mean_token_accuracy": 0.7821989059448242, "num_tokens": 12508110.0, "step": 765 }, { "entropy": 0.5178861618041992, "epoch": 2.8582089552238807, "grad_norm": 0.22393299639225006, "learning_rate": 0.0002, "loss": 0.5233381986618042, "mean_token_accuracy": 0.7875554710626602, "num_tokens": 12524419.0, "step": 766 }, { "entropy": 0.5129977464675903, "epoch": 2.861940298507463, "grad_norm": 0.16486415266990662, "learning_rate": 0.0002, "loss": 0.5123316645622253, "mean_token_accuracy": 0.7945219725370407, "num_tokens": 12540623.0, "step": 767 }, { "entropy": 0.5352810174226761, "epoch": 2.8656716417910446, "grad_norm": 0.16391848027706146, "learning_rate": 0.0002, "loss": 0.5287078619003296, "mean_token_accuracy": 0.7864142656326294, "num_tokens": 12556769.0, "step": 768 }, { "entropy": 0.5213837772607803, "epoch": 2.8694029850746268, "grad_norm": 0.15605109930038452, "learning_rate": 0.0002, "loss": 0.5177993774414062, "mean_token_accuracy": 0.791528195142746, "num_tokens": 12572975.0, "step": 769 }, { "entropy": 0.5254454612731934, "epoch": 2.873134328358209, "grad_norm": 0.17228880524635315, "learning_rate": 0.0002, "loss": 0.5218878388404846, "mean_token_accuracy": 0.790112167596817, "num_tokens": 12589664.0, "step": 770 }, { "entropy": 0.5180996954441071, "epoch": 2.876865671641791, "grad_norm": 0.1603233963251114, "learning_rate": 0.0002, "loss": 0.5153653621673584, "mean_token_accuracy": 0.7935372442007065, "num_tokens": 12606393.0, "step": 771 }, { "entropy": 0.5220412835478783, "epoch": 2.8805970149253732, "grad_norm": 0.19191837310791016, "learning_rate": 0.0002, "loss": 0.5350449085235596, "mean_token_accuracy": 0.7817320823669434, "num_tokens": 12622915.0, "step": 772 }, { "entropy": 0.5260520726442337, "epoch": 2.8843283582089554, "grad_norm": 0.1964220553636551, "learning_rate": 0.0002, "loss": 0.5347790718078613, "mean_token_accuracy": 0.7870497107505798, "num_tokens": 12639438.0, "step": 773 }, { "entropy": 0.5259631350636482, "epoch": 2.888059701492537, "grad_norm": 0.1590423583984375, "learning_rate": 0.0002, "loss": 0.5264297723770142, "mean_token_accuracy": 0.7856660634279251, "num_tokens": 12656043.0, "step": 774 }, { "entropy": 0.5494396686553955, "epoch": 2.8917910447761193, "grad_norm": 0.166259765625, "learning_rate": 0.0002, "loss": 0.541179895401001, "mean_token_accuracy": 0.7822139710187912, "num_tokens": 12672530.0, "step": 775 }, { "entropy": 0.5362062454223633, "epoch": 2.8955223880597014, "grad_norm": 0.16349440813064575, "learning_rate": 0.0002, "loss": 0.530780017375946, "mean_token_accuracy": 0.7863557487726212, "num_tokens": 12689021.0, "step": 776 }, { "entropy": 0.5223592668771744, "epoch": 2.8992537313432836, "grad_norm": 0.15761977434158325, "learning_rate": 0.0002, "loss": 0.5155429244041443, "mean_token_accuracy": 0.7907254546880722, "num_tokens": 12705262.0, "step": 777 }, { "entropy": 0.5258801132440567, "epoch": 2.9029850746268657, "grad_norm": 0.1883028894662857, "learning_rate": 0.0002, "loss": 0.529833972454071, "mean_token_accuracy": 0.7863512486219406, "num_tokens": 12721511.0, "step": 778 }, { "entropy": 0.5216899961233139, "epoch": 2.906716417910448, "grad_norm": 0.16059532761573792, "learning_rate": 0.0002, "loss": 0.522499680519104, "mean_token_accuracy": 0.7899018228054047, "num_tokens": 12738089.0, "step": 779 }, { "entropy": 0.520403303205967, "epoch": 2.91044776119403, "grad_norm": 0.1771392673254013, "learning_rate": 0.0002, "loss": 0.5236196517944336, "mean_token_accuracy": 0.7879007905721664, "num_tokens": 12754592.0, "step": 780 }, { "entropy": 0.5242541432380676, "epoch": 2.914179104477612, "grad_norm": 0.17634879052639008, "learning_rate": 0.0002, "loss": 0.5289914011955261, "mean_token_accuracy": 0.7824440151453018, "num_tokens": 12770734.0, "step": 781 }, { "entropy": 0.5201637446880341, "epoch": 2.917910447761194, "grad_norm": 0.17048649489879608, "learning_rate": 0.0002, "loss": 0.5211310386657715, "mean_token_accuracy": 0.7937574684619904, "num_tokens": 12787160.0, "step": 782 }, { "entropy": 0.5204057991504669, "epoch": 2.921641791044776, "grad_norm": 0.15417909622192383, "learning_rate": 0.0002, "loss": 0.517360508441925, "mean_token_accuracy": 0.7929933965206146, "num_tokens": 12803683.0, "step": 783 }, { "entropy": 0.545757845044136, "epoch": 2.925373134328358, "grad_norm": 0.1549869030714035, "learning_rate": 0.0002, "loss": 0.5414532423019409, "mean_token_accuracy": 0.7788090705871582, "num_tokens": 12819951.0, "step": 784 }, { "entropy": 0.5228646248579025, "epoch": 2.9291044776119404, "grad_norm": 0.15743686258792877, "learning_rate": 0.0002, "loss": 0.516430675983429, "mean_token_accuracy": 0.7925095409154892, "num_tokens": 12836413.0, "step": 785 }, { "entropy": 0.5214046537876129, "epoch": 2.9328358208955225, "grad_norm": 0.16672447323799133, "learning_rate": 0.0002, "loss": 0.5222574472427368, "mean_token_accuracy": 0.7870719730854034, "num_tokens": 12852872.0, "step": 786 }, { "entropy": 0.5317943245172501, "epoch": 2.9365671641791042, "grad_norm": 0.21642933785915375, "learning_rate": 0.0002, "loss": 0.5372959971427917, "mean_token_accuracy": 0.7832164466381073, "num_tokens": 12869405.0, "step": 787 }, { "entropy": 0.5113082602620125, "epoch": 2.9402985074626864, "grad_norm": 0.22133168578147888, "learning_rate": 0.0002, "loss": 0.522553563117981, "mean_token_accuracy": 0.7871409952640533, "num_tokens": 12885593.0, "step": 788 }, { "entropy": 0.5275594145059586, "epoch": 2.9440298507462686, "grad_norm": 0.20494818687438965, "learning_rate": 0.0002, "loss": 0.5326835513114929, "mean_token_accuracy": 0.7843892127275467, "num_tokens": 12901950.0, "step": 789 }, { "entropy": 0.5371553599834442, "epoch": 2.9477611940298507, "grad_norm": 0.16483525931835175, "learning_rate": 0.0002, "loss": 0.5343260765075684, "mean_token_accuracy": 0.7844540178775787, "num_tokens": 12918538.0, "step": 790 }, { "entropy": 0.5248367339372635, "epoch": 2.951492537313433, "grad_norm": 0.20370911061763763, "learning_rate": 0.0002, "loss": 0.5262700915336609, "mean_token_accuracy": 0.7856797575950623, "num_tokens": 12935041.0, "step": 791 }, { "entropy": 0.5536757409572601, "epoch": 2.955223880597015, "grad_norm": 0.15302392840385437, "learning_rate": 0.0002, "loss": 0.5451865196228027, "mean_token_accuracy": 0.781255841255188, "num_tokens": 12951793.0, "step": 792 }, { "entropy": 0.5070596486330032, "epoch": 2.958955223880597, "grad_norm": 0.20451144874095917, "learning_rate": 0.0002, "loss": 0.5115755796432495, "mean_token_accuracy": 0.7904744446277618, "num_tokens": 12968060.0, "step": 793 }, { "entropy": 0.5260060653090477, "epoch": 2.9626865671641793, "grad_norm": 0.16183388233184814, "learning_rate": 0.0002, "loss": 0.5244185328483582, "mean_token_accuracy": 0.7878494709730148, "num_tokens": 12984541.0, "step": 794 }, { "entropy": 0.5389718413352966, "epoch": 2.966417910447761, "grad_norm": 0.17704468965530396, "learning_rate": 0.0002, "loss": 0.5415879487991333, "mean_token_accuracy": 0.7840642035007477, "num_tokens": 13000817.0, "step": 795 }, { "entropy": 0.5400192737579346, "epoch": 2.970149253731343, "grad_norm": 0.16612157225608826, "learning_rate": 0.0002, "loss": 0.5336055755615234, "mean_token_accuracy": 0.7857667803764343, "num_tokens": 13016973.0, "step": 796 }, { "entropy": 0.5179389715194702, "epoch": 2.9738805970149254, "grad_norm": 0.16657505929470062, "learning_rate": 0.0002, "loss": 0.5218580365180969, "mean_token_accuracy": 0.7903915345668793, "num_tokens": 13033299.0, "step": 797 }, { "entropy": 0.5229775831103325, "epoch": 2.9776119402985075, "grad_norm": 0.1601499617099762, "learning_rate": 0.0002, "loss": 0.5244333744049072, "mean_token_accuracy": 0.7875324189662933, "num_tokens": 13049754.0, "step": 798 }, { "entropy": 0.5364563912153244, "epoch": 2.9813432835820897, "grad_norm": 0.17928777635097504, "learning_rate": 0.0002, "loss": 0.5421883463859558, "mean_token_accuracy": 0.7822880148887634, "num_tokens": 13066045.0, "step": 799 }, { "entropy": 0.5202258825302124, "epoch": 2.9850746268656714, "grad_norm": 0.1714518666267395, "learning_rate": 0.0002, "loss": 0.5221466422080994, "mean_token_accuracy": 0.7896016389131546, "num_tokens": 13082398.0, "step": 800 }, { "entropy": 0.526955708861351, "epoch": 2.9888059701492535, "grad_norm": 0.1565951555967331, "learning_rate": 0.0002, "loss": 0.521065354347229, "mean_token_accuracy": 0.7919437438249588, "num_tokens": 13098966.0, "step": 801 }, { "entropy": 0.5393194705247879, "epoch": 2.9925373134328357, "grad_norm": 0.1675749570131302, "learning_rate": 0.0002, "loss": 0.5336388945579529, "mean_token_accuracy": 0.7851084172725677, "num_tokens": 13115333.0, "step": 802 }, { "entropy": 0.5270961374044418, "epoch": 2.996268656716418, "grad_norm": 0.17216360569000244, "learning_rate": 0.0002, "loss": 0.5220625400543213, "mean_token_accuracy": 0.7888612896203995, "num_tokens": 13131491.0, "step": 803 }, { "entropy": 0.5005228817462921, "epoch": 3.0, "grad_norm": 0.1877554953098297, "learning_rate": 0.0002, "loss": 0.5059037208557129, "mean_token_accuracy": 0.797055795788765, "num_tokens": 13147551.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2254562163611402e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }