{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.126197099685669, "epoch": 0.003738317757009346, "grad_norm": 0.4137735962867737, "learning_rate": 0.0002, "loss": 2.431535243988037, "mean_token_accuracy": 0.54428631067276, "num_tokens": 16465.0, "step": 1 }, { "entropy": 1.2562520503997803, "epoch": 0.007476635514018692, "grad_norm": 0.3902691900730133, "learning_rate": 0.0002, "loss": 2.188866376876831, "mean_token_accuracy": 0.5568228960037231, "num_tokens": 32573.0, "step": 2 }, { "entropy": 1.4093195796012878, "epoch": 0.011214953271028037, "grad_norm": 0.29741090536117554, "learning_rate": 0.0002, "loss": 1.7309190034866333, "mean_token_accuracy": 0.591301366686821, "num_tokens": 48848.0, "step": 3 }, { "entropy": 1.3904370069503784, "epoch": 0.014953271028037384, "grad_norm": 0.24415643513202667, "learning_rate": 0.0002, "loss": 1.4167925119400024, "mean_token_accuracy": 0.6262245625257492, "num_tokens": 64779.0, "step": 4 }, { "entropy": 1.3590968251228333, "epoch": 0.018691588785046728, "grad_norm": 0.2501066327095032, "learning_rate": 0.0002, "loss": 1.3086440563201904, "mean_token_accuracy": 0.6442629396915436, "num_tokens": 81017.0, "step": 5 }, { "entropy": 1.2659040987491608, "epoch": 0.022429906542056073, "grad_norm": 0.13132381439208984, "learning_rate": 0.0002, "loss": 1.1781953573226929, "mean_token_accuracy": 0.6602727770805359, "num_tokens": 97143.0, "step": 6 }, { "entropy": 1.1754920184612274, "epoch": 0.026168224299065422, "grad_norm": 0.10863616317510605, "learning_rate": 0.0002, "loss": 1.0758289098739624, "mean_token_accuracy": 0.6747478097677231, "num_tokens": 113270.0, "step": 7 }, { "entropy": 1.1110295355319977, "epoch": 0.029906542056074768, "grad_norm": 0.11261261999607086, "learning_rate": 0.0002, "loss": 1.0382510423660278, "mean_token_accuracy": 0.6741550117731094, "num_tokens": 129740.0, "step": 8 }, { "entropy": 1.0438694655895233, "epoch": 0.03364485981308411, "grad_norm": 0.1300426870584488, "learning_rate": 0.0002, "loss": 0.9842232465744019, "mean_token_accuracy": 0.6938712894916534, "num_tokens": 146153.0, "step": 9 }, { "entropy": 0.980072870850563, "epoch": 0.037383177570093455, "grad_norm": 0.1279866099357605, "learning_rate": 0.0002, "loss": 0.907992422580719, "mean_token_accuracy": 0.7037613391876221, "num_tokens": 162400.0, "step": 10 }, { "entropy": 0.9512171745300293, "epoch": 0.041121495327102804, "grad_norm": 0.11444728821516037, "learning_rate": 0.0002, "loss": 0.8603078722953796, "mean_token_accuracy": 0.7085670977830887, "num_tokens": 178596.0, "step": 11 }, { "entropy": 0.9008210897445679, "epoch": 0.044859813084112146, "grad_norm": 0.1163485050201416, "learning_rate": 0.0002, "loss": 0.8202763199806213, "mean_token_accuracy": 0.7147757261991501, "num_tokens": 194960.0, "step": 12 }, { "entropy": 0.8144031316041946, "epoch": 0.048598130841121495, "grad_norm": 1.8727822303771973, "learning_rate": 0.0002, "loss": 0.7989485859870911, "mean_token_accuracy": 0.714598998427391, "num_tokens": 211519.0, "step": 13 }, { "entropy": 0.7731810510158539, "epoch": 0.052336448598130844, "grad_norm": 0.40646815299987793, "learning_rate": 0.0002, "loss": 0.7675734162330627, "mean_token_accuracy": 0.7164532542228699, "num_tokens": 227947.0, "step": 14 }, { "entropy": 0.7750754952430725, "epoch": 0.056074766355140186, "grad_norm": 0.0927761048078537, "learning_rate": 0.0002, "loss": 0.752495527267456, "mean_token_accuracy": 0.7247887402772903, "num_tokens": 244285.0, "step": 15 }, { "entropy": 0.7294797450304031, "epoch": 0.059813084112149535, "grad_norm": 0.09633366763591766, "learning_rate": 0.0002, "loss": 0.7139282822608948, "mean_token_accuracy": 0.733425110578537, "num_tokens": 260524.0, "step": 16 }, { "entropy": 0.7113516181707382, "epoch": 0.06355140186915888, "grad_norm": 0.08278490602970123, "learning_rate": 0.0002, "loss": 0.69715416431427, "mean_token_accuracy": 0.7404225617647171, "num_tokens": 276676.0, "step": 17 }, { "entropy": 0.6892006993293762, "epoch": 0.06728971962616823, "grad_norm": 0.09702161699533463, "learning_rate": 0.0002, "loss": 0.6832636594772339, "mean_token_accuracy": 0.7384749203920364, "num_tokens": 293327.0, "step": 18 }, { "entropy": 0.683604821562767, "epoch": 0.07102803738317758, "grad_norm": 0.09970250725746155, "learning_rate": 0.0002, "loss": 0.6719778776168823, "mean_token_accuracy": 0.7447258532047272, "num_tokens": 309768.0, "step": 19 }, { "entropy": 0.6530238687992096, "epoch": 0.07476635514018691, "grad_norm": 0.08765958249568939, "learning_rate": 0.0002, "loss": 0.6265610456466675, "mean_token_accuracy": 0.7607048451900482, "num_tokens": 325953.0, "step": 20 }, { "entropy": 0.6858675181865692, "epoch": 0.07850467289719626, "grad_norm": 0.1555248200893402, "learning_rate": 0.0002, "loss": 0.653350830078125, "mean_token_accuracy": 0.7461759150028229, "num_tokens": 342357.0, "step": 21 }, { "entropy": 0.6731577664613724, "epoch": 0.08224299065420561, "grad_norm": 0.07943135499954224, "learning_rate": 0.0002, "loss": 0.6468416452407837, "mean_token_accuracy": 0.745930403470993, "num_tokens": 358780.0, "step": 22 }, { "entropy": 0.6372379511594772, "epoch": 0.08598130841121496, "grad_norm": 0.07176131755113602, "learning_rate": 0.0002, "loss": 0.6231244802474976, "mean_token_accuracy": 0.757389485836029, "num_tokens": 375059.0, "step": 23 }, { "entropy": 0.6160608530044556, "epoch": 0.08971962616822429, "grad_norm": 0.09053056687116623, "learning_rate": 0.0002, "loss": 0.6240095496177673, "mean_token_accuracy": 0.7537032961845398, "num_tokens": 391372.0, "step": 24 }, { "entropy": 0.6163977682590485, "epoch": 0.09345794392523364, "grad_norm": 0.06957540661096573, "learning_rate": 0.0002, "loss": 0.6137739419937134, "mean_token_accuracy": 0.7591944634914398, "num_tokens": 407634.0, "step": 25 }, { "entropy": 0.6172843426465988, "epoch": 0.09719626168224299, "grad_norm": 0.06831946223974228, "learning_rate": 0.0002, "loss": 0.6151383519172668, "mean_token_accuracy": 0.7588979452848434, "num_tokens": 424139.0, "step": 26 }, { "entropy": 0.6146537363529205, "epoch": 0.10093457943925234, "grad_norm": 0.06785774976015091, "learning_rate": 0.0002, "loss": 0.6100280284881592, "mean_token_accuracy": 0.7608075141906738, "num_tokens": 440251.0, "step": 27 }, { "entropy": 0.5965892523527145, "epoch": 0.10467289719626169, "grad_norm": 0.06592898070812225, "learning_rate": 0.0002, "loss": 0.5876743793487549, "mean_token_accuracy": 0.7687714993953705, "num_tokens": 456512.0, "step": 28 }, { "entropy": 0.6143475025892258, "epoch": 0.10841121495327102, "grad_norm": 0.06412907689809799, "learning_rate": 0.0002, "loss": 0.6119903326034546, "mean_token_accuracy": 0.7573402374982834, "num_tokens": 472958.0, "step": 29 }, { "entropy": 0.5956396609544754, "epoch": 0.11214953271028037, "grad_norm": 0.06444356590509415, "learning_rate": 0.0002, "loss": 0.594578206539154, "mean_token_accuracy": 0.7660299837589264, "num_tokens": 489407.0, "step": 30 }, { "entropy": 0.5987770259380341, "epoch": 0.11588785046728972, "grad_norm": 0.05562213435769081, "learning_rate": 0.0002, "loss": 0.5932596921920776, "mean_token_accuracy": 0.7620532661676407, "num_tokens": 506104.0, "step": 31 }, { "entropy": 0.5812755525112152, "epoch": 0.11962616822429907, "grad_norm": 0.060992538928985596, "learning_rate": 0.0002, "loss": 0.5729696154594421, "mean_token_accuracy": 0.7730918079614639, "num_tokens": 522565.0, "step": 32 }, { "entropy": 0.5877644866704941, "epoch": 0.1233644859813084, "grad_norm": 0.05839328467845917, "learning_rate": 0.0002, "loss": 0.5913704633712769, "mean_token_accuracy": 0.7656503766775131, "num_tokens": 539081.0, "step": 33 }, { "entropy": 0.5780033618211746, "epoch": 0.12710280373831775, "grad_norm": 0.05193523317575455, "learning_rate": 0.0002, "loss": 0.5819685459136963, "mean_token_accuracy": 0.7665455341339111, "num_tokens": 555504.0, "step": 34 }, { "entropy": 0.5869153290987015, "epoch": 0.1308411214953271, "grad_norm": 0.06890807300806046, "learning_rate": 0.0002, "loss": 0.5857660174369812, "mean_token_accuracy": 0.7676131427288055, "num_tokens": 572153.0, "step": 35 }, { "entropy": 0.5672304034233093, "epoch": 0.13457943925233645, "grad_norm": 0.05624233931303024, "learning_rate": 0.0002, "loss": 0.5718747973442078, "mean_token_accuracy": 0.7710311710834503, "num_tokens": 588585.0, "step": 36 }, { "entropy": 0.5678977817296982, "epoch": 0.1383177570093458, "grad_norm": 0.06091594323515892, "learning_rate": 0.0002, "loss": 0.5765193104743958, "mean_token_accuracy": 0.7686972767114639, "num_tokens": 604864.0, "step": 37 }, { "entropy": 0.5863034427165985, "epoch": 0.14205607476635515, "grad_norm": 0.07292835414409637, "learning_rate": 0.0002, "loss": 0.597279965877533, "mean_token_accuracy": 0.7606304287910461, "num_tokens": 621080.0, "step": 38 }, { "entropy": 0.5759021639823914, "epoch": 0.14579439252336449, "grad_norm": 0.05464645475149155, "learning_rate": 0.0002, "loss": 0.570218563079834, "mean_token_accuracy": 0.770964503288269, "num_tokens": 637503.0, "step": 39 }, { "entropy": 0.5763402879238129, "epoch": 0.14953271028037382, "grad_norm": 0.056617990136146545, "learning_rate": 0.0002, "loss": 0.5686919093132019, "mean_token_accuracy": 0.7723182737827301, "num_tokens": 653609.0, "step": 40 }, { "entropy": 0.6039886325597763, "epoch": 0.15327102803738318, "grad_norm": 0.04869381710886955, "learning_rate": 0.0002, "loss": 0.5939038395881653, "mean_token_accuracy": 0.7607405036687851, "num_tokens": 669981.0, "step": 41 }, { "entropy": 0.5946750342845917, "epoch": 0.15700934579439252, "grad_norm": 0.046227701008319855, "learning_rate": 0.0002, "loss": 0.589706301689148, "mean_token_accuracy": 0.7646626383066177, "num_tokens": 686537.0, "step": 42 }, { "entropy": 0.5577073395252228, "epoch": 0.16074766355140188, "grad_norm": 0.04413911700248718, "learning_rate": 0.0002, "loss": 0.559436023235321, "mean_token_accuracy": 0.7762598991394043, "num_tokens": 702686.0, "step": 43 }, { "entropy": 0.5665079057216644, "epoch": 0.16448598130841122, "grad_norm": 0.047774720937013626, "learning_rate": 0.0002, "loss": 0.5647708773612976, "mean_token_accuracy": 0.7764726728200912, "num_tokens": 718966.0, "step": 44 }, { "entropy": 0.5726076513528824, "epoch": 0.16822429906542055, "grad_norm": 0.05053015798330307, "learning_rate": 0.0002, "loss": 0.5747931003570557, "mean_token_accuracy": 0.7704672068357468, "num_tokens": 735364.0, "step": 45 }, { "entropy": 0.5688610672950745, "epoch": 0.17196261682242991, "grad_norm": 0.037495676428079605, "learning_rate": 0.0002, "loss": 0.5652605295181274, "mean_token_accuracy": 0.770918145775795, "num_tokens": 751902.0, "step": 46 }, { "entropy": 0.581221267580986, "epoch": 0.17570093457943925, "grad_norm": 0.051694370806217194, "learning_rate": 0.0002, "loss": 0.5826902389526367, "mean_token_accuracy": 0.7654351443052292, "num_tokens": 768151.0, "step": 47 }, { "entropy": 0.5708408057689667, "epoch": 0.17943925233644858, "grad_norm": 0.04264647886157036, "learning_rate": 0.0002, "loss": 0.5651251673698425, "mean_token_accuracy": 0.7749274671077728, "num_tokens": 784511.0, "step": 48 }, { "entropy": 0.5757250636816025, "epoch": 0.18317757009345795, "grad_norm": 0.050725825130939484, "learning_rate": 0.0002, "loss": 0.5704944133758545, "mean_token_accuracy": 0.7680549174547195, "num_tokens": 800966.0, "step": 49 }, { "entropy": 0.5546318888664246, "epoch": 0.18691588785046728, "grad_norm": 0.03947490453720093, "learning_rate": 0.0002, "loss": 0.5488482713699341, "mean_token_accuracy": 0.7769860327243805, "num_tokens": 817293.0, "step": 50 }, { "entropy": 0.5634811520576477, "epoch": 0.19065420560747665, "grad_norm": 0.049806442111730576, "learning_rate": 0.0002, "loss": 0.5557321906089783, "mean_token_accuracy": 0.7740621268749237, "num_tokens": 833385.0, "step": 51 }, { "entropy": 0.582123801112175, "epoch": 0.19439252336448598, "grad_norm": 0.0458400622010231, "learning_rate": 0.0002, "loss": 0.5802882313728333, "mean_token_accuracy": 0.7661796510219574, "num_tokens": 849741.0, "step": 52 }, { "entropy": 0.5494910776615143, "epoch": 0.19813084112149532, "grad_norm": 0.04727543145418167, "learning_rate": 0.0002, "loss": 0.554188072681427, "mean_token_accuracy": 0.7779219001531601, "num_tokens": 865884.0, "step": 53 }, { "entropy": 0.568273514509201, "epoch": 0.20186915887850468, "grad_norm": 0.052229855209589005, "learning_rate": 0.0002, "loss": 0.5752811431884766, "mean_token_accuracy": 0.7671186923980713, "num_tokens": 882348.0, "step": 54 }, { "entropy": 0.5694270133972168, "epoch": 0.205607476635514, "grad_norm": 0.04475817084312439, "learning_rate": 0.0002, "loss": 0.5706926584243774, "mean_token_accuracy": 0.7702507525682449, "num_tokens": 898544.0, "step": 55 }, { "entropy": 0.5677521079778671, "epoch": 0.20934579439252338, "grad_norm": 0.03592672944068909, "learning_rate": 0.0002, "loss": 0.5723967552185059, "mean_token_accuracy": 0.766302615404129, "num_tokens": 914946.0, "step": 56 }, { "entropy": 0.5698029845952988, "epoch": 0.2130841121495327, "grad_norm": 0.04732033982872963, "learning_rate": 0.0002, "loss": 0.5640438795089722, "mean_token_accuracy": 0.7732385843992233, "num_tokens": 931100.0, "step": 57 }, { "entropy": 0.5775126665830612, "epoch": 0.21682242990654205, "grad_norm": 0.04193758964538574, "learning_rate": 0.0002, "loss": 0.5704541802406311, "mean_token_accuracy": 0.7691217958927155, "num_tokens": 947448.0, "step": 58 }, { "entropy": 0.5770154148340225, "epoch": 0.2205607476635514, "grad_norm": 0.035865288227796555, "learning_rate": 0.0002, "loss": 0.5679229497909546, "mean_token_accuracy": 0.7680188864469528, "num_tokens": 963902.0, "step": 59 }, { "entropy": 0.5588070899248123, "epoch": 0.22429906542056074, "grad_norm": 0.04689257591962814, "learning_rate": 0.0002, "loss": 0.5615048408508301, "mean_token_accuracy": 0.7748474776744843, "num_tokens": 980180.0, "step": 60 }, { "entropy": 0.57504902780056, "epoch": 0.22803738317757008, "grad_norm": 0.04198114946484566, "learning_rate": 0.0002, "loss": 0.577617883682251, "mean_token_accuracy": 0.7648669481277466, "num_tokens": 996613.0, "step": 61 }, { "entropy": 0.5450393110513687, "epoch": 0.23177570093457944, "grad_norm": 0.040139347314834595, "learning_rate": 0.0002, "loss": 0.552120566368103, "mean_token_accuracy": 0.7774388641119003, "num_tokens": 1012686.0, "step": 62 }, { "entropy": 0.5609021335840225, "epoch": 0.23551401869158878, "grad_norm": 0.03753409534692764, "learning_rate": 0.0002, "loss": 0.5530397295951843, "mean_token_accuracy": 0.7765212655067444, "num_tokens": 1028835.0, "step": 63 }, { "entropy": 0.5794262290000916, "epoch": 0.23925233644859814, "grad_norm": 0.035354360938072205, "learning_rate": 0.0002, "loss": 0.5788048505783081, "mean_token_accuracy": 0.7663274556398392, "num_tokens": 1045176.0, "step": 64 }, { "entropy": 0.5655659288167953, "epoch": 0.24299065420560748, "grad_norm": 0.03588757663965225, "learning_rate": 0.0002, "loss": 0.5581645369529724, "mean_token_accuracy": 0.7732069790363312, "num_tokens": 1061452.0, "step": 65 }, { "entropy": 0.5672483444213867, "epoch": 0.2467289719626168, "grad_norm": 0.036772388964891434, "learning_rate": 0.0002, "loss": 0.5631874203681946, "mean_token_accuracy": 0.7695926129817963, "num_tokens": 1077997.0, "step": 66 }, { "entropy": 0.578306719660759, "epoch": 0.2504672897196262, "grad_norm": 0.039442483335733414, "learning_rate": 0.0002, "loss": 0.5765112638473511, "mean_token_accuracy": 0.7657738327980042, "num_tokens": 1094247.0, "step": 67 }, { "entropy": 0.5700875818729401, "epoch": 0.2542056074766355, "grad_norm": 0.0448731891810894, "learning_rate": 0.0002, "loss": 0.574236273765564, "mean_token_accuracy": 0.7669749855995178, "num_tokens": 1110470.0, "step": 68 }, { "entropy": 0.5609024912118912, "epoch": 0.25794392523364484, "grad_norm": 0.033255062997341156, "learning_rate": 0.0002, "loss": 0.5576102137565613, "mean_token_accuracy": 0.7776026874780655, "num_tokens": 1127050.0, "step": 69 }, { "entropy": 0.5673299431800842, "epoch": 0.2616822429906542, "grad_norm": 0.03715064004063606, "learning_rate": 0.0002, "loss": 0.5695099234580994, "mean_token_accuracy": 0.7701731324195862, "num_tokens": 1143383.0, "step": 70 }, { "entropy": 0.560445249080658, "epoch": 0.26542056074766357, "grad_norm": 0.04453396797180176, "learning_rate": 0.0002, "loss": 0.5644095540046692, "mean_token_accuracy": 0.7720398306846619, "num_tokens": 1159597.0, "step": 71 }, { "entropy": 0.5526476353406906, "epoch": 0.2691588785046729, "grad_norm": 0.039633698761463165, "learning_rate": 0.0002, "loss": 0.5499011874198914, "mean_token_accuracy": 0.7772456705570221, "num_tokens": 1175764.0, "step": 72 }, { "entropy": 0.5623870193958282, "epoch": 0.27289719626168224, "grad_norm": 0.036508623510599136, "learning_rate": 0.0002, "loss": 0.5721215009689331, "mean_token_accuracy": 0.7691169232130051, "num_tokens": 1192041.0, "step": 73 }, { "entropy": 0.5718335658311844, "epoch": 0.2766355140186916, "grad_norm": 0.044028230011463165, "learning_rate": 0.0002, "loss": 0.5752332806587219, "mean_token_accuracy": 0.7687042653560638, "num_tokens": 1208468.0, "step": 74 }, { "entropy": 0.5587927252054214, "epoch": 0.2803738317757009, "grad_norm": 0.04269316419959068, "learning_rate": 0.0002, "loss": 0.5531549453735352, "mean_token_accuracy": 0.7755036056041718, "num_tokens": 1224757.0, "step": 75 }, { "entropy": 0.5787914991378784, "epoch": 0.2841121495327103, "grad_norm": 0.040728773921728134, "learning_rate": 0.0002, "loss": 0.5694252252578735, "mean_token_accuracy": 0.7696126103401184, "num_tokens": 1241162.0, "step": 76 }, { "entropy": 0.5616230517625809, "epoch": 0.28785046728971964, "grad_norm": 0.037814315408468246, "learning_rate": 0.0002, "loss": 0.5627362728118896, "mean_token_accuracy": 0.7735611200332642, "num_tokens": 1257583.0, "step": 77 }, { "entropy": 0.567746564745903, "epoch": 0.29158878504672897, "grad_norm": 0.03843110799789429, "learning_rate": 0.0002, "loss": 0.5634809732437134, "mean_token_accuracy": 0.7711174041032791, "num_tokens": 1274115.0, "step": 78 }, { "entropy": 0.5585684925317764, "epoch": 0.2953271028037383, "grad_norm": 0.03358754143118858, "learning_rate": 0.0002, "loss": 0.5604900121688843, "mean_token_accuracy": 0.7713887989521027, "num_tokens": 1290371.0, "step": 79 }, { "entropy": 0.5650099366903305, "epoch": 0.29906542056074764, "grad_norm": 0.038185376673936844, "learning_rate": 0.0002, "loss": 0.5694409608840942, "mean_token_accuracy": 0.7706831693649292, "num_tokens": 1306602.0, "step": 80 }, { "entropy": 0.5573018193244934, "epoch": 0.30280373831775703, "grad_norm": 0.04070131108164787, "learning_rate": 0.0002, "loss": 0.5703440308570862, "mean_token_accuracy": 0.771970734000206, "num_tokens": 1322957.0, "step": 81 }, { "entropy": 0.545403316617012, "epoch": 0.30654205607476637, "grad_norm": 0.04340139031410217, "learning_rate": 0.0002, "loss": 0.5498678088188171, "mean_token_accuracy": 0.7774094045162201, "num_tokens": 1339233.0, "step": 82 }, { "entropy": 0.5381540507078171, "epoch": 0.3102803738317757, "grad_norm": 0.039635106921195984, "learning_rate": 0.0002, "loss": 0.542028546333313, "mean_token_accuracy": 0.7835624068975449, "num_tokens": 1355463.0, "step": 83 }, { "entropy": 0.5599908977746964, "epoch": 0.31401869158878504, "grad_norm": 0.039568379521369934, "learning_rate": 0.0002, "loss": 0.5559767484664917, "mean_token_accuracy": 0.7765284180641174, "num_tokens": 1371815.0, "step": 84 }, { "entropy": 0.5593477934598923, "epoch": 0.3177570093457944, "grad_norm": 0.039335861802101135, "learning_rate": 0.0002, "loss": 0.5506576895713806, "mean_token_accuracy": 0.7803503125905991, "num_tokens": 1388181.0, "step": 85 }, { "entropy": 0.5572251528501511, "epoch": 0.32149532710280376, "grad_norm": 0.03665383532643318, "learning_rate": 0.0002, "loss": 0.5480077862739563, "mean_token_accuracy": 0.7788248509168625, "num_tokens": 1404584.0, "step": 86 }, { "entropy": 0.5664831250905991, "epoch": 0.3252336448598131, "grad_norm": 0.040541525930166245, "learning_rate": 0.0002, "loss": 0.5769516229629517, "mean_token_accuracy": 0.7674112915992737, "num_tokens": 1420963.0, "step": 87 }, { "entropy": 0.5584649592638016, "epoch": 0.32897196261682243, "grad_norm": 0.033256057649850845, "learning_rate": 0.0002, "loss": 0.5648812651634216, "mean_token_accuracy": 0.7723092287778854, "num_tokens": 1437122.0, "step": 88 }, { "entropy": 0.5519673079252243, "epoch": 0.33271028037383177, "grad_norm": 0.031988468021154404, "learning_rate": 0.0002, "loss": 0.551476776599884, "mean_token_accuracy": 0.7795782834291458, "num_tokens": 1453481.0, "step": 89 }, { "entropy": 0.5844476372003555, "epoch": 0.3364485981308411, "grad_norm": 0.037734005600214005, "learning_rate": 0.0002, "loss": 0.5850376486778259, "mean_token_accuracy": 0.7618721723556519, "num_tokens": 1469968.0, "step": 90 }, { "entropy": 0.5527342259883881, "epoch": 0.3401869158878505, "grad_norm": 0.03733964264392853, "learning_rate": 0.0002, "loss": 0.5517382621765137, "mean_token_accuracy": 0.7791167348623276, "num_tokens": 1486410.0, "step": 91 }, { "entropy": 0.5490231364965439, "epoch": 0.34392523364485983, "grad_norm": 0.03796572983264923, "learning_rate": 0.0002, "loss": 0.5472099781036377, "mean_token_accuracy": 0.7787582278251648, "num_tokens": 1502827.0, "step": 92 }, { "entropy": 0.5654839426279068, "epoch": 0.34766355140186916, "grad_norm": 0.03400302678346634, "learning_rate": 0.0002, "loss": 0.5675226449966431, "mean_token_accuracy": 0.7715823501348495, "num_tokens": 1519035.0, "step": 93 }, { "entropy": 0.5789331346750259, "epoch": 0.3514018691588785, "grad_norm": 0.03300806134939194, "learning_rate": 0.0002, "loss": 0.5738787055015564, "mean_token_accuracy": 0.7701004296541214, "num_tokens": 1535776.0, "step": 94 }, { "entropy": 0.5546596646308899, "epoch": 0.35514018691588783, "grad_norm": 0.03256770223379135, "learning_rate": 0.0002, "loss": 0.5567547082901001, "mean_token_accuracy": 0.7791133224964142, "num_tokens": 1552013.0, "step": 95 }, { "entropy": 0.5764150321483612, "epoch": 0.35887850467289717, "grad_norm": 0.03291841968894005, "learning_rate": 0.0002, "loss": 0.5735791921615601, "mean_token_accuracy": 0.770502358675003, "num_tokens": 1568424.0, "step": 96 }, { "entropy": 0.5675235092639923, "epoch": 0.36261682242990656, "grad_norm": 0.03169221803545952, "learning_rate": 0.0002, "loss": 0.567868709564209, "mean_token_accuracy": 0.7711145430803299, "num_tokens": 1584887.0, "step": 97 }, { "entropy": 0.5626550316810608, "epoch": 0.3663551401869159, "grad_norm": 0.03811025619506836, "learning_rate": 0.0002, "loss": 0.5668138265609741, "mean_token_accuracy": 0.772192656993866, "num_tokens": 1601260.0, "step": 98 }, { "entropy": 0.5581237971782684, "epoch": 0.37009345794392523, "grad_norm": 0.03798513859510422, "learning_rate": 0.0002, "loss": 0.5674142241477966, "mean_token_accuracy": 0.7706556767225266, "num_tokens": 1617528.0, "step": 99 }, { "entropy": 0.5649739503860474, "epoch": 0.37383177570093457, "grad_norm": 0.03556443750858307, "learning_rate": 0.0002, "loss": 0.5644899606704712, "mean_token_accuracy": 0.7701123207807541, "num_tokens": 1633885.0, "step": 100 }, { "entropy": 0.5828528255224228, "epoch": 0.3775700934579439, "grad_norm": 0.03924545273184776, "learning_rate": 0.0002, "loss": 0.5804182291030884, "mean_token_accuracy": 0.7685290277004242, "num_tokens": 1650680.0, "step": 101 }, { "entropy": 0.5504215061664581, "epoch": 0.3813084112149533, "grad_norm": 0.03934217616915703, "learning_rate": 0.0002, "loss": 0.5463358163833618, "mean_token_accuracy": 0.7797124236822128, "num_tokens": 1666866.0, "step": 102 }, { "entropy": 0.5697780549526215, "epoch": 0.3850467289719626, "grad_norm": 0.03712291270494461, "learning_rate": 0.0002, "loss": 0.5653584599494934, "mean_token_accuracy": 0.7692228257656097, "num_tokens": 1683118.0, "step": 103 }, { "entropy": 0.5601143538951874, "epoch": 0.38878504672897196, "grad_norm": 0.033694274723529816, "learning_rate": 0.0002, "loss": 0.5663195848464966, "mean_token_accuracy": 0.7706973105669022, "num_tokens": 1699475.0, "step": 104 }, { "entropy": 0.5591333955526352, "epoch": 0.3925233644859813, "grad_norm": 0.03714451938867569, "learning_rate": 0.0002, "loss": 0.566075325012207, "mean_token_accuracy": 0.7697228640317917, "num_tokens": 1715853.0, "step": 105 }, { "entropy": 0.5509396642446518, "epoch": 0.39626168224299063, "grad_norm": 0.03486821800470352, "learning_rate": 0.0002, "loss": 0.5632879734039307, "mean_token_accuracy": 0.7730516046285629, "num_tokens": 1732170.0, "step": 106 }, { "entropy": 0.5652123540639877, "epoch": 0.4, "grad_norm": 0.041288331151008606, "learning_rate": 0.0002, "loss": 0.5604725480079651, "mean_token_accuracy": 0.7711915820837021, "num_tokens": 1748328.0, "step": 107 }, { "entropy": 0.5530835092067719, "epoch": 0.40373831775700936, "grad_norm": 0.0322246178984642, "learning_rate": 0.0002, "loss": 0.5545868277549744, "mean_token_accuracy": 0.7774576395750046, "num_tokens": 1764582.0, "step": 108 }, { "entropy": 0.574239119887352, "epoch": 0.4074766355140187, "grad_norm": 0.031295642256736755, "learning_rate": 0.0002, "loss": 0.5755724906921387, "mean_token_accuracy": 0.7669118195772171, "num_tokens": 1780985.0, "step": 109 }, { "entropy": 0.5714472681283951, "epoch": 0.411214953271028, "grad_norm": 0.034113939851522446, "learning_rate": 0.0002, "loss": 0.565799355506897, "mean_token_accuracy": 0.7719277888536453, "num_tokens": 1797483.0, "step": 110 }, { "entropy": 0.5522187203168869, "epoch": 0.41495327102803736, "grad_norm": 0.03207452967762947, "learning_rate": 0.0002, "loss": 0.5486649870872498, "mean_token_accuracy": 0.7786776423454285, "num_tokens": 1813763.0, "step": 111 }, { "entropy": 0.5560779720544815, "epoch": 0.41869158878504675, "grad_norm": 0.0334036760032177, "learning_rate": 0.0002, "loss": 0.5554910898208618, "mean_token_accuracy": 0.7745659798383713, "num_tokens": 1829937.0, "step": 112 }, { "entropy": 0.5375554114580154, "epoch": 0.4224299065420561, "grad_norm": 0.03380579128861427, "learning_rate": 0.0002, "loss": 0.5416814684867859, "mean_token_accuracy": 0.7802845388650894, "num_tokens": 1846164.0, "step": 113 }, { "entropy": 0.5589973330497742, "epoch": 0.4261682242990654, "grad_norm": 0.03403402864933014, "learning_rate": 0.0002, "loss": 0.5650242567062378, "mean_token_accuracy": 0.7712521702051163, "num_tokens": 1862080.0, "step": 114 }, { "entropy": 0.5673896223306656, "epoch": 0.42990654205607476, "grad_norm": 0.03260383754968643, "learning_rate": 0.0002, "loss": 0.5664341449737549, "mean_token_accuracy": 0.7702513486146927, "num_tokens": 1878608.0, "step": 115 }, { "entropy": 0.572798103094101, "epoch": 0.4336448598130841, "grad_norm": 0.03137151151895523, "learning_rate": 0.0002, "loss": 0.5731777548789978, "mean_token_accuracy": 0.7663247585296631, "num_tokens": 1895166.0, "step": 116 }, { "entropy": 0.5312000960111618, "epoch": 0.4373831775700935, "grad_norm": 0.031823012977838516, "learning_rate": 0.0002, "loss": 0.5382552742958069, "mean_token_accuracy": 0.7808444052934647, "num_tokens": 1911130.0, "step": 117 }, { "entropy": 0.5409984290599823, "epoch": 0.4411214953271028, "grad_norm": 0.03332378715276718, "learning_rate": 0.0002, "loss": 0.5410414934158325, "mean_token_accuracy": 0.7819060832262039, "num_tokens": 1927264.0, "step": 118 }, { "entropy": 0.5695091038942337, "epoch": 0.44485981308411215, "grad_norm": 0.03380680829286575, "learning_rate": 0.0002, "loss": 0.5648797154426575, "mean_token_accuracy": 0.7696678340435028, "num_tokens": 1943766.0, "step": 119 }, { "entropy": 0.5565821528434753, "epoch": 0.4485981308411215, "grad_norm": 0.02917688526213169, "learning_rate": 0.0002, "loss": 0.5566266179084778, "mean_token_accuracy": 0.7743457108736038, "num_tokens": 1959998.0, "step": 120 }, { "entropy": 0.5624082386493683, "epoch": 0.4523364485981308, "grad_norm": 0.03372650966048241, "learning_rate": 0.0002, "loss": 0.5673832297325134, "mean_token_accuracy": 0.7714631706476212, "num_tokens": 1976438.0, "step": 121 }, { "entropy": 0.5652057379484177, "epoch": 0.45607476635514016, "grad_norm": 0.031156128272414207, "learning_rate": 0.0002, "loss": 0.5634032487869263, "mean_token_accuracy": 0.7731290906667709, "num_tokens": 1992993.0, "step": 122 }, { "entropy": 0.5621330291032791, "epoch": 0.45981308411214955, "grad_norm": 0.03159690275788307, "learning_rate": 0.0002, "loss": 0.5597059726715088, "mean_token_accuracy": 0.7743693888187408, "num_tokens": 2009294.0, "step": 123 }, { "entropy": 0.558076485991478, "epoch": 0.4635514018691589, "grad_norm": 0.032280728220939636, "learning_rate": 0.0002, "loss": 0.561931312084198, "mean_token_accuracy": 0.7742635309696198, "num_tokens": 2025544.0, "step": 124 }, { "entropy": 0.5441709458827972, "epoch": 0.4672897196261682, "grad_norm": 0.03219074383378029, "learning_rate": 0.0002, "loss": 0.5506591200828552, "mean_token_accuracy": 0.7746744006872177, "num_tokens": 2041666.0, "step": 125 }, { "entropy": 0.5633633583784103, "epoch": 0.47102803738317756, "grad_norm": 0.03131939098238945, "learning_rate": 0.0002, "loss": 0.5623766183853149, "mean_token_accuracy": 0.7734539210796356, "num_tokens": 2057983.0, "step": 126 }, { "entropy": 0.5601471066474915, "epoch": 0.4747663551401869, "grad_norm": 0.03067948669195175, "learning_rate": 0.0002, "loss": 0.5621774196624756, "mean_token_accuracy": 0.7716772705316544, "num_tokens": 2074261.0, "step": 127 }, { "entropy": 0.5540204495191574, "epoch": 0.4785046728971963, "grad_norm": 0.03339416906237602, "learning_rate": 0.0002, "loss": 0.548160970211029, "mean_token_accuracy": 0.7764931470155716, "num_tokens": 2090516.0, "step": 128 }, { "entropy": 0.552289143204689, "epoch": 0.4822429906542056, "grad_norm": 0.031481482088565826, "learning_rate": 0.0002, "loss": 0.5535706877708435, "mean_token_accuracy": 0.7739048302173615, "num_tokens": 2106672.0, "step": 129 }, { "entropy": 0.5568640977144241, "epoch": 0.48598130841121495, "grad_norm": 0.028559116646647453, "learning_rate": 0.0002, "loss": 0.5580005645751953, "mean_token_accuracy": 0.7733460515737534, "num_tokens": 2123117.0, "step": 130 }, { "entropy": 0.5648922473192215, "epoch": 0.4897196261682243, "grad_norm": 0.029422340914607048, "learning_rate": 0.0002, "loss": 0.5628851056098938, "mean_token_accuracy": 0.7712086588144302, "num_tokens": 2139369.0, "step": 131 }, { "entropy": 0.5373547673225403, "epoch": 0.4934579439252336, "grad_norm": 0.030260303989052773, "learning_rate": 0.0002, "loss": 0.541597843170166, "mean_token_accuracy": 0.7806773632764816, "num_tokens": 2155734.0, "step": 132 }, { "entropy": 0.5263249725103378, "epoch": 0.497196261682243, "grad_norm": 0.03478972613811493, "learning_rate": 0.0002, "loss": 0.5312929153442383, "mean_token_accuracy": 0.7852403372526169, "num_tokens": 2171760.0, "step": 133 }, { "entropy": 0.5605382472276688, "epoch": 0.5009345794392523, "grad_norm": 0.033430542796850204, "learning_rate": 0.0002, "loss": 0.5653795599937439, "mean_token_accuracy": 0.7712585926055908, "num_tokens": 2188007.0, "step": 134 }, { "entropy": 0.5739341080188751, "epoch": 0.5046728971962616, "grad_norm": 0.030662760138511658, "learning_rate": 0.0002, "loss": 0.5707223415374756, "mean_token_accuracy": 0.7689347118139267, "num_tokens": 2204304.0, "step": 135 }, { "entropy": 0.5562440007925034, "epoch": 0.508411214953271, "grad_norm": 0.029425745829939842, "learning_rate": 0.0002, "loss": 0.5517452955245972, "mean_token_accuracy": 0.7767287492752075, "num_tokens": 2220312.0, "step": 136 }, { "entropy": 0.5788603723049164, "epoch": 0.5121495327102804, "grad_norm": 0.033554431051015854, "learning_rate": 0.0002, "loss": 0.5720421075820923, "mean_token_accuracy": 0.7664643228054047, "num_tokens": 2236563.0, "step": 137 }, { "entropy": 0.558774933218956, "epoch": 0.5158878504672897, "grad_norm": 0.035832736641168594, "learning_rate": 0.0002, "loss": 0.559954822063446, "mean_token_accuracy": 0.7725366801023483, "num_tokens": 2252830.0, "step": 138 }, { "entropy": 0.554543524980545, "epoch": 0.5196261682242991, "grad_norm": 0.03428984060883522, "learning_rate": 0.0002, "loss": 0.5592023730278015, "mean_token_accuracy": 0.772834375500679, "num_tokens": 2269287.0, "step": 139 }, { "entropy": 0.5500677078962326, "epoch": 0.5233644859813084, "grad_norm": 0.035624898970127106, "learning_rate": 0.0002, "loss": 0.5614656209945679, "mean_token_accuracy": 0.7710914462804794, "num_tokens": 2285456.0, "step": 140 }, { "entropy": 0.5587853938341141, "epoch": 0.5271028037383177, "grad_norm": 0.03407886624336243, "learning_rate": 0.0002, "loss": 0.5605294704437256, "mean_token_accuracy": 0.7720634043216705, "num_tokens": 2301539.0, "step": 141 }, { "entropy": 0.5649153292179108, "epoch": 0.5308411214953271, "grad_norm": 0.028877010568976402, "learning_rate": 0.0002, "loss": 0.5598087310791016, "mean_token_accuracy": 0.7749214172363281, "num_tokens": 2317846.0, "step": 142 }, { "entropy": 0.5670332461595535, "epoch": 0.5345794392523364, "grad_norm": 0.03278481960296631, "learning_rate": 0.0002, "loss": 0.5650190114974976, "mean_token_accuracy": 0.7726317644119263, "num_tokens": 2334166.0, "step": 143 }, { "entropy": 0.5582242161035538, "epoch": 0.5383177570093458, "grad_norm": 0.033217303454875946, "learning_rate": 0.0002, "loss": 0.56020587682724, "mean_token_accuracy": 0.7734358310699463, "num_tokens": 2350590.0, "step": 144 }, { "entropy": 0.5491778552532196, "epoch": 0.5420560747663551, "grad_norm": 0.030532008036971092, "learning_rate": 0.0002, "loss": 0.5535258650779724, "mean_token_accuracy": 0.7728464603424072, "num_tokens": 2367000.0, "step": 145 }, { "entropy": 0.5495235919952393, "epoch": 0.5457943925233645, "grad_norm": 0.03000551462173462, "learning_rate": 0.0002, "loss": 0.549593448638916, "mean_token_accuracy": 0.7776431441307068, "num_tokens": 2383493.0, "step": 146 }, { "entropy": 0.5404796749353409, "epoch": 0.5495327102803739, "grad_norm": 0.03362047299742699, "learning_rate": 0.0002, "loss": 0.5460700392723083, "mean_token_accuracy": 0.7808279991149902, "num_tokens": 2399803.0, "step": 147 }, { "entropy": 0.5644742697477341, "epoch": 0.5532710280373832, "grad_norm": 0.031069470569491386, "learning_rate": 0.0002, "loss": 0.5680921077728271, "mean_token_accuracy": 0.7682847529649734, "num_tokens": 2416029.0, "step": 148 }, { "entropy": 0.548800989985466, "epoch": 0.5570093457943925, "grad_norm": 0.027548154816031456, "learning_rate": 0.0002, "loss": 0.5483176708221436, "mean_token_accuracy": 0.7775937616825104, "num_tokens": 2432412.0, "step": 149 }, { "entropy": 0.5704467445611954, "epoch": 0.5607476635514018, "grad_norm": 0.032674722373485565, "learning_rate": 0.0002, "loss": 0.5650383830070496, "mean_token_accuracy": 0.7691423147916794, "num_tokens": 2448801.0, "step": 150 }, { "entropy": 0.5737617313861847, "epoch": 0.5644859813084112, "grad_norm": 0.02663569711148739, "learning_rate": 0.0002, "loss": 0.5644318461418152, "mean_token_accuracy": 0.7708285748958588, "num_tokens": 2465024.0, "step": 151 }, { "entropy": 0.5562496334314346, "epoch": 0.5682242990654206, "grad_norm": 0.03284625709056854, "learning_rate": 0.0002, "loss": 0.5537476539611816, "mean_token_accuracy": 0.7753592431545258, "num_tokens": 2481162.0, "step": 152 }, { "entropy": 0.5587188154459, "epoch": 0.5719626168224299, "grad_norm": 0.035413194447755814, "learning_rate": 0.0002, "loss": 0.5652291178703308, "mean_token_accuracy": 0.7711132764816284, "num_tokens": 2497543.0, "step": 153 }, { "entropy": 0.5715966671705246, "epoch": 0.5757009345794393, "grad_norm": 0.030816730111837387, "learning_rate": 0.0002, "loss": 0.5740691423416138, "mean_token_accuracy": 0.767062708735466, "num_tokens": 2513719.0, "step": 154 }, { "entropy": 0.5732139945030212, "epoch": 0.5794392523364486, "grad_norm": 0.031442996114492416, "learning_rate": 0.0002, "loss": 0.575890302658081, "mean_token_accuracy": 0.7688509374856949, "num_tokens": 2529964.0, "step": 155 }, { "entropy": 0.5707177966833115, "epoch": 0.5831775700934579, "grad_norm": 0.029468102380633354, "learning_rate": 0.0002, "loss": 0.5684511661529541, "mean_token_accuracy": 0.7719237357378006, "num_tokens": 2546476.0, "step": 156 }, { "entropy": 0.5587103515863419, "epoch": 0.5869158878504673, "grad_norm": 0.031475260853767395, "learning_rate": 0.0002, "loss": 0.5583993792533875, "mean_token_accuracy": 0.7759029716253281, "num_tokens": 2562728.0, "step": 157 }, { "entropy": 0.574567124247551, "epoch": 0.5906542056074766, "grad_norm": 0.03264502063393593, "learning_rate": 0.0002, "loss": 0.5683896541595459, "mean_token_accuracy": 0.7703035026788712, "num_tokens": 2578973.0, "step": 158 }, { "entropy": 0.5552074015140533, "epoch": 0.594392523364486, "grad_norm": 0.032595545053482056, "learning_rate": 0.0002, "loss": 0.5574095249176025, "mean_token_accuracy": 0.7743780612945557, "num_tokens": 2595151.0, "step": 159 }, { "entropy": 0.5568316876888275, "epoch": 0.5981308411214953, "grad_norm": 0.033984988927841187, "learning_rate": 0.0002, "loss": 0.5642867088317871, "mean_token_accuracy": 0.7713010758161545, "num_tokens": 2611492.0, "step": 160 }, { "entropy": 0.5599596947431564, "epoch": 0.6018691588785047, "grad_norm": 0.031165285035967827, "learning_rate": 0.0002, "loss": 0.5589022636413574, "mean_token_accuracy": 0.7745718359947205, "num_tokens": 2628012.0, "step": 161 }, { "entropy": 0.5476372390985489, "epoch": 0.6056074766355141, "grad_norm": 0.0300962645560503, "learning_rate": 0.0002, "loss": 0.5493466258049011, "mean_token_accuracy": 0.7759741544723511, "num_tokens": 2644335.0, "step": 162 }, { "entropy": 0.5408246964216232, "epoch": 0.6093457943925233, "grad_norm": 0.03227512910962105, "learning_rate": 0.0002, "loss": 0.5468109846115112, "mean_token_accuracy": 0.7807517051696777, "num_tokens": 2660464.0, "step": 163 }, { "entropy": 0.5610683709383011, "epoch": 0.6130841121495327, "grad_norm": 0.033202771097421646, "learning_rate": 0.0002, "loss": 0.5660794377326965, "mean_token_accuracy": 0.7704542577266693, "num_tokens": 2676703.0, "step": 164 }, { "entropy": 0.556282252073288, "epoch": 0.616822429906542, "grad_norm": 0.030140740796923637, "learning_rate": 0.0002, "loss": 0.5595802664756775, "mean_token_accuracy": 0.7701904326677322, "num_tokens": 2692980.0, "step": 165 }, { "entropy": 0.5742812305688858, "epoch": 0.6205607476635514, "grad_norm": 0.031175116077065468, "learning_rate": 0.0002, "loss": 0.5679398775100708, "mean_token_accuracy": 0.7715850621461868, "num_tokens": 2709458.0, "step": 166 }, { "entropy": 0.5686928480863571, "epoch": 0.6242990654205608, "grad_norm": 0.03218809515237808, "learning_rate": 0.0002, "loss": 0.570385217666626, "mean_token_accuracy": 0.7703390866518021, "num_tokens": 2725878.0, "step": 167 }, { "entropy": 0.5649634599685669, "epoch": 0.6280373831775701, "grad_norm": 0.03405897319316864, "learning_rate": 0.0002, "loss": 0.5623840093612671, "mean_token_accuracy": 0.7718164473772049, "num_tokens": 2742230.0, "step": 168 }, { "entropy": 0.54586461186409, "epoch": 0.6317757009345795, "grad_norm": 0.030788332223892212, "learning_rate": 0.0002, "loss": 0.5481584072113037, "mean_token_accuracy": 0.7789210081100464, "num_tokens": 2758288.0, "step": 169 }, { "entropy": 0.5519826114177704, "epoch": 0.6355140186915887, "grad_norm": 0.0393390953540802, "learning_rate": 0.0002, "loss": 0.5614264607429504, "mean_token_accuracy": 0.7715797126293182, "num_tokens": 2774621.0, "step": 170 }, { "entropy": 0.5494296550750732, "epoch": 0.6392523364485981, "grad_norm": 0.03524143248796463, "learning_rate": 0.0002, "loss": 0.5467370748519897, "mean_token_accuracy": 0.7793298810720444, "num_tokens": 2790715.0, "step": 171 }, { "entropy": 0.5330041199922562, "epoch": 0.6429906542056075, "grad_norm": 0.03651867434382439, "learning_rate": 0.0002, "loss": 0.539812445640564, "mean_token_accuracy": 0.7808443903923035, "num_tokens": 2806717.0, "step": 172 }, { "entropy": 0.5453702062368393, "epoch": 0.6467289719626168, "grad_norm": 0.03462547808885574, "learning_rate": 0.0002, "loss": 0.5413773655891418, "mean_token_accuracy": 0.7798687964677811, "num_tokens": 2823284.0, "step": 173 }, { "entropy": 0.5685944706201553, "epoch": 0.6504672897196262, "grad_norm": 0.028748901560902596, "learning_rate": 0.0002, "loss": 0.5659922361373901, "mean_token_accuracy": 0.7701825350522995, "num_tokens": 2839827.0, "step": 174 }, { "entropy": 0.5635224878787994, "epoch": 0.6542056074766355, "grad_norm": 0.02829919010400772, "learning_rate": 0.0002, "loss": 0.5650316476821899, "mean_token_accuracy": 0.7709458023309708, "num_tokens": 2856136.0, "step": 175 }, { "entropy": 0.5540378838777542, "epoch": 0.6579439252336449, "grad_norm": 0.033104948699474335, "learning_rate": 0.0002, "loss": 0.5580451488494873, "mean_token_accuracy": 0.7731391340494156, "num_tokens": 2872416.0, "step": 176 }, { "entropy": 0.5654754340648651, "epoch": 0.6616822429906543, "grad_norm": 0.03393986448645592, "learning_rate": 0.0002, "loss": 0.566604733467102, "mean_token_accuracy": 0.768456295132637, "num_tokens": 2888732.0, "step": 177 }, { "entropy": 0.538336843252182, "epoch": 0.6654205607476635, "grad_norm": 0.031724728643894196, "learning_rate": 0.0002, "loss": 0.5347487926483154, "mean_token_accuracy": 0.783712849020958, "num_tokens": 2904747.0, "step": 178 }, { "entropy": 0.563370868563652, "epoch": 0.6691588785046729, "grad_norm": 0.028497006744146347, "learning_rate": 0.0002, "loss": 0.5567288398742676, "mean_token_accuracy": 0.7762446701526642, "num_tokens": 2921642.0, "step": 179 }, { "entropy": 0.5554675310850143, "epoch": 0.6728971962616822, "grad_norm": 0.027588432654738426, "learning_rate": 0.0002, "loss": 0.5539284348487854, "mean_token_accuracy": 0.7720596343278885, "num_tokens": 2938231.0, "step": 180 }, { "entropy": 0.5351214110851288, "epoch": 0.6766355140186916, "grad_norm": 0.02989763207733631, "learning_rate": 0.0002, "loss": 0.5380938053131104, "mean_token_accuracy": 0.7797621339559555, "num_tokens": 2954651.0, "step": 181 }, { "entropy": 0.5512963533401489, "epoch": 0.680373831775701, "grad_norm": 0.031486768275499344, "learning_rate": 0.0002, "loss": 0.559045672416687, "mean_token_accuracy": 0.7730693370103836, "num_tokens": 2970900.0, "step": 182 }, { "entropy": 0.5643429905176163, "epoch": 0.6841121495327103, "grad_norm": 0.030211007222533226, "learning_rate": 0.0002, "loss": 0.5652138590812683, "mean_token_accuracy": 0.7722145467996597, "num_tokens": 2987276.0, "step": 183 }, { "entropy": 0.5449773222208023, "epoch": 0.6878504672897197, "grad_norm": 0.03100084885954857, "learning_rate": 0.0002, "loss": 0.5516652464866638, "mean_token_accuracy": 0.7781905680894852, "num_tokens": 3003582.0, "step": 184 }, { "entropy": 0.5534535795450211, "epoch": 0.6915887850467289, "grad_norm": 0.029445704072713852, "learning_rate": 0.0002, "loss": 0.549251914024353, "mean_token_accuracy": 0.7758228182792664, "num_tokens": 3019792.0, "step": 185 }, { "entropy": 0.5563573390245438, "epoch": 0.6953271028037383, "grad_norm": 0.03839804232120514, "learning_rate": 0.0002, "loss": 0.5603447556495667, "mean_token_accuracy": 0.7714342921972275, "num_tokens": 3035807.0, "step": 186 }, { "entropy": 0.538311779499054, "epoch": 0.6990654205607477, "grad_norm": 0.03146633878350258, "learning_rate": 0.0002, "loss": 0.5352146625518799, "mean_token_accuracy": 0.7827797681093216, "num_tokens": 3051838.0, "step": 187 }, { "entropy": 0.5633413791656494, "epoch": 0.702803738317757, "grad_norm": 0.02970045432448387, "learning_rate": 0.0002, "loss": 0.558843195438385, "mean_token_accuracy": 0.774773895740509, "num_tokens": 3068298.0, "step": 188 }, { "entropy": 0.5590213239192963, "epoch": 0.7065420560747664, "grad_norm": 0.030248312279582024, "learning_rate": 0.0002, "loss": 0.5594462156295776, "mean_token_accuracy": 0.7730938643217087, "num_tokens": 3084742.0, "step": 189 }, { "entropy": 0.5729488730430603, "epoch": 0.7102803738317757, "grad_norm": 0.02910761535167694, "learning_rate": 0.0002, "loss": 0.5710701942443848, "mean_token_accuracy": 0.7694995701313019, "num_tokens": 3101166.0, "step": 190 }, { "entropy": 0.5414529591798782, "epoch": 0.7140186915887851, "grad_norm": 0.030337564647197723, "learning_rate": 0.0002, "loss": 0.5447859168052673, "mean_token_accuracy": 0.7779805213212967, "num_tokens": 3117310.0, "step": 191 }, { "entropy": 0.5537209510803223, "epoch": 0.7177570093457943, "grad_norm": 0.03048059716820717, "learning_rate": 0.0002, "loss": 0.5590298771858215, "mean_token_accuracy": 0.7726654410362244, "num_tokens": 3133530.0, "step": 192 }, { "entropy": 0.5551200062036514, "epoch": 0.7214953271028037, "grad_norm": 0.03023671731352806, "learning_rate": 0.0002, "loss": 0.5620648860931396, "mean_token_accuracy": 0.7735067456960678, "num_tokens": 3149663.0, "step": 193 }, { "entropy": 0.5674590468406677, "epoch": 0.7252336448598131, "grad_norm": 0.0296547319740057, "learning_rate": 0.0002, "loss": 0.5588228702545166, "mean_token_accuracy": 0.7742174565792084, "num_tokens": 3166066.0, "step": 194 }, { "entropy": 0.5779262185096741, "epoch": 0.7289719626168224, "grad_norm": 0.028214752674102783, "learning_rate": 0.0002, "loss": 0.572249710559845, "mean_token_accuracy": 0.7688845992088318, "num_tokens": 3182640.0, "step": 195 }, { "entropy": 0.540147215127945, "epoch": 0.7327102803738318, "grad_norm": 0.027666175737977028, "learning_rate": 0.0002, "loss": 0.5338530540466309, "mean_token_accuracy": 0.7832302153110504, "num_tokens": 3198796.0, "step": 196 }, { "entropy": 0.5551275163888931, "epoch": 0.7364485981308411, "grad_norm": 0.034123752266168594, "learning_rate": 0.0002, "loss": 0.5622342824935913, "mean_token_accuracy": 0.7688822001218796, "num_tokens": 3214771.0, "step": 197 }, { "entropy": 0.5611921101808548, "epoch": 0.7401869158878505, "grad_norm": 0.02890852838754654, "learning_rate": 0.0002, "loss": 0.5630607604980469, "mean_token_accuracy": 0.7698909342288971, "num_tokens": 3231278.0, "step": 198 }, { "entropy": 0.5426182597875595, "epoch": 0.7439252336448599, "grad_norm": 0.029497232288122177, "learning_rate": 0.0002, "loss": 0.5449106097221375, "mean_token_accuracy": 0.7783599495887756, "num_tokens": 3247627.0, "step": 199 }, { "entropy": 0.5460454076528549, "epoch": 0.7476635514018691, "grad_norm": 0.03151922672986984, "learning_rate": 0.0002, "loss": 0.5513307452201843, "mean_token_accuracy": 0.7761969566345215, "num_tokens": 3263818.0, "step": 200 }, { "entropy": 0.5589698106050491, "epoch": 0.7514018691588785, "grad_norm": 0.028974369168281555, "learning_rate": 0.0002, "loss": 0.5579357147216797, "mean_token_accuracy": 0.7737255245447159, "num_tokens": 3279922.0, "step": 201 }, { "entropy": 0.553888663649559, "epoch": 0.7551401869158878, "grad_norm": 0.026153914630413055, "learning_rate": 0.0002, "loss": 0.550652027130127, "mean_token_accuracy": 0.7776264399290085, "num_tokens": 3296366.0, "step": 202 }, { "entropy": 0.5686471164226532, "epoch": 0.7588785046728972, "grad_norm": 0.028719555586576462, "learning_rate": 0.0002, "loss": 0.566332221031189, "mean_token_accuracy": 0.7694560885429382, "num_tokens": 3312940.0, "step": 203 }, { "entropy": 0.5482725948095322, "epoch": 0.7626168224299066, "grad_norm": 0.031571801751852036, "learning_rate": 0.0002, "loss": 0.5515249967575073, "mean_token_accuracy": 0.7790126353502274, "num_tokens": 3329137.0, "step": 204 }, { "entropy": 0.5548627823591232, "epoch": 0.7663551401869159, "grad_norm": 0.03189053386449814, "learning_rate": 0.0002, "loss": 0.5633711218833923, "mean_token_accuracy": 0.7717642784118652, "num_tokens": 3345223.0, "step": 205 }, { "entropy": 0.5403945446014404, "epoch": 0.7700934579439253, "grad_norm": 0.03444300964474678, "learning_rate": 0.0002, "loss": 0.5441574454307556, "mean_token_accuracy": 0.7791598290205002, "num_tokens": 3361512.0, "step": 206 }, { "entropy": 0.5523678362369537, "epoch": 0.7738317757009345, "grad_norm": 0.027761496603488922, "learning_rate": 0.0002, "loss": 0.5582634210586548, "mean_token_accuracy": 0.7723374962806702, "num_tokens": 3377859.0, "step": 207 }, { "entropy": 0.5723598301410675, "epoch": 0.7775700934579439, "grad_norm": 0.028997788205742836, "learning_rate": 0.0002, "loss": 0.5705980658531189, "mean_token_accuracy": 0.7668357789516449, "num_tokens": 3394399.0, "step": 208 }, { "entropy": 0.5796838849782944, "epoch": 0.7813084112149533, "grad_norm": 0.03271174803376198, "learning_rate": 0.0002, "loss": 0.5698305368423462, "mean_token_accuracy": 0.7698051035404205, "num_tokens": 3410824.0, "step": 209 }, { "entropy": 0.5651015788316727, "epoch": 0.7850467289719626, "grad_norm": 0.031869035214185715, "learning_rate": 0.0002, "loss": 0.5655361413955688, "mean_token_accuracy": 0.7697497308254242, "num_tokens": 3426955.0, "step": 210 }, { "entropy": 0.5639242976903915, "epoch": 0.788785046728972, "grad_norm": 0.026541458442807198, "learning_rate": 0.0002, "loss": 0.5636979341506958, "mean_token_accuracy": 0.7697752565145493, "num_tokens": 3443406.0, "step": 211 }, { "entropy": 0.5432985424995422, "epoch": 0.7925233644859813, "grad_norm": 0.032391466200351715, "learning_rate": 0.0002, "loss": 0.5466354489326477, "mean_token_accuracy": 0.7787620276212692, "num_tokens": 3459857.0, "step": 212 }, { "entropy": 0.546247586607933, "epoch": 0.7962616822429907, "grad_norm": 0.03624865412712097, "learning_rate": 0.0002, "loss": 0.5477287769317627, "mean_token_accuracy": 0.7784061878919601, "num_tokens": 3476064.0, "step": 213 }, { "entropy": 0.5712321698665619, "epoch": 0.8, "grad_norm": 0.027368342503905296, "learning_rate": 0.0002, "loss": 0.5628222823143005, "mean_token_accuracy": 0.7711902260780334, "num_tokens": 3492569.0, "step": 214 }, { "entropy": 0.5511522740125656, "epoch": 0.8037383177570093, "grad_norm": 0.0314224548637867, "learning_rate": 0.0002, "loss": 0.546245813369751, "mean_token_accuracy": 0.777819886803627, "num_tokens": 3508946.0, "step": 215 }, { "entropy": 0.5641316920518875, "epoch": 0.8074766355140187, "grad_norm": 0.02934875525534153, "learning_rate": 0.0002, "loss": 0.5656546354293823, "mean_token_accuracy": 0.7672451436519623, "num_tokens": 3525415.0, "step": 216 }, { "entropy": 0.5616082847118378, "epoch": 0.811214953271028, "grad_norm": 0.027262428775429726, "learning_rate": 0.0002, "loss": 0.5606979131698608, "mean_token_accuracy": 0.7726116627454758, "num_tokens": 3541513.0, "step": 217 }, { "entropy": 0.5319297313690186, "epoch": 0.8149532710280374, "grad_norm": 0.02967401221394539, "learning_rate": 0.0002, "loss": 0.5409149527549744, "mean_token_accuracy": 0.7806787341833115, "num_tokens": 3557840.0, "step": 218 }, { "entropy": 0.5461787581443787, "epoch": 0.8186915887850468, "grad_norm": 0.03170184791088104, "learning_rate": 0.0002, "loss": 0.5544174313545227, "mean_token_accuracy": 0.7753637731075287, "num_tokens": 3574334.0, "step": 219 }, { "entropy": 0.5393616706132889, "epoch": 0.822429906542056, "grad_norm": 0.02985682338476181, "learning_rate": 0.0002, "loss": 0.5457973480224609, "mean_token_accuracy": 0.7773662656545639, "num_tokens": 3590741.0, "step": 220 }, { "entropy": 0.5554001927375793, "epoch": 0.8261682242990654, "grad_norm": 0.02711213380098343, "learning_rate": 0.0002, "loss": 0.555370569229126, "mean_token_accuracy": 0.7716074883937836, "num_tokens": 3607018.0, "step": 221 }, { "entropy": 0.5483701378107071, "epoch": 0.8299065420560747, "grad_norm": 0.029320966452360153, "learning_rate": 0.0002, "loss": 0.5421203970909119, "mean_token_accuracy": 0.7806040942668915, "num_tokens": 3623209.0, "step": 222 }, { "entropy": 0.5777206718921661, "epoch": 0.8336448598130841, "grad_norm": 0.030610879883170128, "learning_rate": 0.0002, "loss": 0.5738532543182373, "mean_token_accuracy": 0.7664468586444855, "num_tokens": 3639406.0, "step": 223 }, { "entropy": 0.5567807406187057, "epoch": 0.8373831775700935, "grad_norm": 0.028399785980582237, "learning_rate": 0.0002, "loss": 0.5526878237724304, "mean_token_accuracy": 0.773535892367363, "num_tokens": 3655602.0, "step": 224 }, { "entropy": 0.530220165848732, "epoch": 0.8411214953271028, "grad_norm": 0.03518186882138252, "learning_rate": 0.0002, "loss": 0.5408585667610168, "mean_token_accuracy": 0.779409795999527, "num_tokens": 3671905.0, "step": 225 }, { "entropy": 0.5535659790039062, "epoch": 0.8448598130841122, "grad_norm": 0.03929230943322182, "learning_rate": 0.0002, "loss": 0.5663979053497314, "mean_token_accuracy": 0.7698138654232025, "num_tokens": 3688191.0, "step": 226 }, { "entropy": 0.569505363702774, "epoch": 0.8485981308411215, "grad_norm": 0.0272939745336771, "learning_rate": 0.0002, "loss": 0.5618590712547302, "mean_token_accuracy": 0.7725658267736435, "num_tokens": 3704751.0, "step": 227 }, { "entropy": 0.5644249469041824, "epoch": 0.8523364485981308, "grad_norm": 0.03415616601705551, "learning_rate": 0.0002, "loss": 0.5562848448753357, "mean_token_accuracy": 0.7748490273952484, "num_tokens": 3720710.0, "step": 228 }, { "entropy": 0.5773901343345642, "epoch": 0.8560747663551402, "grad_norm": 0.031880877912044525, "learning_rate": 0.0002, "loss": 0.5614221096038818, "mean_token_accuracy": 0.7720403522253036, "num_tokens": 3737054.0, "step": 229 }, { "entropy": 0.5547749698162079, "epoch": 0.8598130841121495, "grad_norm": 0.0324094183743, "learning_rate": 0.0002, "loss": 0.5520619750022888, "mean_token_accuracy": 0.7773038893938065, "num_tokens": 3753537.0, "step": 230 }, { "entropy": 0.5418203920125961, "epoch": 0.8635514018691589, "grad_norm": 0.03512468561530113, "learning_rate": 0.0002, "loss": 0.5538347959518433, "mean_token_accuracy": 0.7749911546707153, "num_tokens": 3769863.0, "step": 231 }, { "entropy": 0.5521644353866577, "epoch": 0.8672897196261682, "grad_norm": 0.02896721474826336, "learning_rate": 0.0002, "loss": 0.5608810186386108, "mean_token_accuracy": 0.7746408581733704, "num_tokens": 3786316.0, "step": 232 }, { "entropy": 0.543023481965065, "epoch": 0.8710280373831776, "grad_norm": 0.03712921962141991, "learning_rate": 0.0002, "loss": 0.5551246404647827, "mean_token_accuracy": 0.7738360315561295, "num_tokens": 3802441.0, "step": 233 }, { "entropy": 0.5672542154788971, "epoch": 0.874766355140187, "grad_norm": 0.026832984760403633, "learning_rate": 0.0002, "loss": 0.5662351846694946, "mean_token_accuracy": 0.7704236954450607, "num_tokens": 3818851.0, "step": 234 }, { "entropy": 0.5710914433002472, "epoch": 0.8785046728971962, "grad_norm": 0.036441151052713394, "learning_rate": 0.0002, "loss": 0.5647166967391968, "mean_token_accuracy": 0.7697651982307434, "num_tokens": 3835229.0, "step": 235 }, { "entropy": 0.5721132010221481, "epoch": 0.8822429906542056, "grad_norm": 0.031891413033008575, "learning_rate": 0.0002, "loss": 0.561801552772522, "mean_token_accuracy": 0.7740357220172882, "num_tokens": 3851634.0, "step": 236 }, { "entropy": 0.5430081784725189, "epoch": 0.8859813084112149, "grad_norm": 0.028133288025856018, "learning_rate": 0.0002, "loss": 0.5482598543167114, "mean_token_accuracy": 0.7780391424894333, "num_tokens": 3867818.0, "step": 237 }, { "entropy": 0.5531598627567291, "epoch": 0.8897196261682243, "grad_norm": 0.031570907682180405, "learning_rate": 0.0002, "loss": 0.5597803592681885, "mean_token_accuracy": 0.7725805789232254, "num_tokens": 3884128.0, "step": 238 }, { "entropy": 0.552057608962059, "epoch": 0.8934579439252337, "grad_norm": 0.03431302309036255, "learning_rate": 0.0002, "loss": 0.5592586398124695, "mean_token_accuracy": 0.7739444822072983, "num_tokens": 3900459.0, "step": 239 }, { "entropy": 0.552062600851059, "epoch": 0.897196261682243, "grad_norm": 0.029298607259988785, "learning_rate": 0.0002, "loss": 0.5525797009468079, "mean_token_accuracy": 0.7755719870328903, "num_tokens": 3916582.0, "step": 240 }, { "entropy": 0.571002647280693, "epoch": 0.9009345794392524, "grad_norm": 0.028903625905513763, "learning_rate": 0.0002, "loss": 0.5647273659706116, "mean_token_accuracy": 0.7697427272796631, "num_tokens": 3932989.0, "step": 241 }, { "entropy": 0.5607190132141113, "epoch": 0.9046728971962616, "grad_norm": 0.02721545286476612, "learning_rate": 0.0002, "loss": 0.5572564601898193, "mean_token_accuracy": 0.7735343724489212, "num_tokens": 3949591.0, "step": 242 }, { "entropy": 0.554363563656807, "epoch": 0.908411214953271, "grad_norm": 0.028853297233581543, "learning_rate": 0.0002, "loss": 0.5598585605621338, "mean_token_accuracy": 0.7746720314025879, "num_tokens": 3965977.0, "step": 243 }, { "entropy": 0.562399297952652, "epoch": 0.9121495327102803, "grad_norm": 0.031765274703502655, "learning_rate": 0.0002, "loss": 0.5609657764434814, "mean_token_accuracy": 0.7706955671310425, "num_tokens": 3982241.0, "step": 244 }, { "entropy": 0.5663948059082031, "epoch": 0.9158878504672897, "grad_norm": 0.02977531962096691, "learning_rate": 0.0002, "loss": 0.5600242018699646, "mean_token_accuracy": 0.7716616988182068, "num_tokens": 3998850.0, "step": 245 }, { "entropy": 0.5626737624406815, "epoch": 0.9196261682242991, "grad_norm": 0.03073737397789955, "learning_rate": 0.0002, "loss": 0.5680803656578064, "mean_token_accuracy": 0.7690348774194717, "num_tokens": 4015357.0, "step": 246 }, { "entropy": 0.5617063343524933, "epoch": 0.9233644859813084, "grad_norm": 0.03239826485514641, "learning_rate": 0.0002, "loss": 0.5647311210632324, "mean_token_accuracy": 0.7720029205083847, "num_tokens": 4031434.0, "step": 247 }, { "entropy": 0.5446989983320236, "epoch": 0.9271028037383178, "grad_norm": 0.026935769245028496, "learning_rate": 0.0002, "loss": 0.5423059463500977, "mean_token_accuracy": 0.7784274518489838, "num_tokens": 4047542.0, "step": 248 }, { "entropy": 0.5633901953697205, "epoch": 0.930841121495327, "grad_norm": 0.03004775382578373, "learning_rate": 0.0002, "loss": 0.5547890663146973, "mean_token_accuracy": 0.7750878036022186, "num_tokens": 4063671.0, "step": 249 }, { "entropy": 0.5641201138496399, "epoch": 0.9345794392523364, "grad_norm": 0.035040173679590225, "learning_rate": 0.0002, "loss": 0.560414731502533, "mean_token_accuracy": 0.7721855938434601, "num_tokens": 4080062.0, "step": 250 }, { "entropy": 0.5267122685909271, "epoch": 0.9383177570093458, "grad_norm": 0.026784395799040794, "learning_rate": 0.0002, "loss": 0.528884768486023, "mean_token_accuracy": 0.7842623591423035, "num_tokens": 4096314.0, "step": 251 }, { "entropy": 0.5412785857915878, "epoch": 0.9420560747663551, "grad_norm": 0.029483763501048088, "learning_rate": 0.0002, "loss": 0.5475237369537354, "mean_token_accuracy": 0.7779380232095718, "num_tokens": 4112543.0, "step": 252 }, { "entropy": 0.5688454955816269, "epoch": 0.9457943925233645, "grad_norm": 0.02722441591322422, "learning_rate": 0.0002, "loss": 0.5703037977218628, "mean_token_accuracy": 0.7700005024671555, "num_tokens": 4128880.0, "step": 253 }, { "entropy": 0.5569160729646683, "epoch": 0.9495327102803738, "grad_norm": 0.028683314099907875, "learning_rate": 0.0002, "loss": 0.5574289560317993, "mean_token_accuracy": 0.7722644209861755, "num_tokens": 4145417.0, "step": 254 }, { "entropy": 0.5437170565128326, "epoch": 0.9532710280373832, "grad_norm": 0.03323707729578018, "learning_rate": 0.0002, "loss": 0.5411959886550903, "mean_token_accuracy": 0.7814441025257111, "num_tokens": 4161528.0, "step": 255 }, { "entropy": 0.5666731148958206, "epoch": 0.9570093457943926, "grad_norm": 0.028484966605901718, "learning_rate": 0.0002, "loss": 0.5648545622825623, "mean_token_accuracy": 0.77223140001297, "num_tokens": 4177883.0, "step": 256 }, { "entropy": 0.5472739338874817, "epoch": 0.9607476635514018, "grad_norm": 0.032945599406957626, "learning_rate": 0.0002, "loss": 0.5465376377105713, "mean_token_accuracy": 0.7768394351005554, "num_tokens": 4194047.0, "step": 257 }, { "entropy": 0.5488951653242111, "epoch": 0.9644859813084112, "grad_norm": 0.030117738991975784, "learning_rate": 0.0002, "loss": 0.5551251769065857, "mean_token_accuracy": 0.7728994339704514, "num_tokens": 4210415.0, "step": 258 }, { "entropy": 0.5574130117893219, "epoch": 0.9682242990654205, "grad_norm": 0.028586212545633316, "learning_rate": 0.0002, "loss": 0.5596088171005249, "mean_token_accuracy": 0.7760643810033798, "num_tokens": 4226881.0, "step": 259 }, { "entropy": 0.5550301373004913, "epoch": 0.9719626168224299, "grad_norm": 0.035784922540187836, "learning_rate": 0.0002, "loss": 0.5660927891731262, "mean_token_accuracy": 0.7692493498325348, "num_tokens": 4243149.0, "step": 260 }, { "entropy": 0.5651994347572327, "epoch": 0.9757009345794393, "grad_norm": 0.03252053260803223, "learning_rate": 0.0002, "loss": 0.5599735379219055, "mean_token_accuracy": 0.7730003446340561, "num_tokens": 4259611.0, "step": 261 }, { "entropy": 0.5637697577476501, "epoch": 0.9794392523364486, "grad_norm": 0.047552503645420074, "learning_rate": 0.0002, "loss": 0.5568199157714844, "mean_token_accuracy": 0.7762705087661743, "num_tokens": 4275796.0, "step": 262 }, { "entropy": 0.567447230219841, "epoch": 0.983177570093458, "grad_norm": 0.027801062911748886, "learning_rate": 0.0002, "loss": 0.5698356032371521, "mean_token_accuracy": 0.7690239697694778, "num_tokens": 4292132.0, "step": 263 }, { "entropy": 0.5712171792984009, "epoch": 0.9869158878504672, "grad_norm": 0.11246822774410248, "learning_rate": 0.0002, "loss": 0.5811023116111755, "mean_token_accuracy": 0.7647420465946198, "num_tokens": 4308584.0, "step": 264 }, { "entropy": 0.5711934268474579, "epoch": 0.9906542056074766, "grad_norm": 0.06911394000053406, "learning_rate": 0.0002, "loss": 0.5809019804000854, "mean_token_accuracy": 0.7624327838420868, "num_tokens": 4324962.0, "step": 265 }, { "entropy": 0.5627400726079941, "epoch": 0.994392523364486, "grad_norm": 0.030455252155661583, "learning_rate": 0.0002, "loss": 0.5616910457611084, "mean_token_accuracy": 0.7730111479759216, "num_tokens": 4341120.0, "step": 266 }, { "entropy": 0.5654444992542267, "epoch": 0.9981308411214953, "grad_norm": 0.02772046998143196, "learning_rate": 0.0002, "loss": 0.5567201972007751, "mean_token_accuracy": 0.7720088213682175, "num_tokens": 4357574.0, "step": 267 }, { "entropy": 0.5589146912097931, "epoch": 1.0, "grad_norm": 0.04032747447490692, "learning_rate": 0.0002, "loss": 0.5460503101348877, "mean_token_accuracy": 0.779203861951828, "num_tokens": 4365546.0, "step": 268 }, { "entropy": 0.5703114420175552, "epoch": 1.0037383177570094, "grad_norm": 0.033491045236587524, "learning_rate": 0.0002, "loss": 0.5557507276535034, "mean_token_accuracy": 0.7745671570301056, "num_tokens": 4381699.0, "step": 269 }, { "entropy": 0.5609012693166733, "epoch": 1.0074766355140188, "grad_norm": 0.03252531215548515, "learning_rate": 0.0002, "loss": 0.5590213537216187, "mean_token_accuracy": 0.7752612829208374, "num_tokens": 4398284.0, "step": 270 }, { "entropy": 0.5300652086734772, "epoch": 1.011214953271028, "grad_norm": 0.036933887749910355, "learning_rate": 0.0002, "loss": 0.5396179556846619, "mean_token_accuracy": 0.7816686779260635, "num_tokens": 4414795.0, "step": 271 }, { "entropy": 0.5411953181028366, "epoch": 1.0149532710280373, "grad_norm": 0.035878736525774, "learning_rate": 0.0002, "loss": 0.5491203665733337, "mean_token_accuracy": 0.7742594629526138, "num_tokens": 4431190.0, "step": 272 }, { "entropy": 0.5370450466871262, "epoch": 1.0186915887850467, "grad_norm": 0.029914801940321922, "learning_rate": 0.0002, "loss": 0.5417315363883972, "mean_token_accuracy": 0.7806635499000549, "num_tokens": 4447475.0, "step": 273 }, { "entropy": 0.5567668229341507, "epoch": 1.0224299065420561, "grad_norm": 0.03265395388007164, "learning_rate": 0.0002, "loss": 0.5509355068206787, "mean_token_accuracy": 0.7730302512645721, "num_tokens": 4463734.0, "step": 274 }, { "entropy": 0.5656838417053223, "epoch": 1.0261682242990655, "grad_norm": 0.03136991336941719, "learning_rate": 0.0002, "loss": 0.5576434135437012, "mean_token_accuracy": 0.7703666239976883, "num_tokens": 4479995.0, "step": 275 }, { "entropy": 0.548493430018425, "epoch": 1.0299065420560747, "grad_norm": 0.033384647220373154, "learning_rate": 0.0002, "loss": 0.5452391505241394, "mean_token_accuracy": 0.7803221642971039, "num_tokens": 4496385.0, "step": 276 }, { "entropy": 0.547315925359726, "epoch": 1.033644859813084, "grad_norm": 0.02812100760638714, "learning_rate": 0.0002, "loss": 0.5515413284301758, "mean_token_accuracy": 0.7755024433135986, "num_tokens": 4512779.0, "step": 277 }, { "entropy": 0.5315467417240143, "epoch": 1.0373831775700935, "grad_norm": 0.041606683284044266, "learning_rate": 0.0002, "loss": 0.5446295738220215, "mean_token_accuracy": 0.7787878066301346, "num_tokens": 4529088.0, "step": 278 }, { "entropy": 0.5279169529676437, "epoch": 1.0411214953271029, "grad_norm": 0.031057002022862434, "learning_rate": 0.0002, "loss": 0.536575973033905, "mean_token_accuracy": 0.7812807857990265, "num_tokens": 4545377.0, "step": 279 }, { "entropy": 0.5590710490942001, "epoch": 1.0448598130841122, "grad_norm": 0.02644682675600052, "learning_rate": 0.0002, "loss": 0.554656982421875, "mean_token_accuracy": 0.7751928865909576, "num_tokens": 4561701.0, "step": 280 }, { "entropy": 0.5662561357021332, "epoch": 1.0485981308411214, "grad_norm": 0.029125280678272247, "learning_rate": 0.0002, "loss": 0.5619407892227173, "mean_token_accuracy": 0.7679703086614609, "num_tokens": 4578007.0, "step": 281 }, { "entropy": 0.5509714484214783, "epoch": 1.0523364485981308, "grad_norm": 0.03366995230317116, "learning_rate": 0.0002, "loss": 0.544794499874115, "mean_token_accuracy": 0.7797580361366272, "num_tokens": 4594260.0, "step": 282 }, { "entropy": 0.5634302496910095, "epoch": 1.0560747663551402, "grad_norm": 0.027832867577672005, "learning_rate": 0.0002, "loss": 0.5580713748931885, "mean_token_accuracy": 0.7739240676164627, "num_tokens": 4610748.0, "step": 283 }, { "entropy": 0.5439006388187408, "epoch": 1.0598130841121496, "grad_norm": 0.03045068122446537, "learning_rate": 0.0002, "loss": 0.5474724173545837, "mean_token_accuracy": 0.7765053659677505, "num_tokens": 4627116.0, "step": 284 }, { "entropy": 0.5238615572452545, "epoch": 1.063551401869159, "grad_norm": 0.03397069126367569, "learning_rate": 0.0002, "loss": 0.532546877861023, "mean_token_accuracy": 0.7858656197786331, "num_tokens": 4643480.0, "step": 285 }, { "entropy": 0.5387604683637619, "epoch": 1.0672897196261681, "grad_norm": 0.036734551191329956, "learning_rate": 0.0002, "loss": 0.5468651056289673, "mean_token_accuracy": 0.7797952890396118, "num_tokens": 4660303.0, "step": 286 }, { "entropy": 0.5558950453996658, "epoch": 1.0710280373831775, "grad_norm": 0.030276885256171227, "learning_rate": 0.0002, "loss": 0.5584522485733032, "mean_token_accuracy": 0.7732091248035431, "num_tokens": 4676839.0, "step": 287 }, { "entropy": 0.5617282688617706, "epoch": 1.074766355140187, "grad_norm": 0.033773574978113174, "learning_rate": 0.0002, "loss": 0.5567758679389954, "mean_token_accuracy": 0.7739396244287491, "num_tokens": 4692959.0, "step": 288 }, { "entropy": 0.5491297841072083, "epoch": 1.0785046728971963, "grad_norm": 0.0321025624871254, "learning_rate": 0.0002, "loss": 0.5414766073226929, "mean_token_accuracy": 0.7804555594921112, "num_tokens": 4709310.0, "step": 289 }, { "entropy": 0.5456965118646622, "epoch": 1.0822429906542057, "grad_norm": 0.029098015278577805, "learning_rate": 0.0002, "loss": 0.5451281070709229, "mean_token_accuracy": 0.7778134942054749, "num_tokens": 4725506.0, "step": 290 }, { "entropy": 0.5477775633335114, "epoch": 1.0859813084112149, "grad_norm": 0.02958570048213005, "learning_rate": 0.0002, "loss": 0.5455498695373535, "mean_token_accuracy": 0.7799811661243439, "num_tokens": 4741775.0, "step": 291 }, { "entropy": 0.5301359370350838, "epoch": 1.0897196261682243, "grad_norm": 0.03702852129936218, "learning_rate": 0.0002, "loss": 0.5398594737052917, "mean_token_accuracy": 0.7832937985658646, "num_tokens": 4758016.0, "step": 292 }, { "entropy": 0.5263582319021225, "epoch": 1.0934579439252337, "grad_norm": 0.0337018184363842, "learning_rate": 0.0002, "loss": 0.528889000415802, "mean_token_accuracy": 0.7862381190061569, "num_tokens": 4774331.0, "step": 293 }, { "entropy": 0.5430160015821457, "epoch": 1.097196261682243, "grad_norm": 0.036417651921510696, "learning_rate": 0.0002, "loss": 0.5521553158760071, "mean_token_accuracy": 0.7737599611282349, "num_tokens": 4790501.0, "step": 294 }, { "entropy": 0.5552934557199478, "epoch": 1.1009345794392524, "grad_norm": 0.03106369823217392, "learning_rate": 0.0002, "loss": 0.5559324622154236, "mean_token_accuracy": 0.7761313170194626, "num_tokens": 4806597.0, "step": 295 }, { "entropy": 0.5548459142446518, "epoch": 1.1046728971962616, "grad_norm": 0.031152816489338875, "learning_rate": 0.0002, "loss": 0.5504705905914307, "mean_token_accuracy": 0.7746731489896774, "num_tokens": 4822650.0, "step": 296 }, { "entropy": 0.5644493997097015, "epoch": 1.108411214953271, "grad_norm": 0.030590267851948738, "learning_rate": 0.0002, "loss": 0.5608450770378113, "mean_token_accuracy": 0.7722194045782089, "num_tokens": 4839117.0, "step": 297 }, { "entropy": 0.5444105267524719, "epoch": 1.1121495327102804, "grad_norm": 0.027887985110282898, "learning_rate": 0.0002, "loss": 0.5356480479240417, "mean_token_accuracy": 0.7835922837257385, "num_tokens": 4855616.0, "step": 298 }, { "entropy": 0.5529257953166962, "epoch": 1.1158878504672898, "grad_norm": 0.029403148218989372, "learning_rate": 0.0002, "loss": 0.5520183444023132, "mean_token_accuracy": 0.7763603180646896, "num_tokens": 4871877.0, "step": 299 }, { "entropy": 0.5645637214183807, "epoch": 1.1196261682242992, "grad_norm": 0.028178894892334938, "learning_rate": 0.0002, "loss": 0.5597948431968689, "mean_token_accuracy": 0.7721023112535477, "num_tokens": 4888211.0, "step": 300 }, { "entropy": 0.5288026034832001, "epoch": 1.1233644859813083, "grad_norm": 0.04107068479061127, "learning_rate": 0.0002, "loss": 0.5410320162773132, "mean_token_accuracy": 0.7809516042470932, "num_tokens": 4904621.0, "step": 301 }, { "entropy": 0.539900153875351, "epoch": 1.1271028037383177, "grad_norm": 0.029827676713466644, "learning_rate": 0.0002, "loss": 0.5402933955192566, "mean_token_accuracy": 0.7816860228776932, "num_tokens": 4921127.0, "step": 302 }, { "entropy": 0.5498250722885132, "epoch": 1.1308411214953271, "grad_norm": 0.026688000187277794, "learning_rate": 0.0002, "loss": 0.5489476323127747, "mean_token_accuracy": 0.7740818113088608, "num_tokens": 4937487.0, "step": 303 }, { "entropy": 0.5250164270401001, "epoch": 1.1345794392523365, "grad_norm": 0.02805374562740326, "learning_rate": 0.0002, "loss": 0.5292810797691345, "mean_token_accuracy": 0.7862300872802734, "num_tokens": 4953715.0, "step": 304 }, { "entropy": 0.5558099746704102, "epoch": 1.1383177570093457, "grad_norm": 0.028311913833022118, "learning_rate": 0.0002, "loss": 0.553642213344574, "mean_token_accuracy": 0.772954136133194, "num_tokens": 4970083.0, "step": 305 }, { "entropy": 0.552794486284256, "epoch": 1.142056074766355, "grad_norm": 0.02732912451028824, "learning_rate": 0.0002, "loss": 0.5542539358139038, "mean_token_accuracy": 0.7786157876253128, "num_tokens": 4986475.0, "step": 306 }, { "entropy": 0.541429802775383, "epoch": 1.1457943925233645, "grad_norm": 0.026043161749839783, "learning_rate": 0.0002, "loss": 0.54054194688797, "mean_token_accuracy": 0.779283881187439, "num_tokens": 5002946.0, "step": 307 }, { "entropy": 0.5385288000106812, "epoch": 1.1495327102803738, "grad_norm": 0.029000889509916306, "learning_rate": 0.0002, "loss": 0.5392960906028748, "mean_token_accuracy": 0.7790030539035797, "num_tokens": 5019257.0, "step": 308 }, { "entropy": 0.5650081187486649, "epoch": 1.1532710280373832, "grad_norm": 0.030966322869062424, "learning_rate": 0.0002, "loss": 0.5671533942222595, "mean_token_accuracy": 0.7687903195619583, "num_tokens": 5035694.0, "step": 309 }, { "entropy": 0.5269978791475296, "epoch": 1.1570093457943926, "grad_norm": 0.029498660936951637, "learning_rate": 0.0002, "loss": 0.5207559466362, "mean_token_accuracy": 0.789651021361351, "num_tokens": 5051896.0, "step": 310 }, { "entropy": 0.536905974149704, "epoch": 1.1607476635514018, "grad_norm": 0.030239341780543327, "learning_rate": 0.0002, "loss": 0.5469245910644531, "mean_token_accuracy": 0.7770659476518631, "num_tokens": 5068088.0, "step": 311 }, { "entropy": 0.5390781760215759, "epoch": 1.1644859813084112, "grad_norm": 0.03393058478832245, "learning_rate": 0.0002, "loss": 0.542595386505127, "mean_token_accuracy": 0.7818379998207092, "num_tokens": 5084518.0, "step": 312 }, { "entropy": 0.5539942681789398, "epoch": 1.1682242990654206, "grad_norm": 0.02896442450582981, "learning_rate": 0.0002, "loss": 0.5544940233230591, "mean_token_accuracy": 0.773167759180069, "num_tokens": 5101049.0, "step": 313 }, { "entropy": 0.5508127510547638, "epoch": 1.17196261682243, "grad_norm": 0.0290669035166502, "learning_rate": 0.0002, "loss": 0.5456743240356445, "mean_token_accuracy": 0.7797731012105942, "num_tokens": 5117401.0, "step": 314 }, { "entropy": 0.5471421480178833, "epoch": 1.1757009345794391, "grad_norm": 0.03175804764032364, "learning_rate": 0.0002, "loss": 0.547149658203125, "mean_token_accuracy": 0.7758717685937881, "num_tokens": 5133730.0, "step": 315 }, { "entropy": 0.5345856845378876, "epoch": 1.1794392523364485, "grad_norm": 0.030823305249214172, "learning_rate": 0.0002, "loss": 0.5330408215522766, "mean_token_accuracy": 0.784162163734436, "num_tokens": 5149933.0, "step": 316 }, { "entropy": 0.5622152835130692, "epoch": 1.183177570093458, "grad_norm": 0.035467732697725296, "learning_rate": 0.0002, "loss": 0.5626823902130127, "mean_token_accuracy": 0.7694768160581589, "num_tokens": 5166513.0, "step": 317 }, { "entropy": 0.5603054612874985, "epoch": 1.1869158878504673, "grad_norm": 0.03127942234277725, "learning_rate": 0.0002, "loss": 0.562260091304779, "mean_token_accuracy": 0.7705819606781006, "num_tokens": 5182789.0, "step": 318 }, { "entropy": 0.5313067883253098, "epoch": 1.1906542056074767, "grad_norm": 0.031915076076984406, "learning_rate": 0.0002, "loss": 0.535006046295166, "mean_token_accuracy": 0.7801574766635895, "num_tokens": 5198808.0, "step": 319 }, { "entropy": 0.5626082420349121, "epoch": 1.194392523364486, "grad_norm": 0.0270744226872921, "learning_rate": 0.0002, "loss": 0.5664738416671753, "mean_token_accuracy": 0.7685981392860413, "num_tokens": 5215173.0, "step": 320 }, { "entropy": 0.5448359251022339, "epoch": 1.1981308411214953, "grad_norm": 0.034068379551172256, "learning_rate": 0.0002, "loss": 0.5446659922599792, "mean_token_accuracy": 0.7786541432142258, "num_tokens": 5231488.0, "step": 321 }, { "entropy": 0.5552321374416351, "epoch": 1.2018691588785047, "grad_norm": 0.027504440397024155, "learning_rate": 0.0002, "loss": 0.5556068420410156, "mean_token_accuracy": 0.7737858295440674, "num_tokens": 5248043.0, "step": 322 }, { "entropy": 0.5611619353294373, "epoch": 1.205607476635514, "grad_norm": 0.0314825214445591, "learning_rate": 0.0002, "loss": 0.5585416555404663, "mean_token_accuracy": 0.7727329283952713, "num_tokens": 5264537.0, "step": 323 }, { "entropy": 0.539411261677742, "epoch": 1.2093457943925234, "grad_norm": 0.02891836315393448, "learning_rate": 0.0002, "loss": 0.542159378528595, "mean_token_accuracy": 0.7766279429197311, "num_tokens": 5280701.0, "step": 324 }, { "entropy": 0.5438771396875381, "epoch": 1.2130841121495326, "grad_norm": 0.030331527814269066, "learning_rate": 0.0002, "loss": 0.5439496040344238, "mean_token_accuracy": 0.7776656746864319, "num_tokens": 5297144.0, "step": 325 }, { "entropy": 0.5600438266992569, "epoch": 1.216822429906542, "grad_norm": 0.031427256762981415, "learning_rate": 0.0002, "loss": 0.5602800846099854, "mean_token_accuracy": 0.7731630206108093, "num_tokens": 5313519.0, "step": 326 }, { "entropy": 0.5613888651132584, "epoch": 1.2205607476635514, "grad_norm": 0.02703862637281418, "learning_rate": 0.0002, "loss": 0.5599865317344666, "mean_token_accuracy": 0.7733557522296906, "num_tokens": 5329856.0, "step": 327 }, { "entropy": 0.5237439274787903, "epoch": 1.2242990654205608, "grad_norm": 0.02758556418120861, "learning_rate": 0.0002, "loss": 0.5267841815948486, "mean_token_accuracy": 0.7867935001850128, "num_tokens": 5346177.0, "step": 328 }, { "entropy": 0.5669067651033401, "epoch": 1.2280373831775702, "grad_norm": 0.028242675587534904, "learning_rate": 0.0002, "loss": 0.5650265216827393, "mean_token_accuracy": 0.7703205198049545, "num_tokens": 5362512.0, "step": 329 }, { "entropy": 0.5509548783302307, "epoch": 1.2317757009345796, "grad_norm": 0.028802327811717987, "learning_rate": 0.0002, "loss": 0.5518352389335632, "mean_token_accuracy": 0.7750025242567062, "num_tokens": 5379024.0, "step": 330 }, { "entropy": 0.5300867408514023, "epoch": 1.2355140186915887, "grad_norm": 0.028508059680461884, "learning_rate": 0.0002, "loss": 0.5312294363975525, "mean_token_accuracy": 0.7825600951910019, "num_tokens": 5395474.0, "step": 331 }, { "entropy": 0.5559873282909393, "epoch": 1.2392523364485981, "grad_norm": 0.029974235221743584, "learning_rate": 0.0002, "loss": 0.5561782717704773, "mean_token_accuracy": 0.7731552422046661, "num_tokens": 5411674.0, "step": 332 }, { "entropy": 0.557199090719223, "epoch": 1.2429906542056075, "grad_norm": 0.03494254872202873, "learning_rate": 0.0002, "loss": 0.5579161643981934, "mean_token_accuracy": 0.7746251970529556, "num_tokens": 5428042.0, "step": 333 }, { "entropy": 0.5486237108707428, "epoch": 1.246728971962617, "grad_norm": 0.03307056799530983, "learning_rate": 0.0002, "loss": 0.547027587890625, "mean_token_accuracy": 0.7762673646211624, "num_tokens": 5444468.0, "step": 334 }, { "entropy": 0.5655098557472229, "epoch": 1.250467289719626, "grad_norm": 0.030658213421702385, "learning_rate": 0.0002, "loss": 0.5607244372367859, "mean_token_accuracy": 0.7719737142324448, "num_tokens": 5460943.0, "step": 335 }, { "entropy": 0.5550193935632706, "epoch": 1.2542056074766355, "grad_norm": 0.03245887532830238, "learning_rate": 0.0002, "loss": 0.558559775352478, "mean_token_accuracy": 0.7714462429285049, "num_tokens": 5477095.0, "step": 336 }, { "entropy": 0.5516159981489182, "epoch": 1.2579439252336448, "grad_norm": 0.029303548857569695, "learning_rate": 0.0002, "loss": 0.5509077310562134, "mean_token_accuracy": 0.7748865634202957, "num_tokens": 5493314.0, "step": 337 }, { "entropy": 0.5517037510871887, "epoch": 1.2616822429906542, "grad_norm": 0.030339522287249565, "learning_rate": 0.0002, "loss": 0.5531480312347412, "mean_token_accuracy": 0.7767991721630096, "num_tokens": 5509491.0, "step": 338 }, { "entropy": 0.5280565023422241, "epoch": 1.2654205607476636, "grad_norm": 0.031923625618219376, "learning_rate": 0.0002, "loss": 0.528035581111908, "mean_token_accuracy": 0.7852191030979156, "num_tokens": 5525691.0, "step": 339 }, { "entropy": 0.5340898633003235, "epoch": 1.269158878504673, "grad_norm": 0.029536927118897438, "learning_rate": 0.0002, "loss": 0.5422028303146362, "mean_token_accuracy": 0.7782081514596939, "num_tokens": 5541867.0, "step": 340 }, { "entropy": 0.5269799679517746, "epoch": 1.2728971962616822, "grad_norm": 0.028842000290751457, "learning_rate": 0.0002, "loss": 0.5262301564216614, "mean_token_accuracy": 0.7851875424385071, "num_tokens": 5558001.0, "step": 341 }, { "entropy": 0.5422883927822113, "epoch": 1.2766355140186916, "grad_norm": 0.03446980193257332, "learning_rate": 0.0002, "loss": 0.5427042245864868, "mean_token_accuracy": 0.7805773615837097, "num_tokens": 5574327.0, "step": 342 }, { "entropy": 0.5518148094415665, "epoch": 1.280373831775701, "grad_norm": 0.027705170214176178, "learning_rate": 0.0002, "loss": 0.5506993532180786, "mean_token_accuracy": 0.7755730003118515, "num_tokens": 5590749.0, "step": 343 }, { "entropy": 0.5408089458942413, "epoch": 1.2841121495327104, "grad_norm": 0.029695594683289528, "learning_rate": 0.0002, "loss": 0.5394558906555176, "mean_token_accuracy": 0.7792032957077026, "num_tokens": 5606965.0, "step": 344 }, { "entropy": 0.555278405547142, "epoch": 1.2878504672897195, "grad_norm": 0.03306727111339569, "learning_rate": 0.0002, "loss": 0.5528630018234253, "mean_token_accuracy": 0.7753221690654755, "num_tokens": 5623293.0, "step": 345 }, { "entropy": 0.5409073531627655, "epoch": 1.291588785046729, "grad_norm": 0.029820574447512627, "learning_rate": 0.0002, "loss": 0.5416831970214844, "mean_token_accuracy": 0.7789396792650223, "num_tokens": 5639449.0, "step": 346 }, { "entropy": 0.5428119450807571, "epoch": 1.2953271028037383, "grad_norm": 0.02653786540031433, "learning_rate": 0.0002, "loss": 0.5379306077957153, "mean_token_accuracy": 0.7808004468679428, "num_tokens": 5655647.0, "step": 347 }, { "entropy": 0.5534338802099228, "epoch": 1.2990654205607477, "grad_norm": 0.036522869020700455, "learning_rate": 0.0002, "loss": 0.5622379779815674, "mean_token_accuracy": 0.7683994024991989, "num_tokens": 5672013.0, "step": 348 }, { "entropy": 0.5302807092666626, "epoch": 1.302803738317757, "grad_norm": 0.029457183554768562, "learning_rate": 0.0002, "loss": 0.5294267535209656, "mean_token_accuracy": 0.7827122360467911, "num_tokens": 5688450.0, "step": 349 }, { "entropy": 0.5444758385419846, "epoch": 1.3065420560747665, "grad_norm": 0.029874974861741066, "learning_rate": 0.0002, "loss": 0.5353363752365112, "mean_token_accuracy": 0.7824759036302567, "num_tokens": 5705038.0, "step": 350 }, { "entropy": 0.5528301745653152, "epoch": 1.3102803738317756, "grad_norm": 0.029413780197501183, "learning_rate": 0.0002, "loss": 0.5467464923858643, "mean_token_accuracy": 0.7778250128030777, "num_tokens": 5721143.0, "step": 351 }, { "entropy": 0.5555091798305511, "epoch": 1.314018691588785, "grad_norm": 0.03153051435947418, "learning_rate": 0.0002, "loss": 0.5567013025283813, "mean_token_accuracy": 0.7745524048805237, "num_tokens": 5737899.0, "step": 352 }, { "entropy": 0.5499187856912613, "epoch": 1.3177570093457944, "grad_norm": 0.03486097231507301, "learning_rate": 0.0002, "loss": 0.5597171783447266, "mean_token_accuracy": 0.7737800478935242, "num_tokens": 5754281.0, "step": 353 }, { "entropy": 0.5655581057071686, "epoch": 1.3214953271028038, "grad_norm": 0.034320469945669174, "learning_rate": 0.0002, "loss": 0.5727288126945496, "mean_token_accuracy": 0.7656765133142471, "num_tokens": 5770770.0, "step": 354 }, { "entropy": 0.5538551807403564, "epoch": 1.325233644859813, "grad_norm": 0.03038712590932846, "learning_rate": 0.0002, "loss": 0.5568647384643555, "mean_token_accuracy": 0.7737635225057602, "num_tokens": 5787055.0, "step": 355 }, { "entropy": 0.5601113438606262, "epoch": 1.3289719626168224, "grad_norm": 0.02863963134586811, "learning_rate": 0.0002, "loss": 0.5530621409416199, "mean_token_accuracy": 0.7755090743303299, "num_tokens": 5803445.0, "step": 356 }, { "entropy": 0.5483526140451431, "epoch": 1.3327102803738318, "grad_norm": 0.03086850978434086, "learning_rate": 0.0002, "loss": 0.5400408506393433, "mean_token_accuracy": 0.7810002267360687, "num_tokens": 5819715.0, "step": 357 }, { "entropy": 0.5624817609786987, "epoch": 1.3364485981308412, "grad_norm": 0.027300981804728508, "learning_rate": 0.0002, "loss": 0.5635508894920349, "mean_token_accuracy": 0.768461674451828, "num_tokens": 5835943.0, "step": 358 }, { "entropy": 0.5395894348621368, "epoch": 1.3401869158878505, "grad_norm": 0.030900444835424423, "learning_rate": 0.0002, "loss": 0.544026255607605, "mean_token_accuracy": 0.7806333154439926, "num_tokens": 5852434.0, "step": 359 }, { "entropy": 0.5406174808740616, "epoch": 1.34392523364486, "grad_norm": 0.030813222751021385, "learning_rate": 0.0002, "loss": 0.545943021774292, "mean_token_accuracy": 0.7791963070631027, "num_tokens": 5868855.0, "step": 360 }, { "entropy": 0.5282687693834305, "epoch": 1.347663551401869, "grad_norm": 0.03219500184059143, "learning_rate": 0.0002, "loss": 0.5280976891517639, "mean_token_accuracy": 0.7882633060216904, "num_tokens": 5885162.0, "step": 361 }, { "entropy": 0.5588660687208176, "epoch": 1.3514018691588785, "grad_norm": 0.030664408579468727, "learning_rate": 0.0002, "loss": 0.5600679516792297, "mean_token_accuracy": 0.7683242410421371, "num_tokens": 5901397.0, "step": 362 }, { "entropy": 0.5558361262083054, "epoch": 1.355140186915888, "grad_norm": 0.029887903481721878, "learning_rate": 0.0002, "loss": 0.5512230396270752, "mean_token_accuracy": 0.7751856446266174, "num_tokens": 5917688.0, "step": 363 }, { "entropy": 0.5585273951292038, "epoch": 1.358878504672897, "grad_norm": 0.030291857197880745, "learning_rate": 0.0002, "loss": 0.5574408173561096, "mean_token_accuracy": 0.7735242694616318, "num_tokens": 5934252.0, "step": 364 }, { "entropy": 0.5426641255617142, "epoch": 1.3626168224299064, "grad_norm": 0.03163778409361839, "learning_rate": 0.0002, "loss": 0.5456237196922302, "mean_token_accuracy": 0.77604641020298, "num_tokens": 5950736.0, "step": 365 }, { "entropy": 0.5607275068759918, "epoch": 1.3663551401869158, "grad_norm": 0.02867417223751545, "learning_rate": 0.0002, "loss": 0.5595529079437256, "mean_token_accuracy": 0.773354560136795, "num_tokens": 5967130.0, "step": 366 }, { "entropy": 0.554174154996872, "epoch": 1.3700934579439252, "grad_norm": 0.03474622219800949, "learning_rate": 0.0002, "loss": 0.5513558387756348, "mean_token_accuracy": 0.7774477899074554, "num_tokens": 5983303.0, "step": 367 }, { "entropy": 0.5479168146848679, "epoch": 1.3738317757009346, "grad_norm": 0.03147226572036743, "learning_rate": 0.0002, "loss": 0.5468041300773621, "mean_token_accuracy": 0.7777006030082703, "num_tokens": 5999776.0, "step": 368 }, { "entropy": 0.5567852258682251, "epoch": 1.377570093457944, "grad_norm": 0.03519264608621597, "learning_rate": 0.0002, "loss": 0.5599963068962097, "mean_token_accuracy": 0.7709233462810516, "num_tokens": 6015938.0, "step": 369 }, { "entropy": 0.5587522089481354, "epoch": 1.3813084112149534, "grad_norm": 0.03433060646057129, "learning_rate": 0.0002, "loss": 0.5571247339248657, "mean_token_accuracy": 0.7718200087547302, "num_tokens": 6032196.0, "step": 370 }, { "entropy": 0.5337067395448685, "epoch": 1.3850467289719626, "grad_norm": 0.030834900215268135, "learning_rate": 0.0002, "loss": 0.5330364108085632, "mean_token_accuracy": 0.7854774743318558, "num_tokens": 6048415.0, "step": 371 }, { "entropy": 0.5485008955001831, "epoch": 1.388785046728972, "grad_norm": 0.038097940385341644, "learning_rate": 0.0002, "loss": 0.5500508546829224, "mean_token_accuracy": 0.775309219956398, "num_tokens": 6064562.0, "step": 372 }, { "entropy": 0.5520146042108536, "epoch": 1.3925233644859814, "grad_norm": 0.02676542103290558, "learning_rate": 0.0002, "loss": 0.546633243560791, "mean_token_accuracy": 0.7763903141021729, "num_tokens": 6080869.0, "step": 373 }, { "entropy": 0.5430674999952316, "epoch": 1.3962616822429905, "grad_norm": 0.0291767455637455, "learning_rate": 0.0002, "loss": 0.5384376049041748, "mean_token_accuracy": 0.7846493870019913, "num_tokens": 6096995.0, "step": 374 }, { "entropy": 0.543053463101387, "epoch": 1.4, "grad_norm": 0.031880684196949005, "learning_rate": 0.0002, "loss": 0.5416824817657471, "mean_token_accuracy": 0.7807471454143524, "num_tokens": 6113154.0, "step": 375 }, { "entropy": 0.555852085351944, "epoch": 1.4037383177570093, "grad_norm": 0.03215760365128517, "learning_rate": 0.0002, "loss": 0.5583543181419373, "mean_token_accuracy": 0.7724814862012863, "num_tokens": 6129602.0, "step": 376 }, { "entropy": 0.5323648005723953, "epoch": 1.4074766355140187, "grad_norm": 0.03375270590186119, "learning_rate": 0.0002, "loss": 0.5405369400978088, "mean_token_accuracy": 0.7804393470287323, "num_tokens": 6145766.0, "step": 377 }, { "entropy": 0.5550488829612732, "epoch": 1.411214953271028, "grad_norm": 0.029217012226581573, "learning_rate": 0.0002, "loss": 0.554684579372406, "mean_token_accuracy": 0.7745330631732941, "num_tokens": 6162201.0, "step": 378 }, { "entropy": 0.5482346266508102, "epoch": 1.4149532710280375, "grad_norm": 0.03129247948527336, "learning_rate": 0.0002, "loss": 0.5419821739196777, "mean_token_accuracy": 0.7780721634626389, "num_tokens": 6178420.0, "step": 379 }, { "entropy": 0.5605264604091644, "epoch": 1.4186915887850469, "grad_norm": 0.028088558465242386, "learning_rate": 0.0002, "loss": 0.5536739230155945, "mean_token_accuracy": 0.7760752588510513, "num_tokens": 6195017.0, "step": 380 }, { "entropy": 0.5308103561401367, "epoch": 1.422429906542056, "grad_norm": 0.03174047917127609, "learning_rate": 0.0002, "loss": 0.5348400473594666, "mean_token_accuracy": 0.7830243110656738, "num_tokens": 6211269.0, "step": 381 }, { "entropy": 0.5362233817577362, "epoch": 1.4261682242990654, "grad_norm": 0.03284025564789772, "learning_rate": 0.0002, "loss": 0.5401143431663513, "mean_token_accuracy": 0.7799562960863113, "num_tokens": 6227503.0, "step": 382 }, { "entropy": 0.5288970768451691, "epoch": 1.4299065420560748, "grad_norm": 0.03117184154689312, "learning_rate": 0.0002, "loss": 0.5347498655319214, "mean_token_accuracy": 0.7850797027349472, "num_tokens": 6243667.0, "step": 383 }, { "entropy": 0.5478838980197906, "epoch": 1.433644859813084, "grad_norm": 0.0355689711868763, "learning_rate": 0.0002, "loss": 0.5515888333320618, "mean_token_accuracy": 0.7750401347875595, "num_tokens": 6259958.0, "step": 384 }, { "entropy": 0.5556496828794479, "epoch": 1.4373831775700934, "grad_norm": 0.03252286836504936, "learning_rate": 0.0002, "loss": 0.5527741312980652, "mean_token_accuracy": 0.7747504711151123, "num_tokens": 6276256.0, "step": 385 }, { "entropy": 0.536173865199089, "epoch": 1.4411214953271028, "grad_norm": 0.03125045448541641, "learning_rate": 0.0002, "loss": 0.5389170050621033, "mean_token_accuracy": 0.7826138287782669, "num_tokens": 6292477.0, "step": 386 }, { "entropy": 0.5414228439331055, "epoch": 1.4448598130841122, "grad_norm": 0.029693089425563812, "learning_rate": 0.0002, "loss": 0.5456768870353699, "mean_token_accuracy": 0.7780184000730515, "num_tokens": 6308848.0, "step": 387 }, { "entropy": 0.5460960417985916, "epoch": 1.4485981308411215, "grad_norm": 0.028725288808345795, "learning_rate": 0.0002, "loss": 0.5453904867172241, "mean_token_accuracy": 0.7754503637552261, "num_tokens": 6325175.0, "step": 388 }, { "entropy": 0.5478474348783493, "epoch": 1.452336448598131, "grad_norm": 0.03158194199204445, "learning_rate": 0.0002, "loss": 0.5430905818939209, "mean_token_accuracy": 0.7789453864097595, "num_tokens": 6341307.0, "step": 389 }, { "entropy": 0.5458368062973022, "epoch": 1.45607476635514, "grad_norm": 0.02816491760313511, "learning_rate": 0.0002, "loss": 0.543704092502594, "mean_token_accuracy": 0.7792259007692337, "num_tokens": 6357858.0, "step": 390 }, { "entropy": 0.5392302572727203, "epoch": 1.4598130841121495, "grad_norm": 0.04157215729355812, "learning_rate": 0.0002, "loss": 0.544989287853241, "mean_token_accuracy": 0.7776051461696625, "num_tokens": 6373868.0, "step": 391 }, { "entropy": 0.5487792640924454, "epoch": 1.4635514018691589, "grad_norm": 0.03120332583785057, "learning_rate": 0.0002, "loss": 0.5500867962837219, "mean_token_accuracy": 0.7786511480808258, "num_tokens": 6390370.0, "step": 392 }, { "entropy": 0.5473900437355042, "epoch": 1.4672897196261683, "grad_norm": 0.03685331344604492, "learning_rate": 0.0002, "loss": 0.5516798496246338, "mean_token_accuracy": 0.7734636813402176, "num_tokens": 6406810.0, "step": 393 }, { "entropy": 0.5339369177818298, "epoch": 1.4710280373831774, "grad_norm": 0.031062059104442596, "learning_rate": 0.0002, "loss": 0.5277940034866333, "mean_token_accuracy": 0.7844891250133514, "num_tokens": 6423321.0, "step": 394 }, { "entropy": 0.5646286159753799, "epoch": 1.4747663551401868, "grad_norm": 0.03419705480337143, "learning_rate": 0.0002, "loss": 0.560526967048645, "mean_token_accuracy": 0.7742912471294403, "num_tokens": 6439751.0, "step": 395 }, { "entropy": 0.5566267520189285, "epoch": 1.4785046728971962, "grad_norm": 0.030112918466329575, "learning_rate": 0.0002, "loss": 0.551886796951294, "mean_token_accuracy": 0.7758849114179611, "num_tokens": 6456064.0, "step": 396 }, { "entropy": 0.5496308952569962, "epoch": 1.4822429906542056, "grad_norm": 0.029358550906181335, "learning_rate": 0.0002, "loss": 0.5503244400024414, "mean_token_accuracy": 0.779025211930275, "num_tokens": 6472168.0, "step": 397 }, { "entropy": 0.5490056574344635, "epoch": 1.485981308411215, "grad_norm": 0.03679414093494415, "learning_rate": 0.0002, "loss": 0.5532426834106445, "mean_token_accuracy": 0.77412910759449, "num_tokens": 6488701.0, "step": 398 }, { "entropy": 0.5552525818347931, "epoch": 1.4897196261682244, "grad_norm": 0.03460443392395973, "learning_rate": 0.0002, "loss": 0.5580930709838867, "mean_token_accuracy": 0.7725805938243866, "num_tokens": 6504913.0, "step": 399 }, { "entropy": 0.5486905574798584, "epoch": 1.4934579439252336, "grad_norm": 0.03757799416780472, "learning_rate": 0.0002, "loss": 0.5467075705528259, "mean_token_accuracy": 0.7737327963113785, "num_tokens": 6521159.0, "step": 400 }, { "entropy": 0.5667891502380371, "epoch": 1.497196261682243, "grad_norm": 0.0321633443236351, "learning_rate": 0.0002, "loss": 0.5584529042243958, "mean_token_accuracy": 0.7716430127620697, "num_tokens": 6537343.0, "step": 401 }, { "entropy": 0.560171589255333, "epoch": 1.5009345794392523, "grad_norm": 0.027958108112215996, "learning_rate": 0.0002, "loss": 0.5571039319038391, "mean_token_accuracy": 0.7695076316595078, "num_tokens": 6553654.0, "step": 402 }, { "entropy": 0.5325733348727226, "epoch": 1.5046728971962615, "grad_norm": 0.03109286166727543, "learning_rate": 0.0002, "loss": 0.5371490716934204, "mean_token_accuracy": 0.7818229347467422, "num_tokens": 6569830.0, "step": 403 }, { "entropy": 0.5464021414518356, "epoch": 1.508411214953271, "grad_norm": 0.033921979367733, "learning_rate": 0.0002, "loss": 0.5520694255828857, "mean_token_accuracy": 0.7737181484699249, "num_tokens": 6586181.0, "step": 404 }, { "entropy": 0.5360658913850784, "epoch": 1.5121495327102803, "grad_norm": 0.03216444328427315, "learning_rate": 0.0002, "loss": 0.539574921131134, "mean_token_accuracy": 0.7791631668806076, "num_tokens": 6602220.0, "step": 405 }, { "entropy": 0.5452992171049118, "epoch": 1.5158878504672897, "grad_norm": 0.02836962789297104, "learning_rate": 0.0002, "loss": 0.5482081174850464, "mean_token_accuracy": 0.7770387381315231, "num_tokens": 6618603.0, "step": 406 }, { "entropy": 0.5549522340297699, "epoch": 1.519626168224299, "grad_norm": 0.029138341546058655, "learning_rate": 0.0002, "loss": 0.5456300973892212, "mean_token_accuracy": 0.7779618352651596, "num_tokens": 6634957.0, "step": 407 }, { "entropy": 0.5506550967693329, "epoch": 1.5233644859813085, "grad_norm": 0.02889757789671421, "learning_rate": 0.0002, "loss": 0.5417683720588684, "mean_token_accuracy": 0.7772906571626663, "num_tokens": 6651192.0, "step": 408 }, { "entropy": 0.5641747862100601, "epoch": 1.5271028037383179, "grad_norm": 0.029291054233908653, "learning_rate": 0.0002, "loss": 0.5575106143951416, "mean_token_accuracy": 0.7736930400133133, "num_tokens": 6667351.0, "step": 409 }, { "entropy": 0.5569720417261124, "epoch": 1.5308411214953273, "grad_norm": 0.031217265874147415, "learning_rate": 0.0002, "loss": 0.5568684339523315, "mean_token_accuracy": 0.7742536216974258, "num_tokens": 6683766.0, "step": 410 }, { "entropy": 0.5555198639631271, "epoch": 1.5345794392523364, "grad_norm": 0.041470784693956375, "learning_rate": 0.0002, "loss": 0.5674223303794861, "mean_token_accuracy": 0.7700306624174118, "num_tokens": 6700296.0, "step": 411 }, { "entropy": 0.5609412640333176, "epoch": 1.5383177570093458, "grad_norm": 0.03198862448334694, "learning_rate": 0.0002, "loss": 0.5651755332946777, "mean_token_accuracy": 0.7717378437519073, "num_tokens": 6716475.0, "step": 412 }, { "entropy": 0.5559493005275726, "epoch": 1.542056074766355, "grad_norm": 0.029610617086291313, "learning_rate": 0.0002, "loss": 0.5465991497039795, "mean_token_accuracy": 0.7768793702125549, "num_tokens": 6732579.0, "step": 413 }, { "entropy": 0.5383591949939728, "epoch": 1.5457943925233644, "grad_norm": 0.03238457813858986, "learning_rate": 0.0002, "loss": 0.5351200699806213, "mean_token_accuracy": 0.7838361263275146, "num_tokens": 6748613.0, "step": 414 }, { "entropy": 0.5723170787096024, "epoch": 1.5495327102803738, "grad_norm": 0.03184224292635918, "learning_rate": 0.0002, "loss": 0.5706000328063965, "mean_token_accuracy": 0.7656203061342239, "num_tokens": 6764799.0, "step": 415 }, { "entropy": 0.5449900329113007, "epoch": 1.5532710280373832, "grad_norm": 0.03413036838173866, "learning_rate": 0.0002, "loss": 0.5444662570953369, "mean_token_accuracy": 0.7746504992246628, "num_tokens": 6781040.0, "step": 416 }, { "entropy": 0.5653754621744156, "epoch": 1.5570093457943925, "grad_norm": 0.03557061403989792, "learning_rate": 0.0002, "loss": 0.5661092400550842, "mean_token_accuracy": 0.7700045108795166, "num_tokens": 6797618.0, "step": 417 }, { "entropy": 0.5285668075084686, "epoch": 1.560747663551402, "grad_norm": 0.02898026816546917, "learning_rate": 0.0002, "loss": 0.5310862064361572, "mean_token_accuracy": 0.7867710143327713, "num_tokens": 6813889.0, "step": 418 }, { "entropy": 0.5591782182455063, "epoch": 1.5644859813084113, "grad_norm": 0.03489390015602112, "learning_rate": 0.0002, "loss": 0.559260368347168, "mean_token_accuracy": 0.7742950618267059, "num_tokens": 6830511.0, "step": 419 }, { "entropy": 0.5233039408922195, "epoch": 1.5682242990654207, "grad_norm": 0.031120121479034424, "learning_rate": 0.0002, "loss": 0.5304787158966064, "mean_token_accuracy": 0.7851588577032089, "num_tokens": 6846831.0, "step": 420 }, { "entropy": 0.5615075826644897, "epoch": 1.5719626168224299, "grad_norm": 0.032532718032598495, "learning_rate": 0.0002, "loss": 0.557915985584259, "mean_token_accuracy": 0.7756024897098541, "num_tokens": 6863482.0, "step": 421 }, { "entropy": 0.5608477592468262, "epoch": 1.5757009345794393, "grad_norm": 0.03193405270576477, "learning_rate": 0.0002, "loss": 0.5570778250694275, "mean_token_accuracy": 0.7736349552869797, "num_tokens": 6879744.0, "step": 422 }, { "entropy": 0.5420049726963043, "epoch": 1.5794392523364484, "grad_norm": 0.03341756388545036, "learning_rate": 0.0002, "loss": 0.5422099828720093, "mean_token_accuracy": 0.7786398679018021, "num_tokens": 6895998.0, "step": 423 }, { "entropy": 0.5501766800880432, "epoch": 1.5831775700934578, "grad_norm": 0.03080238774418831, "learning_rate": 0.0002, "loss": 0.543519139289856, "mean_token_accuracy": 0.779445543885231, "num_tokens": 6912350.0, "step": 424 }, { "entropy": 0.5548175424337387, "epoch": 1.5869158878504672, "grad_norm": 0.029699817299842834, "learning_rate": 0.0002, "loss": 0.554355263710022, "mean_token_accuracy": 0.7715099602937698, "num_tokens": 6928868.0, "step": 425 }, { "entropy": 0.5445838496088982, "epoch": 1.5906542056074766, "grad_norm": 0.03310444578528404, "learning_rate": 0.0002, "loss": 0.5509841442108154, "mean_token_accuracy": 0.7749770432710648, "num_tokens": 6945115.0, "step": 426 }, { "entropy": 0.5508389323949814, "epoch": 1.594392523364486, "grad_norm": 0.03343511372804642, "learning_rate": 0.0002, "loss": 0.5527422428131104, "mean_token_accuracy": 0.7760582268238068, "num_tokens": 6961606.0, "step": 427 }, { "entropy": 0.5455803871154785, "epoch": 1.5981308411214954, "grad_norm": 0.030003823339939117, "learning_rate": 0.0002, "loss": 0.5433002710342407, "mean_token_accuracy": 0.7772544771432877, "num_tokens": 6977721.0, "step": 428 }, { "entropy": 0.542354941368103, "epoch": 1.6018691588785048, "grad_norm": 0.02921188622713089, "learning_rate": 0.0002, "loss": 0.5396295785903931, "mean_token_accuracy": 0.7784738689661026, "num_tokens": 6994015.0, "step": 429 }, { "entropy": 0.5403562635183334, "epoch": 1.6056074766355142, "grad_norm": 0.03267091140151024, "learning_rate": 0.0002, "loss": 0.5412419438362122, "mean_token_accuracy": 0.7828981131315231, "num_tokens": 7010256.0, "step": 430 }, { "entropy": 0.5418384820222855, "epoch": 1.6093457943925233, "grad_norm": 0.03328794986009598, "learning_rate": 0.0002, "loss": 0.5415868163108826, "mean_token_accuracy": 0.7787100970745087, "num_tokens": 7026538.0, "step": 431 }, { "entropy": 0.5569044798612595, "epoch": 1.6130841121495327, "grad_norm": 0.03399523347616196, "learning_rate": 0.0002, "loss": 0.5610039830207825, "mean_token_accuracy": 0.7681904435157776, "num_tokens": 7042821.0, "step": 432 }, { "entropy": 0.5516158491373062, "epoch": 1.616822429906542, "grad_norm": 0.041675642132759094, "learning_rate": 0.0002, "loss": 0.5512884855270386, "mean_token_accuracy": 0.7792385816574097, "num_tokens": 7059278.0, "step": 433 }, { "entropy": 0.5493542701005936, "epoch": 1.6205607476635513, "grad_norm": 0.029840141534805298, "learning_rate": 0.0002, "loss": 0.5508259534835815, "mean_token_accuracy": 0.7764638513326645, "num_tokens": 7075675.0, "step": 434 }, { "entropy": 0.5415777564048767, "epoch": 1.6242990654205607, "grad_norm": 0.04138097167015076, "learning_rate": 0.0002, "loss": 0.540780246257782, "mean_token_accuracy": 0.7806251496076584, "num_tokens": 7091803.0, "step": 435 }, { "entropy": 0.5550828725099564, "epoch": 1.62803738317757, "grad_norm": 0.03500202298164368, "learning_rate": 0.0002, "loss": 0.5536463856697083, "mean_token_accuracy": 0.7767235636711121, "num_tokens": 7108257.0, "step": 436 }, { "entropy": 0.5612530559301376, "epoch": 1.6317757009345795, "grad_norm": 0.029145153239369392, "learning_rate": 0.0002, "loss": 0.5608190894126892, "mean_token_accuracy": 0.7731182426214218, "num_tokens": 7124785.0, "step": 437 }, { "entropy": 0.5527195036411285, "epoch": 1.6355140186915889, "grad_norm": 0.035749297589063644, "learning_rate": 0.0002, "loss": 0.5629845857620239, "mean_token_accuracy": 0.7721443176269531, "num_tokens": 7141265.0, "step": 438 }, { "entropy": 0.5614519417285919, "epoch": 1.6392523364485982, "grad_norm": 0.033001191914081573, "learning_rate": 0.0002, "loss": 0.5560024976730347, "mean_token_accuracy": 0.7749044448137283, "num_tokens": 7157859.0, "step": 439 }, { "entropy": 0.5537575930356979, "epoch": 1.6429906542056076, "grad_norm": 0.026474064216017723, "learning_rate": 0.0002, "loss": 0.5511392951011658, "mean_token_accuracy": 0.7752827405929565, "num_tokens": 7174159.0, "step": 440 }, { "entropy": 0.5490387231111526, "epoch": 1.6467289719626168, "grad_norm": 0.03137727826833725, "learning_rate": 0.0002, "loss": 0.5470349192619324, "mean_token_accuracy": 0.7756170034408569, "num_tokens": 7190518.0, "step": 441 }, { "entropy": 0.5602337867021561, "epoch": 1.6504672897196262, "grad_norm": 0.0327768549323082, "learning_rate": 0.0002, "loss": 0.5596269369125366, "mean_token_accuracy": 0.7712970525026321, "num_tokens": 7206832.0, "step": 442 }, { "entropy": 0.5407531261444092, "epoch": 1.6542056074766354, "grad_norm": 0.0337577648460865, "learning_rate": 0.0002, "loss": 0.5448312759399414, "mean_token_accuracy": 0.7795456647872925, "num_tokens": 7222967.0, "step": 443 }, { "entropy": 0.5409540086984634, "epoch": 1.6579439252336448, "grad_norm": 0.03192588686943054, "learning_rate": 0.0002, "loss": 0.5484352111816406, "mean_token_accuracy": 0.7764406651258469, "num_tokens": 7239342.0, "step": 444 }, { "entropy": 0.5369711667299271, "epoch": 1.6616822429906541, "grad_norm": 0.029282715171575546, "learning_rate": 0.0002, "loss": 0.5391625165939331, "mean_token_accuracy": 0.7777595669031143, "num_tokens": 7255685.0, "step": 445 }, { "entropy": 0.5320119112730026, "epoch": 1.6654205607476635, "grad_norm": 0.03132037818431854, "learning_rate": 0.0002, "loss": 0.5324081182479858, "mean_token_accuracy": 0.7831796556711197, "num_tokens": 7271873.0, "step": 446 }, { "entropy": 0.5473773181438446, "epoch": 1.669158878504673, "grad_norm": 0.029359478503465652, "learning_rate": 0.0002, "loss": 0.5430581569671631, "mean_token_accuracy": 0.780887171626091, "num_tokens": 7288229.0, "step": 447 }, { "entropy": 0.5577313005924225, "epoch": 1.6728971962616823, "grad_norm": 0.0312592051923275, "learning_rate": 0.0002, "loss": 0.5549578070640564, "mean_token_accuracy": 0.7755182534456253, "num_tokens": 7304562.0, "step": 448 }, { "entropy": 0.5430529564619064, "epoch": 1.6766355140186917, "grad_norm": 0.036848753690719604, "learning_rate": 0.0002, "loss": 0.5486578941345215, "mean_token_accuracy": 0.7793130427598953, "num_tokens": 7320789.0, "step": 449 }, { "entropy": 0.5367421358823776, "epoch": 1.680373831775701, "grad_norm": 0.03133554011583328, "learning_rate": 0.0002, "loss": 0.5428006649017334, "mean_token_accuracy": 0.7791069746017456, "num_tokens": 7336720.0, "step": 450 }, { "entropy": 0.5608862638473511, "epoch": 1.6841121495327103, "grad_norm": 0.033135656267404556, "learning_rate": 0.0002, "loss": 0.5513461828231812, "mean_token_accuracy": 0.7747347801923752, "num_tokens": 7353115.0, "step": 451 }, { "entropy": 0.5476694256067276, "epoch": 1.6878504672897197, "grad_norm": 0.02974470518529415, "learning_rate": 0.0002, "loss": 0.5473049879074097, "mean_token_accuracy": 0.7776686698198318, "num_tokens": 7369302.0, "step": 452 }, { "entropy": 0.5416230708360672, "epoch": 1.6915887850467288, "grad_norm": 0.0338185578584671, "learning_rate": 0.0002, "loss": 0.5420779585838318, "mean_token_accuracy": 0.7770841121673584, "num_tokens": 7385486.0, "step": 453 }, { "entropy": 0.5354430079460144, "epoch": 1.6953271028037382, "grad_norm": 0.04928300157189369, "learning_rate": 0.0002, "loss": 0.5383298397064209, "mean_token_accuracy": 0.7825010567903519, "num_tokens": 7401834.0, "step": 454 }, { "entropy": 0.5533457249403, "epoch": 1.6990654205607476, "grad_norm": 0.03868211433291435, "learning_rate": 0.0002, "loss": 0.5589519739151001, "mean_token_accuracy": 0.7741620242595673, "num_tokens": 7418328.0, "step": 455 }, { "entropy": 0.5337075442075729, "epoch": 1.702803738317757, "grad_norm": 0.03012922592461109, "learning_rate": 0.0002, "loss": 0.5302947163581848, "mean_token_accuracy": 0.7835781127214432, "num_tokens": 7434426.0, "step": 456 }, { "entropy": 0.5648263692855835, "epoch": 1.7065420560747664, "grad_norm": 0.028873439878225327, "learning_rate": 0.0002, "loss": 0.5585320591926575, "mean_token_accuracy": 0.7732219845056534, "num_tokens": 7451036.0, "step": 457 }, { "entropy": 0.5839773565530777, "epoch": 1.7102803738317758, "grad_norm": 0.033153235912323, "learning_rate": 0.0002, "loss": 0.5761073231697083, "mean_token_accuracy": 0.7669852823019028, "num_tokens": 7467359.0, "step": 458 }, { "entropy": 0.5488205403089523, "epoch": 1.7140186915887852, "grad_norm": 0.032065052539110184, "learning_rate": 0.0002, "loss": 0.5483813285827637, "mean_token_accuracy": 0.7763916105031967, "num_tokens": 7483649.0, "step": 459 }, { "entropy": 0.5411174297332764, "epoch": 1.7177570093457943, "grad_norm": 0.0323743000626564, "learning_rate": 0.0002, "loss": 0.5461615920066833, "mean_token_accuracy": 0.7778149843215942, "num_tokens": 7500070.0, "step": 460 }, { "entropy": 0.533783033490181, "epoch": 1.7214953271028037, "grad_norm": 0.03367235139012337, "learning_rate": 0.0002, "loss": 0.5427653193473816, "mean_token_accuracy": 0.7805494964122772, "num_tokens": 7516529.0, "step": 461 }, { "entropy": 0.5454732924699783, "epoch": 1.7252336448598131, "grad_norm": 0.034071460366249084, "learning_rate": 0.0002, "loss": 0.5546566247940063, "mean_token_accuracy": 0.7736624777317047, "num_tokens": 7533025.0, "step": 462 }, { "entropy": 0.5454118698835373, "epoch": 1.7289719626168223, "grad_norm": 0.03127819299697876, "learning_rate": 0.0002, "loss": 0.5452259182929993, "mean_token_accuracy": 0.7759493589401245, "num_tokens": 7549482.0, "step": 463 }, { "entropy": 0.5667081475257874, "epoch": 1.7327102803738317, "grad_norm": 0.0311261173337698, "learning_rate": 0.0002, "loss": 0.5610095858573914, "mean_token_accuracy": 0.772314265370369, "num_tokens": 7565748.0, "step": 464 }, { "entropy": 0.5310934036970139, "epoch": 1.736448598130841, "grad_norm": 0.03265678882598877, "learning_rate": 0.0002, "loss": 0.5214373469352722, "mean_token_accuracy": 0.7887950539588928, "num_tokens": 7582052.0, "step": 465 }, { "entropy": 0.5556392967700958, "epoch": 1.7401869158878505, "grad_norm": 0.03034058026969433, "learning_rate": 0.0002, "loss": 0.5505704283714294, "mean_token_accuracy": 0.7774366736412048, "num_tokens": 7598174.0, "step": 466 }, { "entropy": 0.5393192917108536, "epoch": 1.7439252336448599, "grad_norm": 0.0359746590256691, "learning_rate": 0.0002, "loss": 0.5477877259254456, "mean_token_accuracy": 0.7797855734825134, "num_tokens": 7614503.0, "step": 467 }, { "entropy": 0.551783487200737, "epoch": 1.7476635514018692, "grad_norm": 0.03548724204301834, "learning_rate": 0.0002, "loss": 0.5540840029716492, "mean_token_accuracy": 0.7747608870267868, "num_tokens": 7630814.0, "step": 468 }, { "entropy": 0.5413367450237274, "epoch": 1.7514018691588786, "grad_norm": 0.034123897552490234, "learning_rate": 0.0002, "loss": 0.5470243692398071, "mean_token_accuracy": 0.779376894235611, "num_tokens": 7647376.0, "step": 469 }, { "entropy": 0.5412023663520813, "epoch": 1.7551401869158878, "grad_norm": 0.03561440855264664, "learning_rate": 0.0002, "loss": 0.5472733378410339, "mean_token_accuracy": 0.7762201726436615, "num_tokens": 7663345.0, "step": 470 }, { "entropy": 0.549220860004425, "epoch": 1.7588785046728972, "grad_norm": 0.02905275858938694, "learning_rate": 0.0002, "loss": 0.541520893573761, "mean_token_accuracy": 0.7792876809835434, "num_tokens": 7679585.0, "step": 471 }, { "entropy": 0.5333058834075928, "epoch": 1.7626168224299066, "grad_norm": 0.03320024162530899, "learning_rate": 0.0002, "loss": 0.5264161229133606, "mean_token_accuracy": 0.7870939522981644, "num_tokens": 7695719.0, "step": 472 }, { "entropy": 0.5468353033065796, "epoch": 1.7663551401869158, "grad_norm": 0.03256339579820633, "learning_rate": 0.0002, "loss": 0.5458404421806335, "mean_token_accuracy": 0.778706505894661, "num_tokens": 7711803.0, "step": 473 }, { "entropy": 0.536187469959259, "epoch": 1.7700934579439251, "grad_norm": 0.03339603543281555, "learning_rate": 0.0002, "loss": 0.5392374992370605, "mean_token_accuracy": 0.7822528183460236, "num_tokens": 7728002.0, "step": 474 }, { "entropy": 0.5286234319210052, "epoch": 1.7738317757009345, "grad_norm": 0.033285900950431824, "learning_rate": 0.0002, "loss": 0.5358365774154663, "mean_token_accuracy": 0.7836114317178726, "num_tokens": 7744366.0, "step": 475 }, { "entropy": 0.5403973311185837, "epoch": 1.777570093457944, "grad_norm": 0.028936821967363358, "learning_rate": 0.0002, "loss": 0.5398406386375427, "mean_token_accuracy": 0.7814478874206543, "num_tokens": 7760549.0, "step": 476 }, { "entropy": 0.5419041812419891, "epoch": 1.7813084112149533, "grad_norm": 0.03836261108517647, "learning_rate": 0.0002, "loss": 0.5494267344474792, "mean_token_accuracy": 0.775143027305603, "num_tokens": 7776621.0, "step": 477 }, { "entropy": 0.5589816868305206, "epoch": 1.7850467289719627, "grad_norm": 0.03261716663837433, "learning_rate": 0.0002, "loss": 0.5496556758880615, "mean_token_accuracy": 0.775287851691246, "num_tokens": 7792949.0, "step": 478 }, { "entropy": 0.5772902369499207, "epoch": 1.788785046728972, "grad_norm": 0.03729069605469704, "learning_rate": 0.0002, "loss": 0.5730117559432983, "mean_token_accuracy": 0.7676824629306793, "num_tokens": 7809233.0, "step": 479 }, { "entropy": 0.5505616068840027, "epoch": 1.7925233644859813, "grad_norm": 0.0271653700619936, "learning_rate": 0.0002, "loss": 0.5481145977973938, "mean_token_accuracy": 0.7766467928886414, "num_tokens": 7825604.0, "step": 480 }, { "entropy": 0.5539548844099045, "epoch": 1.7962616822429907, "grad_norm": 0.035687919706106186, "learning_rate": 0.0002, "loss": 0.5536059737205505, "mean_token_accuracy": 0.7723885625600815, "num_tokens": 7841764.0, "step": 481 }, { "entropy": 0.548996701836586, "epoch": 1.8, "grad_norm": 0.03167950361967087, "learning_rate": 0.0002, "loss": 0.5525107383728027, "mean_token_accuracy": 0.7743307799100876, "num_tokens": 7857918.0, "step": 482 }, { "entropy": 0.5371337532997131, "epoch": 1.8037383177570092, "grad_norm": 0.03125729039311409, "learning_rate": 0.0002, "loss": 0.5431434512138367, "mean_token_accuracy": 0.7770611643791199, "num_tokens": 7874375.0, "step": 483 }, { "entropy": 0.5534856170415878, "epoch": 1.8074766355140186, "grad_norm": 0.03495310619473457, "learning_rate": 0.0002, "loss": 0.5606104731559753, "mean_token_accuracy": 0.7701490819454193, "num_tokens": 7890503.0, "step": 484 }, { "entropy": 0.5570873767137527, "epoch": 1.811214953271028, "grad_norm": 0.031059635803103447, "learning_rate": 0.0002, "loss": 0.5577523112297058, "mean_token_accuracy": 0.7766271531581879, "num_tokens": 7906740.0, "step": 485 }, { "entropy": 0.549734815955162, "epoch": 1.8149532710280374, "grad_norm": 0.029658785089850426, "learning_rate": 0.0002, "loss": 0.5459674000740051, "mean_token_accuracy": 0.778388187289238, "num_tokens": 7923366.0, "step": 486 }, { "entropy": 0.556487500667572, "epoch": 1.8186915887850468, "grad_norm": 0.03030308522284031, "learning_rate": 0.0002, "loss": 0.5487005710601807, "mean_token_accuracy": 0.7778837084770203, "num_tokens": 7939678.0, "step": 487 }, { "entropy": 0.5620574653148651, "epoch": 1.8224299065420562, "grad_norm": 0.03321143984794617, "learning_rate": 0.0002, "loss": 0.5632344484329224, "mean_token_accuracy": 0.771716520190239, "num_tokens": 7955824.0, "step": 488 }, { "entropy": 0.5325201749801636, "epoch": 1.8261682242990656, "grad_norm": 0.0296145249158144, "learning_rate": 0.0002, "loss": 0.5337831377983093, "mean_token_accuracy": 0.7806598991155624, "num_tokens": 7971945.0, "step": 489 }, { "entropy": 0.5530183613300323, "epoch": 1.8299065420560747, "grad_norm": 0.04490596428513527, "learning_rate": 0.0002, "loss": 0.5658998489379883, "mean_token_accuracy": 0.7682041078805923, "num_tokens": 7988395.0, "step": 490 }, { "entropy": 0.540508821606636, "epoch": 1.8336448598130841, "grad_norm": 0.03253109008073807, "learning_rate": 0.0002, "loss": 0.5402263402938843, "mean_token_accuracy": 0.7800282388925552, "num_tokens": 8004443.0, "step": 491 }, { "entropy": 0.5511161684989929, "epoch": 1.8373831775700935, "grad_norm": 0.030638035386800766, "learning_rate": 0.0002, "loss": 0.5421851277351379, "mean_token_accuracy": 0.7774636000394821, "num_tokens": 8020850.0, "step": 492 }, { "entropy": 0.5710225850343704, "epoch": 1.8411214953271027, "grad_norm": 0.029152031987905502, "learning_rate": 0.0002, "loss": 0.5603572130203247, "mean_token_accuracy": 0.7699873447418213, "num_tokens": 8037043.0, "step": 493 }, { "entropy": 0.5580283105373383, "epoch": 1.844859813084112, "grad_norm": 0.030489208176732063, "learning_rate": 0.0002, "loss": 0.5527392625808716, "mean_token_accuracy": 0.7742099016904831, "num_tokens": 8053631.0, "step": 494 }, { "entropy": 0.5568618625402451, "epoch": 1.8485981308411215, "grad_norm": 0.03116370178759098, "learning_rate": 0.0002, "loss": 0.557203471660614, "mean_token_accuracy": 0.7757259756326675, "num_tokens": 8069679.0, "step": 495 }, { "entropy": 0.5572323054075241, "epoch": 1.8523364485981308, "grad_norm": 0.03199765831232071, "learning_rate": 0.0002, "loss": 0.5623334646224976, "mean_token_accuracy": 0.7726736217737198, "num_tokens": 8086185.0, "step": 496 }, { "entropy": 0.5608405023813248, "epoch": 1.8560747663551402, "grad_norm": 0.03123069368302822, "learning_rate": 0.0002, "loss": 0.5668354630470276, "mean_token_accuracy": 0.7697951197624207, "num_tokens": 8102680.0, "step": 497 }, { "entropy": 0.5482483208179474, "epoch": 1.8598130841121496, "grad_norm": 0.03388088196516037, "learning_rate": 0.0002, "loss": 0.5544660091400146, "mean_token_accuracy": 0.7736243009567261, "num_tokens": 8119206.0, "step": 498 }, { "entropy": 0.5743024945259094, "epoch": 1.863551401869159, "grad_norm": 0.027546290308237076, "learning_rate": 0.0002, "loss": 0.5691558718681335, "mean_token_accuracy": 0.7669505923986435, "num_tokens": 8135686.0, "step": 499 }, { "entropy": 0.5571306794881821, "epoch": 1.8672897196261682, "grad_norm": 0.03095332719385624, "learning_rate": 0.0002, "loss": 0.5527883172035217, "mean_token_accuracy": 0.7751508802175522, "num_tokens": 8151938.0, "step": 500 }, { "entropy": 0.5444643199443817, "epoch": 1.8710280373831776, "grad_norm": 0.03176809847354889, "learning_rate": 0.0002, "loss": 0.5450653433799744, "mean_token_accuracy": 0.7778386175632477, "num_tokens": 8168369.0, "step": 501 }, { "entropy": 0.5318097025156021, "epoch": 1.874766355140187, "grad_norm": 0.03216860815882683, "learning_rate": 0.0002, "loss": 0.5350679159164429, "mean_token_accuracy": 0.7839819490909576, "num_tokens": 8184441.0, "step": 502 }, { "entropy": 0.5431730151176453, "epoch": 1.8785046728971961, "grad_norm": 0.031609971076250076, "learning_rate": 0.0002, "loss": 0.5454133152961731, "mean_token_accuracy": 0.7757967710494995, "num_tokens": 8200701.0, "step": 503 }, { "entropy": 0.5446748435497284, "epoch": 1.8822429906542055, "grad_norm": 0.03689466044306755, "learning_rate": 0.0002, "loss": 0.5491172075271606, "mean_token_accuracy": 0.7771103084087372, "num_tokens": 8216896.0, "step": 504 }, { "entropy": 0.5379506647586823, "epoch": 1.885981308411215, "grad_norm": 0.03774857521057129, "learning_rate": 0.0002, "loss": 0.5465993881225586, "mean_token_accuracy": 0.7745991945266724, "num_tokens": 8233119.0, "step": 505 }, { "entropy": 0.5524174273014069, "epoch": 1.8897196261682243, "grad_norm": 0.03127999231219292, "learning_rate": 0.0002, "loss": 0.552331268787384, "mean_token_accuracy": 0.7734175026416779, "num_tokens": 8249424.0, "step": 506 }, { "entropy": 0.5634707659482956, "epoch": 1.8934579439252337, "grad_norm": 0.03172188624739647, "learning_rate": 0.0002, "loss": 0.5552417039871216, "mean_token_accuracy": 0.7762156277894974, "num_tokens": 8265823.0, "step": 507 }, { "entropy": 0.5733916610479355, "epoch": 1.897196261682243, "grad_norm": 0.041391924023628235, "learning_rate": 0.0002, "loss": 0.5685185790061951, "mean_token_accuracy": 0.7656967639923096, "num_tokens": 8282150.0, "step": 508 }, { "entropy": 0.5633519440889359, "epoch": 1.9009345794392525, "grad_norm": 0.03210509195923805, "learning_rate": 0.0002, "loss": 0.5575313568115234, "mean_token_accuracy": 0.7736276984214783, "num_tokens": 8298545.0, "step": 509 }, { "entropy": 0.5282728672027588, "epoch": 1.9046728971962616, "grad_norm": 0.031000696122646332, "learning_rate": 0.0002, "loss": 0.5271653532981873, "mean_token_accuracy": 0.7857028245925903, "num_tokens": 8314750.0, "step": 510 }, { "entropy": 0.5598197877407074, "epoch": 1.908411214953271, "grad_norm": 0.03814297169446945, "learning_rate": 0.0002, "loss": 0.5556469559669495, "mean_token_accuracy": 0.7734071165323257, "num_tokens": 8331160.0, "step": 511 }, { "entropy": 0.5301484763622284, "epoch": 1.9121495327102802, "grad_norm": 0.03675490617752075, "learning_rate": 0.0002, "loss": 0.5384268760681152, "mean_token_accuracy": 0.7815950363874435, "num_tokens": 8347524.0, "step": 512 }, { "entropy": 0.556285485625267, "epoch": 1.9158878504672896, "grad_norm": 0.03204094246029854, "learning_rate": 0.0002, "loss": 0.5582637190818787, "mean_token_accuracy": 0.7725251466035843, "num_tokens": 8363738.0, "step": 513 }, { "entropy": 0.5535630583763123, "epoch": 1.919626168224299, "grad_norm": 0.030629510059952736, "learning_rate": 0.0002, "loss": 0.5578333735466003, "mean_token_accuracy": 0.7727056741714478, "num_tokens": 8380122.0, "step": 514 }, { "entropy": 0.5471296161413193, "epoch": 1.9233644859813084, "grad_norm": 0.03401264175772667, "learning_rate": 0.0002, "loss": 0.5535186529159546, "mean_token_accuracy": 0.7754651010036469, "num_tokens": 8396440.0, "step": 515 }, { "entropy": 0.5500332862138748, "epoch": 1.9271028037383178, "grad_norm": 0.03108939900994301, "learning_rate": 0.0002, "loss": 0.5485121607780457, "mean_token_accuracy": 0.7769151926040649, "num_tokens": 8412740.0, "step": 516 }, { "entropy": 0.5605651885271072, "epoch": 1.9308411214953272, "grad_norm": 0.028515921905636787, "learning_rate": 0.0002, "loss": 0.5516760349273682, "mean_token_accuracy": 0.7752381414175034, "num_tokens": 8429081.0, "step": 517 }, { "entropy": 0.5527090132236481, "epoch": 1.9345794392523366, "grad_norm": 0.032440509647130966, "learning_rate": 0.0002, "loss": 0.5482094883918762, "mean_token_accuracy": 0.776523694396019, "num_tokens": 8445459.0, "step": 518 }, { "entropy": 0.5639519840478897, "epoch": 1.938317757009346, "grad_norm": 0.03387531265616417, "learning_rate": 0.0002, "loss": 0.565314769744873, "mean_token_accuracy": 0.7686825692653656, "num_tokens": 8461834.0, "step": 519 }, { "entropy": 0.5390266180038452, "epoch": 1.9420560747663551, "grad_norm": 0.02882574312388897, "learning_rate": 0.0002, "loss": 0.5430452823638916, "mean_token_accuracy": 0.7774745523929596, "num_tokens": 8478272.0, "step": 520 }, { "entropy": 0.5343397557735443, "epoch": 1.9457943925233645, "grad_norm": 0.030860040336847305, "learning_rate": 0.0002, "loss": 0.5347194075584412, "mean_token_accuracy": 0.7817697376012802, "num_tokens": 8494437.0, "step": 521 }, { "entropy": 0.5492627769708633, "epoch": 1.9495327102803737, "grad_norm": 0.03405896574258804, "learning_rate": 0.0002, "loss": 0.5500932335853577, "mean_token_accuracy": 0.7765759974718094, "num_tokens": 8510975.0, "step": 522 }, { "entropy": 0.5563263446092606, "epoch": 1.953271028037383, "grad_norm": 0.03141237422823906, "learning_rate": 0.0002, "loss": 0.557966947555542, "mean_token_accuracy": 0.7717025876045227, "num_tokens": 8527347.0, "step": 523 }, { "entropy": 0.5636772364377975, "epoch": 1.9570093457943925, "grad_norm": 0.03168516606092453, "learning_rate": 0.0002, "loss": 0.5611008405685425, "mean_token_accuracy": 0.7714557945728302, "num_tokens": 8543551.0, "step": 524 }, { "entropy": 0.5489466190338135, "epoch": 1.9607476635514018, "grad_norm": 0.03355073928833008, "learning_rate": 0.0002, "loss": 0.5395604372024536, "mean_token_accuracy": 0.7807340919971466, "num_tokens": 8559955.0, "step": 525 }, { "entropy": 0.5399315655231476, "epoch": 1.9644859813084112, "grad_norm": 0.03453009948134422, "learning_rate": 0.0002, "loss": 0.5348931550979614, "mean_token_accuracy": 0.7806299477815628, "num_tokens": 8576469.0, "step": 526 }, { "entropy": 0.5491375476121902, "epoch": 1.9682242990654206, "grad_norm": 0.0316200815141201, "learning_rate": 0.0002, "loss": 0.5556234121322632, "mean_token_accuracy": 0.773221030831337, "num_tokens": 8592906.0, "step": 527 }, { "entropy": 0.5373014956712723, "epoch": 1.97196261682243, "grad_norm": 0.032452452927827835, "learning_rate": 0.0002, "loss": 0.5457467436790466, "mean_token_accuracy": 0.7758653908967972, "num_tokens": 8609100.0, "step": 528 }, { "entropy": 0.5414352118968964, "epoch": 1.9757009345794394, "grad_norm": 0.03351645544171333, "learning_rate": 0.0002, "loss": 0.5482410788536072, "mean_token_accuracy": 0.7752601951360703, "num_tokens": 8625316.0, "step": 529 }, { "entropy": 0.5407055169343948, "epoch": 1.9794392523364486, "grad_norm": 0.03003384917974472, "learning_rate": 0.0002, "loss": 0.5356785655021667, "mean_token_accuracy": 0.7822994440793991, "num_tokens": 8641716.0, "step": 530 }, { "entropy": 0.5463829636573792, "epoch": 1.983177570093458, "grad_norm": 0.028586186468601227, "learning_rate": 0.0002, "loss": 0.5386159420013428, "mean_token_accuracy": 0.7832934260368347, "num_tokens": 8658117.0, "step": 531 }, { "entropy": 0.52997986972332, "epoch": 1.9869158878504671, "grad_norm": 0.03231372311711311, "learning_rate": 0.0002, "loss": 0.5258426666259766, "mean_token_accuracy": 0.786494106054306, "num_tokens": 8674098.0, "step": 532 }, { "entropy": 0.5263413488864899, "epoch": 1.9906542056074765, "grad_norm": 0.029255473986268044, "learning_rate": 0.0002, "loss": 0.5267069935798645, "mean_token_accuracy": 0.784383550286293, "num_tokens": 8690474.0, "step": 533 }, { "entropy": 0.5337765663862228, "epoch": 1.994392523364486, "grad_norm": 0.03723280131816864, "learning_rate": 0.0002, "loss": 0.5434689521789551, "mean_token_accuracy": 0.7792166471481323, "num_tokens": 8706774.0, "step": 534 }, { "entropy": 0.5302833914756775, "epoch": 1.9981308411214953, "grad_norm": 0.03789842873811722, "learning_rate": 0.0002, "loss": 0.5390503406524658, "mean_token_accuracy": 0.7825159579515457, "num_tokens": 8722988.0, "step": 535 }, { "entropy": 0.5365387499332428, "epoch": 2.0, "grad_norm": 0.03994116187095642, "learning_rate": 0.0002, "loss": 0.5442785024642944, "mean_token_accuracy": 0.779285341501236, "num_tokens": 8731086.0, "step": 536 }, { "entropy": 0.5551358312368393, "epoch": 2.0037383177570094, "grad_norm": 0.03304925188422203, "learning_rate": 0.0002, "loss": 0.5366768836975098, "mean_token_accuracy": 0.7850453853607178, "num_tokens": 8747251.0, "step": 537 }, { "entropy": 0.5637228041887283, "epoch": 2.007476635514019, "grad_norm": 0.03504426032304764, "learning_rate": 0.0002, "loss": 0.5443665981292725, "mean_token_accuracy": 0.7774000763893127, "num_tokens": 8763427.0, "step": 538 }, { "entropy": 0.5427139699459076, "epoch": 2.011214953271028, "grad_norm": 0.03504855930805206, "learning_rate": 0.0002, "loss": 0.5313124656677246, "mean_token_accuracy": 0.7818376272916794, "num_tokens": 8779836.0, "step": 539 }, { "entropy": 0.5330108106136322, "epoch": 2.0149532710280376, "grad_norm": 0.03754406422376633, "learning_rate": 0.0002, "loss": 0.5421642661094666, "mean_token_accuracy": 0.7790561318397522, "num_tokens": 8796325.0, "step": 540 }, { "entropy": 0.512071430683136, "epoch": 2.0186915887850465, "grad_norm": 0.043662529438734055, "learning_rate": 0.0002, "loss": 0.5302350521087646, "mean_token_accuracy": 0.7863733917474747, "num_tokens": 8812606.0, "step": 541 }, { "entropy": 0.5129958391189575, "epoch": 2.022429906542056, "grad_norm": 0.04149031639099121, "learning_rate": 0.0002, "loss": 0.5309258699417114, "mean_token_accuracy": 0.7860198318958282, "num_tokens": 8828882.0, "step": 542 }, { "entropy": 0.5420234501361847, "epoch": 2.0261682242990653, "grad_norm": 0.03192834183573723, "learning_rate": 0.0002, "loss": 0.5397300124168396, "mean_token_accuracy": 0.7826980352401733, "num_tokens": 8845360.0, "step": 543 }, { "entropy": 0.5496412217617035, "epoch": 2.0299065420560747, "grad_norm": 0.03798922896385193, "learning_rate": 0.0002, "loss": 0.5328091979026794, "mean_token_accuracy": 0.7848182171583176, "num_tokens": 8861741.0, "step": 544 }, { "entropy": 0.5499916076660156, "epoch": 2.033644859813084, "grad_norm": 0.03497615084052086, "learning_rate": 0.0002, "loss": 0.5330801010131836, "mean_token_accuracy": 0.7823185920715332, "num_tokens": 8878099.0, "step": 545 }, { "entropy": 0.5397230982780457, "epoch": 2.0373831775700935, "grad_norm": 0.03805805742740631, "learning_rate": 0.0002, "loss": 0.5325009822845459, "mean_token_accuracy": 0.7835113406181335, "num_tokens": 8894613.0, "step": 546 }, { "entropy": 0.5198622792959213, "epoch": 2.041121495327103, "grad_norm": 0.03364388644695282, "learning_rate": 0.0002, "loss": 0.5222806334495544, "mean_token_accuracy": 0.7844293862581253, "num_tokens": 8910849.0, "step": 547 }, { "entropy": 0.5255338400602341, "epoch": 2.0448598130841122, "grad_norm": 0.047903481870889664, "learning_rate": 0.0002, "loss": 0.5388204455375671, "mean_token_accuracy": 0.7818868011236191, "num_tokens": 8927305.0, "step": 548 }, { "entropy": 0.5240660309791565, "epoch": 2.0485981308411216, "grad_norm": 0.04678136110305786, "learning_rate": 0.0002, "loss": 0.544981062412262, "mean_token_accuracy": 0.7767013013362885, "num_tokens": 8943628.0, "step": 549 }, { "entropy": 0.5418435484170914, "epoch": 2.052336448598131, "grad_norm": 0.04154983535408974, "learning_rate": 0.0002, "loss": 0.5431923866271973, "mean_token_accuracy": 0.7803478538990021, "num_tokens": 8959739.0, "step": 550 }, { "entropy": 0.5464048683643341, "epoch": 2.05607476635514, "grad_norm": 0.03621891885995865, "learning_rate": 0.0002, "loss": 0.5369123220443726, "mean_token_accuracy": 0.7831740379333496, "num_tokens": 8975834.0, "step": 551 }, { "entropy": 0.5625316351652145, "epoch": 2.0598130841121494, "grad_norm": 0.04116278514266014, "learning_rate": 0.0002, "loss": 0.5496330261230469, "mean_token_accuracy": 0.7770462930202484, "num_tokens": 8992265.0, "step": 552 }, { "entropy": 0.5488497316837311, "epoch": 2.0635514018691588, "grad_norm": 0.03322463855147362, "learning_rate": 0.0002, "loss": 0.5367662310600281, "mean_token_accuracy": 0.7818718105554581, "num_tokens": 9008719.0, "step": 553 }, { "entropy": 0.5378982275724411, "epoch": 2.067289719626168, "grad_norm": 0.034129269421100616, "learning_rate": 0.0002, "loss": 0.5418792963027954, "mean_token_accuracy": 0.7807257324457169, "num_tokens": 9025151.0, "step": 554 }, { "entropy": 0.5220974087715149, "epoch": 2.0710280373831775, "grad_norm": 0.045197054743766785, "learning_rate": 0.0002, "loss": 0.5300080180168152, "mean_token_accuracy": 0.7885446846485138, "num_tokens": 9041486.0, "step": 555 }, { "entropy": 0.515913613140583, "epoch": 2.074766355140187, "grad_norm": 0.04399452358484268, "learning_rate": 0.0002, "loss": 0.5253356099128723, "mean_token_accuracy": 0.787113681435585, "num_tokens": 9057792.0, "step": 556 }, { "entropy": 0.529649943113327, "epoch": 2.0785046728971963, "grad_norm": 0.0405830517411232, "learning_rate": 0.0002, "loss": 0.5332399010658264, "mean_token_accuracy": 0.7825795114040375, "num_tokens": 9073971.0, "step": 557 }, { "entropy": 0.5306390672922134, "epoch": 2.0822429906542057, "grad_norm": 0.04040224850177765, "learning_rate": 0.0002, "loss": 0.5270552039146423, "mean_token_accuracy": 0.7854219824075699, "num_tokens": 9090396.0, "step": 558 }, { "entropy": 0.540916696190834, "epoch": 2.085981308411215, "grad_norm": 0.039850566536188126, "learning_rate": 0.0002, "loss": 0.5330172181129456, "mean_token_accuracy": 0.7840156704187393, "num_tokens": 9106865.0, "step": 559 }, { "entropy": 0.5573539286851883, "epoch": 2.0897196261682245, "grad_norm": 0.039134591817855835, "learning_rate": 0.0002, "loss": 0.5492205023765564, "mean_token_accuracy": 0.7779581248760223, "num_tokens": 9123213.0, "step": 560 }, { "entropy": 0.5308785140514374, "epoch": 2.0934579439252334, "grad_norm": 0.033643938601017, "learning_rate": 0.0002, "loss": 0.5260533690452576, "mean_token_accuracy": 0.7881509810686111, "num_tokens": 9139334.0, "step": 561 }, { "entropy": 0.5462942272424698, "epoch": 2.097196261682243, "grad_norm": 0.0343049094080925, "learning_rate": 0.0002, "loss": 0.5453207492828369, "mean_token_accuracy": 0.7791396528482437, "num_tokens": 9155964.0, "step": 562 }, { "entropy": 0.5272018313407898, "epoch": 2.100934579439252, "grad_norm": 0.040583785623311996, "learning_rate": 0.0002, "loss": 0.5357244610786438, "mean_token_accuracy": 0.7829957753419876, "num_tokens": 9172409.0, "step": 563 }, { "entropy": 0.5276166945695877, "epoch": 2.1046728971962616, "grad_norm": 0.03636649623513222, "learning_rate": 0.0002, "loss": 0.5361207127571106, "mean_token_accuracy": 0.7831525951623917, "num_tokens": 9188524.0, "step": 564 }, { "entropy": 0.5464211106300354, "epoch": 2.108411214953271, "grad_norm": 0.0365222692489624, "learning_rate": 0.0002, "loss": 0.5448060035705566, "mean_token_accuracy": 0.7774559408426285, "num_tokens": 9204803.0, "step": 565 }, { "entropy": 0.5368735194206238, "epoch": 2.1121495327102804, "grad_norm": 0.04034702479839325, "learning_rate": 0.0002, "loss": 0.5308568477630615, "mean_token_accuracy": 0.784459188580513, "num_tokens": 9220931.0, "step": 566 }, { "entropy": 0.5340090990066528, "epoch": 2.1158878504672898, "grad_norm": 0.03558754175901413, "learning_rate": 0.0002, "loss": 0.5307760238647461, "mean_token_accuracy": 0.7841941863298416, "num_tokens": 9237402.0, "step": 567 }, { "entropy": 0.554409846663475, "epoch": 2.119626168224299, "grad_norm": 0.038797035813331604, "learning_rate": 0.0002, "loss": 0.5491658449172974, "mean_token_accuracy": 0.7782745659351349, "num_tokens": 9254002.0, "step": 568 }, { "entropy": 0.546349972486496, "epoch": 2.1233644859813086, "grad_norm": 0.04194206744432449, "learning_rate": 0.0002, "loss": 0.5519090294837952, "mean_token_accuracy": 0.7750387489795685, "num_tokens": 9270313.0, "step": 569 }, { "entropy": 0.5365971177816391, "epoch": 2.127102803738318, "grad_norm": 0.045358605682849884, "learning_rate": 0.0002, "loss": 0.5437461733818054, "mean_token_accuracy": 0.7794076204299927, "num_tokens": 9286712.0, "step": 570 }, { "entropy": 0.5360657125711441, "epoch": 2.130841121495327, "grad_norm": 0.04332416132092476, "learning_rate": 0.0002, "loss": 0.5378158688545227, "mean_token_accuracy": 0.7812185734510422, "num_tokens": 9302929.0, "step": 571 }, { "entropy": 0.5161439999938011, "epoch": 2.1345794392523363, "grad_norm": 0.03498893231153488, "learning_rate": 0.0002, "loss": 0.5166691541671753, "mean_token_accuracy": 0.7898645251989365, "num_tokens": 9318970.0, "step": 572 }, { "entropy": 0.5420155078172684, "epoch": 2.1383177570093457, "grad_norm": 0.059223148971796036, "learning_rate": 0.0002, "loss": 0.5398759841918945, "mean_token_accuracy": 0.7814654260873795, "num_tokens": 9335490.0, "step": 573 }, { "entropy": 0.5263395309448242, "epoch": 2.142056074766355, "grad_norm": 0.03245805576443672, "learning_rate": 0.0002, "loss": 0.5229323506355286, "mean_token_accuracy": 0.7877913564443588, "num_tokens": 9351959.0, "step": 574 }, { "entropy": 0.5362307131290436, "epoch": 2.1457943925233645, "grad_norm": 0.037454549223184586, "learning_rate": 0.0002, "loss": 0.5291175246238708, "mean_token_accuracy": 0.783667266368866, "num_tokens": 9368360.0, "step": 575 }, { "entropy": 0.527548685669899, "epoch": 2.149532710280374, "grad_norm": 0.043125126510858536, "learning_rate": 0.0002, "loss": 0.5279426574707031, "mean_token_accuracy": 0.7838954478502274, "num_tokens": 9384665.0, "step": 576 }, { "entropy": 0.543443351984024, "epoch": 2.1532710280373832, "grad_norm": 0.03840547800064087, "learning_rate": 0.0002, "loss": 0.5481908321380615, "mean_token_accuracy": 0.7762167900800705, "num_tokens": 9400994.0, "step": 577 }, { "entropy": 0.5402033478021622, "epoch": 2.1570093457943926, "grad_norm": 0.04524662345647812, "learning_rate": 0.0002, "loss": 0.5483248829841614, "mean_token_accuracy": 0.7753354609012604, "num_tokens": 9417287.0, "step": 578 }, { "entropy": 0.5183399319648743, "epoch": 2.160747663551402, "grad_norm": 0.033803943544626236, "learning_rate": 0.0002, "loss": 0.5152841210365295, "mean_token_accuracy": 0.7872842252254486, "num_tokens": 9433683.0, "step": 579 }, { "entropy": 0.5163632705807686, "epoch": 2.1644859813084114, "grad_norm": 0.036510877311229706, "learning_rate": 0.0002, "loss": 0.5149884223937988, "mean_token_accuracy": 0.7905207723379135, "num_tokens": 9450137.0, "step": 580 }, { "entropy": 0.5321061164140701, "epoch": 2.1682242990654204, "grad_norm": 0.0464416965842247, "learning_rate": 0.0002, "loss": 0.5351567268371582, "mean_token_accuracy": 0.7838670462369919, "num_tokens": 9466550.0, "step": 581 }, { "entropy": 0.5199630409479141, "epoch": 2.1719626168224297, "grad_norm": 0.04309747740626335, "learning_rate": 0.0002, "loss": 0.5278782844543457, "mean_token_accuracy": 0.7839005291461945, "num_tokens": 9482588.0, "step": 582 }, { "entropy": 0.5339600071310997, "epoch": 2.175700934579439, "grad_norm": 0.04095384106040001, "learning_rate": 0.0002, "loss": 0.5310637354850769, "mean_token_accuracy": 0.783690795302391, "num_tokens": 9498951.0, "step": 583 }, { "entropy": 0.5384320765733719, "epoch": 2.1794392523364485, "grad_norm": 0.03863927349448204, "learning_rate": 0.0002, "loss": 0.540824294090271, "mean_token_accuracy": 0.7791530042886734, "num_tokens": 9515132.0, "step": 584 }, { "entropy": 0.5549707859754562, "epoch": 2.183177570093458, "grad_norm": 0.03921306133270264, "learning_rate": 0.0002, "loss": 0.5536147356033325, "mean_token_accuracy": 0.7751126140356064, "num_tokens": 9531512.0, "step": 585 }, { "entropy": 0.5347359776496887, "epoch": 2.1869158878504673, "grad_norm": 0.037864800542593, "learning_rate": 0.0002, "loss": 0.5341432094573975, "mean_token_accuracy": 0.7835363298654556, "num_tokens": 9547534.0, "step": 586 }, { "entropy": 0.5516605377197266, "epoch": 2.1906542056074767, "grad_norm": 0.036846909672021866, "learning_rate": 0.0002, "loss": 0.5443211197853088, "mean_token_accuracy": 0.7788311243057251, "num_tokens": 9564040.0, "step": 587 }, { "entropy": 0.5391202419996262, "epoch": 2.194392523364486, "grad_norm": 0.03954128175973892, "learning_rate": 0.0002, "loss": 0.5309199094772339, "mean_token_accuracy": 0.783383384346962, "num_tokens": 9580289.0, "step": 588 }, { "entropy": 0.5318265110254288, "epoch": 2.1981308411214955, "grad_norm": 0.03327268362045288, "learning_rate": 0.0002, "loss": 0.5330622792243958, "mean_token_accuracy": 0.7819591611623764, "num_tokens": 9596500.0, "step": 589 }, { "entropy": 0.5139677748084068, "epoch": 2.201869158878505, "grad_norm": 0.039606738835573196, "learning_rate": 0.0002, "loss": 0.520559549331665, "mean_token_accuracy": 0.7877521514892578, "num_tokens": 9612675.0, "step": 590 }, { "entropy": 0.5283454358577728, "epoch": 2.205607476635514, "grad_norm": 0.03826924040913582, "learning_rate": 0.0002, "loss": 0.5321468710899353, "mean_token_accuracy": 0.7843296527862549, "num_tokens": 9629044.0, "step": 591 }, { "entropy": 0.5257805287837982, "epoch": 2.209345794392523, "grad_norm": 0.04099821671843529, "learning_rate": 0.0002, "loss": 0.5277660489082336, "mean_token_accuracy": 0.7833193689584732, "num_tokens": 9645271.0, "step": 592 }, { "entropy": 0.5350408107042313, "epoch": 2.2130841121495326, "grad_norm": 0.038267582654953, "learning_rate": 0.0002, "loss": 0.5255724787712097, "mean_token_accuracy": 0.7867475599050522, "num_tokens": 9661448.0, "step": 593 }, { "entropy": 0.5472716838121414, "epoch": 2.216822429906542, "grad_norm": 0.03405248373746872, "learning_rate": 0.0002, "loss": 0.5390135645866394, "mean_token_accuracy": 0.779327467083931, "num_tokens": 9677824.0, "step": 594 }, { "entropy": 0.5421159714460373, "epoch": 2.2205607476635514, "grad_norm": 0.041895944625139236, "learning_rate": 0.0002, "loss": 0.5395660400390625, "mean_token_accuracy": 0.7796223610639572, "num_tokens": 9694305.0, "step": 595 }, { "entropy": 0.5459330081939697, "epoch": 2.2242990654205608, "grad_norm": 0.036602918058633804, "learning_rate": 0.0002, "loss": 0.5457043647766113, "mean_token_accuracy": 0.7810876667499542, "num_tokens": 9710852.0, "step": 596 }, { "entropy": 0.5278807803988457, "epoch": 2.22803738317757, "grad_norm": 0.04418497160077095, "learning_rate": 0.0002, "loss": 0.5371560454368591, "mean_token_accuracy": 0.7824568003416061, "num_tokens": 9727075.0, "step": 597 }, { "entropy": 0.5311697870492935, "epoch": 2.2317757009345796, "grad_norm": 0.043200667947530746, "learning_rate": 0.0002, "loss": 0.5364136695861816, "mean_token_accuracy": 0.783041849732399, "num_tokens": 9743306.0, "step": 598 }, { "entropy": 0.5302419811487198, "epoch": 2.235514018691589, "grad_norm": 0.037720005959272385, "learning_rate": 0.0002, "loss": 0.5262041091918945, "mean_token_accuracy": 0.7870023250579834, "num_tokens": 9759403.0, "step": 599 }, { "entropy": 0.5483334362506866, "epoch": 2.2392523364485983, "grad_norm": 0.03560694679617882, "learning_rate": 0.0002, "loss": 0.5467509627342224, "mean_token_accuracy": 0.779225081205368, "num_tokens": 9775738.0, "step": 600 }, { "entropy": 0.5375639796257019, "epoch": 2.2429906542056073, "grad_norm": 0.03993435204029083, "learning_rate": 0.0002, "loss": 0.5336683988571167, "mean_token_accuracy": 0.7839321345090866, "num_tokens": 9792043.0, "step": 601 }, { "entropy": 0.544166311621666, "epoch": 2.2467289719626167, "grad_norm": 0.03602972254157066, "learning_rate": 0.0002, "loss": 0.5403839945793152, "mean_token_accuracy": 0.7812667638063431, "num_tokens": 9808431.0, "step": 602 }, { "entropy": 0.5295002460479736, "epoch": 2.250467289719626, "grad_norm": 0.041549984365701675, "learning_rate": 0.0002, "loss": 0.5339419841766357, "mean_token_accuracy": 0.7843643128871918, "num_tokens": 9824744.0, "step": 603 }, { "entropy": 0.5211731493473053, "epoch": 2.2542056074766355, "grad_norm": 0.04408840090036392, "learning_rate": 0.0002, "loss": 0.5288305878639221, "mean_token_accuracy": 0.7842673063278198, "num_tokens": 9841081.0, "step": 604 }, { "entropy": 0.5425246208906174, "epoch": 2.257943925233645, "grad_norm": 0.04026458412408829, "learning_rate": 0.0002, "loss": 0.5444083213806152, "mean_token_accuracy": 0.7781710475683212, "num_tokens": 9857545.0, "step": 605 }, { "entropy": 0.5519444048404694, "epoch": 2.2616822429906542, "grad_norm": 0.03973834961652756, "learning_rate": 0.0002, "loss": 0.547622799873352, "mean_token_accuracy": 0.7769842147827148, "num_tokens": 9873925.0, "step": 606 }, { "entropy": 0.5228262096643448, "epoch": 2.2654205607476636, "grad_norm": 0.041971541941165924, "learning_rate": 0.0002, "loss": 0.5222245454788208, "mean_token_accuracy": 0.7858153134584427, "num_tokens": 9890052.0, "step": 607 }, { "entropy": 0.5335221141576767, "epoch": 2.269158878504673, "grad_norm": 0.039673078805208206, "learning_rate": 0.0002, "loss": 0.5314098000526428, "mean_token_accuracy": 0.7840564250946045, "num_tokens": 9906259.0, "step": 608 }, { "entropy": 0.5426364839076996, "epoch": 2.2728971962616824, "grad_norm": 0.04128013923764229, "learning_rate": 0.0002, "loss": 0.5407010316848755, "mean_token_accuracy": 0.7802868187427521, "num_tokens": 9922434.0, "step": 609 }, { "entropy": 0.5306970030069351, "epoch": 2.2766355140186914, "grad_norm": 0.03684001415967941, "learning_rate": 0.0002, "loss": 0.5325096845626831, "mean_token_accuracy": 0.7816676050424576, "num_tokens": 9938715.0, "step": 610 }, { "entropy": 0.5312017947435379, "epoch": 2.2803738317757007, "grad_norm": 0.0396246500313282, "learning_rate": 0.0002, "loss": 0.5326136350631714, "mean_token_accuracy": 0.7833829969167709, "num_tokens": 9954795.0, "step": 611 }, { "entropy": 0.5242188572883606, "epoch": 2.28411214953271, "grad_norm": 0.03666768968105316, "learning_rate": 0.0002, "loss": 0.5254257321357727, "mean_token_accuracy": 0.785698264837265, "num_tokens": 9970976.0, "step": 612 }, { "entropy": 0.5251396894454956, "epoch": 2.2878504672897195, "grad_norm": 0.041744161397218704, "learning_rate": 0.0002, "loss": 0.5361155867576599, "mean_token_accuracy": 0.781558558344841, "num_tokens": 9987242.0, "step": 613 }, { "entropy": 0.5212117433547974, "epoch": 2.291588785046729, "grad_norm": 0.044306471943855286, "learning_rate": 0.0002, "loss": 0.5255172252655029, "mean_token_accuracy": 0.7819651514291763, "num_tokens": 10003383.0, "step": 614 }, { "entropy": 0.5342397391796112, "epoch": 2.2953271028037383, "grad_norm": 0.04804427549242973, "learning_rate": 0.0002, "loss": 0.5286440849304199, "mean_token_accuracy": 0.7870652973651886, "num_tokens": 10019705.0, "step": 615 }, { "entropy": 0.5513401627540588, "epoch": 2.2990654205607477, "grad_norm": 0.04101845622062683, "learning_rate": 0.0002, "loss": 0.5483744144439697, "mean_token_accuracy": 0.7755522131919861, "num_tokens": 10035997.0, "step": 616 }, { "entropy": 0.5434563606977463, "epoch": 2.302803738317757, "grad_norm": 0.036619942635297775, "learning_rate": 0.0002, "loss": 0.5326208472251892, "mean_token_accuracy": 0.782253697514534, "num_tokens": 10052253.0, "step": 617 }, { "entropy": 0.5315294414758682, "epoch": 2.3065420560747665, "grad_norm": 0.037794552743434906, "learning_rate": 0.0002, "loss": 0.5253270864486694, "mean_token_accuracy": 0.7854621708393097, "num_tokens": 10068502.0, "step": 618 }, { "entropy": 0.5264740660786629, "epoch": 2.310280373831776, "grad_norm": 0.05285142362117767, "learning_rate": 0.0002, "loss": 0.5347273349761963, "mean_token_accuracy": 0.7845266908407211, "num_tokens": 10084722.0, "step": 619 }, { "entropy": 0.5410954803228378, "epoch": 2.3140186915887853, "grad_norm": 0.036392901092767715, "learning_rate": 0.0002, "loss": 0.5492109060287476, "mean_token_accuracy": 0.775203213095665, "num_tokens": 10101110.0, "step": 620 }, { "entropy": 0.5478453040122986, "epoch": 2.317757009345794, "grad_norm": 0.0461491234600544, "learning_rate": 0.0002, "loss": 0.5482407808303833, "mean_token_accuracy": 0.7783631533384323, "num_tokens": 10117543.0, "step": 621 }, { "entropy": 0.515753298997879, "epoch": 2.3214953271028036, "grad_norm": 0.04075627774000168, "learning_rate": 0.0002, "loss": 0.5150102972984314, "mean_token_accuracy": 0.789474606513977, "num_tokens": 10133572.0, "step": 622 }, { "entropy": 0.5349336713552475, "epoch": 2.325233644859813, "grad_norm": 0.042154040187597275, "learning_rate": 0.0002, "loss": 0.526114821434021, "mean_token_accuracy": 0.7856980115175247, "num_tokens": 10150048.0, "step": 623 }, { "entropy": 0.5674707591533661, "epoch": 2.3289719626168224, "grad_norm": 0.04182770103216171, "learning_rate": 0.0002, "loss": 0.5611693859100342, "mean_token_accuracy": 0.7749929875135422, "num_tokens": 10166642.0, "step": 624 }, { "entropy": 0.5181543081998825, "epoch": 2.3327102803738318, "grad_norm": 0.038145892322063446, "learning_rate": 0.0002, "loss": 0.5206056833267212, "mean_token_accuracy": 0.788123145699501, "num_tokens": 10182897.0, "step": 625 }, { "entropy": 0.5357862561941147, "epoch": 2.336448598130841, "grad_norm": 0.04366487264633179, "learning_rate": 0.0002, "loss": 0.5423003435134888, "mean_token_accuracy": 0.7787369638681412, "num_tokens": 10199311.0, "step": 626 }, { "entropy": 0.5277369916439056, "epoch": 2.3401869158878505, "grad_norm": 0.05174623429775238, "learning_rate": 0.0002, "loss": 0.539736270904541, "mean_token_accuracy": 0.7798131704330444, "num_tokens": 10215707.0, "step": 627 }, { "entropy": 0.5540482401847839, "epoch": 2.34392523364486, "grad_norm": 0.03900719806551933, "learning_rate": 0.0002, "loss": 0.5546514391899109, "mean_token_accuracy": 0.7751745879650116, "num_tokens": 10232233.0, "step": 628 }, { "entropy": 0.5211993083357811, "epoch": 2.3476635514018693, "grad_norm": 0.044696055352687836, "learning_rate": 0.0002, "loss": 0.5210398435592651, "mean_token_accuracy": 0.7867566049098969, "num_tokens": 10248397.0, "step": 629 }, { "entropy": 0.5406811684370041, "epoch": 2.3514018691588783, "grad_norm": 0.04107234627008438, "learning_rate": 0.0002, "loss": 0.5430042147636414, "mean_token_accuracy": 0.7786548435688019, "num_tokens": 10264653.0, "step": 630 }, { "entropy": 0.538291797041893, "epoch": 2.3551401869158877, "grad_norm": 0.03656275197863579, "learning_rate": 0.0002, "loss": 0.534942090511322, "mean_token_accuracy": 0.7826343178749084, "num_tokens": 10280941.0, "step": 631 }, { "entropy": 0.5547115802764893, "epoch": 2.358878504672897, "grad_norm": 0.04424076899886131, "learning_rate": 0.0002, "loss": 0.5602344870567322, "mean_token_accuracy": 0.7771879583597183, "num_tokens": 10297564.0, "step": 632 }, { "entropy": 0.5327815413475037, "epoch": 2.3626168224299064, "grad_norm": 0.04512718692421913, "learning_rate": 0.0002, "loss": 0.529172420501709, "mean_token_accuracy": 0.7825805693864822, "num_tokens": 10313759.0, "step": 633 }, { "entropy": 0.5432299822568893, "epoch": 2.366355140186916, "grad_norm": 0.040462445467710495, "learning_rate": 0.0002, "loss": 0.5389863848686218, "mean_token_accuracy": 0.779638260602951, "num_tokens": 10330290.0, "step": 634 }, { "entropy": 0.5529568791389465, "epoch": 2.3700934579439252, "grad_norm": 0.04414237663149834, "learning_rate": 0.0002, "loss": 0.5526305437088013, "mean_token_accuracy": 0.7754997760057449, "num_tokens": 10346636.0, "step": 635 }, { "entropy": 0.5441652536392212, "epoch": 2.3738317757009346, "grad_norm": 0.037299707531929016, "learning_rate": 0.0002, "loss": 0.5382997393608093, "mean_token_accuracy": 0.7791097015142441, "num_tokens": 10362922.0, "step": 636 }, { "entropy": 0.5348048955202103, "epoch": 2.377570093457944, "grad_norm": 0.0446464829146862, "learning_rate": 0.0002, "loss": 0.5380210876464844, "mean_token_accuracy": 0.7818952798843384, "num_tokens": 10379134.0, "step": 637 }, { "entropy": 0.5187151804566383, "epoch": 2.3813084112149534, "grad_norm": 0.0778694897890091, "learning_rate": 0.0002, "loss": 0.5220566391944885, "mean_token_accuracy": 0.7889348715543747, "num_tokens": 10395255.0, "step": 638 }, { "entropy": 0.5462511032819748, "epoch": 2.385046728971963, "grad_norm": 0.04299847036600113, "learning_rate": 0.0002, "loss": 0.5423526167869568, "mean_token_accuracy": 0.7763472348451614, "num_tokens": 10411644.0, "step": 639 }, { "entropy": 0.5463699400424957, "epoch": 2.388785046728972, "grad_norm": 0.10935911536216736, "learning_rate": 0.0002, "loss": 0.554538369178772, "mean_token_accuracy": 0.7772965431213379, "num_tokens": 10427999.0, "step": 640 }, { "entropy": 0.5152165368199348, "epoch": 2.392523364485981, "grad_norm": 0.03762959688901901, "learning_rate": 0.0002, "loss": 0.508588969707489, "mean_token_accuracy": 0.7926003634929657, "num_tokens": 10444169.0, "step": 641 }, { "entropy": 0.529686912894249, "epoch": 2.3962616822429905, "grad_norm": 0.040958285331726074, "learning_rate": 0.0002, "loss": 0.5307521820068359, "mean_token_accuracy": 0.7849727272987366, "num_tokens": 10460506.0, "step": 642 }, { "entropy": 0.5430792719125748, "epoch": 2.4, "grad_norm": 0.059025488793849945, "learning_rate": 0.0002, "loss": 0.5434512495994568, "mean_token_accuracy": 0.7796961963176727, "num_tokens": 10476852.0, "step": 643 }, { "entropy": 0.5448063015937805, "epoch": 2.4037383177570093, "grad_norm": 0.040974777191877365, "learning_rate": 0.0002, "loss": 0.5473527312278748, "mean_token_accuracy": 0.7792296558618546, "num_tokens": 10493362.0, "step": 644 }, { "entropy": 0.5385838449001312, "epoch": 2.4074766355140187, "grad_norm": 0.03980987146496773, "learning_rate": 0.0002, "loss": 0.5398511290550232, "mean_token_accuracy": 0.7808338552713394, "num_tokens": 10509993.0, "step": 645 }, { "entropy": 0.5397947132587433, "epoch": 2.411214953271028, "grad_norm": 0.04422999173402786, "learning_rate": 0.0002, "loss": 0.5439976453781128, "mean_token_accuracy": 0.7772432416677475, "num_tokens": 10525999.0, "step": 646 }, { "entropy": 0.5487875193357468, "epoch": 2.4149532710280375, "grad_norm": 0.035030197352170944, "learning_rate": 0.0002, "loss": 0.5411213636398315, "mean_token_accuracy": 0.7808128446340561, "num_tokens": 10542385.0, "step": 647 }, { "entropy": 0.5536469519138336, "epoch": 2.418691588785047, "grad_norm": 0.03504094481468201, "learning_rate": 0.0002, "loss": 0.5501288771629333, "mean_token_accuracy": 0.7798037678003311, "num_tokens": 10558968.0, "step": 648 }, { "entropy": 0.542830765247345, "epoch": 2.4224299065420563, "grad_norm": 0.04252900928258896, "learning_rate": 0.0002, "loss": 0.5463917255401611, "mean_token_accuracy": 0.7780060321092606, "num_tokens": 10575204.0, "step": 649 }, { "entropy": 0.5445516556501389, "epoch": 2.426168224299065, "grad_norm": 0.03962906450033188, "learning_rate": 0.0002, "loss": 0.5398474335670471, "mean_token_accuracy": 0.7808130532503128, "num_tokens": 10591758.0, "step": 650 }, { "entropy": 0.5405502319335938, "epoch": 2.4299065420560746, "grad_norm": 0.0443168580532074, "learning_rate": 0.0002, "loss": 0.5365331172943115, "mean_token_accuracy": 0.7831508964300156, "num_tokens": 10608086.0, "step": 651 }, { "entropy": 0.5417730808258057, "epoch": 2.433644859813084, "grad_norm": 0.03887809067964554, "learning_rate": 0.0002, "loss": 0.5410832166671753, "mean_token_accuracy": 0.7785631865262985, "num_tokens": 10624498.0, "step": 652 }, { "entropy": 0.539076067507267, "epoch": 2.4373831775700934, "grad_norm": 0.03908571973443031, "learning_rate": 0.0002, "loss": 0.5387341976165771, "mean_token_accuracy": 0.781864196062088, "num_tokens": 10640880.0, "step": 653 }, { "entropy": 0.5390027314424515, "epoch": 2.4411214953271028, "grad_norm": 0.03712445870041847, "learning_rate": 0.0002, "loss": 0.5360729694366455, "mean_token_accuracy": 0.783073827624321, "num_tokens": 10657400.0, "step": 654 }, { "entropy": 0.5502242594957352, "epoch": 2.444859813084112, "grad_norm": 0.03870626538991928, "learning_rate": 0.0002, "loss": 0.5568853616714478, "mean_token_accuracy": 0.7743858247995377, "num_tokens": 10673826.0, "step": 655 }, { "entropy": 0.525546170771122, "epoch": 2.4485981308411215, "grad_norm": 0.05200404301285744, "learning_rate": 0.0002, "loss": 0.5247287154197693, "mean_token_accuracy": 0.787117063999176, "num_tokens": 10690101.0, "step": 656 }, { "entropy": 0.5489766597747803, "epoch": 2.452336448598131, "grad_norm": 0.03731005638837814, "learning_rate": 0.0002, "loss": 0.5479599833488464, "mean_token_accuracy": 0.7739788293838501, "num_tokens": 10706469.0, "step": 657 }, { "entropy": 0.5457844734191895, "epoch": 2.4560747663551403, "grad_norm": 0.03958994895219803, "learning_rate": 0.0002, "loss": 0.5466060638427734, "mean_token_accuracy": 0.776677593588829, "num_tokens": 10722827.0, "step": 658 }, { "entropy": 0.5301162749528885, "epoch": 2.4598130841121497, "grad_norm": 0.04651971161365509, "learning_rate": 0.0002, "loss": 0.5345625281333923, "mean_token_accuracy": 0.7808788865804672, "num_tokens": 10739136.0, "step": 659 }, { "entropy": 0.5545621961355209, "epoch": 2.463551401869159, "grad_norm": 0.04008018597960472, "learning_rate": 0.0002, "loss": 0.5584450960159302, "mean_token_accuracy": 0.7706544101238251, "num_tokens": 10755369.0, "step": 660 }, { "entropy": 0.5189358592033386, "epoch": 2.467289719626168, "grad_norm": 0.040387995541095734, "learning_rate": 0.0002, "loss": 0.5199939608573914, "mean_token_accuracy": 0.7878802865743637, "num_tokens": 10771408.0, "step": 661 }, { "entropy": 0.5370910465717316, "epoch": 2.4710280373831774, "grad_norm": 0.04395879805088043, "learning_rate": 0.0002, "loss": 0.534496545791626, "mean_token_accuracy": 0.7834903597831726, "num_tokens": 10787604.0, "step": 662 }, { "entropy": 0.5326719284057617, "epoch": 2.474766355140187, "grad_norm": 0.04668545350432396, "learning_rate": 0.0002, "loss": 0.5241788029670715, "mean_token_accuracy": 0.7905293852090836, "num_tokens": 10803945.0, "step": 663 }, { "entropy": 0.5368177741765976, "epoch": 2.4785046728971962, "grad_norm": 0.04925902187824249, "learning_rate": 0.0002, "loss": 0.5367681384086609, "mean_token_accuracy": 0.7809154391288757, "num_tokens": 10820178.0, "step": 664 }, { "entropy": 0.5293789505958557, "epoch": 2.4822429906542056, "grad_norm": 0.041696734726428986, "learning_rate": 0.0002, "loss": 0.5327548980712891, "mean_token_accuracy": 0.7873236238956451, "num_tokens": 10836561.0, "step": 665 }, { "entropy": 0.529408723115921, "epoch": 2.485981308411215, "grad_norm": 0.041212067008018494, "learning_rate": 0.0002, "loss": 0.5328470468521118, "mean_token_accuracy": 0.7832391858100891, "num_tokens": 10852980.0, "step": 666 }, { "entropy": 0.5545576214790344, "epoch": 2.4897196261682244, "grad_norm": 0.04478580132126808, "learning_rate": 0.0002, "loss": 0.5554249286651611, "mean_token_accuracy": 0.7741198241710663, "num_tokens": 10869321.0, "step": 667 }, { "entropy": 0.5539140552282333, "epoch": 2.493457943925234, "grad_norm": 0.04277152568101883, "learning_rate": 0.0002, "loss": 0.5493362545967102, "mean_token_accuracy": 0.7759024202823639, "num_tokens": 10885666.0, "step": 668 }, { "entropy": 0.5433756709098816, "epoch": 2.497196261682243, "grad_norm": 0.04360437020659447, "learning_rate": 0.0002, "loss": 0.5412634611129761, "mean_token_accuracy": 0.7808667570352554, "num_tokens": 10901903.0, "step": 669 }, { "entropy": 0.5487286895513535, "epoch": 2.500934579439252, "grad_norm": 0.03885580971837044, "learning_rate": 0.0002, "loss": 0.5431787371635437, "mean_token_accuracy": 0.7802725732326508, "num_tokens": 10918340.0, "step": 670 }, { "entropy": 0.5228707492351532, "epoch": 2.5046728971962615, "grad_norm": 0.053798187524080276, "learning_rate": 0.0002, "loss": 0.5311392545700073, "mean_token_accuracy": 0.7843292206525803, "num_tokens": 10934469.0, "step": 671 }, { "entropy": 0.5447903871536255, "epoch": 2.508411214953271, "grad_norm": 0.05324989929795265, "learning_rate": 0.0002, "loss": 0.5491751432418823, "mean_token_accuracy": 0.7752528339624405, "num_tokens": 10950837.0, "step": 672 }, { "entropy": 0.5308417528867722, "epoch": 2.5121495327102803, "grad_norm": 0.06228797510266304, "learning_rate": 0.0002, "loss": 0.5361084938049316, "mean_token_accuracy": 0.7828515321016312, "num_tokens": 10967098.0, "step": 673 }, { "entropy": 0.5403530299663544, "epoch": 2.5158878504672897, "grad_norm": 0.051257163286209106, "learning_rate": 0.0002, "loss": 0.542191207408905, "mean_token_accuracy": 0.7825300693511963, "num_tokens": 10983262.0, "step": 674 }, { "entropy": 0.5413467437028885, "epoch": 2.519626168224299, "grad_norm": 0.04910978302359581, "learning_rate": 0.0002, "loss": 0.5313704013824463, "mean_token_accuracy": 0.7851869165897369, "num_tokens": 10999552.0, "step": 675 }, { "entropy": 0.55167156457901, "epoch": 2.5233644859813085, "grad_norm": 0.033519063144922256, "learning_rate": 0.0002, "loss": 0.5438812971115112, "mean_token_accuracy": 0.7780154794454575, "num_tokens": 11016044.0, "step": 676 }, { "entropy": 0.5392196476459503, "epoch": 2.527102803738318, "grad_norm": 0.04278670251369476, "learning_rate": 0.0002, "loss": 0.5411216020584106, "mean_token_accuracy": 0.780839130282402, "num_tokens": 11032377.0, "step": 677 }, { "entropy": 0.5352826565504074, "epoch": 2.5308411214953273, "grad_norm": 0.04736237972974777, "learning_rate": 0.0002, "loss": 0.5446096658706665, "mean_token_accuracy": 0.7806870341300964, "num_tokens": 11048727.0, "step": 678 }, { "entropy": 0.5168470665812492, "epoch": 2.5345794392523366, "grad_norm": 0.03513955697417259, "learning_rate": 0.0002, "loss": 0.5200102925300598, "mean_token_accuracy": 0.7874528765678406, "num_tokens": 11064947.0, "step": 679 }, { "entropy": 0.5375211834907532, "epoch": 2.538317757009346, "grad_norm": 0.04709267243742943, "learning_rate": 0.0002, "loss": 0.5393041968345642, "mean_token_accuracy": 0.7837181091308594, "num_tokens": 11081532.0, "step": 680 }, { "entropy": 0.5512478798627853, "epoch": 2.542056074766355, "grad_norm": 0.04090959206223488, "learning_rate": 0.0002, "loss": 0.546190619468689, "mean_token_accuracy": 0.7762559801340103, "num_tokens": 11098073.0, "step": 681 }, { "entropy": 0.5283504128456116, "epoch": 2.5457943925233644, "grad_norm": 0.036959145218133926, "learning_rate": 0.0002, "loss": 0.5237979292869568, "mean_token_accuracy": 0.7874845713376999, "num_tokens": 11114315.0, "step": 682 }, { "entropy": 0.5489681363105774, "epoch": 2.5495327102803738, "grad_norm": 0.04488472267985344, "learning_rate": 0.0002, "loss": 0.5456336736679077, "mean_token_accuracy": 0.7797751575708389, "num_tokens": 11130665.0, "step": 683 }, { "entropy": 0.5317860543727875, "epoch": 2.553271028037383, "grad_norm": 0.04248347505927086, "learning_rate": 0.0002, "loss": 0.5382874011993408, "mean_token_accuracy": 0.77965147793293, "num_tokens": 11146874.0, "step": 684 }, { "entropy": 0.5419623553752899, "epoch": 2.5570093457943925, "grad_norm": 0.04522377625107765, "learning_rate": 0.0002, "loss": 0.5449318289756775, "mean_token_accuracy": 0.7786058634519577, "num_tokens": 11163427.0, "step": 685 }, { "entropy": 0.5241860747337341, "epoch": 2.560747663551402, "grad_norm": 0.04621601849794388, "learning_rate": 0.0002, "loss": 0.5267641544342041, "mean_token_accuracy": 0.7829258441925049, "num_tokens": 11179801.0, "step": 686 }, { "entropy": 0.5173597782850266, "epoch": 2.5644859813084113, "grad_norm": 0.043366726487874985, "learning_rate": 0.0002, "loss": 0.5181450843811035, "mean_token_accuracy": 0.7898700088262558, "num_tokens": 11196083.0, "step": 687 }, { "entropy": 0.538482740521431, "epoch": 2.5682242990654207, "grad_norm": 0.04418179765343666, "learning_rate": 0.0002, "loss": 0.5392533540725708, "mean_token_accuracy": 0.778387576341629, "num_tokens": 11212295.0, "step": 688 }, { "entropy": 0.540611207485199, "epoch": 2.5719626168224297, "grad_norm": 0.05271269753575325, "learning_rate": 0.0002, "loss": 0.5393270254135132, "mean_token_accuracy": 0.7812009155750275, "num_tokens": 11228565.0, "step": 689 }, { "entropy": 0.5282483994960785, "epoch": 2.575700934579439, "grad_norm": 0.04314183071255684, "learning_rate": 0.0002, "loss": 0.5224794149398804, "mean_token_accuracy": 0.7856594175100327, "num_tokens": 11244953.0, "step": 690 }, { "entropy": 0.5318177044391632, "epoch": 2.5794392523364484, "grad_norm": 0.05587287247180939, "learning_rate": 0.0002, "loss": 0.5358354449272156, "mean_token_accuracy": 0.7822671979665756, "num_tokens": 11261194.0, "step": 691 }, { "entropy": 0.5375986397266388, "epoch": 2.583177570093458, "grad_norm": 0.043386682868003845, "learning_rate": 0.0002, "loss": 0.5412317514419556, "mean_token_accuracy": 0.781296119093895, "num_tokens": 11277286.0, "step": 692 }, { "entropy": 0.5498186945915222, "epoch": 2.586915887850467, "grad_norm": 0.04709560051560402, "learning_rate": 0.0002, "loss": 0.5513982176780701, "mean_token_accuracy": 0.7768333256244659, "num_tokens": 11293799.0, "step": 693 }, { "entropy": 0.5409555584192276, "epoch": 2.5906542056074766, "grad_norm": 0.04518339782953262, "learning_rate": 0.0002, "loss": 0.5396868586540222, "mean_token_accuracy": 0.7791042476892471, "num_tokens": 11310089.0, "step": 694 }, { "entropy": 0.5236431509256363, "epoch": 2.594392523364486, "grad_norm": 0.03244040906429291, "learning_rate": 0.0002, "loss": 0.5155695676803589, "mean_token_accuracy": 0.7898247241973877, "num_tokens": 11326515.0, "step": 695 }, { "entropy": 0.5529845803976059, "epoch": 2.5981308411214954, "grad_norm": 0.04760007932782173, "learning_rate": 0.0002, "loss": 0.5487071871757507, "mean_token_accuracy": 0.7782804220914841, "num_tokens": 11342994.0, "step": 696 }, { "entropy": 0.5314944535493851, "epoch": 2.601869158878505, "grad_norm": 0.0422595851123333, "learning_rate": 0.0002, "loss": 0.5344254970550537, "mean_token_accuracy": 0.7827649861574173, "num_tokens": 11359320.0, "step": 697 }, { "entropy": 0.5296527296304703, "epoch": 2.605607476635514, "grad_norm": 0.04541509971022606, "learning_rate": 0.0002, "loss": 0.5399951338768005, "mean_token_accuracy": 0.7812868803739548, "num_tokens": 11375866.0, "step": 698 }, { "entropy": 0.5503706336021423, "epoch": 2.6093457943925236, "grad_norm": 0.04639806970953941, "learning_rate": 0.0002, "loss": 0.560705304145813, "mean_token_accuracy": 0.7734115719795227, "num_tokens": 11392189.0, "step": 699 }, { "entropy": 0.5334575325250626, "epoch": 2.613084112149533, "grad_norm": 0.03491205349564552, "learning_rate": 0.0002, "loss": 0.5285266637802124, "mean_token_accuracy": 0.786865234375, "num_tokens": 11408320.0, "step": 700 }, { "entropy": 0.5375584214925766, "epoch": 2.616822429906542, "grad_norm": 0.03665752336382866, "learning_rate": 0.0002, "loss": 0.5285854935646057, "mean_token_accuracy": 0.7843970507383347, "num_tokens": 11424696.0, "step": 701 }, { "entropy": 0.5432839095592499, "epoch": 2.6205607476635513, "grad_norm": 0.040845148265361786, "learning_rate": 0.0002, "loss": 0.5354432463645935, "mean_token_accuracy": 0.7819717228412628, "num_tokens": 11440921.0, "step": 702 }, { "entropy": 0.5447598993778229, "epoch": 2.6242990654205607, "grad_norm": 0.03317207470536232, "learning_rate": 0.0002, "loss": 0.5364579558372498, "mean_token_accuracy": 0.7815430164337158, "num_tokens": 11457136.0, "step": 703 }, { "entropy": 0.5318229794502258, "epoch": 2.62803738317757, "grad_norm": 0.04842844605445862, "learning_rate": 0.0002, "loss": 0.5381250381469727, "mean_token_accuracy": 0.7842467576265335, "num_tokens": 11473451.0, "step": 704 }, { "entropy": 0.53319051861763, "epoch": 2.6317757009345795, "grad_norm": 0.04995809122920036, "learning_rate": 0.0002, "loss": 0.5435810089111328, "mean_token_accuracy": 0.7806897163391113, "num_tokens": 11489778.0, "step": 705 }, { "entropy": 0.5205372422933578, "epoch": 2.635514018691589, "grad_norm": 0.043053507804870605, "learning_rate": 0.0002, "loss": 0.5225018858909607, "mean_token_accuracy": 0.7891059070825577, "num_tokens": 11506150.0, "step": 706 }, { "entropy": 0.5405721217393875, "epoch": 2.6392523364485982, "grad_norm": 0.047551702708005905, "learning_rate": 0.0002, "loss": 0.5341666340827942, "mean_token_accuracy": 0.7827833145856857, "num_tokens": 11522269.0, "step": 707 }, { "entropy": 0.555420309305191, "epoch": 2.6429906542056076, "grad_norm": 0.04240434989333153, "learning_rate": 0.0002, "loss": 0.5463941097259521, "mean_token_accuracy": 0.776122510433197, "num_tokens": 11538672.0, "step": 708 }, { "entropy": 0.5373465269804001, "epoch": 2.6467289719626166, "grad_norm": 0.04053036868572235, "learning_rate": 0.0002, "loss": 0.5378127694129944, "mean_token_accuracy": 0.7802188992500305, "num_tokens": 11554872.0, "step": 709 }, { "entropy": 0.554849311709404, "epoch": 2.650467289719626, "grad_norm": 0.03659540414810181, "learning_rate": 0.0002, "loss": 0.5495964288711548, "mean_token_accuracy": 0.7751747816801071, "num_tokens": 11571048.0, "step": 710 }, { "entropy": 0.5463902503252029, "epoch": 2.6542056074766354, "grad_norm": 0.04418041929602623, "learning_rate": 0.0002, "loss": 0.5471721887588501, "mean_token_accuracy": 0.7752395421266556, "num_tokens": 11587320.0, "step": 711 }, { "entropy": 0.5346667915582657, "epoch": 2.6579439252336448, "grad_norm": 0.03727971389889717, "learning_rate": 0.0002, "loss": 0.5335649847984314, "mean_token_accuracy": 0.7821184396743774, "num_tokens": 11603606.0, "step": 712 }, { "entropy": 0.5425343364477158, "epoch": 2.661682242990654, "grad_norm": 0.03725122660398483, "learning_rate": 0.0002, "loss": 0.5478883385658264, "mean_token_accuracy": 0.7786499708890915, "num_tokens": 11619898.0, "step": 713 }, { "entropy": 0.5213692635297775, "epoch": 2.6654205607476635, "grad_norm": 0.042857397347688675, "learning_rate": 0.0002, "loss": 0.5380342602729797, "mean_token_accuracy": 0.7818091064691544, "num_tokens": 11636325.0, "step": 714 }, { "entropy": 0.514741487801075, "epoch": 2.669158878504673, "grad_norm": 0.035097621381282806, "learning_rate": 0.0002, "loss": 0.5151344537734985, "mean_token_accuracy": 0.7884217798709869, "num_tokens": 11652621.0, "step": 715 }, { "entropy": 0.5442497134208679, "epoch": 2.6728971962616823, "grad_norm": 0.04381122440099716, "learning_rate": 0.0002, "loss": 0.5412749648094177, "mean_token_accuracy": 0.7799884676933289, "num_tokens": 11669129.0, "step": 716 }, { "entropy": 0.5303985998034477, "epoch": 2.6766355140186917, "grad_norm": 0.03387914225459099, "learning_rate": 0.0002, "loss": 0.5209308862686157, "mean_token_accuracy": 0.7879882901906967, "num_tokens": 11685246.0, "step": 717 }, { "entropy": 0.551127091050148, "epoch": 2.680373831775701, "grad_norm": 0.03922301158308983, "learning_rate": 0.0002, "loss": 0.5454061031341553, "mean_token_accuracy": 0.7784066051244736, "num_tokens": 11701476.0, "step": 718 }, { "entropy": 0.537367194890976, "epoch": 2.6841121495327105, "grad_norm": 0.038754355162382126, "learning_rate": 0.0002, "loss": 0.5407044887542725, "mean_token_accuracy": 0.7816831916570663, "num_tokens": 11717876.0, "step": 719 }, { "entropy": 0.5448082834482193, "epoch": 2.68785046728972, "grad_norm": 0.039220135658979416, "learning_rate": 0.0002, "loss": 0.5474362373352051, "mean_token_accuracy": 0.7776313573122025, "num_tokens": 11734335.0, "step": 720 }, { "entropy": 0.5400021821260452, "epoch": 2.691588785046729, "grad_norm": 0.04735405370593071, "learning_rate": 0.0002, "loss": 0.5481384992599487, "mean_token_accuracy": 0.7767128497362137, "num_tokens": 11750551.0, "step": 721 }, { "entropy": 0.5442029386758804, "epoch": 2.695327102803738, "grad_norm": 0.04216023534536362, "learning_rate": 0.0002, "loss": 0.5538774728775024, "mean_token_accuracy": 0.7767860740423203, "num_tokens": 11766874.0, "step": 722 }, { "entropy": 0.5446023046970367, "epoch": 2.6990654205607476, "grad_norm": 0.036887411028146744, "learning_rate": 0.0002, "loss": 0.5384114384651184, "mean_token_accuracy": 0.7818654030561447, "num_tokens": 11783153.0, "step": 723 }, { "entropy": 0.5451595932245255, "epoch": 2.702803738317757, "grad_norm": 0.03859608620405197, "learning_rate": 0.0002, "loss": 0.5347609519958496, "mean_token_accuracy": 0.781577005982399, "num_tokens": 11799221.0, "step": 724 }, { "entropy": 0.5464123338460922, "epoch": 2.7065420560747664, "grad_norm": 0.04104648903012276, "learning_rate": 0.0002, "loss": 0.531836986541748, "mean_token_accuracy": 0.7847746908664703, "num_tokens": 11815592.0, "step": 725 }, { "entropy": 0.5458803474903107, "epoch": 2.710280373831776, "grad_norm": 0.041141774505376816, "learning_rate": 0.0002, "loss": 0.5450369119644165, "mean_token_accuracy": 0.7772473990917206, "num_tokens": 11831810.0, "step": 726 }, { "entropy": 0.5207616165280342, "epoch": 2.714018691588785, "grad_norm": 0.039117299020290375, "learning_rate": 0.0002, "loss": 0.5268270969390869, "mean_token_accuracy": 0.7860666513442993, "num_tokens": 11848039.0, "step": 727 }, { "entropy": 0.5192839056253433, "epoch": 2.717757009345794, "grad_norm": 0.03917457163333893, "learning_rate": 0.0002, "loss": 0.5228926539421082, "mean_token_accuracy": 0.7870692610740662, "num_tokens": 11864185.0, "step": 728 }, { "entropy": 0.5525725483894348, "epoch": 2.7214953271028035, "grad_norm": 0.04475993663072586, "learning_rate": 0.0002, "loss": 0.5607837438583374, "mean_token_accuracy": 0.7710844576358795, "num_tokens": 11880885.0, "step": 729 }, { "entropy": 0.5314790159463882, "epoch": 2.725233644859813, "grad_norm": 0.03775126487016678, "learning_rate": 0.0002, "loss": 0.5314686298370361, "mean_token_accuracy": 0.7859503030776978, "num_tokens": 11897351.0, "step": 730 }, { "entropy": 0.5637041479349136, "epoch": 2.7289719626168223, "grad_norm": 0.045830611139535904, "learning_rate": 0.0002, "loss": 0.5615176558494568, "mean_token_accuracy": 0.7733500599861145, "num_tokens": 11913886.0, "step": 731 }, { "entropy": 0.5528976023197174, "epoch": 2.7327102803738317, "grad_norm": 0.0355507992208004, "learning_rate": 0.0002, "loss": 0.5482446551322937, "mean_token_accuracy": 0.7790254056453705, "num_tokens": 11930270.0, "step": 732 }, { "entropy": 0.521368145942688, "epoch": 2.736448598130841, "grad_norm": 0.040386781096458435, "learning_rate": 0.0002, "loss": 0.5189903974533081, "mean_token_accuracy": 0.7861309498548508, "num_tokens": 11946624.0, "step": 733 }, { "entropy": 0.5495569705963135, "epoch": 2.7401869158878505, "grad_norm": 0.04659309610724449, "learning_rate": 0.0002, "loss": 0.5496231913566589, "mean_token_accuracy": 0.7766851484775543, "num_tokens": 11963057.0, "step": 734 }, { "entropy": 0.5380824655294418, "epoch": 2.74392523364486, "grad_norm": 0.04431717097759247, "learning_rate": 0.0002, "loss": 0.5472241640090942, "mean_token_accuracy": 0.7799153625965118, "num_tokens": 11979414.0, "step": 735 }, { "entropy": 0.5362866371870041, "epoch": 2.7476635514018692, "grad_norm": 0.04207630082964897, "learning_rate": 0.0002, "loss": 0.5480789542198181, "mean_token_accuracy": 0.7744766473770142, "num_tokens": 11995788.0, "step": 736 }, { "entropy": 0.5203833281993866, "epoch": 2.7514018691588786, "grad_norm": 0.040439583361148834, "learning_rate": 0.0002, "loss": 0.5229013562202454, "mean_token_accuracy": 0.7877133041620255, "num_tokens": 12011768.0, "step": 737 }, { "entropy": 0.5442389398813248, "epoch": 2.755140186915888, "grad_norm": 0.036312710493803024, "learning_rate": 0.0002, "loss": 0.5421340465545654, "mean_token_accuracy": 0.7801235765218735, "num_tokens": 12027990.0, "step": 738 }, { "entropy": 0.540812149643898, "epoch": 2.7588785046728974, "grad_norm": 0.035805970430374146, "learning_rate": 0.0002, "loss": 0.5289261937141418, "mean_token_accuracy": 0.7858118265867233, "num_tokens": 12044016.0, "step": 739 }, { "entropy": 0.5561389774084091, "epoch": 2.762616822429907, "grad_norm": 0.03753306344151497, "learning_rate": 0.0002, "loss": 0.5497045516967773, "mean_token_accuracy": 0.7774728685617447, "num_tokens": 12060449.0, "step": 740 }, { "entropy": 0.5353166311979294, "epoch": 2.7663551401869158, "grad_norm": 0.04419036954641342, "learning_rate": 0.0002, "loss": 0.5267462134361267, "mean_token_accuracy": 0.7831297665834427, "num_tokens": 12076756.0, "step": 741 }, { "entropy": 0.5390448272228241, "epoch": 2.770093457943925, "grad_norm": 0.039156846702098846, "learning_rate": 0.0002, "loss": 0.5363330841064453, "mean_token_accuracy": 0.7822138518095016, "num_tokens": 12093231.0, "step": 742 }, { "entropy": 0.5334637314081192, "epoch": 2.7738317757009345, "grad_norm": 0.03978954628109932, "learning_rate": 0.0002, "loss": 0.5416637659072876, "mean_token_accuracy": 0.782222107052803, "num_tokens": 12109520.0, "step": 743 }, { "entropy": 0.5362211316823959, "epoch": 2.777570093457944, "grad_norm": 0.04728684201836586, "learning_rate": 0.0002, "loss": 0.5461055040359497, "mean_token_accuracy": 0.7771897614002228, "num_tokens": 12125527.0, "step": 744 }, { "entropy": 0.5383228212594986, "epoch": 2.7813084112149533, "grad_norm": 0.03740681707859039, "learning_rate": 0.0002, "loss": 0.5361698269844055, "mean_token_accuracy": 0.7826491445302963, "num_tokens": 12141826.0, "step": 745 }, { "entropy": 0.5330131649971008, "epoch": 2.7850467289719627, "grad_norm": 0.03758367896080017, "learning_rate": 0.0002, "loss": 0.5265568494796753, "mean_token_accuracy": 0.7877195477485657, "num_tokens": 12157984.0, "step": 746 }, { "entropy": 0.5397753864526749, "epoch": 2.788785046728972, "grad_norm": 0.042070865631103516, "learning_rate": 0.0002, "loss": 0.5313206911087036, "mean_token_accuracy": 0.7845780104398727, "num_tokens": 12174529.0, "step": 747 }, { "entropy": 0.5600686222314835, "epoch": 2.792523364485981, "grad_norm": 0.0377703532576561, "learning_rate": 0.0002, "loss": 0.5598015189170837, "mean_token_accuracy": 0.7710230052471161, "num_tokens": 12190857.0, "step": 748 }, { "entropy": 0.5242457091808319, "epoch": 2.7962616822429904, "grad_norm": 0.036673370748758316, "learning_rate": 0.0002, "loss": 0.5266134738922119, "mean_token_accuracy": 0.7835761904716492, "num_tokens": 12207046.0, "step": 749 }, { "entropy": 0.5196694731712341, "epoch": 2.8, "grad_norm": 0.04529178887605667, "learning_rate": 0.0002, "loss": 0.5295214653015137, "mean_token_accuracy": 0.7850393652915955, "num_tokens": 12223323.0, "step": 750 }, { "entropy": 0.5278067588806152, "epoch": 2.803738317757009, "grad_norm": 0.04078579694032669, "learning_rate": 0.0002, "loss": 0.5326597094535828, "mean_token_accuracy": 0.7830272614955902, "num_tokens": 12239416.0, "step": 751 }, { "entropy": 0.5326859503984451, "epoch": 2.8074766355140186, "grad_norm": 0.04164998233318329, "learning_rate": 0.0002, "loss": 0.5332698225975037, "mean_token_accuracy": 0.7816595435142517, "num_tokens": 12255780.0, "step": 752 }, { "entropy": 0.5238984450697899, "epoch": 2.811214953271028, "grad_norm": 0.03843814134597778, "learning_rate": 0.0002, "loss": 0.5195130109786987, "mean_token_accuracy": 0.7881060838699341, "num_tokens": 12272157.0, "step": 753 }, { "entropy": 0.5336880385875702, "epoch": 2.8149532710280374, "grad_norm": 0.039413440972566605, "learning_rate": 0.0002, "loss": 0.531658411026001, "mean_token_accuracy": 0.7836297303438187, "num_tokens": 12288500.0, "step": 754 }, { "entropy": 0.5406560152769089, "epoch": 2.8186915887850468, "grad_norm": 0.044693466275930405, "learning_rate": 0.0002, "loss": 0.541545033454895, "mean_token_accuracy": 0.7807977646589279, "num_tokens": 12304864.0, "step": 755 }, { "entropy": 0.538055032491684, "epoch": 2.822429906542056, "grad_norm": 0.03888081759214401, "learning_rate": 0.0002, "loss": 0.5337695479393005, "mean_token_accuracy": 0.7844773530960083, "num_tokens": 12321170.0, "step": 756 }, { "entropy": 0.527722030878067, "epoch": 2.8261682242990656, "grad_norm": 0.04188257455825806, "learning_rate": 0.0002, "loss": 0.5265190005302429, "mean_token_accuracy": 0.7878826707601547, "num_tokens": 12337523.0, "step": 757 }, { "entropy": 0.5507965534925461, "epoch": 2.829906542056075, "grad_norm": 0.03817446902394295, "learning_rate": 0.0002, "loss": 0.5500692129135132, "mean_token_accuracy": 0.7806660830974579, "num_tokens": 12354118.0, "step": 758 }, { "entropy": 0.5407035946846008, "epoch": 2.8336448598130843, "grad_norm": 0.042875856161117554, "learning_rate": 0.0002, "loss": 0.5405147671699524, "mean_token_accuracy": 0.7810708433389664, "num_tokens": 12370434.0, "step": 759 }, { "entropy": 0.5315204411745071, "epoch": 2.8373831775700937, "grad_norm": 0.042397141456604004, "learning_rate": 0.0002, "loss": 0.538346529006958, "mean_token_accuracy": 0.7821339964866638, "num_tokens": 12386428.0, "step": 760 }, { "entropy": 0.5520299524068832, "epoch": 2.8411214953271027, "grad_norm": 0.04137783497571945, "learning_rate": 0.0002, "loss": 0.5512533187866211, "mean_token_accuracy": 0.7781175673007965, "num_tokens": 12402867.0, "step": 761 }, { "entropy": 0.5510706156492233, "epoch": 2.844859813084112, "grad_norm": 0.04001981019973755, "learning_rate": 0.0002, "loss": 0.5554083585739136, "mean_token_accuracy": 0.7719452530145645, "num_tokens": 12419054.0, "step": 762 }, { "entropy": 0.5559884458780289, "epoch": 2.8485981308411215, "grad_norm": 0.035403911024332047, "learning_rate": 0.0002, "loss": 0.5523775815963745, "mean_token_accuracy": 0.7766276150941849, "num_tokens": 12435351.0, "step": 763 }, { "entropy": 0.5434874594211578, "epoch": 2.852336448598131, "grad_norm": 0.03929636627435684, "learning_rate": 0.0002, "loss": 0.537907063961029, "mean_token_accuracy": 0.7796172052621841, "num_tokens": 12451647.0, "step": 764 }, { "entropy": 0.5497813075780869, "epoch": 2.8560747663551402, "grad_norm": 0.03768793120980263, "learning_rate": 0.0002, "loss": 0.5450780391693115, "mean_token_accuracy": 0.7810264527797699, "num_tokens": 12468063.0, "step": 765 }, { "entropy": 0.5202910378575325, "epoch": 2.8598130841121496, "grad_norm": 0.03793422132730484, "learning_rate": 0.0002, "loss": 0.5197356343269348, "mean_token_accuracy": 0.7887470573186874, "num_tokens": 12484329.0, "step": 766 }, { "entropy": 0.5339359492063522, "epoch": 2.863551401869159, "grad_norm": 0.04222627729177475, "learning_rate": 0.0002, "loss": 0.5416290760040283, "mean_token_accuracy": 0.7798094302415848, "num_tokens": 12500522.0, "step": 767 }, { "entropy": 0.5492495894432068, "epoch": 2.867289719626168, "grad_norm": 0.043936122208833694, "learning_rate": 0.0002, "loss": 0.556658148765564, "mean_token_accuracy": 0.7760462909936905, "num_tokens": 12516877.0, "step": 768 }, { "entropy": 0.534624308347702, "epoch": 2.8710280373831774, "grad_norm": 0.042372506111860275, "learning_rate": 0.0002, "loss": 0.5317083597183228, "mean_token_accuracy": 0.7851851731538773, "num_tokens": 12533180.0, "step": 769 }, { "entropy": 0.5446592271327972, "epoch": 2.8747663551401867, "grad_norm": 0.037292055785655975, "learning_rate": 0.0002, "loss": 0.5379966497421265, "mean_token_accuracy": 0.7800319492816925, "num_tokens": 12549532.0, "step": 770 }, { "entropy": 0.5482804775238037, "epoch": 2.878504672897196, "grad_norm": 0.038804132491350174, "learning_rate": 0.0002, "loss": 0.5504724383354187, "mean_token_accuracy": 0.7738227695226669, "num_tokens": 12565943.0, "step": 771 }, { "entropy": 0.5368440747261047, "epoch": 2.8822429906542055, "grad_norm": 0.04019741341471672, "learning_rate": 0.0002, "loss": 0.5410951375961304, "mean_token_accuracy": 0.7783905565738678, "num_tokens": 12582258.0, "step": 772 }, { "entropy": 0.5336288064718246, "epoch": 2.885981308411215, "grad_norm": 0.034321509301662445, "learning_rate": 0.0002, "loss": 0.5328375101089478, "mean_token_accuracy": 0.784157395362854, "num_tokens": 12598555.0, "step": 773 }, { "entropy": 0.5653717815876007, "epoch": 2.8897196261682243, "grad_norm": 0.03593064844608307, "learning_rate": 0.0002, "loss": 0.5628952383995056, "mean_token_accuracy": 0.7731250822544098, "num_tokens": 12614684.0, "step": 774 }, { "entropy": 0.5388960689306259, "epoch": 2.8934579439252337, "grad_norm": 0.03794105350971222, "learning_rate": 0.0002, "loss": 0.5317496061325073, "mean_token_accuracy": 0.7814508825540543, "num_tokens": 12631301.0, "step": 775 }, { "entropy": 0.5498441606760025, "epoch": 2.897196261682243, "grad_norm": 0.03615562617778778, "learning_rate": 0.0002, "loss": 0.5489410161972046, "mean_token_accuracy": 0.7768700569868088, "num_tokens": 12647948.0, "step": 776 }, { "entropy": 0.5340896248817444, "epoch": 2.9009345794392525, "grad_norm": 0.038868315517902374, "learning_rate": 0.0002, "loss": 0.5335500836372375, "mean_token_accuracy": 0.7818741798400879, "num_tokens": 12664189.0, "step": 777 }, { "entropy": 0.5473947077989578, "epoch": 2.904672897196262, "grad_norm": 0.04030415788292885, "learning_rate": 0.0002, "loss": 0.547685980796814, "mean_token_accuracy": 0.7762889117002487, "num_tokens": 12680521.0, "step": 778 }, { "entropy": 0.5354717969894409, "epoch": 2.9084112149532713, "grad_norm": 0.03963444381952286, "learning_rate": 0.0002, "loss": 0.5363295078277588, "mean_token_accuracy": 0.7828177064657211, "num_tokens": 12696847.0, "step": 779 }, { "entropy": 0.5292405933141708, "epoch": 2.91214953271028, "grad_norm": 0.044744838029146194, "learning_rate": 0.0002, "loss": 0.5327066779136658, "mean_token_accuracy": 0.7849072515964508, "num_tokens": 12713036.0, "step": 780 }, { "entropy": 0.52642522752285, "epoch": 2.9158878504672896, "grad_norm": 0.04283163696527481, "learning_rate": 0.0002, "loss": 0.5329762697219849, "mean_token_accuracy": 0.7837288975715637, "num_tokens": 12729209.0, "step": 781 }, { "entropy": 0.527685210108757, "epoch": 2.919626168224299, "grad_norm": 0.041390661150217056, "learning_rate": 0.0002, "loss": 0.5320221185684204, "mean_token_accuracy": 0.783889576792717, "num_tokens": 12745655.0, "step": 782 }, { "entropy": 0.5404015928506851, "epoch": 2.9233644859813084, "grad_norm": 0.040262214839458466, "learning_rate": 0.0002, "loss": 0.5304533243179321, "mean_token_accuracy": 0.7833625972270966, "num_tokens": 12762029.0, "step": 783 }, { "entropy": 0.5551902800798416, "epoch": 2.9271028037383178, "grad_norm": 0.0381385013461113, "learning_rate": 0.0002, "loss": 0.5540827512741089, "mean_token_accuracy": 0.774557501077652, "num_tokens": 12778129.0, "step": 784 }, { "entropy": 0.5423577576875687, "epoch": 2.930841121495327, "grad_norm": 0.04024689272046089, "learning_rate": 0.0002, "loss": 0.5434139370918274, "mean_token_accuracy": 0.7793742418289185, "num_tokens": 12794167.0, "step": 785 }, { "entropy": 0.5381026417016983, "epoch": 2.9345794392523366, "grad_norm": 0.03909367695450783, "learning_rate": 0.0002, "loss": 0.540184736251831, "mean_token_accuracy": 0.7813534885644913, "num_tokens": 12810454.0, "step": 786 }, { "entropy": 0.5301714539527893, "epoch": 2.938317757009346, "grad_norm": 0.039717331528663635, "learning_rate": 0.0002, "loss": 0.528195858001709, "mean_token_accuracy": 0.7839880138635635, "num_tokens": 12826792.0, "step": 787 }, { "entropy": 0.5483011454343796, "epoch": 2.942056074766355, "grad_norm": 0.04299187660217285, "learning_rate": 0.0002, "loss": 0.5469069480895996, "mean_token_accuracy": 0.7784111201763153, "num_tokens": 12843156.0, "step": 788 }, { "entropy": 0.5493280291557312, "epoch": 2.9457943925233643, "grad_norm": 0.03909771516919136, "learning_rate": 0.0002, "loss": 0.5475714206695557, "mean_token_accuracy": 0.7802032381296158, "num_tokens": 12859513.0, "step": 789 }, { "entropy": 0.545919269323349, "epoch": 2.9495327102803737, "grad_norm": 0.03977775201201439, "learning_rate": 0.0002, "loss": 0.5396496057510376, "mean_token_accuracy": 0.7824081033468246, "num_tokens": 12875944.0, "step": 790 }, { "entropy": 0.5471485257148743, "epoch": 2.953271028037383, "grad_norm": 0.04360375925898552, "learning_rate": 0.0002, "loss": 0.546139657497406, "mean_token_accuracy": 0.7795716971158981, "num_tokens": 12892408.0, "step": 791 }, { "entropy": 0.5483593940734863, "epoch": 2.9570093457943925, "grad_norm": 0.03873739019036293, "learning_rate": 0.0002, "loss": 0.5458930134773254, "mean_token_accuracy": 0.7784797698259354, "num_tokens": 12908878.0, "step": 792 }, { "entropy": 0.5327412039041519, "epoch": 2.960747663551402, "grad_norm": 0.04030138626694679, "learning_rate": 0.0002, "loss": 0.531423032283783, "mean_token_accuracy": 0.7864594012498856, "num_tokens": 12925328.0, "step": 793 }, { "entropy": 0.5355861634016037, "epoch": 2.9644859813084112, "grad_norm": 0.03622936084866524, "learning_rate": 0.0002, "loss": 0.5347930192947388, "mean_token_accuracy": 0.7837072014808655, "num_tokens": 12941525.0, "step": 794 }, { "entropy": 0.5421173870563507, "epoch": 2.9682242990654206, "grad_norm": 0.04139631241559982, "learning_rate": 0.0002, "loss": 0.5441262125968933, "mean_token_accuracy": 0.7780770361423492, "num_tokens": 12957883.0, "step": 795 }, { "entropy": 0.5358422696590424, "epoch": 2.97196261682243, "grad_norm": 0.04235566407442093, "learning_rate": 0.0002, "loss": 0.5453042984008789, "mean_token_accuracy": 0.780327558517456, "num_tokens": 12974226.0, "step": 796 }, { "entropy": 0.5261758118867874, "epoch": 2.9757009345794394, "grad_norm": 0.038478292524814606, "learning_rate": 0.0002, "loss": 0.5281113386154175, "mean_token_accuracy": 0.7872153073549271, "num_tokens": 12990610.0, "step": 797 }, { "entropy": 0.555643692612648, "epoch": 2.979439252336449, "grad_norm": 0.03554081916809082, "learning_rate": 0.0002, "loss": 0.5489306449890137, "mean_token_accuracy": 0.7791497707366943, "num_tokens": 13007012.0, "step": 798 }, { "entropy": 0.5474710315465927, "epoch": 2.983177570093458, "grad_norm": 0.04082915186882019, "learning_rate": 0.0002, "loss": 0.5414685606956482, "mean_token_accuracy": 0.7802593261003494, "num_tokens": 13023273.0, "step": 799 }, { "entropy": 0.551795169711113, "epoch": 2.986915887850467, "grad_norm": 0.03786645457148552, "learning_rate": 0.0002, "loss": 0.5478507280349731, "mean_token_accuracy": 0.7769146114587784, "num_tokens": 13039409.0, "step": 800 }, { "entropy": 0.5366168767213821, "epoch": 2.9906542056074765, "grad_norm": 0.04365032911300659, "learning_rate": 0.0002, "loss": 0.5442554354667664, "mean_token_accuracy": 0.7847046703100204, "num_tokens": 13055837.0, "step": 801 }, { "entropy": 0.528346061706543, "epoch": 2.994392523364486, "grad_norm": 0.05227791890501976, "learning_rate": 0.0002, "loss": 0.5428685545921326, "mean_token_accuracy": 0.7789010256528854, "num_tokens": 13072216.0, "step": 802 }, { "entropy": 0.5396917909383774, "epoch": 2.9981308411214953, "grad_norm": 0.03931191936135292, "learning_rate": 0.0002, "loss": 0.5454744696617126, "mean_token_accuracy": 0.7764900475740433, "num_tokens": 13088462.0, "step": 803 }, { "entropy": 0.5376738607883453, "epoch": 3.0, "grad_norm": 0.04954347386956215, "learning_rate": 0.0002, "loss": 0.5307910442352295, "mean_token_accuracy": 0.7855222225189209, "num_tokens": 13096612.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2194419027224822e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }