{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 220, "global_step": 3512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9718521535396576, "epoch": 0.002280501710376283, "grad_norm": 22.375, "learning_rate": 0.0, "loss": 0.5641, "mean_token_accuracy": 0.8487243205308914, "num_tokens": 106991.0, "step": 1 }, { "entropy": 0.9679722785949707, "epoch": 0.004561003420752566, "grad_norm": 20.125, "learning_rate": 2.272727272727273e-08, "loss": 0.5435, "mean_token_accuracy": 0.8494603335857391, "num_tokens": 212946.0, "step": 2 }, { "entropy": 0.9649502336978912, "epoch": 0.0068415051311288486, "grad_norm": 22.625, "learning_rate": 4.545454545454546e-08, "loss": 0.6144, "mean_token_accuracy": 0.8399151116609573, "num_tokens": 319206.0, "step": 3 }, { "entropy": 0.9706314355134964, "epoch": 0.009122006841505131, "grad_norm": 21.5, "learning_rate": 6.818181818181819e-08, "loss": 0.5636, "mean_token_accuracy": 0.8579978793859482, "num_tokens": 425322.0, "step": 4 }, { "entropy": 0.9706339240074158, "epoch": 0.011402508551881414, "grad_norm": 20.75, "learning_rate": 9.090909090909091e-08, "loss": 0.62, "mean_token_accuracy": 0.8451835811138153, "num_tokens": 531937.0, "step": 5 }, { "entropy": 0.9648790061473846, "epoch": 0.013683010262257697, "grad_norm": 19.75, "learning_rate": 1.1363636363636364e-07, "loss": 0.5459, "mean_token_accuracy": 0.8513335734605789, "num_tokens": 637977.0, "step": 6 }, { "entropy": 0.9688467532396317, "epoch": 0.01596351197263398, "grad_norm": 20.5, "learning_rate": 1.3636363636363637e-07, "loss": 0.5818, "mean_token_accuracy": 0.8554333001375198, "num_tokens": 744457.0, "step": 7 }, { "entropy": 0.9626207947731018, "epoch": 0.018244013683010263, "grad_norm": 20.5, "learning_rate": 1.590909090909091e-07, "loss": 0.5499, "mean_token_accuracy": 0.8546150773763657, "num_tokens": 851351.0, "step": 8 }, { "entropy": 0.9742659777402878, "epoch": 0.020524515393386546, "grad_norm": 20.0, "learning_rate": 1.8181818181818183e-07, "loss": 0.5715, "mean_token_accuracy": 0.8515513092279434, "num_tokens": 957783.0, "step": 9 }, { "entropy": 0.9747052788734436, "epoch": 0.02280501710376283, "grad_norm": 20.125, "learning_rate": 2.0454545454545456e-07, "loss": 0.5932, "mean_token_accuracy": 0.855087161064148, "num_tokens": 1064484.0, "step": 10 }, { "entropy": 0.9937114864587784, "epoch": 0.02508551881413911, "grad_norm": 20.25, "learning_rate": 2.2727272727272729e-07, "loss": 0.5747, "mean_token_accuracy": 0.8463916778564453, "num_tokens": 1171094.0, "step": 11 }, { "entropy": 0.9803711771965027, "epoch": 0.027366020524515394, "grad_norm": 21.0, "learning_rate": 2.5000000000000004e-07, "loss": 0.5643, "mean_token_accuracy": 0.8523868471384048, "num_tokens": 1277120.0, "step": 12 }, { "entropy": 0.9708976000547409, "epoch": 0.029646522234891677, "grad_norm": 19.5, "learning_rate": 2.7272727272727274e-07, "loss": 0.542, "mean_token_accuracy": 0.8623145669698715, "num_tokens": 1383378.0, "step": 13 }, { "entropy": 0.9743312001228333, "epoch": 0.03192702394526796, "grad_norm": 18.75, "learning_rate": 2.954545454545455e-07, "loss": 0.5744, "mean_token_accuracy": 0.8504001498222351, "num_tokens": 1490017.0, "step": 14 }, { "entropy": 0.9716125130653381, "epoch": 0.03420752565564424, "grad_norm": 20.0, "learning_rate": 3.181818181818182e-07, "loss": 0.5754, "mean_token_accuracy": 0.8533312529325485, "num_tokens": 1596039.0, "step": 15 }, { "entropy": 0.9829763025045395, "epoch": 0.036488027366020526, "grad_norm": 21.25, "learning_rate": 3.409090909090909e-07, "loss": 0.6211, "mean_token_accuracy": 0.8512101024389267, "num_tokens": 1702663.0, "step": 16 }, { "entropy": 0.9836482554674149, "epoch": 0.03876852907639681, "grad_norm": 19.25, "learning_rate": 3.6363636363636366e-07, "loss": 0.5878, "mean_token_accuracy": 0.8530930876731873, "num_tokens": 1809380.0, "step": 17 }, { "entropy": 0.9813654273748398, "epoch": 0.04104903078677309, "grad_norm": 20.5, "learning_rate": 3.8636363636363636e-07, "loss": 0.594, "mean_token_accuracy": 0.849306732416153, "num_tokens": 1916379.0, "step": 18 }, { "entropy": 0.9761021733283997, "epoch": 0.043329532497149374, "grad_norm": 19.25, "learning_rate": 4.090909090909091e-07, "loss": 0.6105, "mean_token_accuracy": 0.8469045907258987, "num_tokens": 2023542.0, "step": 19 }, { "entropy": 0.9846144765615463, "epoch": 0.04561003420752566, "grad_norm": 20.5, "learning_rate": 4.3181818181818187e-07, "loss": 0.5719, "mean_token_accuracy": 0.8530702441930771, "num_tokens": 2129588.0, "step": 20 }, { "entropy": 0.9861293584108353, "epoch": 0.04789053591790194, "grad_norm": 23.125, "learning_rate": 4.5454545454545457e-07, "loss": 0.5503, "mean_token_accuracy": 0.8581545054912567, "num_tokens": 2236273.0, "step": 21 }, { "entropy": 0.9861622601747513, "epoch": 0.05017103762827822, "grad_norm": 19.625, "learning_rate": 4.772727272727274e-07, "loss": 0.5571, "mean_token_accuracy": 0.8600664436817169, "num_tokens": 2342660.0, "step": 22 }, { "entropy": 0.9745429307222366, "epoch": 0.052451539338654506, "grad_norm": 26.0, "learning_rate": 5.000000000000001e-07, "loss": 0.5802, "mean_token_accuracy": 0.8462299108505249, "num_tokens": 2448798.0, "step": 23 }, { "entropy": 0.974897101521492, "epoch": 0.05473204104903079, "grad_norm": 25.125, "learning_rate": 5.227272727272728e-07, "loss": 0.5938, "mean_token_accuracy": 0.8518176674842834, "num_tokens": 2554466.0, "step": 24 }, { "entropy": 0.9859575182199478, "epoch": 0.05701254275940707, "grad_norm": 26.125, "learning_rate": 5.454545454545455e-07, "loss": 0.5965, "mean_token_accuracy": 0.8504894822835922, "num_tokens": 2661100.0, "step": 25 }, { "entropy": 0.9888395965099335, "epoch": 0.059293044469783354, "grad_norm": 19.75, "learning_rate": 5.681818181818182e-07, "loss": 0.6318, "mean_token_accuracy": 0.8416079580783844, "num_tokens": 2768033.0, "step": 26 }, { "entropy": 0.9656000137329102, "epoch": 0.06157354618015964, "grad_norm": 20.875, "learning_rate": 5.90909090909091e-07, "loss": 0.6022, "mean_token_accuracy": 0.852581262588501, "num_tokens": 2873901.0, "step": 27 }, { "entropy": 0.9809471517801285, "epoch": 0.06385404789053592, "grad_norm": 20.5, "learning_rate": 6.136363636363637e-07, "loss": 0.5601, "mean_token_accuracy": 0.8570095002651215, "num_tokens": 2980343.0, "step": 28 }, { "entropy": 0.9743965268135071, "epoch": 0.0661345496009122, "grad_norm": 20.0, "learning_rate": 6.363636363636364e-07, "loss": 0.5752, "mean_token_accuracy": 0.8493557870388031, "num_tokens": 3086658.0, "step": 29 }, { "entropy": 0.9842239767313004, "epoch": 0.06841505131128849, "grad_norm": 19.0, "learning_rate": 6.590909090909091e-07, "loss": 0.5271, "mean_token_accuracy": 0.8636961430311203, "num_tokens": 3192435.0, "step": 30 }, { "entropy": 0.9747372418642044, "epoch": 0.07069555302166476, "grad_norm": 18.375, "learning_rate": 6.818181818181818e-07, "loss": 0.5387, "mean_token_accuracy": 0.8586861491203308, "num_tokens": 3298855.0, "step": 31 }, { "entropy": 0.97234046459198, "epoch": 0.07297605473204105, "grad_norm": 18.75, "learning_rate": 7.045454545454545e-07, "loss": 0.5153, "mean_token_accuracy": 0.864816814661026, "num_tokens": 3405470.0, "step": 32 }, { "entropy": 0.97023506462574, "epoch": 0.07525655644241733, "grad_norm": 20.125, "learning_rate": 7.272727272727273e-07, "loss": 0.573, "mean_token_accuracy": 0.8532920628786087, "num_tokens": 3511598.0, "step": 33 }, { "entropy": 0.9774262756109238, "epoch": 0.07753705815279362, "grad_norm": 19.5, "learning_rate": 7.5e-07, "loss": 0.5753, "mean_token_accuracy": 0.858393594622612, "num_tokens": 3618428.0, "step": 34 }, { "entropy": 0.9733954071998596, "epoch": 0.07981755986316989, "grad_norm": 17.5, "learning_rate": 7.727272727272727e-07, "loss": 0.473, "mean_token_accuracy": 0.8714419454336166, "num_tokens": 3724354.0, "step": 35 }, { "entropy": 0.9686232805252075, "epoch": 0.08209806157354618, "grad_norm": 19.375, "learning_rate": 7.954545454545455e-07, "loss": 0.5881, "mean_token_accuracy": 0.8572538048028946, "num_tokens": 3830483.0, "step": 36 }, { "entropy": 0.9713583290576935, "epoch": 0.08437856328392246, "grad_norm": 21.125, "learning_rate": 8.181818181818182e-07, "loss": 0.5149, "mean_token_accuracy": 0.8647192567586899, "num_tokens": 3937426.0, "step": 37 }, { "entropy": 0.9800392240285873, "epoch": 0.08665906499429875, "grad_norm": 18.875, "learning_rate": 8.409090909090909e-07, "loss": 0.4947, "mean_token_accuracy": 0.8733615279197693, "num_tokens": 4043552.0, "step": 38 }, { "entropy": 0.9819011092185974, "epoch": 0.08893956670467502, "grad_norm": 17.875, "learning_rate": 8.636363636363637e-07, "loss": 0.5191, "mean_token_accuracy": 0.8626638501882553, "num_tokens": 4150290.0, "step": 39 }, { "entropy": 0.9766407608985901, "epoch": 0.09122006841505131, "grad_norm": 18.0, "learning_rate": 8.863636363636364e-07, "loss": 0.5246, "mean_token_accuracy": 0.8591038435697556, "num_tokens": 4256548.0, "step": 40 }, { "entropy": 0.9757827073335648, "epoch": 0.09350057012542759, "grad_norm": 17.125, "learning_rate": 9.090909090909091e-07, "loss": 0.4861, "mean_token_accuracy": 0.873199537396431, "num_tokens": 4363734.0, "step": 41 }, { "entropy": 0.9684963524341583, "epoch": 0.09578107183580388, "grad_norm": 21.875, "learning_rate": 9.31818181818182e-07, "loss": 0.5363, "mean_token_accuracy": 0.8590797334909439, "num_tokens": 4469308.0, "step": 42 }, { "entropy": 0.9752471148967743, "epoch": 0.09806157354618016, "grad_norm": 18.375, "learning_rate": 9.545454545454548e-07, "loss": 0.5135, "mean_token_accuracy": 0.86577108502388, "num_tokens": 4575933.0, "step": 43 }, { "entropy": 0.9777541160583496, "epoch": 0.10034207525655645, "grad_norm": 25.125, "learning_rate": 9.772727272727275e-07, "loss": 0.5889, "mean_token_accuracy": 0.8561224490404129, "num_tokens": 4682110.0, "step": 44 }, { "entropy": 0.9719362854957581, "epoch": 0.10262257696693272, "grad_norm": 17.5, "learning_rate": 1.0000000000000002e-06, "loss": 0.5238, "mean_token_accuracy": 0.8665270805358887, "num_tokens": 4788059.0, "step": 45 }, { "entropy": 0.9743500500917435, "epoch": 0.10490307867730901, "grad_norm": 17.625, "learning_rate": 1.0227272727272729e-06, "loss": 0.5109, "mean_token_accuracy": 0.8747717142105103, "num_tokens": 4894627.0, "step": 46 }, { "entropy": 0.9867343902587891, "epoch": 0.10718358038768529, "grad_norm": 19.25, "learning_rate": 1.0454545454545456e-06, "loss": 0.5368, "mean_token_accuracy": 0.8593539595603943, "num_tokens": 5001793.0, "step": 47 }, { "entropy": 0.9779965728521347, "epoch": 0.10946408209806158, "grad_norm": 16.75, "learning_rate": 1.0681818181818183e-06, "loss": 0.5155, "mean_token_accuracy": 0.8755223155021667, "num_tokens": 5108380.0, "step": 48 }, { "entropy": 0.9692215919494629, "epoch": 0.11174458380843785, "grad_norm": 17.625, "learning_rate": 1.090909090909091e-06, "loss": 0.568, "mean_token_accuracy": 0.86724853515625, "num_tokens": 5214943.0, "step": 49 }, { "entropy": 0.9778064638376236, "epoch": 0.11402508551881414, "grad_norm": 17.0, "learning_rate": 1.1136363636363637e-06, "loss": 0.5126, "mean_token_accuracy": 0.8680306822061539, "num_tokens": 5320823.0, "step": 50 }, { "entropy": 0.9795403331518173, "epoch": 0.11630558722919042, "grad_norm": 17.375, "learning_rate": 1.1363636363636364e-06, "loss": 0.4963, "mean_token_accuracy": 0.8756769299507141, "num_tokens": 5427057.0, "step": 51 }, { "entropy": 0.9684654474258423, "epoch": 0.11858608893956671, "grad_norm": 16.5, "learning_rate": 1.159090909090909e-06, "loss": 0.4996, "mean_token_accuracy": 0.8833677768707275, "num_tokens": 5533159.0, "step": 52 }, { "entropy": 0.9636593014001846, "epoch": 0.12086659064994298, "grad_norm": 15.25, "learning_rate": 1.181818181818182e-06, "loss": 0.4847, "mean_token_accuracy": 0.8897876888513565, "num_tokens": 5639660.0, "step": 53 }, { "entropy": 0.9623651653528214, "epoch": 0.12314709236031927, "grad_norm": 17.125, "learning_rate": 1.2045454545454547e-06, "loss": 0.4838, "mean_token_accuracy": 0.8815622329711914, "num_tokens": 5746090.0, "step": 54 }, { "entropy": 0.9659887701272964, "epoch": 0.12542759407069556, "grad_norm": 14.3125, "learning_rate": 1.2272727272727274e-06, "loss": 0.4248, "mean_token_accuracy": 0.8967800289392471, "num_tokens": 5852161.0, "step": 55 }, { "entropy": 0.9752072095870972, "epoch": 0.12770809578107184, "grad_norm": 20.0, "learning_rate": 1.25e-06, "loss": 0.4924, "mean_token_accuracy": 0.8890634328126907, "num_tokens": 5958514.0, "step": 56 }, { "entropy": 0.9793832451105118, "epoch": 0.12998859749144812, "grad_norm": 13.9375, "learning_rate": 1.2727272727272728e-06, "loss": 0.4317, "mean_token_accuracy": 0.8969658315181732, "num_tokens": 6065650.0, "step": 57 }, { "entropy": 0.9677479565143585, "epoch": 0.1322690992018244, "grad_norm": 13.5, "learning_rate": 1.2954545454545455e-06, "loss": 0.448, "mean_token_accuracy": 0.8914330154657364, "num_tokens": 6172002.0, "step": 58 }, { "entropy": 0.9671346098184586, "epoch": 0.1345496009122007, "grad_norm": 14.1875, "learning_rate": 1.3181818181818182e-06, "loss": 0.4173, "mean_token_accuracy": 0.9014728516340256, "num_tokens": 6278469.0, "step": 59 }, { "entropy": 0.9790653139352798, "epoch": 0.13683010262257697, "grad_norm": 15.0, "learning_rate": 1.3409090909090911e-06, "loss": 0.5121, "mean_token_accuracy": 0.8901016712188721, "num_tokens": 6385164.0, "step": 60 }, { "entropy": 0.968551516532898, "epoch": 0.13911060433295325, "grad_norm": 14.0625, "learning_rate": 1.3636363636363636e-06, "loss": 0.4377, "mean_token_accuracy": 0.8993001282215118, "num_tokens": 6491510.0, "step": 61 }, { "entropy": 0.9765780121088028, "epoch": 0.14139110604332952, "grad_norm": 14.9375, "learning_rate": 1.3863636363636365e-06, "loss": 0.4864, "mean_token_accuracy": 0.9009080529212952, "num_tokens": 6597595.0, "step": 62 }, { "entropy": 0.962057501077652, "epoch": 0.14367160775370583, "grad_norm": 13.625, "learning_rate": 1.409090909090909e-06, "loss": 0.4481, "mean_token_accuracy": 0.8968811482191086, "num_tokens": 6704303.0, "step": 63 }, { "entropy": 0.9634363055229187, "epoch": 0.1459521094640821, "grad_norm": 12.8125, "learning_rate": 1.431818181818182e-06, "loss": 0.4171, "mean_token_accuracy": 0.9008316695690155, "num_tokens": 6810611.0, "step": 64 }, { "entropy": 0.9596368819475174, "epoch": 0.14823261117445838, "grad_norm": 13.8125, "learning_rate": 1.4545454545454546e-06, "loss": 0.4292, "mean_token_accuracy": 0.8998262584209442, "num_tokens": 6916863.0, "step": 65 }, { "entropy": 0.9591926634311676, "epoch": 0.15051311288483465, "grad_norm": 12.0625, "learning_rate": 1.4772727272727275e-06, "loss": 0.3763, "mean_token_accuracy": 0.906080573797226, "num_tokens": 7022838.0, "step": 66 }, { "entropy": 0.9627029299736023, "epoch": 0.15279361459521096, "grad_norm": 11.8125, "learning_rate": 1.5e-06, "loss": 0.3825, "mean_token_accuracy": 0.9101632535457611, "num_tokens": 7128965.0, "step": 67 }, { "entropy": 0.9665196388959885, "epoch": 0.15507411630558723, "grad_norm": 13.375, "learning_rate": 1.522727272727273e-06, "loss": 0.3492, "mean_token_accuracy": 0.9141627699136734, "num_tokens": 7235292.0, "step": 68 }, { "entropy": 0.9532487392425537, "epoch": 0.1573546180159635, "grad_norm": 18.875, "learning_rate": 1.5454545454545454e-06, "loss": 0.3965, "mean_token_accuracy": 0.9139973968267441, "num_tokens": 7341872.0, "step": 69 }, { "entropy": 0.9812297523021698, "epoch": 0.15963511972633979, "grad_norm": 12.625, "learning_rate": 1.5681818181818184e-06, "loss": 0.4012, "mean_token_accuracy": 0.9071068912744522, "num_tokens": 7449543.0, "step": 70 }, { "entropy": 0.9552039951086044, "epoch": 0.1619156214367161, "grad_norm": 12.1875, "learning_rate": 1.590909090909091e-06, "loss": 0.4077, "mean_token_accuracy": 0.9109017997980118, "num_tokens": 7555729.0, "step": 71 }, { "entropy": 0.96413454413414, "epoch": 0.16419612314709237, "grad_norm": 11.375, "learning_rate": 1.613636363636364e-06, "loss": 0.3848, "mean_token_accuracy": 0.9147694408893585, "num_tokens": 7661774.0, "step": 72 }, { "entropy": 0.963201254606247, "epoch": 0.16647662485746864, "grad_norm": 11.4375, "learning_rate": 1.6363636363636365e-06, "loss": 0.369, "mean_token_accuracy": 0.9128158241510391, "num_tokens": 7767915.0, "step": 73 }, { "entropy": 0.9576181173324585, "epoch": 0.16875712656784492, "grad_norm": 11.375, "learning_rate": 1.6590909090909094e-06, "loss": 0.3747, "mean_token_accuracy": 0.9172393828630447, "num_tokens": 7874328.0, "step": 74 }, { "entropy": 0.9647140651941299, "epoch": 0.17103762827822122, "grad_norm": 10.4375, "learning_rate": 1.6818181818181819e-06, "loss": 0.355, "mean_token_accuracy": 0.9231621623039246, "num_tokens": 7980217.0, "step": 75 }, { "entropy": 0.963541105389595, "epoch": 0.1733181299885975, "grad_norm": 10.875, "learning_rate": 1.7045454545454546e-06, "loss": 0.3933, "mean_token_accuracy": 0.9156422019004822, "num_tokens": 8086991.0, "step": 76 }, { "entropy": 0.9523647874593735, "epoch": 0.17559863169897377, "grad_norm": 14.3125, "learning_rate": 1.7272727272727275e-06, "loss": 0.4026, "mean_token_accuracy": 0.9106068015098572, "num_tokens": 8193076.0, "step": 77 }, { "entropy": 0.9554667323827744, "epoch": 0.17787913340935005, "grad_norm": 9.375, "learning_rate": 1.75e-06, "loss": 0.3199, "mean_token_accuracy": 0.9274622648954391, "num_tokens": 8299348.0, "step": 78 }, { "entropy": 0.9564069658517838, "epoch": 0.18015963511972635, "grad_norm": 10.375, "learning_rate": 1.7727272727272729e-06, "loss": 0.3822, "mean_token_accuracy": 0.9190638661384583, "num_tokens": 8405826.0, "step": 79 }, { "entropy": 0.9656827300786972, "epoch": 0.18244013683010263, "grad_norm": 10.875, "learning_rate": 1.7954545454545456e-06, "loss": 0.3554, "mean_token_accuracy": 0.9255002290010452, "num_tokens": 8511909.0, "step": 80 }, { "entropy": 0.9718214571475983, "epoch": 0.1847206385404789, "grad_norm": 9.625, "learning_rate": 1.8181818181818183e-06, "loss": 0.32, "mean_token_accuracy": 0.9225317537784576, "num_tokens": 8618249.0, "step": 81 }, { "entropy": 0.9628926664590836, "epoch": 0.18700114025085518, "grad_norm": 9.375, "learning_rate": 1.840909090909091e-06, "loss": 0.3317, "mean_token_accuracy": 0.9254167675971985, "num_tokens": 8724880.0, "step": 82 }, { "entropy": 0.96316958963871, "epoch": 0.18928164196123148, "grad_norm": 8.75, "learning_rate": 1.863636363636364e-06, "loss": 0.321, "mean_token_accuracy": 0.9289757609367371, "num_tokens": 8830871.0, "step": 83 }, { "entropy": 0.955514132976532, "epoch": 0.19156214367160776, "grad_norm": 9.25, "learning_rate": 1.8863636363636364e-06, "loss": 0.3235, "mean_token_accuracy": 0.9263086318969727, "num_tokens": 8936697.0, "step": 84 }, { "entropy": 0.964419350028038, "epoch": 0.19384264538198404, "grad_norm": 8.25, "learning_rate": 1.9090909090909095e-06, "loss": 0.2861, "mean_token_accuracy": 0.9326538443565369, "num_tokens": 9043057.0, "step": 85 }, { "entropy": 0.9724892824888229, "epoch": 0.1961231470923603, "grad_norm": 8.5625, "learning_rate": 1.931818181818182e-06, "loss": 0.2739, "mean_token_accuracy": 0.9340428560972214, "num_tokens": 9149982.0, "step": 86 }, { "entropy": 0.9621438831090927, "epoch": 0.19840364880273662, "grad_norm": 10.5, "learning_rate": 1.954545454545455e-06, "loss": 0.3359, "mean_token_accuracy": 0.9191739857196808, "num_tokens": 9255838.0, "step": 87 }, { "entropy": 0.9591668099164963, "epoch": 0.2006841505131129, "grad_norm": 7.9375, "learning_rate": 1.977272727272727e-06, "loss": 0.2597, "mean_token_accuracy": 0.9419514685869217, "num_tokens": 9361872.0, "step": 88 }, { "entropy": 0.9570581167936325, "epoch": 0.20296465222348917, "grad_norm": 7.875, "learning_rate": 2.0000000000000003e-06, "loss": 0.2872, "mean_token_accuracy": 0.93398417532444, "num_tokens": 9467742.0, "step": 89 }, { "entropy": 0.9712041169404984, "epoch": 0.20524515393386544, "grad_norm": 9.5, "learning_rate": 2.022727272727273e-06, "loss": 0.3023, "mean_token_accuracy": 0.9260585755109787, "num_tokens": 9574102.0, "step": 90 }, { "entropy": 0.9654423147439957, "epoch": 0.20752565564424175, "grad_norm": 7.6875, "learning_rate": 2.0454545454545457e-06, "loss": 0.2523, "mean_token_accuracy": 0.9439366012811661, "num_tokens": 9679656.0, "step": 91 }, { "entropy": 0.9483186155557632, "epoch": 0.20980615735461802, "grad_norm": 8.375, "learning_rate": 2.0681818181818184e-06, "loss": 0.2665, "mean_token_accuracy": 0.9371416568756104, "num_tokens": 9785201.0, "step": 92 }, { "entropy": 0.9651912152767181, "epoch": 0.2120866590649943, "grad_norm": 8.1875, "learning_rate": 2.090909090909091e-06, "loss": 0.2353, "mean_token_accuracy": 0.9408894330263138, "num_tokens": 9891360.0, "step": 93 }, { "entropy": 0.9730426371097565, "epoch": 0.21436716077537057, "grad_norm": 7.53125, "learning_rate": 2.113636363636364e-06, "loss": 0.2076, "mean_token_accuracy": 0.9458318650722504, "num_tokens": 9997492.0, "step": 94 }, { "entropy": 0.9590244591236115, "epoch": 0.21664766248574688, "grad_norm": 6.625, "learning_rate": 2.1363636363636365e-06, "loss": 0.2081, "mean_token_accuracy": 0.9475728273391724, "num_tokens": 10104262.0, "step": 95 }, { "entropy": 0.9598219692707062, "epoch": 0.21892816419612315, "grad_norm": 7.28125, "learning_rate": 2.1590909090909092e-06, "loss": 0.2605, "mean_token_accuracy": 0.9348293989896774, "num_tokens": 10210118.0, "step": 96 }, { "entropy": 0.9554770439863205, "epoch": 0.22120866590649943, "grad_norm": 6.8125, "learning_rate": 2.181818181818182e-06, "loss": 0.2361, "mean_token_accuracy": 0.9426037520170212, "num_tokens": 10316132.0, "step": 97 }, { "entropy": 0.9720187038183212, "epoch": 0.2234891676168757, "grad_norm": 9.75, "learning_rate": 2.2045454545454547e-06, "loss": 0.239, "mean_token_accuracy": 0.94212706387043, "num_tokens": 10422912.0, "step": 98 }, { "entropy": 0.9586055129766464, "epoch": 0.22576966932725198, "grad_norm": 6.84375, "learning_rate": 2.2272727272727274e-06, "loss": 0.2605, "mean_token_accuracy": 0.9372538030147552, "num_tokens": 10529007.0, "step": 99 }, { "entropy": 0.9759549796581268, "epoch": 0.22805017103762829, "grad_norm": 6.875, "learning_rate": 2.25e-06, "loss": 0.2469, "mean_token_accuracy": 0.934854045510292, "num_tokens": 10635685.0, "step": 100 }, { "entropy": 0.9591715484857559, "epoch": 0.23033067274800456, "grad_norm": 6.53125, "learning_rate": 2.2727272727272728e-06, "loss": 0.218, "mean_token_accuracy": 0.9447477757930756, "num_tokens": 10741736.0, "step": 101 }, { "entropy": 0.9706251621246338, "epoch": 0.23261117445838084, "grad_norm": 5.15625, "learning_rate": 2.295454545454546e-06, "loss": 0.1929, "mean_token_accuracy": 0.9466055184602737, "num_tokens": 10847564.0, "step": 102 }, { "entropy": 0.9708344042301178, "epoch": 0.2348916761687571, "grad_norm": 4.6875, "learning_rate": 2.318181818181818e-06, "loss": 0.1935, "mean_token_accuracy": 0.9511224627494812, "num_tokens": 10954194.0, "step": 103 }, { "entropy": 0.976265549659729, "epoch": 0.23717217787913342, "grad_norm": 5.40625, "learning_rate": 2.3409090909090913e-06, "loss": 0.2182, "mean_token_accuracy": 0.9498215913772583, "num_tokens": 11060062.0, "step": 104 }, { "entropy": 0.9729485958814621, "epoch": 0.2394526795895097, "grad_norm": 4.59375, "learning_rate": 2.363636363636364e-06, "loss": 0.1656, "mean_token_accuracy": 0.9602951556444168, "num_tokens": 11165917.0, "step": 105 }, { "entropy": 0.9831607192754745, "epoch": 0.24173318129988597, "grad_norm": 4.75, "learning_rate": 2.3863636363636367e-06, "loss": 0.2003, "mean_token_accuracy": 0.9462843537330627, "num_tokens": 11272161.0, "step": 106 }, { "entropy": 0.9589905589818954, "epoch": 0.24401368301026224, "grad_norm": 4.875, "learning_rate": 2.4090909090909094e-06, "loss": 0.1623, "mean_token_accuracy": 0.9581523388624191, "num_tokens": 11378556.0, "step": 107 }, { "entropy": 0.9755458235740662, "epoch": 0.24629418472063855, "grad_norm": 4.4375, "learning_rate": 2.431818181818182e-06, "loss": 0.1684, "mean_token_accuracy": 0.9592550992965698, "num_tokens": 11484766.0, "step": 108 }, { "entropy": 0.9729411751031876, "epoch": 0.24857468643101482, "grad_norm": 4.125, "learning_rate": 2.454545454545455e-06, "loss": 0.1618, "mean_token_accuracy": 0.9527864754199982, "num_tokens": 11591397.0, "step": 109 }, { "entropy": 0.9685869067907333, "epoch": 0.2508551881413911, "grad_norm": 4.25, "learning_rate": 2.4772727272727275e-06, "loss": 0.1644, "mean_token_accuracy": 0.9592944830656052, "num_tokens": 11696833.0, "step": 110 }, { "entropy": 0.9567488133907318, "epoch": 0.2531356898517674, "grad_norm": 4.84375, "learning_rate": 2.5e-06, "loss": 0.1825, "mean_token_accuracy": 0.9519646167755127, "num_tokens": 11803093.0, "step": 111 }, { "entropy": 0.9612108618021011, "epoch": 0.2554161915621437, "grad_norm": 4.15625, "learning_rate": 2.522727272727273e-06, "loss": 0.1698, "mean_token_accuracy": 0.9577222168445587, "num_tokens": 11909127.0, "step": 112 }, { "entropy": 0.9570221900939941, "epoch": 0.2576966932725199, "grad_norm": 4.0625, "learning_rate": 2.5454545454545456e-06, "loss": 0.1765, "mean_token_accuracy": 0.954651489853859, "num_tokens": 12015474.0, "step": 113 }, { "entropy": 0.9563720226287842, "epoch": 0.25997719498289623, "grad_norm": 4.1875, "learning_rate": 2.5681818181818187e-06, "loss": 0.1825, "mean_token_accuracy": 0.95598004758358, "num_tokens": 12121682.0, "step": 114 }, { "entropy": 0.9663042277097702, "epoch": 0.26225769669327254, "grad_norm": 4.5625, "learning_rate": 2.590909090909091e-06, "loss": 0.167, "mean_token_accuracy": 0.9566276967525482, "num_tokens": 12228239.0, "step": 115 }, { "entropy": 0.9684116095304489, "epoch": 0.2645381984036488, "grad_norm": 3.765625, "learning_rate": 2.6136363636363637e-06, "loss": 0.1509, "mean_token_accuracy": 0.9617685973644257, "num_tokens": 12334226.0, "step": 116 }, { "entropy": 0.972596675157547, "epoch": 0.2668187001140251, "grad_norm": 3.734375, "learning_rate": 2.6363636363636364e-06, "loss": 0.1567, "mean_token_accuracy": 0.9637322723865509, "num_tokens": 12440211.0, "step": 117 }, { "entropy": 0.9730584472417831, "epoch": 0.2690992018244014, "grad_norm": 3.71875, "learning_rate": 2.6590909090909095e-06, "loss": 0.1501, "mean_token_accuracy": 0.9572943300008774, "num_tokens": 12546830.0, "step": 118 }, { "entropy": 0.9717285186052322, "epoch": 0.27137970353477764, "grad_norm": 4.4375, "learning_rate": 2.6818181818181822e-06, "loss": 0.1817, "mean_token_accuracy": 0.954412192106247, "num_tokens": 12652971.0, "step": 119 }, { "entropy": 0.96302430331707, "epoch": 0.27366020524515394, "grad_norm": 4.25, "learning_rate": 2.7045454545454545e-06, "loss": 0.2182, "mean_token_accuracy": 0.9513444155454636, "num_tokens": 12760054.0, "step": 120 }, { "entropy": 0.9655550718307495, "epoch": 0.2759407069555302, "grad_norm": 3.765625, "learning_rate": 2.7272727272727272e-06, "loss": 0.1525, "mean_token_accuracy": 0.95326067507267, "num_tokens": 12866347.0, "step": 121 }, { "entropy": 0.971894383430481, "epoch": 0.2782212086659065, "grad_norm": 3.1875, "learning_rate": 2.7500000000000004e-06, "loss": 0.1381, "mean_token_accuracy": 0.9625128507614136, "num_tokens": 12972500.0, "step": 122 }, { "entropy": 0.9715329557657242, "epoch": 0.2805017103762828, "grad_norm": 3.15625, "learning_rate": 2.772727272727273e-06, "loss": 0.1275, "mean_token_accuracy": 0.9679891467094421, "num_tokens": 13078285.0, "step": 123 }, { "entropy": 0.9749847501516342, "epoch": 0.28278221208665905, "grad_norm": 3.625, "learning_rate": 2.7954545454545458e-06, "loss": 0.1598, "mean_token_accuracy": 0.9576127529144287, "num_tokens": 13184886.0, "step": 124 }, { "entropy": 0.9740230143070221, "epoch": 0.28506271379703535, "grad_norm": 3.953125, "learning_rate": 2.818181818181818e-06, "loss": 0.163, "mean_token_accuracy": 0.9560465663671494, "num_tokens": 13291030.0, "step": 125 }, { "entropy": 0.9571146965026855, "epoch": 0.28734321550741165, "grad_norm": 3.28125, "learning_rate": 2.8409090909090916e-06, "loss": 0.1415, "mean_token_accuracy": 0.9635005295276642, "num_tokens": 13397421.0, "step": 126 }, { "entropy": 0.9747041761875153, "epoch": 0.2896237172177879, "grad_norm": 3.4375, "learning_rate": 2.863636363636364e-06, "loss": 0.1541, "mean_token_accuracy": 0.9610213786363602, "num_tokens": 13503696.0, "step": 127 }, { "entropy": 0.9665826708078384, "epoch": 0.2919042189281642, "grad_norm": 3.359375, "learning_rate": 2.8863636363636366e-06, "loss": 0.1396, "mean_token_accuracy": 0.9652371108531952, "num_tokens": 13609741.0, "step": 128 }, { "entropy": 0.9699429124593735, "epoch": 0.29418472063854045, "grad_norm": 3.328125, "learning_rate": 2.9090909090909093e-06, "loss": 0.1302, "mean_token_accuracy": 0.9631678909063339, "num_tokens": 13715767.0, "step": 129 }, { "entropy": 0.9605697840452194, "epoch": 0.29646522234891676, "grad_norm": 3.078125, "learning_rate": 2.931818181818182e-06, "loss": 0.1202, "mean_token_accuracy": 0.9665965586900711, "num_tokens": 13822325.0, "step": 130 }, { "entropy": 0.980366662144661, "epoch": 0.29874572405929306, "grad_norm": 2.984375, "learning_rate": 2.954545454545455e-06, "loss": 0.129, "mean_token_accuracy": 0.9665180742740631, "num_tokens": 13928854.0, "step": 131 }, { "entropy": 0.9769528657197952, "epoch": 0.3010262257696693, "grad_norm": 2.984375, "learning_rate": 2.9772727272727274e-06, "loss": 0.127, "mean_token_accuracy": 0.9646988958120346, "num_tokens": 14035368.0, "step": 132 }, { "entropy": 0.9685975164175034, "epoch": 0.3033067274800456, "grad_norm": 4.15625, "learning_rate": 3e-06, "loss": 0.1355, "mean_token_accuracy": 0.9614743143320084, "num_tokens": 14141611.0, "step": 133 }, { "entropy": 0.9768745601177216, "epoch": 0.3055872291904219, "grad_norm": 2.890625, "learning_rate": 3.0227272727272728e-06, "loss": 0.1265, "mean_token_accuracy": 0.9646161943674088, "num_tokens": 14248153.0, "step": 134 }, { "entropy": 0.9718175381422043, "epoch": 0.30786773090079816, "grad_norm": 3.140625, "learning_rate": 3.045454545454546e-06, "loss": 0.1428, "mean_token_accuracy": 0.9626754224300385, "num_tokens": 14354770.0, "step": 135 }, { "entropy": 0.9629888981580734, "epoch": 0.31014823261117447, "grad_norm": 2.828125, "learning_rate": 3.0681818181818186e-06, "loss": 0.1161, "mean_token_accuracy": 0.9677574187517166, "num_tokens": 14461535.0, "step": 136 }, { "entropy": 0.962815061211586, "epoch": 0.3124287343215507, "grad_norm": 3.171875, "learning_rate": 3.090909090909091e-06, "loss": 0.141, "mean_token_accuracy": 0.963960662484169, "num_tokens": 14567404.0, "step": 137 }, { "entropy": 0.9701665937900543, "epoch": 0.314709236031927, "grad_norm": 2.96875, "learning_rate": 3.1136363636363636e-06, "loss": 0.1205, "mean_token_accuracy": 0.9698525667190552, "num_tokens": 14673506.0, "step": 138 }, { "entropy": 0.9629870802164078, "epoch": 0.3169897377423033, "grad_norm": 3.125, "learning_rate": 3.1363636363636367e-06, "loss": 0.1309, "mean_token_accuracy": 0.9648740589618683, "num_tokens": 14779648.0, "step": 139 }, { "entropy": 0.9612726271152496, "epoch": 0.31927023945267957, "grad_norm": 2.6875, "learning_rate": 3.1590909090909094e-06, "loss": 0.1124, "mean_token_accuracy": 0.9702065289020538, "num_tokens": 14886084.0, "step": 140 }, { "entropy": 0.9696302562952042, "epoch": 0.3215507411630559, "grad_norm": 3.359375, "learning_rate": 3.181818181818182e-06, "loss": 0.1506, "mean_token_accuracy": 0.9627184718847275, "num_tokens": 14993016.0, "step": 141 }, { "entropy": 0.962703675031662, "epoch": 0.3238312428734322, "grad_norm": 3.078125, "learning_rate": 3.204545454545455e-06, "loss": 0.14, "mean_token_accuracy": 0.9622391909360886, "num_tokens": 15099761.0, "step": 142 }, { "entropy": 0.9584702998399734, "epoch": 0.3261117445838084, "grad_norm": 3.171875, "learning_rate": 3.227272727272728e-06, "loss": 0.1328, "mean_token_accuracy": 0.9650201350450516, "num_tokens": 15205943.0, "step": 143 }, { "entropy": 0.9747777432203293, "epoch": 0.32839224629418473, "grad_norm": 4.53125, "learning_rate": 3.2500000000000002e-06, "loss": 0.1395, "mean_token_accuracy": 0.9675467610359192, "num_tokens": 15311801.0, "step": 144 }, { "entropy": 0.9676468223333359, "epoch": 0.330672748004561, "grad_norm": 2.578125, "learning_rate": 3.272727272727273e-06, "loss": 0.1206, "mean_token_accuracy": 0.9655298143625259, "num_tokens": 15417866.0, "step": 145 }, { "entropy": 0.9659714996814728, "epoch": 0.3329532497149373, "grad_norm": 2.6875, "learning_rate": 3.2954545454545456e-06, "loss": 0.1172, "mean_token_accuracy": 0.9689408242702484, "num_tokens": 15524379.0, "step": 146 }, { "entropy": 0.9870049506425858, "epoch": 0.3352337514253136, "grad_norm": 2.5, "learning_rate": 3.3181818181818188e-06, "loss": 0.1046, "mean_token_accuracy": 0.9696906358003616, "num_tokens": 15630729.0, "step": 147 }, { "entropy": 0.9705940783023834, "epoch": 0.33751425313568983, "grad_norm": 3.21875, "learning_rate": 3.3409090909090915e-06, "loss": 0.1118, "mean_token_accuracy": 0.9727614969015121, "num_tokens": 15737292.0, "step": 148 }, { "entropy": 0.993527352809906, "epoch": 0.33979475484606614, "grad_norm": 2.703125, "learning_rate": 3.3636363636363637e-06, "loss": 0.1217, "mean_token_accuracy": 0.966929703950882, "num_tokens": 15843213.0, "step": 149 }, { "entropy": 0.9875759780406952, "epoch": 0.34207525655644244, "grad_norm": 2.296875, "learning_rate": 3.3863636363636364e-06, "loss": 0.101, "mean_token_accuracy": 0.9717714637517929, "num_tokens": 15949442.0, "step": 150 }, { "entropy": 0.9509166181087494, "epoch": 0.3443557582668187, "grad_norm": 2.5, "learning_rate": 3.409090909090909e-06, "loss": 0.091, "mean_token_accuracy": 0.9728886038064957, "num_tokens": 16055285.0, "step": 151 }, { "entropy": 0.9573154449462891, "epoch": 0.346636259977195, "grad_norm": 2.625, "learning_rate": 3.4318181818181823e-06, "loss": 0.1009, "mean_token_accuracy": 0.9706485569477081, "num_tokens": 16160851.0, "step": 152 }, { "entropy": 0.9695054441690445, "epoch": 0.34891676168757124, "grad_norm": 2.515625, "learning_rate": 3.454545454545455e-06, "loss": 0.0977, "mean_token_accuracy": 0.9712526351213455, "num_tokens": 16267017.0, "step": 153 }, { "entropy": 0.9582559168338776, "epoch": 0.35119726339794755, "grad_norm": 2.859375, "learning_rate": 3.4772727272727277e-06, "loss": 0.1244, "mean_token_accuracy": 0.9661664664745331, "num_tokens": 16373244.0, "step": 154 }, { "entropy": 0.9529191851615906, "epoch": 0.35347776510832385, "grad_norm": 2.53125, "learning_rate": 3.5e-06, "loss": 0.115, "mean_token_accuracy": 0.9696183651685715, "num_tokens": 16479761.0, "step": 155 }, { "entropy": 0.962014839053154, "epoch": 0.3557582668187001, "grad_norm": 2.609375, "learning_rate": 3.522727272727273e-06, "loss": 0.1099, "mean_token_accuracy": 0.9720597863197327, "num_tokens": 16586144.0, "step": 156 }, { "entropy": 0.9554325491189957, "epoch": 0.3580387685290764, "grad_norm": 3.234375, "learning_rate": 3.5454545454545458e-06, "loss": 0.1186, "mean_token_accuracy": 0.9686466306447983, "num_tokens": 16692532.0, "step": 157 }, { "entropy": 0.9646629840135574, "epoch": 0.3603192702394527, "grad_norm": 2.59375, "learning_rate": 3.5681818181818185e-06, "loss": 0.0948, "mean_token_accuracy": 0.9709384739398956, "num_tokens": 16799317.0, "step": 158 }, { "entropy": 0.9611927717924118, "epoch": 0.36259977194982895, "grad_norm": 2.40625, "learning_rate": 3.590909090909091e-06, "loss": 0.1122, "mean_token_accuracy": 0.969350665807724, "num_tokens": 16905362.0, "step": 159 }, { "entropy": 0.9534666985273361, "epoch": 0.36488027366020526, "grad_norm": 2.71875, "learning_rate": 3.6136363636363643e-06, "loss": 0.1039, "mean_token_accuracy": 0.9758395105600357, "num_tokens": 17011468.0, "step": 160 }, { "entropy": 0.9613295793533325, "epoch": 0.3671607753705815, "grad_norm": 2.3125, "learning_rate": 3.6363636363636366e-06, "loss": 0.1069, "mean_token_accuracy": 0.9734722077846527, "num_tokens": 17118013.0, "step": 161 }, { "entropy": 0.9676676243543625, "epoch": 0.3694412770809578, "grad_norm": 2.078125, "learning_rate": 3.6590909090909093e-06, "loss": 0.0986, "mean_token_accuracy": 0.9763278663158417, "num_tokens": 17224203.0, "step": 162 }, { "entropy": 0.9632392078638077, "epoch": 0.3717217787913341, "grad_norm": 2.8125, "learning_rate": 3.681818181818182e-06, "loss": 0.1173, "mean_token_accuracy": 0.9718816876411438, "num_tokens": 17330920.0, "step": 163 }, { "entropy": 0.9814814925193787, "epoch": 0.37400228050171036, "grad_norm": 2.46875, "learning_rate": 3.704545454545455e-06, "loss": 0.1132, "mean_token_accuracy": 0.9699600636959076, "num_tokens": 17437158.0, "step": 164 }, { "entropy": 0.9592552185058594, "epoch": 0.37628278221208666, "grad_norm": 2.703125, "learning_rate": 3.727272727272728e-06, "loss": 0.1022, "mean_token_accuracy": 0.974522590637207, "num_tokens": 17543200.0, "step": 165 }, { "entropy": 0.9622849375009537, "epoch": 0.37856328392246297, "grad_norm": 2.390625, "learning_rate": 3.7500000000000005e-06, "loss": 0.1061, "mean_token_accuracy": 0.9737326949834824, "num_tokens": 17649508.0, "step": 166 }, { "entropy": 0.9660589098930359, "epoch": 0.3808437856328392, "grad_norm": 2.6875, "learning_rate": 3.772727272727273e-06, "loss": 0.112, "mean_token_accuracy": 0.96737040579319, "num_tokens": 17756326.0, "step": 167 }, { "entropy": 0.9816988259553909, "epoch": 0.3831242873432155, "grad_norm": 2.28125, "learning_rate": 3.7954545454545455e-06, "loss": 0.0941, "mean_token_accuracy": 0.9745183736085892, "num_tokens": 17862604.0, "step": 168 }, { "entropy": 0.9815102219581604, "epoch": 0.38540478905359177, "grad_norm": 2.34375, "learning_rate": 3.818181818181819e-06, "loss": 0.0894, "mean_token_accuracy": 0.9715871810913086, "num_tokens": 17968870.0, "step": 169 }, { "entropy": 0.9609841853380203, "epoch": 0.38768529076396807, "grad_norm": 2.328125, "learning_rate": 3.840909090909091e-06, "loss": 0.1012, "mean_token_accuracy": 0.9733140915632248, "num_tokens": 18074910.0, "step": 170 }, { "entropy": 0.9721857756376266, "epoch": 0.3899657924743444, "grad_norm": 2.140625, "learning_rate": 3.863636363636364e-06, "loss": 0.1, "mean_token_accuracy": 0.9719968289136887, "num_tokens": 18180840.0, "step": 171 }, { "entropy": 0.9636870622634888, "epoch": 0.3922462941847206, "grad_norm": 2.78125, "learning_rate": 3.886363636363637e-06, "loss": 0.1191, "mean_token_accuracy": 0.9657348394393921, "num_tokens": 18286668.0, "step": 172 }, { "entropy": 0.9792468994855881, "epoch": 0.3945267958950969, "grad_norm": 2.0, "learning_rate": 3.90909090909091e-06, "loss": 0.0926, "mean_token_accuracy": 0.9742760807275772, "num_tokens": 18393344.0, "step": 173 }, { "entropy": 0.9880589842796326, "epoch": 0.39680729760547323, "grad_norm": 2.140625, "learning_rate": 3.931818181818182e-06, "loss": 0.0797, "mean_token_accuracy": 0.9805101454257965, "num_tokens": 18499584.0, "step": 174 }, { "entropy": 0.9744017422199249, "epoch": 0.3990877993158495, "grad_norm": 2.1875, "learning_rate": 3.954545454545454e-06, "loss": 0.1082, "mean_token_accuracy": 0.9695633202791214, "num_tokens": 18606201.0, "step": 175 }, { "entropy": 0.9725730866193771, "epoch": 0.4013683010262258, "grad_norm": 2.46875, "learning_rate": 3.9772727272727275e-06, "loss": 0.0951, "mean_token_accuracy": 0.9743359535932541, "num_tokens": 18712619.0, "step": 176 }, { "entropy": 0.9788859039545059, "epoch": 0.40364880273660203, "grad_norm": 2.046875, "learning_rate": 4.000000000000001e-06, "loss": 0.0823, "mean_token_accuracy": 0.9772210866212845, "num_tokens": 18818755.0, "step": 177 }, { "entropy": 0.9626740664243698, "epoch": 0.40592930444697833, "grad_norm": 2.34375, "learning_rate": 4.022727272727273e-06, "loss": 0.0988, "mean_token_accuracy": 0.9707264006137848, "num_tokens": 18924551.0, "step": 178 }, { "entropy": 0.978600263595581, "epoch": 0.40820980615735464, "grad_norm": 2.375, "learning_rate": 4.045454545454546e-06, "loss": 0.0981, "mean_token_accuracy": 0.9696833938360214, "num_tokens": 19030541.0, "step": 179 }, { "entropy": 0.9767398834228516, "epoch": 0.4104903078677309, "grad_norm": 2.15625, "learning_rate": 4.068181818181818e-06, "loss": 0.1128, "mean_token_accuracy": 0.9714149832725525, "num_tokens": 19137334.0, "step": 180 }, { "entropy": 0.9780558049678802, "epoch": 0.4127708095781072, "grad_norm": 2.296875, "learning_rate": 4.0909090909090915e-06, "loss": 0.1045, "mean_token_accuracy": 0.9701898396015167, "num_tokens": 19243963.0, "step": 181 }, { "entropy": 0.9744202494621277, "epoch": 0.4150513112884835, "grad_norm": 2.34375, "learning_rate": 4.113636363636364e-06, "loss": 0.0847, "mean_token_accuracy": 0.9766269624233246, "num_tokens": 19350783.0, "step": 182 }, { "entropy": 0.9820019155740738, "epoch": 0.41733181299885974, "grad_norm": 2.28125, "learning_rate": 4.136363636363637e-06, "loss": 0.0988, "mean_token_accuracy": 0.9722923040390015, "num_tokens": 19457278.0, "step": 183 }, { "entropy": 0.9767319411039352, "epoch": 0.41961231470923605, "grad_norm": 2.1875, "learning_rate": 4.159090909090909e-06, "loss": 0.0742, "mean_token_accuracy": 0.9803382009267807, "num_tokens": 19563108.0, "step": 184 }, { "entropy": 0.9798580855131149, "epoch": 0.4218928164196123, "grad_norm": 2.21875, "learning_rate": 4.181818181818182e-06, "loss": 0.1044, "mean_token_accuracy": 0.9701553732156754, "num_tokens": 19669477.0, "step": 185 }, { "entropy": 0.9839195758104324, "epoch": 0.4241733181299886, "grad_norm": 2.03125, "learning_rate": 4.204545454545455e-06, "loss": 0.0973, "mean_token_accuracy": 0.9711517244577408, "num_tokens": 19776196.0, "step": 186 }, { "entropy": 0.9820647239685059, "epoch": 0.4264538198403649, "grad_norm": 2.125, "learning_rate": 4.227272727272728e-06, "loss": 0.0923, "mean_token_accuracy": 0.9750039428472519, "num_tokens": 19882698.0, "step": 187 }, { "entropy": 0.9776212275028229, "epoch": 0.42873432155074115, "grad_norm": 2.03125, "learning_rate": 4.25e-06, "loss": 0.1009, "mean_token_accuracy": 0.9712205231189728, "num_tokens": 19988392.0, "step": 188 }, { "entropy": 0.9827404767274857, "epoch": 0.43101482326111745, "grad_norm": 2.28125, "learning_rate": 4.272727272727273e-06, "loss": 0.1127, "mean_token_accuracy": 0.9736526608467102, "num_tokens": 20094707.0, "step": 189 }, { "entropy": 0.974037230014801, "epoch": 0.43329532497149376, "grad_norm": 2.0625, "learning_rate": 4.295454545454546e-06, "loss": 0.0987, "mean_token_accuracy": 0.9715169072151184, "num_tokens": 20200468.0, "step": 190 }, { "entropy": 0.9882257878780365, "epoch": 0.43557582668187, "grad_norm": 1.9296875, "learning_rate": 4.3181818181818185e-06, "loss": 0.0918, "mean_token_accuracy": 0.9734714180231094, "num_tokens": 20306806.0, "step": 191 }, { "entropy": 0.9822763651609421, "epoch": 0.4378563283922463, "grad_norm": 2.234375, "learning_rate": 4.340909090909091e-06, "loss": 0.09, "mean_token_accuracy": 0.9777934104204178, "num_tokens": 20413397.0, "step": 192 }, { "entropy": 0.9726729542016983, "epoch": 0.44013683010262256, "grad_norm": 2.0625, "learning_rate": 4.363636363636364e-06, "loss": 0.0836, "mean_token_accuracy": 0.9762649834156036, "num_tokens": 20519877.0, "step": 193 }, { "entropy": 0.9758595675230026, "epoch": 0.44241733181299886, "grad_norm": 2.046875, "learning_rate": 4.386363636363637e-06, "loss": 0.1113, "mean_token_accuracy": 0.9718074202537537, "num_tokens": 20626493.0, "step": 194 }, { "entropy": 0.9731335192918777, "epoch": 0.44469783352337516, "grad_norm": 2.078125, "learning_rate": 4.409090909090909e-06, "loss": 0.0736, "mean_token_accuracy": 0.9770399481058121, "num_tokens": 20733042.0, "step": 195 }, { "entropy": 0.9831180274486542, "epoch": 0.4469783352337514, "grad_norm": 2.140625, "learning_rate": 4.4318181818181824e-06, "loss": 0.0838, "mean_token_accuracy": 0.9770352393388748, "num_tokens": 20839287.0, "step": 196 }, { "entropy": 0.9816676378250122, "epoch": 0.4492588369441277, "grad_norm": 1.65625, "learning_rate": 4.454545454545455e-06, "loss": 0.0766, "mean_token_accuracy": 0.9765711724758148, "num_tokens": 20945153.0, "step": 197 }, { "entropy": 0.9693256616592407, "epoch": 0.45153933865450396, "grad_norm": 2.078125, "learning_rate": 4.477272727272728e-06, "loss": 0.0886, "mean_token_accuracy": 0.9764720648527145, "num_tokens": 21051541.0, "step": 198 }, { "entropy": 0.9668312817811966, "epoch": 0.45381984036488027, "grad_norm": 1.9609375, "learning_rate": 4.5e-06, "loss": 0.0922, "mean_token_accuracy": 0.9719244986772537, "num_tokens": 21157877.0, "step": 199 }, { "entropy": 0.9792518019676208, "epoch": 0.45610034207525657, "grad_norm": 1.9453125, "learning_rate": 4.522727272727273e-06, "loss": 0.0888, "mean_token_accuracy": 0.9760424494743347, "num_tokens": 21264531.0, "step": 200 }, { "entropy": 0.9802867025136948, "epoch": 0.4583808437856328, "grad_norm": 1.71875, "learning_rate": 4.5454545454545455e-06, "loss": 0.0737, "mean_token_accuracy": 0.9797925651073456, "num_tokens": 21371412.0, "step": 201 }, { "entropy": 0.9773290902376175, "epoch": 0.4606613454960091, "grad_norm": 1.9140625, "learning_rate": 4.568181818181819e-06, "loss": 0.0862, "mean_token_accuracy": 0.9757405668497086, "num_tokens": 21477982.0, "step": 202 }, { "entropy": 0.9776417016983032, "epoch": 0.4629418472063854, "grad_norm": 1.9140625, "learning_rate": 4.590909090909092e-06, "loss": 0.0903, "mean_token_accuracy": 0.9734518378973007, "num_tokens": 21584141.0, "step": 203 }, { "entropy": 0.977250337600708, "epoch": 0.4652223489167617, "grad_norm": 1.6875, "learning_rate": 4.613636363636364e-06, "loss": 0.0727, "mean_token_accuracy": 0.9801328480243683, "num_tokens": 21691061.0, "step": 204 }, { "entropy": 0.9694255292415619, "epoch": 0.467502850627138, "grad_norm": 1.890625, "learning_rate": 4.636363636363636e-06, "loss": 0.0926, "mean_token_accuracy": 0.9761814475059509, "num_tokens": 21797497.0, "step": 205 }, { "entropy": 0.9882890731096268, "epoch": 0.4697833523375142, "grad_norm": 2.078125, "learning_rate": 4.6590909090909095e-06, "loss": 0.0878, "mean_token_accuracy": 0.9752508252859116, "num_tokens": 21904402.0, "step": 206 }, { "entropy": 0.9826070964336395, "epoch": 0.47206385404789053, "grad_norm": 2.078125, "learning_rate": 4.681818181818183e-06, "loss": 0.0698, "mean_token_accuracy": 0.9805110692977905, "num_tokens": 22011500.0, "step": 207 }, { "entropy": 0.9730394929647446, "epoch": 0.47434435575826683, "grad_norm": 2.171875, "learning_rate": 4.704545454545455e-06, "loss": 0.0968, "mean_token_accuracy": 0.9740218818187714, "num_tokens": 22118183.0, "step": 208 }, { "entropy": 0.9748863130807877, "epoch": 0.4766248574686431, "grad_norm": 1.875, "learning_rate": 4.727272727272728e-06, "loss": 0.0845, "mean_token_accuracy": 0.974096029996872, "num_tokens": 22223978.0, "step": 209 }, { "entropy": 0.9780798703432083, "epoch": 0.4789053591790194, "grad_norm": 1.6796875, "learning_rate": 4.75e-06, "loss": 0.0777, "mean_token_accuracy": 0.9796034842729568, "num_tokens": 22330059.0, "step": 210 }, { "entropy": 0.9688428640365601, "epoch": 0.4811858608893957, "grad_norm": 1.640625, "learning_rate": 4.772727272727273e-06, "loss": 0.073, "mean_token_accuracy": 0.9793464988470078, "num_tokens": 22435877.0, "step": 211 }, { "entropy": 0.9865798354148865, "epoch": 0.48346636259977194, "grad_norm": 2.328125, "learning_rate": 4.795454545454546e-06, "loss": 0.086, "mean_token_accuracy": 0.9786234349012375, "num_tokens": 22542607.0, "step": 212 }, { "entropy": 0.9794099628925323, "epoch": 0.48574686431014824, "grad_norm": 1.8046875, "learning_rate": 4.818181818181819e-06, "loss": 0.0781, "mean_token_accuracy": 0.9783029109239578, "num_tokens": 22648542.0, "step": 213 }, { "entropy": 0.9772571325302124, "epoch": 0.4880273660205245, "grad_norm": 1.8203125, "learning_rate": 4.840909090909091e-06, "loss": 0.0769, "mean_token_accuracy": 0.9782924801111221, "num_tokens": 22754480.0, "step": 214 }, { "entropy": 0.9819867461919785, "epoch": 0.4903078677309008, "grad_norm": 2.25, "learning_rate": 4.863636363636364e-06, "loss": 0.0989, "mean_token_accuracy": 0.973569244146347, "num_tokens": 22861027.0, "step": 215 }, { "entropy": 0.9933525323867798, "epoch": 0.4925883694412771, "grad_norm": 1.875, "learning_rate": 4.8863636363636365e-06, "loss": 0.0796, "mean_token_accuracy": 0.9777833074331284, "num_tokens": 22967586.0, "step": 216 }, { "entropy": 0.9731242060661316, "epoch": 0.49486887115165334, "grad_norm": 1.828125, "learning_rate": 4.90909090909091e-06, "loss": 0.0731, "mean_token_accuracy": 0.9815525859594345, "num_tokens": 23073874.0, "step": 217 }, { "entropy": 0.9722823649644852, "epoch": 0.49714937286202965, "grad_norm": 2.234375, "learning_rate": 4.931818181818182e-06, "loss": 0.0916, "mean_token_accuracy": 0.9719773828983307, "num_tokens": 23180142.0, "step": 218 }, { "entropy": 0.9862520098686218, "epoch": 0.49942987457240595, "grad_norm": 1.4765625, "learning_rate": 4.954545454545455e-06, "loss": 0.0673, "mean_token_accuracy": 0.9838427156209946, "num_tokens": 23287311.0, "step": 219 }, { "entropy": 0.977488324046135, "epoch": 0.5017103762827823, "grad_norm": 1.6484375, "learning_rate": 4.977272727272728e-06, "loss": 0.0559, "mean_token_accuracy": 0.9814870804548264, "num_tokens": 23392990.0, "step": 220 }, { "epoch": 0.5017103762827823, "eval_entropy": 0.9814684393288065, "eval_loss": 0.0761854350566864, "eval_mean_token_accuracy": 0.9785121103656609, "eval_num_tokens": 23392990.0, "eval_runtime": 66.0042, "eval_samples_per_second": 127.037, "eval_steps_per_second": 3.985, "step": 220 }, { "entropy": 0.9853405505418777, "epoch": 0.5039908779931584, "grad_norm": 1.6953125, "learning_rate": 5e-06, "loss": 0.0721, "mean_token_accuracy": 0.977836549282074, "num_tokens": 23499540.0, "step": 221 }, { "entropy": 0.9821516275405884, "epoch": 0.5062713797035348, "grad_norm": 1.3828125, "learning_rate": 4.999999290524132e-06, "loss": 0.0636, "mean_token_accuracy": 0.9825811833143234, "num_tokens": 23605756.0, "step": 222 }, { "entropy": 0.9918091297149658, "epoch": 0.508551881413911, "grad_norm": 1.984375, "learning_rate": 4.999997162096932e-06, "loss": 0.0903, "mean_token_accuracy": 0.9785769879817963, "num_tokens": 23712344.0, "step": 223 }, { "entropy": 0.9905040711164474, "epoch": 0.5108323831242874, "grad_norm": 1.9140625, "learning_rate": 4.999993614719606e-06, "loss": 0.0666, "mean_token_accuracy": 0.9809742718935013, "num_tokens": 23818823.0, "step": 224 }, { "entropy": 0.991222932934761, "epoch": 0.5131128848346637, "grad_norm": 1.8046875, "learning_rate": 4.999988648394169e-06, "loss": 0.0739, "mean_token_accuracy": 0.9789907932281494, "num_tokens": 23925706.0, "step": 225 }, { "entropy": 0.9891288727521896, "epoch": 0.5153933865450399, "grad_norm": 1.7265625, "learning_rate": 4.99998226312344e-06, "loss": 0.0754, "mean_token_accuracy": 0.9797730892896652, "num_tokens": 24031653.0, "step": 226 }, { "entropy": 0.9921743869781494, "epoch": 0.5176738882554162, "grad_norm": 1.515625, "learning_rate": 4.999974458911041e-06, "loss": 0.069, "mean_token_accuracy": 0.9805087447166443, "num_tokens": 24137798.0, "step": 227 }, { "entropy": 0.9898716509342194, "epoch": 0.5199543899657925, "grad_norm": 1.8046875, "learning_rate": 4.999965235761404e-06, "loss": 0.075, "mean_token_accuracy": 0.979846715927124, "num_tokens": 24243925.0, "step": 228 }, { "entropy": 0.9929698556661606, "epoch": 0.5222348916761688, "grad_norm": 1.7578125, "learning_rate": 4.999954593679762e-06, "loss": 0.0804, "mean_token_accuracy": 0.9782237261533737, "num_tokens": 24350193.0, "step": 229 }, { "entropy": 0.99391970038414, "epoch": 0.5245153933865451, "grad_norm": 2.015625, "learning_rate": 4.999942532672157e-06, "loss": 0.0868, "mean_token_accuracy": 0.9750309139490128, "num_tokens": 24455935.0, "step": 230 }, { "entropy": 0.9833217263221741, "epoch": 0.5267958950969214, "grad_norm": 1.9375, "learning_rate": 4.999929052745434e-06, "loss": 0.071, "mean_token_accuracy": 0.9789441972970963, "num_tokens": 24562450.0, "step": 231 }, { "entropy": 0.9885305315256119, "epoch": 0.5290763968072976, "grad_norm": 2.03125, "learning_rate": 4.999914153907243e-06, "loss": 0.0819, "mean_token_accuracy": 0.9774582087993622, "num_tokens": 24668765.0, "step": 232 }, { "entropy": 0.987172544002533, "epoch": 0.5313568985176739, "grad_norm": 1.6875, "learning_rate": 4.999897836166041e-06, "loss": 0.0759, "mean_token_accuracy": 0.9792931824922562, "num_tokens": 24774691.0, "step": 233 }, { "entropy": 0.9881054311990738, "epoch": 0.5336374002280502, "grad_norm": 2.109375, "learning_rate": 4.999880099531089e-06, "loss": 0.075, "mean_token_accuracy": 0.9794323295354843, "num_tokens": 24880070.0, "step": 234 }, { "entropy": 0.9898116737604141, "epoch": 0.5359179019384265, "grad_norm": 1.9765625, "learning_rate": 4.999860944012455e-06, "loss": 0.0786, "mean_token_accuracy": 0.9749497175216675, "num_tokens": 24986339.0, "step": 235 }, { "entropy": 0.9890349805355072, "epoch": 0.5381984036488028, "grad_norm": 1.6953125, "learning_rate": 4.999840369621011e-06, "loss": 0.0815, "mean_token_accuracy": 0.9758811593055725, "num_tokens": 25092737.0, "step": 236 }, { "entropy": 0.991373211145401, "epoch": 0.540478905359179, "grad_norm": 1.8203125, "learning_rate": 4.999818376368435e-06, "loss": 0.0588, "mean_token_accuracy": 0.9859211593866348, "num_tokens": 25198733.0, "step": 237 }, { "entropy": 0.99245585501194, "epoch": 0.5427594070695553, "grad_norm": 1.6875, "learning_rate": 4.999794964267208e-06, "loss": 0.0689, "mean_token_accuracy": 0.9791001975536346, "num_tokens": 25305362.0, "step": 238 }, { "entropy": 0.983941063284874, "epoch": 0.5450399087799316, "grad_norm": 1.75, "learning_rate": 4.9997701333306215e-06, "loss": 0.0732, "mean_token_accuracy": 0.9804347306489944, "num_tokens": 25412137.0, "step": 239 }, { "entropy": 0.9987574219703674, "epoch": 0.5473204104903079, "grad_norm": 1.8359375, "learning_rate": 4.999743883572766e-06, "loss": 0.0771, "mean_token_accuracy": 0.9786854088306427, "num_tokens": 25518631.0, "step": 240 }, { "entropy": 0.9896082580089569, "epoch": 0.5496009122006842, "grad_norm": 1.734375, "learning_rate": 4.999716215008542e-06, "loss": 0.0704, "mean_token_accuracy": 0.9800395220518112, "num_tokens": 25624510.0, "step": 241 }, { "entropy": 1.0002480447292328, "epoch": 0.5518814139110604, "grad_norm": 1.96875, "learning_rate": 4.999687127653654e-06, "loss": 0.0789, "mean_token_accuracy": 0.9769577234983444, "num_tokens": 25730889.0, "step": 242 }, { "entropy": 0.9938782304525375, "epoch": 0.5541619156214367, "grad_norm": 2.046875, "learning_rate": 4.99965662152461e-06, "loss": 0.0723, "mean_token_accuracy": 0.9782837480306625, "num_tokens": 25837773.0, "step": 243 }, { "entropy": 0.9996870309114456, "epoch": 0.556442417331813, "grad_norm": 1.59375, "learning_rate": 4.999624696638725e-06, "loss": 0.0752, "mean_token_accuracy": 0.9779903739690781, "num_tokens": 25944099.0, "step": 244 }, { "entropy": 1.0009443163871765, "epoch": 0.5587229190421893, "grad_norm": 1.46875, "learning_rate": 4.999591353014119e-06, "loss": 0.0572, "mean_token_accuracy": 0.9828549772500992, "num_tokens": 26050303.0, "step": 245 }, { "entropy": 1.0002779513597488, "epoch": 0.5610034207525656, "grad_norm": 1.9921875, "learning_rate": 4.999556590669718e-06, "loss": 0.0768, "mean_token_accuracy": 0.9791773706674576, "num_tokens": 26157011.0, "step": 246 }, { "entropy": 0.9967940300703049, "epoch": 0.5632839224629419, "grad_norm": 1.625, "learning_rate": 4.999520409625253e-06, "loss": 0.0823, "mean_token_accuracy": 0.9767479449510574, "num_tokens": 26263254.0, "step": 247 }, { "entropy": 0.9997926950454712, "epoch": 0.5655644241733181, "grad_norm": 1.5234375, "learning_rate": 4.999482809901257e-06, "loss": 0.0735, "mean_token_accuracy": 0.9782445132732391, "num_tokens": 26369257.0, "step": 248 }, { "entropy": 1.000330924987793, "epoch": 0.5678449258836944, "grad_norm": 1.3671875, "learning_rate": 4.999443791519074e-06, "loss": 0.07, "mean_token_accuracy": 0.9820457696914673, "num_tokens": 26475485.0, "step": 249 }, { "entropy": 1.005497694015503, "epoch": 0.5701254275940707, "grad_norm": 1.8203125, "learning_rate": 4.999403354500847e-06, "loss": 0.0578, "mean_token_accuracy": 0.9825426489114761, "num_tokens": 26581749.0, "step": 250 }, { "entropy": 0.9956629276275635, "epoch": 0.572405929304447, "grad_norm": 1.7109375, "learning_rate": 4.99936149886953e-06, "loss": 0.0722, "mean_token_accuracy": 0.9816341400146484, "num_tokens": 26688184.0, "step": 251 }, { "entropy": 1.0046759843826294, "epoch": 0.5746864310148233, "grad_norm": 1.7890625, "learning_rate": 4.999318224648878e-06, "loss": 0.0719, "mean_token_accuracy": 0.9788413643836975, "num_tokens": 26794665.0, "step": 252 }, { "entropy": 1.0035061836242676, "epoch": 0.5769669327251995, "grad_norm": 1.578125, "learning_rate": 4.999273531863453e-06, "loss": 0.0637, "mean_token_accuracy": 0.9816992729902267, "num_tokens": 26901269.0, "step": 253 }, { "entropy": 0.9954895526170731, "epoch": 0.5792474344355758, "grad_norm": 2.0, "learning_rate": 4.999227420538622e-06, "loss": 0.0864, "mean_token_accuracy": 0.9759148508310318, "num_tokens": 27007830.0, "step": 254 }, { "entropy": 0.9954589605331421, "epoch": 0.5815279361459521, "grad_norm": 1.59375, "learning_rate": 4.999179890700555e-06, "loss": 0.0569, "mean_token_accuracy": 0.9835962057113647, "num_tokens": 27113665.0, "step": 255 }, { "entropy": 0.9942858219146729, "epoch": 0.5838084378563284, "grad_norm": 1.5078125, "learning_rate": 4.999130942376232e-06, "loss": 0.0498, "mean_token_accuracy": 0.9844542145729065, "num_tokens": 27219722.0, "step": 256 }, { "entropy": 0.9946591109037399, "epoch": 0.5860889395667047, "grad_norm": 1.46875, "learning_rate": 4.999080575593433e-06, "loss": 0.0556, "mean_token_accuracy": 0.9846567213535309, "num_tokens": 27325546.0, "step": 257 }, { "entropy": 0.9959817230701447, "epoch": 0.5883694412770809, "grad_norm": 2.0, "learning_rate": 4.999028790380746e-06, "loss": 0.0756, "mean_token_accuracy": 0.978645458817482, "num_tokens": 27431662.0, "step": 258 }, { "entropy": 0.991923600435257, "epoch": 0.5906499429874572, "grad_norm": 1.578125, "learning_rate": 4.9989755867675635e-06, "loss": 0.0659, "mean_token_accuracy": 0.980969712138176, "num_tokens": 27537969.0, "step": 259 }, { "entropy": 0.9943036586046219, "epoch": 0.5929304446978335, "grad_norm": 1.6953125, "learning_rate": 4.998920964784082e-06, "loss": 0.0715, "mean_token_accuracy": 0.9778747856616974, "num_tokens": 27644190.0, "step": 260 }, { "entropy": 0.9970516115427017, "epoch": 0.5952109464082098, "grad_norm": 1.40625, "learning_rate": 4.998864924461305e-06, "loss": 0.0554, "mean_token_accuracy": 0.9850499331951141, "num_tokens": 27750139.0, "step": 261 }, { "entropy": 0.990564614534378, "epoch": 0.5974914481185861, "grad_norm": 1.84375, "learning_rate": 4.998807465831039e-06, "loss": 0.074, "mean_token_accuracy": 0.9778688848018646, "num_tokens": 27856444.0, "step": 262 }, { "entropy": 0.9961322247982025, "epoch": 0.5997719498289624, "grad_norm": 1.703125, "learning_rate": 4.998748588925897e-06, "loss": 0.0684, "mean_token_accuracy": 0.980366975069046, "num_tokens": 27962658.0, "step": 263 }, { "entropy": 0.9895317852497101, "epoch": 0.6020524515393386, "grad_norm": 1.5859375, "learning_rate": 4.998688293779297e-06, "loss": 0.0618, "mean_token_accuracy": 0.9835464954376221, "num_tokens": 28069315.0, "step": 264 }, { "entropy": 0.9910869151353836, "epoch": 0.6043329532497149, "grad_norm": 1.609375, "learning_rate": 4.998626580425459e-06, "loss": 0.0681, "mean_token_accuracy": 0.9786537140607834, "num_tokens": 28175626.0, "step": 265 }, { "entropy": 0.9833101481199265, "epoch": 0.6066134549600912, "grad_norm": 1.9375, "learning_rate": 4.998563448899413e-06, "loss": 0.0794, "mean_token_accuracy": 0.9792327880859375, "num_tokens": 28282141.0, "step": 266 }, { "entropy": 0.988568127155304, "epoch": 0.6088939566704675, "grad_norm": 2.078125, "learning_rate": 4.998498899236989e-06, "loss": 0.0706, "mean_token_accuracy": 0.9813559949398041, "num_tokens": 28388682.0, "step": 267 }, { "entropy": 0.984402135014534, "epoch": 0.6111744583808438, "grad_norm": 1.7578125, "learning_rate": 4.998432931474825e-06, "loss": 0.0687, "mean_token_accuracy": 0.9804593473672867, "num_tokens": 28495044.0, "step": 268 }, { "entropy": 0.9896164536476135, "epoch": 0.61345496009122, "grad_norm": 1.7578125, "learning_rate": 4.998365545650365e-06, "loss": 0.0651, "mean_token_accuracy": 0.9824084341526031, "num_tokens": 28602202.0, "step": 269 }, { "entropy": 0.9873324930667877, "epoch": 0.6157354618015963, "grad_norm": 1.8828125, "learning_rate": 4.998296741801852e-06, "loss": 0.0758, "mean_token_accuracy": 0.9782459735870361, "num_tokens": 28708021.0, "step": 270 }, { "entropy": 0.984616830945015, "epoch": 0.6180159635119726, "grad_norm": 1.3984375, "learning_rate": 4.998226519968341e-06, "loss": 0.0529, "mean_token_accuracy": 0.985872358083725, "num_tokens": 28814138.0, "step": 271 }, { "entropy": 0.9886069595813751, "epoch": 0.6202964652223489, "grad_norm": 2.015625, "learning_rate": 4.998154880189688e-06, "loss": 0.0675, "mean_token_accuracy": 0.980413481593132, "num_tokens": 28920443.0, "step": 272 }, { "entropy": 0.9872033447027206, "epoch": 0.6225769669327252, "grad_norm": 1.4453125, "learning_rate": 4.998081822506552e-06, "loss": 0.0555, "mean_token_accuracy": 0.9839828610420227, "num_tokens": 29026992.0, "step": 273 }, { "entropy": 0.9949117600917816, "epoch": 0.6248574686431014, "grad_norm": 1.5859375, "learning_rate": 4.998007346960402e-06, "loss": 0.0629, "mean_token_accuracy": 0.9827041178941727, "num_tokens": 29133766.0, "step": 274 }, { "entropy": 0.9896757751703262, "epoch": 0.6271379703534777, "grad_norm": 1.609375, "learning_rate": 4.997931453593507e-06, "loss": 0.0737, "mean_token_accuracy": 0.978770911693573, "num_tokens": 29240250.0, "step": 275 }, { "entropy": 0.9852808862924576, "epoch": 0.629418472063854, "grad_norm": 1.5078125, "learning_rate": 4.997854142448944e-06, "loss": 0.056, "mean_token_accuracy": 0.9847185164690018, "num_tokens": 29346561.0, "step": 276 }, { "entropy": 0.9815623313188553, "epoch": 0.6316989737742303, "grad_norm": 1.484375, "learning_rate": 4.997775413570593e-06, "loss": 0.0646, "mean_token_accuracy": 0.9823594987392426, "num_tokens": 29453315.0, "step": 277 }, { "entropy": 0.9884564131498337, "epoch": 0.6339794754846066, "grad_norm": 1.25, "learning_rate": 4.997695267003139e-06, "loss": 0.0478, "mean_token_accuracy": 0.9863954186439514, "num_tokens": 29559422.0, "step": 278 }, { "entropy": 0.9902565777301788, "epoch": 0.636259977194983, "grad_norm": 1.5078125, "learning_rate": 4.99761370279207e-06, "loss": 0.0681, "mean_token_accuracy": 0.9808500409126282, "num_tokens": 29664998.0, "step": 279 }, { "entropy": 0.9911557883024216, "epoch": 0.6385404789053591, "grad_norm": 1.6796875, "learning_rate": 4.997530720983682e-06, "loss": 0.0769, "mean_token_accuracy": 0.979027509689331, "num_tokens": 29771636.0, "step": 280 }, { "entropy": 0.9903472363948822, "epoch": 0.6408209806157354, "grad_norm": 1.4921875, "learning_rate": 4.9974463216250735e-06, "loss": 0.0688, "mean_token_accuracy": 0.9808298200368881, "num_tokens": 29877832.0, "step": 281 }, { "entropy": 0.9894820004701614, "epoch": 0.6431014823261118, "grad_norm": 2.0625, "learning_rate": 4.997360504764148e-06, "loss": 0.066, "mean_token_accuracy": 0.981927290558815, "num_tokens": 29983895.0, "step": 282 }, { "entropy": 0.9881015419960022, "epoch": 0.645381984036488, "grad_norm": 1.421875, "learning_rate": 4.997273270449614e-06, "loss": 0.0593, "mean_token_accuracy": 0.9821851253509521, "num_tokens": 30090298.0, "step": 283 }, { "entropy": 0.9897582530975342, "epoch": 0.6476624857468644, "grad_norm": 1.640625, "learning_rate": 4.997184618730983e-06, "loss": 0.0679, "mean_token_accuracy": 0.9793965071439743, "num_tokens": 30197084.0, "step": 284 }, { "entropy": 0.988196536898613, "epoch": 0.6499429874572406, "grad_norm": 1.5390625, "learning_rate": 4.997094549658572e-06, "loss": 0.076, "mean_token_accuracy": 0.9784490913152695, "num_tokens": 30304333.0, "step": 285 }, { "entropy": 0.9899246990680695, "epoch": 0.6522234891676169, "grad_norm": 1.7734375, "learning_rate": 4.997003063283503e-06, "loss": 0.0882, "mean_token_accuracy": 0.9760512113571167, "num_tokens": 30410293.0, "step": 286 }, { "entropy": 0.9854319989681244, "epoch": 0.6545039908779932, "grad_norm": 1.578125, "learning_rate": 4.996910159657703e-06, "loss": 0.0675, "mean_token_accuracy": 0.9807902574539185, "num_tokens": 30516977.0, "step": 287 }, { "entropy": 0.9868714064359665, "epoch": 0.6567844925883695, "grad_norm": 1.7890625, "learning_rate": 4.996815838833899e-06, "loss": 0.082, "mean_token_accuracy": 0.9774934649467468, "num_tokens": 30623959.0, "step": 288 }, { "entropy": 0.9822472780942917, "epoch": 0.6590649942987458, "grad_norm": 1.625, "learning_rate": 4.99672010086563e-06, "loss": 0.0654, "mean_token_accuracy": 0.9798430353403091, "num_tokens": 30730980.0, "step": 289 }, { "entropy": 0.9833397418260574, "epoch": 0.661345496009122, "grad_norm": 1.671875, "learning_rate": 4.996622945807231e-06, "loss": 0.0592, "mean_token_accuracy": 0.9816693216562271, "num_tokens": 30837536.0, "step": 290 }, { "entropy": 0.9952177107334137, "epoch": 0.6636259977194983, "grad_norm": 1.7421875, "learning_rate": 4.996524373713848e-06, "loss": 0.0774, "mean_token_accuracy": 0.9767493307590485, "num_tokens": 30944814.0, "step": 291 }, { "entropy": 0.9855045825242996, "epoch": 0.6659064994298746, "grad_norm": 1.4140625, "learning_rate": 4.996424384641428e-06, "loss": 0.0594, "mean_token_accuracy": 0.9810649454593658, "num_tokens": 31050954.0, "step": 292 }, { "entropy": 0.9883499592542648, "epoch": 0.6681870011402509, "grad_norm": 1.3359375, "learning_rate": 4.996322978646722e-06, "loss": 0.0572, "mean_token_accuracy": 0.9821442663669586, "num_tokens": 31157246.0, "step": 293 }, { "entropy": 0.9915643930435181, "epoch": 0.6704675028506272, "grad_norm": 1.8984375, "learning_rate": 4.996220155787287e-06, "loss": 0.0638, "mean_token_accuracy": 0.9854893833398819, "num_tokens": 31263598.0, "step": 294 }, { "entropy": 0.9881881773471832, "epoch": 0.6727480045610034, "grad_norm": 1.9375, "learning_rate": 4.996115916121483e-06, "loss": 0.076, "mean_token_accuracy": 0.9801952093839645, "num_tokens": 31370113.0, "step": 295 }, { "entropy": 0.9826924949884415, "epoch": 0.6750285062713797, "grad_norm": 1.96875, "learning_rate": 4.996010259708475e-06, "loss": 0.0822, "mean_token_accuracy": 0.9789474010467529, "num_tokens": 31476349.0, "step": 296 }, { "entropy": 0.9859483242034912, "epoch": 0.677309007981756, "grad_norm": 1.625, "learning_rate": 4.99590318660823e-06, "loss": 0.0595, "mean_token_accuracy": 0.984224483370781, "num_tokens": 31582972.0, "step": 297 }, { "entropy": 0.9819737374782562, "epoch": 0.6795895096921323, "grad_norm": 1.515625, "learning_rate": 4.9957946968815215e-06, "loss": 0.0692, "mean_token_accuracy": 0.9812329709529877, "num_tokens": 31689175.0, "step": 298 }, { "entropy": 0.990312322974205, "epoch": 0.6818700114025086, "grad_norm": 1.6875, "learning_rate": 4.995684790589927e-06, "loss": 0.069, "mean_token_accuracy": 0.9808000177145004, "num_tokens": 31795467.0, "step": 299 }, { "entropy": 0.9858143478631973, "epoch": 0.6841505131128849, "grad_norm": 1.953125, "learning_rate": 4.995573467795825e-06, "loss": 0.0751, "mean_token_accuracy": 0.9785460084676743, "num_tokens": 31902128.0, "step": 300 }, { "entropy": 0.992887869477272, "epoch": 0.6864310148232611, "grad_norm": 1.453125, "learning_rate": 4.995460728562403e-06, "loss": 0.0612, "mean_token_accuracy": 0.9832176268100739, "num_tokens": 32008421.0, "step": 301 }, { "entropy": 0.9935334175825119, "epoch": 0.6887115165336374, "grad_norm": 1.4453125, "learning_rate": 4.9953465729536475e-06, "loss": 0.0542, "mean_token_accuracy": 0.9855736494064331, "num_tokens": 32114826.0, "step": 302 }, { "entropy": 0.9840753525495529, "epoch": 0.6909920182440137, "grad_norm": 1.3984375, "learning_rate": 4.995231001034352e-06, "loss": 0.0623, "mean_token_accuracy": 0.9786565601825714, "num_tokens": 32220645.0, "step": 303 }, { "entropy": 0.9954367429018021, "epoch": 0.69327251995439, "grad_norm": 1.5546875, "learning_rate": 4.995114012870112e-06, "loss": 0.0523, "mean_token_accuracy": 0.9822810888290405, "num_tokens": 32326964.0, "step": 304 }, { "entropy": 0.987249344587326, "epoch": 0.6955530216647663, "grad_norm": 1.5078125, "learning_rate": 4.99499560852733e-06, "loss": 0.0742, "mean_token_accuracy": 0.9806233644485474, "num_tokens": 32433435.0, "step": 305 }, { "entropy": 0.9856873005628586, "epoch": 0.6978335233751425, "grad_norm": 1.546875, "learning_rate": 4.994875788073207e-06, "loss": 0.0668, "mean_token_accuracy": 0.9828383773565292, "num_tokens": 32540472.0, "step": 306 }, { "entropy": 0.9927677065134048, "epoch": 0.7001140250855188, "grad_norm": 1.671875, "learning_rate": 4.994754551575752e-06, "loss": 0.0753, "mean_token_accuracy": 0.9787809401750565, "num_tokens": 32646312.0, "step": 307 }, { "entropy": 0.9906519949436188, "epoch": 0.7023945267958951, "grad_norm": 1.296875, "learning_rate": 4.994631899103777e-06, "loss": 0.0533, "mean_token_accuracy": 0.9853856414556503, "num_tokens": 32752699.0, "step": 308 }, { "entropy": 0.9899300187826157, "epoch": 0.7046750285062714, "grad_norm": 1.46875, "learning_rate": 4.9945078307268974e-06, "loss": 0.0657, "mean_token_accuracy": 0.9820606559514999, "num_tokens": 32858614.0, "step": 309 }, { "entropy": 0.9925295263528824, "epoch": 0.7069555302166477, "grad_norm": 1.3984375, "learning_rate": 4.994382346515531e-06, "loss": 0.0652, "mean_token_accuracy": 0.9830323904752731, "num_tokens": 32965404.0, "step": 310 }, { "entropy": 0.9907500743865967, "epoch": 0.7092360319270239, "grad_norm": 1.453125, "learning_rate": 4.9942554465409e-06, "loss": 0.0487, "mean_token_accuracy": 0.9847237020730972, "num_tokens": 33071054.0, "step": 311 }, { "entropy": 0.9853813350200653, "epoch": 0.7115165336374002, "grad_norm": 1.484375, "learning_rate": 4.994127130875032e-06, "loss": 0.0549, "mean_token_accuracy": 0.9849971234798431, "num_tokens": 33177443.0, "step": 312 }, { "entropy": 0.9887946546077728, "epoch": 0.7137970353477765, "grad_norm": 1.7421875, "learning_rate": 4.993997399590755e-06, "loss": 0.0636, "mean_token_accuracy": 0.9814212471246719, "num_tokens": 33283662.0, "step": 313 }, { "entropy": 0.9988455921411514, "epoch": 0.7160775370581528, "grad_norm": 1.9765625, "learning_rate": 4.993866252761702e-06, "loss": 0.0598, "mean_token_accuracy": 0.9842907935380936, "num_tokens": 33389896.0, "step": 314 }, { "entropy": 0.9922047108411789, "epoch": 0.7183580387685291, "grad_norm": 1.609375, "learning_rate": 4.993733690462311e-06, "loss": 0.0551, "mean_token_accuracy": 0.98334601521492, "num_tokens": 33496214.0, "step": 315 }, { "entropy": 0.9899660050868988, "epoch": 0.7206385404789054, "grad_norm": 1.3515625, "learning_rate": 4.99359971276782e-06, "loss": 0.0498, "mean_token_accuracy": 0.9843093305826187, "num_tokens": 33602503.0, "step": 316 }, { "entropy": 0.9869642108678818, "epoch": 0.7229190421892816, "grad_norm": 1.265625, "learning_rate": 4.993464319754273e-06, "loss": 0.0561, "mean_token_accuracy": 0.9819488823413849, "num_tokens": 33708508.0, "step": 317 }, { "entropy": 0.9906782358884811, "epoch": 0.7251995438996579, "grad_norm": 1.53125, "learning_rate": 4.993327511498516e-06, "loss": 0.0591, "mean_token_accuracy": 0.9827994257211685, "num_tokens": 33815006.0, "step": 318 }, { "entropy": 0.984364703297615, "epoch": 0.7274800456100342, "grad_norm": 1.578125, "learning_rate": 4.9931892880782e-06, "loss": 0.0612, "mean_token_accuracy": 0.984150841832161, "num_tokens": 33921273.0, "step": 319 }, { "entropy": 0.9854198694229126, "epoch": 0.7297605473204105, "grad_norm": 1.6640625, "learning_rate": 4.993049649571775e-06, "loss": 0.0586, "mean_token_accuracy": 0.9820143580436707, "num_tokens": 34027498.0, "step": 320 }, { "entropy": 0.9824755042791367, "epoch": 0.7320410490307868, "grad_norm": 1.703125, "learning_rate": 4.992908596058501e-06, "loss": 0.0628, "mean_token_accuracy": 0.9822209030389786, "num_tokens": 34134095.0, "step": 321 }, { "entropy": 0.9831451922655106, "epoch": 0.734321550741163, "grad_norm": 1.421875, "learning_rate": 4.992766127618434e-06, "loss": 0.0617, "mean_token_accuracy": 0.9824348092079163, "num_tokens": 34240238.0, "step": 322 }, { "entropy": 0.9829341471195221, "epoch": 0.7366020524515393, "grad_norm": 1.6171875, "learning_rate": 4.992622244332439e-06, "loss": 0.0581, "mean_token_accuracy": 0.9842199236154556, "num_tokens": 34346108.0, "step": 323 }, { "entropy": 0.9864680171012878, "epoch": 0.7388825541619156, "grad_norm": 1.5703125, "learning_rate": 4.992476946282179e-06, "loss": 0.0578, "mean_token_accuracy": 0.9837837368249893, "num_tokens": 34452621.0, "step": 324 }, { "entropy": 0.9825157523155212, "epoch": 0.7411630558722919, "grad_norm": 1.9921875, "learning_rate": 4.992330233550124e-06, "loss": 0.0608, "mean_token_accuracy": 0.9795385301113129, "num_tokens": 34559001.0, "step": 325 }, { "entropy": 0.9834232479333878, "epoch": 0.7434435575826682, "grad_norm": 1.6953125, "learning_rate": 4.9921821062195445e-06, "loss": 0.0631, "mean_token_accuracy": 0.9811839759349823, "num_tokens": 34664752.0, "step": 326 }, { "entropy": 0.97878198325634, "epoch": 0.7457240592930444, "grad_norm": 1.375, "learning_rate": 4.9920325643745145e-06, "loss": 0.054, "mean_token_accuracy": 0.9846237748861313, "num_tokens": 34771066.0, "step": 327 }, { "entropy": 0.9821022599935532, "epoch": 0.7480045610034207, "grad_norm": 1.7421875, "learning_rate": 4.991881608099912e-06, "loss": 0.0643, "mean_token_accuracy": 0.9793728888034821, "num_tokens": 34877748.0, "step": 328 }, { "entropy": 0.9794961363077164, "epoch": 0.750285062713797, "grad_norm": 1.4296875, "learning_rate": 4.991729237481417e-06, "loss": 0.0556, "mean_token_accuracy": 0.9851173311471939, "num_tokens": 34983959.0, "step": 329 }, { "entropy": 0.9823174625635147, "epoch": 0.7525655644241733, "grad_norm": 1.8203125, "learning_rate": 4.991575452605511e-06, "loss": 0.0604, "mean_token_accuracy": 0.9824616312980652, "num_tokens": 35090408.0, "step": 330 }, { "entropy": 0.9894793182611465, "epoch": 0.7548460661345496, "grad_norm": 1.4140625, "learning_rate": 4.9914202535594795e-06, "loss": 0.0606, "mean_token_accuracy": 0.9813710302114487, "num_tokens": 35196494.0, "step": 331 }, { "entropy": 0.9870564639568329, "epoch": 0.7571265678449259, "grad_norm": 1.4921875, "learning_rate": 4.991263640431411e-06, "loss": 0.0671, "mean_token_accuracy": 0.9811055809259415, "num_tokens": 35302997.0, "step": 332 }, { "entropy": 0.9843275099992752, "epoch": 0.7594070695553021, "grad_norm": 1.7109375, "learning_rate": 4.9911056133101965e-06, "loss": 0.0662, "mean_token_accuracy": 0.9805188328027725, "num_tokens": 35409033.0, "step": 333 }, { "entropy": 0.9955145567655563, "epoch": 0.7616875712656784, "grad_norm": 1.4140625, "learning_rate": 4.990946172285528e-06, "loss": 0.0586, "mean_token_accuracy": 0.9837165921926498, "num_tokens": 35515115.0, "step": 334 }, { "entropy": 0.9938471615314484, "epoch": 0.7639680729760547, "grad_norm": 1.2265625, "learning_rate": 4.990785317447901e-06, "loss": 0.0513, "mean_token_accuracy": 0.9866195619106293, "num_tokens": 35621562.0, "step": 335 }, { "entropy": 0.9911567866802216, "epoch": 0.766248574686431, "grad_norm": 1.4765625, "learning_rate": 4.990623048888615e-06, "loss": 0.0712, "mean_token_accuracy": 0.9799536466598511, "num_tokens": 35727567.0, "step": 336 }, { "entropy": 0.9883733689785004, "epoch": 0.7685290763968073, "grad_norm": 1.8125, "learning_rate": 4.9904593666997704e-06, "loss": 0.0824, "mean_token_accuracy": 0.9781724065542221, "num_tokens": 35833619.0, "step": 337 }, { "entropy": 0.9930688291788101, "epoch": 0.7708095781071835, "grad_norm": 1.6328125, "learning_rate": 4.990294270974268e-06, "loss": 0.061, "mean_token_accuracy": 0.9832266718149185, "num_tokens": 35940008.0, "step": 338 }, { "entropy": 0.9947724640369415, "epoch": 0.7730900798175598, "grad_norm": 1.578125, "learning_rate": 4.990127761805816e-06, "loss": 0.0563, "mean_token_accuracy": 0.9866450279951096, "num_tokens": 36045543.0, "step": 339 }, { "entropy": 0.997623085975647, "epoch": 0.7753705815279361, "grad_norm": 1.484375, "learning_rate": 4.989959839288919e-06, "loss": 0.0566, "mean_token_accuracy": 0.9805685430765152, "num_tokens": 36152571.0, "step": 340 }, { "entropy": 0.9901531636714935, "epoch": 0.7776510832383124, "grad_norm": 1.46875, "learning_rate": 4.989790503518888e-06, "loss": 0.0649, "mean_token_accuracy": 0.9812293499708176, "num_tokens": 36258854.0, "step": 341 }, { "entropy": 0.9993949085474014, "epoch": 0.7799315849486887, "grad_norm": 1.3125, "learning_rate": 4.9896197545918345e-06, "loss": 0.063, "mean_token_accuracy": 0.9800555258989334, "num_tokens": 36365554.0, "step": 342 }, { "entropy": 0.9981313943862915, "epoch": 0.7822120866590649, "grad_norm": 1.3671875, "learning_rate": 4.989447592604673e-06, "loss": 0.0572, "mean_token_accuracy": 0.9835271686315536, "num_tokens": 36472371.0, "step": 343 }, { "entropy": 0.9955075234174728, "epoch": 0.7844925883694412, "grad_norm": 1.7109375, "learning_rate": 4.989274017655117e-06, "loss": 0.0503, "mean_token_accuracy": 0.9840450584888458, "num_tokens": 36578737.0, "step": 344 }, { "entropy": 0.9929013103246689, "epoch": 0.7867730900798175, "grad_norm": 1.3828125, "learning_rate": 4.989099029841687e-06, "loss": 0.0559, "mean_token_accuracy": 0.9847136586904526, "num_tokens": 36684595.0, "step": 345 }, { "entropy": 0.9863213151693344, "epoch": 0.7890535917901939, "grad_norm": 1.53125, "learning_rate": 4.988922629263701e-06, "loss": 0.0655, "mean_token_accuracy": 0.9808591157197952, "num_tokens": 36790571.0, "step": 346 }, { "entropy": 0.9935362339019775, "epoch": 0.7913340935005702, "grad_norm": 1.59375, "learning_rate": 4.988744816021283e-06, "loss": 0.0764, "mean_token_accuracy": 0.9813351631164551, "num_tokens": 36897420.0, "step": 347 }, { "entropy": 0.9953356832265854, "epoch": 0.7936145952109465, "grad_norm": 1.84375, "learning_rate": 4.988565590215352e-06, "loss": 0.0635, "mean_token_accuracy": 0.9834104031324387, "num_tokens": 37003675.0, "step": 348 }, { "entropy": 0.9887809455394745, "epoch": 0.7958950969213227, "grad_norm": 1.4765625, "learning_rate": 4.9883849519476364e-06, "loss": 0.0531, "mean_token_accuracy": 0.9847796410322189, "num_tokens": 37110318.0, "step": 349 }, { "entropy": 0.9980737566947937, "epoch": 0.798175598631699, "grad_norm": 1.328125, "learning_rate": 4.988202901320663e-06, "loss": 0.0586, "mean_token_accuracy": 0.9852436035871506, "num_tokens": 37216892.0, "step": 350 }, { "entropy": 0.987604945898056, "epoch": 0.8004561003420753, "grad_norm": 1.6328125, "learning_rate": 4.988019438437759e-06, "loss": 0.0537, "mean_token_accuracy": 0.9847637712955475, "num_tokens": 37322734.0, "step": 351 }, { "entropy": 0.9908708184957504, "epoch": 0.8027366020524516, "grad_norm": 1.4140625, "learning_rate": 4.987834563403055e-06, "loss": 0.0518, "mean_token_accuracy": 0.9842967987060547, "num_tokens": 37429778.0, "step": 352 }, { "entropy": 0.9979000985622406, "epoch": 0.8050171037628279, "grad_norm": 1.6953125, "learning_rate": 4.987648276321482e-06, "loss": 0.0613, "mean_token_accuracy": 0.9825383871793747, "num_tokens": 37535540.0, "step": 353 }, { "entropy": 0.991772934794426, "epoch": 0.8072976054732041, "grad_norm": 1.6796875, "learning_rate": 4.987460577298774e-06, "loss": 0.054, "mean_token_accuracy": 0.9818495213985443, "num_tokens": 37642142.0, "step": 354 }, { "entropy": 0.992615595459938, "epoch": 0.8095781071835804, "grad_norm": 1.5234375, "learning_rate": 4.9872714664414635e-06, "loss": 0.06, "mean_token_accuracy": 0.9823702573776245, "num_tokens": 37747643.0, "step": 355 }, { "entropy": 0.9873270094394684, "epoch": 0.8118586088939567, "grad_norm": 1.7109375, "learning_rate": 4.987080943856887e-06, "loss": 0.0653, "mean_token_accuracy": 0.9821462035179138, "num_tokens": 37853451.0, "step": 356 }, { "entropy": 0.9961950331926346, "epoch": 0.814139110604333, "grad_norm": 1.7109375, "learning_rate": 4.986889009653183e-06, "loss": 0.063, "mean_token_accuracy": 0.9811909645795822, "num_tokens": 37960196.0, "step": 357 }, { "entropy": 0.9889590442180634, "epoch": 0.8164196123147093, "grad_norm": 1.703125, "learning_rate": 4.986695663939288e-06, "loss": 0.0601, "mean_token_accuracy": 0.9819139838218689, "num_tokens": 38066574.0, "step": 358 }, { "entropy": 0.9976160824298859, "epoch": 0.8187001140250855, "grad_norm": 1.40625, "learning_rate": 4.986500906824942e-06, "loss": 0.0514, "mean_token_accuracy": 0.9866797626018524, "num_tokens": 38172665.0, "step": 359 }, { "entropy": 0.9937845170497894, "epoch": 0.8209806157354618, "grad_norm": 1.4375, "learning_rate": 4.986304738420684e-06, "loss": 0.0488, "mean_token_accuracy": 0.985192134976387, "num_tokens": 38278975.0, "step": 360 }, { "entropy": 0.9914178550243378, "epoch": 0.8232611174458381, "grad_norm": 1.640625, "learning_rate": 4.9861071588378565e-06, "loss": 0.0527, "mean_token_accuracy": 0.985836535692215, "num_tokens": 38385060.0, "step": 361 }, { "entropy": 0.9859942942857742, "epoch": 0.8255416191562144, "grad_norm": 1.59375, "learning_rate": 4.985908168188602e-06, "loss": 0.0555, "mean_token_accuracy": 0.9858461618423462, "num_tokens": 38491127.0, "step": 362 }, { "entropy": 0.9915935397148132, "epoch": 0.8278221208665907, "grad_norm": 1.4765625, "learning_rate": 4.985707766585865e-06, "loss": 0.0572, "mean_token_accuracy": 0.9834768027067184, "num_tokens": 38597105.0, "step": 363 }, { "entropy": 0.9895428717136383, "epoch": 0.830102622576967, "grad_norm": 1.5390625, "learning_rate": 4.985505954143387e-06, "loss": 0.0567, "mean_token_accuracy": 0.9850101172924042, "num_tokens": 38703456.0, "step": 364 }, { "entropy": 0.9922546595335007, "epoch": 0.8323831242873432, "grad_norm": 1.2265625, "learning_rate": 4.985302730975713e-06, "loss": 0.0524, "mean_token_accuracy": 0.986375093460083, "num_tokens": 38809851.0, "step": 365 }, { "entropy": 0.9947866797447205, "epoch": 0.8346636259977195, "grad_norm": 1.7109375, "learning_rate": 4.9850980971981914e-06, "loss": 0.0613, "mean_token_accuracy": 0.9821257442235947, "num_tokens": 38915963.0, "step": 366 }, { "entropy": 0.9925936162471771, "epoch": 0.8369441277080958, "grad_norm": 1.578125, "learning_rate": 4.984892052926965e-06, "loss": 0.0628, "mean_token_accuracy": 0.9799981266260147, "num_tokens": 39022164.0, "step": 367 }, { "entropy": 0.9900194704532623, "epoch": 0.8392246294184721, "grad_norm": 1.1796875, "learning_rate": 4.984684598278982e-06, "loss": 0.0449, "mean_token_accuracy": 0.9867079854011536, "num_tokens": 39127992.0, "step": 368 }, { "entropy": 0.9945370107889175, "epoch": 0.8415051311288484, "grad_norm": 2.15625, "learning_rate": 4.984475733371991e-06, "loss": 0.0743, "mean_token_accuracy": 0.9772392511367798, "num_tokens": 39234564.0, "step": 369 }, { "entropy": 0.9922092258930206, "epoch": 0.8437856328392246, "grad_norm": 1.6171875, "learning_rate": 4.984265458324538e-06, "loss": 0.0638, "mean_token_accuracy": 0.9809235036373138, "num_tokens": 39340483.0, "step": 370 }, { "entropy": 0.9936464130878448, "epoch": 0.8460661345496009, "grad_norm": 1.46875, "learning_rate": 4.984053773255971e-06, "loss": 0.0509, "mean_token_accuracy": 0.9857594221830368, "num_tokens": 39447200.0, "step": 371 }, { "entropy": 0.997045174241066, "epoch": 0.8483466362599772, "grad_norm": 1.4921875, "learning_rate": 4.9838406782864394e-06, "loss": 0.0504, "mean_token_accuracy": 0.9833871126174927, "num_tokens": 39553617.0, "step": 372 }, { "entropy": 0.9940406233072281, "epoch": 0.8506271379703535, "grad_norm": 1.359375, "learning_rate": 4.983626173536891e-06, "loss": 0.0559, "mean_token_accuracy": 0.9846807271242142, "num_tokens": 39659932.0, "step": 373 }, { "entropy": 1.0006090253591537, "epoch": 0.8529076396807298, "grad_norm": 1.7578125, "learning_rate": 4.983410259129075e-06, "loss": 0.0624, "mean_token_accuracy": 0.9830401539802551, "num_tokens": 39766179.0, "step": 374 }, { "entropy": 0.9935064017772675, "epoch": 0.855188141391106, "grad_norm": 1.515625, "learning_rate": 4.983192935185539e-06, "loss": 0.056, "mean_token_accuracy": 0.9846747666597366, "num_tokens": 39872108.0, "step": 375 }, { "entropy": 0.9951618611812592, "epoch": 0.8574686431014823, "grad_norm": 1.5625, "learning_rate": 4.9829742018296335e-06, "loss": 0.0649, "mean_token_accuracy": 0.9802599400281906, "num_tokens": 39978438.0, "step": 376 }, { "entropy": 0.9980364143848419, "epoch": 0.8597491448118586, "grad_norm": 1.09375, "learning_rate": 4.9827540591855064e-06, "loss": 0.0502, "mean_token_accuracy": 0.9840812534093857, "num_tokens": 40084566.0, "step": 377 }, { "entropy": 0.9930616468191147, "epoch": 0.8620296465222349, "grad_norm": 1.7265625, "learning_rate": 4.9825325073781075e-06, "loss": 0.0597, "mean_token_accuracy": 0.9831181466579437, "num_tokens": 40190757.0, "step": 378 }, { "entropy": 1.0002626925706863, "epoch": 0.8643101482326112, "grad_norm": 1.3046875, "learning_rate": 4.982309546533184e-06, "loss": 0.0491, "mean_token_accuracy": 0.9840922653675079, "num_tokens": 40296705.0, "step": 379 }, { "entropy": 0.9971409887075424, "epoch": 0.8665906499429875, "grad_norm": 1.1640625, "learning_rate": 4.982085176777285e-06, "loss": 0.0524, "mean_token_accuracy": 0.9838970452547073, "num_tokens": 40403093.0, "step": 380 }, { "entropy": 1.000163421034813, "epoch": 0.8688711516533637, "grad_norm": 1.4609375, "learning_rate": 4.981859398237758e-06, "loss": 0.066, "mean_token_accuracy": 0.9838507920503616, "num_tokens": 40509518.0, "step": 381 }, { "entropy": 0.9972517937421799, "epoch": 0.87115165336374, "grad_norm": 1.6953125, "learning_rate": 4.9816322110427505e-06, "loss": 0.062, "mean_token_accuracy": 0.9827415943145752, "num_tokens": 40616238.0, "step": 382 }, { "entropy": 1.0007647722959518, "epoch": 0.8734321550741163, "grad_norm": 1.65625, "learning_rate": 4.98140361532121e-06, "loss": 0.0595, "mean_token_accuracy": 0.9832288771867752, "num_tokens": 40722285.0, "step": 383 }, { "entropy": 0.9979493468999863, "epoch": 0.8757126567844926, "grad_norm": 1.3046875, "learning_rate": 4.981173611202883e-06, "loss": 0.0477, "mean_token_accuracy": 0.9869934171438217, "num_tokens": 40828326.0, "step": 384 }, { "entropy": 0.9982305616140366, "epoch": 0.8779931584948689, "grad_norm": 1.4453125, "learning_rate": 4.980942198818315e-06, "loss": 0.0601, "mean_token_accuracy": 0.9832217544317245, "num_tokens": 40934897.0, "step": 385 }, { "entropy": 0.9912564754486084, "epoch": 0.8802736602052451, "grad_norm": 1.2578125, "learning_rate": 4.980709378298851e-06, "loss": 0.0483, "mean_token_accuracy": 0.9833930283784866, "num_tokens": 41041131.0, "step": 386 }, { "entropy": 0.996632531285286, "epoch": 0.8825541619156214, "grad_norm": 1.7421875, "learning_rate": 4.980475149776636e-06, "loss": 0.0605, "mean_token_accuracy": 0.9817553609609604, "num_tokens": 41147177.0, "step": 387 }, { "entropy": 0.9980178773403168, "epoch": 0.8848346636259977, "grad_norm": 1.5234375, "learning_rate": 4.980239513384614e-06, "loss": 0.0565, "mean_token_accuracy": 0.9834538698196411, "num_tokens": 41253464.0, "step": 388 }, { "entropy": 0.9888476729393005, "epoch": 0.887115165336374, "grad_norm": 1.5234375, "learning_rate": 4.980002469256527e-06, "loss": 0.0501, "mean_token_accuracy": 0.9848448634147644, "num_tokens": 41359698.0, "step": 389 }, { "entropy": 0.9897028654813766, "epoch": 0.8893956670467503, "grad_norm": 1.359375, "learning_rate": 4.979764017526916e-06, "loss": 0.0477, "mean_token_accuracy": 0.9866544455289841, "num_tokens": 41465978.0, "step": 390 }, { "entropy": 0.9959966242313385, "epoch": 0.8916761687571265, "grad_norm": 1.078125, "learning_rate": 4.979524158331123e-06, "loss": 0.0434, "mean_token_accuracy": 0.9886751919984818, "num_tokens": 41572032.0, "step": 391 }, { "entropy": 0.9893486350774765, "epoch": 0.8939566704675028, "grad_norm": 1.3828125, "learning_rate": 4.979282891805287e-06, "loss": 0.06, "mean_token_accuracy": 0.9828176945447922, "num_tokens": 41678662.0, "step": 392 }, { "entropy": 0.994259387254715, "epoch": 0.8962371721778791, "grad_norm": 1.28125, "learning_rate": 4.979040218086345e-06, "loss": 0.0389, "mean_token_accuracy": 0.9888333380222321, "num_tokens": 41784582.0, "step": 393 }, { "entropy": 0.9941402822732925, "epoch": 0.8985176738882554, "grad_norm": 1.25, "learning_rate": 4.978796137312036e-06, "loss": 0.042, "mean_token_accuracy": 0.9859060049057007, "num_tokens": 41890767.0, "step": 394 }, { "entropy": 0.9920034259557724, "epoch": 0.9007981755986317, "grad_norm": 1.5390625, "learning_rate": 4.978550649620894e-06, "loss": 0.0603, "mean_token_accuracy": 0.9817443341016769, "num_tokens": 41997376.0, "step": 395 }, { "entropy": 0.9951441287994385, "epoch": 0.9030786773090079, "grad_norm": 1.484375, "learning_rate": 4.978303755152254e-06, "loss": 0.0553, "mean_token_accuracy": 0.9853280782699585, "num_tokens": 42104204.0, "step": 396 }, { "entropy": 0.9931427538394928, "epoch": 0.9053591790193842, "grad_norm": 1.2578125, "learning_rate": 4.978055454046247e-06, "loss": 0.0429, "mean_token_accuracy": 0.9856635928153992, "num_tokens": 42210403.0, "step": 397 }, { "entropy": 0.9815190732479095, "epoch": 0.9076396807297605, "grad_norm": 1.546875, "learning_rate": 4.977805746443807e-06, "loss": 0.0684, "mean_token_accuracy": 0.9810968935489655, "num_tokens": 42316603.0, "step": 398 }, { "entropy": 0.9933472871780396, "epoch": 0.9099201824401368, "grad_norm": 1.015625, "learning_rate": 4.9775546324866596e-06, "loss": 0.0437, "mean_token_accuracy": 0.9875859767198563, "num_tokens": 42422821.0, "step": 399 }, { "entropy": 0.9833043813705444, "epoch": 0.9122006841505131, "grad_norm": 1.7734375, "learning_rate": 4.977302112317334e-06, "loss": 0.0593, "mean_token_accuracy": 0.9822406768798828, "num_tokens": 42528995.0, "step": 400 }, { "entropy": 0.9921567142009735, "epoch": 0.9144811858608894, "grad_norm": 1.453125, "learning_rate": 4.977048186079155e-06, "loss": 0.0539, "mean_token_accuracy": 0.9828652590513229, "num_tokens": 42635025.0, "step": 401 }, { "entropy": 0.986000269651413, "epoch": 0.9167616875712656, "grad_norm": 1.9921875, "learning_rate": 4.976792853916248e-06, "loss": 0.0537, "mean_token_accuracy": 0.9850079566240311, "num_tokens": 42740839.0, "step": 402 }, { "entropy": 0.9934243708848953, "epoch": 0.9190421892816419, "grad_norm": 1.46875, "learning_rate": 4.9765361159735335e-06, "loss": 0.0621, "mean_token_accuracy": 0.9847323596477509, "num_tokens": 42847493.0, "step": 403 }, { "entropy": 0.9852237701416016, "epoch": 0.9213226909920182, "grad_norm": 1.640625, "learning_rate": 4.97627797239673e-06, "loss": 0.0641, "mean_token_accuracy": 0.9819068759679794, "num_tokens": 42953570.0, "step": 404 }, { "entropy": 0.9889465272426605, "epoch": 0.9236031927023945, "grad_norm": 1.4453125, "learning_rate": 4.976018423332357e-06, "loss": 0.071, "mean_token_accuracy": 0.9820783287286758, "num_tokens": 43059711.0, "step": 405 }, { "entropy": 0.9891555309295654, "epoch": 0.9258836944127709, "grad_norm": 1.390625, "learning_rate": 4.975757468927727e-06, "loss": 0.0523, "mean_token_accuracy": 0.9827336519956589, "num_tokens": 43166236.0, "step": 406 }, { "entropy": 0.9895079582929611, "epoch": 0.928164196123147, "grad_norm": 1.8359375, "learning_rate": 4.975495109330954e-06, "loss": 0.0698, "mean_token_accuracy": 0.9793443232774734, "num_tokens": 43272214.0, "step": 407 }, { "entropy": 0.9915363788604736, "epoch": 0.9304446978335233, "grad_norm": 1.453125, "learning_rate": 4.97523134469095e-06, "loss": 0.058, "mean_token_accuracy": 0.9854949712753296, "num_tokens": 43378538.0, "step": 408 }, { "entropy": 0.9931406229734421, "epoch": 0.9327251995438997, "grad_norm": 1.234375, "learning_rate": 4.97496617515742e-06, "loss": 0.0502, "mean_token_accuracy": 0.9865347594022751, "num_tokens": 43485133.0, "step": 409 }, { "entropy": 0.9865346848964691, "epoch": 0.935005701254276, "grad_norm": 1.5625, "learning_rate": 4.974699600880869e-06, "loss": 0.0627, "mean_token_accuracy": 0.982403352856636, "num_tokens": 43592181.0, "step": 410 }, { "entropy": 0.9857459515333176, "epoch": 0.9372862029646523, "grad_norm": 1.2890625, "learning_rate": 4.974431622012601e-06, "loss": 0.0628, "mean_token_accuracy": 0.9843746870756149, "num_tokens": 43698162.0, "step": 411 }, { "entropy": 0.9871868640184402, "epoch": 0.9395667046750285, "grad_norm": 1.3203125, "learning_rate": 4.974162238704716e-06, "loss": 0.0479, "mean_token_accuracy": 0.9863204210996628, "num_tokens": 43804523.0, "step": 412 }, { "entropy": 0.9960972666740417, "epoch": 0.9418472063854048, "grad_norm": 1.2421875, "learning_rate": 4.973891451110109e-06, "loss": 0.0554, "mean_token_accuracy": 0.9831165969371796, "num_tokens": 43910940.0, "step": 413 }, { "entropy": 0.9916461408138275, "epoch": 0.9441277080957811, "grad_norm": 1.484375, "learning_rate": 4.973619259382475e-06, "loss": 0.0529, "mean_token_accuracy": 0.9849494993686676, "num_tokens": 44017619.0, "step": 414 }, { "entropy": 0.9933348745107651, "epoch": 0.9464082098061574, "grad_norm": 1.515625, "learning_rate": 4.973345663676305e-06, "loss": 0.0395, "mean_token_accuracy": 0.9884075969457626, "num_tokens": 44123480.0, "step": 415 }, { "entropy": 0.9933515936136246, "epoch": 0.9486887115165337, "grad_norm": 1.609375, "learning_rate": 4.973070664146885e-06, "loss": 0.0575, "mean_token_accuracy": 0.9833314269781113, "num_tokens": 44229888.0, "step": 416 }, { "entropy": 0.9953409284353256, "epoch": 0.95096921322691, "grad_norm": 1.3671875, "learning_rate": 4.972794260950301e-06, "loss": 0.0508, "mean_token_accuracy": 0.9846776574850082, "num_tokens": 44336188.0, "step": 417 }, { "entropy": 0.9853181689977646, "epoch": 0.9532497149372862, "grad_norm": 1.5390625, "learning_rate": 4.972516454243433e-06, "loss": 0.0615, "mean_token_accuracy": 0.9845842719078064, "num_tokens": 44443077.0, "step": 418 }, { "entropy": 0.9902692437171936, "epoch": 0.9555302166476625, "grad_norm": 1.6796875, "learning_rate": 4.972237244183961e-06, "loss": 0.0461, "mean_token_accuracy": 0.9872005134820938, "num_tokens": 44549687.0, "step": 419 }, { "entropy": 0.9820026755332947, "epoch": 0.9578107183580388, "grad_norm": 1.3828125, "learning_rate": 4.971956630930356e-06, "loss": 0.0479, "mean_token_accuracy": 0.9872235655784607, "num_tokens": 44656340.0, "step": 420 }, { "entropy": 0.9850461035966873, "epoch": 0.9600912200684151, "grad_norm": 1.6171875, "learning_rate": 4.971674614641891e-06, "loss": 0.0553, "mean_token_accuracy": 0.9860990941524506, "num_tokens": 44762772.0, "step": 421 }, { "entropy": 0.981113001704216, "epoch": 0.9623717217787914, "grad_norm": 1.3203125, "learning_rate": 4.971391195478632e-06, "loss": 0.0562, "mean_token_accuracy": 0.986365020275116, "num_tokens": 44868870.0, "step": 422 }, { "entropy": 0.9865913093090057, "epoch": 0.9646522234891676, "grad_norm": 1.875, "learning_rate": 4.971106373601443e-06, "loss": 0.0625, "mean_token_accuracy": 0.9825845658779144, "num_tokens": 44974694.0, "step": 423 }, { "entropy": 0.9814057648181915, "epoch": 0.9669327251995439, "grad_norm": 1.640625, "learning_rate": 4.9708201491719825e-06, "loss": 0.0631, "mean_token_accuracy": 0.9818404912948608, "num_tokens": 45080706.0, "step": 424 }, { "entropy": 0.9817659556865692, "epoch": 0.9692132269099202, "grad_norm": 1.4765625, "learning_rate": 4.9705325223527055e-06, "loss": 0.0521, "mean_token_accuracy": 0.9864613711833954, "num_tokens": 45187074.0, "step": 425 }, { "entropy": 0.9875007718801498, "epoch": 0.9714937286202965, "grad_norm": 1.7265625, "learning_rate": 4.970243493306865e-06, "loss": 0.0564, "mean_token_accuracy": 0.985431969165802, "num_tokens": 45293353.0, "step": 426 }, { "entropy": 0.9827473014593124, "epoch": 0.9737742303306728, "grad_norm": 1.546875, "learning_rate": 4.969953062198508e-06, "loss": 0.0573, "mean_token_accuracy": 0.9855919927358627, "num_tokens": 45399673.0, "step": 427 }, { "entropy": 0.9839651584625244, "epoch": 0.976054732041049, "grad_norm": 2.0, "learning_rate": 4.969661229192477e-06, "loss": 0.0641, "mean_token_accuracy": 0.9825310260057449, "num_tokens": 45505907.0, "step": 428 }, { "entropy": 0.9861139059066772, "epoch": 0.9783352337514253, "grad_norm": 1.2578125, "learning_rate": 4.969367994454412e-06, "loss": 0.0545, "mean_token_accuracy": 0.9863940924406052, "num_tokens": 45612788.0, "step": 429 }, { "entropy": 0.9843345284461975, "epoch": 0.9806157354618016, "grad_norm": 1.328125, "learning_rate": 4.9690733581507445e-06, "loss": 0.0596, "mean_token_accuracy": 0.9829192757606506, "num_tokens": 45718966.0, "step": 430 }, { "entropy": 0.9875551462173462, "epoch": 0.9828962371721779, "grad_norm": 1.2890625, "learning_rate": 4.968777320448707e-06, "loss": 0.0543, "mean_token_accuracy": 0.9823981374502182, "num_tokens": 45825479.0, "step": 431 }, { "entropy": 0.9855508804321289, "epoch": 0.9851767388825542, "grad_norm": 1.1875, "learning_rate": 4.9684798815163235e-06, "loss": 0.0507, "mean_token_accuracy": 0.985785648226738, "num_tokens": 45931565.0, "step": 432 }, { "entropy": 0.9845681488513947, "epoch": 0.9874572405929305, "grad_norm": 1.4296875, "learning_rate": 4.968181041522416e-06, "loss": 0.0622, "mean_token_accuracy": 0.9834192246198654, "num_tokens": 46038035.0, "step": 433 }, { "entropy": 0.9875636398792267, "epoch": 0.9897377423033067, "grad_norm": 1.40625, "learning_rate": 4.967880800636599e-06, "loss": 0.0526, "mean_token_accuracy": 0.9825520515441895, "num_tokens": 46144268.0, "step": 434 }, { "entropy": 0.9869890064001083, "epoch": 0.992018244013683, "grad_norm": 1.328125, "learning_rate": 4.967579159029284e-06, "loss": 0.0619, "mean_token_accuracy": 0.9823245704174042, "num_tokens": 46250353.0, "step": 435 }, { "entropy": 0.9811157733201981, "epoch": 0.9942987457240593, "grad_norm": 1.5078125, "learning_rate": 4.9672761168716766e-06, "loss": 0.0561, "mean_token_accuracy": 0.9833370447158813, "num_tokens": 46356552.0, "step": 436 }, { "entropy": 0.983551025390625, "epoch": 0.9965792474344356, "grad_norm": 1.359375, "learning_rate": 4.966971674335778e-06, "loss": 0.048, "mean_token_accuracy": 0.987804114818573, "num_tokens": 46462619.0, "step": 437 }, { "entropy": 0.9823139160871506, "epoch": 0.9988597491448119, "grad_norm": 1.328125, "learning_rate": 4.966665831594383e-06, "loss": 0.0579, "mean_token_accuracy": 0.9830011874437332, "num_tokens": 46569093.0, "step": 438 }, { "entropy": 0.9805935621261597, "epoch": 1.0, "grad_norm": 2.265625, "learning_rate": 4.966358588821084e-06, "loss": 0.0456, "mean_token_accuracy": 0.9885893762111664, "num_tokens": 46607976.0, "step": 439 }, { "entropy": 0.9857584089040756, "epoch": 1.0022805017103762, "grad_norm": 1.3046875, "learning_rate": 4.966049946190265e-06, "loss": 0.038, "mean_token_accuracy": 0.9886159598827362, "num_tokens": 46714273.0, "step": 440 }, { "epoch": 1.0022805017103762, "eval_entropy": 0.9844890288980288, "eval_loss": 0.052759651094675064, "eval_mean_token_accuracy": 0.9848716392716528, "eval_num_tokens": 46714273.0, "eval_runtime": 66.0684, "eval_samples_per_second": 126.914, "eval_steps_per_second": 3.981, "step": 440 }, { "entropy": 0.9883111417293549, "epoch": 1.0045610034207526, "grad_norm": 1.125, "learning_rate": 4.9657399038771045e-06, "loss": 0.0372, "mean_token_accuracy": 0.9890596568584442, "num_tokens": 46820423.0, "step": 441 }, { "entropy": 0.9901106655597687, "epoch": 1.0068415051311288, "grad_norm": 1.6015625, "learning_rate": 4.965428462057578e-06, "loss": 0.0565, "mean_token_accuracy": 0.9837136566638947, "num_tokens": 46926732.0, "step": 442 }, { "entropy": 0.9789129942655563, "epoch": 1.0091220068415052, "grad_norm": 1.28125, "learning_rate": 4.965115620908453e-06, "loss": 0.0433, "mean_token_accuracy": 0.9877880811691284, "num_tokens": 47033084.0, "step": 443 }, { "entropy": 0.9872928857803345, "epoch": 1.0114025085518814, "grad_norm": 1.3359375, "learning_rate": 4.964801380607293e-06, "loss": 0.0443, "mean_token_accuracy": 0.985794872045517, "num_tokens": 47139405.0, "step": 444 }, { "entropy": 0.9767682105302811, "epoch": 1.0136830102622576, "grad_norm": 1.3515625, "learning_rate": 4.964485741332453e-06, "loss": 0.0487, "mean_token_accuracy": 0.9855656921863556, "num_tokens": 47246648.0, "step": 445 }, { "entropy": 0.981042742729187, "epoch": 1.015963511972634, "grad_norm": 1.375, "learning_rate": 4.964168703263086e-06, "loss": 0.0507, "mean_token_accuracy": 0.984528049826622, "num_tokens": 47353497.0, "step": 446 }, { "entropy": 0.9819304347038269, "epoch": 1.0182440136830102, "grad_norm": 1.34375, "learning_rate": 4.963850266579136e-06, "loss": 0.0601, "mean_token_accuracy": 0.9819273501634598, "num_tokens": 47460184.0, "step": 447 }, { "entropy": 0.9769564718008041, "epoch": 1.0205245153933866, "grad_norm": 1.46875, "learning_rate": 4.963530431461341e-06, "loss": 0.058, "mean_token_accuracy": 0.9820620119571686, "num_tokens": 47566264.0, "step": 448 }, { "entropy": 0.9764729142189026, "epoch": 1.0228050171037628, "grad_norm": 1.28125, "learning_rate": 4.963209198091232e-06, "loss": 0.0468, "mean_token_accuracy": 0.9862594902515411, "num_tokens": 47673121.0, "step": 449 }, { "entropy": 0.9832967966794968, "epoch": 1.025085518814139, "grad_norm": 1.15625, "learning_rate": 4.962886566651138e-06, "loss": 0.0414, "mean_token_accuracy": 0.9868118166923523, "num_tokens": 47779909.0, "step": 450 }, { "entropy": 0.9717783331871033, "epoch": 1.0273660205245154, "grad_norm": 1.6015625, "learning_rate": 4.962562537324176e-06, "loss": 0.0516, "mean_token_accuracy": 0.9864268451929092, "num_tokens": 47886671.0, "step": 451 }, { "entropy": 0.9811577945947647, "epoch": 1.0296465222348916, "grad_norm": 1.7734375, "learning_rate": 4.96223711029426e-06, "loss": 0.0606, "mean_token_accuracy": 0.982515811920166, "num_tokens": 47992664.0, "step": 452 }, { "entropy": 0.9805515557527542, "epoch": 1.031927023945268, "grad_norm": 1.546875, "learning_rate": 4.961910285746094e-06, "loss": 0.048, "mean_token_accuracy": 0.9855034649372101, "num_tokens": 48098768.0, "step": 453 }, { "entropy": 0.9824148267507553, "epoch": 1.0342075256556442, "grad_norm": 1.2578125, "learning_rate": 4.9615820638651805e-06, "loss": 0.0474, "mean_token_accuracy": 0.987827479839325, "num_tokens": 48204647.0, "step": 454 }, { "entropy": 0.9784812480211258, "epoch": 1.0364880273660204, "grad_norm": 1.7421875, "learning_rate": 4.961252444837809e-06, "loss": 0.0519, "mean_token_accuracy": 0.98605877161026, "num_tokens": 48310575.0, "step": 455 }, { "entropy": 0.9828673303127289, "epoch": 1.0387685290763968, "grad_norm": 1.453125, "learning_rate": 4.960921428851066e-06, "loss": 0.0522, "mean_token_accuracy": 0.9853001981973648, "num_tokens": 48416381.0, "step": 456 }, { "entropy": 0.983055904507637, "epoch": 1.041049030786773, "grad_norm": 1.625, "learning_rate": 4.960589016092832e-06, "loss": 0.0568, "mean_token_accuracy": 0.9835362285375595, "num_tokens": 48523445.0, "step": 457 }, { "entropy": 0.9792490750551224, "epoch": 1.0433295324971494, "grad_norm": 2.0, "learning_rate": 4.960255206751774e-06, "loss": 0.0639, "mean_token_accuracy": 0.981433779001236, "num_tokens": 48629611.0, "step": 458 }, { "entropy": 0.9859515279531479, "epoch": 1.0456100342075256, "grad_norm": 1.265625, "learning_rate": 4.959920001017358e-06, "loss": 0.0463, "mean_token_accuracy": 0.9869069457054138, "num_tokens": 48736035.0, "step": 459 }, { "entropy": 0.9823427051305771, "epoch": 1.047890535917902, "grad_norm": 1.328125, "learning_rate": 4.95958339907984e-06, "loss": 0.0502, "mean_token_accuracy": 0.9854542016983032, "num_tokens": 48842594.0, "step": 460 }, { "entropy": 0.9820017218589783, "epoch": 1.0501710376282782, "grad_norm": 1.8125, "learning_rate": 4.959245401130269e-06, "loss": 0.0584, "mean_token_accuracy": 0.9839332550764084, "num_tokens": 48948354.0, "step": 461 }, { "entropy": 0.9806426763534546, "epoch": 1.0524515393386544, "grad_norm": 1.375, "learning_rate": 4.958906007360487e-06, "loss": 0.0511, "mean_token_accuracy": 0.9851651340723038, "num_tokens": 49054646.0, "step": 462 }, { "entropy": 0.9847809225320816, "epoch": 1.0547320410490308, "grad_norm": 1.1640625, "learning_rate": 4.958565217963125e-06, "loss": 0.0473, "mean_token_accuracy": 0.9857627749443054, "num_tokens": 49161127.0, "step": 463 }, { "entropy": 0.9819318652153015, "epoch": 1.057012542759407, "grad_norm": 1.3828125, "learning_rate": 4.95822303313161e-06, "loss": 0.04, "mean_token_accuracy": 0.9887021034955978, "num_tokens": 49267381.0, "step": 464 }, { "entropy": 0.9826763868331909, "epoch": 1.0592930444697835, "grad_norm": 1.375, "learning_rate": 4.957879453060159e-06, "loss": 0.0503, "mean_token_accuracy": 0.9851426631212234, "num_tokens": 49374247.0, "step": 465 }, { "entropy": 0.9819617420434952, "epoch": 1.0615735461801596, "grad_norm": 1.15625, "learning_rate": 4.957534477943782e-06, "loss": 0.0476, "mean_token_accuracy": 0.9846676886081696, "num_tokens": 49480992.0, "step": 466 }, { "entropy": 0.9916632920503616, "epoch": 1.0638540478905358, "grad_norm": 1.265625, "learning_rate": 4.957188107978279e-06, "loss": 0.0405, "mean_token_accuracy": 0.9867600202560425, "num_tokens": 49587706.0, "step": 467 }, { "entropy": 0.9870170950889587, "epoch": 1.0661345496009123, "grad_norm": 1.2734375, "learning_rate": 4.956840343360245e-06, "loss": 0.0537, "mean_token_accuracy": 0.9858717322349548, "num_tokens": 49694468.0, "step": 468 }, { "entropy": 0.9772855937480927, "epoch": 1.0684150513112884, "grad_norm": 1.34375, "learning_rate": 4.956491184287062e-06, "loss": 0.0407, "mean_token_accuracy": 0.9872087687253952, "num_tokens": 49801060.0, "step": 469 }, { "entropy": 0.9790156334638596, "epoch": 1.0706955530216649, "grad_norm": 1.546875, "learning_rate": 4.9561406309569084e-06, "loss": 0.0533, "mean_token_accuracy": 0.9861847758293152, "num_tokens": 49907495.0, "step": 470 }, { "entropy": 0.981247216463089, "epoch": 1.072976054732041, "grad_norm": 1.2109375, "learning_rate": 4.955788683568749e-06, "loss": 0.0484, "mean_token_accuracy": 0.9856405854225159, "num_tokens": 50013584.0, "step": 471 }, { "entropy": 0.9859843850135803, "epoch": 1.0752565564424172, "grad_norm": 1.2421875, "learning_rate": 4.955435342322345e-06, "loss": 0.0385, "mean_token_accuracy": 0.9871060997247696, "num_tokens": 50119696.0, "step": 472 }, { "entropy": 0.9826996475458145, "epoch": 1.0775370581527937, "grad_norm": 1.203125, "learning_rate": 4.955080607418244e-06, "loss": 0.0489, "mean_token_accuracy": 0.9862266182899475, "num_tokens": 50226078.0, "step": 473 }, { "entropy": 0.9858256131410599, "epoch": 1.0798175598631699, "grad_norm": 1.265625, "learning_rate": 4.954724479057788e-06, "loss": 0.0468, "mean_token_accuracy": 0.986233577132225, "num_tokens": 50331595.0, "step": 474 }, { "entropy": 0.9837163835763931, "epoch": 1.0820980615735463, "grad_norm": 1.296875, "learning_rate": 4.954366957443107e-06, "loss": 0.0487, "mean_token_accuracy": 0.9829851239919662, "num_tokens": 50437402.0, "step": 475 }, { "entropy": 0.9858402013778687, "epoch": 1.0843785632839225, "grad_norm": 1.78125, "learning_rate": 4.954008042777125e-06, "loss": 0.0458, "mean_token_accuracy": 0.9858029782772064, "num_tokens": 50543666.0, "step": 476 }, { "entropy": 0.9878192394971848, "epoch": 1.0866590649942987, "grad_norm": 1.78125, "learning_rate": 4.953647735263555e-06, "loss": 0.0519, "mean_token_accuracy": 0.9836118370294571, "num_tokens": 50649942.0, "step": 477 }, { "entropy": 0.9846040606498718, "epoch": 1.088939566704675, "grad_norm": 1.5859375, "learning_rate": 4.953286035106898e-06, "loss": 0.0481, "mean_token_accuracy": 0.9870715737342834, "num_tokens": 50756479.0, "step": 478 }, { "entropy": 0.98616161942482, "epoch": 1.0912200684150513, "grad_norm": 1.546875, "learning_rate": 4.952922942512452e-06, "loss": 0.0527, "mean_token_accuracy": 0.983563169836998, "num_tokens": 50862632.0, "step": 479 }, { "entropy": 0.987126350402832, "epoch": 1.0935005701254277, "grad_norm": 1.6796875, "learning_rate": 4.9525584576862985e-06, "loss": 0.0512, "mean_token_accuracy": 0.9847068190574646, "num_tokens": 50968775.0, "step": 480 }, { "entropy": 0.9918335974216461, "epoch": 1.0957810718358039, "grad_norm": 1.421875, "learning_rate": 4.952192580835313e-06, "loss": 0.0476, "mean_token_accuracy": 0.986454039812088, "num_tokens": 51075257.0, "step": 481 }, { "entropy": 0.9806262850761414, "epoch": 1.09806157354618, "grad_norm": 1.8125, "learning_rate": 4.9518253121671595e-06, "loss": 0.0529, "mean_token_accuracy": 0.9842270612716675, "num_tokens": 51182020.0, "step": 482 }, { "entropy": 0.9917881339788437, "epoch": 1.1003420752565565, "grad_norm": 1.53125, "learning_rate": 4.951456651890294e-06, "loss": 0.0338, "mean_token_accuracy": 0.9903978258371353, "num_tokens": 51289000.0, "step": 483 }, { "entropy": 0.9833653718233109, "epoch": 1.1026225769669327, "grad_norm": 1.609375, "learning_rate": 4.951086600213959e-06, "loss": 0.0599, "mean_token_accuracy": 0.9817968159914017, "num_tokens": 51395229.0, "step": 484 }, { "entropy": 0.9845210462808609, "epoch": 1.104903078677309, "grad_norm": 1.2109375, "learning_rate": 4.950715157348191e-06, "loss": 0.0438, "mean_token_accuracy": 0.9879978895187378, "num_tokens": 51501331.0, "step": 485 }, { "entropy": 0.980344757437706, "epoch": 1.1071835803876853, "grad_norm": 1.296875, "learning_rate": 4.950342323503812e-06, "loss": 0.0593, "mean_token_accuracy": 0.980981707572937, "num_tokens": 51607370.0, "step": 486 }, { "entropy": 0.9814257919788361, "epoch": 1.1094640820980617, "grad_norm": 1.765625, "learning_rate": 4.949968098892436e-06, "loss": 0.0674, "mean_token_accuracy": 0.9810915142297745, "num_tokens": 51712977.0, "step": 487 }, { "entropy": 0.9865595996379852, "epoch": 1.1117445838084379, "grad_norm": 1.453125, "learning_rate": 4.949592483726465e-06, "loss": 0.0478, "mean_token_accuracy": 0.9852126836776733, "num_tokens": 51819570.0, "step": 488 }, { "entropy": 0.980936661362648, "epoch": 1.114025085518814, "grad_norm": 1.40625, "learning_rate": 4.949215478219092e-06, "loss": 0.0537, "mean_token_accuracy": 0.9838624149560928, "num_tokens": 51925273.0, "step": 489 }, { "entropy": 0.9855486750602722, "epoch": 1.1163055872291905, "grad_norm": 1.1484375, "learning_rate": 4.948837082584298e-06, "loss": 0.0484, "mean_token_accuracy": 0.9885068088769913, "num_tokens": 52031742.0, "step": 490 }, { "entropy": 0.9910408705472946, "epoch": 1.1185860889395667, "grad_norm": 1.5390625, "learning_rate": 4.9484572970368516e-06, "loss": 0.0535, "mean_token_accuracy": 0.9822172373533249, "num_tokens": 52137566.0, "step": 491 }, { "entropy": 0.9863322675228119, "epoch": 1.120866590649943, "grad_norm": 1.265625, "learning_rate": 4.948076121792313e-06, "loss": 0.0432, "mean_token_accuracy": 0.985975980758667, "num_tokens": 52243071.0, "step": 492 }, { "entropy": 0.9889108240604401, "epoch": 1.1231470923603193, "grad_norm": 1.6015625, "learning_rate": 4.9476935570670294e-06, "loss": 0.0479, "mean_token_accuracy": 0.9855006635189056, "num_tokens": 52349679.0, "step": 493 }, { "entropy": 0.9834950119256973, "epoch": 1.1254275940706955, "grad_norm": 1.609375, "learning_rate": 4.947309603078138e-06, "loss": 0.0572, "mean_token_accuracy": 0.9859189242124557, "num_tokens": 52456271.0, "step": 494 }, { "entropy": 0.9897123575210571, "epoch": 1.127708095781072, "grad_norm": 1.84375, "learning_rate": 4.946924260043563e-06, "loss": 0.0462, "mean_token_accuracy": 0.9844526499509811, "num_tokens": 52562361.0, "step": 495 }, { "entropy": 0.9912416785955429, "epoch": 1.129988597491448, "grad_norm": 1.6875, "learning_rate": 4.946537528182017e-06, "loss": 0.0496, "mean_token_accuracy": 0.9847407639026642, "num_tokens": 52668282.0, "step": 496 }, { "entropy": 0.9902364015579224, "epoch": 1.1322690992018245, "grad_norm": 1.609375, "learning_rate": 4.946149407713002e-06, "loss": 0.0539, "mean_token_accuracy": 0.9825800359249115, "num_tokens": 52774684.0, "step": 497 }, { "entropy": 0.989710807800293, "epoch": 1.1345496009122007, "grad_norm": 1.6484375, "learning_rate": 4.945759898856809e-06, "loss": 0.0608, "mean_token_accuracy": 0.9827321916818619, "num_tokens": 52880970.0, "step": 498 }, { "entropy": 0.9932610988616943, "epoch": 1.1368301026225769, "grad_norm": 1.3515625, "learning_rate": 4.9453690018345144e-06, "loss": 0.0648, "mean_token_accuracy": 0.9826203137636185, "num_tokens": 52986658.0, "step": 499 }, { "entropy": 0.9941859543323517, "epoch": 1.1391106043329533, "grad_norm": 1.4296875, "learning_rate": 4.944976716867984e-06, "loss": 0.0526, "mean_token_accuracy": 0.9851877987384796, "num_tokens": 53092375.0, "step": 500 }, { "entropy": 0.9975610375404358, "epoch": 1.1413911060433295, "grad_norm": 1.46875, "learning_rate": 4.944583044179871e-06, "loss": 0.0462, "mean_token_accuracy": 0.9870862662792206, "num_tokens": 53198912.0, "step": 501 }, { "entropy": 0.9965277165174484, "epoch": 1.143671607753706, "grad_norm": 1.2578125, "learning_rate": 4.944187983993617e-06, "loss": 0.0441, "mean_token_accuracy": 0.9879919588565826, "num_tokens": 53305585.0, "step": 502 }, { "entropy": 0.9936819076538086, "epoch": 1.145952109464082, "grad_norm": 1.4609375, "learning_rate": 4.94379153653345e-06, "loss": 0.0496, "mean_token_accuracy": 0.9840343445539474, "num_tokens": 53411212.0, "step": 503 }, { "entropy": 0.9846415519714355, "epoch": 1.1482326111744583, "grad_norm": 1.4921875, "learning_rate": 4.9433937020243854e-06, "loss": 0.0621, "mean_token_accuracy": 0.9815598428249359, "num_tokens": 53517402.0, "step": 504 }, { "entropy": 0.9882779270410538, "epoch": 1.1505131128848347, "grad_norm": 1.5625, "learning_rate": 4.942994480692228e-06, "loss": 0.049, "mean_token_accuracy": 0.985088124871254, "num_tokens": 53623289.0, "step": 505 }, { "entropy": 0.9839213490486145, "epoch": 1.152793614595211, "grad_norm": 0.9453125, "learning_rate": 4.942593872763566e-06, "loss": 0.0366, "mean_token_accuracy": 0.989039734005928, "num_tokens": 53729356.0, "step": 506 }, { "entropy": 0.9932788163423538, "epoch": 1.1550741163055873, "grad_norm": 1.46875, "learning_rate": 4.9421918784657795e-06, "loss": 0.0426, "mean_token_accuracy": 0.9877276122570038, "num_tokens": 53835589.0, "step": 507 }, { "entropy": 0.9919345676898956, "epoch": 1.1573546180159635, "grad_norm": 1.3515625, "learning_rate": 4.94178849802703e-06, "loss": 0.0544, "mean_token_accuracy": 0.9860077649354935, "num_tokens": 53941563.0, "step": 508 }, { "entropy": 0.9877480119466782, "epoch": 1.1596351197263397, "grad_norm": 1.8046875, "learning_rate": 4.9413837316762705e-06, "loss": 0.0566, "mean_token_accuracy": 0.9828944355249405, "num_tokens": 54047438.0, "step": 509 }, { "entropy": 0.9895816892385483, "epoch": 1.1619156214367161, "grad_norm": 1.2890625, "learning_rate": 4.940977579643237e-06, "loss": 0.0465, "mean_token_accuracy": 0.9851583391427994, "num_tokens": 54153533.0, "step": 510 }, { "entropy": 0.987389549612999, "epoch": 1.1641961231470923, "grad_norm": 1.390625, "learning_rate": 4.940570042158454e-06, "loss": 0.0495, "mean_token_accuracy": 0.9858760833740234, "num_tokens": 54259885.0, "step": 511 }, { "entropy": 0.982793003320694, "epoch": 1.1664766248574687, "grad_norm": 1.40625, "learning_rate": 4.940161119453232e-06, "loss": 0.0609, "mean_token_accuracy": 0.9827637374401093, "num_tokens": 54366334.0, "step": 512 }, { "entropy": 0.9925961196422577, "epoch": 1.168757126567845, "grad_norm": 1.328125, "learning_rate": 4.939750811759668e-06, "loss": 0.0485, "mean_token_accuracy": 0.9838505387306213, "num_tokens": 54472641.0, "step": 513 }, { "entropy": 0.9881662726402283, "epoch": 1.171037628278221, "grad_norm": 1.390625, "learning_rate": 4.939339119310645e-06, "loss": 0.0606, "mean_token_accuracy": 0.9825042486190796, "num_tokens": 54579279.0, "step": 514 }, { "entropy": 0.9895337373018265, "epoch": 1.1733181299885975, "grad_norm": 1.890625, "learning_rate": 4.93892604233983e-06, "loss": 0.0607, "mean_token_accuracy": 0.983093872666359, "num_tokens": 54685527.0, "step": 515 }, { "entropy": 0.9868631511926651, "epoch": 1.1755986316989737, "grad_norm": 1.5703125, "learning_rate": 4.93851158108168e-06, "loss": 0.0521, "mean_token_accuracy": 0.9835385382175446, "num_tokens": 54792368.0, "step": 516 }, { "entropy": 0.9865094870328903, "epoch": 1.1778791334093501, "grad_norm": 1.6015625, "learning_rate": 4.938095735771433e-06, "loss": 0.0458, "mean_token_accuracy": 0.9856425374746323, "num_tokens": 54898697.0, "step": 517 }, { "entropy": 0.9822773039340973, "epoch": 1.1801596351197263, "grad_norm": 1.3203125, "learning_rate": 4.937678506645116e-06, "loss": 0.0526, "mean_token_accuracy": 0.9838978499174118, "num_tokens": 55005198.0, "step": 518 }, { "entropy": 0.9864204376935959, "epoch": 1.1824401368301025, "grad_norm": 1.484375, "learning_rate": 4.937259893939539e-06, "loss": 0.0534, "mean_token_accuracy": 0.9851760864257812, "num_tokens": 55111891.0, "step": 519 }, { "entropy": 0.993215411901474, "epoch": 1.184720638540479, "grad_norm": 1.2421875, "learning_rate": 4.9368398978923e-06, "loss": 0.0506, "mean_token_accuracy": 0.9848170131444931, "num_tokens": 55218335.0, "step": 520 }, { "entropy": 0.9899388402700424, "epoch": 1.1870011402508551, "grad_norm": 1.4296875, "learning_rate": 4.93641851874178e-06, "loss": 0.0456, "mean_token_accuracy": 0.986264705657959, "num_tokens": 55324377.0, "step": 521 }, { "entropy": 0.9833606481552124, "epoch": 1.1892816419612315, "grad_norm": 1.40625, "learning_rate": 4.935995756727146e-06, "loss": 0.0458, "mean_token_accuracy": 0.9873937666416168, "num_tokens": 55430822.0, "step": 522 }, { "entropy": 0.9857819676399231, "epoch": 1.1915621436716077, "grad_norm": 1.3828125, "learning_rate": 4.935571612088349e-06, "loss": 0.0331, "mean_token_accuracy": 0.9912050664424896, "num_tokens": 55537454.0, "step": 523 }, { "entropy": 0.9795243889093399, "epoch": 1.193842645381984, "grad_norm": 1.6640625, "learning_rate": 4.935146085066125e-06, "loss": 0.0409, "mean_token_accuracy": 0.9888584017753601, "num_tokens": 55643379.0, "step": 524 }, { "entropy": 0.982573926448822, "epoch": 1.1961231470923603, "grad_norm": 1.578125, "learning_rate": 4.934719175901996e-06, "loss": 0.0538, "mean_token_accuracy": 0.9843892604112625, "num_tokens": 55749552.0, "step": 525 }, { "entropy": 0.9827165901660919, "epoch": 1.1984036488027365, "grad_norm": 1.3671875, "learning_rate": 4.934290884838266e-06, "loss": 0.0389, "mean_token_accuracy": 0.988952249288559, "num_tokens": 55856037.0, "step": 526 }, { "entropy": 0.9846077412366867, "epoch": 1.200684150513113, "grad_norm": 1.5078125, "learning_rate": 4.933861212118027e-06, "loss": 0.0478, "mean_token_accuracy": 0.9863806664943695, "num_tokens": 55962302.0, "step": 527 }, { "entropy": 0.9804884791374207, "epoch": 1.2029646522234891, "grad_norm": 1.4296875, "learning_rate": 4.933430157985151e-06, "loss": 0.0516, "mean_token_accuracy": 0.9873116761445999, "num_tokens": 56068273.0, "step": 528 }, { "entropy": 0.9786861389875412, "epoch": 1.2052451539338653, "grad_norm": 1.203125, "learning_rate": 4.932997722684296e-06, "loss": 0.0416, "mean_token_accuracy": 0.9872642606496811, "num_tokens": 56174387.0, "step": 529 }, { "entropy": 0.9822756797075272, "epoch": 1.2075256556442417, "grad_norm": 1.28125, "learning_rate": 4.932563906460905e-06, "loss": 0.0386, "mean_token_accuracy": 0.9865251034498215, "num_tokens": 56280324.0, "step": 530 }, { "entropy": 0.9759213477373123, "epoch": 1.209806157354618, "grad_norm": 1.328125, "learning_rate": 4.932128709561202e-06, "loss": 0.0498, "mean_token_accuracy": 0.9853211790323257, "num_tokens": 56387137.0, "step": 531 }, { "entropy": 0.9827826172113419, "epoch": 1.2120866590649944, "grad_norm": 1.1796875, "learning_rate": 4.931692132232198e-06, "loss": 0.0432, "mean_token_accuracy": 0.9871606081724167, "num_tokens": 56493713.0, "step": 532 }, { "entropy": 0.989987850189209, "epoch": 1.2143671607753705, "grad_norm": 1.2734375, "learning_rate": 4.931254174721687e-06, "loss": 0.049, "mean_token_accuracy": 0.9831738770008087, "num_tokens": 56599883.0, "step": 533 }, { "entropy": 0.9815209358930588, "epoch": 1.216647662485747, "grad_norm": 1.421875, "learning_rate": 4.930814837278242e-06, "loss": 0.0445, "mean_token_accuracy": 0.9855115115642548, "num_tokens": 56706944.0, "step": 534 }, { "entropy": 0.9807597100734711, "epoch": 1.2189281641961232, "grad_norm": 1.359375, "learning_rate": 4.930374120151225e-06, "loss": 0.0499, "mean_token_accuracy": 0.9848751425743103, "num_tokens": 56813270.0, "step": 535 }, { "entropy": 0.9792383164167404, "epoch": 1.2212086659064993, "grad_norm": 1.2421875, "learning_rate": 4.929932023590776e-06, "loss": 0.0471, "mean_token_accuracy": 0.985316202044487, "num_tokens": 56918971.0, "step": 536 }, { "entropy": 0.9853387176990509, "epoch": 1.2234891676168758, "grad_norm": 1.34375, "learning_rate": 4.929488547847823e-06, "loss": 0.0561, "mean_token_accuracy": 0.9828062504529953, "num_tokens": 57025019.0, "step": 537 }, { "entropy": 0.9816011339426041, "epoch": 1.225769669327252, "grad_norm": 1.2109375, "learning_rate": 4.9290436931740735e-06, "loss": 0.0457, "mean_token_accuracy": 0.987445130944252, "num_tokens": 57131596.0, "step": 538 }, { "entropy": 0.974600151181221, "epoch": 1.2280501710376284, "grad_norm": 1.21875, "learning_rate": 4.928597459822018e-06, "loss": 0.0547, "mean_token_accuracy": 0.9851370304822922, "num_tokens": 57237705.0, "step": 539 }, { "entropy": 0.9817901402711868, "epoch": 1.2303306727480046, "grad_norm": 1.3828125, "learning_rate": 4.928149848044931e-06, "loss": 0.0577, "mean_token_accuracy": 0.9835440665483475, "num_tokens": 57343917.0, "step": 540 }, { "entropy": 0.9848485291004181, "epoch": 1.2326111744583808, "grad_norm": 1.234375, "learning_rate": 4.9277008580968665e-06, "loss": 0.0481, "mean_token_accuracy": 0.984641507267952, "num_tokens": 57450290.0, "step": 541 }, { "entropy": 0.9814647734165192, "epoch": 1.2348916761687572, "grad_norm": 1.015625, "learning_rate": 4.927250490232664e-06, "loss": 0.0322, "mean_token_accuracy": 0.9902014136314392, "num_tokens": 57556438.0, "step": 542 }, { "entropy": 0.9874097108840942, "epoch": 1.2371721778791334, "grad_norm": 1.4609375, "learning_rate": 4.926798744707943e-06, "loss": 0.0475, "mean_token_accuracy": 0.9844791740179062, "num_tokens": 57663192.0, "step": 543 }, { "entropy": 0.9831148833036423, "epoch": 1.2394526795895098, "grad_norm": 1.3828125, "learning_rate": 4.926345621779106e-06, "loss": 0.0534, "mean_token_accuracy": 0.9846959114074707, "num_tokens": 57769811.0, "step": 544 }, { "entropy": 0.9873417764902115, "epoch": 1.241733181299886, "grad_norm": 1.234375, "learning_rate": 4.9258911217033355e-06, "loss": 0.0425, "mean_token_accuracy": 0.9865515977144241, "num_tokens": 57875902.0, "step": 545 }, { "entropy": 0.9872582852840424, "epoch": 1.2440136830102622, "grad_norm": 1.2890625, "learning_rate": 4.925435244738599e-06, "loss": 0.0389, "mean_token_accuracy": 0.9864893108606339, "num_tokens": 57982139.0, "step": 546 }, { "entropy": 0.9836206883192062, "epoch": 1.2462941847206386, "grad_norm": 1.0703125, "learning_rate": 4.924977991143642e-06, "loss": 0.0362, "mean_token_accuracy": 0.9901499003171921, "num_tokens": 58089329.0, "step": 547 }, { "entropy": 0.9854682832956314, "epoch": 1.2485746864310148, "grad_norm": 1.203125, "learning_rate": 4.924519361177993e-06, "loss": 0.0477, "mean_token_accuracy": 0.984602615237236, "num_tokens": 58196024.0, "step": 548 }, { "entropy": 0.9885090738534927, "epoch": 1.2508551881413912, "grad_norm": 1.390625, "learning_rate": 4.9240593551019625e-06, "loss": 0.0581, "mean_token_accuracy": 0.9843998700380325, "num_tokens": 58302945.0, "step": 549 }, { "entropy": 0.9896544069051743, "epoch": 1.2531356898517674, "grad_norm": 2.0625, "learning_rate": 4.92359797317664e-06, "loss": 0.0443, "mean_token_accuracy": 0.9877608716487885, "num_tokens": 58409255.0, "step": 550 }, { "entropy": 0.9790240675210953, "epoch": 1.2554161915621438, "grad_norm": 1.265625, "learning_rate": 4.923135215663897e-06, "loss": 0.0467, "mean_token_accuracy": 0.9867167472839355, "num_tokens": 58515421.0, "step": 551 }, { "entropy": 0.9928353279829025, "epoch": 1.25769669327252, "grad_norm": 2.125, "learning_rate": 4.922671082826386e-06, "loss": 0.0496, "mean_token_accuracy": 0.9844487607479095, "num_tokens": 58621414.0, "step": 552 }, { "entropy": 0.9881910532712936, "epoch": 1.2599771949828962, "grad_norm": 1.6796875, "learning_rate": 4.92220557492754e-06, "loss": 0.0578, "mean_token_accuracy": 0.9852906465530396, "num_tokens": 58727821.0, "step": 553 }, { "entropy": 0.9884717017412186, "epoch": 1.2622576966932726, "grad_norm": 1.109375, "learning_rate": 4.921738692231572e-06, "loss": 0.0355, "mean_token_accuracy": 0.9894751757383347, "num_tokens": 58833956.0, "step": 554 }, { "entropy": 0.9794674664735794, "epoch": 1.2645381984036488, "grad_norm": 1.421875, "learning_rate": 4.9212704350034764e-06, "loss": 0.0454, "mean_token_accuracy": 0.9868530482053757, "num_tokens": 58940100.0, "step": 555 }, { "entropy": 0.9812622219324112, "epoch": 1.2668187001140252, "grad_norm": 1.2734375, "learning_rate": 4.920800803509026e-06, "loss": 0.0437, "mean_token_accuracy": 0.9863903224468231, "num_tokens": 59045982.0, "step": 556 }, { "entropy": 0.9846433252096176, "epoch": 1.2690992018244014, "grad_norm": 1.34375, "learning_rate": 4.920329798014775e-06, "loss": 0.0372, "mean_token_accuracy": 0.9873294532299042, "num_tokens": 59152474.0, "step": 557 }, { "entropy": 0.9862755686044693, "epoch": 1.2713797035347776, "grad_norm": 1.6796875, "learning_rate": 4.919857418788056e-06, "loss": 0.0492, "mean_token_accuracy": 0.985272228717804, "num_tokens": 59258392.0, "step": 558 }, { "entropy": 0.987528994679451, "epoch": 1.273660205245154, "grad_norm": 1.3046875, "learning_rate": 4.919383666096985e-06, "loss": 0.0503, "mean_token_accuracy": 0.9844127893447876, "num_tokens": 59364591.0, "step": 559 }, { "entropy": 0.9889151751995087, "epoch": 1.2759407069555302, "grad_norm": 1.8125, "learning_rate": 4.918908540210452e-06, "loss": 0.0489, "mean_token_accuracy": 0.9862928688526154, "num_tokens": 59471229.0, "step": 560 }, { "entropy": 0.9836794435977936, "epoch": 1.2782212086659066, "grad_norm": 1.65625, "learning_rate": 4.91843204139813e-06, "loss": 0.0392, "mean_token_accuracy": 0.987623929977417, "num_tokens": 59577670.0, "step": 561 }, { "entropy": 0.9865290373563766, "epoch": 1.2805017103762828, "grad_norm": 1.5703125, "learning_rate": 4.917954169930472e-06, "loss": 0.0443, "mean_token_accuracy": 0.9862612932920456, "num_tokens": 59684089.0, "step": 562 }, { "entropy": 0.992164671421051, "epoch": 1.282782212086659, "grad_norm": 1.625, "learning_rate": 4.917474926078707e-06, "loss": 0.0663, "mean_token_accuracy": 0.9822495728731155, "num_tokens": 59791258.0, "step": 563 }, { "entropy": 0.98800989985466, "epoch": 1.2850627137970354, "grad_norm": 1.0546875, "learning_rate": 4.916994310114845e-06, "loss": 0.0393, "mean_token_accuracy": 0.9893053472042084, "num_tokens": 59898216.0, "step": 564 }, { "entropy": 0.9866294264793396, "epoch": 1.2873432155074116, "grad_norm": 1.109375, "learning_rate": 4.916512322311675e-06, "loss": 0.0519, "mean_token_accuracy": 0.9849787205457687, "num_tokens": 60004527.0, "step": 565 }, { "entropy": 0.9852918833494186, "epoch": 1.289623717217788, "grad_norm": 1.03125, "learning_rate": 4.916028962942763e-06, "loss": 0.0356, "mean_token_accuracy": 0.9897968024015427, "num_tokens": 60111140.0, "step": 566 }, { "entropy": 0.9881338775157928, "epoch": 1.2919042189281642, "grad_norm": 1.765625, "learning_rate": 4.915544232282455e-06, "loss": 0.0543, "mean_token_accuracy": 0.984331414103508, "num_tokens": 60217711.0, "step": 567 }, { "entropy": 0.9874555617570877, "epoch": 1.2941847206385404, "grad_norm": 1.1796875, "learning_rate": 4.915058130605874e-06, "loss": 0.0435, "mean_token_accuracy": 0.9871149510145187, "num_tokens": 60323853.0, "step": 568 }, { "entropy": 0.9900244325399399, "epoch": 1.2964652223489168, "grad_norm": 1.5703125, "learning_rate": 4.9145706581889235e-06, "loss": 0.0509, "mean_token_accuracy": 0.9858052283525467, "num_tokens": 60429679.0, "step": 569 }, { "entropy": 0.9844519048929214, "epoch": 1.298745724059293, "grad_norm": 1.0625, "learning_rate": 4.914081815308283e-06, "loss": 0.0368, "mean_token_accuracy": 0.9894878417253494, "num_tokens": 60535815.0, "step": 570 }, { "entropy": 0.9829506576061249, "epoch": 1.3010262257696694, "grad_norm": 1.59375, "learning_rate": 4.913591602241409e-06, "loss": 0.0549, "mean_token_accuracy": 0.9811260104179382, "num_tokens": 60642062.0, "step": 571 }, { "entropy": 0.9875671416521072, "epoch": 1.3033067274800456, "grad_norm": 1.578125, "learning_rate": 4.9131000192665365e-06, "loss": 0.0601, "mean_token_accuracy": 0.9827910512685776, "num_tokens": 60748203.0, "step": 572 }, { "entropy": 0.9808060675859451, "epoch": 1.3055872291904218, "grad_norm": 1.59375, "learning_rate": 4.9126070666626815e-06, "loss": 0.0484, "mean_token_accuracy": 0.9867612570524216, "num_tokens": 60854476.0, "step": 573 }, { "entropy": 0.9811474531888962, "epoch": 1.3078677309007982, "grad_norm": 1.0625, "learning_rate": 4.912112744709632e-06, "loss": 0.0361, "mean_token_accuracy": 0.9890451729297638, "num_tokens": 60960486.0, "step": 574 }, { "entropy": 0.9795915484428406, "epoch": 1.3101482326111744, "grad_norm": 1.34375, "learning_rate": 4.911617053687957e-06, "loss": 0.0505, "mean_token_accuracy": 0.9841840118169785, "num_tokens": 61066842.0, "step": 575 }, { "entropy": 0.9791534096002579, "epoch": 1.3124287343215508, "grad_norm": 1.4375, "learning_rate": 4.911119993878999e-06, "loss": 0.0541, "mean_token_accuracy": 0.9835745692253113, "num_tokens": 61173056.0, "step": 576 }, { "entropy": 0.9816157221794128, "epoch": 1.314709236031927, "grad_norm": 1.2578125, "learning_rate": 4.910621565564882e-06, "loss": 0.0495, "mean_token_accuracy": 0.9850636124610901, "num_tokens": 61279652.0, "step": 577 }, { "entropy": 0.9729274809360504, "epoch": 1.3169897377423032, "grad_norm": 1.46875, "learning_rate": 4.910121769028503e-06, "loss": 0.0492, "mean_token_accuracy": 0.9849403351545334, "num_tokens": 61385918.0, "step": 578 }, { "entropy": 0.9791656732559204, "epoch": 1.3192702394526796, "grad_norm": 1.6015625, "learning_rate": 4.909620604553537e-06, "loss": 0.042, "mean_token_accuracy": 0.9860332608222961, "num_tokens": 61491988.0, "step": 579 }, { "entropy": 0.9780607968568802, "epoch": 1.3215507411630558, "grad_norm": 1.3359375, "learning_rate": 4.909118072424436e-06, "loss": 0.0409, "mean_token_accuracy": 0.9874880462884903, "num_tokens": 61598130.0, "step": 580 }, { "entropy": 0.9761448949575424, "epoch": 1.3238312428734322, "grad_norm": 1.421875, "learning_rate": 4.908614172926426e-06, "loss": 0.0445, "mean_token_accuracy": 0.9874280095100403, "num_tokens": 61704475.0, "step": 581 }, { "entropy": 0.9757523089647293, "epoch": 1.3261117445838084, "grad_norm": 1.2109375, "learning_rate": 4.908108906345512e-06, "loss": 0.0403, "mean_token_accuracy": 0.9856415688991547, "num_tokens": 61811200.0, "step": 582 }, { "entropy": 0.9737198948860168, "epoch": 1.3283922462941846, "grad_norm": 1.21875, "learning_rate": 4.907602272968473e-06, "loss": 0.0361, "mean_token_accuracy": 0.9878649115562439, "num_tokens": 61917080.0, "step": 583 }, { "entropy": 0.9727436155080795, "epoch": 1.330672748004561, "grad_norm": 1.3828125, "learning_rate": 4.907094273082865e-06, "loss": 0.0444, "mean_token_accuracy": 0.987605944275856, "num_tokens": 62022996.0, "step": 584 }, { "entropy": 0.9732117354869843, "epoch": 1.3329532497149372, "grad_norm": 1.6015625, "learning_rate": 4.906584906977018e-06, "loss": 0.0517, "mean_token_accuracy": 0.9851914197206497, "num_tokens": 62129844.0, "step": 585 }, { "entropy": 0.9711791574954987, "epoch": 1.3352337514253136, "grad_norm": 1.2578125, "learning_rate": 4.906074174940038e-06, "loss": 0.046, "mean_token_accuracy": 0.986432820558548, "num_tokens": 62236084.0, "step": 586 }, { "entropy": 0.9672954380512238, "epoch": 1.3375142531356898, "grad_norm": 1.34375, "learning_rate": 4.905562077261808e-06, "loss": 0.0482, "mean_token_accuracy": 0.9860783517360687, "num_tokens": 62342023.0, "step": 587 }, { "entropy": 0.969301626086235, "epoch": 1.339794754846066, "grad_norm": 1.5390625, "learning_rate": 4.905048614232984e-06, "loss": 0.046, "mean_token_accuracy": 0.985334187746048, "num_tokens": 62448291.0, "step": 588 }, { "entropy": 0.9700509607791901, "epoch": 1.3420752565564424, "grad_norm": 1.6640625, "learning_rate": 4.904533786144998e-06, "loss": 0.0554, "mean_token_accuracy": 0.9854261726140976, "num_tokens": 62554891.0, "step": 589 }, { "entropy": 0.9772413372993469, "epoch": 1.3443557582668186, "grad_norm": 1.1328125, "learning_rate": 4.904017593290056e-06, "loss": 0.0385, "mean_token_accuracy": 0.9871950149536133, "num_tokens": 62660797.0, "step": 590 }, { "entropy": 0.9788028299808502, "epoch": 1.346636259977195, "grad_norm": 1.453125, "learning_rate": 4.903500035961139e-06, "loss": 0.0499, "mean_token_accuracy": 0.9865570366382599, "num_tokens": 62767610.0, "step": 591 }, { "entropy": 0.9704584032297134, "epoch": 1.3489167616875712, "grad_norm": 1.421875, "learning_rate": 4.902981114452005e-06, "loss": 0.0493, "mean_token_accuracy": 0.9858876913785934, "num_tokens": 62873815.0, "step": 592 }, { "entropy": 0.9771107584238052, "epoch": 1.3511972633979474, "grad_norm": 1.46875, "learning_rate": 4.90246082905718e-06, "loss": 0.0587, "mean_token_accuracy": 0.9846887439489365, "num_tokens": 62980012.0, "step": 593 }, { "entropy": 0.9829419404268265, "epoch": 1.3534777651083238, "grad_norm": 1.21875, "learning_rate": 4.90193918007197e-06, "loss": 0.0386, "mean_token_accuracy": 0.9879123568534851, "num_tokens": 63086268.0, "step": 594 }, { "entropy": 0.9826843291521072, "epoch": 1.3557582668187, "grad_norm": 1.3203125, "learning_rate": 4.901416167792452e-06, "loss": 0.0509, "mean_token_accuracy": 0.9842789322137833, "num_tokens": 63192666.0, "step": 595 }, { "entropy": 0.9840848743915558, "epoch": 1.3580387685290765, "grad_norm": 1.234375, "learning_rate": 4.9008917925154795e-06, "loss": 0.0435, "mean_token_accuracy": 0.9851053655147552, "num_tokens": 63299308.0, "step": 596 }, { "entropy": 0.9760123491287231, "epoch": 1.3603192702394526, "grad_norm": 1.15625, "learning_rate": 4.900366054538675e-06, "loss": 0.0331, "mean_token_accuracy": 0.9891353398561478, "num_tokens": 63405086.0, "step": 597 }, { "entropy": 0.9799070358276367, "epoch": 1.3625997719498288, "grad_norm": 1.1953125, "learning_rate": 4.8998389541604405e-06, "loss": 0.0392, "mean_token_accuracy": 0.9894413203001022, "num_tokens": 63511357.0, "step": 598 }, { "entropy": 0.9821567237377167, "epoch": 1.3648802736602053, "grad_norm": 1.1015625, "learning_rate": 4.899310491679945e-06, "loss": 0.0441, "mean_token_accuracy": 0.9870839715003967, "num_tokens": 63617738.0, "step": 599 }, { "entropy": 0.9835236966609955, "epoch": 1.3671607753705814, "grad_norm": 1.421875, "learning_rate": 4.898780667397136e-06, "loss": 0.0482, "mean_token_accuracy": 0.9867229461669922, "num_tokens": 63724187.0, "step": 600 }, { "entropy": 0.9833615273237228, "epoch": 1.3694412770809579, "grad_norm": 1.1796875, "learning_rate": 4.89824948161273e-06, "loss": 0.0494, "mean_token_accuracy": 0.9867308437824249, "num_tokens": 63830385.0, "step": 601 }, { "entropy": 0.9881637394428253, "epoch": 1.371721778791334, "grad_norm": 1.640625, "learning_rate": 4.8977169346282184e-06, "loss": 0.0415, "mean_token_accuracy": 0.9864300191402435, "num_tokens": 63936550.0, "step": 602 }, { "entropy": 0.9731858521699905, "epoch": 1.3740022805017102, "grad_norm": 1.3203125, "learning_rate": 4.8971830267458645e-06, "loss": 0.0471, "mean_token_accuracy": 0.987165167927742, "num_tokens": 64043064.0, "step": 603 }, { "entropy": 0.9781176894903183, "epoch": 1.3762827822120867, "grad_norm": 1.234375, "learning_rate": 4.896647758268703e-06, "loss": 0.0375, "mean_token_accuracy": 0.986235037446022, "num_tokens": 64149456.0, "step": 604 }, { "entropy": 0.9795133769512177, "epoch": 1.378563283922463, "grad_norm": 1.234375, "learning_rate": 4.8961111295005444e-06, "loss": 0.0463, "mean_token_accuracy": 0.9864026755094528, "num_tokens": 64255349.0, "step": 605 }, { "entropy": 0.9801002591848373, "epoch": 1.3808437856328393, "grad_norm": 1.3671875, "learning_rate": 4.895573140745967e-06, "loss": 0.0505, "mean_token_accuracy": 0.9843447804450989, "num_tokens": 64361505.0, "step": 606 }, { "entropy": 0.9772611111402512, "epoch": 1.3831242873432155, "grad_norm": 1.3046875, "learning_rate": 4.895033792310323e-06, "loss": 0.0454, "mean_token_accuracy": 0.9865716844797134, "num_tokens": 64467925.0, "step": 607 }, { "entropy": 0.9781694114208221, "epoch": 1.3854047890535917, "grad_norm": 1.234375, "learning_rate": 4.894493084499736e-06, "loss": 0.0533, "mean_token_accuracy": 0.9822942614555359, "num_tokens": 64574365.0, "step": 608 }, { "entropy": 0.9828665405511856, "epoch": 1.387685290763968, "grad_norm": 1.296875, "learning_rate": 4.893951017621103e-06, "loss": 0.0501, "mean_token_accuracy": 0.985865443944931, "num_tokens": 64681284.0, "step": 609 }, { "entropy": 0.9818078130483627, "epoch": 1.3899657924743445, "grad_norm": 1.359375, "learning_rate": 4.893407591982088e-06, "loss": 0.0455, "mean_token_accuracy": 0.9839897304773331, "num_tokens": 64786973.0, "step": 610 }, { "entropy": 0.976662740111351, "epoch": 1.3922462941847207, "grad_norm": 1.390625, "learning_rate": 4.892862807891131e-06, "loss": 0.0529, "mean_token_accuracy": 0.9843283295631409, "num_tokens": 64893392.0, "step": 611 }, { "entropy": 0.9824441373348236, "epoch": 1.3945267958950969, "grad_norm": 1.4296875, "learning_rate": 4.89231666565744e-06, "loss": 0.0432, "mean_token_accuracy": 0.9859775453805923, "num_tokens": 64999815.0, "step": 612 }, { "entropy": 0.9791814684867859, "epoch": 1.3968072976054733, "grad_norm": 1.3984375, "learning_rate": 4.891769165590995e-06, "loss": 0.053, "mean_token_accuracy": 0.9859975278377533, "num_tokens": 65106466.0, "step": 613 }, { "entropy": 0.9795537441968918, "epoch": 1.3990877993158495, "grad_norm": 1.5546875, "learning_rate": 4.891220308002547e-06, "loss": 0.0504, "mean_token_accuracy": 0.9859340041875839, "num_tokens": 65212277.0, "step": 614 }, { "entropy": 0.9848696291446686, "epoch": 1.401368301026226, "grad_norm": 0.9609375, "learning_rate": 4.890670093203617e-06, "loss": 0.0352, "mean_token_accuracy": 0.9870152175426483, "num_tokens": 65318023.0, "step": 615 }, { "entropy": 0.9828765690326691, "epoch": 1.403648802736602, "grad_norm": 1.5078125, "learning_rate": 4.890118521506494e-06, "loss": 0.0467, "mean_token_accuracy": 0.9860928952693939, "num_tokens": 65423909.0, "step": 616 }, { "entropy": 0.9815609902143478, "epoch": 1.4059293044469783, "grad_norm": 1.140625, "learning_rate": 4.889565593224242e-06, "loss": 0.0423, "mean_token_accuracy": 0.9870648235082626, "num_tokens": 65529809.0, "step": 617 }, { "entropy": 0.978388100862503, "epoch": 1.4082098061573547, "grad_norm": 1.1640625, "learning_rate": 4.889011308670693e-06, "loss": 0.0403, "mean_token_accuracy": 0.9880014359951019, "num_tokens": 65635809.0, "step": 618 }, { "entropy": 0.9747432172298431, "epoch": 1.4104903078677309, "grad_norm": 1.34375, "learning_rate": 4.8884556681604445e-06, "loss": 0.0454, "mean_token_accuracy": 0.9868259280920029, "num_tokens": 65742419.0, "step": 619 }, { "entropy": 0.9840758144855499, "epoch": 1.4127708095781073, "grad_norm": 1.34375, "learning_rate": 4.8878986720088715e-06, "loss": 0.0449, "mean_token_accuracy": 0.9864968955516815, "num_tokens": 65848494.0, "step": 620 }, { "entropy": 0.9834176450967789, "epoch": 1.4150513112884835, "grad_norm": 1.328125, "learning_rate": 4.8873403205321115e-06, "loss": 0.0529, "mean_token_accuracy": 0.9852334260940552, "num_tokens": 65955128.0, "step": 621 }, { "entropy": 0.9759757816791534, "epoch": 1.4173318129988597, "grad_norm": 1.1796875, "learning_rate": 4.886780614047075e-06, "loss": 0.0459, "mean_token_accuracy": 0.9841852337121964, "num_tokens": 66061614.0, "step": 622 }, { "entropy": 0.9837797284126282, "epoch": 1.419612314709236, "grad_norm": 1.34375, "learning_rate": 4.886219552871441e-06, "loss": 0.0562, "mean_token_accuracy": 0.9830678552389145, "num_tokens": 66168404.0, "step": 623 }, { "entropy": 0.9806897342205048, "epoch": 1.4218928164196123, "grad_norm": 1.609375, "learning_rate": 4.885657137323656e-06, "loss": 0.0463, "mean_token_accuracy": 0.9857884496450424, "num_tokens": 66274984.0, "step": 624 }, { "entropy": 0.9839248359203339, "epoch": 1.4241733181299887, "grad_norm": 1.21875, "learning_rate": 4.885093367722937e-06, "loss": 0.0348, "mean_token_accuracy": 0.9898484200239182, "num_tokens": 66381524.0, "step": 625 }, { "entropy": 0.9826979488134384, "epoch": 1.426453819840365, "grad_norm": 1.28125, "learning_rate": 4.884528244389269e-06, "loss": 0.0403, "mean_token_accuracy": 0.9872270971536636, "num_tokens": 66487477.0, "step": 626 }, { "entropy": 0.985921710729599, "epoch": 1.428734321550741, "grad_norm": 1.234375, "learning_rate": 4.883961767643404e-06, "loss": 0.0513, "mean_token_accuracy": 0.9826608151197433, "num_tokens": 66593839.0, "step": 627 }, { "entropy": 0.9850452244281769, "epoch": 1.4310148232611175, "grad_norm": 1.40625, "learning_rate": 4.883393937806864e-06, "loss": 0.0506, "mean_token_accuracy": 0.9839091449975967, "num_tokens": 66700400.0, "step": 628 }, { "entropy": 0.9795786887407303, "epoch": 1.4332953249714937, "grad_norm": 1.28125, "learning_rate": 4.882824755201938e-06, "loss": 0.0527, "mean_token_accuracy": 0.9856137931346893, "num_tokens": 66807164.0, "step": 629 }, { "entropy": 0.9784361869096756, "epoch": 1.4355758266818701, "grad_norm": 1.171875, "learning_rate": 4.8822542201516835e-06, "loss": 0.0414, "mean_token_accuracy": 0.9879076331853867, "num_tokens": 66913564.0, "step": 630 }, { "entropy": 0.9799492806196213, "epoch": 1.4378563283922463, "grad_norm": 1.203125, "learning_rate": 4.881682332979925e-06, "loss": 0.0546, "mean_token_accuracy": 0.9803657829761505, "num_tokens": 67020630.0, "step": 631 }, { "entropy": 0.9749458581209183, "epoch": 1.4401368301026225, "grad_norm": 1.4921875, "learning_rate": 4.881109094011254e-06, "loss": 0.0533, "mean_token_accuracy": 0.984637588262558, "num_tokens": 67127640.0, "step": 632 }, { "entropy": 0.9810019284486771, "epoch": 1.442417331812999, "grad_norm": 1.2421875, "learning_rate": 4.88053450357103e-06, "loss": 0.0459, "mean_token_accuracy": 0.9871197193861008, "num_tokens": 67234187.0, "step": 633 }, { "entropy": 0.984291136264801, "epoch": 1.444697833523375, "grad_norm": 1.328125, "learning_rate": 4.87995856198538e-06, "loss": 0.0462, "mean_token_accuracy": 0.9859982132911682, "num_tokens": 67340062.0, "step": 634 }, { "entropy": 0.9820298850536346, "epoch": 1.4469783352337515, "grad_norm": 1.3203125, "learning_rate": 4.879381269581197e-06, "loss": 0.0406, "mean_token_accuracy": 0.9880660623311996, "num_tokens": 67446088.0, "step": 635 }, { "entropy": 0.9794302880764008, "epoch": 1.4492588369441277, "grad_norm": 1.0234375, "learning_rate": 4.878802626686141e-06, "loss": 0.0362, "mean_token_accuracy": 0.9896172434091568, "num_tokens": 67552364.0, "step": 636 }, { "entropy": 0.9800821095705032, "epoch": 1.451539338654504, "grad_norm": 1.21875, "learning_rate": 4.8782226336286395e-06, "loss": 0.0472, "mean_token_accuracy": 0.9833100736141205, "num_tokens": 67658779.0, "step": 637 }, { "entropy": 0.9811799228191376, "epoch": 1.4538198403648803, "grad_norm": 1.0078125, "learning_rate": 4.8776412907378845e-06, "loss": 0.0344, "mean_token_accuracy": 0.9893343448638916, "num_tokens": 67765176.0, "step": 638 }, { "entropy": 0.9803298860788345, "epoch": 1.4561003420752565, "grad_norm": 1.4453125, "learning_rate": 4.877058598343835e-06, "loss": 0.0524, "mean_token_accuracy": 0.9843551963567734, "num_tokens": 67872117.0, "step": 639 }, { "entropy": 0.9799992442131042, "epoch": 1.458380843785633, "grad_norm": 1.2421875, "learning_rate": 4.876474556777216e-06, "loss": 0.0443, "mean_token_accuracy": 0.9876337349414825, "num_tokens": 67978774.0, "step": 640 }, { "entropy": 0.9770323783159256, "epoch": 1.4606613454960091, "grad_norm": 1.109375, "learning_rate": 4.8758891663695165e-06, "loss": 0.0293, "mean_token_accuracy": 0.9914994537830353, "num_tokens": 68085091.0, "step": 641 }, { "entropy": 0.9767118841409683, "epoch": 1.4629418472063853, "grad_norm": 1.2265625, "learning_rate": 4.875302427452996e-06, "loss": 0.0546, "mean_token_accuracy": 0.98292276263237, "num_tokens": 68190999.0, "step": 642 }, { "entropy": 0.9793314784765244, "epoch": 1.4652223489167617, "grad_norm": 1.203125, "learning_rate": 4.874714340360674e-06, "loss": 0.0465, "mean_token_accuracy": 0.988775223493576, "num_tokens": 68297826.0, "step": 643 }, { "entropy": 0.9827540665864944, "epoch": 1.467502850627138, "grad_norm": 1.296875, "learning_rate": 4.874124905426339e-06, "loss": 0.0388, "mean_token_accuracy": 0.9889181554317474, "num_tokens": 68404306.0, "step": 644 }, { "entropy": 0.9804134517908096, "epoch": 1.4697833523375143, "grad_norm": 1.4609375, "learning_rate": 4.873534122984541e-06, "loss": 0.0497, "mean_token_accuracy": 0.9860327988862991, "num_tokens": 68510102.0, "step": 645 }, { "entropy": 0.9769677668809891, "epoch": 1.4720638540478905, "grad_norm": 0.96484375, "learning_rate": 4.872941993370598e-06, "loss": 0.0322, "mean_token_accuracy": 0.9897413551807404, "num_tokens": 68616371.0, "step": 646 }, { "entropy": 0.9818035960197449, "epoch": 1.4743443557582667, "grad_norm": 1.2890625, "learning_rate": 4.872348516920591e-06, "loss": 0.0577, "mean_token_accuracy": 0.9837579876184464, "num_tokens": 68722420.0, "step": 647 }, { "entropy": 0.9834107905626297, "epoch": 1.4766248574686431, "grad_norm": 0.94140625, "learning_rate": 4.8717536939713665e-06, "loss": 0.0342, "mean_token_accuracy": 0.988848865032196, "num_tokens": 68829161.0, "step": 648 }, { "entropy": 0.9826249927282333, "epoch": 1.4789053591790193, "grad_norm": 1.3671875, "learning_rate": 4.871157524860533e-06, "loss": 0.0452, "mean_token_accuracy": 0.9867520183324814, "num_tokens": 68935583.0, "step": 649 }, { "entropy": 0.9807153791189194, "epoch": 1.4811858608893957, "grad_norm": 1.1015625, "learning_rate": 4.870560009926465e-06, "loss": 0.0368, "mean_token_accuracy": 0.9894155114889145, "num_tokens": 69042394.0, "step": 650 }, { "entropy": 0.9813119620084763, "epoch": 1.483466362599772, "grad_norm": 1.53125, "learning_rate": 4.869961149508301e-06, "loss": 0.0477, "mean_token_accuracy": 0.9865817576646805, "num_tokens": 69148164.0, "step": 651 }, { "entropy": 0.9798155277967453, "epoch": 1.4857468643101481, "grad_norm": 1.1328125, "learning_rate": 4.869360943945943e-06, "loss": 0.0432, "mean_token_accuracy": 0.9867672920227051, "num_tokens": 69254266.0, "step": 652 }, { "entropy": 0.9817049652338028, "epoch": 1.4880273660205245, "grad_norm": 1.234375, "learning_rate": 4.868759393580054e-06, "loss": 0.0419, "mean_token_accuracy": 0.9874981939792633, "num_tokens": 69360382.0, "step": 653 }, { "entropy": 0.9882735460996628, "epoch": 1.4903078677309007, "grad_norm": 1.6015625, "learning_rate": 4.868156498752066e-06, "loss": 0.055, "mean_token_accuracy": 0.986307829618454, "num_tokens": 69466758.0, "step": 654 }, { "entropy": 0.9898873567581177, "epoch": 1.4925883694412772, "grad_norm": 1.3515625, "learning_rate": 4.8675522598041675e-06, "loss": 0.0431, "mean_token_accuracy": 0.9848946034908295, "num_tokens": 69572720.0, "step": 655 }, { "entropy": 0.9823256582021713, "epoch": 1.4948688711516533, "grad_norm": 1.375, "learning_rate": 4.866946677079314e-06, "loss": 0.0501, "mean_token_accuracy": 0.9841320067644119, "num_tokens": 69679309.0, "step": 656 }, { "entropy": 0.9823461472988129, "epoch": 1.4971493728620295, "grad_norm": 1.46875, "learning_rate": 4.866339750921222e-06, "loss": 0.0587, "mean_token_accuracy": 0.9821277111768723, "num_tokens": 69785448.0, "step": 657 }, { "entropy": 0.9864525347948074, "epoch": 1.499429874572406, "grad_norm": 1.328125, "learning_rate": 4.86573148167437e-06, "loss": 0.0526, "mean_token_accuracy": 0.9849957525730133, "num_tokens": 69892003.0, "step": 658 }, { "entropy": 0.986070841550827, "epoch": 1.5017103762827824, "grad_norm": 1.2734375, "learning_rate": 4.865121869684003e-06, "loss": 0.0418, "mean_token_accuracy": 0.9881172776222229, "num_tokens": 69998843.0, "step": 659 }, { "entropy": 0.9773884266614914, "epoch": 1.5039908779931586, "grad_norm": 1.25, "learning_rate": 4.864510915296122e-06, "loss": 0.0452, "mean_token_accuracy": 0.9850863069295883, "num_tokens": 70105887.0, "step": 660 }, { "epoch": 1.5039908779931586, "eval_entropy": 0.9859027030803404, "eval_loss": 0.04627906531095505, "eval_mean_token_accuracy": 0.9862413676066091, "eval_num_tokens": 70105887.0, "eval_runtime": 66.0648, "eval_samples_per_second": 126.921, "eval_steps_per_second": 3.981, "step": 660 }, { "entropy": 0.9838593006134033, "epoch": 1.5062713797035348, "grad_norm": 1.1796875, "learning_rate": 4.8638986188574955e-06, "loss": 0.0481, "mean_token_accuracy": 0.9861566573381424, "num_tokens": 70212084.0, "step": 661 }, { "entropy": 0.9894915819168091, "epoch": 1.508551881413911, "grad_norm": 1.4375, "learning_rate": 4.863284980715649e-06, "loss": 0.0469, "mean_token_accuracy": 0.9855886250734329, "num_tokens": 70318193.0, "step": 662 }, { "entropy": 0.9855186641216278, "epoch": 1.5108323831242874, "grad_norm": 1.46875, "learning_rate": 4.8626700012188724e-06, "loss": 0.0524, "mean_token_accuracy": 0.982574000954628, "num_tokens": 70424457.0, "step": 663 }, { "entropy": 0.9923011213541031, "epoch": 1.5131128848346638, "grad_norm": 1.0625, "learning_rate": 4.8620536807162164e-06, "loss": 0.0364, "mean_token_accuracy": 0.9891124069690704, "num_tokens": 70530437.0, "step": 664 }, { "entropy": 0.989051565527916, "epoch": 1.51539338654504, "grad_norm": 1.4296875, "learning_rate": 4.861436019557492e-06, "loss": 0.0578, "mean_token_accuracy": 0.9841172993183136, "num_tokens": 70636342.0, "step": 665 }, { "entropy": 0.9854145795106888, "epoch": 1.5176738882554162, "grad_norm": 1.140625, "learning_rate": 4.8608170180932725e-06, "loss": 0.0369, "mean_token_accuracy": 0.9900761395692825, "num_tokens": 70742495.0, "step": 666 }, { "entropy": 0.9859603494405746, "epoch": 1.5199543899657924, "grad_norm": 1.15625, "learning_rate": 4.860196676674891e-06, "loss": 0.0459, "mean_token_accuracy": 0.9867541193962097, "num_tokens": 70848500.0, "step": 667 }, { "entropy": 0.9896578639745712, "epoch": 1.5222348916761688, "grad_norm": 1.09375, "learning_rate": 4.8595749956544414e-06, "loss": 0.0376, "mean_token_accuracy": 0.9897489696741104, "num_tokens": 70954639.0, "step": 668 }, { "entropy": 0.9918593913316727, "epoch": 1.5245153933865452, "grad_norm": 1.3046875, "learning_rate": 4.858951975384777e-06, "loss": 0.0435, "mean_token_accuracy": 0.9880163222551346, "num_tokens": 71060617.0, "step": 669 }, { "entropy": 0.9847344607114792, "epoch": 1.5267958950969214, "grad_norm": 1.140625, "learning_rate": 4.858327616219513e-06, "loss": 0.0494, "mean_token_accuracy": 0.9869774430990219, "num_tokens": 71166909.0, "step": 670 }, { "entropy": 0.9868601411581039, "epoch": 1.5290763968072976, "grad_norm": 1.1171875, "learning_rate": 4.857701918513023e-06, "loss": 0.0409, "mean_token_accuracy": 0.9875942170619965, "num_tokens": 71272753.0, "step": 671 }, { "entropy": 0.993453860282898, "epoch": 1.5313568985176738, "grad_norm": 1.5390625, "learning_rate": 4.857074882620442e-06, "loss": 0.0551, "mean_token_accuracy": 0.9847117960453033, "num_tokens": 71379569.0, "step": 672 }, { "entropy": 0.9826454818248749, "epoch": 1.5336374002280502, "grad_norm": 1.015625, "learning_rate": 4.856446508897662e-06, "loss": 0.0436, "mean_token_accuracy": 0.9873650521039963, "num_tokens": 71486516.0, "step": 673 }, { "entropy": 0.985729843378067, "epoch": 1.5359179019384266, "grad_norm": 1.25, "learning_rate": 4.8558167977013365e-06, "loss": 0.0409, "mean_token_accuracy": 0.9879619777202606, "num_tokens": 71593451.0, "step": 674 }, { "entropy": 0.9918702691793442, "epoch": 1.5381984036488028, "grad_norm": 1.1640625, "learning_rate": 4.8551857493888775e-06, "loss": 0.0321, "mean_token_accuracy": 0.9902060478925705, "num_tokens": 71699343.0, "step": 675 }, { "entropy": 0.9866811782121658, "epoch": 1.540478905359179, "grad_norm": 1.3203125, "learning_rate": 4.854553364318456e-06, "loss": 0.0547, "mean_token_accuracy": 0.9849691390991211, "num_tokens": 71805942.0, "step": 676 }, { "entropy": 0.9882898777723312, "epoch": 1.5427594070695552, "grad_norm": 1.2890625, "learning_rate": 4.8539196428490016e-06, "loss": 0.0425, "mean_token_accuracy": 0.9880412667989731, "num_tokens": 71911809.0, "step": 677 }, { "entropy": 0.9855213016271591, "epoch": 1.5450399087799316, "grad_norm": 1.1484375, "learning_rate": 4.8532845853402015e-06, "loss": 0.0371, "mean_token_accuracy": 0.9896957576274872, "num_tokens": 72017819.0, "step": 678 }, { "entropy": 0.9849673509597778, "epoch": 1.547320410490308, "grad_norm": 1.375, "learning_rate": 4.8526481921525035e-06, "loss": 0.0518, "mean_token_accuracy": 0.9861007779836655, "num_tokens": 72124105.0, "step": 679 }, { "entropy": 0.9880193173885345, "epoch": 1.5496009122006842, "grad_norm": 1.1484375, "learning_rate": 4.85201046364711e-06, "loss": 0.0394, "mean_token_accuracy": 0.98882856965065, "num_tokens": 72230684.0, "step": 680 }, { "entropy": 0.9829340279102325, "epoch": 1.5518814139110604, "grad_norm": 1.25, "learning_rate": 4.851371400185986e-06, "loss": 0.0434, "mean_token_accuracy": 0.9865158647298813, "num_tokens": 72336981.0, "step": 681 }, { "entropy": 0.9799613058567047, "epoch": 1.5541619156214366, "grad_norm": 1.2265625, "learning_rate": 4.85073100213185e-06, "loss": 0.0403, "mean_token_accuracy": 0.9851117730140686, "num_tokens": 72443100.0, "step": 682 }, { "entropy": 0.9868748784065247, "epoch": 1.556442417331813, "grad_norm": 1.1953125, "learning_rate": 4.8500892698481784e-06, "loss": 0.0522, "mean_token_accuracy": 0.9832839965820312, "num_tokens": 72549454.0, "step": 683 }, { "entropy": 0.9862592369318008, "epoch": 1.5587229190421894, "grad_norm": 1.5625, "learning_rate": 4.849446203699209e-06, "loss": 0.0519, "mean_token_accuracy": 0.9832311123609543, "num_tokens": 72655797.0, "step": 684 }, { "entropy": 0.9847282618284225, "epoch": 1.5610034207525656, "grad_norm": 1.71875, "learning_rate": 4.848801804049932e-06, "loss": 0.0526, "mean_token_accuracy": 0.9826249331235886, "num_tokens": 72762181.0, "step": 685 }, { "entropy": 0.9894759356975555, "epoch": 1.5632839224629418, "grad_norm": 1.265625, "learning_rate": 4.848156071266095e-06, "loss": 0.0429, "mean_token_accuracy": 0.9864169210195541, "num_tokens": 72868291.0, "step": 686 }, { "entropy": 0.9880176335573196, "epoch": 1.565564424173318, "grad_norm": 1.6015625, "learning_rate": 4.847509005714207e-06, "loss": 0.0531, "mean_token_accuracy": 0.984017089009285, "num_tokens": 72974229.0, "step": 687 }, { "entropy": 0.9860113561153412, "epoch": 1.5678449258836944, "grad_norm": 1.2421875, "learning_rate": 4.846860607761527e-06, "loss": 0.0441, "mean_token_accuracy": 0.9863083213567734, "num_tokens": 73080267.0, "step": 688 }, { "entropy": 0.9819766730070114, "epoch": 1.5701254275940708, "grad_norm": 1.1328125, "learning_rate": 4.8462108777760734e-06, "loss": 0.0472, "mean_token_accuracy": 0.985945537686348, "num_tokens": 73187231.0, "step": 689 }, { "entropy": 0.9873101860284805, "epoch": 1.572405929304447, "grad_norm": 1.34375, "learning_rate": 4.845559816126622e-06, "loss": 0.0515, "mean_token_accuracy": 0.983524814248085, "num_tokens": 73293491.0, "step": 690 }, { "entropy": 0.9941841512918472, "epoch": 1.5746864310148232, "grad_norm": 0.9921875, "learning_rate": 4.844907423182699e-06, "loss": 0.039, "mean_token_accuracy": 0.989460363984108, "num_tokens": 73399505.0, "step": 691 }, { "entropy": 0.9882229119539261, "epoch": 1.5769669327251994, "grad_norm": 1.5625, "learning_rate": 4.844253699314596e-06, "loss": 0.0559, "mean_token_accuracy": 0.984563410282135, "num_tokens": 73506432.0, "step": 692 }, { "entropy": 0.9846877604722977, "epoch": 1.5792474344355758, "grad_norm": 1.25, "learning_rate": 4.843598644893349e-06, "loss": 0.0369, "mean_token_accuracy": 0.9882936924695969, "num_tokens": 73612556.0, "step": 693 }, { "entropy": 0.9937217235565186, "epoch": 1.5815279361459522, "grad_norm": 1.375, "learning_rate": 4.842942260290757e-06, "loss": 0.0435, "mean_token_accuracy": 0.9862556308507919, "num_tokens": 73719002.0, "step": 694 }, { "entropy": 0.9906664341688156, "epoch": 1.5838084378563284, "grad_norm": 1.7890625, "learning_rate": 4.84228454587937e-06, "loss": 0.0599, "mean_token_accuracy": 0.9827061742544174, "num_tokens": 73825116.0, "step": 695 }, { "entropy": 0.9938293993473053, "epoch": 1.5860889395667046, "grad_norm": 1.1640625, "learning_rate": 4.841625502032495e-06, "loss": 0.0371, "mean_token_accuracy": 0.9887871295213699, "num_tokens": 73931133.0, "step": 696 }, { "entropy": 0.9843335002660751, "epoch": 1.5883694412770808, "grad_norm": 1.3125, "learning_rate": 4.84096512912419e-06, "loss": 0.0464, "mean_token_accuracy": 0.9887007176876068, "num_tokens": 74036939.0, "step": 697 }, { "entropy": 0.9865503758192062, "epoch": 1.5906499429874572, "grad_norm": 1.171875, "learning_rate": 4.8403034275292735e-06, "loss": 0.0465, "mean_token_accuracy": 0.9894467294216156, "num_tokens": 74142982.0, "step": 698 }, { "entropy": 0.9895882606506348, "epoch": 1.5929304446978336, "grad_norm": 1.171875, "learning_rate": 4.839640397623312e-06, "loss": 0.0353, "mean_token_accuracy": 0.9900733232498169, "num_tokens": 74249160.0, "step": 699 }, { "entropy": 0.9879752695560455, "epoch": 1.5952109464082098, "grad_norm": 2.046875, "learning_rate": 4.83897603978263e-06, "loss": 0.048, "mean_token_accuracy": 0.9853450804948807, "num_tokens": 74355454.0, "step": 700 }, { "entropy": 0.9961231052875519, "epoch": 1.597491448118586, "grad_norm": 1.203125, "learning_rate": 4.838310354384304e-06, "loss": 0.0461, "mean_token_accuracy": 0.9866641610860825, "num_tokens": 74461512.0, "step": 701 }, { "entropy": 0.9886191636323929, "epoch": 1.5997719498289624, "grad_norm": 1.0234375, "learning_rate": 4.8376433418061615e-06, "loss": 0.0412, "mean_token_accuracy": 0.9873870760202408, "num_tokens": 74567772.0, "step": 702 }, { "entropy": 0.9916016459465027, "epoch": 1.6020524515393386, "grad_norm": 1.1796875, "learning_rate": 4.8369750024267904e-06, "loss": 0.0377, "mean_token_accuracy": 0.9888146072626114, "num_tokens": 74674367.0, "step": 703 }, { "entropy": 0.9854080229997635, "epoch": 1.604332953249715, "grad_norm": 1.4765625, "learning_rate": 4.836305336625523e-06, "loss": 0.0429, "mean_token_accuracy": 0.9876130521297455, "num_tokens": 74780540.0, "step": 704 }, { "entropy": 0.9950538277626038, "epoch": 1.6066134549600912, "grad_norm": 1.5078125, "learning_rate": 4.835634344782453e-06, "loss": 0.0501, "mean_token_accuracy": 0.985178217291832, "num_tokens": 74886850.0, "step": 705 }, { "entropy": 0.9908217489719391, "epoch": 1.6088939566704674, "grad_norm": 1.3984375, "learning_rate": 4.834962027278418e-06, "loss": 0.0457, "mean_token_accuracy": 0.9870124012231827, "num_tokens": 74993186.0, "step": 706 }, { "entropy": 0.9886568039655685, "epoch": 1.6111744583808438, "grad_norm": 1.390625, "learning_rate": 4.834288384495015e-06, "loss": 0.0458, "mean_token_accuracy": 0.986735537648201, "num_tokens": 75099209.0, "step": 707 }, { "entropy": 0.9837959557771683, "epoch": 1.61345496009122, "grad_norm": 1.2578125, "learning_rate": 4.833613416814591e-06, "loss": 0.0484, "mean_token_accuracy": 0.9860903471708298, "num_tokens": 75205506.0, "step": 708 }, { "entropy": 0.9928357899188995, "epoch": 1.6157354618015964, "grad_norm": 1.1015625, "learning_rate": 4.832937124620243e-06, "loss": 0.0411, "mean_token_accuracy": 0.9862971901893616, "num_tokens": 75312273.0, "step": 709 }, { "entropy": 0.9830140024423599, "epoch": 1.6180159635119726, "grad_norm": 1.0390625, "learning_rate": 4.832259508295822e-06, "loss": 0.0395, "mean_token_accuracy": 0.9892749488353729, "num_tokens": 75418526.0, "step": 710 }, { "entropy": 0.9864770323038101, "epoch": 1.6202964652223488, "grad_norm": 1.1953125, "learning_rate": 4.831580568225931e-06, "loss": 0.0425, "mean_token_accuracy": 0.985965222120285, "num_tokens": 75525402.0, "step": 711 }, { "entropy": 0.9867913275957108, "epoch": 1.6225769669327252, "grad_norm": 1.0703125, "learning_rate": 4.830900304795921e-06, "loss": 0.032, "mean_token_accuracy": 0.9907727241516113, "num_tokens": 75632076.0, "step": 712 }, { "entropy": 0.9806598871946335, "epoch": 1.6248574686431014, "grad_norm": 1.2265625, "learning_rate": 4.8302187183918996e-06, "loss": 0.0475, "mean_token_accuracy": 0.9855496138334274, "num_tokens": 75738441.0, "step": 713 }, { "entropy": 0.9862528294324875, "epoch": 1.6271379703534778, "grad_norm": 1.296875, "learning_rate": 4.8295358094007184e-06, "loss": 0.0424, "mean_token_accuracy": 0.9844648540019989, "num_tokens": 75843839.0, "step": 714 }, { "entropy": 0.9793501049280167, "epoch": 1.629418472063854, "grad_norm": 1.4765625, "learning_rate": 4.828851578209986e-06, "loss": 0.055, "mean_token_accuracy": 0.9846485555171967, "num_tokens": 75950544.0, "step": 715 }, { "entropy": 0.9807504415512085, "epoch": 1.6316989737742302, "grad_norm": 1.296875, "learning_rate": 4.828166025208059e-06, "loss": 0.0412, "mean_token_accuracy": 0.9867251664400101, "num_tokens": 76056855.0, "step": 716 }, { "entropy": 0.9817144423723221, "epoch": 1.6339794754846066, "grad_norm": 1.421875, "learning_rate": 4.8274791507840416e-06, "loss": 0.0519, "mean_token_accuracy": 0.9856670796871185, "num_tokens": 76163201.0, "step": 717 }, { "entropy": 0.9857932329177856, "epoch": 1.636259977194983, "grad_norm": 1.0859375, "learning_rate": 4.826790955327793e-06, "loss": 0.0508, "mean_token_accuracy": 0.9857564717531204, "num_tokens": 76270060.0, "step": 718 }, { "entropy": 0.9813764989376068, "epoch": 1.6385404789053593, "grad_norm": 1.34375, "learning_rate": 4.826101439229918e-06, "loss": 0.0497, "mean_token_accuracy": 0.9852314591407776, "num_tokens": 76375916.0, "step": 719 }, { "entropy": 0.9841171354055405, "epoch": 1.6408209806157354, "grad_norm": 1.515625, "learning_rate": 4.825410602881774e-06, "loss": 0.0494, "mean_token_accuracy": 0.9843982458114624, "num_tokens": 76482032.0, "step": 720 }, { "entropy": 0.9853092432022095, "epoch": 1.6431014823261116, "grad_norm": 1.015625, "learning_rate": 4.824718446675465e-06, "loss": 0.0474, "mean_token_accuracy": 0.9876334369182587, "num_tokens": 76588027.0, "step": 721 }, { "entropy": 0.9818762689828873, "epoch": 1.645381984036488, "grad_norm": 1.109375, "learning_rate": 4.8240249710038455e-06, "loss": 0.0414, "mean_token_accuracy": 0.9852838814258575, "num_tokens": 76694345.0, "step": 722 }, { "entropy": 0.9801706969738007, "epoch": 1.6476624857468645, "grad_norm": 1.09375, "learning_rate": 4.82333017626052e-06, "loss": 0.0353, "mean_token_accuracy": 0.990264892578125, "num_tokens": 76800173.0, "step": 723 }, { "entropy": 0.9869963377714157, "epoch": 1.6499429874572407, "grad_norm": 1.2890625, "learning_rate": 4.82263406283984e-06, "loss": 0.0517, "mean_token_accuracy": 0.9859137982130051, "num_tokens": 76906294.0, "step": 724 }, { "entropy": 0.9855607151985168, "epoch": 1.6522234891676169, "grad_norm": 1.0703125, "learning_rate": 4.821936631136907e-06, "loss": 0.0316, "mean_token_accuracy": 0.9924267083406448, "num_tokens": 77012629.0, "step": 725 }, { "entropy": 0.9860947281122208, "epoch": 1.654503990877993, "grad_norm": 1.328125, "learning_rate": 4.821237881547567e-06, "loss": 0.0483, "mean_token_accuracy": 0.9857582002878189, "num_tokens": 77119889.0, "step": 726 }, { "entropy": 0.9861250966787338, "epoch": 1.6567844925883695, "grad_norm": 1.1171875, "learning_rate": 4.82053781446842e-06, "loss": 0.0391, "mean_token_accuracy": 0.9880025237798691, "num_tokens": 77226133.0, "step": 727 }, { "entropy": 0.9774948060512543, "epoch": 1.6590649942987459, "grad_norm": 1.1796875, "learning_rate": 4.819836430296809e-06, "loss": 0.0378, "mean_token_accuracy": 0.9881469011306763, "num_tokens": 77332213.0, "step": 728 }, { "entropy": 0.9819346964359283, "epoch": 1.661345496009122, "grad_norm": 1.234375, "learning_rate": 4.819133729430826e-06, "loss": 0.056, "mean_token_accuracy": 0.9851277768611908, "num_tokens": 77438574.0, "step": 729 }, { "entropy": 0.9822986572980881, "epoch": 1.6636259977194983, "grad_norm": 1.0, "learning_rate": 4.818429712269312e-06, "loss": 0.0366, "mean_token_accuracy": 0.9912009537220001, "num_tokens": 77545332.0, "step": 730 }, { "entropy": 0.9791626930236816, "epoch": 1.6659064994298745, "grad_norm": 1.171875, "learning_rate": 4.8177243792118515e-06, "loss": 0.0381, "mean_token_accuracy": 0.9872196614742279, "num_tokens": 77651516.0, "step": 731 }, { "entropy": 0.9798251241445541, "epoch": 1.6681870011402509, "grad_norm": 1.5703125, "learning_rate": 4.8170177306587785e-06, "loss": 0.0599, "mean_token_accuracy": 0.9813332110643387, "num_tokens": 77759023.0, "step": 732 }, { "entropy": 0.9767559915781021, "epoch": 1.6704675028506273, "grad_norm": 1.6328125, "learning_rate": 4.8163097670111735e-06, "loss": 0.0605, "mean_token_accuracy": 0.9817634671926498, "num_tokens": 77866101.0, "step": 733 }, { "entropy": 0.9808355271816254, "epoch": 1.6727480045610035, "grad_norm": 1.296875, "learning_rate": 4.815600488670863e-06, "loss": 0.0513, "mean_token_accuracy": 0.9856716990470886, "num_tokens": 77972418.0, "step": 734 }, { "entropy": 0.972852885723114, "epoch": 1.6750285062713797, "grad_norm": 1.046875, "learning_rate": 4.81488989604042e-06, "loss": 0.0354, "mean_token_accuracy": 0.988412082195282, "num_tokens": 78078243.0, "step": 735 }, { "entropy": 0.9828459471464157, "epoch": 1.6773090079817559, "grad_norm": 1.4921875, "learning_rate": 4.814177989523162e-06, "loss": 0.0491, "mean_token_accuracy": 0.9846896529197693, "num_tokens": 78185278.0, "step": 736 }, { "entropy": 0.9801107794046402, "epoch": 1.6795895096921323, "grad_norm": 1.328125, "learning_rate": 4.813464769523154e-06, "loss": 0.0445, "mean_token_accuracy": 0.9862786531448364, "num_tokens": 78291802.0, "step": 737 }, { "entropy": 0.9806272387504578, "epoch": 1.6818700114025087, "grad_norm": 1.3125, "learning_rate": 4.812750236445206e-06, "loss": 0.0445, "mean_token_accuracy": 0.9869152307510376, "num_tokens": 78398659.0, "step": 738 }, { "entropy": 0.9770122468471527, "epoch": 1.6841505131128849, "grad_norm": 1.625, "learning_rate": 4.812034390694874e-06, "loss": 0.0602, "mean_token_accuracy": 0.9835155457258224, "num_tokens": 78505191.0, "step": 739 }, { "entropy": 0.9864220172166824, "epoch": 1.686431014823261, "grad_norm": 1.2734375, "learning_rate": 4.811317232678456e-06, "loss": 0.0356, "mean_token_accuracy": 0.9902741014957428, "num_tokens": 78611443.0, "step": 740 }, { "entropy": 0.9819770902395248, "epoch": 1.6887115165336373, "grad_norm": 1.2109375, "learning_rate": 4.810598762803e-06, "loss": 0.0439, "mean_token_accuracy": 0.9862501472234726, "num_tokens": 78718160.0, "step": 741 }, { "entropy": 0.9879536330699921, "epoch": 1.6909920182440137, "grad_norm": 1.2421875, "learning_rate": 4.809878981476293e-06, "loss": 0.0349, "mean_token_accuracy": 0.9915418326854706, "num_tokens": 78824194.0, "step": 742 }, { "entropy": 0.9861274063587189, "epoch": 1.69327251995439, "grad_norm": 1.3203125, "learning_rate": 4.80915788910687e-06, "loss": 0.0432, "mean_token_accuracy": 0.9868280589580536, "num_tokens": 78930054.0, "step": 743 }, { "entropy": 0.9867687970399857, "epoch": 1.6955530216647663, "grad_norm": 0.9453125, "learning_rate": 4.80843548610401e-06, "loss": 0.0368, "mean_token_accuracy": 0.9900541305541992, "num_tokens": 79036513.0, "step": 744 }, { "entropy": 0.9836286902427673, "epoch": 1.6978335233751425, "grad_norm": 1.328125, "learning_rate": 4.807711772877733e-06, "loss": 0.0394, "mean_token_accuracy": 0.9860865473747253, "num_tokens": 79142365.0, "step": 745 }, { "entropy": 0.9871436506509781, "epoch": 1.7001140250855187, "grad_norm": 1.3046875, "learning_rate": 4.8069867498388066e-06, "loss": 0.0482, "mean_token_accuracy": 0.9848539382219315, "num_tokens": 79248149.0, "step": 746 }, { "entropy": 0.9842555522918701, "epoch": 1.702394526795895, "grad_norm": 1.2421875, "learning_rate": 4.806260417398739e-06, "loss": 0.0442, "mean_token_accuracy": 0.9860708713531494, "num_tokens": 79353996.0, "step": 747 }, { "entropy": 0.9879532903432846, "epoch": 1.7046750285062715, "grad_norm": 1.1796875, "learning_rate": 4.805532775969783e-06, "loss": 0.0304, "mean_token_accuracy": 0.9913208782672882, "num_tokens": 79460210.0, "step": 748 }, { "entropy": 0.9895314276218414, "epoch": 1.7069555302166477, "grad_norm": 1.2734375, "learning_rate": 4.804803825964933e-06, "loss": 0.0412, "mean_token_accuracy": 0.9873016029596329, "num_tokens": 79566392.0, "step": 749 }, { "entropy": 0.9852771311998367, "epoch": 1.709236031927024, "grad_norm": 1.109375, "learning_rate": 4.804073567797928e-06, "loss": 0.042, "mean_token_accuracy": 0.9875580072402954, "num_tokens": 79672806.0, "step": 750 }, { "entropy": 0.9840660989284515, "epoch": 1.7115165336374, "grad_norm": 1.15625, "learning_rate": 4.803342001883247e-06, "loss": 0.0329, "mean_token_accuracy": 0.9898307770490646, "num_tokens": 79779317.0, "step": 751 }, { "entropy": 0.9880010336637497, "epoch": 1.7137970353477765, "grad_norm": 1.5703125, "learning_rate": 4.802609128636113e-06, "loss": 0.0495, "mean_token_accuracy": 0.9857719093561172, "num_tokens": 79885484.0, "step": 752 }, { "entropy": 0.9862550497055054, "epoch": 1.716077537058153, "grad_norm": 1.484375, "learning_rate": 4.801874948472492e-06, "loss": 0.0543, "mean_token_accuracy": 0.9832598865032196, "num_tokens": 79991667.0, "step": 753 }, { "entropy": 0.9794258624315262, "epoch": 1.718358038768529, "grad_norm": 1.4921875, "learning_rate": 4.801139461809089e-06, "loss": 0.0544, "mean_token_accuracy": 0.9824716150760651, "num_tokens": 80098206.0, "step": 754 }, { "entropy": 0.9851735234260559, "epoch": 1.7206385404789053, "grad_norm": 1.2578125, "learning_rate": 4.800402669063353e-06, "loss": 0.0354, "mean_token_accuracy": 0.9892710894346237, "num_tokens": 80203876.0, "step": 755 }, { "entropy": 0.9815828204154968, "epoch": 1.7229190421892815, "grad_norm": 1.2421875, "learning_rate": 4.799664570653473e-06, "loss": 0.0449, "mean_token_accuracy": 0.9863273948431015, "num_tokens": 80310240.0, "step": 756 }, { "entropy": 0.9795661866664886, "epoch": 1.725199543899658, "grad_norm": 1.1640625, "learning_rate": 4.79892516699838e-06, "loss": 0.0369, "mean_token_accuracy": 0.9880826622247696, "num_tokens": 80416629.0, "step": 757 }, { "entropy": 0.9770589023828506, "epoch": 1.7274800456100343, "grad_norm": 1.484375, "learning_rate": 4.798184458517745e-06, "loss": 0.0523, "mean_token_accuracy": 0.9857479333877563, "num_tokens": 80523636.0, "step": 758 }, { "entropy": 0.9783640503883362, "epoch": 1.7297605473204105, "grad_norm": 1.1484375, "learning_rate": 4.797442445631978e-06, "loss": 0.0486, "mean_token_accuracy": 0.9845965802669525, "num_tokens": 80629893.0, "step": 759 }, { "entropy": 0.9824425578117371, "epoch": 1.7320410490307867, "grad_norm": 1.3984375, "learning_rate": 4.7966991287622335e-06, "loss": 0.0467, "mean_token_accuracy": 0.9853549748659134, "num_tokens": 80736013.0, "step": 760 }, { "entropy": 0.9838876128196716, "epoch": 1.734321550741163, "grad_norm": 1.25, "learning_rate": 4.795954508330403e-06, "loss": 0.0443, "mean_token_accuracy": 0.9865567237138748, "num_tokens": 80841922.0, "step": 761 }, { "entropy": 0.9819838553667068, "epoch": 1.7366020524515393, "grad_norm": 1.1484375, "learning_rate": 4.795208584759119e-06, "loss": 0.0304, "mean_token_accuracy": 0.9926751405000687, "num_tokens": 80948451.0, "step": 762 }, { "entropy": 0.9860671013593674, "epoch": 1.7388825541619157, "grad_norm": 1.1171875, "learning_rate": 4.794461358471753e-06, "loss": 0.0338, "mean_token_accuracy": 0.9898345172405243, "num_tokens": 81055059.0, "step": 763 }, { "entropy": 0.9845342040061951, "epoch": 1.741163055872292, "grad_norm": 1.1875, "learning_rate": 4.7937128298924155e-06, "loss": 0.0348, "mean_token_accuracy": 0.9891417324542999, "num_tokens": 81161779.0, "step": 764 }, { "entropy": 0.976081371307373, "epoch": 1.7434435575826681, "grad_norm": 0.8203125, "learning_rate": 4.7929629994459584e-06, "loss": 0.0346, "mean_token_accuracy": 0.9904254227876663, "num_tokens": 81268453.0, "step": 765 }, { "entropy": 0.9800683110952377, "epoch": 1.7457240592930443, "grad_norm": 1.2265625, "learning_rate": 4.792211867557969e-06, "loss": 0.0371, "mean_token_accuracy": 0.9870404005050659, "num_tokens": 81375127.0, "step": 766 }, { "entropy": 0.9822813272476196, "epoch": 1.7480045610034207, "grad_norm": 1.1328125, "learning_rate": 4.7914594346547774e-06, "loss": 0.0322, "mean_token_accuracy": 0.9899706542491913, "num_tokens": 81481201.0, "step": 767 }, { "entropy": 0.9750334620475769, "epoch": 1.7502850627137971, "grad_norm": 0.95703125, "learning_rate": 4.790705701163449e-06, "loss": 0.0325, "mean_token_accuracy": 0.9892334491014481, "num_tokens": 81587280.0, "step": 768 }, { "entropy": 0.9779385030269623, "epoch": 1.7525655644241733, "grad_norm": 1.3203125, "learning_rate": 4.789950667511789e-06, "loss": 0.0502, "mean_token_accuracy": 0.9859482645988464, "num_tokens": 81693772.0, "step": 769 }, { "entropy": 0.9777934402227402, "epoch": 1.7548460661345495, "grad_norm": 1.3828125, "learning_rate": 4.789194334128338e-06, "loss": 0.042, "mean_token_accuracy": 0.988482728600502, "num_tokens": 81800016.0, "step": 770 }, { "entropy": 0.9858877509832382, "epoch": 1.757126567844926, "grad_norm": 1.1796875, "learning_rate": 4.788436701442378e-06, "loss": 0.0414, "mean_token_accuracy": 0.9866744428873062, "num_tokens": 81906699.0, "step": 771 }, { "entropy": 0.9785190671682358, "epoch": 1.7594070695553021, "grad_norm": 1.75, "learning_rate": 4.787677769883926e-06, "loss": 0.0566, "mean_token_accuracy": 0.9831848591566086, "num_tokens": 82013125.0, "step": 772 }, { "entropy": 0.9772975891828537, "epoch": 1.7616875712656785, "grad_norm": 1.3125, "learning_rate": 4.786917539883738e-06, "loss": 0.0397, "mean_token_accuracy": 0.9883663654327393, "num_tokens": 82119514.0, "step": 773 }, { "entropy": 0.979434072971344, "epoch": 1.7639680729760547, "grad_norm": 1.5625, "learning_rate": 4.786156011873304e-06, "loss": 0.062, "mean_token_accuracy": 0.9830264300107956, "num_tokens": 82225657.0, "step": 774 }, { "entropy": 0.9769580513238907, "epoch": 1.766248574686431, "grad_norm": 1.109375, "learning_rate": 4.785393186284854e-06, "loss": 0.037, "mean_token_accuracy": 0.9884229898452759, "num_tokens": 82331579.0, "step": 775 }, { "entropy": 0.9783082008361816, "epoch": 1.7685290763968073, "grad_norm": 1.3125, "learning_rate": 4.784629063551354e-06, "loss": 0.0434, "mean_token_accuracy": 0.984489843249321, "num_tokens": 82437946.0, "step": 776 }, { "entropy": 0.9816767424345016, "epoch": 1.7708095781071835, "grad_norm": 1.3828125, "learning_rate": 4.783863644106502e-06, "loss": 0.0449, "mean_token_accuracy": 0.9854064881801605, "num_tokens": 82544467.0, "step": 777 }, { "entropy": 0.9730943739414215, "epoch": 1.77309007981756, "grad_norm": 1.265625, "learning_rate": 4.783096928384739e-06, "loss": 0.0491, "mean_token_accuracy": 0.9854862689971924, "num_tokens": 82651186.0, "step": 778 }, { "entropy": 0.9829456210136414, "epoch": 1.7753705815279361, "grad_norm": 1.2890625, "learning_rate": 4.782328916821235e-06, "loss": 0.0516, "mean_token_accuracy": 0.9845905154943466, "num_tokens": 82757198.0, "step": 779 }, { "entropy": 0.985258549451828, "epoch": 1.7776510832383123, "grad_norm": 1.21875, "learning_rate": 4.7815596098519004e-06, "loss": 0.049, "mean_token_accuracy": 0.9860012084245682, "num_tokens": 82863511.0, "step": 780 }, { "entropy": 0.979990541934967, "epoch": 1.7799315849486887, "grad_norm": 1.25, "learning_rate": 4.780789007913379e-06, "loss": 0.0546, "mean_token_accuracy": 0.9829631000757217, "num_tokens": 82970046.0, "step": 781 }, { "entropy": 0.9799901396036148, "epoch": 1.782212086659065, "grad_norm": 1.1640625, "learning_rate": 4.780017111443048e-06, "loss": 0.0554, "mean_token_accuracy": 0.9834972321987152, "num_tokens": 83076264.0, "step": 782 }, { "entropy": 0.9851639270782471, "epoch": 1.7844925883694414, "grad_norm": 1.046875, "learning_rate": 4.779243920879023e-06, "loss": 0.0427, "mean_token_accuracy": 0.9880333095788956, "num_tokens": 83182052.0, "step": 783 }, { "entropy": 0.9815797507762909, "epoch": 1.7867730900798175, "grad_norm": 1.6484375, "learning_rate": 4.77846943666015e-06, "loss": 0.0455, "mean_token_accuracy": 0.9859478175640106, "num_tokens": 83288787.0, "step": 784 }, { "entropy": 0.9827895164489746, "epoch": 1.7890535917901937, "grad_norm": 1.359375, "learning_rate": 4.777693659226013e-06, "loss": 0.05, "mean_token_accuracy": 0.9851265996694565, "num_tokens": 83395329.0, "step": 785 }, { "entropy": 0.9841955155134201, "epoch": 1.7913340935005702, "grad_norm": 1.140625, "learning_rate": 4.776916589016928e-06, "loss": 0.041, "mean_token_accuracy": 0.987487867474556, "num_tokens": 83501417.0, "step": 786 }, { "entropy": 0.9860419183969498, "epoch": 1.7936145952109466, "grad_norm": 1.28125, "learning_rate": 4.776138226473944e-06, "loss": 0.0435, "mean_token_accuracy": 0.9878757745027542, "num_tokens": 83607236.0, "step": 787 }, { "entropy": 0.9798759371042252, "epoch": 1.7958950969213228, "grad_norm": 1.5078125, "learning_rate": 4.775358572038845e-06, "loss": 0.052, "mean_token_accuracy": 0.9858628660440445, "num_tokens": 83713717.0, "step": 788 }, { "entropy": 0.9801809638738632, "epoch": 1.798175598631699, "grad_norm": 1.125, "learning_rate": 4.774577626154148e-06, "loss": 0.0483, "mean_token_accuracy": 0.9856576174497604, "num_tokens": 83820198.0, "step": 789 }, { "entropy": 0.9808086007833481, "epoch": 1.8004561003420751, "grad_norm": 1.3359375, "learning_rate": 4.773795389263104e-06, "loss": 0.0488, "mean_token_accuracy": 0.9842071831226349, "num_tokens": 83926731.0, "step": 790 }, { "entropy": 0.9855221211910248, "epoch": 1.8027366020524516, "grad_norm": 1.1171875, "learning_rate": 4.773011861809694e-06, "loss": 0.0489, "mean_token_accuracy": 0.986427441239357, "num_tokens": 84033019.0, "step": 791 }, { "entropy": 0.9828860610723495, "epoch": 1.805017103762828, "grad_norm": 1.4609375, "learning_rate": 4.772227044238632e-06, "loss": 0.0457, "mean_token_accuracy": 0.9855331480503082, "num_tokens": 84139526.0, "step": 792 }, { "entropy": 0.981247216463089, "epoch": 1.8072976054732042, "grad_norm": 1.0625, "learning_rate": 4.771440936995367e-06, "loss": 0.0455, "mean_token_accuracy": 0.985289916396141, "num_tokens": 84245663.0, "step": 793 }, { "entropy": 0.9858391135931015, "epoch": 1.8095781071835804, "grad_norm": 1.046875, "learning_rate": 4.770653540526079e-06, "loss": 0.0396, "mean_token_accuracy": 0.9867228418588638, "num_tokens": 84352046.0, "step": 794 }, { "entropy": 0.9898903965950012, "epoch": 1.8118586088939566, "grad_norm": 1.1328125, "learning_rate": 4.7698648552776785e-06, "loss": 0.0452, "mean_token_accuracy": 0.9873431771993637, "num_tokens": 84458018.0, "step": 795 }, { "entropy": 0.9806401133537292, "epoch": 1.814139110604333, "grad_norm": 1.1640625, "learning_rate": 4.769074881697806e-06, "loss": 0.0337, "mean_token_accuracy": 0.9900524169206619, "num_tokens": 84564558.0, "step": 796 }, { "entropy": 0.9841604083776474, "epoch": 1.8164196123147094, "grad_norm": 1.0390625, "learning_rate": 4.768283620234838e-06, "loss": 0.0384, "mean_token_accuracy": 0.9883437156677246, "num_tokens": 84670813.0, "step": 797 }, { "entropy": 0.9775114208459854, "epoch": 1.8187001140250856, "grad_norm": 1.140625, "learning_rate": 4.767491071337877e-06, "loss": 0.0396, "mean_token_accuracy": 0.9884517341852188, "num_tokens": 84777329.0, "step": 798 }, { "entropy": 0.9760616570711136, "epoch": 1.8209806157354618, "grad_norm": 1.2421875, "learning_rate": 4.766697235456761e-06, "loss": 0.0511, "mean_token_accuracy": 0.9862138777971268, "num_tokens": 84884597.0, "step": 799 }, { "entropy": 0.9836569726467133, "epoch": 1.823261117445838, "grad_norm": 1.328125, "learning_rate": 4.765902113042053e-06, "loss": 0.0493, "mean_token_accuracy": 0.987353652715683, "num_tokens": 84990296.0, "step": 800 }, { "entropy": 0.9792052954435349, "epoch": 1.8255416191562144, "grad_norm": 1.4375, "learning_rate": 4.765105704545052e-06, "loss": 0.0487, "mean_token_accuracy": 0.9839796274900436, "num_tokens": 85096552.0, "step": 801 }, { "entropy": 0.9776153862476349, "epoch": 1.8278221208665908, "grad_norm": 1.1953125, "learning_rate": 4.7643080104177815e-06, "loss": 0.0429, "mean_token_accuracy": 0.9868887960910797, "num_tokens": 85202815.0, "step": 802 }, { "entropy": 0.9776228666305542, "epoch": 1.830102622576967, "grad_norm": 1.5859375, "learning_rate": 4.763509031113e-06, "loss": 0.0582, "mean_token_accuracy": 0.9857947677373886, "num_tokens": 85309474.0, "step": 803 }, { "entropy": 0.9804913848638535, "epoch": 1.8323831242873432, "grad_norm": 1.1640625, "learning_rate": 4.7627087670841894e-06, "loss": 0.0356, "mean_token_accuracy": 0.987692266702652, "num_tokens": 85415678.0, "step": 804 }, { "entropy": 0.9770019948482513, "epoch": 1.8346636259977194, "grad_norm": 1.1015625, "learning_rate": 4.761907218785566e-06, "loss": 0.0371, "mean_token_accuracy": 0.98845574259758, "num_tokens": 85521557.0, "step": 805 }, { "entropy": 0.9816814363002777, "epoch": 1.8369441277080958, "grad_norm": 1.265625, "learning_rate": 4.761104386672074e-06, "loss": 0.0393, "mean_token_accuracy": 0.9856883436441422, "num_tokens": 85628317.0, "step": 806 }, { "entropy": 0.9757161289453506, "epoch": 1.8392246294184722, "grad_norm": 1.4609375, "learning_rate": 4.760300271199384e-06, "loss": 0.0484, "mean_token_accuracy": 0.9868269860744476, "num_tokens": 85734762.0, "step": 807 }, { "entropy": 0.9805223643779755, "epoch": 1.8415051311288484, "grad_norm": 1.1484375, "learning_rate": 4.759494872823896e-06, "loss": 0.0494, "mean_token_accuracy": 0.9879367500543594, "num_tokens": 85840933.0, "step": 808 }, { "entropy": 0.9795517474412918, "epoch": 1.8437856328392246, "grad_norm": 1.109375, "learning_rate": 4.758688192002741e-06, "loss": 0.0343, "mean_token_accuracy": 0.9911162406206131, "num_tokens": 85947488.0, "step": 809 }, { "entropy": 0.9768324196338654, "epoch": 1.8460661345496008, "grad_norm": 1.40625, "learning_rate": 4.757880229193773e-06, "loss": 0.0443, "mean_token_accuracy": 0.9878810942173004, "num_tokens": 86053502.0, "step": 810 }, { "entropy": 0.9745518714189529, "epoch": 1.8483466362599772, "grad_norm": 1.109375, "learning_rate": 4.757070984855577e-06, "loss": 0.0373, "mean_token_accuracy": 0.9883411228656769, "num_tokens": 86159717.0, "step": 811 }, { "entropy": 0.9768142998218536, "epoch": 1.8506271379703536, "grad_norm": 1.3046875, "learning_rate": 4.756260459447465e-06, "loss": 0.048, "mean_token_accuracy": 0.9853675663471222, "num_tokens": 86265733.0, "step": 812 }, { "entropy": 0.9818833470344543, "epoch": 1.8529076396807298, "grad_norm": 1.078125, "learning_rate": 4.755448653429475e-06, "loss": 0.0332, "mean_token_accuracy": 0.9898140579462051, "num_tokens": 86372363.0, "step": 813 }, { "entropy": 0.9774245470762253, "epoch": 1.855188141391106, "grad_norm": 1.375, "learning_rate": 4.754635567262372e-06, "loss": 0.0399, "mean_token_accuracy": 0.9868037402629852, "num_tokens": 86478761.0, "step": 814 }, { "entropy": 0.9774171859025955, "epoch": 1.8574686431014822, "grad_norm": 1.0078125, "learning_rate": 4.753821201407648e-06, "loss": 0.0384, "mean_token_accuracy": 0.9878309071063995, "num_tokens": 86585049.0, "step": 815 }, { "entropy": 0.985477015376091, "epoch": 1.8597491448118586, "grad_norm": 1.21875, "learning_rate": 4.7530055563275225e-06, "loss": 0.0459, "mean_token_accuracy": 0.9862307459115982, "num_tokens": 86691835.0, "step": 816 }, { "entropy": 0.9800645560026169, "epoch": 1.862029646522235, "grad_norm": 1.28125, "learning_rate": 4.7521886324849385e-06, "loss": 0.0515, "mean_token_accuracy": 0.987695649266243, "num_tokens": 86797751.0, "step": 817 }, { "entropy": 0.9817820191383362, "epoch": 1.8643101482326112, "grad_norm": 1.1171875, "learning_rate": 4.751370430343568e-06, "loss": 0.0418, "mean_token_accuracy": 0.9868020415306091, "num_tokens": 86904217.0, "step": 818 }, { "entropy": 0.9755240231752396, "epoch": 1.8665906499429874, "grad_norm": 1.0546875, "learning_rate": 4.750550950367805e-06, "loss": 0.0423, "mean_token_accuracy": 0.9865309000015259, "num_tokens": 87010498.0, "step": 819 }, { "entropy": 0.9808765798807144, "epoch": 1.8688711516533636, "grad_norm": 1.1875, "learning_rate": 4.749730193022771e-06, "loss": 0.0465, "mean_token_accuracy": 0.9887320697307587, "num_tokens": 87116553.0, "step": 820 }, { "entropy": 0.9771013557910919, "epoch": 1.87115165336374, "grad_norm": 1.15625, "learning_rate": 4.748908158774312e-06, "loss": 0.0367, "mean_token_accuracy": 0.9899720847606659, "num_tokens": 87223113.0, "step": 821 }, { "entropy": 0.981809064745903, "epoch": 1.8734321550741164, "grad_norm": 1.078125, "learning_rate": 4.748084848089e-06, "loss": 0.0382, "mean_token_accuracy": 0.9879558086395264, "num_tokens": 87329178.0, "step": 822 }, { "entropy": 0.9781689494848251, "epoch": 1.8757126567844926, "grad_norm": 1.15625, "learning_rate": 4.747260261434128e-06, "loss": 0.0394, "mean_token_accuracy": 0.9867708832025528, "num_tokens": 87435776.0, "step": 823 }, { "entropy": 0.9840404689311981, "epoch": 1.8779931584948688, "grad_norm": 1.1875, "learning_rate": 4.7464343992777175e-06, "loss": 0.0352, "mean_token_accuracy": 0.9895784109830856, "num_tokens": 87541866.0, "step": 824 }, { "entropy": 0.9742622971534729, "epoch": 1.880273660205245, "grad_norm": 1.03125, "learning_rate": 4.74560726208851e-06, "loss": 0.0391, "mean_token_accuracy": 0.9870938062667847, "num_tokens": 87648062.0, "step": 825 }, { "entropy": 0.9788495451211929, "epoch": 1.8825541619156214, "grad_norm": 1.1953125, "learning_rate": 4.744778850335974e-06, "loss": 0.036, "mean_token_accuracy": 0.9880734086036682, "num_tokens": 87754358.0, "step": 826 }, { "entropy": 0.9767916202545166, "epoch": 1.8848346636259978, "grad_norm": 1.2421875, "learning_rate": 4.7439491644903e-06, "loss": 0.0447, "mean_token_accuracy": 0.9885515570640564, "num_tokens": 87860816.0, "step": 827 }, { "entropy": 0.979478046298027, "epoch": 1.887115165336374, "grad_norm": 1.234375, "learning_rate": 4.743118205022402e-06, "loss": 0.0363, "mean_token_accuracy": 0.9877117574214935, "num_tokens": 87967097.0, "step": 828 }, { "entropy": 0.9811704903841019, "epoch": 1.8893956670467502, "grad_norm": 1.3984375, "learning_rate": 4.742285972403915e-06, "loss": 0.0444, "mean_token_accuracy": 0.9873367249965668, "num_tokens": 88073441.0, "step": 829 }, { "entropy": 0.9732038527727127, "epoch": 1.8916761687571264, "grad_norm": 1.2421875, "learning_rate": 4.7414524671071995e-06, "loss": 0.0401, "mean_token_accuracy": 0.9859383404254913, "num_tokens": 88180050.0, "step": 830 }, { "entropy": 0.9793674051761627, "epoch": 1.8939566704675028, "grad_norm": 1.0078125, "learning_rate": 4.7406176896053356e-06, "loss": 0.0268, "mean_token_accuracy": 0.9921271950006485, "num_tokens": 88286258.0, "step": 831 }, { "entropy": 0.9723433405160904, "epoch": 1.8962371721778792, "grad_norm": 1.3671875, "learning_rate": 4.739781640372129e-06, "loss": 0.0479, "mean_token_accuracy": 0.9865561425685883, "num_tokens": 88392703.0, "step": 832 }, { "entropy": 0.9698154777288437, "epoch": 1.8985176738882554, "grad_norm": 1.125, "learning_rate": 4.7389443198821035e-06, "loss": 0.0396, "mean_token_accuracy": 0.9884408861398697, "num_tokens": 88499092.0, "step": 833 }, { "entropy": 0.9767738878726959, "epoch": 1.9007981755986316, "grad_norm": 1.1953125, "learning_rate": 4.738105728610507e-06, "loss": 0.0381, "mean_token_accuracy": 0.9874678403139114, "num_tokens": 88605340.0, "step": 834 }, { "entropy": 0.9752073436975479, "epoch": 1.9030786773090078, "grad_norm": 1.2265625, "learning_rate": 4.737265867033307e-06, "loss": 0.0436, "mean_token_accuracy": 0.9860784113407135, "num_tokens": 88711443.0, "step": 835 }, { "entropy": 0.9761349558830261, "epoch": 1.9053591790193842, "grad_norm": 1.171875, "learning_rate": 4.736424735627193e-06, "loss": 0.0459, "mean_token_accuracy": 0.9867573827505112, "num_tokens": 88817116.0, "step": 836 }, { "entropy": 0.9727744907140732, "epoch": 1.9076396807297606, "grad_norm": 1.359375, "learning_rate": 4.735582334869575e-06, "loss": 0.0495, "mean_token_accuracy": 0.9861758798360825, "num_tokens": 88923446.0, "step": 837 }, { "entropy": 0.9776846319437027, "epoch": 1.9099201824401368, "grad_norm": 0.90234375, "learning_rate": 4.734738665238583e-06, "loss": 0.0311, "mean_token_accuracy": 0.9908521920442581, "num_tokens": 89030003.0, "step": 838 }, { "entropy": 0.9710155874490738, "epoch": 1.912200684150513, "grad_norm": 1.5078125, "learning_rate": 4.733893727213068e-06, "loss": 0.0484, "mean_token_accuracy": 0.9877030402421951, "num_tokens": 89136323.0, "step": 839 }, { "entropy": 0.973404198884964, "epoch": 1.9144811858608894, "grad_norm": 1.0703125, "learning_rate": 4.7330475212726e-06, "loss": 0.0358, "mean_token_accuracy": 0.987859919667244, "num_tokens": 89243305.0, "step": 840 }, { "entropy": 0.9721640795469284, "epoch": 1.9167616875712656, "grad_norm": 1.390625, "learning_rate": 4.73220004789747e-06, "loss": 0.0455, "mean_token_accuracy": 0.9872177392244339, "num_tokens": 89349216.0, "step": 841 }, { "entropy": 0.9737680405378342, "epoch": 1.919042189281642, "grad_norm": 0.953125, "learning_rate": 4.7313513075686875e-06, "loss": 0.0409, "mean_token_accuracy": 0.9884746223688126, "num_tokens": 89455515.0, "step": 842 }, { "entropy": 0.9743280112743378, "epoch": 1.9213226909920182, "grad_norm": 1.2265625, "learning_rate": 4.73050130076798e-06, "loss": 0.0385, "mean_token_accuracy": 0.9883975386619568, "num_tokens": 89562183.0, "step": 843 }, { "entropy": 0.9717733561992645, "epoch": 1.9236031927023944, "grad_norm": 1.125, "learning_rate": 4.729650027977797e-06, "loss": 0.0369, "mean_token_accuracy": 0.9877028167247772, "num_tokens": 89668769.0, "step": 844 }, { "entropy": 0.9719804376363754, "epoch": 1.9258836944127709, "grad_norm": 1.2734375, "learning_rate": 4.728797489681302e-06, "loss": 0.0449, "mean_token_accuracy": 0.9860342592000961, "num_tokens": 89774826.0, "step": 845 }, { "entropy": 0.9776778817176819, "epoch": 1.928164196123147, "grad_norm": 0.96875, "learning_rate": 4.7279436863623805e-06, "loss": 0.0378, "mean_token_accuracy": 0.9897838979959488, "num_tokens": 89881764.0, "step": 846 }, { "entropy": 0.9760612547397614, "epoch": 1.9304446978335235, "grad_norm": 1.390625, "learning_rate": 4.7270886185056355e-06, "loss": 0.0415, "mean_token_accuracy": 0.9894539713859558, "num_tokens": 89988381.0, "step": 847 }, { "entropy": 0.9774655550718307, "epoch": 1.9327251995438997, "grad_norm": 1.203125, "learning_rate": 4.726232286596385e-06, "loss": 0.0371, "mean_token_accuracy": 0.9895890206098557, "num_tokens": 90094821.0, "step": 848 }, { "entropy": 0.9793540239334106, "epoch": 1.9350057012542758, "grad_norm": 1.34375, "learning_rate": 4.725374691120669e-06, "loss": 0.0528, "mean_token_accuracy": 0.9850776195526123, "num_tokens": 90200865.0, "step": 849 }, { "entropy": 0.9794057458639145, "epoch": 1.9372862029646523, "grad_norm": 1.25, "learning_rate": 4.7245158325652396e-06, "loss": 0.0518, "mean_token_accuracy": 0.9828950762748718, "num_tokens": 90307275.0, "step": 850 }, { "entropy": 0.9757974445819855, "epoch": 1.9395667046750285, "grad_norm": 1.640625, "learning_rate": 4.7236557114175705e-06, "loss": 0.0575, "mean_token_accuracy": 0.9819788038730621, "num_tokens": 90413420.0, "step": 851 }, { "entropy": 0.978723555803299, "epoch": 1.9418472063854049, "grad_norm": 1.2890625, "learning_rate": 4.722794328165849e-06, "loss": 0.0466, "mean_token_accuracy": 0.9828702211380005, "num_tokens": 90519744.0, "step": 852 }, { "entropy": 0.975884199142456, "epoch": 1.944127708095781, "grad_norm": 1.03125, "learning_rate": 4.721931683298979e-06, "loss": 0.033, "mean_token_accuracy": 0.9882368743419647, "num_tokens": 90625742.0, "step": 853 }, { "entropy": 0.9794593751430511, "epoch": 1.9464082098061573, "grad_norm": 1.2421875, "learning_rate": 4.721067777306582e-06, "loss": 0.0381, "mean_token_accuracy": 0.9873843938112259, "num_tokens": 90731739.0, "step": 854 }, { "entropy": 0.9795663207769394, "epoch": 1.9486887115165337, "grad_norm": 1.0078125, "learning_rate": 4.7202026106789935e-06, "loss": 0.0417, "mean_token_accuracy": 0.9866738319396973, "num_tokens": 90837872.0, "step": 855 }, { "entropy": 0.9779291301965714, "epoch": 1.95096921322691, "grad_norm": 1.1875, "learning_rate": 4.719336183907266e-06, "loss": 0.0441, "mean_token_accuracy": 0.9843874871730804, "num_tokens": 90944388.0, "step": 856 }, { "entropy": 0.9852243065834045, "epoch": 1.9532497149372863, "grad_norm": 1.0, "learning_rate": 4.718468497483166e-06, "loss": 0.0384, "mean_token_accuracy": 0.9882629811763763, "num_tokens": 91050902.0, "step": 857 }, { "entropy": 0.9813793003559113, "epoch": 1.9555302166476625, "grad_norm": 1.53125, "learning_rate": 4.717599551899177e-06, "loss": 0.0545, "mean_token_accuracy": 0.9851154386997223, "num_tokens": 91157171.0, "step": 858 }, { "entropy": 0.9803758710622787, "epoch": 1.9578107183580387, "grad_norm": 1.2421875, "learning_rate": 4.716729347648494e-06, "loss": 0.0403, "mean_token_accuracy": 0.9895334988832474, "num_tokens": 91263596.0, "step": 859 }, { "entropy": 0.981821209192276, "epoch": 1.960091220068415, "grad_norm": 0.94921875, "learning_rate": 4.71585788522503e-06, "loss": 0.0374, "mean_token_accuracy": 0.9896746426820755, "num_tokens": 91370181.0, "step": 860 }, { "entropy": 0.9830631166696548, "epoch": 1.9623717217787915, "grad_norm": 1.46875, "learning_rate": 4.7149851651234085e-06, "loss": 0.0403, "mean_token_accuracy": 0.9863715767860413, "num_tokens": 91476466.0, "step": 861 }, { "entropy": 0.9737545102834702, "epoch": 1.9646522234891677, "grad_norm": 1.0703125, "learning_rate": 4.714111187838969e-06, "loss": 0.0505, "mean_token_accuracy": 0.9853809475898743, "num_tokens": 91583624.0, "step": 862 }, { "entropy": 0.981592133641243, "epoch": 1.9669327251995439, "grad_norm": 1.0859375, "learning_rate": 4.713235953867764e-06, "loss": 0.0375, "mean_token_accuracy": 0.9856505542993546, "num_tokens": 91689006.0, "step": 863 }, { "entropy": 0.9825274050235748, "epoch": 1.96921322690992, "grad_norm": 1.046875, "learning_rate": 4.712359463706561e-06, "loss": 0.0378, "mean_token_accuracy": 0.9888201206922531, "num_tokens": 91795021.0, "step": 864 }, { "entropy": 0.9793699979782104, "epoch": 1.9714937286202965, "grad_norm": 1.390625, "learning_rate": 4.711481717852837e-06, "loss": 0.0508, "mean_token_accuracy": 0.9841022342443466, "num_tokens": 91901744.0, "step": 865 }, { "entropy": 0.9856456369161606, "epoch": 1.973774230330673, "grad_norm": 1.453125, "learning_rate": 4.710602716804784e-06, "loss": 0.0514, "mean_token_accuracy": 0.9839185178279877, "num_tokens": 92008075.0, "step": 866 }, { "entropy": 0.978681743144989, "epoch": 1.976054732041049, "grad_norm": 1.140625, "learning_rate": 4.709722461061307e-06, "loss": 0.0431, "mean_token_accuracy": 0.989291176199913, "num_tokens": 92114214.0, "step": 867 }, { "entropy": 0.982078030705452, "epoch": 1.9783352337514253, "grad_norm": 1.140625, "learning_rate": 4.70884095112202e-06, "loss": 0.0455, "mean_token_accuracy": 0.9850260466337204, "num_tokens": 92220609.0, "step": 868 }, { "entropy": 0.9859579205513, "epoch": 1.9806157354618015, "grad_norm": 1.0859375, "learning_rate": 4.707958187487254e-06, "loss": 0.0385, "mean_token_accuracy": 0.9886489361524582, "num_tokens": 92327221.0, "step": 869 }, { "entropy": 0.987820953130722, "epoch": 1.982896237172178, "grad_norm": 1.015625, "learning_rate": 4.707074170658046e-06, "loss": 0.0355, "mean_token_accuracy": 0.9895476847887039, "num_tokens": 92432614.0, "step": 870 }, { "entropy": 0.9920127838850021, "epoch": 1.9851767388825543, "grad_norm": 1.0546875, "learning_rate": 4.706188901136148e-06, "loss": 0.0329, "mean_token_accuracy": 0.9905287027359009, "num_tokens": 92539349.0, "step": 871 }, { "entropy": 0.9765962958335876, "epoch": 1.9874572405929305, "grad_norm": 1.28125, "learning_rate": 4.705302379424023e-06, "loss": 0.0515, "mean_token_accuracy": 0.9823716282844543, "num_tokens": 92646019.0, "step": 872 }, { "entropy": 0.9774642288684845, "epoch": 1.9897377423033067, "grad_norm": 1.4375, "learning_rate": 4.704414606024842e-06, "loss": 0.0486, "mean_token_accuracy": 0.9834532737731934, "num_tokens": 92751643.0, "step": 873 }, { "entropy": 0.9821784198284149, "epoch": 1.9920182440136829, "grad_norm": 1.1953125, "learning_rate": 4.703525581442488e-06, "loss": 0.0391, "mean_token_accuracy": 0.9870892912149429, "num_tokens": 92857801.0, "step": 874 }, { "entropy": 0.9786333739757538, "epoch": 1.9942987457240593, "grad_norm": 1.0703125, "learning_rate": 4.702635306181554e-06, "loss": 0.042, "mean_token_accuracy": 0.9878517985343933, "num_tokens": 92963958.0, "step": 875 }, { "entropy": 0.9832588732242584, "epoch": 1.9965792474344357, "grad_norm": 1.1875, "learning_rate": 4.701743780747345e-06, "loss": 0.0458, "mean_token_accuracy": 0.9866780936717987, "num_tokens": 93070702.0, "step": 876 }, { "entropy": 0.9838996976613998, "epoch": 1.998859749144812, "grad_norm": 1.2578125, "learning_rate": 4.700851005645872e-06, "loss": 0.043, "mean_token_accuracy": 0.9864163249731064, "num_tokens": 93177084.0, "step": 877 }, { "entropy": 0.9773348569869995, "epoch": 2.0, "grad_norm": 1.34375, "learning_rate": 4.699956981383857e-06, "loss": 0.0289, "mean_token_accuracy": 0.9930062294006348, "num_tokens": 93215952.0, "step": 878 }, { "entropy": 0.9804229289293289, "epoch": 2.002280501710376, "grad_norm": 1.0859375, "learning_rate": 4.699061708468732e-06, "loss": 0.0381, "mean_token_accuracy": 0.9880535006523132, "num_tokens": 93321951.0, "step": 879 }, { "entropy": 0.9747269004583359, "epoch": 2.0045610034207524, "grad_norm": 1.2109375, "learning_rate": 4.698165187408635e-06, "loss": 0.0419, "mean_token_accuracy": 0.9867187291383743, "num_tokens": 93428822.0, "step": 880 }, { "epoch": 2.0045610034207524, "eval_entropy": 0.9781239599329438, "eval_loss": 0.04288763180375099, "eval_mean_token_accuracy": 0.9870988382132788, "eval_num_tokens": 93428822.0, "eval_runtime": 66.0824, "eval_samples_per_second": 126.887, "eval_steps_per_second": 3.98, "step": 880 }, { "entropy": 0.9795099049806595, "epoch": 2.006841505131129, "grad_norm": 1.078125, "learning_rate": 4.697267418712415e-06, "loss": 0.0365, "mean_token_accuracy": 0.9892882853746414, "num_tokens": 93535129.0, "step": 881 }, { "entropy": 0.9794227629899979, "epoch": 2.009122006841505, "grad_norm": 0.96875, "learning_rate": 4.6963684028896285e-06, "loss": 0.0342, "mean_token_accuracy": 0.9906343668699265, "num_tokens": 93641590.0, "step": 882 }, { "entropy": 0.9793710112571716, "epoch": 2.0114025085518814, "grad_norm": 1.109375, "learning_rate": 4.695468140450539e-06, "loss": 0.0384, "mean_token_accuracy": 0.989298403263092, "num_tokens": 93747995.0, "step": 883 }, { "entropy": 0.9803194850683212, "epoch": 2.0136830102622576, "grad_norm": 1.359375, "learning_rate": 4.6945666319061166e-06, "loss": 0.0358, "mean_token_accuracy": 0.988432839512825, "num_tokens": 93854339.0, "step": 884 }, { "entropy": 0.9765643775463104, "epoch": 2.015963511972634, "grad_norm": 1.3203125, "learning_rate": 4.6936638777680435e-06, "loss": 0.0539, "mean_token_accuracy": 0.9833818674087524, "num_tokens": 93960385.0, "step": 885 }, { "entropy": 0.9752150475978851, "epoch": 2.0182440136830104, "grad_norm": 1.25, "learning_rate": 4.6927598785487026e-06, "loss": 0.0328, "mean_token_accuracy": 0.9883216172456741, "num_tokens": 94066204.0, "step": 886 }, { "entropy": 0.9780465215444565, "epoch": 2.0205245153933866, "grad_norm": 1.5078125, "learning_rate": 4.691854634761188e-06, "loss": 0.0452, "mean_token_accuracy": 0.9872179478406906, "num_tokens": 94173193.0, "step": 887 }, { "entropy": 0.9768876880407333, "epoch": 2.022805017103763, "grad_norm": 1.0234375, "learning_rate": 4.690948146919299e-06, "loss": 0.0358, "mean_token_accuracy": 0.9916546642780304, "num_tokens": 94279521.0, "step": 888 }, { "entropy": 0.9787120074033737, "epoch": 2.025085518814139, "grad_norm": 1.078125, "learning_rate": 4.690040415537538e-06, "loss": 0.0312, "mean_token_accuracy": 0.9898882508277893, "num_tokens": 94385861.0, "step": 889 }, { "entropy": 0.9709342122077942, "epoch": 2.027366020524515, "grad_norm": 1.1875, "learning_rate": 4.689131441131119e-06, "loss": 0.0476, "mean_token_accuracy": 0.9862944334745407, "num_tokens": 94492772.0, "step": 890 }, { "entropy": 0.9712212085723877, "epoch": 2.029646522234892, "grad_norm": 1.2421875, "learning_rate": 4.6882212242159555e-06, "loss": 0.0342, "mean_token_accuracy": 0.988854318857193, "num_tokens": 94599001.0, "step": 891 }, { "entropy": 0.9796290993690491, "epoch": 2.031927023945268, "grad_norm": 1.0390625, "learning_rate": 4.687309765308671e-06, "loss": 0.035, "mean_token_accuracy": 0.9902321249246597, "num_tokens": 94705684.0, "step": 892 }, { "entropy": 0.978389322757721, "epoch": 2.034207525655644, "grad_norm": 1.109375, "learning_rate": 4.6863970649265914e-06, "loss": 0.0388, "mean_token_accuracy": 0.9880534559488297, "num_tokens": 94812343.0, "step": 893 }, { "entropy": 0.9750422388315201, "epoch": 2.0364880273660204, "grad_norm": 0.98828125, "learning_rate": 4.685483123587748e-06, "loss": 0.0297, "mean_token_accuracy": 0.9897147566080093, "num_tokens": 94918678.0, "step": 894 }, { "entropy": 0.9754298627376556, "epoch": 2.0387685290763966, "grad_norm": 0.9609375, "learning_rate": 4.684567941810876e-06, "loss": 0.0299, "mean_token_accuracy": 0.9899564385414124, "num_tokens": 95025190.0, "step": 895 }, { "entropy": 0.9771984666585922, "epoch": 2.0410490307867732, "grad_norm": 0.94140625, "learning_rate": 4.683651520115414e-06, "loss": 0.032, "mean_token_accuracy": 0.9902064949274063, "num_tokens": 95132028.0, "step": 896 }, { "entropy": 0.9739722609519958, "epoch": 2.0433295324971494, "grad_norm": 1.2109375, "learning_rate": 4.682733859021508e-06, "loss": 0.0383, "mean_token_accuracy": 0.9877973943948746, "num_tokens": 95238432.0, "step": 897 }, { "entropy": 0.9751162081956863, "epoch": 2.0456100342075256, "grad_norm": 1.09375, "learning_rate": 4.681814959050002e-06, "loss": 0.0282, "mean_token_accuracy": 0.9899467378854752, "num_tokens": 95345269.0, "step": 898 }, { "entropy": 0.9731719791889191, "epoch": 2.047890535917902, "grad_norm": 1.4375, "learning_rate": 4.680894820722446e-06, "loss": 0.0479, "mean_token_accuracy": 0.9847432225942612, "num_tokens": 95452007.0, "step": 899 }, { "entropy": 0.9745832234621048, "epoch": 2.050171037628278, "grad_norm": 1.1875, "learning_rate": 4.679973444561095e-06, "loss": 0.0405, "mean_token_accuracy": 0.988477498292923, "num_tokens": 95558109.0, "step": 900 }, { "entropy": 0.9740213751792908, "epoch": 2.0524515393386547, "grad_norm": 1.6484375, "learning_rate": 4.679050831088902e-06, "loss": 0.0467, "mean_token_accuracy": 0.9860271066427231, "num_tokens": 95664036.0, "step": 901 }, { "entropy": 0.9660260379314423, "epoch": 2.054732041049031, "grad_norm": 0.95703125, "learning_rate": 4.678126980829525e-06, "loss": 0.0311, "mean_token_accuracy": 0.9908648729324341, "num_tokens": 95770669.0, "step": 902 }, { "entropy": 0.973817765712738, "epoch": 2.057012542759407, "grad_norm": 1.578125, "learning_rate": 4.677201894307325e-06, "loss": 0.0452, "mean_token_accuracy": 0.985239714384079, "num_tokens": 95876707.0, "step": 903 }, { "entropy": 0.9711736738681793, "epoch": 2.0592930444697832, "grad_norm": 1.5390625, "learning_rate": 4.676275572047362e-06, "loss": 0.0482, "mean_token_accuracy": 0.9853266328573227, "num_tokens": 95983016.0, "step": 904 }, { "entropy": 0.9733150452375412, "epoch": 2.0615735461801594, "grad_norm": 1.09375, "learning_rate": 4.675348014575399e-06, "loss": 0.0415, "mean_token_accuracy": 0.9891210794448853, "num_tokens": 96089311.0, "step": 905 }, { "entropy": 0.968235969543457, "epoch": 2.063854047890536, "grad_norm": 1.3046875, "learning_rate": 4.674419222417899e-06, "loss": 0.0473, "mean_token_accuracy": 0.9878730773925781, "num_tokens": 96195434.0, "step": 906 }, { "entropy": 0.9766311794519424, "epoch": 2.0661345496009123, "grad_norm": 1.078125, "learning_rate": 4.673489196102028e-06, "loss": 0.0315, "mean_token_accuracy": 0.9880999326705933, "num_tokens": 96302300.0, "step": 907 }, { "entropy": 0.9700921475887299, "epoch": 2.0684150513112884, "grad_norm": 1.1171875, "learning_rate": 4.67255793615565e-06, "loss": 0.0283, "mean_token_accuracy": 0.989893227815628, "num_tokens": 96408415.0, "step": 908 }, { "entropy": 0.9686842709779739, "epoch": 2.0706955530216646, "grad_norm": 1.1796875, "learning_rate": 4.67162544310733e-06, "loss": 0.0383, "mean_token_accuracy": 0.9882586598396301, "num_tokens": 96514857.0, "step": 909 }, { "entropy": 0.9681156128644943, "epoch": 2.072976054732041, "grad_norm": 1.046875, "learning_rate": 4.670691717486333e-06, "loss": 0.0337, "mean_token_accuracy": 0.9903255999088287, "num_tokens": 96621547.0, "step": 910 }, { "entropy": 0.9785451889038086, "epoch": 2.0752565564424175, "grad_norm": 1.375, "learning_rate": 4.669756759822625e-06, "loss": 0.0503, "mean_token_accuracy": 0.9856112003326416, "num_tokens": 96728037.0, "step": 911 }, { "entropy": 0.9735833704471588, "epoch": 2.0775370581527937, "grad_norm": 0.9375, "learning_rate": 4.668820570646868e-06, "loss": 0.0324, "mean_token_accuracy": 0.9907559603452682, "num_tokens": 96833991.0, "step": 912 }, { "entropy": 0.9744001030921936, "epoch": 2.07981755986317, "grad_norm": 0.92578125, "learning_rate": 4.667883150490427e-06, "loss": 0.0274, "mean_token_accuracy": 0.9917363971471786, "num_tokens": 96939763.0, "step": 913 }, { "entropy": 0.9790977239608765, "epoch": 2.082098061573546, "grad_norm": 1.1953125, "learning_rate": 4.666944499885361e-06, "loss": 0.0445, "mean_token_accuracy": 0.9866353422403336, "num_tokens": 97046131.0, "step": 914 }, { "entropy": 0.9720945358276367, "epoch": 2.0843785632839227, "grad_norm": 1.28125, "learning_rate": 4.6660046193644315e-06, "loss": 0.0381, "mean_token_accuracy": 0.9866390526294708, "num_tokens": 97152421.0, "step": 915 }, { "entropy": 0.9683322459459305, "epoch": 2.086659064994299, "grad_norm": 1.09375, "learning_rate": 4.665063509461098e-06, "loss": 0.0458, "mean_token_accuracy": 0.985177531838417, "num_tokens": 97259020.0, "step": 916 }, { "entropy": 0.9691174179315567, "epoch": 2.088939566704675, "grad_norm": 1.109375, "learning_rate": 4.664121170709512e-06, "loss": 0.0412, "mean_token_accuracy": 0.9876672774553299, "num_tokens": 97365469.0, "step": 917 }, { "entropy": 0.9801434427499771, "epoch": 2.0912200684150513, "grad_norm": 1.1875, "learning_rate": 4.663177603644532e-06, "loss": 0.0428, "mean_token_accuracy": 0.9879299253225327, "num_tokens": 97471722.0, "step": 918 }, { "entropy": 0.9697654545307159, "epoch": 2.0935005701254275, "grad_norm": 1.2890625, "learning_rate": 4.662232808801704e-06, "loss": 0.0417, "mean_token_accuracy": 0.9861269593238831, "num_tokens": 97578106.0, "step": 919 }, { "entropy": 0.9695786237716675, "epoch": 2.095781071835804, "grad_norm": 1.34375, "learning_rate": 4.661286786717278e-06, "loss": 0.0449, "mean_token_accuracy": 0.9849065989255905, "num_tokens": 97684801.0, "step": 920 }, { "entropy": 0.9732540398836136, "epoch": 2.0980615735461803, "grad_norm": 1.0859375, "learning_rate": 4.660339537928198e-06, "loss": 0.0346, "mean_token_accuracy": 0.9885504841804504, "num_tokens": 97790855.0, "step": 921 }, { "entropy": 0.9747906178236008, "epoch": 2.1003420752565565, "grad_norm": 1.078125, "learning_rate": 4.659391062972102e-06, "loss": 0.0399, "mean_token_accuracy": 0.9874437749385834, "num_tokens": 97897495.0, "step": 922 }, { "entropy": 0.9691756218671799, "epoch": 2.1026225769669327, "grad_norm": 1.0625, "learning_rate": 4.658441362387328e-06, "loss": 0.0314, "mean_token_accuracy": 0.990680143237114, "num_tokens": 98003518.0, "step": 923 }, { "entropy": 0.9775384068489075, "epoch": 2.104903078677309, "grad_norm": 1.03125, "learning_rate": 4.657490436712907e-06, "loss": 0.0391, "mean_token_accuracy": 0.9867166429758072, "num_tokens": 98109300.0, "step": 924 }, { "entropy": 0.9670076668262482, "epoch": 2.1071835803876855, "grad_norm": 1.1171875, "learning_rate": 4.6565382864885665e-06, "loss": 0.0268, "mean_token_accuracy": 0.9916305989027023, "num_tokens": 98215858.0, "step": 925 }, { "entropy": 0.9700657576322556, "epoch": 2.1094640820980617, "grad_norm": 0.92578125, "learning_rate": 4.655584912254727e-06, "loss": 0.0354, "mean_token_accuracy": 0.9893134981393814, "num_tokens": 98322282.0, "step": 926 }, { "entropy": 0.9679078608751297, "epoch": 2.111744583808438, "grad_norm": 1.328125, "learning_rate": 4.654630314552508e-06, "loss": 0.0462, "mean_token_accuracy": 0.9849422872066498, "num_tokens": 98428677.0, "step": 927 }, { "entropy": 0.9669574201107025, "epoch": 2.114025085518814, "grad_norm": 1.5, "learning_rate": 4.653674493923718e-06, "loss": 0.0451, "mean_token_accuracy": 0.9863000065088272, "num_tokens": 98535297.0, "step": 928 }, { "entropy": 0.9739602655172348, "epoch": 2.1163055872291903, "grad_norm": 1.203125, "learning_rate": 4.652717450910864e-06, "loss": 0.0525, "mean_token_accuracy": 0.9824361503124237, "num_tokens": 98641764.0, "step": 929 }, { "entropy": 0.9672547280788422, "epoch": 2.118586088939567, "grad_norm": 0.9765625, "learning_rate": 4.651759186057144e-06, "loss": 0.0353, "mean_token_accuracy": 0.9884200245141983, "num_tokens": 98747959.0, "step": 930 }, { "entropy": 0.9716185182332993, "epoch": 2.120866590649943, "grad_norm": 1.3046875, "learning_rate": 4.650799699906452e-06, "loss": 0.0355, "mean_token_accuracy": 0.9892501831054688, "num_tokens": 98854341.0, "step": 931 }, { "entropy": 0.9713221788406372, "epoch": 2.1231470923603193, "grad_norm": 1.03125, "learning_rate": 4.649838993003373e-06, "loss": 0.0451, "mean_token_accuracy": 0.9873639792203903, "num_tokens": 98960953.0, "step": 932 }, { "entropy": 0.9744275808334351, "epoch": 2.1254275940706955, "grad_norm": 1.0234375, "learning_rate": 4.648877065893186e-06, "loss": 0.031, "mean_token_accuracy": 0.9905261695384979, "num_tokens": 99067330.0, "step": 933 }, { "entropy": 0.9711235910654068, "epoch": 2.1277080957810717, "grad_norm": 1.03125, "learning_rate": 4.647913919121861e-06, "loss": 0.028, "mean_token_accuracy": 0.9900173246860504, "num_tokens": 99173975.0, "step": 934 }, { "entropy": 0.9673244208097458, "epoch": 2.1299885974914483, "grad_norm": 1.1953125, "learning_rate": 4.646949553236064e-06, "loss": 0.0351, "mean_token_accuracy": 0.9906974583864212, "num_tokens": 99280479.0, "step": 935 }, { "entropy": 0.9681116938591003, "epoch": 2.1322690992018245, "grad_norm": 1.4140625, "learning_rate": 4.645983968783148e-06, "loss": 0.0345, "mean_token_accuracy": 0.987143948674202, "num_tokens": 99386700.0, "step": 936 }, { "entropy": 0.971301257610321, "epoch": 2.1345496009122007, "grad_norm": 1.3046875, "learning_rate": 4.645017166311163e-06, "loss": 0.046, "mean_token_accuracy": 0.985542044043541, "num_tokens": 99493754.0, "step": 937 }, { "entropy": 0.9663265347480774, "epoch": 2.136830102622577, "grad_norm": 1.2578125, "learning_rate": 4.644049146368844e-06, "loss": 0.0459, "mean_token_accuracy": 0.987193688750267, "num_tokens": 99599548.0, "step": 938 }, { "entropy": 0.9665764719247818, "epoch": 2.139110604332953, "grad_norm": 0.91796875, "learning_rate": 4.643079909505622e-06, "loss": 0.0353, "mean_token_accuracy": 0.988664984703064, "num_tokens": 99705983.0, "step": 939 }, { "entropy": 0.9727141261100769, "epoch": 2.1413911060433297, "grad_norm": 1.2421875, "learning_rate": 4.642109456271618e-06, "loss": 0.0427, "mean_token_accuracy": 0.9860325157642365, "num_tokens": 99812389.0, "step": 940 }, { "entropy": 0.9782813638448715, "epoch": 2.143671607753706, "grad_norm": 1.2734375, "learning_rate": 4.64113778721764e-06, "loss": 0.0372, "mean_token_accuracy": 0.9881032258272171, "num_tokens": 99918499.0, "step": 941 }, { "entropy": 0.9678835570812225, "epoch": 2.145952109464082, "grad_norm": 1.2265625, "learning_rate": 4.640164902895192e-06, "loss": 0.0475, "mean_token_accuracy": 0.9864360243082047, "num_tokens": 100025443.0, "step": 942 }, { "entropy": 0.9698448032140732, "epoch": 2.1482326111744583, "grad_norm": 1.2578125, "learning_rate": 4.6391908038564615e-06, "loss": 0.0367, "mean_token_accuracy": 0.9905203580856323, "num_tokens": 100131788.0, "step": 943 }, { "entropy": 0.9688317030668259, "epoch": 2.1505131128848345, "grad_norm": 1.234375, "learning_rate": 4.6382154906543295e-06, "loss": 0.0321, "mean_token_accuracy": 0.9887078553438187, "num_tokens": 100238328.0, "step": 944 }, { "entropy": 0.9721045792102814, "epoch": 2.152793614595211, "grad_norm": 1.1796875, "learning_rate": 4.637238963842365e-06, "loss": 0.0437, "mean_token_accuracy": 0.987345427274704, "num_tokens": 100344001.0, "step": 945 }, { "entropy": 0.9771005511283875, "epoch": 2.1550741163055873, "grad_norm": 1.0078125, "learning_rate": 4.636261223974826e-06, "loss": 0.0324, "mean_token_accuracy": 0.9887211471796036, "num_tokens": 100450024.0, "step": 946 }, { "entropy": 0.9737095385789871, "epoch": 2.1573546180159635, "grad_norm": 1.171875, "learning_rate": 4.635282271606658e-06, "loss": 0.0349, "mean_token_accuracy": 0.9870777577161789, "num_tokens": 100556317.0, "step": 947 }, { "entropy": 0.9708284884691238, "epoch": 2.1596351197263397, "grad_norm": 1.2265625, "learning_rate": 4.634302107293497e-06, "loss": 0.0449, "mean_token_accuracy": 0.9879834949970245, "num_tokens": 100662068.0, "step": 948 }, { "entropy": 0.9660469442605972, "epoch": 2.161915621436716, "grad_norm": 1.0625, "learning_rate": 4.633320731591663e-06, "loss": 0.031, "mean_token_accuracy": 0.9897365719079971, "num_tokens": 100768748.0, "step": 949 }, { "entropy": 0.9745510965585709, "epoch": 2.1641961231470925, "grad_norm": 1.2421875, "learning_rate": 4.632338145058167e-06, "loss": 0.0431, "mean_token_accuracy": 0.9873800724744797, "num_tokens": 100875214.0, "step": 950 }, { "entropy": 0.975474402308464, "epoch": 2.1664766248574687, "grad_norm": 1.203125, "learning_rate": 4.631354348250706e-06, "loss": 0.0434, "mean_token_accuracy": 0.9889813363552094, "num_tokens": 100981865.0, "step": 951 }, { "entropy": 0.9771051108837128, "epoch": 2.168757126567845, "grad_norm": 1.1015625, "learning_rate": 4.630369341727665e-06, "loss": 0.0358, "mean_token_accuracy": 0.9889795482158661, "num_tokens": 101087783.0, "step": 952 }, { "entropy": 0.973135456442833, "epoch": 2.171037628278221, "grad_norm": 1.0703125, "learning_rate": 4.629383126048114e-06, "loss": 0.0414, "mean_token_accuracy": 0.9877391457557678, "num_tokens": 101194157.0, "step": 953 }, { "entropy": 0.9724164307117462, "epoch": 2.1733181299885973, "grad_norm": 1.0859375, "learning_rate": 4.6283957017718105e-06, "loss": 0.0378, "mean_token_accuracy": 0.9878981858491898, "num_tokens": 101300806.0, "step": 954 }, { "entropy": 0.9706181734800339, "epoch": 2.175598631698974, "grad_norm": 0.9140625, "learning_rate": 4.627407069459196e-06, "loss": 0.0332, "mean_token_accuracy": 0.9885203987360001, "num_tokens": 101406849.0, "step": 955 }, { "entropy": 0.9715875089168549, "epoch": 2.17787913340935, "grad_norm": 1.125, "learning_rate": 4.626417229671401e-06, "loss": 0.0364, "mean_token_accuracy": 0.9881094992160797, "num_tokens": 101513909.0, "step": 956 }, { "entropy": 0.9726030677556992, "epoch": 2.1801596351197263, "grad_norm": 1.1953125, "learning_rate": 4.625426182970237e-06, "loss": 0.0411, "mean_token_accuracy": 0.9858454912900925, "num_tokens": 101619949.0, "step": 957 }, { "entropy": 0.9729648977518082, "epoch": 2.1824401368301025, "grad_norm": 1.6484375, "learning_rate": 4.6244339299182065e-06, "loss": 0.0411, "mean_token_accuracy": 0.9873932301998138, "num_tokens": 101726004.0, "step": 958 }, { "entropy": 0.9736730754375458, "epoch": 2.1847206385404787, "grad_norm": 1.578125, "learning_rate": 4.62344047107849e-06, "loss": 0.0393, "mean_token_accuracy": 0.9863914251327515, "num_tokens": 101832364.0, "step": 959 }, { "entropy": 0.9698660373687744, "epoch": 2.1870011402508553, "grad_norm": 1.203125, "learning_rate": 4.622445807014956e-06, "loss": 0.0387, "mean_token_accuracy": 0.9870862662792206, "num_tokens": 101939266.0, "step": 960 }, { "entropy": 0.9682884514331818, "epoch": 2.1892816419612315, "grad_norm": 1.3359375, "learning_rate": 4.621449938292159e-06, "loss": 0.0576, "mean_token_accuracy": 0.9834570288658142, "num_tokens": 102046274.0, "step": 961 }, { "entropy": 0.965948298573494, "epoch": 2.1915621436716077, "grad_norm": 1.25, "learning_rate": 4.620452865475331e-06, "loss": 0.0374, "mean_token_accuracy": 0.9889321625232697, "num_tokens": 102153009.0, "step": 962 }, { "entropy": 0.9710833728313446, "epoch": 2.193842645381984, "grad_norm": 1.4765625, "learning_rate": 4.6194545891303955e-06, "loss": 0.042, "mean_token_accuracy": 0.9870147109031677, "num_tokens": 102259281.0, "step": 963 }, { "entropy": 0.9709675908088684, "epoch": 2.19612314709236, "grad_norm": 1.46875, "learning_rate": 4.618455109823952e-06, "loss": 0.0423, "mean_token_accuracy": 0.9858052879571915, "num_tokens": 102365324.0, "step": 964 }, { "entropy": 0.9683816432952881, "epoch": 2.1984036488027368, "grad_norm": 1.1484375, "learning_rate": 4.617454428123287e-06, "loss": 0.0314, "mean_token_accuracy": 0.9903790503740311, "num_tokens": 102471072.0, "step": 965 }, { "entropy": 0.9733701795339584, "epoch": 2.200684150513113, "grad_norm": 1.21875, "learning_rate": 4.616452544596367e-06, "loss": 0.0353, "mean_token_accuracy": 0.9875676780939102, "num_tokens": 102577713.0, "step": 966 }, { "entropy": 0.9687923789024353, "epoch": 2.202964652223489, "grad_norm": 1.328125, "learning_rate": 4.615449459811843e-06, "loss": 0.0395, "mean_token_accuracy": 0.9899037927389145, "num_tokens": 102683951.0, "step": 967 }, { "entropy": 0.9655643105506897, "epoch": 2.2052451539338653, "grad_norm": 1.4375, "learning_rate": 4.614445174339045e-06, "loss": 0.0342, "mean_token_accuracy": 0.9889160692691803, "num_tokens": 102790218.0, "step": 968 }, { "entropy": 0.9638230949640274, "epoch": 2.2075256556442415, "grad_norm": 1.0546875, "learning_rate": 4.613439688747988e-06, "loss": 0.0376, "mean_token_accuracy": 0.9902021735906601, "num_tokens": 102896871.0, "step": 969 }, { "entropy": 0.9714786559343338, "epoch": 2.209806157354618, "grad_norm": 1.1953125, "learning_rate": 4.612433003609365e-06, "loss": 0.0402, "mean_token_accuracy": 0.98813097178936, "num_tokens": 103002829.0, "step": 970 }, { "entropy": 0.9638766050338745, "epoch": 2.2120866590649944, "grad_norm": 1.125, "learning_rate": 4.611425119494552e-06, "loss": 0.0469, "mean_token_accuracy": 0.9875811636447906, "num_tokens": 103109449.0, "step": 971 }, { "entropy": 0.9737377613782883, "epoch": 2.2143671607753705, "grad_norm": 1.4609375, "learning_rate": 4.6104160369756025e-06, "loss": 0.0474, "mean_token_accuracy": 0.984283909201622, "num_tokens": 103216410.0, "step": 972 }, { "entropy": 0.9697515517473221, "epoch": 2.2166476624857467, "grad_norm": 1.375, "learning_rate": 4.609405756625254e-06, "loss": 0.0348, "mean_token_accuracy": 0.9885269701480865, "num_tokens": 103322908.0, "step": 973 }, { "entropy": 0.9701025187969208, "epoch": 2.2189281641961234, "grad_norm": 1.3515625, "learning_rate": 4.608394279016921e-06, "loss": 0.0428, "mean_token_accuracy": 0.9886103421449661, "num_tokens": 103429355.0, "step": 974 }, { "entropy": 0.9692303389310837, "epoch": 2.2212086659064996, "grad_norm": 1.21875, "learning_rate": 4.6073816047247e-06, "loss": 0.0439, "mean_token_accuracy": 0.9856361597776413, "num_tokens": 103536147.0, "step": 975 }, { "entropy": 0.971646711230278, "epoch": 2.2234891676168758, "grad_norm": 1.1328125, "learning_rate": 4.606367734323365e-06, "loss": 0.0352, "mean_token_accuracy": 0.9885317534208298, "num_tokens": 103642649.0, "step": 976 }, { "entropy": 0.9666504114866257, "epoch": 2.225769669327252, "grad_norm": 1.40625, "learning_rate": 4.605352668388369e-06, "loss": 0.04, "mean_token_accuracy": 0.9869769364595413, "num_tokens": 103749581.0, "step": 977 }, { "entropy": 0.9609551280736923, "epoch": 2.228050171037628, "grad_norm": 1.0859375, "learning_rate": 4.6043364074958435e-06, "loss": 0.0336, "mean_token_accuracy": 0.9909057915210724, "num_tokens": 103856198.0, "step": 978 }, { "entropy": 0.9704583585262299, "epoch": 2.2303306727480043, "grad_norm": 1.359375, "learning_rate": 4.6033189522226e-06, "loss": 0.0443, "mean_token_accuracy": 0.986680343747139, "num_tokens": 103962217.0, "step": 979 }, { "entropy": 0.9719421714544296, "epoch": 2.232611174458381, "grad_norm": 0.88671875, "learning_rate": 4.602300303146123e-06, "loss": 0.0352, "mean_token_accuracy": 0.9895425885915756, "num_tokens": 104069063.0, "step": 980 }, { "entropy": 0.9746485948562622, "epoch": 2.234891676168757, "grad_norm": 0.96875, "learning_rate": 4.601280460844583e-06, "loss": 0.0323, "mean_token_accuracy": 0.990533322095871, "num_tokens": 104174998.0, "step": 981 }, { "entropy": 0.9690642356872559, "epoch": 2.2371721778791334, "grad_norm": 1.078125, "learning_rate": 4.6002594258968185e-06, "loss": 0.037, "mean_token_accuracy": 0.9874879270792007, "num_tokens": 104281047.0, "step": 982 }, { "entropy": 0.9710892289876938, "epoch": 2.2394526795895096, "grad_norm": 1.09375, "learning_rate": 4.599237198882351e-06, "loss": 0.0374, "mean_token_accuracy": 0.9888193756341934, "num_tokens": 104387129.0, "step": 983 }, { "entropy": 0.9715360552072525, "epoch": 2.241733181299886, "grad_norm": 1.1171875, "learning_rate": 4.598213780381377e-06, "loss": 0.0311, "mean_token_accuracy": 0.9888642132282257, "num_tokens": 104493054.0, "step": 984 }, { "entropy": 0.9662254750728607, "epoch": 2.2440136830102624, "grad_norm": 0.95703125, "learning_rate": 4.59718917097477e-06, "loss": 0.0303, "mean_token_accuracy": 0.9908223748207092, "num_tokens": 104599246.0, "step": 985 }, { "entropy": 0.9683901369571686, "epoch": 2.2462941847206386, "grad_norm": 1.484375, "learning_rate": 4.596163371244076e-06, "loss": 0.0446, "mean_token_accuracy": 0.9886699169874191, "num_tokens": 104705374.0, "step": 986 }, { "entropy": 0.9659496694803238, "epoch": 2.2485746864310148, "grad_norm": 1.0078125, "learning_rate": 4.595136381771521e-06, "loss": 0.0268, "mean_token_accuracy": 0.9928339272737503, "num_tokens": 104811075.0, "step": 987 }, { "entropy": 0.9662622362375259, "epoch": 2.250855188141391, "grad_norm": 1.0859375, "learning_rate": 4.594108203140004e-06, "loss": 0.0375, "mean_token_accuracy": 0.9898355007171631, "num_tokens": 104917671.0, "step": 988 }, { "entropy": 0.9677999764680862, "epoch": 2.253135689851767, "grad_norm": 1.078125, "learning_rate": 4.593078835933099e-06, "loss": 0.0352, "mean_token_accuracy": 0.9894730150699615, "num_tokens": 105024226.0, "step": 989 }, { "entropy": 0.9703814685344696, "epoch": 2.255416191562144, "grad_norm": 1.234375, "learning_rate": 4.592048280735055e-06, "loss": 0.0407, "mean_token_accuracy": 0.9883961379528046, "num_tokens": 105130764.0, "step": 990 }, { "entropy": 0.9728590548038483, "epoch": 2.25769669327252, "grad_norm": 1.1015625, "learning_rate": 4.591016538130796e-06, "loss": 0.04, "mean_token_accuracy": 0.9884311109781265, "num_tokens": 105236491.0, "step": 991 }, { "entropy": 0.9667416960000992, "epoch": 2.259977194982896, "grad_norm": 0.9296875, "learning_rate": 4.589983608705918e-06, "loss": 0.0308, "mean_token_accuracy": 0.9906700849533081, "num_tokens": 105342390.0, "step": 992 }, { "entropy": 0.9680958390235901, "epoch": 2.2622576966932724, "grad_norm": 1.3046875, "learning_rate": 4.588949493046693e-06, "loss": 0.0472, "mean_token_accuracy": 0.986142098903656, "num_tokens": 105448792.0, "step": 993 }, { "entropy": 0.973539724946022, "epoch": 2.264538198403649, "grad_norm": 1.2578125, "learning_rate": 4.587914191740064e-06, "loss": 0.0399, "mean_token_accuracy": 0.9866781383752823, "num_tokens": 105554562.0, "step": 994 }, { "entropy": 0.9680445343255997, "epoch": 2.266818700114025, "grad_norm": 1.171875, "learning_rate": 4.586877705373648e-06, "loss": 0.0374, "mean_token_accuracy": 0.989005833864212, "num_tokens": 105660881.0, "step": 995 }, { "entropy": 0.9734873920679092, "epoch": 2.2690992018244014, "grad_norm": 1.3125, "learning_rate": 4.585840034535736e-06, "loss": 0.0449, "mean_token_accuracy": 0.986071303486824, "num_tokens": 105768151.0, "step": 996 }, { "entropy": 0.966447189450264, "epoch": 2.2713797035347776, "grad_norm": 1.203125, "learning_rate": 4.584801179815289e-06, "loss": 0.0399, "mean_token_accuracy": 0.9900715798139572, "num_tokens": 105874575.0, "step": 997 }, { "entropy": 0.9765993505716324, "epoch": 2.2736602052451538, "grad_norm": 1.21875, "learning_rate": 4.583761141801941e-06, "loss": 0.0367, "mean_token_accuracy": 0.9874779880046844, "num_tokens": 105980751.0, "step": 998 }, { "entropy": 0.9703917503356934, "epoch": 2.27594070695553, "grad_norm": 1.09375, "learning_rate": 4.5827199210859975e-06, "loss": 0.0401, "mean_token_accuracy": 0.9873557239770889, "num_tokens": 106087243.0, "step": 999 }, { "entropy": 0.972438395023346, "epoch": 2.2782212086659066, "grad_norm": 1.1875, "learning_rate": 4.581677518258435e-06, "loss": 0.0443, "mean_token_accuracy": 0.9856215566396713, "num_tokens": 106193576.0, "step": 1000 }, { "entropy": 0.9708035588264465, "epoch": 2.280501710376283, "grad_norm": 1.0234375, "learning_rate": 4.580633933910901e-06, "loss": 0.0262, "mean_token_accuracy": 0.9913432896137238, "num_tokens": 106299947.0, "step": 1001 }, { "entropy": 0.9676198214292526, "epoch": 2.282782212086659, "grad_norm": 1.359375, "learning_rate": 4.579589168635715e-06, "loss": 0.0534, "mean_token_accuracy": 0.9857507795095444, "num_tokens": 106405625.0, "step": 1002 }, { "entropy": 0.9698827862739563, "epoch": 2.285062713797035, "grad_norm": 1.484375, "learning_rate": 4.578543223025865e-06, "loss": 0.0483, "mean_token_accuracy": 0.9842702150344849, "num_tokens": 106511571.0, "step": 1003 }, { "entropy": 0.9746315330266953, "epoch": 2.287343215507412, "grad_norm": 1.0546875, "learning_rate": 4.577496097675009e-06, "loss": 0.0436, "mean_token_accuracy": 0.9866337478160858, "num_tokens": 106617579.0, "step": 1004 }, { "entropy": 0.9752069115638733, "epoch": 2.289623717217788, "grad_norm": 0.96875, "learning_rate": 4.576447793177476e-06, "loss": 0.0349, "mean_token_accuracy": 0.989075243473053, "num_tokens": 106723504.0, "step": 1005 }, { "entropy": 0.965659573674202, "epoch": 2.291904218928164, "grad_norm": 1.1484375, "learning_rate": 4.575398310128263e-06, "loss": 0.0391, "mean_token_accuracy": 0.9883884489536285, "num_tokens": 106830136.0, "step": 1006 }, { "entropy": 0.9683704823255539, "epoch": 2.2941847206385404, "grad_norm": 1.53125, "learning_rate": 4.574347649123036e-06, "loss": 0.067, "mean_token_accuracy": 0.982392743229866, "num_tokens": 106936388.0, "step": 1007 }, { "entropy": 0.9761912524700165, "epoch": 2.2964652223489166, "grad_norm": 1.3125, "learning_rate": 4.57329581075813e-06, "loss": 0.0451, "mean_token_accuracy": 0.9876230657100677, "num_tokens": 107042705.0, "step": 1008 }, { "entropy": 0.9708452671766281, "epoch": 2.2987457240592932, "grad_norm": 1.21875, "learning_rate": 4.572242795630549e-06, "loss": 0.0398, "mean_token_accuracy": 0.9853936433792114, "num_tokens": 107149346.0, "step": 1009 }, { "entropy": 0.9672209769487381, "epoch": 2.3010262257696694, "grad_norm": 1.109375, "learning_rate": 4.571188604337963e-06, "loss": 0.0393, "mean_token_accuracy": 0.9871280938386917, "num_tokens": 107255896.0, "step": 1010 }, { "entropy": 0.9764431267976761, "epoch": 2.3033067274800456, "grad_norm": 1.1328125, "learning_rate": 4.570133237478711e-06, "loss": 0.0348, "mean_token_accuracy": 0.9892553836107254, "num_tokens": 107361948.0, "step": 1011 }, { "entropy": 0.9766973853111267, "epoch": 2.305587229190422, "grad_norm": 1.1953125, "learning_rate": 4.5690766956517985e-06, "loss": 0.0358, "mean_token_accuracy": 0.9891420155763626, "num_tokens": 107467988.0, "step": 1012 }, { "entropy": 0.9735081046819687, "epoch": 2.307867730900798, "grad_norm": 1.2734375, "learning_rate": 4.568018979456899e-06, "loss": 0.0408, "mean_token_accuracy": 0.9868201315402985, "num_tokens": 107574459.0, "step": 1013 }, { "entropy": 0.9773495346307755, "epoch": 2.3101482326111746, "grad_norm": 1.3984375, "learning_rate": 4.566960089494351e-06, "loss": 0.0405, "mean_token_accuracy": 0.9866890162229538, "num_tokens": 107681038.0, "step": 1014 }, { "entropy": 0.9691914767026901, "epoch": 2.312428734321551, "grad_norm": 1.1015625, "learning_rate": 4.5659000263651615e-06, "loss": 0.0379, "mean_token_accuracy": 0.9883751422166824, "num_tokens": 107787378.0, "step": 1015 }, { "entropy": 0.9812718480825424, "epoch": 2.314709236031927, "grad_norm": 1.09375, "learning_rate": 4.564838790671e-06, "loss": 0.0376, "mean_token_accuracy": 0.9885855466127396, "num_tokens": 107894020.0, "step": 1016 }, { "entropy": 0.9770101457834244, "epoch": 2.316989737742303, "grad_norm": 1.09375, "learning_rate": 4.5637763830142046e-06, "loss": 0.0378, "mean_token_accuracy": 0.9878818392753601, "num_tokens": 108000596.0, "step": 1017 }, { "entropy": 0.9727422893047333, "epoch": 2.3192702394526794, "grad_norm": 1.171875, "learning_rate": 4.562712803997776e-06, "loss": 0.04, "mean_token_accuracy": 0.9849680215120316, "num_tokens": 108106784.0, "step": 1018 }, { "entropy": 0.9721705168485641, "epoch": 2.321550741163056, "grad_norm": 1.015625, "learning_rate": 4.5616480542253825e-06, "loss": 0.0269, "mean_token_accuracy": 0.9919751435518265, "num_tokens": 108212621.0, "step": 1019 }, { "entropy": 0.9778535217046738, "epoch": 2.3238312428734322, "grad_norm": 1.046875, "learning_rate": 4.5605821343013555e-06, "loss": 0.0394, "mean_token_accuracy": 0.9890420287847519, "num_tokens": 108319239.0, "step": 1020 }, { "entropy": 0.9772415310144424, "epoch": 2.3261117445838084, "grad_norm": 1.09375, "learning_rate": 4.55951504483069e-06, "loss": 0.0384, "mean_token_accuracy": 0.9904467165470123, "num_tokens": 108425763.0, "step": 1021 }, { "entropy": 0.9750592857599258, "epoch": 2.3283922462941846, "grad_norm": 1.1171875, "learning_rate": 4.558446786419045e-06, "loss": 0.0417, "mean_token_accuracy": 0.9882222414016724, "num_tokens": 108531505.0, "step": 1022 }, { "entropy": 0.9679126143455505, "epoch": 2.330672748004561, "grad_norm": 0.9140625, "learning_rate": 4.557377359672745e-06, "loss": 0.0304, "mean_token_accuracy": 0.9896198958158493, "num_tokens": 108637222.0, "step": 1023 }, { "entropy": 0.9743359535932541, "epoch": 2.3329532497149374, "grad_norm": 1.0703125, "learning_rate": 4.556306765198775e-06, "loss": 0.0325, "mean_token_accuracy": 0.9902807772159576, "num_tokens": 108743665.0, "step": 1024 }, { "entropy": 0.9713866114616394, "epoch": 2.3352337514253136, "grad_norm": 1.078125, "learning_rate": 4.555235003604782e-06, "loss": 0.0442, "mean_token_accuracy": 0.9889845848083496, "num_tokens": 108849717.0, "step": 1025 }, { "entropy": 0.9720082432031631, "epoch": 2.33751425313569, "grad_norm": 1.296875, "learning_rate": 4.55416207549908e-06, "loss": 0.0363, "mean_token_accuracy": 0.9886191487312317, "num_tokens": 108955944.0, "step": 1026 }, { "entropy": 0.9723177999258041, "epoch": 2.339794754846066, "grad_norm": 1.3359375, "learning_rate": 4.5530879814906404e-06, "loss": 0.0492, "mean_token_accuracy": 0.9855318069458008, "num_tokens": 109062076.0, "step": 1027 }, { "entropy": 0.9724453091621399, "epoch": 2.342075256556442, "grad_norm": 1.3984375, "learning_rate": 4.5520127221891e-06, "loss": 0.0451, "mean_token_accuracy": 0.9872293770313263, "num_tokens": 109168615.0, "step": 1028 }, { "entropy": 0.9662226736545563, "epoch": 2.344355758266819, "grad_norm": 1.15625, "learning_rate": 4.5509362982047525e-06, "loss": 0.0422, "mean_token_accuracy": 0.987361267209053, "num_tokens": 109274651.0, "step": 1029 }, { "entropy": 0.9634005427360535, "epoch": 2.346636259977195, "grad_norm": 1.4140625, "learning_rate": 4.549858710148558e-06, "loss": 0.0557, "mean_token_accuracy": 0.9826209247112274, "num_tokens": 109381350.0, "step": 1030 }, { "entropy": 0.9696230888366699, "epoch": 2.3489167616875712, "grad_norm": 1.1640625, "learning_rate": 4.548779958632134e-06, "loss": 0.0318, "mean_token_accuracy": 0.9873648285865784, "num_tokens": 109487854.0, "step": 1031 }, { "entropy": 0.9789362251758575, "epoch": 2.3511972633979474, "grad_norm": 1.2734375, "learning_rate": 4.5477000442677575e-06, "loss": 0.0376, "mean_token_accuracy": 0.989895835518837, "num_tokens": 109593995.0, "step": 1032 }, { "entropy": 0.9745998531579971, "epoch": 2.353477765108324, "grad_norm": 0.96484375, "learning_rate": 4.546618967668369e-06, "loss": 0.0322, "mean_token_accuracy": 0.9898173063993454, "num_tokens": 109700363.0, "step": 1033 }, { "entropy": 0.9786639511585236, "epoch": 2.3557582668187003, "grad_norm": 1.0390625, "learning_rate": 4.545536729447566e-06, "loss": 0.0379, "mean_token_accuracy": 0.989013671875, "num_tokens": 109807170.0, "step": 1034 }, { "entropy": 0.9819515645503998, "epoch": 2.3580387685290765, "grad_norm": 1.3359375, "learning_rate": 4.544453330219606e-06, "loss": 0.0408, "mean_token_accuracy": 0.9887014031410217, "num_tokens": 109913616.0, "step": 1035 }, { "entropy": 0.9827314168214798, "epoch": 2.3603192702394526, "grad_norm": 0.94140625, "learning_rate": 4.543368770599406e-06, "loss": 0.0306, "mean_token_accuracy": 0.9909116327762604, "num_tokens": 110020275.0, "step": 1036 }, { "entropy": 0.9764288514852524, "epoch": 2.362599771949829, "grad_norm": 1.34375, "learning_rate": 4.542283051202539e-06, "loss": 0.0378, "mean_token_accuracy": 0.9873865097761154, "num_tokens": 110126351.0, "step": 1037 }, { "entropy": 0.9781017750501633, "epoch": 2.364880273660205, "grad_norm": 1.0078125, "learning_rate": 4.541196172645242e-06, "loss": 0.0361, "mean_token_accuracy": 0.989377960562706, "num_tokens": 110232972.0, "step": 1038 }, { "entropy": 0.9734379053115845, "epoch": 2.3671607753705817, "grad_norm": 1.03125, "learning_rate": 4.540108135544403e-06, "loss": 0.0411, "mean_token_accuracy": 0.989588275551796, "num_tokens": 110339484.0, "step": 1039 }, { "entropy": 0.9779237359762192, "epoch": 2.369441277080958, "grad_norm": 1.03125, "learning_rate": 4.5390189405175725e-06, "loss": 0.0324, "mean_token_accuracy": 0.9876978099346161, "num_tokens": 110445471.0, "step": 1040 }, { "entropy": 0.974235787987709, "epoch": 2.371721778791334, "grad_norm": 1.4140625, "learning_rate": 4.537928588182955e-06, "loss": 0.05, "mean_token_accuracy": 0.9840153604745865, "num_tokens": 110551752.0, "step": 1041 }, { "entropy": 0.9705098122358322, "epoch": 2.3740022805017102, "grad_norm": 1.09375, "learning_rate": 4.536837079159416e-06, "loss": 0.0365, "mean_token_accuracy": 0.9898076951503754, "num_tokens": 110657762.0, "step": 1042 }, { "entropy": 0.9708872437477112, "epoch": 2.376282782212087, "grad_norm": 1.1640625, "learning_rate": 4.535744414066473e-06, "loss": 0.0342, "mean_token_accuracy": 0.9888600707054138, "num_tokens": 110764007.0, "step": 1043 }, { "entropy": 0.969092071056366, "epoch": 2.378563283922463, "grad_norm": 1.109375, "learning_rate": 4.534650593524302e-06, "loss": 0.0443, "mean_token_accuracy": 0.9873199760913849, "num_tokens": 110869909.0, "step": 1044 }, { "entropy": 0.9712023884057999, "epoch": 2.3808437856328393, "grad_norm": 1.0859375, "learning_rate": 4.533555618153735e-06, "loss": 0.0374, "mean_token_accuracy": 0.9876801520586014, "num_tokens": 110976185.0, "step": 1045 }, { "entropy": 0.9766702055931091, "epoch": 2.3831242873432155, "grad_norm": 1.125, "learning_rate": 4.532459488576258e-06, "loss": 0.0388, "mean_token_accuracy": 0.9872826337814331, "num_tokens": 111082068.0, "step": 1046 }, { "entropy": 0.9742221832275391, "epoch": 2.3854047890535917, "grad_norm": 0.984375, "learning_rate": 4.531362205414013e-06, "loss": 0.0301, "mean_token_accuracy": 0.9915291219949722, "num_tokens": 111187887.0, "step": 1047 }, { "entropy": 0.9727058708667755, "epoch": 2.387685290763968, "grad_norm": 1.1640625, "learning_rate": 4.530263769289798e-06, "loss": 0.035, "mean_token_accuracy": 0.9898785799741745, "num_tokens": 111294702.0, "step": 1048 }, { "entropy": 0.9676071852445602, "epoch": 2.3899657924743445, "grad_norm": 1.265625, "learning_rate": 4.529164180827063e-06, "loss": 0.0449, "mean_token_accuracy": 0.9858859330415726, "num_tokens": 111400380.0, "step": 1049 }, { "entropy": 0.9675841182470322, "epoch": 2.3922462941847207, "grad_norm": 1.1640625, "learning_rate": 4.528063440649913e-06, "loss": 0.0411, "mean_token_accuracy": 0.9879327714443207, "num_tokens": 111506824.0, "step": 1050 }, { "entropy": 0.9689165651798248, "epoch": 2.394526795895097, "grad_norm": 1.0234375, "learning_rate": 4.526961549383109e-06, "loss": 0.0369, "mean_token_accuracy": 0.9900340139865875, "num_tokens": 111612956.0, "step": 1051 }, { "entropy": 0.9679895788431168, "epoch": 2.396807297605473, "grad_norm": 1.2421875, "learning_rate": 4.52585850765206e-06, "loss": 0.0449, "mean_token_accuracy": 0.9865595549345016, "num_tokens": 111719187.0, "step": 1052 }, { "entropy": 0.9688171744346619, "epoch": 2.3990877993158497, "grad_norm": 1.1953125, "learning_rate": 4.524754316082833e-06, "loss": 0.036, "mean_token_accuracy": 0.9872208386659622, "num_tokens": 111825373.0, "step": 1053 }, { "entropy": 0.9666113257408142, "epoch": 2.401368301026226, "grad_norm": 0.94921875, "learning_rate": 4.5236489753021465e-06, "loss": 0.034, "mean_token_accuracy": 0.9890531599521637, "num_tokens": 111931425.0, "step": 1054 }, { "entropy": 0.9701842665672302, "epoch": 2.403648802736602, "grad_norm": 1.0625, "learning_rate": 4.522542485937369e-06, "loss": 0.0348, "mean_token_accuracy": 0.9887361228466034, "num_tokens": 112037663.0, "step": 1055 }, { "entropy": 0.9716154336929321, "epoch": 2.4059293044469783, "grad_norm": 0.93359375, "learning_rate": 4.521434848616523e-06, "loss": 0.0301, "mean_token_accuracy": 0.9895419478416443, "num_tokens": 112144266.0, "step": 1056 }, { "entropy": 0.9622577875852585, "epoch": 2.4082098061573545, "grad_norm": 1.0546875, "learning_rate": 4.520326063968283e-06, "loss": 0.0366, "mean_token_accuracy": 0.9904957413673401, "num_tokens": 112250113.0, "step": 1057 }, { "entropy": 0.9606965780258179, "epoch": 2.4104903078677307, "grad_norm": 1.203125, "learning_rate": 4.5192161326219716e-06, "loss": 0.0389, "mean_token_accuracy": 0.9895916134119034, "num_tokens": 112356501.0, "step": 1058 }, { "entropy": 0.9638993889093399, "epoch": 2.4127708095781073, "grad_norm": 1.1875, "learning_rate": 4.5181050552075665e-06, "loss": 0.0384, "mean_token_accuracy": 0.9878609478473663, "num_tokens": 112462759.0, "step": 1059 }, { "entropy": 0.9625800848007202, "epoch": 2.4150513112884835, "grad_norm": 1.1484375, "learning_rate": 4.516992832355694e-06, "loss": 0.0382, "mean_token_accuracy": 0.987004280090332, "num_tokens": 112568361.0, "step": 1060 }, { "entropy": 0.9633389860391617, "epoch": 2.4173318129988597, "grad_norm": 1.1953125, "learning_rate": 4.515879464697629e-06, "loss": 0.0387, "mean_token_accuracy": 0.9881696999073029, "num_tokens": 112675219.0, "step": 1061 }, { "entropy": 0.9658568054437637, "epoch": 2.419612314709236, "grad_norm": 1.1015625, "learning_rate": 4.514764952865297e-06, "loss": 0.0424, "mean_token_accuracy": 0.9870736002922058, "num_tokens": 112781230.0, "step": 1062 }, { "entropy": 0.9683505743741989, "epoch": 2.4218928164196125, "grad_norm": 1.4609375, "learning_rate": 4.513649297491275e-06, "loss": 0.0548, "mean_token_accuracy": 0.9832236021757126, "num_tokens": 112887184.0, "step": 1063 }, { "entropy": 0.9636904150247574, "epoch": 2.4241733181299887, "grad_norm": 1.3046875, "learning_rate": 4.512532499208787e-06, "loss": 0.0411, "mean_token_accuracy": 0.9852688014507294, "num_tokens": 112993761.0, "step": 1064 }, { "entropy": 0.9600468426942825, "epoch": 2.426453819840365, "grad_norm": 0.9140625, "learning_rate": 4.511414558651706e-06, "loss": 0.0272, "mean_token_accuracy": 0.9919755160808563, "num_tokens": 113100820.0, "step": 1065 }, { "entropy": 0.9651389122009277, "epoch": 2.428734321550741, "grad_norm": 1.0625, "learning_rate": 4.5102954764545525e-06, "loss": 0.0319, "mean_token_accuracy": 0.9904666990041733, "num_tokens": 113206961.0, "step": 1066 }, { "entropy": 0.9627701193094254, "epoch": 2.4310148232611173, "grad_norm": 1.0234375, "learning_rate": 4.509175253252497e-06, "loss": 0.033, "mean_token_accuracy": 0.9914871156215668, "num_tokens": 113313004.0, "step": 1067 }, { "entropy": 0.9668271392583847, "epoch": 2.433295324971494, "grad_norm": 1.046875, "learning_rate": 4.508053889681357e-06, "loss": 0.0345, "mean_token_accuracy": 0.98903489112854, "num_tokens": 113419139.0, "step": 1068 }, { "entropy": 0.9609852582216263, "epoch": 2.43557582668187, "grad_norm": 1.1640625, "learning_rate": 4.5069313863775956e-06, "loss": 0.0299, "mean_token_accuracy": 0.9899353682994843, "num_tokens": 113525171.0, "step": 1069 }, { "entropy": 0.9651302695274353, "epoch": 2.4378563283922463, "grad_norm": 1.265625, "learning_rate": 4.505807743978325e-06, "loss": 0.0388, "mean_token_accuracy": 0.9884258806705475, "num_tokens": 113631896.0, "step": 1070 }, { "entropy": 0.9607261121273041, "epoch": 2.4401368301026225, "grad_norm": 1.4921875, "learning_rate": 4.5046829631213014e-06, "loss": 0.0513, "mean_token_accuracy": 0.9846921861171722, "num_tokens": 113737992.0, "step": 1071 }, { "entropy": 0.9653102606534958, "epoch": 2.4424173318129987, "grad_norm": 1.203125, "learning_rate": 4.503557044444931e-06, "loss": 0.0458, "mean_token_accuracy": 0.9871001839637756, "num_tokens": 113843792.0, "step": 1072 }, { "entropy": 0.9640719592571259, "epoch": 2.4446978335233753, "grad_norm": 1.3359375, "learning_rate": 4.502429988588263e-06, "loss": 0.0422, "mean_token_accuracy": 0.9876347482204437, "num_tokens": 113950701.0, "step": 1073 }, { "entropy": 0.962962731719017, "epoch": 2.4469783352337515, "grad_norm": 1.265625, "learning_rate": 4.50130179619099e-06, "loss": 0.0413, "mean_token_accuracy": 0.9877217561006546, "num_tokens": 114056908.0, "step": 1074 }, { "entropy": 0.9692247211933136, "epoch": 2.4492588369441277, "grad_norm": 1.2265625, "learning_rate": 4.500172467893455e-06, "loss": 0.0334, "mean_token_accuracy": 0.9891466200351715, "num_tokens": 114162888.0, "step": 1075 }, { "entropy": 0.9686284363269806, "epoch": 2.451539338654504, "grad_norm": 1.0390625, "learning_rate": 4.499042004336642e-06, "loss": 0.0406, "mean_token_accuracy": 0.9873951524496078, "num_tokens": 114269050.0, "step": 1076 }, { "entropy": 0.9691548943519592, "epoch": 2.45381984036488, "grad_norm": 1.03125, "learning_rate": 4.497910406162182e-06, "loss": 0.0376, "mean_token_accuracy": 0.9870355278253555, "num_tokens": 114375058.0, "step": 1077 }, { "entropy": 0.9663748145103455, "epoch": 2.4561003420752567, "grad_norm": 1.1953125, "learning_rate": 4.496777674012345e-06, "loss": 0.039, "mean_token_accuracy": 0.9865869879722595, "num_tokens": 114481042.0, "step": 1078 }, { "entropy": 0.9692698866128922, "epoch": 2.458380843785633, "grad_norm": 0.9296875, "learning_rate": 4.495643808530049e-06, "loss": 0.0336, "mean_token_accuracy": 0.9886258840560913, "num_tokens": 114587243.0, "step": 1079 }, { "entropy": 0.9781520068645477, "epoch": 2.460661345496009, "grad_norm": 1.0859375, "learning_rate": 4.494508810358855e-06, "loss": 0.0336, "mean_token_accuracy": 0.988973394036293, "num_tokens": 114693582.0, "step": 1080 }, { "entropy": 0.97136490046978, "epoch": 2.4629418472063853, "grad_norm": 1.484375, "learning_rate": 4.4933726801429665e-06, "loss": 0.0557, "mean_token_accuracy": 0.9825483411550522, "num_tokens": 114800229.0, "step": 1081 }, { "entropy": 0.9704419076442719, "epoch": 2.4652223489167615, "grad_norm": 0.9609375, "learning_rate": 4.4922354185272275e-06, "loss": 0.0308, "mean_token_accuracy": 0.9923818558454514, "num_tokens": 114906186.0, "step": 1082 }, { "entropy": 0.9763527363538742, "epoch": 2.467502850627138, "grad_norm": 1.2421875, "learning_rate": 4.491097026157127e-06, "loss": 0.0385, "mean_token_accuracy": 0.9884812384843826, "num_tokens": 115012350.0, "step": 1083 }, { "entropy": 0.9740628451108932, "epoch": 2.4697833523375143, "grad_norm": 1.53125, "learning_rate": 4.489957503678794e-06, "loss": 0.0418, "mean_token_accuracy": 0.9858824759721756, "num_tokens": 115118989.0, "step": 1084 }, { "entropy": 0.9755960404872894, "epoch": 2.4720638540478905, "grad_norm": 0.734375, "learning_rate": 4.488816851738999e-06, "loss": 0.0228, "mean_token_accuracy": 0.9934317022562027, "num_tokens": 115224779.0, "step": 1085 }, { "entropy": 0.9770965427160263, "epoch": 2.4743443557582667, "grad_norm": 1.375, "learning_rate": 4.487675070985156e-06, "loss": 0.0496, "mean_token_accuracy": 0.9869184494018555, "num_tokens": 115331117.0, "step": 1086 }, { "entropy": 0.97431281208992, "epoch": 2.476624857468643, "grad_norm": 1.125, "learning_rate": 4.4865321620653144e-06, "loss": 0.0353, "mean_token_accuracy": 0.9892715811729431, "num_tokens": 115437241.0, "step": 1087 }, { "entropy": 0.9696021229028702, "epoch": 2.4789053591790196, "grad_norm": 1.2890625, "learning_rate": 4.485388125628171e-06, "loss": 0.0439, "mean_token_accuracy": 0.9869652092456818, "num_tokens": 115543516.0, "step": 1088 }, { "entropy": 0.9733566790819168, "epoch": 2.4811858608893957, "grad_norm": 0.94921875, "learning_rate": 4.484242962323056e-06, "loss": 0.0301, "mean_token_accuracy": 0.9924483895301819, "num_tokens": 115649482.0, "step": 1089 }, { "entropy": 0.9805065989494324, "epoch": 2.483466362599772, "grad_norm": 1.1171875, "learning_rate": 4.483096672799942e-06, "loss": 0.0342, "mean_token_accuracy": 0.9900181293487549, "num_tokens": 115755743.0, "step": 1090 }, { "entropy": 0.9793063402175903, "epoch": 2.485746864310148, "grad_norm": 1.09375, "learning_rate": 4.481949257709442e-06, "loss": 0.0343, "mean_token_accuracy": 0.9869582951068878, "num_tokens": 115862168.0, "step": 1091 }, { "entropy": 0.9828750491142273, "epoch": 2.4880273660205243, "grad_norm": 1.1328125, "learning_rate": 4.480800717702807e-06, "loss": 0.0402, "mean_token_accuracy": 0.9876468777656555, "num_tokens": 115969363.0, "step": 1092 }, { "entropy": 0.9811009019613266, "epoch": 2.490307867730901, "grad_norm": 0.859375, "learning_rate": 4.479651053431926e-06, "loss": 0.0302, "mean_token_accuracy": 0.9905859977006912, "num_tokens": 116075531.0, "step": 1093 }, { "entropy": 0.9783196300268173, "epoch": 2.492588369441277, "grad_norm": 0.96875, "learning_rate": 4.4785002655493246e-06, "loss": 0.0362, "mean_token_accuracy": 0.9912123680114746, "num_tokens": 116181940.0, "step": 1094 }, { "entropy": 0.9730523973703384, "epoch": 2.4948688711516533, "grad_norm": 0.92578125, "learning_rate": 4.477348354708169e-06, "loss": 0.0408, "mean_token_accuracy": 0.9878622442483902, "num_tokens": 116288442.0, "step": 1095 }, { "entropy": 0.978343278169632, "epoch": 2.4971493728620295, "grad_norm": 0.984375, "learning_rate": 4.476195321562262e-06, "loss": 0.0341, "mean_token_accuracy": 0.9909177422523499, "num_tokens": 116394417.0, "step": 1096 }, { "entropy": 0.9833126813173294, "epoch": 2.4994298745724057, "grad_norm": 1.359375, "learning_rate": 4.475041166766042e-06, "loss": 0.0451, "mean_token_accuracy": 0.9850174337625504, "num_tokens": 116500647.0, "step": 1097 }, { "entropy": 0.9796348363161087, "epoch": 2.5017103762827824, "grad_norm": 1.34375, "learning_rate": 4.473885890974586e-06, "loss": 0.0544, "mean_token_accuracy": 0.9831701964139938, "num_tokens": 116607519.0, "step": 1098 }, { "entropy": 0.9762048274278641, "epoch": 2.5039908779931586, "grad_norm": 0.953125, "learning_rate": 4.472729494843605e-06, "loss": 0.0348, "mean_token_accuracy": 0.9884055703878403, "num_tokens": 116713761.0, "step": 1099 }, { "entropy": 0.9711233526468277, "epoch": 2.5062713797035348, "grad_norm": 1.09375, "learning_rate": 4.471571979029448e-06, "loss": 0.0411, "mean_token_accuracy": 0.9883287400007248, "num_tokens": 116820317.0, "step": 1100 }, { "epoch": 2.5062713797035348, "eval_entropy": 0.972808671768174, "eval_loss": 0.041190750896930695, "eval_mean_token_accuracy": 0.9875200148317751, "eval_num_tokens": 116820317.0, "eval_runtime": 66.0955, "eval_samples_per_second": 126.862, "eval_steps_per_second": 3.979, "step": 1100 }, { "entropy": 0.9735891968011856, "epoch": 2.508551881413911, "grad_norm": 1.0, "learning_rate": 4.470413344189098e-06, "loss": 0.0279, "mean_token_accuracy": 0.9902595281600952, "num_tokens": 116926006.0, "step": 1101 }, { "entropy": 0.9691547155380249, "epoch": 2.5108323831242876, "grad_norm": 1.46875, "learning_rate": 4.469253590980175e-06, "loss": 0.0452, "mean_token_accuracy": 0.9880786836147308, "num_tokens": 117032169.0, "step": 1102 }, { "entropy": 0.9763856530189514, "epoch": 2.5131128848346638, "grad_norm": 1.0859375, "learning_rate": 4.46809272006093e-06, "loss": 0.0307, "mean_token_accuracy": 0.9909102469682693, "num_tokens": 117138293.0, "step": 1103 }, { "entropy": 0.9715720117092133, "epoch": 2.51539338654504, "grad_norm": 0.92578125, "learning_rate": 4.466930732090254e-06, "loss": 0.0232, "mean_token_accuracy": 0.9930012375116348, "num_tokens": 117244219.0, "step": 1104 }, { "entropy": 0.9719373136758804, "epoch": 2.517673888255416, "grad_norm": 1.046875, "learning_rate": 4.465767627727668e-06, "loss": 0.0352, "mean_token_accuracy": 0.9869057238101959, "num_tokens": 117351115.0, "step": 1105 }, { "entropy": 0.9684425443410873, "epoch": 2.5199543899657924, "grad_norm": 1.21875, "learning_rate": 4.464603407633326e-06, "loss": 0.0399, "mean_token_accuracy": 0.987548902630806, "num_tokens": 117457215.0, "step": 1106 }, { "entropy": 0.9655813276767731, "epoch": 2.5222348916761685, "grad_norm": 1.46875, "learning_rate": 4.463438072468018e-06, "loss": 0.0543, "mean_token_accuracy": 0.9867598563432693, "num_tokens": 117562893.0, "step": 1107 }, { "entropy": 0.968076542019844, "epoch": 2.524515393386545, "grad_norm": 1.265625, "learning_rate": 4.462271622893166e-06, "loss": 0.0373, "mean_token_accuracy": 0.9880144596099854, "num_tokens": 117669039.0, "step": 1108 }, { "entropy": 0.9657844603061676, "epoch": 2.5267958950969214, "grad_norm": 1.203125, "learning_rate": 4.461104059570825e-06, "loss": 0.0469, "mean_token_accuracy": 0.9855824857950211, "num_tokens": 117775287.0, "step": 1109 }, { "entropy": 0.9700010716915131, "epoch": 2.5290763968072976, "grad_norm": 0.89453125, "learning_rate": 4.4599353831636785e-06, "loss": 0.0399, "mean_token_accuracy": 0.98725825548172, "num_tokens": 117881948.0, "step": 1110 }, { "entropy": 0.9708001017570496, "epoch": 2.5313568985176738, "grad_norm": 1.3046875, "learning_rate": 4.458765594335048e-06, "loss": 0.0469, "mean_token_accuracy": 0.9851561933755875, "num_tokens": 117988128.0, "step": 1111 }, { "entropy": 0.9762086123228073, "epoch": 2.5336374002280504, "grad_norm": 1.0625, "learning_rate": 4.457594693748881e-06, "loss": 0.0311, "mean_token_accuracy": 0.9916230291128159, "num_tokens": 118094300.0, "step": 1112 }, { "entropy": 0.9723193347454071, "epoch": 2.5359179019384266, "grad_norm": 1.0234375, "learning_rate": 4.456422682069758e-06, "loss": 0.0336, "mean_token_accuracy": 0.9892087131738663, "num_tokens": 118200472.0, "step": 1113 }, { "entropy": 0.9696424454450607, "epoch": 2.538198403648803, "grad_norm": 1.2265625, "learning_rate": 4.455249559962892e-06, "loss": 0.0415, "mean_token_accuracy": 0.9874148070812225, "num_tokens": 118306617.0, "step": 1114 }, { "entropy": 0.9707659631967545, "epoch": 2.540478905359179, "grad_norm": 1.0859375, "learning_rate": 4.454075328094123e-06, "loss": 0.0415, "mean_token_accuracy": 0.9850387424230576, "num_tokens": 118413532.0, "step": 1115 }, { "entropy": 0.9675559103488922, "epoch": 2.542759407069555, "grad_norm": 1.296875, "learning_rate": 4.452899987129922e-06, "loss": 0.0462, "mean_token_accuracy": 0.9869939237833023, "num_tokens": 118520576.0, "step": 1116 }, { "entropy": 0.9709131717681885, "epoch": 2.5450399087799314, "grad_norm": 1.4375, "learning_rate": 4.4517235377373915e-06, "loss": 0.0477, "mean_token_accuracy": 0.9858109205961227, "num_tokens": 118626948.0, "step": 1117 }, { "entropy": 0.9667210429906845, "epoch": 2.547320410490308, "grad_norm": 1.0703125, "learning_rate": 4.45054598058426e-06, "loss": 0.0414, "mean_token_accuracy": 0.9855765104293823, "num_tokens": 118733182.0, "step": 1118 }, { "entropy": 0.9698969423770905, "epoch": 2.549600912200684, "grad_norm": 1.265625, "learning_rate": 4.449367316338887e-06, "loss": 0.0507, "mean_token_accuracy": 0.9822993725538254, "num_tokens": 118839545.0, "step": 1119 }, { "entropy": 0.977651983499527, "epoch": 2.5518814139110604, "grad_norm": 0.9609375, "learning_rate": 4.448187545670258e-06, "loss": 0.0276, "mean_token_accuracy": 0.9909904599189758, "num_tokens": 118946134.0, "step": 1120 }, { "entropy": 0.9743582606315613, "epoch": 2.5541619156214366, "grad_norm": 1.109375, "learning_rate": 4.44700666924799e-06, "loss": 0.0355, "mean_token_accuracy": 0.9887563735246658, "num_tokens": 119051987.0, "step": 1121 }, { "entropy": 0.9775199592113495, "epoch": 2.556442417331813, "grad_norm": 1.1484375, "learning_rate": 4.4458246877423254e-06, "loss": 0.0346, "mean_token_accuracy": 0.9883389174938202, "num_tokens": 119158100.0, "step": 1122 }, { "entropy": 0.9689960926771164, "epoch": 2.5587229190421894, "grad_norm": 1.0625, "learning_rate": 4.444641601824134e-06, "loss": 0.0348, "mean_token_accuracy": 0.9881866872310638, "num_tokens": 119264027.0, "step": 1123 }, { "entropy": 0.9724647998809814, "epoch": 2.5610034207525656, "grad_norm": 1.140625, "learning_rate": 4.443457412164911e-06, "loss": 0.0324, "mean_token_accuracy": 0.9884301871061325, "num_tokens": 119370246.0, "step": 1124 }, { "entropy": 0.9701798260211945, "epoch": 2.563283922462942, "grad_norm": 1.109375, "learning_rate": 4.442272119436781e-06, "loss": 0.0421, "mean_token_accuracy": 0.9853730648756027, "num_tokens": 119477182.0, "step": 1125 }, { "entropy": 0.9724767357110977, "epoch": 2.565564424173318, "grad_norm": 1.1328125, "learning_rate": 4.441085724312494e-06, "loss": 0.0458, "mean_token_accuracy": 0.985518604516983, "num_tokens": 119583611.0, "step": 1126 }, { "entropy": 0.963296428322792, "epoch": 2.567844925883694, "grad_norm": 1.21875, "learning_rate": 4.4398982274654235e-06, "loss": 0.0424, "mean_token_accuracy": 0.9853932112455368, "num_tokens": 119690682.0, "step": 1127 }, { "entropy": 0.9678771048784256, "epoch": 2.570125427594071, "grad_norm": 0.89453125, "learning_rate": 4.43870962956957e-06, "loss": 0.0231, "mean_token_accuracy": 0.9924886524677277, "num_tokens": 119797241.0, "step": 1128 }, { "entropy": 0.9637564867734909, "epoch": 2.572405929304447, "grad_norm": 1.2890625, "learning_rate": 4.437519931299559e-06, "loss": 0.0402, "mean_token_accuracy": 0.9869941473007202, "num_tokens": 119904575.0, "step": 1129 }, { "entropy": 0.9727903455495834, "epoch": 2.574686431014823, "grad_norm": 0.99609375, "learning_rate": 4.43632913333064e-06, "loss": 0.0315, "mean_token_accuracy": 0.9905203729867935, "num_tokens": 120010572.0, "step": 1130 }, { "entropy": 0.9710706770420074, "epoch": 2.5769669327251994, "grad_norm": 0.9609375, "learning_rate": 4.435137236338688e-06, "loss": 0.0318, "mean_token_accuracy": 0.9897060394287109, "num_tokens": 120117187.0, "step": 1131 }, { "entropy": 0.9718214720487595, "epoch": 2.579247434435576, "grad_norm": 1.078125, "learning_rate": 4.433944241000199e-06, "loss": 0.0341, "mean_token_accuracy": 0.98975470662117, "num_tokens": 120223452.0, "step": 1132 }, { "entropy": 0.968155100941658, "epoch": 2.581527936145952, "grad_norm": 0.91796875, "learning_rate": 4.4327501479922955e-06, "loss": 0.0284, "mean_token_accuracy": 0.9907172918319702, "num_tokens": 120330104.0, "step": 1133 }, { "entropy": 0.9752079844474792, "epoch": 2.5838084378563284, "grad_norm": 1.140625, "learning_rate": 4.431554957992722e-06, "loss": 0.0379, "mean_token_accuracy": 0.9869110286235809, "num_tokens": 120436034.0, "step": 1134 }, { "entropy": 0.9668303430080414, "epoch": 2.5860889395667046, "grad_norm": 1.28125, "learning_rate": 4.430358671679843e-06, "loss": 0.0412, "mean_token_accuracy": 0.9871958494186401, "num_tokens": 120541900.0, "step": 1135 }, { "entropy": 0.9641019701957703, "epoch": 2.588369441277081, "grad_norm": 1.3984375, "learning_rate": 4.42916128973265e-06, "loss": 0.0389, "mean_token_accuracy": 0.9878207594156265, "num_tokens": 120648062.0, "step": 1136 }, { "entropy": 0.9681955426931381, "epoch": 2.590649942987457, "grad_norm": 1.0859375, "learning_rate": 4.427962812830753e-06, "loss": 0.0383, "mean_token_accuracy": 0.9883594959974289, "num_tokens": 120754484.0, "step": 1137 }, { "entropy": 0.9667742252349854, "epoch": 2.5929304446978336, "grad_norm": 1.328125, "learning_rate": 4.426763241654383e-06, "loss": 0.0411, "mean_token_accuracy": 0.987299844622612, "num_tokens": 120860896.0, "step": 1138 }, { "entropy": 0.9654023796319962, "epoch": 2.59521094640821, "grad_norm": 1.140625, "learning_rate": 4.425562576884396e-06, "loss": 0.0373, "mean_token_accuracy": 0.9890449196100235, "num_tokens": 120967093.0, "step": 1139 }, { "entropy": 0.9694820642471313, "epoch": 2.597491448118586, "grad_norm": 0.85546875, "learning_rate": 4.424360819202264e-06, "loss": 0.0239, "mean_token_accuracy": 0.9920150488615036, "num_tokens": 121073316.0, "step": 1140 }, { "entropy": 0.966747984290123, "epoch": 2.5997719498289626, "grad_norm": 1.328125, "learning_rate": 4.423157969290081e-06, "loss": 0.0406, "mean_token_accuracy": 0.988251119852066, "num_tokens": 121179281.0, "step": 1141 }, { "entropy": 0.967594563961029, "epoch": 2.602052451539339, "grad_norm": 1.1875, "learning_rate": 4.421954027830565e-06, "loss": 0.0431, "mean_token_accuracy": 0.9866156131029129, "num_tokens": 121285926.0, "step": 1142 }, { "entropy": 0.9688167721033096, "epoch": 2.604332953249715, "grad_norm": 1.046875, "learning_rate": 4.4207489955070465e-06, "loss": 0.0288, "mean_token_accuracy": 0.9904647618532181, "num_tokens": 121391833.0, "step": 1143 }, { "entropy": 0.9688373059034348, "epoch": 2.6066134549600912, "grad_norm": 1.1796875, "learning_rate": 4.419542873003479e-06, "loss": 0.0352, "mean_token_accuracy": 0.9892513006925583, "num_tokens": 121498486.0, "step": 1144 }, { "entropy": 0.9655745774507523, "epoch": 2.6088939566704674, "grad_norm": 1.15625, "learning_rate": 4.418335661004436e-06, "loss": 0.0366, "mean_token_accuracy": 0.9884840399026871, "num_tokens": 121604701.0, "step": 1145 }, { "entropy": 0.9713713973760605, "epoch": 2.6111744583808436, "grad_norm": 1.109375, "learning_rate": 4.417127360195107e-06, "loss": 0.0398, "mean_token_accuracy": 0.9874497205018997, "num_tokens": 121711072.0, "step": 1146 }, { "entropy": 0.9671221673488617, "epoch": 2.61345496009122, "grad_norm": 1.0625, "learning_rate": 4.415917971261299e-06, "loss": 0.0426, "mean_token_accuracy": 0.9890292286872864, "num_tokens": 121817819.0, "step": 1147 }, { "entropy": 0.9724067002534866, "epoch": 2.6157354618015964, "grad_norm": 0.9296875, "learning_rate": 4.414707494889439e-06, "loss": 0.0312, "mean_token_accuracy": 0.9898326545953751, "num_tokens": 121924173.0, "step": 1148 }, { "entropy": 0.9695972204208374, "epoch": 2.6180159635119726, "grad_norm": 1.2578125, "learning_rate": 4.413495931766571e-06, "loss": 0.0403, "mean_token_accuracy": 0.9874971807003021, "num_tokens": 122030984.0, "step": 1149 }, { "entropy": 0.9627700001001358, "epoch": 2.620296465222349, "grad_norm": 1.0859375, "learning_rate": 4.412283282580352e-06, "loss": 0.025, "mean_token_accuracy": 0.9919200092554092, "num_tokens": 122136896.0, "step": 1150 }, { "entropy": 0.9624491482973099, "epoch": 2.6225769669327255, "grad_norm": 0.94921875, "learning_rate": 4.41106954801906e-06, "loss": 0.032, "mean_token_accuracy": 0.9898037761449814, "num_tokens": 122243303.0, "step": 1151 }, { "entropy": 0.9660142362117767, "epoch": 2.6248574686431017, "grad_norm": 0.89453125, "learning_rate": 4.409854728771588e-06, "loss": 0.0355, "mean_token_accuracy": 0.989554300904274, "num_tokens": 122349722.0, "step": 1152 }, { "entropy": 0.962755024433136, "epoch": 2.627137970353478, "grad_norm": 1.40625, "learning_rate": 4.4086388255274425e-06, "loss": 0.0479, "mean_token_accuracy": 0.9860374182462692, "num_tokens": 122455662.0, "step": 1153 }, { "entropy": 0.9573903828859329, "epoch": 2.629418472063854, "grad_norm": 1.140625, "learning_rate": 4.407421838976747e-06, "loss": 0.0336, "mean_token_accuracy": 0.9896968454122543, "num_tokens": 122562130.0, "step": 1154 }, { "entropy": 0.9680678397417068, "epoch": 2.6316989737742302, "grad_norm": 1.1015625, "learning_rate": 4.40620376981024e-06, "loss": 0.0334, "mean_token_accuracy": 0.989229217171669, "num_tokens": 122668528.0, "step": 1155 }, { "entropy": 0.9655135869979858, "epoch": 2.6339794754846064, "grad_norm": 1.3203125, "learning_rate": 4.404984618719275e-06, "loss": 0.0461, "mean_token_accuracy": 0.9864922314882278, "num_tokens": 122774395.0, "step": 1156 }, { "entropy": 0.9733229130506516, "epoch": 2.636259977194983, "grad_norm": 0.890625, "learning_rate": 4.403764386395817e-06, "loss": 0.0306, "mean_token_accuracy": 0.9915270954370499, "num_tokens": 122881289.0, "step": 1157 }, { "entropy": 0.9662052690982819, "epoch": 2.6385404789053593, "grad_norm": 1.2578125, "learning_rate": 4.402543073532446e-06, "loss": 0.0358, "mean_token_accuracy": 0.9880698472261429, "num_tokens": 122988037.0, "step": 1158 }, { "entropy": 0.9783133268356323, "epoch": 2.6408209806157354, "grad_norm": 1.0390625, "learning_rate": 4.401320680822357e-06, "loss": 0.0338, "mean_token_accuracy": 0.9881659150123596, "num_tokens": 123094985.0, "step": 1159 }, { "entropy": 0.9705758541822433, "epoch": 2.6431014823261116, "grad_norm": 1.1875, "learning_rate": 4.400097208959357e-06, "loss": 0.0381, "mean_token_accuracy": 0.9875228255987167, "num_tokens": 123202182.0, "step": 1160 }, { "entropy": 0.9714789241552353, "epoch": 2.6453819840364883, "grad_norm": 1.1796875, "learning_rate": 4.398872658637863e-06, "loss": 0.035, "mean_token_accuracy": 0.989600732922554, "num_tokens": 123308359.0, "step": 1161 }, { "entropy": 0.9634044170379639, "epoch": 2.6476624857468645, "grad_norm": 1.2109375, "learning_rate": 4.397647030552907e-06, "loss": 0.0443, "mean_token_accuracy": 0.9840610772371292, "num_tokens": 123414301.0, "step": 1162 }, { "entropy": 0.9699638783931732, "epoch": 2.6499429874572407, "grad_norm": 1.4375, "learning_rate": 4.396420325400132e-06, "loss": 0.0462, "mean_token_accuracy": 0.9850536435842514, "num_tokens": 123520607.0, "step": 1163 }, { "entropy": 0.9696861505508423, "epoch": 2.652223489167617, "grad_norm": 1.3359375, "learning_rate": 4.3951925438757936e-06, "loss": 0.0503, "mean_token_accuracy": 0.9849546998739243, "num_tokens": 123627356.0, "step": 1164 }, { "entropy": 0.9717001169919968, "epoch": 2.654503990877993, "grad_norm": 1.2734375, "learning_rate": 4.3939636866767535e-06, "loss": 0.0338, "mean_token_accuracy": 0.9883604645729065, "num_tokens": 123734049.0, "step": 1165 }, { "entropy": 0.9700741171836853, "epoch": 2.6567844925883692, "grad_norm": 1.1953125, "learning_rate": 4.39273375450049e-06, "loss": 0.0383, "mean_token_accuracy": 0.988090768456459, "num_tokens": 123840020.0, "step": 1166 }, { "entropy": 0.9708808064460754, "epoch": 2.659064994298746, "grad_norm": 1.09375, "learning_rate": 4.391502748045088e-06, "loss": 0.0401, "mean_token_accuracy": 0.9862798601388931, "num_tokens": 123945942.0, "step": 1167 }, { "entropy": 0.9671271592378616, "epoch": 2.661345496009122, "grad_norm": 1.109375, "learning_rate": 4.390270668009244e-06, "loss": 0.0362, "mean_token_accuracy": 0.9868480116128922, "num_tokens": 124052095.0, "step": 1168 }, { "entropy": 0.9711120277643204, "epoch": 2.6636259977194983, "grad_norm": 1.171875, "learning_rate": 4.38903751509226e-06, "loss": 0.0393, "mean_token_accuracy": 0.9883609712123871, "num_tokens": 124158677.0, "step": 1169 }, { "entropy": 0.9714861661195755, "epoch": 2.6659064994298745, "grad_norm": 1.3671875, "learning_rate": 4.3878032899940534e-06, "loss": 0.0498, "mean_token_accuracy": 0.9873904138803482, "num_tokens": 124264993.0, "step": 1170 }, { "entropy": 0.971263512969017, "epoch": 2.668187001140251, "grad_norm": 1.1796875, "learning_rate": 4.386567993415144e-06, "loss": 0.0365, "mean_token_accuracy": 0.9867577999830246, "num_tokens": 124371578.0, "step": 1171 }, { "entropy": 0.9735779315233231, "epoch": 2.6704675028506273, "grad_norm": 1.1015625, "learning_rate": 4.3853316260566635e-06, "loss": 0.0384, "mean_token_accuracy": 0.9876649677753448, "num_tokens": 124477797.0, "step": 1172 }, { "entropy": 0.9750159531831741, "epoch": 2.6727480045610035, "grad_norm": 1.0546875, "learning_rate": 4.384094188620349e-06, "loss": 0.0342, "mean_token_accuracy": 0.989717423915863, "num_tokens": 124583621.0, "step": 1173 }, { "entropy": 0.9759665429592133, "epoch": 2.6750285062713797, "grad_norm": 1.3984375, "learning_rate": 4.3828556818085485e-06, "loss": 0.0421, "mean_token_accuracy": 0.9888923168182373, "num_tokens": 124689828.0, "step": 1174 }, { "entropy": 0.9681411385536194, "epoch": 2.677309007981756, "grad_norm": 1.2421875, "learning_rate": 4.3816161063242115e-06, "loss": 0.0461, "mean_token_accuracy": 0.9851164072751999, "num_tokens": 124795641.0, "step": 1175 }, { "entropy": 0.972316563129425, "epoch": 2.679589509692132, "grad_norm": 1.015625, "learning_rate": 4.3803754628708995e-06, "loss": 0.0301, "mean_token_accuracy": 0.9907431900501251, "num_tokens": 124902479.0, "step": 1176 }, { "entropy": 0.9699146300554276, "epoch": 2.6818700114025087, "grad_norm": 0.90625, "learning_rate": 4.379133752152776e-06, "loss": 0.0269, "mean_token_accuracy": 0.9911725521087646, "num_tokens": 125009125.0, "step": 1177 }, { "entropy": 0.9652835428714752, "epoch": 2.684150513112885, "grad_norm": 1.0390625, "learning_rate": 4.377890974874614e-06, "loss": 0.0414, "mean_token_accuracy": 0.9872023910284042, "num_tokens": 125116280.0, "step": 1178 }, { "entropy": 0.9745573252439499, "epoch": 2.686431014823261, "grad_norm": 1.109375, "learning_rate": 4.376647131741787e-06, "loss": 0.0399, "mean_token_accuracy": 0.9874963015317917, "num_tokens": 125222651.0, "step": 1179 }, { "entropy": 0.9780310243368149, "epoch": 2.6887115165336373, "grad_norm": 1.1796875, "learning_rate": 4.375402223460279e-06, "loss": 0.042, "mean_token_accuracy": 0.9885870218276978, "num_tokens": 125328770.0, "step": 1180 }, { "entropy": 0.9796605706214905, "epoch": 2.690992018244014, "grad_norm": 1.0859375, "learning_rate": 4.3741562507366754e-06, "loss": 0.0331, "mean_token_accuracy": 0.9889589697122574, "num_tokens": 125435101.0, "step": 1181 }, { "entropy": 0.9712125957012177, "epoch": 2.69327251995439, "grad_norm": 1.1015625, "learning_rate": 4.3729092142781655e-06, "loss": 0.0367, "mean_token_accuracy": 0.987577423453331, "num_tokens": 125541505.0, "step": 1182 }, { "entropy": 0.9721206724643707, "epoch": 2.6955530216647663, "grad_norm": 1.3828125, "learning_rate": 4.3716611147925435e-06, "loss": 0.0337, "mean_token_accuracy": 0.9874710440635681, "num_tokens": 125647461.0, "step": 1183 }, { "entropy": 0.971238449215889, "epoch": 2.6978335233751425, "grad_norm": 1.296875, "learning_rate": 4.370411952988207e-06, "loss": 0.0385, "mean_token_accuracy": 0.9884220510721207, "num_tokens": 125753923.0, "step": 1184 }, { "entropy": 0.9707814753055573, "epoch": 2.7001140250855187, "grad_norm": 1.125, "learning_rate": 4.369161729574155e-06, "loss": 0.038, "mean_token_accuracy": 0.987330362200737, "num_tokens": 125859970.0, "step": 1185 }, { "entropy": 0.9786464869976044, "epoch": 2.702394526795895, "grad_norm": 0.97265625, "learning_rate": 4.367910445259991e-06, "loss": 0.0297, "mean_token_accuracy": 0.9905536621809006, "num_tokens": 125965827.0, "step": 1186 }, { "entropy": 0.9706958532333374, "epoch": 2.7046750285062715, "grad_norm": 1.1328125, "learning_rate": 4.36665810075592e-06, "loss": 0.0386, "mean_token_accuracy": 0.9884040504693985, "num_tokens": 126071886.0, "step": 1187 }, { "entropy": 0.9625386744737625, "epoch": 2.7069555302166477, "grad_norm": 1.1484375, "learning_rate": 4.365404696772748e-06, "loss": 0.0378, "mean_token_accuracy": 0.988489642739296, "num_tokens": 126178472.0, "step": 1188 }, { "entropy": 0.970743864774704, "epoch": 2.709236031927024, "grad_norm": 1.0625, "learning_rate": 4.364150234021883e-06, "loss": 0.0348, "mean_token_accuracy": 0.9889666587114334, "num_tokens": 126285121.0, "step": 1189 }, { "entropy": 0.9701168984174728, "epoch": 2.7115165336374, "grad_norm": 1.109375, "learning_rate": 4.362894713215334e-06, "loss": 0.046, "mean_token_accuracy": 0.9855711311101913, "num_tokens": 126392443.0, "step": 1190 }, { "entropy": 0.96549953520298, "epoch": 2.7137970353477767, "grad_norm": 0.8359375, "learning_rate": 4.361638135065711e-06, "loss": 0.0313, "mean_token_accuracy": 0.9917374849319458, "num_tokens": 126500038.0, "step": 1191 }, { "entropy": 0.9710358530282974, "epoch": 2.716077537058153, "grad_norm": 1.2734375, "learning_rate": 4.360380500286222e-06, "loss": 0.0428, "mean_token_accuracy": 0.9871172159910202, "num_tokens": 126606557.0, "step": 1192 }, { "entropy": 0.9713350236415863, "epoch": 2.718358038768529, "grad_norm": 1.2109375, "learning_rate": 4.359121809590678e-06, "loss": 0.0376, "mean_token_accuracy": 0.9875933229923248, "num_tokens": 126713052.0, "step": 1193 }, { "entropy": 0.9673966020345688, "epoch": 2.7206385404789053, "grad_norm": 1.03125, "learning_rate": 4.357862063693486e-06, "loss": 0.0327, "mean_token_accuracy": 0.9886877536773682, "num_tokens": 126818754.0, "step": 1194 }, { "entropy": 0.9702577441930771, "epoch": 2.7229190421892815, "grad_norm": 1.2109375, "learning_rate": 4.356601263309654e-06, "loss": 0.0405, "mean_token_accuracy": 0.9870445877313614, "num_tokens": 126924674.0, "step": 1195 }, { "entropy": 0.9754566699266434, "epoch": 2.7251995438996577, "grad_norm": 1.015625, "learning_rate": 4.355339409154788e-06, "loss": 0.028, "mean_token_accuracy": 0.9901687651872635, "num_tokens": 127031197.0, "step": 1196 }, { "entropy": 0.9728171825408936, "epoch": 2.7274800456100343, "grad_norm": 1.265625, "learning_rate": 4.354076501945093e-06, "loss": 0.0428, "mean_token_accuracy": 0.9873422682285309, "num_tokens": 127137543.0, "step": 1197 }, { "entropy": 0.9696276932954788, "epoch": 2.7297605473204105, "grad_norm": 1.21875, "learning_rate": 4.352812542397369e-06, "loss": 0.0372, "mean_token_accuracy": 0.9892459511756897, "num_tokens": 127243306.0, "step": 1198 }, { "entropy": 0.9704063683748245, "epoch": 2.7320410490307867, "grad_norm": 0.95703125, "learning_rate": 4.351547531229016e-06, "loss": 0.0428, "mean_token_accuracy": 0.9872656762599945, "num_tokens": 127349566.0, "step": 1199 }, { "entropy": 0.9770202934741974, "epoch": 2.734321550741163, "grad_norm": 1.171875, "learning_rate": 4.350281469158029e-06, "loss": 0.0407, "mean_token_accuracy": 0.9869884848594666, "num_tokens": 127455589.0, "step": 1200 }, { "entropy": 0.9779450744390488, "epoch": 2.7366020524515395, "grad_norm": 1.03125, "learning_rate": 4.3490143569030025e-06, "loss": 0.0304, "mean_token_accuracy": 0.9899354577064514, "num_tokens": 127561754.0, "step": 1201 }, { "entropy": 0.9746156185865402, "epoch": 2.7388825541619157, "grad_norm": 1.296875, "learning_rate": 4.347746195183123e-06, "loss": 0.0384, "mean_token_accuracy": 0.9859634935855865, "num_tokens": 127668276.0, "step": 1202 }, { "entropy": 0.9748456478118896, "epoch": 2.741163055872292, "grad_norm": 1.4140625, "learning_rate": 4.346476984718176e-06, "loss": 0.0411, "mean_token_accuracy": 0.9877548664808273, "num_tokens": 127774926.0, "step": 1203 }, { "entropy": 0.9758718907833099, "epoch": 2.743443557582668, "grad_norm": 0.91015625, "learning_rate": 4.345206726228538e-06, "loss": 0.0294, "mean_token_accuracy": 0.991203248500824, "num_tokens": 127881322.0, "step": 1204 }, { "entropy": 0.9800431281328201, "epoch": 2.7457240592930443, "grad_norm": 1.1640625, "learning_rate": 4.343935420435187e-06, "loss": 0.0393, "mean_token_accuracy": 0.9884122759103775, "num_tokens": 127987608.0, "step": 1205 }, { "entropy": 0.969656765460968, "epoch": 2.7480045610034205, "grad_norm": 0.94140625, "learning_rate": 4.34266306805969e-06, "loss": 0.0364, "mean_token_accuracy": 0.9896283447742462, "num_tokens": 128093439.0, "step": 1206 }, { "entropy": 0.9739441275596619, "epoch": 2.750285062713797, "grad_norm": 1.1171875, "learning_rate": 4.341389669824209e-06, "loss": 0.0368, "mean_token_accuracy": 0.9878317713737488, "num_tokens": 128199427.0, "step": 1207 }, { "entropy": 0.9769521355628967, "epoch": 2.7525655644241733, "grad_norm": 1.234375, "learning_rate": 4.340115226451501e-06, "loss": 0.0322, "mean_token_accuracy": 0.9907612353563309, "num_tokens": 128305873.0, "step": 1208 }, { "entropy": 0.9728002101182938, "epoch": 2.7548460661345495, "grad_norm": 1.1484375, "learning_rate": 4.338839738664915e-06, "loss": 0.0398, "mean_token_accuracy": 0.9888560920953751, "num_tokens": 128412263.0, "step": 1209 }, { "entropy": 0.9710625112056732, "epoch": 2.757126567844926, "grad_norm": 1.171875, "learning_rate": 4.3375632071883935e-06, "loss": 0.047, "mean_token_accuracy": 0.9864435493946075, "num_tokens": 128518397.0, "step": 1210 }, { "entropy": 0.9743824601173401, "epoch": 2.7594070695553023, "grad_norm": 1.21875, "learning_rate": 4.336285632746472e-06, "loss": 0.0434, "mean_token_accuracy": 0.9882290363311768, "num_tokens": 128624678.0, "step": 1211 }, { "entropy": 0.9776053875684738, "epoch": 2.7616875712656785, "grad_norm": 0.88671875, "learning_rate": 4.3350070160642754e-06, "loss": 0.0304, "mean_token_accuracy": 0.990995243191719, "num_tokens": 128730599.0, "step": 1212 }, { "entropy": 0.9766076952219009, "epoch": 2.7639680729760547, "grad_norm": 1.0859375, "learning_rate": 4.333727357867523e-06, "loss": 0.0262, "mean_token_accuracy": 0.9923071265220642, "num_tokens": 128837401.0, "step": 1213 }, { "entropy": 0.9725754261016846, "epoch": 2.766248574686431, "grad_norm": 0.9765625, "learning_rate": 4.3324466588825235e-06, "loss": 0.035, "mean_token_accuracy": 0.989327684044838, "num_tokens": 128943601.0, "step": 1214 }, { "entropy": 0.9754811972379684, "epoch": 2.768529076396807, "grad_norm": 1.1328125, "learning_rate": 4.331164919836177e-06, "loss": 0.0388, "mean_token_accuracy": 0.9877387136220932, "num_tokens": 129050157.0, "step": 1215 }, { "entropy": 0.9775501936674118, "epoch": 2.7708095781071833, "grad_norm": 1.140625, "learning_rate": 4.329882141455974e-06, "loss": 0.0424, "mean_token_accuracy": 0.9877998381853104, "num_tokens": 129155912.0, "step": 1216 }, { "entropy": 0.9765558838844299, "epoch": 2.77309007981756, "grad_norm": 1.0546875, "learning_rate": 4.3285983244699955e-06, "loss": 0.0337, "mean_token_accuracy": 0.9903810769319534, "num_tokens": 129262439.0, "step": 1217 }, { "entropy": 0.9762320518493652, "epoch": 2.775370581527936, "grad_norm": 1.0078125, "learning_rate": 4.327313469606911e-06, "loss": 0.037, "mean_token_accuracy": 0.9895225912332535, "num_tokens": 129368946.0, "step": 1218 }, { "entropy": 0.9735846072435379, "epoch": 2.7776510832383123, "grad_norm": 0.9765625, "learning_rate": 4.326027577595977e-06, "loss": 0.0278, "mean_token_accuracy": 0.9923290312290192, "num_tokens": 129475041.0, "step": 1219 }, { "entropy": 0.976649671792984, "epoch": 2.779931584948689, "grad_norm": 1.1171875, "learning_rate": 4.324740649167044e-06, "loss": 0.0342, "mean_token_accuracy": 0.988461047410965, "num_tokens": 129581323.0, "step": 1220 }, { "entropy": 0.9672993272542953, "epoch": 2.782212086659065, "grad_norm": 0.92578125, "learning_rate": 4.323452685050545e-06, "loss": 0.0325, "mean_token_accuracy": 0.9906620979309082, "num_tokens": 129688033.0, "step": 1221 }, { "entropy": 0.9759734719991684, "epoch": 2.7844925883694414, "grad_norm": 0.953125, "learning_rate": 4.3221636859775075e-06, "loss": 0.0342, "mean_token_accuracy": 0.9903420656919479, "num_tokens": 129794316.0, "step": 1222 }, { "entropy": 0.9727287143468857, "epoch": 2.7867730900798175, "grad_norm": 1.2265625, "learning_rate": 4.320873652679538e-06, "loss": 0.0389, "mean_token_accuracy": 0.9886883795261383, "num_tokens": 129900504.0, "step": 1223 }, { "entropy": 0.9718088209629059, "epoch": 2.7890535917901937, "grad_norm": 1.3125, "learning_rate": 4.319582585888838e-06, "loss": 0.0298, "mean_token_accuracy": 0.9904930591583252, "num_tokens": 130006899.0, "step": 1224 }, { "entropy": 0.976301446557045, "epoch": 2.79133409350057, "grad_norm": 1.09375, "learning_rate": 4.31829048633819e-06, "loss": 0.0362, "mean_token_accuracy": 0.9906734973192215, "num_tokens": 130113561.0, "step": 1225 }, { "entropy": 0.9724910408258438, "epoch": 2.7936145952109466, "grad_norm": 1.1875, "learning_rate": 4.316997354760965e-06, "loss": 0.0426, "mean_token_accuracy": 0.9845650643110275, "num_tokens": 130220085.0, "step": 1226 }, { "entropy": 0.978107288479805, "epoch": 2.7958950969213228, "grad_norm": 0.94921875, "learning_rate": 4.3157031918911204e-06, "loss": 0.0309, "mean_token_accuracy": 0.9902230054140091, "num_tokens": 130325887.0, "step": 1227 }, { "entropy": 0.9707192629575729, "epoch": 2.798175598631699, "grad_norm": 1.3671875, "learning_rate": 4.314407998463198e-06, "loss": 0.0421, "mean_token_accuracy": 0.9873139709234238, "num_tokens": 130432129.0, "step": 1228 }, { "entropy": 0.9766882210969925, "epoch": 2.800456100342075, "grad_norm": 0.97265625, "learning_rate": 4.3131117752123235e-06, "loss": 0.0282, "mean_token_accuracy": 0.9907979220151901, "num_tokens": 130538104.0, "step": 1229 }, { "entropy": 0.9742271900177002, "epoch": 2.802736602052452, "grad_norm": 1.453125, "learning_rate": 4.311814522874209e-06, "loss": 0.0466, "mean_token_accuracy": 0.9851456582546234, "num_tokens": 130644189.0, "step": 1230 }, { "entropy": 0.9752599745988846, "epoch": 2.805017103762828, "grad_norm": 1.2890625, "learning_rate": 4.3105162421851494e-06, "loss": 0.0389, "mean_token_accuracy": 0.9876409620046616, "num_tokens": 130750249.0, "step": 1231 }, { "entropy": 0.9773288071155548, "epoch": 2.807297605473204, "grad_norm": 1.25, "learning_rate": 4.309216933882025e-06, "loss": 0.0363, "mean_token_accuracy": 0.9887907207012177, "num_tokens": 130856322.0, "step": 1232 }, { "entropy": 0.9705926924943924, "epoch": 2.8095781071835804, "grad_norm": 1.5, "learning_rate": 4.307916598702296e-06, "loss": 0.0504, "mean_token_accuracy": 0.9847634434700012, "num_tokens": 130963114.0, "step": 1233 }, { "entropy": 0.9733545929193497, "epoch": 2.8118586088939566, "grad_norm": 1.125, "learning_rate": 4.3066152373840105e-06, "loss": 0.0474, "mean_token_accuracy": 0.9861346781253815, "num_tokens": 131069885.0, "step": 1234 }, { "entropy": 0.9824858754873276, "epoch": 2.8141391106043327, "grad_norm": 1.1953125, "learning_rate": 4.305312850665794e-06, "loss": 0.0445, "mean_token_accuracy": 0.9863295704126358, "num_tokens": 131176212.0, "step": 1235 }, { "entropy": 0.9796987473964691, "epoch": 2.8164196123147094, "grad_norm": 1.109375, "learning_rate": 4.304009439286855e-06, "loss": 0.0344, "mean_token_accuracy": 0.9911274015903473, "num_tokens": 131281914.0, "step": 1236 }, { "entropy": 0.9832790940999985, "epoch": 2.8187001140250856, "grad_norm": 1.15625, "learning_rate": 4.3027050039869865e-06, "loss": 0.0414, "mean_token_accuracy": 0.9890188276767731, "num_tokens": 131388444.0, "step": 1237 }, { "entropy": 0.985818400979042, "epoch": 2.8209806157354618, "grad_norm": 1.046875, "learning_rate": 4.301399545506561e-06, "loss": 0.0395, "mean_token_accuracy": 0.9875315725803375, "num_tokens": 131494900.0, "step": 1238 }, { "entropy": 0.9849425107240677, "epoch": 2.823261117445838, "grad_norm": 1.3125, "learning_rate": 4.3000930645865305e-06, "loss": 0.0334, "mean_token_accuracy": 0.990275114774704, "num_tokens": 131601273.0, "step": 1239 }, { "entropy": 0.9828355014324188, "epoch": 2.8255416191562146, "grad_norm": 1.4375, "learning_rate": 4.298785561968428e-06, "loss": 0.0357, "mean_token_accuracy": 0.9891202598810196, "num_tokens": 131707073.0, "step": 1240 }, { "entropy": 0.9756573885679245, "epoch": 2.827822120866591, "grad_norm": 1.125, "learning_rate": 4.297477038394368e-06, "loss": 0.0361, "mean_token_accuracy": 0.9889079630374908, "num_tokens": 131813341.0, "step": 1241 }, { "entropy": 0.9813584387302399, "epoch": 2.830102622576967, "grad_norm": 1.46875, "learning_rate": 4.296167494607043e-06, "loss": 0.0405, "mean_token_accuracy": 0.9876329898834229, "num_tokens": 131920165.0, "step": 1242 }, { "entropy": 0.988616481423378, "epoch": 2.832383124287343, "grad_norm": 1.1640625, "learning_rate": 4.294856931349724e-06, "loss": 0.0447, "mean_token_accuracy": 0.9858872145414352, "num_tokens": 132026499.0, "step": 1243 }, { "entropy": 0.9896643459796906, "epoch": 2.8346636259977194, "grad_norm": 1.203125, "learning_rate": 4.293545349366262e-06, "loss": 0.0323, "mean_token_accuracy": 0.9884999245405197, "num_tokens": 132132127.0, "step": 1244 }, { "entropy": 0.9806882292032242, "epoch": 2.8369441277080956, "grad_norm": 1.109375, "learning_rate": 4.292232749401085e-06, "loss": 0.0319, "mean_token_accuracy": 0.9903034269809723, "num_tokens": 132238219.0, "step": 1245 }, { "entropy": 0.9803225845098495, "epoch": 2.839224629418472, "grad_norm": 1.25, "learning_rate": 4.2909191321992e-06, "loss": 0.0406, "mean_token_accuracy": 0.9889567494392395, "num_tokens": 132344757.0, "step": 1246 }, { "entropy": 0.9824121743440628, "epoch": 2.8415051311288484, "grad_norm": 1.2734375, "learning_rate": 4.2896044985061915e-06, "loss": 0.0379, "mean_token_accuracy": 0.9879318624734879, "num_tokens": 132450985.0, "step": 1247 }, { "entropy": 0.9802721589803696, "epoch": 2.8437856328392246, "grad_norm": 0.80078125, "learning_rate": 4.288288849068218e-06, "loss": 0.0269, "mean_token_accuracy": 0.9912798404693604, "num_tokens": 132557290.0, "step": 1248 }, { "entropy": 0.988624632358551, "epoch": 2.846066134549601, "grad_norm": 1.4765625, "learning_rate": 4.286972184632019e-06, "loss": 0.0433, "mean_token_accuracy": 0.9853486865758896, "num_tokens": 132663885.0, "step": 1249 }, { "entropy": 0.9832646697759628, "epoch": 2.8483466362599774, "grad_norm": 0.9921875, "learning_rate": 4.285654505944906e-06, "loss": 0.0269, "mean_token_accuracy": 0.9911466836929321, "num_tokens": 132770039.0, "step": 1250 }, { "entropy": 0.9787805080413818, "epoch": 2.8506271379703536, "grad_norm": 0.890625, "learning_rate": 4.28433581375477e-06, "loss": 0.0243, "mean_token_accuracy": 0.9929917007684708, "num_tokens": 132877058.0, "step": 1251 }, { "entropy": 0.9861638098955154, "epoch": 2.85290763968073, "grad_norm": 1.453125, "learning_rate": 4.283016108810073e-06, "loss": 0.0526, "mean_token_accuracy": 0.9847084134817123, "num_tokens": 132983519.0, "step": 1252 }, { "entropy": 0.9792502224445343, "epoch": 2.855188141391106, "grad_norm": 1.125, "learning_rate": 4.281695391859854e-06, "loss": 0.0381, "mean_token_accuracy": 0.989331841468811, "num_tokens": 133090137.0, "step": 1253 }, { "entropy": 0.9833216071128845, "epoch": 2.857468643101482, "grad_norm": 0.96484375, "learning_rate": 4.28037366365373e-06, "loss": 0.0233, "mean_token_accuracy": 0.9925377517938614, "num_tokens": 133196395.0, "step": 1254 }, { "entropy": 0.9776209741830826, "epoch": 2.8597491448118584, "grad_norm": 1.28125, "learning_rate": 4.279050924941885e-06, "loss": 0.0317, "mean_token_accuracy": 0.9905758500099182, "num_tokens": 133302839.0, "step": 1255 }, { "entropy": 0.9762896448373795, "epoch": 2.862029646522235, "grad_norm": 1.109375, "learning_rate": 4.2777271764750805e-06, "loss": 0.0337, "mean_token_accuracy": 0.9899146258831024, "num_tokens": 133409396.0, "step": 1256 }, { "entropy": 0.9751209318637848, "epoch": 2.864310148232611, "grad_norm": 1.015625, "learning_rate": 4.276402419004652e-06, "loss": 0.0361, "mean_token_accuracy": 0.9878804534673691, "num_tokens": 133515438.0, "step": 1257 }, { "entropy": 0.9795290380716324, "epoch": 2.8665906499429874, "grad_norm": 0.953125, "learning_rate": 4.275076653282504e-06, "loss": 0.031, "mean_token_accuracy": 0.991499051451683, "num_tokens": 133621393.0, "step": 1258 }, { "entropy": 0.9726355373859406, "epoch": 2.8688711516533636, "grad_norm": 1.2734375, "learning_rate": 4.273749880061118e-06, "loss": 0.0489, "mean_token_accuracy": 0.985678419470787, "num_tokens": 133727325.0, "step": 1259 }, { "entropy": 0.9805049002170563, "epoch": 2.8711516533637402, "grad_norm": 1.0, "learning_rate": 4.272422100093542e-06, "loss": 0.0225, "mean_token_accuracy": 0.9918928444385529, "num_tokens": 133833457.0, "step": 1260 }, { "entropy": 0.9752933382987976, "epoch": 2.8734321550741164, "grad_norm": 1.1796875, "learning_rate": 4.271093314133401e-06, "loss": 0.0381, "mean_token_accuracy": 0.9876104295253754, "num_tokens": 133939794.0, "step": 1261 }, { "entropy": 0.9751120507717133, "epoch": 2.8757126567844926, "grad_norm": 1.2890625, "learning_rate": 4.269763522934888e-06, "loss": 0.0443, "mean_token_accuracy": 0.9860926866531372, "num_tokens": 134046245.0, "step": 1262 }, { "entropy": 0.983139768242836, "epoch": 2.877993158494869, "grad_norm": 1.5, "learning_rate": 4.268432727252765e-06, "loss": 0.0475, "mean_token_accuracy": 0.9835478961467743, "num_tokens": 134153165.0, "step": 1263 }, { "entropy": 0.976004883646965, "epoch": 2.880273660205245, "grad_norm": 1.234375, "learning_rate": 4.2671009278423665e-06, "loss": 0.0364, "mean_token_accuracy": 0.9896314740180969, "num_tokens": 134259159.0, "step": 1264 }, { "entropy": 0.9782766252756119, "epoch": 2.882554161915621, "grad_norm": 1.09375, "learning_rate": 4.265768125459597e-06, "loss": 0.0312, "mean_token_accuracy": 0.9901249557733536, "num_tokens": 134365558.0, "step": 1265 }, { "entropy": 0.9777655899524689, "epoch": 2.884834663625998, "grad_norm": 1.1484375, "learning_rate": 4.264434320860929e-06, "loss": 0.0334, "mean_token_accuracy": 0.9884685128927231, "num_tokens": 134471757.0, "step": 1266 }, { "entropy": 0.9796280711889267, "epoch": 2.887115165336374, "grad_norm": 1.3515625, "learning_rate": 4.2630995148034044e-06, "loss": 0.0415, "mean_token_accuracy": 0.9880338609218597, "num_tokens": 134577545.0, "step": 1267 }, { "entropy": 0.9775187373161316, "epoch": 2.88939566704675, "grad_norm": 0.92578125, "learning_rate": 4.261763708044633e-06, "loss": 0.0225, "mean_token_accuracy": 0.9936632066965103, "num_tokens": 134683760.0, "step": 1268 }, { "entropy": 0.9810190349817276, "epoch": 2.8916761687571264, "grad_norm": 1.046875, "learning_rate": 4.2604269013427925e-06, "loss": 0.031, "mean_token_accuracy": 0.9896435737609863, "num_tokens": 134790300.0, "step": 1269 }, { "entropy": 0.9706858992576599, "epoch": 2.893956670467503, "grad_norm": 1.46875, "learning_rate": 4.25908909545663e-06, "loss": 0.0411, "mean_token_accuracy": 0.9882616102695465, "num_tokens": 134896534.0, "step": 1270 }, { "entropy": 0.9763378202915192, "epoch": 2.8962371721778792, "grad_norm": 1.21875, "learning_rate": 4.257750291145457e-06, "loss": 0.0385, "mean_token_accuracy": 0.9879039973020554, "num_tokens": 135002261.0, "step": 1271 }, { "entropy": 0.9810153990983963, "epoch": 2.8985176738882554, "grad_norm": 1.015625, "learning_rate": 4.256410489169154e-06, "loss": 0.0296, "mean_token_accuracy": 0.9910179227590561, "num_tokens": 135108701.0, "step": 1272 }, { "entropy": 0.982611209154129, "epoch": 2.9007981755986316, "grad_norm": 0.9921875, "learning_rate": 4.255069690288166e-06, "loss": 0.0339, "mean_token_accuracy": 0.9911675900220871, "num_tokens": 135215143.0, "step": 1273 }, { "entropy": 0.9802381992340088, "epoch": 2.903078677309008, "grad_norm": 1.0546875, "learning_rate": 4.253727895263504e-06, "loss": 0.0317, "mean_token_accuracy": 0.9893192201852798, "num_tokens": 135321825.0, "step": 1274 }, { "entropy": 0.9757795184850693, "epoch": 2.905359179019384, "grad_norm": 1.21875, "learning_rate": 4.252385104856746e-06, "loss": 0.0321, "mean_token_accuracy": 0.9875144958496094, "num_tokens": 135427956.0, "step": 1275 }, { "entropy": 0.9772332906723022, "epoch": 2.9076396807297606, "grad_norm": 1.3046875, "learning_rate": 4.251041319830034e-06, "loss": 0.0394, "mean_token_accuracy": 0.9875067323446274, "num_tokens": 135534053.0, "step": 1276 }, { "entropy": 0.9727882593870163, "epoch": 2.909920182440137, "grad_norm": 1.3203125, "learning_rate": 4.249696540946074e-06, "loss": 0.0402, "mean_token_accuracy": 0.9887483566999435, "num_tokens": 135641187.0, "step": 1277 }, { "entropy": 0.9770946949720383, "epoch": 2.912200684150513, "grad_norm": 1.625, "learning_rate": 4.248350768968136e-06, "loss": 0.0586, "mean_token_accuracy": 0.982013151049614, "num_tokens": 135746821.0, "step": 1278 }, { "entropy": 0.9736180752515793, "epoch": 2.9144811858608897, "grad_norm": 0.953125, "learning_rate": 4.247004004660055e-06, "loss": 0.0311, "mean_token_accuracy": 0.9891501516103745, "num_tokens": 135852879.0, "step": 1279 }, { "entropy": 0.9722704589366913, "epoch": 2.916761687571266, "grad_norm": 1.15625, "learning_rate": 4.245656248786228e-06, "loss": 0.0371, "mean_token_accuracy": 0.9895616471767426, "num_tokens": 135958895.0, "step": 1280 }, { "entropy": 0.9761803448200226, "epoch": 2.919042189281642, "grad_norm": 1.1953125, "learning_rate": 4.2443075021116166e-06, "loss": 0.036, "mean_token_accuracy": 0.9890339970588684, "num_tokens": 136064737.0, "step": 1281 }, { "entropy": 0.9798304736614227, "epoch": 2.9213226909920182, "grad_norm": 0.94140625, "learning_rate": 4.242957765401741e-06, "loss": 0.029, "mean_token_accuracy": 0.9899295121431351, "num_tokens": 136171122.0, "step": 1282 }, { "entropy": 0.9803309142589569, "epoch": 2.9236031927023944, "grad_norm": 0.8515625, "learning_rate": 4.241607039422687e-06, "loss": 0.0234, "mean_token_accuracy": 0.992745578289032, "num_tokens": 136277271.0, "step": 1283 }, { "entropy": 0.972515344619751, "epoch": 2.9258836944127706, "grad_norm": 1.1171875, "learning_rate": 4.2402553249411e-06, "loss": 0.0347, "mean_token_accuracy": 0.9888733923435211, "num_tokens": 136383623.0, "step": 1284 }, { "entropy": 0.9766307324171066, "epoch": 2.928164196123147, "grad_norm": 1.234375, "learning_rate": 4.238902622724188e-06, "loss": 0.0394, "mean_token_accuracy": 0.9877131283283234, "num_tokens": 136490368.0, "step": 1285 }, { "entropy": 0.9758215844631195, "epoch": 2.9304446978335235, "grad_norm": 1.140625, "learning_rate": 4.237548933539718e-06, "loss": 0.0322, "mean_token_accuracy": 0.9891624003648758, "num_tokens": 136596314.0, "step": 1286 }, { "entropy": 0.9708556234836578, "epoch": 2.9327251995438997, "grad_norm": 1.2109375, "learning_rate": 4.236194258156019e-06, "loss": 0.0435, "mean_token_accuracy": 0.9863570481538773, "num_tokens": 136703023.0, "step": 1287 }, { "entropy": 0.9744352251291275, "epoch": 2.935005701254276, "grad_norm": 1.1796875, "learning_rate": 4.234838597341977e-06, "loss": 0.0463, "mean_token_accuracy": 0.9866407364606857, "num_tokens": 136809478.0, "step": 1288 }, { "entropy": 0.9749622642993927, "epoch": 2.9372862029646525, "grad_norm": 1.2578125, "learning_rate": 4.233481951867039e-06, "loss": 0.0351, "mean_token_accuracy": 0.9890402853488922, "num_tokens": 136915390.0, "step": 1289 }, { "entropy": 0.9765562415122986, "epoch": 2.9395667046750287, "grad_norm": 1.21875, "learning_rate": 4.232124322501212e-06, "loss": 0.0428, "mean_token_accuracy": 0.9859289973974228, "num_tokens": 137021565.0, "step": 1290 }, { "entropy": 0.9669390320777893, "epoch": 2.941847206385405, "grad_norm": 0.90625, "learning_rate": 4.230765710015058e-06, "loss": 0.0313, "mean_token_accuracy": 0.9912543892860413, "num_tokens": 137127293.0, "step": 1291 }, { "entropy": 0.9735155701637268, "epoch": 2.944127708095781, "grad_norm": 0.94921875, "learning_rate": 4.229406115179703e-06, "loss": 0.0345, "mean_token_accuracy": 0.9891339391469955, "num_tokens": 137233879.0, "step": 1292 }, { "entropy": 0.9680090844631195, "epoch": 2.9464082098061573, "grad_norm": 0.9921875, "learning_rate": 4.228045538766823e-06, "loss": 0.0327, "mean_token_accuracy": 0.9899452775716782, "num_tokens": 137339659.0, "step": 1293 }, { "entropy": 0.9705035537481308, "epoch": 2.9486887115165334, "grad_norm": 1.375, "learning_rate": 4.226683981548656e-06, "loss": 0.0511, "mean_token_accuracy": 0.9858105927705765, "num_tokens": 137446078.0, "step": 1294 }, { "entropy": 0.967381164431572, "epoch": 2.95096921322691, "grad_norm": 1.171875, "learning_rate": 4.2253214442979975e-06, "loss": 0.0452, "mean_token_accuracy": 0.9858705550432205, "num_tokens": 137552565.0, "step": 1295 }, { "entropy": 0.9794913083314896, "epoch": 2.9532497149372863, "grad_norm": 1.0546875, "learning_rate": 4.223957927788195e-06, "loss": 0.0401, "mean_token_accuracy": 0.9864100366830826, "num_tokens": 137658913.0, "step": 1296 }, { "entropy": 0.9758711904287338, "epoch": 2.9555302166476625, "grad_norm": 1.2109375, "learning_rate": 4.222593432793155e-06, "loss": 0.035, "mean_token_accuracy": 0.986694797873497, "num_tokens": 137765139.0, "step": 1297 }, { "entropy": 0.9758809059858322, "epoch": 2.9578107183580387, "grad_norm": 0.9453125, "learning_rate": 4.2212279600873385e-06, "loss": 0.0303, "mean_token_accuracy": 0.9897606521844864, "num_tokens": 137871823.0, "step": 1298 }, { "entropy": 0.9732758700847626, "epoch": 2.9600912200684153, "grad_norm": 1.1171875, "learning_rate": 4.219861510445762e-06, "loss": 0.0447, "mean_token_accuracy": 0.9870912432670593, "num_tokens": 137977989.0, "step": 1299 }, { "entropy": 0.9804201424121857, "epoch": 2.9623717217787915, "grad_norm": 1.3359375, "learning_rate": 4.2184940846439946e-06, "loss": 0.0462, "mean_token_accuracy": 0.9864893108606339, "num_tokens": 138084607.0, "step": 1300 }, { "entropy": 0.978776752948761, "epoch": 2.9646522234891677, "grad_norm": 1.0078125, "learning_rate": 4.217125683458162e-06, "loss": 0.0336, "mean_token_accuracy": 0.990794226527214, "num_tokens": 138190952.0, "step": 1301 }, { "entropy": 0.9712161123752594, "epoch": 2.966932725199544, "grad_norm": 1.2578125, "learning_rate": 4.215756307664941e-06, "loss": 0.036, "mean_token_accuracy": 0.9892019927501678, "num_tokens": 138297401.0, "step": 1302 }, { "entropy": 0.9754619151353836, "epoch": 2.96921322690992, "grad_norm": 1.1875, "learning_rate": 4.214385958041565e-06, "loss": 0.0475, "mean_token_accuracy": 0.9850366115570068, "num_tokens": 138403882.0, "step": 1303 }, { "entropy": 0.9759823679924011, "epoch": 2.9714937286202963, "grad_norm": 1.3515625, "learning_rate": 4.213014635365816e-06, "loss": 0.0477, "mean_token_accuracy": 0.9853897243738174, "num_tokens": 138509611.0, "step": 1304 }, { "entropy": 0.9768089056015015, "epoch": 2.973774230330673, "grad_norm": 0.8984375, "learning_rate": 4.2116423404160316e-06, "loss": 0.0328, "mean_token_accuracy": 0.9898511916399002, "num_tokens": 138616358.0, "step": 1305 }, { "entropy": 0.9736228138208389, "epoch": 2.976054732041049, "grad_norm": 1.3359375, "learning_rate": 4.210269073971098e-06, "loss": 0.0427, "mean_token_accuracy": 0.9861264824867249, "num_tokens": 138723351.0, "step": 1306 }, { "entropy": 0.9720828086137772, "epoch": 2.9783352337514253, "grad_norm": 0.85546875, "learning_rate": 4.208894836810457e-06, "loss": 0.0301, "mean_token_accuracy": 0.9904955923557281, "num_tokens": 138829731.0, "step": 1307 }, { "entropy": 0.9795053005218506, "epoch": 2.9806157354618015, "grad_norm": 1.1484375, "learning_rate": 4.207519629714099e-06, "loss": 0.0454, "mean_token_accuracy": 0.9868450611829758, "num_tokens": 138935755.0, "step": 1308 }, { "entropy": 0.97999507188797, "epoch": 2.982896237172178, "grad_norm": 1.2265625, "learning_rate": 4.206143453462562e-06, "loss": 0.0464, "mean_token_accuracy": 0.9868758767843246, "num_tokens": 139042120.0, "step": 1309 }, { "entropy": 0.9704454839229584, "epoch": 2.9851767388825543, "grad_norm": 1.03125, "learning_rate": 4.204766308836941e-06, "loss": 0.0367, "mean_token_accuracy": 0.9865719377994537, "num_tokens": 139148838.0, "step": 1310 }, { "entropy": 0.9800317883491516, "epoch": 2.9874572405929305, "grad_norm": 1.046875, "learning_rate": 4.203388196618874e-06, "loss": 0.0326, "mean_token_accuracy": 0.9890518486499786, "num_tokens": 139254891.0, "step": 1311 }, { "entropy": 0.972194641828537, "epoch": 2.9897377423033067, "grad_norm": 1.171875, "learning_rate": 4.202009117590552e-06, "loss": 0.0306, "mean_token_accuracy": 0.9903203696012497, "num_tokens": 139360866.0, "step": 1312 }, { "entropy": 0.9736153334379196, "epoch": 2.992018244013683, "grad_norm": 0.96875, "learning_rate": 4.200629072534713e-06, "loss": 0.0366, "mean_token_accuracy": 0.9872580319643021, "num_tokens": 139467205.0, "step": 1313 }, { "entropy": 0.9733667522668839, "epoch": 2.994298745724059, "grad_norm": 0.96484375, "learning_rate": 4.1992480622346455e-06, "loss": 0.0299, "mean_token_accuracy": 0.9887090921401978, "num_tokens": 139573029.0, "step": 1314 }, { "entropy": 0.9740904867649078, "epoch": 2.9965792474344357, "grad_norm": 1.2109375, "learning_rate": 4.197866087474181e-06, "loss": 0.0415, "mean_token_accuracy": 0.9847603589296341, "num_tokens": 139679066.0, "step": 1315 }, { "entropy": 0.974031999707222, "epoch": 2.998859749144812, "grad_norm": 0.99609375, "learning_rate": 4.196483149037707e-06, "loss": 0.0358, "mean_token_accuracy": 0.9881764352321625, "num_tokens": 139785101.0, "step": 1316 }, { "entropy": 0.9803251624107361, "epoch": 3.0, "grad_norm": 1.7734375, "learning_rate": 4.195099247710147e-06, "loss": 0.0372, "mean_token_accuracy": 0.9905160367488861, "num_tokens": 139823928.0, "step": 1317 }, { "entropy": 0.9742143452167511, "epoch": 3.002280501710376, "grad_norm": 1.03125, "learning_rate": 4.1937143842769805e-06, "loss": 0.036, "mean_token_accuracy": 0.9870474636554718, "num_tokens": 139929886.0, "step": 1318 }, { "entropy": 0.9706628024578094, "epoch": 3.0045610034207524, "grad_norm": 0.88671875, "learning_rate": 4.192328559524227e-06, "loss": 0.0331, "mean_token_accuracy": 0.9894536733627319, "num_tokens": 140036721.0, "step": 1319 }, { "entropy": 0.9779044836759567, "epoch": 3.006841505131129, "grad_norm": 0.96484375, "learning_rate": 4.190941774238454e-06, "loss": 0.0302, "mean_token_accuracy": 0.9915411025285721, "num_tokens": 140143486.0, "step": 1320 }, { "epoch": 3.006841505131129, "eval_entropy": 0.9731998627176757, "eval_loss": 0.03961104154586792, "eval_mean_token_accuracy": 0.9880227206324443, "eval_num_tokens": 140143486.0, "eval_runtime": 66.1279, "eval_samples_per_second": 126.8, "eval_steps_per_second": 3.977, "step": 1320 }, { "entropy": 0.9724339842796326, "epoch": 3.009122006841505, "grad_norm": 1.015625, "learning_rate": 4.1895540292067765e-06, "loss": 0.0334, "mean_token_accuracy": 0.9899168163537979, "num_tokens": 140249612.0, "step": 1321 }, { "entropy": 0.9738974571228027, "epoch": 3.0114025085518814, "grad_norm": 1.1484375, "learning_rate": 4.18816532521685e-06, "loss": 0.0351, "mean_token_accuracy": 0.9894688725471497, "num_tokens": 140356353.0, "step": 1322 }, { "entropy": 0.9714240878820419, "epoch": 3.0136830102622576, "grad_norm": 1.109375, "learning_rate": 4.1867756630568755e-06, "loss": 0.0375, "mean_token_accuracy": 0.9870206713676453, "num_tokens": 140463443.0, "step": 1323 }, { "entropy": 0.971658781170845, "epoch": 3.015963511972634, "grad_norm": 1.109375, "learning_rate": 4.1853850435156e-06, "loss": 0.0373, "mean_token_accuracy": 0.9887799322605133, "num_tokens": 140570002.0, "step": 1324 }, { "entropy": 0.9682203531265259, "epoch": 3.0182440136830104, "grad_norm": 1.1171875, "learning_rate": 4.18399346738231e-06, "loss": 0.0388, "mean_token_accuracy": 0.9870467633008957, "num_tokens": 140675540.0, "step": 1325 }, { "entropy": 0.9735381901264191, "epoch": 3.0205245153933866, "grad_norm": 1.03125, "learning_rate": 4.18260093544684e-06, "loss": 0.031, "mean_token_accuracy": 0.9896798580884933, "num_tokens": 140781280.0, "step": 1326 }, { "entropy": 0.971565455198288, "epoch": 3.022805017103763, "grad_norm": 0.99609375, "learning_rate": 4.181207448499562e-06, "loss": 0.0285, "mean_token_accuracy": 0.9912341833114624, "num_tokens": 140887809.0, "step": 1327 }, { "entropy": 0.968007892370224, "epoch": 3.025085518814139, "grad_norm": 1.0703125, "learning_rate": 4.179813007331394e-06, "loss": 0.0359, "mean_token_accuracy": 0.9884268790483475, "num_tokens": 140993987.0, "step": 1328 }, { "entropy": 0.9693398475646973, "epoch": 3.027366020524515, "grad_norm": 1.015625, "learning_rate": 4.178417612733792e-06, "loss": 0.0359, "mean_token_accuracy": 0.9885739833116531, "num_tokens": 141099938.0, "step": 1329 }, { "entropy": 0.9689221382141113, "epoch": 3.029646522234892, "grad_norm": 1.265625, "learning_rate": 4.177021265498757e-06, "loss": 0.0297, "mean_token_accuracy": 0.9913437068462372, "num_tokens": 141205840.0, "step": 1330 }, { "entropy": 0.970409631729126, "epoch": 3.031927023945268, "grad_norm": 1.1171875, "learning_rate": 4.1756239664188275e-06, "loss": 0.0294, "mean_token_accuracy": 0.9902429282665253, "num_tokens": 141312463.0, "step": 1331 }, { "entropy": 0.9689930975437164, "epoch": 3.034207525655644, "grad_norm": 0.890625, "learning_rate": 4.1742257162870835e-06, "loss": 0.0329, "mean_token_accuracy": 0.9902664422988892, "num_tokens": 141418881.0, "step": 1332 }, { "entropy": 0.9704746007919312, "epoch": 3.0364880273660204, "grad_norm": 0.88671875, "learning_rate": 4.172826515897146e-06, "loss": 0.0325, "mean_token_accuracy": 0.9903807789087296, "num_tokens": 141525065.0, "step": 1333 }, { "entropy": 0.9646623134613037, "epoch": 3.0387685290763966, "grad_norm": 1.2890625, "learning_rate": 4.171426366043172e-06, "loss": 0.0416, "mean_token_accuracy": 0.9889770299196243, "num_tokens": 141631660.0, "step": 1334 }, { "entropy": 0.9718209654092789, "epoch": 3.0410490307867732, "grad_norm": 0.98828125, "learning_rate": 4.170025267519862e-06, "loss": 0.0316, "mean_token_accuracy": 0.9912799745798111, "num_tokens": 141737776.0, "step": 1335 }, { "entropy": 0.9695717245340347, "epoch": 3.0433295324971494, "grad_norm": 1.1484375, "learning_rate": 4.168623221122451e-06, "loss": 0.0415, "mean_token_accuracy": 0.9872856140136719, "num_tokens": 141844412.0, "step": 1336 }, { "entropy": 0.965941920876503, "epoch": 3.0456100342075256, "grad_norm": 1.0078125, "learning_rate": 4.167220227646713e-06, "loss": 0.034, "mean_token_accuracy": 0.9884367734193802, "num_tokens": 141950925.0, "step": 1337 }, { "entropy": 0.9709319323301315, "epoch": 3.047890535917902, "grad_norm": 1.203125, "learning_rate": 4.165816287888962e-06, "loss": 0.037, "mean_token_accuracy": 0.9856457859277725, "num_tokens": 142057724.0, "step": 1338 }, { "entropy": 0.9704457968473434, "epoch": 3.050171037628278, "grad_norm": 0.91796875, "learning_rate": 4.164411402646045e-06, "loss": 0.0306, "mean_token_accuracy": 0.9896255731582642, "num_tokens": 142163878.0, "step": 1339 }, { "entropy": 0.9673406332731247, "epoch": 3.0524515393386547, "grad_norm": 1.2265625, "learning_rate": 4.163005572715348e-06, "loss": 0.0415, "mean_token_accuracy": 0.9884674996137619, "num_tokens": 142270062.0, "step": 1340 }, { "entropy": 0.9689511656761169, "epoch": 3.054732041049031, "grad_norm": 1.1796875, "learning_rate": 4.161598798894795e-06, "loss": 0.0333, "mean_token_accuracy": 0.9912149757146835, "num_tokens": 142376022.0, "step": 1341 }, { "entropy": 0.9696397334337234, "epoch": 3.057012542759407, "grad_norm": 1.1015625, "learning_rate": 4.160191081982841e-06, "loss": 0.036, "mean_token_accuracy": 0.9897021353244781, "num_tokens": 142482342.0, "step": 1342 }, { "entropy": 0.962868258357048, "epoch": 3.0592930444697832, "grad_norm": 1.03125, "learning_rate": 4.15878242277848e-06, "loss": 0.0354, "mean_token_accuracy": 0.9894379675388336, "num_tokens": 142589477.0, "step": 1343 }, { "entropy": 0.9624534547328949, "epoch": 3.0615735461801594, "grad_norm": 1.03125, "learning_rate": 4.157372822081241e-06, "loss": 0.0337, "mean_token_accuracy": 0.9892805069684982, "num_tokens": 142696059.0, "step": 1344 }, { "entropy": 0.9677219986915588, "epoch": 3.063854047890536, "grad_norm": 1.171875, "learning_rate": 4.155962280691184e-06, "loss": 0.0403, "mean_token_accuracy": 0.9867630451917648, "num_tokens": 142802176.0, "step": 1345 }, { "entropy": 0.9644540548324585, "epoch": 3.0661345496009123, "grad_norm": 0.953125, "learning_rate": 4.154550799408906e-06, "loss": 0.0338, "mean_token_accuracy": 0.9891113936901093, "num_tokens": 142908266.0, "step": 1346 }, { "entropy": 0.9659831672906876, "epoch": 3.0684150513112884, "grad_norm": 0.98046875, "learning_rate": 4.153138379035537e-06, "loss": 0.029, "mean_token_accuracy": 0.9899478256702423, "num_tokens": 143014560.0, "step": 1347 }, { "entropy": 0.9639919400215149, "epoch": 3.0706955530216646, "grad_norm": 1.234375, "learning_rate": 4.1517250203727395e-06, "loss": 0.0465, "mean_token_accuracy": 0.9852881580591202, "num_tokens": 143121025.0, "step": 1348 }, { "entropy": 0.9643982499837875, "epoch": 3.072976054732041, "grad_norm": 1.0390625, "learning_rate": 4.150310724222708e-06, "loss": 0.035, "mean_token_accuracy": 0.9886498004198074, "num_tokens": 143227383.0, "step": 1349 }, { "entropy": 0.9690122455358505, "epoch": 3.0752565564424175, "grad_norm": 1.2265625, "learning_rate": 4.14889549138817e-06, "loss": 0.0414, "mean_token_accuracy": 0.9868215024471283, "num_tokens": 143334303.0, "step": 1350 }, { "entropy": 0.9709564447402954, "epoch": 3.0775370581527937, "grad_norm": 1.0703125, "learning_rate": 4.147479322672383e-06, "loss": 0.0383, "mean_token_accuracy": 0.9894109964370728, "num_tokens": 143440634.0, "step": 1351 }, { "entropy": 0.9746661186218262, "epoch": 3.07981755986317, "grad_norm": 1.0234375, "learning_rate": 4.14606221887914e-06, "loss": 0.0291, "mean_token_accuracy": 0.989901602268219, "num_tokens": 143547050.0, "step": 1352 }, { "entropy": 0.9687386155128479, "epoch": 3.082098061573546, "grad_norm": 1.078125, "learning_rate": 4.144644180812759e-06, "loss": 0.0375, "mean_token_accuracy": 0.9881113618612289, "num_tokens": 143653713.0, "step": 1353 }, { "entropy": 0.9687791615724564, "epoch": 3.0843785632839227, "grad_norm": 1.46875, "learning_rate": 4.143225209278093e-06, "loss": 0.0528, "mean_token_accuracy": 0.9834932088851929, "num_tokens": 143760058.0, "step": 1354 }, { "entropy": 0.9736841768026352, "epoch": 3.086659064994299, "grad_norm": 1.0078125, "learning_rate": 4.141805305080521e-06, "loss": 0.0373, "mean_token_accuracy": 0.988155409693718, "num_tokens": 143865964.0, "step": 1355 }, { "entropy": 0.9707996249198914, "epoch": 3.088939566704675, "grad_norm": 0.94140625, "learning_rate": 4.1403844690259544e-06, "loss": 0.0336, "mean_token_accuracy": 0.9909797310829163, "num_tokens": 143972102.0, "step": 1356 }, { "entropy": 0.976124107837677, "epoch": 3.0912200684150513, "grad_norm": 1.0078125, "learning_rate": 4.138962701920831e-06, "loss": 0.0323, "mean_token_accuracy": 0.9890758395195007, "num_tokens": 144078786.0, "step": 1357 }, { "entropy": 0.9713015258312225, "epoch": 3.0935005701254275, "grad_norm": 1.0234375, "learning_rate": 4.13754000457212e-06, "loss": 0.0382, "mean_token_accuracy": 0.9887296259403229, "num_tokens": 144184845.0, "step": 1358 }, { "entropy": 0.972064733505249, "epoch": 3.095781071835804, "grad_norm": 1.2109375, "learning_rate": 4.136116377787317e-06, "loss": 0.0342, "mean_token_accuracy": 0.9892207533121109, "num_tokens": 144290958.0, "step": 1359 }, { "entropy": 0.9678972959518433, "epoch": 3.0980615735461803, "grad_norm": 1.3984375, "learning_rate": 4.134691822374445e-06, "loss": 0.0463, "mean_token_accuracy": 0.9884915053844452, "num_tokens": 144397096.0, "step": 1360 }, { "entropy": 0.9719163924455643, "epoch": 3.1003420752565565, "grad_norm": 1.109375, "learning_rate": 4.1332663391420515e-06, "loss": 0.0328, "mean_token_accuracy": 0.9885279387235641, "num_tokens": 144503172.0, "step": 1361 }, { "entropy": 0.971217468380928, "epoch": 3.1026225769669327, "grad_norm": 1.0390625, "learning_rate": 4.131839928899217e-06, "loss": 0.0319, "mean_token_accuracy": 0.9898978471755981, "num_tokens": 144610175.0, "step": 1362 }, { "entropy": 0.9728261083364487, "epoch": 3.104903078677309, "grad_norm": 1.1015625, "learning_rate": 4.130412592455542e-06, "loss": 0.0398, "mean_token_accuracy": 0.9876290708780289, "num_tokens": 144716574.0, "step": 1363 }, { "entropy": 0.974139928817749, "epoch": 3.1071835803876855, "grad_norm": 1.2578125, "learning_rate": 4.128984330621157e-06, "loss": 0.0342, "mean_token_accuracy": 0.9876402318477631, "num_tokens": 144822910.0, "step": 1364 }, { "entropy": 0.9755742996931076, "epoch": 3.1094640820980617, "grad_norm": 1.1953125, "learning_rate": 4.127555144206713e-06, "loss": 0.0333, "mean_token_accuracy": 0.9910844266414642, "num_tokens": 144929400.0, "step": 1365 }, { "entropy": 0.9740118533372879, "epoch": 3.111744583808438, "grad_norm": 1.046875, "learning_rate": 4.126125034023392e-06, "loss": 0.0349, "mean_token_accuracy": 0.9870626330375671, "num_tokens": 145035576.0, "step": 1366 }, { "entropy": 0.9646743685007095, "epoch": 3.114025085518814, "grad_norm": 1.078125, "learning_rate": 4.124694000882894e-06, "loss": 0.0329, "mean_token_accuracy": 0.9892594516277313, "num_tokens": 145142641.0, "step": 1367 }, { "entropy": 0.973348006606102, "epoch": 3.1163055872291903, "grad_norm": 0.9609375, "learning_rate": 4.123262045597447e-06, "loss": 0.0349, "mean_token_accuracy": 0.9902152866125107, "num_tokens": 145249079.0, "step": 1368 }, { "entropy": 0.9678291976451874, "epoch": 3.118586088939567, "grad_norm": 1.0234375, "learning_rate": 4.121829168979802e-06, "loss": 0.0356, "mean_token_accuracy": 0.988338902592659, "num_tokens": 145355369.0, "step": 1369 }, { "entropy": 0.9734716862440109, "epoch": 3.120866590649943, "grad_norm": 1.3671875, "learning_rate": 4.120395371843231e-06, "loss": 0.0317, "mean_token_accuracy": 0.9902591407299042, "num_tokens": 145461429.0, "step": 1370 }, { "entropy": 0.967303916811943, "epoch": 3.1231470923603193, "grad_norm": 1.0703125, "learning_rate": 4.11896065500153e-06, "loss": 0.0348, "mean_token_accuracy": 0.9883625209331512, "num_tokens": 145567336.0, "step": 1371 }, { "entropy": 0.9723329395055771, "epoch": 3.1254275940706955, "grad_norm": 1.5, "learning_rate": 4.117525019269016e-06, "loss": 0.0446, "mean_token_accuracy": 0.9876422733068466, "num_tokens": 145673566.0, "step": 1372 }, { "entropy": 0.9707440286874771, "epoch": 3.1277080957810717, "grad_norm": 1.078125, "learning_rate": 4.116088465460529e-06, "loss": 0.0363, "mean_token_accuracy": 0.9888935536146164, "num_tokens": 145779665.0, "step": 1373 }, { "entropy": 0.9760256707668304, "epoch": 3.1299885974914483, "grad_norm": 1.171875, "learning_rate": 4.114650994391428e-06, "loss": 0.0449, "mean_token_accuracy": 0.9890113025903702, "num_tokens": 145886348.0, "step": 1374 }, { "entropy": 0.9664222300052643, "epoch": 3.1322690992018245, "grad_norm": 1.453125, "learning_rate": 4.113212606877596e-06, "loss": 0.0431, "mean_token_accuracy": 0.9860242307186127, "num_tokens": 145992261.0, "step": 1375 }, { "entropy": 0.9734215885400772, "epoch": 3.1345496009122007, "grad_norm": 1.125, "learning_rate": 4.111773303735432e-06, "loss": 0.036, "mean_token_accuracy": 0.9890868663787842, "num_tokens": 146098716.0, "step": 1376 }, { "entropy": 0.9739941656589508, "epoch": 3.136830102622577, "grad_norm": 1.1015625, "learning_rate": 4.110333085781857e-06, "loss": 0.0321, "mean_token_accuracy": 0.9891481101512909, "num_tokens": 146204727.0, "step": 1377 }, { "entropy": 0.973417267203331, "epoch": 3.139110604332953, "grad_norm": 1.453125, "learning_rate": 4.108891953834312e-06, "loss": 0.032, "mean_token_accuracy": 0.9901543706655502, "num_tokens": 146310548.0, "step": 1378 }, { "entropy": 0.965936928987503, "epoch": 3.1413911060433297, "grad_norm": 1.2890625, "learning_rate": 4.107449908710753e-06, "loss": 0.043, "mean_token_accuracy": 0.9860404431819916, "num_tokens": 146417352.0, "step": 1379 }, { "entropy": 0.9699915647506714, "epoch": 3.143671607753706, "grad_norm": 1.203125, "learning_rate": 4.106006951229661e-06, "loss": 0.0392, "mean_token_accuracy": 0.9858393669128418, "num_tokens": 146523533.0, "step": 1380 }, { "entropy": 0.9650615155696869, "epoch": 3.145952109464082, "grad_norm": 1.09375, "learning_rate": 4.104563082210028e-06, "loss": 0.0396, "mean_token_accuracy": 0.9899323433637619, "num_tokens": 146630056.0, "step": 1381 }, { "entropy": 0.9687219560146332, "epoch": 3.1482326111744583, "grad_norm": 1.0234375, "learning_rate": 4.1031183024713665e-06, "loss": 0.0276, "mean_token_accuracy": 0.9909821897745132, "num_tokens": 146736304.0, "step": 1382 }, { "entropy": 0.9655210077762604, "epoch": 3.1505131128848345, "grad_norm": 0.84765625, "learning_rate": 4.101672612833706e-06, "loss": 0.0234, "mean_token_accuracy": 0.9929578006267548, "num_tokens": 146842254.0, "step": 1383 }, { "entropy": 0.9682228416204453, "epoch": 3.152793614595211, "grad_norm": 1.1484375, "learning_rate": 4.100226014117592e-06, "loss": 0.0289, "mean_token_accuracy": 0.9894851744174957, "num_tokens": 146948268.0, "step": 1384 }, { "entropy": 0.9692109376192093, "epoch": 3.1550741163055873, "grad_norm": 0.97265625, "learning_rate": 4.098778507144086e-06, "loss": 0.029, "mean_token_accuracy": 0.9908901453018188, "num_tokens": 147054602.0, "step": 1385 }, { "entropy": 0.9708583503961563, "epoch": 3.1573546180159635, "grad_norm": 1.109375, "learning_rate": 4.097330092734765e-06, "loss": 0.0355, "mean_token_accuracy": 0.987810418009758, "num_tokens": 147160276.0, "step": 1386 }, { "entropy": 0.9735705852508545, "epoch": 3.1596351197263397, "grad_norm": 0.85546875, "learning_rate": 4.09588077171172e-06, "loss": 0.028, "mean_token_accuracy": 0.9920137226581573, "num_tokens": 147267016.0, "step": 1387 }, { "entropy": 0.9685496240854263, "epoch": 3.161915621436716, "grad_norm": 0.875, "learning_rate": 4.094430544897559e-06, "loss": 0.0256, "mean_token_accuracy": 0.9918839335441589, "num_tokens": 147372950.0, "step": 1388 }, { "entropy": 0.9633918404579163, "epoch": 3.1641961231470925, "grad_norm": 1.21875, "learning_rate": 4.092979413115404e-06, "loss": 0.0338, "mean_token_accuracy": 0.9890776872634888, "num_tokens": 147479212.0, "step": 1389 }, { "entropy": 0.9676188081502914, "epoch": 3.1664766248574687, "grad_norm": 1.4296875, "learning_rate": 4.091527377188886e-06, "loss": 0.0415, "mean_token_accuracy": 0.9886800348758698, "num_tokens": 147585921.0, "step": 1390 }, { "entropy": 0.9638732224702835, "epoch": 3.168757126567845, "grad_norm": 1.125, "learning_rate": 4.090074437942155e-06, "loss": 0.0352, "mean_token_accuracy": 0.9882200807332993, "num_tokens": 147692794.0, "step": 1391 }, { "entropy": 0.9668201357126236, "epoch": 3.171037628278221, "grad_norm": 1.078125, "learning_rate": 4.088620596199872e-06, "loss": 0.0293, "mean_token_accuracy": 0.9912041872739792, "num_tokens": 147799326.0, "step": 1392 }, { "entropy": 0.9651756435632706, "epoch": 3.1733181299885973, "grad_norm": 1.0703125, "learning_rate": 4.087165852787206e-06, "loss": 0.0359, "mean_token_accuracy": 0.9914982914924622, "num_tokens": 147905837.0, "step": 1393 }, { "entropy": 0.9630156010389328, "epoch": 3.175598631698974, "grad_norm": 1.234375, "learning_rate": 4.085710208529844e-06, "loss": 0.0385, "mean_token_accuracy": 0.9872415363788605, "num_tokens": 148012862.0, "step": 1394 }, { "entropy": 0.9721795171499252, "epoch": 3.17787913340935, "grad_norm": 1.3046875, "learning_rate": 4.084253664253981e-06, "loss": 0.0402, "mean_token_accuracy": 0.9862841367721558, "num_tokens": 148119202.0, "step": 1395 }, { "entropy": 0.9669381678104401, "epoch": 3.1801596351197263, "grad_norm": 1.5, "learning_rate": 4.082796220786324e-06, "loss": 0.0384, "mean_token_accuracy": 0.9887655079364777, "num_tokens": 148225907.0, "step": 1396 }, { "entropy": 0.9700199961662292, "epoch": 3.1824401368301025, "grad_norm": 1.171875, "learning_rate": 4.081337878954088e-06, "loss": 0.0327, "mean_token_accuracy": 0.9898239225149155, "num_tokens": 148332240.0, "step": 1397 }, { "entropy": 0.9652293771505356, "epoch": 3.1847206385404787, "grad_norm": 1.0, "learning_rate": 4.079878639585002e-06, "loss": 0.0295, "mean_token_accuracy": 0.9903471320867538, "num_tokens": 148438733.0, "step": 1398 }, { "entropy": 0.9698170721530914, "epoch": 3.1870011402508553, "grad_norm": 0.9765625, "learning_rate": 4.0784185035072996e-06, "loss": 0.0329, "mean_token_accuracy": 0.9891312271356583, "num_tokens": 148544408.0, "step": 1399 }, { "entropy": 0.9667359292507172, "epoch": 3.1892816419612315, "grad_norm": 1.1171875, "learning_rate": 4.076957471549728e-06, "loss": 0.0307, "mean_token_accuracy": 0.9891311973333359, "num_tokens": 148650303.0, "step": 1400 }, { "entropy": 0.9682821035385132, "epoch": 3.1915621436716077, "grad_norm": 1.40625, "learning_rate": 4.0754955445415405e-06, "loss": 0.0454, "mean_token_accuracy": 0.9868654608726501, "num_tokens": 148756390.0, "step": 1401 }, { "entropy": 0.9715272784233093, "epoch": 3.193842645381984, "grad_norm": 1.046875, "learning_rate": 4.074032723312497e-06, "loss": 0.0349, "mean_token_accuracy": 0.9892802834510803, "num_tokens": 148862509.0, "step": 1402 }, { "entropy": 0.9697977006435394, "epoch": 3.19612314709236, "grad_norm": 1.0703125, "learning_rate": 4.072569008692868e-06, "loss": 0.037, "mean_token_accuracy": 0.989894762635231, "num_tokens": 148968533.0, "step": 1403 }, { "entropy": 0.9706413745880127, "epoch": 3.1984036488027368, "grad_norm": 1.3203125, "learning_rate": 4.071104401513429e-06, "loss": 0.0432, "mean_token_accuracy": 0.9854031205177307, "num_tokens": 149074939.0, "step": 1404 }, { "entropy": 0.9713904708623886, "epoch": 3.200684150513113, "grad_norm": 0.9375, "learning_rate": 4.069638902605464e-06, "loss": 0.0283, "mean_token_accuracy": 0.9902945458889008, "num_tokens": 149182273.0, "step": 1405 }, { "entropy": 0.9747706800699234, "epoch": 3.202964652223489, "grad_norm": 1.1015625, "learning_rate": 4.06817251280076e-06, "loss": 0.0291, "mean_token_accuracy": 0.9912142157554626, "num_tokens": 149288488.0, "step": 1406 }, { "entropy": 0.974722683429718, "epoch": 3.2052451539338653, "grad_norm": 1.078125, "learning_rate": 4.0667052329316125e-06, "loss": 0.0379, "mean_token_accuracy": 0.9899085462093353, "num_tokens": 149395124.0, "step": 1407 }, { "entropy": 0.9746892601251602, "epoch": 3.2075256556442415, "grad_norm": 1.125, "learning_rate": 4.0652370638308215e-06, "loss": 0.039, "mean_token_accuracy": 0.9880725294351578, "num_tokens": 149500815.0, "step": 1408 }, { "entropy": 0.9722046107053757, "epoch": 3.209806157354618, "grad_norm": 1.078125, "learning_rate": 4.063768006331691e-06, "loss": 0.0346, "mean_token_accuracy": 0.9869754165410995, "num_tokens": 149607084.0, "step": 1409 }, { "entropy": 0.9725812375545502, "epoch": 3.2120866590649944, "grad_norm": 0.9765625, "learning_rate": 4.06229806126803e-06, "loss": 0.0301, "mean_token_accuracy": 0.9889445900917053, "num_tokens": 149713598.0, "step": 1410 }, { "entropy": 0.9712058901786804, "epoch": 3.2143671607753705, "grad_norm": 0.9375, "learning_rate": 4.06082722947415e-06, "loss": 0.0274, "mean_token_accuracy": 0.9928667992353439, "num_tokens": 149820319.0, "step": 1411 }, { "entropy": 0.9717503488063812, "epoch": 3.2166476624857467, "grad_norm": 1.0859375, "learning_rate": 4.059355511784868e-06, "loss": 0.0331, "mean_token_accuracy": 0.9898428022861481, "num_tokens": 149926287.0, "step": 1412 }, { "entropy": 0.9744679927825928, "epoch": 3.2189281641961234, "grad_norm": 1.1875, "learning_rate": 4.057882909035503e-06, "loss": 0.0431, "mean_token_accuracy": 0.9868465662002563, "num_tokens": 150032770.0, "step": 1413 }, { "entropy": 0.9754738211631775, "epoch": 3.2212086659064996, "grad_norm": 0.953125, "learning_rate": 4.0564094220618735e-06, "loss": 0.0337, "mean_token_accuracy": 0.989643320441246, "num_tokens": 150138950.0, "step": 1414 }, { "entropy": 0.9691427648067474, "epoch": 3.2234891676168758, "grad_norm": 1.1171875, "learning_rate": 4.054935051700305e-06, "loss": 0.0303, "mean_token_accuracy": 0.9882515966892242, "num_tokens": 150245171.0, "step": 1415 }, { "entropy": 0.9756327420473099, "epoch": 3.225769669327252, "grad_norm": 0.99609375, "learning_rate": 4.053459798787619e-06, "loss": 0.0345, "mean_token_accuracy": 0.9904384166002274, "num_tokens": 150351661.0, "step": 1416 }, { "entropy": 0.9740752875804901, "epoch": 3.228050171037628, "grad_norm": 1.140625, "learning_rate": 4.0519836641611425e-06, "loss": 0.0351, "mean_token_accuracy": 0.98947973549366, "num_tokens": 150458014.0, "step": 1417 }, { "entropy": 0.9697173833847046, "epoch": 3.2303306727480043, "grad_norm": 1.40625, "learning_rate": 4.050506648658701e-06, "loss": 0.0493, "mean_token_accuracy": 0.983524426817894, "num_tokens": 150564191.0, "step": 1418 }, { "entropy": 0.9690650850534439, "epoch": 3.232611174458381, "grad_norm": 1.203125, "learning_rate": 4.049028753118619e-06, "loss": 0.0403, "mean_token_accuracy": 0.9884923994541168, "num_tokens": 150670692.0, "step": 1419 }, { "entropy": 0.9740584343671799, "epoch": 3.234891676168757, "grad_norm": 1.5625, "learning_rate": 4.047549978379721e-06, "loss": 0.0407, "mean_token_accuracy": 0.9877983331680298, "num_tokens": 150776791.0, "step": 1420 }, { "entropy": 0.9732891023159027, "epoch": 3.2371721778791334, "grad_norm": 1.3359375, "learning_rate": 4.046070325281333e-06, "loss": 0.0384, "mean_token_accuracy": 0.9877580255270004, "num_tokens": 150883009.0, "step": 1421 }, { "entropy": 0.977691113948822, "epoch": 3.2394526795895096, "grad_norm": 1.140625, "learning_rate": 4.044589794663275e-06, "loss": 0.04, "mean_token_accuracy": 0.9859212934970856, "num_tokens": 150989658.0, "step": 1422 }, { "entropy": 0.9746886938810349, "epoch": 3.241733181299886, "grad_norm": 1.0546875, "learning_rate": 4.04310838736587e-06, "loss": 0.0363, "mean_token_accuracy": 0.9897939711809158, "num_tokens": 151096698.0, "step": 1423 }, { "entropy": 0.9722186177968979, "epoch": 3.2440136830102624, "grad_norm": 0.9375, "learning_rate": 4.041626104229937e-06, "loss": 0.0369, "mean_token_accuracy": 0.9879841059446335, "num_tokens": 151202826.0, "step": 1424 }, { "entropy": 0.9750289022922516, "epoch": 3.2462941847206386, "grad_norm": 0.99609375, "learning_rate": 4.0401429460967864e-06, "loss": 0.0393, "mean_token_accuracy": 0.9846253395080566, "num_tokens": 151308237.0, "step": 1425 }, { "entropy": 0.9746069014072418, "epoch": 3.2485746864310148, "grad_norm": 1.1015625, "learning_rate": 4.038658913808235e-06, "loss": 0.0399, "mean_token_accuracy": 0.988548144698143, "num_tokens": 151414827.0, "step": 1426 }, { "entropy": 0.9694573283195496, "epoch": 3.250855188141391, "grad_norm": 0.88671875, "learning_rate": 4.037174008206589e-06, "loss": 0.0296, "mean_token_accuracy": 0.9906992763280869, "num_tokens": 151520311.0, "step": 1427 }, { "entropy": 0.9699313789606094, "epoch": 3.253135689851767, "grad_norm": 1.1015625, "learning_rate": 4.035688230134651e-06, "loss": 0.0313, "mean_token_accuracy": 0.9911614209413528, "num_tokens": 151626597.0, "step": 1428 }, { "entropy": 0.9809326976537704, "epoch": 3.255416191562144, "grad_norm": 1.0078125, "learning_rate": 4.034201580435723e-06, "loss": 0.0339, "mean_token_accuracy": 0.9911264926195145, "num_tokens": 151732727.0, "step": 1429 }, { "entropy": 0.9745475798845291, "epoch": 3.25769669327252, "grad_norm": 1.140625, "learning_rate": 4.0327140599535954e-06, "loss": 0.0386, "mean_token_accuracy": 0.9870993494987488, "num_tokens": 151838714.0, "step": 1430 }, { "entropy": 0.972587376832962, "epoch": 3.259977194982896, "grad_norm": 1.453125, "learning_rate": 4.031225669532558e-06, "loss": 0.0438, "mean_token_accuracy": 0.9883745461702347, "num_tokens": 151944484.0, "step": 1431 }, { "entropy": 0.9612488895654678, "epoch": 3.2622576966932724, "grad_norm": 1.1171875, "learning_rate": 4.029736410017392e-06, "loss": 0.0405, "mean_token_accuracy": 0.9886812269687653, "num_tokens": 152050757.0, "step": 1432 }, { "entropy": 0.9742260128259659, "epoch": 3.264538198403649, "grad_norm": 1.09375, "learning_rate": 4.028246282253373e-06, "loss": 0.0366, "mean_token_accuracy": 0.9879946112632751, "num_tokens": 152156665.0, "step": 1433 }, { "entropy": 0.9745662659406662, "epoch": 3.266818700114025, "grad_norm": 1.0234375, "learning_rate": 4.026755287086267e-06, "loss": 0.0385, "mean_token_accuracy": 0.9877077490091324, "num_tokens": 152263236.0, "step": 1434 }, { "entropy": 0.9760592728853226, "epoch": 3.2690992018244014, "grad_norm": 1.1484375, "learning_rate": 4.025263425362335e-06, "loss": 0.0329, "mean_token_accuracy": 0.989358127117157, "num_tokens": 152369175.0, "step": 1435 }, { "entropy": 0.9704379290342331, "epoch": 3.2713797035347776, "grad_norm": 0.98828125, "learning_rate": 4.0237706979283306e-06, "loss": 0.0338, "mean_token_accuracy": 0.9896824955940247, "num_tokens": 152475289.0, "step": 1436 }, { "entropy": 0.9693782925605774, "epoch": 3.2736602052451538, "grad_norm": 0.98828125, "learning_rate": 4.022277105631495e-06, "loss": 0.0224, "mean_token_accuracy": 0.9918347597122192, "num_tokens": 152581615.0, "step": 1437 }, { "entropy": 0.9746818691492081, "epoch": 3.27594070695553, "grad_norm": 1.4296875, "learning_rate": 4.020782649319563e-06, "loss": 0.0455, "mean_token_accuracy": 0.9837725907564163, "num_tokens": 152688092.0, "step": 1438 }, { "entropy": 0.9681685268878937, "epoch": 3.2782212086659066, "grad_norm": 0.9921875, "learning_rate": 4.019287329840759e-06, "loss": 0.0312, "mean_token_accuracy": 0.9890150874853134, "num_tokens": 152794002.0, "step": 1439 }, { "entropy": 0.9758435189723969, "epoch": 3.280501710376283, "grad_norm": 0.890625, "learning_rate": 4.017791148043797e-06, "loss": 0.0306, "mean_token_accuracy": 0.9916428625583649, "num_tokens": 152900284.0, "step": 1440 }, { "entropy": 0.9673158973455429, "epoch": 3.282782212086659, "grad_norm": 0.875, "learning_rate": 4.016294104777883e-06, "loss": 0.0291, "mean_token_accuracy": 0.9914217889308929, "num_tokens": 153006501.0, "step": 1441 }, { "entropy": 0.965080738067627, "epoch": 3.285062713797035, "grad_norm": 1.1796875, "learning_rate": 4.0147962008927065e-06, "loss": 0.0322, "mean_token_accuracy": 0.9890916794538498, "num_tokens": 153113808.0, "step": 1442 }, { "entropy": 0.9690580517053604, "epoch": 3.287343215507412, "grad_norm": 1.140625, "learning_rate": 4.013297437238452e-06, "loss": 0.0292, "mean_token_accuracy": 0.9895809143781662, "num_tokens": 153219670.0, "step": 1443 }, { "entropy": 0.9717966616153717, "epoch": 3.289623717217788, "grad_norm": 1.1015625, "learning_rate": 4.011797814665787e-06, "loss": 0.0419, "mean_token_accuracy": 0.9876728057861328, "num_tokens": 153325970.0, "step": 1444 }, { "entropy": 0.9654037207365036, "epoch": 3.291904218928164, "grad_norm": 1.1640625, "learning_rate": 4.010297334025869e-06, "loss": 0.0347, "mean_token_accuracy": 0.9888811260461807, "num_tokens": 153432435.0, "step": 1445 }, { "entropy": 0.966295450925827, "epoch": 3.2941847206385404, "grad_norm": 1.03125, "learning_rate": 4.008795996170341e-06, "loss": 0.0283, "mean_token_accuracy": 0.9913307726383209, "num_tokens": 153538713.0, "step": 1446 }, { "entropy": 0.9667803943157196, "epoch": 3.2964652223489166, "grad_norm": 1.171875, "learning_rate": 4.0072938019513345e-06, "loss": 0.0396, "mean_token_accuracy": 0.9867127239704132, "num_tokens": 153645561.0, "step": 1447 }, { "entropy": 0.9618817567825317, "epoch": 3.2987457240592932, "grad_norm": 1.1640625, "learning_rate": 4.0057907522214646e-06, "loss": 0.0399, "mean_token_accuracy": 0.987223669886589, "num_tokens": 153752736.0, "step": 1448 }, { "entropy": 0.9625356942415237, "epoch": 3.3010262257696694, "grad_norm": 1.09375, "learning_rate": 4.004286847833835e-06, "loss": 0.0362, "mean_token_accuracy": 0.989401176571846, "num_tokens": 153859316.0, "step": 1449 }, { "entropy": 0.969322606921196, "epoch": 3.3033067274800456, "grad_norm": 1.2578125, "learning_rate": 4.002782089642031e-06, "loss": 0.0369, "mean_token_accuracy": 0.9882229864597321, "num_tokens": 153965429.0, "step": 1450 }, { "entropy": 0.9619986861944199, "epoch": 3.305587229190422, "grad_norm": 1.078125, "learning_rate": 4.001276478500127e-06, "loss": 0.0341, "mean_token_accuracy": 0.9900808781385422, "num_tokens": 154072686.0, "step": 1451 }, { "entropy": 0.9677063971757889, "epoch": 3.307867730900798, "grad_norm": 0.984375, "learning_rate": 3.9997700152626755e-06, "loss": 0.0281, "mean_token_accuracy": 0.9891399294137955, "num_tokens": 154178658.0, "step": 1452 }, { "entropy": 0.9696138501167297, "epoch": 3.3101482326111746, "grad_norm": 0.90625, "learning_rate": 3.9982627007847186e-06, "loss": 0.0333, "mean_token_accuracy": 0.9885885715484619, "num_tokens": 154286014.0, "step": 1453 }, { "entropy": 0.9600279033184052, "epoch": 3.312428734321551, "grad_norm": 0.9921875, "learning_rate": 3.996754535921777e-06, "loss": 0.0324, "mean_token_accuracy": 0.9907027184963226, "num_tokens": 154392438.0, "step": 1454 }, { "entropy": 0.9634466767311096, "epoch": 3.314709236031927, "grad_norm": 1.0390625, "learning_rate": 3.995245521529857e-06, "loss": 0.0311, "mean_token_accuracy": 0.9900589287281036, "num_tokens": 154498629.0, "step": 1455 }, { "entropy": 0.9650966823101044, "epoch": 3.316989737742303, "grad_norm": 1.0078125, "learning_rate": 3.993735658465446e-06, "loss": 0.0327, "mean_token_accuracy": 0.9920144826173782, "num_tokens": 154604164.0, "step": 1456 }, { "entropy": 0.9697616696357727, "epoch": 3.3192702394526794, "grad_norm": 1.0859375, "learning_rate": 3.992224947585513e-06, "loss": 0.0312, "mean_token_accuracy": 0.9888024479150772, "num_tokens": 154710686.0, "step": 1457 }, { "entropy": 0.970878392457962, "epoch": 3.321550741163056, "grad_norm": 0.8046875, "learning_rate": 3.990713389747508e-06, "loss": 0.0266, "mean_token_accuracy": 0.9904832392930984, "num_tokens": 154816798.0, "step": 1458 }, { "entropy": 0.9668480455875397, "epoch": 3.3238312428734322, "grad_norm": 0.86328125, "learning_rate": 3.989200985809362e-06, "loss": 0.0268, "mean_token_accuracy": 0.9922827929258347, "num_tokens": 154923040.0, "step": 1459 }, { "entropy": 0.9656287282705307, "epoch": 3.3261117445838084, "grad_norm": 1.109375, "learning_rate": 3.987687736629487e-06, "loss": 0.0337, "mean_token_accuracy": 0.9886372685432434, "num_tokens": 155028966.0, "step": 1460 }, { "entropy": 0.9685462564229965, "epoch": 3.3283922462941846, "grad_norm": 1.1484375, "learning_rate": 3.986173643066774e-06, "loss": 0.0384, "mean_token_accuracy": 0.9884121119976044, "num_tokens": 155135019.0, "step": 1461 }, { "entropy": 0.9600776880979538, "epoch": 3.330672748004561, "grad_norm": 1.390625, "learning_rate": 3.984658705980593e-06, "loss": 0.0513, "mean_token_accuracy": 0.9845166206359863, "num_tokens": 155242037.0, "step": 1462 }, { "entropy": 0.9659885466098785, "epoch": 3.3329532497149374, "grad_norm": 1.28125, "learning_rate": 3.983142926230792e-06, "loss": 0.0412, "mean_token_accuracy": 0.9881733357906342, "num_tokens": 155348621.0, "step": 1463 }, { "entropy": 0.9651351273059845, "epoch": 3.3352337514253136, "grad_norm": 1.09375, "learning_rate": 3.981626304677701e-06, "loss": 0.034, "mean_token_accuracy": 0.9867776036262512, "num_tokens": 155455302.0, "step": 1464 }, { "entropy": 0.9689832329750061, "epoch": 3.33751425313569, "grad_norm": 1.453125, "learning_rate": 3.980108842182121e-06, "loss": 0.0387, "mean_token_accuracy": 0.9887252300977707, "num_tokens": 155562002.0, "step": 1465 }, { "entropy": 0.9688422232866287, "epoch": 3.339794754846066, "grad_norm": 1.2578125, "learning_rate": 3.978590539605338e-06, "loss": 0.0457, "mean_token_accuracy": 0.9863058030605316, "num_tokens": 155669190.0, "step": 1466 }, { "entropy": 0.9615237265825272, "epoch": 3.342075256556442, "grad_norm": 0.95703125, "learning_rate": 3.97707139780911e-06, "loss": 0.033, "mean_token_accuracy": 0.9886186420917511, "num_tokens": 155776296.0, "step": 1467 }, { "entropy": 0.9665016531944275, "epoch": 3.344355758266819, "grad_norm": 0.98828125, "learning_rate": 3.975551417655673e-06, "loss": 0.0327, "mean_token_accuracy": 0.9900273084640503, "num_tokens": 155882261.0, "step": 1468 }, { "entropy": 0.9675234854221344, "epoch": 3.346636259977195, "grad_norm": 1.1875, "learning_rate": 3.974030600007737e-06, "loss": 0.0361, "mean_token_accuracy": 0.9891727566719055, "num_tokens": 155988351.0, "step": 1469 }, { "entropy": 0.9697202742099762, "epoch": 3.3489167616875712, "grad_norm": 1.0546875, "learning_rate": 3.97250894572849e-06, "loss": 0.0353, "mean_token_accuracy": 0.987371951341629, "num_tokens": 156094553.0, "step": 1470 }, { "entropy": 0.9692974984645844, "epoch": 3.3511972633979474, "grad_norm": 1.0859375, "learning_rate": 3.970986455681593e-06, "loss": 0.0319, "mean_token_accuracy": 0.9914093017578125, "num_tokens": 156201280.0, "step": 1471 }, { "entropy": 0.9683528244495392, "epoch": 3.353477765108324, "grad_norm": 1.2109375, "learning_rate": 3.969463130731183e-06, "loss": 0.0374, "mean_token_accuracy": 0.9866133630275726, "num_tokens": 156306935.0, "step": 1472 }, { "entropy": 0.968035563826561, "epoch": 3.3557582668187003, "grad_norm": 1.1953125, "learning_rate": 3.967938971741869e-06, "loss": 0.0364, "mean_token_accuracy": 0.9890647977590561, "num_tokens": 156413831.0, "step": 1473 }, { "entropy": 0.9697747230529785, "epoch": 3.3580387685290765, "grad_norm": 0.9921875, "learning_rate": 3.966413979578734e-06, "loss": 0.0352, "mean_token_accuracy": 0.989161342382431, "num_tokens": 156520235.0, "step": 1474 }, { "entropy": 0.9771900177001953, "epoch": 3.3603192702394526, "grad_norm": 1.1796875, "learning_rate": 3.964888155107335e-06, "loss": 0.0325, "mean_token_accuracy": 0.9886705875396729, "num_tokens": 156626461.0, "step": 1475 }, { "entropy": 0.9719144552946091, "epoch": 3.362599771949829, "grad_norm": 1.2265625, "learning_rate": 3.963361499193699e-06, "loss": 0.038, "mean_token_accuracy": 0.9860211908817291, "num_tokens": 156733193.0, "step": 1476 }, { "entropy": 0.9688958823680878, "epoch": 3.364880273660205, "grad_norm": 1.0625, "learning_rate": 3.9618340127043274e-06, "loss": 0.0351, "mean_token_accuracy": 0.9882425516843796, "num_tokens": 156839306.0, "step": 1477 }, { "entropy": 0.9699642807245255, "epoch": 3.3671607753705817, "grad_norm": 1.171875, "learning_rate": 3.960305696506192e-06, "loss": 0.0326, "mean_token_accuracy": 0.9909366518259048, "num_tokens": 156945529.0, "step": 1478 }, { "entropy": 0.963573083281517, "epoch": 3.369441277080958, "grad_norm": 1.1484375, "learning_rate": 3.958776551466737e-06, "loss": 0.0382, "mean_token_accuracy": 0.9873963296413422, "num_tokens": 157051559.0, "step": 1479 }, { "entropy": 0.9688340425491333, "epoch": 3.371721778791334, "grad_norm": 1.140625, "learning_rate": 3.957246578453873e-06, "loss": 0.0377, "mean_token_accuracy": 0.9887951612472534, "num_tokens": 157157043.0, "step": 1480 }, { "entropy": 0.972258523106575, "epoch": 3.3740022805017102, "grad_norm": 1.0625, "learning_rate": 3.955715778335984e-06, "loss": 0.0344, "mean_token_accuracy": 0.9888990521430969, "num_tokens": 157262966.0, "step": 1481 }, { "entropy": 0.9687809944152832, "epoch": 3.376282782212087, "grad_norm": 1.0234375, "learning_rate": 3.954184151981924e-06, "loss": 0.0263, "mean_token_accuracy": 0.9903520196676254, "num_tokens": 157369044.0, "step": 1482 }, { "entropy": 0.9705048054456711, "epoch": 3.378563283922463, "grad_norm": 0.9921875, "learning_rate": 3.952651700261012e-06, "loss": 0.0281, "mean_token_accuracy": 0.9926184266805649, "num_tokens": 157475263.0, "step": 1483 }, { "entropy": 0.975169762969017, "epoch": 3.3808437856328393, "grad_norm": 1.3984375, "learning_rate": 3.95111842404304e-06, "loss": 0.0342, "mean_token_accuracy": 0.9889620691537857, "num_tokens": 157582550.0, "step": 1484 }, { "entropy": 0.9667369574308395, "epoch": 3.3831242873432155, "grad_norm": 0.9765625, "learning_rate": 3.949584324198266e-06, "loss": 0.0311, "mean_token_accuracy": 0.9901253283023834, "num_tokens": 157689100.0, "step": 1485 }, { "entropy": 0.972982183098793, "epoch": 3.3854047890535917, "grad_norm": 1.0234375, "learning_rate": 3.948049401597414e-06, "loss": 0.0354, "mean_token_accuracy": 0.989602267742157, "num_tokens": 157795534.0, "step": 1486 }, { "entropy": 0.9688234329223633, "epoch": 3.387685290763968, "grad_norm": 1.1171875, "learning_rate": 3.946513657111678e-06, "loss": 0.0298, "mean_token_accuracy": 0.9920781999826431, "num_tokens": 157902244.0, "step": 1487 }, { "entropy": 0.9678186178207397, "epoch": 3.3899657924743445, "grad_norm": 1.078125, "learning_rate": 3.944977091612716e-06, "loss": 0.0337, "mean_token_accuracy": 0.988512709736824, "num_tokens": 158008590.0, "step": 1488 }, { "entropy": 0.9675301313400269, "epoch": 3.3922462941847207, "grad_norm": 1.171875, "learning_rate": 3.943439705972654e-06, "loss": 0.0359, "mean_token_accuracy": 0.9904681444168091, "num_tokens": 158114896.0, "step": 1489 }, { "entropy": 0.9673101156949997, "epoch": 3.394526795895097, "grad_norm": 1.25, "learning_rate": 3.94190150106408e-06, "loss": 0.0316, "mean_token_accuracy": 0.9913520365953445, "num_tokens": 158221133.0, "step": 1490 }, { "entropy": 0.971269279718399, "epoch": 3.396807297605473, "grad_norm": 1.1953125, "learning_rate": 3.9403624777600526e-06, "loss": 0.0345, "mean_token_accuracy": 0.9888667166233063, "num_tokens": 158327262.0, "step": 1491 }, { "entropy": 0.9691773504018784, "epoch": 3.3990877993158497, "grad_norm": 1.1640625, "learning_rate": 3.938822636934089e-06, "loss": 0.0338, "mean_token_accuracy": 0.9873778522014618, "num_tokens": 158433403.0, "step": 1492 }, { "entropy": 0.970480278134346, "epoch": 3.401368301026226, "grad_norm": 1.0703125, "learning_rate": 3.937281979460175e-06, "loss": 0.0306, "mean_token_accuracy": 0.9902668297290802, "num_tokens": 158539667.0, "step": 1493 }, { "entropy": 0.9705560952425003, "epoch": 3.403648802736602, "grad_norm": 1.140625, "learning_rate": 3.9357405062127565e-06, "loss": 0.0306, "mean_token_accuracy": 0.988071620464325, "num_tokens": 158645877.0, "step": 1494 }, { "entropy": 0.9682572185993195, "epoch": 3.4059293044469783, "grad_norm": 1.5625, "learning_rate": 3.934198218066745e-06, "loss": 0.0381, "mean_token_accuracy": 0.9876181781291962, "num_tokens": 158752623.0, "step": 1495 }, { "entropy": 0.9685372710227966, "epoch": 3.4082098061573545, "grad_norm": 1.125, "learning_rate": 3.932655115897513e-06, "loss": 0.0304, "mean_token_accuracy": 0.990140438079834, "num_tokens": 158858682.0, "step": 1496 }, { "entropy": 0.9651192724704742, "epoch": 3.4104903078677307, "grad_norm": 1.1875, "learning_rate": 3.9311112005808955e-06, "loss": 0.0416, "mean_token_accuracy": 0.9882625192403793, "num_tokens": 158964953.0, "step": 1497 }, { "entropy": 0.9682807326316833, "epoch": 3.4127708095781073, "grad_norm": 1.015625, "learning_rate": 3.92956647299319e-06, "loss": 0.0361, "mean_token_accuracy": 0.9908769428730011, "num_tokens": 159071252.0, "step": 1498 }, { "entropy": 0.9723004698753357, "epoch": 3.4150513112884835, "grad_norm": 1.3125, "learning_rate": 3.928020934011153e-06, "loss": 0.0369, "mean_token_accuracy": 0.9888971745967865, "num_tokens": 159177536.0, "step": 1499 }, { "entropy": 0.9725925177335739, "epoch": 3.4173318129988597, "grad_norm": 0.921875, "learning_rate": 3.926474584512002e-06, "loss": 0.0325, "mean_token_accuracy": 0.991219162940979, "num_tokens": 159283636.0, "step": 1500 }, { "entropy": 0.9715686440467834, "epoch": 3.419612314709236, "grad_norm": 1.09375, "learning_rate": 3.924927425373417e-06, "loss": 0.0364, "mean_token_accuracy": 0.9881605356931686, "num_tokens": 159390211.0, "step": 1501 }, { "entropy": 0.9715805649757385, "epoch": 3.4218928164196125, "grad_norm": 1.0546875, "learning_rate": 3.9233794574735345e-06, "loss": 0.0359, "mean_token_accuracy": 0.9871464371681213, "num_tokens": 159496837.0, "step": 1502 }, { "entropy": 0.9667670577764511, "epoch": 3.4241733181299887, "grad_norm": 0.91015625, "learning_rate": 3.921830681690951e-06, "loss": 0.0276, "mean_token_accuracy": 0.9910220205783844, "num_tokens": 159602223.0, "step": 1503 }, { "entropy": 0.9689970016479492, "epoch": 3.426453819840365, "grad_norm": 1.2734375, "learning_rate": 3.920281098904722e-06, "loss": 0.0314, "mean_token_accuracy": 0.9901114106178284, "num_tokens": 159708819.0, "step": 1504 }, { "entropy": 0.9620591849088669, "epoch": 3.428734321550741, "grad_norm": 1.09375, "learning_rate": 3.918730709994361e-06, "loss": 0.0383, "mean_token_accuracy": 0.9864480644464493, "num_tokens": 159815257.0, "step": 1505 }, { "entropy": 0.9708369225263596, "epoch": 3.4310148232611173, "grad_norm": 1.1953125, "learning_rate": 3.91717951583984e-06, "loss": 0.0369, "mean_token_accuracy": 0.9899269938468933, "num_tokens": 159921285.0, "step": 1506 }, { "entropy": 0.9659807085990906, "epoch": 3.433295324971494, "grad_norm": 1.078125, "learning_rate": 3.915627517321584e-06, "loss": 0.0318, "mean_token_accuracy": 0.9909418821334839, "num_tokens": 160027969.0, "step": 1507 }, { "entropy": 0.9704028964042664, "epoch": 3.43557582668187, "grad_norm": 1.09375, "learning_rate": 3.914074715320479e-06, "loss": 0.0286, "mean_token_accuracy": 0.9912102073431015, "num_tokens": 160134363.0, "step": 1508 }, { "entropy": 0.9676614999771118, "epoch": 3.4378563283922463, "grad_norm": 1.125, "learning_rate": 3.912521110717866e-06, "loss": 0.0399, "mean_token_accuracy": 0.9910549372434616, "num_tokens": 160240968.0, "step": 1509 }, { "entropy": 0.972350999712944, "epoch": 3.4401368301026225, "grad_norm": 1.09375, "learning_rate": 3.9109667043955405e-06, "loss": 0.0337, "mean_token_accuracy": 0.9883147776126862, "num_tokens": 160347169.0, "step": 1510 }, { "entropy": 0.9651853442192078, "epoch": 3.4424173318129987, "grad_norm": 1.03125, "learning_rate": 3.909411497235752e-06, "loss": 0.0222, "mean_token_accuracy": 0.9933672547340393, "num_tokens": 160453257.0, "step": 1511 }, { "entropy": 0.9767438918352127, "epoch": 3.4446978335233753, "grad_norm": 0.91015625, "learning_rate": 3.907855490121208e-06, "loss": 0.0389, "mean_token_accuracy": 0.9899672716856003, "num_tokens": 160559527.0, "step": 1512 }, { "entropy": 0.966604083776474, "epoch": 3.4469783352337515, "grad_norm": 1.25, "learning_rate": 3.906298683935068e-06, "loss": 0.0403, "mean_token_accuracy": 0.9880654811859131, "num_tokens": 160665809.0, "step": 1513 }, { "entropy": 0.9667899757623672, "epoch": 3.4492588369441277, "grad_norm": 1.3515625, "learning_rate": 3.904741079560944e-06, "loss": 0.0475, "mean_token_accuracy": 0.9875009804964066, "num_tokens": 160772483.0, "step": 1514 }, { "entropy": 0.9695316255092621, "epoch": 3.451539338654504, "grad_norm": 1.0703125, "learning_rate": 3.903182677882904e-06, "loss": 0.0308, "mean_token_accuracy": 0.9909137189388275, "num_tokens": 160879049.0, "step": 1515 }, { "entropy": 0.9667837172746658, "epoch": 3.45381984036488, "grad_norm": 1.078125, "learning_rate": 3.901623479785465e-06, "loss": 0.0401, "mean_token_accuracy": 0.9891201704740524, "num_tokens": 160985025.0, "step": 1516 }, { "entropy": 0.967622235417366, "epoch": 3.4561003420752567, "grad_norm": 0.94140625, "learning_rate": 3.900063486153598e-06, "loss": 0.0342, "mean_token_accuracy": 0.9914166331291199, "num_tokens": 161091379.0, "step": 1517 }, { "entropy": 0.9717029333114624, "epoch": 3.458380843785633, "grad_norm": 1.265625, "learning_rate": 3.898502697872725e-06, "loss": 0.0378, "mean_token_accuracy": 0.9893264770507812, "num_tokens": 161197865.0, "step": 1518 }, { "entropy": 0.9709351062774658, "epoch": 3.460661345496009, "grad_norm": 1.0390625, "learning_rate": 3.896941115828721e-06, "loss": 0.0333, "mean_token_accuracy": 0.9906123131513596, "num_tokens": 161304011.0, "step": 1519 }, { "entropy": 0.9729030579328537, "epoch": 3.4629418472063853, "grad_norm": 1.0546875, "learning_rate": 3.895378740907908e-06, "loss": 0.0321, "mean_token_accuracy": 0.9898895919322968, "num_tokens": 161409713.0, "step": 1520 }, { "entropy": 0.9705152958631516, "epoch": 3.4652223489167615, "grad_norm": 1.296875, "learning_rate": 3.89381557399706e-06, "loss": 0.037, "mean_token_accuracy": 0.9876594990491867, "num_tokens": 161516059.0, "step": 1521 }, { "entropy": 0.9668488204479218, "epoch": 3.467502850627138, "grad_norm": 1.4921875, "learning_rate": 3.892251615983401e-06, "loss": 0.0369, "mean_token_accuracy": 0.9882087260484695, "num_tokens": 161622675.0, "step": 1522 }, { "entropy": 0.9694378226995468, "epoch": 3.4697833523375143, "grad_norm": 1.2578125, "learning_rate": 3.890686867754604e-06, "loss": 0.0361, "mean_token_accuracy": 0.9886024296283722, "num_tokens": 161728761.0, "step": 1523 }, { "entropy": 0.9665937721729279, "epoch": 3.4720638540478905, "grad_norm": 1.1015625, "learning_rate": 3.889121330198788e-06, "loss": 0.0356, "mean_token_accuracy": 0.9887768179178238, "num_tokens": 161835136.0, "step": 1524 }, { "entropy": 0.9667414873838425, "epoch": 3.4743443557582667, "grad_norm": 1.1484375, "learning_rate": 3.887555004204524e-06, "loss": 0.0348, "mean_token_accuracy": 0.9879112988710403, "num_tokens": 161941934.0, "step": 1525 }, { "entropy": 0.9715663343667984, "epoch": 3.476624857468643, "grad_norm": 1.21875, "learning_rate": 3.885987890660828e-06, "loss": 0.0364, "mean_token_accuracy": 0.9879909604787827, "num_tokens": 162048587.0, "step": 1526 }, { "entropy": 0.9666858464479446, "epoch": 3.4789053591790196, "grad_norm": 1.5078125, "learning_rate": 3.884419990457161e-06, "loss": 0.0506, "mean_token_accuracy": 0.9875040352344513, "num_tokens": 162154822.0, "step": 1527 }, { "entropy": 0.9694091528654099, "epoch": 3.4811858608893957, "grad_norm": 1.3671875, "learning_rate": 3.882851304483436e-06, "loss": 0.0376, "mean_token_accuracy": 0.9889573007822037, "num_tokens": 162261031.0, "step": 1528 }, { "entropy": 0.9726677983999252, "epoch": 3.483466362599772, "grad_norm": 1.0859375, "learning_rate": 3.881281833630007e-06, "loss": 0.0337, "mean_token_accuracy": 0.9909560680389404, "num_tokens": 162368089.0, "step": 1529 }, { "entropy": 0.9566008895635605, "epoch": 3.485746864310148, "grad_norm": 1.1796875, "learning_rate": 3.879711578787676e-06, "loss": 0.0417, "mean_token_accuracy": 0.9860413372516632, "num_tokens": 162474843.0, "step": 1530 }, { "entropy": 0.9718994647264481, "epoch": 3.4880273660205243, "grad_norm": 0.86328125, "learning_rate": 3.87814054084769e-06, "loss": 0.0224, "mean_token_accuracy": 0.9929602891206741, "num_tokens": 162581433.0, "step": 1531 }, { "entropy": 0.9679249823093414, "epoch": 3.490307867730901, "grad_norm": 1.171875, "learning_rate": 3.8765687207017375e-06, "loss": 0.0269, "mean_token_accuracy": 0.9910697042942047, "num_tokens": 162687820.0, "step": 1532 }, { "entropy": 0.9727962166070938, "epoch": 3.492588369441277, "grad_norm": 1.1171875, "learning_rate": 3.874996119241956e-06, "loss": 0.0309, "mean_token_accuracy": 0.989816427230835, "num_tokens": 162794246.0, "step": 1533 }, { "entropy": 0.9709348380565643, "epoch": 3.4948688711516533, "grad_norm": 1.09375, "learning_rate": 3.873422737360922e-06, "loss": 0.0386, "mean_token_accuracy": 0.9869482219219208, "num_tokens": 162900426.0, "step": 1534 }, { "entropy": 0.9655685424804688, "epoch": 3.4971493728620295, "grad_norm": 1.0546875, "learning_rate": 3.871848575951658e-06, "loss": 0.0359, "mean_token_accuracy": 0.9899090379476547, "num_tokens": 163006692.0, "step": 1535 }, { "entropy": 0.9704323559999466, "epoch": 3.4994298745724057, "grad_norm": 1.15625, "learning_rate": 3.8702736359076265e-06, "loss": 0.0374, "mean_token_accuracy": 0.9858821332454681, "num_tokens": 163113316.0, "step": 1536 }, { "entropy": 0.9678916931152344, "epoch": 3.5017103762827824, "grad_norm": 1.0703125, "learning_rate": 3.868697918122733e-06, "loss": 0.0345, "mean_token_accuracy": 0.9897069483995438, "num_tokens": 163219473.0, "step": 1537 }, { "entropy": 0.9649472236633301, "epoch": 3.5039908779931586, "grad_norm": 1.0078125, "learning_rate": 3.867121423491325e-06, "loss": 0.0239, "mean_token_accuracy": 0.9913370907306671, "num_tokens": 163326121.0, "step": 1538 }, { "entropy": 0.9663941860198975, "epoch": 3.5062713797035348, "grad_norm": 0.9765625, "learning_rate": 3.86554415290819e-06, "loss": 0.0397, "mean_token_accuracy": 0.9875494539737701, "num_tokens": 163432545.0, "step": 1539 }, { "entropy": 0.9719779193401337, "epoch": 3.508551881413911, "grad_norm": 1.2421875, "learning_rate": 3.8639661072685575e-06, "loss": 0.0373, "mean_token_accuracy": 0.9879452735185623, "num_tokens": 163538719.0, "step": 1540 }, { "epoch": 3.508551881413911, "eval_entropy": 0.9669553909464934, "eval_loss": 0.03903954103589058, "eval_mean_token_accuracy": 0.9880886315846171, "eval_num_tokens": 163538719.0, "eval_runtime": 66.0816, "eval_samples_per_second": 126.889, "eval_steps_per_second": 3.98, "step": 1540 }, { "entropy": 0.9713653922080994, "epoch": 3.5108323831242876, "grad_norm": 0.99609375, "learning_rate": 3.862387287468095e-06, "loss": 0.0274, "mean_token_accuracy": 0.9912195950746536, "num_tokens": 163645241.0, "step": 1541 }, { "entropy": 0.9706912487745285, "epoch": 3.5131128848346638, "grad_norm": 1.2421875, "learning_rate": 3.860807694402909e-06, "loss": 0.0441, "mean_token_accuracy": 0.9856588840484619, "num_tokens": 163751167.0, "step": 1542 }, { "entropy": 0.9686107337474823, "epoch": 3.51539338654504, "grad_norm": 1.09375, "learning_rate": 3.859227328969547e-06, "loss": 0.0324, "mean_token_accuracy": 0.9908144026994705, "num_tokens": 163857968.0, "step": 1543 }, { "entropy": 0.972332239151001, "epoch": 3.517673888255416, "grad_norm": 0.93359375, "learning_rate": 3.857646192064995e-06, "loss": 0.0278, "mean_token_accuracy": 0.9913387298583984, "num_tokens": 163964451.0, "step": 1544 }, { "entropy": 0.9704210460186005, "epoch": 3.5199543899657924, "grad_norm": 1.1640625, "learning_rate": 3.856064284586674e-06, "loss": 0.0279, "mean_token_accuracy": 0.9912612736225128, "num_tokens": 164070590.0, "step": 1545 }, { "entropy": 0.9703137427568436, "epoch": 3.5222348916761685, "grad_norm": 0.984375, "learning_rate": 3.854481607432445e-06, "loss": 0.0199, "mean_token_accuracy": 0.9932030886411667, "num_tokens": 164176629.0, "step": 1546 }, { "entropy": 0.9691025465726852, "epoch": 3.524515393386545, "grad_norm": 0.99609375, "learning_rate": 3.852898161500605e-06, "loss": 0.0325, "mean_token_accuracy": 0.9901134371757507, "num_tokens": 164283145.0, "step": 1547 }, { "entropy": 0.9734313637018204, "epoch": 3.5267958950969214, "grad_norm": 1.375, "learning_rate": 3.851313947689888e-06, "loss": 0.0475, "mean_token_accuracy": 0.9864543974399567, "num_tokens": 164389155.0, "step": 1548 }, { "entropy": 0.9703131765127182, "epoch": 3.5290763968072976, "grad_norm": 1.375, "learning_rate": 3.849728966899462e-06, "loss": 0.0363, "mean_token_accuracy": 0.9887517243623734, "num_tokens": 164495616.0, "step": 1549 }, { "entropy": 0.9684729725122452, "epoch": 3.5313568985176738, "grad_norm": 1.1015625, "learning_rate": 3.848143220028931e-06, "loss": 0.0288, "mean_token_accuracy": 0.9921360462903976, "num_tokens": 164600976.0, "step": 1550 }, { "entropy": 0.9630979150533676, "epoch": 3.5336374002280504, "grad_norm": 0.87109375, "learning_rate": 3.846556707978337e-06, "loss": 0.0239, "mean_token_accuracy": 0.9925627261400223, "num_tokens": 164707191.0, "step": 1551 }, { "entropy": 0.9719082117080688, "epoch": 3.5359179019384266, "grad_norm": 1.0078125, "learning_rate": 3.844969431648151e-06, "loss": 0.0271, "mean_token_accuracy": 0.9921387284994125, "num_tokens": 164813098.0, "step": 1552 }, { "entropy": 0.9686633795499802, "epoch": 3.538198403648803, "grad_norm": 1.2421875, "learning_rate": 3.843381391939281e-06, "loss": 0.0379, "mean_token_accuracy": 0.9870139211416245, "num_tokens": 164919480.0, "step": 1553 }, { "entropy": 0.9648558795452118, "epoch": 3.540478905359179, "grad_norm": 0.8984375, "learning_rate": 3.841792589753067e-06, "loss": 0.0287, "mean_token_accuracy": 0.9895726293325424, "num_tokens": 165025928.0, "step": 1554 }, { "entropy": 0.973118931055069, "epoch": 3.542759407069555, "grad_norm": 0.95703125, "learning_rate": 3.840203025991285e-06, "loss": 0.0316, "mean_token_accuracy": 0.9885214865207672, "num_tokens": 165132362.0, "step": 1555 }, { "entropy": 0.9656031578779221, "epoch": 3.5450399087799314, "grad_norm": 1.0, "learning_rate": 3.838612701556138e-06, "loss": 0.0284, "mean_token_accuracy": 0.9894514679908752, "num_tokens": 165238937.0, "step": 1556 }, { "entropy": 0.9696692228317261, "epoch": 3.547320410490308, "grad_norm": 1.09375, "learning_rate": 3.837021617350266e-06, "loss": 0.0293, "mean_token_accuracy": 0.9895401000976562, "num_tokens": 165345167.0, "step": 1557 }, { "entropy": 0.965021476149559, "epoch": 3.549600912200684, "grad_norm": 1.1328125, "learning_rate": 3.8354297742767345e-06, "loss": 0.0335, "mean_token_accuracy": 0.9900798350572586, "num_tokens": 165451759.0, "step": 1558 }, { "entropy": 0.9683341234922409, "epoch": 3.5518814139110604, "grad_norm": 1.2734375, "learning_rate": 3.833837173239044e-06, "loss": 0.0366, "mean_token_accuracy": 0.9876466691493988, "num_tokens": 165558200.0, "step": 1559 }, { "entropy": 0.9699911922216415, "epoch": 3.5541619156214366, "grad_norm": 1.0234375, "learning_rate": 3.832243815141126e-06, "loss": 0.0368, "mean_token_accuracy": 0.9892467707395554, "num_tokens": 165664110.0, "step": 1560 }, { "entropy": 0.9696343839168549, "epoch": 3.556442417331813, "grad_norm": 1.1328125, "learning_rate": 3.830649700887339e-06, "loss": 0.0343, "mean_token_accuracy": 0.9873642027378082, "num_tokens": 165770111.0, "step": 1561 }, { "entropy": 0.972029909491539, "epoch": 3.5587229190421894, "grad_norm": 1.0703125, "learning_rate": 3.829054831382471e-06, "loss": 0.0322, "mean_token_accuracy": 0.9905273169279099, "num_tokens": 165876674.0, "step": 1562 }, { "entropy": 0.9694970846176147, "epoch": 3.5610034207525656, "grad_norm": 1.0546875, "learning_rate": 3.827459207531739e-06, "loss": 0.0354, "mean_token_accuracy": 0.9915715754032135, "num_tokens": 165983221.0, "step": 1563 }, { "entropy": 0.9703568816184998, "epoch": 3.563283922462942, "grad_norm": 1.03125, "learning_rate": 3.825862830240787e-06, "loss": 0.0297, "mean_token_accuracy": 0.9883049726486206, "num_tokens": 166089630.0, "step": 1564 }, { "entropy": 0.9666757434606552, "epoch": 3.565564424173318, "grad_norm": 1.2421875, "learning_rate": 3.82426570041569e-06, "loss": 0.0308, "mean_token_accuracy": 0.9892342686653137, "num_tokens": 166195237.0, "step": 1565 }, { "entropy": 0.9685663282871246, "epoch": 3.567844925883694, "grad_norm": 0.94140625, "learning_rate": 3.822667818962948e-06, "loss": 0.0303, "mean_token_accuracy": 0.9898319244384766, "num_tokens": 166301416.0, "step": 1566 }, { "entropy": 0.9634935259819031, "epoch": 3.570125427594071, "grad_norm": 1.1328125, "learning_rate": 3.821069186789486e-06, "loss": 0.0376, "mean_token_accuracy": 0.9880454838275909, "num_tokens": 166407462.0, "step": 1567 }, { "entropy": 0.9668799787759781, "epoch": 3.572405929304447, "grad_norm": 1.015625, "learning_rate": 3.819469804802659e-06, "loss": 0.0389, "mean_token_accuracy": 0.9875521808862686, "num_tokens": 166513856.0, "step": 1568 }, { "entropy": 0.9634953141212463, "epoch": 3.574686431014823, "grad_norm": 1.1953125, "learning_rate": 3.8178696739102435e-06, "loss": 0.0428, "mean_token_accuracy": 0.9857545346021652, "num_tokens": 166620237.0, "step": 1569 }, { "entropy": 0.9615159332752228, "epoch": 3.5769669327251994, "grad_norm": 1.0390625, "learning_rate": 3.816268795020443e-06, "loss": 0.0278, "mean_token_accuracy": 0.9914974570274353, "num_tokens": 166726885.0, "step": 1570 }, { "entropy": 0.9699313044548035, "epoch": 3.579247434435576, "grad_norm": 1.09375, "learning_rate": 3.814667169041887e-06, "loss": 0.0397, "mean_token_accuracy": 0.9908092021942139, "num_tokens": 166832720.0, "step": 1571 }, { "entropy": 0.9674697071313858, "epoch": 3.581527936145952, "grad_norm": 1.03125, "learning_rate": 3.8130647968836254e-06, "loss": 0.0294, "mean_token_accuracy": 0.9909363985061646, "num_tokens": 166938956.0, "step": 1572 }, { "entropy": 0.9689359813928604, "epoch": 3.5838084378563284, "grad_norm": 0.76953125, "learning_rate": 3.811461679455136e-06, "loss": 0.0233, "mean_token_accuracy": 0.9936000257730484, "num_tokens": 167044869.0, "step": 1573 }, { "entropy": 0.9711139798164368, "epoch": 3.5860889395667046, "grad_norm": 1.234375, "learning_rate": 3.809857817666316e-06, "loss": 0.04, "mean_token_accuracy": 0.989261731505394, "num_tokens": 167151886.0, "step": 1574 }, { "entropy": 0.9674369841814041, "epoch": 3.588369441277081, "grad_norm": 1.1015625, "learning_rate": 3.808253212427486e-06, "loss": 0.0346, "mean_token_accuracy": 0.9894447028636932, "num_tokens": 167258263.0, "step": 1575 }, { "entropy": 0.9653229117393494, "epoch": 3.590649942987457, "grad_norm": 1.0078125, "learning_rate": 3.8066478646493898e-06, "loss": 0.0279, "mean_token_accuracy": 0.99195396900177, "num_tokens": 167364491.0, "step": 1576 }, { "entropy": 0.9670141190290451, "epoch": 3.5929304446978336, "grad_norm": 1.1328125, "learning_rate": 3.805041775243191e-06, "loss": 0.0413, "mean_token_accuracy": 0.9865926653146744, "num_tokens": 167470923.0, "step": 1577 }, { "entropy": 0.9665297567844391, "epoch": 3.59521094640821, "grad_norm": 1.03125, "learning_rate": 3.803434945120475e-06, "loss": 0.0375, "mean_token_accuracy": 0.9901112467050552, "num_tokens": 167577439.0, "step": 1578 }, { "entropy": 0.9628346711397171, "epoch": 3.597491448118586, "grad_norm": 0.99609375, "learning_rate": 3.801827375193249e-06, "loss": 0.0318, "mean_token_accuracy": 0.9877982288599014, "num_tokens": 167683628.0, "step": 1579 }, { "entropy": 0.9677550792694092, "epoch": 3.5997719498289626, "grad_norm": 1.125, "learning_rate": 3.8002190663739362e-06, "loss": 0.0319, "mean_token_accuracy": 0.9894658625125885, "num_tokens": 167789858.0, "step": 1580 }, { "entropy": 0.9688262343406677, "epoch": 3.602052451539339, "grad_norm": 1.1796875, "learning_rate": 3.798610019575384e-06, "loss": 0.0258, "mean_token_accuracy": 0.9916083961725235, "num_tokens": 167895724.0, "step": 1581 }, { "entropy": 0.9696432650089264, "epoch": 3.604332953249715, "grad_norm": 1.234375, "learning_rate": 3.7970002357108554e-06, "loss": 0.0384, "mean_token_accuracy": 0.9885877072811127, "num_tokens": 168001748.0, "step": 1582 }, { "entropy": 0.965540274977684, "epoch": 3.6066134549600912, "grad_norm": 1.1015625, "learning_rate": 3.7953897156940323e-06, "loss": 0.0404, "mean_token_accuracy": 0.9882766902446747, "num_tokens": 168107667.0, "step": 1583 }, { "entropy": 0.9684633910655975, "epoch": 3.6088939566704674, "grad_norm": 1.1875, "learning_rate": 3.793778460439015e-06, "loss": 0.0338, "mean_token_accuracy": 0.9882051944732666, "num_tokens": 168213720.0, "step": 1584 }, { "entropy": 0.9701700508594513, "epoch": 3.6111744583808436, "grad_norm": 1.1953125, "learning_rate": 3.792166470860321e-06, "loss": 0.0431, "mean_token_accuracy": 0.986098974943161, "num_tokens": 168319798.0, "step": 1585 }, { "entropy": 0.9641270339488983, "epoch": 3.61345496009122, "grad_norm": 1.03125, "learning_rate": 3.790553747872885e-06, "loss": 0.0378, "mean_token_accuracy": 0.9891530126333237, "num_tokens": 168425851.0, "step": 1586 }, { "entropy": 0.969876229763031, "epoch": 3.6157354618015964, "grad_norm": 1.1171875, "learning_rate": 3.788940292392056e-06, "loss": 0.037, "mean_token_accuracy": 0.9874753206968307, "num_tokens": 168532641.0, "step": 1587 }, { "entropy": 0.9617808014154434, "epoch": 3.6180159635119726, "grad_norm": 1.2890625, "learning_rate": 3.787326105333601e-06, "loss": 0.0375, "mean_token_accuracy": 0.989299476146698, "num_tokens": 168639494.0, "step": 1588 }, { "entropy": 0.9627580791711807, "epoch": 3.620296465222349, "grad_norm": 0.921875, "learning_rate": 3.7857111876137017e-06, "loss": 0.0343, "mean_token_accuracy": 0.989138275384903, "num_tokens": 168745578.0, "step": 1589 }, { "entropy": 0.9678329825401306, "epoch": 3.6225769669327255, "grad_norm": 1.2734375, "learning_rate": 3.784095540148954e-06, "loss": 0.0402, "mean_token_accuracy": 0.987067312002182, "num_tokens": 168851638.0, "step": 1590 }, { "entropy": 0.9675362706184387, "epoch": 3.6248574686431017, "grad_norm": 1.2421875, "learning_rate": 3.7824791638563674e-06, "loss": 0.0353, "mean_token_accuracy": 0.9876633733510971, "num_tokens": 168957688.0, "step": 1591 }, { "entropy": 0.9793463796377182, "epoch": 3.627137970353478, "grad_norm": 1.2734375, "learning_rate": 3.7808620596533675e-06, "loss": 0.0284, "mean_token_accuracy": 0.9916827827692032, "num_tokens": 169064187.0, "step": 1592 }, { "entropy": 0.9690855741500854, "epoch": 3.629418472063854, "grad_norm": 1.125, "learning_rate": 3.77924422845779e-06, "loss": 0.0406, "mean_token_accuracy": 0.9869449138641357, "num_tokens": 169170004.0, "step": 1593 }, { "entropy": 0.9686385542154312, "epoch": 3.6316989737742302, "grad_norm": 1.484375, "learning_rate": 3.7776256711878856e-06, "loss": 0.0462, "mean_token_accuracy": 0.988389179110527, "num_tokens": 169276154.0, "step": 1594 }, { "entropy": 0.9691651612520218, "epoch": 3.6339794754846064, "grad_norm": 1.03125, "learning_rate": 3.7760063887623155e-06, "loss": 0.0342, "mean_token_accuracy": 0.9890716671943665, "num_tokens": 169382197.0, "step": 1595 }, { "entropy": 0.9680845886468887, "epoch": 3.636259977194983, "grad_norm": 0.9765625, "learning_rate": 3.7743863821001538e-06, "loss": 0.0324, "mean_token_accuracy": 0.989521935582161, "num_tokens": 169489218.0, "step": 1596 }, { "entropy": 0.9738813042640686, "epoch": 3.6385404789053593, "grad_norm": 0.94921875, "learning_rate": 3.7727656521208843e-06, "loss": 0.0276, "mean_token_accuracy": 0.9908671379089355, "num_tokens": 169595212.0, "step": 1597 }, { "entropy": 0.9630001485347748, "epoch": 3.6408209806157354, "grad_norm": 0.94921875, "learning_rate": 3.771144199744402e-06, "loss": 0.0336, "mean_token_accuracy": 0.9887588024139404, "num_tokens": 169701013.0, "step": 1598 }, { "entropy": 0.9679297357797623, "epoch": 3.6431014823261116, "grad_norm": 0.984375, "learning_rate": 3.7695220258910124e-06, "loss": 0.0342, "mean_token_accuracy": 0.9880800694227219, "num_tokens": 169807155.0, "step": 1599 }, { "entropy": 0.9693388938903809, "epoch": 3.6453819840364883, "grad_norm": 1.1796875, "learning_rate": 3.7678991314814305e-06, "loss": 0.0426, "mean_token_accuracy": 0.987029179930687, "num_tokens": 169913003.0, "step": 1600 }, { "entropy": 0.9776575416326523, "epoch": 3.6476624857468645, "grad_norm": 1.0078125, "learning_rate": 3.766275517436779e-06, "loss": 0.0276, "mean_token_accuracy": 0.9922656267881393, "num_tokens": 170019241.0, "step": 1601 }, { "entropy": 0.9675299823284149, "epoch": 3.6499429874572407, "grad_norm": 1.046875, "learning_rate": 3.7646511846785904e-06, "loss": 0.0345, "mean_token_accuracy": 0.9885559380054474, "num_tokens": 170125826.0, "step": 1602 }, { "entropy": 0.9709239900112152, "epoch": 3.652223489167617, "grad_norm": 0.90625, "learning_rate": 3.7630261341288044e-06, "loss": 0.0269, "mean_token_accuracy": 0.9918776452541351, "num_tokens": 170231990.0, "step": 1603 }, { "entropy": 0.9694900065660477, "epoch": 3.654503990877993, "grad_norm": 1.265625, "learning_rate": 3.7614003667097674e-06, "loss": 0.0428, "mean_token_accuracy": 0.9859653562307358, "num_tokens": 170338217.0, "step": 1604 }, { "entropy": 0.9746795892715454, "epoch": 3.6567844925883692, "grad_norm": 1.296875, "learning_rate": 3.759773883344236e-06, "loss": 0.037, "mean_token_accuracy": 0.9890840649604797, "num_tokens": 170443946.0, "step": 1605 }, { "entropy": 0.9741235822439194, "epoch": 3.659064994298746, "grad_norm": 1.0234375, "learning_rate": 3.7581466849553685e-06, "loss": 0.0292, "mean_token_accuracy": 0.9924089312553406, "num_tokens": 170551191.0, "step": 1606 }, { "entropy": 0.9672462940216064, "epoch": 3.661345496009122, "grad_norm": 0.96484375, "learning_rate": 3.7565187724667324e-06, "loss": 0.0302, "mean_token_accuracy": 0.9894005656242371, "num_tokens": 170657849.0, "step": 1607 }, { "entropy": 0.9675280302762985, "epoch": 3.6636259977194983, "grad_norm": 0.9375, "learning_rate": 3.7548901468022993e-06, "loss": 0.0296, "mean_token_accuracy": 0.9911756068468094, "num_tokens": 170764075.0, "step": 1608 }, { "entropy": 0.9735906571149826, "epoch": 3.6659064994298745, "grad_norm": 1.0546875, "learning_rate": 3.7532608088864444e-06, "loss": 0.0311, "mean_token_accuracy": 0.988739863038063, "num_tokens": 170870140.0, "step": 1609 }, { "entropy": 0.9648818969726562, "epoch": 3.668187001140251, "grad_norm": 1.234375, "learning_rate": 3.75163075964395e-06, "loss": 0.0361, "mean_token_accuracy": 0.9890250414609909, "num_tokens": 170976428.0, "step": 1610 }, { "entropy": 0.968808725476265, "epoch": 3.6704675028506273, "grad_norm": 1.25, "learning_rate": 3.7500000000000005e-06, "loss": 0.0404, "mean_token_accuracy": 0.9869667589664459, "num_tokens": 171083114.0, "step": 1611 }, { "entropy": 0.9712309837341309, "epoch": 3.6727480045610035, "grad_norm": 0.875, "learning_rate": 3.748368530880183e-06, "loss": 0.0205, "mean_token_accuracy": 0.9943950027227402, "num_tokens": 171188754.0, "step": 1612 }, { "entropy": 0.9664530903100967, "epoch": 3.6750285062713797, "grad_norm": 1.015625, "learning_rate": 3.7467363532104874e-06, "loss": 0.0297, "mean_token_accuracy": 0.9893354028463364, "num_tokens": 171295424.0, "step": 1613 }, { "entropy": 0.9685205221176147, "epoch": 3.677309007981756, "grad_norm": 1.1484375, "learning_rate": 3.7451034679173082e-06, "loss": 0.0387, "mean_token_accuracy": 0.9871962666511536, "num_tokens": 171401602.0, "step": 1614 }, { "entropy": 0.9720876812934875, "epoch": 3.679589509692132, "grad_norm": 1.25, "learning_rate": 3.7434698759274366e-06, "loss": 0.0378, "mean_token_accuracy": 0.9877047836780548, "num_tokens": 171508488.0, "step": 1615 }, { "entropy": 0.9659658521413803, "epoch": 3.6818700114025087, "grad_norm": 1.0625, "learning_rate": 3.741835578168071e-06, "loss": 0.0313, "mean_token_accuracy": 0.9910999536514282, "num_tokens": 171615179.0, "step": 1616 }, { "entropy": 0.9707490503787994, "epoch": 3.684150513112885, "grad_norm": 1.15625, "learning_rate": 3.740200575566806e-06, "loss": 0.0348, "mean_token_accuracy": 0.9897696673870087, "num_tokens": 171721191.0, "step": 1617 }, { "entropy": 0.9652235209941864, "epoch": 3.686431014823261, "grad_norm": 1.2109375, "learning_rate": 3.7385648690516364e-06, "loss": 0.0363, "mean_token_accuracy": 0.9890176206827164, "num_tokens": 171827455.0, "step": 1618 }, { "entropy": 0.9706735461950302, "epoch": 3.6887115165336373, "grad_norm": 1.4140625, "learning_rate": 3.7369284595509587e-06, "loss": 0.0342, "mean_token_accuracy": 0.9903312623500824, "num_tokens": 171934079.0, "step": 1619 }, { "entropy": 0.9652814567089081, "epoch": 3.690992018244014, "grad_norm": 1.0390625, "learning_rate": 3.7352913479935672e-06, "loss": 0.033, "mean_token_accuracy": 0.9883919954299927, "num_tokens": 172040847.0, "step": 1620 }, { "entropy": 0.9724728912115097, "epoch": 3.69327251995439, "grad_norm": 1.03125, "learning_rate": 3.7336535353086546e-06, "loss": 0.0317, "mean_token_accuracy": 0.989402636885643, "num_tokens": 172147493.0, "step": 1621 }, { "entropy": 0.9683105200529099, "epoch": 3.6955530216647663, "grad_norm": 1.328125, "learning_rate": 3.7320150224258124e-06, "loss": 0.0367, "mean_token_accuracy": 0.9891185313463211, "num_tokens": 172253832.0, "step": 1622 }, { "entropy": 0.9705256223678589, "epoch": 3.6978335233751425, "grad_norm": 0.91796875, "learning_rate": 3.7303758102750274e-06, "loss": 0.0275, "mean_token_accuracy": 0.9896822720766068, "num_tokens": 172360323.0, "step": 1623 }, { "entropy": 0.9718990921974182, "epoch": 3.7001140250855187, "grad_norm": 1.125, "learning_rate": 3.7287358997866872e-06, "loss": 0.037, "mean_token_accuracy": 0.9871226847171783, "num_tokens": 172466483.0, "step": 1624 }, { "entropy": 0.9680652022361755, "epoch": 3.702394526795895, "grad_norm": 1.203125, "learning_rate": 3.7270952918915715e-06, "loss": 0.0377, "mean_token_accuracy": 0.987531378865242, "num_tokens": 172572333.0, "step": 1625 }, { "entropy": 0.9706844389438629, "epoch": 3.7046750285062715, "grad_norm": 1.125, "learning_rate": 3.7254539875208577e-06, "loss": 0.0303, "mean_token_accuracy": 0.9901835918426514, "num_tokens": 172678604.0, "step": 1626 }, { "entropy": 0.9694691300392151, "epoch": 3.7069555302166477, "grad_norm": 1.1015625, "learning_rate": 3.7238119876061196e-06, "loss": 0.0315, "mean_token_accuracy": 0.9890733063220978, "num_tokens": 172784756.0, "step": 1627 }, { "entropy": 0.9720593392848969, "epoch": 3.709236031927024, "grad_norm": 1.109375, "learning_rate": 3.7221692930793234e-06, "loss": 0.0298, "mean_token_accuracy": 0.9914166331291199, "num_tokens": 172890970.0, "step": 1628 }, { "entropy": 0.9641873091459274, "epoch": 3.7115165336374, "grad_norm": 1.2734375, "learning_rate": 3.7205259048728316e-06, "loss": 0.0326, "mean_token_accuracy": 0.9901603907346725, "num_tokens": 172997907.0, "step": 1629 }, { "entropy": 0.9624163210391998, "epoch": 3.7137970353477767, "grad_norm": 1.2109375, "learning_rate": 3.718881823919399e-06, "loss": 0.0407, "mean_token_accuracy": 0.9889312833547592, "num_tokens": 173104683.0, "step": 1630 }, { "entropy": 0.9639604240655899, "epoch": 3.716077537058153, "grad_norm": 0.984375, "learning_rate": 3.717237051152175e-06, "loss": 0.021, "mean_token_accuracy": 0.9941398501396179, "num_tokens": 173211088.0, "step": 1631 }, { "entropy": 0.9682330340147018, "epoch": 3.718358038768529, "grad_norm": 1.3203125, "learning_rate": 3.7155915875047005e-06, "loss": 0.0383, "mean_token_accuracy": 0.9879795908927917, "num_tokens": 173317416.0, "step": 1632 }, { "entropy": 0.9660849124193192, "epoch": 3.7206385404789053, "grad_norm": 1.15625, "learning_rate": 3.7139454339109082e-06, "loss": 0.0349, "mean_token_accuracy": 0.9895250797271729, "num_tokens": 173423487.0, "step": 1633 }, { "entropy": 0.9621398150920868, "epoch": 3.7229190421892815, "grad_norm": 0.890625, "learning_rate": 3.7122985913051242e-06, "loss": 0.026, "mean_token_accuracy": 0.9911150187253952, "num_tokens": 173529995.0, "step": 1634 }, { "entropy": 0.9643241912126541, "epoch": 3.7251995438996577, "grad_norm": 1.1171875, "learning_rate": 3.710651060622064e-06, "loss": 0.0296, "mean_token_accuracy": 0.9920361936092377, "num_tokens": 173636139.0, "step": 1635 }, { "entropy": 0.9660554379224777, "epoch": 3.7274800456100343, "grad_norm": 1.5, "learning_rate": 3.7090028427968343e-06, "loss": 0.0429, "mean_token_accuracy": 0.985758513212204, "num_tokens": 173741761.0, "step": 1636 }, { "entropy": 0.9717907160520554, "epoch": 3.7297605473204105, "grad_norm": 1.1640625, "learning_rate": 3.7073539387649316e-06, "loss": 0.0291, "mean_token_accuracy": 0.9902626574039459, "num_tokens": 173847765.0, "step": 1637 }, { "entropy": 0.9647781699895859, "epoch": 3.7320410490307867, "grad_norm": 1.171875, "learning_rate": 3.7057043494622423e-06, "loss": 0.0379, "mean_token_accuracy": 0.9891909658908844, "num_tokens": 173954037.0, "step": 1638 }, { "entropy": 0.9590819627046585, "epoch": 3.734321550741163, "grad_norm": 1.40625, "learning_rate": 3.704054075825042e-06, "loss": 0.0363, "mean_token_accuracy": 0.9888905733823776, "num_tokens": 174060067.0, "step": 1639 }, { "entropy": 0.9668785631656647, "epoch": 3.7366020524515395, "grad_norm": 0.93359375, "learning_rate": 3.702403118789992e-06, "loss": 0.0239, "mean_token_accuracy": 0.9928990602493286, "num_tokens": 174166529.0, "step": 1640 }, { "entropy": 0.968707948923111, "epoch": 3.7388825541619157, "grad_norm": 0.9453125, "learning_rate": 3.7007514792941462e-06, "loss": 0.0284, "mean_token_accuracy": 0.9909039735794067, "num_tokens": 174272970.0, "step": 1641 }, { "entropy": 0.9681785106658936, "epoch": 3.741163055872292, "grad_norm": 1.046875, "learning_rate": 3.6990991582749414e-06, "loss": 0.0256, "mean_token_accuracy": 0.9916411936283112, "num_tokens": 174379479.0, "step": 1642 }, { "entropy": 0.9636764675378799, "epoch": 3.743443557582668, "grad_norm": 1.4453125, "learning_rate": 3.6974461566702048e-06, "loss": 0.0439, "mean_token_accuracy": 0.986323818564415, "num_tokens": 174485981.0, "step": 1643 }, { "entropy": 0.9588721990585327, "epoch": 3.7457240592930443, "grad_norm": 1.296875, "learning_rate": 3.695792475418146e-06, "loss": 0.0427, "mean_token_accuracy": 0.9886270612478256, "num_tokens": 174592441.0, "step": 1644 }, { "entropy": 0.9631621241569519, "epoch": 3.7480045610034205, "grad_norm": 1.0078125, "learning_rate": 3.6941381154573646e-06, "loss": 0.0281, "mean_token_accuracy": 0.9935251176357269, "num_tokens": 174698589.0, "step": 1645 }, { "entropy": 0.9712978601455688, "epoch": 3.750285062713797, "grad_norm": 1.3046875, "learning_rate": 3.692483077726843e-06, "loss": 0.043, "mean_token_accuracy": 0.9884666353464127, "num_tokens": 174804993.0, "step": 1646 }, { "entropy": 0.9618167281150818, "epoch": 3.7525655644241733, "grad_norm": 0.953125, "learning_rate": 3.6908273631659475e-06, "loss": 0.0336, "mean_token_accuracy": 0.9892315566539764, "num_tokens": 174910657.0, "step": 1647 }, { "entropy": 0.9655503034591675, "epoch": 3.7548460661345495, "grad_norm": 1.140625, "learning_rate": 3.689170972714431e-06, "loss": 0.042, "mean_token_accuracy": 0.9877781867980957, "num_tokens": 175017215.0, "step": 1648 }, { "entropy": 0.9652870744466782, "epoch": 3.757126567844926, "grad_norm": 0.83984375, "learning_rate": 3.6875139073124277e-06, "loss": 0.0256, "mean_token_accuracy": 0.9917975813150406, "num_tokens": 175123399.0, "step": 1649 }, { "entropy": 0.9589584171772003, "epoch": 3.7594070695553023, "grad_norm": 1.046875, "learning_rate": 3.6858561679004567e-06, "loss": 0.0388, "mean_token_accuracy": 0.9856279045343399, "num_tokens": 175229752.0, "step": 1650 }, { "entropy": 0.9664766639471054, "epoch": 3.7616875712656785, "grad_norm": 1.0, "learning_rate": 3.684197755419419e-06, "loss": 0.0338, "mean_token_accuracy": 0.9887282699346542, "num_tokens": 175336229.0, "step": 1651 }, { "entropy": 0.9644445329904556, "epoch": 3.7639680729760547, "grad_norm": 1.0, "learning_rate": 3.6825386708105963e-06, "loss": 0.03, "mean_token_accuracy": 0.989570289850235, "num_tokens": 175442406.0, "step": 1652 }, { "entropy": 0.9687969982624054, "epoch": 3.766248574686431, "grad_norm": 1.0703125, "learning_rate": 3.6808789150156545e-06, "loss": 0.0269, "mean_token_accuracy": 0.9896868169307709, "num_tokens": 175548669.0, "step": 1653 }, { "entropy": 0.96585713326931, "epoch": 3.768529076396807, "grad_norm": 1.078125, "learning_rate": 3.679218488976638e-06, "loss": 0.0409, "mean_token_accuracy": 0.985218808054924, "num_tokens": 175655170.0, "step": 1654 }, { "entropy": 0.9698283523321152, "epoch": 3.7708095781071833, "grad_norm": 1.0546875, "learning_rate": 3.677557393635973e-06, "loss": 0.0321, "mean_token_accuracy": 0.9905073195695877, "num_tokens": 175761531.0, "step": 1655 }, { "entropy": 0.9648319184780121, "epoch": 3.77309007981756, "grad_norm": 0.9453125, "learning_rate": 3.6758956299364643e-06, "loss": 0.0268, "mean_token_accuracy": 0.9907392710447311, "num_tokens": 175867420.0, "step": 1656 }, { "entropy": 0.9611748307943344, "epoch": 3.775370581527936, "grad_norm": 1.2265625, "learning_rate": 3.674233198821299e-06, "loss": 0.0405, "mean_token_accuracy": 0.9873669445514679, "num_tokens": 175974438.0, "step": 1657 }, { "entropy": 0.9636261165142059, "epoch": 3.7776510832383123, "grad_norm": 0.9921875, "learning_rate": 3.6725701012340387e-06, "loss": 0.0262, "mean_token_accuracy": 0.9937563985586166, "num_tokens": 176080627.0, "step": 1658 }, { "entropy": 0.9619545638561249, "epoch": 3.779931584948689, "grad_norm": 1.1796875, "learning_rate": 3.6709063381186267e-06, "loss": 0.0396, "mean_token_accuracy": 0.9874506294727325, "num_tokens": 176187355.0, "step": 1659 }, { "entropy": 0.966101884841919, "epoch": 3.782212086659065, "grad_norm": 1.1015625, "learning_rate": 3.6692419104193823e-06, "loss": 0.0343, "mean_token_accuracy": 0.9879247546195984, "num_tokens": 176293625.0, "step": 1660 }, { "entropy": 0.9666341543197632, "epoch": 3.7844925883694414, "grad_norm": 1.046875, "learning_rate": 3.6675768190810023e-06, "loss": 0.0299, "mean_token_accuracy": 0.9908340275287628, "num_tokens": 176399817.0, "step": 1661 }, { "entropy": 0.9640224575996399, "epoch": 3.7867730900798175, "grad_norm": 1.1953125, "learning_rate": 3.665911065048561e-06, "loss": 0.0363, "mean_token_accuracy": 0.9889632016420364, "num_tokens": 176506733.0, "step": 1662 }, { "entropy": 0.9630848169326782, "epoch": 3.7890535917901937, "grad_norm": 0.9921875, "learning_rate": 3.6642446492675075e-06, "loss": 0.0313, "mean_token_accuracy": 0.9893649071455002, "num_tokens": 176612874.0, "step": 1663 }, { "entropy": 0.9733673185110092, "epoch": 3.79133409350057, "grad_norm": 1.0, "learning_rate": 3.6625775726836677e-06, "loss": 0.0299, "mean_token_accuracy": 0.9899280518293381, "num_tokens": 176719182.0, "step": 1664 }, { "entropy": 0.9600510895252228, "epoch": 3.7936145952109466, "grad_norm": 1.0390625, "learning_rate": 3.6609098362432425e-06, "loss": 0.027, "mean_token_accuracy": 0.9918528646230698, "num_tokens": 176825208.0, "step": 1665 }, { "entropy": 0.9657678455114365, "epoch": 3.7958950969213228, "grad_norm": 1.0859375, "learning_rate": 3.659241440892806e-06, "loss": 0.0357, "mean_token_accuracy": 0.9883759766817093, "num_tokens": 176931530.0, "step": 1666 }, { "entropy": 0.9671884775161743, "epoch": 3.798175598631699, "grad_norm": 1.21875, "learning_rate": 3.6575723875793085e-06, "loss": 0.0354, "mean_token_accuracy": 0.9875563085079193, "num_tokens": 177037733.0, "step": 1667 }, { "entropy": 0.9664320200681686, "epoch": 3.800456100342075, "grad_norm": 1.203125, "learning_rate": 3.655902677250071e-06, "loss": 0.0314, "mean_token_accuracy": 0.9898822754621506, "num_tokens": 177144180.0, "step": 1668 }, { "entropy": 0.9601629674434662, "epoch": 3.802736602052452, "grad_norm": 1.1640625, "learning_rate": 3.6542323108527896e-06, "loss": 0.0344, "mean_token_accuracy": 0.9877012521028519, "num_tokens": 177250528.0, "step": 1669 }, { "entropy": 0.9699590802192688, "epoch": 3.805017103762828, "grad_norm": 1.0703125, "learning_rate": 3.652561289335532e-06, "loss": 0.0403, "mean_token_accuracy": 0.989463597536087, "num_tokens": 177356009.0, "step": 1670 }, { "entropy": 0.9639468938112259, "epoch": 3.807297605473204, "grad_norm": 1.125, "learning_rate": 3.6508896136467376e-06, "loss": 0.0337, "mean_token_accuracy": 0.9901049137115479, "num_tokens": 177462590.0, "step": 1671 }, { "entropy": 0.9636625200510025, "epoch": 3.8095781071835804, "grad_norm": 1.1640625, "learning_rate": 3.649217284735217e-06, "loss": 0.0399, "mean_token_accuracy": 0.9890829473733902, "num_tokens": 177569135.0, "step": 1672 }, { "entropy": 0.9668265283107758, "epoch": 3.8118586088939566, "grad_norm": 0.98046875, "learning_rate": 3.6475443035501522e-06, "loss": 0.0327, "mean_token_accuracy": 0.9908407032489777, "num_tokens": 177675164.0, "step": 1673 }, { "entropy": 0.9670537412166595, "epoch": 3.8141391106043327, "grad_norm": 0.96875, "learning_rate": 3.645870671041095e-06, "loss": 0.028, "mean_token_accuracy": 0.9918348789215088, "num_tokens": 177781353.0, "step": 1674 }, { "entropy": 0.9731958210468292, "epoch": 3.8164196123147094, "grad_norm": 1.4140625, "learning_rate": 3.6441963881579668e-06, "loss": 0.0458, "mean_token_accuracy": 0.9845471382141113, "num_tokens": 177888182.0, "step": 1675 }, { "entropy": 0.9679045081138611, "epoch": 3.8187001140250856, "grad_norm": 1.1875, "learning_rate": 3.642521455851058e-06, "loss": 0.0418, "mean_token_accuracy": 0.9843722283840179, "num_tokens": 177994329.0, "step": 1676 }, { "entropy": 0.9686506241559982, "epoch": 3.8209806157354618, "grad_norm": 1.0234375, "learning_rate": 3.6408458750710284e-06, "loss": 0.0301, "mean_token_accuracy": 0.9906431883573532, "num_tokens": 178100838.0, "step": 1677 }, { "entropy": 0.9722163826227188, "epoch": 3.823261117445838, "grad_norm": 1.3828125, "learning_rate": 3.639169646768905e-06, "loss": 0.0452, "mean_token_accuracy": 0.9838831424713135, "num_tokens": 178207551.0, "step": 1678 }, { "entropy": 0.9757493436336517, "epoch": 3.8255416191562146, "grad_norm": 1.140625, "learning_rate": 3.637492771896082e-06, "loss": 0.0328, "mean_token_accuracy": 0.9895419627428055, "num_tokens": 178313555.0, "step": 1679 }, { "entropy": 0.9634037613868713, "epoch": 3.827822120866591, "grad_norm": 1.0703125, "learning_rate": 3.6358152514043226e-06, "loss": 0.0277, "mean_token_accuracy": 0.9910358190536499, "num_tokens": 178419888.0, "step": 1680 }, { "entropy": 0.9687094986438751, "epoch": 3.830102622576967, "grad_norm": 1.171875, "learning_rate": 3.634137086245754e-06, "loss": 0.0236, "mean_token_accuracy": 0.9916118681430817, "num_tokens": 178526376.0, "step": 1681 }, { "entropy": 0.9621000289916992, "epoch": 3.832383124287343, "grad_norm": 1.28125, "learning_rate": 3.6324582773728712e-06, "loss": 0.0381, "mean_token_accuracy": 0.9883902221918106, "num_tokens": 178632968.0, "step": 1682 }, { "entropy": 0.9679328799247742, "epoch": 3.8346636259977194, "grad_norm": 0.9375, "learning_rate": 3.6307788257385325e-06, "loss": 0.0393, "mean_token_accuracy": 0.9901617169380188, "num_tokens": 178739167.0, "step": 1683 }, { "entropy": 0.9609950631856918, "epoch": 3.8369441277080956, "grad_norm": 1.1171875, "learning_rate": 3.6290987322959624e-06, "loss": 0.0372, "mean_token_accuracy": 0.9892608672380447, "num_tokens": 178846173.0, "step": 1684 }, { "entropy": 0.9703402668237686, "epoch": 3.839224629418472, "grad_norm": 0.84765625, "learning_rate": 3.6274179979987507e-06, "loss": 0.0266, "mean_token_accuracy": 0.9920370876789093, "num_tokens": 178952176.0, "step": 1685 }, { "entropy": 0.9641043692827225, "epoch": 3.8415051311288484, "grad_norm": 1.3515625, "learning_rate": 3.625736623800849e-06, "loss": 0.0381, "mean_token_accuracy": 0.9885027408599854, "num_tokens": 179058649.0, "step": 1686 }, { "entropy": 0.973464697599411, "epoch": 3.8437856328392246, "grad_norm": 0.96484375, "learning_rate": 3.624054610656572e-06, "loss": 0.039, "mean_token_accuracy": 0.987271711230278, "num_tokens": 179165384.0, "step": 1687 }, { "entropy": 0.9719117730855942, "epoch": 3.846066134549601, "grad_norm": 1.3515625, "learning_rate": 3.622371959520599e-06, "loss": 0.036, "mean_token_accuracy": 0.9895391464233398, "num_tokens": 179271650.0, "step": 1688 }, { "entropy": 0.965576633810997, "epoch": 3.8483466362599774, "grad_norm": 1.390625, "learning_rate": 3.6206886713479705e-06, "loss": 0.0448, "mean_token_accuracy": 0.9876195043325424, "num_tokens": 179377943.0, "step": 1689 }, { "entropy": 0.9633410573005676, "epoch": 3.8506271379703536, "grad_norm": 1.125, "learning_rate": 3.6190047470940875e-06, "loss": 0.0324, "mean_token_accuracy": 0.9887489080429077, "num_tokens": 179483989.0, "step": 1690 }, { "entropy": 0.9627305269241333, "epoch": 3.85290763968073, "grad_norm": 1.1640625, "learning_rate": 3.6173201877147134e-06, "loss": 0.038, "mean_token_accuracy": 0.9888621270656586, "num_tokens": 179589844.0, "step": 1691 }, { "entropy": 0.9635622203350067, "epoch": 3.855188141391106, "grad_norm": 0.91015625, "learning_rate": 3.6156349941659717e-06, "loss": 0.0287, "mean_token_accuracy": 0.9907369613647461, "num_tokens": 179696362.0, "step": 1692 }, { "entropy": 0.9643493294715881, "epoch": 3.857468643101482, "grad_norm": 1.046875, "learning_rate": 3.613949167404345e-06, "loss": 0.0351, "mean_token_accuracy": 0.9875357002019882, "num_tokens": 179802200.0, "step": 1693 }, { "entropy": 0.9682297557592392, "epoch": 3.8597491448118584, "grad_norm": 1.0078125, "learning_rate": 3.6122627083866773e-06, "loss": 0.0279, "mean_token_accuracy": 0.9914259612560272, "num_tokens": 179908038.0, "step": 1694 }, { "entropy": 0.962970569729805, "epoch": 3.862029646522235, "grad_norm": 1.15625, "learning_rate": 3.610575618070169e-06, "loss": 0.0271, "mean_token_accuracy": 0.9906346052885056, "num_tokens": 180014213.0, "step": 1695 }, { "entropy": 0.9633815437555313, "epoch": 3.864310148232611, "grad_norm": 0.98046875, "learning_rate": 3.6088878974123796e-06, "loss": 0.0326, "mean_token_accuracy": 0.9896625131368637, "num_tokens": 180120962.0, "step": 1696 }, { "entropy": 0.9685228317975998, "epoch": 3.8665906499429874, "grad_norm": 1.09375, "learning_rate": 3.6071995473712284e-06, "loss": 0.0286, "mean_token_accuracy": 0.9899550974369049, "num_tokens": 180226970.0, "step": 1697 }, { "entropy": 0.9690250009298325, "epoch": 3.8688711516533636, "grad_norm": 1.1015625, "learning_rate": 3.605510568904989e-06, "loss": 0.0344, "mean_token_accuracy": 0.9903195947408676, "num_tokens": 180333304.0, "step": 1698 }, { "entropy": 0.9647123962640762, "epoch": 3.8711516533637402, "grad_norm": 1.0078125, "learning_rate": 3.6038209629722936e-06, "loss": 0.0322, "mean_token_accuracy": 0.9899515807628632, "num_tokens": 180439795.0, "step": 1699 }, { "entropy": 0.9705328941345215, "epoch": 3.8734321550741164, "grad_norm": 1.09375, "learning_rate": 3.6021307305321295e-06, "loss": 0.0363, "mean_token_accuracy": 0.9887100160121918, "num_tokens": 180546482.0, "step": 1700 }, { "entropy": 0.9631434082984924, "epoch": 3.8757126567844926, "grad_norm": 0.9609375, "learning_rate": 3.6004398725438406e-06, "loss": 0.0313, "mean_token_accuracy": 0.9889775663614273, "num_tokens": 180652918.0, "step": 1701 }, { "entropy": 0.9633007943630219, "epoch": 3.877993158494869, "grad_norm": 0.98046875, "learning_rate": 3.5987483899671245e-06, "loss": 0.0325, "mean_token_accuracy": 0.9914916157722473, "num_tokens": 180759477.0, "step": 1702 }, { "entropy": 0.960902988910675, "epoch": 3.880273660205245, "grad_norm": 1.0625, "learning_rate": 3.597056283762034e-06, "loss": 0.0276, "mean_token_accuracy": 0.9910908937454224, "num_tokens": 180866553.0, "step": 1703 }, { "entropy": 0.9665449559688568, "epoch": 3.882554161915621, "grad_norm": 0.9453125, "learning_rate": 3.5953635548889777e-06, "loss": 0.0275, "mean_token_accuracy": 0.9919651001691818, "num_tokens": 180972444.0, "step": 1704 }, { "entropy": 0.9665002375841141, "epoch": 3.884834663625998, "grad_norm": 1.40625, "learning_rate": 3.5936702043087134e-06, "loss": 0.0385, "mean_token_accuracy": 0.9884840399026871, "num_tokens": 181078364.0, "step": 1705 }, { "entropy": 0.9654636830091476, "epoch": 3.887115165336374, "grad_norm": 1.1484375, "learning_rate": 3.5919762329823556e-06, "loss": 0.0418, "mean_token_accuracy": 0.9884174019098282, "num_tokens": 181185044.0, "step": 1706 }, { "entropy": 0.9707409143447876, "epoch": 3.88939566704675, "grad_norm": 0.93359375, "learning_rate": 3.5902816418713694e-06, "loss": 0.0271, "mean_token_accuracy": 0.9914720803499222, "num_tokens": 181291361.0, "step": 1707 }, { "entropy": 0.9692645221948624, "epoch": 3.8916761687571264, "grad_norm": 1.0390625, "learning_rate": 3.5885864319375717e-06, "loss": 0.0298, "mean_token_accuracy": 0.9904048293828964, "num_tokens": 181398261.0, "step": 1708 }, { "entropy": 0.965175062417984, "epoch": 3.893956670467503, "grad_norm": 1.2421875, "learning_rate": 3.5868906041431313e-06, "loss": 0.0397, "mean_token_accuracy": 0.9876458048820496, "num_tokens": 181504134.0, "step": 1709 }, { "entropy": 0.9574038982391357, "epoch": 3.8962371721778792, "grad_norm": 1.7734375, "learning_rate": 3.5851941594505674e-06, "loss": 0.0484, "mean_token_accuracy": 0.9857321083545685, "num_tokens": 181610083.0, "step": 1710 }, { "entropy": 0.9655005186796188, "epoch": 3.8985176738882554, "grad_norm": 1.03125, "learning_rate": 3.5834970988227484e-06, "loss": 0.0323, "mean_token_accuracy": 0.9885351359844208, "num_tokens": 181716410.0, "step": 1711 }, { "entropy": 0.9626259207725525, "epoch": 3.9007981755986316, "grad_norm": 1.3046875, "learning_rate": 3.581799423222895e-06, "loss": 0.0328, "mean_token_accuracy": 0.9894422888755798, "num_tokens": 181822883.0, "step": 1712 }, { "entropy": 0.9598754048347473, "epoch": 3.903078677309008, "grad_norm": 1.03125, "learning_rate": 3.580101133614573e-06, "loss": 0.0349, "mean_token_accuracy": 0.9892832338809967, "num_tokens": 181930147.0, "step": 1713 }, { "entropy": 0.9642284661531448, "epoch": 3.905359179019384, "grad_norm": 1.3515625, "learning_rate": 3.5784022309617006e-06, "loss": 0.0404, "mean_token_accuracy": 0.9878305941820145, "num_tokens": 182036754.0, "step": 1714 }, { "entropy": 0.9691982269287109, "epoch": 3.9076396807297606, "grad_norm": 0.96875, "learning_rate": 3.57670271622854e-06, "loss": 0.0291, "mean_token_accuracy": 0.9902936071157455, "num_tokens": 182142630.0, "step": 1715 }, { "entropy": 0.9625127762556076, "epoch": 3.909920182440137, "grad_norm": 1.328125, "learning_rate": 3.5750025903797053e-06, "loss": 0.0412, "mean_token_accuracy": 0.9869704693555832, "num_tokens": 182248224.0, "step": 1716 }, { "entropy": 0.9712620228528976, "epoch": 3.912200684150513, "grad_norm": 1.296875, "learning_rate": 3.5733018543801534e-06, "loss": 0.0459, "mean_token_accuracy": 0.9851551502943039, "num_tokens": 182354023.0, "step": 1717 }, { "entropy": 0.9695584326982498, "epoch": 3.9144811858608897, "grad_norm": 1.2109375, "learning_rate": 3.5716005091951906e-06, "loss": 0.0361, "mean_token_accuracy": 0.9888798147439957, "num_tokens": 182459890.0, "step": 1718 }, { "entropy": 0.9689391851425171, "epoch": 3.916761687571266, "grad_norm": 1.0859375, "learning_rate": 3.569898555790466e-06, "loss": 0.0379, "mean_token_accuracy": 0.9887217879295349, "num_tokens": 182565656.0, "step": 1719 }, { "entropy": 0.9702668339014053, "epoch": 3.919042189281642, "grad_norm": 0.984375, "learning_rate": 3.5681959951319766e-06, "loss": 0.0362, "mean_token_accuracy": 0.9895577281713486, "num_tokens": 182672369.0, "step": 1720 }, { "entropy": 0.9714137762784958, "epoch": 3.9213226909920182, "grad_norm": 1.015625, "learning_rate": 3.566492828186063e-06, "loss": 0.0232, "mean_token_accuracy": 0.9930102080106735, "num_tokens": 182778460.0, "step": 1721 }, { "entropy": 0.9745535552501678, "epoch": 3.9236031927023944, "grad_norm": 0.875, "learning_rate": 3.564789055919409e-06, "loss": 0.0264, "mean_token_accuracy": 0.9921733140945435, "num_tokens": 182884965.0, "step": 1722 }, { "entropy": 0.9702221006155014, "epoch": 3.9258836944127706, "grad_norm": 1.2109375, "learning_rate": 3.5630846792990435e-06, "loss": 0.0382, "mean_token_accuracy": 0.9877462834119797, "num_tokens": 182991134.0, "step": 1723 }, { "entropy": 0.9731080234050751, "epoch": 3.928164196123147, "grad_norm": 0.98828125, "learning_rate": 3.5613796992923382e-06, "loss": 0.0249, "mean_token_accuracy": 0.9925242215394974, "num_tokens": 183097605.0, "step": 1724 }, { "entropy": 0.977436900138855, "epoch": 3.9304446978335235, "grad_norm": 1.140625, "learning_rate": 3.559674116867006e-06, "loss": 0.0294, "mean_token_accuracy": 0.9905887097120285, "num_tokens": 183204045.0, "step": 1725 }, { "entropy": 0.9667867869138718, "epoch": 3.9327251995438997, "grad_norm": 1.0546875, "learning_rate": 3.5579679329911025e-06, "loss": 0.0272, "mean_token_accuracy": 0.9919560849666595, "num_tokens": 183310470.0, "step": 1726 }, { "entropy": 0.9702861905097961, "epoch": 3.935005701254276, "grad_norm": 1.03125, "learning_rate": 3.556261148633026e-06, "loss": 0.0312, "mean_token_accuracy": 0.9912091493606567, "num_tokens": 183416995.0, "step": 1727 }, { "entropy": 0.9754245728254318, "epoch": 3.9372862029646525, "grad_norm": 0.9296875, "learning_rate": 3.5545537647615125e-06, "loss": 0.0352, "mean_token_accuracy": 0.9887178838253021, "num_tokens": 183523083.0, "step": 1728 }, { "entropy": 0.9649964570999146, "epoch": 3.9395667046750287, "grad_norm": 1.4140625, "learning_rate": 3.552845782345642e-06, "loss": 0.0513, "mean_token_accuracy": 0.9837079495191574, "num_tokens": 183629046.0, "step": 1729 }, { "entropy": 0.9721202999353409, "epoch": 3.941847206385405, "grad_norm": 1.3203125, "learning_rate": 3.551137202354831e-06, "loss": 0.0317, "mean_token_accuracy": 0.9910330921411514, "num_tokens": 183735100.0, "step": 1730 }, { "entropy": 0.9695976227521896, "epoch": 3.944127708095781, "grad_norm": 0.96484375, "learning_rate": 3.5494280257588367e-06, "loss": 0.0333, "mean_token_accuracy": 0.9891901314258575, "num_tokens": 183841552.0, "step": 1731 }, { "entropy": 0.9719134718179703, "epoch": 3.9464082098061573, "grad_norm": 0.87109375, "learning_rate": 3.547718253527755e-06, "loss": 0.0269, "mean_token_accuracy": 0.9913059771060944, "num_tokens": 183948070.0, "step": 1732 }, { "entropy": 0.9767229855060577, "epoch": 3.9486887115165334, "grad_norm": 1.140625, "learning_rate": 3.546007886632019e-06, "loss": 0.0387, "mean_token_accuracy": 0.9881872087717056, "num_tokens": 184053772.0, "step": 1733 }, { "entropy": 0.9769161194562912, "epoch": 3.95096921322691, "grad_norm": 0.9609375, "learning_rate": 3.5442969260424022e-06, "loss": 0.0292, "mean_token_accuracy": 0.991097554564476, "num_tokens": 184160199.0, "step": 1734 }, { "entropy": 0.9704889208078384, "epoch": 3.9532497149372863, "grad_norm": 0.9140625, "learning_rate": 3.5425853727300095e-06, "loss": 0.026, "mean_token_accuracy": 0.9930500984191895, "num_tokens": 184266881.0, "step": 1735 }, { "entropy": 0.9677465856075287, "epoch": 3.9555302166476625, "grad_norm": 0.91015625, "learning_rate": 3.5408732276662882e-06, "loss": 0.0249, "mean_token_accuracy": 0.992928996682167, "num_tokens": 184372815.0, "step": 1736 }, { "entropy": 0.9613370299339294, "epoch": 3.9578107183580387, "grad_norm": 1.15625, "learning_rate": 3.5391604918230173e-06, "loss": 0.0359, "mean_token_accuracy": 0.9897250831127167, "num_tokens": 184479788.0, "step": 1737 }, { "entropy": 0.962685838341713, "epoch": 3.9600912200684153, "grad_norm": 1.1015625, "learning_rate": 3.537447166172313e-06, "loss": 0.033, "mean_token_accuracy": 0.9886024743318558, "num_tokens": 184585958.0, "step": 1738 }, { "entropy": 0.9692789167165756, "epoch": 3.9623717217787915, "grad_norm": 0.9375, "learning_rate": 3.5357332516866256e-06, "loss": 0.0252, "mean_token_accuracy": 0.9915052056312561, "num_tokens": 184691915.0, "step": 1739 }, { "entropy": 0.9724288582801819, "epoch": 3.9646522234891677, "grad_norm": 0.90625, "learning_rate": 3.534018749338741e-06, "loss": 0.0287, "mean_token_accuracy": 0.9895677268505096, "num_tokens": 184798627.0, "step": 1740 }, { "entropy": 0.9630392789840698, "epoch": 3.966932725199544, "grad_norm": 0.96484375, "learning_rate": 3.532303660101776e-06, "loss": 0.034, "mean_token_accuracy": 0.990952879190445, "num_tokens": 184904778.0, "step": 1741 }, { "entropy": 0.9684912115335464, "epoch": 3.96921322690992, "grad_norm": 1.15625, "learning_rate": 3.530587984949183e-06, "loss": 0.0344, "mean_token_accuracy": 0.9890445470809937, "num_tokens": 185010762.0, "step": 1742 }, { "entropy": 0.9674504548311234, "epoch": 3.9714937286202963, "grad_norm": 1.046875, "learning_rate": 3.5288717248547453e-06, "loss": 0.0298, "mean_token_accuracy": 0.9893774092197418, "num_tokens": 185118028.0, "step": 1743 }, { "entropy": 0.9684316366910934, "epoch": 3.973774230330673, "grad_norm": 1.171875, "learning_rate": 3.5271548807925803e-06, "loss": 0.0331, "mean_token_accuracy": 0.9889957755804062, "num_tokens": 185224416.0, "step": 1744 }, { "entropy": 0.9707543849945068, "epoch": 3.976054732041049, "grad_norm": 1.0, "learning_rate": 3.525437453737136e-06, "loss": 0.0351, "mean_token_accuracy": 0.9870686233043671, "num_tokens": 185330806.0, "step": 1745 }, { "entropy": 0.9642233848571777, "epoch": 3.9783352337514253, "grad_norm": 0.90234375, "learning_rate": 3.5237194446631883e-06, "loss": 0.0297, "mean_token_accuracy": 0.9899952560663223, "num_tokens": 185436329.0, "step": 1746 }, { "entropy": 0.9651535451412201, "epoch": 3.9806157354618015, "grad_norm": 1.0390625, "learning_rate": 3.522000854545849e-06, "loss": 0.0304, "mean_token_accuracy": 0.9886195510625839, "num_tokens": 185542353.0, "step": 1747 }, { "entropy": 0.9595563262701035, "epoch": 3.982896237172178, "grad_norm": 1.3046875, "learning_rate": 3.520281684360554e-06, "loss": 0.0434, "mean_token_accuracy": 0.9858220219612122, "num_tokens": 185648576.0, "step": 1748 }, { "entropy": 0.969423234462738, "epoch": 3.9851767388825543, "grad_norm": 1.1015625, "learning_rate": 3.5185619350830725e-06, "loss": 0.0379, "mean_token_accuracy": 0.9884755909442902, "num_tokens": 185754488.0, "step": 1749 }, { "entropy": 0.9609140157699585, "epoch": 3.9874572405929305, "grad_norm": 0.921875, "learning_rate": 3.516841607689501e-06, "loss": 0.0256, "mean_token_accuracy": 0.993467390537262, "num_tokens": 185861241.0, "step": 1750 }, { "entropy": 0.965735524892807, "epoch": 3.9897377423033067, "grad_norm": 1.1015625, "learning_rate": 3.515120703156264e-06, "loss": 0.0374, "mean_token_accuracy": 0.9870176464319229, "num_tokens": 185966763.0, "step": 1751 }, { "entropy": 0.9706375449895859, "epoch": 3.992018244013683, "grad_norm": 1.25, "learning_rate": 3.5133992224601126e-06, "loss": 0.0387, "mean_token_accuracy": 0.9890061467885971, "num_tokens": 186072864.0, "step": 1752 }, { "entropy": 0.9701850116252899, "epoch": 3.994298745724059, "grad_norm": 1.1171875, "learning_rate": 3.511677166578128e-06, "loss": 0.0398, "mean_token_accuracy": 0.986503928899765, "num_tokens": 186179923.0, "step": 1753 }, { "entropy": 0.9658877104520798, "epoch": 3.9965792474344357, "grad_norm": 1.265625, "learning_rate": 3.509954536487714e-06, "loss": 0.0374, "mean_token_accuracy": 0.9895786494016647, "num_tokens": 186286071.0, "step": 1754 }, { "entropy": 0.9729218780994415, "epoch": 3.998859749144812, "grad_norm": 1.3515625, "learning_rate": 3.5082313331666035e-06, "loss": 0.0443, "mean_token_accuracy": 0.9862878620624542, "num_tokens": 186392966.0, "step": 1755 }, { "entropy": 0.9669854044914246, "epoch": 4.0, "grad_norm": 1.4765625, "learning_rate": 3.506507557592853e-06, "loss": 0.0243, "mean_token_accuracy": 0.9926508963108063, "num_tokens": 186431904.0, "step": 1756 }, { "entropy": 0.9688285738229752, "epoch": 4.002280501710376, "grad_norm": 1.1328125, "learning_rate": 3.5047832107448437e-06, "loss": 0.0318, "mean_token_accuracy": 0.9906902313232422, "num_tokens": 186537644.0, "step": 1757 }, { "entropy": 0.968401312828064, "epoch": 4.004561003420752, "grad_norm": 0.9453125, "learning_rate": 3.503058293601283e-06, "loss": 0.0313, "mean_token_accuracy": 0.9891801178455353, "num_tokens": 186644312.0, "step": 1758 }, { "entropy": 0.968361884355545, "epoch": 4.006841505131129, "grad_norm": 1.0546875, "learning_rate": 3.5013328071411995e-06, "loss": 0.0379, "mean_token_accuracy": 0.9898781776428223, "num_tokens": 186750940.0, "step": 1759 }, { "entropy": 0.9727257490158081, "epoch": 4.009122006841505, "grad_norm": 1.265625, "learning_rate": 3.499606752343945e-06, "loss": 0.0404, "mean_token_accuracy": 0.9886132329702377, "num_tokens": 186858001.0, "step": 1760 }, { "epoch": 4.009122006841505, "eval_entropy": 0.9666312207740523, "eval_loss": 0.038487330079078674, "eval_mean_token_accuracy": 0.9883188759419401, "eval_num_tokens": 186858001.0, "eval_runtime": 66.092, "eval_samples_per_second": 126.869, "eval_steps_per_second": 3.979, "step": 1760 }, { "entropy": 0.9698476046323776, "epoch": 4.011402508551882, "grad_norm": 1.2265625, "learning_rate": 3.4978801301891972e-06, "loss": 0.0367, "mean_token_accuracy": 0.9902595579624176, "num_tokens": 186964371.0, "step": 1761 }, { "entropy": 0.9667666554450989, "epoch": 4.013683010262258, "grad_norm": 1.03125, "learning_rate": 3.496152941656952e-06, "loss": 0.0286, "mean_token_accuracy": 0.989838033914566, "num_tokens": 187070342.0, "step": 1762 }, { "entropy": 0.9733050912618637, "epoch": 4.015963511972634, "grad_norm": 1.2578125, "learning_rate": 3.494425187727528e-06, "loss": 0.0347, "mean_token_accuracy": 0.9885300099849701, "num_tokens": 187176740.0, "step": 1763 }, { "entropy": 0.9682720750570297, "epoch": 4.01824401368301, "grad_norm": 0.8984375, "learning_rate": 3.4926968693815667e-06, "loss": 0.0242, "mean_token_accuracy": 0.9939256310462952, "num_tokens": 187282906.0, "step": 1764 }, { "entropy": 0.9647968262434006, "epoch": 4.020524515393387, "grad_norm": 1.0234375, "learning_rate": 3.4909679876000256e-06, "loss": 0.029, "mean_token_accuracy": 0.9901951104402542, "num_tokens": 187389432.0, "step": 1765 }, { "entropy": 0.9693285822868347, "epoch": 4.022805017103763, "grad_norm": 1.0703125, "learning_rate": 3.4892385433641875e-06, "loss": 0.0209, "mean_token_accuracy": 0.9927219897508621, "num_tokens": 187495688.0, "step": 1766 }, { "entropy": 0.9683179557323456, "epoch": 4.025085518814139, "grad_norm": 1.078125, "learning_rate": 3.4875085376556493e-06, "loss": 0.0341, "mean_token_accuracy": 0.9894810765981674, "num_tokens": 187602337.0, "step": 1767 }, { "entropy": 0.970379188656807, "epoch": 4.027366020524515, "grad_norm": 1.0703125, "learning_rate": 3.4857779714563305e-06, "loss": 0.034, "mean_token_accuracy": 0.9882687479257584, "num_tokens": 187708367.0, "step": 1768 }, { "entropy": 0.9647615104913712, "epoch": 4.029646522234891, "grad_norm": 1.109375, "learning_rate": 3.4840468457484654e-06, "loss": 0.028, "mean_token_accuracy": 0.9902230948209763, "num_tokens": 187814984.0, "step": 1769 }, { "entropy": 0.9651589393615723, "epoch": 4.031927023945268, "grad_norm": 0.91796875, "learning_rate": 3.4823151615146093e-06, "loss": 0.029, "mean_token_accuracy": 0.9903969317674637, "num_tokens": 187921324.0, "step": 1770 }, { "entropy": 0.9642309695482254, "epoch": 4.034207525655645, "grad_norm": 0.93359375, "learning_rate": 3.480582919737631e-06, "loss": 0.0283, "mean_token_accuracy": 0.9901574105024338, "num_tokens": 188027473.0, "step": 1771 }, { "entropy": 0.9631271064281464, "epoch": 4.036488027366021, "grad_norm": 1.046875, "learning_rate": 3.478850121400719e-06, "loss": 0.0283, "mean_token_accuracy": 0.9897805750370026, "num_tokens": 188134178.0, "step": 1772 }, { "entropy": 0.9626231491565704, "epoch": 4.038768529076397, "grad_norm": 0.93359375, "learning_rate": 3.477116767487375e-06, "loss": 0.0254, "mean_token_accuracy": 0.9934516698122025, "num_tokens": 188240579.0, "step": 1773 }, { "entropy": 0.968368798494339, "epoch": 4.041049030786773, "grad_norm": 1.046875, "learning_rate": 3.475382858981418e-06, "loss": 0.0292, "mean_token_accuracy": 0.9904149323701859, "num_tokens": 188346879.0, "step": 1774 }, { "entropy": 0.9712808281183243, "epoch": 4.043329532497149, "grad_norm": 1.03125, "learning_rate": 3.473648396866981e-06, "loss": 0.0343, "mean_token_accuracy": 0.9882197380065918, "num_tokens": 188452973.0, "step": 1775 }, { "entropy": 0.9602316170930862, "epoch": 4.045610034207526, "grad_norm": 1.125, "learning_rate": 3.4719133821285108e-06, "loss": 0.0358, "mean_token_accuracy": 0.9889072626829147, "num_tokens": 188559492.0, "step": 1776 }, { "entropy": 0.9627697318792343, "epoch": 4.047890535917902, "grad_norm": 0.984375, "learning_rate": 3.470177815750769e-06, "loss": 0.032, "mean_token_accuracy": 0.9884670674800873, "num_tokens": 188666647.0, "step": 1777 }, { "entropy": 0.960688129067421, "epoch": 4.050171037628278, "grad_norm": 1.3515625, "learning_rate": 3.4684416987188273e-06, "loss": 0.0381, "mean_token_accuracy": 0.9879306554794312, "num_tokens": 188773411.0, "step": 1778 }, { "entropy": 0.9646214842796326, "epoch": 4.052451539338654, "grad_norm": 1.265625, "learning_rate": 3.4667050320180755e-06, "loss": 0.041, "mean_token_accuracy": 0.9881556630134583, "num_tokens": 188879613.0, "step": 1779 }, { "entropy": 0.9645721912384033, "epoch": 4.05473204104903, "grad_norm": 0.99609375, "learning_rate": 3.4649678166342104e-06, "loss": 0.0292, "mean_token_accuracy": 0.9912207722663879, "num_tokens": 188985692.0, "step": 1780 }, { "entropy": 0.9681588858366013, "epoch": 4.0570125427594075, "grad_norm": 1.109375, "learning_rate": 3.4632300535532415e-06, "loss": 0.0377, "mean_token_accuracy": 0.9876747280359268, "num_tokens": 189092323.0, "step": 1781 }, { "entropy": 0.9691698402166367, "epoch": 4.059293044469784, "grad_norm": 1.1015625, "learning_rate": 3.46149174376149e-06, "loss": 0.0298, "mean_token_accuracy": 0.9900484979152679, "num_tokens": 189198232.0, "step": 1782 }, { "entropy": 0.9630117863416672, "epoch": 4.06157354618016, "grad_norm": 1.140625, "learning_rate": 3.459752888245587e-06, "loss": 0.0278, "mean_token_accuracy": 0.9920857548713684, "num_tokens": 189304773.0, "step": 1783 }, { "entropy": 0.9640295952558517, "epoch": 4.063854047890536, "grad_norm": 1.2109375, "learning_rate": 3.4580134879924732e-06, "loss": 0.0343, "mean_token_accuracy": 0.9883357733488083, "num_tokens": 189410971.0, "step": 1784 }, { "entropy": 0.9701209962368011, "epoch": 4.066134549600912, "grad_norm": 1.046875, "learning_rate": 3.4562735439894e-06, "loss": 0.0297, "mean_token_accuracy": 0.9900753498077393, "num_tokens": 189517749.0, "step": 1785 }, { "entropy": 0.9656591862440109, "epoch": 4.068415051311288, "grad_norm": 0.9765625, "learning_rate": 3.4545330572239234e-06, "loss": 0.0317, "mean_token_accuracy": 0.9895436465740204, "num_tokens": 189623807.0, "step": 1786 }, { "entropy": 0.9681552201509476, "epoch": 4.070695553021665, "grad_norm": 1.0625, "learning_rate": 3.452792028683912e-06, "loss": 0.0341, "mean_token_accuracy": 0.9897497445344925, "num_tokens": 189730041.0, "step": 1787 }, { "entropy": 0.9622933566570282, "epoch": 4.072976054732041, "grad_norm": 1.0859375, "learning_rate": 3.4510504593575396e-06, "loss": 0.028, "mean_token_accuracy": 0.9911855459213257, "num_tokens": 189836276.0, "step": 1788 }, { "entropy": 0.95988330245018, "epoch": 4.075256556442417, "grad_norm": 1.3984375, "learning_rate": 3.449308350233287e-06, "loss": 0.041, "mean_token_accuracy": 0.9869813919067383, "num_tokens": 189943120.0, "step": 1789 }, { "entropy": 0.9663940966129303, "epoch": 4.077537058152793, "grad_norm": 1.140625, "learning_rate": 3.447565702299942e-06, "loss": 0.0377, "mean_token_accuracy": 0.9889859706163406, "num_tokens": 190049409.0, "step": 1790 }, { "entropy": 0.9687055349349976, "epoch": 4.07981755986317, "grad_norm": 1.109375, "learning_rate": 3.445822516546598e-06, "loss": 0.0348, "mean_token_accuracy": 0.988969624042511, "num_tokens": 190156050.0, "step": 1791 }, { "entropy": 0.9628987610340118, "epoch": 4.0820980615735465, "grad_norm": 0.984375, "learning_rate": 3.444078793962653e-06, "loss": 0.033, "mean_token_accuracy": 0.9891820847988129, "num_tokens": 190262143.0, "step": 1792 }, { "entropy": 0.9698877483606339, "epoch": 4.084378563283923, "grad_norm": 0.88671875, "learning_rate": 3.4423345355378114e-06, "loss": 0.0291, "mean_token_accuracy": 0.9907748103141785, "num_tokens": 190368658.0, "step": 1793 }, { "entropy": 0.9709612727165222, "epoch": 4.086659064994299, "grad_norm": 0.8671875, "learning_rate": 3.440589742262079e-06, "loss": 0.0253, "mean_token_accuracy": 0.993770644068718, "num_tokens": 190474812.0, "step": 1794 }, { "entropy": 0.9583680480718613, "epoch": 4.088939566704675, "grad_norm": 1.3515625, "learning_rate": 3.438844415125768e-06, "loss": 0.04, "mean_token_accuracy": 0.9894697368144989, "num_tokens": 190581163.0, "step": 1795 }, { "entropy": 0.9628935158252716, "epoch": 4.091220068415051, "grad_norm": 0.98046875, "learning_rate": 3.437098555119493e-06, "loss": 0.0275, "mean_token_accuracy": 0.9918935298919678, "num_tokens": 190687575.0, "step": 1796 }, { "entropy": 0.9698667377233505, "epoch": 4.0935005701254275, "grad_norm": 1.078125, "learning_rate": 3.4353521632341686e-06, "loss": 0.0285, "mean_token_accuracy": 0.9918943494558334, "num_tokens": 190793744.0, "step": 1797 }, { "entropy": 0.9670751243829727, "epoch": 4.095781071835804, "grad_norm": 1.15625, "learning_rate": 3.4336052404610138e-06, "loss": 0.0364, "mean_token_accuracy": 0.989435002207756, "num_tokens": 190900121.0, "step": 1798 }, { "entropy": 0.9729818850755692, "epoch": 4.09806157354618, "grad_norm": 1.1171875, "learning_rate": 3.431857787791549e-06, "loss": 0.0278, "mean_token_accuracy": 0.991462454199791, "num_tokens": 191006358.0, "step": 1799 }, { "entropy": 0.9665858298540115, "epoch": 4.100342075256556, "grad_norm": 1.0, "learning_rate": 3.4301098062175936e-06, "loss": 0.027, "mean_token_accuracy": 0.990742951631546, "num_tokens": 191113034.0, "step": 1800 }, { "entropy": 0.9704269170761108, "epoch": 4.102622576966933, "grad_norm": 1.3828125, "learning_rate": 3.4283612967312692e-06, "loss": 0.0467, "mean_token_accuracy": 0.9849923998117447, "num_tokens": 191218796.0, "step": 1801 }, { "entropy": 0.9686210602521896, "epoch": 4.104903078677309, "grad_norm": 1.265625, "learning_rate": 3.426612260324996e-06, "loss": 0.0279, "mean_token_accuracy": 0.989859327673912, "num_tokens": 191324731.0, "step": 1802 }, { "entropy": 0.9685505628585815, "epoch": 4.1071835803876855, "grad_norm": 1.21875, "learning_rate": 3.424862697991491e-06, "loss": 0.0415, "mean_token_accuracy": 0.9854398965835571, "num_tokens": 191431191.0, "step": 1803 }, { "entropy": 0.9615141898393631, "epoch": 4.109464082098062, "grad_norm": 1.1875, "learning_rate": 3.4231126107237754e-06, "loss": 0.0336, "mean_token_accuracy": 0.9870307594537735, "num_tokens": 191538904.0, "step": 1804 }, { "entropy": 0.9654593467712402, "epoch": 4.111744583808438, "grad_norm": 0.93359375, "learning_rate": 3.4213619995151628e-06, "loss": 0.0299, "mean_token_accuracy": 0.9915264546871185, "num_tokens": 191644977.0, "step": 1805 }, { "entropy": 0.966985285282135, "epoch": 4.114025085518814, "grad_norm": 0.796875, "learning_rate": 3.4196108653592662e-06, "loss": 0.0205, "mean_token_accuracy": 0.9936831146478653, "num_tokens": 191751311.0, "step": 1806 }, { "entropy": 0.9668587893247604, "epoch": 4.11630558722919, "grad_norm": 1.078125, "learning_rate": 3.417859209249997e-06, "loss": 0.0278, "mean_token_accuracy": 0.9907823503017426, "num_tokens": 191857241.0, "step": 1807 }, { "entropy": 0.9670950174331665, "epoch": 4.1185860889395665, "grad_norm": 1.296875, "learning_rate": 3.4161070321815605e-06, "loss": 0.0355, "mean_token_accuracy": 0.9888591766357422, "num_tokens": 191963353.0, "step": 1808 }, { "entropy": 0.9706541448831558, "epoch": 4.120866590649943, "grad_norm": 1.078125, "learning_rate": 3.4143543351484585e-06, "loss": 0.0334, "mean_token_accuracy": 0.9900697618722916, "num_tokens": 192069546.0, "step": 1809 }, { "entropy": 0.9663038700819016, "epoch": 4.123147092360319, "grad_norm": 0.7265625, "learning_rate": 3.4126011191454877e-06, "loss": 0.0269, "mean_token_accuracy": 0.991223156452179, "num_tokens": 192175384.0, "step": 1810 }, { "entropy": 0.9647921025753021, "epoch": 4.125427594070696, "grad_norm": 1.0703125, "learning_rate": 3.4108473851677408e-06, "loss": 0.0405, "mean_token_accuracy": 0.9885048121213913, "num_tokens": 192281380.0, "step": 1811 }, { "entropy": 0.9604755491018295, "epoch": 4.127708095781072, "grad_norm": 1.0390625, "learning_rate": 3.4090931342106024e-06, "loss": 0.0328, "mean_token_accuracy": 0.9897146224975586, "num_tokens": 192388308.0, "step": 1812 }, { "entropy": 0.9702850431203842, "epoch": 4.129988597491448, "grad_norm": 1.296875, "learning_rate": 3.4073383672697524e-06, "loss": 0.0307, "mean_token_accuracy": 0.9907782971858978, "num_tokens": 192494627.0, "step": 1813 }, { "entropy": 0.9690610021352768, "epoch": 4.1322690992018245, "grad_norm": 1.3515625, "learning_rate": 3.4055830853411616e-06, "loss": 0.0407, "mean_token_accuracy": 0.9863899350166321, "num_tokens": 192600574.0, "step": 1814 }, { "entropy": 0.9701189994812012, "epoch": 4.134549600912201, "grad_norm": 1.078125, "learning_rate": 3.4038272894210945e-06, "loss": 0.0284, "mean_token_accuracy": 0.9901335686445236, "num_tokens": 192706469.0, "step": 1815 }, { "entropy": 0.9645695686340332, "epoch": 4.136830102622577, "grad_norm": 0.97265625, "learning_rate": 3.4020709805061066e-06, "loss": 0.037, "mean_token_accuracy": 0.9887773841619492, "num_tokens": 192813140.0, "step": 1816 }, { "entropy": 0.9664911329746246, "epoch": 4.139110604332953, "grad_norm": 0.9609375, "learning_rate": 3.4003141595930456e-06, "loss": 0.0276, "mean_token_accuracy": 0.9897423535585403, "num_tokens": 192919396.0, "step": 1817 }, { "entropy": 0.9720323234796524, "epoch": 4.141391106043329, "grad_norm": 1.140625, "learning_rate": 3.3985568276790487e-06, "loss": 0.0361, "mean_token_accuracy": 0.9874101728200912, "num_tokens": 193026590.0, "step": 1818 }, { "entropy": 0.9693032652139664, "epoch": 4.1436716077537055, "grad_norm": 1.1875, "learning_rate": 3.3967989857615434e-06, "loss": 0.0415, "mean_token_accuracy": 0.9878171980381012, "num_tokens": 193132964.0, "step": 1819 }, { "entropy": 0.9687658995389938, "epoch": 4.145952109464082, "grad_norm": 0.98046875, "learning_rate": 3.3950406348382483e-06, "loss": 0.0308, "mean_token_accuracy": 0.9903606325387955, "num_tokens": 193238876.0, "step": 1820 }, { "entropy": 0.9673096239566803, "epoch": 4.148232611174459, "grad_norm": 1.1171875, "learning_rate": 3.3932817759071666e-06, "loss": 0.0358, "mean_token_accuracy": 0.9872155636548996, "num_tokens": 193345078.0, "step": 1821 }, { "entropy": 0.9674221128225327, "epoch": 4.150513112884835, "grad_norm": 1.125, "learning_rate": 3.3915224099665962e-06, "loss": 0.0302, "mean_token_accuracy": 0.9900580644607544, "num_tokens": 193451593.0, "step": 1822 }, { "entropy": 0.9670815765857697, "epoch": 4.152793614595211, "grad_norm": 1.0546875, "learning_rate": 3.389762538015116e-06, "loss": 0.0357, "mean_token_accuracy": 0.9865506291389465, "num_tokens": 193557694.0, "step": 1823 }, { "entropy": 0.9698064476251602, "epoch": 4.155074116305587, "grad_norm": 1.2109375, "learning_rate": 3.388002161051598e-06, "loss": 0.0391, "mean_token_accuracy": 0.988313227891922, "num_tokens": 193663812.0, "step": 1824 }, { "entropy": 0.9627960473299026, "epoch": 4.1573546180159635, "grad_norm": 0.93359375, "learning_rate": 3.3862412800751963e-06, "loss": 0.0325, "mean_token_accuracy": 0.9891023337841034, "num_tokens": 193769600.0, "step": 1825 }, { "entropy": 0.9669003337621689, "epoch": 4.15963511972634, "grad_norm": 1.2109375, "learning_rate": 3.3844798960853533e-06, "loss": 0.0414, "mean_token_accuracy": 0.9867894649505615, "num_tokens": 193875852.0, "step": 1826 }, { "entropy": 0.9670723676681519, "epoch": 4.161915621436716, "grad_norm": 0.9921875, "learning_rate": 3.382718010081797e-06, "loss": 0.0323, "mean_token_accuracy": 0.9911000579595566, "num_tokens": 193982303.0, "step": 1827 }, { "entropy": 0.9656706899404526, "epoch": 4.164196123147092, "grad_norm": 1.203125, "learning_rate": 3.38095562306454e-06, "loss": 0.0394, "mean_token_accuracy": 0.9872424155473709, "num_tokens": 194088386.0, "step": 1828 }, { "entropy": 0.9653499126434326, "epoch": 4.166476624857468, "grad_norm": 1.03125, "learning_rate": 3.3791927360338785e-06, "loss": 0.0312, "mean_token_accuracy": 0.9894671887159348, "num_tokens": 194194991.0, "step": 1829 }, { "entropy": 0.9633163213729858, "epoch": 4.168757126567845, "grad_norm": 0.9765625, "learning_rate": 3.3774293499903934e-06, "loss": 0.038, "mean_token_accuracy": 0.9886797368526459, "num_tokens": 194300781.0, "step": 1830 }, { "entropy": 0.9663009494543076, "epoch": 4.1710376282782216, "grad_norm": 1.0703125, "learning_rate": 3.3756654659349487e-06, "loss": 0.0385, "mean_token_accuracy": 0.9872436970472336, "num_tokens": 194406659.0, "step": 1831 }, { "entropy": 0.9615237414836884, "epoch": 4.173318129988598, "grad_norm": 1.1328125, "learning_rate": 3.373901084868691e-06, "loss": 0.0298, "mean_token_accuracy": 0.9900057017803192, "num_tokens": 194513109.0, "step": 1832 }, { "entropy": 0.9703481942415237, "epoch": 4.175598631698974, "grad_norm": 1.234375, "learning_rate": 3.372136207793049e-06, "loss": 0.0346, "mean_token_accuracy": 0.9887818396091461, "num_tokens": 194619274.0, "step": 1833 }, { "entropy": 0.9669326394796371, "epoch": 4.17787913340935, "grad_norm": 0.9453125, "learning_rate": 3.3703708357097333e-06, "loss": 0.0238, "mean_token_accuracy": 0.9926311373710632, "num_tokens": 194725570.0, "step": 1834 }, { "entropy": 0.9607755243778229, "epoch": 4.180159635119726, "grad_norm": 0.765625, "learning_rate": 3.3686049696207336e-06, "loss": 0.0229, "mean_token_accuracy": 0.9913339763879776, "num_tokens": 194831725.0, "step": 1835 }, { "entropy": 0.9651920199394226, "epoch": 4.1824401368301025, "grad_norm": 0.93359375, "learning_rate": 3.3668386105283226e-06, "loss": 0.0213, "mean_token_accuracy": 0.9934971630573273, "num_tokens": 194937789.0, "step": 1836 }, { "entropy": 0.9694849997758865, "epoch": 4.184720638540479, "grad_norm": 1.1171875, "learning_rate": 3.365071759435051e-06, "loss": 0.0348, "mean_token_accuracy": 0.9878856986761093, "num_tokens": 195044218.0, "step": 1837 }, { "entropy": 0.9617316871881485, "epoch": 4.187001140250855, "grad_norm": 1.1640625, "learning_rate": 3.363304417343749e-06, "loss": 0.0371, "mean_token_accuracy": 0.986471563577652, "num_tokens": 195150833.0, "step": 1838 }, { "entropy": 0.9684401750564575, "epoch": 4.189281641961231, "grad_norm": 1.5859375, "learning_rate": 3.3615365852575276e-06, "loss": 0.0351, "mean_token_accuracy": 0.9895707368850708, "num_tokens": 195257925.0, "step": 1839 }, { "entropy": 0.9666711688041687, "epoch": 4.191562143671608, "grad_norm": 1.15625, "learning_rate": 3.359768264179772e-06, "loss": 0.0248, "mean_token_accuracy": 0.9923476278781891, "num_tokens": 195364649.0, "step": 1840 }, { "entropy": 0.9631852805614471, "epoch": 4.193842645381984, "grad_norm": 0.93359375, "learning_rate": 3.357999455114148e-06, "loss": 0.0238, "mean_token_accuracy": 0.9921600967645645, "num_tokens": 195470924.0, "step": 1841 }, { "entropy": 0.9576045125722885, "epoch": 4.196123147092361, "grad_norm": 0.984375, "learning_rate": 3.356230159064599e-06, "loss": 0.0319, "mean_token_accuracy": 0.9899688065052032, "num_tokens": 195576992.0, "step": 1842 }, { "entropy": 0.9611700922250748, "epoch": 4.198403648802737, "grad_norm": 1.0, "learning_rate": 3.3544603770353407e-06, "loss": 0.0266, "mean_token_accuracy": 0.9917650669813156, "num_tokens": 195683347.0, "step": 1843 }, { "entropy": 0.964864507317543, "epoch": 4.200684150513113, "grad_norm": 1.1640625, "learning_rate": 3.352690110030869e-06, "loss": 0.0328, "mean_token_accuracy": 0.9924787729978561, "num_tokens": 195789805.0, "step": 1844 }, { "entropy": 0.9608648866415024, "epoch": 4.202964652223489, "grad_norm": 1.2890625, "learning_rate": 3.350919359055953e-06, "loss": 0.0366, "mean_token_accuracy": 0.9874790459871292, "num_tokens": 195896369.0, "step": 1845 }, { "entropy": 0.9630059599876404, "epoch": 4.205245153933865, "grad_norm": 0.84375, "learning_rate": 3.3491481251156355e-06, "loss": 0.0243, "mean_token_accuracy": 0.9919001013040543, "num_tokens": 196002889.0, "step": 1846 }, { "entropy": 0.965234562754631, "epoch": 4.2075256556442415, "grad_norm": 1.1484375, "learning_rate": 3.347376409215236e-06, "loss": 0.0372, "mean_token_accuracy": 0.9889468252658844, "num_tokens": 196109480.0, "step": 1847 }, { "entropy": 0.9654846042394638, "epoch": 4.209806157354618, "grad_norm": 1.09375, "learning_rate": 3.345604212360346e-06, "loss": 0.0351, "mean_token_accuracy": 0.9904627799987793, "num_tokens": 196215468.0, "step": 1848 }, { "entropy": 0.9605536311864853, "epoch": 4.212086659064994, "grad_norm": 0.98046875, "learning_rate": 3.3438315355568295e-06, "loss": 0.0294, "mean_token_accuracy": 0.9903625696897507, "num_tokens": 196321297.0, "step": 1849 }, { "entropy": 0.9674085080623627, "epoch": 4.214367160775371, "grad_norm": 1.03125, "learning_rate": 3.3420583798108253e-06, "loss": 0.0295, "mean_token_accuracy": 0.9889358431100845, "num_tokens": 196427299.0, "step": 1850 }, { "entropy": 0.9643923938274384, "epoch": 4.216647662485747, "grad_norm": 1.0234375, "learning_rate": 3.34028474612874e-06, "loss": 0.037, "mean_token_accuracy": 0.9903976321220398, "num_tokens": 196533397.0, "step": 1851 }, { "entropy": 0.9684377908706665, "epoch": 4.218928164196123, "grad_norm": 1.2734375, "learning_rate": 3.338510635517256e-06, "loss": 0.0401, "mean_token_accuracy": 0.9864699393510818, "num_tokens": 196640091.0, "step": 1852 }, { "entropy": 0.9682759195566177, "epoch": 4.2212086659065, "grad_norm": 1.125, "learning_rate": 3.3367360489833236e-06, "loss": 0.0321, "mean_token_accuracy": 0.9901448339223862, "num_tokens": 196747044.0, "step": 1853 }, { "entropy": 0.9661152511835098, "epoch": 4.223489167616876, "grad_norm": 0.84375, "learning_rate": 3.3349609875341626e-06, "loss": 0.026, "mean_token_accuracy": 0.9927148371934891, "num_tokens": 196853400.0, "step": 1854 }, { "entropy": 0.9635871201753616, "epoch": 4.225769669327252, "grad_norm": 1.0625, "learning_rate": 3.3331854521772656e-06, "loss": 0.0291, "mean_token_accuracy": 0.9925208687782288, "num_tokens": 196959386.0, "step": 1855 }, { "entropy": 0.9695325940847397, "epoch": 4.228050171037628, "grad_norm": 0.953125, "learning_rate": 3.3314094439203903e-06, "loss": 0.0291, "mean_token_accuracy": 0.9919337928295135, "num_tokens": 197065358.0, "step": 1856 }, { "entropy": 0.9659925997257233, "epoch": 4.230330672748004, "grad_norm": 1.015625, "learning_rate": 3.3296329637715662e-06, "loss": 0.0319, "mean_token_accuracy": 0.9911742657423019, "num_tokens": 197171010.0, "step": 1857 }, { "entropy": 0.9657643586397171, "epoch": 4.2326111744583805, "grad_norm": 1.03125, "learning_rate": 3.3278560127390892e-06, "loss": 0.0246, "mean_token_accuracy": 0.9927117377519608, "num_tokens": 197277507.0, "step": 1858 }, { "entropy": 0.9665837436914444, "epoch": 4.234891676168757, "grad_norm": 0.9765625, "learning_rate": 3.32607859183152e-06, "loss": 0.0248, "mean_token_accuracy": 0.9911992251873016, "num_tokens": 197383037.0, "step": 1859 }, { "entropy": 0.9597873240709305, "epoch": 4.237172177879134, "grad_norm": 1.1015625, "learning_rate": 3.3243007020576917e-06, "loss": 0.0386, "mean_token_accuracy": 0.9869554340839386, "num_tokens": 197489123.0, "step": 1860 }, { "entropy": 0.9672567248344421, "epoch": 4.23945267958951, "grad_norm": 1.1171875, "learning_rate": 3.322522344426698e-06, "loss": 0.0271, "mean_token_accuracy": 0.9917360246181488, "num_tokens": 197595085.0, "step": 1861 }, { "entropy": 0.9666308462619781, "epoch": 4.241733181299886, "grad_norm": 0.9296875, "learning_rate": 3.320743519947901e-06, "loss": 0.0308, "mean_token_accuracy": 0.9892037808895111, "num_tokens": 197701669.0, "step": 1862 }, { "entropy": 0.9660544693470001, "epoch": 4.244013683010262, "grad_norm": 1.21875, "learning_rate": 3.318964229630927e-06, "loss": 0.039, "mean_token_accuracy": 0.9882539957761765, "num_tokens": 197807864.0, "step": 1863 }, { "entropy": 0.9683523178100586, "epoch": 4.246294184720639, "grad_norm": 1.109375, "learning_rate": 3.3171844744856675e-06, "loss": 0.0272, "mean_token_accuracy": 0.9906697124242783, "num_tokens": 197914026.0, "step": 1864 }, { "entropy": 0.9644881188869476, "epoch": 4.248574686431015, "grad_norm": 1.1328125, "learning_rate": 3.3154042555222758e-06, "loss": 0.0418, "mean_token_accuracy": 0.9867703914642334, "num_tokens": 198020717.0, "step": 1865 }, { "entropy": 0.9673348963260651, "epoch": 4.250855188141391, "grad_norm": 1.0859375, "learning_rate": 3.3136235737511715e-06, "loss": 0.0367, "mean_token_accuracy": 0.9874231517314911, "num_tokens": 198126600.0, "step": 1866 }, { "entropy": 0.9678922295570374, "epoch": 4.253135689851767, "grad_norm": 1.4140625, "learning_rate": 3.3118424301830343e-06, "loss": 0.0441, "mean_token_accuracy": 0.9874817728996277, "num_tokens": 198232762.0, "step": 1867 }, { "entropy": 0.9675136059522629, "epoch": 4.255416191562143, "grad_norm": 1.2734375, "learning_rate": 3.310060825828807e-06, "loss": 0.0413, "mean_token_accuracy": 0.985375240445137, "num_tokens": 198338862.0, "step": 1868 }, { "entropy": 0.9677513688802719, "epoch": 4.2576966932725195, "grad_norm": 0.984375, "learning_rate": 3.3082787616996938e-06, "loss": 0.0339, "mean_token_accuracy": 0.9882034212350845, "num_tokens": 198445807.0, "step": 1869 }, { "entropy": 0.9703932851552963, "epoch": 4.259977194982897, "grad_norm": 0.97265625, "learning_rate": 3.3064962388071586e-06, "loss": 0.0347, "mean_token_accuracy": 0.9885646551847458, "num_tokens": 198552050.0, "step": 1870 }, { "entropy": 0.968902662396431, "epoch": 4.262257696693273, "grad_norm": 1.1796875, "learning_rate": 3.3047132581629297e-06, "loss": 0.0375, "mean_token_accuracy": 0.9887754023075104, "num_tokens": 198658998.0, "step": 1871 }, { "entropy": 0.9704457968473434, "epoch": 4.264538198403649, "grad_norm": 1.140625, "learning_rate": 3.3029298207789907e-06, "loss": 0.0367, "mean_token_accuracy": 0.9878048151731491, "num_tokens": 198766034.0, "step": 1872 }, { "entropy": 0.966766744852066, "epoch": 4.266818700114025, "grad_norm": 1.1796875, "learning_rate": 3.301145927667586e-06, "loss": 0.0421, "mean_token_accuracy": 0.9888606667518616, "num_tokens": 198872334.0, "step": 1873 }, { "entropy": 0.9672351181507111, "epoch": 4.269099201824401, "grad_norm": 0.92578125, "learning_rate": 3.2993615798412204e-06, "loss": 0.0244, "mean_token_accuracy": 0.9920351654291153, "num_tokens": 198978596.0, "step": 1874 }, { "entropy": 0.9695388078689575, "epoch": 4.271379703534778, "grad_norm": 1.0234375, "learning_rate": 3.297576778312654e-06, "loss": 0.0314, "mean_token_accuracy": 0.990473210811615, "num_tokens": 199085108.0, "step": 1875 }, { "entropy": 0.9699968248605728, "epoch": 4.273660205245154, "grad_norm": 1.203125, "learning_rate": 3.295791524094906e-06, "loss": 0.0371, "mean_token_accuracy": 0.9872556328773499, "num_tokens": 199191627.0, "step": 1876 }, { "entropy": 0.9657128155231476, "epoch": 4.27594070695553, "grad_norm": 1.0234375, "learning_rate": 3.294005818201252e-06, "loss": 0.0212, "mean_token_accuracy": 0.9934595078229904, "num_tokens": 199297452.0, "step": 1877 }, { "entropy": 0.9632662385702133, "epoch": 4.278221208665906, "grad_norm": 1.1015625, "learning_rate": 3.2922196616452253e-06, "loss": 0.0335, "mean_token_accuracy": 0.9889944493770599, "num_tokens": 199403519.0, "step": 1878 }, { "entropy": 0.971528172492981, "epoch": 4.280501710376283, "grad_norm": 1.0234375, "learning_rate": 3.2904330554406126e-06, "loss": 0.0338, "mean_token_accuracy": 0.9889898747205734, "num_tokens": 199510214.0, "step": 1879 }, { "entropy": 0.9661404490470886, "epoch": 4.282782212086659, "grad_norm": 1.2578125, "learning_rate": 3.288646000601457e-06, "loss": 0.038, "mean_token_accuracy": 0.987985298037529, "num_tokens": 199616562.0, "step": 1880 }, { "entropy": 0.9626235067844391, "epoch": 4.285062713797036, "grad_norm": 0.97265625, "learning_rate": 3.286858498142057e-06, "loss": 0.0352, "mean_token_accuracy": 0.9894498586654663, "num_tokens": 199722836.0, "step": 1881 }, { "entropy": 0.9684963822364807, "epoch": 4.287343215507412, "grad_norm": 1.0703125, "learning_rate": 3.285070549076965e-06, "loss": 0.0364, "mean_token_accuracy": 0.9880726933479309, "num_tokens": 199829214.0, "step": 1882 }, { "entropy": 0.9693914204835892, "epoch": 4.289623717217788, "grad_norm": 1.0078125, "learning_rate": 3.283282154420985e-06, "loss": 0.0319, "mean_token_accuracy": 0.991316556930542, "num_tokens": 199935861.0, "step": 1883 }, { "entropy": 0.9681668132543564, "epoch": 4.291904218928164, "grad_norm": 1.140625, "learning_rate": 3.2814933151891766e-06, "loss": 0.0321, "mean_token_accuracy": 0.989312544465065, "num_tokens": 200042114.0, "step": 1884 }, { "entropy": 0.9719049781560898, "epoch": 4.29418472063854, "grad_norm": 1.0078125, "learning_rate": 3.2797040323968493e-06, "loss": 0.0393, "mean_token_accuracy": 0.9862053990364075, "num_tokens": 200148422.0, "step": 1885 }, { "entropy": 0.9687244892120361, "epoch": 4.296465222348917, "grad_norm": 1.25, "learning_rate": 3.277914307059566e-06, "loss": 0.0382, "mean_token_accuracy": 0.989607185125351, "num_tokens": 200254824.0, "step": 1886 }, { "entropy": 0.9622658491134644, "epoch": 4.298745724059293, "grad_norm": 1.1171875, "learning_rate": 3.276124140193141e-06, "loss": 0.0303, "mean_token_accuracy": 0.9902054816484451, "num_tokens": 200361834.0, "step": 1887 }, { "entropy": 0.9656627774238586, "epoch": 4.301026225769669, "grad_norm": 0.94921875, "learning_rate": 3.274333532813637e-06, "loss": 0.0319, "mean_token_accuracy": 0.9876298159360886, "num_tokens": 200467929.0, "step": 1888 }, { "entropy": 0.9640339910984039, "epoch": 4.303306727480045, "grad_norm": 0.9765625, "learning_rate": 3.272542485937369e-06, "loss": 0.0315, "mean_token_accuracy": 0.9906089007854462, "num_tokens": 200574247.0, "step": 1889 }, { "entropy": 0.9717896282672882, "epoch": 4.305587229190422, "grad_norm": 1.0234375, "learning_rate": 3.2707510005809005e-06, "loss": 0.0375, "mean_token_accuracy": 0.9873252362012863, "num_tokens": 200681141.0, "step": 1890 }, { "entropy": 0.9667606055736542, "epoch": 4.307867730900798, "grad_norm": 1.171875, "learning_rate": 3.2689590777610443e-06, "loss": 0.0368, "mean_token_accuracy": 0.9882083833217621, "num_tokens": 200787591.0, "step": 1891 }, { "entropy": 0.9647973030805588, "epoch": 4.310148232611175, "grad_norm": 1.21875, "learning_rate": 3.267166718494861e-06, "loss": 0.0366, "mean_token_accuracy": 0.9878316670656204, "num_tokens": 200893549.0, "step": 1892 }, { "entropy": 0.966752216219902, "epoch": 4.312428734321551, "grad_norm": 1.0078125, "learning_rate": 3.265373923799658e-06, "loss": 0.0306, "mean_token_accuracy": 0.9902854859828949, "num_tokens": 201000103.0, "step": 1893 }, { "entropy": 0.964112401008606, "epoch": 4.314709236031927, "grad_norm": 0.99609375, "learning_rate": 3.263580694692992e-06, "loss": 0.0309, "mean_token_accuracy": 0.9908120483160019, "num_tokens": 201106783.0, "step": 1894 }, { "entropy": 0.9618717133998871, "epoch": 4.316989737742303, "grad_norm": 0.984375, "learning_rate": 3.261787032192666e-06, "loss": 0.0312, "mean_token_accuracy": 0.9908460527658463, "num_tokens": 201212631.0, "step": 1895 }, { "entropy": 0.961259588599205, "epoch": 4.319270239452679, "grad_norm": 0.9453125, "learning_rate": 3.259992937316727e-06, "loss": 0.0242, "mean_token_accuracy": 0.991882249712944, "num_tokens": 201317940.0, "step": 1896 }, { "entropy": 0.9615083336830139, "epoch": 4.321550741163056, "grad_norm": 1.0703125, "learning_rate": 3.258198411083469e-06, "loss": 0.0364, "mean_token_accuracy": 0.989090159535408, "num_tokens": 201423719.0, "step": 1897 }, { "entropy": 0.9602324366569519, "epoch": 4.323831242873432, "grad_norm": 1.0, "learning_rate": 3.2564034545114308e-06, "loss": 0.0298, "mean_token_accuracy": 0.9898355454206467, "num_tokens": 201530427.0, "step": 1898 }, { "entropy": 0.9641898572444916, "epoch": 4.326111744583809, "grad_norm": 1.1171875, "learning_rate": 3.2546080686193947e-06, "loss": 0.0289, "mean_token_accuracy": 0.9899128377437592, "num_tokens": 201636901.0, "step": 1899 }, { "entropy": 0.9630482941865921, "epoch": 4.328392246294185, "grad_norm": 1.1484375, "learning_rate": 3.2528122544263873e-06, "loss": 0.0284, "mean_token_accuracy": 0.9897538870573044, "num_tokens": 201743075.0, "step": 1900 }, { "entropy": 0.9592286944389343, "epoch": 4.330672748004561, "grad_norm": 0.88671875, "learning_rate": 3.251016012951678e-06, "loss": 0.024, "mean_token_accuracy": 0.9932413101196289, "num_tokens": 201849360.0, "step": 1901 }, { "entropy": 0.9622706919908524, "epoch": 4.3329532497149374, "grad_norm": 1.21875, "learning_rate": 3.2492193452147774e-06, "loss": 0.0375, "mean_token_accuracy": 0.9881621748209, "num_tokens": 201954993.0, "step": 1902 }, { "entropy": 0.9586799144744873, "epoch": 4.335233751425314, "grad_norm": 0.90625, "learning_rate": 3.247422252235442e-06, "loss": 0.0274, "mean_token_accuracy": 0.98979751765728, "num_tokens": 202061527.0, "step": 1903 }, { "entropy": 0.9615220427513123, "epoch": 4.33751425313569, "grad_norm": 1.0, "learning_rate": 3.245624735033665e-06, "loss": 0.0276, "mean_token_accuracy": 0.9921201169490814, "num_tokens": 202167719.0, "step": 1904 }, { "entropy": 0.9631716310977936, "epoch": 4.339794754846066, "grad_norm": 1.015625, "learning_rate": 3.2438267946296836e-06, "loss": 0.0363, "mean_token_accuracy": 0.9890903383493423, "num_tokens": 202274104.0, "step": 1905 }, { "entropy": 0.9551274925470352, "epoch": 4.342075256556442, "grad_norm": 1.1328125, "learning_rate": 3.242028432043974e-06, "loss": 0.0275, "mean_token_accuracy": 0.9914508759975433, "num_tokens": 202380877.0, "step": 1906 }, { "entropy": 0.9610219299793243, "epoch": 4.344355758266818, "grad_norm": 1.015625, "learning_rate": 3.2402296482972513e-06, "loss": 0.0247, "mean_token_accuracy": 0.9927946478128433, "num_tokens": 202487932.0, "step": 1907 }, { "entropy": 0.9623614996671677, "epoch": 4.346636259977195, "grad_norm": 0.89453125, "learning_rate": 3.238430444410471e-06, "loss": 0.028, "mean_token_accuracy": 0.9910885691642761, "num_tokens": 202594745.0, "step": 1908 }, { "entropy": 0.9674703925848007, "epoch": 4.348916761687571, "grad_norm": 1.1484375, "learning_rate": 3.2366308214048262e-06, "loss": 0.0342, "mean_token_accuracy": 0.9884245097637177, "num_tokens": 202700952.0, "step": 1909 }, { "entropy": 0.9569944590330124, "epoch": 4.351197263397948, "grad_norm": 1.0703125, "learning_rate": 3.2348307803017493e-06, "loss": 0.0312, "mean_token_accuracy": 0.9908646494150162, "num_tokens": 202807103.0, "step": 1910 }, { "entropy": 0.9598442316055298, "epoch": 4.353477765108324, "grad_norm": 1.1015625, "learning_rate": 3.2330303221229078e-06, "loss": 0.0323, "mean_token_accuracy": 0.9880586564540863, "num_tokens": 202913111.0, "step": 1911 }, { "entropy": 0.9567076861858368, "epoch": 4.3557582668187, "grad_norm": 0.90234375, "learning_rate": 3.231229447890206e-06, "loss": 0.0233, "mean_token_accuracy": 0.9921134412288666, "num_tokens": 203019314.0, "step": 1912 }, { "entropy": 0.9636315107345581, "epoch": 4.3580387685290765, "grad_norm": 1.109375, "learning_rate": 3.229428158625787e-06, "loss": 0.0294, "mean_token_accuracy": 0.9897793680429459, "num_tokens": 203125869.0, "step": 1913 }, { "entropy": 0.9580313116312027, "epoch": 4.360319270239453, "grad_norm": 1.078125, "learning_rate": 3.2276264553520275e-06, "loss": 0.0276, "mean_token_accuracy": 0.9915956407785416, "num_tokens": 203231764.0, "step": 1914 }, { "entropy": 0.9577291309833527, "epoch": 4.362599771949829, "grad_norm": 1.1953125, "learning_rate": 3.2258243390915397e-06, "loss": 0.0431, "mean_token_accuracy": 0.9872864484786987, "num_tokens": 203338343.0, "step": 1915 }, { "entropy": 0.9626700133085251, "epoch": 4.364880273660205, "grad_norm": 1.015625, "learning_rate": 3.2240218108671683e-06, "loss": 0.0304, "mean_token_accuracy": 0.9899421334266663, "num_tokens": 203444566.0, "step": 1916 }, { "entropy": 0.9622848331928253, "epoch": 4.367160775370581, "grad_norm": 1.0859375, "learning_rate": 3.2222188717019965e-06, "loss": 0.0334, "mean_token_accuracy": 0.9899799227714539, "num_tokens": 203550588.0, "step": 1917 }, { "entropy": 0.9636380076408386, "epoch": 4.369441277080957, "grad_norm": 1.3515625, "learning_rate": 3.220415522619335e-06, "loss": 0.0365, "mean_token_accuracy": 0.9890536665916443, "num_tokens": 203657154.0, "step": 1918 }, { "entropy": 0.9542723596096039, "epoch": 4.3717217787913345, "grad_norm": 1.25, "learning_rate": 3.218611764642732e-06, "loss": 0.039, "mean_token_accuracy": 0.9875475168228149, "num_tokens": 203763468.0, "step": 1919 }, { "entropy": 0.9599116444587708, "epoch": 4.374002280501711, "grad_norm": 1.3359375, "learning_rate": 3.2168075987959633e-06, "loss": 0.0357, "mean_token_accuracy": 0.9894316047430038, "num_tokens": 203869418.0, "step": 1920 }, { "entropy": 0.9561650604009628, "epoch": 4.376282782212087, "grad_norm": 1.1328125, "learning_rate": 3.2150030261030414e-06, "loss": 0.0336, "mean_token_accuracy": 0.988392248749733, "num_tokens": 203975562.0, "step": 1921 }, { "entropy": 0.9596114754676819, "epoch": 4.378563283922463, "grad_norm": 0.87890625, "learning_rate": 3.2131980475882053e-06, "loss": 0.0222, "mean_token_accuracy": 0.9929933100938797, "num_tokens": 204081626.0, "step": 1922 }, { "entropy": 0.9634875059127808, "epoch": 4.380843785632839, "grad_norm": 1.1953125, "learning_rate": 3.2113926642759256e-06, "loss": 0.0434, "mean_token_accuracy": 0.9894748628139496, "num_tokens": 204187181.0, "step": 1923 }, { "entropy": 0.9566300064325333, "epoch": 4.3831242873432155, "grad_norm": 1.0546875, "learning_rate": 3.2095868771909037e-06, "loss": 0.0361, "mean_token_accuracy": 0.9879244416952133, "num_tokens": 204293108.0, "step": 1924 }, { "entropy": 0.9664239287376404, "epoch": 4.385404789053592, "grad_norm": 1.1328125, "learning_rate": 3.2077806873580696e-06, "loss": 0.0297, "mean_token_accuracy": 0.989672988653183, "num_tokens": 204399348.0, "step": 1925 }, { "entropy": 0.963142991065979, "epoch": 4.387685290763968, "grad_norm": 1.1640625, "learning_rate": 3.205974095802582e-06, "loss": 0.0246, "mean_token_accuracy": 0.9916354715824127, "num_tokens": 204504944.0, "step": 1926 }, { "entropy": 0.9589821398258209, "epoch": 4.389965792474344, "grad_norm": 1.1875, "learning_rate": 3.204167103549827e-06, "loss": 0.0267, "mean_token_accuracy": 0.9910952150821686, "num_tokens": 204611237.0, "step": 1927 }, { "entropy": 0.9648015648126602, "epoch": 4.39224629418472, "grad_norm": 1.0546875, "learning_rate": 3.2023597116254175e-06, "loss": 0.0322, "mean_token_accuracy": 0.9924189746379852, "num_tokens": 204717193.0, "step": 1928 }, { "entropy": 0.9570301473140717, "epoch": 4.394526795895097, "grad_norm": 1.109375, "learning_rate": 3.2005519210551955e-06, "loss": 0.0332, "mean_token_accuracy": 0.9902378022670746, "num_tokens": 204823568.0, "step": 1929 }, { "entropy": 0.9621627628803253, "epoch": 4.3968072976054735, "grad_norm": 1.1015625, "learning_rate": 3.1987437328652287e-06, "loss": 0.0359, "mean_token_accuracy": 0.9893099963665009, "num_tokens": 204929810.0, "step": 1930 }, { "entropy": 0.9665351063013077, "epoch": 4.39908779931585, "grad_norm": 1.359375, "learning_rate": 3.196935148081808e-06, "loss": 0.0417, "mean_token_accuracy": 0.9841814041137695, "num_tokens": 205036192.0, "step": 1931 }, { "entropy": 0.9566177278757095, "epoch": 4.401368301026226, "grad_norm": 1.171875, "learning_rate": 3.1951261677314526e-06, "loss": 0.0311, "mean_token_accuracy": 0.9911172688007355, "num_tokens": 205142829.0, "step": 1932 }, { "entropy": 0.9617832005023956, "epoch": 4.403648802736602, "grad_norm": 1.1171875, "learning_rate": 3.1933167928409046e-06, "loss": 0.0327, "mean_token_accuracy": 0.9907574653625488, "num_tokens": 205249798.0, "step": 1933 }, { "entropy": 0.964421883225441, "epoch": 4.405929304446978, "grad_norm": 0.75, "learning_rate": 3.1915070244371295e-06, "loss": 0.0249, "mean_token_accuracy": 0.9921190738677979, "num_tokens": 205356495.0, "step": 1934 }, { "entropy": 0.9646548181772232, "epoch": 4.4082098061573545, "grad_norm": 1.1328125, "learning_rate": 3.1896968635473174e-06, "loss": 0.0383, "mean_token_accuracy": 0.9883177280426025, "num_tokens": 205463092.0, "step": 1935 }, { "entropy": 0.9650232940912247, "epoch": 4.410490307867731, "grad_norm": 1.0625, "learning_rate": 3.187886311198881e-06, "loss": 0.0315, "mean_token_accuracy": 0.9906440526247025, "num_tokens": 205569648.0, "step": 1936 }, { "entropy": 0.9589008688926697, "epoch": 4.412770809578107, "grad_norm": 0.94921875, "learning_rate": 3.1860753684194536e-06, "loss": 0.0342, "mean_token_accuracy": 0.9897739142179489, "num_tokens": 205676269.0, "step": 1937 }, { "entropy": 0.9628678858280182, "epoch": 4.415051311288483, "grad_norm": 1.1875, "learning_rate": 3.1842640362368932e-06, "loss": 0.0356, "mean_token_accuracy": 0.9889646172523499, "num_tokens": 205783215.0, "step": 1938 }, { "entropy": 0.961723804473877, "epoch": 4.41733181299886, "grad_norm": 1.21875, "learning_rate": 3.182452315679276e-06, "loss": 0.0385, "mean_token_accuracy": 0.989678218960762, "num_tokens": 205890024.0, "step": 1939 }, { "entropy": 0.961358055472374, "epoch": 4.419612314709236, "grad_norm": 1.234375, "learning_rate": 3.1806402077748987e-06, "loss": 0.0363, "mean_token_accuracy": 0.9879544973373413, "num_tokens": 205996535.0, "step": 1940 }, { "entropy": 0.9674185067415237, "epoch": 4.4218928164196125, "grad_norm": 1.2578125, "learning_rate": 3.178827713552281e-06, "loss": 0.033, "mean_token_accuracy": 0.9903965890407562, "num_tokens": 206102926.0, "step": 1941 }, { "entropy": 0.9602567106485367, "epoch": 4.424173318129989, "grad_norm": 1.1171875, "learning_rate": 3.177014834040158e-06, "loss": 0.0334, "mean_token_accuracy": 0.9886505007743835, "num_tokens": 206209330.0, "step": 1942 }, { "entropy": 0.9611152410507202, "epoch": 4.426453819840365, "grad_norm": 1.125, "learning_rate": 3.1752015702674855e-06, "loss": 0.0386, "mean_token_accuracy": 0.9877144247293472, "num_tokens": 206315571.0, "step": 1943 }, { "entropy": 0.9630506634712219, "epoch": 4.428734321550741, "grad_norm": 1.0703125, "learning_rate": 3.173387923263437e-06, "loss": 0.033, "mean_token_accuracy": 0.9886568039655685, "num_tokens": 206421798.0, "step": 1944 }, { "entropy": 0.9602255821228027, "epoch": 4.431014823261117, "grad_norm": 0.96484375, "learning_rate": 3.1715738940574032e-06, "loss": 0.0278, "mean_token_accuracy": 0.9911738932132721, "num_tokens": 206528179.0, "step": 1945 }, { "entropy": 0.9557541757822037, "epoch": 4.4332953249714935, "grad_norm": 1.328125, "learning_rate": 3.1697594836789924e-06, "loss": 0.0322, "mean_token_accuracy": 0.9900602847337723, "num_tokens": 206634730.0, "step": 1946 }, { "entropy": 0.9585333466529846, "epoch": 4.43557582668187, "grad_norm": 1.203125, "learning_rate": 3.167944693158029e-06, "loss": 0.0333, "mean_token_accuracy": 0.990835502743721, "num_tokens": 206740752.0, "step": 1947 }, { "entropy": 0.962812140583992, "epoch": 4.437856328392247, "grad_norm": 1.3125, "learning_rate": 3.166129523524553e-06, "loss": 0.0371, "mean_token_accuracy": 0.9874485582113266, "num_tokens": 206846735.0, "step": 1948 }, { "entropy": 0.9616965651512146, "epoch": 4.440136830102623, "grad_norm": 1.453125, "learning_rate": 3.1643139758088194e-06, "loss": 0.0364, "mean_token_accuracy": 0.9882022142410278, "num_tokens": 206953386.0, "step": 1949 }, { "entropy": 0.9651433378458023, "epoch": 4.442417331812999, "grad_norm": 1.03125, "learning_rate": 3.1624980510412984e-06, "loss": 0.0287, "mean_token_accuracy": 0.9916284382343292, "num_tokens": 207059724.0, "step": 1950 }, { "entropy": 0.9585947394371033, "epoch": 4.444697833523375, "grad_norm": 0.890625, "learning_rate": 3.160681750252674e-06, "loss": 0.0332, "mean_token_accuracy": 0.9901492595672607, "num_tokens": 207165809.0, "step": 1951 }, { "entropy": 0.961254358291626, "epoch": 4.4469783352337515, "grad_norm": 0.9921875, "learning_rate": 3.1588650744738418e-06, "loss": 0.0285, "mean_token_accuracy": 0.9914227724075317, "num_tokens": 207272026.0, "step": 1952 }, { "entropy": 0.9629954695701599, "epoch": 4.449258836944128, "grad_norm": 1.015625, "learning_rate": 3.1570480247359147e-06, "loss": 0.0339, "mean_token_accuracy": 0.9909213334321976, "num_tokens": 207378398.0, "step": 1953 }, { "entropy": 0.960856020450592, "epoch": 4.451539338654504, "grad_norm": 0.9921875, "learning_rate": 3.155230602070213e-06, "loss": 0.0321, "mean_token_accuracy": 0.9908858686685562, "num_tokens": 207484283.0, "step": 1954 }, { "entropy": 0.9615865051746368, "epoch": 4.45381984036488, "grad_norm": 1.0234375, "learning_rate": 3.153412807508271e-06, "loss": 0.028, "mean_token_accuracy": 0.9903390854597092, "num_tokens": 207589512.0, "step": 1955 }, { "entropy": 0.9688438773155212, "epoch": 4.456100342075256, "grad_norm": 0.9921875, "learning_rate": 3.1515946420818343e-06, "loss": 0.0352, "mean_token_accuracy": 0.9877954721450806, "num_tokens": 207695829.0, "step": 1956 }, { "entropy": 0.9602732807397842, "epoch": 4.4583808437856325, "grad_norm": 0.95703125, "learning_rate": 3.1497761068228585e-06, "loss": 0.02, "mean_token_accuracy": 0.9936823099851608, "num_tokens": 207802198.0, "step": 1957 }, { "entropy": 0.9661998897790909, "epoch": 4.460661345496009, "grad_norm": 1.078125, "learning_rate": 3.1479572027635085e-06, "loss": 0.0322, "mean_token_accuracy": 0.9897193014621735, "num_tokens": 207908228.0, "step": 1958 }, { "entropy": 0.9618597477674484, "epoch": 4.462941847206386, "grad_norm": 1.2421875, "learning_rate": 3.1461379309361594e-06, "loss": 0.0396, "mean_token_accuracy": 0.9882063716650009, "num_tokens": 208014611.0, "step": 1959 }, { "entropy": 0.9626256227493286, "epoch": 4.465222348916762, "grad_norm": 1.140625, "learning_rate": 3.144318292373395e-06, "loss": 0.0326, "mean_token_accuracy": 0.9881562739610672, "num_tokens": 208121024.0, "step": 1960 }, { "entropy": 0.9622137695550919, "epoch": 4.467502850627138, "grad_norm": 1.390625, "learning_rate": 3.142498288108007e-06, "loss": 0.0306, "mean_token_accuracy": 0.9915677309036255, "num_tokens": 208227646.0, "step": 1961 }, { "entropy": 0.9547878354787827, "epoch": 4.469783352337514, "grad_norm": 0.875, "learning_rate": 3.1406779191729954e-06, "loss": 0.0225, "mean_token_accuracy": 0.9932042360305786, "num_tokens": 208333588.0, "step": 1962 }, { "entropy": 0.9638462513685226, "epoch": 4.4720638540478905, "grad_norm": 1.125, "learning_rate": 3.1388571866015645e-06, "loss": 0.037, "mean_token_accuracy": 0.9888993799686432, "num_tokens": 208439773.0, "step": 1963 }, { "entropy": 0.9597388952970505, "epoch": 4.474344355758267, "grad_norm": 1.1796875, "learning_rate": 3.1370360914271286e-06, "loss": 0.0319, "mean_token_accuracy": 0.9903824180364609, "num_tokens": 208545994.0, "step": 1964 }, { "entropy": 0.9640433043241501, "epoch": 4.476624857468643, "grad_norm": 1.28125, "learning_rate": 3.1352146346833057e-06, "loss": 0.0375, "mean_token_accuracy": 0.9866506010293961, "num_tokens": 208652551.0, "step": 1965 }, { "entropy": 0.9696111977100372, "epoch": 4.478905359179019, "grad_norm": 1.2265625, "learning_rate": 3.133392817403919e-06, "loss": 0.0346, "mean_token_accuracy": 0.9908190816640854, "num_tokens": 208758994.0, "step": 1966 }, { "entropy": 0.9646519869565964, "epoch": 4.481185860889395, "grad_norm": 1.046875, "learning_rate": 3.131570640622998e-06, "loss": 0.0313, "mean_token_accuracy": 0.9907734990119934, "num_tokens": 208865143.0, "step": 1967 }, { "entropy": 0.9590180069208145, "epoch": 4.483466362599772, "grad_norm": 1.046875, "learning_rate": 3.1297481053747737e-06, "loss": 0.0373, "mean_token_accuracy": 0.990446463227272, "num_tokens": 208972162.0, "step": 1968 }, { "entropy": 0.9590565860271454, "epoch": 4.485746864310149, "grad_norm": 1.2109375, "learning_rate": 3.127925212693682e-06, "loss": 0.031, "mean_token_accuracy": 0.9895134419202805, "num_tokens": 209078075.0, "step": 1969 }, { "entropy": 0.9592174589633942, "epoch": 4.488027366020525, "grad_norm": 1.1328125, "learning_rate": 3.1261019636143636e-06, "loss": 0.0315, "mean_token_accuracy": 0.9888044446706772, "num_tokens": 209184197.0, "step": 1970 }, { "entropy": 0.9663747698068619, "epoch": 4.490307867730901, "grad_norm": 1.046875, "learning_rate": 3.124278359171657e-06, "loss": 0.0331, "mean_token_accuracy": 0.9897551089525223, "num_tokens": 209290677.0, "step": 1971 }, { "entropy": 0.9629360884428024, "epoch": 4.492588369441277, "grad_norm": 1.078125, "learning_rate": 3.122454400400606e-06, "loss": 0.0313, "mean_token_accuracy": 0.9903358966112137, "num_tokens": 209396626.0, "step": 1972 }, { "entropy": 0.9574064463376999, "epoch": 4.494868871151653, "grad_norm": 1.1796875, "learning_rate": 3.1206300883364547e-06, "loss": 0.0358, "mean_token_accuracy": 0.9882284551858902, "num_tokens": 209503447.0, "step": 1973 }, { "entropy": 0.966063380241394, "epoch": 4.4971493728620295, "grad_norm": 1.0625, "learning_rate": 3.1188054240146463e-06, "loss": 0.0301, "mean_token_accuracy": 0.9907215982675552, "num_tokens": 209610039.0, "step": 1974 }, { "entropy": 0.9677144587039948, "epoch": 4.499429874572406, "grad_norm": 1.203125, "learning_rate": 3.1169804084708267e-06, "loss": 0.0345, "mean_token_accuracy": 0.9881967157125473, "num_tokens": 209716402.0, "step": 1975 }, { "entropy": 0.9634523540735245, "epoch": 4.501710376282782, "grad_norm": 0.9609375, "learning_rate": 3.1151550427408383e-06, "loss": 0.0292, "mean_token_accuracy": 0.9903677701950073, "num_tokens": 209822759.0, "step": 1976 }, { "entropy": 0.9642475098371506, "epoch": 4.503990877993158, "grad_norm": 1.1015625, "learning_rate": 3.1133293278607228e-06, "loss": 0.0319, "mean_token_accuracy": 0.9897796511650085, "num_tokens": 209929133.0, "step": 1977 }, { "entropy": 0.9657768756151199, "epoch": 4.506271379703534, "grad_norm": 1.03125, "learning_rate": 3.1115032648667224e-06, "loss": 0.032, "mean_token_accuracy": 0.9901612848043442, "num_tokens": 210035897.0, "step": 1978 }, { "entropy": 0.9633835703134537, "epoch": 4.508551881413911, "grad_norm": 1.015625, "learning_rate": 3.1096768547952743e-06, "loss": 0.0281, "mean_token_accuracy": 0.9925199896097183, "num_tokens": 210142173.0, "step": 1979 }, { "entropy": 0.9619158655405045, "epoch": 4.510832383124288, "grad_norm": 1.0703125, "learning_rate": 3.1078500986830134e-06, "loss": 0.0307, "mean_token_accuracy": 0.9897541701793671, "num_tokens": 210248635.0, "step": 1980 }, { "epoch": 4.510832383124288, "eval_entropy": 0.9625453572762783, "eval_loss": 0.03818083927035332, "eval_mean_token_accuracy": 0.9883312501381558, "eval_num_tokens": 210248635.0, "eval_runtime": 66.101, "eval_samples_per_second": 126.851, "eval_steps_per_second": 3.979, "step": 1980 }, { "entropy": 0.9666935056447983, "epoch": 4.513112884834664, "grad_norm": 0.8828125, "learning_rate": 3.1060229975667716e-06, "loss": 0.0277, "mean_token_accuracy": 0.992246463894844, "num_tokens": 210354959.0, "step": 1981 }, { "entropy": 0.9631439745426178, "epoch": 4.51539338654504, "grad_norm": 1.2890625, "learning_rate": 3.104195552483576e-06, "loss": 0.0364, "mean_token_accuracy": 0.9888398945331573, "num_tokens": 210461072.0, "step": 1982 }, { "entropy": 0.9624048173427582, "epoch": 4.517673888255416, "grad_norm": 0.90234375, "learning_rate": 3.102367764470649e-06, "loss": 0.0302, "mean_token_accuracy": 0.9911307841539383, "num_tokens": 210567412.0, "step": 1983 }, { "entropy": 0.9609938561916351, "epoch": 4.519954389965792, "grad_norm": 1.1484375, "learning_rate": 3.1005396345654087e-06, "loss": 0.0389, "mean_token_accuracy": 0.9884150624275208, "num_tokens": 210674445.0, "step": 1984 }, { "entropy": 0.9591469615697861, "epoch": 4.5222348916761685, "grad_norm": 1.0546875, "learning_rate": 3.0987111638054657e-06, "loss": 0.0327, "mean_token_accuracy": 0.9893463253974915, "num_tokens": 210780484.0, "step": 1985 }, { "entropy": 0.9654952883720398, "epoch": 4.524515393386545, "grad_norm": 0.828125, "learning_rate": 3.0968823532286246e-06, "loss": 0.0213, "mean_token_accuracy": 0.9933325052261353, "num_tokens": 210886918.0, "step": 1986 }, { "entropy": 0.9592381715774536, "epoch": 4.526795895096921, "grad_norm": 0.91796875, "learning_rate": 3.095053203872883e-06, "loss": 0.0283, "mean_token_accuracy": 0.9904720485210419, "num_tokens": 210993511.0, "step": 1987 }, { "entropy": 0.9644463658332825, "epoch": 4.529076396807298, "grad_norm": 1.140625, "learning_rate": 3.0932237167764306e-06, "loss": 0.0311, "mean_token_accuracy": 0.9898818880319595, "num_tokens": 211099720.0, "step": 1988 }, { "entropy": 0.9652171581983566, "epoch": 4.531356898517674, "grad_norm": 1.359375, "learning_rate": 3.0913938929776493e-06, "loss": 0.0361, "mean_token_accuracy": 0.987453043460846, "num_tokens": 211205729.0, "step": 1989 }, { "entropy": 0.9674256294965744, "epoch": 4.53363740022805, "grad_norm": 1.15625, "learning_rate": 3.0895637335151117e-06, "loss": 0.0325, "mean_token_accuracy": 0.9908331781625748, "num_tokens": 211312087.0, "step": 1990 }, { "entropy": 0.9642764180898666, "epoch": 4.535917901938427, "grad_norm": 1.0703125, "learning_rate": 3.0877332394275806e-06, "loss": 0.0286, "mean_token_accuracy": 0.9904601871967316, "num_tokens": 211417658.0, "step": 1991 }, { "entropy": 0.9641773849725723, "epoch": 4.538198403648803, "grad_norm": 1.125, "learning_rate": 3.08590241175401e-06, "loss": 0.0366, "mean_token_accuracy": 0.9867310672998428, "num_tokens": 211523853.0, "step": 1992 }, { "entropy": 0.9654300212860107, "epoch": 4.540478905359179, "grad_norm": 1.1171875, "learning_rate": 3.0840712515335412e-06, "loss": 0.0309, "mean_token_accuracy": 0.9909527450799942, "num_tokens": 211629747.0, "step": 1993 }, { "entropy": 0.9654576480388641, "epoch": 4.542759407069555, "grad_norm": 1.140625, "learning_rate": 3.0822397598055065e-06, "loss": 0.029, "mean_token_accuracy": 0.9897182881832123, "num_tokens": 211736553.0, "step": 1994 }, { "entropy": 0.9630515575408936, "epoch": 4.545039908779931, "grad_norm": 1.1484375, "learning_rate": 3.080407937609424e-06, "loss": 0.0352, "mean_token_accuracy": 0.9886357933282852, "num_tokens": 211843203.0, "step": 1995 }, { "entropy": 0.9604474157094955, "epoch": 4.5473204104903076, "grad_norm": 1.328125, "learning_rate": 3.0785757859850025e-06, "loss": 0.0328, "mean_token_accuracy": 0.9885948747396469, "num_tokens": 211949554.0, "step": 1996 }, { "entropy": 0.9662304520606995, "epoch": 4.549600912200685, "grad_norm": 1.046875, "learning_rate": 3.0767433059721338e-06, "loss": 0.0301, "mean_token_accuracy": 0.9895837157964706, "num_tokens": 212055625.0, "step": 1997 }, { "entropy": 0.9662515372037888, "epoch": 4.55188141391106, "grad_norm": 0.96875, "learning_rate": 3.074910498610899e-06, "loss": 0.0269, "mean_token_accuracy": 0.9901050925254822, "num_tokens": 212162158.0, "step": 1998 }, { "entropy": 0.9616214483976364, "epoch": 4.554161915621437, "grad_norm": 1.0390625, "learning_rate": 3.0730773649415647e-06, "loss": 0.0353, "mean_token_accuracy": 0.9890009462833405, "num_tokens": 212268087.0, "step": 1999 }, { "entropy": 0.9615877419710159, "epoch": 4.556442417331813, "grad_norm": 1.0546875, "learning_rate": 3.0712439060045818e-06, "loss": 0.0292, "mean_token_accuracy": 0.9895431101322174, "num_tokens": 212374063.0, "step": 2000 }, { "entropy": 0.9657685458660126, "epoch": 4.558722919042189, "grad_norm": 0.9765625, "learning_rate": 3.069410122840585e-06, "loss": 0.0263, "mean_token_accuracy": 0.9919369518756866, "num_tokens": 212480450.0, "step": 2001 }, { "entropy": 0.9689582735300064, "epoch": 4.561003420752566, "grad_norm": 0.90234375, "learning_rate": 3.0675760164903972e-06, "loss": 0.0307, "mean_token_accuracy": 0.9926885068416595, "num_tokens": 212586666.0, "step": 2002 }, { "entropy": 0.9614085853099823, "epoch": 4.563283922462942, "grad_norm": 0.9375, "learning_rate": 3.065741587995019e-06, "loss": 0.0226, "mean_token_accuracy": 0.9925189316272736, "num_tokens": 212692792.0, "step": 2003 }, { "entropy": 0.960402175784111, "epoch": 4.565564424173318, "grad_norm": 0.98828125, "learning_rate": 3.0639068383956373e-06, "loss": 0.03, "mean_token_accuracy": 0.9900523871183395, "num_tokens": 212798795.0, "step": 2004 }, { "entropy": 0.9657806009054184, "epoch": 4.567844925883694, "grad_norm": 1.0, "learning_rate": 3.062071768733621e-06, "loss": 0.0394, "mean_token_accuracy": 0.988313227891922, "num_tokens": 212905201.0, "step": 2005 }, { "entropy": 0.9635949283838272, "epoch": 4.57012542759407, "grad_norm": 1.0859375, "learning_rate": 3.0602363800505198e-06, "loss": 0.0243, "mean_token_accuracy": 0.9919077157974243, "num_tokens": 213011900.0, "step": 2006 }, { "entropy": 0.96717369556427, "epoch": 4.572405929304447, "grad_norm": 1.2265625, "learning_rate": 3.0584006733880656e-06, "loss": 0.0294, "mean_token_accuracy": 0.9898323118686676, "num_tokens": 213118042.0, "step": 2007 }, { "entropy": 0.958612471818924, "epoch": 4.574686431014824, "grad_norm": 1.0390625, "learning_rate": 3.0565646497881697e-06, "loss": 0.0283, "mean_token_accuracy": 0.9917443096637726, "num_tokens": 213224126.0, "step": 2008 }, { "entropy": 0.9680547565221786, "epoch": 4.5769669327252, "grad_norm": 1.1640625, "learning_rate": 3.0547283102929228e-06, "loss": 0.0295, "mean_token_accuracy": 0.9918537586927414, "num_tokens": 213330370.0, "step": 2009 }, { "entropy": 0.9651702046394348, "epoch": 4.579247434435576, "grad_norm": 1.1796875, "learning_rate": 3.0528916559445967e-06, "loss": 0.0291, "mean_token_accuracy": 0.9904236793518066, "num_tokens": 213436832.0, "step": 2010 }, { "entropy": 0.9639228582382202, "epoch": 4.581527936145952, "grad_norm": 0.79296875, "learning_rate": 3.05105468778564e-06, "loss": 0.0249, "mean_token_accuracy": 0.9918838888406754, "num_tokens": 213543687.0, "step": 2011 }, { "entropy": 0.9706073701381683, "epoch": 4.583808437856328, "grad_norm": 1.1484375, "learning_rate": 3.049217406858681e-06, "loss": 0.0364, "mean_token_accuracy": 0.9886925369501114, "num_tokens": 213649672.0, "step": 2012 }, { "entropy": 0.9641620367765427, "epoch": 4.586088939566705, "grad_norm": 1.1015625, "learning_rate": 3.047379814206526e-06, "loss": 0.0339, "mean_token_accuracy": 0.9880337119102478, "num_tokens": 213755963.0, "step": 2013 }, { "entropy": 0.9638512283563614, "epoch": 4.588369441277081, "grad_norm": 0.97265625, "learning_rate": 3.0455419108721556e-06, "loss": 0.0335, "mean_token_accuracy": 0.9897000193595886, "num_tokens": 213862069.0, "step": 2014 }, { "entropy": 0.9648711383342743, "epoch": 4.590649942987457, "grad_norm": 1.234375, "learning_rate": 3.043703697898728e-06, "loss": 0.025, "mean_token_accuracy": 0.9915802925825119, "num_tokens": 213968699.0, "step": 2015 }, { "entropy": 0.9619019478559494, "epoch": 4.592930444697833, "grad_norm": 1.0390625, "learning_rate": 3.041865176329579e-06, "loss": 0.0262, "mean_token_accuracy": 0.9907998293638229, "num_tokens": 214075303.0, "step": 2016 }, { "entropy": 0.9710960984230042, "epoch": 4.59521094640821, "grad_norm": 1.09375, "learning_rate": 3.040026347208217e-06, "loss": 0.0327, "mean_token_accuracy": 0.9892988204956055, "num_tokens": 214182101.0, "step": 2017 }, { "entropy": 0.9665683805942535, "epoch": 4.5974914481185865, "grad_norm": 1.0390625, "learning_rate": 3.0381872115783256e-06, "loss": 0.0367, "mean_token_accuracy": 0.9892513602972031, "num_tokens": 214288833.0, "step": 2018 }, { "entropy": 0.9643343240022659, "epoch": 4.599771949828963, "grad_norm": 1.0234375, "learning_rate": 3.0363477704837633e-06, "loss": 0.028, "mean_token_accuracy": 0.9912444204092026, "num_tokens": 214395075.0, "step": 2019 }, { "entropy": 0.9695619493722916, "epoch": 4.602052451539339, "grad_norm": 0.984375, "learning_rate": 3.034508024968561e-06, "loss": 0.0382, "mean_token_accuracy": 0.9883536249399185, "num_tokens": 214501787.0, "step": 2020 }, { "entropy": 0.9664026200771332, "epoch": 4.604332953249715, "grad_norm": 0.92578125, "learning_rate": 3.032667976076923e-06, "loss": 0.0274, "mean_token_accuracy": 0.9910774230957031, "num_tokens": 214608167.0, "step": 2021 }, { "entropy": 0.9661362320184708, "epoch": 4.606613454960091, "grad_norm": 1.0625, "learning_rate": 3.0308276248532244e-06, "loss": 0.037, "mean_token_accuracy": 0.9881613552570343, "num_tokens": 214714232.0, "step": 2022 }, { "entropy": 0.9697739332914352, "epoch": 4.608893956670467, "grad_norm": 1.1640625, "learning_rate": 3.0289869723420144e-06, "loss": 0.0258, "mean_token_accuracy": 0.9924123138189316, "num_tokens": 214820682.0, "step": 2023 }, { "entropy": 0.9656050801277161, "epoch": 4.611174458380844, "grad_norm": 0.94921875, "learning_rate": 3.027146019588012e-06, "loss": 0.0272, "mean_token_accuracy": 0.9906067401170731, "num_tokens": 214927225.0, "step": 2024 }, { "entropy": 0.9604237377643585, "epoch": 4.61345496009122, "grad_norm": 0.94921875, "learning_rate": 3.025304767636105e-06, "loss": 0.0286, "mean_token_accuracy": 0.9889236986637115, "num_tokens": 215034122.0, "step": 2025 }, { "entropy": 0.9681928902864456, "epoch": 4.615735461801596, "grad_norm": 0.82421875, "learning_rate": 3.0234632175313537e-06, "loss": 0.0268, "mean_token_accuracy": 0.9905083477497101, "num_tokens": 215140185.0, "step": 2026 }, { "entropy": 0.968199223279953, "epoch": 4.618015963511972, "grad_norm": 0.984375, "learning_rate": 3.0216213703189856e-06, "loss": 0.0314, "mean_token_accuracy": 0.9884727150201797, "num_tokens": 215246454.0, "step": 2027 }, { "entropy": 0.9633829295635223, "epoch": 4.620296465222349, "grad_norm": 1.0078125, "learning_rate": 3.019779227044398e-06, "loss": 0.0291, "mean_token_accuracy": 0.9913322776556015, "num_tokens": 215353382.0, "step": 2028 }, { "entropy": 0.9623784571886063, "epoch": 4.6225769669327255, "grad_norm": 1.0859375, "learning_rate": 3.0179367887531567e-06, "loss": 0.0332, "mean_token_accuracy": 0.9886854737997055, "num_tokens": 215459285.0, "step": 2029 }, { "entropy": 0.974655956029892, "epoch": 4.624857468643102, "grad_norm": 1.015625, "learning_rate": 3.016094056490993e-06, "loss": 0.0322, "mean_token_accuracy": 0.9905224293470383, "num_tokens": 215565667.0, "step": 2030 }, { "entropy": 0.9687018245458603, "epoch": 4.627137970353478, "grad_norm": 0.90234375, "learning_rate": 3.0142510313038057e-06, "loss": 0.027, "mean_token_accuracy": 0.9930611997842789, "num_tokens": 215672491.0, "step": 2031 }, { "entropy": 0.9664171934127808, "epoch": 4.629418472063854, "grad_norm": 1.171875, "learning_rate": 3.012407714237662e-06, "loss": 0.0353, "mean_token_accuracy": 0.9883527606725693, "num_tokens": 215778772.0, "step": 2032 }, { "entropy": 0.9631301611661911, "epoch": 4.63169897377423, "grad_norm": 1.140625, "learning_rate": 3.010564106338791e-06, "loss": 0.0345, "mean_token_accuracy": 0.9879868775606155, "num_tokens": 215885504.0, "step": 2033 }, { "entropy": 0.9653843492269516, "epoch": 4.633979475484606, "grad_norm": 0.9140625, "learning_rate": 3.0087202086535915e-06, "loss": 0.0327, "mean_token_accuracy": 0.98924121260643, "num_tokens": 215992613.0, "step": 2034 }, { "entropy": 0.9673458337783813, "epoch": 4.636259977194983, "grad_norm": 1.078125, "learning_rate": 3.006876022228622e-06, "loss": 0.0302, "mean_token_accuracy": 0.9909915626049042, "num_tokens": 216099037.0, "step": 2035 }, { "entropy": 0.9678966850042343, "epoch": 4.638540478905359, "grad_norm": 1.125, "learning_rate": 3.0050315481106074e-06, "loss": 0.0299, "mean_token_accuracy": 0.9924499541521072, "num_tokens": 216204979.0, "step": 2036 }, { "entropy": 0.9673795998096466, "epoch": 4.640820980615736, "grad_norm": 1.515625, "learning_rate": 3.0031867873464372e-06, "loss": 0.042, "mean_token_accuracy": 0.9856287091970444, "num_tokens": 216311098.0, "step": 2037 }, { "entropy": 0.9659597873687744, "epoch": 4.643101482326112, "grad_norm": 1.078125, "learning_rate": 3.00134174098316e-06, "loss": 0.0316, "mean_token_accuracy": 0.9888305217027664, "num_tokens": 216416194.0, "step": 2038 }, { "entropy": 0.9579313397407532, "epoch": 4.645381984036488, "grad_norm": 1.03125, "learning_rate": 2.999496410067989e-06, "loss": 0.0235, "mean_token_accuracy": 0.9926250874996185, "num_tokens": 216523463.0, "step": 2039 }, { "entropy": 0.9697036147117615, "epoch": 4.6476624857468645, "grad_norm": 1.3203125, "learning_rate": 2.9976507956482996e-06, "loss": 0.0395, "mean_token_accuracy": 0.9893929958343506, "num_tokens": 216629925.0, "step": 2040 }, { "entropy": 0.970169723033905, "epoch": 4.649942987457241, "grad_norm": 1.328125, "learning_rate": 2.9958048987716266e-06, "loss": 0.0369, "mean_token_accuracy": 0.9888918548822403, "num_tokens": 216735754.0, "step": 2041 }, { "entropy": 0.9726706445217133, "epoch": 4.652223489167617, "grad_norm": 1.21875, "learning_rate": 2.993958720485664e-06, "loss": 0.0427, "mean_token_accuracy": 0.985613077878952, "num_tokens": 216841988.0, "step": 2042 }, { "entropy": 0.9629200994968414, "epoch": 4.654503990877993, "grad_norm": 1.0703125, "learning_rate": 2.9921122618382687e-06, "loss": 0.0288, "mean_token_accuracy": 0.9906602799892426, "num_tokens": 216948588.0, "step": 2043 }, { "entropy": 0.9652369469404221, "epoch": 4.656784492588369, "grad_norm": 0.953125, "learning_rate": 2.9902655238774537e-06, "loss": 0.0333, "mean_token_accuracy": 0.9899879097938538, "num_tokens": 217054736.0, "step": 2044 }, { "entropy": 0.9719822853803635, "epoch": 4.659064994298745, "grad_norm": 1.265625, "learning_rate": 2.988418507651392e-06, "loss": 0.042, "mean_token_accuracy": 0.9864343702793121, "num_tokens": 217161309.0, "step": 2045 }, { "entropy": 0.9659538418054581, "epoch": 4.661345496009122, "grad_norm": 1.34375, "learning_rate": 2.9865712142084145e-06, "loss": 0.0367, "mean_token_accuracy": 0.9889904260635376, "num_tokens": 217268224.0, "step": 2046 }, { "entropy": 0.9723149389028549, "epoch": 4.663625997719498, "grad_norm": 1.0546875, "learning_rate": 2.98472364459701e-06, "loss": 0.0263, "mean_token_accuracy": 0.9915052354335785, "num_tokens": 217374701.0, "step": 2047 }, { "entropy": 0.9623636454343796, "epoch": 4.665906499429875, "grad_norm": 0.88671875, "learning_rate": 2.982875799865823e-06, "loss": 0.0347, "mean_token_accuracy": 0.9895738512277603, "num_tokens": 217481008.0, "step": 2048 }, { "entropy": 0.9746496677398682, "epoch": 4.668187001140251, "grad_norm": 0.9140625, "learning_rate": 2.9810276810636535e-06, "loss": 0.0271, "mean_token_accuracy": 0.9908635914325714, "num_tokens": 217587058.0, "step": 2049 }, { "entropy": 0.9681879132986069, "epoch": 4.670467502850627, "grad_norm": 0.93359375, "learning_rate": 2.97917928923946e-06, "loss": 0.0318, "mean_token_accuracy": 0.9882565438747406, "num_tokens": 217693702.0, "step": 2050 }, { "entropy": 0.9684478342533112, "epoch": 4.6727480045610035, "grad_norm": 1.1328125, "learning_rate": 2.977330625442352e-06, "loss": 0.0331, "mean_token_accuracy": 0.9890124648809433, "num_tokens": 217800353.0, "step": 2051 }, { "entropy": 0.9674251079559326, "epoch": 4.67502850627138, "grad_norm": 1.0390625, "learning_rate": 2.9754816907215963e-06, "loss": 0.0235, "mean_token_accuracy": 0.9918033927679062, "num_tokens": 217906484.0, "step": 2052 }, { "entropy": 0.9691198319196701, "epoch": 4.677309007981756, "grad_norm": 1.078125, "learning_rate": 2.9736324861266125e-06, "loss": 0.0358, "mean_token_accuracy": 0.988465890288353, "num_tokens": 218012822.0, "step": 2053 }, { "entropy": 0.9670202136039734, "epoch": 4.679589509692132, "grad_norm": 0.875, "learning_rate": 2.9717830127069734e-06, "loss": 0.0281, "mean_token_accuracy": 0.9898405075073242, "num_tokens": 218118839.0, "step": 2054 }, { "entropy": 0.9679974466562271, "epoch": 4.681870011402508, "grad_norm": 1.078125, "learning_rate": 2.969933271512404e-06, "loss": 0.0339, "mean_token_accuracy": 0.9886072278022766, "num_tokens": 218225025.0, "step": 2055 }, { "entropy": 0.9687058925628662, "epoch": 4.684150513112884, "grad_norm": 1.2578125, "learning_rate": 2.9680832635927824e-06, "loss": 0.041, "mean_token_accuracy": 0.9890687465667725, "num_tokens": 218330975.0, "step": 2056 }, { "entropy": 0.969805896282196, "epoch": 4.6864310148232615, "grad_norm": 1.0625, "learning_rate": 2.9662329899981375e-06, "loss": 0.0333, "mean_token_accuracy": 0.9914238452911377, "num_tokens": 218437518.0, "step": 2057 }, { "entropy": 0.9591345340013504, "epoch": 4.688711516533638, "grad_norm": 0.92578125, "learning_rate": 2.964382451778648e-06, "loss": 0.0229, "mean_token_accuracy": 0.9915568977594376, "num_tokens": 218543876.0, "step": 2058 }, { "entropy": 0.9650208950042725, "epoch": 4.690992018244014, "grad_norm": 1.03125, "learning_rate": 2.9625316499846444e-06, "loss": 0.0256, "mean_token_accuracy": 0.9930683076381683, "num_tokens": 218649955.0, "step": 2059 }, { "entropy": 0.9678671956062317, "epoch": 4.69327251995439, "grad_norm": 1.1953125, "learning_rate": 2.9606805856666053e-06, "loss": 0.0411, "mean_token_accuracy": 0.9870015680789948, "num_tokens": 218756533.0, "step": 2060 }, { "entropy": 0.9701713025569916, "epoch": 4.695553021664766, "grad_norm": 1.046875, "learning_rate": 2.95882925987516e-06, "loss": 0.0296, "mean_token_accuracy": 0.9886830002069473, "num_tokens": 218862979.0, "step": 2061 }, { "entropy": 0.9680881053209305, "epoch": 4.6978335233751425, "grad_norm": 1.0546875, "learning_rate": 2.9569776736610855e-06, "loss": 0.0348, "mean_token_accuracy": 0.9901280701160431, "num_tokens": 218969292.0, "step": 2062 }, { "entropy": 0.9646338820457458, "epoch": 4.700114025085519, "grad_norm": 1.0625, "learning_rate": 2.9551258280753046e-06, "loss": 0.0408, "mean_token_accuracy": 0.9873304814100266, "num_tokens": 219075712.0, "step": 2063 }, { "entropy": 0.9635647833347321, "epoch": 4.702394526795895, "grad_norm": 0.99609375, "learning_rate": 2.953273724168891e-06, "loss": 0.026, "mean_token_accuracy": 0.9903406947851181, "num_tokens": 219182060.0, "step": 2064 }, { "entropy": 0.9679474830627441, "epoch": 4.704675028506271, "grad_norm": 1.25, "learning_rate": 2.9514213629930614e-06, "loss": 0.0489, "mean_token_accuracy": 0.984403669834137, "num_tokens": 219287867.0, "step": 2065 }, { "entropy": 0.9642367362976074, "epoch": 4.706955530216648, "grad_norm": 0.94921875, "learning_rate": 2.949568745599182e-06, "loss": 0.0195, "mean_token_accuracy": 0.9918919205665588, "num_tokens": 219393846.0, "step": 2066 }, { "entropy": 0.9622966051101685, "epoch": 4.7092360319270234, "grad_norm": 1.0078125, "learning_rate": 2.9477158730387615e-06, "loss": 0.033, "mean_token_accuracy": 0.9885730445384979, "num_tokens": 219499868.0, "step": 2067 }, { "entropy": 0.9668188393115997, "epoch": 4.7115165336374005, "grad_norm": 1.0703125, "learning_rate": 2.945862746363455e-06, "loss": 0.0283, "mean_token_accuracy": 0.9911842793226242, "num_tokens": 219606507.0, "step": 2068 }, { "entropy": 0.9693067371845245, "epoch": 4.713797035347777, "grad_norm": 1.0546875, "learning_rate": 2.944009366625061e-06, "loss": 0.0357, "mean_token_accuracy": 0.9875004589557648, "num_tokens": 219712570.0, "step": 2069 }, { "entropy": 0.9613317251205444, "epoch": 4.716077537058153, "grad_norm": 1.03125, "learning_rate": 2.942155734875523e-06, "loss": 0.0333, "mean_token_accuracy": 0.9882738143205643, "num_tokens": 219818722.0, "step": 2070 }, { "entropy": 0.9659555405378342, "epoch": 4.718358038768529, "grad_norm": 1.109375, "learning_rate": 2.9403018521669256e-06, "loss": 0.0303, "mean_token_accuracy": 0.9902569055557251, "num_tokens": 219924961.0, "step": 2071 }, { "entropy": 0.9675969481468201, "epoch": 4.720638540478905, "grad_norm": 1.0390625, "learning_rate": 2.938447719551498e-06, "loss": 0.0249, "mean_token_accuracy": 0.9922146201133728, "num_tokens": 220030883.0, "step": 2072 }, { "entropy": 0.9705196917057037, "epoch": 4.7229190421892815, "grad_norm": 1.0234375, "learning_rate": 2.9365933380816092e-06, "loss": 0.0328, "mean_token_accuracy": 0.9885386079549789, "num_tokens": 220137263.0, "step": 2073 }, { "entropy": 0.9678087830543518, "epoch": 4.725199543899658, "grad_norm": 1.390625, "learning_rate": 2.93473870880977e-06, "loss": 0.0362, "mean_token_accuracy": 0.9879881739616394, "num_tokens": 220243825.0, "step": 2074 }, { "entropy": 0.9651920348405838, "epoch": 4.727480045610034, "grad_norm": 1.2109375, "learning_rate": 2.932883832788633e-06, "loss": 0.0398, "mean_token_accuracy": 0.9887396842241287, "num_tokens": 220350158.0, "step": 2075 }, { "entropy": 0.9689117670059204, "epoch": 4.72976054732041, "grad_norm": 1.2734375, "learning_rate": 2.9310287110709895e-06, "loss": 0.0496, "mean_token_accuracy": 0.9862651228904724, "num_tokens": 220456618.0, "step": 2076 }, { "entropy": 0.9639807045459747, "epoch": 4.732041049030787, "grad_norm": 1.0234375, "learning_rate": 2.9291733447097714e-06, "loss": 0.028, "mean_token_accuracy": 0.9912071079015732, "num_tokens": 220562845.0, "step": 2077 }, { "entropy": 0.9674589037895203, "epoch": 4.734321550741163, "grad_norm": 1.2578125, "learning_rate": 2.927317734758047e-06, "loss": 0.0321, "mean_token_accuracy": 0.9883464425802231, "num_tokens": 220669232.0, "step": 2078 }, { "entropy": 0.9671458899974823, "epoch": 4.7366020524515395, "grad_norm": 1.21875, "learning_rate": 2.925461882269027e-06, "loss": 0.0383, "mean_token_accuracy": 0.9864200204610825, "num_tokens": 220775968.0, "step": 2079 }, { "entropy": 0.9710444360971451, "epoch": 4.738882554161916, "grad_norm": 1.1953125, "learning_rate": 2.9236057882960567e-06, "loss": 0.0398, "mean_token_accuracy": 0.9853101223707199, "num_tokens": 220882611.0, "step": 2080 }, { "entropy": 0.9693167507648468, "epoch": 4.741163055872292, "grad_norm": 1.125, "learning_rate": 2.921749453892618e-06, "loss": 0.032, "mean_token_accuracy": 0.990357056260109, "num_tokens": 220989109.0, "step": 2081 }, { "entropy": 0.9647654891014099, "epoch": 4.743443557582668, "grad_norm": 0.91796875, "learning_rate": 2.919892880112332e-06, "loss": 0.0274, "mean_token_accuracy": 0.9912889897823334, "num_tokens": 221095255.0, "step": 2082 }, { "entropy": 0.9620081037282944, "epoch": 4.745724059293044, "grad_norm": 1.140625, "learning_rate": 2.9180360680089542e-06, "loss": 0.0299, "mean_token_accuracy": 0.9900912940502167, "num_tokens": 221201253.0, "step": 2083 }, { "entropy": 0.9686687290668488, "epoch": 4.7480045610034205, "grad_norm": 1.125, "learning_rate": 2.9161790186363746e-06, "loss": 0.027, "mean_token_accuracy": 0.9901805222034454, "num_tokens": 221307482.0, "step": 2084 }, { "entropy": 0.9672572761774063, "epoch": 4.750285062713797, "grad_norm": 1.1171875, "learning_rate": 2.9143217330486186e-06, "loss": 0.0353, "mean_token_accuracy": 0.9879501610994339, "num_tokens": 221413446.0, "step": 2085 }, { "entropy": 0.9590799510478973, "epoch": 4.752565564424174, "grad_norm": 0.984375, "learning_rate": 2.9124642122998453e-06, "loss": 0.0282, "mean_token_accuracy": 0.9916356801986694, "num_tokens": 221520149.0, "step": 2086 }, { "entropy": 0.9730539917945862, "epoch": 4.75484606613455, "grad_norm": 1.0859375, "learning_rate": 2.9106064574443477e-06, "loss": 0.0319, "mean_token_accuracy": 0.9896161705255508, "num_tokens": 221626258.0, "step": 2087 }, { "entropy": 0.9692618548870087, "epoch": 4.757126567844926, "grad_norm": 1.125, "learning_rate": 2.9087484695365523e-06, "loss": 0.0346, "mean_token_accuracy": 0.9892013221979141, "num_tokens": 221732441.0, "step": 2088 }, { "entropy": 0.9696735739707947, "epoch": 4.759407069555302, "grad_norm": 1.03125, "learning_rate": 2.906890249631017e-06, "loss": 0.0311, "mean_token_accuracy": 0.9885156601667404, "num_tokens": 221838574.0, "step": 2089 }, { "entropy": 0.969530388712883, "epoch": 4.7616875712656785, "grad_norm": 0.890625, "learning_rate": 2.905031798782431e-06, "loss": 0.0232, "mean_token_accuracy": 0.9932834506034851, "num_tokens": 221944994.0, "step": 2090 }, { "entropy": 0.968477264046669, "epoch": 4.763968072976055, "grad_norm": 1.0546875, "learning_rate": 2.903173118045616e-06, "loss": 0.0332, "mean_token_accuracy": 0.9893457889556885, "num_tokens": 222051364.0, "step": 2091 }, { "entropy": 0.9663691520690918, "epoch": 4.766248574686431, "grad_norm": 1.015625, "learning_rate": 2.901314208475522e-06, "loss": 0.0316, "mean_token_accuracy": 0.9891757369041443, "num_tokens": 222157677.0, "step": 2092 }, { "entropy": 0.962722584605217, "epoch": 4.768529076396807, "grad_norm": 1.15625, "learning_rate": 2.8994550711272317e-06, "loss": 0.0359, "mean_token_accuracy": 0.9883667379617691, "num_tokens": 222264014.0, "step": 2093 }, { "entropy": 0.9636942595243454, "epoch": 4.770809578107183, "grad_norm": 1.0546875, "learning_rate": 2.897595707055954e-06, "loss": 0.0281, "mean_token_accuracy": 0.9920971989631653, "num_tokens": 222370305.0, "step": 2094 }, { "entropy": 0.9732548594474792, "epoch": 4.7730900798175595, "grad_norm": 1.140625, "learning_rate": 2.8957361173170297e-06, "loss": 0.0249, "mean_token_accuracy": 0.9927437752485275, "num_tokens": 222476273.0, "step": 2095 }, { "entropy": 0.9692937433719635, "epoch": 4.775370581527936, "grad_norm": 1.375, "learning_rate": 2.893876302965925e-06, "loss": 0.0331, "mean_token_accuracy": 0.9884673655033112, "num_tokens": 222582522.0, "step": 2096 }, { "entropy": 0.9642610251903534, "epoch": 4.777651083238313, "grad_norm": 1.0390625, "learning_rate": 2.8920162650582344e-06, "loss": 0.033, "mean_token_accuracy": 0.989716425538063, "num_tokens": 222690122.0, "step": 2097 }, { "entropy": 0.9661080837249756, "epoch": 4.779931584948689, "grad_norm": 1.15625, "learning_rate": 2.8901560046496797e-06, "loss": 0.0365, "mean_token_accuracy": 0.9875150918960571, "num_tokens": 222796375.0, "step": 2098 }, { "entropy": 0.9638914167881012, "epoch": 4.782212086659065, "grad_norm": 1.1796875, "learning_rate": 2.8882955227961098e-06, "loss": 0.0411, "mean_token_accuracy": 0.9880530536174774, "num_tokens": 222902609.0, "step": 2099 }, { "entropy": 0.9755161851644516, "epoch": 4.784492588369441, "grad_norm": 1.1796875, "learning_rate": 2.886434820553497e-06, "loss": 0.0334, "mean_token_accuracy": 0.9893867522478104, "num_tokens": 223009207.0, "step": 2100 }, { "entropy": 0.9676876217126846, "epoch": 4.7867730900798175, "grad_norm": 0.89453125, "learning_rate": 2.884573898977941e-06, "loss": 0.0271, "mean_token_accuracy": 0.9908407777547836, "num_tokens": 223115239.0, "step": 2101 }, { "entropy": 0.9727314114570618, "epoch": 4.789053591790194, "grad_norm": 1.171875, "learning_rate": 2.882712759125664e-06, "loss": 0.0306, "mean_token_accuracy": 0.9904059916734695, "num_tokens": 223221912.0, "step": 2102 }, { "entropy": 0.9674284160137177, "epoch": 4.79133409350057, "grad_norm": 1.0859375, "learning_rate": 2.8808514020530127e-06, "loss": 0.0365, "mean_token_accuracy": 0.990010604262352, "num_tokens": 223327974.0, "step": 2103 }, { "entropy": 0.9669561833143234, "epoch": 4.793614595210946, "grad_norm": 1.09375, "learning_rate": 2.8789898288164595e-06, "loss": 0.0251, "mean_token_accuracy": 0.9929055571556091, "num_tokens": 223433976.0, "step": 2104 }, { "entropy": 0.9691787213087082, "epoch": 4.795895096921322, "grad_norm": 1.125, "learning_rate": 2.8771280404725953e-06, "loss": 0.0373, "mean_token_accuracy": 0.9888149052858353, "num_tokens": 223539709.0, "step": 2105 }, { "entropy": 0.9662378281354904, "epoch": 4.798175598631699, "grad_norm": 1.375, "learning_rate": 2.8752660380781367e-06, "loss": 0.0369, "mean_token_accuracy": 0.9885356277227402, "num_tokens": 223646169.0, "step": 2106 }, { "entropy": 0.9644147902727127, "epoch": 4.800456100342076, "grad_norm": 0.94140625, "learning_rate": 2.8734038226899198e-06, "loss": 0.0237, "mean_token_accuracy": 0.991982027888298, "num_tokens": 223751910.0, "step": 2107 }, { "entropy": 0.965197429060936, "epoch": 4.802736602052452, "grad_norm": 1.0546875, "learning_rate": 2.8715413953649012e-06, "loss": 0.0373, "mean_token_accuracy": 0.9885252565145493, "num_tokens": 223858412.0, "step": 2108 }, { "entropy": 0.9716744422912598, "epoch": 4.805017103762828, "grad_norm": 1.171875, "learning_rate": 2.8696787571601597e-06, "loss": 0.0355, "mean_token_accuracy": 0.9883052110671997, "num_tokens": 223964767.0, "step": 2109 }, { "entropy": 0.9670709073543549, "epoch": 4.807297605473204, "grad_norm": 0.98046875, "learning_rate": 2.8678159091328926e-06, "loss": 0.0284, "mean_token_accuracy": 0.9915338307619095, "num_tokens": 224071188.0, "step": 2110 }, { "entropy": 0.967201292514801, "epoch": 4.80957810718358, "grad_norm": 1.203125, "learning_rate": 2.865952852340417e-06, "loss": 0.0371, "mean_token_accuracy": 0.9893742650747299, "num_tokens": 224177538.0, "step": 2111 }, { "entropy": 0.9692767411470413, "epoch": 4.811858608893957, "grad_norm": 1.1171875, "learning_rate": 2.864089587840167e-06, "loss": 0.0311, "mean_token_accuracy": 0.9898247867822647, "num_tokens": 224283129.0, "step": 2112 }, { "entropy": 0.9630853086709976, "epoch": 4.814139110604333, "grad_norm": 1.1875, "learning_rate": 2.862226116689696e-06, "loss": 0.0325, "mean_token_accuracy": 0.9881247133016586, "num_tokens": 224389197.0, "step": 2113 }, { "entropy": 0.9705997407436371, "epoch": 4.816419612314709, "grad_norm": 1.28125, "learning_rate": 2.8603624399466732e-06, "loss": 0.027, "mean_token_accuracy": 0.9904511719942093, "num_tokens": 224496246.0, "step": 2114 }, { "entropy": 0.9747413992881775, "epoch": 4.818700114025085, "grad_norm": 1.203125, "learning_rate": 2.858498558668888e-06, "loss": 0.0318, "mean_token_accuracy": 0.9890868961811066, "num_tokens": 224602758.0, "step": 2115 }, { "entropy": 0.969358891248703, "epoch": 4.820980615735461, "grad_norm": 1.5, "learning_rate": 2.856634473914242e-06, "loss": 0.0478, "mean_token_accuracy": 0.9866527616977692, "num_tokens": 224709720.0, "step": 2116 }, { "entropy": 0.9640456587076187, "epoch": 4.823261117445838, "grad_norm": 1.0390625, "learning_rate": 2.854770186740753e-06, "loss": 0.0289, "mean_token_accuracy": 0.9895864576101303, "num_tokens": 224816595.0, "step": 2117 }, { "entropy": 0.9697559922933578, "epoch": 4.825541619156215, "grad_norm": 1.1875, "learning_rate": 2.8529056982065557e-06, "loss": 0.0324, "mean_token_accuracy": 0.989689290523529, "num_tokens": 224922769.0, "step": 2118 }, { "entropy": 0.9675846993923187, "epoch": 4.827822120866591, "grad_norm": 1.140625, "learning_rate": 2.8510410093698966e-06, "loss": 0.0334, "mean_token_accuracy": 0.9867147654294968, "num_tokens": 225028516.0, "step": 2119 }, { "entropy": 0.9734363555908203, "epoch": 4.830102622576967, "grad_norm": 0.9375, "learning_rate": 2.849176121289138e-06, "loss": 0.0278, "mean_token_accuracy": 0.9924135655164719, "num_tokens": 225135568.0, "step": 2120 }, { "entropy": 0.9701265245676041, "epoch": 4.832383124287343, "grad_norm": 1.0, "learning_rate": 2.8473110350227536e-06, "loss": 0.0323, "mean_token_accuracy": 0.9887912422418594, "num_tokens": 225242089.0, "step": 2121 }, { "entropy": 0.969630554318428, "epoch": 4.834663625997719, "grad_norm": 1.359375, "learning_rate": 2.845445751629331e-06, "loss": 0.0356, "mean_token_accuracy": 0.9880877435207367, "num_tokens": 225349075.0, "step": 2122 }, { "entropy": 0.9734223484992981, "epoch": 4.836944127708096, "grad_norm": 1.0234375, "learning_rate": 2.843580272167569e-06, "loss": 0.0299, "mean_token_accuracy": 0.9898817092180252, "num_tokens": 225455551.0, "step": 2123 }, { "entropy": 0.9677305668592453, "epoch": 4.839224629418472, "grad_norm": 0.87109375, "learning_rate": 2.8417145976962773e-06, "loss": 0.0225, "mean_token_accuracy": 0.9932042509317398, "num_tokens": 225561458.0, "step": 2124 }, { "entropy": 0.9687599241733551, "epoch": 4.841505131128848, "grad_norm": 1.296875, "learning_rate": 2.8398487292743772e-06, "loss": 0.0411, "mean_token_accuracy": 0.9883753806352615, "num_tokens": 225667390.0, "step": 2125 }, { "entropy": 0.9648679941892624, "epoch": 4.843785632839225, "grad_norm": 1.0390625, "learning_rate": 2.8379826679609e-06, "loss": 0.0347, "mean_token_accuracy": 0.9876659065485001, "num_tokens": 225773291.0, "step": 2126 }, { "entropy": 0.967779278755188, "epoch": 4.846066134549601, "grad_norm": 1.2421875, "learning_rate": 2.836116414814985e-06, "loss": 0.0379, "mean_token_accuracy": 0.988066241145134, "num_tokens": 225879142.0, "step": 2127 }, { "entropy": 0.966596320271492, "epoch": 4.848346636259977, "grad_norm": 1.2578125, "learning_rate": 2.8342499708958827e-06, "loss": 0.0375, "mean_token_accuracy": 0.9880295991897583, "num_tokens": 225985376.0, "step": 2128 }, { "entropy": 0.9734581410884857, "epoch": 4.850627137970354, "grad_norm": 1.203125, "learning_rate": 2.8323833372629485e-06, "loss": 0.0315, "mean_token_accuracy": 0.9913296103477478, "num_tokens": 226092104.0, "step": 2129 }, { "entropy": 0.9694672971963882, "epoch": 4.85290763968073, "grad_norm": 1.0234375, "learning_rate": 2.8305165149756496e-06, "loss": 0.0329, "mean_token_accuracy": 0.9897075295448303, "num_tokens": 226198465.0, "step": 2130 }, { "entropy": 0.9644845277070999, "epoch": 4.855188141391106, "grad_norm": 1.140625, "learning_rate": 2.828649505093558e-06, "loss": 0.0359, "mean_token_accuracy": 0.9896194338798523, "num_tokens": 226304746.0, "step": 2131 }, { "entropy": 0.9684278964996338, "epoch": 4.857468643101482, "grad_norm": 1.3515625, "learning_rate": 2.826782308676351e-06, "loss": 0.0431, "mean_token_accuracy": 0.9868874996900558, "num_tokens": 226410654.0, "step": 2132 }, { "entropy": 0.9760751873254776, "epoch": 4.859749144811858, "grad_norm": 1.0, "learning_rate": 2.824914926783815e-06, "loss": 0.0243, "mean_token_accuracy": 0.9913100898265839, "num_tokens": 226517283.0, "step": 2133 }, { "entropy": 0.9733271151781082, "epoch": 4.862029646522235, "grad_norm": 1.0859375, "learning_rate": 2.82304736047584e-06, "loss": 0.0328, "mean_token_accuracy": 0.9919776171445847, "num_tokens": 226623776.0, "step": 2134 }, { "entropy": 0.9724178463220596, "epoch": 4.864310148232612, "grad_norm": 1.140625, "learning_rate": 2.821179610812419e-06, "loss": 0.0328, "mean_token_accuracy": 0.9881785809993744, "num_tokens": 226730592.0, "step": 2135 }, { "entropy": 0.9681249409914017, "epoch": 4.866590649942988, "grad_norm": 1.375, "learning_rate": 2.819311678853652e-06, "loss": 0.0407, "mean_token_accuracy": 0.9869142323732376, "num_tokens": 226837585.0, "step": 2136 }, { "entropy": 0.967714324593544, "epoch": 4.868871151653364, "grad_norm": 1.0859375, "learning_rate": 2.8174435656597403e-06, "loss": 0.0271, "mean_token_accuracy": 0.9925556480884552, "num_tokens": 226943963.0, "step": 2137 }, { "entropy": 0.9724419713020325, "epoch": 4.87115165336374, "grad_norm": 0.80859375, "learning_rate": 2.8155752722909896e-06, "loss": 0.0287, "mean_token_accuracy": 0.9910224378108978, "num_tokens": 227050312.0, "step": 2138 }, { "entropy": 0.9704539477825165, "epoch": 4.873432155074116, "grad_norm": 1.046875, "learning_rate": 2.8137067998078073e-06, "loss": 0.0253, "mean_token_accuracy": 0.9908962696790695, "num_tokens": 227156164.0, "step": 2139 }, { "entropy": 0.9665473401546478, "epoch": 4.875712656784493, "grad_norm": 1.1796875, "learning_rate": 2.8118381492707004e-06, "loss": 0.0376, "mean_token_accuracy": 0.9873292148113251, "num_tokens": 227262123.0, "step": 2140 }, { "entropy": 0.9712285995483398, "epoch": 4.877993158494869, "grad_norm": 1.03125, "learning_rate": 2.8099693217402807e-06, "loss": 0.0354, "mean_token_accuracy": 0.9902003556489944, "num_tokens": 227368938.0, "step": 2141 }, { "entropy": 0.9630431234836578, "epoch": 4.880273660205245, "grad_norm": 1.203125, "learning_rate": 2.808100318277258e-06, "loss": 0.0402, "mean_token_accuracy": 0.9863322526216507, "num_tokens": 227475350.0, "step": 2142 }, { "entropy": 0.969196692109108, "epoch": 4.882554161915621, "grad_norm": 1.015625, "learning_rate": 2.806231139942443e-06, "loss": 0.0335, "mean_token_accuracy": 0.9905116707086563, "num_tokens": 227582364.0, "step": 2143 }, { "entropy": 0.9710868746042252, "epoch": 4.884834663625997, "grad_norm": 1.0390625, "learning_rate": 2.8043617877967456e-06, "loss": 0.0334, "mean_token_accuracy": 0.989299938082695, "num_tokens": 227688381.0, "step": 2144 }, { "entropy": 0.9706981927156448, "epoch": 4.887115165336374, "grad_norm": 0.94140625, "learning_rate": 2.8024922629011727e-06, "loss": 0.0304, "mean_token_accuracy": 0.9925013780593872, "num_tokens": 227795008.0, "step": 2145 }, { "entropy": 0.966568797826767, "epoch": 4.889395667046751, "grad_norm": 0.98828125, "learning_rate": 2.800622566316831e-06, "loss": 0.0363, "mean_token_accuracy": 0.9878135621547699, "num_tokens": 227901184.0, "step": 2146 }, { "entropy": 0.9677979052066803, "epoch": 4.891676168757127, "grad_norm": 0.83203125, "learning_rate": 2.798752699104925e-06, "loss": 0.0174, "mean_token_accuracy": 0.9940001517534256, "num_tokens": 228007019.0, "step": 2147 }, { "entropy": 0.9721635729074478, "epoch": 4.893956670467503, "grad_norm": 1.2265625, "learning_rate": 2.7968826623267542e-06, "loss": 0.0387, "mean_token_accuracy": 0.9898413121700287, "num_tokens": 228113371.0, "step": 2148 }, { "entropy": 0.9669943600893021, "epoch": 4.896237172177879, "grad_norm": 1.28125, "learning_rate": 2.7950124570437163e-06, "loss": 0.0368, "mean_token_accuracy": 0.9901173710823059, "num_tokens": 228219711.0, "step": 2149 }, { "entropy": 0.9634810090065002, "epoch": 4.898517673888255, "grad_norm": 1.0703125, "learning_rate": 2.793142084317303e-06, "loss": 0.0351, "mean_token_accuracy": 0.9894691705703735, "num_tokens": 228325652.0, "step": 2150 }, { "entropy": 0.9656842648983002, "epoch": 4.900798175598632, "grad_norm": 1.265625, "learning_rate": 2.7912715452091014e-06, "loss": 0.0312, "mean_token_accuracy": 0.9907356947660446, "num_tokens": 228432253.0, "step": 2151 }, { "entropy": 0.9629835486412048, "epoch": 4.903078677309008, "grad_norm": 1.4453125, "learning_rate": 2.789400840780795e-06, "loss": 0.0437, "mean_token_accuracy": 0.9851148575544357, "num_tokens": 228538208.0, "step": 2152 }, { "entropy": 0.9692727029323578, "epoch": 4.905359179019384, "grad_norm": 1.15625, "learning_rate": 2.7875299720941577e-06, "loss": 0.0363, "mean_token_accuracy": 0.9898783564567566, "num_tokens": 228644883.0, "step": 2153 }, { "entropy": 0.9756845831871033, "epoch": 4.90763968072976, "grad_norm": 1.0, "learning_rate": 2.785658940211059e-06, "loss": 0.0301, "mean_token_accuracy": 0.9907511472702026, "num_tokens": 228750968.0, "step": 2154 }, { "entropy": 0.9739930629730225, "epoch": 4.909920182440137, "grad_norm": 1.078125, "learning_rate": 2.7837877461934616e-06, "loss": 0.0341, "mean_token_accuracy": 0.9887475222349167, "num_tokens": 228856739.0, "step": 2155 }, { "entropy": 0.9670908600091934, "epoch": 4.9122006841505135, "grad_norm": 1.328125, "learning_rate": 2.7819163911034175e-06, "loss": 0.0337, "mean_token_accuracy": 0.9907625466585159, "num_tokens": 228963107.0, "step": 2156 }, { "entropy": 0.9710409641265869, "epoch": 4.91448118586089, "grad_norm": 1.234375, "learning_rate": 2.7800448760030724e-06, "loss": 0.0352, "mean_token_accuracy": 0.9887731671333313, "num_tokens": 229069605.0, "step": 2157 }, { "entropy": 0.9669790267944336, "epoch": 4.916761687571266, "grad_norm": 0.92578125, "learning_rate": 2.7781732019546625e-06, "loss": 0.0275, "mean_token_accuracy": 0.9913983792066574, "num_tokens": 229175809.0, "step": 2158 }, { "entropy": 0.9669333100318909, "epoch": 4.919042189281642, "grad_norm": 0.93359375, "learning_rate": 2.776301370020513e-06, "loss": 0.0242, "mean_token_accuracy": 0.9926677942276001, "num_tokens": 229281316.0, "step": 2159 }, { "entropy": 0.9669125974178314, "epoch": 4.921322690992018, "grad_norm": 1.375, "learning_rate": 2.7744293812630412e-06, "loss": 0.0368, "mean_token_accuracy": 0.9861198514699936, "num_tokens": 229387630.0, "step": 2160 }, { "entropy": 0.9651005268096924, "epoch": 4.923603192702394, "grad_norm": 1.328125, "learning_rate": 2.77255723674475e-06, "loss": 0.0436, "mean_token_accuracy": 0.987390547990799, "num_tokens": 229494099.0, "step": 2161 }, { "entropy": 0.9625854939222336, "epoch": 4.925883694412771, "grad_norm": 1.234375, "learning_rate": 2.770684937528233e-06, "loss": 0.0408, "mean_token_accuracy": 0.9879903048276901, "num_tokens": 229600340.0, "step": 2162 }, { "entropy": 0.9741434901952744, "epoch": 4.928164196123147, "grad_norm": 1.2265625, "learning_rate": 2.7688124846761716e-06, "loss": 0.0346, "mean_token_accuracy": 0.9894145578145981, "num_tokens": 229706435.0, "step": 2163 }, { "entropy": 0.9702705293893814, "epoch": 4.930444697833523, "grad_norm": 1.1796875, "learning_rate": 2.766939879251333e-06, "loss": 0.0422, "mean_token_accuracy": 0.9872661679983139, "num_tokens": 229813257.0, "step": 2164 }, { "entropy": 0.9684687554836273, "epoch": 4.932725199543899, "grad_norm": 0.96875, "learning_rate": 2.7650671223165726e-06, "loss": 0.021, "mean_token_accuracy": 0.9937348961830139, "num_tokens": 229919327.0, "step": 2165 }, { "entropy": 0.9668440073728561, "epoch": 4.935005701254276, "grad_norm": 1.0546875, "learning_rate": 2.7631942149348313e-06, "loss": 0.0372, "mean_token_accuracy": 0.9893345534801483, "num_tokens": 230025574.0, "step": 2166 }, { "entropy": 0.9706819206476212, "epoch": 4.9372862029646525, "grad_norm": 1.234375, "learning_rate": 2.761321158169134e-06, "loss": 0.0383, "mean_token_accuracy": 0.9885515123605728, "num_tokens": 230132031.0, "step": 2167 }, { "entropy": 0.9665537178516388, "epoch": 4.939566704675029, "grad_norm": 1.015625, "learning_rate": 2.759447953082593e-06, "loss": 0.0327, "mean_token_accuracy": 0.9900698661804199, "num_tokens": 230237880.0, "step": 2168 }, { "entropy": 0.9708750396966934, "epoch": 4.941847206385405, "grad_norm": 1.015625, "learning_rate": 2.757574600738402e-06, "loss": 0.0328, "mean_token_accuracy": 0.9878809452056885, "num_tokens": 230343655.0, "step": 2169 }, { "entropy": 0.9706359356641769, "epoch": 4.944127708095781, "grad_norm": 0.921875, "learning_rate": 2.755701102199841e-06, "loss": 0.0314, "mean_token_accuracy": 0.990574061870575, "num_tokens": 230449885.0, "step": 2170 }, { "entropy": 0.9681488424539566, "epoch": 4.946408209806157, "grad_norm": 1.171875, "learning_rate": 2.7538274585302707e-06, "loss": 0.0273, "mean_token_accuracy": 0.9913212358951569, "num_tokens": 230556297.0, "step": 2171 }, { "entropy": 0.966719388961792, "epoch": 4.9486887115165334, "grad_norm": 0.91796875, "learning_rate": 2.751953670793135e-06, "loss": 0.025, "mean_token_accuracy": 0.9928033202886581, "num_tokens": 230662410.0, "step": 2172 }, { "entropy": 0.9749582260847092, "epoch": 4.95096921322691, "grad_norm": 1.140625, "learning_rate": 2.7500797400519595e-06, "loss": 0.0339, "mean_token_accuracy": 0.9862391203641891, "num_tokens": 230768648.0, "step": 2173 }, { "entropy": 0.9727483689785004, "epoch": 4.953249714937286, "grad_norm": 1.078125, "learning_rate": 2.7482056673703526e-06, "loss": 0.0329, "mean_token_accuracy": 0.9903423637151718, "num_tokens": 230875013.0, "step": 2174 }, { "entropy": 0.9683956801891327, "epoch": 4.955530216647663, "grad_norm": 1.2265625, "learning_rate": 2.746331453812e-06, "loss": 0.0294, "mean_token_accuracy": 0.9907998740673065, "num_tokens": 230981264.0, "step": 2175 }, { "entropy": 0.9700355529785156, "epoch": 4.957810718358039, "grad_norm": 1.1015625, "learning_rate": 2.74445710044067e-06, "loss": 0.0268, "mean_token_accuracy": 0.9907157570123672, "num_tokens": 231087921.0, "step": 2176 }, { "entropy": 0.9712271839380264, "epoch": 4.960091220068415, "grad_norm": 1.0859375, "learning_rate": 2.7425826083202096e-06, "loss": 0.028, "mean_token_accuracy": 0.9906646460294724, "num_tokens": 231194023.0, "step": 2177 }, { "entropy": 0.9727357029914856, "epoch": 4.9623717217787915, "grad_norm": 1.15625, "learning_rate": 2.740707978514543e-06, "loss": 0.0358, "mean_token_accuracy": 0.9887940138578415, "num_tokens": 231300646.0, "step": 2178 }, { "entropy": 0.9685721397399902, "epoch": 4.964652223489168, "grad_norm": 1.1171875, "learning_rate": 2.738833212087676e-06, "loss": 0.0309, "mean_token_accuracy": 0.9903885871171951, "num_tokens": 231407175.0, "step": 2179 }, { "entropy": 0.9722049981355667, "epoch": 4.966932725199544, "grad_norm": 0.91796875, "learning_rate": 2.736958310103688e-06, "loss": 0.0332, "mean_token_accuracy": 0.9902003407478333, "num_tokens": 231513425.0, "step": 2180 }, { "entropy": 0.9669078886508942, "epoch": 4.96921322690992, "grad_norm": 1.21875, "learning_rate": 2.735083273626738e-06, "loss": 0.0397, "mean_token_accuracy": 0.9886399954557419, "num_tokens": 231619292.0, "step": 2181 }, { "entropy": 0.9727934002876282, "epoch": 4.971493728620296, "grad_norm": 1.078125, "learning_rate": 2.7332081037210607e-06, "loss": 0.0311, "mean_token_accuracy": 0.9912904500961304, "num_tokens": 231726173.0, "step": 2182 }, { "entropy": 0.966730073094368, "epoch": 4.9737742303306725, "grad_norm": 0.98828125, "learning_rate": 2.7313328014509653e-06, "loss": 0.032, "mean_token_accuracy": 0.9897010326385498, "num_tokens": 231832307.0, "step": 2183 }, { "entropy": 0.9687830954790115, "epoch": 4.976054732041049, "grad_norm": 1.1796875, "learning_rate": 2.729457367880838e-06, "loss": 0.028, "mean_token_accuracy": 0.9898069947957993, "num_tokens": 231938696.0, "step": 2184 }, { "entropy": 0.9661456793546677, "epoch": 4.978335233751425, "grad_norm": 1.1328125, "learning_rate": 2.727581804075139e-06, "loss": 0.0365, "mean_token_accuracy": 0.9863445311784744, "num_tokens": 232044403.0, "step": 2185 }, { "entropy": 0.9699610471725464, "epoch": 4.980615735461802, "grad_norm": 1.0078125, "learning_rate": 2.7257061110984005e-06, "loss": 0.0368, "mean_token_accuracy": 0.9881321787834167, "num_tokens": 232150585.0, "step": 2186 }, { "entropy": 0.966751217842102, "epoch": 4.982896237172178, "grad_norm": 0.828125, "learning_rate": 2.7238302900152327e-06, "loss": 0.0267, "mean_token_accuracy": 0.9933794885873795, "num_tokens": 232256670.0, "step": 2187 }, { "entropy": 0.96232470870018, "epoch": 4.985176738882554, "grad_norm": 1.0078125, "learning_rate": 2.7219543418903115e-06, "loss": 0.037, "mean_token_accuracy": 0.9880478829145432, "num_tokens": 232362351.0, "step": 2188 }, { "entropy": 0.969416931271553, "epoch": 4.9874572405929305, "grad_norm": 1.0078125, "learning_rate": 2.720078267788392e-06, "loss": 0.0369, "mean_token_accuracy": 0.989921897649765, "num_tokens": 232468306.0, "step": 2189 }, { "entropy": 0.9633197337388992, "epoch": 4.989737742303307, "grad_norm": 1.2890625, "learning_rate": 2.718202068774296e-06, "loss": 0.0448, "mean_token_accuracy": 0.9875536859035492, "num_tokens": 232575450.0, "step": 2190 }, { "entropy": 0.9658726453781128, "epoch": 4.992018244013683, "grad_norm": 0.98828125, "learning_rate": 2.7163257459129184e-06, "loss": 0.0259, "mean_token_accuracy": 0.9922931641340256, "num_tokens": 232681809.0, "step": 2191 }, { "entropy": 0.9689303040504456, "epoch": 4.994298745724059, "grad_norm": 1.203125, "learning_rate": 2.7144493002692242e-06, "loss": 0.0276, "mean_token_accuracy": 0.9912952035665512, "num_tokens": 232788221.0, "step": 2192 }, { "entropy": 0.964628204703331, "epoch": 4.996579247434435, "grad_norm": 1.1015625, "learning_rate": 2.7125727329082474e-06, "loss": 0.0352, "mean_token_accuracy": 0.9885336309671402, "num_tokens": 232894387.0, "step": 2193 }, { "entropy": 0.9664207249879837, "epoch": 4.9988597491448115, "grad_norm": 1.0546875, "learning_rate": 2.7106960448950904e-06, "loss": 0.0314, "mean_token_accuracy": 0.9903766810894012, "num_tokens": 233000665.0, "step": 2194 }, { "entropy": 0.9590504765510559, "epoch": 5.0, "grad_norm": 1.375, "learning_rate": 2.7088192372949267e-06, "loss": 0.0243, "mean_token_accuracy": 0.9957035481929779, "num_tokens": 233039880.0, "step": 2195 }, { "entropy": 0.967502236366272, "epoch": 5.002280501710376, "grad_norm": 1.1328125, "learning_rate": 2.7069423111729948e-06, "loss": 0.0375, "mean_token_accuracy": 0.9887584745883942, "num_tokens": 233146251.0, "step": 2196 }, { "entropy": 0.9642662107944489, "epoch": 5.004561003420752, "grad_norm": 1.15625, "learning_rate": 2.705065267594602e-06, "loss": 0.0363, "mean_token_accuracy": 0.9849029034376144, "num_tokens": 233253039.0, "step": 2197 }, { "entropy": 0.9679869711399078, "epoch": 5.006841505131129, "grad_norm": 0.88671875, "learning_rate": 2.703188107625123e-06, "loss": 0.0263, "mean_token_accuracy": 0.991951510310173, "num_tokens": 233359038.0, "step": 2198 }, { "entropy": 0.9616879224777222, "epoch": 5.009122006841505, "grad_norm": 0.9921875, "learning_rate": 2.701310832329996e-06, "loss": 0.0333, "mean_token_accuracy": 0.989852100610733, "num_tokens": 233465175.0, "step": 2199 }, { "entropy": 0.9587210714817047, "epoch": 5.011402508551882, "grad_norm": 1.328125, "learning_rate": 2.6994334427747276e-06, "loss": 0.0312, "mean_token_accuracy": 0.9889845103025436, "num_tokens": 233571249.0, "step": 2200 }, { "epoch": 5.011402508551882, "eval_entropy": 0.9651945815793461, "eval_loss": 0.03787891939282417, "eval_mean_token_accuracy": 0.988409234543717, "eval_num_tokens": 233571249.0, "eval_runtime": 66.0473, "eval_samples_per_second": 126.954, "eval_steps_per_second": 3.982, "step": 2200 }, { "entropy": 0.9646730124950409, "epoch": 5.013683010262258, "grad_norm": 0.953125, "learning_rate": 2.6975559400248876e-06, "loss": 0.0286, "mean_token_accuracy": 0.9913780242204666, "num_tokens": 233678166.0, "step": 2201 }, { "entropy": 0.9700965881347656, "epoch": 5.015963511972634, "grad_norm": 1.0859375, "learning_rate": 2.6956783251461093e-06, "loss": 0.0332, "mean_token_accuracy": 0.9920449107885361, "num_tokens": 233784652.0, "step": 2202 }, { "entropy": 0.9625149667263031, "epoch": 5.01824401368301, "grad_norm": 1.34375, "learning_rate": 2.6938005992040923e-06, "loss": 0.037, "mean_token_accuracy": 0.9894689619541168, "num_tokens": 233891019.0, "step": 2203 }, { "entropy": 0.9661629498004913, "epoch": 5.020524515393387, "grad_norm": 1.125, "learning_rate": 2.6919227632645963e-06, "loss": 0.0335, "mean_token_accuracy": 0.9885742217302322, "num_tokens": 233997167.0, "step": 2204 }, { "entropy": 0.9622644186019897, "epoch": 5.022805017103763, "grad_norm": 0.984375, "learning_rate": 2.690044818393444e-06, "loss": 0.0268, "mean_token_accuracy": 0.9912956207990646, "num_tokens": 234103021.0, "step": 2205 }, { "entropy": 0.9605561196804047, "epoch": 5.025085518814139, "grad_norm": 1.15625, "learning_rate": 2.688166765656523e-06, "loss": 0.0399, "mean_token_accuracy": 0.9888800829648972, "num_tokens": 234209440.0, "step": 2206 }, { "entropy": 0.9636734575033188, "epoch": 5.027366020524515, "grad_norm": 1.1328125, "learning_rate": 2.686288606119778e-06, "loss": 0.0307, "mean_token_accuracy": 0.9897111505270004, "num_tokens": 234315401.0, "step": 2207 }, { "entropy": 0.9620020836591721, "epoch": 5.029646522234891, "grad_norm": 0.8515625, "learning_rate": 2.6844103408492165e-06, "loss": 0.0223, "mean_token_accuracy": 0.9918202310800552, "num_tokens": 234421441.0, "step": 2208 }, { "entropy": 0.965594008564949, "epoch": 5.031927023945268, "grad_norm": 1.1875, "learning_rate": 2.682531970910906e-06, "loss": 0.0356, "mean_token_accuracy": 0.9876242429018021, "num_tokens": 234528156.0, "step": 2209 }, { "entropy": 0.9652815759181976, "epoch": 5.034207525655645, "grad_norm": 1.1640625, "learning_rate": 2.6806534973709723e-06, "loss": 0.0324, "mean_token_accuracy": 0.98935167491436, "num_tokens": 234634850.0, "step": 2210 }, { "entropy": 0.9658011347055435, "epoch": 5.036488027366021, "grad_norm": 0.94921875, "learning_rate": 2.6787749212956023e-06, "loss": 0.0293, "mean_token_accuracy": 0.9917258322238922, "num_tokens": 234740698.0, "step": 2211 }, { "entropy": 0.9641145169734955, "epoch": 5.038768529076397, "grad_norm": 1.28125, "learning_rate": 2.676896243751037e-06, "loss": 0.0461, "mean_token_accuracy": 0.984249159693718, "num_tokens": 234847130.0, "step": 2212 }, { "entropy": 0.9650871753692627, "epoch": 5.041049030786773, "grad_norm": 1.15625, "learning_rate": 2.6750174658035793e-06, "loss": 0.0346, "mean_token_accuracy": 0.9878454208374023, "num_tokens": 234953715.0, "step": 2213 }, { "entropy": 0.9682760536670685, "epoch": 5.043329532497149, "grad_norm": 0.9296875, "learning_rate": 2.673138588519587e-06, "loss": 0.0224, "mean_token_accuracy": 0.9925963878631592, "num_tokens": 235060034.0, "step": 2214 }, { "entropy": 0.9578351825475693, "epoch": 5.045610034207526, "grad_norm": 1.1171875, "learning_rate": 2.671259612965475e-06, "loss": 0.0291, "mean_token_accuracy": 0.9919521510601044, "num_tokens": 235166058.0, "step": 2215 }, { "entropy": 0.964358776807785, "epoch": 5.047890535917902, "grad_norm": 1.1015625, "learning_rate": 2.6693805402077123e-06, "loss": 0.0302, "mean_token_accuracy": 0.9890440404415131, "num_tokens": 235272180.0, "step": 2216 }, { "entropy": 0.9719296842813492, "epoch": 5.050171037628278, "grad_norm": 1.046875, "learning_rate": 2.6675013713128252e-06, "loss": 0.0335, "mean_token_accuracy": 0.9902167767286301, "num_tokens": 235379005.0, "step": 2217 }, { "entropy": 0.9601863473653793, "epoch": 5.052451539338654, "grad_norm": 1.0546875, "learning_rate": 2.665622107347393e-06, "loss": 0.0284, "mean_token_accuracy": 0.989778682589531, "num_tokens": 235485359.0, "step": 2218 }, { "entropy": 0.966862753033638, "epoch": 5.05473204104903, "grad_norm": 1.359375, "learning_rate": 2.6637427493780503e-06, "loss": 0.0344, "mean_token_accuracy": 0.9907550513744354, "num_tokens": 235591171.0, "step": 2219 }, { "entropy": 0.9600814133882523, "epoch": 5.0570125427594075, "grad_norm": 1.2109375, "learning_rate": 2.6618632984714843e-06, "loss": 0.0355, "mean_token_accuracy": 0.9896006286144257, "num_tokens": 235697805.0, "step": 2220 }, { "entropy": 0.9626932740211487, "epoch": 5.059293044469784, "grad_norm": 1.171875, "learning_rate": 2.6599837556944353e-06, "loss": 0.034, "mean_token_accuracy": 0.9901525974273682, "num_tokens": 235803664.0, "step": 2221 }, { "entropy": 0.9672225117683411, "epoch": 5.06157354618016, "grad_norm": 1.046875, "learning_rate": 2.658104122113695e-06, "loss": 0.0329, "mean_token_accuracy": 0.9881310760974884, "num_tokens": 235909796.0, "step": 2222 }, { "entropy": 0.9643477648496628, "epoch": 5.063854047890536, "grad_norm": 1.03125, "learning_rate": 2.6562243987961066e-06, "loss": 0.0358, "mean_token_accuracy": 0.9892918616533279, "num_tokens": 236016535.0, "step": 2223 }, { "entropy": 0.9678105562925339, "epoch": 5.066134549600912, "grad_norm": 1.078125, "learning_rate": 2.6543445868085665e-06, "loss": 0.0355, "mean_token_accuracy": 0.9896625429391861, "num_tokens": 236123031.0, "step": 2224 }, { "entropy": 0.9689858257770538, "epoch": 5.068415051311288, "grad_norm": 0.88671875, "learning_rate": 2.652464687218018e-06, "loss": 0.0289, "mean_token_accuracy": 0.989907443523407, "num_tokens": 236229219.0, "step": 2225 }, { "entropy": 0.9703265279531479, "epoch": 5.070695553021665, "grad_norm": 1.1875, "learning_rate": 2.6505847010914575e-06, "loss": 0.0377, "mean_token_accuracy": 0.9878903031349182, "num_tokens": 236335255.0, "step": 2226 }, { "entropy": 0.9697002619504929, "epoch": 5.072976054732041, "grad_norm": 1.1015625, "learning_rate": 2.6487046294959275e-06, "loss": 0.0258, "mean_token_accuracy": 0.9909873753786087, "num_tokens": 236441484.0, "step": 2227 }, { "entropy": 0.965367317199707, "epoch": 5.075256556442417, "grad_norm": 0.86328125, "learning_rate": 2.64682447349852e-06, "loss": 0.0212, "mean_token_accuracy": 0.9922732412815094, "num_tokens": 236547138.0, "step": 2228 }, { "entropy": 0.967468649148941, "epoch": 5.077537058152793, "grad_norm": 0.8671875, "learning_rate": 2.6449442341663755e-06, "loss": 0.0258, "mean_token_accuracy": 0.9911113232374191, "num_tokens": 236653711.0, "step": 2229 }, { "entropy": 0.9655253887176514, "epoch": 5.07981755986317, "grad_norm": 1.171875, "learning_rate": 2.643063912566683e-06, "loss": 0.0377, "mean_token_accuracy": 0.9866091459989548, "num_tokens": 236759679.0, "step": 2230 }, { "entropy": 0.9671695828437805, "epoch": 5.0820980615735465, "grad_norm": 1.0703125, "learning_rate": 2.641183509766675e-06, "loss": 0.0226, "mean_token_accuracy": 0.9922260791063309, "num_tokens": 236866115.0, "step": 2231 }, { "entropy": 0.956344485282898, "epoch": 5.084378563283923, "grad_norm": 0.921875, "learning_rate": 2.639303026833632e-06, "loss": 0.0253, "mean_token_accuracy": 0.9921888560056686, "num_tokens": 236972453.0, "step": 2232 }, { "entropy": 0.9635798633098602, "epoch": 5.086659064994299, "grad_norm": 1.109375, "learning_rate": 2.6374224648348815e-06, "loss": 0.0272, "mean_token_accuracy": 0.9916727542877197, "num_tokens": 237078912.0, "step": 2233 }, { "entropy": 0.9675526916980743, "epoch": 5.088939566704675, "grad_norm": 1.0546875, "learning_rate": 2.6355418248377928e-06, "loss": 0.0271, "mean_token_accuracy": 0.9920151233673096, "num_tokens": 237185737.0, "step": 2234 }, { "entropy": 0.9662830680608749, "epoch": 5.091220068415051, "grad_norm": 1.03125, "learning_rate": 2.633661107909781e-06, "loss": 0.0301, "mean_token_accuracy": 0.9886122792959213, "num_tokens": 237292002.0, "step": 2235 }, { "entropy": 0.9622719138860703, "epoch": 5.0935005701254275, "grad_norm": 0.9765625, "learning_rate": 2.6317803151183053e-06, "loss": 0.0314, "mean_token_accuracy": 0.9892111718654633, "num_tokens": 237398283.0, "step": 2236 }, { "entropy": 0.9578322768211365, "epoch": 5.095781071835804, "grad_norm": 0.8515625, "learning_rate": 2.629899447530866e-06, "loss": 0.0235, "mean_token_accuracy": 0.9922783821821213, "num_tokens": 237505878.0, "step": 2237 }, { "entropy": 0.9673309773206711, "epoch": 5.09806157354618, "grad_norm": 0.90625, "learning_rate": 2.6280185062150084e-06, "loss": 0.0256, "mean_token_accuracy": 0.9898202568292618, "num_tokens": 237612336.0, "step": 2238 }, { "entropy": 0.9638909846544266, "epoch": 5.100342075256556, "grad_norm": 1.0, "learning_rate": 2.6261374922383176e-06, "loss": 0.0259, "mean_token_accuracy": 0.9923551380634308, "num_tokens": 237718829.0, "step": 2239 }, { "entropy": 0.9660124331712723, "epoch": 5.102622576966933, "grad_norm": 1.140625, "learning_rate": 2.6242564066684217e-06, "loss": 0.0331, "mean_token_accuracy": 0.9891023486852646, "num_tokens": 237824694.0, "step": 2240 }, { "entropy": 0.9654735922813416, "epoch": 5.104903078677309, "grad_norm": 1.0625, "learning_rate": 2.6223752505729884e-06, "loss": 0.0287, "mean_token_accuracy": 0.9900224804878235, "num_tokens": 237931532.0, "step": 2241 }, { "entropy": 0.965776801109314, "epoch": 5.1071835803876855, "grad_norm": 1.1015625, "learning_rate": 2.6204940250197253e-06, "loss": 0.0243, "mean_token_accuracy": 0.9918332695960999, "num_tokens": 238037814.0, "step": 2242 }, { "entropy": 0.9627796858549118, "epoch": 5.109464082098062, "grad_norm": 1.65625, "learning_rate": 2.61861273107638e-06, "loss": 0.0458, "mean_token_accuracy": 0.9855745583772659, "num_tokens": 238144292.0, "step": 2243 }, { "entropy": 0.9628746956586838, "epoch": 5.111744583808438, "grad_norm": 1.328125, "learning_rate": 2.6167313698107385e-06, "loss": 0.0345, "mean_token_accuracy": 0.9887275993824005, "num_tokens": 238250540.0, "step": 2244 }, { "entropy": 0.9654737412929535, "epoch": 5.114025085518814, "grad_norm": 1.0546875, "learning_rate": 2.6148499422906243e-06, "loss": 0.0285, "mean_token_accuracy": 0.9926280230283737, "num_tokens": 238357120.0, "step": 2245 }, { "entropy": 0.957552894949913, "epoch": 5.11630558722919, "grad_norm": 1.1796875, "learning_rate": 2.6129684495839013e-06, "loss": 0.0287, "mean_token_accuracy": 0.9899897128343582, "num_tokens": 238463353.0, "step": 2246 }, { "entropy": 0.9614425599575043, "epoch": 5.1185860889395665, "grad_norm": 1.296875, "learning_rate": 2.611086892758467e-06, "loss": 0.0324, "mean_token_accuracy": 0.9892287403345108, "num_tokens": 238569332.0, "step": 2247 }, { "entropy": 0.9659459739923477, "epoch": 5.120866590649943, "grad_norm": 1.09375, "learning_rate": 2.6092052728822564e-06, "loss": 0.0324, "mean_token_accuracy": 0.9902137815952301, "num_tokens": 238675683.0, "step": 2248 }, { "entropy": 0.9583204090595245, "epoch": 5.123147092360319, "grad_norm": 1.1171875, "learning_rate": 2.607323591023242e-06, "loss": 0.0366, "mean_token_accuracy": 0.9896302819252014, "num_tokens": 238781991.0, "step": 2249 }, { "entropy": 0.9656778872013092, "epoch": 5.125427594070696, "grad_norm": 1.0625, "learning_rate": 2.605441848249428e-06, "loss": 0.0319, "mean_token_accuracy": 0.989483654499054, "num_tokens": 238888505.0, "step": 2250 }, { "entropy": 0.9561873078346252, "epoch": 5.127708095781072, "grad_norm": 1.109375, "learning_rate": 2.6035600456288573e-06, "loss": 0.0269, "mean_token_accuracy": 0.9925181716680527, "num_tokens": 238994260.0, "step": 2251 }, { "entropy": 0.9622867405414581, "epoch": 5.129988597491448, "grad_norm": 1.0703125, "learning_rate": 2.6016781842296044e-06, "loss": 0.029, "mean_token_accuracy": 0.9899962395429611, "num_tokens": 239099986.0, "step": 2252 }, { "entropy": 0.9655700325965881, "epoch": 5.1322690992018245, "grad_norm": 1.2734375, "learning_rate": 2.599796265119777e-06, "loss": 0.0373, "mean_token_accuracy": 0.9883208572864532, "num_tokens": 239206241.0, "step": 2253 }, { "entropy": 0.9600808471441269, "epoch": 5.134549600912201, "grad_norm": 1.109375, "learning_rate": 2.597914289367516e-06, "loss": 0.0264, "mean_token_accuracy": 0.9914987087249756, "num_tokens": 239313020.0, "step": 2254 }, { "entropy": 0.9599828124046326, "epoch": 5.136830102622577, "grad_norm": 0.9609375, "learning_rate": 2.596032258040994e-06, "loss": 0.0265, "mean_token_accuracy": 0.990741103887558, "num_tokens": 239419191.0, "step": 2255 }, { "entropy": 0.9627092629671097, "epoch": 5.139110604332953, "grad_norm": 1.53125, "learning_rate": 2.594150172208417e-06, "loss": 0.0463, "mean_token_accuracy": 0.9859785437583923, "num_tokens": 239525339.0, "step": 2256 }, { "entropy": 0.9578230679035187, "epoch": 5.141391106043329, "grad_norm": 1.1171875, "learning_rate": 2.59226803293802e-06, "loss": 0.0323, "mean_token_accuracy": 0.989411473274231, "num_tokens": 239631508.0, "step": 2257 }, { "entropy": 0.9580929428339005, "epoch": 5.1436716077537055, "grad_norm": 1.0546875, "learning_rate": 2.5903858412980688e-06, "loss": 0.03, "mean_token_accuracy": 0.9911700785160065, "num_tokens": 239737417.0, "step": 2258 }, { "entropy": 0.9563765972852707, "epoch": 5.145952109464082, "grad_norm": 1.2109375, "learning_rate": 2.5885035983568584e-06, "loss": 0.0361, "mean_token_accuracy": 0.9898612201213837, "num_tokens": 239844152.0, "step": 2259 }, { "entropy": 0.962183803319931, "epoch": 5.148232611174459, "grad_norm": 1.0859375, "learning_rate": 2.5866213051827148e-06, "loss": 0.0275, "mean_token_accuracy": 0.9904561191797256, "num_tokens": 239950484.0, "step": 2260 }, { "entropy": 0.9625698626041412, "epoch": 5.150513112884835, "grad_norm": 1.0390625, "learning_rate": 2.5847389628439905e-06, "loss": 0.028, "mean_token_accuracy": 0.9918230623006821, "num_tokens": 240057023.0, "step": 2261 }, { "entropy": 0.9618327170610428, "epoch": 5.152793614595211, "grad_norm": 1.03125, "learning_rate": 2.5828565724090672e-06, "loss": 0.0308, "mean_token_accuracy": 0.990477055311203, "num_tokens": 240163944.0, "step": 2262 }, { "entropy": 0.9662399441003799, "epoch": 5.155074116305587, "grad_norm": 1.046875, "learning_rate": 2.5809741349463526e-06, "loss": 0.0272, "mean_token_accuracy": 0.9929028749465942, "num_tokens": 240270079.0, "step": 2263 }, { "entropy": 0.9670891910791397, "epoch": 5.1573546180159635, "grad_norm": 1.2109375, "learning_rate": 2.579091651524282e-06, "loss": 0.0311, "mean_token_accuracy": 0.9892935752868652, "num_tokens": 240376506.0, "step": 2264 }, { "entropy": 0.9601222425699234, "epoch": 5.15963511972634, "grad_norm": 1.25, "learning_rate": 2.5772091232113176e-06, "loss": 0.0392, "mean_token_accuracy": 0.9860453456640244, "num_tokens": 240483051.0, "step": 2265 }, { "entropy": 0.9604740738868713, "epoch": 5.161915621436716, "grad_norm": 0.9140625, "learning_rate": 2.575326551075945e-06, "loss": 0.0223, "mean_token_accuracy": 0.993595078587532, "num_tokens": 240589251.0, "step": 2266 }, { "entropy": 0.9696341007947922, "epoch": 5.164196123147092, "grad_norm": 1.2109375, "learning_rate": 2.5734439361866762e-06, "loss": 0.0297, "mean_token_accuracy": 0.9909399151802063, "num_tokens": 240696331.0, "step": 2267 }, { "entropy": 0.9589659124612808, "epoch": 5.166476624857468, "grad_norm": 1.0, "learning_rate": 2.571561279612047e-06, "loss": 0.0292, "mean_token_accuracy": 0.9910659492015839, "num_tokens": 240802657.0, "step": 2268 }, { "entropy": 0.9582824558019638, "epoch": 5.168757126567845, "grad_norm": 1.0390625, "learning_rate": 2.5696785824206177e-06, "loss": 0.0284, "mean_token_accuracy": 0.9898156225681305, "num_tokens": 240908734.0, "step": 2269 }, { "entropy": 0.9609266519546509, "epoch": 5.1710376282782216, "grad_norm": 1.390625, "learning_rate": 2.5677958456809703e-06, "loss": 0.0417, "mean_token_accuracy": 0.9874203950166702, "num_tokens": 241014409.0, "step": 2270 }, { "entropy": 0.9675130993127823, "epoch": 5.173318129988598, "grad_norm": 0.9765625, "learning_rate": 2.5659130704617092e-06, "loss": 0.0251, "mean_token_accuracy": 0.9928015470504761, "num_tokens": 241121068.0, "step": 2271 }, { "entropy": 0.9634346663951874, "epoch": 5.175598631698974, "grad_norm": 1.0625, "learning_rate": 2.5640302578314614e-06, "loss": 0.0303, "mean_token_accuracy": 0.9888690263032913, "num_tokens": 241227409.0, "step": 2272 }, { "entropy": 0.9633354395627975, "epoch": 5.17787913340935, "grad_norm": 1.078125, "learning_rate": 2.562147408858876e-06, "loss": 0.0321, "mean_token_accuracy": 0.9900061339139938, "num_tokens": 241333734.0, "step": 2273 }, { "entropy": 0.9689003080129623, "epoch": 5.180159635119726, "grad_norm": 1.28125, "learning_rate": 2.5602645246126207e-06, "loss": 0.044, "mean_token_accuracy": 0.9866411238908768, "num_tokens": 241440127.0, "step": 2274 }, { "entropy": 0.9670904129743576, "epoch": 5.1824401368301025, "grad_norm": 1.1953125, "learning_rate": 2.5583816061613847e-06, "loss": 0.0375, "mean_token_accuracy": 0.9896675497293472, "num_tokens": 241546459.0, "step": 2275 }, { "entropy": 0.9646856933832169, "epoch": 5.184720638540479, "grad_norm": 1.0703125, "learning_rate": 2.5564986545738767e-06, "loss": 0.0354, "mean_token_accuracy": 0.9880961626768112, "num_tokens": 241653168.0, "step": 2276 }, { "entropy": 0.9655872583389282, "epoch": 5.187001140250855, "grad_norm": 1.1328125, "learning_rate": 2.554615670918823e-06, "loss": 0.0302, "mean_token_accuracy": 0.9904870688915253, "num_tokens": 241758692.0, "step": 2277 }, { "entropy": 0.96611288189888, "epoch": 5.189281641961231, "grad_norm": 1.203125, "learning_rate": 2.552732656264969e-06, "loss": 0.0315, "mean_token_accuracy": 0.9890878945589066, "num_tokens": 241865328.0, "step": 2278 }, { "entropy": 0.9641726762056351, "epoch": 5.191562143671608, "grad_norm": 1.4140625, "learning_rate": 2.5508496116810766e-06, "loss": 0.0394, "mean_token_accuracy": 0.9886446446180344, "num_tokens": 241971522.0, "step": 2279 }, { "entropy": 0.9603229463100433, "epoch": 5.193842645381984, "grad_norm": 0.984375, "learning_rate": 2.548966538235927e-06, "loss": 0.0281, "mean_token_accuracy": 0.9879406690597534, "num_tokens": 242077668.0, "step": 2280 }, { "entropy": 0.9587863981723785, "epoch": 5.196123147092361, "grad_norm": 0.875, "learning_rate": 2.547083436998316e-06, "loss": 0.0247, "mean_token_accuracy": 0.9903833866119385, "num_tokens": 242183593.0, "step": 2281 }, { "entropy": 0.9632166624069214, "epoch": 5.198403648802737, "grad_norm": 0.9375, "learning_rate": 2.5452003090370543e-06, "loss": 0.0284, "mean_token_accuracy": 0.990844115614891, "num_tokens": 242289789.0, "step": 2282 }, { "entropy": 0.9592739045619965, "epoch": 5.200684150513113, "grad_norm": 1.0078125, "learning_rate": 2.5433171554209694e-06, "loss": 0.0293, "mean_token_accuracy": 0.989198163151741, "num_tokens": 242395944.0, "step": 2283 }, { "entropy": 0.9641063362360001, "epoch": 5.202964652223489, "grad_norm": 1.0, "learning_rate": 2.5414339772189045e-06, "loss": 0.0252, "mean_token_accuracy": 0.9906119853258133, "num_tokens": 242502436.0, "step": 2284 }, { "entropy": 0.9584684073925018, "epoch": 5.205245153933865, "grad_norm": 0.8671875, "learning_rate": 2.5395507754997135e-06, "loss": 0.026, "mean_token_accuracy": 0.9912816733121872, "num_tokens": 242609144.0, "step": 2285 }, { "entropy": 0.9654579162597656, "epoch": 5.2075256556442415, "grad_norm": 1.1328125, "learning_rate": 2.5376675513322665e-06, "loss": 0.0315, "mean_token_accuracy": 0.9898001551628113, "num_tokens": 242715280.0, "step": 2286 }, { "entropy": 0.9696633666753769, "epoch": 5.209806157354618, "grad_norm": 1.109375, "learning_rate": 2.535784305785443e-06, "loss": 0.0355, "mean_token_accuracy": 0.9892886579036713, "num_tokens": 242821777.0, "step": 2287 }, { "entropy": 0.9686882048845291, "epoch": 5.212086659064994, "grad_norm": 1.328125, "learning_rate": 2.5339010399281394e-06, "loss": 0.0407, "mean_token_accuracy": 0.9868284314870834, "num_tokens": 242928468.0, "step": 2288 }, { "entropy": 0.9630966782569885, "epoch": 5.214367160775371, "grad_norm": 0.8828125, "learning_rate": 2.53201775482926e-06, "loss": 0.031, "mean_token_accuracy": 0.990122064948082, "num_tokens": 243034651.0, "step": 2289 }, { "entropy": 0.9632774442434311, "epoch": 5.216647662485747, "grad_norm": 1.015625, "learning_rate": 2.530134451557722e-06, "loss": 0.0285, "mean_token_accuracy": 0.9915072321891785, "num_tokens": 243141047.0, "step": 2290 }, { "entropy": 0.9679417163133621, "epoch": 5.218928164196123, "grad_norm": 1.125, "learning_rate": 2.52825113118245e-06, "loss": 0.0358, "mean_token_accuracy": 0.9870262145996094, "num_tokens": 243247603.0, "step": 2291 }, { "entropy": 0.9668564945459366, "epoch": 5.2212086659065, "grad_norm": 0.9921875, "learning_rate": 2.5263677947723813e-06, "loss": 0.0318, "mean_token_accuracy": 0.9900890588760376, "num_tokens": 243353876.0, "step": 2292 }, { "entropy": 0.9692956209182739, "epoch": 5.223489167616876, "grad_norm": 1.015625, "learning_rate": 2.5244844433964615e-06, "loss": 0.0323, "mean_token_accuracy": 0.9894204586744308, "num_tokens": 243460641.0, "step": 2293 }, { "entropy": 0.9591492712497711, "epoch": 5.225769669327252, "grad_norm": 1.0703125, "learning_rate": 2.522601078123645e-06, "loss": 0.0265, "mean_token_accuracy": 0.9901764988899231, "num_tokens": 243567238.0, "step": 2294 }, { "entropy": 0.9600541442632675, "epoch": 5.228050171037628, "grad_norm": 1.1015625, "learning_rate": 2.5207177000228916e-06, "loss": 0.0315, "mean_token_accuracy": 0.9900453090667725, "num_tokens": 243673068.0, "step": 2295 }, { "entropy": 0.9701025187969208, "epoch": 5.230330672748004, "grad_norm": 1.15625, "learning_rate": 2.5188343101631717e-06, "loss": 0.0313, "mean_token_accuracy": 0.9908655881881714, "num_tokens": 243779333.0, "step": 2296 }, { "entropy": 0.9628444314002991, "epoch": 5.2326111744583805, "grad_norm": 0.93359375, "learning_rate": 2.516950909613462e-06, "loss": 0.0276, "mean_token_accuracy": 0.9905129671096802, "num_tokens": 243885809.0, "step": 2297 }, { "entropy": 0.9703962355852127, "epoch": 5.234891676168757, "grad_norm": 1.125, "learning_rate": 2.5150674994427427e-06, "loss": 0.0279, "mean_token_accuracy": 0.9925048649311066, "num_tokens": 243991825.0, "step": 2298 }, { "entropy": 0.9647563993930817, "epoch": 5.237172177879134, "grad_norm": 1.046875, "learning_rate": 2.5131840807200015e-06, "loss": 0.0311, "mean_token_accuracy": 0.9896704107522964, "num_tokens": 244099633.0, "step": 2299 }, { "entropy": 0.9610617309808731, "epoch": 5.23945267958951, "grad_norm": 1.203125, "learning_rate": 2.511300654514231e-06, "loss": 0.04, "mean_token_accuracy": 0.9870607256889343, "num_tokens": 244205487.0, "step": 2300 }, { "entropy": 0.9659099280834198, "epoch": 5.241733181299886, "grad_norm": 0.9921875, "learning_rate": 2.5094172218944276e-06, "loss": 0.0228, "mean_token_accuracy": 0.9933736324310303, "num_tokens": 244312131.0, "step": 2301 }, { "entropy": 0.9636854082345963, "epoch": 5.244013683010262, "grad_norm": 1.1875, "learning_rate": 2.5075337839295903e-06, "loss": 0.0325, "mean_token_accuracy": 0.9895174354314804, "num_tokens": 244418235.0, "step": 2302 }, { "entropy": 0.9597474038600922, "epoch": 5.246294184720639, "grad_norm": 1.1640625, "learning_rate": 2.5056503416887222e-06, "loss": 0.0333, "mean_token_accuracy": 0.9891835749149323, "num_tokens": 244524531.0, "step": 2303 }, { "entropy": 0.9660688638687134, "epoch": 5.248574686431015, "grad_norm": 1.0625, "learning_rate": 2.5037668962408295e-06, "loss": 0.0381, "mean_token_accuracy": 0.988997608423233, "num_tokens": 244631069.0, "step": 2304 }, { "entropy": 0.9588354229927063, "epoch": 5.250855188141391, "grad_norm": 1.171875, "learning_rate": 2.5018834486549198e-06, "loss": 0.0349, "mean_token_accuracy": 0.9897580593824387, "num_tokens": 244737541.0, "step": 2305 }, { "entropy": 0.9604678452014923, "epoch": 5.253135689851767, "grad_norm": 1.0546875, "learning_rate": 2.5e-06, "loss": 0.0239, "mean_token_accuracy": 0.9932571351528168, "num_tokens": 244843857.0, "step": 2306 }, { "entropy": 0.9626455903053284, "epoch": 5.255416191562143, "grad_norm": 1.0625, "learning_rate": 2.4981165513450807e-06, "loss": 0.0246, "mean_token_accuracy": 0.9939456433057785, "num_tokens": 244949558.0, "step": 2307 }, { "entropy": 0.9601141661405563, "epoch": 5.2576966932725195, "grad_norm": 1.203125, "learning_rate": 2.4962331037591705e-06, "loss": 0.023, "mean_token_accuracy": 0.993055522441864, "num_tokens": 245055967.0, "step": 2308 }, { "entropy": 0.9621061533689499, "epoch": 5.259977194982897, "grad_norm": 1.046875, "learning_rate": 2.494349658311279e-06, "loss": 0.0288, "mean_token_accuracy": 0.9920292496681213, "num_tokens": 245161827.0, "step": 2309 }, { "entropy": 0.9647950083017349, "epoch": 5.262257696693273, "grad_norm": 1.1171875, "learning_rate": 2.492466216070411e-06, "loss": 0.0304, "mean_token_accuracy": 0.9902704954147339, "num_tokens": 245268948.0, "step": 2310 }, { "entropy": 0.9653992205858231, "epoch": 5.264538198403649, "grad_norm": 1.140625, "learning_rate": 2.4905827781055733e-06, "loss": 0.0363, "mean_token_accuracy": 0.9887953847646713, "num_tokens": 245374799.0, "step": 2311 }, { "entropy": 0.9611643254756927, "epoch": 5.266818700114025, "grad_norm": 1.296875, "learning_rate": 2.4886993454857696e-06, "loss": 0.0335, "mean_token_accuracy": 0.988814651966095, "num_tokens": 245481819.0, "step": 2312 }, { "entropy": 0.957087978720665, "epoch": 5.269099201824401, "grad_norm": 1.5, "learning_rate": 2.486815919279999e-06, "loss": 0.0358, "mean_token_accuracy": 0.9889304339885712, "num_tokens": 245587753.0, "step": 2313 }, { "entropy": 0.9682483226060867, "epoch": 5.271379703534778, "grad_norm": 0.9921875, "learning_rate": 2.4849325005572573e-06, "loss": 0.0286, "mean_token_accuracy": 0.9903685748577118, "num_tokens": 245693738.0, "step": 2314 }, { "entropy": 0.9592265337705612, "epoch": 5.273660205245154, "grad_norm": 1.0546875, "learning_rate": 2.483049090386539e-06, "loss": 0.0262, "mean_token_accuracy": 0.9906654804944992, "num_tokens": 245799670.0, "step": 2315 }, { "entropy": 0.9667607694864273, "epoch": 5.27594070695553, "grad_norm": 1.2734375, "learning_rate": 2.4811656898368287e-06, "loss": 0.0363, "mean_token_accuracy": 0.9880292564630508, "num_tokens": 245905953.0, "step": 2316 }, { "entropy": 0.9659079313278198, "epoch": 5.278221208665906, "grad_norm": 1.234375, "learning_rate": 2.4792822999771092e-06, "loss": 0.0251, "mean_token_accuracy": 0.9921453297138214, "num_tokens": 246012079.0, "step": 2317 }, { "entropy": 0.9630410969257355, "epoch": 5.280501710376283, "grad_norm": 1.3203125, "learning_rate": 2.477398921876356e-06, "loss": 0.0439, "mean_token_accuracy": 0.9860195368528366, "num_tokens": 246118182.0, "step": 2318 }, { "entropy": 0.965837150812149, "epoch": 5.282782212086659, "grad_norm": 0.875, "learning_rate": 2.475515556603539e-06, "loss": 0.0263, "mean_token_accuracy": 0.9924841523170471, "num_tokens": 246224423.0, "step": 2319 }, { "entropy": 0.9606252163648605, "epoch": 5.285062713797036, "grad_norm": 0.99609375, "learning_rate": 2.47363220522762e-06, "loss": 0.0314, "mean_token_accuracy": 0.9906062036752701, "num_tokens": 246330907.0, "step": 2320 }, { "entropy": 0.9543637633323669, "epoch": 5.287343215507412, "grad_norm": 1.1015625, "learning_rate": 2.4717488688175513e-06, "loss": 0.0263, "mean_token_accuracy": 0.9912769347429276, "num_tokens": 246436727.0, "step": 2321 }, { "entropy": 0.9609425961971283, "epoch": 5.289623717217788, "grad_norm": 0.9375, "learning_rate": 2.469865548442279e-06, "loss": 0.0274, "mean_token_accuracy": 0.9916811734437943, "num_tokens": 246543098.0, "step": 2322 }, { "entropy": 0.9576994627714157, "epoch": 5.291904218928164, "grad_norm": 0.9765625, "learning_rate": 2.4679822451707404e-06, "loss": 0.032, "mean_token_accuracy": 0.9891669154167175, "num_tokens": 246649290.0, "step": 2323 }, { "entropy": 0.9605866819620132, "epoch": 5.29418472063854, "grad_norm": 1.265625, "learning_rate": 2.4660989600718606e-06, "loss": 0.0334, "mean_token_accuracy": 0.9898315221071243, "num_tokens": 246755667.0, "step": 2324 }, { "entropy": 0.9581461846828461, "epoch": 5.296465222348917, "grad_norm": 1.0546875, "learning_rate": 2.4642156942145577e-06, "loss": 0.0276, "mean_token_accuracy": 0.9912557601928711, "num_tokens": 246861897.0, "step": 2325 }, { "entropy": 0.9626449346542358, "epoch": 5.298745724059293, "grad_norm": 1.15625, "learning_rate": 2.4623324486677352e-06, "loss": 0.0323, "mean_token_accuracy": 0.9893551617860794, "num_tokens": 246968107.0, "step": 2326 }, { "entropy": 0.9572718292474747, "epoch": 5.301026225769669, "grad_norm": 1.2890625, "learning_rate": 2.4604492245002873e-06, "loss": 0.0348, "mean_token_accuracy": 0.9887119680643082, "num_tokens": 247074401.0, "step": 2327 }, { "entropy": 0.9594281911849976, "epoch": 5.303306727480045, "grad_norm": 1.0703125, "learning_rate": 2.4585660227810963e-06, "loss": 0.0267, "mean_token_accuracy": 0.9911034405231476, "num_tokens": 247180953.0, "step": 2328 }, { "entropy": 0.9617215394973755, "epoch": 5.305587229190422, "grad_norm": 1.2890625, "learning_rate": 2.4566828445790306e-06, "loss": 0.037, "mean_token_accuracy": 0.9886268675327301, "num_tokens": 247286578.0, "step": 2329 }, { "entropy": 0.9571081846952438, "epoch": 5.307867730900798, "grad_norm": 0.859375, "learning_rate": 2.454799690962946e-06, "loss": 0.0204, "mean_token_accuracy": 0.9931831657886505, "num_tokens": 247392611.0, "step": 2330 }, { "entropy": 0.9668297469615936, "epoch": 5.310148232611175, "grad_norm": 1.03125, "learning_rate": 2.4529165630016855e-06, "loss": 0.0263, "mean_token_accuracy": 0.9920424818992615, "num_tokens": 247498496.0, "step": 2331 }, { "entropy": 0.9603388756513596, "epoch": 5.312428734321551, "grad_norm": 1.28125, "learning_rate": 2.4510334617640733e-06, "loss": 0.0345, "mean_token_accuracy": 0.9892850518226624, "num_tokens": 247604573.0, "step": 2332 }, { "entropy": 0.961391270160675, "epoch": 5.314709236031927, "grad_norm": 1.0234375, "learning_rate": 2.4491503883189242e-06, "loss": 0.0251, "mean_token_accuracy": 0.991115465760231, "num_tokens": 247711013.0, "step": 2333 }, { "entropy": 0.9639288485050201, "epoch": 5.316989737742303, "grad_norm": 1.484375, "learning_rate": 2.447267343735032e-06, "loss": 0.0417, "mean_token_accuracy": 0.9870303720235825, "num_tokens": 247816962.0, "step": 2334 }, { "entropy": 0.956159308552742, "epoch": 5.319270239452679, "grad_norm": 1.1640625, "learning_rate": 2.4453843290811772e-06, "loss": 0.0345, "mean_token_accuracy": 0.9900843650102615, "num_tokens": 247923428.0, "step": 2335 }, { "entropy": 0.9560309201478958, "epoch": 5.321550741163056, "grad_norm": 0.875, "learning_rate": 2.4435013454261246e-06, "loss": 0.0201, "mean_token_accuracy": 0.9930903762578964, "num_tokens": 248029174.0, "step": 2336 }, { "entropy": 0.9628920406103134, "epoch": 5.323831242873432, "grad_norm": 1.1875, "learning_rate": 2.4416183938386157e-06, "loss": 0.0355, "mean_token_accuracy": 0.9878555983304977, "num_tokens": 248135886.0, "step": 2337 }, { "entropy": 0.9555362313985825, "epoch": 5.326111744583809, "grad_norm": 1.3359375, "learning_rate": 2.4397354753873797e-06, "loss": 0.0419, "mean_token_accuracy": 0.9849408268928528, "num_tokens": 248242302.0, "step": 2338 }, { "entropy": 0.9626855701208115, "epoch": 5.328392246294185, "grad_norm": 1.0703125, "learning_rate": 2.4378525911411246e-06, "loss": 0.0317, "mean_token_accuracy": 0.9900832772254944, "num_tokens": 248348270.0, "step": 2339 }, { "entropy": 0.9574351608753204, "epoch": 5.330672748004561, "grad_norm": 1.21875, "learning_rate": 2.435969742168539e-06, "loss": 0.0379, "mean_token_accuracy": 0.9867818653583527, "num_tokens": 248454432.0, "step": 2340 }, { "entropy": 0.9655468016862869, "epoch": 5.3329532497149374, "grad_norm": 1.046875, "learning_rate": 2.4340869295382924e-06, "loss": 0.0314, "mean_token_accuracy": 0.9903284907341003, "num_tokens": 248560626.0, "step": 2341 }, { "entropy": 0.9646767973899841, "epoch": 5.335233751425314, "grad_norm": 1.1484375, "learning_rate": 2.432204154319031e-06, "loss": 0.0379, "mean_token_accuracy": 0.9895422011613846, "num_tokens": 248667318.0, "step": 2342 }, { "entropy": 0.9612955749034882, "epoch": 5.33751425313569, "grad_norm": 1.0703125, "learning_rate": 2.4303214175793827e-06, "loss": 0.0321, "mean_token_accuracy": 0.9915361255407333, "num_tokens": 248773653.0, "step": 2343 }, { "entropy": 0.9592497199773788, "epoch": 5.339794754846066, "grad_norm": 1.1875, "learning_rate": 2.4284387203879536e-06, "loss": 0.0303, "mean_token_accuracy": 0.9892747104167938, "num_tokens": 248880768.0, "step": 2344 }, { "entropy": 0.9676202088594437, "epoch": 5.342075256556442, "grad_norm": 0.921875, "learning_rate": 2.426556063813324e-06, "loss": 0.0259, "mean_token_accuracy": 0.9921819716691971, "num_tokens": 248987711.0, "step": 2345 }, { "entropy": 0.9593151360750198, "epoch": 5.344355758266818, "grad_norm": 1.3359375, "learning_rate": 2.4246734489240554e-06, "loss": 0.0351, "mean_token_accuracy": 0.9877553880214691, "num_tokens": 249093723.0, "step": 2346 }, { "entropy": 0.9622428864240646, "epoch": 5.346636259977195, "grad_norm": 1.359375, "learning_rate": 2.4227908767886837e-06, "loss": 0.0377, "mean_token_accuracy": 0.9878920465707779, "num_tokens": 249200756.0, "step": 2347 }, { "entropy": 0.957678496837616, "epoch": 5.348916761687571, "grad_norm": 1.0859375, "learning_rate": 2.420908348475719e-06, "loss": 0.0375, "mean_token_accuracy": 0.9897744655609131, "num_tokens": 249306758.0, "step": 2348 }, { "entropy": 0.9586951434612274, "epoch": 5.351197263397948, "grad_norm": 1.0703125, "learning_rate": 2.4190258650536483e-06, "loss": 0.0269, "mean_token_accuracy": 0.9916781336069107, "num_tokens": 249412931.0, "step": 2349 }, { "entropy": 0.9633719623088837, "epoch": 5.353477765108324, "grad_norm": 0.92578125, "learning_rate": 2.417143427590933e-06, "loss": 0.0296, "mean_token_accuracy": 0.9906605035066605, "num_tokens": 249519667.0, "step": 2350 }, { "entropy": 0.9609403908252716, "epoch": 5.3557582668187, "grad_norm": 1.3984375, "learning_rate": 2.4152610371560095e-06, "loss": 0.0291, "mean_token_accuracy": 0.9904833287000656, "num_tokens": 249626047.0, "step": 2351 }, { "entropy": 0.9619010984897614, "epoch": 5.3580387685290765, "grad_norm": 0.88671875, "learning_rate": 2.413378694817286e-06, "loss": 0.0303, "mean_token_accuracy": 0.9911776930093765, "num_tokens": 249732712.0, "step": 2352 }, { "entropy": 0.9612488001585007, "epoch": 5.360319270239453, "grad_norm": 0.87109375, "learning_rate": 2.411496401643142e-06, "loss": 0.0243, "mean_token_accuracy": 0.992933377623558, "num_tokens": 249839110.0, "step": 2353 }, { "entropy": 0.9619357585906982, "epoch": 5.362599771949829, "grad_norm": 0.9609375, "learning_rate": 2.409614158701932e-06, "loss": 0.0239, "mean_token_accuracy": 0.9921589493751526, "num_tokens": 249945806.0, "step": 2354 }, { "entropy": 0.9618428647518158, "epoch": 5.364880273660205, "grad_norm": 1.0, "learning_rate": 2.407731967061981e-06, "loss": 0.0293, "mean_token_accuracy": 0.9913053065538406, "num_tokens": 250052365.0, "step": 2355 }, { "entropy": 0.964727520942688, "epoch": 5.367160775370581, "grad_norm": 1.3046875, "learning_rate": 2.4058498277915835e-06, "loss": 0.042, "mean_token_accuracy": 0.9847124218940735, "num_tokens": 250157968.0, "step": 2356 }, { "entropy": 0.9647073745727539, "epoch": 5.369441277080957, "grad_norm": 1.2734375, "learning_rate": 2.4039677419590064e-06, "loss": 0.0384, "mean_token_accuracy": 0.9872478246688843, "num_tokens": 250264495.0, "step": 2357 }, { "entropy": 0.9652851223945618, "epoch": 5.3717217787913345, "grad_norm": 1.2265625, "learning_rate": 2.4020857106324853e-06, "loss": 0.0406, "mean_token_accuracy": 0.9853969365358353, "num_tokens": 250370768.0, "step": 2358 }, { "entropy": 0.9687476456165314, "epoch": 5.374002280501711, "grad_norm": 1.4375, "learning_rate": 2.4002037348802245e-06, "loss": 0.0397, "mean_token_accuracy": 0.9877526164054871, "num_tokens": 250477172.0, "step": 2359 }, { "entropy": 0.9643543660640717, "epoch": 5.376282782212087, "grad_norm": 1.0546875, "learning_rate": 2.3983218157703964e-06, "loss": 0.0334, "mean_token_accuracy": 0.9897617101669312, "num_tokens": 250583379.0, "step": 2360 }, { "entropy": 0.9663588851690292, "epoch": 5.378563283922463, "grad_norm": 1.0234375, "learning_rate": 2.3964399543711427e-06, "loss": 0.0245, "mean_token_accuracy": 0.9923048764467239, "num_tokens": 250689926.0, "step": 2361 }, { "entropy": 0.9615009874105453, "epoch": 5.380843785632839, "grad_norm": 1.1640625, "learning_rate": 2.394558151750572e-06, "loss": 0.0343, "mean_token_accuracy": 0.9895179867744446, "num_tokens": 250796492.0, "step": 2362 }, { "entropy": 0.9673548340797424, "epoch": 5.3831242873432155, "grad_norm": 0.953125, "learning_rate": 2.3926764089767594e-06, "loss": 0.0291, "mean_token_accuracy": 0.9913256466388702, "num_tokens": 250902643.0, "step": 2363 }, { "entropy": 0.9660579860210419, "epoch": 5.385404789053592, "grad_norm": 0.85546875, "learning_rate": 2.3907947271177444e-06, "loss": 0.0212, "mean_token_accuracy": 0.9935721307992935, "num_tokens": 251008653.0, "step": 2364 }, { "entropy": 0.9695286750793457, "epoch": 5.387685290763968, "grad_norm": 1.234375, "learning_rate": 2.388913107241534e-06, "loss": 0.0257, "mean_token_accuracy": 0.9922136962413788, "num_tokens": 251114393.0, "step": 2365 }, { "entropy": 0.9680584371089935, "epoch": 5.389965792474344, "grad_norm": 1.1796875, "learning_rate": 2.3870315504160995e-06, "loss": 0.0347, "mean_token_accuracy": 0.9899269044399261, "num_tokens": 251220952.0, "step": 2366 }, { "entropy": 0.9639090597629547, "epoch": 5.39224629418472, "grad_norm": 0.953125, "learning_rate": 2.3851500577093757e-06, "loss": 0.0275, "mean_token_accuracy": 0.9895485043525696, "num_tokens": 251327318.0, "step": 2367 }, { "entropy": 0.965565949678421, "epoch": 5.394526795895097, "grad_norm": 1.1328125, "learning_rate": 2.3832686301892628e-06, "loss": 0.0354, "mean_token_accuracy": 0.9891826957464218, "num_tokens": 251433486.0, "step": 2368 }, { "entropy": 0.9695440530776978, "epoch": 5.3968072976054735, "grad_norm": 1.2265625, "learning_rate": 2.381387268923621e-06, "loss": 0.0426, "mean_token_accuracy": 0.9886240214109421, "num_tokens": 251539962.0, "step": 2369 }, { "entropy": 0.9645057171583176, "epoch": 5.39908779931585, "grad_norm": 1.046875, "learning_rate": 2.3795059749802756e-06, "loss": 0.0304, "mean_token_accuracy": 0.9902735948562622, "num_tokens": 251646223.0, "step": 2370 }, { "entropy": 0.9619290679693222, "epoch": 5.401368301026226, "grad_norm": 1.2421875, "learning_rate": 2.377624749427012e-06, "loss": 0.0353, "mean_token_accuracy": 0.9909352660179138, "num_tokens": 251752746.0, "step": 2371 }, { "entropy": 0.9649537652730942, "epoch": 5.403648802736602, "grad_norm": 1.0, "learning_rate": 2.3757435933315787e-06, "loss": 0.0311, "mean_token_accuracy": 0.9903056472539902, "num_tokens": 251858831.0, "step": 2372 }, { "entropy": 0.967634916305542, "epoch": 5.405929304446978, "grad_norm": 1.0546875, "learning_rate": 2.3738625077616837e-06, "loss": 0.0313, "mean_token_accuracy": 0.9896340519189835, "num_tokens": 251965349.0, "step": 2373 }, { "entropy": 0.9652699530124664, "epoch": 5.4082098061573545, "grad_norm": 1.140625, "learning_rate": 2.371981493784993e-06, "loss": 0.0342, "mean_token_accuracy": 0.9903118908405304, "num_tokens": 252070942.0, "step": 2374 }, { "entropy": 0.9602026641368866, "epoch": 5.410490307867731, "grad_norm": 0.921875, "learning_rate": 2.370100552469135e-06, "loss": 0.0348, "mean_token_accuracy": 0.9886699318885803, "num_tokens": 252176899.0, "step": 2375 }, { "entropy": 0.9674193114042282, "epoch": 5.412770809578107, "grad_norm": 1.296875, "learning_rate": 2.3682196848816955e-06, "loss": 0.044, "mean_token_accuracy": 0.9854168146848679, "num_tokens": 252283350.0, "step": 2376 }, { "entropy": 0.9683422595262527, "epoch": 5.415051311288483, "grad_norm": 0.96484375, "learning_rate": 2.3663388920902198e-06, "loss": 0.0258, "mean_token_accuracy": 0.9930626600980759, "num_tokens": 252389544.0, "step": 2377 }, { "entropy": 0.9604955613613129, "epoch": 5.41733181299886, "grad_norm": 0.984375, "learning_rate": 2.3644581751622076e-06, "loss": 0.0288, "mean_token_accuracy": 0.9906046092510223, "num_tokens": 252495820.0, "step": 2378 }, { "entropy": 0.9706380367279053, "epoch": 5.419612314709236, "grad_norm": 1.1171875, "learning_rate": 2.3625775351651193e-06, "loss": 0.0329, "mean_token_accuracy": 0.9908811300992966, "num_tokens": 252601890.0, "step": 2379 }, { "entropy": 0.9665205180644989, "epoch": 5.4218928164196125, "grad_norm": 1.0078125, "learning_rate": 2.3606969731663683e-06, "loss": 0.0322, "mean_token_accuracy": 0.9899998754262924, "num_tokens": 252708302.0, "step": 2380 }, { "entropy": 0.968678817152977, "epoch": 5.424173318129989, "grad_norm": 1.2265625, "learning_rate": 2.358816490233326e-06, "loss": 0.0326, "mean_token_accuracy": 0.9894915968179703, "num_tokens": 252814282.0, "step": 2381 }, { "entropy": 0.9722213000059128, "epoch": 5.426453819840365, "grad_norm": 1.171875, "learning_rate": 2.356936087433318e-06, "loss": 0.0271, "mean_token_accuracy": 0.9928218275308609, "num_tokens": 252920956.0, "step": 2382 }, { "entropy": 0.9641828387975693, "epoch": 5.428734321550741, "grad_norm": 0.99609375, "learning_rate": 2.3550557658336245e-06, "loss": 0.0244, "mean_token_accuracy": 0.9914212673902512, "num_tokens": 253026921.0, "step": 2383 }, { "entropy": 0.9644743800163269, "epoch": 5.431014823261117, "grad_norm": 1.203125, "learning_rate": 2.3531755265014818e-06, "loss": 0.0399, "mean_token_accuracy": 0.9880397021770477, "num_tokens": 253134495.0, "step": 2384 }, { "entropy": 0.965605840086937, "epoch": 5.4332953249714935, "grad_norm": 1.0390625, "learning_rate": 2.3512953705040737e-06, "loss": 0.0263, "mean_token_accuracy": 0.9929135590791702, "num_tokens": 253241584.0, "step": 2385 }, { "entropy": 0.9608158320188522, "epoch": 5.43557582668187, "grad_norm": 0.9140625, "learning_rate": 2.3494152989085433e-06, "loss": 0.0291, "mean_token_accuracy": 0.9909258931875229, "num_tokens": 253349019.0, "step": 2386 }, { "entropy": 0.9733458757400513, "epoch": 5.437856328392247, "grad_norm": 1.15625, "learning_rate": 2.3475353127819827e-06, "loss": 0.0326, "mean_token_accuracy": 0.9886897057294846, "num_tokens": 253455819.0, "step": 2387 }, { "entropy": 0.9634932279586792, "epoch": 5.440136830102623, "grad_norm": 0.97265625, "learning_rate": 2.345655413191434e-06, "loss": 0.0336, "mean_token_accuracy": 0.9896508604288101, "num_tokens": 253562307.0, "step": 2388 }, { "entropy": 0.9651092439889908, "epoch": 5.442417331812999, "grad_norm": 0.91015625, "learning_rate": 2.3437756012038933e-06, "loss": 0.0275, "mean_token_accuracy": 0.9909972250461578, "num_tokens": 253669489.0, "step": 2389 }, { "entropy": 0.9643020033836365, "epoch": 5.444697833523375, "grad_norm": 0.88671875, "learning_rate": 2.341895877886306e-06, "loss": 0.0235, "mean_token_accuracy": 0.9911245256662369, "num_tokens": 253775657.0, "step": 2390 }, { "entropy": 0.9683791697025299, "epoch": 5.4469783352337515, "grad_norm": 1.296875, "learning_rate": 2.3400162443055655e-06, "loss": 0.0331, "mean_token_accuracy": 0.9896823018789291, "num_tokens": 253882031.0, "step": 2391 }, { "entropy": 0.9651946723461151, "epoch": 5.449258836944128, "grad_norm": 1.0234375, "learning_rate": 2.338136701528516e-06, "loss": 0.034, "mean_token_accuracy": 0.9890072047710419, "num_tokens": 253988727.0, "step": 2392 }, { "entropy": 0.9672014564275742, "epoch": 5.451539338654504, "grad_norm": 0.99609375, "learning_rate": 2.33625725062195e-06, "loss": 0.0264, "mean_token_accuracy": 0.9921978414058685, "num_tokens": 254094680.0, "step": 2393 }, { "entropy": 0.9607627093791962, "epoch": 5.45381984036488, "grad_norm": 1.0703125, "learning_rate": 2.3343778926526074e-06, "loss": 0.0298, "mean_token_accuracy": 0.990896612405777, "num_tokens": 254200472.0, "step": 2394 }, { "entropy": 0.9632444381713867, "epoch": 5.456100342075256, "grad_norm": 1.046875, "learning_rate": 2.332498628687176e-06, "loss": 0.0246, "mean_token_accuracy": 0.9920130372047424, "num_tokens": 254306520.0, "step": 2395 }, { "entropy": 0.9650780558586121, "epoch": 5.4583808437856325, "grad_norm": 1.2109375, "learning_rate": 2.330619459792289e-06, "loss": 0.0342, "mean_token_accuracy": 0.9892529398202896, "num_tokens": 254412420.0, "step": 2396 }, { "entropy": 0.969687819480896, "epoch": 5.460661345496009, "grad_norm": 1.0625, "learning_rate": 2.328740387034526e-06, "loss": 0.0368, "mean_token_accuracy": 0.9872991144657135, "num_tokens": 254518791.0, "step": 2397 }, { "entropy": 0.9620527476072311, "epoch": 5.462941847206386, "grad_norm": 1.1796875, "learning_rate": 2.326861411480414e-06, "loss": 0.0312, "mean_token_accuracy": 0.9892431348562241, "num_tokens": 254624905.0, "step": 2398 }, { "entropy": 0.9625495225191116, "epoch": 5.465222348916762, "grad_norm": 1.0390625, "learning_rate": 2.324982534196421e-06, "loss": 0.0315, "mean_token_accuracy": 0.9904213398694992, "num_tokens": 254731640.0, "step": 2399 }, { "entropy": 0.9606980681419373, "epoch": 5.467502850627138, "grad_norm": 1.0078125, "learning_rate": 2.3231037562489636e-06, "loss": 0.0364, "mean_token_accuracy": 0.9893456697463989, "num_tokens": 254838172.0, "step": 2400 }, { "entropy": 0.9663498252630234, "epoch": 5.469783352337514, "grad_norm": 1.15625, "learning_rate": 2.321225078704399e-06, "loss": 0.0356, "mean_token_accuracy": 0.9867542237043381, "num_tokens": 254944334.0, "step": 2401 }, { "entropy": 0.9642149657011032, "epoch": 5.4720638540478905, "grad_norm": 0.8671875, "learning_rate": 2.319346502629028e-06, "loss": 0.0232, "mean_token_accuracy": 0.992842897772789, "num_tokens": 255050606.0, "step": 2402 }, { "entropy": 0.9678411334753036, "epoch": 5.474344355758267, "grad_norm": 0.9453125, "learning_rate": 2.3174680290890945e-06, "loss": 0.0244, "mean_token_accuracy": 0.991176500916481, "num_tokens": 255157128.0, "step": 2403 }, { "entropy": 0.9650191366672516, "epoch": 5.476624857468643, "grad_norm": 1.2265625, "learning_rate": 2.315589659150784e-06, "loss": 0.0383, "mean_token_accuracy": 0.9876338094472885, "num_tokens": 255263316.0, "step": 2404 }, { "entropy": 0.9568027853965759, "epoch": 5.478905359179019, "grad_norm": 0.9140625, "learning_rate": 2.3137113938802224e-06, "loss": 0.0242, "mean_token_accuracy": 0.9924261271953583, "num_tokens": 255369980.0, "step": 2405 }, { "entropy": 0.9637410491704941, "epoch": 5.481185860889395, "grad_norm": 1.25, "learning_rate": 2.311833234343478e-06, "loss": 0.0456, "mean_token_accuracy": 0.9847883135080338, "num_tokens": 255476151.0, "step": 2406 }, { "entropy": 0.9671718329191208, "epoch": 5.483466362599772, "grad_norm": 1.0546875, "learning_rate": 2.3099551816065563e-06, "loss": 0.0385, "mean_token_accuracy": 0.9867358356714249, "num_tokens": 255581791.0, "step": 2407 }, { "entropy": 0.9669261425733566, "epoch": 5.485746864310149, "grad_norm": 0.84375, "learning_rate": 2.3080772367354046e-06, "loss": 0.0219, "mean_token_accuracy": 0.9942637383937836, "num_tokens": 255688004.0, "step": 2408 }, { "entropy": 0.9662802368402481, "epoch": 5.488027366020525, "grad_norm": 0.79296875, "learning_rate": 2.3061994007959086e-06, "loss": 0.0223, "mean_token_accuracy": 0.9926073998212814, "num_tokens": 255793946.0, "step": 2409 }, { "entropy": 0.969020664691925, "epoch": 5.490307867730901, "grad_norm": 0.9375, "learning_rate": 2.304321674853891e-06, "loss": 0.0274, "mean_token_accuracy": 0.9921877831220627, "num_tokens": 255900448.0, "step": 2410 }, { "entropy": 0.9674045890569687, "epoch": 5.492588369441277, "grad_norm": 1.078125, "learning_rate": 2.3024440599751132e-06, "loss": 0.0329, "mean_token_accuracy": 0.9919902086257935, "num_tokens": 256007059.0, "step": 2411 }, { "entropy": 0.9702950716018677, "epoch": 5.494868871151653, "grad_norm": 1.2890625, "learning_rate": 2.3005665572252732e-06, "loss": 0.0478, "mean_token_accuracy": 0.9865375310182571, "num_tokens": 256112930.0, "step": 2412 }, { "entropy": 0.9630885422229767, "epoch": 5.4971493728620295, "grad_norm": 1.1875, "learning_rate": 2.2986891676700042e-06, "loss": 0.0346, "mean_token_accuracy": 0.9882705509662628, "num_tokens": 256219158.0, "step": 2413 }, { "entropy": 0.966094359755516, "epoch": 5.499429874572406, "grad_norm": 0.8828125, "learning_rate": 2.296811892374878e-06, "loss": 0.0231, "mean_token_accuracy": 0.9930365681648254, "num_tokens": 256325538.0, "step": 2414 }, { "entropy": 0.9656511396169662, "epoch": 5.501710376282782, "grad_norm": 1.0078125, "learning_rate": 2.294934732405398e-06, "loss": 0.0303, "mean_token_accuracy": 0.9889591187238693, "num_tokens": 256432188.0, "step": 2415 }, { "entropy": 0.9658100605010986, "epoch": 5.503990877993158, "grad_norm": 0.99609375, "learning_rate": 2.293057688827007e-06, "loss": 0.0263, "mean_token_accuracy": 0.991086483001709, "num_tokens": 256538913.0, "step": 2416 }, { "entropy": 0.9626293629407883, "epoch": 5.506271379703534, "grad_norm": 1.21875, "learning_rate": 2.2911807627050745e-06, "loss": 0.0374, "mean_token_accuracy": 0.9877655953168869, "num_tokens": 256645545.0, "step": 2417 }, { "entropy": 0.96553173661232, "epoch": 5.508551881413911, "grad_norm": 1.328125, "learning_rate": 2.2893039551049104e-06, "loss": 0.0374, "mean_token_accuracy": 0.9889493733644485, "num_tokens": 256751963.0, "step": 2418 }, { "entropy": 0.9620585739612579, "epoch": 5.510832383124288, "grad_norm": 0.96875, "learning_rate": 2.2874272670917534e-06, "loss": 0.027, "mean_token_accuracy": 0.9918595850467682, "num_tokens": 256858637.0, "step": 2419 }, { "entropy": 0.9685219377279282, "epoch": 5.513112884834664, "grad_norm": 0.90625, "learning_rate": 2.2855506997307766e-06, "loss": 0.0233, "mean_token_accuracy": 0.9931177496910095, "num_tokens": 256965028.0, "step": 2420 }, { "epoch": 5.513112884834664, "eval_entropy": 0.9644884718688269, "eval_loss": 0.037821054458618164, "eval_mean_token_accuracy": 0.9884481459516082, "eval_num_tokens": 256965028.0, "eval_runtime": 66.0698, "eval_samples_per_second": 126.911, "eval_steps_per_second": 3.981, "step": 2420 }, { "entropy": 0.9612863957881927, "epoch": 5.51539338654504, "grad_norm": 0.9453125, "learning_rate": 2.283674254087082e-06, "loss": 0.0285, "mean_token_accuracy": 0.9899799227714539, "num_tokens": 257071142.0, "step": 2421 }, { "entropy": 0.9690279811620712, "epoch": 5.517673888255416, "grad_norm": 1.0, "learning_rate": 2.281797931225705e-06, "loss": 0.0326, "mean_token_accuracy": 0.9894107282161713, "num_tokens": 257176821.0, "step": 2422 }, { "entropy": 0.9607860893011093, "epoch": 5.519954389965792, "grad_norm": 1.09375, "learning_rate": 2.279921732211609e-06, "loss": 0.0383, "mean_token_accuracy": 0.986273929476738, "num_tokens": 257282642.0, "step": 2423 }, { "entropy": 0.9621947109699249, "epoch": 5.5222348916761685, "grad_norm": 1.046875, "learning_rate": 2.278045658109689e-06, "loss": 0.029, "mean_token_accuracy": 0.9914281368255615, "num_tokens": 257388813.0, "step": 2424 }, { "entropy": 0.9649747312068939, "epoch": 5.524515393386545, "grad_norm": 1.0625, "learning_rate": 2.2761697099847686e-06, "loss": 0.0328, "mean_token_accuracy": 0.9900173395872116, "num_tokens": 257494591.0, "step": 2425 }, { "entropy": 0.9646356105804443, "epoch": 5.526795895096921, "grad_norm": 1.09375, "learning_rate": 2.274293888901599e-06, "loss": 0.0365, "mean_token_accuracy": 0.9887258261442184, "num_tokens": 257601322.0, "step": 2426 }, { "entropy": 0.9667605608701706, "epoch": 5.529076396807298, "grad_norm": 0.97265625, "learning_rate": 2.2724181959248627e-06, "loss": 0.0305, "mean_token_accuracy": 0.9899974167346954, "num_tokens": 257707666.0, "step": 2427 }, { "entropy": 0.9662160724401474, "epoch": 5.531356898517674, "grad_norm": 1.1171875, "learning_rate": 2.270542632119163e-06, "loss": 0.0323, "mean_token_accuracy": 0.9884564131498337, "num_tokens": 257813890.0, "step": 2428 }, { "entropy": 0.9673523753881454, "epoch": 5.53363740022805, "grad_norm": 1.1796875, "learning_rate": 2.2686671985490355e-06, "loss": 0.0372, "mean_token_accuracy": 0.9879400134086609, "num_tokens": 257920422.0, "step": 2429 }, { "entropy": 0.96356600522995, "epoch": 5.535917901938427, "grad_norm": 1.078125, "learning_rate": 2.26679189627894e-06, "loss": 0.0346, "mean_token_accuracy": 0.9875753074884415, "num_tokens": 258026193.0, "step": 2430 }, { "entropy": 0.9665153324604034, "epoch": 5.538198403648803, "grad_norm": 0.9765625, "learning_rate": 2.264916726373263e-06, "loss": 0.0232, "mean_token_accuracy": 0.9929258078336716, "num_tokens": 258131621.0, "step": 2431 }, { "entropy": 0.9657359421253204, "epoch": 5.540478905359179, "grad_norm": 1.1796875, "learning_rate": 2.263041689896313e-06, "loss": 0.0364, "mean_token_accuracy": 0.9866589307785034, "num_tokens": 258237674.0, "step": 2432 }, { "entropy": 0.9703254997730255, "epoch": 5.542759407069555, "grad_norm": 0.90234375, "learning_rate": 2.261166787912325e-06, "loss": 0.0263, "mean_token_accuracy": 0.9918536990880966, "num_tokens": 258343915.0, "step": 2433 }, { "entropy": 0.9691693782806396, "epoch": 5.545039908779931, "grad_norm": 1.1640625, "learning_rate": 2.2592920214854573e-06, "loss": 0.0276, "mean_token_accuracy": 0.9915995299816132, "num_tokens": 258449871.0, "step": 2434 }, { "entropy": 0.9686582982540131, "epoch": 5.5473204104903076, "grad_norm": 1.15625, "learning_rate": 2.2574173916797912e-06, "loss": 0.0336, "mean_token_accuracy": 0.9900621175765991, "num_tokens": 258556253.0, "step": 2435 }, { "entropy": 0.9652422666549683, "epoch": 5.549600912200685, "grad_norm": 1.0703125, "learning_rate": 2.2555428995593303e-06, "loss": 0.0282, "mean_token_accuracy": 0.9890480488538742, "num_tokens": 258662504.0, "step": 2436 }, { "entropy": 0.9723929464817047, "epoch": 5.55188141391106, "grad_norm": 0.9609375, "learning_rate": 2.253668546188e-06, "loss": 0.0257, "mean_token_accuracy": 0.9917514026165009, "num_tokens": 258769134.0, "step": 2437 }, { "entropy": 0.9625257104635239, "epoch": 5.554161915621437, "grad_norm": 1.1015625, "learning_rate": 2.2517943326296487e-06, "loss": 0.0353, "mean_token_accuracy": 0.9892865717411041, "num_tokens": 258876148.0, "step": 2438 }, { "entropy": 0.9698383957147598, "epoch": 5.556442417331813, "grad_norm": 1.25, "learning_rate": 2.249920259948041e-06, "loss": 0.036, "mean_token_accuracy": 0.987659752368927, "num_tokens": 258983403.0, "step": 2439 }, { "entropy": 0.9658518135547638, "epoch": 5.558722919042189, "grad_norm": 1.046875, "learning_rate": 2.2480463292068655e-06, "loss": 0.0258, "mean_token_accuracy": 0.9910058230161667, "num_tokens": 259089389.0, "step": 2440 }, { "entropy": 0.9645768254995346, "epoch": 5.561003420752566, "grad_norm": 1.0546875, "learning_rate": 2.24617254146973e-06, "loss": 0.0316, "mean_token_accuracy": 0.9893222898244858, "num_tokens": 259195554.0, "step": 2441 }, { "entropy": 0.9648852199316025, "epoch": 5.563283922462942, "grad_norm": 1.109375, "learning_rate": 2.2442988978001594e-06, "loss": 0.0418, "mean_token_accuracy": 0.9878713488578796, "num_tokens": 259302818.0, "step": 2442 }, { "entropy": 0.9687736481428146, "epoch": 5.565564424173318, "grad_norm": 1.140625, "learning_rate": 2.2424253992615983e-06, "loss": 0.0363, "mean_token_accuracy": 0.9881009310483932, "num_tokens": 259408750.0, "step": 2443 }, { "entropy": 0.9616790860891342, "epoch": 5.567844925883694, "grad_norm": 1.2578125, "learning_rate": 2.2405520469174084e-06, "loss": 0.0375, "mean_token_accuracy": 0.990286186337471, "num_tokens": 259514913.0, "step": 2444 }, { "entropy": 0.967809721827507, "epoch": 5.57012542759407, "grad_norm": 1.3203125, "learning_rate": 2.238678841830867e-06, "loss": 0.0384, "mean_token_accuracy": 0.9878415465354919, "num_tokens": 259621029.0, "step": 2445 }, { "entropy": 0.9606745690107346, "epoch": 5.572405929304447, "grad_norm": 1.171875, "learning_rate": 2.23680578506517e-06, "loss": 0.0393, "mean_token_accuracy": 0.9896344393491745, "num_tokens": 259727209.0, "step": 2446 }, { "entropy": 0.9603323489427567, "epoch": 5.574686431014824, "grad_norm": 0.95703125, "learning_rate": 2.234932877683428e-06, "loss": 0.0288, "mean_token_accuracy": 0.9903627336025238, "num_tokens": 259833266.0, "step": 2447 }, { "entropy": 0.9657222181558609, "epoch": 5.5769669327252, "grad_norm": 0.84375, "learning_rate": 2.233060120748667e-06, "loss": 0.0193, "mean_token_accuracy": 0.9941488206386566, "num_tokens": 259939563.0, "step": 2448 }, { "entropy": 0.9637742191553116, "epoch": 5.579247434435576, "grad_norm": 1.15625, "learning_rate": 2.2311875153238296e-06, "loss": 0.0293, "mean_token_accuracy": 0.9919669181108475, "num_tokens": 260045622.0, "step": 2449 }, { "entropy": 0.963170737028122, "epoch": 5.581527936145952, "grad_norm": 0.87890625, "learning_rate": 2.229315062471768e-06, "loss": 0.0268, "mean_token_accuracy": 0.9920327663421631, "num_tokens": 260151907.0, "step": 2450 }, { "entropy": 0.969622939825058, "epoch": 5.583808437856328, "grad_norm": 1.1015625, "learning_rate": 2.2274427632552507e-06, "loss": 0.0308, "mean_token_accuracy": 0.990932509303093, "num_tokens": 260257691.0, "step": 2451 }, { "entropy": 0.9697427898645401, "epoch": 5.586088939566705, "grad_norm": 1.296875, "learning_rate": 2.2255706187369596e-06, "loss": 0.0328, "mean_token_accuracy": 0.9891589134931564, "num_tokens": 260364196.0, "step": 2452 }, { "entropy": 0.9691111594438553, "epoch": 5.588369441277081, "grad_norm": 1.03125, "learning_rate": 2.223698629979487e-06, "loss": 0.0281, "mean_token_accuracy": 0.9907284080982208, "num_tokens": 260470609.0, "step": 2453 }, { "entropy": 0.9668069183826447, "epoch": 5.590649942987457, "grad_norm": 0.97265625, "learning_rate": 2.221826798045338e-06, "loss": 0.0317, "mean_token_accuracy": 0.9910073429346085, "num_tokens": 260576479.0, "step": 2454 }, { "entropy": 0.9656313359737396, "epoch": 5.592930444697833, "grad_norm": 0.90625, "learning_rate": 2.2199551239969284e-06, "loss": 0.0293, "mean_token_accuracy": 0.9916772544384003, "num_tokens": 260682809.0, "step": 2455 }, { "entropy": 0.9710301458835602, "epoch": 5.59521094640821, "grad_norm": 1.140625, "learning_rate": 2.2180836088965833e-06, "loss": 0.0272, "mean_token_accuracy": 0.9922315180301666, "num_tokens": 260789184.0, "step": 2456 }, { "entropy": 0.9571098834276199, "epoch": 5.5974914481185865, "grad_norm": 0.859375, "learning_rate": 2.216212253806539e-06, "loss": 0.0295, "mean_token_accuracy": 0.9909218102693558, "num_tokens": 260895550.0, "step": 2457 }, { "entropy": 0.9651229530572891, "epoch": 5.599771949828963, "grad_norm": 1.078125, "learning_rate": 2.214341059788941e-06, "loss": 0.0265, "mean_token_accuracy": 0.9903301298618317, "num_tokens": 261001893.0, "step": 2458 }, { "entropy": 0.9627633392810822, "epoch": 5.602052451539339, "grad_norm": 1.1953125, "learning_rate": 2.2124700279058435e-06, "loss": 0.0346, "mean_token_accuracy": 0.9869565367698669, "num_tokens": 261108136.0, "step": 2459 }, { "entropy": 0.972589299082756, "epoch": 5.604332953249715, "grad_norm": 1.0078125, "learning_rate": 2.2105991592192063e-06, "loss": 0.0324, "mean_token_accuracy": 0.9908348172903061, "num_tokens": 261214195.0, "step": 2460 }, { "entropy": 0.963259756565094, "epoch": 5.606613454960091, "grad_norm": 0.984375, "learning_rate": 2.208728454790899e-06, "loss": 0.0293, "mean_token_accuracy": 0.9902229756116867, "num_tokens": 261320547.0, "step": 2461 }, { "entropy": 0.9653888940811157, "epoch": 5.608893956670467, "grad_norm": 1.0859375, "learning_rate": 2.2068579156826974e-06, "loss": 0.0336, "mean_token_accuracy": 0.9882569164037704, "num_tokens": 261427585.0, "step": 2462 }, { "entropy": 0.9639451205730438, "epoch": 5.611174458380844, "grad_norm": 0.96875, "learning_rate": 2.2049875429562845e-06, "loss": 0.0319, "mean_token_accuracy": 0.989973396062851, "num_tokens": 261533860.0, "step": 2463 }, { "entropy": 0.9690001010894775, "epoch": 5.61345496009122, "grad_norm": 0.88671875, "learning_rate": 2.203117337673246e-06, "loss": 0.0278, "mean_token_accuracy": 0.9916131794452667, "num_tokens": 261639986.0, "step": 2464 }, { "entropy": 0.9621605426073074, "epoch": 5.615735461801596, "grad_norm": 0.9296875, "learning_rate": 2.2012473008950756e-06, "loss": 0.0305, "mean_token_accuracy": 0.989893451333046, "num_tokens": 261746853.0, "step": 2465 }, { "entropy": 0.9635549187660217, "epoch": 5.618015963511972, "grad_norm": 1.1953125, "learning_rate": 2.1993774336831696e-06, "loss": 0.0277, "mean_token_accuracy": 0.9910480976104736, "num_tokens": 261853638.0, "step": 2466 }, { "entropy": 0.9667169898748398, "epoch": 5.620296465222349, "grad_norm": 0.890625, "learning_rate": 2.197507737098828e-06, "loss": 0.0264, "mean_token_accuracy": 0.989977166056633, "num_tokens": 261960201.0, "step": 2467 }, { "entropy": 0.9630070477724075, "epoch": 5.6225769669327255, "grad_norm": 1.1484375, "learning_rate": 2.195638212203255e-06, "loss": 0.0321, "mean_token_accuracy": 0.9899444431066513, "num_tokens": 262066049.0, "step": 2468 }, { "entropy": 0.9614637196063995, "epoch": 5.624857468643102, "grad_norm": 0.89453125, "learning_rate": 2.193768860057557e-06, "loss": 0.0272, "mean_token_accuracy": 0.9895845651626587, "num_tokens": 262172633.0, "step": 2469 }, { "entropy": 0.9654940962791443, "epoch": 5.627137970353478, "grad_norm": 1.078125, "learning_rate": 2.191899681722743e-06, "loss": 0.0316, "mean_token_accuracy": 0.9896231442689896, "num_tokens": 262278637.0, "step": 2470 }, { "entropy": 0.9646185040473938, "epoch": 5.629418472063854, "grad_norm": 0.9375, "learning_rate": 2.19003067825972e-06, "loss": 0.0272, "mean_token_accuracy": 0.9896707534790039, "num_tokens": 262384997.0, "step": 2471 }, { "entropy": 0.9616711735725403, "epoch": 5.63169897377423, "grad_norm": 0.9921875, "learning_rate": 2.1881618507293004e-06, "loss": 0.0304, "mean_token_accuracy": 0.9892481118440628, "num_tokens": 262491630.0, "step": 2472 }, { "entropy": 0.9640662521123886, "epoch": 5.633979475484606, "grad_norm": 1.0234375, "learning_rate": 2.186293200192194e-06, "loss": 0.0355, "mean_token_accuracy": 0.9893411993980408, "num_tokens": 262597958.0, "step": 2473 }, { "entropy": 0.9603606909513474, "epoch": 5.636259977194983, "grad_norm": 1.15625, "learning_rate": 2.1844247277090113e-06, "loss": 0.0391, "mean_token_accuracy": 0.9873971343040466, "num_tokens": 262704577.0, "step": 2474 }, { "entropy": 0.9659052491188049, "epoch": 5.638540478905359, "grad_norm": 1.453125, "learning_rate": 2.1825564343402606e-06, "loss": 0.0442, "mean_token_accuracy": 0.9873139262199402, "num_tokens": 262810501.0, "step": 2475 }, { "entropy": 0.9567843526601791, "epoch": 5.640820980615736, "grad_norm": 1.1953125, "learning_rate": 2.180688321146349e-06, "loss": 0.0347, "mean_token_accuracy": 0.9899375736713409, "num_tokens": 262915906.0, "step": 2476 }, { "entropy": 0.9670934677124023, "epoch": 5.643101482326112, "grad_norm": 0.765625, "learning_rate": 2.1788203891875818e-06, "loss": 0.0202, "mean_token_accuracy": 0.9940281808376312, "num_tokens": 263022196.0, "step": 2477 }, { "entropy": 0.9627981930971146, "epoch": 5.645381984036488, "grad_norm": 1.21875, "learning_rate": 2.176952639524161e-06, "loss": 0.0355, "mean_token_accuracy": 0.9888273775577545, "num_tokens": 263128560.0, "step": 2478 }, { "entropy": 0.9670226722955704, "epoch": 5.6476624857468645, "grad_norm": 0.91796875, "learning_rate": 2.175085073216185e-06, "loss": 0.0259, "mean_token_accuracy": 0.9915167540311813, "num_tokens": 263234869.0, "step": 2479 }, { "entropy": 0.967424139380455, "epoch": 5.649942987457241, "grad_norm": 0.87890625, "learning_rate": 2.173217691323649e-06, "loss": 0.027, "mean_token_accuracy": 0.9914283454418182, "num_tokens": 263341203.0, "step": 2480 }, { "entropy": 0.9662760198116302, "epoch": 5.652223489167617, "grad_norm": 1.015625, "learning_rate": 2.1713504949064433e-06, "loss": 0.0248, "mean_token_accuracy": 0.992805540561676, "num_tokens": 263448105.0, "step": 2481 }, { "entropy": 0.9682681560516357, "epoch": 5.654503990877993, "grad_norm": 1.1171875, "learning_rate": 2.169483485024351e-06, "loss": 0.029, "mean_token_accuracy": 0.9919771254062653, "num_tokens": 263554170.0, "step": 2482 }, { "entropy": 0.9680563658475876, "epoch": 5.656784492588369, "grad_norm": 1.0546875, "learning_rate": 2.167616662737052e-06, "loss": 0.0258, "mean_token_accuracy": 0.9925573617219925, "num_tokens": 263660833.0, "step": 2483 }, { "entropy": 0.9641561806201935, "epoch": 5.659064994298745, "grad_norm": 0.96875, "learning_rate": 2.1657500291041185e-06, "loss": 0.0234, "mean_token_accuracy": 0.9932416826486588, "num_tokens": 263767696.0, "step": 2484 }, { "entropy": 0.9679719507694244, "epoch": 5.661345496009122, "grad_norm": 1.0703125, "learning_rate": 2.1638835851850155e-06, "loss": 0.028, "mean_token_accuracy": 0.9902598708868027, "num_tokens": 263873770.0, "step": 2485 }, { "entropy": 0.9578444063663483, "epoch": 5.663625997719498, "grad_norm": 1.0625, "learning_rate": 2.1620173320391007e-06, "loss": 0.0288, "mean_token_accuracy": 0.9901141375303268, "num_tokens": 263980249.0, "step": 2486 }, { "entropy": 0.9595326483249664, "epoch": 5.665906499429875, "grad_norm": 1.15625, "learning_rate": 2.160151270725623e-06, "loss": 0.0408, "mean_token_accuracy": 0.9859036654233932, "num_tokens": 264086720.0, "step": 2487 }, { "entropy": 0.9683012366294861, "epoch": 5.668187001140251, "grad_norm": 1.3515625, "learning_rate": 2.158285402303723e-06, "loss": 0.0415, "mean_token_accuracy": 0.9852505922317505, "num_tokens": 264193203.0, "step": 2488 }, { "entropy": 0.9686890095472336, "epoch": 5.670467502850627, "grad_norm": 0.98828125, "learning_rate": 2.1564197278324317e-06, "loss": 0.0321, "mean_token_accuracy": 0.9902969598770142, "num_tokens": 264299784.0, "step": 2489 }, { "entropy": 0.9618648439645767, "epoch": 5.6727480045610035, "grad_norm": 0.9921875, "learning_rate": 2.1545542483706694e-06, "loss": 0.0279, "mean_token_accuracy": 0.9927697628736496, "num_tokens": 264406138.0, "step": 2490 }, { "entropy": 0.9602451920509338, "epoch": 5.67502850627138, "grad_norm": 0.95703125, "learning_rate": 2.1526889649772477e-06, "loss": 0.0268, "mean_token_accuracy": 0.9920501559972763, "num_tokens": 264512460.0, "step": 2491 }, { "entropy": 0.9650501161813736, "epoch": 5.677309007981756, "grad_norm": 0.984375, "learning_rate": 2.1508238787108633e-06, "loss": 0.0297, "mean_token_accuracy": 0.990641325712204, "num_tokens": 264618858.0, "step": 2492 }, { "entropy": 0.9688914120197296, "epoch": 5.679589509692132, "grad_norm": 0.875, "learning_rate": 2.1489589906301046e-06, "loss": 0.0284, "mean_token_accuracy": 0.9904737025499344, "num_tokens": 264724816.0, "step": 2493 }, { "entropy": 0.9663682729005814, "epoch": 5.681870011402508, "grad_norm": 1.171875, "learning_rate": 2.1470943017934455e-06, "loss": 0.0354, "mean_token_accuracy": 0.9896170347929001, "num_tokens": 264831509.0, "step": 2494 }, { "entropy": 0.9626950323581696, "epoch": 5.684150513112884, "grad_norm": 0.91015625, "learning_rate": 2.145229813259248e-06, "loss": 0.0242, "mean_token_accuracy": 0.991471067070961, "num_tokens": 264938098.0, "step": 2495 }, { "entropy": 0.9670432060956955, "epoch": 5.6864310148232615, "grad_norm": 0.98828125, "learning_rate": 2.143365526085759e-06, "loss": 0.0342, "mean_token_accuracy": 0.9902390092611313, "num_tokens": 265044116.0, "step": 2496 }, { "entropy": 0.9604650288820267, "epoch": 5.688711516533638, "grad_norm": 1.046875, "learning_rate": 2.1415014413311126e-06, "loss": 0.0357, "mean_token_accuracy": 0.9892512112855911, "num_tokens": 265150280.0, "step": 2497 }, { "entropy": 0.9601235836744308, "epoch": 5.690992018244014, "grad_norm": 1.078125, "learning_rate": 2.139637560053327e-06, "loss": 0.0375, "mean_token_accuracy": 0.9886748790740967, "num_tokens": 265257015.0, "step": 2498 }, { "entropy": 0.966834768652916, "epoch": 5.69327251995439, "grad_norm": 1.078125, "learning_rate": 2.137773883310305e-06, "loss": 0.0349, "mean_token_accuracy": 0.9902046769857407, "num_tokens": 265363373.0, "step": 2499 }, { "entropy": 0.9657060950994492, "epoch": 5.695553021664766, "grad_norm": 1.0625, "learning_rate": 2.1359104121598337e-06, "loss": 0.0276, "mean_token_accuracy": 0.9907273799180984, "num_tokens": 265469346.0, "step": 2500 }, { "entropy": 0.961906835436821, "epoch": 5.6978335233751425, "grad_norm": 1.0078125, "learning_rate": 2.1340471476595836e-06, "loss": 0.0295, "mean_token_accuracy": 0.9892087280750275, "num_tokens": 265576181.0, "step": 2501 }, { "entropy": 0.9618450850248337, "epoch": 5.700114025085519, "grad_norm": 1.125, "learning_rate": 2.1321840908671082e-06, "loss": 0.0342, "mean_token_accuracy": 0.9885698705911636, "num_tokens": 265682517.0, "step": 2502 }, { "entropy": 0.9614415317773819, "epoch": 5.702394526795895, "grad_norm": 1.015625, "learning_rate": 2.1303212428398407e-06, "loss": 0.0289, "mean_token_accuracy": 0.9907098710536957, "num_tokens": 265788393.0, "step": 2503 }, { "entropy": 0.9680274426937103, "epoch": 5.704675028506271, "grad_norm": 1.546875, "learning_rate": 2.1284586046350996e-06, "loss": 0.0381, "mean_token_accuracy": 0.9880109429359436, "num_tokens": 265894939.0, "step": 2504 }, { "entropy": 0.9668640196323395, "epoch": 5.706955530216648, "grad_norm": 1.0234375, "learning_rate": 2.126596177310081e-06, "loss": 0.029, "mean_token_accuracy": 0.991649329662323, "num_tokens": 266001471.0, "step": 2505 }, { "entropy": 0.9675723314285278, "epoch": 5.7092360319270234, "grad_norm": 1.2265625, "learning_rate": 2.124733961921864e-06, "loss": 0.0359, "mean_token_accuracy": 0.9887653291225433, "num_tokens": 266108023.0, "step": 2506 }, { "entropy": 0.9693010002374649, "epoch": 5.7115165336374005, "grad_norm": 1.265625, "learning_rate": 2.1228719595274056e-06, "loss": 0.0441, "mean_token_accuracy": 0.9873128533363342, "num_tokens": 266214146.0, "step": 2507 }, { "entropy": 0.9615640193223953, "epoch": 5.713797035347777, "grad_norm": 1.109375, "learning_rate": 2.1210101711835413e-06, "loss": 0.0327, "mean_token_accuracy": 0.9903405904769897, "num_tokens": 266320392.0, "step": 2508 }, { "entropy": 0.9620994627475739, "epoch": 5.716077537058153, "grad_norm": 0.97265625, "learning_rate": 2.1191485979469877e-06, "loss": 0.0278, "mean_token_accuracy": 0.9907161295413971, "num_tokens": 266426336.0, "step": 2509 }, { "entropy": 0.964812159538269, "epoch": 5.718358038768529, "grad_norm": 1.1171875, "learning_rate": 2.1172872408743374e-06, "loss": 0.028, "mean_token_accuracy": 0.9914517104625702, "num_tokens": 266532530.0, "step": 2510 }, { "entropy": 0.9670618325471878, "epoch": 5.720638540478905, "grad_norm": 1.5234375, "learning_rate": 2.11542610102206e-06, "loss": 0.0342, "mean_token_accuracy": 0.990879014134407, "num_tokens": 266638598.0, "step": 2511 }, { "entropy": 0.967229038476944, "epoch": 5.7229190421892815, "grad_norm": 1.1796875, "learning_rate": 2.1135651794465032e-06, "loss": 0.0319, "mean_token_accuracy": 0.9893302768468857, "num_tokens": 266745141.0, "step": 2512 }, { "entropy": 0.9661744236946106, "epoch": 5.725199543899658, "grad_norm": 1.0546875, "learning_rate": 2.1117044772038915e-06, "loss": 0.0286, "mean_token_accuracy": 0.9920381754636765, "num_tokens": 266851190.0, "step": 2513 }, { "entropy": 0.9635248631238937, "epoch": 5.727480045610034, "grad_norm": 1.09375, "learning_rate": 2.1098439953503207e-06, "loss": 0.0337, "mean_token_accuracy": 0.9883318990468979, "num_tokens": 266957519.0, "step": 2514 }, { "entropy": 0.9647699743509293, "epoch": 5.72976054732041, "grad_norm": 1.03125, "learning_rate": 2.1079837349417664e-06, "loss": 0.026, "mean_token_accuracy": 0.9925019443035126, "num_tokens": 267063343.0, "step": 2515 }, { "entropy": 0.9608018845319748, "epoch": 5.732041049030787, "grad_norm": 0.96484375, "learning_rate": 2.1061236970340756e-06, "loss": 0.0231, "mean_token_accuracy": 0.9928547739982605, "num_tokens": 267169753.0, "step": 2516 }, { "entropy": 0.9680146425962448, "epoch": 5.734321550741163, "grad_norm": 1.390625, "learning_rate": 2.104263882682971e-06, "loss": 0.0315, "mean_token_accuracy": 0.9910774528980255, "num_tokens": 267275898.0, "step": 2517 }, { "entropy": 0.9687290340662003, "epoch": 5.7366020524515395, "grad_norm": 1.046875, "learning_rate": 2.1024042929440465e-06, "loss": 0.0287, "mean_token_accuracy": 0.9896125197410583, "num_tokens": 267382114.0, "step": 2518 }, { "entropy": 0.9676609635353088, "epoch": 5.738882554161916, "grad_norm": 0.98828125, "learning_rate": 2.1005449288727696e-06, "loss": 0.026, "mean_token_accuracy": 0.9924918413162231, "num_tokens": 267488644.0, "step": 2519 }, { "entropy": 0.969560906291008, "epoch": 5.741163055872292, "grad_norm": 1.1171875, "learning_rate": 2.0986857915244787e-06, "loss": 0.0282, "mean_token_accuracy": 0.9898799359798431, "num_tokens": 267595290.0, "step": 2520 }, { "entropy": 0.9662415534257889, "epoch": 5.743443557582668, "grad_norm": 1.3046875, "learning_rate": 2.096826881954385e-06, "loss": 0.0396, "mean_token_accuracy": 0.9886318147182465, "num_tokens": 267701789.0, "step": 2521 }, { "entropy": 0.969692662358284, "epoch": 5.745724059293044, "grad_norm": 1.2890625, "learning_rate": 2.0949682012175693e-06, "loss": 0.0334, "mean_token_accuracy": 0.9895701259374619, "num_tokens": 267808391.0, "step": 2522 }, { "entropy": 0.9658484756946564, "epoch": 5.7480045610034205, "grad_norm": 1.203125, "learning_rate": 2.093109750368983e-06, "loss": 0.0357, "mean_token_accuracy": 0.9893975406885147, "num_tokens": 267915104.0, "step": 2523 }, { "entropy": 0.9688145518302917, "epoch": 5.750285062713797, "grad_norm": 0.890625, "learning_rate": 2.0912515304634485e-06, "loss": 0.0274, "mean_token_accuracy": 0.9913087785243988, "num_tokens": 268021429.0, "step": 2524 }, { "entropy": 0.9625142961740494, "epoch": 5.752565564424174, "grad_norm": 1.1640625, "learning_rate": 2.089393542555653e-06, "loss": 0.033, "mean_token_accuracy": 0.9894680380821228, "num_tokens": 268127392.0, "step": 2525 }, { "entropy": 0.9708652794361115, "epoch": 5.75484606613455, "grad_norm": 1.078125, "learning_rate": 2.0875357877001556e-06, "loss": 0.0269, "mean_token_accuracy": 0.9906363040208817, "num_tokens": 268233390.0, "step": 2526 }, { "entropy": 0.9671672433614731, "epoch": 5.757126567844926, "grad_norm": 0.8828125, "learning_rate": 2.085678266951382e-06, "loss": 0.0204, "mean_token_accuracy": 0.9928423464298248, "num_tokens": 268339477.0, "step": 2527 }, { "entropy": 0.9668699651956558, "epoch": 5.759407069555302, "grad_norm": 1.3203125, "learning_rate": 2.083820981363626e-06, "loss": 0.0388, "mean_token_accuracy": 0.9860024154186249, "num_tokens": 268445880.0, "step": 2528 }, { "entropy": 0.9667616486549377, "epoch": 5.7616875712656785, "grad_norm": 1.296875, "learning_rate": 2.0819639319910466e-06, "loss": 0.0313, "mean_token_accuracy": 0.98829685151577, "num_tokens": 268552121.0, "step": 2529 }, { "entropy": 0.9629393368959427, "epoch": 5.763968072976055, "grad_norm": 0.953125, "learning_rate": 2.0801071198876684e-06, "loss": 0.0283, "mean_token_accuracy": 0.9896856844425201, "num_tokens": 268658012.0, "step": 2530 }, { "entropy": 0.9654719531536102, "epoch": 5.766248574686431, "grad_norm": 1.0546875, "learning_rate": 2.0782505461073822e-06, "loss": 0.0386, "mean_token_accuracy": 0.9876517951488495, "num_tokens": 268763731.0, "step": 2531 }, { "entropy": 0.9626254588365555, "epoch": 5.768529076396807, "grad_norm": 0.984375, "learning_rate": 2.076394211703944e-06, "loss": 0.0283, "mean_token_accuracy": 0.9914027899503708, "num_tokens": 268870056.0, "step": 2532 }, { "entropy": 0.9667053073644638, "epoch": 5.770809578107183, "grad_norm": 1.0078125, "learning_rate": 2.0745381177309732e-06, "loss": 0.0259, "mean_token_accuracy": 0.9925691932439804, "num_tokens": 268976488.0, "step": 2533 }, { "entropy": 0.96499103307724, "epoch": 5.7730900798175595, "grad_norm": 0.98828125, "learning_rate": 2.072682265241954e-06, "loss": 0.0284, "mean_token_accuracy": 0.9910237193107605, "num_tokens": 269082754.0, "step": 2534 }, { "entropy": 0.9715036898851395, "epoch": 5.775370581527936, "grad_norm": 1.3046875, "learning_rate": 2.0708266552902303e-06, "loss": 0.0385, "mean_token_accuracy": 0.9876284152269363, "num_tokens": 269188871.0, "step": 2535 }, { "entropy": 0.9613078981637955, "epoch": 5.777651083238313, "grad_norm": 0.97265625, "learning_rate": 2.0689712889290114e-06, "loss": 0.0236, "mean_token_accuracy": 0.990517184138298, "num_tokens": 269295327.0, "step": 2536 }, { "entropy": 0.9678123742341995, "epoch": 5.779931584948689, "grad_norm": 1.109375, "learning_rate": 2.0671161672113677e-06, "loss": 0.0384, "mean_token_accuracy": 0.990313783288002, "num_tokens": 269401775.0, "step": 2537 }, { "entropy": 0.9621356427669525, "epoch": 5.782212086659065, "grad_norm": 1.2421875, "learning_rate": 2.06526129119023e-06, "loss": 0.0366, "mean_token_accuracy": 0.9871158003807068, "num_tokens": 269508108.0, "step": 2538 }, { "entropy": 0.9630264639854431, "epoch": 5.784492588369441, "grad_norm": 1.2109375, "learning_rate": 2.063406661918391e-06, "loss": 0.0357, "mean_token_accuracy": 0.9881645441055298, "num_tokens": 269614659.0, "step": 2539 }, { "entropy": 0.9692083597183228, "epoch": 5.7867730900798175, "grad_norm": 1.0625, "learning_rate": 2.0615522804485027e-06, "loss": 0.0297, "mean_token_accuracy": 0.9901022166013718, "num_tokens": 269720715.0, "step": 2540 }, { "entropy": 0.9655407071113586, "epoch": 5.789053591790194, "grad_norm": 1.015625, "learning_rate": 2.059698147833075e-06, "loss": 0.0367, "mean_token_accuracy": 0.9877977669239044, "num_tokens": 269827273.0, "step": 2541 }, { "entropy": 0.971122220158577, "epoch": 5.79133409350057, "grad_norm": 1.0390625, "learning_rate": 2.0578442651244774e-06, "loss": 0.031, "mean_token_accuracy": 0.9903147369623184, "num_tokens": 269933528.0, "step": 2542 }, { "entropy": 0.9698529243469238, "epoch": 5.793614595210946, "grad_norm": 0.8515625, "learning_rate": 2.0559906333749392e-06, "loss": 0.0213, "mean_token_accuracy": 0.9933410882949829, "num_tokens": 270040122.0, "step": 2543 }, { "entropy": 0.964664876461029, "epoch": 5.795895096921322, "grad_norm": 0.9296875, "learning_rate": 2.054137253636545e-06, "loss": 0.0303, "mean_token_accuracy": 0.9908774793148041, "num_tokens": 270147312.0, "step": 2544 }, { "entropy": 0.9669168889522552, "epoch": 5.798175598631699, "grad_norm": 1.296875, "learning_rate": 2.0522841269612397e-06, "loss": 0.0393, "mean_token_accuracy": 0.9890192449092865, "num_tokens": 270254045.0, "step": 2545 }, { "entropy": 0.9585468918085098, "epoch": 5.800456100342076, "grad_norm": 1.03125, "learning_rate": 2.0504312544008193e-06, "loss": 0.0286, "mean_token_accuracy": 0.9916414171457291, "num_tokens": 270359908.0, "step": 2546 }, { "entropy": 0.9667042046785355, "epoch": 5.802736602052452, "grad_norm": 1.34375, "learning_rate": 2.048578637006939e-06, "loss": 0.0337, "mean_token_accuracy": 0.9891840368509293, "num_tokens": 270466297.0, "step": 2547 }, { "entropy": 0.9605764895677567, "epoch": 5.805017103762828, "grad_norm": 1.1796875, "learning_rate": 2.04672627583111e-06, "loss": 0.0313, "mean_token_accuracy": 0.989999070763588, "num_tokens": 270572638.0, "step": 2548 }, { "entropy": 0.9638091176748276, "epoch": 5.807297605473204, "grad_norm": 1.0859375, "learning_rate": 2.0448741719246962e-06, "loss": 0.0295, "mean_token_accuracy": 0.9914338439702988, "num_tokens": 270678582.0, "step": 2549 }, { "entropy": 0.961869016289711, "epoch": 5.80957810718358, "grad_norm": 1.453125, "learning_rate": 2.043022326338916e-06, "loss": 0.0399, "mean_token_accuracy": 0.9878752529621124, "num_tokens": 270785005.0, "step": 2550 }, { "entropy": 0.9620109796524048, "epoch": 5.811858608893957, "grad_norm": 0.984375, "learning_rate": 2.0411707401248406e-06, "loss": 0.0375, "mean_token_accuracy": 0.9865156710147858, "num_tokens": 270891728.0, "step": 2551 }, { "entropy": 0.970068171620369, "epoch": 5.814139110604333, "grad_norm": 1.296875, "learning_rate": 2.0393194143333956e-06, "loss": 0.0361, "mean_token_accuracy": 0.989677295088768, "num_tokens": 270998179.0, "step": 2552 }, { "entropy": 0.967042550444603, "epoch": 5.816419612314709, "grad_norm": 1.0859375, "learning_rate": 2.0374683500153564e-06, "loss": 0.0306, "mean_token_accuracy": 0.9899038225412369, "num_tokens": 271104688.0, "step": 2553 }, { "entropy": 0.9660926163196564, "epoch": 5.818700114025085, "grad_norm": 0.9296875, "learning_rate": 2.0356175482213523e-06, "loss": 0.0207, "mean_token_accuracy": 0.9937026649713516, "num_tokens": 271210878.0, "step": 2554 }, { "entropy": 0.9646914154291153, "epoch": 5.820980615735461, "grad_norm": 1.265625, "learning_rate": 2.033767010001863e-06, "loss": 0.0365, "mean_token_accuracy": 0.9872609078884125, "num_tokens": 271316336.0, "step": 2555 }, { "entropy": 0.9700614511966705, "epoch": 5.823261117445838, "grad_norm": 1.203125, "learning_rate": 2.0319167364072184e-06, "loss": 0.0328, "mean_token_accuracy": 0.9892990589141846, "num_tokens": 271423017.0, "step": 2556 }, { "entropy": 0.9635125398635864, "epoch": 5.825541619156215, "grad_norm": 1.4453125, "learning_rate": 2.0300667284875965e-06, "loss": 0.032, "mean_token_accuracy": 0.9894865304231644, "num_tokens": 271529948.0, "step": 2557 }, { "entropy": 0.9673675000667572, "epoch": 5.827822120866591, "grad_norm": 1.1875, "learning_rate": 2.0282169872930275e-06, "loss": 0.0347, "mean_token_accuracy": 0.9917401075363159, "num_tokens": 271636884.0, "step": 2558 }, { "entropy": 0.9642207622528076, "epoch": 5.830102622576967, "grad_norm": 1.0, "learning_rate": 2.026367513873388e-06, "loss": 0.0316, "mean_token_accuracy": 0.9910280108451843, "num_tokens": 271743683.0, "step": 2559 }, { "entropy": 0.964018777012825, "epoch": 5.832383124287343, "grad_norm": 1.2578125, "learning_rate": 2.0245183092784046e-06, "loss": 0.0332, "mean_token_accuracy": 0.9886831939220428, "num_tokens": 271849476.0, "step": 2560 }, { "entropy": 0.9535078257322311, "epoch": 5.834663625997719, "grad_norm": 1.21875, "learning_rate": 2.0226693745576494e-06, "loss": 0.0338, "mean_token_accuracy": 0.9903120994567871, "num_tokens": 271956349.0, "step": 2561 }, { "entropy": 0.9616459310054779, "epoch": 5.836944127708096, "grad_norm": 1.1484375, "learning_rate": 2.020820710760541e-06, "loss": 0.031, "mean_token_accuracy": 0.9901913553476334, "num_tokens": 272063371.0, "step": 2562 }, { "entropy": 0.9646886140108109, "epoch": 5.839224629418472, "grad_norm": 1.0625, "learning_rate": 2.018972318936347e-06, "loss": 0.0351, "mean_token_accuracy": 0.9867581129074097, "num_tokens": 272169355.0, "step": 2563 }, { "entropy": 0.9636795669794083, "epoch": 5.841505131128848, "grad_norm": 1.0546875, "learning_rate": 2.017124200134178e-06, "loss": 0.033, "mean_token_accuracy": 0.9918037205934525, "num_tokens": 272275844.0, "step": 2564 }, { "entropy": 0.9686161577701569, "epoch": 5.843785632839225, "grad_norm": 1.0390625, "learning_rate": 2.01527635540299e-06, "loss": 0.034, "mean_token_accuracy": 0.9898652583360672, "num_tokens": 272382394.0, "step": 2565 }, { "entropy": 0.9592109322547913, "epoch": 5.846066134549601, "grad_norm": 1.1640625, "learning_rate": 2.0134287857915864e-06, "loss": 0.0385, "mean_token_accuracy": 0.9883390069007874, "num_tokens": 272489457.0, "step": 2566 }, { "entropy": 0.9664392173290253, "epoch": 5.848346636259977, "grad_norm": 1.1015625, "learning_rate": 2.0115814923486093e-06, "loss": 0.0334, "mean_token_accuracy": 0.9890879839658737, "num_tokens": 272595681.0, "step": 2567 }, { "entropy": 0.9648758471012115, "epoch": 5.850627137970354, "grad_norm": 0.8828125, "learning_rate": 2.009734476122547e-06, "loss": 0.0225, "mean_token_accuracy": 0.9903213977813721, "num_tokens": 272702127.0, "step": 2568 }, { "entropy": 0.9684413224458694, "epoch": 5.85290763968073, "grad_norm": 0.9609375, "learning_rate": 2.007887738161732e-06, "loss": 0.0268, "mean_token_accuracy": 0.991629496216774, "num_tokens": 272808816.0, "step": 2569 }, { "entropy": 0.9690776169300079, "epoch": 5.855188141391106, "grad_norm": 1.3671875, "learning_rate": 2.006041279514336e-06, "loss": 0.0406, "mean_token_accuracy": 0.9865544736385345, "num_tokens": 272914768.0, "step": 2570 }, { "entropy": 0.9643846601247787, "epoch": 5.857468643101482, "grad_norm": 1.0546875, "learning_rate": 2.004195101228374e-06, "loss": 0.024, "mean_token_accuracy": 0.9923728406429291, "num_tokens": 273021530.0, "step": 2571 }, { "entropy": 0.9641321003437042, "epoch": 5.859749144811858, "grad_norm": 1.0, "learning_rate": 2.002349204351701e-06, "loss": 0.0302, "mean_token_accuracy": 0.9900417923927307, "num_tokens": 273127316.0, "step": 2572 }, { "entropy": 0.9634136408567429, "epoch": 5.862029646522235, "grad_norm": 0.921875, "learning_rate": 2.0005035899320115e-06, "loss": 0.0253, "mean_token_accuracy": 0.9939804524183273, "num_tokens": 273233755.0, "step": 2573 }, { "entropy": 0.9693943858146667, "epoch": 5.864310148232612, "grad_norm": 0.890625, "learning_rate": 1.998658259016841e-06, "loss": 0.0262, "mean_token_accuracy": 0.991637110710144, "num_tokens": 273339820.0, "step": 2574 }, { "entropy": 0.9656626731157303, "epoch": 5.866590649942988, "grad_norm": 1.375, "learning_rate": 1.996813212653564e-06, "loss": 0.0477, "mean_token_accuracy": 0.9859059303998947, "num_tokens": 273446222.0, "step": 2575 }, { "entropy": 0.9692315459251404, "epoch": 5.868871151653364, "grad_norm": 1.1953125, "learning_rate": 1.9949684518893926e-06, "loss": 0.0369, "mean_token_accuracy": 0.9885121732950211, "num_tokens": 273552327.0, "step": 2576 }, { "entropy": 0.9670549780130386, "epoch": 5.87115165336374, "grad_norm": 0.87109375, "learning_rate": 1.9931239777713794e-06, "loss": 0.0228, "mean_token_accuracy": 0.9936285465955734, "num_tokens": 273658776.0, "step": 2577 }, { "entropy": 0.9614523947238922, "epoch": 5.873432155074116, "grad_norm": 1.140625, "learning_rate": 1.9912797913464098e-06, "loss": 0.0377, "mean_token_accuracy": 0.9882145375013351, "num_tokens": 273765175.0, "step": 2578 }, { "entropy": 0.9608191102743149, "epoch": 5.875712656784493, "grad_norm": 1.453125, "learning_rate": 1.989435893661209e-06, "loss": 0.0349, "mean_token_accuracy": 0.9886292666196823, "num_tokens": 273871471.0, "step": 2579 }, { "entropy": 0.9610506594181061, "epoch": 5.877993158494869, "grad_norm": 1.1171875, "learning_rate": 1.9875922857623387e-06, "loss": 0.0269, "mean_token_accuracy": 0.9907281547784805, "num_tokens": 273977500.0, "step": 2580 }, { "entropy": 0.9600795209407806, "epoch": 5.880273660205245, "grad_norm": 1.0546875, "learning_rate": 1.985748968696194e-06, "loss": 0.0316, "mean_token_accuracy": 0.9889247566461563, "num_tokens": 274083228.0, "step": 2581 }, { "entropy": 0.9656701236963272, "epoch": 5.882554161915621, "grad_norm": 1.0859375, "learning_rate": 1.9839059435090073e-06, "loss": 0.0296, "mean_token_accuracy": 0.9906674772500992, "num_tokens": 274189086.0, "step": 2582 }, { "entropy": 0.9553936719894409, "epoch": 5.884834663625997, "grad_norm": 1.0078125, "learning_rate": 1.9820632112468437e-06, "loss": 0.0281, "mean_token_accuracy": 0.9913211613893509, "num_tokens": 274295727.0, "step": 2583 }, { "entropy": 0.9636656194925308, "epoch": 5.887115165336374, "grad_norm": 1.0390625, "learning_rate": 1.9802207729556023e-06, "loss": 0.0278, "mean_token_accuracy": 0.9899256676435471, "num_tokens": 274401893.0, "step": 2584 }, { "entropy": 0.9648701101541519, "epoch": 5.889395667046751, "grad_norm": 1.0, "learning_rate": 1.9783786296810148e-06, "loss": 0.0325, "mean_token_accuracy": 0.9911725372076035, "num_tokens": 274508665.0, "step": 2585 }, { "entropy": 0.9640392512083054, "epoch": 5.891676168757127, "grad_norm": 0.91796875, "learning_rate": 1.9765367824686467e-06, "loss": 0.0235, "mean_token_accuracy": 0.992053434252739, "num_tokens": 274614448.0, "step": 2586 }, { "entropy": 0.9677613973617554, "epoch": 5.893956670467503, "grad_norm": 0.9921875, "learning_rate": 1.974695232363895e-06, "loss": 0.0226, "mean_token_accuracy": 0.9914045035839081, "num_tokens": 274720660.0, "step": 2587 }, { "entropy": 0.9656088650226593, "epoch": 5.896237172177879, "grad_norm": 1.265625, "learning_rate": 1.9728539804119893e-06, "loss": 0.0449, "mean_token_accuracy": 0.9876495003700256, "num_tokens": 274826995.0, "step": 2588 }, { "entropy": 0.9638325721025467, "epoch": 5.898517673888255, "grad_norm": 1.0859375, "learning_rate": 1.9710130276579864e-06, "loss": 0.0232, "mean_token_accuracy": 0.9925849735736847, "num_tokens": 274932704.0, "step": 2589 }, { "entropy": 0.9661104679107666, "epoch": 5.900798175598632, "grad_norm": 1.125, "learning_rate": 1.969172375146776e-06, "loss": 0.0256, "mean_token_accuracy": 0.992829754948616, "num_tokens": 275039174.0, "step": 2590 }, { "entropy": 0.9623557925224304, "epoch": 5.903078677309008, "grad_norm": 1.4140625, "learning_rate": 1.9673320239230783e-06, "loss": 0.0425, "mean_token_accuracy": 0.9880584180355072, "num_tokens": 275145446.0, "step": 2591 }, { "entropy": 0.9594337940216064, "epoch": 5.905359179019384, "grad_norm": 0.9453125, "learning_rate": 1.9654919750314396e-06, "loss": 0.0239, "mean_token_accuracy": 0.9927521795034409, "num_tokens": 275251469.0, "step": 2592 }, { "entropy": 0.9592840224504471, "epoch": 5.90763968072976, "grad_norm": 1.234375, "learning_rate": 1.9636522295162375e-06, "loss": 0.0373, "mean_token_accuracy": 0.9889799505472183, "num_tokens": 275358110.0, "step": 2593 }, { "entropy": 0.9644607305526733, "epoch": 5.909920182440137, "grad_norm": 0.98828125, "learning_rate": 1.9618127884216753e-06, "loss": 0.0329, "mean_token_accuracy": 0.9908797144889832, "num_tokens": 275464396.0, "step": 2594 }, { "entropy": 0.9684407263994217, "epoch": 5.9122006841505135, "grad_norm": 1.078125, "learning_rate": 1.959973652791784e-06, "loss": 0.0329, "mean_token_accuracy": 0.9895510226488113, "num_tokens": 275571387.0, "step": 2595 }, { "entropy": 0.9625894725322723, "epoch": 5.91448118586089, "grad_norm": 1.234375, "learning_rate": 1.9581348236704217e-06, "loss": 0.0351, "mean_token_accuracy": 0.9897045493125916, "num_tokens": 275677780.0, "step": 2596 }, { "entropy": 0.9609717130661011, "epoch": 5.916761687571266, "grad_norm": 1.1796875, "learning_rate": 1.9562963021012723e-06, "loss": 0.0361, "mean_token_accuracy": 0.9867963790893555, "num_tokens": 275784173.0, "step": 2597 }, { "entropy": 0.9622538536787033, "epoch": 5.919042189281642, "grad_norm": 0.85546875, "learning_rate": 1.954458089127845e-06, "loss": 0.0226, "mean_token_accuracy": 0.993469163775444, "num_tokens": 275890669.0, "step": 2598 }, { "entropy": 0.9689373821020126, "epoch": 5.921322690992018, "grad_norm": 1.125, "learning_rate": 1.952620185793475e-06, "loss": 0.0363, "mean_token_accuracy": 0.9884637892246246, "num_tokens": 275996463.0, "step": 2599 }, { "entropy": 0.9671191871166229, "epoch": 5.923603192702394, "grad_norm": 0.890625, "learning_rate": 1.9507825931413193e-06, "loss": 0.0266, "mean_token_accuracy": 0.992078423500061, "num_tokens": 276102646.0, "step": 2600 }, { "entropy": 0.9611636102199554, "epoch": 5.925883694412771, "grad_norm": 0.8984375, "learning_rate": 1.9489453122143605e-06, "loss": 0.0248, "mean_token_accuracy": 0.9917316287755966, "num_tokens": 276208544.0, "step": 2601 }, { "entropy": 0.9636888951063156, "epoch": 5.928164196123147, "grad_norm": 0.9375, "learning_rate": 1.947108344055404e-06, "loss": 0.0233, "mean_token_accuracy": 0.9913294762372971, "num_tokens": 276314748.0, "step": 2602 }, { "entropy": 0.9625857025384903, "epoch": 5.930444697833523, "grad_norm": 1.0546875, "learning_rate": 1.9452716897070785e-06, "loss": 0.031, "mean_token_accuracy": 0.9893152713775635, "num_tokens": 276421031.0, "step": 2603 }, { "entropy": 0.9621812999248505, "epoch": 5.932725199543899, "grad_norm": 1.1328125, "learning_rate": 1.943435350211832e-06, "loss": 0.0322, "mean_token_accuracy": 0.9899448752403259, "num_tokens": 276527206.0, "step": 2604 }, { "entropy": 0.9640472680330276, "epoch": 5.935005701254276, "grad_norm": 0.80859375, "learning_rate": 1.941599326611935e-06, "loss": 0.0307, "mean_token_accuracy": 0.9893184006214142, "num_tokens": 276633003.0, "step": 2605 }, { "entropy": 0.9645978510379791, "epoch": 5.9372862029646525, "grad_norm": 1.2421875, "learning_rate": 1.939763619949481e-06, "loss": 0.033, "mean_token_accuracy": 0.9885160028934479, "num_tokens": 276739421.0, "step": 2606 }, { "entropy": 0.9604286104440689, "epoch": 5.939566704675029, "grad_norm": 1.0703125, "learning_rate": 1.9379282312663797e-06, "loss": 0.0355, "mean_token_accuracy": 0.9883994907140732, "num_tokens": 276845549.0, "step": 2607 }, { "entropy": 0.9636229872703552, "epoch": 5.941847206385405, "grad_norm": 0.8984375, "learning_rate": 1.936093161604363e-06, "loss": 0.0235, "mean_token_accuracy": 0.9924397021532059, "num_tokens": 276951831.0, "step": 2608 }, { "entropy": 0.966996818780899, "epoch": 5.944127708095781, "grad_norm": 1.4765625, "learning_rate": 1.9342584120049824e-06, "loss": 0.0482, "mean_token_accuracy": 0.9868300259113312, "num_tokens": 277058063.0, "step": 2609 }, { "entropy": 0.9599764198064804, "epoch": 5.946408209806157, "grad_norm": 1.5078125, "learning_rate": 1.9324239835096044e-06, "loss": 0.0396, "mean_token_accuracy": 0.9886357188224792, "num_tokens": 277164690.0, "step": 2610 }, { "entropy": 0.9680380374193192, "epoch": 5.9486887115165334, "grad_norm": 1.078125, "learning_rate": 1.930589877159415e-06, "loss": 0.0388, "mean_token_accuracy": 0.9868640601634979, "num_tokens": 277271379.0, "step": 2611 }, { "entropy": 0.9587440937757492, "epoch": 5.95096921322691, "grad_norm": 1.203125, "learning_rate": 1.928756093995419e-06, "loss": 0.0425, "mean_token_accuracy": 0.9858490228652954, "num_tokens": 277377645.0, "step": 2612 }, { "entropy": 0.963230699300766, "epoch": 5.953249714937286, "grad_norm": 1.1796875, "learning_rate": 1.9269226350584357e-06, "loss": 0.0351, "mean_token_accuracy": 0.9875157922506332, "num_tokens": 277483735.0, "step": 2613 }, { "entropy": 0.9669713079929352, "epoch": 5.955530216647663, "grad_norm": 1.03125, "learning_rate": 1.9250895013891015e-06, "loss": 0.0249, "mean_token_accuracy": 0.9920880049467087, "num_tokens": 277590061.0, "step": 2614 }, { "entropy": 0.9601591527462006, "epoch": 5.957810718358039, "grad_norm": 1.265625, "learning_rate": 1.9232566940278675e-06, "loss": 0.0341, "mean_token_accuracy": 0.9895024597644806, "num_tokens": 277696372.0, "step": 2615 }, { "entropy": 0.9639184772968292, "epoch": 5.960091220068415, "grad_norm": 1.0390625, "learning_rate": 1.9214242140149987e-06, "loss": 0.0271, "mean_token_accuracy": 0.9930510520935059, "num_tokens": 277802710.0, "step": 2616 }, { "entropy": 0.9628991633653641, "epoch": 5.9623717217787915, "grad_norm": 1.234375, "learning_rate": 1.9195920623905766e-06, "loss": 0.04, "mean_token_accuracy": 0.9900447279214859, "num_tokens": 277908704.0, "step": 2617 }, { "entropy": 0.9631651490926743, "epoch": 5.964652223489168, "grad_norm": 1.015625, "learning_rate": 1.9177602401944943e-06, "loss": 0.0325, "mean_token_accuracy": 0.9923735558986664, "num_tokens": 278014189.0, "step": 2618 }, { "entropy": 0.9723061919212341, "epoch": 5.966932725199544, "grad_norm": 1.2109375, "learning_rate": 1.915928748466459e-06, "loss": 0.0351, "mean_token_accuracy": 0.9894900470972061, "num_tokens": 278120961.0, "step": 2619 }, { "entropy": 0.9645763635635376, "epoch": 5.96921322690992, "grad_norm": 1.1875, "learning_rate": 1.9140975882459912e-06, "loss": 0.0307, "mean_token_accuracy": 0.9910223335027695, "num_tokens": 278227175.0, "step": 2620 }, { "entropy": 0.9636652320623398, "epoch": 5.971493728620296, "grad_norm": 1.015625, "learning_rate": 1.9122667605724202e-06, "loss": 0.0305, "mean_token_accuracy": 0.9877048134803772, "num_tokens": 278333134.0, "step": 2621 }, { "entropy": 0.9610069394111633, "epoch": 5.9737742303306725, "grad_norm": 1.2265625, "learning_rate": 1.910436266484889e-06, "loss": 0.0359, "mean_token_accuracy": 0.9893794357776642, "num_tokens": 278439297.0, "step": 2622 }, { "entropy": 0.9624828696250916, "epoch": 5.976054732041049, "grad_norm": 0.95703125, "learning_rate": 1.908606107022351e-06, "loss": 0.0236, "mean_token_accuracy": 0.9930557906627655, "num_tokens": 278546036.0, "step": 2623 }, { "entropy": 0.9616840332746506, "epoch": 5.978335233751425, "grad_norm": 1.359375, "learning_rate": 1.9067762832235698e-06, "loss": 0.0342, "mean_token_accuracy": 0.9902885407209396, "num_tokens": 278652693.0, "step": 2624 }, { "entropy": 0.9732004404067993, "epoch": 5.980615735461802, "grad_norm": 1.1640625, "learning_rate": 1.9049467961271184e-06, "loss": 0.0261, "mean_token_accuracy": 0.9899483770132065, "num_tokens": 278758932.0, "step": 2625 }, { "entropy": 0.957132562994957, "epoch": 5.982896237172178, "grad_norm": 0.90625, "learning_rate": 1.9031176467713763e-06, "loss": 0.0268, "mean_token_accuracy": 0.9906071126461029, "num_tokens": 278865665.0, "step": 2626 }, { "entropy": 0.9614012837409973, "epoch": 5.985176738882554, "grad_norm": 1.0, "learning_rate": 1.9012888361945354e-06, "loss": 0.0293, "mean_token_accuracy": 0.993062362074852, "num_tokens": 278972177.0, "step": 2627 }, { "entropy": 0.962404191493988, "epoch": 5.9874572405929305, "grad_norm": 0.85546875, "learning_rate": 1.8994603654345917e-06, "loss": 0.0245, "mean_token_accuracy": 0.991689145565033, "num_tokens": 279078183.0, "step": 2628 }, { "entropy": 0.9660652577877045, "epoch": 5.989737742303307, "grad_norm": 0.94921875, "learning_rate": 1.897632235529351e-06, "loss": 0.0219, "mean_token_accuracy": 0.9922740608453751, "num_tokens": 279184251.0, "step": 2629 }, { "entropy": 0.9651109427213669, "epoch": 5.992018244013683, "grad_norm": 1.0234375, "learning_rate": 1.8958044475164242e-06, "loss": 0.0342, "mean_token_accuracy": 0.9886815994977951, "num_tokens": 279290853.0, "step": 2630 }, { "entropy": 0.9635764062404633, "epoch": 5.994298745724059, "grad_norm": 0.99609375, "learning_rate": 1.8939770024332294e-06, "loss": 0.0278, "mean_token_accuracy": 0.9922763705253601, "num_tokens": 279396926.0, "step": 2631 }, { "entropy": 0.9657981693744659, "epoch": 5.996579247434435, "grad_norm": 1.109375, "learning_rate": 1.8921499013169876e-06, "loss": 0.0386, "mean_token_accuracy": 0.9885959178209305, "num_tokens": 279503132.0, "step": 2632 }, { "entropy": 0.9654140025377274, "epoch": 5.9988597491448115, "grad_norm": 0.9375, "learning_rate": 1.8903231452047265e-06, "loss": 0.0242, "mean_token_accuracy": 0.9929667860269547, "num_tokens": 279609231.0, "step": 2633 }, { "entropy": 0.9622074365615845, "epoch": 6.0, "grad_norm": 1.8828125, "learning_rate": 1.8884967351332778e-06, "loss": 0.0347, "mean_token_accuracy": 0.9882912635803223, "num_tokens": 279647856.0, "step": 2634 }, { "entropy": 0.9569820612668991, "epoch": 6.002280501710376, "grad_norm": 1.203125, "learning_rate": 1.886670672139277e-06, "loss": 0.032, "mean_token_accuracy": 0.9902430772781372, "num_tokens": 279754361.0, "step": 2635 }, { "entropy": 0.9665438532829285, "epoch": 6.004561003420752, "grad_norm": 0.8125, "learning_rate": 1.884844957259163e-06, "loss": 0.0244, "mean_token_accuracy": 0.9933710396289825, "num_tokens": 279860623.0, "step": 2636 }, { "entropy": 0.9654576629400253, "epoch": 6.006841505131129, "grad_norm": 1.125, "learning_rate": 1.8830195915291741e-06, "loss": 0.0353, "mean_token_accuracy": 0.989177480340004, "num_tokens": 279966857.0, "step": 2637 }, { "entropy": 0.9612531661987305, "epoch": 6.009122006841505, "grad_norm": 1.1484375, "learning_rate": 1.8811945759853543e-06, "loss": 0.0352, "mean_token_accuracy": 0.9909251779317856, "num_tokens": 280073080.0, "step": 2638 }, { "entropy": 0.966259241104126, "epoch": 6.011402508551882, "grad_norm": 1.1484375, "learning_rate": 1.879369911663546e-06, "loss": 0.0348, "mean_token_accuracy": 0.9882449954748154, "num_tokens": 280179363.0, "step": 2639 }, { "entropy": 0.960009977221489, "epoch": 6.013683010262258, "grad_norm": 0.95703125, "learning_rate": 1.8775455995993941e-06, "loss": 0.0331, "mean_token_accuracy": 0.9901233017444611, "num_tokens": 280285685.0, "step": 2640 }, { "epoch": 6.013683010262258, "eval_entropy": 0.9626933052965897, "eval_loss": 0.03770119696855545, "eval_mean_token_accuracy": 0.9884859355230295, "eval_num_tokens": 280285685.0, "eval_runtime": 66.0431, "eval_samples_per_second": 126.963, "eval_steps_per_second": 3.982, "step": 2640 }, { "entropy": 0.9572997242212296, "epoch": 6.015963511972634, "grad_norm": 0.91796875, "learning_rate": 1.875721640828344e-06, "loss": 0.0262, "mean_token_accuracy": 0.9907844066619873, "num_tokens": 280391631.0, "step": 2641 }, { "entropy": 0.9577366858720779, "epoch": 6.01824401368301, "grad_norm": 1.1875, "learning_rate": 1.8738980363856376e-06, "loss": 0.0375, "mean_token_accuracy": 0.9877007603645325, "num_tokens": 280497563.0, "step": 2642 }, { "entropy": 0.9608228504657745, "epoch": 6.020524515393387, "grad_norm": 1.0703125, "learning_rate": 1.8720747873063184e-06, "loss": 0.0341, "mean_token_accuracy": 0.989451989531517, "num_tokens": 280603921.0, "step": 2643 }, { "entropy": 0.9617141038179398, "epoch": 6.022805017103763, "grad_norm": 1.15625, "learning_rate": 1.870251894625227e-06, "loss": 0.0305, "mean_token_accuracy": 0.9895782470703125, "num_tokens": 280710078.0, "step": 2644 }, { "entropy": 0.9612976908683777, "epoch": 6.025085518814139, "grad_norm": 1.109375, "learning_rate": 1.8684293593770026e-06, "loss": 0.0291, "mean_token_accuracy": 0.9896937012672424, "num_tokens": 280815846.0, "step": 2645 }, { "entropy": 0.9618713408708572, "epoch": 6.027366020524515, "grad_norm": 1.03125, "learning_rate": 1.866607182596081e-06, "loss": 0.0209, "mean_token_accuracy": 0.9932781159877777, "num_tokens": 280922004.0, "step": 2646 }, { "entropy": 0.9611506313085556, "epoch": 6.029646522234891, "grad_norm": 1.03125, "learning_rate": 1.8647853653166953e-06, "loss": 0.0334, "mean_token_accuracy": 0.9913944453001022, "num_tokens": 281028279.0, "step": 2647 }, { "entropy": 0.9692288339138031, "epoch": 6.031927023945268, "grad_norm": 1.0703125, "learning_rate": 1.862963908572872e-06, "loss": 0.0364, "mean_token_accuracy": 0.9893703013658524, "num_tokens": 281134208.0, "step": 2648 }, { "entropy": 0.9674956947565079, "epoch": 6.034207525655645, "grad_norm": 1.171875, "learning_rate": 1.8611428133984365e-06, "loss": 0.033, "mean_token_accuracy": 0.9897223562002182, "num_tokens": 281240287.0, "step": 2649 }, { "entropy": 0.961338073015213, "epoch": 6.036488027366021, "grad_norm": 0.984375, "learning_rate": 1.8593220808270057e-06, "loss": 0.0273, "mean_token_accuracy": 0.9910855740308762, "num_tokens": 281346242.0, "step": 2650 }, { "entropy": 0.962697833776474, "epoch": 6.038768529076397, "grad_norm": 1.21875, "learning_rate": 1.857501711891993e-06, "loss": 0.0331, "mean_token_accuracy": 0.9891698062419891, "num_tokens": 281452912.0, "step": 2651 }, { "entropy": 0.9599490761756897, "epoch": 6.041049030786773, "grad_norm": 1.1171875, "learning_rate": 1.8556817076266059e-06, "loss": 0.0266, "mean_token_accuracy": 0.9898010939359665, "num_tokens": 281558761.0, "step": 2652 }, { "entropy": 0.9703655689954758, "epoch": 6.043329532497149, "grad_norm": 0.96875, "learning_rate": 1.8538620690638414e-06, "loss": 0.0273, "mean_token_accuracy": 0.9909131228923798, "num_tokens": 281665179.0, "step": 2653 }, { "entropy": 0.9628771096467972, "epoch": 6.045610034207526, "grad_norm": 1.2578125, "learning_rate": 1.8520427972364924e-06, "loss": 0.0413, "mean_token_accuracy": 0.9855238795280457, "num_tokens": 281771802.0, "step": 2654 }, { "entropy": 0.9632657617330551, "epoch": 6.047890535917902, "grad_norm": 1.1015625, "learning_rate": 1.8502238931771422e-06, "loss": 0.0263, "mean_token_accuracy": 0.9912308901548386, "num_tokens": 281877955.0, "step": 2655 }, { "entropy": 0.96798075735569, "epoch": 6.050171037628278, "grad_norm": 0.984375, "learning_rate": 1.848405357918166e-06, "loss": 0.0237, "mean_token_accuracy": 0.9918845742940903, "num_tokens": 281983820.0, "step": 2656 }, { "entropy": 0.9706284254789352, "epoch": 6.052451539338654, "grad_norm": 1.265625, "learning_rate": 1.8465871924917295e-06, "loss": 0.0346, "mean_token_accuracy": 0.9908955991268158, "num_tokens": 282089521.0, "step": 2657 }, { "entropy": 0.9625689089298248, "epoch": 6.05473204104903, "grad_norm": 1.2890625, "learning_rate": 1.8447693979297882e-06, "loss": 0.0365, "mean_token_accuracy": 0.9891251027584076, "num_tokens": 282196082.0, "step": 2658 }, { "entropy": 0.969121515750885, "epoch": 6.0570125427594075, "grad_norm": 1.0234375, "learning_rate": 1.8429519752640862e-06, "loss": 0.0291, "mean_token_accuracy": 0.9908890724182129, "num_tokens": 282302612.0, "step": 2659 }, { "entropy": 0.9634146839380264, "epoch": 6.059293044469784, "grad_norm": 0.953125, "learning_rate": 1.8411349255261587e-06, "loss": 0.0315, "mean_token_accuracy": 0.9901494979858398, "num_tokens": 282409357.0, "step": 2660 }, { "entropy": 0.9628078490495682, "epoch": 6.06157354618016, "grad_norm": 1.03125, "learning_rate": 1.8393182497473271e-06, "loss": 0.0316, "mean_token_accuracy": 0.9909157454967499, "num_tokens": 282516154.0, "step": 2661 }, { "entropy": 0.9626386016607285, "epoch": 6.063854047890536, "grad_norm": 1.0078125, "learning_rate": 1.837501948958702e-06, "loss": 0.0319, "mean_token_accuracy": 0.9908792525529861, "num_tokens": 282622693.0, "step": 2662 }, { "entropy": 0.9650411307811737, "epoch": 6.066134549600912, "grad_norm": 1.046875, "learning_rate": 1.8356860241911817e-06, "loss": 0.0282, "mean_token_accuracy": 0.9911665618419647, "num_tokens": 282729839.0, "step": 2663 }, { "entropy": 0.9617755562067032, "epoch": 6.068415051311288, "grad_norm": 1.359375, "learning_rate": 1.833870476475448e-06, "loss": 0.0417, "mean_token_accuracy": 0.9877984821796417, "num_tokens": 282836146.0, "step": 2664 }, { "entropy": 0.9630835354328156, "epoch": 6.070695553021665, "grad_norm": 1.3671875, "learning_rate": 1.8320553068419716e-06, "loss": 0.0334, "mean_token_accuracy": 0.9887526631355286, "num_tokens": 282942188.0, "step": 2665 }, { "entropy": 0.9619114249944687, "epoch": 6.072976054732041, "grad_norm": 0.84765625, "learning_rate": 1.830240516321008e-06, "loss": 0.0249, "mean_token_accuracy": 0.993399515748024, "num_tokens": 283048332.0, "step": 2666 }, { "entropy": 0.9637665897607803, "epoch": 6.075256556442417, "grad_norm": 0.8125, "learning_rate": 1.8284261059425972e-06, "loss": 0.0213, "mean_token_accuracy": 0.9929117411375046, "num_tokens": 283154741.0, "step": 2667 }, { "entropy": 0.9644084721803665, "epoch": 6.077537058152793, "grad_norm": 1.0234375, "learning_rate": 1.8266120767365642e-06, "loss": 0.0252, "mean_token_accuracy": 0.9930639266967773, "num_tokens": 283260420.0, "step": 2668 }, { "entropy": 0.9648587256669998, "epoch": 6.07981755986317, "grad_norm": 1.0390625, "learning_rate": 1.8247984297325156e-06, "loss": 0.0338, "mean_token_accuracy": 0.9908852577209473, "num_tokens": 283366582.0, "step": 2669 }, { "entropy": 0.9614146649837494, "epoch": 6.0820980615735465, "grad_norm": 1.390625, "learning_rate": 1.8229851659598425e-06, "loss": 0.0408, "mean_token_accuracy": 0.9876816868782043, "num_tokens": 283472896.0, "step": 2670 }, { "entropy": 0.9594793766736984, "epoch": 6.084378563283923, "grad_norm": 1.0078125, "learning_rate": 1.8211722864477197e-06, "loss": 0.0314, "mean_token_accuracy": 0.9905988574028015, "num_tokens": 283578879.0, "step": 2671 }, { "entropy": 0.9610547423362732, "epoch": 6.086659064994299, "grad_norm": 1.0625, "learning_rate": 1.819359792225101e-06, "loss": 0.0306, "mean_token_accuracy": 0.9885759055614471, "num_tokens": 283685517.0, "step": 2672 }, { "entropy": 0.9684831649065018, "epoch": 6.088939566704675, "grad_norm": 0.97265625, "learning_rate": 1.8175476843207245e-06, "loss": 0.0272, "mean_token_accuracy": 0.9908299297094345, "num_tokens": 283791873.0, "step": 2673 }, { "entropy": 0.959471121430397, "epoch": 6.091220068415051, "grad_norm": 0.87109375, "learning_rate": 1.8157359637631078e-06, "loss": 0.025, "mean_token_accuracy": 0.9927243739366531, "num_tokens": 283897592.0, "step": 2674 }, { "entropy": 0.9608851224184036, "epoch": 6.0935005701254275, "grad_norm": 0.9453125, "learning_rate": 1.813924631580547e-06, "loss": 0.0258, "mean_token_accuracy": 0.9916556477546692, "num_tokens": 284003695.0, "step": 2675 }, { "entropy": 0.9574878662824631, "epoch": 6.095781071835804, "grad_norm": 1.15625, "learning_rate": 1.8121136888011198e-06, "loss": 0.0379, "mean_token_accuracy": 0.986450120806694, "num_tokens": 284109357.0, "step": 2676 }, { "entropy": 0.9579738825559616, "epoch": 6.09806157354618, "grad_norm": 1.0, "learning_rate": 1.810303136452683e-06, "loss": 0.0255, "mean_token_accuracy": 0.9923062920570374, "num_tokens": 284215838.0, "step": 2677 }, { "entropy": 0.962499126791954, "epoch": 6.100342075256556, "grad_norm": 1.1015625, "learning_rate": 1.8084929755628707e-06, "loss": 0.0348, "mean_token_accuracy": 0.9885999262332916, "num_tokens": 284322121.0, "step": 2678 }, { "entropy": 0.9632545411586761, "epoch": 6.102622576966933, "grad_norm": 1.0703125, "learning_rate": 1.8066832071590967e-06, "loss": 0.0256, "mean_token_accuracy": 0.9909094274044037, "num_tokens": 284428678.0, "step": 2679 }, { "entropy": 0.9615702629089355, "epoch": 6.104903078677309, "grad_norm": 0.875, "learning_rate": 1.8048738322685478e-06, "loss": 0.025, "mean_token_accuracy": 0.992927223443985, "num_tokens": 284534673.0, "step": 2680 }, { "entropy": 0.9649644792079926, "epoch": 6.1071835803876855, "grad_norm": 1.140625, "learning_rate": 1.8030648519181926e-06, "loss": 0.0298, "mean_token_accuracy": 0.9909150004386902, "num_tokens": 284640979.0, "step": 2681 }, { "entropy": 0.9605531543493271, "epoch": 6.109464082098062, "grad_norm": 1.1171875, "learning_rate": 1.8012562671347721e-06, "loss": 0.034, "mean_token_accuracy": 0.9896342009305954, "num_tokens": 284747018.0, "step": 2682 }, { "entropy": 0.9643349498510361, "epoch": 6.111744583808438, "grad_norm": 0.890625, "learning_rate": 1.7994480789448043e-06, "loss": 0.0252, "mean_token_accuracy": 0.9917148351669312, "num_tokens": 284853650.0, "step": 2683 }, { "entropy": 0.9586532711982727, "epoch": 6.114025085518814, "grad_norm": 1.1015625, "learning_rate": 1.7976402883745836e-06, "loss": 0.0381, "mean_token_accuracy": 0.9864254891872406, "num_tokens": 284960337.0, "step": 2684 }, { "entropy": 0.9568156152963638, "epoch": 6.11630558722919, "grad_norm": 1.2109375, "learning_rate": 1.7958328964501749e-06, "loss": 0.029, "mean_token_accuracy": 0.9896482676267624, "num_tokens": 285066072.0, "step": 2685 }, { "entropy": 0.9624978601932526, "epoch": 6.1185860889395665, "grad_norm": 0.9140625, "learning_rate": 1.7940259041974189e-06, "loss": 0.0333, "mean_token_accuracy": 0.9908523857593536, "num_tokens": 285172324.0, "step": 2686 }, { "entropy": 0.9607389569282532, "epoch": 6.120866590649943, "grad_norm": 1.140625, "learning_rate": 1.7922193126419306e-06, "loss": 0.0261, "mean_token_accuracy": 0.9909248352050781, "num_tokens": 285278391.0, "step": 2687 }, { "entropy": 0.9636567384004593, "epoch": 6.123147092360319, "grad_norm": 1.0703125, "learning_rate": 1.7904131228090965e-06, "loss": 0.0353, "mean_token_accuracy": 0.9890855550765991, "num_tokens": 285384666.0, "step": 2688 }, { "entropy": 0.9641198366880417, "epoch": 6.125427594070696, "grad_norm": 1.15625, "learning_rate": 1.7886073357240746e-06, "loss": 0.0397, "mean_token_accuracy": 0.987984761595726, "num_tokens": 285491037.0, "step": 2689 }, { "entropy": 0.9624408334493637, "epoch": 6.127708095781072, "grad_norm": 1.2109375, "learning_rate": 1.7868019524117957e-06, "loss": 0.0344, "mean_token_accuracy": 0.9881749153137207, "num_tokens": 285597002.0, "step": 2690 }, { "entropy": 0.9632614552974701, "epoch": 6.129988597491448, "grad_norm": 1.4375, "learning_rate": 1.7849969738969592e-06, "loss": 0.0405, "mean_token_accuracy": 0.9863800704479218, "num_tokens": 285702852.0, "step": 2691 }, { "entropy": 0.9653108268976212, "epoch": 6.1322690992018245, "grad_norm": 1.1484375, "learning_rate": 1.783192401204037e-06, "loss": 0.0281, "mean_token_accuracy": 0.9907037913799286, "num_tokens": 285809172.0, "step": 2692 }, { "entropy": 0.9612651020288467, "epoch": 6.134549600912201, "grad_norm": 0.8359375, "learning_rate": 1.7813882353572692e-06, "loss": 0.0232, "mean_token_accuracy": 0.9924894720315933, "num_tokens": 285915257.0, "step": 2693 }, { "entropy": 0.9625966995954514, "epoch": 6.136830102622577, "grad_norm": 1.0859375, "learning_rate": 1.7795844773806653e-06, "loss": 0.0291, "mean_token_accuracy": 0.9905236512422562, "num_tokens": 286021781.0, "step": 2694 }, { "entropy": 0.961761936545372, "epoch": 6.139110604332953, "grad_norm": 1.1875, "learning_rate": 1.7777811282980047e-06, "loss": 0.0342, "mean_token_accuracy": 0.987438291311264, "num_tokens": 286128312.0, "step": 2695 }, { "entropy": 0.9685081541538239, "epoch": 6.141391106043329, "grad_norm": 1.078125, "learning_rate": 1.7759781891328321e-06, "loss": 0.0268, "mean_token_accuracy": 0.9911776483058929, "num_tokens": 286234752.0, "step": 2696 }, { "entropy": 0.9614653140306473, "epoch": 6.1436716077537055, "grad_norm": 1.0625, "learning_rate": 1.7741756609084616e-06, "loss": 0.0252, "mean_token_accuracy": 0.9922378063201904, "num_tokens": 286341464.0, "step": 2697 }, { "entropy": 0.9654054641723633, "epoch": 6.145952109464082, "grad_norm": 0.92578125, "learning_rate": 1.772373544647973e-06, "loss": 0.0316, "mean_token_accuracy": 0.9904837757349014, "num_tokens": 286448311.0, "step": 2698 }, { "entropy": 0.9621796160936356, "epoch": 6.148232611174459, "grad_norm": 1.1875, "learning_rate": 1.770571841374213e-06, "loss": 0.0352, "mean_token_accuracy": 0.9907694607973099, "num_tokens": 286554518.0, "step": 2699 }, { "entropy": 0.9636770188808441, "epoch": 6.150513112884835, "grad_norm": 0.93359375, "learning_rate": 1.7687705521097954e-06, "loss": 0.0268, "mean_token_accuracy": 0.990375280380249, "num_tokens": 286661123.0, "step": 2700 }, { "entropy": 0.9715162068605423, "epoch": 6.152793614595211, "grad_norm": 1.28125, "learning_rate": 1.766969677877094e-06, "loss": 0.0391, "mean_token_accuracy": 0.9888971298933029, "num_tokens": 286767900.0, "step": 2701 }, { "entropy": 0.963448166847229, "epoch": 6.155074116305587, "grad_norm": 0.875, "learning_rate": 1.7651692196982517e-06, "loss": 0.0303, "mean_token_accuracy": 0.9901432543992996, "num_tokens": 286874122.0, "step": 2702 }, { "entropy": 0.9598369747400284, "epoch": 6.1573546180159635, "grad_norm": 1.078125, "learning_rate": 1.7633691785951746e-06, "loss": 0.027, "mean_token_accuracy": 0.9892519265413284, "num_tokens": 286980175.0, "step": 2703 }, { "entropy": 0.9674243032932281, "epoch": 6.15963511972634, "grad_norm": 1.046875, "learning_rate": 1.7615695555895296e-06, "loss": 0.0327, "mean_token_accuracy": 0.9894340932369232, "num_tokens": 287086683.0, "step": 2704 }, { "entropy": 0.96704962849617, "epoch": 6.161915621436716, "grad_norm": 1.0, "learning_rate": 1.7597703517027491e-06, "loss": 0.0308, "mean_token_accuracy": 0.9889951348304749, "num_tokens": 287192693.0, "step": 2705 }, { "entropy": 0.956523209810257, "epoch": 6.164196123147092, "grad_norm": 1.015625, "learning_rate": 1.7579715679560273e-06, "loss": 0.0307, "mean_token_accuracy": 0.9921295642852783, "num_tokens": 287298942.0, "step": 2706 }, { "entropy": 0.9640293121337891, "epoch": 6.166476624857468, "grad_norm": 1.1953125, "learning_rate": 1.7561732053703174e-06, "loss": 0.0323, "mean_token_accuracy": 0.9900531321763992, "num_tokens": 287405661.0, "step": 2707 }, { "entropy": 0.959778293967247, "epoch": 6.168757126567845, "grad_norm": 1.21875, "learning_rate": 1.7543752649663354e-06, "loss": 0.0386, "mean_token_accuracy": 0.9872139394283295, "num_tokens": 287512614.0, "step": 2708 }, { "entropy": 0.9607634097337723, "epoch": 6.1710376282782216, "grad_norm": 1.0625, "learning_rate": 1.7525777477645586e-06, "loss": 0.026, "mean_token_accuracy": 0.9913304597139359, "num_tokens": 287619824.0, "step": 2709 }, { "entropy": 0.9578602612018585, "epoch": 6.173318129988598, "grad_norm": 1.015625, "learning_rate": 1.7507806547852224e-06, "loss": 0.0334, "mean_token_accuracy": 0.9907326698303223, "num_tokens": 287726279.0, "step": 2710 }, { "entropy": 0.9601590782403946, "epoch": 6.175598631698974, "grad_norm": 1.1875, "learning_rate": 1.7489839870483236e-06, "loss": 0.0288, "mean_token_accuracy": 0.9912566840648651, "num_tokens": 287832831.0, "step": 2711 }, { "entropy": 0.9643799960613251, "epoch": 6.17787913340935, "grad_norm": 1.03125, "learning_rate": 1.7471877455736136e-06, "loss": 0.031, "mean_token_accuracy": 0.9910463392734528, "num_tokens": 287939385.0, "step": 2712 }, { "entropy": 0.9621084630489349, "epoch": 6.180159635119726, "grad_norm": 1.4140625, "learning_rate": 1.7453919313806057e-06, "loss": 0.0333, "mean_token_accuracy": 0.9910760670900345, "num_tokens": 288045699.0, "step": 2713 }, { "entropy": 0.9631487280130386, "epoch": 6.1824401368301025, "grad_norm": 1.0390625, "learning_rate": 1.7435965454885699e-06, "loss": 0.0337, "mean_token_accuracy": 0.9891772568225861, "num_tokens": 288152006.0, "step": 2714 }, { "entropy": 0.9656496345996857, "epoch": 6.184720638540479, "grad_norm": 0.79296875, "learning_rate": 1.7418015889165312e-06, "loss": 0.0226, "mean_token_accuracy": 0.9910032749176025, "num_tokens": 288259130.0, "step": 2715 }, { "entropy": 0.9642879217863083, "epoch": 6.187001140250855, "grad_norm": 1.09375, "learning_rate": 1.7400070626832732e-06, "loss": 0.0329, "mean_token_accuracy": 0.9887663424015045, "num_tokens": 288365479.0, "step": 2716 }, { "entropy": 0.9662395268678665, "epoch": 6.189281641961231, "grad_norm": 1.0234375, "learning_rate": 1.7382129678073351e-06, "loss": 0.0285, "mean_token_accuracy": 0.9905247837305069, "num_tokens": 288471904.0, "step": 2717 }, { "entropy": 0.9626572579145432, "epoch": 6.191562143671608, "grad_norm": 1.296875, "learning_rate": 1.7364193053070082e-06, "loss": 0.0319, "mean_token_accuracy": 0.990158274769783, "num_tokens": 288577512.0, "step": 2718 }, { "entropy": 0.9645081907510757, "epoch": 6.193842645381984, "grad_norm": 0.90625, "learning_rate": 1.7346260762003428e-06, "loss": 0.0233, "mean_token_accuracy": 0.9921761155128479, "num_tokens": 288683626.0, "step": 2719 }, { "entropy": 0.9693494290113449, "epoch": 6.196123147092361, "grad_norm": 1.171875, "learning_rate": 1.7328332815051403e-06, "loss": 0.034, "mean_token_accuracy": 0.9883894771337509, "num_tokens": 288790364.0, "step": 2720 }, { "entropy": 0.9589163661003113, "epoch": 6.198403648802737, "grad_norm": 0.88671875, "learning_rate": 1.7310409222389563e-06, "loss": 0.028, "mean_token_accuracy": 0.9886386543512344, "num_tokens": 288896728.0, "step": 2721 }, { "entropy": 0.9660790711641312, "epoch": 6.200684150513113, "grad_norm": 0.9296875, "learning_rate": 1.7292489994191005e-06, "loss": 0.0328, "mean_token_accuracy": 0.9896864891052246, "num_tokens": 289003246.0, "step": 2722 }, { "entropy": 0.9656028300523758, "epoch": 6.202964652223489, "grad_norm": 1.1640625, "learning_rate": 1.7274575140626318e-06, "loss": 0.0293, "mean_token_accuracy": 0.9914490133523941, "num_tokens": 289109903.0, "step": 2723 }, { "entropy": 0.9616937637329102, "epoch": 6.205245153933865, "grad_norm": 1.0078125, "learning_rate": 1.7256664671863634e-06, "loss": 0.0274, "mean_token_accuracy": 0.9920057505369186, "num_tokens": 289216582.0, "step": 2724 }, { "entropy": 0.9655533581972122, "epoch": 6.2075256556442415, "grad_norm": 1.0625, "learning_rate": 1.72387585980686e-06, "loss": 0.0236, "mean_token_accuracy": 0.9924517869949341, "num_tokens": 289322738.0, "step": 2725 }, { "entropy": 0.9635019302368164, "epoch": 6.209806157354618, "grad_norm": 1.046875, "learning_rate": 1.7220856929404342e-06, "loss": 0.0237, "mean_token_accuracy": 0.9923419207334518, "num_tokens": 289429017.0, "step": 2726 }, { "entropy": 0.9685231298208237, "epoch": 6.212086659064994, "grad_norm": 1.265625, "learning_rate": 1.720295967603152e-06, "loss": 0.029, "mean_token_accuracy": 0.9900393486022949, "num_tokens": 289535389.0, "step": 2727 }, { "entropy": 0.9683802276849747, "epoch": 6.214367160775371, "grad_norm": 1.15625, "learning_rate": 1.7185066848108244e-06, "loss": 0.0272, "mean_token_accuracy": 0.9904020130634308, "num_tokens": 289641754.0, "step": 2728 }, { "entropy": 0.9690528064966202, "epoch": 6.216647662485747, "grad_norm": 1.0625, "learning_rate": 1.7167178455790157e-06, "loss": 0.0345, "mean_token_accuracy": 0.9905894249677658, "num_tokens": 289748399.0, "step": 2729 }, { "entropy": 0.9714601933956146, "epoch": 6.218928164196123, "grad_norm": 1.0, "learning_rate": 1.7149294509230357e-06, "loss": 0.0285, "mean_token_accuracy": 0.990757629275322, "num_tokens": 289854446.0, "step": 2730 }, { "entropy": 0.9641185849905014, "epoch": 6.2212086659065, "grad_norm": 1.1875, "learning_rate": 1.713141501857943e-06, "loss": 0.0277, "mean_token_accuracy": 0.9912948757410049, "num_tokens": 289960975.0, "step": 2731 }, { "entropy": 0.9645472317934036, "epoch": 6.223489167616876, "grad_norm": 1.1484375, "learning_rate": 1.7113539993985431e-06, "loss": 0.0396, "mean_token_accuracy": 0.9877965748310089, "num_tokens": 290066853.0, "step": 2732 }, { "entropy": 0.9605286121368408, "epoch": 6.225769669327252, "grad_norm": 1.046875, "learning_rate": 1.7095669445593887e-06, "loss": 0.0371, "mean_token_accuracy": 0.9885055720806122, "num_tokens": 290173267.0, "step": 2733 }, { "entropy": 0.9638795852661133, "epoch": 6.228050171037628, "grad_norm": 1.1796875, "learning_rate": 1.707780338354776e-06, "loss": 0.0336, "mean_token_accuracy": 0.9914102107286453, "num_tokens": 290279496.0, "step": 2734 }, { "entropy": 0.9632378816604614, "epoch": 6.230330672748004, "grad_norm": 1.1328125, "learning_rate": 1.7059941817987485e-06, "loss": 0.0301, "mean_token_accuracy": 0.9906269013881683, "num_tokens": 290385957.0, "step": 2735 }, { "entropy": 0.9690006822347641, "epoch": 6.2326111744583805, "grad_norm": 1.21875, "learning_rate": 1.7042084759050948e-06, "loss": 0.0347, "mean_token_accuracy": 0.9875853955745697, "num_tokens": 290492383.0, "step": 2736 }, { "entropy": 0.9676993936300278, "epoch": 6.234891676168757, "grad_norm": 1.21875, "learning_rate": 1.7024232216873465e-06, "loss": 0.0335, "mean_token_accuracy": 0.9898857474327087, "num_tokens": 290598897.0, "step": 2737 }, { "entropy": 0.9601109325885773, "epoch": 6.237172177879134, "grad_norm": 0.953125, "learning_rate": 1.7006384201587809e-06, "loss": 0.0374, "mean_token_accuracy": 0.9875656813383102, "num_tokens": 290705266.0, "step": 2738 }, { "entropy": 0.9628721475601196, "epoch": 6.23945267958951, "grad_norm": 1.453125, "learning_rate": 1.6988540723324145e-06, "loss": 0.0386, "mean_token_accuracy": 0.9851835370063782, "num_tokens": 290811754.0, "step": 2739 }, { "entropy": 0.9631428122520447, "epoch": 6.241733181299886, "grad_norm": 1.0625, "learning_rate": 1.6970701792210101e-06, "loss": 0.0327, "mean_token_accuracy": 0.9906308054924011, "num_tokens": 290918554.0, "step": 2740 }, { "entropy": 0.9576750993728638, "epoch": 6.244013683010262, "grad_norm": 1.25, "learning_rate": 1.6952867418370707e-06, "loss": 0.0344, "mean_token_accuracy": 0.9901357591152191, "num_tokens": 291025202.0, "step": 2741 }, { "entropy": 0.9614335000514984, "epoch": 6.246294184720639, "grad_norm": 1.1640625, "learning_rate": 1.6935037611928412e-06, "loss": 0.022, "mean_token_accuracy": 0.9935334473848343, "num_tokens": 291131105.0, "step": 2742 }, { "entropy": 0.9641132205724716, "epoch": 6.248574686431015, "grad_norm": 1.3203125, "learning_rate": 1.691721238300308e-06, "loss": 0.0242, "mean_token_accuracy": 0.9913577735424042, "num_tokens": 291237108.0, "step": 2743 }, { "entropy": 0.9593443125486374, "epoch": 6.250855188141391, "grad_norm": 1.1484375, "learning_rate": 1.689939174171194e-06, "loss": 0.0399, "mean_token_accuracy": 0.9878155887126923, "num_tokens": 291343278.0, "step": 2744 }, { "entropy": 0.9627764225006104, "epoch": 6.253135689851767, "grad_norm": 1.1328125, "learning_rate": 1.6881575698169662e-06, "loss": 0.0331, "mean_token_accuracy": 0.9891181588172913, "num_tokens": 291450037.0, "step": 2745 }, { "entropy": 0.9640289694070816, "epoch": 6.255416191562143, "grad_norm": 0.99609375, "learning_rate": 1.6863764262488292e-06, "loss": 0.0316, "mean_token_accuracy": 0.9918324798345566, "num_tokens": 291556556.0, "step": 2746 }, { "entropy": 0.9644737094640732, "epoch": 6.2576966932725195, "grad_norm": 1.2421875, "learning_rate": 1.6845957444777244e-06, "loss": 0.032, "mean_token_accuracy": 0.9900206923484802, "num_tokens": 291663156.0, "step": 2747 }, { "entropy": 0.9604948461055756, "epoch": 6.259977194982897, "grad_norm": 0.98046875, "learning_rate": 1.6828155255143331e-06, "loss": 0.0261, "mean_token_accuracy": 0.9909422844648361, "num_tokens": 291769236.0, "step": 2748 }, { "entropy": 0.9604319781064987, "epoch": 6.262257696693273, "grad_norm": 1.234375, "learning_rate": 1.6810357703690739e-06, "loss": 0.0442, "mean_token_accuracy": 0.9898703247308731, "num_tokens": 291875798.0, "step": 2749 }, { "entropy": 0.9624180495738983, "epoch": 6.264538198403649, "grad_norm": 0.95703125, "learning_rate": 1.6792564800521e-06, "loss": 0.029, "mean_token_accuracy": 0.9890719503164291, "num_tokens": 291981863.0, "step": 2750 }, { "entropy": 0.9593867063522339, "epoch": 6.266818700114025, "grad_norm": 0.9453125, "learning_rate": 1.677477655573303e-06, "loss": 0.028, "mean_token_accuracy": 0.9910954684019089, "num_tokens": 292087900.0, "step": 2751 }, { "entropy": 0.9656224846839905, "epoch": 6.269099201824401, "grad_norm": 1.1875, "learning_rate": 1.675699297942309e-06, "loss": 0.032, "mean_token_accuracy": 0.988454133272171, "num_tokens": 292194033.0, "step": 2752 }, { "entropy": 0.9674578160047531, "epoch": 6.271379703534778, "grad_norm": 1.265625, "learning_rate": 1.6739214081684799e-06, "loss": 0.0427, "mean_token_accuracy": 0.9871638268232346, "num_tokens": 292300613.0, "step": 2753 }, { "entropy": 0.9618206769227982, "epoch": 6.273660205245154, "grad_norm": 1.1171875, "learning_rate": 1.6721439872609125e-06, "loss": 0.0347, "mean_token_accuracy": 0.9887508153915405, "num_tokens": 292407674.0, "step": 2754 }, { "entropy": 0.9685026407241821, "epoch": 6.27594070695553, "grad_norm": 1.1015625, "learning_rate": 1.6703670362284346e-06, "loss": 0.0309, "mean_token_accuracy": 0.9895285815000534, "num_tokens": 292513881.0, "step": 2755 }, { "entropy": 0.9629374295473099, "epoch": 6.278221208665906, "grad_norm": 1.296875, "learning_rate": 1.6685905560796101e-06, "loss": 0.0383, "mean_token_accuracy": 0.9887033402919769, "num_tokens": 292620065.0, "step": 2756 }, { "entropy": 0.9626521170139313, "epoch": 6.280501710376283, "grad_norm": 0.88671875, "learning_rate": 1.6668145478227354e-06, "loss": 0.0245, "mean_token_accuracy": 0.9928363412618637, "num_tokens": 292725996.0, "step": 2757 }, { "entropy": 0.9720036089420319, "epoch": 6.282782212086659, "grad_norm": 0.96484375, "learning_rate": 1.6650390124658378e-06, "loss": 0.0256, "mean_token_accuracy": 0.9901890605688095, "num_tokens": 292832257.0, "step": 2758 }, { "entropy": 0.9649845212697983, "epoch": 6.285062713797036, "grad_norm": 1.15625, "learning_rate": 1.663263951016678e-06, "loss": 0.0269, "mean_token_accuracy": 0.9913786798715591, "num_tokens": 292938313.0, "step": 2759 }, { "entropy": 0.9640349894762039, "epoch": 6.287343215507412, "grad_norm": 1.15625, "learning_rate": 1.661489364482745e-06, "loss": 0.0359, "mean_token_accuracy": 0.9890404939651489, "num_tokens": 293044563.0, "step": 2760 }, { "entropy": 0.967204675078392, "epoch": 6.289623717217788, "grad_norm": 1.125, "learning_rate": 1.6597152538712608e-06, "loss": 0.0338, "mean_token_accuracy": 0.9894020110368729, "num_tokens": 293151276.0, "step": 2761 }, { "entropy": 0.9634224027395248, "epoch": 6.291904218928164, "grad_norm": 1.0234375, "learning_rate": 1.6579416201891757e-06, "loss": 0.0313, "mean_token_accuracy": 0.9917814582586288, "num_tokens": 293257963.0, "step": 2762 }, { "entropy": 0.96688012778759, "epoch": 6.29418472063854, "grad_norm": 1.046875, "learning_rate": 1.6561684644431709e-06, "loss": 0.0289, "mean_token_accuracy": 0.9878240674734116, "num_tokens": 293363978.0, "step": 2763 }, { "entropy": 0.9678733944892883, "epoch": 6.296465222348917, "grad_norm": 1.0234375, "learning_rate": 1.6543957876396544e-06, "loss": 0.0285, "mean_token_accuracy": 0.9910137802362442, "num_tokens": 293470378.0, "step": 2764 }, { "entropy": 0.9632772505283356, "epoch": 6.298745724059293, "grad_norm": 1.1640625, "learning_rate": 1.6526235907847649e-06, "loss": 0.0333, "mean_token_accuracy": 0.9890914708375931, "num_tokens": 293576913.0, "step": 2765 }, { "entropy": 0.9653374999761581, "epoch": 6.301026225769669, "grad_norm": 0.98828125, "learning_rate": 1.6508518748843651e-06, "loss": 0.0315, "mean_token_accuracy": 0.99041947722435, "num_tokens": 293683531.0, "step": 2766 }, { "entropy": 0.9668000489473343, "epoch": 6.303306727480045, "grad_norm": 1.1796875, "learning_rate": 1.649080640944048e-06, "loss": 0.0334, "mean_token_accuracy": 0.990400418639183, "num_tokens": 293790825.0, "step": 2767 }, { "entropy": 0.9691970348358154, "epoch": 6.305587229190422, "grad_norm": 1.0390625, "learning_rate": 1.6473098899691313e-06, "loss": 0.0261, "mean_token_accuracy": 0.9924120604991913, "num_tokens": 293896711.0, "step": 2768 }, { "entropy": 0.9678291827440262, "epoch": 6.307867730900798, "grad_norm": 1.0390625, "learning_rate": 1.6455396229646595e-06, "loss": 0.0283, "mean_token_accuracy": 0.9904878586530685, "num_tokens": 294003232.0, "step": 2769 }, { "entropy": 0.9660652875900269, "epoch": 6.310148232611175, "grad_norm": 1.015625, "learning_rate": 1.6437698409354025e-06, "loss": 0.0335, "mean_token_accuracy": 0.9896391034126282, "num_tokens": 294109734.0, "step": 2770 }, { "entropy": 0.970296785235405, "epoch": 6.312428734321551, "grad_norm": 1.046875, "learning_rate": 1.6420005448858522e-06, "loss": 0.0306, "mean_token_accuracy": 0.9897883832454681, "num_tokens": 294215731.0, "step": 2771 }, { "entropy": 0.963557168841362, "epoch": 6.314709236031927, "grad_norm": 1.140625, "learning_rate": 1.6402317358202286e-06, "loss": 0.0334, "mean_token_accuracy": 0.9876372814178467, "num_tokens": 294321713.0, "step": 2772 }, { "entropy": 0.9611302465200424, "epoch": 6.316989737742303, "grad_norm": 1.09375, "learning_rate": 1.6384634147424732e-06, "loss": 0.0329, "mean_token_accuracy": 0.9908719956874847, "num_tokens": 294428436.0, "step": 2773 }, { "entropy": 0.962317019701004, "epoch": 6.319270239452679, "grad_norm": 1.0546875, "learning_rate": 1.636695582656251e-06, "loss": 0.0344, "mean_token_accuracy": 0.9916597455739975, "num_tokens": 294534387.0, "step": 2774 }, { "entropy": 0.9624365866184235, "epoch": 6.321550741163056, "grad_norm": 1.0625, "learning_rate": 1.6349282405649506e-06, "loss": 0.0278, "mean_token_accuracy": 0.9920227825641632, "num_tokens": 294641044.0, "step": 2775 }, { "entropy": 0.966009259223938, "epoch": 6.323831242873432, "grad_norm": 0.96875, "learning_rate": 1.6331613894716787e-06, "loss": 0.0257, "mean_token_accuracy": 0.9913590848445892, "num_tokens": 294747207.0, "step": 2776 }, { "entropy": 0.9682252705097198, "epoch": 6.326111744583809, "grad_norm": 1.03125, "learning_rate": 1.6313950303792672e-06, "loss": 0.029, "mean_token_accuracy": 0.9899289906024933, "num_tokens": 294853602.0, "step": 2777 }, { "entropy": 0.9612033814191818, "epoch": 6.328392246294185, "grad_norm": 1.3203125, "learning_rate": 1.6296291642902673e-06, "loss": 0.0369, "mean_token_accuracy": 0.989063560962677, "num_tokens": 294959804.0, "step": 2778 }, { "entropy": 0.9651383459568024, "epoch": 6.330672748004561, "grad_norm": 1.1328125, "learning_rate": 1.6278637922069512e-06, "loss": 0.0354, "mean_token_accuracy": 0.9875663220882416, "num_tokens": 295066184.0, "step": 2779 }, { "entropy": 0.9664762169122696, "epoch": 6.3329532497149374, "grad_norm": 0.96484375, "learning_rate": 1.6260989151313091e-06, "loss": 0.0241, "mean_token_accuracy": 0.9922053068876266, "num_tokens": 295171965.0, "step": 2780 }, { "entropy": 0.9654189348220825, "epoch": 6.335233751425314, "grad_norm": 0.9609375, "learning_rate": 1.6243345340650523e-06, "loss": 0.032, "mean_token_accuracy": 0.9911584556102753, "num_tokens": 295278094.0, "step": 2781 }, { "entropy": 0.9661945551633835, "epoch": 6.33751425313569, "grad_norm": 1.1796875, "learning_rate": 1.6225706500096079e-06, "loss": 0.0346, "mean_token_accuracy": 0.989261731505394, "num_tokens": 295384226.0, "step": 2782 }, { "entropy": 0.9611630439758301, "epoch": 6.339794754846066, "grad_norm": 1.21875, "learning_rate": 1.6208072639661226e-06, "loss": 0.0341, "mean_token_accuracy": 0.9893072247505188, "num_tokens": 295490653.0, "step": 2783 }, { "entropy": 0.9652715474367142, "epoch": 6.342075256556442, "grad_norm": 1.59375, "learning_rate": 1.6190443769354608e-06, "loss": 0.034, "mean_token_accuracy": 0.9873823672533035, "num_tokens": 295596713.0, "step": 2784 }, { "entropy": 0.9649737030267715, "epoch": 6.344355758266818, "grad_norm": 1.28125, "learning_rate": 1.6172819899182036e-06, "loss": 0.0322, "mean_token_accuracy": 0.9917201697826385, "num_tokens": 295702829.0, "step": 2785 }, { "entropy": 0.9663645923137665, "epoch": 6.346636259977195, "grad_norm": 0.90625, "learning_rate": 1.6155201039146478e-06, "loss": 0.0269, "mean_token_accuracy": 0.9925671815872192, "num_tokens": 295809251.0, "step": 2786 }, { "entropy": 0.9643654227256775, "epoch": 6.348916761687571, "grad_norm": 1.1484375, "learning_rate": 1.613758719924805e-06, "loss": 0.0424, "mean_token_accuracy": 0.9862226843833923, "num_tokens": 295915238.0, "step": 2787 }, { "entropy": 0.9642847180366516, "epoch": 6.351197263397948, "grad_norm": 1.140625, "learning_rate": 1.611997838948403e-06, "loss": 0.0259, "mean_token_accuracy": 0.9912941455841064, "num_tokens": 296022471.0, "step": 2788 }, { "entropy": 0.9603889584541321, "epoch": 6.353477765108324, "grad_norm": 0.8984375, "learning_rate": 1.6102374619848845e-06, "loss": 0.0242, "mean_token_accuracy": 0.9923200607299805, "num_tokens": 296128789.0, "step": 2789 }, { "entropy": 0.9646933376789093, "epoch": 6.3557582668187, "grad_norm": 0.94921875, "learning_rate": 1.6084775900334046e-06, "loss": 0.0239, "mean_token_accuracy": 0.9917632341384888, "num_tokens": 296234884.0, "step": 2790 }, { "entropy": 0.9651411920785904, "epoch": 6.3580387685290765, "grad_norm": 0.96875, "learning_rate": 1.6067182240928332e-06, "loss": 0.0273, "mean_token_accuracy": 0.9916862696409225, "num_tokens": 296341248.0, "step": 2791 }, { "entropy": 0.9613000005483627, "epoch": 6.360319270239453, "grad_norm": 1.25, "learning_rate": 1.6049593651617534e-06, "loss": 0.0284, "mean_token_accuracy": 0.9913472384214401, "num_tokens": 296447575.0, "step": 2792 }, { "entropy": 0.9658768177032471, "epoch": 6.362599771949829, "grad_norm": 1.4375, "learning_rate": 1.6032010142384572e-06, "loss": 0.0454, "mean_token_accuracy": 0.9860958456993103, "num_tokens": 296553922.0, "step": 2793 }, { "entropy": 0.9632129222154617, "epoch": 6.364880273660205, "grad_norm": 1.0859375, "learning_rate": 1.6014431723209522e-06, "loss": 0.0296, "mean_token_accuracy": 0.9904309213161469, "num_tokens": 296660018.0, "step": 2794 }, { "entropy": 0.9611330181360245, "epoch": 6.367160775370581, "grad_norm": 1.0, "learning_rate": 1.599685840406955e-06, "loss": 0.0336, "mean_token_accuracy": 0.9872413575649261, "num_tokens": 296766714.0, "step": 2795 }, { "entropy": 0.9609897285699844, "epoch": 6.369441277080957, "grad_norm": 1.3671875, "learning_rate": 1.5979290194938938e-06, "loss": 0.041, "mean_token_accuracy": 0.9877043962478638, "num_tokens": 296872647.0, "step": 2796 }, { "entropy": 0.9672413021326065, "epoch": 6.3717217787913345, "grad_norm": 0.94140625, "learning_rate": 1.5961727105789072e-06, "loss": 0.0245, "mean_token_accuracy": 0.9923326671123505, "num_tokens": 296978945.0, "step": 2797 }, { "entropy": 0.9639715403318405, "epoch": 6.374002280501711, "grad_norm": 1.0546875, "learning_rate": 1.5944169146588395e-06, "loss": 0.0263, "mean_token_accuracy": 0.9905667752027512, "num_tokens": 297085132.0, "step": 2798 }, { "entropy": 0.9696639180183411, "epoch": 6.376282782212087, "grad_norm": 1.2109375, "learning_rate": 1.5926616327302482e-06, "loss": 0.0322, "mean_token_accuracy": 0.9890216737985611, "num_tokens": 297191367.0, "step": 2799 }, { "entropy": 0.9637541472911835, "epoch": 6.378563283922463, "grad_norm": 1.21875, "learning_rate": 1.5909068657893978e-06, "loss": 0.0318, "mean_token_accuracy": 0.9906446486711502, "num_tokens": 297297816.0, "step": 2800 }, { "entropy": 0.963173046708107, "epoch": 6.380843785632839, "grad_norm": 1.109375, "learning_rate": 1.5891526148322594e-06, "loss": 0.0319, "mean_token_accuracy": 0.9908640831708908, "num_tokens": 297404311.0, "step": 2801 }, { "entropy": 0.9565755277872086, "epoch": 6.3831242873432155, "grad_norm": 0.94921875, "learning_rate": 1.5873988808545127e-06, "loss": 0.0266, "mean_token_accuracy": 0.9916023463010788, "num_tokens": 297510764.0, "step": 2802 }, { "entropy": 0.9605354964733124, "epoch": 6.385404789053592, "grad_norm": 0.85546875, "learning_rate": 1.5856456648515425e-06, "loss": 0.0269, "mean_token_accuracy": 0.9902220815420151, "num_tokens": 297617366.0, "step": 2803 }, { "entropy": 0.9649783223867416, "epoch": 6.387685290763968, "grad_norm": 0.88671875, "learning_rate": 1.5838929678184405e-06, "loss": 0.0273, "mean_token_accuracy": 0.9910926669836044, "num_tokens": 297722741.0, "step": 2804 }, { "entropy": 0.9623785465955734, "epoch": 6.389965792474344, "grad_norm": 1.1796875, "learning_rate": 1.5821407907500036e-06, "loss": 0.0383, "mean_token_accuracy": 0.9888468831777573, "num_tokens": 297829743.0, "step": 2805 }, { "entropy": 0.9646396636962891, "epoch": 6.39224629418472, "grad_norm": 0.9609375, "learning_rate": 1.5803891346407342e-06, "loss": 0.0299, "mean_token_accuracy": 0.9925766885280609, "num_tokens": 297936190.0, "step": 2806 }, { "entropy": 0.9603356122970581, "epoch": 6.394526795895097, "grad_norm": 1.0, "learning_rate": 1.5786380004848379e-06, "loss": 0.0267, "mean_token_accuracy": 0.9921542704105377, "num_tokens": 298042556.0, "step": 2807 }, { "entropy": 0.9635659158229828, "epoch": 6.3968072976054735, "grad_norm": 1.140625, "learning_rate": 1.576887389276226e-06, "loss": 0.0294, "mean_token_accuracy": 0.991036057472229, "num_tokens": 298149069.0, "step": 2808 }, { "entropy": 0.9619456976652145, "epoch": 6.39908779931585, "grad_norm": 1.125, "learning_rate": 1.5751373020085093e-06, "loss": 0.0351, "mean_token_accuracy": 0.989626869559288, "num_tokens": 298255469.0, "step": 2809 }, { "entropy": 0.9604221135377884, "epoch": 6.401368301026226, "grad_norm": 1.0546875, "learning_rate": 1.5733877396750051e-06, "loss": 0.0284, "mean_token_accuracy": 0.990064725279808, "num_tokens": 298361303.0, "step": 2810 }, { "entropy": 0.9632517248392105, "epoch": 6.403648802736602, "grad_norm": 1.234375, "learning_rate": 1.5716387032687314e-06, "loss": 0.0341, "mean_token_accuracy": 0.9890487790107727, "num_tokens": 298467336.0, "step": 2811 }, { "entropy": 0.9639025032520294, "epoch": 6.405929304446978, "grad_norm": 1.21875, "learning_rate": 1.5698901937824066e-06, "loss": 0.0332, "mean_token_accuracy": 0.9896437227725983, "num_tokens": 298573671.0, "step": 2812 }, { "entropy": 0.9611777365207672, "epoch": 6.4082098061573545, "grad_norm": 0.93359375, "learning_rate": 1.5681422122084522e-06, "loss": 0.0281, "mean_token_accuracy": 0.9924857020378113, "num_tokens": 298680893.0, "step": 2813 }, { "entropy": 0.9610385000705719, "epoch": 6.410490307867731, "grad_norm": 1.3125, "learning_rate": 1.5663947595389873e-06, "loss": 0.038, "mean_token_accuracy": 0.9882655143737793, "num_tokens": 298787664.0, "step": 2814 }, { "entropy": 0.9623655378818512, "epoch": 6.412770809578107, "grad_norm": 1.53125, "learning_rate": 1.5646478367658325e-06, "loss": 0.0436, "mean_token_accuracy": 0.9878936260938644, "num_tokens": 298893515.0, "step": 2815 }, { "entropy": 0.9599327594041824, "epoch": 6.415051311288483, "grad_norm": 1.2890625, "learning_rate": 1.562901444880508e-06, "loss": 0.0305, "mean_token_accuracy": 0.9901946634054184, "num_tokens": 298999865.0, "step": 2816 }, { "entropy": 0.9602803289890289, "epoch": 6.41733181299886, "grad_norm": 1.2421875, "learning_rate": 1.5611555848742318e-06, "loss": 0.0349, "mean_token_accuracy": 0.9890265017747879, "num_tokens": 299106275.0, "step": 2817 }, { "entropy": 0.9574746787548065, "epoch": 6.419612314709236, "grad_norm": 0.9375, "learning_rate": 1.5594102577379216e-06, "loss": 0.0259, "mean_token_accuracy": 0.9920108765363693, "num_tokens": 299212340.0, "step": 2818 }, { "entropy": 0.9690692573785782, "epoch": 6.4218928164196125, "grad_norm": 1.1328125, "learning_rate": 1.5576654644621897e-06, "loss": 0.032, "mean_token_accuracy": 0.9888695627450943, "num_tokens": 299318317.0, "step": 2819 }, { "entropy": 0.9684653133153915, "epoch": 6.424173318129989, "grad_norm": 0.97265625, "learning_rate": 1.5559212060373474e-06, "loss": 0.0241, "mean_token_accuracy": 0.9919718205928802, "num_tokens": 299424576.0, "step": 2820 }, { "entropy": 0.961356908082962, "epoch": 6.426453819840365, "grad_norm": 0.953125, "learning_rate": 1.5541774834534024e-06, "loss": 0.0278, "mean_token_accuracy": 0.9914631098508835, "num_tokens": 299530862.0, "step": 2821 }, { "entropy": 0.9632999747991562, "epoch": 6.428734321550741, "grad_norm": 1.1953125, "learning_rate": 1.5524342977000587e-06, "loss": 0.0349, "mean_token_accuracy": 0.9874798953533173, "num_tokens": 299637794.0, "step": 2822 }, { "entropy": 0.9616778939962387, "epoch": 6.431014823261117, "grad_norm": 1.21875, "learning_rate": 1.5506916497667134e-06, "loss": 0.042, "mean_token_accuracy": 0.987631693482399, "num_tokens": 299744345.0, "step": 2823 }, { "entropy": 0.9629386961460114, "epoch": 6.4332953249714935, "grad_norm": 0.98046875, "learning_rate": 1.5489495406424618e-06, "loss": 0.0272, "mean_token_accuracy": 0.9905762374401093, "num_tokens": 299851179.0, "step": 2824 }, { "entropy": 0.9631151258945465, "epoch": 6.43557582668187, "grad_norm": 1.359375, "learning_rate": 1.5472079713160892e-06, "loss": 0.0344, "mean_token_accuracy": 0.9899939298629761, "num_tokens": 299958607.0, "step": 2825 }, { "entropy": 0.962843731045723, "epoch": 6.437856328392247, "grad_norm": 1.046875, "learning_rate": 1.5454669427760774e-06, "loss": 0.0231, "mean_token_accuracy": 0.9923212081193924, "num_tokens": 300064268.0, "step": 2826 }, { "entropy": 0.96213099360466, "epoch": 6.440136830102623, "grad_norm": 1.1328125, "learning_rate": 1.5437264560106014e-06, "loss": 0.0347, "mean_token_accuracy": 0.9890454113483429, "num_tokens": 300170588.0, "step": 2827 }, { "entropy": 0.9596124440431595, "epoch": 6.442417331812999, "grad_norm": 1.15625, "learning_rate": 1.5419865120075267e-06, "loss": 0.026, "mean_token_accuracy": 0.9906197339296341, "num_tokens": 300276849.0, "step": 2828 }, { "entropy": 0.9628003835678101, "epoch": 6.444697833523375, "grad_norm": 1.3671875, "learning_rate": 1.5402471117544143e-06, "loss": 0.0359, "mean_token_accuracy": 0.9873076975345612, "num_tokens": 300382913.0, "step": 2829 }, { "entropy": 0.9636849761009216, "epoch": 6.4469783352337515, "grad_norm": 1.0546875, "learning_rate": 1.5385082562385112e-06, "loss": 0.0341, "mean_token_accuracy": 0.9882227629423141, "num_tokens": 300489751.0, "step": 2830 }, { "entropy": 0.9611945301294327, "epoch": 6.449258836944128, "grad_norm": 0.9765625, "learning_rate": 1.5367699464467596e-06, "loss": 0.0297, "mean_token_accuracy": 0.9894246906042099, "num_tokens": 300596047.0, "step": 2831 }, { "entropy": 0.9663524925708771, "epoch": 6.451539338654504, "grad_norm": 1.109375, "learning_rate": 1.5350321833657904e-06, "loss": 0.0286, "mean_token_accuracy": 0.9909536838531494, "num_tokens": 300702015.0, "step": 2832 }, { "entropy": 0.9583011269569397, "epoch": 6.45381984036488, "grad_norm": 1.2421875, "learning_rate": 1.5332949679819251e-06, "loss": 0.0366, "mean_token_accuracy": 0.9882856756448746, "num_tokens": 300808103.0, "step": 2833 }, { "entropy": 0.9690171480178833, "epoch": 6.456100342075256, "grad_norm": 1.0546875, "learning_rate": 1.531558301281173e-06, "loss": 0.0299, "mean_token_accuracy": 0.9896034598350525, "num_tokens": 300914706.0, "step": 2834 }, { "entropy": 0.9648149460554123, "epoch": 6.4583808437856325, "grad_norm": 0.99609375, "learning_rate": 1.5298221842492328e-06, "loss": 0.0256, "mean_token_accuracy": 0.9913268834352493, "num_tokens": 301020545.0, "step": 2835 }, { "entropy": 0.9623223096132278, "epoch": 6.460661345496009, "grad_norm": 1.75, "learning_rate": 1.5280866178714898e-06, "loss": 0.0382, "mean_token_accuracy": 0.9881086200475693, "num_tokens": 301127710.0, "step": 2836 }, { "entropy": 0.9646039754152298, "epoch": 6.462941847206386, "grad_norm": 1.3203125, "learning_rate": 1.5263516031330195e-06, "loss": 0.0323, "mean_token_accuracy": 0.9901058077812195, "num_tokens": 301234088.0, "step": 2837 }, { "entropy": 0.9645749926567078, "epoch": 6.465222348916762, "grad_norm": 1.265625, "learning_rate": 1.524617141018582e-06, "loss": 0.0329, "mean_token_accuracy": 0.9883266091346741, "num_tokens": 301340064.0, "step": 2838 }, { "entropy": 0.9606651216745377, "epoch": 6.467502850627138, "grad_norm": 0.92578125, "learning_rate": 1.5228832325126248e-06, "loss": 0.0322, "mean_token_accuracy": 0.9915178716182709, "num_tokens": 301446438.0, "step": 2839 }, { "entropy": 0.9615496844053268, "epoch": 6.469783352337514, "grad_norm": 0.921875, "learning_rate": 1.5211498785992818e-06, "loss": 0.0305, "mean_token_accuracy": 0.9915241003036499, "num_tokens": 301552517.0, "step": 2840 }, { "entropy": 0.9689502567052841, "epoch": 6.4720638540478905, "grad_norm": 1.0625, "learning_rate": 1.5194170802623692e-06, "loss": 0.034, "mean_token_accuracy": 0.9898267984390259, "num_tokens": 301658638.0, "step": 2841 }, { "entropy": 0.9701874852180481, "epoch": 6.474344355758267, "grad_norm": 1.15625, "learning_rate": 1.5176848384853913e-06, "loss": 0.031, "mean_token_accuracy": 0.9893524050712585, "num_tokens": 301764860.0, "step": 2842 }, { "entropy": 0.9627227336168289, "epoch": 6.476624857468643, "grad_norm": 0.95703125, "learning_rate": 1.515953154251535e-06, "loss": 0.0252, "mean_token_accuracy": 0.991218626499176, "num_tokens": 301870729.0, "step": 2843 }, { "entropy": 0.9638179689645767, "epoch": 6.478905359179019, "grad_norm": 1.1328125, "learning_rate": 1.5142220285436701e-06, "loss": 0.0298, "mean_token_accuracy": 0.9899030774831772, "num_tokens": 301976682.0, "step": 2844 }, { "entropy": 0.9601583778858185, "epoch": 6.481185860889395, "grad_norm": 1.046875, "learning_rate": 1.512491462344351e-06, "loss": 0.0254, "mean_token_accuracy": 0.9912155866622925, "num_tokens": 302082931.0, "step": 2845 }, { "entropy": 0.9648629277944565, "epoch": 6.483466362599772, "grad_norm": 1.0078125, "learning_rate": 1.5107614566358136e-06, "loss": 0.0321, "mean_token_accuracy": 0.9905120730400085, "num_tokens": 302189817.0, "step": 2846 }, { "entropy": 0.9679024368524551, "epoch": 6.485746864310149, "grad_norm": 0.859375, "learning_rate": 1.5090320123999746e-06, "loss": 0.023, "mean_token_accuracy": 0.9942043572664261, "num_tokens": 302295419.0, "step": 2847 }, { "entropy": 0.9666184931993484, "epoch": 6.488027366020525, "grad_norm": 0.91796875, "learning_rate": 1.5073031306184343e-06, "loss": 0.026, "mean_token_accuracy": 0.9915440380573273, "num_tokens": 302401363.0, "step": 2848 }, { "entropy": 0.9585491269826889, "epoch": 6.490307867730901, "grad_norm": 1.03125, "learning_rate": 1.5055748122724722e-06, "loss": 0.0338, "mean_token_accuracy": 0.9901338368654251, "num_tokens": 302507742.0, "step": 2849 }, { "entropy": 0.9630323350429535, "epoch": 6.492588369441277, "grad_norm": 0.88671875, "learning_rate": 1.5038470583430485e-06, "loss": 0.0247, "mean_token_accuracy": 0.991613045334816, "num_tokens": 302613819.0, "step": 2850 }, { "entropy": 0.9641955345869064, "epoch": 6.494868871151653, "grad_norm": 1.0078125, "learning_rate": 1.5021198698108038e-06, "loss": 0.0271, "mean_token_accuracy": 0.9902271777391434, "num_tokens": 302720175.0, "step": 2851 }, { "entropy": 0.9623762518167496, "epoch": 6.4971493728620295, "grad_norm": 0.8359375, "learning_rate": 1.5003932476560554e-06, "loss": 0.0232, "mean_token_accuracy": 0.9927068948745728, "num_tokens": 302825916.0, "step": 2852 }, { "entropy": 0.958920493721962, "epoch": 6.499429874572406, "grad_norm": 1.3359375, "learning_rate": 1.4986671928588016e-06, "loss": 0.0405, "mean_token_accuracy": 0.98631152510643, "num_tokens": 302932133.0, "step": 2853 }, { "entropy": 0.9656879603862762, "epoch": 6.501710376282782, "grad_norm": 1.09375, "learning_rate": 1.496941706398718e-06, "loss": 0.026, "mean_token_accuracy": 0.9908463209867477, "num_tokens": 303038514.0, "step": 2854 }, { "entropy": 0.9643964916467667, "epoch": 6.503990877993158, "grad_norm": 1.015625, "learning_rate": 1.495216789255156e-06, "loss": 0.0316, "mean_token_accuracy": 0.9892091155052185, "num_tokens": 303143931.0, "step": 2855 }, { "entropy": 0.967761218547821, "epoch": 6.506271379703534, "grad_norm": 1.1484375, "learning_rate": 1.4934924424071479e-06, "loss": 0.0309, "mean_token_accuracy": 0.9911254942417145, "num_tokens": 303249845.0, "step": 2856 }, { "entropy": 0.9621175974607468, "epoch": 6.508551881413911, "grad_norm": 0.98046875, "learning_rate": 1.4917686668333975e-06, "loss": 0.0335, "mean_token_accuracy": 0.9919760227203369, "num_tokens": 303355895.0, "step": 2857 }, { "entropy": 0.9624361544847488, "epoch": 6.510832383124288, "grad_norm": 1.0859375, "learning_rate": 1.4900454635122866e-06, "loss": 0.0362, "mean_token_accuracy": 0.9883544594049454, "num_tokens": 303461863.0, "step": 2858 }, { "entropy": 0.9659542143344879, "epoch": 6.513112884834664, "grad_norm": 1.0859375, "learning_rate": 1.4883228334218727e-06, "loss": 0.0322, "mean_token_accuracy": 0.9906762391328812, "num_tokens": 303567937.0, "step": 2859 }, { "entropy": 0.9677791148424149, "epoch": 6.51539338654504, "grad_norm": 1.1328125, "learning_rate": 1.4866007775398874e-06, "loss": 0.0324, "mean_token_accuracy": 0.9897576719522476, "num_tokens": 303674706.0, "step": 2860 }, { "epoch": 6.51539338654504, "eval_entropy": 0.9607415505235186, "eval_loss": 0.03769908472895622, "eval_mean_token_accuracy": 0.9885010553856767, "eval_num_tokens": 303674706.0, "eval_runtime": 66.0686, "eval_samples_per_second": 126.914, "eval_steps_per_second": 3.981, "step": 2860 }, { "entropy": 0.9558621048927307, "epoch": 6.517673888255416, "grad_norm": 0.984375, "learning_rate": 1.4848792968437376e-06, "loss": 0.0277, "mean_token_accuracy": 0.9920063763856888, "num_tokens": 303781691.0, "step": 2861 }, { "entropy": 0.9634474515914917, "epoch": 6.519954389965792, "grad_norm": 1.1171875, "learning_rate": 1.4831583923105e-06, "loss": 0.0316, "mean_token_accuracy": 0.9910600781440735, "num_tokens": 303888310.0, "step": 2862 }, { "entropy": 0.964483305811882, "epoch": 6.5222348916761685, "grad_norm": 1.1015625, "learning_rate": 1.481438064916928e-06, "loss": 0.0364, "mean_token_accuracy": 0.9878126680850983, "num_tokens": 303994230.0, "step": 2863 }, { "entropy": 0.961705282330513, "epoch": 6.524515393386545, "grad_norm": 1.046875, "learning_rate": 1.4797183156394462e-06, "loss": 0.0252, "mean_token_accuracy": 0.9911656677722931, "num_tokens": 304101387.0, "step": 2864 }, { "entropy": 0.9703675508499146, "epoch": 6.526795895096921, "grad_norm": 1.1015625, "learning_rate": 1.477999145454152e-06, "loss": 0.0319, "mean_token_accuracy": 0.9911032170057297, "num_tokens": 304207750.0, "step": 2865 }, { "entropy": 0.965502992272377, "epoch": 6.529076396807298, "grad_norm": 1.1953125, "learning_rate": 1.4762805553368115e-06, "loss": 0.0363, "mean_token_accuracy": 0.986082449555397, "num_tokens": 304313755.0, "step": 2866 }, { "entropy": 0.9726242274045944, "epoch": 6.531356898517674, "grad_norm": 1.2734375, "learning_rate": 1.4745625462628654e-06, "loss": 0.0414, "mean_token_accuracy": 0.9893353134393692, "num_tokens": 304420579.0, "step": 2867 }, { "entropy": 0.9599726647138596, "epoch": 6.53363740022805, "grad_norm": 1.0625, "learning_rate": 1.47284511920742e-06, "loss": 0.0361, "mean_token_accuracy": 0.9900783449411392, "num_tokens": 304526594.0, "step": 2868 }, { "entropy": 0.9591333866119385, "epoch": 6.535917901938427, "grad_norm": 1.03125, "learning_rate": 1.4711282751452549e-06, "loss": 0.0329, "mean_token_accuracy": 0.9902170598506927, "num_tokens": 304632704.0, "step": 2869 }, { "entropy": 0.9682103097438812, "epoch": 6.538198403648803, "grad_norm": 1.3828125, "learning_rate": 1.4694120150508179e-06, "loss": 0.0296, "mean_token_accuracy": 0.98924520611763, "num_tokens": 304739185.0, "step": 2870 }, { "entropy": 0.9625193476676941, "epoch": 6.540478905359179, "grad_norm": 1.328125, "learning_rate": 1.4676963398982248e-06, "loss": 0.0393, "mean_token_accuracy": 0.9890033453702927, "num_tokens": 304846239.0, "step": 2871 }, { "entropy": 0.964838296175003, "epoch": 6.542759407069555, "grad_norm": 1.078125, "learning_rate": 1.4659812506612608e-06, "loss": 0.027, "mean_token_accuracy": 0.9908052533864975, "num_tokens": 304952324.0, "step": 2872 }, { "entropy": 0.9681885689496994, "epoch": 6.545039908779931, "grad_norm": 1.109375, "learning_rate": 1.4642667483133753e-06, "loss": 0.0371, "mean_token_accuracy": 0.9904428869485855, "num_tokens": 305058101.0, "step": 2873 }, { "entropy": 0.9638442397117615, "epoch": 6.5473204104903076, "grad_norm": 1.0390625, "learning_rate": 1.4625528338276879e-06, "loss": 0.0325, "mean_token_accuracy": 0.9910483211278915, "num_tokens": 305164163.0, "step": 2874 }, { "entropy": 0.9675172716379166, "epoch": 6.549600912200685, "grad_norm": 1.15625, "learning_rate": 1.4608395081769833e-06, "loss": 0.0366, "mean_token_accuracy": 0.9890283644199371, "num_tokens": 305271064.0, "step": 2875 }, { "entropy": 0.9613495320081711, "epoch": 6.55188141391106, "grad_norm": 1.0390625, "learning_rate": 1.4591267723337122e-06, "loss": 0.035, "mean_token_accuracy": 0.9881925135850906, "num_tokens": 305377706.0, "step": 2876 }, { "entropy": 0.9697945863008499, "epoch": 6.554161915621437, "grad_norm": 1.1328125, "learning_rate": 1.4574146272699914e-06, "loss": 0.032, "mean_token_accuracy": 0.9888103902339935, "num_tokens": 305484039.0, "step": 2877 }, { "entropy": 0.9699305295944214, "epoch": 6.556442417331813, "grad_norm": 0.86328125, "learning_rate": 1.4557030739575988e-06, "loss": 0.0274, "mean_token_accuracy": 0.9906058311462402, "num_tokens": 305590473.0, "step": 2878 }, { "entropy": 0.968422532081604, "epoch": 6.558722919042189, "grad_norm": 1.203125, "learning_rate": 1.4539921133679808e-06, "loss": 0.032, "mean_token_accuracy": 0.9893057346343994, "num_tokens": 305697244.0, "step": 2879 }, { "entropy": 0.9680476635694504, "epoch": 6.561003420752566, "grad_norm": 0.99609375, "learning_rate": 1.4522817464722453e-06, "loss": 0.0281, "mean_token_accuracy": 0.991367906332016, "num_tokens": 305803470.0, "step": 2880 }, { "entropy": 0.9686400592327118, "epoch": 6.563283922462942, "grad_norm": 1.2109375, "learning_rate": 1.4505719742411644e-06, "loss": 0.0349, "mean_token_accuracy": 0.9896674305200577, "num_tokens": 305909633.0, "step": 2881 }, { "entropy": 0.9634786248207092, "epoch": 6.565564424173318, "grad_norm": 0.96875, "learning_rate": 1.44886279764517e-06, "loss": 0.0225, "mean_token_accuracy": 0.9923684895038605, "num_tokens": 306015402.0, "step": 2882 }, { "entropy": 0.9610666185617447, "epoch": 6.567844925883694, "grad_norm": 0.94921875, "learning_rate": 1.4471542176543587e-06, "loss": 0.0275, "mean_token_accuracy": 0.9910954385995865, "num_tokens": 306121241.0, "step": 2883 }, { "entropy": 0.9618506580591202, "epoch": 6.57012542759407, "grad_norm": 0.9765625, "learning_rate": 1.4454462352384885e-06, "loss": 0.0219, "mean_token_accuracy": 0.9943099468946457, "num_tokens": 306227986.0, "step": 2884 }, { "entropy": 0.9594457149505615, "epoch": 6.572405929304447, "grad_norm": 0.9140625, "learning_rate": 1.4437388513669754e-06, "loss": 0.0216, "mean_token_accuracy": 0.9922869503498077, "num_tokens": 306334228.0, "step": 2885 }, { "entropy": 0.9556462168693542, "epoch": 6.574686431014824, "grad_norm": 1.1953125, "learning_rate": 1.4420320670088977e-06, "loss": 0.0337, "mean_token_accuracy": 0.9877760112285614, "num_tokens": 306441757.0, "step": 2886 }, { "entropy": 0.9615198373794556, "epoch": 6.5769669327252, "grad_norm": 1.15625, "learning_rate": 1.4403258831329947e-06, "loss": 0.0312, "mean_token_accuracy": 0.9885658472776413, "num_tokens": 306547870.0, "step": 2887 }, { "entropy": 0.9695922583341599, "epoch": 6.579247434435576, "grad_norm": 1.09375, "learning_rate": 1.4386203007076632e-06, "loss": 0.026, "mean_token_accuracy": 0.9898249059915543, "num_tokens": 306654493.0, "step": 2888 }, { "entropy": 0.9677231758832932, "epoch": 6.581527936145952, "grad_norm": 1.296875, "learning_rate": 1.4369153207009573e-06, "loss": 0.0301, "mean_token_accuracy": 0.9902849644422531, "num_tokens": 306760745.0, "step": 2889 }, { "entropy": 0.9639592468738556, "epoch": 6.583808437856328, "grad_norm": 1.0234375, "learning_rate": 1.4352109440805917e-06, "loss": 0.0273, "mean_token_accuracy": 0.9921388775110245, "num_tokens": 306867177.0, "step": 2890 }, { "entropy": 0.967625081539154, "epoch": 6.586088939566705, "grad_norm": 0.96484375, "learning_rate": 1.4335071718139379e-06, "loss": 0.029, "mean_token_accuracy": 0.989590048789978, "num_tokens": 306973750.0, "step": 2891 }, { "entropy": 0.9676901996135712, "epoch": 6.588369441277081, "grad_norm": 1.0625, "learning_rate": 1.4318040048680238e-06, "loss": 0.022, "mean_token_accuracy": 0.9926696419715881, "num_tokens": 307080435.0, "step": 2892 }, { "entropy": 0.9627184718847275, "epoch": 6.590649942987457, "grad_norm": 1.1640625, "learning_rate": 1.430101444209535e-06, "loss": 0.0305, "mean_token_accuracy": 0.9910749346017838, "num_tokens": 307187126.0, "step": 2893 }, { "entropy": 0.9650626331567764, "epoch": 6.592930444697833, "grad_norm": 0.93359375, "learning_rate": 1.4283994908048107e-06, "loss": 0.0332, "mean_token_accuracy": 0.9897768050432205, "num_tokens": 307293349.0, "step": 2894 }, { "entropy": 0.9645390659570694, "epoch": 6.59521094640821, "grad_norm": 0.9140625, "learning_rate": 1.426698145619847e-06, "loss": 0.0228, "mean_token_accuracy": 0.9921887218952179, "num_tokens": 307399705.0, "step": 2895 }, { "entropy": 0.970082238316536, "epoch": 6.5974914481185865, "grad_norm": 1.1484375, "learning_rate": 1.424997409620295e-06, "loss": 0.0329, "mean_token_accuracy": 0.989900678396225, "num_tokens": 307506736.0, "step": 2896 }, { "entropy": 0.9633694142103195, "epoch": 6.599771949828963, "grad_norm": 1.078125, "learning_rate": 1.4232972837714598e-06, "loss": 0.0292, "mean_token_accuracy": 0.9902058094739914, "num_tokens": 307612814.0, "step": 2897 }, { "entropy": 0.9594677686691284, "epoch": 6.602052451539339, "grad_norm": 0.984375, "learning_rate": 1.4215977690382998e-06, "loss": 0.0291, "mean_token_accuracy": 0.9913697987794876, "num_tokens": 307719835.0, "step": 2898 }, { "entropy": 0.9690703749656677, "epoch": 6.604332953249715, "grad_norm": 0.9296875, "learning_rate": 1.4198988663854276e-06, "loss": 0.0305, "mean_token_accuracy": 0.9914468079805374, "num_tokens": 307825996.0, "step": 2899 }, { "entropy": 0.9617172926664352, "epoch": 6.606613454960091, "grad_norm": 1.1328125, "learning_rate": 1.4182005767771057e-06, "loss": 0.0323, "mean_token_accuracy": 0.9893198907375336, "num_tokens": 307932997.0, "step": 2900 }, { "entropy": 0.9660787582397461, "epoch": 6.608893956670467, "grad_norm": 0.93359375, "learning_rate": 1.4165029011772513e-06, "loss": 0.0252, "mean_token_accuracy": 0.992088258266449, "num_tokens": 308038864.0, "step": 2901 }, { "entropy": 0.9662402272224426, "epoch": 6.611174458380844, "grad_norm": 0.9375, "learning_rate": 1.4148058405494328e-06, "loss": 0.0241, "mean_token_accuracy": 0.9907518923282623, "num_tokens": 308145093.0, "step": 2902 }, { "entropy": 0.9623273462057114, "epoch": 6.61345496009122, "grad_norm": 0.86328125, "learning_rate": 1.4131093958568695e-06, "loss": 0.026, "mean_token_accuracy": 0.9918383061885834, "num_tokens": 308251509.0, "step": 2903 }, { "entropy": 0.9604493826627731, "epoch": 6.615735461801596, "grad_norm": 1.1640625, "learning_rate": 1.4114135680624291e-06, "loss": 0.0332, "mean_token_accuracy": 0.9893151223659515, "num_tokens": 308357549.0, "step": 2904 }, { "entropy": 0.9629510790109634, "epoch": 6.618015963511972, "grad_norm": 1.0546875, "learning_rate": 1.4097183581286322e-06, "loss": 0.0342, "mean_token_accuracy": 0.9879812449216843, "num_tokens": 308463331.0, "step": 2905 }, { "entropy": 0.9647343307733536, "epoch": 6.620296465222349, "grad_norm": 1.2734375, "learning_rate": 1.4080237670176456e-06, "loss": 0.0342, "mean_token_accuracy": 0.9870759695768356, "num_tokens": 308569971.0, "step": 2906 }, { "entropy": 0.9566651284694672, "epoch": 6.6225769669327255, "grad_norm": 1.0234375, "learning_rate": 1.4063297956912875e-06, "loss": 0.0277, "mean_token_accuracy": 0.9921025782823563, "num_tokens": 308676473.0, "step": 2907 }, { "entropy": 0.9704433381557465, "epoch": 6.624857468643102, "grad_norm": 0.92578125, "learning_rate": 1.4046364451110234e-06, "loss": 0.0227, "mean_token_accuracy": 0.9922423213720322, "num_tokens": 308782383.0, "step": 2908 }, { "entropy": 0.9654195010662079, "epoch": 6.627137970353478, "grad_norm": 1.1875, "learning_rate": 1.4029437162379666e-06, "loss": 0.0321, "mean_token_accuracy": 0.9899304807186127, "num_tokens": 308888213.0, "step": 2909 }, { "entropy": 0.9611908942461014, "epoch": 6.629418472063854, "grad_norm": 1.0078125, "learning_rate": 1.4012516100328766e-06, "loss": 0.0275, "mean_token_accuracy": 0.9918880313634872, "num_tokens": 308994261.0, "step": 2910 }, { "entropy": 0.9574006348848343, "epoch": 6.63169897377423, "grad_norm": 1.1328125, "learning_rate": 1.3995601274561605e-06, "loss": 0.0301, "mean_token_accuracy": 0.991576224565506, "num_tokens": 309101246.0, "step": 2911 }, { "entropy": 0.9635819792747498, "epoch": 6.633979475484606, "grad_norm": 1.3046875, "learning_rate": 1.3978692694678711e-06, "loss": 0.0349, "mean_token_accuracy": 0.9868823736906052, "num_tokens": 309207338.0, "step": 2912 }, { "entropy": 0.9639009088277817, "epoch": 6.636259977194983, "grad_norm": 0.93359375, "learning_rate": 1.3961790370277068e-06, "loss": 0.0276, "mean_token_accuracy": 0.9915954619646072, "num_tokens": 309313398.0, "step": 2913 }, { "entropy": 0.9668000042438507, "epoch": 6.638540478905359, "grad_norm": 1.4765625, "learning_rate": 1.3944894310950113e-06, "loss": 0.0336, "mean_token_accuracy": 0.9895083010196686, "num_tokens": 309420137.0, "step": 2914 }, { "entropy": 0.9627787321805954, "epoch": 6.640820980615736, "grad_norm": 1.0625, "learning_rate": 1.3928004526287729e-06, "loss": 0.0285, "mean_token_accuracy": 0.9898115396499634, "num_tokens": 309526221.0, "step": 2915 }, { "entropy": 0.9581753611564636, "epoch": 6.643101482326112, "grad_norm": 1.0546875, "learning_rate": 1.3911121025876212e-06, "loss": 0.03, "mean_token_accuracy": 0.9906202405691147, "num_tokens": 309632372.0, "step": 2916 }, { "entropy": 0.9642494171857834, "epoch": 6.645381984036488, "grad_norm": 1.2265625, "learning_rate": 1.389424381929832e-06, "loss": 0.0325, "mean_token_accuracy": 0.9899601340293884, "num_tokens": 309738708.0, "step": 2917 }, { "entropy": 0.9598596841096878, "epoch": 6.6476624857468645, "grad_norm": 1.40625, "learning_rate": 1.3877372916133234e-06, "loss": 0.0413, "mean_token_accuracy": 0.9870982319116592, "num_tokens": 309844815.0, "step": 2918 }, { "entropy": 0.9629800170660019, "epoch": 6.649942987457241, "grad_norm": 1.1796875, "learning_rate": 1.3860508325956549e-06, "loss": 0.0325, "mean_token_accuracy": 0.9892289340496063, "num_tokens": 309951012.0, "step": 2919 }, { "entropy": 0.9666880369186401, "epoch": 6.652223489167617, "grad_norm": 1.0390625, "learning_rate": 1.3843650058340291e-06, "loss": 0.0233, "mean_token_accuracy": 0.9924325942993164, "num_tokens": 310057388.0, "step": 2920 }, { "entropy": 0.9668467044830322, "epoch": 6.654503990877993, "grad_norm": 0.9765625, "learning_rate": 1.382679812285287e-06, "loss": 0.031, "mean_token_accuracy": 0.9907945692539215, "num_tokens": 310164071.0, "step": 2921 }, { "entropy": 0.9576689004898071, "epoch": 6.656784492588369, "grad_norm": 1.1015625, "learning_rate": 1.3809952529059127e-06, "loss": 0.0226, "mean_token_accuracy": 0.9931910336017609, "num_tokens": 310270286.0, "step": 2922 }, { "entropy": 0.9640212804079056, "epoch": 6.659064994298745, "grad_norm": 0.98828125, "learning_rate": 1.3793113286520293e-06, "loss": 0.0297, "mean_token_accuracy": 0.991049125790596, "num_tokens": 310377020.0, "step": 2923 }, { "entropy": 0.9686090797185898, "epoch": 6.661345496009122, "grad_norm": 1.078125, "learning_rate": 1.3776280404794016e-06, "loss": 0.0315, "mean_token_accuracy": 0.9907583594322205, "num_tokens": 310483372.0, "step": 2924 }, { "entropy": 0.964018777012825, "epoch": 6.663625997719498, "grad_norm": 1.046875, "learning_rate": 1.3759453893434285e-06, "loss": 0.0258, "mean_token_accuracy": 0.9928009212017059, "num_tokens": 310589384.0, "step": 2925 }, { "entropy": 0.9637644737958908, "epoch": 6.665906499429875, "grad_norm": 1.0234375, "learning_rate": 1.3742633761991519e-06, "loss": 0.0267, "mean_token_accuracy": 0.9907101094722748, "num_tokens": 310695804.0, "step": 2926 }, { "entropy": 0.9642823338508606, "epoch": 6.668187001140251, "grad_norm": 0.95703125, "learning_rate": 1.3725820020012506e-06, "loss": 0.0305, "mean_token_accuracy": 0.9905742108821869, "num_tokens": 310802498.0, "step": 2927 }, { "entropy": 0.9604359716176987, "epoch": 6.670467502850627, "grad_norm": 0.91015625, "learning_rate": 1.3709012677040385e-06, "loss": 0.0269, "mean_token_accuracy": 0.9895476400852203, "num_tokens": 310908925.0, "step": 2928 }, { "entropy": 0.9687705487012863, "epoch": 6.6727480045610035, "grad_norm": 0.9765625, "learning_rate": 1.3692211742614686e-06, "loss": 0.0282, "mean_token_accuracy": 0.9907910823822021, "num_tokens": 311014791.0, "step": 2929 }, { "entropy": 0.9611459225416183, "epoch": 6.67502850627138, "grad_norm": 1.0859375, "learning_rate": 1.3675417226271298e-06, "loss": 0.0345, "mean_token_accuracy": 0.9888648092746735, "num_tokens": 311121788.0, "step": 2930 }, { "entropy": 0.9618968367576599, "epoch": 6.677309007981756, "grad_norm": 1.375, "learning_rate": 1.365862913754247e-06, "loss": 0.0325, "mean_token_accuracy": 0.9906362742185593, "num_tokens": 311228649.0, "step": 2931 }, { "entropy": 0.9644518196582794, "epoch": 6.679589509692132, "grad_norm": 1.46875, "learning_rate": 1.3641847485956782e-06, "loss": 0.0358, "mean_token_accuracy": 0.9905315786600113, "num_tokens": 311334781.0, "step": 2932 }, { "entropy": 0.9617306888103485, "epoch": 6.681870011402508, "grad_norm": 1.1015625, "learning_rate": 1.362507228103918e-06, "loss": 0.0327, "mean_token_accuracy": 0.9897153079509735, "num_tokens": 311441328.0, "step": 2933 }, { "entropy": 0.9683189243078232, "epoch": 6.684150513112884, "grad_norm": 0.98828125, "learning_rate": 1.3608303532310956e-06, "loss": 0.0264, "mean_token_accuracy": 0.9905821830034256, "num_tokens": 311547471.0, "step": 2934 }, { "entropy": 0.9593832492828369, "epoch": 6.6864310148232615, "grad_norm": 0.83984375, "learning_rate": 1.3591541249289718e-06, "loss": 0.0268, "mean_token_accuracy": 0.99069644510746, "num_tokens": 311654281.0, "step": 2935 }, { "entropy": 0.9673309326171875, "epoch": 6.688711516533638, "grad_norm": 1.203125, "learning_rate": 1.357478544148943e-06, "loss": 0.0335, "mean_token_accuracy": 0.9880859106779099, "num_tokens": 311761217.0, "step": 2936 }, { "entropy": 0.9613813310861588, "epoch": 6.690992018244014, "grad_norm": 0.9765625, "learning_rate": 1.3558036118420343e-06, "loss": 0.029, "mean_token_accuracy": 0.9924673438072205, "num_tokens": 311867480.0, "step": 2937 }, { "entropy": 0.9589533656835556, "epoch": 6.69327251995439, "grad_norm": 1.203125, "learning_rate": 1.3541293289589058e-06, "loss": 0.0325, "mean_token_accuracy": 0.9897895604372025, "num_tokens": 311974244.0, "step": 2938 }, { "entropy": 0.9614315330982208, "epoch": 6.695553021664766, "grad_norm": 0.98828125, "learning_rate": 1.3524556964498482e-06, "loss": 0.0264, "mean_token_accuracy": 0.9917001724243164, "num_tokens": 312080694.0, "step": 2939 }, { "entropy": 0.9660706669092178, "epoch": 6.6978335233751425, "grad_norm": 0.8984375, "learning_rate": 1.3507827152647835e-06, "loss": 0.0234, "mean_token_accuracy": 0.9920610189437866, "num_tokens": 312187106.0, "step": 2940 }, { "entropy": 0.9596981853246689, "epoch": 6.700114025085519, "grad_norm": 1.1875, "learning_rate": 1.3491103863532626e-06, "loss": 0.0391, "mean_token_accuracy": 0.9879145622253418, "num_tokens": 312293540.0, "step": 2941 }, { "entropy": 0.9659764915704727, "epoch": 6.702394526795895, "grad_norm": 1.2265625, "learning_rate": 1.3474387106644688e-06, "loss": 0.0357, "mean_token_accuracy": 0.9878545254468918, "num_tokens": 312399918.0, "step": 2942 }, { "entropy": 0.961578980088234, "epoch": 6.704675028506271, "grad_norm": 1.3125, "learning_rate": 1.345767689147211e-06, "loss": 0.0389, "mean_token_accuracy": 0.9882629066705704, "num_tokens": 312506335.0, "step": 2943 }, { "entropy": 0.9681281596422195, "epoch": 6.706955530216648, "grad_norm": 0.97265625, "learning_rate": 1.3440973227499293e-06, "loss": 0.0294, "mean_token_accuracy": 0.9909478425979614, "num_tokens": 312612444.0, "step": 2944 }, { "entropy": 0.9584186673164368, "epoch": 6.7092360319270234, "grad_norm": 0.83984375, "learning_rate": 1.3424276124206917e-06, "loss": 0.0198, "mean_token_accuracy": 0.9940358996391296, "num_tokens": 312718528.0, "step": 2945 }, { "entropy": 0.9620232284069061, "epoch": 6.7115165336374005, "grad_norm": 1.34375, "learning_rate": 1.3407585591071944e-06, "loss": 0.0415, "mean_token_accuracy": 0.9875925332307816, "num_tokens": 312824224.0, "step": 2946 }, { "entropy": 0.9602758288383484, "epoch": 6.713797035347777, "grad_norm": 1.125, "learning_rate": 1.3390901637567579e-06, "loss": 0.0331, "mean_token_accuracy": 0.9888793081045151, "num_tokens": 312930859.0, "step": 2947 }, { "entropy": 0.9600441455841064, "epoch": 6.716077537058153, "grad_norm": 1.2734375, "learning_rate": 1.3374224273163334e-06, "loss": 0.0407, "mean_token_accuracy": 0.9865812361240387, "num_tokens": 313036935.0, "step": 2948 }, { "entropy": 0.9578616768121719, "epoch": 6.718358038768529, "grad_norm": 1.234375, "learning_rate": 1.3357553507324938e-06, "loss": 0.0299, "mean_token_accuracy": 0.9899198412895203, "num_tokens": 313142844.0, "step": 2949 }, { "entropy": 0.9602793455123901, "epoch": 6.720638540478905, "grad_norm": 1.0234375, "learning_rate": 1.3340889349514403e-06, "loss": 0.0342, "mean_token_accuracy": 0.9889950603246689, "num_tokens": 313248866.0, "step": 2950 }, { "entropy": 0.9640858620405197, "epoch": 6.7229190421892815, "grad_norm": 1.046875, "learning_rate": 1.3324231809189985e-06, "loss": 0.0264, "mean_token_accuracy": 0.9900582581758499, "num_tokens": 313354873.0, "step": 2951 }, { "entropy": 0.9591253846883774, "epoch": 6.725199543899658, "grad_norm": 0.90625, "learning_rate": 1.3307580895806194e-06, "loss": 0.0295, "mean_token_accuracy": 0.9905223697423935, "num_tokens": 313461168.0, "step": 2952 }, { "entropy": 0.9631872922182083, "epoch": 6.727480045610034, "grad_norm": 1.0, "learning_rate": 1.3290936618813747e-06, "loss": 0.021, "mean_token_accuracy": 0.9941126704216003, "num_tokens": 313567333.0, "step": 2953 }, { "entropy": 0.9638652354478836, "epoch": 6.72976054732041, "grad_norm": 1.1171875, "learning_rate": 1.327429898765962e-06, "loss": 0.031, "mean_token_accuracy": 0.9887006729841232, "num_tokens": 313673598.0, "step": 2954 }, { "entropy": 0.9634290039539337, "epoch": 6.732041049030787, "grad_norm": 0.9921875, "learning_rate": 1.3257668011787018e-06, "loss": 0.0243, "mean_token_accuracy": 0.9934560060501099, "num_tokens": 313779876.0, "step": 2955 }, { "entropy": 0.9639912396669388, "epoch": 6.734321550741163, "grad_norm": 0.79296875, "learning_rate": 1.3241043700635352e-06, "loss": 0.0229, "mean_token_accuracy": 0.9932168275117874, "num_tokens": 313885837.0, "step": 2956 }, { "entropy": 0.966426208615303, "epoch": 6.7366020524515395, "grad_norm": 1.3515625, "learning_rate": 1.3224426063640272e-06, "loss": 0.0316, "mean_token_accuracy": 0.9911061674356461, "num_tokens": 313992113.0, "step": 2957 }, { "entropy": 0.9618357419967651, "epoch": 6.738882554161916, "grad_norm": 1.0625, "learning_rate": 1.320781511023363e-06, "loss": 0.0285, "mean_token_accuracy": 0.9917110055685043, "num_tokens": 314099041.0, "step": 2958 }, { "entropy": 0.9603020548820496, "epoch": 6.741163055872292, "grad_norm": 1.1328125, "learning_rate": 1.3191210849843461e-06, "loss": 0.0353, "mean_token_accuracy": 0.9880330711603165, "num_tokens": 314206298.0, "step": 2959 }, { "entropy": 0.9646539986133575, "epoch": 6.743443557582668, "grad_norm": 1.2734375, "learning_rate": 1.3174613291894039e-06, "loss": 0.0302, "mean_token_accuracy": 0.9914537519216537, "num_tokens": 314312606.0, "step": 2960 }, { "entropy": 0.9575569331645966, "epoch": 6.745724059293044, "grad_norm": 1.078125, "learning_rate": 1.3158022445805816e-06, "loss": 0.0378, "mean_token_accuracy": 0.986927330493927, "num_tokens": 314418972.0, "step": 2961 }, { "entropy": 0.9609543234109879, "epoch": 6.7480045610034205, "grad_norm": 1.015625, "learning_rate": 1.3141438320995433e-06, "loss": 0.0319, "mean_token_accuracy": 0.9898447692394257, "num_tokens": 314524996.0, "step": 2962 }, { "entropy": 0.9595657885074615, "epoch": 6.750285062713797, "grad_norm": 1.2265625, "learning_rate": 1.3124860926875732e-06, "loss": 0.0262, "mean_token_accuracy": 0.9912229031324387, "num_tokens": 314631535.0, "step": 2963 }, { "entropy": 0.9575449377298355, "epoch": 6.752565564424174, "grad_norm": 1.0859375, "learning_rate": 1.3108290272855697e-06, "loss": 0.0342, "mean_token_accuracy": 0.9877715408802032, "num_tokens": 314738168.0, "step": 2964 }, { "entropy": 0.9710715711116791, "epoch": 6.75484606613455, "grad_norm": 0.98046875, "learning_rate": 1.309172636834053e-06, "loss": 0.0273, "mean_token_accuracy": 0.991586372256279, "num_tokens": 314844907.0, "step": 2965 }, { "entropy": 0.9662147462368011, "epoch": 6.757126567844926, "grad_norm": 1.1015625, "learning_rate": 1.3075169222731573e-06, "loss": 0.033, "mean_token_accuracy": 0.9909979999065399, "num_tokens": 314951525.0, "step": 2966 }, { "entropy": 0.9631311595439911, "epoch": 6.759407069555302, "grad_norm": 1.265625, "learning_rate": 1.305861884542636e-06, "loss": 0.0346, "mean_token_accuracy": 0.9896212220191956, "num_tokens": 315057618.0, "step": 2967 }, { "entropy": 0.9548712223768234, "epoch": 6.7616875712656785, "grad_norm": 1.09375, "learning_rate": 1.3042075245818542e-06, "loss": 0.0402, "mean_token_accuracy": 0.9876781404018402, "num_tokens": 315163869.0, "step": 2968 }, { "entropy": 0.9621338099241257, "epoch": 6.763968072976055, "grad_norm": 0.96484375, "learning_rate": 1.3025538433297957e-06, "loss": 0.0319, "mean_token_accuracy": 0.9904128909111023, "num_tokens": 315270763.0, "step": 2969 }, { "entropy": 0.9616038054227829, "epoch": 6.766248574686431, "grad_norm": 1.0859375, "learning_rate": 1.3009008417250597e-06, "loss": 0.0279, "mean_token_accuracy": 0.9903973788022995, "num_tokens": 315377155.0, "step": 2970 }, { "entropy": 0.9676768183708191, "epoch": 6.768529076396807, "grad_norm": 0.8984375, "learning_rate": 1.2992485207058548e-06, "loss": 0.0229, "mean_token_accuracy": 0.9930784553289413, "num_tokens": 315483130.0, "step": 2971 }, { "entropy": 0.9648289382457733, "epoch": 6.770809578107183, "grad_norm": 1.0546875, "learning_rate": 1.2975968812100081e-06, "loss": 0.0317, "mean_token_accuracy": 0.9871583878993988, "num_tokens": 315589551.0, "step": 2972 }, { "entropy": 0.9672328233718872, "epoch": 6.7730900798175595, "grad_norm": 0.9765625, "learning_rate": 1.295945924174959e-06, "loss": 0.0238, "mean_token_accuracy": 0.9923983514308929, "num_tokens": 315695351.0, "step": 2973 }, { "entropy": 0.9616836458444595, "epoch": 6.775370581527936, "grad_norm": 0.87890625, "learning_rate": 1.2942956505377585e-06, "loss": 0.0245, "mean_token_accuracy": 0.9917676448822021, "num_tokens": 315801401.0, "step": 2974 }, { "entropy": 0.964852899312973, "epoch": 6.777651083238313, "grad_norm": 1.2734375, "learning_rate": 1.2926460612350688e-06, "loss": 0.0385, "mean_token_accuracy": 0.9876869469881058, "num_tokens": 315907759.0, "step": 2975 }, { "entropy": 0.9594565629959106, "epoch": 6.779931584948689, "grad_norm": 0.99609375, "learning_rate": 1.2909971572031663e-06, "loss": 0.0293, "mean_token_accuracy": 0.9919818043708801, "num_tokens": 316014678.0, "step": 2976 }, { "entropy": 0.9617218524217606, "epoch": 6.782212086659065, "grad_norm": 0.77734375, "learning_rate": 1.2893489393779362e-06, "loss": 0.0161, "mean_token_accuracy": 0.9955779016017914, "num_tokens": 316120857.0, "step": 2977 }, { "entropy": 0.9652404636144638, "epoch": 6.784492588369441, "grad_norm": 1.1640625, "learning_rate": 1.2877014086948762e-06, "loss": 0.0334, "mean_token_accuracy": 0.9894522875547409, "num_tokens": 316227026.0, "step": 2978 }, { "entropy": 0.9685012698173523, "epoch": 6.7867730900798175, "grad_norm": 1.234375, "learning_rate": 1.2860545660890928e-06, "loss": 0.0251, "mean_token_accuracy": 0.9916336685419083, "num_tokens": 316333055.0, "step": 2979 }, { "entropy": 0.9650068283081055, "epoch": 6.789053591790194, "grad_norm": 0.984375, "learning_rate": 1.2844084124953006e-06, "loss": 0.0294, "mean_token_accuracy": 0.9911601096391678, "num_tokens": 316439536.0, "step": 2980 }, { "entropy": 0.9617577493190765, "epoch": 6.79133409350057, "grad_norm": 1.1015625, "learning_rate": 1.2827629488478254e-06, "loss": 0.0259, "mean_token_accuracy": 0.9913628846406937, "num_tokens": 316546205.0, "step": 2981 }, { "entropy": 0.9635928124189377, "epoch": 6.793614595210946, "grad_norm": 1.0859375, "learning_rate": 1.2811181760806013e-06, "loss": 0.0425, "mean_token_accuracy": 0.9888466745615005, "num_tokens": 316652173.0, "step": 2982 }, { "entropy": 0.9677512496709824, "epoch": 6.795895096921322, "grad_norm": 1.28125, "learning_rate": 1.2794740951271686e-06, "loss": 0.0326, "mean_token_accuracy": 0.9895225763320923, "num_tokens": 316758007.0, "step": 2983 }, { "entropy": 0.9554256200790405, "epoch": 6.798175598631699, "grad_norm": 1.1796875, "learning_rate": 1.2778307069206764e-06, "loss": 0.0333, "mean_token_accuracy": 0.989355742931366, "num_tokens": 316864149.0, "step": 2984 }, { "entropy": 0.9636441767215729, "epoch": 6.800456100342076, "grad_norm": 1.2734375, "learning_rate": 1.2761880123938814e-06, "loss": 0.0352, "mean_token_accuracy": 0.989696204662323, "num_tokens": 316971185.0, "step": 2985 }, { "entropy": 0.9708690792322159, "epoch": 6.802736602052452, "grad_norm": 1.171875, "learning_rate": 1.2745460124791425e-06, "loss": 0.0312, "mean_token_accuracy": 0.989282876253128, "num_tokens": 317078091.0, "step": 2986 }, { "entropy": 0.9648498147726059, "epoch": 6.805017103762828, "grad_norm": 1.03125, "learning_rate": 1.272904708108429e-06, "loss": 0.024, "mean_token_accuracy": 0.9929722249507904, "num_tokens": 317183679.0, "step": 2987 }, { "entropy": 0.9643113166093826, "epoch": 6.807297605473204, "grad_norm": 1.2109375, "learning_rate": 1.2712641002133128e-06, "loss": 0.0379, "mean_token_accuracy": 0.9907848984003067, "num_tokens": 317289378.0, "step": 2988 }, { "entropy": 0.9609562903642654, "epoch": 6.80957810718358, "grad_norm": 1.234375, "learning_rate": 1.2696241897249728e-06, "loss": 0.0371, "mean_token_accuracy": 0.990398496389389, "num_tokens": 317395372.0, "step": 2989 }, { "entropy": 0.9702621847391129, "epoch": 6.811858608893957, "grad_norm": 0.9921875, "learning_rate": 1.2679849775741884e-06, "loss": 0.0279, "mean_token_accuracy": 0.9913286417722702, "num_tokens": 317501776.0, "step": 2990 }, { "entropy": 0.9651257246732712, "epoch": 6.814139110604333, "grad_norm": 1.015625, "learning_rate": 1.266346464691346e-06, "loss": 0.0238, "mean_token_accuracy": 0.9921526908874512, "num_tokens": 317607860.0, "step": 2991 }, { "entropy": 0.9679255783557892, "epoch": 6.816419612314709, "grad_norm": 1.2734375, "learning_rate": 1.2647086520064343e-06, "loss": 0.0415, "mean_token_accuracy": 0.9881050437688828, "num_tokens": 317714379.0, "step": 2992 }, { "entropy": 0.9604811817407608, "epoch": 6.818700114025085, "grad_norm": 0.99609375, "learning_rate": 1.2630715404490424e-06, "loss": 0.0184, "mean_token_accuracy": 0.9930875450372696, "num_tokens": 317820383.0, "step": 2993 }, { "entropy": 0.9617069214582443, "epoch": 6.820980615735461, "grad_norm": 1.140625, "learning_rate": 1.2614351309483646e-06, "loss": 0.0322, "mean_token_accuracy": 0.9885686039924622, "num_tokens": 317927498.0, "step": 2994 }, { "entropy": 0.9655987024307251, "epoch": 6.823261117445838, "grad_norm": 1.0390625, "learning_rate": 1.259799424433196e-06, "loss": 0.0308, "mean_token_accuracy": 0.9901165515184402, "num_tokens": 318033305.0, "step": 2995 }, { "entropy": 0.9631451964378357, "epoch": 6.825541619156215, "grad_norm": 1.03125, "learning_rate": 1.25816442183193e-06, "loss": 0.0285, "mean_token_accuracy": 0.9897834956645966, "num_tokens": 318139546.0, "step": 2996 }, { "entropy": 0.9630729705095291, "epoch": 6.827822120866591, "grad_norm": 1.15625, "learning_rate": 1.2565301240725636e-06, "loss": 0.0368, "mean_token_accuracy": 0.9874976575374603, "num_tokens": 318245626.0, "step": 2997 }, { "entropy": 0.9589530825614929, "epoch": 6.830102622576967, "grad_norm": 0.98828125, "learning_rate": 1.2548965320826928e-06, "loss": 0.0242, "mean_token_accuracy": 0.9921260625123978, "num_tokens": 318351789.0, "step": 2998 }, { "entropy": 0.9634335041046143, "epoch": 6.832383124287343, "grad_norm": 1.15625, "learning_rate": 1.2532636467895126e-06, "loss": 0.0395, "mean_token_accuracy": 0.9881116896867752, "num_tokens": 318457927.0, "step": 2999 }, { "entropy": 0.9621726423501968, "epoch": 6.834663625997719, "grad_norm": 1.078125, "learning_rate": 1.2516314691198172e-06, "loss": 0.0334, "mean_token_accuracy": 0.988776296377182, "num_tokens": 318564430.0, "step": 3000 }, { "entropy": 0.9613594114780426, "epoch": 6.836944127708096, "grad_norm": 1.0625, "learning_rate": 1.2500000000000007e-06, "loss": 0.0329, "mean_token_accuracy": 0.9866965413093567, "num_tokens": 318670266.0, "step": 3001 }, { "entropy": 0.9587560296058655, "epoch": 6.839224629418472, "grad_norm": 0.97265625, "learning_rate": 1.2483692403560507e-06, "loss": 0.0299, "mean_token_accuracy": 0.9919214397668839, "num_tokens": 318777709.0, "step": 3002 }, { "entropy": 0.9688235074281693, "epoch": 6.841505131128848, "grad_norm": 0.9921875, "learning_rate": 1.2467391911135562e-06, "loss": 0.0312, "mean_token_accuracy": 0.9889025688171387, "num_tokens": 318883469.0, "step": 3003 }, { "entropy": 0.9606591612100601, "epoch": 6.843785632839225, "grad_norm": 1.1640625, "learning_rate": 1.2451098531977015e-06, "loss": 0.0338, "mean_token_accuracy": 0.989453986287117, "num_tokens": 318990197.0, "step": 3004 }, { "entropy": 0.9570139199495316, "epoch": 6.846066134549601, "grad_norm": 0.890625, "learning_rate": 1.2434812275332678e-06, "loss": 0.0233, "mean_token_accuracy": 0.9915159493684769, "num_tokens": 319096467.0, "step": 3005 }, { "entropy": 0.9614800661802292, "epoch": 6.848346636259977, "grad_norm": 0.921875, "learning_rate": 1.2418533150446324e-06, "loss": 0.0275, "mean_token_accuracy": 0.990804448723793, "num_tokens": 319202813.0, "step": 3006 }, { "entropy": 0.9650447964668274, "epoch": 6.850627137970354, "grad_norm": 1.0625, "learning_rate": 1.2402261166557647e-06, "loss": 0.0321, "mean_token_accuracy": 0.9917020946741104, "num_tokens": 319308462.0, "step": 3007 }, { "entropy": 0.965262696146965, "epoch": 6.85290763968073, "grad_norm": 1.1796875, "learning_rate": 1.2385996332902326e-06, "loss": 0.0348, "mean_token_accuracy": 0.990638867020607, "num_tokens": 319414591.0, "step": 3008 }, { "entropy": 0.9638961851596832, "epoch": 6.855188141391106, "grad_norm": 0.94921875, "learning_rate": 1.236973865871196e-06, "loss": 0.0242, "mean_token_accuracy": 0.9920720458030701, "num_tokens": 319520526.0, "step": 3009 }, { "entropy": 0.9658631086349487, "epoch": 6.857468643101482, "grad_norm": 1.265625, "learning_rate": 1.2353488153214096e-06, "loss": 0.0347, "mean_token_accuracy": 0.9904117286205292, "num_tokens": 319627382.0, "step": 3010 }, { "entropy": 0.9608684629201889, "epoch": 6.859749144811858, "grad_norm": 0.9296875, "learning_rate": 1.2337244825632217e-06, "loss": 0.0228, "mean_token_accuracy": 0.9936680942773819, "num_tokens": 319733842.0, "step": 3011 }, { "entropy": 0.9650074541568756, "epoch": 6.862029646522235, "grad_norm": 1.15625, "learning_rate": 1.2321008685185699e-06, "loss": 0.0338, "mean_token_accuracy": 0.9888368248939514, "num_tokens": 319838978.0, "step": 3012 }, { "entropy": 0.9591619670391083, "epoch": 6.864310148232612, "grad_norm": 0.98828125, "learning_rate": 1.2304779741089884e-06, "loss": 0.0311, "mean_token_accuracy": 0.9892920702695847, "num_tokens": 319945452.0, "step": 3013 }, { "entropy": 0.9665842801332474, "epoch": 6.866590649942988, "grad_norm": 1.2578125, "learning_rate": 1.228855800255599e-06, "loss": 0.0323, "mean_token_accuracy": 0.9894215762615204, "num_tokens": 320051277.0, "step": 3014 }, { "entropy": 0.9679447561502457, "epoch": 6.868871151653364, "grad_norm": 1.0234375, "learning_rate": 1.2272343478791165e-06, "loss": 0.0264, "mean_token_accuracy": 0.9926391988992691, "num_tokens": 320157633.0, "step": 3015 }, { "entropy": 0.9601188749074936, "epoch": 6.87115165336374, "grad_norm": 1.0546875, "learning_rate": 1.2256136178998468e-06, "loss": 0.0351, "mean_token_accuracy": 0.988796204328537, "num_tokens": 320264674.0, "step": 3016 }, { "entropy": 0.9686760902404785, "epoch": 6.873432155074116, "grad_norm": 1.3984375, "learning_rate": 1.2239936112376858e-06, "loss": 0.0359, "mean_token_accuracy": 0.9883907586336136, "num_tokens": 320371309.0, "step": 3017 }, { "entropy": 0.9610850960016251, "epoch": 6.875712656784493, "grad_norm": 0.90234375, "learning_rate": 1.2223743288121155e-06, "loss": 0.0224, "mean_token_accuracy": 0.9928624331951141, "num_tokens": 320477285.0, "step": 3018 }, { "entropy": 0.9657604992389679, "epoch": 6.877993158494869, "grad_norm": 1.2578125, "learning_rate": 1.2207557715422106e-06, "loss": 0.0274, "mean_token_accuracy": 0.9921462684869766, "num_tokens": 320584056.0, "step": 3019 }, { "entropy": 0.9644034504890442, "epoch": 6.880273660205245, "grad_norm": 1.2265625, "learning_rate": 1.219137940346633e-06, "loss": 0.0399, "mean_token_accuracy": 0.9890817254781723, "num_tokens": 320690122.0, "step": 3020 }, { "entropy": 0.9595093578100204, "epoch": 6.882554161915621, "grad_norm": 1.171875, "learning_rate": 1.2175208361436328e-06, "loss": 0.0316, "mean_token_accuracy": 0.9896067082881927, "num_tokens": 320796331.0, "step": 3021 }, { "entropy": 0.9577773958444595, "epoch": 6.884834663625997, "grad_norm": 1.0546875, "learning_rate": 1.2159044598510473e-06, "loss": 0.0302, "mean_token_accuracy": 0.990882933139801, "num_tokens": 320902711.0, "step": 3022 }, { "entropy": 0.9671423882246017, "epoch": 6.887115165336374, "grad_norm": 1.2265625, "learning_rate": 1.2142888123862992e-06, "loss": 0.0357, "mean_token_accuracy": 0.9895678013563156, "num_tokens": 321009389.0, "step": 3023 }, { "entropy": 0.9642983973026276, "epoch": 6.889395667046751, "grad_norm": 1.015625, "learning_rate": 1.2126738946663996e-06, "loss": 0.0284, "mean_token_accuracy": 0.9902970790863037, "num_tokens": 321115267.0, "step": 3024 }, { "entropy": 0.9531379640102386, "epoch": 6.891676168757127, "grad_norm": 1.015625, "learning_rate": 1.2110597076079448e-06, "loss": 0.0313, "mean_token_accuracy": 0.9894680380821228, "num_tokens": 321221266.0, "step": 3025 }, { "entropy": 0.9645583778619766, "epoch": 6.893956670467503, "grad_norm": 0.875, "learning_rate": 1.2094462521271156e-06, "loss": 0.0222, "mean_token_accuracy": 0.992204338312149, "num_tokens": 321327390.0, "step": 3026 }, { "entropy": 0.9585260897874832, "epoch": 6.896237172177879, "grad_norm": 1.0703125, "learning_rate": 1.2078335291396798e-06, "loss": 0.0269, "mean_token_accuracy": 0.9910581856966019, "num_tokens": 321433724.0, "step": 3027 }, { "entropy": 0.9645042270421982, "epoch": 6.898517673888255, "grad_norm": 1.140625, "learning_rate": 1.2062215395609856e-06, "loss": 0.033, "mean_token_accuracy": 0.9889741688966751, "num_tokens": 321539805.0, "step": 3028 }, { "entropy": 0.9684229344129562, "epoch": 6.900798175598632, "grad_norm": 1.03125, "learning_rate": 1.2046102843059681e-06, "loss": 0.0297, "mean_token_accuracy": 0.9903654009103775, "num_tokens": 321646214.0, "step": 3029 }, { "entropy": 0.9601180553436279, "epoch": 6.903078677309008, "grad_norm": 0.91796875, "learning_rate": 1.202999764289145e-06, "loss": 0.0253, "mean_token_accuracy": 0.9911731481552124, "num_tokens": 321752717.0, "step": 3030 }, { "entropy": 0.9624457955360413, "epoch": 6.905359179019384, "grad_norm": 1.0625, "learning_rate": 1.201389980424616e-06, "loss": 0.0311, "mean_token_accuracy": 0.988470733165741, "num_tokens": 321858688.0, "step": 3031 }, { "entropy": 0.9613544791936874, "epoch": 6.90763968072976, "grad_norm": 0.99609375, "learning_rate": 1.1997809336260644e-06, "loss": 0.0283, "mean_token_accuracy": 0.9910278618335724, "num_tokens": 321964300.0, "step": 3032 }, { "entropy": 0.9572088271379471, "epoch": 6.909920182440137, "grad_norm": 0.8671875, "learning_rate": 1.1981726248067521e-06, "loss": 0.0233, "mean_token_accuracy": 0.9919779002666473, "num_tokens": 322070350.0, "step": 3033 }, { "entropy": 0.967486172914505, "epoch": 6.9122006841505135, "grad_norm": 1.0546875, "learning_rate": 1.1965650548795251e-06, "loss": 0.022, "mean_token_accuracy": 0.9919692128896713, "num_tokens": 322176702.0, "step": 3034 }, { "entropy": 0.9584386050701141, "epoch": 6.91448118586089, "grad_norm": 0.96484375, "learning_rate": 1.1949582247568107e-06, "loss": 0.033, "mean_token_accuracy": 0.9898127168416977, "num_tokens": 322282819.0, "step": 3035 }, { "entropy": 0.9620967358350754, "epoch": 6.916761687571266, "grad_norm": 1.3203125, "learning_rate": 1.1933521353506117e-06, "loss": 0.043, "mean_token_accuracy": 0.9845108240842819, "num_tokens": 322389766.0, "step": 3036 }, { "entropy": 0.963974192738533, "epoch": 6.919042189281642, "grad_norm": 1.1875, "learning_rate": 1.1917467875725148e-06, "loss": 0.0342, "mean_token_accuracy": 0.9900654405355453, "num_tokens": 322495974.0, "step": 3037 }, { "entropy": 0.9584506005048752, "epoch": 6.921322690992018, "grad_norm": 0.859375, "learning_rate": 1.1901421823336856e-06, "loss": 0.0262, "mean_token_accuracy": 0.9909387081861496, "num_tokens": 322602488.0, "step": 3038 }, { "entropy": 0.9666791558265686, "epoch": 6.923603192702394, "grad_norm": 1.1484375, "learning_rate": 1.188538320544865e-06, "loss": 0.037, "mean_token_accuracy": 0.9890025854110718, "num_tokens": 322708782.0, "step": 3039 }, { "entropy": 0.9625478982925415, "epoch": 6.925883694412771, "grad_norm": 1.4609375, "learning_rate": 1.1869352031163746e-06, "loss": 0.0478, "mean_token_accuracy": 0.9857501536607742, "num_tokens": 322814911.0, "step": 3040 }, { "entropy": 0.9564260095357895, "epoch": 6.928164196123147, "grad_norm": 0.9375, "learning_rate": 1.1853328309581139e-06, "loss": 0.028, "mean_token_accuracy": 0.9919535368680954, "num_tokens": 322920769.0, "step": 3041 }, { "entropy": 0.9643784463405609, "epoch": 6.930444697833523, "grad_norm": 1.296875, "learning_rate": 1.183731204979557e-06, "loss": 0.0354, "mean_token_accuracy": 0.989438846707344, "num_tokens": 323027674.0, "step": 3042 }, { "entropy": 0.9646007418632507, "epoch": 6.932725199543899, "grad_norm": 1.109375, "learning_rate": 1.182130326089758e-06, "loss": 0.0333, "mean_token_accuracy": 0.9900464117527008, "num_tokens": 323134369.0, "step": 3043 }, { "entropy": 0.9646202772855759, "epoch": 6.935005701254276, "grad_norm": 1.1484375, "learning_rate": 1.1805301951973423e-06, "loss": 0.0331, "mean_token_accuracy": 0.9877679944038391, "num_tokens": 323240629.0, "step": 3044 }, { "entropy": 0.9624777287244797, "epoch": 6.9372862029646525, "grad_norm": 1.0546875, "learning_rate": 1.1789308132105145e-06, "loss": 0.0267, "mean_token_accuracy": 0.9910609871149063, "num_tokens": 323346542.0, "step": 3045 }, { "entropy": 0.9602365493774414, "epoch": 6.939566704675029, "grad_norm": 1.0859375, "learning_rate": 1.1773321810370527e-06, "loss": 0.0312, "mean_token_accuracy": 0.9910784363746643, "num_tokens": 323452507.0, "step": 3046 }, { "entropy": 0.9617316573858261, "epoch": 6.941847206385405, "grad_norm": 0.98046875, "learning_rate": 1.1757342995843103e-06, "loss": 0.0257, "mean_token_accuracy": 0.99165178835392, "num_tokens": 323558569.0, "step": 3047 }, { "entropy": 0.961124449968338, "epoch": 6.944127708095781, "grad_norm": 0.9296875, "learning_rate": 1.1741371697592134e-06, "loss": 0.0272, "mean_token_accuracy": 0.9912428706884384, "num_tokens": 323664826.0, "step": 3048 }, { "entropy": 0.9600909948348999, "epoch": 6.946408209806157, "grad_norm": 1.234375, "learning_rate": 1.1725407924682628e-06, "loss": 0.0329, "mean_token_accuracy": 0.9895073175430298, "num_tokens": 323771701.0, "step": 3049 }, { "entropy": 0.9603312313556671, "epoch": 6.9486887115165334, "grad_norm": 1.0390625, "learning_rate": 1.17094516861753e-06, "loss": 0.0249, "mean_token_accuracy": 0.991720512509346, "num_tokens": 323878777.0, "step": 3050 }, { "entropy": 0.9551673978567123, "epoch": 6.95096921322691, "grad_norm": 1.203125, "learning_rate": 1.1693502991126609e-06, "loss": 0.0351, "mean_token_accuracy": 0.9902802854776382, "num_tokens": 323985055.0, "step": 3051 }, { "entropy": 0.9653651416301727, "epoch": 6.953249714937286, "grad_norm": 1.1171875, "learning_rate": 1.1677561848588734e-06, "loss": 0.0309, "mean_token_accuracy": 0.9886702001094818, "num_tokens": 324091389.0, "step": 3052 }, { "entropy": 0.9670352935791016, "epoch": 6.955530216647663, "grad_norm": 1.1171875, "learning_rate": 1.166162826760955e-06, "loss": 0.0315, "mean_token_accuracy": 0.9916467815637589, "num_tokens": 324198272.0, "step": 3053 }, { "entropy": 0.961565688252449, "epoch": 6.957810718358039, "grad_norm": 1.1328125, "learning_rate": 1.1645702257232663e-06, "loss": 0.0305, "mean_token_accuracy": 0.9892797917127609, "num_tokens": 324304337.0, "step": 3054 }, { "entropy": 0.9641566425561905, "epoch": 6.960091220068415, "grad_norm": 1.0625, "learning_rate": 1.1629783826497351e-06, "loss": 0.0296, "mean_token_accuracy": 0.9919904321432114, "num_tokens": 324410547.0, "step": 3055 }, { "entropy": 0.9608302116394043, "epoch": 6.9623717217787915, "grad_norm": 0.97265625, "learning_rate": 1.161387298443863e-06, "loss": 0.0246, "mean_token_accuracy": 0.9916424751281738, "num_tokens": 324516859.0, "step": 3056 }, { "entropy": 0.9608175456523895, "epoch": 6.964652223489168, "grad_norm": 1.09375, "learning_rate": 1.1597969740087159e-06, "loss": 0.0299, "mean_token_accuracy": 0.9896736592054367, "num_tokens": 324622948.0, "step": 3057 }, { "entropy": 0.9647179394960403, "epoch": 6.966932725199544, "grad_norm": 1.4453125, "learning_rate": 1.1582074102469332e-06, "loss": 0.0422, "mean_token_accuracy": 0.9878463447093964, "num_tokens": 324729794.0, "step": 3058 }, { "entropy": 0.9561490416526794, "epoch": 6.96921322690992, "grad_norm": 0.8125, "learning_rate": 1.1566186080607198e-06, "loss": 0.0231, "mean_token_accuracy": 0.9918828904628754, "num_tokens": 324836430.0, "step": 3059 }, { "entropy": 0.9564995020627975, "epoch": 6.971493728620296, "grad_norm": 1.0703125, "learning_rate": 1.1550305683518506e-06, "loss": 0.0297, "mean_token_accuracy": 0.9917511194944382, "num_tokens": 324942726.0, "step": 3060 }, { "entropy": 0.9576681554317474, "epoch": 6.9737742303306725, "grad_norm": 1.3828125, "learning_rate": 1.1534432920216643e-06, "loss": 0.0478, "mean_token_accuracy": 0.9853257387876511, "num_tokens": 325048442.0, "step": 3061 }, { "entropy": 0.9597254395484924, "epoch": 6.976054732041049, "grad_norm": 0.98046875, "learning_rate": 1.151856779971069e-06, "loss": 0.0283, "mean_token_accuracy": 0.9897295236587524, "num_tokens": 325154424.0, "step": 3062 }, { "entropy": 0.9644580632448196, "epoch": 6.978335233751425, "grad_norm": 1.328125, "learning_rate": 1.1502710331005384e-06, "loss": 0.0374, "mean_token_accuracy": 0.9889875799417496, "num_tokens": 325260604.0, "step": 3063 }, { "entropy": 0.9639627188444138, "epoch": 6.980615735461802, "grad_norm": 0.8984375, "learning_rate": 1.148686052310112e-06, "loss": 0.029, "mean_token_accuracy": 0.9897939115762711, "num_tokens": 325366319.0, "step": 3064 }, { "entropy": 0.9642220735549927, "epoch": 6.982896237172178, "grad_norm": 1.203125, "learning_rate": 1.147101838499395e-06, "loss": 0.0381, "mean_token_accuracy": 0.9868553131818771, "num_tokens": 325472644.0, "step": 3065 }, { "entropy": 0.9662026017904282, "epoch": 6.985176738882554, "grad_norm": 1.0546875, "learning_rate": 1.145518392567555e-06, "loss": 0.0282, "mean_token_accuracy": 0.9898103028535843, "num_tokens": 325578771.0, "step": 3066 }, { "entropy": 0.9614441096782684, "epoch": 6.9874572405929305, "grad_norm": 1.03125, "learning_rate": 1.1439357154133263e-06, "loss": 0.0281, "mean_token_accuracy": 0.9898333996534348, "num_tokens": 325686172.0, "step": 3067 }, { "entropy": 0.9595726132392883, "epoch": 6.989737742303307, "grad_norm": 1.0546875, "learning_rate": 1.1423538079350053e-06, "loss": 0.0305, "mean_token_accuracy": 0.9874832928180695, "num_tokens": 325792665.0, "step": 3068 }, { "entropy": 0.9679504632949829, "epoch": 6.992018244013683, "grad_norm": 1.140625, "learning_rate": 1.1407726710304525e-06, "loss": 0.03, "mean_token_accuracy": 0.990865170955658, "num_tokens": 325898552.0, "step": 3069 }, { "entropy": 0.9587936401367188, "epoch": 6.994298745724059, "grad_norm": 1.2109375, "learning_rate": 1.139192305597092e-06, "loss": 0.0315, "mean_token_accuracy": 0.9897777885198593, "num_tokens": 326004502.0, "step": 3070 }, { "entropy": 0.9627903252840042, "epoch": 6.996579247434435, "grad_norm": 1.0625, "learning_rate": 1.1376127125319065e-06, "loss": 0.0303, "mean_token_accuracy": 0.9902934581041336, "num_tokens": 326111045.0, "step": 3071 }, { "entropy": 0.9617948532104492, "epoch": 6.9988597491448115, "grad_norm": 1.0390625, "learning_rate": 1.1360338927314432e-06, "loss": 0.0319, "mean_token_accuracy": 0.9903764426708221, "num_tokens": 326216870.0, "step": 3072 }, { "entropy": 0.9685780107975006, "epoch": 7.0, "grad_norm": 2.375, "learning_rate": 1.1344558470918098e-06, "loss": 0.0361, "mean_token_accuracy": 0.987199455499649, "num_tokens": 326255832.0, "step": 3073 }, { "entropy": 0.9629501402378082, "epoch": 7.002280501710376, "grad_norm": 1.078125, "learning_rate": 1.1328785765086752e-06, "loss": 0.0283, "mean_token_accuracy": 0.989934042096138, "num_tokens": 326362096.0, "step": 3074 }, { "entropy": 0.9573250263929367, "epoch": 7.004561003420752, "grad_norm": 1.234375, "learning_rate": 1.131302081877268e-06, "loss": 0.0321, "mean_token_accuracy": 0.9892319142818451, "num_tokens": 326468159.0, "step": 3075 }, { "entropy": 0.958941325545311, "epoch": 7.006841505131129, "grad_norm": 1.109375, "learning_rate": 1.1297263640923745e-06, "loss": 0.0334, "mean_token_accuracy": 0.9912248253822327, "num_tokens": 326573986.0, "step": 3076 }, { "entropy": 0.9660691022872925, "epoch": 7.009122006841505, "grad_norm": 0.98828125, "learning_rate": 1.1281514240483427e-06, "loss": 0.0254, "mean_token_accuracy": 0.992889329791069, "num_tokens": 326679775.0, "step": 3077 }, { "entropy": 0.9640592038631439, "epoch": 7.011402508551882, "grad_norm": 0.95703125, "learning_rate": 1.1265772626390786e-06, "loss": 0.0278, "mean_token_accuracy": 0.9909728616476059, "num_tokens": 326786268.0, "step": 3078 }, { "entropy": 0.9596409648656845, "epoch": 7.013683010262258, "grad_norm": 1.0703125, "learning_rate": 1.1250038807580449e-06, "loss": 0.0328, "mean_token_accuracy": 0.988507479429245, "num_tokens": 326892229.0, "step": 3079 }, { "entropy": 0.9668424278497696, "epoch": 7.015963511972634, "grad_norm": 1.28125, "learning_rate": 1.1234312792982627e-06, "loss": 0.0269, "mean_token_accuracy": 0.9919394552707672, "num_tokens": 326998906.0, "step": 3080 }, { "epoch": 7.015963511972634, "eval_entropy": 0.961929503049234, "eval_loss": 0.03775237128138542, "eval_mean_token_accuracy": 0.988488341465649, "eval_num_tokens": 326998906.0, "eval_runtime": 66.1679, "eval_samples_per_second": 126.723, "eval_steps_per_second": 3.975, "step": 3080 }, { "entropy": 0.9620021879673004, "epoch": 7.01824401368301, "grad_norm": 1.0390625, "learning_rate": 1.1218594591523118e-06, "loss": 0.0284, "mean_token_accuracy": 0.9911754727363586, "num_tokens": 327105353.0, "step": 3081 }, { "entropy": 0.9549067318439484, "epoch": 7.020524515393387, "grad_norm": 0.91015625, "learning_rate": 1.120288421212325e-06, "loss": 0.0257, "mean_token_accuracy": 0.9920074939727783, "num_tokens": 327211604.0, "step": 3082 }, { "entropy": 0.9652613997459412, "epoch": 7.022805017103763, "grad_norm": 1.3828125, "learning_rate": 1.1187181663699935e-06, "loss": 0.0448, "mean_token_accuracy": 0.9847714006900787, "num_tokens": 327318230.0, "step": 3083 }, { "entropy": 0.9611967951059341, "epoch": 7.025085518814139, "grad_norm": 1.1328125, "learning_rate": 1.1171486955165645e-06, "loss": 0.0273, "mean_token_accuracy": 0.9918750077486038, "num_tokens": 327424763.0, "step": 3084 }, { "entropy": 0.9616560786962509, "epoch": 7.027366020524515, "grad_norm": 1.1484375, "learning_rate": 1.115580009542839e-06, "loss": 0.0313, "mean_token_accuracy": 0.9888999313116074, "num_tokens": 327531333.0, "step": 3085 }, { "entropy": 0.9614894837141037, "epoch": 7.029646522234891, "grad_norm": 1.0078125, "learning_rate": 1.1140121093391736e-06, "loss": 0.0282, "mean_token_accuracy": 0.989834800362587, "num_tokens": 327638008.0, "step": 3086 }, { "entropy": 0.96116803586483, "epoch": 7.031927023945268, "grad_norm": 1.1171875, "learning_rate": 1.1124449957954764e-06, "loss": 0.0304, "mean_token_accuracy": 0.9904114603996277, "num_tokens": 327744390.0, "step": 3087 }, { "entropy": 0.9614219516515732, "epoch": 7.034207525655645, "grad_norm": 0.8671875, "learning_rate": 1.110878669801212e-06, "loss": 0.0247, "mean_token_accuracy": 0.9925713390111923, "num_tokens": 327851073.0, "step": 3088 }, { "entropy": 0.9589527100324631, "epoch": 7.036488027366021, "grad_norm": 0.91796875, "learning_rate": 1.1093131322453966e-06, "loss": 0.0232, "mean_token_accuracy": 0.9913557022809982, "num_tokens": 327957200.0, "step": 3089 }, { "entropy": 0.9632944613695145, "epoch": 7.038768529076397, "grad_norm": 1.046875, "learning_rate": 1.1077483840165986e-06, "loss": 0.0301, "mean_token_accuracy": 0.9913585335016251, "num_tokens": 328063115.0, "step": 3090 }, { "entropy": 0.9609430134296417, "epoch": 7.041049030786773, "grad_norm": 0.9375, "learning_rate": 1.10618442600294e-06, "loss": 0.0254, "mean_token_accuracy": 0.9909470826387405, "num_tokens": 328169190.0, "step": 3091 }, { "entropy": 0.958580806851387, "epoch": 7.043329532497149, "grad_norm": 1.0703125, "learning_rate": 1.1046212590920931e-06, "loss": 0.031, "mean_token_accuracy": 0.9932730495929718, "num_tokens": 328276177.0, "step": 3092 }, { "entropy": 0.9650003463029861, "epoch": 7.045610034207526, "grad_norm": 1.3203125, "learning_rate": 1.10305888417128e-06, "loss": 0.0389, "mean_token_accuracy": 0.9874429106712341, "num_tokens": 328382147.0, "step": 3093 }, { "entropy": 0.9603755921125412, "epoch": 7.047890535917902, "grad_norm": 0.9140625, "learning_rate": 1.101497302127275e-06, "loss": 0.0229, "mean_token_accuracy": 0.9925276190042496, "num_tokens": 328487729.0, "step": 3094 }, { "entropy": 0.9578560292720795, "epoch": 7.050171037628278, "grad_norm": 0.94921875, "learning_rate": 1.0999365138464024e-06, "loss": 0.0274, "mean_token_accuracy": 0.9909138083457947, "num_tokens": 328594355.0, "step": 3095 }, { "entropy": 0.9605102092027664, "epoch": 7.052451539338654, "grad_norm": 1.1953125, "learning_rate": 1.0983765202145351e-06, "loss": 0.0382, "mean_token_accuracy": 0.9881908148527145, "num_tokens": 328700079.0, "step": 3096 }, { "entropy": 0.9644969552755356, "epoch": 7.05473204104903, "grad_norm": 1.109375, "learning_rate": 1.0968173221170966e-06, "loss": 0.0254, "mean_token_accuracy": 0.9919920861721039, "num_tokens": 328806444.0, "step": 3097 }, { "entropy": 0.9533155858516693, "epoch": 7.0570125427594075, "grad_norm": 1.125, "learning_rate": 1.0952589204390557e-06, "loss": 0.0395, "mean_token_accuracy": 0.9880241006612778, "num_tokens": 328913263.0, "step": 3098 }, { "entropy": 0.9676066637039185, "epoch": 7.059293044469784, "grad_norm": 1.3828125, "learning_rate": 1.0937013160649328e-06, "loss": 0.0353, "mean_token_accuracy": 0.9905592054128647, "num_tokens": 329019649.0, "step": 3099 }, { "entropy": 0.9601194113492966, "epoch": 7.06157354618016, "grad_norm": 1.1015625, "learning_rate": 1.0921445098787923e-06, "loss": 0.0331, "mean_token_accuracy": 0.9898305088281631, "num_tokens": 329126427.0, "step": 3100 }, { "entropy": 0.9602620154619217, "epoch": 7.063854047890536, "grad_norm": 1.0, "learning_rate": 1.0905885027642484e-06, "loss": 0.0322, "mean_token_accuracy": 0.9890037029981613, "num_tokens": 329232557.0, "step": 3101 }, { "entropy": 0.9689409732818604, "epoch": 7.066134549600912, "grad_norm": 0.90234375, "learning_rate": 1.0890332956044614e-06, "loss": 0.0259, "mean_token_accuracy": 0.991204246878624, "num_tokens": 329338481.0, "step": 3102 }, { "entropy": 0.9592747837305069, "epoch": 7.068415051311288, "grad_norm": 0.97265625, "learning_rate": 1.0874788892821354e-06, "loss": 0.0295, "mean_token_accuracy": 0.9905671179294586, "num_tokens": 329444773.0, "step": 3103 }, { "entropy": 0.9636125862598419, "epoch": 7.070695553021665, "grad_norm": 1.0859375, "learning_rate": 1.0859252846795215e-06, "loss": 0.0323, "mean_token_accuracy": 0.9906530827283859, "num_tokens": 329551417.0, "step": 3104 }, { "entropy": 0.9650565683841705, "epoch": 7.072976054732041, "grad_norm": 1.0625, "learning_rate": 1.0843724826784165e-06, "loss": 0.0325, "mean_token_accuracy": 0.9917179495096207, "num_tokens": 329657183.0, "step": 3105 }, { "entropy": 0.9660647064447403, "epoch": 7.075256556442417, "grad_norm": 1.15625, "learning_rate": 1.0828204841601608e-06, "loss": 0.0255, "mean_token_accuracy": 0.9908114522695541, "num_tokens": 329763415.0, "step": 3106 }, { "entropy": 0.955759808421135, "epoch": 7.077537058152793, "grad_norm": 0.97265625, "learning_rate": 1.0812692900056384e-06, "loss": 0.0278, "mean_token_accuracy": 0.9906134158372879, "num_tokens": 329869668.0, "step": 3107 }, { "entropy": 0.961561769247055, "epoch": 7.07981755986317, "grad_norm": 1.390625, "learning_rate": 1.0797189010952784e-06, "loss": 0.0318, "mean_token_accuracy": 0.9889099299907684, "num_tokens": 329975845.0, "step": 3108 }, { "entropy": 0.9658802598714828, "epoch": 7.0820980615735465, "grad_norm": 1.203125, "learning_rate": 1.0781693183090495e-06, "loss": 0.0318, "mean_token_accuracy": 0.9898400455713272, "num_tokens": 330082200.0, "step": 3109 }, { "entropy": 0.9568900465965271, "epoch": 7.084378563283923, "grad_norm": 1.0234375, "learning_rate": 1.076620542526466e-06, "loss": 0.0272, "mean_token_accuracy": 0.9929737448692322, "num_tokens": 330188680.0, "step": 3110 }, { "entropy": 0.9644964784383774, "epoch": 7.086659064994299, "grad_norm": 1.0703125, "learning_rate": 1.0750725746265832e-06, "loss": 0.0286, "mean_token_accuracy": 0.9891897588968277, "num_tokens": 330295909.0, "step": 3111 }, { "entropy": 0.9597250372171402, "epoch": 7.088939566704675, "grad_norm": 1.109375, "learning_rate": 1.0735254154879979e-06, "loss": 0.0306, "mean_token_accuracy": 0.9910938739776611, "num_tokens": 330401897.0, "step": 3112 }, { "entropy": 0.964574545621872, "epoch": 7.091220068415051, "grad_norm": 1.296875, "learning_rate": 1.0719790659888481e-06, "loss": 0.0327, "mean_token_accuracy": 0.989324152469635, "num_tokens": 330508306.0, "step": 3113 }, { "entropy": 0.9609233886003494, "epoch": 7.0935005701254275, "grad_norm": 1.2421875, "learning_rate": 1.070433527006811e-06, "loss": 0.04, "mean_token_accuracy": 0.9872569441795349, "num_tokens": 330614729.0, "step": 3114 }, { "entropy": 0.9607871472835541, "epoch": 7.095781071835804, "grad_norm": 1.0390625, "learning_rate": 1.0688887994191049e-06, "loss": 0.0306, "mean_token_accuracy": 0.9910204857587814, "num_tokens": 330721391.0, "step": 3115 }, { "entropy": 0.9620466977357864, "epoch": 7.09806157354618, "grad_norm": 1.234375, "learning_rate": 1.0673448841024875e-06, "loss": 0.0277, "mean_token_accuracy": 0.9910043030977249, "num_tokens": 330828214.0, "step": 3116 }, { "entropy": 0.9573817402124405, "epoch": 7.100342075256556, "grad_norm": 1.109375, "learning_rate": 1.0658017819332556e-06, "loss": 0.0321, "mean_token_accuracy": 0.9912348687648773, "num_tokens": 330935020.0, "step": 3117 }, { "entropy": 0.963767796754837, "epoch": 7.102622576966933, "grad_norm": 1.125, "learning_rate": 1.064259493787244e-06, "loss": 0.0341, "mean_token_accuracy": 0.9911262542009354, "num_tokens": 331041490.0, "step": 3118 }, { "entropy": 0.9600581228733063, "epoch": 7.104903078677309, "grad_norm": 1.296875, "learning_rate": 1.0627180205398263e-06, "loss": 0.0367, "mean_token_accuracy": 0.9863324016332626, "num_tokens": 331147677.0, "step": 3119 }, { "entropy": 0.9626371115446091, "epoch": 7.1071835803876855, "grad_norm": 1.0625, "learning_rate": 1.0611773630659117e-06, "loss": 0.0305, "mean_token_accuracy": 0.988623321056366, "num_tokens": 331254263.0, "step": 3120 }, { "entropy": 0.9674955606460571, "epoch": 7.109464082098062, "grad_norm": 0.7734375, "learning_rate": 1.0596375222399491e-06, "loss": 0.0246, "mean_token_accuracy": 0.9922275096178055, "num_tokens": 331360643.0, "step": 3121 }, { "entropy": 0.9618344157934189, "epoch": 7.111744583808438, "grad_norm": 1.1953125, "learning_rate": 1.0580984989359205e-06, "loss": 0.0319, "mean_token_accuracy": 0.990387350320816, "num_tokens": 331466758.0, "step": 3122 }, { "entropy": 0.9646287113428116, "epoch": 7.114025085518814, "grad_norm": 1.09375, "learning_rate": 1.0565602940273472e-06, "loss": 0.0322, "mean_token_accuracy": 0.990839809179306, "num_tokens": 331573151.0, "step": 3123 }, { "entropy": 0.9618979245424271, "epoch": 7.11630558722919, "grad_norm": 0.91015625, "learning_rate": 1.055022908387285e-06, "loss": 0.026, "mean_token_accuracy": 0.9910827577114105, "num_tokens": 331679427.0, "step": 3124 }, { "entropy": 0.9593231827020645, "epoch": 7.1185860889395665, "grad_norm": 0.96484375, "learning_rate": 1.053486342888323e-06, "loss": 0.029, "mean_token_accuracy": 0.9910137355327606, "num_tokens": 331786003.0, "step": 3125 }, { "entropy": 0.9585807174444199, "epoch": 7.120866590649943, "grad_norm": 1.03125, "learning_rate": 1.0519505984025865e-06, "loss": 0.0255, "mean_token_accuracy": 0.9902257770299911, "num_tokens": 331891866.0, "step": 3126 }, { "entropy": 0.9593189507722855, "epoch": 7.123147092360319, "grad_norm": 0.9296875, "learning_rate": 1.050415675801735e-06, "loss": 0.0251, "mean_token_accuracy": 0.9916585981845856, "num_tokens": 331998429.0, "step": 3127 }, { "entropy": 0.9606748223304749, "epoch": 7.125427594070696, "grad_norm": 0.984375, "learning_rate": 1.0488815759569605e-06, "loss": 0.0269, "mean_token_accuracy": 0.9908248782157898, "num_tokens": 332104682.0, "step": 3128 }, { "entropy": 0.9593944996595383, "epoch": 7.127708095781072, "grad_norm": 0.9765625, "learning_rate": 1.0473482997389891e-06, "loss": 0.0225, "mean_token_accuracy": 0.991805836558342, "num_tokens": 332210704.0, "step": 3129 }, { "entropy": 0.9664431661367416, "epoch": 7.129988597491448, "grad_norm": 0.9375, "learning_rate": 1.0458158480180777e-06, "loss": 0.0261, "mean_token_accuracy": 0.9911508709192276, "num_tokens": 332316860.0, "step": 3130 }, { "entropy": 0.9618607461452484, "epoch": 7.1322690992018245, "grad_norm": 0.98046875, "learning_rate": 1.0442842216640168e-06, "loss": 0.0336, "mean_token_accuracy": 0.9891491830348969, "num_tokens": 332423277.0, "step": 3131 }, { "entropy": 0.9636634886264801, "epoch": 7.134549600912201, "grad_norm": 1.09375, "learning_rate": 1.042753421546128e-06, "loss": 0.0248, "mean_token_accuracy": 0.9908437579870224, "num_tokens": 332529332.0, "step": 3132 }, { "entropy": 0.9633251577615738, "epoch": 7.136830102622577, "grad_norm": 0.84375, "learning_rate": 1.0412234485332636e-06, "loss": 0.0198, "mean_token_accuracy": 0.9945605397224426, "num_tokens": 332635545.0, "step": 3133 }, { "entropy": 0.9605818390846252, "epoch": 7.139110604332953, "grad_norm": 1.2109375, "learning_rate": 1.0396943034938077e-06, "loss": 0.0315, "mean_token_accuracy": 0.9893599450588226, "num_tokens": 332741444.0, "step": 3134 }, { "entropy": 0.9643488228321075, "epoch": 7.141391106043329, "grad_norm": 0.95703125, "learning_rate": 1.0381659872956732e-06, "loss": 0.0244, "mean_token_accuracy": 0.991791769862175, "num_tokens": 332847305.0, "step": 3135 }, { "entropy": 0.9602548182010651, "epoch": 7.1436716077537055, "grad_norm": 1.0625, "learning_rate": 1.0366385008063015e-06, "loss": 0.0316, "mean_token_accuracy": 0.9902472794055939, "num_tokens": 332953369.0, "step": 3136 }, { "entropy": 0.9621587842702866, "epoch": 7.145952109464082, "grad_norm": 1.1171875, "learning_rate": 1.0351118448926658e-06, "loss": 0.0358, "mean_token_accuracy": 0.9909892678260803, "num_tokens": 333059954.0, "step": 3137 }, { "entropy": 0.9630517661571503, "epoch": 7.148232611174459, "grad_norm": 1.234375, "learning_rate": 1.0335860204212662e-06, "loss": 0.0327, "mean_token_accuracy": 0.9891262650489807, "num_tokens": 333166054.0, "step": 3138 }, { "entropy": 0.957866370677948, "epoch": 7.150513112884835, "grad_norm": 0.87890625, "learning_rate": 1.0320610282581309e-06, "loss": 0.0236, "mean_token_accuracy": 0.9934527724981308, "num_tokens": 333272265.0, "step": 3139 }, { "entropy": 0.9638258069753647, "epoch": 7.152793614595211, "grad_norm": 1.0703125, "learning_rate": 1.0305368692688175e-06, "loss": 0.0264, "mean_token_accuracy": 0.9912703484296799, "num_tokens": 333378799.0, "step": 3140 }, { "entropy": 0.9582513570785522, "epoch": 7.155074116305587, "grad_norm": 1.15625, "learning_rate": 1.029013544318407e-06, "loss": 0.0316, "mean_token_accuracy": 0.9893788248300552, "num_tokens": 333485271.0, "step": 3141 }, { "entropy": 0.9557350873947144, "epoch": 7.1573546180159635, "grad_norm": 1.140625, "learning_rate": 1.0274910542715103e-06, "loss": 0.0391, "mean_token_accuracy": 0.9880546927452087, "num_tokens": 333591270.0, "step": 3142 }, { "entropy": 0.9704095274209976, "epoch": 7.15963511972634, "grad_norm": 1.0390625, "learning_rate": 1.025969399992264e-06, "loss": 0.0278, "mean_token_accuracy": 0.9908082485198975, "num_tokens": 333697801.0, "step": 3143 }, { "entropy": 0.9647316932678223, "epoch": 7.161915621436716, "grad_norm": 1.4765625, "learning_rate": 1.0244485823443281e-06, "loss": 0.0324, "mean_token_accuracy": 0.9906502366065979, "num_tokens": 333803992.0, "step": 3144 }, { "entropy": 0.9660503268241882, "epoch": 7.164196123147092, "grad_norm": 1.0, "learning_rate": 1.0229286021908913e-06, "loss": 0.0324, "mean_token_accuracy": 0.9901970475912094, "num_tokens": 333910814.0, "step": 3145 }, { "entropy": 0.9653466045856476, "epoch": 7.166476624857468, "grad_norm": 0.93359375, "learning_rate": 1.021409460394663e-06, "loss": 0.0284, "mean_token_accuracy": 0.9918655157089233, "num_tokens": 334018488.0, "step": 3146 }, { "entropy": 0.9548106044530869, "epoch": 7.168757126567845, "grad_norm": 1.03125, "learning_rate": 1.0198911578178797e-06, "loss": 0.0315, "mean_token_accuracy": 0.9924036115407944, "num_tokens": 334125157.0, "step": 3147 }, { "entropy": 0.9585267454385757, "epoch": 7.1710376282782216, "grad_norm": 1.25, "learning_rate": 1.0183736953223005e-06, "loss": 0.0327, "mean_token_accuracy": 0.989573746919632, "num_tokens": 334231141.0, "step": 3148 }, { "entropy": 0.958716094493866, "epoch": 7.173318129988598, "grad_norm": 1.328125, "learning_rate": 1.0168570737692082e-06, "loss": 0.0348, "mean_token_accuracy": 0.9898531883955002, "num_tokens": 334337637.0, "step": 3149 }, { "entropy": 0.9572674632072449, "epoch": 7.175598631698974, "grad_norm": 1.3515625, "learning_rate": 1.0153412940194073e-06, "loss": 0.0404, "mean_token_accuracy": 0.9889692068099976, "num_tokens": 334444627.0, "step": 3150 }, { "entropy": 0.960103914141655, "epoch": 7.17787913340935, "grad_norm": 1.1640625, "learning_rate": 1.0138263569332268e-06, "loss": 0.031, "mean_token_accuracy": 0.990531787276268, "num_tokens": 334550871.0, "step": 3151 }, { "entropy": 0.9580908119678497, "epoch": 7.180159635119726, "grad_norm": 1.2578125, "learning_rate": 1.0123122633705131e-06, "loss": 0.0325, "mean_token_accuracy": 0.9911550134420395, "num_tokens": 334656739.0, "step": 3152 }, { "entropy": 0.9636468291282654, "epoch": 7.1824401368301025, "grad_norm": 1.21875, "learning_rate": 1.0107990141906378e-06, "loss": 0.0368, "mean_token_accuracy": 0.9872081875801086, "num_tokens": 334762668.0, "step": 3153 }, { "entropy": 0.95978944003582, "epoch": 7.184720638540479, "grad_norm": 1.1953125, "learning_rate": 1.0092866102524922e-06, "loss": 0.0312, "mean_token_accuracy": 0.9893182069063187, "num_tokens": 334868990.0, "step": 3154 }, { "entropy": 0.9645968675613403, "epoch": 7.187001140250855, "grad_norm": 1.09375, "learning_rate": 1.0077750524144871e-06, "loss": 0.0279, "mean_token_accuracy": 0.9904057085514069, "num_tokens": 334975667.0, "step": 3155 }, { "entropy": 0.9616621136665344, "epoch": 7.189281641961231, "grad_norm": 1.046875, "learning_rate": 1.0062643415345546e-06, "loss": 0.0316, "mean_token_accuracy": 0.9903654009103775, "num_tokens": 335081690.0, "step": 3156 }, { "entropy": 0.9631069302558899, "epoch": 7.191562143671608, "grad_norm": 0.9765625, "learning_rate": 1.0047544784701435e-06, "loss": 0.0328, "mean_token_accuracy": 0.9879620373249054, "num_tokens": 335188534.0, "step": 3157 }, { "entropy": 0.9629451632499695, "epoch": 7.193842645381984, "grad_norm": 1.03125, "learning_rate": 1.0032454640782232e-06, "loss": 0.0301, "mean_token_accuracy": 0.9902997314929962, "num_tokens": 335294334.0, "step": 3158 }, { "entropy": 0.9627998918294907, "epoch": 7.196123147092361, "grad_norm": 1.1328125, "learning_rate": 1.0017372992152819e-06, "loss": 0.0293, "mean_token_accuracy": 0.9924831241369247, "num_tokens": 335401221.0, "step": 3159 }, { "entropy": 0.9555269777774811, "epoch": 7.198403648802737, "grad_norm": 1.1328125, "learning_rate": 1.0002299847373243e-06, "loss": 0.0327, "mean_token_accuracy": 0.9883863627910614, "num_tokens": 335507319.0, "step": 3160 }, { "entropy": 0.9612943232059479, "epoch": 7.200684150513113, "grad_norm": 1.2890625, "learning_rate": 9.987235214998741e-07, "loss": 0.0405, "mean_token_accuracy": 0.9867410957813263, "num_tokens": 335613201.0, "step": 3161 }, { "entropy": 0.9653564393520355, "epoch": 7.202964652223489, "grad_norm": 1.1953125, "learning_rate": 9.972179103579687e-07, "loss": 0.0351, "mean_token_accuracy": 0.986826702952385, "num_tokens": 335719239.0, "step": 3162 }, { "entropy": 0.9607570767402649, "epoch": 7.205245153933865, "grad_norm": 1.1796875, "learning_rate": 9.957131521661655e-07, "loss": 0.0319, "mean_token_accuracy": 0.9905806630849838, "num_tokens": 335825637.0, "step": 3163 }, { "entropy": 0.9622220396995544, "epoch": 7.2075256556442415, "grad_norm": 1.0703125, "learning_rate": 9.942092477785365e-07, "loss": 0.03, "mean_token_accuracy": 0.9919417202472687, "num_tokens": 335931943.0, "step": 3164 }, { "entropy": 0.9600049406290054, "epoch": 7.209806157354618, "grad_norm": 1.0859375, "learning_rate": 9.927061980486668e-07, "loss": 0.0276, "mean_token_accuracy": 0.9910698086023331, "num_tokens": 336037893.0, "step": 3165 }, { "entropy": 0.9660215228796005, "epoch": 7.212086659064994, "grad_norm": 1.171875, "learning_rate": 9.9120400382966e-07, "loss": 0.0307, "mean_token_accuracy": 0.9891226142644882, "num_tokens": 336143991.0, "step": 3166 }, { "entropy": 0.962347611784935, "epoch": 7.214367160775371, "grad_norm": 0.92578125, "learning_rate": 9.897026659741328e-07, "loss": 0.0293, "mean_token_accuracy": 0.9890447407960892, "num_tokens": 336250206.0, "step": 3167 }, { "entropy": 0.966417521238327, "epoch": 7.216647662485747, "grad_norm": 1.046875, "learning_rate": 9.882021853342143e-07, "loss": 0.0303, "mean_token_accuracy": 0.9900527447462082, "num_tokens": 336356137.0, "step": 3168 }, { "entropy": 0.9596583992242813, "epoch": 7.218928164196123, "grad_norm": 1.1171875, "learning_rate": 9.867025627615493e-07, "loss": 0.0332, "mean_token_accuracy": 0.9877303540706635, "num_tokens": 336462646.0, "step": 3169 }, { "entropy": 0.9601673632860184, "epoch": 7.2212086659065, "grad_norm": 1.1484375, "learning_rate": 9.852037991072941e-07, "loss": 0.0342, "mean_token_accuracy": 0.9894249737262726, "num_tokens": 336568798.0, "step": 3170 }, { "entropy": 0.957128182053566, "epoch": 7.223489167616876, "grad_norm": 1.03125, "learning_rate": 9.837058952221182e-07, "loss": 0.0302, "mean_token_accuracy": 0.9900428205728531, "num_tokens": 336675274.0, "step": 3171 }, { "entropy": 0.9580126404762268, "epoch": 7.225769669327252, "grad_norm": 0.98828125, "learning_rate": 9.822088519562038e-07, "loss": 0.0292, "mean_token_accuracy": 0.9920491427183151, "num_tokens": 336781642.0, "step": 3172 }, { "entropy": 0.9615471661090851, "epoch": 7.228050171037628, "grad_norm": 1.2421875, "learning_rate": 9.80712670159242e-07, "loss": 0.0311, "mean_token_accuracy": 0.9890821874141693, "num_tokens": 336887819.0, "step": 3173 }, { "entropy": 0.9609358161687851, "epoch": 7.230330672748004, "grad_norm": 1.109375, "learning_rate": 9.792173506804378e-07, "loss": 0.0336, "mean_token_accuracy": 0.987556055188179, "num_tokens": 336993557.0, "step": 3174 }, { "entropy": 0.9617246091365814, "epoch": 7.2326111744583805, "grad_norm": 1.203125, "learning_rate": 9.777228943685055e-07, "loss": 0.0371, "mean_token_accuracy": 0.9883116036653519, "num_tokens": 337100040.0, "step": 3175 }, { "entropy": 0.9618670046329498, "epoch": 7.234891676168757, "grad_norm": 0.91796875, "learning_rate": 9.762293020716696e-07, "loss": 0.0244, "mean_token_accuracy": 0.9926453530788422, "num_tokens": 337205853.0, "step": 3176 }, { "entropy": 0.9621409326791763, "epoch": 7.237172177879134, "grad_norm": 0.9921875, "learning_rate": 9.74736574637665e-07, "loss": 0.026, "mean_token_accuracy": 0.9920718669891357, "num_tokens": 337311950.0, "step": 3177 }, { "entropy": 0.958904892206192, "epoch": 7.23945267958951, "grad_norm": 1.0390625, "learning_rate": 9.732447129137337e-07, "loss": 0.035, "mean_token_accuracy": 0.9899254739284515, "num_tokens": 337418504.0, "step": 3178 }, { "entropy": 0.9596886783838272, "epoch": 7.241733181299886, "grad_norm": 1.234375, "learning_rate": 9.717537177466279e-07, "loss": 0.0382, "mean_token_accuracy": 0.9891565293073654, "num_tokens": 337524701.0, "step": 3179 }, { "entropy": 0.9670736342668533, "epoch": 7.244013683010262, "grad_norm": 0.98046875, "learning_rate": 9.702635899826082e-07, "loss": 0.0332, "mean_token_accuracy": 0.9919493645429611, "num_tokens": 337630965.0, "step": 3180 }, { "entropy": 0.9586929827928543, "epoch": 7.246294184720639, "grad_norm": 0.90625, "learning_rate": 9.687743304674421e-07, "loss": 0.0224, "mean_token_accuracy": 0.9924075901508331, "num_tokens": 337736385.0, "step": 3181 }, { "entropy": 0.9663746058940887, "epoch": 7.248574686431015, "grad_norm": 1.2734375, "learning_rate": 9.672859400464046e-07, "loss": 0.0386, "mean_token_accuracy": 0.9882482290267944, "num_tokens": 337842355.0, "step": 3182 }, { "entropy": 0.962865948677063, "epoch": 7.250855188141391, "grad_norm": 1.4609375, "learning_rate": 9.657984195642783e-07, "loss": 0.0452, "mean_token_accuracy": 0.9860169589519501, "num_tokens": 337947789.0, "step": 3183 }, { "entropy": 0.9666875004768372, "epoch": 7.253135689851767, "grad_norm": 1.015625, "learning_rate": 9.64311769865349e-07, "loss": 0.0179, "mean_token_accuracy": 0.9944028556346893, "num_tokens": 338053979.0, "step": 3184 }, { "entropy": 0.9622909873723984, "epoch": 7.255416191562143, "grad_norm": 1.1015625, "learning_rate": 9.628259917934118e-07, "loss": 0.032, "mean_token_accuracy": 0.9907370805740356, "num_tokens": 338160275.0, "step": 3185 }, { "entropy": 0.9619220942258835, "epoch": 7.2576966932725195, "grad_norm": 1.2890625, "learning_rate": 9.613410861917661e-07, "loss": 0.0367, "mean_token_accuracy": 0.9875202625989914, "num_tokens": 338266502.0, "step": 3186 }, { "entropy": 0.9658884704113007, "epoch": 7.259977194982897, "grad_norm": 1.0078125, "learning_rate": 9.59857053903214e-07, "loss": 0.0319, "mean_token_accuracy": 0.989313930273056, "num_tokens": 338372850.0, "step": 3187 }, { "entropy": 0.9646973162889481, "epoch": 7.262257696693273, "grad_norm": 1.3984375, "learning_rate": 9.583738957700653e-07, "loss": 0.0316, "mean_token_accuracy": 0.9880107194185257, "num_tokens": 338479349.0, "step": 3188 }, { "entropy": 0.962135910987854, "epoch": 7.264538198403649, "grad_norm": 0.9765625, "learning_rate": 9.568916126341305e-07, "loss": 0.0305, "mean_token_accuracy": 0.9867422729730606, "num_tokens": 338585254.0, "step": 3189 }, { "entropy": 0.968188926577568, "epoch": 7.266818700114025, "grad_norm": 0.984375, "learning_rate": 9.554102053367253e-07, "loss": 0.0311, "mean_token_accuracy": 0.989124670624733, "num_tokens": 338691760.0, "step": 3190 }, { "entropy": 0.9630390405654907, "epoch": 7.269099201824401, "grad_norm": 1.234375, "learning_rate": 9.53929674718668e-07, "loss": 0.0355, "mean_token_accuracy": 0.9896323084831238, "num_tokens": 338797909.0, "step": 3191 }, { "entropy": 0.9600097388029099, "epoch": 7.271379703534778, "grad_norm": 1.3984375, "learning_rate": 9.524500216202795e-07, "loss": 0.0437, "mean_token_accuracy": 0.9858586937189102, "num_tokens": 338904245.0, "step": 3192 }, { "entropy": 0.961753249168396, "epoch": 7.273660205245154, "grad_norm": 1.1328125, "learning_rate": 9.50971246881382e-07, "loss": 0.0348, "mean_token_accuracy": 0.9886197149753571, "num_tokens": 339010280.0, "step": 3193 }, { "entropy": 0.9626025408506393, "epoch": 7.27594070695553, "grad_norm": 0.9921875, "learning_rate": 9.494933513413007e-07, "loss": 0.0374, "mean_token_accuracy": 0.9882416129112244, "num_tokens": 339115923.0, "step": 3194 }, { "entropy": 0.9640211462974548, "epoch": 7.278221208665906, "grad_norm": 1.03125, "learning_rate": 9.480163358388584e-07, "loss": 0.0314, "mean_token_accuracy": 0.9907292872667313, "num_tokens": 339222730.0, "step": 3195 }, { "entropy": 0.9594133943319321, "epoch": 7.280501710376283, "grad_norm": 1.0859375, "learning_rate": 9.465402012123818e-07, "loss": 0.0336, "mean_token_accuracy": 0.9909030646085739, "num_tokens": 339329155.0, "step": 3196 }, { "entropy": 0.9590036273002625, "epoch": 7.282782212086659, "grad_norm": 1.234375, "learning_rate": 9.45064948299696e-07, "loss": 0.0368, "mean_token_accuracy": 0.9886607527732849, "num_tokens": 339435332.0, "step": 3197 }, { "entropy": 0.9612480103969574, "epoch": 7.285062713797036, "grad_norm": 1.171875, "learning_rate": 9.435905779381265e-07, "loss": 0.0348, "mean_token_accuracy": 0.9889456629753113, "num_tokens": 339540782.0, "step": 3198 }, { "entropy": 0.9656928479671478, "epoch": 7.287343215507412, "grad_norm": 1.0703125, "learning_rate": 9.421170909644983e-07, "loss": 0.0278, "mean_token_accuracy": 0.9913957566022873, "num_tokens": 339647856.0, "step": 3199 }, { "entropy": 0.9610162973403931, "epoch": 7.289623717217788, "grad_norm": 1.234375, "learning_rate": 9.406444882151322e-07, "loss": 0.033, "mean_token_accuracy": 0.9906138181686401, "num_tokens": 339754570.0, "step": 3200 }, { "entropy": 0.9662210196256638, "epoch": 7.291904218928164, "grad_norm": 1.3984375, "learning_rate": 9.391727705258502e-07, "loss": 0.0312, "mean_token_accuracy": 0.9898383766412735, "num_tokens": 339861578.0, "step": 3201 }, { "entropy": 0.9669208079576492, "epoch": 7.29418472063854, "grad_norm": 1.234375, "learning_rate": 9.377019387319705e-07, "loss": 0.0289, "mean_token_accuracy": 0.9904228150844574, "num_tokens": 339967646.0, "step": 3202 }, { "entropy": 0.961816817522049, "epoch": 7.296465222348917, "grad_norm": 1.046875, "learning_rate": 9.362319936683092e-07, "loss": 0.029, "mean_token_accuracy": 0.9903467446565628, "num_tokens": 340073754.0, "step": 3203 }, { "entropy": 0.9641528576612473, "epoch": 7.298745724059293, "grad_norm": 1.0546875, "learning_rate": 9.347629361691795e-07, "loss": 0.0336, "mean_token_accuracy": 0.9897560328245163, "num_tokens": 340180238.0, "step": 3204 }, { "entropy": 0.9562805891036987, "epoch": 7.301026225769669, "grad_norm": 0.84375, "learning_rate": 9.332947670683882e-07, "loss": 0.0201, "mean_token_accuracy": 0.9948408156633377, "num_tokens": 340286449.0, "step": 3205 }, { "entropy": 0.9619142860174179, "epoch": 7.303306727480045, "grad_norm": 1.0390625, "learning_rate": 9.318274871992408e-07, "loss": 0.0276, "mean_token_accuracy": 0.9894209206104279, "num_tokens": 340393701.0, "step": 3206 }, { "entropy": 0.9667777121067047, "epoch": 7.305587229190422, "grad_norm": 1.0546875, "learning_rate": 9.303610973945376e-07, "loss": 0.0294, "mean_token_accuracy": 0.9889173060655594, "num_tokens": 340500549.0, "step": 3207 }, { "entropy": 0.9576525539159775, "epoch": 7.307867730900798, "grad_norm": 0.94921875, "learning_rate": 9.288955984865717e-07, "loss": 0.029, "mean_token_accuracy": 0.9904652684926987, "num_tokens": 340606691.0, "step": 3208 }, { "entropy": 0.9577326625585556, "epoch": 7.310148232611175, "grad_norm": 1.0234375, "learning_rate": 9.274309913071328e-07, "loss": 0.027, "mean_token_accuracy": 0.9900078028440475, "num_tokens": 340712475.0, "step": 3209 }, { "entropy": 0.9641618728637695, "epoch": 7.312428734321551, "grad_norm": 1.0859375, "learning_rate": 9.259672766875044e-07, "loss": 0.0378, "mean_token_accuracy": 0.9875775426626205, "num_tokens": 340818648.0, "step": 3210 }, { "entropy": 0.9639094024896622, "epoch": 7.314709236031927, "grad_norm": 0.9296875, "learning_rate": 9.245044554584609e-07, "loss": 0.0228, "mean_token_accuracy": 0.9932912886142731, "num_tokens": 340924904.0, "step": 3211 }, { "entropy": 0.9589708745479584, "epoch": 7.316989737742303, "grad_norm": 1.328125, "learning_rate": 9.230425284502725e-07, "loss": 0.0393, "mean_token_accuracy": 0.9896027594804764, "num_tokens": 341031251.0, "step": 3212 }, { "entropy": 0.965389147400856, "epoch": 7.319270239452679, "grad_norm": 1.21875, "learning_rate": 9.215814964927005e-07, "loss": 0.0308, "mean_token_accuracy": 0.98776775598526, "num_tokens": 341137760.0, "step": 3213 }, { "entropy": 0.9602669179439545, "epoch": 7.321550741163056, "grad_norm": 1.171875, "learning_rate": 9.201213604149989e-07, "loss": 0.0328, "mean_token_accuracy": 0.9882169663906097, "num_tokens": 341243331.0, "step": 3214 }, { "entropy": 0.9637339562177658, "epoch": 7.323831242873432, "grad_norm": 1.2109375, "learning_rate": 9.186621210459129e-07, "loss": 0.0401, "mean_token_accuracy": 0.9892592579126358, "num_tokens": 341349668.0, "step": 3215 }, { "entropy": 0.9596797376871109, "epoch": 7.326111744583809, "grad_norm": 1.1796875, "learning_rate": 9.172037792136773e-07, "loss": 0.0287, "mean_token_accuracy": 0.9903859198093414, "num_tokens": 341456590.0, "step": 3216 }, { "entropy": 0.9671107232570648, "epoch": 7.328392246294185, "grad_norm": 0.87109375, "learning_rate": 9.157463357460194e-07, "loss": 0.0144, "mean_token_accuracy": 0.9944679588079453, "num_tokens": 341562969.0, "step": 3217 }, { "entropy": 0.9632777273654938, "epoch": 7.330672748004561, "grad_norm": 1.1796875, "learning_rate": 9.142897914701565e-07, "loss": 0.0294, "mean_token_accuracy": 0.9893215596675873, "num_tokens": 341669746.0, "step": 3218 }, { "entropy": 0.9625654518604279, "epoch": 7.3329532497149374, "grad_norm": 1.1171875, "learning_rate": 9.128341472127944e-07, "loss": 0.03, "mean_token_accuracy": 0.990409716963768, "num_tokens": 341776234.0, "step": 3219 }, { "entropy": 0.9624585062265396, "epoch": 7.335233751425314, "grad_norm": 1.2421875, "learning_rate": 9.113794038001298e-07, "loss": 0.0402, "mean_token_accuracy": 0.9891536086797714, "num_tokens": 341882833.0, "step": 3220 }, { "entropy": 0.958036333322525, "epoch": 7.33751425313569, "grad_norm": 0.9609375, "learning_rate": 9.099255620578451e-07, "loss": 0.0249, "mean_token_accuracy": 0.9908037334680557, "num_tokens": 341989331.0, "step": 3221 }, { "entropy": 0.9676375836133957, "epoch": 7.339794754846066, "grad_norm": 1.046875, "learning_rate": 9.084726228111141e-07, "loss": 0.0296, "mean_token_accuracy": 0.9895757287740707, "num_tokens": 342095108.0, "step": 3222 }, { "entropy": 0.965529128909111, "epoch": 7.342075256556442, "grad_norm": 0.98046875, "learning_rate": 9.070205868845966e-07, "loss": 0.0287, "mean_token_accuracy": 0.9916051775217056, "num_tokens": 342201064.0, "step": 3223 }, { "entropy": 0.9588915705680847, "epoch": 7.344355758266818, "grad_norm": 1.0390625, "learning_rate": 9.055694551024402e-07, "loss": 0.034, "mean_token_accuracy": 0.9879566878080368, "num_tokens": 342307858.0, "step": 3224 }, { "entropy": 0.96271151304245, "epoch": 7.346636259977195, "grad_norm": 1.21875, "learning_rate": 9.041192282882796e-07, "loss": 0.0414, "mean_token_accuracy": 0.9875144362449646, "num_tokens": 342414160.0, "step": 3225 }, { "entropy": 0.9637976139783859, "epoch": 7.348916761687571, "grad_norm": 1.0234375, "learning_rate": 9.026699072652361e-07, "loss": 0.0262, "mean_token_accuracy": 0.9911189824342728, "num_tokens": 342520421.0, "step": 3226 }, { "entropy": 0.9616393744945526, "epoch": 7.351197263397948, "grad_norm": 0.921875, "learning_rate": 9.012214928559149e-07, "loss": 0.0285, "mean_token_accuracy": 0.9909164756536484, "num_tokens": 342627775.0, "step": 3227 }, { "entropy": 0.9613285511732101, "epoch": 7.353477765108324, "grad_norm": 1.0703125, "learning_rate": 8.997739858824083e-07, "loss": 0.0301, "mean_token_accuracy": 0.9904358685016632, "num_tokens": 342734038.0, "step": 3228 }, { "entropy": 0.9666408896446228, "epoch": 7.3557582668187, "grad_norm": 0.78125, "learning_rate": 8.983273871662951e-07, "loss": 0.0192, "mean_token_accuracy": 0.9948072284460068, "num_tokens": 342840837.0, "step": 3229 }, { "entropy": 0.9632183462381363, "epoch": 7.3580387685290765, "grad_norm": 1.046875, "learning_rate": 8.968816975286346e-07, "loss": 0.0343, "mean_token_accuracy": 0.987326368689537, "num_tokens": 342946923.0, "step": 3230 }, { "entropy": 0.9618057012557983, "epoch": 7.360319270239453, "grad_norm": 1.1640625, "learning_rate": 8.954369177899727e-07, "loss": 0.0481, "mean_token_accuracy": 0.9842702597379684, "num_tokens": 343053604.0, "step": 3231 }, { "entropy": 0.9615581035614014, "epoch": 7.362599771949829, "grad_norm": 1.1171875, "learning_rate": 8.939930487703402e-07, "loss": 0.0363, "mean_token_accuracy": 0.9888861328363419, "num_tokens": 343160245.0, "step": 3232 }, { "entropy": 0.9661037027835846, "epoch": 7.364880273660205, "grad_norm": 1.171875, "learning_rate": 8.925500912892471e-07, "loss": 0.0353, "mean_token_accuracy": 0.9873457551002502, "num_tokens": 343265964.0, "step": 3233 }, { "entropy": 0.9638389050960541, "epoch": 7.367160775370581, "grad_norm": 0.89453125, "learning_rate": 8.911080461656893e-07, "loss": 0.0265, "mean_token_accuracy": 0.9920042455196381, "num_tokens": 343371699.0, "step": 3234 }, { "entropy": 0.9648982733488083, "epoch": 7.369441277080957, "grad_norm": 1.140625, "learning_rate": 8.896669142181436e-07, "loss": 0.0315, "mean_token_accuracy": 0.9908507466316223, "num_tokens": 343478559.0, "step": 3235 }, { "entropy": 0.9625304192304611, "epoch": 7.3717217787913345, "grad_norm": 1.0078125, "learning_rate": 8.882266962645695e-07, "loss": 0.0317, "mean_token_accuracy": 0.9906633794307709, "num_tokens": 343585123.0, "step": 3236 }, { "entropy": 0.96079121530056, "epoch": 7.374002280501711, "grad_norm": 1.25, "learning_rate": 8.867873931224053e-07, "loss": 0.0339, "mean_token_accuracy": 0.9893804341554642, "num_tokens": 343691712.0, "step": 3237 }, { "entropy": 0.9657713621854782, "epoch": 7.376282782212087, "grad_norm": 1.2109375, "learning_rate": 8.853490056085723e-07, "loss": 0.0401, "mean_token_accuracy": 0.9866774082183838, "num_tokens": 343797874.0, "step": 3238 }, { "entropy": 0.9666145890951157, "epoch": 7.378563283922463, "grad_norm": 1.109375, "learning_rate": 8.839115345394716e-07, "loss": 0.0336, "mean_token_accuracy": 0.9897074699401855, "num_tokens": 343904098.0, "step": 3239 }, { "entropy": 0.9583929032087326, "epoch": 7.380843785632839, "grad_norm": 0.9609375, "learning_rate": 8.824749807309846e-07, "loss": 0.0322, "mean_token_accuracy": 0.9911994338035583, "num_tokens": 344010810.0, "step": 3240 }, { "entropy": 0.9635787606239319, "epoch": 7.3831242873432155, "grad_norm": 1.3671875, "learning_rate": 8.810393449984706e-07, "loss": 0.039, "mean_token_accuracy": 0.9876820296049118, "num_tokens": 344116787.0, "step": 3241 }, { "entropy": 0.9624970555305481, "epoch": 7.385404789053592, "grad_norm": 1.0625, "learning_rate": 8.7960462815677e-07, "loss": 0.03, "mean_token_accuracy": 0.9897054880857468, "num_tokens": 344222545.0, "step": 3242 }, { "entropy": 0.9591751396656036, "epoch": 7.387685290763968, "grad_norm": 0.78125, "learning_rate": 8.781708310201989e-07, "loss": 0.0177, "mean_token_accuracy": 0.9945176094770432, "num_tokens": 344329375.0, "step": 3243 }, { "entropy": 0.9621925055980682, "epoch": 7.389965792474344, "grad_norm": 1.3828125, "learning_rate": 8.767379544025531e-07, "loss": 0.0447, "mean_token_accuracy": 0.9852061420679092, "num_tokens": 344435402.0, "step": 3244 }, { "entropy": 0.9659378677606583, "epoch": 7.39224629418472, "grad_norm": 1.140625, "learning_rate": 8.753059991171065e-07, "loss": 0.0304, "mean_token_accuracy": 0.9889439940452576, "num_tokens": 344541194.0, "step": 3245 }, { "entropy": 0.9555743634700775, "epoch": 7.394526795895097, "grad_norm": 1.1015625, "learning_rate": 8.738749659766085e-07, "loss": 0.0344, "mean_token_accuracy": 0.9882573485374451, "num_tokens": 344647432.0, "step": 3246 }, { "entropy": 0.9632935076951981, "epoch": 7.3968072976054735, "grad_norm": 1.1484375, "learning_rate": 8.724448557932874e-07, "loss": 0.0402, "mean_token_accuracy": 0.9876084178686142, "num_tokens": 344754084.0, "step": 3247 }, { "entropy": 0.9589440077543259, "epoch": 7.39908779931585, "grad_norm": 1.09375, "learning_rate": 8.71015669378844e-07, "loss": 0.0345, "mean_token_accuracy": 0.9899665117263794, "num_tokens": 344860435.0, "step": 3248 }, { "entropy": 0.9589547365903854, "epoch": 7.401368301026226, "grad_norm": 1.2734375, "learning_rate": 8.69587407544458e-07, "loss": 0.0293, "mean_token_accuracy": 0.9912459403276443, "num_tokens": 344966291.0, "step": 3249 }, { "entropy": 0.9653130769729614, "epoch": 7.403648802736602, "grad_norm": 0.9296875, "learning_rate": 8.681600711007832e-07, "loss": 0.025, "mean_token_accuracy": 0.9924653619527817, "num_tokens": 345072140.0, "step": 3250 }, { "entropy": 0.9625548869371414, "epoch": 7.405929304446978, "grad_norm": 1.0546875, "learning_rate": 8.667336608579488e-07, "loss": 0.0248, "mean_token_accuracy": 0.9913966506719589, "num_tokens": 345178466.0, "step": 3251 }, { "entropy": 0.9658145904541016, "epoch": 7.4082098061573545, "grad_norm": 0.91796875, "learning_rate": 8.653081776255562e-07, "loss": 0.0269, "mean_token_accuracy": 0.9915157705545425, "num_tokens": 345285277.0, "step": 3252 }, { "entropy": 0.9583066403865814, "epoch": 7.410490307867731, "grad_norm": 1.09375, "learning_rate": 8.638836222126839e-07, "loss": 0.0273, "mean_token_accuracy": 0.9910849183797836, "num_tokens": 345392038.0, "step": 3253 }, { "entropy": 0.9604306370019913, "epoch": 7.412770809578107, "grad_norm": 1.125, "learning_rate": 8.624599954278803e-07, "loss": 0.0264, "mean_token_accuracy": 0.9931536167860031, "num_tokens": 345497882.0, "step": 3254 }, { "entropy": 0.9652940034866333, "epoch": 7.415051311288483, "grad_norm": 1.234375, "learning_rate": 8.610372980791695e-07, "loss": 0.0355, "mean_token_accuracy": 0.9873185753822327, "num_tokens": 345604265.0, "step": 3255 }, { "entropy": 0.9731392562389374, "epoch": 7.41733181299886, "grad_norm": 1.0703125, "learning_rate": 8.59615530974047e-07, "loss": 0.0305, "mean_token_accuracy": 0.9912173897027969, "num_tokens": 345711035.0, "step": 3256 }, { "entropy": 0.9642681330442429, "epoch": 7.419612314709236, "grad_norm": 1.3046875, "learning_rate": 8.581946949194802e-07, "loss": 0.0389, "mean_token_accuracy": 0.9892681986093521, "num_tokens": 345817102.0, "step": 3257 }, { "entropy": 0.9583187252283096, "epoch": 7.4218928164196125, "grad_norm": 0.91015625, "learning_rate": 8.56774790721909e-07, "loss": 0.0262, "mean_token_accuracy": 0.9903358668088913, "num_tokens": 345923743.0, "step": 3258 }, { "entropy": 0.9629640132188797, "epoch": 7.424173318129989, "grad_norm": 0.99609375, "learning_rate": 8.553558191872422e-07, "loss": 0.031, "mean_token_accuracy": 0.9906114488840103, "num_tokens": 346029893.0, "step": 3259 }, { "entropy": 0.9629476815462112, "epoch": 7.426453819840365, "grad_norm": 1.140625, "learning_rate": 8.539377811208613e-07, "loss": 0.0325, "mean_token_accuracy": 0.9883097112178802, "num_tokens": 346136275.0, "step": 3260 }, { "entropy": 0.9651861935853958, "epoch": 7.428734321550741, "grad_norm": 1.1171875, "learning_rate": 8.525206773276173e-07, "loss": 0.0362, "mean_token_accuracy": 0.9885865896940231, "num_tokens": 346242226.0, "step": 3261 }, { "entropy": 0.9675305783748627, "epoch": 7.431014823261117, "grad_norm": 1.125, "learning_rate": 8.511045086118311e-07, "loss": 0.0327, "mean_token_accuracy": 0.9900003522634506, "num_tokens": 346349015.0, "step": 3262 }, { "entropy": 0.9631283581256866, "epoch": 7.4332953249714935, "grad_norm": 0.93359375, "learning_rate": 8.496892757772934e-07, "loss": 0.0283, "mean_token_accuracy": 0.9890992641448975, "num_tokens": 346455839.0, "step": 3263 }, { "entropy": 0.9568487852811813, "epoch": 7.43557582668187, "grad_norm": 1.265625, "learning_rate": 8.482749796272613e-07, "loss": 0.028, "mean_token_accuracy": 0.9918234646320343, "num_tokens": 346561749.0, "step": 3264 }, { "entropy": 0.9606803506612778, "epoch": 7.437856328392247, "grad_norm": 0.8515625, "learning_rate": 8.468616209644634e-07, "loss": 0.0243, "mean_token_accuracy": 0.9926712214946747, "num_tokens": 346667719.0, "step": 3265 }, { "entropy": 0.9638821631669998, "epoch": 7.440136830102623, "grad_norm": 1.2109375, "learning_rate": 8.454492005910942e-07, "loss": 0.0299, "mean_token_accuracy": 0.9884892404079437, "num_tokens": 346773873.0, "step": 3266 }, { "entropy": 0.9662217050790787, "epoch": 7.442417331812999, "grad_norm": 1.0546875, "learning_rate": 8.440377193088162e-07, "loss": 0.0251, "mean_token_accuracy": 0.9904858469963074, "num_tokens": 346880452.0, "step": 3267 }, { "entropy": 0.9649851769208908, "epoch": 7.444697833523375, "grad_norm": 1.09375, "learning_rate": 8.426271779187592e-07, "loss": 0.0283, "mean_token_accuracy": 0.9909785389900208, "num_tokens": 346987242.0, "step": 3268 }, { "entropy": 0.960934579372406, "epoch": 7.4469783352337515, "grad_norm": 1.0546875, "learning_rate": 8.4121757722152e-07, "loss": 0.0279, "mean_token_accuracy": 0.9904540032148361, "num_tokens": 347093515.0, "step": 3269 }, { "entropy": 0.9610684216022491, "epoch": 7.449258836944128, "grad_norm": 1.0859375, "learning_rate": 8.398089180171592e-07, "loss": 0.0264, "mean_token_accuracy": 0.9925408214330673, "num_tokens": 347199843.0, "step": 3270 }, { "entropy": 0.963872641324997, "epoch": 7.451539338654504, "grad_norm": 1.015625, "learning_rate": 8.384012011052053e-07, "loss": 0.0311, "mean_token_accuracy": 0.9899158179759979, "num_tokens": 347306730.0, "step": 3271 }, { "entropy": 0.9570972919464111, "epoch": 7.45381984036488, "grad_norm": 0.91796875, "learning_rate": 8.369944272846522e-07, "loss": 0.0274, "mean_token_accuracy": 0.9900691658258438, "num_tokens": 347413131.0, "step": 3272 }, { "entropy": 0.9682578295469284, "epoch": 7.456100342075256, "grad_norm": 0.91015625, "learning_rate": 8.355885973539557e-07, "loss": 0.0236, "mean_token_accuracy": 0.9934525787830353, "num_tokens": 347519343.0, "step": 3273 }, { "entropy": 0.9642670154571533, "epoch": 7.4583808437856325, "grad_norm": 0.99609375, "learning_rate": 8.341837121110386e-07, "loss": 0.0291, "mean_token_accuracy": 0.9908696115016937, "num_tokens": 347626005.0, "step": 3274 }, { "entropy": 0.9614923894405365, "epoch": 7.460661345496009, "grad_norm": 0.9453125, "learning_rate": 8.327797723532874e-07, "loss": 0.0267, "mean_token_accuracy": 0.992155060172081, "num_tokens": 347732738.0, "step": 3275 }, { "entropy": 0.9621039181947708, "epoch": 7.462941847206386, "grad_norm": 1.03125, "learning_rate": 8.313767788775498e-07, "loss": 0.0278, "mean_token_accuracy": 0.9911240786314011, "num_tokens": 347838740.0, "step": 3276 }, { "entropy": 0.9692533314228058, "epoch": 7.465222348916762, "grad_norm": 1.0703125, "learning_rate": 8.299747324801385e-07, "loss": 0.0294, "mean_token_accuracy": 0.990643247961998, "num_tokens": 347944724.0, "step": 3277 }, { "entropy": 0.9625534117221832, "epoch": 7.467502850627138, "grad_norm": 1.4375, "learning_rate": 8.285736339568279e-07, "loss": 0.0403, "mean_token_accuracy": 0.9872795045375824, "num_tokens": 348050793.0, "step": 3278 }, { "entropy": 0.9662588685750961, "epoch": 7.469783352337514, "grad_norm": 1.0078125, "learning_rate": 8.271734841028553e-07, "loss": 0.0262, "mean_token_accuracy": 0.9905569702386856, "num_tokens": 348157043.0, "step": 3279 }, { "entropy": 0.9585789740085602, "epoch": 7.4720638540478905, "grad_norm": 1.15625, "learning_rate": 8.25774283712917e-07, "loss": 0.0339, "mean_token_accuracy": 0.9878026843070984, "num_tokens": 348264089.0, "step": 3280 }, { "entropy": 0.962031826376915, "epoch": 7.474344355758267, "grad_norm": 1.1640625, "learning_rate": 8.243760335811734e-07, "loss": 0.0345, "mean_token_accuracy": 0.989545151591301, "num_tokens": 348369898.0, "step": 3281 }, { "entropy": 0.963308796286583, "epoch": 7.476624857468643, "grad_norm": 0.9921875, "learning_rate": 8.229787345012439e-07, "loss": 0.0277, "mean_token_accuracy": 0.9911552518606186, "num_tokens": 348476176.0, "step": 3282 }, { "entropy": 0.9680903255939484, "epoch": 7.478905359179019, "grad_norm": 1.1484375, "learning_rate": 8.215823872662084e-07, "loss": 0.0359, "mean_token_accuracy": 0.9873465001583099, "num_tokens": 348582891.0, "step": 3283 }, { "entropy": 0.9577609151601791, "epoch": 7.481185860889395, "grad_norm": 1.140625, "learning_rate": 8.201869926686068e-07, "loss": 0.029, "mean_token_accuracy": 0.9895633906126022, "num_tokens": 348689336.0, "step": 3284 }, { "entropy": 0.9607789516448975, "epoch": 7.483466362599772, "grad_norm": 0.8515625, "learning_rate": 8.187925515004391e-07, "loss": 0.0224, "mean_token_accuracy": 0.9920418858528137, "num_tokens": 348795467.0, "step": 3285 }, { "entropy": 0.9620175808668137, "epoch": 7.485746864310149, "grad_norm": 1.015625, "learning_rate": 8.173990645531612e-07, "loss": 0.0332, "mean_token_accuracy": 0.9878107905387878, "num_tokens": 348901985.0, "step": 3286 }, { "entropy": 0.9602104425430298, "epoch": 7.488027366020525, "grad_norm": 1.046875, "learning_rate": 8.160065326176905e-07, "loss": 0.0306, "mean_token_accuracy": 0.9905338734388351, "num_tokens": 349008018.0, "step": 3287 }, { "entropy": 0.9640917032957077, "epoch": 7.490307867730901, "grad_norm": 1.03125, "learning_rate": 8.14614956484401e-07, "loss": 0.0266, "mean_token_accuracy": 0.9918480515480042, "num_tokens": 349114164.0, "step": 3288 }, { "entropy": 0.9602045714855194, "epoch": 7.492588369441277, "grad_norm": 1.0, "learning_rate": 8.132243369431248e-07, "loss": 0.0251, "mean_token_accuracy": 0.9918745458126068, "num_tokens": 349221404.0, "step": 3289 }, { "entropy": 0.958023339509964, "epoch": 7.494868871151653, "grad_norm": 1.140625, "learning_rate": 8.11834674783151e-07, "loss": 0.0358, "mean_token_accuracy": 0.9905013740062714, "num_tokens": 349327563.0, "step": 3290 }, { "entropy": 0.9618607312440872, "epoch": 7.4971493728620295, "grad_norm": 0.953125, "learning_rate": 8.104459707932238e-07, "loss": 0.0221, "mean_token_accuracy": 0.9923439025878906, "num_tokens": 349433775.0, "step": 3291 }, { "entropy": 0.9578762203454971, "epoch": 7.499429874572406, "grad_norm": 1.109375, "learning_rate": 8.090582257615456e-07, "loss": 0.0291, "mean_token_accuracy": 0.9908979684114456, "num_tokens": 349539974.0, "step": 3292 }, { "entropy": 0.9590502679347992, "epoch": 7.501710376282782, "grad_norm": 1.0546875, "learning_rate": 8.076714404757735e-07, "loss": 0.0257, "mean_token_accuracy": 0.9909030646085739, "num_tokens": 349646217.0, "step": 3293 }, { "entropy": 0.9622270464897156, "epoch": 7.503990877993158, "grad_norm": 1.2109375, "learning_rate": 8.062856157230209e-07, "loss": 0.0351, "mean_token_accuracy": 0.9906326532363892, "num_tokens": 349752260.0, "step": 3294 }, { "entropy": 0.9630294889211655, "epoch": 7.506271379703534, "grad_norm": 1.1484375, "learning_rate": 8.049007522898536e-07, "loss": 0.031, "mean_token_accuracy": 0.9898758977651596, "num_tokens": 349858428.0, "step": 3295 }, { "entropy": 0.9595257192850113, "epoch": 7.508551881413911, "grad_norm": 1.0859375, "learning_rate": 8.035168509622948e-07, "loss": 0.0236, "mean_token_accuracy": 0.9915540814399719, "num_tokens": 349964172.0, "step": 3296 }, { "entropy": 0.9595539420843124, "epoch": 7.510832383124288, "grad_norm": 1.203125, "learning_rate": 8.02133912525819e-07, "loss": 0.0378, "mean_token_accuracy": 0.9883602261543274, "num_tokens": 350071632.0, "step": 3297 }, { "entropy": 0.9605029970407486, "epoch": 7.513112884834664, "grad_norm": 0.91796875, "learning_rate": 8.007519377653558e-07, "loss": 0.0228, "mean_token_accuracy": 0.9924155175685883, "num_tokens": 350177262.0, "step": 3298 }, { "entropy": 0.9598042964935303, "epoch": 7.51539338654504, "grad_norm": 1.03125, "learning_rate": 7.993709274652872e-07, "loss": 0.0216, "mean_token_accuracy": 0.9934648275375366, "num_tokens": 350282887.0, "step": 3299 }, { "entropy": 0.9575364291667938, "epoch": 7.517673888255416, "grad_norm": 1.4375, "learning_rate": 7.979908824094484e-07, "loss": 0.0289, "mean_token_accuracy": 0.9915490001440048, "num_tokens": 350390088.0, "step": 3300 }, { "epoch": 7.517673888255416, "eval_entropy": 0.962328108771219, "eval_loss": 0.03768106549978256, "eval_mean_token_accuracy": 0.9885065492115094, "eval_num_tokens": 350390088.0, "eval_runtime": 66.0883, "eval_samples_per_second": 126.876, "eval_steps_per_second": 3.98, "step": 3300 }, { "entropy": 0.9597743004560471, "epoch": 7.519954389965792, "grad_norm": 0.94140625, "learning_rate": 7.966118033811271e-07, "loss": 0.0303, "mean_token_accuracy": 0.9899584800004959, "num_tokens": 350496911.0, "step": 3301 }, { "entropy": 0.9607065767049789, "epoch": 7.5222348916761685, "grad_norm": 1.203125, "learning_rate": 7.952336911630604e-07, "loss": 0.0404, "mean_token_accuracy": 0.9883126020431519, "num_tokens": 350603544.0, "step": 3302 }, { "entropy": 0.967927947640419, "epoch": 7.524515393386545, "grad_norm": 1.1796875, "learning_rate": 7.938565465374384e-07, "loss": 0.0247, "mean_token_accuracy": 0.9919332712888718, "num_tokens": 350710227.0, "step": 3303 }, { "entropy": 0.9652207791805267, "epoch": 7.526795895096921, "grad_norm": 1.109375, "learning_rate": 7.924803702859024e-07, "loss": 0.0354, "mean_token_accuracy": 0.9878214448690414, "num_tokens": 350816575.0, "step": 3304 }, { "entropy": 0.9580594897270203, "epoch": 7.529076396807298, "grad_norm": 1.2734375, "learning_rate": 7.911051631895433e-07, "loss": 0.0339, "mean_token_accuracy": 0.9901973009109497, "num_tokens": 350922904.0, "step": 3305 }, { "entropy": 0.9570382833480835, "epoch": 7.531356898517674, "grad_norm": 1.0078125, "learning_rate": 7.897309260289027e-07, "loss": 0.0291, "mean_token_accuracy": 0.9922534674406052, "num_tokens": 351028929.0, "step": 3306 }, { "entropy": 0.9628818780183792, "epoch": 7.53363740022805, "grad_norm": 1.1953125, "learning_rate": 7.883576595839698e-07, "loss": 0.0302, "mean_token_accuracy": 0.9904515594244003, "num_tokens": 351135292.0, "step": 3307 }, { "entropy": 0.9593137502670288, "epoch": 7.535917901938427, "grad_norm": 1.109375, "learning_rate": 7.869853646341849e-07, "loss": 0.0296, "mean_token_accuracy": 0.9919679164886475, "num_tokens": 351240664.0, "step": 3308 }, { "entropy": 0.9633283466100693, "epoch": 7.538198403648803, "grad_norm": 1.3359375, "learning_rate": 7.856140419584357e-07, "loss": 0.0421, "mean_token_accuracy": 0.9870536625385284, "num_tokens": 351346828.0, "step": 3309 }, { "entropy": 0.9664277136325836, "epoch": 7.540478905359179, "grad_norm": 1.1640625, "learning_rate": 7.842436923350591e-07, "loss": 0.0285, "mean_token_accuracy": 0.9899810999631882, "num_tokens": 351453414.0, "step": 3310 }, { "entropy": 0.9555269777774811, "epoch": 7.542759407069555, "grad_norm": 1.1796875, "learning_rate": 7.828743165418393e-07, "loss": 0.0352, "mean_token_accuracy": 0.9898563027381897, "num_tokens": 351559353.0, "step": 3311 }, { "entropy": 0.9597863405942917, "epoch": 7.545039908779931, "grad_norm": 1.1015625, "learning_rate": 7.815059153560065e-07, "loss": 0.0316, "mean_token_accuracy": 0.9897311180830002, "num_tokens": 351666067.0, "step": 3312 }, { "entropy": 0.9580080509185791, "epoch": 7.5473204104903076, "grad_norm": 1.03125, "learning_rate": 7.801384895542391e-07, "loss": 0.0284, "mean_token_accuracy": 0.991192102432251, "num_tokens": 351772564.0, "step": 3313 }, { "entropy": 0.9597525894641876, "epoch": 7.549600912200685, "grad_norm": 1.1640625, "learning_rate": 7.78772039912662e-07, "loss": 0.0287, "mean_token_accuracy": 0.9894005209207535, "num_tokens": 351878987.0, "step": 3314 }, { "entropy": 0.9601787030696869, "epoch": 7.55188141391106, "grad_norm": 1.1484375, "learning_rate": 7.774065672068463e-07, "loss": 0.0252, "mean_token_accuracy": 0.9909292012453079, "num_tokens": 351985233.0, "step": 3315 }, { "entropy": 0.9614244550466537, "epoch": 7.554161915621437, "grad_norm": 0.94921875, "learning_rate": 7.760420722118059e-07, "loss": 0.0357, "mean_token_accuracy": 0.9889156669378281, "num_tokens": 352091551.0, "step": 3316 }, { "entropy": 0.9605825692415237, "epoch": 7.556442417331813, "grad_norm": 1.0, "learning_rate": 7.746785557020034e-07, "loss": 0.0329, "mean_token_accuracy": 0.9908395707607269, "num_tokens": 352198584.0, "step": 3317 }, { "entropy": 0.9628136307001114, "epoch": 7.558722919042189, "grad_norm": 1.1015625, "learning_rate": 7.733160184513447e-07, "loss": 0.0267, "mean_token_accuracy": 0.9907524585723877, "num_tokens": 352304545.0, "step": 3318 }, { "entropy": 0.9659246206283569, "epoch": 7.561003420752566, "grad_norm": 1.0546875, "learning_rate": 7.719544612331781e-07, "loss": 0.0222, "mean_token_accuracy": 0.9928753226995468, "num_tokens": 352410780.0, "step": 3319 }, { "entropy": 0.9676763117313385, "epoch": 7.563283922462942, "grad_norm": 1.03125, "learning_rate": 7.705938848202985e-07, "loss": 0.0236, "mean_token_accuracy": 0.9920396059751511, "num_tokens": 352516564.0, "step": 3320 }, { "entropy": 0.9646372050046921, "epoch": 7.565564424173318, "grad_norm": 0.91796875, "learning_rate": 7.692342899849419e-07, "loss": 0.031, "mean_token_accuracy": 0.9895811676979065, "num_tokens": 352623232.0, "step": 3321 }, { "entropy": 0.9690884649753571, "epoch": 7.567844925883694, "grad_norm": 1.0546875, "learning_rate": 7.678756774987897e-07, "loss": 0.0317, "mean_token_accuracy": 0.9915627688169479, "num_tokens": 352729774.0, "step": 3322 }, { "entropy": 0.9643706232309341, "epoch": 7.57012542759407, "grad_norm": 1.1875, "learning_rate": 7.665180481329621e-07, "loss": 0.0314, "mean_token_accuracy": 0.9898267239332199, "num_tokens": 352835763.0, "step": 3323 }, { "entropy": 0.9603226333856583, "epoch": 7.572405929304447, "grad_norm": 0.95703125, "learning_rate": 7.651614026580243e-07, "loss": 0.0213, "mean_token_accuracy": 0.9923427253961563, "num_tokens": 352941559.0, "step": 3324 }, { "entropy": 0.960545152425766, "epoch": 7.574686431014824, "grad_norm": 1.1875, "learning_rate": 7.638057418439818e-07, "loss": 0.0339, "mean_token_accuracy": 0.9900741130113602, "num_tokens": 353047947.0, "step": 3325 }, { "entropy": 0.9628134667873383, "epoch": 7.5769669327252, "grad_norm": 0.9375, "learning_rate": 7.624510664602819e-07, "loss": 0.0251, "mean_token_accuracy": 0.9901962429285049, "num_tokens": 353154193.0, "step": 3326 }, { "entropy": 0.9656583666801453, "epoch": 7.579247434435576, "grad_norm": 0.8984375, "learning_rate": 7.610973772758118e-07, "loss": 0.022, "mean_token_accuracy": 0.9928813874721527, "num_tokens": 353260009.0, "step": 3327 }, { "entropy": 0.957812562584877, "epoch": 7.581527936145952, "grad_norm": 1.0234375, "learning_rate": 7.597446750589005e-07, "loss": 0.0284, "mean_token_accuracy": 0.990305945277214, "num_tokens": 353365927.0, "step": 3328 }, { "entropy": 0.9664168655872345, "epoch": 7.583808437856328, "grad_norm": 1.0625, "learning_rate": 7.583929605773138e-07, "loss": 0.0306, "mean_token_accuracy": 0.9910014271736145, "num_tokens": 353472061.0, "step": 3329 }, { "entropy": 0.9569805264472961, "epoch": 7.586088939566705, "grad_norm": 1.046875, "learning_rate": 7.570422345982598e-07, "loss": 0.0331, "mean_token_accuracy": 0.9898039549589157, "num_tokens": 353578474.0, "step": 3330 }, { "entropy": 0.9586918503046036, "epoch": 7.588369441277081, "grad_norm": 1.1328125, "learning_rate": 7.556924978883843e-07, "loss": 0.0327, "mean_token_accuracy": 0.988898754119873, "num_tokens": 353684483.0, "step": 3331 }, { "entropy": 0.9521706849336624, "epoch": 7.590649942987457, "grad_norm": 1.2421875, "learning_rate": 7.543437512137717e-07, "loss": 0.0361, "mean_token_accuracy": 0.9891842752695084, "num_tokens": 353790613.0, "step": 3332 }, { "entropy": 0.9596735090017319, "epoch": 7.592930444697833, "grad_norm": 1.0078125, "learning_rate": 7.529959953399455e-07, "loss": 0.0272, "mean_token_accuracy": 0.990502342581749, "num_tokens": 353897583.0, "step": 3333 }, { "entropy": 0.9623324275016785, "epoch": 7.59521094640821, "grad_norm": 1.1875, "learning_rate": 7.516492310318643e-07, "loss": 0.0286, "mean_token_accuracy": 0.9908248037099838, "num_tokens": 354003392.0, "step": 3334 }, { "entropy": 0.9627418965101242, "epoch": 7.5974914481185865, "grad_norm": 1.125, "learning_rate": 7.503034590539266e-07, "loss": 0.0295, "mean_token_accuracy": 0.9909277409315109, "num_tokens": 354109356.0, "step": 3335 }, { "entropy": 0.9664704352617264, "epoch": 7.599771949828963, "grad_norm": 1.1953125, "learning_rate": 7.489586801699661e-07, "loss": 0.0218, "mean_token_accuracy": 0.9927282929420471, "num_tokens": 354215342.0, "step": 3336 }, { "entropy": 0.962530329823494, "epoch": 7.602052451539339, "grad_norm": 0.9609375, "learning_rate": 7.476148951432543e-07, "loss": 0.0249, "mean_token_accuracy": 0.9920442998409271, "num_tokens": 354321408.0, "step": 3337 }, { "entropy": 0.9654635787010193, "epoch": 7.604332953249715, "grad_norm": 1.375, "learning_rate": 7.462721047364965e-07, "loss": 0.0424, "mean_token_accuracy": 0.9842024147510529, "num_tokens": 354428108.0, "step": 3338 }, { "entropy": 0.9584110081195831, "epoch": 7.606613454960091, "grad_norm": 1.078125, "learning_rate": 7.449303097118355e-07, "loss": 0.032, "mean_token_accuracy": 0.990354910492897, "num_tokens": 354534450.0, "step": 3339 }, { "entropy": 0.9661747515201569, "epoch": 7.608893956670467, "grad_norm": 1.4765625, "learning_rate": 7.435895108308472e-07, "loss": 0.0416, "mean_token_accuracy": 0.9865084886550903, "num_tokens": 354641331.0, "step": 3340 }, { "entropy": 0.9655898809432983, "epoch": 7.611174458380844, "grad_norm": 0.9765625, "learning_rate": 7.422497088545436e-07, "loss": 0.03, "mean_token_accuracy": 0.9913294911384583, "num_tokens": 354747739.0, "step": 3341 }, { "entropy": 0.9605299979448318, "epoch": 7.61345496009122, "grad_norm": 1.078125, "learning_rate": 7.409109045433704e-07, "loss": 0.0301, "mean_token_accuracy": 0.9890244156122208, "num_tokens": 354854124.0, "step": 3342 }, { "entropy": 0.9679581820964813, "epoch": 7.615735461801596, "grad_norm": 1.234375, "learning_rate": 7.395730986572075e-07, "loss": 0.0363, "mean_token_accuracy": 0.9862634390592575, "num_tokens": 354960087.0, "step": 3343 }, { "entropy": 0.9614656716585159, "epoch": 7.618015963511972, "grad_norm": 1.1171875, "learning_rate": 7.382362919553682e-07, "loss": 0.0315, "mean_token_accuracy": 0.9897311180830002, "num_tokens": 355066655.0, "step": 3344 }, { "entropy": 0.9627726525068283, "epoch": 7.620296465222349, "grad_norm": 1.1953125, "learning_rate": 7.369004851965966e-07, "loss": 0.0328, "mean_token_accuracy": 0.9901752173900604, "num_tokens": 355173490.0, "step": 3345 }, { "entropy": 0.9614834487438202, "epoch": 7.6225769669327255, "grad_norm": 1.15625, "learning_rate": 7.355656791390717e-07, "loss": 0.0302, "mean_token_accuracy": 0.9880622327327728, "num_tokens": 355280304.0, "step": 3346 }, { "entropy": 0.961851105093956, "epoch": 7.624857468643102, "grad_norm": 1.125, "learning_rate": 7.342318745404034e-07, "loss": 0.0309, "mean_token_accuracy": 0.9909657090902328, "num_tokens": 355386947.0, "step": 3347 }, { "entropy": 0.9596419781446457, "epoch": 7.627137970353478, "grad_norm": 1.125, "learning_rate": 7.32899072157634e-07, "loss": 0.0298, "mean_token_accuracy": 0.9884453266859055, "num_tokens": 355493221.0, "step": 3348 }, { "entropy": 0.9638085663318634, "epoch": 7.629418472063854, "grad_norm": 1.0, "learning_rate": 7.315672727472365e-07, "loss": 0.0291, "mean_token_accuracy": 0.9919782429933548, "num_tokens": 355599966.0, "step": 3349 }, { "entropy": 0.9587162733078003, "epoch": 7.63169897377423, "grad_norm": 0.83203125, "learning_rate": 7.302364770651132e-07, "loss": 0.0223, "mean_token_accuracy": 0.9936075806617737, "num_tokens": 355706170.0, "step": 3350 }, { "entropy": 0.9611505717039108, "epoch": 7.633979475484606, "grad_norm": 1.0625, "learning_rate": 7.289066858665991e-07, "loss": 0.025, "mean_token_accuracy": 0.9912626594305038, "num_tokens": 355812782.0, "step": 3351 }, { "entropy": 0.9575662612915039, "epoch": 7.636259977194983, "grad_norm": 1.3046875, "learning_rate": 7.275778999064578e-07, "loss": 0.0381, "mean_token_accuracy": 0.9870745837688446, "num_tokens": 355919298.0, "step": 3352 }, { "entropy": 0.9652052372694016, "epoch": 7.638540478905359, "grad_norm": 1.3515625, "learning_rate": 7.262501199388827e-07, "loss": 0.0323, "mean_token_accuracy": 0.9900399744510651, "num_tokens": 356025684.0, "step": 3353 }, { "entropy": 0.9642768502235413, "epoch": 7.640820980615736, "grad_norm": 1.0, "learning_rate": 7.249233467174965e-07, "loss": 0.0306, "mean_token_accuracy": 0.9904734790325165, "num_tokens": 356131315.0, "step": 3354 }, { "entropy": 0.9658307880163193, "epoch": 7.643101482326112, "grad_norm": 1.0546875, "learning_rate": 7.235975809953491e-07, "loss": 0.0241, "mean_token_accuracy": 0.9918230324983597, "num_tokens": 356237869.0, "step": 3355 }, { "entropy": 0.963831290602684, "epoch": 7.645381984036488, "grad_norm": 0.96875, "learning_rate": 7.222728235249196e-07, "loss": 0.0271, "mean_token_accuracy": 0.9912558943033218, "num_tokens": 356344294.0, "step": 3356 }, { "entropy": 0.9683090448379517, "epoch": 7.6476624857468645, "grad_norm": 0.85546875, "learning_rate": 7.209490750581152e-07, "loss": 0.0245, "mean_token_accuracy": 0.992047131061554, "num_tokens": 356450525.0, "step": 3357 }, { "entropy": 0.9682479053735733, "epoch": 7.649942987457241, "grad_norm": 1.1328125, "learning_rate": 7.196263363462699e-07, "loss": 0.032, "mean_token_accuracy": 0.990376889705658, "num_tokens": 356556695.0, "step": 3358 }, { "entropy": 0.9557538032531738, "epoch": 7.652223489167617, "grad_norm": 1.0390625, "learning_rate": 7.183046081401454e-07, "loss": 0.0284, "mean_token_accuracy": 0.9900489747524261, "num_tokens": 356662786.0, "step": 3359 }, { "entropy": 0.9593320488929749, "epoch": 7.654503990877993, "grad_norm": 1.265625, "learning_rate": 7.169838911899276e-07, "loss": 0.0402, "mean_token_accuracy": 0.9878757745027542, "num_tokens": 356769330.0, "step": 3360 }, { "entropy": 0.9597384184598923, "epoch": 7.656784492588369, "grad_norm": 1.140625, "learning_rate": 7.156641862452316e-07, "loss": 0.0345, "mean_token_accuracy": 0.9889975190162659, "num_tokens": 356875162.0, "step": 3361 }, { "entropy": 0.9698283076286316, "epoch": 7.659064994298745, "grad_norm": 1.140625, "learning_rate": 7.143454940550948e-07, "loss": 0.0296, "mean_token_accuracy": 0.9890242964029312, "num_tokens": 356981609.0, "step": 3362 }, { "entropy": 0.9591280966997147, "epoch": 7.661345496009122, "grad_norm": 1.25, "learning_rate": 7.13027815367982e-07, "loss": 0.0353, "mean_token_accuracy": 0.9905695170164108, "num_tokens": 357087652.0, "step": 3363 }, { "entropy": 0.9598115980625153, "epoch": 7.663625997719498, "grad_norm": 1.0546875, "learning_rate": 7.117111509317823e-07, "loss": 0.0268, "mean_token_accuracy": 0.9916057884693146, "num_tokens": 357193919.0, "step": 3364 }, { "entropy": 0.960614487528801, "epoch": 7.665906499429875, "grad_norm": 1.1171875, "learning_rate": 7.103955014938099e-07, "loss": 0.0325, "mean_token_accuracy": 0.9902804344892502, "num_tokens": 357300188.0, "step": 3365 }, { "entropy": 0.9603085964918137, "epoch": 7.668187001140251, "grad_norm": 1.2734375, "learning_rate": 7.090808678008005e-07, "loss": 0.0322, "mean_token_accuracy": 0.9889573007822037, "num_tokens": 357405875.0, "step": 3366 }, { "entropy": 0.95644311606884, "epoch": 7.670467502850627, "grad_norm": 1.03125, "learning_rate": 7.077672505989155e-07, "loss": 0.0389, "mean_token_accuracy": 0.9897620975971222, "num_tokens": 357512544.0, "step": 3367 }, { "entropy": 0.9605515599250793, "epoch": 7.6727480045610035, "grad_norm": 0.984375, "learning_rate": 7.064546506337386e-07, "loss": 0.0308, "mean_token_accuracy": 0.9894585460424423, "num_tokens": 357619004.0, "step": 3368 }, { "entropy": 0.9591377079486847, "epoch": 7.67502850627138, "grad_norm": 1.1953125, "learning_rate": 7.051430686502764e-07, "loss": 0.0355, "mean_token_accuracy": 0.9881268292665482, "num_tokens": 357726118.0, "step": 3369 }, { "entropy": 0.9666549116373062, "epoch": 7.677309007981756, "grad_norm": 1.1796875, "learning_rate": 7.038325053929582e-07, "loss": 0.0292, "mean_token_accuracy": 0.9901652038097382, "num_tokens": 357832498.0, "step": 3370 }, { "entropy": 0.966222882270813, "epoch": 7.679589509692132, "grad_norm": 0.98828125, "learning_rate": 7.025229616056326e-07, "loss": 0.0239, "mean_token_accuracy": 0.9908955246210098, "num_tokens": 357938353.0, "step": 3371 }, { "entropy": 0.9578788429498672, "epoch": 7.681870011402508, "grad_norm": 1.1328125, "learning_rate": 7.012144380315724e-07, "loss": 0.0348, "mean_token_accuracy": 0.9892971068620682, "num_tokens": 358044458.0, "step": 3372 }, { "entropy": 0.9612467885017395, "epoch": 7.684150513112884, "grad_norm": 1.1328125, "learning_rate": 6.999069354134703e-07, "loss": 0.0269, "mean_token_accuracy": 0.9910269379615784, "num_tokens": 358151641.0, "step": 3373 }, { "entropy": 0.9605563431978226, "epoch": 7.6864310148232615, "grad_norm": 1.0234375, "learning_rate": 6.986004544934394e-07, "loss": 0.0312, "mean_token_accuracy": 0.9878598004579544, "num_tokens": 358257804.0, "step": 3374 }, { "entropy": 0.9630914926528931, "epoch": 7.688711516533638, "grad_norm": 1.015625, "learning_rate": 6.972949960130135e-07, "loss": 0.0333, "mean_token_accuracy": 0.9896815121173859, "num_tokens": 358364121.0, "step": 3375 }, { "entropy": 0.9602832347154617, "epoch": 7.690992018244014, "grad_norm": 0.90625, "learning_rate": 6.959905607131457e-07, "loss": 0.0221, "mean_token_accuracy": 0.9938082098960876, "num_tokens": 358470053.0, "step": 3376 }, { "entropy": 0.9647484719753265, "epoch": 7.69327251995439, "grad_norm": 1.2421875, "learning_rate": 6.946871493342072e-07, "loss": 0.0381, "mean_token_accuracy": 0.9868567436933517, "num_tokens": 358576531.0, "step": 3377 }, { "entropy": 0.9615330249071121, "epoch": 7.695553021664766, "grad_norm": 1.109375, "learning_rate": 6.933847626159898e-07, "loss": 0.0272, "mean_token_accuracy": 0.9902502000331879, "num_tokens": 358682981.0, "step": 3378 }, { "entropy": 0.9661427736282349, "epoch": 7.6978335233751425, "grad_norm": 1.234375, "learning_rate": 6.920834012977032e-07, "loss": 0.0373, "mean_token_accuracy": 0.9897639900445938, "num_tokens": 358789039.0, "step": 3379 }, { "entropy": 0.9603958874940872, "epoch": 7.700114025085519, "grad_norm": 0.77734375, "learning_rate": 6.907830661179757e-07, "loss": 0.0323, "mean_token_accuracy": 0.9903692901134491, "num_tokens": 358895016.0, "step": 3380 }, { "entropy": 0.9653270691633224, "epoch": 7.702394526795895, "grad_norm": 0.83984375, "learning_rate": 6.894837578148505e-07, "loss": 0.0259, "mean_token_accuracy": 0.9926818758249283, "num_tokens": 359002150.0, "step": 3381 }, { "entropy": 0.9558330625295639, "epoch": 7.704675028506271, "grad_norm": 0.98046875, "learning_rate": 6.881854771257912e-07, "loss": 0.032, "mean_token_accuracy": 0.9901390075683594, "num_tokens": 359108308.0, "step": 3382 }, { "entropy": 0.9608840495347977, "epoch": 7.706955530216648, "grad_norm": 1.203125, "learning_rate": 6.868882247876776e-07, "loss": 0.0356, "mean_token_accuracy": 0.9870330691337585, "num_tokens": 359214240.0, "step": 3383 }, { "entropy": 0.9696924686431885, "epoch": 7.7092360319270234, "grad_norm": 1.0390625, "learning_rate": 6.855920015368032e-07, "loss": 0.0317, "mean_token_accuracy": 0.9916084706783295, "num_tokens": 359320365.0, "step": 3384 }, { "entropy": 0.9613232314586639, "epoch": 7.7115165336374005, "grad_norm": 1.03125, "learning_rate": 6.8429680810888e-07, "loss": 0.0257, "mean_token_accuracy": 0.9932486116886139, "num_tokens": 359426222.0, "step": 3385 }, { "entropy": 0.9617234468460083, "epoch": 7.713797035347777, "grad_norm": 1.0546875, "learning_rate": 6.830026452390354e-07, "loss": 0.0257, "mean_token_accuracy": 0.9907688349485397, "num_tokens": 359532871.0, "step": 3386 }, { "entropy": 0.9615384638309479, "epoch": 7.716077537058153, "grad_norm": 0.94140625, "learning_rate": 6.817095136618113e-07, "loss": 0.0257, "mean_token_accuracy": 0.9913179278373718, "num_tokens": 359639266.0, "step": 3387 }, { "entropy": 0.9619856178760529, "epoch": 7.718358038768529, "grad_norm": 1.28125, "learning_rate": 6.804174141111631e-07, "loss": 0.0408, "mean_token_accuracy": 0.9885650277137756, "num_tokens": 359746159.0, "step": 3388 }, { "entropy": 0.9588376134634018, "epoch": 7.720638540478905, "grad_norm": 1.25, "learning_rate": 6.791263473204624e-07, "loss": 0.0346, "mean_token_accuracy": 0.989609494805336, "num_tokens": 359852428.0, "step": 3389 }, { "entropy": 0.9613618701696396, "epoch": 7.7229190421892815, "grad_norm": 1.2421875, "learning_rate": 6.778363140224933e-07, "loss": 0.0362, "mean_token_accuracy": 0.9888986945152283, "num_tokens": 359959175.0, "step": 3390 }, { "entropy": 0.9592566192150116, "epoch": 7.725199543899658, "grad_norm": 1.1015625, "learning_rate": 6.765473149494545e-07, "loss": 0.0293, "mean_token_accuracy": 0.9899561107158661, "num_tokens": 360065942.0, "step": 3391 }, { "entropy": 0.9601312577724457, "epoch": 7.727480045610034, "grad_norm": 0.96484375, "learning_rate": 6.752593508329572e-07, "loss": 0.0243, "mean_token_accuracy": 0.9916794002056122, "num_tokens": 360172323.0, "step": 3392 }, { "entropy": 0.9626388251781464, "epoch": 7.72976054732041, "grad_norm": 1.0703125, "learning_rate": 6.739724224040236e-07, "loss": 0.0349, "mean_token_accuracy": 0.9899912178516388, "num_tokens": 360279006.0, "step": 3393 }, { "entropy": 0.9609074741601944, "epoch": 7.732041049030787, "grad_norm": 1.140625, "learning_rate": 6.726865303930905e-07, "loss": 0.0352, "mean_token_accuracy": 0.9905671775341034, "num_tokens": 360385208.0, "step": 3394 }, { "entropy": 0.9673119783401489, "epoch": 7.734321550741163, "grad_norm": 0.94140625, "learning_rate": 6.714016755300048e-07, "loss": 0.031, "mean_token_accuracy": 0.9921297132968903, "num_tokens": 360491297.0, "step": 3395 }, { "entropy": 0.9643952548503876, "epoch": 7.7366020524515395, "grad_norm": 1.046875, "learning_rate": 6.701178585440257e-07, "loss": 0.0292, "mean_token_accuracy": 0.9916928708553314, "num_tokens": 360597968.0, "step": 3396 }, { "entropy": 0.9635529816150665, "epoch": 7.738882554161916, "grad_norm": 1.1328125, "learning_rate": 6.688350801638235e-07, "loss": 0.0297, "mean_token_accuracy": 0.9907923638820648, "num_tokens": 360704007.0, "step": 3397 }, { "entropy": 0.9631572514772415, "epoch": 7.741163055872292, "grad_norm": 1.25, "learning_rate": 6.67553341117477e-07, "loss": 0.0303, "mean_token_accuracy": 0.9912428557872772, "num_tokens": 360810602.0, "step": 3398 }, { "entropy": 0.9656442999839783, "epoch": 7.743443557582668, "grad_norm": 0.96484375, "learning_rate": 6.662726421324775e-07, "loss": 0.0267, "mean_token_accuracy": 0.9909466952085495, "num_tokens": 360916847.0, "step": 3399 }, { "entropy": 0.961086317896843, "epoch": 7.745724059293044, "grad_norm": 1.0703125, "learning_rate": 6.649929839357247e-07, "loss": 0.0216, "mean_token_accuracy": 0.9918918609619141, "num_tokens": 361023505.0, "step": 3400 }, { "entropy": 0.9646148830652237, "epoch": 7.7480045610034205, "grad_norm": 1.03125, "learning_rate": 6.637143672535282e-07, "loss": 0.0269, "mean_token_accuracy": 0.9925578534603119, "num_tokens": 361129246.0, "step": 3401 }, { "entropy": 0.9647195041179657, "epoch": 7.750285062713797, "grad_norm": 1.09375, "learning_rate": 6.624367928116066e-07, "loss": 0.0281, "mean_token_accuracy": 0.991406723856926, "num_tokens": 361235381.0, "step": 3402 }, { "entropy": 0.9628701955080032, "epoch": 7.752565564424174, "grad_norm": 0.921875, "learning_rate": 6.611602613350854e-07, "loss": 0.0338, "mean_token_accuracy": 0.9892091751098633, "num_tokens": 361342034.0, "step": 3403 }, { "entropy": 0.9625034034252167, "epoch": 7.75484606613455, "grad_norm": 1.0234375, "learning_rate": 6.598847735485001e-07, "loss": 0.0258, "mean_token_accuracy": 0.9911064356565475, "num_tokens": 361448291.0, "step": 3404 }, { "entropy": 0.9631987065076828, "epoch": 7.757126567844926, "grad_norm": 1.203125, "learning_rate": 6.586103301757918e-07, "loss": 0.0352, "mean_token_accuracy": 0.9882206171751022, "num_tokens": 361554501.0, "step": 3405 }, { "entropy": 0.9598771184682846, "epoch": 7.759407069555302, "grad_norm": 1.171875, "learning_rate": 6.573369319403108e-07, "loss": 0.0282, "mean_token_accuracy": 0.9914280623197556, "num_tokens": 361661315.0, "step": 3406 }, { "entropy": 0.9585370719432831, "epoch": 7.7616875712656785, "grad_norm": 1.3046875, "learning_rate": 6.560645795648132e-07, "loss": 0.0365, "mean_token_accuracy": 0.9886770695447922, "num_tokens": 361767568.0, "step": 3407 }, { "entropy": 0.9618793874979019, "epoch": 7.763968072976055, "grad_norm": 1.1171875, "learning_rate": 6.547932737714624e-07, "loss": 0.0374, "mean_token_accuracy": 0.9888298660516739, "num_tokens": 361874421.0, "step": 3408 }, { "entropy": 0.9598730057477951, "epoch": 7.766248574686431, "grad_norm": 1.1015625, "learning_rate": 6.535230152818256e-07, "loss": 0.029, "mean_token_accuracy": 0.9897486865520477, "num_tokens": 361980756.0, "step": 3409 }, { "entropy": 0.9620809853076935, "epoch": 7.768529076396807, "grad_norm": 1.078125, "learning_rate": 6.522538048168777e-07, "loss": 0.0332, "mean_token_accuracy": 0.9900489449501038, "num_tokens": 362087235.0, "step": 3410 }, { "entropy": 0.9620138257741928, "epoch": 7.770809578107183, "grad_norm": 1.3359375, "learning_rate": 6.509856430969982e-07, "loss": 0.0317, "mean_token_accuracy": 0.9892230033874512, "num_tokens": 362193100.0, "step": 3411 }, { "entropy": 0.9682898670434952, "epoch": 7.7730900798175595, "grad_norm": 1.421875, "learning_rate": 6.49718530841971e-07, "loss": 0.0414, "mean_token_accuracy": 0.9867819100618362, "num_tokens": 362299615.0, "step": 3412 }, { "entropy": 0.9644412398338318, "epoch": 7.775370581527936, "grad_norm": 0.9375, "learning_rate": 6.484524687709853e-07, "loss": 0.0223, "mean_token_accuracy": 0.9921015948057175, "num_tokens": 362405552.0, "step": 3413 }, { "entropy": 0.9610490202903748, "epoch": 7.777651083238313, "grad_norm": 1.0546875, "learning_rate": 6.471874576026321e-07, "loss": 0.029, "mean_token_accuracy": 0.9916220456361771, "num_tokens": 362511704.0, "step": 3414 }, { "entropy": 0.9578591734170914, "epoch": 7.779931584948689, "grad_norm": 0.91796875, "learning_rate": 6.459234980549081e-07, "loss": 0.0289, "mean_token_accuracy": 0.9895799309015274, "num_tokens": 362617372.0, "step": 3415 }, { "entropy": 0.9617824554443359, "epoch": 7.782212086659065, "grad_norm": 1.1953125, "learning_rate": 6.446605908452122e-07, "loss": 0.0324, "mean_token_accuracy": 0.9899314939975739, "num_tokens": 362723480.0, "step": 3416 }, { "entropy": 0.9659109860658646, "epoch": 7.784492588369441, "grad_norm": 1.4375, "learning_rate": 6.433987366903461e-07, "loss": 0.0324, "mean_token_accuracy": 0.990771159529686, "num_tokens": 362829944.0, "step": 3417 }, { "entropy": 0.9653453975915909, "epoch": 7.7867730900798175, "grad_norm": 1.15625, "learning_rate": 6.421379363065142e-07, "loss": 0.0399, "mean_token_accuracy": 0.9881948381662369, "num_tokens": 362936879.0, "step": 3418 }, { "entropy": 0.9658181965351105, "epoch": 7.789053591790194, "grad_norm": 0.94140625, "learning_rate": 6.408781904093228e-07, "loss": 0.0269, "mean_token_accuracy": 0.9926297813653946, "num_tokens": 363042772.0, "step": 3419 }, { "entropy": 0.9612269848585129, "epoch": 7.79133409350057, "grad_norm": 0.91015625, "learning_rate": 6.39619499713778e-07, "loss": 0.0217, "mean_token_accuracy": 0.9927088022232056, "num_tokens": 363149291.0, "step": 3420 }, { "entropy": 0.9613925218582153, "epoch": 7.793614595210946, "grad_norm": 1.03125, "learning_rate": 6.383618649342894e-07, "loss": 0.0268, "mean_token_accuracy": 0.9912205338478088, "num_tokens": 363254747.0, "step": 3421 }, { "entropy": 0.9628748446702957, "epoch": 7.795895096921322, "grad_norm": 0.8828125, "learning_rate": 6.371052867846658e-07, "loss": 0.0242, "mean_token_accuracy": 0.9927628040313721, "num_tokens": 363360910.0, "step": 3422 }, { "entropy": 0.9667549282312393, "epoch": 7.798175598631699, "grad_norm": 1.125, "learning_rate": 6.358497659781177e-07, "loss": 0.028, "mean_token_accuracy": 0.9912073314189911, "num_tokens": 363467185.0, "step": 3423 }, { "entropy": 0.961312785744667, "epoch": 7.800456100342076, "grad_norm": 1.1484375, "learning_rate": 6.345953032272525e-07, "loss": 0.0293, "mean_token_accuracy": 0.9897342920303345, "num_tokens": 363573546.0, "step": 3424 }, { "entropy": 0.9583989232778549, "epoch": 7.802736602052452, "grad_norm": 1.046875, "learning_rate": 6.333418992440804e-07, "loss": 0.0264, "mean_token_accuracy": 0.9914039969444275, "num_tokens": 363679777.0, "step": 3425 }, { "entropy": 0.9660035520792007, "epoch": 7.805017103762828, "grad_norm": 1.2109375, "learning_rate": 6.3208955474001e-07, "loss": 0.0274, "mean_token_accuracy": 0.9921110421419144, "num_tokens": 363786261.0, "step": 3426 }, { "entropy": 0.9579186290502548, "epoch": 7.807297605473204, "grad_norm": 0.9921875, "learning_rate": 6.308382704258459e-07, "loss": 0.0292, "mean_token_accuracy": 0.9899272918701172, "num_tokens": 363892462.0, "step": 3427 }, { "entropy": 0.9688378721475601, "epoch": 7.80957810718358, "grad_norm": 1.140625, "learning_rate": 6.29588047011794e-07, "loss": 0.0274, "mean_token_accuracy": 0.989776223897934, "num_tokens": 363999113.0, "step": 3428 }, { "entropy": 0.9622442424297333, "epoch": 7.811858608893957, "grad_norm": 0.86328125, "learning_rate": 6.283388852074576e-07, "loss": 0.025, "mean_token_accuracy": 0.9930698722600937, "num_tokens": 364105298.0, "step": 3429 }, { "entropy": 0.9638111889362335, "epoch": 7.814139110604333, "grad_norm": 1.203125, "learning_rate": 6.270907857218356e-07, "loss": 0.0337, "mean_token_accuracy": 0.9896145164966583, "num_tokens": 364211930.0, "step": 3430 }, { "entropy": 0.9628898650407791, "epoch": 7.816419612314709, "grad_norm": 1.0234375, "learning_rate": 6.258437492633254e-07, "loss": 0.0234, "mean_token_accuracy": 0.9929553717374802, "num_tokens": 364318151.0, "step": 3431 }, { "entropy": 0.9640992283821106, "epoch": 7.818700114025085, "grad_norm": 0.9453125, "learning_rate": 6.245977765397216e-07, "loss": 0.0278, "mean_token_accuracy": 0.9907400161027908, "num_tokens": 364424677.0, "step": 3432 }, { "entropy": 0.9626186043024063, "epoch": 7.820980615735461, "grad_norm": 1.078125, "learning_rate": 6.233528682582132e-07, "loss": 0.0281, "mean_token_accuracy": 0.9908961355686188, "num_tokens": 364530829.0, "step": 3433 }, { "entropy": 0.9574904888868332, "epoch": 7.823261117445838, "grad_norm": 1.09375, "learning_rate": 6.221090251253872e-07, "loss": 0.0336, "mean_token_accuracy": 0.9896985441446304, "num_tokens": 364637821.0, "step": 3434 }, { "entropy": 0.9608414769172668, "epoch": 7.825541619156215, "grad_norm": 1.1875, "learning_rate": 6.208662478472249e-07, "loss": 0.0361, "mean_token_accuracy": 0.9876662939786911, "num_tokens": 364744115.0, "step": 3435 }, { "entropy": 0.9624769240617752, "epoch": 7.827822120866591, "grad_norm": 1.2421875, "learning_rate": 6.196245371291015e-07, "loss": 0.0329, "mean_token_accuracy": 0.9908384531736374, "num_tokens": 364850434.0, "step": 3436 }, { "entropy": 0.9622759371995926, "epoch": 7.830102622576967, "grad_norm": 1.3359375, "learning_rate": 6.183838936757891e-07, "loss": 0.0393, "mean_token_accuracy": 0.9885250478982925, "num_tokens": 364956913.0, "step": 3437 }, { "entropy": 0.9582521915435791, "epoch": 7.832383124287343, "grad_norm": 1.25, "learning_rate": 6.171443181914524e-07, "loss": 0.0329, "mean_token_accuracy": 0.9896006137132645, "num_tokens": 365064033.0, "step": 3438 }, { "entropy": 0.966267317533493, "epoch": 7.834663625997719, "grad_norm": 1.3671875, "learning_rate": 6.159058113796507e-07, "loss": 0.0289, "mean_token_accuracy": 0.9921129941940308, "num_tokens": 365170571.0, "step": 3439 }, { "entropy": 0.9641172587871552, "epoch": 7.836944127708096, "grad_norm": 1.0078125, "learning_rate": 6.146683739433374e-07, "loss": 0.0318, "mean_token_accuracy": 0.9907325953245163, "num_tokens": 365276673.0, "step": 3440 }, { "entropy": 0.9608702212572098, "epoch": 7.839224629418472, "grad_norm": 0.87109375, "learning_rate": 6.134320065848564e-07, "loss": 0.0219, "mean_token_accuracy": 0.9927258193492889, "num_tokens": 365382968.0, "step": 3441 }, { "entropy": 0.9695098400115967, "epoch": 7.841505131128848, "grad_norm": 1.375, "learning_rate": 6.121967100059473e-07, "loss": 0.0439, "mean_token_accuracy": 0.988759845495224, "num_tokens": 365489130.0, "step": 3442 }, { "entropy": 0.9688245952129364, "epoch": 7.843785632839225, "grad_norm": 1.1484375, "learning_rate": 6.109624849077397e-07, "loss": 0.0343, "mean_token_accuracy": 0.9905931651592255, "num_tokens": 365595403.0, "step": 3443 }, { "entropy": 0.9601561576128006, "epoch": 7.846066134549601, "grad_norm": 1.03125, "learning_rate": 6.097293319907566e-07, "loss": 0.0313, "mean_token_accuracy": 0.9895509034395218, "num_tokens": 365701402.0, "step": 3444 }, { "entropy": 0.9625516682863235, "epoch": 7.848346636259977, "grad_norm": 1.2734375, "learning_rate": 6.084972519549123e-07, "loss": 0.044, "mean_token_accuracy": 0.9877856522798538, "num_tokens": 365807256.0, "step": 3445 }, { "entropy": 0.9589361548423767, "epoch": 7.850627137970354, "grad_norm": 1.1875, "learning_rate": 6.072662454995101e-07, "loss": 0.0297, "mean_token_accuracy": 0.9888441562652588, "num_tokens": 365913360.0, "step": 3446 }, { "entropy": 0.9671881645917892, "epoch": 7.85290763968073, "grad_norm": 1.2109375, "learning_rate": 6.060363133232472e-07, "loss": 0.0416, "mean_token_accuracy": 0.9881111085414886, "num_tokens": 366020142.0, "step": 3447 }, { "entropy": 0.9639071822166443, "epoch": 7.855188141391106, "grad_norm": 1.0390625, "learning_rate": 6.048074561242076e-07, "loss": 0.0288, "mean_token_accuracy": 0.9905114322900772, "num_tokens": 366126787.0, "step": 3448 }, { "entropy": 0.9657576978206635, "epoch": 7.857468643101482, "grad_norm": 0.84375, "learning_rate": 6.035796745998679e-07, "loss": 0.0242, "mean_token_accuracy": 0.9928998947143555, "num_tokens": 366233476.0, "step": 3449 }, { "entropy": 0.9626300185918808, "epoch": 7.859749144811858, "grad_norm": 0.984375, "learning_rate": 6.023529694470931e-07, "loss": 0.0205, "mean_token_accuracy": 0.9915451258420944, "num_tokens": 366339417.0, "step": 3450 }, { "entropy": 0.9652342945337296, "epoch": 7.862029646522235, "grad_norm": 1.078125, "learning_rate": 6.01127341362138e-07, "loss": 0.0275, "mean_token_accuracy": 0.9915639609098434, "num_tokens": 366445533.0, "step": 3451 }, { "entropy": 0.96589295566082, "epoch": 7.864310148232612, "grad_norm": 1.0625, "learning_rate": 5.999027910406441e-07, "loss": 0.0325, "mean_token_accuracy": 0.9886628836393356, "num_tokens": 366551839.0, "step": 3452 }, { "entropy": 0.9614668488502502, "epoch": 7.866590649942988, "grad_norm": 1.1015625, "learning_rate": 5.98679319177643e-07, "loss": 0.0336, "mean_token_accuracy": 0.989250048995018, "num_tokens": 366658030.0, "step": 3453 }, { "entropy": 0.9648167639970779, "epoch": 7.868871151653364, "grad_norm": 1.125, "learning_rate": 5.974569264675542e-07, "loss": 0.0284, "mean_token_accuracy": 0.9924547970294952, "num_tokens": 366764309.0, "step": 3454 }, { "entropy": 0.9660962074995041, "epoch": 7.87115165336374, "grad_norm": 1.2890625, "learning_rate": 5.962356136041835e-07, "loss": 0.0299, "mean_token_accuracy": 0.9897823333740234, "num_tokens": 366870256.0, "step": 3455 }, { "entropy": 0.964071586728096, "epoch": 7.873432155074116, "grad_norm": 0.9453125, "learning_rate": 5.95015381280726e-07, "loss": 0.0271, "mean_token_accuracy": 0.9910429865121841, "num_tokens": 366976854.0, "step": 3456 }, { "entropy": 0.961145430803299, "epoch": 7.875712656784493, "grad_norm": 1.1171875, "learning_rate": 5.937962301897604e-07, "loss": 0.0331, "mean_token_accuracy": 0.9893990308046341, "num_tokens": 367083782.0, "step": 3457 }, { "entropy": 0.9660642892122269, "epoch": 7.877993158494869, "grad_norm": 1.09375, "learning_rate": 5.925781610232534e-07, "loss": 0.0348, "mean_token_accuracy": 0.9893124848604202, "num_tokens": 367190344.0, "step": 3458 }, { "entropy": 0.9624808430671692, "epoch": 7.880273660205245, "grad_norm": 1.0859375, "learning_rate": 5.913611744725584e-07, "loss": 0.0325, "mean_token_accuracy": 0.9891324937343597, "num_tokens": 367297282.0, "step": 3459 }, { "entropy": 0.9637232422828674, "epoch": 7.882554161915621, "grad_norm": 1.140625, "learning_rate": 5.901452712284128e-07, "loss": 0.0309, "mean_token_accuracy": 0.9911491125822067, "num_tokens": 367403921.0, "step": 3460 }, { "entropy": 0.9601698666810989, "epoch": 7.884834663625997, "grad_norm": 1.046875, "learning_rate": 5.889304519809402e-07, "loss": 0.0314, "mean_token_accuracy": 0.989909902215004, "num_tokens": 367510995.0, "step": 3461 }, { "entropy": 0.9652751535177231, "epoch": 7.887115165336374, "grad_norm": 1.1171875, "learning_rate": 5.877167174196491e-07, "loss": 0.0215, "mean_token_accuracy": 0.9945104569196701, "num_tokens": 367617347.0, "step": 3462 }, { "entropy": 0.9663957953453064, "epoch": 7.889395667046751, "grad_norm": 1.1328125, "learning_rate": 5.865040682334303e-07, "loss": 0.029, "mean_token_accuracy": 0.9913646131753922, "num_tokens": 367723118.0, "step": 3463 }, { "entropy": 0.9605528861284256, "epoch": 7.891676168757127, "grad_norm": 1.2265625, "learning_rate": 5.852925051105609e-07, "loss": 0.0325, "mean_token_accuracy": 0.988805428147316, "num_tokens": 367829104.0, "step": 3464 }, { "entropy": 0.9580826163291931, "epoch": 7.893956670467503, "grad_norm": 1.3046875, "learning_rate": 5.840820287387009e-07, "loss": 0.0319, "mean_token_accuracy": 0.9904917925596237, "num_tokens": 367935233.0, "step": 3465 }, { "entropy": 0.9628405719995499, "epoch": 7.896237172177879, "grad_norm": 1.1796875, "learning_rate": 5.828726398048939e-07, "loss": 0.0366, "mean_token_accuracy": 0.9895072877407074, "num_tokens": 368040798.0, "step": 3466 }, { "entropy": 0.962538406252861, "epoch": 7.898517673888255, "grad_norm": 1.0703125, "learning_rate": 5.816643389955642e-07, "loss": 0.0308, "mean_token_accuracy": 0.9916482716798782, "num_tokens": 368146780.0, "step": 3467 }, { "entropy": 0.9681758284568787, "epoch": 7.900798175598632, "grad_norm": 1.25, "learning_rate": 5.804571269965206e-07, "loss": 0.0317, "mean_token_accuracy": 0.9899044781923294, "num_tokens": 368252869.0, "step": 3468 }, { "entropy": 0.9647346884012222, "epoch": 7.903078677309008, "grad_norm": 1.265625, "learning_rate": 5.792510044929545e-07, "loss": 0.0329, "mean_token_accuracy": 0.9906187653541565, "num_tokens": 368358955.0, "step": 3469 }, { "entropy": 0.9621120244264603, "epoch": 7.905359179019384, "grad_norm": 1.203125, "learning_rate": 5.780459721694359e-07, "loss": 0.0334, "mean_token_accuracy": 0.9870060235261917, "num_tokens": 368465089.0, "step": 3470 }, { "entropy": 0.9658423513174057, "epoch": 7.90763968072976, "grad_norm": 0.984375, "learning_rate": 5.768420307099188e-07, "loss": 0.0279, "mean_token_accuracy": 0.9917627424001694, "num_tokens": 368571439.0, "step": 3471 }, { "entropy": 0.9572606235742569, "epoch": 7.909920182440137, "grad_norm": 1.4296875, "learning_rate": 5.756391807977377e-07, "loss": 0.0317, "mean_token_accuracy": 0.9899789094924927, "num_tokens": 368677921.0, "step": 3472 }, { "entropy": 0.9557074010372162, "epoch": 7.9122006841505135, "grad_norm": 1.1875, "learning_rate": 5.744374231156056e-07, "loss": 0.0322, "mean_token_accuracy": 0.9884563088417053, "num_tokens": 368783621.0, "step": 3473 }, { "entropy": 0.9643452912569046, "epoch": 7.91448118586089, "grad_norm": 0.9921875, "learning_rate": 5.732367583456177e-07, "loss": 0.0245, "mean_token_accuracy": 0.9917651265859604, "num_tokens": 368889765.0, "step": 3474 }, { "entropy": 0.9625917822122574, "epoch": 7.916761687571266, "grad_norm": 0.890625, "learning_rate": 5.720371871692484e-07, "loss": 0.0254, "mean_token_accuracy": 0.9906342923641205, "num_tokens": 368996007.0, "step": 3475 }, { "entropy": 0.9597366899251938, "epoch": 7.919042189281642, "grad_norm": 0.87890625, "learning_rate": 5.708387102673507e-07, "loss": 0.0246, "mean_token_accuracy": 0.9917492270469666, "num_tokens": 369102305.0, "step": 3476 }, { "entropy": 0.9617480635643005, "epoch": 7.921322690992018, "grad_norm": 1.0234375, "learning_rate": 5.696413283201571e-07, "loss": 0.0339, "mean_token_accuracy": 0.9900284707546234, "num_tokens": 369208544.0, "step": 3477 }, { "entropy": 0.9614591747522354, "epoch": 7.923603192702394, "grad_norm": 0.90234375, "learning_rate": 5.684450420072792e-07, "loss": 0.0216, "mean_token_accuracy": 0.9928394556045532, "num_tokens": 369315030.0, "step": 3478 }, { "entropy": 0.9606330394744873, "epoch": 7.925883694412771, "grad_norm": 1.03125, "learning_rate": 5.67249852007705e-07, "loss": 0.029, "mean_token_accuracy": 0.9896956533193588, "num_tokens": 369421814.0, "step": 3479 }, { "entropy": 0.9648529440164566, "epoch": 7.928164196123147, "grad_norm": 1.3359375, "learning_rate": 5.660557589998014e-07, "loss": 0.034, "mean_token_accuracy": 0.9899613708257675, "num_tokens": 369528872.0, "step": 3480 }, { "entropy": 0.9633210003376007, "epoch": 7.930444697833523, "grad_norm": 1.15625, "learning_rate": 5.648627636613127e-07, "loss": 0.0331, "mean_token_accuracy": 0.9897932261228561, "num_tokens": 369635295.0, "step": 3481 }, { "entropy": 0.9670886695384979, "epoch": 7.932725199543899, "grad_norm": 0.81640625, "learning_rate": 5.636708666693599e-07, "loss": 0.0249, "mean_token_accuracy": 0.9919567704200745, "num_tokens": 369742050.0, "step": 3482 }, { "entropy": 0.9639618694782257, "epoch": 7.935005701254276, "grad_norm": 1.3359375, "learning_rate": 5.62480068700442e-07, "loss": 0.0371, "mean_token_accuracy": 0.9875243455171585, "num_tokens": 369848220.0, "step": 3483 }, { "entropy": 0.9594543874263763, "epoch": 7.9372862029646525, "grad_norm": 0.953125, "learning_rate": 5.612903704304309e-07, "loss": 0.0298, "mean_token_accuracy": 0.9917550384998322, "num_tokens": 369955084.0, "step": 3484 }, { "entropy": 0.9647233337163925, "epoch": 7.939566704675029, "grad_norm": 1.0234375, "learning_rate": 5.601017725345772e-07, "loss": 0.0312, "mean_token_accuracy": 0.989906907081604, "num_tokens": 370061791.0, "step": 3485 }, { "entropy": 0.9614626318216324, "epoch": 7.941847206385405, "grad_norm": 0.92578125, "learning_rate": 5.589142756875065e-07, "loss": 0.0269, "mean_token_accuracy": 0.9914702028036118, "num_tokens": 370167688.0, "step": 3486 }, { "entropy": 0.9644761830568314, "epoch": 7.944127708095781, "grad_norm": 1.1796875, "learning_rate": 5.577278805632186e-07, "loss": 0.0394, "mean_token_accuracy": 0.9861018359661102, "num_tokens": 370274171.0, "step": 3487 }, { "entropy": 0.9590407311916351, "epoch": 7.946408209806157, "grad_norm": 1.1875, "learning_rate": 5.565425878350895e-07, "loss": 0.037, "mean_token_accuracy": 0.989090770483017, "num_tokens": 370380271.0, "step": 3488 }, { "entropy": 0.9655413925647736, "epoch": 7.9486887115165334, "grad_norm": 0.91015625, "learning_rate": 5.553583981758668e-07, "loss": 0.0234, "mean_token_accuracy": 0.9934574365615845, "num_tokens": 370486778.0, "step": 3489 }, { "entropy": 0.9610132873058319, "epoch": 7.95096921322691, "grad_norm": 1.1953125, "learning_rate": 5.541753122576746e-07, "loss": 0.0338, "mean_token_accuracy": 0.9894890785217285, "num_tokens": 370592944.0, "step": 3490 }, { "entropy": 0.9580606669187546, "epoch": 7.953249714937286, "grad_norm": 1.0, "learning_rate": 5.529933307520102e-07, "loss": 0.0255, "mean_token_accuracy": 0.9910310953855515, "num_tokens": 370699283.0, "step": 3491 }, { "entropy": 0.961968719959259, "epoch": 7.955530216647663, "grad_norm": 0.953125, "learning_rate": 5.518124543297423e-07, "loss": 0.0192, "mean_token_accuracy": 0.99327053129673, "num_tokens": 370805339.0, "step": 3492 }, { "entropy": 0.9627309143543243, "epoch": 7.957810718358039, "grad_norm": 1.046875, "learning_rate": 5.506326836611139e-07, "loss": 0.0309, "mean_token_accuracy": 0.9905425161123276, "num_tokens": 370912076.0, "step": 3493 }, { "entropy": 0.9634859412908554, "epoch": 7.960091220068415, "grad_norm": 1.0390625, "learning_rate": 5.494540194157411e-07, "loss": 0.0296, "mean_token_accuracy": 0.9901683479547501, "num_tokens": 371017966.0, "step": 3494 }, { "entropy": 0.9688202738761902, "epoch": 7.9623717217787915, "grad_norm": 1.5234375, "learning_rate": 5.482764622626094e-07, "loss": 0.0402, "mean_token_accuracy": 0.9894855618476868, "num_tokens": 371123880.0, "step": 3495 }, { "entropy": 0.9634950160980225, "epoch": 7.964652223489168, "grad_norm": 0.97265625, "learning_rate": 5.471000128700784e-07, "loss": 0.0317, "mean_token_accuracy": 0.989183098077774, "num_tokens": 371230516.0, "step": 3496 }, { "entropy": 0.9639465808868408, "epoch": 7.966932725199544, "grad_norm": 0.99609375, "learning_rate": 5.459246719058778e-07, "loss": 0.0313, "mean_token_accuracy": 0.9893922209739685, "num_tokens": 371336974.0, "step": 3497 }, { "entropy": 0.9664003252983093, "epoch": 7.96921322690992, "grad_norm": 1.234375, "learning_rate": 5.447504400371084e-07, "loss": 0.0271, "mean_token_accuracy": 0.9913804829120636, "num_tokens": 371443161.0, "step": 3498 }, { "entropy": 0.9593784958124161, "epoch": 7.971493728620296, "grad_norm": 1.03125, "learning_rate": 5.435773179302426e-07, "loss": 0.0302, "mean_token_accuracy": 0.9914780855178833, "num_tokens": 371548971.0, "step": 3499 }, { "entropy": 0.9596429169178009, "epoch": 7.9737742303306725, "grad_norm": 1.0078125, "learning_rate": 5.4240530625112e-07, "loss": 0.0328, "mean_token_accuracy": 0.9887601733207703, "num_tokens": 371656210.0, "step": 3500 }, { "entropy": 0.962975949048996, "epoch": 7.976054732041049, "grad_norm": 1.21875, "learning_rate": 5.412344056649527e-07, "loss": 0.0264, "mean_token_accuracy": 0.9919771403074265, "num_tokens": 371761534.0, "step": 3501 }, { "entropy": 0.9623247385025024, "epoch": 7.978335233751425, "grad_norm": 1.015625, "learning_rate": 5.400646168363216e-07, "loss": 0.028, "mean_token_accuracy": 0.9903396815061569, "num_tokens": 371868003.0, "step": 3502 }, { "entropy": 0.9672474712133408, "epoch": 7.980615735461802, "grad_norm": 1.3046875, "learning_rate": 5.388959404291757e-07, "loss": 0.0281, "mean_token_accuracy": 0.9912677258253098, "num_tokens": 371973987.0, "step": 3503 }, { "entropy": 0.9653373807668686, "epoch": 7.982896237172178, "grad_norm": 1.2890625, "learning_rate": 5.377283771068342e-07, "loss": 0.0417, "mean_token_accuracy": 0.9873688071966171, "num_tokens": 372080227.0, "step": 3504 }, { "entropy": 0.9711429476737976, "epoch": 7.985176738882554, "grad_norm": 1.265625, "learning_rate": 5.365619275319823e-07, "loss": 0.0257, "mean_token_accuracy": 0.9917913973331451, "num_tokens": 372186137.0, "step": 3505 }, { "entropy": 0.9634342789649963, "epoch": 7.9874572405929305, "grad_norm": 1.0859375, "learning_rate": 5.353965923666743e-07, "loss": 0.0282, "mean_token_accuracy": 0.9919881522655487, "num_tokens": 372292469.0, "step": 3506 }, { "entropy": 0.9555525034666061, "epoch": 7.989737742303307, "grad_norm": 1.1171875, "learning_rate": 5.342323722723324e-07, "loss": 0.0276, "mean_token_accuracy": 0.9907196611166, "num_tokens": 372398234.0, "step": 3507 }, { "entropy": 0.9644246697425842, "epoch": 7.992018244013683, "grad_norm": 1.09375, "learning_rate": 5.330692679097457e-07, "loss": 0.026, "mean_token_accuracy": 0.9908394664525986, "num_tokens": 372505018.0, "step": 3508 }, { "entropy": 0.964775875210762, "epoch": 7.994298745724059, "grad_norm": 1.359375, "learning_rate": 5.319072799390693e-07, "loss": 0.0437, "mean_token_accuracy": 0.9874043762683868, "num_tokens": 372611362.0, "step": 3509 }, { "entropy": 0.9673204571008682, "epoch": 7.996579247434435, "grad_norm": 1.203125, "learning_rate": 5.307464090198258e-07, "loss": 0.0317, "mean_token_accuracy": 0.9895729273557663, "num_tokens": 372717825.0, "step": 3510 }, { "entropy": 0.9609978795051575, "epoch": 7.9988597491448115, "grad_norm": 1.0546875, "learning_rate": 5.295866558109023e-07, "loss": 0.0277, "mean_token_accuracy": 0.9910149872303009, "num_tokens": 372824893.0, "step": 3511 }, { "entropy": 0.9594462811946869, "epoch": 8.0, "grad_norm": 2.375, "learning_rate": 5.284280209705531e-07, "loss": 0.0584, "mean_token_accuracy": 0.977645218372345, "num_tokens": 372863808.0, "step": 3512 } ], "logging_steps": 1, "max_steps": 4390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5853613834780826e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }